2008-10-21 22:00:00 +08:00
/* libs/pixelflinger/codeflinger/texturing.cpp
** Copyright 2006, The Android Open Source Project
2016-09-29 01:07:20 +08:00
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
2008-10-21 22:00:00 +08:00
2016-09-29 01:07:20 +08:00
** http://www.apache.org/licenses/LICENSE-2.0
2008-10-21 22:00:00 +08:00
2016-09-29 01:07:20 +08:00
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
2008-10-21 22:00:00 +08:00
** limitations under the License.
*/
2016-10-18 05:28:00 +08:00
#define LOG_TAG "pixelflinger-code"
2008-10-21 22:00:00 +08:00
#include <assert.h>
#include <stdint.h>
#include <stdio.h>
2016-09-29 01:07:20 +08:00
#include <stdlib.h>
2008-10-21 22:00:00 +08:00
#include <sys/types.h>
2017-01-11 05:19:54 +08:00
#include <log/log.h>
2008-10-21 22:00:00 +08:00
2013-04-02 06:17:55 +08:00
#include "GGLAssembler.h"
2008-10-21 22:00:00 +08:00
namespace android {
// ---------------------------------------------------------------------------
// iterators are initialized like this:
// (intToFixedCenter(x) * dx)>>16 + x0
// ((x<<16 + 0x8000) * dx)>>16 + x0
// ((x<<16)*dx + (0x8000*dx))>>16 + x0
// ( (x*dx) + dx>>1 ) + x0
// (x*dx) + (dx>>1 + x0)
// Emits the code that sets up the iterated color for the span starting at x.
//
// Two kinds of setup are visible below:
//  - the mSmooth case: each iterated component gets its own register
//    (parts.argb[i]); smooth components are advanced to x using their
//    per-pixel gradient (parts.argb_dx[i]);
//  - the packed case: a single packed color is loaded into parts.iterated.
//
// parts receives iterated_packed, packed, reload and the register
// assignments. x.reg is the register used as the x multiplicand in the MLA.
//
// NOTE(review): lines consisting only of a brace are absent from this
// extract, so the exact nesting of the statements below cannot be determined
// from here -- confirm against the upstream file.
void GGLAssembler::init_iterated_color(fragment_parts_t& parts, const reg_t& x)
context_t const* c = mBuilderContext.c;
if (mSmooth) {
// NOTE: we could take this case in the mDithering + !mSmooth case,
// but this would use up to 4 more registers for the color components
// for only a little added quality.
// Currently, this causes the system to run out of registers in
// some case (see issue #719496)
comment("compute initial iterated color (smooth and/or dither case)");
parts.iterated_packed = 0;
parts.packed = 0;
// parts.reload is a bit mask:
// 0x1: color component
// 0x2: iterators
// It is derived from the upper bits of mOptLevel (mOptLevel >> 1).
const int optReload = mOptLevel >> 1;
if (optReload >= 3) parts.reload = 0; // reload nothing
else if (optReload == 2) parts.reload = 2; // reload iterators
else if (optReload == 1) parts.reload = 1; // reload colors
else if (optReload <= 0) parts.reload = 3; // reload both
if (!mSmooth) {
// we're not smoothing (just dithering), we never have to
// reload the iterators
parts.reload &= ~2;
Scratch scratches(registerFile());
// When a value is going to be reloaded later, a scratch register is enough
// to compute it here; otherwise a register is obtained with obtainReg().
const int t0 = (parts.reload & 1) ? scratches.obtain() : 0;
const int t1 = (parts.reload & 2) ? scratches.obtain() : 0;
for (int i=0 ; i<4 ; i++) {
if (!mInfo[i].iterated)
// NOTE(review): the statement guarded by the test above is not visible in
// this extract (presumably it skips non-iterated components) -- confirm.
// this component exists in the destination and is not replaced
// by a texture unit.
// Note: this local `c` is a register number and hides the context_t
// pointer `c` declared at the top of the function.
const int c = (parts.reload & 1) ? t0 : obtainReg();
// Component index to iterator field: 0 -> ydady, 1 -> ydrdy,
// 2 -> ydgdy, 3 -> ydbdy.
if (i==0) CONTEXT_LOAD(c, iterators.ydady);
if (i==1) CONTEXT_LOAD(c, iterators.ydrdy);
if (i==2) CONTEXT_LOAD(c, iterators.ydgdy);
if (i==3) CONTEXT_LOAD(c, iterators.ydbdy);
parts.argb[i].reg = c;
if (mInfo[i].smooth) {
parts.argb_dx[i].reg = (parts.reload & 2) ? t1 : obtainReg();
const int dvdx = parts.argb_dx[i].reg;
CONTEXT_LOAD(dvdx, generated_vars.argb[i].dx);
// c += x * dvdx (see the derivation in the comment above this function)
MLA(AL, 0, c, x.reg, dvdx, c);
// adjust the color iterator to make sure it won't overflow
if (!mAA) {
// this is not needed when we're using anti-aliasing
// because we will (have to) clamp the components
// anyway.
// end = (count >> 16) * dvdx + c, i.e. the value the iterator would
// reach at the end of the span; the MLA sets the flags so that the
// conditional SUB only runs when that value is negative.
int end = scratches.obtain();
MOV(AL, 0, end, reg_imm(parts.count.reg, LSR, 16));
MLA(AL, 1, end, dvdx, end, c);
SUB(MI, 0, c, c, end);
// c &= ~(c >> 31): forces a negative start value to zero
BIC(AL, 0, c, c, reg_imm(c, ASR, 31));
if (parts.reload & 1) {
CONTEXT_STORE(c, generated_vars.argb[i].c);
} else {
// We're not smoothed, so we can
// just use a packed version of the color and extract the
// components as needed (or not at all if we don't blend)
// figure out if we need the iterated color
int load = 0;
for (int i=0 ; i<4 ; i++) {
component_info_t& info = mInfo[i];
if ((info.inDest || info.needed) && !info.replaced)
load |= 1;
parts.iterated_packed = 1;
parts.packed = (!mTextureMachine.mask && !mBlending
&& !mFog && !mDithering);
parts.reload = 0;
if (load || parts.packed) {
if (mBlending || mDithering || mInfo[GGLFormat::ALPHA].needed) {
comment("load initial iterated color (8888 packed)");
CONTEXT_LOAD(parts.iterated.reg, packed8888);
} else {
comment("load initial iterated color (dest format packed)");
parts.iterated.setTo(obtainReg(), &mCbFormat);
// pre-mask the iterated color
const int bits = parts.iterated.size();
// size has the low `bits` bits set; for bits >= 32 the expression is
// 0 - 1 in unsigned arithmetic, i.e. all ones.
const uint32_t size = ((bits>=32) ? 0 : (1LU << bits)) - 1;
uint32_t mask = 0;
if (mMasking) {
for (int i=0 ; i<4 ; i++) {
const int component_mask = 1<<i;
const int h = parts.iterated.format.c[i].h;
const int l = parts.iterated.format.c[i].l;
// accumulate the bit range [l, h) of every component that exists
// (h != 0) and whose bit is clear in mMasking
if (h && (!(mMasking & component_mask))) {
mask |= ((1<<(h-l))-1) << l;
if (mMasking && ((mask & size)==0)) {
// none of the components are present in the mask
} else {
CONTEXT_LOAD(parts.iterated.reg, packed);
// keep only the part of the loaded word that matches the pixel size
if (mCbFormat.size == 1) {
AND(AL, 0, parts.iterated.reg,
parts.iterated.reg, imm(0xFF));
} else if (mCbFormat.size == 2) {
MOV(AL, 0, parts.iterated.reg,
reg_imm(parts.iterated.reg, LSR, 16));
// pre-mask the iterated color
if (mMasking) {
build_and_immediate(parts.iterated.reg, parts.iterated.reg,
mask, bits);
// Produces, in `fragment`, the iterated color value of one component.
//
//  fragment   receives a register obtained from `regs` (or is renamed to the
//             component's iterator register when nothing is reloaded).
//  parts      iterated-color state: iterated_packed, iterated, reload, argb[].
//  component  index of the color component to build.
//  regs       scratch allocator that provides the fragment register.
//
// NOTE(review): brace-only lines and any early `return` are absent from this
// extract, so the nesting below is not recoverable here -- confirm upstream.
void GGLAssembler::build_iterated_color(
component_t& fragment,
const fragment_parts_t& parts,
int component,
Scratch& regs)
fragment.setTo( regs.obtain(), 0, 32, CORRUPTIBLE);
if (!mInfo[component].iterated)
// NOTE(review): the statement guarded by the test above is not visible in
// this extract -- presumably an early exit; confirm.
if (parts.iterated_packed) {
// iterated colors are packed, extract the one we need
extract(fragment, parts.iterated, component);
} else {
// the component occupies the 8 bits just below GGL_COLOR_BITS; the bits
// under it are flagged with CLEAR_LO
fragment.h = GGL_COLOR_BITS;
fragment.l = GGL_COLOR_BITS - 8;
fragment.flags |= CLEAR_LO;
// iterated colors are held in their own register,
// (smooth and/or dithering case)
if (parts.reload==3) {
// this implies mSmooth
// load the current value and its gradient, then store value + gradient
// back so that the next pixel starts from the advanced value
Scratch scratches(registerFile());
int dx = scratches.obtain();
CONTEXT_LOAD(fragment.reg, generated_vars.argb[component].c);
CONTEXT_LOAD(dx, generated_vars.argb[component].dx);
ADD(AL, 0, dx, fragment.reg, dx);
CONTEXT_STORE(dx, generated_vars.argb[component].c);
} else if (parts.reload & 1) {
CONTEXT_LOAD(fragment.reg, generated_vars.argb[component].c);
} else {
// we don't reload, so simply rename the register and mark as
// non CORRUPTIBLE so that the texture env or blending code
// won't modify this (renamed) register
fragment.reg = parts.argb[component].reg;
fragment.flags &= ~CORRUPTIBLE;
if (mInfo[component].smooth && mAA) {
// when using smooth shading AND anti-aliasing, we need to clamp
// the iterators because there is always an extra pixel on the
// edges, which most of the time will cause an overflow
// (since technically its outside of the domain).
// reg &= ~(reg >> 31): a negative value becomes zero
BIC(AL, 0, fragment.reg, fragment.reg,
reg_imm(fragment.reg, ASR, 31));
// ---------------------------------------------------------------------------
// Decodes the LOGIC_OP field of needs.n and records the result in mLogicOp.
//
// NOTE(review): in this extract no `break` statements, no closing braces and
// no assignments for the cases after GGL_SET are visible, so the value chosen
// for each opcode cannot be read from here -- confirm against the upstream
// file before changing this switch.
void GGLAssembler::decodeLogicOpNeeds(const needs_t& needs)
// gather some informations about the components we need to process...
// the field read from needs.n is OR-ed with GGL_CLEAR to form the opcode
const int opcode = GGL_READ_NEEDS(LOGIC_OP, needs.n) | GGL_CLEAR;
switch(opcode) {
case GGL_COPY:
mLogicOp = 0;
case GGL_SET:
mLogicOp = LOGIC_OP;
case GGL_AND:
case GGL_XOR:
case GGL_OR:
case GGL_NOR:
case GGL_NAND:
case GGL_NOOP:
// Decodes the per-texture-unit needs words (needs.t[i]) into mTextureMachine.
//
// Units are visited from the highest index down to 0. For each one the
// format, wrap modes, environment, pot and linear flags are read, and
// tmu.mask is built with one bit per component present in the texture
// format. `replaced` accumulates the components overwritten by GGL_REPLACE
// units seen so far; each unit records the value it had when it was visited.
//
//  needs  the needs words to decode.
//  c      context providing the formats[] table indexed by format_idx.
//
// NOTE(review): brace-only lines and any `continue` are absent from this
// extract, so the nesting below is not recoverable here -- confirm upstream.
void GGLAssembler::decodeTMUNeeds(const needs_t& needs, context_t const* c)
uint8_t replaced=0;
mTextureMachine.mask = 0;
mTextureMachine.activeUnits = 0;
for (int i=GGL_TEXTURE_UNIT_COUNT-1 ; i>=0 ; i--) {
texture_unit_t& tmu = mTextureMachine.tmu[i];
if (replaced == 0xF) {
// all components are replaced, skip this TMU.
tmu.format_idx = 0;
tmu.mask = 0;
tmu.replaced = replaced;
tmu.format_idx = GGL_READ_NEEDS(T_FORMAT, needs.t[i]);
tmu.format = c->formats[tmu.format_idx];
tmu.bits = tmu.format.size*8;
tmu.swrap = GGL_READ_NEEDS(T_S_WRAP, needs.t[i]);
tmu.twrap = GGL_READ_NEEDS(T_T_WRAP, needs.t[i]);
tmu.env = ggl_needs_to_env(GGL_READ_NEEDS(T_ENV, needs.t[i]));
tmu.pot = GGL_READ_NEEDS(T_POT, needs.t[i]);
tmu.linear = GGL_READ_NEEDS(T_LINEAR, needs.t[i])
&& tmu.format.size!=3; // XXX: only 8, 16 and 32 modes for now
// 5551 linear filtering is not supported
if (tmu.format_idx == GGL_PIXEL_FORMAT_RGBA_5551)
tmu.linear = 0;
tmu.mask = 0;
tmu.replaced = replaced;
if (tmu.format_idx) {
// one mask bit per component that has a non-zero high bit position
if (tmu.format.c[0].h) tmu.mask |= 0x1;
if (tmu.format.c[1].h) tmu.mask |= 0x2;
if (tmu.format.c[2].h) tmu.mask |= 0x4;
if (tmu.format.c[3].h) tmu.mask |= 0x8;
if (tmu.env == GGL_REPLACE) {
replaced |= tmu.mask;
} else if (tmu.env == GGL_DECAL) {
if (!tmu.format.c[GGLFormat::ALPHA].h) {
// if we don't have alpha, decal does nothing
tmu.mask = 0;
} else {
// decal always ignores At
tmu.mask &= ~(1<<GGLFormat::ALPHA);
mTextureMachine.mask |= tmu.mask;
//printf("%d: mask=%08lx, replaced=%08lx\n",
// i, int(tmu.mask), int(tmu.replaced));
mTextureMachine.replaced = replaced;
mTextureMachine.directTexture = 0;
//printf("replaced=%08lx\n", mTextureMachine.replaced);
// Emits the code that initializes the texture coordinates of every active
// texture unit for the span starting at (x, y).
//
//  coords  per-unit output: either a texel pointer (coords[i].ptr, 1:1 case)
//          or the s/t iterator registers (coords[i].s / coords[i].t).
//  x, y    registers holding the start position; Rx and Ry are modified by
//          the 1:1 path below.
//
// Also sets mTextureMachine.directTexture to i + 1 when unit i satisfies the
// "direct texture" test at the end.
//
// NOTE(review): brace-only lines, any `continue`, and the blame timestamp
// lines interleaved below are artifacts of this extract; the nesting is not
// recoverable here -- confirm against the upstream file.
void GGLAssembler::init_textures(
tex_coord_t* coords,
const reg_t& x, const reg_t& y)
const needs_t& needs = mBuilderContext.needs;
int Rx = x.reg;
int Ry = y.reg;
if (mTextureMachine.mask) {
comment("compute texture coordinates");
// init texture coordinates for each tmu
const int cb_format_idx = GGL_READ_NEEDS(CB_FORMAT, needs.n);
const bool multiTexture = mTextureMachine.activeUnits > 1;
for (int i=0 ; i<GGL_TEXTURE_UNIT_COUNT; i++) {
const texture_unit_t& tmu = mTextureMachine.tmu[i];
if (tmu.format_idx == 0)
// NOTE(review): the statement guarded by the test above is not visible in
// this extract (presumably it skips unused units) -- confirm.
if ((tmu.swrap == GGL_NEEDS_WRAP_11) &&
(tmu.twrap == GGL_NEEDS_WRAP_11))
// 1:1 texture
// the texel address is computed once here: data + (x + y*stride)
pointer_t& txPtr = coords[i].ptr;
txPtr.setTo(obtainReg(), tmu.bits);
CONTEXT_LOAD(txPtr.reg, state.texture[i].iterators.ydsdy);
ADD(AL, 0, Rx, Rx, reg_imm(txPtr.reg, ASR, 16)); // x += (s>>16)
CONTEXT_LOAD(txPtr.reg, state.texture[i].iterators.ydtdy);
ADD(AL, 0, Ry, Ry, reg_imm(txPtr.reg, ASR, 16)); // y += (t>>16)
// merge base & offset
CONTEXT_LOAD(txPtr.reg, generated_vars.texture[i].stride);
SMLABB(AL, Rx, Ry, txPtr.reg, Rx); // x+y*stride
2013-02-21 18:27:40 +08:00
CONTEXT_ADDR_LOAD(txPtr.reg, generated_vars.texture[i].data);
2008-10-21 22:00:00 +08:00
base_offset(txPtr, txPtr, Rx);
} else {
Scratch scratches(registerFile());
reg_t& s = coords[i].s;
reg_t& t = coords[i].t;
// s = (x * dsdx)>>16 + ydsdy
// s = (x * dsdx)>>16 + (y*dsdy)>>16 + s0
// t = (x * dtdx)>>16 + ydtdy
// t = (x * dtdx)>>16 + (y*dtdy)>>16 + t0
const int need_w = GGL_READ_NEEDS(W, needs.n);
if (need_w) {
// with W needed, only the y-dependent terms are loaded here
CONTEXT_LOAD(s.reg, state.texture[i].iterators.ydsdy);
CONTEXT_LOAD(t.reg, state.texture[i].iterators.ydtdy);
} else {
// s = x * dsdx + ydsdy and t = x * dtdx + ydtdy
int ydsdy = scratches.obtain();
int ydtdy = scratches.obtain();
CONTEXT_LOAD(s.reg, generated_vars.texture[i].dsdx);
CONTEXT_LOAD(ydsdy, state.texture[i].iterators.ydsdy);
CONTEXT_LOAD(t.reg, generated_vars.texture[i].dtdx);
CONTEXT_LOAD(ydtdy, state.texture[i].iterators.ydtdy);
MLA(AL, 0, s.reg, Rx, s.reg, ydsdy);
MLA(AL, 0, t.reg, Rx, t.reg, ydtdy);
// when bit 0 of mOptLevel is clear, s and t are kept in the context spill
// slots instead of being held in registers
if ((mOptLevel&1)==0) {
CONTEXT_STORE(s.reg, generated_vars.texture[i].spill[0]);
CONTEXT_STORE(t.reg, generated_vars.texture[i].spill[1]);
// direct texture?
if (!multiTexture && !mBlending && !mDithering && !mFog &&
cb_format_idx == tmu.format_idx && !tmu.linear &&
mTextureMachine.replaced == tmu.mask)
mTextureMachine.directTexture = i + 1;
// Emits the per-pixel texel fetch for every active texture unit.
//
// For 1:1 units the texel is loaded through the pre-computed pointer. For
// the other units the code reloads s/t when they are spilled, applies
// wrapping, optionally prepares the bilinear offsets (stored in
// generated_vars.rt and generated_vars.lb), advances s/t by dsdx/dtdx,
// computes the texel address and finally loads or filters the texel.
//
//  parts  per-fragment state; parts.coords[i] and parts.texel[i] are written.
//  regs   scratch allocator that provides the texel registers.
//
// NOTE(review): this extract is damaged. Brace-only lines, `continue`,
// `return` and several assignments are absent; blame timestamps and what
// looks like a commit message are interleaved with the code. The nesting
// and some values below cannot be recovered from here -- confirm against the
// upstream file before modifying.
void GGLAssembler::build_textures( fragment_parts_t& parts,
Scratch& regs)
// We don't have a way to spill registers automatically
// spill depth and AA regs, when we know we may have to.
// build the spill list...
// spill_list is a bit mask indexed by register number
uint32_t spill_list = 0;
for (int i=0 ; i<GGL_TEXTURE_UNIT_COUNT; i++) {
const texture_unit_t& tmu = mTextureMachine.tmu[i];
if (tmu.format_idx == 0)
// NOTE(review): the statement guarded by the test above is not visible.
if (tmu.linear) {
// we may run out of register if we have linear filtering
// at 1 or 4 bytes / pixel on any texture unit.
if (tmu.format.size == 1) {
// if depth and AA enabled, we'll run out of 1 register
if (parts.z.reg > 0 && parts.covPtr.reg > 0)
spill_list |= 1<<parts.covPtr.reg;
if (tmu.format.size == 4) {
// if depth or AA enabled, we'll run out of 1 or 2 registers
if (parts.z.reg > 0)
spill_list |= 1<<parts.z.reg;
if (parts.covPtr.reg > 0)
spill_list |= 1<<parts.covPtr.reg;
Spill spill(registerFile(), *this, spill_list);
for (int i=0 ; i<GGL_TEXTURE_UNIT_COUNT; i++) {
const texture_unit_t& tmu = mTextureMachine.tmu[i];
if (tmu.format_idx == 0)
// NOTE(review): the statement guarded by the test above is not visible.
pointer_t& txPtr = parts.coords[i].ptr;
pixel_t& texel = parts.texel[i];
2017-10-27 02:19:43 +08:00
2008-10-21 22:00:00 +08:00
// repeat...
if ((tmu.swrap == GGL_NEEDS_WRAP_11) &&
(tmu.twrap == GGL_NEEDS_WRAP_11))
{ // 1:1 textures
comment("fetch texel");
texel.setTo(regs.obtain(), &tmu.format);
load(txPtr, texel, WRITE_BACK);
} else {
Scratch scratches(registerFile());
reg_t& s = parts.coords[i].s;
reg_t& t = parts.coords[i].t;
if ((mOptLevel&1)==0) {
comment("reload s/t (multitexture or linear filtering)");
s.reg = scratches.obtain();
t.reg = scratches.obtain();
CONTEXT_LOAD(s.reg, generated_vars.texture[i].spill[0]);
CONTEXT_LOAD(t.reg, generated_vars.texture[i].spill[1]);
2012-02-02 02:54:19 +08:00
// NOTE(review): the statement guarded by each OUT_OF_REGISTERS test in
// this function is not visible in this extract -- confirm.
if (registerFile().status() & RegisterFile::OUT_OF_REGISTERS)
2008-10-21 22:00:00 +08:00
comment("compute repeat/clamp");
int u = scratches.obtain();
int v = scratches.obtain();
int width = scratches.obtain();
int height = scratches.obtain();
int U = 0;
int V = 0;
2012-02-02 02:54:19 +08:00
if (registerFile().status() & RegisterFile::OUT_OF_REGISTERS)
2008-10-21 22:00:00 +08:00
CONTEXT_LOAD(width, generated_vars.texture[i].width);
CONTEXT_LOAD(height, generated_vars.texture[i].height);
// NOTE(review): no assignment to FRAC_BITS is visible in any branch below;
// the comments imply a per-format value (7 bits for 8-bit textures, 4 bits
// in the default case) -- confirm against the upstream file.
int FRAC_BITS = 0;
if (tmu.linear) {
// linear interpolation
if (tmu.format.size == 1) {
// for 8-bits textures, we can afford
// 7 bits of fractional precision at no
// additional cost (we can't do 8 bits
// because filter8 uses signed 16 bits muls)
} else if (tmu.format.size == 2) {
// filter16() is internally limited to 4 bits, so:
// FRAC_BITS=2 generates less instructions,
// FRAC_BITS=3,4,5 creates unpleasant artifacts,
// FRAC_BITS=6+ looks good
} else if (tmu.format.size == 4) {
// filter32() is internally limited to 8 bits, so:
// FRAC_BITS=4 looks good
// FRAC_BITS=5+ looks better, but generates 3 extra ipp
} else {
// for all other cases we use 4 bits.
wrapping(u, s.reg, width, tmu.swrap, FRAC_BITS);
wrapping(v, t.reg, height, tmu.twrap, FRAC_BITS);
if (tmu.linear) {
comment("compute linear filtering offsets");
// pixel size scale
// presumably log2 of the pixel size in bytes (gglClz looks like a
// count-leading-zeros helper) -- confirm
const int shift = 31 - gglClz(tmu.format.size);
U = scratches.obtain();
V = scratches.obtain();
2012-02-02 02:54:19 +08:00
if (registerFile().status() & RegisterFile::OUT_OF_REGISTERS)
2008-10-21 22:00:00 +08:00
// sample the texel center
SUB(AL, 0, u, u, imm(1<<(FRAC_BITS-1)));
SUB(AL, 0, v, v, imm(1<<(FRAC_BITS-1)));
// get the fractional part of U,V
AND(AL, 0, U, u, imm((1<<FRAC_BITS)-1));
AND(AL, 0, V, v, imm((1<<FRAC_BITS)-1));
// compute width-1 and height-1
SUB(AL, 0, width, width, imm(1));
SUB(AL, 0, height, height, imm(1));
// get the integer part of U,V and clamp/wrap
// and compute offset to the next texel
if (tmu.swrap == GGL_NEEDS_WRAP_REPEAT) {
// u has already been REPEATed
MOV(AL, 1, u, reg_imm(u, ASR, FRAC_BITS));
MOV(MI, 0, u, width);
CMP(AL, u, width);
MOV(LT, 0, width, imm(1 << shift));
if (shift)
MOV(GE, 0, width, reg_imm(width, LSL, shift));
RSB(GE, 0, width, width, imm(0));
} else {
// u has not been CLAMPed yet
// algorithm:
// if ((u>>4) >= width)
// u = width<<4
// width = 0
// else
// width = 1<<shift
// u = u>>4; // get integer part
// if (u<0)
// u = 0
// width = 0
// generated_vars.rt = width
CMP(AL, width, reg_imm(u, ASR, FRAC_BITS));
MOV(LE, 0, u, reg_imm(width, LSL, FRAC_BITS));
MOV(LE, 0, width, imm(0));
MOV(GT, 0, width, imm(1 << shift));
MOV(AL, 1, u, reg_imm(u, ASR, FRAC_BITS));
MOV(MI, 0, u, imm(0));
MOV(MI, 0, width, imm(0));
// at this point `width` holds the offset to the horizontally adjacent
// texel; it is saved in generated_vars.rt
CONTEXT_STORE(width, generated_vars.rt);
// the register that held `width` is reused to hold the stride
const int stride = width;
CONTEXT_LOAD(stride, generated_vars.texture[i].stride);
if (tmu.twrap == GGL_NEEDS_WRAP_REPEAT) {
// v has already been REPEATed
MOV(AL, 1, v, reg_imm(v, ASR, FRAC_BITS));
MOV(MI, 0, v, height);
CMP(AL, v, height);
MOV(LT, 0, height, imm(1 << shift));
if (shift)
MOV(GE, 0, height, reg_imm(height, LSL, shift));
RSB(GE, 0, height, height, imm(0));
MUL(AL, 0, height, stride, height);
} else {
// NOTE(review): the lines from here down to the next timestamp are not C++;
// they read like a commit message that was pasted into this extract.
Adds UXTB16 support to Pixelflinger
* Add support for UXTB16 to the disassembler
* Add encoding of the UXTB16 instruction to the Pixelflinger JIT.
Introducing the UXTB16 instruction allows removal of some masking code, and is
beneficial from a pipeline point of view - lots of UXTB16 followed by MUL
Also, further rescheduling and use of SMULWB brings extra performance
* Use UXTB16 in bilinear filtered texturing
Uses UXTB16 to extract channels for SIMD operations, rather than creating and
ANDing with masks. Saves a register and is faster on A8, as UXTB16 result can
feed into first stage of multiply, unlike AND.
Also, used SMULWB rather than SMULBB, which allows removal of MOVs used to
rescale results.
Code has been scheduled for A8 pipeline, specifically aiming to allow
multiplies to issue in pipeline 0, for efficient dual issue operation.
Testing on SpriteMethodTest (http://code.google.com/p/apps-for-android/) gives
8% improvement (12.7 vs. 13.7 fps.)
SMULBB to SMULWB trick could be used in <v6 code path, but this hasn't been
2009-12-07 21:59:59 +08:00
// v has not been CLAMPed yet
2008-10-21 22:00:00 +08:00
CMP(AL, height, reg_imm(v, ASR, FRAC_BITS));
MOV(LE, 0, v, reg_imm(height, LSL, FRAC_BITS));
MOV(LE, 0, height, imm(0));
if (shift) {
MOV(GT, 0, height, reg_imm(stride, LSL, shift));
} else {
MOV(GT, 0, height, stride);
MOV(AL, 1, v, reg_imm(v, ASR, FRAC_BITS));
MOV(MI, 0, v, imm(0));
MOV(MI, 0, height, imm(0));
// `height` now holds the offset to the vertically adjacent texel; it is
// saved in generated_vars.lb
CONTEXT_STORE(height, generated_vars.lb);
// iterate texture coordinates...
comment("iterate s,t");
int dsdx = scratches.obtain();
int dtdx = scratches.obtain();
2012-02-02 02:54:19 +08:00
if (registerFile().status() & RegisterFile::OUT_OF_REGISTERS)
2008-10-21 22:00:00 +08:00
CONTEXT_LOAD(dsdx, generated_vars.texture[i].dsdx);
CONTEXT_LOAD(dtdx, generated_vars.texture[i].dtdx);
ADD(AL, 0, s.reg, s.reg, dsdx);
ADD(AL, 0, t.reg, t.reg, dtdx);
if ((mOptLevel&1)==0) {
CONTEXT_STORE(s.reg, generated_vars.texture[i].spill[0]);
CONTEXT_STORE(t.reg, generated_vars.texture[i].spill[1]);
// merge base & offset...
comment("merge base & offset");
texel.setTo(regs.obtain(), &tmu.format);
// the texel register doubles as the pointer register
txPtr.setTo(texel.reg, tmu.bits);
int stride = scratches.obtain();
2012-02-02 02:54:19 +08:00
if (registerFile().status() & RegisterFile::OUT_OF_REGISTERS)
2008-10-21 22:00:00 +08:00
CONTEXT_LOAD(stride, generated_vars.texture[i].stride);
2013-02-21 18:27:40 +08:00
CONTEXT_ADDR_LOAD(txPtr.reg, generated_vars.texture[i].data);
2008-10-21 22:00:00 +08:00
SMLABB(AL, u, v, stride, u); // u+v*stride
base_offset(txPtr, txPtr, u);
// load texel
if (!tmu.linear) {
comment("fetch texel");
load(txPtr, texel, 0);
} else {
// recycle registers we don't need anymore
comment("fetch texel, bilinear");
// dispatch on the pixel size in bytes
switch (tmu.format.size) {
case 1: filter8(parts, texel, tmu, U, V, txPtr, FRAC_BITS); break;
case 2: filter16(parts, texel, tmu, U, V, txPtr, FRAC_BITS); break;
case 3: filter24(parts, texel, tmu, U, V, txPtr, FRAC_BITS); break;
case 4: filter32(parts, texel, tmu, U, V, txPtr, FRAC_BITS); break;
// Emits the code that advances the texture coordinates of every active
// texture unit by one pixel, without fetching a texel.
//
// 1:1 units advance their texel pointer by one pixel (txPtr.size / 8 bytes).
// Other units add dsdx / dtdx to s / t, going through the context spill
// slots when bit 0 of mOptLevel is clear.
//
// NOTE(review): brace-only lines and any `continue` are absent from this
// extract, so the nesting below is not recoverable here -- confirm upstream.
void GGLAssembler::build_iterate_texture_coordinates(
const fragment_parts_t& parts)
for (int i=0 ; i<GGL_TEXTURE_UNIT_COUNT; i++) {
const texture_unit_t& tmu = mTextureMachine.tmu[i];
if (tmu.format_idx == 0)
// NOTE(review): the statement guarded by the test above is not visible in
// this extract (presumably it skips unused units) -- confirm.
if ((tmu.swrap == GGL_NEEDS_WRAP_11) &&
(tmu.twrap == GGL_NEEDS_WRAP_11))
{ // 1:1 textures
const pointer_t& txPtr = parts.coords[i].ptr;
// txPtr.size is in bits, hence the >>3 to get a byte increment
ADD(AL, 0, txPtr.reg, txPtr.reg, imm(txPtr.size>>3));
} else {
Scratch scratches(registerFile());
int s = parts.coords[i].s.reg;
int t = parts.coords[i].t.reg;
if ((mOptLevel&1)==0) {
// s and t live in the spill slots: load them into scratch registers
s = scratches.obtain();
t = scratches.obtain();
CONTEXT_LOAD(s, generated_vars.texture[i].spill[0]);
CONTEXT_LOAD(t, generated_vars.texture[i].spill[1]);
int dsdx = scratches.obtain();
int dtdx = scratches.obtain();
CONTEXT_LOAD(dsdx, generated_vars.texture[i].dsdx);
CONTEXT_LOAD(dtdx, generated_vars.texture[i].dtdx);
ADD(AL, 0, s, s, dsdx);
ADD(AL, 0, t, t, dtdx);
if ((mOptLevel&1)==0) {
// write the advanced values back to the spill slots
CONTEXT_STORE(s, generated_vars.texture[i].spill[0]);
CONTEXT_STORE(t, generated_vars.texture[i].spill[1]);
// Emits a bilinear fetch for 1-byte-per-pixel textures.
//
// Four texels are read: the one at txPtr, the ones at the offsets stored in
// generated_vars.rt and generated_vars.lb, and the one at rt + lb. They are
// summed with weights built from the fractional coordinates U and V. The
// result is left in texel.reg and texel.format is updated to describe where
// the 8 significant bits sit in the 32-bit result.
//
//  texel      destination pixel; its format is rewritten at the end.
//  tmu        texture unit; only tmu.format.components is read here.
//  U, V       registers holding the fractional parts; both are overwritten
//             (U becomes 1-U, V becomes 1-V).
//  txPtr      pointer to the top-left texel.
//
// NOTE(review): the last parameter line (FRAC_BITS is used in the body),
// brace-only lines and any `return` are absent from this extract, and blame
// timestamps are interleaved in the signature -- confirm against upstream.
void GGLAssembler::filter8(
2014-02-17 23:15:46 +08:00
const fragment_parts_t& /*parts*/,
2008-10-21 22:00:00 +08:00
pixel_t& texel, const texture_unit_t& tmu,
int U, int V, pointer_t& txPtr,
if (tmu.format.components != GGL_ALPHA &&
tmu.format.components != GGL_LUMINANCE)
// this is a packed format, and we don't support
// linear filtering (it's probably RGB 332)
// Should not happen with OpenGL|ES
LDRB(AL, texel.reg, txPtr.reg);
// ------------------------
// about ~22 cycles / pixel
Scratch scratches(registerFile());
int pixel= scratches.obtain();
int d = scratches.obtain();
int u = scratches.obtain();
int k = scratches.obtain();
int rt = scratches.obtain();
int lb = scratches.obtain();
// d accumulates the weighted sum; k tracks the weight that is still
// unassigned, starting from 1 << (2*FRAC_BITS).
// RB -> U * V
CONTEXT_LOAD(rt, generated_vars.rt);
CONTEXT_LOAD(lb, generated_vars.lb);
// `offset` shares the register of `pixel`: it is consumed by the load that
// overwrites it
int offset = pixel;
ADD(AL, 0, offset, lb, rt);
LDRB(AL, pixel, txPtr.reg, reg_scale_pre(offset));
SMULBB(AL, u, U, V);
SMULBB(AL, d, pixel, u);
RSB(AL, 0, k, u, imm(1<<(FRAC_BITS*2)));
// LB -> (1-U) * V
RSB(AL, 0, U, U, imm(1<<FRAC_BITS));
LDRB(AL, pixel, txPtr.reg, reg_scale_pre(lb));
SMULBB(AL, u, U, V);
SMLABB(AL, d, pixel, u, d);
SUB(AL, 0, k, k, u);
// LT -> (1-U)*(1-V)
RSB(AL, 0, V, V, imm(1<<FRAC_BITS));
LDRB(AL, pixel, txPtr.reg);
SMULBB(AL, u, U, V);
SMLABB(AL, d, pixel, u, d);
// RT -> U*(1-V)
// the last weight is whatever remains (k - u), so the four weights always
// add up to 1 << (2*FRAC_BITS)
LDRB(AL, pixel, txPtr.reg, reg_scale_pre(rt));
SUB(AL, 0, u, k, u);
SMLABB(AL, texel.reg, pixel, u, d);
// describe the result: every component present in the format now occupies
// bits [2*FRAC_BITS, 2*FRAC_BITS + 8) of a 32-bit value
for (int i=0 ; i<4 ; i++) {
if (!texel.format.c[i].h) continue;
texel.format.c[i].h = FRAC_BITS*2+8;
texel.format.c[i].l = FRAC_BITS*2; // keeping 8 bits is enough
texel.format.size = 4;
texel.format.bitsPerPixel = 32;
texel.flags |= CLEAR_LO;
// Emits a bilinear fetch for 2-byte-per-pixel textures.
//
// Each 16-bit texel is spread over 32 bits (pixel | pixel << shift, then
// AND-ed with `mask`) so that its components are separated by enough empty
// bits to be scaled with a single multiply. The four neighbours are then
// summed with weights of `prec` bits. The result is left in texel.reg and
// texel.format is updated to describe the new component positions.
//
//  texel      destination pixel; its format is rewritten.
//  tmu        texture unit; format_idx selects mask/shift/prec.
//  U, V       registers holding the fractional parts; both are overwritten.
//  txPtr      pointer to the top-left texel.
//
// NOTE(review): the last parameter line (FRAC_BITS is used in the body), the
// `case` labels of the switch, brace-only lines and any `break`/`return` are
// absent from this extract -- confirm against the upstream file.
void GGLAssembler::filter16(
2014-02-17 23:15:46 +08:00
const fragment_parts_t& /*parts*/,
2008-10-21 22:00:00 +08:00
pixel_t& texel, const texture_unit_t& tmu,
int U, int V, pointer_t& txPtr,
// compute the mask
// XXX: it would be nice if the mask below could be computed
// automatically.
uint32_t mask = 0;
int shift = 0;
int prec = 0;
switch (tmu.format_idx) {
// Per the bit-layout comments, the three groups below are a 5-6-5 layout,
// a 4-4-4-4 layout and an 8-8 layout; the matching case labels are not
// visible here.
// source: 00000ggg.ggg00000 | rrrrr000.000bbbbb
// result: gggggggg.gggrrrrr | rrrrr0bb.bbbbbbbb
mask = 0x07E0F81F;
shift = 16;
prec = 5;
// 0000,1111,0000,1111 | 0000,1111,0000,1111
mask = 0x0F0F0F0F;
shift = 12;
prec = 4;
// 0000,0000,1111,1111 | 0000,0000,1111,1111
// AALL -> 00AA | 00LL
mask = 0x00FF00FF;
shift = 8;
prec = 8;
// unsupported format, do something sensical...
2012-01-06 22:13:42 +08:00
ALOGE("Unsupported 16-bits texture format (%d)", tmu.format_idx);
2008-10-21 22:00:00 +08:00
LDRH(AL, texel.reg, txPtr.reg);
// U*V carries 2*FRAC_BITS fractional bits; `adjust` is how many of them are
// dropped to get a `prec`-bit weight. `round` is 0, so the rounding ADDs
// below are never emitted.
const int adjust = FRAC_BITS*2 - prec;
const int round = 0;
// update the texel format
texel.format.size = 4;
texel.format.bitsPerPixel = 32;
texel.flags |= CLEAR_HI|CLEAR_LO;
for (int i=0 ; i<4 ; i++) {
if (!texel.format.c[i].h) continue;
// components kept in place by `mask` are not moved; the others end up
// `shift` bits higher
const uint32_t offset = (mask & tmu.format.mask(i)) ? 0 : shift;
texel.format.c[i].h = tmu.format.c[i].h + offset + prec;
texel.format.c[i].l = texel.format.c[i].h - (tmu.format.bits(i) + prec);
// ------------------------
// about ~40 cycles / pixel
Scratch scratches(registerFile());
int pixel= scratches.obtain();
int d = scratches.obtain();
int u = scratches.obtain();
int k = scratches.obtain();
// d accumulates the weighted sum; k tracks the weight that is still
// unassigned, starting from 1 << prec.
// RB -> U * V
// `offset` shares the register of `pixel`
int offset = pixel;
CONTEXT_LOAD(offset, generated_vars.rt);
CONTEXT_LOAD(u, generated_vars.lb);
ADD(AL, 0, offset, offset, u);
LDRH(AL, pixel, txPtr.reg, reg_pre(offset));
SMULBB(AL, u, U, V);
ORR(AL, 0, pixel, pixel, reg_imm(pixel, LSL, shift));
build_and_immediate(pixel, pixel, mask, 32);
if (adjust) {
if (round)
ADD(AL, 0, u, u, imm(1<<(adjust-1)));
MOV(AL, 0, u, reg_imm(u, LSR, adjust));
MUL(AL, 0, d, pixel, u);
RSB(AL, 0, k, u, imm(1<<prec));
// LB -> (1-U) * V
CONTEXT_LOAD(offset, generated_vars.lb);
RSB(AL, 0, U, U, imm(1<<FRAC_BITS));
LDRH(AL, pixel, txPtr.reg, reg_pre(offset));
SMULBB(AL, u, U, V);
ORR(AL, 0, pixel, pixel, reg_imm(pixel, LSL, shift));
build_and_immediate(pixel, pixel, mask, 32);
if (adjust) {
if (round)
ADD(AL, 0, u, u, imm(1<<(adjust-1)));
MOV(AL, 0, u, reg_imm(u, LSR, adjust));
MLA(AL, 0, d, pixel, u, d);
SUB(AL, 0, k, k, u);
// LT -> (1-U)*(1-V)
RSB(AL, 0, V, V, imm(1<<FRAC_BITS));
LDRH(AL, pixel, txPtr.reg);
SMULBB(AL, u, U, V);
ORR(AL, 0, pixel, pixel, reg_imm(pixel, LSL, shift));
build_and_immediate(pixel, pixel, mask, 32);
if (adjust) {
if (round)
ADD(AL, 0, u, u, imm(1<<(adjust-1)));
MOV(AL, 0, u, reg_imm(u, LSR, adjust));
MLA(AL, 0, d, pixel, u, d);
// RT -> U*(1-V)
// the last weight is whatever remains (k - u)
CONTEXT_LOAD(offset, generated_vars.rt);
LDRH(AL, pixel, txPtr.reg, reg_pre(offset));
SUB(AL, 0, u, k, u);
ORR(AL, 0, pixel, pixel, reg_imm(pixel, LSL, shift));
build_and_immediate(pixel, pixel, mask, 32);
MLA(AL, 0, texel.reg, pixel, u, d);
void GGLAssembler::filter24(
2014-02-17 23:15:46 +08:00
const fragment_parts_t& /*parts*/,
pixel_t& texel, const texture_unit_t& /*tmu*/,
int /*U*/, int /*V*/, pointer_t& txPtr,
int /*FRAC_BITS*/)
2008-10-21 22:00:00 +08:00
// not supported yet (currently disabled)
load(txPtr, texel, 0);
// Emits a bilinear fetch for 4-byte-per-pixel textures.
//
// Each texel is split in two halves with the 0x00FF00FF mask: bytes 0 and 2
// (accumulated in dh) and bytes 1 and 3 (accumulated in dl, taken from
// pixel >> 8). Both halves are scaled by the same 8-bit weight so that two
// channels are processed per multiply. The halves are recombined into
// texel.reg at the end.
//
//  texel  destination pixel.
//  U, V   registers holding the fractional parts; both are overwritten
//         (U becomes 1-U, V becomes 1-V).
//  txPtr  pointer to the top-left texel.
//
// NOTE(review): the last parameter line (FRAC_BITS is used in the body) and
// brace-only lines are absent from this extract, and blame timestamps are
// interleaved in the signature -- confirm against the upstream file.
void GGLAssembler::filter32(
2014-02-17 23:15:46 +08:00
const fragment_parts_t& /*parts*/,
pixel_t& texel, const texture_unit_t& /*tmu*/,
2008-10-21 22:00:00 +08:00
int U, int V, pointer_t& txPtr,
// U*V carries 2*FRAC_BITS fractional bits; `adjust` is how many are dropped
// to get an 8-bit weight. `round` is 0, so the rounding ADDs below are never
// emitted.
const int adjust = FRAC_BITS*2 - 8;
const int round = 0;
// ------------------------
// about ~38 cycles / pixel
Scratch scratches(registerFile());
int pixel= scratches.obtain();
int dh = scratches.obtain();
int u = scratches.obtain();
int k = scratches.obtain();
int temp = scratches.obtain();
int dl = scratches.obtain();
int mask = scratches.obtain();
// mask = 0x00FF00FF, built from two immediates
MOV(AL, 0, mask, imm(0xFF));
ORR(AL, 0, mask, mask, imm(0xFF0000));
// k tracks the weight that is still unassigned, starting from 0x100.
// RB -> U * V
// `offset` shares the register of `pixel`
int offset = pixel;
CONTEXT_LOAD(offset, generated_vars.rt);
CONTEXT_LOAD(u, generated_vars.lb);
ADD(AL, 0, offset, offset, u);
LDR(AL, pixel, txPtr.reg, reg_scale_pre(offset));
SMULBB(AL, u, U, V);
AND(AL, 0, temp, mask, pixel);
if (adjust) {
if (round)
ADD(AL, 0, u, u, imm(1<<(adjust-1)));
MOV(AL, 0, u, reg_imm(u, LSR, adjust));
MUL(AL, 0, dh, temp, u);
AND(AL, 0, temp, mask, reg_imm(pixel, LSR, 8));
MUL(AL, 0, dl, temp, u);
RSB(AL, 0, k, u, imm(0x100));
// LB -> (1-U) * V
CONTEXT_LOAD(offset, generated_vars.lb);
RSB(AL, 0, U, U, imm(1<<FRAC_BITS));
LDR(AL, pixel, txPtr.reg, reg_scale_pre(offset));
SMULBB(AL, u, U, V);
AND(AL, 0, temp, mask, pixel);
if (adjust) {
if (round)
ADD(AL, 0, u, u, imm(1<<(adjust-1)));
MOV(AL, 0, u, reg_imm(u, LSR, adjust));
MLA(AL, 0, dh, temp, u, dh);
AND(AL, 0, temp, mask, reg_imm(pixel, LSR, 8));
MLA(AL, 0, dl, temp, u, dl);
SUB(AL, 0, k, k, u);
// LT -> (1-U)*(1-V)
RSB(AL, 0, V, V, imm(1<<FRAC_BITS));
LDR(AL, pixel, txPtr.reg);
SMULBB(AL, u, U, V);
AND(AL, 0, temp, mask, pixel);
if (adjust) {
if (round)
ADD(AL, 0, u, u, imm(1<<(adjust-1)));
MOV(AL, 0, u, reg_imm(u, LSR, adjust));
MLA(AL, 0, dh, temp, u, dh);
AND(AL, 0, temp, mask, reg_imm(pixel, LSR, 8));
MLA(AL, 0, dl, temp, u, dl);
// RT -> U*(1-V)
// the last weight is whatever remains (k - u)
CONTEXT_LOAD(offset, generated_vars.rt);
LDR(AL, pixel, txPtr.reg, reg_scale_pre(offset));
SUB(AL, 0, u, k, u);
AND(AL, 0, temp, mask, pixel);
MLA(AL, 0, dh, temp, u, dh);
AND(AL, 0, temp, mask, reg_imm(pixel, LSR, 8));
MLA(AL, 0, dl, temp, u, dl);
// recombine: dh >> 8 gives bytes 0 and 2, dl masked with 0xFF00FF00 gives
// bytes 1 and 3
AND(AL, 0, dh, mask, reg_imm(dh, LSR, 8));
AND(AL, 0, dl, dl, reg_imm(mask, LSL, 8));
ORR(AL, 0, texel.reg, dh, dl);
// Emits the texture-environment code for one color component.
//
// Every texture unit whose mask contains the component, and whose result is
// not replaced by another unit (tmu.replaced), combines its texel with the
// incoming fragment according to tmu.env.
//
//  fragment   in/out: the component being built.
//  parts      per-fragment state; parts.texel[i] and parts.coords[i] are read.
//  component  index of the color component.
//  regs       scratch allocator passed to modify().
//
// NOTE(review): most `case` labels of the switch, its `break` statements,
// brace-only lines and some blame timestamps are missing or interleaved in
// this extract -- confirm against the upstream file.
void GGLAssembler::build_texture_environment(
component_t& fragment,
const fragment_parts_t& parts,
int component,
Scratch& regs)
const uint32_t component_mask = 1<<component;
const bool multiTexture = mTextureMachine.activeUnits > 1;
for (int i=0 ; i<GGL_TEXTURE_UNIT_COUNT ; i++) {
texture_unit_t& tmu = mTextureMachine.tmu[i];
if (tmu.mask & component_mask) {
// replace or modulate with this texture
if ((tmu.replaced & component_mask) == 0) {
// not replaced by a later tmu...
Scratch scratches(registerFile());
// work on a copy so that parts.texel[i] itself is left untouched
pixel_t texel(parts.texel[i]);
2012-02-02 02:54:19 +08:00
2008-10-21 22:00:00 +08:00
if (multiTexture &&
tmu.swrap == GGL_NEEDS_WRAP_11 &&
tmu.twrap == GGL_NEEDS_WRAP_11)
// with several units, 1:1 texels are fetched here into a scratch register
texel.reg = scratches.obtain();
texel.flags |= CORRUPTIBLE;
comment("fetch texel (multitexture 1:1)");
load(parts.coords[i].ptr, texel, WRITE_BACK);
// keep a copy of the incoming fragment description before modify() is
// called on it
component_t incoming(fragment);
modify(fragment, regs);
switch (tmu.env) {
// NOTE(review): the labels selecting the four calls below are not visible;
// by name they look like the replace, modulate, decal and blend cases.
extract(fragment, texel, component);
modulate(fragment, incoming, texel, component);
decal(fragment, incoming, texel, component);
blend(fragment, incoming, texel, component, i);
2008-12-18 10:08:08 +08:00
case GGL_ADD:
add(fragment, incoming, texel, component);
2008-10-21 22:00:00 +08:00
// ---------------------------------------------------------------------------
void GGLAssembler::wrapping(
int d,
int coord, int size,
int tx_wrap, int tx_linear)
// notes:
// if tx_linear is set, we need 4 extra bits of precision on the result
// SMULL/UMULL is 3 cycles
Scratch scratches(registerFile());
int c = coord;
if (tx_wrap == GGL_NEEDS_WRAP_REPEAT) {
// UMULL takes 4 cycles (interlocked), and we can get away with
// 2 cycles using SMULWB, but we're loosing 16 bits of precision
// out of 32 (this is not a problem because the iterator keeps
// its full precision)
// UMULL(AL, 0, size, d, c, size);
// note: we can't use SMULTB because it's signed.
MOV(AL, 0, d, reg_imm(c, LSR, 16-tx_linear));
SMULWB(AL, d, d, size);
} else if (tx_wrap == GGL_NEEDS_WRAP_CLAMP_TO_EDGE) {
if (tx_linear) {
// 1 cycle
MOV(AL, 0, d, reg_imm(coord, ASR, 16-tx_linear));
} else {
// 4 cycles (common case)
MOV(AL, 0, d, reg_imm(coord, ASR, 16));
BIC(AL, 0, d, d, reg_imm(d, ASR, 31));
CMP(AL, d, size);
SUB(GE, 0, d, size, imm(1));
// ---------------------------------------------------------------------------
// Emits the code for the MODULATE texture environment on one component:
// dest = incoming * texel, normalized.
//
// Three cases are distinguished by the bit sizes Nt (texel) and Ni (incoming):
//  - Nt == 1: the texel acts as a bit mask over the incoming value;
//  - Ni == 1: the incoming value acts as a bit mask over the texel;
//  - otherwise: a multiply, where the operand with the lower precision is
//    first adjusted (x + x >> (N-1)) to approximate a division by (1<<N)-1.
// dest.l / dest.h are updated to describe where the result sits.
//
//  dest           result component (register already assigned by the caller).
//  incoming       incoming fragment component.
//  incomingTexel  texel the component is extracted from.
//  component      index of the color component.
//
// NOTE(review): brace-only lines are absent from this extract, so the exact
// nesting below is not recoverable here -- confirm against the upstream file.
void GGLAssembler::modulate(
component_t& dest,
const component_t& incoming,
const pixel_t& incomingTexel, int component)
Scratch locals(registerFile());
integer_t texel(locals.obtain(), 32, CORRUPTIBLE);
extract(texel, incomingTexel, component);
const int Nt = texel.size();
// Nt should always be less than 10 bits because it comes
// from the TMU.
int Ni = incoming.size();
// Ni could be big because it comes from previous MODULATEs
if (Nt == 1) {
// texel acts as a bit-mask
// dest = incoming & ((texel << incoming.h)-texel)
RSB(AL, 0, dest.reg, texel.reg, reg_imm(texel.reg, LSL, incoming.h));
AND(AL, 0, dest.reg, dest.reg, incoming.reg);
dest.l = incoming.l;
dest.h = incoming.h;
dest.flags |= (incoming.flags & CLEAR_LO);
} else if (Ni == 1) {
// move the single incoming bit to bit 31, then use an arithmetic shift to
// turn it into an all-ones / all-zeros mask applied to the texel
MOV(AL, 0, dest.reg, reg_imm(incoming.reg, LSL, 31-incoming.h));
AND(AL, 0, dest.reg, texel.reg, reg_imm(dest.reg, ASR, 31));
dest.l = 0;
dest.h = Nt;
} else {
int inReg = incoming.reg;
int shift = incoming.l;
if ((Nt + Ni) > 32) {
// we will overflow, reduce the precision of Ni to 8 bits
// (Note Nt cannot be more than 10 bits which happens with
// 565 textures and GGL_LINEAR)
shift += Ni-8;
Ni = 8;
// modulate by the component with the lowest precision
if (Nt >= Ni) {
if (shift) {
// XXX: we should be able to avoid this shift
// when shift==16 && Nt<16 && Ni<16, in which
// we could use SMULBT below.
MOV(AL, 0, dest.reg, reg_imm(inReg, LSR, shift));
inReg = dest.reg;
shift = 0;
// operation: (Cf*Ct)/((1<<Ni)-1)
// approximated with: Cf*(Ct + Ct>>(Ni-1))>>Ni
// this operation doesn't change texel's size
ADD(AL, 0, dest.reg, inReg, reg_imm(inReg, LSR, Ni-1));
// a 16x16 multiply is enough when both operands fit in 16 bits
if (Nt<16 && Ni<16) SMULBB(AL, dest.reg, texel.reg, dest.reg);
else MUL(AL, 0, dest.reg, texel.reg, dest.reg);
dest.l = Ni;
dest.h = Nt + Ni;
} else {
if (shift && (shift != 16)) {
// if shift==16, we can use 16-bits mul instructions later
MOV(AL, 0, dest.reg, reg_imm(inReg, LSR, shift));
inReg = dest.reg;
shift = 0;
// operation: (Cf*Ct)/((1<<Nt)-1)
// approximated with: Ct*(Cf + Cf>>(Nt-1))>>Nt
// this operation doesn't change incoming's size
Scratch scratches(registerFile());
// pick a register for the adjusted texel that does not alias inReg
int t = (texel.flags & CORRUPTIBLE) ? texel.reg : dest.reg;
if (t == inReg)
t = scratches.obtain();
ADD(AL, 0, t, texel.reg, reg_imm(texel.reg, LSR, Nt-1));
if (Nt<16 && Ni<16) {
// shift == 16 means the incoming value sits in the top half-word
if (shift==16) SMULBT(AL, dest.reg, t, inReg);
else SMULBB(AL, dest.reg, t, inReg);
} else MUL(AL, 0, dest.reg, t, inReg);
dest.l = Nt;
dest.h = Nt + Ni;
// low bits are not valid
dest.flags |= CLEAR_LO;
// no need to keep more than 8 bits/component
if (dest.size() > 8)
dest.l = dest.h-8;
// Emits the code for the DECAL texture environment on one component, using
// the texel's alpha as the interpolation factor between the incoming value
// and the texel value (see the formula below).
//
//  dest           result component (register already assigned by the caller).
//  incoming       incoming fragment component.
//  incomingTexel  texel the component and the alpha factor are extracted from.
//  component      index of the color component.
//
// NOTE(review): brace-only lines are absent from this extract, so the exact
// nesting below is not recoverable here -- confirm against the upstream file.
void GGLAssembler::decal(
component_t& dest,
const component_t& incoming,
const pixel_t& incomingTexel, int component)
// RGBA:
// Cv = Cf*(1 - At) + Ct*At = Cf + (Ct - Cf)*At
// Av = Af
Scratch locals(registerFile());
integer_t texel(locals.obtain(), 32, CORRUPTIBLE);
integer_t factor(locals.obtain(), 32, CORRUPTIBLE);
extract(texel, incomingTexel, component);
extract(factor, incomingTexel, GGLFormat::ALPHA);
// no need to keep more than 8-bits for decal
int Ni = incoming.size();
int shift = incoming.l;
if (Ni > 8) {
shift += Ni-8;
Ni = 8;
integer_t incomingNorm(incoming.reg, Ni, incoming.flags);
if (shift) {
// bring the kept bits of the incoming value down to bit 0, in dest.reg
MOV(AL, 0, dest.reg, reg_imm(incomingNorm.reg, LSR, shift));
incomingNorm.reg = dest.reg;
incomingNorm.flags |= CORRUPTIBLE;
// factor += factor >> (s-1): maps the maximum factor value to 1 << s
ADD(AL, 0, factor.reg, factor.reg, reg_imm(factor.reg, LSR, factor.s-1));
build_blendOneMinusFF(dest, factor, incomingNorm, texel);
// Emits the code for the BLEND texture environment on one component, using
// the texel value as the interpolation factor between the incoming value and
// a constant color (see the formula below). The alpha component is handled
// with modulate() instead.
//
//  dest           result component (register already assigned by the caller).
//  incoming       incoming fragment component.
//  incomingTexel  texel the factor is extracted from.
//  component      index of the color component.
//  tmu            index of the texture unit.
//
// NOTE(review): this extract is damaged -- the LDRB call below is truncated
// (its last argument line is missing), and brace-only lines and any `return`
// after the alpha case are absent. Confirm against the upstream file.
void GGLAssembler::blend(
component_t& dest,
const component_t& incoming,
const pixel_t& incomingTexel, int component, int tmu)
// RGBA:
// Cv = (1 - Ct)*Cf + Ct*Cc = Cf + (Cc - Cf)*Ct
// Av = At*Af
if (component == GGLFormat::ALPHA) {
modulate(dest, incoming, incomingTexel, component);
Scratch locals(registerFile());
integer_t color(locals.obtain(), 8, CORRUPTIBLE);
integer_t factor(locals.obtain(), 32, CORRUPTIBLE);
// loads the 8-bit constant color (Cc) from the context
LDRB(AL, color.reg, mBuilderContext.Rctx,
extract(factor, incomingTexel, component);
// no need to keep more than 8-bits for blend
int Ni = incoming.size();
int shift = incoming.l;
if (Ni > 8) {
shift += Ni-8;
Ni = 8;
integer_t incomingNorm(incoming.reg, Ni, incoming.flags);
if (shift) {
// bring the kept bits of the incoming value down to bit 0, in dest.reg
MOV(AL, 0, dest.reg, reg_imm(incomingNorm.reg, LSR, shift));
incomingNorm.reg = dest.reg;
incomingNorm.flags |= CORRUPTIBLE;
// factor += factor >> (s-1): maps the maximum factor value to 1 << s
ADD(AL, 0, factor.reg, factor.reg, reg_imm(factor.reg, LSR, factor.s-1));
build_blendOneMinusFF(dest, factor, incomingNorm, color);
2008-12-18 10:08:08 +08:00
// Emits the code for the ADD texture environment on one component:
// dest = incoming + texel.
//
// The operand with fewer bits is first expanded to the size of the other
// one, so that both are added at the same precision. The result is described
// as starting at bit 0 (dest.l = 0) with the texel's size.
//
//  dest           result component (register already assigned by the caller).
//  incoming       incoming fragment component.
//  incomingTexel  texel the component is extracted from.
//  component      index of the color component.
//
// NOTE(review): brace-only lines are absent from this extract, so the exact
// nesting below is not recoverable here -- confirm against the upstream file.
void GGLAssembler::add(
component_t& dest,
const component_t& incoming,
const pixel_t& incomingTexel, int component)
// RGBA:
// Cv = Cf + Ct;
Scratch locals(registerFile());
// local copy, so that the register and size can be changed without
// touching the caller's `incoming`
component_t incomingTemp(incoming);
// use "dest" as a temporary for extracting the texel, unless "dest"
// overlaps "incoming".
integer_t texel(dest.reg, 32, CORRUPTIBLE);
if (dest.reg == incomingTemp.reg)
texel.reg = locals.obtain();
extract(texel, incomingTexel, component);
if (texel.s < incomingTemp.size()) {
expand(texel, texel, incomingTemp.size());
} else if (texel.s > incomingTemp.size()) {
if (incomingTemp.flags & CORRUPTIBLE) {
// the incoming register may be overwritten: expand in place
expand(incomingTemp, incomingTemp, texel.s);
} else {
// otherwise expand into a fresh scratch register
incomingTemp.reg = locals.obtain();
expand(incomingTemp, incoming, texel.s);
if (incomingTemp.l) {
// the incoming value does not start at bit 0: shift it down while adding
ADD(AL, 0, dest.reg, texel.reg,
reg_imm(incomingTemp.reg, LSR, incomingTemp.l));
} else {
ADD(AL, 0, dest.reg, texel.reg, incomingTemp.reg);
dest.l = 0;
dest.h = texel.size();
2008-10-21 22:00:00 +08:00
// ----------------------------------------------------------------------------
}; // namespace android