tcg-ia64: Move bswap for store into tlb load

Saving at least two cycles per store, and cleaning up the code.

Signed-off-by: Richard Henderson <rth@twiddle.net>
Author: Richard Henderson
Date:   2013-09-05 20:02:51 -04:00
Parent: 4c186ee2cf
Commit: b672cf66c3

1 changed file with 31 additions and 63 deletions
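
For orientation before the diff (editor's sketch, not text from the commit): instead of emitting separate bundles after the TLB check to byte-swap the store data, the caller now builds the swap insns up front and passes them to tcg_out_qemu_tlb, which drops them into the two I-slots that previously held INSN_NOP_I; the fast-path store then uses R56 (store_reg) instead of data_reg. A rough illustration for a byte-swapped 32-bit store (MO_32 | MO_BSWAP), using only helpers that appear in the patch; the shift arithmetic mirrors the new code in tcg_out_qemu_st:

    /* Sketch only: byte-reverse data_reg into R56, then pull the swapped
       32-bit value back down to bit 0 (shift = 64 - (8 << MO_32) = 32).
       Both insns ride in the formerly-NOP I-slots of the TLB-compare bundles. */
    uint64_t bswap1 = tcg_opc_bswap64_i(TCG_REG_P0, TCG_REG_R56, data_reg);
    uint64_t bswap2 = tcg_opc_i11(TCG_REG_P0, OPC_EXTR_U_I11,
                                  TCG_REG_R56, TCG_REG_R56, 32, 63 - 32);

    tcg_out_qemu_tlb(s, addr_reg, MO_32,
                     offsetof(CPUArchState, tlb_table[mem_index][0].addr_write),
                     offsetof(CPUArchState, tlb_table[mem_index][0].addend),
                     bswap1, bswap2);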


@@ -1571,9 +1571,11 @@ QEMU_BUILD_BUG_ON(offsetof(CPUArchState, tlb_table[NB_MMU_MODES - 1][1])
 /* Load and compare a TLB entry, and return the result in (p6, p7).
    R2 is loaded with the address of the addend TLB entry.
    R57 is loaded with the address, zero extented on 32-bit targets.
-   R1, R3 are clobbered. */
+   R1, R3 are clobbered, leaving R56 free for...
+   BSWAP_1, BSWAP_2 and I-slot insns for swapping data for store. */
 static inline void tcg_out_qemu_tlb(TCGContext *s, TCGReg addr_reg,
-                                    TCGMemOp s_bits, int off_rw, int off_add)
+                                    TCGMemOp s_bits, int off_rw, int off_add,
+                                    uint64_t bswap1, uint64_t bswap2)
 {
     /*
        .mii
@@ -1621,12 +1623,12 @@ static inline void tcg_out_qemu_tlb(TCGContext *s, TCGReg addr_reg,
                                (TARGET_LONG_BITS == 32
                                 ? OPC_LD4_M3 : OPC_LD8_M3), TCG_REG_R3,
                                TCG_REG_R2, off_add - off_rw),
-                   INSN_NOP_I);
+                   bswap1);
     tcg_out_bundle(s, mmI,
                    INSN_NOP_M,
                    tcg_opc_a6 (TCG_REG_P0, OPC_CMP_EQ_A6, TCG_REG_P6,
                                TCG_REG_P7, TCG_REG_R1, TCG_REG_R3),
-                   INSN_NOP_I);
+                   bswap2);
 }
 
 /* helper signature: helper_ld_mmu(CPUState *env, target_ulong addr,
@@ -1656,7 +1658,8 @@ static inline void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args,
     /* Read the TLB entry */
     tcg_out_qemu_tlb(s, addr_reg, s_bits,
                      offsetof(CPUArchState, tlb_table[mem_index][0].addr_read),
-                     offsetof(CPUArchState, tlb_table[mem_index][0].addend));
+                     offsetof(CPUArchState, tlb_table[mem_index][0].addend),
+                     INSN_NOP_I, INSN_NOP_I);
 
     /* P6 is the fast path, and P7 the slow path */
     tcg_out_bundle(s, mLX,
@@ -1727,17 +1730,31 @@ static inline void tcg_out_qemu_st(TCGContext *s, const TCGArg *args,
     static const uint64_t opc_st_m4[4] = {
         OPC_ST1_M4, OPC_ST2_M4, OPC_ST4_M4, OPC_ST8_M4
     };
-    int addr_reg, data_reg, mem_index;
+    TCGReg addr_reg, data_reg, store_reg;
+    int mem_index;
+    uint64_t bswap1, bswap2;
     TCGMemOp s_bits;
 
-    data_reg = *args++;
+    store_reg = data_reg = *args++;
     addr_reg = *args++;
     mem_index = *args;
     s_bits = opc & MO_SIZE;
 
+    bswap1 = bswap2 = INSN_NOP_I;
+    if (opc & MO_BSWAP) {
+        store_reg = TCG_REG_R56;
+        bswap1 = tcg_opc_bswap64_i(TCG_REG_P0, store_reg, data_reg);
+        if (s_bits < MO_64) {
+            int shift = 64 - (8 << s_bits);
+            bswap2 = tcg_opc_i11(TCG_REG_P0, OPC_EXTR_U_I11,
+                                 store_reg, store_reg, shift, 63 - shift);
+        }
+    }
+
     tcg_out_qemu_tlb(s, addr_reg, s_bits,
                      offsetof(CPUArchState, tlb_table[mem_index][0].addr_write),
-                     offsetof(CPUArchState, tlb_table[mem_index][0].addend));
+                     offsetof(CPUArchState, tlb_table[mem_index][0].addend),
+                     bswap1, bswap2);
 
     /* P6 is the fast path, and P7 the slow path */
     tcg_out_bundle(s, mLX,
@@ -1752,63 +1769,14 @@ static inline void tcg_out_qemu_st(TCGContext *s, const TCGArg *args,
                                TCG_REG_R3, TCG_REG_R57),
                    tcg_opc_i21(TCG_REG_P7, OPC_MOV_I21, TCG_REG_B6,
                                TCG_REG_R3, 0));
-
-    switch (opc) {
-    case MO_8:
-    case MO_16:
-    case MO_32:
-    case MO_64:
-        tcg_out_bundle(s, mii,
-                       tcg_opc_m1 (TCG_REG_P7, OPC_LD8_M1,
-                                   TCG_REG_R1, TCG_REG_R2),
-                       tcg_opc_mov_a(TCG_REG_P7, TCG_REG_R58, data_reg),
-                       INSN_NOP_I);
-        break;
-
-    case MO_16 | MO_BSWAP:
-        tcg_out_bundle(s, miI,
-                       tcg_opc_m1 (TCG_REG_P7, OPC_LD8_M1,
-                                   TCG_REG_R1, TCG_REG_R2),
-                       INSN_NOP_I,
-                       tcg_opc_i12(TCG_REG_P6, OPC_DEP_Z_I12,
-                                   TCG_REG_R2, data_reg, 15, 15));
-        tcg_out_bundle(s, miI,
-                       tcg_opc_mov_a(TCG_REG_P7, TCG_REG_R58, data_reg),
-                       INSN_NOP_I,
-                       tcg_opc_bswap64_i(TCG_REG_P6, TCG_REG_R2, TCG_REG_R2));
-        data_reg = TCG_REG_R2;
-        break;
-
-    case MO_32 | MO_BSWAP:
-        tcg_out_bundle(s, miI,
-                       tcg_opc_m1 (TCG_REG_P7, OPC_LD8_M1,
-                                   TCG_REG_R1, TCG_REG_R2),
-                       INSN_NOP_I,
-                       tcg_opc_i12(TCG_REG_P6, OPC_DEP_Z_I12,
-                                   TCG_REG_R2, data_reg, 31, 31));
-        tcg_out_bundle(s, miI,
-                       tcg_opc_mov_a(TCG_REG_P7, TCG_REG_R58, data_reg),
-                       INSN_NOP_I,
-                       tcg_opc_bswap64_i(TCG_REG_P6, TCG_REG_R2, TCG_REG_R2));
-        data_reg = TCG_REG_R2;
-        break;
-
-    case MO_64 | MO_BSWAP:
-        tcg_out_bundle(s, miI,
-                       tcg_opc_m1 (TCG_REG_P7, OPC_LD8_M1,
-                                   TCG_REG_R1, TCG_REG_R2),
-                       tcg_opc_mov_a(TCG_REG_P7, TCG_REG_R58, data_reg),
-                       tcg_opc_bswap64_i(TCG_REG_P6, TCG_REG_R2, data_reg));
-        data_reg = TCG_REG_R2;
-        break;
-
-    default:
-        tcg_abort();
-    }
-
+    tcg_out_bundle(s, mii,
+                   tcg_opc_m1 (TCG_REG_P7, OPC_LD8_M1,
+                               TCG_REG_R1, TCG_REG_R2),
+                   tcg_opc_mov_a(TCG_REG_P7, TCG_REG_R58, data_reg),
+                   INSN_NOP_I);
     tcg_out_bundle(s, miB,
                    tcg_opc_m4 (TCG_REG_P6, opc_st_m4[s_bits],
-                               data_reg, TCG_REG_R3),
+                               store_reg, TCG_REG_R3),
                    tcg_opc_movi_a(TCG_REG_P7, TCG_REG_R59, mem_index),
                    tcg_opc_b5 (TCG_REG_P7, OPC_BR_CALL_SPTK_MANY_B5,
                                TCG_REG_B0, TCG_REG_B6));
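
A quick check of the extract arithmetic above (my reading of the diff, not text from the commit): tcg_opc_bswap64_i reverses all eight bytes, so a sub-64-bit value that started at bit 0 of data_reg ends up in the high bits of R56, and the extr.u with shift = 64 - (8 << s_bits) brings it back down before the store. The interpretation of the (shift, 63 - shift) operands to OPC_EXTR_U_I11 as position and length-minus-one is my assumption:

    /* Presumed effect of the new bswap2 insn, per operand size:
       MO_16: shift = 64 - 16 = 48  ->  bits [63:48] of R56 moved to bit 0
       MO_32: shift = 64 - 32 = 32  ->  bits [63:32] of R56 moved to bit 0
       MO_64: no extract needed; the full-width swap is already in place  */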