qemu/softmmu_template.h

577 lines
21 KiB
C
Raw Normal View History

/*
* Software MMU support
*
* Generate helpers used by TCG for qemu_ld/st ops and code load
* functions.
*
* Included from target op helpers and exec.c.
*
* Copyright (c) 2003 Fabrice Bellard
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, see <http://www.gnu.org/licenses/>.
*/
#include "qemu/timer.h"
#include "exec/address-spaces.h"
#include "exec/memory.h"
#define DATA_SIZE (1 << SHIFT)
#if DATA_SIZE == 8
#define SUFFIX q
#define LSUFFIX q
#define SDATA_TYPE int64_t
#define DATA_TYPE uint64_t
#elif DATA_SIZE == 4
#define SUFFIX l
#define LSUFFIX l
#define SDATA_TYPE int32_t
#define DATA_TYPE uint32_t
#elif DATA_SIZE == 2
#define SUFFIX w
#define LSUFFIX uw
#define SDATA_TYPE int16_t
#define DATA_TYPE uint16_t
#elif DATA_SIZE == 1
#define SUFFIX b
#define LSUFFIX ub
#define SDATA_TYPE int8_t
#define DATA_TYPE uint8_t
#else
#error unsupported data size
#endif
/* For the benefit of TCG generated code, we want to avoid the complication
of ABI-specific return type promotion and always return a value extended
to the register size of the host. This is tcg_target_long, except in the
case of a 32-bit host and 64-bit data, and for that we always have
uint64_t. Don't bother with this widened value for SOFTMMU_CODE_ACCESS. */
#if defined(SOFTMMU_CODE_ACCESS) || DATA_SIZE == 8
# define WORD_TYPE DATA_TYPE
# define USUFFIX SUFFIX
#else
# define WORD_TYPE tcg_target_ulong
# define USUFFIX glue(u, SUFFIX)
# define SSUFFIX glue(s, SUFFIX)
#endif
#ifdef SOFTMMU_CODE_ACCESS
#define READ_ACCESS_TYPE MMU_INST_FETCH
#define ADDR_READ addr_code
#else
#define READ_ACCESS_TYPE MMU_DATA_LOAD
#define ADDR_READ addr_read
#endif
#if DATA_SIZE == 8
# define BSWAP(X) bswap64(X)
#elif DATA_SIZE == 4
# define BSWAP(X) bswap32(X)
#elif DATA_SIZE == 2
# define BSWAP(X) bswap16(X)
#else
# define BSWAP(X) (X)
#endif
#ifdef TARGET_WORDS_BIGENDIAN
# define TGT_BE(X) (X)
# define TGT_LE(X) BSWAP(X)
#else
# define TGT_BE(X) BSWAP(X)
# define TGT_LE(X) (X)
#endif
#if DATA_SIZE == 1
# define helper_le_ld_name glue(glue(helper_ret_ld, USUFFIX), MMUSUFFIX)
# define helper_be_ld_name helper_le_ld_name
# define helper_le_lds_name glue(glue(helper_ret_ld, SSUFFIX), MMUSUFFIX)
# define helper_be_lds_name helper_le_lds_name
# define helper_le_st_name glue(glue(helper_ret_st, SUFFIX), MMUSUFFIX)
# define helper_be_st_name helper_le_st_name
#else
# define helper_le_ld_name glue(glue(helper_le_ld, USUFFIX), MMUSUFFIX)
# define helper_be_ld_name glue(glue(helper_be_ld, USUFFIX), MMUSUFFIX)
# define helper_le_lds_name glue(glue(helper_le_ld, SSUFFIX), MMUSUFFIX)
# define helper_be_lds_name glue(glue(helper_be_ld, SSUFFIX), MMUSUFFIX)
# define helper_le_st_name glue(glue(helper_le_st, SUFFIX), MMUSUFFIX)
# define helper_be_st_name glue(glue(helper_be_st, SUFFIX), MMUSUFFIX)
#endif
#ifdef TARGET_WORDS_BIGENDIAN
# define helper_te_ld_name helper_be_ld_name
# define helper_te_st_name helper_be_st_name
#else
# define helper_te_ld_name helper_le_ld_name
# define helper_te_st_name helper_le_st_name
#endif
implementing victim TLB for QEMU system emulated TLB QEMU system mode page table walks are expensive. Taken by running QEMU qemu-system-x86_64 system mode on Intel PIN , a TLB miss and walking a 4-level page tables in guest Linux OS takes ~450 X86 instructions on average. QEMU system mode TLB is implemented using a directly-mapped hashtable. This structure suffers from conflict misses. Increasing the associativity of the TLB may not be the solution to conflict misses as all the ways may have to be walked in serial. A victim TLB is a TLB used to hold translations evicted from the primary TLB upon replacement. The victim TLB lies between the main TLB and its refill path. Victim TLB is of greater associativity (fully associative in this patch). It takes longer to lookup the victim TLB, but its likely better than a full page table walk. The memory translation path is changed as follows : Before Victim TLB: 1. Inline TLB lookup 2. Exit code cache on TLB miss. 3. Check for unaligned, IO accesses 4. TLB refill. 5. Do the memory access. 6. Return to code cache. After Victim TLB: 1. Inline TLB lookup 2. Exit code cache on TLB miss. 3. Check for unaligned, IO accesses 4. Victim TLB lookup. 5. If victim TLB misses, TLB refill 6. Do the memory access. 7. Return to code cache The advantage is that victim TLB can offer more associativity to a directly mapped TLB and thus potentially fewer page table walks while still keeping the time taken to flush within reasonable limits. However, placing a victim TLB before the refill path increase TLB refill path as the victim TLB is consulted before the TLB refill. The performance results demonstrate that the pros outweigh the cons. some performance results taken on SPECINT2006 train datasets and kernel boot and qemu configure script on an Intel(R) Xeon(R) CPU E5620 @ 2.40GHz Linux machine are shown in the Google Doc link below. https://docs.google.com/spreadsheets/d/1eiItzekZwNQOal_h-5iJmC4tMDi051m9qidi5_nwvH4/edit?usp=sharing In summary, victim TLB improves the performance of qemu-system-x86_64 by 11% on average on SPECINT2006, kernelboot and qemu configscript and with highest improvement of in 26% in 456.hmmer. And victim TLB does not result in any performance degradation in any of the measured benchmarks. Furthermore, the implemented victim TLB is architecture independent and is expected to benefit other architectures in QEMU as well. Although there are measurement fluctuations, the performance improvement is very significant and by no means in the range of noises. Signed-off-by: Xin Tong <trent.tong@gmail.com> Message-id: 1407202523-23553-1-git-send-email-trent.tong@gmail.com Reviewed-by: Peter Maydell <peter.maydell@linaro.org> Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2014-08-05 09:35:23 +08:00
/* macro to check the victim tlb */
#define VICTIM_TLB_HIT(ty) \
({ \
/* we are about to do a page table walk. our last hope is the \
* victim tlb. try to refill from the victim tlb before walking the \
* page table. */ \
int vidx; \
CPUIOTLBEntry tmpiotlb; \
implementing victim TLB for QEMU system emulated TLB QEMU system mode page table walks are expensive. Taken by running QEMU qemu-system-x86_64 system mode on Intel PIN , a TLB miss and walking a 4-level page tables in guest Linux OS takes ~450 X86 instructions on average. QEMU system mode TLB is implemented using a directly-mapped hashtable. This structure suffers from conflict misses. Increasing the associativity of the TLB may not be the solution to conflict misses as all the ways may have to be walked in serial. A victim TLB is a TLB used to hold translations evicted from the primary TLB upon replacement. The victim TLB lies between the main TLB and its refill path. Victim TLB is of greater associativity (fully associative in this patch). It takes longer to lookup the victim TLB, but its likely better than a full page table walk. The memory translation path is changed as follows : Before Victim TLB: 1. Inline TLB lookup 2. Exit code cache on TLB miss. 3. Check for unaligned, IO accesses 4. TLB refill. 5. Do the memory access. 6. Return to code cache. After Victim TLB: 1. Inline TLB lookup 2. Exit code cache on TLB miss. 3. Check for unaligned, IO accesses 4. Victim TLB lookup. 5. If victim TLB misses, TLB refill 6. Do the memory access. 7. Return to code cache The advantage is that victim TLB can offer more associativity to a directly mapped TLB and thus potentially fewer page table walks while still keeping the time taken to flush within reasonable limits. However, placing a victim TLB before the refill path increase TLB refill path as the victim TLB is consulted before the TLB refill. The performance results demonstrate that the pros outweigh the cons. some performance results taken on SPECINT2006 train datasets and kernel boot and qemu configure script on an Intel(R) Xeon(R) CPU E5620 @ 2.40GHz Linux machine are shown in the Google Doc link below. https://docs.google.com/spreadsheets/d/1eiItzekZwNQOal_h-5iJmC4tMDi051m9qidi5_nwvH4/edit?usp=sharing In summary, victim TLB improves the performance of qemu-system-x86_64 by 11% on average on SPECINT2006, kernelboot and qemu configscript and with highest improvement of in 26% in 456.hmmer. And victim TLB does not result in any performance degradation in any of the measured benchmarks. Furthermore, the implemented victim TLB is architecture independent and is expected to benefit other architectures in QEMU as well. Although there are measurement fluctuations, the performance improvement is very significant and by no means in the range of noises. Signed-off-by: Xin Tong <trent.tong@gmail.com> Message-id: 1407202523-23553-1-git-send-email-trent.tong@gmail.com Reviewed-by: Peter Maydell <peter.maydell@linaro.org> Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2014-08-05 09:35:23 +08:00
CPUTLBEntry tmptlb; \
for (vidx = CPU_VTLB_SIZE-1; vidx >= 0; --vidx) { \
if (env->tlb_v_table[mmu_idx][vidx].ty == (addr & TARGET_PAGE_MASK)) {\
/* found entry in victim tlb, swap tlb and iotlb */ \
tmptlb = env->tlb_table[mmu_idx][index]; \
env->tlb_table[mmu_idx][index] = env->tlb_v_table[mmu_idx][vidx]; \
env->tlb_v_table[mmu_idx][vidx] = tmptlb; \
tmpiotlb = env->iotlb[mmu_idx][index]; \
env->iotlb[mmu_idx][index] = env->iotlb_v[mmu_idx][vidx]; \
env->iotlb_v[mmu_idx][vidx] = tmpiotlb; \
break; \
} \
} \
/* return true when there is a vtlb hit, i.e. vidx >=0 */ \
vidx >= 0; \
})
#ifndef SOFTMMU_CODE_ACCESS
static inline DATA_TYPE glue(io_read, SUFFIX)(CPUArchState *env,
CPUIOTLBEntry *iotlbentry,
target_ulong addr,
uintptr_t retaddr)
{
uint64_t val;
CPUState *cpu = ENV_GET_CPU(env);
hwaddr physaddr = iotlbentry->addr;
MemoryRegion *mr = iotlb_to_region(cpu, physaddr);
physaddr = (physaddr & TARGET_PAGE_MASK) + addr;
cpu->mem_io_pc = retaddr;
if (mr != &io_mem_rom && mr != &io_mem_notdirty && !cpu_can_do_io(cpu)) {
cpu_io_recompile(cpu, retaddr);
}
cpu->mem_io_vaddr = addr;
memory_region_dispatch_read(mr, physaddr, &val, 1 << SHIFT,
iotlbentry->attrs);
return val;
}
#endif
#ifdef SOFTMMU_CODE_ACCESS
static __attribute__((unused))
#endif
WORD_TYPE helper_le_ld_name(CPUArchState *env, target_ulong addr,
TCGMemOpIdx oi, uintptr_t retaddr)
{
unsigned mmu_idx = get_mmuidx(oi);
int index = (addr >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1);
target_ulong tlb_addr = env->tlb_table[mmu_idx][index].ADDR_READ;
uintptr_t haddr;
DATA_TYPE res;
/* Adjust the given return address. */
retaddr -= GETPC_ADJ;
/* If the TLB entry is for a different page, reload and try again. */
if ((addr & TARGET_PAGE_MASK)
!= (tlb_addr & (TARGET_PAGE_MASK | TLB_INVALID_MASK))) {
if ((addr & (DATA_SIZE - 1)) != 0
&& (get_memop(oi) & MO_AMASK) == MO_ALIGN) {
cpu_unaligned_access(ENV_GET_CPU(env), addr, READ_ACCESS_TYPE,
mmu_idx, retaddr);
}
implementing victim TLB for QEMU system emulated TLB QEMU system mode page table walks are expensive. Taken by running QEMU qemu-system-x86_64 system mode on Intel PIN , a TLB miss and walking a 4-level page tables in guest Linux OS takes ~450 X86 instructions on average. QEMU system mode TLB is implemented using a directly-mapped hashtable. This structure suffers from conflict misses. Increasing the associativity of the TLB may not be the solution to conflict misses as all the ways may have to be walked in serial. A victim TLB is a TLB used to hold translations evicted from the primary TLB upon replacement. The victim TLB lies between the main TLB and its refill path. Victim TLB is of greater associativity (fully associative in this patch). It takes longer to lookup the victim TLB, but its likely better than a full page table walk. The memory translation path is changed as follows : Before Victim TLB: 1. Inline TLB lookup 2. Exit code cache on TLB miss. 3. Check for unaligned, IO accesses 4. TLB refill. 5. Do the memory access. 6. Return to code cache. After Victim TLB: 1. Inline TLB lookup 2. Exit code cache on TLB miss. 3. Check for unaligned, IO accesses 4. Victim TLB lookup. 5. If victim TLB misses, TLB refill 6. Do the memory access. 7. Return to code cache The advantage is that victim TLB can offer more associativity to a directly mapped TLB and thus potentially fewer page table walks while still keeping the time taken to flush within reasonable limits. However, placing a victim TLB before the refill path increase TLB refill path as the victim TLB is consulted before the TLB refill. The performance results demonstrate that the pros outweigh the cons. some performance results taken on SPECINT2006 train datasets and kernel boot and qemu configure script on an Intel(R) Xeon(R) CPU E5620 @ 2.40GHz Linux machine are shown in the Google Doc link below. https://docs.google.com/spreadsheets/d/1eiItzekZwNQOal_h-5iJmC4tMDi051m9qidi5_nwvH4/edit?usp=sharing In summary, victim TLB improves the performance of qemu-system-x86_64 by 11% on average on SPECINT2006, kernelboot and qemu configscript and with highest improvement of in 26% in 456.hmmer. And victim TLB does not result in any performance degradation in any of the measured benchmarks. Furthermore, the implemented victim TLB is architecture independent and is expected to benefit other architectures in QEMU as well. Although there are measurement fluctuations, the performance improvement is very significant and by no means in the range of noises. Signed-off-by: Xin Tong <trent.tong@gmail.com> Message-id: 1407202523-23553-1-git-send-email-trent.tong@gmail.com Reviewed-by: Peter Maydell <peter.maydell@linaro.org> Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2014-08-05 09:35:23 +08:00
if (!VICTIM_TLB_HIT(ADDR_READ)) {
tlb_fill(ENV_GET_CPU(env), addr, READ_ACCESS_TYPE,
mmu_idx, retaddr);
}
tlb_addr = env->tlb_table[mmu_idx][index].ADDR_READ;
}
/* Handle an IO access. */
if (unlikely(tlb_addr & ~TARGET_PAGE_MASK)) {
CPUIOTLBEntry *iotlbentry;
if ((addr & (DATA_SIZE - 1)) != 0) {
goto do_unaligned_access;
}
iotlbentry = &env->iotlb[mmu_idx][index];
/* ??? Note that the io helpers always read data in the target
byte ordering. We should push the LE/BE request down into io. */
res = glue(io_read, SUFFIX)(env, iotlbentry, addr, retaddr);
res = TGT_LE(res);
return res;
}
/* Handle slow unaligned access (it spans two pages or IO). */
if (DATA_SIZE > 1
&& unlikely((addr & ~TARGET_PAGE_MASK) + DATA_SIZE - 1
>= TARGET_PAGE_SIZE)) {
target_ulong addr1, addr2;
DATA_TYPE res1, res2;
unsigned shift;
do_unaligned_access:
if ((get_memop(oi) & MO_AMASK) == MO_ALIGN) {
cpu_unaligned_access(ENV_GET_CPU(env), addr, READ_ACCESS_TYPE,
mmu_idx, retaddr);
}
addr1 = addr & ~(DATA_SIZE - 1);
addr2 = addr1 + DATA_SIZE;
/* Note the adjustment at the beginning of the function.
Undo that for the recursion. */
res1 = helper_le_ld_name(env, addr1, oi, retaddr + GETPC_ADJ);
res2 = helper_le_ld_name(env, addr2, oi, retaddr + GETPC_ADJ);
shift = (addr & (DATA_SIZE - 1)) * 8;
/* Little-endian combine. */
res = (res1 >> shift) | (res2 << ((DATA_SIZE * 8) - shift));
return res;
}
/* Handle aligned access or unaligned access in the same page. */
if ((addr & (DATA_SIZE - 1)) != 0
&& (get_memop(oi) & MO_AMASK) == MO_ALIGN) {
cpu_unaligned_access(ENV_GET_CPU(env), addr, READ_ACCESS_TYPE,
mmu_idx, retaddr);
}
haddr = addr + env->tlb_table[mmu_idx][index].addend;
#if DATA_SIZE == 1
res = glue(glue(ld, LSUFFIX), _p)((uint8_t *)haddr);
#else
res = glue(glue(ld, LSUFFIX), _le_p)((uint8_t *)haddr);
#endif
return res;
}
#if DATA_SIZE > 1
#ifdef SOFTMMU_CODE_ACCESS
static __attribute__((unused))
#endif
WORD_TYPE helper_be_ld_name(CPUArchState *env, target_ulong addr,
TCGMemOpIdx oi, uintptr_t retaddr)
{
unsigned mmu_idx = get_mmuidx(oi);
int index = (addr >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1);
target_ulong tlb_addr = env->tlb_table[mmu_idx][index].ADDR_READ;
uintptr_t haddr;
DATA_TYPE res;
/* Adjust the given return address. */
retaddr -= GETPC_ADJ;
/* If the TLB entry is for a different page, reload and try again. */
if ((addr & TARGET_PAGE_MASK)
!= (tlb_addr & (TARGET_PAGE_MASK | TLB_INVALID_MASK))) {
if ((addr & (DATA_SIZE - 1)) != 0
&& (get_memop(oi) & MO_AMASK) == MO_ALIGN) {
cpu_unaligned_access(ENV_GET_CPU(env), addr, READ_ACCESS_TYPE,
mmu_idx, retaddr);
}
implementing victim TLB for QEMU system emulated TLB QEMU system mode page table walks are expensive. Taken by running QEMU qemu-system-x86_64 system mode on Intel PIN , a TLB miss and walking a 4-level page tables in guest Linux OS takes ~450 X86 instructions on average. QEMU system mode TLB is implemented using a directly-mapped hashtable. This structure suffers from conflict misses. Increasing the associativity of the TLB may not be the solution to conflict misses as all the ways may have to be walked in serial. A victim TLB is a TLB used to hold translations evicted from the primary TLB upon replacement. The victim TLB lies between the main TLB and its refill path. Victim TLB is of greater associativity (fully associative in this patch). It takes longer to lookup the victim TLB, but its likely better than a full page table walk. The memory translation path is changed as follows : Before Victim TLB: 1. Inline TLB lookup 2. Exit code cache on TLB miss. 3. Check for unaligned, IO accesses 4. TLB refill. 5. Do the memory access. 6. Return to code cache. After Victim TLB: 1. Inline TLB lookup 2. Exit code cache on TLB miss. 3. Check for unaligned, IO accesses 4. Victim TLB lookup. 5. If victim TLB misses, TLB refill 6. Do the memory access. 7. Return to code cache The advantage is that victim TLB can offer more associativity to a directly mapped TLB and thus potentially fewer page table walks while still keeping the time taken to flush within reasonable limits. However, placing a victim TLB before the refill path increase TLB refill path as the victim TLB is consulted before the TLB refill. The performance results demonstrate that the pros outweigh the cons. some performance results taken on SPECINT2006 train datasets and kernel boot and qemu configure script on an Intel(R) Xeon(R) CPU E5620 @ 2.40GHz Linux machine are shown in the Google Doc link below. https://docs.google.com/spreadsheets/d/1eiItzekZwNQOal_h-5iJmC4tMDi051m9qidi5_nwvH4/edit?usp=sharing In summary, victim TLB improves the performance of qemu-system-x86_64 by 11% on average on SPECINT2006, kernelboot and qemu configscript and with highest improvement of in 26% in 456.hmmer. And victim TLB does not result in any performance degradation in any of the measured benchmarks. Furthermore, the implemented victim TLB is architecture independent and is expected to benefit other architectures in QEMU as well. Although there are measurement fluctuations, the performance improvement is very significant and by no means in the range of noises. Signed-off-by: Xin Tong <trent.tong@gmail.com> Message-id: 1407202523-23553-1-git-send-email-trent.tong@gmail.com Reviewed-by: Peter Maydell <peter.maydell@linaro.org> Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2014-08-05 09:35:23 +08:00
if (!VICTIM_TLB_HIT(ADDR_READ)) {
tlb_fill(ENV_GET_CPU(env), addr, READ_ACCESS_TYPE,
mmu_idx, retaddr);
}
tlb_addr = env->tlb_table[mmu_idx][index].ADDR_READ;
}
/* Handle an IO access. */
if (unlikely(tlb_addr & ~TARGET_PAGE_MASK)) {
CPUIOTLBEntry *iotlbentry;
if ((addr & (DATA_SIZE - 1)) != 0) {
goto do_unaligned_access;
}
iotlbentry = &env->iotlb[mmu_idx][index];
/* ??? Note that the io helpers always read data in the target
byte ordering. We should push the LE/BE request down into io. */
res = glue(io_read, SUFFIX)(env, iotlbentry, addr, retaddr);
res = TGT_BE(res);
return res;
}
/* Handle slow unaligned access (it spans two pages or IO). */
if (DATA_SIZE > 1
&& unlikely((addr & ~TARGET_PAGE_MASK) + DATA_SIZE - 1
>= TARGET_PAGE_SIZE)) {
target_ulong addr1, addr2;
DATA_TYPE res1, res2;
unsigned shift;
do_unaligned_access:
if ((get_memop(oi) & MO_AMASK) == MO_ALIGN) {
cpu_unaligned_access(ENV_GET_CPU(env), addr, READ_ACCESS_TYPE,
mmu_idx, retaddr);
}
addr1 = addr & ~(DATA_SIZE - 1);
addr2 = addr1 + DATA_SIZE;
/* Note the adjustment at the beginning of the function.
Undo that for the recursion. */
res1 = helper_be_ld_name(env, addr1, oi, retaddr + GETPC_ADJ);
res2 = helper_be_ld_name(env, addr2, oi, retaddr + GETPC_ADJ);
shift = (addr & (DATA_SIZE - 1)) * 8;
/* Big-endian combine. */
res = (res1 << shift) | (res2 >> ((DATA_SIZE * 8) - shift));
return res;
}
/* Handle aligned access or unaligned access in the same page. */
if ((addr & (DATA_SIZE - 1)) != 0
&& (get_memop(oi) & MO_AMASK) == MO_ALIGN) {
cpu_unaligned_access(ENV_GET_CPU(env), addr, READ_ACCESS_TYPE,
mmu_idx, retaddr);
}
haddr = addr + env->tlb_table[mmu_idx][index].addend;
res = glue(glue(ld, LSUFFIX), _be_p)((uint8_t *)haddr);
return res;
}
#endif /* DATA_SIZE > 1 */
DATA_TYPE
glue(glue(helper_ld, SUFFIX), MMUSUFFIX)(CPUArchState *env, target_ulong addr,
int mmu_idx)
{
TCGMemOpIdx oi = make_memop_idx(SHIFT, mmu_idx);
return helper_te_ld_name (env, addr, oi, GETRA());
}
#ifndef SOFTMMU_CODE_ACCESS
/* Provide signed versions of the load routines as well. We can of course
avoid this for 64-bit data, or for 32-bit data on 32-bit host. */
#if DATA_SIZE * 8 < TCG_TARGET_REG_BITS
WORD_TYPE helper_le_lds_name(CPUArchState *env, target_ulong addr,
TCGMemOpIdx oi, uintptr_t retaddr)
{
return (SDATA_TYPE)helper_le_ld_name(env, addr, oi, retaddr);
}
# if DATA_SIZE > 1
WORD_TYPE helper_be_lds_name(CPUArchState *env, target_ulong addr,
TCGMemOpIdx oi, uintptr_t retaddr)
{
return (SDATA_TYPE)helper_be_ld_name(env, addr, oi, retaddr);
}
# endif
#endif
static inline void glue(io_write, SUFFIX)(CPUArchState *env,
CPUIOTLBEntry *iotlbentry,
DATA_TYPE val,
target_ulong addr,
uintptr_t retaddr)
{
CPUState *cpu = ENV_GET_CPU(env);
hwaddr physaddr = iotlbentry->addr;
MemoryRegion *mr = iotlb_to_region(cpu, physaddr);
physaddr = (physaddr & TARGET_PAGE_MASK) + addr;
if (mr != &io_mem_rom && mr != &io_mem_notdirty && !cpu_can_do_io(cpu)) {
cpu_io_recompile(cpu, retaddr);
}
cpu->mem_io_vaddr = addr;
cpu->mem_io_pc = retaddr;
memory_region_dispatch_write(mr, physaddr, val, 1 << SHIFT,
iotlbentry->attrs);
}
void helper_le_st_name(CPUArchState *env, target_ulong addr, DATA_TYPE val,
TCGMemOpIdx oi, uintptr_t retaddr)
{
unsigned mmu_idx = get_mmuidx(oi);
int index = (addr >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1);
target_ulong tlb_addr = env->tlb_table[mmu_idx][index].addr_write;
uintptr_t haddr;
/* Adjust the given return address. */
retaddr -= GETPC_ADJ;
/* If the TLB entry is for a different page, reload and try again. */
if ((addr & TARGET_PAGE_MASK)
!= (tlb_addr & (TARGET_PAGE_MASK | TLB_INVALID_MASK))) {
if ((addr & (DATA_SIZE - 1)) != 0
&& (get_memop(oi) & MO_AMASK) == MO_ALIGN) {
cpu_unaligned_access(ENV_GET_CPU(env), addr, MMU_DATA_STORE,
mmu_idx, retaddr);
}
implementing victim TLB for QEMU system emulated TLB QEMU system mode page table walks are expensive. Taken by running QEMU qemu-system-x86_64 system mode on Intel PIN , a TLB miss and walking a 4-level page tables in guest Linux OS takes ~450 X86 instructions on average. QEMU system mode TLB is implemented using a directly-mapped hashtable. This structure suffers from conflict misses. Increasing the associativity of the TLB may not be the solution to conflict misses as all the ways may have to be walked in serial. A victim TLB is a TLB used to hold translations evicted from the primary TLB upon replacement. The victim TLB lies between the main TLB and its refill path. Victim TLB is of greater associativity (fully associative in this patch). It takes longer to lookup the victim TLB, but its likely better than a full page table walk. The memory translation path is changed as follows : Before Victim TLB: 1. Inline TLB lookup 2. Exit code cache on TLB miss. 3. Check for unaligned, IO accesses 4. TLB refill. 5. Do the memory access. 6. Return to code cache. After Victim TLB: 1. Inline TLB lookup 2. Exit code cache on TLB miss. 3. Check for unaligned, IO accesses 4. Victim TLB lookup. 5. If victim TLB misses, TLB refill 6. Do the memory access. 7. Return to code cache The advantage is that victim TLB can offer more associativity to a directly mapped TLB and thus potentially fewer page table walks while still keeping the time taken to flush within reasonable limits. However, placing a victim TLB before the refill path increase TLB refill path as the victim TLB is consulted before the TLB refill. The performance results demonstrate that the pros outweigh the cons. some performance results taken on SPECINT2006 train datasets and kernel boot and qemu configure script on an Intel(R) Xeon(R) CPU E5620 @ 2.40GHz Linux machine are shown in the Google Doc link below. https://docs.google.com/spreadsheets/d/1eiItzekZwNQOal_h-5iJmC4tMDi051m9qidi5_nwvH4/edit?usp=sharing In summary, victim TLB improves the performance of qemu-system-x86_64 by 11% on average on SPECINT2006, kernelboot and qemu configscript and with highest improvement of in 26% in 456.hmmer. And victim TLB does not result in any performance degradation in any of the measured benchmarks. Furthermore, the implemented victim TLB is architecture independent and is expected to benefit other architectures in QEMU as well. Although there are measurement fluctuations, the performance improvement is very significant and by no means in the range of noises. Signed-off-by: Xin Tong <trent.tong@gmail.com> Message-id: 1407202523-23553-1-git-send-email-trent.tong@gmail.com Reviewed-by: Peter Maydell <peter.maydell@linaro.org> Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2014-08-05 09:35:23 +08:00
if (!VICTIM_TLB_HIT(addr_write)) {
tlb_fill(ENV_GET_CPU(env), addr, MMU_DATA_STORE, mmu_idx, retaddr);
implementing victim TLB for QEMU system emulated TLB QEMU system mode page table walks are expensive. Taken by running QEMU qemu-system-x86_64 system mode on Intel PIN , a TLB miss and walking a 4-level page tables in guest Linux OS takes ~450 X86 instructions on average. QEMU system mode TLB is implemented using a directly-mapped hashtable. This structure suffers from conflict misses. Increasing the associativity of the TLB may not be the solution to conflict misses as all the ways may have to be walked in serial. A victim TLB is a TLB used to hold translations evicted from the primary TLB upon replacement. The victim TLB lies between the main TLB and its refill path. Victim TLB is of greater associativity (fully associative in this patch). It takes longer to lookup the victim TLB, but its likely better than a full page table walk. The memory translation path is changed as follows : Before Victim TLB: 1. Inline TLB lookup 2. Exit code cache on TLB miss. 3. Check for unaligned, IO accesses 4. TLB refill. 5. Do the memory access. 6. Return to code cache. After Victim TLB: 1. Inline TLB lookup 2. Exit code cache on TLB miss. 3. Check for unaligned, IO accesses 4. Victim TLB lookup. 5. If victim TLB misses, TLB refill 6. Do the memory access. 7. Return to code cache The advantage is that victim TLB can offer more associativity to a directly mapped TLB and thus potentially fewer page table walks while still keeping the time taken to flush within reasonable limits. However, placing a victim TLB before the refill path increase TLB refill path as the victim TLB is consulted before the TLB refill. The performance results demonstrate that the pros outweigh the cons. some performance results taken on SPECINT2006 train datasets and kernel boot and qemu configure script on an Intel(R) Xeon(R) CPU E5620 @ 2.40GHz Linux machine are shown in the Google Doc link below. https://docs.google.com/spreadsheets/d/1eiItzekZwNQOal_h-5iJmC4tMDi051m9qidi5_nwvH4/edit?usp=sharing In summary, victim TLB improves the performance of qemu-system-x86_64 by 11% on average on SPECINT2006, kernelboot and qemu configscript and with highest improvement of in 26% in 456.hmmer. And victim TLB does not result in any performance degradation in any of the measured benchmarks. Furthermore, the implemented victim TLB is architecture independent and is expected to benefit other architectures in QEMU as well. Although there are measurement fluctuations, the performance improvement is very significant and by no means in the range of noises. Signed-off-by: Xin Tong <trent.tong@gmail.com> Message-id: 1407202523-23553-1-git-send-email-trent.tong@gmail.com Reviewed-by: Peter Maydell <peter.maydell@linaro.org> Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2014-08-05 09:35:23 +08:00
}
tlb_addr = env->tlb_table[mmu_idx][index].addr_write;
}
/* Handle an IO access. */
if (unlikely(tlb_addr & ~TARGET_PAGE_MASK)) {
CPUIOTLBEntry *iotlbentry;
if ((addr & (DATA_SIZE - 1)) != 0) {
goto do_unaligned_access;
}
iotlbentry = &env->iotlb[mmu_idx][index];
/* ??? Note that the io helpers always read data in the target
byte ordering. We should push the LE/BE request down into io. */
val = TGT_LE(val);
glue(io_write, SUFFIX)(env, iotlbentry, val, addr, retaddr);
return;
}
/* Handle slow unaligned access (it spans two pages or IO). */
if (DATA_SIZE > 1
&& unlikely((addr & ~TARGET_PAGE_MASK) + DATA_SIZE - 1
>= TARGET_PAGE_SIZE)) {
int i;
do_unaligned_access:
if ((get_memop(oi) & MO_AMASK) == MO_ALIGN) {
cpu_unaligned_access(ENV_GET_CPU(env), addr, MMU_DATA_STORE,
mmu_idx, retaddr);
}
/* XXX: not efficient, but simple */
/* Note: relies on the fact that tlb_fill() does not remove the
* previous page from the TLB cache. */
for (i = DATA_SIZE - 1; i >= 0; i--) {
/* Little-endian extract. */
uint8_t val8 = val >> (i * 8);
/* Note the adjustment at the beginning of the function.
Undo that for the recursion. */
glue(helper_ret_stb, MMUSUFFIX)(env, addr + i, val8,
oi, retaddr + GETPC_ADJ);
}
return;
}
/* Handle aligned access or unaligned access in the same page. */
if ((addr & (DATA_SIZE - 1)) != 0
&& (get_memop(oi) & MO_AMASK) == MO_ALIGN) {
cpu_unaligned_access(ENV_GET_CPU(env), addr, MMU_DATA_STORE,
mmu_idx, retaddr);
}
haddr = addr + env->tlb_table[mmu_idx][index].addend;
#if DATA_SIZE == 1
glue(glue(st, SUFFIX), _p)((uint8_t *)haddr, val);
#else
glue(glue(st, SUFFIX), _le_p)((uint8_t *)haddr, val);
#endif
}
#if DATA_SIZE > 1
void helper_be_st_name(CPUArchState *env, target_ulong addr, DATA_TYPE val,
TCGMemOpIdx oi, uintptr_t retaddr)
{
unsigned mmu_idx = get_mmuidx(oi);
int index = (addr >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1);
target_ulong tlb_addr = env->tlb_table[mmu_idx][index].addr_write;
uintptr_t haddr;
/* Adjust the given return address. */
retaddr -= GETPC_ADJ;
/* If the TLB entry is for a different page, reload and try again. */
if ((addr & TARGET_PAGE_MASK)
!= (tlb_addr & (TARGET_PAGE_MASK | TLB_INVALID_MASK))) {
if ((addr & (DATA_SIZE - 1)) != 0
&& (get_memop(oi) & MO_AMASK) == MO_ALIGN) {
cpu_unaligned_access(ENV_GET_CPU(env), addr, MMU_DATA_STORE,
mmu_idx, retaddr);
}
implementing victim TLB for QEMU system emulated TLB QEMU system mode page table walks are expensive. Taken by running QEMU qemu-system-x86_64 system mode on Intel PIN , a TLB miss and walking a 4-level page tables in guest Linux OS takes ~450 X86 instructions on average. QEMU system mode TLB is implemented using a directly-mapped hashtable. This structure suffers from conflict misses. Increasing the associativity of the TLB may not be the solution to conflict misses as all the ways may have to be walked in serial. A victim TLB is a TLB used to hold translations evicted from the primary TLB upon replacement. The victim TLB lies between the main TLB and its refill path. Victim TLB is of greater associativity (fully associative in this patch). It takes longer to lookup the victim TLB, but its likely better than a full page table walk. The memory translation path is changed as follows : Before Victim TLB: 1. Inline TLB lookup 2. Exit code cache on TLB miss. 3. Check for unaligned, IO accesses 4. TLB refill. 5. Do the memory access. 6. Return to code cache. After Victim TLB: 1. Inline TLB lookup 2. Exit code cache on TLB miss. 3. Check for unaligned, IO accesses 4. Victim TLB lookup. 5. If victim TLB misses, TLB refill 6. Do the memory access. 7. Return to code cache The advantage is that victim TLB can offer more associativity to a directly mapped TLB and thus potentially fewer page table walks while still keeping the time taken to flush within reasonable limits. However, placing a victim TLB before the refill path increase TLB refill path as the victim TLB is consulted before the TLB refill. The performance results demonstrate that the pros outweigh the cons. some performance results taken on SPECINT2006 train datasets and kernel boot and qemu configure script on an Intel(R) Xeon(R) CPU E5620 @ 2.40GHz Linux machine are shown in the Google Doc link below. https://docs.google.com/spreadsheets/d/1eiItzekZwNQOal_h-5iJmC4tMDi051m9qidi5_nwvH4/edit?usp=sharing In summary, victim TLB improves the performance of qemu-system-x86_64 by 11% on average on SPECINT2006, kernelboot and qemu configscript and with highest improvement of in 26% in 456.hmmer. And victim TLB does not result in any performance degradation in any of the measured benchmarks. Furthermore, the implemented victim TLB is architecture independent and is expected to benefit other architectures in QEMU as well. Although there are measurement fluctuations, the performance improvement is very significant and by no means in the range of noises. Signed-off-by: Xin Tong <trent.tong@gmail.com> Message-id: 1407202523-23553-1-git-send-email-trent.tong@gmail.com Reviewed-by: Peter Maydell <peter.maydell@linaro.org> Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2014-08-05 09:35:23 +08:00
if (!VICTIM_TLB_HIT(addr_write)) {
tlb_fill(ENV_GET_CPU(env), addr, MMU_DATA_STORE, mmu_idx, retaddr);
implementing victim TLB for QEMU system emulated TLB QEMU system mode page table walks are expensive. Taken by running QEMU qemu-system-x86_64 system mode on Intel PIN , a TLB miss and walking a 4-level page tables in guest Linux OS takes ~450 X86 instructions on average. QEMU system mode TLB is implemented using a directly-mapped hashtable. This structure suffers from conflict misses. Increasing the associativity of the TLB may not be the solution to conflict misses as all the ways may have to be walked in serial. A victim TLB is a TLB used to hold translations evicted from the primary TLB upon replacement. The victim TLB lies between the main TLB and its refill path. Victim TLB is of greater associativity (fully associative in this patch). It takes longer to lookup the victim TLB, but its likely better than a full page table walk. The memory translation path is changed as follows : Before Victim TLB: 1. Inline TLB lookup 2. Exit code cache on TLB miss. 3. Check for unaligned, IO accesses 4. TLB refill. 5. Do the memory access. 6. Return to code cache. After Victim TLB: 1. Inline TLB lookup 2. Exit code cache on TLB miss. 3. Check for unaligned, IO accesses 4. Victim TLB lookup. 5. If victim TLB misses, TLB refill 6. Do the memory access. 7. Return to code cache The advantage is that victim TLB can offer more associativity to a directly mapped TLB and thus potentially fewer page table walks while still keeping the time taken to flush within reasonable limits. However, placing a victim TLB before the refill path increase TLB refill path as the victim TLB is consulted before the TLB refill. The performance results demonstrate that the pros outweigh the cons. some performance results taken on SPECINT2006 train datasets and kernel boot and qemu configure script on an Intel(R) Xeon(R) CPU E5620 @ 2.40GHz Linux machine are shown in the Google Doc link below. https://docs.google.com/spreadsheets/d/1eiItzekZwNQOal_h-5iJmC4tMDi051m9qidi5_nwvH4/edit?usp=sharing In summary, victim TLB improves the performance of qemu-system-x86_64 by 11% on average on SPECINT2006, kernelboot and qemu configscript and with highest improvement of in 26% in 456.hmmer. And victim TLB does not result in any performance degradation in any of the measured benchmarks. Furthermore, the implemented victim TLB is architecture independent and is expected to benefit other architectures in QEMU as well. Although there are measurement fluctuations, the performance improvement is very significant and by no means in the range of noises. Signed-off-by: Xin Tong <trent.tong@gmail.com> Message-id: 1407202523-23553-1-git-send-email-trent.tong@gmail.com Reviewed-by: Peter Maydell <peter.maydell@linaro.org> Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2014-08-05 09:35:23 +08:00
}
tlb_addr = env->tlb_table[mmu_idx][index].addr_write;
}
/* Handle an IO access. */
if (unlikely(tlb_addr & ~TARGET_PAGE_MASK)) {
CPUIOTLBEntry *iotlbentry;
if ((addr & (DATA_SIZE - 1)) != 0) {
goto do_unaligned_access;
}
iotlbentry = &env->iotlb[mmu_idx][index];
/* ??? Note that the io helpers always read data in the target
byte ordering. We should push the LE/BE request down into io. */
val = TGT_BE(val);
glue(io_write, SUFFIX)(env, iotlbentry, val, addr, retaddr);
return;
}
/* Handle slow unaligned access (it spans two pages or IO). */
if (DATA_SIZE > 1
&& unlikely((addr & ~TARGET_PAGE_MASK) + DATA_SIZE - 1
>= TARGET_PAGE_SIZE)) {
int i;
do_unaligned_access:
if ((get_memop(oi) & MO_AMASK) == MO_ALIGN) {
cpu_unaligned_access(ENV_GET_CPU(env), addr, MMU_DATA_STORE,
mmu_idx, retaddr);
}
/* XXX: not efficient, but simple */
/* Note: relies on the fact that tlb_fill() does not remove the
* previous page from the TLB cache. */
for (i = DATA_SIZE - 1; i >= 0; i--) {
/* Big-endian extract. */
uint8_t val8 = val >> (((DATA_SIZE - 1) * 8) - (i * 8));
/* Note the adjustment at the beginning of the function.
Undo that for the recursion. */
glue(helper_ret_stb, MMUSUFFIX)(env, addr + i, val8,
oi, retaddr + GETPC_ADJ);
}
return;
}
/* Handle aligned access or unaligned access in the same page. */
if ((addr & (DATA_SIZE - 1)) != 0
&& (get_memop(oi) & MO_AMASK) == MO_ALIGN) {
cpu_unaligned_access(ENV_GET_CPU(env), addr, MMU_DATA_STORE,
mmu_idx, retaddr);
}
haddr = addr + env->tlb_table[mmu_idx][index].addend;
glue(glue(st, SUFFIX), _be_p)((uint8_t *)haddr, val);
}
#endif /* DATA_SIZE > 1 */
void
glue(glue(helper_st, SUFFIX), MMUSUFFIX)(CPUArchState *env, target_ulong addr,
DATA_TYPE val, int mmu_idx)
{
TCGMemOpIdx oi = make_memop_idx(SHIFT, mmu_idx);
helper_te_st_name(env, addr, val, oi, GETRA());
}
#endif /* !defined(SOFTMMU_CODE_ACCESS) */
#undef READ_ACCESS_TYPE
#undef SHIFT
#undef DATA_TYPE
#undef SUFFIX
#undef LSUFFIX
#undef DATA_SIZE
#undef ADDR_READ
#undef WORD_TYPE
#undef SDATA_TYPE
#undef USUFFIX
#undef SSUFFIX
#undef BSWAP
#undef TGT_BE
#undef TGT_LE
#undef CPU_BE
#undef CPU_LE
#undef helper_le_ld_name
#undef helper_be_ld_name
#undef helper_le_lds_name
#undef helper_be_lds_name
#undef helper_le_st_name
#undef helper_be_st_name
#undef helper_te_ld_name
#undef helper_te_st_name