mirror of https://gitee.com/openkylin/linux.git
powerpc/powernv/npu: Use size-based ATSD invalidates
Prior to this change only two types of ATSDs were issued to the NPU: invalidates targeting a single page and invalidates targeting the whole address space. The crossover point happened at the configurable atsd_threshold which defaulted to 2M. Invalidates that size or smaller would issue per-page invalidates for the whole range.

The NPU supports more invalidation sizes however: 64K, 2M, 1G, and all. These invalidates target addresses aligned to their size. 2M is a common invalidation size for GPU-enabled applications because that is a GPU page size, so reducing the number of invalidates by 32x in that case is a clear improvement.

ATSD latency is high in general so now we always issue a single invalidate rather than multiple. This will over-invalidate in some cases, but for any invalidation size over 2M it matches or improves the prior behavior. There's also an improvement for single-page invalidates since the prior version issued two invalidates for that case instead of one.

With this change all issued ATSDs now perform a flush, so the flush parameter has been removed from all the helpers.

To show the benefit here are some performance numbers from a microbenchmark which creates a 1G allocation then uses mprotect with PROT_NONE to trigger invalidates in strides across the allocation.

One NPU (1 GPU):

         mprotect rate (GB/s)
Stride   Before   After   Speedup
64K         5.3     5.6        5%
1M         39.3    57.4       46%
2M         49.7    82.6       66%
4M        286.6   285.7        0%

Two NPUs (6 GPUs):

         mprotect rate (GB/s)
Stride   Before   After   Speedup
64K         6.5     7.4       13%
1M         33.4    67.9      103%
2M         38.7    93.1      141%
4M        356.7   354.6       -1%

Anything over 2M is roughly the same as before since both cases issue a single ATSD.

Signed-off-by: Mark Hairgrove <mhairgrove@nvidia.com>
Reviewed-By: Alistair Popple <alistair@popple.id.au>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
This commit is contained in:
parent
7ead15a144
commit
3689c37d23
|
@ -18,6 +18,7 @@
|
|||
#include <linux/memblock.h>
|
||||
#include <linux/iommu.h>
|
||||
#include <linux/debugfs.h>
|
||||
#include <linux/sizes.h>
|
||||
|
||||
#include <asm/debugfs.h>
|
||||
#include <asm/tlb.h>
|
||||
|
@ -458,8 +459,7 @@ static void put_mmio_atsd_reg(struct npu *npu, int reg)
|
|||
#define XTS_ATSD_AVA 1
|
||||
#define XTS_ATSD_STAT 2
|
||||
|
||||
static unsigned long get_atsd_launch_val(unsigned long pid, unsigned long psize,
|
||||
bool flush)
|
||||
static unsigned long get_atsd_launch_val(unsigned long pid, unsigned long psize)
|
||||
{
|
||||
unsigned long launch = 0;
|
||||
|
||||
|
@ -477,8 +477,7 @@ static unsigned long get_atsd_launch_val(unsigned long pid, unsigned long psize,
|
|||
/* PID */
|
||||
launch |= pid << PPC_BITLSHIFT(38);
|
||||
|
||||
/* No flush */
|
||||
launch |= !flush << PPC_BITLSHIFT(39);
|
||||
/* Leave "No flush" (bit 39) 0 so every ATSD performs a flush */
|
||||
|
||||
return launch;
|
||||
}
|
||||
|
@ -501,23 +500,22 @@ static void mmio_atsd_regs_write(struct mmio_atsd_reg
|
|||
}
|
||||
|
||||
static void mmio_invalidate_pid(struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS],
|
||||
unsigned long pid, bool flush)
|
||||
unsigned long pid)
|
||||
{
|
||||
unsigned long launch = get_atsd_launch_val(pid, MMU_PAGE_COUNT, flush);
|
||||
unsigned long launch = get_atsd_launch_val(pid, MMU_PAGE_COUNT);
|
||||
|
||||
/* Invalidating the entire process doesn't use a va */
|
||||
mmio_atsd_regs_write(mmio_atsd_reg, XTS_ATSD_LAUNCH, launch);
|
||||
}
|
||||
|
||||
static void mmio_invalidate_va(struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS],
|
||||
unsigned long va, unsigned long pid, bool flush)
|
||||
static void mmio_invalidate_range(struct mmio_atsd_reg
|
||||
mmio_atsd_reg[NV_MAX_NPUS], unsigned long pid,
|
||||
unsigned long start, unsigned long psize)
|
||||
{
|
||||
unsigned long launch;
|
||||
|
||||
launch = get_atsd_launch_val(pid, mmu_virtual_psize, flush);
|
||||
unsigned long launch = get_atsd_launch_val(pid, psize);
|
||||
|
||||
/* Write all VAs first */
|
||||
mmio_atsd_regs_write(mmio_atsd_reg, XTS_ATSD_AVA, va);
|
||||
mmio_atsd_regs_write(mmio_atsd_reg, XTS_ATSD_AVA, start);
|
||||
|
||||
/* Issue one barrier for all address writes */
|
||||
eieio();
|
||||
|
@ -609,14 +607,36 @@ static void release_atsd_reg(struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS])
|
|||
}
|
||||
|
||||
/*
|
||||
* Invalidate either a single address or an entire PID depending on
|
||||
* the value of va.
|
||||
* Invalidate a virtual address range
|
||||
*/
|
||||
static void mmio_invalidate(struct npu_context *npu_context, int va,
|
||||
unsigned long address, bool flush)
|
||||
static void mmio_invalidate(struct npu_context *npu_context,
|
||||
unsigned long start, unsigned long size)
|
||||
{
|
||||
struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS];
|
||||
unsigned long pid = npu_context->mm->context.id;
|
||||
unsigned long atsd_start = 0;
|
||||
unsigned long end = start + size - 1;
|
||||
int atsd_psize = MMU_PAGE_COUNT;
|
||||
|
||||
/*
|
||||
* Convert the input range into one of the supported sizes. If the range
|
||||
* doesn't fit, use the next larger supported size. Invalidation latency
|
||||
* is high, so over-invalidation is preferred to issuing multiple
|
||||
* invalidates.
|
||||
*
|
||||
* A 4K page size isn't supported by NPU/GPU ATS, so that case is
|
||||
* ignored.
|
||||
*/
|
||||
if (size == SZ_64K) {
|
||||
atsd_start = start;
|
||||
atsd_psize = MMU_PAGE_64K;
|
||||
} else if (ALIGN_DOWN(start, SZ_2M) == ALIGN_DOWN(end, SZ_2M)) {
|
||||
atsd_start = ALIGN_DOWN(start, SZ_2M);
|
||||
atsd_psize = MMU_PAGE_2M;
|
||||
} else if (ALIGN_DOWN(start, SZ_1G) == ALIGN_DOWN(end, SZ_1G)) {
|
||||
atsd_start = ALIGN_DOWN(start, SZ_1G);
|
||||
atsd_psize = MMU_PAGE_1G;
|
||||
}
|
||||
|
||||
if (npu_context->nmmu_flush)
|
||||
/*
|
||||
|
@ -631,23 +651,25 @@ static void mmio_invalidate(struct npu_context *npu_context, int va,
|
|||
* an invalidate.
|
||||
*/
|
||||
acquire_atsd_reg(npu_context, mmio_atsd_reg);
|
||||
if (va)
|
||||
mmio_invalidate_va(mmio_atsd_reg, address, pid, flush);
|
||||
|
||||
if (atsd_psize == MMU_PAGE_COUNT)
|
||||
mmio_invalidate_pid(mmio_atsd_reg, pid);
|
||||
else
|
||||
mmio_invalidate_pid(mmio_atsd_reg, pid, flush);
|
||||
mmio_invalidate_range(mmio_atsd_reg, pid, atsd_start,
|
||||
atsd_psize);
|
||||
|
||||
mmio_invalidate_wait(mmio_atsd_reg);
|
||||
if (flush) {
|
||||
/*
|
||||
* The GPU requires two flush ATSDs to ensure all entries have
|
||||
* been flushed. We use PID 0 as it will never be used for a
|
||||
* process on the GPU.
|
||||
*/
|
||||
mmio_invalidate_pid(mmio_atsd_reg, 0, true);
|
||||
mmio_invalidate_wait(mmio_atsd_reg);
|
||||
mmio_invalidate_pid(mmio_atsd_reg, 0, true);
|
||||
mmio_invalidate_wait(mmio_atsd_reg);
|
||||
}
|
||||
|
||||
/*
|
||||
* The GPU requires two flush ATSDs to ensure all entries have been
|
||||
* flushed. We use PID 0 as it will never be used for a process on the
|
||||
* GPU.
|
||||
*/
|
||||
mmio_invalidate_pid(mmio_atsd_reg, 0);
|
||||
mmio_invalidate_wait(mmio_atsd_reg);
|
||||
mmio_invalidate_pid(mmio_atsd_reg, 0);
|
||||
mmio_invalidate_wait(mmio_atsd_reg);
|
||||
|
||||
release_atsd_reg(mmio_atsd_reg);
|
||||
}
|
||||
|
||||
|
@ -664,7 +686,7 @@ static void pnv_npu2_mn_release(struct mmu_notifier *mn,
|
|||
* There should be no more translation requests for this PID, but we
|
||||
* need to ensure any entries for it are removed from the TLB.
|
||||
*/
|
||||
mmio_invalidate(npu_context, 0, 0, true);
|
||||
mmio_invalidate(npu_context, 0, ~0UL);
|
||||
}
|
||||
|
||||
static void pnv_npu2_mn_change_pte(struct mmu_notifier *mn,
|
||||
|
@ -673,8 +695,7 @@ static void pnv_npu2_mn_change_pte(struct mmu_notifier *mn,
|
|||
pte_t pte)
|
||||
{
|
||||
struct npu_context *npu_context = mn_to_npu_context(mn);
|
||||
|
||||
mmio_invalidate(npu_context, 1, address, true);
|
||||
mmio_invalidate(npu_context, address, PAGE_SIZE);
|
||||
}
|
||||
|
||||
static void pnv_npu2_mn_invalidate_range(struct mmu_notifier *mn,
|
||||
|
@ -682,21 +703,7 @@ static void pnv_npu2_mn_invalidate_range(struct mmu_notifier *mn,
|
|||
unsigned long start, unsigned long end)
|
||||
{
|
||||
struct npu_context *npu_context = mn_to_npu_context(mn);
|
||||
unsigned long address;
|
||||
|
||||
if (end - start > atsd_threshold) {
|
||||
/*
|
||||
* Just invalidate the entire PID if the address range is too
|
||||
* large.
|
||||
*/
|
||||
mmio_invalidate(npu_context, 0, 0, true);
|
||||
} else {
|
||||
for (address = start; address < end; address += PAGE_SIZE)
|
||||
mmio_invalidate(npu_context, 1, address, false);
|
||||
|
||||
/* Do the flush only on the final address == end */
|
||||
mmio_invalidate(npu_context, 1, address, true);
|
||||
}
|
||||
mmio_invalidate(npu_context, start, end - start);
|
||||
}
|
||||
|
||||
static const struct mmu_notifier_ops nv_nmmu_notifier_ops = {
|
||||
|
|
Loading…
Reference in New Issue