drm/i915: Move cmd parser pinning to execbuffer

We need to get rid of allocations in the cmd parser, because it needs
to be called from a signaling context. As a first step, move all pinning
to execbuf, where we already hold all locks.

Allocate jump_whitelist in the execbuffer, and add annotations around
intel_engine_cmd_parser() to ensure the command parser is only called
without allocating any memory or taking any locks we're not supposed to.

Because i915_gem_object_get_page() may also allocate memory, add a
path to i915_gem_object_get_sg() that prevents memory allocations,
and walk the sg list manually. It should be similarly fast.

This has the added benefit of being able to catch all memory allocation
errors before the point of no return, and return -ENOMEM safely to the
execbuf submitter.

Signed-off-by: Maarten Lankhorst <maarten.lankhorst@linux.intel.com>
Acked-by: Thomas Hellström <thomas.hellstrom@linux.intel.com>
Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
Link: https://patchwork.freedesktop.org/patch/msgid/20210323155059.628690-4-maarten.lankhorst@linux.intel.com
This commit is contained in:
Maarten Lankhorst 2021-03-23 16:49:52 +01:00 committed by Daniel Vetter
parent 2c8ab3339e
commit 0edbb9ba1b
8 changed files with 140 additions and 78 deletions

View File

@ -28,6 +28,7 @@
#include "i915_sw_fence_work.h"
#include "i915_trace.h"
#include "i915_user_extensions.h"
#include "i915_memcpy.h"
struct eb_vma {
struct i915_vma *vma;
@ -2281,24 +2282,45 @@ struct eb_parse_work {
struct i915_vma *trampoline;
unsigned long batch_offset;
unsigned long batch_length;
unsigned long *jump_whitelist;
const void *batch_map;
void *shadow_map;
};
static int __eb_parse(struct dma_fence_work *work)
{
struct eb_parse_work *pw = container_of(work, typeof(*pw), base);
int ret;
bool cookie;
return intel_engine_cmd_parser(pw->engine,
cookie = dma_fence_begin_signalling();
ret = intel_engine_cmd_parser(pw->engine,
pw->batch,
pw->batch_offset,
pw->batch_length,
pw->shadow,
pw->trampoline);
pw->jump_whitelist,
pw->shadow_map,
pw->batch_map);
dma_fence_end_signalling(cookie);
return ret;
}
static void __eb_parse_release(struct dma_fence_work *work)
{
struct eb_parse_work *pw = container_of(work, typeof(*pw), base);
if (!IS_ERR_OR_NULL(pw->jump_whitelist))
kfree(pw->jump_whitelist);
if (pw->batch_map)
i915_gem_object_unpin_map(pw->batch->obj);
else
i915_gem_object_unpin_pages(pw->batch->obj);
i915_gem_object_unpin_map(pw->shadow->obj);
if (pw->trampoline)
i915_active_release(&pw->trampoline->active);
i915_active_release(&pw->shadow->active);
@ -2348,6 +2370,8 @@ static int eb_parse_pipeline(struct i915_execbuffer *eb,
struct i915_vma *trampoline)
{
struct eb_parse_work *pw;
struct drm_i915_gem_object *batch = eb->batch->vma->obj;
bool needs_clflush;
int err;
GEM_BUG_ON(overflows_type(eb->batch_start_offset, pw->batch_offset));
@ -2371,6 +2395,34 @@ static int eb_parse_pipeline(struct i915_execbuffer *eb,
goto err_shadow;
}
pw->shadow_map = i915_gem_object_pin_map(shadow->obj, I915_MAP_WB);
if (IS_ERR(pw->shadow_map)) {
err = PTR_ERR(pw->shadow_map);
goto err_trampoline;
}
needs_clflush =
!(batch->cache_coherent & I915_BO_CACHE_COHERENT_FOR_READ);
pw->batch_map = ERR_PTR(-ENODEV);
if (needs_clflush && i915_has_memcpy_from_wc())
pw->batch_map = i915_gem_object_pin_map(batch, I915_MAP_WC);
if (IS_ERR(pw->batch_map)) {
err = i915_gem_object_pin_pages(batch);
if (err)
goto err_unmap_shadow;
pw->batch_map = NULL;
}
pw->jump_whitelist =
intel_engine_cmd_parser_alloc_jump_whitelist(eb->batch_len,
trampoline);
if (IS_ERR(pw->jump_whitelist)) {
err = PTR_ERR(pw->jump_whitelist);
goto err_unmap_batch;
}
dma_fence_work_init(&pw->base, &eb_parse_ops);
pw->engine = eb->engine;
@ -2410,6 +2462,16 @@ static int eb_parse_pipeline(struct i915_execbuffer *eb,
dma_fence_work_commit_imm(&pw->base);
return err;
err_unmap_batch:
if (pw->batch_map)
i915_gem_object_unpin_map(batch);
else
i915_gem_object_unpin_pages(batch);
err_unmap_shadow:
i915_gem_object_unpin_map(shadow->obj);
err_trampoline:
if (trampoline)
i915_active_release(&trampoline->active);
err_shadow:
i915_active_release(&shadow->active);
err_batch:

View File

@ -299,22 +299,22 @@ struct scatterlist *
__i915_gem_object_get_sg(struct drm_i915_gem_object *obj,
struct i915_gem_object_page_iter *iter,
unsigned int n,
unsigned int *offset);
unsigned int *offset, bool allow_alloc);
static inline struct scatterlist *
i915_gem_object_get_sg(struct drm_i915_gem_object *obj,
unsigned int n,
unsigned int *offset)
unsigned int *offset, bool allow_alloc)
{
return __i915_gem_object_get_sg(obj, &obj->mm.get_page, n, offset);
return __i915_gem_object_get_sg(obj, &obj->mm.get_page, n, offset, allow_alloc);
}
static inline struct scatterlist *
i915_gem_object_get_sg_dma(struct drm_i915_gem_object *obj,
unsigned int n,
unsigned int *offset)
unsigned int *offset, bool allow_alloc)
{
return __i915_gem_object_get_sg(obj, &obj->mm.get_dma_page, n, offset);
return __i915_gem_object_get_sg(obj, &obj->mm.get_dma_page, n, offset, allow_alloc);
}
struct page *

View File

@ -448,7 +448,8 @@ struct scatterlist *
__i915_gem_object_get_sg(struct drm_i915_gem_object *obj,
struct i915_gem_object_page_iter *iter,
unsigned int n,
unsigned int *offset)
unsigned int *offset,
bool allow_alloc)
{
const bool dma = iter == &obj->mm.get_dma_page;
struct scatterlist *sg;
@ -470,6 +471,9 @@ __i915_gem_object_get_sg(struct drm_i915_gem_object *obj,
if (n < READ_ONCE(iter->sg_idx))
goto lookup;
if (!allow_alloc)
goto manual_lookup;
mutex_lock(&iter->lock);
/* We prefer to reuse the last sg so that repeated lookup of this
@ -519,7 +523,16 @@ __i915_gem_object_get_sg(struct drm_i915_gem_object *obj,
if (unlikely(n < idx)) /* insertion completed by another thread */
goto lookup;
/* In case we failed to insert the entry into the radixtree, we need
goto manual_walk;
manual_lookup:
idx = 0;
sg = obj->mm.pages->sgl;
count = __sg_page_count(sg);
manual_walk:
/*
* In case we failed to insert the entry into the radixtree, we need
* to look beyond the current sg.
*/
while (idx + count <= n) {
@ -566,7 +579,7 @@ i915_gem_object_get_page(struct drm_i915_gem_object *obj, unsigned int n)
GEM_BUG_ON(!i915_gem_object_has_struct_page(obj));
sg = i915_gem_object_get_sg(obj, n, &offset);
sg = i915_gem_object_get_sg(obj, n, &offset, true);
return nth_page(sg_page(sg), offset);
}
@ -592,7 +605,7 @@ i915_gem_object_get_dma_address_len(struct drm_i915_gem_object *obj,
struct scatterlist *sg;
unsigned int offset;
sg = i915_gem_object_get_sg_dma(obj, n, &offset);
sg = i915_gem_object_get_sg_dma(obj, n, &offset, true);
if (len)
*len = sg_dma_len(sg) - (offset << PAGE_SHIFT);

View File

@ -1420,7 +1420,7 @@ intel_partial_pages(const struct i915_ggtt_view *view,
if (ret)
goto err_sg_alloc;
iter = i915_gem_object_get_sg_dma(obj, view->partial.offset, &offset);
iter = i915_gem_object_get_sg_dma(obj, view->partial.offset, &offset, true);
GEM_BUG_ON(!iter);
sg = st->sgl;

View File

@ -1144,38 +1144,20 @@ find_reg(const struct intel_engine_cs *engine, u32 addr)
/* Returns a vmap'd pointer to dst_obj, which the caller must unmap */
static u32 *copy_batch(struct drm_i915_gem_object *dst_obj,
struct drm_i915_gem_object *src_obj,
unsigned long offset, unsigned long length)
unsigned long offset, unsigned long length,
void *dst, const void *src)
{
bool needs_clflush;
void *dst, *src;
int ret;
dst = i915_gem_object_pin_map(dst_obj, I915_MAP_WB);
if (IS_ERR(dst))
return dst;
ret = i915_gem_object_pin_pages(src_obj);
if (ret) {
i915_gem_object_unpin_map(dst_obj);
return ERR_PTR(ret);
}
needs_clflush =
bool needs_clflush =
!(src_obj->cache_coherent & I915_BO_CACHE_COHERENT_FOR_READ);
src = ERR_PTR(-ENODEV);
if (needs_clflush && i915_has_memcpy_from_wc()) {
src = i915_gem_object_pin_map(src_obj, I915_MAP_WC);
if (!IS_ERR(src)) {
i915_unaligned_memcpy_from_wc(dst,
src + offset,
length);
i915_gem_object_unpin_map(src_obj);
}
}
if (IS_ERR(src)) {
unsigned long x, n, remain;
if (src) {
GEM_BUG_ON(!needs_clflush);
i915_unaligned_memcpy_from_wc(dst, src + offset, length);
} else {
struct scatterlist *sg;
void *ptr;
unsigned int x, sg_ofs;
unsigned long remain;
/*
* We can avoid clflushing partial cachelines before the write
@ -1192,22 +1174,30 @@ static u32 *copy_batch(struct drm_i915_gem_object *dst_obj,
ptr = dst;
x = offset_in_page(offset);
for (n = offset >> PAGE_SHIFT; remain; n++) {
int len = min(remain, PAGE_SIZE - x);
sg = i915_gem_object_get_sg(src_obj, offset >> PAGE_SHIFT, &sg_ofs, false);
src = kmap_atomic(i915_gem_object_get_page(src_obj, n));
while (remain) {
unsigned long sg_max = sg->length >> PAGE_SHIFT;
for (; remain && sg_ofs < sg_max; sg_ofs++) {
unsigned long len = min(remain, PAGE_SIZE - x);
void *map;
map = kmap_atomic(nth_page(sg_page(sg), sg_ofs));
if (needs_clflush)
drm_clflush_virt_range(src + x, len);
memcpy(ptr, src + x, len);
kunmap_atomic(src);
drm_clflush_virt_range(map + x, len);
memcpy(ptr, map + x, len);
kunmap_atomic(map);
ptr += len;
remain -= len;
x = 0;
}
}
i915_gem_object_unpin_pages(src_obj);
sg_ofs = 0;
sg = sg_next(sg);
}
}
memset32(dst + length, 0, (dst_obj->base.size - length) / sizeof(u32));
@ -1370,9 +1360,6 @@ static int check_bbstart(u32 *cmd, u32 offset, u32 length,
if (target_cmd_index == offset)
return 0;
if (IS_ERR(jump_whitelist))
return PTR_ERR(jump_whitelist);
if (!test_bit(target_cmd_index, jump_whitelist)) {
DRM_DEBUG("CMD: BB_START to 0x%llx not a previously executed cmd\n",
jump_target);
@ -1382,10 +1369,14 @@ static int check_bbstart(u32 *cmd, u32 offset, u32 length,
return 0;
}
static unsigned long *alloc_whitelist(u32 batch_length)
unsigned long *intel_engine_cmd_parser_alloc_jump_whitelist(u32 batch_length,
bool trampoline)
{
unsigned long *jmp;
if (trampoline)
return NULL;
/*
* We expect batch_length to be less than 256KiB for known users,
* i.e. we need at most an 8KiB bitmap allocation which should be
@ -1423,14 +1414,16 @@ int intel_engine_cmd_parser(struct intel_engine_cs *engine,
unsigned long batch_offset,
unsigned long batch_length,
struct i915_vma *shadow,
bool trampoline)
unsigned long *jump_whitelist,
void *shadow_map,
const void *batch_map)
{
u32 *cmd, *batch_end, offset = 0;
struct drm_i915_cmd_descriptor default_desc = noop_desc;
const struct drm_i915_cmd_descriptor *desc = &default_desc;
unsigned long *jump_whitelist;
u64 batch_addr, shadow_addr;
int ret = 0;
bool trampoline = !jump_whitelist;
GEM_BUG_ON(!IS_ALIGNED(batch_offset, sizeof(*cmd)));
GEM_BUG_ON(!IS_ALIGNED(batch_length, sizeof(*cmd)));
@ -1438,16 +1431,8 @@ int intel_engine_cmd_parser(struct intel_engine_cs *engine,
batch->size));
GEM_BUG_ON(!batch_length);
cmd = copy_batch(shadow->obj, batch->obj, batch_offset, batch_length);
if (IS_ERR(cmd)) {
DRM_DEBUG("CMD: Failed to copy batch\n");
return PTR_ERR(cmd);
}
jump_whitelist = NULL;
if (!trampoline)
/* Defer failure until attempted use */
jump_whitelist = alloc_whitelist(batch_length);
cmd = copy_batch(shadow->obj, batch->obj, batch_offset, batch_length,
shadow_map, batch_map);
shadow_addr = gen8_canonical_addr(shadow->node.start);
batch_addr = gen8_canonical_addr(batch->node.start + batch_offset);
@ -1548,9 +1533,6 @@ int intel_engine_cmd_parser(struct intel_engine_cs *engine,
i915_gem_object_flush_map(shadow->obj);
if (!IS_ERR_OR_NULL(jump_whitelist))
kfree(jump_whitelist);
i915_gem_object_unpin_map(shadow->obj);
return ret;
}

View File

@ -1946,12 +1946,17 @@ const char *i915_cache_level_str(struct drm_i915_private *i915, int type);
int i915_cmd_parser_get_version(struct drm_i915_private *dev_priv);
int intel_engine_init_cmd_parser(struct intel_engine_cs *engine);
void intel_engine_cleanup_cmd_parser(struct intel_engine_cs *engine);
unsigned long *intel_engine_cmd_parser_alloc_jump_whitelist(u32 batch_length,
bool trampoline);
int intel_engine_cmd_parser(struct intel_engine_cs *engine,
struct i915_vma *batch,
unsigned long batch_offset,
unsigned long batch_length,
struct i915_vma *shadow,
bool trampoline);
unsigned long *jump_whitelist,
void *shadow_map,
const void *batch_map);
#define I915_CMD_PARSER_TRAMPOLINE_SIZE 8
/* intel_device_info.c */

View File

@ -135,7 +135,7 @@ bool i915_memcpy_from_wc(void *dst, const void *src, unsigned long len)
* accepts that its arguments may not be aligned, but are valid for the
* potential 16-byte read past the end.
*/
void i915_unaligned_memcpy_from_wc(void *dst, void *src, unsigned long len)
void i915_unaligned_memcpy_from_wc(void *dst, const void *src, unsigned long len)
{
unsigned long addr;

View File

@ -13,7 +13,7 @@ struct drm_i915_private;
void i915_memcpy_init_early(struct drm_i915_private *i915);
bool i915_memcpy_from_wc(void *dst, const void *src, unsigned long len);
void i915_unaligned_memcpy_from_wc(void *dst, void *src, unsigned long len);
void i915_unaligned_memcpy_from_wc(void *dst, const void *src, unsigned long len);
/* The movntdqa instructions used for memcpy-from-wc require 16-byte alignment,
* as well as SSE4.1 support. i915_memcpy_from_wc() will report if it cannot