linux/drivers/gpu/drm/i915/i915_reset.h

70 lines
1.8 KiB
C
Raw Normal View History

/*
* SPDX-License-Identifier: MIT
*
* Copyright © 2008-2018 Intel Corporation
*/
#ifndef I915_RESET_H
#define I915_RESET_H
#include <linux/compiler.h>
#include <linux/types.h>
drm/i915: Revoke mmaps and prevent access to fence registers across reset Previously, we were able to rely on the recursive properties of struct_mutex to allow us to serialise revoking mmaps and reacquiring the FENCE registers with them being clobbered over a global device reset. I then proceeded to throw out the baby with the bath water in order to pursue a struct_mutex-less reset. Perusing LWN for alternative strategies, the dilemma on how to serialise access to a global resource on one side was answered by https://lwn.net/Articles/202847/ -- Sleepable RCU: 1 int readside(void) { 2 int idx; 3 rcu_read_lock(); 4 if (nomoresrcu) { 5 rcu_read_unlock(); 6 return -EINVAL; 7 } 8 idx = srcu_read_lock(&ss); 9 rcu_read_unlock(); 10 /* SRCU read-side critical section. */ 11 srcu_read_unlock(&ss, idx); 12 return 0; 13 } 14 15 void cleanup(void) 16 { 17 nomoresrcu = 1; 18 synchronize_rcu(); 19 synchronize_srcu(&ss); 20 cleanup_srcu_struct(&ss); 21 } No more worrying about stop_machine, just an uber-complex mutex, optimised for reads, with the overhead pushed to the rare reset path. However, we do run the risk of a deadlock as we allocate underneath the SRCU read lock, and the allocation may require a GPU reset, causing a dependency cycle via the in-flight requests. We resolve that by declaring the driver wedged and cancelling all in-flight rendering. v2: Use expedited rcu barriers to match our earlier timing characteristics. v3: Try to annotate locking contexts for sparse v4: Reduce selftest lock duration to avoid a reset deadlock with fences v5: s/srcu/reset_backoff_srcu/ v6: Remove more stale comments Testcase: igt/gem_mmap_gtt/hang Fixes: eb8d0f5af4ec ("drm/i915: Remove GPU reset dependence on struct_mutex") Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> Cc: Mika Kuoppala <mika.kuoppala@intel.com> Reviewed-by: Mika Kuoppala <mika.kuoppala@linux.intel.com> Link: https://patchwork.freedesktop.org/patch/msgid/20190208153708.20023-2-chris@chris-wilson.co.uk
2019-02-08 23:37:03 +08:00
#include <linux/srcu.h>
#include "intel_engine_types.h"
struct drm_i915_private;
struct i915_request;
struct intel_engine_cs;
struct intel_guc;
__printf(4, 5)
void i915_handle_error(struct drm_i915_private *i915,
intel_engine_mask_t engine_mask,
unsigned long flags,
const char *fmt, ...);
#define I915_ERROR_CAPTURE BIT(0)
void i915_clear_error_registers(struct drm_i915_private *i915);
void i915_reset(struct drm_i915_private *i915,
intel_engine_mask_t stalled_mask,
const char *reason);
int i915_reset_engine(struct intel_engine_cs *engine,
const char *reason);
void i915_reset_request(struct i915_request *rq, bool guilty);
bool i915_reset_flush(struct drm_i915_private *i915);
drm/i915: Revoke mmaps and prevent access to fence registers across reset Previously, we were able to rely on the recursive properties of struct_mutex to allow us to serialise revoking mmaps and reacquiring the FENCE registers with them being clobbered over a global device reset. I then proceeded to throw out the baby with the bath water in order to pursue a struct_mutex-less reset. Perusing LWN for alternative strategies, the dilemma on how to serialise access to a global resource on one side was answered by https://lwn.net/Articles/202847/ -- Sleepable RCU: 1 int readside(void) { 2 int idx; 3 rcu_read_lock(); 4 if (nomoresrcu) { 5 rcu_read_unlock(); 6 return -EINVAL; 7 } 8 idx = srcu_read_lock(&ss); 9 rcu_read_unlock(); 10 /* SRCU read-side critical section. */ 11 srcu_read_unlock(&ss, idx); 12 return 0; 13 } 14 15 void cleanup(void) 16 { 17 nomoresrcu = 1; 18 synchronize_rcu(); 19 synchronize_srcu(&ss); 20 cleanup_srcu_struct(&ss); 21 } No more worrying about stop_machine, just an uber-complex mutex, optimised for reads, with the overhead pushed to the rare reset path. However, we do run the risk of a deadlock as we allocate underneath the SRCU read lock, and the allocation may require a GPU reset, causing a dependency cycle via the in-flight requests. We resolve that by declaring the driver wedged and cancelling all in-flight rendering. v2: Use expedited rcu barriers to match our earlier timing characteristics. v3: Try to annotate locking contexts for sparse v4: Reduce selftest lock duration to avoid a reset deadlock with fences v5: s/srcu/reset_backoff_srcu/ v6: Remove more stale comments Testcase: igt/gem_mmap_gtt/hang Fixes: eb8d0f5af4ec ("drm/i915: Remove GPU reset dependence on struct_mutex") Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> Cc: Mika Kuoppala <mika.kuoppala@intel.com> Reviewed-by: Mika Kuoppala <mika.kuoppala@linux.intel.com> Link: https://patchwork.freedesktop.org/patch/msgid/20190208153708.20023-2-chris@chris-wilson.co.uk
2019-02-08 23:37:03 +08:00
int __must_check i915_reset_trylock(struct drm_i915_private *i915);
void i915_reset_unlock(struct drm_i915_private *i915, int tag);
int i915_terminally_wedged(struct drm_i915_private *i915);
bool intel_has_gpu_reset(struct drm_i915_private *i915);
bool intel_has_reset_engine(struct drm_i915_private *i915);
int intel_gpu_reset(struct drm_i915_private *i915,
intel_engine_mask_t engine_mask);
int intel_reset_guc(struct drm_i915_private *i915);
struct i915_wedge_me {
struct delayed_work work;
struct drm_i915_private *i915;
const char *name;
};
void __i915_init_wedge(struct i915_wedge_me *w,
struct drm_i915_private *i915,
long timeout,
const char *name);
void __i915_fini_wedge(struct i915_wedge_me *w);
#define i915_wedge_on_timeout(W, DEV, TIMEOUT) \
for (__i915_init_wedge((W), (DEV), (TIMEOUT), __func__); \
(W)->i915; \
__i915_fini_wedge((W)))
#endif /* I915_RESET_H */