drm/i915/hangcheck: Prevent long walks across full-ppgtt
With full-ppgtt, it takes the GPU an eon to traverse the entire 256PiB address space, causing a loop to be detected. Under the current scheme, if ACTHD walks off the end of a batch buffer and into an empty address space, we "never" detect the hang. If we always increment the score while the ACTHD is progressing, then we will eventually time out (after ~46.5s (31 * 1.5s) without advancing onto a new batch). To counteract this, increase the amount by which we reduce the score for good batches, so that only a series of almost-bad batches triggers a full reset. DoS detection suffers slightly, but a series of long-running shader tests will benefit. Based on a patch from Chris Wilson. Testcase: igt/drv_hangman/hangcheck-unterminated Cc: Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com> Cc: Chris Wilson <chris@chris-wilson.co.uk> Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com> Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk> Link: http://patchwork.freedesktop.org/patch/msgid/1456930109-21532-1-git-send-email-mika.kuoppala@intel.com
This commit is contained in:
parent
d431440cce
commit
24a65e624b
|
@ -1367,8 +1367,6 @@ static int i915_hangcheck_info(struct seq_file *m, void *unused)
|
||||||
seq_printf(m, "\tACTHD = 0x%08llx [current 0x%08llx]\n",
|
seq_printf(m, "\tACTHD = 0x%08llx [current 0x%08llx]\n",
|
||||||
(long long)ring->hangcheck.acthd,
|
(long long)ring->hangcheck.acthd,
|
||||||
(long long)acthd[i]);
|
(long long)acthd[i]);
|
||||||
seq_printf(m, "\tmax ACTHD = 0x%08llx\n",
|
|
||||||
(long long)ring->hangcheck.max_acthd);
|
|
||||||
seq_printf(m, "\tscore = %d\n", ring->hangcheck.score);
|
seq_printf(m, "\tscore = %d\n", ring->hangcheck.score);
|
||||||
seq_printf(m, "\taction = %d\n", ring->hangcheck.action);
|
seq_printf(m, "\taction = %d\n", ring->hangcheck.action);
|
||||||
|
|
||||||
|
|
|
@ -230,8 +230,6 @@ static const char *hangcheck_action_to_str(enum intel_ring_hangcheck_action a)
|
||||||
return "wait";
|
return "wait";
|
||||||
case HANGCHECK_ACTIVE:
|
case HANGCHECK_ACTIVE:
|
||||||
return "active";
|
return "active";
|
||||||
case HANGCHECK_ACTIVE_LOOP:
|
|
||||||
return "active (loop)";
|
|
||||||
case HANGCHECK_KICK:
|
case HANGCHECK_KICK:
|
||||||
return "kick";
|
return "kick";
|
||||||
case HANGCHECK_HUNG:
|
case HANGCHECK_HUNG:
|
||||||
|
|
|
@ -3001,12 +3001,7 @@ head_stuck(struct intel_engine_cs *ring, u64 acthd)
|
||||||
memset(ring->hangcheck.instdone, 0,
|
memset(ring->hangcheck.instdone, 0,
|
||||||
sizeof(ring->hangcheck.instdone));
|
sizeof(ring->hangcheck.instdone));
|
||||||
|
|
||||||
if (acthd > ring->hangcheck.max_acthd) {
|
return HANGCHECK_ACTIVE;
|
||||||
ring->hangcheck.max_acthd = acthd;
|
|
||||||
return HANGCHECK_ACTIVE;
|
|
||||||
}
|
|
||||||
|
|
||||||
return HANGCHECK_ACTIVE_LOOP;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!subunits_stuck(ring))
|
if (!subunits_stuck(ring))
|
||||||
|
@ -3083,6 +3078,7 @@ static void i915_hangcheck_elapsed(struct work_struct *work)
|
||||||
#define BUSY 1
|
#define BUSY 1
|
||||||
#define KICK 5
|
#define KICK 5
|
||||||
#define HUNG 20
|
#define HUNG 20
|
||||||
|
#define ACTIVE_DECAY 15
|
||||||
|
|
||||||
if (!i915.enable_hangcheck)
|
if (!i915.enable_hangcheck)
|
||||||
return;
|
return;
|
||||||
|
@ -3151,9 +3147,8 @@ static void i915_hangcheck_elapsed(struct work_struct *work)
|
||||||
switch (ring->hangcheck.action) {
|
switch (ring->hangcheck.action) {
|
||||||
case HANGCHECK_IDLE:
|
case HANGCHECK_IDLE:
|
||||||
case HANGCHECK_WAIT:
|
case HANGCHECK_WAIT:
|
||||||
case HANGCHECK_ACTIVE:
|
|
||||||
break;
|
break;
|
||||||
case HANGCHECK_ACTIVE_LOOP:
|
case HANGCHECK_ACTIVE:
|
||||||
ring->hangcheck.score += BUSY;
|
ring->hangcheck.score += BUSY;
|
||||||
break;
|
break;
|
||||||
case HANGCHECK_KICK:
|
case HANGCHECK_KICK:
|
||||||
|
@ -3172,10 +3167,12 @@ static void i915_hangcheck_elapsed(struct work_struct *work)
|
||||||
* attempts across multiple batches.
|
* attempts across multiple batches.
|
||||||
*/
|
*/
|
||||||
if (ring->hangcheck.score > 0)
|
if (ring->hangcheck.score > 0)
|
||||||
ring->hangcheck.score--;
|
ring->hangcheck.score -= ACTIVE_DECAY;
|
||||||
|
if (ring->hangcheck.score < 0)
|
||||||
|
ring->hangcheck.score = 0;
|
||||||
|
|
||||||
/* Clear head and subunit states on seqno movement */
|
/* Clear head and subunit states on seqno movement */
|
||||||
ring->hangcheck.acthd = ring->hangcheck.max_acthd = 0;
|
ring->hangcheck.acthd = 0;
|
||||||
|
|
||||||
memset(ring->hangcheck.instdone, 0,
|
memset(ring->hangcheck.instdone, 0,
|
||||||
sizeof(ring->hangcheck.instdone));
|
sizeof(ring->hangcheck.instdone));
|
||||||
|
|
|
@ -79,7 +79,6 @@ enum intel_ring_hangcheck_action {
|
||||||
HANGCHECK_IDLE = 0,
|
HANGCHECK_IDLE = 0,
|
||||||
HANGCHECK_WAIT,
|
HANGCHECK_WAIT,
|
||||||
HANGCHECK_ACTIVE,
|
HANGCHECK_ACTIVE,
|
||||||
HANGCHECK_ACTIVE_LOOP,
|
|
||||||
HANGCHECK_KICK,
|
HANGCHECK_KICK,
|
||||||
HANGCHECK_HUNG,
|
HANGCHECK_HUNG,
|
||||||
};
|
};
|
||||||
|
@ -88,7 +87,6 @@ enum intel_ring_hangcheck_action {
|
||||||
|
|
||||||
struct intel_ring_hangcheck {
|
struct intel_ring_hangcheck {
|
||||||
u64 acthd;
|
u64 acthd;
|
||||||
u64 max_acthd;
|
|
||||||
u32 seqno;
|
u32 seqno;
|
||||||
int score;
|
int score;
|
||||||
enum intel_ring_hangcheck_action action;
|
enum intel_ring_hangcheck_action action;
|
||||||
|
|
Loading…
Reference in New Issue