drm/i915: Split up hangcheck phases

In order to simplify hangcheck state keeping, split the per-engine hangcheck loop into three phases: state load, action, state save. Add a few more hangcheck actions to distinguish between seqno, head and subunit movements. This helps to gather all the hangcheck actions under a single switch umbrella. Cc: Chris Wilson <chris@chris-wilson.co.uk> Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com> Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk> Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com>
2016-11-16 17:20:29 +02:00 · 2016-11-16 17:20:29 +02:00 · 6e16d028e4
parent b2251c0820
commit 6e16d028e4
3 changed files with 145 additions and 106 deletions
--- a/drivers/gpu/drm/i915/i915_gpu_error.c
+++ b/drivers/gpu/drm/i915/i915_gpu_error.c
@ -323,8 +323,12 @@ static const char *hangcheck_action_to_str(enum intel_engine_hangcheck_action a)
 		return "idle";
 	case HANGCHECK_WAIT:
 		return "wait";
-	case HANGCHECK_ACTIVE:
+	case HANGCHECK_ACTIVE_SEQNO:
-		return "active";
+		return "active seqno";
 	case HANGCHECK_ACTIVE_HEAD:
 		return "active head";
 	case HANGCHECK_ACTIVE_SUBUNITS:
 		return "active subunits";
 	case HANGCHECK_KICK:
 		return "kick";
 	case HANGCHECK_HUNG:
--- a/drivers/gpu/drm/i915/intel_hangcheck.c
+++ b/drivers/gpu/drm/i915/intel_hangcheck.c
@ -236,11 +236,11 @@ head_stuck(struct intel_engine_cs *engine, u64 acthd)
 		memset(&engine->hangcheck.instdone, 0,
 		       sizeof(engine->hangcheck.instdone));
-		return HANGCHECK_ACTIVE;
+		return HANGCHECK_ACTIVE_HEAD;
 	}
 	if (!subunits_stuck(engine))
-		return HANGCHECK_ACTIVE;
+		return HANGCHECK_ACTIVE_SUBUNITS;
 	return HANGCHECK_HUNG;
 }
@ -291,6 +291,129 @@ engine_stuck(struct intel_engine_cs *engine, u64 acthd)
 	return HANGCHECK_HUNG;
 }
 /*
  * hangcheck_load_sample - take a fresh hangcheck sample for an engine
  * @engine: engine to sample
  * @hc: working sample to fill in
  *
  * Fills hc->acthd and hc->seqno from intel_engine_get_active_head() and
  * intel_engine_get_seqno(), and seeds hc->score with the score currently
  * stored in engine->hangcheck. hc->action is not set here.
  */
 static void hangcheck_load_sample(struct intel_engine_cs *engine,
 				  struct intel_engine_hangcheck *hc)
 {
 	/* We don't strictly need an irq-barrier here, as we are not
 	 * serving an interrupt request, be paranoid in case the
 	 * barrier has side-effects (such as preventing a broken
 	 * cacheline snoop) and so be sure that we can see the seqno
 	 * advance. If the seqno should stick, due to a stale
 	 * cacheline, we would erroneously declare the GPU hung.
 	 */
 	/* The barrier hook is optional; it must run before the reads below. */
 	if (engine->irq_seqno_barrier)
 		engine->irq_seqno_barrier(engine);
 	hc->acthd = intel_engine_get_active_head(engine);
 	hc->seqno = intel_engine_get_seqno(engine);
 	/* Carry the accumulated score forward so this pass can adjust it. */
 	hc->score = engine->hangcheck.score;
 }
 /*
  * hangcheck_store_sample - commit a working sample back to the engine
  * @engine: engine whose stored hangcheck state is updated
  * @hc: sample to commit (read-only)
  *
  * Copies the action, score, seqno and acthd of @hc into engine->hangcheck.
  * Only those four members are written; anything else kept in
  * engine->hangcheck (e.g. instdone) is left as it is.
  */
 static void hangcheck_store_sample(struct intel_engine_cs *engine,
 				   const struct intel_engine_hangcheck *hc)
 {
 	/* The verdict and its score ... */
 	engine->hangcheck.action = hc->action;
 	engine->hangcheck.score = hc->score;
 	/* ... followed by the readings to compare against next time. */
 	engine->hangcheck.seqno = hc->seqno;
 	engine->hangcheck.acthd = hc->acthd;
 }
 /*
  * hangcheck_get_action - classify a sample against the stored engine state
  * @engine: engine holding the previously stored seqno
  * @hc: freshly loaded sample (read-only)
  *
  * Returns HANGCHECK_ACTIVE_SEQNO when the sampled seqno differs from the
  * stored one, HANGCHECK_IDLE when i915_seqno_passed() holds for the sampled
  * seqno versus intel_engine_last_submit(), and otherwise whatever
  * engine_stuck() reports for the sampled acthd.
  */
 static enum intel_engine_hangcheck_action
 hangcheck_get_action(struct intel_engine_cs *engine,
 		     const struct intel_engine_hangcheck *hc)
 {
 	/* Any seqno movement since the last store counts as activity. */
 	if (hc->seqno != engine->hangcheck.seqno)
 		return HANGCHECK_ACTIVE_SEQNO;
 	/* Seqno unchanged: only inspect the engine further if the sampled
 	 * seqno has not passed the last submission (presumably meaning
 	 * work is still outstanding -- confirm against i915_seqno_passed()).
 	 */
 	if (!i915_seqno_passed(hc->seqno, intel_engine_last_submit(engine)))
 		return engine_stuck(engine, hc->acthd);
 	return HANGCHECK_IDLE;
 }
 /*
  * hangcheck_accumulate_sample - decide the action for a sample and score it
  * @engine: engine the sample was taken from
  * @hc: working sample; hc->action is set and hc->score adjusted in place
  *
  * The action comes from hangcheck_get_action(). Idle and waiting engines
  * leave the score untouched, head/subunit movement, kicks and hangs raise
  * it by increasing amounts, and seqno movement lowers it (clamped at zero).
  * On seqno movement hc->acthd is also zeroed and engine->hangcheck.instdone
  * is cleared directly, as instdone is not part of what
  * hangcheck_store_sample() copies back.
  */
 static void hangcheck_accumulate_sample(struct intel_engine_cs *engine,
 					struct intel_engine_hangcheck *hc)
 {
 	/* Score adjustment applied for each kind of hangcheck action. */
 	enum {
 		SCORE_BUSY = 1,
 		SCORE_KICK = 5,
 		SCORE_HUNG = 20,
 		SCORE_ACTIVE_DECAY = 15,
 	};
 	hc->action = hangcheck_get_action(engine, hc);
 	switch (hc->action) {
 	case HANGCHECK_IDLE:
 	case HANGCHECK_WAIT:
 		break;
 	case HANGCHECK_ACTIVE_HEAD:
 	case HANGCHECK_ACTIVE_SUBUNITS:
 		/* We always increment the hangcheck score
 		 * if the engine is busy and still processing
 		 * the same request, so that no single request
 		 * can run indefinitely (such as a chain of
 		 * batches). The only time we do not increment
 		 * the hangcheck score on this ring is if this
 		 * engine is in a legitimate wait for another
 		 * engine. In that case the waiting engine is a
 		 * victim and we want to be sure we catch the
 		 * right culprit. Then every time we do kick
 		 * the ring, add a small increment to the
 		 * score so that we can catch a batch that is
 		 * being repeatedly kicked and so responsible
 		 * for stalling the machine.
 		 */
 		hc->score += SCORE_BUSY;
 		break;
 	case HANGCHECK_KICK:
 		hc->score += SCORE_KICK;
 		break;
 	case HANGCHECK_HUNG:
 		hc->score += SCORE_HUNG;
 		break;
 	case HANGCHECK_ACTIVE_SEQNO:
 		/* Gradually reduce the count so that we catch DoS
 		 * attempts across multiple batches.
 		 */
 		if (hc->score > 0)
 			hc->score -= SCORE_ACTIVE_DECAY;
 		if (hc->score < 0)
 			hc->score = 0;
 		/* Clear head and subunit states on seqno movement */
 		hc->acthd = 0;
 		memset(&engine->hangcheck.instdone, 0,
 		       sizeof(engine->hangcheck.instdone));
 		break;
 	default:
 		MISSING_CASE(hc->action);
 	}
 }
 /*
  * hangcheck_declare_hang - report hung engines through i915_handle_error()
  * @i915: device the engines belong to
  * @hung: mask of engines whose score reached the hung threshold
  * @stuck: subset of @hung whose last action was not HANGCHECK_HUNG
  *
  * Builds a one-line synopsis ("Hang on ..." or "No progress on ...")
  * naming the blamed engines and hands it, together with the engine mask,
  * to i915_handle_error().
  */
 static void hangcheck_declare_hang(struct drm_i915_private *i915,
 				   unsigned int hung,
 				   unsigned int stuck)
 {
 	struct intel_engine_cs *engine;
 	char msg[80];
 	unsigned int tmp;
 	int len;
 	/* If some rings hung but others were still busy, only
 	 * blame the hanging rings in the synopsis.
 	 */
 	if (stuck != hung)
 		hung &= ~stuck;
 	len = scnprintf(msg, sizeof(msg),
 			"%s on ", stuck == hung ? "No progress" : "Hang");
 	for_each_engine_masked(engine, i915, hung, tmp)
 		len += scnprintf(msg + len, sizeof(msg) - len,
 				 "%s, ", engine->name);
 	/* Drop the trailing ", ". If the buffer filled up and the text was
 	 * truncated the separator may be missing; leave the text alone then
 	 * rather than chopping two characters off an engine name.
 	 */
 	if (len >= 2 && msg[len - 2] == ',' && msg[len - 1] == ' ')
 		msg[len - 2] = '\0';
 	/* i915_handle_error() takes a printf-style format, so pass the
 	 * assembled text as an argument instead of as the format itself.
 	 */
 	i915_handle_error(i915, hung, "%s", msg);
 }
 /*
 * This is called when the chip hasn't reported back with completed
 * batchbuffers in a long time. We keep track per ring seqno progress and
@ -308,10 +431,6 @@ static void i915_hangcheck_elapsed(struct work_struct *work)
 	enum intel_engine_id id;
 	unsigned int hung = 0, stuck = 0;
 	int busy_count = 0;
 #define BUSY 1
 #define KICK 5
 #define HUNG 20
 #define ACTIVE_DECAY 15
 	if (!i915.enable_hangcheck)
 		return;
@ -326,112 +445,26 @@ static void i915_hangcheck_elapsed(struct work_struct *work)
 	intel_uncore_arm_unclaimed_mmio_detection(dev_priv);
 	for_each_engine(engine, dev_priv, id) {
-		bool busy = intel_engine_has_waiter(engine);
+		struct intel_engine_hangcheck cur_state, *hc = &cur_state;
-		u64 acthd;
+		const bool busy = intel_engine_has_waiter(engine);
 		u32 seqno;
 		u32 submit;
 		semaphore_clear_deadlocks(dev_priv);
-		/* We don't strictly need an irq-barrier here, as we are not
+		hangcheck_load_sample(engine, hc);
-		 * serving an interrupt request, be paranoid in case the
+		hangcheck_accumulate_sample(engine, hc);
-		 * barrier has side-effects (such as preventing a broken
+		hangcheck_store_sample(engine, hc);
 		 * cacheline snoop) and so be sure that we can see the seqno
 		 * advance. If the seqno should stick, due to a stale
 		 * cacheline, we would erroneously declare the GPU hung.
 		 */
 		if (engine->irq_seqno_barrier)
 			engine->irq_seqno_barrier(engine);
-		acthd = intel_engine_get_active_head(engine);
+		if (hc->score >= HANGCHECK_SCORE_RING_HUNG) {
-		seqno = intel_engine_get_seqno(engine);
+			hung |= intel_engine_flag(engine);
-		submit = intel_engine_last_submit(engine);
+			if (hc->action != HANGCHECK_HUNG)
-
+				stuck |= intel_engine_flag(engine);
 		if (engine->hangcheck.seqno == seqno) {
 			if (i915_seqno_passed(seqno, submit)) {
 				engine->hangcheck.action = HANGCHECK_IDLE;
 			} else {
 				/* We always increment the hangcheck score
 				 * if the engine is busy and still processing
 				 * the same request, so that no single request
 				 * can run indefinitely (such as a chain of
 				 * batches). The only time we do not increment
 				 * the hangcheck score on this ring, if this
 				 * engine is in a legitimate wait for another
 				 * engine. In that case the waiting engine is a
 				 * victim and we want to be sure we catch the
 				 * right culprit. Then every time we do kick
 				 * the ring, add a small increment to the
 				 * score so that we can catch a batch that is
 				 * being repeatedly kicked and so responsible
 				 * for stalling the machine.
 				 */
 				engine->hangcheck.action =
 					engine_stuck(engine, acthd);
 				switch (engine->hangcheck.action) {
 				case HANGCHECK_IDLE:
 				case HANGCHECK_WAIT:
 					break;
 				case HANGCHECK_ACTIVE:
 					engine->hangcheck.score += BUSY;
 					break;
 				case HANGCHECK_KICK:
 					engine->hangcheck.score += KICK;
 					break;
 				case HANGCHECK_HUNG:
 					engine->hangcheck.score += HUNG;
 					break;
 				}
 			}
 			if (engine->hangcheck.score >= HANGCHECK_SCORE_RING_HUNG) {
 				hung |= intel_engine_flag(engine);
 				if (engine->hangcheck.action != HANGCHECK_HUNG)
 					stuck |= intel_engine_flag(engine);
 			}
 		} else {
 			engine->hangcheck.action = HANGCHECK_ACTIVE;
 			/* Gradually reduce the count so that we catch DoS
 			 * attempts across multiple batches.
 			 */
 			if (engine->hangcheck.score > 0)
 				engine->hangcheck.score -= ACTIVE_DECAY;
 			if (engine->hangcheck.score < 0)
 				engine->hangcheck.score = 0;
 			/* Clear head and subunit states on seqno movement */
 			acthd = 0;
 			memset(&engine->hangcheck.instdone, 0,
 			       sizeof(engine->hangcheck.instdone));
 		}
 		engine->hangcheck.seqno = seqno;
 		engine->hangcheck.acthd = acthd;
 		busy_count += busy;
 	}
-	if (hung) {
+	if (hung)
-		char msg[80];
+		hangcheck_declare_hang(dev_priv, hung, stuck);
 		unsigned int tmp;
 		int len;
 		/* If some rings hung but others were still busy, only
 		 * blame the hanging rings in the synopsis.
 		 */
 		if (stuck != hung)
 			hung &= ~stuck;
 		len = scnprintf(msg, sizeof(msg),
 				"%s on ", stuck == hung ? "No progress" : "Hang");
 		for_each_engine_masked(engine, dev_priv, hung, tmp)
 			len += scnprintf(msg + len, sizeof(msg) - len,
 					 "%s, ", engine->name);
 		msg[len-2] = '\0';
 		return i915_handle_error(dev_priv, hung, msg);
 	}
 	/* Reset timer in case GPU hangs without another request being added */
 	if (busy_count)
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@ -67,7 +67,9 @@ struct intel_hw_status_page {
 enum intel_engine_hangcheck_action {
 	HANGCHECK_IDLE = 0,
 	HANGCHECK_WAIT,
-	HANGCHECK_ACTIVE,
+	HANGCHECK_ACTIVE_SEQNO,
 	HANGCHECK_ACTIVE_HEAD,
 	HANGCHECK_ACTIVE_SUBUNITS,
 	HANGCHECK_KICK,
 	HANGCHECK_HUNG,
 };