[SCSI] Reduce error recovery time by reducing use of TURs

In error recovery, most scsi error recovery stages will send a TUR command
for every bad command when a driver's error handler reports success.  When
several bad commands are outstanding to the same device, this results in the
device being probed multiple times.

This becomes very problematic if the device or connection is in a state
where the device still doesn't respond to commands even after a recovery
function returns success.  The error handler must wait for the test
commands to time out.  The time waiting for the redundant commands can
drastically lengthen error recovery.

This patch alters the scsi mid-layer's error routines to send test commands
once per device instead of once per bad command.  This can drastically
lower error recovery time.

[jejb: fixed up whitespace and formatting]
Signed-off-by: David Jeffery <djeffery@redhat.com>
Signed-off-by: James Bottomley <jbottomley@parallels.com>
This commit is contained in:
David Jeffery 2011-05-19 14:41:12 -04:00 committed by James Bottomley
parent 0bcaa11154
commit 3eef6257de
1 changed files with 67 additions and 20 deletions

View File

@ -50,6 +50,8 @@
#define BUS_RESET_SETTLE_TIME (10)
#define HOST_RESET_SETTLE_TIME (10)
static int scsi_eh_try_stu(struct scsi_cmnd *scmd);
/* called with shost->host_lock held */
void scsi_eh_wakeup(struct Scsi_Host *shost)
{
@ -946,6 +948,48 @@ static int scsi_eh_tur(struct scsi_cmnd *scmd)
}
}
/**
* scsi_eh_test_devices - check if devices are responding from error recovery.
* @cmd_list: scsi commands in error recovery.
* @work_q: queue for commands which still need more error recovery
* @done_q: queue for commands which are finished
* @try_stu: boolean on if a STU command should be tried in addition to TUR.
*
* Decription:
* Tests if devices are in a working state. Commands to devices now in
* a working state are sent to the done_q while commands to devices which
* are still failing to respond are returned to the work_q for more
* processing.
**/
static int scsi_eh_test_devices(struct list_head *cmd_list,
struct list_head *work_q,
struct list_head *done_q, int try_stu)
{
struct scsi_cmnd *scmd, *next;
struct scsi_device *sdev;
int finish_cmds;
while (!list_empty(cmd_list)) {
scmd = list_entry(cmd_list->next, struct scsi_cmnd, eh_entry);
sdev = scmd->device;
finish_cmds = !scsi_device_online(scmd->device) ||
(try_stu && !scsi_eh_try_stu(scmd) &&
!scsi_eh_tur(scmd)) ||
!scsi_eh_tur(scmd);
list_for_each_entry_safe(scmd, next, cmd_list, eh_entry)
if (scmd->device == sdev) {
if (finish_cmds)
scsi_eh_finish_cmd(scmd, done_q);
else
list_move_tail(&scmd->eh_entry, work_q);
}
}
return list_empty(work_q);
}
/**
* scsi_eh_abort_cmds - abort pending commands.
* @work_q: &list_head for pending commands.
@ -962,6 +1006,7 @@ static int scsi_eh_abort_cmds(struct list_head *work_q,
struct list_head *done_q)
{
struct scsi_cmnd *scmd, *next;
LIST_HEAD(check_list);
int rtn;
list_for_each_entry_safe(scmd, next, work_q, eh_entry) {
@ -973,11 +1018,10 @@ static int scsi_eh_abort_cmds(struct list_head *work_q,
rtn = scsi_try_to_abort_cmd(scmd->device->host->hostt, scmd);
if (rtn == SUCCESS || rtn == FAST_IO_FAIL) {
scmd->eh_eflags &= ~SCSI_EH_CANCEL_CMD;
if (!scsi_device_online(scmd->device) ||
rtn == FAST_IO_FAIL ||
!scsi_eh_tur(scmd)) {
if (rtn == FAST_IO_FAIL)
scsi_eh_finish_cmd(scmd, done_q);
}
else
list_move_tail(&scmd->eh_entry, &check_list);
} else
SCSI_LOG_ERROR_RECOVERY(3, printk("%s: aborting"
" cmd failed:"
@ -986,7 +1030,7 @@ static int scsi_eh_abort_cmds(struct list_head *work_q,
scmd));
}
return list_empty(work_q);
return scsi_eh_test_devices(&check_list, work_q, done_q, 0);
}
/**
@ -1137,6 +1181,7 @@ static int scsi_eh_target_reset(struct Scsi_Host *shost,
struct list_head *done_q)
{
LIST_HEAD(tmp_list);
LIST_HEAD(check_list);
list_splice_init(work_q, &tmp_list);
@ -1161,9 +1206,9 @@ static int scsi_eh_target_reset(struct Scsi_Host *shost,
if (scmd_id(scmd) != id)
continue;
if ((rtn == SUCCESS || rtn == FAST_IO_FAIL)
&& (!scsi_device_online(scmd->device) ||
rtn == FAST_IO_FAIL || !scsi_eh_tur(scmd)))
if (rtn == SUCCESS)
list_move_tail(&scmd->eh_entry, &check_list);
else if (rtn == FAST_IO_FAIL)
scsi_eh_finish_cmd(scmd, done_q);
else
/* push back on work queue for further processing */
@ -1171,7 +1216,7 @@ static int scsi_eh_target_reset(struct Scsi_Host *shost,
}
}
return list_empty(work_q);
return scsi_eh_test_devices(&check_list, work_q, done_q, 0);
}
/**
@ -1185,6 +1230,7 @@ static int scsi_eh_bus_reset(struct Scsi_Host *shost,
struct list_head *done_q)
{
struct scsi_cmnd *scmd, *chan_scmd, *next;
LIST_HEAD(check_list);
unsigned int channel;
int rtn;
@ -1216,12 +1262,14 @@ static int scsi_eh_bus_reset(struct Scsi_Host *shost,
rtn = scsi_try_bus_reset(chan_scmd);
if (rtn == SUCCESS || rtn == FAST_IO_FAIL) {
list_for_each_entry_safe(scmd, next, work_q, eh_entry) {
if (channel == scmd_channel(scmd))
if (!scsi_device_online(scmd->device) ||
rtn == FAST_IO_FAIL ||
!scsi_eh_tur(scmd))
if (channel == scmd_channel(scmd)) {
if (rtn == FAST_IO_FAIL)
scsi_eh_finish_cmd(scmd,
done_q);
else
list_move_tail(&scmd->eh_entry,
&check_list);
}
}
} else {
SCSI_LOG_ERROR_RECOVERY(3, printk("%s: BRST"
@ -1230,7 +1278,7 @@ static int scsi_eh_bus_reset(struct Scsi_Host *shost,
channel));
}
}
return list_empty(work_q);
return scsi_eh_test_devices(&check_list, work_q, done_q, 0);
}
/**
@ -1242,6 +1290,7 @@ static int scsi_eh_host_reset(struct list_head *work_q,
struct list_head *done_q)
{
struct scsi_cmnd *scmd, *next;
LIST_HEAD(check_list);
int rtn;
if (!list_empty(work_q)) {
@ -1252,12 +1301,10 @@ static int scsi_eh_host_reset(struct list_head *work_q,
, current->comm));
rtn = scsi_try_host_reset(scmd);
if (rtn == SUCCESS || rtn == FAST_IO_FAIL) {
if (rtn == SUCCESS) {
list_splice_init(work_q, &check_list);
} else if (rtn == FAST_IO_FAIL) {
list_for_each_entry_safe(scmd, next, work_q, eh_entry) {
if (!scsi_device_online(scmd->device) ||
rtn == FAST_IO_FAIL ||
(!scsi_eh_try_stu(scmd) && !scsi_eh_tur(scmd)) ||
!scsi_eh_tur(scmd))
scsi_eh_finish_cmd(scmd, done_q);
}
} else {
@ -1266,7 +1313,7 @@ static int scsi_eh_host_reset(struct list_head *work_q,
current->comm));
}
}
return list_empty(work_q);
return scsi_eh_test_devices(&check_list, work_q, done_q, 1);
}
/**