powerpc/eeh: Clean up control flow in eeh_handle_normal_event()

Rather than mixing "if (state)" blocks and gotos, convert entirely to
"if (state)" blocks to make the state machine behaviour clearer.

Signed-off-by: Sam Bobroff <sbobroff@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
This commit is contained in:
Sam Bobroff 2018-09-12 11:23:33 +10:00 committed by Michael Ellerman
parent fef7f90552
commit b90484ec11
1 changed files with 97 additions and 105 deletions

View File

@ -808,10 +808,8 @@ void eeh_handle_normal_event(struct eeh_pe *pe)
pr_err("EEH: PHB#%x-PE#%x has failed %d times in the last hour and has been permanently disabled.\n", pr_err("EEH: PHB#%x-PE#%x has failed %d times in the last hour and has been permanently disabled.\n",
pe->phb->global_number, pe->addr, pe->phb->global_number, pe->addr,
pe->freeze_count); pe->freeze_count);
goto hard_fail; result = PCI_ERS_RESULT_DISCONNECT;
} }
pr_warn("EEH: This PCI device has failed %d times in the last hour and will be permanently disabled after %d failures.\n",
pe->freeze_count, eeh_max_freezes);
/* Walk the various device drivers attached to this slot through /* Walk the various device drivers attached to this slot through
* a reset sequence, giving each an opportunity to do what it needs * a reset sequence, giving each an opportunity to do what it needs
@ -823,31 +821,39 @@ void eeh_handle_normal_event(struct eeh_pe *pe)
* the error. Override the result if necessary to have partially * the error. Override the result if necessary to have partially
* hotplug for this case. * hotplug for this case.
*/ */
pr_info("EEH: Notify device drivers to shutdown\n"); if (result != PCI_ERS_RESULT_DISCONNECT) {
eeh_set_channel_state(pe, pci_channel_io_frozen); pr_warn("EEH: This PCI device has failed %d times in the last hour and will be permanently disabled after %d failures.\n",
eeh_set_irq_state(pe, false); pe->freeze_count, eeh_max_freezes);
eeh_pe_report("error_detected(IO frozen)", pe, eeh_report_error, pr_info("EEH: Notify device drivers to shutdown\n");
&result); eeh_set_channel_state(pe, pci_channel_io_frozen);
if ((pe->type & EEH_PE_PHB) && eeh_set_irq_state(pe, false);
result != PCI_ERS_RESULT_NONE && eeh_pe_report("error_detected(IO frozen)", pe,
result != PCI_ERS_RESULT_NEED_RESET) eeh_report_error, &result);
result = PCI_ERS_RESULT_NEED_RESET; if ((pe->type & EEH_PE_PHB) &&
result != PCI_ERS_RESULT_NONE &&
result != PCI_ERS_RESULT_NEED_RESET)
result = PCI_ERS_RESULT_NEED_RESET;
}
/* Get the current PCI slot state. This can take a long time, /* Get the current PCI slot state. This can take a long time,
* sometimes over 300 seconds for certain systems. * sometimes over 300 seconds for certain systems.
*/ */
rc = eeh_wait_state(pe, MAX_WAIT_FOR_RECOVERY*1000); if (result != PCI_ERS_RESULT_DISCONNECT) {
if (rc < 0 || rc == EEH_STATE_NOT_SUPPORT) { rc = eeh_wait_state(pe, MAX_WAIT_FOR_RECOVERY*1000);
pr_warn("EEH: Permanent failure\n"); if (rc < 0 || rc == EEH_STATE_NOT_SUPPORT) {
goto hard_fail; pr_warn("EEH: Permanent failure\n");
result = PCI_ERS_RESULT_DISCONNECT;
}
} }
/* Since rtas may enable MMIO when posting the error log, /* Since rtas may enable MMIO when posting the error log,
* don't post the error log until after all dev drivers * don't post the error log until after all dev drivers
* have been informed. * have been informed.
*/ */
pr_info("EEH: Collect temporary log\n"); if (result != PCI_ERS_RESULT_DISCONNECT) {
eeh_slot_error_detail(pe, EEH_LOG_TEMP); pr_info("EEH: Collect temporary log\n");
eeh_slot_error_detail(pe, EEH_LOG_TEMP);
}
/* If all device drivers were EEH-unaware, then shut /* If all device drivers were EEH-unaware, then shut
* down all of the device drivers, and hope they * down all of the device drivers, and hope they
@ -859,7 +865,7 @@ void eeh_handle_normal_event(struct eeh_pe *pe)
if (rc) { if (rc) {
pr_warn("%s: Unable to reset, err=%d\n", pr_warn("%s: Unable to reset, err=%d\n",
__func__, rc); __func__, rc);
goto hard_fail; result = PCI_ERS_RESULT_DISCONNECT;
} }
} }
@ -868,9 +874,9 @@ void eeh_handle_normal_event(struct eeh_pe *pe)
pr_info("EEH: Enable I/O for affected devices\n"); pr_info("EEH: Enable I/O for affected devices\n");
rc = eeh_pci_enable(pe, EEH_OPT_THAW_MMIO); rc = eeh_pci_enable(pe, EEH_OPT_THAW_MMIO);
if (rc < 0) if (rc < 0) {
goto hard_fail; result = PCI_ERS_RESULT_DISCONNECT;
if (rc) { } else if (rc) {
result = PCI_ERS_RESULT_NEED_RESET; result = PCI_ERS_RESULT_NEED_RESET;
} else { } else {
pr_info("EEH: Notify device drivers to resume I/O\n"); pr_info("EEH: Notify device drivers to resume I/O\n");
@ -884,9 +890,9 @@ void eeh_handle_normal_event(struct eeh_pe *pe)
pr_info("EEH: Enabled DMA for affected devices\n"); pr_info("EEH: Enabled DMA for affected devices\n");
rc = eeh_pci_enable(pe, EEH_OPT_THAW_DMA); rc = eeh_pci_enable(pe, EEH_OPT_THAW_DMA);
if (rc < 0) if (rc < 0) {
goto hard_fail; result = PCI_ERS_RESULT_DISCONNECT;
if (rc) { } else if (rc) {
result = PCI_ERS_RESULT_NEED_RESET; result = PCI_ERS_RESULT_NEED_RESET;
} else { } else {
/* /*
@ -899,12 +905,6 @@ void eeh_handle_normal_event(struct eeh_pe *pe)
} }
} }
/* If any device has a hard failure, then shut off everything. */
if (result == PCI_ERS_RESULT_DISCONNECT) {
pr_warn("EEH: Device driver gave up\n");
goto hard_fail;
}
/* If any device called out for a reset, then reset the slot */ /* If any device called out for a reset, then reset the slot */
if (result == PCI_ERS_RESULT_NEED_RESET) { if (result == PCI_ERS_RESULT_NEED_RESET) {
pr_info("EEH: Reset without hotplug activity\n"); pr_info("EEH: Reset without hotplug activity\n");
@ -912,89 +912,81 @@ void eeh_handle_normal_event(struct eeh_pe *pe)
if (rc) { if (rc) {
pr_warn("%s: Cannot reset, err=%d\n", pr_warn("%s: Cannot reset, err=%d\n",
__func__, rc); __func__, rc);
goto hard_fail; result = PCI_ERS_RESULT_DISCONNECT;
} else {
result = PCI_ERS_RESULT_NONE;
eeh_set_channel_state(pe, pci_channel_io_normal);
eeh_set_irq_state(pe, true);
eeh_pe_report("slot_reset", pe, eeh_report_reset,
&result);
}
}
if ((result == PCI_ERS_RESULT_RECOVERED) ||
(result == PCI_ERS_RESULT_NONE)) {
/*
* For those hot removed VFs, we should add back them after PF
* get recovered properly.
*/
list_for_each_entry_safe(edev, tmp, &rmv_data.removed_vf_list,
rmv_entry) {
eeh_add_virt_device(edev);
list_del(&edev->rmv_entry);
} }
pr_info("EEH: Notify device drivers " /* Tell all device drivers that they can resume operations */
"the completion of reset\n"); pr_info("EEH: Notify device driver to resume\n");
result = PCI_ERS_RESULT_NONE;
eeh_set_channel_state(pe, pci_channel_io_normal); eeh_set_channel_state(pe, pci_channel_io_normal);
eeh_set_irq_state(pe, true); eeh_set_irq_state(pe, true);
eeh_pe_report("slot_reset", pe, eeh_report_reset, &result); eeh_pe_report("resume", pe, eeh_report_resume, NULL);
} eeh_for_each_pe(pe, tmp_pe) {
eeh_pe_for_each_dev(tmp_pe, edev, tmp) {
edev->mode &= ~EEH_DEV_NO_HANDLER;
edev->in_error = false;
}
}
/* All devices should claim they have recovered by now. */ pr_info("EEH: Recovery successful.\n");
if ((result != PCI_ERS_RESULT_RECOVERED) && } else {
(result != PCI_ERS_RESULT_NONE)) { /*
pr_warn("EEH: Not recovered\n"); * About 90% of all real-life EEH failures in the field
goto hard_fail; * are due to poorly seated PCI cards. Only 10% or so are
} * due to actual, failed cards.
*/
pr_err("EEH: Unable to recover from failure from PHB#%x-PE#%x.\n"
"Please try reseating or replacing it\n",
pe->phb->global_number, pe->addr);
/* eeh_slot_error_detail(pe, EEH_LOG_PERM);
* For those hot removed VFs, we should add back them after PF get
* recovered properly.
*/
list_for_each_entry_safe(edev, tmp, &rmv_data.removed_vf_list,
rmv_entry) {
eeh_add_virt_device(edev);
list_del(&edev->rmv_entry);
}
/* Tell all device drivers that they can resume operations */ /* Notify all devices that they're about to go down. */
pr_info("EEH: Notify device driver to resume\n"); eeh_set_channel_state(pe, pci_channel_io_perm_failure);
eeh_set_channel_state(pe, pci_channel_io_normal); eeh_set_irq_state(pe, false);
eeh_set_irq_state(pe, true); eeh_pe_report("error_detected(permanent failure)", pe,
eeh_pe_report("resume", pe, eeh_report_resume, NULL); eeh_report_failure, NULL);
eeh_for_each_pe(pe, tmp_pe) {
eeh_pe_for_each_dev(tmp_pe, edev, tmp) { /* Mark the PE to be removed permanently */
edev->mode &= ~EEH_DEV_NO_HANDLER; eeh_pe_state_mark(pe, EEH_PE_REMOVED);
edev->in_error = false;
/*
* Shut down the device drivers for good. We mark
* all removed devices correctly to avoid access
* the their PCI config any more.
*/
if (pe->type & EEH_PE_VF) {
eeh_pe_dev_traverse(pe, eeh_rmv_device, NULL);
eeh_pe_dev_mode_mark(pe, EEH_DEV_REMOVED);
} else {
eeh_pe_state_clear(pe, EEH_PE_PRI_BUS);
eeh_pe_dev_mode_mark(pe, EEH_DEV_REMOVED);
pci_lock_rescan_remove();
pci_hp_remove_devices(bus);
pci_unlock_rescan_remove();
/* The passed PE should no longer be used */
return;
} }
} }
pr_info("EEH: Recovery successful.\n");
goto final;
hard_fail:
/*
* About 90% of all real-life EEH failures in the field
* are due to poorly seated PCI cards. Only 10% or so are
* due to actual, failed cards.
*/
pr_err("EEH: Unable to recover from failure from PHB#%x-PE#%x.\n"
"Please try reseating or replacing it\n",
pe->phb->global_number, pe->addr);
eeh_slot_error_detail(pe, EEH_LOG_PERM);
/* Notify all devices that they're about to go down. */
eeh_set_channel_state(pe, pci_channel_io_perm_failure);
eeh_set_irq_state(pe, false);
eeh_pe_report("error_detected(permanent failure)", pe,
eeh_report_failure, NULL);
/* Mark the PE to be removed permanently */
eeh_pe_state_mark(pe, EEH_PE_REMOVED);
/*
* Shut down the device drivers for good. We mark
* all removed devices correctly to avoid access
* the their PCI config any more.
*/
if (pe->type & EEH_PE_VF) {
eeh_pe_dev_traverse(pe, eeh_rmv_device, NULL);
eeh_pe_dev_mode_mark(pe, EEH_DEV_REMOVED);
} else {
eeh_pe_state_clear(pe, EEH_PE_PRI_BUS);
eeh_pe_dev_mode_mark(pe, EEH_DEV_REMOVED);
pci_lock_rescan_remove();
pci_hp_remove_devices(bus);
pci_unlock_rescan_remove();
/* The passed PE should no longer be used */
return;
}
final:
eeh_pe_state_clear(pe, EEH_PE_RECOVERING); eeh_pe_state_clear(pe, EEH_PE_RECOVERING);
} }