summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--arch/powerpc/include/asm/eeh.h10
-rw-r--r--arch/powerpc/kernel/eeh_driver.c150
-rw-r--r--arch/powerpc/platforms/powernv/eeh-ioda.c39
3 files changed, 112 insertions, 87 deletions
diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h
index 8b4b8e4a5c32..9e39ceb1d19f 100644
--- a/arch/powerpc/include/asm/eeh.h
+++ b/arch/powerpc/include/asm/eeh.h
@@ -118,6 +118,16 @@ static inline struct pci_dev *eeh_dev_to_pci_dev(struct eeh_dev *edev)
return edev ? edev->pdev : NULL;
}
+/* Return values from eeh_ops::next_error */
+enum {
+ EEH_NEXT_ERR_NONE = 0,
+ EEH_NEXT_ERR_INF,
+ EEH_NEXT_ERR_FROZEN_PE,
+ EEH_NEXT_ERR_FENCED_PHB,
+ EEH_NEXT_ERR_DEAD_PHB,
+ EEH_NEXT_ERR_DEAD_IOC
+};
+
/*
* The struct is used to trace the registered EEH operation
* callback functions. Actually, those operation callback
diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c
index 7db39203a073..baa3b06e6dab 100644
--- a/arch/powerpc/kernel/eeh_driver.c
+++ b/arch/powerpc/kernel/eeh_driver.c
@@ -630,84 +630,90 @@ static void eeh_handle_special_event(void)
{
struct eeh_pe *pe, *phb_pe;
struct pci_bus *bus;
- struct pci_controller *hose, *tmp;
+ struct pci_controller *hose;
unsigned long flags;
- int rc = 0;
+ int rc;
- /*
- * The return value from next_error() has been classified as follows.
- * It might be good to enumerate them. However, next_error() is only
- * supported by PowerNV platform for now. So it would be fine to use
- * integer directly:
- *
- * 4 - Dead IOC 3 - Dead PHB
- * 2 - Fenced PHB 1 - Frozen PE
- * 0 - No error found
- *
- */
- rc = eeh_ops->next_error(&pe);
- if (rc <= 0)
- return;
- switch (rc) {
- case 4:
- /* Mark all PHBs in dead state */
- eeh_serialize_lock(&flags);
- list_for_each_entry_safe(hose, tmp,
- &hose_list, list_node) {
- phb_pe = eeh_phb_pe_get(hose);
- if (!phb_pe) continue;
-
- eeh_pe_state_mark(phb_pe,
- EEH_PE_ISOLATED | EEH_PE_PHB_DEAD);
+ do {
+ rc = eeh_ops->next_error(&pe);
+
+ switch (rc) {
+ case EEH_NEXT_ERR_DEAD_IOC:
+ /* Mark all PHBs in dead state */
+ eeh_serialize_lock(&flags);
+
+ /* Purge all events */
+ eeh_remove_event(NULL);
+
+ list_for_each_entry(hose, &hose_list, list_node) {
+ phb_pe = eeh_phb_pe_get(hose);
+ if (!phb_pe) continue;
+
+ eeh_pe_state_mark(phb_pe,
+ EEH_PE_ISOLATED | EEH_PE_PHB_DEAD);
+ }
+
+ eeh_serialize_unlock(flags);
+
+ break;
+ case EEH_NEXT_ERR_FROZEN_PE:
+ case EEH_NEXT_ERR_FENCED_PHB:
+ case EEH_NEXT_ERR_DEAD_PHB:
+ /* Mark the PE in fenced state */
+ eeh_serialize_lock(&flags);
+
+ /* Purge all events of the PHB */
+ eeh_remove_event(pe);
+
+ if (rc == EEH_NEXT_ERR_DEAD_PHB)
+ eeh_pe_state_mark(pe,
+ EEH_PE_ISOLATED | EEH_PE_PHB_DEAD);
+ else
+ eeh_pe_state_mark(pe,
+ EEH_PE_ISOLATED | EEH_PE_RECOVERING);
+
+ eeh_serialize_unlock(flags);
+
+ break;
+ case EEH_NEXT_ERR_NONE:
+ return;
+ default:
+ pr_warn("%s: Invalid value %d from next_error()\n",
+ __func__, rc);
+ return;
}
- eeh_serialize_unlock(flags);
-
- /* Purge all events */
- eeh_remove_event(NULL);
- break;
- case 3:
- case 2:
- case 1:
- /* Mark the PE in fenced state */
- eeh_serialize_lock(&flags);
- if (rc == 3)
- eeh_pe_state_mark(pe,
- EEH_PE_ISOLATED | EEH_PE_PHB_DEAD);
- else
- eeh_pe_state_mark(pe,
- EEH_PE_ISOLATED | EEH_PE_RECOVERING);
- eeh_serialize_unlock(flags);
-
- /* Purge all events of the PHB */
- eeh_remove_event(pe);
- break;
- default:
- pr_err("%s: Invalid value %d from next_error()\n",
- __func__, rc);
- return;
- }
- /*
- * For fenced PHB and frozen PE, it's handled as normal
- * event. We have to remove the affected PHBs for dead
- * PHB and IOC
- */
- if (rc == 2 || rc == 1)
- eeh_handle_normal_event(pe);
- else {
- list_for_each_entry_safe(hose, tmp,
- &hose_list, list_node) {
- phb_pe = eeh_phb_pe_get(hose);
- if (!phb_pe || !(phb_pe->state & EEH_PE_PHB_DEAD))
- continue;
-
- bus = eeh_pe_bus_get(phb_pe);
- /* Notify all devices that they're about to go down. */
- eeh_pe_dev_traverse(pe, eeh_report_failure, NULL);
- pcibios_remove_pci_devices(bus);
+ /*
+ * For fenced PHB and frozen PE, it's handled as normal
+ * event. We have to remove the affected PHBs for dead
+ * PHB and IOC
+ */
+ if (rc == EEH_NEXT_ERR_FROZEN_PE ||
+ rc == EEH_NEXT_ERR_FENCED_PHB) {
+ eeh_handle_normal_event(pe);
+ } else {
+ list_for_each_entry(hose, &hose_list, list_node) {
+ phb_pe = eeh_phb_pe_get(hose);
+ if (!phb_pe ||
+ !(phb_pe->state & EEH_PE_PHB_DEAD))
+ continue;
+
+ /* Notify all devices to be down */
+ bus = eeh_pe_bus_get(phb_pe);
+ eeh_pe_dev_traverse(pe,
+ eeh_report_failure, NULL);
+ pcibios_remove_pci_devices(bus);
+ }
}
- }
+
+ /*
+ * If we have detected dead IOC, we needn't proceed
+ * any more since all PHBs would have been removed
+ */
+ if (rc == EEH_NEXT_ERR_DEAD_IOC)
+ break;
+ } while (rc != EEH_NEXT_ERR_NONE);
}
/**
diff --git a/arch/powerpc/platforms/powernv/eeh-ioda.c b/arch/powerpc/platforms/powernv/eeh-ioda.c
index 007ac4989841..8dd16f4675a2 100644
--- a/arch/powerpc/platforms/powernv/eeh-ioda.c
+++ b/arch/powerpc/platforms/powernv/eeh-ioda.c
@@ -735,12 +735,12 @@ static int ioda_eeh_get_pe(struct pci_controller *hose,
*/
static int ioda_eeh_next_error(struct eeh_pe **pe)
{
- struct pci_controller *hose, *tmp;
+ struct pci_controller *hose;
struct pnv_phb *phb;
u64 frozen_pe_no;
u16 err_type, severity;
long rc;
- int ret = 1;
+ int ret = EEH_NEXT_ERR_NONE;
/*
* While running here, it's safe to purge the event queue.
@@ -750,7 +750,7 @@ static int ioda_eeh_next_error(struct eeh_pe **pe)
eeh_remove_event(NULL);
opal_notifier_update_evt(OPAL_EVENT_PCI_ERROR, 0x0ul);
- list_for_each_entry_safe(hose, tmp, &hose_list, list_node) {
+ list_for_each_entry(hose, &hose_list, list_node) {
/*
* If the subordinate PCI buses of the PHB has been
* removed, we needn't take care of it any more.
@@ -789,19 +789,19 @@ static int ioda_eeh_next_error(struct eeh_pe **pe)
switch (err_type) {
case OPAL_EEH_IOC_ERROR:
if (severity == OPAL_EEH_SEV_IOC_DEAD) {
- list_for_each_entry_safe(hose, tmp,
- &hose_list, list_node) {
+ list_for_each_entry(hose, &hose_list,
+ list_node) {
phb = hose->private_data;
phb->eeh_state |= PNV_EEH_STATE_REMOVED;
}
pr_err("EEH: dead IOC detected\n");
- ret = 4;
- goto out;
+ ret = EEH_NEXT_ERR_DEAD_IOC;
} else if (severity == OPAL_EEH_SEV_INF) {
pr_info("EEH: IOC informative error "
"detected\n");
ioda_eeh_hub_diag(hose);
+ ret = EEH_NEXT_ERR_NONE;
}
break;
@@ -813,21 +813,20 @@ static int ioda_eeh_next_error(struct eeh_pe **pe)
pr_err("EEH: dead PHB#%x detected\n",
hose->global_number);
phb->eeh_state |= PNV_EEH_STATE_REMOVED;
- ret = 3;
- goto out;
+ ret = EEH_NEXT_ERR_DEAD_PHB;
} else if (severity == OPAL_EEH_SEV_PHB_FENCED) {
if (ioda_eeh_get_phb_pe(hose, pe))
break;
pr_err("EEH: fenced PHB#%x detected\n",
hose->global_number);
- ret = 2;
- goto out;
+ ret = EEH_NEXT_ERR_FENCED_PHB;
} else if (severity == OPAL_EEH_SEV_INF) {
pr_info("EEH: PHB#%x informative error "
"detected\n",
hose->global_number);
ioda_eeh_phb_diag(hose);
+ ret = EEH_NEXT_ERR_NONE;
}
break;
@@ -837,13 +836,23 @@ static int ioda_eeh_next_error(struct eeh_pe **pe)
pr_err("EEH: Frozen PE#%x on PHB#%x detected\n",
(*pe)->addr, (*pe)->phb->global_number);
- ret = 1;
- goto out;
+ ret = EEH_NEXT_ERR_FROZEN_PE;
+ break;
+ default:
+ pr_warn("%s: Unexpected error type %d\n",
+ __func__, err_type);
}
+
+ /*
+ * If we have no errors on the specific PHB or only
+ * informative error there, we continue poking it.
+ * Otherwise, we need actions to be taken by upper
+ * layer.
+ */
+ if (ret > EEH_NEXT_ERR_INF)
+ break;
}
- ret = 0;
-out:
return ret;
}