diff options
author | Jon Mason <mason@myri.com> | 2011-06-27 17:57:28 +0000 |
---|---|---|
committer | David S. Miller <davem@davemloft.net> | 2011-06-29 06:02:04 -0700 |
commit | c689b81b4267b1335b11f18fe8a79c56880d9d43 (patch) | |
tree | ccfba7f4a8003a63194ab4eca263e6574b80f762 | |
parent | 7539a613c646f9e870bbedfa753a54cf13b98d22 (diff) | |
download | linux-c689b81b4267b1335b11f18fe8a79c56880d9d43.tar.bz2 |
myri10ge: rework parity error check and cleanup
Clean up watchdog reset code:
- move code that checks for stuck slice to a common routine
- unless there is a confirmed h/w fault, verify that a stuck
slice is still stuck in the watchdog worker; if the slice is no
longer stuck, abort the reset.
- this removes an egregious 2000ms pause in the watchdog worker that
was a diagnostic aid (to look for spurious resets) the snuck into
production code.
v3 includes corrections from Joe Perches
Signed-off-by: Jon Mason <mason@myri.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r-- | drivers/net/myri10ge/myri10ge.c | 100 |
1 files changed, 60 insertions, 40 deletions
diff --git a/drivers/net/myri10ge/myri10ge.c b/drivers/net/myri10ge/myri10ge.c index 0f0f83d50ddc..ca0345795fa6 100644 --- a/drivers/net/myri10ge/myri10ge.c +++ b/drivers/net/myri10ge/myri10ge.c @@ -193,6 +193,7 @@ struct myri10ge_slice_state { int watchdog_tx_done; int watchdog_tx_req; int watchdog_rx_done; + int stuck; #ifdef CONFIG_MYRI10GE_DCA int cached_dca_tag; int cpu; @@ -3442,6 +3443,42 @@ static u32 myri10ge_read_reboot(struct myri10ge_priv *mgp) return reboot; } +static void +myri10ge_check_slice(struct myri10ge_slice_state *ss, int *reset_needed, + int *busy_slice_cnt, u32 rx_pause_cnt) +{ + struct myri10ge_priv *mgp = ss->mgp; + int slice = ss - mgp->ss; + + if (ss->tx.req != ss->tx.done && + ss->tx.done == ss->watchdog_tx_done && + ss->watchdog_tx_req != ss->watchdog_tx_done) { + /* nic seems like it might be stuck.. */ + if (rx_pause_cnt != mgp->watchdog_pause) { + if (net_ratelimit()) + netdev_warn(mgp->dev, "slice %d: TX paused, " + "check link partner\n", slice); + } else { + netdev_warn(mgp->dev, + "slice %d: TX stuck %d %d %d %d %d %d\n", + slice, ss->tx.queue_active, ss->tx.req, + ss->tx.done, ss->tx.pkt_start, + ss->tx.pkt_done, + (int)ntohl(mgp->ss[slice].fw_stats-> + send_done_count)); + *reset_needed = 1; + ss->stuck = 1; + } + } + if (ss->watchdog_tx_done != ss->tx.done || + ss->watchdog_rx_done != ss->rx_done.cnt) { + *busy_slice_cnt += 1; + } + ss->watchdog_tx_done = ss->tx.done; + ss->watchdog_tx_req = ss->tx.req; + ss->watchdog_rx_done = ss->rx_done.cnt; +} + /* * This watchdog is used to check whether the board has suffered * from a parity error and needs to be recovered. @@ -3450,10 +3487,12 @@ static void myri10ge_watchdog(struct work_struct *work) { struct myri10ge_priv *mgp = container_of(work, struct myri10ge_priv, watchdog_work); - struct myri10ge_tx_buf *tx; - u32 reboot; + struct myri10ge_slice_state *ss; + u32 reboot, rx_pause_cnt; int status, rebooted; int i; + int reset_needed = 0; + int busy_slice_cnt = 0; u16 cmd, vendor; mgp->watchdog_resets++; @@ -3465,8 +3504,7 @@ static void myri10ge_watchdog(struct work_struct *work) * For now, just report it */ reboot = myri10ge_read_reboot(mgp); netdev_err(mgp->dev, "NIC rebooted (0x%x),%s resetting\n", - reboot, - myri10ge_reset_recover ? "" : " not"); + reboot, myri10ge_reset_recover ? "" : " not"); if (myri10ge_reset_recover == 0) return; rtnl_lock(); @@ -3498,23 +3536,24 @@ static void myri10ge_watchdog(struct work_struct *work) return; } } - /* Perhaps it is a software error. Try to reset */ - - netdev_err(mgp->dev, "device timeout, resetting\n"); + /* Perhaps it is a software error. See if stuck slice + * has recovered, reset if not */ + rx_pause_cnt = ntohl(mgp->ss[0].fw_stats->dropped_pause); for (i = 0; i < mgp->num_slices; i++) { - tx = &mgp->ss[i].tx; - netdev_err(mgp->dev, "(%d): %d %d %d %d %d %d\n", - i, tx->queue_active, tx->req, - tx->done, tx->pkt_start, tx->pkt_done, - (int)ntohl(mgp->ss[i].fw_stats-> - send_done_count)); - msleep(2000); - netdev_info(mgp->dev, "(%d): %d %d %d %d %d %d\n", - i, tx->queue_active, tx->req, - tx->done, tx->pkt_start, tx->pkt_done, - (int)ntohl(mgp->ss[i].fw_stats-> - send_done_count)); + ss = mgp->ss; + if (ss->stuck) { + myri10ge_check_slice(ss, &reset_needed, + &busy_slice_cnt, + rx_pause_cnt); + ss->stuck = 0; + } } + if (!reset_needed) { + netdev_dbg(mgp->dev, "not resetting\n"); + return; + } + + netdev_err(mgp->dev, "device timeout, resetting\n"); } if (!rebooted) { @@ -3567,27 +3606,8 @@ static void myri10ge_watchdog_timer(unsigned long arg) myri10ge_fill_thresh) ss->rx_big.watchdog_needed = 0; } - - if (ss->tx.req != ss->tx.done && - ss->tx.done == ss->watchdog_tx_done && - ss->watchdog_tx_req != ss->watchdog_tx_done) { - /* nic seems like it might be stuck.. */ - if (rx_pause_cnt != mgp->watchdog_pause) { - if (net_ratelimit()) - netdev_err(mgp->dev, "slice %d: TX paused, check link partner\n", - i); - } else { - netdev_warn(mgp->dev, "slice %d stuck:", i); - reset_needed = 1; - } - } - if (ss->watchdog_tx_done != ss->tx.done || - ss->watchdog_rx_done != ss->rx_done.cnt) { - busy_slice_cnt++; - } - ss->watchdog_tx_done = ss->tx.done; - ss->watchdog_tx_req = ss->tx.req; - ss->watchdog_rx_done = ss->rx_done.cnt; + myri10ge_check_slice(ss, &reset_needed, &busy_slice_cnt, + rx_pause_cnt); } /* if we've sent or received no traffic, poll the NIC to * ensure it is still there. Otherwise, we risk not noticing |