From 6c74340bce253ea95c9ee801b3c411a333937edf Mon Sep 17 00:00:00 2001 From: Stefan Richter Date: Mon, 16 Aug 2010 21:58:03 +0200 Subject: firewire: sbp2: fix memory leak in sbp2_cancel_orbs or at send error When an ORB was canceled (Command ORB i.e. SCSI request timed out, or Management ORB timed out), or there was a send error in the initial transaction, we missed to drop one of the ORB's references and thus leaked memory. Background: In total, we hold 3 references to each Operation Request Block: - 1 during sbp2_scsi_queuecommand() or sbp2_send_management_orb() respectively, - 1 for the duration of the write transaction to the ORB_Pointer or Management_Agent register of the target, - 1 for as long as the ORB stays within the lu->orb_list, until the ORB is unlinked from the list and the orb->callback was executed. The latter one of these 3 references is finished - normally by sbp2_status_write() when the target wrote status for a pending ORB, - or by sbp2_cancel_orbs() in case of an ORB time-out, - or by complete_transaction() in case of a send error. Of them, the latter two lacked the kref_put. Add the missing kref_put()s. Add comments to the gets and puts of references for transaction callbacks and ORB callbacks so that it is easier to see what is supposed to happen. Signed-off-by: Stefan Richter --- drivers/firewire/sbp2.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'drivers/firewire/sbp2.c') diff --git a/drivers/firewire/sbp2.c b/drivers/firewire/sbp2.c index 9f76171717e5..e6cbe491f7ee 100644 --- a/drivers/firewire/sbp2.c +++ b/drivers/firewire/sbp2.c @@ -450,7 +450,7 @@ static void sbp2_status_write(struct fw_card *card, struct fw_request *request, if (&orb->link != &lu->orb_list) { orb->callback(orb, &status); - kref_put(&orb->kref, free_orb); + kref_put(&orb->kref, free_orb); /* orb callback reference */ } else { fw_error("status write for unknown orb\n"); } @@ -480,12 +480,14 @@ static void complete_transaction(struct fw_card *card, int rcode, if (orb->rcode != RCODE_COMPLETE) { list_del(&orb->link); spin_unlock_irqrestore(&card->lock, flags); + orb->callback(orb, NULL); + kref_put(&orb->kref, free_orb); /* orb callback reference */ } else { spin_unlock_irqrestore(&card->lock, flags); } - kref_put(&orb->kref, free_orb); + kref_put(&orb->kref, free_orb); /* transaction callback reference */ } static void sbp2_send_orb(struct sbp2_orb *orb, struct sbp2_logical_unit *lu, @@ -501,9 +503,8 @@ static void sbp2_send_orb(struct sbp2_orb *orb, struct sbp2_logical_unit *lu, list_add_tail(&orb->link, &lu->orb_list); spin_unlock_irqrestore(&device->card->lock, flags); - /* Take a ref for the orb list and for the transaction callback. */ - kref_get(&orb->kref); - kref_get(&orb->kref); + kref_get(&orb->kref); /* transaction callback reference */ + kref_get(&orb->kref); /* orb callback reference */ fw_send_request(device->card, &orb->t, TCODE_WRITE_BLOCK_REQUEST, node_id, generation, device->max_speed, offset, @@ -530,6 +531,7 @@ static int sbp2_cancel_orbs(struct sbp2_logical_unit *lu) orb->rcode = RCODE_CANCELLED; orb->callback(orb, NULL); + kref_put(&orb->kref, free_orb); /* orb callback reference */ } return retval; -- cgit v1.2.3 From a481e97d3cdc40b9d58271675bd4f0abb79d4872 Mon Sep 17 00:00:00 2001 From: Stefan Richter Date: Mon, 16 Aug 2010 22:13:34 +0200 Subject: firewire: sbp2: fix stall with "Unsolicited response" Fix I/O stalls with some 4-bay RAID enclosures which are based on OXUF936QSE: - Onnto dataTale RSM4QO, old firmware (not anymore with current firmware), - inXtron Hydra Super-S LCM, old as well as current firmware when used in RAID-5 mode, perhaps also in other RAID modes. The stalls happen during heavy or moderate disk traffic in periods that are a multiple of 5 minutes, roughly twice per hour. They are caused by the target responding too late to an ORB_Pointer register write: The target responds after Split_Timeout, hence firewire-core cancels the transaction, and firewire-sbp2 fails the SCSI request. The SCSI core retries the request, that fails again (and again), hence SCSI core calls firewire-sbp2's abort handler (and even the Management_Agent register write in the abort handler has the transaction timeout problem). During all that, the process which issued the I/O is stalled in I/O wait state. Meanwhile, the target actually acts on the first failed SCSI request: It responds to the ORB_Pointer write later (seen in the kernel log as "firewire_core: Unsolicited response") and also finishes the SCSI request with proper status (seen in the kernel log as "firewire_sbp2: status write for unknown orb"). So let's just ignore RCODE_CANCELLED in the transaction callback and wait for the target to complete the ORB nevertheless. This requires a small modification is sbp2_cancel_orbs(); it now needs to call orb->callback() regardless whether fw_cancel_transaction() found the transaction unfinished or finished. A different solution is to increase Split_Timeout on the local node. (Tested: 2000ms timeout; maybe 1000ms or something like that works too. 200ms is insufficient. Standard is 100ms.) However, I rather not do this because any software on any node could change the Split_Timeout to something unsuitable. Or such a large Split_Timeout may be undesirable for other purposes. Signed-off-by: Stefan Richter --- drivers/firewire/sbp2.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'drivers/firewire/sbp2.c') diff --git a/drivers/firewire/sbp2.c b/drivers/firewire/sbp2.c index e6cbe491f7ee..bfae4b309791 100644 --- a/drivers/firewire/sbp2.c +++ b/drivers/firewire/sbp2.c @@ -472,12 +472,18 @@ static void complete_transaction(struct fw_card *card, int rcode, * So this callback only sets the rcode if it hasn't already * been set and only does the cleanup if the transaction * failed and we didn't already get a status write. + * + * Here we treat RCODE_CANCELLED like RCODE_COMPLETE because some + * OXUF936QSE firmwares occasionally respond after Split_Timeout and + * complete the ORB just fine. Note, we also get RCODE_CANCELLED + * from sbp2_cancel_orbs() if fw_cancel_transaction() == 0. */ spin_lock_irqsave(&card->lock, flags); if (orb->rcode == -1) orb->rcode = rcode; - if (orb->rcode != RCODE_COMPLETE) { + + if (orb->rcode != RCODE_COMPLETE && orb->rcode != RCODE_CANCELLED) { list_del(&orb->link); spin_unlock_irqrestore(&card->lock, flags); @@ -526,8 +532,7 @@ static int sbp2_cancel_orbs(struct sbp2_logical_unit *lu) list_for_each_entry_safe(orb, next, &list, link) { retval = 0; - if (fw_cancel_transaction(device->card, &orb->t) == 0) - continue; + fw_cancel_transaction(device->card, &orb->t); orb->rcode = RCODE_CANCELLED; orb->callback(orb, NULL); -- cgit v1.2.3