From 3d5b06051cd5fa82c9a4285f7ce8650a0f0845ff Mon Sep 17 00:00:00 2001 From: Ed Cashin Date: Thu, 4 Oct 2012 17:16:20 -0700 Subject: aoe: for performance support larger packet payloads This patch adds the ability to work with large packets composed of a number of segments, using the scatter gather feature of the block layer (biovecs) and the network layer (skb frag array). The motivation is the performance gained by using a packet data payload greater than a page size and by using the network card's scatter gather feature. Users of the out-of-tree aoe driver already had these changes, but since early 2011, they have complained of increased memory utilization and higher CPU utilization during heavy writes.[1] The commit below appears related, as it disables scatter gather on non-IP protocols inside the harmonize_features function, even when the NIC supports sg. commit f01a5236bd4b140198fbcc550f085e8361fd73fa Author: Jesse Gross Date: Sun Jan 9 06:23:31 2011 +0000 net offloading: Generalize netif_get_vlan_features(). With that regression in place, transmits always linearize sg AoE packets, but in-kernel users did not have this patch. Before 2.6.38, though, these changes were working to allow sg to increase performance. 1. 
http://www.spinics.net/lists/linux-mm/msg15184.html Signed-off-by: Ed Cashin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/aoe/aoe.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'drivers/block/aoe/aoe.h') diff --git a/drivers/block/aoe/aoe.h b/drivers/block/aoe/aoe.h index db195abad698..8ca8c8a929ae 100644 --- a/drivers/block/aoe/aoe.h +++ b/drivers/block/aoe/aoe.h @@ -119,6 +119,8 @@ struct frame { ulong bcnt; sector_t lba; struct sk_buff *skb; + struct bio_vec *bv; + ulong bv_off; }; struct aoeif { -- cgit v1.2.3 From 896831f5909e2733c13c9cb13a1a215f10c3eaa8 Mon Sep 17 00:00:00 2001 From: Ed Cashin Date: Thu, 4 Oct 2012 17:16:21 -0700 Subject: aoe: kernel thread handles I/O completions for simple locking Make the frames the aoe driver uses to track the relationship between bios and packets more flexible and detached, so that they can be passed to an "aoe_ktio" thread for completion of I/O. The frames are handled much like skbs, with a capped amount of preallocation so that real-world use cases are likely to run smoothly and degenerate gracefully even under memory pressure. Decoupling I/O completion from the receive path and serializing it in a process makes it easier to think about the correctness of the locking in the driver, especially in the case of a remote MAC address becoming unusable. 
[dan.carpenter@oracle.com: cleanup an allocation a bit] Signed-off-by: Ed Cashin Signed-off-by: Dan Carpenter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/aoe/aoe.h | 33 +- drivers/block/aoe/aoechr.c | 3 +- drivers/block/aoe/aoecmd.c | 732 ++++++++++++++++++++++++++++---------------- drivers/block/aoe/aoedev.c | 84 +++-- drivers/block/aoe/aoemain.c | 8 +- drivers/block/aoe/aoenet.c | 6 +- 6 files changed, 560 insertions(+), 306 deletions(-) (limited to 'drivers/block/aoe/aoe.h') diff --git a/drivers/block/aoe/aoe.h b/drivers/block/aoe/aoe.h index 8ca8c8a929ae..0cd6c0f7a535 100644 --- a/drivers/block/aoe/aoe.h +++ b/drivers/block/aoe/aoe.h @@ -91,6 +91,7 @@ enum { NTARGETS = 8, NAOEIFS = 8, NSKBPOOLMAX = 128, + NFACTIVE = 17, TIMERTICK = HZ / 10, MINTIMER = HZ >> 2, @@ -112,13 +113,16 @@ struct buf { }; struct frame { - int tag; + struct list_head head; + u32 tag; ulong waited; struct buf *buf; + struct aoetgt *t; /* parent target I belong to */ char *bufaddr; ulong bcnt; sector_t lba; - struct sk_buff *skb; + struct sk_buff *skb; /* command skb freed on module exit */ + struct sk_buff *r_skb; /* response skb for async processing */ struct bio_vec *bv; ulong bv_off; }; @@ -133,16 +137,18 @@ struct aoeif { struct aoetgt { unsigned char addr[6]; ushort nframes; - struct frame *frames; + struct aoedev *d; /* parent device I belong to */ + struct list_head factive[NFACTIVE]; /* hash of active frames */ + struct list_head ffree; /* list of free frames */ struct aoeif ifs[NAOEIFS]; struct aoeif *ifp; /* current aoeif in use */ ushort nout; ushort maxout; u16 lasttag; /* last tag sent */ u16 useme; + ulong falloc; ulong lastwadj; /* last window adjustment */ int wpkts, rpkts; - int dataref; }; struct aoedev { @@ -169,9 +175,20 @@ struct aoedev { struct buf *inprocess; /* the one we're currently working on */ struct aoetgt *targets[NTARGETS]; struct aoetgt **tgt; /* target in use when working */ - struct aoetgt **htgt; /* target needing 
rexmit assistance */ + struct aoetgt *htgt; /* target needing rexmit assistance */ + ulong ntargets; + ulong kicked; }; +/* kthread tracking */ +struct ktstate { + struct completion rendez; + struct task_struct *task; + wait_queue_head_t *waitq; + int (*fn) (void); + char *name; + spinlock_t *lock; +}; int aoeblk_init(void); void aoeblk_exit(void); @@ -184,11 +201,14 @@ void aoechr_error(char *); void aoecmd_work(struct aoedev *d); void aoecmd_cfg(ushort aoemajor, unsigned char aoeminor); -void aoecmd_ata_rsp(struct sk_buff *); +struct sk_buff *aoecmd_ata_rsp(struct sk_buff *); void aoecmd_cfg_rsp(struct sk_buff *); void aoecmd_sleepwork(struct work_struct *); void aoecmd_cleanslate(struct aoedev *); +void aoecmd_exit(void); +int aoecmd_init(void); struct sk_buff *aoecmd_ata_id(struct aoedev *); +void aoe_freetframe(struct frame *); int aoedev_init(void); void aoedev_exit(void); @@ -196,6 +216,7 @@ struct aoedev *aoedev_by_aoeaddr(int maj, int min); struct aoedev *aoedev_by_sysminor_m(ulong sysminor); void aoedev_downdev(struct aoedev *d); int aoedev_flush(const char __user *str, size_t size); +void aoe_failbuf(struct aoedev *d, struct buf *buf); int aoenet_init(void); void aoenet_exit(void); diff --git a/drivers/block/aoe/aoechr.c b/drivers/block/aoe/aoechr.c index e86d2062a164..f145388cb94a 100644 --- a/drivers/block/aoe/aoechr.c +++ b/drivers/block/aoe/aoechr.c @@ -86,10 +86,9 @@ revalidate(const char __user *str, size_t size) if (copy_from_user(buf, str, size)) return -EFAULT; - /* should be e%d.%d format */ n = sscanf(buf, "e%d.%d", &major, &minor); if (n != 2) { - printk(KERN_ERR "aoe: invalid device specification\n"); + pr_err("aoe: invalid device specification %s\n", buf); return -EINVAL; } d = aoedev_by_aoeaddr(major, minor); diff --git a/drivers/block/aoe/aoecmd.c b/drivers/block/aoe/aoecmd.c index 9a58242290c0..59b333c902a6 100644 --- a/drivers/block/aoe/aoecmd.c +++ b/drivers/block/aoe/aoecmd.c @@ -12,10 +12,17 @@ #include #include #include +#include 
+#include #include #include +#include #include "aoe.h" +#define MAXIOC (8192) /* default meant to avoid most soft lockups */ + +static void ktcomplete(struct frame *, struct sk_buff *); + static int aoe_deadsecs = 60 * 3; module_param(aoe_deadsecs, int, 0644); MODULE_PARM_DESC(aoe_deadsecs, "After aoe_deadsecs seconds, give up and fail dev."); @@ -25,6 +32,15 @@ module_param(aoe_maxout, int, 0644); MODULE_PARM_DESC(aoe_maxout, "Only aoe_maxout outstanding packets for every MAC on eX.Y."); +static wait_queue_head_t ktiowq; +static struct ktstate kts; + +/* io completion queue */ +static struct { + struct list_head head; + spinlock_t lock; +} iocq; + static struct sk_buff * new_skb(ulong len) { @@ -41,15 +57,21 @@ new_skb(ulong len) } static struct frame * -getframe(struct aoetgt *t, int tag) +getframe(struct aoetgt *t, u32 tag) { - struct frame *f, *e; + struct frame *f; + struct list_head *head, *pos, *nx; + u32 n; - f = t->frames; - e = f + t->nframes; - for (; ftag == tag) + n = tag % NFACTIVE; + head = &t->factive[n]; + list_for_each_safe(pos, nx, head) { + f = list_entry(pos, struct frame, head); + if (f->tag == tag) { + list_del(pos); return f; + } + } return NULL; } @@ -67,7 +89,7 @@ newtag(struct aoetgt *t) return n |= (++t->lasttag & 0x7fff) << 16; } -static int +static u32 aoehdr_atainit(struct aoedev *d, struct aoetgt *t, struct aoe_hdr *h) { u32 host_tag = newtag(t); @@ -129,75 +151,96 @@ skb_pool_get(struct aoedev *d) return NULL; } -/* freeframe is where we do our load balancing so it's a little hairy. 
*/ +void +aoe_freetframe(struct frame *f) +{ + struct aoetgt *t; + + t = f->t; + f->buf = NULL; + f->bv = NULL; + f->r_skb = NULL; + list_add(&f->head, &t->ffree); +} + static struct frame * -freeframe(struct aoedev *d) +newtframe(struct aoedev *d, struct aoetgt *t) { - struct frame *f, *e, *rf; - struct aoetgt **t; + struct frame *f; struct sk_buff *skb; + struct list_head *pos; + + if (list_empty(&t->ffree)) { + if (t->falloc >= NSKBPOOLMAX*2) + return NULL; + f = kcalloc(1, sizeof(*f), GFP_ATOMIC); + if (f == NULL) + return NULL; + t->falloc++; + f->t = t; + } else { + pos = t->ffree.next; + list_del(pos); + f = list_entry(pos, struct frame, head); + } + + skb = f->skb; + if (skb == NULL) { + f->skb = skb = new_skb(ETH_ZLEN); + if (!skb) { +bail: aoe_freetframe(f); + return NULL; + } + } + + if (atomic_read(&skb_shinfo(skb)->dataref) != 1) { + skb = skb_pool_get(d); + if (skb == NULL) + goto bail; + skb_pool_put(d, f->skb); + f->skb = skb; + } + + skb->truesize -= skb->data_len; + skb_shinfo(skb)->nr_frags = skb->data_len = 0; + skb_trim(skb, 0); + return f; +} + +static struct frame * +newframe(struct aoedev *d) +{ + struct frame *f; + struct aoetgt *t, **tt; + int totout = 0; if (d->targets[0] == NULL) { /* shouldn't happen, but I'm paranoid */ printk(KERN_ERR "aoe: NULL TARGETS!\n"); return NULL; } - t = d->tgt; - t++; - if (t >= &d->targets[NTARGETS] || !*t) - t = d->targets; + tt = d->tgt; /* last used target */ for (;;) { - if ((*t)->nout < (*t)->maxout + tt++; + if (tt >= &d->targets[NTARGETS] || !*tt) + tt = d->targets; + t = *tt; + totout += t->nout; + if (t->nout < t->maxout && t != d->htgt - && (*t)->ifp->nd) { - rf = NULL; - f = (*t)->frames; - e = f + (*t)->nframes; - for (; f < e; f++) { - if (f->tag != FREETAG) - continue; - skb = f->skb; - if (!skb - && !(f->skb = skb = new_skb(ETH_ZLEN))) - continue; - if (atomic_read(&skb_shinfo(skb)->dataref) - != 1) { - if (!rf) - rf = f; - continue; - } -gotone: skb->truesize -= skb->data_len; - 
skb_shinfo(skb)->nr_frags = skb->data_len = 0; - skb_trim(skb, 0); - d->tgt = t; - ifrotate(*t); + && t->ifp->nd) { + f = newtframe(d, t); + if (f) { + d->tgt = tt; + ifrotate(t); return f; } - /* Work can be done, but the network layer is - holding our precious packets. Try to grab - one from the pool. */ - f = rf; - if (f == NULL) { /* more paranoia */ - printk(KERN_ERR - "aoe: freeframe: %s.\n", - "unexpected null rf"); - d->flags |= DEVFL_KICKME; - return NULL; - } - skb = skb_pool_get(d); - if (skb) { - skb_pool_put(d, f->skb); - f->skb = skb; - goto gotone; - } - (*t)->dataref++; - if ((*t)->nout == 0) - d->flags |= DEVFL_KICKME; } - if (t == d->tgt) /* we've looped and found nada */ + if (tt == d->tgt) /* we've looped and found nada */ break; - t++; - if (t >= &d->targets[NTARGETS] || !*t) - t = d->targets; + } + if (totout == 0) { + d->kicked++; + d->flags |= DEVFL_KICKME; } return NULL; } @@ -220,6 +263,16 @@ loop: goto loop; } +static void +fhash(struct frame *f) +{ + struct aoetgt *t = f->t; + u32 n; + + n = f->tag % NFACTIVE; + list_add_tail(&f->head, &t->factive[n]); +} + static int aoecmd_ata_rw(struct aoedev *d) { @@ -236,7 +289,7 @@ aoecmd_ata_rw(struct aoedev *d) writebit = 0x10; extbit = 0x4; - f = freeframe(d); + f = newframe(d); if (f == NULL) return 0; t = *d->tgt; @@ -274,6 +327,7 @@ aoecmd_ata_rw(struct aoedev *d) skb_put(skb, sizeof *h + sizeof *ah); memset(h, 0, skb->len); f->tag = aoehdr_atainit(d, t, h); + fhash(f); t->nout++; f->waited = 0; f->buf = buf; @@ -358,14 +412,16 @@ cont: } static void -resend(struct aoedev *d, struct aoetgt *t, struct frame *f) +resend(struct aoedev *d, struct frame *f) { struct sk_buff *skb; struct aoe_hdr *h; struct aoe_atahdr *ah; + struct aoetgt *t; char buf[128]; u32 n; + t = f->t; ifrotate(t); n = newtag(t); skb = f->skb; @@ -379,28 +435,11 @@ resend(struct aoedev *d, struct aoetgt *t, struct frame *f) aoechr_error(buf); f->tag = n; + fhash(f); h->tag = cpu_to_be32(n); memcpy(h->dst, t->addr, sizeof 
h->dst); memcpy(h->src, t->ifp->nd->dev_addr, sizeof h->src); - switch (ah->cmdstat) { - default: - break; - case ATA_CMD_PIO_READ: - case ATA_CMD_PIO_READ_EXT: - case ATA_CMD_PIO_WRITE: - case ATA_CMD_PIO_WRITE_EXT: - put_lba(ah, f->lba); - - n = f->bcnt; - ah->scnt = n >> 9; - if (ah->aflags & AOEAFL_WRITE) { - skb_fillup(skb, f->bv, f->bv_off, n); - skb->len = sizeof *h + sizeof *ah + n; - skb->data_len = n; - skb->truesize += n; - } - } skb->dev = t->ifp->nd; skb = skb_clone(skb, GFP_ATOMIC); if (skb == NULL) @@ -409,7 +448,7 @@ resend(struct aoedev *d, struct aoetgt *t, struct frame *f) } static int -tsince(int tag) +tsince(u32 tag) { int n; @@ -463,26 +502,38 @@ ejectif(struct aoetgt *t, struct aoeif *ifp) static int sthtith(struct aoedev *d) { - struct frame *f, *e, *nf; + struct frame *f, *nf; + struct list_head *nx, *pos, *head; struct sk_buff *skb; - struct aoetgt *ht = *d->htgt; - - f = ht->frames; - e = f + ht->nframes; - for (; f < e; f++) { - if (f->tag == FREETAG) - continue; - nf = freeframe(d); - if (!nf) - return 0; - skb = nf->skb; - *nf = *f; - f->skb = skb; - f->tag = FREETAG; - nf->waited = 0; - ht->nout--; - (*d->tgt)->nout++; - resend(d, *d->tgt, nf); + struct aoetgt *ht = d->htgt; + int i; + + for (i = 0; i < NFACTIVE; i++) { + head = &ht->factive[i]; + list_for_each_safe(pos, nx, head) { + f = list_entry(pos, struct frame, head); + nf = newframe(d); + if (!nf) + return 0; + + /* remove frame from active list */ + list_del(pos); + + /* reassign all pertinent bits to new outbound frame */ + skb = nf->skb; + nf->skb = f->skb; + nf->buf = f->buf; + nf->bcnt = f->bcnt; + nf->lba = f->lba; + nf->bv = f->bv; + nf->bv_off = f->bv_off; + nf->waited = 0; + f->skb = skb; + aoe_freetframe(f); + ht->nout--; + nf->t->nout++; + resend(d, nf); + } } /* he's clean, he's useless. 
take away his interfaces */ memset(ht->ifs, 0, sizeof ht->ifs); @@ -507,9 +558,12 @@ rexmit_timer(ulong vp) struct aoedev *d; struct aoetgt *t, **tt, **te; struct aoeif *ifp; - struct frame *f, *e; + struct frame *f; + struct list_head *head, *pos, *nx; + LIST_HEAD(flist); register long timeout; ulong flags, n; + int i; d = (struct aoedev *) vp; @@ -523,41 +577,21 @@ rexmit_timer(ulong vp) spin_unlock_irqrestore(&d->lock, flags); return; } + + /* collect all frames to rexmit into flist */ tt = d->targets; te = tt + NTARGETS; for (; tt < te && *tt; tt++) { t = *tt; - f = t->frames; - e = f + t->nframes; - for (; f < e; f++) { - if (f->tag == FREETAG - || tsince(f->tag) < timeout) - continue; - n = f->waited += timeout; - n /= HZ; - if (n > aoe_deadsecs) { - /* waited too long. device failure. */ - aoedev_downdev(d); - break; - } - - if (n > HELPWAIT /* see if another target can help */ - && (tt != d->targets || d->targets[1])) - d->htgt = tt; - - if (t->nout == t->maxout) { - if (t->maxout > 1) - t->maxout--; - t->lastwadj = jiffies; - } - - ifp = getif(t, f->skb->dev); - if (ifp && ++ifp->lost > (t->nframes << 1) - && (ifp != t->ifs || t->ifs[1].nd)) { - ejectif(t, ifp); - ifp = NULL; + for (i = 0; i < NFACTIVE; i++) { + head = &t->factive[i]; + list_for_each_safe(pos, nx, head) { + f = list_entry(pos, struct frame, head); + if (tsince(f->tag) < timeout) + continue; + /* move to flist for later processing */ + list_move_tail(pos, &flist); } - resend(d, t, f); } /* window check */ @@ -569,6 +603,44 @@ rexmit_timer(ulong vp) } } + /* process expired frames */ + while (!list_empty(&flist)) { + pos = flist.next; + f = list_entry(pos, struct frame, head); + n = f->waited += timeout; + n /= HZ; + if (n > aoe_deadsecs) { + /* Waited too long. Device failure. + * Hang all frames on first hash bucket for downdev + * to clean up. 
+ */ + list_splice(&flist, &f->t->factive[0]); + aoedev_downdev(d); + break; + } + list_del(pos); + + t = f->t; + if (n > HELPWAIT) { + /* see if another target can help */ + if (d->ntargets > 1) + d->htgt = t; + } + if (t->nout == t->maxout) { + if (t->maxout > 1) + t->maxout--; + t->lastwadj = jiffies; + } + + ifp = getif(t, f->skb->dev); + if (ifp && ++ifp->lost > (t->nframes << 1) + && (ifp != t->ifs || t->ifs[1].nd)) { + ejectif(t, ifp); + ifp = NULL; + } + resend(d, f); + } + if (!skb_queue_empty(&d->sendq)) { n = d->rttavg <<= 1; if (n > MAXTIMER) @@ -750,7 +822,7 @@ diskstats(struct gendisk *disk, struct bio *bio, ulong duration, sector_t sector } static void -bvcpy(struct bio_vec *bv, ulong off, struct sk_buff *skb, ulong cnt) +bvcpy(struct bio_vec *bv, ulong off, struct sk_buff *skb, long cnt) { ulong fcnt; char *p; @@ -771,60 +843,225 @@ loop: } static void -fadvance(struct frame *f, ulong cnt) +ktiocomplete(struct frame *f) { - ulong fcnt; + struct aoe_hdr *hin, *hout; + struct aoe_atahdr *ahin, *ahout; + struct buf *buf; + struct sk_buff *skb; + struct aoetgt *t; + struct aoeif *ifp; + struct aoedev *d; + long n; - f->lba += cnt >> 9; -loop: - fcnt = f->bv->bv_len - (f->bv_off - f->bv->bv_offset); - if (fcnt > cnt) { - f->bv_off += cnt; + if (f == NULL) return; + + t = f->t; + d = t->d; + + hout = (struct aoe_hdr *) skb_mac_header(f->skb); + ahout = (struct aoe_atahdr *) (hout+1); + buf = f->buf; + skb = f->r_skb; + if (skb == NULL) + goto noskb; /* just fail the buf. 
*/ + + hin = (struct aoe_hdr *) skb->data; + skb_pull(skb, sizeof(*hin)); + ahin = (struct aoe_atahdr *) skb->data; + skb_pull(skb, sizeof(*ahin)); + if (ahin->cmdstat & 0xa9) { /* these bits cleared on success */ + pr_err("aoe: ata error cmd=%2.2Xh stat=%2.2Xh from e%ld.%d\n", + ahout->cmdstat, ahin->cmdstat, + d->aoemajor, d->aoeminor); +noskb: if (buf) + buf->flags |= BUFFL_FAIL; + goto badrsp; } - cnt -= fcnt; - f->bv++; - f->bv_off = f->bv->bv_offset; - goto loop; + + n = ahout->scnt << 9; + switch (ahout->cmdstat) { + case ATA_CMD_PIO_READ: + case ATA_CMD_PIO_READ_EXT: + if (skb->len < n) { + pr_err("aoe: runt data size in read. skb->len=%d need=%ld\n", + skb->len, n); + buf->flags |= BUFFL_FAIL; + break; + } + bvcpy(f->bv, f->bv_off, skb, n); + case ATA_CMD_PIO_WRITE: + case ATA_CMD_PIO_WRITE_EXT: + spin_lock_irq(&d->lock); + ifp = getif(t, skb->dev); + if (ifp) { + ifp->lost = 0; + if (n > DEFAULTBCNT) + ifp->lostjumbo = 0; + } + if (d->htgt == t) /* I'll help myself, thank you. */ + d->htgt = NULL; + spin_unlock_irq(&d->lock); + break; + case ATA_CMD_ID_ATA: + if (skb->len < 512) { + pr_info("aoe: runt data size in ataid. skb->len=%d\n", + skb->len); + break; + } + if (skb_linearize(skb)) + break; + spin_lock_irq(&d->lock); + ataid_complete(d, t, skb->data); + spin_unlock_irq(&d->lock); + break; + default: + pr_info("aoe: unrecognized ata command %2.2Xh for %d.%d\n", + ahout->cmdstat, + be16_to_cpu(get_unaligned(&hin->major)), + hin->minor); + } +badrsp: + spin_lock_irq(&d->lock); + + aoe_freetframe(f); + + if (buf && --buf->nframesout == 0 && buf->resid == 0) { + struct bio *bio = buf->bio; + + diskstats(d->gd, bio, jiffies - buf->stime, buf->sector); + n = (buf->flags & BUFFL_FAIL) ? -EIO : 0; + mempool_free(buf, d->bufpool); + spin_unlock_irq(&d->lock); + if (n != -EIO) + bio_flush_dcache_pages(buf->bio); + bio_endio(bio, n); + } else + spin_unlock_irq(&d->lock); + dev_kfree_skb(skb); } -void +/* Enters with iocq.lock held. 
+ * Returns true iff responses needing processing remain. + */ +static int +ktio(void) +{ + struct frame *f; + struct list_head *pos; + int i; + + for (i = 0; ; ++i) { + if (i == MAXIOC) + return 1; + if (list_empty(&iocq.head)) + return 0; + pos = iocq.head.next; + list_del(pos); + spin_unlock_irq(&iocq.lock); + f = list_entry(pos, struct frame, head); + ktiocomplete(f); + spin_lock_irq(&iocq.lock); + } +} + +static int +kthread(void *vp) +{ + struct ktstate *k; + DECLARE_WAITQUEUE(wait, current); + int more; + + k = vp; + current->flags |= PF_NOFREEZE; + set_user_nice(current, -10); + complete(&k->rendez); /* tell spawner we're running */ + do { + spin_lock_irq(k->lock); + more = k->fn(); + if (!more) { + add_wait_queue(k->waitq, &wait); + __set_current_state(TASK_INTERRUPTIBLE); + } + spin_unlock_irq(k->lock); + if (!more) { + schedule(); + remove_wait_queue(k->waitq, &wait); + } else + cond_resched(); + } while (!kthread_should_stop()); + complete(&k->rendez); /* tell spawner we're stopping */ + return 0; +} + +static void +aoe_ktstop(struct ktstate *k) +{ + kthread_stop(k->task); + wait_for_completion(&k->rendez); +} + +static int +aoe_ktstart(struct ktstate *k) +{ + struct task_struct *task; + + init_completion(&k->rendez); + task = kthread_run(kthread, k, k->name); + if (task == NULL || IS_ERR(task)) + return -ENOMEM; + k->task = task; + wait_for_completion(&k->rendez); /* allow kthread to start */ + init_completion(&k->rendez); /* for waiting for exit later */ + return 0; +} + +/* pass it off to kthreads for processing */ +static void +ktcomplete(struct frame *f, struct sk_buff *skb) +{ + ulong flags; + + f->r_skb = skb; + spin_lock_irqsave(&iocq.lock, flags); + list_add_tail(&f->head, &iocq.head); + spin_unlock_irqrestore(&iocq.lock, flags); + wake_up(&ktiowq); +} + +struct sk_buff * aoecmd_ata_rsp(struct sk_buff *skb) { - struct sk_buff_head queue; struct aoedev *d; - struct aoe_hdr *hin, *hout; - struct aoe_atahdr *ahin, *ahout; + struct aoe_hdr *h; 
struct frame *f; - struct buf *buf; struct aoetgt *t; - struct aoeif *ifp; - register long n; + u32 n; ulong flags; char ebuf[128]; u16 aoemajor; - hin = (struct aoe_hdr *) skb_mac_header(skb); - skb_pull(skb, sizeof(*hin)); - aoemajor = get_unaligned_be16(&hin->major); - d = aoedev_by_aoeaddr(aoemajor, hin->minor); + h = (struct aoe_hdr *) skb->data; + aoemajor = be16_to_cpu(get_unaligned(&h->major)); + d = aoedev_by_aoeaddr(aoemajor, h->minor); if (d == NULL) { snprintf(ebuf, sizeof ebuf, "aoecmd_ata_rsp: ata response " "for unknown device %d.%d\n", - aoemajor, hin->minor); + aoemajor, h->minor); aoechr_error(ebuf); - return; + return skb; } spin_lock_irqsave(&d->lock, flags); - n = get_unaligned_be32(&hin->tag); - t = gettgt(d, hin->src); + n = be32_to_cpu(get_unaligned(&h->tag)); + t = gettgt(d, h->src); if (t == NULL) { printk(KERN_INFO "aoe: can't find target e%ld.%d:%pm\n", - d->aoemajor, d->aoeminor, hin->src); + d->aoemajor, d->aoeminor, h->src); spin_unlock_irqrestore(&d->lock, flags); - return; + return skb; } f = getframe(t, n); if (f == NULL) { @@ -833,102 +1070,26 @@ aoecmd_ata_rsp(struct sk_buff *skb) snprintf(ebuf, sizeof ebuf, "%15s e%d.%d tag=%08x@%08lx\n", "unexpected rsp", - get_unaligned_be16(&hin->major), - hin->minor, - get_unaligned_be32(&hin->tag), + get_unaligned_be16(&h->major), + h->minor, + get_unaligned_be32(&h->tag), jiffies); aoechr_error(ebuf); - return; + return skb; } - calc_rttavg(d, tsince(f->tag)); - - ahin = (struct aoe_atahdr *) skb->data; - skb_pull(skb, sizeof(*ahin)); - hout = (struct aoe_hdr *) skb_mac_header(f->skb); - ahout = (struct aoe_atahdr *) (hout+1); - buf = f->buf; - - if (ahin->cmdstat & 0xa9) { /* these bits cleared on success */ - printk(KERN_ERR - "aoe: ata error cmd=%2.2Xh stat=%2.2Xh from e%ld.%d\n", - ahout->cmdstat, ahin->cmdstat, - d->aoemajor, d->aoeminor); - if (buf) - buf->flags |= BUFFL_FAIL; - } else { - if (d->htgt && t == *d->htgt) /* I'll help myself, thank you. 
*/ - d->htgt = NULL; - n = ahout->scnt << 9; - switch (ahout->cmdstat) { - case ATA_CMD_PIO_READ: - case ATA_CMD_PIO_READ_EXT: - if (skb->len < n) { - printk(KERN_ERR - "aoe: %s. skb->len=%d need=%ld\n", - "runt data size in read", skb->len, n); - /* fail frame f? just returning will rexmit. */ - spin_unlock_irqrestore(&d->lock, flags); - return; - } - bvcpy(f->bv, f->bv_off, skb, n); - case ATA_CMD_PIO_WRITE: - case ATA_CMD_PIO_WRITE_EXT: - ifp = getif(t, skb->dev); - if (ifp) { - ifp->lost = 0; - if (n > DEFAULTBCNT) - ifp->lostjumbo = 0; - } - if (f->bcnt -= n) { - fadvance(f, n); - resend(d, t, f); - goto xmit; - } - break; - case ATA_CMD_ID_ATA: - if (skb->len < 512) { - printk(KERN_INFO - "aoe: runt data size in ataid. skb->len=%d\n", - skb->len); - spin_unlock_irqrestore(&d->lock, flags); - return; - } - if (skb_linearize(skb)) - break; - ataid_complete(d, t, skb->data); - break; - default: - printk(KERN_INFO - "aoe: unrecognized ata command %2.2Xh for %d.%d\n", - ahout->cmdstat, - get_unaligned_be16(&hin->major), - hin->minor); - } - } - - if (buf && --buf->nframesout == 0 && buf->resid == 0) { - diskstats(d->gd, buf->bio, jiffies - buf->stime, buf->sector); - if (buf->flags & BUFFL_FAIL) - bio_endio(buf->bio, -EIO); - else { - bio_flush_dcache_pages(buf->bio); - bio_endio(buf->bio, 0); - } - mempool_free(buf, d->bufpool); - } - - f->buf = NULL; - f->tag = FREETAG; t->nout--; - aoecmd_work(d); -xmit: - __skb_queue_head_init(&queue); - skb_queue_splice_init(&d->sendq, &queue); spin_unlock_irqrestore(&d->lock, flags); - aoenet_xmit(&queue); + + ktcomplete(f, skb); + + /* + * Note here that we do not perform an aoedev_put, as we are + * leaving this reference for the ktio to release. 
+ */ + return NULL; } void @@ -950,7 +1111,7 @@ aoecmd_ata_id(struct aoedev *d) struct sk_buff *skb; struct aoetgt *t; - f = freeframe(d); + f = newframe(d); if (f == NULL) return NULL; @@ -963,6 +1124,7 @@ aoecmd_ata_id(struct aoedev *d) skb_put(skb, sizeof *h + sizeof *ah); memset(h, 0, skb->len); f->tag = aoehdr_atainit(d, t, h); + fhash(f); t->nout++; f->waited = 0; @@ -983,7 +1145,7 @@ static struct aoetgt * addtgt(struct aoedev *d, char *addr, ulong nframes) { struct aoetgt *t, **tt, **te; - struct frame *f, *e; + int i; tt = d->targets; te = tt + NTARGETS; @@ -995,23 +1157,21 @@ addtgt(struct aoedev *d, char *addr, ulong nframes) "aoe: device addtgt failure; too many targets\n"); return NULL; } - t = kcalloc(1, sizeof *t, GFP_ATOMIC); - f = kcalloc(nframes, sizeof *f, GFP_ATOMIC); - if (!t || !f) { - kfree(f); - kfree(t); + t = kzalloc(sizeof(*t), GFP_ATOMIC); + if (!t) { printk(KERN_INFO "aoe: cannot allocate memory to add target\n"); return NULL; } + d->ntargets++; t->nframes = nframes; - t->frames = f; - e = f + nframes; - for (; f < e; f++) - f->tag = FREETAG; + t->d = d; memcpy(t->addr, addr, sizeof t->addr); t->ifp = t->ifs; t->maxout = t->nframes; + INIT_LIST_HEAD(&t->ffree); + for (i = 0; i < NFACTIVE; ++i) + INIT_LIST_HEAD(&t->factive[i]); return *tt = t; } @@ -1136,3 +1296,53 @@ aoecmd_cleanslate(struct aoedev *d) } } } + +static void +flush_iocq(void) +{ + struct frame *f; + struct aoedev *d; + LIST_HEAD(flist); + struct list_head *pos; + struct sk_buff *skb; + ulong flags; + + spin_lock_irqsave(&iocq.lock, flags); + list_splice_init(&iocq.head, &flist); + spin_unlock_irqrestore(&iocq.lock, flags); + while (!list_empty(&flist)) { + pos = flist.next; + list_del(pos); + f = list_entry(pos, struct frame, head); + d = f->t->d; + skb = f->r_skb; + spin_lock_irqsave(&d->lock, flags); + if (f->buf) { + f->buf->nframesout--; + aoe_failbuf(d, f->buf); + } + aoe_freetframe(f); + spin_unlock_irqrestore(&d->lock, flags); + dev_kfree_skb(skb); + } +} + +int 
__init +aoecmd_init(void) +{ + INIT_LIST_HEAD(&iocq.head); + spin_lock_init(&iocq.lock); + init_waitqueue_head(&ktiowq); + kts.name = "aoe_ktio"; + kts.fn = ktio; + kts.waitq = &ktiowq; + kts.lock = &iocq.lock; + return aoe_ktstart(&kts); +} + +void +aoecmd_exit(void) +{ + aoe_ktstop(&kts); + flush_iocq(); +} diff --git a/drivers/block/aoe/aoedev.c b/drivers/block/aoe/aoedev.c index b2d1fd354eac..40bae1a1ff1e 100644 --- a/drivers/block/aoe/aoedev.c +++ b/drivers/block/aoe/aoedev.c @@ -48,47 +48,60 @@ dummy_timer(ulong vp) } void -aoedev_downdev(struct aoedev *d) +aoe_failbuf(struct aoedev *d, struct buf *buf) { - struct aoetgt **t, **te; - struct frame *f, *e; - struct buf *buf; struct bio *bio; - t = d->targets; - te = t + NTARGETS; - for (; t < te && *t; t++) { - f = (*t)->frames; - e = f + (*t)->nframes; - for (; f < e; f->tag = FREETAG, f->buf = NULL, f++) { - if (f->tag == FREETAG || f->buf == NULL) - continue; - buf = f->buf; - bio = buf->bio; - if (--buf->nframesout == 0 - && buf != d->inprocess) { - mempool_free(buf, d->bufpool); - bio_endio(bio, -EIO); - } - } - (*t)->maxout = (*t)->nframes; - (*t)->nout = 0; - } - buf = d->inprocess; - if (buf) { + if (buf == NULL) + return; + buf->flags |= BUFFL_FAIL; + if (buf->nframesout == 0) { + if (buf == d->inprocess) /* ensure we only process this once */ + d->inprocess = NULL; bio = buf->bio; mempool_free(buf, d->bufpool); bio_endio(bio, -EIO); } +} + +void +aoedev_downdev(struct aoedev *d) +{ + struct aoetgt *t, **tt, **te; + struct frame *f; + struct list_head *head, *pos, *nx; + int i; + + /* clean out active buffers on all targets */ + tt = d->targets; + te = tt + NTARGETS; + for (; tt < te && (t = *tt); tt++) { + for (i = 0; i < NFACTIVE; i++) { + head = &t->factive[i]; + list_for_each_safe(pos, nx, head) { + list_del(pos); + f = list_entry(pos, struct frame, head); + if (f->buf) { + f->buf->nframesout--; + aoe_failbuf(d, f->buf); + } + aoe_freetframe(f); + } + } + t->maxout = t->nframes; + t->nout = 0; + } 
+ + /* clean out the in-process buffer (if any) */ + aoe_failbuf(d, d->inprocess); d->inprocess = NULL; d->htgt = NULL; + /* clean out all pending I/O */ while (!list_empty(&d->bufq)) { - buf = container_of(d->bufq.next, struct buf, bufs); + struct buf *buf = container_of(d->bufq.next, struct buf, bufs); list_del(d->bufq.next); - bio = buf->bio; - mempool_free(buf, d->bufpool); - bio_endio(bio, -EIO); + aoe_failbuf(d, buf); } if (d->gd) @@ -242,13 +255,16 @@ aoedev_by_sysminor_m(ulong sysminor) static void freetgt(struct aoedev *d, struct aoetgt *t) { - struct frame *f, *e; + struct frame *f; + struct list_head *pos, *nx, *head; - f = t->frames; - e = f + t->nframes; - for (; f < e; f++) + head = &t->ffree; + list_for_each_safe(pos, nx, head) { + list_del(pos); + f = list_entry(pos, struct frame, head); skbfree(f->skb); - kfree(t->frames); + kfree(f); + } kfree(t); } diff --git a/drivers/block/aoe/aoemain.c b/drivers/block/aoe/aoemain.c index 7f83ad90e76f..6fc4b050fab1 100644 --- a/drivers/block/aoe/aoemain.c +++ b/drivers/block/aoe/aoemain.c @@ -61,6 +61,7 @@ aoe_exit(void) aoenet_exit(); unregister_blkdev(AOE_MAJOR, DEVICE_NAME); + aoecmd_exit(); aoechr_exit(); aoedev_exit(); aoeblk_exit(); /* free cache after de-allocating bufs */ @@ -83,17 +84,20 @@ aoe_init(void) ret = aoenet_init(); if (ret) goto net_fail; + ret = aoecmd_init(); + if (ret) + goto cmd_fail; ret = register_blkdev(AOE_MAJOR, DEVICE_NAME); if (ret < 0) { printk(KERN_ERR "aoe: can't register major\n"); goto blkreg_fail; } - printk(KERN_INFO "aoe: AoE v%s initialised.\n", VERSION); discover_timer(TINIT); return 0; - blkreg_fail: + aoecmd_exit(); + cmd_fail: aoenet_exit(); net_fail: aoeblk_exit(); diff --git a/drivers/block/aoe/aoenet.c b/drivers/block/aoe/aoenet.c index 07878076e43c..000eff2b53a8 100644 --- a/drivers/block/aoe/aoenet.c +++ b/drivers/block/aoe/aoenet.c @@ -142,7 +142,8 @@ aoenet_rcv(struct sk_buff *skb, struct net_device *ifp, struct packet_type *pt, switch (h->cmd) { case 
AOECMD_ATA: - aoecmd_ata_rsp(skb); + /* ata_rsp may keep skb for later processing or give it back */ + skb = aoecmd_ata_rsp(skb); break; case AOECMD_CFG: aoecmd_cfg_rsp(skb); @@ -152,6 +153,9 @@ aoenet_rcv(struct sk_buff *skb, struct net_device *ifp, struct packet_type *pt, break; /* don't complain about vendor commands */ printk(KERN_INFO "aoe: unknown cmd %d\n", h->cmd); } + + if (!skb) + return 0; exit: dev_kfree_skb(skb); return 0; -- cgit v1.2.3 From 69cf2d85de773d998798e47e3335b85e5645d157 Mon Sep 17 00:00:00 2001 From: Ed Cashin Date: Thu, 4 Oct 2012 17:16:23 -0700 Subject: aoe: become I/O request queue handler for increased user control To allow users to choose an elevator algorithm for their particular workloads, change from a make_request-style driver to an I/O-request-queue-handler-style driver. We have to do a couple of things that might be surprising. We manipulate the page _count directly on the assumption that we still have no guarantee that users of the block layer are prohibited from submitting bios containing pages with zero reference counts.[1] If such a prohibition now exists, I can get rid of the _count manipulation. Just as before this patch, we still keep track of the sk_buffs that the network layer still hasn't finished yet and cap the resources we use with a "pool" of skbs.[2] Now that the block layer maintains the disk stats, the aoe driver's diskstats function can go away. 1. https://lkml.org/lkml/2007/3/1/374 2. 
https://lkml.org/lkml/2007/7/6/241 Signed-off-by: Ed Cashin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/aoe/aoe.h | 26 +++-- drivers/block/aoe/aoeblk.c | 88 +++++--------- drivers/block/aoe/aoechr.c | 1 + drivers/block/aoe/aoecmd.c | 282 ++++++++++++++++++++++++++++++++------------- drivers/block/aoe/aoedev.c | 93 ++++++++++----- 5 files changed, 308 insertions(+), 182 deletions(-) (limited to 'drivers/block/aoe/aoe.h') diff --git a/drivers/block/aoe/aoe.h b/drivers/block/aoe/aoe.h index 0cd6c0f7a535..8c4f6d942e05 100644 --- a/drivers/block/aoe/aoe.h +++ b/drivers/block/aoe/aoe.h @@ -90,7 +90,7 @@ enum { MIN_BUFS = 16, NTARGETS = 8, NAOEIFS = 8, - NSKBPOOLMAX = 128, + NSKBPOOLMAX = 256, NFACTIVE = 17, TIMERTICK = HZ / 10, @@ -100,30 +100,26 @@ enum { }; struct buf { - struct list_head bufs; - ulong stime; /* for disk stats */ - ulong flags; ulong nframesout; ulong resid; ulong bv_resid; - ulong bv_off; sector_t sector; struct bio *bio; struct bio_vec *bv; + struct request *rq; }; struct frame { struct list_head head; u32 tag; ulong waited; - struct buf *buf; struct aoetgt *t; /* parent target I belong to */ - char *bufaddr; - ulong bcnt; sector_t lba; struct sk_buff *skb; /* command skb freed on module exit */ struct sk_buff *r_skb; /* response skb for async processing */ + struct buf *buf; struct bio_vec *bv; + ulong bcnt; ulong bv_off; }; @@ -161,6 +157,7 @@ struct aoedev { u16 rttavg; /* round trip average of requests/responses */ u16 mintimer; u16 fw_ver; /* version of blade's firmware */ + ulong ref; struct work_struct work;/* disk create work struct */ struct gendisk *gd; struct request_queue *blkq; @@ -168,11 +165,13 @@ struct aoedev { sector_t ssize; struct timer_list timer; spinlock_t lock; - struct sk_buff_head sendq; struct sk_buff_head skbpool; mempool_t *bufpool; /* for deadlock-free Buf allocation */ - struct list_head bufq; /* queue of bios to work on */ - struct buf *inprocess; /* the one we're currently working on 
*/ + struct { /* pointers to work in progress */ + struct buf *buf; + struct bio *nxbio; + struct request *rq; + } ip; struct aoetgt *targets[NTARGETS]; struct aoetgt **tgt; /* target in use when working */ struct aoetgt *htgt; /* target needing rexmit assistance */ @@ -209,6 +208,8 @@ void aoecmd_exit(void); int aoecmd_init(void); struct sk_buff *aoecmd_ata_id(struct aoedev *); void aoe_freetframe(struct frame *); +void aoe_flush_iocq(void); +void aoe_end_request(struct aoedev *, struct request *, int); int aoedev_init(void); void aoedev_exit(void); @@ -216,7 +217,8 @@ struct aoedev *aoedev_by_aoeaddr(int maj, int min); struct aoedev *aoedev_by_sysminor_m(ulong sysminor); void aoedev_downdev(struct aoedev *d); int aoedev_flush(const char __user *str, size_t size); -void aoe_failbuf(struct aoedev *d, struct buf *buf); +void aoe_failbuf(struct aoedev *, struct buf *); +void aoedev_put(struct aoedev *); int aoenet_init(void); void aoenet_exit(void); diff --git a/drivers/block/aoe/aoeblk.c b/drivers/block/aoe/aoeblk.c index 3a8f0933cc7d..7ec4b8fa28fd 100644 --- a/drivers/block/aoe/aoeblk.c +++ b/drivers/block/aoe/aoeblk.c @@ -161,68 +161,22 @@ aoeblk_release(struct gendisk *disk, fmode_t mode) } static void -aoeblk_make_request(struct request_queue *q, struct bio *bio) +aoeblk_request(struct request_queue *q) { - struct sk_buff_head queue; struct aoedev *d; - struct buf *buf; - ulong flags; - - blk_queue_bounce(q, &bio); - - if (bio == NULL) { - printk(KERN_ERR "aoe: bio is NULL\n"); - BUG(); - return; - } - d = bio->bi_bdev->bd_disk->private_data; - if (d == NULL) { - printk(KERN_ERR "aoe: bd_disk->private_data is NULL\n"); - BUG(); - bio_endio(bio, -ENXIO); - return; - } else if (bio->bi_io_vec == NULL) { - printk(KERN_ERR "aoe: bi_io_vec is NULL\n"); - BUG(); - bio_endio(bio, -ENXIO); - return; - } - buf = mempool_alloc(d->bufpool, GFP_NOIO); - if (buf == NULL) { - printk(KERN_INFO "aoe: buf allocation failure\n"); - bio_endio(bio, -ENOMEM); - return; - } - 
memset(buf, 0, sizeof(*buf)); - INIT_LIST_HEAD(&buf->bufs); - buf->stime = jiffies; - buf->bio = bio; - buf->resid = bio->bi_size; - buf->sector = bio->bi_sector; - buf->bv = &bio->bi_io_vec[bio->bi_idx]; - buf->bv_resid = buf->bv->bv_len; - WARN_ON(buf->bv_resid == 0); - buf->bv_off = buf->bv->bv_offset; - - spin_lock_irqsave(&d->lock, flags); + struct request *rq; + d = q->queuedata; if ((d->flags & DEVFL_UP) == 0) { pr_info_ratelimited("aoe: device %ld.%d is not up\n", d->aoemajor, d->aoeminor); - spin_unlock_irqrestore(&d->lock, flags); - mempool_free(buf, d->bufpool); - bio_endio(bio, -ENXIO); + while ((rq = blk_peek_request(q))) { + blk_start_request(rq); + aoe_end_request(d, rq, 1); + } return; } - - list_add_tail(&buf->bufs, &d->bufq); - aoecmd_work(d); - __skb_queue_head_init(&queue); - skb_queue_splice_init(&d->sendq, &queue); - - spin_unlock_irqrestore(&d->lock, flags); - aoenet_xmit(&queue); } static int @@ -254,34 +208,46 @@ aoeblk_gdalloc(void *vp) { struct aoedev *d = vp; struct gendisk *gd; - enum { KB = 1024, MB = KB * KB, READ_AHEAD = MB, }; + mempool_t *mp; + struct request_queue *q; + enum { KB = 1024, MB = KB * KB, READ_AHEAD = 2 * MB, }; ulong flags; gd = alloc_disk(AOE_PARTITIONS); if (gd == NULL) { - printk(KERN_ERR - "aoe: cannot allocate disk structure for %ld.%d\n", + pr_err("aoe: cannot allocate disk structure for %ld.%d\n", d->aoemajor, d->aoeminor); goto err; } - d->bufpool = mempool_create_slab_pool(MIN_BUFS, buf_pool_cache); - if (d->bufpool == NULL) { + mp = mempool_create(MIN_BUFS, mempool_alloc_slab, mempool_free_slab, + buf_pool_cache); + if (mp == NULL) { printk(KERN_ERR "aoe: cannot allocate bufpool for %ld.%d\n", d->aoemajor, d->aoeminor); goto err_disk; } + q = blk_init_queue(aoeblk_request, &d->lock); + if (q == NULL) { + pr_err("aoe: cannot allocate block queue for %ld.%d\n", + d->aoemajor, d->aoeminor); + mempool_destroy(mp); + goto err_disk; + } d->blkq = blk_alloc_queue(GFP_KERNEL); if (!d->blkq) goto err_mempool; - 
blk_queue_make_request(d->blkq, aoeblk_make_request); d->blkq->backing_dev_info.name = "aoe"; if (bdi_init(&d->blkq->backing_dev_info)) goto err_blkq; spin_lock_irqsave(&d->lock, flags); blk_queue_max_hw_sectors(d->blkq, BLK_DEF_MAX_SECTORS); - d->blkq->backing_dev_info.ra_pages = READ_AHEAD / PAGE_CACHE_SIZE; + q->backing_dev_info.ra_pages = READ_AHEAD / PAGE_CACHE_SIZE; + d->bufpool = mp; + d->blkq = gd->queue = q; + q->queuedata = d; + d->gd = gd; gd->major = AOE_MAJOR; gd->first_minor = d->sysminor * AOE_PARTITIONS; gd->fops = &aoe_bdops; @@ -290,8 +256,6 @@ aoeblk_gdalloc(void *vp) snprintf(gd->disk_name, sizeof gd->disk_name, "etherd/e%ld.%d", d->aoemajor, d->aoeminor); - gd->queue = d->blkq; - d->gd = gd; d->flags &= ~DEVFL_GDALLOC; d->flags |= DEVFL_UP; diff --git a/drivers/block/aoe/aoechr.c b/drivers/block/aoe/aoechr.c index f145388cb94a..3557f0d04b46 100644 --- a/drivers/block/aoe/aoechr.c +++ b/drivers/block/aoe/aoechr.c @@ -106,6 +106,7 @@ loop: spin_lock_irqsave(&d->lock, flags); goto loop; } + aoedev_put(d); if (skb) { struct sk_buff_head queue; __skb_queue_head_init(&queue); diff --git a/drivers/block/aoe/aoecmd.c b/drivers/block/aoe/aoecmd.c index 59b333c902a6..5928a08c1f3f 100644 --- a/drivers/block/aoe/aoecmd.c +++ b/drivers/block/aoe/aoecmd.c @@ -23,6 +23,8 @@ static void ktcomplete(struct frame *, struct sk_buff *); +static struct buf *nextbuf(struct aoedev *); + static int aoe_deadsecs = 60 * 3; module_param(aoe_deadsecs, int, 0644); MODULE_PARM_DESC(aoe_deadsecs, "After aoe_deadsecs seconds, give up and fail dev."); @@ -283,17 +285,20 @@ aoecmd_ata_rw(struct aoedev *d) struct bio_vec *bv; struct aoetgt *t; struct sk_buff *skb; + struct sk_buff_head queue; ulong bcnt, fbcnt; char writebit, extbit; writebit = 0x10; extbit = 0x4; + buf = nextbuf(d); + if (buf == NULL) + return 0; f = newframe(d); if (f == NULL) return 0; t = *d->tgt; - buf = d->inprocess; bv = buf->bv; bcnt = t->ifp->maxbcnt; if (bcnt == 0) @@ -312,7 +317,7 @@ 
aoecmd_ata_rw(struct aoedev *d) fbcnt -= buf->bv_resid; buf->resid -= buf->bv_resid; if (buf->resid == 0) { - d->inprocess = NULL; + d->ip.buf = NULL; break; } buf->bv++; @@ -364,8 +369,11 @@ aoecmd_ata_rw(struct aoedev *d) skb->dev = t->ifp->nd; skb = skb_clone(skb, GFP_ATOMIC); - if (skb) - __skb_queue_tail(&d->sendq, skb); + if (skb) { + __skb_queue_head_init(&queue); + __skb_queue_tail(&queue, skb); + aoenet_xmit(&queue); + } return 1; } @@ -415,6 +423,7 @@ static void resend(struct aoedev *d, struct frame *f) { struct sk_buff *skb; + struct sk_buff_head queue; struct aoe_hdr *h; struct aoe_atahdr *ah; struct aoetgt *t; @@ -444,7 +453,9 @@ resend(struct aoedev *d, struct frame *f) skb = skb_clone(skb, GFP_ATOMIC); if (skb == NULL) return; - __skb_queue_tail(&d->sendq, skb); + __skb_queue_head_init(&queue); + __skb_queue_tail(&queue, skb); + aoenet_xmit(&queue); } static int @@ -554,7 +565,6 @@ ata_scnt(unsigned char *packet) { static void rexmit_timer(ulong vp) { - struct sk_buff_head queue; struct aoedev *d; struct aoetgt *t, **tt, **te; struct aoeif *ifp; @@ -603,6 +613,12 @@ rexmit_timer(ulong vp) } } + if (!list_empty(&flist)) { /* retransmissions necessary */ + n = d->rttavg <<= 1; + if (n > MAXTIMER) + d->rttavg = MAXTIMER; + } + /* process expired frames */ while (!list_empty(&flist)) { pos = flist.next; @@ -641,45 +657,131 @@ rexmit_timer(ulong vp) resend(d, f); } - if (!skb_queue_empty(&d->sendq)) { - n = d->rttavg <<= 1; - if (n > MAXTIMER) - d->rttavg = MAXTIMER; - } - - if (d->flags & DEVFL_KICKME || d->htgt) { + if ((d->flags & DEVFL_KICKME || d->htgt) && d->blkq) { d->flags &= ~DEVFL_KICKME; - aoecmd_work(d); + d->blkq->request_fn(d->blkq); } - __skb_queue_head_init(&queue); - skb_queue_splice_init(&d->sendq, &queue); - d->timer.expires = jiffies + TIMERTICK; add_timer(&d->timer); spin_unlock_irqrestore(&d->lock, flags); +} - aoenet_xmit(&queue); +static unsigned long +rqbiocnt(struct request *r) +{ + struct bio *bio; + unsigned long n = 0; + + 
__rq_for_each_bio(bio, r) + n++; + return n; +} + +/* This can be removed if we are certain that no users of the block + * layer will ever use zero-count pages in bios. Otherwise we have to + * protect against the put_page sometimes done by the network layer. + * + * See http://oss.sgi.com/archives/xfs/2007-01/msg00594.html for + * discussion. + * + * We cannot use get_page in the workaround, because it insists on a + * positive page count as a precondition. So we use _count directly. + */ +static void +bio_pageinc(struct bio *bio) +{ + struct bio_vec *bv; + struct page *page; + int i; + + bio_for_each_segment(bv, bio, i) { + page = bv->bv_page; + /* Non-zero page count for non-head members of + * compound pages is no longer allowed by the kernel, + * but this has never been seen here. + */ + if (unlikely(PageCompound(page))) + if (compound_trans_head(page) != page) { + pr_crit("page tail used for block I/O\n"); + BUG(); + } + atomic_inc(&page->_count); + } +} + +static void +bio_pagedec(struct bio *bio) +{ + struct bio_vec *bv; + int i; + + bio_for_each_segment(bv, bio, i) + atomic_dec(&bv->bv_page->_count); +} + +static void +bufinit(struct buf *buf, struct request *rq, struct bio *bio) +{ + struct bio_vec *bv; + + memset(buf, 0, sizeof(*buf)); + buf->rq = rq; + buf->bio = bio; + buf->resid = bio->bi_size; + buf->sector = bio->bi_sector; + bio_pageinc(bio); + buf->bv = bv = &bio->bi_io_vec[bio->bi_idx]; + buf->bv_resid = bv->bv_len; + WARN_ON(buf->bv_resid == 0); +} + +static struct buf * +nextbuf(struct aoedev *d) +{ + struct request *rq; + struct request_queue *q; + struct buf *buf; + struct bio *bio; + + q = d->blkq; + if (q == NULL) + return NULL; /* initializing */ + if (d->ip.buf) + return d->ip.buf; + rq = d->ip.rq; + if (rq == NULL) { + rq = blk_peek_request(q); + if (rq == NULL) + return NULL; + blk_start_request(rq); + d->ip.rq = rq; + d->ip.nxbio = rq->bio; + rq->special = (void *) rqbiocnt(rq); + } + buf = mempool_alloc(d->bufpool, GFP_ATOMIC); + if 
(buf == NULL) { + pr_err("aoe: nextbuf: unable to mempool_alloc!\n"); + return NULL; + } + bio = d->ip.nxbio; + bufinit(buf, rq, bio); + bio = bio->bi_next; + d->ip.nxbio = bio; + if (bio == NULL) + d->ip.rq = NULL; + return d->ip.buf = buf; } /* enters with d->lock held */ void aoecmd_work(struct aoedev *d) { - struct buf *buf; -loop: if (d->htgt && !sthtith(d)) return; - if (d->inprocess == NULL) { - if (list_empty(&d->bufq)) - return; - buf = container_of(d->bufq.next, struct buf, bufs); - list_del(d->bufq.next); - d->inprocess = buf; - } - if (aoecmd_ata_rw(d)) - goto loop; + while (aoecmd_ata_rw(d)) + ; } /* this function performs work that has been deferred until sleeping is OK @@ -802,25 +904,6 @@ gettgt(struct aoedev *d, char *addr) return NULL; } -static inline void -diskstats(struct gendisk *disk, struct bio *bio, ulong duration, sector_t sector) -{ - unsigned long n_sect = bio->bi_size >> 9; - const int rw = bio_data_dir(bio); - struct hd_struct *part; - int cpu; - - cpu = part_stat_lock(); - part = disk_map_sector_rcu(disk, sector); - - part_stat_inc(cpu, part, ios[rw]); - part_stat_add(cpu, part, ticks[rw], duration); - part_stat_add(cpu, part, sectors[rw], n_sect); - part_stat_add(cpu, part, io_ticks, duration); - - part_stat_unlock(); -} - static void bvcpy(struct bio_vec *bv, ulong off, struct sk_buff *skb, long cnt) { @@ -842,6 +925,43 @@ loop: goto loop; } +void +aoe_end_request(struct aoedev *d, struct request *rq, int fastfail) +{ + struct bio *bio; + int bok; + struct request_queue *q; + + q = d->blkq; + if (rq == d->ip.rq) + d->ip.rq = NULL; + do { + bio = rq->bio; + bok = !fastfail && test_bit(BIO_UPTODATE, &bio->bi_flags); + } while (__blk_end_request(rq, bok ? 0 : -EIO, bio->bi_size)); + + /* cf. 
http://lkml.org/lkml/2006/10/31/28 */ + if (!fastfail) + q->request_fn(q); +} + +static void +aoe_end_buf(struct aoedev *d, struct buf *buf) +{ + struct request *rq; + unsigned long n; + + if (buf == d->ip.buf) + d->ip.buf = NULL; + rq = buf->rq; + bio_pagedec(buf->bio); + mempool_free(buf, d->bufpool); + n = (unsigned long) rq->special; + rq->special = (void *) --n; + if (n == 0) + aoe_end_request(d, rq, 0); +} + static void ktiocomplete(struct frame *f) { @@ -876,7 +996,7 @@ ktiocomplete(struct frame *f) ahout->cmdstat, ahin->cmdstat, d->aoemajor, d->aoeminor); noskb: if (buf) - buf->flags |= BUFFL_FAIL; + clear_bit(BIO_UPTODATE, &buf->bio->bi_flags); goto badrsp; } @@ -887,7 +1007,7 @@ noskb: if (buf) if (skb->len < n) { pr_err("aoe: runt data size in read. skb->len=%d need=%ld\n", skb->len, n); - buf->flags |= BUFFL_FAIL; + clear_bit(BIO_UPTODATE, &buf->bio->bi_flags); break; } bvcpy(f->bv, f->bv_off, skb, n); @@ -927,18 +1047,13 @@ badrsp: aoe_freetframe(f); - if (buf && --buf->nframesout == 0 && buf->resid == 0) { - struct bio *bio = buf->bio; + if (buf && --buf->nframesout == 0 && buf->resid == 0) + aoe_end_buf(d, buf); - diskstats(d->gd, bio, jiffies - buf->stime, buf->sector); - n = (buf->flags & BUFFL_FAIL) ? 
-EIO : 0; - mempool_free(buf, d->bufpool); - spin_unlock_irq(&d->lock); - if (n != -EIO) - bio_flush_dcache_pages(buf->bio); - bio_endio(bio, n); - } else - spin_unlock_irq(&d->lock); + aoecmd_work(d); + + spin_unlock_irq(&d->lock); + aoedev_put(d); dev_kfree_skb(skb); } @@ -1061,12 +1176,14 @@ aoecmd_ata_rsp(struct sk_buff *skb) printk(KERN_INFO "aoe: can't find target e%ld.%d:%pm\n", d->aoemajor, d->aoeminor, h->src); spin_unlock_irqrestore(&d->lock, flags); + aoedev_put(d); return skb; } f = getframe(t, n); if (f == NULL) { calc_rttavg(d, -tsince(n)); spin_unlock_irqrestore(&d->lock, flags); + aoedev_put(d); snprintf(ebuf, sizeof ebuf, "%15s e%d.%d tag=%08x@%08lx\n", "unexpected rsp", @@ -1185,8 +1302,10 @@ aoecmd_cfg_rsp(struct sk_buff *skb) struct aoeif *ifp; ulong flags, sysminor, aoemajor; struct sk_buff *sl; + struct sk_buff_head queue; u16 n; + sl = NULL; h = (struct aoe_hdr *) skb_mac_header(skb); ch = (struct aoe_cfghdr *) (h+1); @@ -1223,10 +1342,8 @@ aoecmd_cfg_rsp(struct sk_buff *skb) t = gettgt(d, h->src); if (!t) { t = addtgt(d, h->src, n); - if (!t) { - spin_unlock_irqrestore(&d->lock, flags); - return; - } + if (!t) + goto bail; } ifp = getif(t, skb->dev); if (!ifp) { @@ -1235,8 +1352,7 @@ aoecmd_cfg_rsp(struct sk_buff *skb) printk(KERN_INFO "aoe: device addif failure; " "too many interfaces?\n"); - spin_unlock_irqrestore(&d->lock, flags); - return; + goto bail; } } if (ifp->maxbcnt) { @@ -1257,18 +1373,14 @@ aoecmd_cfg_rsp(struct sk_buff *skb) } /* don't change users' perspective */ - if (d->nopen) { - spin_unlock_irqrestore(&d->lock, flags); - return; + if (d->nopen == 0) { + d->fw_ver = be16_to_cpu(ch->fwver); + sl = aoecmd_ata_id(d); } - d->fw_ver = be16_to_cpu(ch->fwver); - - sl = aoecmd_ata_id(d); - +bail: spin_unlock_irqrestore(&d->lock, flags); - + aoedev_put(d); if (sl) { - struct sk_buff_head queue; __skb_queue_head_init(&queue); __skb_queue_tail(&queue, sl); aoenet_xmit(&queue); @@ -1297,8 +1409,19 @@ aoecmd_cleanslate(struct aoedev *d) 
} } -static void -flush_iocq(void) +void +aoe_failbuf(struct aoedev *d, struct buf *buf) +{ + if (buf == NULL) + return; + buf->resid = 0; + clear_bit(BIO_UPTODATE, &buf->bio->bi_flags); + if (buf->nframesout == 0) + aoe_end_buf(d, buf); +} + +void +aoe_flush_iocq(void) { struct frame *f; struct aoedev *d; @@ -1324,6 +1447,7 @@ flush_iocq(void) aoe_freetframe(f); spin_unlock_irqrestore(&d->lock, flags); dev_kfree_skb(skb); + aoedev_put(d); } } @@ -1344,5 +1468,5 @@ void aoecmd_exit(void) { aoe_ktstop(&kts); - flush_iocq(); + aoe_flush_iocq(); } diff --git a/drivers/block/aoe/aoedev.c b/drivers/block/aoe/aoedev.c index 40bae1a1ff1e..635dc986cf77 100644 --- a/drivers/block/aoe/aoedev.c +++ b/drivers/block/aoe/aoedev.c @@ -19,6 +19,17 @@ static void skbpoolfree(struct aoedev *d); static struct aoedev *devlist; static DEFINE_SPINLOCK(devlist_lock); +/* + * Users who grab a pointer to the device with aoedev_by_aoeaddr or + * aoedev_by_sysminor_m automatically get a reference count and must + * be responsible for performing a aoedev_put. With the addition of + * async kthread processing I'm no longer confident that we can + * guarantee consistency in the face of device flushes. + * + * For the time being, we only bother to add extra references for + * frames sitting on the iocq. When the kthreads finish processing + * these frames, they will aoedev_put the device. 
+ */ struct aoedev * aoedev_by_aoeaddr(int maj, int min) { @@ -28,13 +39,25 @@ aoedev_by_aoeaddr(int maj, int min) spin_lock_irqsave(&devlist_lock, flags); for (d=devlist; d; d=d->next) - if (d->aoemajor == maj && d->aoeminor == min) + if (d->aoemajor == maj && d->aoeminor == min) { + d->ref++; break; + } spin_unlock_irqrestore(&devlist_lock, flags); return d; } +void +aoedev_put(struct aoedev *d) +{ + ulong flags; + + spin_lock_irqsave(&devlist_lock, flags); + d->ref--; + spin_unlock_irqrestore(&devlist_lock, flags); +} + static void dummy_timer(ulong vp) { @@ -47,21 +70,26 @@ dummy_timer(ulong vp) add_timer(&d->timer); } -void -aoe_failbuf(struct aoedev *d, struct buf *buf) +static void +aoe_failip(struct aoedev *d) { + struct request *rq; struct bio *bio; + unsigned long n; + + aoe_failbuf(d, d->ip.buf); - if (buf == NULL) + rq = d->ip.rq; + if (rq == NULL) return; - buf->flags |= BUFFL_FAIL; - if (buf->nframesout == 0) { - if (buf == d->inprocess) /* ensure we only process this once */ - d->inprocess = NULL; - bio = buf->bio; - mempool_free(buf, d->bufpool); - bio_endio(bio, -EIO); + while ((bio = d->ip.nxbio)) { + clear_bit(BIO_UPTODATE, &bio->bi_flags); + d->ip.nxbio = bio->bi_next; + n = (unsigned long) rq->special; + rq->special = (void *) --n; } + if ((unsigned long) rq->special == 0) + aoe_end_request(d, rq, 0); } void @@ -70,8 +98,11 @@ aoedev_downdev(struct aoedev *d) struct aoetgt *t, **tt, **te; struct frame *f; struct list_head *head, *pos, *nx; + struct request *rq; int i; + d->flags &= ~DEVFL_UP; + /* clean out active buffers on all targets */ tt = d->targets; te = tt + NTARGETS; @@ -92,22 +123,20 @@ aoedev_downdev(struct aoedev *d) t->nout = 0; } - /* clean out the in-process buffer (if any) */ - aoe_failbuf(d, d->inprocess); - d->inprocess = NULL; + /* clean out the in-process request (if any) */ + aoe_failip(d); d->htgt = NULL; - /* clean out all pending I/O */ - while (!list_empty(&d->bufq)) { - struct buf *buf = container_of(d->bufq.next, 
struct buf, bufs); - list_del(d->bufq.next); - aoe_failbuf(d, buf); + /* fast fail all pending I/O */ + if (d->blkq) { + while ((rq = blk_peek_request(d->blkq))) { + blk_start_request(rq); + aoe_end_request(d, rq, 1); + } } if (d->gd) set_capacity(d->gd, 0); - - d->flags &= ~DEVFL_UP; } static void @@ -120,6 +149,7 @@ aoedev_freedev(struct aoedev *d) aoedisk_rm_sysfs(d); del_gendisk(d->gd); put_disk(d->gd); + blk_cleanup_queue(d->blkq); } t = d->targets; e = t + NTARGETS; @@ -128,7 +158,6 @@ aoedev_freedev(struct aoedev *d) if (d->bufpool) mempool_destroy(d->bufpool); skbpoolfree(d); - blk_cleanup_queue(d->blkq); kfree(d); } @@ -155,7 +184,8 @@ aoedev_flush(const char __user *str, size_t cnt) spin_lock(&d->lock); if ((!all && (d->flags & DEVFL_UP)) || (d->flags & (DEVFL_GDALLOC|DEVFL_NEWSIZE)) - || d->nopen) { + || d->nopen + || d->ref) { spin_unlock(&d->lock); dd = &d->next; continue; @@ -176,12 +206,15 @@ aoedev_flush(const char __user *str, size_t cnt) return 0; } -/* I'm not really sure that this is a realistic problem, but if the -network driver goes gonzo let's just leak memory after complaining. */ +/* This has been confirmed to occur once with Tms=3*1000 due to the + * driver changing link and not processing its transmit ring. The + * problem is hard enough to solve by returning an error that I'm + * still punting on "solving" this. 
+ */ static void skbfree(struct sk_buff *skb) { - enum { Sms = 100, Tms = 3*1000}; + enum { Sms = 250, Tms = 30 * 1000}; int i = Tms / Sms; if (skb == NULL) @@ -222,8 +255,10 @@ aoedev_by_sysminor_m(ulong sysminor) spin_lock_irqsave(&devlist_lock, flags); for (d=devlist; d; d=d->next) - if (d->sysminor == sysminor) + if (d->sysminor == sysminor) { + d->ref++; break; + } if (d) goto out; d = kcalloc(1, sizeof *d, GFP_ATOMIC); @@ -231,7 +266,6 @@ aoedev_by_sysminor_m(ulong sysminor) goto out; INIT_WORK(&d->work, aoecmd_sleepwork); spin_lock_init(&d->lock); - skb_queue_head_init(&d->sendq); skb_queue_head_init(&d->skbpool); init_timer(&d->timer); d->timer.data = (ulong) d; @@ -240,7 +274,7 @@ aoedev_by_sysminor_m(ulong sysminor) add_timer(&d->timer); d->bufpool = NULL; /* defer to aoeblk_gdalloc */ d->tgt = d->targets; - INIT_LIST_HEAD(&d->bufq); + d->ref = 1; d->sysminor = sysminor; d->aoemajor = AOEMAJOR(sysminor); d->aoeminor = AOEMINOR(sysminor); @@ -274,6 +308,7 @@ aoedev_exit(void) struct aoedev *d; ulong flags; + aoe_flush_iocq(); while ((d = devlist)) { devlist = d->next; -- cgit v1.2.3 From eb086ec59667df5b07d58176e21a5f523ead1d66 Mon Sep 17 00:00:00 2001 From: Ed Cashin Date: Thu, 4 Oct 2012 17:16:25 -0700 Subject: aoe: use a kernel thread for transmissions The dev_queue_xmit function needs to have interrupts enabled, so the most simple way to get the locking right but still fulfill that requirement is to use a process that can call dev_queue_xmit serially over queued transmissions. 
Signed-off-by: Ed Cashin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/aoe/aoe.h | 2 ++ drivers/block/aoe/aoecmd.c | 4 ++-- drivers/block/aoe/aoenet.c | 37 ++++++++++++++++++++++++++++++++++++- 3 files changed, 40 insertions(+), 3 deletions(-) (limited to 'drivers/block/aoe/aoe.h') diff --git a/drivers/block/aoe/aoe.h b/drivers/block/aoe/aoe.h index 8c4f6d942e05..d0087de1780e 100644 --- a/drivers/block/aoe/aoe.h +++ b/drivers/block/aoe/aoe.h @@ -210,6 +210,8 @@ struct sk_buff *aoecmd_ata_id(struct aoedev *); void aoe_freetframe(struct frame *); void aoe_flush_iocq(void); void aoe_end_request(struct aoedev *, struct request *, int); +int aoe_ktstart(struct ktstate *k); +void aoe_ktstop(struct ktstate *k); int aoedev_init(void); void aoedev_exit(void); diff --git a/drivers/block/aoe/aoecmd.c b/drivers/block/aoe/aoecmd.c index 5928a08c1f3f..a1c5e8aa08c0 100644 --- a/drivers/block/aoe/aoecmd.c +++ b/drivers/block/aoe/aoecmd.c @@ -1110,14 +1110,14 @@ kthread(void *vp) return 0; } -static void +void aoe_ktstop(struct ktstate *k) { kthread_stop(k->task); wait_for_completion(&k->rendez); } -static int +int aoe_ktstart(struct ktstate *k) { struct task_struct *task; diff --git a/drivers/block/aoe/aoenet.c b/drivers/block/aoe/aoenet.c index 000eff2b53a8..5f43710601ab 100644 --- a/drivers/block/aoe/aoenet.c +++ b/drivers/block/aoe/aoenet.c @@ -33,6 +33,9 @@ static char aoe_iflist[IFLISTSZ]; module_param_string(aoe_iflist, aoe_iflist, IFLISTSZ, 0600); MODULE_PARM_DESC(aoe_iflist, "aoe_iflist=\"dev1 [dev2 ...]\""); +static wait_queue_head_t txwq; +static struct ktstate kts; + #ifndef MODULE static int __init aoe_iflist_setup(char *str) { @@ -44,6 +47,23 @@ static int __init aoe_iflist_setup(char *str) __setup("aoe_iflist=", aoe_iflist_setup); #endif +static spinlock_t txlock; +static struct sk_buff_head skbtxq; + +/* enters with txlock held */ +static int +tx(void) +{ + struct sk_buff *skb; + + while ((skb = skb_dequeue(&skbtxq))) { + 
spin_unlock_irq(&txlock); + dev_queue_xmit(skb); + spin_lock_irq(&txlock); + } + return 0; +} + int is_aoe_netif(struct net_device *ifp) { @@ -88,10 +108,14 @@ void aoenet_xmit(struct sk_buff_head *queue) { struct sk_buff *skb, *tmp; + ulong flags; skb_queue_walk_safe(queue, skb, tmp) { __skb_unlink(skb, queue); - dev_queue_xmit(skb); + spin_lock_irqsave(&txlock, flags); + skb_queue_tail(&skbtxq, skb); + spin_unlock_irqrestore(&txlock, flags); + wake_up(&txwq); } } @@ -169,6 +193,15 @@ static struct packet_type aoe_pt __read_mostly = { int __init aoenet_init(void) { + skb_queue_head_init(&skbtxq); + init_waitqueue_head(&txwq); + spin_lock_init(&txlock); + kts.lock = &txlock; + kts.fn = tx; + kts.waitq = &txwq; + kts.name = "aoe_tx"; + if (aoe_ktstart(&kts)) + return -EAGAIN; dev_add_pack(&aoe_pt); return 0; } @@ -176,6 +209,8 @@ aoenet_init(void) void aoenet_exit(void) { + aoe_ktstop(&kts); + skb_queue_purge(&skbtxq); dev_remove_pack(&aoe_pt); } -- cgit v1.2.3 From 3f0f0133747368fe0fcf3908f788b53591bff4e0 Mon Sep 17 00:00:00 2001 From: Ed Cashin Date: Thu, 4 Oct 2012 17:16:27 -0700 Subject: aoe: use packets that work with the smallest-MTU local interface Users with several network interfaces dedicated to AoE generally do not configure them to support different-sized AoE data payloads on purpose. For a given AoE target, there will be a set of local network interfaces that can reach it. Using only the payload that will fit in the smallest-sized MTU of all those local interfaces greatly simplifies the driver, especially in failure scenarios. 
Signed-off-by: Ed Cashin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/aoe/aoe.h | 7 ++- drivers/block/aoe/aoecmd.c | 151 +++++++++++++++++++++++++-------------------- 2 files changed, 87 insertions(+), 71 deletions(-) (limited to 'drivers/block/aoe/aoe.h') diff --git a/drivers/block/aoe/aoe.h b/drivers/block/aoe/aoe.h index d0087de1780e..ffded64dcbeb 100644 --- a/drivers/block/aoe/aoe.h +++ b/drivers/block/aoe/aoe.h @@ -125,9 +125,8 @@ struct frame { struct aoeif { struct net_device *nd; - unsigned char lost; - unsigned char lostjumbo; - ushort maxbcnt; + ulong lost; + int bcnt; }; struct aoetgt { @@ -144,6 +143,7 @@ struct aoetgt { u16 useme; ulong falloc; ulong lastwadj; /* last window adjustment */ + int minbcnt; int wpkts, rpkts; }; @@ -172,6 +172,7 @@ struct aoedev { struct bio *nxbio; struct request *rq; } ip; + ulong maxbcnt; struct aoetgt *targets[NTARGETS]; struct aoetgt **tgt; /* target in use when working */ struct aoetgt *htgt; /* target needing rexmit assistance */ diff --git a/drivers/block/aoe/aoecmd.c b/drivers/block/aoe/aoecmd.c index a1c5e8aa08c0..bbab40c8d67c 100644 --- a/drivers/block/aoe/aoecmd.c +++ b/drivers/block/aoe/aoecmd.c @@ -119,16 +119,18 @@ put_lba(struct aoe_atahdr *ah, sector_t lba) ah->lba5 = lba >>= 8; } -static void +static struct aoeif * ifrotate(struct aoetgt *t) { - t->ifp++; - if (t->ifp >= &t->ifs[NAOEIFS] || t->ifp->nd == NULL) - t->ifp = t->ifs; - if (t->ifp->nd == NULL) { - printk(KERN_INFO "aoe: no interface to rotate to\n"); - BUG(); - } + struct aoeif *ifp; + + ifp = t->ifp; + ifp++; + if (ifp >= &t->ifs[NAOEIFS] || ifp->nd == NULL) + ifp = t->ifs; + if (ifp->nd == NULL) + return NULL; + return t->ifp = ifp; } static void @@ -232,8 +234,8 @@ newframe(struct aoedev *d) && t->ifp->nd) { f = newtframe(d, t); if (f) { - d->tgt = tt; ifrotate(t); + d->tgt = tt; return f; } } @@ -300,7 +302,7 @@ aoecmd_ata_rw(struct aoedev *d) return 0; t = *d->tgt; bv = buf->bv; - bcnt = t->ifp->maxbcnt; + 
bcnt = d->maxbcnt; if (bcnt == 0) bcnt = DEFAULTBCNT; if (bcnt > buf->resid) @@ -431,9 +433,14 @@ resend(struct aoedev *d, struct frame *f) u32 n; t = f->t; - ifrotate(t); n = newtag(t); skb = f->skb; + if (ifrotate(t) == NULL) { + /* probably can't happen, but set it up to fail anyway */ + pr_info("aoe: resend: no interfaces to rotate to.\n"); + ktcomplete(f, NULL); + return; + } h = (struct aoe_hdr *) skb_mac_header(skb); ah = (struct aoe_atahdr *) (h+1); @@ -483,21 +490,6 @@ getif(struct aoetgt *t, struct net_device *nd) return NULL; } -static struct aoeif * -addif(struct aoetgt *t, struct net_device *nd) -{ - struct aoeif *p; - - p = getif(t, NULL); - if (!p) - return NULL; - p->nd = nd; - p->maxbcnt = DEFAULTBCNT; - p->lost = 0; - p->lostjumbo = 0; - return p; -} - static void ejectif(struct aoetgt *t, struct aoeif *ifp) { @@ -546,7 +538,11 @@ sthtith(struct aoedev *d) resend(d, nf); } } - /* he's clean, he's useless. take away his interfaces */ + /* We've cleaned up the outstanding so take away his + * interfaces so he won't be used. We should remove him from + * the target array here, but cleaning up a target is + * involved. PUNT! + */ memset(ht->ifs, 0, sizeof ht->ifs); d->htgt = NULL; return 1; @@ -1015,11 +1011,8 @@ noskb: if (buf) case ATA_CMD_PIO_WRITE_EXT: spin_lock_irq(&d->lock); ifp = getif(t, skb->dev); - if (ifp) { + if (ifp) ifp->lost = 0; - if (n > DEFAULTBCNT) - ifp->lostjumbo = 0; - } if (d->htgt == t) /* I'll help myself, thank you. 
*/ d->htgt = NULL; spin_unlock_irq(&d->lock); @@ -1292,6 +1285,56 @@ addtgt(struct aoedev *d, char *addr, ulong nframes) return *tt = t; } +static void +setdbcnt(struct aoedev *d) +{ + struct aoetgt **t, **e; + int bcnt = 0; + + t = d->targets; + e = t + NTARGETS; + for (; t < e && *t; t++) + if (bcnt == 0 || bcnt > (*t)->minbcnt) + bcnt = (*t)->minbcnt; + if (bcnt != d->maxbcnt) { + d->maxbcnt = bcnt; + pr_info("aoe: e%ld.%d: setting %d byte data frames\n", + d->aoemajor, d->aoeminor, bcnt); + } +} + +static void +setifbcnt(struct aoetgt *t, struct net_device *nd, int bcnt) +{ + struct aoedev *d; + struct aoeif *p, *e; + int minbcnt; + + d = t->d; + minbcnt = bcnt; + p = t->ifs; + e = p + NAOEIFS; + for (; p < e; p++) { + if (p->nd == NULL) + break; /* end of the valid interfaces */ + if (p->nd == nd) { + p->bcnt = bcnt; /* we're updating */ + nd = NULL; + } else if (minbcnt > p->bcnt) + minbcnt = p->bcnt; /* find the min interface */ + } + if (nd) { + if (p == e) { + pr_err("aoe: device setifbcnt failure; too many interfaces.\n"); + return; + } + p->nd = nd; + p->bcnt = bcnt; + } + t->minbcnt = minbcnt; + setdbcnt(d); +} + void aoecmd_cfg_rsp(struct sk_buff *skb) { @@ -1299,7 +1342,6 @@ aoecmd_cfg_rsp(struct sk_buff *skb) struct aoe_hdr *h; struct aoe_cfghdr *ch; struct aoetgt *t; - struct aoeif *ifp; ulong flags, sysminor, aoemajor; struct sk_buff *sl; struct sk_buff_head queue; @@ -1345,32 +1387,13 @@ aoecmd_cfg_rsp(struct sk_buff *skb) if (!t) goto bail; } - ifp = getif(t, skb->dev); - if (!ifp) { - ifp = addif(t, skb->dev); - if (!ifp) { - printk(KERN_INFO - "aoe: device addif failure; " - "too many interfaces?\n"); - goto bail; - } - } - if (ifp->maxbcnt) { - n = ifp->nd->mtu; - n -= sizeof (struct aoe_hdr) + sizeof (struct aoe_atahdr); - n /= 512; - if (n > ch->scnt) - n = ch->scnt; - n = n ? 
n * 512 : DEFAULTBCNT; - if (n != ifp->maxbcnt) { - printk(KERN_INFO - "aoe: e%ld.%d: setting %d%s%s:%pm\n", - d->aoemajor, d->aoeminor, n, - " byte data frames on ", ifp->nd->name, - t->addr); - ifp->maxbcnt = n; - } - } + n = skb->dev->mtu; + n -= sizeof(struct aoe_hdr) + sizeof(struct aoe_atahdr); + n /= 512; + if (n > ch->scnt) + n = ch->scnt; + n = n ? n * 512 : DEFAULTBCNT; + setifbcnt(t, skb->dev, n); /* don't change users' perspective */ if (d->nopen == 0) { @@ -1391,22 +1414,14 @@ void aoecmd_cleanslate(struct aoedev *d) { struct aoetgt **t, **te; - struct aoeif *p, *e; d->mintimer = MINTIMER; + d->maxbcnt = 0; t = d->targets; te = t + NTARGETS; - for (; t < te && *t; t++) { + for (; t < te && *t; t++) (*t)->maxout = (*t)->nframes; - p = (*t)->ifs; - e = p + NAOEIFS; - for (; p < e; p++) { - p->lostjumbo = 0; - p->lost = 0; - p->maxbcnt = DEFAULTBCNT; - } - } } void -- cgit v1.2.3 From d54d35ac6605161a593e3f4411de338ef81b5263 Mon Sep 17 00:00:00 2001 From: Ed Cashin Date: Thu, 4 Oct 2012 17:16:29 -0700 Subject: aoe: failover remote interface based on aoe_deadsecs parameter The aoe_deadsecs module parameter allows the user to specify a hard limit on the number of seconds an AoE command can be retransmitted before the AoE block device is considered to have failed. Using aoe_deadsecs to determine the time we try using a different remote interface helps to ensure that the hard limit is not reached before we've tried to recover by sending to a different remote port. As a data storage target, the AoE target is unambiguously identified by its {major, minor} AoE address tuple, and an AoE target can have multiple MAC addresses. However, note that "target" in the driver code and comments means a {major, minor, MAC address} tuple, as in "somewhere to send packets". 
Signed-off-by: Ed Cashin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/aoe/aoe.h | 1 - drivers/block/aoe/aoecmd.c | 8 +++----- 2 files changed, 3 insertions(+), 6 deletions(-) (limited to 'drivers/block/aoe/aoe.h') diff --git a/drivers/block/aoe/aoe.h b/drivers/block/aoe/aoe.h index ffded64dcbeb..d17b72763973 100644 --- a/drivers/block/aoe/aoe.h +++ b/drivers/block/aoe/aoe.h @@ -96,7 +96,6 @@ enum { TIMERTICK = HZ / 10, MINTIMER = HZ >> 2, MAXTIMER = HZ << 1, - HELPWAIT = 20, }; struct buf { diff --git a/drivers/block/aoe/aoecmd.c b/drivers/block/aoe/aoecmd.c index bbab40c8d67c..e3291dfa7607 100644 --- a/drivers/block/aoe/aoecmd.c +++ b/drivers/block/aoe/aoecmd.c @@ -633,11 +633,9 @@ rexmit_timer(ulong vp) list_del(pos); t = f->t; - if (n > HELPWAIT) { - /* see if another target can help */ - if (d->ntargets > 1) - d->htgt = t; - } + if (n > aoe_deadsecs/2) + d->htgt = t; /* see if another target can help */ + if (t->nout == t->maxout) { if (t->maxout > 1) t->maxout--; -- cgit v1.2.3 From 64a80f5ac78a289f66c373ace61973205d960ee7 Mon Sep 17 00:00:00 2001 From: Ed Cashin Date: Thu, 4 Oct 2012 17:16:33 -0700 Subject: aoe: associate frames with the AoE storage target In the driver code, "target" and aoetgt refer to a particular remote interface on the AoE storage target. The latter is identified by its AoE major and minor addresses. Commands that are being sent to an AoE storage target {major, minor} can be sent or retransmitted to any of the remote MAC addresses associated with the AoE storage target. That is, frames are naturally associated with not an aoetgt (AoE major, AoE minor, remote MAC address) but an aoedev (AoE major, AoE minor). Making the code reflect that reality simplifies the driver, especially when the path to a remote MAC address becomes unusable. 
Signed-off-by: Ed Cashin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/aoe/aoe.h | 8 +++--- drivers/block/aoe/aoecmd.c | 65 ++++++++++++++++++++-------------------------- drivers/block/aoe/aoedev.c | 30 +++++++++++---------- 3 files changed, 49 insertions(+), 54 deletions(-) (limited to 'drivers/block/aoe/aoe.h') diff --git a/drivers/block/aoe/aoe.h b/drivers/block/aoe/aoe.h index d17b72763973..dab7258ddb26 100644 --- a/drivers/block/aoe/aoe.h +++ b/drivers/block/aoe/aoe.h @@ -91,7 +91,7 @@ enum { NTARGETS = 8, NAOEIFS = 8, NSKBPOOLMAX = 256, - NFACTIVE = 17, + NFACTIVE = 61, TIMERTICK = HZ / 10, MINTIMER = HZ >> 2, @@ -132,14 +132,11 @@ struct aoetgt { unsigned char addr[6]; ushort nframes; struct aoedev *d; /* parent device I belong to */ - struct list_head factive[NFACTIVE]; /* hash of active frames */ struct list_head ffree; /* list of free frames */ struct aoeif ifs[NAOEIFS]; struct aoeif *ifp; /* current aoeif in use */ ushort nout; ushort maxout; - u16 lasttag; /* last tag sent */ - u16 useme; ulong falloc; ulong lastwadj; /* last window adjustment */ int minbcnt; @@ -156,6 +153,8 @@ struct aoedev { u16 rttavg; /* round trip average of requests/responses */ u16 mintimer; u16 fw_ver; /* version of blade's firmware */ + u16 lasttag; /* last tag sent */ + u16 useme; ulong ref; struct work_struct work;/* disk create work struct */ struct gendisk *gd; @@ -172,6 +171,7 @@ struct aoedev { struct request *rq; } ip; ulong maxbcnt; + struct list_head factive[NFACTIVE]; /* hash of active frames */ struct aoetgt *targets[NTARGETS]; struct aoetgt **tgt; /* target in use when working */ struct aoetgt *htgt; /* target needing rexmit assistance */ diff --git a/drivers/block/aoe/aoecmd.c b/drivers/block/aoe/aoecmd.c index 2a6a4316db00..cc692fee7ce1 100644 --- a/drivers/block/aoe/aoecmd.c +++ b/drivers/block/aoe/aoecmd.c @@ -59,14 +59,14 @@ new_skb(ulong len) } static struct frame * -getframe(struct aoetgt *t, u32 tag) +getframe(struct aoedev 
*d, u32 tag) { struct frame *f; struct list_head *head, *pos, *nx; u32 n; n = tag % NFACTIVE; - head = &t->factive[n]; + head = &d->factive[n]; list_for_each_safe(pos, nx, head) { f = list_entry(pos, struct frame, head); if (f->tag == tag) { @@ -83,18 +83,18 @@ getframe(struct aoetgt *t, u32 tag) * This driver reserves tag -1 to mean "unused frame." */ static int -newtag(struct aoetgt *t) +newtag(struct aoedev *d) { register ulong n; n = jiffies & 0xffff; - return n |= (++t->lasttag & 0x7fff) << 16; + return n |= (++d->lasttag & 0x7fff) << 16; } static u32 aoehdr_atainit(struct aoedev *d, struct aoetgt *t, struct aoe_hdr *h) { - u32 host_tag = newtag(t); + u32 host_tag = newtag(d); memcpy(h->src, t->ifp->nd->dev_addr, sizeof h->src); memcpy(h->dst, t->addr, sizeof h->dst); @@ -270,11 +270,11 @@ loop: static void fhash(struct frame *f) { - struct aoetgt *t = f->t; + struct aoedev *d = f->t->d; u32 n; n = f->tag % NFACTIVE; - list_add_tail(&f->head, &t->factive[n]); + list_add_tail(&f->head, &d->factive[n]); } static int @@ -433,7 +433,7 @@ resend(struct aoedev *d, struct frame *f) u32 n; t = f->t; - n = newtag(t); + n = newtag(d); skb = f->skb; if (ifrotate(t) == NULL) { /* probably can't happen, but set it up to fail anyway */ @@ -512,9 +512,12 @@ sthtith(struct aoedev *d) int i; for (i = 0; i < NFACTIVE; i++) { - head = &ht->factive[i]; + head = &d->factive[i]; list_for_each_safe(pos, nx, head) { f = list_entry(pos, struct frame, head); + if (f->t != ht) + continue; + nf = newframe(d); if (!nf) return 0; @@ -585,22 +588,20 @@ rexmit_timer(ulong vp) } /* collect all frames to rexmit into flist */ - tt = d->targets; - te = tt + NTARGETS; - for (; tt < te && *tt; tt++) { - t = *tt; - for (i = 0; i < NFACTIVE; i++) { - head = &t->factive[i]; - list_for_each_safe(pos, nx, head) { - f = list_entry(pos, struct frame, head); - if (tsince(f->tag) < timeout) - continue; - /* move to flist for later processing */ - list_move_tail(pos, &flist); - } + for (i = 0; i < NFACTIVE; 
i++) { + head = &d->factive[i]; + list_for_each_safe(pos, nx, head) { + f = list_entry(pos, struct frame, head); + if (tsince(f->tag) < timeout) + break; /* end of expired frames */ + /* move to flist for later processing */ + list_move_tail(pos, &flist); } - - /* window check */ + } + /* window check */ + tt = d->targets; + te = tt + d->ntargets; + for (; tt < te && (t = *tt); tt++) { if (t->nout == t->maxout && t->maxout < t->nframes && (jiffies - t->lastwadj)/HZ > 10) { @@ -626,7 +627,7 @@ rexmit_timer(ulong vp) * Hang all frames on first hash bucket for downdev * to clean up. */ - list_splice(&flist, &f->t->factive[0]); + list_splice(&flist, &d->factive[0]); aoedev_downdev(d); break; } @@ -1162,15 +1163,7 @@ aoecmd_ata_rsp(struct sk_buff *skb) spin_lock_irqsave(&d->lock, flags); n = be32_to_cpu(get_unaligned(&h->tag)); - t = gettgt(d, h->src); - if (t == NULL) { - printk(KERN_INFO "aoe: can't find target e%ld.%d:%pm\n", - d->aoemajor, d->aoeminor, h->src); - spin_unlock_irqrestore(&d->lock, flags); - aoedev_put(d); - return skb; - } - f = getframe(t, n); + f = getframe(d, n); if (f == NULL) { calc_rttavg(d, -tsince(n)); spin_unlock_irqrestore(&d->lock, flags); @@ -1185,6 +1178,7 @@ aoecmd_ata_rsp(struct sk_buff *skb) aoechr_error(ebuf); return skb; } + t = f->t; calc_rttavg(d, tsince(f->tag)); t->nout--; aoecmd_work(d); @@ -1253,7 +1247,6 @@ static struct aoetgt * addtgt(struct aoedev *d, char *addr, ulong nframes) { struct aoetgt *t, **tt, **te; - int i; tt = d->targets; te = tt + NTARGETS; @@ -1278,8 +1271,6 @@ addtgt(struct aoedev *d, char *addr, ulong nframes) t->ifp = t->ifs; t->maxout = t->nframes; INIT_LIST_HEAD(&t->ffree); - for (i = 0; i < NFACTIVE; ++i) - INIT_LIST_HEAD(&t->factive[i]); return *tt = t; } diff --git a/drivers/block/aoe/aoedev.c b/drivers/block/aoe/aoedev.c index 635dc986cf77..3968fe6c0077 100644 --- a/drivers/block/aoe/aoedev.c +++ b/drivers/block/aoe/aoedev.c @@ -103,22 +103,23 @@ aoedev_downdev(struct aoedev *d) d->flags &= 
~DEVFL_UP; - /* clean out active buffers on all targets */ + /* clean out active buffers */ + for (i = 0; i < NFACTIVE; i++) { + head = &d->factive[i]; + list_for_each_safe(pos, nx, head) { + f = list_entry(pos, struct frame, head); + list_del(pos); + if (f->buf) { + f->buf->nframesout--; + aoe_failbuf(d, f->buf); + } + aoe_freetframe(f); + } + } + /* reset window dressings */ tt = d->targets; te = tt + NTARGETS; for (; tt < te && (t = *tt); tt++) { - for (i = 0; i < NFACTIVE; i++) { - head = &t->factive[i]; - list_for_each_safe(pos, nx, head) { - list_del(pos); - f = list_entry(pos, struct frame, head); - if (f->buf) { - f->buf->nframesout--; - aoe_failbuf(d, f->buf); - } - aoe_freetframe(f); - } - } t->maxout = t->nframes; t->nout = 0; } @@ -250,6 +251,7 @@ struct aoedev * aoedev_by_sysminor_m(ulong sysminor) { struct aoedev *d; + int i; ulong flags; spin_lock_irqsave(&devlist_lock, flags); @@ -275,6 +277,8 @@ aoedev_by_sysminor_m(ulong sysminor) d->bufpool = NULL; /* defer to aoeblk_gdalloc */ d->tgt = d->targets; d->ref = 1; + for (i = 0; i < NFACTIVE; i++) + INIT_LIST_HEAD(&d->factive[i]); d->sysminor = sysminor; d->aoemajor = AOEMAJOR(sysminor); d->aoeminor = AOEMINOR(sysminor); -- cgit v1.2.3 From b21faa25c6d25a76c09f1e05a1b18ee2372e3841 Mon Sep 17 00:00:00 2001 From: Ed Cashin Date: Thu, 4 Oct 2012 17:16:35 -0700 Subject: aoe: remove unused code and add cosmetic improvements This change removes some unused code and attempts to increase code consistency. 
Signed-off-by: Ed Cashin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/aoe/aoe.h | 10 +++------- drivers/block/aoe/aoechr.c | 1 + drivers/block/aoe/aoecmd.c | 13 ++++--------- drivers/block/aoe/aoenet.c | 3 ++- 4 files changed, 10 insertions(+), 17 deletions(-) (limited to 'drivers/block/aoe/aoe.h') diff --git a/drivers/block/aoe/aoe.h b/drivers/block/aoe/aoe.h index dab7258ddb26..eb41fc549959 100644 --- a/drivers/block/aoe/aoe.h +++ b/drivers/block/aoe/aoe.h @@ -75,18 +75,14 @@ enum { DEVFL_UP = 1, /* device is installed in system and ready for AoE->ATA commands */ DEVFL_TKILL = (1<<1), /* flag for timer to know when to kill self */ DEVFL_EXT = (1<<2), /* device accepts lba48 commands */ - DEVFL_CLOSEWAIT = (1<<3), /* device is waiting for all closes to revalidate */ - DEVFL_GDALLOC = (1<<4), /* need to alloc gendisk */ - DEVFL_KICKME = (1<<5), /* slow polling network card catch */ - DEVFL_NEWSIZE = (1<<6), /* need to update dev size in block layer */ - - BUFFL_FAIL = 1, + DEVFL_GDALLOC = (1<<3), /* need to alloc gendisk */ + DEVFL_KICKME = (1<<4), /* slow polling network card catch */ + DEVFL_NEWSIZE = (1<<5), /* need to update dev size in block layer */ }; enum { DEFAULTBCNT = 2 * 512, /* 2 sectors */ NPERSHELF = 16, /* number of slots per shelf address */ - FREETAG = -1, MIN_BUFS = 16, NTARGETS = 8, NAOEIFS = 8, diff --git a/drivers/block/aoe/aoechr.c b/drivers/block/aoe/aoechr.c index acdd0adaf5da..723e60419ed8 100644 --- a/drivers/block/aoe/aoechr.c +++ b/drivers/block/aoe/aoechr.c @@ -174,6 +174,7 @@ aoechr_write(struct file *filp, const char __user *buf, size_t cnt, loff_t *offp break; case MINOR_FLUSH: ret = aoedev_flush(buf, cnt); + break; } if (ret == 0) ret = cnt; diff --git a/drivers/block/aoe/aoecmd.c b/drivers/block/aoe/aoecmd.c index 2f19b9bba913..6d2a21458b50 100644 --- a/drivers/block/aoe/aoecmd.c +++ b/drivers/block/aoe/aoecmd.c @@ -284,7 +284,6 @@ aoecmd_ata_rw(struct aoedev *d) struct aoe_hdr *h; struct aoe_atahdr 
*ah; struct buf *buf; - struct bio_vec *bv; struct aoetgt *t; struct sk_buff *skb; struct sk_buff_head queue; @@ -301,7 +300,6 @@ aoecmd_ata_rw(struct aoedev *d) if (f == NULL) return 0; t = *d->tgt; - bv = buf->bv; bcnt = d->maxbcnt; if (bcnt == 0) bcnt = DEFAULTBCNT; @@ -788,28 +786,25 @@ void aoecmd_sleepwork(struct work_struct *work) { struct aoedev *d = container_of(work, struct aoedev, work); + struct block_device *bd; + u64 ssize; if (d->flags & DEVFL_GDALLOC) aoeblk_gdalloc(d); if (d->flags & DEVFL_NEWSIZE) { - struct block_device *bd; - unsigned long flags; - u64 ssize; - ssize = get_capacity(d->gd); bd = bdget_disk(d->gd, 0); - if (bd) { mutex_lock(&bd->bd_inode->i_mutex); i_size_write(bd->bd_inode, (loff_t)ssize<<9); mutex_unlock(&bd->bd_inode->i_mutex); bdput(bd); } - spin_lock_irqsave(&d->lock, flags); + spin_lock_irq(&d->lock); d->flags |= DEVFL_UP; d->flags &= ~DEVFL_NEWSIZE; - spin_unlock_irqrestore(&d->lock, flags); + spin_unlock_irq(&d->lock); } } diff --git a/drivers/block/aoe/aoenet.c b/drivers/block/aoe/aoenet.c index 5f43710601ab..3c923e56d535 100644 --- a/drivers/block/aoe/aoenet.c +++ b/drivers/block/aoe/aoenet.c @@ -175,7 +175,8 @@ aoenet_rcv(struct sk_buff *skb, struct net_device *ifp, struct packet_type *pt, default: if (h->cmd >= AOECMD_VEND_MIN) break; /* don't complain about vendor commands */ - printk(KERN_INFO "aoe: unknown cmd %d\n", h->cmd); + pr_info("aoe: unknown AoE command type 0x%02x\n", h->cmd); + break; } if (!skb) -- cgit v1.2.3 From 7392fbe5ade3b28387bb467e39f5f3e01f6c9f13 Mon Sep 17 00:00:00 2001 From: Ed Cashin Date: Thu, 4 Oct 2012 17:16:37 -0700 Subject: aoe: update internal version number to 49 The internal version number of the aoe driver appears in a console message when the driver loads and is usually obtained by the user with the userland aoe-version tool, part of the aoetools.[1] Although this patchset includes bugfixes backported from higher-numbered versions published on the coraid.com website, it is a form of 
version 49. 1. http://aoetools.sourceforge.net/ Signed-off-by: Ed Cashin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/aoe/aoe.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/block/aoe/aoe.h') diff --git a/drivers/block/aoe/aoe.h b/drivers/block/aoe/aoe.h index eb41fc549959..32aede997f01 100644 --- a/drivers/block/aoe/aoe.h +++ b/drivers/block/aoe/aoe.h @@ -1,5 +1,5 @@ /* Copyright (c) 2007 Coraid, Inc. See COPYING for GPL terms. */ -#define VERSION "47" +#define VERSION "49" #define AOE_MAJOR 152 #define DEVICE_NAME "aoe" -- cgit v1.2.3 From fea05a26c3a215796b7a4fa5cbc25278d3e16d30 Mon Sep 17 00:00:00 2001 From: Ed Cashin Date: Thu, 4 Oct 2012 17:16:38 -0700 Subject: aoe: update copyright year in touched files Signed-off-by: Ed Cashin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/aoe/aoe.h | 2 +- drivers/block/aoe/aoeblk.c | 2 +- drivers/block/aoe/aoechr.c | 2 +- drivers/block/aoe/aoecmd.c | 2 +- drivers/block/aoe/aoedev.c | 2 +- drivers/block/aoe/aoemain.c | 2 +- drivers/block/aoe/aoenet.c | 2 +- 7 files changed, 7 insertions(+), 7 deletions(-) (limited to 'drivers/block/aoe/aoe.h') diff --git a/drivers/block/aoe/aoe.h b/drivers/block/aoe/aoe.h index 32aede997f01..27d0a214f3bc 100644 --- a/drivers/block/aoe/aoe.h +++ b/drivers/block/aoe/aoe.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2007 Coraid, Inc. See COPYING for GPL terms. */ +/* Copyright (c) 2012 Coraid, Inc. See COPYING for GPL terms. */ #define VERSION "49" #define AOE_MAJOR 152 #define DEVICE_NAME "aoe" diff --git a/drivers/block/aoe/aoeblk.c b/drivers/block/aoe/aoeblk.c index 7ec4b8fa28fd..83160ab0d273 100644 --- a/drivers/block/aoe/aoeblk.c +++ b/drivers/block/aoe/aoeblk.c @@ -1,4 +1,4 @@ -/* Copyright (c) 2007 Coraid, Inc. See COPYING for GPL terms. */ +/* Copyright (c) 2012 Coraid, Inc. See COPYING for GPL terms. 
*/ /* * aoeblk.c * block device routines diff --git a/drivers/block/aoe/aoechr.c b/drivers/block/aoe/aoechr.c index 723e60419ed8..deb30c183fba 100644 --- a/drivers/block/aoe/aoechr.c +++ b/drivers/block/aoe/aoechr.c @@ -1,4 +1,4 @@ -/* Copyright (c) 2007 Coraid, Inc. See COPYING for GPL terms. */ +/* Copyright (c) 2012 Coraid, Inc. See COPYING for GPL terms. */ /* * aoechr.c * AoE character device driver diff --git a/drivers/block/aoe/aoecmd.c b/drivers/block/aoe/aoecmd.c index 6d2a21458b50..39dacdbda7f1 100644 --- a/drivers/block/aoe/aoecmd.c +++ b/drivers/block/aoe/aoecmd.c @@ -1,4 +1,4 @@ -/* Copyright (c) 2007 Coraid, Inc. See COPYING for GPL terms. */ +/* Copyright (c) 2012 Coraid, Inc. See COPYING for GPL terms. */ /* * aoecmd.c * Filesystem request handling methods diff --git a/drivers/block/aoe/aoedev.c b/drivers/block/aoe/aoedev.c index 6be7b3858340..ccaecff4c69b 100644 --- a/drivers/block/aoe/aoedev.c +++ b/drivers/block/aoe/aoedev.c @@ -1,4 +1,4 @@ -/* Copyright (c) 2007 Coraid, Inc. See COPYING for GPL terms. */ +/* Copyright (c) 2012 Coraid, Inc. See COPYING for GPL terms. */ /* * aoedev.c * AoE device utility functions; maintains device list. diff --git a/drivers/block/aoe/aoemain.c b/drivers/block/aoe/aoemain.c index 6fc4b050fab1..04793c2c701b 100644 --- a/drivers/block/aoe/aoemain.c +++ b/drivers/block/aoe/aoemain.c @@ -1,4 +1,4 @@ -/* Copyright (c) 2007 Coraid, Inc. See COPYING for GPL terms. */ +/* Copyright (c) 2012 Coraid, Inc. See COPYING for GPL terms. */ /* * aoemain.c * Module initialization routines, discover timer diff --git a/drivers/block/aoe/aoenet.c b/drivers/block/aoe/aoenet.c index 3c923e56d535..162c6471275c 100644 --- a/drivers/block/aoe/aoenet.c +++ b/drivers/block/aoe/aoenet.c @@ -1,4 +1,4 @@ -/* Copyright (c) 2007 Coraid, Inc. See COPYING for GPL terms. */ +/* Copyright (c) 2012 Coraid, Inc. See COPYING for GPL terms. 
*/ /* * aoenet.c * Ethernet portion of AoE driver -- cgit v1.2.3 From 0c966214589b9767fd8771b71328f83bac58cb25 Mon Sep 17 00:00:00 2001 From: Ed Cashin Date: Thu, 4 Oct 2012 17:16:40 -0700 Subject: aoe: support more AoE addresses with dynamic block device minor numbers The ATA over Ethernet protocol uses a major (shelf) and minor (slot) address to identify a particular storage target. These changes remove an artificial limitation the aoe driver imposes on the use of AoE addresses. For example, without these changes, the slot address has a maximum of 15, but users commonly use slot numbers much greater than that. The AoE shelf and slot address space is often used sparsely. Instead of using a static mapping between AoE addresses and the block device minor number, the block device minor numbers are now allocated on demand. Signed-off-by: Ed Cashin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/aoe/aoe.h | 6 ++-- drivers/block/aoe/aoeblk.c | 2 +- drivers/block/aoe/aoechr.c | 2 +- drivers/block/aoe/aoecmd.c | 25 +++++--------- drivers/block/aoe/aoedev.c | 86 +++++++++++++++++++++++++++++++--------------- 5 files changed, 72 insertions(+), 49 deletions(-) (limited to 'drivers/block/aoe/aoe.h') diff --git a/drivers/block/aoe/aoe.h b/drivers/block/aoe/aoe.h index 27d0a214f3bc..7b694f7da2de 100644 --- a/drivers/block/aoe/aoe.h +++ b/drivers/block/aoe/aoe.h @@ -49,6 +49,8 @@ struct aoe_hdr { __be32 tag; }; +#define AOE_MAXSHELF (0xffff-1) /* one less than the broadcast shelf address */ + struct aoe_atahdr { unsigned char aflags; unsigned char errfeat; @@ -211,8 +213,7 @@ void aoe_ktstop(struct ktstate *k); int aoedev_init(void); void aoedev_exit(void); -struct aoedev *aoedev_by_aoeaddr(int maj, int min); -struct aoedev *aoedev_by_sysminor_m(ulong sysminor); +struct aoedev *aoedev_by_aoeaddr(ulong maj, int min, int do_alloc); void aoedev_downdev(struct aoedev *d); int aoedev_flush(const char __user *str, size_t size); void aoe_failbuf(struct 
aoedev *, struct buf *); @@ -223,4 +224,3 @@ void aoenet_exit(void); void aoenet_xmit(struct sk_buff_head *); int is_aoe_netif(struct net_device *ifp); int set_aoe_iflist(const char __user *str, size_t size); - diff --git a/drivers/block/aoe/aoeblk.c b/drivers/block/aoe/aoeblk.c index 83160ab0d273..00dfc5008ad4 100644 --- a/drivers/block/aoe/aoeblk.c +++ b/drivers/block/aoe/aoeblk.c @@ -249,7 +249,7 @@ aoeblk_gdalloc(void *vp) q->queuedata = d; d->gd = gd; gd->major = AOE_MAJOR; - gd->first_minor = d->sysminor * AOE_PARTITIONS; + gd->first_minor = d->sysminor; gd->fops = &aoe_bdops; gd->private_data = d; set_capacity(gd, d->ssize); diff --git a/drivers/block/aoe/aoechr.c b/drivers/block/aoe/aoechr.c index deb30c183fba..ed57a890c643 100644 --- a/drivers/block/aoe/aoechr.c +++ b/drivers/block/aoe/aoechr.c @@ -91,7 +91,7 @@ revalidate(const char __user *str, size_t size) pr_err("aoe: invalid device specification %s\n", buf); return -EINVAL; } - d = aoedev_by_aoeaddr(major, minor); + d = aoedev_by_aoeaddr(major, minor, 0); if (!d) return -EINVAL; spin_lock_irqsave(&d->lock, flags); diff --git a/drivers/block/aoe/aoecmd.c b/drivers/block/aoe/aoecmd.c index 39dacdbda7f1..94e810c36de1 100644 --- a/drivers/block/aoe/aoecmd.c +++ b/drivers/block/aoe/aoecmd.c @@ -1149,7 +1149,7 @@ aoecmd_ata_rsp(struct sk_buff *skb) h = (struct aoe_hdr *) skb->data; aoemajor = be16_to_cpu(get_unaligned(&h->major)); - d = aoedev_by_aoeaddr(aoemajor, h->minor); + d = aoedev_by_aoeaddr(aoemajor, h->minor, 0); if (d == NULL) { snprintf(ebuf, sizeof ebuf, "aoecmd_ata_rsp: ata response " "for unknown device %d.%d\n", @@ -1330,7 +1330,7 @@ aoecmd_cfg_rsp(struct sk_buff *skb) struct aoe_hdr *h; struct aoe_cfghdr *ch; struct aoetgt *t; - ulong flags, sysminor, aoemajor; + ulong flags, aoemajor; struct sk_buff *sl; struct sk_buff_head queue; u16 n; @@ -1349,18 +1349,15 @@ aoecmd_cfg_rsp(struct sk_buff *skb) "Check shelf dip switches.\n"); return; } - if (h->minor >= NPERSHELF) { - pr_err("aoe: e%ld.%d 
%s, %d\n", - aoemajor, h->minor, - "slot number larger than the maximum", - NPERSHELF-1); + if (aoemajor > AOE_MAXSHELF) { + pr_info("aoe: e%ld.%d: shelf number too large\n", + aoemajor, (int) h->minor); return; } - sysminor = SYSMINOR(aoemajor, h->minor); - if (sysminor * AOE_PARTITIONS + AOE_PARTITIONS > MINORMASK) { - printk(KERN_INFO "aoe: e%ld.%d: minor number too large\n", - aoemajor, (int) h->minor); + d = aoedev_by_aoeaddr(aoemajor, h->minor, 1); + if (d == NULL) { + pr_info("aoe: device allocation failure\n"); return; } @@ -1368,12 +1365,6 @@ aoecmd_cfg_rsp(struct sk_buff *skb) if (n > aoe_maxout) /* keep it reasonable */ n = aoe_maxout; - d = aoedev_by_sysminor_m(sysminor); - if (d == NULL) { - printk(KERN_INFO "aoe: device sysminor_m failure\n"); - return; - } - spin_lock_irqsave(&d->lock, flags); t = gettgt(d, h->src); diff --git a/drivers/block/aoe/aoedev.c b/drivers/block/aoe/aoedev.c index ccaecff4c69b..68a7a5a9ced0 100644 --- a/drivers/block/aoe/aoedev.c +++ b/drivers/block/aoe/aoedev.c @@ -9,6 +9,8 @@ #include <linux/netdevice.h> #include <linux/delay.h> #include <linux/slab.h> +#include <linux/bitmap.h> +#include <linux/kdev_t.h> #include "aoe.h" static void dummy_timer(ulong); @@ -19,35 +21,63 @@ static void skbpoolfree(struct aoedev *d); static struct aoedev *devlist; static DEFINE_SPINLOCK(devlist_lock); -/* - * Users who grab a pointer to the device with aoedev_by_aoeaddr or - * aoedev_by_sysminor_m automatically get a reference count and must - * be responsible for performing a aoedev_put. With the addition of - * async kthread processing I'm no longer confident that we can - * guarantee consistency in the face of device flushes. - * - * For the time being, we only bother to add extra references for - * frames sitting on the iocq. When the kthreads finish processing - * these frames, they will aoedev_put the device. +/* Because some systems will have one, many, or no + * - partitions, + * - slots per shelf, + * - or shelves, + * we need some flexibility in the way the minor numbers + * are allocated. So they are dynamic. 
*/ -struct aoedev * -aoedev_by_aoeaddr(int maj, int min) +#define N_DEVS ((1U<<MINORBITS)/AOE_PARTITIONS) + +static DEFINE_SPINLOCK(used_minors_lock); +static DECLARE_BITMAP(used_minors, N_DEVS); + +static int +minor_get(ulong *minor) { - struct aoedev *d; ulong flags; + ulong n; + int error = 0; + + spin_lock_irqsave(&used_minors_lock, flags); + n = find_first_zero_bit(used_minors, N_DEVS); + if (n < N_DEVS) + set_bit(n, used_minors); + else + error = -1; + spin_unlock_irqrestore(&used_minors_lock, flags); + + *minor = n * AOE_PARTITIONS; + return error; +} - spin_lock_irqsave(&devlist_lock, flags); +static void +minor_free(ulong minor) +{ + ulong flags; - for (d=devlist; d; d=d->next) - if (d->aoemajor == maj && d->aoeminor == min) { - d->ref++; - break; - } + minor /= AOE_PARTITIONS; + BUG_ON(minor >= N_DEVS); - spin_unlock_irqrestore(&devlist_lock, flags); - return d; + spin_lock_irqsave(&used_minors_lock, flags); + BUG_ON(!test_bit(minor, used_minors)); + clear_bit(minor, used_minors); + spin_unlock_irqrestore(&used_minors_lock, flags); } +/* + * Users who grab a pointer to the device with aoedev_by_aoeaddr + * automatically get a reference count and must be responsible + * for performing a aoedev_put. With the addition of async + * kthread processing I'm no longer confident that we can + * guarantee consistency in the face of device flushes. + * + * For the time being, we only bother to add extra references for + * frames sitting on the iocq. When the kthreads finish processing + * these frames, they will aoedev_put the device. + */ + void aoedev_put(struct aoedev *d) { @@ -159,6 +189,7 @@ aoedev_freedev(struct aoedev *d) if (d->bufpool) mempool_destroy(d->bufpool); skbpoolfree(d); + minor_free(d->sysminor); kfree(d); } @@ -246,22 +277,23 @@ skbpoolfree(struct aoedev *d) __skb_queue_head_init(&d->skbpool); } -/* find it or malloc it */ +/* find it or allocate it */ struct aoedev * -aoedev_by_sysminor_m(ulong sysminor) +aoedev_by_aoeaddr(ulong maj, int min, int do_alloc) { struct aoedev *d; int i; ulong flags; + ulong sysminor; spin_lock_irqsave(&devlist_lock, flags); for (d=devlist; d; d=d->next) - if (d->sysminor == sysminor) { + if (d->aoemajor == maj && d->aoeminor == min) { d->ref++; break; } - if (d) + if (d || !do_alloc || minor_get(&sysminor) < 0) goto out; d = kcalloc(1, sizeof *d, GFP_ATOMIC); if (!d) @@ -280,8 +312,8 @@ aoedev_by_sysminor_m(ulong sysminor) for (i = 0; i < NFACTIVE; i++) INIT_LIST_HEAD(&d->factive[i]); d->sysminor = sysminor; - d->aoemajor = AOEMAJOR(sysminor); - d->aoeminor = AOEMINOR(sysminor); + d->aoemajor = maj; + d->aoeminor = 
min; d->mintimer = MINTIMER; d->next = devlist; devlist = d; -- cgit v1.2.3 From 7159e969d1963f19e7550aafd234b0c5361e5d69 Mon Sep 17 00:00:00 2001 From: Ed Cashin Date: Thu, 4 Oct 2012 17:16:44 -0700 Subject: aoe: update and specify AoE address guards and error messages In general, specific is better when it comes to messages about AoE usage problems. Also, explicit checks for the AoE broadcast addresses are added. Signed-off-by: Ed Cashin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/aoe/aoe.h | 2 -- drivers/block/aoe/aoecmd.c | 17 +++++++++++------ 2 files changed, 11 insertions(+), 8 deletions(-) (limited to 'drivers/block/aoe/aoe.h') diff --git a/drivers/block/aoe/aoe.h b/drivers/block/aoe/aoe.h index 7b694f7da2de..4ae2468b4a06 100644 --- a/drivers/block/aoe/aoe.h +++ b/drivers/block/aoe/aoe.h @@ -49,8 +49,6 @@ struct aoe_hdr { __be32 tag; }; -#define AOE_MAXSHELF (0xffff-1) /* one less than the broadcast shelf address */ - struct aoe_atahdr { unsigned char aflags; unsigned char errfeat; diff --git a/drivers/block/aoe/aoecmd.c b/drivers/block/aoe/aoecmd.c index 94e810c36de1..3804a0af3ef1 100644 --- a/drivers/block/aoe/aoecmd.c +++ b/drivers/block/aoe/aoecmd.c @@ -1349,15 +1349,14 @@ aoecmd_cfg_rsp(struct sk_buff *skb) "Check shelf dip switches.\n"); return; } - if (aoemajor > AOE_MAXSHELF) { - pr_info("aoe: e%ld.%d: shelf number too large\n", + if (aoemajor == 0xffff) { + pr_info("aoe: e%ld.%d: broadcast shelf number invalid\n", aoemajor, (int) h->minor); return; } - - d = aoedev_by_aoeaddr(aoemajor, h->minor, 1); - if (d == NULL) { - pr_info("aoe: device allocation failure\n"); + if (h->minor == 0xff) { + pr_info("aoe: e%ld.%d: broadcast slot number invalid\n", + aoemajor, (int) h->minor); return; } @@ -1365,6 +1364,12 @@ aoecmd_cfg_rsp(struct sk_buff *skb) if (n > aoe_maxout) /* keep it reasonable */ n = aoe_maxout; + d = aoedev_by_aoeaddr(aoemajor, h->minor, 1); + if (d == NULL) { + pr_info("aoe: device allocation failure\n"); 
+ return; + } + spin_lock_irqsave(&d->lock, flags); t = gettgt(d, h->src); -- cgit v1.2.3 From 1ac9e602625817b0c16cc70ea496875f7bd58a4d Mon Sep 17 00:00:00 2001 From: Ed Cashin Date: Thu, 4 Oct 2012 17:16:47 -0700 Subject: aoe: remove unused code Signed-off-by: Ed Cashin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/aoe/aoe.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'drivers/block/aoe/aoe.h') diff --git a/drivers/block/aoe/aoe.h b/drivers/block/aoe/aoe.h index 4ae2468b4a06..c2bf79791e35 100644 --- a/drivers/block/aoe/aoe.h +++ b/drivers/block/aoe/aoe.h @@ -10,9 +10,6 @@ #define AOE_PARTITIONS (16) #endif -#define SYSMINOR(aoemajor, aoeminor) ((aoemajor) * NPERSHELF + (aoeminor)) -#define AOEMAJOR(sysminor) ((sysminor) / NPERSHELF) -#define AOEMINOR(sysminor) ((sysminor) % NPERSHELF) #define WHITESPACE " \t\v\f\n" enum { @@ -82,7 +79,6 @@ enum { enum { DEFAULTBCNT = 2 * 512, /* 2 sectors */ - NPERSHELF = 16, /* number of slots per shelf address */ MIN_BUFS = 16, NTARGETS = 8, NAOEIFS = 8, -- cgit v1.2.3 From 322c9ec009fdc2bc9ccb8f55afab3f7ab8ac71ab Mon Sep 17 00:00:00 2001 From: Ed Cashin Date: Thu, 4 Oct 2012 17:16:50 -0700 Subject: aoe: update aoe-internal version number to 50 Signed-off-by: Ed Cashin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/aoe/aoe.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/block/aoe/aoe.h') diff --git a/drivers/block/aoe/aoe.h b/drivers/block/aoe/aoe.h index c2bf79791e35..d2ed7f18d1ac 100644 --- a/drivers/block/aoe/aoe.h +++ b/drivers/block/aoe/aoe.h @@ -1,5 +1,5 @@ /* Copyright (c) 2012 Coraid, Inc. See COPYING for GPL terms. */ -#define VERSION "49" +#define VERSION "50" #define AOE_MAJOR 152 #define DEVICE_NAME "aoe" -- cgit v1.2.3