From 48ce8b056c88920c8ac187781048f5dae33c81b9 Mon Sep 17 00:00:00 2001 From: Evgeniy Dushistov Date: Mon, 5 Jun 2006 08:21:03 -0500 Subject: JFS: commit_mutex cleanups I look at code, and see that 1)locks wasn't release in the opposite order in which they were taken 2)in jfs_rename we lock new_ip, and in "error path" we didn't unlock it 3)I see strange expression: "! !" May be this worth to fix? Signed-off-by: Evgeniy Dushistov Signed-off-by: Dave Kleikamp --- fs/jfs/jfs_txnmgr.c | 2 +- fs/jfs/namei.c | 33 ++++++++++++++++----------------- 2 files changed, 17 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/jfs/jfs_txnmgr.c b/fs/jfs/jfs_txnmgr.c index ac3d66948e8c..49618dd94f9a 100644 --- a/fs/jfs/jfs_txnmgr.c +++ b/fs/jfs/jfs_txnmgr.c @@ -2944,7 +2944,7 @@ int jfs_sync(void *arg) * Inode is being freed */ list_del_init(&jfs_ip->anon_inode_list); - } else if (! !mutex_trylock(&jfs_ip->commit_mutex)) { + } else if (mutex_trylock(&jfs_ip->commit_mutex)) { /* * inode will be removed from anonymous list * when it is committed diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c index 09ea03f62277..295268ad231b 100644 --- a/fs/jfs/namei.c +++ b/fs/jfs/namei.c @@ -165,8 +165,8 @@ static int jfs_create(struct inode *dip, struct dentry *dentry, int mode, out3: txEnd(tid); - mutex_unlock(&JFS_IP(dip)->commit_mutex); mutex_unlock(&JFS_IP(ip)->commit_mutex); + mutex_unlock(&JFS_IP(dip)->commit_mutex); if (rc) { free_ea_wmap(ip); ip->i_nlink = 0; @@ -300,8 +300,8 @@ static int jfs_mkdir(struct inode *dip, struct dentry *dentry, int mode) out3: txEnd(tid); - mutex_unlock(&JFS_IP(dip)->commit_mutex); mutex_unlock(&JFS_IP(ip)->commit_mutex); + mutex_unlock(&JFS_IP(dip)->commit_mutex); if (rc) { free_ea_wmap(ip); ip->i_nlink = 0; @@ -384,8 +384,8 @@ static int jfs_rmdir(struct inode *dip, struct dentry *dentry) if (rc == -EIO) txAbort(tid, 1); txEnd(tid); - mutex_unlock(&JFS_IP(dip)->commit_mutex); mutex_unlock(&JFS_IP(ip)->commit_mutex); + mutex_unlock(&JFS_IP(dip)->commit_mutex); goto out2; } @@ -422,8 +422,8 @@ static int jfs_rmdir(struct inode *dip, struct dentry *dentry) txEnd(tid); - mutex_unlock(&JFS_IP(dip)->commit_mutex); mutex_unlock(&JFS_IP(ip)->commit_mutex); + mutex_unlock(&JFS_IP(dip)->commit_mutex); /* * Truncating the directory index table is not guaranteed. 
It @@ -503,8 +503,8 @@ static int jfs_unlink(struct inode *dip, struct dentry *dentry) if (rc == -EIO) txAbort(tid, 1); /* Marks FS Dirty */ txEnd(tid); - mutex_unlock(&JFS_IP(dip)->commit_mutex); mutex_unlock(&JFS_IP(ip)->commit_mutex); + mutex_unlock(&JFS_IP(dip)->commit_mutex); IWRITE_UNLOCK(ip); goto out1; } @@ -527,8 +527,8 @@ static int jfs_unlink(struct inode *dip, struct dentry *dentry) if ((new_size = commitZeroLink(tid, ip)) < 0) { txAbort(tid, 1); /* Marks FS Dirty */ txEnd(tid); - mutex_unlock(&JFS_IP(dip)->commit_mutex); mutex_unlock(&JFS_IP(ip)->commit_mutex); + mutex_unlock(&JFS_IP(dip)->commit_mutex); IWRITE_UNLOCK(ip); rc = new_size; goto out1; @@ -556,9 +556,8 @@ static int jfs_unlink(struct inode *dip, struct dentry *dentry) txEnd(tid); - mutex_unlock(&JFS_IP(dip)->commit_mutex); mutex_unlock(&JFS_IP(ip)->commit_mutex); - + mutex_unlock(&JFS_IP(dip)->commit_mutex); while (new_size && (rc == 0)) { tid = txBegin(dip->i_sb, 0); @@ -847,8 +846,8 @@ static int jfs_link(struct dentry *old_dentry, out: txEnd(tid); - mutex_unlock(&JFS_IP(dir)->commit_mutex); mutex_unlock(&JFS_IP(ip)->commit_mutex); + mutex_unlock(&JFS_IP(dir)->commit_mutex); jfs_info("jfs_link: rc:%d", rc); return rc; @@ -1037,8 +1036,8 @@ static int jfs_symlink(struct inode *dip, struct dentry *dentry, out3: txEnd(tid); - mutex_unlock(&JFS_IP(dip)->commit_mutex); mutex_unlock(&JFS_IP(ip)->commit_mutex); + mutex_unlock(&JFS_IP(dip)->commit_mutex); if (rc) { free_ea_wmap(ip); ip->i_nlink = 0; @@ -1160,10 +1159,11 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry, if (S_ISDIR(new_ip->i_mode)) { new_ip->i_nlink--; if (new_ip->i_nlink) { - mutex_unlock(&JFS_IP(new_dir)->commit_mutex); - mutex_unlock(&JFS_IP(old_ip)->commit_mutex); + mutex_unlock(&JFS_IP(new_ip)->commit_mutex); if (old_dir != new_dir) mutex_unlock(&JFS_IP(old_dir)->commit_mutex); + mutex_unlock(&JFS_IP(old_ip)->commit_mutex); + mutex_unlock(&JFS_IP(new_dir)->commit_mutex); if (!S_ISDIR(old_ip->i_mode) && new_ip) IWRITE_UNLOCK(new_ip); jfs_error(new_ip->i_sb, @@ -1281,13 +1281,12 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry, out4: txEnd(tid); - - mutex_unlock(&JFS_IP(new_dir)->commit_mutex); - mutex_unlock(&JFS_IP(old_ip)->commit_mutex); - if (old_dir != new_dir) - mutex_unlock(&JFS_IP(old_dir)->commit_mutex); if (new_ip) mutex_unlock(&JFS_IP(new_ip)->commit_mutex); + if (old_dir != new_dir) + mutex_unlock(&JFS_IP(old_dir)->commit_mutex); + mutex_unlock(&JFS_IP(old_ip)->commit_mutex); + mutex_unlock(&JFS_IP(new_dir)->commit_mutex); while (new_size && (rc == 0)) { tid = txBegin(new_ip->i_sb, 0); -- cgit v1.2.3 From 607f31e80b6f982d7c0dd7a5045377fc368fe507 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 28 Jun 2006 16:52:45 -0400 Subject: Revert "Merge branch 'odirect'" This reverts ccf01ef7aa9c6c293a1c64c27331a2ce227916ec commit. No idea how git managed this one: when I asked it to merge the odirect topic branch it actually generated a patch which reverted the change. Reverting the 'merge' will once again reveal Chuck's recent NFS/O_DIRECT work to the world. 
Signed-off-by: Trond Myklebust --- fs/nfs/direct.c | 435 ++++++++++++++++++++++-------------------------- include/linux/nfs_xdr.h | 2 + 2 files changed, 203 insertions(+), 234 deletions(-) (limited to 'fs') diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 8ca9707be6c9..9ae7b6f6bf30 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -68,25 +68,19 @@ struct nfs_direct_req { struct kref kref; /* release manager */ /* I/O parameters */ - struct list_head list, /* nfs_read/write_data structs */ - rewrite_list; /* saved nfs_write_data structs */ struct nfs_open_context *ctx; /* file open context info */ struct kiocb * iocb; /* controlling i/o request */ struct inode * inode; /* target file of i/o */ - unsigned long user_addr; /* location of user's buffer */ - size_t user_count; /* total bytes to move */ - loff_t pos; /* starting offset in file */ - struct page ** pages; /* pages in our buffer */ - unsigned int npages; /* count of pages */ /* completion state */ + atomic_t io_count; /* i/os we're waiting for */ spinlock_t lock; /* protect completion state */ - int outstanding; /* i/os we're waiting for */ ssize_t count, /* bytes actually processed */ error; /* any reported error */ struct completion completion; /* wait for i/o completion */ /* commit state */ + struct list_head rewrite_list; /* saved nfs_write_data structs */ struct nfs_write_data * commit_data; /* special write_data for commits */ int flags; #define NFS_ODIRECT_DO_COMMIT (1) /* an unstable reply was received */ @@ -94,8 +88,37 @@ struct nfs_direct_req { struct nfs_writeverf verf; /* unstable write verifier */ }; -static void nfs_direct_write_schedule(struct nfs_direct_req *dreq, int sync); static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode *inode); +static const struct rpc_call_ops nfs_write_direct_ops; + +static inline void get_dreq(struct nfs_direct_req *dreq) +{ + atomic_inc(&dreq->io_count); +} + +static inline int put_dreq(struct nfs_direct_req *dreq) +{ + return atomic_dec_and_test(&dreq->io_count); +} + +/* + * "size" is never larger than rsize or wsize. 
+ */ +static inline int nfs_direct_count_pages(unsigned long user_addr, size_t size) +{ + int page_count; + + page_count = (user_addr + size + PAGE_SIZE - 1) >> PAGE_SHIFT; + page_count -= user_addr >> PAGE_SHIFT; + BUG_ON(page_count < 0); + + return page_count; +} + +static inline unsigned int nfs_max_pages(unsigned int size) +{ + return (size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; +} /** * nfs_direct_IO - NFS address space operation for direct I/O @@ -119,50 +142,21 @@ ssize_t nfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, loff_ return -EINVAL; } -static void nfs_free_user_pages(struct page **pages, int npages, int do_dirty) +static void nfs_direct_dirty_pages(struct page **pages, int npages) { int i; for (i = 0; i < npages; i++) { struct page *page = pages[i]; - if (do_dirty && !PageCompound(page)) + if (!PageCompound(page)) set_page_dirty_lock(page); - page_cache_release(page); } - kfree(pages); } -static inline int nfs_get_user_pages(int rw, unsigned long user_addr, size_t size, struct page ***pages) +static void nfs_direct_release_pages(struct page **pages, int npages) { - int result = -ENOMEM; - unsigned long page_count; - size_t array_size; - - page_count = (user_addr + size + PAGE_SIZE - 1) >> PAGE_SHIFT; - page_count -= user_addr >> PAGE_SHIFT; - - array_size = (page_count * sizeof(struct page *)); - *pages = kmalloc(array_size, GFP_KERNEL); - if (*pages) { - down_read(¤t->mm->mmap_sem); - result = get_user_pages(current, current->mm, user_addr, - page_count, (rw == READ), 0, - *pages, NULL); - up_read(¤t->mm->mmap_sem); - if (result != page_count) { - /* - * If we got fewer pages than expected from - * get_user_pages(), the user buffer runs off the - * end of a mapping; return EFAULT. - */ - if (result >= 0) { - nfs_free_user_pages(*pages, result, 0); - result = -EFAULT; - } else - kfree(*pages); - *pages = NULL; - } - } - return result; + int i; + for (i = 0; i < npages; i++) + page_cache_release(pages[i]); } static inline struct nfs_direct_req *nfs_direct_req_alloc(void) @@ -174,13 +168,13 @@ static inline struct nfs_direct_req *nfs_direct_req_alloc(void) return NULL; kref_init(&dreq->kref); + kref_get(&dreq->kref); init_completion(&dreq->completion); - INIT_LIST_HEAD(&dreq->list); INIT_LIST_HEAD(&dreq->rewrite_list); dreq->iocb = NULL; dreq->ctx = NULL; spin_lock_init(&dreq->lock); - dreq->outstanding = 0; + atomic_set(&dreq->io_count, 0); dreq->count = 0; dreq->error = 0; dreq->flags = 0; @@ -221,18 +215,11 @@ out: } /* - * We must hold a reference to all the pages in this direct read request - * until the RPCs complete. This could be long *after* we are woken up in - * nfs_direct_wait (for instance, if someone hits ^C on a slow server). - * - * In addition, synchronous I/O uses a stack-allocated iocb. Thus we - * can't trust the iocb is still valid here if this is a synchronous - * request. If the waiter is woken prematurely, the iocb is long gone. + * Synchronous I/O uses a stack-allocated iocb. Thus we can't trust + * the iocb is still valid here if this is a synchronous request. */ static void nfs_direct_complete(struct nfs_direct_req *dreq) { - nfs_free_user_pages(dreq->pages, dreq->npages, 1); - if (dreq->iocb) { long res = (long) dreq->error; if (!res) @@ -245,48 +232,10 @@ static void nfs_direct_complete(struct nfs_direct_req *dreq) } /* - * Note we also set the number of requests we have in the dreq when we are - * done. This prevents races with I/O completion so we will always wait - * until all requests have been dispatched and completed. 
+ * We must hold a reference to all the pages in this direct read request + * until the RPCs complete. This could be long *after* we are woken up in + * nfs_direct_wait (for instance, if someone hits ^C on a slow server). */ -static struct nfs_direct_req *nfs_direct_read_alloc(size_t nbytes, size_t rsize) -{ - struct list_head *list; - struct nfs_direct_req *dreq; - unsigned int rpages = (rsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - - dreq = nfs_direct_req_alloc(); - if (!dreq) - return NULL; - - list = &dreq->list; - for(;;) { - struct nfs_read_data *data = nfs_readdata_alloc(rpages); - - if (unlikely(!data)) { - while (!list_empty(list)) { - data = list_entry(list->next, - struct nfs_read_data, pages); - list_del(&data->pages); - nfs_readdata_free(data); - } - kref_put(&dreq->kref, nfs_direct_req_release); - return NULL; - } - - INIT_LIST_HEAD(&data->pages); - list_add(&data->pages, list); - - data->req = (struct nfs_page *) dreq; - dreq->outstanding++; - if (nbytes <= rsize) - break; - nbytes -= rsize; - } - kref_get(&dreq->kref); - return dreq; -} - static void nfs_direct_read_result(struct rpc_task *task, void *calldata) { struct nfs_read_data *data = calldata; @@ -295,6 +244,9 @@ static void nfs_direct_read_result(struct rpc_task *task, void *calldata) if (nfs_readpage_result(task, data) != 0) return; + nfs_direct_dirty_pages(data->pagevec, data->npages); + nfs_direct_release_pages(data->pagevec, data->npages); + spin_lock(&dreq->lock); if (likely(task->tk_status >= 0)) @@ -302,13 +254,10 @@ static void nfs_direct_read_result(struct rpc_task *task, void *calldata) else dreq->error = task->tk_status; - if (--dreq->outstanding) { - spin_unlock(&dreq->lock); - return; - } - spin_unlock(&dreq->lock); - nfs_direct_complete(dreq); + + if (put_dreq(dreq)) + nfs_direct_complete(dreq); } static const struct rpc_call_ops nfs_read_direct_ops = { @@ -317,41 +266,60 @@ static const struct rpc_call_ops nfs_read_direct_ops = { }; /* - * For each nfs_read_data struct that was allocated on the list, dispatch - * an NFS READ operation + * For each rsize'd chunk of the user's buffer, dispatch an NFS READ + * operation. If nfs_readdata_alloc() or get_user_pages() fails, + * bail and stop sending more reads. Read length accounting is + * handled automatically by nfs_direct_read_result(). Otherwise, if + * no requests have been sent, just return an error. 
*/ -static void nfs_direct_read_schedule(struct nfs_direct_req *dreq) +static ssize_t nfs_direct_read_schedule(struct nfs_direct_req *dreq, unsigned long user_addr, size_t count, loff_t pos) { struct nfs_open_context *ctx = dreq->ctx; struct inode *inode = ctx->dentry->d_inode; - struct list_head *list = &dreq->list; - struct page **pages = dreq->pages; - size_t count = dreq->user_count; - loff_t pos = dreq->pos; size_t rsize = NFS_SERVER(inode)->rsize; - unsigned int curpage, pgbase; + unsigned int rpages = nfs_max_pages(rsize); + unsigned int pgbase; + int result; + ssize_t started = 0; + + get_dreq(dreq); - curpage = 0; - pgbase = dreq->user_addr & ~PAGE_MASK; + pgbase = user_addr & ~PAGE_MASK; do { struct nfs_read_data *data; size_t bytes; + result = -ENOMEM; + data = nfs_readdata_alloc(rpages); + if (unlikely(!data)) + break; + bytes = rsize; if (count < rsize) bytes = count; - BUG_ON(list_empty(list)); - data = list_entry(list->next, struct nfs_read_data, pages); - list_del_init(&data->pages); + data->npages = nfs_direct_count_pages(user_addr, bytes); + down_read(¤t->mm->mmap_sem); + result = get_user_pages(current, current->mm, user_addr, + data->npages, 1, 0, data->pagevec, NULL); + up_read(¤t->mm->mmap_sem); + if (unlikely(result < data->npages)) { + if (result > 0) + nfs_direct_release_pages(data->pagevec, result); + nfs_readdata_release(data); + break; + } + + get_dreq(dreq); + data->req = (struct nfs_page *) dreq; data->inode = inode; data->cred = ctx->cred; data->args.fh = NFS_FH(inode); data->args.context = ctx; data->args.offset = pos; data->args.pgbase = pgbase; - data->args.pages = &pages[curpage]; + data->args.pages = data->pagevec; data->args.count = bytes; data->res.fattr = &data->fattr; data->res.eof = 0; @@ -374,33 +342,35 @@ static void nfs_direct_read_schedule(struct nfs_direct_req *dreq) bytes, (unsigned long long)data->args.offset); + started += bytes; + user_addr += bytes; pos += bytes; pgbase += bytes; - curpage += pgbase >> PAGE_SHIFT; pgbase &= ~PAGE_MASK; count -= bytes; } while (count != 0); - BUG_ON(!list_empty(list)); + + if (put_dreq(dreq)) + nfs_direct_complete(dreq); + + if (started) + return 0; + return result < 0 ? 
(ssize_t) result : -EFAULT; } -static ssize_t nfs_direct_read(struct kiocb *iocb, unsigned long user_addr, size_t count, loff_t pos, struct page **pages, unsigned int nr_pages) +static ssize_t nfs_direct_read(struct kiocb *iocb, unsigned long user_addr, size_t count, loff_t pos) { - ssize_t result; + ssize_t result = 0; sigset_t oldset; struct inode *inode = iocb->ki_filp->f_mapping->host; struct rpc_clnt *clnt = NFS_CLIENT(inode); struct nfs_direct_req *dreq; - dreq = nfs_direct_read_alloc(count, NFS_SERVER(inode)->rsize); + dreq = nfs_direct_req_alloc(); if (!dreq) return -ENOMEM; - dreq->user_addr = user_addr; - dreq->user_count = count; - dreq->pos = pos; - dreq->pages = pages; - dreq->npages = nr_pages; dreq->inode = inode; dreq->ctx = get_nfs_open_context((struct nfs_open_context *)iocb->ki_filp->private_data); if (!is_sync_kiocb(iocb)) @@ -408,8 +378,9 @@ static ssize_t nfs_direct_read(struct kiocb *iocb, unsigned long user_addr, size nfs_add_stats(inode, NFSIOS_DIRECTREADBYTES, count); rpc_clnt_sigmask(clnt, &oldset); - nfs_direct_read_schedule(dreq); - result = nfs_direct_wait(dreq); + result = nfs_direct_read_schedule(dreq, user_addr, count, pos); + if (!result) + result = nfs_direct_wait(dreq); rpc_clnt_sigunmask(clnt, &oldset); return result; @@ -417,10 +388,10 @@ static ssize_t nfs_direct_read(struct kiocb *iocb, unsigned long user_addr, size static void nfs_direct_free_writedata(struct nfs_direct_req *dreq) { - list_splice_init(&dreq->rewrite_list, &dreq->list); - while (!list_empty(&dreq->list)) { - struct nfs_write_data *data = list_entry(dreq->list.next, struct nfs_write_data, pages); + while (!list_empty(&dreq->rewrite_list)) { + struct nfs_write_data *data = list_entry(dreq->rewrite_list.next, struct nfs_write_data, pages); list_del(&data->pages); + nfs_direct_release_pages(data->pagevec, data->npages); nfs_writedata_release(data); } } @@ -428,14 +399,51 @@ static void nfs_direct_free_writedata(struct nfs_direct_req *dreq) #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq) { - struct list_head *pos; + struct inode *inode = dreq->inode; + struct list_head *p; + struct nfs_write_data *data; - list_splice_init(&dreq->rewrite_list, &dreq->list); - list_for_each(pos, &dreq->list) - dreq->outstanding++; dreq->count = 0; + get_dreq(dreq); + + list_for_each(p, &dreq->rewrite_list) { + data = list_entry(p, struct nfs_write_data, pages); + + get_dreq(dreq); + + /* + * Reset data->res. + */ + nfs_fattr_init(&data->fattr); + data->res.count = data->args.count; + memset(&data->verf, 0, sizeof(data->verf)); + + /* + * Reuse data->task; data->args should not have changed + * since the original request was sent. + */ + rpc_init_task(&data->task, NFS_CLIENT(inode), RPC_TASK_ASYNC, + &nfs_write_direct_ops, data); + NFS_PROTO(inode)->write_setup(data, FLUSH_STABLE); + + data->task.tk_priority = RPC_PRIORITY_NORMAL; + data->task.tk_cookie = (unsigned long) inode; + + /* + * We're called via an RPC callback, so BKL is already held. 
+ */ + rpc_execute(&data->task); + + dprintk("NFS: %5u rescheduled direct write call (req %s/%Ld, %u bytes @ offset %Lu)\n", + data->task.tk_pid, + inode->i_sb->s_id, + (long long)NFS_FILEID(inode), + data->args.count, + (unsigned long long)data->args.offset); + } - nfs_direct_write_schedule(dreq, FLUSH_STABLE); + if (put_dreq(dreq)) + nfs_direct_write_complete(dreq, inode); } static void nfs_direct_commit_result(struct rpc_task *task, void *calldata) @@ -472,8 +480,8 @@ static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq) data->cred = dreq->ctx->cred; data->args.fh = NFS_FH(data->inode); - data->args.offset = dreq->pos; - data->args.count = dreq->user_count; + data->args.offset = 0; + data->args.count = 0; data->res.count = 0; data->res.fattr = &data->fattr; data->res.verf = &data->verf; @@ -535,47 +543,6 @@ static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode } #endif -static struct nfs_direct_req *nfs_direct_write_alloc(size_t nbytes, size_t wsize) -{ - struct list_head *list; - struct nfs_direct_req *dreq; - unsigned int wpages = (wsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - - dreq = nfs_direct_req_alloc(); - if (!dreq) - return NULL; - - list = &dreq->list; - for(;;) { - struct nfs_write_data *data = nfs_writedata_alloc(wpages); - - if (unlikely(!data)) { - while (!list_empty(list)) { - data = list_entry(list->next, - struct nfs_write_data, pages); - list_del(&data->pages); - nfs_writedata_free(data); - } - kref_put(&dreq->kref, nfs_direct_req_release); - return NULL; - } - - INIT_LIST_HEAD(&data->pages); - list_add(&data->pages, list); - - data->req = (struct nfs_page *) dreq; - dreq->outstanding++; - if (nbytes <= wsize) - break; - nbytes -= wsize; - } - - nfs_alloc_commit_data(dreq); - - kref_get(&dreq->kref); - return dreq; -} - static void nfs_direct_write_result(struct rpc_task *task, void *calldata) { struct nfs_write_data *data = calldata; @@ -605,8 +572,6 @@ static void nfs_direct_write_result(struct rpc_task *task, void *calldata) } } } - /* In case we have to resend */ - data->args.stable = NFS_FILE_SYNC; spin_unlock(&dreq->lock); } @@ -620,14 +585,8 @@ static void nfs_direct_write_release(void *calldata) struct nfs_write_data *data = calldata; struct nfs_direct_req *dreq = (struct nfs_direct_req *) data->req; - spin_lock(&dreq->lock); - if (--dreq->outstanding) { - spin_unlock(&dreq->lock); - return; - } - spin_unlock(&dreq->lock); - - nfs_direct_write_complete(dreq, data->inode); + if (put_dreq(dreq)) + nfs_direct_write_complete(dreq, data->inode); } static const struct rpc_call_ops nfs_write_direct_ops = { @@ -636,41 +595,62 @@ static const struct rpc_call_ops nfs_write_direct_ops = { }; /* - * For each nfs_write_data struct that was allocated on the list, dispatch - * an NFS WRITE operation + * For each wsize'd chunk of the user's buffer, dispatch an NFS WRITE + * operation. If nfs_writedata_alloc() or get_user_pages() fails, + * bail and stop sending more writes. Write length accounting is + * handled automatically by nfs_direct_write_result(). Otherwise, if + * no requests have been sent, just return an error. 
*/ -static void nfs_direct_write_schedule(struct nfs_direct_req *dreq, int sync) +static ssize_t nfs_direct_write_schedule(struct nfs_direct_req *dreq, unsigned long user_addr, size_t count, loff_t pos, int sync) { struct nfs_open_context *ctx = dreq->ctx; struct inode *inode = ctx->dentry->d_inode; - struct list_head *list = &dreq->list; - struct page **pages = dreq->pages; - size_t count = dreq->user_count; - loff_t pos = dreq->pos; size_t wsize = NFS_SERVER(inode)->wsize; - unsigned int curpage, pgbase; + unsigned int wpages = nfs_max_pages(wsize); + unsigned int pgbase; + int result; + ssize_t started = 0; - curpage = 0; - pgbase = dreq->user_addr & ~PAGE_MASK; + get_dreq(dreq); + + pgbase = user_addr & ~PAGE_MASK; do { struct nfs_write_data *data; size_t bytes; + result = -ENOMEM; + data = nfs_writedata_alloc(wpages); + if (unlikely(!data)) + break; + bytes = wsize; if (count < wsize) bytes = count; - BUG_ON(list_empty(list)); - data = list_entry(list->next, struct nfs_write_data, pages); + data->npages = nfs_direct_count_pages(user_addr, bytes); + down_read(¤t->mm->mmap_sem); + result = get_user_pages(current, current->mm, user_addr, + data->npages, 0, 0, data->pagevec, NULL); + up_read(¤t->mm->mmap_sem); + if (unlikely(result < data->npages)) { + if (result > 0) + nfs_direct_release_pages(data->pagevec, result); + nfs_writedata_release(data); + break; + } + + get_dreq(dreq); + list_move_tail(&data->pages, &dreq->rewrite_list); + data->req = (struct nfs_page *) dreq; data->inode = inode; data->cred = ctx->cred; data->args.fh = NFS_FH(inode); data->args.context = ctx; data->args.offset = pos; data->args.pgbase = pgbase; - data->args.pages = &pages[curpage]; + data->args.pages = data->pagevec; data->args.count = bytes; data->res.fattr = &data->fattr; data->res.count = bytes; @@ -694,19 +674,26 @@ static void nfs_direct_write_schedule(struct nfs_direct_req *dreq, int sync) bytes, (unsigned long long)data->args.offset); + started += bytes; + user_addr += bytes; pos += bytes; pgbase += bytes; - curpage += pgbase >> PAGE_SHIFT; pgbase &= ~PAGE_MASK; count -= bytes; } while (count != 0); - BUG_ON(!list_empty(list)); + + if (put_dreq(dreq)) + nfs_direct_write_complete(dreq, inode); + + if (started) + return 0; + return result < 0 ? 
(ssize_t) result : -EFAULT; } -static ssize_t nfs_direct_write(struct kiocb *iocb, unsigned long user_addr, size_t count, loff_t pos, struct page **pages, int nr_pages) +static ssize_t nfs_direct_write(struct kiocb *iocb, unsigned long user_addr, size_t count, loff_t pos) { - ssize_t result; + ssize_t result = 0; sigset_t oldset; struct inode *inode = iocb->ki_filp->f_mapping->host; struct rpc_clnt *clnt = NFS_CLIENT(inode); @@ -714,17 +701,14 @@ static ssize_t nfs_direct_write(struct kiocb *iocb, unsigned long user_addr, siz size_t wsize = NFS_SERVER(inode)->wsize; int sync = 0; - dreq = nfs_direct_write_alloc(count, wsize); + dreq = nfs_direct_req_alloc(); if (!dreq) return -ENOMEM; + nfs_alloc_commit_data(dreq); + if (dreq->commit_data == NULL || count < wsize) sync = FLUSH_STABLE; - dreq->user_addr = user_addr; - dreq->user_count = count; - dreq->pos = pos; - dreq->pages = pages; - dreq->npages = nr_pages; dreq->inode = inode; dreq->ctx = get_nfs_open_context((struct nfs_open_context *)iocb->ki_filp->private_data); if (!is_sync_kiocb(iocb)) @@ -735,8 +719,9 @@ static ssize_t nfs_direct_write(struct kiocb *iocb, unsigned long user_addr, siz nfs_begin_data_update(inode); rpc_clnt_sigmask(clnt, &oldset); - nfs_direct_write_schedule(dreq, sync); - result = nfs_direct_wait(dreq); + result = nfs_direct_write_schedule(dreq, user_addr, count, pos, sync); + if (!result) + result = nfs_direct_wait(dreq); rpc_clnt_sigunmask(clnt, &oldset); return result; @@ -766,8 +751,6 @@ static ssize_t nfs_direct_write(struct kiocb *iocb, unsigned long user_addr, siz ssize_t nfs_file_direct_read(struct kiocb *iocb, char __user *buf, size_t count, loff_t pos) { ssize_t retval = -EINVAL; - int page_count; - struct page **pages; struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; @@ -789,14 +772,7 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, char __user *buf, size_t count, if (retval) goto out; - retval = nfs_get_user_pages(READ, (unsigned long) buf, - count, &pages); - if (retval < 0) - goto out; - page_count = retval; - - retval = nfs_direct_read(iocb, (unsigned long) buf, count, pos, - pages, page_count); + retval = nfs_direct_read(iocb, (unsigned long) buf, count, pos); if (retval > 0) iocb->ki_pos = pos + retval; @@ -832,8 +808,6 @@ out: ssize_t nfs_file_direct_write(struct kiocb *iocb, const char __user *buf, size_t count, loff_t pos) { ssize_t retval; - int page_count; - struct page **pages; struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; @@ -861,14 +835,7 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, const char __user *buf, size_t if (retval) goto out; - retval = nfs_get_user_pages(WRITE, (unsigned long) buf, - count, &pages); - if (retval < 0) - goto out; - page_count = retval; - - retval = nfs_direct_write(iocb, (unsigned long) buf, count, - pos, pages, page_count); + retval = nfs_direct_write(iocb, (unsigned long) buf, count, pos); /* * XXX: nfs_end_data_update() already ensures this file's diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 7c7320fa51aa..2d3fb6416d91 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -729,6 +729,7 @@ struct nfs_read_data { struct list_head pages; /* Coalesced read requests */ struct nfs_page *req; /* multi ops per nfs_page */ struct page **pagevec; + unsigned int npages; /* active pages in pagevec */ struct nfs_readargs args; struct nfs_readres res; #ifdef CONFIG_NFS_V4 @@ -747,6 +748,7 @@ struct nfs_write_data { struct list_head pages; /* Coalesced 
requests we wish to flush */ struct nfs_page *req; /* multi ops per nfs_page */ struct page **pagevec; + unsigned int npages; /* active pages in pagevec */ struct nfs_writeargs args; /* argument struct */ struct nfs_writeres res; /* result struct */ #ifdef CONFIG_NFS_V4 -- cgit v1.2.3 From f475ae957db66650db66916c62604ac27409d884 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 29 Jun 2006 16:38:32 -0400 Subject: VFS: Allow caller to determine if BSD or posix locks were actually freed Change posix_lock_file_conf(), and flock_lock_file() so that if called with an F_UNLCK argument, and the FL_EXISTS flag they will indicate whether or not any locks were actually freed by returning 0 or -ENOENT. Signed-off-by: Trond Myklebust --- fs/locks.c | 18 ++++++++++++++++-- include/linux/fs.h | 1 + 2 files changed, 17 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/locks.c b/fs/locks.c index 1ad29c9b6252..50cb0a2b74d9 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -725,6 +725,10 @@ next_task: /* Try to create a FLOCK lock on filp. We always insert new FLOCK locks * at the head of the list, but that's secret knowledge known only to * flock_lock_file and posix_lock_file. + * + * Note that if called with an FL_EXISTS argument, the caller may determine + * whether or not a lock was successfully freed by testing the return + * value for -ENOENT. */ static int flock_lock_file(struct file *filp, struct file_lock *request) { @@ -750,8 +754,11 @@ static int flock_lock_file(struct file *filp, struct file_lock *request) break; } - if (request->fl_type == F_UNLCK) + if (request->fl_type == F_UNLCK) { + if ((request->fl_flags & FL_EXISTS) && !found) + error = -ENOENT; goto out; + } error = -ENOMEM; new_fl = locks_alloc_lock(); @@ -948,8 +955,11 @@ static int __posix_lock_file_conf(struct inode *inode, struct file_lock *request error = 0; if (!added) { - if (request->fl_type == F_UNLCK) + if (request->fl_type == F_UNLCK) { + if (request->fl_flags & FL_EXISTS) + error = -ENOENT; goto out; + } if (!new_fl) { error = -ENOLCK; @@ -996,6 +1006,10 @@ static int __posix_lock_file_conf(struct inode *inode, struct file_lock *request * Add a POSIX style lock to a file. * We merge adjacent & overlapping locks whenever possible. * POSIX locks are sorted by owner task, then by starting address + * + * Note that if called with an FL_EXISTS argument, the caller may determine + * whether or not a lock was successfully freed by testing the return + * value for -ENOENT. */ int posix_lock_file(struct file *filp, struct file_lock *fl) { diff --git a/include/linux/fs.h b/include/linux/fs.h index 134b32068246..43aef9b230fd 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -716,6 +716,7 @@ extern spinlock_t files_lock; #define FL_POSIX 1 #define FL_FLOCK 2 #define FL_ACCESS 8 /* not trying to lock, just looking */ +#define FL_EXISTS 16 /* when unlocking, test for existence */ #define FL_LEASE 32 /* lease held on this file */ #define FL_CLOSE 64 /* unlock on close */ #define FL_SLEEP 128 /* A blocking lock */ -- cgit v1.2.3 From 9b07357490e5c7a1c3c2b6f4679d7ee4b4185ecd Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 29 Jun 2006 16:38:34 -0400 Subject: NLM,NFSv4: Don't put UNLOCK requests on the wire unless we hold a lock Use the new behaviour of {flock,posix}_file_lock(F_UNLCK) to determine if we held a lock, and only send the RPC request to the server if this was the case. 
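For context, a minimal sketch of the calling pattern this enables (the function name and return convention are illustrative, not taken from this patch): mark the unlock request with FL_EXISTS before handing it to the VFS, restore fl_flags afterwards, and skip the over-the-wire call when the VFS reports -ENOENT.

#include <linux/fs.h>

/*
 * Illustrative only: drop the local lock first and use the new FL_EXISTS
 * semantics to learn whether an over-the-wire UNLOCK is needed at all.
 * Returns 1 if a lock was actually freed (the server must be told),
 * 0 if there was nothing to unlock, or a negative error.
 */
static int example_unlock_local(struct file *filp, struct file_lock *fl)
{
	unsigned char saved_flags = fl->fl_flags;
	int status;

	fl->fl_type = F_UNLCK;
	fl->fl_flags |= FL_EXISTS;
	status = posix_lock_file(filp, fl);
	fl->fl_flags = saved_flags;

	if (status == -ENOENT)
		return 0;	/* no local lock existed: skip the RPC */
	if (status < 0)
		return status;
	return 1;		/* a lock was freed: send the UNLOCK request */
}

The nlmclnt_unlock() and nfs4_proc_unlck() changes below follow the same shape, going through their local do_vfs_lock() wrappers.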
Signed-off-by: Trond Myklebust --- fs/lockd/clntproc.c | 18 ++++++++++-------- fs/nfs/nfs4proc.c | 31 ++++++++++++------------------- 2 files changed, 22 insertions(+), 27 deletions(-) (limited to 'fs') diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c index 5980c45998cc..24c691f5480e 100644 --- a/fs/lockd/clntproc.c +++ b/fs/lockd/clntproc.c @@ -454,7 +454,7 @@ static void nlmclnt_locks_init_private(struct file_lock *fl, struct nlm_host *ho fl->fl_ops = &nlmclnt_lock_ops; } -static void do_vfs_lock(struct file_lock *fl) +static int do_vfs_lock(struct file_lock *fl) { int res = 0; switch (fl->fl_flags & (FL_POSIX|FL_FLOCK)) { @@ -467,9 +467,7 @@ static void do_vfs_lock(struct file_lock *fl) default: BUG(); } - if (res < 0) - printk(KERN_WARNING "%s: VFS is out of sync with lock manager!\n", - __FUNCTION__); + return res; } /* @@ -541,7 +539,8 @@ again: } fl->fl_flags |= FL_SLEEP; /* Ensure the resulting lock will get added to granted list */ - do_vfs_lock(fl); + if (do_vfs_lock(fl) < 0) + printk(KERN_WARNING "%s: VFS is out of sync with lock manager!\n", __FUNCTION__); up_read(&host->h_rwsem); } status = nlm_stat_to_errno(resp->status); @@ -606,15 +605,19 @@ nlmclnt_unlock(struct nlm_rqst *req, struct file_lock *fl) { struct nlm_host *host = req->a_host; struct nlm_res *resp = &req->a_res; - int status; + int status = 0; /* * Note: the server is supposed to either grant us the unlock * request, or to deny it with NLM_LCK_DENIED_GRACE_PERIOD. In either * case, we want to unlock. */ + fl->fl_flags |= FL_EXISTS; down_read(&host->h_rwsem); - do_vfs_lock(fl); + if (do_vfs_lock(fl) == -ENOENT) { + up_read(&host->h_rwsem); + goto out; + } up_read(&host->h_rwsem); if (req->a_flags & RPC_TASK_ASYNC) @@ -624,7 +627,6 @@ nlmclnt_unlock(struct nlm_rqst *req, struct file_lock *fl) if (status < 0) goto out; - status = 0; if (resp->status == NLM_LCK_GRANTED) goto out; diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index b4916b092194..b8c63757f039 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -3144,9 +3144,6 @@ static int do_vfs_lock(struct file *file, struct file_lock *fl) default: BUG(); } - if (res < 0) - printk(KERN_WARNING "%s: VFS is out of sync with lock manager!\n", - __FUNCTION__); return res; } @@ -3258,8 +3255,6 @@ static struct rpc_task *nfs4_do_unlck(struct file_lock *fl, return ERR_PTR(-ENOMEM); } - /* Unlock _before_ we do the RPC call */ - do_vfs_lock(fl->fl_file, fl); return rpc_run_task(NFS_CLIENT(lsp->ls_state->inode), RPC_TASK_ASYNC, &nfs4_locku_ops, data); } @@ -3270,30 +3265,28 @@ static int nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock * struct rpc_task *task; int status = 0; - /* Is this a delegated lock? */ - if (test_bit(NFS_DELEGATED_STATE, &state->flags)) - goto out_unlock; - /* Is this open_owner holding any locks on the server? */ - if (test_bit(LK_STATE_IN_USE, &state->flags) == 0) - goto out_unlock; - status = nfs4_set_lock_state(state, request); + /* Unlock _before_ we do the RPC call */ + request->fl_flags |= FL_EXISTS; + if (do_vfs_lock(request->fl_file, request) == -ENOENT) + goto out; if (status != 0) - goto out_unlock; + goto out; + /* Is this a delegated lock? 
*/ + if (test_bit(NFS_DELEGATED_STATE, &state->flags)) + goto out; lsp = request->fl_u.nfs4_fl.owner; - status = -ENOMEM; seqid = nfs_alloc_seqid(&lsp->ls_seqid); + status = -ENOMEM; if (seqid == NULL) - goto out_unlock; + goto out; task = nfs4_do_unlck(request, request->fl_file->private_data, lsp, seqid); status = PTR_ERR(task); if (IS_ERR(task)) - goto out_unlock; + goto out; status = nfs4_wait_for_completion_rpc_task(task); rpc_release_task(task); - return status; -out_unlock: - do_vfs_lock(request->fl_file, request); +out: return status; } -- cgit v1.2.3 From 42a2d13eee3c895d22e9d1a52b96d15ca49adabc Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 29 Jun 2006 16:38:36 -0400 Subject: NFSv4: Ensure nfs4_lock_expired() caches delegated locks Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index b8c63757f039..8bdfe3ff7925 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -3454,10 +3454,10 @@ static int nfs4_lock_reclaim(struct nfs4_state *state, struct file_lock *request struct nfs4_exception exception = { }; int err; - /* Cache the lock if possible... */ - if (test_bit(NFS_DELEGATED_STATE, &state->flags)) - return 0; do { + /* Cache the lock if possible... */ + if (test_bit(NFS_DELEGATED_STATE, &state->flags) != 0) + return 0; err = _nfs4_do_setlk(state, F_SETLK, request, 1); if (err != -NFS4ERR_DELAY) break; @@ -3476,6 +3476,8 @@ static int nfs4_lock_expired(struct nfs4_state *state, struct file_lock *request if (err != 0) return err; do { + if (test_bit(NFS_DELEGATED_STATE, &state->flags) != 0) + return 0; err = _nfs4_do_setlk(state, F_SETLK, request, 0); if (err != -NFS4ERR_DELAY) break; -- cgit v1.2.3 From f07f18dd6f29f11887b8d9cf7ecb736bf2f7dc62 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 29 Jun 2006 16:38:37 -0400 Subject: VFS: Add support for the FL_ACCESS flag to flock_lock_file() Signed-off-by: Trond Myklebust --- fs/locks.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'fs') diff --git a/fs/locks.c b/fs/locks.c index 50cb0a2b74d9..b0b41a64e10b 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -739,6 +739,8 @@ static int flock_lock_file(struct file *filp, struct file_lock *request) int found = 0; lock_kernel(); + if (request->fl_flags & FL_ACCESS) + goto find_conflict; for_each_lock(inode, before) { struct file_lock *fl = *before; if (IS_POSIX(fl)) @@ -771,6 +773,7 @@ static int flock_lock_file(struct file *filp, struct file_lock *request) if (found) cond_resched(); +find_conflict: for_each_lock(inode, before) { struct file_lock *fl = *before; if (IS_POSIX(fl)) @@ -784,6 +787,8 @@ static int flock_lock_file(struct file *filp, struct file_lock *request) locks_insert_block(fl, request); goto out; } + if (request->fl_flags & FL_ACCESS) + goto out; locks_copy_lock(new_fl, request); locks_insert_lock(&inode->i_flock, new_fl); new_fl = NULL; -- cgit v1.2.3 From 01c3b861cd77b28565a2d18c7caa3ce7f938e35c Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 29 Jun 2006 16:38:39 -0400 Subject: NLM,NFSv4: Wait on local locks before we put RPC calls on the wire Use FL_ACCESS flag to test and/or wait for local locks before we try requesting a lock from the server Signed-off-by: Trond Myklebust --- fs/lockd/clntproc.c | 8 +++++++- fs/nfs/nfs4proc.c | 35 ++++++++++++++++++++++++----------- 2 files changed, 31 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c index 
24c691f5480e..89ba0df14c22 100644 --- a/fs/lockd/clntproc.c +++ b/fs/lockd/clntproc.c @@ -496,6 +496,7 @@ nlmclnt_lock(struct nlm_rqst *req, struct file_lock *fl) struct nlm_host *host = req->a_host; struct nlm_res *resp = &req->a_res; struct nlm_wait *block = NULL; + unsigned char fl_flags = fl->fl_flags; int status = -ENOLCK; if (!host->h_monitored && nsm_monitor(host) < 0) { @@ -503,6 +504,10 @@ nlmclnt_lock(struct nlm_rqst *req, struct file_lock *fl) host->h_name); goto out; } + fl->fl_flags |= FL_ACCESS; + status = do_vfs_lock(fl); + if (status < 0) + goto out; block = nlmclnt_prepare_block(host, fl); again: @@ -537,8 +542,8 @@ again: up_read(&host->h_rwsem); goto again; } - fl->fl_flags |= FL_SLEEP; /* Ensure the resulting lock will get added to granted list */ + fl->fl_flags = fl_flags | FL_SLEEP; if (do_vfs_lock(fl) < 0) printk(KERN_WARNING "%s: VFS is out of sync with lock manager!\n", __FUNCTION__); up_read(&host->h_rwsem); @@ -551,6 +556,7 @@ out_unblock: nlmclnt_cancel(host, req->a_args.block, fl); out: nlm_release_call(req); + fl->fl_flags = fl_flags; return status; } diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 8bdfe3ff7925..e6ee97f19d81 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -3489,29 +3489,42 @@ static int nfs4_lock_expired(struct nfs4_state *state, struct file_lock *request static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock *request) { struct nfs4_client *clp = state->owner->so_client; + unsigned char fl_flags = request->fl_flags; int status; /* Is this a delegated open? */ - if (NFS_I(state->inode)->delegation_state != 0) { - /* Yes: cache locks! */ - status = do_vfs_lock(request->fl_file, request); - /* ...but avoid races with delegation recall... */ - if (status < 0 || test_bit(NFS_DELEGATED_STATE, &state->flags)) - return status; - } - down_read(&clp->cl_sem); status = nfs4_set_lock_state(state, request); if (status != 0) goto out; + request->fl_flags |= FL_ACCESS; + status = do_vfs_lock(request->fl_file, request); + if (status < 0) + goto out; + down_read(&clp->cl_sem); + if (test_bit(NFS_DELEGATED_STATE, &state->flags)) { + struct nfs_inode *nfsi = NFS_I(state->inode); + /* Yes: cache locks! */ + down_read(&nfsi->rwsem); + /* ...but avoid races with delegation recall... */ + if (test_bit(NFS_DELEGATED_STATE, &state->flags)) { + request->fl_flags = fl_flags & ~FL_SLEEP; + status = do_vfs_lock(request->fl_file, request); + up_read(&nfsi->rwsem); + goto out_unlock; + } + up_read(&nfsi->rwsem); + } status = _nfs4_do_setlk(state, cmd, request, 0); if (status != 0) - goto out; + goto out_unlock; /* Note: we always want to sleep here! */ - request->fl_flags |= FL_SLEEP; + request->fl_flags = fl_flags | FL_SLEEP; if (do_vfs_lock(request->fl_file, request) < 0) printk(KERN_WARNING "%s: VFS is out of sync with lock manager!\n", __FUNCTION__); -out: +out_unlock: up_read(&clp->cl_sem); +out: + request->fl_flags = fl_flags; return status; } -- cgit v1.2.3 From 83715ad54fad5a7ed330110f83e31ae92630e9d9 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 5 Jul 2006 13:17:12 -0400 Subject: NFS: Fix NFS page_state usage The introduction of the FLUSH_INVALIDATE argument to nfs_sync_inode_wait() does not clear the nr_unstable page state counter for pages that are being released. Also fix a longstanding similar bug when nfs_commit_list() fails. 
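For readability, this restates the nfs_cancel_commit_list() helper added in the hunk below, with the decisive accounting line called out (a sketch of the idea, not a drop-in replacement for the patch):

/*
 * When requests on the commit list are being thrown away (FLUSH_INVALIDATE,
 * or nfs_commit_list() failing), each page's "unstable" accounting has to be
 * reversed as well, not just the request torn down.
 */
static void example_cancel_commit_list(struct list_head *head)
{
	struct nfs_page *req;

	while (!list_empty(head)) {
		req = nfs_list_entry(head->next);
		nfs_list_remove_request(req);
		nfs_inode_remove_request(req);
		nfs_clear_page_writeback(req);
		/* the previously missing step: give back the unstable-page count */
		dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
	}
}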
Signed-off-by: Trond Myklebust --- fs/nfs/write.c | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/nfs/write.c b/fs/nfs/write.c index bca5734ca9fb..86bac6a5008e 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -578,7 +578,7 @@ static int nfs_wait_on_requests(struct inode *inode, unsigned long idx_start, un return ret; } -static void nfs_cancel_requests(struct list_head *head) +static void nfs_cancel_dirty_list(struct list_head *head) { struct nfs_page *req; while(!list_empty(head)) { @@ -589,6 +589,19 @@ static void nfs_cancel_requests(struct list_head *head) } } +static void nfs_cancel_commit_list(struct list_head *head) +{ + struct nfs_page *req; + + while(!list_empty(head)) { + req = nfs_list_entry(head->next); + nfs_list_remove_request(req); + nfs_inode_remove_request(req); + nfs_clear_page_writeback(req); + dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); + } +} + /* * nfs_scan_dirty - Scan an inode for dirty requests * @inode: NFS inode to scan @@ -1381,6 +1394,7 @@ nfs_commit_list(struct inode *inode, struct list_head *head, int how) nfs_list_remove_request(req); nfs_mark_request_commit(req); nfs_clear_page_writeback(req); + dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); } return -ENOMEM; } @@ -1499,7 +1513,7 @@ int nfs_sync_inode_wait(struct inode *inode, unsigned long idx_start, if (pages != 0) { spin_unlock(&nfsi->req_lock); if (how & FLUSH_INVALIDATE) - nfs_cancel_requests(&head); + nfs_cancel_dirty_list(&head); else ret = nfs_flush_list(inode, &head, pages, how); spin_lock(&nfsi->req_lock); @@ -1512,7 +1526,7 @@ int nfs_sync_inode_wait(struct inode *inode, unsigned long idx_start, break; if (how & FLUSH_INVALIDATE) { spin_unlock(&nfsi->req_lock); - nfs_cancel_requests(&head); + nfs_cancel_commit_list(&head); spin_lock(&nfsi->req_lock); continue; } -- cgit v1.2.3 From 4e0641a7ad98fca5646a6be17605bc80f6c4ebde Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 5 Jul 2006 13:05:13 -0400 Subject: NFS: Optimise away an excessive GETATTR call when a file is symlinked In the case when compiling via a symlink tree, we want to ensure that the close-to-open GETATTR call is applied only to the final file, and not to the symlink. 
Signed-off-by: Trond Myklebust --- fs/nfs/dir.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 3ddda6f7ecc2..e7ffb4deb3e5 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -690,7 +690,9 @@ int nfs_lookup_verify_inode(struct inode *inode, struct nameidata *nd) goto out_force; /* This is an open(2) */ if (nfs_lookup_check_intent(nd, LOOKUP_OPEN) != 0 && - !(server->flags & NFS_MOUNT_NOCTO)) + !(server->flags & NFS_MOUNT_NOCTO) && + (S_ISREG(inode->i_mode) || + S_ISDIR(inode->i_mode))) goto out_force; } return nfs_revalidate_inode(server, inode); -- cgit v1.2.3 From 8ba10ab128e88bfbe58f7164543827ef3c3a2c88 Mon Sep 17 00:00:00 2001 From: Steve French Date: Sat, 8 Jul 2006 02:17:40 +0000 Subject: [CIFS] CIFS_DEBUG2 depends on CIFS Signed-off-by: Steve French --- fs/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/Kconfig b/fs/Kconfig index 00aa3d5c5a83..7db05742a92d 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -1791,6 +1791,7 @@ config CIFS_POSIX config CIFS_DEBUG2 bool "Enable additional CIFS debugging routines" + depends on CIFS help Enabling this option adds a few more debugging routines to the cifs code which slightly increases the size of -- cgit v1.2.3 From aadd06e5c56b9ff5117ec77e59eada43dc46e2fc Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 10 Jul 2006 11:00:01 +0200 Subject: [PATCH] splice: fix problems with sys_tee() Several issues noticed/fixed: - We cannot reliably block in link_pipe() while holding both input and output mutexes. So do preparatory checks before locking down both mutexes and doing the link. - The ipipe->nrbufs vs i check was bad, because we could have dropped the ipipe lock in-between. This causes us to potentially look at unknown buffers if we were racing with someone else reading this pipe. Signed-off-by: Jens Axboe --- fs/splice.c | 238 +++++++++++++++++++++++++++++++++--------------------------- 1 file changed, 133 insertions(+), 105 deletions(-) (limited to 'fs') diff --git a/fs/splice.c b/fs/splice.c index 05fd2787be98..684bca3d3a10 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -1306,6 +1306,85 @@ asmlinkage long sys_splice(int fd_in, loff_t __user *off_in, return error; } +/* + * Make sure there's data to read. Wait for input if we can, otherwise + * return an appropriate error. + */ +static int link_ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags) +{ + int ret; + + /* + * Check ->nrbufs without the inode lock first. This function + * is speculative anyways, so missing one is ok. + */ + if (pipe->nrbufs) + return 0; + + ret = 0; + mutex_lock(&pipe->inode->i_mutex); + + while (!pipe->nrbufs) { + if (signal_pending(current)) { + ret = -ERESTARTSYS; + break; + } + if (!pipe->writers) + break; + if (!pipe->waiting_writers) { + if (flags & SPLICE_F_NONBLOCK) { + ret = -EAGAIN; + break; + } + } + pipe_wait(pipe); + } + + mutex_unlock(&pipe->inode->i_mutex); + return ret; +} + +/* + * Make sure there's writeable room. Wait for room if we can, otherwise + * return an appropriate error. + */ +static int link_opipe_prep(struct pipe_inode_info *pipe, unsigned int flags) +{ + int ret; + + /* + * Check ->nrbufs without the inode lock first. This function + * is speculative anyways, so missing one is ok. 
+ */ + if (pipe->nrbufs < PIPE_BUFFERS) + return 0; + + ret = 0; + mutex_lock(&pipe->inode->i_mutex); + + while (pipe->nrbufs >= PIPE_BUFFERS) { + if (!pipe->readers) { + send_sig(SIGPIPE, current, 0); + ret = -EPIPE; + break; + } + if (flags & SPLICE_F_NONBLOCK) { + ret = -EAGAIN; + break; + } + if (signal_pending(current)) { + ret = -ERESTARTSYS; + break; + } + pipe->waiting_writers++; + pipe_wait(pipe); + pipe->waiting_writers--; + } + + mutex_unlock(&pipe->inode->i_mutex); + return ret; +} + /* * Link contents of ipipe to opipe. */ @@ -1314,9 +1393,7 @@ static int link_pipe(struct pipe_inode_info *ipipe, size_t len, unsigned int flags) { struct pipe_buffer *ibuf, *obuf; - int ret, do_wakeup, i, ipipe_first; - - ret = do_wakeup = ipipe_first = 0; + int ret = 0, i = 0, nbuf; /* * Potential ABBA deadlock, work around it by ordering lock @@ -1324,126 +1401,62 @@ static int link_pipe(struct pipe_inode_info *ipipe, * could deadlock (one doing tee from A -> B, the other from B -> A). */ if (ipipe->inode < opipe->inode) { - ipipe_first = 1; - mutex_lock(&ipipe->inode->i_mutex); - mutex_lock(&opipe->inode->i_mutex); + mutex_lock_nested(&ipipe->inode->i_mutex, I_MUTEX_PARENT); + mutex_lock_nested(&opipe->inode->i_mutex, I_MUTEX_CHILD); } else { - mutex_lock(&opipe->inode->i_mutex); - mutex_lock(&ipipe->inode->i_mutex); + mutex_lock_nested(&opipe->inode->i_mutex, I_MUTEX_PARENT); + mutex_lock_nested(&ipipe->inode->i_mutex, I_MUTEX_CHILD); } - for (i = 0;; i++) { + do { if (!opipe->readers) { send_sig(SIGPIPE, current, 0); if (!ret) ret = -EPIPE; break; } - if (ipipe->nrbufs - i) { - ibuf = ipipe->bufs + ((ipipe->curbuf + i) & (PIPE_BUFFERS - 1)); - /* - * If we have room, fill this buffer - */ - if (opipe->nrbufs < PIPE_BUFFERS) { - int nbuf = (opipe->curbuf + opipe->nrbufs) & (PIPE_BUFFERS - 1); - - /* - * Get a reference to this pipe buffer, - * so we can copy the contents over. - */ - ibuf->ops->get(ipipe, ibuf); - - obuf = opipe->bufs + nbuf; - *obuf = *ibuf; - - /* - * Don't inherit the gift flag, we need to - * prevent multiple steals of this page. - */ - obuf->flags &= ~PIPE_BUF_FLAG_GIFT; - - if (obuf->len > len) - obuf->len = len; - - opipe->nrbufs++; - do_wakeup = 1; - ret += obuf->len; - len -= obuf->len; - - if (!len) - break; - if (opipe->nrbufs < PIPE_BUFFERS) - continue; - } - - /* - * We have input available, but no output room. - * If we already copied data, return that. If we - * need to drop the opipe lock, it must be ordered - * last to avoid deadlocks. - */ - if ((flags & SPLICE_F_NONBLOCK) || !ipipe_first) { - if (!ret) - ret = -EAGAIN; - break; - } - if (signal_pending(current)) { - if (!ret) - ret = -ERESTARTSYS; - break; - } - if (do_wakeup) { - smp_mb(); - if (waitqueue_active(&opipe->wait)) - wake_up_interruptible(&opipe->wait); - kill_fasync(&opipe->fasync_readers, SIGIO, POLL_IN); - do_wakeup = 0; - } + /* + * If we have iterated all input buffers or ran out of + * output room, break. + */ + if (i >= ipipe->nrbufs || opipe->nrbufs >= PIPE_BUFFERS) + break; - opipe->waiting_writers++; - pipe_wait(opipe); - opipe->waiting_writers--; - continue; - } + ibuf = ipipe->bufs + ((ipipe->curbuf + i) & (PIPE_BUFFERS - 1)); + nbuf = (opipe->curbuf + opipe->nrbufs) & (PIPE_BUFFERS - 1); /* - * No input buffers, do the usual checks for available - * writers and blocking and wait if necessary + * Get a reference to this pipe buffer, + * so we can copy the contents over. 
*/ - if (!ipipe->writers) - break; - if (!ipipe->waiting_writers) { - if (ret) - break; - } + ibuf->ops->get(ipipe, ibuf); + + obuf = opipe->bufs + nbuf; + *obuf = *ibuf; + /* - * pipe_wait() drops the ipipe mutex. To avoid deadlocks - * with another process, we can only safely do that if - * the ipipe lock is ordered last. + * Don't inherit the gift flag, we need to + * prevent multiple steals of this page. */ - if ((flags & SPLICE_F_NONBLOCK) || ipipe_first) { - if (!ret) - ret = -EAGAIN; - break; - } - if (signal_pending(current)) { - if (!ret) - ret = -ERESTARTSYS; - break; - } + obuf->flags &= ~PIPE_BUF_FLAG_GIFT; - if (waitqueue_active(&ipipe->wait)) - wake_up_interruptible_sync(&ipipe->wait); - kill_fasync(&ipipe->fasync_writers, SIGIO, POLL_OUT); + if (obuf->len > len) + obuf->len = len; - pipe_wait(ipipe); - } + opipe->nrbufs++; + ret += obuf->len; + len -= obuf->len; + i++; + } while (len); mutex_unlock(&ipipe->inode->i_mutex); mutex_unlock(&opipe->inode->i_mutex); - if (do_wakeup) { + /* + * If we put data in the output pipe, wakeup any potential readers. + */ + if (ret > 0) { smp_mb(); if (waitqueue_active(&opipe->wait)) wake_up_interruptible(&opipe->wait); @@ -1464,14 +1477,29 @@ static long do_tee(struct file *in, struct file *out, size_t len, { struct pipe_inode_info *ipipe = in->f_dentry->d_inode->i_pipe; struct pipe_inode_info *opipe = out->f_dentry->d_inode->i_pipe; + int ret = -EINVAL; /* - * Link ipipe to the two output pipes, consuming as we go along. + * Duplicate the contents of ipipe to opipe without actually + * copying the data. */ - if (ipipe && opipe) - return link_pipe(ipipe, opipe, len, flags); + if (ipipe && opipe && ipipe != opipe) { + /* + * Keep going, unless we encounter an error. The ipipe/opipe + * ordering doesn't really matter. + */ + ret = link_ipipe_prep(ipipe, flags); + if (!ret) { + ret = link_opipe_prep(opipe, flags); + if (!ret) { + ret = link_pipe(ipipe, opipe, len, flags); + if (!ret && (flags & SPLICE_F_NONBLOCK)) + ret = -EAGAIN; + } + } + } - return -EINVAL; + return ret; } asmlinkage long sys_tee(int fdin, int fdout, size_t len, unsigned int flags) -- cgit v1.2.3 From 73ce5934e2d855db436566297f12966eb507a435 Mon Sep 17 00:00:00 2001 From: Hisashi Hifumi Date: Mon, 10 Jul 2006 04:43:56 -0700 Subject: [PATCH] reiserfs: fix journaling issue regarding fsync() When write() extends a file(i_size is increased) and fsync() is called, change of inode must be written to journaling area through fsync(). But,currently the i_trans_id is not correctly updated when i_size is increased. So fsync() does not kick the journal writer. Reiserfs_file_write() already updates the transaction when blocks are allocated, but the case when i_size increases and new blocks are not added is not correctly treated. Following patch fix this bug. 
Signed-off-by: Hisashi Hifumi Cc: Jeff Mahoney Cc: Chris Mason Cc: Hans Reiser Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/reiserfs/file.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c index 752cea12e30f..f318b58510fd 100644 --- a/fs/reiserfs/file.c +++ b/fs/reiserfs/file.c @@ -860,8 +860,12 @@ static int reiserfs_submit_file_region_for_write(struct reiserfs_transaction_han // this sets the proper flags for O_SYNC to trigger a commit mark_inode_dirty(inode); reiserfs_write_unlock(inode->i_sb); - } else + } else { + reiserfs_write_lock(inode->i_sb); + reiserfs_update_inode_transaction(inode); mark_inode_dirty(inode); + reiserfs_write_unlock(inode->i_sb); + } sd_update = 1; } -- cgit v1.2.3 From 25e206b54b9a20e63b6f5194aeebfa13d37e015c Mon Sep 17 00:00:00 2001 From: Peter Oberparleiter Date: Mon, 10 Jul 2006 04:44:00 -0700 Subject: [PATCH] partitions: let partitions inherit policy from disk Change the partition code in fs/partitions/check.c to initialize a newly detected partition's policy field with that of the containing block device (see patch below). My reasoning is that function set_disk_ro() in block/genhd.c modifies the policy field (read-only indicator) of a disk and all contained partitions. When a partition is detected after the call to set_disk_ro(), the policy field of this partition will currently not inherit the disk's policy field. This behavior poses a problem in cases where a block device can be 'logically de- and reactivated' like e.g. the s390 DASD driver because partition detection may run after the policy field has been modified. Signed-off-by: Peter Oberparleiter Acked-by: Al Viro Makes-sense-to: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/partitions/check.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/partitions/check.c b/fs/partitions/check.c index 839634026eb5..51c6a748df49 100644 --- a/fs/partitions/check.c +++ b/fs/partitions/check.c @@ -339,6 +339,7 @@ void add_partition(struct gendisk *disk, int part, sector_t start, sector_t len) p->start_sect = start; p->nr_sects = len; p->partno = part; + p->policy = disk->policy; if (isdigit(disk->kobj.name[strlen(disk->kobj.name)-1])) snprintf(p->kobj.name,KOBJ_NAME_LEN,"%sp%d",disk->kobj.name,part); -- cgit v1.2.3 From 69c3a5b8fd8cfa67be22f6d7ae5c681c6777d817 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Mon, 10 Jul 2006 04:44:23 -0700 Subject: [PATCH] fs/read_write.c: EXPORT_UNUSED_SYMBOL This patch marks an unused export as EXPORT_UNUSED_SYMBOL. Signed-off-by: Adrian Bunk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/read_write.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/read_write.c b/fs/read_write.c index 5bc0e9234f9d..d4cb3183c99c 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -436,7 +436,7 @@ unsigned long iov_shorten(struct iovec *iov, unsigned long nr_segs, size_t to) return seg; } -EXPORT_SYMBOL(iov_shorten); +EXPORT_UNUSED_SYMBOL(iov_shorten); /* June 2006 */ /* A write operation does a read from user space and vice versa */ #define vrfy_dir(type) ((type) == READ ? 
VERIFY_WRITE : VERIFY_READ) -- cgit v1.2.3 From b6174df5eec9cdfd598c03d6d0807e344e109213 Mon Sep 17 00:00:00 2001 From: "Zhang, Yanmin" Date: Mon, 10 Jul 2006 04:44:49 -0700 Subject: [PATCH] mmap zero-length hugetlb file with PROT_NONE to protect a hugetlb virtual area Sometimes, applications need below call to be successful although "/mnt/hugepages/file1" doesn't exist. fd = open("/mnt/hugepages/file1", O_CREAT|O_RDWR, 0755); *addr = mmap(NULL, 0x1024*1024*256, PROT_NONE, 0, fd, 0); As for regular pages (or files), above call does work, but as for huge pages, above call would fail because hugetlbfs_file_mmap would fail if (!(vma->vm_flags & VM_WRITE) && len > inode->i_size). This capability on huge page is useful on ia64 when the process wants to protect one area on region 4, so other threads couldn't read/write this area. A famous JVM (Java Virtual Machine) implementation on IA64 needs the capability. Signed-off-by: Zhang Yanmin Cc: David Gibson Cc: Hugh Dickins [ Expand-on-mmap semantics again... this time matching normal fs's. wli ] Acked-by: William Lee Irwin III Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/hugetlbfs/inode.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 6449cb697967..c3920c96dadf 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -83,8 +83,6 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) ret = -ENOMEM; len = vma_len + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); - if (!(vma->vm_flags & VM_WRITE) && len > inode->i_size) - goto out; if (vma->vm_flags & VM_MAYSHARE && hugetlb_reserve_pages(inode, vma->vm_pgoff >> (HPAGE_SHIFT-PAGE_SHIFT), @@ -93,7 +91,7 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) ret = 0; hugetlb_prefault_arch_hook(vma->vm_mm); - if (inode->i_size < len) + if (vma->vm_flags & VM_WRITE && inode->i_size < len) inode->i_size = len; out: mutex_unlock(&inode->i_mutex); -- cgit v1.2.3 From 1aeb21d626327ee909fef03f72aea6e8a60e6c0c Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 10 Jul 2006 04:44:50 -0700 Subject: [PATCH] FDPIC: Fix FDPIC compile errors Fix FDPIC compile errors. (akpm: we suspect it fixes a warning) Signed-off-by: David Howells Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/binfmt_elf_fdpic.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index eba4e23b9ca0..07624b95aee6 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -459,6 +459,7 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm, */ hwcap = ELF_HWCAP; k_platform = ELF_PLATFORM; + u_platform = NULL; if (k_platform) { platform_len = strlen(k_platform) + 1; -- cgit v1.2.3 From 21ff821630c0e64f5d2fab96ced72000d77fa90b Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 10 Jul 2006 04:44:52 -0700 Subject: [PATCH] NOMMU: Fix execution off of ramfs with mmap() Fix execution through the FDPIC binfmt of programs stored on ramfs by preventing the ramfs mmap() returning successfully on a private mapping of a ramfs file. This causes NOMMU mmap to make a copy of the mapped portion of the file and map that instead. This could be improved by granting direct mapping access to read-only private mappings for which the data is stored on a contiguous run of pages. However, this is only likely to be the case if the file was extended with truncate before being written. 
ramfs is left to map the file directly for shared mappings so that SYSV IPC and POSIX shared memory both still work. Signed-off-by: David Howells Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ramfs/file-nommu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c index 99fffc9e1bfd..677139b48e00 100644 --- a/fs/ramfs/file-nommu.c +++ b/fs/ramfs/file-nommu.c @@ -283,9 +283,9 @@ unsigned long ramfs_nommu_get_unmapped_area(struct file *file, /*****************************************************************************/ /* - * set up a mapping + * set up a mapping for shared memory segments */ int ramfs_nommu_mmap(struct file *file, struct vm_area_struct *vma) { - return 0; + return vma->vm_flags & VM_SHARED ? 0 : -ENOSYS; } -- cgit v1.2.3 From 8a2ab7f5df76b920d62b908919d987d3b8a82856 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 10 Jul 2006 04:44:53 -0700 Subject: [PATCH] FDPIC: Adjust the ELF-FDPIC driver to conform more to the CodingStyle Adjust the ELF-FDPIC binfmt driver to conform much more to the CodingStyle, silly though it may be. Further changes: (*) Drop the casts to long for addresses in kdebug() statements (they're unsigned long already). (*) Use extra variables to avoid expressions longer than 80 chars by splitting the statement into multiple statements and letting the compiler optimise them back together. (*) Eliminate duplicate call of ksize() when working out how much space was actually allocated for the stack. (*) Discard the commented-out load_shlib prototype and op pointer as this will not be supported in ELF-FDPIC for the foreseeable future. Signed-off-by: David Howells Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/binfmt_elf_fdpic.c | 305 +++++++++++++++++++++++++++----------------------- 1 file changed, 168 insertions(+), 137 deletions(-) (limited to 'fs') diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index 07624b95aee6..a4ff87389823 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -1,6 +1,6 @@ /* binfmt_elf_fdpic.c: FDPIC ELF binary format * - * Copyright (C) 2003, 2004 Red Hat, Inc. All Rights Reserved. + * Copyright (C) 2003, 2004, 2006 Red Hat, Inc. All Rights Reserved. 
* Written by David Howells (dhowells@redhat.com) * Derived from binfmt_elf.c * @@ -50,43 +50,45 @@ typedef char *elf_caddr_t; MODULE_LICENSE("GPL"); -static int load_elf_fdpic_binary(struct linux_binprm *bprm, struct pt_regs *regs); -//static int load_elf_fdpic_library(struct file *); -static int elf_fdpic_fetch_phdrs(struct elf_fdpic_params *params, struct file *file); -static int elf_fdpic_map_file(struct elf_fdpic_params *params, - struct file *file, - struct mm_struct *mm, - const char *what); +static int load_elf_fdpic_binary(struct linux_binprm *, struct pt_regs *); +static int elf_fdpic_fetch_phdrs(struct elf_fdpic_params *, struct file *); +static int elf_fdpic_map_file(struct elf_fdpic_params *, struct file *, + struct mm_struct *, const char *); -static int create_elf_fdpic_tables(struct linux_binprm *bprm, - struct mm_struct *mm, - struct elf_fdpic_params *exec_params, - struct elf_fdpic_params *interp_params); +static int create_elf_fdpic_tables(struct linux_binprm *, struct mm_struct *, + struct elf_fdpic_params *, + struct elf_fdpic_params *); #ifndef CONFIG_MMU -static int elf_fdpic_transfer_args_to_stack(struct linux_binprm *bprm, unsigned long *_sp); -static int elf_fdpic_map_file_constdisp_on_uclinux(struct elf_fdpic_params *params, - struct file *file, - struct mm_struct *mm); +static int elf_fdpic_transfer_args_to_stack(struct linux_binprm *, + unsigned long *); +static int elf_fdpic_map_file_constdisp_on_uclinux(struct elf_fdpic_params *, + struct file *, + struct mm_struct *); #endif -static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params, - struct file *file, - struct mm_struct *mm); +static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *, + struct file *, struct mm_struct *); static struct linux_binfmt elf_fdpic_format = { .module = THIS_MODULE, .load_binary = load_elf_fdpic_binary, -// .load_shlib = load_elf_fdpic_library, // .core_dump = elf_fdpic_core_dump, .min_coredump = ELF_EXEC_PAGESIZE, }; -static int __init init_elf_fdpic_binfmt(void) { return register_binfmt(&elf_fdpic_format); } -static void __exit exit_elf_fdpic_binfmt(void) { unregister_binfmt(&elf_fdpic_format); } +static int __init init_elf_fdpic_binfmt(void) +{ + return register_binfmt(&elf_fdpic_format); +} -module_init(init_elf_fdpic_binfmt) -module_exit(exit_elf_fdpic_binfmt) +static void __exit exit_elf_fdpic_binfmt(void) +{ + unregister_binfmt(&elf_fdpic_format); +} + +module_init(init_elf_fdpic_binfmt); +module_exit(exit_elf_fdpic_binfmt); static int is_elf_fdpic(struct elfhdr *hdr, struct file *file) { @@ -105,7 +107,8 @@ static int is_elf_fdpic(struct elfhdr *hdr, struct file *file) /* * read the program headers table into memory */ -static int elf_fdpic_fetch_phdrs(struct elf_fdpic_params *params, struct file *file) +static int elf_fdpic_fetch_phdrs(struct elf_fdpic_params *params, + struct file *file) { struct elf32_phdr *phdr; unsigned long size; @@ -121,7 +124,8 @@ static int elf_fdpic_fetch_phdrs(struct elf_fdpic_params *params, struct file *f if (!params->phdrs) return -ENOMEM; - retval = kernel_read(file, params->hdr.e_phoff, (char *) params->phdrs, size); + retval = kernel_read(file, params->hdr.e_phoff, + (char *) params->phdrs, size); if (retval < 0) return retval; @@ -141,17 +145,24 @@ static int elf_fdpic_fetch_phdrs(struct elf_fdpic_params *params, struct file *f } return 0; -} /* end elf_fdpic_fetch_phdrs() */ +} /*****************************************************************************/ /* * load an fdpic binary into various bits of 
memory */ -static int load_elf_fdpic_binary(struct linux_binprm *bprm, struct pt_regs *regs) +static int load_elf_fdpic_binary(struct linux_binprm *bprm, + struct pt_regs *regs) { struct elf_fdpic_params exec_params, interp_params; struct elf_phdr *phdr; - unsigned long stack_size; + unsigned long stack_size, entryaddr; +#ifndef CONFIG_MMU + unsigned long fullsize; +#endif +#ifdef ELF_FDPIC_PLAT_INIT + unsigned long dynaddr; +#endif struct file *interpreter = NULL; /* to shut gcc up */ char *interpreter_name = NULL; int executable_stack; @@ -212,7 +223,8 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm, struct pt_regs *regs goto error; } - retval = kernel_read(interpreter, 0, bprm->buf, BINPRM_BUF_SIZE); + retval = kernel_read(interpreter, 0, bprm->buf, + BINPRM_BUF_SIZE); if (retval < 0) goto error; @@ -295,7 +307,8 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm, struct pt_regs *regs ¤t->mm->start_stack, ¤t->mm->start_brk); - retval = setup_arg_pages(bprm, current->mm->start_stack, executable_stack); + retval = setup_arg_pages(bprm, current->mm->start_stack, + executable_stack); if (retval < 0) { send_sig(SIGKILL, current, 0); goto error_kill; @@ -303,7 +316,8 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm, struct pt_regs *regs #endif /* load the executable and interpreter into memory */ - retval = elf_fdpic_map_file(&exec_params, bprm->file, current->mm, "executable"); + retval = elf_fdpic_map_file(&exec_params, bprm->file, current->mm, + "executable"); if (retval < 0) goto error_kill; @@ -324,7 +338,8 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm, struct pt_regs *regs if (!current->mm->start_brk) current->mm->start_brk = current->mm->end_data; - current->mm->brk = current->mm->start_brk = PAGE_ALIGN(current->mm->start_brk); + current->mm->brk = current->mm->start_brk = + PAGE_ALIGN(current->mm->start_brk); #else /* create a stack and brk area big enough for everyone @@ -336,47 +351,45 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm, struct pt_regs *regs stack_size = PAGE_SIZE * 2; down_write(¤t->mm->mmap_sem); - current->mm->start_brk = do_mmap(NULL, - 0, - stack_size, + current->mm->start_brk = do_mmap(NULL, 0, stack_size, PROT_READ | PROT_WRITE | PROT_EXEC, MAP_PRIVATE | MAP_ANON | MAP_GROWSDOWN, 0); - if (IS_ERR((void *) current->mm->start_brk)) { + if (IS_ERR_VALUE(current->mm->start_brk)) { up_write(¤t->mm->mmap_sem); retval = current->mm->start_brk; current->mm->start_brk = 0; goto error_kill; } - if (do_mremap(current->mm->start_brk, - stack_size, - ksize((char *) current->mm->start_brk), - 0, 0 - ) == current->mm->start_brk - ) - stack_size = ksize((char *) current->mm->start_brk); + /* expand the stack mapping to use up the entire allocation granule */ + fullsize = ksize((char *) current->mm->start_brk); + if (!IS_ERR_VALUE(do_mremap(current->mm->start_brk, stack_size, + fullsize, 0, 0))) + stack_size = fullsize; up_write(¤t->mm->mmap_sem); current->mm->brk = current->mm->start_brk; current->mm->context.end_brk = current->mm->start_brk; - current->mm->context.end_brk += (stack_size > PAGE_SIZE) ? (stack_size - PAGE_SIZE) : 0; + current->mm->context.end_brk += + (stack_size > PAGE_SIZE) ? 
(stack_size - PAGE_SIZE) : 0; current->mm->start_stack = current->mm->start_brk + stack_size; #endif compute_creds(bprm); current->flags &= ~PF_FORKNOEXEC; - if (create_elf_fdpic_tables(bprm, current->mm, &exec_params, &interp_params) < 0) + if (create_elf_fdpic_tables(bprm, current->mm, + &exec_params, &interp_params) < 0) goto error_kill; - kdebug("- start_code %lx", (long) current->mm->start_code); - kdebug("- end_code %lx", (long) current->mm->end_code); - kdebug("- start_data %lx", (long) current->mm->start_data); - kdebug("- end_data %lx", (long) current->mm->end_data); - kdebug("- start_brk %lx", (long) current->mm->start_brk); - kdebug("- brk %lx", (long) current->mm->brk); - kdebug("- start_stack %lx", (long) current->mm->start_stack); + kdebug("- start_code %lx", current->mm->start_code); + kdebug("- end_code %lx", current->mm->end_code); + kdebug("- start_data %lx", current->mm->start_data); + kdebug("- end_data %lx", current->mm->end_data); + kdebug("- start_brk %lx", current->mm->start_brk); + kdebug("- brk %lx", current->mm->brk); + kdebug("- start_stack %lx", current->mm->start_stack); #ifdef ELF_FDPIC_PLAT_INIT /* @@ -385,21 +398,18 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm, struct pt_regs *regs * example. This macro performs whatever initialization to * the regs structure is required. */ - ELF_FDPIC_PLAT_INIT(regs, - exec_params.map_addr, - interp_params.map_addr, - interp_params.dynamic_addr ?: exec_params.dynamic_addr - ); + dynaddr = interp_params.dynamic_addr ?: exec_params.dynamic_addr; + ELF_FDPIC_PLAT_INIT(regs, exec_params.map_addr, interp_params.map_addr, + dynaddr); #endif /* everything is now ready... get the userspace context ready to roll */ - start_thread(regs, - interp_params.entry_addr ?: exec_params.entry_addr, - current->mm->start_stack); + entryaddr = interp_params.entry_addr ?: exec_params.entry_addr; + start_thread(regs, entryaddr, current->mm->start_stack); if (unlikely(current->ptrace & PT_PTRACED)) { if (current->ptrace & PT_TRACE_EXEC) - ptrace_notify ((PTRACE_EVENT_EXEC << 8) | SIGTRAP); + ptrace_notify((PTRACE_EVENT_EXEC << 8) | SIGTRAP); else send_sig(SIGTRAP, current, 0); } @@ -419,11 +429,11 @@ error: return retval; /* unrecoverable error - kill the process */ - error_kill: +error_kill: send_sig(SIGSEGV, current, 0); goto error; -} /* end load_elf_fdpic_binary() */ +} /*****************************************************************************/ /* @@ -471,11 +481,11 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm, #if defined(__i386__) && defined(CONFIG_SMP) /* in some cases (e.g. Hyper-Threading), we want to avoid L1 evictions - * by the processes running on the same package. One thing we can do - * is to shuffle the initial stack for them. + * by the processes running on the same package. One thing we can do is + * to shuffle the initial stack for them. * - * the conditionals here are unneeded, but kept in to make the - * code behaviour the same as pre change unless we have hyperthreaded + * the conditionals here are unneeded, but kept in to make the code + * behaviour the same as pre change unless we have hyperthreaded * processors. 
This keeps Mr Marcelo Person happier but should be * removed for 2.5 */ @@ -498,11 +508,13 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm, if (interp_params->loadmap) { len = sizeof(struct elf32_fdpic_loadmap); - len += sizeof(struct elf32_fdpic_loadseg) * interp_params->loadmap->nsegs; + len += sizeof(struct elf32_fdpic_loadseg) * + interp_params->loadmap->nsegs; sp = (sp - len) & ~7UL; interp_params->map_addr = sp; - if (copy_to_user((void __user *) sp, interp_params->loadmap, len) != 0) + if (copy_to_user((void __user *) sp, interp_params->loadmap, + len) != 0) return -EFAULT; current->mm->context.interp_fdpic_loadmap = (unsigned long) sp; @@ -526,34 +538,37 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm, sp -= sp & 15UL; /* put the ELF interpreter info on the stack */ -#define NEW_AUX_ENT(nr, id, val) \ - do { \ - struct { unsigned long _id, _val; } __user *ent = (void __user *) csp; \ - __put_user((id), &ent[nr]._id); \ - __put_user((val), &ent[nr]._val); \ +#define NEW_AUX_ENT(nr, id, val) \ + do { \ + struct { unsigned long _id, _val; } __user *ent; \ + \ + ent = (void __user *) csp; \ + __put_user((id), &ent[nr]._id); \ + __put_user((val), &ent[nr]._val); \ } while (0) csp -= 2 * sizeof(unsigned long); NEW_AUX_ENT(0, AT_NULL, 0); if (k_platform) { csp -= 2 * sizeof(unsigned long); - NEW_AUX_ENT(0, AT_PLATFORM, (elf_addr_t)(unsigned long) u_platform); + NEW_AUX_ENT(0, AT_PLATFORM, + (elf_addr_t) (unsigned long) u_platform); } csp -= DLINFO_ITEMS * 2 * sizeof(unsigned long); - NEW_AUX_ENT( 0, AT_HWCAP, hwcap); - NEW_AUX_ENT( 1, AT_PAGESZ, PAGE_SIZE); - NEW_AUX_ENT( 2, AT_CLKTCK, CLOCKS_PER_SEC); - NEW_AUX_ENT( 3, AT_PHDR, exec_params->ph_addr); - NEW_AUX_ENT( 4, AT_PHENT, sizeof(struct elf_phdr)); - NEW_AUX_ENT( 5, AT_PHNUM, exec_params->hdr.e_phnum); - NEW_AUX_ENT( 6, AT_BASE, interp_params->elfhdr_addr); - NEW_AUX_ENT( 7, AT_FLAGS, 0); - NEW_AUX_ENT( 8, AT_ENTRY, exec_params->entry_addr); - NEW_AUX_ENT( 9, AT_UID, (elf_addr_t) current->uid); - NEW_AUX_ENT(10, AT_EUID, (elf_addr_t) current->euid); - NEW_AUX_ENT(11, AT_GID, (elf_addr_t) current->gid); - NEW_AUX_ENT(12, AT_EGID, (elf_addr_t) current->egid); + NEW_AUX_ENT( 0, AT_HWCAP, hwcap); + NEW_AUX_ENT( 1, AT_PAGESZ, PAGE_SIZE); + NEW_AUX_ENT( 2, AT_CLKTCK, CLOCKS_PER_SEC); + NEW_AUX_ENT( 3, AT_PHDR, exec_params->ph_addr); + NEW_AUX_ENT( 4, AT_PHENT, sizeof(struct elf_phdr)); + NEW_AUX_ENT( 5, AT_PHNUM, exec_params->hdr.e_phnum); + NEW_AUX_ENT( 6, AT_BASE, interp_params->elfhdr_addr); + NEW_AUX_ENT( 7, AT_FLAGS, 0); + NEW_AUX_ENT( 8, AT_ENTRY, exec_params->entry_addr); + NEW_AUX_ENT( 9, AT_UID, (elf_addr_t) current->uid); + NEW_AUX_ENT(10, AT_EUID, (elf_addr_t) current->euid); + NEW_AUX_ENT(11, AT_GID, (elf_addr_t) current->gid); + NEW_AUX_ENT(12, AT_EGID, (elf_addr_t) current->egid); #ifdef ARCH_DLINFO /* ARCH_DLINFO must come last so platform specific code can enforce @@ -579,7 +594,8 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm, #ifdef CONFIG_MMU current->mm->arg_start = bprm->p; #else - current->mm->arg_start = current->mm->start_stack - (MAX_ARG_PAGES * PAGE_SIZE - bprm->p); + current->mm->arg_start = current->mm->start_stack - + (MAX_ARG_PAGES * PAGE_SIZE - bprm->p); #endif p = (char __user *) current->mm->arg_start; @@ -607,7 +623,7 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm, mm->start_stack = (unsigned long) sp; return 0; -} /* end create_elf_fdpic_tables() */ +} /*****************************************************************************/ /* @@ 
-615,7 +631,8 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm, * the stack */ #ifndef CONFIG_MMU -static int elf_fdpic_transfer_args_to_stack(struct linux_binprm *bprm, unsigned long *_sp) +static int elf_fdpic_transfer_args_to_stack(struct linux_binprm *bprm, + unsigned long *_sp) { unsigned long index, stop, sp; char *src; @@ -636,9 +653,9 @@ static int elf_fdpic_transfer_args_to_stack(struct linux_binprm *bprm, unsigned *_sp = (*_sp - (MAX_ARG_PAGES * PAGE_SIZE - bprm->p)) & ~15; - out: +out: return ret; -} /* end elf_fdpic_transfer_args_to_stack() */ +} #endif /*****************************************************************************/ @@ -713,17 +730,18 @@ static int elf_fdpic_map_file(struct elf_fdpic_params *params, seg = loadmap->segs; for (loop = loadmap->nsegs; loop > 0; loop--, seg++) { if (params->hdr.e_entry >= seg->p_vaddr && - params->hdr.e_entry < seg->p_vaddr + seg->p_memsz - ) { + params->hdr.e_entry < seg->p_vaddr + seg->p_memsz) { params->entry_addr = - (params->hdr.e_entry - seg->p_vaddr) + seg->addr; + (params->hdr.e_entry - seg->p_vaddr) + + seg->addr; break; } } } /* determine where the program header table has wound up if mapped */ - stop = params->hdr.e_phoff + params->hdr.e_phnum * sizeof (struct elf_phdr); + stop = params->hdr.e_phoff; + stop += params->hdr.e_phnum * sizeof (struct elf_phdr); phdr = params->phdrs; for (loop = 0; loop < params->hdr.e_phnum; loop++, phdr++) { @@ -737,9 +755,11 @@ static int elf_fdpic_map_file(struct elf_fdpic_params *params, seg = loadmap->segs; for (loop = loadmap->nsegs; loop > 0; loop--, seg++) { if (phdr->p_vaddr >= seg->p_vaddr && - phdr->p_vaddr + phdr->p_filesz <= seg->p_vaddr + seg->p_memsz - ) { - params->ph_addr = (phdr->p_vaddr - seg->p_vaddr) + seg->addr + + phdr->p_vaddr + phdr->p_filesz <= + seg->p_vaddr + seg->p_memsz) { + params->ph_addr = + (phdr->p_vaddr - seg->p_vaddr) + + seg->addr + params->hdr.e_phoff - phdr->p_offset; break; } @@ -756,18 +776,22 @@ static int elf_fdpic_map_file(struct elf_fdpic_params *params, seg = loadmap->segs; for (loop = loadmap->nsegs; loop > 0; loop--, seg++) { if (phdr->p_vaddr >= seg->p_vaddr && - phdr->p_vaddr + phdr->p_memsz <= seg->p_vaddr + seg->p_memsz - ) { - params->dynamic_addr = (phdr->p_vaddr - seg->p_vaddr) + seg->addr; - - /* check the dynamic section contains at least one item, and that - * the last item is a NULL entry */ + phdr->p_vaddr + phdr->p_memsz <= + seg->p_vaddr + seg->p_memsz) { + params->dynamic_addr = + (phdr->p_vaddr - seg->p_vaddr) + + seg->addr; + + /* check the dynamic section contains at least + * one item, and that the last item is a NULL + * entry */ if (phdr->p_memsz == 0 || phdr->p_memsz % sizeof(Elf32_Dyn) != 0) goto dynamic_error; tmp = phdr->p_memsz / sizeof(Elf32_Dyn); - if (((Elf32_Dyn *) params->dynamic_addr)[tmp - 1].d_tag != 0) + if (((Elf32_Dyn *) + params->dynamic_addr)[tmp - 1].d_tag != 0) goto dynamic_error; break; } @@ -776,8 +800,8 @@ static int elf_fdpic_map_file(struct elf_fdpic_params *params, } /* now elide adjacent segments in the load map on MMU linux - * - on uClinux the holes between may actually be filled with system stuff or stuff from - * other processes + * - on uClinux the holes between may actually be filled with system + * stuff or stuff from other processes */ #ifdef CONFIG_MMU nloads = loadmap->nsegs; @@ -788,7 +812,9 @@ static int elf_fdpic_map_file(struct elf_fdpic_params *params, if (seg->p_vaddr - mseg->p_vaddr == seg->addr - mseg->addr) { load_addr = PAGE_ALIGN(mseg->addr + mseg->p_memsz); if 
(load_addr == (seg->addr & PAGE_MASK)) { - mseg->p_memsz += load_addr - (mseg->addr + mseg->p_memsz); + mseg->p_memsz += + load_addr - + (mseg->addr + mseg->p_memsz); mseg->p_memsz += seg->addr & ~PAGE_MASK; mseg->p_memsz += seg->p_memsz; loadmap->nsegs--; @@ -816,20 +842,21 @@ static int elf_fdpic_map_file(struct elf_fdpic_params *params, return 0; - dynamic_error: +dynamic_error: printk("ELF FDPIC %s with invalid DYNAMIC section (inode=%lu)\n", what, file->f_dentry->d_inode->i_ino); return -ELIBBAD; -} /* end elf_fdpic_map_file() */ +} /*****************************************************************************/ /* * map a file with constant displacement under uClinux */ #ifndef CONFIG_MMU -static int elf_fdpic_map_file_constdisp_on_uclinux(struct elf_fdpic_params *params, - struct file *file, - struct mm_struct *mm) +static int elf_fdpic_map_file_constdisp_on_uclinux( + struct elf_fdpic_params *params, + struct file *file, + struct mm_struct *mm) { struct elf32_fdpic_loadseg *seg; struct elf32_phdr *phdr; @@ -840,7 +867,8 @@ static int elf_fdpic_map_file_constdisp_on_uclinux(struct elf_fdpic_params *para load_addr = params->load_addr; seg = params->loadmap->segs; - /* determine the bounds of the contiguous overall allocation we must make */ + /* determine the bounds of the contiguous overall allocation we must + * make */ phdr = params->phdrs; for (loop = 0; loop < params->hdr.e_phnum; loop++, phdr++) { if (params->phdrs[loop].p_type != PT_LOAD) @@ -861,7 +889,7 @@ static int elf_fdpic_map_file_constdisp_on_uclinux(struct elf_fdpic_params *para maddr = do_mmap(NULL, load_addr, top - base, PROT_READ | PROT_WRITE | PROT_EXEC, mflags, 0); up_write(&mm->mmap_sem); - if (IS_ERR((void *) maddr)) + if (IS_ERR_VALUE(maddr)) return (int) maddr; if (load_addr != 0) @@ -879,7 +907,8 @@ static int elf_fdpic_map_file_constdisp_on_uclinux(struct elf_fdpic_params *para seg->p_vaddr = phdr->p_vaddr; seg->p_memsz = phdr->p_memsz; - ret = file->f_op->read(file, (void *) seg->addr, phdr->p_filesz, &fpos); + ret = file->f_op->read(file, (void *) seg->addr, + phdr->p_filesz, &fpos); if (ret < 0) return ret; @@ -896,8 +925,7 @@ static int elf_fdpic_map_file_constdisp_on_uclinux(struct elf_fdpic_params *para if (phdr->p_flags & PF_X) { mm->start_code = seg->addr; mm->end_code = seg->addr + phdr->p_memsz; - } - else if (!mm->start_data) { + } else if (!mm->start_data) { mm->start_data = seg->addr; #ifndef CONFIG_MMU mm->end_data = seg->addr + phdr->p_memsz; @@ -914,7 +942,7 @@ static int elf_fdpic_map_file_constdisp_on_uclinux(struct elf_fdpic_params *para } return 0; -} /* end elf_fdpic_map_file_constdisp_on_uclinux() */ +} #endif /*****************************************************************************/ @@ -975,14 +1003,14 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params, case ELF_FDPIC_FLAG_CONSTDISP: /* constant displacement - * - can be mapped anywhere, but must be mapped as a unit + * - can be mapped anywhere, but must be mapped as a + * unit */ if (!dvset) { maddr = load_addr; delta_vaddr = phdr->p_vaddr; dvset = 1; - } - else { + } else { maddr = load_addr + phdr->p_vaddr - delta_vaddr; flags |= MAP_FIXED; } @@ -1006,13 +1034,14 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params, up_write(&mm->mmap_sem); kdebug("mmap[%d] sz=%lx pr=%x fl=%x of=%lx --> %08lx", - loop, phdr->p_memsz + disp, prot, flags, phdr->p_offset - disp, - maddr); + loop, phdr->p_memsz + disp, prot, flags, + phdr->p_offset - disp, maddr); - if (IS_ERR((void *) maddr)) 
+ if (IS_ERR_VALUE(maddr)) return (int) maddr; - if ((params->flags & ELF_FDPIC_FLAG_ARRANGEMENT) == ELF_FDPIC_FLAG_CONTIGUOUS) + if ((params->flags & ELF_FDPIC_FLAG_ARRANGEMENT) == + ELF_FDPIC_FLAG_CONTIGUOUS) load_addr += PAGE_ALIGN(phdr->p_memsz + disp); seg->addr = maddr + disp; @@ -1023,7 +1052,8 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params, if (phdr->p_offset == 0) params->elfhdr_addr = seg->addr; - /* clear the bit between beginning of mapping and beginning of PT_LOAD */ + /* clear the bit between beginning of mapping and beginning of + * PT_LOAD */ if (prot & PROT_WRITE && disp > 0) { kdebug("clear[%d] ad=%lx sz=%lx", loop, maddr, disp); clear_user((void __user *) maddr, disp); @@ -1039,19 +1069,20 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params, excess1 = PAGE_SIZE - ((maddr + phdr->p_filesz) & ~PAGE_MASK); #ifdef CONFIG_MMU - if (excess > excess1) { unsigned long xaddr = maddr + phdr->p_filesz + excess1; unsigned long xmaddr; flags |= MAP_FIXED | MAP_ANONYMOUS; down_write(&mm->mmap_sem); - xmaddr = do_mmap(NULL, xaddr, excess - excess1, prot, flags, 0); + xmaddr = do_mmap(NULL, xaddr, excess - excess1, + prot, flags, 0); up_write(&mm->mmap_sem); kdebug("mmap[%d] " " ad=%lx sz=%lx pr=%x fl=%x of=0 --> %08lx", - loop, xaddr, excess - excess1, prot, flags, xmaddr); + loop, xaddr, excess - excess1, prot, flags, + xmaddr); if (xmaddr != xaddr) return -ENOMEM; @@ -1060,7 +1091,8 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params, if (prot & PROT_WRITE && excess1 > 0) { kdebug("clear[%d] ad=%lx sz=%lx", loop, maddr + phdr->p_filesz, excess1); - clear_user((void __user *) maddr + phdr->p_filesz, excess1); + clear_user((void __user *) maddr + phdr->p_filesz, + excess1); } #else @@ -1075,8 +1107,7 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params, if (phdr->p_flags & PF_X) { mm->start_code = maddr; mm->end_code = maddr + phdr->p_memsz; - } - else if (!mm->start_data) { + } else if (!mm->start_data) { mm->start_data = maddr; mm->end_data = maddr + phdr->p_memsz; } @@ -1086,4 +1117,4 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params, } return 0; -} /* end elf_fdpic_map_file_by_direct_mmap() */ +} -- cgit v1.2.3 From b4cac1a0227a6f84be0381cd350a3c8730a4a671 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 10 Jul 2006 04:44:54 -0700 Subject: [PATCH] FDPIC: Move roundup() into linux/kernel.h Move the roundup() macro from binfmt_elf.c into linux/kernel.h as it's generally useful. [akpm@osdl.org: nuke all the other implementations] Signed-off-by: David Howells Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/mips/kernel/irixelf.c | 2 -- drivers/scsi/aic7xxx/aic79xx_osm.h | 1 - fs/binfmt_elf.c | 2 -- fs/proc/kcore.c | 2 -- fs/xfs/linux-2.6/xfs_linux.h | 1 - include/linux/kernel.h | 1 + 6 files changed, 1 insertion(+), 8 deletions(-) (limited to 'fs') diff --git a/arch/mips/kernel/irixelf.c b/arch/mips/kernel/irixelf.c index 10d3644e3608..ab12c8f01518 100644 --- a/arch/mips/kernel/irixelf.c +++ b/arch/mips/kernel/irixelf.c @@ -999,8 +999,6 @@ static inline int maydump(struct vm_area_struct *vma) return 1; } -#define roundup(x, y) ((((x)+((y)-1))/(y))*(y)) - /* An ELF note in memory. 
*/ struct memelfnote { diff --git a/drivers/scsi/aic7xxx/aic79xx_osm.h b/drivers/scsi/aic7xxx/aic79xx_osm.h index 9e871de23835..601340d84410 100644 --- a/drivers/scsi/aic7xxx/aic79xx_osm.h +++ b/drivers/scsi/aic7xxx/aic79xx_osm.h @@ -93,7 +93,6 @@ #endif /********************************** Misc Macros *******************************/ -#define roundup(x, y) ((((x)+((y)-1))/(y))*(y)) #define powerof2(x) ((((x)-1)&(x))==0) /************************* Forward Declarations *******************************/ diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index f42e64210ee5..672a3b90bc55 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -1185,8 +1185,6 @@ static int maydump(struct vm_area_struct *vma) return 1; } -#define roundup(x, y) ((((x) + ((y) - 1)) / (y)) * (y)) - /* An ELF note in memory */ struct memelfnote { diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index 036d14d83627..8d6d85d7400f 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -42,8 +42,6 @@ const struct file_operations proc_kcore_operations = { #define kc_offset_to_vaddr(o) ((o) + PAGE_OFFSET) #endif -#define roundup(x, y) ((((x)+((y)-1))/(y))*(y)) - /* An ELF note in memory */ struct memelfnote { diff --git a/fs/xfs/linux-2.6/xfs_linux.h b/fs/xfs/linux-2.6/xfs_linux.h index 8c021dc57d1f..a13f75c1a936 100644 --- a/fs/xfs/linux-2.6/xfs_linux.h +++ b/fs/xfs/linux-2.6/xfs_linux.h @@ -215,7 +215,6 @@ BUFFER_FNS(PrivateStart, unwritten); #define MIN(a,b) (min(a,b)) #define MAX(a,b) (max(a,b)) #define howmany(x, y) (((x)+((y)-1))/(y)) -#define roundup(x, y) ((((x)+((y)-1))/(y))*(y)) /* * Various platform dependent calls that don't fit anywhere else diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 5c1ec1f84eab..181c69cad4e3 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -33,6 +33,7 @@ extern const char linux_banner[]; #define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) #define ALIGN(x,a) (((x)+(a)-1)&~((a)-1)) #define FIELD_SIZEOF(t, f) (sizeof(((t*)0)->f)) +#define roundup(x, y) ((((x) + ((y) - 1)) / (y)) * (y)) #define KERN_EMERG "<0>" /* system is unusable */ #define KERN_ALERT "<1>" /* action must be taken immediately */ -- cgit v1.2.3 From 6d8c4e3b0150ff537902477ed62f8a8e9e70007b Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 10 Jul 2006 04:44:55 -0700 Subject: [PATCH] FDPIC: Add coredump capability for the ELF-FDPIC binfmt Add coredump capability for the ELF-FDPIC binfmt. Signed-off-by: David Howells Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/frv/kernel/process.c | 8 + fs/binfmt_elf_fdpic.c | 676 +++++++++++++++++++++++++++++++++++++++++++++- include/asm-frv/elf.h | 6 +- include/linux/elfcore.h | 10 + 4 files changed, 694 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/arch/frv/kernel/process.c b/arch/frv/kernel/process.c index ecdeafb2fdce..f0c767a56c65 100644 --- a/arch/frv/kernel/process.c +++ b/arch/frv/kernel/process.c @@ -371,3 +371,11 @@ int elf_check_arch(const struct elf32_hdr *hdr) return 1; } + +int dump_fpu(struct pt_regs *regs, elf_fpregset_t *fpregs) +{ + memcpy(fpregs, + ¤t->thread.user->f, + sizeof(current->thread.user->f)); + return 1; +} diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index a4ff87389823..2f3365829229 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -24,7 +24,9 @@ #include #include #include +#include #include +#include #include #include #include @@ -48,6 +50,12 @@ typedef char *elf_caddr_t; #define kdebug(fmt, ...) do {} while(0) #endif +#if 0 +#define kdcore(fmt, ...) 
printk("FDPIC "fmt"\n" ,##__VA_ARGS__ ) +#else +#define kdcore(fmt, ...) do {} while(0) +#endif + MODULE_LICENSE("GPL"); static int load_elf_fdpic_binary(struct linux_binprm *, struct pt_regs *); @@ -70,10 +78,16 @@ static int elf_fdpic_map_file_constdisp_on_uclinux(struct elf_fdpic_params *, static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *, struct file *, struct mm_struct *); +#if defined(USE_ELF_CORE_DUMP) && defined(CONFIG_ELF_CORE) +static int elf_fdpic_core_dump(long, struct pt_regs *, struct file *); +#endif + static struct linux_binfmt elf_fdpic_format = { .module = THIS_MODULE, .load_binary = load_elf_fdpic_binary, -// .core_dump = elf_fdpic_core_dump, +#if defined(USE_ELF_CORE_DUMP) && defined(CONFIG_ELF_CORE) + .core_dump = elf_fdpic_core_dump, +#endif .min_coredump = ELF_EXEC_PAGESIZE, }; @@ -87,7 +101,7 @@ static void __exit exit_elf_fdpic_binfmt(void) unregister_binfmt(&elf_fdpic_format); } -module_init(init_elf_fdpic_binfmt); +core_initcall(init_elf_fdpic_binfmt); module_exit(exit_elf_fdpic_binfmt); static int is_elf_fdpic(struct elfhdr *hdr, struct file *file) @@ -1118,3 +1132,661 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params, return 0; } + +/*****************************************************************************/ +/* + * ELF-FDPIC core dumper + * + * Modelled on fs/exec.c:aout_core_dump() + * Jeremy Fitzhardinge + * + * Modelled on fs/binfmt_elf.c core dumper + */ +#if defined(USE_ELF_CORE_DUMP) && defined(CONFIG_ELF_CORE) + +/* + * These are the only things you should do on a core-file: use only these + * functions to write out all the necessary info. + */ +static int dump_write(struct file *file, const void *addr, int nr) +{ + return file->f_op->write(file, addr, nr, &file->f_pos) == nr; +} + +static int dump_seek(struct file *file, loff_t off) +{ + if (file->f_op->llseek) { + if (file->f_op->llseek(file, off, SEEK_SET) != off) + return 0; + } else { + file->f_pos = off; + } + return 1; +} + +/* + * Decide whether a segment is worth dumping; default is yes to be + * sure (missing info is worse than too much; etc). + * Personally I'd include everything, and use the coredump limit... + * + * I think we should skip something. But I am not sure how. H.J. + */ +static int maydump(struct vm_area_struct *vma) +{ + /* Do not dump I/O mapped devices or special mappings */ + if (vma->vm_flags & (VM_IO | VM_RESERVED)) { + kdcore("%08lx: %08lx: no (IO)", vma->vm_start, vma->vm_flags); + return 0; + } + + /* If we may not read the contents, don't allow us to dump + * them either. "dump_write()" can't handle it anyway. + */ + if (!(vma->vm_flags & VM_READ)) { + kdcore("%08lx: %08lx: no (!read)", vma->vm_start, vma->vm_flags); + return 0; + } + + /* Dump shared memory only if mapped from an anonymous file. 
*/ + if (vma->vm_flags & VM_SHARED) { + if (vma->vm_file->f_dentry->d_inode->i_nlink == 0) { + kdcore("%08lx: %08lx: no (share)", vma->vm_start, vma->vm_flags); + return 1; + } + + kdcore("%08lx: %08lx: no (share)", vma->vm_start, vma->vm_flags); + return 0; + } + +#ifdef CONFIG_MMU + /* If it hasn't been written to, don't write it out */ + if (!vma->anon_vma) { + kdcore("%08lx: %08lx: no (!anon)", vma->vm_start, vma->vm_flags); + return 0; + } +#endif + + kdcore("%08lx: %08lx: yes", vma->vm_start, vma->vm_flags); + return 1; +} + +/* An ELF note in memory */ +struct memelfnote +{ + const char *name; + int type; + unsigned int datasz; + void *data; +}; + +static int notesize(struct memelfnote *en) +{ + int sz; + + sz = sizeof(struct elf_note); + sz += roundup(strlen(en->name) + 1, 4); + sz += roundup(en->datasz, 4); + + return sz; +} + +/* #define DEBUG */ + +#define DUMP_WRITE(addr, nr) \ + do { if (!dump_write(file, (addr), (nr))) return 0; } while(0) +#define DUMP_SEEK(off) \ + do { if (!dump_seek(file, (off))) return 0; } while(0) + +static int writenote(struct memelfnote *men, struct file *file) +{ + struct elf_note en; + + en.n_namesz = strlen(men->name) + 1; + en.n_descsz = men->datasz; + en.n_type = men->type; + + DUMP_WRITE(&en, sizeof(en)); + DUMP_WRITE(men->name, en.n_namesz); + /* XXX - cast from long long to long to avoid need for libgcc.a */ + DUMP_SEEK(roundup((unsigned long)file->f_pos, 4)); /* XXX */ + DUMP_WRITE(men->data, men->datasz); + DUMP_SEEK(roundup((unsigned long)file->f_pos, 4)); /* XXX */ + + return 1; +} +#undef DUMP_WRITE +#undef DUMP_SEEK + +#define DUMP_WRITE(addr, nr) \ + if ((size += (nr)) > limit || !dump_write(file, (addr), (nr))) \ + goto end_coredump; +#define DUMP_SEEK(off) \ + if (!dump_seek(file, (off))) \ + goto end_coredump; + +static inline void fill_elf_fdpic_header(struct elfhdr *elf, int segs) +{ + memcpy(elf->e_ident, ELFMAG, SELFMAG); + elf->e_ident[EI_CLASS] = ELF_CLASS; + elf->e_ident[EI_DATA] = ELF_DATA; + elf->e_ident[EI_VERSION] = EV_CURRENT; + elf->e_ident[EI_OSABI] = ELF_OSABI; + memset(elf->e_ident+EI_PAD, 0, EI_NIDENT-EI_PAD); + + elf->e_type = ET_CORE; + elf->e_machine = ELF_ARCH; + elf->e_version = EV_CURRENT; + elf->e_entry = 0; + elf->e_phoff = sizeof(struct elfhdr); + elf->e_shoff = 0; + elf->e_flags = ELF_FDPIC_CORE_EFLAGS; + elf->e_ehsize = sizeof(struct elfhdr); + elf->e_phentsize = sizeof(struct elf_phdr); + elf->e_phnum = segs; + elf->e_shentsize = 0; + elf->e_shnum = 0; + elf->e_shstrndx = 0; + return; +} + +static inline void fill_elf_note_phdr(struct elf_phdr *phdr, int sz, loff_t offset) +{ + phdr->p_type = PT_NOTE; + phdr->p_offset = offset; + phdr->p_vaddr = 0; + phdr->p_paddr = 0; + phdr->p_filesz = sz; + phdr->p_memsz = 0; + phdr->p_flags = 0; + phdr->p_align = 0; + return; +} + +static inline void fill_note(struct memelfnote *note, const char *name, int type, + unsigned int sz, void *data) +{ + note->name = name; + note->type = type; + note->datasz = sz; + note->data = data; + return; +} + +/* + * fill up all the fields in prstatus from the given task struct, except + * registers which need to be filled up seperately. 
+ */ +static void fill_prstatus(struct elf_prstatus *prstatus, + struct task_struct *p, long signr) +{ + prstatus->pr_info.si_signo = prstatus->pr_cursig = signr; + prstatus->pr_sigpend = p->pending.signal.sig[0]; + prstatus->pr_sighold = p->blocked.sig[0]; + prstatus->pr_pid = p->pid; + prstatus->pr_ppid = p->parent->pid; + prstatus->pr_pgrp = process_group(p); + prstatus->pr_sid = p->signal->session; + if (thread_group_leader(p)) { + /* + * This is the record for the group leader. Add in the + * cumulative times of previous dead threads. This total + * won't include the time of each live thread whose state + * is included in the core dump. The final total reported + * to our parent process when it calls wait4 will include + * those sums as well as the little bit more time it takes + * this and each other thread to finish dying after the + * core dump synchronization phase. + */ + cputime_to_timeval(cputime_add(p->utime, p->signal->utime), + &prstatus->pr_utime); + cputime_to_timeval(cputime_add(p->stime, p->signal->stime), + &prstatus->pr_stime); + } else { + cputime_to_timeval(p->utime, &prstatus->pr_utime); + cputime_to_timeval(p->stime, &prstatus->pr_stime); + } + cputime_to_timeval(p->signal->cutime, &prstatus->pr_cutime); + cputime_to_timeval(p->signal->cstime, &prstatus->pr_cstime); + + prstatus->pr_exec_fdpic_loadmap = p->mm->context.exec_fdpic_loadmap; + prstatus->pr_interp_fdpic_loadmap = p->mm->context.interp_fdpic_loadmap; +} + +static int fill_psinfo(struct elf_prpsinfo *psinfo, struct task_struct *p, + struct mm_struct *mm) +{ + unsigned int i, len; + + /* first copy the parameters from user space */ + memset(psinfo, 0, sizeof(struct elf_prpsinfo)); + + len = mm->arg_end - mm->arg_start; + if (len >= ELF_PRARGSZ) + len = ELF_PRARGSZ - 1; + if (copy_from_user(&psinfo->pr_psargs, + (const char __user *) mm->arg_start, len)) + return -EFAULT; + for (i = 0; i < len; i++) + if (psinfo->pr_psargs[i] == 0) + psinfo->pr_psargs[i] = ' '; + psinfo->pr_psargs[len] = 0; + + psinfo->pr_pid = p->pid; + psinfo->pr_ppid = p->parent->pid; + psinfo->pr_pgrp = process_group(p); + psinfo->pr_sid = p->signal->session; + + i = p->state ? ffz(~p->state) + 1 : 0; + psinfo->pr_state = i; + psinfo->pr_sname = (i > 5) ? '.' : "RSDTZW"[i]; + psinfo->pr_zomb = psinfo->pr_sname == 'Z'; + psinfo->pr_nice = task_nice(p); + psinfo->pr_flag = p->flags; + SET_UID(psinfo->pr_uid, p->uid); + SET_GID(psinfo->pr_gid, p->gid); + strncpy(psinfo->pr_fname, p->comm, sizeof(psinfo->pr_fname)); + + return 0; +} + +/* Here is the structure in which status of each thread is captured. */ +struct elf_thread_status +{ + struct list_head list; + struct elf_prstatus prstatus; /* NT_PRSTATUS */ + elf_fpregset_t fpu; /* NT_PRFPREG */ + struct task_struct *thread; +#ifdef ELF_CORE_COPY_XFPREGS + elf_fpxregset_t xfpu; /* NT_PRXFPREG */ +#endif + struct memelfnote notes[3]; + int num_notes; +}; + +/* + * In order to add the specific thread information for the elf file format, + * we need to keep a linked list of every thread's pr_status and then create + * a single section for them in the final core file. 
+ */ +static int elf_dump_thread_status(long signr, struct elf_thread_status *t) +{ + struct task_struct *p = t->thread; + int sz = 0; + + t->num_notes = 0; + + fill_prstatus(&t->prstatus, p, signr); + elf_core_copy_task_regs(p, &t->prstatus.pr_reg); + + fill_note(&t->notes[0], "CORE", NT_PRSTATUS, sizeof(t->prstatus), + &t->prstatus); + t->num_notes++; + sz += notesize(&t->notes[0]); + + t->prstatus.pr_fpvalid = elf_core_copy_task_fpregs(p, NULL, &t->fpu); + if (t->prstatus.pr_fpvalid) { + fill_note(&t->notes[1], "CORE", NT_PRFPREG, sizeof(t->fpu), + &t->fpu); + t->num_notes++; + sz += notesize(&t->notes[1]); + } + +#ifdef ELF_CORE_COPY_XFPREGS + if (elf_core_copy_task_xfpregs(p, &t->xfpu)) { + fill_note(&t->notes[2], "LINUX", NT_PRXFPREG, sizeof(t->xfpu), + &t->xfpu); + t->num_notes++; + sz += notesize(&t->notes[2]); + } +#endif + return sz; +} + +/* + * dump the segments for an MMU process + */ +#ifdef CONFIG_MMU +static int elf_fdpic_dump_segments(struct file *file, struct mm_struct *mm, + size_t *size, unsigned long *limit) +{ + struct vm_area_struct *vma; + + for (vma = current->mm->mmap; vma; vma = vma->vm_next) { + unsigned long addr; + + if (!maydump(vma)) + continue; + + for (addr = vma->vm_start; + addr < vma->vm_end; + addr += PAGE_SIZE + ) { + struct vm_area_struct *vma; + struct page *page; + + if (get_user_pages(current, current->mm, addr, 1, 0, 1, + &page, &vma) <= 0) { + DUMP_SEEK(file->f_pos + PAGE_SIZE); + } + else if (page == ZERO_PAGE(addr)) { + DUMP_SEEK(file->f_pos + PAGE_SIZE); + page_cache_release(page); + } + else { + void *kaddr; + + flush_cache_page(vma, addr, page_to_pfn(page)); + kaddr = kmap(page); + if ((*size += PAGE_SIZE) > *limit || + !dump_write(file, kaddr, PAGE_SIZE) + ) { + kunmap(page); + page_cache_release(page); + return -EIO; + } + kunmap(page); + page_cache_release(page); + } + } + } + + return 0; + +end_coredump: + return -EFBIG; +} +#endif + +/* + * dump the segments for a NOMMU process + */ +#ifndef CONFIG_MMU +static int elf_fdpic_dump_segments(struct file *file, struct mm_struct *mm, + size_t *size, unsigned long *limit) +{ + struct vm_list_struct *vml; + + for (vml = current->mm->context.vmlist; vml; vml = vml->next) { + struct vm_area_struct *vma = vml->vma; + + if (!maydump(vma)) + continue; + + if ((*size += PAGE_SIZE) > *limit) + return -EFBIG; + + if (!dump_write(file, (void *) vma->vm_start, + vma->vm_end - vma->vm_start)) + return -EIO; + } + + return 0; +} +#endif + +/* + * Actual dumper + * + * This is a two-pass process; first we find the offsets of the bits, + * and then they are actually written out. If we run out of core limit + * we just truncate. + */ +static int elf_fdpic_core_dump(long signr, struct pt_regs *regs, + struct file *file) +{ +#define NUM_NOTES 6 + int has_dumped = 0; + mm_segment_t fs; + int segs; + size_t size = 0; + int i; + struct vm_area_struct *vma; + struct elfhdr *elf = NULL; + loff_t offset = 0, dataoff; + unsigned long limit = current->signal->rlim[RLIMIT_CORE].rlim_cur; + int numnote; + struct memelfnote *notes = NULL; + struct elf_prstatus *prstatus = NULL; /* NT_PRSTATUS */ + struct elf_prpsinfo *psinfo = NULL; /* NT_PRPSINFO */ + struct task_struct *g, *p; + LIST_HEAD(thread_list); + struct list_head *t; + elf_fpregset_t *fpu = NULL; +#ifdef ELF_CORE_COPY_XFPREGS + elf_fpxregset_t *xfpu = NULL; +#endif + int thread_status_size = 0; +#ifndef CONFIG_MMU + struct vm_list_struct *vml; +#endif + elf_addr_t *auxv; + + /* + * We no longer stop all VM operations. 
+ * + * This is because those proceses that could possibly change map_count + * or the mmap / vma pages are now blocked in do_exit on current + * finishing this core dump. + * + * Only ptrace can touch these memory addresses, but it doesn't change + * the map_count or the pages allocated. So no possibility of crashing + * exists while dumping the mm->vm_next areas to the core file. + */ + + /* alloc memory for large data structures: too large to be on stack */ + elf = kmalloc(sizeof(*elf), GFP_KERNEL); + if (!elf) + goto cleanup; + prstatus = kzalloc(sizeof(*prstatus), GFP_KERNEL); + if (!prstatus) + goto cleanup; + psinfo = kmalloc(sizeof(*psinfo), GFP_KERNEL); + if (!psinfo) + goto cleanup; + notes = kmalloc(NUM_NOTES * sizeof(struct memelfnote), GFP_KERNEL); + if (!notes) + goto cleanup; + fpu = kmalloc(sizeof(*fpu), GFP_KERNEL); + if (!fpu) + goto cleanup; +#ifdef ELF_CORE_COPY_XFPREGS + xfpu = kmalloc(sizeof(*xfpu), GFP_KERNEL); + if (!xfpu) + goto cleanup; +#endif + + if (signr) { + struct elf_thread_status *tmp; + read_lock(&tasklist_lock); + do_each_thread(g,p) + if (current->mm == p->mm && current != p) { + tmp = kzalloc(sizeof(*tmp), GFP_ATOMIC); + if (!tmp) { + read_unlock(&tasklist_lock); + goto cleanup; + } + INIT_LIST_HEAD(&tmp->list); + tmp->thread = p; + list_add(&tmp->list, &thread_list); + } + while_each_thread(g,p); + read_unlock(&tasklist_lock); + list_for_each(t, &thread_list) { + struct elf_thread_status *tmp; + int sz; + + tmp = list_entry(t, struct elf_thread_status, list); + sz = elf_dump_thread_status(signr, tmp); + thread_status_size += sz; + } + } + + /* now collect the dump for the current */ + fill_prstatus(prstatus, current, signr); + elf_core_copy_regs(&prstatus->pr_reg, regs); + +#ifdef CONFIG_MMU + segs = current->mm->map_count; +#else + segs = 0; + for (vml = current->mm->context.vmlist; vml; vml = vml->next) + segs++; +#endif +#ifdef ELF_CORE_EXTRA_PHDRS + segs += ELF_CORE_EXTRA_PHDRS; +#endif + + /* Set up header */ + fill_elf_fdpic_header(elf, segs + 1); /* including notes section */ + + has_dumped = 1; + current->flags |= PF_DUMPCORE; + + /* + * Set up the notes in similar form to SVR4 core dumps made + * with info from their /proc. + */ + + fill_note(notes + 0, "CORE", NT_PRSTATUS, sizeof(*prstatus), prstatus); + fill_psinfo(psinfo, current->group_leader, current->mm); + fill_note(notes + 1, "CORE", NT_PRPSINFO, sizeof(*psinfo), psinfo); + + numnote = 2; + + auxv = (elf_addr_t *) current->mm->saved_auxv; + + i = 0; + do + i += 2; + while (auxv[i - 2] != AT_NULL); + fill_note(¬es[numnote++], "CORE", NT_AUXV, + i * sizeof(elf_addr_t), auxv); + + /* Try to dump the FPU. 
*/ + if ((prstatus->pr_fpvalid = + elf_core_copy_task_fpregs(current, regs, fpu))) + fill_note(notes + numnote++, + "CORE", NT_PRFPREG, sizeof(*fpu), fpu); +#ifdef ELF_CORE_COPY_XFPREGS + if (elf_core_copy_task_xfpregs(current, xfpu)) + fill_note(notes + numnote++, + "LINUX", NT_PRXFPREG, sizeof(*xfpu), xfpu); +#endif + + fs = get_fs(); + set_fs(KERNEL_DS); + + DUMP_WRITE(elf, sizeof(*elf)); + offset += sizeof(*elf); /* Elf header */ + offset += (segs+1) * sizeof(struct elf_phdr); /* Program headers */ + + /* Write notes phdr entry */ + { + struct elf_phdr phdr; + int sz = 0; + + for (i = 0; i < numnote; i++) + sz += notesize(notes + i); + + sz += thread_status_size; + + fill_elf_note_phdr(&phdr, sz, offset); + offset += sz; + DUMP_WRITE(&phdr, sizeof(phdr)); + } + + /* Page-align dumped data */ + dataoff = offset = roundup(offset, ELF_EXEC_PAGESIZE); + + /* write program headers for segments dump */ + for ( +#ifdef CONFIG_MMU + vma = current->mm->mmap; vma; vma = vma->vm_next +#else + vml = current->mm->context.vmlist; vml; vml = vml->next +#endif + ) { + struct elf_phdr phdr; + size_t sz; + +#ifndef CONFIG_MMU + vma = vml->vma; +#endif + + sz = vma->vm_end - vma->vm_start; + + phdr.p_type = PT_LOAD; + phdr.p_offset = offset; + phdr.p_vaddr = vma->vm_start; + phdr.p_paddr = 0; + phdr.p_filesz = maydump(vma) ? sz : 0; + phdr.p_memsz = sz; + offset += phdr.p_filesz; + phdr.p_flags = vma->vm_flags & VM_READ ? PF_R : 0; + if (vma->vm_flags & VM_WRITE) + phdr.p_flags |= PF_W; + if (vma->vm_flags & VM_EXEC) + phdr.p_flags |= PF_X; + phdr.p_align = ELF_EXEC_PAGESIZE; + + DUMP_WRITE(&phdr, sizeof(phdr)); + } + +#ifdef ELF_CORE_WRITE_EXTRA_PHDRS + ELF_CORE_WRITE_EXTRA_PHDRS; +#endif + + /* write out the notes section */ + for (i = 0; i < numnote; i++) + if (!writenote(notes + i, file)) + goto end_coredump; + + /* write out the thread status notes section */ + list_for_each(t, &thread_list) { + struct elf_thread_status *tmp = + list_entry(t, struct elf_thread_status, list); + + for (i = 0; i < tmp->num_notes; i++) + if (!writenote(&tmp->notes[i], file)) + goto end_coredump; + } + + DUMP_SEEK(dataoff); + + if (elf_fdpic_dump_segments(file, current->mm, &size, &limit) < 0) + goto end_coredump; + +#ifdef ELF_CORE_WRITE_EXTRA_DATA + ELF_CORE_WRITE_EXTRA_DATA; +#endif + + if (file->f_pos != offset) { + /* Sanity check */ + printk(KERN_WARNING + "elf_core_dump: file->f_pos (%lld) != offset (%lld)\n", + file->f_pos, offset); + } + +end_coredump: + set_fs(fs); + +cleanup: + while (!list_empty(&thread_list)) { + struct list_head *tmp = thread_list.next; + list_del(tmp); + kfree(list_entry(tmp, struct elf_thread_status, list)); + } + + kfree(elf); + kfree(prstatus); + kfree(psinfo); + kfree(notes); + kfree(fpu); +#ifdef ELF_CORE_COPY_XFPREGS + kfree(xfpu); +#endif + return has_dumped; +#undef NUM_NOTES +} + +#endif /* USE_ELF_CORE_DUMP */ diff --git a/include/asm-frv/elf.h b/include/asm-frv/elf.h index 38656da00e40..7df58a3e6e4a 100644 --- a/include/asm-frv/elf.h +++ b/include/asm-frv/elf.h @@ -64,7 +64,7 @@ typedef unsigned long elf_greg_t; #define ELF_NGREG (sizeof(struct pt_regs) / sizeof(elf_greg_t)) typedef elf_greg_t elf_gregset_t[ELF_NGREG]; -typedef struct fpmedia_struct elf_fpregset_t; +typedef struct user_fpmedia_regs elf_fpregset_t; /* * This is used to ensure we don't load something for the wrong architecture. 
@@ -116,6 +116,7 @@ do { \ } while(0) #define USE_ELF_CORE_DUMP +#define ELF_FDPIC_CORE_EFLAGS EF_FRV_FDPIC #define ELF_EXEC_PAGESIZE 16384 /* This is the location that an ET_DYN program is loaded if exec'ed. Typical @@ -125,9 +126,6 @@ do { \ #define ELF_ET_DYN_BASE 0x08000000UL -#define ELF_CORE_COPY_REGS(pr_reg, regs) \ - memcpy(&pr_reg[0], ®s->sp, 31 * sizeof(uint32_t)); - /* This yields a mask that user programs can use to figure out what instruction set this cpu supports. */ diff --git a/include/linux/elfcore.h b/include/linux/elfcore.h index 0cf0bea010fe..9631dddae348 100644 --- a/include/linux/elfcore.h +++ b/include/linux/elfcore.h @@ -60,6 +60,16 @@ struct elf_prstatus long pr_instr; /* Current instruction */ #endif elf_gregset_t pr_reg; /* GP registers */ +#ifdef CONFIG_BINFMT_ELF_FDPIC + /* When using FDPIC, the loadmap addresses need to be communicated + * to GDB in order for GDB to do the necessary relocations. The + * fields (below) used to communicate this information are placed + * immediately after ``pr_reg'', so that the loadmap addresses may + * be viewed as part of the register set if so desired. + */ + unsigned long pr_exec_fdpic_loadmap; + unsigned long pr_interp_fdpic_loadmap; +#endif int pr_fpvalid; /* True if math co-processor being used. */ }; -- cgit v1.2.3 From 92eb7a2f28d551acedeb5752263267a64b1f5ddf Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Mon, 10 Jul 2006 04:45:31 -0700 Subject: [PATCH] fix weird logic in alloc_fdtable() There's a fairly obvious infinite loop in there. Also, use roundup_pow_of_two() rather than open-coding stuff. Cc: Eric Dumazet Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/file.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/file.c b/fs/file.c index 55f4e7022563..3f356086061d 100644 --- a/fs/file.c +++ b/fs/file.c @@ -240,13 +240,9 @@ static struct fdtable *alloc_fdtable(int nr) if (!fdt) goto out; - nfds = 8 * L1_CACHE_BYTES; - /* Expand to the max in easy steps */ - while (nfds <= nr) { - nfds = nfds * 2; - if (nfds > NR_OPEN) - nfds = NR_OPEN; - } + nfds = max_t(int, 8 * L1_CACHE_BYTES, roundup_pow_of_two(nfds)); + if (nfds > NR_OPEN) + nfds = NR_OPEN; new_openset = alloc_fdset(nfds); new_execset = alloc_fdset(nfds); -- cgit v1.2.3 From 36cf96f5e7c098731a1ad9d79694d6f591b18e7f Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Mon, 10 Jul 2006 04:45:33 -0700 Subject: [PATCH] Remove leftover ext3 acl declarations These functions no longer exist; remove their declarations. Signed-off-by: Andreas Gruenbacher Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ext3/acl.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'fs') diff --git a/fs/ext3/acl.h b/fs/ext3/acl.h index 92d50b53a933..0d1e6279cbfd 100644 --- a/fs/ext3/acl.h +++ b/fs/ext3/acl.h @@ -62,9 +62,6 @@ extern int ext3_permission (struct inode *, int, struct nameidata *); extern int ext3_acl_chmod (struct inode *); extern int ext3_init_acl (handle_t *, struct inode *, struct inode *); -extern int init_ext3_acl(void); -extern void exit_ext3_acl(void); - #else /* CONFIG_EXT3_FS_POSIX_ACL */ #include #define ext3_permission NULL -- cgit v1.2.3 From e2b209509ca33743864846aef2e1b2afc21f7915 Mon Sep 17 00:00:00 2001 From: Shankar Anand Date: Mon, 10 Jul 2006 04:45:44 -0700 Subject: [PATCH] knfsd: nfsd4: add per-operation server stats Add an nfs4 operations count array to nfsd_stats structure. 
The count is incremented in nfsd4_proc_compound() where all the operations are handled by the nfsv4 server. This count of individual nfsv4 operations is also entered into /proc filesystem. Signed-off-by: Shankar Anand Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nfsd/nfs4proc.c | 8 ++++++++ fs/nfsd/stats.c | 10 ++++++++++ include/linux/nfs4.h | 6 ++++++ include/linux/nfsd/stats.h | 6 ++++++ 4 files changed, 30 insertions(+) (limited to 'fs') diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index b0e095ea0c03..ee4eff27aedc 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -721,6 +721,12 @@ nfsd4_proc_null(struct svc_rqst *rqstp, void *argp, void *resp) return nfs_ok; } +static inline void nfsd4_increment_op_stats(u32 opnum) +{ + if (opnum >= FIRST_NFS4_OP && opnum <= LAST_NFS4_OP) + nfsdstats.nfs4_opcount[opnum]++; +} + /* * COMPOUND call. @@ -930,6 +936,8 @@ encode_op: /* XXX Ugh, we need to get rid of this kind of special case: */ if (op->opnum == OP_READ && op->u.read.rd_filp) fput(op->u.read.rd_filp); + + nfsd4_increment_op_stats(op->opnum); } out: diff --git a/fs/nfsd/stats.c b/fs/nfsd/stats.c index 57265d563804..71944cddf680 100644 --- a/fs/nfsd/stats.c +++ b/fs/nfsd/stats.c @@ -72,6 +72,16 @@ static int nfsd_proc_show(struct seq_file *seq, void *v) /* show my rpc info */ svc_seq_show(seq, &nfsd_svcstats); +#ifdef CONFIG_NFSD_V4 + /* Show count for individual nfsv4 operations */ + /* Writing operation numbers 0 1 2 also for maintaining uniformity */ + seq_printf(seq,"proc4ops %u", LAST_NFS4_OP + 1); + for (i = 0; i <= LAST_NFS4_OP; i++) + seq_printf(seq, " %u", nfsdstats.nfs4_opcount[i]); + + seq_putc(seq, '\n'); +#endif + return 0; } diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h index 5f681d534295..db05182ca0e8 100644 --- a/include/linux/nfs4.h +++ b/include/linux/nfs4.h @@ -157,6 +157,12 @@ enum nfs_opnum4 { OP_ILLEGAL = 10044, }; +/*Defining first and last NFS4 operations implemented. +Needs to be updated if more operations are defined in future.*/ + +#define FIRST_NFS4_OP OP_ACCESS +#define LAST_NFS4_OP OP_RELEASE_LOCKOWNER + enum nfsstat4 { NFS4_OK = 0, NFS4ERR_PERM = 1, diff --git a/include/linux/nfsd/stats.h b/include/linux/nfsd/stats.h index b6f1e0cda4f2..28a82fdd922f 100644 --- a/include/linux/nfsd/stats.h +++ b/include/linux/nfsd/stats.h @@ -9,6 +9,8 @@ #ifndef LINUX_NFSD_STATS_H #define LINUX_NFSD_STATS_H +#include + struct nfsd_stats { unsigned int rchits; /* repcache hits */ unsigned int rcmisses; /* repcache hits */ @@ -27,6 +29,10 @@ struct nfsd_stats { unsigned int ra_size; /* size of ra cache */ unsigned int ra_depth[11]; /* number of times ra entry was found that deep * in the cache (10percentiles). [10] = not found */ +#ifdef CONFIG_NFSD_V4 + unsigned int nfs4_opcount[LAST_NFS4_OP + 1]; /* count of individual nfsv4 operations */ +#endif + }; /* thread usage wraps very million seconds (approx one fortnight) */ -- cgit v1.2.3 From d579091b4385e9386e244622d593fe064aa8e8e7 Mon Sep 17 00:00:00 2001 From: Kirill Korotaev Date: Wed, 12 Jul 2006 09:03:05 -0700 Subject: [PATCH] fix fdset leakage When found, it is obvious. nfds calculated when allocating fdsets is rewritten by calculation of size of fdtable, and when we are unlucky, we try to free fdsets of wrong size. Found due to OpenVZ resource management (User Beancounters). 
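The pattern is easy to miss in review; in sketch form (all names made up, sized_alloc()/sized_free() stand in for the fdset helpers, which must be given the same size at free time that was used at allocation time):

    void *sized_alloc(unsigned long size);
    void sized_free(void *ptr, unsigned long size);  /* needs the allocation size */

    static int build_table(unsigned long nr)
    {
            unsigned long nfds = 8 * 64;        /* size used for the fd sets */
            void *set = sized_alloc(nfds);
            if (!set)
                    return -1;

            nfds = nr + 1;                      /* reused: now the fd array size */
            void *arr = sized_alloc(nfds);
            if (!arr) {
                    /* BUG: nfds no longer describes 'set'; the fix below
                     * restores the original set size before the sized free */
                    sized_free(set, nfds);
                    return -1;
            }
            /* ... install set and arr ... */
            return 0;
    }
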
Signed-off-by: Alexey Kuznetsov Signed-off-by: Kirill Korotaev Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/file.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/file.c b/fs/file.c index 3f356086061d..c8f1b0af8e00 100644 --- a/fs/file.c +++ b/fs/file.c @@ -273,11 +273,13 @@ static struct fdtable *alloc_fdtable(int nr) } while (nfds <= nr); new_fds = alloc_fd_array(nfds); if (!new_fds) - goto out; + goto out2; fdt->fd = new_fds; fdt->max_fds = nfds; fdt->free_files = NULL; return fdt; +out2: + nfds = fdt->max_fdset; out: if (new_openset) free_fdset(new_openset, nfds); -- cgit v1.2.3 From 232ba9dbd68bb084d5d90c511f207d18eae614da Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Wed, 12 Jul 2006 09:03:06 -0700 Subject: [PATCH] lockdep: annotate the sysfs i_mutex to be a separate class sysfs has a different i_mutex lock order behavior for i_mutex than the other filesystems; sysfs i_mutex is called in many places with subsystem locks held. At the same time, many of the VFS locking rules do not apply to sysfs at all (cross directory rename for example). To untangle this mess (which gives false positives in lockdep), we're giving sysfs inodes their own class for i_mutex. Signed-off-by: Arjan van de Ven Cc: Ingo Molnar Cc: Greg KH Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/sysfs/inode.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'fs') diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c index 5e0e31cc46f5..9889e54e1f13 100644 --- a/fs/sysfs/inode.c +++ b/fs/sysfs/inode.c @@ -109,6 +109,17 @@ static inline void set_inode_attr(struct inode * inode, struct iattr * iattr) inode->i_ctime = iattr->ia_ctime; } + +/* + * sysfs has a different i_mutex lock order behavior for i_mutex than other + * filesystems; sysfs i_mutex is called in many places with subsystem locks + * held. At the same time, many of the VFS locking rules do not apply to + * sysfs at all (cross directory rename for example). To untangle this mess + * (which gives false positives in lockdep), we're giving sysfs inodes their + * own class for i_mutex. + */ +static struct lock_class_key sysfs_inode_imutex_key; + struct inode * sysfs_new_inode(mode_t mode, struct sysfs_dirent * sd) { struct inode * inode = new_inode(sysfs_sb); @@ -118,6 +129,7 @@ struct inode * sysfs_new_inode(mode_t mode, struct sysfs_dirent * sd) inode->i_mapping->a_ops = &sysfs_aops; inode->i_mapping->backing_dev_info = &sysfs_backing_dev_info; inode->i_op = &sysfs_inode_operations; + lockdep_set_class(&inode->i_mutex, &sysfs_inode_imutex_key); if (sd->s_iattr) { /* sysfs_dirent has non-default attributes -- cgit v1.2.3 From 0635170b544b01b46a81b4ac5cff5020ab59d1fc Mon Sep 17 00:00:00 2001 From: "Adam B. Jerome" Date: Wed, 12 Jul 2006 09:03:07 -0700 Subject: [PATCH] /fs/proc/: 'larger than buffer size' memory accessed by clear_user() Address a potential 'larger than buffer size' memory access by clear_user(). Without this patch, this call to clear_user() can attempt to clear too many (tsz) bytes resulting in a wrong (-EFAULT) return code by read_kcore(). Signed-off-by: Adam B. 
Jerome Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/kcore.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index 8d6d85d7400f..6a984f64edd7 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -382,7 +382,7 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos) */ if (n) { if (clear_user(buffer + tsz - n, - tsz - n)) + n)) return -EFAULT; } } else { -- cgit v1.2.3 From a29b0b74e73b66674d20a170e463fe9032f2272a Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 12 Jul 2006 09:03:08 -0700 Subject: [PATCH] alloc_fdtable() expansion fix We're supposed to go the next power of two if nfds==nr. Of `nr', not of `nfsd'. Spotted by Rene Scharfe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/file.c b/fs/file.c index c8f1b0af8e00..b3c6b82e6a9d 100644 --- a/fs/file.c +++ b/fs/file.c @@ -240,7 +240,7 @@ static struct fdtable *alloc_fdtable(int nr) if (!fdt) goto out; - nfds = max_t(int, 8 * L1_CACHE_BYTES, roundup_pow_of_two(nfds)); + nfds = max_t(int, 8 * L1_CACHE_BYTES, roundup_pow_of_two(nr + 1)); if (nfds > NR_OPEN) nfds = NR_OPEN; -- cgit v1.2.3 From 18b0bbd8ca6d3cb90425aa0d77b99a762c6d6de3 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 14 Jul 2006 16:51:34 -0700 Subject: Fix nasty /proc vulnerability We have a bad interaction with both the kernel and user space being able to change some of the /proc file status. This fixes the most obvious part of it, but I expect we'll also make it harder for users to modify even their "own" files in /proc. Signed-off-by: Linus Torvalds --- fs/proc/base.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/proc/base.c b/fs/proc/base.c index 243a94af0427..0cb8f20d000c 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1338,6 +1338,7 @@ static int pid_revalidate(struct dentry *dentry, struct nameidata *nd) } else { inode->i_uid = 0; inode->i_gid = 0; + inode->i_mode = 0; } security_task_to_inode(task, inode); put_task_struct(task); -- cgit v1.2.3 From 9ee8ab9fbf21e6b87ad227cd46c0a4be41ab749b Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 14 Jul 2006 21:48:03 -0700 Subject: Relax /proc fix a bit Clearign all of i_mode was a bit draconian. We only really care about S_ISUID/ISGID, after all. Signed-off-by: Linus Torvalds --- fs/proc/base.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/proc/base.c b/fs/proc/base.c index 0cb8f20d000c..474eae345068 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1338,8 +1338,8 @@ static int pid_revalidate(struct dentry *dentry, struct nameidata *nd) } else { inode->i_uid = 0; inode->i_gid = 0; - inode->i_mode = 0; } + inode->i_mode &= ~(S_ISUID | S_ISGID); security_task_to_inode(task, inode); put_task_struct(task); return 1; @@ -1390,6 +1390,7 @@ static int tid_fd_revalidate(struct dentry *dentry, struct nameidata *nd) inode->i_uid = 0; inode->i_gid = 0; } + inode->i_mode &= ~(S_ISUID | S_ISGID); security_task_to_inode(task, inode); put_task_struct(task); return 1; -- cgit v1.2.3 From de45921535bfc3b1f63b426c2a9739635f864283 Mon Sep 17 00:00:00 2001 From: Kirill Korotaev Date: Fri, 14 Jul 2006 00:23:49 -0700 Subject: [PATCH] struct file leakage 2.6.16 leaks like hell. 
While testing, I found massive leakage (reproduced in openvz) in: *filp *size-4096 And 1 object leaks in *size-32 *size-64 *size-128 It is the fix for the first one. filp leaks in the bowels of namei.c. Seems, size-4096 is file table leaking in expand_fdtables. I have no idea what are the rest and why they show only accompanying another leaks. Some debugging structs? [akpm@osdl.org, Trond: remove the IS_ERR() check] Signed-off-by: Alexey Kuznetsov Cc: Kirill Korotaev Cc: Cc: Trond Myklebust Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/namei.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/namei.c b/fs/namei.c index c9750d755aff..e01070d7bf58 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1712,8 +1712,14 @@ do_link: if (error) goto exit_dput; error = __do_follow_link(&path, nd); - if (error) + if (error) { + /* Does someone understand code flow here? Or it is only + * me so stupid? Anathema to whoever designed this non-sense + * with "intent.open". + */ + release_open_intent(nd); return error; + } nd->flags &= ~LOOKUP_PARENT; if (nd->last_type == LAST_BIND) goto ok; -- cgit v1.2.3 From 6fbe82a952790c634ea6035c223a01a81377daf1 Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Fri, 14 Jul 2006 00:24:22 -0700 Subject: [PATCH] reiserfs: fix handling of device names with /'s in them On systems with block devices containing a slash (virtual dasd, cciss, etc), reiserfs will fail to initialize /proc/fs/reiserfs/ due to it being interpreted as a subdirectory. The generic block device code changes the / to ! for use in the sysfs tree. This patch uses that convention. Tested by making dm devices use dm/ rather than dm- [akpm@osdl.org: name variables consistently] Signed-off-by: Jeff Mahoney Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/reiserfs/procfs.c | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/reiserfs/procfs.c b/fs/reiserfs/procfs.c index 5d8a8cfebc70..c533ec1bcaec 100644 --- a/fs/reiserfs/procfs.c +++ b/fs/reiserfs/procfs.c @@ -492,9 +492,17 @@ static void add_file(struct super_block *sb, char *name, int reiserfs_proc_info_init(struct super_block *sb) { + char b[BDEVNAME_SIZE]; + char *s; + + /* Some block devices use /'s */ + strlcpy(b, reiserfs_bdevname(sb), BDEVNAME_SIZE); + s = strchr(b, '/'); + if (s) + *s = '!'; + spin_lock_init(&__PINFO(sb).lock); - REISERFS_SB(sb)->procdir = - proc_mkdir(reiserfs_bdevname(sb), proc_info_root); + REISERFS_SB(sb)->procdir = proc_mkdir(b, proc_info_root); if (REISERFS_SB(sb)->procdir) { REISERFS_SB(sb)->procdir->owner = THIS_MODULE; REISERFS_SB(sb)->procdir->data = sb; @@ -508,13 +516,22 @@ int reiserfs_proc_info_init(struct super_block *sb) return 0; } reiserfs_warning(sb, "reiserfs: cannot create /proc/%s/%s", - proc_info_root_name, reiserfs_bdevname(sb)); + proc_info_root_name, b); return 1; } int reiserfs_proc_info_done(struct super_block *sb) { struct proc_dir_entry *de = REISERFS_SB(sb)->procdir; + char b[BDEVNAME_SIZE]; + char *s; + + /* Some block devices use /'s */ + strlcpy(b, reiserfs_bdevname(sb), BDEVNAME_SIZE); + s = strchr(b, '/'); + if (s) + *s = '!'; + if (de) { remove_proc_entry("journal", de); remove_proc_entry("oidmap", de); @@ -528,7 +545,7 @@ int reiserfs_proc_info_done(struct super_block *sb) __PINFO(sb).exiting = 1; spin_unlock(&__PINFO(sb).lock); if (proc_info_root) { - remove_proc_entry(reiserfs_bdevname(sb), proc_info_root); + remove_proc_entry(b, proc_info_root); 
REISERFS_SB(sb)->procdir = NULL; } return 0; -- cgit v1.2.3 From d247e2c661f28a21e5f9a8d672e1e88a7c1c5d4a Mon Sep 17 00:00:00 2001 From: Rolf Eike Beer Date: Fri, 14 Jul 2006 00:24:23 -0700 Subject: [PATCH] add function documentation for register_chrdev() Documentation for register_chrdev() was missing completely. [akpm@osdl.org: kerneldocification] Signed-off-by: Rolf Eike Beer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/char_dev.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'fs') diff --git a/fs/char_dev.c b/fs/char_dev.c index a4cbc6706ef0..3483d3cf8087 100644 --- a/fs/char_dev.c +++ b/fs/char_dev.c @@ -182,6 +182,28 @@ int alloc_chrdev_region(dev_t *dev, unsigned baseminor, unsigned count, return 0; } +/** + * register_chrdev() - Register a major number for character devices. + * @major: major device number or 0 for dynamic allocation + * @name: name of this range of devices + * @fops: file operations associated with this devices + * + * If @major == 0 this functions will dynamically allocate a major and return + * its number. + * + * If @major > 0 this function will attempt to reserve a device with the given + * major number and will return zero on success. + * + * Returns a -ve errno on failure. + * + * The name of this device has nothing to do with the name of the device in + * /dev. It only helps to keep track of the different owners of devices. If + * your module name has only one type of devices it's ok to use e.g. the name + * of the module here. + * + * This function registers a range of 256 minor numbers. The first minor number + * is 0. + */ int register_chrdev(unsigned int major, const char *name, const struct file_operations *fops) { -- cgit v1.2.3 From 25890454667b3295f67b3372352be90705f8667c Mon Sep 17 00:00:00 2001 From: Shailabh Nagar Date: Fri, 14 Jul 2006 00:24:43 -0700 Subject: [PATCH] per-task-delay-accounting: /proc export of aggregated block I/O delays Export I/O delays seen by a task through /proc//stats for use in top etc. 
Note that delays for I/O done for swapping in pages (swapin I/O) is clubbed together with all other I/O here (this is not the case in the netlink interface where the swapin I/O is kept distinct) [akpm@osdl.org: printk warning fix] Signed-off-by: Shailabh Nagar Signed-off-by: Balbir Singh Cc: Jes Sorensen Cc: Peter Chubb Cc: Erich Focht Cc: Levent Serinol Cc: Jay Lan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/array.c | 6 ++++-- include/linux/delayacct.h | 10 ++++++++++ kernel/delayacct.c | 12 ++++++++++++ 3 files changed, 26 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/proc/array.c b/fs/proc/array.c index 7495d3e20775..0b615d62a159 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -74,6 +74,7 @@ #include #include #include +#include #include #include @@ -411,7 +412,7 @@ static int do_task_stat(struct task_struct *task, char * buffer, int whole) res = sprintf(buffer,"%d (%s) %c %d %d %d %d %d %lu %lu \ %lu %lu %lu %lu %lu %ld %ld %ld %ld %d 0 %llu %lu %ld %lu %lu %lu %lu %lu \ -%lu %lu %lu %lu %lu %lu %lu %lu %d %d %lu %lu\n", +%lu %lu %lu %lu %lu %lu %lu %lu %d %d %lu %lu %llu\n", task->pid, tcomm, state, @@ -455,7 +456,8 @@ static int do_task_stat(struct task_struct *task, char * buffer, int whole) task->exit_signal, task_cpu(task), task->rt_priority, - task->policy); + task->policy, + (unsigned long long)delayacct_blkio_ticks(task)); if(mm) mmput(mm); return res; diff --git a/include/linux/delayacct.h b/include/linux/delayacct.h index d955078a1441..7e8b6011b8f3 100644 --- a/include/linux/delayacct.h +++ b/include/linux/delayacct.h @@ -37,6 +37,7 @@ extern void __delayacct_tsk_exit(struct task_struct *); extern void __delayacct_blkio_start(void); extern void __delayacct_blkio_end(void); extern int __delayacct_add_tsk(struct taskstats *, struct task_struct *); +extern __u64 __delayacct_blkio_ticks(struct task_struct *); static inline void delayacct_set_flag(int flag) { @@ -86,6 +87,13 @@ static inline int delayacct_add_tsk(struct taskstats *d, return __delayacct_add_tsk(d, tsk); } +static inline __u64 delayacct_blkio_ticks(struct task_struct *tsk) +{ + if (tsk->delays) + return __delayacct_blkio_ticks(tsk); + return 0; +} + #else static inline void delayacct_set_flag(int flag) {} @@ -104,6 +112,8 @@ static inline void delayacct_blkio_end(void) static inline int delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) { return 0; } +static inline __u64 delayacct_blkio_ticks(struct task_struct *tsk) +{ return 0; } #endif /* CONFIG_TASK_DELAY_ACCT */ #endif diff --git a/kernel/delayacct.c b/kernel/delayacct.c index 1be274a462ca..f05392d64267 100644 --- a/kernel/delayacct.c +++ b/kernel/delayacct.c @@ -164,3 +164,15 @@ done: spin_unlock(&tsk->delays_lock); return 0; } + +__u64 __delayacct_blkio_ticks(struct task_struct *tsk) +{ + __u64 ret; + + spin_lock(&tsk->delays->lock); + ret = nsec_to_clock_t(tsk->delays->blkio_delay + + tsk->delays->swapin_delay); + spin_unlock(&tsk->delays->lock); + return ret; +} + -- cgit v1.2.3 From 92d032855e64834283de5acfb0463232e0ab128e Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sat, 15 Jul 2006 12:20:05 -0700 Subject: Mark /proc MS_NOSUID and MS_NOEXEC Not that we really need this any more, but at the same time there's no reason not to do this. 
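The __delayacct_blkio_ticks() helper introduced above sums two nanosecond counters under a lock and converts the total to clock ticks for /proc. A rough user-space sketch of that conversion, assuming a 100 Hz tick rate and an illustrative struct in place of the kernel's per-task delay state:

#include <stdio.h>
#include <stdint.h>

#define TICKS_PER_SEC 100ULL            /* stand-in for USER_HZ */
#define NSEC_PER_SEC  1000000000ULL

struct delay_info {
        uint64_t blkio_delay_ns;        /* time blocked on block I/O */
        uint64_t swapin_delay_ns;       /* time blocked on swap-in I/O */
};

static uint64_t blkio_ticks(const struct delay_info *d)
{
        uint64_t total_ns = d->blkio_delay_ns + d->swapin_delay_ns;

        /* nanoseconds -> clock ticks at the assumed tick rate */
        return total_ns / (NSEC_PER_SEC / TICKS_PER_SEC);
}

int main(void)
{
        struct delay_info d = { 2500000000ULL, 500000000ULL }; /* 2.5s + 0.5s */

        printf("blkio ticks: %llu\n",
               (unsigned long long)blkio_ticks(&d));    /* 300 at 100 Hz */
        return 0;
}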
Signed-off-by: Linus Torvalds --- fs/proc/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 6dcef089e18e..49dfb2ab783e 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -192,7 +192,7 @@ int proc_fill_super(struct super_block *s, void *data, int silent) { struct inode * root_inode; - s->s_flags |= MS_NODIRATIME; + s->s_flags |= MS_NODIRATIME | MS_NOSUID | MS_NOEXEC; s->s_blocksize = 1024; s->s_blocksize_bits = 10; s->s_magic = PROC_SUPER_MAGIC; -- cgit v1.2.3 From 6d76fa58b050044994fe25f8753b8023f2b36737 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sat, 15 Jul 2006 12:26:45 -0700 Subject: Don't allow chmod() on the /proc// files This just turns off chmod() on the /proc// files, since there is no good reason to allow it, and had we disallowed it originally, the nasty /proc race exploit wouldn't have been possible. The other patches already fixed the problem chmod() could cause, so this is really just some final mop-up.. This particular version is based off a patch by Eugene and Marcel which had much better naming than my original equivalent one. Signed-off-by: Eugene Teo Signed-off-by: Marcel Holtmann Signed-off-by: Linus Torvalds --- fs/proc/base.c | 31 ++++++++++++++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/proc/base.c b/fs/proc/base.c index 474eae345068..fe8d55fb17cc 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -551,6 +551,27 @@ static int proc_fd_access_allowed(struct inode *inode) return allowed; } +static int proc_setattr(struct dentry *dentry, struct iattr *attr) +{ + int error; + struct inode *inode = dentry->d_inode; + + if (attr->ia_valid & ATTR_MODE) + return -EPERM; + + error = inode_change_ok(inode, attr); + if (!error) { + error = security_inode_setattr(dentry, attr); + if (!error) + error = inode_setattr(inode, attr); + } + return error; +} + +static struct inode_operations proc_def_inode_operations = { + .setattr = proc_setattr, +}; + extern struct seq_operations mounts_op; struct proc_mounts { struct seq_file m; @@ -1111,7 +1132,8 @@ out: static struct inode_operations proc_pid_link_inode_operations = { .readlink = proc_pid_readlink, - .follow_link = proc_pid_follow_link + .follow_link = proc_pid_follow_link, + .setattr = proc_setattr, }; static int proc_readfd(struct file * filp, void * dirent, filldir_t filldir) @@ -1285,6 +1307,7 @@ static struct inode *proc_pid_make_inode(struct super_block * sb, struct task_st ei = PROC_I(inode); inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; inode->i_ino = fake_ino(task->pid, ino); + inode->i_op = &proc_def_inode_operations; /* * grab the reference to task. 
@@ -1529,11 +1552,13 @@ static struct file_operations proc_task_operations = { */ static struct inode_operations proc_fd_inode_operations = { .lookup = proc_lookupfd, + .setattr = proc_setattr, }; static struct inode_operations proc_task_inode_operations = { .lookup = proc_task_lookup, .getattr = proc_task_getattr, + .setattr = proc_setattr, }; #ifdef CONFIG_SECURITY @@ -1847,11 +1872,13 @@ static struct file_operations proc_tid_base_operations = { static struct inode_operations proc_tgid_base_inode_operations = { .lookup = proc_tgid_base_lookup, .getattr = pid_getattr, + .setattr = proc_setattr, }; static struct inode_operations proc_tid_base_inode_operations = { .lookup = proc_tid_base_lookup, .getattr = pid_getattr, + .setattr = proc_setattr, }; #ifdef CONFIG_SECURITY @@ -1894,11 +1921,13 @@ static struct dentry *proc_tid_attr_lookup(struct inode *dir, static struct inode_operations proc_tgid_attr_inode_operations = { .lookup = proc_tgid_attr_lookup, .getattr = pid_getattr, + .setattr = proc_setattr, }; static struct inode_operations proc_tid_attr_inode_operations = { .lookup = proc_tid_attr_lookup, .getattr = pid_getattr, + .setattr = proc_setattr, }; #endif -- cgit v1.2.3 From 115ff50bade0f93a288677745a5884def6cbf9b1 Mon Sep 17 00:00:00 2001 From: Dave Kleikamp Date: Wed, 26 Jul 2006 14:52:13 -0500 Subject: JFS: Quota support broken, no quota_read and quota_write jfs_quota_read/write are very near duplicates of ext2_quota_read/write. Cleaned up jfs_get_block as long as I had to change it to be non-static. Signed-off-by: Dave Kleikamp --- fs/jfs/inode.c | 16 ++------ fs/jfs/jfs_inode.h | 1 + fs/jfs/super.c | 117 +++++++++++++++++++++++++++++++++++++++++++++++++++-- 3 files changed, 119 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c index 43e3f566aad6..a223cf4faa9b 100644 --- a/fs/jfs/inode.c +++ b/fs/jfs/inode.c @@ -168,16 +168,15 @@ void jfs_dirty_inode(struct inode *inode) set_cflag(COMMIT_Dirty, inode); } -static int -jfs_get_blocks(struct inode *ip, sector_t lblock, unsigned long max_blocks, - struct buffer_head *bh_result, int create) +int jfs_get_block(struct inode *ip, sector_t lblock, + struct buffer_head *bh_result, int create) { s64 lblock64 = lblock; int rc = 0; xad_t xad; s64 xaddr; int xflag; - s32 xlen = max_blocks; + s32 xlen = bh_result->b_size >> ip->i_blkbits; /* * Take appropriate lock on inode @@ -188,7 +187,7 @@ jfs_get_blocks(struct inode *ip, sector_t lblock, unsigned long max_blocks, IREAD_LOCK(ip); if (((lblock64 << ip->i_sb->s_blocksize_bits) < ip->i_size) && - (!xtLookup(ip, lblock64, max_blocks, &xflag, &xaddr, &xlen, 0)) && + (!xtLookup(ip, lblock64, xlen, &xflag, &xaddr, &xlen, 0)) && xaddr) { if (xflag & XAD_NOTRECORDED) { if (!create) @@ -255,13 +254,6 @@ jfs_get_blocks(struct inode *ip, sector_t lblock, unsigned long max_blocks, return rc; } -static int jfs_get_block(struct inode *ip, sector_t lblock, - struct buffer_head *bh_result, int create) -{ - return jfs_get_blocks(ip, lblock, bh_result->b_size >> ip->i_blkbits, - bh_result, create); -} - static int jfs_writepage(struct page *page, struct writeback_control *wbc) { return nobh_writepage(page, jfs_get_block, wbc); diff --git a/fs/jfs/jfs_inode.h b/fs/jfs/jfs_inode.h index b5c7da6190dc..1fc48df670c8 100644 --- a/fs/jfs/jfs_inode.h +++ b/fs/jfs/jfs_inode.h @@ -32,6 +32,7 @@ extern void jfs_truncate_nolock(struct inode *, loff_t); extern void jfs_free_zero_link(struct inode *); extern struct dentry *jfs_get_parent(struct dentry *dentry); extern void 
jfs_set_inode_flags(struct inode *); +extern int jfs_get_block(struct inode *, sector_t, struct buffer_head *, int); extern const struct address_space_operations jfs_aops; extern struct inode_operations jfs_dir_inode_operations; diff --git a/fs/jfs/super.c b/fs/jfs/super.c index 4f6cfebc82db..90ee0de829c1 100644 --- a/fs/jfs/super.c +++ b/fs/jfs/super.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include @@ -298,7 +299,7 @@ static int parse_options(char *options, struct super_block *sb, s64 *newLVSize, break; } -#if defined(CONFIG_QUOTA) +#ifdef CONFIG_QUOTA case Opt_quota: case Opt_usrquota: *flag |= JFS_USRQUOTA; @@ -597,7 +598,7 @@ static int jfs_show_options(struct seq_file *seq, struct vfsmount *vfs) if (sbi->flag & JFS_NOINTEGRITY) seq_puts(seq, ",nointegrity"); -#if defined(CONFIG_QUOTA) +#ifdef CONFIG_QUOTA if (sbi->flag & JFS_USRQUOTA) seq_puts(seq, ",usrquota"); @@ -608,6 +609,112 @@ static int jfs_show_options(struct seq_file *seq, struct vfsmount *vfs) return 0; } +#ifdef CONFIG_QUOTA + +/* Read data from quotafile - avoid pagecache and such because we cannot afford + * acquiring the locks... As quota files are never truncated and quota code + * itself serializes the operations (and noone else should touch the files) + * we don't have to be afraid of races */ +static ssize_t jfs_quota_read(struct super_block *sb, int type, char *data, + size_t len, loff_t off) +{ + struct inode *inode = sb_dqopt(sb)->files[type]; + sector_t blk = off >> sb->s_blocksize_bits; + int err = 0; + int offset = off & (sb->s_blocksize - 1); + int tocopy; + size_t toread; + struct buffer_head tmp_bh; + struct buffer_head *bh; + loff_t i_size = i_size_read(inode); + + if (off > i_size) + return 0; + if (off+len > i_size) + len = i_size-off; + toread = len; + while (toread > 0) { + tocopy = sb->s_blocksize - offset < toread ? + sb->s_blocksize - offset : toread; + + tmp_bh.b_state = 0; + tmp_bh.b_size = 1 << inode->i_blkbits; + err = jfs_get_block(inode, blk, &tmp_bh, 0); + if (err) + return err; + if (!buffer_mapped(&tmp_bh)) /* A hole? */ + memset(data, 0, tocopy); + else { + bh = sb_bread(sb, tmp_bh.b_blocknr); + if (!bh) + return -EIO; + memcpy(data, bh->b_data+offset, tocopy); + brelse(bh); + } + offset = 0; + toread -= tocopy; + data += tocopy; + blk++; + } + return len; +} + +/* Write to quotafile */ +static ssize_t jfs_quota_write(struct super_block *sb, int type, + const char *data, size_t len, loff_t off) +{ + struct inode *inode = sb_dqopt(sb)->files[type]; + sector_t blk = off >> sb->s_blocksize_bits; + int err = 0; + int offset = off & (sb->s_blocksize - 1); + int tocopy; + size_t towrite = len; + struct buffer_head tmp_bh; + struct buffer_head *bh; + + mutex_lock(&inode->i_mutex); + while (towrite > 0) { + tocopy = sb->s_blocksize - offset < towrite ? 
+ sb->s_blocksize - offset : towrite; + + tmp_bh.b_state = 0; + err = jfs_get_block(inode, blk, &tmp_bh, 1); + if (err) + goto out; + if (offset || tocopy != sb->s_blocksize) + bh = sb_bread(sb, tmp_bh.b_blocknr); + else + bh = sb_getblk(sb, tmp_bh.b_blocknr); + if (!bh) { + err = -EIO; + goto out; + } + lock_buffer(bh); + memcpy(bh->b_data+offset, data, tocopy); + flush_dcache_page(bh->b_page); + set_buffer_uptodate(bh); + mark_buffer_dirty(bh); + unlock_buffer(bh); + brelse(bh); + offset = 0; + towrite -= tocopy; + data += tocopy; + blk++; + } +out: + if (len == towrite) + return err; + if (inode->i_size < off+len-towrite) + i_size_write(inode, off+len-towrite); + inode->i_version++; + inode->i_mtime = inode->i_ctime = CURRENT_TIME; + mark_inode_dirty(inode); + mutex_unlock(&inode->i_mutex); + return len - towrite; +} + +#endif + static struct super_operations jfs_super_operations = { .alloc_inode = jfs_alloc_inode, .destroy_inode = jfs_destroy_inode, @@ -621,7 +728,11 @@ static struct super_operations jfs_super_operations = { .unlockfs = jfs_unlockfs, .statfs = jfs_statfs, .remount_fs = jfs_remount, - .show_options = jfs_show_options + .show_options = jfs_show_options, +#ifdef CONFIG_QUOTA + .quota_read = jfs_quota_read, + .quota_write = jfs_quota_write, +#endif }; static struct export_operations jfs_export_operations = { -- cgit v1.2.3 From 2a293b7d5aa2f0d1e3d87b642f7ac263c2d664e3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 28 Jul 2006 17:04:26 +1000 Subject: [XFS] All xfs_disk_dquot_t values are (as the name says) disk endian. Before putting them into struct statfs they should be endian-swapped. SGI-PV: 954580 SGI-Modid: xfs-linux-melb:xfs-kern:26550a Signed-off-by: Christoph Hellwig Signed-off-by: Nathan Scott --- fs/xfs/quota/xfs_qm_bhv.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/xfs/quota/xfs_qm_bhv.c b/fs/xfs/quota/xfs_qm_bhv.c index e95e99f7168f..f137856c3261 100644 --- a/fs/xfs/quota/xfs_qm_bhv.c +++ b/fs/xfs/quota/xfs_qm_bhv.c @@ -217,17 +217,24 @@ xfs_qm_statvfs( return 0; dp = &dqp->q_core; - limit = dp->d_blk_softlimit ? dp->d_blk_softlimit : dp->d_blk_hardlimit; + limit = dp->d_blk_softlimit ? + be64_to_cpu(dp->d_blk_softlimit) : + be64_to_cpu(dp->d_blk_hardlimit); if (limit && statp->f_blocks > limit) { statp->f_blocks = limit; - statp->f_bfree = (statp->f_blocks > dp->d_bcount) ? - (statp->f_blocks - dp->d_bcount) : 0; + statp->f_bfree = + (statp->f_blocks > be64_to_cpu(dp->d_bcount)) ? + (statp->f_blocks - be64_to_cpu(dp->d_bcount)) : 0; } - limit = dp->d_ino_softlimit ? dp->d_ino_softlimit : dp->d_ino_hardlimit; + + limit = dp->d_ino_softlimit ? + be64_to_cpu(dp->d_ino_softlimit) : + be64_to_cpu(dp->d_ino_hardlimit); if (limit && statp->f_files > limit) { statp->f_files = limit; - statp->f_ffree = (statp->f_files > dp->d_icount) ? - (statp->f_ffree - dp->d_icount) : 0; + statp->f_ffree = + (statp->f_files > be64_to_cpu(dp->d_icount)) ? + (statp->f_ffree - be64_to_cpu(dp->d_icount)) : 0; } xfs_qm_dqput(dqp); -- cgit v1.2.3 From f5faad799475c4058416264f672bb33bf8b5ef41 Mon Sep 17 00:00:00 2001 From: Nathan Scott Date: Fri, 28 Jul 2006 17:04:44 +1000 Subject: [XFS] Fix remount vs no/barrier options by ensuring we clear unwanted flags from iclog buffers before submitting them for writing. 
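The jfs_quota_read()/jfs_quota_write() helpers above walk a byte range one filesystem block at a time, never letting a single copy cross a block boundary. A small user-space sketch of that loop, with a fixed 4096-byte block size and a printf standing in for the block lookup and memcpy:

#include <stdio.h>
#include <stddef.h>

#define BLOCK_SIZE 4096UL

static void walk_range(unsigned long off, size_t len)
{
        unsigned long blk = off / BLOCK_SIZE;
        unsigned long offset = off % BLOCK_SIZE;
        size_t toread = len;

        while (toread > 0) {
                /* Copy at most up to the end of the current block. */
                size_t tocopy = BLOCK_SIZE - offset < toread ?
                                BLOCK_SIZE - offset : toread;

                printf("block %lu: copy %zu bytes at offset %lu\n",
                       blk, tocopy, offset);

                offset = 0;     /* later blocks start at their beginning */
                toread -= tocopy;
                blk++;
        }
}

int main(void)
{
        walk_range(4000, 5000); /* spans three blocks */
        return 0;
}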
SGI-PV: 954772 SGI-Modid: xfs-linux-melb:xfs-kern:26605a Signed-off-by: Nathan Scott --- fs/xfs/linux-2.6/xfs_buf.h | 4 ++-- fs/xfs/xfs_log.c | 12 ++++++------ 2 files changed, 8 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h index ceda3a2859d2..7858703ed84c 100644 --- a/fs/xfs/linux-2.6/xfs_buf.h +++ b/fs/xfs/linux-2.6/xfs_buf.h @@ -246,8 +246,8 @@ extern void xfs_buf_trace(xfs_buf_t *, char *, void *, void *); #define BUF_BUSY XBF_DONT_BLOCK #define XFS_BUF_BFLAGS(bp) ((bp)->b_flags) -#define XFS_BUF_ZEROFLAGS(bp) \ - ((bp)->b_flags &= ~(XBF_READ|XBF_WRITE|XBF_ASYNC|XBF_DELWRI)) +#define XFS_BUF_ZEROFLAGS(bp) ((bp)->b_flags &= \ + ~(XBF_READ|XBF_WRITE|XBF_ASYNC|XBF_DELWRI|XBF_ORDERED)) #define XFS_BUF_STALE(bp) ((bp)->b_flags |= XFS_B_STALE) #define XFS_BUF_UNSTALE(bp) ((bp)->b_flags &= ~XFS_B_STALE) diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index e730328636c3..21ac1a67e3e0 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -1413,7 +1413,7 @@ xlog_sync(xlog_t *log, ops = iclog->ic_header.h_num_logops; INT_SET(iclog->ic_header.h_num_logops, ARCH_CONVERT, ops); - bp = iclog->ic_bp; + bp = iclog->ic_bp; ASSERT(XFS_BUF_FSPRIVATE2(bp, unsigned long) == (unsigned long)1); XFS_BUF_SET_FSPRIVATE2(bp, (unsigned long)2); XFS_BUF_SET_ADDR(bp, BLOCK_LSN(INT_GET(iclog->ic_header.h_lsn, ARCH_CONVERT))); @@ -1430,15 +1430,14 @@ xlog_sync(xlog_t *log, } XFS_BUF_SET_PTR(bp, (xfs_caddr_t) &(iclog->ic_header), count); XFS_BUF_SET_FSPRIVATE(bp, iclog); /* save for later */ + XFS_BUF_ZEROFLAGS(bp); XFS_BUF_BUSY(bp); XFS_BUF_ASYNC(bp); /* * Do an ordered write for the log block. - * - * It may not be needed to flush the first split block in the log wrap - * case, but do it anyways to be safe -AK + * Its unnecessary to flush the first split block in the log wrap case. */ - if (log->l_mp->m_flags & XFS_MOUNT_BARRIER) + if (!split && (log->l_mp->m_flags & XFS_MOUNT_BARRIER)) XFS_BUF_ORDERED(bp); ASSERT(XFS_BUF_ADDR(bp) <= log->l_logBBsize-1); @@ -1460,7 +1459,7 @@ xlog_sync(xlog_t *log, return error; } if (split) { - bp = iclog->ic_log->l_xbuf; + bp = iclog->ic_log->l_xbuf; ASSERT(XFS_BUF_FSPRIVATE2(bp, unsigned long) == (unsigned long)1); XFS_BUF_SET_FSPRIVATE2(bp, (unsigned long)2); @@ -1468,6 +1467,7 @@ xlog_sync(xlog_t *log, XFS_BUF_SET_PTR(bp, (xfs_caddr_t)((__psint_t)&(iclog->ic_header)+ (__psint_t)count), split); XFS_BUF_SET_FSPRIVATE(bp, iclog); + XFS_BUF_ZEROFLAGS(bp); XFS_BUF_BUSY(bp); XFS_BUF_ASYNC(bp); if (log->l_mp->m_flags & XFS_MOUNT_BARRIER) -- cgit v1.2.3 From b2ea401bac39e75ebb64038609ed22efbc799905 Mon Sep 17 00:00:00 2001 From: Nathan Scott Date: Fri, 28 Jul 2006 17:05:13 +1000 Subject: [XFS] Fix a barrier related forced shutdown on mounts with quota enabled. 
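The XFS_BUF_ZEROFLAGS() change above widens the mask so a reused log buffer cannot carry a stale ordered/barrier bit into its next submission. A user-space sketch of the same clear-before-reuse idea, with made-up flag values rather than the XFS ones:

#include <stdio.h>

#define BUF_READ    (1u << 0)
#define BUF_WRITE   (1u << 1)
#define BUF_ASYNC   (1u << 2)
#define BUF_ORDERED (1u << 3)   /* barrier write */
#define BUF_STALE   (1u << 4)

struct buf {
        unsigned int flags;
};

/* Drop every submission-related flag, including ORDERED, so a remount
 * that turns barriers off cannot leave a stale barrier bit behind. */
static void buf_zeroflags(struct buf *bp)
{
        bp->flags &= ~(BUF_READ | BUF_WRITE | BUF_ASYNC | BUF_ORDERED);
}

int main(void)
{
        struct buf bp = { .flags = BUF_WRITE | BUF_ORDERED | BUF_STALE };

        buf_zeroflags(&bp);
        printf("flags after reset: %#x\n", bp.flags);  /* only BUF_STALE left */
        return 0;
}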
SGI-PV: 912426 SGI-Modid: xfs-linux-melb:xfs-kern:26622a Signed-off-by: Nathan Scott --- fs/xfs/linux-2.6/xfs_super.c | 7 +++++++ fs/xfs/xfs_vfsops.c | 2 +- 2 files changed, 8 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c index 9bdef9d51900..4754f342a5d3 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/linux-2.6/xfs_super.c @@ -314,6 +314,13 @@ xfs_mountfs_check_barriers(xfs_mount_t *mp) return; } + if (xfs_readonly_buftarg(mp->m_ddev_targp)) { + xfs_fs_cmn_err(CE_NOTE, mp, + "Disabling barriers, underlying device is readonly"); + mp->m_flags &= ~XFS_MOUNT_BARRIER; + return; + } + error = xfs_barrier_test(mp); if (error) { xfs_fs_cmn_err(CE_NOTE, mp, diff --git a/fs/xfs/xfs_vfsops.c b/fs/xfs/xfs_vfsops.c index 6c96391f3f1a..b427d220a169 100644 --- a/fs/xfs/xfs_vfsops.c +++ b/fs/xfs/xfs_vfsops.c @@ -515,7 +515,7 @@ xfs_mount( if (error) goto error2; - if ((mp->m_flags & XFS_MOUNT_BARRIER) && !(vfsp->vfs_flag & VFS_RDONLY)) + if (mp->m_flags & XFS_MOUNT_BARRIER) xfs_mountfs_check_barriers(mp); error = XFS_IOINIT(vfsp, args, flags); -- cgit v1.2.3 From 41ff715abc49324fb2cb20e66bc4e0290cfdbe51 Mon Sep 17 00:00:00 2001 From: Nathan Scott Date: Fri, 28 Jul 2006 17:05:51 +1000 Subject: [XFS] Ensure bulkstat from an invalid inode number gets caught always with EINVAL. SGI-PV: 953819 SGI-Modid: xfs-linux-melb:xfs-kern:26629a Signed-off-by: Nathan Scott --- fs/xfs/xfs_inode.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 86c1bf0bba9e..1f8ecff8553a 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -334,10 +334,9 @@ xfs_itobp( #if !defined(__KERNEL__) ni = 0; #elif defined(DEBUG) - ni = (imap_flags & XFS_IMAP_BULKSTAT) ? 0 : - (BBTOB(imap.im_len) >> mp->m_sb.sb_inodelog); + ni = BBTOB(imap.im_len) >> mp->m_sb.sb_inodelog; #else /* usual case */ - ni = (imap_flags & XFS_IMAP_BULKSTAT) ? 0 : 1; + ni = 1; #endif for (i = 0; i < ni; i++) { @@ -348,11 +347,15 @@ xfs_itobp( (i << mp->m_sb.sb_inodelog)); di_ok = INT_GET(dip->di_core.di_magic, ARCH_CONVERT) == XFS_DINODE_MAGIC && XFS_DINODE_GOOD_VERSION(INT_GET(dip->di_core.di_version, ARCH_CONVERT)); - if (unlikely(XFS_TEST_ERROR(!di_ok, mp, XFS_ERRTAG_ITOBP_INOTOBP, - XFS_RANDOM_ITOBP_INOTOBP))) { + if (unlikely(XFS_TEST_ERROR(!di_ok, mp, + XFS_ERRTAG_ITOBP_INOTOBP, + XFS_RANDOM_ITOBP_INOTOBP))) { + if (imap_flags & XFS_IMAP_BULKSTAT) { + xfs_trans_brelse(tp, bp); + return XFS_ERROR(EINVAL); + } #ifdef DEBUG - if (!(imap_flags & XFS_IMAP_BULKSTAT)) - cmn_err(CE_ALERT, + cmn_err(CE_ALERT, "Device %s - bad inode magic/vsn " "daddr %lld #%d (magic=%x)", XFS_BUFTARG_NAME(mp->m_ddev_targp), -- cgit v1.2.3 From 8bcb2839b74d605f5549962a6e69dc07768e95b6 Mon Sep 17 00:00:00 2001 From: Dave Kleikamp Date: Fri, 28 Jul 2006 08:46:05 -0500 Subject: JFS: Fix bug in quota code. 
tmp_bh.b_size must be initialized Signed-off-by: Dave Kleikamp --- fs/jfs/super.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/jfs/super.c b/fs/jfs/super.c index 90ee0de829c1..143bcd1d5eaa 100644 --- a/fs/jfs/super.c +++ b/fs/jfs/super.c @@ -678,6 +678,7 @@ static ssize_t jfs_quota_write(struct super_block *sb, int type, sb->s_blocksize - offset : towrite; tmp_bh.b_state = 0; + tmp_bh.b_size = 1 << inode->i_blkbits; err = jfs_get_block(inode, blk, &tmp_bh, 1); if (err) goto out; -- cgit v1.2.3 From 2ccb48ebb4de139eef4fcefd5f2bb823cb0d81b9 Mon Sep 17 00:00:00 2001 From: Neil Brown Date: Sun, 30 Jul 2006 03:03:01 -0700 Subject: [PATCH] ext3: avoid triggering ext3_error on bad NFS file handle The inode number out of an NFS file handle gets passed eventually to ext3_get_inode_block() without any checking. If ext3_get_inode_block() allows it to trigger an error, then bad filehandles can have unpleasant effect - ext3_error() will usually cause a forced read-only remount, or a panic if `errors=panic' was used. So remove the call to ext3_error there and put a matching check in ext3/namei.c where inode numbers are read off storage. [akpm@osdl.org: fix off-by-one error] Signed-off-by: Neil Brown Signed-off-by: Jan Kara Cc: Marcel Holtmann Cc: Cc: "Stephen C. Tweedie" Cc: Eric Sandeen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ext3/inode.c | 13 +++++++------ fs/ext3/namei.c | 15 +++++++++++++-- include/linux/ext3_fs.h | 9 +++++++++ 3 files changed, 29 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index f804d5e9d60c..ab034d3053ea 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -2402,14 +2402,15 @@ static ext3_fsblk_t ext3_get_inode_block(struct super_block *sb, struct buffer_head *bh; struct ext3_group_desc * gdp; - - if ((ino != EXT3_ROOT_INO && ino != EXT3_JOURNAL_INO && - ino != EXT3_RESIZE_INO && ino < EXT3_FIRST_INO(sb)) || - ino > le32_to_cpu(EXT3_SB(sb)->s_es->s_inodes_count)) { - ext3_error(sb, "ext3_get_inode_block", - "bad inode number: %lu", ino); + if (!ext3_valid_inum(sb, ino)) { + /* + * This error is already checked for in namei.c unless we are + * looking at an NFS filehandle, in which case no error + * report is needed + */ return 0; } + block_group = (ino - 1) / EXT3_INODES_PER_GROUP(sb); if (block_group >= EXT3_SB(sb)->s_groups_count) { ext3_error(sb,"ext3_get_inode_block","group >= groups count"); diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c index d9176dba3698..2aa7101b27cd 100644 --- a/fs/ext3/namei.c +++ b/fs/ext3/namei.c @@ -1000,7 +1000,12 @@ static struct dentry *ext3_lookup(struct inode * dir, struct dentry *dentry, str if (bh) { unsigned long ino = le32_to_cpu(de->inode); brelse (bh); - inode = iget(dir->i_sb, ino); + if (!ext3_valid_inum(dir->i_sb, ino)) { + ext3_error(dir->i_sb, "ext3_lookup", + "bad inode number: %lu", ino); + inode = NULL; + } else + inode = iget(dir->i_sb, ino); if (!inode) return ERR_PTR(-EACCES); @@ -1028,7 +1033,13 @@ struct dentry *ext3_get_parent(struct dentry *child) return ERR_PTR(-ENOENT); ino = le32_to_cpu(de->inode); brelse(bh); - inode = iget(child->d_inode->i_sb, ino); + + if (!ext3_valid_inum(child->d_inode->i_sb, ino)) { + ext3_error(child->d_inode->i_sb, "ext3_get_parent", + "bad inode number: %lu", ino); + inode = NULL; + } else + inode = iget(child->d_inode->i_sb, ino); if (!inode) return ERR_PTR(-EACCES); diff --git a/include/linux/ext3_fs.h b/include/linux/ext3_fs.h index 5607e6457a65..9f9cce7bd86d 100644 --- 
a/include/linux/ext3_fs.h +++ b/include/linux/ext3_fs.h @@ -492,6 +492,15 @@ static inline struct ext3_inode_info *EXT3_I(struct inode *inode) { return container_of(inode, struct ext3_inode_info, vfs_inode); } + +static inline int ext3_valid_inum(struct super_block *sb, unsigned long ino) +{ + return ino == EXT3_ROOT_INO || + ino == EXT3_JOURNAL_INO || + ino == EXT3_RESIZE_INO || + (ino >= EXT3_FIRST_INO(sb) && + ino <= le32_to_cpu(EXT3_SB(sb)->s_es->s_inodes_count)); +} #else /* Assume that user mode programs are passing in an ext3fs superblock, not * a kernel struct super_block. This will allow us to call the feature-test -- cgit v1.2.3 From d1bbf14f37261c2c0dba71404602e1ddcec069d2 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Sun, 30 Jul 2006 03:03:16 -0700 Subject: [PATCH] knfsd: Fix stale file handle problem with subtree_checking. A recent commit (7fc90ec93a5eb71f4b08403baf5ba7176b3ec6b1) moved the call to nfsd_setuser out of the 'find a dentry for a filehandle' branch of fh_verify so that it would always be called. This had the unfortunately side-effect of moving *after* the call to decode_fh, so the prober fsuid was not set when nfsd_acceptable was called, the 'permission' check did the wrong thing. This patch moves the nfsd_setuser call back where it was, and add as call in the other branch of the if. Cc: "J. Bruce Fields" Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nfsd/nfsfh.c | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c index ecc439d2565f..501d83884530 100644 --- a/fs/nfsd/nfsfh.c +++ b/fs/nfsd/nfsfh.c @@ -187,6 +187,11 @@ fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, int access) goto out; } + /* Set user creds for this exportpoint */ + error = nfserrno(nfsd_setuser(rqstp, exp)); + if (error) + goto out; + /* * Look up the dentry using the NFS file handle. */ @@ -241,16 +246,17 @@ fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, int access) dprintk("nfsd: fh_verify - just checking\n"); dentry = fhp->fh_dentry; exp = fhp->fh_export; + /* Set user creds for this exportpoint; necessary even + * in the "just checking" case because this may be a + * filehandle that was created by fh_compose, and that + * is about to be used in another nfsv4 compound + * operation */ + error = nfserrno(nfsd_setuser(rqstp, exp)); + if (error) + goto out; } cache_get(&exp->h); - /* Set user creds for this exportpoint; necessary even in the "just - * checking" case because this may be a filehandle that was created by - * fh_compose, and that is about to be used in another nfsv4 compound - * operation */ - error = nfserrno(nfsd_setuser(rqstp, exp)); - if (error) - goto out; error = nfsd_mode_check(rqstp, dentry->d_inode->i_mode, type); if (error) -- cgit v1.2.3 From 0e1dfc66b6ec94984a4778132147a8aa36461d58 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sun, 30 Jul 2006 03:03:28 -0700 Subject: [PATCH] invalidate_bdev() speedup We can immediately bail from invalidate_bdev() if the blockdev has no pagecache. This solves the huge IPI storms which hald is causing on the big ia64 machines when it polls CDROM drives. 
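The ext3_valid_inum() helper added above is a plain range check: an inode number taken from an NFS file handle is trusted only if it is one of the reserved special inodes or lies within the on-disk inode count. A stand-alone sketch of that check, with representative constants in place of the values read from a real superblock:

#include <stdio.h>
#include <stdbool.h>

#define ROOT_INO     2UL
#define JOURNAL_INO  8UL
#define RESIZE_INO   7UL
#define FIRST_INO    11UL       /* first non-reserved inode */

static bool valid_inum(unsigned long ino, unsigned long inodes_count)
{
        return ino == ROOT_INO ||
               ino == JOURNAL_INO ||
               ino == RESIZE_INO ||
               (ino >= FIRST_INO && ino <= inodes_count);
}

int main(void)
{
        unsigned long inodes_count = 65536;

        printf("ino 2:     %d\n", valid_inum(2, inodes_count));     /* 1 */
        printf("ino 9:     %d\n", valid_inum(9, inodes_count));     /* 0: reserved gap */
        printf("ino 70000: %d\n", valid_inum(70000, inodes_count)); /* 0: past the end */
        return 0;
}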
Acked-by: Jes Sorensen Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/buffer.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/buffer.c b/fs/buffer.c index 3660dcb97591..71649ef9b658 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -470,13 +470,18 @@ out: pass does the actual I/O. */ void invalidate_bdev(struct block_device *bdev, int destroy_dirty_buffers) { + struct address_space *mapping = bdev->bd_inode->i_mapping; + + if (mapping->nrpages == 0) + return; + invalidate_bh_lrus(); /* * FIXME: what about destroy_dirty_buffers? * We really want to use invalidate_inode_pages2() for * that, but not until that's cleaned up. */ - invalidate_inode_pages(bdev->bd_inode->i_mapping); + invalidate_inode_pages(mapping); } /* -- cgit v1.2.3 From cfa224e928f782e1593b5222688fad84c2cad3e8 Mon Sep 17 00:00:00 2001 From: Olaf Hering Date: Sun, 30 Jul 2006 03:03:51 -0700 Subject: [PATCH] enable mac partition label per default on pmac Enable mac partition table support per default also for a powermac config. Signed-off-by: Olaf Hering Cc: Benjamin Herrenschmidt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/partitions/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/partitions/Kconfig b/fs/partitions/Kconfig index c9a478099281..e478f1941831 100644 --- a/fs/partitions/Kconfig +++ b/fs/partitions/Kconfig @@ -99,7 +99,7 @@ config IBM_PARTITION config MAC_PARTITION bool "Macintosh partition map support" if PARTITION_ADVANCED - default y if MAC + default y if (MAC || PPC_PMAC) help Say Y here if you would like to use hard disks under Linux which were partitioned on a Macintosh. -- cgit v1.2.3 From 5b6509aa8c2f292caea7c0602ec361f920951508 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Sun, 30 Jul 2006 03:03:54 -0700 Subject: [PATCH] inotify: fix deadlock found by lockdep This is a real deadlock, a nice complex one: (warning: long explanation follows so that Andrew can have a complete patch description) it's an ABCDA deadlock: A iprune_mutex B inode->inotify_mutex C ih->mutex D dev->ev_mutex The AB relationship comes straight from invalidate_inodes() int invalidate_inodes(struct super_block * sb) { int busy; LIST_HEAD(throw_away); mutex_lock(&iprune_mutex); spin_lock(&inode_lock); inotify_unmount_inodes(&sb->s_inodes); where inotify_umount_inodes() takes the mutex_lock(&inode->inotify_mutex); The BC relationship comes directly from inotify_find_update_watch(): s32 inotify_find_update_watch(struct inotify_handle *ih, struct inode *inode, u32 mask) { ... mutex_lock(&inode->inotify_mutex); mutex_lock(&ih->mutex); The CD relationship comes from inotify_rm_wd: inotify_rm_wd does mutex_lock(&inode->inotify_mutex); mutex_lock(&ih->mutex) and then calls inotify_remove_watch_locked() which calls notify_dev_queue_event() which does mutex_lock(&dev->ev_mutex); (this strictly is a BCD relationship) The DA relationship comes from the most interesting part: [] shrink_icache_memory+0x42/0x270 [] shrink_slab+0x11d/0x1c9 [] try_to_free_pages+0x187/0x244 [] __alloc_pages+0x1cd/0x2e0 [] cache_alloc_refill+0x3f8/0x821 [] kmem_cache_alloc+0x85/0xcb [] kernel_event+0x2e/0x122 [] inotify_dev_queue_event+0xcc/0x140 inotify_dev_queue_event schedules a kernel_event which does a kmem_cache_alloc( , GFP_KERNEL) which may try to shrink slabs, including the inode cache .. which then takes iprune_mutex. 
And voila, there is an AB, a BC, a CD relationship (even a direct BCD), and also now a DA relationship -> a circular type AB-BA deadlock but involving 4 locks. The solution is simple: kernel_event() is NOT allowed to use GFP_KERNEL, but must use GFP_NOFS to not cause recursion into the VFS. Signed-off-by: Arjan van de Ven Acked-by: Ingo Molnar Acked-by: Robert Love Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/inotify_user.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/inotify_user.c b/fs/inotify_user.c index f2386442adee..017cb0f134d6 100644 --- a/fs/inotify_user.c +++ b/fs/inotify_user.c @@ -187,7 +187,7 @@ static struct inotify_kernel_event * kernel_event(s32 wd, u32 mask, u32 cookie, { struct inotify_kernel_event *kevent; - kevent = kmem_cache_alloc(event_cachep, GFP_KERNEL); + kevent = kmem_cache_alloc(event_cachep, GFP_NOFS); if (unlikely(!kevent)) return NULL; -- cgit v1.2.3 From 6ecbc4e1a395062a8e99e4f5fe328f6ba166d9c8 Mon Sep 17 00:00:00 2001 From: Josh Triplett Date: Sun, 30 Jul 2006 03:03:56 -0700 Subject: [PATCH] Remove incorrect unlock_kernel from allocation failure path in coda_open() Commit 398c53a757702e1e3a7a2c24860c7ad26acb53ed (in the historical GIT tree) moved the lock_kernel() in coda_open after the allocation of a coda_file_info struct, but left an unlock_kernel() in the allocation failure error path; remove it. Signed-off-by: Josh Triplett Acked-by: Jan Harkes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/coda/file.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/coda/file.c b/fs/coda/file.c index cc66c681bd11..dbfbcfa5b3c0 100644 --- a/fs/coda/file.c +++ b/fs/coda/file.c @@ -136,10 +136,8 @@ int coda_open(struct inode *coda_inode, struct file *coda_file) coda_vfs_stat.open++; cfi = kmalloc(sizeof(struct coda_file_info), GFP_KERNEL); - if (!cfi) { - unlock_kernel(); + if (!cfi) return -ENOMEM; - } lock_kernel(); -- cgit v1.2.3 From 0aa9e4f147880b2d7d1eef1f0b45112af0e36f9f Mon Sep 17 00:00:00 2001 From: Josh Triplett Date: Sun, 30 Jul 2006 03:03:58 -0700 Subject: [PATCH] efs: Remove incorrect unlock_kernel from failure path in efs_symlink_readpage() If efs_symlink_readpage hits the -ENAMETOOLONG error path, it will call unlock_kernel without ever having called lock_kernel(); fix this by creating and jumping to a new label fail_notlocked rather than the fail label used after calling lock_kernel(). 
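The efs and ufs fixes in this series share one pattern: an error taken before the lock is acquired must jump past the unlock. A compilable sketch of the two-label error path, using a pthread mutex as a stand-in for lock_kernel():

#include <stdio.h>
#include <pthread.h>

static pthread_mutex_t big_lock = PTHREAD_MUTEX_INITIALIZER;

static int do_operation(size_t size, size_t limit, int force_fail)
{
        int err = -36;  /* -ENAMETOOLONG */

        if (size > limit)
                goto fail_notlocked;    /* lock not held yet */

        pthread_mutex_lock(&big_lock);

        if (force_fail) {
                err = -5;       /* -EIO */
                goto fail;      /* lock is held: unlock on the way out */
        }

        pthread_mutex_unlock(&big_lock);
        return 0;

fail:
        pthread_mutex_unlock(&big_lock);
fail_notlocked:
        /* no unlock here: this path never took the lock */
        return err;
}

int main(void)
{
        printf("short name:       %d\n", do_operation(10, 1024, 0));   /* 0 */
        printf("name too long:    %d\n", do_operation(4096, 1024, 0)); /* -36 */
        printf("fails under lock: %d\n", do_operation(10, 1024, 1));   /* -5 */
        return 0;
}

Keeping the unlocked failure on its own label avoids threading "did we lock yet" state through the function.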
Signed-off-by: Josh Triplett Cc: Marcelo Tosatti Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/efs/symlink.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/efs/symlink.c b/fs/efs/symlink.c index e249cf733a6b..1d30d2ff440f 100644 --- a/fs/efs/symlink.c +++ b/fs/efs/symlink.c @@ -22,7 +22,7 @@ static int efs_symlink_readpage(struct file *file, struct page *page) err = -ENAMETOOLONG; if (size > 2 * EFS_BLOCKSIZE) - goto fail; + goto fail_notlocked; lock_kernel(); /* read first 512 bytes of link target */ @@ -47,6 +47,7 @@ static int efs_symlink_readpage(struct file *file, struct page *page) return 0; fail: unlock_kernel(); +fail_notlocked: SetPageError(page); kunmap(page); unlock_page(page); -- cgit v1.2.3 From 344fe78669d2d1cff9e8939598f6d0d865b6a75b Mon Sep 17 00:00:00 2001 From: Josh Triplett Date: Sun, 30 Jul 2006 03:03:59 -0700 Subject: [PATCH] ufs: remove incorrect unlock_kernel from failure path in ufs_symlink() ufs_symlink, in one of its error paths, calls unlock_kernel without ever having called lock_kernel(); fix this by creating and jumping to a new label out_notlocked rather than the out label used after calling lock_kernel(). Signed-off-by: Josh Triplett Cc: Evgeniy Dushistov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ufs/namei.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c index abd5f23a426d..d344b411e261 100644 --- a/fs/ufs/namei.c +++ b/fs/ufs/namei.c @@ -129,7 +129,7 @@ static int ufs_symlink (struct inode * dir, struct dentry * dentry, struct inode * inode; if (l > sb->s_blocksize) - goto out; + goto out_notlocked; lock_kernel(); inode = ufs_new_inode(dir, S_IFLNK | S_IRWXUGO); @@ -155,6 +155,7 @@ static int ufs_symlink (struct inode * dir, struct dentry * dentry, err = ufs_add_nondir(dentry, inode); out: unlock_kernel(); +out_notlocked: return err; out_fail: -- cgit v1.2.3 From 685d16ddb07b74537fb18972784e6214840fdd20 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Sun, 30 Jul 2006 03:04:08 -0700 Subject: [PATCH] fuse: fix zero timeout An attribute and entry timeout of zero should mean, that the entity is invalidated immediately after the operation. Previously invalidation only happened at the next clock tick. Reported and tested by Craig Davies. Signed-off-by: Miklos Szeredi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fuse/dir.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 72a74cde6de8..6db66ec386ae 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -25,8 +25,11 @@ */ static unsigned long time_to_jiffies(unsigned long sec, unsigned long nsec) { - struct timespec ts = {sec, nsec}; - return jiffies + timespec_to_jiffies(&ts); + if (sec || nsec) { + struct timespec ts = {sec, nsec}; + return jiffies + timespec_to_jiffies(&ts); + } else + return jiffies - 1; } /* -- cgit v1.2.3 From 0a0898cf413876d4ed6e371f3e04bf38600a9205 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Sun, 30 Jul 2006 03:04:10 -0700 Subject: [PATCH] fuse: use jiffies_64 It is entirely possible (though rare) that jiffies half-wraps around, while a dentry/inode remains in the cache. This could mean that the dentry/inode is not invalidated for another half wraparound-time. To get around this problem, use 64-bit jiffies. The only problem with this is that dentry->d_time is 32 bits on 32-bit archs. So use d_fsdata as the high 32 bits. 
This is an ugly hack, but far simpler, than having to allocate private data just for this purpose. Since 64-bit jiffies can be assumed never to wrap around, simple comparison can be used, and a zero time value can represent "invalid". Signed-off-by: Miklos Szeredi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fuse/dir.c | 44 ++++++++++++++++++++++++++++++++++++-------- fs/fuse/fuse_i.h | 2 +- fs/fuse/inode.c | 2 +- 3 files changed, 38 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 6db66ec386ae..409ce6a7cca4 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -14,6 +14,33 @@ #include #include +#if BITS_PER_LONG >= 64 +static inline void fuse_dentry_settime(struct dentry *entry, u64 time) +{ + entry->d_time = time; +} + +static inline u64 fuse_dentry_time(struct dentry *entry) +{ + return entry->d_time; +} +#else +/* + * On 32 bit archs store the high 32 bits of time in d_fsdata + */ +static void fuse_dentry_settime(struct dentry *entry, u64 time) +{ + entry->d_time = time; + entry->d_fsdata = (void *) (unsigned long) (time >> 32); +} + +static u64 fuse_dentry_time(struct dentry *entry) +{ + return (u64) entry->d_time + + ((u64) (unsigned long) entry->d_fsdata << 32); +} +#endif + /* * FUSE caches dentries and attributes with separate timeout. The * time in jiffies until the dentry/attributes are valid is stored in @@ -23,13 +50,13 @@ /* * Calculate the time in jiffies until a dentry/attributes are valid */ -static unsigned long time_to_jiffies(unsigned long sec, unsigned long nsec) +static u64 time_to_jiffies(unsigned long sec, unsigned long nsec) { if (sec || nsec) { struct timespec ts = {sec, nsec}; - return jiffies + timespec_to_jiffies(&ts); + return get_jiffies_64() + timespec_to_jiffies(&ts); } else - return jiffies - 1; + return 0; } /* @@ -38,7 +65,8 @@ static unsigned long time_to_jiffies(unsigned long sec, unsigned long nsec) */ static void fuse_change_timeout(struct dentry *entry, struct fuse_entry_out *o) { - entry->d_time = time_to_jiffies(o->entry_valid, o->entry_valid_nsec); + fuse_dentry_settime(entry, + time_to_jiffies(o->entry_valid, o->entry_valid_nsec)); if (entry->d_inode) get_fuse_inode(entry->d_inode)->i_time = time_to_jiffies(o->attr_valid, o->attr_valid_nsec); @@ -50,7 +78,7 @@ static void fuse_change_timeout(struct dentry *entry, struct fuse_entry_out *o) */ void fuse_invalidate_attr(struct inode *inode) { - get_fuse_inode(inode)->i_time = jiffies - 1; + get_fuse_inode(inode)->i_time = 0; } /* @@ -63,7 +91,7 @@ void fuse_invalidate_attr(struct inode *inode) */ static void fuse_invalidate_entry_cache(struct dentry *entry) { - entry->d_time = jiffies - 1; + fuse_dentry_settime(entry, 0); } /* @@ -105,7 +133,7 @@ static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd) if (inode && is_bad_inode(inode)) return 0; - else if (time_after(jiffies, entry->d_time)) { + else if (fuse_dentry_time(entry) < get_jiffies_64()) { int err; struct fuse_entry_out outarg; struct fuse_conn *fc; @@ -669,7 +697,7 @@ static int fuse_revalidate(struct dentry *entry) if (!fuse_allow_task(fc, current)) return -EACCES; if (get_node_id(inode) != FUSE_ROOT_ID && - time_before_eq(jiffies, fi->i_time)) + fi->i_time >= get_jiffies_64()) return 0; return fuse_do_getattr(inode); diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 0dbf96621841..69c7750d55b8 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -59,7 +59,7 @@ struct fuse_inode { struct fuse_req *forget_req; /** Time in jiffies until the file 
attributes are valid */ - unsigned long i_time; + u64 i_time; }; /** FUSE specific file data */ diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index dcaaabd3b9c4..7d25092262ae 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -51,7 +51,7 @@ static struct inode *fuse_alloc_inode(struct super_block *sb) return NULL; fi = get_fuse_inode(inode); - fi->i_time = jiffies - 1; + fi->i_time = 0; fi->nodeid = 0; fi->nlookup = 0; fi->forget_req = fuse_request_alloc(); -- cgit v1.2.3 From 873302c71c0e60234eb187b15f83c2d79e84c40a Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Sun, 30 Jul 2006 03:04:10 -0700 Subject: [PATCH] fuse: fix typo Signed-off-by: Miklos Szeredi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fuse/control.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/fuse/control.c b/fs/fuse/control.c index a3bce3a77253..46fe60b2da23 100644 --- a/fs/fuse/control.c +++ b/fs/fuse/control.c @@ -105,7 +105,7 @@ static struct dentry *fuse_ctl_add_dentry(struct dentry *parent, /* * Add a connection to the control filesystem (if it exists). Caller - * must host fuse_mutex + * must hold fuse_mutex */ int fuse_ctl_add_conn(struct fuse_conn *fc) { @@ -139,7 +139,7 @@ int fuse_ctl_add_conn(struct fuse_conn *fc) /* * Remove a connection from the control filesystem (if it exists). - * Caller must host fuse_mutex + * Caller must hold fuse_mutex */ void fuse_ctl_remove_conn(struct fuse_conn *fc) { -- cgit v1.2.3 From bc65ac6a0ffc66c56d1e6893685d7fe87c63cc44 Mon Sep 17 00:00:00 2001 From: Josh Triplett Date: Sun, 30 Jul 2006 03:04:12 -0700 Subject: [PATCH] freevxfs: Add missing lock_kernel() to vxfs_readdir Commit 7b2fd697427e73c81d5fa659efd91bd07d303b0e in the historical GIT tree stopped calling the readdir member of a file_operations struct with the big kernel lock held, and fixed up all the readdir functions to do their own locking. However, that change added calls to unlock_kernel() in vxfs_readdir, but no call to lock_kernel(). Fix this by adding a call to lock_kernel(). Signed-off-by: Josh Triplett Cc: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/freevxfs/vxfs_lookup.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/freevxfs/vxfs_lookup.c b/fs/freevxfs/vxfs_lookup.c index 29cce456c7ce..43886fa00a2a 100644 --- a/fs/freevxfs/vxfs_lookup.c +++ b/fs/freevxfs/vxfs_lookup.c @@ -246,6 +246,8 @@ vxfs_readdir(struct file *fp, void *retp, filldir_t filler) u_long page, npages, block, pblocks, nblocks, offset; loff_t pos; + lock_kernel(); + switch ((long)fp->f_pos) { case 0: if (filler(retp, ".", 1, fp->f_pos, ip->i_ino, DT_DIR) < 0) -- cgit v1.2.3 From 0e31f51d8177320d61ec5786ca4aafa7b7a749b4 Mon Sep 17 00:00:00 2001 From: Badari Pulavarty Date: Sun, 30 Jul 2006 03:04:14 -0700 Subject: [PATCH] ext3 -nobh option causes oops For files other than IFREG, nobh option doesn't make sense. Modifications to them are journalled and needs buffer heads to do that. Without this patch, we get kernel oops in page_buffers(). 
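The fuse_dentry_settime()/fuse_dentry_time() pair above stores a 64-bit timeout on 32-bit architectures by splitting it across d_time and d_fsdata. A user-space sketch of that split, with a stand-in struct instead of struct dentry:

#include <stdio.h>
#include <stdint.h>

struct fake_dentry {
        unsigned long d_time;   /* low 32 bits on a 32-bit build */
        void *d_fsdata;         /* high 32 bits, smuggled into a pointer */
};

static void dentry_settime(struct fake_dentry *d, uint64_t time)
{
        d->d_time = (unsigned long)time;
        d->d_fsdata = (void *)(unsigned long)(time >> 32);
}

static uint64_t dentry_time(const struct fake_dentry *d)
{
        /* Reassemble the value from the two halves. */
        return (uint64_t)(uint32_t)d->d_time +
               ((uint64_t)(unsigned long)d->d_fsdata << 32);
}

int main(void)
{
        struct fake_dentry d;
        uint64_t t = 0x123456789abcdef0ULL;

        dentry_settime(&d, t);
        printf("round trip ok: %d\n", dentry_time(&d) == t);    /* 1 */
        return 0;
}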
Signed-off-by: Badari Pulavarty Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ext3/inode.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index ab034d3053ea..c5ee9f0691e3 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -1158,7 +1158,7 @@ retry: ret = PTR_ERR(handle); goto out; } - if (test_opt(inode->i_sb, NOBH)) + if (test_opt(inode->i_sb, NOBH) && ext3_should_writeback_data(inode)) ret = nobh_prepare_write(page, from, to, ext3_get_block); else ret = block_prepare_write(page, from, to, ext3_get_block); @@ -1244,7 +1244,7 @@ static int ext3_writeback_commit_write(struct file *file, struct page *page, if (new_i_size > EXT3_I(inode)->i_disksize) EXT3_I(inode)->i_disksize = new_i_size; - if (test_opt(inode->i_sb, NOBH)) + if (test_opt(inode->i_sb, NOBH) && ext3_should_writeback_data(inode)) ret = nobh_commit_write(file, page, from, to); else ret = generic_commit_write(file, page, from, to); @@ -1494,7 +1494,7 @@ static int ext3_writeback_writepage(struct page *page, goto out_fail; } - if (test_opt(inode->i_sb, NOBH)) + if (test_opt(inode->i_sb, NOBH) && ext3_should_writeback_data(inode)) ret = nobh_writepage(page, ext3_get_block, wbc); else ret = block_write_full_page(page, ext3_get_block, wbc); -- cgit v1.2.3 From 4c90c68aca278f425afc0b48d86298b960fbc0ce Mon Sep 17 00:00:00 2001 From: Russ Ross Date: Sun, 30 Jul 2006 03:04:15 -0700 Subject: [PATCH] 9p: fix marshalling bug in tcreate with empty extension field Signed-off-by: Russ Ross Signed-off-by: Eric Van Hensbergen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/9p/conv.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/9p/conv.c b/fs/9p/conv.c index 1e898144eb7c..56d88c1a09c5 100644 --- a/fs/9p/conv.c +++ b/fs/9p/conv.c @@ -673,8 +673,10 @@ struct v9fs_fcall *v9fs_create_tcreate(u32 fid, char *name, u32 perm, u8 mode, struct cbuf *bufp = &buffer; size = 4 + 2 + strlen(name) + 4 + 1; /* fid[4] name[s] perm[4] mode[1] */ - if (extended && extension!=NULL) - size += 2 + strlen(extension); /* extension[s] */ + if (extended) { + size += 2 + /* extension[s] */ + (extension == NULL ? 0 : strlen(extension)); + } fc = v9fs_create_common(bufp, size, TCREATE); if (IS_ERR(fc)) -- cgit v1.2.3 From 834a9b8ca7a01c34570be021f88e18884a29f048 Mon Sep 17 00:00:00 2001 From: Eric Van Hensbergen Date: Sun, 30 Jul 2006 03:04:16 -0700 Subject: [PATCH] 9p: fix fid behavior on failed remove Based on a bug report from Russ Ross According to the spec: "The remove request asks the file server both to remove the file represented by fid and to clunk the fid, even if the remove fails." but the Linux client seems to expect the fid to be valid after a failed remove attempt. Specifically, I'm getting this behavior when attempting to remove a non-empty directory. 
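The v9fs_create_tcreate() fix above changes only the size computation: an extended-protocol message always reserves the 2-byte string length prefix for extension[s], and adds the text length only when an extension is actually supplied. A sketch of that computation with the same field layout but an illustrative function name:

#include <stdio.h>
#include <string.h>
#include <stddef.h>

static size_t tcreate_size(const char *name, const char *extension,
                           int extended)
{
        /* fid[4] name[s] perm[4] mode[1] */
        size_t size = 4 + 2 + strlen(name) + 4 + 1;

        if (extended) {
                /* extension[s]: always the 2-byte prefix, plus the
                 * text when one is present */
                size += 2 + (extension == NULL ? 0 : strlen(extension));
        }
        return size;
}

int main(void)
{
        printf("no extension:    %zu\n", tcreate_size("file", NULL, 1));
        printf("with extension:  %zu\n", tcreate_size("file", "b 1 2", 1));
        printf("legacy protocol: %zu\n", tcreate_size("file", NULL, 0));
        return 0;
}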
Signed-off-by: Eric Van Hensbergen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/9p/vfs_inode.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 2f580a197b8d..eae50c9d6dc4 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -434,11 +434,11 @@ static int v9fs_remove(struct inode *dir, struct dentry *file, int rmdir) result = v9fs_t_remove(v9ses, fid, &fcall); if (result < 0) { PRINT_FCALL_ERROR("remove fails", fcall); - } else { - v9fs_put_idpool(fid, &v9ses->fidpool); - v9fs_fid_destroy(v9fid); } + v9fs_put_idpool(fid, &v9ses->fidpool); + v9fs_fid_destroy(v9fid); + kfree(fcall); return result; } -- cgit v1.2.3 From 3e2efce067cec0099f99ae59f28feda99b02b498 Mon Sep 17 00:00:00 2001 From: Amy Griffis Date: Thu, 13 Jul 2006 13:16:02 -0400 Subject: [PATCH] fix faulty inode data collection for open() with O_CREAT When the specified path is an existing file or when it is a symlink, audit collects the wrong inode number, which causes it to miss the open() event. Adding a second hook to the open() path fixes this. Also add audit_copy_inode() to consolidate some code. Signed-off-by: Amy Griffis Signed-off-by: Al Viro --- fs/namei.c | 2 ++ include/linux/audit.h | 7 ++++++ kernel/auditsc.c | 63 +++++++++++++++++++++++++++++++++------------------ 3 files changed, 50 insertions(+), 22 deletions(-) (limited to 'fs') diff --git a/fs/namei.c b/fs/namei.c index e01070d7bf58..47a7bad92d2a 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1659,6 +1659,7 @@ do_last: * It already exists. */ mutex_unlock(&dir->d_inode->i_mutex); + audit_inode_update(path.dentry->d_inode); error = -EEXIST; if (flag & O_EXCL) @@ -1669,6 +1670,7 @@ do_last: if (flag & O_NOFOLLOW) goto exit_dput; } + error = -ENOENT; if (!path.dentry->d_inode) goto exit_dput; diff --git a/include/linux/audit.h b/include/linux/audit.h index b27d7debc5a1..e7e5e5348987 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -328,6 +328,7 @@ extern void audit_putname(const char *name); extern void __audit_inode(const char *name, const struct inode *inode); extern void __audit_inode_child(const char *dname, const struct inode *inode, unsigned long pino); +extern void __audit_inode_update(const struct inode *inode); static inline void audit_getname(const char *name) { if (unlikely(current->audit_context)) @@ -343,6 +344,10 @@ static inline void audit_inode_child(const char *dname, if (unlikely(current->audit_context)) __audit_inode_child(dname, inode, pino); } +static inline void audit_inode_update(const struct inode *inode) { + if (unlikely(current->audit_context)) + __audit_inode_update(inode); +} /* Private API (for audit.c only) */ extern unsigned int audit_serial(void); @@ -414,8 +419,10 @@ static inline int audit_mq_getsetattr(mqd_t mqdes, struct mq_attr *mqstat) #define audit_putname(n) do { ; } while (0) #define __audit_inode(n,i) do { ; } while (0) #define __audit_inode_child(d,i,p) do { ; } while (0) +#define __audit_inode_update(i) do { ; } while (0) #define audit_inode(n,i) do { ; } while (0) #define audit_inode_child(d,i,p) do { ; } while (0) +#define audit_inode_update(i) do { ; } while (0) #define auditsc_get_stamp(c,t,s) do { BUG(); } while (0) #define audit_get_loginuid(c) ({ -1; }) #define audit_ipc_obj(i) ({ 0; }) diff --git a/kernel/auditsc.c b/kernel/auditsc.c index ae40ac8c39e7..b939ed2da3ee 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1199,14 +1199,18 @@ void audit_putname(const char *name) #endif } -static 
void audit_inode_context(int idx, const struct inode *inode) +/* Copy inode data into an audit_names. */ +static void audit_copy_inode(struct audit_names *name, const struct inode *inode) { - struct audit_context *context = current->audit_context; - - selinux_get_inode_sid(inode, &context->names[idx].osid); + name->ino = inode->i_ino; + name->dev = inode->i_sb->s_dev; + name->mode = inode->i_mode; + name->uid = inode->i_uid; + name->gid = inode->i_gid; + name->rdev = inode->i_rdev; + selinux_get_inode_sid(inode, &name->osid); } - /** * audit_inode - store the inode and device from a lookup * @name: name being audited @@ -1240,13 +1244,7 @@ void __audit_inode(const char *name, const struct inode *inode) ++context->ino_count; #endif } - context->names[idx].ino = inode->i_ino; - context->names[idx].dev = inode->i_sb->s_dev; - context->names[idx].mode = inode->i_mode; - context->names[idx].uid = inode->i_uid; - context->names[idx].gid = inode->i_gid; - context->names[idx].rdev = inode->i_rdev; - audit_inode_context(idx, inode); + audit_copy_inode(&context->names[idx], inode); } /** @@ -1302,16 +1300,37 @@ update_context: context->names[idx].name_len = AUDIT_NAME_FULL; context->names[idx].name_put = 0; /* don't call __putname() */ - if (inode) { - context->names[idx].ino = inode->i_ino; - context->names[idx].dev = inode->i_sb->s_dev; - context->names[idx].mode = inode->i_mode; - context->names[idx].uid = inode->i_uid; - context->names[idx].gid = inode->i_gid; - context->names[idx].rdev = inode->i_rdev; - audit_inode_context(idx, inode); - } else - context->names[idx].ino = (unsigned long)-1; + if (!inode) + context->names[idx].ino = (unsigned long)-1; + else + audit_copy_inode(&context->names[idx], inode); +} + +/** + * audit_inode_update - update inode info for last collected name + * @inode: inode being audited + * + * When open() is called on an existing object with the O_CREAT flag, the inode + * data audit initially collects is incorrect. This additional hook ensures + * audit has the inode data for the actual object to be opened. + */ +void __audit_inode_update(const struct inode *inode) +{ + struct audit_context *context = current->audit_context; + int idx; + + if (!context->in_syscall || !inode) + return; + + if (context->name_count == 0) { + context->name_count++; +#if AUDIT_DEBUG + context->ino_count++; +#endif + } + idx = context->name_count - 1; + + audit_copy_inode(&context->names[idx], inode); } /** -- cgit v1.2.3 From 73d3ec5abad3f1730ac8530899d2c14d92f3ad63 Mon Sep 17 00:00:00 2001 From: Amy Griffis Date: Thu, 13 Jul 2006 13:16:39 -0400 Subject: [PATCH] fix missed create event for directory audit When an object is created via a symlink into an audited directory, audit misses the event due to not having collected the inode data for the directory. Modify __audit_inode_child() to copy the parent inode data if a parent wasn't found in audit_names[]. 
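For illustration only (not part of the patch): the kind of sequence that previously lost the create record. The directory is named through a symlink, so the parent inode __audit_inode_child() later searches for was never collected. A minimal user-space reproducer sketch; the paths /watched and /tmp/lnk are hypothetical, and an audit rule watching /watched is assumed to be in place.

	/* Hypothetical reproducer sketch; assumes /watched is already audited. */
	#include <fcntl.h>
	#include <unistd.h>

	int main(void)
	{
		/* Name the audited directory only through a symlink ... */
		if (symlink("/watched", "/tmp/lnk") != 0)
			return 1;
		/* ... then create inside it: before this fix the parent inode data
		 * was never collected, so no create event was reported. */
		int fd = open("/tmp/lnk/newfile", O_CREAT | O_WRONLY, 0644);
		if (fd >= 0)
			close(fd);
		return 0;
	}
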
Signed-off-by: Amy Griffis Signed-off-by: Al Viro --- fs/namei.c | 2 +- include/linux/audit.h | 8 ++++---- include/linux/fsnotify.h | 6 +++--- kernel/auditsc.c | 16 +++++++++++++--- 4 files changed, 21 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/namei.c b/fs/namei.c index 47a7bad92d2a..0ab26cbdacc0 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1357,7 +1357,7 @@ static int may_delete(struct inode *dir,struct dentry *victim,int isdir) return -ENOENT; BUG_ON(victim->d_parent->d_inode != dir); - audit_inode_child(victim->d_name.name, victim->d_inode, dir->i_ino); + audit_inode_child(victim->d_name.name, victim->d_inode, dir); error = permission(dir,MAY_WRITE | MAY_EXEC, NULL); if (error) diff --git a/include/linux/audit.h b/include/linux/audit.h index e7e5e5348987..bf196c05826c 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -327,7 +327,7 @@ extern void __audit_getname(const char *name); extern void audit_putname(const char *name); extern void __audit_inode(const char *name, const struct inode *inode); extern void __audit_inode_child(const char *dname, const struct inode *inode, - unsigned long pino); + const struct inode *parent); extern void __audit_inode_update(const struct inode *inode); static inline void audit_getname(const char *name) { @@ -339,10 +339,10 @@ static inline void audit_inode(const char *name, const struct inode *inode) { __audit_inode(name, inode); } static inline void audit_inode_child(const char *dname, - const struct inode *inode, - unsigned long pino) { + const struct inode *inode, + const struct inode *parent) { if (unlikely(current->audit_context)) - __audit_inode_child(dname, inode, pino); + __audit_inode_child(dname, inode, parent); } static inline void audit_inode_update(const struct inode *inode) { if (unlikely(current->audit_context)) diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h index cc5dec70c32c..d4f219ffaa5d 100644 --- a/include/linux/fsnotify.h +++ b/include/linux/fsnotify.h @@ -67,7 +67,7 @@ static inline void fsnotify_move(struct inode *old_dir, struct inode *new_dir, if (source) { inotify_inode_queue_event(source, IN_MOVE_SELF, 0, NULL, NULL); } - audit_inode_child(new_name, source, new_dir->i_ino); + audit_inode_child(new_name, source, new_dir); } /* @@ -98,7 +98,7 @@ static inline void fsnotify_create(struct inode *inode, struct dentry *dentry) inode_dir_notify(inode, DN_CREATE); inotify_inode_queue_event(inode, IN_CREATE, 0, dentry->d_name.name, dentry->d_inode); - audit_inode_child(dentry->d_name.name, dentry->d_inode, inode->i_ino); + audit_inode_child(dentry->d_name.name, dentry->d_inode, inode); } /* @@ -109,7 +109,7 @@ static inline void fsnotify_mkdir(struct inode *inode, struct dentry *dentry) inode_dir_notify(inode, DN_CREATE); inotify_inode_queue_event(inode, IN_CREATE | IN_ISDIR, 0, dentry->d_name.name, dentry->d_inode); - audit_inode_child(dentry->d_name.name, dentry->d_inode, inode->i_ino); + audit_inode_child(dentry->d_name.name, dentry->d_inode, inode); } /* diff --git a/kernel/auditsc.c b/kernel/auditsc.c index b939ed2da3ee..b1356fc63b26 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1251,7 +1251,7 @@ void __audit_inode(const char *name, const struct inode *inode) * audit_inode_child - collect inode info for created/removed objects * @dname: inode's dentry name * @inode: inode being audited - * @pino: inode number of dentry parent + * @parent: inode of dentry parent * * For syscalls that create or remove filesystem objects, audit_inode * can only collect information for 
the filesystem object's parent. @@ -1262,7 +1262,7 @@ void __audit_inode(const char *name, const struct inode *inode) * unsuccessful attempts. */ void __audit_inode_child(const char *dname, const struct inode *inode, - unsigned long pino) + const struct inode *parent) { int idx; struct audit_context *context = current->audit_context; @@ -1276,7 +1276,7 @@ void __audit_inode_child(const char *dname, const struct inode *inode, if (!dname) goto update_context; for (idx = 0; idx < context->name_count; idx++) - if (context->names[idx].ino == pino) { + if (context->names[idx].ino == parent->i_ino) { const char *name = context->names[idx].name; if (!name) @@ -1304,6 +1304,16 @@ update_context: context->names[idx].ino = (unsigned long)-1; else audit_copy_inode(&context->names[idx], inode); + + /* A parent was not found in audit_names, so copy the inode data for the + * provided parent. */ + if (!found_name) { + idx = context->name_count++; +#if AUDIT_DEBUG + context->ino_count++; +#endif + audit_copy_inode(&context->names[idx], parent); + } } /** -- cgit v1.2.3 From 5ac3a9c26c1cc4861d9cdd8b293fecbfcdc81afe Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 16 Jul 2006 06:38:45 -0400 Subject: [PATCH] don't bother with aux entires for dummy context Signed-off-by: Al Viro --- fs/namei.c | 4 ++-- include/linux/audit.h | 22 +++++++++++----------- kernel/auditsc.c | 6 +++--- 3 files changed, 16 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/namei.c b/fs/namei.c index 0ab26cbdacc0..55a131230f94 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -159,7 +159,7 @@ char * getname(const char __user * filename) #ifdef CONFIG_AUDITSYSCALL void putname(const char *name) { - if (unlikely(current->audit_context)) + if (unlikely(!audit_dummy_context())) audit_putname(name); else __putname(name); @@ -1125,7 +1125,7 @@ static int fastcall do_path_lookup(int dfd, const char *name, retval = link_path_walk(name, nd); out: if (likely(retval == 0)) { - if (unlikely(current->audit_context && nd && nd->dentry && + if (unlikely(!audit_dummy_context() && nd && nd->dentry && nd->dentry->d_inode)) audit_inode(name, nd->dentry->d_inode); } diff --git a/include/linux/audit.h b/include/linux/audit.h index 3f736d658218..64f9f9e56ac5 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -336,21 +336,21 @@ static inline int audit_dummy_context(void) } static inline void audit_getname(const char *name) { - if (unlikely(current->audit_context)) + if (unlikely(!audit_dummy_context())) __audit_getname(name); } static inline void audit_inode(const char *name, const struct inode *inode) { - if (unlikely(current->audit_context)) + if (unlikely(!audit_dummy_context())) __audit_inode(name, inode); } static inline void audit_inode_child(const char *dname, const struct inode *inode, const struct inode *parent) { - if (unlikely(current->audit_context)) + if (unlikely(!audit_dummy_context())) __audit_inode_child(dname, inode, parent); } static inline void audit_inode_update(const struct inode *inode) { - if (unlikely(current->audit_context)) + if (unlikely(!audit_dummy_context())) __audit_inode_update(inode); } @@ -375,43 +375,43 @@ extern int __audit_mq_getsetattr(mqd_t mqdes, struct mq_attr *mqstat); static inline int audit_ipc_obj(struct kern_ipc_perm *ipcp) { - if (unlikely(current->audit_context)) + if (unlikely(!audit_dummy_context())) return __audit_ipc_obj(ipcp); return 0; } static inline int audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode) { - if (unlikely(current->audit_context)) + if 
(unlikely(!audit_dummy_context())) return __audit_ipc_set_perm(qbytes, uid, gid, mode); return 0; } static inline int audit_mq_open(int oflag, mode_t mode, struct mq_attr __user *u_attr) { - if (unlikely(current->audit_context)) + if (unlikely(!audit_dummy_context())) return __audit_mq_open(oflag, mode, u_attr); return 0; } static inline int audit_mq_timedsend(mqd_t mqdes, size_t msg_len, unsigned int msg_prio, const struct timespec __user *u_abs_timeout) { - if (unlikely(current->audit_context)) + if (unlikely(!audit_dummy_context())) return __audit_mq_timedsend(mqdes, msg_len, msg_prio, u_abs_timeout); return 0; } static inline int audit_mq_timedreceive(mqd_t mqdes, size_t msg_len, unsigned int __user *u_msg_prio, const struct timespec __user *u_abs_timeout) { - if (unlikely(current->audit_context)) + if (unlikely(!audit_dummy_context())) return __audit_mq_timedreceive(mqdes, msg_len, u_msg_prio, u_abs_timeout); return 0; } static inline int audit_mq_notify(mqd_t mqdes, const struct sigevent __user *u_notification) { - if (unlikely(current->audit_context)) + if (unlikely(!audit_dummy_context())) return __audit_mq_notify(mqdes, u_notification); return 0; } static inline int audit_mq_getsetattr(mqd_t mqdes, struct mq_attr *mqstat) { - if (unlikely(current->audit_context)) + if (unlikely(!audit_dummy_context())) return __audit_mq_getsetattr(mqdes, mqstat); return 0; } diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 9618d1507251..f571c7e925e6 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1676,7 +1676,7 @@ int audit_bprm(struct linux_binprm *bprm) unsigned long p, next; void *to; - if (likely(!audit_enabled || !context)) + if (likely(!audit_enabled || !context || context->dummy)) return 0; ax = kmalloc(sizeof(*ax) + PAGE_SIZE * MAX_ARG_PAGES - bprm->p, @@ -1714,7 +1714,7 @@ int audit_socketcall(int nargs, unsigned long *args) struct audit_aux_data_socketcall *ax; struct audit_context *context = current->audit_context; - if (likely(!context)) + if (likely(!context || context->dummy)) return 0; ax = kmalloc(sizeof(*ax) + nargs * sizeof(unsigned long), GFP_KERNEL); @@ -1742,7 +1742,7 @@ int audit_sockaddr(int len, void *a) struct audit_aux_data_sockaddr *ax; struct audit_context *context = current->audit_context; - if (likely(!context)) + if (likely(!context || context->dummy)) return 0; ax = kmalloc(sizeof(*ax) + len, GFP_KERNEL); -- cgit v1.2.3 From ce510193272c295b891e45525a83b543ae3207c1 Mon Sep 17 00:00:00 2001 From: Josh Triplett Date: Mon, 24 Jul 2006 16:30:00 -0700 Subject: NFS: Release dcache_lock in an error path of nfs_path In one of the error paths of nfs_path, it may return with dcache_lock still held; fix this by adding and using a new error path Elong_unlock which unlocks dcache_lock. 
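As a compact sketch of the pattern the fix applies (not the literal nfs_path() body; the dentry walk is elided and only the locking shape is shown): an error exit taken while the lock is held gets its own label that drops the lock first.

	spin_lock(&dcache_lock);
	for (;;) {
		/* ... walk one dentry up towards the root ... */
		buflen -= namelen + 1;
		if (buflen < 0)
			goto Elong_unlock;	/* was "goto Elong": returned with the lock held */
		/* ... copy the component name into the buffer ... */
	}
	spin_unlock(&dcache_lock);
	return end;

	Elong_unlock:
		spin_unlock(&dcache_lock);	/* release before taking the error return */
	Elong:
		return ERR_PTR(-ENAMETOOLONG);
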
Signed-off-by: Josh Triplett Signed-off-by: Trond Myklebust (cherry picked from f4b90b43677fb23297c56802c3056fc304f988d9 commit) --- fs/nfs/namespace.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c index 19b98ca468eb..86b3169c8cac 100644 --- a/fs/nfs/namespace.c +++ b/fs/nfs/namespace.c @@ -51,7 +51,7 @@ char *nfs_path(const char *base, const struct dentry *dentry, namelen = dentry->d_name.len; buflen -= namelen + 1; if (buflen < 0) - goto Elong; + goto Elong_unlock; end -= namelen; memcpy(end, dentry->d_name.name, namelen); *--end = '/'; @@ -68,6 +68,8 @@ char *nfs_path(const char *base, const struct dentry *dentry, end -= namelen; memcpy(end, base, namelen); return end; +Elong_unlock: + spin_unlock(&dcache_lock); Elong: return ERR_PTR(-ENAMETOOLONG); } -- cgit v1.2.3 From e4e20512cfe0bacec0764b4925889d1fa94644f9 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Thu, 3 Aug 2006 15:07:47 -0400 Subject: NFS: make 2 functions static nfs_writedata_free() and nfs_readdata_free() can now become static. Signed-off-by: Adrian Bunk Cc: Trond Myklebust Signed-off-by: Andrew Morton Signed-off-by: Trond Myklebust (cherry picked from 5e1ce40f0c3c8f67591aff17756930d7a18ceb1a commit) --- fs/nfs/read.c | 2 +- fs/nfs/write.c | 2 +- include/linux/nfs_fs.h | 6 ++---- 3 files changed, 4 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/nfs/read.c b/fs/nfs/read.c index 52bf634260a1..65c0c5b32351 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -63,7 +63,7 @@ struct nfs_read_data *nfs_readdata_alloc(unsigned int pagecount) return p; } -void nfs_readdata_free(struct nfs_read_data *p) +static void nfs_readdata_free(struct nfs_read_data *p) { if (p && (p->pagevec != &p->page_array[0])) kfree(p->pagevec); diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 86bac6a5008e..50774991f8d5 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -137,7 +137,7 @@ struct nfs_write_data *nfs_writedata_alloc(unsigned int pagecount) return p; } -void nfs_writedata_free(struct nfs_write_data *p) +static void nfs_writedata_free(struct nfs_write_data *p) { if (p && (p->pagevec != &p->page_array[0])) kfree(p->pagevec); diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 55ea853d57bc..247434553ae8 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -476,10 +476,9 @@ static inline int nfs_wb_page(struct inode *inode, struct page* page) } /* - * Allocate and free nfs_write_data structures + * Allocate nfs_write_data structures */ extern struct nfs_write_data *nfs_writedata_alloc(unsigned int pagecount); -extern void nfs_writedata_free(struct nfs_write_data *p); /* * linux/fs/nfs/read.c @@ -491,10 +490,9 @@ extern int nfs_readpage_result(struct rpc_task *, struct nfs_read_data *); extern void nfs_readdata_release(void *data); /* - * Allocate and free nfs_read_data structures + * Allocate nfs_read_data structures */ extern struct nfs_read_data *nfs_readdata_alloc(unsigned int pagecount); -extern void nfs_readdata_free(struct nfs_read_data *p); /* * linux/fs/nfs3proc.c -- cgit v1.2.3 From f3d43c769d14b7065da7f62ec468b1fcb8cd6e06 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Thu, 3 Aug 2006 15:07:47 -0400 Subject: NLM/lockd: remove b_done We never actually set the b_done field any more; it's always zero. Signed-off-by: J. 
Bruce Fields Signed-off-by: Trond Myklebust (cherry picked from af8412d4283ef91356e65e0ed9b025b376aebded commit) --- fs/lockd/svclock.c | 12 +++--------- include/linux/lockd/lockd.h | 1 - 2 files changed, 3 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c index baf5ae513481..c9d419703cf3 100644 --- a/fs/lockd/svclock.c +++ b/fs/lockd/svclock.c @@ -638,9 +638,6 @@ static void nlmsvc_grant_callback(struct rpc_task *task, void *data) if (task->tk_status < 0) { /* RPC error: Re-insert for retransmission */ timeout = 10 * HZ; - } else if (block->b_done) { - /* Block already removed, kill it for real */ - timeout = 0; } else { /* Call was successful, now wait for client callback */ timeout = 60 * HZ; @@ -709,13 +706,10 @@ nlmsvc_retry_blocked(void) break; if (time_after(block->b_when,jiffies)) break; - dprintk("nlmsvc_retry_blocked(%p, when=%ld, done=%d)\n", - block, block->b_when, block->b_done); + dprintk("nlmsvc_retry_blocked(%p, when=%ld)\n", + block, block->b_when); kref_get(&block->b_count); - if (block->b_done) - nlmsvc_unlink_block(block); - else - nlmsvc_grant_blocked(block); + nlmsvc_grant_blocked(block); nlmsvc_release_block(block); } diff --git a/include/linux/lockd/lockd.h b/include/linux/lockd/lockd.h index aa4fe905bb4d..0d92c468d55a 100644 --- a/include/linux/lockd/lockd.h +++ b/include/linux/lockd/lockd.h @@ -123,7 +123,6 @@ struct nlm_block { unsigned int b_id; /* block id */ unsigned char b_queued; /* re-queued */ unsigned char b_granted; /* VFS granted lock */ - unsigned char b_done; /* callback complete */ struct nlm_file * b_file; /* file in question */ }; -- cgit v1.2.3 From 1fb32b7bd8203d0175649a75ede3ee7634d6a941 Mon Sep 17 00:00:00 2001 From: Evgeniy Dushistov Date: Sat, 5 Aug 2006 12:13:55 -0700 Subject: [PATCH] ufs: ufs_get_locked_page() race fix As discussed earlier: http://lkml.org/lkml/2006/6/28/136 this patch fixes such issue: `ufs_get_locked_page' takes page from cache after that `vmtruncate' takes page and deletes it from cache `ufs_get_locked_page' locks page, and reports about EIO error. Also because of find_lock_page always return valid page or NULL, we have no need to check it if page not NULL. 
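The shape of the race-safe lookup the two ufs patches converge on, as a simplified sketch (error reporting and the PageUptodate check are elided; the function name is illustrative): take the page, lock it, and only then check whether truncation removed it from the mapping.

	static struct page *get_locked_page_sketch(struct address_space *mapping,
						   pgoff_t index)
	{
		struct page *page = find_lock_page(mapping, index);	/* NULL or locked */

		if (!page) {
			page = read_cache_page(mapping, index,
					       (filler_t *)mapping->a_ops->readpage, NULL);
			if (IS_ERR(page))
				return page;

			lock_page(page);
			if (page->mapping == NULL) {
				/* vmtruncate() removed the page while we slept:
				 * not an EIO, just tell the caller it is gone. */
				unlock_page(page);
				page_cache_release(page);
				return NULL;
			}
		}
		return page;
	}
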
Signed-off-by: Evgeniy Dushistov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ufs/util.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/ufs/util.c b/fs/ufs/util.c index 337cf2c46d10..005d6815adf5 100644 --- a/fs/ufs/util.c +++ b/fs/ufs/util.c @@ -257,6 +257,7 @@ try_again: page = read_cache_page(mapping, index, (filler_t*)mapping->a_ops->readpage, NULL); + if (IS_ERR(page)) { printk(KERN_ERR "ufs_change_blocknr: " "read_cache_page error: ino %lu, index: %lu\n", @@ -266,6 +267,13 @@ try_again: lock_page(page); + if (unlikely(page->mapping == NULL)) { + /* Truncate got there first */ + unlock_page(page); + page_cache_release(page); + goto try_again; + } + if (!PageUptodate(page) || PageError(page)) { unlock_page(page); page_cache_release(page); @@ -275,15 +283,8 @@ try_again: mapping->host->i_ino, index); page = ERR_PTR(-EIO); - goto out; } } - - if (unlikely(!page->mapping || !page_has_buffers(page))) { - unlock_page(page); - page_cache_release(page); - goto try_again;/*we really need these buffers*/ - } out: return page; } -- cgit v1.2.3 From 06fa45d3a19c6fbfccbf295e9f08087492338631 Mon Sep 17 00:00:00 2001 From: Evgeniy Dushistov Date: Sat, 5 Aug 2006 12:13:57 -0700 Subject: [PATCH] ufs: handle truncated pages ufs_get_locked_page is called twice in ufs code, one time in ufs_truncate path(we allocated last block), and another time when fragments are reallocated. In ideal world in the second case on allocation/free block layer we should not know that things like `truncate' exists, but now with such crutch like ufs_get_locked_page we can (or should?) skip truncated pages. Signed-off-by: Evgeniy Dushistov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ufs/balloc.c | 2 +- fs/ufs/util.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ufs/balloc.c b/fs/ufs/balloc.c index b01804baa120..b82381475779 100644 --- a/fs/ufs/balloc.c +++ b/fs/ufs/balloc.c @@ -248,7 +248,7 @@ static void ufs_change_blocknr(struct inode *inode, unsigned int baseblk, if (likely(cur_index != index)) { page = ufs_get_locked_page(mapping, index); - if (IS_ERR(page)) + if (!page || IS_ERR(page)) /* it was truncated or EIO */ continue; } else page = locked_page; diff --git a/fs/ufs/util.c b/fs/ufs/util.c index 005d6815adf5..22f820a9b15c 100644 --- a/fs/ufs/util.c +++ b/fs/ufs/util.c @@ -251,7 +251,6 @@ struct page *ufs_get_locked_page(struct address_space *mapping, { struct page *page; -try_again: page = find_lock_page(mapping, index); if (!page) { page = read_cache_page(mapping, index, @@ -271,7 +270,8 @@ try_again: /* Truncate got there first */ unlock_page(page); page_cache_release(page); - goto try_again; + page = NULL; + goto out; } if (!PageUptodate(page) || PageError(page)) { -- cgit v1.2.3 From b0b33dee2dcc85626627919094befc17cfb141e4 Mon Sep 17 00:00:00 2001 From: Alexander Zarochentsev Date: Sat, 5 Aug 2006 12:14:01 -0700 Subject: [PATCH] i_mutex does not need to be locked in reiserfs_delete_inode() Fixes an i_mutex-inside-i_mutex lockdep nasty. 
Signed-off-by: Alexander Zarochentsev Cc: Cc: Hans Reiser Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/reiserfs/inode.c | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index 12dfdcfbee3d..ac57305b1afc 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -39,14 +39,10 @@ void reiserfs_delete_inode(struct inode *inode) /* The = 0 happens when we abort creating a new inode for some reason like lack of space.. */ if (!(inode->i_state & I_NEW) && INODE_PKEY(inode)->k_objectid != 0) { /* also handles bad_inode case */ - mutex_lock(&inode->i_mutex); - reiserfs_delete_xattrs(inode); - if (journal_begin(&th, inode->i_sb, jbegin_count)) { - mutex_unlock(&inode->i_mutex); + if (journal_begin(&th, inode->i_sb, jbegin_count)) goto out; - } reiserfs_update_inode_transaction(inode); err = reiserfs_delete_object(&th, inode); @@ -57,12 +53,8 @@ void reiserfs_delete_inode(struct inode *inode) if (!err) DQUOT_FREE_INODE(inode); - if (journal_end(&th, inode->i_sb, jbegin_count)) { - mutex_unlock(&inode->i_mutex); + if (journal_end(&th, inode->i_sb, jbegin_count)) goto out; - } - - mutex_unlock(&inode->i_mutex); /* check return value from reiserfs_delete_object after * ending the transaction -- cgit v1.2.3 From 94f563c426a78c97fc2a377315995e6ec8343872 Mon Sep 17 00:00:00 2001 From: Diego Calleja Date: Sat, 5 Aug 2006 12:14:55 -0700 Subject: [PATCH] Fix BeFS slab corruption In bugzilla #6941, Jens Kilian reported: "The function befs_utf2nls (in fs/befs/linuxvfs.c) writes a 0 byte past the end of a block of memory allocated via kmalloc(), leading to memory corruption. This happens only for filenames which are pure ASCII and a multiple of 4 bytes in length. [...] Without DEBUG_SLAB, this leads to further corruption and hard lockups; I believe this is the bug which has made kernels later than 2.6.8 unusable for me. (This must be due to changes in memory management, the bug has been in the BeFS driver since the time it was introduced (AFAICT).) Steps to reproduce: Create a directory (in BeOS, naturally :-) with files named, e.g., "1", "22", "333", "4444", ... Mount it in Linux and do an "ls" or "find"" This patch implements the suggested fix. Credits to Jens Kilian for debugging the problem and finding the right fix. 
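A worked example of the sizing, using one of the file names from the bug report (illustrative values, not code from the patch): for a pure-ASCII name the NLS output is exactly as wide as the UTF-8 input, so the terminating NUL needs one byte beyond in_len.

	/* Illustration only: why kmalloc(in_len) is one byte short for pure ASCII. */
	const char *in = "4444";	/* 4 ASCII bytes; NLS output is also 4 bytes  */
	int in_len = 4;
	int old_maxlen = in_len;	/* old code: the '\0' written at result[4]    */
					/* lands past the end of the allocation       */
	int new_maxlen = in_len + 1;	/* fixed code: 5 bytes, "4444" plus the '\0'  */
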
Signed-off-by: Diego Calleja Cc: Jens Kilian Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/befs/linuxvfs.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c index fcaeead9696b..50cfca5c7efd 100644 --- a/fs/befs/linuxvfs.c +++ b/fs/befs/linuxvfs.c @@ -512,7 +512,11 @@ befs_utf2nls(struct super_block *sb, const char *in, wchar_t uni; int unilen, utflen; char *result; - int maxlen = in_len; /* The utf8->nls conversion can't make more chars */ + /* The utf8->nls conversion won't make the final nls string bigger + * than the utf one, but if the string is pure ascii they'll have the + * same width and an extra char is needed to save the additional \0 + */ + int maxlen = in_len + 1; befs_debug(sb, "---> utf2nls()"); @@ -588,7 +592,10 @@ befs_nls2utf(struct super_block *sb, const char *in, wchar_t uni; int unilen, utflen; char *result; - int maxlen = 3 * in_len; + /* There're nls characters that will translate to 3-chars-wide UTF-8 + * characters, a additional byte is needed to save the final \0 + * in special cases */ + int maxlen = (3 * in_len) + 1; befs_debug(sb, "---> nls2utf()\n"); -- cgit v1.2.3 From b5f3953c10b27fcd1c83e199e573b41d8327e22e Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Sat, 5 Aug 2006 12:15:08 -0700 Subject: [PATCH] fix reiserfs lock inversion of bkl vs inode semaphore The correct lock ordering is inode lock -> BKL Signed-off-by: Chris Mason Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/reiserfs/file.c | 2 +- fs/reiserfs/ioctl.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c index f318b58510fd..1627edd50810 100644 --- a/fs/reiserfs/file.c +++ b/fs/reiserfs/file.c @@ -48,8 +48,8 @@ static int reiserfs_file_release(struct inode *inode, struct file *filp) return 0; } - reiserfs_write_lock(inode->i_sb); mutex_lock(&inode->i_mutex); + reiserfs_write_lock(inode->i_sb); /* freeing preallocation only involves relogging blocks that * are already in the current transaction. preallocation gets * freed at the end of each transaction, so it is impossible for diff --git a/fs/reiserfs/ioctl.c b/fs/reiserfs/ioctl.c index 745c88100895..a986b5e1e288 100644 --- a/fs/reiserfs/ioctl.c +++ b/fs/reiserfs/ioctl.c @@ -116,12 +116,12 @@ static int reiserfs_unpack(struct inode *inode, struct file *filp) if (REISERFS_I(inode)->i_flags & i_nopack_mask) { return 0; } - reiserfs_write_lock(inode->i_sb); /* we need to make sure nobody is changing the file size beneath ** us */ mutex_lock(&inode->i_mutex); + reiserfs_write_lock(inode->i_sb); write_from = inode->i_size & (blocksize - 1); /* if we are on a block boundary, we are already unpacked. */ -- cgit v1.2.3 From b4c76fa721c7c8a43655a74e508870d21d2e26d3 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Sat, 5 Aug 2006 12:15:10 -0700 Subject: [PATCH] reiserfs_write_full_page() should not get_block past eof reiserfs_write_full_page does zero bytes in the file past eof, but it may call get_block on those buffers as well. On machines where the page size is larger than the blocksize, this can result in mmaped files incorrectly growing up to a block boundary during writepage. 
The fix is to avoid calling get_block for any blocks that are entirely past eof Signed-off-by: Chris Mason Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/reiserfs/inode.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index ac57305b1afc..52f1e2136546 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -2340,6 +2340,7 @@ static int reiserfs_write_full_page(struct page *page, unsigned long end_index = inode->i_size >> PAGE_CACHE_SHIFT; int error = 0; unsigned long block; + sector_t last_block; struct buffer_head *head, *bh; int partial = 0; int nr = 0; @@ -2387,10 +2388,19 @@ static int reiserfs_write_full_page(struct page *page, } bh = head; block = page->index << (PAGE_CACHE_SHIFT - s->s_blocksize_bits); + last_block = (i_size_read(inode) - 1) >> inode->i_blkbits; /* first map all the buffers, logging any direct items we find */ do { - if ((checked || buffer_dirty(bh)) && (!buffer_mapped(bh) || - (buffer_mapped(bh) + if (block > last_block) { + /* + * This can happen when the block size is less than + * the page size. The corresponding bytes in the page + * were zero filled above + */ + clear_buffer_dirty(bh); + set_buffer_uptodate(bh); + } else if ((checked || buffer_dirty(bh)) && + (!buffer_mapped(bh) || (buffer_mapped(bh) && bh->b_blocknr == 0))) { /* not mapped yet, or it points to a direct item, search -- cgit v1.2.3 From 225add619624b4877941470f31d297e0151b21be Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Sat, 5 Aug 2006 12:15:17 -0700 Subject: [PATCH] udf: initialize parts of inode earlier in create I saw an oops down this path when trying to create a new file on a UDF filesystem which was internally marked as readonly, but mounted rw: udf_create udf_new_inode new_inode alloc_inode udf_alloc_inode udf_new_block returns EIO due to readonlyness iput (on error) udf_put_inode udf_discard_prealloc udf_next_aext udf_current_aext udf_get_fileshortad OOPS the udf_discard_prealloc() path was examining uninitialized fields of the udf inode. udf_discard_prealloc() already has this code to short-circuit the discard path if no extents are preallocated: if (UDF_I_ALLOCTYPE(inode) == ICBTAG_FLAG_AD_IN_ICB || inode->i_size == UDF_I_LENEXTENTS(inode)) { return; } so if we initialize UDF_I_LENEXTENTS(inode) = 0 earlier in udf_new_inode, we won't try to free the (not) preallocated blocks, since this will match the i_size = 0 set when the inode was initialized. 
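A condensed sketch of the reordering (macro names as in the patch; unrelated setup, locking and error labels elided): everything the iput() error path may inspect is initialized before the first call that can fail.

	/* Sketch only, assuming the usual udf_new_inode() locals (sb, dir, start, err). */
	inode = new_inode(sb);
	if (!inode)
		goto out;

	/* Set every field udf_discard_prealloc() may read *before* the first
	 * operation whose failure path does iput(inode). */
	UDF_I_UNIQUE(inode) = 0;
	UDF_I_LENEXTENTS(inode) = 0;
	UDF_I_NEXT_ALLOC_BLOCK(inode) = 0;
	UDF_I_NEXT_ALLOC_GOAL(inode) = 0;
	UDF_I_STRAT4096(inode) = 0;

	block = udf_new_block(dir->i_sb, NULL,
			      UDF_I_LOCATION(dir).partitionReferenceNum, start, err);
	if (*err) {
		iput(inode);	/* now matches i_size == UDF_I_LENEXTENTS(inode) == 0 */
		goto out;
	}
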
Signed-off-by: Eric Sandeen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/udf/ialloc.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/udf/ialloc.c b/fs/udf/ialloc.c index 3873c672cb4c..33323473e3c4 100644 --- a/fs/udf/ialloc.c +++ b/fs/udf/ialloc.c @@ -75,6 +75,12 @@ struct inode * udf_new_inode (struct inode *dir, int mode, int * err) } *err = -ENOSPC; + UDF_I_UNIQUE(inode) = 0; + UDF_I_LENEXTENTS(inode) = 0; + UDF_I_NEXT_ALLOC_BLOCK(inode) = 0; + UDF_I_NEXT_ALLOC_GOAL(inode) = 0; + UDF_I_STRAT4096(inode) = 0; + block = udf_new_block(dir->i_sb, NULL, UDF_I_LOCATION(dir).partitionReferenceNum, start, err); if (*err) @@ -84,11 +90,6 @@ struct inode * udf_new_inode (struct inode *dir, int mode, int * err) } mutex_lock(&sbi->s_alloc_mutex); - UDF_I_UNIQUE(inode) = 0; - UDF_I_LENEXTENTS(inode) = 0; - UDF_I_NEXT_ALLOC_BLOCK(inode) = 0; - UDF_I_NEXT_ALLOC_GOAL(inode) = 0; - UDF_I_STRAT4096(inode) = 0; if (UDF_SB_LVIDBH(sb)) { struct logicalVolHeaderDesc *lvhd; -- cgit v1.2.3 From 4b1af774451bbc8440719e3fe441934a337c3b63 Mon Sep 17 00:00:00 2001 From: Kurt Hackel Date: Mon, 26 Jun 2006 15:17:47 -0700 Subject: ocfs2: Fix lvb corruption Properly ignore LVB flags during a PR downconvert. This avoids an illegal lvb update. Signed-off-by: Kurt Hackel Signed-off-by: Mark Fasheh --- fs/ocfs2/dlm/dlmunlock.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'fs') diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c index b0c3134f4f70..4ce3f82fb736 100644 --- a/fs/ocfs2/dlm/dlmunlock.c +++ b/fs/ocfs2/dlm/dlmunlock.c @@ -483,6 +483,10 @@ int dlm_unlock_lock_handler(struct o2net_msg *msg, u32 len, void *data) /* lock was found on queue */ lksb = lock->lksb; + if (flags & (LKM_VALBLK|LKM_PUT_LVB) && + lock->ml.type != LKM_EXMODE) + flags &= ~(LKM_VALBLK|LKM_PUT_LVB); + /* unlockast only called on originating node */ if (flags & LKM_PUT_LVB) { lksb->flags |= DLM_LKSB_PUT_LVB; @@ -632,6 +636,8 @@ retry: spin_lock(&res->spinlock); is_master = (res->owner == dlm->node_num); + if (flags & LKM_VALBLK && lock->ml.type != LKM_EXMODE) + flags &= ~LKM_VALBLK; spin_unlock(&res->spinlock); if (is_master) { -- cgit v1.2.3 From a23eac99d4392b8b779305498d7614e41a0e16e9 Mon Sep 17 00:00:00 2001 From: Kurt Hackel Date: Sun, 18 Jun 2006 21:28:01 -0700 Subject: ocfs2: do not modify lksb->status in the unlock ast This can race with other ast notification, which can cause bad status values to propagate into the unlock ast. Signed-off-by: Kurt Hackel Signed-off-by: Mark Fasheh --- fs/ocfs2/dlm/dlmunlock.c | 37 ++++++++++++------------------------- 1 file changed, 12 insertions(+), 25 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c index 4ce3f82fb736..59866e471d96 100644 --- a/fs/ocfs2/dlm/dlmunlock.c +++ b/fs/ocfs2/dlm/dlmunlock.c @@ -155,7 +155,7 @@ static enum dlm_status dlmunlock_common(struct dlm_ctxt *dlm, else status = dlm_get_unlock_actions(dlm, res, lock, lksb, &actions); - if (status != DLM_NORMAL) + if (status != DLM_NORMAL && status != DLM_CANCELGRANT) goto leave; /* By now this has been masked out of cancel requests. 
*/ @@ -183,8 +183,7 @@ static enum dlm_status dlmunlock_common(struct dlm_ctxt *dlm, spin_lock(&lock->spinlock); /* if the master told us the lock was already granted, * let the ast handle all of these actions */ - if (status == DLM_NORMAL && - lksb->status == DLM_CANCELGRANT) { + if (status == DLM_CANCELGRANT) { actions &= ~(DLM_UNLOCK_REMOVE_LOCK| DLM_UNLOCK_REGRANT_LOCK| DLM_UNLOCK_CLEAR_CONVERT_TYPE); @@ -349,14 +348,9 @@ static enum dlm_status dlm_send_remote_unlock_request(struct dlm_ctxt *dlm, vec, veclen, owner, &status); if (tmpret >= 0) { // successfully sent and received - if (status == DLM_CANCELGRANT) - ret = DLM_NORMAL; - else if (status == DLM_FORWARD) { + if (status == DLM_FORWARD) mlog(0, "master was in-progress. retry\n"); - ret = DLM_FORWARD; - } else - ret = status; - lksb->status = status; + ret = status; } else { mlog_errno(tmpret); if (dlm_is_host_down(tmpret)) { @@ -372,7 +366,6 @@ static enum dlm_status dlm_send_remote_unlock_request(struct dlm_ctxt *dlm, /* something bad. this will BUG in ocfs2 */ ret = dlm_err_to_dlm_status(tmpret); } - lksb->status = ret; } return ret; @@ -511,11 +504,8 @@ not_found: "cookie=%u:%llu\n", dlm_get_lock_cookie_node(unlock->cookie), dlm_get_lock_cookie_seq(unlock->cookie)); - else { - /* send the lksb->status back to the other node */ - status = lksb->status; + else dlm_lock_put(lock); - } leave: if (res) @@ -537,26 +527,22 @@ static enum dlm_status dlm_get_cancel_actions(struct dlm_ctxt *dlm, if (dlm_lock_on_list(&res->blocked, lock)) { /* cancel this outright */ - lksb->status = DLM_NORMAL; status = DLM_NORMAL; *actions = (DLM_UNLOCK_CALL_AST | DLM_UNLOCK_REMOVE_LOCK); } else if (dlm_lock_on_list(&res->converting, lock)) { /* cancel the request, put back on granted */ - lksb->status = DLM_NORMAL; status = DLM_NORMAL; *actions = (DLM_UNLOCK_CALL_AST | DLM_UNLOCK_REMOVE_LOCK | DLM_UNLOCK_REGRANT_LOCK | DLM_UNLOCK_CLEAR_CONVERT_TYPE); } else if (dlm_lock_on_list(&res->granted, lock)) { - /* too late, already granted. DLM_CANCELGRANT */ - lksb->status = DLM_CANCELGRANT; - status = DLM_NORMAL; + /* too late, already granted. */ + status = DLM_CANCELGRANT; *actions = DLM_UNLOCK_CALL_AST; } else { mlog(ML_ERROR, "lock to cancel is not on any list!\n"); - lksb->status = DLM_IVLOCKID; status = DLM_IVLOCKID; *actions = 0; } @@ -573,13 +559,11 @@ static enum dlm_status dlm_get_unlock_actions(struct dlm_ctxt *dlm, /* unlock request */ if (!dlm_lock_on_list(&res->granted, lock)) { - lksb->status = DLM_DENIED; status = DLM_DENIED; dlm_error(status); *actions = 0; } else { /* unlock granted lock */ - lksb->status = DLM_NORMAL; status = DLM_NORMAL; *actions = (DLM_UNLOCK_FREE_LOCK | DLM_UNLOCK_CALL_AST | @@ -671,7 +655,7 @@ retry: } if (call_ast) { - mlog(0, "calling unlockast(%p, %d)\n", data, lksb->status); + mlog(0, "calling unlockast(%p, %d)\n", data, status); if (is_master) { /* it is possible that there is one last bast * pending. 
make sure it is flushed, then @@ -683,9 +667,12 @@ retry: wait_event(dlm->ast_wq, dlm_lock_basts_flushed(dlm, lock)); } - (*unlockast)(data, lksb->status); + (*unlockast)(data, status); } + if (status == DLM_CANCELGRANT) + status = DLM_NORMAL; + if (status == DLM_NORMAL) { mlog(0, "kicking the thread\n"); dlm_kick_thread(dlm, res); -- cgit v1.2.3 From 34e3d180370c44ad3ecd3a1f9099e150d3bb103f Mon Sep 17 00:00:00 2001 From: Kurt Hackel Date: Sat, 15 Jul 2006 10:22:39 -0700 Subject: ocfs2: fix check for locally granted state during dlmunlock() If a process requests a lock cancel but the lock has been remotely granted already then there is no need to send the cancel message. Signed-off-by: Kurt Hackel Signed-off-by: Mark Fasheh --- fs/ocfs2/dlm/dlmunlock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c index 59866e471d96..37be4b2e0d4a 100644 --- a/fs/ocfs2/dlm/dlmunlock.c +++ b/fs/ocfs2/dlm/dlmunlock.c @@ -155,7 +155,7 @@ static enum dlm_status dlmunlock_common(struct dlm_ctxt *dlm, else status = dlm_get_unlock_actions(dlm, res, lock, lksb, &actions); - if (status != DLM_NORMAL && status != DLM_CANCELGRANT) + if (status != DLM_NORMAL && (status != DLM_CANCELGRANT || !master_node)) goto leave; /* By now this has been masked out of cancel requests. */ -- cgit v1.2.3 From 9acd72f4240429dfd762c9a2c7eb5c18b5d32529 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Sat, 15 Jul 2006 02:36:01 +0200 Subject: [PATCH] fs/ocfs2/dlm/dlmmaster.c: unexport dlm_migrate_lockres This patch removes the unused EXPORT_SYMBOL_GPL(dlm_migrate_lockres). Signed-off-by: Adrian Bunk Signed-off-by: Mark Fasheh --- fs/ocfs2/dlm/dlmmaster.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index 1b8346dd0572..9503240ef0e5 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -2375,7 +2375,6 @@ leave: mlog(0, "returning %d\n", ret); return ret; } -EXPORT_SYMBOL_GPL(dlm_migrate_lockres); int dlm_lock_basts_flushed(struct dlm_ctxt *dlm, struct dlm_lock *lock) { -- cgit v1.2.3 From 101ebf256de54e78e6d3277adacf656e125a2c5a Mon Sep 17 00:00:00 2001 From: Mark Fasheh Date: Tue, 2 May 2006 17:54:45 -0700 Subject: ocfs2: limit cluster bitmap information saved at mount We were storing cluster count on the ocfs2_super structure, but never actually using it so remove that. Also, we don't want to populate the uptodate cache with the unlocked block read - it is technically safe as is, but we should change it for correctness. Signed-off-by: Mark Fasheh --- fs/ocfs2/ocfs2.h | 1 - fs/ocfs2/super.c | 8 ++++++-- 2 files changed, 6 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index cd4a6f253d13..d52100d49b6c 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -197,7 +197,6 @@ struct ocfs2_super struct ocfs2_node_map recovery_map; struct ocfs2_node_map umount_map; - u32 num_clusters; u64 root_blkno; u64 system_dir_blkno; u64 bitmap_blkno; diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 382706a67ffd..d17e33e66a1e 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -1442,8 +1442,13 @@ static int ocfs2_initialize_super(struct super_block *sb, osb->bitmap_blkno = OCFS2_I(inode)->ip_blkno; + /* We don't have a cluster lock on the bitmap here because + * we're only interested in static information and the extra + * complexity at mount time isn't worht it. 
Don't pass the + * inode in to the read function though as we don't want it to + * be put in the cache. */ status = ocfs2_read_block(osb, osb->bitmap_blkno, &bitmap_bh, 0, - inode); + NULL); iput(inode); if (status < 0) { mlog_errno(status); @@ -1452,7 +1457,6 @@ static int ocfs2_initialize_super(struct super_block *sb, di = (struct ocfs2_dinode *) bitmap_bh->b_data; osb->bitmap_cpg = le16_to_cpu(di->id2.i_chain.cl_cpg); - osb->num_clusters = le32_to_cpu(di->id1.bitmap1.i_total); brelse(bitmap_bh); mlog(0, "cluster bitmap inode: %llu, clusters per group: %u\n", (unsigned long long)osb->bitmap_blkno, osb->bitmap_cpg); -- cgit v1.2.3 From 7bf72edee614e10b8d470c40a326f47bfdd69992 Mon Sep 17 00:00:00 2001 From: Mark Fasheh Date: Wed, 3 May 2006 17:46:50 -0700 Subject: ocfs2: better group descriptor consistency checks Try to catch corrupted group descriptors with some stronger checks placed in a couple of strategic locations. Detect a failed resizefs and refuse to allocate past what bitmap i_clusters allows. Signed-off-by: Mark Fasheh --- fs/ocfs2/suballoc.c | 114 +++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 94 insertions(+), 20 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index 195523090c87..7ac68cac041d 100644 --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c @@ -85,11 +85,6 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb, u64 *bg_blkno); static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh, int nr); -static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb, - struct buffer_head *bg_bh, - unsigned int bits_wanted, - u16 *bit_off, - u16 *bits_found); static inline int ocfs2_block_group_set_bits(struct ocfs2_journal_handle *handle, struct inode *alloc_inode, struct ocfs2_group_desc *bg, @@ -143,6 +138,64 @@ static u32 ocfs2_bits_per_group(struct ocfs2_chain_list *cl) return (u32)le16_to_cpu(cl->cl_cpg) * (u32)le16_to_cpu(cl->cl_bpc); } +/* somewhat more expensive than our other checks, so use sparingly. 
*/ +static int ocfs2_check_group_descriptor(struct super_block *sb, + struct ocfs2_dinode *di, + struct ocfs2_group_desc *gd) +{ + unsigned int max_bits; + + if (!OCFS2_IS_VALID_GROUP_DESC(gd)) { + OCFS2_RO_ON_INVALID_GROUP_DESC(sb, gd); + return -EIO; + } + + if (di->i_blkno != gd->bg_parent_dinode) { + ocfs2_error(sb, "Group descriptor # %llu has bad parent " + "pointer (%llu, expected %llu)", + (unsigned long long)le64_to_cpu(gd->bg_blkno), + (unsigned long long)le64_to_cpu(gd->bg_parent_dinode), + (unsigned long long)le64_to_cpu(di->i_blkno)); + return -EIO; + } + + max_bits = le16_to_cpu(di->id2.i_chain.cl_cpg) * le16_to_cpu(di->id2.i_chain.cl_bpc); + if (le16_to_cpu(gd->bg_bits) > max_bits) { + ocfs2_error(sb, "Group descriptor # %llu has bit count of %u", + (unsigned long long)le64_to_cpu(gd->bg_blkno), + le16_to_cpu(gd->bg_bits)); + return -EIO; + } + + if (le16_to_cpu(gd->bg_chain) >= + le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) { + ocfs2_error(sb, "Group descriptor # %llu has bad chain %u", + (unsigned long long)le64_to_cpu(gd->bg_blkno), + le16_to_cpu(gd->bg_chain)); + return -EIO; + } + + if (le16_to_cpu(gd->bg_free_bits_count) > le16_to_cpu(gd->bg_bits)) { + ocfs2_error(sb, "Group descriptor # %llu has bit count %u but " + "claims that %u are free", + (unsigned long long)le64_to_cpu(gd->bg_blkno), + le16_to_cpu(gd->bg_bits), + le16_to_cpu(gd->bg_free_bits_count)); + return -EIO; + } + + if (le16_to_cpu(gd->bg_bits) > (8 * le16_to_cpu(gd->bg_size))) { + ocfs2_error(sb, "Group descriptor # %llu has bit count %u but " + "max bitmap bits of %u", + (unsigned long long)le64_to_cpu(gd->bg_blkno), + le16_to_cpu(gd->bg_bits), + 8 * le16_to_cpu(gd->bg_size)); + return -EIO; + } + + return 0; +} + static int ocfs2_block_group_fill(struct ocfs2_journal_handle *handle, struct inode *alloc_inode, struct buffer_head *bg_bh, @@ -663,6 +716,7 @@ static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh, static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb, struct buffer_head *bg_bh, unsigned int bits_wanted, + unsigned int total_bits, u16 *bit_off, u16 *bits_found) { @@ -679,10 +733,8 @@ static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb, found = start = best_offset = best_size = 0; bitmap = bg->bg_bitmap; - while((offset = ocfs2_find_next_zero_bit(bitmap, - le16_to_cpu(bg->bg_bits), - start)) != -1) { - if (offset == le16_to_cpu(bg->bg_bits)) + while((offset = ocfs2_find_next_zero_bit(bitmap, total_bits, start)) != -1) { + if (offset == total_bits) break; if (!ocfs2_test_bg_bit_allocatable(bg_bh, offset)) { @@ -911,14 +963,35 @@ static int ocfs2_cluster_group_search(struct inode *inode, { int search = -ENOSPC; int ret; - struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) group_bh->b_data; + struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *) group_bh->b_data; u16 tmp_off, tmp_found; + unsigned int max_bits, gd_cluster_off; BUG_ON(!ocfs2_is_cluster_bitmap(inode)); - if (bg->bg_free_bits_count) { + if (gd->bg_free_bits_count) { + max_bits = le16_to_cpu(gd->bg_bits); + + /* Tail groups in cluster bitmaps which aren't cpg + * aligned are prone to partial extention by a failed + * fs resize. If the file system resize never got to + * update the dinode cluster count, then we don't want + * to trust any clusters past it, regardless of what + * the group descriptor says. 
*/ + gd_cluster_off = ocfs2_blocks_to_clusters(inode->i_sb, + le64_to_cpu(gd->bg_blkno)); + if ((gd_cluster_off + max_bits) > + OCFS2_I(inode)->ip_clusters) { + max_bits = OCFS2_I(inode)->ip_clusters - gd_cluster_off; + mlog(0, "Desc %llu, bg_bits %u, clusters %u, use %u\n", + (unsigned long long)le64_to_cpu(gd->bg_blkno), + le16_to_cpu(gd->bg_bits), + OCFS2_I(inode)->ip_clusters, max_bits); + } + ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb), group_bh, bits_wanted, + max_bits, &tmp_off, &tmp_found); if (ret) return ret; @@ -951,6 +1024,7 @@ static int ocfs2_block_group_search(struct inode *inode, if (bg->bg_free_bits_count) ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb), group_bh, bits_wanted, + le16_to_cpu(bg->bg_bits), bit_off, bits_found); return ret; @@ -988,9 +1062,9 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac, goto bail; } bg = (struct ocfs2_group_desc *) group_bh->b_data; - if (!OCFS2_IS_VALID_GROUP_DESC(bg)) { - OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, bg); - status = -EIO; + status = ocfs2_check_group_descriptor(alloc_inode->i_sb, fe, bg); + if (status) { + mlog_errno(status); goto bail; } @@ -1018,9 +1092,9 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac, goto bail; } bg = (struct ocfs2_group_desc *) group_bh->b_data; - if (!OCFS2_IS_VALID_GROUP_DESC(bg)) { - OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, bg); - status = -EIO; + status = ocfs2_check_group_descriptor(alloc_inode->i_sb, fe, bg); + if (status) { + mlog_errno(status); goto bail; } } @@ -1494,9 +1568,9 @@ static int ocfs2_free_suballoc_bits(struct ocfs2_journal_handle *handle, } group = (struct ocfs2_group_desc *) group_bh->b_data; - if (!OCFS2_IS_VALID_GROUP_DESC(group)) { - OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, group); - status = -EIO; + status = ocfs2_check_group_descriptor(alloc_inode->i_sb, fe, group); + if (status) { + mlog_errno(status); goto bail; } BUG_ON((count + start_bit) > le16_to_cpu(group->bg_bits)); -- cgit v1.2.3 From 883d4cae4a2b01a05193cf2665c77b7489a8b6a0 Mon Sep 17 00:00:00 2001 From: Mark Fasheh Date: Mon, 5 Jun 2006 16:41:00 -0400 Subject: ocfs2: allocation hints Record the most recently used allocation group on the allocation context, so that subsequent allocations can attempt to optimize for contiguousness. Local alloc especially should benefit from this as the current chain search tends to let it spew across the disk. Signed-off-by: Mark Fasheh --- fs/ocfs2/localalloc.c | 8 +++ fs/ocfs2/ocfs2.h | 1 + fs/ocfs2/suballoc.c | 147 +++++++++++++++++++++++++++++++++++++++++++++----- fs/ocfs2/suballoc.h | 2 + 4 files changed, 145 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c index 0d1973ea32b0..1f17a4d08287 100644 --- a/fs/ocfs2/localalloc.c +++ b/fs/ocfs2/localalloc.c @@ -840,6 +840,12 @@ static int ocfs2_local_alloc_new_window(struct ocfs2_super *osb, mlog(0, "Allocating %u clusters for a new window.\n", ocfs2_local_alloc_window_bits(osb)); + + /* Instruct the allocation code to try the most recently used + * cluster group. We'll re-record the group used this pass + * below. */ + ac->ac_last_group = osb->la_last_gd; + /* we used the generic suballoc reserve function, but we set * everything up nicely, so there's no reason why we can't use * the more specific cluster api to claim bits. 
*/ @@ -852,6 +858,8 @@ static int ocfs2_local_alloc_new_window(struct ocfs2_super *osb, goto bail; } + osb->la_last_gd = ac->ac_last_group; + la->la_bm_off = cpu_to_le32(cluster_off); alloc->id1.bitmap1.i_total = cpu_to_le32(cluster_count); /* just in case... In the future when we find space ourselves, diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index d52100d49b6c..0462a7f4e21b 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -236,6 +236,7 @@ struct ocfs2_super enum ocfs2_local_alloc_state local_alloc_state; struct buffer_head *local_alloc_bh; + u64 la_last_gd; /* Next two fields are for local node slot recovery during * mount. */ diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index 7ac68cac041d..9d91e66f51a9 100644 --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c @@ -70,12 +70,6 @@ static int ocfs2_block_group_search(struct inode *inode, struct buffer_head *group_bh, u32 bits_wanted, u32 min_bits, u16 *bit_off, u16 *bits_found); -static int ocfs2_search_chain(struct ocfs2_alloc_context *ac, - u32 bits_wanted, - u32 min_bits, - u16 *bit_off, - unsigned int *num_bits, - u64 *bg_blkno); static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb, struct ocfs2_alloc_context *ac, u32 bits_wanted, @@ -1030,12 +1024,103 @@ static int ocfs2_block_group_search(struct inode *inode, return ret; } +static int ocfs2_alloc_dinode_update_counts(struct inode *inode, + struct ocfs2_journal_handle *handle, + struct buffer_head *di_bh, + u32 num_bits, + u16 chain) +{ + int ret; + u32 tmp_used; + struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data; + struct ocfs2_chain_list *cl = (struct ocfs2_chain_list *) &di->id2.i_chain; + + ret = ocfs2_journal_access(handle, inode, di_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + tmp_used = le32_to_cpu(di->id1.bitmap1.i_used); + di->id1.bitmap1.i_used = cpu_to_le32(num_bits + tmp_used); + le32_add_cpu(&cl->cl_recs[chain].c_free, -num_bits); + + ret = ocfs2_journal_dirty(handle, di_bh); + if (ret < 0) + mlog_errno(ret); + +out: + return ret; +} + +static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac, + u32 bits_wanted, + u32 min_bits, + u16 *bit_off, + unsigned int *num_bits, + u64 gd_blkno, + u16 *bits_left) +{ + int ret; + u16 found; + struct buffer_head *group_bh = NULL; + struct ocfs2_group_desc *gd; + struct inode *alloc_inode = ac->ac_inode; + struct ocfs2_journal_handle *handle = ac->ac_handle; + + ret = ocfs2_read_block(OCFS2_SB(alloc_inode->i_sb), gd_blkno, + &group_bh, OCFS2_BH_CACHED, alloc_inode); + if (ret < 0) { + mlog_errno(ret); + return ret; + } + + gd = (struct ocfs2_group_desc *) group_bh->b_data; + if (!OCFS2_IS_VALID_GROUP_DESC(gd)) { + OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, gd); + ret = -EIO; + goto out; + } + + ret = ac->ac_group_search(alloc_inode, group_bh, bits_wanted, min_bits, + bit_off, &found); + if (ret < 0) { + if (ret != -ENOSPC) + mlog_errno(ret); + goto out; + } + + *num_bits = found; + + ret = ocfs2_alloc_dinode_update_counts(alloc_inode, handle, ac->ac_bh, + *num_bits, + le16_to_cpu(gd->bg_chain)); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_block_group_set_bits(handle, alloc_inode, gd, group_bh, + *bit_off, *num_bits); + if (ret < 0) + mlog_errno(ret); + + *bits_left = le16_to_cpu(gd->bg_free_bits_count); + +out: + brelse(group_bh); + + return ret; +} + static int ocfs2_search_chain(struct ocfs2_alloc_context *ac, u32 bits_wanted, u32 min_bits, u16 *bit_off, unsigned int *num_bits, - u64 *bg_blkno) + u64 
*bg_blkno, + u16 *bits_left) { int status; u16 chain, tmp_bits; @@ -1173,6 +1258,7 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac, (unsigned long long)fe->i_blkno); *bg_blkno = le64_to_cpu(bg->bg_blkno); + *bits_left = le16_to_cpu(bg->bg_free_bits_count); bail: if (group_bh) brelse(group_bh); @@ -1194,6 +1280,8 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb, { int status; u16 victim, i; + u16 bits_left = 0; + u64 hint_blkno = ac->ac_last_group; struct ocfs2_chain_list *cl; struct ocfs2_dinode *fe; @@ -1220,6 +1308,28 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb, goto bail; } + if (hint_blkno) { + /* Attempt to short-circuit the usual search mechanism + * by jumping straight to the most recently used + * allocation group. This helps us mantain some + * contiguousness across allocations. */ + status = ocfs2_search_one_group(ac, bits_wanted, min_bits, + bit_off, num_bits, + hint_blkno, &bits_left); + if (!status) { + /* Be careful to update *bg_blkno here as the + * caller is expecting it to be filled in, and + * ocfs2_search_one_group() won't do that for + * us. */ + *bg_blkno = hint_blkno; + goto set_hint; + } + if (status < 0 && status != -ENOSPC) { + mlog_errno(status); + goto bail; + } + } + cl = (struct ocfs2_chain_list *) &fe->id2.i_chain; victim = ocfs2_find_victim_chain(cl); @@ -1227,9 +1337,9 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb, ac->ac_allow_chain_relink = 1; status = ocfs2_search_chain(ac, bits_wanted, min_bits, bit_off, - num_bits, bg_blkno); + num_bits, bg_blkno, &bits_left); if (!status) - goto bail; + goto set_hint; if (status < 0 && status != -ENOSPC) { mlog_errno(status); goto bail; @@ -1251,8 +1361,8 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb, ac->ac_chain = i; status = ocfs2_search_chain(ac, bits_wanted, min_bits, - bit_off, num_bits, - bg_blkno); + bit_off, num_bits, bg_blkno, + &bits_left); if (!status) break; if (status < 0 && status != -ENOSPC) { @@ -1260,8 +1370,19 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb, goto bail; } } -bail: +set_hint: + if (status != -ENOSPC) { + /* If the next search of this group is not likely to + * yield a suitable extent, then we reset the last + * group hint so as to not waste a disk read */ + if (bits_left < min_bits) + ac->ac_last_group = 0; + else + ac->ac_last_group = *bg_blkno; + } + +bail: mlog_exit(status); return status; } @@ -1415,7 +1536,7 @@ int ocfs2_claim_clusters(struct ocfs2_super *osb, { int status; unsigned int bits_wanted = ac->ac_bits_wanted - ac->ac_bits_given; - u64 bg_blkno; + u64 bg_blkno = 0; u16 bg_bit_off; mlog_entry_void(); diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h index a76c82a7ceac..c787838d1052 100644 --- a/fs/ocfs2/suballoc.h +++ b/fs/ocfs2/suballoc.h @@ -49,6 +49,8 @@ struct ocfs2_alloc_context { u16 ac_chain; int ac_allow_chain_relink; group_search_t *ac_group_search; + + u64 ac_last_group; }; void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac); -- cgit v1.2.3 From 0e1edbd99994270023cea5afe593f972eb09a778 Mon Sep 17 00:00:00 2001 From: Nathan Scott Date: Thu, 10 Aug 2006 14:40:41 +1000 Subject: [XFS] Fix xfs_free_extent related NULL pointer dereference. We recently fixed an out-of-space deadlock in XFS, and part of that fix involved the addition of the XFS_ALLOC_FLAG_FREEING flag to some of the space allocator calls to indicate they're freeing space, not allocating it. There was a missed xfs_alloc_fix_freelist condition test that did not correctly test "flags". 
The same test would also test an uninitialised structure field (args->userdata) and depending on its value either would or would not return early with a critical buffer pointer set to NULL. This fixes that up, adds asserts to several places to catch future botches of this nature, and skips sections of xfs_alloc_fix_freelist that are irrelevent for the space-freeing case. SGI-PV: 955303 SGI-Modid: xfs-linux-melb:xfs-kern:26743a Signed-off-by: Nathan Scott --- fs/xfs/xfs_alloc.c | 103 ++++++++++++++++++++++++++++------------------------- 1 file changed, 54 insertions(+), 49 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c index eef6763f3a67..d2bbcd882a69 100644 --- a/fs/xfs/xfs_alloc.c +++ b/fs/xfs/xfs_alloc.c @@ -1835,40 +1835,47 @@ xfs_alloc_fix_freelist( &agbp))) return error; if (!pag->pagf_init) { + ASSERT(flags & XFS_ALLOC_FLAG_TRYLOCK); + ASSERT(!(flags & XFS_ALLOC_FLAG_FREEING)); args->agbp = NULL; return 0; } } else agbp = NULL; - /* If this is a metadata preferred pag and we are user data + /* + * If this is a metadata preferred pag and we are user data * then try somewhere else if we are not being asked to * try harder at this point */ - if (pag->pagf_metadata && args->userdata && flags) { + if (pag->pagf_metadata && args->userdata && + (flags & XFS_ALLOC_FLAG_TRYLOCK)) { + ASSERT(!(flags & XFS_ALLOC_FLAG_FREEING)); args->agbp = NULL; return 0; } - need = XFS_MIN_FREELIST_PAG(pag, mp); - delta = need > pag->pagf_flcount ? need - pag->pagf_flcount : 0; - /* - * If it looks like there isn't a long enough extent, or enough - * total blocks, reject it. - */ - longest = (pag->pagf_longest > delta) ? - (pag->pagf_longest - delta) : - (pag->pagf_flcount > 0 || pag->pagf_longest > 0); - if (args->minlen + args->alignment + args->minalignslop - 1 > longest || - (!(flags & XFS_ALLOC_FLAG_FREEING) && - (int)(pag->pagf_freeblks + pag->pagf_flcount - - need - args->total) < - (int)args->minleft)) { - if (agbp) - xfs_trans_brelse(tp, agbp); - args->agbp = NULL; - return 0; + if (!(flags & XFS_ALLOC_FLAG_FREEING)) { + need = XFS_MIN_FREELIST_PAG(pag, mp); + delta = need > pag->pagf_flcount ? need - pag->pagf_flcount : 0; + /* + * If it looks like there isn't a long enough extent, or enough + * total blocks, reject it. + */ + longest = (pag->pagf_longest > delta) ? + (pag->pagf_longest - delta) : + (pag->pagf_flcount > 0 || pag->pagf_longest > 0); + if ((args->minlen + args->alignment + args->minalignslop - 1) > + longest || + ((int)(pag->pagf_freeblks + pag->pagf_flcount - + need - args->total) < (int)args->minleft)) { + if (agbp) + xfs_trans_brelse(tp, agbp); + args->agbp = NULL; + return 0; + } } + /* * Get the a.g. freespace buffer. * Can fail if we're not blocking on locks, and it's held. @@ -1878,6 +1885,8 @@ xfs_alloc_fix_freelist( &agbp))) return error; if (agbp == NULL) { + ASSERT(flags & XFS_ALLOC_FLAG_TRYLOCK); + ASSERT(!(flags & XFS_ALLOC_FLAG_FREEING)); args->agbp = NULL; return 0; } @@ -1887,22 +1896,24 @@ xfs_alloc_fix_freelist( */ agf = XFS_BUF_TO_AGF(agbp); need = XFS_MIN_FREELIST(agf, mp); - delta = need > be32_to_cpu(agf->agf_flcount) ? - (need - be32_to_cpu(agf->agf_flcount)) : 0; /* * If there isn't enough total or single-extent, reject it. */ - longest = be32_to_cpu(agf->agf_longest); - longest = (longest > delta) ? 
(longest - delta) : - (be32_to_cpu(agf->agf_flcount) > 0 || longest > 0); - if (args->minlen + args->alignment + args->minalignslop - 1 > longest || - (!(flags & XFS_ALLOC_FLAG_FREEING) && - (int)(be32_to_cpu(agf->agf_freeblks) + - be32_to_cpu(agf->agf_flcount) - need - args->total) < - (int)args->minleft)) { - xfs_trans_brelse(tp, agbp); - args->agbp = NULL; - return 0; + if (!(flags & XFS_ALLOC_FLAG_FREEING)) { + delta = need > be32_to_cpu(agf->agf_flcount) ? + (need - be32_to_cpu(agf->agf_flcount)) : 0; + longest = be32_to_cpu(agf->agf_longest); + longest = (longest > delta) ? (longest - delta) : + (be32_to_cpu(agf->agf_flcount) > 0 || longest > 0); + if ((args->minlen + args->alignment + args->minalignslop - 1) > + longest || + ((int)(be32_to_cpu(agf->agf_freeblks) + + be32_to_cpu(agf->agf_flcount) - need - args->total) < + (int)args->minleft)) { + xfs_trans_brelse(tp, agbp); + args->agbp = NULL; + return 0; + } } /* * Make the freelist shorter if it's too long. @@ -1950,12 +1961,11 @@ xfs_alloc_fix_freelist( * on a completely full ag. */ if (targs.agbno == NULLAGBLOCK) { - if (!(flags & XFS_ALLOC_FLAG_FREEING)) { - xfs_trans_brelse(tp, agflbp); - args->agbp = NULL; - return 0; - } - break; + if (flags & XFS_ALLOC_FLAG_FREEING) + break; + xfs_trans_brelse(tp, agflbp); + args->agbp = NULL; + return 0; } /* * Put each allocated block on the list. @@ -2442,31 +2452,26 @@ xfs_free_extent( xfs_fsblock_t bno, /* starting block number of extent */ xfs_extlen_t len) /* length of extent */ { -#ifdef DEBUG - xfs_agf_t *agf; /* a.g. freespace header */ -#endif - xfs_alloc_arg_t args; /* allocation argument structure */ + xfs_alloc_arg_t args; int error; ASSERT(len != 0); + memset(&args, 0, sizeof(xfs_alloc_arg_t)); args.tp = tp; args.mp = tp->t_mountp; args.agno = XFS_FSB_TO_AGNO(args.mp, bno); ASSERT(args.agno < args.mp->m_sb.sb_agcount); args.agbno = XFS_FSB_TO_AGBNO(args.mp, bno); - args.alignment = 1; - args.minlen = args.minleft = args.minalignslop = 0; down_read(&args.mp->m_peraglock); args.pag = &args.mp->m_perag[args.agno]; if ((error = xfs_alloc_fix_freelist(&args, XFS_ALLOC_FLAG_FREEING))) goto error0; #ifdef DEBUG ASSERT(args.agbp != NULL); - agf = XFS_BUF_TO_AGF(args.agbp); - ASSERT(args.agbno + len <= be32_to_cpu(agf->agf_length)); + ASSERT((args.agbno + len) <= + be32_to_cpu(XFS_BUF_TO_AGF(args.agbp)->agf_length)); #endif - error = xfs_free_ag_extent(tp, args.agbp, args.agno, args.agbno, - len, 0); + error = xfs_free_ag_extent(tp, args.agbp, args.agno, args.agbno, len, 0); error0: up_read(&args.mp->m_peraglock); return error; -- cgit v1.2.3 From 3a5ff61c18659443f76bad6cf06f60103046de5d Mon Sep 17 00:00:00 2001 From: Steve French Date: Fri, 14 Jul 2006 22:37:11 +0000 Subject: [CIFS] Do not time out posix brl requests when using new posix setfileinfo request and do not time out slow requests to a server that is still responding well to other threads Suggested by jra of Samba team Signed-off-by: Steve French (cherry picked from 89b57148115479eef074b8d3f86c4c86c96ac969 commit) --- fs/cifs/CHANGES | 6 ++++++ fs/cifs/cifsfs.c | 6 +----- fs/cifs/cifsfs.h | 2 +- fs/cifs/cifsglob.h | 3 ++- fs/cifs/cifssmb.c | 11 ++++++++--- fs/cifs/connect.c | 17 ++++++++++++++++- fs/cifs/file.c | 6 ++---- fs/cifs/transport.c | 16 +++++++++++++--- 8 files changed, 49 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES index a61d17ed1827..acb843b9bc3b 100644 --- a/fs/cifs/CHANGES +++ b/fs/cifs/CHANGES @@ -1,3 +1,9 @@ +Version 1.45 +------------ +Do not time out 
lockw calls when using posix extensions. Do not +time out requests if server still responding reasonably fast +on requests on other threads + Version 1.44 ------------ Rewritten sessionsetup support, including support for legacy SMB diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index c28ede599946..3cd750029be2 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -402,7 +402,6 @@ static struct quotactl_ops cifs_quotactl_ops = { }; #endif -#ifdef CONFIG_CIFS_EXPERIMENTAL static void cifs_umount_begin(struct vfsmount * vfsmnt, int flags) { struct cifs_sb_info *cifs_sb; @@ -422,7 +421,7 @@ static void cifs_umount_begin(struct vfsmount * vfsmnt, int flags) tcon->tidStatus = CifsExiting; up(&tcon->tconSem); - /* cancel_brl_requests(tcon); */ + /* cancel_brl_requests(tcon); */ /* BB mark all brl mids as exiting */ /* cancel_notify_requests(tcon); */ if(tcon->ses && tcon->ses->server) { @@ -438,7 +437,6 @@ static void cifs_umount_begin(struct vfsmount * vfsmnt, int flags) return; } -#endif static int cifs_remount(struct super_block *sb, int *flags, char *data) { @@ -457,9 +455,7 @@ struct super_operations cifs_super_ops = { unless later we add lazy close of inodes or unless the kernel forgets to call us with the same number of releases (closes) as opens */ .show_options = cifs_show_options, -#ifdef CONFIG_CIFS_EXPERIMENTAL .umount_begin = cifs_umount_begin, -#endif .remount_fs = cifs_remount, }; diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index 8f75c6f24701..39ee8ef3bdeb 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -100,5 +100,5 @@ extern ssize_t cifs_getxattr(struct dentry *, const char *, void *, size_t); extern ssize_t cifs_listxattr(struct dentry *, char *, size_t); extern int cifs_ioctl (struct inode * inode, struct file * filep, unsigned int command, unsigned long arg); -#define CIFS_VERSION "1.44" +#define CIFS_VERSION "1.45" #endif /* _CIFSFS_H */ diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 6d7cf5f3bc0b..39b43e6a7509 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -158,7 +158,8 @@ struct TCP_Server_Info { /* 16th byte of RFC1001 workstation name is always null */ char workstation_RFC1001_name[SERVER_NAME_LEN_WITH_NULL]; __u32 sequence_number; /* needed for CIFS PDU signature */ - char mac_signing_key[CIFS_SESS_KEY_SIZE + 16]; + char mac_signing_key[CIFS_SESS_KEY_SIZE + 16]; + unsigned long lstrp; /* when we got last response from this server */ }; /* diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 19678c575dfc..c03c42ee9eb9 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -1484,6 +1484,7 @@ CIFSSMBPosixLock(const int xid, struct cifsTconInfo *tcon, char *data_offset; struct cifs_posix_lock *parm_data; int rc = 0; + int timeout = 0; int bytes_returned = 0; __u16 params, param_offset, offset, byte_count, count; @@ -1503,7 +1504,6 @@ CIFSSMBPosixLock(const int xid, struct cifsTconInfo *tcon, pSMB->MaxSetupCount = 0; pSMB->Reserved = 0; pSMB->Flags = 0; - pSMB->Timeout = 0; pSMB->Reserved2 = 0; param_offset = offsetof(struct smb_com_transaction2_sfi_req, Fid) - 4; offset = param_offset + params; @@ -1529,8 +1529,13 @@ CIFSSMBPosixLock(const int xid, struct cifsTconInfo *tcon, (((char *) &pSMB->hdr.Protocol) + offset); parm_data->lock_type = cpu_to_le16(lock_type); - if(waitFlag) + if(waitFlag) { + timeout = 3; /* blocking operation, no timeout */ parm_data->lock_flags = cpu_to_le16(1); + pSMB->Timeout = cpu_to_le32(-1); + } else + pSMB->Timeout = 0; + parm_data->pid = cpu_to_le32(current->tgid); parm_data->start = 
cpu_to_le64(pLockData->fl_start); parm_data->length = cpu_to_le64(len); /* normalize negative numbers */ @@ -1542,7 +1547,7 @@ CIFSSMBPosixLock(const int xid, struct cifsTconInfo *tcon, pSMB->hdr.smb_buf_length += byte_count; pSMB->ByteCount = cpu_to_le16(byte_count); rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, - (struct smb_hdr *) pSMBr, &bytes_returned, 0); + (struct smb_hdr *) pSMBr, &bytes_returned, timeout); if (rc) { cFYI(1, ("Send error in Posix Lock = %d", rc)); } else if (get_flag) { diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 876eb9ef85fe..c4aedcf5c924 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -612,6 +612,10 @@ multi_t2_fnd: #ifdef CONFIG_CIFS_STATS2 mid_entry->when_received = jiffies; #endif + /* so we do not time out requests to server + which is still responding (since server could + be busy but not dead) */ + server->lstrp = jiffies; break; } } @@ -1969,7 +1973,18 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb, } cFYI(1,("Negotiate caps 0x%x",(int)cap)); - +#ifdef CONFIG_CIFS_DEBUG2 + if(cap & CIFS_UNIX_FCNTL_CAP) + cFYI(1,("FCNTL cap")); + if(cap & CIFS_UNIX_EXTATTR_CAP) + cFYI(1,("EXTATTR cap")); + if(cap & CIFS_UNIX_POSIX_PATHNAMES_CAP) + cFYI(1,("POSIX path cap")); + if(cap & CIFS_UNIX_XATTR_CAP) + cFYI(1,("XATTR cap")); + if(cap & CIFS_UNIX_POSIX_ACL_CAP) + cFYI(1,("POSIX ACL cap")); +#endif /* CIFS_DEBUG2 */ if (CIFSSMBSetFSUnixInfo(xid, tcon, cap)) { cFYI(1,("setting capabilities failed")); } diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 944d2b9e092d..52e2e4c8794b 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -644,8 +644,7 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock) account for negative length which we can not accept over the wire */ if (IS_GETLK(cmd)) { - if(experimEnabled && - (cifs_sb->tcon->ses->capabilities & CAP_UNIX) && + if((cifs_sb->tcon->ses->capabilities & CAP_UNIX) && (CIFS_UNIX_FCNTL_CAP & le64_to_cpu(cifs_sb->tcon->fsUnixInfo.Capability))) { int posix_lock_type; @@ -683,8 +682,7 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock) FreeXid(xid); return rc; } - if (experimEnabled && - (cifs_sb->tcon->ses->capabilities & CAP_UNIX) && + if ((cifs_sb->tcon->ses->capabilities & CAP_UNIX) && (CIFS_UNIX_FCNTL_CAP & le64_to_cpu(cifs_sb->tcon->fsUnixInfo.Capability))) { int posix_lock_type; diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index 17ba329e2b3d..95e23ca670a8 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -444,8 +444,9 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses, if(timeout != MAX_SCHEDULE_TIMEOUT) { timeout += jiffies; wait_event(ses->server->response_q, - (!(midQ->midState & MID_REQUEST_SUBMITTED)) || - time_after(jiffies, timeout) || + (!(midQ->midState & MID_REQUEST_SUBMITTED)) || + (time_after(jiffies, timeout) && + time_after(jiffies, ses->server->lstrp + HZ)) || ((ses->server->tcpStatus != CifsGood) && (ses->server->tcpStatus != CifsNew))); } else { @@ -710,9 +711,18 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses, /* No user interrupts in wait - wreaks havoc with performance */ if(timeout != MAX_SCHEDULE_TIMEOUT) { timeout += jiffies; + /* although we prefer not to time out if the server is still + responding - we will time out if the server takes + more than 15 (or 45 or 180) seconds to respond to this request + and has not responded to any request from other threads + on this client within a second (note that it is not worth + grabbing the GlobalMid_Lock and slowing 
things down in this + wait event to more accurately check the lstrsp field on some + arch since we are already in an error path that will retry */ wait_event(ses->server->response_q, (!(midQ->midState & MID_REQUEST_SUBMITTED)) || - time_after(jiffies, timeout) || + (time_after(jiffies, timeout) && + time_after(jiffies, ses->server->lstrp + HZ)) || ((ses->server->tcpStatus != CifsGood) && (ses->server->tcpStatus != CifsNew))); } else { -- cgit v1.2.3 From 14a441a2b4ee1dfc00ec822d91d9fb20f401c62f Mon Sep 17 00:00:00 2001 From: Steve French Date: Sun, 16 Jul 2006 04:32:51 +0000 Subject: [CIFS] spinlock protect read of last srv response time in timeout path Signed-off-by: Jeremy Allison Signed-off-by: Steve French (cherry picked from b33a3f55e54fd210fc043eafcf83728b03bc9e02 commit) --- fs/cifs/transport.c | 99 ++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 76 insertions(+), 23 deletions(-) (limited to 'fs') diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index 95e23ca670a8..d16e6032d4cf 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -3,7 +3,8 @@ * * Copyright (C) International Business Machines Corp., 2002,2005 * Author(s): Steve French (sfrench@us.ibm.com) - * + * Jeremy Allison (jra@samba.org) 2006. + * * This library is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published * by the Free Software Foundation; either version 2.1 of the License, or @@ -442,13 +443,46 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses, /* No user interrupts in wait - wreaks havoc with performance */ if(timeout != MAX_SCHEDULE_TIMEOUT) { - timeout += jiffies; - wait_event(ses->server->response_q, - (!(midQ->midState & MID_REQUEST_SUBMITTED)) || - (time_after(jiffies, timeout) && - time_after(jiffies, ses->server->lstrp + HZ)) || - ((ses->server->tcpStatus != CifsGood) && - (ses->server->tcpStatus != CifsNew))); + unsigned long curr_timeout; + + for (;;) { + curr_timeout = timeout + jiffies; + wait_event(ses->server->response_q, + (!(midQ->midState & MID_REQUEST_SUBMITTED)) || + time_after(jiffies, curr_timeout) || + ((ses->server->tcpStatus != CifsGood) && + (ses->server->tcpStatus != CifsNew))); + + if (time_after(jiffies, curr_timeout) && + (midQ->midState & MID_REQUEST_SUBMITTED) && + ((ses->server->tcpStatus == CifsGood) || + (ses->server->tcpStatus == CifsNew))) { + + unsigned long lrt; + + /* We timed out. Is the server still + sending replies ? */ + spin_lock(&GlobalMid_Lock); + lrt = ses->server->lstrp; + spin_unlock(&GlobalMid_Lock); + + /* Calculate 10 seconds past last receive time. + Although we prefer not to time out if the + server is still responding - we will time + out if the server takes more than 15 (or 45 + or 180) seconds to respond to this request + and has not responded to any request from + other threads on the client within 10 seconds */ + lrt += (10 * HZ); + if (time_after(jiffies, lrt)) { + /* No replies for 10 seconds. 
*/ + cERROR(1,("server not responding")); + break; + } + } else { + break; + } + } } else { wait_event(ses->server->response_q, (!(midQ->midState & MID_REQUEST_SUBMITTED)) || @@ -710,21 +744,40 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses, /* No user interrupts in wait - wreaks havoc with performance */ if(timeout != MAX_SCHEDULE_TIMEOUT) { - timeout += jiffies; - /* although we prefer not to time out if the server is still - responding - we will time out if the server takes - more than 15 (or 45 or 180) seconds to respond to this request - and has not responded to any request from other threads - on this client within a second (note that it is not worth - grabbing the GlobalMid_Lock and slowing things down in this - wait event to more accurately check the lstrsp field on some - arch since we are already in an error path that will retry */ - wait_event(ses->server->response_q, - (!(midQ->midState & MID_REQUEST_SUBMITTED)) || - (time_after(jiffies, timeout) && - time_after(jiffies, ses->server->lstrp + HZ)) || - ((ses->server->tcpStatus != CifsGood) && - (ses->server->tcpStatus != CifsNew))); + unsigned long curr_timeout; + + for (;;) { + curr_timeout = timeout + jiffies; + wait_event(ses->server->response_q, + (!(midQ->midState & MID_REQUEST_SUBMITTED)) || + time_after(jiffies, curr_timeout) || + ((ses->server->tcpStatus != CifsGood) && + (ses->server->tcpStatus != CifsNew))); + + if (time_after(jiffies, curr_timeout) && + (midQ->midState & MID_REQUEST_SUBMITTED) && + ((ses->server->tcpStatus == CifsGood) || + (ses->server->tcpStatus == CifsNew))) { + + unsigned long lrt; + + /* We timed out. Is the server still + sending replies ? */ + spin_lock(&GlobalMid_Lock); + lrt = ses->server->lstrp; + spin_unlock(&GlobalMid_Lock); + + /* Calculate 10 seconds past last receive time*/ + lrt += (10 * HZ); + if (time_after(jiffies, lrt)) { + /* Server sent no reply in 10 seconds */ + cERROR(1,("Server not responding")); + break; + } + } else { + break; + } + } } else { wait_event(ses->server->response_q, (!(midQ->midState & MID_REQUEST_SUBMITTED)) || -- cgit v1.2.3 From 5da07b0208066fd3544ecf2b521fc7a2e0228ad5 Mon Sep 17 00:00:00 2001 From: Steve French Date: Sun, 16 Jul 2006 21:33:15 +0000 Subject: [CIFS] Make midState usage more consistent Although harmless, we were sometimes treating midState like it contained flags but they are exclusive states, and this makes that more clear. 
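The rule both wait loops above implement reduces to a single decision, sketched here in plain C with invented names: a request that has outlived its own deadline is only abandoned when the server has also been silent towards every thread for the whole grace period.

#include <stdbool.h>
#include <time.h>

bool should_give_up(time_t now, time_t request_deadline,
		    time_t server_last_response, int grace_seconds)
{
	if (now <= request_deadline)
		return false;	/* our own timeout has not expired yet */
	if (now <= server_last_response + grace_seconds)
		return false;	/* server answered someone recently - keep waiting */
	return true;		/* deadline passed and the server has gone quiet */
}

In the patch the per-server timestamp (lstrp) is sampled under GlobalMid_Lock because the demultiplex thread updates it whenever any reply arrives.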
Signed-off-by: Jeremy Allison Signed-off-by: Steve French (cherry picked from 586c057c3a68dd6ae0f3ba94fbf76798b1558074 commit) --- fs/cifs/transport.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index d16e6032d4cf..748b0df07a73 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -448,13 +448,13 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses, for (;;) { curr_timeout = timeout + jiffies; wait_event(ses->server->response_q, - (!(midQ->midState & MID_REQUEST_SUBMITTED)) || + (!(midQ->midState == MID_REQUEST_SUBMITTED)) || time_after(jiffies, curr_timeout) || ((ses->server->tcpStatus != CifsGood) && (ses->server->tcpStatus != CifsNew))); if (time_after(jiffies, curr_timeout) && - (midQ->midState & MID_REQUEST_SUBMITTED) && + (midQ->midState == MID_REQUEST_SUBMITTED) && ((ses->server->tcpStatus == CifsGood) || (ses->server->tcpStatus == CifsNew))) { @@ -485,7 +485,7 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses, } } else { wait_event(ses->server->response_q, - (!(midQ->midState & MID_REQUEST_SUBMITTED)) || + (!(midQ->midState == MID_REQUEST_SUBMITTED)) || ((ses->server->tcpStatus != CifsGood) && (ses->server->tcpStatus != CifsNew))); } @@ -749,13 +749,13 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses, for (;;) { curr_timeout = timeout + jiffies; wait_event(ses->server->response_q, - (!(midQ->midState & MID_REQUEST_SUBMITTED)) || + (!(midQ->midState == MID_REQUEST_SUBMITTED)) || time_after(jiffies, curr_timeout) || ((ses->server->tcpStatus != CifsGood) && (ses->server->tcpStatus != CifsNew))); if (time_after(jiffies, curr_timeout) && - (midQ->midState & MID_REQUEST_SUBMITTED) && + (midQ->midState == MID_REQUEST_SUBMITTED) && ((ses->server->tcpStatus == CifsGood) || (ses->server->tcpStatus == CifsNew))) { @@ -780,7 +780,7 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses, } } else { wait_event(ses->server->response_q, - (!(midQ->midState & MID_REQUEST_SUBMITTED)) || + (!(midQ->midState == MID_REQUEST_SUBMITTED)) || ((ses->server->tcpStatus != CifsGood) && (ses->server->tcpStatus != CifsNew))); } -- cgit v1.2.3 From 6c3d8909d85b2c18fd7a6e64f0ca757a257b40fa Mon Sep 17 00:00:00 2001 From: Steve French Date: Mon, 31 Jul 2006 22:46:20 +0000 Subject: [CIFS] Allow cifsd to suspend if connection is lost Make cifsd allow us to suspend if it has lost the connection with a server Ref: http://bugzilla.kernel.org/show_bug.cgi?id=6811 Signed-off-by: Rafael J. Wysocki Acked-by: Pavel Machek Signed-off-by: Steve French (cherry picked from 27bd6cd87b0ada66515ad49bc346d77d1e9d3e05 commit) --- fs/cifs/connect.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index c4aedcf5c924..b706b4f48b10 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -182,6 +182,7 @@ cifs_reconnect(struct TCP_Server_Info *server) while ((server->tcpStatus != CifsExiting) && (server->tcpStatus != CifsGood)) { + try_to_freeze(); if(server->protocolType == IPV6) { rc = ipv6_connect(&server->addr.sockAddr6,&server->ssocket); } else { -- cgit v1.2.3 From 7ee1af765dfa3146aef958258003245e082284e5 Mon Sep 17 00:00:00 2001 From: Jeremy Allison Date: Wed, 2 Aug 2006 21:56:33 +0000 Subject: [CIFS] Allow Windows blocking locks to be cancelled via a CANCEL_LOCK call. TODO - restrict this to servers that support NT_STATUS codes (Win9x will probably not support this call). 
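Before the diff below, the overall control flow of that cancellation path can be summarised with a small decision function; all identifiers here are invented and this is not the CIFS code itself.

enum lock_kind  { POSIX_LOCK, WINDOWS_LOCK };
enum lock_event { REPLY_ARRIVED, SIGNAL_ARRIVED };
enum next_step  { RETURN_REPLY, SEND_POSIX_CANCEL, SEND_WINDOWS_CANCEL };

/* What the transport layer does next for a pending blocking lock. */
enum next_step on_blocking_lock_event(enum lock_event ev, enum lock_kind kind)
{
	if (ev == REPLY_ARRIVED)
		return RETURN_REPLY;	/* lock granted or refused normally */

	/* Interrupted by a signal: ask the server to stop waiting for us. */
	return (kind == POSIX_LOCK) ? SEND_POSIX_CANCEL : SEND_WINDOWS_CANCEL;
}

Per the change below, a POSIX (transaction2-based) lock is cancelled with an NT_CANCEL SMB, a Windows lock with a LOCKINGX_CANCEL_LOCK request, and if the original reply then arrives within five seconds the system call is restarted.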
Signed-off-by: Jeremy Allison Signed-off-by: Steve French (cherry picked from 570d4d2d895569825d0d017d4e76b51138f68864 commit) --- fs/cifs/cifsglob.h | 15 +- fs/cifs/cifsproto.h | 4 + fs/cifs/cifssmb.c | 15 +- fs/cifs/file.c | 95 ++++++-- fs/cifs/netmisc.c | 1 + fs/cifs/smberr.h | 1 + fs/cifs/transport.c | 677 +++++++++++++++++++++++++++++++--------------------- 7 files changed, 512 insertions(+), 296 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 39b43e6a7509..b24006c47df1 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -3,6 +3,7 @@ * * Copyright (C) International Business Machines Corp., 2002,2006 * Author(s): Steve French (sfrench@us.ibm.com) + * Jeremy Allison (jra@samba.org) * * This library is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published @@ -267,14 +268,14 @@ struct cifsTconInfo { }; /* - * This info hangs off the cifsFileInfo structure. This is used to track - * byte stream locks on the file + * This info hangs off the cifsFileInfo structure, pointed to by llist. + * This is used to track byte stream locks on the file */ struct cifsLockInfo { - struct cifsLockInfo *next; - int start; - int length; - int type; + struct list_head llist; /* pointer to next cifsLockInfo */ + __u64 offset; + __u64 length; + __u8 type; }; /* @@ -305,6 +306,8 @@ struct cifsFileInfo { /* lock scope id (0 if none) */ struct file * pfile; /* needed for writepage */ struct inode * pInode; /* needed for oplock break */ + struct semaphore lock_sem; + struct list_head llist; /* list of byte range locks we have. */ unsigned closePend:1; /* file is marked to close */ unsigned invalidHandle:1; /* file closed via session abend */ atomic_t wrtPending; /* handle in use - defer close */ diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index a5ddc62d6fe6..b35c55c3c8bb 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -50,6 +50,10 @@ extern int SendReceive(const unsigned int /* xid */ , struct cifsSesInfo *, extern int SendReceive2(const unsigned int /* xid */ , struct cifsSesInfo *, struct kvec *, int /* nvec to send */, int * /* type of buf returned */ , const int long_op); +extern int SendReceiveBlockingLock(const unsigned int /* xid */ , struct cifsTconInfo *, + struct smb_hdr * /* input */ , + struct smb_hdr * /* out */ , + int * /* bytes returned */); extern int checkSMBhdr(struct smb_hdr *smb, __u16 mid); extern int checkSMB(struct smb_hdr *smb, __u16 mid, int length); extern int is_valid_oplock_break(struct smb_hdr *smb, struct TCP_Server_Info *); diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index c03c42ee9eb9..dcbc3e075e81 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -1460,8 +1460,13 @@ CIFSSMBLock(const int xid, struct cifsTconInfo *tcon, pSMB->hdr.smb_buf_length += count; pSMB->ByteCount = cpu_to_le16(count); - rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, + if (waitFlag) { + rc = SendReceiveBlockingLock(xid, tcon, (struct smb_hdr *) pSMB, + (struct smb_hdr *) pSMBr, &bytes_returned); + } else { + rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, (struct smb_hdr *) pSMBr, &bytes_returned, timeout); + } cifs_stats_inc(&tcon->num_locks); if (rc) { cFYI(1, ("Send error in Lock = %d", rc)); @@ -1546,8 +1551,14 @@ CIFSSMBPosixLock(const int xid, struct cifsTconInfo *tcon, pSMB->Reserved4 = 0; pSMB->hdr.smb_buf_length += byte_count; pSMB->ByteCount = cpu_to_le16(byte_count); - rc = SendReceive(xid, tcon->ses, (struct smb_hdr 
*) pSMB, + if (waitFlag) { + rc = SendReceiveBlockingLock(xid, tcon, (struct smb_hdr *) pSMB, + (struct smb_hdr *) pSMBr, &bytes_returned); + } else { + rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, (struct smb_hdr *) pSMBr, &bytes_returned, timeout); + } + if (rc) { cFYI(1, ("Send error in Posix Lock = %d", rc)); } else if (get_flag) { diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 52e2e4c8794b..e9c5ba9084fc 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -5,6 +5,7 @@ * * Copyright (C) International Business Machines Corp., 2002,2003 * Author(s): Steve French (sfrench@us.ibm.com) + * Jeremy Allison (jra@samba.org) * * This library is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published @@ -47,6 +48,8 @@ static inline struct cifsFileInfo *cifs_init_private( private_data->netfid = netfid; private_data->pid = current->tgid; init_MUTEX(&private_data->fh_sem); + init_MUTEX(&private_data->lock_sem); + INIT_LIST_HEAD(&private_data->llist); private_data->pfile = file; /* needed for writepage */ private_data->pInode = inode; private_data->invalidHandle = FALSE; @@ -473,6 +476,8 @@ int cifs_close(struct inode *inode, struct file *file) cifs_sb = CIFS_SB(inode->i_sb); pTcon = cifs_sb->tcon; if (pSMBFile) { + struct cifsLockInfo *li, *tmp; + pSMBFile->closePend = TRUE; if (pTcon) { /* no sense reconnecting to close a file that is @@ -496,6 +501,16 @@ int cifs_close(struct inode *inode, struct file *file) pSMBFile->netfid); } } + + /* Delete any outstanding lock records. + We'll lose them when the file is closed anyway. */ + down(&pSMBFile->lock_sem); + list_for_each_entry_safe(li, tmp, &pSMBFile->llist, llist) { + list_del(&li->llist); + kfree(li); + } + up(&pSMBFile->lock_sem); + write_lock(&GlobalSMBSeslock); list_del(&pSMBFile->flist); list_del(&pSMBFile->tlist); @@ -570,6 +585,21 @@ int cifs_closedir(struct inode *inode, struct file *file) return rc; } +static int store_file_lock(struct cifsFileInfo *fid, __u64 len, + __u64 offset, __u8 lockType) +{ + struct cifsLockInfo *li = kmalloc(sizeof(struct cifsLockInfo), GFP_KERNEL); + if (li == NULL) + return -ENOMEM; + li->offset = offset; + li->length = len; + li->type = lockType; + down(&fid->lock_sem); + list_add(&li->llist, &fid->llist); + up(&fid->lock_sem); + return 0; +} + int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock) { int rc, xid; @@ -581,6 +611,7 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock) struct cifsTconInfo *pTcon; __u16 netfid; __u8 lockType = LOCKING_ANDX_LARGE_FILES; + int posix_locking; length = 1 + pfLock->fl_end - pfLock->fl_start; rc = -EACCES; @@ -639,14 +670,14 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock) } netfid = ((struct cifsFileInfo *)file->private_data)->netfid; + posix_locking = (cifs_sb->tcon->ses->capabilities & CAP_UNIX) && + (CIFS_UNIX_FCNTL_CAP & le64_to_cpu(cifs_sb->tcon->fsUnixInfo.Capability)); /* BB add code here to normalize offset and length to account for negative length which we can not accept over the wire */ if (IS_GETLK(cmd)) { - if((cifs_sb->tcon->ses->capabilities & CAP_UNIX) && - (CIFS_UNIX_FCNTL_CAP & - le64_to_cpu(cifs_sb->tcon->fsUnixInfo.Capability))) { + if(posix_locking) { int posix_lock_type; if(lockType & LOCKING_ANDX_SHARED_LOCK) posix_lock_type = CIFS_RDLCK; @@ -682,9 +713,15 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock) FreeXid(xid); return rc; } - if ((cifs_sb->tcon->ses->capabilities & CAP_UNIX) && - 
(CIFS_UNIX_FCNTL_CAP & - le64_to_cpu(cifs_sb->tcon->fsUnixInfo.Capability))) { + + if (!numLock && !numUnlock) { + /* if no lock or unlock then nothing + to do since we do not know what it is */ + FreeXid(xid); + return -EOPNOTSUPP; + } + + if (posix_locking) { int posix_lock_type; if(lockType & LOCKING_ANDX_SHARED_LOCK) posix_lock_type = CIFS_RDLCK; @@ -693,18 +730,46 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock) if(numUnlock == 1) posix_lock_type = CIFS_UNLCK; - else if(numLock == 0) { - /* if no lock or unlock then nothing - to do since we do not know what it is */ - FreeXid(xid); - return -EOPNOTSUPP; - } + rc = CIFSSMBPosixLock(xid, pTcon, netfid, 0 /* set */, length, pfLock, posix_lock_type, wait_flag); - } else - rc = CIFSSMBLock(xid, pTcon, netfid, length, pfLock->fl_start, - numUnlock, numLock, lockType, wait_flag); + } else { + struct cifsFileInfo *fid = (struct cifsFileInfo *)file->private_data; + + if (numLock) { + rc = CIFSSMBLock(xid, pTcon, netfid, length, pfLock->fl_start, + 0, numLock, lockType, wait_flag); + + if (rc == 0) { + /* For Windows locks we must store them. */ + rc = store_file_lock(fid, length, + pfLock->fl_start, lockType); + } + } else if (numUnlock) { + /* For each stored lock that this unlock overlaps + completely, unlock it. */ + int stored_rc = 0; + struct cifsLockInfo *li, *tmp; + + down(&fid->lock_sem); + list_for_each_entry_safe(li, tmp, &fid->llist, llist) { + if (pfLock->fl_start <= li->offset && + length >= li->length) { + stored_rc = CIFSSMBLock(xid, pTcon, netfid, + li->length, li->offset, + 1, 0, li->type, FALSE); + if (stored_rc) + rc = stored_rc; + + list_del(&li->llist); + kfree(li); + } + } + up(&fid->lock_sem); + } + } + if (pfLock->fl_flags & FL_POSIX) posix_lock_file_wait(file, pfLock); FreeXid(xid); diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c index b66eff5dc624..ce87550e918f 100644 --- a/fs/cifs/netmisc.c +++ b/fs/cifs/netmisc.c @@ -72,6 +72,7 @@ static const struct smb_to_posix_error mapping_table_ERRDOS[] = { {ERRinvlevel,-EOPNOTSUPP}, {ERRdirnotempty, -ENOTEMPTY}, {ERRnotlocked, -ENOLCK}, + {ERRcancelviolation, -ENOLCK}, {ERRalreadyexists, -EEXIST}, {ERRmoredata, -EOVERFLOW}, {ERReasnotsupported,-EOPNOTSUPP}, diff --git a/fs/cifs/smberr.h b/fs/cifs/smberr.h index cd41c67ff8d3..212c3c296409 100644 --- a/fs/cifs/smberr.h +++ b/fs/cifs/smberr.h @@ -95,6 +95,7 @@ #define ERRinvlevel 124 #define ERRdirnotempty 145 #define ERRnotlocked 158 +#define ERRcancelviolation 173 #define ERRalreadyexists 183 #define ERRbadpipe 230 #define ERRpipebusy 231 diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index 748b0df07a73..48d47b46b1fb 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -37,7 +37,7 @@ extern mempool_t *cifs_mid_poolp; extern kmem_cache_t *cifs_oplock_cachep; static struct mid_q_entry * -AllocMidQEntry(struct smb_hdr *smb_buffer, struct cifsSesInfo *ses) +AllocMidQEntry(const struct smb_hdr *smb_buffer, struct cifsSesInfo *ses) { struct mid_q_entry *temp; @@ -204,6 +204,10 @@ smb_send(struct socket *ssocket, struct smb_hdr *smb_buffer, rc = 0; } + /* Don't want to modify the buffer as a + side effect of this call. 
*/ + smb_buffer->smb_buf_length = smb_buf_length; + return rc; } @@ -218,6 +222,7 @@ smb_send2(struct socket *ssocket, struct kvec *iov, int n_vec, unsigned int len = iov[0].iov_len; unsigned int total_len; int first_vec = 0; + unsigned int smb_buf_length = smb_buffer->smb_buf_length; if(ssocket == NULL) return -ENOTSOCK; /* BB eventually add reconnect code here */ @@ -294,36 +299,15 @@ smb_send2(struct socket *ssocket, struct kvec *iov, int n_vec, } else rc = 0; + /* Don't want to modify the buffer as a + side effect of this call. */ + smb_buffer->smb_buf_length = smb_buf_length; + return rc; } -int -SendReceive2(const unsigned int xid, struct cifsSesInfo *ses, - struct kvec *iov, int n_vec, int * pRespBufType /* ret */, - const int long_op) +static int wait_for_free_request(struct cifsSesInfo *ses, const int long_op) { - int rc = 0; - unsigned int receive_len; - unsigned long timeout; - struct mid_q_entry *midQ; - struct smb_hdr *in_buf = iov[0].iov_base; - - *pRespBufType = CIFS_NO_BUFFER; /* no response buf yet */ - - if ((ses == NULL) || (ses->server == NULL)) { - cifs_small_buf_release(in_buf); - cERROR(1,("Null session")); - return -EIO; - } - - if(ses->server->tcpStatus == CifsExiting) { - cifs_small_buf_release(in_buf); - return -ENOENT; - } - - /* Ensure that we do not send more than 50 overlapping requests - to the same server. We may make this configurable later or - use ses->maxReq */ if(long_op == -1) { /* oplock breaks must not be held up */ atomic_inc(&ses->server->inFlight); @@ -346,53 +330,140 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses, } else { if(ses->server->tcpStatus == CifsExiting) { spin_unlock(&GlobalMid_Lock); - cifs_small_buf_release(in_buf); return -ENOENT; } - /* can not count locking commands against total since - they are allowed to block on server */ + /* can not count locking commands against total since + they are allowed to block on server */ - if(long_op < 3) { /* update # of requests on the wire to server */ + if (long_op < 3) atomic_inc(&ses->server->inFlight); - } spin_unlock(&GlobalMid_Lock); break; } } } - /* make sure that we sign in the same order that we send on this socket - and avoid races inside tcp sendmsg code that could cause corruption - of smb data */ - - down(&ses->server->tcpSem); + return 0; +} +static int allocate_mid(struct cifsSesInfo *ses, struct smb_hdr *in_buf, + struct mid_q_entry **ppmidQ) +{ if (ses->server->tcpStatus == CifsExiting) { - rc = -ENOENT; - goto out_unlock2; + return -ENOENT; } else if (ses->server->tcpStatus == CifsNeedReconnect) { cFYI(1,("tcp session dead - return to caller to retry")); - rc = -EAGAIN; - goto out_unlock2; + return -EAGAIN; } else if (ses->status != CifsGood) { /* check if SMB session is bad because we are setting it up */ if((in_buf->Command != SMB_COM_SESSION_SETUP_ANDX) && (in_buf->Command != SMB_COM_NEGOTIATE)) { - rc = -EAGAIN; - goto out_unlock2; + return -EAGAIN; } /* else ok - we are setting up session */ } - midQ = AllocMidQEntry(in_buf, ses); - if (midQ == NULL) { + *ppmidQ = AllocMidQEntry(in_buf, ses); + if (*ppmidQ == NULL) { + return -ENOMEM; + } + return 0; +} + +static int wait_for_response(struct cifsSesInfo *ses, + struct mid_q_entry *midQ, + unsigned long timeout, + unsigned long time_to_wait) +{ + unsigned long curr_timeout; + + for (;;) { + curr_timeout = timeout + jiffies; + wait_event(ses->server->response_q, + (!(midQ->midState == MID_REQUEST_SUBMITTED)) || + time_after(jiffies, curr_timeout) || + ((ses->server->tcpStatus != CifsGood) && + 
(ses->server->tcpStatus != CifsNew))); + + if (time_after(jiffies, curr_timeout) && + (midQ->midState == MID_REQUEST_SUBMITTED) && + ((ses->server->tcpStatus == CifsGood) || + (ses->server->tcpStatus == CifsNew))) { + + unsigned long lrt; + + /* We timed out. Is the server still + sending replies ? */ + spin_lock(&GlobalMid_Lock); + lrt = ses->server->lstrp; + spin_unlock(&GlobalMid_Lock); + + /* Calculate time_to_wait past last receive time. + Although we prefer not to time out if the + server is still responding - we will time + out if the server takes more than 15 (or 45 + or 180) seconds to respond to this request + and has not responded to any request from + other threads on the client within 10 seconds */ + lrt += time_to_wait; + if (time_after(jiffies, lrt)) { + /* No replies for time_to_wait. */ + cERROR(1,("server not responding")); + return -1; + } + } else { + return 0; + } + } +} + +int +SendReceive2(const unsigned int xid, struct cifsSesInfo *ses, + struct kvec *iov, int n_vec, int * pRespBufType /* ret */, + const int long_op) +{ + int rc = 0; + unsigned int receive_len; + unsigned long timeout; + struct mid_q_entry *midQ; + struct smb_hdr *in_buf = iov[0].iov_base; + + *pRespBufType = CIFS_NO_BUFFER; /* no response buf yet */ + + if ((ses == NULL) || (ses->server == NULL)) { + cifs_small_buf_release(in_buf); + cERROR(1,("Null session")); + return -EIO; + } + + if(ses->server->tcpStatus == CifsExiting) { + cifs_small_buf_release(in_buf); + return -ENOENT; + } + + /* Ensure that we do not send more than 50 overlapping requests + to the same server. We may make this configurable later or + use ses->maxReq */ + + rc = wait_for_free_request(ses, long_op); + if (rc) { + cifs_small_buf_release(in_buf); + return rc; + } + + /* make sure that we sign in the same order that we send on this socket + and avoid races inside tcp sendmsg code that could cause corruption + of smb data */ + + down(&ses->server->tcpSem); + + rc = allocate_mid(ses, in_buf, &midQ); + if (rc) { up(&ses->server->tcpSem); cifs_small_buf_release(in_buf); - /* If not lock req, update # of requests on wire to server */ - if(long_op < 3) { - atomic_dec(&ses->server->inFlight); - wake_up(&ses->server->request_q); - } - return -ENOMEM; + /* Update # of requests on wire to server */ + atomic_dec(&ses->server->inFlight); + wake_up(&ses->server->request_q); + return rc; } rc = cifs_sign_smb2(iov, n_vec, ses->server, &midQ->sequence_number); @@ -407,32 +478,23 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses, atomic_dec(&ses->server->inSend); midQ->when_sent = jiffies; #endif - if(rc < 0) { - DeleteMidQEntry(midQ); - up(&ses->server->tcpSem); - cifs_small_buf_release(in_buf); - /* If not lock req, update # of requests on wire to server */ - if(long_op < 3) { - atomic_dec(&ses->server->inFlight); - wake_up(&ses->server->request_q); - } - return rc; - } else { - up(&ses->server->tcpSem); - cifs_small_buf_release(in_buf); - } + + up(&ses->server->tcpSem); + cifs_small_buf_release(in_buf); + + if(rc < 0) + goto out; if (long_op == -1) - goto cifs_no_response_exit2; + goto out; else if (long_op == 2) /* writes past end of file can take loong time */ timeout = 180 * HZ; else if (long_op == 1) timeout = 45 * HZ; /* should be greater than servers oplock break timeout (about 43 seconds) */ - else if (long_op > 2) { - timeout = MAX_SCHEDULE_TIMEOUT; - } else + else timeout = 15 * HZ; + /* wait for 15 seconds or until woken up due to response arriving or due to last connection to this server being unmounted */ if 
(signal_pending(current)) { @@ -442,53 +504,7 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses, } /* No user interrupts in wait - wreaks havoc with performance */ - if(timeout != MAX_SCHEDULE_TIMEOUT) { - unsigned long curr_timeout; - - for (;;) { - curr_timeout = timeout + jiffies; - wait_event(ses->server->response_q, - (!(midQ->midState == MID_REQUEST_SUBMITTED)) || - time_after(jiffies, curr_timeout) || - ((ses->server->tcpStatus != CifsGood) && - (ses->server->tcpStatus != CifsNew))); - - if (time_after(jiffies, curr_timeout) && - (midQ->midState == MID_REQUEST_SUBMITTED) && - ((ses->server->tcpStatus == CifsGood) || - (ses->server->tcpStatus == CifsNew))) { - - unsigned long lrt; - - /* We timed out. Is the server still - sending replies ? */ - spin_lock(&GlobalMid_Lock); - lrt = ses->server->lstrp; - spin_unlock(&GlobalMid_Lock); - - /* Calculate 10 seconds past last receive time. - Although we prefer not to time out if the - server is still responding - we will time - out if the server takes more than 15 (or 45 - or 180) seconds to respond to this request - and has not responded to any request from - other threads on the client within 10 seconds */ - lrt += (10 * HZ); - if (time_after(jiffies, lrt)) { - /* No replies for 10 seconds. */ - cERROR(1,("server not responding")); - break; - } - } else { - break; - } - } - } else { - wait_event(ses->server->response_q, - (!(midQ->midState == MID_REQUEST_SUBMITTED)) || - ((ses->server->tcpStatus != CifsGood) && - (ses->server->tcpStatus != CifsNew))); - } + wait_for_response(ses, midQ, timeout, 10 * HZ); spin_lock(&GlobalMid_Lock); if (midQ->resp_buf) { @@ -516,11 +532,9 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses, } spin_unlock(&GlobalMid_Lock); DeleteMidQEntry(midQ); - /* If not lock req, update # of requests on wire to server */ - if(long_op < 3) { - atomic_dec(&ses->server->inFlight); - wake_up(&ses->server->request_q); - } + /* Update # of requests on wire to server */ + atomic_dec(&ses->server->inFlight); + wake_up(&ses->server->request_q); return rc; } @@ -571,24 +585,12 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses, cFYI(1,("Bad MID state?")); } } -cifs_no_response_exit2: - DeleteMidQEntry(midQ); - - if(long_op < 3) { - atomic_dec(&ses->server->inFlight); - wake_up(&ses->server->request_q); - } - return rc; +out: -out_unlock2: - up(&ses->server->tcpSem); - cifs_small_buf_release(in_buf); - /* If not lock req, update # of requests on wire to server */ - if(long_op < 3) { - atomic_dec(&ses->server->inFlight); - wake_up(&ses->server->request_q); - } + DeleteMidQEntry(midQ); + atomic_dec(&ses->server->inFlight); + wake_up(&ses->server->request_q); return rc; } @@ -618,85 +620,34 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses, /* Ensure that we do not send more than 50 overlapping requests to the same server. 
We may make this configurable later or use ses->maxReq */ - if(long_op == -1) { - /* oplock breaks must not be held up */ - atomic_inc(&ses->server->inFlight); - } else { - spin_lock(&GlobalMid_Lock); - while(1) { - if(atomic_read(&ses->server->inFlight) >= - cifs_max_pending){ - spin_unlock(&GlobalMid_Lock); -#ifdef CONFIG_CIFS_STATS2 - atomic_inc(&ses->server->num_waiters); -#endif - wait_event(ses->server->request_q, - atomic_read(&ses->server->inFlight) - < cifs_max_pending); -#ifdef CONFIG_CIFS_STATS2 - atomic_dec(&ses->server->num_waiters); -#endif - spin_lock(&GlobalMid_Lock); - } else { - if(ses->server->tcpStatus == CifsExiting) { - spin_unlock(&GlobalMid_Lock); - return -ENOENT; - } - /* can not count locking commands against total since - they are allowed to block on server */ - - if(long_op < 3) { - /* update # of requests on the wire to server */ - atomic_inc(&ses->server->inFlight); - } - spin_unlock(&GlobalMid_Lock); - break; - } - } - } + rc = wait_for_free_request(ses, long_op); + if (rc) + return rc; + /* make sure that we sign in the same order that we send on this socket and avoid races inside tcp sendmsg code that could cause corruption of smb data */ down(&ses->server->tcpSem); - if (ses->server->tcpStatus == CifsExiting) { - rc = -ENOENT; - goto out_unlock; - } else if (ses->server->tcpStatus == CifsNeedReconnect) { - cFYI(1,("tcp session dead - return to caller to retry")); - rc = -EAGAIN; - goto out_unlock; - } else if (ses->status != CifsGood) { - /* check if SMB session is bad because we are setting it up */ - if((in_buf->Command != SMB_COM_SESSION_SETUP_ANDX) && - (in_buf->Command != SMB_COM_NEGOTIATE)) { - rc = -EAGAIN; - goto out_unlock; - } /* else ok - we are setting up session */ - } - midQ = AllocMidQEntry(in_buf, ses); - if (midQ == NULL) { + rc = allocate_mid(ses, in_buf, &midQ); + if (rc) { up(&ses->server->tcpSem); - /* If not lock req, update # of requests on wire to server */ - if(long_op < 3) { - atomic_dec(&ses->server->inFlight); - wake_up(&ses->server->request_q); - } - return -ENOMEM; + /* Update # of requests on wire to server */ + atomic_dec(&ses->server->inFlight); + wake_up(&ses->server->request_q); + return rc; } if (in_buf->smb_buf_length > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) { - up(&ses->server->tcpSem); cERROR(1, ("Illegal length, greater than maximum frame, %d", in_buf->smb_buf_length)); DeleteMidQEntry(midQ); - /* If not lock req, update # of requests on wire to server */ - if(long_op < 3) { - atomic_dec(&ses->server->inFlight); - wake_up(&ses->server->request_q); - } + up(&ses->server->tcpSem); + /* Update # of requests on wire to server */ + atomic_dec(&ses->server->inFlight); + wake_up(&ses->server->request_q); return -EIO; } @@ -712,27 +663,19 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses, atomic_dec(&ses->server->inSend); midQ->when_sent = jiffies; #endif - if(rc < 0) { - DeleteMidQEntry(midQ); - up(&ses->server->tcpSem); - /* If not lock req, update # of requests on wire to server */ - if(long_op < 3) { - atomic_dec(&ses->server->inFlight); - wake_up(&ses->server->request_q); - } - return rc; - } else - up(&ses->server->tcpSem); + up(&ses->server->tcpSem); + + if(rc < 0) + goto out; + if (long_op == -1) - goto cifs_no_response_exit; + goto out; else if (long_op == 2) /* writes past end of file can take loong time */ timeout = 180 * HZ; else if (long_op == 1) timeout = 45 * HZ; /* should be greater than servers oplock break timeout (about 43 seconds) */ - else if (long_op > 2) { - timeout = 
MAX_SCHEDULE_TIMEOUT; - } else + else timeout = 15 * HZ; /* wait for 15 seconds or until woken up due to response arriving or due to last connection to this server being unmounted */ @@ -743,47 +686,7 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses, } /* No user interrupts in wait - wreaks havoc with performance */ - if(timeout != MAX_SCHEDULE_TIMEOUT) { - unsigned long curr_timeout; - - for (;;) { - curr_timeout = timeout + jiffies; - wait_event(ses->server->response_q, - (!(midQ->midState == MID_REQUEST_SUBMITTED)) || - time_after(jiffies, curr_timeout) || - ((ses->server->tcpStatus != CifsGood) && - (ses->server->tcpStatus != CifsNew))); - - if (time_after(jiffies, curr_timeout) && - (midQ->midState == MID_REQUEST_SUBMITTED) && - ((ses->server->tcpStatus == CifsGood) || - (ses->server->tcpStatus == CifsNew))) { - - unsigned long lrt; - - /* We timed out. Is the server still - sending replies ? */ - spin_lock(&GlobalMid_Lock); - lrt = ses->server->lstrp; - spin_unlock(&GlobalMid_Lock); - - /* Calculate 10 seconds past last receive time*/ - lrt += (10 * HZ); - if (time_after(jiffies, lrt)) { - /* Server sent no reply in 10 seconds */ - cERROR(1,("Server not responding")); - break; - } - } else { - break; - } - } - } else { - wait_event(ses->server->response_q, - (!(midQ->midState == MID_REQUEST_SUBMITTED)) || - ((ses->server->tcpStatus != CifsGood) && - (ses->server->tcpStatus != CifsNew))); - } + wait_for_response(ses, midQ, timeout, 10 * HZ); spin_lock(&GlobalMid_Lock); if (midQ->resp_buf) { @@ -811,11 +714,9 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses, } spin_unlock(&GlobalMid_Lock); DeleteMidQEntry(midQ); - /* If not lock req, update # of requests on wire to server */ - if(long_op < 3) { - atomic_dec(&ses->server->inFlight); - wake_up(&ses->server->request_q); - } + /* Update # of requests on wire to server */ + atomic_dec(&ses->server->inFlight); + wake_up(&ses->server->request_q); return rc; } @@ -862,23 +763,253 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses, cERROR(1,("Bad MID state?")); } } -cifs_no_response_exit: + +out: + DeleteMidQEntry(midQ); + atomic_dec(&ses->server->inFlight); + wake_up(&ses->server->request_q); - if(long_op < 3) { - atomic_dec(&ses->server->inFlight); - wake_up(&ses->server->request_q); - } + return rc; +} +/* Send an NT_CANCEL SMB to cause the POSIX blocking lock to return. */ + +static int +send_nt_cancel(struct cifsTconInfo *tcon, struct smb_hdr *in_buf, + struct mid_q_entry *midQ) +{ + int rc = 0; + struct cifsSesInfo *ses = tcon->ses; + __u16 mid = in_buf->Mid; + + header_assemble(in_buf, SMB_COM_NT_CANCEL, tcon, 0); + in_buf->Mid = mid; + down(&ses->server->tcpSem); + rc = cifs_sign_smb(in_buf, ses->server, &midQ->sequence_number); + if (rc) { + up(&ses->server->tcpSem); + return rc; + } + rc = smb_send(ses->server->ssocket, in_buf, in_buf->smb_buf_length, + (struct sockaddr *) &(ses->server->addr.sockAddr)); + up(&ses->server->tcpSem); return rc; +} + +/* We send a LOCKINGX_CANCEL_LOCK to cause the Windows + blocking lock to return. */ + +static int +send_lock_cancel(const unsigned int xid, struct cifsTconInfo *tcon, + struct smb_hdr *in_buf, + struct smb_hdr *out_buf) +{ + int bytes_returned; + struct cifsSesInfo *ses = tcon->ses; + LOCK_REQ *pSMB = (LOCK_REQ *)in_buf; + + /* We just modify the current in_buf to change + the type of lock from LOCKING_ANDX_SHARED_LOCK + or LOCKING_ANDX_EXCLUSIVE_LOCK to + LOCKING_ANDX_CANCEL_LOCK. 
*/ + + pSMB->LockType = LOCKING_ANDX_CANCEL_LOCK|LOCKING_ANDX_LARGE_FILES; + pSMB->Timeout = 0; + pSMB->hdr.Mid = GetNextMid(ses->server); + + return SendReceive(xid, ses, in_buf, out_buf, + &bytes_returned, 0); +} + +int +SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon, + struct smb_hdr *in_buf, struct smb_hdr *out_buf, + int *pbytes_returned) +{ + int rc = 0; + int rstart = 0; + unsigned int receive_len; + struct mid_q_entry *midQ; + struct cifsSesInfo *ses; + + if (tcon == NULL || tcon->ses == NULL) { + cERROR(1,("Null smb session")); + return -EIO; + } + ses = tcon->ses; + + if(ses->server == NULL) { + cERROR(1,("Null tcp session")); + return -EIO; + } + + if(ses->server->tcpStatus == CifsExiting) + return -ENOENT; + + /* Ensure that we do not send more than 50 overlapping requests + to the same server. We may make this configurable later or + use ses->maxReq */ + + rc = wait_for_free_request(ses, 3); + if (rc) + return rc; + + /* make sure that we sign in the same order that we send on this socket + and avoid races inside tcp sendmsg code that could cause corruption + of smb data */ + + down(&ses->server->tcpSem); + + rc = allocate_mid(ses, in_buf, &midQ); + if (rc) { + up(&ses->server->tcpSem); + return rc; + } + + if (in_buf->smb_buf_length > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) { + up(&ses->server->tcpSem); + cERROR(1, ("Illegal length, greater than maximum frame, %d", + in_buf->smb_buf_length)); + DeleteMidQEntry(midQ); + return -EIO; + } + + rc = cifs_sign_smb(in_buf, ses->server, &midQ->sequence_number); -out_unlock: + midQ->midState = MID_REQUEST_SUBMITTED; +#ifdef CONFIG_CIFS_STATS2 + atomic_inc(&ses->server->inSend); +#endif + rc = smb_send(ses->server->ssocket, in_buf, in_buf->smb_buf_length, + (struct sockaddr *) &(ses->server->addr.sockAddr)); +#ifdef CONFIG_CIFS_STATS2 + atomic_dec(&ses->server->inSend); + midQ->when_sent = jiffies; +#endif up(&ses->server->tcpSem); - /* If not lock req, update # of requests on wire to server */ - if(long_op < 3) { - atomic_dec(&ses->server->inFlight); - wake_up(&ses->server->request_q); + + if(rc < 0) { + DeleteMidQEntry(midQ); + return rc; + } + + /* Wait for a reply - allow signals to interrupt. */ + rc = wait_event_interruptible(ses->server->response_q, + (!(midQ->midState == MID_REQUEST_SUBMITTED)) || + ((ses->server->tcpStatus != CifsGood) && + (ses->server->tcpStatus != CifsNew))); + + /* Were we interrupted by a signal ? */ + if ((rc == -ERESTARTSYS) && + (midQ->midState == MID_REQUEST_SUBMITTED) && + ((ses->server->tcpStatus == CifsGood) || + (ses->server->tcpStatus == CifsNew))) { + + if (in_buf->Command == SMB_COM_TRANSACTION2) { + /* POSIX lock. We send a NT_CANCEL SMB to cause the + blocking lock to return. */ + + rc = send_nt_cancel(tcon, in_buf, midQ); + if (rc) { + DeleteMidQEntry(midQ); + return rc; + } + } else { + /* Windows lock. We send a LOCKINGX_CANCEL_LOCK + to cause the blocking lock to return. */ + + rc = send_lock_cancel(xid, tcon, in_buf, out_buf); + + /* If we get -ENOLCK back the lock may have + already been removed. Don't exit in this case. */ + if (rc && rc != -ENOLCK) { + DeleteMidQEntry(midQ); + return rc; + } + } + + /* Wait 5 seconds for the response. */ + if (wait_for_response(ses, midQ, 5 * HZ, 5 * HZ)==0) { + /* We got the response - restart system call. 
*/ + rstart = 1; + } + } + + spin_lock(&GlobalMid_Lock); + if (midQ->resp_buf) { + spin_unlock(&GlobalMid_Lock); + receive_len = midQ->resp_buf->smb_buf_length; + } else { + cERROR(1,("No response for cmd %d mid %d", + midQ->command, midQ->mid)); + if(midQ->midState == MID_REQUEST_SUBMITTED) { + if(ses->server->tcpStatus == CifsExiting) + rc = -EHOSTDOWN; + else { + ses->server->tcpStatus = CifsNeedReconnect; + midQ->midState = MID_RETRY_NEEDED; + } + } + + if (rc != -EHOSTDOWN) { + if(midQ->midState == MID_RETRY_NEEDED) { + rc = -EAGAIN; + cFYI(1,("marking request for retry")); + } else { + rc = -EIO; + } + } + spin_unlock(&GlobalMid_Lock); + DeleteMidQEntry(midQ); + return rc; } + + if (receive_len > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE) { + cERROR(1, ("Frame too large received. Length: %d Xid: %d", + receive_len, xid)); + rc = -EIO; + } else { /* rcvd frame is ok */ + + if (midQ->resp_buf && out_buf + && (midQ->midState == MID_RESPONSE_RECEIVED)) { + out_buf->smb_buf_length = receive_len; + memcpy((char *)out_buf + 4, + (char *)midQ->resp_buf + 4, + receive_len); + + dump_smb(out_buf, 92); + /* convert the length into a more usable form */ + if((receive_len > 24) && + (ses->server->secMode & (SECMODE_SIGN_REQUIRED | + SECMODE_SIGN_ENABLED))) { + rc = cifs_verify_signature(out_buf, + ses->server->mac_signing_key, + midQ->sequence_number+1); + if(rc) { + cERROR(1,("Unexpected SMB signature")); + /* BB FIXME add code to kill session */ + } + } + + *pbytes_returned = out_buf->smb_buf_length; + + /* BB special case reconnect tid and uid here? */ + rc = map_smb_to_linux_error(out_buf); + /* convert ByteCount if necessary */ + if (receive_len >= + sizeof (struct smb_hdr) - + 4 /* do not count RFC1001 header */ + + (2 * out_buf->WordCount) + 2 /* bcc */ ) + BCC(out_buf) = le16_to_cpu(BCC_LE(out_buf)); + } else { + rc = -EIO; + cERROR(1,("Bad MID state?")); + } + } + DeleteMidQEntry(midQ); + if (rstart && rc == -EACCES) + return -ERESTARTSYS; return rc; } -- cgit v1.2.3 From 66abda5e1fa48e12e06d0b68746b0e67202a97d2 Mon Sep 17 00:00:00 2001 From: Steve French Date: Fri, 11 Aug 2006 16:52:09 +0000 Subject: [CIFS] Fix oops when negotiating lanman and no password specified Pointed out by Guenter Kukkukk Signed-of-by: Steve French (cherry picked from bbf33d512da608c7221fec42b56b9ef89c25a5ee commit) --- fs/cifs/cifsencrypt.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c index a89efaf78a26..4bc250b2d9fc 100644 --- a/fs/cifs/cifsencrypt.c +++ b/fs/cifs/cifsencrypt.c @@ -277,7 +277,8 @@ void calc_lanman_hash(struct cifsSesInfo * ses, char * lnm_session_key) return; memset(password_with_pad, 0, CIFS_ENCPWD_SIZE); - strncpy(password_with_pad, ses->password, CIFS_ENCPWD_SIZE); + if(ses->password) + strncpy(password_with_pad, ses->password, CIFS_ENCPWD_SIZE); if((ses->server->secMode & SECMODE_PW_ENCRYPT) == 0) if(extended_security & CIFSSEC_MAY_PLNTXT) { -- cgit v1.2.3 From 1725cd0ae07bb31f68803edcc5bdc99952c7d2f4 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sun, 13 Aug 2006 23:24:17 -0700 Subject: [PATCH] adfs error message fix Don't use NULL as a printf control string. Fixes bug #6889. 
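The adfs change above is the usual printf-control-string rule; the toy program below (the report() helper is hypothetical, not the adfs_error() prototype) shows why a NULL format string is fatal while a fixed message used as the format is fine.

#include <stdarg.h>
#include <stdio.h>

/* Hypothetical helper, shaped like a printf-style error reporter. */
static void report(const char *fmt, ...)
{
	va_list ap;
	char buf[128];

	va_start(ap, fmt);
	vsnprintf(buf, sizeof(buf), fmt, ap);	/* dereferences fmt: a NULL
						   control string is undefined
						   behaviour here, an oops in
						   the kernel */
	va_end(ap);
	fprintf(stderr, "error: %s\n", buf);
}

int main(void)
{
	report("map corrupted");		/* right: the message is the format */
	/* report(NULL, "map corrupted");	   wrong: NULL control string */
	return 0;
}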
Cc: Ralph Corderoy Signed-off-by: Andrew Morton Signed-off-by: Greg Kroah-Hartman --- fs/adfs/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/adfs/super.c b/fs/adfs/super.c index ba1c88af49fe..82011019494c 100644 --- a/fs/adfs/super.c +++ b/fs/adfs/super.c @@ -308,7 +308,7 @@ static struct adfs_discmap *adfs_read_map(struct super_block *sb, struct adfs_di if (adfs_checkmap(sb, dm)) return dm; - adfs_error(sb, NULL, "map corrupted"); + adfs_error(sb, "map corrupted"); error_free: while (--zone >= 0) -- cgit v1.2.3 From 95f8797f42b058333d1e6f0d1dcd8edf5dc6c244 Mon Sep 17 00:00:00 2001 From: Dan Bastone Date: Sun, 13 Aug 2006 23:24:18 -0700 Subject: [PATCH] initialize parts of udf inode earlier in create Eric says: > I saw an oops down this path when trying to create a new file on a UDF > filesystem which was internally marked as readonly, but mounted rw: > > udf_create > udf_new_inode > new_inode > alloc_inode > udf_alloc_inode > udf_new_block > returns EIO due to readonlyness > iput (on error) I ran into the same issue today, but when listing a directory with invalid/corrupt entries: udf_lookup udf_iget get_new_inode_fast alloc_inode udf_alloc_inode __udf_read_inode fails for any reason iput (on error) ... The following patch to udf_alloc_inode() should take care of both (and other similar) cases, but I've only tested it with udf_lookup(). Signed-off-by: Dan Bastone Cc: Eric Sandeen Signed-off-by: Andrew Morton Signed-off-by: Greg Kroah-Hartman --- fs/udf/super.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'fs') diff --git a/fs/udf/super.c b/fs/udf/super.c index 4df822c881b6..7de172efa084 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -115,6 +115,13 @@ static struct inode *udf_alloc_inode(struct super_block *sb) ei = (struct udf_inode_info *)kmem_cache_alloc(udf_inode_cachep, SLAB_KERNEL); if (!ei) return NULL; + + ei->i_unique = 0; + ei->i_lenExtents = 0; + ei->i_next_alloc_block = 0; + ei->i_next_alloc_goal = 0; + ei->i_strat4096 = 0; + return &ei->vfs_inode; } -- cgit v1.2.3 From 1d7ea7324ae7a59f8e17e4ba76a2707c1e6f24d2 Mon Sep 17 00:00:00 2001 From: Alexander Zarochentsev Date: Sun, 13 Aug 2006 23:24:27 -0700 Subject: [PATCH] fuse: fix error case in fuse_readpages Don't let fuse_readpages leave the @pages list not empty when exiting on error. 
[akpm@osdl.org: kernel-doc fixes] Signed-off-by: Alexander Zarochentsev Signed-off-by: Miklos Szeredi Signed-off-by: Andrew Morton Signed-off-by: Greg Kroah-Hartman --- fs/fuse/file.c | 10 ++++++++-- include/linux/mm.h | 1 + mm/swap.c | 20 ++++++++++++++++++++ 3 files changed, 29 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 63614ed16336..5c4fcd1dbf59 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -395,14 +395,16 @@ static int fuse_readpages(struct file *file, struct address_space *mapping, struct fuse_readpages_data data; int err; + err = -EIO; if (is_bad_inode(inode)) - return -EIO; + goto clean_pages_up; data.file = file; data.inode = inode; data.req = fuse_get_req(fc); + err = PTR_ERR(data.req); if (IS_ERR(data.req)) - return PTR_ERR(data.req); + goto clean_pages_up; err = read_cache_pages(mapping, pages, fuse_readpages_fill, &data); if (!err) { @@ -412,6 +414,10 @@ static int fuse_readpages(struct file *file, struct address_space *mapping, fuse_put_request(fc, data.req); } return err; + +clean_pages_up: + put_pages_list(pages); + return err; } static size_t fuse_send_write(struct fuse_req *req, struct file *file, diff --git a/include/linux/mm.h b/include/linux/mm.h index 990957e0929f..f0b135cd86da 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -336,6 +336,7 @@ static inline void init_page_count(struct page *page) } void put_page(struct page *page); +void put_pages_list(struct list_head *pages); void split_page(struct page *page, unsigned int order); diff --git a/mm/swap.c b/mm/swap.c index 8fd095c4ae51..687686a61f7c 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -54,6 +54,26 @@ void put_page(struct page *page) } EXPORT_SYMBOL(put_page); +/** + * put_pages_list(): release a list of pages + * + * Release a list of pages which are strung together on page.lru. Currently + * used by read_cache_pages() and related error recovery code. + * + * @pages: list of pages threaded on page->lru + */ +void put_pages_list(struct list_head *pages) +{ + while (!list_empty(pages)) { + struct page *victim; + + victim = list_entry(pages->prev, struct page, lru); + list_del(&victim->lru); + page_cache_release(victim); + } +} +EXPORT_SYMBOL(put_pages_list); + /* * Writeback is about to end against a page which has been marked for immediate * reclaim. If it still appears to be reclaimable, move it to the tail of the -- cgit v1.2.3 From 74361cb6828398a96167b3234e186fbd731e5f30 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 14 Aug 2006 08:54:48 -0700 Subject: [PATCH] fcntl(F_SETSIG) fix fcntl(F_SETSIG) no longer works on leases because lease_release_private_callback() gets called as the lease is copied in order to initialise it. The problem is that lease_alloc() performs an unnecessary initialisation, which sets the lease_manager_ops. Avoid the problem by allocating the target lease structure using locks_alloc_lock(). 
Signed-off-by: Trond Myklebust Signed-off-by: Andrew Morton Signed-off-by: Greg Kroah-Hartman --- fs/locks.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/locks.c b/fs/locks.c index b0b41a64e10b..d7c53392cac1 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -1421,8 +1421,9 @@ static int __setlease(struct file *filp, long arg, struct file_lock **flp) if (!leases_enable) goto out; - error = lease_alloc(filp, arg, &fl); - if (error) + error = -ENOMEM; + fl = locks_alloc_lock(); + if (fl == NULL) goto out; locks_copy_lock(fl, lease); @@ -1430,6 +1431,7 @@ static int __setlease(struct file *filp, long arg, struct file_lock **flp) locks_insert_lock(before, fl); *flp = fl; + error = 0; out: return error; } -- cgit v1.2.3 From e466e4876bf39474e15d0572f2204578137ae7f5 Mon Sep 17 00:00:00 2001 From: Steve French Date: Tue, 15 Aug 2006 13:07:18 +0000 Subject: [CIFS] Fix oops in cifs_close due to unitialized lock sem and list in new POSIX locking code Signed-off-by: Steve French --- fs/cifs/CHANGES | 5 ++++- fs/cifs/connect.c | 14 ++++++++------ fs/cifs/dir.c | 4 ++++ 3 files changed, 16 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES index acb843b9bc3b..bd37727526ea 100644 --- a/fs/cifs/CHANGES +++ b/fs/cifs/CHANGES @@ -2,7 +2,10 @@ Version 1.45 ------------ Do not time out lockw calls when using posix extensions. Do not time out requests if server still responding reasonably fast -on requests on other threads +on requests on other threads. Improve POSIX locking emulation, +(lock cancel now works, and unlock of merged range works even +to Windows servers now). Fix oops on mount to lanman servers +(win9x, os/2 etc.) when null password. Version 1.44 ------------ diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index b706b4f48b10..5d394c726860 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -1271,33 +1271,35 @@ find_unc(__be32 new_target_ip_addr, char *uncName, char *userName) read_lock(&GlobalSMBSeslock); list_for_each(tmp, &GlobalTreeConnectionList) { - cFYI(1, ("Next tcon - ")); + cFYI(1, ("Next tcon")); tcon = list_entry(tmp, struct cifsTconInfo, cifsConnectionList); if (tcon->ses) { if (tcon->ses->server) { cFYI(1, - (" old ip addr: %x == new ip %x ?", + ("old ip addr: %x == new ip %x ?", tcon->ses->server->addr.sockAddr.sin_addr. s_addr, new_target_ip_addr)); if (tcon->ses->server->addr.sockAddr.sin_addr. s_addr == new_target_ip_addr) { - /* BB lock tcon and server and tcp session and increment use count here? */ + /* BB lock tcon, server and tcp session and increment use count here? 
*/ /* found a match on the TCP session */ /* BB check if reconnection needed */ - cFYI(1,("Matched ip, old UNC: %s == new: %s ?", + cFYI(1,("IP match, old UNC: %s new: %s", tcon->treeName, uncName)); if (strncmp (tcon->treeName, uncName, MAX_TREE_SIZE) == 0) { cFYI(1, - ("Matched UNC, old user: %s == new: %s ?", + ("and old usr: %s new: %s", tcon->treeName, uncName)); if (strncmp (tcon->ses->userName, userName, MAX_USERNAME_SIZE) == 0) { read_unlock(&GlobalSMBSeslock); - return tcon;/* also matched user (smb session)*/ + /* matched smb session + (user name */ + return tcon; } } } diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index ba4cbe9b0684..914239d53634 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -267,6 +267,10 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode, pCifsFile->invalidHandle = FALSE; pCifsFile->closePend = FALSE; init_MUTEX(&pCifsFile->fh_sem); + init_MUTEX(&pCifsFile->lock_sem); + INIT_LIST_HEAD(&pCifsFile->llist); + atomic_set(&pCifsFile->wrtPending,0); + /* set the following in open now pCifsFile->pfile = file; */ write_lock(&GlobalSMBSeslock); -- cgit v1.2.3 From 5ddaa683a513439081c9511b0d9ad490672c51c9 Mon Sep 17 00:00:00 2001 From: Steve French Date: Tue, 15 Aug 2006 13:35:48 +0000 Subject: [CIFS] endian errors in lanman protocol support le16 compared to host-endian constant u8 fed to le32_to_cpu() le16 compared to host-endian constant Signed-off-by: Al Viro Signed-off-by: Steve French --- fs/cifs/cifssmb.c | 2 +- fs/cifs/readdir.c | 2 +- fs/cifs/sess.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index dcbc3e075e81..075d8fb3d376 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -477,7 +477,7 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses) /* BB get server time for time conversions and add code to use it and timezone since this is not UTC */ - if (rsp->EncryptionKeyLength == CIFS_CRYPTO_KEY_SIZE) { + if (rsp->EncryptionKeyLength == cpu_to_le16(CIFS_CRYPTO_KEY_SIZE)) { memcpy(server->cryptKey, rsp->EncryptionKey, CIFS_CRYPTO_KEY_SIZE); } else if (server->secMode & SECMODE_PW_ENCRYPT) { diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c index 03bbcb377913..105761e3ba0e 100644 --- a/fs/cifs/readdir.c +++ b/fs/cifs/readdir.c @@ -556,7 +556,7 @@ static int cifs_entry_is_dot(char *current_entry, struct cifsFileInfo *cfile) FIND_FILE_STANDARD_INFO * pFindData = (FIND_FILE_STANDARD_INFO *)current_entry; filename = &pFindData->FileName[0]; - len = le32_to_cpu(pFindData->FileNameLength); + len = pFindData->FileNameLength; } else { cFYI(1,("Unknown findfirst level %d",cfile->srch_inf.info_level)); } diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c index 7202d534ef0b..d1705ab8136e 100644 --- a/fs/cifs/sess.c +++ b/fs/cifs/sess.c @@ -372,7 +372,7 @@ CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, int first_time, /* no capabilities flags in old lanman negotiation */ - pSMB->old_req.PasswordLength = CIFS_SESS_KEY_SIZE; + pSMB->old_req.PasswordLength = cpu_to_le16(CIFS_SESS_KEY_SIZE); /* BB calculate hash with password */ /* and copy into bcc */ -- cgit v1.2.3 From ea4c07d780a6f7b7be2d984117bd3e0a2b772e3d Mon Sep 17 00:00:00 2001 From: Steve French Date: Wed, 16 Aug 2006 19:44:25 +0000 Subject: [CIFS] Do not send Query All EAs SMB when mount option nouser_xattr specified Pointed out by Bjoern Jacke Signed-off-by: Steve French --- fs/cifs/CHANGES | 3 ++- fs/cifs/README | 2 +- fs/cifs/xattr.c | 6 +++++- 3 files changed, 8 insertions(+), 3 deletions(-) 
(limited to 'fs') diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES index bd37727526ea..0feb3bd49cb8 100644 --- a/fs/cifs/CHANGES +++ b/fs/cifs/CHANGES @@ -5,7 +5,8 @@ time out requests if server still responding reasonably fast on requests on other threads. Improve POSIX locking emulation, (lock cancel now works, and unlock of merged range works even to Windows servers now). Fix oops on mount to lanman servers -(win9x, os/2 etc.) when null password. +(win9x, os/2 etc.) when null password. Do not send listxattr +(SMB to query all EAs) if nouser_xattr specified. Version 1.44 ------------ diff --git a/fs/cifs/README b/fs/cifs/README index 7986d0d97ace..5f0e1bd64fee 100644 --- a/fs/cifs/README +++ b/fs/cifs/README @@ -408,7 +408,7 @@ A partial list of the supported mount options follows: user_xattr Allow getting and setting user xattrs as OS/2 EAs (extended attributes) to the server (default) e.g. via setfattr and getfattr utilities. - nouser_xattr Do not allow getfattr/setfattr to get/set xattrs + nouser_xattr Do not allow getfattr/setfattr to get/set/list xattrs mapchars Translate six of the seven reserved characters (not backslash) *?<>|: to the remap range (above 0xF000), which also diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c index 7754d641775e..067648b7179b 100644 --- a/fs/cifs/xattr.c +++ b/fs/cifs/xattr.c @@ -330,11 +330,15 @@ ssize_t cifs_listxattr(struct dentry * direntry, char * data, size_t buf_size) sb = direntry->d_inode->i_sb; if(sb == NULL) return -EIO; - xid = GetXid(); cifs_sb = CIFS_SB(sb); pTcon = cifs_sb->tcon; + if(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR) + return -EOPNOTSUPP; + + xid = GetXid(); + full_path = build_path_from_dentry(direntry); if(full_path == NULL) { FreeXid(xid); -- cgit v1.2.3 From 78bd4d484f81a611ef6ff02f909e576cb9aac7f2 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 21 Aug 2006 08:33:23 +0200 Subject: [PATCH] sys_ioprio_set: minor do_each_thread+break fix From include/linux/sched.h: * Careful: do_each_thread/while_each_thread is a double loop so * 'break' will not work as expected - use goto instead. */ Signed-off-by: Oleg Nesterov Signed-off-by: Jens Axboe --- fs/ioprio.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ioprio.c b/fs/ioprio.c index 93aa5715f224..3db31038e9ab 100644 --- a/fs/ioprio.c +++ b/fs/ioprio.c @@ -111,9 +111,9 @@ asmlinkage long sys_ioprio_set(int which, int who, int ioprio) continue; ret = set_task_ioprio(p, ioprio); if (ret) - break; + goto free_uid; } while_each_thread(g, p); - +free_uid: if (who) free_uid(user); break; -- cgit v1.2.3 From 9f83e45eb54fc7198dc59fc63255341851ba4c48 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 21 Aug 2006 08:34:15 +0200 Subject: [PATCH] Fix current_io_context() vs set_task_ioprio() race I know nothing about io scheduler, but I suspect set_task_ioprio() is not safe. current_io_context() initializes "struct io_context", then sets ->io_context. set_task_ioprio() running on another cpu may see the changes out of order, so ->set_ioprio(ioc) may use io_context which was not initialized properly. 
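A side note, not part of the patch: the barrier pairing added below, smp_wmb() before publishing tsk->io_context and smp_read_barrier_depends() after loading it, is the classic publish/consume pattern. Here is a minimal userspace sketch of the same idea in C11 atomics, with invented names; acquire ordering is used on the reader side where the kernel only needs a data-dependency barrier:

/* Illustrative sketch only: initialise every field, then publish the
 * pointer; readers load the pointer before touching any field. */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

struct ctx {
	int prio;
	int batch;
};

static struct ctx the_ctx;
static _Atomic(struct ctx *) published;

static void *writer(void *arg)
{
	(void)arg;
	the_ctx.prio = 4;		/* settings written first... */
	the_ctx.batch = 0;
	atomic_store_explicit(&published, &the_ctx,
			      memory_order_release);	/* ...then the pointer */
	return NULL;
}

static void *reader(void *arg)
{
	struct ctx *c;

	(void)arg;
	c = atomic_load_explicit(&published, memory_order_acquire);
	if (c)				/* if we see the pointer, we see the fields */
		printf("prio=%d batch=%d\n", c->prio, c->batch);
	return NULL;
}

int main(void)
{
	pthread_t w, r;

	pthread_create(&w, NULL, writer, NULL);
	pthread_create(&r, NULL, reader, NULL);
	pthread_join(w, NULL);
	pthread_join(r, NULL);
	return 0;
}

Without the ordering, set_task_ioprio() could see the new io_context pointer while the fields behind it still hold stale values, which is exactly the race described above.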
Signed-off-by: Oleg Nesterov Signed-off-by: Jens Axboe --- block/ll_rw_blk.c | 2 ++ fs/ioprio.c | 3 +++ 2 files changed, 5 insertions(+) (limited to 'fs') diff --git a/block/ll_rw_blk.c b/block/ll_rw_blk.c index 61d6b3c65b66..ddd9253f9d55 100644 --- a/block/ll_rw_blk.c +++ b/block/ll_rw_blk.c @@ -3628,6 +3628,8 @@ struct io_context *current_io_context(gfp_t gfp_flags) ret->nr_batch_requests = 0; /* because this is 0 */ ret->aic = NULL; ret->cic_root.rb_node = NULL; + /* make sure set_task_ioprio() sees the settings above */ + smp_wmb(); tsk->io_context = ret; } diff --git a/fs/ioprio.c b/fs/ioprio.c index 3db31038e9ab..06578311c63f 100644 --- a/fs/ioprio.c +++ b/fs/ioprio.c @@ -44,6 +44,9 @@ static int set_task_ioprio(struct task_struct *task, int ioprio) task->ioprio = ioprio; ioc = task->io_context; + /* see wmb() in current_io_context() */ + smp_read_barrier_depends(); + if (ioc && ioc->set_ioprio) ioc->set_ioprio(ioc, ioprio); -- cgit v1.2.3 From e014ff8d4285b81f0de0719d8eee72bc50bfd4be Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 21 Aug 2006 10:02:50 +0200 Subject: [PATCH] uninline ioprio_best() Saves 376 bytes (5 callers) for me. Signed-off-by: Oleg Nesterov Signed-off-by: Jens Axboe --- fs/ioprio.c | 23 +++++++++++++++++++++++ include/linux/ioprio.h | 23 +---------------------- 2 files changed, 24 insertions(+), 22 deletions(-) (limited to 'fs') diff --git a/fs/ioprio.c b/fs/ioprio.c index 06578311c63f..78b1deae3fa2 100644 --- a/fs/ioprio.c +++ b/fs/ioprio.c @@ -140,6 +140,29 @@ out: return ret; } +int ioprio_best(unsigned short aprio, unsigned short bprio) +{ + unsigned short aclass = IOPRIO_PRIO_CLASS(aprio); + unsigned short bclass = IOPRIO_PRIO_CLASS(bprio); + + if (!ioprio_valid(aprio)) + return bprio; + if (!ioprio_valid(bprio)) + return aprio; + + if (aclass == IOPRIO_CLASS_NONE) + aclass = IOPRIO_CLASS_BE; + if (bclass == IOPRIO_CLASS_NONE) + bclass = IOPRIO_CLASS_BE; + + if (aclass == bclass) + return min(aprio, bprio); + if (aclass > bclass) + return bprio; + else + return aprio; +} + asmlinkage long sys_ioprio_get(int which, int who) { struct task_struct *g, *p; diff --git a/include/linux/ioprio.h b/include/linux/ioprio.h index 88d5961f7a3f..8e2042b9d471 100644 --- a/include/linux/ioprio.h +++ b/include/linux/ioprio.h @@ -59,27 +59,6 @@ static inline int task_nice_ioprio(struct task_struct *task) /* * For inheritance, return the highest of the two given priorities */ -static inline int ioprio_best(unsigned short aprio, unsigned short bprio) -{ - unsigned short aclass = IOPRIO_PRIO_CLASS(aprio); - unsigned short bclass = IOPRIO_PRIO_CLASS(bprio); - - if (!ioprio_valid(aprio)) - return bprio; - if (!ioprio_valid(bprio)) - return aprio; - - if (aclass == IOPRIO_CLASS_NONE) - aclass = IOPRIO_CLASS_BE; - if (bclass == IOPRIO_CLASS_NONE) - bclass = IOPRIO_CLASS_BE; - - if (aclass == bclass) - return min(aprio, bprio); - if (aclass > bclass) - return bprio; - else - return aprio; -} +extern int ioprio_best(unsigned short aprio, unsigned short bprio); #endif -- cgit v1.2.3 From 00a2b0f6dd2372842df73de72d51621b539fea44 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 15 Aug 2006 13:56:26 +0200 Subject: Fix possible UDF deadlock and memory corruption (CVE-2006-4145) UDF code is not really ready to handle extents larger that 1GB. This is the easy way to forbid creating those. Also truncation code did not count with the case when there are no extents in the file and we are extending the file. 
Signed-off-by: Jan Kara Signed-off-by: Greg Kroah-Hartman --- fs/udf/super.c | 2 +- fs/udf/truncate.c | 64 +++++++++++++++++++++++++++++++++---------------------- 2 files changed, 40 insertions(+), 26 deletions(-) (limited to 'fs') diff --git a/fs/udf/super.c b/fs/udf/super.c index 7de172efa084..fcce1a21a51b 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -1659,7 +1659,7 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent) iput(inode); goto error_out; } - sb->s_maxbytes = MAX_LFS_FILESIZE; + sb->s_maxbytes = 1<<30; return 0; error_out: diff --git a/fs/udf/truncate.c b/fs/udf/truncate.c index e1b0e8cfecb4..0abd66ce36ea 100644 --- a/fs/udf/truncate.c +++ b/fs/udf/truncate.c @@ -239,37 +239,51 @@ void udf_truncate_extents(struct inode * inode) { if (offset) { - extoffset -= adsize; - etype = udf_next_aext(inode, &bloc, &extoffset, &eloc, &elen, &bh, 1); - if (etype == (EXT_NOT_RECORDED_NOT_ALLOCATED >> 30)) - { - extoffset -= adsize; - elen = EXT_NOT_RECORDED_NOT_ALLOCATED | (elen + offset); - udf_write_aext(inode, bloc, &extoffset, eloc, elen, bh, 0); + /* + * OK, there is not extent covering inode->i_size and + * no extent above inode->i_size => truncate is + * extending the file by 'offset'. + */ + if ((!bh && extoffset == udf_file_entry_alloc_offset(inode)) || + (bh && extoffset == sizeof(struct allocExtDesc))) { + /* File has no extents at all! */ + memset(&eloc, 0x00, sizeof(kernel_lb_addr)); + elen = EXT_NOT_RECORDED_NOT_ALLOCATED | offset; + udf_add_aext(inode, &bloc, &extoffset, eloc, elen, &bh, 1); } - else if (etype == (EXT_NOT_RECORDED_ALLOCATED >> 30)) - { - kernel_lb_addr neloc = { 0, 0 }; + else { extoffset -= adsize; - nelen = EXT_NOT_RECORDED_NOT_ALLOCATED | - ((elen + offset + inode->i_sb->s_blocksize - 1) & - ~(inode->i_sb->s_blocksize - 1)); - udf_write_aext(inode, bloc, &extoffset, neloc, nelen, bh, 1); - udf_add_aext(inode, &bloc, &extoffset, eloc, (etype << 30) | elen, &bh, 1); - } - else - { - if (elen & (inode->i_sb->s_blocksize - 1)) + etype = udf_next_aext(inode, &bloc, &extoffset, &eloc, &elen, &bh, 1); + if (etype == (EXT_NOT_RECORDED_NOT_ALLOCATED >> 30)) + { + extoffset -= adsize; + elen = EXT_NOT_RECORDED_NOT_ALLOCATED | (elen + offset); + udf_write_aext(inode, bloc, &extoffset, eloc, elen, bh, 0); + } + else if (etype == (EXT_NOT_RECORDED_ALLOCATED >> 30)) { + kernel_lb_addr neloc = { 0, 0 }; extoffset -= adsize; - elen = EXT_RECORDED_ALLOCATED | - ((elen + inode->i_sb->s_blocksize - 1) & + nelen = EXT_NOT_RECORDED_NOT_ALLOCATED | + ((elen + offset + inode->i_sb->s_blocksize - 1) & ~(inode->i_sb->s_blocksize - 1)); - udf_write_aext(inode, bloc, &extoffset, eloc, elen, bh, 1); + udf_write_aext(inode, bloc, &extoffset, neloc, nelen, bh, 1); + udf_add_aext(inode, &bloc, &extoffset, eloc, (etype << 30) | elen, &bh, 1); + } + else + { + if (elen & (inode->i_sb->s_blocksize - 1)) + { + extoffset -= adsize; + elen = EXT_RECORDED_ALLOCATED | + ((elen + inode->i_sb->s_blocksize - 1) & + ~(inode->i_sb->s_blocksize - 1)); + udf_write_aext(inode, bloc, &extoffset, eloc, elen, bh, 1); + } + memset(&eloc, 0x00, sizeof(kernel_lb_addr)); + elen = EXT_NOT_RECORDED_NOT_ALLOCATED | offset; + udf_add_aext(inode, &bloc, &extoffset, eloc, elen, &bh, 1); } - memset(&eloc, 0x00, sizeof(kernel_lb_addr)); - elen = EXT_NOT_RECORDED_NOT_ALLOCATED | offset; - udf_add_aext(inode, &bloc, &extoffset, eloc, elen, &bh, 1); } } } -- cgit v1.2.3 From ddeff520f02b92128132c282c350fa72afffb84a Mon Sep 17 00:00:00 2001 From: Nikita Danilov Date: Wed, 9 Aug 2006 
13:53:47 -0400 Subject: NFS: Fix a potential deadlock in nfs_release_page nfs_wb_page() waits on request completion and, as a result, is not safe to be called from nfs_release_page() invoked by VM scanner as part of GFP_NOFS allocation. Fix possible deadlock by analyzing gfp mask and refusing to release page if __GFP_FS is not set. Signed-off-by: Nikita Danilov Signed-off-by: Trond Myklebust (cherry picked from 374d969debfb290bafcb41d28918dc6f7e43ce31 commit) --- fs/nfs/file.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/file.c b/fs/nfs/file.c index cc2b874ad5a4..48e892880d5b 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -312,7 +312,13 @@ static void nfs_invalidate_page(struct page *page, unsigned long offset) static int nfs_release_page(struct page *page, gfp_t gfp) { - return !nfs_wb_page(page->mapping->host, page); + if (gfp & __GFP_FS) + return !nfs_wb_page(page->mapping->host, page); + else + /* + * Avoid deadlock on nfs_wait_on_request(). + */ + return 0; } const struct address_space_operations nfs_file_aops = { -- cgit v1.2.3 From a634904a7de0d3a0bc606f608007a34e8c05bfee Mon Sep 17 00:00:00 2001 From: ASANO Masahiro Date: Tue, 22 Aug 2006 20:06:02 -0400 Subject: VFS: add lookup hint for network file systems I'm trying to speeding up mkdir(2) for network file systems. A typical mkdir(2) calls two inode_operations: lookup and mkdir. The lookup operation would fail with ENOENT in common case. I think it is unnecessary because the subsequent mkdir operation can check it. In case of creat(2), lookup operation is called with the LOOKUP_CREATE flag, so individual filesystem can omit real lookup. e.g. nfs_lookup(). Here is a sample patch which uses LOOKUP_CREATE and O_EXCL on mkdir, symlink and mknod. This uses the gadget for creat(2). And here is the result of a benchmark on NFSv3. mkdir(2) 10,000 times: original 50.5 sec patched 29.0 sec Signed-off-by: ASANO Masahiro Signed-off-by: Andrew Morton Signed-off-by: Trond Myklebust (cherry picked from fab7bf44449b29f9d5572a5dd8adcf7c91d5bf0f commit) --- fs/namei.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/namei.c b/fs/namei.c index 55a131230f94..863166441bf3 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1767,6 +1767,8 @@ struct dentry *lookup_create(struct nameidata *nd, int is_dir) if (nd->last_type != LAST_NORM) goto fail; nd->flags &= ~LOOKUP_PARENT; + nd->flags |= LOOKUP_CREATE; + nd->intent.open.flags = O_EXCL; /* * Do the final lookup. 
-- cgit v1.2.3 From 5d67476fff2df6ff12f60b540fd0e74cf2a668f9 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 31 Jul 2006 14:11:48 -0700 Subject: SUNRPC: make rpc_unlink() take a dentry argument instead of a path Signe-off-by: Trond Myklebust (cherry picked from 88bf6d811b01a4be7fd507d18bf5f1c527989089 commit) --- fs/nfs/idmap.c | 3 +-- include/linux/sunrpc/rpc_pipe_fs.h | 2 +- net/sunrpc/auth_gss/auth_gss.c | 2 +- net/sunrpc/rpc_pipe.c | 20 ++++++-------------- 4 files changed, 9 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c index b81e7ed3c902..df0be1214358 100644 --- a/fs/nfs/idmap.c +++ b/fs/nfs/idmap.c @@ -130,9 +130,8 @@ nfs_idmap_delete(struct nfs4_client *clp) if (!idmap) return; + rpc_unlink(idmap->idmap_dentry); dput(idmap->idmap_dentry); - idmap->idmap_dentry = NULL; - rpc_unlink(idmap->idmap_path); clp->cl_idmap = NULL; kfree(idmap); } diff --git a/include/linux/sunrpc/rpc_pipe_fs.h b/include/linux/sunrpc/rpc_pipe_fs.h index 2c2189cb30aa..04d2767d5ef7 100644 --- a/include/linux/sunrpc/rpc_pipe_fs.h +++ b/include/linux/sunrpc/rpc_pipe_fs.h @@ -44,7 +44,7 @@ extern int rpc_queue_upcall(struct inode *, struct rpc_pipe_msg *); extern struct dentry *rpc_mkdir(char *, struct rpc_clnt *); extern int rpc_rmdir(char *); extern struct dentry *rpc_mkpipe(char *, void *, struct rpc_pipe_ops *, int flags); -extern int rpc_unlink(char *); +extern int rpc_unlink(struct dentry *); extern struct vfsmount *rpc_get_mount(void); extern void rpc_put_mount(void); diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c index 4a9aa9393b97..beaa7b848246 100644 --- a/net/sunrpc/auth_gss/auth_gss.c +++ b/net/sunrpc/auth_gss/auth_gss.c @@ -718,7 +718,7 @@ gss_destroy(struct rpc_auth *auth) auth, auth->au_flavor); gss_auth = container_of(auth, struct gss_auth, rpc_auth); - rpc_unlink(gss_auth->path); + rpc_unlink(gss_auth->dentry); dput(gss_auth->dentry); gss_auth->dentry = NULL; gss_mech_put(gss_auth->mech); diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c index a3bd2db2e024..9144f2767b66 100644 --- a/net/sunrpc/rpc_pipe.c +++ b/net/sunrpc/rpc_pipe.c @@ -746,22 +746,15 @@ err_dput: } int -rpc_unlink(char *path) +rpc_unlink(struct dentry *dentry) { - struct nameidata nd; - struct dentry *dentry; + struct dentry *parent; struct inode *dir; - int error; + int error = 0; - if ((error = rpc_lookup_parent(path, &nd)) != 0) - return error; - dir = nd.dentry->d_inode; + parent = dget_parent(dentry); + dir = parent->d_inode; mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT); - dentry = lookup_one_len(nd.last.name, nd.dentry, nd.last.len); - if (IS_ERR(dentry)) { - error = PTR_ERR(dentry); - goto out_release; - } d_drop(dentry); if (dentry->d_inode) { rpc_close_pipes(dentry->d_inode); @@ -769,9 +762,8 @@ rpc_unlink(char *path) } dput(dentry); inode_dir_notify(dir, DN_DELETE); -out_release: mutex_unlock(&dir->i_mutex); - rpc_release_path(&nd); + dput(parent); return error; } -- cgit v1.2.3 From 8f8e7a50f450fcb86a5b2ffb94543c57a14f8260 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 14 Aug 2006 13:11:15 -0400 Subject: SUNRPC: Fix dentry refcounting issues with users of rpc_pipefs rpc_unlink() and rpc_rmdir() will dput the dentry reference for you. 
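A note on the convention, not part of the patch: the change below removes the callers' dput() precisely because rpc_unlink() and rpc_rmdir() now drop the reference themselves, so dropping it again would underflow the count. A tiny sketch of that ownership rule with an invented refcounted object:

/* Illustrative sketch only: the teardown call consumes the caller's
 * reference, so the caller must not drop it a second time. */
#include <stdio.h>
#include <stdlib.h>

struct obj {
	int refcount;
};

static void obj_get(struct obj *o)
{
	o->refcount++;
}

static void obj_put(struct obj *o)
{
	if (--o->refcount == 0) {
		printf("object freed\n");
		free(o);
	}
}

/* Analogue of the new rpc_unlink(): tears the object down and drops
 * the reference that was passed in. */
static void obj_unlink(struct obj *o)
{
	printf("object unlinked\n");
	obj_put(o);
}

int main(void)
{
	struct obj *o = calloc(1, sizeof(*o));

	if (!o)
		return 1;
	o->refcount = 1;

	obj_get(o);		/* a second user takes a reference... */
	obj_put(o);		/* ...and drops its own when done */

	obj_unlink(o);		/* consumes the last reference; an extra
				 * obj_put(o) here would reproduce the bug
				 * this patch removes */
	return 0;
}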
Signed-off-by: Trond Myklebust (cherry picked from a05a57effa71a1f67ccbfc52335c10c8b85f3f6a commit) --- fs/nfs/idmap.c | 1 - net/sunrpc/auth_gss/auth_gss.c | 1 - net/sunrpc/clnt.c | 15 ++++++--------- 3 files changed, 6 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c index df0be1214358..07a5dd57646e 100644 --- a/fs/nfs/idmap.c +++ b/fs/nfs/idmap.c @@ -131,7 +131,6 @@ nfs_idmap_delete(struct nfs4_client *clp) if (!idmap) return; rpc_unlink(idmap->idmap_dentry); - dput(idmap->idmap_dentry); clp->cl_idmap = NULL; kfree(idmap); } diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c index beaa7b848246..ef1cf5b476c8 100644 --- a/net/sunrpc/auth_gss/auth_gss.c +++ b/net/sunrpc/auth_gss/auth_gss.c @@ -719,7 +719,6 @@ gss_destroy(struct rpc_auth *auth) gss_auth = container_of(auth, struct gss_auth, rpc_auth); rpc_unlink(gss_auth->dentry); - dput(gss_auth->dentry); gss_auth->dentry = NULL; gss_mech_put(gss_auth->mech); diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index d307556872db..d9eac7069101 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -184,7 +184,6 @@ rpc_new_client(struct rpc_xprt *xprt, char *servname, out_no_auth: if (!IS_ERR(clnt->cl_dentry)) { rpc_rmdir(clnt->cl_dentry); - dput(clnt->cl_dentry); rpc_put_mount(); } out_no_path: @@ -251,10 +250,8 @@ rpc_clone_client(struct rpc_clnt *clnt) new->cl_autobind = 0; new->cl_oneshot = 0; new->cl_dead = 0; - if (!IS_ERR(new->cl_dentry)) { + if (!IS_ERR(new->cl_dentry)) dget(new->cl_dentry); - rpc_get_mount(); - } rpc_init_rtt(&new->cl_rtt_default, clnt->cl_xprt->timeout.to_initval); if (new->cl_auth) atomic_inc(&new->cl_auth->au_count); @@ -317,11 +314,15 @@ rpc_destroy_client(struct rpc_clnt *clnt) clnt->cl_auth = NULL; } if (clnt->cl_parent != clnt) { + if (!IS_ERR(clnt->cl_dentry)) + dput(clnt->cl_dentry); rpc_destroy_client(clnt->cl_parent); goto out_free; } - if (!IS_ERR(clnt->cl_dentry)) + if (!IS_ERR(clnt->cl_dentry)) { rpc_rmdir(clnt->cl_dentry); + rpc_put_mount(); + } if (clnt->cl_xprt) { xprt_destroy(clnt->cl_xprt); clnt->cl_xprt = NULL; @@ -331,10 +332,6 @@ rpc_destroy_client(struct rpc_clnt *clnt) out_free: rpc_free_iostats(clnt->cl_metrics); clnt->cl_metrics = NULL; - if (!IS_ERR(clnt->cl_dentry)) { - dput(clnt->cl_dentry); - rpc_put_mount(); - } kfree(clnt); return 0; } -- cgit v1.2.3 From 01df9c5e918ae5559f2d96da0143f8bfbb9e6171 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 10 Aug 2006 11:58:57 -0400 Subject: LOCKD: Fix a deadlock in nlm_traverse_files() nlm_traverse_files() is not allowed to hold the nlm_file_mutex while calling nlm_inspect file, since it may end up calling nlm_release_file() when releaseing the blocks. 
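The shape of the fix below is worth spelling out, though the sketch here is illustrative only: pin the object with a reference, drop the global mutex before calling into code that may need it, then re-take the mutex and unpin. A minimal pthreads version of the pattern, all names invented:

/* Illustrative sketch only: pin, unlock, call out, relock, unpin. */
#include <pthread.h>
#include <stdio.h>

struct file_entry {
	int refcount;			/* protected by table_lock */
	int id;
};

static pthread_mutex_t table_lock = PTHREAD_MUTEX_INITIALIZER;
static struct file_entry table[3] = { { 0, 1 }, { 0, 2 }, { 0, 3 } };

/* Stands in for nlm_inspect_file(), which may need the table lock
 * itself and so must not be called with table_lock held. */
static void inspect(struct file_entry *f)
{
	printf("inspecting entry %d\n", f->id);
}

static void traverse(void)
{
	pthread_mutex_lock(&table_lock);
	for (int i = 0; i < 3; i++) {
		struct file_entry *f = &table[i];

		f->refcount++;			/* pin: entry cannot go away */
		pthread_mutex_unlock(&table_lock);

		inspect(f);			/* safe: lock not held here */

		pthread_mutex_lock(&table_lock);
		f->refcount--;			/* unpin under the lock again */
	}
	pthread_mutex_unlock(&table_lock);
}

int main(void)
{
	traverse();
	return 0;
}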
Signed-off-by: Trond Myklebust (cherry picked from e558d3cde986e04f68afe8c790ad68ef4b94587a commit) --- fs/lockd/svcsubs.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c index 2a4df9b3779a..01b4db9e5466 100644 --- a/fs/lockd/svcsubs.c +++ b/fs/lockd/svcsubs.c @@ -237,19 +237,22 @@ static int nlm_traverse_files(struct nlm_host *host, int action) { struct nlm_file *file, **fp; - int i; + int i, ret = 0; mutex_lock(&nlm_file_mutex); for (i = 0; i < FILE_NRHASH; i++) { fp = nlm_files + i; while ((file = *fp) != NULL) { + file->f_count++; + mutex_unlock(&nlm_file_mutex); + /* Traverse locks, blocks and shares of this file * and update file->f_locks count */ - if (nlm_inspect_file(host, file, action)) { - mutex_unlock(&nlm_file_mutex); - return 1; - } + if (nlm_inspect_file(host, file, action)) + ret = 1; + mutex_lock(&nlm_file_mutex); + file->f_count--; /* No more references to this file. Let go of it. */ if (!file->f_blocks && !file->f_locks && !file->f_shares && !file->f_count) { @@ -262,7 +265,7 @@ nlm_traverse_files(struct nlm_host *host, int action) } } mutex_unlock(&nlm_file_mutex); - return 0; + return ret; } /* -- cgit v1.2.3 From 79558f3610efd7928e8882b2eaca3093b283630e Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 22 Aug 2006 13:44:32 -0400 Subject: NFS: Fix issue with EIO on NFS read The problem is that we may be caching writes that would extend the file and create a hole in the region that we are reading. In this case, we need to detect the eof from the server, ensure that we zero out the pages that are part of the hole and mark them as up to date. Signed-off-by: Trond Myklebust (cherry picked from 856b603b01b99146918c093969b6cb1b1b0f1c01 commit) --- fs/nfs/read.c | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/nfs/read.c b/fs/nfs/read.c index 65c0c5b32351..da9cf11c326f 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -116,10 +116,17 @@ static void nfs_readpage_truncate_uninitialised_page(struct nfs_read_data *data) pages = &data->args.pages[base >> PAGE_CACHE_SHIFT]; base &= ~PAGE_CACHE_MASK; pglen = PAGE_CACHE_SIZE - base; - if (pglen < remainder) + for (;;) { + if (remainder <= pglen) { + memclear_highpage_flush(*pages, base, remainder); + break; + } memclear_highpage_flush(*pages, base, pglen); - else - memclear_highpage_flush(*pages, base, remainder); + pages++; + remainder -= pglen; + pglen = PAGE_CACHE_SIZE; + base = 0; + } } /* @@ -476,6 +483,8 @@ static void nfs_readpage_set_pages_uptodate(struct nfs_read_data *data) unsigned int base = data->args.pgbase; struct page **pages; + if (data->res.eof) + count = data->args.count; if (unlikely(count == 0)) return; pages = &data->args.pages[base >> PAGE_CACHE_SHIFT]; @@ -483,11 +492,7 @@ static void nfs_readpage_set_pages_uptodate(struct nfs_read_data *data) count += base; for (;count >= PAGE_CACHE_SIZE; count -= PAGE_CACHE_SIZE, pages++) SetPageUptodate(*pages); - /* - * Was this an eof or a short read? If the latter, don't mark the page - * as uptodate yet. 
- */ - if (count > 0 && (data->res.eof || data->args.count == data->res.count)) + if (count != 0) SetPageUptodate(*pages); } @@ -502,6 +507,8 @@ static void nfs_readpage_set_pages_error(struct nfs_read_data *data) count += base; for (;count >= PAGE_CACHE_SIZE; count -= PAGE_CACHE_SIZE, pages++) SetPageError(*pages); + if (count != 0) + SetPageError(*pages); } /* -- cgit v1.2.3 From e8896495bca8490a427409e0886d63d05419ec65 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 24 Aug 2006 15:44:19 -0400 Subject: NFS: Check lengths more thoroughly in NFS4 readdir XDR decode Check the bounds of length specifiers more thoroughly in the XDR decoding of NFS4 readdir reply data. Currently, if the server returns a bitmap or attr length that causes the current decode point pointer to wrap, this could go undetected (consider a small "negative" length on a 32-bit machine). Also add a check into the main XDR decode handler to make sure that the amount of data is a multiple of four bytes (as specified by RFC-1014). This makes sure that we can do u32* pointer subtraction in the NFS client without risking an undefined result (the result is undefined if the pointers are not correctly aligned with respect to one another). Signed-Off-By: David Howells Signed-off-by: Trond Myklebust (cherry picked from 5861fddd64a7eaf7e8b1a9997455a24e7f688092 commit) --- fs/nfs/nfs4xdr.c | 21 +++++++++++---------- net/sunrpc/clnt.c | 11 +++++++++++ 2 files changed, 22 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 1750d996f49f..730ec8fb31c6 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -3355,7 +3355,7 @@ static int decode_readdir(struct xdr_stream *xdr, struct rpc_rqst *req, struct n struct kvec *iov = rcvbuf->head; unsigned int nr, pglen = rcvbuf->page_len; uint32_t *end, *entry, *p, *kaddr; - uint32_t len, attrlen; + uint32_t len, attrlen, xlen; int hdrlen, recvd, status; status = decode_op_hdr(xdr, OP_READDIR); @@ -3377,10 +3377,10 @@ static int decode_readdir(struct xdr_stream *xdr, struct rpc_rqst *req, struct n BUG_ON(pglen + readdir->pgbase > PAGE_CACHE_SIZE); kaddr = p = (uint32_t *) kmap_atomic(page, KM_USER0); - end = (uint32_t *) ((char *)p + pglen + readdir->pgbase); + end = p + ((pglen + readdir->pgbase) >> 2); entry = p; for (nr = 0; *p++; nr++) { - if (p + 3 > end) + if (end - p < 3) goto short_pkt; dprintk("cookie = %Lu, ", *((unsigned long long *)p)); p += 2; /* cookie */ @@ -3389,18 +3389,19 @@ static int decode_readdir(struct xdr_stream *xdr, struct rpc_rqst *req, struct n printk(KERN_WARNING "NFS: giant filename in readdir (len 0x%x)\n", len); goto err_unmap; } - dprintk("filename = %*s\n", len, (char *)p); - p += XDR_QUADLEN(len); - if (p + 1 > end) + xlen = XDR_QUADLEN(len); + if (end - p < xlen + 1) goto short_pkt; + dprintk("filename = %*s\n", len, (char *)p); + p += xlen; len = ntohl(*p++); /* bitmap length */ - p += len; - if (p + 1 > end) + if (end - p < len + 1) goto short_pkt; + p += len; attrlen = XDR_QUADLEN(ntohl(*p++)); - p += attrlen; /* attributes */ - if (p + 2 > end) + if (end - p < attrlen + 2) goto short_pkt; + p += attrlen; /* attributes */ entry = p; } if (!nr && (entry[0] != 0 || entry[1] == 0)) diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index d9eac7069101..3e19d321067a 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -1181,6 +1181,17 @@ call_verify(struct rpc_task *task) u32 *p = iov->iov_base, n; int error = -EACCES; + if ((task->tk_rqstp->rq_rcv_buf.len & 3) != 0) { + /* RFC-1014 says that the 
representation of XDR data must be a + * multiple of four bytes + * - if it isn't pointer subtraction in the NFS client may give + * undefined results + */ + printk(KERN_WARNING + "call_verify: XDR representation not a multiple of" + " 4 bytes: 0x%x\n", task->tk_rqstp->rq_rcv_buf.len); + goto out_eio; + } if ((len -= 3) < 0) goto out_overflow; p += 1; /* skip XID */ -- cgit v1.2.3 From 16b4289c7460ba9c04af40c574949dcca9029658 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 24 Aug 2006 12:27:15 -0400 Subject: NFSv4: Add v4 exception handling for the ACL functions. This is needed in order to handle any NFS4ERR_DELAY errors that might be returned by the server. It also ensures that we map the NFSv4 errors before they are returned to userland. Signed-off-by: Trond Myklebust (cherry picked from 71c12b3f0abc7501f6ed231a6d17bc9c05a238dc commit) --- fs/nfs/nfs4proc.c | 29 +++++++++++++++++++++++++++-- 1 file changed, 27 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index e6ee97f19d81..153898e1331f 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -2668,7 +2668,7 @@ out: nfs4_set_cached_acl(inode, acl); } -static inline ssize_t nfs4_get_acl_uncached(struct inode *inode, void *buf, size_t buflen) +static ssize_t __nfs4_get_acl_uncached(struct inode *inode, void *buf, size_t buflen) { struct page *pages[NFS4ACL_MAXPAGES]; struct nfs_getaclargs args = { @@ -2721,6 +2721,19 @@ out_free: return ret; } +static ssize_t nfs4_get_acl_uncached(struct inode *inode, void *buf, size_t buflen) +{ + struct nfs4_exception exception = { }; + ssize_t ret; + do { + ret = __nfs4_get_acl_uncached(inode, buf, buflen); + if (ret >= 0) + break; + ret = nfs4_handle_exception(NFS_SERVER(inode), ret, &exception); + } while (exception.retry); + return ret; +} + static ssize_t nfs4_proc_get_acl(struct inode *inode, void *buf, size_t buflen) { struct nfs_server *server = NFS_SERVER(inode); @@ -2737,7 +2750,7 @@ static ssize_t nfs4_proc_get_acl(struct inode *inode, void *buf, size_t buflen) return nfs4_get_acl_uncached(inode, buf, buflen); } -static int nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t buflen) +static int __nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t buflen) { struct nfs_server *server = NFS_SERVER(inode); struct page *pages[NFS4ACL_MAXPAGES]; @@ -2763,6 +2776,18 @@ static int nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t buflen return ret; } +static int nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t buflen) +{ + struct nfs4_exception exception = { }; + int err; + do { + err = nfs4_handle_exception(NFS_SERVER(inode), + __nfs4_proc_set_acl(inode, buf, buflen), + &exception); + } while (exception.retry); + return err; +} + static int nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server) { -- cgit v1.2.3 From a343bb7750e6a098909c34f5c5dfddbc4fa40053 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 22 Aug 2006 20:06:03 -0400 Subject: VFS: Fix access("file", X_OK) in the presence of ACLs Currently, the access() call will return incorrect information on NFS if there exists an ACL that grants execute access to the user on a regular file. The reason the information is incorrect is that the VFS overrides this execute access in open_exec() by checking (inode->i_mode & 0111). This patch propagates the VFS execute bit check back into the generic permission() call. 
Signed-off-by: Trond Myklebust (cherry picked from 64cbae98848c4c99851cb0a405f0b4982cd76c1e commit) --- fs/namei.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/namei.c b/fs/namei.c index 863166441bf3..432d6bc6fab0 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -227,10 +227,10 @@ int generic_permission(struct inode *inode, int mask, int permission(struct inode *inode, int mask, struct nameidata *nd) { + umode_t mode = inode->i_mode; int retval, submask; if (mask & MAY_WRITE) { - umode_t mode = inode->i_mode; /* * Nobody gets write access to a read-only fs. @@ -247,6 +247,13 @@ int permission(struct inode *inode, int mask, struct nameidata *nd) } + /* + * MAY_EXEC on regular files requires special handling: We override + * filesystem execute permissions if the mode bits aren't set. + */ + if ((mask & MAY_EXEC) && S_ISREG(mode) && !(mode & S_IXUGO)) + return -EACCES; + /* Ordinary permission routines do not understand MAY_APPEND. */ submask = mask & ~MAY_APPEND; if (inode->i_op && inode->i_op->permission) -- cgit v1.2.3 From 9167b0b9a0ab7907191523f5a0528e3b9c288e21 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 22 Aug 2006 20:06:03 -0400 Subject: VFS: Remove redundant open-coded mode bit check in prepare_binfmt(). The check in prepare_binfmt() for inode->i_mode & 0111 is redundant, since open_exec() will already have done that. Signed-off-by: Trond Myklebust (cherry picked from 822dec482ced07af32c378cd936d77345786572b commit) --- fs/exec.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index 8344ba73a2a6..a6f64a98ac50 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -922,12 +922,6 @@ int prepare_binprm(struct linux_binprm *bprm) int retval; mode = inode->i_mode; - /* - * Check execute perms again - if the caller has CAP_DAC_OVERRIDE, - * generic_permission lets a non-executable through - */ - if (!(mode & 0111)) /* with at least _one_ execute bit set */ - return -EACCES; if (bprm->file->f_op == NULL) return -EACCES; -- cgit v1.2.3 From a969fd5a4e162c4485ae8f3e49d674656a18fa36 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 22 Aug 2006 20:06:04 -0400 Subject: VFS: Remove redundant open-coded mode bit checks in open_exec(). The check in open_exec() for inode->i_mode & 0111 has been made redundant by the fix to permission(). Signed-off-by: Trond Myklebust (cherry picked from 1d3741c5d991686699f100b65b9956f7ee7ae0ae commit) --- fs/exec.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index a6f64a98ac50..f7aabfeca033 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -486,8 +486,6 @@ struct file *open_exec(const char *name) if (!(nd.mnt->mnt_flags & MNT_NOEXEC) && S_ISREG(inode->i_mode)) { int err = vfs_permission(&nd, MAY_EXEC); - if (!err && !(inode->i_mode & 0111)) - err = -EACCES; file = ERR_PTR(err); if (!err) { file = nameidata_to_filp(&nd, O_RDONLY); -- cgit v1.2.3 From 81a42d298d8bd1b96be4bd459494f25fdd99b594 Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Fri, 25 Aug 2006 15:58:57 -0700 Subject: [DISKLABEL] SUN: Fix signed int usage for sector count The current sun disklabel code uses a signed int for the sector count. When partitions larger than 1 TB are used, the cast to a sector_t causes the partition sizes to be invalid: # cat /proc/paritions | grep sdan 66 112 2146435072 sdan 66 115 9223372036853660736 sdan3 66 120 9223372036853660736 sdan8 This patch switches the sector count to an unsigned int to fix this. 
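On the arithmetic behind the report: be32_to_cpu() returns an unsigned 32-bit sector count, but storing it in a signed int turns anything at or above 2^31 sectors (1 TB with 512-byte sectors) into a negative number, and widening that to a 64-bit sector_t sign-extends it into the huge figures shown above. A minimal sketch of the effect, not the kernel code, with sector_t modelled as a plain 64-bit unsigned type:

/* Illustrative sketch only: a large partition's sector count passed
 * through a signed int becomes enormous after widening to 64 bits. */
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

typedef uint64_t sector_t;		/* stand-in for the kernel type */

int main(void)
{
	uint32_t on_disk = 0xF0000000u;	/* ~4 billion sectors, above 2^31 */

	/* Implementation-defined in ISO C; wraps to a negative value on
	 * the usual two's-complement targets. */
	int as_int = (int)on_disk;
	unsigned int as_uint = on_disk;

	sector_t bad = (sector_t)as_int;	/* sign-extended */
	sector_t good = (sector_t)as_uint;

	printf("signed path  : %" PRIu64 " sectors (%" PRIu64 " 1K blocks)\n",
	       bad, bad / 2);
	printf("unsigned path: %" PRIu64 " sectors (%" PRIu64 " 1K blocks)\n",
	       good, good / 2);
	return 0;
}

The signed path prints a value close to 2^63 blocks, the same order of magnitude as the corrupt sizes in the /proc/partitions output quoted above.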
Signed-off-by: Jeff Mahoney Signed-off-by: Andrew Morton Signed-off-by: David S. Miller --- fs/partitions/sun.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/partitions/sun.c b/fs/partitions/sun.c index abe91ca03edf..0a5927c806ca 100644 --- a/fs/partitions/sun.c +++ b/fs/partitions/sun.c @@ -74,7 +74,7 @@ int sun_partition(struct parsed_partitions *state, struct block_device *bdev) spc = be16_to_cpu(label->ntrks) * be16_to_cpu(label->nsect); for (i = 0; i < 8; i++, p++) { unsigned long st_sector; - int num_sectors; + unsigned int num_sectors; st_sector = be32_to_cpu(p->start_cylinder) * spc; num_sectors = be32_to_cpu(p->num_sectors); -- cgit v1.2.3 From 6946bd636364effce06ea46fe8f8cd6e2edb004e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sun, 27 Aug 2006 01:23:31 -0700 Subject: [PATCH] lockdep: fix blkdev_open() warning On Wed, 2006-08-09 at 07:57 +0200, Rolf Eike Beer wrote: > ============================================= > [ INFO: possible recursive locking detected ] > --------------------------------------------- > parted/7929 is trying to acquire lock: > (&bdev->bd_mutex){--..}, at: [] __blkdev_put+0x1e/0x13c > > but task is already holding lock: > (&bdev->bd_mutex){--..}, at: [] do_open+0x72/0x3a8 > > other info that might help us debug this: > 1 lock held by parted/7929: > #0: (&bdev->bd_mutex){--..}, at: [] do_open+0x72/0x3a8 > stack backtrace: > [] show_trace_log_lvl+0x58/0x15b > [] show_trace+0xd/0x10 > [] dump_stack+0x17/0x1a > [] __lock_acquire+0x753/0x99c > [] lock_acquire+0x4a/0x6a > [] mutex_lock_nested+0xc8/0x20c > [] __blkdev_put+0x1e/0x13c > [] blkdev_put+0xa/0xc > [] do_open+0x336/0x3a8 > [] blkdev_open+0x1f/0x4c > [] __dentry_open+0xc7/0x1aa > [] nameidata_to_filp+0x1c/0x2e > [] do_filp_open+0x2e/0x35 > [] do_sys_open+0x38/0x68 > [] sys_open+0x16/0x18 > [] sysenter_past_esp+0x56/0x8d OK, I'm having a look here; its all new to me so bear with me. blkdev_open() calls do_open(bdev, ...,BD_MUTEX_NORMAL) and takes mutex_lock_nested(&bdev->bd_mutex, BD_MUTEX_NORMAL) then something fails, and we're thrown to: out_first: where if (bdev != bdev->bd_contains) blkdev_put(bdev->bd_contains) which is __blkdev_put(bdev->bd_contains, BD_MUTEX_NORMAL) which does mutex_lock_nested(&bdev->bd_contains->bd_mutex, BD_MUTEX_NORMAL) <--- lockdep trigger When going to out_first, dbev->bd_contains is either bdev or whole, and since we take the branch it must be whole. 
So it seems to me the following patch would be the right one: [akpm@osdl.org: compile fix] Signed-off-by: Peter Zijlstra Cc: Arjan van de Ven Acked-by: NeilBrown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/block_dev.c | 114 ++++++++++++++++++++++++++++----------------------------- 1 file changed, 56 insertions(+), 58 deletions(-) (limited to 'fs') diff --git a/fs/block_dev.c b/fs/block_dev.c index 37534573960b..045f98854f14 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -884,6 +884,61 @@ void bd_set_size(struct block_device *bdev, loff_t size) } EXPORT_SYMBOL(bd_set_size); +static int __blkdev_put(struct block_device *bdev, unsigned int subclass) +{ + int ret = 0; + struct inode *bd_inode = bdev->bd_inode; + struct gendisk *disk = bdev->bd_disk; + + mutex_lock_nested(&bdev->bd_mutex, subclass); + lock_kernel(); + if (!--bdev->bd_openers) { + sync_blockdev(bdev); + kill_bdev(bdev); + } + if (bdev->bd_contains == bdev) { + if (disk->fops->release) + ret = disk->fops->release(bd_inode, NULL); + } else { + mutex_lock_nested(&bdev->bd_contains->bd_mutex, + subclass + 1); + bdev->bd_contains->bd_part_count--; + mutex_unlock(&bdev->bd_contains->bd_mutex); + } + if (!bdev->bd_openers) { + struct module *owner = disk->fops->owner; + + put_disk(disk); + module_put(owner); + + if (bdev->bd_contains != bdev) { + kobject_put(&bdev->bd_part->kobj); + bdev->bd_part = NULL; + } + bdev->bd_disk = NULL; + bdev->bd_inode->i_data.backing_dev_info = &default_backing_dev_info; + if (bdev != bdev->bd_contains) + __blkdev_put(bdev->bd_contains, subclass + 1); + bdev->bd_contains = NULL; + } + unlock_kernel(); + mutex_unlock(&bdev->bd_mutex); + bdput(bdev); + return ret; +} + +int blkdev_put(struct block_device *bdev) +{ + return __blkdev_put(bdev, BD_MUTEX_NORMAL); +} +EXPORT_SYMBOL(blkdev_put); + +int blkdev_put_partition(struct block_device *bdev) +{ + return __blkdev_put(bdev, BD_MUTEX_PARTITION); +} +EXPORT_SYMBOL(blkdev_put_partition); + static int blkdev_get_whole(struct block_device *bdev, mode_t mode, unsigned flags); @@ -980,7 +1035,7 @@ out_first: bdev->bd_disk = NULL; bdev->bd_inode->i_data.backing_dev_info = &default_backing_dev_info; if (bdev != bdev->bd_contains) - blkdev_put(bdev->bd_contains); + __blkdev_put(bdev->bd_contains, BD_MUTEX_WHOLE); bdev->bd_contains = NULL; put_disk(disk); module_put(owner); @@ -1079,63 +1134,6 @@ static int blkdev_open(struct inode * inode, struct file * filp) return res; } -static int __blkdev_put(struct block_device *bdev, unsigned int subclass) -{ - int ret = 0; - struct inode *bd_inode = bdev->bd_inode; - struct gendisk *disk = bdev->bd_disk; - - mutex_lock_nested(&bdev->bd_mutex, subclass); - lock_kernel(); - if (!--bdev->bd_openers) { - sync_blockdev(bdev); - kill_bdev(bdev); - } - if (bdev->bd_contains == bdev) { - if (disk->fops->release) - ret = disk->fops->release(bd_inode, NULL); - } else { - mutex_lock_nested(&bdev->bd_contains->bd_mutex, - subclass + 1); - bdev->bd_contains->bd_part_count--; - mutex_unlock(&bdev->bd_contains->bd_mutex); - } - if (!bdev->bd_openers) { - struct module *owner = disk->fops->owner; - - put_disk(disk); - module_put(owner); - - if (bdev->bd_contains != bdev) { - kobject_put(&bdev->bd_part->kobj); - bdev->bd_part = NULL; - } - bdev->bd_disk = NULL; - bdev->bd_inode->i_data.backing_dev_info = &default_backing_dev_info; - if (bdev != bdev->bd_contains) - __blkdev_put(bdev->bd_contains, subclass + 1); - bdev->bd_contains = NULL; - } - unlock_kernel(); - mutex_unlock(&bdev->bd_mutex); - bdput(bdev); - 
return ret; -} - -int blkdev_put(struct block_device *bdev) -{ - return __blkdev_put(bdev, BD_MUTEX_NORMAL); -} - -EXPORT_SYMBOL(blkdev_put); - -int blkdev_put_partition(struct block_device *bdev) -{ - return __blkdev_put(bdev, BD_MUTEX_PARTITION); -} - -EXPORT_SYMBOL(blkdev_put_partition); - static int blkdev_close(struct inode * inode, struct file * filp) { struct block_device *bdev = I_BDEV(filp->f_mapping->host); -- cgit v1.2.3 From f5fb09fa3392ad43fbcfc2f4580752f383ab5996 Mon Sep 17 00:00:00 2001 From: Andries Brouwer Date: Sun, 27 Aug 2006 01:23:42 -0700 Subject: [PATCH] Fix for minix crash Mounting a (corrupt) minix filesystem with zero s_zmap_blocks gives a spectacular crash on my 2.6.17.8 system, no doubt because minix/inode.c does an unconditional minix_set_bit(0,sbi->s_zmap[0]->b_data); [akpm@osdl.org: make labels conistent while we're there] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/minix/inode.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/minix/inode.c b/fs/minix/inode.c index 9ea91c5eeb7b..330ff9fc7cf0 100644 --- a/fs/minix/inode.c +++ b/fs/minix/inode.c @@ -204,6 +204,8 @@ static int minix_fill_super(struct super_block *s, void *data, int silent) /* * Allocate the buffer map to keep the superblock small. */ + if (sbi->s_imap_blocks == 0 || sbi->s_zmap_blocks == 0) + goto out_illegal_sb; i = (sbi->s_imap_blocks + sbi->s_zmap_blocks) * sizeof(bh); map = kmalloc(i, GFP_KERNEL); if (!map) @@ -263,7 +265,7 @@ out_no_root: out_no_bitmap: printk("MINIX-fs: bad superblock or unable to read bitmaps\n"); - out_freemap: +out_freemap: for (i = 0; i < sbi->s_imap_blocks; i++) brelse(sbi->s_imap[i]); for (i = 0; i < sbi->s_zmap_blocks; i++) @@ -276,11 +278,16 @@ out_no_map: printk("MINIX-fs: can't allocate map\n"); goto out_release; +out_illegal_sb: + if (!silent) + printk("MINIX-fs: bad superblock\n"); + goto out_release; + out_no_fs: if (!silent) printk("VFS: Can't find a Minix or Minix V2 filesystem " "on device %s\n", s->s_id); - out_release: +out_release: brelse(bh); goto out; @@ -290,7 +297,7 @@ out_bad_hblock: out_bad_sb: printk("MINIX-fs: unable to read superblock\n"); - out: +out: s->s_fs_info = NULL; kfree(sbi); return -EINVAL; -- cgit v1.2.3 From 607eb266aea9dd2abe515985e12c5cd8b32546e8 Mon Sep 17 00:00:00 2001 From: Andries Brouwer Date: Sun, 27 Aug 2006 01:23:43 -0700 Subject: [PATCH] ext2: prevent div-by-zero on corrupted fs Mounting an ext2 filesystem with zero s_inodes_per_group will cause a divide error. 
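Stepping back from the individual patches, this fix and the minix one above apply the same rule: every field read from an on-disk superblock is untrusted input and has to be validated before it is used as a divisor, an allocation size, or a loop bound. A small sketch of that kind of check, with the structure and the limits invented for the example:

/* Illustrative sketch only: reject corrupt superblock fields up front. */
#include <stdint.h>
#include <stdio.h>

struct on_disk_sb {
	uint32_t inodes_per_group;	/* later used as a divisor */
	uint32_t imap_blocks;		/* later used to size an allocation */
	uint32_t zmap_blocks;
};

static int check_sb(const struct on_disk_sb *sb)
{
	if (sb->inodes_per_group == 0)
		return -1;		/* would divide by zero */
	if (sb->imap_blocks == 0 || sb->zmap_blocks == 0)
		return -1;		/* bitmap[0] would not exist */
	if (sb->imap_blocks > 1024 || sb->zmap_blocks > 1024)
		return -1;		/* implausible, cap the allocation */
	return 0;
}

int main(void)
{
	struct on_disk_sb corrupt = { 0, 0, 0 };
	struct on_disk_sb sane = { 8192, 1, 1 };

	printf("corrupt: %s\n", check_sb(&corrupt) ? "rejected" : "accepted");
	printf("sane   : %s\n", check_sb(&sane) ? "rejected" : "accepted");
	return 0;
}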
Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ext2/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext2/super.c b/fs/ext2/super.c index f2702cda9779..681dea8f9532 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -775,7 +775,7 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) if (EXT2_INODE_SIZE(sb) == 0) goto cantfind_ext2; sbi->s_inodes_per_block = sb->s_blocksize / EXT2_INODE_SIZE(sb); - if (sbi->s_inodes_per_block == 0) + if (sbi->s_inodes_per_block == 0 || sbi->s_inodes_per_group == 0) goto cantfind_ext2; sbi->s_itb_per_group = sbi->s_inodes_per_group / sbi->s_inodes_per_block; -- cgit v1.2.3 From 08fb306fe63d98eb86e3b16f4cc21816fa47f18e Mon Sep 17 00:00:00 2001 From: Mingming Cao Date: Sun, 27 Aug 2006 01:23:44 -0700 Subject: [PATCH] ext3 filesystem bogus ENOSPC with reservation fix To handle the earlier bogus ENOSPC error caused by filesystem full of block reservation, current code falls back to non block reservation, starts to allocate block(s) from the goal allocation block group as if there is no block reservation. Current code needs to re-load the corresponding block group descriptor for the initial goal block group in this case. The patch fixes this. Signed-off-by: Mingming Cao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ext3/balloc.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c index a504a40d6d29..063d994bda0b 100644 --- a/fs/ext3/balloc.c +++ b/fs/ext3/balloc.c @@ -1269,12 +1269,12 @@ ext3_fsblk_t ext3_new_blocks(handle_t *handle, struct inode *inode, goal = le32_to_cpu(es->s_first_data_block); group_no = (goal - le32_to_cpu(es->s_first_data_block)) / EXT3_BLOCKS_PER_GROUP(sb); + goal_group = group_no; +retry_alloc: gdp = ext3_get_group_desc(sb, group_no, &gdp_bh); if (!gdp) goto io_error; - goal_group = group_no; -retry: free_blocks = le16_to_cpu(gdp->bg_free_blocks_count); /* * if there is not enough free blocks to make a new resevation @@ -1349,7 +1349,7 @@ retry: if (my_rsv) { my_rsv = NULL; group_no = goal_group; - goto retry; + goto retry_alloc; } /* No space left on the device */ *errp = -ENOSPC; -- cgit v1.2.3 From c37336b078ba9d2ff38c535b194996a7ad6e69f8 Mon Sep 17 00:00:00 2001 From: Evgeniy Dushistov Date: Sun, 27 Aug 2006 01:23:45 -0700 Subject: [PATCH] ufs: write to hole in big file On UFS, this scenario: open(O_TRUNC) lseek(1024 * 1024 * 80) write("A") lseek(1024 * 2) write("A") may cause access to invalid address. This happened because of "goal" is calculated in wrong way in block allocation path, as I see this problem exists also in 2.4. We use construction like this i_data[lastfrag], i_data array of pointers to direct blocks, indirect and so on, it has ceratain size ~20 elements, and lastfrag may have value for example 40000. 
Also this patch fixes related to handling such scenario issues, wrong zeroing metadata, in case of block(not fragment) allocation, and wrong goal calculation, when we allocate block Signed-off-by: Evgeniy Dushistov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ufs/inode.c | 35 +++++++++++++++++++++-------------- 1 file changed, 21 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c index e7c8615beb65..30c6e8a9446c 100644 --- a/fs/ufs/inode.c +++ b/fs/ufs/inode.c @@ -169,18 +169,20 @@ static void ufs_clear_frag(struct inode *inode, struct buffer_head *bh) static struct buffer_head * ufs_clear_frags(struct inode *inode, sector_t beg, - unsigned int n) + unsigned int n, sector_t want) { - struct buffer_head *res, *bh; + struct buffer_head *res = NULL, *bh; sector_t end = beg + n; - res = sb_getblk(inode->i_sb, beg); - ufs_clear_frag(inode, res); - for (++beg; beg < end; ++beg) { + for (; beg < end; ++beg) { bh = sb_getblk(inode->i_sb, beg); ufs_clear_frag(inode, bh); - brelse(bh); + if (want != beg) + brelse(bh); + else + res = bh; } + BUG_ON(!res); return res; } @@ -265,7 +267,9 @@ repeat: lastfrag = ufsi->i_lastfrag; } - goal = fs32_to_cpu(sb, ufsi->i_u1.i_data[lastblock]) + uspi->s_fpb; + tmp = fs32_to_cpu(sb, ufsi->i_u1.i_data[lastblock]); + if (tmp) + goal = tmp + uspi->s_fpb; tmp = ufs_new_fragments (inode, p, fragment - blockoff, goal, required + blockoff, err, locked_page); @@ -277,13 +281,15 @@ repeat: tmp = ufs_new_fragments(inode, p, fragment - (blockoff - lastblockoff), fs32_to_cpu(sb, *p), required + (blockoff - lastblockoff), err, locked_page); - } + } else /* (lastblock > block) */ { /* * We will allocate new block before last allocated block */ - else /* (lastblock > block) */ { - if (lastblock && (tmp = fs32_to_cpu(sb, ufsi->i_u1.i_data[lastblock-1]))) - goal = tmp + uspi->s_fpb; + if (block) { + tmp = fs32_to_cpu(sb, ufsi->i_u1.i_data[block-1]); + if (tmp) + goal = tmp + uspi->s_fpb; + } tmp = ufs_new_fragments(inode, p, fragment - blockoff, goal, uspi->s_fpb, err, locked_page); } @@ -296,7 +302,7 @@ repeat: } if (!phys) { - result = ufs_clear_frags(inode, tmp + blockoff, required); + result = ufs_clear_frags(inode, tmp, required, tmp + blockoff); } else { *phys = tmp + blockoff; result = NULL; @@ -383,7 +389,7 @@ repeat: } } - if (block && (tmp = fs32_to_cpu(sb, ((__fs32*)bh->b_data)[block-1]) + uspi->s_fpb)) + if (block && (tmp = fs32_to_cpu(sb, ((__fs32*)bh->b_data)[block-1]))) goal = tmp + uspi->s_fpb; else goal = bh->b_blocknr + uspi->s_fpb; @@ -397,7 +403,8 @@ repeat: if (!phys) { - result = ufs_clear_frags(inode, tmp + blockoff, uspi->s_fpb); + result = ufs_clear_frags(inode, tmp, uspi->s_fpb, + tmp + blockoff); } else { *phys = tmp + blockoff; *new = 1; -- cgit v1.2.3 From ecdc63948763586e101108dfe1ba316ec069fe39 Mon Sep 17 00:00:00 2001 From: Evgeniy Dushistov Date: Sun, 27 Aug 2006 01:23:46 -0700 Subject: [PATCH] ufs: truncate correction 1) When we allocated last fragment in ufs_truncate, we read page, check if block mapped to address, and if not trying to allocate it. This is wrong behaviour, fragment may be NOT allocated, but mapped, this happened because of "block map" function not checked allocated fragment or not, it just take address of the first fragment in the block, add offset of fragment and return result, this is correct behaviour in almost all situation except call from ufs_truncate. 
2) Almost all implementation of UFS, which I can investigate have such "defect": if you have full disk, and try truncate file, for example 3GB to 2MB, and have hole in this region, truncate return -ENOSPC. I tried evade from this problem, but "block allocation" algorithm is tied to right value of i_lastfrag, and fix of this corner case may slow down of ordinaries scenarios, so this patch makes behavior of "truncate" operations similar to what other UFS implementations do. Signed-off-by: Evgeniy Dushistov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ufs/truncate.c | 77 ++++++++++++++++++------------------------------------- 1 file changed, 25 insertions(+), 52 deletions(-) (limited to 'fs') diff --git a/fs/ufs/truncate.c b/fs/ufs/truncate.c index c9b55872079b..ea11d04c41a0 100644 --- a/fs/ufs/truncate.c +++ b/fs/ufs/truncate.c @@ -375,17 +375,15 @@ static int ufs_alloc_lastblock(struct inode *inode) int err = 0; struct address_space *mapping = inode->i_mapping; struct ufs_sb_private_info *uspi = UFS_SB(inode->i_sb)->s_uspi; - struct ufs_inode_info *ufsi = UFS_I(inode); unsigned lastfrag, i, end; struct page *lastpage; struct buffer_head *bh; lastfrag = (i_size_read(inode) + uspi->s_fsize - 1) >> uspi->s_fshift; - if (!lastfrag) { - ufsi->i_lastfrag = 0; + if (!lastfrag) goto out; - } + lastfrag--; lastpage = ufs_get_locked_page(mapping, lastfrag >> @@ -400,25 +398,25 @@ static int ufs_alloc_lastblock(struct inode *inode) for (i = 0; i < end; ++i) bh = bh->b_this_page; - if (!buffer_mapped(bh)) { - err = ufs_getfrag_block(inode, lastfrag, bh, 1); - - if (unlikely(err)) - goto out_unlock; - - if (buffer_new(bh)) { - clear_buffer_new(bh); - unmap_underlying_metadata(bh->b_bdev, - bh->b_blocknr); - /* - * we do not zeroize fragment, because of - * if it maped to hole, it already contains zeroes - */ - set_buffer_uptodate(bh); - mark_buffer_dirty(bh); - set_page_dirty(lastpage); - } + + err = ufs_getfrag_block(inode, lastfrag, bh, 1); + + if (unlikely(err)) + goto out_unlock; + + if (buffer_new(bh)) { + clear_buffer_new(bh); + unmap_underlying_metadata(bh->b_bdev, + bh->b_blocknr); + /* + * we do not zeroize fragment, because of + * if it maped to hole, it already contains zeroes + */ + set_buffer_uptodate(bh); + mark_buffer_dirty(bh); + set_page_dirty(lastpage); } + out_unlock: ufs_put_locked_page(lastpage); out: @@ -440,23 +438,11 @@ int ufs_truncate(struct inode *inode, loff_t old_i_size) if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) return -EPERM; - if (inode->i_size > old_i_size) { - /* - * if we expand file we should care about - * allocation of block for last byte first of all - */ - err = ufs_alloc_lastblock(inode); + err = ufs_alloc_lastblock(inode); - if (err) { - i_size_write(inode, old_i_size); - goto out; - } - /* - * go away, because of we expand file, and we do not - * need free blocks, and zeroizes page - */ - lock_kernel(); - goto almost_end; + if (err) { + i_size_write(inode, old_i_size); + goto out; } block_truncate_page(inode->i_mapping, inode->i_size, ufs_getfrag_block); @@ -477,21 +463,8 @@ int ufs_truncate(struct inode *inode, loff_t old_i_size) yield(); } - if (inode->i_size < old_i_size) { - /* - * now we should have enough space - * to allocate block for last byte - */ - err = ufs_alloc_lastblock(inode); - if (err) - /* - * looks like all the same - we have no space, - * but we truncate file already - */ - inode->i_size = (ufsi->i_lastfrag - 1) * uspi->s_fsize; - } -almost_end: inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC; + ufsi->i_lastfrag = 
DIRECT_FRAGMENT; unlock_kernel(); mark_inode_dirty(inode); out: -- cgit v1.2.3 From 45f17e0c2ae05c133a348452690de0e5fa863293 Mon Sep 17 00:00:00 2001 From: Masoud Asgharifard Sharbiani Date: Sun, 27 Aug 2006 01:23:48 -0700 Subject: [PATCH] eventpoll.c compile fix Fix two compile failures in eventpoll.c code which would happen if DEBUG_EPOLL is bigger than zero. Signed-off-by: Masoud Sharbiani Cc: Davide Libenzi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/eventpoll.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 19ffb043abbc..3a3567433b92 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -1168,7 +1168,7 @@ static int ep_unlink(struct eventpoll *ep, struct epitem *epi) eexit_1: DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_unlink(%p, %p) = %d\n", - current, ep, epi->file, error)); + current, ep, epi->ffd.file, error)); return error; } @@ -1236,7 +1236,7 @@ static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *k struct eventpoll *ep = epi->ep; DNPRINTK(3, (KERN_INFO "[%p] eventpoll: poll_callback(%p) epi=%p ep=%p\n", - current, epi->file, epi, ep)); + current, epi->ffd.file, epi, ep)); write_lock_irqsave(&ep->lock, flags); -- cgit v1.2.3 From ea817398e68dfa25612229fda7fc74580cf915fb Mon Sep 17 00:00:00 2001 From: Badari Pulavarty Date: Sun, 27 Aug 2006 01:23:52 -0700 Subject: [PATCH] Manage jbd allocations from its own slabs JBD currently allocates commit and frozen buffers from slabs. With CONFIG_SLAB_DEBUG, its possible for an allocation to cross the page boundary causing IO problems. https://bugzilla.redhat.com/bugzilla/show_bug.cgi?id=200127 So, instead of allocating these from regular slabs - manage allocation from its own slabs and disable slab debug for these slabs. [akpm@osdl.org: cleanups] Signed-off-by: Badari Pulavarty Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/jbd/commit.c | 6 ++-- fs/jbd/journal.c | 92 ++++++++++++++++++++++++++++++++++++++++++++++++---- fs/jbd/transaction.c | 9 ++--- include/linux/jbd.h | 3 ++ 4 files changed, 97 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c index 0971814c38b8..42da60784311 100644 --- a/fs/jbd/commit.c +++ b/fs/jbd/commit.c @@ -261,7 +261,7 @@ void journal_commit_transaction(journal_t *journal) struct buffer_head *bh = jh2bh(jh); jbd_lock_bh_state(bh); - kfree(jh->b_committed_data); + jbd_slab_free(jh->b_committed_data, bh->b_size); jh->b_committed_data = NULL; jbd_unlock_bh_state(bh); } @@ -745,14 +745,14 @@ restart_loop: * Otherwise, we can just throw away the frozen data now. 
*/ if (jh->b_committed_data) { - kfree(jh->b_committed_data); + jbd_slab_free(jh->b_committed_data, bh->b_size); jh->b_committed_data = NULL; if (jh->b_frozen_data) { jh->b_committed_data = jh->b_frozen_data; jh->b_frozen_data = NULL; } } else if (jh->b_frozen_data) { - kfree(jh->b_frozen_data); + jbd_slab_free(jh->b_frozen_data, bh->b_size); jh->b_frozen_data = NULL; } diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c index 8c9b28dff119..f66724ce443a 100644 --- a/fs/jbd/journal.c +++ b/fs/jbd/journal.c @@ -84,6 +84,7 @@ EXPORT_SYMBOL(journal_force_commit); static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *); static void __journal_abort_soft (journal_t *journal, int errno); +static int journal_create_jbd_slab(size_t slab_size); /* * Helper function used to manage commit timeouts @@ -328,10 +329,10 @@ repeat: char *tmp; jbd_unlock_bh_state(bh_in); - tmp = jbd_rep_kmalloc(bh_in->b_size, GFP_NOFS); + tmp = jbd_slab_alloc(bh_in->b_size, GFP_NOFS); jbd_lock_bh_state(bh_in); if (jh_in->b_frozen_data) { - kfree(tmp); + jbd_slab_free(tmp, bh_in->b_size); goto repeat; } @@ -1069,17 +1070,17 @@ static int load_superblock(journal_t *journal) int journal_load(journal_t *journal) { int err; + journal_superblock_t *sb; err = load_superblock(journal); if (err) return err; + sb = journal->j_superblock; /* If this is a V2 superblock, then we have to check the * features flags on it. */ if (journal->j_format_version >= 2) { - journal_superblock_t *sb = journal->j_superblock; - if ((sb->s_feature_ro_compat & ~cpu_to_be32(JFS_KNOWN_ROCOMPAT_FEATURES)) || (sb->s_feature_incompat & @@ -1090,6 +1091,13 @@ int journal_load(journal_t *journal) } } + /* + * Create a slab for this blocksize + */ + err = journal_create_jbd_slab(cpu_to_be32(sb->s_blocksize)); + if (err) + return err; + /* Let the recovery code check whether it needs to recover any * data from the journal. */ if (journal_recover(journal)) @@ -1611,6 +1619,77 @@ void * __jbd_kmalloc (const char *where, size_t size, gfp_t flags, int retry) return kmalloc(size, flags | (retry ? __GFP_NOFAIL : 0)); } +/* + * jbd slab management: create 1k, 2k, 4k, 8k slabs as needed + * and allocate frozen and commit buffers from these slabs. + * + * Reason for doing this is to avoid, SLAB_DEBUG - since it could + * cause bh to cross page boundary. + */ + +#define JBD_MAX_SLABS 5 +#define JBD_SLAB_INDEX(size) (size >> 11) + +static kmem_cache_t *jbd_slab[JBD_MAX_SLABS]; +static const char *jbd_slab_names[JBD_MAX_SLABS] = { + "jbd_1k", "jbd_2k", "jbd_4k", NULL, "jbd_8k" +}; + +static void journal_destroy_jbd_slabs(void) +{ + int i; + + for (i = 0; i < JBD_MAX_SLABS; i++) { + if (jbd_slab[i]) + kmem_cache_destroy(jbd_slab[i]); + jbd_slab[i] = NULL; + } +} + +static int journal_create_jbd_slab(size_t slab_size) +{ + int i = JBD_SLAB_INDEX(slab_size); + + BUG_ON(i >= JBD_MAX_SLABS); + + /* + * Check if we already have a slab created for this size + */ + if (jbd_slab[i]) + return 0; + + /* + * Create a slab and force alignment to be same as slabsize - + * this will make sure that allocations won't cross the page + * boundary. 
+ */ + jbd_slab[i] = kmem_cache_create(jbd_slab_names[i], + slab_size, slab_size, 0, NULL, NULL); + if (!jbd_slab[i]) { + printk(KERN_EMERG "JBD: no memory for jbd_slab cache\n"); + return -ENOMEM; + } + return 0; +} + +void * jbd_slab_alloc(size_t size, gfp_t flags) +{ + int idx; + + idx = JBD_SLAB_INDEX(size); + BUG_ON(jbd_slab[idx] == NULL); + return kmem_cache_alloc(jbd_slab[idx], flags | __GFP_NOFAIL); +} + +void jbd_slab_free(void *ptr, size_t size) +{ + int idx; + + idx = JBD_SLAB_INDEX(size); + BUG_ON(jbd_slab[idx] == NULL); + kmem_cache_free(jbd_slab[idx], ptr); +} + /* * Journal_head storage management */ @@ -1799,13 +1878,13 @@ static void __journal_remove_journal_head(struct buffer_head *bh) printk(KERN_WARNING "%s: freeing " "b_frozen_data\n", __FUNCTION__); - kfree(jh->b_frozen_data); + jbd_slab_free(jh->b_frozen_data, bh->b_size); } if (jh->b_committed_data) { printk(KERN_WARNING "%s: freeing " "b_committed_data\n", __FUNCTION__); - kfree(jh->b_committed_data); + jbd_slab_free(jh->b_committed_data, bh->b_size); } bh->b_private = NULL; jh->b_bh = NULL; /* debug, really */ @@ -1961,6 +2040,7 @@ static void journal_destroy_caches(void) journal_destroy_revoke_caches(); journal_destroy_journal_head_cache(); journal_destroy_handle_cache(); + journal_destroy_jbd_slabs(); } static int __init journal_init(void) diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c index 508b2ea91f43..de2e4cbbf79a 100644 --- a/fs/jbd/transaction.c +++ b/fs/jbd/transaction.c @@ -666,8 +666,9 @@ repeat: if (!frozen_buffer) { JBUFFER_TRACE(jh, "allocate memory for buffer"); jbd_unlock_bh_state(bh); - frozen_buffer = jbd_kmalloc(jh2bh(jh)->b_size, - GFP_NOFS); + frozen_buffer = + jbd_slab_alloc(jh2bh(jh)->b_size, + GFP_NOFS); if (!frozen_buffer) { printk(KERN_EMERG "%s: OOM for frozen_buffer\n", @@ -879,7 +880,7 @@ int journal_get_undo_access(handle_t *handle, struct buffer_head *bh) repeat: if (!jh->b_committed_data) { - committed_data = jbd_kmalloc(jh2bh(jh)->b_size, GFP_NOFS); + committed_data = jbd_slab_alloc(jh2bh(jh)->b_size, GFP_NOFS); if (!committed_data) { printk(KERN_EMERG "%s: No memory for committed data\n", __FUNCTION__); @@ -906,7 +907,7 @@ repeat: out: journal_put_journal_head(jh); if (unlikely(committed_data)) - kfree(committed_data); + jbd_slab_free(committed_data, bh->b_size); return err; } diff --git a/include/linux/jbd.h b/include/linux/jbd.h index 20eb34403d0c..a04c154c5207 100644 --- a/include/linux/jbd.h +++ b/include/linux/jbd.h @@ -72,6 +72,9 @@ extern int journal_enable_debug; #endif extern void * __jbd_kmalloc (const char *where, size_t size, gfp_t flags, int retry); +extern void * jbd_slab_alloc(size_t size, gfp_t flags); +extern void jbd_slab_free(void *ptr, size_t size); + #define jbd_kmalloc(size, flags) \ __jbd_kmalloc(__FUNCTION__, (size), (flags), journal_oom_retry) #define jbd_rep_kmalloc(size, flags) \ -- cgit v1.2.3 From 4df46240a1312161e3c794f6ace50ef7eb5ff3d7 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 27 Aug 2006 01:23:56 -0700 Subject: [PATCH] lockdep: annotate reiserfs reiserfs seems to have another locking level layer for the i_mutex due to the xattrs-are-a-directory thing. 
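For context, the annotation below follows the usual lockdep pattern of giving each nesting level of one lock class its own subclass. A minimal sketch of that pattern with made-up names follows (the real patch extends enum inode_i_mutex_lock_class with I_MUTEX_XATTR and switches xattr_readdir() to mutex_lock_nested()):

/*
 * Minimal sketch of lockdep subclass annotation; not code from the
 * patch, and the enum and function names are hypothetical.  Two
 * mutexes of the same lock class are taken in a fixed parent/child
 * order; plain mutex_lock() would make lockdep report possible
 * recursive locking, so each acquisition names its nesting level.
 */
#include <linux/mutex.h>

enum demo_mutex_lock_class {		/* hypothetical names */
	DEMO_MUTEX_PARENT,
	DEMO_MUTEX_CHILD,
};

static void demo_lock_pair(struct mutex *parent, struct mutex *child)
{
	mutex_lock_nested(parent, DEMO_MUTEX_PARENT);
	mutex_lock_nested(child, DEMO_MUTEX_CHILD);
	/* ... operate on both objects ... */
	mutex_unlock(child);
	mutex_unlock(parent);
}

Without the subclass, lockdep cannot tell the two acquisitions of the same class apart, even though the runtime ordering is safe.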
Signed-off-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/reiserfs/xattr.c | 2 +- include/linux/fs.h | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index 39fedaa88a0c..d935fb9394e3 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -424,7 +424,7 @@ int xattr_readdir(struct file *file, filldir_t filler, void *buf) int res = -ENOTDIR; if (!file->f_op || !file->f_op->readdir) goto out; - mutex_lock(&inode->i_mutex); + mutex_lock_nested(&inode->i_mutex, I_MUTEX_XATTR); // down(&inode->i_zombie); res = -ENOENT; if (!IS_DEADDIR(inode)) { diff --git a/include/linux/fs.h b/include/linux/fs.h index 25610205c90d..555bc195c420 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -570,13 +570,14 @@ struct inode { * 3: quota file * * The locking order between these classes is - * parent -> child -> normal -> quota + * parent -> child -> normal -> xattr -> quota */ enum inode_i_mutex_lock_class { I_MUTEX_NORMAL, I_MUTEX_PARENT, I_MUTEX_CHILD, + I_MUTEX_XATTR, I_MUTEX_QUOTA }; -- cgit v1.2.3 From 513627d7fec6fcb7b3d56ce355cb4d192c76b530 Mon Sep 17 00:00:00 2001 From: Dave Jones Date: Sun, 27 Aug 2006 01:23:57 -0700 Subject: [PATCH] fix up lockdep trace in fs/exec.c This fixes the locking error noticed by lockdep: ============================================= [ INFO: possible recursive locking detected ] --------------------------------------------- init/1 is trying to acquire lock: (&sighand->siglock){....}, at: [] flush_old_exec+0x3ae/0x859 but task is already holding lock: (&sighand->siglock){....}, at: [] flush_old_exec+0x39e/0x859 other info that might help us debug this: 2 locks held by init/1: #0: (tasklist_lock){..--}, at: [] flush_old_exec+0x38e/0x859 #1: (&sighand->siglock){....}, at: [] flush_old_exec+0x39e/0x859 stack backtrace: [] show_trace_log_lvl+0x54/0xfd [] show_trace+0xd/0x10 [] dump_stack+0x19/0x1b [] __lock_acquire+0x773/0x997 [] lock_acquire+0x4b/0x6c [] _spin_lock+0x19/0x28 [] flush_old_exec+0x3ae/0x859 [] load_elf_binary+0x4aa/0x1628 [] search_binary_handler+0xa7/0x24e [] do_execve+0x15b/0x1f9 [] sys_execve+0x29/0x4d [] syscall_call+0x7/0xb Signed-off-by: Arjan van de Ven Signed-off-by: Dave Jones Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index f7aabfeca033..54135df2a966 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -751,7 +751,7 @@ no_thread_group: write_lock_irq(&tasklist_lock); spin_lock(&oldsighand->siglock); - spin_lock(&newsighand->siglock); + spin_lock_nested(&newsighand->siglock, SINGLE_DEPTH_NESTING); rcu_assign_pointer(current->sighand, newsighand); recalc_sigpending(); -- cgit v1.2.3 From f5ef68da5fda5e095b585ea5ecdd42af3c8695f7 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sun, 27 Aug 2006 01:23:58 -0700 Subject: [PATCH] /proc/meminfo: don't put spaces in names None of the other /proc/meminfo lines have a space in the identifier. This post-2.6.17 addition has the potential to break existing parsers, so use an underscore instead (like Committed_AS). 
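To see why the space matters, here is a minimal sketch of the kind of whitespace-splitting reader the changelog has in mind (hypothetical userspace code, not taken from any particular tool): with "NFS Unstable:" the key splits into two tokens and the value is lost, while "NFS_Unstable:" parses like every other line.

/*
 * Illustrative /proc/meminfo reader, hypothetical code: each line is
 * expected to be "Key: value [kB]" with no whitespace inside the key.
 * A key such as "NFS Unstable:" makes sscanf() take "NFS" as the key
 * and then fail to parse "Unstable" as a number, so the line is lost.
 */
#include <stdio.h>

static void dump_meminfo(void)
{
	char line[128], key[64];
	unsigned long kb;
	FILE *f = fopen("/proc/meminfo", "r");

	if (!f)
		return;
	while (fgets(line, sizeof(line), f)) {
		if (sscanf(line, "%63s %lu", key, &kb) == 2)
			printf("%s %lu\n", key, kb);
	}
	fclose(f);
}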
Cc: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/base/node.c | 2 +- fs/proc/proc_misc.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/drivers/base/node.c b/drivers/base/node.c index d7de1753e094..e9b0957f15d1 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -64,7 +64,7 @@ static ssize_t node_read_meminfo(struct sys_device * dev, char * buf) "Node %d Mapped: %8lu kB\n" "Node %d AnonPages: %8lu kB\n" "Node %d PageTables: %8lu kB\n" - "Node %d NFS Unstable: %8lu kB\n" + "Node %d NFS_Unstable: %8lu kB\n" "Node %d Bounce: %8lu kB\n" "Node %d Slab: %8lu kB\n", nid, K(i.totalram), diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c index 9f2cfc30f9cf..942156225447 100644 --- a/fs/proc/proc_misc.c +++ b/fs/proc/proc_misc.c @@ -169,7 +169,7 @@ static int meminfo_read_proc(char *page, char **start, off_t off, "Mapped: %8lu kB\n" "Slab: %8lu kB\n" "PageTables: %8lu kB\n" - "NFS Unstable: %8lu kB\n" + "NFS_Unstable: %8lu kB\n" "Bounce: %8lu kB\n" "CommitLimit: %8lu kB\n" "Committed_AS: %8lu kB\n" -- cgit v1.2.3
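Returning to the jbd slab patch earlier in this series: the helpers it adds are size-keyed, with JBD_SLAB_INDEX(size) (size >> 11) mapping 1k, 2k, 4k and 8k buffers to slots 0, 1, 2 and 4, so a caller must pass the same size to jbd_slab_free() that it gave to jbd_slab_alloc(). A short usage sketch with a hypothetical caller that is not part of the patch:

/*
 * Hypothetical caller of the jbd_slab_* helpers; not a hunk from the
 * patch.  The same bh->b_size must be passed to both calls, because
 * the size alone selects the backing slab the buffer came from.
 */
#include <linux/jbd.h>
#include <linux/buffer_head.h>
#include <linux/slab.h>
#include <linux/string.h>

static void *demo_copy_buffer(struct buffer_head *bh)
{
	/* __GFP_NOFAIL inside jbd_slab_alloc() means this never returns NULL */
	void *copy = jbd_slab_alloc(bh->b_size, GFP_NOFS);

	memcpy(copy, bh->b_data, bh->b_size);
	return copy;	/* released later with jbd_slab_free(copy, bh->b_size) */
}

Because each slab is created with its alignment equal to its object size, such an allocation can never straddle a page boundary, which is the point of bypassing jbd_kmalloc() here.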