diff options
Diffstat (limited to 'fs')
257 files changed, 6457 insertions, 3410 deletions
diff --git a/fs/9p/acl.c b/fs/9p/acl.c index b3c2cc79c20d..082d227fa56b 100644 --- a/fs/9p/acl.c +++ b/fs/9p/acl.c @@ -277,6 +277,7 @@ static int v9fs_xattr_set_acl(const struct xattr_handler *handler, case ACL_TYPE_ACCESS: if (acl) { struct iattr iattr; + struct posix_acl *old_acl = acl; retval = posix_acl_update_mode(inode, &iattr.ia_mode, &acl); if (retval) @@ -287,6 +288,7 @@ static int v9fs_xattr_set_acl(const struct xattr_handler *handler, * by the mode bits. So don't * update ACL. */ + posix_acl_release(old_acl); value = NULL; size = 0; } diff --git a/fs/Kconfig b/fs/Kconfig index c2a377cdda2b..83eab52fb3f6 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -38,6 +38,7 @@ config FS_DAX bool "Direct Access (DAX) support" depends on MMU depends on !(ARM || MIPS || SPARC) + select FS_IOMAP help Direct Access (DAX) can be used on memory-backed block devices. If the block device supports DAX and the filesystem supports DAX, diff --git a/fs/afs/callback.c b/fs/afs/callback.c index 1e9d2f84e5b5..b29447e03ede 100644 --- a/fs/afs/callback.c +++ b/fs/afs/callback.c @@ -343,7 +343,7 @@ void afs_dispatch_give_up_callbacks(struct work_struct *work) * had callbacks entirely, and the server will call us later to break * them */ - afs_fs_give_up_callbacks(server, &afs_async_call); + afs_fs_give_up_callbacks(server, true); } /* diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c index d764236072b1..2edbdcbf6432 100644 --- a/fs/afs/cmservice.c +++ b/fs/afs/cmservice.c @@ -24,65 +24,86 @@ static int afs_deliver_cb_callback(struct afs_call *); static int afs_deliver_cb_probe_uuid(struct afs_call *); static int afs_deliver_cb_tell_me_about_yourself(struct afs_call *); static void afs_cm_destructor(struct afs_call *); +static void SRXAFSCB_CallBack(struct work_struct *); +static void SRXAFSCB_InitCallBackState(struct work_struct *); +static void SRXAFSCB_Probe(struct work_struct *); +static void SRXAFSCB_ProbeUuid(struct work_struct *); +static void SRXAFSCB_TellMeAboutYourself(struct 
work_struct *); + +#define CM_NAME(name) \ + const char afs_SRXCB##name##_name[] __tracepoint_string = \ + "CB." #name /* * CB.CallBack operation type */ +static CM_NAME(CallBack); static const struct afs_call_type afs_SRXCBCallBack = { - .name = "CB.CallBack", + .name = afs_SRXCBCallBack_name, .deliver = afs_deliver_cb_callback, .abort_to_error = afs_abort_to_error, .destructor = afs_cm_destructor, + .work = SRXAFSCB_CallBack, }; /* * CB.InitCallBackState operation type */ +static CM_NAME(InitCallBackState); static const struct afs_call_type afs_SRXCBInitCallBackState = { - .name = "CB.InitCallBackState", + .name = afs_SRXCBInitCallBackState_name, .deliver = afs_deliver_cb_init_call_back_state, .abort_to_error = afs_abort_to_error, .destructor = afs_cm_destructor, + .work = SRXAFSCB_InitCallBackState, }; /* * CB.InitCallBackState3 operation type */ +static CM_NAME(InitCallBackState3); static const struct afs_call_type afs_SRXCBInitCallBackState3 = { - .name = "CB.InitCallBackState3", + .name = afs_SRXCBInitCallBackState3_name, .deliver = afs_deliver_cb_init_call_back_state3, .abort_to_error = afs_abort_to_error, .destructor = afs_cm_destructor, + .work = SRXAFSCB_InitCallBackState, }; /* * CB.Probe operation type */ +static CM_NAME(Probe); static const struct afs_call_type afs_SRXCBProbe = { - .name = "CB.Probe", + .name = afs_SRXCBProbe_name, .deliver = afs_deliver_cb_probe, .abort_to_error = afs_abort_to_error, .destructor = afs_cm_destructor, + .work = SRXAFSCB_Probe, }; /* * CB.ProbeUuid operation type */ +static CM_NAME(ProbeUuid); static const struct afs_call_type afs_SRXCBProbeUuid = { - .name = "CB.ProbeUuid", + .name = afs_SRXCBProbeUuid_name, .deliver = afs_deliver_cb_probe_uuid, .abort_to_error = afs_abort_to_error, .destructor = afs_cm_destructor, + .work = SRXAFSCB_ProbeUuid, }; /* * CB.TellMeAboutYourself operation type */ +static CM_NAME(TellMeAboutYourself); static const struct afs_call_type afs_SRXCBTellMeAboutYourself = { - .name = 
"CB.TellMeAboutYourself", + .name = afs_SRXCBTellMeAboutYourself_name, .deliver = afs_deliver_cb_tell_me_about_yourself, .abort_to_error = afs_abort_to_error, .destructor = afs_cm_destructor, + .work = SRXAFSCB_TellMeAboutYourself, }; /* @@ -153,6 +174,7 @@ static void SRXAFSCB_CallBack(struct work_struct *work) afs_send_empty_reply(call); afs_break_callbacks(call->server, call->count, call->request); + afs_put_call(call); _leave(""); } @@ -274,9 +296,7 @@ static int afs_deliver_cb_callback(struct afs_call *call) return -ENOTCONN; call->server = server; - INIT_WORK(&call->work, SRXAFSCB_CallBack); - queue_work(afs_wq, &call->work); - return 0; + return afs_queue_call_work(call); } /* @@ -290,6 +310,7 @@ static void SRXAFSCB_InitCallBackState(struct work_struct *work) afs_init_callback_state(call->server); afs_send_empty_reply(call); + afs_put_call(call); _leave(""); } @@ -320,9 +341,7 @@ static int afs_deliver_cb_init_call_back_state(struct afs_call *call) return -ENOTCONN; call->server = server; - INIT_WORK(&call->work, SRXAFSCB_InitCallBackState); - queue_work(afs_wq, &call->work); - return 0; + return afs_queue_call_work(call); } /* @@ -332,7 +351,7 @@ static int afs_deliver_cb_init_call_back_state3(struct afs_call *call) { struct sockaddr_rxrpc srx; struct afs_server *server; - struct afs_uuid *r; + struct uuid_v1 *r; unsigned loop; __be32 *b; int ret; @@ -362,15 +381,15 @@ static int afs_deliver_cb_init_call_back_state3(struct afs_call *call) } _debug("unmarshall UUID"); - call->request = kmalloc(sizeof(struct afs_uuid), GFP_KERNEL); + call->request = kmalloc(sizeof(struct uuid_v1), GFP_KERNEL); if (!call->request) return -ENOMEM; b = call->buffer; r = call->request; - r->time_low = ntohl(b[0]); - r->time_mid = ntohl(b[1]); - r->time_hi_and_version = ntohl(b[2]); + r->time_low = b[0]; + r->time_mid = htons(ntohl(b[1])); + r->time_hi_and_version = htons(ntohl(b[2])); r->clock_seq_hi_and_reserved = ntohl(b[3]); r->clock_seq_low = ntohl(b[4]); @@ -394,9 +413,7 @@ 
static int afs_deliver_cb_init_call_back_state3(struct afs_call *call) return -ENOTCONN; call->server = server; - INIT_WORK(&call->work, SRXAFSCB_InitCallBackState); - queue_work(afs_wq, &call->work); - return 0; + return afs_queue_call_work(call); } /* @@ -408,6 +425,7 @@ static void SRXAFSCB_Probe(struct work_struct *work) _enter(""); afs_send_empty_reply(call); + afs_put_call(call); _leave(""); } @@ -427,9 +445,7 @@ static int afs_deliver_cb_probe(struct afs_call *call) /* no unmarshalling required */ call->state = AFS_CALL_REPLYING; - INIT_WORK(&call->work, SRXAFSCB_Probe); - queue_work(afs_wq, &call->work); - return 0; + return afs_queue_call_work(call); } /* @@ -438,7 +454,7 @@ static int afs_deliver_cb_probe(struct afs_call *call) static void SRXAFSCB_ProbeUuid(struct work_struct *work) { struct afs_call *call = container_of(work, struct afs_call, work); - struct afs_uuid *r = call->request; + struct uuid_v1 *r = call->request; struct { __be32 match; @@ -452,6 +468,7 @@ static void SRXAFSCB_ProbeUuid(struct work_struct *work) reply.match = htonl(1); afs_send_simple_reply(call, &reply, sizeof(reply)); + afs_put_call(call); _leave(""); } @@ -460,7 +477,7 @@ static void SRXAFSCB_ProbeUuid(struct work_struct *work) */ static int afs_deliver_cb_probe_uuid(struct afs_call *call) { - struct afs_uuid *r; + struct uuid_v1 *r; unsigned loop; __be32 *b; int ret; @@ -486,15 +503,15 @@ static int afs_deliver_cb_probe_uuid(struct afs_call *call) } _debug("unmarshall UUID"); - call->request = kmalloc(sizeof(struct afs_uuid), GFP_KERNEL); + call->request = kmalloc(sizeof(struct uuid_v1), GFP_KERNEL); if (!call->request) return -ENOMEM; b = call->buffer; r = call->request; - r->time_low = ntohl(b[0]); - r->time_mid = ntohl(b[1]); - r->time_hi_and_version = ntohl(b[2]); + r->time_low = b[0]; + r->time_mid = htons(ntohl(b[1])); + r->time_hi_and_version = htons(ntohl(b[2])); r->clock_seq_hi_and_reserved = ntohl(b[3]); r->clock_seq_low = ntohl(b[4]); @@ -510,9 +527,7 @@ static 
int afs_deliver_cb_probe_uuid(struct afs_call *call) call->state = AFS_CALL_REPLYING; - INIT_WORK(&call->work, SRXAFSCB_ProbeUuid); - queue_work(afs_wq, &call->work); - return 0; + return afs_queue_call_work(call); } /* @@ -554,9 +569,9 @@ static void SRXAFSCB_TellMeAboutYourself(struct work_struct *work) memset(&reply, 0, sizeof(reply)); reply.ia.nifs = htonl(nifs); - reply.ia.uuid[0] = htonl(afs_uuid.time_low); - reply.ia.uuid[1] = htonl(afs_uuid.time_mid); - reply.ia.uuid[2] = htonl(afs_uuid.time_hi_and_version); + reply.ia.uuid[0] = afs_uuid.time_low; + reply.ia.uuid[1] = htonl(ntohs(afs_uuid.time_mid)); + reply.ia.uuid[2] = htonl(ntohs(afs_uuid.time_hi_and_version)); reply.ia.uuid[3] = htonl((s8) afs_uuid.clock_seq_hi_and_reserved); reply.ia.uuid[4] = htonl((s8) afs_uuid.clock_seq_low); for (loop = 0; loop < 6; loop++) @@ -574,7 +589,7 @@ static void SRXAFSCB_TellMeAboutYourself(struct work_struct *work) reply.cap.capcount = htonl(1); reply.cap.caps[0] = htonl(AFS_CAP_ERROR_TRANSLATION); afs_send_simple_reply(call, &reply, sizeof(reply)); - + afs_put_call(call); _leave(""); } @@ -594,7 +609,5 @@ static int afs_deliver_cb_tell_me_about_yourself(struct afs_call *call) /* no unmarshalling required */ call->state = AFS_CALL_REPLYING; - INIT_WORK(&call->work, SRXAFSCB_TellMeAboutYourself); - queue_work(afs_wq, &call->work); - return 0; + return afs_queue_call_work(call); } diff --git a/fs/afs/file.c b/fs/afs/file.c index 6344aee4ac4b..ba7b71fba34b 100644 --- a/fs/afs/file.c +++ b/fs/afs/file.c @@ -16,6 +16,7 @@ #include <linux/pagemap.h> #include <linux/writeback.h> #include <linux/gfp.h> +#include <linux/task_io_accounting_ops.h> #include "internal.h" static int afs_readpage(struct file *file, struct page *page); @@ -101,6 +102,21 @@ int afs_release(struct inode *inode, struct file *file) return 0; } +/* + * Dispose of a ref to a read record. 
+ */ +void afs_put_read(struct afs_read *req) +{ + int i; + + if (atomic_dec_and_test(&req->usage)) { + for (i = 0; i < req->nr_pages; i++) + if (req->pages[i]) + put_page(req->pages[i]); + kfree(req); + } +} + #ifdef CONFIG_AFS_FSCACHE /* * deal with notification that a page was read from the cache @@ -126,9 +142,8 @@ int afs_page_filler(void *data, struct page *page) { struct inode *inode = page->mapping->host; struct afs_vnode *vnode = AFS_FS_I(inode); + struct afs_read *req; struct key *key = data; - size_t len; - off_t offset; int ret; _enter("{%x},{%lu},{%lu}", key_serial(key), inode->i_ino, page->index); @@ -164,12 +179,23 @@ int afs_page_filler(void *data, struct page *page) _debug("cache said ENOBUFS"); default: go_on: - offset = page->index << PAGE_SHIFT; - len = min_t(size_t, i_size_read(inode) - offset, PAGE_SIZE); + req = kzalloc(sizeof(struct afs_read) + sizeof(struct page *), + GFP_KERNEL); + if (!req) + goto enomem; + + atomic_set(&req->usage, 1); + req->pos = (loff_t)page->index << PAGE_SHIFT; + req->len = min_t(size_t, i_size_read(inode) - req->pos, + PAGE_SIZE); + req->nr_pages = 1; + req->pages[0] = page; + get_page(page); /* read the contents of the file from the server into the * page */ - ret = afs_vnode_fetch_data(vnode, key, offset, len, page); + ret = afs_vnode_fetch_data(vnode, key, req); + afs_put_read(req); if (ret < 0) { if (ret == -ENOENT) { _debug("got NOENT from server" @@ -201,6 +227,8 @@ int afs_page_filler(void *data, struct page *page) _leave(" = 0"); return 0; +enomem: + ret = -ENOMEM; error: SetPageError(page); unlock_page(page); @@ -235,6 +263,131 @@ static int afs_readpage(struct file *file, struct page *page) } /* + * Make pages available as they're filled. 
+ */ +static void afs_readpages_page_done(struct afs_call *call, struct afs_read *req) +{ +#ifdef CONFIG_AFS_FSCACHE + struct afs_vnode *vnode = call->reply; +#endif + struct page *page = req->pages[req->index]; + + req->pages[req->index] = NULL; + SetPageUptodate(page); + + /* send the page to the cache */ +#ifdef CONFIG_AFS_FSCACHE + if (PageFsCache(page) && + fscache_write_page(vnode->cache, page, GFP_KERNEL) != 0) { + fscache_uncache_page(vnode->cache, page); + BUG_ON(PageFsCache(page)); + } +#endif + unlock_page(page); + put_page(page); +} + +/* + * Read a contiguous set of pages. + */ +static int afs_readpages_one(struct file *file, struct address_space *mapping, + struct list_head *pages) +{ + struct afs_vnode *vnode = AFS_FS_I(mapping->host); + struct afs_read *req; + struct list_head *p; + struct page *first, *page; + struct key *key = file->private_data; + pgoff_t index; + int ret, n, i; + + /* Count the number of contiguous pages at the front of the list. Note + * that the list goes prev-wards rather than next-wards. + */ + first = list_entry(pages->prev, struct page, lru); + index = first->index + 1; + n = 1; + for (p = first->lru.prev; p != pages; p = p->prev) { + page = list_entry(p, struct page, lru); + if (page->index != index) + break; + index++; + n++; + } + + req = kzalloc(sizeof(struct afs_read) + sizeof(struct page *) * n, + GFP_NOFS); + if (!req) + return -ENOMEM; + + atomic_set(&req->usage, 1); + req->page_done = afs_readpages_page_done; + req->pos = first->index; + req->pos <<= PAGE_SHIFT; + + /* Transfer the pages to the request. We add them in until one fails + * to add to the LRU and then we stop (as that'll make a hole in the + * contiguous run. + * + * Note that it's possible for the file size to change whilst we're + * doing this, but we rely on the server returning less than we asked + * for if the file shrank. We also rely on this to deal with a partial + * page at the end of the file. 
+ */ + do { + page = list_entry(pages->prev, struct page, lru); + list_del(&page->lru); + index = page->index; + if (add_to_page_cache_lru(page, mapping, index, + readahead_gfp_mask(mapping))) { +#ifdef CONFIG_AFS_FSCACHE + fscache_uncache_page(vnode->cache, page); +#endif + put_page(page); + break; + } + + req->pages[req->nr_pages++] = page; + req->len += PAGE_SIZE; + } while (req->nr_pages < n); + + if (req->nr_pages == 0) { + kfree(req); + return 0; + } + + ret = afs_vnode_fetch_data(vnode, key, req); + if (ret < 0) + goto error; + + task_io_account_read(PAGE_SIZE * req->nr_pages); + afs_put_read(req); + return 0; + +error: + if (ret == -ENOENT) { + _debug("got NOENT from server" + " - marking file deleted and stale"); + set_bit(AFS_VNODE_DELETED, &vnode->flags); + ret = -ESTALE; + } + + for (i = 0; i < req->nr_pages; i++) { + page = req->pages[i]; + if (page) { +#ifdef CONFIG_AFS_FSCACHE + fscache_uncache_page(vnode->cache, page); +#endif + SetPageError(page); + unlock_page(page); + } + } + + afs_put_read(req); + return ret; +} + +/* * read a set of pages */ static int afs_readpages(struct file *file, struct address_space *mapping, @@ -287,8 +440,11 @@ static int afs_readpages(struct file *file, struct address_space *mapping, return ret; } - /* load the missing pages from the network */ - ret = read_cache_pages(mapping, pages, afs_page_filler, key); + while (!list_empty(pages)) { + ret = afs_readpages_one(file, mapping, pages); + if (ret < 0) + break; + } _leave(" = %d [netting]", ret); return ret; diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c index 31c616ab9b40..ac8e766978dc 100644 --- a/fs/afs/fsclient.c +++ b/fs/afs/fsclient.c @@ -275,7 +275,7 @@ int afs_fs_fetch_file_status(struct afs_server *server, struct key *key, struct afs_vnode *vnode, struct afs_volsync *volsync, - const struct afs_wait_mode *wait_mode) + bool async) { struct afs_call *call; __be32 *bp; @@ -300,7 +300,7 @@ int afs_fs_fetch_file_status(struct afs_server *server, bp[2] = 
htonl(vnode->fid.vnode); bp[3] = htonl(vnode->fid.unique); - return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode); + return afs_make_call(&server->addr, call, GFP_NOFS, async); } /* @@ -309,15 +309,19 @@ int afs_fs_fetch_file_status(struct afs_server *server, static int afs_deliver_fs_fetch_data(struct afs_call *call) { struct afs_vnode *vnode = call->reply; + struct afs_read *req = call->reply3; const __be32 *bp; - struct page *page; + unsigned int size; void *buffer; int ret; - _enter("{%u}", call->unmarshall); + _enter("{%u,%zu/%u;%u/%llu}", + call->unmarshall, call->offset, call->count, + req->remain, req->actual_len); switch (call->unmarshall) { case 0: + req->actual_len = 0; call->offset = 0; call->unmarshall++; if (call->operation_ID != FSFETCHDATA64) { @@ -334,10 +338,8 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call) if (ret < 0) return ret; - call->count = ntohl(call->tmp); - _debug("DATA length MSW: %u", call->count); - if (call->count > 0) - return -EBADMSG; + req->actual_len = ntohl(call->tmp); + req->actual_len <<= 32; call->offset = 0; call->unmarshall++; @@ -349,26 +351,52 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call) if (ret < 0) return ret; - call->count = ntohl(call->tmp); - _debug("DATA length: %u", call->count); - if (call->count > PAGE_SIZE) + req->actual_len |= ntohl(call->tmp); + _debug("DATA length: %llu", req->actual_len); + /* Check that the server didn't want to send us extra. We + * might want to just discard instead, but that requires + * cooperation from AF_RXRPC. 
+ */ + if (req->actual_len > req->len) return -EBADMSG; - call->offset = 0; + + req->remain = req->actual_len; + call->offset = req->pos & (PAGE_SIZE - 1); + req->index = 0; + if (req->actual_len == 0) + goto no_more_data; call->unmarshall++; + begin_page: + if (req->remain > PAGE_SIZE - call->offset) + size = PAGE_SIZE - call->offset; + else + size = req->remain; + call->count = call->offset + size; + ASSERTCMP(call->count, <=, PAGE_SIZE); + req->remain -= size; + /* extract the returned data */ case 3: - _debug("extract data"); - if (call->count > 0) { - page = call->reply3; - buffer = kmap(page); - ret = afs_extract_data(call, buffer, - call->count, true); - kunmap(page); - if (ret < 0) - return ret; + _debug("extract data %u/%llu %zu/%u", + req->remain, req->actual_len, call->offset, call->count); + + buffer = kmap(req->pages[req->index]); + ret = afs_extract_data(call, buffer, call->count, true); + kunmap(req->pages[req->index]); + if (ret < 0) + return ret; + if (call->offset == PAGE_SIZE) { + if (req->page_done) + req->page_done(call, req); + if (req->remain > 0) { + req->index++; + call->offset = 0; + goto begin_page; + } } + no_more_data: call->offset = 0; call->unmarshall++; @@ -393,17 +421,25 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call) } if (call->count < PAGE_SIZE) { - _debug("clear"); - page = call->reply3; - buffer = kmap(page); + buffer = kmap(req->pages[req->index]); memset(buffer + call->count, 0, PAGE_SIZE - call->count); - kunmap(page); + kunmap(req->pages[req->index]); + if (req->page_done) + req->page_done(call, req); } _leave(" = 0 [done]"); return 0; } +static void afs_fetch_data_destructor(struct afs_call *call) +{ + struct afs_read *req = call->reply3; + + afs_put_read(req); + afs_flat_call_destructor(call); +} + /* * FS.FetchData operation type */ @@ -411,14 +447,14 @@ static const struct afs_call_type afs_RXFSFetchData = { .name = "FS.FetchData", .deliver = afs_deliver_fs_fetch_data, .abort_to_error = 
afs_abort_to_error, - .destructor = afs_flat_call_destructor, + .destructor = afs_fetch_data_destructor, }; static const struct afs_call_type afs_RXFSFetchData64 = { .name = "FS.FetchData64", .deliver = afs_deliver_fs_fetch_data, .abort_to_error = afs_abort_to_error, - .destructor = afs_flat_call_destructor, + .destructor = afs_fetch_data_destructor, }; /* @@ -427,17 +463,14 @@ static const struct afs_call_type afs_RXFSFetchData64 = { static int afs_fs_fetch_data64(struct afs_server *server, struct key *key, struct afs_vnode *vnode, - off_t offset, size_t length, - struct page *buffer, - const struct afs_wait_mode *wait_mode) + struct afs_read *req, + bool async) { struct afs_call *call; __be32 *bp; _enter(""); - ASSERTCMP(length, <, ULONG_MAX); - call = afs_alloc_flat_call(&afs_RXFSFetchData64, 32, (21 + 3 + 6) * 4); if (!call) return -ENOMEM; @@ -445,7 +478,7 @@ static int afs_fs_fetch_data64(struct afs_server *server, call->key = key; call->reply = vnode; call->reply2 = NULL; /* volsync */ - call->reply3 = buffer; + call->reply3 = req; call->service_id = FS_SERVICE; call->port = htons(AFS_FS_PORT); call->operation_ID = FSFETCHDATA64; @@ -456,12 +489,13 @@ static int afs_fs_fetch_data64(struct afs_server *server, bp[1] = htonl(vnode->fid.vid); bp[2] = htonl(vnode->fid.vnode); bp[3] = htonl(vnode->fid.unique); - bp[4] = htonl(upper_32_bits(offset)); - bp[5] = htonl((u32) offset); + bp[4] = htonl(upper_32_bits(req->pos)); + bp[5] = htonl(lower_32_bits(req->pos)); bp[6] = 0; - bp[7] = htonl((u32) length); + bp[7] = htonl(lower_32_bits(req->len)); - return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode); + atomic_inc(&req->usage); + return afs_make_call(&server->addr, call, GFP_NOFS, async); } /* @@ -470,16 +504,16 @@ static int afs_fs_fetch_data64(struct afs_server *server, int afs_fs_fetch_data(struct afs_server *server, struct key *key, struct afs_vnode *vnode, - off_t offset, size_t length, - struct page *buffer, - const struct afs_wait_mode *wait_mode) + 
struct afs_read *req, + bool async) { struct afs_call *call; __be32 *bp; - if (upper_32_bits(offset) || upper_32_bits(offset + length)) - return afs_fs_fetch_data64(server, key, vnode, offset, length, - buffer, wait_mode); + if (upper_32_bits(req->pos) || + upper_32_bits(req->len) || + upper_32_bits(req->pos + req->len)) + return afs_fs_fetch_data64(server, key, vnode, req, async); _enter(""); @@ -490,7 +524,7 @@ int afs_fs_fetch_data(struct afs_server *server, call->key = key; call->reply = vnode; call->reply2 = NULL; /* volsync */ - call->reply3 = buffer; + call->reply3 = req; call->service_id = FS_SERVICE; call->port = htons(AFS_FS_PORT); call->operation_ID = FSFETCHDATA; @@ -501,10 +535,11 @@ int afs_fs_fetch_data(struct afs_server *server, bp[1] = htonl(vnode->fid.vid); bp[2] = htonl(vnode->fid.vnode); bp[3] = htonl(vnode->fid.unique); - bp[4] = htonl(offset); - bp[5] = htonl(length); + bp[4] = htonl(lower_32_bits(req->pos)); + bp[5] = htonl(lower_32_bits(req->len)); - return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode); + atomic_inc(&req->usage); + return afs_make_call(&server->addr, call, GFP_NOFS, async); } /* @@ -533,7 +568,7 @@ static const struct afs_call_type afs_RXFSGiveUpCallBacks = { * - the callbacks are held in the server->cb_break ring */ int afs_fs_give_up_callbacks(struct afs_server *server, - const struct afs_wait_mode *wait_mode) + bool async) { struct afs_call *call; size_t ncallbacks; @@ -587,7 +622,7 @@ int afs_fs_give_up_callbacks(struct afs_server *server, ASSERT(ncallbacks > 0); wake_up_nr(&server->cb_break_waitq, ncallbacks); - return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode); + return afs_make_call(&server->addr, call, GFP_NOFS, async); } /* @@ -638,7 +673,7 @@ int afs_fs_create(struct afs_server *server, struct afs_fid *newfid, struct afs_file_status *newstatus, struct afs_callback *newcb, - const struct afs_wait_mode *wait_mode) + bool async) { struct afs_call *call; size_t namesz, reqsz, padsz; @@ -683,7 
+718,7 @@ int afs_fs_create(struct afs_server *server, *bp++ = htonl(mode & S_IALLUGO); /* unix mode */ *bp++ = 0; /* segment size */ - return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode); + return afs_make_call(&server->addr, call, GFP_NOFS, async); } /* @@ -728,7 +763,7 @@ int afs_fs_remove(struct afs_server *server, struct afs_vnode *vnode, const char *name, bool isdir, - const struct afs_wait_mode *wait_mode) + bool async) { struct afs_call *call; size_t namesz, reqsz, padsz; @@ -763,7 +798,7 @@ int afs_fs_remove(struct afs_server *server, bp = (void *) bp + padsz; } - return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode); + return afs_make_call(&server->addr, call, GFP_NOFS, async); } /* @@ -809,7 +844,7 @@ int afs_fs_link(struct afs_server *server, struct afs_vnode *dvnode, struct afs_vnode *vnode, const char *name, - const struct afs_wait_mode *wait_mode) + bool async) { struct afs_call *call; size_t namesz, reqsz, padsz; @@ -848,7 +883,7 @@ int afs_fs_link(struct afs_server *server, *bp++ = htonl(vnode->fid.vnode); *bp++ = htonl(vnode->fid.unique); - return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode); + return afs_make_call(&server->addr, call, GFP_NOFS, async); } /* @@ -897,7 +932,7 @@ int afs_fs_symlink(struct afs_server *server, const char *contents, struct afs_fid *newfid, struct afs_file_status *newstatus, - const struct afs_wait_mode *wait_mode) + bool async) { struct afs_call *call; size_t namesz, reqsz, padsz, c_namesz, c_padsz; @@ -952,7 +987,7 @@ int afs_fs_symlink(struct afs_server *server, *bp++ = htonl(S_IRWXUGO); /* unix mode */ *bp++ = 0; /* segment size */ - return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode); + return afs_make_call(&server->addr, call, GFP_NOFS, async); } /* @@ -1001,7 +1036,7 @@ int afs_fs_rename(struct afs_server *server, const char *orig_name, struct afs_vnode *new_dvnode, const char *new_name, - const struct afs_wait_mode *wait_mode) + bool async) { struct afs_call *call; 
size_t reqsz, o_namesz, o_padsz, n_namesz, n_padsz; @@ -1055,7 +1090,7 @@ int afs_fs_rename(struct afs_server *server, bp = (void *) bp + n_padsz; } - return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode); + return afs_make_call(&server->addr, call, GFP_NOFS, async); } /* @@ -1110,7 +1145,7 @@ static int afs_fs_store_data64(struct afs_server *server, pgoff_t first, pgoff_t last, unsigned offset, unsigned to, loff_t size, loff_t pos, loff_t i_size, - const struct afs_wait_mode *wait_mode) + bool async) { struct afs_vnode *vnode = wb->vnode; struct afs_call *call; @@ -1159,7 +1194,7 @@ static int afs_fs_store_data64(struct afs_server *server, *bp++ = htonl(i_size >> 32); *bp++ = htonl((u32) i_size); - return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode); + return afs_make_call(&server->addr, call, GFP_NOFS, async); } /* @@ -1168,7 +1203,7 @@ static int afs_fs_store_data64(struct afs_server *server, int afs_fs_store_data(struct afs_server *server, struct afs_writeback *wb, pgoff_t first, pgoff_t last, unsigned offset, unsigned to, - const struct afs_wait_mode *wait_mode) + bool async) { struct afs_vnode *vnode = wb->vnode; struct afs_call *call; @@ -1194,7 +1229,7 @@ int afs_fs_store_data(struct afs_server *server, struct afs_writeback *wb, if (pos >> 32 || i_size >> 32 || size >> 32 || (pos + size) >> 32) return afs_fs_store_data64(server, wb, first, last, offset, to, - size, pos, i_size, wait_mode); + size, pos, i_size, async); call = afs_alloc_flat_call(&afs_RXFSStoreData, (4 + 6 + 3) * 4, @@ -1233,7 +1268,7 @@ int afs_fs_store_data(struct afs_server *server, struct afs_writeback *wb, *bp++ = htonl(size); *bp++ = htonl(i_size); - return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode); + return afs_make_call(&server->addr, call, GFP_NOFS, async); } /* @@ -1295,7 +1330,7 @@ static const struct afs_call_type afs_RXFSStoreData64_as_Status = { */ static int afs_fs_setattr_size64(struct afs_server *server, struct key *key, struct afs_vnode 
*vnode, struct iattr *attr, - const struct afs_wait_mode *wait_mode) + bool async) { struct afs_call *call; __be32 *bp; @@ -1334,7 +1369,7 @@ static int afs_fs_setattr_size64(struct afs_server *server, struct key *key, *bp++ = htonl(attr->ia_size >> 32); /* new file length */ *bp++ = htonl((u32) attr->ia_size); - return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode); + return afs_make_call(&server->addr, call, GFP_NOFS, async); } /* @@ -1343,7 +1378,7 @@ static int afs_fs_setattr_size64(struct afs_server *server, struct key *key, */ static int afs_fs_setattr_size(struct afs_server *server, struct key *key, struct afs_vnode *vnode, struct iattr *attr, - const struct afs_wait_mode *wait_mode) + bool async) { struct afs_call *call; __be32 *bp; @@ -1354,7 +1389,7 @@ static int afs_fs_setattr_size(struct afs_server *server, struct key *key, ASSERT(attr->ia_valid & ATTR_SIZE); if (attr->ia_size >> 32) return afs_fs_setattr_size64(server, key, vnode, attr, - wait_mode); + async); call = afs_alloc_flat_call(&afs_RXFSStoreData_as_Status, (4 + 6 + 3) * 4, @@ -1382,7 +1417,7 @@ static int afs_fs_setattr_size(struct afs_server *server, struct key *key, *bp++ = 0; /* size of write */ *bp++ = htonl(attr->ia_size); /* new file length */ - return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode); + return afs_make_call(&server->addr, call, GFP_NOFS, async); } /* @@ -1391,14 +1426,14 @@ static int afs_fs_setattr_size(struct afs_server *server, struct key *key, */ int afs_fs_setattr(struct afs_server *server, struct key *key, struct afs_vnode *vnode, struct iattr *attr, - const struct afs_wait_mode *wait_mode) + bool async) { struct afs_call *call; __be32 *bp; if (attr->ia_valid & ATTR_SIZE) return afs_fs_setattr_size(server, key, vnode, attr, - wait_mode); + async); _enter(",%x,{%x:%u},,", key_serial(key), vnode->fid.vid, vnode->fid.vnode); @@ -1424,7 +1459,7 @@ int afs_fs_setattr(struct afs_server *server, struct key *key, xdr_encode_AFS_StoreStatus(&bp, attr); - 
return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode); + return afs_make_call(&server->addr, call, GFP_NOFS, async); } /* @@ -1626,7 +1661,7 @@ int afs_fs_get_volume_status(struct afs_server *server, struct key *key, struct afs_vnode *vnode, struct afs_volume_status *vs, - const struct afs_wait_mode *wait_mode) + bool async) { struct afs_call *call; __be32 *bp; @@ -1656,7 +1691,7 @@ int afs_fs_get_volume_status(struct afs_server *server, bp[0] = htonl(FSGETVOLUMESTATUS); bp[1] = htonl(vnode->fid.vid); - return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode); + return afs_make_call(&server->addr, call, GFP_NOFS, async); } /* @@ -1718,7 +1753,7 @@ int afs_fs_set_lock(struct afs_server *server, struct key *key, struct afs_vnode *vnode, afs_lock_type_t type, - const struct afs_wait_mode *wait_mode) + bool async) { struct afs_call *call; __be32 *bp; @@ -1742,7 +1777,7 @@ int afs_fs_set_lock(struct afs_server *server, *bp++ = htonl(vnode->fid.unique); *bp++ = htonl(type); - return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode); + return afs_make_call(&server->addr, call, GFP_NOFS, async); } /* @@ -1751,7 +1786,7 @@ int afs_fs_set_lock(struct afs_server *server, int afs_fs_extend_lock(struct afs_server *server, struct key *key, struct afs_vnode *vnode, - const struct afs_wait_mode *wait_mode) + bool async) { struct afs_call *call; __be32 *bp; @@ -1774,7 +1809,7 @@ int afs_fs_extend_lock(struct afs_server *server, *bp++ = htonl(vnode->fid.vnode); *bp++ = htonl(vnode->fid.unique); - return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode); + return afs_make_call(&server->addr, call, GFP_NOFS, async); } /* @@ -1783,7 +1818,7 @@ int afs_fs_extend_lock(struct afs_server *server, int afs_fs_release_lock(struct afs_server *server, struct key *key, struct afs_vnode *vnode, - const struct afs_wait_mode *wait_mode) + bool async) { struct afs_call *call; __be32 *bp; @@ -1806,5 +1841,5 @@ int afs_fs_release_lock(struct afs_server *server, *bp++ = 
htonl(vnode->fid.vnode); *bp++ = htonl(vnode->fid.unique); - return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode); + return afs_make_call(&server->addr, call, GFP_NOFS, async); } diff --git a/fs/afs/internal.h b/fs/afs/internal.h index 535a38d2c1d0..8acf3670e756 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -19,6 +19,7 @@ #include <linux/sched.h> #include <linux/fscache.h> #include <linux/backing-dev.h> +#include <linux/uuid.h> #include <net/af_rxrpc.h> #include "afs.h" @@ -51,31 +52,22 @@ struct afs_mount_params { struct key *key; /* key to use for secure mounting */ }; -/* - * definition of how to wait for the completion of an operation - */ -struct afs_wait_mode { - /* RxRPC received message notification */ - rxrpc_notify_rx_t notify_rx; - - /* synchronous call waiter and call dispatched notification */ - int (*wait)(struct afs_call *call); - - /* asynchronous call completion */ - void (*async_complete)(void *reply, int error); +enum afs_call_state { + AFS_CALL_REQUESTING, /* request is being sent for outgoing call */ + AFS_CALL_AWAIT_REPLY, /* awaiting reply to outgoing call */ + AFS_CALL_AWAIT_OP_ID, /* awaiting op ID on incoming call */ + AFS_CALL_AWAIT_REQUEST, /* awaiting request data on incoming call */ + AFS_CALL_REPLYING, /* replying to incoming call */ + AFS_CALL_AWAIT_ACK, /* awaiting final ACK of incoming call */ + AFS_CALL_COMPLETE, /* Completed or failed */ }; - -extern const struct afs_wait_mode afs_sync_call; -extern const struct afs_wait_mode afs_async_call; - /* * a record of an in-progress RxRPC call */ struct afs_call { const struct afs_call_type *type; /* type of call */ - const struct afs_wait_mode *wait_mode; /* completion wait mode */ wait_queue_head_t waitq; /* processes awaiting completion */ - struct work_struct async_work; /* asynchronous work processor */ + struct work_struct async_work; /* async I/O processor */ struct work_struct work; /* actual work processor */ struct rxrpc_call *rxcall; /* RxRPC call handle 
*/ struct key *key; /* security for this call */ @@ -91,15 +83,8 @@ struct afs_call { pgoff_t first; /* first page in mapping to deal with */ pgoff_t last; /* last page in mapping to deal with */ size_t offset; /* offset into received data store */ - enum { /* call state */ - AFS_CALL_REQUESTING, /* request is being sent for outgoing call */ - AFS_CALL_AWAIT_REPLY, /* awaiting reply to outgoing call */ - AFS_CALL_AWAIT_OP_ID, /* awaiting op ID on incoming call */ - AFS_CALL_AWAIT_REQUEST, /* awaiting request data on incoming call */ - AFS_CALL_REPLYING, /* replying to incoming call */ - AFS_CALL_AWAIT_ACK, /* awaiting final ACK of incoming call */ - AFS_CALL_COMPLETE, /* Completed or failed */ - } state; + atomic_t usage; + enum afs_call_state state; int error; /* error code */ u32 abort_code; /* Remote abort ID or 0 */ unsigned request_size; /* size of request data */ @@ -110,6 +95,7 @@ struct afs_call { bool incoming; /* T if incoming call */ bool send_pages; /* T if data from mapping should be sent */ bool need_attention; /* T if RxRPC poked us */ + bool async; /* T if asynchronous */ u16 service_id; /* RxRPC service ID to call */ __be16 port; /* target UDP port */ u32 operation_ID; /* operation ID for an incoming call */ @@ -131,6 +117,25 @@ struct afs_call_type { /* clean up a call */ void (*destructor)(struct afs_call *call); + + /* Work function */ + void (*work)(struct work_struct *work); +}; + +/* + * Record of an outstanding read operation on a vnode. 
+ */ +struct afs_read { + loff_t pos; /* Where to start reading */ + loff_t len; /* How much to read */ + loff_t actual_len; /* How much we're actually getting */ + atomic_t usage; + unsigned int remain; /* Amount remaining */ + unsigned int index; /* Which page we're reading into */ + unsigned int pg_offset; /* Offset in page we're at */ + unsigned int nr_pages; + void (*page_done)(struct afs_call *, struct afs_read *); + struct page *pages[]; }; /* @@ -403,30 +408,6 @@ struct afs_interface { unsigned mtu; /* MTU of interface */ }; -/* - * UUID definition [internet draft] - * - the timestamp is a 60-bit value, split 32/16/12, and goes in 100ns - * increments since midnight 15th October 1582 - * - add AFS_UUID_TO_UNIX_TIME to convert unix time in 100ns units to UUID - * time - * - the clock sequence is a 14-bit counter to avoid duplicate times - */ -struct afs_uuid { - u32 time_low; /* low part of timestamp */ - u16 time_mid; /* mid part of timestamp */ - u16 time_hi_and_version; /* high part of timestamp and version */ -#define AFS_UUID_TO_UNIX_TIME 0x01b21dd213814000ULL -#define AFS_UUID_TIMEHI_MASK 0x0fff -#define AFS_UUID_VERSION_TIME 0x1000 /* time-based UUID */ -#define AFS_UUID_VERSION_NAME 0x3000 /* name-based UUID */ -#define AFS_UUID_VERSION_RANDOM 0x4000 /* (pseudo-)random generated UUID */ - u8 clock_seq_hi_and_reserved; /* clock seq hi and variant */ -#define AFS_UUID_CLOCKHI_MASK 0x3f -#define AFS_UUID_VARIANT_STD 0x80 - u8 clock_seq_low; /* clock seq low */ - u8 node[6]; /* spatially unique node ID (MAC addr) */ -}; - /*****************************************************************************/ /* * cache.c @@ -494,6 +475,7 @@ extern const struct file_operations afs_file_operations; extern int afs_open(struct inode *, struct file *); extern int afs_release(struct inode *, struct file *); extern int afs_page_filler(void *, struct page *); +extern void afs_put_read(struct afs_read *); /* * flock.c @@ -509,50 +491,37 @@ extern int afs_flock(struct 
file *, int, struct file_lock *); */ extern int afs_fs_fetch_file_status(struct afs_server *, struct key *, struct afs_vnode *, struct afs_volsync *, - const struct afs_wait_mode *); -extern int afs_fs_give_up_callbacks(struct afs_server *, - const struct afs_wait_mode *); + bool); +extern int afs_fs_give_up_callbacks(struct afs_server *, bool); extern int afs_fs_fetch_data(struct afs_server *, struct key *, - struct afs_vnode *, off_t, size_t, struct page *, - const struct afs_wait_mode *); + struct afs_vnode *, struct afs_read *, bool); extern int afs_fs_create(struct afs_server *, struct key *, struct afs_vnode *, const char *, umode_t, struct afs_fid *, struct afs_file_status *, - struct afs_callback *, - const struct afs_wait_mode *); + struct afs_callback *, bool); extern int afs_fs_remove(struct afs_server *, struct key *, - struct afs_vnode *, const char *, bool, - const struct afs_wait_mode *); + struct afs_vnode *, const char *, bool, bool); extern int afs_fs_link(struct afs_server *, struct key *, struct afs_vnode *, - struct afs_vnode *, const char *, - const struct afs_wait_mode *); + struct afs_vnode *, const char *, bool); extern int afs_fs_symlink(struct afs_server *, struct key *, struct afs_vnode *, const char *, const char *, - struct afs_fid *, struct afs_file_status *, - const struct afs_wait_mode *); + struct afs_fid *, struct afs_file_status *, bool); extern int afs_fs_rename(struct afs_server *, struct key *, struct afs_vnode *, const char *, - struct afs_vnode *, const char *, - const struct afs_wait_mode *); + struct afs_vnode *, const char *, bool); extern int afs_fs_store_data(struct afs_server *, struct afs_writeback *, - pgoff_t, pgoff_t, unsigned, unsigned, - const struct afs_wait_mode *); + pgoff_t, pgoff_t, unsigned, unsigned, bool); extern int afs_fs_setattr(struct afs_server *, struct key *, - struct afs_vnode *, struct iattr *, - const struct afs_wait_mode *); + struct afs_vnode *, struct iattr *, bool); extern int 
afs_fs_get_volume_status(struct afs_server *, struct key *, struct afs_vnode *, - struct afs_volume_status *, - const struct afs_wait_mode *); + struct afs_volume_status *, bool); extern int afs_fs_set_lock(struct afs_server *, struct key *, - struct afs_vnode *, afs_lock_type_t, - const struct afs_wait_mode *); + struct afs_vnode *, afs_lock_type_t, bool); extern int afs_fs_extend_lock(struct afs_server *, struct key *, - struct afs_vnode *, - const struct afs_wait_mode *); + struct afs_vnode *, bool); extern int afs_fs_release_lock(struct afs_server *, struct key *, - struct afs_vnode *, - const struct afs_wait_mode *); + struct afs_vnode *, bool); /* * inode.c @@ -573,7 +542,7 @@ extern int afs_drop_inode(struct inode *); * main.c */ extern struct workqueue_struct *afs_wq; -extern struct afs_uuid afs_uuid; +extern struct uuid_v1 afs_uuid; /* * misc.c @@ -592,6 +561,11 @@ extern int afs_mntpt_check_symlink(struct afs_vnode *, struct key *); extern void afs_mntpt_kill_timer(void); /* + * netdevices.c + */ +extern int afs_get_ipv4_interfaces(struct afs_interface *, size_t, bool); + +/* * proc.c */ extern int afs_proc_init(void); @@ -603,11 +577,13 @@ extern void afs_proc_cell_remove(struct afs_cell *); * rxrpc.c */ extern struct socket *afs_socket; +extern atomic_t afs_outstanding_calls; extern int afs_open_socket(void); extern void afs_close_socket(void); -extern int afs_make_call(struct in_addr *, struct afs_call *, gfp_t, - const struct afs_wait_mode *); +extern void afs_put_call(struct afs_call *); +extern int afs_queue_call_work(struct afs_call *); +extern int afs_make_call(struct in_addr *, struct afs_call *, gfp_t, bool); extern struct afs_call *afs_alloc_flat_call(const struct afs_call_type *, size_t, size_t); extern void afs_flat_call_destructor(struct afs_call *); @@ -653,21 +629,14 @@ extern int afs_fs_init(void); extern void afs_fs_exit(void); /* - * use-rtnetlink.c - */ -extern int afs_get_ipv4_interfaces(struct afs_interface *, size_t, bool); -extern 
int afs_get_MAC_address(u8 *, size_t); - -/* * vlclient.c */ extern int afs_vl_get_entry_by_name(struct in_addr *, struct key *, const char *, struct afs_cache_vlocation *, - const struct afs_wait_mode *); + bool); extern int afs_vl_get_entry_by_id(struct in_addr *, struct key *, afs_volid_t, afs_voltype_t, - struct afs_cache_vlocation *, - const struct afs_wait_mode *); + struct afs_cache_vlocation *, bool); /* * vlocation.c @@ -699,7 +668,7 @@ extern void afs_vnode_finalise_status_update(struct afs_vnode *, extern int afs_vnode_fetch_status(struct afs_vnode *, struct afs_vnode *, struct key *); extern int afs_vnode_fetch_data(struct afs_vnode *, struct key *, - off_t, size_t, struct page *); + struct afs_read *); extern int afs_vnode_create(struct afs_vnode *, struct key *, const char *, umode_t, struct afs_fid *, struct afs_file_status *, struct afs_callback *, struct afs_server **); @@ -756,6 +725,8 @@ extern int afs_fsync(struct file *, loff_t, loff_t, int); /* * debug tracing */ +#include <trace/events/afs.h> + extern unsigned afs_debug; #define dbgprintk(FMT,...) 
\ diff --git a/fs/afs/main.c b/fs/afs/main.c index 0b187ef3b5b7..51d7d17bca57 100644 --- a/fs/afs/main.c +++ b/fs/afs/main.c @@ -15,6 +15,7 @@ #include <linux/completion.h> #include <linux/sched.h> #include <linux/random.h> +#define CREATE_TRACE_POINTS #include "internal.h" MODULE_DESCRIPTION("AFS Client File System"); @@ -30,53 +31,10 @@ static char *rootcell; module_param(rootcell, charp, 0); MODULE_PARM_DESC(rootcell, "root AFS cell name and VL server IP addr list"); -struct afs_uuid afs_uuid; +struct uuid_v1 afs_uuid; struct workqueue_struct *afs_wq; /* - * get a client UUID - */ -static int __init afs_get_client_UUID(void) -{ - struct timespec ts; - u64 uuidtime; - u16 clockseq; - int ret; - - /* read the MAC address of one of the external interfaces and construct - * a UUID from it */ - ret = afs_get_MAC_address(afs_uuid.node, sizeof(afs_uuid.node)); - if (ret < 0) - return ret; - - getnstimeofday(&ts); - uuidtime = (u64) ts.tv_sec * 1000 * 1000 * 10; - uuidtime += ts.tv_nsec / 100; - uuidtime += AFS_UUID_TO_UNIX_TIME; - afs_uuid.time_low = uuidtime; - afs_uuid.time_mid = uuidtime >> 32; - afs_uuid.time_hi_and_version = (uuidtime >> 48) & AFS_UUID_TIMEHI_MASK; - afs_uuid.time_hi_and_version |= AFS_UUID_VERSION_TIME; - - get_random_bytes(&clockseq, 2); - afs_uuid.clock_seq_low = clockseq; - afs_uuid.clock_seq_hi_and_reserved = - (clockseq >> 8) & AFS_UUID_CLOCKHI_MASK; - afs_uuid.clock_seq_hi_and_reserved |= AFS_UUID_VARIANT_STD; - - _debug("AFS UUID: %08x-%04x-%04x-%02x%02x-%02x%02x%02x%02x%02x%02x", - afs_uuid.time_low, - afs_uuid.time_mid, - afs_uuid.time_hi_and_version, - afs_uuid.clock_seq_hi_and_reserved, - afs_uuid.clock_seq_low, - afs_uuid.node[0], afs_uuid.node[1], afs_uuid.node[2], - afs_uuid.node[3], afs_uuid.node[4], afs_uuid.node[5]); - - return 0; -} - -/* * initialise the AFS client FS module */ static int __init afs_init(void) @@ -85,9 +43,7 @@ static int __init afs_init(void) printk(KERN_INFO "kAFS: Red Hat AFS client v0.1 registering.\n"); - 
ret = afs_get_client_UUID(); - if (ret < 0) - return ret; + generate_random_uuid((unsigned char *)&afs_uuid); /* create workqueue */ ret = -ENOMEM; diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c index 81dd075356b9..d4fb0afc0097 100644 --- a/fs/afs/mntpt.c +++ b/fs/afs/mntpt.c @@ -202,7 +202,7 @@ static struct vfsmount *afs_mntpt_do_automount(struct dentry *mntpt) /* try and do the mount */ _debug("--- attempting mount %s -o %s ---", devname, options); - mnt = vfs_kern_mount(&afs_fs_type, 0, devname, options); + mnt = vfs_submount(mntpt, &afs_fs_type, devname, options); _debug("--- mount result %p ---", mnt); free_page((unsigned long) devname); diff --git a/fs/afs/netdevices.c b/fs/afs/netdevices.c index 7ad36506c256..40b2bab3e401 100644 --- a/fs/afs/netdevices.c +++ b/fs/afs/netdevices.c @@ -12,27 +12,6 @@ #include "internal.h" /* - * get a MAC address from a random ethernet interface that has a real one - * - the buffer will normally be 6 bytes in size - */ -int afs_get_MAC_address(u8 *mac, size_t maclen) -{ - struct net_device *dev; - int ret = -ENODEV; - - BUG_ON(maclen != ETH_ALEN); - - rtnl_lock(); - dev = __dev_getfirstbyhwtype(&init_net, ARPHRD_ETHER); - if (dev) { - memcpy(mac, dev->dev_addr, maclen); - ret = 0; - } - rtnl_unlock(); - return ret; -} - -/* * get a list of this system's interface IPv4 addresses, netmasks and MTUs * - maxbufs must be at least 1 * - returns the number of interface records in the buffer diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c index 25f05a8d21b1..95f42872b787 100644 --- a/fs/afs/rxrpc.c +++ b/fs/afs/rxrpc.c @@ -19,35 +19,16 @@ struct socket *afs_socket; /* my RxRPC socket */ static struct workqueue_struct *afs_async_calls; static struct afs_call *afs_spare_incoming_call; -static atomic_t afs_outstanding_calls; +atomic_t afs_outstanding_calls; -static void afs_free_call(struct afs_call *); static void afs_wake_up_call_waiter(struct sock *, struct rxrpc_call *, unsigned long); static int afs_wait_for_call_to_complete(struct 
afs_call *); static void afs_wake_up_async_call(struct sock *, struct rxrpc_call *, unsigned long); -static int afs_dont_wait_for_call_to_complete(struct afs_call *); static void afs_process_async_call(struct work_struct *); static void afs_rx_new_call(struct sock *, struct rxrpc_call *, unsigned long); static void afs_rx_discard_new_call(struct rxrpc_call *, unsigned long); static int afs_deliver_cm_op_id(struct afs_call *); -/* synchronous call management */ -const struct afs_wait_mode afs_sync_call = { - .notify_rx = afs_wake_up_call_waiter, - .wait = afs_wait_for_call_to_complete, -}; - -/* asynchronous call management */ -const struct afs_wait_mode afs_async_call = { - .notify_rx = afs_wake_up_async_call, - .wait = afs_dont_wait_for_call_to_complete, -}; - -/* asynchronous incoming call management */ -static const struct afs_wait_mode afs_async_incoming_call = { - .notify_rx = afs_wake_up_async_call, -}; - /* asynchronous incoming call initial processing */ static const struct afs_call_type afs_RXCMxxxx = { .name = "CB.xxxx", @@ -130,9 +111,11 @@ void afs_close_socket(void) { _enter(""); + kernel_listen(afs_socket, 0); + flush_workqueue(afs_async_calls); + if (afs_spare_incoming_call) { - atomic_inc(&afs_outstanding_calls); - afs_free_call(afs_spare_incoming_call); + afs_put_call(afs_spare_incoming_call); afs_spare_incoming_call = NULL; } @@ -141,7 +124,6 @@ void afs_close_socket(void) TASK_UNINTERRUPTIBLE); _debug("no outstanding calls"); - flush_workqueue(afs_async_calls); kernel_sock_shutdown(afs_socket, SHUT_RDWR); flush_workqueue(afs_async_calls); sock_release(afs_socket); @@ -152,44 +134,79 @@ void afs_close_socket(void) } /* - * free a call + * Allocate a call. 
*/ -static void afs_free_call(struct afs_call *call) +static struct afs_call *afs_alloc_call(const struct afs_call_type *type, + gfp_t gfp) { - _debug("DONE %p{%s} [%d]", - call, call->type->name, atomic_read(&afs_outstanding_calls)); + struct afs_call *call; + int o; - ASSERTCMP(call->rxcall, ==, NULL); - ASSERT(!work_pending(&call->async_work)); - ASSERT(call->type->name != NULL); + call = kzalloc(sizeof(*call), gfp); + if (!call) + return NULL; - kfree(call->request); - kfree(call); + call->type = type; + atomic_set(&call->usage, 1); + INIT_WORK(&call->async_work, afs_process_async_call); + init_waitqueue_head(&call->waitq); - if (atomic_dec_and_test(&afs_outstanding_calls)) - wake_up_atomic_t(&afs_outstanding_calls); + o = atomic_inc_return(&afs_outstanding_calls); + trace_afs_call(call, afs_call_trace_alloc, 1, o, + __builtin_return_address(0)); + return call; } /* - * End a call but do not free it + * Dispose of a reference on a call. */ -static void afs_end_call_nofree(struct afs_call *call) +void afs_put_call(struct afs_call *call) { - if (call->rxcall) { - rxrpc_kernel_end_call(afs_socket, call->rxcall); - call->rxcall = NULL; + int n = atomic_dec_return(&call->usage); + int o = atomic_read(&afs_outstanding_calls); + + trace_afs_call(call, afs_call_trace_put, n + 1, o, + __builtin_return_address(0)); + + ASSERTCMP(n, >=, 0); + if (n == 0) { + ASSERT(!work_pending(&call->async_work)); + ASSERT(call->type->name != NULL); + + if (call->rxcall) { + rxrpc_kernel_end_call(afs_socket, call->rxcall); + call->rxcall = NULL; + } + if (call->type->destructor) + call->type->destructor(call); + + kfree(call->request); + kfree(call); + + o = atomic_dec_return(&afs_outstanding_calls); + trace_afs_call(call, afs_call_trace_free, 0, o, + __builtin_return_address(0)); + if (o == 0) + wake_up_atomic_t(&afs_outstanding_calls); } - if (call->type->destructor) - call->type->destructor(call); } /* - * End a call and free it + * Queue the call for actual work. 
Returns 0 unconditionally for convenience. */ -static void afs_end_call(struct afs_call *call) +int afs_queue_call_work(struct afs_call *call) { - afs_end_call_nofree(call); - afs_free_call(call); + int u = atomic_inc_return(&call->usage); + + trace_afs_call(call, afs_call_trace_work, u, + atomic_read(&afs_outstanding_calls), + __builtin_return_address(0)); + + INIT_WORK(&call->work, call->type->work); + + if (!queue_work(afs_wq, &call->work)) + afs_put_call(call); + return 0; } /* @@ -200,25 +217,19 @@ struct afs_call *afs_alloc_flat_call(const struct afs_call_type *type, { struct afs_call *call; - call = kzalloc(sizeof(*call), GFP_NOFS); + call = afs_alloc_call(type, GFP_NOFS); if (!call) goto nomem_call; - _debug("CALL %p{%s} [%d]", - call, type->name, atomic_read(&afs_outstanding_calls)); - atomic_inc(&afs_outstanding_calls); - - call->type = type; - call->request_size = request_size; - call->reply_max = reply_max; - if (request_size) { + call->request_size = request_size; call->request = kmalloc(request_size, GFP_NOFS); if (!call->request) goto nomem_free; } if (reply_max) { + call->reply_max = reply_max; call->buffer = kmalloc(reply_max, GFP_NOFS); if (!call->buffer) goto nomem_free; @@ -228,7 +239,7 @@ struct afs_call *afs_alloc_flat_call(const struct afs_call_type *type, return call; nomem_free: - afs_free_call(call); + afs_put_call(call); nomem_call: return NULL; } @@ -315,7 +326,7 @@ static int afs_send_pages(struct afs_call *call, struct msghdr *msg, * initiate a call */ int afs_make_call(struct in_addr *addr, struct afs_call *call, gfp_t gfp, - const struct afs_wait_mode *wait_mode) + bool async) { struct sockaddr_rxrpc srx; struct rxrpc_call *rxcall; @@ -332,8 +343,7 @@ int afs_make_call(struct in_addr *addr, struct afs_call *call, gfp_t gfp, call, call->type->name, key_serial(call->key), atomic_read(&afs_outstanding_calls)); - call->wait_mode = wait_mode; - INIT_WORK(&call->async_work, afs_process_async_call); + call->async = async; memset(&srx, 0, 
sizeof(srx)); srx.srx_family = AF_RXRPC; @@ -347,7 +357,9 @@ int afs_make_call(struct in_addr *addr, struct afs_call *call, gfp_t gfp, /* create a call */ rxcall = rxrpc_kernel_begin_call(afs_socket, &srx, call->key, (unsigned long) call, gfp, - wait_mode->notify_rx); + (async ? + afs_wake_up_async_call : + afs_wake_up_call_waiter)); call->key = NULL; if (IS_ERR(rxcall)) { ret = PTR_ERR(rxcall); @@ -386,12 +398,15 @@ int afs_make_call(struct in_addr *addr, struct afs_call *call, gfp_t gfp, /* at this point, an async call may no longer exist as it may have * already completed */ - return wait_mode->wait(call); + if (call->async) + return -EINPROGRESS; + + return afs_wait_for_call_to_complete(call); error_do_abort: rxrpc_kernel_abort_call(afs_socket, rxcall, RX_USER_ABORT, -ret, "KSD"); error_kill_call: - afs_end_call(call); + afs_put_call(call); _leave(" = %d", ret); return ret; } @@ -416,6 +431,8 @@ static void afs_deliver_to_call(struct afs_call *call) ret = rxrpc_kernel_recv_data(afs_socket, call->rxcall, NULL, 0, &offset, false, &call->abort_code); + trace_afs_recv_data(call, 0, offset, false, ret); + if (ret == -EINPROGRESS || ret == -EAGAIN) return; if (ret == 1 || ret < 0) { @@ -459,7 +476,7 @@ static void afs_deliver_to_call(struct afs_call *call) done: if (call->state == AFS_CALL_COMPLETE && call->incoming) - afs_end_call(call); + afs_put_call(call); out: _leave(""); return; @@ -516,7 +533,7 @@ static int afs_wait_for_call_to_complete(struct afs_call *call) } _debug("call complete"); - afs_end_call(call); + afs_put_call(call); _leave(" = %d", ret); return ret; } @@ -540,24 +557,25 @@ static void afs_wake_up_async_call(struct sock *sk, struct rxrpc_call *rxcall, unsigned long call_user_ID) { struct afs_call *call = (struct afs_call *)call_user_ID; + int u; + trace_afs_notify_call(rxcall, call); call->need_attention = true; - queue_work(afs_async_calls, &call->async_work); -} -/* - * put a call into asynchronous mode - * - mustn't touch the call descriptor as 
the call my have completed by the - * time we get here - */ -static int afs_dont_wait_for_call_to_complete(struct afs_call *call) -{ - _enter(""); - return -EINPROGRESS; + u = __atomic_add_unless(&call->usage, 1, 0); + if (u != 0) { + trace_afs_call(call, afs_call_trace_wake, u, + atomic_read(&afs_outstanding_calls), + __builtin_return_address(0)); + + if (!queue_work(afs_async_calls, &call->async_work)) + afs_put_call(call); + } } /* - * delete an asynchronous call + * Delete an asynchronous call. The work item carries a ref to the call struct + * that we need to release. */ static void afs_delete_async_call(struct work_struct *work) { @@ -565,13 +583,14 @@ static void afs_delete_async_call(struct work_struct *work) _enter(""); - afs_free_call(call); + afs_put_call(call); _leave(""); } /* - * perform processing on an asynchronous call + * Perform I/O processing on an asynchronous call. The work item carries a ref + * to the call struct that we either need to release or to pass on. */ static void afs_process_async_call(struct work_struct *work) { @@ -584,21 +603,19 @@ static void afs_process_async_call(struct work_struct *work) afs_deliver_to_call(call); } - if (call->state == AFS_CALL_COMPLETE && call->wait_mode) { - if (call->wait_mode->async_complete) - call->wait_mode->async_complete(call->reply, - call->error); + if (call->state == AFS_CALL_COMPLETE) { call->reply = NULL; - /* kill the call */ - afs_end_call_nofree(call); - - /* we can't just delete the call because the work item may be - * queued */ + /* We have two refs to release - one from the alloc and one + * queued with the work item - and we can't just deallocate the + * call because the work item may be queued again. 
+ */ call->async_work.func = afs_delete_async_call; - queue_work(afs_async_calls, &call->async_work); + if (!queue_work(afs_async_calls, &call->async_work)) + afs_put_call(call); } + afs_put_call(call); _leave(""); } @@ -618,15 +635,13 @@ static void afs_charge_preallocation(struct work_struct *work) for (;;) { if (!call) { - call = kzalloc(sizeof(struct afs_call), GFP_KERNEL); + call = afs_alloc_call(&afs_RXCMxxxx, GFP_KERNEL); if (!call) break; - INIT_WORK(&call->async_work, afs_process_async_call); - call->wait_mode = &afs_async_incoming_call; - call->type = &afs_RXCMxxxx; - init_waitqueue_head(&call->waitq); + call->async = true; call->state = AFS_CALL_AWAIT_OP_ID; + init_waitqueue_head(&call->waitq); } if (rxrpc_kernel_charge_accept(afs_socket, @@ -648,9 +663,8 @@ static void afs_rx_discard_new_call(struct rxrpc_call *rxcall, { struct afs_call *call = (struct afs_call *)user_call_ID; - atomic_inc(&afs_outstanding_calls); call->rxcall = NULL; - afs_free_call(call); + afs_put_call(call); } /* @@ -659,7 +673,6 @@ static void afs_rx_discard_new_call(struct rxrpc_call *rxcall, static void afs_rx_new_call(struct sock *sk, struct rxrpc_call *rxcall, unsigned long user_call_ID) { - atomic_inc(&afs_outstanding_calls); queue_work(afs_wq, &afs_charge_preallocation_work); } @@ -689,6 +702,8 @@ static int afs_deliver_cm_op_id(struct afs_call *call) if (!afs_cm_incoming_call(call)) return -ENOTSUPP; + trace_afs_cb_call(call); + /* pass responsibility for the remainer of this message off to the * cache manager op */ return call->type->deliver(call); @@ -721,7 +736,6 @@ void afs_send_empty_reply(struct afs_call *call) rxrpc_kernel_abort_call(afs_socket, call->rxcall, RX_USER_ABORT, ENOMEM, "KOO"); default: - afs_end_call(call); _leave(" [error]"); return; } @@ -760,7 +774,6 @@ void afs_send_simple_reply(struct afs_call *call, const void *buf, size_t len) rxrpc_kernel_abort_call(afs_socket, call->rxcall, RX_USER_ABORT, ENOMEM, "KOO"); } - afs_end_call(call); _leave(" 
[error]"); } @@ -780,6 +793,7 @@ int afs_extract_data(struct afs_call *call, void *buf, size_t count, ret = rxrpc_kernel_recv_data(afs_socket, call->rxcall, buf, count, &call->offset, want_more, &call->abort_code); + trace_afs_recv_data(call, count, call->offset, want_more, ret); if (ret == 0 || ret == -EAGAIN) return ret; diff --git a/fs/afs/vlclient.c b/fs/afs/vlclient.c index 94bcd97d22b8..a5e4cc561b6c 100644 --- a/fs/afs/vlclient.c +++ b/fs/afs/vlclient.c @@ -147,7 +147,7 @@ int afs_vl_get_entry_by_name(struct in_addr *addr, struct key *key, const char *volname, struct afs_cache_vlocation *entry, - const struct afs_wait_mode *wait_mode) + bool async) { struct afs_call *call; size_t volnamesz, reqsz, padsz; @@ -177,7 +177,7 @@ int afs_vl_get_entry_by_name(struct in_addr *addr, memset((void *) bp + volnamesz, 0, padsz); /* initiate the call */ - return afs_make_call(addr, call, GFP_KERNEL, wait_mode); + return afs_make_call(addr, call, GFP_KERNEL, async); } /* @@ -188,7 +188,7 @@ int afs_vl_get_entry_by_id(struct in_addr *addr, afs_volid_t volid, afs_voltype_t voltype, struct afs_cache_vlocation *entry, - const struct afs_wait_mode *wait_mode) + bool async) { struct afs_call *call; __be32 *bp; @@ -211,5 +211,5 @@ int afs_vl_get_entry_by_id(struct in_addr *addr, *bp = htonl(voltype); /* initiate the call */ - return afs_make_call(addr, call, GFP_KERNEL, wait_mode); + return afs_make_call(addr, call, GFP_KERNEL, async); } diff --git a/fs/afs/vlocation.c b/fs/afs/vlocation.c index 45a86396fd2d..d7d8dd8c0b31 100644 --- a/fs/afs/vlocation.c +++ b/fs/afs/vlocation.c @@ -53,7 +53,7 @@ static int afs_vlocation_access_vl_by_name(struct afs_vlocation *vl, /* attempt to access the VL server */ ret = afs_vl_get_entry_by_name(&addr, key, vl->vldb.name, vldb, - &afs_sync_call); + false); switch (ret) { case 0: goto out; @@ -111,7 +111,7 @@ static int afs_vlocation_access_vl_by_id(struct afs_vlocation *vl, /* attempt to access the VL server */ ret = 
afs_vl_get_entry_by_id(&addr, key, volid, voltype, vldb, - &afs_sync_call); + false); switch (ret) { case 0: goto out; diff --git a/fs/afs/vnode.c b/fs/afs/vnode.c index 25cf4c3f4ff7..dcb956143c86 100644 --- a/fs/afs/vnode.c +++ b/fs/afs/vnode.c @@ -358,7 +358,7 @@ get_anyway: server, ntohl(server->addr.s_addr)); ret = afs_fs_fetch_file_status(server, key, vnode, NULL, - &afs_sync_call); + false); } while (!afs_volume_release_fileserver(vnode, server, ret)); @@ -393,7 +393,7 @@ no_server: * - TODO implement caching */ int afs_vnode_fetch_data(struct afs_vnode *vnode, struct key *key, - off_t offset, size_t length, struct page *page) + struct afs_read *desc) { struct afs_server *server; int ret; @@ -420,8 +420,8 @@ int afs_vnode_fetch_data(struct afs_vnode *vnode, struct key *key, _debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr)); - ret = afs_fs_fetch_data(server, key, vnode, offset, length, - page, &afs_sync_call); + ret = afs_fs_fetch_data(server, key, vnode, desc, + false); } while (!afs_volume_release_fileserver(vnode, server, ret)); @@ -477,7 +477,7 @@ int afs_vnode_create(struct afs_vnode *vnode, struct key *key, _debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr)); ret = afs_fs_create(server, key, vnode, name, mode, newfid, - newstatus, newcb, &afs_sync_call); + newstatus, newcb, false); } while (!afs_volume_release_fileserver(vnode, server, ret)); @@ -533,7 +533,7 @@ int afs_vnode_remove(struct afs_vnode *vnode, struct key *key, const char *name, _debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr)); ret = afs_fs_remove(server, key, vnode, name, isdir, - &afs_sync_call); + false); } while (!afs_volume_release_fileserver(vnode, server, ret)); @@ -595,7 +595,7 @@ int afs_vnode_link(struct afs_vnode *dvnode, struct afs_vnode *vnode, _debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr)); ret = afs_fs_link(server, key, dvnode, vnode, name, - &afs_sync_call); + false); } while (!afs_volume_release_fileserver(dvnode, server, ret)); @@ 
-659,7 +659,7 @@ int afs_vnode_symlink(struct afs_vnode *vnode, struct key *key, _debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr)); ret = afs_fs_symlink(server, key, vnode, name, content, - newfid, newstatus, &afs_sync_call); + newfid, newstatus, false); } while (!afs_volume_release_fileserver(vnode, server, ret)); @@ -729,7 +729,7 @@ int afs_vnode_rename(struct afs_vnode *orig_dvnode, _debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr)); ret = afs_fs_rename(server, key, orig_dvnode, orig_name, - new_dvnode, new_name, &afs_sync_call); + new_dvnode, new_name, false); } while (!afs_volume_release_fileserver(orig_dvnode, server, ret)); @@ -795,7 +795,7 @@ int afs_vnode_store_data(struct afs_writeback *wb, pgoff_t first, pgoff_t last, _debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr)); ret = afs_fs_store_data(server, wb, first, last, offset, to, - &afs_sync_call); + false); } while (!afs_volume_release_fileserver(vnode, server, ret)); @@ -847,7 +847,7 @@ int afs_vnode_setattr(struct afs_vnode *vnode, struct key *key, _debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr)); - ret = afs_fs_setattr(server, key, vnode, attr, &afs_sync_call); + ret = afs_fs_setattr(server, key, vnode, attr, false); } while (!afs_volume_release_fileserver(vnode, server, ret)); @@ -894,7 +894,7 @@ int afs_vnode_get_volume_status(struct afs_vnode *vnode, struct key *key, _debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr)); - ret = afs_fs_get_volume_status(server, key, vnode, vs, &afs_sync_call); + ret = afs_fs_get_volume_status(server, key, vnode, vs, false); } while (!afs_volume_release_fileserver(vnode, server, ret)); @@ -933,7 +933,7 @@ int afs_vnode_set_lock(struct afs_vnode *vnode, struct key *key, _debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr)); - ret = afs_fs_set_lock(server, key, vnode, type, &afs_sync_call); + ret = afs_fs_set_lock(server, key, vnode, type, false); } while (!afs_volume_release_fileserver(vnode, server, ret)); @@ -971,7 
+971,7 @@ int afs_vnode_extend_lock(struct afs_vnode *vnode, struct key *key) _debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr)); - ret = afs_fs_extend_lock(server, key, vnode, &afs_sync_call); + ret = afs_fs_extend_lock(server, key, vnode, false); } while (!afs_volume_release_fileserver(vnode, server, ret)); @@ -1009,7 +1009,7 @@ int afs_vnode_release_lock(struct afs_vnode *vnode, struct key *key) _debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr)); - ret = afs_fs_release_lock(server, key, vnode, &afs_sync_call); + ret = afs_fs_release_lock(server, key, vnode, false); } while (!afs_volume_release_fileserver(vnode, server, ret)); diff --git a/fs/afs/volume.c b/fs/afs/volume.c index d142a2449e65..546f9d01710b 100644 --- a/fs/afs/volume.c +++ b/fs/afs/volume.c @@ -106,6 +106,7 @@ struct afs_volume *afs_volume_lookup(struct afs_mount_params *params) volume->cell = params->cell; volume->vid = vlocation->vldb.vid[params->type]; + volume->bdi.ra_pages = VM_MAX_READAHEAD*1024/PAGE_SIZE; ret = bdi_setup_and_register(&volume->bdi, "afs"); if (ret) goto error_bdi; diff --git a/fs/afs/write.c b/fs/afs/write.c index f865c3f05bea..c83c1a0e851f 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -86,19 +86,30 @@ void afs_put_writeback(struct afs_writeback *wb) static int afs_fill_page(struct afs_vnode *vnode, struct key *key, loff_t pos, struct page *page) { + struct afs_read *req; loff_t i_size; int ret; - int len; _enter(",,%llu", (unsigned long long)pos); + req = kzalloc(sizeof(struct afs_read) + sizeof(struct page *), + GFP_KERNEL); + if (!req) + return -ENOMEM; + + atomic_set(&req->usage, 1); + req->pos = pos; + req->nr_pages = 1; + req->pages[0] = page; + i_size = i_size_read(&vnode->vfs_inode); if (pos + PAGE_SIZE > i_size) - len = i_size - pos; + req->len = i_size - pos; else - len = PAGE_SIZE; + req->len = PAGE_SIZE; - ret = afs_vnode_fetch_data(vnode, key, pos, len, page); + ret = afs_vnode_fetch_data(vnode, key, req); + afs_put_read(req); if (ret < 0) { 
if (ret == -ENOENT) { _debug("got NOENT from server" @@ -1085,7 +1085,8 @@ static void aio_complete(struct kiocb *kiocb, long res, long res2) * Tell lockdep we inherited freeze protection from submission * thread. */ - __sb_writers_acquired(file_inode(file)->i_sb, SB_FREEZE_WRITE); + if (S_ISREG(file_inode(file)->i_mode)) + __sb_writers_acquired(file_inode(file)->i_sb, SB_FREEZE_WRITE); file_end_write(file); } @@ -1525,7 +1526,8 @@ static ssize_t aio_write(struct kiocb *req, struct iocb *iocb, bool vectored, * by telling it the lock got released so that it doesn't * complain about held lock when we return to userspace. */ - __sb_writers_release(file_inode(file)->i_sb, SB_FREEZE_WRITE); + if (S_ISREG(file_inode(file)->i_mode)) + __sb_writers_release(file_inode(file)->i_sb, SB_FREEZE_WRITE); } kfree(iovec); return ret; diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c index 1278335ce366..79fbd85db4ba 100644 --- a/fs/autofs4/waitq.c +++ b/fs/autofs4/waitq.c @@ -436,8 +436,8 @@ int autofs4_wait(struct autofs_sb_info *sbi, memcpy(&wq->name, &qstr, sizeof(struct qstr)); wq->dev = autofs4_get_dev(sbi); wq->ino = autofs4_get_ino(sbi); - wq->uid = current_real_cred()->uid; - wq->gid = current_real_cred()->gid; + wq->uid = current_cred()->uid; + wq->gid = current_cred()->gid; wq->pid = pid; wq->tgid = tgid; wq->status = -EINTR; /* Status return if interrupted */ diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 29a02daf08a9..443a6f537d56 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -91,12 +91,18 @@ static struct linux_binfmt elf_format = { #define BAD_ADDR(x) ((unsigned long)(x) >= TASK_SIZE) -static int set_brk(unsigned long start, unsigned long end) +static int set_brk(unsigned long start, unsigned long end, int prot) { start = ELF_PAGEALIGN(start); end = ELF_PAGEALIGN(end); if (end > start) { - int error = vm_brk(start, end - start); + /* + * Map the last of the bss segment. 
+ * If the header is requesting these pages to be + * executable, honour that (ppc32 needs this). + */ + int error = vm_brk_flags(start, end - start, + prot & PROT_EXEC ? VM_EXEC : 0); if (error) return error; } @@ -524,6 +530,7 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex, unsigned long load_addr = 0; int load_addr_set = 0; unsigned long last_bss = 0, elf_bss = 0; + int bss_prot = 0; unsigned long error = ~0UL; unsigned long total_size; int i; @@ -606,8 +613,10 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex, * elf_bss and last_bss is the bss section. */ k = load_addr + eppnt->p_vaddr + eppnt->p_memsz; - if (k > last_bss) + if (k > last_bss) { last_bss = k; + bss_prot = elf_prot; + } } } @@ -623,13 +632,14 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex, /* * Next, align both the file and mem bss up to the page size, * since this is where elf_bss was just zeroed up to, and where - * last_bss will end after the vm_brk() below. + * last_bss will end after the vm_brk_flags() below. */ elf_bss = ELF_PAGEALIGN(elf_bss); last_bss = ELF_PAGEALIGN(last_bss); /* Finally, if there is still more bss to allocate, do it. */ if (last_bss > elf_bss) { - error = vm_brk(elf_bss, last_bss - elf_bss); + error = vm_brk_flags(elf_bss, last_bss - elf_bss, + bss_prot & PROT_EXEC ? VM_EXEC : 0); if (error) goto out; } @@ -674,6 +684,7 @@ static int load_elf_binary(struct linux_binprm *bprm) unsigned long error; struct elf_phdr *elf_ppnt, *elf_phdata, *interp_elf_phdata = NULL; unsigned long elf_bss, elf_brk; + int bss_prot = 0; int retval, i; unsigned long elf_entry; unsigned long interp_load_addr = 0; @@ -882,7 +893,8 @@ static int load_elf_binary(struct linux_binprm *bprm) before this one. Map anonymous pages, if needed, and clear the area. 
*/ retval = set_brk(elf_bss + load_bias, - elf_brk + load_bias); + elf_brk + load_bias, + bss_prot); if (retval) goto out_free_dentry; nbyte = ELF_PAGEOFFSET(elf_bss); @@ -976,8 +988,10 @@ static int load_elf_binary(struct linux_binprm *bprm) if (end_data < k) end_data = k; k = elf_ppnt->p_vaddr + elf_ppnt->p_memsz; - if (k > elf_brk) + if (k > elf_brk) { + bss_prot = elf_prot; elf_brk = k; + } } loc->elf_ex.e_entry += load_bias; @@ -993,7 +1007,7 @@ static int load_elf_binary(struct linux_binprm *bprm) * mapping in the interpreter, to make sure it doesn't wind * up getting placed where the bss needs to go. */ - retval = set_brk(elf_bss, elf_brk); + retval = set_brk(elf_bss, elf_brk, bss_prot); if (retval) goto out_free_dentry; if (likely(elf_bss != elf_brk) && unlikely(padzero(elf_bss))) { @@ -1428,17 +1442,18 @@ static void fill_prstatus(struct elf_prstatus *prstatus, * group-wide total, not its individual thread total. */ thread_group_cputime(p, &cputime); - cputime_to_timeval(cputime.utime, &prstatus->pr_utime); - cputime_to_timeval(cputime.stime, &prstatus->pr_stime); + prstatus->pr_utime = ns_to_timeval(cputime.utime); + prstatus->pr_stime = ns_to_timeval(cputime.stime); } else { - cputime_t utime, stime; + u64 utime, stime; task_cputime(p, &utime, &stime); - cputime_to_timeval(utime, &prstatus->pr_utime); - cputime_to_timeval(stime, &prstatus->pr_stime); + prstatus->pr_utime = ns_to_timeval(utime); + prstatus->pr_stime = ns_to_timeval(stime); } - cputime_to_timeval(p->signal->cutime, &prstatus->pr_cutime); - cputime_to_timeval(p->signal->cstime, &prstatus->pr_cstime); + + prstatus->pr_cutime = ns_to_timeval(p->signal->cutime); + prstatus->pr_cstime = ns_to_timeval(p->signal->cstime); } static int fill_psinfo(struct elf_prpsinfo *psinfo, struct task_struct *p, @@ -2298,6 +2313,7 @@ static int elf_core_dump(struct coredump_params *cprm) goto end_coredump; } } + dump_truncate(cprm); if (!elf_core_write_extra_data(cprm)) goto end_coredump; diff --git 
a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index d2e36f82c35d..ffca4bbc3d63 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -1349,17 +1349,17 @@ static void fill_prstatus(struct elf_prstatus *prstatus, * group-wide total, not its individual thread total. */ thread_group_cputime(p, &cputime); - cputime_to_timeval(cputime.utime, &prstatus->pr_utime); - cputime_to_timeval(cputime.stime, &prstatus->pr_stime); + prstatus->pr_utime = ns_to_timeval(cputime.utime); + prstatus->pr_stime = ns_to_timeval(cputime.stime); } else { - cputime_t utime, stime; + u64 utime, stime; task_cputime(p, &utime, &stime); - cputime_to_timeval(utime, &prstatus->pr_utime); - cputime_to_timeval(stime, &prstatus->pr_stime); + prstatus->pr_utime = ns_to_timeval(utime); + prstatus->pr_stime = ns_to_timeval(stime); } - cputime_to_timeval(p->signal->cutime, &prstatus->pr_cutime); - cputime_to_timeval(p->signal->cstime, &prstatus->pr_cstime); + prstatus->pr_cutime = ns_to_timeval(p->signal->cutime); + prstatus->pr_cstime = ns_to_timeval(p->signal->cstime); prstatus->pr_exec_fdpic_loadmap = p->mm->context.exec_fdpic_loadmap; prstatus->pr_interp_fdpic_loadmap = p->mm->context.interp_fdpic_loadmap; diff --git a/fs/block_dev.c b/fs/block_dev.c index 6254cee8f8f3..73031ec54a7b 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -328,9 +328,10 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages) struct file *file = iocb->ki_filp; struct inode *inode = bdev_file_inode(file); struct block_device *bdev = I_BDEV(inode); + struct blk_plug plug; struct blkdev_dio *dio; struct bio *bio; - bool is_read = (iov_iter_rw(iter) == READ); + bool is_read = (iov_iter_rw(iter) == READ), is_sync; loff_t pos = iocb->ki_pos; blk_qc_t qc = BLK_QC_T_NONE; int ret; @@ -343,7 +344,7 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages) bio_get(bio); /* extra ref for the completion handler */ dio = container_of(bio, struct blkdev_dio, bio); - dio->is_sync = 
is_sync_kiocb(iocb); + dio->is_sync = is_sync = is_sync_kiocb(iocb); if (dio->is_sync) dio->waiter = current; else @@ -353,6 +354,7 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages) dio->multi_bio = false; dio->should_dirty = is_read && (iter->type == ITER_IOVEC); + blk_start_plug(&plug); for (;;) { bio->bi_bdev = bdev; bio->bi_iter.bi_sector = pos >> 9; @@ -394,8 +396,9 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages) submit_bio(bio); bio = bio_alloc(GFP_KERNEL, nr_pages); } + blk_finish_plug(&plug); - if (!dio->is_sync) + if (!is_sync) return -EIOCBQUEUED; for (;;) { @@ -881,6 +884,8 @@ static void bdev_evict_inode(struct inode *inode) spin_lock(&bdev_lock); list_del_init(&bdev->bd_list); spin_unlock(&bdev_lock); + if (bdev->bd_bdi != &noop_backing_dev_info) + bdi_put(bdev->bd_bdi); } static const struct super_operations bdev_sops = { @@ -951,6 +956,21 @@ static int bdev_set(struct inode *inode, void *data) static LIST_HEAD(all_bdevs); +/* + * If there is a bdev inode for this device, unhash it so that it gets evicted + * as soon as last inode reference is dropped. 
+ */ +void bdev_unhash_inode(dev_t dev) +{ + struct inode *inode; + + inode = ilookup5(blockdev_superblock, hash(dev), bdev_test, &dev); + if (inode) { + remove_inode_hash(inode); + iput(inode); + } +} + struct block_device *bdget(dev_t dev) { struct block_device *bdev; @@ -968,6 +988,7 @@ struct block_device *bdget(dev_t dev) bdev->bd_contains = NULL; bdev->bd_super = NULL; bdev->bd_inode = inode; + bdev->bd_bdi = &noop_backing_dev_info; bdev->bd_block_size = (1 << inode->i_blkbits); bdev->bd_part_count = 0; bdev->bd_invalidated = 0; @@ -1524,6 +1545,8 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) bdev->bd_disk = disk; bdev->bd_queue = disk->queue; bdev->bd_contains = bdev; + if (bdev->bd_bdi == &noop_backing_dev_info) + bdev->bd_bdi = bdi_get(disk->queue->backing_dev_info); if (!partno) { ret = -ENXIO; @@ -1619,6 +1642,8 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) bdev->bd_disk = NULL; bdev->bd_part = NULL; bdev->bd_queue = NULL; + bdi_put(bdev->bd_bdi); + bdev->bd_bdi = &noop_backing_dev_info; if (bdev != bdev->bd_contains) __blkdev_put(bdev->bd_contains, mode, 1); bdev->bd_contains = NULL; diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index 63d197724519..ff0b0be92d61 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c @@ -273,6 +273,8 @@ static void run_ordered_work(struct __btrfs_workqueue *wq) unsigned long flags; while (1) { + void *wtag; + spin_lock_irqsave(lock, flags); if (list_empty(list)) break; @@ -299,11 +301,13 @@ static void run_ordered_work(struct __btrfs_workqueue *wq) spin_unlock_irqrestore(lock, flags); /* - * we don't want to call the ordered free functions - * with the lock held though + * We don't want to call the ordered free functions with the + * lock held though. Save the work as tag for the trace event, + * because the callback could free the structure. 
*/ + wtag = work; work->ordered_free(work); - trace_btrfs_all_work_done(work); + trace_btrfs_all_work_done(wq->fs_info, wtag); } spin_unlock_irqrestore(lock, flags); } @@ -311,6 +315,7 @@ static void run_ordered_work(struct __btrfs_workqueue *wq) static void normal_work_helper(struct btrfs_work *work) { struct __btrfs_workqueue *wq; + void *wtag; int need_order = 0; /* @@ -324,6 +329,8 @@ static void normal_work_helper(struct btrfs_work *work) if (work->ordered_func) need_order = 1; wq = work->wq; + /* Safe for tracepoints in case work gets freed by the callback */ + wtag = work; trace_btrfs_work_sched(work); thresh_exec_hook(wq); @@ -333,7 +340,7 @@ static void normal_work_helper(struct btrfs_work *work) run_ordered_work(wq); } if (!need_order) - trace_btrfs_all_work_done(work); + trace_btrfs_all_work_done(wq->fs_info, wtag); } void btrfs_init_work(struct btrfs_work *work, btrfs_work_func_t uniq_func, diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 7f390849343b..c4444d6f439f 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -1024,6 +1024,7 @@ int btrfs_decompress_buf2page(char *buf, unsigned long buf_start, unsigned long buf_offset; unsigned long current_buf_start; unsigned long start_byte; + unsigned long prev_start_byte; unsigned long working_bytes = total_out - buf_start; unsigned long bytes; char *kaddr; @@ -1071,26 +1072,34 @@ int btrfs_decompress_buf2page(char *buf, unsigned long buf_start, if (!bio->bi_iter.bi_size) return 0; bvec = bio_iter_iovec(bio, bio->bi_iter); - + prev_start_byte = start_byte; start_byte = page_offset(bvec.bv_page) - disk_start; /* - * make sure our new page is covered by this - * working buffer + * We need to make sure we're only adjusting + * our offset into compression working buffer when + * we're switching pages. Otherwise we can incorrectly + * keep copying when we were actually done. 
*/ - if (total_out <= start_byte) - return 1; + if (start_byte != prev_start_byte) { + /* + * make sure our new page is covered by this + * working buffer + */ + if (total_out <= start_byte) + return 1; - /* - * the next page in the biovec might not be adjacent - * to the last page, but it might still be found - * inside this working buffer. bump our offset pointer - */ - if (total_out > start_byte && - current_buf_start < start_byte) { - buf_offset = start_byte - buf_start; - working_bytes = total_out - start_byte; - current_buf_start = buf_start + buf_offset; + /* + * the next page in the biovec might not be adjacent + * to the last page, but it might still be found + * inside this working buffer. bump our offset pointer + */ + if (total_out > start_byte && + current_buf_start < start_byte) { + buf_offset = start_byte - buf_start; + working_bytes = total_out - start_byte; + current_buf_start = buf_start + buf_offset; + } } } diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 18004169552c..37a31b12bb0c 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1800,7 +1800,7 @@ static int btrfs_congested_fn(void *congested_data, int bdi_bits) list_for_each_entry_rcu(device, &info->fs_devices->devices, dev_list) { if (!device->bdev) continue; - bdi = blk_get_backing_dev_info(device->bdev); + bdi = device->bdev->bd_bdi; if (bdi_congested(bdi, bdi_bits)) { ret = 1; break; diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index e97302f437a1..dcd2e798767e 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2522,11 +2522,11 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, if (ref && ref->seq && btrfs_check_delayed_seq(fs_info, delayed_refs, ref->seq)) { spin_unlock(&locked_ref->lock); - btrfs_delayed_ref_unlock(locked_ref); spin_lock(&delayed_refs->lock); locked_ref->processing = 0; delayed_refs->num_heads_ready++; spin_unlock(&delayed_refs->lock); + btrfs_delayed_ref_unlock(locked_ref); locked_ref = 
NULL; cond_resched(); count++; @@ -2572,7 +2572,10 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, */ if (must_insert_reserved) locked_ref->must_insert_reserved = 1; + spin_lock(&delayed_refs->lock); locked_ref->processing = 0; + delayed_refs->num_heads_ready++; + spin_unlock(&delayed_refs->lock); btrfs_debug(fs_info, "run_delayed_extent_op returned %d", ret); @@ -7384,7 +7387,8 @@ btrfs_lock_cluster(struct btrfs_block_group_cache *block_group, spin_unlock(&cluster->refill_lock); - down_read(&used_bg->data_rwsem); + /* We should only have one-level nested. */ + down_read_nested(&used_bg->data_rwsem, SINGLE_DEPTH_NESTING); spin_lock(&cluster->refill_lock); if (used_bg == cluster->block_group) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index f2b281ad7af6..1e861a063721 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3835,10 +3835,7 @@ cache_acl: break; case S_IFDIR: inode->i_fop = &btrfs_dir_file_operations; - if (root == fs_info->tree_root) - inode->i_op = &btrfs_dir_ro_inode_operations; - else - inode->i_op = &btrfs_dir_inode_operations; + inode->i_op = &btrfs_dir_inode_operations; break; case S_IFLNK: inode->i_op = &btrfs_symlink_inode_operations; @@ -4505,8 +4502,19 @@ search_again: if (found_type > min_type) { del_item = 1; } else { - if (item_end < new_size) + if (item_end < new_size) { + /* + * With NO_HOLES mode, for the following mapping + * + * [0-4k][hole][8k-12k] + * + * if truncating isize down to 6k, it ends up + * isize being 8k. 
+ */ + if (btrfs_fs_incompat(root->fs_info, NO_HOLES)) + last_size = new_size; break; + } if (found_key.offset >= new_size) del_item = 1; else @@ -5710,6 +5718,7 @@ static struct inode *new_simple_dir(struct super_block *s, inode->i_ino = BTRFS_EMPTY_SUBVOL_DIR_OBJECTID; inode->i_op = &btrfs_dir_ro_inode_operations; + inode->i_opflags &= ~IOP_XATTR; inode->i_fop = &simple_dir_operations; inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO; inode->i_mtime = current_time(inode); @@ -7059,7 +7068,7 @@ insert: write_unlock(&em_tree->lock); out: - trace_btrfs_get_extent(root, em); + trace_btrfs_get_extent(root, inode, em); btrfs_free_path(path); if (trans) { @@ -7215,7 +7224,6 @@ static struct extent_map *btrfs_create_dio_extent(struct inode *inode, struct extent_map *em = NULL; int ret; - down_read(&BTRFS_I(inode)->dio_sem); if (type != BTRFS_ORDERED_NOCOW) { em = create_pinned_em(inode, start, len, orig_start, block_start, block_len, orig_block_len, @@ -7234,7 +7242,6 @@ static struct extent_map *btrfs_create_dio_extent(struct inode *inode, em = ERR_PTR(ret); } out: - up_read(&BTRFS_I(inode)->dio_sem); return em; } @@ -7623,11 +7630,18 @@ static void adjust_dio_outstanding_extents(struct inode *inode, * within our reservation, otherwise we need to adjust our inode * counter appropriately. */ - if (dio_data->outstanding_extents) { + if (dio_data->outstanding_extents >= num_extents) { dio_data->outstanding_extents -= num_extents; } else { + /* + * If dio write length has been split due to no large enough + * contiguous space, we need to compensate our inode counter + * appropriately. 
+ */ + u64 num_needed = num_extents - dio_data->outstanding_extents; + spin_lock(&BTRFS_I(inode)->lock); - BTRFS_I(inode)->outstanding_extents += num_extents; + BTRFS_I(inode)->outstanding_extents += num_needed; spin_unlock(&BTRFS_I(inode)->lock); } } @@ -8685,6 +8699,7 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) dio_data.unsubmitted_oe_range_start = (u64)offset; dio_data.unsubmitted_oe_range_end = (u64)offset; current->journal_info = &dio_data; + down_read(&BTRFS_I(inode)->dio_sem); } else if (test_bit(BTRFS_INODE_READDIO_NEED_LOCK, &BTRFS_I(inode)->runtime_flags)) { inode_dio_end(inode); @@ -8697,6 +8712,7 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) iter, btrfs_get_blocks_direct, NULL, btrfs_submit_direct, flags); if (iov_iter_rw(iter) == WRITE) { + up_read(&BTRFS_I(inode)->dio_sem); current->journal_info = NULL; if (ret < 0 && ret != -EIOCBQUEUED) { if (dio_data.reserve) @@ -9205,6 +9221,7 @@ static int btrfs_truncate(struct inode *inode) break; } + btrfs_block_rsv_release(fs_info, rsv, -1); ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, rsv, min_size, 0); BUG_ON(ret); /* shouldn't happen */ @@ -10572,8 +10589,6 @@ static const struct inode_operations btrfs_dir_inode_operations = { static const struct inode_operations btrfs_dir_ro_inode_operations = { .lookup = btrfs_lookup, .permission = btrfs_permission, - .get_acl = btrfs_get_acl, - .set_acl = btrfs_set_acl, .update_time = btrfs_update_time, }; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 33f967d30b2a..21e51b0ba188 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -5653,6 +5653,10 @@ long btrfs_ioctl(struct file *file, unsigned int #ifdef CONFIG_COMPAT long btrfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { + /* + * These all access 32-bit values anyway so no further + * handling is necessary. 
+ */ switch (cmd) { case FS_IOC32_GETFLAGS: cmd = FS_IOC_GETFLAGS; @@ -5663,8 +5667,6 @@ long btrfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case FS_IOC32_GETVERSION: cmd = FS_IOC_GETVERSION; break; - default: - return -ENOIOCTLCMD; } return btrfs_ioctl(file, cmd, (unsigned long) compat_ptr(arg)); diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index f10bf5213ed8..eeffff84f280 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -37,6 +37,7 @@ */ #define LOG_INODE_ALL 0 #define LOG_INODE_EXISTS 1 +#define LOG_OTHER_INODE 2 /* * directory trouble cases @@ -4641,7 +4642,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, if (S_ISDIR(inode->i_mode) || (!test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags) && - inode_only == LOG_INODE_EXISTS)) + inode_only >= LOG_INODE_EXISTS)) max_key.type = BTRFS_XATTR_ITEM_KEY; else max_key.type = (u8)-1; @@ -4665,7 +4666,13 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, return ret; } - mutex_lock(&BTRFS_I(inode)->log_mutex); + if (inode_only == LOG_OTHER_INODE) { + inode_only = LOG_INODE_EXISTS; + mutex_lock_nested(&BTRFS_I(inode)->log_mutex, + SINGLE_DEPTH_NESTING); + } else { + mutex_lock(&BTRFS_I(inode)->log_mutex); + } /* * a brute force approach to making sure we get the most uptodate @@ -4817,7 +4824,7 @@ again: * unpin it. 
*/ err = btrfs_log_inode(trans, root, other_inode, - LOG_INODE_EXISTS, + LOG_OTHER_INODE, 0, LLONG_MAX, ctx); iput(other_inode); if (err) diff --git a/fs/btrfs/uuid-tree.c b/fs/btrfs/uuid-tree.c index 161342b73ce5..726f928238d0 100644 --- a/fs/btrfs/uuid-tree.c +++ b/fs/btrfs/uuid-tree.c @@ -352,7 +352,5 @@ skip: out: btrfs_free_path(path); - if (ret) - btrfs_warn(fs_info, "btrfs_uuid_tree_iterate failed %d", ret); - return 0; + return ret; } diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 3c3c69c0eee4..b2e70073a10d 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -366,7 +366,7 @@ static noinline void run_scheduled_bios(struct btrfs_device *device) */ blk_start_plug(&plug); - bdi = blk_get_backing_dev_info(device->bdev); + bdi = device->bdev->bd_bdi; limit = btrfs_async_submit_limit(fs_info); limit = limit * 2 / 3; diff --git a/fs/buffer.c b/fs/buffer.c index d21771fcf7d3..0e87401cf335 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1660,7 +1660,7 @@ void clean_bdev_aliases(struct block_device *bdev, sector_t block, sector_t len) head = page_buffers(page); bh = head; do { - if (!buffer_mapped(bh)) + if (!buffer_mapped(bh) || (bh->b_blocknr < block)) goto next; if (bh->b_blocknr >= block + len) break; diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 9cd0c0ea7cdb..e4b066cd912a 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -502,9 +502,9 @@ static struct ceph_snap_context *get_oldest_context(struct inode *inode, dout(" head snapc %p has %d dirty pages\n", snapc, ci->i_wrbuffer_ref_head); if (truncate_size) - *truncate_size = capsnap->truncate_size; + *truncate_size = ci->i_truncate_size; if (truncate_seq) - *truncate_seq = capsnap->truncate_seq; + *truncate_seq = ci->i_truncate_seq; } spin_unlock(&ci->i_ceph_lock); return snapc; diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index baea866a6751..94fd76d04683 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -2591,8 +2591,13 @@ int ceph_get_caps(struct ceph_inode_info *ci, int need, int want, 
add_wait_queue(&ci->i_cap_wq, &wait); while (!try_get_cap_refs(ci, need, want, endoff, - true, &_got, &err)) + true, &_got, &err)) { + if (signal_pending(current)) { + ret = -ERESTARTSYS; + break; + } wait_woken(&wait, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); + } remove_wait_queue(&ci->i_cap_wq, &wait); diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index d7a93696663b..8ab1fdf0bd49 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -1230,7 +1230,8 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags) struct ceph_mds_client *mdsc = ceph_sb_to_client(dir->i_sb)->mdsc; struct ceph_mds_request *req; - int op, mask, err; + int op, err; + u32 mask; if (flags & LOOKUP_RCU) return -ECHILD; @@ -1245,7 +1246,7 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags) mask = CEPH_STAT_CAP_INODE | CEPH_CAP_AUTH_SHARED; if (ceph_security_xattr_wanted(dir)) mask |= CEPH_CAP_XATTR_SHARED; - req->r_args.getattr.mask = mask; + req->r_args.getattr.mask = cpu_to_le32(mask); err = ceph_mdsc_do_request(mdsc, NULL, req); switch (err) { diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 398e5328b309..5e659d054b40 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -305,7 +305,8 @@ static int frag_tree_split_cmp(const void *l, const void *r) { struct ceph_frag_tree_split *ls = (struct ceph_frag_tree_split*)l; struct ceph_frag_tree_split *rs = (struct ceph_frag_tree_split*)r; - return ceph_frag_compare(ls->frag, rs->frag); + return ceph_frag_compare(le32_to_cpu(ls->frag), + le32_to_cpu(rs->frag)); } static bool is_frag_child(u32 f, struct ceph_inode_frag *frag) diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 4f49253387a0..c9d2e553a6c4 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -288,12 +288,13 @@ static int parse_reply_info_extra(void **p, void *end, struct ceph_mds_reply_info_parsed *info, u64 features) { - if (info->head->op == CEPH_MDS_OP_GETFILELOCK) + u32 op = le32_to_cpu(info->head->op); + + if (op == 
CEPH_MDS_OP_GETFILELOCK) return parse_reply_info_filelock(p, end, info, features); - else if (info->head->op == CEPH_MDS_OP_READDIR || - info->head->op == CEPH_MDS_OP_LSSNAP) + else if (op == CEPH_MDS_OP_READDIR || op == CEPH_MDS_OP_LSSNAP) return parse_reply_info_dir(p, end, info, features); - else if (info->head->op == CEPH_MDS_OP_CREATE) + else if (op == CEPH_MDS_OP_CREATE) return parse_reply_info_create(p, end, info, features); else return -EIO; @@ -2106,6 +2107,11 @@ static int __do_request(struct ceph_mds_client *mdsc, dout("do_request mdsmap err %d\n", err); goto finish; } + if (mdsc->mdsmap->m_epoch == 0) { + dout("do_request no mdsmap, waiting for map\n"); + list_add(&req->r_wait, &mdsc->waiting_for_map); + goto finish; + } if (!(mdsc->fsc->mount_options->flags & CEPH_MOUNT_OPT_MOUNTWAIT) && !ceph_mdsmap_is_cluster_available(mdsc->mdsmap)) { diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig index e7b478b49985..034f00f21390 100644 --- a/fs/cifs/Kconfig +++ b/fs/cifs/Kconfig @@ -9,8 +9,6 @@ config CIFS select CRYPTO_ARC4 select CRYPTO_ECB select CRYPTO_DES - select CRYPTO_SHA256 - select CRYPTO_CMAC help This is the client VFS module for the Common Internet File System (CIFS) protocol which is the successor to the Server Message Block @@ -169,11 +167,15 @@ config CIFS_NFSD_EXPORT config CIFS_SMB2 bool "SMB2 and SMB3 network file system support" - depends on CIFS && INET - select NLS + depends on CIFS select KEYS select FSCACHE select DNS_RESOLVER + select CRYPTO_AES + select CRYPTO_SHA256 + select CRYPTO_CMAC + select CRYPTO_AEAD2 + select CRYPTO_CCM help This enables support for the Server Message Block version 2 @@ -194,7 +196,7 @@ config CIFS_SMB2 config CIFS_SMB311 bool "SMB3.1.1 network file system support (Experimental)" - depends on CIFS_SMB2 && INET + depends on CIFS_SMB2 help This enables experimental support for the newest, SMB3.1.1, dialect. 
diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c index ec9dbbcca3b9..9156be545b0f 100644 --- a/fs/cifs/cifs_dfs_ref.c +++ b/fs/cifs/cifs_dfs_ref.c @@ -245,7 +245,8 @@ compose_mount_options_err: * @fullpath: full path in UNC format * @ref: server's referral */ -static struct vfsmount *cifs_dfs_do_refmount(struct cifs_sb_info *cifs_sb, +static struct vfsmount *cifs_dfs_do_refmount(struct dentry *mntpt, + struct cifs_sb_info *cifs_sb, const char *fullpath, const struct dfs_info3_param *ref) { struct vfsmount *mnt; @@ -259,7 +260,7 @@ static struct vfsmount *cifs_dfs_do_refmount(struct cifs_sb_info *cifs_sb, if (IS_ERR(mountdata)) return (struct vfsmount *)mountdata; - mnt = vfs_kern_mount(&cifs_fs_type, 0, devname, mountdata); + mnt = vfs_submount(mntpt, &cifs_fs_type, devname, mountdata); kfree(mountdata); kfree(devname); return mnt; @@ -334,7 +335,7 @@ static struct vfsmount *cifs_dfs_do_automount(struct dentry *mntpt) mnt = ERR_PTR(-EINVAL); break; } - mnt = cifs_dfs_do_refmount(cifs_sb, + mnt = cifs_dfs_do_refmount(mntpt, cifs_sb, full_path, referrals + i); cifs_dbg(FYI, "%s: cifs_dfs_do_refmount:%s , mnt:%p\n", __func__, referrals[i].node_name, mnt); diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c index 66bd7fa9b7a6..058ac9b36f04 100644 --- a/fs/cifs/cifsencrypt.c +++ b/fs/cifs/cifsencrypt.c @@ -34,6 +34,7 @@ #include <linux/random.h> #include <linux/highmem.h> #include <crypto/skcipher.h> +#include <crypto/aead.h> static int cifs_crypto_shash_md5_allocate(struct TCP_Server_Info *server) @@ -75,24 +76,20 @@ int __cifs_calc_signature(struct smb_rqst *rqst, struct kvec *iov = rqst->rq_iov; int n_vec = rqst->rq_nvec; - for (i = 0; i < n_vec; i++) { + if (n_vec < 2 || iov[0].iov_len != 4) + return -EIO; + + for (i = 1; i < n_vec; i++) { if (iov[i].iov_len == 0) continue; if (iov[i].iov_base == NULL) { cifs_dbg(VFS, "null iovec entry\n"); return -EIO; } - /* The first entry includes a length field (which does not get - signed that occupies the 
first 4 bytes before the header */ - if (i == 0) { - if (iov[0].iov_len <= 8) /* cmd field at offset 9 */ - break; /* nothing to sign or corrupt header */ - rc = crypto_shash_update(shash, - iov[i].iov_base + 4, iov[i].iov_len - 4); - } else { - rc = crypto_shash_update(shash, - iov[i].iov_base, iov[i].iov_len); - } + if (i == 1 && iov[1].iov_len <= 4) + break; /* nothing to sign or corrupt header */ + rc = crypto_shash_update(shash, + iov[i].iov_base, iov[i].iov_len); if (rc) { cifs_dbg(VFS, "%s: Could not update with payload\n", __func__); @@ -168,6 +165,10 @@ int cifs_sign_rqst(struct smb_rqst *rqst, struct TCP_Server_Info *server, char smb_signature[20]; struct smb_hdr *cifs_pdu = (struct smb_hdr *)rqst->rq_iov[0].iov_base; + if (rqst->rq_iov[0].iov_len != 4 || + rqst->rq_iov[0].iov_base + 4 != rqst->rq_iov[1].iov_base) + return -EIO; + if ((cifs_pdu == NULL) || (server == NULL)) return -EINVAL; @@ -209,12 +210,14 @@ int cifs_sign_smbv(struct kvec *iov, int n_vec, struct TCP_Server_Info *server, int cifs_sign_smb(struct smb_hdr *cifs_pdu, struct TCP_Server_Info *server, __u32 *pexpected_response_sequence_number) { - struct kvec iov; + struct kvec iov[2]; - iov.iov_base = cifs_pdu; - iov.iov_len = be32_to_cpu(cifs_pdu->smb_buf_length) + 4; + iov[0].iov_base = cifs_pdu; + iov[0].iov_len = 4; + iov[1].iov_base = (char *)cifs_pdu + 4; + iov[1].iov_len = be32_to_cpu(cifs_pdu->smb_buf_length); - return cifs_sign_smbv(&iov, 1, server, + return cifs_sign_smbv(iov, 2, server, pexpected_response_sequence_number); } @@ -227,6 +230,10 @@ int cifs_verify_signature(struct smb_rqst *rqst, char what_we_think_sig_should_be[20]; struct smb_hdr *cifs_pdu = (struct smb_hdr *)rqst->rq_iov[0].iov_base; + if (rqst->rq_iov[0].iov_len != 4 || + rqst->rq_iov[0].iov_base + 4 != rqst->rq_iov[1].iov_base) + return -EIO; + if (cifs_pdu == NULL || server == NULL) return -EINVAL; @@ -868,7 +875,7 @@ out: } void -cifs_crypto_shash_release(struct TCP_Server_Info *server) 
+cifs_crypto_secmech_release(struct TCP_Server_Info *server) { if (server->secmech.cmacaes) { crypto_free_shash(server->secmech.cmacaes); @@ -890,6 +897,16 @@ cifs_crypto_shash_release(struct TCP_Server_Info *server) server->secmech.hmacmd5 = NULL; } + if (server->secmech.ccmaesencrypt) { + crypto_free_aead(server->secmech.ccmaesencrypt); + server->secmech.ccmaesencrypt = NULL; + } + + if (server->secmech.ccmaesdecrypt) { + crypto_free_aead(server->secmech.ccmaesdecrypt); + server->secmech.ccmaesdecrypt = NULL; + } + kfree(server->secmech.sdesccmacaes); server->secmech.sdesccmacaes = NULL; kfree(server->secmech.sdeschmacsha256); diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 70f4e65fced2..15e1db8738ae 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -1365,5 +1365,19 @@ MODULE_DESCRIPTION ("VFS to access servers complying with the SNIA CIFS Specification " "e.g. Samba and Windows"); MODULE_VERSION(CIFS_VERSION); +MODULE_SOFTDEP("pre: arc4"); +MODULE_SOFTDEP("pre: des"); +MODULE_SOFTDEP("pre: ecb"); +MODULE_SOFTDEP("pre: hmac"); +MODULE_SOFTDEP("pre: md4"); +MODULE_SOFTDEP("pre: md5"); +MODULE_SOFTDEP("pre: nls"); +#ifdef CONFIG_CIFS_SMB2 +MODULE_SOFTDEP("pre: aes"); +MODULE_SOFTDEP("pre: cmac"); +MODULE_SOFTDEP("pre: sha256"); +MODULE_SOFTDEP("pre: aead2"); +MODULE_SOFTDEP("pre: ccm"); +#endif /* CONFIG_CIFS_SMB2 */ module_init(init_cifs) module_exit(exit_cifs) diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 7ea8a3393936..1a90bb3e2986 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -136,6 +136,8 @@ struct cifs_secmech { struct sdesc *sdescmd5; /* ctxt to generate cifs/smb signature */ struct sdesc *sdeschmacsha256; /* ctxt to generate smb2 signature */ struct sdesc *sdesccmacaes; /* ctxt to generate smb3 signature */ + struct crypto_aead *ccmaesencrypt; /* smb3 encryption aead */ + struct crypto_aead *ccmaesdecrypt; /* smb3 decryption aead */ }; /* per smb session structure/fields */ @@ -208,7 +210,7 @@ struct cifsInodeInfo; 
struct cifs_open_parms; struct smb_version_operations { - int (*send_cancel)(struct TCP_Server_Info *, void *, + int (*send_cancel)(struct TCP_Server_Info *, struct smb_rqst *, struct mid_q_entry *); bool (*compare_fids)(struct cifsFileInfo *, struct cifsFileInfo *); /* setup request: allocate mid, sign message */ @@ -433,6 +435,14 @@ struct smb_version_operations { bool (*dir_needs_close)(struct cifsFileInfo *); long (*fallocate)(struct file *, struct cifs_tcon *, int, loff_t, loff_t); + /* init transform request - used for encryption for now */ + int (*init_transform_rq)(struct TCP_Server_Info *, struct smb_rqst *, + struct smb_rqst *); + /* free transform request */ + void (*free_transform_rq)(struct smb_rqst *); + int (*is_transform_hdr)(void *buf); + int (*receive_transform)(struct TCP_Server_Info *, + struct mid_q_entry **); }; struct smb_version_values { @@ -1119,7 +1129,10 @@ struct cifs_readdata { int (*read_into_pages)(struct TCP_Server_Info *server, struct cifs_readdata *rdata, unsigned int len); - struct kvec iov; + int (*copy_into_pages)(struct TCP_Server_Info *server, + struct cifs_readdata *rdata, + struct iov_iter *iter); + struct kvec iov[2]; unsigned int pagesz; unsigned int tailsz; unsigned int credits; @@ -1302,6 +1315,13 @@ typedef int (mid_receive_t)(struct TCP_Server_Info *server, */ typedef void (mid_callback_t)(struct mid_q_entry *mid); +/* + * This is the prototype for mid handle function. This is called once the mid + * has been recognized after decryption of the message. 
+ */ +typedef int (mid_handle_t)(struct TCP_Server_Info *server, + struct mid_q_entry *mid); + /* one of these for every pending CIFS request to the server */ struct mid_q_entry { struct list_head qhead; /* mids waiting on reply from this server */ @@ -1316,6 +1336,7 @@ struct mid_q_entry { #endif mid_receive_t *receive; /* call receive callback */ mid_callback_t *callback; /* call completion callback */ + mid_handle_t *handle; /* call handle mid callback */ void *callback_data; /* general purpose pointer for callback */ void *resp_buf; /* pointer to received SMB header */ int mid_state; /* wish this were enum but can not pass to wait_event */ @@ -1323,6 +1344,7 @@ struct mid_q_entry { bool large_buf:1; /* if valid response, is pointer to large buf */ bool multiRsp:1; /* multiple trans2 responses for one request */ bool multiEnd:1; /* both received */ + bool decrypted:1; /* decrypted entry */ }; /* Make code in transport.c a little cleaner by moving @@ -1475,7 +1497,9 @@ static inline void free_dfs_info_array(struct dfs_info3_param *param, #define CIFS_OBREAK_OP 0x0100 /* oplock break request */ #define CIFS_NEG_OP 0x0200 /* negotiate request */ #define CIFS_OP_MASK 0x0380 /* mask request type */ + #define CIFS_HAS_CREDITS 0x0400 /* already has credits */ +#define CIFS_TRANSFORM_REQ 0x0800 /* transform request before sending */ /* Security Flags: indicate type of session setup needed */ #define CIFSSEC_MAY_SIGN 0x00001 diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index c7b3c841e660..406d2c10ba78 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -75,10 +75,16 @@ extern struct mid_q_entry *AllocMidQEntry(const struct smb_hdr *smb_buffer, extern void DeleteMidQEntry(struct mid_q_entry *midEntry); extern void cifs_delete_mid(struct mid_q_entry *mid); extern void cifs_wake_up_task(struct mid_q_entry *mid); +extern int cifs_handle_standard(struct TCP_Server_Info *server, + struct mid_q_entry *mid); +extern int cifs_discard_remaining_data(struct 
TCP_Server_Info *server); extern int cifs_call_async(struct TCP_Server_Info *server, struct smb_rqst *rqst, mid_receive_t *receive, mid_callback_t *callback, - void *cbdata, const int flags); + mid_handle_t *handle, void *cbdata, const int flags); +extern int cifs_send_recv(const unsigned int xid, struct cifs_ses *ses, + struct smb_rqst *rqst, int *resp_buf_type, + const int flags, struct kvec *resp_iov); extern int SendReceive(const unsigned int /* xid */ , struct cifs_ses *, struct smb_hdr * /* input */ , struct smb_hdr * /* out */ , @@ -96,7 +102,8 @@ extern int cifs_wait_mtu_credits(struct TCP_Server_Info *server, unsigned int *credits); extern int SendReceive2(const unsigned int /* xid */ , struct cifs_ses *, struct kvec *, int /* nvec to send */, - int * /* type of buf returned */ , const int flags); + int * /* type of buf returned */, const int flags, + struct kvec * /* resp vec */); extern int SendReceiveBlockingLock(const unsigned int xid, struct cifs_tcon *ptcon, struct smb_hdr *in_buf , @@ -441,7 +448,7 @@ extern int SMBNTencrypt(unsigned char *, unsigned char *, unsigned char *, const struct nls_table *); extern int setup_ntlm_response(struct cifs_ses *, const struct nls_table *); extern int setup_ntlmv2_rsp(struct cifs_ses *, const struct nls_table *); -extern void cifs_crypto_shash_release(struct TCP_Server_Info *); +extern void cifs_crypto_secmech_release(struct TCP_Server_Info *server); extern int calc_seckey(struct cifs_ses *); extern int generate_smb30signingkey(struct cifs_ses *); extern int generate_smb311signingkey(struct cifs_ses *); diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index b47261858e6d..f5099fb8a22f 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -673,6 +673,7 @@ CIFSSMBTDis(const unsigned int xid, struct cifs_tcon *tcon) return rc; rc = SendReceiveNoRsp(xid, tcon->ses, (char *)smb_buffer, 0); + cifs_small_buf_release(smb_buffer); if (rc) cifs_dbg(FYI, "Tree disconnect failed %d\n", rc); @@ -707,9 +708,9 @@ 
CIFSSMBEcho(struct TCP_Server_Info *server) { ECHO_REQ *smb; int rc = 0; - struct kvec iov; - struct smb_rqst rqst = { .rq_iov = &iov, - .rq_nvec = 1 }; + struct kvec iov[2]; + struct smb_rqst rqst = { .rq_iov = iov, + .rq_nvec = 2 }; cifs_dbg(FYI, "In echo request\n"); @@ -724,10 +725,13 @@ CIFSSMBEcho(struct TCP_Server_Info *server) put_bcc(1, &smb->hdr); smb->Data[0] = 'a'; inc_rfc1001_len(smb, 3); - iov.iov_base = smb; - iov.iov_len = be32_to_cpu(smb->hdr.smb_buf_length) + 4; - rc = cifs_call_async(server, &rqst, NULL, cifs_echo_callback, + iov[0].iov_len = 4; + iov[0].iov_base = smb; + iov[1].iov_len = get_rfc1002_length(smb); + iov[1].iov_base = (char *)smb + 4; + + rc = cifs_call_async(server, &rqst, NULL, cifs_echo_callback, NULL, server, CIFS_ASYNC_OP | CIFS_ECHO_OP); if (rc) cifs_dbg(FYI, "Echo request failed: %d\n", rc); @@ -772,6 +776,7 @@ CIFSSMBLogoff(const unsigned int xid, struct cifs_ses *ses) pSMB->AndXCommand = 0xFF; rc = SendReceiveNoRsp(xid, ses, (char *) pSMB, 0); + cifs_small_buf_release(pSMB); session_already_dead: mutex_unlock(&ses->session_mutex); @@ -1394,8 +1399,8 @@ openRetry: * Discard any remaining data in the current SMB. To do this, we borrow the * current bigbuf. 
*/ -static int -discard_remaining_data(struct TCP_Server_Info *server) +int +cifs_discard_remaining_data(struct TCP_Server_Info *server) { unsigned int rfclen = get_rfc1002_length(server->smallbuf); int remaining = rfclen + 4 - server->total_read; @@ -1421,7 +1426,7 @@ cifs_readv_discard(struct TCP_Server_Info *server, struct mid_q_entry *mid) int length; struct cifs_readdata *rdata = mid->callback_data; - length = discard_remaining_data(server); + length = cifs_discard_remaining_data(server); dequeue_mid(mid, rdata->result); return length; } @@ -1454,7 +1459,7 @@ cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid) if (server->ops->is_status_pending && server->ops->is_status_pending(buf, server, 0)) { - discard_remaining_data(server); + cifs_discard_remaining_data(server); return -1; } @@ -1507,10 +1512,12 @@ cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid) } /* set up first iov for signature check */ - rdata->iov.iov_base = buf; - rdata->iov.iov_len = server->total_read; - cifs_dbg(FYI, "0: iov_base=%p iov_len=%zu\n", - rdata->iov.iov_base, rdata->iov.iov_len); + rdata->iov[0].iov_base = buf; + rdata->iov[0].iov_len = 4; + rdata->iov[1].iov_base = buf + 4; + rdata->iov[1].iov_len = server->total_read - 4; + cifs_dbg(FYI, "0: iov_base=%p iov_len=%u\n", + rdata->iov[0].iov_base, server->total_read); /* how much data is in the response? 
*/ data_len = server->ops->read_data_length(buf); @@ -1543,8 +1550,8 @@ cifs_readv_callback(struct mid_q_entry *mid) struct cifs_readdata *rdata = mid->callback_data; struct cifs_tcon *tcon = tlink_tcon(rdata->cfile->tlink); struct TCP_Server_Info *server = tcon->ses->server; - struct smb_rqst rqst = { .rq_iov = &rdata->iov, - .rq_nvec = 1, + struct smb_rqst rqst = { .rq_iov = rdata->iov, + .rq_nvec = 2, .rq_pages = rdata->pages, .rq_npages = rdata->nr_pages, .rq_pagesz = rdata->pagesz, @@ -1599,8 +1606,8 @@ cifs_async_readv(struct cifs_readdata *rdata) READ_REQ *smb = NULL; int wct; struct cifs_tcon *tcon = tlink_tcon(rdata->cfile->tlink); - struct smb_rqst rqst = { .rq_iov = &rdata->iov, - .rq_nvec = 1 }; + struct smb_rqst rqst = { .rq_iov = rdata->iov, + .rq_nvec = 2 }; cifs_dbg(FYI, "%s: offset=%llu bytes=%u\n", __func__, rdata->offset, rdata->bytes); @@ -1640,12 +1647,14 @@ cifs_async_readv(struct cifs_readdata *rdata) } /* 4 for RFC1001 length + 1 for BCC */ - rdata->iov.iov_base = smb; - rdata->iov.iov_len = be32_to_cpu(smb->hdr.smb_buf_length) + 4; + rdata->iov[0].iov_base = smb; + rdata->iov[0].iov_len = 4; + rdata->iov[1].iov_base = (char *)smb + 4; + rdata->iov[1].iov_len = get_rfc1002_length(smb); kref_get(&rdata->refcount); rc = cifs_call_async(tcon->ses->server, &rqst, cifs_readv_receive, - cifs_readv_callback, rdata, 0); + cifs_readv_callback, NULL, rdata, 0); if (rc == 0) cifs_stats_inc(&tcon->stats.cifs_stats.num_reads); @@ -1667,6 +1676,7 @@ CIFSSMBRead(const unsigned int xid, struct cifs_io_parms *io_parms, int wct; int resp_buf_type = 0; struct kvec iov[1]; + struct kvec rsp_iov; __u32 pid = io_parms->pid; __u16 netfid = io_parms->netfid; __u64 offset = io_parms->offset; @@ -1716,10 +1726,11 @@ CIFSSMBRead(const unsigned int xid, struct cifs_io_parms *io_parms, iov[0].iov_base = (char *)pSMB; iov[0].iov_len = be32_to_cpu(pSMB->hdr.smb_buf_length) + 4; - rc = SendReceive2(xid, tcon->ses, iov, 1 /* num iovecs */, - &resp_buf_type, CIFS_LOG_ERROR); 
+ rc = SendReceive2(xid, tcon->ses, iov, 1, &resp_buf_type, + CIFS_LOG_ERROR, &rsp_iov); + cifs_small_buf_release(pSMB); cifs_stats_inc(&tcon->stats.cifs_stats.num_reads); - pSMBr = (READ_RSP *)iov[0].iov_base; + pSMBr = (READ_RSP *)rsp_iov.iov_base; if (rc) { cifs_dbg(VFS, "Send error in read = %d\n", rc); } else { @@ -1747,12 +1758,11 @@ CIFSSMBRead(const unsigned int xid, struct cifs_io_parms *io_parms, } } -/* cifs_small_buf_release(pSMB); */ /* Freed earlier now in SendReceive2 */ if (*buf) { - free_rsp_buf(resp_buf_type, iov[0].iov_base); + free_rsp_buf(resp_buf_type, rsp_iov.iov_base); } else if (resp_buf_type != CIFS_NO_BUFFER) { /* return buffer to caller to free */ - *buf = iov[0].iov_base; + *buf = rsp_iov.iov_base; if (resp_buf_type == CIFS_SMALL_BUFFER) *pbuf_type = CIFS_SMALL_BUFFER; else if (resp_buf_type == CIFS_LARGE_BUFFER) @@ -2093,7 +2103,7 @@ cifs_async_writev(struct cifs_writedata *wdata, WRITE_REQ *smb = NULL; int wct; struct cifs_tcon *tcon = tlink_tcon(wdata->cfile->tlink); - struct kvec iov; + struct kvec iov[2]; struct smb_rqst rqst = { }; if (tcon->ses->capabilities & CAP_LARGE_FILES) { @@ -2126,11 +2136,13 @@ cifs_async_writev(struct cifs_writedata *wdata, cpu_to_le16(offsetof(struct smb_com_write_req, Data) - 4); /* 4 for RFC1001 length + 1 for BCC */ - iov.iov_len = be32_to_cpu(smb->hdr.smb_buf_length) + 4 + 1; - iov.iov_base = smb; + iov[0].iov_len = 4; + iov[0].iov_base = smb; + iov[1].iov_len = get_rfc1002_length(smb) + 1; + iov[1].iov_base = (char *)smb + 4; - rqst.rq_iov = &iov; - rqst.rq_nvec = 1; + rqst.rq_iov = iov; + rqst.rq_nvec = 2; rqst.rq_pages = wdata->pages; rqst.rq_npages = wdata->nr_pages; rqst.rq_pagesz = wdata->pagesz; @@ -2151,12 +2163,12 @@ cifs_async_writev(struct cifs_writedata *wdata, (struct smb_com_writex_req *)smb; inc_rfc1001_len(&smbw->hdr, wdata->bytes + 5); put_bcc(wdata->bytes + 5, &smbw->hdr); - iov.iov_len += 4; /* pad bigger by four bytes */ + iov[1].iov_len += 4; /* pad bigger by four bytes */ } 
kref_get(&wdata->refcount); rc = cifs_call_async(tcon->ses->server, &rqst, NULL, - cifs_writev_callback, wdata, 0); + cifs_writev_callback, NULL, wdata, 0); if (rc == 0) cifs_stats_inc(&tcon->stats.cifs_stats.num_writes); @@ -2182,6 +2194,7 @@ CIFSSMBWrite2(const unsigned int xid, struct cifs_io_parms *io_parms, __u64 offset = io_parms->offset; struct cifs_tcon *tcon = io_parms->tcon; unsigned int count = io_parms->length; + struct kvec rsp_iov; *nbytes = 0; @@ -2240,8 +2253,9 @@ CIFSSMBWrite2(const unsigned int xid, struct cifs_io_parms *io_parms, else /* wct == 12 pad bigger by four bytes */ iov[0].iov_len = smb_hdr_len + 8; - - rc = SendReceive2(xid, tcon->ses, iov, n_vec + 1, &resp_buf_type, 0); + rc = SendReceive2(xid, tcon->ses, iov, n_vec + 1, &resp_buf_type, 0, + &rsp_iov); + cifs_small_buf_release(pSMB); cifs_stats_inc(&tcon->stats.cifs_stats.num_writes); if (rc) { cifs_dbg(FYI, "Send error Write2 = %d\n", rc); @@ -2249,7 +2263,7 @@ CIFSSMBWrite2(const unsigned int xid, struct cifs_io_parms *io_parms, /* presumably this can not happen, but best to be safe */ rc = -EIO; } else { - WRITE_RSP *pSMBr = (WRITE_RSP *)iov[0].iov_base; + WRITE_RSP *pSMBr = (WRITE_RSP *)rsp_iov.iov_base; *nbytes = le16_to_cpu(pSMBr->CountHigh); *nbytes = (*nbytes) << 16; *nbytes += le16_to_cpu(pSMBr->Count); @@ -2263,8 +2277,7 @@ CIFSSMBWrite2(const unsigned int xid, struct cifs_io_parms *io_parms, *nbytes &= 0xFFFF; } -/* cifs_small_buf_release(pSMB); */ /* Freed earlier now in SendReceive2 */ - free_rsp_buf(resp_buf_type, iov[0].iov_base); + free_rsp_buf(resp_buf_type, rsp_iov.iov_base); /* Note: On -EAGAIN error only caller can retry on handle based calls since file handle passed in no longer valid */ @@ -2279,6 +2292,7 @@ int cifs_lockv(const unsigned int xid, struct cifs_tcon *tcon, int rc = 0; LOCK_REQ *pSMB = NULL; struct kvec iov[2]; + struct kvec rsp_iov; int resp_buf_type; __u16 count; @@ -2307,7 +2321,9 @@ int cifs_lockv(const unsigned int xid, struct cifs_tcon *tcon, 
iov[1].iov_len = (num_unlock + num_lock) * sizeof(LOCKING_ANDX_RANGE); cifs_stats_inc(&tcon->stats.cifs_stats.num_locks); - rc = SendReceive2(xid, tcon->ses, iov, 2, &resp_buf_type, CIFS_NO_RESP); + rc = SendReceive2(xid, tcon->ses, iov, 2, &resp_buf_type, CIFS_NO_RESP, + &rsp_iov); + cifs_small_buf_release(pSMB); if (rc) cifs_dbg(FYI, "Send error in cifs_lockv = %d\n", rc); @@ -2368,14 +2384,12 @@ CIFSSMBLock(const unsigned int xid, struct cifs_tcon *tcon, inc_rfc1001_len(pSMB, count); pSMB->ByteCount = cpu_to_le16(count); - if (waitFlag) { + if (waitFlag) rc = SendReceiveBlockingLock(xid, tcon, (struct smb_hdr *) pSMB, (struct smb_hdr *) pSMB, &bytes_returned); - cifs_small_buf_release(pSMB); - } else { + else rc = SendReceiveNoRsp(xid, tcon->ses, (char *)pSMB, flags); - /* SMB buffer freed by function above */ - } + cifs_small_buf_release(pSMB); cifs_stats_inc(&tcon->stats.cifs_stats.num_locks); if (rc) cifs_dbg(FYI, "Send error in Lock = %d\n", rc); @@ -2401,6 +2415,7 @@ CIFSSMBPosixLock(const unsigned int xid, struct cifs_tcon *tcon, int resp_buf_type = 0; __u16 params, param_offset, offset, byte_count, count; struct kvec iov[1]; + struct kvec rsp_iov; cifs_dbg(FYI, "Posix Lock\n"); @@ -2462,11 +2477,10 @@ CIFSSMBPosixLock(const unsigned int xid, struct cifs_tcon *tcon, iov[0].iov_base = (char *)pSMB; iov[0].iov_len = be32_to_cpu(pSMB->hdr.smb_buf_length) + 4; rc = SendReceive2(xid, tcon->ses, iov, 1 /* num iovecs */, - &resp_buf_type, timeout); - pSMB = NULL; /* request buf already freed by SendReceive2. 
Do - not try to free it twice below on exit */ - pSMBr = (struct smb_com_transaction2_sfi_rsp *)iov[0].iov_base; + &resp_buf_type, timeout, &rsp_iov); + pSMBr = (struct smb_com_transaction2_sfi_rsp *)rsp_iov.iov_base; } + cifs_small_buf_release(pSMB); if (rc) { cifs_dbg(FYI, "Send error in Posix Lock = %d\n", rc); @@ -2506,10 +2520,7 @@ CIFSSMBPosixLock(const unsigned int xid, struct cifs_tcon *tcon, } plk_err_exit: - if (pSMB) - cifs_small_buf_release(pSMB); - - free_rsp_buf(resp_buf_type, iov[0].iov_base); + free_rsp_buf(resp_buf_type, rsp_iov.iov_base); /* Note: On -EAGAIN error only caller can retry on handle based calls since file handle passed in no longer valid */ @@ -2536,6 +2547,7 @@ CIFSSMBClose(const unsigned int xid, struct cifs_tcon *tcon, int smb_file_id) pSMB->LastWriteTime = 0xFFFFFFFF; pSMB->ByteCount = 0; rc = SendReceiveNoRsp(xid, tcon->ses, (char *) pSMB, 0); + cifs_small_buf_release(pSMB); cifs_stats_inc(&tcon->stats.cifs_stats.num_closes); if (rc) { if (rc != -EINTR) { @@ -2565,6 +2577,7 @@ CIFSSMBFlush(const unsigned int xid, struct cifs_tcon *tcon, int smb_file_id) pSMB->FileID = (__u16) smb_file_id; pSMB->ByteCount = 0; rc = SendReceiveNoRsp(xid, tcon->ses, (char *) pSMB, 0); + cifs_small_buf_release(pSMB); cifs_stats_inc(&tcon->stats.cifs_stats.num_flushes); if (rc) cifs_dbg(VFS, "Send error in Flush = %d\n", rc); @@ -3820,6 +3833,7 @@ CIFSSMBGetCIFSACL(const unsigned int xid, struct cifs_tcon *tcon, __u16 fid, int buf_type = 0; QUERY_SEC_DESC_REQ *pSMB; struct kvec iov[1]; + struct kvec rsp_iov; cifs_dbg(FYI, "GetCifsACL\n"); @@ -3843,7 +3857,8 @@ CIFSSMBGetCIFSACL(const unsigned int xid, struct cifs_tcon *tcon, __u16 fid, iov[0].iov_len = be32_to_cpu(pSMB->hdr.smb_buf_length) + 4; rc = SendReceive2(xid, tcon->ses, iov, 1 /* num iovec */, &buf_type, - 0); + 0, &rsp_iov); + cifs_small_buf_release(pSMB); cifs_stats_inc(&tcon->stats.cifs_stats.num_acl_get); if (rc) { cifs_dbg(FYI, "Send error in QuerySecDesc = %d\n", rc); @@ -3855,11 
+3870,11 @@ CIFSSMBGetCIFSACL(const unsigned int xid, struct cifs_tcon *tcon, __u16 fid, char *pdata; /* validate_nttransact */ - rc = validate_ntransact(iov[0].iov_base, (char **)&parm, + rc = validate_ntransact(rsp_iov.iov_base, (char **)&parm, &pdata, &parm_len, pbuflen); if (rc) goto qsec_out; - pSMBr = (struct smb_com_ntransact_rsp *)iov[0].iov_base; + pSMBr = (struct smb_com_ntransact_rsp *)rsp_iov.iov_base; cifs_dbg(FYI, "smb %p parm %p data %p\n", pSMBr, parm, *acl_inf); @@ -3896,8 +3911,7 @@ CIFSSMBGetCIFSACL(const unsigned int xid, struct cifs_tcon *tcon, __u16 fid, } } qsec_out: - free_rsp_buf(buf_type, iov[0].iov_base); -/* cifs_small_buf_release(pSMB); */ /* Freed earlier now in SendReceive2 */ + free_rsp_buf(buf_type, rsp_iov.iov_base); return rc; } @@ -4666,6 +4680,7 @@ CIFSFindClose(const unsigned int xid, struct cifs_tcon *tcon, pSMB->FileID = searchHandle; pSMB->ByteCount = 0; rc = SendReceiveNoRsp(xid, tcon->ses, (char *) pSMB, 0); + cifs_small_buf_release(pSMB); if (rc) cifs_dbg(VFS, "Send error in FindClose = %d\n", rc); @@ -5687,6 +5702,7 @@ CIFSSMBSetFileSize(const unsigned int xid, struct cifs_tcon *tcon, inc_rfc1001_len(pSMB, byte_count); pSMB->ByteCount = cpu_to_le16(byte_count); rc = SendReceiveNoRsp(xid, tcon->ses, (char *) pSMB, 0); + cifs_small_buf_release(pSMB); if (rc) { cifs_dbg(FYI, "Send error in SetFileInfo (SetFileSize) = %d\n", rc); @@ -5758,6 +5774,7 @@ CIFSSMBSetFileInfo(const unsigned int xid, struct cifs_tcon *tcon, pSMB->ByteCount = cpu_to_le16(byte_count); memcpy(data_offset, data, sizeof(FILE_BASIC_INFO)); rc = SendReceiveNoRsp(xid, tcon->ses, (char *) pSMB, 0); + cifs_small_buf_release(pSMB); if (rc) cifs_dbg(FYI, "Send error in Set Time (SetFileInfo) = %d\n", rc); @@ -5818,6 +5835,7 @@ CIFSSMBSetFileDisposition(const unsigned int xid, struct cifs_tcon *tcon, pSMB->ByteCount = cpu_to_le16(byte_count); *data_offset = delete_file ? 
1 : 0; rc = SendReceiveNoRsp(xid, tcon->ses, (char *) pSMB, 0); + cifs_small_buf_release(pSMB); if (rc) cifs_dbg(FYI, "Send error in SetFileDisposition = %d\n", rc); @@ -6057,6 +6075,7 @@ CIFSSMBUnixSetFileInfo(const unsigned int xid, struct cifs_tcon *tcon, cifs_fill_unix_set_info((FILE_UNIX_BASIC_INFO *)data_offset, args); rc = SendReceiveNoRsp(xid, tcon->ses, (char *) pSMB, 0); + cifs_small_buf_release(pSMB); if (rc) cifs_dbg(FYI, "Send error in Set Time (SetFileInfo) = %d\n", rc); diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 35ae49ed1f76..777ad9f4fc3c 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -787,6 +787,15 @@ standard_receive3(struct TCP_Server_Info *server, struct mid_q_entry *mid) dump_smb(buf, server->total_read); + return cifs_handle_standard(server, mid); +} + +int +cifs_handle_standard(struct TCP_Server_Info *server, struct mid_q_entry *mid) +{ + char *buf = server->large_buf ? server->bigbuf : server->smallbuf; + int length; + /* * We know that we received enough to get to the MID as we * checked the pdu_length earlier. 
Now check to see @@ -872,12 +881,19 @@ cifs_demultiplex_thread(void *p) continue; server->total_read += length; - mid_entry = server->ops->find_mid(server, buf); + if (server->ops->is_transform_hdr && + server->ops->receive_transform && + server->ops->is_transform_hdr(buf)) { + length = server->ops->receive_transform(server, + &mid_entry); + } else { + mid_entry = server->ops->find_mid(server, buf); - if (!mid_entry || !mid_entry->receive) - length = standard_receive3(server, mid_entry); - else - length = mid_entry->receive(server, mid_entry); + if (!mid_entry || !mid_entry->receive) + length = standard_receive3(server, mid_entry); + else + length = mid_entry->receive(server, mid_entry); + } if (length < 0) continue; @@ -2154,7 +2170,7 @@ cifs_put_tcp_session(struct TCP_Server_Info *server, int from_reconnect) server->tcpStatus = CifsExiting; spin_unlock(&GlobalMid_Lock); - cifs_crypto_shash_release(server); + cifs_crypto_secmech_release(server); cifs_fscache_release_client_cookie(server); kfree(server->session_key.response); @@ -2273,7 +2289,7 @@ cifs_get_tcp_session(struct smb_vol *volume_info) return tcp_ses; out_err_crypto_release: - cifs_crypto_shash_release(tcp_ses); + cifs_crypto_secmech_release(tcp_ses); put_net(cifs_net_ns(tcp_ses)); @@ -2614,12 +2630,18 @@ get_ses_fail: return ERR_PTR(rc); } -static int match_tcon(struct cifs_tcon *tcon, const char *unc) +static int match_tcon(struct cifs_tcon *tcon, struct smb_vol *volume_info) { if (tcon->tidStatus == CifsExiting) return 0; - if (strncmp(tcon->treeName, unc, MAX_TREE_SIZE)) + if (strncmp(tcon->treeName, volume_info->UNC, MAX_TREE_SIZE)) return 0; + if (tcon->seal != volume_info->seal) + return 0; +#ifdef CONFIG_CIFS_SMB2 + if (tcon->snapshot_time != volume_info->snapshot_time) + return 0; +#endif /* CONFIG_CIFS_SMB2 */ return 1; } @@ -2632,14 +2654,8 @@ cifs_find_tcon(struct cifs_ses *ses, struct smb_vol *volume_info) spin_lock(&cifs_tcp_ses_lock); list_for_each(tmp, &ses->tcon_list) { tcon = 
list_entry(tmp, struct cifs_tcon, tcon_list); - if (!match_tcon(tcon, volume_info->UNC)) - continue; - -#ifdef CONFIG_CIFS_SMB2 - if (tcon->snapshot_time != volume_info->snapshot_time) + if (!match_tcon(tcon, volume_info)) continue; -#endif /* CONFIG_CIFS_SMB2 */ - ++tcon->tc_count; spin_unlock(&cifs_tcp_ses_lock); return tcon; @@ -2685,8 +2701,6 @@ cifs_get_tcon(struct cifs_ses *ses, struct smb_vol *volume_info) cifs_dbg(FYI, "Found match on UNC path\n"); /* existing tcon already has a reference */ cifs_put_smb_ses(ses); - if (tcon->seal != volume_info->seal) - cifs_dbg(VFS, "transport encryption setting conflicts with existing tid\n"); return tcon; } @@ -2742,7 +2756,6 @@ cifs_get_tcon(struct cifs_ses *ses, struct smb_vol *volume_info) tcon->Flags &= ~SMB_SHARE_IS_IN_DFS; cifs_dbg(FYI, "DFS disabled (%d)\n", tcon->Flags); } - tcon->seal = volume_info->seal; tcon->use_persistent = false; /* check if SMB2 or later, CIFS does not support persistent handles */ if (volume_info->persistent) { @@ -2779,6 +2792,24 @@ cifs_get_tcon(struct cifs_ses *ses, struct smb_vol *volume_info) tcon->use_resilient = true; } + if (volume_info->seal) { + if (ses->server->vals->protocol_id == 0) { + cifs_dbg(VFS, + "SMB3 or later required for encryption\n"); + rc = -EOPNOTSUPP; + goto out_fail; +#ifdef CONFIG_CIFS_SMB2 + } else if (tcon->ses->server->capabilities & + SMB2_GLOBAL_CAP_ENCRYPTION) + tcon->seal = true; + else { + cifs_dbg(VFS, "Encryption is not supported on share\n"); + rc = -EOPNOTSUPP; + goto out_fail; +#endif /* CONFIG_CIFS_SMB2 */ + } + } + /* * We can have only one retry value for a connection to a share so for * resources mounted more than once to the same server share the last @@ -2910,7 +2941,7 @@ cifs_match_super(struct super_block *sb, void *data) if (!match_server(tcp_srv, volume_info) || !match_session(ses, volume_info) || - !match_tcon(tcon, volume_info->UNC) || + !match_tcon(tcon, volume_info) || !match_prepath(sb, mnt_data)) { rc = 0; goto out; diff --git 
a/fs/cifs/file.c b/fs/cifs/file.c index 18a1e1d6671f..98dc842e7245 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -2884,7 +2884,15 @@ cifs_readdata_to_iov(struct cifs_readdata *rdata, struct iov_iter *iter) for (i = 0; i < rdata->nr_pages; i++) { struct page *page = rdata->pages[i]; size_t copy = min_t(size_t, remaining, PAGE_SIZE); - size_t written = copy_page_to_iter(page, 0, copy, iter); + size_t written; + + if (unlikely(iter->type & ITER_PIPE)) { + void *addr = kmap_atomic(page); + + written = copy_to_iter(addr, copy, iter); + kunmap_atomic(addr); + } else + written = copy_page_to_iter(page, 0, copy, iter); remaining -= written; if (written < copy && iov_iter_count(iter) > 0) break; @@ -2903,8 +2911,9 @@ cifs_uncached_readv_complete(struct work_struct *work) } static int -cifs_uncached_read_into_pages(struct TCP_Server_Info *server, - struct cifs_readdata *rdata, unsigned int len) +uncached_fill_pages(struct TCP_Server_Info *server, + struct cifs_readdata *rdata, struct iov_iter *iter, + unsigned int len) { int result = 0; unsigned int i; @@ -2933,7 +2942,10 @@ cifs_uncached_read_into_pages(struct TCP_Server_Info *server, rdata->tailsz = len; len = 0; } - result = cifs_read_page_from_socket(server, page, n); + if (iter) + result = copy_page_from_iter(page, 0, n, iter); + else + result = cifs_read_page_from_socket(server, page, n); if (result < 0) break; @@ -2945,6 +2957,21 @@ cifs_uncached_read_into_pages(struct TCP_Server_Info *server, } static int +cifs_uncached_read_into_pages(struct TCP_Server_Info *server, + struct cifs_readdata *rdata, unsigned int len) +{ + return uncached_fill_pages(server, rdata, NULL, len); +} + +static int +cifs_uncached_copy_into_pages(struct TCP_Server_Info *server, + struct cifs_readdata *rdata, + struct iov_iter *iter) +{ + return uncached_fill_pages(server, rdata, iter, iter->count); +} + +static int cifs_send_async_read(loff_t offset, size_t len, struct cifsFileInfo *open_file, struct cifs_sb_info *cifs_sb, struct 
list_head *rdata_list) { @@ -2991,6 +3018,7 @@ cifs_send_async_read(loff_t offset, size_t len, struct cifsFileInfo *open_file, rdata->pid = pid; rdata->pagesz = PAGE_SIZE; rdata->read_into_pages = cifs_uncached_read_into_pages; + rdata->copy_into_pages = cifs_uncached_copy_into_pages; rdata->credits = credits; if (!rdata->cfile->invalidHandle || @@ -3341,8 +3369,9 @@ cifs_readv_complete(struct work_struct *work) } static int -cifs_readpages_read_into_pages(struct TCP_Server_Info *server, - struct cifs_readdata *rdata, unsigned int len) +readpages_fill_pages(struct TCP_Server_Info *server, + struct cifs_readdata *rdata, struct iov_iter *iter, + unsigned int len) { int result = 0; unsigned int i; @@ -3396,7 +3425,10 @@ cifs_readpages_read_into_pages(struct TCP_Server_Info *server, continue; } - result = cifs_read_page_from_socket(server, page, n); + if (iter) + result = copy_page_from_iter(page, 0, n, iter); + else + result = cifs_read_page_from_socket(server, page, n); if (result < 0) break; @@ -3408,6 +3440,21 @@ cifs_readpages_read_into_pages(struct TCP_Server_Info *server, } static int +cifs_readpages_read_into_pages(struct TCP_Server_Info *server, + struct cifs_readdata *rdata, unsigned int len) +{ + return readpages_fill_pages(server, rdata, NULL, len); +} + +static int +cifs_readpages_copy_into_pages(struct TCP_Server_Info *server, + struct cifs_readdata *rdata, + struct iov_iter *iter) +{ + return readpages_fill_pages(server, rdata, iter, iter->count); +} + +static int readpages_get_pages(struct address_space *mapping, struct list_head *page_list, unsigned int rsize, struct list_head *tmplist, unsigned int *nr_pages, loff_t *offset, unsigned int *bytes) @@ -3561,6 +3608,7 @@ static int cifs_readpages(struct file *file, struct address_space *mapping, rdata->pid = pid; rdata->pagesz = PAGE_SIZE; rdata->read_into_pages = cifs_readpages_read_into_pages; + rdata->copy_into_pages = cifs_readpages_copy_into_pages; rdata->credits = credits; 
list_for_each_entry_safe(page, tpage, &tmplist, lru) { diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c index 8f6a2a5863b9..a27fc8791551 100644 --- a/fs/cifs/readdir.c +++ b/fs/cifs/readdir.c @@ -285,6 +285,7 @@ initiate_cifs_search(const unsigned int xid, struct file *file) rc = -ENOMEM; goto error_exit; } + spin_lock_init(&cifsFile->file_info_lock); file->private_data = cifsFile; cifsFile->tlink = cifs_get_tlink(tlink); tcon = tlink_tcon(tlink); diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c index 538d9b55699a..dcbcc927399a 100644 --- a/fs/cifs/sess.c +++ b/fs/cifs/sess.c @@ -344,13 +344,12 @@ void build_ntlmssp_negotiate_blob(unsigned char *pbuffer, /* BB is NTLMV2 session security format easier to use here? */ flags = NTLMSSP_NEGOTIATE_56 | NTLMSSP_REQUEST_TARGET | NTLMSSP_NEGOTIATE_128 | NTLMSSP_NEGOTIATE_UNICODE | - NTLMSSP_NEGOTIATE_NTLM | NTLMSSP_NEGOTIATE_EXTENDED_SEC; - if (ses->server->sign) { + NTLMSSP_NEGOTIATE_NTLM | NTLMSSP_NEGOTIATE_EXTENDED_SEC | + NTLMSSP_NEGOTIATE_SEAL; + if (ses->server->sign) flags |= NTLMSSP_NEGOTIATE_SIGN; - if (!ses->server->session_estab || - ses->ntlmssp->sesskey_per_smbsess) - flags |= NTLMSSP_NEGOTIATE_KEY_XCH; - } + if (!ses->server->session_estab || ses->ntlmssp->sesskey_per_smbsess) + flags |= NTLMSSP_NEGOTIATE_KEY_XCH; sec_blob->NegotiateFlags = cpu_to_le32(flags); @@ -407,13 +406,12 @@ int build_ntlmssp_auth_blob(unsigned char **pbuffer, flags = NTLMSSP_NEGOTIATE_56 | NTLMSSP_REQUEST_TARGET | NTLMSSP_NEGOTIATE_TARGET_INFO | NTLMSSP_NEGOTIATE_128 | NTLMSSP_NEGOTIATE_UNICODE | - NTLMSSP_NEGOTIATE_NTLM | NTLMSSP_NEGOTIATE_EXTENDED_SEC; - if (ses->server->sign) { + NTLMSSP_NEGOTIATE_NTLM | NTLMSSP_NEGOTIATE_EXTENDED_SEC | + NTLMSSP_NEGOTIATE_SEAL; + if (ses->server->sign) flags |= NTLMSSP_NEGOTIATE_SIGN; - if (!ses->server->session_estab || - ses->ntlmssp->sesskey_per_smbsess) - flags |= NTLMSSP_NEGOTIATE_KEY_XCH; - } + if (!ses->server->session_estab || ses->ntlmssp->sesskey_per_smbsess) + flags |= 
NTLMSSP_NEGOTIATE_KEY_XCH; tmp = *pbuffer + sizeof(AUTHENTICATE_MESSAGE); sec_blob->NegotiateFlags = cpu_to_le32(flags); @@ -652,6 +650,7 @@ sess_sendreceive(struct sess_data *sess_data) int rc; struct smb_hdr *smb_buf = (struct smb_hdr *) sess_data->iov[0].iov_base; __u16 count; + struct kvec rsp_iov = { NULL, 0 }; count = sess_data->iov[1].iov_len + sess_data->iov[2].iov_len; smb_buf->smb_buf_length = @@ -661,7 +660,9 @@ sess_sendreceive(struct sess_data *sess_data) rc = SendReceive2(sess_data->xid, sess_data->ses, sess_data->iov, 3 /* num_iovecs */, &sess_data->buf0_type, - CIFS_LOG_ERROR); + CIFS_LOG_ERROR, &rsp_iov); + cifs_small_buf_release(sess_data->iov[0].iov_base); + memcpy(&sess_data->iov[0], &rsp_iov, sizeof(struct kvec)); return rc; } diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c index fc537c29044e..67a987e4d026 100644 --- a/fs/cifs/smb1ops.c +++ b/fs/cifs/smb1ops.c @@ -36,11 +36,11 @@ * SMB_COM_NT_CANCEL request and then sends it. */ static int -send_nt_cancel(struct TCP_Server_Info *server, void *buf, +send_nt_cancel(struct TCP_Server_Info *server, struct smb_rqst *rqst, struct mid_q_entry *mid) { int rc = 0; - struct smb_hdr *in_buf = (struct smb_hdr *)buf; + struct smb_hdr *in_buf = (struct smb_hdr *)rqst->rq_iov[0].iov_base; /* -4 for RFC1001 length and +2 for BCC field */ in_buf->smb_buf_length = cpu_to_be32(sizeof(struct smb_hdr) - 4 + 2); diff --git a/fs/cifs/smb2glob.h b/fs/cifs/smb2glob.h index 0ffa18094335..401a5d856636 100644 --- a/fs/cifs/smb2glob.h +++ b/fs/cifs/smb2glob.h @@ -61,4 +61,9 @@ /* Maximum buffer size value we can send with 1 credit */ #define SMB2_MAX_BUFFER_SIZE 65536 +static inline struct smb2_sync_hdr *get_sync_hdr(void *buf) +{ + return &(((struct smb2_hdr *)buf)->sync_hdr); +} + #endif /* _SMB2_GLOB_H */ diff --git a/fs/cifs/smb2maperror.c b/fs/cifs/smb2maperror.c index 8257a5a97cc0..3030a9dfb0dd 100644 --- a/fs/cifs/smb2maperror.c +++ b/fs/cifs/smb2maperror.c @@ -26,6 +26,7 @@ #include "smb2pdu.h" #include 
"smb2proto.h" #include "smb2status.h" +#include "smb2glob.h" struct status_to_posix_error { __le32 smb2_status; @@ -2449,10 +2450,10 @@ smb2_print_status(__le32 status) int map_smb2_to_linux_error(char *buf, bool log_err) { - struct smb2_hdr *hdr = (struct smb2_hdr *)buf; + struct smb2_sync_hdr *shdr = get_sync_hdr(buf); unsigned int i; int rc = -EIO; - __le32 smb2err = hdr->Status; + __le32 smb2err = shdr->Status; if (smb2err == 0) return 0; diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c index 3d383489b9cf..fd516ea8b8f8 100644 --- a/fs/cifs/smb2misc.c +++ b/fs/cifs/smb2misc.c @@ -28,31 +28,32 @@ #include "cifs_debug.h" #include "cifs_unicode.h" #include "smb2status.h" +#include "smb2glob.h" static int -check_smb2_hdr(struct smb2_hdr *hdr, __u64 mid) +check_smb2_hdr(struct smb2_sync_hdr *shdr, __u64 mid) { - __u64 wire_mid = le64_to_cpu(hdr->MessageId); + __u64 wire_mid = le64_to_cpu(shdr->MessageId); /* * Make sure that this really is an SMB, that it is a response, * and that the message ids match. 
*/ - if ((hdr->ProtocolId == SMB2_PROTO_NUMBER) && + if ((shdr->ProtocolId == SMB2_PROTO_NUMBER) && (mid == wire_mid)) { - if (hdr->Flags & SMB2_FLAGS_SERVER_TO_REDIR) + if (shdr->Flags & SMB2_FLAGS_SERVER_TO_REDIR) return 0; else { /* only one valid case where server sends us request */ - if (hdr->Command == SMB2_OPLOCK_BREAK) + if (shdr->Command == SMB2_OPLOCK_BREAK) return 0; else cifs_dbg(VFS, "Received Request not response\n"); } } else { /* bad signature or mid */ - if (hdr->ProtocolId != SMB2_PROTO_NUMBER) + if (shdr->ProtocolId != SMB2_PROTO_NUMBER) cifs_dbg(VFS, "Bad protocol string signature header %x\n", - le32_to_cpu(hdr->ProtocolId)); + le32_to_cpu(shdr->ProtocolId)); if (mid != wire_mid) cifs_dbg(VFS, "Mids do not match: %llu and %llu\n", mid, wire_mid); @@ -95,8 +96,9 @@ static const __le16 smb2_rsp_struct_sizes[NUMBER_OF_SMB2_COMMANDS] = { int smb2_check_message(char *buf, unsigned int length, struct TCP_Server_Info *srvr) { - struct smb2_hdr *hdr = (struct smb2_hdr *)buf; - struct smb2_pdu *pdu = (struct smb2_pdu *)hdr; + struct smb2_pdu *pdu = (struct smb2_pdu *)buf; + struct smb2_hdr *hdr = &pdu->hdr; + struct smb2_sync_hdr *shdr = get_sync_hdr(buf); __u64 mid; __u32 len = get_rfc1002_length(buf); __u32 clc_len; /* calculated length */ @@ -111,7 +113,7 @@ smb2_check_message(char *buf, unsigned int length, struct TCP_Server_Info *srvr) * ie Validate the wct via smb2_struct_sizes table above */ - if (hdr->ProtocolId == SMB2_TRANSFORM_PROTO_NUM) { + if (shdr->ProtocolId == SMB2_TRANSFORM_PROTO_NUM) { struct smb2_transform_hdr *thdr = (struct smb2_transform_hdr *)buf; struct cifs_ses *ses = NULL; @@ -133,10 +135,10 @@ smb2_check_message(char *buf, unsigned int length, struct TCP_Server_Info *srvr) } } - - mid = le64_to_cpu(hdr->MessageId); + mid = le64_to_cpu(shdr->MessageId); if (length < sizeof(struct smb2_pdu)) { - if ((length >= sizeof(struct smb2_hdr)) && (hdr->Status != 0)) { + if ((length >= sizeof(struct smb2_hdr)) + && (shdr->Status != 0)) { 
pdu->StructureSize2 = 0; /* * As with SMB/CIFS, on some error cases servers may @@ -154,29 +156,30 @@ smb2_check_message(char *buf, unsigned int length, struct TCP_Server_Info *srvr) return 1; } - if (check_smb2_hdr(hdr, mid)) + if (check_smb2_hdr(shdr, mid)) return 1; - if (hdr->StructureSize != SMB2_HEADER_STRUCTURE_SIZE) { + if (shdr->StructureSize != SMB2_HEADER_STRUCTURE_SIZE) { cifs_dbg(VFS, "Illegal structure size %u\n", - le16_to_cpu(hdr->StructureSize)); + le16_to_cpu(shdr->StructureSize)); return 1; } - command = le16_to_cpu(hdr->Command); + command = le16_to_cpu(shdr->Command); if (command >= NUMBER_OF_SMB2_COMMANDS) { cifs_dbg(VFS, "Illegal SMB2 command %d\n", command); return 1; } if (smb2_rsp_struct_sizes[command] != pdu->StructureSize2) { - if (command != SMB2_OPLOCK_BREAK_HE && (hdr->Status == 0 || + if (command != SMB2_OPLOCK_BREAK_HE && (shdr->Status == 0 || pdu->StructureSize2 != SMB2_ERROR_STRUCTURE_SIZE2)) { /* error packets have 9 byte structure size */ cifs_dbg(VFS, "Illegal response size %u for command %d\n", le16_to_cpu(pdu->StructureSize2), command); return 1; - } else if (command == SMB2_OPLOCK_BREAK_HE && (hdr->Status == 0) + } else if (command == SMB2_OPLOCK_BREAK_HE + && (shdr->Status == 0) && (le16_to_cpu(pdu->StructureSize2) != 44) && (le16_to_cpu(pdu->StructureSize2) != 36)) { /* special case for SMB2.1 lease break message */ @@ -199,7 +202,7 @@ smb2_check_message(char *buf, unsigned int length, struct TCP_Server_Info *srvr) clc_len, 4 + len, mid); /* create failed on symlink */ if (command == SMB2_CREATE_HE && - hdr->Status == STATUS_STOPPED_ON_SYMLINK) + shdr->Status == STATUS_STOPPED_ON_SYMLINK) return 0; /* Windows 7 server returns 24 bytes more */ if (clc_len + 20 == len && command == SMB2_OPLOCK_BREAK_HE) @@ -261,11 +264,12 @@ static const bool has_smb2_data_area[NUMBER_OF_SMB2_COMMANDS] = { char * smb2_get_data_area_len(int *off, int *len, struct smb2_hdr *hdr) { + struct smb2_sync_hdr *shdr = get_sync_hdr(hdr); *off = 0; 
*len = 0; /* error responses do not have data area */ - if (hdr->Status && hdr->Status != STATUS_MORE_PROCESSING_REQUIRED && + if (shdr->Status && shdr->Status != STATUS_MORE_PROCESSING_REQUIRED && (((struct smb2_err_rsp *)hdr)->StructureSize) == SMB2_ERROR_STRUCTURE_SIZE2) return NULL; @@ -275,7 +279,7 @@ smb2_get_data_area_len(int *off, int *len, struct smb2_hdr *hdr) * of the data buffer offset and data buffer length for the particular * command. */ - switch (hdr->Command) { + switch (shdr->Command) { case SMB2_NEGOTIATE: *off = le16_to_cpu( ((struct smb2_negotiate_rsp *)hdr)->SecurityBufferOffset); @@ -346,7 +350,7 @@ smb2_get_data_area_len(int *off, int *len, struct smb2_hdr *hdr) /* return pointer to beginning of data area, ie offset from SMB start */ if ((*off != 0) && (*len != 0)) - return (char *)(&hdr->ProtocolId) + *off; + return (char *)shdr + *off; else return NULL; } @@ -358,12 +362,13 @@ smb2_get_data_area_len(int *off, int *len, struct smb2_hdr *hdr) unsigned int smb2_calc_size(void *buf) { - struct smb2_hdr *hdr = (struct smb2_hdr *)buf; - struct smb2_pdu *pdu = (struct smb2_pdu *)hdr; + struct smb2_pdu *pdu = (struct smb2_pdu *)buf; + struct smb2_hdr *hdr = &pdu->hdr; + struct smb2_sync_hdr *shdr = get_sync_hdr(hdr); int offset; /* the offset from the beginning of SMB to data area */ int data_length; /* the length of the variable length data area */ /* Structure Size has already been checked to make sure it is 64 */ - int len = 4 + le16_to_cpu(pdu->hdr.StructureSize); + int len = 4 + le16_to_cpu(shdr->StructureSize); /* * StructureSize2, ie length of fixed parameter area has already @@ -371,7 +376,7 @@ smb2_calc_size(void *buf) */ len += le16_to_cpu(pdu->StructureSize2); - if (has_smb2_data_area[le16_to_cpu(hdr->Command)] == false) + if (has_smb2_data_area[le16_to_cpu(shdr->Command)] == false) goto calc_size_exit; smb2_get_data_area_len(&offset, &data_length, hdr); @@ -582,7 +587,7 @@ smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info 
*server) cifs_dbg(FYI, "Checking for oplock break\n"); - if (rsp->hdr.Command != SMB2_OPLOCK_BREAK) + if (rsp->hdr.sync_hdr.Command != SMB2_OPLOCK_BREAK) return false; if (rsp->StructureSize != diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index 5d456ebb3813..a44b4dbe4aae 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -20,6 +20,8 @@ #include <linux/pagemap.h> #include <linux/vfs.h> #include <linux/falloc.h> +#include <linux/scatterlist.h> +#include <crypto/aead.h> #include "cifsglob.h" #include "smb2pdu.h" #include "smb2proto.h" @@ -119,7 +121,9 @@ smb2_get_credits_field(struct TCP_Server_Info *server, const int optype) static unsigned int smb2_get_credits(struct mid_q_entry *mid) { - return le16_to_cpu(((struct smb2_hdr *)mid->resp_buf)->CreditRequest); + struct smb2_sync_hdr *shdr = get_sync_hdr(mid->resp_buf); + + return le16_to_cpu(shdr->CreditRequest); } static int @@ -184,10 +188,10 @@ static struct mid_q_entry * smb2_find_mid(struct TCP_Server_Info *server, char *buf) { struct mid_q_entry *mid; - struct smb2_hdr *hdr = (struct smb2_hdr *)buf; - __u64 wire_mid = le64_to_cpu(hdr->MessageId); + struct smb2_sync_hdr *shdr = get_sync_hdr(buf); + __u64 wire_mid = le64_to_cpu(shdr->MessageId); - if (hdr->ProtocolId == SMB2_TRANSFORM_PROTO_NUM) { + if (shdr->ProtocolId == SMB2_TRANSFORM_PROTO_NUM) { cifs_dbg(VFS, "encrypted frame parsing not supported yet"); return NULL; } @@ -196,7 +200,7 @@ smb2_find_mid(struct TCP_Server_Info *server, char *buf) list_for_each_entry(mid, &server->pending_mid_q, qhead) { if ((mid->mid == wire_mid) && (mid->mid_state == MID_REQUEST_SUBMITTED) && - (mid->command == hdr->Command)) { + (mid->command == shdr->Command)) { spin_unlock(&GlobalMid_Lock); return mid; } @@ -209,12 +213,12 @@ static void smb2_dump_detail(void *buf) { #ifdef CONFIG_CIFS_DEBUG2 - struct smb2_hdr *smb = (struct smb2_hdr *)buf; + struct smb2_sync_hdr *shdr = get_sync_hdr(buf); cifs_dbg(VFS, "Cmd: %d Err: 0x%x Flags: 0x%x Mid: %llu Pid: %d\n", - 
smb->Command, smb->Status, smb->Flags, smb->MessageId, - smb->ProcessId); - cifs_dbg(VFS, "smb buf %p len %u\n", smb, smb2_calc_size(smb)); + shdr->Command, shdr->Status, shdr->Flags, shdr->MessageId, + shdr->ProcessId); + cifs_dbg(VFS, "smb buf %p len %u\n", buf, smb2_calc_size(buf)); #endif } @@ -1002,14 +1006,14 @@ smb2_close_dir(const unsigned int xid, struct cifs_tcon *tcon, static bool smb2_is_status_pending(char *buf, struct TCP_Server_Info *server, int length) { - struct smb2_hdr *hdr = (struct smb2_hdr *)buf; + struct smb2_sync_hdr *shdr = get_sync_hdr(buf); - if (hdr->Status != STATUS_PENDING) + if (shdr->Status != STATUS_PENDING) return false; if (!length) { spin_lock(&server->req_lock); - server->credits += le16_to_cpu(hdr->CreditRequest); + server->credits += le16_to_cpu(shdr->CreditRequest); spin_unlock(&server->req_lock); wake_up(&server->request_q); } @@ -1545,6 +1549,633 @@ smb2_dir_needs_close(struct cifsFileInfo *cfile) return !cfile->invalidHandle; } +static void +fill_transform_hdr(struct smb2_transform_hdr *tr_hdr, struct smb_rqst *old_rq) +{ + struct smb2_sync_hdr *shdr = + (struct smb2_sync_hdr *)old_rq->rq_iov[1].iov_base; + unsigned int orig_len = get_rfc1002_length(old_rq->rq_iov[0].iov_base); + + memset(tr_hdr, 0, sizeof(struct smb2_transform_hdr)); + tr_hdr->ProtocolId = SMB2_TRANSFORM_PROTO_NUM; + tr_hdr->OriginalMessageSize = cpu_to_le32(orig_len); + tr_hdr->Flags = cpu_to_le16(0x01); + get_random_bytes(&tr_hdr->Nonce, SMB3_AES128CMM_NONCE); + memcpy(&tr_hdr->SessionId, &shdr->SessionId, 8); + inc_rfc1001_len(tr_hdr, sizeof(struct smb2_transform_hdr) - 4); + inc_rfc1001_len(tr_hdr, orig_len); +} + +static struct scatterlist * +init_sg(struct smb_rqst *rqst, u8 *sign) +{ + unsigned int sg_len = rqst->rq_nvec + rqst->rq_npages + 1; + unsigned int assoc_data_len = sizeof(struct smb2_transform_hdr) - 24; + struct scatterlist *sg; + unsigned int i; + unsigned int j; + + sg = kmalloc_array(sg_len, sizeof(struct scatterlist), GFP_KERNEL); + 
if (!sg) + return NULL; + + sg_init_table(sg, sg_len); + sg_set_buf(&sg[0], rqst->rq_iov[0].iov_base + 24, assoc_data_len); + for (i = 1; i < rqst->rq_nvec; i++) + sg_set_buf(&sg[i], rqst->rq_iov[i].iov_base, + rqst->rq_iov[i].iov_len); + for (j = 0; i < sg_len - 1; i++, j++) { + unsigned int len = (j < rqst->rq_npages - 1) ? rqst->rq_pagesz + : rqst->rq_tailsz; + sg_set_page(&sg[i], rqst->rq_pages[j], len, 0); + } + sg_set_buf(&sg[sg_len - 1], sign, SMB2_SIGNATURE_SIZE); + return sg; +} + +struct cifs_crypt_result { + int err; + struct completion completion; +}; + +static void cifs_crypt_complete(struct crypto_async_request *req, int err) +{ + struct cifs_crypt_result *res = req->data; + + if (err == -EINPROGRESS) + return; + + res->err = err; + complete(&res->completion); +} + +/* + * Encrypt or decrypt @rqst message. @rqst has the following format: + * iov[0] - transform header (associate data), + * iov[1-N] and pages - data to encrypt. + * On success return encrypted data in iov[1-N] and pages, leave iov[0] + * untouched. + */ +static int +crypt_message(struct TCP_Server_Info *server, struct smb_rqst *rqst, int enc) +{ + struct smb2_transform_hdr *tr_hdr = + (struct smb2_transform_hdr *)rqst->rq_iov[0].iov_base; + unsigned int assoc_data_len = sizeof(struct smb2_transform_hdr) - 24; + struct cifs_ses *ses; + int rc = 0; + struct scatterlist *sg; + u8 sign[SMB2_SIGNATURE_SIZE] = {}; + struct aead_request *req; + char *iv; + unsigned int iv_len; + struct cifs_crypt_result result = {0, }; + struct crypto_aead *tfm; + unsigned int crypt_len = le32_to_cpu(tr_hdr->OriginalMessageSize); + + init_completion(&result.completion); + + ses = smb2_find_smb_ses(server, tr_hdr->SessionId); + if (!ses) { + cifs_dbg(VFS, "%s: Could not find session\n", __func__); + return 0; + } + + rc = smb3_crypto_aead_allocate(server); + if (rc) { + cifs_dbg(VFS, "%s: crypto alloc failed\n", __func__); + return rc; + } + + tfm = enc ? 
server->secmech.ccmaesencrypt : + server->secmech.ccmaesdecrypt; + rc = crypto_aead_setkey(tfm, enc ? ses->smb3encryptionkey : + ses->smb3decryptionkey, SMB3_SIGN_KEY_SIZE); + if (rc) { + cifs_dbg(VFS, "%s: Failed to set aead key %d\n", __func__, rc); + return rc; + } + + rc = crypto_aead_setauthsize(tfm, SMB2_SIGNATURE_SIZE); + if (rc) { + cifs_dbg(VFS, "%s: Failed to set authsize %d\n", __func__, rc); + return rc; + } + + req = aead_request_alloc(tfm, GFP_KERNEL); + if (!req) { + cifs_dbg(VFS, "%s: Failed to alloc aead request", __func__); + return -ENOMEM; + } + + if (!enc) { + memcpy(sign, &tr_hdr->Signature, SMB2_SIGNATURE_SIZE); + crypt_len += SMB2_SIGNATURE_SIZE; + } + + sg = init_sg(rqst, sign); + if (!sg) { + cifs_dbg(VFS, "%s: Failed to init sg %d", __func__, rc); + goto free_req; + } + + iv_len = crypto_aead_ivsize(tfm); + iv = kzalloc(iv_len, GFP_KERNEL); + if (!iv) { + cifs_dbg(VFS, "%s: Failed to alloc IV", __func__); + goto free_sg; + } + iv[0] = 3; + memcpy(iv + 1, (char *)tr_hdr->Nonce, SMB3_AES128CMM_NONCE); + + aead_request_set_crypt(req, sg, sg, crypt_len, iv); + aead_request_set_ad(req, assoc_data_len); + + aead_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG, + cifs_crypt_complete, &result); + + rc = enc ? 
crypto_aead_encrypt(req) : crypto_aead_decrypt(req); + + if (rc == -EINPROGRESS || rc == -EBUSY) { + wait_for_completion(&result.completion); + rc = result.err; + } + + if (!rc && enc) + memcpy(&tr_hdr->Signature, sign, SMB2_SIGNATURE_SIZE); + + kfree(iv); +free_sg: + kfree(sg); +free_req: + kfree(req); + return rc; +} + +static int +smb3_init_transform_rq(struct TCP_Server_Info *server, struct smb_rqst *new_rq, + struct smb_rqst *old_rq) +{ + struct kvec *iov; + struct page **pages; + struct smb2_transform_hdr *tr_hdr; + unsigned int npages = old_rq->rq_npages; + int i; + int rc = -ENOMEM; + + pages = kmalloc_array(npages, sizeof(struct page *), GFP_KERNEL); + if (!pages) + return rc; + + new_rq->rq_pages = pages; + new_rq->rq_npages = old_rq->rq_npages; + new_rq->rq_pagesz = old_rq->rq_pagesz; + new_rq->rq_tailsz = old_rq->rq_tailsz; + + for (i = 0; i < npages; i++) { + pages[i] = alloc_page(GFP_KERNEL|__GFP_HIGHMEM); + if (!pages[i]) + goto err_free_pages; + } + + iov = kmalloc_array(old_rq->rq_nvec, sizeof(struct kvec), GFP_KERNEL); + if (!iov) + goto err_free_pages; + + /* copy all iovs from the old except the 1st one (rfc1002 length) */ + memcpy(&iov[1], &old_rq->rq_iov[1], + sizeof(struct kvec) * (old_rq->rq_nvec - 1)); + new_rq->rq_iov = iov; + new_rq->rq_nvec = old_rq->rq_nvec; + + tr_hdr = kmalloc(sizeof(struct smb2_transform_hdr), GFP_KERNEL); + if (!tr_hdr) + goto err_free_iov; + + /* fill the 1st iov with a transform header */ + fill_transform_hdr(tr_hdr, old_rq); + new_rq->rq_iov[0].iov_base = tr_hdr; + new_rq->rq_iov[0].iov_len = sizeof(struct smb2_transform_hdr); + + /* copy pages form the old */ + for (i = 0; i < npages; i++) { + char *dst = kmap(new_rq->rq_pages[i]); + char *src = kmap(old_rq->rq_pages[i]); + unsigned int len = (i < npages - 1) ? 
new_rq->rq_pagesz : + new_rq->rq_tailsz; + memcpy(dst, src, len); + kunmap(new_rq->rq_pages[i]); + kunmap(old_rq->rq_pages[i]); + } + + rc = crypt_message(server, new_rq, 1); + cifs_dbg(FYI, "encrypt message returned %d", rc); + if (rc) + goto err_free_tr_hdr; + + return rc; + +err_free_tr_hdr: + kfree(tr_hdr); +err_free_iov: + kfree(iov); +err_free_pages: + for (i = i - 1; i >= 0; i--) + put_page(pages[i]); + kfree(pages); + return rc; +} + +static void +smb3_free_transform_rq(struct smb_rqst *rqst) +{ + int i = rqst->rq_npages - 1; + + for (; i >= 0; i--) + put_page(rqst->rq_pages[i]); + kfree(rqst->rq_pages); + /* free transform header */ + kfree(rqst->rq_iov[0].iov_base); + kfree(rqst->rq_iov); +} + +static int +smb3_is_transform_hdr(void *buf) +{ + struct smb2_transform_hdr *trhdr = buf; + + return trhdr->ProtocolId == SMB2_TRANSFORM_PROTO_NUM; +} + +static int +decrypt_raw_data(struct TCP_Server_Info *server, char *buf, + unsigned int buf_data_size, struct page **pages, + unsigned int npages, unsigned int page_data_size) +{ + struct kvec iov[2]; + struct smb_rqst rqst = {NULL}; + struct smb2_hdr *hdr; + int rc; + + iov[0].iov_base = buf; + iov[0].iov_len = sizeof(struct smb2_transform_hdr); + iov[1].iov_base = buf + sizeof(struct smb2_transform_hdr); + iov[1].iov_len = buf_data_size; + + rqst.rq_iov = iov; + rqst.rq_nvec = 2; + rqst.rq_pages = pages; + rqst.rq_npages = npages; + rqst.rq_pagesz = PAGE_SIZE; + rqst.rq_tailsz = (page_data_size % PAGE_SIZE) ? 
: PAGE_SIZE; + + rc = crypt_message(server, &rqst, 0); + cifs_dbg(FYI, "decrypt message returned %d\n", rc); + + if (rc) + return rc; + + memmove(buf + 4, iov[1].iov_base, buf_data_size); + hdr = (struct smb2_hdr *)buf; + hdr->smb2_buf_length = cpu_to_be32(buf_data_size + page_data_size); + server->total_read = buf_data_size + page_data_size + 4; + + return rc; +} + +static int +read_data_into_pages(struct TCP_Server_Info *server, struct page **pages, + unsigned int npages, unsigned int len) +{ + int i; + int length; + + for (i = 0; i < npages; i++) { + struct page *page = pages[i]; + size_t n; + + n = len; + if (len >= PAGE_SIZE) { + /* enough data to fill the page */ + n = PAGE_SIZE; + len -= n; + } else { + zero_user(page, len, PAGE_SIZE - len); + len = 0; + } + length = cifs_read_page_from_socket(server, page, n); + if (length < 0) + return length; + server->total_read += length; + } + + return 0; +} + +static int +init_read_bvec(struct page **pages, unsigned int npages, unsigned int data_size, + unsigned int cur_off, struct bio_vec **page_vec) +{ + struct bio_vec *bvec; + int i; + + bvec = kcalloc(npages, sizeof(struct bio_vec), GFP_KERNEL); + if (!bvec) + return -ENOMEM; + + for (i = 0; i < npages; i++) { + bvec[i].bv_page = pages[i]; + bvec[i].bv_offset = (i == 0) ? 
cur_off : 0; + bvec[i].bv_len = min_t(unsigned int, PAGE_SIZE, data_size); + data_size -= bvec[i].bv_len; + } + + if (data_size != 0) { + cifs_dbg(VFS, "%s: something went wrong\n", __func__); + kfree(bvec); + return -EIO; + } + + *page_vec = bvec; + return 0; +} + +static int +handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid, + char *buf, unsigned int buf_len, struct page **pages, + unsigned int npages, unsigned int page_data_size) +{ + unsigned int data_offset; + unsigned int data_len; + unsigned int cur_off; + unsigned int cur_page_idx; + unsigned int pad_len; + struct cifs_readdata *rdata = mid->callback_data; + struct smb2_sync_hdr *shdr = get_sync_hdr(buf); + struct bio_vec *bvec = NULL; + struct iov_iter iter; + struct kvec iov; + int length; + + if (shdr->Command != SMB2_READ) { + cifs_dbg(VFS, "only big read responses are supported\n"); + return -ENOTSUPP; + } + + if (server->ops->is_status_pending && + server->ops->is_status_pending(buf, server, 0)) + return -1; + + rdata->result = server->ops->map_error(buf, false); + if (rdata->result != 0) { + cifs_dbg(FYI, "%s: server returned error %d\n", + __func__, rdata->result); + dequeue_mid(mid, rdata->result); + return 0; + } + + data_offset = server->ops->read_data_offset(buf) + 4; + data_len = server->ops->read_data_length(buf); + + if (data_offset < server->vals->read_rsp_size) { + /* + * win2k8 sometimes sends an offset of 0 when the read + * is beyond the EOF. Treat it as if the data starts just after + * the header. 
+ */ + cifs_dbg(FYI, "%s: data offset (%u) inside read response header\n", + __func__, data_offset); + data_offset = server->vals->read_rsp_size; + } else if (data_offset > MAX_CIFS_SMALL_BUFFER_SIZE) { + /* data_offset is beyond the end of smallbuf */ + cifs_dbg(FYI, "%s: data offset (%u) beyond end of smallbuf\n", + __func__, data_offset); + rdata->result = -EIO; + dequeue_mid(mid, rdata->result); + return 0; + } + + pad_len = data_offset - server->vals->read_rsp_size; + + if (buf_len <= data_offset) { + /* read response payload is in pages */ + cur_page_idx = pad_len / PAGE_SIZE; + cur_off = pad_len % PAGE_SIZE; + + if (cur_page_idx != 0) { + /* data offset is beyond the 1st page of response */ + cifs_dbg(FYI, "%s: data offset (%u) beyond 1st page of response\n", + __func__, data_offset); + rdata->result = -EIO; + dequeue_mid(mid, rdata->result); + return 0; + } + + if (data_len > page_data_size - pad_len) { + /* data_len is corrupt -- discard frame */ + rdata->result = -EIO; + dequeue_mid(mid, rdata->result); + return 0; + } + + rdata->result = init_read_bvec(pages, npages, page_data_size, + cur_off, &bvec); + if (rdata->result != 0) { + dequeue_mid(mid, rdata->result); + return 0; + } + + iov_iter_bvec(&iter, WRITE | ITER_BVEC, bvec, npages, data_len); + } else if (buf_len >= data_offset + data_len) { + /* read response payload is in buf */ + WARN_ONCE(npages > 0, "read data can be either in buf or in pages"); + iov.iov_base = buf + data_offset; + iov.iov_len = data_len; + iov_iter_kvec(&iter, WRITE | ITER_KVEC, &iov, 1, data_len); + } else { + /* read response payload cannot be in both buf and pages */ + WARN_ONCE(1, "buf can not contain only a part of read data"); + rdata->result = -EIO; + dequeue_mid(mid, rdata->result); + return 0; + } + + /* set up first iov for signature check */ + rdata->iov[0].iov_base = buf; + rdata->iov[0].iov_len = 4; + rdata->iov[1].iov_base = buf + 4; + rdata->iov[1].iov_len = server->vals->read_rsp_size - 4; + cifs_dbg(FYI, "0: 
iov_base=%p iov_len=%zu\n", + rdata->iov[0].iov_base, server->vals->read_rsp_size); + + length = rdata->copy_into_pages(server, rdata, &iter); + + kfree(bvec); + + if (length < 0) + return length; + + dequeue_mid(mid, false); + return length; +} + +static int +receive_encrypted_read(struct TCP_Server_Info *server, struct mid_q_entry **mid) +{ + char *buf = server->smallbuf; + struct smb2_transform_hdr *tr_hdr = (struct smb2_transform_hdr *)buf; + unsigned int npages; + struct page **pages; + unsigned int len; + unsigned int buflen = get_rfc1002_length(buf) + 4; + int rc; + int i = 0; + + len = min_t(unsigned int, buflen, server->vals->read_rsp_size - 4 + + sizeof(struct smb2_transform_hdr)) - HEADER_SIZE(server) + 1; + + rc = cifs_read_from_socket(server, buf + HEADER_SIZE(server) - 1, len); + if (rc < 0) + return rc; + server->total_read += rc; + + len = le32_to_cpu(tr_hdr->OriginalMessageSize) + 4 - + server->vals->read_rsp_size; + npages = DIV_ROUND_UP(len, PAGE_SIZE); + + pages = kmalloc_array(npages, sizeof(struct page *), GFP_KERNEL); + if (!pages) { + rc = -ENOMEM; + goto discard_data; + } + + for (; i < npages; i++) { + pages[i] = alloc_page(GFP_KERNEL|__GFP_HIGHMEM); + if (!pages[i]) { + rc = -ENOMEM; + goto discard_data; + } + } + + /* read read data into pages */ + rc = read_data_into_pages(server, pages, npages, len); + if (rc) + goto free_pages; + + rc = cifs_discard_remaining_data(server); + if (rc) + goto free_pages; + + rc = decrypt_raw_data(server, buf, server->vals->read_rsp_size - 4, + pages, npages, len); + if (rc) + goto free_pages; + + *mid = smb2_find_mid(server, buf); + if (*mid == NULL) + cifs_dbg(FYI, "mid not found\n"); + else { + cifs_dbg(FYI, "mid found\n"); + (*mid)->decrypted = true; + rc = handle_read_data(server, *mid, buf, + server->vals->read_rsp_size, + pages, npages, len); + } + +free_pages: + for (i = i - 1; i >= 0; i--) + put_page(pages[i]); + kfree(pages); + return rc; +discard_data: + cifs_discard_remaining_data(server); + 
goto free_pages; +} + +static int +receive_encrypted_standard(struct TCP_Server_Info *server, + struct mid_q_entry **mid) +{ + int length; + char *buf = server->smallbuf; + unsigned int pdu_length = get_rfc1002_length(buf); + unsigned int buf_size; + struct mid_q_entry *mid_entry; + + /* switch to large buffer if too big for a small one */ + if (pdu_length + 4 > MAX_CIFS_SMALL_BUFFER_SIZE) { + server->large_buf = true; + memcpy(server->bigbuf, buf, server->total_read); + buf = server->bigbuf; + } + + /* now read the rest */ + length = cifs_read_from_socket(server, buf + HEADER_SIZE(server) - 1, + pdu_length - HEADER_SIZE(server) + 1 + 4); + if (length < 0) + return length; + server->total_read += length; + + buf_size = pdu_length + 4 - sizeof(struct smb2_transform_hdr); + length = decrypt_raw_data(server, buf, buf_size, NULL, 0, 0); + if (length) + return length; + + mid_entry = smb2_find_mid(server, buf); + if (mid_entry == NULL) + cifs_dbg(FYI, "mid not found\n"); + else { + cifs_dbg(FYI, "mid found\n"); + mid_entry->decrypted = true; + } + + *mid = mid_entry; + + if (mid_entry && mid_entry->handle) + return mid_entry->handle(server, mid_entry); + + return cifs_handle_standard(server, mid_entry); +} + +static int +smb3_receive_transform(struct TCP_Server_Info *server, struct mid_q_entry **mid) +{ + char *buf = server->smallbuf; + unsigned int pdu_length = get_rfc1002_length(buf); + struct smb2_transform_hdr *tr_hdr = (struct smb2_transform_hdr *)buf; + unsigned int orig_len = le32_to_cpu(tr_hdr->OriginalMessageSize); + + if (pdu_length + 4 < sizeof(struct smb2_transform_hdr) + + sizeof(struct smb2_sync_hdr)) { + cifs_dbg(VFS, "Transform message is too small (%u)\n", + pdu_length); + cifs_reconnect(server); + wake_up(&server->response_q); + return -ECONNABORTED; + } + + if (pdu_length + 4 < orig_len + sizeof(struct smb2_transform_hdr)) { + cifs_dbg(VFS, "Transform message is broken\n"); + cifs_reconnect(server); + wake_up(&server->response_q); + return 
-ECONNABORTED; + } + + if (pdu_length + 4 > CIFSMaxBufSize + MAX_HEADER_SIZE(server)) + return receive_encrypted_read(server, mid); + + return receive_encrypted_standard(server, mid); +} + +int +smb3_handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid) +{ + char *buf = server->large_buf ? server->bigbuf : server->smallbuf; + + return handle_read_data(server, mid, buf, get_rfc1002_length(buf) + 4, + NULL, 0, 0); +} + struct smb_version_operations smb20_operations = { .compare_fids = smb2_compare_fids, .setup_request = smb2_setup_request, @@ -1791,6 +2422,10 @@ struct smb_version_operations smb30_operations = { .dir_needs_close = smb2_dir_needs_close, .fallocate = smb3_fallocate, .enum_snapshots = smb3_enum_snapshots, + .init_transform_rq = smb3_init_transform_rq, + .free_transform_rq = smb3_free_transform_rq, + .is_transform_hdr = smb3_is_transform_hdr, + .receive_transform = smb3_receive_transform, }; #ifdef CONFIG_CIFS_SMB311 @@ -1879,6 +2514,10 @@ struct smb_version_operations smb311_operations = { .dir_needs_close = smb2_dir_needs_close, .fallocate = smb3_fallocate, .enum_snapshots = smb3_enum_snapshots, + .init_transform_rq = smb3_init_transform_rq, + .free_transform_rq = smb3_free_transform_rq, + .is_transform_hdr = smb3_is_transform_hdr, + .receive_transform = smb3_receive_transform, }; #endif /* CIFS_SMB311 */ diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 87457227812c..ad83b3db2840 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -77,45 +77,42 @@ static const int smb2_req_struct_sizes[NUMBER_OF_SMB2_COMMANDS] = { /* SMB2_OPLOCK_BREAK */ 24 /* BB this is 36 for LEASE_BREAK variant */ }; +static int encryption_required(const struct cifs_tcon *tcon) +{ + if (!tcon) + return 0; + if ((tcon->ses->session_flags & SMB2_SESSION_FLAG_ENCRYPT_DATA) || + (tcon->share_flags & SHI1005_FLAGS_ENCRYPT_DATA)) + return 1; + if (tcon->seal && + (tcon->ses->server->capabilities & SMB2_GLOBAL_CAP_ENCRYPTION)) + return 1; + return 0; +} 
static void -smb2_hdr_assemble(struct smb2_hdr *hdr, __le16 smb2_cmd /* command */ , +smb2_hdr_assemble(struct smb2_sync_hdr *shdr, __le16 smb2_cmd, const struct cifs_tcon *tcon) { - struct smb2_pdu *pdu = (struct smb2_pdu *)hdr; - char *temp = (char *)hdr; - /* lookup word count ie StructureSize from table */ - __u16 parmsize = smb2_req_struct_sizes[le16_to_cpu(smb2_cmd)]; - - /* - * smaller than SMALL_BUFFER_SIZE but bigger than fixed area of - * largest operations (Create) - */ - memset(temp, 0, 256); - - /* Note this is only network field converted to big endian */ - hdr->smb2_buf_length = cpu_to_be32(parmsize + sizeof(struct smb2_hdr) - - 4 /* RFC 1001 length field itself not counted */); - - hdr->ProtocolId = SMB2_PROTO_NUMBER; - hdr->StructureSize = cpu_to_le16(64); - hdr->Command = smb2_cmd; + shdr->ProtocolId = SMB2_PROTO_NUMBER; + shdr->StructureSize = cpu_to_le16(64); + shdr->Command = smb2_cmd; if (tcon && tcon->ses && tcon->ses->server) { struct TCP_Server_Info *server = tcon->ses->server; spin_lock(&server->req_lock); /* Request up to 2 credits but don't go over the limit. 
*/ if (server->credits >= server->max_credits) - hdr->CreditRequest = cpu_to_le16(0); + shdr->CreditRequest = cpu_to_le16(0); else - hdr->CreditRequest = cpu_to_le16( + shdr->CreditRequest = cpu_to_le16( min_t(int, server->max_credits - server->credits, 2)); spin_unlock(&server->req_lock); } else { - hdr->CreditRequest = cpu_to_le16(2); + shdr->CreditRequest = cpu_to_le16(2); } - hdr->ProcessId = cpu_to_le32((__u16)current->tgid); + shdr->ProcessId = cpu_to_le32((__u16)current->tgid); if (!tcon) goto out; @@ -124,13 +121,13 @@ smb2_hdr_assemble(struct smb2_hdr *hdr, __le16 smb2_cmd /* command */ , /* See sections 2.2.4 and 3.2.4.1.5 of MS-SMB2 */ if ((tcon->ses) && (tcon->ses->server) && (tcon->ses->server->capabilities & SMB2_GLOBAL_CAP_LARGE_MTU)) - hdr->CreditCharge = cpu_to_le16(1); + shdr->CreditCharge = cpu_to_le16(1); /* else CreditCharge MBZ */ - hdr->TreeId = tcon->tid; + shdr->TreeId = tcon->tid; /* Uid is not converted */ if (tcon->ses) - hdr->SessionId = tcon->ses->Suid; + shdr->SessionId = tcon->ses->Suid; /* * If we would set SMB2_FLAGS_DFS_OPERATIONS on open we also would have @@ -143,12 +140,12 @@ smb2_hdr_assemble(struct smb2_hdr *hdr, __le16 smb2_cmd /* command */ , * but it is safer to net set it for now. 
*/ /* if (tcon->share_flags & SHI1005_FLAGS_DFS) - hdr->Flags |= SMB2_FLAGS_DFS_OPERATIONS; */ + shdr->Flags |= SMB2_FLAGS_DFS_OPERATIONS; */ - if (tcon->ses && tcon->ses->server && tcon->ses->server->sign) - hdr->Flags |= SMB2_FLAGS_SIGNED; + if (tcon->ses && tcon->ses->server && tcon->ses->server->sign && + !encryption_required(tcon)) + shdr->Flags |= SMB2_FLAGS_SIGNED; out: - pdu->StructureSize2 = cpu_to_le16(parmsize); return; } @@ -289,16 +286,74 @@ out: return rc; } +static void +fill_small_buf(__le16 smb2_command, struct cifs_tcon *tcon, void *buf, + unsigned int *total_len) +{ + struct smb2_sync_pdu *spdu = (struct smb2_sync_pdu *)buf; + /* lookup word count ie StructureSize from table */ + __u16 parmsize = smb2_req_struct_sizes[le16_to_cpu(smb2_command)]; + + /* + * smaller than SMALL_BUFFER_SIZE but bigger than fixed area of + * largest operations (Create) + */ + memset(buf, 0, 256); + + smb2_hdr_assemble(&spdu->sync_hdr, smb2_command, tcon); + spdu->StructureSize2 = cpu_to_le16(parmsize); + + *total_len = parmsize + sizeof(struct smb2_sync_hdr); +} + +/* init request without RFC1001 length at the beginning */ +static int +smb2_plain_req_init(__le16 smb2_command, struct cifs_tcon *tcon, + void **request_buf, unsigned int *total_len) +{ + int rc; + struct smb2_sync_hdr *shdr; + + rc = smb2_reconnect(smb2_command, tcon); + if (rc) + return rc; + + /* BB eventually switch this to SMB2 specific small buf size */ + *request_buf = cifs_small_buf_get(); + if (*request_buf == NULL) { + /* BB should we add a retry in here if not a writepage? 
*/ + return -ENOMEM; + } + + shdr = (struct smb2_sync_hdr *)(*request_buf); + + fill_small_buf(smb2_command, tcon, shdr, total_len); + + if (tcon != NULL) { +#ifdef CONFIG_CIFS_STATS2 + uint16_t com_code = le16_to_cpu(smb2_command); + + cifs_stats_inc(&tcon->stats.smb2_stats.smb2_com_sent[com_code]); +#endif + cifs_stats_inc(&tcon->num_smbs_sent); + } + + return rc; +} + /* * Allocate and return pointer to an SMB request hdr, and set basic * SMB information in the SMB header. If the return code is zero, this - * function must have filled in request_buf pointer. + * function must have filled in request_buf pointer. The returned buffer + * has RFC1001 length at the beginning. */ static int small_smb2_init(__le16 smb2_command, struct cifs_tcon *tcon, void **request_buf) { - int rc = 0; + int rc; + unsigned int total_len; + struct smb2_pdu *pdu; rc = smb2_reconnect(smb2_command, tcon); if (rc) @@ -311,7 +366,12 @@ small_smb2_init(__le16 smb2_command, struct cifs_tcon *tcon, return -ENOMEM; } - smb2_hdr_assemble((struct smb2_hdr *) *request_buf, smb2_command, tcon); + pdu = (struct smb2_pdu *)(*request_buf); + + fill_small_buf(smb2_command, tcon, get_sync_hdr(pdu), &total_len); + + /* Note this is only network field converted to big endian */ + pdu->hdr.smb2_buf_length = cpu_to_be32(total_len); if (tcon != NULL) { #ifdef CONFIG_CIFS_STATS2 @@ -376,7 +436,6 @@ static void assemble_neg_contexts(struct smb2_negotiate_req *req) } #endif /* SMB311 */ - /* * * SMB2 Worker functions follow: @@ -398,6 +457,7 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses) struct smb2_negotiate_req *req; struct smb2_negotiate_rsp *rsp; struct kvec iov[1]; + struct kvec rsp_iov; int rc = 0; int resp_buftype; struct TCP_Server_Info *server = ses->server; @@ -416,7 +476,7 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses) if (rc) return rc; - req->hdr.SessionId = 0; + req->hdr.sync_hdr.SessionId = 0; req->Dialects[0] = cpu_to_le16(ses->server->vals->protocol_id); @@ 
-446,9 +506,9 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses) /* 4 for rfc1002 length field */ iov[0].iov_len = get_rfc1002_length(req) + 4; - rc = SendReceive2(xid, ses, iov, 1, &resp_buftype, flags); - - rsp = (struct smb2_negotiate_rsp *)iov[0].iov_base; + rc = SendReceive2(xid, ses, iov, 1, &resp_buftype, flags, &rsp_iov); + cifs_small_buf_release(req); + rsp = (struct smb2_negotiate_rsp *)rsp_iov.iov_base; /* * No tcon so can't do * cifs_stats_inc(&tcon->stats.smb2_stats.smb2_com_fail[SMB2...]); @@ -627,14 +687,15 @@ SMB2_sess_alloc_buffer(struct SMB2_sess_data *sess_data) if (rc) return rc; - req->hdr.SessionId = 0; /* First session, not a reauthenticate */ + /* First session, not a reauthenticate */ + req->hdr.sync_hdr.SessionId = 0; /* if reconnect, we need to send previous sess id, otherwise it is 0 */ req->PreviousSessionId = sess_data->previous_session; req->Flags = 0; /* MBZ */ /* to enable echos and oplocks */ - req->hdr.CreditRequest = cpu_to_le16(3); + req->hdr.sync_hdr.CreditRequest = cpu_to_le16(3); /* only one of SMB2 signing flags may be set in SMB2 request */ if (server->sign) @@ -671,6 +732,7 @@ SMB2_sess_sendreceive(struct SMB2_sess_data *sess_data) { int rc; struct smb2_sess_setup_req *req = sess_data->iov[0].iov_base; + struct kvec rsp_iov = { NULL, 0 }; /* Testing shows that buffer offset must be at location of Buffer[0] */ req->SecurityBufferOffset = @@ -685,7 +747,9 @@ SMB2_sess_sendreceive(struct SMB2_sess_data *sess_data) rc = SendReceive2(sess_data->xid, sess_data->ses, sess_data->iov, 2, &sess_data->buf0_type, - CIFS_LOG_ERROR | CIFS_NEG_OP); + CIFS_LOG_ERROR | CIFS_NEG_OP, &rsp_iov); + cifs_small_buf_release(sess_data->iov[0].iov_base); + memcpy(&sess_data->iov[0], &rsp_iov, sizeof(struct kvec)); return rc; } @@ -697,15 +761,13 @@ SMB2_sess_establish_session(struct SMB2_sess_data *sess_data) struct cifs_ses *ses = sess_data->ses; mutex_lock(&ses->server->srv_mutex); - if (ses->server->sign && 
ses->server->ops->generate_signingkey) { + if (ses->server->ops->generate_signingkey) { rc = ses->server->ops->generate_signingkey(ses); - kfree(ses->auth_key.response); - ses->auth_key.response = NULL; if (rc) { cifs_dbg(FYI, "SMB3 session key generation failed\n"); mutex_unlock(&ses->server->srv_mutex); - goto keygen_exit; + return rc; } } if (!ses->server->session_estab) { @@ -719,12 +781,6 @@ SMB2_sess_establish_session(struct SMB2_sess_data *sess_data) ses->status = CifsGood; ses->need_reconnect = false; spin_unlock(&GlobalMid_Lock); - -keygen_exit: - if (!ses->server->sign) { - kfree(ses->auth_key.response); - ses->auth_key.response = NULL; - } return rc; } @@ -781,11 +837,9 @@ SMB2_auth_kerberos(struct SMB2_sess_data *sess_data) goto out_put_spnego_key; rsp = (struct smb2_sess_setup_rsp *)sess_data->iov[0].iov_base; - ses->Suid = rsp->hdr.SessionId; + ses->Suid = rsp->hdr.sync_hdr.SessionId; ses->session_flags = le16_to_cpu(rsp->SessionFlags); - if (ses->session_flags & SMB2_SESSION_FLAG_ENCRYPT_DATA) - cifs_dbg(VFS, "SMB3 encryption not supported yet\n"); rc = SMB2_sess_establish_session(sess_data); out_put_spnego_key: @@ -859,7 +913,7 @@ SMB2_sess_auth_rawntlmssp_negotiate(struct SMB2_sess_data *sess_data) /* If true, rc here is expected and not an error */ if (sess_data->buf0_type != CIFS_NO_BUFFER && - rsp->hdr.Status == STATUS_MORE_PROCESSING_REQUIRED) + rsp->hdr.sync_hdr.Status == STATUS_MORE_PROCESSING_REQUIRED) rc = 0; if (rc) @@ -880,10 +934,8 @@ SMB2_sess_auth_rawntlmssp_negotiate(struct SMB2_sess_data *sess_data) cifs_dbg(FYI, "rawntlmssp session setup challenge phase\n"); - ses->Suid = rsp->hdr.SessionId; + ses->Suid = rsp->hdr.sync_hdr.SessionId; ses->session_flags = le16_to_cpu(rsp->SessionFlags); - if (ses->session_flags & SMB2_SESSION_FLAG_ENCRYPT_DATA) - cifs_dbg(VFS, "SMB3 encryption not supported yet\n"); out: kfree(ntlmssp_blob); @@ -916,7 +968,7 @@ SMB2_sess_auth_rawntlmssp_authenticate(struct SMB2_sess_data *sess_data) goto out; req = 
(struct smb2_sess_setup_req *) sess_data->iov[0].iov_base; - req->hdr.SessionId = ses->Suid; + req->hdr.sync_hdr.SessionId = ses->Suid; rc = build_ntlmssp_auth_blob(&ntlmssp_blob, &blob_length, ses, sess_data->nls_cp); @@ -940,10 +992,8 @@ SMB2_sess_auth_rawntlmssp_authenticate(struct SMB2_sess_data *sess_data) rsp = (struct smb2_sess_setup_rsp *)sess_data->iov[0].iov_base; - ses->Suid = rsp->hdr.SessionId; + ses->Suid = rsp->hdr.sync_hdr.SessionId; ses->session_flags = le16_to_cpu(rsp->SessionFlags); - if (ses->session_flags & SMB2_SESSION_FLAG_ENCRYPT_DATA) - cifs_dbg(VFS, "SMB3 encryption not supported yet\n"); rc = SMB2_sess_establish_session(sess_data); out: @@ -1018,6 +1068,7 @@ SMB2_logoff(const unsigned int xid, struct cifs_ses *ses) struct smb2_logoff_req *req; /* response is also trivial struct */ int rc = 0; struct TCP_Server_Info *server; + int flags = 0; cifs_dbg(FYI, "disconnect session %p\n", ses); @@ -1035,11 +1086,15 @@ SMB2_logoff(const unsigned int xid, struct cifs_ses *ses) return rc; /* since no tcon, smb2_init can not do this, so do here */ - req->hdr.SessionId = ses->Suid; - if (server->sign) - req->hdr.Flags |= SMB2_FLAGS_SIGNED; + req->hdr.sync_hdr.SessionId = ses->Suid; + + if (ses->session_flags & SMB2_SESSION_FLAG_ENCRYPT_DATA) + flags |= CIFS_TRANSFORM_REQ; + else if (server->sign) + req->hdr.sync_hdr.Flags |= SMB2_FLAGS_SIGNED; - rc = SendReceiveNoRsp(xid, ses, (char *) &req->hdr, 0); + rc = SendReceiveNoRsp(xid, ses, (char *) req, flags); + cifs_small_buf_release(req); /* * No tcon so can't do * cifs_stats_inc(&tcon->stats.smb2_stats.smb2_com_fail[SMB2...]); @@ -1071,11 +1126,13 @@ SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree, struct smb2_tree_connect_req *req; struct smb2_tree_connect_rsp *rsp = NULL; struct kvec iov[2]; + struct kvec rsp_iov; int rc = 0; int resp_buftype; int unc_path_len; struct TCP_Server_Info *server; __le16 *unc_path = NULL; + int flags = 0; cifs_dbg(FYI, "TCON\n"); @@ -1087,12 
+1144,6 @@ SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree, if (tcon && tcon->bad_network_name) return -ENOENT; - if ((tcon && tcon->seal) && - ((ses->server->capabilities & SMB2_GLOBAL_CAP_ENCRYPTION) == 0)) { - cifs_dbg(VFS, "encryption requested but no server support"); - return -EOPNOTSUPP; - } - unc_path = kmalloc(MAX_SHARENAME_LENGTH * 2, GFP_KERNEL); if (unc_path == NULL) return -ENOMEM; @@ -1111,11 +1162,15 @@ SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree, } if (tcon == NULL) { + if ((ses->session_flags & SMB2_SESSION_FLAG_ENCRYPT_DATA)) + flags |= CIFS_TRANSFORM_REQ; + /* since no tcon, smb2_init can not do this, so do here */ - req->hdr.SessionId = ses->Suid; + req->hdr.sync_hdr.SessionId = ses->Suid; /* if (ses->server->sec_mode & SECMODE_SIGN_REQUIRED) req->hdr.Flags |= SMB2_FLAGS_SIGNED; */ - } + } else if (encryption_required(tcon)) + flags |= CIFS_TRANSFORM_REQ; iov[0].iov_base = (char *)req; /* 4 for rfc1002 length field and 1 for pad */ @@ -1130,8 +1185,9 @@ SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree, inc_rfc1001_len(req, unc_path_len - 1 /* pad */); - rc = SendReceive2(xid, ses, iov, 2, &resp_buftype, 0); - rsp = (struct smb2_tree_connect_rsp *)iov[0].iov_base; + rc = SendReceive2(xid, ses, iov, 2, &resp_buftype, flags, &rsp_iov); + cifs_small_buf_release(req); + rsp = (struct smb2_tree_connect_rsp *)rsp_iov.iov_base; if (rc != 0) { if (tcon) { @@ -1142,7 +1198,7 @@ SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree, } if (tcon == NULL) { - ses->ipc_tid = rsp->hdr.TreeId; + ses->ipc_tid = rsp->hdr.sync_hdr.TreeId; goto tcon_exit; } @@ -1165,15 +1221,18 @@ SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree, tcon->maximal_access = le32_to_cpu(rsp->MaximalAccess); tcon->tidStatus = CifsGood; tcon->need_reconnect = false; - tcon->tid = rsp->hdr.TreeId; + tcon->tid = rsp->hdr.sync_hdr.TreeId; strlcpy(tcon->treeName, tree, 
sizeof(tcon->treeName)); if ((rsp->Capabilities & SMB2_SHARE_CAP_DFS) && ((tcon->share_flags & SHI1005_FLAGS_DFS) == 0)) cifs_dbg(VFS, "DFS capability contradicts DFS flag\n"); + + if (tcon->seal && + !(tcon->ses->server->capabilities & SMB2_GLOBAL_CAP_ENCRYPTION)) + cifs_dbg(VFS, "Encryption is requested but not supported\n"); + init_copy_chunk_defaults(tcon); - if (tcon->share_flags & SHI1005_FLAGS_ENCRYPT_DATA) - cifs_dbg(VFS, "Encrypted shares not supported"); if (tcon->ses->server->ops->validate_negotiate) rc = tcon->ses->server->ops->validate_negotiate(xid, tcon); tcon_exit: @@ -1182,7 +1241,7 @@ tcon_exit: return rc; tcon_error_exit: - if (rsp->hdr.Status == STATUS_BAD_NETWORK_NAME) { + if (rsp->hdr.sync_hdr.Status == STATUS_BAD_NETWORK_NAME) { cifs_dbg(VFS, "BAD_NETWORK_NAME: %s\n", tree); if (tcon) tcon->bad_network_name = true; @@ -1197,6 +1256,7 @@ SMB2_tdis(const unsigned int xid, struct cifs_tcon *tcon) int rc = 0; struct TCP_Server_Info *server; struct cifs_ses *ses = tcon->ses; + int flags = 0; cifs_dbg(FYI, "Tree Disconnect\n"); @@ -1212,7 +1272,11 @@ SMB2_tdis(const unsigned int xid, struct cifs_tcon *tcon) if (rc) return rc; - rc = SendReceiveNoRsp(xid, ses, (char *)&req->hdr, 0); + if (encryption_required(tcon)) + flags |= CIFS_TRANSFORM_REQ; + + rc = SendReceiveNoRsp(xid, ses, (char *)req, flags); + cifs_small_buf_release(req); if (rc) cifs_stats_fail_inc(tcon, SMB2_TREE_DISCONNECT_HE); @@ -1474,14 +1538,16 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path, struct cifs_tcon *tcon = oparms->tcon; struct cifs_ses *ses = tcon->ses; struct kvec iov[4]; + struct kvec rsp_iov; int resp_buftype; int uni_path_len; __le16 *copy_path = NULL; int copy_size; int rc = 0; - unsigned int num_iovecs = 2; + unsigned int n_iov = 2; __u32 file_attributes = 0; char *dhc_buf = NULL, *lc_buf = NULL; + int flags = 0; cifs_dbg(FYI, "create/open\n"); @@ -1494,6 +1560,9 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, 
__le16 *path, if (rc) return rc; + if (encryption_required(tcon)) + flags |= CIFS_TRANSFORM_REQ; + if (oparms->create_options & CREATE_OPTION_READONLY) file_attributes |= ATTR_READONLY; if (oparms->create_options & CREATE_OPTION_SPECIAL) @@ -1544,25 +1613,25 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path, *oplock == SMB2_OPLOCK_LEVEL_NONE) req->RequestedOplockLevel = *oplock; else { - rc = add_lease_context(server, iov, &num_iovecs, oplock); + rc = add_lease_context(server, iov, &n_iov, oplock); if (rc) { cifs_small_buf_release(req); kfree(copy_path); return rc; } - lc_buf = iov[num_iovecs-1].iov_base; + lc_buf = iov[n_iov-1].iov_base; } if (*oplock == SMB2_OPLOCK_LEVEL_BATCH) { /* need to set Next field of lease context if we request it */ if (server->capabilities & SMB2_GLOBAL_CAP_LEASING) { struct create_context *ccontext = - (struct create_context *)iov[num_iovecs-1].iov_base; + (struct create_context *)iov[n_iov-1].iov_base; ccontext->Next = cpu_to_le32(server->vals->create_lease_size); } - rc = add_durable_context(iov, &num_iovecs, oparms, + rc = add_durable_context(iov, &n_iov, oparms, tcon->use_persistent); if (rc) { cifs_small_buf_release(req); @@ -1570,11 +1639,12 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path, kfree(lc_buf); return rc; } - dhc_buf = iov[num_iovecs-1].iov_base; + dhc_buf = iov[n_iov-1].iov_base; } - rc = SendReceive2(xid, ses, iov, num_iovecs, &resp_buftype, 0); - rsp = (struct smb2_create_rsp *)iov[0].iov_base; + rc = SendReceive2(xid, ses, iov, n_iov, &resp_buftype, flags, &rsp_iov); + cifs_small_buf_release(req); + rsp = (struct smb2_create_rsp *)rsp_iov.iov_base; if (rc != 0) { cifs_stats_fail_inc(tcon, SMB2_CREATE_HE); @@ -1618,12 +1688,15 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, { struct smb2_ioctl_req *req; struct smb2_ioctl_rsp *rsp; + struct smb2_sync_hdr *shdr; struct TCP_Server_Info *server; struct cifs_ses *ses; 
struct kvec iov[2]; + struct kvec rsp_iov; int resp_buftype; - int num_iovecs; + int n_iov; int rc = 0; + int flags = 0; cifs_dbg(FYI, "SMB2 IOCTL\n"); @@ -1648,6 +1721,9 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, if (rc) return rc; + if (encryption_required(tcon)) + flags |= CIFS_TRANSFORM_REQ; + req->CtlCode = cpu_to_le32(opcode); req->PersistentFileId = persistent_fid; req->VolatileFileId = volatile_fid; @@ -1659,9 +1735,9 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, cpu_to_le32(offsetof(struct smb2_ioctl_req, Buffer) - 4); iov[1].iov_base = in_data; iov[1].iov_len = indatalen; - num_iovecs = 2; + n_iov = 2; } else - num_iovecs = 1; + n_iov = 1; req->OutputOffset = 0; req->OutputCount = 0; /* MBZ */ @@ -1698,8 +1774,9 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, iov[0].iov_len = get_rfc1002_length(req) + 4; - rc = SendReceive2(xid, ses, iov, num_iovecs, &resp_buftype, 0); - rsp = (struct smb2_ioctl_rsp *)iov[0].iov_base; + rc = SendReceive2(xid, ses, iov, n_iov, &resp_buftype, flags, &rsp_iov); + cifs_small_buf_release(req); + rsp = (struct smb2_ioctl_rsp *)rsp_iov.iov_base; if ((rc != 0) && (rc != -EINVAL)) { cifs_stats_fail_inc(tcon, SMB2_IOCTL_HE); @@ -1742,9 +1819,8 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, goto ioctl_exit; } - memcpy(*out_data, - (char *)&rsp->hdr.ProtocolId + le32_to_cpu(rsp->OutputOffset), - *plen); + shdr = get_sync_hdr(rsp); + memcpy(*out_data, (char *)shdr + le32_to_cpu(rsp->OutputOffset), *plen); ioctl_exit: free_rsp_buf(resp_buftype, rsp); return rc; @@ -1784,8 +1860,10 @@ SMB2_close(const unsigned int xid, struct cifs_tcon *tcon, struct TCP_Server_Info *server; struct cifs_ses *ses = tcon->ses; struct kvec iov[1]; + struct kvec rsp_iov; int resp_buftype; int rc = 0; + int flags = 0; cifs_dbg(FYI, "Close\n"); @@ -1798,6 +1876,9 @@ SMB2_close(const unsigned int xid, struct cifs_tcon 
*tcon, if (rc) return rc; + if (encryption_required(tcon)) + flags |= CIFS_TRANSFORM_REQ; + req->PersistentFileId = persistent_fid; req->VolatileFileId = volatile_fid; @@ -1805,8 +1886,9 @@ SMB2_close(const unsigned int xid, struct cifs_tcon *tcon, /* 4 for rfc1002 length field */ iov[0].iov_len = get_rfc1002_length(req) + 4; - rc = SendReceive2(xid, ses, iov, 1, &resp_buftype, 0); - rsp = (struct smb2_close_rsp *)iov[0].iov_base; + rc = SendReceive2(xid, ses, iov, 1, &resp_buftype, flags, &rsp_iov); + cifs_small_buf_release(req); + rsp = (struct smb2_close_rsp *)rsp_iov.iov_base; if (rc != 0) { cifs_stats_fail_inc(tcon, SMB2_CLOSE_HE); @@ -1885,10 +1967,12 @@ query_info(const unsigned int xid, struct cifs_tcon *tcon, struct smb2_query_info_req *req; struct smb2_query_info_rsp *rsp = NULL; struct kvec iov[2]; + struct kvec rsp_iov; int rc = 0; int resp_buftype; struct TCP_Server_Info *server; struct cifs_ses *ses = tcon->ses; + int flags = 0; cifs_dbg(FYI, "Query Info\n"); @@ -1901,6 +1985,9 @@ query_info(const unsigned int xid, struct cifs_tcon *tcon, if (rc) return rc; + if (encryption_required(tcon)) + flags |= CIFS_TRANSFORM_REQ; + req->InfoType = SMB2_O_INFO_FILE; req->FileInfoClass = info_class; req->PersistentFileId = persistent_fid; @@ -1914,8 +2001,9 @@ query_info(const unsigned int xid, struct cifs_tcon *tcon, /* 4 for rfc1002 length field */ iov[0].iov_len = get_rfc1002_length(req) + 4; - rc = SendReceive2(xid, ses, iov, 1, &resp_buftype, 0); - rsp = (struct smb2_query_info_rsp *)iov[0].iov_base; + rc = SendReceive2(xid, ses, iov, 1, &resp_buftype, flags, &rsp_iov); + cifs_small_buf_release(req); + rsp = (struct smb2_query_info_rsp *)rsp_iov.iov_base; if (rc) { cifs_stats_fail_inc(tcon, SMB2_QUERY_INFO_HE); @@ -1963,11 +2051,11 @@ static void smb2_echo_callback(struct mid_q_entry *mid) { struct TCP_Server_Info *server = mid->callback_data; - struct smb2_echo_rsp *smb2 = (struct smb2_echo_rsp *)mid->resp_buf; + struct smb2_echo_rsp *rsp = (struct 
smb2_echo_rsp *)mid->resp_buf; unsigned int credits_received = 1; if (mid->mid_state == MID_RESPONSE_RECEIVED) - credits_received = le16_to_cpu(smb2->hdr.CreditRequest); + credits_received = le16_to_cpu(rsp->hdr.sync_hdr.CreditRequest); mutex_lock(&server->srv_mutex); DeleteMidQEntry(mid); @@ -2029,9 +2117,9 @@ SMB2_echo(struct TCP_Server_Info *server) { struct smb2_echo_req *req; int rc = 0; - struct kvec iov; - struct smb_rqst rqst = { .rq_iov = &iov, - .rq_nvec = 1 }; + struct kvec iov[2]; + struct smb_rqst rqst = { .rq_iov = iov, + .rq_nvec = 2 }; cifs_dbg(FYI, "In echo request\n"); @@ -2045,14 +2133,16 @@ SMB2_echo(struct TCP_Server_Info *server) if (rc) return rc; - req->hdr.CreditRequest = cpu_to_le16(1); + req->hdr.sync_hdr.CreditRequest = cpu_to_le16(1); - iov.iov_base = (char *)req; /* 4 for rfc1002 length field */ - iov.iov_len = get_rfc1002_length(req) + 4; + iov[0].iov_len = 4; + iov[0].iov_base = (char *)req; + iov[1].iov_len = get_rfc1002_length(req); + iov[1].iov_base = (char *)req + 4; - rc = cifs_call_async(server, &rqst, NULL, smb2_echo_callback, server, - CIFS_ECHO_OP); + rc = cifs_call_async(server, &rqst, NULL, smb2_echo_callback, NULL, + server, CIFS_ECHO_OP); if (rc) cifs_dbg(FYI, "Echo request failed: %d\n", rc); @@ -2068,8 +2158,10 @@ SMB2_flush(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, struct TCP_Server_Info *server; struct cifs_ses *ses = tcon->ses; struct kvec iov[1]; + struct kvec rsp_iov; int resp_buftype; int rc = 0; + int flags = 0; cifs_dbg(FYI, "Flush\n"); @@ -2082,6 +2174,9 @@ SMB2_flush(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, if (rc) return rc; + if (encryption_required(tcon)) + flags |= CIFS_TRANSFORM_REQ; + req->PersistentFileId = persistent_fid; req->VolatileFileId = volatile_fid; @@ -2089,12 +2184,13 @@ SMB2_flush(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, /* 4 for rfc1002 length field */ iov[0].iov_len = get_rfc1002_length(req) + 4; - rc = 
SendReceive2(xid, ses, iov, 1, &resp_buftype, 0); + rc = SendReceive2(xid, ses, iov, 1, &resp_buftype, flags, &rsp_iov); + cifs_small_buf_release(req); if (rc != 0) cifs_stats_fail_inc(tcon, SMB2_FLUSH_HE); - free_rsp_buf(resp_buftype, iov[0].iov_base); + free_rsp_buf(resp_buftype, rsp_iov.iov_base); return rc; } @@ -2103,19 +2199,23 @@ SMB2_flush(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, * have the end_of_chain boolean set to true. */ static int -smb2_new_read_req(struct kvec *iov, struct cifs_io_parms *io_parms, - unsigned int remaining_bytes, int request_type) +smb2_new_read_req(void **buf, unsigned int *total_len, + struct cifs_io_parms *io_parms, unsigned int remaining_bytes, + int request_type) { int rc = -EACCES; - struct smb2_read_req *req = NULL; + struct smb2_read_plain_req *req = NULL; + struct smb2_sync_hdr *shdr; - rc = small_smb2_init(SMB2_READ, io_parms->tcon, (void **) &req); + rc = smb2_plain_req_init(SMB2_READ, io_parms->tcon, (void **) &req, + total_len); if (rc) return rc; if (io_parms->tcon->ses->server == NULL) return -ECONNABORTED; - req->hdr.ProcessId = cpu_to_le32(io_parms->pid); + shdr = &req->sync_hdr; + shdr->ProcessId = cpu_to_le32(io_parms->pid); req->PersistentFileId = io_parms->persistent_fid; req->VolatileFileId = io_parms->volatile_fid; @@ -2128,19 +2228,19 @@ smb2_new_read_req(struct kvec *iov, struct cifs_io_parms *io_parms, if (request_type & CHAINED_REQUEST) { if (!(request_type & END_OF_CHAIN)) { - /* 4 for rfc1002 length field */ - req->hdr.NextCommand = - cpu_to_le32(get_rfc1002_length(req) + 4); + /* next 8-byte aligned request */ + *total_len = DIV_ROUND_UP(*total_len, 8) * 8; + shdr->NextCommand = cpu_to_le32(*total_len); } else /* END_OF_CHAIN */ - req->hdr.NextCommand = 0; + shdr->NextCommand = 0; if (request_type & RELATED_REQUEST) { - req->hdr.Flags |= SMB2_FLAGS_RELATED_OPERATIONS; + shdr->Flags |= SMB2_FLAGS_RELATED_OPERATIONS; /* * Related requests use info from previous read request * in 
chain. */ - req->hdr.SessionId = 0xFFFFFFFF; - req->hdr.TreeId = 0xFFFFFFFF; + shdr->SessionId = 0xFFFFFFFF; + shdr->TreeId = 0xFFFFFFFF; req->PersistentFileId = 0xFFFFFFFF; req->VolatileFileId = 0xFFFFFFFF; } @@ -2150,9 +2250,7 @@ smb2_new_read_req(struct kvec *iov, struct cifs_io_parms *io_parms, else req->RemainingBytes = 0; - iov[0].iov_base = (char *)req; - /* 4 for rfc1002 length field */ - iov[0].iov_len = get_rfc1002_length(req) + 4; + *buf = req; return rc; } @@ -2162,10 +2260,11 @@ smb2_readv_callback(struct mid_q_entry *mid) struct cifs_readdata *rdata = mid->callback_data; struct cifs_tcon *tcon = tlink_tcon(rdata->cfile->tlink); struct TCP_Server_Info *server = tcon->ses->server; - struct smb2_hdr *buf = (struct smb2_hdr *)rdata->iov.iov_base; + struct smb2_sync_hdr *shdr = + (struct smb2_sync_hdr *)rdata->iov[1].iov_base; unsigned int credits_received = 1; - struct smb_rqst rqst = { .rq_iov = &rdata->iov, - .rq_nvec = 1, + struct smb_rqst rqst = { .rq_iov = rdata->iov, + .rq_nvec = 2, .rq_pages = rdata->pages, .rq_npages = rdata->nr_pages, .rq_pagesz = rdata->pagesz, @@ -2177,9 +2276,9 @@ smb2_readv_callback(struct mid_q_entry *mid) switch (mid->mid_state) { case MID_RESPONSE_RECEIVED: - credits_received = le16_to_cpu(buf->CreditRequest); + credits_received = le16_to_cpu(shdr->CreditRequest); /* result already set, check signature */ - if (server->sign) { + if (server->sign && !mid->decrypted) { int rc; rc = smb2_verify_signature(&rqst, server); @@ -2216,16 +2315,19 @@ smb2_readv_callback(struct mid_q_entry *mid) add_credits(server, credits_received, 0); } -/* smb2_async_readv - send an async write, and set up mid to handle result */ +/* smb2_async_readv - send an async read, and set up mid to handle result */ int smb2_async_readv(struct cifs_readdata *rdata) { int rc, flags = 0; - struct smb2_hdr *buf; + char *buf; + struct smb2_sync_hdr *shdr; struct cifs_io_parms io_parms; - struct smb_rqst rqst = { .rq_iov = &rdata->iov, - .rq_nvec = 1 }; + struct 
smb_rqst rqst = { .rq_iov = rdata->iov, + .rq_nvec = 2 }; struct TCP_Server_Info *server; + unsigned int total_len; + __be32 req_len; cifs_dbg(FYI, "%s: offset=%llu bytes=%u\n", __func__, rdata->offset, rdata->bytes); @@ -2239,7 +2341,7 @@ smb2_async_readv(struct cifs_readdata *rdata) server = io_parms.tcon->ses->server; - rc = smb2_new_read_req(&rdata->iov, &io_parms, 0, 0); + rc = smb2_new_read_req((void **) &buf, &total_len, &io_parms, 0, 0); if (rc) { if (rc == -EAGAIN && rdata->credits) { /* credits was reset by reconnect */ @@ -2252,26 +2354,34 @@ smb2_async_readv(struct cifs_readdata *rdata) return rc; } - buf = (struct smb2_hdr *)rdata->iov.iov_base; - /* 4 for rfc1002 length field */ - rdata->iov.iov_len = get_rfc1002_length(rdata->iov.iov_base) + 4; + if (encryption_required(io_parms.tcon)) + flags |= CIFS_TRANSFORM_REQ; + + req_len = cpu_to_be32(total_len); + + rdata->iov[0].iov_base = &req_len; + rdata->iov[0].iov_len = sizeof(__be32); + rdata->iov[1].iov_base = buf; + rdata->iov[1].iov_len = total_len; + + shdr = (struct smb2_sync_hdr *)buf; if (rdata->credits) { - buf->CreditCharge = cpu_to_le16(DIV_ROUND_UP(rdata->bytes, + shdr->CreditCharge = cpu_to_le16(DIV_ROUND_UP(rdata->bytes, SMB2_MAX_BUFFER_SIZE)); - buf->CreditRequest = buf->CreditCharge; + shdr->CreditRequest = shdr->CreditCharge; spin_lock(&server->req_lock); server->credits += rdata->credits - - le16_to_cpu(buf->CreditCharge); + le16_to_cpu(shdr->CreditCharge); spin_unlock(&server->req_lock); wake_up(&server->request_q); - flags = CIFS_HAS_CREDITS; + flags |= CIFS_HAS_CREDITS; } kref_get(&rdata->refcount); rc = cifs_call_async(io_parms.tcon->ses->server, &rqst, cifs_readv_receive, smb2_readv_callback, - rdata, flags); + smb3_handle_read_data, rdata, flags); if (rc) { kref_put(&rdata->refcount, cifs_readdata_release); cifs_stats_fail_inc(io_parms.tcon, SMB2_READ_HE); @@ -2286,21 +2396,41 @@ SMB2_read(const unsigned int xid, struct cifs_io_parms *io_parms, unsigned int *nbytes, char **buf, 
int *buf_type) { int resp_buftype, rc = -EACCES; + struct smb2_read_plain_req *req = NULL; struct smb2_read_rsp *rsp = NULL; - struct kvec iov[1]; + struct smb2_sync_hdr *shdr; + struct kvec iov[2]; + struct kvec rsp_iov; + unsigned int total_len; + __be32 req_len; + struct smb_rqst rqst = { .rq_iov = iov, + .rq_nvec = 2 }; + int flags = CIFS_LOG_ERROR; + struct cifs_ses *ses = io_parms->tcon->ses; *nbytes = 0; - rc = smb2_new_read_req(iov, io_parms, 0, 0); + rc = smb2_new_read_req((void **)&req, &total_len, io_parms, 0, 0); if (rc) return rc; - rc = SendReceive2(xid, io_parms->tcon->ses, iov, 1, - &resp_buftype, CIFS_LOG_ERROR); + if (encryption_required(io_parms->tcon)) + flags |= CIFS_TRANSFORM_REQ; - rsp = (struct smb2_read_rsp *)iov[0].iov_base; + req_len = cpu_to_be32(total_len); - if (rsp->hdr.Status == STATUS_END_OF_FILE) { - free_rsp_buf(resp_buftype, iov[0].iov_base); + iov[0].iov_base = &req_len; + iov[0].iov_len = sizeof(__be32); + iov[1].iov_base = req; + iov[1].iov_len = total_len; + + rc = cifs_send_recv(xid, ses, &rqst, &resp_buftype, flags, &rsp_iov); + cifs_small_buf_release(req); + + rsp = (struct smb2_read_rsp *)rsp_iov.iov_base; + shdr = get_sync_hdr(rsp); + + if (shdr->Status == STATUS_END_OF_FILE) { + free_rsp_buf(resp_buftype, rsp_iov.iov_base); return 0; } @@ -2319,11 +2449,10 @@ SMB2_read(const unsigned int xid, struct cifs_io_parms *io_parms, } if (*buf) { - memcpy(*buf, (char *)&rsp->hdr.ProtocolId + rsp->DataOffset, - *nbytes); - free_rsp_buf(resp_buftype, iov[0].iov_base); + memcpy(*buf, (char *)shdr + rsp->DataOffset, *nbytes); + free_rsp_buf(resp_buftype, rsp_iov.iov_base); } else if (resp_buftype != CIFS_NO_BUFFER) { - *buf = iov[0].iov_base; + *buf = rsp_iov.iov_base; if (resp_buftype == CIFS_SMALL_BUFFER) *buf_type = CIFS_SMALL_BUFFER; else if (resp_buftype == CIFS_LARGE_BUFFER) @@ -2348,7 +2477,7 @@ smb2_writev_callback(struct mid_q_entry *mid) switch (mid->mid_state) { case MID_RESPONSE_RECEIVED: - credits_received = 
le16_to_cpu(rsp->hdr.CreditRequest); + credits_received = le16_to_cpu(rsp->hdr.sync_hdr.CreditRequest); wdata->result = smb2_check_receive(mid, tcon->ses->server, 0); if (wdata->result != 0) break; @@ -2394,10 +2523,11 @@ smb2_async_writev(struct cifs_writedata *wdata, { int rc = -EACCES, flags = 0; struct smb2_write_req *req = NULL; + struct smb2_sync_hdr *shdr; struct cifs_tcon *tcon = tlink_tcon(wdata->cfile->tlink); struct TCP_Server_Info *server = tcon->ses->server; - struct kvec iov; - struct smb_rqst rqst; + struct kvec iov[2]; + struct smb_rqst rqst = { }; rc = small_smb2_init(SMB2_WRITE, tcon, (void **) &req); if (rc) { @@ -2412,7 +2542,11 @@ smb2_async_writev(struct cifs_writedata *wdata, goto async_writev_out; } - req->hdr.ProcessId = cpu_to_le32(wdata->cfile->pid); + if (encryption_required(tcon)) + flags |= CIFS_TRANSFORM_REQ; + + shdr = get_sync_hdr(req); + shdr->ProcessId = cpu_to_le32(wdata->cfile->pid); req->PersistentFileId = wdata->cfile->fid.persistent_fid; req->VolatileFileId = wdata->cfile->fid.volatile_fid; @@ -2426,11 +2560,13 @@ smb2_async_writev(struct cifs_writedata *wdata, req->RemainingBytes = 0; /* 4 for rfc1002 length field and 1 for Buffer */ - iov.iov_len = get_rfc1002_length(req) + 4 - 1; - iov.iov_base = req; + iov[0].iov_len = 4; + iov[0].iov_base = req; + iov[1].iov_len = get_rfc1002_length(req) - 1; + iov[1].iov_base = (char *)req + 4; - rqst.rq_iov = &iov; - rqst.rq_nvec = 1; + rqst.rq_iov = iov; + rqst.rq_nvec = 2; rqst.rq_pages = wdata->pages; rqst.rq_npages = wdata->nr_pages; rqst.rq_pagesz = wdata->pagesz; @@ -2444,20 +2580,20 @@ smb2_async_writev(struct cifs_writedata *wdata, inc_rfc1001_len(&req->hdr, wdata->bytes - 1 /* Buffer */); if (wdata->credits) { - req->hdr.CreditCharge = cpu_to_le16(DIV_ROUND_UP(wdata->bytes, + shdr->CreditCharge = cpu_to_le16(DIV_ROUND_UP(wdata->bytes, SMB2_MAX_BUFFER_SIZE)); - req->hdr.CreditRequest = req->hdr.CreditCharge; + shdr->CreditRequest = shdr->CreditCharge; 
spin_lock(&server->req_lock); server->credits += wdata->credits - - le16_to_cpu(req->hdr.CreditCharge); + le16_to_cpu(shdr->CreditCharge); spin_unlock(&server->req_lock); wake_up(&server->request_q); - flags = CIFS_HAS_CREDITS; + flags |= CIFS_HAS_CREDITS; } kref_get(&wdata->refcount); - rc = cifs_call_async(server, &rqst, NULL, smb2_writev_callback, wdata, - flags); + rc = cifs_call_async(server, &rqst, NULL, smb2_writev_callback, NULL, + wdata, flags); if (rc) { kref_put(&wdata->refcount, release); @@ -2483,6 +2619,9 @@ SMB2_write(const unsigned int xid, struct cifs_io_parms *io_parms, struct smb2_write_req *req = NULL; struct smb2_write_rsp *rsp = NULL; int resp_buftype; + struct kvec rsp_iov; + int flags = 0; + *nbytes = 0; if (n_vec < 1) @@ -2495,7 +2634,10 @@ SMB2_write(const unsigned int xid, struct cifs_io_parms *io_parms, if (io_parms->tcon->ses->server == NULL) return -ECONNABORTED; - req->hdr.ProcessId = cpu_to_le32(io_parms->pid); + if (encryption_required(io_parms->tcon)) + flags |= CIFS_TRANSFORM_REQ; + + req->hdr.sync_hdr.ProcessId = cpu_to_le32(io_parms->pid); req->PersistentFileId = io_parms->persistent_fid; req->VolatileFileId = io_parms->volatile_fid; @@ -2517,8 +2659,9 @@ SMB2_write(const unsigned int xid, struct cifs_io_parms *io_parms, inc_rfc1001_len(req, io_parms->length - 1 /* Buffer */); rc = SendReceive2(xid, io_parms->tcon->ses, iov, n_vec + 1, - &resp_buftype, 0); - rsp = (struct smb2_write_rsp *)iov[0].iov_base; + &resp_buftype, flags, &rsp_iov); + cifs_small_buf_release(req); + rsp = (struct smb2_write_rsp *)rsp_iov.iov_base; if (rc) { cifs_stats_fail_inc(io_parms->tcon, SMB2_WRITE_HE); @@ -2581,6 +2724,7 @@ SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon, struct smb2_query_directory_req *req; struct smb2_query_directory_rsp *rsp = NULL; struct kvec iov[2]; + struct kvec rsp_iov; int rc = 0; int len; int resp_buftype = CIFS_NO_BUFFER; @@ -2591,6 +2735,7 @@ SMB2_query_directory(const unsigned int xid, struct 
cifs_tcon *tcon, char *end_of_smb; unsigned int output_size = CIFSMaxBufSize; size_t info_buf_size; + int flags = 0; if (ses && (ses->server)) server = ses->server; @@ -2601,6 +2746,9 @@ SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon, if (rc) return rc; + if (encryption_required(tcon)) + flags |= CIFS_TRANSFORM_REQ; + switch (srch_inf->info_level) { case SMB_FIND_FILE_DIRECTORY_INFO: req->FileInformationClass = FILE_DIRECTORY_INFORMATION; @@ -2645,11 +2793,13 @@ SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon, inc_rfc1001_len(req, len - 1 /* Buffer */); - rc = SendReceive2(xid, ses, iov, 2, &resp_buftype, 0); - rsp = (struct smb2_query_directory_rsp *)iov[0].iov_base; + rc = SendReceive2(xid, ses, iov, 2, &resp_buftype, flags, &rsp_iov); + cifs_small_buf_release(req); + rsp = (struct smb2_query_directory_rsp *)rsp_iov.iov_base; if (rc) { - if (rc == -ENODATA && rsp->hdr.Status == STATUS_NO_MORE_FILES) { + if (rc == -ENODATA && + rsp->hdr.sync_hdr.Status == STATUS_NO_MORE_FILES) { srch_inf->endOfSearch = true; rc = 0; } @@ -2705,11 +2855,13 @@ send_set_info(const unsigned int xid, struct cifs_tcon *tcon, struct smb2_set_info_req *req; struct smb2_set_info_rsp *rsp = NULL; struct kvec *iov; + struct kvec rsp_iov; int rc = 0; int resp_buftype; unsigned int i; struct TCP_Server_Info *server; struct cifs_ses *ses = tcon->ses; + int flags = 0; if (ses && (ses->server)) server = ses->server; @@ -2729,7 +2881,10 @@ send_set_info(const unsigned int xid, struct cifs_tcon *tcon, return rc; } - req->hdr.ProcessId = cpu_to_le32(pid); + if (encryption_required(tcon)) + flags |= CIFS_TRANSFORM_REQ; + + req->hdr.sync_hdr.ProcessId = cpu_to_le32(pid); req->InfoType = SMB2_O_INFO_FILE; req->FileInfoClass = info_class; @@ -2756,8 +2911,9 @@ send_set_info(const unsigned int xid, struct cifs_tcon *tcon, iov[i].iov_len = size[i]; } - rc = SendReceive2(xid, ses, iov, num, &resp_buftype, 0); - rsp = (struct smb2_set_info_rsp *)iov[0].iov_base; + rc 
= SendReceive2(xid, ses, iov, num, &resp_buftype, flags, &rsp_iov); + cifs_small_buf_release(req); + rsp = (struct smb2_set_info_rsp *)rsp_iov.iov_base; if (rc != 0) cifs_stats_fail_inc(tcon, SMB2_SET_INFO_HE); @@ -2885,20 +3041,23 @@ SMB2_oplock_break(const unsigned int xid, struct cifs_tcon *tcon, { int rc; struct smb2_oplock_break *req = NULL; + int flags = CIFS_OBREAK_OP; cifs_dbg(FYI, "SMB2_oplock_break\n"); rc = small_smb2_init(SMB2_OPLOCK_BREAK, tcon, (void **) &req); - if (rc) return rc; + if (encryption_required(tcon)) + flags |= CIFS_TRANSFORM_REQ; + req->VolatileFid = volatile_fid; req->PersistentFid = persistent_fid; req->OplockLevel = oplock_level; - req->hdr.CreditRequest = cpu_to_le16(1); + req->hdr.sync_hdr.CreditRequest = cpu_to_le16(1); - rc = SendReceiveNoRsp(xid, tcon->ses, (char *) req, CIFS_OBREAK_OP); - /* SMB2 buffer freed by function above */ + rc = SendReceiveNoRsp(xid, tcon->ses, (char *) req, flags); + cifs_small_buf_release(req); if (rc) { cifs_stats_fail_inc(tcon, SMB2_OPLOCK_BREAK_HE); @@ -2958,10 +3117,12 @@ SMB2_QFS_info(const unsigned int xid, struct cifs_tcon *tcon, { struct smb2_query_info_rsp *rsp = NULL; struct kvec iov; + struct kvec rsp_iov; int rc = 0; int resp_buftype; struct cifs_ses *ses = tcon->ses; struct smb2_fs_full_size_info *info = NULL; + int flags = 0; rc = build_qfs_info_req(&iov, tcon, FS_FULL_SIZE_INFORMATION, sizeof(struct smb2_fs_full_size_info), @@ -2969,12 +3130,16 @@ SMB2_QFS_info(const unsigned int xid, struct cifs_tcon *tcon, if (rc) return rc; - rc = SendReceive2(xid, ses, &iov, 1, &resp_buftype, 0); + if (encryption_required(tcon)) + flags |= CIFS_TRANSFORM_REQ; + + rc = SendReceive2(xid, ses, &iov, 1, &resp_buftype, flags, &rsp_iov); + cifs_small_buf_release(iov.iov_base); if (rc) { cifs_stats_fail_inc(tcon, SMB2_QUERY_INFO_HE); goto qfsinf_exit; } - rsp = (struct smb2_query_info_rsp *)iov.iov_base; + rsp = (struct smb2_query_info_rsp *)rsp_iov.iov_base; info = (struct smb2_fs_full_size_info *)(4 /* 
RFC1001 len */ + le16_to_cpu(rsp->OutputBufferOffset) + (char *)&rsp->hdr); @@ -2985,7 +3150,7 @@ SMB2_QFS_info(const unsigned int xid, struct cifs_tcon *tcon, copy_fs_info_to_kstatfs(info, fsdata); qfsinf_exit: - free_rsp_buf(resp_buftype, iov.iov_base); + free_rsp_buf(resp_buftype, rsp_iov.iov_base); return rc; } @@ -2995,10 +3160,12 @@ SMB2_QFS_attr(const unsigned int xid, struct cifs_tcon *tcon, { struct smb2_query_info_rsp *rsp = NULL; struct kvec iov; + struct kvec rsp_iov; int rc = 0; int resp_buftype, max_len, min_len; struct cifs_ses *ses = tcon->ses; unsigned int rsp_len, offset; + int flags = 0; if (level == FS_DEVICE_INFORMATION) { max_len = sizeof(FILE_SYSTEM_DEVICE_INFO); @@ -3019,12 +3186,16 @@ SMB2_QFS_attr(const unsigned int xid, struct cifs_tcon *tcon, if (rc) return rc; - rc = SendReceive2(xid, ses, &iov, 1, &resp_buftype, 0); + if (encryption_required(tcon)) + flags |= CIFS_TRANSFORM_REQ; + + rc = SendReceive2(xid, ses, &iov, 1, &resp_buftype, flags, &rsp_iov); + cifs_small_buf_release(iov.iov_base); if (rc) { cifs_stats_fail_inc(tcon, SMB2_QUERY_INFO_HE); goto qfsattr_exit; } - rsp = (struct smb2_query_info_rsp *)iov.iov_base; + rsp = (struct smb2_query_info_rsp *)rsp_iov.iov_base; rsp_len = le32_to_cpu(rsp->OutputBufferLength); offset = le16_to_cpu(rsp->OutputBufferOffset); @@ -3048,7 +3219,7 @@ SMB2_QFS_attr(const unsigned int xid, struct cifs_tcon *tcon, } qfsattr_exit: - free_rsp_buf(resp_buftype, iov.iov_base); + free_rsp_buf(resp_buftype, rsp_iov.iov_base); return rc; } @@ -3060,8 +3231,10 @@ smb2_lockv(const unsigned int xid, struct cifs_tcon *tcon, int rc = 0; struct smb2_lock_req *req = NULL; struct kvec iov[2]; + struct kvec rsp_iov; int resp_buf_type; unsigned int count; + int flags = CIFS_NO_RESP; cifs_dbg(FYI, "smb2_lockv num lock %d\n", num_lock); @@ -3069,7 +3242,10 @@ smb2_lockv(const unsigned int xid, struct cifs_tcon *tcon, if (rc) return rc; - req->hdr.ProcessId = cpu_to_le32(pid); + if (encryption_required(tcon)) + flags |= 
CIFS_TRANSFORM_REQ; + + req->hdr.sync_hdr.ProcessId = cpu_to_le32(pid); req->LockCount = cpu_to_le16(num_lock); req->PersistentFileId = persist_fid; @@ -3085,7 +3261,9 @@ smb2_lockv(const unsigned int xid, struct cifs_tcon *tcon, iov[1].iov_len = count; cifs_stats_inc(&tcon->stats.cifs_stats.num_locks); - rc = SendReceive2(xid, tcon->ses, iov, 2, &resp_buf_type, CIFS_NO_RESP); + rc = SendReceive2(xid, tcon->ses, iov, 2, &resp_buf_type, flags, + &rsp_iov); + cifs_small_buf_release(req); if (rc) { cifs_dbg(FYI, "Send error in smb2_lockv = %d\n", rc); cifs_stats_fail_inc(tcon, SMB2_LOCK_HE); @@ -3117,22 +3295,25 @@ SMB2_lease_break(const unsigned int xid, struct cifs_tcon *tcon, { int rc; struct smb2_lease_ack *req = NULL; + int flags = CIFS_OBREAK_OP; cifs_dbg(FYI, "SMB2_lease_break\n"); rc = small_smb2_init(SMB2_OPLOCK_BREAK, tcon, (void **) &req); - if (rc) return rc; - req->hdr.CreditRequest = cpu_to_le16(1); + if (encryption_required(tcon)) + flags |= CIFS_TRANSFORM_REQ; + + req->hdr.sync_hdr.CreditRequest = cpu_to_le16(1); req->StructureSize = cpu_to_le16(36); inc_rfc1001_len(req, 12); memcpy(req->LeaseKey, lease_key, 16); req->LeaseState = lease_state; - rc = SendReceiveNoRsp(xid, tcon->ses, (char *) req, CIFS_OBREAK_OP); - /* SMB2 buffer freed by function above */ + rc = SendReceiveNoRsp(xid, tcon->ses, (char *) req, flags); + cifs_small_buf_release(req); if (rc) { cifs_stats_fail_inc(tcon, SMB2_OPLOCK_BREAK_HE); diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h index dc0d141f33e2..c03b252501a1 100644 --- a/fs/cifs/smb2pdu.h +++ b/fs/cifs/smb2pdu.h @@ -101,10 +101,7 @@ #define SMB2_HEADER_STRUCTURE_SIZE cpu_to_le16(64) -struct smb2_hdr { - __be32 smb2_buf_length; /* big endian on wire */ - /* length is only two or three bytes - with - one or two byte type preceding it that MBZ */ +struct smb2_sync_hdr { __le32 ProtocolId; /* 0xFE 'S' 'M' 'B' */ __le16 StructureSize; /* 64 */ __le16 CreditCharge; /* MBZ */ @@ -120,16 +117,31 @@ struct smb2_hdr { __u8 
Signature[16]; } __packed; +struct smb2_sync_pdu { + struct smb2_sync_hdr sync_hdr; + __le16 StructureSize2; /* size of wct area (varies, request specific) */ +} __packed; + +struct smb2_hdr { + __be32 smb2_buf_length; /* big endian on wire */ + /* length is only two or three bytes - with */ + /* one or two byte type preceding it that MBZ */ + struct smb2_sync_hdr sync_hdr; +} __packed; + struct smb2_pdu { struct smb2_hdr hdr; __le16 StructureSize2; /* size of wct area (varies, request specific) */ } __packed; +#define SMB3_AES128CMM_NONCE 11 +#define SMB3_AES128GCM_NONCE 12 + struct smb2_transform_hdr { __be32 smb2_buf_length; /* big endian on wire */ /* length is only two or three bytes - with one or two byte type preceding it that MBZ */ - __u8 ProtocolId[4]; /* 0xFD 'S' 'M' 'B' */ + __le32 ProtocolId; /* 0xFD 'S' 'M' 'B' */ __u8 Signature[16]; __u8 Nonce[16]; __le32 OriginalMessageSize; @@ -814,8 +826,9 @@ struct smb2_flush_rsp { #define SMB2_CHANNEL_RDMA_V1 0x00000001 /* SMB3 or later */ #define SMB2_CHANNEL_RDMA_V1_INVALIDATE 0x00000001 /* SMB3.02 or later */ -struct smb2_read_req { - struct smb2_hdr hdr; +/* SMB2 read request without RFC1001 length at the beginning */ +struct smb2_read_plain_req { + struct smb2_sync_hdr sync_hdr; __le16 StructureSize; /* Must be 49 */ __u8 Padding; /* offset from start of SMB2 header to place read */ __u8 Flags; /* MBZ unless SMB3.02 or later */ diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h index f2d511a6971b..85fc7a789334 100644 --- a/fs/cifs/smb2proto.h +++ b/fs/cifs/smb2proto.h @@ -56,6 +56,10 @@ extern void smb2_echo_request(struct work_struct *work); extern __le32 smb2_get_lease_state(struct cifsInodeInfo *cinode); extern bool smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *srv); +extern struct cifs_ses *smb2_find_smb_ses(struct TCP_Server_Info *server, + __u64 ses_id); +extern int smb3_handle_read_data(struct TCP_Server_Info *server, + struct mid_q_entry *mid); extern void 
move_smb2_info_to_cifs(FILE_ALL_INFO *dst, struct smb2_file_all_info *src); @@ -97,6 +101,7 @@ extern int smb2_unlock_range(struct cifsFileInfo *cfile, struct file_lock *flock, const unsigned int xid); extern int smb2_push_mandatory_locks(struct cifsFileInfo *cfile); extern void smb2_reconnect_server(struct work_struct *work); +extern int smb3_crypto_aead_allocate(struct TCP_Server_Info *server); /* * SMB2 Worker functions - most of protocol specific implementation details diff --git a/fs/cifs/smb2transport.c b/fs/cifs/smb2transport.c index bc9a7b634643..7c3bb1bd7eed 100644 --- a/fs/cifs/smb2transport.c +++ b/fs/cifs/smb2transport.c @@ -31,6 +31,7 @@ #include <asm/processor.h> #include <linux/mempool.h> #include <linux/highmem.h> +#include <crypto/aead.h> #include "smb2pdu.h" #include "cifsglob.h" #include "cifsproto.h" @@ -114,14 +115,14 @@ smb3_crypto_shash_allocate(struct TCP_Server_Info *server) return 0; } -static struct cifs_ses * -smb2_find_smb_ses(struct smb2_hdr *smb2hdr, struct TCP_Server_Info *server) +struct cifs_ses * +smb2_find_smb_ses(struct TCP_Server_Info *server, __u64 ses_id) { struct cifs_ses *ses; spin_lock(&cifs_tcp_ses_lock); list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) { - if (ses->Suid != smb2hdr->SessionId) + if (ses->Suid != ses_id) continue; spin_unlock(&cifs_tcp_ses_lock); return ses; @@ -131,7 +132,6 @@ smb2_find_smb_ses(struct smb2_hdr *smb2hdr, struct TCP_Server_Info *server) return NULL; } - int smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server) { @@ -139,17 +139,17 @@ smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server) unsigned char smb2_signature[SMB2_HMACSHA256_SIZE]; unsigned char *sigptr = smb2_signature; struct kvec *iov = rqst->rq_iov; - struct smb2_hdr *smb2_pdu = (struct smb2_hdr *)iov[0].iov_base; + struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)iov[1].iov_base; struct cifs_ses *ses; - ses = smb2_find_smb_ses(smb2_pdu, server); + ses = 
smb2_find_smb_ses(server, shdr->SessionId); if (!ses) { cifs_dbg(VFS, "%s: Could not find session\n", __func__); return 0; } memset(smb2_signature, 0x0, SMB2_HMACSHA256_SIZE); - memset(smb2_pdu->Signature, 0x0, SMB2_SIGNATURE_SIZE); + memset(shdr->Signature, 0x0, SMB2_SIGNATURE_SIZE); rc = smb2_crypto_shash_allocate(server); if (rc) { @@ -174,7 +174,7 @@ smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server) &server->secmech.sdeschmacsha256->shash); if (!rc) - memcpy(smb2_pdu->Signature, sigptr, SMB2_SIGNATURE_SIZE); + memcpy(shdr->Signature, sigptr, SMB2_SIGNATURE_SIZE); return rc; } @@ -356,17 +356,17 @@ smb3_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server) unsigned char smb3_signature[SMB2_CMACAES_SIZE]; unsigned char *sigptr = smb3_signature; struct kvec *iov = rqst->rq_iov; - struct smb2_hdr *smb2_pdu = (struct smb2_hdr *)iov[0].iov_base; + struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)iov[1].iov_base; struct cifs_ses *ses; - ses = smb2_find_smb_ses(smb2_pdu, server); + ses = smb2_find_smb_ses(server, shdr->SessionId); if (!ses) { cifs_dbg(VFS, "%s: Could not find session\n", __func__); return 0; } memset(smb3_signature, 0x0, SMB2_CMACAES_SIZE); - memset(smb2_pdu->Signature, 0x0, SMB2_SIGNATURE_SIZE); + memset(shdr->Signature, 0x0, SMB2_SIGNATURE_SIZE); rc = crypto_shash_setkey(server->secmech.cmacaes, ses->smb3signingkey, SMB2_CMACAES_SIZE); @@ -391,7 +391,7 @@ smb3_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server) &server->secmech.sdesccmacaes->shash); if (!rc) - memcpy(smb2_pdu->Signature, sigptr, SMB2_SIGNATURE_SIZE); + memcpy(shdr->Signature, sigptr, SMB2_SIGNATURE_SIZE); return rc; } @@ -401,14 +401,15 @@ static int smb2_sign_rqst(struct smb_rqst *rqst, struct TCP_Server_Info *server) { int rc = 0; - struct smb2_hdr *smb2_pdu = rqst->rq_iov[0].iov_base; + struct smb2_sync_hdr *shdr = + (struct smb2_sync_hdr *)rqst->rq_iov[1].iov_base; - if (!(smb2_pdu->Flags & SMB2_FLAGS_SIGNED) || + if 
(!(shdr->Flags & SMB2_FLAGS_SIGNED) || server->tcpStatus == CifsNeedNegotiate) return rc; if (!server->session_estab) { - strncpy(smb2_pdu->Signature, "BSRSPYL", 8); + strncpy(shdr->Signature, "BSRSPYL", 8); return rc; } @@ -422,11 +423,12 @@ smb2_verify_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server) { unsigned int rc; char server_response_sig[16]; - struct smb2_hdr *smb2_pdu = (struct smb2_hdr *)rqst->rq_iov[0].iov_base; + struct smb2_sync_hdr *shdr = + (struct smb2_sync_hdr *)rqst->rq_iov[1].iov_base; - if ((smb2_pdu->Command == SMB2_NEGOTIATE) || - (smb2_pdu->Command == SMB2_SESSION_SETUP) || - (smb2_pdu->Command == SMB2_OPLOCK_BREAK) || + if ((shdr->Command == SMB2_NEGOTIATE) || + (shdr->Command == SMB2_SESSION_SETUP) || + (shdr->Command == SMB2_OPLOCK_BREAK) || (!server->session_estab)) return 0; @@ -436,17 +438,17 @@ smb2_verify_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server) */ /* Do not need to verify session setups with signature "BSRSPYL " */ - if (memcmp(smb2_pdu->Signature, "BSRSPYL ", 8) == 0) + if (memcmp(shdr->Signature, "BSRSPYL ", 8) == 0) cifs_dbg(FYI, "dummy signature received for smb command 0x%x\n", - smb2_pdu->Command); + shdr->Command); /* * Save off the origiginal signature so we can modify the smb and check * our calculated signature against what the server sent. 
*/ - memcpy(server_response_sig, smb2_pdu->Signature, SMB2_SIGNATURE_SIZE); + memcpy(server_response_sig, shdr->Signature, SMB2_SIGNATURE_SIZE); - memset(smb2_pdu->Signature, 0, SMB2_SIGNATURE_SIZE); + memset(shdr->Signature, 0, SMB2_SIGNATURE_SIZE); mutex_lock(&server->srv_mutex); rc = server->ops->calc_signature(rqst, server); @@ -455,8 +457,7 @@ smb2_verify_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server) if (rc) return rc; - if (memcmp(server_response_sig, smb2_pdu->Signature, - SMB2_SIGNATURE_SIZE)) + if (memcmp(server_response_sig, shdr->Signature, SMB2_SIGNATURE_SIZE)) return -EACCES; else return 0; @@ -467,18 +468,19 @@ smb2_verify_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server) * and when srv_mutex is held. */ static inline void -smb2_seq_num_into_buf(struct TCP_Server_Info *server, struct smb2_hdr *hdr) +smb2_seq_num_into_buf(struct TCP_Server_Info *server, + struct smb2_sync_hdr *shdr) { - unsigned int i, num = le16_to_cpu(hdr->CreditCharge); + unsigned int i, num = le16_to_cpu(shdr->CreditCharge); - hdr->MessageId = get_next_mid64(server); + shdr->MessageId = get_next_mid64(server); /* skip message numbers according to CreditCharge field */ for (i = 1; i < num; i++) get_next_mid(server); } static struct mid_q_entry * -smb2_mid_entry_alloc(const struct smb2_hdr *smb_buffer, +smb2_mid_entry_alloc(const struct smb2_sync_hdr *shdr, struct TCP_Server_Info *server) { struct mid_q_entry *temp; @@ -493,9 +495,9 @@ smb2_mid_entry_alloc(const struct smb2_hdr *smb_buffer, return temp; else { memset(temp, 0, sizeof(struct mid_q_entry)); - temp->mid = le64_to_cpu(smb_buffer->MessageId); + temp->mid = le64_to_cpu(shdr->MessageId); temp->pid = current->pid; - temp->command = smb_buffer->Command; /* Always LE */ + temp->command = shdr->Command; /* Always LE */ temp->when_alloc = jiffies; temp->server = server; @@ -513,7 +515,7 @@ smb2_mid_entry_alloc(const struct smb2_hdr *smb_buffer, } static int -smb2_get_mid_entry(struct cifs_ses 
*ses, struct smb2_hdr *buf, +smb2_get_mid_entry(struct cifs_ses *ses, struct smb2_sync_hdr *shdr, struct mid_q_entry **mid) { if (ses->server->tcpStatus == CifsExiting) @@ -525,19 +527,19 @@ smb2_get_mid_entry(struct cifs_ses *ses, struct smb2_hdr *buf, } if (ses->status == CifsNew) { - if ((buf->Command != SMB2_SESSION_SETUP) && - (buf->Command != SMB2_NEGOTIATE)) + if ((shdr->Command != SMB2_SESSION_SETUP) && + (shdr->Command != SMB2_NEGOTIATE)) return -EAGAIN; /* else ok - we are setting up session */ } if (ses->status == CifsExiting) { - if (buf->Command != SMB2_LOGOFF) + if (shdr->Command != SMB2_LOGOFF) return -EAGAIN; /* else ok - we are shutting down the session */ } - *mid = smb2_mid_entry_alloc(buf, ses->server); + *mid = smb2_mid_entry_alloc(shdr, ses->server); if (*mid == NULL) return -ENOMEM; spin_lock(&GlobalMid_Lock); @@ -551,16 +553,18 @@ smb2_check_receive(struct mid_q_entry *mid, struct TCP_Server_Info *server, bool log_error) { unsigned int len = get_rfc1002_length(mid->resp_buf); - struct kvec iov; - struct smb_rqst rqst = { .rq_iov = &iov, - .rq_nvec = 1 }; + struct kvec iov[2]; + struct smb_rqst rqst = { .rq_iov = iov, + .rq_nvec = 2 }; - iov.iov_base = (char *)mid->resp_buf; - iov.iov_len = get_rfc1002_length(mid->resp_buf) + 4; + iov[0].iov_base = (char *)mid->resp_buf; + iov[0].iov_len = 4; + iov[1].iov_base = (char *)mid->resp_buf + 4; + iov[1].iov_len = len; dump_smb(mid->resp_buf, min_t(u32, 80, len)); /* convert the length into a more usable form */ - if (len > 24 && server->sign) { + if (len > 24 && server->sign && !mid->decrypted) { int rc; rc = smb2_verify_signature(&rqst, server); @@ -576,12 +580,13 @@ struct mid_q_entry * smb2_setup_request(struct cifs_ses *ses, struct smb_rqst *rqst) { int rc; - struct smb2_hdr *hdr = (struct smb2_hdr *)rqst->rq_iov[0].iov_base; + struct smb2_sync_hdr *shdr = + (struct smb2_sync_hdr *)rqst->rq_iov[1].iov_base; struct mid_q_entry *mid; - smb2_seq_num_into_buf(ses->server, hdr); + 
smb2_seq_num_into_buf(ses->server, shdr); - rc = smb2_get_mid_entry(ses, hdr, &mid); + rc = smb2_get_mid_entry(ses, shdr, &mid); if (rc) return ERR_PTR(rc); rc = smb2_sign_rqst(rqst, ses->server); @@ -596,12 +601,13 @@ struct mid_q_entry * smb2_setup_async_request(struct TCP_Server_Info *server, struct smb_rqst *rqst) { int rc; - struct smb2_hdr *hdr = (struct smb2_hdr *)rqst->rq_iov[0].iov_base; + struct smb2_sync_hdr *shdr = + (struct smb2_sync_hdr *)rqst->rq_iov[1].iov_base; struct mid_q_entry *mid; - smb2_seq_num_into_buf(server, hdr); + smb2_seq_num_into_buf(server, shdr); - mid = smb2_mid_entry_alloc(hdr, server); + mid = smb2_mid_entry_alloc(shdr, server); if (mid == NULL) return ERR_PTR(-ENOMEM); @@ -613,3 +619,33 @@ smb2_setup_async_request(struct TCP_Server_Info *server, struct smb_rqst *rqst) return mid; } + +int +smb3_crypto_aead_allocate(struct TCP_Server_Info *server) +{ + struct crypto_aead *tfm; + + if (!server->secmech.ccmaesencrypt) { + tfm = crypto_alloc_aead("ccm(aes)", 0, 0); + if (IS_ERR(tfm)) { + cifs_dbg(VFS, "%s: Failed to alloc encrypt aead\n", + __func__); + return PTR_ERR(tfm); + } + server->secmech.ccmaesencrypt = tfm; + } + + if (!server->secmech.ccmaesdecrypt) { + tfm = crypto_alloc_aead("ccm(aes)", 0, 0); + if (IS_ERR(tfm)) { + crypto_free_aead(server->secmech.ccmaesencrypt); + server->secmech.ccmaesencrypt = NULL; + cifs_dbg(VFS, "%s: Failed to alloc decrypt aead\n", + __func__); + return PTR_ERR(tfm); + } + server->secmech.ccmaesdecrypt = tfm; + } + + return 0; +} diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index fbb84c08e3cd..526f0533cb4e 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -221,7 +221,7 @@ rqst_len(struct smb_rqst *rqst) } static int -smb_send_rqst(struct TCP_Server_Info *server, struct smb_rqst *rqst) +__smb_send_rqst(struct TCP_Server_Info *server, struct smb_rqst *rqst) { int rc; struct kvec *iov = rqst->rq_iov; @@ -245,8 +245,12 @@ smb_send_rqst(struct TCP_Server_Info *server, struct 
smb_rqst *rqst) return -EIO; } + if (n_vec < 2) + return -EIO; + cifs_dbg(FYI, "Sending smb: smb_len=%u\n", smb_buf_length); dump_smb(iov[0].iov_base, iov[0].iov_len); + dump_smb(iov[1].iov_base, iov[1].iov_len); /* cork the socket */ kernel_setsockopt(ssocket, SOL_TCP, TCP_CORK, @@ -309,24 +313,43 @@ uncork: } static int -smb_sendv(struct TCP_Server_Info *server, struct kvec *iov, int n_vec) +smb_send_rqst(struct TCP_Server_Info *server, struct smb_rqst *rqst, int flags) { - struct smb_rqst rqst = { .rq_iov = iov, - .rq_nvec = n_vec }; + struct smb_rqst cur_rqst; + int rc; + + if (!(flags & CIFS_TRANSFORM_REQ)) + return __smb_send_rqst(server, rqst); + + if (!server->ops->init_transform_rq || + !server->ops->free_transform_rq) { + cifs_dbg(VFS, "Encryption requested but transform callbacks are missed\n"); + return -EIO; + } + + rc = server->ops->init_transform_rq(server, &cur_rqst, rqst); + if (rc) + return rc; - return smb_send_rqst(server, &rqst); + rc = __smb_send_rqst(server, &cur_rqst); + server->ops->free_transform_rq(&cur_rqst); + return rc; } int smb_send(struct TCP_Server_Info *server, struct smb_hdr *smb_buffer, unsigned int smb_buf_length) { - struct kvec iov; + struct kvec iov[2]; + struct smb_rqst rqst = { .rq_iov = iov, + .rq_nvec = 2 }; - iov.iov_base = smb_buffer; - iov.iov_len = smb_buf_length + 4; + iov[0].iov_base = smb_buffer; + iov[0].iov_len = 4; + iov[1].iov_base = (char *)smb_buffer + 4; + iov[1].iov_len = smb_buf_length; - return smb_sendv(server, &iov, 1); + return __smb_send_rqst(server, &rqst); } static int @@ -454,6 +477,10 @@ cifs_setup_async_request(struct TCP_Server_Info *server, struct smb_rqst *rqst) struct smb_hdr *hdr = (struct smb_hdr *)rqst->rq_iov[0].iov_base; struct mid_q_entry *mid; + if (rqst->rq_iov[0].iov_len != 4 || + rqst->rq_iov[0].iov_base + 4 != rqst->rq_iov[1].iov_base) + return ERR_PTR(-EIO); + /* enable signing if server requires it */ if (server->sign) hdr->Flags2 |= SMBFLG2_SECURITY_SIGNATURE; @@ -478,7 +505,7 
@@ cifs_setup_async_request(struct TCP_Server_Info *server, struct smb_rqst *rqst) int cifs_call_async(struct TCP_Server_Info *server, struct smb_rqst *rqst, mid_receive_t *receive, mid_callback_t *callback, - void *cbdata, const int flags) + mid_handle_t *handle, void *cbdata, const int flags) { int rc, timeout, optype; struct mid_q_entry *mid; @@ -505,6 +532,7 @@ cifs_call_async(struct TCP_Server_Info *server, struct smb_rqst *rqst, mid->receive = receive; mid->callback = callback; mid->callback_data = cbdata; + mid->handle = handle; mid->mid_state = MID_REQUEST_SUBMITTED; /* put it on the pending_mid_q */ @@ -514,7 +542,7 @@ cifs_call_async(struct TCP_Server_Info *server, struct smb_rqst *rqst, cifs_in_send_inc(server); - rc = smb_send_rqst(server, rqst); + rc = smb_send_rqst(server, rqst, flags); cifs_in_send_dec(server); cifs_save_when_sent(mid); @@ -547,12 +575,13 @@ SendReceiveNoRsp(const unsigned int xid, struct cifs_ses *ses, { int rc; struct kvec iov[1]; + struct kvec rsp_iov; int resp_buf_type; iov[0].iov_base = in_buf; iov[0].iov_len = get_rfc1002_length(in_buf) + 4; flags |= CIFS_NO_RESP; - rc = SendReceive2(xid, ses, iov, 1, &resp_buf_type, flags); + rc = SendReceive2(xid, ses, iov, 1, &resp_buf_type, flags, &rsp_iov); cifs_dbg(NOISY, "SendRcvNoRsp flags %d rc %d\n", flags, rc); return rc; @@ -595,10 +624,11 @@ cifs_sync_mid_result(struct mid_q_entry *mid, struct TCP_Server_Info *server) } static inline int -send_cancel(struct TCP_Server_Info *server, void *buf, struct mid_q_entry *mid) +send_cancel(struct TCP_Server_Info *server, struct smb_rqst *rqst, + struct mid_q_entry *mid) { return server->ops->send_cancel ? 
- server->ops->send_cancel(server, buf, mid) : 0; + server->ops->send_cancel(server, rqst, mid) : 0; } int @@ -611,13 +641,15 @@ cifs_check_receive(struct mid_q_entry *mid, struct TCP_Server_Info *server, /* convert the length into a more usable form */ if (server->sign) { - struct kvec iov; + struct kvec iov[2]; int rc = 0; - struct smb_rqst rqst = { .rq_iov = &iov, - .rq_nvec = 1 }; + struct smb_rqst rqst = { .rq_iov = iov, + .rq_nvec = 2 }; - iov.iov_base = mid->resp_buf; - iov.iov_len = len; + iov[0].iov_base = mid->resp_buf; + iov[0].iov_len = 4; + iov[1].iov_base = (char *)mid->resp_buf + 4; + iov[1].iov_len = len - 4; /* FIXME: add code to kill session */ rc = cifs_verify_signature(&rqst, server, mid->sequence_number); @@ -637,6 +669,10 @@ cifs_setup_request(struct cifs_ses *ses, struct smb_rqst *rqst) struct smb_hdr *hdr = (struct smb_hdr *)rqst->rq_iov[0].iov_base; struct mid_q_entry *mid; + if (rqst->rq_iov[0].iov_len != 4 || + rqst->rq_iov[0].iov_base + 4 != rqst->rq_iov[1].iov_base) + return ERR_PTR(-EIO); + rc = allocate_mid(ses, hdr, &mid); if (rc) return ERR_PTR(rc); @@ -649,17 +685,15 @@ cifs_setup_request(struct cifs_ses *ses, struct smb_rqst *rqst) } int -SendReceive2(const unsigned int xid, struct cifs_ses *ses, - struct kvec *iov, int n_vec, int *resp_buf_type /* ret */, - const int flags) +cifs_send_recv(const unsigned int xid, struct cifs_ses *ses, + struct smb_rqst *rqst, int *resp_buf_type, const int flags, + struct kvec *resp_iov) { int rc = 0; int timeout, optype; struct mid_q_entry *midQ; - char *buf = iov[0].iov_base; unsigned int credits = 1; - struct smb_rqst rqst = { .rq_iov = iov, - .rq_nvec = n_vec }; + char *buf; timeout = flags & CIFS_TIMEOUT_MASK; optype = flags & CIFS_OP_MASK; @@ -667,15 +701,12 @@ SendReceive2(const unsigned int xid, struct cifs_ses *ses, *resp_buf_type = CIFS_NO_BUFFER; /* no response buf yet */ if ((ses == NULL) || (ses->server == NULL)) { - cifs_small_buf_release(buf); cifs_dbg(VFS, "Null session\n"); return 
-EIO; } - if (ses->server->tcpStatus == CifsExiting) { - cifs_small_buf_release(buf); + if (ses->server->tcpStatus == CifsExiting) return -ENOENT; - } /* * Ensure that we do not send more than 50 overlapping requests @@ -684,10 +715,8 @@ SendReceive2(const unsigned int xid, struct cifs_ses *ses, */ rc = wait_for_free_request(ses->server, timeout, optype); - if (rc) { - cifs_small_buf_release(buf); + if (rc) return rc; - } /* * Make sure that we sign in the same order that we send on this socket @@ -697,10 +726,9 @@ SendReceive2(const unsigned int xid, struct cifs_ses *ses, mutex_lock(&ses->server->srv_mutex); - midQ = ses->server->ops->setup_request(ses, &rqst); + midQ = ses->server->ops->setup_request(ses, rqst); if (IS_ERR(midQ)) { mutex_unlock(&ses->server->srv_mutex); - cifs_small_buf_release(buf); /* Update # of requests on wire to server */ add_credits(ses->server, 1, optype); return PTR_ERR(midQ); @@ -708,7 +736,7 @@ SendReceive2(const unsigned int xid, struct cifs_ses *ses, midQ->mid_state = MID_REQUEST_SUBMITTED; cifs_in_send_inc(ses->server); - rc = smb_sendv(ses->server, iov, n_vec); + rc = smb_send_rqst(ses->server, rqst, flags); cifs_in_send_dec(ses->server); cifs_save_when_sent(midQ); @@ -716,32 +744,25 @@ SendReceive2(const unsigned int xid, struct cifs_ses *ses, ses->server->sequence_number -= 2; mutex_unlock(&ses->server->srv_mutex); - if (rc < 0) { - cifs_small_buf_release(buf); + if (rc < 0) goto out; - } - if (timeout == CIFS_ASYNC_OP) { - cifs_small_buf_release(buf); + if (timeout == CIFS_ASYNC_OP) goto out; - } rc = wait_for_response(ses->server, midQ); if (rc != 0) { - send_cancel(ses->server, buf, midQ); + send_cancel(ses->server, rqst, midQ); spin_lock(&GlobalMid_Lock); if (midQ->mid_state == MID_REQUEST_SUBMITTED) { midQ->callback = DeleteMidQEntry; spin_unlock(&GlobalMid_Lock); - cifs_small_buf_release(buf); add_credits(ses->server, 1, optype); return rc; } spin_unlock(&GlobalMid_Lock); } - cifs_small_buf_release(buf); - rc = 
cifs_sync_mid_result(midQ, ses->server); if (rc != 0) { add_credits(ses->server, 1, optype); @@ -755,8 +776,8 @@ SendReceive2(const unsigned int xid, struct cifs_ses *ses, } buf = (char *)midQ->resp_buf; - iov[0].iov_base = buf; - iov[0].iov_len = get_rfc1002_length(buf) + 4; + resp_iov->iov_base = buf; + resp_iov->iov_len = get_rfc1002_length(buf) + 4; if (midQ->large_buf) *resp_buf_type = CIFS_LARGE_BUFFER; else @@ -778,12 +799,45 @@ out: } int +SendReceive2(const unsigned int xid, struct cifs_ses *ses, + struct kvec *iov, int n_vec, int *resp_buf_type /* ret */, + const int flags, struct kvec *resp_iov) +{ + struct smb_rqst rqst; + struct kvec *new_iov; + int rc; + + new_iov = kmalloc(sizeof(struct kvec) * (n_vec + 1), GFP_KERNEL); + if (!new_iov) + return -ENOMEM; + + /* 1st iov is a RFC1001 length followed by the rest of the packet */ + memcpy(new_iov + 1, iov, (sizeof(struct kvec) * n_vec)); + + new_iov[0].iov_base = new_iov[1].iov_base; + new_iov[0].iov_len = 4; + new_iov[1].iov_base += 4; + new_iov[1].iov_len -= 4; + + memset(&rqst, 0, sizeof(struct smb_rqst)); + rqst.rq_iov = new_iov; + rqst.rq_nvec = n_vec + 1; + + rc = cifs_send_recv(xid, ses, &rqst, resp_buf_type, flags, resp_iov); + kfree(new_iov); + return rc; +} + +int SendReceive(const unsigned int xid, struct cifs_ses *ses, struct smb_hdr *in_buf, struct smb_hdr *out_buf, int *pbytes_returned, const int timeout) { int rc = 0; struct mid_q_entry *midQ; + unsigned int len = be32_to_cpu(in_buf->smb_buf_length); + struct kvec iov = { .iov_base = in_buf, .iov_len = len }; + struct smb_rqst rqst = { .rq_iov = &iov, .rq_nvec = 1 }; if (ses == NULL) { cifs_dbg(VFS, "Null smb session\n"); @@ -801,10 +855,9 @@ SendReceive(const unsigned int xid, struct cifs_ses *ses, to the same server. 
We may make this configurable later or use ses->maxReq */ - if (be32_to_cpu(in_buf->smb_buf_length) > CIFSMaxBufSize + - MAX_CIFS_HDR_SIZE - 4) { + if (len > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) { cifs_dbg(VFS, "Illegal length, greater than maximum frame, %d\n", - be32_to_cpu(in_buf->smb_buf_length)); + len); return -EIO; } @@ -835,7 +888,7 @@ SendReceive(const unsigned int xid, struct cifs_ses *ses, midQ->mid_state = MID_REQUEST_SUBMITTED; cifs_in_send_inc(ses->server); - rc = smb_send(ses->server, in_buf, be32_to_cpu(in_buf->smb_buf_length)); + rc = smb_send(ses->server, in_buf, len); cifs_in_send_dec(ses->server); cifs_save_when_sent(midQ); @@ -852,7 +905,7 @@ SendReceive(const unsigned int xid, struct cifs_ses *ses, rc = wait_for_response(ses->server, midQ); if (rc != 0) { - send_cancel(ses->server, in_buf, midQ); + send_cancel(ses->server, &rqst, midQ); spin_lock(&GlobalMid_Lock); if (midQ->mid_state == MID_REQUEST_SUBMITTED) { /* no longer considered to be "in-flight" */ @@ -921,6 +974,9 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifs_tcon *tcon, int rstart = 0; struct mid_q_entry *midQ; struct cifs_ses *ses; + unsigned int len = be32_to_cpu(in_buf->smb_buf_length); + struct kvec iov = { .iov_base = in_buf, .iov_len = len }; + struct smb_rqst rqst = { .rq_iov = &iov, .rq_nvec = 1 }; if (tcon == NULL || tcon->ses == NULL) { cifs_dbg(VFS, "Null smb session\n"); @@ -940,10 +996,9 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifs_tcon *tcon, to the same server. 
We may make this configurable later or use ses->maxReq */ - if (be32_to_cpu(in_buf->smb_buf_length) > CIFSMaxBufSize + - MAX_CIFS_HDR_SIZE - 4) { + if (len > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) { cifs_dbg(VFS, "Illegal length, greater than maximum frame, %d\n", - be32_to_cpu(in_buf->smb_buf_length)); + len); return -EIO; } @@ -972,7 +1027,7 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifs_tcon *tcon, midQ->mid_state = MID_REQUEST_SUBMITTED; cifs_in_send_inc(ses->server); - rc = smb_send(ses->server, in_buf, be32_to_cpu(in_buf->smb_buf_length)); + rc = smb_send(ses->server, in_buf, len); cifs_in_send_dec(ses->server); cifs_save_when_sent(midQ); @@ -1001,7 +1056,7 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifs_tcon *tcon, if (in_buf->Command == SMB_COM_TRANSACTION2) { /* POSIX lock. We send a NT_CANCEL SMB to cause the blocking lock to return. */ - rc = send_cancel(ses->server, in_buf, midQ); + rc = send_cancel(ses->server, &rqst, midQ); if (rc) { cifs_delete_mid(midQ); return rc; @@ -1022,7 +1077,7 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifs_tcon *tcon, rc = wait_for_response(ses->server, midQ); if (rc) { - send_cancel(ses->server, in_buf, midQ); + send_cancel(ses->server, &rqst, midQ); spin_lock(&GlobalMid_Lock); if (midQ->mid_state == MID_REQUEST_SUBMITTED) { /* no longer considered to be "in-flight" */ diff --git a/fs/compat_binfmt_elf.c b/fs/compat_binfmt_elf.c index 4d24d17bcfc1..504b3c3539dc 100644 --- a/fs/compat_binfmt_elf.c +++ b/fs/compat_binfmt_elf.c @@ -51,22 +51,8 @@ #define elf_prstatus compat_elf_prstatus #define elf_prpsinfo compat_elf_prpsinfo -/* - * Compat version of cputime_to_compat_timeval, perhaps this - * should be an inline in <linux/compat.h>. 
- */ -static void cputime_to_compat_timeval(const cputime_t cputime, - struct compat_timeval *value) -{ - struct timeval tv; - cputime_to_timeval(cputime, &tv); - value->tv_sec = tv.tv_sec; - value->tv_usec = tv.tv_usec; -} - -#undef cputime_to_timeval -#define cputime_to_timeval cputime_to_compat_timeval - +#undef ns_to_timeval +#define ns_to_timeval ns_to_compat_timeval /* * To use this file, asm/elf.h must define compat_elf_check_arch. diff --git a/fs/coredump.c b/fs/coredump.c index e525b6017cdf..ae6b05629ca1 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -833,3 +833,21 @@ int dump_align(struct coredump_params *cprm, int align) return mod ? dump_skip(cprm, align - mod) : 1; } EXPORT_SYMBOL(dump_align); + +/* + * Ensures that file size is big enough to contain the current file + * postion. This prevents gdb from complaining about a truncated file + * if the last "write" to the file was dump_skip. + */ +void dump_truncate(struct coredump_params *cprm) +{ + struct file *file = cprm->file; + loff_t offset; + + if (file->f_op->llseek && file->f_op->llseek != no_llseek) { + offset = file->f_op->llseek(file, 0, SEEK_CUR); + if (i_size_read(file->f_mapping->host) < offset) + do_truncate(file->f_path.dentry, offset, 0, file); + } +} +EXPORT_SYMBOL(dump_truncate); diff --git a/fs/crypto/Kconfig b/fs/crypto/Kconfig index f514978f6688..08b46e6e3995 100644 --- a/fs/crypto/Kconfig +++ b/fs/crypto/Kconfig @@ -1,6 +1,5 @@ config FS_ENCRYPTION tristate "FS Encryption (Per-file encryption)" - depends on BLOCK select CRYPTO select CRYPTO_AES select CRYPTO_CBC diff --git a/fs/crypto/Makefile b/fs/crypto/Makefile index f17684c48739..9f6607f17b53 100644 --- a/fs/crypto/Makefile +++ b/fs/crypto/Makefile @@ -1,3 +1,4 @@ obj-$(CONFIG_FS_ENCRYPTION) += fscrypto.o fscrypto-y := crypto.o fname.o policy.o keyinfo.o +fscrypto-$(CONFIG_BLOCK) += bio.o diff --git a/fs/crypto/bio.c b/fs/crypto/bio.c new file mode 100644 index 000000000000..a409a84f1bca --- /dev/null +++ b/fs/crypto/bio.c @@ 
-0,0 +1,145 @@ +/* + * This contains encryption functions for per-file encryption. + * + * Copyright (C) 2015, Google, Inc. + * Copyright (C) 2015, Motorola Mobility + * + * Written by Michael Halcrow, 2014. + * + * Filename encryption additions + * Uday Savagaonkar, 2014 + * Encryption policy handling additions + * Ildar Muslukhov, 2014 + * Add fscrypt_pullback_bio_page() + * Jaegeuk Kim, 2015. + * + * This has not yet undergone a rigorous security audit. + * + * The usage of AES-XTS should conform to recommendations in NIST + * Special Publication 800-38E and IEEE P1619/D16. + */ + +#include <linux/pagemap.h> +#include <linux/module.h> +#include <linux/bio.h> +#include <linux/namei.h> +#include "fscrypt_private.h" + +/* + * Call fscrypt_decrypt_page on every single page, reusing the encryption + * context. + */ +static void completion_pages(struct work_struct *work) +{ + struct fscrypt_ctx *ctx = + container_of(work, struct fscrypt_ctx, r.work); + struct bio *bio = ctx->r.bio; + struct bio_vec *bv; + int i; + + bio_for_each_segment_all(bv, bio, i) { + struct page *page = bv->bv_page; + int ret = fscrypt_decrypt_page(page->mapping->host, page, + PAGE_SIZE, 0, page->index); + + if (ret) { + WARN_ON_ONCE(1); + SetPageError(page); + } else { + SetPageUptodate(page); + } + unlock_page(page); + } + fscrypt_release_ctx(ctx); + bio_put(bio); +} + +void fscrypt_decrypt_bio_pages(struct fscrypt_ctx *ctx, struct bio *bio) +{ + INIT_WORK(&ctx->r.work, completion_pages); + ctx->r.bio = bio; + queue_work(fscrypt_read_workqueue, &ctx->r.work); +} +EXPORT_SYMBOL(fscrypt_decrypt_bio_pages); + +void fscrypt_pullback_bio_page(struct page **page, bool restore) +{ + struct fscrypt_ctx *ctx; + struct page *bounce_page; + + /* The bounce data pages are unmapped. */ + if ((*page)->mapping) + return; + + /* The bounce data page is unmapped. 
*/ + bounce_page = *page; + ctx = (struct fscrypt_ctx *)page_private(bounce_page); + + /* restore control page */ + *page = ctx->w.control_page; + + if (restore) + fscrypt_restore_control_page(bounce_page); +} +EXPORT_SYMBOL(fscrypt_pullback_bio_page); + +int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk, + sector_t pblk, unsigned int len) +{ + struct fscrypt_ctx *ctx; + struct page *ciphertext_page = NULL; + struct bio *bio; + int ret, err = 0; + + BUG_ON(inode->i_sb->s_blocksize != PAGE_SIZE); + + ctx = fscrypt_get_ctx(inode, GFP_NOFS); + if (IS_ERR(ctx)) + return PTR_ERR(ctx); + + ciphertext_page = fscrypt_alloc_bounce_page(ctx, GFP_NOWAIT); + if (IS_ERR(ciphertext_page)) { + err = PTR_ERR(ciphertext_page); + goto errout; + } + + while (len--) { + err = fscrypt_do_page_crypto(inode, FS_ENCRYPT, lblk, + ZERO_PAGE(0), ciphertext_page, + PAGE_SIZE, 0, GFP_NOFS); + if (err) + goto errout; + + bio = bio_alloc(GFP_NOWAIT, 1); + if (!bio) { + err = -ENOMEM; + goto errout; + } + bio->bi_bdev = inode->i_sb->s_bdev; + bio->bi_iter.bi_sector = + pblk << (inode->i_sb->s_blocksize_bits - 9); + bio_set_op_attrs(bio, REQ_OP_WRITE, 0); + ret = bio_add_page(bio, ciphertext_page, + inode->i_sb->s_blocksize, 0); + if (ret != inode->i_sb->s_blocksize) { + /* should never happen! 
*/ + WARN_ON(1); + bio_put(bio); + err = -EIO; + goto errout; + } + err = submit_bio_wait(bio); + if ((err == 0) && bio->bi_error) + err = -EIO; + bio_put(bio); + if (err) + goto errout; + lblk++; + pblk++; + } + err = 0; +errout: + fscrypt_release_ctx(ctx); + return err; +} +EXPORT_SYMBOL(fscrypt_zeroout_range); diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c index ac8e4f6a3773..02a7a9286449 100644 --- a/fs/crypto/crypto.c +++ b/fs/crypto/crypto.c @@ -24,7 +24,6 @@ #include <linux/module.h> #include <linux/scatterlist.h> #include <linux/ratelimit.h> -#include <linux/bio.h> #include <linux/dcache.h> #include <linux/namei.h> #include "fscrypt_private.h" @@ -44,7 +43,7 @@ static mempool_t *fscrypt_bounce_page_pool = NULL; static LIST_HEAD(fscrypt_free_ctxs); static DEFINE_SPINLOCK(fscrypt_ctx_lock); -static struct workqueue_struct *fscrypt_read_workqueue; +struct workqueue_struct *fscrypt_read_workqueue; static DEFINE_MUTEX(fscrypt_init_mutex); static struct kmem_cache *fscrypt_ctx_cachep; @@ -141,16 +140,10 @@ static void page_crypt_complete(struct crypto_async_request *req, int res) complete(&ecr->completion); } -typedef enum { - FS_DECRYPT = 0, - FS_ENCRYPT, -} fscrypt_direction_t; - -static int do_page_crypto(const struct inode *inode, - fscrypt_direction_t rw, u64 lblk_num, - struct page *src_page, struct page *dest_page, - unsigned int len, unsigned int offs, - gfp_t gfp_flags) +int fscrypt_do_page_crypto(const struct inode *inode, fscrypt_direction_t rw, + u64 lblk_num, struct page *src_page, + struct page *dest_page, unsigned int len, + unsigned int offs, gfp_t gfp_flags) { struct { __le64 index; @@ -205,7 +198,8 @@ static int do_page_crypto(const struct inode *inode, return 0; } -static struct page *alloc_bounce_page(struct fscrypt_ctx *ctx, gfp_t gfp_flags) +struct page *fscrypt_alloc_bounce_page(struct fscrypt_ctx *ctx, + gfp_t gfp_flags) { ctx->w.bounce_page = mempool_alloc(fscrypt_bounce_page_pool, gfp_flags); if (ctx->w.bounce_page == NULL) @@ 
-260,9 +254,9 @@ struct page *fscrypt_encrypt_page(const struct inode *inode, if (inode->i_sb->s_cop->flags & FS_CFLG_OWN_PAGES) { /* with inplace-encryption we just encrypt the page */ - err = do_page_crypto(inode, FS_ENCRYPT, lblk_num, - page, ciphertext_page, - len, offs, gfp_flags); + err = fscrypt_do_page_crypto(inode, FS_ENCRYPT, lblk_num, page, + ciphertext_page, len, offs, + gfp_flags); if (err) return ERR_PTR(err); @@ -276,14 +270,14 @@ struct page *fscrypt_encrypt_page(const struct inode *inode, return (struct page *)ctx; /* The encryption operation will require a bounce page. */ - ciphertext_page = alloc_bounce_page(ctx, gfp_flags); + ciphertext_page = fscrypt_alloc_bounce_page(ctx, gfp_flags); if (IS_ERR(ciphertext_page)) goto errout; ctx->w.control_page = page; - err = do_page_crypto(inode, FS_ENCRYPT, lblk_num, - page, ciphertext_page, - len, offs, gfp_flags); + err = fscrypt_do_page_crypto(inode, FS_ENCRYPT, lblk_num, + page, ciphertext_page, len, offs, + gfp_flags); if (err) { ciphertext_page = ERR_PTR(err); goto errout; @@ -320,72 +314,11 @@ int fscrypt_decrypt_page(const struct inode *inode, struct page *page, if (!(inode->i_sb->s_cop->flags & FS_CFLG_OWN_PAGES)) BUG_ON(!PageLocked(page)); - return do_page_crypto(inode, FS_DECRYPT, lblk_num, page, page, len, - offs, GFP_NOFS); + return fscrypt_do_page_crypto(inode, FS_DECRYPT, lblk_num, page, page, + len, offs, GFP_NOFS); } EXPORT_SYMBOL(fscrypt_decrypt_page); -int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk, - sector_t pblk, unsigned int len) -{ - struct fscrypt_ctx *ctx; - struct page *ciphertext_page = NULL; - struct bio *bio; - int ret, err = 0; - - BUG_ON(inode->i_sb->s_blocksize != PAGE_SIZE); - - ctx = fscrypt_get_ctx(inode, GFP_NOFS); - if (IS_ERR(ctx)) - return PTR_ERR(ctx); - - ciphertext_page = alloc_bounce_page(ctx, GFP_NOWAIT); - if (IS_ERR(ciphertext_page)) { - err = PTR_ERR(ciphertext_page); - goto errout; - } - - while (len--) { - err = do_page_crypto(inode, 
FS_ENCRYPT, lblk, - ZERO_PAGE(0), ciphertext_page, - PAGE_SIZE, 0, GFP_NOFS); - if (err) - goto errout; - - bio = bio_alloc(GFP_NOWAIT, 1); - if (!bio) { - err = -ENOMEM; - goto errout; - } - bio->bi_bdev = inode->i_sb->s_bdev; - bio->bi_iter.bi_sector = - pblk << (inode->i_sb->s_blocksize_bits - 9); - bio_set_op_attrs(bio, REQ_OP_WRITE, 0); - ret = bio_add_page(bio, ciphertext_page, - inode->i_sb->s_blocksize, 0); - if (ret != inode->i_sb->s_blocksize) { - /* should never happen! */ - WARN_ON(1); - bio_put(bio); - err = -EIO; - goto errout; - } - err = submit_bio_wait(bio); - if ((err == 0) && bio->bi_error) - err = -EIO; - bio_put(bio); - if (err) - goto errout; - lblk++; - pblk++; - } - err = 0; -errout: - fscrypt_release_ctx(ctx); - return err; -} -EXPORT_SYMBOL(fscrypt_zeroout_range); - /* * Validate dentries for encrypted directories to make sure we aren't * potentially caching stale data after a key has been added or @@ -442,64 +375,6 @@ const struct dentry_operations fscrypt_d_ops = { }; EXPORT_SYMBOL(fscrypt_d_ops); -/* - * Call fscrypt_decrypt_page on every single page, reusing the encryption - * context. 
- */ -static void completion_pages(struct work_struct *work) -{ - struct fscrypt_ctx *ctx = - container_of(work, struct fscrypt_ctx, r.work); - struct bio *bio = ctx->r.bio; - struct bio_vec *bv; - int i; - - bio_for_each_segment_all(bv, bio, i) { - struct page *page = bv->bv_page; - int ret = fscrypt_decrypt_page(page->mapping->host, page, - PAGE_SIZE, 0, page->index); - - if (ret) { - WARN_ON_ONCE(1); - SetPageError(page); - } else { - SetPageUptodate(page); - } - unlock_page(page); - } - fscrypt_release_ctx(ctx); - bio_put(bio); -} - -void fscrypt_decrypt_bio_pages(struct fscrypt_ctx *ctx, struct bio *bio) -{ - INIT_WORK(&ctx->r.work, completion_pages); - ctx->r.bio = bio; - queue_work(fscrypt_read_workqueue, &ctx->r.work); -} -EXPORT_SYMBOL(fscrypt_decrypt_bio_pages); - -void fscrypt_pullback_bio_page(struct page **page, bool restore) -{ - struct fscrypt_ctx *ctx; - struct page *bounce_page; - - /* The bounce data pages are unmapped. */ - if ((*page)->mapping) - return; - - /* The bounce data page is unmapped. */ - bounce_page = *page; - ctx = (struct fscrypt_ctx *)page_private(bounce_page); - - /* restore control page */ - *page = ctx->w.control_page; - - if (restore) - fscrypt_restore_control_page(bounce_page); -} -EXPORT_SYMBOL(fscrypt_pullback_bio_page); - void fscrypt_restore_control_page(struct page *page) { struct fscrypt_ctx *ctx; diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c index 56ad9d195f18..13052b85c393 100644 --- a/fs/crypto/fname.c +++ b/fs/crypto/fname.c @@ -332,7 +332,7 @@ int fscrypt_fname_usr_to_disk(struct inode *inode, * in a directory. 
Consequently, a user space name cannot be mapped to * a disk-space name */ - return -EACCES; + return -ENOKEY; } EXPORT_SYMBOL(fscrypt_fname_usr_to_disk); @@ -367,7 +367,7 @@ int fscrypt_setup_filename(struct inode *dir, const struct qstr *iname, return 0; } if (!lookup) - return -EACCES; + return -ENOKEY; /* * We don't have the key and we are doing a lookup; decode the diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h index aeab032d7d35..fdbb8af32eaf 100644 --- a/fs/crypto/fscrypt_private.h +++ b/fs/crypto/fscrypt_private.h @@ -11,7 +11,7 @@ #ifndef _FSCRYPT_PRIVATE_H #define _FSCRYPT_PRIVATE_H -#include <linux/fscrypto.h> +#include <linux/fscrypt_supp.h> #define FS_FNAME_CRYPTO_DIGEST_SIZE 32 @@ -71,6 +71,11 @@ struct fscrypt_info { u8 ci_master_key[FS_KEY_DESCRIPTOR_SIZE]; }; +typedef enum { + FS_DECRYPT = 0, + FS_ENCRYPT, +} fscrypt_direction_t; + #define FS_CTX_REQUIRES_FREE_ENCRYPT_FL 0x00000001 #define FS_CTX_HAS_BOUNCE_BUFFER_FL 0x00000002 @@ -81,11 +86,20 @@ struct fscrypt_completion_result { #define DECLARE_FS_COMPLETION_RESULT(ecr) \ struct fscrypt_completion_result ecr = { \ - COMPLETION_INITIALIZER((ecr).completion), 0 } + COMPLETION_INITIALIZER_ONSTACK((ecr).completion), 0 } /* crypto.c */ -int fscrypt_initialize(unsigned int cop_flags); +extern int fscrypt_initialize(unsigned int cop_flags); +extern struct workqueue_struct *fscrypt_read_workqueue; +extern int fscrypt_do_page_crypto(const struct inode *inode, + fscrypt_direction_t rw, u64 lblk_num, + struct page *src_page, + struct page *dest_page, + unsigned int len, unsigned int offs, + gfp_t gfp_flags); +extern struct page *fscrypt_alloc_bounce_page(struct fscrypt_ctx *ctx, + gfp_t gfp_flags); /* keyinfo.c */ extern int fscrypt_get_crypt_info(struct inode *); diff --git a/fs/crypto/keyinfo.c b/fs/crypto/keyinfo.c index 6eeea1dcba41..02eb6b9e4438 100644 --- a/fs/crypto/keyinfo.c +++ b/fs/crypto/keyinfo.c @@ -77,26 +77,22 @@ out: static int validate_user_key(struct fscrypt_info 
*crypt_info, struct fscrypt_context *ctx, u8 *raw_key, - u8 *prefix, int prefix_size) + const char *prefix) { - u8 *full_key_descriptor; + char *description; struct key *keyring_key; struct fscrypt_key *master_key; const struct user_key_payload *ukp; - int full_key_len = prefix_size + (FS_KEY_DESCRIPTOR_SIZE * 2) + 1; int res; - full_key_descriptor = kmalloc(full_key_len, GFP_NOFS); - if (!full_key_descriptor) + description = kasprintf(GFP_NOFS, "%s%*phN", prefix, + FS_KEY_DESCRIPTOR_SIZE, + ctx->master_key_descriptor); + if (!description) return -ENOMEM; - memcpy(full_key_descriptor, prefix, prefix_size); - sprintf(full_key_descriptor + prefix_size, - "%*phN", FS_KEY_DESCRIPTOR_SIZE, - ctx->master_key_descriptor); - full_key_descriptor[full_key_len - 1] = '\0'; - keyring_key = request_key(&key_type_logon, full_key_descriptor, NULL); - kfree(full_key_descriptor); + keyring_key = request_key(&key_type_logon, description, NULL); + kfree(description); if (IS_ERR(keyring_key)) return PTR_ERR(keyring_key); @@ -206,12 +202,15 @@ retry: res = inode->i_sb->s_cop->get_context(inode, &ctx, sizeof(ctx)); if (res < 0) { - if (!fscrypt_dummy_context_enabled(inode)) + if (!fscrypt_dummy_context_enabled(inode) || + inode->i_sb->s_cop->is_encrypted(inode)) return res; + /* Fake up a context for an unencrypted directory */ + memset(&ctx, 0, sizeof(ctx)); ctx.format = FS_ENCRYPTION_CONTEXT_FORMAT_V1; ctx.contents_encryption_mode = FS_ENCRYPTION_MODE_AES_256_XTS; ctx.filenames_encryption_mode = FS_ENCRYPTION_MODE_AES_256_CTS; - ctx.flags = 0; + memset(ctx.master_key_descriptor, 0x42, FS_KEY_DESCRIPTOR_SIZE); } else if (res != sizeof(ctx)) { return -EINVAL; } @@ -247,20 +246,10 @@ retry: if (!raw_key) goto out; - if (fscrypt_dummy_context_enabled(inode)) { - memset(raw_key, 0x42, FS_AES_256_XTS_KEY_SIZE); - goto got_key; - } - - res = validate_user_key(crypt_info, &ctx, raw_key, - FS_KEY_DESC_PREFIX, FS_KEY_DESC_PREFIX_SIZE); + res = validate_user_key(crypt_info, &ctx, raw_key, 
FS_KEY_DESC_PREFIX); if (res && inode->i_sb->s_cop->key_prefix) { - u8 *prefix = NULL; - int prefix_size, res2; - - prefix_size = inode->i_sb->s_cop->key_prefix(inode, &prefix); - res2 = validate_user_key(crypt_info, &ctx, raw_key, - prefix, prefix_size); + int res2 = validate_user_key(crypt_info, &ctx, raw_key, + inode->i_sb->s_cop->key_prefix); if (res2) { if (res2 == -ENOKEY) res = -ENOKEY; @@ -269,7 +258,6 @@ retry: } else if (res) { goto out; } -got_key: ctfm = crypto_alloc_skcipher(cipher_str, 0, 0); if (!ctfm || IS_ERR(ctfm)) { res = ctfm ? PTR_ERR(ctfm) : -ENOMEM; diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c index 6ed7c2eebeec..14b76da71269 100644 --- a/fs/crypto/policy.c +++ b/fs/crypto/policy.c @@ -13,37 +13,20 @@ #include <linux/mount.h> #include "fscrypt_private.h" -static int inode_has_encryption_context(struct inode *inode) -{ - if (!inode->i_sb->s_cop->get_context) - return 0; - return (inode->i_sb->s_cop->get_context(inode, NULL, 0L) > 0); -} - /* - * check whether the policy is consistent with the encryption context - * for the inode + * check whether an encryption policy is consistent with an encryption context */ -static int is_encryption_context_consistent_with_policy(struct inode *inode, +static bool is_encryption_context_consistent_with_policy( + const struct fscrypt_context *ctx, const struct fscrypt_policy *policy) { - struct fscrypt_context ctx; - int res; - - if (!inode->i_sb->s_cop->get_context) - return 0; - - res = inode->i_sb->s_cop->get_context(inode, &ctx, sizeof(ctx)); - if (res != sizeof(ctx)) - return 0; - - return (memcmp(ctx.master_key_descriptor, policy->master_key_descriptor, - FS_KEY_DESCRIPTOR_SIZE) == 0 && - (ctx.flags == policy->flags) && - (ctx.contents_encryption_mode == - policy->contents_encryption_mode) && - (ctx.filenames_encryption_mode == - policy->filenames_encryption_mode)); + return memcmp(ctx->master_key_descriptor, policy->master_key_descriptor, + FS_KEY_DESCRIPTOR_SIZE) == 0 && + (ctx->flags == 
policy->flags) && + (ctx->contents_encryption_mode == + policy->contents_encryption_mode) && + (ctx->filenames_encryption_mode == + policy->filenames_encryption_mode); } static int create_encryption_context_from_policy(struct inode *inode, @@ -66,20 +49,12 @@ static int create_encryption_context_from_policy(struct inode *inode, FS_KEY_DESCRIPTOR_SIZE); if (!fscrypt_valid_contents_enc_mode( - policy->contents_encryption_mode)) { - printk(KERN_WARNING - "%s: Invalid contents encryption mode %d\n", __func__, - policy->contents_encryption_mode); + policy->contents_encryption_mode)) return -EINVAL; - } if (!fscrypt_valid_filenames_enc_mode( - policy->filenames_encryption_mode)) { - printk(KERN_WARNING - "%s: Invalid filenames encryption mode %d\n", __func__, - policy->filenames_encryption_mode); + policy->filenames_encryption_mode)) return -EINVAL; - } if (policy->flags & ~FS_POLICY_FLAGS_VALID) return -EINVAL; @@ -98,6 +73,7 @@ int fscrypt_ioctl_set_policy(struct file *filp, const void __user *arg) struct fscrypt_policy policy; struct inode *inode = file_inode(filp); int ret; + struct fscrypt_context ctx; if (copy_from_user(&policy, arg, sizeof(policy))) return -EFAULT; @@ -114,9 +90,10 @@ int fscrypt_ioctl_set_policy(struct file *filp, const void __user *arg) inode_lock(inode); - if (!inode_has_encryption_context(inode)) { + ret = inode->i_sb->s_cop->get_context(inode, &ctx, sizeof(ctx)); + if (ret == -ENODATA) { if (!S_ISDIR(inode->i_mode)) - ret = -EINVAL; + ret = -ENOTDIR; else if (!inode->i_sb->s_cop->empty_dir) ret = -EOPNOTSUPP; else if (!inode->i_sb->s_cop->empty_dir(inode)) @@ -124,12 +101,14 @@ int fscrypt_ioctl_set_policy(struct file *filp, const void __user *arg) else ret = create_encryption_context_from_policy(inode, &policy); - } else if (!is_encryption_context_consistent_with_policy(inode, - &policy)) { - printk(KERN_WARNING - "%s: Policy inconsistent with encryption context\n", - __func__); - ret = -EINVAL; + } else if (ret == sizeof(ctx) && + 
is_encryption_context_consistent_with_policy(&ctx, + &policy)) { + /* The file already uses the same encryption policy. */ + ret = 0; + } else if (ret >= 0 || ret == -ERANGE) { + /* The file already uses a different encryption policy. */ + ret = -EEXIST; } inode_unlock(inode); @@ -151,8 +130,10 @@ int fscrypt_ioctl_get_policy(struct file *filp, void __user *arg) return -ENODATA; res = inode->i_sb->s_cop->get_context(inode, &ctx, sizeof(ctx)); + if (res < 0 && res != -ERANGE) + return res; if (res != sizeof(ctx)) - return -ENODATA; + return -EINVAL; if (ctx.format != FS_ENCRYPTION_CONTEXT_FORMAT_V1) return -EINVAL; @@ -179,6 +160,11 @@ int fscrypt_has_permitted_context(struct inode *parent, struct inode *child) BUG_ON(1); } + /* No restrictions on file types which are never encrypted */ + if (!S_ISREG(child->i_mode) && !S_ISDIR(child->i_mode) && + !S_ISLNK(child->i_mode)) + return 1; + /* no restrictions if the parent directory is not encrypted */ if (!parent->i_sb->s_cop->is_encrypted(parent)) return 1; @@ -212,9 +198,9 @@ EXPORT_SYMBOL(fscrypt_has_permitted_context); * @parent: Parent inode from which the context is inherited. * @child: Child inode that inherits the context from @parent. * @fs_data: private data given by FS. 
- * @preload: preload child i_crypt_info + * @preload: preload child i_crypt_info if true * - * Return: Zero on success, non-zero otherwise + * Return: 0 on success, -errno on failure */ int fscrypt_inherit_context(struct inode *parent, struct inode *child, void *fs_data, bool preload) @@ -235,19 +221,11 @@ int fscrypt_inherit_context(struct inode *parent, struct inode *child, return -ENOKEY; ctx.format = FS_ENCRYPTION_CONTEXT_FORMAT_V1; - if (fscrypt_dummy_context_enabled(parent)) { - ctx.contents_encryption_mode = FS_ENCRYPTION_MODE_AES_256_XTS; - ctx.filenames_encryption_mode = FS_ENCRYPTION_MODE_AES_256_CTS; - ctx.flags = 0; - memset(ctx.master_key_descriptor, 0x42, FS_KEY_DESCRIPTOR_SIZE); - res = 0; - } else { - ctx.contents_encryption_mode = ci->ci_data_mode; - ctx.filenames_encryption_mode = ci->ci_filename_mode; - ctx.flags = ci->ci_flags; - memcpy(ctx.master_key_descriptor, ci->ci_master_key, - FS_KEY_DESCRIPTOR_SIZE); - } + ctx.contents_encryption_mode = ci->ci_data_mode; + ctx.filenames_encryption_mode = ci->ci_filename_mode; + ctx.flags = ci->ci_flags; + memcpy(ctx.master_key_descriptor, ci->ci_master_key, + FS_KEY_DESCRIPTOR_SIZE); get_random_bytes(ctx.nonce, FS_KEY_DERIVATION_NONCE_SIZE); res = parent->i_sb->s_cop->set_context(child, &ctx, sizeof(ctx), fs_data); @@ -35,6 +35,9 @@ #include <linux/iomap.h> #include "internal.h" +#define CREATE_TRACE_POINTS +#include <trace/events/fs_dax.h> + /* We choose 4096 entries - same as per-zone page wait tables */ #define DAX_WAIT_TABLE_BITS 12 #define DAX_WAIT_TABLE_ENTRIES (1 << DAX_WAIT_TABLE_BITS) @@ -451,16 +454,37 @@ void dax_wake_mapping_entry_waiter(struct address_space *mapping, __wake_up(wq, TASK_NORMAL, wake_all ? 
0 : 1, &key); } +static int __dax_invalidate_mapping_entry(struct address_space *mapping, + pgoff_t index, bool trunc) +{ + int ret = 0; + void *entry; + struct radix_tree_root *page_tree = &mapping->page_tree; + + spin_lock_irq(&mapping->tree_lock); + entry = get_unlocked_mapping_entry(mapping, index, NULL); + if (!entry || !radix_tree_exceptional_entry(entry)) + goto out; + if (!trunc && + (radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_DIRTY) || + radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE))) + goto out; + radix_tree_delete(page_tree, index); + mapping->nrexceptional--; + ret = 1; +out: + put_unlocked_mapping_entry(mapping, index, entry); + spin_unlock_irq(&mapping->tree_lock); + return ret; +} /* * Delete exceptional DAX entry at @index from @mapping. Wait for radix tree * entry to get unlocked before deleting it. */ int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index) { - void *entry; + int ret = __dax_invalidate_mapping_entry(mapping, index, true); - spin_lock_irq(&mapping->tree_lock); - entry = get_unlocked_mapping_entry(mapping, index, NULL); /* * This gets called from truncate / punch_hole path. As such, the caller * must hold locks protecting against concurrent modifications of the @@ -468,16 +492,46 @@ int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index) * caller has seen exceptional entry for this index, we better find it * at that index as well... */ - if (WARN_ON_ONCE(!entry || !radix_tree_exceptional_entry(entry))) { - spin_unlock_irq(&mapping->tree_lock); - return 0; - } - radix_tree_delete(&mapping->page_tree, index); + WARN_ON_ONCE(!ret); + return ret; +} + +/* + * Invalidate exceptional DAX entry if easily possible. This handles DAX + * entries for invalidate_inode_pages() so we evict the entry only if we can + * do so without blocking. 
+ */ +int dax_invalidate_mapping_entry(struct address_space *mapping, pgoff_t index) +{ + int ret = 0; + void *entry, **slot; + struct radix_tree_root *page_tree = &mapping->page_tree; + + spin_lock_irq(&mapping->tree_lock); + entry = __radix_tree_lookup(page_tree, index, NULL, &slot); + if (!entry || !radix_tree_exceptional_entry(entry) || + slot_locked(mapping, slot)) + goto out; + if (radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_DIRTY) || + radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE)) + goto out; + radix_tree_delete(page_tree, index); mapping->nrexceptional--; + ret = 1; +out: spin_unlock_irq(&mapping->tree_lock); - dax_wake_mapping_entry_waiter(mapping, index, entry, true); + if (ret) + dax_wake_mapping_entry_waiter(mapping, index, entry, true); + return ret; +} - return 1; +/* + * Invalidate exceptional DAX entry if it is clean. + */ +int dax_invalidate_mapping_entry_sync(struct address_space *mapping, + pgoff_t index) +{ + return __dax_invalidate_mapping_entry(mapping, index, false); } /* @@ -488,15 +542,16 @@ int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index) * otherwise it will simply fall out of the page cache under memory * pressure without ever having been dirtied. */ -static int dax_load_hole(struct address_space *mapping, void *entry, +static int dax_load_hole(struct address_space *mapping, void **entry, struct vm_fault *vmf) { struct page *page; + int ret; /* Hole page already exists? Return it... 
*/ - if (!radix_tree_exceptional_entry(entry)) { - vmf->page = entry; - return VM_FAULT_LOCKED; + if (!radix_tree_exceptional_entry(*entry)) { + page = *entry; + goto out; } /* This will replace locked radix tree entry with a hole page */ @@ -504,8 +559,17 @@ static int dax_load_hole(struct address_space *mapping, void *entry, vmf->gfp_mask | __GFP_ZERO); if (!page) return VM_FAULT_OOM; + out: vmf->page = page; - return VM_FAULT_LOCKED; + ret = finish_fault(vmf); + vmf->page = NULL; + *entry = page; + if (!ret) { + /* Grab reference for PTE that is now referencing the page */ + get_page(page); + return VM_FAULT_NOPAGE; + } + return ret; } static int copy_user_dax(struct block_device *bdev, sector_t sector, size_t size, @@ -630,8 +694,8 @@ static void dax_mapping_entry_mkclean(struct address_space *mapping, pgoff_t index, unsigned long pfn) { struct vm_area_struct *vma; - pte_t *ptep; - pte_t pte; + pte_t pte, *ptep = NULL; + pmd_t *pmdp = NULL; spinlock_t *ptl; bool changed; @@ -646,21 +710,42 @@ static void dax_mapping_entry_mkclean(struct address_space *mapping, address = pgoff_address(index, vma); changed = false; - if (follow_pte(vma->vm_mm, address, &ptep, &ptl)) + if (follow_pte_pmd(vma->vm_mm, address, &ptep, &pmdp, &ptl)) continue; - if (pfn != pte_pfn(*ptep)) - goto unlock; - if (!pte_dirty(*ptep) && !pte_write(*ptep)) - goto unlock; - flush_cache_page(vma, address, pfn); - pte = ptep_clear_flush(vma, address, ptep); - pte = pte_wrprotect(pte); - pte = pte_mkclean(pte); - set_pte_at(vma->vm_mm, address, ptep, pte); - changed = true; -unlock: - pte_unmap_unlock(ptep, ptl); + if (pmdp) { +#ifdef CONFIG_FS_DAX_PMD + pmd_t pmd; + + if (pfn != pmd_pfn(*pmdp)) + goto unlock_pmd; + if (!pmd_dirty(*pmdp) && !pmd_write(*pmdp)) + goto unlock_pmd; + + flush_cache_page(vma, address, pfn); + pmd = pmdp_huge_clear_flush(vma, address, pmdp); + pmd = pmd_wrprotect(pmd); + pmd = pmd_mkclean(pmd); + set_pmd_at(vma->vm_mm, address, pmdp, pmd); + changed = true; +unlock_pmd: 
+ spin_unlock(ptl); +#endif + } else { + if (pfn != pte_pfn(*ptep)) + goto unlock_pte; + if (!pte_dirty(*ptep) && !pte_write(*ptep)) + goto unlock_pte; + + flush_cache_page(vma, address, pfn); + pte = ptep_clear_flush(vma, address, ptep); + pte = pte_wrprotect(pte); + pte = pte_mkclean(pte); + set_pte_at(vma->vm_mm, address, ptep, pte); + changed = true; +unlock_pte: + pte_unmap_unlock(ptep, ptl); + } if (changed) mmu_notifier_invalidate_page(vma->vm_mm, address); @@ -908,7 +993,6 @@ int __dax_zero_page_range(struct block_device *bdev, sector_t sector, } EXPORT_SYMBOL_GPL(__dax_zero_page_range); -#ifdef CONFIG_FS_IOMAP static sector_t dax_iomap_sector(struct iomap *iomap, loff_t pos) { return iomap->blkno + (((pos & PAGE_MASK) - iomap->offset) >> 9); @@ -934,11 +1018,27 @@ dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data, if (WARN_ON_ONCE(iomap->type != IOMAP_MAPPED)) return -EIO; + /* + * Write can allocate block for an area which has a hole page mapped + * into page tables. We have to tear down these mappings so that data + * written by write(2) is visible in mmap. 
+ */ + if ((iomap->flags & IOMAP_F_NEW) && inode->i_mapping->nrpages) { + invalidate_inode_pages2_range(inode->i_mapping, + pos >> PAGE_SHIFT, + (end - 1) >> PAGE_SHIFT); + } + while (pos < end) { unsigned offset = pos & (PAGE_SIZE - 1); struct blk_dax_ctl dax = { 0 }; ssize_t map_len; + if (fatal_signal_pending(current)) { + ret = -EINTR; + break; + } + dax.sector = dax_iomap_sector(iomap, pos); dax.size = (length + offset + PAGE_SIZE - 1) & PAGE_MASK; map_len = dax_map_atomic(iomap->bdev, &dax); @@ -982,31 +1082,18 @@ dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data, */ ssize_t dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter, - struct iomap_ops *ops) + const struct iomap_ops *ops) { struct address_space *mapping = iocb->ki_filp->f_mapping; struct inode *inode = mapping->host; loff_t pos = iocb->ki_pos, ret = 0, done = 0; unsigned flags = 0; - if (iov_iter_rw(iter) == WRITE) + if (iov_iter_rw(iter) == WRITE) { + lockdep_assert_held_exclusive(&inode->i_rwsem); flags |= IOMAP_WRITE; - - /* - * Yes, even DAX files can have page cache attached to them: A zeroed - * page is inserted into the pagecache when we have to serve a write - * fault on a hole. It should never be dirtied and can simply be - * dropped from the pagecache once we get real data for the page. - * - * XXX: This is racy against mmap, and there's nothing we can do about - * it. We'll eventually need to shift this down even further so that - * we can check if we allocated blocks over a hole first. 
- */ - if (mapping->nrpages) { - ret = invalidate_inode_pages2_range(mapping, - pos >> PAGE_SHIFT, - (pos + iov_iter_count(iter) - 1) >> PAGE_SHIFT); - WARN_ON_ONCE(ret); + } else { + lockdep_assert_held(&inode->i_rwsem); } while (iov_iter_count(iter)) { @@ -1023,6 +1110,15 @@ dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter, } EXPORT_SYMBOL_GPL(dax_iomap_rw); +static int dax_fault_return(int error) +{ + if (error == 0) + return VM_FAULT_NOPAGE; + if (error == -ENOMEM) + return VM_FAULT_OOM; + return VM_FAULT_SIGBUS; +} + /** * dax_iomap_fault - handle a page fault on a DAX file * @vma: The virtual memory area where the fault occurred @@ -1034,7 +1130,7 @@ EXPORT_SYMBOL_GPL(dax_iomap_rw); * necessary locking for the page fault to proceed successfully. */ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf, - struct iomap_ops *ops) + const struct iomap_ops *ops) { struct address_space *mapping = vma->vm_file->f_mapping; struct inode *inode = mapping->host; @@ -1055,12 +1151,6 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf, if (pos >= i_size_read(inode)) return VM_FAULT_SIGBUS; - entry = grab_mapping_entry(mapping, vmf->pgoff, 0); - if (IS_ERR(entry)) { - error = PTR_ERR(entry); - goto out; - } - if ((vmf->flags & FAULT_FLAG_WRITE) && !vmf->cow_page) flags |= IOMAP_WRITE; @@ -1071,9 +1161,15 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf, */ error = ops->iomap_begin(inode, pos, PAGE_SIZE, flags, &iomap); if (error) - goto unlock_entry; + return dax_fault_return(error); if (WARN_ON_ONCE(iomap.offset + iomap.length < pos + PAGE_SIZE)) { - error = -EIO; /* fs corruption? */ + vmf_ret = dax_fault_return(-EIO); /* fs corruption? 
*/ + goto finish_iomap; + } + + entry = grab_mapping_entry(mapping, vmf->pgoff, 0); + if (IS_ERR(entry)) { + vmf_ret = dax_fault_return(PTR_ERR(entry)); goto finish_iomap; } @@ -1096,13 +1192,13 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf, } if (error) - goto finish_iomap; + goto error_unlock_entry; __SetPageUptodate(vmf->cow_page); vmf_ret = finish_fault(vmf); if (!vmf_ret) vmf_ret = VM_FAULT_DONE_COW; - goto finish_iomap; + goto unlock_entry; } switch (iomap.type) { @@ -1114,12 +1210,15 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf, } error = dax_insert_mapping(mapping, iomap.bdev, sector, PAGE_SIZE, &entry, vma, vmf); + /* -EBUSY is fine, somebody else faulted on the same PTE */ + if (error == -EBUSY) + error = 0; break; case IOMAP_UNWRITTEN: case IOMAP_HOLE: if (!(vmf->flags & FAULT_FLAG_WRITE)) { - vmf_ret = dax_load_hole(mapping, entry, vmf); - break; + vmf_ret = dax_load_hole(mapping, &entry, vmf); + goto unlock_entry; } /*FALLTHRU*/ default: @@ -1128,31 +1227,25 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf, break; } + error_unlock_entry: + vmf_ret = dax_fault_return(error) | major; + unlock_entry: + put_locked_mapping_entry(mapping, vmf->pgoff, entry); finish_iomap: if (ops->iomap_end) { - if (error || (vmf_ret & VM_FAULT_ERROR)) { - /* keep previous error */ - ops->iomap_end(inode, pos, PAGE_SIZE, 0, flags, - &iomap); - } else { - error = ops->iomap_end(inode, pos, PAGE_SIZE, - PAGE_SIZE, flags, &iomap); - } - } - unlock_entry: - if (vmf_ret != VM_FAULT_LOCKED || error) - put_locked_mapping_entry(mapping, vmf->pgoff, entry); - out: - if (error == -ENOMEM) - return VM_FAULT_OOM | major; - /* -EBUSY is fine, somebody else faulted on the same PTE */ - if (error < 0 && error != -EBUSY) - return VM_FAULT_SIGBUS | major; - if (vmf_ret) { - WARN_ON_ONCE(error); /* -EBUSY from ops->iomap_end? 
*/ - return vmf_ret; + int copied = PAGE_SIZE; + + if (vmf_ret & VM_FAULT_ERROR) + copied = 0; + /* + * The fault is done by now and there's no way back (other + * thread may be already happily using PTE we have installed). + * Just ignore error from ->iomap_end since we cannot do much + * with it. + */ + ops->iomap_end(inode, pos, PAGE_SIZE, copied, flags, &iomap); } - return VM_FAULT_NOPAGE | major; + return vmf_ret; } EXPORT_SYMBOL_GPL(dax_iomap_fault); @@ -1163,21 +1256,21 @@ EXPORT_SYMBOL_GPL(dax_iomap_fault); */ #define PG_PMD_COLOUR ((PMD_SIZE >> PAGE_SHIFT) - 1) -static int dax_pmd_insert_mapping(struct vm_area_struct *vma, pmd_t *pmd, - struct vm_fault *vmf, unsigned long address, - struct iomap *iomap, loff_t pos, bool write, void **entryp) +static int dax_pmd_insert_mapping(struct vm_fault *vmf, struct iomap *iomap, + loff_t pos, void **entryp) { - struct address_space *mapping = vma->vm_file->f_mapping; + struct address_space *mapping = vmf->vma->vm_file->f_mapping; struct block_device *bdev = iomap->bdev; + struct inode *inode = mapping->host; struct blk_dax_ctl dax = { .sector = dax_iomap_sector(iomap, pos), .size = PMD_SIZE, }; long length = dax_map_atomic(bdev, &dax); - void *ret; + void *ret = NULL; if (length < 0) /* dax_map_atomic() failed */ - return VM_FAULT_FALLBACK; + goto fallback; if (length < PMD_SIZE) goto unmap_fallback; if (pfn_t_to_pfn(dax.pfn) & PG_PMD_COLOUR) @@ -1190,67 +1283,86 @@ static int dax_pmd_insert_mapping(struct vm_area_struct *vma, pmd_t *pmd, ret = dax_insert_mapping_entry(mapping, vmf, *entryp, dax.sector, RADIX_DAX_PMD); if (IS_ERR(ret)) - return VM_FAULT_FALLBACK; + goto fallback; *entryp = ret; - return vmf_insert_pfn_pmd(vma, address, pmd, dax.pfn, write); + trace_dax_pmd_insert_mapping(inode, vmf, length, dax.pfn, ret); + return vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd, + dax.pfn, vmf->flags & FAULT_FLAG_WRITE); unmap_fallback: dax_unmap_atomic(bdev, &dax); +fallback: + 
trace_dax_pmd_insert_mapping_fallback(inode, vmf, length, + dax.pfn, ret); return VM_FAULT_FALLBACK; } -static int dax_pmd_load_hole(struct vm_area_struct *vma, pmd_t *pmd, - struct vm_fault *vmf, unsigned long address, - struct iomap *iomap, void **entryp) +static int dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap, + void **entryp) { - struct address_space *mapping = vma->vm_file->f_mapping; - unsigned long pmd_addr = address & PMD_MASK; + struct address_space *mapping = vmf->vma->vm_file->f_mapping; + unsigned long pmd_addr = vmf->address & PMD_MASK; + struct inode *inode = mapping->host; struct page *zero_page; + void *ret = NULL; spinlock_t *ptl; pmd_t pmd_entry; - void *ret; - zero_page = mm_get_huge_zero_page(vma->vm_mm); + zero_page = mm_get_huge_zero_page(vmf->vma->vm_mm); if (unlikely(!zero_page)) - return VM_FAULT_FALLBACK; + goto fallback; ret = dax_insert_mapping_entry(mapping, vmf, *entryp, 0, RADIX_DAX_PMD | RADIX_DAX_HZP); if (IS_ERR(ret)) - return VM_FAULT_FALLBACK; + goto fallback; *entryp = ret; - ptl = pmd_lock(vma->vm_mm, pmd); - if (!pmd_none(*pmd)) { + ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd); + if (!pmd_none(*(vmf->pmd))) { spin_unlock(ptl); - return VM_FAULT_FALLBACK; + goto fallback; } - pmd_entry = mk_pmd(zero_page, vma->vm_page_prot); + pmd_entry = mk_pmd(zero_page, vmf->vma->vm_page_prot); pmd_entry = pmd_mkhuge(pmd_entry); - set_pmd_at(vma->vm_mm, pmd_addr, pmd, pmd_entry); + set_pmd_at(vmf->vma->vm_mm, pmd_addr, vmf->pmd, pmd_entry); spin_unlock(ptl); + trace_dax_pmd_load_hole(inode, vmf, zero_page, ret); return VM_FAULT_NOPAGE; + +fallback: + trace_dax_pmd_load_hole_fallback(inode, vmf, zero_page, ret); + return VM_FAULT_FALLBACK; } -int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address, - pmd_t *pmd, unsigned int flags, struct iomap_ops *ops) +int dax_iomap_pmd_fault(struct vm_fault *vmf, const struct iomap_ops *ops) { + struct vm_area_struct *vma = vmf->vma; struct address_space *mapping = 
vma->vm_file->f_mapping; - unsigned long pmd_addr = address & PMD_MASK; - bool write = flags & FAULT_FLAG_WRITE; + unsigned long pmd_addr = vmf->address & PMD_MASK; + bool write = vmf->flags & FAULT_FLAG_WRITE; unsigned int iomap_flags = (write ? IOMAP_WRITE : 0) | IOMAP_FAULT; struct inode *inode = mapping->host; int result = VM_FAULT_FALLBACK; struct iomap iomap = { 0 }; pgoff_t max_pgoff, pgoff; - struct vm_fault vmf; void *entry; loff_t pos; int error; + /* + * Check whether offset isn't beyond end of file now. Caller is + * supposed to hold locks serializing us with truncate / punch hole so + * this is a reliable test. + */ + pgoff = linear_page_index(vma, pmd_addr); + max_pgoff = (i_size_read(inode) - 1) >> PAGE_SHIFT; + + trace_dax_pmd_fault(inode, vmf, max_pgoff, 0); + /* Fall back to PTEs if we're going to COW */ if (write && !(vma->vm_flags & VM_SHARED)) goto fallback; @@ -1261,32 +1373,16 @@ int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address, if ((pmd_addr + PMD_SIZE) > vma->vm_end) goto fallback; - /* - * Check whether offset isn't beyond end of file now. Caller is - * supposed to hold locks serializing us with truncate / punch hole so - * this is a reliable test. - */ - pgoff = linear_page_index(vma, pmd_addr); - max_pgoff = (i_size_read(inode) - 1) >> PAGE_SHIFT; - - if (pgoff > max_pgoff) - return VM_FAULT_SIGBUS; + if (pgoff > max_pgoff) { + result = VM_FAULT_SIGBUS; + goto out; + } /* If the PMD would extend beyond the file size */ if ((pgoff | PG_PMD_COLOUR) > max_pgoff) goto fallback; /* - * grab_mapping_entry() will make sure we get a 2M empty entry, a DAX - * PMD or a HZP entry. If it can't (because a 4k page is already in - * the tree, for instance), it will return -EEXIST and we just fall - * back to 4k entries. - */ - entry = grab_mapping_entry(mapping, pgoff, RADIX_DAX_PMD); - if (IS_ERR(entry)) - goto fallback; - - /* * Note that we don't use iomap_apply here. 
We aren't doing I/O, only * setting up a mapping, so really we're using iomap_begin() as a way * to look up our filesystem block. @@ -1294,52 +1390,61 @@ int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address, pos = (loff_t)pgoff << PAGE_SHIFT; error = ops->iomap_begin(inode, pos, PMD_SIZE, iomap_flags, &iomap); if (error) - goto unlock_entry; + goto fallback; + if (iomap.offset + iomap.length < pos + PMD_SIZE) goto finish_iomap; - vmf.pgoff = pgoff; - vmf.flags = flags; - vmf.gfp_mask = mapping_gfp_mask(mapping) | __GFP_IO; + /* + * grab_mapping_entry() will make sure we get a 2M empty entry, a DAX + * PMD or a HZP entry. If it can't (because a 4k page is already in + * the tree, for instance), it will return -EEXIST and we just fall + * back to 4k entries. + */ + entry = grab_mapping_entry(mapping, pgoff, RADIX_DAX_PMD); + if (IS_ERR(entry)) + goto finish_iomap; switch (iomap.type) { case IOMAP_MAPPED: - result = dax_pmd_insert_mapping(vma, pmd, &vmf, address, - &iomap, pos, write, &entry); + result = dax_pmd_insert_mapping(vmf, &iomap, pos, &entry); break; case IOMAP_UNWRITTEN: case IOMAP_HOLE: if (WARN_ON_ONCE(write)) - goto finish_iomap; - result = dax_pmd_load_hole(vma, pmd, &vmf, address, &iomap, - &entry); + goto unlock_entry; + result = dax_pmd_load_hole(vmf, &iomap, &entry); break; default: WARN_ON_ONCE(1); break; } + unlock_entry: + put_locked_mapping_entry(mapping, pgoff, entry); finish_iomap: if (ops->iomap_end) { - if (result == VM_FAULT_FALLBACK) { - ops->iomap_end(inode, pos, PMD_SIZE, 0, iomap_flags, - &iomap); - } else { - error = ops->iomap_end(inode, pos, PMD_SIZE, PMD_SIZE, - iomap_flags, &iomap); - if (error) - result = VM_FAULT_FALLBACK; - } + int copied = PMD_SIZE; + + if (result == VM_FAULT_FALLBACK) + copied = 0; + /* + * The fault is done by now and there's no way back (other + * thread may be already happily using PMD we have installed). + * Just ignore error from ->iomap_end since we cannot do much + * with it. 
+ */ + ops->iomap_end(inode, pos, PMD_SIZE, copied, iomap_flags, + &iomap); } - unlock_entry: - put_locked_mapping_entry(mapping, pgoff, entry); fallback: if (result == VM_FAULT_FALLBACK) { - split_huge_pmd(vma, pmd, address); + split_huge_pmd(vma, vmf->pmd, vmf->address); count_vm_event(THP_FAULT_FALLBACK); } +out: + trace_dax_pmd_fault_done(inode, vmf, max_pgoff, result); return result; } EXPORT_SYMBOL_GPL(dax_iomap_pmd_fault); #endif /* CONFIG_FS_DAX_PMD */ -#endif /* CONFIG_FS_IOMAP */ diff --git a/fs/dcache.c b/fs/dcache.c index 769903dbc19d..95d71eda8142 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -1336,8 +1336,11 @@ int d_set_mounted(struct dentry *dentry) } spin_lock(&dentry->d_lock); if (!d_unlinked(dentry)) { - dentry->d_flags |= DCACHE_MOUNTED; - ret = 0; + ret = -EBUSY; + if (!d_mountpoint(dentry)) { + dentry->d_flags |= DCACHE_MOUNTED; + ret = 0; + } } spin_unlock(&dentry->d_lock); out: diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index f17fcf89e18e..7fd4ec4bb214 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -187,9 +187,9 @@ static const struct super_operations debugfs_super_operations = { static struct vfsmount *debugfs_automount(struct path *path) { - struct vfsmount *(*f)(void *); - f = (struct vfsmount *(*)(void *))path->dentry->d_fsdata; - return f(d_inode(path->dentry)->i_private); + debugfs_automount_t f; + f = (debugfs_automount_t)path->dentry->d_fsdata; + return f(path->dentry, d_inode(path->dentry)->i_private); } static const struct dentry_operations debugfs_dops = { @@ -248,6 +248,42 @@ static struct file_system_type debug_fs_type = { }; MODULE_ALIAS_FS("debugfs"); +/** + * debugfs_lookup() - look up an existing debugfs file + * @name: a pointer to a string containing the name of the file to look up. + * @parent: a pointer to the parent dentry of the file. + * + * This function will return a pointer to a dentry if it succeeds. If the file + * doesn't exist or an error occurs, %NULL will be returned. 
The returned + * dentry must be passed to dput() when it is no longer needed. + * + * If debugfs is not enabled in the kernel, the value -%ENODEV will be + * returned. + */ +struct dentry *debugfs_lookup(const char *name, struct dentry *parent) +{ + struct dentry *dentry; + + if (IS_ERR(parent)) + return NULL; + + if (!parent) + parent = debugfs_mount->mnt_root; + + inode_lock(d_inode(parent)); + dentry = lookup_one_len(name, parent, strlen(name)); + inode_unlock(d_inode(parent)); + + if (IS_ERR(dentry)) + return NULL; + if (!d_really_is_positive(dentry)) { + dput(dentry); + return NULL; + } + return dentry; +} +EXPORT_SYMBOL_GPL(debugfs_lookup); + static struct dentry *start_creating(const char *name, struct dentry *parent) { struct dentry *dentry; @@ -504,7 +540,7 @@ EXPORT_SYMBOL_GPL(debugfs_create_dir); */ struct dentry *debugfs_create_automount(const char *name, struct dentry *parent, - struct vfsmount *(*f)(void *), + debugfs_automount_t f, void *data) { struct dentry *dentry = start_creating(name, parent); diff --git a/fs/direct-io.c b/fs/direct-io.c index aeae8c063451..c87bae4376b8 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -906,6 +906,7 @@ static int do_direct_IO(struct dio *dio, struct dio_submit *sdio, struct buffer_head *map_bh) { const unsigned blkbits = sdio->blkbits; + const unsigned i_blkbits = blkbits + sdio->blkfactor; int ret = 0; while (sdio->block_in_file < sdio->final_block_in_request) { @@ -949,7 +950,7 @@ static int do_direct_IO(struct dio *dio, struct dio_submit *sdio, clean_bdev_aliases( map_bh->b_bdev, map_bh->b_blocknr, - map_bh->b_size >> blkbits); + map_bh->b_size >> i_blkbits); } if (!sdio->blkfactor) diff --git a/fs/exec.c b/fs/exec.c index e57946610733..698a86094f76 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1426,12 +1426,8 @@ static void check_unsafe_exec(struct linux_binprm *bprm) struct task_struct *p = current, *t; unsigned n_fs; - if (p->ptrace) { - if (ptracer_capable(p, current_user_ns())) - bprm->unsafe |= 
LSM_UNSAFE_PTRACE_CAP; - else - bprm->unsafe |= LSM_UNSAFE_PTRACE; - } + if (p->ptrace) + bprm->unsafe |= LSM_UNSAFE_PTRACE; /* * This isn't strictly necessary, but it makes it harder for LSMs to @@ -1479,7 +1475,7 @@ static void bprm_fill_uid(struct linux_binprm *bprm) if (task_no_new_privs(current)) return; - inode = file_inode(bprm->file); + inode = bprm->file->f_path.dentry->d_inode; mode = READ_ONCE(inode->i_mode); if (!(mode & (S_ISUID|S_ISGID))) return; diff --git a/fs/exofs/sys.c b/fs/exofs/sys.c index 5e6a2c0a1f0b..1f7d5e46cdda 100644 --- a/fs/exofs/sys.c +++ b/fs/exofs/sys.c @@ -122,7 +122,7 @@ void exofs_sysfs_dbg_print(void) list_for_each_entry_safe(k_name, k_tmp, &exofs_kset->list, entry) { printk(KERN_INFO "%s: name %s ref %d\n", __func__, kobject_name(k_name), - (int)atomic_read(&k_name->kref.refcount)); + (int)kref_read(&k_name->kref)); } #endif } diff --git a/fs/ext2/Kconfig b/fs/ext2/Kconfig index 36bea5adcaba..c634874e12d9 100644 --- a/fs/ext2/Kconfig +++ b/fs/ext2/Kconfig @@ -1,6 +1,5 @@ config EXT2_FS tristate "Second extended fs support" - select FS_IOMAP if FS_DAX help Ext2 is a standard Linux file system for hard disks. 
diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h index 37e2be784ac7..5e64de9c5093 100644 --- a/fs/ext2/ext2.h +++ b/fs/ext2/ext2.h @@ -814,7 +814,7 @@ extern const struct file_operations ext2_file_operations; /* inode.c */ extern const struct address_space_operations ext2_aops; extern const struct address_space_operations ext2_nobh_aops; -extern struct iomap_ops ext2_iomap_ops; +extern const struct iomap_ops ext2_iomap_ops; /* namei.c */ extern const struct inode_operations ext2_dir_inode_operations; diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 0093ea2512a8..128cce540645 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -751,9 +751,8 @@ static int ext2_get_blocks(struct inode *inode, mutex_unlock(&ei->truncate_mutex); goto cleanup; } - } else { - *new = true; } + *new = true; ext2_splice_branch(inode, iblock, partial, indirect_blks, count); mutex_unlock(&ei->truncate_mutex); @@ -843,13 +842,13 @@ ext2_iomap_end(struct inode *inode, loff_t offset, loff_t length, return 0; } -struct iomap_ops ext2_iomap_ops = { +const struct iomap_ops ext2_iomap_ops = { .iomap_begin = ext2_iomap_begin, .iomap_end = ext2_iomap_end, }; #else /* Define empty ops for !CONFIG_FS_DAX case to avoid ugly ifdefs */ -struct iomap_ops ext2_iomap_ops; +const struct iomap_ops ext2_iomap_ops; #endif /* CONFIG_FS_DAX */ int ext2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig index 7b90691e98c4..e38039fd96ff 100644 --- a/fs/ext4/Kconfig +++ b/fs/ext4/Kconfig @@ -37,7 +37,6 @@ config EXT4_FS select CRC16 select CRYPTO select CRYPTO_CRC32C - select FS_IOMAP if FS_DAX help This is the next generation of the ext3 filesystem. 
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 2163c1e69f2a..cee23b684f47 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -32,7 +32,11 @@ #include <linux/percpu_counter.h> #include <linux/ratelimit.h> #include <crypto/hash.h> -#include <linux/fscrypto.h> +#ifdef CONFIG_EXT4_FS_ENCRYPTION +#include <linux/fscrypt_supp.h> +#else +#include <linux/fscrypt_notsupp.h> +#endif #include <linux/falloc.h> #include <linux/percpu-rwsem.h> #ifdef __KERNEL__ @@ -679,6 +683,16 @@ struct fsxattr { #define EXT4_IOC_FSGETXATTR FS_IOC_FSGETXATTR #define EXT4_IOC_FSSETXATTR FS_IOC_FSSETXATTR +#define EXT4_IOC_SHUTDOWN _IOR ('X', 125, __u32) + +/* + * Flags for going down operation + */ +#define EXT4_GOING_FLAGS_DEFAULT 0x0 /* going down */ +#define EXT4_GOING_FLAGS_LOGFLUSH 0x1 /* flush log but not data */ +#define EXT4_GOING_FLAGS_NOLOGFLUSH 0x2 /* don't flush log nor data */ + + #if defined(__KERNEL__) && defined(CONFIG_COMPAT) /* * ioctl commands in 32 bit emulation @@ -1343,11 +1357,6 @@ struct ext4_super_block { /* Number of quota types we support */ #define EXT4_MAXQUOTAS 3 -#ifdef CONFIG_EXT4_FS_ENCRYPTION -#define EXT4_KEY_DESC_PREFIX "ext4:" -#define EXT4_KEY_DESC_PREFIX_SIZE 5 -#endif - /* * fourth extended-fs super-block data in memory */ @@ -1404,8 +1413,7 @@ struct ext4_sb_info { struct journal_s *s_journal; struct list_head s_orphan; struct mutex s_orphan_lock; - unsigned long s_resize_flags; /* Flags indicating if there - is a resizer */ + unsigned long s_ext4_flags; /* Ext4 superblock flags */ unsigned long s_commit_interval; u32 s_max_batch_time; u32 s_min_batch_time; @@ -1517,12 +1525,6 @@ struct ext4_sb_info { /* Barrier between changing inodes' journal flags and writepages ops. 
*/ struct percpu_rw_semaphore s_journal_flag_rwsem; - - /* Encryption support */ -#ifdef CONFIG_EXT4_FS_ENCRYPTION - u8 key_prefix[EXT4_KEY_DESC_PREFIX_SIZE]; - u8 key_prefix_size; -#endif }; static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb) @@ -1845,6 +1847,18 @@ static inline bool ext4_has_incompat_features(struct super_block *sb) } /* + * Superblock flags + */ +#define EXT4_FLAGS_RESIZING 0 +#define EXT4_FLAGS_SHUTDOWN 1 + +static inline int ext4_forced_shutdown(struct ext4_sb_info *sbi) +{ + return test_bit(EXT4_FLAGS_SHUTDOWN, &sbi->s_ext4_flags); +} + + +/* * Default values for user and/or group using reserved blocks */ #define EXT4_DEF_RESUID 0 @@ -2320,28 +2334,6 @@ static inline int ext4_fname_setup_filename(struct inode *dir, } static inline void ext4_fname_free_filename(struct ext4_filename *fname) { } -#define fscrypt_set_d_op(i) -#define fscrypt_get_ctx fscrypt_notsupp_get_ctx -#define fscrypt_release_ctx fscrypt_notsupp_release_ctx -#define fscrypt_encrypt_page fscrypt_notsupp_encrypt_page -#define fscrypt_decrypt_page fscrypt_notsupp_decrypt_page -#define fscrypt_decrypt_bio_pages fscrypt_notsupp_decrypt_bio_pages -#define fscrypt_pullback_bio_page fscrypt_notsupp_pullback_bio_page -#define fscrypt_restore_control_page fscrypt_notsupp_restore_control_page -#define fscrypt_zeroout_range fscrypt_notsupp_zeroout_range -#define fscrypt_ioctl_set_policy fscrypt_notsupp_ioctl_set_policy -#define fscrypt_ioctl_get_policy fscrypt_notsupp_ioctl_get_policy -#define fscrypt_has_permitted_context fscrypt_notsupp_has_permitted_context -#define fscrypt_inherit_context fscrypt_notsupp_inherit_context -#define fscrypt_get_encryption_info fscrypt_notsupp_get_encryption_info -#define fscrypt_put_encryption_info fscrypt_notsupp_put_encryption_info -#define fscrypt_setup_filename fscrypt_notsupp_setup_filename -#define fscrypt_free_filename fscrypt_notsupp_free_filename -#define fscrypt_fname_encrypted_size fscrypt_notsupp_fname_encrypted_size -#define 
fscrypt_fname_alloc_buffer fscrypt_notsupp_fname_alloc_buffer -#define fscrypt_fname_free_buffer fscrypt_notsupp_fname_free_buffer -#define fscrypt_fname_disk_to_usr fscrypt_notsupp_fname_disk_to_usr -#define fscrypt_fname_usr_to_disk fscrypt_notsupp_fname_usr_to_disk #endif /* dir.c */ @@ -3034,7 +3026,7 @@ extern int ext4_inline_data_fiemap(struct inode *inode, extern int ext4_try_to_evict_inline_data(handle_t *handle, struct inode *inode, int needed); -extern void ext4_inline_data_truncate(struct inode *inode, int *has_inline); +extern int ext4_inline_data_truncate(struct inode *inode, int *has_inline); extern int ext4_convert_inline_data(struct inode *inode); @@ -3228,7 +3220,6 @@ static inline void ext4_inode_resume_unlocked_dio(struct inode *inode) EXT4_WQ_HASH_SZ]) extern wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ]; -#define EXT4_RESIZING 0 extern int ext4_resize_begin(struct super_block *sb); extern void ext4_resize_end(struct super_block *sb); @@ -3253,7 +3244,7 @@ static inline void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end) } } -extern struct iomap_ops ext4_iomap_ops; +extern const struct iomap_ops ext4_iomap_ops; #endif /* __KERNEL__ */ diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c index e770c1ee4613..dd106b1d5d89 100644 --- a/fs/ext4/ext4_jbd2.c +++ b/fs/ext4/ext4_jbd2.c @@ -43,6 +43,10 @@ static int ext4_journal_check_start(struct super_block *sb) journal_t *journal; might_sleep(); + + if (unlikely(ext4_forced_shutdown(EXT4_SB(sb)))) + return -EIO; + if (sb->s_flags & MS_RDONLY) return -EROFS; WARN_ON(sb->s_writers.frozen == SB_FREEZE_COMPLETE); @@ -161,6 +165,13 @@ int __ext4_journal_get_write_access(const char *where, unsigned int line, might_sleep(); if (ext4_handle_valid(handle)) { + struct super_block *sb; + + sb = handle->h_transaction->t_journal->j_private; + if (unlikely(ext4_forced_shutdown(EXT4_SB(sb)))) { + jbd2_journal_abort_handle(handle); + return -EIO; + } err = jbd2_journal_get_write_access(handle, bh); if (err) 
ext4_journal_abort_handle(where, line, __func__, bh, diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 3e295d3350a9..2a97dff87b96 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -5334,7 +5334,8 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle, ext4_lblk_t stop, *iterator, ex_start, ex_end; /* Let path point to the last extent */ - path = ext4_find_extent(inode, EXT_MAX_BLOCKS - 1, NULL, 0); + path = ext4_find_extent(inode, EXT_MAX_BLOCKS - 1, NULL, + EXT4_EX_NOCACHE); if (IS_ERR(path)) return PTR_ERR(path); @@ -5343,15 +5344,15 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle, if (!extent) goto out; - stop = le32_to_cpu(extent->ee_block) + - ext4_ext_get_actual_len(extent); + stop = le32_to_cpu(extent->ee_block); /* * In case of left shift, Don't start shifting extents until we make * sure the hole is big enough to accommodate the shift. */ if (SHIFT == SHIFT_LEFT) { - path = ext4_find_extent(inode, start - 1, &path, 0); + path = ext4_find_extent(inode, start - 1, &path, + EXT4_EX_NOCACHE); if (IS_ERR(path)) return PTR_ERR(path); depth = path->p_depth; @@ -5383,9 +5384,14 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle, else iterator = &stop; - /* Its safe to start updating extents */ - while (start < stop) { - path = ext4_find_extent(inode, *iterator, &path, 0); + /* + * Its safe to start updating extents. Start and stop are unsigned, so + * in case of right shift if extent with 0 block is reached, iterator + * becomes NULL to indicate the end of the loop. + */ + while (iterator && start <= stop) { + path = ext4_find_extent(inode, *iterator, &path, + EXT4_EX_NOCACHE); if (IS_ERR(path)) return PTR_ERR(path); depth = path->p_depth; @@ -5412,8 +5418,11 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle, ext4_ext_get_actual_len(extent); } else { extent = EXT_FIRST_EXTENT(path[depth].p_hdr); - *iterator = le32_to_cpu(extent->ee_block) > 0 ? 
- le32_to_cpu(extent->ee_block) - 1 : 0; + if (le32_to_cpu(extent->ee_block) > 0) + *iterator = le32_to_cpu(extent->ee_block) - 1; + else + /* Beginning is reached, end of the loop */ + iterator = NULL; /* Update path extent in case we need to stop */ while (le32_to_cpu(extent->ee_block) < start) extent++; diff --git a/fs/ext4/file.c b/fs/ext4/file.c index b5f184493c57..13021a054fc0 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -57,6 +57,9 @@ static ssize_t ext4_dax_read_iter(struct kiocb *iocb, struct iov_iter *to) static ssize_t ext4_file_read_iter(struct kiocb *iocb, struct iov_iter *to) { + if (unlikely(ext4_forced_shutdown(EXT4_SB(file_inode(iocb->ki_filp)->i_sb)))) + return -EIO; + if (!iov_iter_count(to)) return 0; /* skip atime */ @@ -175,7 +178,6 @@ ext4_dax_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct inode *inode = file_inode(iocb->ki_filp); ssize_t ret; - bool overwrite = false; inode_lock(inode); ret = ext4_write_checks(iocb, from); @@ -188,16 +190,9 @@ ext4_dax_write_iter(struct kiocb *iocb, struct iov_iter *from) if (ret) goto out; - if (ext4_overwrite_io(inode, iocb->ki_pos, iov_iter_count(from))) { - overwrite = true; - downgrade_write(&inode->i_rwsem); - } ret = dax_iomap_rw(iocb, from, &ext4_iomap_ops); out: - if (!overwrite) - inode_unlock(inode); - else - inode_unlock_shared(inode); + inode_unlock(inode); if (ret > 0) ret = generic_write_sync(iocb, ret); return ret; @@ -213,6 +208,9 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) int overwrite = 0; ssize_t ret; + if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) + return -EIO; + #ifdef CONFIG_FS_DAX if (IS_DAX(inode)) return ext4_dax_write_iter(iocb, from); @@ -258,7 +256,6 @@ out: static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf) { int result; - handle_t *handle = NULL; struct inode *inode = file_inode(vma->vm_file); struct super_block *sb = inode->i_sb; bool write = vmf->flags & FAULT_FLAG_WRITE; @@ -266,61 +263,33 
@@ static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf) if (write) { sb_start_pagefault(sb); file_update_time(vma->vm_file); - down_read(&EXT4_I(inode)->i_mmap_sem); - handle = ext4_journal_start_sb(sb, EXT4_HT_WRITE_PAGE, - EXT4_DATA_TRANS_BLOCKS(sb)); - } else - down_read(&EXT4_I(inode)->i_mmap_sem); - - if (IS_ERR(handle)) - result = VM_FAULT_SIGBUS; - else - result = dax_iomap_fault(vma, vmf, &ext4_iomap_ops); - - if (write) { - if (!IS_ERR(handle)) - ext4_journal_stop(handle); - up_read(&EXT4_I(inode)->i_mmap_sem); + } + down_read(&EXT4_I(inode)->i_mmap_sem); + result = dax_iomap_fault(vma, vmf, &ext4_iomap_ops); + up_read(&EXT4_I(inode)->i_mmap_sem); + if (write) sb_end_pagefault(sb); - } else - up_read(&EXT4_I(inode)->i_mmap_sem); return result; } -static int ext4_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr, - pmd_t *pmd, unsigned int flags) +static int +ext4_dax_pmd_fault(struct vm_fault *vmf) { int result; - handle_t *handle = NULL; - struct inode *inode = file_inode(vma->vm_file); + struct inode *inode = file_inode(vmf->vma->vm_file); struct super_block *sb = inode->i_sb; - bool write = flags & FAULT_FLAG_WRITE; + bool write = vmf->flags & FAULT_FLAG_WRITE; if (write) { sb_start_pagefault(sb); - file_update_time(vma->vm_file); - down_read(&EXT4_I(inode)->i_mmap_sem); - handle = ext4_journal_start_sb(sb, EXT4_HT_WRITE_PAGE, - ext4_chunk_trans_blocks(inode, - PMD_SIZE / PAGE_SIZE)); - } else - down_read(&EXT4_I(inode)->i_mmap_sem); - - if (IS_ERR(handle)) - result = VM_FAULT_SIGBUS; - else { - result = dax_iomap_pmd_fault(vma, addr, pmd, flags, - &ext4_iomap_ops); + file_update_time(vmf->vma->vm_file); } - - if (write) { - if (!IS_ERR(handle)) - ext4_journal_stop(handle); - up_read(&EXT4_I(inode)->i_mmap_sem); + down_read(&EXT4_I(inode)->i_mmap_sem); + result = dax_iomap_pmd_fault(vmf, &ext4_iomap_ops); + up_read(&EXT4_I(inode)->i_mmap_sem); + if (write) sb_end_pagefault(sb); - } else - 
up_read(&EXT4_I(inode)->i_mmap_sem); return result; } @@ -376,6 +345,9 @@ static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma) { struct inode *inode = file->f_mapping->host; + if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) + return -EIO; + if (ext4_encrypted_inode(inode)) { int err = fscrypt_get_encryption_info(inode); if (err) @@ -403,6 +375,9 @@ static int ext4_file_open(struct inode * inode, struct file * filp) char buf[64], *cp; int ret; + if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) + return -EIO; + if (unlikely(!(sbi->s_mount_flags & EXT4_MF_MNTDIR_SAMPLED) && !(sb->s_flags & MS_RDONLY))) { sbi->s_mount_flags |= EXT4_MF_MNTDIR_SAMPLED; diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c index 88effb1053c7..9d549608fd30 100644 --- a/fs/ext4/fsync.c +++ b/fs/ext4/fsync.c @@ -100,6 +100,9 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync) tid_t commit_tid; bool needs_barrier = false; + if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) + return -EIO; + J_ASSERT(ext4_journal_current_handle() == NULL); trace_ext4_sync_file_enter(file, datasync); diff --git a/fs/ext4/hash.c b/fs/ext4/hash.c index e026aa941fd5..38b8a96eb97c 100644 --- a/fs/ext4/hash.c +++ b/fs/ext4/hash.c @@ -10,7 +10,8 @@ */ #include <linux/fs.h> -#include <linux/cryptohash.h> +#include <linux/compiler.h> +#include <linux/bitops.h> #include "ext4.h" #define DELTA 0x9E3779B9 @@ -32,6 +33,74 @@ static void TEA_transform(__u32 buf[4], __u32 const in[]) buf[1] += b1; } +/* F, G and H are basic MD4 functions: selection, majority, parity */ +#define F(x, y, z) ((z) ^ ((x) & ((y) ^ (z)))) +#define G(x, y, z) (((x) & (y)) + (((x) ^ (y)) & (z))) +#define H(x, y, z) ((x) ^ (y) ^ (z)) + +/* + * The generic round function. The application is so specific that + * we don't bother protecting all the arguments with parens, as is generally + * good macro practice, in favor of extra legibility. 
+ * Rotation is separate from addition to prevent recomputation + */ +#define ROUND(f, a, b, c, d, x, s) \ + (a += f(b, c, d) + x, a = rol32(a, s)) +#define K1 0 +#define K2 013240474631UL +#define K3 015666365641UL + +/* + * Basic cut-down MD4 transform. Returns only 32 bits of result. + */ +static __u32 half_md4_transform(__u32 buf[4], __u32 const in[8]) +{ + __u32 a = buf[0], b = buf[1], c = buf[2], d = buf[3]; + + /* Round 1 */ + ROUND(F, a, b, c, d, in[0] + K1, 3); + ROUND(F, d, a, b, c, in[1] + K1, 7); + ROUND(F, c, d, a, b, in[2] + K1, 11); + ROUND(F, b, c, d, a, in[3] + K1, 19); + ROUND(F, a, b, c, d, in[4] + K1, 3); + ROUND(F, d, a, b, c, in[5] + K1, 7); + ROUND(F, c, d, a, b, in[6] + K1, 11); + ROUND(F, b, c, d, a, in[7] + K1, 19); + + /* Round 2 */ + ROUND(G, a, b, c, d, in[1] + K2, 3); + ROUND(G, d, a, b, c, in[3] + K2, 5); + ROUND(G, c, d, a, b, in[5] + K2, 9); + ROUND(G, b, c, d, a, in[7] + K2, 13); + ROUND(G, a, b, c, d, in[0] + K2, 3); + ROUND(G, d, a, b, c, in[2] + K2, 5); + ROUND(G, c, d, a, b, in[4] + K2, 9); + ROUND(G, b, c, d, a, in[6] + K2, 13); + + /* Round 3 */ + ROUND(H, a, b, c, d, in[3] + K3, 3); + ROUND(H, d, a, b, c, in[7] + K3, 9); + ROUND(H, c, d, a, b, in[2] + K3, 11); + ROUND(H, b, c, d, a, in[6] + K3, 15); + ROUND(H, a, b, c, d, in[1] + K3, 3); + ROUND(H, d, a, b, c, in[5] + K3, 9); + ROUND(H, c, d, a, b, in[0] + K3, 11); + ROUND(H, b, c, d, a, in[4] + K3, 15); + + buf[0] += a; + buf[1] += b; + buf[2] += c; + buf[3] += d; + + return buf[1]; /* "most hashed" word */ +} +#undef ROUND +#undef K1 +#undef K2 +#undef K3 +#undef F +#undef G +#undef H /* The old legacy hash */ static __u32 dx_hack_hash_unsigned(const char *name, int len) diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index e57e8d90ea54..b14bae2598bc 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -764,6 +764,9 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir, if (!dir || !dir->i_nlink) return ERR_PTR(-EPERM); + if 
(unlikely(ext4_forced_shutdown(EXT4_SB(dir->i_sb)))) + return ERR_PTR(-EIO); + if ((ext4_encrypted_inode(dir) || DUMMY_ENCRYPTION_ENABLED(EXT4_SB(dir->i_sb))) && (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))) { @@ -771,7 +774,7 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir, if (err) return ERR_PTR(err); if (!fscrypt_has_encryption_key(dir)) - return ERR_PTR(-EPERM); + return ERR_PTR(-ENOKEY); if (!handle) nblocks += EXT4_DATA_TRANS_BLOCKS(dir->i_sb); encrypt = 1; diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index 437df6a1a841..30a9f210d1e3 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -215,6 +215,9 @@ static void ext4_write_inline_data(struct inode *inode, struct ext4_iloc *iloc, struct ext4_inode *raw_inode; int cp_len = 0; + if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) + return; + BUG_ON(!EXT4_I(inode)->i_inline_off); BUG_ON(pos + len > EXT4_I(inode)->i_inline_size); @@ -381,7 +384,7 @@ out: static int ext4_prepare_inline_data(handle_t *handle, struct inode *inode, unsigned int len) { - int ret, size; + int ret, size, no_expand; struct ext4_inode_info *ei = EXT4_I(inode); if (!ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) @@ -391,15 +394,14 @@ static int ext4_prepare_inline_data(handle_t *handle, struct inode *inode, if (size < len) return -ENOSPC; - down_write(&EXT4_I(inode)->xattr_sem); + ext4_write_lock_xattr(inode, &no_expand); if (ei->i_inline_off) ret = ext4_update_inline_data(handle, inode, len); else ret = ext4_create_inline_data(handle, inode, len); - up_write(&EXT4_I(inode)->xattr_sem); - + ext4_write_unlock_xattr(inode, &no_expand); return ret; } @@ -533,7 +535,7 @@ static int ext4_convert_inline_data_to_extent(struct address_space *mapping, struct inode *inode, unsigned flags) { - int ret, needed_blocks; + int ret, needed_blocks, no_expand; handle_t *handle = NULL; int retries = 0, sem_held = 0; struct page *page = NULL; @@ -573,7 +575,7 @@ retry: goto out; } - 
down_write(&EXT4_I(inode)->xattr_sem); + ext4_write_lock_xattr(inode, &no_expand); sem_held = 1; /* If some one has already done this for us, just exit. */ if (!ext4_has_inline_data(inode)) { @@ -610,7 +612,7 @@ retry: put_page(page); page = NULL; ext4_orphan_add(handle, inode); - up_write(&EXT4_I(inode)->xattr_sem); + ext4_write_unlock_xattr(inode, &no_expand); sem_held = 0; ext4_journal_stop(handle); handle = NULL; @@ -636,7 +638,7 @@ out: put_page(page); } if (sem_held) - up_write(&EXT4_I(inode)->xattr_sem); + ext4_write_unlock_xattr(inode, &no_expand); if (handle) ext4_journal_stop(handle); brelse(iloc.bh); @@ -729,7 +731,7 @@ convert: int ext4_write_inline_data_end(struct inode *inode, loff_t pos, unsigned len, unsigned copied, struct page *page) { - int ret; + int ret, no_expand; void *kaddr; struct ext4_iloc iloc; @@ -747,7 +749,7 @@ int ext4_write_inline_data_end(struct inode *inode, loff_t pos, unsigned len, goto out; } - down_write(&EXT4_I(inode)->xattr_sem); + ext4_write_lock_xattr(inode, &no_expand); BUG_ON(!ext4_has_inline_data(inode)); kaddr = kmap_atomic(page); @@ -757,7 +759,7 @@ int ext4_write_inline_data_end(struct inode *inode, loff_t pos, unsigned len, /* clear page dirty so that writepages wouldn't work for us. 
*/ ClearPageDirty(page); - up_write(&EXT4_I(inode)->xattr_sem); + ext4_write_unlock_xattr(inode, &no_expand); brelse(iloc.bh); out: return copied; @@ -768,7 +770,7 @@ ext4_journalled_write_inline_data(struct inode *inode, unsigned len, struct page *page) { - int ret; + int ret, no_expand; void *kaddr; struct ext4_iloc iloc; @@ -778,11 +780,11 @@ ext4_journalled_write_inline_data(struct inode *inode, return NULL; } - down_write(&EXT4_I(inode)->xattr_sem); + ext4_write_lock_xattr(inode, &no_expand); kaddr = kmap_atomic(page); ext4_write_inline_data(inode, &iloc, kaddr, 0, len); kunmap_atomic(kaddr); - up_write(&EXT4_I(inode)->xattr_sem); + ext4_write_unlock_xattr(inode, &no_expand); return iloc.bh; } @@ -944,8 +946,15 @@ int ext4_da_write_inline_data_end(struct inode *inode, loff_t pos, struct page *page) { int i_size_changed = 0; + int ret; - copied = ext4_write_inline_data_end(inode, pos, len, copied, page); + ret = ext4_write_inline_data_end(inode, pos, len, copied, page); + if (ret < 0) { + unlock_page(page); + put_page(page); + return ret; + } + copied = ret; /* * No need to use i_size_read() here, the i_size @@ -1043,7 +1052,6 @@ static int ext4_add_dirent_to_inline(handle_t *handle, dir->i_mtime = dir->i_ctime = current_time(dir); ext4_update_dx_flag(dir); dir->i_version++; - ext4_mark_inode_dirty(handle, dir); return 1; } @@ -1259,7 +1267,7 @@ out: int ext4_try_add_inline_entry(handle_t *handle, struct ext4_filename *fname, struct inode *dir, struct inode *inode) { - int ret, inline_size; + int ret, inline_size, no_expand; void *inline_start; struct ext4_iloc iloc; @@ -1267,7 +1275,7 @@ int ext4_try_add_inline_entry(handle_t *handle, struct ext4_filename *fname, if (ret) return ret; - down_write(&EXT4_I(dir)->xattr_sem); + ext4_write_lock_xattr(dir, &no_expand); if (!ext4_has_inline_data(dir)) goto out; @@ -1312,8 +1320,8 @@ int ext4_try_add_inline_entry(handle_t *handle, struct ext4_filename *fname, ret = ext4_convert_inline_data_nolock(handle, dir, &iloc); 
out: + ext4_write_unlock_xattr(dir, &no_expand); ext4_mark_inode_dirty(handle, dir); - up_write(&EXT4_I(dir)->xattr_sem); brelse(iloc.bh); return ret; } @@ -1673,7 +1681,7 @@ int ext4_delete_inline_entry(handle_t *handle, struct buffer_head *bh, int *has_inline_data) { - int err, inline_size; + int err, inline_size, no_expand; struct ext4_iloc iloc; void *inline_start; @@ -1681,7 +1689,7 @@ int ext4_delete_inline_entry(handle_t *handle, if (err) return err; - down_write(&EXT4_I(dir)->xattr_sem); + ext4_write_lock_xattr(dir, &no_expand); if (!ext4_has_inline_data(dir)) { *has_inline_data = 0; goto out; @@ -1709,13 +1717,11 @@ int ext4_delete_inline_entry(handle_t *handle, if (err) goto out; - err = ext4_mark_inode_dirty(handle, dir); - if (unlikely(err)) - goto out; - ext4_show_inline_dir(dir, iloc.bh, inline_start, inline_size); out: - up_write(&EXT4_I(dir)->xattr_sem); + ext4_write_unlock_xattr(dir, &no_expand); + if (likely(err == 0)) + err = ext4_mark_inode_dirty(handle, dir); brelse(iloc.bh); if (err != -ENOENT) ext4_std_error(dir->i_sb, err); @@ -1814,11 +1820,11 @@ out: int ext4_destroy_inline_data(handle_t *handle, struct inode *inode) { - int ret; + int ret, no_expand; - down_write(&EXT4_I(inode)->xattr_sem); + ext4_write_lock_xattr(inode, &no_expand); ret = ext4_destroy_inline_data_nolock(handle, inode); - up_write(&EXT4_I(inode)->xattr_sem); + ext4_write_unlock_xattr(inode, &no_expand); return ret; } @@ -1900,10 +1906,10 @@ out: return error; } -void ext4_inline_data_truncate(struct inode *inode, int *has_inline) +int ext4_inline_data_truncate(struct inode *inode, int *has_inline) { handle_t *handle; - int inline_size, value_len, needed_blocks; + int inline_size, value_len, needed_blocks, no_expand, err = 0; size_t i_size; void *value = NULL; struct ext4_xattr_ibody_find is = { @@ -1918,19 +1924,19 @@ void ext4_inline_data_truncate(struct inode *inode, int *has_inline) needed_blocks = ext4_writepage_trans_blocks(inode); handle = ext4_journal_start(inode, 
EXT4_HT_INODE, needed_blocks); if (IS_ERR(handle)) - return; + return PTR_ERR(handle); - down_write(&EXT4_I(inode)->xattr_sem); + ext4_write_lock_xattr(inode, &no_expand); if (!ext4_has_inline_data(inode)) { *has_inline = 0; ext4_journal_stop(handle); - return; + return 0; } - if (ext4_orphan_add(handle, inode)) + if ((err = ext4_orphan_add(handle, inode)) != 0) goto out; - if (ext4_get_inode_loc(inode, &is.iloc)) + if ((err = ext4_get_inode_loc(inode, &is.iloc)) != 0) goto out; down_write(&EXT4_I(inode)->i_data_sem); @@ -1941,24 +1947,29 @@ void ext4_inline_data_truncate(struct inode *inode, int *has_inline) if (i_size < inline_size) { /* Clear the content in the xattr space. */ if (inline_size > EXT4_MIN_INLINE_DATA_SIZE) { - if (ext4_xattr_ibody_find(inode, &i, &is)) + if ((err = ext4_xattr_ibody_find(inode, &i, &is)) != 0) goto out_error; BUG_ON(is.s.not_found); value_len = le32_to_cpu(is.s.here->e_value_size); value = kmalloc(value_len, GFP_NOFS); - if (!value) + if (!value) { + err = -ENOMEM; goto out_error; + } - if (ext4_xattr_ibody_get(inode, i.name_index, i.name, - value, value_len)) + err = ext4_xattr_ibody_get(inode, i.name_index, + i.name, value, value_len); + if (err <= 0) goto out_error; i.value = value; i.value_len = i_size > EXT4_MIN_INLINE_DATA_SIZE ? 
i_size - EXT4_MIN_INLINE_DATA_SIZE : 0; - if (ext4_xattr_ibody_inline_set(handle, inode, &i, &is)) + err = ext4_xattr_ibody_inline_set(handle, inode, + &i, &is); + if (err) goto out_error; } @@ -1978,23 +1989,24 @@ out_error: up_write(&EXT4_I(inode)->i_data_sem); out: brelse(is.iloc.bh); - up_write(&EXT4_I(inode)->xattr_sem); + ext4_write_unlock_xattr(inode, &no_expand); kfree(value); if (inode->i_nlink) ext4_orphan_del(handle, inode); - inode->i_mtime = inode->i_ctime = current_time(inode); - ext4_mark_inode_dirty(handle, inode); - if (IS_SYNC(inode)) - ext4_handle_sync(handle); - + if (err == 0) { + inode->i_mtime = inode->i_ctime = current_time(inode); + err = ext4_mark_inode_dirty(handle, inode); + if (IS_SYNC(inode)) + ext4_handle_sync(handle); + } ext4_journal_stop(handle); - return; + return err; } int ext4_convert_inline_data(struct inode *inode) { - int error, needed_blocks; + int error, needed_blocks, no_expand; handle_t *handle; struct ext4_iloc iloc; @@ -2016,15 +2028,10 @@ int ext4_convert_inline_data(struct inode *inode) goto out_free; } - down_write(&EXT4_I(inode)->xattr_sem); - if (!ext4_has_inline_data(inode)) { - up_write(&EXT4_I(inode)->xattr_sem); - goto out; - } - - error = ext4_convert_inline_data_nolock(handle, inode, &iloc); - up_write(&EXT4_I(inode)->xattr_sem); -out: + ext4_write_lock_xattr(inode, &no_expand); + if (ext4_has_inline_data(inode)) + error = ext4_convert_inline_data_nolock(handle, inode, &iloc); + ext4_write_unlock_xattr(inode, &no_expand); ext4_journal_stop(handle); out_free: brelse(iloc.bh); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 88d57af1b516..75212a6e69f8 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1189,6 +1189,9 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping, pgoff_t index; unsigned from, to; + if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) + return -EIO; + trace_ext4_write_begin(inode, pos, len, flags); /* * Reserve one block more for addition to orphan 
list in case @@ -1330,8 +1333,11 @@ static int ext4_write_end(struct file *file, if (ext4_has_inline_data(inode)) { ret = ext4_write_inline_data_end(inode, pos, len, copied, page); - if (ret < 0) + if (ret < 0) { + unlock_page(page); + put_page(page); goto errout; + } copied = ret; } else copied = block_write_end(file, mapping, pos, @@ -1385,7 +1391,9 @@ errout: * set the buffer to be dirty, since in data=journalled mode we need * to call ext4_handle_dirty_metadata() instead. */ -static void zero_new_buffers(struct page *page, unsigned from, unsigned to) +static void ext4_journalled_zero_new_buffers(handle_t *handle, + struct page *page, + unsigned from, unsigned to) { unsigned int block_start = 0, block_end; struct buffer_head *head, *bh; @@ -1402,7 +1410,7 @@ static void zero_new_buffers(struct page *page, unsigned from, unsigned to) size = min(to, block_end) - start; zero_user(page, start, size); - set_buffer_uptodate(bh); + write_end_fn(handle, bh); } clear_buffer_new(bh); } @@ -1431,18 +1439,25 @@ static int ext4_journalled_write_end(struct file *file, BUG_ON(!ext4_handle_valid(handle)); - if (ext4_has_inline_data(inode)) - copied = ext4_write_inline_data_end(inode, pos, len, - copied, page); - else { - if (copied < len) { - if (!PageUptodate(page)) - copied = 0; - zero_new_buffers(page, from+copied, to); + if (ext4_has_inline_data(inode)) { + ret = ext4_write_inline_data_end(inode, pos, len, + copied, page); + if (ret < 0) { + unlock_page(page); + put_page(page); + goto errout; } - + copied = ret; + } else if (unlikely(copied < len) && !PageUptodate(page)) { + copied = 0; + ext4_journalled_zero_new_buffers(handle, page, from, to); + } else { + if (unlikely(copied < len)) + ext4_journalled_zero_new_buffers(handle, page, + from + copied, to); ret = ext4_walk_page_buffers(handle, page_buffers(page), from, - to, &partial, write_end_fn); + from + copied, &partial, + write_end_fn); if (!partial) SetPageUptodate(page); } @@ -1468,6 +1483,7 @@ static int 
ext4_journalled_write_end(struct file *file, */ ext4_orphan_add(handle, inode); +errout: ret2 = ext4_journal_stop(handle); if (!ret) ret = ret2; @@ -2034,6 +2050,12 @@ static int ext4_writepage(struct page *page, struct ext4_io_submit io_submit; bool keep_towrite = false; + if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) { + ext4_invalidatepage(page, 0, PAGE_SIZE); + unlock_page(page); + return -EIO; + } + trace_ext4_writepage(page); size = i_size_read(inode); if (page->index == size >> PAGE_SHIFT) @@ -2409,7 +2431,8 @@ static int mpage_map_and_submit_extent(handle_t *handle, if (err < 0) { struct super_block *sb = inode->i_sb; - if (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED) + if (ext4_forced_shutdown(EXT4_SB(sb)) || + EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED) goto invalidate_dirty_pages; /* * Let the uper layers retry transient errors. @@ -2464,8 +2487,8 @@ update_disksize: disksize = i_size; if (disksize > EXT4_I(inode)->i_disksize) EXT4_I(inode)->i_disksize = disksize; - err2 = ext4_mark_inode_dirty(handle, inode); up_write(&EXT4_I(inode)->i_data_sem); + err2 = ext4_mark_inode_dirty(handle, inode); if (err2) ext4_error(inode->i_sb, "Failed to mark inode %lu dirty", @@ -2631,6 +2654,9 @@ static int ext4_writepages(struct address_space *mapping, struct blk_plug plug; bool give_up_on_write = false; + if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) + return -EIO; + percpu_down_read(&sbi->s_journal_flag_rwsem); trace_ext4_writepages(inode, wbc); @@ -2667,7 +2693,8 @@ static int ext4_writepages(struct address_space *mapping, * *never* be called, so if that ever happens, we would want * the stack trace. 
*/ - if (unlikely(sbi->s_mount_flags & EXT4_MF_FS_ABORTED)) { + if (unlikely(ext4_forced_shutdown(EXT4_SB(mapping->host->i_sb)) || + sbi->s_mount_flags & EXT4_MF_FS_ABORTED)) { ret = -EROFS; goto out_writepages; } @@ -2892,6 +2919,9 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping, struct inode *inode = mapping->host; handle_t *handle; + if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) + return -EIO; + index = pos >> PAGE_SHIFT; if (ext4_nonda_switch(inode->i_sb) || @@ -3420,7 +3450,7 @@ orphan_del: return ret; } -struct iomap_ops ext4_iomap_ops = { +const struct iomap_ops ext4_iomap_ops = { .iomap_begin = ext4_iomap_begin, .iomap_end = ext4_iomap_end, }; @@ -3914,6 +3944,10 @@ static int ext4_block_truncate_page(handle_t *handle, unsigned blocksize; struct inode *inode = mapping->host; + /* If we are processing an encrypted inode during orphan list handling */ + if (ext4_encrypted_inode(inode) && !fscrypt_has_encryption_key(inode)) + return 0; + blocksize = inode->i_sb->s_blocksize; length = blocksize - (offset & (blocksize - 1)); @@ -4222,7 +4256,9 @@ int ext4_truncate(struct inode *inode) if (ext4_has_inline_data(inode)) { int has_inline = 1; - ext4_inline_data_truncate(inode, &has_inline); + err = ext4_inline_data_truncate(inode, &has_inline); + if (err) + return err; if (has_inline) return 0; } @@ -5197,6 +5233,9 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) int orphan = 0; const unsigned int ia_valid = attr->ia_valid; + if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) + return -EIO; + error = setattr_prepare(dentry, attr); if (error) return error; @@ -5483,6 +5522,9 @@ int ext4_mark_iloc_dirty(handle_t *handle, { int err = 0; + if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) + return -EIO; + if (IS_I_VERSION(inode)) inode_inc_iversion(inode); @@ -5506,6 +5548,9 @@ ext4_reserve_inode_write(handle_t *handle, struct inode *inode, { int err; + if 
(unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) + return -EIO; + err = ext4_get_inode_loc(inode, iloc); if (!err) { BUFFER_TRACE(iloc->bh, "get_write_access"); diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index d534399cf607..a4273ddb9922 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -16,6 +16,7 @@ #include <linux/quotaops.h> #include <linux/uuid.h> #include <linux/uaccess.h> +#include <linux/delay.h> #include "ext4_jbd2.h" #include "ext4.h" @@ -442,6 +443,52 @@ static inline unsigned long ext4_xflags_to_iflags(__u32 xflags) return iflags; } +int ext4_shutdown(struct super_block *sb, unsigned long arg) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + __u32 flags; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (get_user(flags, (__u32 __user *)arg)) + return -EFAULT; + + if (flags > EXT4_GOING_FLAGS_NOLOGFLUSH) + return -EINVAL; + + if (ext4_forced_shutdown(sbi)) + return 0; + + ext4_msg(sb, KERN_ALERT, "shut down requested (%d)", flags); + + switch (flags) { + case EXT4_GOING_FLAGS_DEFAULT: + freeze_bdev(sb->s_bdev); + set_bit(EXT4_FLAGS_SHUTDOWN, &sbi->s_ext4_flags); + thaw_bdev(sb->s_bdev, sb); + break; + case EXT4_GOING_FLAGS_LOGFLUSH: + set_bit(EXT4_FLAGS_SHUTDOWN, &sbi->s_ext4_flags); + if (sbi->s_journal && !is_journal_aborted(sbi->s_journal)) { + (void) ext4_force_commit(sb); + jbd2_journal_abort(sbi->s_journal, 0); + } + break; + case EXT4_GOING_FLAGS_NOLOGFLUSH: + set_bit(EXT4_FLAGS_SHUTDOWN, &sbi->s_ext4_flags); + if (sbi->s_journal && !is_journal_aborted(sbi->s_journal)) { + msleep(100); + jbd2_journal_abort(sbi->s_journal, 0); + } + break; + default: + return -EINVAL; + } + clear_opt(sb, DISCARD); + return 0; +} + long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { struct inode *inode = file_inode(filp); @@ -893,6 +940,8 @@ resizefs_out: return 0; } + case EXT4_IOC_SHUTDOWN: + return ext4_shutdown(sb, arg); default: return -ENOTTY; } @@ -959,6 +1008,7 @@ long ext4_compat_ioctl(struct file *file, unsigned int 
cmd, unsigned long arg) case EXT4_IOC_SET_ENCRYPTION_POLICY: case EXT4_IOC_GET_ENCRYPTION_PWSALT: case EXT4_IOC_GET_ENCRYPTION_POLICY: + case EXT4_IOC_SHUTDOWN: break; default: return -ENOIOCTLCMD; diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 7ae43c59bc79..10c62de642c6 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -1556,7 +1556,17 @@ static int mb_find_extent(struct ext4_buddy *e4b, int block, ex->fe_len += 1 << order; } - BUG_ON(ex->fe_start + ex->fe_len > (1 << (e4b->bd_blkbits + 3))); + if (ex->fe_start + ex->fe_len > (1 << (e4b->bd_blkbits + 3))) { + /* Should never happen! (but apparently sometimes does?!?) */ + WARN_ON(1); + ext4_error(e4b->bd_sb, "corruption or bug in mb_find_extent " + "block=%d, order=%d needed=%d ex=%u/%d/%d@%u", + block, order, needed, ex->fe_group, ex->fe_start, + ex->fe_len, ex->fe_logical); + ex->fe_len = 0; + ex->fe_start = 0; + ex->fe_group = 0; + } return ex->fe_len; } @@ -2136,8 +2146,10 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac) * We search using buddy data only if the order of the request * is greater than equal to the sbi_s_mb_order2_reqs * You can tune it via /sys/fs/ext4/<partition>/mb_order2_req + * We also support searching for power-of-two requests only for + * requests upto maximum buddy size we have constructed. */ - if (i >= sbi->s_mb_order2_reqs) { + if (i >= sbi->s_mb_order2_reqs && i <= sb->s_blocksize_bits + 2) { /* * This should tell if fe_len is exactly power of 2 */ @@ -2207,7 +2219,7 @@ repeat: } ac->ac_groups_scanned++; - if (cr == 0 && ac->ac_2order < sb->s_blocksize_bits+2) + if (cr == 0) ext4_mb_simple_scan_group(ac, &e4b); else if (cr == 1 && sbi->s_stripe && !(ac->ac_g_ex.fe_len % sbi->s_stripe)) @@ -3123,6 +3135,13 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac, if (ar->pright && start + size - 1 >= ar->lright) size -= start + size - ar->lright; + /* + * Trim allocation request for filesystems with artificially small + * groups. 
+ */ + if (size > EXT4_BLOCKS_PER_GROUP(ac->ac_sb)) + size = EXT4_BLOCKS_PER_GROUP(ac->ac_sb); + end = start + size; /* check we don't cross already preallocated blocks */ diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index eadba919f26b..6ad612c576fc 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1378,6 +1378,8 @@ static struct buffer_head * ext4_find_entry (struct inode *dir, return NULL; retval = ext4_fname_setup_filename(dir, d_name, 1, &fname); + if (retval == -ENOENT) + return NULL; if (retval) return ERR_PTR(retval); @@ -1616,13 +1618,15 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsi !fscrypt_has_permitted_context(dir, inode)) { int nokey = ext4_encrypted_inode(inode) && !fscrypt_has_encryption_key(inode); - iput(inode); - if (nokey) + if (nokey) { + iput(inode); return ERR_PTR(-ENOKEY); + } ext4_warning(inode->i_sb, "Inconsistent encryption contexts: %lu/%lu", (unsigned long) dir->i_ino, (unsigned long) inode->i_ino); + iput(inode); return ERR_PTR(-EPERM); } } @@ -2935,6 +2939,9 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry) struct ext4_dir_entry_2 *de; handle_t *handle = NULL; + if (unlikely(ext4_forced_shutdown(EXT4_SB(dir->i_sb)))) + return -EIO; + /* Initialize quotas before so that eventual writes go in * separate transaction */ retval = dquot_initialize(dir); @@ -3008,6 +3015,9 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry) struct ext4_dir_entry_2 *de; handle_t *handle = NULL; + if (unlikely(ext4_forced_shutdown(EXT4_SB(dir->i_sb)))) + return -EIO; + trace_ext4_unlink_enter(dir, dentry); /* Initialize quotas before so that eventual writes go * in separate transaction */ @@ -3078,6 +3088,9 @@ static int ext4_symlink(struct inode *dir, struct fscrypt_str disk_link; struct fscrypt_symlink_data *sd = NULL; + if (unlikely(ext4_forced_shutdown(EXT4_SB(dir->i_sb)))) + return -EIO; + disk_link.len = len + 1; disk_link.name = (char *) symname; @@ -3088,7 +3101,7 @@ static int 
ext4_symlink(struct inode *dir, if (err) return err; if (!fscrypt_has_encryption_key(dir)) - return -EPERM; + return -ENOKEY; disk_link.len = (fscrypt_fname_encrypted_size(dir, len) + sizeof(struct fscrypt_symlink_data)); sd = kzalloc(disk_link.len, GFP_KERNEL); @@ -3525,6 +3538,12 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, EXT4_I(old_dentry->d_inode)->i_projid))) return -EXDEV; + if ((ext4_encrypted_inode(old_dir) && + !fscrypt_has_encryption_key(old_dir)) || + (ext4_encrypted_inode(new_dir) && + !fscrypt_has_encryption_key(new_dir))) + return -ENOKEY; + retval = dquot_initialize(old.dir); if (retval) return retval; @@ -3725,6 +3744,12 @@ static int ext4_cross_rename(struct inode *old_dir, struct dentry *old_dentry, int retval; struct timespec ctime; + if ((ext4_encrypted_inode(old_dir) && + !fscrypt_has_encryption_key(old_dir)) || + (ext4_encrypted_inode(new_dir) && + !fscrypt_has_encryption_key(new_dir))) + return -ENOKEY; + if ((ext4_encrypted_inode(old_dir) || ext4_encrypted_inode(new_dir)) && (old_dir != new_dir) && @@ -3858,6 +3883,9 @@ static int ext4_rename2(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { + if (unlikely(ext4_forced_shutdown(EXT4_SB(old_dir->i_sb)))) + return -EIO; + if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT)) return -EINVAL; diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index d83b0f3c5fe9..208241b06662 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -24,7 +24,6 @@ #include <linux/slab.h> #include <linux/mm.h> #include <linux/backing-dev.h> -#include <linux/fscrypto.h> #include "ext4_jbd2.h" #include "xattr.h" @@ -158,7 +157,7 @@ static int ext4_end_io(ext4_io_end_t *io) io->handle = NULL; /* Following call will use up the handle */ ret = ext4_convert_unwritten_extents(handle, inode, offset, size); - if (ret < 0) { + if (ret < 0 && !ext4_forced_shutdown(EXT4_SB(inode->i_sb))) { 
ext4_msg(inode->i_sb, KERN_EMERG, "failed to convert unwritten extents to written " "extents -- potential data loss! " diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index cf681004b196..c3ed9021b781 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -45,7 +45,8 @@ int ext4_resize_begin(struct super_block *sb) return -EPERM; } - if (test_and_set_bit_lock(EXT4_RESIZING, &EXT4_SB(sb)->s_resize_flags)) + if (test_and_set_bit_lock(EXT4_FLAGS_RESIZING, + &EXT4_SB(sb)->s_ext4_flags)) ret = -EBUSY; return ret; @@ -53,7 +54,7 @@ int ext4_resize_begin(struct super_block *sb) void ext4_resize_end(struct super_block *sb) { - clear_bit_unlock(EXT4_RESIZING, &EXT4_SB(sb)->s_resize_flags); + clear_bit_unlock(EXT4_FLAGS_RESIZING, &EXT4_SB(sb)->s_ext4_flags); smp_mb__after_atomic(); } diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 66845a08a87a..2e03a0a88d92 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -438,6 +438,9 @@ void __ext4_error(struct super_block *sb, const char *function, struct va_format vaf; va_list args; + if (unlikely(ext4_forced_shutdown(EXT4_SB(sb)))) + return; + if (ext4_error_ratelimit(sb)) { va_start(args, fmt); vaf.fmt = fmt; @@ -459,6 +462,9 @@ void __ext4_error_inode(struct inode *inode, const char *function, struct va_format vaf; struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es; + if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) + return; + es->s_last_error_ino = cpu_to_le32(inode->i_ino); es->s_last_error_block = cpu_to_le64(block); if (ext4_error_ratelimit(inode->i_sb)) { @@ -491,6 +497,9 @@ void __ext4_error_file(struct file *file, const char *function, struct inode *inode = file_inode(file); char pathname[80], *path; + if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) + return; + es = EXT4_SB(inode->i_sb)->s_es; es->s_last_error_ino = cpu_to_le32(inode->i_ino); if (ext4_error_ratelimit(inode->i_sb)) { @@ -567,6 +576,9 @@ void __ext4_std_error(struct super_block *sb, const char *function, char nbuf[16]; const 
char *errstr; + if (unlikely(ext4_forced_shutdown(EXT4_SB(sb)))) + return; + /* Special case: if the error is EROFS, and we're not already * inside a transaction, then there's really no point in logging * an error. */ @@ -600,6 +612,9 @@ void __ext4_abort(struct super_block *sb, const char *function, struct va_format vaf; va_list args; + if (unlikely(ext4_forced_shutdown(EXT4_SB(sb)))) + return; + save_error_info(sb, function, line); va_start(args, fmt); vaf.fmt = fmt; @@ -695,6 +710,9 @@ __acquires(bitlock) va_list args; struct ext4_super_block *es = EXT4_SB(sb)->s_es; + if (unlikely(ext4_forced_shutdown(EXT4_SB(sb)))) + return; + es->s_last_error_ino = cpu_to_le32(ino); es->s_last_error_block = cpu_to_le64(block); __save_error_info(sb, function, line); @@ -825,6 +843,7 @@ static void ext4_put_super(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = sbi->s_es; + int aborted = 0; int i, err; ext4_unregister_li_request(sb); @@ -834,9 +853,10 @@ static void ext4_put_super(struct super_block *sb) destroy_workqueue(sbi->rsv_conversion_wq); if (sbi->s_journal) { + aborted = is_journal_aborted(sbi->s_journal); err = jbd2_journal_destroy(sbi->s_journal); sbi->s_journal = NULL; - if (err < 0) + if ((err < 0) && !aborted) ext4_abort(sb, "Couldn't clean up the journal"); } @@ -847,7 +867,7 @@ static void ext4_put_super(struct super_block *sb) ext4_mb_release(sb); ext4_ext_release(sb); - if (!(sb->s_flags & MS_RDONLY)) { + if (!(sb->s_flags & MS_RDONLY) && !aborted) { ext4_clear_feature_journal_needs_recovery(sb); es->s_state = cpu_to_le16(sbi->s_mount_state); } @@ -1100,12 +1120,6 @@ static int ext4_get_context(struct inode *inode, void *ctx, size_t len) EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, ctx, len); } -static int ext4_key_prefix(struct inode *inode, u8 **key) -{ - *key = EXT4_SB(inode->i_sb)->key_prefix; - return EXT4_SB(inode->i_sb)->key_prefix_size; -} - static int ext4_prepare_context(struct inode *inode) { return 
ext4_convert_inline_data(inode); @@ -1179,9 +1193,9 @@ static unsigned ext4_max_namelen(struct inode *inode) EXT4_NAME_LEN; } -static struct fscrypt_operations ext4_cryptops = { +static const struct fscrypt_operations ext4_cryptops = { + .key_prefix = "ext4:", .get_context = ext4_get_context, - .key_prefix = ext4_key_prefix, .prepare_context = ext4_prepare_context, .set_context = ext4_set_context, .dummy_context = ext4_dummy_context, @@ -1190,7 +1204,7 @@ static struct fscrypt_operations ext4_cryptops = { .max_namelen = ext4_max_namelen, }; #else -static struct fscrypt_operations ext4_cryptops = { +static const struct fscrypt_operations ext4_cryptops = { .is_encrypted = ext4_encrypted_inode, }; #endif @@ -1290,7 +1304,7 @@ enum { Opt_noquota, Opt_barrier, Opt_nobarrier, Opt_err, Opt_usrquota, Opt_grpquota, Opt_prjquota, Opt_i_version, Opt_dax, Opt_stripe, Opt_delalloc, Opt_nodelalloc, Opt_mblk_io_submit, - Opt_lazytime, Opt_nolazytime, + Opt_lazytime, Opt_nolazytime, Opt_debug_want_extra_isize, Opt_nomblk_io_submit, Opt_block_validity, Opt_noblock_validity, Opt_inode_readahead_blks, Opt_journal_ioprio, Opt_dioread_nolock, Opt_dioread_lock, @@ -1358,6 +1372,7 @@ static const match_table_t tokens = { {Opt_delalloc, "delalloc"}, {Opt_lazytime, "lazytime"}, {Opt_nolazytime, "nolazytime"}, + {Opt_debug_want_extra_isize, "debug_want_extra_isize=%u"}, {Opt_nodelalloc, "nodelalloc"}, {Opt_removed, "mblk_io_submit"}, {Opt_removed, "nomblk_io_submit"}, @@ -1563,6 +1578,7 @@ static const struct mount_opts { #endif {Opt_nouid32, EXT4_MOUNT_NO_UID32, MOPT_SET}, {Opt_debug, EXT4_MOUNT_DEBUG, MOPT_SET}, + {Opt_debug_want_extra_isize, 0, MOPT_GTE0}, {Opt_quota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA, MOPT_SET | MOPT_Q}, {Opt_usrquota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA, MOPT_SET | MOPT_Q}, @@ -1676,6 +1692,8 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token, if (arg == 0) arg = JBD2_DEFAULT_MAX_COMMIT_AGE; sbi->s_commit_interval = HZ * arg; + } else 
if (token == Opt_debug_want_extra_isize) { + sbi->s_want_extra_isize = arg; } else if (token == Opt_max_batch_time) { sbi->s_max_batch_time = arg; } else if (token == Opt_min_batch_time) { @@ -2619,9 +2637,9 @@ static unsigned long ext4_get_stripe_size(struct ext4_sb_info *sbi) if (sbi->s_stripe && sbi->s_stripe <= sbi->s_blocks_per_group) ret = sbi->s_stripe; - else if (stripe_width <= sbi->s_blocks_per_group) + else if (stripe_width && stripe_width <= sbi->s_blocks_per_group) ret = stripe_width; - else if (stride <= sbi->s_blocks_per_group) + else if (stride && stride <= sbi->s_blocks_per_group) ret = stride; else ret = 0; @@ -3842,7 +3860,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) / EXT4_DESC_PER_BLOCK(sb); if (ext4_has_feature_meta_bg(sb)) { - if (le32_to_cpu(es->s_first_meta_bg) >= db_count) { + if (le32_to_cpu(es->s_first_meta_bg) > db_count) { ext4_msg(sb, KERN_WARNING, "first meta block group too large: %u " "(group descriptor block count %u)", @@ -3925,7 +3943,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) * root first: it may be modified in the journal! 
*/ if (!test_opt(sb, NOLOAD) && ext4_has_feature_journal(sb)) { - if (ext4_load_journal(sb, es, journal_devnum)) + err = ext4_load_journal(sb, es, journal_devnum); + if (err) goto failed_mount3a; } else if (test_opt(sb, NOLOAD) && !(sb->s_flags & MS_RDONLY) && ext4_has_feature_journal_needs_recovery(sb)) { @@ -4087,7 +4106,8 @@ no_journal: sb->s_flags |= MS_RDONLY; /* determine the minimum size of new large inodes, if present */ - if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE) { + if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE && + sbi->s_want_extra_isize == 0) { sbi->s_want_extra_isize = sizeof(struct ext4_inode) - EXT4_GOOD_OLD_INODE_SIZE; if (ext4_has_feature_extra_isize(sb)) { @@ -4218,11 +4238,6 @@ no_journal: ratelimit_state_init(&sbi->s_msg_ratelimit_state, 5 * HZ, 10); kfree(orig_data); -#ifdef CONFIG_EXT4_FS_ENCRYPTION - memcpy(sbi->key_prefix, EXT4_KEY_DESC_PREFIX, - EXT4_KEY_DESC_PREFIX_SIZE); - sbi->key_prefix_size = EXT4_KEY_DESC_PREFIX_SIZE; -#endif return 0; cantfind_ext4: @@ -4720,6 +4735,9 @@ static int ext4_sync_fs(struct super_block *sb, int wait) bool needs_barrier = false; struct ext4_sb_info *sbi = EXT4_SB(sb); + if (unlikely(ext4_forced_shutdown(EXT4_SB(sb)))) + return 0; + trace_ext4_sync_fs(sb, wait); flush_workqueue(sbi->rsv_conversion_wq); /* @@ -4803,7 +4821,7 @@ out: */ static int ext4_unfreeze(struct super_block *sb) { - if (sb->s_flags & MS_RDONLY) + if ((sb->s_flags & MS_RDONLY) || ext4_forced_shutdown(EXT4_SB(sb))) return 0; if (EXT4_SB(sb)->s_journal) { diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 5a94fa52b74f..67636acf7624 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -411,6 +411,9 @@ ext4_xattr_get(struct inode *inode, int name_index, const char *name, { int error; + if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) + return -EIO; + if (strlen(name) > 255) return -ERANGE; @@ -1188,16 +1191,14 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, struct ext4_xattr_block_find 
bs = { .s = { .not_found = -ENODATA, }, }; - unsigned long no_expand; + int no_expand; int error; if (!name) return -EINVAL; if (strlen(name) > 255) return -ERANGE; - down_write(&EXT4_I(inode)->xattr_sem); - no_expand = ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND); - ext4_set_inode_state(inode, EXT4_STATE_NO_EXPAND); + ext4_write_lock_xattr(inode, &no_expand); error = ext4_reserve_inode_write(handle, inode, &is.iloc); if (error) @@ -1264,7 +1265,7 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, ext4_xattr_update_super_block(handle, inode->i_sb); inode->i_ctime = current_time(inode); if (!value) - ext4_clear_inode_state(inode, EXT4_STATE_NO_EXPAND); + no_expand = 0; error = ext4_mark_iloc_dirty(handle, inode, &is.iloc); /* * The bh is consumed by ext4_mark_iloc_dirty, even with @@ -1278,9 +1279,7 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, cleanup: brelse(is.iloc.bh); brelse(bs.bh); - if (no_expand == 0) - ext4_clear_inode_state(inode, EXT4_STATE_NO_EXPAND); - up_write(&EXT4_I(inode)->xattr_sem); + ext4_write_unlock_xattr(inode, &no_expand); return error; } @@ -1497,12 +1496,11 @@ int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize, int error = 0, tried_min_extra_isize = 0; int s_min_extra_isize = le16_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_min_extra_isize); int isize_diff; /* How much do we need to grow i_extra_isize */ + int no_expand; + + if (ext4_write_trylock_xattr(inode, &no_expand) == 0) + return 0; - down_write(&EXT4_I(inode)->xattr_sem); - /* - * Set EXT4_STATE_NO_EXPAND to avoid recursion when marking inode dirty - */ - ext4_set_inode_state(inode, EXT4_STATE_NO_EXPAND); retry: isize_diff = new_extra_isize - EXT4_I(inode)->i_extra_isize; if (EXT4_I(inode)->i_extra_isize >= new_extra_isize) @@ -1584,17 +1582,16 @@ shift: EXT4_I(inode)->i_extra_isize = new_extra_isize; brelse(bh); out: - ext4_clear_inode_state(inode, EXT4_STATE_NO_EXPAND); - 
up_write(&EXT4_I(inode)->xattr_sem); + ext4_write_unlock_xattr(inode, &no_expand); return 0; cleanup: brelse(bh); /* - * We deliberately leave EXT4_STATE_NO_EXPAND set here since inode - * size expansion failed. + * Inode size expansion failed; don't try again */ - up_write(&EXT4_I(inode)->xattr_sem); + no_expand = 1; + ext4_write_unlock_xattr(inode, &no_expand); return error; } diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h index a92e783fa057..099c8b670ef5 100644 --- a/fs/ext4/xattr.h +++ b/fs/ext4/xattr.h @@ -102,6 +102,38 @@ extern const struct xattr_handler ext4_xattr_security_handler; #define EXT4_XATTR_NAME_ENCRYPTION_CONTEXT "c" +/* + * The EXT4_STATE_NO_EXPAND is overloaded and used for two purposes. + * The first is to signal that there the inline xattrs and data are + * taking up so much space that we might as well not keep trying to + * expand it. The second is that xattr_sem is taken for writing, so + * we shouldn't try to recurse into the inode expansion. For this + * second case, we need to make sure that we take save and restore the + * NO_EXPAND state flag appropriately. 
+ */ +static inline void ext4_write_lock_xattr(struct inode *inode, int *save) +{ + down_write(&EXT4_I(inode)->xattr_sem); + *save = ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND); + ext4_set_inode_state(inode, EXT4_STATE_NO_EXPAND); +} + +static inline int ext4_write_trylock_xattr(struct inode *inode, int *save) +{ + if (down_write_trylock(&EXT4_I(inode)->xattr_sem) == 0) + return 0; + *save = ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND); + ext4_set_inode_state(inode, EXT4_STATE_NO_EXPAND); + return 1; +} + +static inline void ext4_write_unlock_xattr(struct inode *inode, int *save) +{ + if (*save == 0) + ext4_clear_inode_state(inode, EXT4_STATE_NO_EXPAND); + up_write(&EXT4_I(inode)->xattr_sem); +} + extern ssize_t ext4_listxattr(struct dentry *, char *, size_t); extern int ext4_xattr_get(struct inode *, int, const char *, void *, size_t); diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 827c5daef4fc..18607fc5240d 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -268,7 +268,10 @@ struct f2fs_dir_entry *f2fs_find_entry(struct inode *dir, err = fscrypt_setup_filename(dir, child, 1, &fname); if (err) { - *res_page = ERR_PTR(err); + if (err == -ENOENT) + *res_page = NULL; + else + *res_page = ERR_PTR(err); return NULL; } diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 2da8c3aa0ce5..069fc7277d8d 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -22,7 +22,11 @@ #include <linux/vmalloc.h> #include <linux/bio.h> #include <linux/blkdev.h> -#include <linux/fscrypto.h> +#ifdef CONFIG_F2FS_FS_ENCRYPTION +#include <linux/fscrypt_supp.h> +#else +#include <linux/fscrypt_notsupp.h> +#endif #include <crypto/hash.h> #ifdef CONFIG_F2FS_CHECK_FS @@ -760,10 +764,6 @@ enum { MAX_TIME, }; -#ifdef CONFIG_F2FS_FS_ENCRYPTION -#define F2FS_KEY_DESC_PREFIX "f2fs:" -#define F2FS_KEY_DESC_PREFIX_SIZE 5 -#endif struct f2fs_sb_info { struct super_block *sb; /* pointer to VFS super block */ struct proc_dir_entry *s_proc; /* proc entry */ @@ -771,11 +771,6 @@ struct f2fs_sb_info { 
int valid_super_block; /* valid super block no */ unsigned long s_flag; /* flags for sbi */ -#ifdef CONFIG_F2FS_FS_ENCRYPTION - u8 key_prefix[F2FS_KEY_DESC_PREFIX_SIZE]; - u8 key_prefix_size; -#endif - #ifdef CONFIG_BLK_DEV_ZONED unsigned int blocks_per_blkz; /* F2FS blocks per zone */ unsigned int log_blocks_per_blkz; /* log2 F2FS blocks per zone */ @@ -2510,28 +2505,4 @@ static inline bool f2fs_may_encrypt(struct inode *inode) #endif } -#ifndef CONFIG_F2FS_FS_ENCRYPTION -#define fscrypt_set_d_op(i) -#define fscrypt_get_ctx fscrypt_notsupp_get_ctx -#define fscrypt_release_ctx fscrypt_notsupp_release_ctx -#define fscrypt_encrypt_page fscrypt_notsupp_encrypt_page -#define fscrypt_decrypt_page fscrypt_notsupp_decrypt_page -#define fscrypt_decrypt_bio_pages fscrypt_notsupp_decrypt_bio_pages -#define fscrypt_pullback_bio_page fscrypt_notsupp_pullback_bio_page -#define fscrypt_restore_control_page fscrypt_notsupp_restore_control_page -#define fscrypt_zeroout_range fscrypt_notsupp_zeroout_range -#define fscrypt_ioctl_set_policy fscrypt_notsupp_ioctl_set_policy -#define fscrypt_ioctl_get_policy fscrypt_notsupp_ioctl_get_policy -#define fscrypt_has_permitted_context fscrypt_notsupp_has_permitted_context -#define fscrypt_inherit_context fscrypt_notsupp_inherit_context -#define fscrypt_get_encryption_info fscrypt_notsupp_get_encryption_info -#define fscrypt_put_encryption_info fscrypt_notsupp_put_encryption_info -#define fscrypt_setup_filename fscrypt_notsupp_setup_filename -#define fscrypt_free_filename fscrypt_notsupp_free_filename -#define fscrypt_fname_encrypted_size fscrypt_notsupp_fname_encrypted_size -#define fscrypt_fname_alloc_buffer fscrypt_notsupp_fname_alloc_buffer -#define fscrypt_fname_free_buffer fscrypt_notsupp_fname_free_buffer -#define fscrypt_fname_disk_to_usr fscrypt_notsupp_fname_disk_to_usr -#define fscrypt_fname_usr_to_disk fscrypt_notsupp_fname_usr_to_disk -#endif #endif diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 56c19b0610a8..11cabcadb1a3 
100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -403,7 +403,7 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry, return err; if (!fscrypt_has_encryption_key(dir)) - return -EPERM; + return -ENOKEY; disk_link.len = (fscrypt_fname_encrypted_size(dir, len) + sizeof(struct fscrypt_symlink_data)); @@ -447,7 +447,7 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry, goto err_out; if (!fscrypt_has_encryption_key(inode)) { - err = -EPERM; + err = -ENOKEY; goto err_out; } diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 0738f48293cc..0d8802453758 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -713,8 +713,8 @@ static int __f2fs_issue_discard_zone(struct f2fs_sb_info *sbi, } sector = SECTOR_FROM_BLOCK(blkstart); - if (sector & (bdev_zone_size(bdev) - 1) || - nr_sects != bdev_zone_size(bdev)) { + if (sector & (bdev_zone_sectors(bdev) - 1) || + nr_sects != bdev_zone_sectors(bdev)) { f2fs_msg(sbi->sb, KERN_INFO, "(%d) %s: Unaligned discard attempted (block %x + %x)", devi, sbi->s_ndevs ? 
FDEV(devi).path: "", diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 702638e21c76..a831303bb777 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1156,12 +1156,6 @@ static int f2fs_get_context(struct inode *inode, void *ctx, size_t len) ctx, len, NULL); } -static int f2fs_key_prefix(struct inode *inode, u8 **key) -{ - *key = F2FS_I_SB(inode)->key_prefix; - return F2FS_I_SB(inode)->key_prefix_size; -} - static int f2fs_set_context(struct inode *inode, const void *ctx, size_t len, void *fs_data) { @@ -1176,16 +1170,16 @@ static unsigned f2fs_max_namelen(struct inode *inode) inode->i_sb->s_blocksize : F2FS_NAME_LEN; } -static struct fscrypt_operations f2fs_cryptops = { +static const struct fscrypt_operations f2fs_cryptops = { + .key_prefix = "f2fs:", .get_context = f2fs_get_context, - .key_prefix = f2fs_key_prefix, .set_context = f2fs_set_context, .is_encrypted = f2fs_encrypted_inode, .empty_dir = f2fs_empty_dir, .max_namelen = f2fs_max_namelen, }; #else -static struct fscrypt_operations f2fs_cryptops = { +static const struct fscrypt_operations f2fs_cryptops = { .is_encrypted = f2fs_encrypted_inode, }; #endif @@ -1518,12 +1512,6 @@ static void init_sb_info(struct f2fs_sb_info *sbi) mutex_init(&sbi->wio_mutex[NODE]); mutex_init(&sbi->wio_mutex[DATA]); spin_lock_init(&sbi->cp_lock); - -#ifdef CONFIG_F2FS_FS_ENCRYPTION - memcpy(sbi->key_prefix, F2FS_KEY_DESC_PREFIX, - F2FS_KEY_DESC_PREFIX_SIZE); - sbi->key_prefix_size = F2FS_KEY_DESC_PREFIX_SIZE; -#endif } static int init_percpu_info(struct f2fs_sb_info *sbi) @@ -1553,16 +1541,16 @@ static int init_blkz_info(struct f2fs_sb_info *sbi, int devi) return 0; if (sbi->blocks_per_blkz && sbi->blocks_per_blkz != - SECTOR_TO_BLOCK(bdev_zone_size(bdev))) + SECTOR_TO_BLOCK(bdev_zone_sectors(bdev))) return -EINVAL; - sbi->blocks_per_blkz = SECTOR_TO_BLOCK(bdev_zone_size(bdev)); + sbi->blocks_per_blkz = SECTOR_TO_BLOCK(bdev_zone_sectors(bdev)); if (sbi->log_blocks_per_blkz && sbi->log_blocks_per_blkz != 
__ilog2_u32(sbi->blocks_per_blkz)) return -EINVAL; sbi->log_blocks_per_blkz = __ilog2_u32(sbi->blocks_per_blkz); FDEV(devi).nr_blkz = SECTOR_TO_BLOCK(nr_sectors) >> sbi->log_blocks_per_blkz; - if (nr_sectors & (bdev_zone_size(bdev) - 1)) + if (nr_sectors & (bdev_zone_sectors(bdev) - 1)) FDEV(devi).nr_blkz++; FDEV(devi).blkz_type = kmalloc(FDEV(devi).nr_blkz, GFP_KERNEL); diff --git a/fs/fscache/cookie.c b/fs/fscache/cookie.c index 4304072161aa..40d61077bead 100644 --- a/fs/fscache/cookie.c +++ b/fs/fscache/cookie.c @@ -542,6 +542,7 @@ void __fscache_disable_cookie(struct fscache_cookie *cookie, bool invalidate) hlist_for_each_entry(object, &cookie->backing_objects, cookie_link) { if (invalidate) set_bit(FSCACHE_OBJECT_RETIRED, &object->flags); + clear_bit(FSCACHE_OBJECT_PENDING_WRITE, &object->flags); fscache_raise_event(object, FSCACHE_OBJECT_EV_KILL); } } else { @@ -560,6 +561,10 @@ void __fscache_disable_cookie(struct fscache_cookie *cookie, bool invalidate) wait_on_atomic_t(&cookie->n_active, fscache_wait_atomic_t, TASK_UNINTERRUPTIBLE); + /* Make sure any pending writes are cancelled. 
*/ + if (cookie->def->type != FSCACHE_COOKIE_TYPE_INDEX) + fscache_invalidate_writes(cookie); + /* Reset the cookie state if it wasn't relinquished */ if (!test_bit(FSCACHE_COOKIE_RELINQUISHED, &cookie->flags)) { atomic_inc(&cookie->n_active); diff --git a/fs/fscache/netfs.c b/fs/fscache/netfs.c index 9b28649df3a1..a8aa00be4444 100644 --- a/fs/fscache/netfs.c +++ b/fs/fscache/netfs.c @@ -48,6 +48,7 @@ int __fscache_register_netfs(struct fscache_netfs *netfs) cookie->flags = 1 << FSCACHE_COOKIE_ENABLED; spin_lock_init(&cookie->lock); + spin_lock_init(&cookie->stores_lock); INIT_HLIST_HEAD(&cookie->backing_objects); /* check the netfs type is not already present */ diff --git a/fs/fscache/object.c b/fs/fscache/object.c index 9e792e30f4db..7a182c87f378 100644 --- a/fs/fscache/object.c +++ b/fs/fscache/object.c @@ -30,6 +30,7 @@ static const struct fscache_state *fscache_look_up_object(struct fscache_object static const struct fscache_state *fscache_object_available(struct fscache_object *, int); static const struct fscache_state *fscache_parent_ready(struct fscache_object *, int); static const struct fscache_state *fscache_update_object(struct fscache_object *, int); +static const struct fscache_state *fscache_object_dead(struct fscache_object *, int); #define __STATE_NAME(n) fscache_osm_##n #define STATE(n) (&__STATE_NAME(n)) @@ -91,7 +92,7 @@ static WORK_STATE(LOOKUP_FAILURE, "LCFL", fscache_lookup_failure); static WORK_STATE(KILL_OBJECT, "KILL", fscache_kill_object); static WORK_STATE(KILL_DEPENDENTS, "KDEP", fscache_kill_dependents); static WORK_STATE(DROP_OBJECT, "DROP", fscache_drop_object); -static WORK_STATE(OBJECT_DEAD, "DEAD", (void*)2UL); +static WORK_STATE(OBJECT_DEAD, "DEAD", fscache_object_dead); static WAIT_STATE(WAIT_FOR_INIT, "?INI", TRANSIT_TO(INIT_OBJECT, 1 << FSCACHE_OBJECT_EV_NEW_CHILD)); @@ -229,6 +230,10 @@ execute_work_state: event = -1; if (new_state == NO_TRANSIT) { _debug("{OBJ%x} %s notrans", object->debug_id, state->name); + if 
(unlikely(state == STATE(OBJECT_DEAD))) { + _leave(" [dead]"); + return; + } fscache_enqueue_object(object); event_mask = object->oob_event_mask; goto unmask_events; @@ -239,7 +244,7 @@ execute_work_state: object->state = state = new_state; if (state->work) { - if (unlikely(state->work == ((void *)2UL))) { + if (unlikely(state == STATE(OBJECT_DEAD))) { _leave(" [dead]"); return; } @@ -645,6 +650,12 @@ static const struct fscache_state *fscache_kill_object(struct fscache_object *ob fscache_mark_object_dead(object); object->oob_event_mask = 0; + if (test_bit(FSCACHE_OBJECT_RETIRED, &object->flags)) { + /* Reject any new read/write ops and abort any that are pending. */ + clear_bit(FSCACHE_OBJECT_PENDING_WRITE, &object->flags); + fscache_cancel_all_ops(object); + } + if (list_empty(&object->dependents) && object->n_ops == 0 && object->n_children == 0) @@ -1077,3 +1088,20 @@ void fscache_object_mark_killed(struct fscache_object *object, } } EXPORT_SYMBOL(fscache_object_mark_killed); + +/* + * The object is dead. We can get here if an object gets queued by an event + * that would lead to its death (such as EV_KILL) when the dispatcher is + * already running (and so can be requeued) but hasn't yet cleared the event + * mask. 
+ */ +static const struct fscache_state *fscache_object_dead(struct fscache_object *object, + int event) +{ + if (!test_and_set_bit(FSCACHE_OBJECT_RUN_AFTER_DEAD, + &object->flags)) + return NO_TRANSIT; + + WARN(true, "FS-Cache object redispatched after death"); + return NO_TRANSIT; +} diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 70ea57c7b6bb..f11792672977 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -399,6 +399,10 @@ static void request_end(struct fuse_conn *fc, struct fuse_req *req) static void queue_interrupt(struct fuse_iqueue *fiq, struct fuse_req *req) { spin_lock(&fiq->waitq.lock); + if (test_bit(FR_FINISHED, &req->flags)) { + spin_unlock(&fiq->waitq.lock); + return; + } if (list_empty(&req->intr_entry)) { list_add_tail(&req->intr_entry, &fiq->interrupts); wake_up_locked(&fiq->waitq); @@ -1372,6 +1376,7 @@ static ssize_t fuse_dev_splice_read(struct file *in, loff_t *ppos, * code can Oops if the buffer persists after module unload. */ bufs[page_nr].ops = &nosteal_pipe_buf_ops; + bufs[page_nr].flags = 0; ret = add_to_pipe(pipe, &bufs[page_nr++]); if (unlikely(ret < 0)) break; @@ -2025,7 +2030,6 @@ static void end_requests(struct fuse_conn *fc, struct list_head *head) struct fuse_req *req; req = list_entry(head->next, struct fuse_req, list); req->out.h.error = -ECONNABORTED; - clear_bit(FR_PENDING, &req->flags); clear_bit(FR_SENT, &req->flags); list_del_init(&req->list); request_end(fc, req); @@ -2103,6 +2107,8 @@ void fuse_abort_conn(struct fuse_conn *fc) spin_lock(&fiq->waitq.lock); fiq->connected = 0; list_splice_init(&fiq->pending, &to_end2); + list_for_each_entry(req, &to_end2, list) + clear_bit(FR_PENDING, &req->flags); while (forget_pending(fiq)) kfree(dequeue_forget(fiq, 1, NULL)); wake_up_all_locked(&fiq->waitq); diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 1f7c732f32b0..811fd8929a18 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -68,7 +68,7 @@ static u64 time_to_jiffies(u64 sec, u32 nsec) if (sec || nsec) { struct timespec64 ts = { 
sec, - max_t(u32, nsec, NSEC_PER_SEC - 1) + min_t(u32, nsec, NSEC_PER_SEC - 1) }; return get_jiffies_64() + timespec64_to_jiffies(&ts); diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 91307940c8ac..052f8d3c41cb 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -256,7 +256,7 @@ struct fuse_io_priv { #define FUSE_IO_PRIV_SYNC(f) \ { \ - .refcnt = { ATOMIC_INIT(1) }, \ + .refcnt = KREF_INIT(1), \ .async = 0, \ .file = f, \ } diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index 6b039d7ce160..ed7a2e252ad8 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -143,8 +143,8 @@ static int gfs2_writepage(struct page *page, struct writeback_control *wbc) /* This is the same as calling block_write_full_page, but it also * writes pages outside of i_size */ -int gfs2_write_full_page(struct page *page, get_block_t *get_block, - struct writeback_control *wbc) +static int gfs2_write_full_page(struct page *page, get_block_t *get_block, + struct writeback_control *wbc) { struct inode * const inode = page->mapping->host; loff_t i_size = i_size_read(inode); diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index fc5da4cbe88c..01b97c012c6e 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -720,6 +720,7 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh, { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct gfs2_rgrp_list rlist; + struct gfs2_trans *tr; u64 bn, bstart; u32 blen, btotal; __be64 *p; @@ -728,6 +729,7 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh, unsigned int revokes = 0; int x; int error; + int jblocks_rqsted; error = gfs2_rindex_update(sdp); if (error) @@ -791,12 +793,17 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh, if (gfs2_rs_active(&ip->i_res)) /* needs to be done with the rgrp glock held */ gfs2_rs_deltree(&ip->i_res); - error = gfs2_trans_begin(sdp, rg_blocks + RES_DINODE + - RES_INDIRECT + RES_STATFS + RES_QUOTA, - revokes); +restart: + jblocks_rqsted = rg_blocks + RES_DINODE + + 
RES_INDIRECT + RES_STATFS + RES_QUOTA + + gfs2_struct2blk(sdp, revokes, sizeof(u64)); + if (jblocks_rqsted > atomic_read(&sdp->sd_log_thresh2)) + jblocks_rqsted = atomic_read(&sdp->sd_log_thresh2); + error = gfs2_trans_begin(sdp, jblocks_rqsted, revokes); if (error) goto out_rg_gunlock; + tr = current->journal_info; down_write(&ip->i_rw_mutex); gfs2_trans_add_meta(ip->i_gl, dibh); @@ -810,6 +817,16 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh, if (!*p) continue; + /* check for max reasonable journal transaction blocks */ + if (tr->tr_num_buf_new + RES_STATFS + + RES_QUOTA >= atomic_read(&sdp->sd_log_thresh2)) { + if (rg_blocks >= tr->tr_num_buf_new) + rg_blocks -= tr->tr_num_buf_new; + else + rg_blocks = 0; + break; + } + bn = be64_to_cpu(*p); if (bstart + blen == bn) @@ -827,6 +844,9 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh, *p = 0; gfs2_add_inode_blocks(&ip->i_inode, -1); } + if (p == bottom) + rg_blocks = 0; + if (bstart) { __gfs2_free_blocks(ip, bstart, blen, metadata); btotal += blen; @@ -844,6 +864,9 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh, gfs2_trans_end(sdp); + if (rg_blocks) + goto restart; + out_rg_gunlock: gfs2_glock_dq_m(rlist.rl_rgrps, rlist.rl_ghs); out_rlist: diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 94f50cac91c6..ec0848fcca02 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -658,9 +658,11 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number, struct kmem_cache *cachep; int ret, tries = 0; + rcu_read_lock(); gl = rhashtable_lookup_fast(&gl_hash_table, &name, ht_parms); if (gl && !lockref_get_not_dead(&gl->gl_lockref)) gl = NULL; + rcu_read_unlock(); *glp = gl; if (gl) @@ -728,15 +730,18 @@ again: if (ret == -EEXIST) { ret = 0; + rcu_read_lock(); tmp = rhashtable_lookup_fast(&gl_hash_table, &name, ht_parms); if (tmp == NULL || !lockref_get_not_dead(&tmp->gl_lockref)) { if (++tries < 100) { + rcu_read_unlock(); cond_resched(); goto again; } tmp = 
NULL; ret = -ENOMEM; } + rcu_read_unlock(); } else { WARN_ON_ONCE(ret); } @@ -1420,26 +1425,32 @@ static struct shrinker glock_shrinker = { * @sdp: the filesystem * @bucket: the bucket * + * Note that the function can be called multiple times on the same + * object. So the user must ensure that the function can cope with + * that. */ static void glock_hash_walk(glock_examiner examiner, const struct gfs2_sbd *sdp) { struct gfs2_glock *gl; - struct rhash_head *pos; - const struct bucket_table *tbl; - int i; + struct rhashtable_iter iter; - rcu_read_lock(); - tbl = rht_dereference_rcu(gl_hash_table.tbl, &gl_hash_table); - for (i = 0; i < tbl->size; i++) { - rht_for_each_entry_rcu(gl, pos, tbl, i, gl_node) { + rhashtable_walk_enter(&gl_hash_table, &iter); + + do { + gl = ERR_PTR(rhashtable_walk_start(&iter)); + if (gl) + continue; + + while ((gl = rhashtable_walk_next(&iter)) && !IS_ERR(gl)) if ((gl->gl_name.ln_sbd == sdp) && lockref_get_not_dead(&gl->gl_lockref)) examiner(gl); - } - } - rcu_read_unlock(); - cond_resched(); + + rhashtable_walk_stop(&iter); + } while (cond_resched(), gl == ERR_PTR(-EAGAIN)); + + rhashtable_walk_exit(&iter); } /** @@ -1802,16 +1813,18 @@ void gfs2_glock_exit(void) static void gfs2_glock_iter_next(struct gfs2_glock_iter *gi) { - do { - gi->gl = rhashtable_walk_next(&gi->hti); + while ((gi->gl = rhashtable_walk_next(&gi->hti))) { if (IS_ERR(gi->gl)) { if (PTR_ERR(gi->gl) == -EAGAIN) continue; gi->gl = NULL; + return; } - /* Skip entries for other sb and dead entries */ - } while ((gi->gl) && ((gi->sdp != gi->gl->gl_name.ln_sbd) || - __lockref_is_dead(&gi->gl->gl_lockref))); + /* Skip entries for other sb and dead entries */ + if (gi->sdp == gi->gl->gl_name.ln_sbd && + !__lockref_is_dead(&gi->gl->gl_lockref)) + return; + } } static void *gfs2_glock_seq_start(struct seq_file *seq, loff_t *pos) diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index a6a3389a07fc..c45084ac642d 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -470,15 
+470,19 @@ struct gfs2_quota_data { struct rcu_head qd_rcu; }; +enum { + TR_TOUCHED = 1, + TR_ATTACHED = 2, + TR_ALLOCED = 3, +}; + struct gfs2_trans { unsigned long tr_ip; unsigned int tr_blocks; unsigned int tr_revokes; unsigned int tr_reserved; - unsigned int tr_touched:1; - unsigned int tr_attached:1; - unsigned int tr_alloced:1; + unsigned long tr_flags; unsigned int tr_num_buf_new; unsigned int tr_num_databuf_new; @@ -794,6 +798,7 @@ struct gfs2_sbd { atomic_t sd_log_thresh1; atomic_t sd_log_thresh2; atomic_t sd_log_blks_free; + atomic_t sd_log_blks_needed; wait_queue_head_t sd_log_waitq; wait_queue_head_t sd_logd_waitq; diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index 27c00a16def0..f865b96374df 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -349,6 +349,7 @@ int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks) if (gfs2_assert_warn(sdp, blks) || gfs2_assert_warn(sdp, blks <= sdp->sd_jdesc->jd_blocks)) return -EINVAL; + atomic_add(blks, &sdp->sd_log_blks_needed); retry: free_blocks = atomic_read(&sdp->sd_log_blks_free); if (unlikely(free_blocks <= wanted)) { @@ -370,6 +371,7 @@ retry: wake_up(&sdp->sd_reserving_log_wait); goto retry; } + atomic_sub(blks, &sdp->sd_log_blks_needed); trace_gfs2_log_blocks(sdp, -blks); /* @@ -797,7 +799,7 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl, static void gfs2_merge_trans(struct gfs2_trans *old, struct gfs2_trans *new) { - WARN_ON_ONCE(old->tr_attached != 1); + WARN_ON_ONCE(!test_bit(TR_ATTACHED, &old->tr_flags)); old->tr_num_buf_new += new->tr_num_buf_new; old->tr_num_databuf_new += new->tr_num_databuf_new; @@ -821,9 +823,9 @@ static void log_refund(struct gfs2_sbd *sdp, struct gfs2_trans *tr) if (sdp->sd_log_tr) { gfs2_merge_trans(sdp->sd_log_tr, tr); } else if (tr->tr_num_buf_new || tr->tr_num_databuf_new) { - gfs2_assert_withdraw(sdp, tr->tr_alloced); + gfs2_assert_withdraw(sdp, test_bit(TR_ALLOCED, &tr->tr_flags)); sdp->sd_log_tr = tr; - tr->tr_attached = 1; + set_bit(TR_ATTACHED, 
&tr->tr_flags); } sdp->sd_log_commited_revoke += tr->tr_num_revoke - tr->tr_num_revoke_rm; @@ -891,13 +893,16 @@ void gfs2_log_shutdown(struct gfs2_sbd *sdp) static inline int gfs2_jrnl_flush_reqd(struct gfs2_sbd *sdp) { - return (atomic_read(&sdp->sd_log_pinned) >= atomic_read(&sdp->sd_log_thresh1)); + return (atomic_read(&sdp->sd_log_pinned) + + atomic_read(&sdp->sd_log_blks_needed) >= + atomic_read(&sdp->sd_log_thresh1)); } static inline int gfs2_ail_flush_reqd(struct gfs2_sbd *sdp) { unsigned int used_blocks = sdp->sd_jdesc->jd_blocks - atomic_read(&sdp->sd_log_blks_free); - return used_blocks >= atomic_read(&sdp->sd_log_thresh2); + return used_blocks + atomic_read(&sdp->sd_log_blks_needed) >= + atomic_read(&sdp->sd_log_thresh2); } /** @@ -913,12 +918,15 @@ int gfs2_logd(void *data) struct gfs2_sbd *sdp = data; unsigned long t = 1; DEFINE_WAIT(wait); + bool did_flush; while (!kthread_should_stop()) { + did_flush = false; if (gfs2_jrnl_flush_reqd(sdp) || t == 0) { gfs2_ail1_empty(sdp); gfs2_log_flush(sdp, NULL, NORMAL_FLUSH); + did_flush = true; } if (gfs2_ail_flush_reqd(sdp)) { @@ -926,9 +934,10 @@ int gfs2_logd(void *data) gfs2_ail1_wait(sdp); gfs2_ail1_empty(sdp); gfs2_log_flush(sdp, NULL, NORMAL_FLUSH); + did_flush = true; } - if (!gfs2_ail_flush_reqd(sdp)) + if (!gfs2_ail_flush_reqd(sdp) || did_flush) wake_up(&sdp->sd_log_waitq); t = gfs2_tune_get(sdp, gt_logd_secs) * HZ; diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c index 49db8ef13fdf..663ffc135ef3 100644 --- a/fs/gfs2/meta_io.c +++ b/fs/gfs2/meta_io.c @@ -292,7 +292,7 @@ int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags, wait_on_buffer(bh); if (unlikely(!buffer_uptodate(bh))) { struct gfs2_trans *tr = current->journal_info; - if (tr && tr->tr_touched) + if (tr && test_bit(TR_TOUCHED, &tr->tr_flags)) gfs2_io_error_bh(sdp, bh); brelse(bh); *bhp = NULL; @@ -319,7 +319,7 @@ int gfs2_meta_wait(struct gfs2_sbd *sdp, struct buffer_head *bh) if (!buffer_uptodate(bh)) { struct gfs2_trans *tr = 
current->journal_info; - if (tr && tr->tr_touched) + if (tr && test_bit(TR_TOUCHED, &tr->tr_flags)) gfs2_io_error_bh(sdp, bh); return -EIO; } @@ -345,7 +345,7 @@ void gfs2_remove_from_journal(struct buffer_head *bh, int meta) tr->tr_num_buf_rm++; else tr->tr_num_databuf_rm++; - tr->tr_touched = 1; + set_bit(TR_TOUCHED, &tr->tr_flags); was_pinned = 1; brelse(bh); } diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index a34308df927f..b108e7ba81af 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -683,6 +683,7 @@ static int init_journal(struct gfs2_sbd *sdp, int undo) goto fail_jindex; } + atomic_set(&sdp->sd_log_blks_needed, 0); if (sdp->sd_args.ar_spectator) { sdp->sd_jdesc = gfs2_jdesc_find(sdp, 0); atomic_set(&sdp->sd_log_blks_free, sdp->sd_jdesc->jd_blocks); @@ -1226,7 +1227,7 @@ static int set_gfs2_super(struct super_block *s, void *data) * We set the bdi here to the queue backing, file systems can * overwrite this in ->fill_super() */ - s->s_bdi = &bdev_get_queue(s->s_bdev)->backing_dev_info; + s->s_bdi = bdev_get_queue(s->s_bdev)->backing_dev_info; return 0; } diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c index 0c1bde395062..affef3c066e0 100644 --- a/fs/gfs2/trans.c +++ b/fs/gfs2/trans.c @@ -48,7 +48,7 @@ int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks, tr->tr_blocks = blocks; tr->tr_revokes = revokes; tr->tr_reserved = 1; - tr->tr_alloced = 1; + set_bit(TR_ALLOCED, &tr->tr_flags); if (blocks) tr->tr_reserved += 6 + blocks; if (revokes) @@ -78,7 +78,8 @@ static void gfs2_print_trans(const struct gfs2_trans *tr) { pr_warn("Transaction created at: %pSR\n", (void *)tr->tr_ip); pr_warn("blocks=%u revokes=%u reserved=%u touched=%u\n", - tr->tr_blocks, tr->tr_revokes, tr->tr_reserved, tr->tr_touched); + tr->tr_blocks, tr->tr_revokes, tr->tr_reserved, + test_bit(TR_TOUCHED, &tr->tr_flags)); pr_warn("Buf %u/%u Databuf %u/%u Revoke %u/%u\n", tr->tr_num_buf_new, tr->tr_num_buf_rm, tr->tr_num_databuf_new, tr->tr_num_databuf_rm, @@ 
-89,12 +90,12 @@ void gfs2_trans_end(struct gfs2_sbd *sdp) { struct gfs2_trans *tr = current->journal_info; s64 nbuf; - int alloced = tr->tr_alloced; + int alloced = test_bit(TR_ALLOCED, &tr->tr_flags); BUG_ON(!tr); current->journal_info = NULL; - if (!tr->tr_touched) { + if (!test_bit(TR_TOUCHED, &tr->tr_flags)) { gfs2_log_release(sdp, tr->tr_reserved); if (alloced) { kfree(tr); @@ -112,8 +113,8 @@ void gfs2_trans_end(struct gfs2_sbd *sdp) gfs2_print_trans(tr); gfs2_log_commit(sdp, tr); - if (alloced && !tr->tr_attached) - kfree(tr); + if (alloced && !test_bit(TR_ATTACHED, &tr->tr_flags)) + kfree(tr); up_read(&sdp->sd_log_flush_lock); if (sdp->sd_vfs->s_flags & MS_SYNCHRONOUS) @@ -169,6 +170,10 @@ void gfs2_trans_add_data(struct gfs2_glock *gl, struct buffer_head *bh) } lock_buffer(bh); + if (buffer_pinned(bh)) { + set_bit(TR_TOUCHED, &tr->tr_flags); + goto out; + } gfs2_log_lock(sdp); bd = bh->b_private; if (bd == NULL) { @@ -182,7 +187,7 @@ void gfs2_trans_add_data(struct gfs2_glock *gl, struct buffer_head *bh) gfs2_log_lock(sdp); } gfs2_assert(sdp, bd->bd_gl == gl); - tr->tr_touched = 1; + set_bit(TR_TOUCHED, &tr->tr_flags); if (list_empty(&bd->bd_list)) { set_bit(GLF_LFLUSH, &bd->bd_gl->gl_flags); set_bit(GLF_DIRTY, &bd->bd_gl->gl_flags); @@ -191,45 +196,24 @@ void gfs2_trans_add_data(struct gfs2_glock *gl, struct buffer_head *bh) list_add_tail(&bd->bd_list, &tr->tr_databuf); } gfs2_log_unlock(sdp); +out: unlock_buffer(bh); } -static void meta_lo_add(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd) -{ - struct gfs2_meta_header *mh; - struct gfs2_trans *tr; - enum gfs2_freeze_state state = atomic_read(&sdp->sd_freeze_state); - - tr = current->journal_info; - tr->tr_touched = 1; - if (!list_empty(&bd->bd_list)) - return; - set_bit(GLF_LFLUSH, &bd->bd_gl->gl_flags); - set_bit(GLF_DIRTY, &bd->bd_gl->gl_flags); - mh = (struct gfs2_meta_header *)bd->bd_bh->b_data; - if (unlikely(mh->mh_magic != cpu_to_be32(GFS2_MAGIC))) { - pr_err("Attempting to add uninitialised 
block to journal (inplace block=%lld)\n", - (unsigned long long)bd->bd_bh->b_blocknr); - BUG(); - } - if (unlikely(state == SFS_FROZEN)) { - printk(KERN_INFO "GFS2:adding buf while frozen\n"); - gfs2_assert_withdraw(sdp, 0); - } - gfs2_pin(sdp, bd->bd_bh); - mh->__pad0 = cpu_to_be64(0); - mh->mh_jid = cpu_to_be32(sdp->sd_jdesc->jd_jid); - list_add(&bd->bd_list, &tr->tr_buf); - tr->tr_num_buf_new++; -} - void gfs2_trans_add_meta(struct gfs2_glock *gl, struct buffer_head *bh) { struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; struct gfs2_bufdata *bd; + struct gfs2_meta_header *mh; + struct gfs2_trans *tr = current->journal_info; + enum gfs2_freeze_state state = atomic_read(&sdp->sd_freeze_state); lock_buffer(bh); + if (buffer_pinned(bh)) { + set_bit(TR_TOUCHED, &tr->tr_flags); + goto out; + } gfs2_log_lock(sdp); bd = bh->b_private; if (bd == NULL) { @@ -245,8 +229,29 @@ void gfs2_trans_add_meta(struct gfs2_glock *gl, struct buffer_head *bh) gfs2_log_lock(sdp); } gfs2_assert(sdp, bd->bd_gl == gl); - meta_lo_add(sdp, bd); + set_bit(TR_TOUCHED, &tr->tr_flags); + if (!list_empty(&bd->bd_list)) + goto out_unlock; + set_bit(GLF_LFLUSH, &bd->bd_gl->gl_flags); + set_bit(GLF_DIRTY, &bd->bd_gl->gl_flags); + mh = (struct gfs2_meta_header *)bd->bd_bh->b_data; + if (unlikely(mh->mh_magic != cpu_to_be32(GFS2_MAGIC))) { + pr_err("Attempting to add uninitialised block to journal (inplace block=%lld)\n", + (unsigned long long)bd->bd_bh->b_blocknr); + BUG(); + } + if (unlikely(state == SFS_FROZEN)) { + printk(KERN_INFO "GFS2:adding buf while frozen\n"); + gfs2_assert_withdraw(sdp, 0); + } + gfs2_pin(sdp, bd->bd_bh); + mh->__pad0 = cpu_to_be64(0); + mh->mh_jid = cpu_to_be32(sdp->sd_jdesc->jd_jid); + list_add(&bd->bd_list, &tr->tr_buf); + tr->tr_num_buf_new++; +out_unlock: gfs2_log_unlock(sdp); +out: unlock_buffer(bh); } @@ -256,7 +261,7 @@ void gfs2_trans_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd) BUG_ON(!list_empty(&bd->bd_list)); gfs2_add_revoke(sdp, bd); - tr->tr_touched = 
1; + set_bit(TR_TOUCHED, &tr->tr_flags); tr->tr_num_revoke++; } diff --git a/fs/internal.h b/fs/internal.h index b63cf3af2dc2..11c6d89dce9c 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -182,7 +182,7 @@ typedef loff_t (*iomap_actor_t)(struct inode *inode, loff_t pos, loff_t len, void *data, struct iomap *iomap); loff_t iomap_apply(struct inode *inode, loff_t pos, loff_t length, - unsigned flags, struct iomap_ops *ops, void *data, + unsigned flags, const struct iomap_ops *ops, void *data, iomap_actor_t actor); /* direct-io.c: */ diff --git a/fs/iomap.c b/fs/iomap.c index 354a123f170e..d89f70bbb952 100644 --- a/fs/iomap.c +++ b/fs/iomap.c @@ -41,7 +41,7 @@ */ loff_t iomap_apply(struct inode *inode, loff_t pos, loff_t length, unsigned flags, - struct iomap_ops *ops, void *data, iomap_actor_t actor) + const struct iomap_ops *ops, void *data, iomap_actor_t actor) { struct iomap iomap = { 0 }; loff_t written = 0, ret; @@ -114,6 +114,9 @@ iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, unsigned flags, BUG_ON(pos + len > iomap->offset + iomap->length); + if (fatal_signal_pending(current)) + return -EINTR; + page = grab_cache_page_write_begin(inode->i_mapping, index, flags); if (!page) return -ENOMEM; @@ -232,7 +235,7 @@ again: ssize_t iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *iter, - struct iomap_ops *ops) + const struct iomap_ops *ops) { struct inode *inode = iocb->ki_filp->f_mapping->host; loff_t pos = iocb->ki_pos, ret = 0, written = 0; @@ -315,7 +318,7 @@ iomap_dirty_actor(struct inode *inode, loff_t pos, loff_t length, void *data, int iomap_file_dirty(struct inode *inode, loff_t pos, loff_t len, - struct iomap_ops *ops) + const struct iomap_ops *ops) { loff_t ret; @@ -395,7 +398,7 @@ iomap_zero_range_actor(struct inode *inode, loff_t pos, loff_t count, int iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero, - struct iomap_ops *ops) + const struct iomap_ops *ops) { loff_t ret; @@ -415,7 +418,7 @@ 
EXPORT_SYMBOL_GPL(iomap_zero_range); int iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero, - struct iomap_ops *ops) + const struct iomap_ops *ops) { unsigned blocksize = (1 << inode->i_blkbits); unsigned off = pos & (blocksize - 1); @@ -443,7 +446,7 @@ iomap_page_mkwrite_actor(struct inode *inode, loff_t pos, loff_t length, } int iomap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf, - struct iomap_ops *ops) + const struct iomap_ops *ops) { struct page *page = vmf->page; struct inode *inode = file_inode(vma->vm_file); @@ -542,7 +545,7 @@ iomap_fiemap_actor(struct inode *inode, loff_t pos, loff_t length, void *data, } int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fi, - loff_t start, loff_t len, struct iomap_ops *ops) + loff_t start, loff_t len, const struct iomap_ops *ops) { struct fiemap_ctx ctx; loff_t ret; @@ -836,8 +839,8 @@ iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length, } ssize_t -iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, struct iomap_ops *ops, - iomap_dio_end_io_t end_io) +iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, + const struct iomap_ops *ops, iomap_dio_end_io_t end_io) { struct address_space *mapping = iocb->ki_filp->f_mapping; struct inode *inode = file_inode(iocb->ki_filp); diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 8c514367ba5a..b6b194ec1b4f 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -393,7 +393,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) /* Do we need to erase the effects of a prior jbd2_journal_flush? */ if (journal->j_flags & JBD2_FLUSHED) { jbd_debug(3, "super block updated\n"); - mutex_lock(&journal->j_checkpoint_mutex); + mutex_lock_io(&journal->j_checkpoint_mutex); /* * We hold j_checkpoint_mutex so tail cannot change under us. 
* We don't need any special data guarantees for writing sb diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index a097048ed1a3..a1a359bfcc9c 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -276,11 +276,11 @@ loop: goto loop; end_loop: - write_unlock(&journal->j_state_lock); del_timer_sync(&journal->j_commit_timer); journal->j_task = NULL; wake_up(&journal->j_wait_done_commit); jbd_debug(1, "Journal thread exiting.\n"); + write_unlock(&journal->j_state_lock); return 0; } @@ -944,7 +944,7 @@ out: */ void jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block) { - mutex_lock(&journal->j_checkpoint_mutex); + mutex_lock_io(&journal->j_checkpoint_mutex); if (tid_gt(tid, journal->j_tail_sequence)) __jbd2_update_log_tail(journal, tid, block); mutex_unlock(&journal->j_checkpoint_mutex); @@ -1304,7 +1304,7 @@ static int journal_reset(journal_t *journal) journal->j_flags |= JBD2_FLUSHED; } else { /* Lock here to make assertions happy... */ - mutex_lock(&journal->j_checkpoint_mutex); + mutex_lock_io(&journal->j_checkpoint_mutex); /* * Update log tail information. 
We use REQ_FUA since new * transaction will start reusing journal space and so we @@ -1691,7 +1691,7 @@ int jbd2_journal_destroy(journal_t *journal) spin_lock(&journal->j_list_lock); while (journal->j_checkpoint_transactions != NULL) { spin_unlock(&journal->j_list_lock); - mutex_lock(&journal->j_checkpoint_mutex); + mutex_lock_io(&journal->j_checkpoint_mutex); err = jbd2_log_do_checkpoint(journal); mutex_unlock(&journal->j_checkpoint_mutex); /* @@ -1713,7 +1713,7 @@ int jbd2_journal_destroy(journal_t *journal) if (journal->j_sb_buffer) { if (!is_journal_aborted(journal)) { - mutex_lock(&journal->j_checkpoint_mutex); + mutex_lock_io(&journal->j_checkpoint_mutex); write_lock(&journal->j_state_lock); journal->j_tail_sequence = @@ -1955,7 +1955,7 @@ int jbd2_journal_flush(journal_t *journal) spin_lock(&journal->j_list_lock); while (!err && journal->j_checkpoint_transactions != NULL) { spin_unlock(&journal->j_list_lock); - mutex_lock(&journal->j_checkpoint_mutex); + mutex_lock_io(&journal->j_checkpoint_mutex); err = jbd2_log_do_checkpoint(journal); mutex_unlock(&journal->j_checkpoint_mutex); spin_lock(&journal->j_list_lock); @@ -1965,7 +1965,7 @@ int jbd2_journal_flush(journal_t *journal) if (is_journal_aborted(journal)) return -EIO; - mutex_lock(&journal->j_checkpoint_mutex); + mutex_lock_io(&journal->j_checkpoint_mutex); if (!err) { err = jbd2_cleanup_journal_tail(journal); if (err < 0) { diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index e1652665bd93..5e659ee08d6a 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -1863,7 +1863,9 @@ static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh) __blist_del_buffer(list, jh); jh->b_jlist = BJ_None; - if (test_clear_buffer_jbddirty(bh)) + if (transaction && is_journal_aborted(transaction->t_journal)) + clear_buffer_jbddirty(bh); + else if (test_clear_buffer_jbddirty(bh)) mark_buffer_dirty(bh); /* Expose it to the VM */ } diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index 
cf4c636ff4da..439b946c4808 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -41,6 +41,9 @@ static bool kernfs_lockdep(struct kernfs_node *kn) static int kernfs_name_locked(struct kernfs_node *kn, char *buf, size_t buflen) { + if (!kn) + return strlcpy(buf, "(null)", buflen); + return strlcpy(buf, kn->parent ? kn->name : "/", buflen); } @@ -110,6 +113,8 @@ static struct kernfs_node *kernfs_common_ancestor(struct kernfs_node *a, * kn_to: /n1/n2/n3 [depth=3] * result: /../.. * + * [3] when @kn_to is NULL result will be "(null)" + * * Returns the length of the full path. If the full length is equal to or * greater than @buflen, @buf contains the truncated path with the trailing * '\0'. On error, -errno is returned. @@ -123,6 +128,9 @@ static int kernfs_path_from_node_locked(struct kernfs_node *kn_to, size_t depth_from, depth_to, len = 0; int i, j; + if (!kn_to) + return strlcpy(buf, "(null)", buflen); + if (!kn_from) kn_from = kernfs_root(kn_to)->kn; @@ -166,6 +174,8 @@ static int kernfs_path_from_node_locked(struct kernfs_node *kn_to, * similar to strlcpy(). It returns the length of @kn's name and if @buf * isn't long enough, it's filled upto @buflen-1 and nul terminated. * + * Fills buffer with "(null)" if @kn is NULL. + * * This function can be called from any context. 
*/ int kernfs_name(struct kernfs_node *kn, char *buf, size_t buflen) diff --git a/fs/libfs.c b/fs/libfs.c index e973cd51f126..28d6f35feed6 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -245,7 +245,8 @@ struct dentry *mount_pseudo_xattr(struct file_system_type *fs_type, char *name, struct inode *root; struct qstr d_name = QSTR_INIT(name, strlen(name)); - s = sget(fs_type, NULL, set_anon_super, MS_NOUSER, NULL); + s = sget_userns(fs_type, NULL, set_anon_super, MS_KERNMOUNT|MS_NOUSER, + &init_user_ns, NULL); if (IS_ERR(s)) return ERR_CAST(s); diff --git a/fs/mount.h b/fs/mount.h index 2c856fc47ae3..2826543a131d 100644 --- a/fs/mount.h +++ b/fs/mount.h @@ -89,7 +89,6 @@ static inline int is_mounted(struct vfsmount *mnt) } extern struct mount *__lookup_mnt(struct vfsmount *, struct dentry *); -extern struct mount *__lookup_mnt_last(struct vfsmount *, struct dentry *); extern int __legitimize_mnt(struct vfsmount *, unsigned); extern bool legitimize_mnt(struct vfsmount *, unsigned); diff --git a/fs/namei.c b/fs/namei.c index 7d87699c3e2e..d41fab78798b 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -672,17 +672,15 @@ static bool legitimize_links(struct nameidata *nd) /** * unlazy_walk - try to switch to ref-walk mode. * @nd: nameidata pathwalk data - * @dentry: child of nd->path.dentry or NULL - * @seq: seq number to check dentry against * Returns: 0 on success, -ECHILD on failure * - * unlazy_walk attempts to legitimize the current nd->path, nd->root and dentry - * for ref-walk mode. @dentry must be a path found by a do_lookup call on - * @nd or NULL. Must be called from rcu-walk context. + * unlazy_walk attempts to legitimize the current nd->path and nd->root + * for ref-walk mode. + * Must be called from rcu-walk context. * Nothing should touch nameidata between unlazy_walk() failure and * terminate_walk(). 
*/ -static int unlazy_walk(struct nameidata *nd, struct dentry *dentry, unsigned seq) +static int unlazy_walk(struct nameidata *nd) { struct dentry *parent = nd->path.dentry; @@ -691,33 +689,66 @@ static int unlazy_walk(struct nameidata *nd, struct dentry *dentry, unsigned seq nd->flags &= ~LOOKUP_RCU; if (unlikely(!legitimize_links(nd))) goto out2; + if (unlikely(!legitimize_path(nd, &nd->path, nd->seq))) + goto out1; + if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) { + if (unlikely(!legitimize_path(nd, &nd->root, nd->root_seq))) + goto out; + } + rcu_read_unlock(); + BUG_ON(nd->inode != parent->d_inode); + return 0; + +out2: + nd->path.mnt = NULL; + nd->path.dentry = NULL; +out1: + if (!(nd->flags & LOOKUP_ROOT)) + nd->root.mnt = NULL; +out: + rcu_read_unlock(); + return -ECHILD; +} + +/** + * unlazy_child - try to switch to ref-walk mode. + * @nd: nameidata pathwalk data + * @dentry: child of nd->path.dentry + * @seq: seq number to check dentry against + * Returns: 0 on success, -ECHILD on failure + * + * unlazy_child attempts to legitimize the current nd->path, nd->root and dentry + * for ref-walk mode. @dentry must be a path found by a do_lookup call on + * @nd. Must be called from rcu-walk context. + * Nothing should touch nameidata between unlazy_child() failure and + * terminate_walk(). + */ +static int unlazy_child(struct nameidata *nd, struct dentry *dentry, unsigned seq) +{ + BUG_ON(!(nd->flags & LOOKUP_RCU)); + + nd->flags &= ~LOOKUP_RCU; + if (unlikely(!legitimize_links(nd))) + goto out2; if (unlikely(!legitimize_mnt(nd->path.mnt, nd->m_seq))) goto out2; - if (unlikely(!lockref_get_not_dead(&parent->d_lockref))) + if (unlikely(!lockref_get_not_dead(&nd->path.dentry->d_lockref))) goto out1; /* - * For a negative lookup, the lookup sequence point is the parents - * sequence point, and it only needs to revalidate the parent dentry. 
- * - * For a positive lookup, we need to move both the parent and the - * dentry from the RCU domain to be properly refcounted. And the - * sequence number in the dentry validates *both* dentry counters, - * since we checked the sequence number of the parent after we got - * the child sequence number. So we know the parent must still - * be valid if the child sequence number is still valid. + * We need to move both the parent and the dentry from the RCU domain + * to be properly refcounted. And the sequence number in the dentry + * validates *both* dentry counters, since we checked the sequence + * number of the parent after we got the child sequence number. So we + * know the parent must still be valid if the child sequence number is */ - if (!dentry) { - if (read_seqcount_retry(&parent->d_seq, nd->seq)) - goto out; - BUG_ON(nd->inode != parent->d_inode); - } else { - if (!lockref_get_not_dead(&dentry->d_lockref)) - goto out; - if (read_seqcount_retry(&dentry->d_seq, seq)) - goto drop_dentry; + if (unlikely(!lockref_get_not_dead(&dentry->d_lockref))) + goto out; + if (unlikely(read_seqcount_retry(&dentry->d_seq, seq))) { + rcu_read_unlock(); + dput(dentry); + goto drop_root_mnt; } - /* * Sequence counts matched. Now make sure that the root is * still valid and get it if required. 
@@ -733,10 +764,6 @@ static int unlazy_walk(struct nameidata *nd, struct dentry *dentry, unsigned seq rcu_read_unlock(); return 0; -drop_dentry: - rcu_read_unlock(); - dput(dentry); - goto drop_root_mnt; out2: nd->path.mnt = NULL; out1: @@ -749,27 +776,12 @@ drop_root_mnt: return -ECHILD; } -static int unlazy_link(struct nameidata *nd, struct path *link, unsigned seq) -{ - if (unlikely(!legitimize_path(nd, link, seq))) { - drop_links(nd); - nd->depth = 0; - nd->flags &= ~LOOKUP_RCU; - nd->path.mnt = NULL; - nd->path.dentry = NULL; - if (!(nd->flags & LOOKUP_ROOT)) - nd->root.mnt = NULL; - rcu_read_unlock(); - } else if (likely(unlazy_walk(nd, NULL, 0)) == 0) { - return 0; - } - path_put(link); - return -ECHILD; -} - static inline int d_revalidate(struct dentry *dentry, unsigned int flags) { - return dentry->d_op->d_revalidate(dentry, flags); + if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE)) + return dentry->d_op->d_revalidate(dentry, flags); + else + return 1; } /** @@ -790,7 +802,7 @@ static int complete_walk(struct nameidata *nd) if (nd->flags & LOOKUP_RCU) { if (!(nd->flags & LOOKUP_ROOT)) nd->root.mnt = NULL; - if (unlikely(unlazy_walk(nd, NULL, 0))) + if (unlikely(unlazy_walk(nd))) return -ECHILD; } @@ -1016,7 +1028,7 @@ const char *get_link(struct nameidata *nd) touch_atime(&last->link); cond_resched(); } else if (atime_needs_update_rcu(&last->link, inode)) { - if (unlikely(unlazy_walk(nd, NULL, 0))) + if (unlikely(unlazy_walk(nd))) return ERR_PTR(-ECHILD); touch_atime(&last->link); } @@ -1035,7 +1047,7 @@ const char *get_link(struct nameidata *nd) if (nd->flags & LOOKUP_RCU) { res = get(NULL, inode, &last->done); if (res == ERR_PTR(-ECHILD)) { - if (unlikely(unlazy_walk(nd, NULL, 0))) + if (unlikely(unlazy_walk(nd))) return ERR_PTR(-ECHILD); res = get(dentry, inode, &last->done); } @@ -1100,7 +1112,6 @@ static int follow_automount(struct path *path, struct nameidata *nd, bool *need_mntput) { struct vfsmount *mnt; - const struct cred *old_cred; int err; 
if (!path->dentry->d_op || !path->dentry->d_op->d_automount) @@ -1129,9 +1140,7 @@ static int follow_automount(struct path *path, struct nameidata *nd, if (nd->total_link_count >= 40) return -ELOOP; - old_cred = override_creds(&init_cred); mnt = path->dentry->d_op->d_automount(path); - revert_creds(old_cred); if (IS_ERR(mnt)) { /* * The filesystem is allowed to return -EISDIR here to indicate @@ -1472,19 +1481,14 @@ static struct dentry *lookup_dcache(const struct qstr *name, struct dentry *dir, unsigned int flags) { - struct dentry *dentry; - int error; - - dentry = d_lookup(dir, name); + struct dentry *dentry = d_lookup(dir, name); if (dentry) { - if (dentry->d_flags & DCACHE_OP_REVALIDATE) { - error = d_revalidate(dentry, flags); - if (unlikely(error <= 0)) { - if (!error) - d_invalidate(dentry); - dput(dentry); - return ERR_PTR(error); - } + int error = d_revalidate(dentry, flags); + if (unlikely(error <= 0)) { + if (!error) + d_invalidate(dentry); + dput(dentry); + return ERR_PTR(error); } } return dentry; @@ -1549,7 +1553,7 @@ static int lookup_fast(struct nameidata *nd, bool negative; dentry = __d_lookup_rcu(parent, &nd->last, &seq); if (unlikely(!dentry)) { - if (unlazy_walk(nd, NULL, 0)) + if (unlazy_walk(nd)) return -ECHILD; return 0; } @@ -1574,14 +1578,8 @@ static int lookup_fast(struct nameidata *nd, return -ECHILD; *seqp = seq; - if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE)) - status = d_revalidate(dentry, nd->flags); - if (unlikely(status <= 0)) { - if (unlazy_walk(nd, dentry, seq)) - return -ECHILD; - if (status == -ECHILD) - status = d_revalidate(dentry, nd->flags); - } else { + status = d_revalidate(dentry, nd->flags); + if (likely(status > 0)) { /* * Note: do negative dentry check after revalidation in * case that drops it. 
@@ -1592,15 +1590,17 @@ static int lookup_fast(struct nameidata *nd, path->dentry = dentry; if (likely(__follow_mount_rcu(nd, path, inode, seqp))) return 1; - if (unlazy_walk(nd, dentry, seq)) - return -ECHILD; } + if (unlazy_child(nd, dentry, seq)) + return -ECHILD; + if (unlikely(status == -ECHILD)) + /* we'd been told to redo it in non-rcu mode */ + status = d_revalidate(dentry, nd->flags); } else { dentry = __d_lookup(parent, &nd->last); if (unlikely(!dentry)) return 0; - if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE)) - status = d_revalidate(dentry, nd->flags); + status = d_revalidate(dentry, nd->flags); } if (unlikely(status <= 0)) { if (!status) @@ -1639,8 +1639,7 @@ again: if (IS_ERR(dentry)) goto out; if (unlikely(!d_in_lookup(dentry))) { - if ((dentry->d_flags & DCACHE_OP_REVALIDATE) && - !(flags & LOOKUP_NO_REVAL)) { + if (!(flags & LOOKUP_NO_REVAL)) { int error = d_revalidate(dentry, flags); if (unlikely(error <= 0)) { if (!error) { @@ -1671,7 +1670,7 @@ static inline int may_lookup(struct nameidata *nd) int err = inode_permission(nd->inode, MAY_EXEC|MAY_NOT_BLOCK); if (err != -ECHILD) return err; - if (unlazy_walk(nd, NULL, 0)) + if (unlazy_walk(nd)) return -ECHILD; } return inode_permission(nd->inode, MAY_EXEC); @@ -1706,9 +1705,17 @@ static int pick_link(struct nameidata *nd, struct path *link, error = nd_alloc_stack(nd); if (unlikely(error)) { if (error == -ECHILD) { - if (unlikely(unlazy_link(nd, link, seq))) - return -ECHILD; - error = nd_alloc_stack(nd); + if (unlikely(!legitimize_path(nd, link, seq))) { + drop_links(nd); + nd->depth = 0; + nd->flags &= ~LOOKUP_RCU; + nd->path.mnt = NULL; + nd->path.dentry = NULL; + if (!(nd->flags & LOOKUP_ROOT)) + nd->root.mnt = NULL; + rcu_read_unlock(); + } else if (likely(unlazy_walk(nd)) == 0) + error = nd_alloc_stack(nd); } if (error) { path_put(link); @@ -2125,7 +2132,7 @@ OK: } if (unlikely(!d_can_lookup(nd->path.dentry))) { if (nd->flags & LOOKUP_RCU) { - if (unlazy_walk(nd, NULL, 0)) + if 
(unlazy_walk(nd)) return -ECHILD; } return -ENOTDIR; @@ -2582,7 +2589,7 @@ mountpoint_last(struct nameidata *nd) /* If we're in rcuwalk, drop out of it to handle last component */ if (nd->flags & LOOKUP_RCU) { - if (unlazy_walk(nd, NULL, 0)) + if (unlazy_walk(nd)) return -ECHILD; } @@ -2941,10 +2948,16 @@ static inline int open_to_namei_flags(int flag) static int may_o_create(const struct path *dir, struct dentry *dentry, umode_t mode) { + struct user_namespace *s_user_ns; int error = security_path_mknod(dir, dentry, mode, 0); if (error) return error; + s_user_ns = dir->dentry->d_sb->s_user_ns; + if (!kuid_has_mapping(s_user_ns, current_fsuid()) || + !kgid_has_mapping(s_user_ns, current_fsgid())) + return -EOVERFLOW; + error = inode_permission(dir->dentry->d_inode, MAY_WRITE | MAY_EXEC); if (error) return error; @@ -3069,9 +3082,6 @@ static int lookup_open(struct nameidata *nd, struct path *path, if (d_in_lookup(dentry)) break; - if (!(dentry->d_flags & DCACHE_OP_REVALIDATE)) - break; - error = d_revalidate(dentry, nd->flags); if (likely(error > 0)) break; diff --git a/fs/namespace.c b/fs/namespace.c index b5b1259e064f..8bfad42c1ccf 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -637,28 +637,6 @@ struct mount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry) } /* - * find the last mount at @dentry on vfsmount @mnt. - * mount_lock must be held. - */ -struct mount *__lookup_mnt_last(struct vfsmount *mnt, struct dentry *dentry) -{ - struct mount *p, *res = NULL; - p = __lookup_mnt(mnt, dentry); - if (!p) - goto out; - if (!(p->mnt.mnt_flags & MNT_UMOUNT)) - res = p; - hlist_for_each_entry_continue(p, mnt_hash) { - if (&p->mnt_parent->mnt != mnt || p->mnt_mountpoint != dentry) - break; - if (!(p->mnt.mnt_flags & MNT_UMOUNT)) - res = p; - } -out: - return res; -} - -/* * lookup_mnt - Return the first child mount mounted at path * * "First" means first mounted chronologically. 
If you create the @@ -742,26 +720,50 @@ static struct mountpoint *lookup_mountpoint(struct dentry *dentry) return NULL; } -static struct mountpoint *new_mountpoint(struct dentry *dentry) +static struct mountpoint *get_mountpoint(struct dentry *dentry) { - struct hlist_head *chain = mp_hash(dentry); - struct mountpoint *mp; + struct mountpoint *mp, *new = NULL; int ret; - mp = kmalloc(sizeof(struct mountpoint), GFP_KERNEL); - if (!mp) + if (d_mountpoint(dentry)) { +mountpoint: + read_seqlock_excl(&mount_lock); + mp = lookup_mountpoint(dentry); + read_sequnlock_excl(&mount_lock); + if (mp) + goto done; + } + + if (!new) + new = kmalloc(sizeof(struct mountpoint), GFP_KERNEL); + if (!new) return ERR_PTR(-ENOMEM); + + /* Exactly one processes may set d_mounted */ ret = d_set_mounted(dentry); - if (ret) { - kfree(mp); - return ERR_PTR(ret); - } - mp->m_dentry = dentry; - mp->m_count = 1; - hlist_add_head(&mp->m_hash, chain); - INIT_HLIST_HEAD(&mp->m_list); + /* Someone else set d_mounted? */ + if (ret == -EBUSY) + goto mountpoint; + + /* The dentry is not available as a mountpoint? 
*/ + mp = ERR_PTR(ret); + if (ret) + goto done; + + /* Add the new mountpoint to the hash table */ + read_seqlock_excl(&mount_lock); + new->m_dentry = dentry; + new->m_count = 1; + hlist_add_head(&new->m_hash, mp_hash(dentry)); + INIT_HLIST_HEAD(&new->m_list); + read_sequnlock_excl(&mount_lock); + + mp = new; + new = NULL; +done: + kfree(new); return mp; } @@ -854,6 +856,13 @@ void mnt_set_mountpoint(struct mount *mnt, hlist_add_head(&child_mnt->mnt_mp_list, &mp->m_list); } +static void __attach_mnt(struct mount *mnt, struct mount *parent) +{ + hlist_add_head_rcu(&mnt->mnt_hash, + m_hash(&parent->mnt, mnt->mnt_mountpoint)); + list_add_tail(&mnt->mnt_child, &parent->mnt_mounts); +} + /* * vfsmount lock must be held for write */ @@ -862,28 +871,45 @@ static void attach_mnt(struct mount *mnt, struct mountpoint *mp) { mnt_set_mountpoint(parent, mp, mnt); - hlist_add_head_rcu(&mnt->mnt_hash, m_hash(&parent->mnt, mp->m_dentry)); - list_add_tail(&mnt->mnt_child, &parent->mnt_mounts); + __attach_mnt(mnt, parent); } -static void attach_shadowed(struct mount *mnt, - struct mount *parent, - struct mount *shadows) +void mnt_change_mountpoint(struct mount *parent, struct mountpoint *mp, struct mount *mnt) { - if (shadows) { - hlist_add_behind_rcu(&mnt->mnt_hash, &shadows->mnt_hash); - list_add(&mnt->mnt_child, &shadows->mnt_child); - } else { - hlist_add_head_rcu(&mnt->mnt_hash, - m_hash(&parent->mnt, mnt->mnt_mountpoint)); - list_add_tail(&mnt->mnt_child, &parent->mnt_mounts); - } + struct mountpoint *old_mp = mnt->mnt_mp; + struct dentry *old_mountpoint = mnt->mnt_mountpoint; + struct mount *old_parent = mnt->mnt_parent; + + list_del_init(&mnt->mnt_child); + hlist_del_init(&mnt->mnt_mp_list); + hlist_del_init_rcu(&mnt->mnt_hash); + + attach_mnt(mnt, parent, mp); + + put_mountpoint(old_mp); + + /* + * Safely avoid even the suggestion this code might sleep or + * lock the mount hash by taking advantage of the knowledge that + * mnt_change_mountpoint will not release the final 
reference + * to a mountpoint. + * + * During mounting, the mount passed in as the parent mount will + * continue to use the old mountpoint and during unmounting, the + * old mountpoint will continue to exist until namespace_unlock, + * which happens well after mnt_change_mountpoint. + */ + spin_lock(&old_mountpoint->d_lock); + old_mountpoint->d_lockref.count--; + spin_unlock(&old_mountpoint->d_lock); + + mnt_add_count(old_parent, -1); } /* * vfsmount lock must be held for write */ -static void commit_tree(struct mount *mnt, struct mount *shadows) +static void commit_tree(struct mount *mnt) { struct mount *parent = mnt->mnt_parent; struct mount *m; @@ -901,7 +927,7 @@ static void commit_tree(struct mount *mnt, struct mount *shadows) n->mounts += n->pending_mounts; n->pending_mounts = 0; - attach_shadowed(mnt, parent, shadows); + __attach_mnt(mnt, parent); touch_mnt_namespace(n); } @@ -965,6 +991,21 @@ vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void } EXPORT_SYMBOL_GPL(vfs_kern_mount); +struct vfsmount * +vfs_submount(const struct dentry *mountpoint, struct file_system_type *type, + const char *name, void *data) +{ + /* Until it is worked out how to pass the user namespace + * through from the parent mount to the submount don't support + * unprivileged mounts with submounts. 
+ */ + if (mountpoint->d_sb->s_user_ns != &init_user_ns) + return ERR_PTR(-EPERM); + + return vfs_kern_mount(type, MS_SUBMOUNT, name, data); +} +EXPORT_SYMBOL_GPL(vfs_submount); + static struct mount *clone_mnt(struct mount *old, struct dentry *root, int flag) { @@ -1595,11 +1636,11 @@ void __detach_mounts(struct dentry *dentry) struct mount *mnt; namespace_lock(); + lock_mount_hash(); mp = lookup_mountpoint(dentry); if (IS_ERR_OR_NULL(mp)) goto out_unlock; - lock_mount_hash(); event++; while (!hlist_empty(&mp->m_list)) { mnt = hlist_entry(mp->m_list.first, struct mount, mnt_mp_list); @@ -1609,9 +1650,9 @@ void __detach_mounts(struct dentry *dentry) } else umount_tree(mnt, UMOUNT_CONNECTED); } - unlock_mount_hash(); put_mountpoint(mp); out_unlock: + unlock_mount_hash(); namespace_unlock(); } @@ -1740,7 +1781,6 @@ struct mount *copy_tree(struct mount *mnt, struct dentry *dentry, continue; for (s = r; s; s = next_mnt(s, r)) { - struct mount *t = NULL; if (!(flag & CL_COPY_UNBINDABLE) && IS_MNT_UNBINDABLE(s)) { s = skip_mnt_tree(s); @@ -1762,14 +1802,7 @@ struct mount *copy_tree(struct mount *mnt, struct dentry *dentry, goto out; lock_mount_hash(); list_add_tail(&q->mnt_list, &res->mnt_list); - mnt_set_mountpoint(parent, p->mnt_mp, q); - if (!list_empty(&parent->mnt_mounts)) { - t = list_last_entry(&parent->mnt_mounts, - struct mount, mnt_child); - if (t->mnt_mp != p->mnt_mp) - t = NULL; - } - attach_shadowed(q, parent, t); + attach_mnt(q, parent, p->mnt_mp); unlock_mount_hash(); } } @@ -1968,10 +2001,18 @@ static int attach_recursive_mnt(struct mount *source_mnt, { HLIST_HEAD(tree_list); struct mnt_namespace *ns = dest_mnt->mnt_ns; + struct mountpoint *smp; struct mount *child, *p; struct hlist_node *n; int err; + /* Preallocate a mountpoint in case the new mounts need + * to be tucked under other mounts. + */ + smp = get_mountpoint(source_mnt->mnt.mnt_root); + if (IS_ERR(smp)) + return PTR_ERR(smp); + /* Is there space to add these mounts to the mount namespace? 
*/ if (!parent_path) { err = count_mounts(ns, source_mnt); @@ -1998,16 +2039,19 @@ static int attach_recursive_mnt(struct mount *source_mnt, touch_mnt_namespace(source_mnt->mnt_ns); } else { mnt_set_mountpoint(dest_mnt, dest_mp, source_mnt); - commit_tree(source_mnt, NULL); + commit_tree(source_mnt); } hlist_for_each_entry_safe(child, n, &tree_list, mnt_hash) { struct mount *q; hlist_del_init(&child->mnt_hash); - q = __lookup_mnt_last(&child->mnt_parent->mnt, - child->mnt_mountpoint); - commit_tree(child, q); + q = __lookup_mnt(&child->mnt_parent->mnt, + child->mnt_mountpoint); + if (q) + mnt_change_mountpoint(child, smp, q); + commit_tree(child); } + put_mountpoint(smp); unlock_mount_hash(); return 0; @@ -2022,6 +2066,11 @@ static int attach_recursive_mnt(struct mount *source_mnt, cleanup_group_ids(source_mnt, NULL); out: ns->pending_mounts = 0; + + read_seqlock_excl(&mount_lock); + put_mountpoint(smp); + read_sequnlock_excl(&mount_lock); + return err; } @@ -2038,9 +2087,7 @@ retry: namespace_lock(); mnt = lookup_mnt(path); if (likely(!mnt)) { - struct mountpoint *mp = lookup_mountpoint(dentry); - if (!mp) - mp = new_mountpoint(dentry); + struct mountpoint *mp = get_mountpoint(dentry); if (IS_ERR(mp)) { namespace_unlock(); inode_unlock(dentry->d_inode); @@ -2059,7 +2106,11 @@ retry: static void unlock_mount(struct mountpoint *where) { struct dentry *dentry = where->m_dentry; + + read_seqlock_excl(&mount_lock); put_mountpoint(where); + read_sequnlock_excl(&mount_lock); + namespace_unlock(); inode_unlock(dentry->d_inode); } @@ -2768,7 +2819,7 @@ long do_mount(const char *dev_name, const char __user *dir_name, flags &= ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE | MS_BORN | MS_NOATIME | MS_NODIRATIME | MS_RELATIME| MS_KERNMOUNT | - MS_STRICTATIME | MS_NOREMOTELOCK); + MS_STRICTATIME | MS_NOREMOTELOCK | MS_SUBMOUNT); if (flags & MS_REMOUNT) retval = do_remount(&path, flags & ~MS_REMOUNT, mnt_flags, @@ -3135,9 +3186,9 @@ SYSCALL_DEFINE2(pivot_root, const char __user 
*, new_root, touch_mnt_namespace(current->nsproxy->mnt_ns); /* A moved mount should not expire automatically */ list_del_init(&new_mnt->mnt_expire); + put_mountpoint(root_mp); unlock_mount_hash(); chroot_fs_refs(&root, &new); - put_mountpoint(root_mp); error = 0; out4: unlock_mount(old_mp); diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c index 5551e8ef67fd..e49d831c4e85 100644 --- a/fs/nfs/namespace.c +++ b/fs/nfs/namespace.c @@ -226,7 +226,7 @@ static struct vfsmount *nfs_do_clone_mount(struct nfs_server *server, const char *devname, struct nfs_clone_mount *mountdata) { - return vfs_kern_mount(&nfs_xdev_fs_type, 0, devname, mountdata); + return vfs_submount(mountdata->dentry, &nfs_xdev_fs_type, devname, mountdata); } /** diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c index d21104912676..d8b040bd9814 100644 --- a/fs/nfs/nfs4namespace.c +++ b/fs/nfs/nfs4namespace.c @@ -279,7 +279,7 @@ static struct vfsmount *try_location(struct nfs_clone_mount *mountdata, mountdata->hostname, mountdata->mnt_path); - mnt = vfs_kern_mount(&nfs4_referral_fs_type, 0, page, mountdata); + mnt = vfs_submount(mountdata->dentry, &nfs4_referral_fs_type, page, mountdata); if (!IS_ERR(mnt)) break; } diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 6dcbc5defb7a..0a0eaecf9676 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -38,7 +38,6 @@ #include <linux/mm.h> #include <linux/delay.h> #include <linux/errno.h> -#include <linux/file.h> #include <linux/string.h> #include <linux/ratelimit.h> #include <linux/printk.h> @@ -1083,7 +1082,8 @@ int nfs4_call_sync(struct rpc_clnt *clnt, return nfs4_call_sync_sequence(clnt, server, msg, args, res); } -static void update_changeattr(struct inode *dir, struct nfs4_change_info *cinfo) +static void update_changeattr(struct inode *dir, struct nfs4_change_info *cinfo, + unsigned long timestamp) { struct nfs_inode *nfsi = NFS_I(dir); @@ -1099,6 +1099,7 @@ static void update_changeattr(struct inode *dir, struct nfs4_change_info 
*cinfo) NFS_INO_INVALID_ACL; } dir->i_version = cinfo->after; + nfsi->read_cache_jiffies = timestamp; nfsi->attr_gencount = nfs_inc_attr_generation_counter(); nfs_fscache_invalidate(dir); spin_unlock(&dir->i_lock); @@ -2391,11 +2392,13 @@ static int _nfs4_proc_open(struct nfs4_opendata *data) nfs_fattr_map_and_free_names(server, &data->f_attr); if (o_arg->open_flags & O_CREAT) { - update_changeattr(dir, &o_res->cinfo); if (o_arg->open_flags & O_EXCL) data->file_created = 1; else if (o_res->cinfo.before != o_res->cinfo.after) data->file_created = 1; + if (data->file_created || dir->i_version != o_res->cinfo.after) + update_changeattr(dir, &o_res->cinfo, + o_res->f_attr->time_start); } if ((o_res->rflags & NFS4_OPEN_RESULT_LOCKTYPE_POSIX) == 0) server->caps &= ~NFS_CAP_POSIX_LOCK; @@ -2697,7 +2700,8 @@ static inline void nfs4_exclusive_attrset(struct nfs4_opendata *opendata, sattr->ia_valid |= ATTR_MTIME; /* Except MODE, it seems harmless of setting twice. */ - if ((attrset[1] & FATTR4_WORD1_MODE)) + if (opendata->o_arg.createmode != NFS4_CREATE_EXCLUSIVE && + attrset[1] & FATTR4_WORD1_MODE) sattr->ia_valid &= ~ATTR_MODE; if (attrset[2] & FATTR4_WORD2_SECURITY_LABEL) @@ -4073,11 +4077,12 @@ static int _nfs4_proc_remove(struct inode *dir, const struct qstr *name) .rpc_argp = &args, .rpc_resp = &res, }; + unsigned long timestamp = jiffies; int status; status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 1); if (status == 0) - update_changeattr(dir, &res.cinfo); + update_changeattr(dir, &res.cinfo, timestamp); return status; } @@ -4125,7 +4130,8 @@ static int nfs4_proc_unlink_done(struct rpc_task *task, struct inode *dir) if (nfs4_async_handle_error(task, res->server, NULL, &data->timeout) == -EAGAIN) return 0; - update_changeattr(dir, &res->cinfo); + if (task->tk_status == 0) + update_changeattr(dir, &res->cinfo, res->dir_attr->time_start); return 1; } @@ -4159,8 +4165,11 @@ static int nfs4_proc_rename_done(struct rpc_task *task, struct 
inode *old_dir, if (nfs4_async_handle_error(task, res->server, NULL, &data->timeout) == -EAGAIN) return 0; - update_changeattr(old_dir, &res->old_cinfo); - update_changeattr(new_dir, &res->new_cinfo); + if (task->tk_status == 0) { + update_changeattr(old_dir, &res->old_cinfo, res->old_fattr->time_start); + if (new_dir != old_dir) + update_changeattr(new_dir, &res->new_cinfo, res->new_fattr->time_start); + } return 1; } @@ -4197,7 +4206,7 @@ static int _nfs4_proc_link(struct inode *inode, struct inode *dir, const struct status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1); if (!status) { - update_changeattr(dir, &res.cinfo); + update_changeattr(dir, &res.cinfo, res.fattr->time_start); status = nfs_post_op_update_inode(inode, res.fattr); if (!status) nfs_setsecurity(inode, res.fattr, res.label); @@ -4272,7 +4281,8 @@ static int nfs4_do_create(struct inode *dir, struct dentry *dentry, struct nfs4_ int status = nfs4_call_sync(NFS_SERVER(dir)->client, NFS_SERVER(dir), &data->msg, &data->arg.seq_args, &data->res.seq_res, 1); if (status == 0) { - update_changeattr(dir, &data->res.dir_cinfo); + update_changeattr(dir, &data->res.dir_cinfo, + data->res.fattr->time_start); status = nfs_instantiate(dentry, data->res.fh, data->res.fattr, data->res.label); } return status; @@ -6127,7 +6137,6 @@ static struct nfs4_lockdata *nfs4_alloc_lockdata(struct file_lock *fl, p->server = server; atomic_inc(&lsp->ls_count); p->ctx = get_nfs_open_context(ctx); - get_file(fl->fl_file); memcpy(&p->fl, fl, sizeof(p->fl)); return p; out_free_seqid: @@ -6240,7 +6249,6 @@ static void nfs4_lock_release(void *calldata) nfs_free_seqid(data->arg.lock_seqid); nfs4_put_lock_state(data->lsp); put_nfs_open_context(data->ctx); - fput(data->fl.fl_file); kfree(data); dprintk("%s: done!\n", __func__); } @@ -8483,6 +8491,7 @@ nfs4_layoutget_handle_exception(struct rpc_task *task, goto out; } + nfs4_sequence_free_slot(&lgp->res.seq_res); err = nfs4_handle_exception(server, 
nfs4err, exception); if (!status) { if (exception->retry) diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 1d152f4470cd..daeb94e3acd4 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -1091,6 +1091,7 @@ static void nfs_increment_seqid(int status, struct nfs_seqid *seqid) case -NFS4ERR_BADXDR: case -NFS4ERR_RESOURCE: case -NFS4ERR_NOFILEHANDLE: + case -NFS4ERR_MOVED: /* Non-seqid mutating errors */ return; }; @@ -1729,7 +1730,6 @@ static int nfs4_recovery_handle_error(struct nfs_client *clp, int error) break; case -NFS4ERR_STALE_CLIENTID: set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state); - nfs4_state_clear_reclaim_reboot(clp); nfs4_state_start_reclaim_reboot(clp); break; case -NFS4ERR_EXPIRED: diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 59554f3adf29..dd042498ce7c 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -1200,10 +1200,10 @@ _pnfs_return_layout(struct inode *ino) send = pnfs_prepare_layoutreturn(lo, &stateid, NULL); spin_unlock(&ino->i_lock); - pnfs_free_lseg_list(&tmp_list); if (send) status = pnfs_send_layoutreturn(lo, &stateid, IOMODE_ANY, true); out_put_layout_hdr: + pnfs_free_lseg_list(&tmp_list); pnfs_put_layout_hdr(lo); out: dprintk("<-- %s status: %d\n", __func__, status); diff --git a/fs/nfs/write.c b/fs/nfs/write.c index b00d53d13d47..006068526542 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -728,8 +728,6 @@ static void nfs_inode_remove_request(struct nfs_page *req) if (likely(head->wb_page && !PageSwapCache(head->wb_page))) { set_page_private(head->wb_page, 0); ClearPagePrivate(head->wb_page); - smp_mb__after_atomic(); - wake_up_page(head->wb_page, PG_private); clear_bit(PG_MAPPED, &head->wb_flags); } nfsi->nrequests--; diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig index 47febcf99185..20b1c17320d5 100644 --- a/fs/nfsd/Kconfig +++ b/fs/nfsd/Kconfig @@ -104,6 +104,7 @@ config NFSD_SCSILAYOUT depends on NFSD_V4 && BLOCK select NFSD_PNFS select EXPORTFS_BLOCK_OPS + select BLK_SCSI_REQUEST help This option enables 
support for the exporting pNFS SCSI layouts in the kernel's NFS server. The pNFS SCSI layout enables NFS diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c index 0780ff864539..a06115e31612 100644 --- a/fs/nfsd/blocklayout.c +++ b/fs/nfsd/blocklayout.c @@ -10,6 +10,7 @@ #include <linux/nfsd/debug.h> #include <scsi/scsi_proto.h> #include <scsi/scsi_common.h> +#include <scsi/scsi_request.h> #include "blocklayoutxdr.h" #include "pnfs.h" @@ -213,6 +214,7 @@ static int nfsd4_scsi_identify_device(struct block_device *bdev, { struct request_queue *q = bdev->bd_disk->queue; struct request *rq; + struct scsi_request *req; size_t bufflen = 252, len, id_len; u8 *buf, *d, type, assoc; int error; @@ -221,23 +223,24 @@ static int nfsd4_scsi_identify_device(struct block_device *bdev, if (!buf) return -ENOMEM; - rq = blk_get_request(q, READ, GFP_KERNEL); + rq = blk_get_request(q, REQ_OP_SCSI_IN, GFP_KERNEL); if (IS_ERR(rq)) { error = -ENOMEM; goto out_free_buf; } - blk_rq_set_block_pc(rq); + req = scsi_req(rq); + scsi_req_init(rq); error = blk_rq_map_kern(q, rq, buf, bufflen, GFP_KERNEL); if (error) goto out_put_request; - rq->cmd[0] = INQUIRY; - rq->cmd[1] = 1; - rq->cmd[2] = 0x83; - rq->cmd[3] = bufflen >> 8; - rq->cmd[4] = bufflen & 0xff; - rq->cmd_len = COMMAND_SIZE(INQUIRY); + req->cmd[0] = INQUIRY; + req->cmd[1] = 1; + req->cmd[2] = 0x83; + req->cmd[3] = bufflen >> 8; + req->cmd[4] = bufflen & 0xff; + req->cmd_len = COMMAND_SIZE(INQUIRY); error = blk_execute_rq(rq->q, NULL, rq, 1); if (error) { diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c index 596205d939a1..e122da696f1b 100644 --- a/fs/nfsd/nfs4layouts.c +++ b/fs/nfsd/nfs4layouts.c @@ -223,10 +223,11 @@ nfsd4_alloc_layout_stateid(struct nfsd4_compound_state *cstate, struct nfs4_layout_stateid *ls; struct nfs4_stid *stp; - stp = nfs4_alloc_stid(cstate->clp, nfs4_layout_stateid_cache); + stp = nfs4_alloc_stid(cstate->clp, nfs4_layout_stateid_cache, + nfsd4_free_layout_stateid); if (!stp) return NULL; - 
stp->sc_free = nfsd4_free_layout_stateid; + get_nfs4_file(fp); stp->sc_file = fp; @@ -613,6 +614,7 @@ nfsd4_cb_layout_fail(struct nfs4_layout_stateid *ls) { struct nfs4_client *clp = ls->ls_stid.sc_client; char addr_str[INET6_ADDRSTRLEN]; + static char const nfsd_recall_failed[] = "/sbin/nfsd-recall-failed"; static char *envp[] = { "HOME=/", "TERM=linux", @@ -628,12 +630,13 @@ nfsd4_cb_layout_fail(struct nfs4_layout_stateid *ls) "nfsd: client %s failed to respond to layout recall. " " Fencing..\n", addr_str); - argv[0] = "/sbin/nfsd-recall-failed"; + argv[0] = (char *)nfsd_recall_failed; argv[1] = addr_str; argv[2] = ls->ls_file->f_path.mnt->mnt_sb->s_id; argv[3] = NULL; - error = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC); + error = call_usermodehelper(nfsd_recall_failed, argv, envp, + UMH_WAIT_PROC); if (error) { printk(KERN_ERR "nfsd: fence failed for client %s: %d!\n", addr_str, error); diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 4b4beaaa4eaa..a0dee8ae9f97 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -633,8 +633,8 @@ out: return co; } -struct nfs4_stid *nfs4_alloc_stid(struct nfs4_client *cl, - struct kmem_cache *slab) +struct nfs4_stid *nfs4_alloc_stid(struct nfs4_client *cl, struct kmem_cache *slab, + void (*sc_free)(struct nfs4_stid *)) { struct nfs4_stid *stid; int new_id; @@ -650,6 +650,8 @@ struct nfs4_stid *nfs4_alloc_stid(struct nfs4_client *cl, idr_preload_end(); if (new_id < 0) goto out_free; + + stid->sc_free = sc_free; stid->sc_client = cl; stid->sc_stateid.si_opaque.so_id = new_id; stid->sc_stateid.si_opaque.so_clid = cl->cl_clientid; @@ -675,15 +677,12 @@ out_free: static struct nfs4_ol_stateid * nfs4_alloc_open_stateid(struct nfs4_client *clp) { struct nfs4_stid *stid; - struct nfs4_ol_stateid *stp; - stid = nfs4_alloc_stid(clp, stateid_slab); + stid = nfs4_alloc_stid(clp, stateid_slab, nfs4_free_ol_stateid); if (!stid) return NULL; - stp = openlockstateid(stid); - stp->st_stid.sc_free = 
nfs4_free_ol_stateid; - return stp; + return openlockstateid(stid); } static void nfs4_free_deleg(struct nfs4_stid *stid) @@ -781,11 +780,10 @@ alloc_init_deleg(struct nfs4_client *clp, struct svc_fh *current_fh, goto out_dec; if (delegation_blocked(&current_fh->fh_handle)) goto out_dec; - dp = delegstateid(nfs4_alloc_stid(clp, deleg_slab)); + dp = delegstateid(nfs4_alloc_stid(clp, deleg_slab, nfs4_free_deleg)); if (dp == NULL) goto out_dec; - dp->dl_stid.sc_free = nfs4_free_deleg; /* * delegation seqid's are never incremented. The 4.1 special * meaning of seqid 0 isn't meaningful, really, but let's avoid @@ -5580,7 +5578,6 @@ init_lock_stateid(struct nfs4_ol_stateid *stp, struct nfs4_lockowner *lo, stp->st_stateowner = nfs4_get_stateowner(&lo->lo_owner); get_nfs4_file(fp); stp->st_stid.sc_file = fp; - stp->st_stid.sc_free = nfs4_free_lock_stateid; stp->st_access_bmap = 0; stp->st_deny_bmap = open_stp->st_deny_bmap; stp->st_openstp = open_stp; @@ -5623,7 +5620,7 @@ find_or_create_lock_stateid(struct nfs4_lockowner *lo, struct nfs4_file *fi, lst = find_lock_stateid(lo, fi); if (lst == NULL) { spin_unlock(&clp->cl_lock); - ns = nfs4_alloc_stid(clp, stateid_slab); + ns = nfs4_alloc_stid(clp, stateid_slab, nfs4_free_lock_stateid); if (ns == NULL) return NULL; diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 7ecf16be4a44..8fae53ce21d1 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -2440,7 +2440,9 @@ nfsd4_encode_fattr(struct xdr_stream *xdr, struct svc_fh *fhp, p++; /* to be backfilled later */ if (bmval0 & FATTR4_WORD0_SUPPORTED_ATTRS) { - u32 *supp = nfsd_suppattrs[minorversion]; + u32 supp[3]; + + memcpy(supp, nfsd_suppattrs[minorversion], sizeof(supp)); if (!IS_POSIXACL(dentry->d_inode)) supp[0] &= ~FATTR4_WORD0_ACL; diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index c9399366f9df..4516e8b7d776 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -603,8 +603,8 @@ extern __be32 nfs4_preprocess_stateid_op(struct svc_rqst *rqstp, __be32
nfsd4_lookup_stateid(struct nfsd4_compound_state *cstate, stateid_t *stateid, unsigned char typemask, struct nfs4_stid **s, struct nfsd_net *nn); -struct nfs4_stid *nfs4_alloc_stid(struct nfs4_client *cl, - struct kmem_cache *slab); +struct nfs4_stid *nfs4_alloc_stid(struct nfs4_client *cl, struct kmem_cache *slab, + void (*sc_free)(struct nfs4_stid *)); void nfs4_unhash_stid(struct nfs4_stid *s); void nfs4_put_stid(struct nfs4_stid *s); void nfs4_inc_and_copy_stateid(stateid_t *dst, struct nfs4_stid *stid); diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index 12eeae62a2b1..e1872f36147f 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -1068,7 +1068,7 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent) sb->s_time_gran = 1; sb->s_max_links = NILFS_LINK_MAX; - sb->s_bdi = &bdev_get_queue(sb->s_bdev)->backing_dev_info; + sb->s_bdi = bdev_get_queue(sb->s_bdev)->backing_dev_info; err = load_nilfs(nilfs, sb); if (err) diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index bbc175d4213d..a4c46221755e 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -31,7 +31,6 @@ static bool should_merge(struct fsnotify_event *old_fsn, static int fanotify_merge(struct list_head *list, struct fsnotify_event *event) { struct fsnotify_event *test_event; - bool do_merge = false; pr_debug("%s: list=%p event=%p\n", __func__, list, event); @@ -47,16 +46,12 @@ static int fanotify_merge(struct list_head *list, struct fsnotify_event *event) list_for_each_entry_reverse(test_event, list, list) { if (should_merge(test_event, event)) { - do_merge = true; - break; + test_event->mask |= event->mask; + return 1; } } - if (!do_merge) - return 0; - - test_event->mask |= event->mask; - return 1; + return 0; } #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS diff --git a/fs/notify/inotify/inotify.h b/fs/notify/inotify/inotify.h index a6f5907a3fee..7c461fd49c4c 100644 --- a/fs/notify/inotify/inotify.h +++ 
b/fs/notify/inotify/inotify.h @@ -30,3 +30,20 @@ extern int inotify_handle_event(struct fsnotify_group *group, const unsigned char *file_name, u32 cookie); extern const struct fsnotify_ops inotify_fsnotify_ops; + +#ifdef CONFIG_INOTIFY_USER +static inline void dec_inotify_instances(struct ucounts *ucounts) +{ + dec_ucount(ucounts, UCOUNT_INOTIFY_INSTANCES); +} + +static inline struct ucounts *inc_inotify_watches(struct ucounts *ucounts) +{ + return inc_ucount(ucounts->ns, ucounts->uid, UCOUNT_INOTIFY_WATCHES); +} + +static inline void dec_inotify_watches(struct ucounts *ucounts) +{ + dec_ucount(ucounts, UCOUNT_INOTIFY_WATCHES); +} +#endif diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c index 19e7ec109a75..f36c29398de3 100644 --- a/fs/notify/inotify/inotify_fsnotify.c +++ b/fs/notify/inotify/inotify_fsnotify.c @@ -165,10 +165,8 @@ static void inotify_free_group_priv(struct fsnotify_group *group) /* ideally the idr is empty and we won't hit the BUG in the callback */ idr_for_each(&group->inotify_data.idr, idr_callback, group); idr_destroy(&group->inotify_data.idr); - if (group->inotify_data.user) { - atomic_dec(&group->inotify_data.user->inotify_devs); - free_uid(group->inotify_data.user); - } + if (group->inotify_data.ucounts) + dec_inotify_instances(group->inotify_data.ucounts); } static void inotify_free_event(struct fsnotify_event *fsn_event) diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index 69d1ea3d292a..1cf41c623be1 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -44,10 +44,8 @@ #include <asm/ioctls.h> -/* these are configurable via /proc/sys/fs/inotify/ */ -static int inotify_max_user_instances __read_mostly; +/* configurable via /proc/sys/fs/inotify/ */ static int inotify_max_queued_events __read_mostly; -static int inotify_max_user_watches __read_mostly; static struct kmem_cache *inotify_inode_mark_cachep __read_mostly; @@ -60,7 +58,7 @@ 
static int zero; struct ctl_table inotify_table[] = { { .procname = "max_user_instances", - .data = &inotify_max_user_instances, + .data = &init_user_ns.ucount_max[UCOUNT_INOTIFY_INSTANCES], .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, @@ -68,7 +66,7 @@ struct ctl_table inotify_table[] = { }, { .procname = "max_user_watches", - .data = &inotify_max_user_watches, + .data = &init_user_ns.ucount_max[UCOUNT_INOTIFY_WATCHES], .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, @@ -500,7 +498,7 @@ void inotify_ignored_and_remove_idr(struct fsnotify_mark *fsn_mark, /* remove this mark from the idr */ inotify_remove_from_idr(group, i_mark); - atomic_dec(&group->inotify_data.user->inotify_watches); + dec_inotify_watches(group->inotify_data.ucounts); } /* ding dong the mark is dead */ @@ -584,14 +582,17 @@ static int inotify_new_watch(struct fsnotify_group *group, tmp_i_mark->fsn_mark.mask = mask; tmp_i_mark->wd = -1; - ret = -ENOSPC; - if (atomic_read(&group->inotify_data.user->inotify_watches) >= inotify_max_user_watches) - goto out_err; - ret = inotify_add_to_idr(idr, idr_lock, tmp_i_mark); if (ret) goto out_err; + /* increment the number of watches the user has */ + if (!inc_inotify_watches(group->inotify_data.ucounts)) { + inotify_remove_from_idr(group, tmp_i_mark); + ret = -ENOSPC; + goto out_err; + } + /* we are on the idr, now get on the inode */ ret = fsnotify_add_mark_locked(&tmp_i_mark->fsn_mark, group, inode, NULL, 0); @@ -601,8 +602,6 @@ static int inotify_new_watch(struct fsnotify_group *group, goto out_err; } - /* increment the number of watches the user has */ - atomic_inc(&group->inotify_data.user->inotify_watches); /* return the watch descriptor for this new mark */ ret = tmp_i_mark->wd; @@ -653,10 +652,11 @@ static struct fsnotify_group *inotify_new_group(unsigned int max_events) spin_lock_init(&group->inotify_data.idr_lock); idr_init(&group->inotify_data.idr); - group->inotify_data.user = 
get_current_user(); + group->inotify_data.ucounts = inc_ucount(current_user_ns(), + current_euid(), + UCOUNT_INOTIFY_INSTANCES); - if (atomic_inc_return(&group->inotify_data.user->inotify_devs) > - inotify_max_user_instances) { + if (!group->inotify_data.ucounts) { fsnotify_destroy_group(group); return ERR_PTR(-EMFILE); } @@ -819,8 +819,8 @@ static int __init inotify_user_setup(void) inotify_inode_mark_cachep = KMEM_CACHE(inotify_inode_mark, SLAB_PANIC); inotify_max_queued_events = 16384; - inotify_max_user_instances = 128; - inotify_max_user_watches = 8192; + init_user_ns.ucount_max[UCOUNT_INOTIFY_INSTANCES] = 128; + init_user_ns.ucount_max[UCOUNT_INOTIFY_WATCHES] = 8192; return 0; } diff --git a/fs/notify/mark.c b/fs/notify/mark.c index d3fea0bd89e2..6043306e8e21 100644 --- a/fs/notify/mark.c +++ b/fs/notify/mark.c @@ -510,18 +510,6 @@ void fsnotify_detach_group_marks(struct fsnotify_group *group) } } -void fsnotify_duplicate_mark(struct fsnotify_mark *new, struct fsnotify_mark *old) -{ - assert_spin_locked(&old->lock); - new->inode = old->inode; - new->mnt = old->mnt; - if (old->group) - fsnotify_get_group(old->group); - new->group = old->group; - new->mask = old->mask; - new->free_mark = old->free_mark; -} - /* * Nothing fancy, just initialize lists and locks and counters. 
*/ diff --git a/fs/nsfs.c b/fs/nsfs.c index 8c9fb29c6673..1656843e87d2 100644 --- a/fs/nsfs.c +++ b/fs/nsfs.c @@ -7,6 +7,7 @@ #include <linux/seq_file.h> #include <linux/user_namespace.h> #include <linux/nsfs.h> +#include <linux/uaccess.h> static struct vfsmount *nsfs_mnt; @@ -163,7 +164,10 @@ int open_related_ns(struct ns_common *ns, static long ns_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { + struct user_namespace *user_ns; struct ns_common *ns = get_proc_ns(file_inode(filp)); + uid_t __user *argp; + uid_t uid; switch (ioctl) { case NS_GET_USERNS: @@ -172,6 +176,15 @@ static long ns_ioctl(struct file *filp, unsigned int ioctl, if (!ns->ops->get_parent) return -EINVAL; return open_related_ns(ns, ns->ops->get_parent); + case NS_GET_NSTYPE: + return ns->ops->type; + case NS_GET_OWNER_UID: + if (ns->ops->type != CLONE_NEWUSER) + return -EINVAL; + user_ns = container_of(ns, struct user_namespace, ns); + argp = (uid_t __user *) arg; + uid = from_kuid_munged(current_user_ns(), user_ns->owner); + return put_user(uid, argp); default: return -ENOTTY; } diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c index bed1fcb63088..dc22ba8c710f 100644 --- a/fs/ocfs2/acl.c +++ b/fs/ocfs2/acl.c @@ -283,16 +283,14 @@ int ocfs2_set_acl(handle_t *handle, int ocfs2_iop_set_acl(struct inode *inode, struct posix_acl *acl, int type) { struct buffer_head *bh = NULL; - int status = 0; + int status, had_lock; + struct ocfs2_lock_holder oh; - status = ocfs2_inode_lock(inode, &bh, 1); - if (status < 0) { - if (status != -ENOENT) - mlog_errno(status); - return status; - } + had_lock = ocfs2_inode_lock_tracker(inode, &bh, 1, &oh); + if (had_lock < 0) + return had_lock; status = ocfs2_set_acl(NULL, inode, bh, type, acl, NULL, NULL); - ocfs2_inode_unlock(inode, 1); + ocfs2_inode_unlock_tracker(inode, 1, &oh, had_lock); brelse(bh); return status; } @@ -302,21 +300,20 @@ struct posix_acl *ocfs2_iop_get_acl(struct inode *inode, int type) struct ocfs2_super *osb; struct buffer_head *di_bh 
= NULL; struct posix_acl *acl; - int ret; + int had_lock; + struct ocfs2_lock_holder oh; osb = OCFS2_SB(inode->i_sb); if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL)) return NULL; - ret = ocfs2_inode_lock(inode, &di_bh, 0); - if (ret < 0) { - if (ret != -ENOENT) - mlog_errno(ret); - return ERR_PTR(ret); - } + + had_lock = ocfs2_inode_lock_tracker(inode, &di_bh, 0, &oh); + if (had_lock < 0) + return ERR_PTR(had_lock); acl = ocfs2_get_acl_nolock(inode, type, di_bh); - ocfs2_inode_unlock(inode, 0); + ocfs2_inode_unlock_tracker(inode, 0, &oh, had_lock); brelse(di_bh); return acl; } diff --git a/fs/ocfs2/cluster/netdebug.c b/fs/ocfs2/cluster/netdebug.c index 27d1242c8383..564c504d6efd 100644 --- a/fs/ocfs2/cluster/netdebug.c +++ b/fs/ocfs2/cluster/netdebug.c @@ -349,7 +349,7 @@ static void sc_show_sock_container(struct seq_file *seq, " func key: 0x%08x\n" " func type: %u\n", sc, - atomic_read(&sc->sc_kref.refcount), + kref_read(&sc->sc_kref), &saddr, inet ? ntohs(sport) : 0, &daddr, inet ? ntohs(dport) : 0, sc->sc_node->nd_name, diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index d4b5c81f0445..ec000575e863 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -97,7 +97,7 @@ typeof(sc) __sc = (sc); \ mlog(ML_SOCKET, "[sc %p refs %d sock %p node %u page %p " \ "pg_off %zu] " fmt, __sc, \ - atomic_read(&__sc->sc_kref.refcount), __sc->sc_sock, \ + kref_read(&__sc->sc_kref), __sc->sc_sock, \ __sc->sc_node->nd_num, __sc->sc_page, __sc->sc_page_off , \ ##args); \ } while (0) diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c index e7b760deefae..9b984cae4c4e 100644 --- a/fs/ocfs2/dlm/dlmdebug.c +++ b/fs/ocfs2/dlm/dlmdebug.c @@ -81,7 +81,7 @@ static void __dlm_print_lock(struct dlm_lock *lock) lock->ml.type, lock->ml.convert_type, lock->ml.node, dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)), dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)), - atomic_read(&lock->lock_refs.refcount), + kref_read(&lock->lock_refs), 
(list_empty(&lock->ast_list) ? 'y' : 'n'), (lock->ast_pending ? 'y' : 'n'), (list_empty(&lock->bast_list) ? 'y' : 'n'), @@ -106,7 +106,7 @@ void __dlm_print_one_lock_resource(struct dlm_lock_resource *res) printk("lockres: %s, owner=%u, state=%u\n", buf, res->owner, res->state); printk(" last used: %lu, refcnt: %u, on purge list: %s\n", - res->last_used, atomic_read(&res->refs.refcount), + res->last_used, kref_read(&res->refs), list_empty(&res->purge) ? "no" : "yes"); printk(" on dirty list: %s, on reco list: %s, " "migrating pending: %s\n", @@ -298,7 +298,7 @@ static int dump_mle(struct dlm_master_list_entry *mle, char *buf, int len) mle_type, mle->master, mle->new_master, !list_empty(&mle->hb_events), !!mle->inuse, - atomic_read(&mle->mle_refs.refcount)); + kref_read(&mle->mle_refs)); out += snprintf(buf + out, len - out, "Maybe="); out += stringify_nodemap(mle->maybe_map, O2NM_MAX_NODES, @@ -494,7 +494,7 @@ static int dump_lock(struct dlm_lock *lock, int list_type, char *buf, int len) lock->ast_pending, lock->bast_pending, lock->convert_pending, lock->lock_pending, lock->cancel_pending, lock->unlock_pending, - atomic_read(&lock->lock_refs.refcount)); + kref_read(&lock->lock_refs)); spin_unlock(&lock->spinlock); return out; @@ -521,7 +521,7 @@ static int dump_lockres(struct dlm_lock_resource *res, char *buf, int len) !list_empty(&res->recovering), res->inflight_locks, res->migration_pending, atomic_read(&res->asts_reserved), - atomic_read(&res->refs.refcount)); + kref_read(&res->refs)); /* refmap */ out += snprintf(buf + out, len - out, "RMAP:"); @@ -777,7 +777,7 @@ static int debug_state_print(struct dlm_ctxt *dlm, char *buf, int len) /* Purge Count: xxx Refs: xxx */ out += snprintf(buf + out, len - out, "Purge Count: %d Refs: %d\n", dlm->purge_count, - atomic_read(&dlm->dlm_refs.refcount)); + kref_read(&dlm->dlm_refs)); /* Dead Node: xxx */ out += snprintf(buf + out, len - out, diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c index 
733e4e79c8e2..32fd261ae13d 100644 --- a/fs/ocfs2/dlm/dlmdomain.c +++ b/fs/ocfs2/dlm/dlmdomain.c @@ -2072,7 +2072,7 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain, INIT_LIST_HEAD(&dlm->dlm_eviction_callbacks); mlog(0, "context init: refcount %u\n", - atomic_read(&dlm->dlm_refs.refcount)); + kref_read(&dlm->dlm_refs)); leave: if (ret < 0 && dlm) { diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index a464c8088170..7025d8c27999 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -233,7 +233,7 @@ static void __dlm_put_mle(struct dlm_master_list_entry *mle) assert_spin_locked(&dlm->spinlock); assert_spin_locked(&dlm->master_lock); - if (!atomic_read(&mle->mle_refs.refcount)) { + if (!kref_read(&mle->mle_refs)) { /* this may or may not crash, but who cares. * it's a BUG. */ mlog(ML_ERROR, "bad mle: %p\n", mle); @@ -1124,9 +1124,9 @@ recheck: unsigned long timeo = msecs_to_jiffies(DLM_MASTERY_TIMEOUT_MS); /* - if (atomic_read(&mle->mle_refs.refcount) < 2) + if (kref_read(&mle->mle_refs) < 2) mlog(ML_ERROR, "mle (%p) refs=%d, name=%.*s\n", mle, - atomic_read(&mle->mle_refs.refcount), + kref_read(&mle->mle_refs), res->lockname.len, res->lockname.name); */ atomic_set(&mle->woken, 0); @@ -1979,7 +1979,7 @@ ok: * on this mle. */ spin_lock(&dlm->master_lock); - rr = atomic_read(&mle->mle_refs.refcount); + rr = kref_read(&mle->mle_refs); if (mle->inuse > 0) { if (extra_ref && rr < 3) err = 1; diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c index 1082b2c3014b..63d701cd1e2e 100644 --- a/fs/ocfs2/dlm/dlmunlock.c +++ b/fs/ocfs2/dlm/dlmunlock.c @@ -251,7 +251,7 @@ leave: mlog(0, "lock %u:%llu should be gone now! 
refs=%d\n", dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)), dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)), - atomic_read(&lock->lock_refs.refcount)-1); + kref_read(&lock->lock_refs)-1); dlm_lock_put(lock); } if (actions & DLM_UNLOCK_CALL_AST) diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 83d576f6a287..8dce4099a6ca 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -532,6 +532,7 @@ void ocfs2_lock_res_init_once(struct ocfs2_lock_res *res) init_waitqueue_head(&res->l_event); INIT_LIST_HEAD(&res->l_blocked_list); INIT_LIST_HEAD(&res->l_mask_waiters); + INIT_LIST_HEAD(&res->l_holders); } void ocfs2_inode_lock_res_init(struct ocfs2_lock_res *res, @@ -749,6 +750,50 @@ void ocfs2_lock_res_free(struct ocfs2_lock_res *res) res->l_flags = 0UL; } +/* + * Keep a list of processes who have interest in a lockres. + * Note: this is now only uesed for check recursive cluster locking. + */ +static inline void ocfs2_add_holder(struct ocfs2_lock_res *lockres, + struct ocfs2_lock_holder *oh) +{ + INIT_LIST_HEAD(&oh->oh_list); + oh->oh_owner_pid = get_pid(task_pid(current)); + + spin_lock(&lockres->l_lock); + list_add_tail(&oh->oh_list, &lockres->l_holders); + spin_unlock(&lockres->l_lock); +} + +static inline void ocfs2_remove_holder(struct ocfs2_lock_res *lockres, + struct ocfs2_lock_holder *oh) +{ + spin_lock(&lockres->l_lock); + list_del(&oh->oh_list); + spin_unlock(&lockres->l_lock); + + put_pid(oh->oh_owner_pid); +} + +static inline int ocfs2_is_locked_by_me(struct ocfs2_lock_res *lockres) +{ + struct ocfs2_lock_holder *oh; + struct pid *pid; + + /* look in the list of holders for one with the current task as owner */ + spin_lock(&lockres->l_lock); + pid = task_pid(current); + list_for_each_entry(oh, &lockres->l_holders, oh_list) { + if (oh->oh_owner_pid == pid) { + spin_unlock(&lockres->l_lock); + return 1; + } + } + spin_unlock(&lockres->l_lock); + + return 0; +} + static inline void ocfs2_inc_holders(struct ocfs2_lock_res *lockres, int 
level) { @@ -2333,8 +2378,9 @@ int ocfs2_inode_lock_full_nested(struct inode *inode, goto getbh; } - if (ocfs2_mount_local(osb)) - goto local; + if ((arg_flags & OCFS2_META_LOCK_GETBH) || + ocfs2_mount_local(osb)) + goto update; if (!(arg_flags & OCFS2_META_LOCK_RECOVERY)) ocfs2_wait_for_recovery(osb); @@ -2363,7 +2409,7 @@ int ocfs2_inode_lock_full_nested(struct inode *inode, if (!(arg_flags & OCFS2_META_LOCK_RECOVERY)) ocfs2_wait_for_recovery(osb); -local: +update: /* * We only see this flag if we're being called from * ocfs2_read_locked_inode(). It means we're locking an inode @@ -2497,6 +2543,59 @@ void ocfs2_inode_unlock(struct inode *inode, ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, level); } +/* + * These _tracker variants are introduced to deal with the recursive cluster + * locking issue. The idea is to keep track of a lock holder on the stack of + * the current process. If there's a lock holder on the stack, we know the + * task context is already protected by cluster locking. Currently, they're + * used in some VFS entry routines. + * + * return < 0 on error, return == 0 if there's no lock holder on the stack + * before this call, return == 1 if this call would be a recursive locking.
+ */ +int ocfs2_inode_lock_tracker(struct inode *inode, + struct buffer_head **ret_bh, + int ex, + struct ocfs2_lock_holder *oh) +{ + int status; + int arg_flags = 0, has_locked; + struct ocfs2_lock_res *lockres; + + lockres = &OCFS2_I(inode)->ip_inode_lockres; + has_locked = ocfs2_is_locked_by_me(lockres); + /* Just get buffer head if the cluster lock has been taken */ + if (has_locked) + arg_flags = OCFS2_META_LOCK_GETBH; + + if (likely(!has_locked || ret_bh)) { + status = ocfs2_inode_lock_full(inode, ret_bh, ex, arg_flags); + if (status < 0) { + if (status != -ENOENT) + mlog_errno(status); + return status; + } + } + if (!has_locked) + ocfs2_add_holder(lockres, oh); + + return has_locked; +} + +void ocfs2_inode_unlock_tracker(struct inode *inode, + int ex, + struct ocfs2_lock_holder *oh, + int had_lock) +{ + struct ocfs2_lock_res *lockres; + + lockres = &OCFS2_I(inode)->ip_inode_lockres; + if (!had_lock) { + ocfs2_remove_holder(lockres, oh); + ocfs2_inode_unlock(inode, ex); + } +} + int ocfs2_orphan_scan_lock(struct ocfs2_super *osb, u32 *seqno) { struct ocfs2_lock_res *lockres; @@ -3303,6 +3402,16 @@ static int ocfs2_downconvert_lock(struct ocfs2_super *osb, mlog(ML_BASTS, "lockres %s, level %d => %d\n", lockres->l_name, lockres->l_level, new_level); + /* + * On DLM_LKF_VALBLK, fsdlm behaves differently with o2cb. It always + * expects DLM_LKF_VALBLK being set if the LKB has LVB, so that + * we can recover correctly from node failure. Otherwise, we may get + * invalid LVB in LKB, but without DLM_SBF_VALNOTVALID being set.
+ */ + if (!ocfs2_is_o2cb_active() && + lockres->l_ops->flags & LOCK_TYPE_USES_LVB) + lvb = 1; + if (lvb) dlm_flags |= DLM_LKF_VALBLK; diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h index d293a22c32c5..a7fc18ba0dc1 100644 --- a/fs/ocfs2/dlmglue.h +++ b/fs/ocfs2/dlmglue.h @@ -70,6 +70,11 @@ struct ocfs2_orphan_scan_lvb { __be32 lvb_os_seqno; }; +struct ocfs2_lock_holder { + struct list_head oh_list; + struct pid *oh_owner_pid; +}; + /* ocfs2_inode_lock_full() 'arg_flags' flags */ /* don't wait on recovery. */ #define OCFS2_META_LOCK_RECOVERY (0x01) @@ -77,6 +82,8 @@ struct ocfs2_orphan_scan_lvb { #define OCFS2_META_LOCK_NOQUEUE (0x02) /* don't block waiting for the downconvert thread, instead return -EAGAIN */ #define OCFS2_LOCK_NONBLOCK (0x04) +/* just get back disk inode bh if we've got cluster lock. */ +#define OCFS2_META_LOCK_GETBH (0x08) /* Locking subclasses of inode cluster lock */ enum { @@ -170,4 +177,15 @@ void ocfs2_put_dlm_debug(struct ocfs2_dlm_debug *dlm_debug); /* To set the locking protocol on module initialization */ void ocfs2_set_locking_protocol(void); + +/* The _tracker pair is used to avoid cluster recursive locking */ +int ocfs2_inode_lock_tracker(struct inode *inode, + struct buffer_head **ret_bh, + int ex, + struct ocfs2_lock_holder *oh); +void ocfs2_inode_unlock_tracker(struct inode *inode, + int ex, + struct ocfs2_lock_holder *oh, + int had_lock); + #endif /* DLMGLUE_H */ diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index c4889655d32b..7b6a146327d7 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -1138,6 +1138,8 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) handle_t *handle = NULL; struct dquot *transfer_to[MAXQUOTAS] = { }; int qtype; + int had_lock; + struct ocfs2_lock_holder oh; trace_ocfs2_setattr(inode, dentry, (unsigned long long)OCFS2_I(inode)->ip_blkno, @@ -1173,11 +1175,30 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) } } - status = ocfs2_inode_lock(inode, &bh, 1); - if (status < 0) 
{ - if (status != -ENOENT) - mlog_errno(status); + had_lock = ocfs2_inode_lock_tracker(inode, &bh, 1, &oh); + if (had_lock < 0) { + status = had_lock; goto bail_unlock_rw; + } else if (had_lock) { + /* + * As far as we know, ocfs2_setattr() could only be the first + * VFS entry point in the call chain of recursive cluster + * locking issue. + * + * For instance: + * chmod_common() + * notify_change() + * ocfs2_setattr() + * posix_acl_chmod() + * ocfs2_iop_get_acl() + * + * But, we're not 100% sure if it's always true, because the + * ordering of the VFS entry points in the call chain is out + * of our control. So, we'd better dump the stack here to + * catch the other cases of recursive locking. + */ + mlog(ML_ERROR, "Another case of recursive locking:\n"); + dump_stack(); } inode_locked = 1; @@ -1260,8 +1281,8 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) bail_commit: ocfs2_commit_trans(osb, handle); bail_unlock: - if (status) { - ocfs2_inode_unlock(inode, 1); + if (status && inode_locked) { + ocfs2_inode_unlock_tracker(inode, 1, &oh, had_lock); inode_locked = 0; } bail_unlock_rw: @@ -1279,7 +1300,7 @@ bail: mlog_errno(status); } if (inode_locked) - ocfs2_inode_unlock(inode, 1); + ocfs2_inode_unlock_tracker(inode, 1, &oh, had_lock); brelse(bh); return status; @@ -1320,21 +1341,32 @@ bail: int ocfs2_permission(struct inode *inode, int mask) { - int ret; + int ret, had_lock; + struct ocfs2_lock_holder oh; if (mask & MAY_NOT_BLOCK) return -ECHILD; - ret = ocfs2_inode_lock(inode, NULL, 0); - if (ret) { - if (ret != -ENOENT) - mlog_errno(ret); + had_lock = ocfs2_inode_lock_tracker(inode, NULL, 0, &oh); + if (had_lock < 0) { + ret = had_lock; goto out; + } else if (had_lock) { + /* See comments in ocfs2_setattr() for details. 
+ * The call chain of this case could be: + * do_sys_open() + * may_open() + * inode_permission() + * ocfs2_permission() + * ocfs2_iop_get_acl() + */ + mlog(ML_ERROR, "Another case of recursive locking:\n"); + dump_stack(); } ret = generic_permission(inode, mask); - ocfs2_inode_unlock(inode, 0); + ocfs2_inode_unlock_tracker(inode, 0, &oh, had_lock); out: return ret; } diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index 7e5958b0be6b..0c39d71c67a1 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -172,6 +172,7 @@ struct ocfs2_lock_res { struct list_head l_blocked_list; struct list_head l_mask_waiters; + struct list_head l_holders; unsigned long l_flags; char l_name[OCFS2_LOCK_ID_MAX_LEN]; diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c index 52c07346bea3..820359096c7a 100644 --- a/fs/ocfs2/stackglue.c +++ b/fs/ocfs2/stackglue.c @@ -48,6 +48,12 @@ static char ocfs2_hb_ctl_path[OCFS2_MAX_HB_CTL_PATH] = "/sbin/ocfs2_hb_ctl"; */ static struct ocfs2_stack_plugin *active_stack; +inline int ocfs2_is_o2cb_active(void) +{ + return !strcmp(active_stack->sp_name, OCFS2_STACK_PLUGIN_O2CB); +} +EXPORT_SYMBOL_GPL(ocfs2_is_o2cb_active); + static struct ocfs2_stack_plugin *ocfs2_stack_lookup(const char *name) { struct ocfs2_stack_plugin *p; diff --git a/fs/ocfs2/stackglue.h b/fs/ocfs2/stackglue.h index f2dce10fae54..e3036e1790e8 100644 --- a/fs/ocfs2/stackglue.h +++ b/fs/ocfs2/stackglue.h @@ -298,6 +298,9 @@ void ocfs2_stack_glue_set_max_proto_version(struct ocfs2_protocol_version *max_p int ocfs2_stack_glue_register(struct ocfs2_stack_plugin *plugin); void ocfs2_stack_glue_unregister(struct ocfs2_stack_plugin *plugin); +/* In ocfs2_downconvert_lock(), we need to know which stack we are using */ +int ocfs2_is_o2cb_active(void); + extern struct kset *ocfs2_kset; #endif /* STACKGLUE_H */ diff --git a/fs/orangefs/super.c b/fs/orangefs/super.c index c48859f16e7b..67c24351a67f 100644 --- a/fs/orangefs/super.c +++ b/fs/orangefs/super.c @@ -115,6 +115,13 @@ static struct 
inode *orangefs_alloc_inode(struct super_block *sb) return &orangefs_inode->vfs_inode; } +static void orangefs_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode); + kmem_cache_free(orangefs_inode_cache, orangefs_inode); +} + static void orangefs_destroy_inode(struct inode *inode) { struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode); @@ -123,7 +130,7 @@ static void orangefs_destroy_inode(struct inode *inode) "%s: deallocated %p destroying inode %pU\n", __func__, orangefs_inode, get_khandle_from_ino(inode)); - kmem_cache_free(orangefs_inode_cache, orangefs_inode); + call_rcu(&inode->i_rcu, orangefs_i_callback); } /* diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c index 9ad48d9202a9..023bb0b03352 100644 --- a/fs/overlayfs/namei.c +++ b/fs/overlayfs/namei.c @@ -154,29 +154,38 @@ out_err: static int ovl_lookup_layer(struct dentry *base, struct ovl_lookup_data *d, struct dentry **ret) { - const char *s = d->name.name; + /* Counting down from the end, since the prefix can change */ + size_t rem = d->name.len - 1; struct dentry *dentry = NULL; int err; - if (*s != '/') + if (d->name.name[0] != '/') return ovl_lookup_single(base, d, d->name.name, d->name.len, 0, "", ret); - while (*s++ == '/' && !IS_ERR_OR_NULL(base) && d_can_lookup(base)) { + while (!IS_ERR_OR_NULL(base) && d_can_lookup(base)) { + const char *s = d->name.name + d->name.len - rem; const char *next = strchrnul(s, '/'); - size_t slen = strlen(s); + size_t thislen = next - s; + bool end = !next[0]; - if (WARN_ON(slen > d->name.len) || - WARN_ON(strcmp(d->name.name + d->name.len - slen, s))) + /* Verify we did not go off the rails */ + if (WARN_ON(s[-1] != '/')) return -EIO; - err = ovl_lookup_single(base, d, s, next - s, - d->name.len - slen, next, &base); + err = ovl_lookup_single(base, d, s, thislen, + d->name.len - rem, next, &base); dput(dentry); if (err) return err; dentry = 
base; - s = next; + if (end) + break; + + rem -= thislen + 1; + + if (WARN_ON(rem >= d->name.len)) + return -EIO; } *ret = dentry; return 0; diff --git a/fs/pnode.c b/fs/pnode.c index 06a793f4ae38..5bc7896d122a 100644 --- a/fs/pnode.c +++ b/fs/pnode.c @@ -322,6 +322,21 @@ out: return ret; } +static struct mount *find_topper(struct mount *mnt) +{ + /* If there is exactly one mount covering mnt completely return it. */ + struct mount *child; + + if (!list_is_singular(&mnt->mnt_mounts)) + return NULL; + + child = list_first_entry(&mnt->mnt_mounts, struct mount, mnt_child); + if (child->mnt_mountpoint != mnt->mnt.mnt_root) + return NULL; + + return child; +} + /* * return true if the refcount is greater than count */ @@ -342,9 +357,8 @@ static inline int do_refcount_check(struct mount *mnt, int count) */ int propagate_mount_busy(struct mount *mnt, int refcnt) { - struct mount *m, *child; + struct mount *m, *child, *topper; struct mount *parent = mnt->mnt_parent; - int ret = 0; if (mnt == parent) return do_refcount_check(mnt, refcnt); @@ -359,12 +373,24 @@ int propagate_mount_busy(struct mount *mnt, int refcnt) for (m = propagation_next(parent, parent); m; m = propagation_next(m, parent)) { - child = __lookup_mnt_last(&m->mnt, mnt->mnt_mountpoint); - if (child && list_empty(&child->mnt_mounts) && - (ret = do_refcount_check(child, 1))) - break; + int count = 1; + child = __lookup_mnt(&m->mnt, mnt->mnt_mountpoint); + if (!child) + continue; + + /* Is there exactly one mount on the child that covers + * it completely whose reference should be ignored? 
+ */ + topper = find_topper(child); + if (topper) + count += 1; + else if (!list_empty(&child->mnt_mounts)) + continue; + + if (do_refcount_check(child, count)) + return 1; } - return ret; + return 0; } /* @@ -381,7 +407,7 @@ void propagate_mount_unlock(struct mount *mnt) for (m = propagation_next(parent, parent); m; m = propagation_next(m, parent)) { - child = __lookup_mnt_last(&m->mnt, mnt->mnt_mountpoint); + child = __lookup_mnt(&m->mnt, mnt->mnt_mountpoint); if (child) child->mnt.mnt_flags &= ~MNT_LOCKED; } @@ -399,9 +425,11 @@ static void mark_umount_candidates(struct mount *mnt) for (m = propagation_next(parent, parent); m; m = propagation_next(m, parent)) { - struct mount *child = __lookup_mnt_last(&m->mnt, + struct mount *child = __lookup_mnt(&m->mnt, mnt->mnt_mountpoint); - if (child && (!IS_MNT_LOCKED(child) || IS_MNT_MARKED(m))) { + if (!child || (child->mnt.mnt_flags & MNT_UMOUNT)) + continue; + if (!IS_MNT_LOCKED(child) || IS_MNT_MARKED(m)) { SET_MNT_MARK(child); } } @@ -420,8 +448,8 @@ static void __propagate_umount(struct mount *mnt) for (m = propagation_next(parent, parent); m; m = propagation_next(m, parent)) { - - struct mount *child = __lookup_mnt_last(&m->mnt, + struct mount *topper; + struct mount *child = __lookup_mnt(&m->mnt, mnt->mnt_mountpoint); /* * umount the child only if the child has no children @@ -430,6 +458,15 @@ static void __propagate_umount(struct mount *mnt) if (!child || !IS_MNT_MARKED(child)) continue; CLEAR_MNT_MARK(child); + + /* If there is exactly one mount covering all of child + * replace child with that mount. 
+ */ + topper = find_topper(child); + if (topper) + mnt_change_mountpoint(child->mnt_parent, child->mnt_mp, + topper); + if (list_empty(&child->mnt_mounts)) { list_del_init(&child->mnt_child); child->mnt.mnt_flags |= MNT_UMOUNT; diff --git a/fs/pnode.h b/fs/pnode.h index 550f5a8b4fcf..dc87e65becd2 100644 --- a/fs/pnode.h +++ b/fs/pnode.h @@ -49,6 +49,8 @@ int get_dominating_id(struct mount *mnt, const struct path *root); unsigned int mnt_get_count(struct mount *mnt); void mnt_set_mountpoint(struct mount *, struct mountpoint *, struct mount *); +void mnt_change_mountpoint(struct mount *parent, struct mountpoint *mp, + struct mount *mnt); struct mount *copy_tree(struct mount *, struct dentry *, int); bool is_path_reachable(struct mount *, struct dentry *, const struct path *root); diff --git a/fs/posix_acl.c b/fs/posix_acl.c index 595522022aca..c9d48dc78495 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -922,11 +922,10 @@ int simple_set_acl(struct inode *inode, struct posix_acl *acl, int type) int error; if (type == ACL_TYPE_ACCESS) { - error = posix_acl_equiv_mode(acl, &inode->i_mode); - if (error < 0) - return 0; - if (error == 0) - acl = NULL; + error = posix_acl_update_mode(inode, + &inode->i_mode, &acl); + if (error) + return error; } inode->i_ctime = current_time(inode); diff --git a/fs/proc/array.c b/fs/proc/array.c index 51a4213afa2e..fe12b519d09b 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -401,8 +401,8 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, unsigned long long start_time; unsigned long cmin_flt = 0, cmaj_flt = 0; unsigned long min_flt = 0, maj_flt = 0; - cputime_t cutime, cstime, utime, stime; - cputime_t cgtime, gtime; + u64 cutime, cstime, utime, stime; + u64 cgtime, gtime; unsigned long rsslim = 0; char tcomm[sizeof(task->comm)]; unsigned long flags; @@ -497,10 +497,10 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, seq_put_decimal_ull(m, " ", cmin_flt); seq_put_decimal_ull(m, " ", 
maj_flt); seq_put_decimal_ull(m, " ", cmaj_flt); - seq_put_decimal_ull(m, " ", cputime_to_clock_t(utime)); - seq_put_decimal_ull(m, " ", cputime_to_clock_t(stime)); - seq_put_decimal_ll(m, " ", cputime_to_clock_t(cutime)); - seq_put_decimal_ll(m, " ", cputime_to_clock_t(cstime)); + seq_put_decimal_ull(m, " ", nsec_to_clock_t(utime)); + seq_put_decimal_ull(m, " ", nsec_to_clock_t(stime)); + seq_put_decimal_ll(m, " ", nsec_to_clock_t(cutime)); + seq_put_decimal_ll(m, " ", nsec_to_clock_t(cstime)); seq_put_decimal_ll(m, " ", priority); seq_put_decimal_ll(m, " ", nice); seq_put_decimal_ll(m, " ", num_threads); @@ -542,8 +542,8 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, seq_put_decimal_ull(m, " ", task->rt_priority); seq_put_decimal_ull(m, " ", task->policy); seq_put_decimal_ull(m, " ", delayacct_blkio_ticks(task)); - seq_put_decimal_ull(m, " ", cputime_to_clock_t(gtime)); - seq_put_decimal_ll(m, " ", cputime_to_clock_t(cgtime)); + seq_put_decimal_ull(m, " ", nsec_to_clock_t(gtime)); + seq_put_decimal_ll(m, " ", nsec_to_clock_t(cgtime)); if (mm && permitted) { seq_put_decimal_ull(m, " ", mm->start_data); diff --git a/fs/proc/base.c b/fs/proc/base.c index 8e7e61b28f31..b73b4de8fb36 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1667,12 +1667,63 @@ const struct inode_operations proc_pid_link_inode_operations = { /* building an inode */ +void task_dump_owner(struct task_struct *task, mode_t mode, + kuid_t *ruid, kgid_t *rgid) +{ + /* Depending on the state of dumpable compute who should own a + * proc file for a task. + */ + const struct cred *cred; + kuid_t uid; + kgid_t gid; + + /* Default to the tasks effective ownership */ + rcu_read_lock(); + cred = __task_cred(task); + uid = cred->euid; + gid = cred->egid; + rcu_read_unlock(); + + /* + * Before the /proc/pid/status file was created the only way to read + * the effective uid of a /process was to stat /proc/pid. 
Reading + * /proc/pid/status is slow enough that procps and other packages + * kept stating /proc/pid. To keep the rules in /proc simple I have + * made this apply to all per process world readable and executable + * directories. + */ + if (mode != (S_IFDIR|S_IRUGO|S_IXUGO)) { + struct mm_struct *mm; + task_lock(task); + mm = task->mm; + /* Make non-dumpable tasks owned by some root */ + if (mm) { + if (get_dumpable(mm) != SUID_DUMP_USER) { + struct user_namespace *user_ns = mm->user_ns; + + uid = make_kuid(user_ns, 0); + if (!uid_valid(uid)) + uid = GLOBAL_ROOT_UID; + + gid = make_kgid(user_ns, 0); + if (!gid_valid(gid)) + gid = GLOBAL_ROOT_GID; + } + } else { + uid = GLOBAL_ROOT_UID; + gid = GLOBAL_ROOT_GID; + } + task_unlock(task); + } + *ruid = uid; + *rgid = gid; +} + struct inode *proc_pid_make_inode(struct super_block * sb, struct task_struct *task, umode_t mode) { struct inode * inode; struct proc_inode *ei; - const struct cred *cred; /* We need a new inode */ @@ -1694,13 +1745,7 @@ struct inode *proc_pid_make_inode(struct super_block * sb, if (!ei->pid) goto out_unlock; - if (task_dumpable(task)) { - rcu_read_lock(); - cred = __task_cred(task); - inode->i_uid = cred->euid; - inode->i_gid = cred->egid; - rcu_read_unlock(); - } + task_dump_owner(task, 0, &inode->i_uid, &inode->i_gid); security_task_to_inode(task, inode); out: @@ -1715,7 +1760,6 @@ int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) { struct inode *inode = d_inode(dentry); struct task_struct *task; - const struct cred *cred; struct pid_namespace *pid = dentry->d_sb->s_fs_info; generic_fillattr(inode, stat); @@ -1733,12 +1777,7 @@ int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) */ return -ENOENT; } - if ((inode->i_mode == (S_IFDIR|S_IRUGO|S_IXUGO)) || - task_dumpable(task)) { - cred = __task_cred(task); - stat->uid = cred->euid; - stat->gid = cred->egid; - } + task_dump_owner(task, inode->i_mode, &stat->uid, &stat->gid); } 
rcu_read_unlock(); return 0; @@ -1754,18 +1793,11 @@ int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) * Rewrite the inode's ownerships here because the owning task may have * performed a setuid(), etc. * - * Before the /proc/pid/status file was created the only way to read - * the effective uid of a /process was to stat /proc/pid. Reading - * /proc/pid/status is slow enough that procps and other packages - * kept stating /proc/pid. To keep the rules in /proc simple I have - * made this apply to all per process world readable and executable - * directories. */ int pid_revalidate(struct dentry *dentry, unsigned int flags) { struct inode *inode; struct task_struct *task; - const struct cred *cred; if (flags & LOOKUP_RCU) return -ECHILD; @@ -1774,17 +1806,8 @@ int pid_revalidate(struct dentry *dentry, unsigned int flags) task = get_proc_task(inode); if (task) { - if ((inode->i_mode == (S_IFDIR|S_IRUGO|S_IXUGO)) || - task_dumpable(task)) { - rcu_read_lock(); - cred = __task_cred(task); - inode->i_uid = cred->euid; - inode->i_gid = cred->egid; - rcu_read_unlock(); - } else { - inode->i_uid = GLOBAL_ROOT_UID; - inode->i_gid = GLOBAL_ROOT_GID; - } + task_dump_owner(task, inode->i_mode, &inode->i_uid, &inode->i_gid); + inode->i_mode &= ~(S_ISUID | S_ISGID); security_task_to_inode(task, inode); put_task_struct(task); @@ -1881,7 +1904,6 @@ static int map_files_d_revalidate(struct dentry *dentry, unsigned int flags) bool exact_vma_exists = false; struct mm_struct *mm = NULL; struct task_struct *task; - const struct cred *cred; struct inode *inode; int status = 0; @@ -1906,16 +1928,8 @@ static int map_files_d_revalidate(struct dentry *dentry, unsigned int flags) mmput(mm); if (exact_vma_exists) { - if (task_dumpable(task)) { - rcu_read_lock(); - cred = __task_cred(task); - inode->i_uid = cred->euid; - inode->i_gid = cred->egid; - rcu_read_unlock(); - } else { - inode->i_uid = GLOBAL_ROOT_UID; - inode->i_gid = GLOBAL_ROOT_GID; - } + 
task_dump_owner(task, 0, &inode->i_uid, &inode->i_gid); + security_task_to_inode(task, inode); status = 1; } @@ -2179,7 +2193,7 @@ static const struct file_operations proc_map_files_operations = { .llseek = generic_file_llseek, }; -#ifdef CONFIG_CHECKPOINT_RESTORE +#if defined(CONFIG_CHECKPOINT_RESTORE) && defined(CONFIG_POSIX_TIMERS) struct timers_private { struct pid *pid; struct task_struct *task; @@ -2488,6 +2502,12 @@ static ssize_t proc_pid_attr_write(struct file * file, const char __user * buf, length = -ESRCH; if (!task) goto out_no_task; + + /* A task may only write its own attributes. */ + length = -EACCES; + if (current != task) + goto out; + if (count > PAGE_SIZE) count = PAGE_SIZE; @@ -2503,14 +2523,13 @@ static ssize_t proc_pid_attr_write(struct file * file, const char __user * buf, } /* Guard against adverse ptrace interaction */ - length = mutex_lock_interruptible(&task->signal->cred_guard_mutex); + length = mutex_lock_interruptible(&current->signal->cred_guard_mutex); if (length < 0) goto out_free; - length = security_setprocattr(task, - (char*)file->f_path.dentry->d_name.name, + length = security_setprocattr(file->f_path.dentry->d_name.name, page, count); - mutex_unlock(&task->signal->cred_guard_mutex); + mutex_unlock(&current->signal->cred_guard_mutex); out_free: kfree(page); out: @@ -2936,7 +2955,7 @@ static const struct pid_entry tgid_base_stuff[] = { REG("projid_map", S_IRUGO|S_IWUSR, proc_projid_map_operations), REG("setgroups", S_IRUGO|S_IWUSR, proc_setgroups_operations), #endif -#ifdef CONFIG_CHECKPOINT_RESTORE +#if defined(CONFIG_CHECKPOINT_RESTORE) && defined(CONFIG_POSIX_TIMERS) REG("timers", S_IRUGO, proc_timers_operations), #endif REG("timerslack_ns", S_IRUGO|S_IWUGO, proc_pid_set_timerslack_ns_operations), @@ -3179,6 +3198,8 @@ int proc_pid_readdir(struct file *file, struct dir_context *ctx) iter.tgid += 1, iter = next_tgid(ns, iter)) { char name[PROC_NUMBUF]; int len; + + cond_resched(); if (!has_pid_permissions(ns, iter.task, 2)) continue; diff
--git a/fs/proc/fd.c b/fs/proc/fd.c index 4274f83bf100..00ce1531b2f5 100644 --- a/fs/proc/fd.c +++ b/fs/proc/fd.c @@ -84,7 +84,6 @@ static int tid_fd_revalidate(struct dentry *dentry, unsigned int flags) { struct files_struct *files; struct task_struct *task; - const struct cred *cred; struct inode *inode; unsigned int fd; @@ -108,16 +107,7 @@ static int tid_fd_revalidate(struct dentry *dentry, unsigned int flags) rcu_read_unlock(); put_files_struct(files); - if (task_dumpable(task)) { - rcu_read_lock(); - cred = __task_cred(task); - inode->i_uid = cred->euid; - inode->i_gid = cred->egid; - rcu_read_unlock(); - } else { - inode->i_uid = GLOBAL_ROOT_UID; - inode->i_gid = GLOBAL_ROOT_GID; - } + task_dump_owner(task, 0, &inode->i_uid, &inode->i_gid); if (S_ISLNK(inode->i_mode)) { unsigned i_mode = S_IFLNK; diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 842a5ff5b85c..7ad9ed7958af 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -43,10 +43,11 @@ static void proc_evict_inode(struct inode *inode) de = PDE(inode); if (de) pde_put(de); + head = PROC_I(inode)->sysctl; if (head) { RCU_INIT_POINTER(PROC_I(inode)->sysctl, NULL); - sysctl_head_put(head); + proc_sys_evict_inode(inode, head); } } diff --git a/fs/proc/internal.h b/fs/proc/internal.h index 2de5194ba378..5d6960f5f1c0 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -65,6 +65,7 @@ struct proc_inode { struct proc_dir_entry *pde; struct ctl_table_header *sysctl; struct ctl_table *sysctl_entry; + struct list_head sysctl_inodes; const struct proc_ns_operations *ns_ops; struct inode vfs_inode; }; @@ -97,20 +98,8 @@ static inline struct task_struct *get_proc_task(struct inode *inode) return get_pid_task(proc_pid(inode), PIDTYPE_PID); } -static inline int task_dumpable(struct task_struct *task) -{ - int dumpable = 0; - struct mm_struct *mm; - - task_lock(task); - mm = task->mm; - if (mm) - dumpable = get_dumpable(mm); - task_unlock(task); - if (dumpable == SUID_DUMP_USER) - return 1; - return 0; -} 
+void task_dump_owner(struct task_struct *task, mode_t mode, + kuid_t *ruid, kgid_t *rgid); static inline unsigned name_to_int(const struct qstr *qstr) { @@ -249,10 +238,12 @@ extern void proc_thread_self_init(void); */ #ifdef CONFIG_PROC_SYSCTL extern int proc_sys_init(void); -extern void sysctl_head_put(struct ctl_table_header *); +extern void proc_sys_evict_inode(struct inode *inode, + struct ctl_table_header *head); #else static inline void proc_sys_init(void) { } -static inline void sysctl_head_put(struct ctl_table_header *head) { } +static inline void proc_sys_evict_inode(struct inode *inode, + struct ctl_table_header *head) { } #endif /* diff --git a/fs/proc/page.c b/fs/proc/page.c index a2066e6dee90..2726536489b1 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -173,7 +173,8 @@ u64 stable_page_flags(struct page *page) u |= kpf_copy_bit(k, KPF_ACTIVE, PG_active); u |= kpf_copy_bit(k, KPF_RECLAIM, PG_reclaim); - u |= kpf_copy_bit(k, KPF_SWAPCACHE, PG_swapcache); + if (PageSwapCache(page)) + u |= 1 << KPF_SWAPCACHE; u |= kpf_copy_bit(k, KPF_SWAPBACKED, PG_swapbacked); u |= kpf_copy_bit(k, KPF_UNEVICTABLE, PG_unevictable); diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 55313d994895..3e64c6502dc8 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -190,6 +190,7 @@ static void init_header(struct ctl_table_header *head, head->set = set; head->parent = NULL; head->node = node; + INIT_LIST_HEAD(&head->inodes); if (node) { struct ctl_table *entry; for (entry = table; entry->procname; entry++, node++) @@ -259,6 +260,27 @@ static void unuse_table(struct ctl_table_header *p) complete(p->unregistering); } +/* called under sysctl_lock */ +static void proc_sys_prune_dcache(struct ctl_table_header *head) +{ + struct inode *inode, *prev = NULL; + struct proc_inode *ei; + + rcu_read_lock(); + list_for_each_entry_rcu(ei, &head->inodes, sysctl_inodes) { + inode = igrab(&ei->vfs_inode); + if (inode) { + rcu_read_unlock(); + iput(prev); + prev = 
inode; + d_prune_aliases(inode); + rcu_read_lock(); + } + } + rcu_read_unlock(); + iput(prev); +} + /* called under sysctl_lock, will reacquire if has to wait */ static void start_unregistering(struct ctl_table_header *p) { @@ -272,31 +294,22 @@ static void start_unregistering(struct ctl_table_header *p) p->unregistering = &wait; spin_unlock(&sysctl_lock); wait_for_completion(&wait); - spin_lock(&sysctl_lock); } else { /* anything non-NULL; we'll never dereference it */ p->unregistering = ERR_PTR(-EINVAL); + spin_unlock(&sysctl_lock); } /* + * Prune dentries for unregistered sysctls: namespaced sysctls + * can have duplicate names and contaminate dcache very badly. + */ + proc_sys_prune_dcache(p); + /* * do not remove from the list until nobody holds it; walking the * list in do_sysctl() relies on that. */ - erase_header(p); -} - -static void sysctl_head_get(struct ctl_table_header *head) -{ - spin_lock(&sysctl_lock); - head->count++; - spin_unlock(&sysctl_lock); -} - -void sysctl_head_put(struct ctl_table_header *head) -{ spin_lock(&sysctl_lock); - if (!--head->count) - kfree_rcu(head, rcu); - spin_unlock(&sysctl_lock); + erase_header(p); } static struct ctl_table_header *sysctl_head_grab(struct ctl_table_header *head) @@ -440,10 +453,20 @@ static struct inode *proc_sys_make_inode(struct super_block *sb, inode->i_ino = get_next_ino(); - sysctl_head_get(head); ei = PROC_I(inode); + + spin_lock(&sysctl_lock); + if (unlikely(head->unregistering)) { + spin_unlock(&sysctl_lock); + iput(inode); + inode = NULL; + goto out; + } ei->sysctl = head; ei->sysctl_entry = table; + list_add_rcu(&ei->sysctl_inodes, &head->inodes); + head->count++; + spin_unlock(&sysctl_lock); inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); inode->i_mode = table->mode; @@ -466,6 +489,15 @@ out: return inode; } +void proc_sys_evict_inode(struct inode *inode, struct ctl_table_header *head) +{ + spin_lock(&sysctl_lock); + list_del_rcu(&PROC_I(inode)->sysctl_inodes); + if 
(!--head->count) + kfree_rcu(head, rcu); + spin_unlock(&sysctl_lock); +} + static struct ctl_table_header *grab_header(struct inode *inode) { struct ctl_table_header *head = PROC_I(inode)->sysctl; @@ -709,7 +741,7 @@ static int proc_sys_readdir(struct file *file, struct dir_context *ctx) ctl_dir = container_of(head, struct ctl_dir, header); if (!dir_emit_dots(file, ctx)) - return 0; + goto out; pos = 2; @@ -719,6 +751,7 @@ static int proc_sys_readdir(struct file *file, struct dir_context *ctx) break; } } +out: sysctl_head_finish(head); return 0; } diff --git a/fs/proc/stat.c b/fs/proc/stat.c index d700c42b3572..e47c3e8c4dfe 100644 --- a/fs/proc/stat.c +++ b/fs/proc/stat.c @@ -21,9 +21,9 @@ #ifdef arch_idle_time -static cputime64_t get_idle_time(int cpu) +static u64 get_idle_time(int cpu) { - cputime64_t idle; + u64 idle; idle = kcpustat_cpu(cpu).cpustat[CPUTIME_IDLE]; if (cpu_online(cpu) && !nr_iowait_cpu(cpu)) @@ -31,9 +31,9 @@ static cputime64_t get_idle_time(int cpu) return idle; } -static cputime64_t get_iowait_time(int cpu) +static u64 get_iowait_time(int cpu) { - cputime64_t iowait; + u64 iowait; iowait = kcpustat_cpu(cpu).cpustat[CPUTIME_IOWAIT]; if (cpu_online(cpu) && nr_iowait_cpu(cpu)) @@ -45,32 +45,32 @@ static cputime64_t get_iowait_time(int cpu) static u64 get_idle_time(int cpu) { - u64 idle, idle_time = -1ULL; + u64 idle, idle_usecs = -1ULL; if (cpu_online(cpu)) - idle_time = get_cpu_idle_time_us(cpu, NULL); + idle_usecs = get_cpu_idle_time_us(cpu, NULL); - if (idle_time == -1ULL) + if (idle_usecs == -1ULL) /* !NO_HZ or cpu offline so we can rely on cpustat.idle */ idle = kcpustat_cpu(cpu).cpustat[CPUTIME_IDLE]; else - idle = usecs_to_cputime64(idle_time); + idle = idle_usecs * NSEC_PER_USEC; return idle; } static u64 get_iowait_time(int cpu) { - u64 iowait, iowait_time = -1ULL; + u64 iowait, iowait_usecs = -1ULL; if (cpu_online(cpu)) - iowait_time = get_cpu_iowait_time_us(cpu, NULL); + iowait_usecs = get_cpu_iowait_time_us(cpu, NULL); - if 
(iowait_time == -1ULL) + if (iowait_usecs == -1ULL) /* !NO_HZ or cpu offline so we can rely on cpustat.iowait */ iowait = kcpustat_cpu(cpu).cpustat[CPUTIME_IOWAIT]; else - iowait = usecs_to_cputime64(iowait_time); + iowait = iowait_usecs * NSEC_PER_USEC; return iowait; } @@ -115,16 +115,16 @@ static int show_stat(struct seq_file *p, void *v) } sum += arch_irq_stat(); - seq_put_decimal_ull(p, "cpu ", cputime64_to_clock_t(user)); - seq_put_decimal_ull(p, " ", cputime64_to_clock_t(nice)); - seq_put_decimal_ull(p, " ", cputime64_to_clock_t(system)); - seq_put_decimal_ull(p, " ", cputime64_to_clock_t(idle)); - seq_put_decimal_ull(p, " ", cputime64_to_clock_t(iowait)); - seq_put_decimal_ull(p, " ", cputime64_to_clock_t(irq)); - seq_put_decimal_ull(p, " ", cputime64_to_clock_t(softirq)); - seq_put_decimal_ull(p, " ", cputime64_to_clock_t(steal)); - seq_put_decimal_ull(p, " ", cputime64_to_clock_t(guest)); - seq_put_decimal_ull(p, " ", cputime64_to_clock_t(guest_nice)); + seq_put_decimal_ull(p, "cpu ", nsec_to_clock_t(user)); + seq_put_decimal_ull(p, " ", nsec_to_clock_t(nice)); + seq_put_decimal_ull(p, " ", nsec_to_clock_t(system)); + seq_put_decimal_ull(p, " ", nsec_to_clock_t(idle)); + seq_put_decimal_ull(p, " ", nsec_to_clock_t(iowait)); + seq_put_decimal_ull(p, " ", nsec_to_clock_t(irq)); + seq_put_decimal_ull(p, " ", nsec_to_clock_t(softirq)); + seq_put_decimal_ull(p, " ", nsec_to_clock_t(steal)); + seq_put_decimal_ull(p, " ", nsec_to_clock_t(guest)); + seq_put_decimal_ull(p, " ", nsec_to_clock_t(guest_nice)); seq_putc(p, '\n'); for_each_online_cpu(i) { @@ -140,16 +140,16 @@ static int show_stat(struct seq_file *p, void *v) guest = kcpustat_cpu(i).cpustat[CPUTIME_GUEST]; guest_nice = kcpustat_cpu(i).cpustat[CPUTIME_GUEST_NICE]; seq_printf(p, "cpu%d", i); - seq_put_decimal_ull(p, " ", cputime64_to_clock_t(user)); - seq_put_decimal_ull(p, " ", cputime64_to_clock_t(nice)); - seq_put_decimal_ull(p, " ", cputime64_to_clock_t(system)); - seq_put_decimal_ull(p, " ", 
cputime64_to_clock_t(idle)); - seq_put_decimal_ull(p, " ", cputime64_to_clock_t(iowait)); - seq_put_decimal_ull(p, " ", cputime64_to_clock_t(irq)); - seq_put_decimal_ull(p, " ", cputime64_to_clock_t(softirq)); - seq_put_decimal_ull(p, " ", cputime64_to_clock_t(steal)); - seq_put_decimal_ull(p, " ", cputime64_to_clock_t(guest)); - seq_put_decimal_ull(p, " ", cputime64_to_clock_t(guest_nice)); + seq_put_decimal_ull(p, " ", nsec_to_clock_t(user)); + seq_put_decimal_ull(p, " ", nsec_to_clock_t(nice)); + seq_put_decimal_ull(p, " ", nsec_to_clock_t(system)); + seq_put_decimal_ull(p, " ", nsec_to_clock_t(idle)); + seq_put_decimal_ull(p, " ", nsec_to_clock_t(iowait)); + seq_put_decimal_ull(p, " ", nsec_to_clock_t(irq)); + seq_put_decimal_ull(p, " ", nsec_to_clock_t(softirq)); + seq_put_decimal_ull(p, " ", nsec_to_clock_t(steal)); + seq_put_decimal_ull(p, " ", nsec_to_clock_t(guest)); + seq_put_decimal_ull(p, " ", nsec_to_clock_t(guest_nice)); seq_putc(p, '\n'); } seq_put_decimal_ull(p, "intr ", (unsigned long long)sum); diff --git a/fs/proc/uptime.c b/fs/proc/uptime.c index 33de567c25af..7981c4ffe787 100644 --- a/fs/proc/uptime.c +++ b/fs/proc/uptime.c @@ -5,23 +5,20 @@ #include <linux/seq_file.h> #include <linux/time.h> #include <linux/kernel_stat.h> -#include <linux/cputime.h> static int uptime_proc_show(struct seq_file *m, void *v) { struct timespec uptime; struct timespec idle; - u64 idletime; u64 nsec; u32 rem; int i; - idletime = 0; + nsec = 0; for_each_possible_cpu(i) - idletime += (__force u64) kcpustat_cpu(i).cpustat[CPUTIME_IDLE]; + nsec += (__force u64) kcpustat_cpu(i).cpustat[CPUTIME_IDLE]; get_monotonic_boottime(&uptime); - nsec = cputime64_to_jiffies64(idletime) * TICK_NSEC; idle.tv_sec = div_u64_rem(nsec, NSEC_PER_SEC, &rem); idle.tv_nsec = rem; seq_printf(m, "%lu.%02lu %lu.%02lu\n", diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c index 27c059e1760a..11f918d34b1e 100644 --- a/fs/pstore/ram.c +++ b/fs/pstore/ram.c @@ -133,7 +133,8 @@ 
ramoops_get_next_prz(struct persistent_ram_zone *przs[], uint *c, uint max, struct persistent_ram_zone *prz; int i = (*c)++; - if (i >= max) + /* Give up if we never existed or have hit the end. */ + if (!przs || i >= max) return NULL; prz = przs[i]; diff --git a/fs/pstore/ram_core.c b/fs/pstore/ram_core.c index a857338b7dab..bc927e30bdcc 100644 --- a/fs/pstore/ram_core.c +++ b/fs/pstore/ram_core.c @@ -467,8 +467,7 @@ static int persistent_ram_buffer_map(phys_addr_t start, phys_addr_t size, } static int persistent_ram_post_init(struct persistent_ram_zone *prz, u32 sig, - struct persistent_ram_ecc_info *ecc_info, - unsigned long flags) + struct persistent_ram_ecc_info *ecc_info) { int ret; @@ -494,10 +493,9 @@ static int persistent_ram_post_init(struct persistent_ram_zone *prz, u32 sig, prz->buffer->sig); } + /* Rewind missing or invalid memory area. */ prz->buffer->sig = sig; persistent_ram_zap(prz); - prz->buffer_lock = __RAW_SPIN_LOCK_UNLOCKED(buffer_lock); - prz->flags = flags; return 0; } @@ -533,11 +531,15 @@ struct persistent_ram_zone *persistent_ram_new(phys_addr_t start, size_t size, goto err; } + /* Initialize general buffer state. 
*/ + prz->buffer_lock = __RAW_SPIN_LOCK_UNLOCKED(buffer_lock); + prz->flags = flags; + ret = persistent_ram_buffer_map(start, size, prz, memtype); if (ret) goto err; - ret = persistent_ram_post_init(prz, sig, ecc_info, flags); + ret = persistent_ram_post_init(prz, sig, ecc_info); if (ret) goto err; diff --git a/fs/romfs/super.c b/fs/romfs/super.c index d0f8a38dfafa..0186fe6d39f3 100644 --- a/fs/romfs/super.c +++ b/fs/romfs/super.c @@ -74,6 +74,7 @@ #include <linux/highmem.h> #include <linux/pagemap.h> #include <linux/uaccess.h> +#include <linux/major.h> #include "internal.h" static struct kmem_cache *romfs_inode_cachep; @@ -416,7 +417,22 @@ static void romfs_destroy_inode(struct inode *inode) static int romfs_statfs(struct dentry *dentry, struct kstatfs *buf) { struct super_block *sb = dentry->d_sb; - u64 id = huge_encode_dev(sb->s_bdev->bd_dev); + u64 id = 0; + + /* When calling huge_encode_dev(), + * use sb->s_bdev->bd_dev when, + * - CONFIG_ROMFS_ON_BLOCK defined + * use sb->s_dev when, + * - CONFIG_ROMFS_ON_BLOCK undefined and + * - CONFIG_ROMFS_ON_MTD defined + * leave id as 0 when, + * - CONFIG_ROMFS_ON_BLOCK undefined and + * - CONFIG_ROMFS_ON_MTD undefined + */ + if (sb->s_bdev) + id = huge_encode_dev(sb->s_bdev->bd_dev); + else if (sb->s_dev) + id = huge_encode_dev(sb->s_dev); buf->f_type = ROMFS_MAGIC; buf->f_namelen = ROMFS_MAXFN; @@ -489,6 +505,11 @@ static int romfs_fill_super(struct super_block *sb, void *data, int silent) sb->s_flags |= MS_RDONLY | MS_NOATIME; sb->s_op = &romfs_super_ops; +#ifdef CONFIG_ROMFS_ON_MTD + /* Use same dev ID from the underlying mtdblock device */ + if (sb->s_mtd) + sb->s_dev = MKDEV(MTD_BLOCK_MAJOR, sb->s_mtd->index); +#endif /* read the image superblock and check it */ rsb = kmalloc(512, GFP_KERNEL); if (!rsb) diff --git a/fs/splice.c b/fs/splice.c index 6518f058bd7f..eaafa3d8869a 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -204,6 +204,7 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe, buf->len = 
spd->partial[page_nr].len; buf->private = spd->partial[page_nr].private; buf->ops = spd->ops; + buf->flags = 0; pipe->nrbufs++; page_nr++; diff --git a/fs/super.c b/fs/super.c index 1709ed029a2c..b8b6a086c03b 100644 --- a/fs/super.c +++ b/fs/super.c @@ -469,7 +469,7 @@ struct super_block *sget_userns(struct file_system_type *type, struct super_block *old; int err; - if (!(flags & MS_KERNMOUNT) && + if (!(flags & (MS_KERNMOUNT|MS_SUBMOUNT)) && !(type->fs_flags & FS_USERNS_MOUNT) && !capable(CAP_SYS_ADMIN)) return ERR_PTR(-EPERM); @@ -499,7 +499,7 @@ retry: } if (!s) { spin_unlock(&sb_lock); - s = alloc_super(type, flags, user_ns); + s = alloc_super(type, (flags & ~MS_SUBMOUNT), user_ns); if (!s) return ERR_PTR(-ENOMEM); goto retry; @@ -540,8 +540,15 @@ struct super_block *sget(struct file_system_type *type, { struct user_namespace *user_ns = current_user_ns(); + /* We don't yet pass the user namespace of the parent + * mount through to here so always use &init_user_ns + * until that changes. 
+ */ + if (flags & MS_SUBMOUNT) + user_ns = &init_user_ns; + /* Ensure the requestor has permissions over the target filesystem */ - if (!(flags & MS_KERNMOUNT) && !ns_capable(user_ns, CAP_SYS_ADMIN)) + if (!(flags & (MS_KERNMOUNT|MS_SUBMOUNT)) && !ns_capable(user_ns, CAP_SYS_ADMIN)) return ERR_PTR(-EPERM); return sget_userns(type, test, set, flags, user_ns, data); @@ -1047,7 +1054,7 @@ static int set_bdev_super(struct super_block *s, void *data) * We set the bdi here to the queue backing, file systems can * overwrite this in ->fill_super() */ - s->s_bdi = &bdev_get_queue(s->s_bdev)->backing_dev_info; + s->s_bdi = bdev_get_queue(s->s_bdev)->backing_dev_info; return 0; } diff --git a/fs/timerfd.c b/fs/timerfd.c index c173cc196175..384fa759a563 100644 --- a/fs/timerfd.c +++ b/fs/timerfd.c @@ -40,6 +40,7 @@ struct timerfd_ctx { short unsigned settime_flags; /* to show in fdinfo */ struct rcu_head rcu; struct list_head clist; + spinlock_t cancel_lock; bool might_cancel; }; @@ -112,7 +113,7 @@ void timerfd_clock_was_set(void) rcu_read_unlock(); } -static void timerfd_remove_cancel(struct timerfd_ctx *ctx) +static void __timerfd_remove_cancel(struct timerfd_ctx *ctx) { if (ctx->might_cancel) { ctx->might_cancel = false; @@ -122,6 +123,13 @@ static void timerfd_remove_cancel(struct timerfd_ctx *ctx) } } +static void timerfd_remove_cancel(struct timerfd_ctx *ctx) +{ + spin_lock(&ctx->cancel_lock); + __timerfd_remove_cancel(ctx); + spin_unlock(&ctx->cancel_lock); +} + static bool timerfd_canceled(struct timerfd_ctx *ctx) { if (!ctx->might_cancel || ctx->moffs != KTIME_MAX) @@ -132,6 +140,7 @@ static bool timerfd_canceled(struct timerfd_ctx *ctx) static void timerfd_setup_cancel(struct timerfd_ctx *ctx, int flags) { + spin_lock(&ctx->cancel_lock); if ((ctx->clockid == CLOCK_REALTIME || ctx->clockid == CLOCK_REALTIME_ALARM) && (flags & TFD_TIMER_ABSTIME) && (flags & TFD_TIMER_CANCEL_ON_SET)) { @@ -141,9 +150,10 @@ static void timerfd_setup_cancel(struct timerfd_ctx *ctx, int 
flags) list_add_rcu(&ctx->clist, &cancel_list); spin_unlock(&cancel_lock); } - } else if (ctx->might_cancel) { - timerfd_remove_cancel(ctx); + } else { + __timerfd_remove_cancel(ctx); } + spin_unlock(&ctx->cancel_lock); } static ktime_t timerfd_get_remaining(struct timerfd_ctx *ctx) @@ -400,6 +410,7 @@ SYSCALL_DEFINE2(timerfd_create, int, clockid, int, flags) return -ENOMEM; init_waitqueue_head(&ctx->wqh); + spin_lock_init(&ctx->cancel_lock); ctx->clockid = clockid; if (isalarm(ctx)) diff --git a/fs/ubifs/Kconfig b/fs/ubifs/Kconfig index 0a908ae7af13..b0d0623c83ed 100644 --- a/fs/ubifs/Kconfig +++ b/fs/ubifs/Kconfig @@ -53,7 +53,7 @@ config UBIFS_ATIME_SUPPORT config UBIFS_FS_ENCRYPTION bool "UBIFS Encryption" - depends on UBIFS_FS + depends on UBIFS_FS && BLOCK select FS_ENCRYPTION default n help diff --git a/fs/ubifs/crypto.c b/fs/ubifs/crypto.c index 3402720f2b28..382ed428cfd2 100644 --- a/fs/ubifs/crypto.c +++ b/fs/ubifs/crypto.c @@ -26,15 +26,6 @@ static unsigned int ubifs_crypt_max_namelen(struct inode *inode) return UBIFS_MAX_NLEN; } -static int ubifs_key_prefix(struct inode *inode, u8 **key) -{ - static char prefix[] = "ubifs:"; - - *key = prefix; - - return sizeof(prefix) - 1; -} - int ubifs_encrypt(const struct inode *inode, struct ubifs_data_node *dn, unsigned int in_len, unsigned int *out_len, int block) { @@ -86,12 +77,12 @@ int ubifs_decrypt(const struct inode *inode, struct ubifs_data_node *dn, return 0; } -struct fscrypt_operations ubifs_crypt_operations = { +const struct fscrypt_operations ubifs_crypt_operations = { .flags = FS_CFLG_OWN_PAGES, + .key_prefix = "ubifs:", .get_context = ubifs_crypt_get_context, .set_context = ubifs_crypt_set_context, .is_encrypted = __ubifs_crypt_is_encrypted, .empty_dir = ubifs_crypt_empty_dir, .max_namelen = ubifs_crypt_max_namelen, - .key_prefix = ubifs_key_prefix, }; diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index 1c5331ac9614..528369f3e472 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -390,16 +390,6 @@ 
static int do_tmpfile(struct inode *dir, struct dentry *dentry, dbg_gen("dent '%pd', mode %#hx in dir ino %lu", dentry, mode, dir->i_ino); - if (ubifs_crypt_is_encrypted(dir)) { - err = fscrypt_get_encryption_info(dir); - if (err) - return err; - - if (!fscrypt_has_encryption_key(dir)) { - return -EPERM; - } - } - err = fscrypt_setup_filename(dir, &dentry->d_name, 0, &nm); if (err) return err; @@ -741,17 +731,9 @@ static int ubifs_link(struct dentry *old_dentry, struct inode *dir, ubifs_assert(inode_is_locked(dir)); ubifs_assert(inode_is_locked(inode)); - if (ubifs_crypt_is_encrypted(dir)) { - if (!fscrypt_has_permitted_context(dir, inode)) - return -EPERM; - - err = fscrypt_get_encryption_info(inode); - if (err) - return err; - - if (!fscrypt_has_encryption_key(inode)) - return -EPERM; - } + if (ubifs_crypt_is_encrypted(dir) && + !fscrypt_has_permitted_context(dir, inode)) + return -EPERM; err = fscrypt_setup_filename(dir, &dentry->d_name, 0, &nm); if (err) @@ -1000,17 +982,6 @@ static int ubifs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) if (err) return err; - if (ubifs_crypt_is_encrypted(dir)) { - err = fscrypt_get_encryption_info(dir); - if (err) - goto out_budg; - - if (!fscrypt_has_encryption_key(dir)) { - err = -EPERM; - goto out_budg; - } - } - err = fscrypt_setup_filename(dir, &dentry->d_name, 0, &nm); if (err) goto out_budg; @@ -1096,17 +1067,6 @@ static int ubifs_mknod(struct inode *dir, struct dentry *dentry, return err; } - if (ubifs_crypt_is_encrypted(dir)) { - err = fscrypt_get_encryption_info(dir); - if (err) - goto out_budg; - - if (!fscrypt_has_encryption_key(dir)) { - err = -EPERM; - goto out_budg; - } - } - err = fscrypt_setup_filename(dir, &dentry->d_name, 0, &nm); if (err) goto out_budg; @@ -1231,18 +1191,6 @@ static int ubifs_symlink(struct inode *dir, struct dentry *dentry, goto out_inode; } - err = fscrypt_get_encryption_info(inode); - if (err) { - kfree(sd); - goto out_inode; - } - - if 
(!fscrypt_has_encryption_key(inode)) { - kfree(sd); - err = -EPERM; - goto out_inode; - } - ostr.name = sd->encrypted_path; ostr.len = disk_link.len; diff --git a/fs/ubifs/ioctl.c b/fs/ubifs/ioctl.c index 78d713644df3..da519ba205f6 100644 --- a/fs/ubifs/ioctl.c +++ b/fs/ubifs/ioctl.c @@ -217,6 +217,9 @@ long ubifs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case FS_IOC32_SETFLAGS: cmd = FS_IOC_SETFLAGS; break; + case FS_IOC_SET_ENCRYPTION_POLICY: + case FS_IOC_GET_ENCRYPTION_POLICY: + break; default: return -ENOIOCTLCMD; } diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c index a459211a1c21..294519b98874 100644 --- a/fs/ubifs/journal.c +++ b/fs/ubifs/journal.c @@ -744,6 +744,7 @@ int ubifs_jnl_write_data(struct ubifs_info *c, const struct inode *inode, } else { data->compr_size = 0; + out_len = compr_len; } dlen = UBIFS_DATA_NODE_SZ + out_len; @@ -1319,6 +1320,7 @@ static int truncate_data_node(const struct ubifs_info *c, const struct inode *in dn->compr_type = cpu_to_le16(compr_type); dn->size = cpu_to_le32(*new_len); *new_len = UBIFS_DATA_NODE_SZ + out_len; + err = 0; out: kfree(buf); return err; diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index e08aa04fc835..b73811bd7676 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -2000,7 +2000,7 @@ static struct ubifs_info *alloc_ubifs_info(struct ubi_volume_desc *ubi) } #ifndef CONFIG_UBIFS_FS_ENCRYPTION -struct fscrypt_operations ubifs_crypt_operations = { +const struct fscrypt_operations ubifs_crypt_operations = { .is_encrypted = __ubifs_crypt_is_encrypted, }; #endif diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c index 74ae2de949df..709aa098dd46 100644 --- a/fs/ubifs/tnc.c +++ b/fs/ubifs/tnc.c @@ -34,6 +34,11 @@ #include <linux/slab.h> #include "ubifs.h" +static int try_read_node(const struct ubifs_info *c, void *buf, int type, + int len, int lnum, int offs); +static int fallible_read_node(struct ubifs_info *c, const union ubifs_key *key, + struct ubifs_zbranch *zbr, void *node); + 
/* * Returned codes of 'matches_name()' and 'fallible_matches_name()' functions. * @NAME_LESS: name corresponding to the first argument is less than second @@ -402,7 +407,19 @@ static int tnc_read_hashed_node(struct ubifs_info *c, struct ubifs_zbranch *zbr, return 0; } - err = ubifs_tnc_read_node(c, zbr, node); + if (c->replaying) { + err = fallible_read_node(c, &zbr->key, zbr, node); + /* + * When the node was not found, return -ENOENT, 0 otherwise. + * Negative return codes stay as-is. + */ + if (err == 0) + err = -ENOENT; + else if (err == 1) + err = 0; + } else { + err = ubifs_tnc_read_node(c, zbr, node); + } if (err) return err; @@ -2857,7 +2874,11 @@ struct ubifs_dent_node *ubifs_tnc_next_ent(struct ubifs_info *c, if (fname_len(nm) > 0) { if (err) { /* Handle collisions */ - err = resolve_collision(c, key, &znode, &n, nm); + if (c->replaying) + err = fallible_resolve_collision(c, key, &znode, &n, + nm, 0); + else + err = resolve_collision(c, key, &znode, &n, nm); dbg_tnc("rc returned %d, znode %p, n %d", err, znode, n); if (unlikely(err < 0)) diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h index ca72382ce6cc..f0c86f076535 100644 --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h @@ -38,7 +38,11 @@ #include <linux/backing-dev.h> #include <linux/security.h> #include <linux/xattr.h> -#include <linux/fscrypto.h> +#ifdef CONFIG_UBIFS_FS_ENCRYPTION +#include <linux/fscrypt_supp.h> +#else +#include <linux/fscrypt_notsupp.h> +#endif #include <linux/random.h> #include "ubifs-media.h" @@ -1797,28 +1801,6 @@ int ubifs_decompress(const struct ubifs_info *c, const void *buf, int len, #include "key.h" #ifndef CONFIG_UBIFS_FS_ENCRYPTION -#define fscrypt_set_d_op(i) -#define fscrypt_get_ctx fscrypt_notsupp_get_ctx -#define fscrypt_release_ctx fscrypt_notsupp_release_ctx -#define fscrypt_encrypt_page fscrypt_notsupp_encrypt_page -#define fscrypt_decrypt_page fscrypt_notsupp_decrypt_page -#define fscrypt_decrypt_bio_pages fscrypt_notsupp_decrypt_bio_pages -#define 
fscrypt_pullback_bio_page fscrypt_notsupp_pullback_bio_page -#define fscrypt_restore_control_page fscrypt_notsupp_restore_control_page -#define fscrypt_zeroout_range fscrypt_notsupp_zeroout_range -#define fscrypt_ioctl_set_policy fscrypt_notsupp_ioctl_set_policy -#define fscrypt_ioctl_get_policy fscrypt_notsupp_ioctl_get_policy -#define fscrypt_has_permitted_context fscrypt_notsupp_has_permitted_context -#define fscrypt_inherit_context fscrypt_notsupp_inherit_context -#define fscrypt_get_encryption_info fscrypt_notsupp_get_encryption_info -#define fscrypt_put_encryption_info fscrypt_notsupp_put_encryption_info -#define fscrypt_setup_filename fscrypt_notsupp_setup_filename -#define fscrypt_free_filename fscrypt_notsupp_free_filename -#define fscrypt_fname_encrypted_size fscrypt_notsupp_fname_encrypted_size -#define fscrypt_fname_alloc_buffer fscrypt_notsupp_fname_alloc_buffer -#define fscrypt_fname_free_buffer fscrypt_notsupp_fname_free_buffer -#define fscrypt_fname_disk_to_usr fscrypt_notsupp_fname_disk_to_usr -#define fscrypt_fname_usr_to_disk fscrypt_notsupp_fname_usr_to_disk static inline int ubifs_encrypt(const struct inode *inode, struct ubifs_data_node *dn, unsigned int in_len, unsigned int *out_len, @@ -1842,7 +1824,7 @@ int ubifs_decrypt(const struct inode *inode, struct ubifs_data_node *dn, unsigned int *out_len, int block); #endif -extern struct fscrypt_operations ubifs_crypt_operations; +extern const struct fscrypt_operations ubifs_crypt_operations; static inline bool __ubifs_crypt_is_encrypted(struct inode *inode) { diff --git a/fs/udf/ecma_167.h b/fs/udf/ecma_167.h index 4792b771aa80..9f24bd1a9f44 100644 --- a/fs/udf/ecma_167.h +++ b/fs/udf/ecma_167.h @@ -41,7 +41,7 @@ struct charspec { uint8_t charSetType; uint8_t charSetInfo[63]; -} __attribute__ ((packed)); +} __packed; /* Character Set Type (ECMA 167r3 1/7.2.1.1) */ #define CHARSPEC_TYPE_CS0 0x00 /* (1/7.2.2) */ @@ -68,7 +68,7 @@ struct timestamp { uint8_t centiseconds; uint8_t 
hundredsOfMicroseconds; uint8_t microseconds; -} __attribute__ ((packed)); +} __packed; /* Type and Time Zone (ECMA 167r3 1/7.3.1) */ #define TIMESTAMP_TYPE_MASK 0xF000 @@ -82,7 +82,7 @@ struct regid { uint8_t flags; uint8_t ident[23]; uint8_t identSuffix[8]; -} __attribute__ ((packed)); +} __packed; /* Flags (ECMA 167r3 1/7.4.1) */ #define ENTITYID_FLAGS_DIRTY 0x00 @@ -95,7 +95,7 @@ struct volStructDesc { uint8_t stdIdent[VSD_STD_ID_LEN]; uint8_t structVersion; uint8_t structData[2041]; -} __attribute__ ((packed)); +} __packed; /* Standard Identifier (EMCA 167r2 2/9.1.2) */ #define VSD_STD_ID_NSR02 "NSR02" /* (3/9.1) */ @@ -114,7 +114,7 @@ struct beginningExtendedAreaDesc { uint8_t stdIdent[VSD_STD_ID_LEN]; uint8_t structVersion; uint8_t structData[2041]; -} __attribute__ ((packed)); +} __packed; /* Terminating Extended Area Descriptor (ECMA 167r3 2/9.3) */ struct terminatingExtendedAreaDesc { @@ -122,7 +122,7 @@ struct terminatingExtendedAreaDesc { uint8_t stdIdent[VSD_STD_ID_LEN]; uint8_t structVersion; uint8_t structData[2041]; -} __attribute__ ((packed)); +} __packed; /* Boot Descriptor (ECMA 167r3 2/9.4) */ struct bootDesc { @@ -140,7 +140,7 @@ struct bootDesc { __le16 flags; uint8_t reserved2[32]; uint8_t bootUse[1906]; -} __attribute__ ((packed)); +} __packed; /* Flags (ECMA 167r3 2/9.4.12) */ #define BOOT_FLAGS_ERASE 0x01 @@ -149,7 +149,7 @@ struct bootDesc { struct extent_ad { __le32 extLength; __le32 extLocation; -} __attribute__ ((packed)); +} __packed; struct kernel_extent_ad { uint32_t extLength; @@ -166,7 +166,7 @@ struct tag { __le16 descCRC; __le16 descCRCLength; __le32 tagLocation; -} __attribute__ ((packed)); +} __packed; /* Tag Identifier (ECMA 167r3 3/7.2.1) */ #define TAG_IDENT_PVD 0x0001 @@ -186,7 +186,7 @@ struct NSRDesc { uint8_t structVersion; uint8_t reserved; uint8_t structData[2040]; -} __attribute__ ((packed)); +} __packed; /* Primary Volume Descriptor (ECMA 167r3 3/10.1) */ struct primaryVolDesc { @@ -212,7 +212,7 @@ struct 
primaryVolDesc { __le32 predecessorVolDescSeqLocation; __le16 flags; uint8_t reserved[22]; -} __attribute__ ((packed)); +} __packed; /* Flags (ECMA 167r3 3/10.1.21) */ #define PVD_FLAGS_VSID_COMMON 0x0001 @@ -223,7 +223,7 @@ struct anchorVolDescPtr { struct extent_ad mainVolDescSeqExt; struct extent_ad reserveVolDescSeqExt; uint8_t reserved[480]; -} __attribute__ ((packed)); +} __packed; /* Volume Descriptor Pointer (ECMA 167r3 3/10.3) */ struct volDescPtr { @@ -231,7 +231,7 @@ struct volDescPtr { __le32 volDescSeqNum; struct extent_ad nextVolDescSeqExt; uint8_t reserved[484]; -} __attribute__ ((packed)); +} __packed; /* Implementation Use Volume Descriptor (ECMA 167r3 3/10.4) */ struct impUseVolDesc { @@ -239,7 +239,7 @@ struct impUseVolDesc { __le32 volDescSeqNum; struct regid impIdent; uint8_t impUse[460]; -} __attribute__ ((packed)); +} __packed; /* Partition Descriptor (ECMA 167r3 3/10.5) */ struct partitionDesc { @@ -255,7 +255,7 @@ struct partitionDesc { struct regid impIdent; uint8_t impUse[128]; uint8_t reserved[156]; -} __attribute__ ((packed)); +} __packed; /* Partition Flags (ECMA 167r3 3/10.5.3) */ #define PD_PARTITION_FLAGS_ALLOC 0x0001 @@ -291,14 +291,14 @@ struct logicalVolDesc { uint8_t impUse[128]; struct extent_ad integritySeqExt; uint8_t partitionMaps[0]; -} __attribute__ ((packed)); +} __packed; /* Generic Partition Map (ECMA 167r3 3/10.7.1) */ struct genericPartitionMap { uint8_t partitionMapType; uint8_t partitionMapLength; uint8_t partitionMapping[0]; -} __attribute__ ((packed)); +} __packed; /* Partition Map Type (ECMA 167r3 3/10.7.1.1) */ #define GP_PARTITION_MAP_TYPE_UNDEF 0x00 @@ -311,14 +311,14 @@ struct genericPartitionMap1 { uint8_t partitionMapLength; __le16 volSeqNum; __le16 partitionNum; -} __attribute__ ((packed)); +} __packed; /* Type 2 Partition Map (ECMA 167r3 3/10.7.3) */ struct genericPartitionMap2 { uint8_t partitionMapType; uint8_t partitionMapLength; uint8_t partitionIdent[62]; -} __attribute__ ((packed)); +} __packed; /* 
Unallocated Space Descriptor (ECMA 167r3 3/10.8) */ struct unallocSpaceDesc { @@ -326,13 +326,13 @@ struct unallocSpaceDesc { __le32 volDescSeqNum; __le32 numAllocDescs; struct extent_ad allocDescs[0]; -} __attribute__ ((packed)); +} __packed; /* Terminating Descriptor (ECMA 167r3 3/10.9) */ struct terminatingDesc { struct tag descTag; uint8_t reserved[496]; -} __attribute__ ((packed)); +} __packed; /* Logical Volume Integrity Descriptor (ECMA 167r3 3/10.10) */ struct logicalVolIntegrityDesc { @@ -346,7 +346,7 @@ struct logicalVolIntegrityDesc { __le32 freeSpaceTable[0]; __le32 sizeTable[0]; uint8_t impUse[0]; -} __attribute__ ((packed)); +} __packed; /* Integrity Type (ECMA 167r3 3/10.10.3) */ #define LVID_INTEGRITY_TYPE_OPEN 0x00000000 @@ -356,7 +356,7 @@ struct logicalVolIntegrityDesc { struct lb_addr { __le32 logicalBlockNum; __le16 partitionReferenceNum; -} __attribute__ ((packed)); +} __packed; /* ... and its in-core analog */ struct kernel_lb_addr { @@ -368,14 +368,14 @@ struct kernel_lb_addr { struct short_ad { __le32 extLength; __le32 extPosition; -} __attribute__ ((packed)); +} __packed; /* Long Allocation Descriptor (ECMA 167r3 4/14.14.2) */ struct long_ad { __le32 extLength; struct lb_addr extLocation; uint8_t impUse[6]; -} __attribute__ ((packed)); +} __packed; struct kernel_long_ad { uint32_t extLength; @@ -389,7 +389,7 @@ struct ext_ad { __le32 recordedLength; __le32 informationLength; struct lb_addr extLocation; -} __attribute__ ((packed)); +} __packed; struct kernel_ext_ad { uint32_t extLength; @@ -434,7 +434,7 @@ struct fileSetDesc { struct long_ad nextExt; struct long_ad streamDirectoryICB; uint8_t reserved[32]; -} __attribute__ ((packed)); +} __packed; /* Partition Header Descriptor (ECMA 167r3 4/14.3) */ struct partitionHeaderDesc { @@ -444,7 +444,7 @@ struct partitionHeaderDesc { struct short_ad freedSpaceTable; struct short_ad freedSpaceBitmap; uint8_t reserved[88]; -} __attribute__ ((packed)); +} __packed; /* File Identifier Descriptor (ECMA 
167r3 4/14.4) */ struct fileIdentDesc { @@ -457,7 +457,7 @@ struct fileIdentDesc { uint8_t impUse[0]; uint8_t fileIdent[0]; uint8_t padding[0]; -} __attribute__ ((packed)); +} __packed; /* File Characteristics (ECMA 167r3 4/14.4.3) */ #define FID_FILE_CHAR_HIDDEN 0x01 @@ -471,7 +471,7 @@ struct allocExtDesc { struct tag descTag; __le32 previousAllocExtLocation; __le32 lengthAllocDescs; -} __attribute__ ((packed)); +} __packed; /* ICB Tag (ECMA 167r3 4/14.6) */ struct icbtag { @@ -483,7 +483,7 @@ struct icbtag { uint8_t fileType; struct lb_addr parentICBLocation; __le16 flags; -} __attribute__ ((packed)); +} __packed; /* Strategy Type (ECMA 167r3 4/14.6.2) */ #define ICBTAG_STRATEGY_TYPE_UNDEF 0x0000 @@ -531,13 +531,13 @@ struct indirectEntry { struct tag descTag; struct icbtag icbTag; struct long_ad indirectICB; -} __attribute__ ((packed)); +} __packed; /* Terminal Entry (ECMA 167r3 4/14.8) */ struct terminalEntry { struct tag descTag; struct icbtag icbTag; -} __attribute__ ((packed)); +} __packed; /* File Entry (ECMA 167r3 4/14.9) */ struct fileEntry { @@ -563,7 +563,7 @@ struct fileEntry { __le32 lengthAllocDescs; uint8_t extendedAttr[0]; uint8_t allocDescs[0]; -} __attribute__ ((packed)); +} __packed; /* Permissions (ECMA 167r3 4/14.9.5) */ #define FE_PERM_O_EXEC 0x00000001U @@ -607,7 +607,7 @@ struct extendedAttrHeaderDesc { struct tag descTag; __le32 impAttrLocation; __le32 appAttrLocation; -} __attribute__ ((packed)); +} __packed; /* Generic Format (ECMA 167r3 4/14.10.2) */ struct genericFormat { @@ -616,7 +616,7 @@ struct genericFormat { uint8_t reserved[3]; __le32 attrLength; uint8_t attrData[0]; -} __attribute__ ((packed)); +} __packed; /* Character Set Information (ECMA 167r3 4/14.10.3) */ struct charSetInfo { @@ -627,7 +627,7 @@ struct charSetInfo { __le32 escapeSeqLength; uint8_t charSetType; uint8_t escapeSeq[0]; -} __attribute__ ((packed)); +} __packed; /* Alternate Permissions (ECMA 167r3 4/14.10.4) */ struct altPerms { @@ -638,7 +638,7 @@ struct 
altPerms { __le16 ownerIdent; __le16 groupIdent; __le16 permission; -} __attribute__ ((packed)); +} __packed; /* File Times Extended Attribute (ECMA 167r3 4/14.10.5) */ struct fileTimesExtAttr { @@ -649,7 +649,7 @@ struct fileTimesExtAttr { __le32 dataLength; __le32 fileTimeExistence; uint8_t fileTimes; -} __attribute__ ((packed)); +} __packed; /* FileTimeExistence (ECMA 167r3 4/14.10.5.6) */ #define FTE_CREATION 0x00000001 @@ -666,7 +666,7 @@ struct infoTimesExtAttr { __le32 dataLength; __le32 infoTimeExistence; uint8_t infoTimes[0]; -} __attribute__ ((packed)); +} __packed; /* Device Specification (ECMA 167r3 4/14.10.7) */ struct deviceSpec { @@ -678,7 +678,7 @@ struct deviceSpec { __le32 majorDeviceIdent; __le32 minorDeviceIdent; uint8_t impUse[0]; -} __attribute__ ((packed)); +} __packed; /* Implementation Use Extended Attr (ECMA 167r3 4/14.10.8) */ struct impUseExtAttr { @@ -689,7 +689,7 @@ struct impUseExtAttr { __le32 impUseLength; struct regid impIdent; uint8_t impUse[0]; -} __attribute__ ((packed)); +} __packed; /* Application Use Extended Attribute (ECMA 167r3 4/14.10.9) */ struct appUseExtAttr { @@ -700,7 +700,7 @@ struct appUseExtAttr { __le32 appUseLength; struct regid appIdent; uint8_t appUse[0]; -} __attribute__ ((packed)); +} __packed; #define EXTATTR_CHAR_SET 1 #define EXTATTR_ALT_PERMS 3 @@ -716,7 +716,7 @@ struct unallocSpaceEntry { struct icbtag icbTag; __le32 lengthAllocDescs; uint8_t allocDescs[0]; -} __attribute__ ((packed)); +} __packed; /* Space Bitmap Descriptor (ECMA 167r3 4/14.12) */ struct spaceBitmapDesc { @@ -724,7 +724,7 @@ struct spaceBitmapDesc { __le32 numOfBits; __le32 numOfBytes; uint8_t bitmap[0]; -} __attribute__ ((packed)); +} __packed; /* Partition Integrity Entry (ECMA 167r3 4/14.13) */ struct partitionIntegrityEntry { @@ -735,7 +735,7 @@ struct partitionIntegrityEntry { uint8_t reserved[175]; struct regid impIdent; uint8_t impUse[256]; -} __attribute__ ((packed)); +} __packed; /* Short Allocation Descriptor (ECMA 167r3 
4/14.14.1) */ @@ -753,7 +753,7 @@ struct partitionIntegrityEntry { struct logicalVolHeaderDesc { __le64 uniqueID; uint8_t reserved[24]; -} __attribute__ ((packed)); +} __packed; /* Path Component (ECMA 167r3 4/14.16.1) */ struct pathComponent { @@ -761,7 +761,7 @@ struct pathComponent { uint8_t lengthComponentIdent; __le16 componentFileVersionNum; dstring componentIdent[0]; -} __attribute__ ((packed)); +} __packed; /* File Entry (ECMA 167r3 4/14.17) */ struct extendedFileEntry { @@ -791,6 +791,6 @@ struct extendedFileEntry { __le32 lengthAllocDescs; uint8_t extendedAttr[0]; uint8_t allocDescs[0]; -} __attribute__ ((packed)); +} __packed; #endif /* _ECMA_167_H */ diff --git a/fs/udf/file.c b/fs/udf/file.c index dbcb3a4a0cb9..e04cc0cdca9d 100644 --- a/fs/udf/file.c +++ b/fs/udf/file.c @@ -176,54 +176,46 @@ long udf_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { struct inode *inode = file_inode(filp); long old_block, new_block; - int result = -EINVAL; + int result; if (inode_permission(inode, MAY_READ) != 0) { udf_debug("no permission to access inode %lu\n", inode->i_ino); - result = -EPERM; - goto out; + return -EPERM; } - if (!arg) { + if (!arg && ((cmd == UDF_GETVOLIDENT) || (cmd == UDF_GETEASIZE) || + (cmd == UDF_RELOCATE_BLOCKS) || (cmd == UDF_GETEABLOCK))) { udf_debug("invalid argument to udf_ioctl\n"); - result = -EINVAL; - goto out; + return -EINVAL; } switch (cmd) { case UDF_GETVOLIDENT: if (copy_to_user((char __user *)arg, UDF_SB(inode->i_sb)->s_volume_ident, 32)) - result = -EFAULT; - else - result = 0; - goto out; + return -EFAULT; + return 0; case UDF_RELOCATE_BLOCKS: - if (!capable(CAP_SYS_ADMIN)) { - result = -EPERM; - goto out; - } - if (get_user(old_block, (long __user *)arg)) { - result = -EFAULT; - goto out; - } + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + if (get_user(old_block, (long __user *)arg)) + return -EFAULT; result = udf_relocate_blocks(inode->i_sb, old_block, &new_block); if (result == 0) result = put_user(new_block, 
(long __user *)arg); - goto out; + return result; case UDF_GETEASIZE: - result = put_user(UDF_I(inode)->i_lenEAttr, (int __user *)arg); - goto out; + return put_user(UDF_I(inode)->i_lenEAttr, (int __user *)arg); case UDF_GETEABLOCK: - result = copy_to_user((char __user *)arg, - UDF_I(inode)->i_ext.i_data, - UDF_I(inode)->i_lenEAttr) ? -EFAULT : 0; - goto out; + return copy_to_user((char __user *)arg, + UDF_I(inode)->i_ext.i_data, + UDF_I(inode)->i_lenEAttr) ? -EFAULT : 0; + default: + return -ENOIOCTLCMD; } -out: - return result; + return 0; } static int udf_release_file(struct inode *inode, struct file *filp) diff --git a/fs/udf/inode.c b/fs/udf/inode.c index 0f3db71753aa..8ec6b3df0bc7 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -43,10 +43,6 @@ #include "udf_i.h" #include "udf_sb.h" -MODULE_AUTHOR("Ben Fennema"); -MODULE_DESCRIPTION("Universal Disk Format Filesystem"); -MODULE_LICENSE("GPL"); - #define EXTENT_MERGE_SIZE 5 static umode_t udf_convert_permissions(struct fileEntry *); @@ -57,14 +53,12 @@ static sector_t inode_getblk(struct inode *, sector_t, int *, int *); static int8_t udf_insert_aext(struct inode *, struct extent_position, struct kernel_lb_addr, uint32_t); static void udf_split_extents(struct inode *, int *, int, int, - struct kernel_long_ad[EXTENT_MERGE_SIZE], int *); + struct kernel_long_ad *, int *); static void udf_prealloc_extents(struct inode *, int, int, - struct kernel_long_ad[EXTENT_MERGE_SIZE], int *); -static void udf_merge_extents(struct inode *, - struct kernel_long_ad[EXTENT_MERGE_SIZE], int *); -static void udf_update_extents(struct inode *, - struct kernel_long_ad[EXTENT_MERGE_SIZE], int, int, - struct extent_position *); + struct kernel_long_ad *, int *); +static void udf_merge_extents(struct inode *, struct kernel_long_ad *, int *); +static void udf_update_extents(struct inode *, struct kernel_long_ad *, int, + int, struct extent_position *); static int udf_get_block(struct inode *, sector_t, struct buffer_head *, int); 
static void __udf_clear_extent_cache(struct inode *inode) @@ -111,7 +105,7 @@ static int udf_read_extent_cache(struct inode *inode, loff_t bcount, /* Add extent to extent cache */ static void udf_update_extent_cache(struct inode *inode, loff_t estart, - struct extent_position *pos, int next_epos) + struct extent_position *pos) { struct udf_inode_info *iinfo = UDF_I(inode); @@ -120,19 +114,16 @@ static void udf_update_extent_cache(struct inode *inode, loff_t estart, __udf_clear_extent_cache(inode); if (pos->bh) get_bh(pos->bh); - memcpy(&iinfo->cached_extent.epos, pos, - sizeof(struct extent_position)); + memcpy(&iinfo->cached_extent.epos, pos, sizeof(struct extent_position)); iinfo->cached_extent.lstart = estart; - if (next_epos) - switch (iinfo->i_alloc_type) { - case ICBTAG_FLAG_AD_SHORT: - iinfo->cached_extent.epos.offset -= - sizeof(struct short_ad); - break; - case ICBTAG_FLAG_AD_LONG: - iinfo->cached_extent.epos.offset -= - sizeof(struct long_ad); - } + switch (iinfo->i_alloc_type) { + case ICBTAG_FLAG_AD_SHORT: + iinfo->cached_extent.epos.offset -= sizeof(struct short_ad); + break; + case ICBTAG_FLAG_AD_LONG: + iinfo->cached_extent.epos.offset -= sizeof(struct long_ad); + break; + } spin_unlock(&iinfo->i_extent_cache_lock); } @@ -747,11 +738,8 @@ static sector_t inode_getblk(struct inode *inode, sector_t block, ~(inode->i_sb->s_blocksize - 1)); udf_write_aext(inode, &cur_epos, &eloc, elen, 1); } - brelse(prev_epos.bh); - brelse(cur_epos.bh); - brelse(next_epos.bh); newblock = udf_get_lb_pblock(inode->i_sb, &eloc, offset); - return newblock; + goto out_free; } /* Are we beyond EOF? 
*/ @@ -774,11 +762,9 @@ static sector_t inode_getblk(struct inode *inode, sector_t block, /* Create extents for the hole between EOF and offset */ ret = udf_do_extend_file(inode, &prev_epos, laarr, offset); if (ret < 0) { - brelse(prev_epos.bh); - brelse(cur_epos.bh); - brelse(next_epos.bh); *err = ret; - return 0; + newblock = 0; + goto out_free; } c = 0; offset = 0; @@ -841,11 +827,9 @@ static sector_t inode_getblk(struct inode *inode, sector_t block, iinfo->i_location.partitionReferenceNum, goal, err); if (!newblocknum) { - brelse(prev_epos.bh); - brelse(cur_epos.bh); - brelse(next_epos.bh); *err = -ENOSPC; - return 0; + newblock = 0; + goto out_free; } if (isBeyondEOF) iinfo->i_lenExtents += inode->i_sb->s_blocksize; @@ -857,14 +841,12 @@ static sector_t inode_getblk(struct inode *inode, sector_t block, * block */ udf_split_extents(inode, &c, offset, newblocknum, laarr, &endnum); -#ifdef UDF_PREALLOCATE /* We preallocate blocks only for regular files. It also makes sense * for directories but there's a problem when to drop the * preallocation. We might use some delayed work for that but I feel * it's overengineering for a filesystem like UDF. 
*/ if (S_ISREG(inode->i_mode)) udf_prealloc_extents(inode, c, lastblock, laarr, &endnum); -#endif /* merge any continuous blocks in laarr */ udf_merge_extents(inode, laarr, &endnum); @@ -874,15 +856,11 @@ static sector_t inode_getblk(struct inode *inode, sector_t block, * the new number of extents is less than the old number */ udf_update_extents(inode, laarr, startnum, endnum, &prev_epos); - brelse(prev_epos.bh); - brelse(cur_epos.bh); - brelse(next_epos.bh); - newblock = udf_get_pblock(inode->i_sb, newblocknum, iinfo->i_location.partitionReferenceNum, 0); if (!newblock) { *err = -EIO; - return 0; + goto out_free; } *new = 1; iinfo->i_next_alloc_block = block; @@ -893,13 +871,15 @@ static sector_t inode_getblk(struct inode *inode, sector_t block, udf_sync_inode(inode); else mark_inode_dirty(inode); - +out_free: + brelse(prev_epos.bh); + brelse(cur_epos.bh); + brelse(next_epos.bh); return newblock; } static void udf_split_extents(struct inode *inode, int *c, int offset, - int newblocknum, - struct kernel_long_ad laarr[EXTENT_MERGE_SIZE], + int newblocknum, struct kernel_long_ad *laarr, int *endnum) { unsigned long blocksize = inode->i_sb->s_blocksize; @@ -963,7 +943,7 @@ static void udf_split_extents(struct inode *inode, int *c, int offset, } static void udf_prealloc_extents(struct inode *inode, int c, int lastblock, - struct kernel_long_ad laarr[EXTENT_MERGE_SIZE], + struct kernel_long_ad *laarr, int *endnum) { int start, length = 0, currlength = 0, i; @@ -1058,8 +1038,7 @@ static void udf_prealloc_extents(struct inode *inode, int c, int lastblock, } } -static void udf_merge_extents(struct inode *inode, - struct kernel_long_ad laarr[EXTENT_MERGE_SIZE], +static void udf_merge_extents(struct inode *inode, struct kernel_long_ad *laarr, int *endnum) { int i; @@ -1158,8 +1137,7 @@ static void udf_merge_extents(struct inode *inode, } } -static void udf_update_extents(struct inode *inode, - struct kernel_long_ad laarr[EXTENT_MERGE_SIZE], +static void 
udf_update_extents(struct inode *inode, struct kernel_long_ad *laarr, int startnum, int endnum, struct extent_position *epos) { @@ -1299,6 +1277,12 @@ static int udf_read_inode(struct inode *inode, bool hidden_inode) int ret = -EIO; reread: + if (iloc->partitionReferenceNum >= sbi->s_partitions) { + udf_debug("partition reference: %d > logical volume partitions: %d\n", + iloc->partitionReferenceNum, sbi->s_partitions); + return -EIO; + } + if (iloc->logicalBlockNum >= sbi->s_partmaps[iloc->partitionReferenceNum].s_partition_len) { udf_debug("block=%d, partition=%d out of range\n", @@ -1549,7 +1533,7 @@ reread: break; case ICBTAG_FILE_TYPE_SYMLINK: inode->i_data.a_ops = &udf_symlink_aops; - inode->i_op = &page_symlink_inode_operations; + inode->i_op = &udf_symlink_inode_operations; inode_nohighmem(inode); inode->i_mode = S_IFLNK | S_IRWXUGO; break; @@ -1627,6 +1611,14 @@ static int udf_sync_inode(struct inode *inode) return udf_update_inode(inode, 1); } +static void udf_adjust_time(struct udf_inode_info *iinfo, struct timespec time) +{ + if (iinfo->i_crtime.tv_sec > time.tv_sec || + (iinfo->i_crtime.tv_sec == time.tv_sec && + iinfo->i_crtime.tv_nsec > time.tv_nsec)) + iinfo->i_crtime = time; +} + static int udf_update_inode(struct inode *inode, int do_sync) { struct buffer_head *bh = NULL; @@ -1753,20 +1745,9 @@ static int udf_update_inode(struct inode *inode, int do_sync) efe->objectSize = cpu_to_le64(inode->i_size); efe->logicalBlocksRecorded = cpu_to_le64(lb_recorded); - if (iinfo->i_crtime.tv_sec > inode->i_atime.tv_sec || - (iinfo->i_crtime.tv_sec == inode->i_atime.tv_sec && - iinfo->i_crtime.tv_nsec > inode->i_atime.tv_nsec)) - iinfo->i_crtime = inode->i_atime; - - if (iinfo->i_crtime.tv_sec > inode->i_mtime.tv_sec || - (iinfo->i_crtime.tv_sec == inode->i_mtime.tv_sec && - iinfo->i_crtime.tv_nsec > inode->i_mtime.tv_nsec)) - iinfo->i_crtime = inode->i_mtime; - - if (iinfo->i_crtime.tv_sec > inode->i_ctime.tv_sec || - (iinfo->i_crtime.tv_sec == 
inode->i_ctime.tv_sec && - iinfo->i_crtime.tv_nsec > inode->i_ctime.tv_nsec)) - iinfo->i_crtime = inode->i_ctime; + udf_adjust_time(iinfo, inode->i_atime); + udf_adjust_time(iinfo, inode->i_mtime); + udf_adjust_time(iinfo, inode->i_ctime); udf_time_to_disk_stamp(&efe->accessTime, inode->i_atime); udf_time_to_disk_stamp(&efe->modificationTime, inode->i_mtime); @@ -2286,8 +2267,7 @@ int8_t inode_bmap(struct inode *inode, sector_t block, uint32_t *elen, sector_t *offset) { unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits; - loff_t lbcount = 0, bcount = - (loff_t) block << blocksize_bits; + loff_t lbcount = 0, bcount = (loff_t) block << blocksize_bits; int8_t etype; struct udf_inode_info *iinfo; @@ -2308,7 +2288,7 @@ int8_t inode_bmap(struct inode *inode, sector_t block, lbcount += *elen; } while (lbcount <= bcount); /* update extent cache */ - udf_update_extent_cache(inode, lbcount - *elen, pos, 1); + udf_update_extent_cache(inode, lbcount - *elen, pos); *offset = (bcount + *elen - lbcount) >> blocksize_bits; return etype; diff --git a/fs/udf/lowlevel.c b/fs/udf/lowlevel.c index 6ad5a453af97..5c7ec121990d 100644 --- a/fs/udf/lowlevel.c +++ b/fs/udf/lowlevel.c @@ -58,7 +58,7 @@ unsigned long udf_get_last_block(struct super_block *sb) */ if (ioctl_by_bdev(bdev, CDROM_LAST_WRITTEN, (unsigned long) &lblock) || lblock == 0) - lblock = bdev->bd_inode->i_size >> sb->s_blocksize_bits; + lblock = i_size_read(bdev->bd_inode) >> sb->s_blocksize_bits; if (lblock) return lblock - 1; diff --git a/fs/udf/misc.c b/fs/udf/misc.c index 71d1c25f360d..3949c4bec3a3 100644 --- a/fs/udf/misc.c +++ b/fs/udf/misc.c @@ -141,8 +141,6 @@ struct genericFormat *udf_add_extendedattr(struct inode *inode, uint32_t size, iinfo->i_lenEAttr += size; return (struct genericFormat *)&ea[offset]; } - if (loc & 0x02) - ; return NULL; } diff --git a/fs/udf/namei.c b/fs/udf/namei.c index 2d65e280748b..babf48d0e553 100644 --- a/fs/udf/namei.c +++ b/fs/udf/namei.c @@ -931,7 +931,7 @@ static int 
udf_symlink(struct inode *dir, struct dentry *dentry, } inode->i_data.a_ops = &udf_symlink_aops; - inode->i_op = &page_symlink_inode_operations; + inode->i_op = &udf_symlink_inode_operations; inode_nohighmem(inode); if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) { diff --git a/fs/udf/osta_udf.h b/fs/udf/osta_udf.h index fbff74654df2..a4da59e38b7f 100644 --- a/fs/udf/osta_udf.h +++ b/fs/udf/osta_udf.h @@ -70,17 +70,17 @@ struct UDFIdentSuffix { uint8_t OSClass; uint8_t OSIdentifier; uint8_t reserved[4]; -} __attribute__ ((packed)); +} __packed; struct impIdentSuffix { uint8_t OSClass; uint8_t OSIdentifier; uint8_t reserved[6]; -} __attribute__ ((packed)); +} __packed; struct appIdentSuffix { uint8_t impUse[8]; -} __attribute__ ((packed)); +} __packed; /* Logical Volume Integrity Descriptor (UDF 2.50 2.2.6) */ /* Implementation Use (UDF 2.50 2.2.6.4) */ @@ -92,7 +92,7 @@ struct logicalVolIntegrityDescImpUse { __le16 minUDFWriteRev; __le16 maxUDFWriteRev; uint8_t impUse[0]; -} __attribute__ ((packed)); +} __packed; /* Implementation Use Volume Descriptor (UDF 2.50 2.2.7) */ /* Implementation Use (UDF 2.50 2.2.7.2) */ @@ -104,7 +104,7 @@ struct impUseVolDescImpUse { dstring LVInfo3[36]; struct regid impIdent; uint8_t impUse[128]; -} __attribute__ ((packed)); +} __packed; struct udfPartitionMap2 { uint8_t partitionMapType; @@ -113,7 +113,7 @@ struct udfPartitionMap2 { struct regid partIdent; __le16 volSeqNum; __le16 partitionNum; -} __attribute__ ((packed)); +} __packed; /* Virtual Partition Map (UDF 2.50 2.2.8) */ struct virtualPartitionMap { @@ -124,7 +124,7 @@ struct virtualPartitionMap { __le16 volSeqNum; __le16 partitionNum; uint8_t reserved2[24]; -} __attribute__ ((packed)); +} __packed; /* Sparable Partition Map (UDF 2.50 2.2.9) */ struct sparablePartitionMap { @@ -139,7 +139,7 @@ struct sparablePartitionMap { uint8_t reserved2[1]; __le32 sizeSparingTable; __le32 locSparingTable[4]; -} __attribute__ ((packed)); +} __packed; /* Metadata Partition Map (UDF 2.4.0 
2.2.10) */ struct metadataPartitionMap { @@ -156,14 +156,14 @@ struct metadataPartitionMap { __le16 alignUnitSize; uint8_t flags; uint8_t reserved2[5]; -} __attribute__ ((packed)); +} __packed; /* Virtual Allocation Table (UDF 1.5 2.2.10) */ struct virtualAllocationTable15 { __le32 VirtualSector[0]; struct regid vatIdent; __le32 previousVATICBLoc; -} __attribute__ ((packed)); +} __packed; #define ICBTAG_FILE_TYPE_VAT15 0x00U @@ -181,7 +181,7 @@ struct virtualAllocationTable20 { __le16 reserved; uint8_t impUse[0]; __le32 vatEntry[0]; -} __attribute__ ((packed)); +} __packed; #define ICBTAG_FILE_TYPE_VAT20 0xF8U @@ -189,7 +189,7 @@ struct virtualAllocationTable20 { struct sparingEntry { __le32 origLocation; __le32 mappedLocation; -} __attribute__ ((packed)); +} __packed; struct sparingTable { struct tag descTag; @@ -199,7 +199,7 @@ struct sparingTable { __le32 sequenceNum; struct sparingEntry mapEntry[0]; -} __attribute__ ((packed)); +} __packed; /* Metadata File (and Metadata Mirror File) (UDF 2.50 2.2.13.1) */ #define ICBTAG_FILE_TYPE_MAIN 0xFA @@ -210,7 +210,7 @@ struct sparingTable { struct allocDescImpUse { __le16 flags; uint8_t impUse[4]; -} __attribute__ ((packed)); +} __packed; #define AD_IU_EXT_ERASED 0x0001 @@ -222,7 +222,7 @@ struct allocDescImpUse { struct freeEaSpace { __le16 headerChecksum; uint8_t freeEASpace[0]; -} __attribute__ ((packed)); +} __packed; /* DVD Copyright Management Information (UDF 2.50 3.3.4.5.1.2) */ struct DVDCopyrightImpUse { @@ -230,14 +230,14 @@ struct DVDCopyrightImpUse { uint8_t CGMSInfo; uint8_t dataType; uint8_t protectionSystemInfo[4]; -} __attribute__ ((packed)); +} __packed; /* Application Use Extended Attribute (UDF 2.50 3.3.4.6) */ /* FreeAppEASpace (UDF 2.50 3.3.4.6.1) */ struct freeAppEASpace { __le16 headerChecksum; uint8_t freeEASpace[0]; -} __attribute__ ((packed)); +} __packed; /* UDF Defined System Stream (UDF 2.50 3.3.7) */ #define UDF_ID_UNIQUE_ID "*UDF Unique ID Mapping Data" diff --git a/fs/udf/super.c 
b/fs/udf/super.c index 4942549e7dc8..14b4bc1f6801 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -264,9 +264,6 @@ static void __exit exit_udf_fs(void) destroy_inodecache(); } -module_init(init_udf_fs) -module_exit(exit_udf_fs) - static int udf_sb_alloc_partition_maps(struct super_block *sb, u32 count) { struct udf_sb_info *sbi = UDF_SB(sb); @@ -1216,7 +1213,8 @@ static int udf_load_vat(struct super_block *sb, int p_index, int type1_index) struct udf_inode_info *vati; uint32_t pos; struct virtualAllocationTable20 *vat20; - sector_t blocks = sb->s_bdev->bd_inode->i_size >> sb->s_blocksize_bits; + sector_t blocks = i_size_read(sb->s_bdev->bd_inode) >> + sb->s_blocksize_bits; udf_find_vat_block(sb, p_index, type1_index, sbi->s_last_block); if (!sbi->s_vat_inode && @@ -1806,7 +1804,7 @@ static int udf_check_anchor_block(struct super_block *sb, sector_t block, if (UDF_QUERY_FLAG(sb, UDF_FLAG_VARCONV) && udf_fixed_to_variable(block) >= - sb->s_bdev->bd_inode->i_size >> sb->s_blocksize_bits) + i_size_read(sb->s_bdev->bd_inode) >> sb->s_blocksize_bits) return -EAGAIN; bh = udf_read_tagged(sb, block, block, &ident); @@ -1868,7 +1866,7 @@ static int udf_scan_anchors(struct super_block *sb, sector_t *lastblock, last[last_count++] = *lastblock - 152; for (i = 0; i < last_count; i++) { - if (last[i] >= sb->s_bdev->bd_inode->i_size >> + if (last[i] >= i_size_read(sb->s_bdev->bd_inode) >> sb->s_blocksize_bits) continue; ret = udf_check_anchor_block(sb, last[i], fileset); @@ -1957,7 +1955,7 @@ static int udf_load_vrs(struct super_block *sb, struct udf_options *uopt, if (!nsr_off) { if (!silent) udf_warn(sb, "No VRS found\n"); - return 0; + return -EINVAL; } if (nsr_off == -1) udf_debug("Failed to read sector at offset %d. 
" @@ -1986,6 +1984,7 @@ static void udf_open_lvid(struct super_block *sb) struct buffer_head *bh = sbi->s_lvid_bh; struct logicalVolIntegrityDesc *lvid; struct logicalVolIntegrityDescImpUse *lvidiu; + struct timespec ts; if (!bh) return; @@ -1997,8 +1996,8 @@ static void udf_open_lvid(struct super_block *sb) mutex_lock(&sbi->s_alloc_mutex); lvidiu->impIdent.identSuffix[0] = UDF_OS_CLASS_UNIX; lvidiu->impIdent.identSuffix[1] = UDF_OS_ID_LINUX; - udf_time_to_disk_stamp(&lvid->recordingDateAndTime, - CURRENT_TIME); + ktime_get_real_ts(&ts); + udf_time_to_disk_stamp(&lvid->recordingDateAndTime, ts); lvid->integrityType = cpu_to_le32(LVID_INTEGRITY_TYPE_OPEN); lvid->descTag.descCRC = cpu_to_le16( @@ -2019,6 +2018,7 @@ static void udf_close_lvid(struct super_block *sb) struct buffer_head *bh = sbi->s_lvid_bh; struct logicalVolIntegrityDesc *lvid; struct logicalVolIntegrityDescImpUse *lvidiu; + struct timespec ts; if (!bh) return; @@ -2030,7 +2030,8 @@ static void udf_close_lvid(struct super_block *sb) mutex_lock(&sbi->s_alloc_mutex); lvidiu->impIdent.identSuffix[0] = UDF_OS_CLASS_UNIX; lvidiu->impIdent.identSuffix[1] = UDF_OS_ID_LINUX; - udf_time_to_disk_stamp(&lvid->recordingDateAndTime, CURRENT_TIME); + ktime_get_real_ts(&ts); + udf_time_to_disk_stamp(&lvid->recordingDateAndTime, ts); if (UDF_MAX_WRITE_VERSION > le16_to_cpu(lvidiu->maxUDFWriteRev)) lvidiu->maxUDFWriteRev = cpu_to_le16(UDF_MAX_WRITE_VERSION); if (sbi->s_udfrev > le16_to_cpu(lvidiu->minUDFReadRev)) @@ -2158,15 +2159,25 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent) ret = udf_load_vrs(sb, &uopt, silent, &fileset); } else { uopt.blocksize = bdev_logical_block_size(sb->s_bdev); - ret = udf_load_vrs(sb, &uopt, silent, &fileset); - if (ret == -EAGAIN && uopt.blocksize != UDF_DEFAULT_BLOCKSIZE) { - if (!silent) - pr_notice("Rescanning with blocksize %d\n", - UDF_DEFAULT_BLOCKSIZE); - brelse(sbi->s_lvid_bh); - sbi->s_lvid_bh = NULL; - uopt.blocksize = UDF_DEFAULT_BLOCKSIZE; + 
while (uopt.blocksize <= 4096) { ret = udf_load_vrs(sb, &uopt, silent, &fileset); + if (ret < 0) { + if (!silent && ret != -EACCES) { + pr_notice("Scanning with blocksize %d failed\n", + uopt.blocksize); + } + brelse(sbi->s_lvid_bh); + sbi->s_lvid_bh = NULL; + /* + * EACCES is special - we want to propagate to + * upper layers that we cannot handle RW mount. + */ + if (ret == -EACCES) + break; + } else + break; + + uopt.blocksize <<= 1; } } if (ret < 0) { @@ -2497,3 +2508,9 @@ static unsigned int udf_count_free(struct super_block *sb) return accum; } + +MODULE_AUTHOR("Ben Fennema"); +MODULE_DESCRIPTION("Universal Disk Format Filesystem"); +MODULE_LICENSE("GPL"); +module_init(init_udf_fs) +module_exit(exit_udf_fs) diff --git a/fs/udf/symlink.c b/fs/udf/symlink.c index 8d619773056b..f7dfef53f739 100644 --- a/fs/udf/symlink.c +++ b/fs/udf/symlink.c @@ -152,9 +152,39 @@ out_unmap: return err; } +static int udf_symlink_getattr(struct vfsmount *mnt, struct dentry *dentry, + struct kstat *stat) +{ + struct inode *inode = d_backing_inode(dentry); + struct page *page; + + generic_fillattr(inode, stat); + page = read_mapping_page(inode->i_mapping, 0, NULL); + if (IS_ERR(page)) + return PTR_ERR(page); + /* + * UDF uses non-trivial encoding of symlinks so i_size does not match + * number of characters reported by readlink(2) which apparently some + * applications expect. Also POSIX says that "The value returned in the + * st_size field shall be the length of the contents of the symbolic + * link, and shall not count a trailing null if one is present." So + * let's report the length of string returned by readlink(2) for + * st_size. + */ + stat->size = strlen(page_address(page)); + put_page(page); + + return 0; +} + /* * symlinks can't do much... 
*/ const struct address_space_operations udf_symlink_aops = { .readpage = udf_symlink_filler, }; + +const struct inode_operations udf_symlink_inode_operations = { + .get_link = page_get_link, + .getattr = udf_symlink_getattr, +}; diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h index 263829ef1873..63b034984378 100644 --- a/fs/udf/udfdecl.h +++ b/fs/udf/udfdecl.h @@ -15,7 +15,6 @@ #include "udfend.h" #include "udf_i.h" -#define UDF_PREALLOCATE #define UDF_DEFAULT_PREALLOC_BLOCKS 8 extern __printf(3, 4) void _udf_err(struct super_block *sb, @@ -85,6 +84,7 @@ extern const struct inode_operations udf_dir_inode_operations; extern const struct file_operations udf_dir_operations; extern const struct inode_operations udf_file_inode_operations; extern const struct file_operations udf_file_operations; +extern const struct inode_operations udf_symlink_inode_operations; extern const struct address_space_operations udf_aops; extern const struct address_space_operations udf_adinicb_aops; extern const struct address_space_operations udf_symlink_aops; diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index d96e2f30084b..18406158e13f 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -12,6 +12,7 @@ * mm/ksm.c (mm hashing). 
*/ +#include <linux/list.h> #include <linux/hashtable.h> #include <linux/sched.h> #include <linux/mm.h> @@ -26,6 +27,7 @@ #include <linux/mempolicy.h> #include <linux/ioctl.h> #include <linux/security.h> +#include <linux/hugetlb.h> static struct kmem_cache *userfaultfd_ctx_cachep __read_mostly; @@ -45,12 +47,16 @@ struct userfaultfd_ctx { wait_queue_head_t fault_wqh; /* waitqueue head for the pseudo fd to wakeup poll/read */ wait_queue_head_t fd_wqh; + /* waitqueue head for events */ + wait_queue_head_t event_wqh; /* a refile sequence protected by fault_pending_wqh lock */ struct seqcount refile_seq; /* pseudo fd refcounting */ atomic_t refcount; /* userfaultfd syscall flags */ unsigned int flags; + /* features requested from the userspace */ + unsigned int features; /* state machine */ enum userfaultfd_state state; /* released */ @@ -59,10 +65,17 @@ struct userfaultfd_ctx { struct mm_struct *mm; }; +struct userfaultfd_fork_ctx { + struct userfaultfd_ctx *orig; + struct userfaultfd_ctx *new; + struct list_head list; +}; + struct userfaultfd_wait_queue { struct uffd_msg msg; wait_queue_t wq; struct userfaultfd_ctx *ctx; + bool waken; }; struct userfaultfd_wake_range { @@ -86,6 +99,12 @@ static int userfaultfd_wake_function(wait_queue_t *wq, unsigned mode, if (len && (start > uwq->msg.arg.pagefault.address || start + len <= uwq->msg.arg.pagefault.address)) goto out; + WRITE_ONCE(uwq->waken, true); + /* + * The implicit smp_mb__before_spinlock in try_to_wake_up() + * renders uwq->waken visible to other CPUs before the task is + * waken. 
+ */ ret = wake_up_state(wq->private, mode); if (ret) /* @@ -135,6 +154,8 @@ static void userfaultfd_ctx_put(struct userfaultfd_ctx *ctx) VM_BUG_ON(waitqueue_active(&ctx->fault_pending_wqh)); VM_BUG_ON(spin_is_locked(&ctx->fault_wqh.lock)); VM_BUG_ON(waitqueue_active(&ctx->fault_wqh)); + VM_BUG_ON(spin_is_locked(&ctx->event_wqh.lock)); + VM_BUG_ON(waitqueue_active(&ctx->event_wqh)); VM_BUG_ON(spin_is_locked(&ctx->fd_wqh.lock)); VM_BUG_ON(waitqueue_active(&ctx->fd_wqh)); mmdrop(ctx->mm); @@ -162,7 +183,7 @@ static inline struct uffd_msg userfault_msg(unsigned long address, msg.arg.pagefault.address = address; if (flags & FAULT_FLAG_WRITE) /* - * If UFFD_FEATURE_PAGEFAULT_FLAG_WRITE was set in the + * If UFFD_FEATURE_PAGEFAULT_FLAG_WP was set in the * uffdio_api.features and UFFD_PAGEFAULT_FLAG_WRITE * was not set in a UFFD_EVENT_PAGEFAULT, it means it * was a read fault, otherwise if set it means it's @@ -181,6 +202,49 @@ static inline struct uffd_msg userfault_msg(unsigned long address, return msg; } +#ifdef CONFIG_HUGETLB_PAGE +/* + * Same functionality as userfaultfd_must_wait below with modifications for + * hugepmd ranges. + */ +static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx, + unsigned long address, + unsigned long flags, + unsigned long reason) +{ + struct mm_struct *mm = ctx->mm; + pte_t *pte; + bool ret = true; + + VM_BUG_ON(!rwsem_is_locked(&mm->mmap_sem)); + + pte = huge_pte_offset(mm, address); + if (!pte) + goto out; + + ret = false; + + /* + * Lockless access: we're in a wait_event so it's ok if it + * changes under us. 
+ */ + if (huge_pte_none(*pte)) + ret = true; + if (!huge_pte_write(*pte) && (reason & VM_UFFD_WP)) + ret = true; +out: + return ret; +} +#else +static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx, + unsigned long address, + unsigned long flags, + unsigned long reason) +{ + return false; /* should never get here */ +} +#endif /* CONFIG_HUGETLB_PAGE */ + /* * Verify the pagetables are still not ok after having reigstered into * the fault_pending_wqh to avoid userland having to UFFDIO_WAKE any @@ -264,6 +328,7 @@ int handle_userfault(struct vm_fault *vmf, unsigned long reason) struct userfaultfd_wait_queue uwq; int ret; bool must_wait, return_to_userland; + long blocking_state; BUG_ON(!rwsem_is_locked(&mm->mmap_sem)); @@ -334,10 +399,13 @@ int handle_userfault(struct vm_fault *vmf, unsigned long reason) uwq.wq.private = current; uwq.msg = userfault_msg(vmf->address, vmf->flags, reason); uwq.ctx = ctx; + uwq.waken = false; return_to_userland = (vmf->flags & (FAULT_FLAG_USER|FAULT_FLAG_KILLABLE)) == (FAULT_FLAG_USER|FAULT_FLAG_KILLABLE); + blocking_state = return_to_userland ? TASK_INTERRUPTIBLE : + TASK_KILLABLE; spin_lock(&ctx->fault_pending_wqh.lock); /* @@ -350,12 +418,15 @@ int handle_userfault(struct vm_fault *vmf, unsigned long reason) * following the spin_unlock to happen before the list_add in * __add_wait_queue. */ - set_current_state(return_to_userland ? 
TASK_INTERRUPTIBLE : - TASK_KILLABLE); + set_current_state(blocking_state); spin_unlock(&ctx->fault_pending_wqh.lock); - must_wait = userfaultfd_must_wait(ctx, vmf->address, vmf->flags, - reason); + if (!is_vm_hugetlb_page(vmf->vma)) + must_wait = userfaultfd_must_wait(ctx, vmf->address, vmf->flags, + reason); + else + must_wait = userfaultfd_huge_must_wait(ctx, vmf->address, + vmf->flags, reason); up_read(&mm->mmap_sem); if (likely(must_wait && !ACCESS_ONCE(ctx->released) && @@ -364,6 +435,29 @@ int handle_userfault(struct vm_fault *vmf, unsigned long reason) wake_up_poll(&ctx->fd_wqh, POLLIN); schedule(); ret |= VM_FAULT_MAJOR; + + /* + * False wakeups can orginate even from rwsem before + * up_read() however userfaults will wait either for a + * targeted wakeup on the specific uwq waitqueue from + * wake_userfault() or for signals or for uffd + * release. + */ + while (!READ_ONCE(uwq.waken)) { + /* + * This needs the full smp_store_mb() + * guarantee as the state write must be + * visible to other CPUs before reading + * uwq.waken from other CPUs. + */ + set_current_state(blocking_state); + if (READ_ONCE(uwq.waken) || + READ_ONCE(ctx->released) || + (return_to_userland ? signal_pending(current) : + fatal_signal_pending(current))) + break; + schedule(); + } } __set_current_state(TASK_RUNNING); @@ -425,6 +519,196 @@ out: return ret; } +static int userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx, + struct userfaultfd_wait_queue *ewq) +{ + int ret = 0; + + ewq->ctx = ctx; + init_waitqueue_entry(&ewq->wq, current); + + spin_lock(&ctx->event_wqh.lock); + /* + * After the __add_wait_queue the uwq is visible to userland + * through poll/read(). 
+ */ + __add_wait_queue(&ctx->event_wqh, &ewq->wq); + for (;;) { + set_current_state(TASK_KILLABLE); + if (ewq->msg.event == 0) + break; + if (ACCESS_ONCE(ctx->released) || + fatal_signal_pending(current)) { + ret = -1; + __remove_wait_queue(&ctx->event_wqh, &ewq->wq); + break; + } + + spin_unlock(&ctx->event_wqh.lock); + + wake_up_poll(&ctx->fd_wqh, POLLIN); + schedule(); + + spin_lock(&ctx->event_wqh.lock); + } + __set_current_state(TASK_RUNNING); + spin_unlock(&ctx->event_wqh.lock); + + /* + * ctx may go away after this if the userfault pseudo fd is + * already released. + */ + + userfaultfd_ctx_put(ctx); + return ret; +} + +static void userfaultfd_event_complete(struct userfaultfd_ctx *ctx, + struct userfaultfd_wait_queue *ewq) +{ + ewq->msg.event = 0; + wake_up_locked(&ctx->event_wqh); + __remove_wait_queue(&ctx->event_wqh, &ewq->wq); +} + +int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs) +{ + struct userfaultfd_ctx *ctx = NULL, *octx; + struct userfaultfd_fork_ctx *fctx; + + octx = vma->vm_userfaultfd_ctx.ctx; + if (!octx || !(octx->features & UFFD_FEATURE_EVENT_FORK)) { + vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; + vma->vm_flags &= ~(VM_UFFD_WP | VM_UFFD_MISSING); + return 0; + } + + list_for_each_entry(fctx, fcs, list) + if (fctx->orig == octx) { + ctx = fctx->new; + break; + } + + if (!ctx) { + fctx = kmalloc(sizeof(*fctx), GFP_KERNEL); + if (!fctx) + return -ENOMEM; + + ctx = kmem_cache_alloc(userfaultfd_ctx_cachep, GFP_KERNEL); + if (!ctx) { + kfree(fctx); + return -ENOMEM; + } + + atomic_set(&ctx->refcount, 1); + ctx->flags = octx->flags; + ctx->state = UFFD_STATE_RUNNING; + ctx->features = octx->features; + ctx->released = false; + ctx->mm = vma->vm_mm; + atomic_inc(&ctx->mm->mm_count); + + userfaultfd_ctx_get(octx); + fctx->orig = octx; + fctx->new = ctx; + list_add_tail(&fctx->list, fcs); + } + + vma->vm_userfaultfd_ctx.ctx = ctx; + return 0; +} + +static int dup_fctx(struct userfaultfd_fork_ctx *fctx) +{ + struct 
userfaultfd_ctx *ctx = fctx->orig; + struct userfaultfd_wait_queue ewq; + + msg_init(&ewq.msg); + + ewq.msg.event = UFFD_EVENT_FORK; + ewq.msg.arg.reserved.reserved1 = (unsigned long)fctx->new; + + return userfaultfd_event_wait_completion(ctx, &ewq); +} + +void dup_userfaultfd_complete(struct list_head *fcs) +{ + int ret = 0; + struct userfaultfd_fork_ctx *fctx, *n; + + list_for_each_entry_safe(fctx, n, fcs, list) { + if (!ret) + ret = dup_fctx(fctx); + list_del(&fctx->list); + kfree(fctx); + } +} + +void mremap_userfaultfd_prep(struct vm_area_struct *vma, + struct vm_userfaultfd_ctx *vm_ctx) +{ + struct userfaultfd_ctx *ctx; + + ctx = vma->vm_userfaultfd_ctx.ctx; + if (ctx && (ctx->features & UFFD_FEATURE_EVENT_REMAP)) { + vm_ctx->ctx = ctx; + userfaultfd_ctx_get(ctx); + } +} + +void mremap_userfaultfd_complete(struct vm_userfaultfd_ctx *vm_ctx, + unsigned long from, unsigned long to, + unsigned long len) +{ + struct userfaultfd_ctx *ctx = vm_ctx->ctx; + struct userfaultfd_wait_queue ewq; + + if (!ctx) + return; + + if (to & ~PAGE_MASK) { + userfaultfd_ctx_put(ctx); + return; + } + + msg_init(&ewq.msg); + + ewq.msg.event = UFFD_EVENT_REMAP; + ewq.msg.arg.remap.from = from; + ewq.msg.arg.remap.to = to; + ewq.msg.arg.remap.len = len; + + userfaultfd_event_wait_completion(ctx, &ewq); +} + +void madvise_userfault_dontneed(struct vm_area_struct *vma, + struct vm_area_struct **prev, + unsigned long start, unsigned long end) +{ + struct mm_struct *mm = vma->vm_mm; + struct userfaultfd_ctx *ctx; + struct userfaultfd_wait_queue ewq; + + ctx = vma->vm_userfaultfd_ctx.ctx; + if (!ctx || !(ctx->features & UFFD_FEATURE_EVENT_MADVDONTNEED)) + return; + + userfaultfd_ctx_get(ctx); + up_read(&mm->mmap_sem); + + *prev = NULL; /* We wait for ACK w/o the mmap semaphore */ + + msg_init(&ewq.msg); + + ewq.msg.event = UFFD_EVENT_MADVDONTNEED; + ewq.msg.arg.madv_dn.start = start; + ewq.msg.arg.madv_dn.end = end; + + userfaultfd_event_wait_completion(ctx, &ewq); + + 
down_read(&mm->mmap_sem); +} + static int userfaultfd_release(struct inode *inode, struct file *file) { struct userfaultfd_ctx *ctx = file->private_data; @@ -489,25 +773,36 @@ wakeup: } /* fault_pending_wqh.lock must be hold by the caller */ -static inline struct userfaultfd_wait_queue *find_userfault( - struct userfaultfd_ctx *ctx) +static inline struct userfaultfd_wait_queue *find_userfault_in( + wait_queue_head_t *wqh) { wait_queue_t *wq; struct userfaultfd_wait_queue *uwq; - VM_BUG_ON(!spin_is_locked(&ctx->fault_pending_wqh.lock)); + VM_BUG_ON(!spin_is_locked(&wqh->lock)); uwq = NULL; - if (!waitqueue_active(&ctx->fault_pending_wqh)) + if (!waitqueue_active(wqh)) goto out; /* walk in reverse to provide FIFO behavior to read userfaults */ - wq = list_last_entry(&ctx->fault_pending_wqh.task_list, - typeof(*wq), task_list); + wq = list_last_entry(&wqh->task_list, typeof(*wq), task_list); uwq = container_of(wq, struct userfaultfd_wait_queue, wq); out: return uwq; } +static inline struct userfaultfd_wait_queue *find_userfault( + struct userfaultfd_ctx *ctx) +{ + return find_userfault_in(&ctx->fault_pending_wqh); +} + +static inline struct userfaultfd_wait_queue *find_userfault_evt( + struct userfaultfd_ctx *ctx) +{ + return find_userfault_in(&ctx->event_wqh); +} + static unsigned int userfaultfd_poll(struct file *file, poll_table *wait) { struct userfaultfd_ctx *ctx = file->private_data; @@ -539,18 +834,59 @@ static unsigned int userfaultfd_poll(struct file *file, poll_table *wait) smp_mb(); if (waitqueue_active(&ctx->fault_pending_wqh)) ret = POLLIN; + else if (waitqueue_active(&ctx->event_wqh)) + ret = POLLIN; + return ret; default: - BUG(); + WARN_ON_ONCE(1); + return POLLERR; } } +static const struct file_operations userfaultfd_fops; + +static int resolve_userfault_fork(struct userfaultfd_ctx *ctx, + struct userfaultfd_ctx *new, + struct uffd_msg *msg) +{ + int fd; + struct file *file; + unsigned int flags = new->flags & UFFD_SHARED_FCNTL_FLAGS; + + fd = 
get_unused_fd_flags(flags); + if (fd < 0) + return fd; + + file = anon_inode_getfile("[userfaultfd]", &userfaultfd_fops, new, + O_RDWR | flags); + if (IS_ERR(file)) { + put_unused_fd(fd); + return PTR_ERR(file); + } + + fd_install(fd, file); + msg->arg.reserved.reserved1 = 0; + msg->arg.fork.ufd = fd; + + return 0; +} + static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait, struct uffd_msg *msg) { ssize_t ret; DECLARE_WAITQUEUE(wait, current); struct userfaultfd_wait_queue *uwq; + /* + * Handling fork event requires sleeping operations, so + * we drop the event_wqh lock, then do these ops, then + * lock it back and wake up the waiter. While the lock is + * dropped the ewq may go away so we keep track of it + * carefully. + */ + LIST_HEAD(fork_event); + struct userfaultfd_ctx *fork_nctx = NULL; /* always take the fd_wqh lock before the fault_pending_wqh lock */ spin_lock(&ctx->fd_wqh.lock); @@ -602,6 +938,29 @@ static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait, break; } spin_unlock(&ctx->fault_pending_wqh.lock); + + spin_lock(&ctx->event_wqh.lock); + uwq = find_userfault_evt(ctx); + if (uwq) { + *msg = uwq->msg; + + if (uwq->msg.event == UFFD_EVENT_FORK) { + fork_nctx = (struct userfaultfd_ctx *) + (unsigned long) + uwq->msg.arg.reserved.reserved1; + list_move(&uwq->wq.task_list, &fork_event); + spin_unlock(&ctx->event_wqh.lock); + ret = 0; + break; + } + + userfaultfd_event_complete(ctx, uwq); + spin_unlock(&ctx->event_wqh.lock); + ret = 0; + break; + } + spin_unlock(&ctx->event_wqh.lock); + if (signal_pending(current)) { ret = -ERESTARTSYS; break; @@ -618,6 +977,23 @@ static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait, __set_current_state(TASK_RUNNING); spin_unlock(&ctx->fd_wqh.lock); + if (!ret && msg->event == UFFD_EVENT_FORK) { + ret = resolve_userfault_fork(ctx, fork_nctx, msg); + + if (!ret) { + spin_lock(&ctx->event_wqh.lock); + if (!list_empty(&fork_event)) { + uwq = 
list_first_entry(&fork_event, + typeof(*uwq), + wq.task_list); + list_del(&uwq->wq.task_list); + __add_wait_queue(&ctx->event_wqh, &uwq->wq); + userfaultfd_event_complete(ctx, uwq); + } + spin_unlock(&ctx->event_wqh.lock); + } + } + return ret; } @@ -720,6 +1096,12 @@ static __always_inline int validate_range(struct mm_struct *mm, return 0; } +static inline bool vma_can_userfault(struct vm_area_struct *vma) +{ + return vma_is_anonymous(vma) || is_vm_hugetlb_page(vma) || + vma_is_shmem(vma); +} + static int userfaultfd_register(struct userfaultfd_ctx *ctx, unsigned long arg) { @@ -730,6 +1112,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, struct uffdio_register __user *user_uffdio_register; unsigned long vm_flags, new_flags; bool found; + bool non_anon_pages; unsigned long start, end, vma_end; user_uffdio_register = (struct uffdio_register __user *) arg; @@ -781,13 +1164,21 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, goto out_unlock; /* + * If the first vma contains huge pages, make sure start address + * is aligned to huge page size. + */ + if (is_vm_hugetlb_page(vma)) { + unsigned long vma_hpagesize = vma_kernel_pagesize(vma); + + if (start & (vma_hpagesize - 1)) + goto out_unlock; + } + + /* * Search for not compatible vmas. - * - * FIXME: this shall be relaxed later so that it doesn't fail - * on tmpfs backed vmas (in addition to the current allowance - * on anonymous vmas). */ found = false; + non_anon_pages = false; for (cur = vma; cur && cur->vm_start < end; cur = cur->vm_next) { cond_resched(); @@ -796,8 +1187,21 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, /* check not compatible vmas */ ret = -EINVAL; - if (cur->vm_ops) + if (!vma_can_userfault(cur)) goto out_unlock; + /* + * If this vma contains ending address, and huge pages + * check alignment. 
+ */ + if (is_vm_hugetlb_page(cur) && end <= cur->vm_end && + end > cur->vm_start) { + unsigned long vma_hpagesize = vma_kernel_pagesize(cur); + + ret = -EINVAL; + + if (end & (vma_hpagesize - 1)) + goto out_unlock; + } /* * Check that this vma isn't already owned by a @@ -810,6 +1214,12 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, cur->vm_userfaultfd_ctx.ctx != ctx) goto out_unlock; + /* + * Note vmas containing huge pages + */ + if (is_vm_hugetlb_page(cur) || vma_is_shmem(cur)) + non_anon_pages = true; + found = true; } BUG_ON(!found); @@ -821,7 +1231,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, do { cond_resched(); - BUG_ON(vma->vm_ops); + BUG_ON(!vma_can_userfault(vma)); BUG_ON(vma->vm_userfaultfd_ctx.ctx && vma->vm_userfaultfd_ctx.ctx != ctx); @@ -879,7 +1289,8 @@ out_unlock: * userland which ioctls methods are guaranteed to * succeed on this range. */ - if (put_user(UFFD_API_RANGE_IOCTLS, + if (put_user(non_anon_pages ? UFFD_API_RANGE_IOCTLS_BASIC : + UFFD_API_RANGE_IOCTLS, &user_uffdio_register->ioctls)) ret = -EFAULT; } @@ -926,11 +1337,18 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, goto out_unlock; /* + * If the first vma contains huge pages, make sure start address + * is aligned to huge page size. + */ + if (is_vm_hugetlb_page(vma)) { + unsigned long vma_hpagesize = vma_kernel_pagesize(vma); + + if (start & (vma_hpagesize - 1)) + goto out_unlock; + } + + /* * Search for not compatible vmas. - * - * FIXME: this shall be relaxed later so that it doesn't fail - * on tmpfs backed vmas (in addition to the current allowance - * on anonymous vmas). */ found = false; ret = -EINVAL; @@ -947,7 +1365,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, * provides for more strict behavior to notice * unregistration errors. 
*/ - if (cur->vm_ops) + if (!vma_can_userfault(cur)) goto out_unlock; found = true; @@ -961,7 +1379,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, do { cond_resched(); - BUG_ON(vma->vm_ops); + BUG_ON(!vma_can_userfault(vma)); /* * Nothing to do: this vma is already registered into this @@ -974,6 +1392,19 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, start = vma->vm_start; vma_end = min(end, vma->vm_end); + if (userfaultfd_missing(vma)) { + /* + * Wake any concurrent pending userfault while + * we unregister, so they will not hang + * permanently and it avoids userland to call + * UFFDIO_WAKE explicitly. + */ + struct userfaultfd_wake_range range; + range.start = start; + range.len = vma_end - start; + wake_userfault(vma->vm_userfaultfd_ctx.ctx, &range); + } + new_flags = vma->vm_flags & ~(VM_UFFD_MISSING | VM_UFFD_WP); prev = vma_merge(mm, prev, start, vma_end, new_flags, vma->anon_vma, vma->vm_file, vma->vm_pgoff, @@ -1145,6 +1576,14 @@ out: return ret; } +static inline unsigned int uffd_ctx_features(__u64 user_features) +{ + /* + * For the current set of features the bits just coincide + */ + return (unsigned int)user_features; +} + /* * userland asks for a certain API version and we return which bits * and ioctl commands are implemented in this kernel for such API @@ -1156,6 +1595,7 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx, struct uffdio_api uffdio_api; void __user *buf = (void __user *)arg; int ret; + __u64 features; ret = -EINVAL; if (ctx->state != UFFD_STATE_WAIT_API) @@ -1163,19 +1603,23 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx, ret = -EFAULT; if (copy_from_user(&uffdio_api, buf, sizeof(uffdio_api))) goto out; - if (uffdio_api.api != UFFD_API || uffdio_api.features) { + features = uffdio_api.features; + if (uffdio_api.api != UFFD_API || (features & ~UFFD_API_FEATURES)) { memset(&uffdio_api, 0, sizeof(uffdio_api)); if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api))) goto out; ret = 
-EINVAL; goto out; } + /* report all available features and ioctls to userland */ uffdio_api.features = UFFD_API_FEATURES; uffdio_api.ioctls = UFFD_API_IOCTLS; ret = -EFAULT; if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api))) goto out; ctx->state = UFFD_STATE_RUNNING; + /* only enable the requested features for this uffd context */ + ctx->features = uffd_ctx_features(features); ret = 0; out: return ret; @@ -1262,6 +1706,7 @@ static void init_once_userfaultfd_ctx(void *mem) init_waitqueue_head(&ctx->fault_pending_wqh); init_waitqueue_head(&ctx->fault_wqh); + init_waitqueue_head(&ctx->event_wqh); init_waitqueue_head(&ctx->fd_wqh); seqcount_init(&ctx->refile_seq); } @@ -1302,6 +1747,7 @@ static struct file *userfaultfd_file_create(int flags) atomic_set(&ctx->refcount, 1); ctx->flags = flags; + ctx->features = 0; ctx->state = UFFD_STATE_WAIT_API; ctx->released = false; ctx->mm = current->mm; diff --git a/fs/xfs/libxfs/xfs_ag_resv.c b/fs/xfs/libxfs/xfs_ag_resv.c index e5ebc3770460..33db69be4832 100644 --- a/fs/xfs/libxfs/xfs_ag_resv.c +++ b/fs/xfs/libxfs/xfs_ag_resv.c @@ -39,6 +39,7 @@ #include "xfs_rmap_btree.h" #include "xfs_btree.h" #include "xfs_refcount_btree.h" +#include "xfs_ialloc_btree.h" /* * Per-AG Block Reservations @@ -200,22 +201,30 @@ __xfs_ag_resv_init( struct xfs_mount *mp = pag->pag_mount; struct xfs_ag_resv *resv; int error; + xfs_extlen_t reserved; - resv = xfs_perag_resv(pag, type); if (used > ask) ask = used; - resv->ar_asked = ask; - resv->ar_reserved = resv->ar_orig_reserved = ask - used; - mp->m_ag_max_usable -= ask; + reserved = ask - used; - trace_xfs_ag_resv_init(pag, type, ask); - - error = xfs_mod_fdblocks(mp, -(int64_t)resv->ar_reserved, true); - if (error) + error = xfs_mod_fdblocks(mp, -(int64_t)reserved, true); + if (error) { trace_xfs_ag_resv_init_error(pag->pag_mount, pag->pag_agno, error, _RET_IP_); + xfs_warn(mp, +"Per-AG reservation for AG %u failed. 
Filesystem may run out of space.", + pag->pag_agno); + return error; + } - return error; + mp->m_ag_max_usable -= ask; + + resv = xfs_perag_resv(pag, type); + resv->ar_asked = ask; + resv->ar_reserved = resv->ar_orig_reserved = reserved; + + trace_xfs_ag_resv_init(pag, type, ask); + return 0; } /* Create a per-AG block reservation. */ @@ -223,6 +232,8 @@ int xfs_ag_resv_init( struct xfs_perag *pag) { + struct xfs_mount *mp = pag->pag_mount; + xfs_agnumber_t agno = pag->pag_agno; xfs_extlen_t ask; xfs_extlen_t used; int error = 0; @@ -231,23 +242,45 @@ xfs_ag_resv_init( if (pag->pag_meta_resv.ar_asked == 0) { ask = used = 0; - error = xfs_refcountbt_calc_reserves(pag->pag_mount, - pag->pag_agno, &ask, &used); + error = xfs_refcountbt_calc_reserves(mp, agno, &ask, &used); if (error) goto out; - error = __xfs_ag_resv_init(pag, XFS_AG_RESV_METADATA, - ask, used); + error = xfs_finobt_calc_reserves(mp, agno, &ask, &used); if (error) goto out; + + error = __xfs_ag_resv_init(pag, XFS_AG_RESV_METADATA, + ask, used); + if (error) { + /* + * Because we didn't have per-AG reservations when the + * finobt feature was added we might not be able to + * reserve all needed blocks. Warn and fall back to the + * old and potentially buggy code in that case, but + * ensure we do have the reservation for the refcountbt. 
+ */ + ask = used = 0; + + mp->m_inotbt_nores = true; + + error = xfs_refcountbt_calc_reserves(mp, agno, &ask, + &used); + if (error) + goto out; + + error = __xfs_ag_resv_init(pag, XFS_AG_RESV_METADATA, + ask, used); + if (error) + goto out; + } } /* Create the AGFL metadata reservation */ if (pag->pag_agfl_resv.ar_asked == 0) { ask = used = 0; - error = xfs_rmapbt_calc_reserves(pag->pag_mount, pag->pag_agno, - &ask, &used); + error = xfs_rmapbt_calc_reserves(mp, agno, &ask, &used); if (error) goto out; @@ -256,6 +289,16 @@ xfs_ag_resv_init( goto out; } +#ifdef DEBUG + /* need to read in the AGF for the ASSERT below to work */ + error = xfs_alloc_pagf_init(pag->pag_mount, NULL, pag->pag_agno, 0); + if (error) + return error; + + ASSERT(xfs_perag_resv(pag, XFS_AG_RESV_METADATA)->ar_reserved + + xfs_perag_resv(pag, XFS_AG_RESV_AGFL)->ar_reserved <= + pag->pagf_freeblks + pag->pagf_flcount); +#endif out: return error; } diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index 5050056a0b06..369adcc18c02 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -95,10 +95,7 @@ unsigned int xfs_alloc_set_aside( struct xfs_mount *mp) { - unsigned int blocks; - - blocks = 4 + (mp->m_sb.sb_agcount * XFS_ALLOC_AGFL_RESERVE); - return blocks; + return mp->m_sb.sb_agcount * (XFS_ALLOC_AGFL_RESERVE + 4); } /* @@ -224,20 +221,22 @@ xfs_alloc_get_rec( * Compute aligned version of the found extent. * Takes alignment and min length into account. 
*/ -STATIC void +STATIC bool xfs_alloc_compute_aligned( xfs_alloc_arg_t *args, /* allocation argument structure */ xfs_agblock_t foundbno, /* starting block in found extent */ xfs_extlen_t foundlen, /* length in found extent */ xfs_agblock_t *resbno, /* result block number */ - xfs_extlen_t *reslen) /* result length */ + xfs_extlen_t *reslen, /* result length */ + unsigned *busy_gen) { - xfs_agblock_t bno; - xfs_extlen_t len; + xfs_agblock_t bno = foundbno; + xfs_extlen_t len = foundlen; xfs_extlen_t diff; + bool busy; /* Trim busy sections out of found extent */ - xfs_extent_busy_trim(args, foundbno, foundlen, &bno, &len); + busy = xfs_extent_busy_trim(args, &bno, &len, busy_gen); /* * If we have a largish extent that happens to start before min_agbno, @@ -262,6 +261,8 @@ xfs_alloc_compute_aligned( *resbno = bno; *reslen = len; } + + return busy; } /* @@ -365,36 +366,12 @@ xfs_alloc_fix_len( return; ASSERT(rlen >= args->minlen && rlen <= args->maxlen); ASSERT(rlen % args->prod == args->mod); + ASSERT(args->pag->pagf_freeblks + args->pag->pagf_flcount >= + rlen + args->minleft); args->len = rlen; } /* - * Fix up length if there is too little space left in the a.g. - * Return 1 if ok, 0 if too little, should give up. - */ -STATIC int -xfs_alloc_fix_minleft( - xfs_alloc_arg_t *args) /* allocation argument structure */ -{ - xfs_agf_t *agf; /* a.g. freelist header */ - int diff; /* free space difference */ - - if (args->minleft == 0) - return 1; - agf = XFS_BUF_TO_AGF(args->agbp); - diff = be32_to_cpu(agf->agf_freeblks) - - args->len - args->minleft; - if (diff >= 0) - return 1; - args->len += diff; /* shrink the allocated space */ - /* casts to (int) catch length underflows */ - if ((int)args->len >= (int)args->minlen) - return 1; - args->agbno = NULLAGBLOCK; - return 0; -} - -/* * Update the two btrees, logically removing from freespace the extent * starting at rbno, rlen blocks. The extent is contained within the * actual (current) free extent fbno for flen blocks. 
@@ -689,8 +666,6 @@ xfs_alloc_ag_vextent( xfs_alloc_arg_t *args) /* argument structure for allocation */ { int error=0; - xfs_extlen_t reservation; - xfs_extlen_t oldmax; ASSERT(args->minlen > 0); ASSERT(args->maxlen > 0); @@ -699,20 +674,6 @@ xfs_alloc_ag_vextent( ASSERT(args->alignment > 0); /* - * Clamp maxlen to the amount of free space minus any reservations - * that have been made. - */ - oldmax = args->maxlen; - reservation = xfs_ag_resv_needed(args->pag, args->resv); - if (args->maxlen > args->pag->pagf_freeblks - reservation) - args->maxlen = args->pag->pagf_freeblks - reservation; - if (args->maxlen == 0) { - args->agbno = NULLAGBLOCK; - args->maxlen = oldmax; - return 0; - } - - /* * Branch to correct routine based on the type. */ args->wasfromfl = 0; @@ -731,8 +692,6 @@ xfs_alloc_ag_vextent( /* NOTREACHED */ } - args->maxlen = oldmax; - if (error || args->agbno == NULLAGBLOCK) return error; @@ -782,10 +741,11 @@ xfs_alloc_ag_vextent_exact( int error; xfs_agblock_t fbno; /* start block of found extent */ xfs_extlen_t flen; /* length of found extent */ - xfs_agblock_t tbno; /* start block of trimmed extent */ - xfs_extlen_t tlen; /* length of trimmed extent */ - xfs_agblock_t tend; /* end block of trimmed extent */ + xfs_agblock_t tbno; /* start block of busy extent */ + xfs_extlen_t tlen; /* length of busy extent */ + xfs_agblock_t tend; /* end block of busy extent */ int i; /* success/failure of operation */ + unsigned busy_gen; ASSERT(args->alignment == 1); @@ -818,7 +778,9 @@ xfs_alloc_ag_vextent_exact( /* * Check for overlapping busy extents. 
*/ - xfs_extent_busy_trim(args, fbno, flen, &tbno, &tlen); + tbno = fbno; + tlen = flen; + xfs_extent_busy_trim(args, &tbno, &tlen, &busy_gen); /* * Give up if the start of the extent is busy, or the freespace isn't @@ -841,9 +803,6 @@ xfs_alloc_ag_vextent_exact( args->len = XFS_AGBLOCK_MIN(tend, args->agbno + args->maxlen) - args->agbno; xfs_alloc_fix_len(args); - if (!xfs_alloc_fix_minleft(args)) - goto not_found; - ASSERT(args->agbno + args->len <= tend); /* @@ -901,6 +860,7 @@ xfs_alloc_find_best_extent( xfs_agblock_t sdiff; int error; int i; + unsigned busy_gen; /* The good extent is perfect, no need to search. */ if (!gdiff) @@ -914,7 +874,8 @@ xfs_alloc_find_best_extent( if (error) goto error0; XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0); - xfs_alloc_compute_aligned(args, *sbno, *slen, sbnoa, slena); + xfs_alloc_compute_aligned(args, *sbno, *slen, + sbnoa, slena, &busy_gen); /* * The good extent is closer than this one. @@ -1003,7 +964,8 @@ xfs_alloc_ag_vextent_near( xfs_extlen_t ltlena; /* aligned ... */ xfs_agblock_t ltnew; /* useful start bno of left side */ xfs_extlen_t rlen; /* length of returned extent */ - int forced = 0; + bool busy; + unsigned busy_gen; #ifdef DEBUG /* * Randomly don't execute the first algorithm. @@ -1030,6 +992,7 @@ restart: ltlen = 0; gtlena = 0; ltlena = 0; + busy = false; /* * Get a cursor for the by-size btree. 
@@ -1112,8 +1075,8 @@ restart: if ((error = xfs_alloc_get_rec(cnt_cur, <bno, <len, &i))) goto error0; XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0); - xfs_alloc_compute_aligned(args, ltbno, ltlen, - <bnoa, <lena); + busy = xfs_alloc_compute_aligned(args, ltbno, ltlen, + <bnoa, <lena, &busy_gen); if (ltlena < args->minlen) continue; if (ltbnoa < args->min_agbno || ltbnoa > args->max_agbno) @@ -1149,12 +1112,7 @@ restart: XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0); ASSERT(ltbno + ltlen <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length)); args->len = blen; - if (!xfs_alloc_fix_minleft(args)) { - xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); - trace_xfs_alloc_near_nominleft(args); - return 0; - } - blen = args->len; + /* * We are allocating starting at bnew for blen blocks. */ @@ -1236,8 +1194,8 @@ restart: if ((error = xfs_alloc_get_rec(bno_cur_lt, <bno, <len, &i))) goto error0; XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0); - xfs_alloc_compute_aligned(args, ltbno, ltlen, - <bnoa, <lena); + busy |= xfs_alloc_compute_aligned(args, ltbno, ltlen, + <bnoa, <lena, &busy_gen); if (ltlena >= args->minlen && ltbnoa >= args->min_agbno) break; if ((error = xfs_btree_decrement(bno_cur_lt, 0, &i))) @@ -1252,8 +1210,8 @@ restart: if ((error = xfs_alloc_get_rec(bno_cur_gt, >bno, >len, &i))) goto error0; XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0); - xfs_alloc_compute_aligned(args, gtbno, gtlen, - >bnoa, >lena); + busy |= xfs_alloc_compute_aligned(args, gtbno, gtlen, + >bnoa, >lena, &busy_gen); if (gtlena >= args->minlen && gtbnoa <= args->max_agbno) break; if ((error = xfs_btree_increment(bno_cur_gt, 0, &i))) @@ -1314,9 +1272,9 @@ restart: if (bno_cur_lt == NULL && bno_cur_gt == NULL) { xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); - if (!forced++) { + if (busy) { trace_xfs_alloc_near_busy(args); - xfs_log_force(args->mp, XFS_LOG_SYNC); + xfs_extent_busy_flush(args->mp, args->pag, busy_gen); goto restart; } trace_xfs_alloc_size_neither(args); @@ 
-1346,12 +1304,6 @@ restart: */ args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen); xfs_alloc_fix_len(args); - if (!xfs_alloc_fix_minleft(args)) { - trace_xfs_alloc_near_nominleft(args); - xfs_btree_del_cursor(bno_cur_lt, XFS_BTREE_NOERROR); - xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); - return 0; - } rlen = args->len; (void)xfs_alloc_compute_diff(args->agbno, rlen, args->alignment, args->datatype, ltbnoa, ltlena, <new); @@ -1403,7 +1355,8 @@ xfs_alloc_ag_vextent_size( int i; /* temp status variable */ xfs_agblock_t rbno; /* returned block number */ xfs_extlen_t rlen; /* length of returned extent */ - int forced = 0; + bool busy; + unsigned busy_gen; restart: /* @@ -1412,6 +1365,7 @@ restart: cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp, args->agno, XFS_BTNUM_CNT); bno_cur = NULL; + busy = false; /* * Look for an entry >= maxlen+alignment-1 blocks. @@ -1421,14 +1375,13 @@ restart: goto error0; /* - * If none or we have busy extents that we cannot allocate from, then - * we have to settle for a smaller extent. In the case that there are - * no large extents, this will return the last entry in the tree unless - * the tree is empty. In the case that there are only busy large - * extents, this will return the largest small extent unless there + * If none then we have to settle for a smaller extent. In the case that + * there are no large extents, this will return the last entry in the + * tree unless the tree is empty. In the case that there are only busy + * large extents, this will return the largest small extent unless there * are no smaller extents available. */ - if (!i || forced > 1) { + if (!i) { error = xfs_alloc_ag_vextent_small(args, cnt_cur, &fbno, &flen, &i); if (error) @@ -1439,13 +1392,11 @@ restart: return 0; } ASSERT(i == 1); - xfs_alloc_compute_aligned(args, fbno, flen, &rbno, &rlen); + busy = xfs_alloc_compute_aligned(args, fbno, flen, &rbno, + &rlen, &busy_gen); } else { /* * Search for a non-busy extent that is large enough. 
- * If we are at low space, don't check, or if we fall of - * the end of the btree, turn off the busy check and - * restart. */ for (;;) { error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen, &i); @@ -1453,8 +1404,8 @@ restart: goto error0; XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0); - xfs_alloc_compute_aligned(args, fbno, flen, - &rbno, &rlen); + busy = xfs_alloc_compute_aligned(args, fbno, flen, + &rbno, &rlen, &busy_gen); if (rlen >= args->maxlen) break; @@ -1466,18 +1417,13 @@ restart: /* * Our only valid extents must have been busy. * Make it unbusy by forcing the log out and - * retrying. If we've been here before, forcing - * the log isn't making the extents available, - * which means they have probably been freed in - * this transaction. In that case, we have to - * give up on them and we'll attempt a minlen - * allocation the next time around. + * retrying. */ xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); trace_xfs_alloc_size_busy(args); - if (!forced++) - xfs_log_force(args->mp, XFS_LOG_SYNC); + xfs_extent_busy_flush(args->mp, + args->pag, busy_gen); goto restart; } } @@ -1513,8 +1459,8 @@ restart: XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0); if (flen < bestrlen) break; - xfs_alloc_compute_aligned(args, fbno, flen, - &rbno, &rlen); + busy = xfs_alloc_compute_aligned(args, fbno, flen, + &rbno, &rlen, &busy_gen); rlen = XFS_EXTLEN_MIN(args->maxlen, rlen); XFS_WANT_CORRUPTED_GOTO(args->mp, rlen == 0 || (rlen <= flen && rbno + rlen <= fbno + flen), @@ -1543,18 +1489,16 @@ restart: */ args->len = rlen; if (rlen < args->minlen) { - if (!forced++) { + if (busy) { xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); trace_xfs_alloc_size_busy(args); - xfs_log_force(args->mp, XFS_LOG_SYNC); + xfs_extent_busy_flush(args->mp, args->pag, busy_gen); goto restart; } goto out_nominleft; } xfs_alloc_fix_len(args); - if (!xfs_alloc_fix_minleft(args)) - goto out_nominleft; rlen = args->len; XFS_WANT_CORRUPTED_GOTO(args->mp, rlen <= flen, error0); /* @@ -2056,7 
+2000,7 @@ xfs_alloc_space_available( int flags) { struct xfs_perag *pag = args->pag; - xfs_extlen_t longest; + xfs_extlen_t alloc_len, longest; xfs_extlen_t reservation; /* blocks that are still reserved */ int available; @@ -2066,17 +2010,28 @@ xfs_alloc_space_available( reservation = xfs_ag_resv_needed(pag, args->resv); /* do we have enough contiguous free space for the allocation? */ + alloc_len = args->minlen + (args->alignment - 1) + args->minalignslop; longest = xfs_alloc_longest_free_extent(args->mp, pag, min_free, reservation); - if ((args->minlen + args->alignment + args->minalignslop - 1) > longest) + if (longest < alloc_len) return false; /* do we have enough free space remaining for the allocation? */ available = (int)(pag->pagf_freeblks + pag->pagf_flcount - - reservation - min_free - args->total); - if (available < (int)args->minleft || available <= 0) + reservation - min_free - args->minleft); + if (available < (int)max(args->total, alloc_len)) return false; + /* + * Clamp maxlen to the amount of free space available for the actual + * extent allocation. + */ + if (available < (int)args->maxlen && !(flags & XFS_ALLOC_FLAG_CHECK)) { + args->maxlen = available; + ASSERT(args->maxlen > 0); + ASSERT(args->maxlen >= args->minlen); + } + return true; } @@ -2122,7 +2077,8 @@ xfs_alloc_fix_freelist( } need = xfs_alloc_min_freelist(mp, pag); - if (!xfs_alloc_space_available(args, need, flags)) + if (!xfs_alloc_space_available(args, need, flags | + XFS_ALLOC_FLAG_CHECK)) goto out_agbp_relse; /* @@ -2638,12 +2594,10 @@ xfs_alloc_vextent( xfs_agblock_t agsize; /* allocation group size */ int error; int flags; /* XFS_ALLOC_FLAG_... 
locking flags */ - xfs_extlen_t minleft;/* minimum left value, temp copy */ xfs_mount_t *mp; /* mount structure pointer */ xfs_agnumber_t sagno; /* starting allocation group number */ xfs_alloctype_t type; /* input allocation type */ int bump_rotor = 0; - int no_min = 0; xfs_agnumber_t rotorstep = xfs_rotorstep; /* inode32 agf stepper */ mp = args->mp; @@ -2672,7 +2626,6 @@ xfs_alloc_vextent( trace_xfs_alloc_vextent_badargs(args); return 0; } - minleft = args->minleft; switch (type) { case XFS_ALLOCTYPE_THIS_AG: @@ -2683,9 +2636,7 @@ xfs_alloc_vextent( */ args->agno = XFS_FSB_TO_AGNO(mp, args->fsbno); args->pag = xfs_perag_get(mp, args->agno); - args->minleft = 0; error = xfs_alloc_fix_freelist(args, 0); - args->minleft = minleft; if (error) { trace_xfs_alloc_vextent_nofix(args); goto error0; @@ -2713,21 +2664,11 @@ xfs_alloc_vextent( args->agbno = XFS_FSB_TO_AGBNO(mp, args->fsbno); args->type = XFS_ALLOCTYPE_NEAR_BNO; /* FALLTHROUGH */ - case XFS_ALLOCTYPE_ANY_AG: - case XFS_ALLOCTYPE_START_AG: case XFS_ALLOCTYPE_FIRST_AG: /* * Rotate through the allocation groups looking for a winner. */ - if (type == XFS_ALLOCTYPE_ANY_AG) { - /* - * Start with the last place we left off. - */ - args->agno = sagno = (mp->m_agfrotor / rotorstep) % - mp->m_sb.sb_agcount; - args->type = XFS_ALLOCTYPE_THIS_AG; - flags = XFS_ALLOC_FLAG_TRYLOCK; - } else if (type == XFS_ALLOCTYPE_FIRST_AG) { + if (type == XFS_ALLOCTYPE_FIRST_AG) { /* * Start with allocation group given by bno. */ @@ -2736,8 +2677,6 @@ xfs_alloc_vextent( sagno = 0; flags = 0; } else { - if (type == XFS_ALLOCTYPE_START_AG) - args->type = XFS_ALLOCTYPE_THIS_AG; /* * Start with the given allocation group. 
*/ @@ -2750,9 +2689,7 @@ xfs_alloc_vextent( */ for (;;) { args->pag = xfs_perag_get(mp, args->agno); - if (no_min) args->minleft = 0; error = xfs_alloc_fix_freelist(args, flags); - args->minleft = minleft; if (error) { trace_xfs_alloc_vextent_nofix(args); goto error0; @@ -2792,25 +2729,22 @@ xfs_alloc_vextent( * or switch to non-trylock mode. */ if (args->agno == sagno) { - if (no_min == 1) { + if (flags == 0) { args->agbno = NULLAGBLOCK; trace_xfs_alloc_vextent_allfailed(args); break; } - if (flags == 0) { - no_min = 1; - } else { - flags = 0; - if (type == XFS_ALLOCTYPE_START_BNO) { - args->agbno = XFS_FSB_TO_AGBNO(mp, - args->fsbno); - args->type = XFS_ALLOCTYPE_NEAR_BNO; - } + + flags = 0; + if (type == XFS_ALLOCTYPE_START_BNO) { + args->agbno = XFS_FSB_TO_AGBNO(mp, + args->fsbno); + args->type = XFS_ALLOCTYPE_NEAR_BNO; } } xfs_perag_put(args->pag); } - if (bump_rotor || (type == XFS_ALLOCTYPE_ANY_AG)) { + if (bump_rotor) { if (args->agno == sagno) mp->m_agfrotor = (mp->m_agfrotor + 1) % (mp->m_sb.sb_agcount * rotorstep); diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h index 7c404a6b0ae3..2a8d0fa6fbbe 100644 --- a/fs/xfs/libxfs/xfs_alloc.h +++ b/fs/xfs/libxfs/xfs_alloc.h @@ -29,9 +29,7 @@ extern struct workqueue_struct *xfs_alloc_wq; /* * Freespace allocation types. Argument to xfs_alloc_[v]extent. */ -#define XFS_ALLOCTYPE_ANY_AG 0x01 /* allocate anywhere, use rotor */ #define XFS_ALLOCTYPE_FIRST_AG 0x02 /* ... start at ag 0 */ -#define XFS_ALLOCTYPE_START_AG 0x04 /* anywhere, start in this a.g. */ #define XFS_ALLOCTYPE_THIS_AG 0x08 /* anywhere in this a.g. */ #define XFS_ALLOCTYPE_START_BNO 0x10 /* near this block else anywhere */ #define XFS_ALLOCTYPE_NEAR_BNO 0x20 /* in this a.g. 
and near this block */ @@ -41,9 +39,7 @@ extern struct workqueue_struct *xfs_alloc_wq; typedef unsigned int xfs_alloctype_t; #define XFS_ALLOC_TYPES \ - { XFS_ALLOCTYPE_ANY_AG, "ANY_AG" }, \ { XFS_ALLOCTYPE_FIRST_AG, "FIRST_AG" }, \ - { XFS_ALLOCTYPE_START_AG, "START_AG" }, \ { XFS_ALLOCTYPE_THIS_AG, "THIS_AG" }, \ { XFS_ALLOCTYPE_START_BNO, "START_BNO" }, \ { XFS_ALLOCTYPE_NEAR_BNO, "NEAR_BNO" }, \ @@ -56,7 +52,7 @@ typedef unsigned int xfs_alloctype_t; #define XFS_ALLOC_FLAG_FREEING 0x00000002 /* indicate caller is freeing extents*/ #define XFS_ALLOC_FLAG_NORMAP 0x00000004 /* don't modify the rmapbt */ #define XFS_ALLOC_FLAG_NOSHRINK 0x00000008 /* don't shrink the freelist */ - +#define XFS_ALLOC_FLAG_CHECK 0x00000010 /* test only, don't modify args */ /* * Argument structure for xfs_alloc routines. diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index af1ecb19121e..6622d46ddec3 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -131,9 +131,6 @@ xfs_attr_get( if (XFS_FORCED_SHUTDOWN(ip->i_mount)) return -EIO; - if (!xfs_inode_hasattr(ip)) - return -ENOATTR; - error = xfs_attr_args_init(&args, ip, name, flags); if (error) return error; @@ -392,9 +389,6 @@ xfs_attr_remove( if (XFS_FORCED_SHUTDOWN(dp->i_mount)) return -EIO; - if (!xfs_inode_hasattr(dp)) - return -ENOATTR; - error = xfs_attr_args_init(&args, dp, name, flags); if (error) return error; diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 2760bc3b2536..a9c66d47757a 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -740,15 +740,9 @@ xfs_bmap_extents_to_btree( * Fill in the root. 
*/ block = ifp->if_broot; - if (xfs_sb_version_hascrc(&mp->m_sb)) - xfs_btree_init_block_int(mp, block, XFS_BUF_DADDR_NULL, - XFS_BMAP_CRC_MAGIC, 1, 1, ip->i_ino, - XFS_BTREE_LONG_PTRS | XFS_BTREE_CRC_BLOCKS); - else - xfs_btree_init_block_int(mp, block, XFS_BUF_DADDR_NULL, - XFS_BMAP_MAGIC, 1, 1, ip->i_ino, + xfs_btree_init_block_int(mp, block, XFS_BUF_DADDR_NULL, + XFS_BTNUM_BMAP, 1, 1, ip->i_ino, XFS_BTREE_LONG_PTRS); - /* * Need a cursor. Can't allocate until bb_level is filled in. */ @@ -804,9 +798,7 @@ try_another_ag: */ ASSERT(args.fsbno != NULLFSBLOCK); ASSERT(*firstblock == NULLFSBLOCK || - args.agno == XFS_FSB_TO_AGNO(mp, *firstblock) || - (dfops->dop_low && - args.agno > XFS_FSB_TO_AGNO(mp, *firstblock))); + args.agno >= XFS_FSB_TO_AGNO(mp, *firstblock)); *firstblock = cur->bc_private.b.firstblock = args.fsbno; cur->bc_private.b.allocated++; ip->i_d.di_nblocks++; @@ -817,13 +809,8 @@ try_another_ag: */ abp->b_ops = &xfs_bmbt_buf_ops; ablock = XFS_BUF_TO_BLOCK(abp); - if (xfs_sb_version_hascrc(&mp->m_sb)) - xfs_btree_init_block_int(mp, ablock, abp->b_bn, - XFS_BMAP_CRC_MAGIC, 0, 0, ip->i_ino, - XFS_BTREE_LONG_PTRS | XFS_BTREE_CRC_BLOCKS); - else - xfs_btree_init_block_int(mp, ablock, abp->b_bn, - XFS_BMAP_MAGIC, 0, 0, ip->i_ino, + xfs_btree_init_block_int(mp, ablock, abp->b_bn, + XFS_BTNUM_BMAP, 0, 0, ip->i_ino, XFS_BTREE_LONG_PTRS); arp = XFS_BMBT_REC_ADDR(mp, ablock, 1); @@ -1278,7 +1265,6 @@ xfs_bmap_read_extents( /* REFERENCED */ xfs_extnum_t room; /* number of entries there's room for */ - bno = NULLFSBLOCK; mp = ip->i_mount; ifp = XFS_IFORK_PTR(ip, whichfork); exntf = (whichfork != XFS_DATA_FORK) ? 
XFS_EXTFMT_NOSTATE : @@ -1291,9 +1277,7 @@ xfs_bmap_read_extents( ASSERT(level > 0); pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes); bno = be64_to_cpu(*pp); - ASSERT(bno != NULLFSBLOCK); - ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount); - ASSERT(XFS_FSB_TO_AGBNO(mp, bno) < mp->m_sb.sb_agblocks); + /* * Go down the tree until leaf level is reached, following the first * pointer (leftmost) at each level. @@ -1864,6 +1848,7 @@ xfs_bmap_add_extent_delay_real( */ trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_); xfs_bmbt_set_startblock(ep, new->br_startblock); + xfs_bmbt_set_state(ep, new->br_state); trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_); (*nextents)++; @@ -2202,6 +2187,7 @@ STATIC int /* error */ xfs_bmap_add_extent_unwritten_real( struct xfs_trans *tp, xfs_inode_t *ip, /* incore inode pointer */ + int whichfork, xfs_extnum_t *idx, /* extent number to update/insert */ xfs_btree_cur_t **curp, /* if *curp is null, not a btree */ xfs_bmbt_irec_t *new, /* new data to add to file extents */ @@ -2221,12 +2207,14 @@ xfs_bmap_add_extent_unwritten_real( /* left is 0, right is 1, prev is 2 */ int rval=0; /* return value (logging flags) */ int state = 0;/* state bits, accessed thru macros */ - struct xfs_mount *mp = tp->t_mountp; + struct xfs_mount *mp = ip->i_mount; *logflagsp = 0; cur = *curp; - ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); + ifp = XFS_IFORK_PTR(ip, whichfork); + if (whichfork == XFS_COW_FORK) + state |= BMAP_COWFORK; ASSERT(*idx >= 0); ASSERT(*idx <= xfs_iext_count(ifp)); @@ -2285,7 +2273,7 @@ xfs_bmap_add_extent_unwritten_real( * Don't set contiguous if the combined extent would be too large. * Also check for all-three-contiguous being too large. 
*/ - if (*idx < xfs_iext_count(&ip->i_df) - 1) { + if (*idx < xfs_iext_count(ifp) - 1) { state |= BMAP_RIGHT_VALID; xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx + 1), &RIGHT); if (isnullstartblock(RIGHT.br_startblock)) @@ -2325,7 +2313,8 @@ xfs_bmap_add_extent_unwritten_real( trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); xfs_iext_remove(ip, *idx + 1, 2, state); - ip->i_d.di_nextents -= 2; + XFS_IFORK_NEXT_SET(ip, whichfork, + XFS_IFORK_NEXTENTS(ip, whichfork) - 2); if (cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; else { @@ -2368,7 +2357,8 @@ xfs_bmap_add_extent_unwritten_real( trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); xfs_iext_remove(ip, *idx + 1, 1, state); - ip->i_d.di_nextents--; + XFS_IFORK_NEXT_SET(ip, whichfork, + XFS_IFORK_NEXTENTS(ip, whichfork) - 1); if (cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; else { @@ -2403,7 +2393,8 @@ xfs_bmap_add_extent_unwritten_real( xfs_bmbt_set_state(ep, newext); trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); xfs_iext_remove(ip, *idx + 1, 1, state); - ip->i_d.di_nextents--; + XFS_IFORK_NEXT_SET(ip, whichfork, + XFS_IFORK_NEXTENTS(ip, whichfork) - 1); if (cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; else { @@ -2515,7 +2506,8 @@ xfs_bmap_add_extent_unwritten_real( trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); xfs_iext_insert(ip, *idx, 1, new, state); - ip->i_d.di_nextents++; + XFS_IFORK_NEXT_SET(ip, whichfork, + XFS_IFORK_NEXTENTS(ip, whichfork) + 1); if (cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; else { @@ -2593,7 +2585,8 @@ xfs_bmap_add_extent_unwritten_real( ++*idx; xfs_iext_insert(ip, *idx, 1, new, state); - ip->i_d.di_nextents++; + XFS_IFORK_NEXT_SET(ip, whichfork, + XFS_IFORK_NEXTENTS(ip, whichfork) + 1); if (cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; else { @@ -2641,7 +2634,8 @@ xfs_bmap_add_extent_unwritten_real( ++*idx; xfs_iext_insert(ip, *idx, 2, &r[0], state); - ip->i_d.di_nextents += 2; + XFS_IFORK_NEXT_SET(ip, whichfork, + 
XFS_IFORK_NEXTENTS(ip, whichfork) + 2); if (cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; else { @@ -2695,17 +2689,17 @@ xfs_bmap_add_extent_unwritten_real( } /* update reverse mappings */ - error = xfs_rmap_convert_extent(mp, dfops, ip, XFS_DATA_FORK, new); + error = xfs_rmap_convert_extent(mp, dfops, ip, whichfork, new); if (error) goto done; /* convert to a btree if necessary */ - if (xfs_bmap_needs_btree(ip, XFS_DATA_FORK)) { + if (xfs_bmap_needs_btree(ip, whichfork)) { int tmp_logflags; /* partial log flag return val */ ASSERT(cur == NULL); error = xfs_bmap_extents_to_btree(tp, ip, first, dfops, &cur, - 0, &tmp_logflags, XFS_DATA_FORK); + 0, &tmp_logflags, whichfork); *logflagsp |= tmp_logflags; if (error) goto done; @@ -2717,7 +2711,7 @@ xfs_bmap_add_extent_unwritten_real( *curp = cur; } - xfs_bmap_check_leaf_extents(*curp, ip, XFS_DATA_FORK); + xfs_bmap_check_leaf_extents(*curp, ip, whichfork); done: *logflagsp |= rval; return error; @@ -2809,7 +2803,8 @@ xfs_bmap_add_extent_hole_delay( oldlen = startblockval(left.br_startblock) + startblockval(new->br_startblock) + startblockval(right.br_startblock); - newlen = xfs_bmap_worst_indlen(ip, temp); + newlen = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), + oldlen); xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, *idx), nullstartblock((int)newlen)); trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); @@ -2830,7 +2825,8 @@ xfs_bmap_add_extent_hole_delay( xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx), temp); oldlen = startblockval(left.br_startblock) + startblockval(new->br_startblock); - newlen = xfs_bmap_worst_indlen(ip, temp); + newlen = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), + oldlen); xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, *idx), nullstartblock((int)newlen)); trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); @@ -2846,7 +2842,8 @@ xfs_bmap_add_extent_hole_delay( temp = new->br_blockcount + right.br_blockcount; oldlen = startblockval(new->br_startblock) + 
startblockval(right.br_startblock); - newlen = xfs_bmap_worst_indlen(ip, temp); + newlen = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), + oldlen); xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, *idx), new->br_startoff, nullstartblock((int)newlen), temp, right.br_state); @@ -2899,13 +2896,14 @@ xfs_bmap_add_extent_hole_real( ASSERT(!isnullstartblock(new->br_startblock)); ASSERT(!bma->cur || !(bma->cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL)); - ASSERT(whichfork != XFS_COW_FORK); XFS_STATS_INC(mp, xs_add_exlist); state = 0; if (whichfork == XFS_ATTR_FORK) state |= BMAP_ATTRFORK; + if (whichfork == XFS_COW_FORK) + state |= BMAP_COWFORK; /* * Check and set flags if this segment has a left neighbor. @@ -3629,7 +3627,7 @@ xfs_bmap_btalloc( align = xfs_get_cowextsz_hint(ap->ip); else if (xfs_alloc_is_userdata(ap->datatype)) align = xfs_get_extsz_hint(ap->ip); - if (unlikely(align)) { + if (align) { error = xfs_bmap_extsize_align(mp, &ap->got, &ap->prev, align, 0, ap->eof, 0, ap->conv, &ap->offset, &ap->length); @@ -3701,7 +3699,7 @@ xfs_bmap_btalloc( args.minlen = ap->minlen; } /* apply extent size hints if obtained earlier */ - if (unlikely(align)) { + if (align) { args.prod = align; if ((args.mod = (xfs_extlen_t)do_mod(ap->offset, args.prod))) args.mod = (xfs_extlen_t)(args.prod - args.mod); @@ -3812,7 +3810,6 @@ xfs_bmap_btalloc( args.fsbno = 0; args.type = XFS_ALLOCTYPE_FIRST_AG; args.total = ap->minlen; - args.minleft = 0; if ((error = xfs_alloc_vextent(&args))) return error; ap->dfops->dop_low = true; @@ -3823,17 +3820,13 @@ xfs_bmap_btalloc( * the first block that was allocated. 
*/ ASSERT(*ap->firstblock == NULLFSBLOCK || - XFS_FSB_TO_AGNO(mp, *ap->firstblock) == - XFS_FSB_TO_AGNO(mp, args.fsbno) || - (ap->dfops->dop_low && - XFS_FSB_TO_AGNO(mp, *ap->firstblock) < - XFS_FSB_TO_AGNO(mp, args.fsbno))); + XFS_FSB_TO_AGNO(mp, *ap->firstblock) <= + XFS_FSB_TO_AGNO(mp, args.fsbno)); ap->blkno = args.fsbno; if (*ap->firstblock == NULLFSBLOCK) *ap->firstblock = args.fsbno; - ASSERT(nullfb || fb_agno == args.agno || - (ap->dfops->dop_low && fb_agno < args.agno)); + ASSERT(nullfb || fb_agno <= args.agno); ap->length = args.len; if (!(ap->flags & XFS_BMAPI_COWFORK)) ap->ip->i_d.di_nblocks += args.len; @@ -4344,8 +4337,6 @@ xfs_bmapi_allocate( if (error) return error; - if (bma->dfops->dop_low) - bma->minleft = 0; if (bma->cur) bma->cur->bc_private.b.firstblock = *bma->firstblock; if (bma->blkno == NULLFSBLOCK) @@ -4371,10 +4362,16 @@ xfs_bmapi_allocate( bma->got.br_state = XFS_EXT_NORM; /* - * A wasdelay extent has been initialized, so shouldn't be flagged - * as unwritten. + * In the data fork, a wasdelay extent has been initialized, so + * shouldn't be flagged as unwritten. + * + * For the cow fork, however, we convert delalloc reservations + * (extents allocated for speculative preallocation) to + * allocated unwritten extents, and only convert the unwritten + * extents to real extents when we're about to write the data. */ - if (!bma->wasdel && (bma->flags & XFS_BMAPI_PREALLOC) && + if ((!bma->wasdel || (bma->flags & XFS_BMAPI_COWFORK)) && + (bma->flags & XFS_BMAPI_PREALLOC) && xfs_sb_version_hasextflgbit(&mp->m_sb)) bma->got.br_state = XFS_EXT_UNWRITTEN; @@ -4425,8 +4422,6 @@ xfs_bmapi_convert_unwritten( (XFS_BMAPI_PREALLOC | XFS_BMAPI_CONVERT)) return 0; - ASSERT(whichfork != XFS_COW_FORK); - /* * Modify (by adding) the state flag, if writing. 
*/ @@ -4451,8 +4446,8 @@ xfs_bmapi_convert_unwritten( return error; } - error = xfs_bmap_add_extent_unwritten_real(bma->tp, bma->ip, &bma->idx, - &bma->cur, mval, bma->firstblock, bma->dfops, + error = xfs_bmap_add_extent_unwritten_real(bma->tp, bma->ip, whichfork, + &bma->idx, &bma->cur, mval, bma->firstblock, bma->dfops, &tmp_logflags); /* * Log the inode core unconditionally in the unwritten extent conversion @@ -4461,8 +4456,12 @@ xfs_bmapi_convert_unwritten( * in the transaction for the sake of fsync(), even if nothing has * changed, because fsync() will not force the log for this transaction * unless it sees the inode pinned. + * + * Note: If we're only converting cow fork extents, there aren't + * any on-disk updates to make, so we don't need to log anything. */ - bma->logflags |= tmp_logflags | XFS_ILOG_CORE; + if (whichfork != XFS_COW_FORK) + bma->logflags |= tmp_logflags | XFS_ILOG_CORE; if (error) return error; @@ -4517,8 +4516,6 @@ xfs_bmapi_write( int n; /* current extent index */ xfs_fileoff_t obno; /* old block number (offset) */ int whichfork; /* data or attr fork */ - char inhole; /* current location is hole in file */ - char wasdelay; /* old extent was delayed */ #ifdef DEBUG xfs_fileoff_t orig_bno; /* original block number value */ @@ -4538,15 +4535,15 @@ xfs_bmapi_write( ASSERT(*nmap >= 1); ASSERT(*nmap <= XFS_BMAP_MAX_NMAP); ASSERT(!(flags & XFS_BMAPI_IGSTATE)); - ASSERT(tp != NULL); + ASSERT(tp != NULL || + (flags & (XFS_BMAPI_CONVERT | XFS_BMAPI_COWFORK)) == + (XFS_BMAPI_CONVERT | XFS_BMAPI_COWFORK)); ASSERT(len > 0); ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL); ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); ASSERT(!(flags & XFS_BMAPI_REMAP) || whichfork == XFS_DATA_FORK); ASSERT(!(flags & XFS_BMAPI_PREALLOC) || !(flags & XFS_BMAPI_REMAP)); ASSERT(!(flags & XFS_BMAPI_CONVERT) || !(flags & XFS_BMAPI_REMAP)); - ASSERT(!(flags & XFS_BMAPI_PREALLOC) || whichfork != XFS_COW_FORK); - ASSERT(!(flags & XFS_BMAPI_CONVERT) || whichfork 
!= XFS_COW_FORK); /* zeroing is for currently only for data extents, not metadata */ ASSERT((flags & (XFS_BMAPI_METADATA | XFS_BMAPI_ZERO)) != @@ -4606,22 +4603,44 @@ xfs_bmapi_write( bma.firstblock = firstblock; while (bno < end && n < *nmap) { - inhole = eof || bma.got.br_startoff > bno; - wasdelay = !inhole && isnullstartblock(bma.got.br_startblock); + bool need_alloc = false, wasdelay = false; - /* - * Make sure we only reflink into a hole. - */ - if (flags & XFS_BMAPI_REMAP) - ASSERT(inhole); - if (flags & XFS_BMAPI_COWFORK) - ASSERT(!inhole); + /* in hole or beyoned EOF? */ + if (eof || bma.got.br_startoff > bno) { + if (flags & XFS_BMAPI_DELALLOC) { + /* + * For the COW fork we can reasonably get a + * request for converting an extent that races + * with other threads already having converted + * part of it, as there converting COW to + * regular blocks is not protected using the + * IOLOCK. + */ + ASSERT(flags & XFS_BMAPI_COWFORK); + if (!(flags & XFS_BMAPI_COWFORK)) { + error = -EIO; + goto error0; + } + + if (eof || bno >= end) + break; + } else { + need_alloc = true; + } + } else { + /* + * Make sure we only reflink into a hole. + */ + ASSERT(!(flags & XFS_BMAPI_REMAP)); + if (isnullstartblock(bma.got.br_startblock)) + wasdelay = true; + } /* * First, deal with the hole before the allocated space * that we found, if any. 
*/ - if (inhole || wasdelay) { + if (need_alloc || wasdelay) { bma.eof = eof; bma.conv = !!(flags & XFS_BMAPI_CONVERT); bma.wasdel = wasdelay; @@ -4729,13 +4748,9 @@ error0: if (bma.cur) { if (!error) { ASSERT(*firstblock == NULLFSBLOCK || - XFS_FSB_TO_AGNO(mp, *firstblock) == + XFS_FSB_TO_AGNO(mp, *firstblock) <= XFS_FSB_TO_AGNO(mp, - bma.cur->bc_private.b.firstblock) || - (dfops->dop_low && - XFS_FSB_TO_AGNO(mp, *firstblock) < - XFS_FSB_TO_AGNO(mp, - bma.cur->bc_private.b.firstblock))); + bma.cur->bc_private.b.firstblock)); *firstblock = bma.cur->bc_private.b.firstblock; } xfs_btree_del_cursor(bma.cur, @@ -4770,34 +4785,59 @@ xfs_bmap_split_indlen( xfs_filblks_t len2 = *indlen2; xfs_filblks_t nres = len1 + len2; /* new total res. */ xfs_filblks_t stolen = 0; + xfs_filblks_t resfactor; /* * Steal as many blocks as we can to try and satisfy the worst case * indlen for both new extents. */ - while (nres > ores && avail) { - nres--; - avail--; - stolen++; - } + if (ores < nres && avail) + stolen = XFS_FILBLKS_MIN(nres - ores, avail); + ores += stolen; + + /* nothing else to do if we've satisfied the new reservation */ + if (ores >= nres) + return stolen; + + /* + * We can't meet the total required reservation for the two extents. + * Calculate the percent of the overall shortage between both extents + * and apply this percentage to each of the requested indlen values. + * This distributes the shortage fairly and reduces the chances that one + * of the two extents is left with nothing when extents are repeatedly + * split. + */ + resfactor = (ores * 100); + do_div(resfactor, nres); + len1 *= resfactor; + do_div(len1, 100); + len2 *= resfactor; + do_div(len2, 100); + ASSERT(len1 + len2 <= ores); + ASSERT(len1 < *indlen1 && len2 < *indlen2); /* - * The only blocks available are those reserved for the original - * extent and what we can steal from the extent being removed. 
- * If this still isn't enough to satisfy the combined - * requirements for the two new extents, skim blocks off of each - * of the new reservations until they match what is available. + * Hand out the remainder to each extent. If one of the two reservations + * is zero, we want to make sure that one gets a block first. The loop + * below starts with len1, so hand len2 a block right off the bat if it + * is zero. */ - while (nres > ores) { - if (len1) { - len1--; - nres--; + ores -= (len1 + len2); + ASSERT((*indlen1 - len1) + (*indlen2 - len2) >= ores); + if (ores && !len2 && *indlen2) { + len2++; + ores--; + } + while (ores) { + if (len1 < *indlen1) { + len1++; + ores--; } - if (nres == ores) + if (!ores) break; - if (len2) { - len2--; - nres--; + if (len2 < *indlen2) { + len2++; + ores--; } } @@ -5539,8 +5579,8 @@ __xfs_bunmapi( } del.br_state = XFS_EXT_UNWRITTEN; error = xfs_bmap_add_extent_unwritten_real(tp, ip, - &lastx, &cur, &del, firstblock, dfops, - &logflags); + whichfork, &lastx, &cur, &del, + firstblock, dfops, &logflags); if (error) goto error0; goto nodelete; @@ -5593,8 +5633,9 @@ __xfs_bunmapi( prev.br_state = XFS_EXT_UNWRITTEN; lastx--; error = xfs_bmap_add_extent_unwritten_real(tp, - ip, &lastx, &cur, &prev, - firstblock, dfops, &logflags); + ip, whichfork, &lastx, &cur, + &prev, firstblock, dfops, + &logflags); if (error) goto error0; goto nodelete; @@ -5602,8 +5643,9 @@ __xfs_bunmapi( ASSERT(del.br_state == XFS_EXT_NORM); del.br_state = XFS_EXT_UNWRITTEN; error = xfs_bmap_add_extent_unwritten_real(tp, - ip, &lastx, &cur, &del, - firstblock, dfops, &logflags); + ip, whichfork, &lastx, &cur, + &del, firstblock, dfops, + &logflags); if (error) goto error0; goto nodelete; diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h index cecd094404cc..cdef87db5262 100644 --- a/fs/xfs/libxfs/xfs_bmap.h +++ b/fs/xfs/libxfs/xfs_bmap.h @@ -110,6 +110,9 @@ struct xfs_extent_free_item /* Map something in the CoW fork. 
*/ #define XFS_BMAPI_COWFORK 0x200 +/* Only convert delalloc space, don't allocate entirely new extents */ +#define XFS_BMAPI_DELALLOC 0x400 + #define XFS_BMAPI_FLAGS \ { XFS_BMAPI_ENTIRE, "ENTIRE" }, \ { XFS_BMAPI_METADATA, "METADATA" }, \ @@ -120,7 +123,8 @@ struct xfs_extent_free_item { XFS_BMAPI_CONVERT, "CONVERT" }, \ { XFS_BMAPI_ZERO, "ZERO" }, \ { XFS_BMAPI_REMAP, "REMAP" }, \ - { XFS_BMAPI_COWFORK, "COWFORK" } + { XFS_BMAPI_COWFORK, "COWFORK" }, \ + { XFS_BMAPI_DELALLOC, "DELALLOC" } static inline int xfs_bmapi_aflag(int w) diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c index d6330c297ca0..f93072b58a58 100644 --- a/fs/xfs/libxfs/xfs_bmap_btree.c +++ b/fs/xfs/libxfs/xfs_bmap_btree.c @@ -71,15 +71,9 @@ xfs_bmdr_to_bmbt( xfs_bmbt_key_t *tkp; __be64 *tpp; - if (xfs_sb_version_hascrc(&mp->m_sb)) - xfs_btree_init_block_int(mp, rblock, XFS_BUF_DADDR_NULL, - XFS_BMAP_CRC_MAGIC, 0, 0, ip->i_ino, - XFS_BTREE_LONG_PTRS | XFS_BTREE_CRC_BLOCKS); - else - xfs_btree_init_block_int(mp, rblock, XFS_BUF_DADDR_NULL, - XFS_BMAP_MAGIC, 0, 0, ip->i_ino, + xfs_btree_init_block_int(mp, rblock, XFS_BUF_DADDR_NULL, + XFS_BTNUM_BMAP, 0, 0, ip->i_ino, XFS_BTREE_LONG_PTRS); - rblock->bb_level = dblock->bb_level; ASSERT(be16_to_cpu(rblock->bb_level) > 0); rblock->bb_numrecs = dblock->bb_numrecs; @@ -502,12 +496,11 @@ try_another_ag: if (args.fsbno == NULLFSBLOCK && args.minleft) { /* * Could not find an AG with enough free space to satisfy - * a full btree split. Try again without minleft and if + * a full btree split. Try again and if * successful activate the lowspace algorithm. 
*/ args.fsbno = 0; args.type = XFS_ALLOCTYPE_FIRST_AG; - args.minleft = 0; error = xfs_alloc_vextent(&args); if (error) goto error0; diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index 21e6a6ab6b9a..c3decedc9455 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -50,8 +50,18 @@ static const __uint32_t xfs_magics[2][XFS_BTNUM_MAX] = { XFS_BMAP_CRC_MAGIC, XFS_IBT_CRC_MAGIC, XFS_FIBT_CRC_MAGIC, XFS_REFC_CRC_MAGIC } }; -#define xfs_btree_magic(cur) \ - xfs_magics[!!((cur)->bc_flags & XFS_BTREE_CRC_BLOCKS)][cur->bc_btnum] + +__uint32_t +xfs_btree_magic( + int crc, + xfs_btnum_t btnum) +{ + __uint32_t magic = xfs_magics[crc][btnum]; + + /* Ensure we asked for crc for crc-only magics. */ + ASSERT(magic != 0); + return magic; +} STATIC int /* error (0 or EFSCORRUPTED) */ xfs_btree_check_lblock( @@ -62,10 +72,13 @@ xfs_btree_check_lblock( { int lblock_ok = 1; /* block passes checks */ struct xfs_mount *mp; /* file system mount point */ + xfs_btnum_t btnum = cur->bc_btnum; + int crc; mp = cur->bc_mp; + crc = xfs_sb_version_hascrc(&mp->m_sb); - if (xfs_sb_version_hascrc(&mp->m_sb)) { + if (crc) { lblock_ok = lblock_ok && uuid_equal(&block->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid) && @@ -74,7 +87,7 @@ xfs_btree_check_lblock( } lblock_ok = lblock_ok && - be32_to_cpu(block->bb_magic) == xfs_btree_magic(cur) && + be32_to_cpu(block->bb_magic) == xfs_btree_magic(crc, btnum) && be16_to_cpu(block->bb_level) == level && be16_to_cpu(block->bb_numrecs) <= cur->bc_ops->get_maxrecs(cur, level) && @@ -110,13 +123,16 @@ xfs_btree_check_sblock( struct xfs_agf *agf; /* ag. freespace structure */ xfs_agblock_t agflen; /* native ag. 
freespace length */ int sblock_ok = 1; /* block passes checks */ + xfs_btnum_t btnum = cur->bc_btnum; + int crc; mp = cur->bc_mp; + crc = xfs_sb_version_hascrc(&mp->m_sb); agbp = cur->bc_private.a.agbp; agf = XFS_BUF_TO_AGF(agbp); agflen = be32_to_cpu(agf->agf_length); - if (xfs_sb_version_hascrc(&mp->m_sb)) { + if (crc) { sblock_ok = sblock_ok && uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid) && @@ -125,7 +141,7 @@ xfs_btree_check_sblock( } sblock_ok = sblock_ok && - be32_to_cpu(block->bb_magic) == xfs_btree_magic(cur) && + be32_to_cpu(block->bb_magic) == xfs_btree_magic(crc, btnum) && be16_to_cpu(block->bb_level) == level && be16_to_cpu(block->bb_numrecs) <= cur->bc_ops->get_maxrecs(cur, level) && @@ -810,7 +826,8 @@ xfs_btree_read_bufl( xfs_daddr_t d; /* real disk block address */ int error; - ASSERT(fsbno != NULLFSBLOCK); + if (!XFS_FSB_SANITY_CHECK(mp, fsbno)) + return -EFSCORRUPTED; d = XFS_FSB_TO_DADDR(mp, fsbno); error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, d, mp->m_bsize, lock, &bp, ops); @@ -1084,12 +1101,15 @@ xfs_btree_init_block_int( struct xfs_mount *mp, struct xfs_btree_block *buf, xfs_daddr_t blkno, - __u32 magic, + xfs_btnum_t btnum, __u16 level, __u16 numrecs, __u64 owner, unsigned int flags) { + int crc = xfs_sb_version_hascrc(&mp->m_sb); + __u32 magic = xfs_btree_magic(crc, btnum); + buf->bb_magic = cpu_to_be32(magic); buf->bb_level = cpu_to_be16(level); buf->bb_numrecs = cpu_to_be16(numrecs); @@ -1097,7 +1117,7 @@ xfs_btree_init_block_int( if (flags & XFS_BTREE_LONG_PTRS) { buf->bb_u.l.bb_leftsib = cpu_to_be64(NULLFSBLOCK); buf->bb_u.l.bb_rightsib = cpu_to_be64(NULLFSBLOCK); - if (flags & XFS_BTREE_CRC_BLOCKS) { + if (crc) { buf->bb_u.l.bb_blkno = cpu_to_be64(blkno); buf->bb_u.l.bb_owner = cpu_to_be64(owner); uuid_copy(&buf->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid); @@ -1110,7 +1130,7 @@ xfs_btree_init_block_int( buf->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK); buf->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK); - if (flags 
& XFS_BTREE_CRC_BLOCKS) { + if (crc) { buf->bb_u.s.bb_blkno = cpu_to_be64(blkno); buf->bb_u.s.bb_owner = cpu_to_be32(__owner); uuid_copy(&buf->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid); @@ -1123,14 +1143,14 @@ void xfs_btree_init_block( struct xfs_mount *mp, struct xfs_buf *bp, - __u32 magic, + xfs_btnum_t btnum, __u16 level, __u16 numrecs, __u64 owner, unsigned int flags) { xfs_btree_init_block_int(mp, XFS_BUF_TO_BLOCK(bp), bp->b_bn, - magic, level, numrecs, owner, flags); + btnum, level, numrecs, owner, flags); } STATIC void @@ -1140,7 +1160,7 @@ xfs_btree_init_block_cur( int level, int numrecs) { - __u64 owner; + __u64 owner; /* * we can pull the owner from the cursor right now as the different @@ -1154,7 +1174,7 @@ xfs_btree_init_block_cur( owner = cur->bc_private.a.agno; xfs_btree_init_block_int(cur->bc_mp, XFS_BUF_TO_BLOCK(bp), bp->b_bn, - xfs_btree_magic(cur), level, numrecs, + cur->bc_btnum, level, numrecs, owner, cur->bc_flags); } diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h index b69b947c4c1b..4bb62580a7fd 100644 --- a/fs/xfs/libxfs/xfs_btree.h +++ b/fs/xfs/libxfs/xfs_btree.h @@ -76,6 +76,8 @@ union xfs_btree_rec { #define XFS_BTNUM_RMAP ((xfs_btnum_t)XFS_BTNUM_RMAPi) #define XFS_BTNUM_REFC ((xfs_btnum_t)XFS_BTNUM_REFCi) +__uint32_t xfs_btree_magic(int crc, xfs_btnum_t btnum); + /* * For logging record fields. 
*/ @@ -378,7 +380,7 @@ void xfs_btree_init_block( struct xfs_mount *mp, struct xfs_buf *bp, - __u32 magic, + xfs_btnum_t btnum, __u16 level, __u16 numrecs, __u64 owner, @@ -389,7 +391,7 @@ xfs_btree_init_block_int( struct xfs_mount *mp, struct xfs_btree_block *buf, xfs_daddr_t blkno, - __u32 magic, + xfs_btnum_t btnum, __u16 level, __u16 numrecs, __u64 owner, @@ -456,7 +458,7 @@ static inline int xfs_btree_get_level(struct xfs_btree_block *block) #define XFS_FILBLKS_MAX(a,b) max_t(xfs_filblks_t, (a), (b)) #define XFS_FSB_SANITY_CHECK(mp,fsb) \ - (XFS_FSB_TO_AGNO(mp, fsb) < mp->m_sb.sb_agcount && \ + (fsb && XFS_FSB_TO_AGNO(mp, fsb) < mp->m_sb.sb_agcount && \ XFS_FSB_TO_AGBNO(mp, fsb) < mp->m_sb.sb_agblocks) /* diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c index f2dc1a950c85..1bdf2888295b 100644 --- a/fs/xfs/libxfs/xfs_da_btree.c +++ b/fs/xfs/libxfs/xfs_da_btree.c @@ -2633,7 +2633,7 @@ out_free: /* * Readahead the dir/attr block. */ -xfs_daddr_t +int xfs_da_reada_buf( struct xfs_inode *dp, xfs_dablk_t bno, @@ -2664,7 +2664,5 @@ out_free: if (mapp != &map) kmem_free(mapp); - if (error) - return -1; - return mappedbno; + return error; } diff --git a/fs/xfs/libxfs/xfs_da_btree.h b/fs/xfs/libxfs/xfs_da_btree.h index 98c75cbe6ac2..4e29cb6a3627 100644 --- a/fs/xfs/libxfs/xfs_da_btree.h +++ b/fs/xfs/libxfs/xfs_da_btree.h @@ -201,7 +201,7 @@ int xfs_da_read_buf(struct xfs_trans *trans, struct xfs_inode *dp, xfs_dablk_t bno, xfs_daddr_t mappedbno, struct xfs_buf **bpp, int whichfork, const struct xfs_buf_ops *ops); -xfs_daddr_t xfs_da_reada_buf(struct xfs_inode *dp, xfs_dablk_t bno, +int xfs_da_reada_buf(struct xfs_inode *dp, xfs_dablk_t bno, xfs_daddr_t mapped_bno, int whichfork, const struct xfs_buf_ops *ops); int xfs_da_shrink_inode(xfs_da_args_t *args, xfs_dablk_t dead_blkno, diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c index c58d72c220f5..2f389d366e93 100644 --- a/fs/xfs/libxfs/xfs_dir2.c +++ b/fs/xfs/libxfs/xfs_dir2.c @@ 
-36,21 +36,29 @@ struct xfs_name xfs_name_dotdot = { (unsigned char *)"..", 2, XFS_DIR3_FT_DIR }; /* - * @mode, if set, indicates that the type field needs to be set up. - * This uses the transformation from file mode to DT_* as defined in linux/fs.h - * for file type specification. This will be propagated into the directory - * structure if appropriate for the given operation and filesystem config. + * Convert inode mode to directory entry filetype */ -const unsigned char xfs_mode_to_ftype[S_IFMT >> S_SHIFT] = { - [0] = XFS_DIR3_FT_UNKNOWN, - [S_IFREG >> S_SHIFT] = XFS_DIR3_FT_REG_FILE, - [S_IFDIR >> S_SHIFT] = XFS_DIR3_FT_DIR, - [S_IFCHR >> S_SHIFT] = XFS_DIR3_FT_CHRDEV, - [S_IFBLK >> S_SHIFT] = XFS_DIR3_FT_BLKDEV, - [S_IFIFO >> S_SHIFT] = XFS_DIR3_FT_FIFO, - [S_IFSOCK >> S_SHIFT] = XFS_DIR3_FT_SOCK, - [S_IFLNK >> S_SHIFT] = XFS_DIR3_FT_SYMLINK, -}; +unsigned char xfs_mode_to_ftype(int mode) +{ + switch (mode & S_IFMT) { + case S_IFREG: + return XFS_DIR3_FT_REG_FILE; + case S_IFDIR: + return XFS_DIR3_FT_DIR; + case S_IFCHR: + return XFS_DIR3_FT_CHRDEV; + case S_IFBLK: + return XFS_DIR3_FT_BLKDEV; + case S_IFIFO: + return XFS_DIR3_FT_FIFO; + case S_IFSOCK: + return XFS_DIR3_FT_SOCK; + case S_IFLNK: + return XFS_DIR3_FT_SYMLINK; + default: + return XFS_DIR3_FT_UNKNOWN; + } +} /* * ASCII case-insensitive (ie. 
A-Z) support for directories that was @@ -631,7 +639,8 @@ xfs_dir2_isblock( if ((rval = xfs_bmap_last_offset(args->dp, &last, XFS_DATA_FORK))) return rval; rval = XFS_FSB_TO_B(args->dp->i_mount, last) == args->geo->blksize; - ASSERT(rval == 0 || args->dp->i_d.di_size == args->geo->blksize); + if (rval != 0 && args->dp->i_d.di_size != args->geo->blksize) + return -EFSCORRUPTED; *vp = rval; return 0; } diff --git a/fs/xfs/libxfs/xfs_dir2.h b/fs/xfs/libxfs/xfs_dir2.h index 0197590fa7d7..d6e6d9d16f6c 100644 --- a/fs/xfs/libxfs/xfs_dir2.h +++ b/fs/xfs/libxfs/xfs_dir2.h @@ -18,6 +18,9 @@ #ifndef __XFS_DIR2_H__ #define __XFS_DIR2_H__ +#include "xfs_da_format.h" +#include "xfs_da_btree.h" + struct xfs_defer_ops; struct xfs_da_args; struct xfs_inode; @@ -32,10 +35,9 @@ struct xfs_dir2_data_unused; extern struct xfs_name xfs_name_dotdot; /* - * directory filetype conversion tables. + * Convert inode mode to directory entry filetype */ -#define S_SHIFT 12 -extern const unsigned char xfs_mode_to_ftype[]; +extern unsigned char xfs_mode_to_ftype(int mode); /* * directory operations vector for encode/decode routines diff --git a/fs/xfs/libxfs/xfs_dir2_node.c b/fs/xfs/libxfs/xfs_dir2_node.c index 75a557432d0f..bbd1238852b3 100644 --- a/fs/xfs/libxfs/xfs_dir2_node.c +++ b/fs/xfs/libxfs/xfs_dir2_node.c @@ -155,6 +155,42 @@ const struct xfs_buf_ops xfs_dir3_free_buf_ops = { .verify_write = xfs_dir3_free_write_verify, }; +/* Everything ok in the free block header? 
*/ +static bool +xfs_dir3_free_header_check( + struct xfs_inode *dp, + xfs_dablk_t fbno, + struct xfs_buf *bp) +{ + struct xfs_mount *mp = dp->i_mount; + unsigned int firstdb; + int maxbests; + + maxbests = dp->d_ops->free_max_bests(mp->m_dir_geo); + firstdb = (xfs_dir2_da_to_db(mp->m_dir_geo, fbno) - + xfs_dir2_byte_to_db(mp->m_dir_geo, XFS_DIR2_FREE_OFFSET)) * + maxbests; + if (xfs_sb_version_hascrc(&mp->m_sb)) { + struct xfs_dir3_free_hdr *hdr3 = bp->b_addr; + + if (be32_to_cpu(hdr3->firstdb) != firstdb) + return false; + if (be32_to_cpu(hdr3->nvalid) > maxbests) + return false; + if (be32_to_cpu(hdr3->nvalid) < be32_to_cpu(hdr3->nused)) + return false; + } else { + struct xfs_dir2_free_hdr *hdr = bp->b_addr; + + if (be32_to_cpu(hdr->firstdb) != firstdb) + return false; + if (be32_to_cpu(hdr->nvalid) > maxbests) + return false; + if (be32_to_cpu(hdr->nvalid) < be32_to_cpu(hdr->nused)) + return false; + } + return true; +} static int __xfs_dir3_free_read( @@ -168,11 +204,22 @@ __xfs_dir3_free_read( err = xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp, XFS_DATA_FORK, &xfs_dir3_free_buf_ops); + if (err || !*bpp) + return err; + + /* Check things that we can't do in the verifier. 
*/ + if (!xfs_dir3_free_header_check(dp, fbno, *bpp)) { + xfs_buf_ioerror(*bpp, -EFSCORRUPTED); + xfs_verifier_error(*bpp); + xfs_trans_brelse(tp, *bpp); + return -EFSCORRUPTED; + } /* try read returns without an error or *bpp if it lands in a hole */ - if (!err && tp && *bpp) + if (tp) xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_FREE_BUF); - return err; + + return 0; } int diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index f272abff11e1..d41ade5d293e 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -51,8 +51,7 @@ xfs_ialloc_cluster_alignment( struct xfs_mount *mp) { if (xfs_sb_version_hasalign(&mp->m_sb) && - mp->m_sb.sb_inoalignmt >= - XFS_B_TO_FSBT(mp, mp->m_inode_cluster_size)) + mp->m_sb.sb_inoalignmt >= xfs_icluster_size_fsb(mp)) return mp->m_sb.sb_inoalignmt; return 1; } diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c index 0fd086d03d41..7c471881c9a6 100644 --- a/fs/xfs/libxfs/xfs_ialloc_btree.c +++ b/fs/xfs/libxfs/xfs_ialloc_btree.c @@ -82,11 +82,12 @@ xfs_finobt_set_root( } STATIC int -xfs_inobt_alloc_block( +__xfs_inobt_alloc_block( struct xfs_btree_cur *cur, union xfs_btree_ptr *start, union xfs_btree_ptr *new, - int *stat) + int *stat, + enum xfs_ag_resv_type resv) { xfs_alloc_arg_t args; /* block allocation args */ int error; /* error return value */ @@ -103,6 +104,7 @@ xfs_inobt_alloc_block( args.maxlen = 1; args.prod = 1; args.type = XFS_ALLOCTYPE_NEAR_BNO; + args.resv = resv; error = xfs_alloc_vextent(&args); if (error) { @@ -123,6 +125,27 @@ xfs_inobt_alloc_block( } STATIC int +xfs_inobt_alloc_block( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *start, + union xfs_btree_ptr *new, + int *stat) +{ + return __xfs_inobt_alloc_block(cur, start, new, stat, XFS_AG_RESV_NONE); +} + +STATIC int +xfs_finobt_alloc_block( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *start, + union xfs_btree_ptr *new, + int *stat) +{ + return __xfs_inobt_alloc_block(cur, start, 
new, stat, + XFS_AG_RESV_METADATA); +} + +STATIC int xfs_inobt_free_block( struct xfs_btree_cur *cur, struct xfs_buf *bp) @@ -328,7 +351,7 @@ static const struct xfs_btree_ops xfs_finobt_ops = { .dup_cursor = xfs_inobt_dup_cursor, .set_root = xfs_finobt_set_root, - .alloc_block = xfs_inobt_alloc_block, + .alloc_block = xfs_finobt_alloc_block, .free_block = xfs_inobt_free_block, .get_minrecs = xfs_inobt_get_minrecs, .get_maxrecs = xfs_inobt_get_maxrecs, @@ -480,3 +503,64 @@ xfs_inobt_rec_check_count( return 0; } #endif /* DEBUG */ + +static xfs_extlen_t +xfs_inobt_max_size( + struct xfs_mount *mp) +{ + /* Bail out if we're uninitialized, which can happen in mkfs. */ + if (mp->m_inobt_mxr[0] == 0) + return 0; + + return xfs_btree_calc_size(mp, mp->m_inobt_mnr, + (uint64_t)mp->m_sb.sb_agblocks * mp->m_sb.sb_inopblock / + XFS_INODES_PER_CHUNK); +} + +static int +xfs_inobt_count_blocks( + struct xfs_mount *mp, + xfs_agnumber_t agno, + xfs_btnum_t btnum, + xfs_extlen_t *tree_blocks) +{ + struct xfs_buf *agbp; + struct xfs_btree_cur *cur; + int error; + + error = xfs_ialloc_read_agi(mp, NULL, agno, &agbp); + if (error) + return error; + + cur = xfs_inobt_init_cursor(mp, NULL, agbp, agno, btnum); + error = xfs_btree_count_blocks(cur, tree_blocks); + xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR); + xfs_buf_relse(agbp); + + return error; +} + +/* + * Figure out how many blocks to reserve and how many are used by this btree. 
+ */ +int +xfs_finobt_calc_reserves( + struct xfs_mount *mp, + xfs_agnumber_t agno, + xfs_extlen_t *ask, + xfs_extlen_t *used) +{ + xfs_extlen_t tree_len = 0; + int error; + + if (!xfs_sb_version_hasfinobt(&mp->m_sb)) + return 0; + + error = xfs_inobt_count_blocks(mp, agno, XFS_BTNUM_FINO, &tree_len); + if (error) + return error; + + *ask += xfs_inobt_max_size(mp); + *used += tree_len; + return 0; +} diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.h b/fs/xfs/libxfs/xfs_ialloc_btree.h index bd88453217ce..aa81e2e63f3f 100644 --- a/fs/xfs/libxfs/xfs_ialloc_btree.h +++ b/fs/xfs/libxfs/xfs_ialloc_btree.h @@ -72,4 +72,7 @@ int xfs_inobt_rec_check_count(struct xfs_mount *, #define xfs_inobt_rec_check_count(mp, rec) 0 #endif /* DEBUG */ +int xfs_finobt_calc_reserves(struct xfs_mount *mp, xfs_agnumber_t agno, + xfs_extlen_t *ask, xfs_extlen_t *used); + #endif /* __XFS_IALLOC_BTREE_H__ */ diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index dd483e2767f7..d93f9d918cfc 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -29,6 +29,7 @@ #include "xfs_icache.h" #include "xfs_trans.h" #include "xfs_ialloc.h" +#include "xfs_dir2.h" /* * Check that none of the inode's in the buffer have a next @@ -386,6 +387,7 @@ xfs_dinode_verify( xfs_ino_t ino, struct xfs_dinode *dip) { + uint16_t mode; uint16_t flags; uint64_t flags2; @@ -396,8 +398,12 @@ xfs_dinode_verify( if (be64_to_cpu(dip->di_size) & (1ULL << 63)) return false; - /* No zero-length symlinks. */ - if (S_ISLNK(be16_to_cpu(dip->di_mode)) && dip->di_size == 0) + mode = be16_to_cpu(dip->di_mode); + if (mode && xfs_mode_to_ftype(mode) == XFS_DIR3_FT_UNKNOWN) + return false; + + /* No zero-length symlinks/dirs. 
*/ + if ((S_ISLNK(mode) || S_ISDIR(mode)) && dip->di_size == 0) return false; /* only version 3 or greater inodes are extensively verified here */ diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c index 222e103356c6..25c1e078aef6 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.c +++ b/fs/xfs/libxfs/xfs_inode_fork.c @@ -26,6 +26,7 @@ #include "xfs_inode.h" #include "xfs_trans.h" #include "xfs_inode_item.h" +#include "xfs_btree.h" #include "xfs_bmap_btree.h" #include "xfs_bmap.h" #include "xfs_error.h" @@ -429,11 +430,13 @@ xfs_iformat_btree( /* REFERENCED */ int nrecs; int size; + int level; ifp = XFS_IFORK_PTR(ip, whichfork); dfp = (xfs_bmdr_block_t *)XFS_DFORK_PTR(dip, whichfork); size = XFS_BMAP_BROOT_SPACE(mp, dfp); nrecs = be16_to_cpu(dfp->bb_numrecs); + level = be16_to_cpu(dfp->bb_level); /* * blow out if -- fork has less extents than can fit in @@ -446,7 +449,8 @@ xfs_iformat_btree( XFS_IFORK_MAXEXT(ip, whichfork) || XFS_BMDR_SPACE_CALC(nrecs) > XFS_DFORK_SIZE(dip, mp, whichfork) || - XFS_IFORK_NEXTENTS(ip, whichfork) > ip->i_d.di_nblocks)) { + XFS_IFORK_NEXTENTS(ip, whichfork) > ip->i_d.di_nblocks) || + level == 0 || level > XFS_BTREE_MAXLEVELS) { xfs_warn(mp, "corrupt inode %Lu (btree).", (unsigned long long) ip->i_ino); XFS_CORRUPTION_ERROR("xfs_iformat_btree", XFS_ERRLEVEL_LOW, @@ -497,15 +501,14 @@ xfs_iread_extents( * We know that the size is valid (it's checked in iformat_btree) */ ifp->if_bytes = ifp->if_real_bytes = 0; - ifp->if_flags |= XFS_IFEXTENTS; xfs_iext_add(ifp, 0, nextents); error = xfs_bmap_read_extents(tp, ip, whichfork); if (error) { xfs_iext_destroy(ifp); - ifp->if_flags &= ~XFS_IFEXTENTS; return error; } xfs_validate_extents(ifp, nextents, XFS_EXTFMT_INODE(ip)); + ifp->if_flags |= XFS_IFEXTENTS; return 0; } /* diff --git a/fs/xfs/libxfs/xfs_log_recover.h b/fs/xfs/libxfs/xfs_log_recover.h index d9f65e2d5cc8..29a01ec89dd0 100644 --- a/fs/xfs/libxfs/xfs_log_recover.h +++ b/fs/xfs/libxfs/xfs_log_recover.h @@ -42,7 +42,6 
@@ typedef struct xlog_recover_item { xfs_log_iovec_t *ri_buf; /* ptr to regions buffer */ } xlog_recover_item_t; -struct xlog_tid; typedef struct xlog_recover { struct hlist_node r_list; xlog_tid_t r_log_tid; /* log's transaction id */ diff --git a/fs/xfs/libxfs/xfs_refcount_btree.c b/fs/xfs/libxfs/xfs_refcount_btree.c index 6fb2215f8ff7..50add5272807 100644 --- a/fs/xfs/libxfs/xfs_refcount_btree.c +++ b/fs/xfs/libxfs/xfs_refcount_btree.c @@ -409,13 +409,14 @@ xfs_refcountbt_calc_size( */ xfs_extlen_t xfs_refcountbt_max_size( - struct xfs_mount *mp) + struct xfs_mount *mp, + xfs_agblock_t agblocks) { /* Bail out if we're uninitialized, which can happen in mkfs. */ if (mp->m_refc_mxr[0] == 0) return 0; - return xfs_refcountbt_calc_size(mp, mp->m_sb.sb_agblocks); + return xfs_refcountbt_calc_size(mp, agblocks); } /* @@ -430,22 +431,24 @@ xfs_refcountbt_calc_reserves( { struct xfs_buf *agbp; struct xfs_agf *agf; + xfs_agblock_t agblocks; xfs_extlen_t tree_len; int error; if (!xfs_sb_version_hasreflink(&mp->m_sb)) return 0; - *ask += xfs_refcountbt_max_size(mp); error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agbp); if (error) return error; agf = XFS_BUF_TO_AGF(agbp); + agblocks = be32_to_cpu(agf->agf_length); tree_len = be32_to_cpu(agf->agf_refcount_blocks); xfs_buf_relse(agbp); + *ask += xfs_refcountbt_max_size(mp, agblocks); *used += tree_len; return error; diff --git a/fs/xfs/libxfs/xfs_refcount_btree.h b/fs/xfs/libxfs/xfs_refcount_btree.h index 3be7768bd51a..9db008b955b7 100644 --- a/fs/xfs/libxfs/xfs_refcount_btree.h +++ b/fs/xfs/libxfs/xfs_refcount_btree.h @@ -66,7 +66,8 @@ extern void xfs_refcountbt_compute_maxlevels(struct xfs_mount *mp); extern xfs_extlen_t xfs_refcountbt_calc_size(struct xfs_mount *mp, unsigned long long len); -extern xfs_extlen_t xfs_refcountbt_max_size(struct xfs_mount *mp); +extern xfs_extlen_t xfs_refcountbt_max_size(struct xfs_mount *mp, + xfs_agblock_t agblocks); extern int xfs_refcountbt_calc_reserves(struct xfs_mount *mp, 
xfs_agnumber_t agno, xfs_extlen_t *ask, xfs_extlen_t *used); diff --git a/fs/xfs/libxfs/xfs_rmap_btree.c b/fs/xfs/libxfs/xfs_rmap_btree.c index de25771764ba..74e5a54bc428 100644 --- a/fs/xfs/libxfs/xfs_rmap_btree.c +++ b/fs/xfs/libxfs/xfs_rmap_btree.c @@ -550,13 +550,14 @@ xfs_rmapbt_calc_size( */ xfs_extlen_t xfs_rmapbt_max_size( - struct xfs_mount *mp) + struct xfs_mount *mp, + xfs_agblock_t agblocks) { /* Bail out if we're uninitialized, which can happen in mkfs. */ if (mp->m_rmap_mxr[0] == 0) return 0; - return xfs_rmapbt_calc_size(mp, mp->m_sb.sb_agblocks); + return xfs_rmapbt_calc_size(mp, agblocks); } /* @@ -571,25 +572,24 @@ xfs_rmapbt_calc_reserves( { struct xfs_buf *agbp; struct xfs_agf *agf; - xfs_extlen_t pool_len; + xfs_agblock_t agblocks; xfs_extlen_t tree_len; int error; if (!xfs_sb_version_hasrmapbt(&mp->m_sb)) return 0; - /* Reserve 1% of the AG or enough for 1 block per record. */ - pool_len = max(mp->m_sb.sb_agblocks / 100, xfs_rmapbt_max_size(mp)); - *ask += pool_len; - error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agbp); if (error) return error; agf = XFS_BUF_TO_AGF(agbp); + agblocks = be32_to_cpu(agf->agf_length); tree_len = be32_to_cpu(agf->agf_rmap_blocks); xfs_buf_relse(agbp); + /* Reserve 1% of the AG or enough for 1 block per record. 
*/ + *ask += max(agblocks / 100, xfs_rmapbt_max_size(mp, agblocks)); *used += tree_len; return error; diff --git a/fs/xfs/libxfs/xfs_rmap_btree.h b/fs/xfs/libxfs/xfs_rmap_btree.h index 2a9ac472fb15..19c08e933049 100644 --- a/fs/xfs/libxfs/xfs_rmap_btree.h +++ b/fs/xfs/libxfs/xfs_rmap_btree.h @@ -60,7 +60,8 @@ extern void xfs_rmapbt_compute_maxlevels(struct xfs_mount *mp); extern xfs_extlen_t xfs_rmapbt_calc_size(struct xfs_mount *mp, unsigned long long len); -extern xfs_extlen_t xfs_rmapbt_max_size(struct xfs_mount *mp); +extern xfs_extlen_t xfs_rmapbt_max_size(struct xfs_mount *mp, + xfs_agblock_t agblocks); extern int xfs_rmapbt_calc_reserves(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_extlen_t *ask, xfs_extlen_t *used); diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index 2580262e4ea0..584ec896a533 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -242,7 +242,7 @@ xfs_mount_validate_sb( sbp->sb_blocklog < XFS_MIN_BLOCKSIZE_LOG || sbp->sb_blocklog > XFS_MAX_BLOCKSIZE_LOG || sbp->sb_blocksize != (1 << sbp->sb_blocklog) || - sbp->sb_dirblklog > XFS_MAX_BLOCKSIZE_LOG || + sbp->sb_dirblklog + sbp->sb_blocklog > XFS_MAX_BLOCKSIZE_LOG || sbp->sb_inodesize < XFS_DINODE_MIN_SIZE || sbp->sb_inodesize > XFS_DINODE_MAX_SIZE || sbp->sb_inodelog < XFS_DINODE_MIN_LOG || diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 0f56fcd3a5d5..1ff9df7a3ce8 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -481,6 +481,12 @@ xfs_submit_ioend( struct xfs_ioend *ioend, int status) { + /* Convert CoW extents to regular */ + if (!status && ioend->io_type == XFS_IO_COW) { + status = xfs_reflink_convert_cow(XFS_I(ioend->io_inode), + ioend->io_offset, ioend->io_size); + } + /* Reserve log space if we might write beyond the on-disk inode size. 
*/ if (!status && ioend->io_type != XFS_IO_UNWRITTEN && @@ -1152,19 +1158,22 @@ xfs_vm_releasepage( * block_invalidatepage() can send pages that are still marked dirty * but otherwise have invalidated buffers. * - * We've historically freed buffers on the latter. Instead, quietly - * filter out all dirty pages to avoid spurious buffer state warnings. - * This can likely be removed once shrink_active_list() is fixed. + * We want to release the latter to avoid unnecessary buildup of the + * LRU, skip the former and warn if we've left any lingering + * delalloc/unwritten buffers on clean pages. Skip pages with delalloc + * or unwritten buffers and warn if the page is not dirty. Otherwise + * try to release the buffers. */ - if (PageDirty(page)) - return 0; - xfs_count_page_state(page, &delalloc, &unwritten); - if (WARN_ON_ONCE(delalloc)) + if (delalloc) { + WARN_ON_ONCE(!PageDirty(page)); return 0; - if (WARN_ON_ONCE(unwritten)) + } + if (unwritten) { + WARN_ON_ONCE(!PageDirty(page)); return 0; + } return try_to_free_buffers(page); } diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index b9abce524c33..8b75dcea5966 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -88,7 +88,6 @@ int xfs_bmap_rtalloc( struct xfs_bmalloca *ap) /* bmap alloc argument struct */ { - xfs_alloctype_t atype = 0; /* type for allocation routines */ int error; /* error return value */ xfs_mount_t *mp; /* mount point structure */ xfs_extlen_t prod = 0; /* product factor for allocators */ @@ -155,18 +154,14 @@ xfs_bmap_rtalloc( /* * Realtime allocation, done through xfs_rtallocate_extent. */ - atype = ap->blkno == 0 ? 
XFS_ALLOCTYPE_ANY_AG : XFS_ALLOCTYPE_NEAR_BNO; do_div(ap->blkno, mp->m_sb.sb_rextsize); rtb = ap->blkno; ap->length = ralen; - if ((error = xfs_rtallocate_extent(ap->tp, ap->blkno, 1, ap->length, - &ralen, atype, ap->wasdel, prod, &rtb))) - return error; - if (rtb == NULLFSBLOCK && prod > 1 && - (error = xfs_rtallocate_extent(ap->tp, ap->blkno, 1, - ap->length, &ralen, atype, - ap->wasdel, 1, &rtb))) + error = xfs_rtallocate_extent(ap->tp, ap->blkno, 1, ap->length, + &ralen, ap->wasdel, prod, &rtb); + if (error) return error; + ap->blkno = rtb; if (ap->blkno != NULLFSBLOCK) { ap->blkno *= mp->m_sb.sb_rextsize; @@ -528,7 +523,6 @@ xfs_getbmap( xfs_bmbt_irec_t *map; /* buffer for user's data */ xfs_mount_t *mp; /* file system mount point */ int nex; /* # of user extents can do */ - int nexleft; /* # of user extents left */ int subnex; /* # of bmapi's can do */ int nmap; /* number of map entries */ struct getbmapx *out; /* output structure */ @@ -686,10 +680,8 @@ xfs_getbmap( goto out_free_map; } - nexleft = nex; - do { - nmap = (nexleft > subnex) ? subnex : nexleft; + nmap = (nex> subnex) ? subnex : nex; error = xfs_bmapi_read(ip, XFS_BB_TO_FSBT(mp, bmv->bmv_offset), XFS_BB_TO_FSB(mp, bmv->bmv_length), map, &nmap, bmapi_flags); @@ -697,8 +689,8 @@ xfs_getbmap( goto out_free_map; ASSERT(nmap <= subnex); - for (i = 0; i < nmap && nexleft && bmv->bmv_length && - cur_ext < bmv->bmv_count; i++) { + for (i = 0; i < nmap && bmv->bmv_length && + cur_ext < bmv->bmv_count - 1; i++) { out[cur_ext].bmv_oflags = 0; if (map[i].br_state == XFS_EXT_UNWRITTEN) out[cur_ext].bmv_oflags |= BMV_OF_PREALLOC; @@ -760,16 +752,27 @@ xfs_getbmap( continue; } + /* + * In order to report shared extents accurately, + * we report each distinct shared/unshared part + * of a single bmbt record using multiple bmap + * extents. To make that happen, we iterate the + * same map array item multiple times, each + * time trimming out the subextent that we just + * reported. 
+ * + * Because of this, we must check the out array + * index (cur_ext) directly against bmv_count-1 + * to avoid overflows. + */ if (inject_map.br_startblock != NULLFSBLOCK) { map[i] = inject_map; i--; - } else - nexleft--; + } bmv->bmv_entries++; cur_ext++; } - } while (nmap && nexleft && bmv->bmv_length && - cur_ext < bmv->bmv_count); + } while (nmap && bmv->bmv_length && cur_ext < bmv->bmv_count - 1); out_free_map: kmem_free(map); @@ -779,11 +782,9 @@ xfs_getbmap( xfs_iunlock(ip, XFS_IOLOCK_SHARED); for (i = 0; i < cur_ext; i++) { - int full = 0; /* user array is full */ - /* format results & advance arg */ - error = formatter(&arg, &out[i], &full); - if (error || full) + error = formatter(&arg, &out[i]); + if (error) break; } @@ -909,17 +910,18 @@ xfs_can_free_eofblocks(struct xfs_inode *ip, bool force) */ int xfs_free_eofblocks( - xfs_mount_t *mp, - xfs_inode_t *ip, - bool need_iolock) + struct xfs_inode *ip) { - xfs_trans_t *tp; - int error; - xfs_fileoff_t end_fsb; - xfs_fileoff_t last_fsb; - xfs_filblks_t map_len; - int nimaps; - xfs_bmbt_irec_t imap; + struct xfs_trans *tp; + int error; + xfs_fileoff_t end_fsb; + xfs_fileoff_t last_fsb; + xfs_filblks_t map_len; + int nimaps; + struct xfs_bmbt_irec imap; + struct xfs_mount *mp = ip->i_mount; + + ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL)); /* * Figure out if there are any blocks beyond the end @@ -936,6 +938,10 @@ xfs_free_eofblocks( error = xfs_bmapi_read(ip, end_fsb, map_len, &imap, &nimaps, 0); xfs_iunlock(ip, XFS_ILOCK_SHARED); + /* + * If there are blocks after the end of file, truncate the file to its + * current size to free them up. + */ if (!error && (nimaps != 0) && (imap.br_startblock != HOLESTARTBLOCK || ip->i_delayed_blks)) { @@ -946,22 +952,13 @@ xfs_free_eofblocks( if (error) return error; - /* - * There are blocks after the end of file. - * Free them up now by truncating the file to - * its current size. 
- */ - if (need_iolock) { - if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) - return -EAGAIN; - } + /* wait on dio to ensure i_size has settled */ + inode_dio_wait(VFS_I(ip)); error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp); if (error) { ASSERT(XFS_FORCED_SHUTDOWN(mp)); - if (need_iolock) - xfs_iunlock(ip, XFS_IOLOCK_EXCL); return error; } @@ -989,8 +986,6 @@ xfs_free_eofblocks( } xfs_iunlock(ip, XFS_ILOCK_EXCL); - if (need_iolock) - xfs_iunlock(ip, XFS_IOLOCK_EXCL); } return error; } @@ -1385,10 +1380,16 @@ xfs_shift_file_space( xfs_fileoff_t stop_fsb; xfs_fileoff_t next_fsb; xfs_fileoff_t shift_fsb; + uint resblks; ASSERT(direction == SHIFT_LEFT || direction == SHIFT_RIGHT); if (direction == SHIFT_LEFT) { + /* + * Reserve blocks to cover potential extent merges after left + * shift operations. + */ + resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0); next_fsb = XFS_B_TO_FSB(mp, offset + len); stop_fsb = XFS_B_TO_FSB(mp, VFS_I(ip)->i_size); } else { @@ -1396,6 +1397,7 @@ xfs_shift_file_space( * If right shift, delegate the work of initialization of * next_fsb to xfs_bmap_shift_extent as it has ilock held. */ + resblks = 0; next_fsb = NULLFSBLOCK; stop_fsb = XFS_B_TO_FSB(mp, offset); } @@ -1407,7 +1409,7 @@ xfs_shift_file_space( * into the accessible region of the file. */ if (xfs_can_free_eofblocks(ip, true)) { - error = xfs_free_eofblocks(mp, ip, false); + error = xfs_free_eofblocks(ip); if (error) return error; } @@ -1437,21 +1439,14 @@ xfs_shift_file_space( } while (!error && !done) { - /* - * We would need to reserve permanent block for transaction. - * This will come into picture when after shifting extent into - * hole we found that adjacent extents can be merged which - * may lead to freeing of a block during record update. 
- */ - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, - XFS_DIOSTRAT_SPACE_RES(mp, 0), 0, 0, &tp); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, + &tp); if (error) break; xfs_ilock(ip, XFS_ILOCK_EXCL); error = xfs_trans_reserve_quota(tp, mp, ip->i_udquot, - ip->i_gdquot, ip->i_pdquot, - XFS_DIOSTRAT_SPACE_RES(mp, 0), 0, + ip->i_gdquot, ip->i_pdquot, resblks, 0, XFS_QMOPT_RES_REGBLKS); if (error) goto out_trans_cancel; diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h index 68a621a8e0c0..135d8267e284 100644 --- a/fs/xfs/xfs_bmap_util.h +++ b/fs/xfs/xfs_bmap_util.h @@ -35,7 +35,7 @@ int xfs_bmap_punch_delalloc_range(struct xfs_inode *ip, xfs_fileoff_t start_fsb, xfs_fileoff_t length); /* bmap to userspace formatter - copy to user & advance pointer */ -typedef int (*xfs_bmap_format_t)(void **, struct getbmapx *, int *); +typedef int (*xfs_bmap_format_t)(void **, struct getbmapx *); int xfs_getbmap(struct xfs_inode *ip, struct getbmapx *bmv, xfs_bmap_format_t formatter, void *arg); @@ -63,8 +63,7 @@ int xfs_insert_file_space(struct xfs_inode *, xfs_off_t offset, /* EOF block manipulation functions */ bool xfs_can_free_eofblocks(struct xfs_inode *ip, bool force); -int xfs_free_eofblocks(struct xfs_mount *mp, struct xfs_inode *ip, - bool need_iolock); +int xfs_free_eofblocks(struct xfs_inode *ip); int xfs_swap_extents(struct xfs_inode *ip, struct xfs_inode *tip, struct xfs_swapext *sx); diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 7f0a01f7b592..8c7d01b75922 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -422,6 +422,7 @@ retry: out_free_pages: for (i = 0; i < bp->b_page_count; i++) __free_page(bp->b_pages[i]); + bp->b_flags &= ~_XBF_PAGES; return error; } @@ -757,7 +758,7 @@ xfs_buf_readahead_map( int nmaps, const struct xfs_buf_ops *ops) { - if (bdi_read_congested(target->bt_bdi)) + if (bdi_read_congested(target->bt_bdev->bd_bdi)) return; xfs_buf_read_map(target, map, nmaps, @@ -1790,7 +1791,6 @@ xfs_alloc_buftarg( 
btp->bt_mount = mp; btp->bt_dev = bdev->bd_dev; btp->bt_bdev = bdev; - btp->bt_bdi = blk_get_backing_dev_info(bdev); if (xfs_setsize_buftarg_early(btp, bdev)) goto error; diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index 8a9d3a9599f0..3c867e5a63e1 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -109,7 +109,6 @@ typedef unsigned int xfs_buf_flags_t; typedef struct xfs_buftarg { dev_t bt_dev; struct block_device *bt_bdev; - struct backing_dev_info *bt_bdi; struct xfs_mount *bt_mount; unsigned int bt_meta_sectorsize; size_t bt_meta_sectormask; diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 2975cb2319f4..0306168af332 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -1162,6 +1162,7 @@ xfs_buf_iodone_callbacks( */ bp->b_last_error = 0; bp->b_retries = 0; + bp->b_first_retry_time = 0; xfs_buf_do_callbacks(bp); bp->b_fspriv = NULL; diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c index 4ff499aa7338..d796ffac7296 100644 --- a/fs/xfs/xfs_discard.c +++ b/fs/xfs/xfs_discard.c @@ -208,32 +208,3 @@ xfs_ioc_trim( return -EFAULT; return 0; } - -int -xfs_discard_extents( - struct xfs_mount *mp, - struct list_head *list) -{ - struct xfs_extent_busy *busyp; - int error = 0; - - list_for_each_entry(busyp, list, list) { - trace_xfs_discard_extent(mp, busyp->agno, busyp->bno, - busyp->length); - - error = blkdev_issue_discard(mp->m_ddev_targp->bt_bdev, - XFS_AGB_TO_DADDR(mp, busyp->agno, busyp->bno), - XFS_FSB_TO_BB(mp, busyp->length), - GFP_NOFS, 0); - if (error && error != -EOPNOTSUPP) { - xfs_info(mp, - "discard failed for extent [0x%llx,%u], error %d", - (unsigned long long)busyp->bno, - busyp->length, - error); - return error; - } - } - - return 0; -} diff --git a/fs/xfs/xfs_discard.h b/fs/xfs/xfs_discard.h index 344879aea646..0f070f9e44e1 100644 --- a/fs/xfs/xfs_discard.h +++ b/fs/xfs/xfs_discard.h @@ -5,6 +5,5 @@ struct fstrim_range; struct list_head; extern int xfs_ioc_trim(struct xfs_mount *, struct fstrim_range __user *); 
-extern int xfs_discard_extents(struct xfs_mount *, struct list_head *); #endif /* XFS_DISCARD_H */ diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index 7a30b8f11db7..9d06cc30e875 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -710,6 +710,10 @@ xfs_dq_get_next_id( /* Simple advance */ next_id = *id + 1; + /* If we'd wrap past the max ID, stop */ + if (next_id < *id) + return -ENOENT; + /* If new ID is within the current chunk, advancing it sufficed */ if (next_id % mp->m_quotainfo->qi_dqperchunk) { *id = next_id; diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c index 162dc186cf04..77760dbf0242 100644 --- a/fs/xfs/xfs_extent_busy.c +++ b/fs/xfs/xfs_extent_busy.c @@ -45,18 +45,7 @@ xfs_extent_busy_insert( struct rb_node **rbp; struct rb_node *parent = NULL; - new = kmem_zalloc(sizeof(struct xfs_extent_busy), KM_MAYFAIL); - if (!new) { - /* - * No Memory! Since it is now not possible to track the free - * block, make this a synchronous transaction to insure that - * the block is not reused before this transaction commits. - */ - trace_xfs_extent_busy_enomem(tp->t_mountp, agno, bno, len); - xfs_trans_set_sync(tp); - return; - } - + new = kmem_zalloc(sizeof(struct xfs_extent_busy), KM_SLEEP); new->agno = agno; new->bno = bno; new->length = len; @@ -345,25 +334,31 @@ restart: * subset of the extent that is not busy. If *rlen is smaller than * args->minlen no suitable extent could be found, and the higher level * code needs to force out the log and retry the allocation. + * + * Return the current busy generation for the AG if the extent is busy. This + * value can be used to wait for at least one of the currently busy extents + * to be cleared. Note that the busy list is not guaranteed to be empty after + * the gen is woken. The state of a specific extent must always be confirmed + * with another call to xfs_extent_busy_trim() before it can be used. 
*/ -void +bool xfs_extent_busy_trim( struct xfs_alloc_arg *args, - xfs_agblock_t bno, - xfs_extlen_t len, - xfs_agblock_t *rbno, - xfs_extlen_t *rlen) + xfs_agblock_t *bno, + xfs_extlen_t *len, + unsigned *busy_gen) { xfs_agblock_t fbno; xfs_extlen_t flen; struct rb_node *rbp; + bool ret = false; - ASSERT(len > 0); + ASSERT(*len > 0); spin_lock(&args->pag->pagb_lock); restart: - fbno = bno; - flen = len; + fbno = *bno; + flen = *len; rbp = args->pag->pagb_tree.rb_node; while (rbp && flen >= args->minlen) { struct xfs_extent_busy *busyp = @@ -515,24 +510,25 @@ restart: flen = fend - fbno; } - spin_unlock(&args->pag->pagb_lock); +out: - if (fbno != bno || flen != len) { - trace_xfs_extent_busy_trim(args->mp, args->agno, bno, len, + if (fbno != *bno || flen != *len) { + trace_xfs_extent_busy_trim(args->mp, args->agno, *bno, *len, fbno, flen); + *bno = fbno; + *len = flen; + *busy_gen = args->pag->pagb_gen; + ret = true; } - *rbno = fbno; - *rlen = flen; - return; + spin_unlock(&args->pag->pagb_lock); + return ret; fail: /* * Return a zero extent length as failure indications. All callers * re-check if the trimmed extent satisfies the minlen requirement. */ - spin_unlock(&args->pag->pagb_lock); - trace_xfs_extent_busy_trim(args->mp, args->agno, bno, len, fbno, 0); - *rbno = fbno; - *rlen = 0; + flen = 0; + goto out; } STATIC void @@ -551,6 +547,21 @@ xfs_extent_busy_clear_one( kmem_free(busyp); } +static void +xfs_extent_busy_put_pag( + struct xfs_perag *pag, + bool wakeup) + __releases(pag->pagb_lock) +{ + if (wakeup) { + pag->pagb_gen++; + wake_up_all(&pag->pagb_wait); + } + + spin_unlock(&pag->pagb_lock); + xfs_perag_put(pag); +} + /* * Remove all extents on the passed in list from the busy extents tree. 
* If do_discard is set skip extents that need to be discarded, and mark @@ -565,27 +576,76 @@ xfs_extent_busy_clear( struct xfs_extent_busy *busyp, *n; struct xfs_perag *pag = NULL; xfs_agnumber_t agno = NULLAGNUMBER; + bool wakeup = false; list_for_each_entry_safe(busyp, n, list, list) { if (busyp->agno != agno) { - if (pag) { - spin_unlock(&pag->pagb_lock); - xfs_perag_put(pag); - } - pag = xfs_perag_get(mp, busyp->agno); - spin_lock(&pag->pagb_lock); + if (pag) + xfs_extent_busy_put_pag(pag, wakeup); agno = busyp->agno; + pag = xfs_perag_get(mp, agno); + spin_lock(&pag->pagb_lock); + wakeup = false; } if (do_discard && busyp->length && - !(busyp->flags & XFS_EXTENT_BUSY_SKIP_DISCARD)) + !(busyp->flags & XFS_EXTENT_BUSY_SKIP_DISCARD)) { busyp->flags = XFS_EXTENT_BUSY_DISCARDED; - else + } else { xfs_extent_busy_clear_one(mp, pag, busyp); + wakeup = true; + } } - if (pag) { - spin_unlock(&pag->pagb_lock); + if (pag) + xfs_extent_busy_put_pag(pag, wakeup); +} + +/* + * Flush out all busy extents for this AG. 
+ */ +void +xfs_extent_busy_flush( + struct xfs_mount *mp, + struct xfs_perag *pag, + unsigned busy_gen) +{ + DEFINE_WAIT (wait); + int log_flushed = 0, error; + + trace_xfs_log_force(mp, 0, _THIS_IP_); + error = _xfs_log_force(mp, XFS_LOG_SYNC, &log_flushed); + if (error) + return; + + do { + prepare_to_wait(&pag->pagb_wait, &wait, TASK_KILLABLE); + if (busy_gen != READ_ONCE(pag->pagb_gen)) + break; + schedule(); + } while (1); + + finish_wait(&pag->pagb_wait, &wait); +} + +void +xfs_extent_busy_wait_all( + struct xfs_mount *mp) +{ + DEFINE_WAIT (wait); + xfs_agnumber_t agno; + + for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) { + struct xfs_perag *pag = xfs_perag_get(mp, agno); + + do { + prepare_to_wait(&pag->pagb_wait, &wait, TASK_KILLABLE); + if (RB_EMPTY_ROOT(&pag->pagb_tree)) + break; + schedule(); + } while (1); + finish_wait(&pag->pagb_wait, &wait); + xfs_perag_put(pag); } } @@ -596,9 +656,17 @@ xfs_extent_busy_clear( int xfs_extent_busy_ag_cmp( void *priv, - struct list_head *a, - struct list_head *b) + struct list_head *l1, + struct list_head *l2) { - return container_of(a, struct xfs_extent_busy, list)->agno - - container_of(b, struct xfs_extent_busy, list)->agno; + struct xfs_extent_busy *b1 = + container_of(l1, struct xfs_extent_busy, list); + struct xfs_extent_busy *b2 = + container_of(l2, struct xfs_extent_busy, list); + s32 diff; + + diff = b1->agno - b2->agno; + if (!diff) + diff = b1->bno - b2->bno; + return diff; } diff --git a/fs/xfs/xfs_extent_busy.h b/fs/xfs/xfs_extent_busy.h index bfff284d2dcc..60195ea1b84a 100644 --- a/fs/xfs/xfs_extent_busy.h +++ b/fs/xfs/xfs_extent_busy.h @@ -58,9 +58,16 @@ void xfs_extent_busy_reuse(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t fbno, xfs_extlen_t flen, bool userdata); +bool +xfs_extent_busy_trim(struct xfs_alloc_arg *args, xfs_agblock_t *bno, + xfs_extlen_t *len, unsigned *busy_gen); + +void +xfs_extent_busy_flush(struct xfs_mount *mp, struct xfs_perag *pag, + unsigned busy_gen); + void 
-xfs_extent_busy_trim(struct xfs_alloc_arg *args, xfs_agblock_t bno, - xfs_extlen_t len, xfs_agblock_t *rbno, xfs_extlen_t *rlen); +xfs_extent_busy_wait_all(struct xfs_mount *mp); int xfs_extent_busy_ag_cmp(void *priv, struct list_head *a, struct list_head *b); diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index bbb9eb6811b2..022014016d80 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -527,6 +527,15 @@ xfs_file_dio_aio_write( if ((iocb->ki_pos & mp->m_blockmask) || ((iocb->ki_pos + count) & mp->m_blockmask)) { unaligned_io = 1; + + /* + * We can't properly handle unaligned direct I/O to reflink + * files yet, as we can't unshare a partial block. + */ + if (xfs_is_reflink_inode(ip)) { + trace_xfs_reflink_bounce_dio_write(ip, iocb->ki_pos, count); + return -EREMCHG; + } iolock = XFS_IOLOCK_EXCL; } else { iolock = XFS_IOLOCK_SHARED; @@ -552,14 +561,6 @@ xfs_file_dio_aio_write( } trace_xfs_file_direct_write(ip, count, iocb->ki_pos); - - /* If this is a block-aligned directio CoW, remap immediately. */ - if (xfs_is_reflink_inode(ip) && !unaligned_io) { - ret = xfs_reflink_allocate_cow_range(ip, iocb->ki_pos, count); - if (ret) - goto out; - } - ret = iomap_dio_rw(iocb, from, &xfs_iomap_ops, xfs_dio_write_end_io); out: xfs_iunlock(ip, iolock); @@ -614,8 +615,10 @@ xfs_file_buffered_aio_write( struct xfs_inode *ip = XFS_I(inode); ssize_t ret; int enospc = 0; - int iolock = XFS_IOLOCK_EXCL; + int iolock; +write_retry: + iolock = XFS_IOLOCK_EXCL; xfs_ilock(ip, iolock); ret = xfs_file_aio_write_checks(iocb, from, &iolock); @@ -625,7 +628,6 @@ xfs_file_buffered_aio_write( /* We can write back this queue in page reclaim */ current->backing_dev_info = inode_to_bdi(inode); -write_retry: trace_xfs_file_buffered_write(ip, iov_iter_count(from), iocb->ki_pos); ret = iomap_file_buffered_write(iocb, from, &xfs_iomap_ops); if (likely(ret >= 0)) @@ -641,18 +643,21 @@ write_retry: * running at the same time. 
*/ if (ret == -EDQUOT && !enospc) { + xfs_iunlock(ip, iolock); enospc = xfs_inode_free_quota_eofblocks(ip); if (enospc) goto write_retry; enospc = xfs_inode_free_quota_cowblocks(ip); if (enospc) goto write_retry; + iolock = 0; } else if (ret == -ENOSPC && !enospc) { struct xfs_eofblocks eofb = {0}; enospc = 1; xfs_flush_inodes(ip->i_mount); - eofb.eof_scan_owner = ip->i_ino; /* for locking */ + + xfs_iunlock(ip, iolock); eofb.eof_flags = XFS_EOF_FLAGS_SYNC; xfs_icache_free_eofblocks(ip->i_mount, &eofb); goto write_retry; @@ -660,7 +665,8 @@ write_retry: current->backing_dev_info = NULL; out: - xfs_iunlock(ip, iolock); + if (iolock) + xfs_iunlock(ip, iolock); return ret; } @@ -908,9 +914,9 @@ xfs_dir_open( */ mode = xfs_ilock_data_map_shared(ip); if (ip->i_d.di_nextents > 0) - xfs_dir3_data_readahead(ip, 0, -1); + error = xfs_dir3_data_readahead(ip, 0, -1); xfs_iunlock(ip, mode); - return 0; + return error; } STATIC int @@ -1431,12 +1437,9 @@ xfs_filemap_fault( */ STATIC int xfs_filemap_pmd_fault( - struct vm_area_struct *vma, - unsigned long addr, - pmd_t *pmd, - unsigned int flags) + struct vm_fault *vmf) { - struct inode *inode = file_inode(vma->vm_file); + struct inode *inode = file_inode(vmf->vma->vm_file); struct xfs_inode *ip = XFS_I(inode); int ret; @@ -1445,16 +1448,16 @@ xfs_filemap_pmd_fault( trace_xfs_filemap_pmd_fault(ip); - if (flags & FAULT_FLAG_WRITE) { + if (vmf->flags & FAULT_FLAG_WRITE) { sb_start_pagefault(inode->i_sb); - file_update_time(vma->vm_file); + file_update_time(vmf->vma->vm_file); } xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); - ret = dax_iomap_pmd_fault(vma, addr, pmd, flags, &xfs_iomap_ops); + ret = dax_iomap_pmd_fault(vmf, &xfs_iomap_ops); xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); - if (flags & FAULT_FLAG_WRITE) + if (vmf->flags & FAULT_FLAG_WRITE) sb_end_pagefault(inode->i_sb); return ret; diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 93d12fa2670d..6ccaae9eb0ee 100644 --- a/fs/xfs/xfs_fsops.c +++ 
b/fs/xfs/xfs_fsops.c @@ -352,12 +352,7 @@ xfs_growfs_data_private( goto error0; } - if (xfs_sb_version_hascrc(&mp->m_sb)) - xfs_btree_init_block(mp, bp, XFS_ABTB_CRC_MAGIC, 0, 1, - agno, XFS_BTREE_CRC_BLOCKS); - else - xfs_btree_init_block(mp, bp, XFS_ABTB_MAGIC, 0, 1, - agno, 0); + xfs_btree_init_block(mp, bp, XFS_BTNUM_BNO, 0, 1, agno, 0); arec = XFS_ALLOC_REC_ADDR(mp, XFS_BUF_TO_BLOCK(bp), 1); arec->ar_startblock = cpu_to_be32(mp->m_ag_prealloc_blocks); @@ -381,12 +376,7 @@ xfs_growfs_data_private( goto error0; } - if (xfs_sb_version_hascrc(&mp->m_sb)) - xfs_btree_init_block(mp, bp, XFS_ABTC_CRC_MAGIC, 0, 1, - agno, XFS_BTREE_CRC_BLOCKS); - else - xfs_btree_init_block(mp, bp, XFS_ABTC_MAGIC, 0, 1, - agno, 0); + xfs_btree_init_block(mp, bp, XFS_BTNUM_CNT, 0, 1, agno, 0); arec = XFS_ALLOC_REC_ADDR(mp, XFS_BUF_TO_BLOCK(bp), 1); arec->ar_startblock = cpu_to_be32(mp->m_ag_prealloc_blocks); @@ -413,8 +403,8 @@ xfs_growfs_data_private( goto error0; } - xfs_btree_init_block(mp, bp, XFS_RMAP_CRC_MAGIC, 0, 0, - agno, XFS_BTREE_CRC_BLOCKS); + xfs_btree_init_block(mp, bp, XFS_BTNUM_RMAP, 0, 0, + agno, 0); block = XFS_BUF_TO_BLOCK(bp); @@ -488,12 +478,7 @@ xfs_growfs_data_private( goto error0; } - if (xfs_sb_version_hascrc(&mp->m_sb)) - xfs_btree_init_block(mp, bp, XFS_IBT_CRC_MAGIC, 0, 0, - agno, XFS_BTREE_CRC_BLOCKS); - else - xfs_btree_init_block(mp, bp, XFS_IBT_MAGIC, 0, 0, - agno, 0); + xfs_btree_init_block(mp, bp, XFS_BTNUM_INO , 0, 0, agno, 0); error = xfs_bwrite(bp); xfs_buf_relse(bp); @@ -513,13 +498,8 @@ xfs_growfs_data_private( goto error0; } - if (xfs_sb_version_hascrc(&mp->m_sb)) - xfs_btree_init_block(mp, bp, XFS_FIBT_CRC_MAGIC, - 0, 0, agno, - XFS_BTREE_CRC_BLOCKS); - else - xfs_btree_init_block(mp, bp, XFS_FIBT_MAGIC, 0, - 0, agno, 0); + xfs_btree_init_block(mp, bp, XFS_BTNUM_FINO, + 0, 0, agno, 0); error = xfs_bwrite(bp); xfs_buf_relse(bp); @@ -540,9 +520,8 @@ xfs_growfs_data_private( goto error0; } - xfs_btree_init_block(mp, bp, XFS_REFC_CRC_MAGIC, - 0, 0, 
agno, - XFS_BTREE_CRC_BLOCKS); + xfs_btree_init_block(mp, bp, XFS_BTNUM_REFC, + 0, 0, agno, 0); error = xfs_bwrite(bp); xfs_buf_relse(bp); @@ -631,6 +610,20 @@ xfs_growfs_data_private( xfs_set_low_space_thresholds(mp); mp->m_alloc_set_aside = xfs_alloc_set_aside(mp); + /* + * If we expanded the last AG, free the per-AG reservation + * so we can reinitialize it with the new size. + */ + if (new) { + struct xfs_perag *pag; + + pag = xfs_perag_get(mp, agno); + error = xfs_ag_resv_free(pag); + xfs_perag_put(pag); + if (error) + goto out; + } + /* Reserve AG metadata blocks. */ error = xfs_fs_reserve_ag_blocks(mp); if (error && error != -ENOSPC) diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index ff4d6311c7f4..7234b9748c36 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -1322,13 +1322,10 @@ xfs_inode_free_eofblocks( int flags, void *args) { - int ret; + int ret = 0; struct xfs_eofblocks *eofb = args; - bool need_iolock = true; int match; - ASSERT(!eofb || (eofb && eofb->eof_scan_owner != 0)); - if (!xfs_can_free_eofblocks(ip, false)) { /* inode could be preallocated or append-only */ trace_xfs_inode_free_eofblocks_invalid(ip); @@ -1356,21 +1353,19 @@ xfs_inode_free_eofblocks( if (eofb->eof_flags & XFS_EOF_FLAGS_MINFILESIZE && XFS_ISIZE(ip) < eofb->eof_min_file_size) return 0; - - /* - * A scan owner implies we already hold the iolock. Skip it in - * xfs_free_eofblocks() to avoid deadlock. This also eliminates - * the possibility of EAGAIN being returned. - */ - if (eofb->eof_scan_owner == ip->i_ino) - need_iolock = false; } - ret = xfs_free_eofblocks(ip->i_mount, ip, need_iolock); - - /* don't revisit the inode if we're not waiting */ - if (ret == -EAGAIN && !(flags & SYNC_WAIT)) - ret = 0; + /* + * If the caller is waiting, return -EAGAIN to keep the background + * scanner moving and revisit the inode in a subsequent pass. 
+ */ + if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) { + if (flags & SYNC_WAIT) + ret = -EAGAIN; + return ret; + } + ret = xfs_free_eofblocks(ip); + xfs_iunlock(ip, XFS_IOLOCK_EXCL); return ret; } @@ -1417,15 +1412,10 @@ __xfs_inode_free_quota_eofblocks( struct xfs_eofblocks eofb = {0}; struct xfs_dquot *dq; - ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL)); - /* - * Set the scan owner to avoid a potential livelock. Otherwise, the scan - * can repeatedly trylock on the inode we're currently processing. We - * run a sync scan to increase effectiveness and use the union filter to + * Run a sync scan to increase effectiveness and use the union filter to * cover all applicable quotas in a single scan. */ - eofb.eof_scan_owner = ip->i_ino; eofb.eof_flags = XFS_EOF_FLAGS_UNION|XFS_EOF_FLAGS_SYNC; if (XFS_IS_UQUOTA_ENFORCED(ip->i_mount)) { @@ -1577,12 +1567,9 @@ xfs_inode_free_cowblocks( { int ret; struct xfs_eofblocks *eofb = args; - bool need_iolock = true; int match; struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); - ASSERT(!eofb || (eofb && eofb->eof_scan_owner != 0)); - /* * Just clear the tag if we have an empty cow fork or none at all. It's * possible the inode was fully unshared since it was originally tagged. @@ -1597,7 +1584,8 @@ xfs_inode_free_cowblocks( * If the mapping is dirty or under writeback we cannot touch the * CoW fork. Leave it alone if we're in the midst of a directio. */ - if (mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_DIRTY) || + if ((VFS_I(ip)->i_state & I_DIRTY_PAGES) || + mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_DIRTY) || mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_WRITEBACK) || atomic_read(&VFS_I(ip)->i_dio_count)) return 0; @@ -1614,28 +1602,16 @@ xfs_inode_free_cowblocks( if (eofb->eof_flags & XFS_EOF_FLAGS_MINFILESIZE && XFS_ISIZE(ip) < eofb->eof_min_file_size) return 0; - - /* - * A scan owner implies we already hold the iolock. Skip it in - * xfs_free_eofblocks() to avoid deadlock. 
This also eliminates - * the possibility of EAGAIN being returned. - */ - if (eofb->eof_scan_owner == ip->i_ino) - need_iolock = false; } /* Free the CoW blocks */ - if (need_iolock) { - xfs_ilock(ip, XFS_IOLOCK_EXCL); - xfs_ilock(ip, XFS_MMAPLOCK_EXCL); - } + xfs_ilock(ip, XFS_IOLOCK_EXCL); + xfs_ilock(ip, XFS_MMAPLOCK_EXCL); ret = xfs_reflink_cancel_cow_range(ip, 0, NULLFILEOFF); - if (need_iolock) { - xfs_iunlock(ip, XFS_MMAPLOCK_EXCL); - xfs_iunlock(ip, XFS_IOLOCK_EXCL); - } + xfs_iunlock(ip, XFS_MMAPLOCK_EXCL); + xfs_iunlock(ip, XFS_IOLOCK_EXCL); return ret; } diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h index a1e02f4708ab..8a7c849b4dea 100644 --- a/fs/xfs/xfs_icache.h +++ b/fs/xfs/xfs_icache.h @@ -27,7 +27,6 @@ struct xfs_eofblocks { kgid_t eof_gid; prid_t eof_prid; __u64 eof_min_file_size; - xfs_ino_t eof_scan_owner; }; #define SYNC_WAIT 0x0001 /* wait for i/o to complete */ @@ -102,7 +101,6 @@ xfs_fs_eofblocks_from_user( dst->eof_flags = src->eof_flags; dst->eof_prid = src->eof_prid; dst->eof_min_file_size = src->eof_min_file_size; - dst->eof_scan_owner = NULLFSINO; dst->eof_uid = INVALID_UID; if (src->eof_flags & XFS_EOF_FLAGS_UID) { diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index b9557795eb74..edfa6a55b064 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1692,32 +1692,34 @@ xfs_release( if (xfs_can_free_eofblocks(ip, false)) { /* + * Check if the inode is being opened, written and closed + * frequently and we have delayed allocation blocks outstanding + * (e.g. streaming writes from the NFS server), truncating the + * blocks past EOF will cause fragmentation to occur. + * + * In this case don't do the truncation, but we have to be + * careful how we detect this case. Blocks beyond EOF show up as + * i_delayed_blks even when the inode is clean, so we need to + * truncate them away first before checking for a dirty release. 
+ * Hence on the first dirty close we will still remove the + * speculative allocation, but after that we will leave it in + * place. + */ + if (xfs_iflags_test(ip, XFS_IDIRTY_RELEASE)) + return 0; + /* * If we can't get the iolock just skip truncating the blocks * past EOF because we could deadlock with the mmap_sem - * otherwise. We'll get another chance to drop them once the + * otherwise. We'll get another chance to drop them once the * last reference to the inode is dropped, so we'll never leak * blocks permanently. - * - * Further, check if the inode is being opened, written and - * closed frequently and we have delayed allocation blocks - * outstanding (e.g. streaming writes from the NFS server), - * truncating the blocks past EOF will cause fragmentation to - * occur. - * - * In this case don't do the truncation, either, but we have to - * be careful how we detect this case. Blocks beyond EOF show - * up as i_delayed_blks even when the inode is clean, so we - * need to truncate them away first before checking for a dirty - * release. Hence on the first dirty close we will still remove - * the speculative allocation, but after that we will leave it - * in place. */ - if (xfs_iflags_test(ip, XFS_IDIRTY_RELEASE)) - return 0; - - error = xfs_free_eofblocks(mp, ip, true); - if (error && error != -EAGAIN) - return error; + if (xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) { + error = xfs_free_eofblocks(ip); + xfs_iunlock(ip, XFS_IOLOCK_EXCL); + if (error) + return error; + } /* delalloc blocks after truncation means it really is dirty */ if (ip->i_delayed_blks) @@ -1792,22 +1794,23 @@ xfs_inactive_ifree( int error; /* - * The ifree transaction might need to allocate blocks for record - * insertion to the finobt. We don't want to fail here at ENOSPC, so - * allow ifree to dip into the reserved block pool if necessary. - * - * Freeing large sets of inodes generally means freeing inode chunks, - * directory and file data blocks, so this should be relatively safe. 
- * Only under severe circumstances should it be possible to free enough - * inodes to exhaust the reserve block pool via finobt expansion while - * at the same time not creating free space in the filesystem. + * We try to use a per-AG reservation for any block needed by the finobt + * tree, but as the finobt feature predates the per-AG reservation + * support a degraded file system might not have enough space for the + * reservation at mount time. In that case try to dip into the reserved + * pool and pray. * * Send a warning if the reservation does happen to fail, as the inode * now remains allocated and sits on the unlinked list until the fs is * repaired. */ - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ifree, - XFS_IFREE_SPACE_RES(mp), 0, XFS_TRANS_RESERVE, &tp); + if (unlikely(mp->m_inotbt_nores)) { + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ifree, + XFS_IFREE_SPACE_RES(mp), 0, XFS_TRANS_RESERVE, + &tp); + } else { + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ifree, 0, 0, 0, &tp); + } if (error) { if (error == -ENOSPC) { xfs_warn_ratelimited(mp, @@ -1903,8 +1906,11 @@ xfs_inactive( * cache. Post-eof blocks must be freed, lest we end up with * broken free space accounting. 
*/ - if (xfs_can_free_eofblocks(ip, true)) - xfs_free_eofblocks(mp, ip, false); + if (xfs_can_free_eofblocks(ip, true)) { + xfs_ilock(ip, XFS_IOLOCK_EXCL); + xfs_free_eofblocks(ip); + xfs_iunlock(ip, XFS_IOLOCK_EXCL); + } return; } diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index c67cfb451fd3..cf1363dbf32b 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -1524,7 +1524,7 @@ out_drop_write: } STATIC int -xfs_getbmap_format(void **ap, struct getbmapx *bmv, int *full) +xfs_getbmap_format(void **ap, struct getbmapx *bmv) { struct getbmap __user *base = (struct getbmap __user *)*ap; @@ -1567,7 +1567,7 @@ xfs_ioc_getbmap( } STATIC int -xfs_getbmapx_format(void **ap, struct getbmapx *bmv, int *full) +xfs_getbmapx_format(void **ap, struct getbmapx *bmv) { struct getbmapx __user *base = (struct getbmapx __user *)*ap; diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 0d147428971e..41662fb14e87 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -162,7 +162,7 @@ xfs_iomap_write_direct( xfs_fileoff_t last_fsb; xfs_filblks_t count_fsb, resaligned; xfs_fsblock_t firstfsb; - xfs_extlen_t extsz, temp; + xfs_extlen_t extsz; int nimaps; int quota_flag; int rt; @@ -203,14 +203,7 @@ xfs_iomap_write_direct( } count_fsb = last_fsb - offset_fsb; ASSERT(count_fsb > 0); - - resaligned = count_fsb; - if (unlikely(extsz)) { - if ((temp = do_mod(offset_fsb, extsz))) - resaligned += temp; - if ((temp = do_mod(resaligned, extsz))) - resaligned += extsz - temp; - } + resaligned = xfs_aligned_fsb_count(offset_fsb, count_fsb, extsz); if (unlikely(rt)) { resrtextents = qblocks = resaligned; @@ -681,11 +674,11 @@ xfs_iomap_write_allocate( xfs_trans_t *tp; int nimaps; int error = 0; - int flags = 0; + int flags = XFS_BMAPI_DELALLOC; int nres; if (whichfork == XFS_COW_FORK) - flags |= XFS_BMAPI_COWFORK; + flags |= XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC; /* * Make sure that the dquots are there. 
@@ -1002,47 +995,31 @@ xfs_file_iomap_begin( offset_fsb = XFS_B_TO_FSBT(mp, offset); end_fsb = XFS_B_TO_FSB(mp, offset + length); - if (xfs_is_reflink_inode(ip) && - (flags & IOMAP_WRITE) && (flags & IOMAP_DIRECT)) { - shared = xfs_reflink_find_cow_mapping(ip, offset, &imap); - if (shared) { - xfs_iunlock(ip, lockmode); - goto alloc_done; - } - ASSERT(!isnullstartblock(imap.br_startblock)); - } - error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap, &nimaps, 0); if (error) goto out_unlock; - if ((flags & IOMAP_REPORT) || - (xfs_is_reflink_inode(ip) && - (flags & IOMAP_WRITE) && (flags & IOMAP_DIRECT))) { + if (flags & IOMAP_REPORT) { /* Trim the mapping to the nearest shared extent boundary. */ error = xfs_reflink_trim_around_shared(ip, &imap, &shared, &trimmed); if (error) goto out_unlock; - - /* - * We're here because we're trying to do a directio write to a - * region that isn't aligned to a filesystem block. If the - * extent is shared, fall back to buffered mode to handle the - * RMW. 
- */ - if (!(flags & IOMAP_REPORT) && shared) { - trace_xfs_reflink_bounce_dio_write(ip, &imap); - error = -EREMCHG; - goto out_unlock; - } } if ((flags & (IOMAP_WRITE | IOMAP_ZERO)) && xfs_is_reflink_inode(ip)) { - error = xfs_reflink_reserve_cow(ip, &imap, &shared); - if (error) - goto out_unlock; + if (flags & IOMAP_DIRECT) { + /* may drop and re-acquire the ilock */ + error = xfs_reflink_allocate_cow(ip, &imap, &shared, + &lockmode); + if (error) + goto out_unlock; + } else { + error = xfs_reflink_reserve_cow(ip, &imap, &shared); + if (error) + goto out_unlock; + } end_fsb = imap.br_startoff + imap.br_blockcount; length = XFS_FSB_TO_B(mp, end_fsb) - offset; @@ -1071,7 +1048,6 @@ xfs_file_iomap_begin( if (error) return error; -alloc_done: iomap->flags = IOMAP_F_NEW; trace_xfs_iomap_alloc(ip, offset, length, 0, &imap); } else { @@ -1102,7 +1078,19 @@ xfs_file_iomap_end_delalloc( xfs_fileoff_t end_fsb; int error = 0; - start_fsb = XFS_B_TO_FSB(mp, offset + written); + /* behave as if the write failed if drop writes is enabled */ + if (xfs_mp_drop_writes(mp)) + written = 0; + + /* + * start_fsb refers to the first unused block after a short write. If + * nothing was written, round offset down to point at the first block in + * the range. + */ + if (unlikely(!written)) + start_fsb = XFS_B_TO_FSBT(mp, offset); + else + start_fsb = XFS_B_TO_FSB(mp, offset + written); end_fsb = XFS_B_TO_FSB(mp, offset + length); /* @@ -1114,6 +1102,9 @@ xfs_file_iomap_end_delalloc( * blocks in the range, they are ours. 
*/ if (start_fsb < end_fsb) { + truncate_pagecache_range(VFS_I(ip), XFS_FSB_TO_B(mp, start_fsb), + XFS_FSB_TO_B(mp, end_fsb) - 1); + xfs_ilock(ip, XFS_ILOCK_EXCL); error = xfs_bmap_punch_delalloc_range(ip, start_fsb, end_fsb - start_fsb); @@ -1144,7 +1135,7 @@ xfs_file_iomap_end( return 0; } -struct iomap_ops xfs_iomap_ops = { +const struct iomap_ops xfs_iomap_ops = { .iomap_begin = xfs_file_iomap_begin, .iomap_end = xfs_file_iomap_end, }; @@ -1190,6 +1181,6 @@ out_unlock: return error; } -struct iomap_ops xfs_xattr_iomap_ops = { +const struct iomap_ops xfs_xattr_iomap_ops = { .iomap_begin = xfs_xattr_iomap_begin, }; diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h index 6d45cf01fcff..00db3ecea084 100644 --- a/fs/xfs/xfs_iomap.h +++ b/fs/xfs/xfs_iomap.h @@ -33,7 +33,27 @@ void xfs_bmbt_to_iomap(struct xfs_inode *, struct iomap *, struct xfs_bmbt_irec *); xfs_extlen_t xfs_eof_alignment(struct xfs_inode *ip, xfs_extlen_t extsize); -extern struct iomap_ops xfs_iomap_ops; -extern struct iomap_ops xfs_xattr_iomap_ops; +static inline xfs_filblks_t +xfs_aligned_fsb_count( + xfs_fileoff_t offset_fsb, + xfs_filblks_t count_fsb, + xfs_extlen_t extsz) +{ + if (extsz) { + xfs_extlen_t align; + + align = do_mod(offset_fsb, extsz); + if (align) + count_fsb += align; + align = do_mod(count_fsb, extsz); + if (align) + count_fsb += extsz - align; + } + + return count_fsb; +} + +extern const struct iomap_ops xfs_iomap_ops; +extern const struct iomap_ops xfs_xattr_iomap_ops; #endif /* __XFS_IOMAP_H__*/ diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 308bebb6dfd2..22c16155f1b4 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -98,12 +98,27 @@ xfs_init_security( static void xfs_dentry_to_name( struct xfs_name *namep, + struct dentry *dentry) +{ + namep->name = dentry->d_name.name; + namep->len = dentry->d_name.len; + namep->type = XFS_DIR3_FT_UNKNOWN; +} + +static int +xfs_dentry_mode_to_name( + struct xfs_name *namep, struct dentry *dentry, int mode) { namep->name 
= dentry->d_name.name; namep->len = dentry->d_name.len; - namep->type = xfs_mode_to_ftype[(mode & S_IFMT) >> S_SHIFT]; + namep->type = xfs_mode_to_ftype(mode); + + if (unlikely(namep->type == XFS_DIR3_FT_UNKNOWN)) + return -EFSCORRUPTED; + + return 0; } STATIC void @@ -119,7 +134,7 @@ xfs_cleanup_inode( * xfs_init_security we must back out. * ENOSPC can hit here, among other things. */ - xfs_dentry_to_name(&teardown, dentry, 0); + xfs_dentry_to_name(&teardown, dentry); xfs_remove(XFS_I(dir), &teardown, XFS_I(inode)); } @@ -154,8 +169,12 @@ xfs_generic_create( if (error) return error; + /* Verify mode is valid also for tmpfile case */ + error = xfs_dentry_mode_to_name(&name, dentry, mode); + if (unlikely(error)) + goto out_free_acl; + if (!tmpfile) { - xfs_dentry_to_name(&name, dentry, mode); error = xfs_create(XFS_I(dir), &name, mode, rdev, &ip); } else { error = xfs_create_tmpfile(XFS_I(dir), dentry, mode, &ip); @@ -248,7 +267,7 @@ xfs_vn_lookup( if (dentry->d_name.len >= MAXNAMELEN) return ERR_PTR(-ENAMETOOLONG); - xfs_dentry_to_name(&name, dentry, 0); + xfs_dentry_to_name(&name, dentry); error = xfs_lookup(XFS_I(dir), &name, &cip, NULL); if (unlikely(error)) { if (unlikely(error != -ENOENT)) @@ -275,7 +294,7 @@ xfs_vn_ci_lookup( if (dentry->d_name.len >= MAXNAMELEN) return ERR_PTR(-ENAMETOOLONG); - xfs_dentry_to_name(&xname, dentry, 0); + xfs_dentry_to_name(&xname, dentry); error = xfs_lookup(XFS_I(dir), &xname, &ip, &ci_name); if (unlikely(error)) { if (unlikely(error != -ENOENT)) @@ -310,7 +329,9 @@ xfs_vn_link( struct xfs_name name; int error; - xfs_dentry_to_name(&name, dentry, inode->i_mode); + error = xfs_dentry_mode_to_name(&name, dentry, inode->i_mode); + if (unlikely(error)) + return error; error = xfs_link(XFS_I(dir), XFS_I(inode), &name); if (unlikely(error)) @@ -329,7 +350,7 @@ xfs_vn_unlink( struct xfs_name name; int error; - xfs_dentry_to_name(&name, dentry, 0); + xfs_dentry_to_name(&name, dentry); error = xfs_remove(XFS_I(dir), &name, 
XFS_I(d_inode(dentry))); if (error) @@ -359,7 +380,9 @@ xfs_vn_symlink( mode = S_IFLNK | (irix_symlink_mode ? 0777 & ~current_umask() : S_IRWXUGO); - xfs_dentry_to_name(&name, dentry, mode); + error = xfs_dentry_mode_to_name(&name, dentry, mode); + if (unlikely(error)) + goto out; error = xfs_symlink(XFS_I(dir), &name, symname, mode, &cip); if (unlikely(error)) @@ -395,6 +418,7 @@ xfs_vn_rename( { struct inode *new_inode = d_inode(ndentry); int omode = 0; + int error; struct xfs_name oname; struct xfs_name nname; @@ -405,8 +429,14 @@ xfs_vn_rename( if (flags & RENAME_EXCHANGE) omode = d_inode(ndentry)->i_mode; - xfs_dentry_to_name(&oname, odentry, omode); - xfs_dentry_to_name(&nname, ndentry, d_inode(odentry)->i_mode); + error = xfs_dentry_mode_to_name(&oname, odentry, omode); + if (omode && unlikely(error)) + return error; + + error = xfs_dentry_mode_to_name(&nname, ndentry, + d_inode(odentry)->i_mode); + if (unlikely(error)) + return error; return xfs_rename(XFS_I(odir), &oname, XFS_I(d_inode(odentry)), XFS_I(ndir), &nname, diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h index e467218c0098..7a989de224f4 100644 --- a/fs/xfs/xfs_linux.h +++ b/fs/xfs/xfs_linux.h @@ -331,11 +331,11 @@ static inline __uint64_t howmany_64(__uint64_t x, __uint32_t y) } #define ASSERT_ALWAYS(expr) \ - (unlikely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__)) + (likely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__)) #ifdef DEBUG #define ASSERT(expr) \ - (unlikely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__)) + (likely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__)) #ifndef STATIC # define STATIC noinline @@ -346,7 +346,7 @@ static inline __uint64_t howmany_64(__uint64_t x, __uint32_t y) #ifdef XFS_WARN #define ASSERT(expr) \ - (unlikely(expr) ? (void)0 : asswarn(#expr, __FILE__, __LINE__)) + (likely(expr) ? 
(void)0 : asswarn(#expr, __FILE__, __LINE__)) #ifndef STATIC # define STATIC static noinline diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index c39ac14ff540..b1469f0a91a6 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -3317,12 +3317,8 @@ xfs_log_force( xfs_mount_t *mp, uint flags) { - int error; - trace_xfs_log_force(mp, 0, _RET_IP_); - error = _xfs_log_force(mp, flags, NULL); - if (error) - xfs_warn(mp, "%s: error %d returned.", __func__, error); + _xfs_log_force(mp, flags, NULL); } /* @@ -3466,12 +3462,8 @@ xfs_log_force_lsn( xfs_lsn_t lsn, uint flags) { - int error; - trace_xfs_log_force(mp, lsn, _RET_IP_); - error = _xfs_log_force_lsn(mp, lsn, flags, NULL); - if (error) - xfs_warn(mp, "%s: error %d returned.", __func__, error); + _xfs_log_force_lsn(mp, lsn, flags, NULL); } /* diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h index b5e71072fde5..cc5a9f1574e7 100644 --- a/fs/xfs/xfs_log.h +++ b/fs/xfs/xfs_log.h @@ -124,7 +124,6 @@ struct xlog_ticket; struct xfs_log_item; struct xfs_item_ops; struct xfs_trans; -struct xfs_log_callback; xfs_lsn_t xfs_log_done(struct xfs_mount *mp, struct xlog_ticket *ticket, diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index a4ab192e1792..82f1cbcc4de1 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -30,6 +30,9 @@ #include "xfs_trans_priv.h" #include "xfs_log.h" #include "xfs_log_priv.h" +#include "xfs_trace.h" + +struct workqueue_struct *xfs_discard_wq; /* * Allocate a new ticket. Failing to get a new ticket makes it really hard to @@ -491,6 +494,75 @@ xlog_cil_free_logvec( } } +static void +xlog_discard_endio_work( + struct work_struct *work) +{ + struct xfs_cil_ctx *ctx = + container_of(work, struct xfs_cil_ctx, discard_endio_work); + struct xfs_mount *mp = ctx->cil->xc_log->l_mp; + + xfs_extent_busy_clear(mp, &ctx->busy_extents, false); + kmem_free(ctx); +} + +/* + * Queue up the actual completion to a thread to avoid IRQ-safe locking for + * pagb_lock. 
Note that we need a unbounded workqueue, otherwise we might + * get the execution delayed up to 30 seconds for weird reasons. + */ +static void +xlog_discard_endio( + struct bio *bio) +{ + struct xfs_cil_ctx *ctx = bio->bi_private; + + INIT_WORK(&ctx->discard_endio_work, xlog_discard_endio_work); + queue_work(xfs_discard_wq, &ctx->discard_endio_work); +} + +static void +xlog_discard_busy_extents( + struct xfs_mount *mp, + struct xfs_cil_ctx *ctx) +{ + struct list_head *list = &ctx->busy_extents; + struct xfs_extent_busy *busyp; + struct bio *bio = NULL; + struct blk_plug plug; + int error = 0; + + ASSERT(mp->m_flags & XFS_MOUNT_DISCARD); + + blk_start_plug(&plug); + list_for_each_entry(busyp, list, list) { + trace_xfs_discard_extent(mp, busyp->agno, busyp->bno, + busyp->length); + + error = __blkdev_issue_discard(mp->m_ddev_targp->bt_bdev, + XFS_AGB_TO_DADDR(mp, busyp->agno, busyp->bno), + XFS_FSB_TO_BB(mp, busyp->length), + GFP_NOFS, 0, &bio); + if (error && error != -EOPNOTSUPP) { + xfs_info(mp, + "discard failed for extent [0x%llx,%u], error %d", + (unsigned long long)busyp->bno, + busyp->length, + error); + break; + } + } + + if (bio) { + bio->bi_private = ctx; + bio->bi_end_io = xlog_discard_endio; + submit_bio(bio); + } else { + xlog_discard_endio_work(&ctx->discard_endio_work); + } + blk_finish_plug(&plug); +} + /* * Mark all items committed and clear busy extents. 
We free the log vector * chains in a separate pass so that we unpin the log items as quickly as @@ -525,14 +597,10 @@ xlog_cil_committed( xlog_cil_free_logvec(ctx->lv_chain); - if (!list_empty(&ctx->busy_extents)) { - ASSERT(mp->m_flags & XFS_MOUNT_DISCARD); - - xfs_discard_extents(mp, &ctx->busy_extents); - xfs_extent_busy_clear(mp, &ctx->busy_extents, false); - } - - kmem_free(ctx); + if (!list_empty(&ctx->busy_extents)) + xlog_discard_busy_extents(mp, ctx); + else + kmem_free(ctx); } /* diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index 2b6eec52178e..c2604a5366f2 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -257,6 +257,7 @@ struct xfs_cil_ctx { struct xfs_log_vec *lv_chain; /* logvecs being pushed */ struct xfs_log_callback log_cb; /* completion callback hook. */ struct list_head committing; /* ctx committing list */ + struct work_struct discard_endio_work; }; /* diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 9b9540db17a6..450bde68bb75 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -45,6 +45,7 @@ #include "xfs_rmap_btree.h" #include "xfs_refcount_btree.h" #include "xfs_reflink.h" +#include "xfs_extent_busy.h" static DEFINE_MUTEX(xfs_uuid_table_mutex); @@ -187,7 +188,7 @@ xfs_initialize_perag( xfs_agnumber_t *maxagi) { xfs_agnumber_t index; - xfs_agnumber_t first_initialised = 0; + xfs_agnumber_t first_initialised = NULLAGNUMBER; xfs_perag_t *pag; int error = -ENOMEM; @@ -202,22 +203,21 @@ xfs_initialize_perag( xfs_perag_put(pag); continue; } - if (!first_initialised) - first_initialised = index; pag = kmem_zalloc(sizeof(*pag), KM_MAYFAIL); if (!pag) - goto out_unwind; + goto out_unwind_new_pags; pag->pag_agno = index; pag->pag_mount = mp; spin_lock_init(&pag->pag_ici_lock); mutex_init(&pag->pag_ici_reclaim_lock); INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC); if (xfs_buf_hash_init(pag)) - goto out_unwind; + goto out_free_pag; + init_waitqueue_head(&pag->pagb_wait); if (radix_tree_preload(GFP_NOFS)) - 
goto out_unwind; + goto out_hash_destroy; spin_lock(&mp->m_perag_lock); if (radix_tree_insert(&mp->m_perag_tree, index, pag)) { @@ -225,10 +225,13 @@ xfs_initialize_perag( spin_unlock(&mp->m_perag_lock); radix_tree_preload_end(); error = -EEXIST; - goto out_unwind; + goto out_hash_destroy; } spin_unlock(&mp->m_perag_lock); radix_tree_preload_end(); + /* first new pag is fully initialized */ + if (first_initialised == NULLAGNUMBER) + first_initialised = index; } index = xfs_set_inode_alloc(mp, agcount); @@ -239,11 +242,16 @@ xfs_initialize_perag( mp->m_ag_prealloc_blocks = xfs_prealloc_blocks(mp); return 0; -out_unwind: +out_hash_destroy: xfs_buf_hash_destroy(pag); +out_free_pag: kmem_free(pag); - for (; index > first_initialised; index--) { +out_unwind_new_pags: + /* unwind any prior newly initialized pags */ + for (index = first_initialised; index < agcount; index++) { pag = radix_tree_delete(&mp->m_perag_tree, index); + if (!pag) + break; xfs_buf_hash_destroy(pag); kmem_free(pag); } @@ -1073,6 +1081,13 @@ xfs_unmountfs( xfs_log_force(mp, XFS_LOG_SYNC); /* + * Wait for all busy extents to be freed, including completion of + * any discard operation. + */ + xfs_extent_busy_wait_all(mp); + flush_workqueue(xfs_discard_wq); + + /* * We now need to tell the world we are unmounting. This will allow * us to detect that the filesystem is going away and we should error * out anything that we have been retrying in the background. This will diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 84f785218907..6db6fd6b82b0 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -140,6 +140,7 @@ typedef struct xfs_mount { int m_fixedfsid[2]; /* unchanged for life of FS */ uint m_dmevmask; /* DMI events for this FS */ __uint64_t m_flags; /* global mount flags */ + bool m_inotbt_nores; /* no per-AG finobt resv. 
*/ int m_ialloc_inos; /* inodes in inode allocation */ int m_ialloc_blks; /* blocks in inode allocation */ int m_ialloc_min_blks;/* min blocks in sparse inode @@ -199,11 +200,12 @@ typedef struct xfs_mount { /* * DEBUG mode instrumentation to test and/or trigger delayed allocation * block killing in the event of failed writes. When enabled, all - * buffered writes are forced to fail. All delalloc blocks in the range - * of the write (including pre-existing delalloc blocks!) are tossed as - * part of the write failure error handling sequence. + * buffered writes are silenty dropped and handled as if they failed. + * All delalloc blocks in the range of the write (including pre-existing + * delalloc blocks!) are tossed as part of the write failure error + * handling sequence. */ - bool m_fail_writes; + bool m_drop_writes; #endif } xfs_mount_t; @@ -324,13 +326,13 @@ xfs_daddr_to_agbno(struct xfs_mount *mp, xfs_daddr_t d) #ifdef DEBUG static inline bool -xfs_mp_fail_writes(struct xfs_mount *mp) +xfs_mp_drop_writes(struct xfs_mount *mp) { - return mp->m_fail_writes; + return mp->m_drop_writes; } #else static inline bool -xfs_mp_fail_writes(struct xfs_mount *mp) +xfs_mp_drop_writes(struct xfs_mount *mp) { return 0; } @@ -383,6 +385,8 @@ typedef struct xfs_perag { xfs_agino_t pagl_rightrec; spinlock_t pagb_lock; /* lock for pagb_tree */ struct rb_root pagb_tree; /* ordered tree of busy extents */ + unsigned int pagb_gen; /* generation count for pagb_tree */ + wait_queue_head_t pagb_wait; /* woken when pagb_gen changes */ atomic_t pagf_fstrms; /* # of filestreams active in this AG */ diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 45e50ea90769..b669b123287b 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -1177,7 +1177,8 @@ xfs_qm_dqusage_adjust( * the case in all other instances. It's OK that we do this because * quotacheck is done only at mount time. 
*/ - error = xfs_iget(mp, NULL, ino, 0, XFS_ILOCK_EXCL, &ip); + error = xfs_iget(mp, NULL, ino, XFS_IGET_DONTCACHE, XFS_ILOCK_EXCL, + &ip); if (error) { *res = BULKSTAT_RV_NOTHING; return error; diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c index fe86a668a57e..6e4c7446c3d4 100644 --- a/fs/xfs/xfs_refcount_item.c +++ b/fs/xfs/xfs_refcount_item.c @@ -526,13 +526,14 @@ xfs_cui_recover( xfs_refcount_finish_one_cleanup(tp, rcur, error); error = xfs_defer_finish(&tp, &dfops, NULL); if (error) - goto abort_error; + goto abort_defer; set_bit(XFS_CUI_RECOVERED, &cuip->cui_flags); error = xfs_trans_commit(tp); return error; abort_error: xfs_refcount_finish_one_cleanup(tp, rcur, error); +abort_defer: xfs_defer_cancel(&dfops); xfs_trans_cancel(tp); return error; diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index 07593a362cd0..da6d08fb359c 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -82,11 +82,22 @@ * mappings are a reservation against the free space in the filesystem; * adjacent mappings can also be combined into fewer larger mappings. * + * As an optimization, the CoW extent size hint (cowextsz) creates + * outsized aligned delalloc reservations in the hope of landing out of + * order nearby CoW writes in a single extent on disk, thereby reducing + * fragmentation and improving future performance. + * + * D: --RRRRRRSSSRRRRRRRR--- (data fork) + * C: ------DDDDDDD--------- (CoW fork) + * * When dirty pages are being written out (typically in writepage), the - * delalloc reservations are converted into real mappings by allocating - * blocks and replacing the delalloc mapping with real ones. A delalloc - * mapping can be replaced by several real ones if the free space is - * fragmented. + * delalloc reservations are converted into unwritten mappings by + * allocating blocks and replacing the delalloc mapping with real ones. + * A delalloc mapping can be replaced by several unwritten ones if the + * free space is fragmented. 
+ * + * D: --RRRRRRSSSRRRRRRRR--- + * C: ------UUUUUUU--------- * * We want to adapt the delalloc mechanism for copy-on-write, since the * write paths are similar. The first two steps (creating the reservation @@ -101,13 +112,29 @@ * Block-aligned directio writes will use the same mechanism as buffered * writes. * + * Just prior to submitting the actual disk write requests, we convert + * the extents representing the range of the file actually being written + * (as opposed to extra pieces created for the cowextsize hint) to real + * extents. This will become important in the next step: + * + * D: --RRRRRRSSSRRRRRRRR--- + * C: ------UUrrUUU--------- + * * CoW remapping must be done after the data block write completes, * because we don't want to destroy the old data fork map until we're sure * the new block has been written. Since the new mappings are kept in a * separate fork, we can simply iterate these mappings to find the ones * that cover the file blocks that we just CoW'd. For each extent, simply * unmap the corresponding range in the data fork, map the new range into - * the data fork, and remove the extent from the CoW fork. + * the data fork, and remove the extent from the CoW fork. Because of + * the presence of the cowextsize hint, however, we must be careful + * only to remap the blocks that we've actually written out -- we must + * never remap delalloc reservations nor CoW staging blocks that have + * yet to be written. This corresponds exactly to the real extents in + * the CoW fork: + * + * D: --RRRRRRrrSRRRRRRRR--- + * C: ------UU--UUU--------- * * Since the remapping operation can be applied to an arbitrary file * range, we record the need for the remap step as a flag in the ioend @@ -296,103 +323,165 @@ xfs_reflink_reserve_cow( return 0; } -/* Allocate all CoW reservations covering a range of blocks in a file. 
*/ -static int -__xfs_reflink_allocate_cow( - struct xfs_inode *ip, - xfs_fileoff_t *offset_fsb, - xfs_fileoff_t end_fsb) +/* Convert part of an unwritten CoW extent to a real one. */ +STATIC int +xfs_reflink_convert_cow_extent( + struct xfs_inode *ip, + struct xfs_bmbt_irec *imap, + xfs_fileoff_t offset_fsb, + xfs_filblks_t count_fsb, + struct xfs_defer_ops *dfops) { - struct xfs_mount *mp = ip->i_mount; - struct xfs_bmbt_irec imap; - struct xfs_defer_ops dfops; - struct xfs_trans *tp; - xfs_fsblock_t first_block; - int nimaps = 1, error; - bool shared; - - xfs_defer_init(&dfops, &first_block); + xfs_fsblock_t first_block; + int nimaps = 1; - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, 0, 0, - XFS_TRANS_RESERVE, &tp); - if (error) - return error; + if (imap->br_state == XFS_EXT_NORM) + return 0; - xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trim_extent(imap, offset_fsb, count_fsb); + trace_xfs_reflink_convert_cow(ip, imap); + if (imap->br_blockcount == 0) + return 0; + return xfs_bmapi_write(NULL, ip, imap->br_startoff, imap->br_blockcount, + XFS_BMAPI_COWFORK | XFS_BMAPI_CONVERT, &first_block, + 0, imap, &nimaps, dfops); +} - /* Read extent from the source file. */ - nimaps = 1; - error = xfs_bmapi_read(ip, *offset_fsb, end_fsb - *offset_fsb, - &imap, &nimaps, 0); - if (error) - goto out_unlock; - ASSERT(nimaps == 1); +/* Convert all of the unwritten CoW extents in a file's range to real ones. 
*/ +int +xfs_reflink_convert_cow( + struct xfs_inode *ip, + xfs_off_t offset, + xfs_off_t count) +{ + struct xfs_bmbt_irec got; + struct xfs_defer_ops dfops; + struct xfs_mount *mp = ip->i_mount; + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); + xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset); + xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + count); + xfs_extnum_t idx; + bool found; + int error = 0; - error = xfs_reflink_reserve_cow(ip, &imap, &shared); - if (error) - goto out_trans_cancel; + xfs_ilock(ip, XFS_ILOCK_EXCL); - if (!shared) { - *offset_fsb = imap.br_startoff + imap.br_blockcount; - goto out_trans_cancel; + /* Convert all the extents to real from unwritten. */ + for (found = xfs_iext_lookup_extent(ip, ifp, offset_fsb, &idx, &got); + found && got.br_startoff < end_fsb; + found = xfs_iext_get_extent(ifp, ++idx, &got)) { + error = xfs_reflink_convert_cow_extent(ip, &got, offset_fsb, + end_fsb - offset_fsb, &dfops); + if (error) + break; } - xfs_trans_ijoin(tp, ip, 0); - error = xfs_bmapi_write(tp, ip, imap.br_startoff, imap.br_blockcount, - XFS_BMAPI_COWFORK, &first_block, - XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK), - &imap, &nimaps, &dfops); - if (error) - goto out_trans_cancel; - - error = xfs_defer_finish(&tp, &dfops, NULL); - if (error) - goto out_trans_cancel; - - error = xfs_trans_commit(tp); - - *offset_fsb = imap.br_startoff + imap.br_blockcount; -out_unlock: + /* Finish up. */ xfs_iunlock(ip, XFS_ILOCK_EXCL); return error; -out_trans_cancel: - xfs_defer_cancel(&dfops); - xfs_trans_cancel(tp); - goto out_unlock; } -/* Allocate all CoW reservations covering a part of a file. */ +/* Allocate all CoW reservations covering a range of blocks in a file. 
*/ int -xfs_reflink_allocate_cow_range( +xfs_reflink_allocate_cow( struct xfs_inode *ip, - xfs_off_t offset, - xfs_off_t count) + struct xfs_bmbt_irec *imap, + bool *shared, + uint *lockmode) { struct xfs_mount *mp = ip->i_mount; - xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset); - xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + count); - int error; + xfs_fileoff_t offset_fsb = imap->br_startoff; + xfs_filblks_t count_fsb = imap->br_blockcount; + struct xfs_bmbt_irec got; + struct xfs_defer_ops dfops; + struct xfs_trans *tp = NULL; + xfs_fsblock_t first_block; + int nimaps, error = 0; + bool trimmed; + xfs_filblks_t resaligned; + xfs_extlen_t resblks = 0; + xfs_extnum_t idx; +retry: ASSERT(xfs_is_reflink_inode(ip)); - - trace_xfs_reflink_allocate_cow_range(ip, offset, count); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL | XFS_ILOCK_SHARED)); /* - * Make sure that the dquots are there. + * Even if the extent is not shared we might have a preallocation for + * it in the COW fork. If so use it. */ - error = xfs_qm_dqattach(ip, 0); - if (error) - return error; + if (xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &idx, &got) && + got.br_startoff <= offset_fsb) { + *shared = true; - while (offset_fsb < end_fsb) { - error = __xfs_reflink_allocate_cow(ip, &offset_fsb, end_fsb); - if (error) { - trace_xfs_reflink_allocate_cow_range_error(ip, error, - _RET_IP_); - break; + /* If we have a real allocation in the COW fork we're done. 
*/ + if (!isnullstartblock(got.br_startblock)) { + xfs_trim_extent(&got, offset_fsb, count_fsb); + *imap = got; + goto convert; } + + xfs_trim_extent(imap, got.br_startoff, got.br_blockcount); + } else { + error = xfs_reflink_trim_around_shared(ip, imap, shared, &trimmed); + if (error || !*shared) + goto out; + } + + if (!tp) { + resaligned = xfs_aligned_fsb_count(imap->br_startoff, + imap->br_blockcount, xfs_get_cowextsz_hint(ip)); + resblks = XFS_DIOSTRAT_SPACE_RES(mp, resaligned); + + xfs_iunlock(ip, *lockmode); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, &tp); + *lockmode = XFS_ILOCK_EXCL; + xfs_ilock(ip, *lockmode); + + if (error) + return error; + + error = xfs_qm_dqattach_locked(ip, 0); + if (error) + goto out; + goto retry; } + error = xfs_trans_reserve_quota_nblks(tp, ip, resblks, 0, + XFS_QMOPT_RES_REGBLKS); + if (error) + goto out; + + xfs_trans_ijoin(tp, ip, 0); + + xfs_defer_init(&dfops, &first_block); + nimaps = 1; + + /* Allocate the entire reservation as unwritten blocks. */ + error = xfs_bmapi_write(tp, ip, imap->br_startoff, imap->br_blockcount, + XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC, &first_block, + resblks, imap, &nimaps, &dfops); + if (error) + goto out_bmap_cancel; + + /* Finish up. */ + error = xfs_defer_finish(&tp, &dfops, NULL); + if (error) + goto out_bmap_cancel; + + error = xfs_trans_commit(tp); + if (error) + return error; +convert: + return xfs_reflink_convert_cow_extent(ip, imap, offset_fsb, count_fsb, + &dfops); +out_bmap_cancel: + xfs_defer_cancel(&dfops); + xfs_trans_unreserve_quota_nblks(tp, ip, (long)resblks, 0, + XFS_QMOPT_RES_REGBLKS); +out: + if (tp) + xfs_trans_cancel(tp); return error; } @@ -641,6 +730,16 @@ xfs_reflink_end_cow( ASSERT(!isnullstartblock(got.br_startblock)); + /* + * Don't remap unwritten extents; these are + * speculatively preallocated CoW extents that have been + * allocated but have not yet been involved in a write. 
+ */ + if (got.br_state == XFS_EXT_UNWRITTEN) { + idx--; + goto next_extent; + } + /* Unmap the old blocks in the data fork. */ xfs_defer_init(&dfops, &firstfsb); rlen = del.br_blockcount; @@ -855,13 +954,14 @@ STATIC int xfs_reflink_update_dest( struct xfs_inode *dest, xfs_off_t newlen, - xfs_extlen_t cowextsize) + xfs_extlen_t cowextsize, + bool is_dedupe) { struct xfs_mount *mp = dest->i_mount; struct xfs_trans *tp; int error; - if (newlen <= i_size_read(VFS_I(dest)) && cowextsize == 0) + if (is_dedupe && newlen <= i_size_read(VFS_I(dest)) && cowextsize == 0) return 0; error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp); @@ -882,6 +982,10 @@ xfs_reflink_update_dest( dest->i_d.di_flags2 |= XFS_DIFLAG2_COWEXTSIZE; } + if (!is_dedupe) { + xfs_trans_ichgtime(tp, dest, + XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); + } xfs_trans_log_inode(tp, dest, XFS_ILOG_CORE); error = xfs_trans_commit(tp); @@ -1195,7 +1299,8 @@ xfs_reflink_remap_range( !(dest->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE)) cowextsize = src->i_d.di_cowextsize; - ret = xfs_reflink_update_dest(dest, pos_out + len, cowextsize); + ret = xfs_reflink_update_dest(dest, pos_out + len, cowextsize, + is_dedupe); out_unlock: xfs_iunlock(src, XFS_MMAPLOCK_EXCL); diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h index aa6a4d64bd35..33ac9b8db683 100644 --- a/fs/xfs/xfs_reflink.h +++ b/fs/xfs/xfs_reflink.h @@ -28,8 +28,10 @@ extern int xfs_reflink_trim_around_shared(struct xfs_inode *ip, extern int xfs_reflink_reserve_cow(struct xfs_inode *ip, struct xfs_bmbt_irec *imap, bool *shared); -extern int xfs_reflink_allocate_cow_range(struct xfs_inode *ip, - xfs_off_t offset, xfs_off_t count); +extern int xfs_reflink_allocate_cow(struct xfs_inode *ip, + struct xfs_bmbt_irec *imap, bool *shared, uint *lockmode); +extern int xfs_reflink_convert_cow(struct xfs_inode *ip, xfs_off_t offset, + xfs_off_t count); extern bool xfs_reflink_find_cow_mapping(struct xfs_inode *ip, xfs_off_t offset, struct xfs_bmbt_irec 
*imap); extern void xfs_reflink_trim_irec_to_next_cow(struct xfs_inode *ip, diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index 802bcc326d9f..c57aa7f18087 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -1093,7 +1093,6 @@ xfs_rtallocate_extent( xfs_extlen_t minlen, /* minimum length to allocate */ xfs_extlen_t maxlen, /* maximum length to allocate */ xfs_extlen_t *len, /* out: actual length allocated */ - xfs_alloctype_t type, /* allocation type XFS_ALLOCTYPE... */ int wasdel, /* was a delayed allocation extent */ xfs_extlen_t prod, /* extent product factor */ xfs_rtblock_t *rtblock) /* out: start block allocated */ @@ -1123,27 +1122,16 @@ xfs_rtallocate_extent( } } +retry: sumbp = NULL; - /* - * Allocate by size, or near another block, or exactly at some block. - */ - switch (type) { - case XFS_ALLOCTYPE_ANY_AG: + if (bno == 0) { error = xfs_rtallocate_extent_size(mp, tp, minlen, maxlen, len, &sumbp, &sb, prod, &r); - break; - case XFS_ALLOCTYPE_NEAR_BNO: + } else { error = xfs_rtallocate_extent_near(mp, tp, bno, minlen, maxlen, len, &sumbp, &sb, prod, &r); - break; - case XFS_ALLOCTYPE_THIS_BNO: - error = xfs_rtallocate_extent_exact(mp, tp, bno, minlen, maxlen, - len, &sumbp, &sb, prod, &r); - break; - default: - error = -EIO; - ASSERT(0); } + if (error) return error; @@ -1158,7 +1146,11 @@ xfs_rtallocate_extent( xfs_trans_mod_sb(tp, XFS_TRANS_SB_RES_FREXTENTS, -slen); else xfs_trans_mod_sb(tp, XFS_TRANS_SB_FREXTENTS, -slen); + } else if (prod > 1) { + prod = 1; + goto retry; } + *rtblock = r; return 0; } diff --git a/fs/xfs/xfs_rtalloc.h b/fs/xfs/xfs_rtalloc.h index 355dd9e1cb64..51dd3c726608 100644 --- a/fs/xfs/xfs_rtalloc.h +++ b/fs/xfs/xfs_rtalloc.h @@ -40,7 +40,6 @@ xfs_rtallocate_extent( xfs_extlen_t minlen, /* minimum length to allocate */ xfs_extlen_t maxlen, /* maximum length to allocate */ xfs_extlen_t *len, /* out: actual length allocated */ - xfs_alloctype_t type, /* allocation type XFS_ALLOCTYPE... 
*/ int wasdel, /* was a delayed allocation extent */ xfs_extlen_t prod, /* extent product factor */ xfs_rtblock_t *rtblock); /* out: start block allocated */ @@ -122,7 +121,7 @@ int xfs_rtfree_range(struct xfs_mount *mp, struct xfs_trans *tp, #else -# define xfs_rtallocate_extent(t,b,min,max,l,a,f,p,rb) (ENOSYS) +# define xfs_rtallocate_extent(t,b,min,max,l,f,p,rb) (ENOSYS) # define xfs_rtfree_extent(t,b,l) (ENOSYS) # define xfs_rtpick_extent(m,t,l,rb) (ENOSYS) # define xfs_growfs_rt(mp,in) (ENOSYS) diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index eecbaac08eba..890862f2447c 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -1956,12 +1956,20 @@ xfs_init_workqueues(void) if (!xfs_alloc_wq) return -ENOMEM; + xfs_discard_wq = alloc_workqueue("xfsdiscard", WQ_UNBOUND, 0); + if (!xfs_discard_wq) + goto out_free_alloc_wq; + return 0; +out_free_alloc_wq: + destroy_workqueue(xfs_alloc_wq); + return -ENOMEM; } STATIC void xfs_destroy_workqueues(void) { + destroy_workqueue(xfs_discard_wq); destroy_workqueue(xfs_alloc_wq); } diff --git a/fs/xfs/xfs_super.h b/fs/xfs/xfs_super.h index b6418abd85ad..5f2f32408011 100644 --- a/fs/xfs/xfs_super.h +++ b/fs/xfs/xfs_super.h @@ -73,6 +73,8 @@ extern const struct quotactl_ops xfs_quotactl_operations; extern void xfs_reinit_percpu_counters(struct xfs_mount *mp); +extern struct workqueue_struct *xfs_discard_wq; + #define XFS_M(sb) ((struct xfs_mount *)((sb)->s_fs_info)) #endif /* __XFS_SUPER_H__ */ diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c index 276d3023d60f..80ac15fb9638 100644 --- a/fs/xfs/xfs_sysfs.c +++ b/fs/xfs/xfs_sysfs.c @@ -93,7 +93,7 @@ to_mp(struct kobject *kobject) #ifdef DEBUG STATIC ssize_t -fail_writes_store( +drop_writes_store( struct kobject *kobject, const char *buf, size_t count) @@ -107,9 +107,9 @@ fail_writes_store( return ret; if (val == 1) - mp->m_fail_writes = true; + mp->m_drop_writes = true; else if (val == 0) - mp->m_fail_writes = false; + mp->m_drop_writes = false; else return 
-EINVAL; @@ -117,21 +117,21 @@ fail_writes_store( } STATIC ssize_t -fail_writes_show( +drop_writes_show( struct kobject *kobject, char *buf) { struct xfs_mount *mp = to_mp(kobject); - return snprintf(buf, PAGE_SIZE, "%d\n", mp->m_fail_writes ? 1 : 0); + return snprintf(buf, PAGE_SIZE, "%d\n", mp->m_drop_writes ? 1 : 0); } -XFS_SYSFS_ATTR_RW(fail_writes); +XFS_SYSFS_ATTR_RW(drop_writes); #endif /* DEBUG */ static struct attribute *xfs_mp_attrs[] = { #ifdef DEBUG - ATTR_LIST(fail_writes), + ATTR_LIST(drop_writes), #endif NULL, }; @@ -396,7 +396,7 @@ max_retries_show( int retries; struct xfs_error_cfg *cfg = to_error_cfg(kobject); - if (cfg->retry_timeout == XFS_ERR_RETRY_FOREVER) + if (cfg->max_retries == XFS_ERR_RETRY_FOREVER) retries = -1; else retries = cfg->max_retries; @@ -422,7 +422,7 @@ max_retries_store( return -EINVAL; if (val == -1) - cfg->retry_timeout = XFS_ERR_RETRY_FOREVER; + cfg->max_retries = XFS_ERR_RETRY_FOREVER; else cfg->max_retries = val; return count; diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 69c5bcd9a51b..fb7555e73a62 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -2245,7 +2245,6 @@ DEFINE_BTREE_CUR_EVENT(xfs_btree_overlapped_query_range); /* deferred ops */ struct xfs_defer_pending; -struct xfs_defer_intake; struct xfs_defer_ops; DECLARE_EVENT_CLASS(xfs_defer_class, @@ -3089,6 +3088,7 @@ DECLARE_EVENT_CLASS(xfs_inode_irec_class, __field(xfs_fileoff_t, lblk) __field(xfs_extlen_t, len) __field(xfs_fsblock_t, pblk) + __field(int, state) ), TP_fast_assign( __entry->dev = VFS_I(ip)->i_sb->s_dev; @@ -3096,13 +3096,15 @@ DECLARE_EVENT_CLASS(xfs_inode_irec_class, __entry->lblk = irec->br_startoff; __entry->len = irec->br_blockcount; __entry->pblk = irec->br_startblock; + __entry->state = irec->br_state; ), - TP_printk("dev %d:%d ino 0x%llx lblk 0x%llx len 0x%x pblk %llu", + TP_printk("dev %d:%d ino 0x%llx lblk 0x%llx len 0x%x pblk %llu st %d", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->lblk, 
__entry->len, - __entry->pblk) + __entry->pblk, + __entry->state) ); #define DEFINE_INODE_IREC_EVENT(name) \ DEFINE_EVENT(xfs_inode_irec_class, name, \ @@ -3242,11 +3244,11 @@ DEFINE_INODE_IREC_EVENT(xfs_reflink_trim_around_shared); DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_alloc); DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_found); DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_enospc); +DEFINE_INODE_IREC_EVENT(xfs_reflink_convert_cow); DEFINE_RW_EVENT(xfs_reflink_reserve_cow); -DEFINE_RW_EVENT(xfs_reflink_allocate_cow_range); -DEFINE_INODE_IREC_EVENT(xfs_reflink_bounce_dio_write); +DEFINE_SIMPLE_IO_EVENT(xfs_reflink_bounce_dio_write); DEFINE_IOMAP_EVENT(xfs_reflink_find_cow_mapping); DEFINE_INODE_IREC_EVENT(xfs_reflink_trim_irec); @@ -3254,7 +3256,6 @@ DEFINE_SIMPLE_IO_EVENT(xfs_reflink_cancel_cow_range); DEFINE_SIMPLE_IO_EVENT(xfs_reflink_end_cow); DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_remap); -DEFINE_INODE_ERROR_EVENT(xfs_reflink_allocate_cow_range_error); DEFINE_INODE_ERROR_EVENT(xfs_reflink_cancel_cow_range_error); DEFINE_INODE_ERROR_EVENT(xfs_reflink_end_cow_error); diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index 61b7fbdd3ebd..1646f659b60f 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -32,7 +32,6 @@ struct xfs_mount; struct xfs_trans; struct xfs_trans_res; struct xfs_dquot_acct; -struct xfs_busy_extent; struct xfs_rud_log_item; struct xfs_rui_log_item; struct xfs_btree_cur; |