diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2013-05-06 13:11:19 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2013-05-06 13:11:19 -0700 |
commit | 91f8575685e35f3bd021286bc82d26397458f5a9 (patch) | |
tree | 09de8d889758a12071adb9427ed741e27c907aa6 /fs/ceph/addr.c | |
parent | 2e378f3eebd28feefbb1f9953834a5a19482f053 (diff) | |
parent | b5b09be30cf99f9c699e825629f02e3bce555d44 (diff) | |
download | linux-91f8575685e35f3bd021286bc82d26397458f5a9.tar.bz2 |
Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client
Pull Ceph changes from Alex Elder:
"This is a big pull.
Most of it is culmination of Alex's work to implement RBD image
layering, which is now complete (yay!).
There is also some work from Yan to fix i_mutex behavior surrounding
writes in cephfs, a sync write fix, a fix for RBD images that get
resized while they are mapped, and a few patches from me that resolve
annoying auth warnings and fix several bugs in the ceph auth code."
* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client: (254 commits)
rbd: fix image request leak on parent read
libceph: use slab cache for osd client requests
libceph: allocate ceph message data with a slab allocator
libceph: allocate ceph messages with a slab allocator
rbd: allocate image object names with a slab allocator
rbd: allocate object requests with a slab allocator
rbd: allocate name separate from obj_request
rbd: allocate image requests with a slab allocator
rbd: use binary search for snapshot lookup
rbd: clear EXISTS flag if mapped snapshot disappears
rbd: kill off the snapshot list
rbd: define rbd_snap_size() and rbd_snap_features()
rbd: use snap_id not index to look up snap info
rbd: look up snapshot name in names buffer
rbd: drop obj_request->version
rbd: drop rbd_obj_method_sync() version parameter
rbd: more version parameter removal
rbd: get rid of some version parameters
rbd: stop tracking header object version
rbd: snap names are pointer to constant data
...
Diffstat (limited to 'fs/ceph/addr.c')
-rw-r--r-- | fs/ceph/addr.c | 222 |
1 files changed, 108 insertions, 114 deletions
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index a60ea977af6f..3e68ac101040 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -236,15 +236,21 @@ static int ceph_readpage(struct file *filp, struct page *page) static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg) { struct inode *inode = req->r_inode; + struct ceph_osd_data *osd_data; int rc = req->r_result; int bytes = le32_to_cpu(msg->hdr.data_len); + int num_pages; int i; dout("finish_read %p req %p rc %d bytes %d\n", inode, req, rc, bytes); /* unlock all pages, zeroing any data we didn't read */ - for (i = 0; i < req->r_num_pages; i++, bytes -= PAGE_CACHE_SIZE) { - struct page *page = req->r_pages[i]; + osd_data = osd_req_op_extent_osd_data(req, 0); + BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_PAGES); + num_pages = calc_pages_for((u64)osd_data->alignment, + (u64)osd_data->length); + for (i = 0; i < num_pages; i++) { + struct page *page = osd_data->pages[i]; if (bytes < (int)PAGE_CACHE_SIZE) { /* zero (remainder of) page */ @@ -257,8 +263,9 @@ static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg) SetPageUptodate(page); unlock_page(page); page_cache_release(page); + bytes -= PAGE_CACHE_SIZE; } - kfree(req->r_pages); + kfree(osd_data->pages); } static void ceph_unlock_page_vector(struct page **pages, int num_pages) @@ -279,6 +286,7 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max) &ceph_inode_to_client(inode)->client->osdc; struct ceph_inode_info *ci = ceph_inode(inode); struct page *page = list_entry(page_list->prev, struct page, lru); + struct ceph_vino vino; struct ceph_osd_request *req; u64 off; u64 len; @@ -303,18 +311,17 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max) len = nr_pages << PAGE_CACHE_SHIFT; dout("start_read %p nr_pages %d is %lld~%lld\n", inode, nr_pages, off, len); - - req = ceph_osdc_new_request(osdc, &ci->i_layout, ceph_vino(inode), - off, &len, - CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ, - NULL, 0, + vino = ceph_vino(inode); + req = ceph_osdc_new_request(osdc, &ci->i_layout, vino, off, &len, + 1, CEPH_OSD_OP_READ, + CEPH_OSD_FLAG_READ, NULL, ci->i_truncate_seq, ci->i_truncate_size, - NULL, false, 0); + false); if (IS_ERR(req)) return PTR_ERR(req); /* build page vector */ - nr_pages = len >> PAGE_CACHE_SHIFT; + nr_pages = calc_pages_for(0, len); pages = kmalloc(sizeof(*pages) * nr_pages, GFP_NOFS); ret = -ENOMEM; if (!pages) @@ -336,11 +343,12 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max) } pages[i] = page; } - req->r_pages = pages; - req->r_num_pages = nr_pages; + osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0, false, false); req->r_callback = finish_read; req->r_inode = inode; + ceph_osdc_build_request(req, off, NULL, vino.snap, NULL); + dout("start_read %p starting %p %lld~%lld\n", inode, req, off, len); ret = ceph_osdc_start_request(osdc, req, false); if (ret < 0) @@ -373,7 +381,8 @@ static int ceph_readpages(struct file *file, struct address_space *mapping, max = (fsc->mount_options->rsize + PAGE_CACHE_SIZE - 1) >> PAGE_SHIFT; - dout("readpages %p file %p nr_pages %d max %d\n", inode, file, nr_pages, + dout("readpages %p file %p nr_pages %d max %d\n", inode, + file, nr_pages, max); while (!list_empty(page_list)) { rc = start_read(inode, page_list, max); @@ -548,17 +557,23 @@ static void writepages_finish(struct ceph_osd_request *req, { struct inode *inode = req->r_inode; struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_osd_data *osd_data; unsigned wrote; struct page *page; + int num_pages; int i; struct ceph_snap_context *snapc = req->r_snapc; struct address_space *mapping = inode->i_mapping; int rc = req->r_result; - u64 bytes = le64_to_cpu(req->r_request_ops[0].extent.length); + u64 bytes = req->r_ops[0].extent.length; struct ceph_fs_client *fsc = ceph_inode_to_client(inode); long writeback_stat; unsigned issued = ceph_caps_issued(ci); + osd_data = osd_req_op_extent_osd_data(req, 0); + BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_PAGES); + num_pages = calc_pages_for((u64)osd_data->alignment, + (u64)osd_data->length); if (rc >= 0) { /* * Assume we wrote the pages we originally sent. The @@ -566,7 +581,7 @@ static void writepages_finish(struct ceph_osd_request *req, * raced with a truncation and was adjusted at the osd, * so don't believe the reply. */ - wrote = req->r_num_pages; + wrote = num_pages; } else { wrote = 0; mapping_set_error(mapping, rc); @@ -575,8 +590,8 @@ static void writepages_finish(struct ceph_osd_request *req, inode, rc, bytes, wrote); /* clean all pages */ - for (i = 0; i < req->r_num_pages; i++) { - page = req->r_pages[i]; + for (i = 0; i < num_pages; i++) { + page = osd_data->pages[i]; BUG_ON(!page); WARN_ON(!PageUptodate(page)); @@ -605,32 +620,34 @@ static void writepages_finish(struct ceph_osd_request *req, unlock_page(page); } dout("%p wrote+cleaned %d pages\n", inode, wrote); - ceph_put_wrbuffer_cap_refs(ci, req->r_num_pages, snapc); + ceph_put_wrbuffer_cap_refs(ci, num_pages, snapc); - ceph_release_pages(req->r_pages, req->r_num_pages); - if (req->r_pages_from_pool) - mempool_free(req->r_pages, + ceph_release_pages(osd_data->pages, num_pages); + if (osd_data->pages_from_pool) + mempool_free(osd_data->pages, ceph_sb_to_client(inode->i_sb)->wb_pagevec_pool); else - kfree(req->r_pages); + kfree(osd_data->pages); ceph_osdc_put_request(req); } -/* - * allocate a page vec, either directly, or if necessary, via a the - * mempool. we avoid the mempool if we can because req->r_num_pages - * may be less than the maximum write size. - */ -static void alloc_page_vec(struct ceph_fs_client *fsc, - struct ceph_osd_request *req) +static struct ceph_osd_request * +ceph_writepages_osd_request(struct inode *inode, u64 offset, u64 *len, + struct ceph_snap_context *snapc, int num_ops) { - req->r_pages = kmalloc(sizeof(struct page *) * req->r_num_pages, - GFP_NOFS); - if (!req->r_pages) { - req->r_pages = mempool_alloc(fsc->wb_pagevec_pool, GFP_NOFS); - req->r_pages_from_pool = 1; - WARN_ON(!req->r_pages); - } + struct ceph_fs_client *fsc; + struct ceph_inode_info *ci; + struct ceph_vino vino; + + fsc = ceph_inode_to_client(inode); + ci = ceph_inode(inode); + vino = ceph_vino(inode); + /* BUG_ON(vino.snap != CEPH_NOSNAP); */ + + return ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, + vino, offset, len, num_ops, CEPH_OSD_OP_WRITE, + CEPH_OSD_FLAG_WRITE|CEPH_OSD_FLAG_ONDISK, + snapc, ci->i_truncate_seq, ci->i_truncate_size, true); } /* @@ -653,7 +670,7 @@ static int ceph_writepages_start(struct address_space *mapping, unsigned wsize = 1 << inode->i_blkbits; struct ceph_osd_request *req = NULL; int do_sync; - u64 snap_size = 0; + u64 snap_size; /* * Include a 'sync' in the OSD request if this is a data @@ -699,6 +716,7 @@ static int ceph_writepages_start(struct address_space *mapping, retry: /* find oldest snap context with dirty data */ ceph_put_snap_context(snapc); + snap_size = 0; snapc = get_oldest_context(inode, &snap_size); if (!snapc) { /* hmm, why does writepages get called when there @@ -706,6 +724,8 @@ retry: dout(" no snap context with dirty data?\n"); goto out; } + if (snap_size == 0) + snap_size = i_size_read(inode); dout(" oldest snapc is %p seq %lld (%d snaps)\n", snapc, snapc->seq, snapc->num_snaps); if (last_snapc && snapc != last_snapc) { @@ -718,10 +738,14 @@ retry: last_snapc = snapc; while (!done && index <= end) { + int num_ops = do_sync ? 2 : 1; + struct ceph_vino vino; unsigned i; int first; pgoff_t next; int pvec_pages, locked_pages; + struct page **pages = NULL; + mempool_t *pool = NULL; /* Becomes non-null if mempool used */ struct page *page; int want; u64 offset, len; @@ -773,11 +797,8 @@ get_more_pages: dout("waiting on writeback %p\n", page); wait_on_page_writeback(page); } - if ((snap_size && page_offset(page) > snap_size) || - (!snap_size && - page_offset(page) > i_size_read(inode))) { - dout("%p page eof %llu\n", page, snap_size ? - snap_size : i_size_read(inode)); + if (page_offset(page) >= snap_size) { + dout("%p page eof %llu\n", page, snap_size); done = 1; unlock_page(page); break; @@ -805,22 +826,23 @@ get_more_pages: break; } - /* ok */ + /* + * We have something to write. If this is + * the first locked page this time through, + * allocate an osd request and a page array + * that it will use. + */ if (locked_pages == 0) { + size_t size; + + BUG_ON(pages); + /* prepare async write request */ - offset = (u64) page_offset(page); + offset = (u64)page_offset(page); len = wsize; - req = ceph_osdc_new_request(&fsc->client->osdc, - &ci->i_layout, - ceph_vino(inode), - offset, &len, - CEPH_OSD_OP_WRITE, - CEPH_OSD_FLAG_WRITE | - CEPH_OSD_FLAG_ONDISK, - snapc, do_sync, - ci->i_truncate_seq, - ci->i_truncate_size, - &inode->i_mtime, true, 0); + req = ceph_writepages_osd_request(inode, + offset, &len, snapc, + num_ops); if (IS_ERR(req)) { rc = PTR_ERR(req); @@ -828,11 +850,17 @@ get_more_pages: break; } - max_pages = req->r_num_pages; - - alloc_page_vec(fsc, req); req->r_callback = writepages_finish; req->r_inode = inode; + + max_pages = calc_pages_for(0, (u64)len); + size = max_pages * sizeof (*pages); + pages = kmalloc(size, GFP_NOFS); + if (!pages) { + pool = fsc->wb_pagevec_pool; + pages = mempool_alloc(pool, GFP_NOFS); + BUG_ON(!pages); + } } /* note position of first page in pvec */ @@ -850,7 +878,7 @@ get_more_pages: } set_page_writeback(page); - req->r_pages[locked_pages] = page; + pages[locked_pages] = page; locked_pages++; next = page->index + 1; } @@ -879,18 +907,27 @@ get_more_pages: pvec.nr -= i-first; } - /* submit the write */ - offset = req->r_pages[0]->index << PAGE_CACHE_SHIFT; - len = min((snap_size ? snap_size : i_size_read(inode)) - offset, + /* Format the osd request message and submit the write */ + + offset = page_offset(pages[0]); + len = min(snap_size - offset, (u64)locked_pages << PAGE_CACHE_SHIFT); dout("writepages got %d pages at %llu~%llu\n", locked_pages, offset, len); - /* revise final length, page count */ - req->r_num_pages = locked_pages; - req->r_request_ops[0].extent.length = cpu_to_le64(len); - req->r_request_ops[0].payload_len = cpu_to_le32(len); - req->r_request->hdr.data_len = cpu_to_le32(len); + osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0, + !!pool, false); + + pages = NULL; /* request message now owns the pages array */ + pool = NULL; + + /* Update the write op length in case we changed it */ + + osd_req_op_extent_update(req, 0, len); + + vino = ceph_vino(inode); + ceph_osdc_build_request(req, offset, snapc, vino.snap, + &inode->i_mtime); rc = ceph_osdc_start_request(&fsc->client->osdc, req, true); BUG_ON(rc); @@ -1067,51 +1104,23 @@ static int ceph_write_begin(struct file *file, struct address_space *mapping, struct page **pagep, void **fsdata) { struct inode *inode = file_inode(file); - struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_file_info *fi = file->private_data; struct page *page; pgoff_t index = pos >> PAGE_CACHE_SHIFT; - int r, want, got = 0; - - if (fi->fmode & CEPH_FILE_MODE_LAZY) - want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO; - else - want = CEPH_CAP_FILE_BUFFER; - - dout("write_begin %p %llx.%llx %llu~%u getting caps. i_size %llu\n", - inode, ceph_vinop(inode), pos, len, inode->i_size); - r = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, &got, pos+len); - if (r < 0) - return r; - dout("write_begin %p %llx.%llx %llu~%u got cap refs on %s\n", - inode, ceph_vinop(inode), pos, len, ceph_cap_string(got)); - if (!(got & (CEPH_CAP_FILE_BUFFER|CEPH_CAP_FILE_LAZYIO))) { - ceph_put_cap_refs(ci, got); - return -EAGAIN; - } + int r; do { /* get a page */ page = grab_cache_page_write_begin(mapping, index, 0); - if (!page) { - r = -ENOMEM; - break; - } + if (!page) + return -ENOMEM; + *pagep = page; dout("write_begin file %p inode %p page %p %d~%d\n", file, inode, page, (int)pos, (int)len); r = ceph_update_writeable_page(file, pos, len, page); - if (r) - page_cache_release(page); } while (r == -EAGAIN); - if (r) { - ceph_put_cap_refs(ci, got); - } else { - *pagep = page; - *(int *)fsdata = got; - } return r; } @@ -1125,12 +1134,10 @@ static int ceph_write_end(struct file *file, struct address_space *mapping, struct page *page, void *fsdata) { struct inode *inode = file_inode(file); - struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_fs_client *fsc = ceph_inode_to_client(inode); struct ceph_mds_client *mdsc = fsc->mdsc; unsigned from = pos & (PAGE_CACHE_SIZE - 1); int check_cap = 0; - int got = (unsigned long)fsdata; dout("write_end file %p inode %p page %p %d~%d (%d)\n", file, inode, page, (int)pos, (int)copied, (int)len); @@ -1153,19 +1160,6 @@ static int ceph_write_end(struct file *file, struct address_space *mapping, up_read(&mdsc->snap_rwsem); page_cache_release(page); - if (copied > 0) { - int dirty; - spin_lock(&ci->i_ceph_lock); - dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR); - spin_unlock(&ci->i_ceph_lock); - if (dirty) - __mark_inode_dirty(inode, dirty); - } - - dout("write_end %p %llx.%llx %llu~%u dropping cap refs on %s\n", - inode, ceph_vinop(inode), pos, len, ceph_cap_string(got)); - ceph_put_cap_refs(ci, got); - if (check_cap) ceph_check_caps(ceph_inode(inode), CHECK_CAPS_AUTHONLY, NULL); |