summaryrefslogtreecommitdiffstats
path: root/fs/ceph/addr.c
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2013-05-06 13:11:19 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2013-05-06 13:11:19 -0700
commit91f8575685e35f3bd021286bc82d26397458f5a9 (patch)
tree09de8d889758a12071adb9427ed741e27c907aa6 /fs/ceph/addr.c
parent2e378f3eebd28feefbb1f9953834a5a19482f053 (diff)
parentb5b09be30cf99f9c699e825629f02e3bce555d44 (diff)
downloadlinux-91f8575685e35f3bd021286bc82d26397458f5a9.tar.bz2
Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client
Pull Ceph changes from Alex Elder: "This is a big pull. Most of it is culmination of Alex's work to implement RBD image layering, which is now complete (yay!). There is also some work from Yan to fix i_mutex behavior surrounding writes in cephfs, a sync write fix, a fix for RBD images that get resized while they are mapped, and a few patches from me that resolve annoying auth warnings and fix several bugs in the ceph auth code." * 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client: (254 commits) rbd: fix image request leak on parent read libceph: use slab cache for osd client requests libceph: allocate ceph message data with a slab allocator libceph: allocate ceph messages with a slab allocator rbd: allocate image object names with a slab allocator rbd: allocate object requests with a slab allocator rbd: allocate name separate from obj_request rbd: allocate image requests with a slab allocator rbd: use binary search for snapshot lookup rbd: clear EXISTS flag if mapped snapshot disappears rbd: kill off the snapshot list rbd: define rbd_snap_size() and rbd_snap_features() rbd: use snap_id not index to look up snap info rbd: look up snapshot name in names buffer rbd: drop obj_request->version rbd: drop rbd_obj_method_sync() version parameter rbd: more version parameter removal rbd: get rid of some version parameters rbd: stop tracking header object version rbd: snap names are pointer to constant data ...
Diffstat (limited to 'fs/ceph/addr.c')
-rw-r--r--fs/ceph/addr.c222
1 files changed, 108 insertions, 114 deletions
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index a60ea977af6f..3e68ac101040 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -236,15 +236,21 @@ static int ceph_readpage(struct file *filp, struct page *page)
static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg)
{
struct inode *inode = req->r_inode;
+ struct ceph_osd_data *osd_data;
int rc = req->r_result;
int bytes = le32_to_cpu(msg->hdr.data_len);
+ int num_pages;
int i;
dout("finish_read %p req %p rc %d bytes %d\n", inode, req, rc, bytes);
/* unlock all pages, zeroing any data we didn't read */
- for (i = 0; i < req->r_num_pages; i++, bytes -= PAGE_CACHE_SIZE) {
- struct page *page = req->r_pages[i];
+ osd_data = osd_req_op_extent_osd_data(req, 0);
+ BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_PAGES);
+ num_pages = calc_pages_for((u64)osd_data->alignment,
+ (u64)osd_data->length);
+ for (i = 0; i < num_pages; i++) {
+ struct page *page = osd_data->pages[i];
if (bytes < (int)PAGE_CACHE_SIZE) {
/* zero (remainder of) page */
@@ -257,8 +263,9 @@ static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg)
SetPageUptodate(page);
unlock_page(page);
page_cache_release(page);
+ bytes -= PAGE_CACHE_SIZE;
}
- kfree(req->r_pages);
+ kfree(osd_data->pages);
}
static void ceph_unlock_page_vector(struct page **pages, int num_pages)
@@ -279,6 +286,7 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max)
&ceph_inode_to_client(inode)->client->osdc;
struct ceph_inode_info *ci = ceph_inode(inode);
struct page *page = list_entry(page_list->prev, struct page, lru);
+ struct ceph_vino vino;
struct ceph_osd_request *req;
u64 off;
u64 len;
@@ -303,18 +311,17 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max)
len = nr_pages << PAGE_CACHE_SHIFT;
dout("start_read %p nr_pages %d is %lld~%lld\n", inode, nr_pages,
off, len);
-
- req = ceph_osdc_new_request(osdc, &ci->i_layout, ceph_vino(inode),
- off, &len,
- CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ,
- NULL, 0,
+ vino = ceph_vino(inode);
+ req = ceph_osdc_new_request(osdc, &ci->i_layout, vino, off, &len,
+ 1, CEPH_OSD_OP_READ,
+ CEPH_OSD_FLAG_READ, NULL,
ci->i_truncate_seq, ci->i_truncate_size,
- NULL, false, 0);
+ false);
if (IS_ERR(req))
return PTR_ERR(req);
/* build page vector */
- nr_pages = len >> PAGE_CACHE_SHIFT;
+ nr_pages = calc_pages_for(0, len);
pages = kmalloc(sizeof(*pages) * nr_pages, GFP_NOFS);
ret = -ENOMEM;
if (!pages)
@@ -336,11 +343,12 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max)
}
pages[i] = page;
}
- req->r_pages = pages;
- req->r_num_pages = nr_pages;
+ osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0, false, false);
req->r_callback = finish_read;
req->r_inode = inode;
+ ceph_osdc_build_request(req, off, NULL, vino.snap, NULL);
+
dout("start_read %p starting %p %lld~%lld\n", inode, req, off, len);
ret = ceph_osdc_start_request(osdc, req, false);
if (ret < 0)
@@ -373,7 +381,8 @@ static int ceph_readpages(struct file *file, struct address_space *mapping,
max = (fsc->mount_options->rsize + PAGE_CACHE_SIZE - 1)
>> PAGE_SHIFT;
- dout("readpages %p file %p nr_pages %d max %d\n", inode, file, nr_pages,
+ dout("readpages %p file %p nr_pages %d max %d\n", inode,
+ file, nr_pages,
max);
while (!list_empty(page_list)) {
rc = start_read(inode, page_list, max);
@@ -548,17 +557,23 @@ static void writepages_finish(struct ceph_osd_request *req,
{
struct inode *inode = req->r_inode;
struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_osd_data *osd_data;
unsigned wrote;
struct page *page;
+ int num_pages;
int i;
struct ceph_snap_context *snapc = req->r_snapc;
struct address_space *mapping = inode->i_mapping;
int rc = req->r_result;
- u64 bytes = le64_to_cpu(req->r_request_ops[0].extent.length);
+ u64 bytes = req->r_ops[0].extent.length;
struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
long writeback_stat;
unsigned issued = ceph_caps_issued(ci);
+ osd_data = osd_req_op_extent_osd_data(req, 0);
+ BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_PAGES);
+ num_pages = calc_pages_for((u64)osd_data->alignment,
+ (u64)osd_data->length);
if (rc >= 0) {
/*
* Assume we wrote the pages we originally sent. The
@@ -566,7 +581,7 @@ static void writepages_finish(struct ceph_osd_request *req,
* raced with a truncation and was adjusted at the osd,
* so don't believe the reply.
*/
- wrote = req->r_num_pages;
+ wrote = num_pages;
} else {
wrote = 0;
mapping_set_error(mapping, rc);
@@ -575,8 +590,8 @@ static void writepages_finish(struct ceph_osd_request *req,
inode, rc, bytes, wrote);
/* clean all pages */
- for (i = 0; i < req->r_num_pages; i++) {
- page = req->r_pages[i];
+ for (i = 0; i < num_pages; i++) {
+ page = osd_data->pages[i];
BUG_ON(!page);
WARN_ON(!PageUptodate(page));
@@ -605,32 +620,34 @@ static void writepages_finish(struct ceph_osd_request *req,
unlock_page(page);
}
dout("%p wrote+cleaned %d pages\n", inode, wrote);
- ceph_put_wrbuffer_cap_refs(ci, req->r_num_pages, snapc);
+ ceph_put_wrbuffer_cap_refs(ci, num_pages, snapc);
- ceph_release_pages(req->r_pages, req->r_num_pages);
- if (req->r_pages_from_pool)
- mempool_free(req->r_pages,
+ ceph_release_pages(osd_data->pages, num_pages);
+ if (osd_data->pages_from_pool)
+ mempool_free(osd_data->pages,
ceph_sb_to_client(inode->i_sb)->wb_pagevec_pool);
else
- kfree(req->r_pages);
+ kfree(osd_data->pages);
ceph_osdc_put_request(req);
}
-/*
- * allocate a page vec, either directly, or if necessary, via a the
- * mempool. we avoid the mempool if we can because req->r_num_pages
- * may be less than the maximum write size.
- */
-static void alloc_page_vec(struct ceph_fs_client *fsc,
- struct ceph_osd_request *req)
+static struct ceph_osd_request *
+ceph_writepages_osd_request(struct inode *inode, u64 offset, u64 *len,
+ struct ceph_snap_context *snapc, int num_ops)
{
- req->r_pages = kmalloc(sizeof(struct page *) * req->r_num_pages,
- GFP_NOFS);
- if (!req->r_pages) {
- req->r_pages = mempool_alloc(fsc->wb_pagevec_pool, GFP_NOFS);
- req->r_pages_from_pool = 1;
- WARN_ON(!req->r_pages);
- }
+ struct ceph_fs_client *fsc;
+ struct ceph_inode_info *ci;
+ struct ceph_vino vino;
+
+ fsc = ceph_inode_to_client(inode);
+ ci = ceph_inode(inode);
+ vino = ceph_vino(inode);
+ /* BUG_ON(vino.snap != CEPH_NOSNAP); */
+
+ return ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
+ vino, offset, len, num_ops, CEPH_OSD_OP_WRITE,
+ CEPH_OSD_FLAG_WRITE|CEPH_OSD_FLAG_ONDISK,
+ snapc, ci->i_truncate_seq, ci->i_truncate_size, true);
}
/*
@@ -653,7 +670,7 @@ static int ceph_writepages_start(struct address_space *mapping,
unsigned wsize = 1 << inode->i_blkbits;
struct ceph_osd_request *req = NULL;
int do_sync;
- u64 snap_size = 0;
+ u64 snap_size;
/*
* Include a 'sync' in the OSD request if this is a data
@@ -699,6 +716,7 @@ static int ceph_writepages_start(struct address_space *mapping,
retry:
/* find oldest snap context with dirty data */
ceph_put_snap_context(snapc);
+ snap_size = 0;
snapc = get_oldest_context(inode, &snap_size);
if (!snapc) {
/* hmm, why does writepages get called when there
@@ -706,6 +724,8 @@ retry:
dout(" no snap context with dirty data?\n");
goto out;
}
+ if (snap_size == 0)
+ snap_size = i_size_read(inode);
dout(" oldest snapc is %p seq %lld (%d snaps)\n",
snapc, snapc->seq, snapc->num_snaps);
if (last_snapc && snapc != last_snapc) {
@@ -718,10 +738,14 @@ retry:
last_snapc = snapc;
while (!done && index <= end) {
+ int num_ops = do_sync ? 2 : 1;
+ struct ceph_vino vino;
unsigned i;
int first;
pgoff_t next;
int pvec_pages, locked_pages;
+ struct page **pages = NULL;
+ mempool_t *pool = NULL; /* Becomes non-null if mempool used */
struct page *page;
int want;
u64 offset, len;
@@ -773,11 +797,8 @@ get_more_pages:
dout("waiting on writeback %p\n", page);
wait_on_page_writeback(page);
}
- if ((snap_size && page_offset(page) > snap_size) ||
- (!snap_size &&
- page_offset(page) > i_size_read(inode))) {
- dout("%p page eof %llu\n", page, snap_size ?
- snap_size : i_size_read(inode));
+ if (page_offset(page) >= snap_size) {
+ dout("%p page eof %llu\n", page, snap_size);
done = 1;
unlock_page(page);
break;
@@ -805,22 +826,23 @@ get_more_pages:
break;
}
- /* ok */
+ /*
+ * We have something to write. If this is
+ * the first locked page this time through,
+ * allocate an osd request and a page array
+ * that it will use.
+ */
if (locked_pages == 0) {
+ size_t size;
+
+ BUG_ON(pages);
+
/* prepare async write request */
- offset = (u64) page_offset(page);
+ offset = (u64)page_offset(page);
len = wsize;
- req = ceph_osdc_new_request(&fsc->client->osdc,
- &ci->i_layout,
- ceph_vino(inode),
- offset, &len,
- CEPH_OSD_OP_WRITE,
- CEPH_OSD_FLAG_WRITE |
- CEPH_OSD_FLAG_ONDISK,
- snapc, do_sync,
- ci->i_truncate_seq,
- ci->i_truncate_size,
- &inode->i_mtime, true, 0);
+ req = ceph_writepages_osd_request(inode,
+ offset, &len, snapc,
+ num_ops);
if (IS_ERR(req)) {
rc = PTR_ERR(req);
@@ -828,11 +850,17 @@ get_more_pages:
break;
}
- max_pages = req->r_num_pages;
-
- alloc_page_vec(fsc, req);
req->r_callback = writepages_finish;
req->r_inode = inode;
+
+ max_pages = calc_pages_for(0, (u64)len);
+ size = max_pages * sizeof (*pages);
+ pages = kmalloc(size, GFP_NOFS);
+ if (!pages) {
+ pool = fsc->wb_pagevec_pool;
+ pages = mempool_alloc(pool, GFP_NOFS);
+ BUG_ON(!pages);
+ }
}
/* note position of first page in pvec */
@@ -850,7 +878,7 @@ get_more_pages:
}
set_page_writeback(page);
- req->r_pages[locked_pages] = page;
+ pages[locked_pages] = page;
locked_pages++;
next = page->index + 1;
}
@@ -879,18 +907,27 @@ get_more_pages:
pvec.nr -= i-first;
}
- /* submit the write */
- offset = req->r_pages[0]->index << PAGE_CACHE_SHIFT;
- len = min((snap_size ? snap_size : i_size_read(inode)) - offset,
+ /* Format the osd request message and submit the write */
+
+ offset = page_offset(pages[0]);
+ len = min(snap_size - offset,
(u64)locked_pages << PAGE_CACHE_SHIFT);
dout("writepages got %d pages at %llu~%llu\n",
locked_pages, offset, len);
- /* revise final length, page count */
- req->r_num_pages = locked_pages;
- req->r_request_ops[0].extent.length = cpu_to_le64(len);
- req->r_request_ops[0].payload_len = cpu_to_le32(len);
- req->r_request->hdr.data_len = cpu_to_le32(len);
+ osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0,
+ !!pool, false);
+
+ pages = NULL; /* request message now owns the pages array */
+ pool = NULL;
+
+ /* Update the write op length in case we changed it */
+
+ osd_req_op_extent_update(req, 0, len);
+
+ vino = ceph_vino(inode);
+ ceph_osdc_build_request(req, offset, snapc, vino.snap,
+ &inode->i_mtime);
rc = ceph_osdc_start_request(&fsc->client->osdc, req, true);
BUG_ON(rc);
@@ -1067,51 +1104,23 @@ static int ceph_write_begin(struct file *file, struct address_space *mapping,
struct page **pagep, void **fsdata)
{
struct inode *inode = file_inode(file);
- struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_file_info *fi = file->private_data;
struct page *page;
pgoff_t index = pos >> PAGE_CACHE_SHIFT;
- int r, want, got = 0;
-
- if (fi->fmode & CEPH_FILE_MODE_LAZY)
- want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO;
- else
- want = CEPH_CAP_FILE_BUFFER;
-
- dout("write_begin %p %llx.%llx %llu~%u getting caps. i_size %llu\n",
- inode, ceph_vinop(inode), pos, len, inode->i_size);
- r = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, &got, pos+len);
- if (r < 0)
- return r;
- dout("write_begin %p %llx.%llx %llu~%u got cap refs on %s\n",
- inode, ceph_vinop(inode), pos, len, ceph_cap_string(got));
- if (!(got & (CEPH_CAP_FILE_BUFFER|CEPH_CAP_FILE_LAZYIO))) {
- ceph_put_cap_refs(ci, got);
- return -EAGAIN;
- }
+ int r;
do {
/* get a page */
page = grab_cache_page_write_begin(mapping, index, 0);
- if (!page) {
- r = -ENOMEM;
- break;
- }
+ if (!page)
+ return -ENOMEM;
+ *pagep = page;
dout("write_begin file %p inode %p page %p %d~%d\n", file,
inode, page, (int)pos, (int)len);
r = ceph_update_writeable_page(file, pos, len, page);
- if (r)
- page_cache_release(page);
} while (r == -EAGAIN);
- if (r) {
- ceph_put_cap_refs(ci, got);
- } else {
- *pagep = page;
- *(int *)fsdata = got;
- }
return r;
}
@@ -1125,12 +1134,10 @@ static int ceph_write_end(struct file *file, struct address_space *mapping,
struct page *page, void *fsdata)
{
struct inode *inode = file_inode(file);
- struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
struct ceph_mds_client *mdsc = fsc->mdsc;
unsigned from = pos & (PAGE_CACHE_SIZE - 1);
int check_cap = 0;
- int got = (unsigned long)fsdata;
dout("write_end file %p inode %p page %p %d~%d (%d)\n", file,
inode, page, (int)pos, (int)copied, (int)len);
@@ -1153,19 +1160,6 @@ static int ceph_write_end(struct file *file, struct address_space *mapping,
up_read(&mdsc->snap_rwsem);
page_cache_release(page);
- if (copied > 0) {
- int dirty;
- spin_lock(&ci->i_ceph_lock);
- dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR);
- spin_unlock(&ci->i_ceph_lock);
- if (dirty)
- __mark_inode_dirty(inode, dirty);
- }
-
- dout("write_end %p %llx.%llx %llu~%u dropping cap refs on %s\n",
- inode, ceph_vinop(inode), pos, len, ceph_cap_string(got));
- ceph_put_cap_refs(ci, got);
-
if (check_cap)
ceph_check_caps(ceph_inode(inode), CHECK_CAPS_AUTHONLY, NULL);