From 00dc417fa3e763345b34ccb6034d72de76eea0a1 Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mfasheh@suse.com>
Date: Fri, 3 Oct 2008 17:32:11 -0400
Subject: ocfs2: fiemap support

Plug ocfs2 into ->fiemap. Some portions of ocfs2_get_clusters() had to be
refactored so that the extent cache can be skipped in favor of going
directly to the on-disk records. This makes it easier for us to determine
which extent is the last one in the btree. Also, I'm not sure we want to be
caching fiemap lookups anyway as they're not directly related to data
read/write.

Signed-off-by: Mark Fasheh <mfasheh@suse.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Cc: ocfs2-devel@oss.oracle.com
Cc: linux-fsdevel@vger.kernel.org
---
 fs/ocfs2/alloc.c      |   9 --
 fs/ocfs2/alloc.h      |   9 ++
 fs/ocfs2/extent_map.c | 346 ++++++++++++++++++++++++++++++++++++++++++--------
 fs/ocfs2/extent_map.h |   3 +
 fs/ocfs2/file.c       |   1 +
 5 files changed, 306 insertions(+), 62 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 10bfb466e068..29ff57ec5d1f 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -989,15 +989,6 @@ out:
 	return ret;
 }
 
-/*
- * This is only valid for leaf nodes, which are the only ones that can
- * have empty extents anyway.
- */
-static inline int ocfs2_is_empty_extent(struct ocfs2_extent_rec *rec)
-{
-	return !rec->e_leaf_clusters;
-}
-
 /*
  * This function will discard the rightmost extent record.
  */
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
index 42ff94bd8011..60cd3d59230c 100644
--- a/fs/ocfs2/alloc.h
+++ b/fs/ocfs2/alloc.h
@@ -146,4 +146,13 @@ static inline unsigned int ocfs2_rec_clusters(struct ocfs2_extent_list *el,
 		return le16_to_cpu(rec->e_leaf_clusters);
 }
 
+/*
+ * This is only valid for leaf nodes, which are the only ones that can
+ * have empty extents anyway.
+ */
+static inline int ocfs2_is_empty_extent(struct ocfs2_extent_rec *rec)
+{
+	return !rec->e_leaf_clusters;
+}
+
 #endif /* OCFS2_ALLOC_H */
diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c
index c58668a326fe..aed268e80b49 100644
--- a/fs/ocfs2/extent_map.c
+++ b/fs/ocfs2/extent_map.c
@@ -25,6 +25,7 @@
 #include <linux/fs.h>
 #include <linux/init.h>
 #include <linux/types.h>
+#include <linux/fiemap.h>
 
 #define MLOG_MASK_PREFIX ML_EXTENT_MAP
 #include <cluster/masklog.h>
@@ -32,6 +33,7 @@
 #include "ocfs2.h"
 
 #include "alloc.h"
+#include "dlmglue.h"
 #include "extent_map.h"
 #include "inode.h"
 #include "super.h"
@@ -282,6 +284,51 @@ out:
 		kfree(new_emi);
 }
 
+static int ocfs2_last_eb_is_empty(struct inode *inode,
+				  struct ocfs2_dinode *di)
+{
+	int ret, next_free;
+	u64 last_eb_blk = le64_to_cpu(di->i_last_eb_blk);
+	struct buffer_head *eb_bh = NULL;
+	struct ocfs2_extent_block *eb;
+	struct ocfs2_extent_list *el;
+
+	ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), last_eb_blk,
+			       &eb_bh, OCFS2_BH_CACHED, inode);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	eb = (struct ocfs2_extent_block *) eb_bh->b_data;
+	el = &eb->h_list;
+
+	if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
+		ret = -EROFS;
+		OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
+		goto out;
+	}
+
+	if (el->l_tree_depth) {
+		ocfs2_error(inode->i_sb,
+			    "Inode %lu has non zero tree depth in "
+			    "leaf block %llu\n", inode->i_ino,
+			    (unsigned long long)eb_bh->b_blocknr);
+		ret = -EROFS;
+		goto out;
+	}
+
+	next_free = le16_to_cpu(el->l_next_free_rec);
+
+	if (next_free == 0 ||
+	    (next_free == 1 && ocfs2_is_empty_extent(&el->l_recs[0])))
+		ret = 1;
+
+out:
+	brelse(eb_bh);
+	return ret;
+}
+
 /*
  * Return the 1st index within el which contains an extent start
  * larger than v_cluster.
@@ -373,42 +420,28 @@ out:
 	return ret;
 }
 
-int ocfs2_get_clusters(struct inode *inode, u32 v_cluster,
-		       u32 *p_cluster, u32 *num_clusters,
-		       unsigned int *extent_flags)
+static int ocfs2_get_clusters_nocache(struct inode *inode,
+				      struct buffer_head *di_bh,
+				      u32 v_cluster, unsigned int *hole_len,
+				      struct ocfs2_extent_rec *ret_rec,
+				      unsigned int *is_last)
 {
-	int ret, i;
-	unsigned int flags = 0;
-	struct buffer_head *di_bh = NULL;
-	struct buffer_head *eb_bh = NULL;
+	int i, ret, tree_height, len;
 	struct ocfs2_dinode *di;
-	struct ocfs2_extent_block *eb;
+	struct ocfs2_extent_block *uninitialized_var(eb);
 	struct ocfs2_extent_list *el;
 	struct ocfs2_extent_rec *rec;
-	u32 coff;
-
-	if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
-		ret = -ERANGE;
-		mlog_errno(ret);
-		goto out;
-	}
-
-	ret = ocfs2_extent_map_lookup(inode, v_cluster, p_cluster,
-				      num_clusters, extent_flags);
-	if (ret == 0)
-		goto out;
+	struct buffer_head *eb_bh = NULL;
 
-	ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), OCFS2_I(inode)->ip_blkno,
-			       &di_bh, OCFS2_BH_CACHED, inode);
-	if (ret) {
-		mlog_errno(ret);
-		goto out;
-	}
+	memset(ret_rec, 0, sizeof(*ret_rec));
+	if (is_last)
+		*is_last = 0;
 
 	di = (struct ocfs2_dinode *) di_bh->b_data;
 	el = &di->id2.i_list;
+	tree_height = le16_to_cpu(el->l_tree_depth);
 
-	if (el->l_tree_depth) {
+	if (tree_height > 0) {
 		ret = ocfs2_find_leaf(inode, el, v_cluster, &eb_bh);
 		if (ret) {
 			mlog_errno(ret);
@@ -431,46 +464,143 @@ int ocfs2_get_clusters(struct inode *inode, u32 v_cluster,
 	i = ocfs2_search_extent_list(el, v_cluster);
 	if (i == -1) {
 		/*
-		 * A hole was found. Return some canned values that
-		 * callers can key on. If asked for, num_clusters will
-		 * be populated with the size of the hole.
+		 * Holes can be larger than the maximum size of an
+		 * extent, so we return their lengths in a seperate
+		 * field.
 		 */
-		*p_cluster = 0;
-		if (num_clusters) {
+		if (hole_len) {
 			ret = ocfs2_figure_hole_clusters(inode, el, eb_bh,
-							 v_cluster,
-							 num_clusters);
+							 v_cluster, &len);
 			if (ret) {
 				mlog_errno(ret);
 				goto out;
 			}
+
+			*hole_len = len;
 		}
-	} else {
-		rec = &el->l_recs[i];
+		goto out_hole;
+	}
 
-		BUG_ON(v_cluster < le32_to_cpu(rec->e_cpos));
+	rec = &el->l_recs[i];
 
-		if (!rec->e_blkno) {
-			ocfs2_error(inode->i_sb, "Inode %lu has bad extent "
-				    "record (%u, %u, 0)", inode->i_ino,
-				    le32_to_cpu(rec->e_cpos),
-				    ocfs2_rec_clusters(el, rec));
-			ret = -EROFS;
-			goto out;
+	BUG_ON(v_cluster < le32_to_cpu(rec->e_cpos));
+
+	if (!rec->e_blkno) {
+		ocfs2_error(inode->i_sb, "Inode %lu has bad extent "
+			    "record (%u, %u, 0)", inode->i_ino,
+			    le32_to_cpu(rec->e_cpos),
+			    ocfs2_rec_clusters(el, rec));
+		ret = -EROFS;
+		goto out;
+	}
+
+	*ret_rec = *rec;
+
+	/*
+	 * Checking for last extent is potentially expensive - we
+	 * might have to look at the next leaf over to see if it's
+	 * empty.
+	 *
+	 * The first two checks are to see whether the caller even
+	 * cares for this information, and if the extent is at least
+	 * the last in it's list.
+	 *
+	 * If those hold true, then the extent is last if any of the
+	 * additional conditions hold true:
+	 *  - Extent list is in-inode
+	 *  - Extent list is right-most
+	 *  - Extent list is 2nd to rightmost, with empty right-most
+	 */
+	if (is_last) {
+		if (i == (le16_to_cpu(el->l_next_free_rec) - 1)) {
+			if (tree_height == 0)
+				*is_last = 1;
+			else if (eb->h_blkno == di->i_last_eb_blk)
+				*is_last = 1;
+			else if (eb->h_next_leaf_blk == di->i_last_eb_blk) {
+				ret = ocfs2_last_eb_is_empty(inode, di);
+				if (ret < 0) {
+					mlog_errno(ret);
+					goto out;
+				}
+				if (ret == 1)
+					*is_last = 1;
+			}
 		}
+	}
+
+out_hole:
+	ret = 0;
+out:
+	brelse(eb_bh);
+	return ret;
+}
+
+static void ocfs2_relative_extent_offsets(struct super_block *sb,
+					  u32 v_cluster,
+					  struct ocfs2_extent_rec *rec,
+					  u32 *p_cluster, u32 *num_clusters)
+
+{
+	u32 coff = v_cluster - le32_to_cpu(rec->e_cpos);
+
+	*p_cluster = ocfs2_blocks_to_clusters(sb, le64_to_cpu(rec->e_blkno));
+	*p_cluster = *p_cluster + coff;
+
+	if (num_clusters)
+		*num_clusters = le16_to_cpu(rec->e_leaf_clusters) - coff;
+}
+
+int ocfs2_get_clusters(struct inode *inode, u32 v_cluster,
+		       u32 *p_cluster, u32 *num_clusters,
+		       unsigned int *extent_flags)
+{
+	int ret;
+	unsigned int uninitialized_var(hole_len), flags = 0;
+	struct buffer_head *di_bh = NULL;
+	struct ocfs2_extent_rec rec;
 
-		coff = v_cluster - le32_to_cpu(rec->e_cpos);
+	if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
+		ret = -ERANGE;
+		mlog_errno(ret);
+		goto out;
+	}
 
-		*p_cluster = ocfs2_blocks_to_clusters(inode->i_sb,
-						    le64_to_cpu(rec->e_blkno));
-		*p_cluster = *p_cluster + coff;
+	ret = ocfs2_extent_map_lookup(inode, v_cluster, p_cluster,
+				      num_clusters, extent_flags);
+	if (ret == 0)
+		goto out;
 
-		if (num_clusters)
-			*num_clusters = ocfs2_rec_clusters(el, rec) - coff;
+	ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), OCFS2_I(inode)->ip_blkno,
+			       &di_bh, OCFS2_BH_CACHED, inode);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
 
-		flags = rec->e_flags;
+	ret = ocfs2_get_clusters_nocache(inode, di_bh, v_cluster, &hole_len,
+					 &rec, NULL);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
 
-		ocfs2_extent_map_insert_rec(inode, rec);
+	if (rec.e_blkno == 0ULL) {
+		/*
+		 * A hole was found. Return some canned values that
+		 * callers can key on. If asked for, num_clusters will
+		 * be populated with the size of the hole.
+		 */
+		*p_cluster = 0;
+		if (num_clusters) {
+			*num_clusters = hole_len;
+		}
+	} else {
+		ocfs2_relative_extent_offsets(inode->i_sb, v_cluster, &rec,
+					      p_cluster, num_clusters);
+		flags = rec.e_flags;
+
+		ocfs2_extent_map_insert_rec(inode, &rec);
 	}
 
 	if (extent_flags)
@@ -478,7 +608,6 @@ int ocfs2_get_clusters(struct inode *inode, u32 v_cluster,
 
 out:
 	brelse(di_bh);
-	brelse(eb_bh);
 	return ret;
 }
 
@@ -521,3 +650,114 @@ int ocfs2_extent_map_get_blocks(struct inode *inode, u64 v_blkno, u64 *p_blkno,
 out:
 	return ret;
 }
+
+static int ocfs2_fiemap_inline(struct inode *inode, struct buffer_head *di_bh,
+			       struct fiemap_extent_info *fieinfo,
+			       u64 map_start)
+{
+	int ret;
+	unsigned int id_count;
+	struct ocfs2_dinode *di;
+	u64 phys;
+	u32 flags = FIEMAP_EXTENT_DATA_INLINE|FIEMAP_EXTENT_LAST;
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+
+	di = (struct ocfs2_dinode *)di_bh->b_data;
+	id_count = le16_to_cpu(di->id2.i_data.id_count);
+
+	if (map_start < id_count) {
+		phys = oi->ip_blkno << inode->i_sb->s_blocksize_bits;
+		phys += offsetof(struct ocfs2_dinode, id2.i_data.id_data);
+
+		ret = fiemap_fill_next_extent(fieinfo, 0, phys, id_count,
+					      flags);
+		if (ret < 0)
+			return ret;
+	}
+
+	return 0;
+}
+
+#define OCFS2_FIEMAP_FLAGS	(FIEMAP_FLAG_SYNC)
+
+int ocfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
+		 u64 map_start, u64 map_len)
+{
+	int ret, is_last;
+	u32 mapping_end, cpos;
+	unsigned int hole_size;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	u64 len_bytes, phys_bytes, virt_bytes;
+	struct buffer_head *di_bh = NULL;
+	struct ocfs2_extent_rec rec;
+
+	ret = fiemap_check_flags(fieinfo, OCFS2_FIEMAP_FLAGS);
+	if (ret)
+		return ret;
+
+	ret = ocfs2_inode_lock(inode, &di_bh, 0);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	down_read(&OCFS2_I(inode)->ip_alloc_sem);
+
+	/*
+	 * Handle inline-data separately.
+	 */
+	if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
+		ret = ocfs2_fiemap_inline(inode, di_bh, fieinfo, map_start);
+		goto out_unlock;
+	}
+
+	cpos = map_start >> osb->s_clustersize_bits;
+	mapping_end = ocfs2_clusters_for_bytes(inode->i_sb,
+					       map_start + map_len);
+	mapping_end -= cpos;
+	is_last = 0;
+	while (cpos < mapping_end && !is_last) {
+		u32 fe_flags;
+
+		ret = ocfs2_get_clusters_nocache(inode, di_bh, cpos,
+						 &hole_size, &rec, &is_last);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		if (rec.e_blkno == 0ULL) {
+			cpos += hole_size;
+			continue;
+		}
+
+		fe_flags = 0;
+		if (rec.e_flags & OCFS2_EXT_UNWRITTEN)
+			fe_flags |= FIEMAP_EXTENT_UNWRITTEN;
+		if (is_last)
+			fe_flags |= FIEMAP_EXTENT_LAST;
+		len_bytes = (u64)le16_to_cpu(rec.e_leaf_clusters) << osb->s_clustersize_bits;
+		phys_bytes = le64_to_cpu(rec.e_blkno) << osb->sb->s_blocksize_bits;
+		virt_bytes = (u64)le32_to_cpu(rec.e_cpos) << osb->s_clustersize_bits;
+
+		ret = fiemap_fill_next_extent(fieinfo, virt_bytes, phys_bytes,
+					      len_bytes, fe_flags);
+		if (ret)
+			break;
+
+		cpos = le32_to_cpu(rec.e_cpos)+ le16_to_cpu(rec.e_leaf_clusters);
+	}
+
+	if (ret > 0)
+		ret = 0;
+
+out_unlock:
+	brelse(di_bh);
+
+	up_read(&OCFS2_I(inode)->ip_alloc_sem);
+
+	ocfs2_inode_unlock(inode, 0);
+out:
+
+	return ret;
+}
diff --git a/fs/ocfs2/extent_map.h b/fs/ocfs2/extent_map.h
index de91e3e41a22..1b97490e1ea8 100644
--- a/fs/ocfs2/extent_map.h
+++ b/fs/ocfs2/extent_map.h
@@ -50,4 +50,7 @@ int ocfs2_get_clusters(struct inode *inode, u32 v_cluster, u32 *p_cluster,
 int ocfs2_extent_map_get_blocks(struct inode *inode, u64 v_blkno, u64 *p_blkno,
 				u64 *ret_count, unsigned int *extent_flags);
 
+int ocfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
+		 u64 map_start, u64 map_len);
+
 #endif  /* _EXTENT_MAP_H */
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index ec2ed15c3daa..ed38796052d2 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -2228,6 +2228,7 @@ const struct inode_operations ocfs2_file_iops = {
 	.getattr	= ocfs2_getattr,
 	.permission	= ocfs2_permission,
 	.fallocate	= ocfs2_fallocate,
+	.fiemap		= ocfs2_fiemap,
 };
 
 const struct inode_operations ocfs2_special_file_iops = {
-- 
cgit v1.2.3