-rw-r--r--  Documentation/filesystems/ext4.txt | 125
-rw-r--r--  drivers/gpu/drm/drm_memory.c | 5
-rw-r--r--  drivers/gpu/drm/radeon/radeon_cp.c | 2
-rw-r--r--  drivers/infiniband/core/addr.c | 42
-rw-r--r--  drivers/infiniband/core/agent.h | 2
-rw-r--r--  drivers/infiniband/core/cache.c | 2
-rw-r--r--  drivers/infiniband/core/cm.c | 2
-rw-r--r--  drivers/infiniband/core/cma.c | 162
-rw-r--r--  drivers/infiniband/core/core_priv.h | 2
-rw-r--r--  drivers/infiniband/core/device.c | 2
-rw-r--r--  drivers/infiniband/core/fmr_pool.c | 2
-rw-r--r--  drivers/infiniband/core/mad_priv.h | 2
-rw-r--r--  drivers/infiniband/core/mad_rmpp.c | 2
-rw-r--r--  drivers/infiniband/core/mad_rmpp.h | 2
-rw-r--r--  drivers/infiniband/core/packer.c | 2
-rw-r--r--  drivers/infiniband/core/sa_query.c | 24
-rw-r--r--  drivers/infiniband/core/sysfs.c | 122
-rw-r--r--  drivers/infiniband/core/ucm.c | 2
-rw-r--r--  drivers/infiniband/core/ud_header.c | 2
-rw-r--r--  drivers/infiniband/core/umem.c | 2
-rw-r--r--  drivers/infiniband/core/user_mad.c | 2
-rw-r--r--  drivers/infiniband/core/uverbs.h | 2
-rw-r--r--  drivers/infiniband/core/uverbs_cmd.c | 4
-rw-r--r--  drivers/infiniband/core/uverbs_main.c | 2
-rw-r--r--  drivers/infiniband/core/verbs.c | 49
-rw-r--r--  drivers/infiniband/hw/amso1100/c2_rnic.c | 2
-rw-r--r--  drivers/infiniband/hw/cxgb3/cxio_hal.c | 27
-rw-r--r--  drivers/infiniband/hw/cxgb3/cxio_hal.h | 5
-rw-r--r--  drivers/infiniband/hw/cxgb3/cxio_wr.h | 103
-rw-r--r--  drivers/infiniband/hw/cxgb3/iwch.c | 8
-rw-r--r--  drivers/infiniband/hw/cxgb3/iwch.h | 2
-rw-r--r--  drivers/infiniband/hw/cxgb3/iwch_cq.c | 15
-rw-r--r--  drivers/infiniband/hw/cxgb3/iwch_provider.c | 199
-rw-r--r--  drivers/infiniband/hw/cxgb3/iwch_provider.h | 8
-rw-r--r--  drivers/infiniband/hw/cxgb3/iwch_qp.c | 261
-rw-r--r--  drivers/infiniband/hw/ehca/ehca_irq.c | 9
-rw-r--r--  drivers/infiniband/hw/ehca/ehca_main.c | 1
-rw-r--r--  drivers/infiniband/hw/ehca/ehca_reqs.c | 14
-rw-r--r--  drivers/infiniband/hw/ehca/hcp_if.c | 10
-rw-r--r--  drivers/infiniband/hw/ehca/hcp_if.h | 1
-rw-r--r--  drivers/infiniband/hw/ipath/ipath_cq.c | 2
-rw-r--r--  drivers/infiniband/hw/ipath/ipath_iba7220.c | 4
-rw-r--r--  drivers/infiniband/hw/ipath/ipath_mad.c | 6
-rw-r--r--  drivers/infiniband/hw/ipath/ipath_rc.c | 4
-rw-r--r--  drivers/infiniband/hw/ipath/ipath_ruc.c | 4
-rw-r--r--  drivers/infiniband/hw/ipath/ipath_uc.c | 8
-rw-r--r--  drivers/infiniband/hw/ipath/ipath_ud.c | 8
-rw-r--r--  drivers/infiniband/hw/ipath/ipath_verbs.c | 3
-rw-r--r--  drivers/infiniband/hw/mlx4/cq.c | 12
-rw-r--r--  drivers/infiniband/hw/mlx4/mad.c | 3
-rw-r--r--  drivers/infiniband/hw/mlx4/main.c | 7
-rw-r--r--  drivers/infiniband/hw/mlx4/mlx4_ib.h | 3
-rw-r--r--  drivers/infiniband/hw/mlx4/qp.c | 73
-rw-r--r--  drivers/infiniband/hw/mthca/mthca_allocator.c | 2
-rw-r--r--  drivers/infiniband/hw/mthca/mthca_av.c | 2
-rw-r--r--  drivers/infiniband/hw/mthca/mthca_catas.c | 17
-rw-r--r--  drivers/infiniband/hw/mthca/mthca_cmd.c | 2
-rw-r--r--  drivers/infiniband/hw/mthca/mthca_cmd.h | 2
-rw-r--r--  drivers/infiniband/hw/mthca/mthca_config_reg.h | 2
-rw-r--r--  drivers/infiniband/hw/mthca/mthca_cq.c | 6
-rw-r--r--  drivers/infiniband/hw/mthca/mthca_dev.h | 3
-rw-r--r--  drivers/infiniband/hw/mthca/mthca_doorbell.h | 2
-rw-r--r--  drivers/infiniband/hw/mthca/mthca_eq.c | 2
-rw-r--r--  drivers/infiniband/hw/mthca/mthca_mad.c | 2
-rw-r--r--  drivers/infiniband/hw/mthca/mthca_main.c | 2
-rw-r--r--  drivers/infiniband/hw/mthca/mthca_mcg.c | 2
-rw-r--r--  drivers/infiniband/hw/mthca/mthca_memfree.c | 2
-rw-r--r--  drivers/infiniband/hw/mthca/mthca_memfree.h | 2
-rw-r--r--  drivers/infiniband/hw/mthca/mthca_mr.c | 2
-rw-r--r--  drivers/infiniband/hw/mthca/mthca_pd.c | 2
-rw-r--r--  drivers/infiniband/hw/mthca/mthca_profile.c | 2
-rw-r--r--  drivers/infiniband/hw/mthca/mthca_profile.h | 2
-rw-r--r--  drivers/infiniband/hw/mthca/mthca_provider.c | 2
-rw-r--r--  drivers/infiniband/hw/mthca/mthca_provider.h | 2
-rw-r--r--  drivers/infiniband/hw/mthca/mthca_qp.c | 32
-rw-r--r--  drivers/infiniband/hw/mthca/mthca_reset.c | 2
-rw-r--r--  drivers/infiniband/hw/mthca/mthca_srq.c | 2
-rw-r--r--  drivers/infiniband/hw/mthca/mthca_uar.c | 2
-rw-r--r--  drivers/infiniband/hw/mthca/mthca_user.h | 1
-rw-r--r--  drivers/infiniband/hw/mthca/mthca_wqe.h | 2
-rw-r--r--  drivers/infiniband/hw/nes/nes.c | 2
-rw-r--r--  drivers/infiniband/hw/nes/nes.h | 9
-rw-r--r--  drivers/infiniband/hw/nes/nes_cm.c | 1
-rw-r--r--  drivers/infiniband/hw/nes/nes_hw.c | 68
-rw-r--r--  drivers/infiniband/hw/nes/nes_hw.h | 2
-rw-r--r--  drivers/infiniband/hw/nes/nes_utils.c | 33
-rw-r--r--  drivers/infiniband/hw/nes/nes_verbs.c | 207
-rw-r--r--  drivers/infiniband/ulp/ipoib/Kconfig | 1
-rw-r--r--  drivers/infiniband/ulp/ipoib/ipoib.h | 48
-rw-r--r--  drivers/infiniband/ulp/ipoib/ipoib_cm.c | 104
-rw-r--r--  drivers/infiniband/ulp/ipoib/ipoib_ethtool.c | 46
-rw-r--r--  drivers/infiniband/ulp/ipoib/ipoib_fs.c | 2
-rw-r--r--  drivers/infiniband/ulp/ipoib/ipoib_ib.c | 52
-rw-r--r--  drivers/infiniband/ulp/ipoib/ipoib_main.c | 115
-rw-r--r--  drivers/infiniband/ulp/ipoib/ipoib_multicast.c | 27
-rw-r--r--  drivers/infiniband/ulp/ipoib/ipoib_verbs.c | 69
-rw-r--r--  drivers/infiniband/ulp/ipoib/ipoib_vlan.c | 2
-rw-r--r--  drivers/infiniband/ulp/iser/iscsi_iser.c | 3
-rw-r--r--  drivers/infiniband/ulp/iser/iscsi_iser.h | 2
-rw-r--r--  drivers/infiniband/ulp/iser/iser_initiator.c | 2
-rw-r--r--  drivers/infiniband/ulp/iser/iser_memory.c | 2
-rw-r--r--  drivers/infiniband/ulp/iser/iser_verbs.c | 2
-rw-r--r--  drivers/infiniband/ulp/srp/ib_srp.c | 15
-rw-r--r--  drivers/infiniband/ulp/srp/ib_srp.h | 2
-rw-r--r--  drivers/net/cxgb3/cxgb3_ctl_defs.h | 1
-rw-r--r--  drivers/net/cxgb3/cxgb3_offload.c | 7
-rw-r--r--  drivers/net/cxgb3/version.h | 2
-rw-r--r--  drivers/net/mlx4/fw.c | 28
-rw-r--r--  drivers/net/mlx4/fw.h | 6
-rw-r--r--  drivers/net/mlx4/main.c | 7
-rw-r--r--  drivers/net/mlx4/mcg.c | 17
-rw-r--r--  firmware/Makefile | 2
-rw-r--r--  fs/buffer.c | 19
-rw-r--r--  fs/ext4/balloc.c | 209
-rw-r--r--  fs/ext4/dir.c | 17
-rw-r--r--  fs/ext4/ext4.h | 61
-rw-r--r--  fs/ext4/ext4_extents.h | 1
-rw-r--r--  fs/ext4/ext4_i.h | 10
-rw-r--r--  fs/ext4/ext4_jbd2.h | 21
-rw-r--r--  fs/ext4/ext4_sb.h | 5
-rw-r--r--  fs/ext4/extents.c | 111
-rw-r--r--  fs/ext4/file.c | 20
-rw-r--r--  fs/ext4/fsync.c | 4
-rw-r--r--  fs/ext4/group.h | 2
-rw-r--r--  fs/ext4/ialloc.c | 113
-rw-r--r--  fs/ext4/inode.c | 1591
-rw-r--r--  fs/ext4/mballoc.c | 451
-rw-r--r--  fs/ext4/namei.c | 45
-rw-r--r--  fs/ext4/resize.c | 52
-rw-r--r--  fs/ext4/super.c | 142
-rw-r--r--  fs/ext4/xattr.c | 2
-rw-r--r--  fs/ext4/xattr_trusted.c | 4
-rw-r--r--  fs/ext4/xattr_user.c | 4
-rw-r--r--  fs/jbd2/checkpoint.c | 1
-rw-r--r--  fs/jbd2/commit.c | 294
-rw-r--r--  fs/jbd2/journal.c | 53
-rw-r--r--  fs/jbd2/transaction.c | 365
-rw-r--r--  fs/mpage.c | 14
-rw-r--r--  include/drm/drmP.h | 1
-rw-r--r--  include/linux/fs.h | 2
-rw-r--r--  include/linux/jbd2.h | 73
-rw-r--r--  include/linux/mlx4/device.h | 3
-rw-r--r--  include/linux/mpage.h | 10
-rw-r--r--  include/linux/percpu_counter.h | 12
-rw-r--r--  include/linux/writeback.h | 1
-rw-r--r--  include/rdma/ib_addr.h | 43
-rw-r--r--  include/rdma/ib_cache.h | 2
-rw-r--r--  include/rdma/ib_cm.h | 2
-rw-r--r--  include/rdma/ib_fmr_pool.h | 4
-rw-r--r--  include/rdma/ib_mad.h | 17
-rw-r--r--  include/rdma/ib_pack.h | 2
-rw-r--r--  include/rdma/ib_sa.h | 2
-rw-r--r--  include/rdma/ib_smi.h | 4
-rw-r--r--  include/rdma/ib_user_cm.h | 2
-rw-r--r--  include/rdma/ib_user_mad.h | 2
-rw-r--r--  include/rdma/ib_user_verbs.h | 7
-rw-r--r--  include/rdma/ib_verbs.h | 149
-rw-r--r--  include/rdma/iw_cm.h | 2
-rw-r--r--  include/rdma/rdma_cm.h | 52
-rw-r--r--  include/rdma/rdma_cm_ib.h | 50
-rw-r--r--  lib/percpu_counter.c | 7
-rw-r--r--  mm/filemap.c | 3
-rw-r--r--  mm/page-writeback.c | 3
-rw-r--r--  security/selinux/hooks.c | 22
-rw-r--r--  security/selinux/include/security.h | 2
-rw-r--r--  security/selinux/ss/services.c | 27
166 files changed, 4458 insertions, 2017 deletions
diff --git a/Documentation/filesystems/ext4.txt b/Documentation/filesystems/ext4.txt
index 0c5086db8352..80e193d82e2e 100644
--- a/Documentation/filesystems/ext4.txt
+++ b/Documentation/filesystems/ext4.txt
@@ -13,72 +13,93 @@ Mailing list: linux-ext4@vger.kernel.org
1. Quick usage instructions:
===========================
- - Grab updated e2fsprogs from
- ftp://ftp.kernel.org/pub/linux/kernel/people/tytso/e2fsprogs-interim/
- This is a patchset on top of e2fsprogs-1.39, which can be found at
+ - Compile and install the latest version of e2fsprogs (as of this
+ writing version 1.41) from:
+
+ http://sourceforge.net/project/showfiles.php?group_id=2406
+
+ or
+
ftp://ftp.kernel.org/pub/linux/kernel/people/tytso/e2fsprogs/
- - It's still mke2fs -j /dev/hda1
+ or grab the latest git repository from:
+
+ git://git.kernel.org/pub/scm/fs/ext2/e2fsprogs.git
+
+ - Create a new filesystem using the ext4dev filesystem type:
+
+ # mke2fs -t ext4dev /dev/hda1
+
+ Or configure an existing ext3 filesystem to support extents and set
+ the test_fs flag to indicate that it's ok for an in-development
+ filesystem to touch this filesystem:
- - mount /dev/hda1 /wherever -t ext4dev
+ # tune2fs -O extents -E test_fs /dev/hda1
- - To enable extents,
+ If the filesystem was created with 128 byte inodes, it can be
+   converted to use 256 byte inodes for greater efficiency via:
- mount /dev/hda1 /wherever -t ext4dev -o extents
+ # tune2fs -I 256 /dev/hda1
- - The filesystem is compatible with the ext3 driver until you add a file
- which has extents (ie: `mount -o extents', then create a file).
+ (Note: we currently do not have tools to convert an ext4dev
+   filesystem back to ext3; so please do not try this on production
+ filesystems.)
- NOTE: The "extents" mount flag is temporary. It will soon go away and
- extents will be enabled by the "-o extents" flag to mke2fs or tune2fs
+ - Mounting:
+
+ # mount -t ext4dev /dev/hda1 /wherever
- When comparing performance with other filesystems, remember that
- ext3/4 by default offers higher data integrity guarantees than most. So
- when comparing with a metadata-only journalling filesystem, use `mount -o
- data=writeback'. And you might as well use `mount -o nobh' too along
- with it. Making the journal larger than the mke2fs default often helps
- performance with metadata-intensive workloads.
+ ext3/4 by default offers higher data integrity guarantees than most.
+ So when comparing with a metadata-only journalling filesystem, such
+ as ext3, use `mount -o data=writeback'. And you might as well use
+ `mount -o nobh' too along with it. Making the journal larger than
+ the mke2fs default often helps performance with metadata-intensive
+ workloads.
2. Features
===========
2.1 Currently available
-* ability to use filesystems > 16TB
+* ability to use filesystems > 16TB (e2fsprogs support not available yet)
* extent format reduces metadata overhead (RAM, IO for access, transactions)
* extent format more robust in face of on-disk corruption due to magics,
* internal redundancy in tree
-
-2.1 Previously available, soon to be enabled by default by "mkefs.ext4":
-
-* dir_index and resize inode will be on by default
-* large inodes will be used by default for fast EAs, nsec timestamps, etc
+* improved file allocation (multi-block alloc)
+* fix 32000 subdirectory limit
+* nsec timestamps for mtime, atime, ctime, create time
+* inode version field on disk (NFSv4, Lustre)
+* reduced e2fsck time via uninit_bg feature
+* journal checksumming for robustness, performance
+* persistent file preallocation (e.g. for streaming media, databases)
+* ability to pack bitmaps and inode tables into larger virtual groups via the
+ flex_bg feature
+* large file support
+* Inode allocation using large virtual block groups via flex_bg
+* delayed allocation
+* large block (up to pagesize) support
+* efficient new ordered mode in JBD2 and ext4 (avoid using buffer head to force
+ the ordering)
2.2 Candidate features for future inclusion
-There are several under discussion, whether they all make it in is
-partly a function of how much time everyone has to work on them:
+* Online defrag (patches available but not well tested)
+* reduced mke2fs time via lazy itable initialization in conjunction with
+ the uninit_bg feature (capability to do this is available in e2fsprogs
+ but a kernel thread to do lazy zeroing of unused inode table blocks
+ after filesystem is first mounted is required for safety)
-* improved file allocation (multi-block alloc, delayed alloc; basically done)
-* fix 32000 subdirectory limit (patch exists, needs some e2fsck work)
-* nsec timestamps for mtime, atime, ctime, create time (patch exists,
- needs some e2fsck work)
-* inode version field on disk (NFSv4, Lustre; prototype exists)
-* reduced mke2fs/e2fsck time via uninitialized groups (prototype exists)
-* journal checksumming for robustness, performance (prototype exists)
-* persistent file preallocation (e.g for streaming media, databases)
+There are several others under discussion, whether they all make it in is
+partly a function of how much time everyone has to work on them. Features like
+metadata checksumming have been discussed and planned for a bit but no patches
+exist yet so I'm not sure they're in the near-term roadmap.
-Features like metadata checksumming have been discussed and planned for
-a bit but no patches exist yet so I'm not sure they're in the near-term
-roadmap.
+The big performance win will come with mballoc, delalloc and flex_bg
+grouping of bitmaps and inode tables. Some test results available here:
-The big performance win will come with mballoc and delalloc. CFS has
-been using mballoc for a few years already with Lustre, and IBM + Bull
-did a lot of benchmarking on it. The reason it isn't in the first set of
-patches is partly a manageability issue, and partly because it doesn't
-directly affect the on-disk format (outside of much better allocation)
-so it isn't critical to get into the first round of changes. I believe
-Alex is working on a new set of patches right now.
+ - http://www.bullopensource.org/ext4/20080530/ffsb-write-2.6.26-rc2.html
+ - http://www.bullopensource.org/ext4/20080530/ffsb-readwrite-2.6.26-rc2.html
3. Options
==========
@@ -222,9 +243,11 @@ stripe=n Number of filesystem blocks that mballoc will try
to use for allocation size and alignment. For RAID5/6
systems this should be the number of data
disks * RAID chunk size in file system blocks.
-
+delalloc (*) Defer block allocation until write-out time.
+nodelalloc Disable delayed allocation. Blocks are allocated
+ when data is copied from user space to the page cache.
Data Mode
----------
+=========
There are 3 different data modes:
* writeback mode
@@ -236,10 +259,10 @@ typically provide the best ext4 performance.
* ordered mode
In data=ordered mode, ext4 only officially journals metadata, but it logically
-groups metadata and data blocks into a single unit called a transaction. When
-it's time to write the new metadata out to disk, the associated data blocks
-are written first. In general, this mode performs slightly slower than
-writeback but significantly faster than journal mode.
+groups metadata information related to data changes with the data blocks into a
+single unit called a transaction. When it's time to write the new metadata
+out to disk, the associated data blocks are written first. In general,
+this mode performs slightly slower than writeback but significantly faster than journal mode.
* journal mode
data=journal mode provides full data and metadata journaling. All new data is
@@ -247,7 +270,8 @@ written to the journal first, and then to its final location.
In the event of a crash, the journal can be replayed, bringing both data and
metadata into a consistent state. This mode is the slowest except when data
needs to be read from and written to disk at the same time where it
-outperforms all others modes.
+outperforms all other modes. Currently ext4 does not have delayed
+allocation support if this data journalling mode is selected.
References
==========
@@ -256,7 +280,8 @@ kernel source: <file:fs/ext4/>
<file:fs/jbd2/>
programs: http://e2fsprogs.sourceforge.net/
- http://ext2resize.sourceforge.net
useful links: http://fedoraproject.org/wiki/ext3-devel
http://www.bullopensource.org/ext4/
+ http://ext4.wiki.kernel.org/index.php/Main_Page
+ http://fedoraproject.org/wiki/Features/Ext4
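The mount options discussed above can also be exercised programmatically. A minimal userspace sketch, assuming /dev/hda1 and /mnt purely as placeholder device and mount point:

/* Minimal sketch of mounting ext4dev with the benchmarking options noted
 * above (data=writeback,nobh).  /dev/hda1 and /mnt are placeholders; the
 * program must run as root.  Equivalent to:
 *   mount -t ext4dev -o data=writeback,nobh /dev/hda1 /mnt
 */
#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
	if (mount("/dev/hda1", "/mnt", "ext4dev", 0, "data=writeback,nobh")) {
		perror("mount");
		return 1;
	}
	printf("mounted /dev/hda1 on /mnt as ext4dev\n");
	return 0;
}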
diff --git a/drivers/gpu/drm/drm_memory.c b/drivers/gpu/drm/drm_memory.c
index 845081b44f63..0177012845c6 100644
--- a/drivers/gpu/drm/drm_memory.c
+++ b/drivers/gpu/drm/drm_memory.c
@@ -167,6 +167,11 @@ void drm_core_ioremap(struct drm_map *map, struct drm_device *dev)
}
EXPORT_SYMBOL(drm_core_ioremap);
+void drm_core_ioremap_wc(struct drm_map *map, struct drm_device *dev)
+{
+ map->handle = ioremap_wc(map->offset, map->size);
+}
+EXPORT_SYMBOL(drm_core_ioremap_wc);
void drm_core_ioremapfree(struct drm_map *map, struct drm_device *dev)
{
if (!map->handle || !map->size)
diff --git a/drivers/gpu/drm/radeon/radeon_cp.c b/drivers/gpu/drm/radeon/radeon_cp.c
index e53158f0ecb5..f0de81a5689d 100644
--- a/drivers/gpu/drm/radeon/radeon_cp.c
+++ b/drivers/gpu/drm/radeon/radeon_cp.c
@@ -1154,7 +1154,7 @@ static int radeon_do_init_cp(struct drm_device * dev, drm_radeon_init_t * init)
dev_priv->gart_info.mapping.size =
dev_priv->gart_info.table_size;
- drm_core_ioremap(&dev_priv->gart_info.mapping, dev);
+ drm_core_ioremap_wc(&dev_priv->gart_info.mapping, dev);
dev_priv->gart_info.addr =
dev_priv->gart_info.mapping.handle;
diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c
index 781ea5950373..09a2bec7fd32 100644
--- a/drivers/infiniband/core/addr.c
+++ b/drivers/infiniband/core/addr.c
@@ -4,28 +4,33 @@
* Copyright (c) 1999-2005, Mellanox Technologies, Inc. All rights reserved.
* Copyright (c) 2005 Intel Corporation. All rights reserved.
*
- * This Software is licensed under one of the following licenses:
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
*
- * 1) under the terms of the "Common Public License 1.0" a copy of which is
- * available from the Open Source Initiative, see
- * http://www.opensource.org/licenses/cpl.php.
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
*
- * 2) under the terms of the "The BSD License" a copy of which is
- * available from the Open Source Initiative, see
- * http://www.opensource.org/licenses/bsd-license.php.
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
*
- * 3) under the terms of the "GNU General Public License (GPL) Version 2" a
- * copy of which is available from the Open Source Initiative, see
- * http://www.opensource.org/licenses/gpl-license.php.
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
*
- * Licensee has the right to choose one of the above licenses.
- *
- * Redistributions of source code must retain the above copyright
- * notice and one of the license notices.
- *
- * Redistributions in binary form must reproduce both the above copyright
- * notice, one of the license notices in the documentation
- * and/or other materials provided with the distribution.
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
*/
#include <linux/mutex.h>
@@ -100,6 +105,7 @@ int rdma_copy_addr(struct rdma_dev_addr *dev_addr, struct net_device *dev,
memcpy(dev_addr->broadcast, dev->broadcast, MAX_ADDR_LEN);
if (dst_dev_addr)
memcpy(dev_addr->dst_dev_addr, dst_dev_addr, MAX_ADDR_LEN);
+ dev_addr->src_dev = dev;
return 0;
}
EXPORT_SYMBOL(rdma_copy_addr);
diff --git a/drivers/infiniband/core/agent.h b/drivers/infiniband/core/agent.h
index fb9ed1489f95..6669287009c2 100644
--- a/drivers/infiniband/core/agent.h
+++ b/drivers/infiniband/core/agent.h
@@ -32,8 +32,6 @@
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
- *
- * $Id: agent.h 1389 2004-12-27 22:56:47Z roland $
*/
#ifndef __AGENT_H_
diff --git a/drivers/infiniband/core/cache.c b/drivers/infiniband/core/cache.c
index e85f7013de57..68883565b725 100644
--- a/drivers/infiniband/core/cache.c
+++ b/drivers/infiniband/core/cache.c
@@ -31,8 +31,6 @@
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
- *
- * $Id: cache.c 1349 2004-12-16 21:09:43Z roland $
*/
#include <linux/module.h>
diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c
index a47fe64e5c39..55738eead3bf 100644
--- a/drivers/infiniband/core/cm.c
+++ b/drivers/infiniband/core/cm.c
@@ -31,8 +31,6 @@
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
- *
- * $Id: cm.c 4311 2005-12-05 18:42:01Z sean.hefty $
*/
#include <linux/completion.h>
diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c
index 671f13738054..ae11d5cc74d0 100644
--- a/drivers/infiniband/core/cma.c
+++ b/drivers/infiniband/core/cma.c
@@ -4,29 +4,33 @@
* Copyright (c) 1999-2005, Mellanox Technologies, Inc. All rights reserved.
* Copyright (c) 2005-2006 Intel Corporation. All rights reserved.
*
- * This Software is licensed under one of the following licenses:
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
*
- * 1) under the terms of the "Common Public License 1.0" a copy of which is
- * available from the Open Source Initiative, see
- * http://www.opensource.org/licenses/cpl.php.
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
*
- * 2) under the terms of the "The BSD License" a copy of which is
- * available from the Open Source Initiative, see
- * http://www.opensource.org/licenses/bsd-license.php.
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
*
- * 3) under the terms of the "GNU General Public License (GPL) Version 2" a
- * copy of which is available from the Open Source Initiative, see
- * http://www.opensource.org/licenses/gpl-license.php.
- *
- * Licensee has the right to choose one of the above licenses.
- *
- * Redistributions of source code must retain the above copyright
- * notice and one of the license notices.
- *
- * Redistributions in binary form must reproduce both the above copyright
- * notice, one of the license notices in the documentation
- * and/or other materials provided with the distribution.
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
*
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
*/
#include <linux/completion.h>
@@ -126,8 +130,7 @@ struct rdma_id_private {
struct completion comp;
atomic_t refcount;
- wait_queue_head_t wait_remove;
- atomic_t dev_remove;
+ struct mutex handler_mutex;
int backlog;
int timeout_ms;
@@ -351,26 +354,15 @@ static void cma_deref_id(struct rdma_id_private *id_priv)
complete(&id_priv->comp);
}
-static int cma_disable_remove(struct rdma_id_private *id_priv,
+static int cma_disable_callback(struct rdma_id_private *id_priv,
enum cma_state state)
{
- unsigned long flags;
- int ret;
-
- spin_lock_irqsave(&id_priv->lock, flags);
- if (id_priv->state == state) {
- atomic_inc(&id_priv->dev_remove);
- ret = 0;
- } else
- ret = -EINVAL;
- spin_unlock_irqrestore(&id_priv->lock, flags);
- return ret;
-}
-
-static void cma_enable_remove(struct rdma_id_private *id_priv)
-{
- if (atomic_dec_and_test(&id_priv->dev_remove))
- wake_up(&id_priv->wait_remove);
+ mutex_lock(&id_priv->handler_mutex);
+ if (id_priv->state != state) {
+ mutex_unlock(&id_priv->handler_mutex);
+ return -EINVAL;
+ }
+ return 0;
}
static int cma_has_cm_dev(struct rdma_id_private *id_priv)
@@ -395,8 +387,7 @@ struct rdma_cm_id *rdma_create_id(rdma_cm_event_handler event_handler,
mutex_init(&id_priv->qp_mutex);
init_completion(&id_priv->comp);
atomic_set(&id_priv->refcount, 1);
- init_waitqueue_head(&id_priv->wait_remove);
- atomic_set(&id_priv->dev_remove, 0);
+ mutex_init(&id_priv->handler_mutex);
INIT_LIST_HEAD(&id_priv->listen_list);
INIT_LIST_HEAD(&id_priv->mc_list);
get_random_bytes(&id_priv->seq_num, sizeof id_priv->seq_num);
@@ -923,7 +914,7 @@ static int cma_ib_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event)
struct rdma_cm_event event;
int ret = 0;
- if (cma_disable_remove(id_priv, CMA_CONNECT))
+ if (cma_disable_callback(id_priv, CMA_CONNECT))
return 0;
memset(&event, 0, sizeof event);
@@ -970,7 +961,7 @@ static int cma_ib_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event)
event.param.conn.private_data_len = IB_CM_REJ_PRIVATE_DATA_SIZE;
break;
default:
- printk(KERN_ERR "RDMA CMA: unexpected IB CM event: %d",
+ printk(KERN_ERR "RDMA CMA: unexpected IB CM event: %d\n",
ib_event->event);
goto out;
}
@@ -980,12 +971,12 @@ static int cma_ib_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event)
/* Destroy the CM ID by returning a non-zero value. */
id_priv->cm_id.ib = NULL;
cma_exch(id_priv, CMA_DESTROYING);
- cma_enable_remove(id_priv);
+ mutex_unlock(&id_priv->handler_mutex);
rdma_destroy_id(&id_priv->id);
return ret;
}
out:
- cma_enable_remove(id_priv);
+ mutex_unlock(&id_priv->handler_mutex);
return ret;
}
@@ -998,6 +989,7 @@ static struct rdma_id_private *cma_new_conn_id(struct rdma_cm_id *listen_id,
union cma_ip_addr *src, *dst;
__be16 port;
u8 ip_ver;
+ int ret;
if (cma_get_net_info(ib_event->private_data, listen_id->ps,
&ip_ver, &port, &src, &dst))
@@ -1022,10 +1014,11 @@ static struct rdma_id_private *cma_new_conn_id(struct rdma_cm_id *listen_id,
if (rt->num_paths == 2)
rt->path_rec[1] = *ib_event->param.req_rcvd.alternate_path;
- ib_addr_set_sgid(&rt->addr.dev_addr, &rt->path_rec[0].sgid);
ib_addr_set_dgid(&rt->addr.dev_addr, &rt->path_rec[0].dgid);
- ib_addr_set_pkey(&rt->addr.dev_addr, be16_to_cpu(rt->path_rec[0].pkey));
- rt->addr.dev_addr.dev_type = RDMA_NODE_IB_CA;
+ ret = rdma_translate_ip(&id->route.addr.src_addr,
+ &id->route.addr.dev_addr);
+ if (ret)
+ goto destroy_id;
id_priv = container_of(id, struct rdma_id_private, id);
id_priv->state = CMA_CONNECT;
@@ -1095,7 +1088,7 @@ static int cma_req_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event)
int offset, ret;
listen_id = cm_id->context;
- if (cma_disable_remove(listen_id, CMA_LISTEN))
+ if (cma_disable_callback(listen_id, CMA_LISTEN))
return -ECONNABORTED;
memset(&event, 0, sizeof event);
@@ -1116,7 +1109,7 @@ static int cma_req_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event)
goto out;
}
- atomic_inc(&conn_id->dev_remove);
+ mutex_lock_nested(&conn_id->handler_mutex, SINGLE_DEPTH_NESTING);
mutex_lock(&lock);
ret = cma_acquire_dev(conn_id);
mutex_unlock(&lock);
@@ -1138,7 +1131,7 @@ static int cma_req_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event)
!cma_is_ud_ps(conn_id->id.ps))
ib_send_cm_mra(cm_id, CMA_CM_MRA_SETTING, NULL, 0);
mutex_unlock(&lock);
- cma_enable_remove(conn_id);
+ mutex_unlock(&conn_id->handler_mutex);
goto out;
}
@@ -1147,11 +1140,11 @@ static int cma_req_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event)
release_conn_id:
cma_exch(conn_id, CMA_DESTROYING);
- cma_enable_remove(conn_id);
+ mutex_unlock(&conn_id->handler_mutex);
rdma_destroy_id(&conn_id->id);
out:
- cma_enable_remove(listen_id);
+ mutex_unlock(&listen_id->handler_mutex);
return ret;
}
@@ -1217,7 +1210,7 @@ static int cma_iw_handler(struct iw_cm_id *iw_id, struct iw_cm_event *iw_event)
struct sockaddr_in *sin;
int ret = 0;
- if (cma_disable_remove(id_priv, CMA_CONNECT))
+ if (cma_disable_callback(id_priv, CMA_CONNECT))
return 0;
memset(&event, 0, sizeof event);
@@ -1261,12 +1254,12 @@ static int cma_iw_handler(struct iw_cm_id *iw_id, struct iw_cm_event *iw_event)
/* Destroy the CM ID by returning a non-zero value. */
id_priv->cm_id.iw = NULL;
cma_exch(id_priv, CMA_DESTROYING);
- cma_enable_remove(id_priv);
+ mutex_unlock(&id_priv->handler_mutex);
rdma_destroy_id(&id_priv->id);
return ret;
}
- cma_enable_remove(id_priv);
+ mutex_unlock(&id_priv->handler_mutex);
return ret;
}
@@ -1282,7 +1275,7 @@ static int iw_conn_req_handler(struct iw_cm_id *cm_id,
struct ib_device_attr attr;
listen_id = cm_id->context;
- if (cma_disable_remove(listen_id, CMA_LISTEN))
+ if (cma_disable_callback(listen_id, CMA_LISTEN))
return -ECONNABORTED;
/* Create a new RDMA id for the new IW CM ID */
@@ -1294,19 +1287,19 @@ static int iw_conn_req_handler(struct iw_cm_id *cm_id,
goto out;
}
conn_id = container_of(new_cm_id, struct rdma_id_private, id);
- atomic_inc(&conn_id->dev_remove);
+ mutex_lock_nested(&conn_id->handler_mutex, SINGLE_DEPTH_NESTING);
conn_id->state = CMA_CONNECT;
dev = ip_dev_find(&init_net, iw_event->local_addr.sin_addr.s_addr);
if (!dev) {
ret = -EADDRNOTAVAIL;
- cma_enable_remove(conn_id);
+ mutex_unlock(&conn_id->handler_mutex);
rdma_destroy_id(new_cm_id);
goto out;
}
ret = rdma_copy_addr(&conn_id->id.route.addr.dev_addr, dev, NULL);
if (ret) {
- cma_enable_remove(conn_id);
+ mutex_unlock(&conn_id->handler_mutex);
rdma_destroy_id(new_cm_id);
goto out;
}
@@ -1315,7 +1308,7 @@ static int iw_conn_req_handler(struct iw_cm_id *cm_id,
ret = cma_acquire_dev(conn_id);
mutex_unlock(&lock);
if (ret) {
- cma_enable_remove(conn_id);
+ mutex_unlock(&conn_id->handler_mutex);
rdma_destroy_id(new_cm_id);
goto out;
}
@@ -1331,7 +1324,7 @@ static int iw_conn_req_handler(struct iw_cm_id *cm_id,
ret = ib_query_device(conn_id->id.device, &attr);
if (ret) {
- cma_enable_remove(conn_id);
+ mutex_unlock(&conn_id->handler_mutex);
rdma_destroy_id(new_cm_id);
goto out;
}
@@ -1347,14 +1340,17 @@ static int iw_conn_req_handler(struct iw_cm_id *cm_id,
/* User wants to destroy the CM ID */
conn_id->cm_id.iw = NULL;
cma_exch(conn_id, CMA_DESTROYING);
- cma_enable_remove(conn_id);
+ mutex_unlock(&conn_id->handler_mutex);
rdma_destroy_id(&conn_id->id);
+ goto out;
}
+ mutex_unlock(&conn_id->handler_mutex);
+
out:
if (dev)
dev_put(dev);
- cma_enable_remove(listen_id);
+ mutex_unlock(&listen_id->handler_mutex);
return ret;
}
@@ -1446,7 +1442,7 @@ static void cma_listen_on_dev(struct rdma_id_private *id_priv,
ret = rdma_listen(id, id_priv->backlog);
if (ret)
printk(KERN_WARNING "RDMA CMA: cma_listen_on_dev, error %d, "
- "listening on device %s", ret, cma_dev->device->name);
+ "listening on device %s\n", ret, cma_dev->device->name);
}
static void cma_listen_on_all(struct rdma_id_private *id_priv)
@@ -1586,7 +1582,7 @@ static void cma_work_handler(struct work_struct *_work)
struct rdma_id_private *id_priv = work->id;
int destroy = 0;
- atomic_inc(&id_priv->dev_remove);
+ mutex_lock(&id_priv->handler_mutex);
if (!cma_comp_exch(id_priv, work->old_state, work->new_state))
goto out;
@@ -1595,7 +1591,7 @@ static void cma_work_handler(struct work_struct *_work)
destroy = 1;
}
out:
- cma_enable_remove(id_priv);
+ mutex_unlock(&id_priv->handler_mutex);
cma_deref_id(id_priv);
if (destroy)
rdma_destroy_id(&id_priv->id);
@@ -1758,7 +1754,7 @@ static void addr_handler(int status, struct sockaddr *src_addr,
struct rdma_cm_event event;
memset(&event, 0, sizeof event);
- atomic_inc(&id_priv->dev_remove);
+ mutex_lock(&id_priv->handler_mutex);
/*
* Grab mutex to block rdma_destroy_id() from removing the device while
@@ -1787,13 +1783,13 @@ static void addr_handler(int status, struct sockaddr *src_addr,
if (id_priv->id.event_handler(&id_priv->id, &event)) {
cma_exch(id_priv, CMA_DESTROYING);
- cma_enable_remove(id_priv);
+ mutex_unlock(&id_priv->handler_mutex);
cma_deref_id(id_priv);
rdma_destroy_id(&id_priv->id);
return;
}
out:
- cma_enable_remove(id_priv);
+ mutex_unlock(&id_priv->handler_mutex);
cma_deref_id(id_priv);
}
@@ -2120,7 +2116,7 @@ static int cma_sidr_rep_handler(struct ib_cm_id *cm_id,
struct ib_cm_sidr_rep_event_param *rep = &ib_event->param.sidr_rep_rcvd;
int ret = 0;
- if (cma_disable_remove(id_priv, CMA_CONNECT))
+ if (cma_disable_callback(id_priv, CMA_CONNECT))
return 0;
memset(&event, 0, sizeof event);
@@ -2151,7 +2147,7 @@ static int cma_sidr_rep_handler(struct ib_cm_id *cm_id,
event.status = 0;
break;
default:
- printk(KERN_ERR "RDMA CMA: unexpected IB CM event: %d",
+ printk(KERN_ERR "RDMA CMA: unexpected IB CM event: %d\n",
ib_event->event);
goto out;
}
@@ -2161,12 +2157,12 @@ static int cma_sidr_rep_handler(struct ib_cm_id *cm_id,
/* Destroy the CM ID by returning a non-zero value. */
id_priv->cm_id.ib = NULL;
cma_exch(id_priv, CMA_DESTROYING);
- cma_enable_remove(id_priv);
+ mutex_unlock(&id_priv->handler_mutex);
rdma_destroy_id(&id_priv->id);
return ret;
}
out:
- cma_enable_remove(id_priv);
+ mutex_unlock(&id_priv->handler_mutex);
return ret;
}
@@ -2564,8 +2560,8 @@ static int cma_ib_mc_handler(int status, struct ib_sa_multicast *multicast)
int ret;
id_priv = mc->id_priv;
- if (cma_disable_remove(id_priv, CMA_ADDR_BOUND) &&
- cma_disable_remove(id_priv, CMA_ADDR_RESOLVED))
+ if (cma_disable_callback(id_priv, CMA_ADDR_BOUND) &&
+ cma_disable_callback(id_priv, CMA_ADDR_RESOLVED))
return 0;
mutex_lock(&id_priv->qp_mutex);
@@ -2590,12 +2586,12 @@ static int cma_ib_mc_handler(int status, struct ib_sa_multicast *multicast)
ret = id_priv->id.event_handler(&id_priv->id, &event);
if (ret) {
cma_exch(id_priv, CMA_DESTROYING);
- cma_enable_remove(id_priv);
+ mutex_unlock(&id_priv->handler_mutex);
rdma_destroy_id(&id_priv->id);
return 0;
}
- cma_enable_remove(id_priv);
+ mutex_unlock(&id_priv->handler_mutex);
return 0;
}
@@ -2754,6 +2750,7 @@ static int cma_remove_id_dev(struct rdma_id_private *id_priv)
{
struct rdma_cm_event event;
enum cma_state state;
+ int ret = 0;
/* Record that we want to remove the device */
state = cma_exch(id_priv, CMA_DEVICE_REMOVAL);
@@ -2761,15 +2758,18 @@ static int cma_remove_id_dev(struct rdma_id_private *id_priv)
return 0;
cma_cancel_operation(id_priv, state);
- wait_event(id_priv->wait_remove, !atomic_read(&id_priv->dev_remove));
+ mutex_lock(&id_priv->handler_mutex);
/* Check for destruction from another callback. */
if (!cma_comp(id_priv, CMA_DEVICE_REMOVAL))
- return 0;
+ goto out;
memset(&event, 0, sizeof event);
event.event = RDMA_CM_EVENT_DEVICE_REMOVAL;
- return id_priv->id.event_handler(&id_priv->id, &event);
+ ret = id_priv->id.event_handler(&id_priv->id, &event);
+out:
+ mutex_unlock(&id_priv->handler_mutex);
+ return ret;
}
static void cma_process_remove(struct cma_device *cma_dev)
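The cma.c changes above replace the dev_remove atomic counter and wait_remove wait queue with a handler_mutex that stays held for the duration of each event callback, so device removal simply blocks on the same mutex. A rough userspace pthread analogue of that pattern (names and state values are illustrative, not part of the kernel API):

/* Rough userspace analogue of the handler_mutex pattern introduced above:
 * event callbacks run with the mutex held, and teardown serializes against
 * them by taking the same mutex instead of waiting on a counter. */
#include <pthread.h>
#include <errno.h>

enum id_state { ID_CONNECT, ID_LISTEN, ID_DESTROYING };

struct id_priv {
	pthread_mutex_t handler_mutex;
	enum id_state state;
};

/* Mirror of cma_disable_callback(): return 0 with the mutex held if the
 * state still matches, otherwise drop the lock and reject the callback. */
static int disable_callback(struct id_priv *id, enum id_state expected)
{
	pthread_mutex_lock(&id->handler_mutex);
	if (id->state != expected) {
		pthread_mutex_unlock(&id->handler_mutex);
		return -EINVAL;
	}
	return 0;
}

static void event_callback(struct id_priv *id)
{
	if (disable_callback(id, ID_CONNECT))
		return;
	/* ... deliver the event to the consumer here ... */
	pthread_mutex_unlock(&id->handler_mutex);
}

/* Device removal: taking handler_mutex blocks until any in-flight callback
 * has released it, replacing wait_event() on the old dev_remove counter. */
static void remove_id(struct id_priv *id)
{
	pthread_mutex_lock(&id->handler_mutex);
	id->state = ID_DESTROYING;
	pthread_mutex_unlock(&id->handler_mutex);
}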
diff --git a/drivers/infiniband/core/core_priv.h b/drivers/infiniband/core/core_priv.h
index 7ad47a4b166b..05ac36e6acdb 100644
--- a/drivers/infiniband/core/core_priv.h
+++ b/drivers/infiniband/core/core_priv.h
@@ -28,8 +28,6 @@
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
- *
- * $Id: core_priv.h 1349 2004-12-16 21:09:43Z roland $
*/
#ifndef _CORE_PRIV_H
diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
index 5ac5ffee05cb..7913b804311e 100644
--- a/drivers/infiniband/core/device.c
+++ b/drivers/infiniband/core/device.c
@@ -29,8 +29,6 @@
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
- *
- * $Id: device.c 1349 2004-12-16 21:09:43Z roland $
*/
#include <linux/module.h>
diff --git a/drivers/infiniband/core/fmr_pool.c b/drivers/infiniband/core/fmr_pool.c
index 1286dc1b98b2..4507043d24c8 100644
--- a/drivers/infiniband/core/fmr_pool.c
+++ b/drivers/infiniband/core/fmr_pool.c
@@ -29,8 +29,6 @@
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
- *
- * $Id: fmr_pool.c 2730 2005-06-28 16:43:03Z sean.hefty $
*/
#include <linux/errno.h>
diff --git a/drivers/infiniband/core/mad_priv.h b/drivers/infiniband/core/mad_priv.h
index 8b75010016ec..05ce331733b0 100644
--- a/drivers/infiniband/core/mad_priv.h
+++ b/drivers/infiniband/core/mad_priv.h
@@ -30,8 +30,6 @@
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
- *
- * $Id: mad_priv.h 5596 2006-03-03 01:00:07Z sean.hefty $
*/
#ifndef __IB_MAD_PRIV_H__
diff --git a/drivers/infiniband/core/mad_rmpp.c b/drivers/infiniband/core/mad_rmpp.c
index a5e2a310f312..d0ef7d61c037 100644
--- a/drivers/infiniband/core/mad_rmpp.c
+++ b/drivers/infiniband/core/mad_rmpp.c
@@ -29,8 +29,6 @@
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
- *
- * $Id: mad_rmpp.c 1921 2005-03-02 22:58:44Z sean.hefty $
*/
#include "mad_priv.h"
diff --git a/drivers/infiniband/core/mad_rmpp.h b/drivers/infiniband/core/mad_rmpp.h
index f0616fd22494..3d336bff1148 100644
--- a/drivers/infiniband/core/mad_rmpp.h
+++ b/drivers/infiniband/core/mad_rmpp.h
@@ -28,8 +28,6 @@
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
- *
- * $Id: mad_rmpp.h 1921 2005-02-25 22:58:44Z sean.hefty $
*/
#ifndef __MAD_RMPP_H__
diff --git a/drivers/infiniband/core/packer.c b/drivers/infiniband/core/packer.c
index c972d7235764..019bd4b0863e 100644
--- a/drivers/infiniband/core/packer.c
+++ b/drivers/infiniband/core/packer.c
@@ -29,8 +29,6 @@
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
- *
- * $Id: packer.c 1349 2004-12-16 21:09:43Z roland $
*/
#include <linux/string.h>
diff --git a/drivers/infiniband/core/sa_query.c b/drivers/infiniband/core/sa_query.c
index cf474ec27070..1341de793e51 100644
--- a/drivers/infiniband/core/sa_query.c
+++ b/drivers/infiniband/core/sa_query.c
@@ -30,8 +30,6 @@
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
- *
- * $Id: sa_query.c 2811 2005-07-06 18:11:43Z halr $
*/
#include <linux/module.h>
@@ -361,7 +359,7 @@ static void update_sm_ah(struct work_struct *work)
{
struct ib_sa_port *port =
container_of(work, struct ib_sa_port, update_task);
- struct ib_sa_sm_ah *new_ah, *old_ah;
+ struct ib_sa_sm_ah *new_ah;
struct ib_port_attr port_attr;
struct ib_ah_attr ah_attr;
@@ -397,12 +395,9 @@ static void update_sm_ah(struct work_struct *work)
}
spin_lock_irq(&port->ah_lock);
- old_ah = port->sm_ah;
port->sm_ah = new_ah;
spin_unlock_irq(&port->ah_lock);
- if (old_ah)
- kref_put(&old_ah->ref, free_sm_ah);
}
static void ib_sa_event(struct ib_event_handler *handler, struct ib_event *event)
@@ -413,8 +408,17 @@ static void ib_sa_event(struct ib_event_handler *handler, struct ib_event *event
event->event == IB_EVENT_PKEY_CHANGE ||
event->event == IB_EVENT_SM_CHANGE ||
event->event == IB_EVENT_CLIENT_REREGISTER) {
- struct ib_sa_device *sa_dev;
- sa_dev = container_of(handler, typeof(*sa_dev), event_handler);
+ unsigned long flags;
+ struct ib_sa_device *sa_dev =
+ container_of(handler, typeof(*sa_dev), event_handler);
+ struct ib_sa_port *port =
+ &sa_dev->port[event->element.port_num - sa_dev->start_port];
+
+ spin_lock_irqsave(&port->ah_lock, flags);
+ if (port->sm_ah)
+ kref_put(&port->sm_ah->ref, free_sm_ah);
+ port->sm_ah = NULL;
+ spin_unlock_irqrestore(&port->ah_lock, flags);
schedule_work(&sa_dev->port[event->element.port_num -
sa_dev->start_port].update_task);
@@ -519,6 +523,10 @@ static int alloc_mad(struct ib_sa_query *query, gfp_t gfp_mask)
unsigned long flags;
spin_lock_irqsave(&query->port->ah_lock, flags);
+ if (!query->port->sm_ah) {
+ spin_unlock_irqrestore(&query->port->ah_lock, flags);
+ return -EAGAIN;
+ }
kref_get(&query->port->sm_ah->ref);
query->sm_ah = query->port->sm_ah;
spin_unlock_irqrestore(&query->port->ah_lock, flags);
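The sa_query.c change above drops the cached SM address handle under the port lock when a port event invalidates it, and alloc_mad() now returns -EAGAIN when no handle is cached. A small userspace analogue of that invalidate-and-retry pattern, with a plain integer standing in for the kernel's kref:

/* Userspace analogue of the sm_ah handling above: the cached object is
 * dropped and NULLed under the lock when an event invalidates it, and
 * consumers that find nothing cached bail out with -EAGAIN instead of
 * dereferencing a stale pointer. */
#include <pthread.h>
#include <stdlib.h>
#include <errno.h>

struct sm_ah {
	int refcount;		/* stand-in for struct kref */
};

static pthread_mutex_t ah_lock = PTHREAD_MUTEX_INITIALIZER;
static struct sm_ah *cached_ah;

static void put_ah(struct sm_ah *ah)
{
	if (--ah->refcount == 0)
		free(ah);
}

/* Port event (LID/SM change): invalidate the cache; the update work will
 * later install a fresh entry. */
static void invalidate_ah(void)
{
	pthread_mutex_lock(&ah_lock);
	if (cached_ah)
		put_ah(cached_ah);
	cached_ah = NULL;
	pthread_mutex_unlock(&ah_lock);
}

/* Mirror of alloc_mad(): take a reference if an AH is cached, otherwise
 * report -EAGAIN so the caller retries after the cache is refreshed. */
static int get_ah(struct sm_ah **out)
{
	pthread_mutex_lock(&ah_lock);
	if (!cached_ah) {
		pthread_mutex_unlock(&ah_lock);
		return -EAGAIN;
	}
	cached_ah->refcount++;
	*out = cached_ah;
	pthread_mutex_unlock(&ah_lock);
	return 0;
}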
diff --git a/drivers/infiniband/core/sysfs.c b/drivers/infiniband/core/sysfs.c
index 95756551cf7c..4d1042115598 100644
--- a/drivers/infiniband/core/sysfs.c
+++ b/drivers/infiniband/core/sysfs.c
@@ -30,8 +30,6 @@
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
- *
- * $Id: sysfs.c 1349 2004-12-16 21:09:43Z roland $
*/
#include "core_priv.h"
@@ -665,6 +663,120 @@ static struct class ib_class = {
.dev_uevent = ib_device_uevent,
};
+/* Show a given attribute in the statistics group */
+static ssize_t show_protocol_stat(const struct device *device,
+ struct device_attribute *attr, char *buf,
+ unsigned offset)
+{
+ struct ib_device *dev = container_of(device, struct ib_device, dev);
+ union rdma_protocol_stats stats;
+ ssize_t ret;
+
+ ret = dev->get_protocol_stats(dev, &stats);
+ if (ret)
+ return ret;
+
+ return sprintf(buf, "%llu\n",
+ (unsigned long long) ((u64 *) &stats)[offset]);
+}
+
+/* generate a read-only iwarp statistics attribute */
+#define IW_STATS_ENTRY(name) \
+static ssize_t show_##name(struct device *device, \
+ struct device_attribute *attr, char *buf) \
+{ \
+ return show_protocol_stat(device, attr, buf, \
+ offsetof(struct iw_protocol_stats, name) / \
+ sizeof (u64)); \
+} \
+static DEVICE_ATTR(name, S_IRUGO, show_##name, NULL)
+
+IW_STATS_ENTRY(ipInReceives);
+IW_STATS_ENTRY(ipInHdrErrors);
+IW_STATS_ENTRY(ipInTooBigErrors);
+IW_STATS_ENTRY(ipInNoRoutes);
+IW_STATS_ENTRY(ipInAddrErrors);
+IW_STATS_ENTRY(ipInUnknownProtos);
+IW_STATS_ENTRY(ipInTruncatedPkts);
+IW_STATS_ENTRY(ipInDiscards);
+IW_STATS_ENTRY(ipInDelivers);
+IW_STATS_ENTRY(ipOutForwDatagrams);
+IW_STATS_ENTRY(ipOutRequests);
+IW_STATS_ENTRY(ipOutDiscards);
+IW_STATS_ENTRY(ipOutNoRoutes);
+IW_STATS_ENTRY(ipReasmTimeout);
+IW_STATS_ENTRY(ipReasmReqds);
+IW_STATS_ENTRY(ipReasmOKs);
+IW_STATS_ENTRY(ipReasmFails);
+IW_STATS_ENTRY(ipFragOKs);
+IW_STATS_ENTRY(ipFragFails);
+IW_STATS_ENTRY(ipFragCreates);
+IW_STATS_ENTRY(ipInMcastPkts);
+IW_STATS_ENTRY(ipOutMcastPkts);
+IW_STATS_ENTRY(ipInBcastPkts);
+IW_STATS_ENTRY(ipOutBcastPkts);
+IW_STATS_ENTRY(tcpRtoAlgorithm);
+IW_STATS_ENTRY(tcpRtoMin);
+IW_STATS_ENTRY(tcpRtoMax);
+IW_STATS_ENTRY(tcpMaxConn);
+IW_STATS_ENTRY(tcpActiveOpens);
+IW_STATS_ENTRY(tcpPassiveOpens);
+IW_STATS_ENTRY(tcpAttemptFails);
+IW_STATS_ENTRY(tcpEstabResets);
+IW_STATS_ENTRY(tcpCurrEstab);
+IW_STATS_ENTRY(tcpInSegs);
+IW_STATS_ENTRY(tcpOutSegs);
+IW_STATS_ENTRY(tcpRetransSegs);
+IW_STATS_ENTRY(tcpInErrs);
+IW_STATS_ENTRY(tcpOutRsts);
+
+static struct attribute *iw_proto_stats_attrs[] = {
+ &dev_attr_ipInReceives.attr,
+ &dev_attr_ipInHdrErrors.attr,
+ &dev_attr_ipInTooBigErrors.attr,
+ &dev_attr_ipInNoRoutes.attr,
+ &dev_attr_ipInAddrErrors.attr,
+ &dev_attr_ipInUnknownProtos.attr,
+ &dev_attr_ipInTruncatedPkts.attr,
+ &dev_attr_ipInDiscards.attr,
+ &dev_attr_ipInDelivers.attr,
+ &dev_attr_ipOutForwDatagrams.attr,
+ &dev_attr_ipOutRequests.attr,
+ &dev_attr_ipOutDiscards.attr,
+ &dev_attr_ipOutNoRoutes.attr,
+ &dev_attr_ipReasmTimeout.attr,
+ &dev_attr_ipReasmReqds.attr,
+ &dev_attr_ipReasmOKs.attr,
+ &dev_attr_ipReasmFails.attr,
+ &dev_attr_ipFragOKs.attr,
+ &dev_attr_ipFragFails.attr,
+ &dev_attr_ipFragCreates.attr,
+ &dev_attr_ipInMcastPkts.attr,
+ &dev_attr_ipOutMcastPkts.attr,
+ &dev_attr_ipInBcastPkts.attr,
+ &dev_attr_ipOutBcastPkts.attr,
+ &dev_attr_tcpRtoAlgorithm.attr,
+ &dev_attr_tcpRtoMin.attr,
+ &dev_attr_tcpRtoMax.attr,
+ &dev_attr_tcpMaxConn.attr,
+ &dev_attr_tcpActiveOpens.attr,
+ &dev_attr_tcpPassiveOpens.attr,
+ &dev_attr_tcpAttemptFails.attr,
+ &dev_attr_tcpEstabResets.attr,
+ &dev_attr_tcpCurrEstab.attr,
+ &dev_attr_tcpInSegs.attr,
+ &dev_attr_tcpOutSegs.attr,
+ &dev_attr_tcpRetransSegs.attr,
+ &dev_attr_tcpInErrs.attr,
+ &dev_attr_tcpOutRsts.attr,
+ NULL
+};
+
+static struct attribute_group iw_stats_group = {
+ .name = "proto_stats",
+ .attrs = iw_proto_stats_attrs,
+};
+
int ib_device_register_sysfs(struct ib_device *device)
{
struct device *class_dev = &device->dev;
@@ -707,6 +819,12 @@ int ib_device_register_sysfs(struct ib_device *device)
}
}
+ if (device->node_type == RDMA_NODE_RNIC && device->get_protocol_stats) {
+ ret = sysfs_create_group(&class_dev->kobj, &iw_stats_group);
+ if (ret)
+ goto err_put;
+ }
+
return 0;
err_put:
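show_protocol_stat() above treats the stats union as a flat array of 64-bit counters and indexes it by offsetof(struct iw_protocol_stats, name) / sizeof(u64), which IW_STATS_ENTRY computes once per attribute. A self-contained userspace demo of the same indexing trick, using an illustrative struct rather than the kernel's iw_protocol_stats:

/* Demo of the indexing trick used by IW_STATS_ENTRY above:
 * offsetof(struct, field) / sizeof(u64) turns a field name into an index
 * into the stats block viewed as an array of 64-bit counters. */
#include <stdio.h>
#include <stddef.h>
#include <stdint.h>

struct demo_stats {
	uint64_t ipInReceives;
	uint64_t ipInDiscards;
	uint64_t tcpInSegs;
};

/* Equivalent of show_protocol_stat(): fetch counter number 'index'. */
static uint64_t read_stat(const struct demo_stats *stats, size_t index)
{
	return ((const uint64_t *)stats)[index];
}

int main(void)
{
	struct demo_stats stats = { 100, 2, 57 };
	size_t idx = offsetof(struct demo_stats, tcpInSegs) / sizeof(uint64_t);

	printf("tcpInSegs = %llu\n",
	       (unsigned long long)read_stat(&stats, idx));
	return 0;
}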
diff --git a/drivers/infiniband/core/ucm.c b/drivers/infiniband/core/ucm.c
index b25675faaaf5..9494005d1c9a 100644
--- a/drivers/infiniband/core/ucm.c
+++ b/drivers/infiniband/core/ucm.c
@@ -29,8 +29,6 @@
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
- *
- * $Id: ucm.c 4311 2005-12-05 18:42:01Z sean.hefty $
*/
#include <linux/completion.h>
diff --git a/drivers/infiniband/core/ud_header.c b/drivers/infiniband/core/ud_header.c
index 997c07db6d8f..8ec7876bedcf 100644
--- a/drivers/infiniband/core/ud_header.c
+++ b/drivers/infiniband/core/ud_header.c
@@ -29,8 +29,6 @@
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
- *
- * $Id: ud_header.c 1349 2004-12-16 21:09:43Z roland $
*/
#include <linux/errno.h>
diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c
index a1768dbb0720..6f7c096abf13 100644
--- a/drivers/infiniband/core/umem.c
+++ b/drivers/infiniband/core/umem.c
@@ -30,8 +30,6 @@
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
- *
- * $Id: uverbs_mem.c 2743 2005-06-28 22:27:59Z roland $
*/
#include <linux/mm.h>
diff --git a/drivers/infiniband/core/user_mad.c b/drivers/infiniband/core/user_mad.c
index 208c7f34323c..268a2d23b7c9 100644
--- a/drivers/infiniband/core/user_mad.c
+++ b/drivers/infiniband/core/user_mad.c
@@ -31,8 +31,6 @@
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
- *
- * $Id: user_mad.c 5596 2006-03-03 01:00:07Z sean.hefty $
*/
#include <linux/module.h>
diff --git a/drivers/infiniband/core/uverbs.h b/drivers/infiniband/core/uverbs.h
index 376a57ce1b40..b3ea9587dc80 100644
--- a/drivers/infiniband/core/uverbs.h
+++ b/drivers/infiniband/core/uverbs.h
@@ -32,8 +32,6 @@
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
- *
- * $Id: uverbs.h 2559 2005-06-06 19:43:16Z roland $
*/
#ifndef UVERBS_H
diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c
index 2c3bff5fe867..56feab6c251e 100644
--- a/drivers/infiniband/core/uverbs_cmd.c
+++ b/drivers/infiniband/core/uverbs_cmd.c
@@ -31,8 +31,6 @@
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
- *
- * $Id: uverbs_cmd.c 2708 2005-06-24 17:27:21Z roland $
*/
#include <linux/file.h>
@@ -919,7 +917,7 @@ ssize_t ib_uverbs_poll_cq(struct ib_uverbs_file *file,
resp->wc[i].opcode = wc[i].opcode;
resp->wc[i].vendor_err = wc[i].vendor_err;
resp->wc[i].byte_len = wc[i].byte_len;
- resp->wc[i].imm_data = (__u32 __force) wc[i].imm_data;
+ resp->wc[i].ex.imm_data = (__u32 __force) wc[i].ex.imm_data;
resp->wc[i].qp_num = wc[i].qp->qp_num;
resp->wc[i].src_qp = wc[i].src_qp;
resp->wc[i].wc_flags = wc[i].wc_flags;
diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c
index 0f34858e31e7..aeee856c4060 100644
--- a/drivers/infiniband/core/uverbs_main.c
+++ b/drivers/infiniband/core/uverbs_main.c
@@ -32,8 +32,6 @@
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
- *
- * $Id: uverbs_main.c 2733 2005-06-28 19:14:34Z roland $
*/
#include <linux/module.h>
diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c
index 05042089de6e..a7da9be43e61 100644
--- a/drivers/infiniband/core/verbs.c
+++ b/drivers/infiniband/core/verbs.c
@@ -34,8 +34,6 @@
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
- *
- * $Id: verbs.c 1349 2004-12-16 21:09:43Z roland $
*/
#include <linux/errno.h>
@@ -317,7 +315,6 @@ static const struct {
} qp_state_table[IB_QPS_ERR + 1][IB_QPS_ERR + 1] = {
[IB_QPS_RESET] = {
[IB_QPS_RESET] = { .valid = 1 },
- [IB_QPS_ERR] = { .valid = 1 },
[IB_QPS_INIT] = {
.valid = 1,
.req_param = {
@@ -755,6 +752,52 @@ int ib_dereg_mr(struct ib_mr *mr)
}
EXPORT_SYMBOL(ib_dereg_mr);
+struct ib_mr *ib_alloc_fast_reg_mr(struct ib_pd *pd, int max_page_list_len)
+{
+ struct ib_mr *mr;
+
+ if (!pd->device->alloc_fast_reg_mr)
+ return ERR_PTR(-ENOSYS);
+
+ mr = pd->device->alloc_fast_reg_mr(pd, max_page_list_len);
+
+ if (!IS_ERR(mr)) {
+ mr->device = pd->device;
+ mr->pd = pd;
+ mr->uobject = NULL;
+ atomic_inc(&pd->usecnt);
+ atomic_set(&mr->usecnt, 0);
+ }
+
+ return mr;
+}
+EXPORT_SYMBOL(ib_alloc_fast_reg_mr);
+
+struct ib_fast_reg_page_list *ib_alloc_fast_reg_page_list(struct ib_device *device,
+ int max_page_list_len)
+{
+ struct ib_fast_reg_page_list *page_list;
+
+ if (!device->alloc_fast_reg_page_list)
+ return ERR_PTR(-ENOSYS);
+
+ page_list = device->alloc_fast_reg_page_list(device, max_page_list_len);
+
+ if (!IS_ERR(page_list)) {
+ page_list->device = device;
+ page_list->max_page_list_len = max_page_list_len;
+ }
+
+ return page_list;
+}
+EXPORT_SYMBOL(ib_alloc_fast_reg_page_list);
+
+void ib_free_fast_reg_page_list(struct ib_fast_reg_page_list *page_list)
+{
+ page_list->device->free_fast_reg_page_list(page_list);
+}
+EXPORT_SYMBOL(ib_free_fast_reg_page_list);
+
/* Memory windows */
struct ib_mw *ib_alloc_mw(struct ib_pd *pd)
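A sketch of how a kernel consumer might pair the new ib_alloc_fast_reg_mr()/ib_alloc_fast_reg_page_list() verbs above with cleanup. It only builds inside a kernel tree, and posting the fast-register work request itself is deliberately omitted:

/* Sketch of a consumer allocating fast-registration resources with the
 * verbs added above.  Error handling follows the usual IS_ERR/PTR_ERR
 * convention; the actual fast-register work request post is left out. */
#include <linux/err.h>
#include <rdma/ib_verbs.h>

static int alloc_fastreg_resources(struct ib_pd *pd, int page_list_len,
				   struct ib_mr **mr,
				   struct ib_fast_reg_page_list **pl)
{
	*mr = ib_alloc_fast_reg_mr(pd, page_list_len);
	if (IS_ERR(*mr))
		return PTR_ERR(*mr);

	*pl = ib_alloc_fast_reg_page_list(pd->device, page_list_len);
	if (IS_ERR(*pl)) {
		ib_dereg_mr(*mr);
		return PTR_ERR(*pl);
	}
	return 0;
}

static void free_fastreg_resources(struct ib_mr *mr,
				   struct ib_fast_reg_page_list *pl)
{
	ib_free_fast_reg_page_list(pl);
	ib_dereg_mr(mr);
}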
diff --git a/drivers/infiniband/hw/amso1100/c2_rnic.c b/drivers/infiniband/hw/amso1100/c2_rnic.c
index b1441aeb60c2..dd05c4835642 100644
--- a/drivers/infiniband/hw/amso1100/c2_rnic.c
+++ b/drivers/infiniband/hw/amso1100/c2_rnic.c
@@ -454,7 +454,7 @@ int __devinit c2_rnic_init(struct c2_dev *c2dev)
(IB_DEVICE_RESIZE_MAX_WR |
IB_DEVICE_CURR_QP_STATE_MOD |
IB_DEVICE_SYS_IMAGE_GUID |
- IB_DEVICE_ZERO_STAG |
+ IB_DEVICE_LOCAL_DMA_LKEY |
IB_DEVICE_MEM_WINDOW);
/* Allocate the qptr_array */
diff --git a/drivers/infiniband/hw/cxgb3/cxio_hal.c b/drivers/infiniband/hw/cxgb3/cxio_hal.c
index 3f441fc57c17..f6d5747153a5 100644
--- a/drivers/infiniband/hw/cxgb3/cxio_hal.c
+++ b/drivers/infiniband/hw/cxgb3/cxio_hal.c
@@ -145,7 +145,9 @@ static int cxio_hal_clear_qp_ctx(struct cxio_rdev *rdev_p, u32 qpid)
}
wqe = (struct t3_modify_qp_wr *) skb_put(skb, sizeof(*wqe));
memset(wqe, 0, sizeof(*wqe));
- build_fw_riwrh((struct fw_riwrh *) wqe, T3_WR_QP_MOD, 3, 0, qpid, 7);
+ build_fw_riwrh((struct fw_riwrh *) wqe, T3_WR_QP_MOD,
+ T3_COMPLETION_FLAG | T3_NOTIFY_FLAG, 0, qpid, 7,
+ T3_SOPEOP);
wqe->flags = cpu_to_be32(MODQP_WRITE_EC);
sge_cmd = qpid << 8 | 3;
wqe->sge_cmd = cpu_to_be64(sge_cmd);
@@ -276,7 +278,7 @@ int cxio_create_qp(struct cxio_rdev *rdev_p, u32 kernel_domain,
if (!wq->qpid)
return -ENOMEM;
- wq->rq = kzalloc(depth * sizeof(u64), GFP_KERNEL);
+ wq->rq = kzalloc(depth * sizeof(struct t3_swrq), GFP_KERNEL);
if (!wq->rq)
goto err1;
@@ -300,6 +302,7 @@ int cxio_create_qp(struct cxio_rdev *rdev_p, u32 kernel_domain,
if (!kernel_domain)
wq->udb = (u64)rdev_p->rnic_info.udbell_physbase +
(wq->qpid << rdev_p->qpshift);
+ wq->rdev = rdev_p;
PDBG("%s qpid 0x%x doorbell 0x%p udb 0x%llx\n", __func__,
wq->qpid, wq->doorbell, (unsigned long long) wq->udb);
return 0;
@@ -558,7 +561,7 @@ static int cxio_hal_init_ctrl_qp(struct cxio_rdev *rdev_p)
wqe = (struct t3_modify_qp_wr *) skb_put(skb, sizeof(*wqe));
memset(wqe, 0, sizeof(*wqe));
build_fw_riwrh((struct fw_riwrh *) wqe, T3_WR_QP_MOD, 0, 0,
- T3_CTL_QP_TID, 7);
+ T3_CTL_QP_TID, 7, T3_SOPEOP);
wqe->flags = cpu_to_be32(MODQP_WRITE_EC);
sge_cmd = (3ULL << 56) | FW_RI_SGEEC_START << 8 | 3;
wqe->sge_cmd = cpu_to_be64(sge_cmd);
@@ -674,7 +677,7 @@ static int cxio_hal_ctrl_qp_write_mem(struct cxio_rdev *rdev_p, u32 addr,
build_fw_riwrh((struct fw_riwrh *) wqe, T3_WR_BP, flag,
Q_GENBIT(rdev_p->ctrl_qp.wptr,
T3_CTRL_QP_SIZE_LOG2), T3_CTRL_QP_ID,
- wr_len);
+ wr_len, T3_SOPEOP);
if (flag == T3_COMPLETION_FLAG)
ring_doorbell(rdev_p->ctrl_qp.doorbell, T3_CTRL_QP_ID);
len -= 96;
@@ -816,6 +819,13 @@ int cxio_deallocate_window(struct cxio_rdev *rdev_p, u32 stag)
0, 0);
}
+int cxio_allocate_stag(struct cxio_rdev *rdev_p, u32 *stag, u32 pdid, u32 pbl_size, u32 pbl_addr)
+{
+ *stag = T3_STAG_UNSET;
+ return __cxio_tpt_op(rdev_p, 0, stag, 0, pdid, TPT_NON_SHARED_MR,
+ 0, 0, 0ULL, 0, 0, pbl_size, pbl_addr);
+}
+
int cxio_rdma_init(struct cxio_rdev *rdev_p, struct t3_rdma_init_attr *attr)
{
struct t3_rdma_init_wr *wqe;
@@ -1257,13 +1267,16 @@ proc_cqe:
wq->sq_rptr = CQE_WRID_SQ_WPTR(*hw_cqe);
PDBG("%s completing sq idx %ld\n", __func__,
Q_PTR2IDX(wq->sq_rptr, wq->sq_size_log2));
- *cookie = (wq->sq +
- Q_PTR2IDX(wq->sq_rptr, wq->sq_size_log2))->wr_id;
+ *cookie = wq->sq[Q_PTR2IDX(wq->sq_rptr, wq->sq_size_log2)].wr_id;
wq->sq_rptr++;
} else {
PDBG("%s completing rq idx %ld\n", __func__,
Q_PTR2IDX(wq->rq_rptr, wq->rq_size_log2));
- *cookie = *(wq->rq + Q_PTR2IDX(wq->rq_rptr, wq->rq_size_log2));
+ *cookie = wq->rq[Q_PTR2IDX(wq->rq_rptr, wq->rq_size_log2)].wr_id;
+ if (wq->rq[Q_PTR2IDX(wq->rq_rptr, wq->rq_size_log2)].pbl_addr)
+ cxio_hal_pblpool_free(wq->rdev,
+ wq->rq[Q_PTR2IDX(wq->rq_rptr,
+ wq->rq_size_log2)].pbl_addr, T3_STAG0_PBL_SIZE);
wq->rq_rptr++;
}
diff --git a/drivers/infiniband/hw/cxgb3/cxio_hal.h b/drivers/infiniband/hw/cxgb3/cxio_hal.h
index 6e128f6bab05..656fe47bc84f 100644
--- a/drivers/infiniband/hw/cxgb3/cxio_hal.h
+++ b/drivers/infiniband/hw/cxgb3/cxio_hal.h
@@ -45,15 +45,17 @@
#define T3_CTRL_QP_SIZE_LOG2 8
#define T3_CTRL_CQ_ID 0
-/* TBD */
#define T3_MAX_NUM_RI (1<<15)
#define T3_MAX_NUM_QP (1<<15)
#define T3_MAX_NUM_CQ (1<<15)
#define T3_MAX_NUM_PD (1<<15)
#define T3_MAX_PBL_SIZE 256
#define T3_MAX_RQ_SIZE 1024
+#define T3_MAX_QP_DEPTH (T3_MAX_RQ_SIZE-1)
+#define T3_MAX_CQ_DEPTH 8192
#define T3_MAX_NUM_STAG (1<<15)
#define T3_MAX_MR_SIZE 0x100000000ULL
+#define T3_PAGESIZE_MASK 0xffff000 /* 4KB-128MB */
#define T3_STAG_UNSET 0xffffffff
@@ -165,6 +167,7 @@ int cxio_reregister_phys_mem(struct cxio_rdev *rdev, u32 * stag, u32 pdid,
int cxio_dereg_mem(struct cxio_rdev *rdev, u32 stag, u32 pbl_size,
u32 pbl_addr);
int cxio_allocate_window(struct cxio_rdev *rdev, u32 * stag, u32 pdid);
+int cxio_allocate_stag(struct cxio_rdev *rdev, u32 *stag, u32 pdid, u32 pbl_size, u32 pbl_addr);
int cxio_deallocate_window(struct cxio_rdev *rdev, u32 stag);
int cxio_rdma_init(struct cxio_rdev *rdev, struct t3_rdma_init_attr *attr);
void cxio_register_ev_cb(cxio_hal_ev_callback_func_t ev_cb);
diff --git a/drivers/infiniband/hw/cxgb3/cxio_wr.h b/drivers/infiniband/hw/cxgb3/cxio_wr.h
index f1a25a821a45..04618f7bfbb3 100644
--- a/drivers/infiniband/hw/cxgb3/cxio_wr.h
+++ b/drivers/infiniband/hw/cxgb3/cxio_wr.h
@@ -39,6 +39,9 @@
#define T3_MAX_SGE 4
#define T3_MAX_INLINE 64
+#define T3_STAG0_PBL_SIZE (2 * T3_MAX_SGE << 3)
+#define T3_STAG0_MAX_PBE_LEN (128 * 1024 * 1024)
+#define T3_STAG0_PAGE_SHIFT 15
#define Q_EMPTY(rptr,wptr) ((rptr)==(wptr))
#define Q_FULL(rptr,wptr,size_log2) ( (((wptr)-(rptr))>>(size_log2)) && \
@@ -72,7 +75,8 @@ enum t3_wr_opcode {
T3_WR_BIND = FW_WROPCODE_RI_BIND_MW,
T3_WR_RCV = FW_WROPCODE_RI_RECEIVE,
T3_WR_INIT = FW_WROPCODE_RI_RDMA_INIT,
- T3_WR_QP_MOD = FW_WROPCODE_RI_MODIFY_QP
+ T3_WR_QP_MOD = FW_WROPCODE_RI_MODIFY_QP,
+ T3_WR_FASTREG = FW_WROPCODE_RI_FASTREGISTER_MR
} __attribute__ ((packed));
enum t3_rdma_opcode {
@@ -89,7 +93,8 @@ enum t3_rdma_opcode {
T3_FAST_REGISTER,
T3_LOCAL_INV,
T3_QP_MOD,
- T3_BYPASS
+ T3_BYPASS,
+ T3_RDMA_READ_REQ_WITH_INV,
} __attribute__ ((packed));
static inline enum t3_rdma_opcode wr2opcode(enum t3_wr_opcode wrop)
@@ -103,6 +108,7 @@ static inline enum t3_rdma_opcode wr2opcode(enum t3_wr_opcode wrop)
case T3_WR_BIND: return T3_BIND_MW;
case T3_WR_INIT: return T3_RDMA_INIT;
case T3_WR_QP_MOD: return T3_QP_MOD;
+ case T3_WR_FASTREG: return T3_FAST_REGISTER;
default: break;
}
return -1;
@@ -170,11 +176,54 @@ struct t3_send_wr {
struct t3_sge sgl[T3_MAX_SGE]; /* 4+ */
};
+#define T3_MAX_FASTREG_DEPTH 24
+#define T3_MAX_FASTREG_FRAG 10
+
+struct t3_fastreg_wr {
+ struct fw_riwrh wrh; /* 0 */
+ union t3_wrid wrid; /* 1 */
+ __be32 stag; /* 2 */
+ __be32 len;
+ __be32 va_base_hi; /* 3 */
+ __be32 va_base_lo_fbo;
+ __be32 page_type_perms; /* 4 */
+ __be32 reserved1;
+ __be64 pbl_addrs[0]; /* 5+ */
+};
+
+/*
+ * If a fastreg wr spans multiple wqes, then the 2nd fragment looks like this.
+ */
+struct t3_pbl_frag {
+ struct fw_riwrh wrh; /* 0 */
+ __be64 pbl_addrs[14]; /* 1..14 */
+};
+
+#define S_FR_PAGE_COUNT 24
+#define M_FR_PAGE_COUNT 0xff
+#define V_FR_PAGE_COUNT(x) ((x) << S_FR_PAGE_COUNT)
+#define G_FR_PAGE_COUNT(x) ((((x) >> S_FR_PAGE_COUNT)) & M_FR_PAGE_COUNT)
+
+#define S_FR_PAGE_SIZE 16
+#define M_FR_PAGE_SIZE 0x1f
+#define V_FR_PAGE_SIZE(x) ((x) << S_FR_PAGE_SIZE)
+#define G_FR_PAGE_SIZE(x) ((((x) >> S_FR_PAGE_SIZE)) & M_FR_PAGE_SIZE)
+
+#define S_FR_TYPE 8
+#define M_FR_TYPE 0x1
+#define V_FR_TYPE(x) ((x) << S_FR_TYPE)
+#define G_FR_TYPE(x) ((((x) >> S_FR_TYPE)) & M_FR_TYPE)
+
+#define S_FR_PERMS 0
+#define M_FR_PERMS 0xff
+#define V_FR_PERMS(x) ((x) << S_FR_PERMS)
+#define G_FR_PERMS(x) ((((x) >> S_FR_PERMS)) & M_FR_PERMS)
+
struct t3_local_inv_wr {
struct fw_riwrh wrh; /* 0 */
union t3_wrid wrid; /* 1 */
__be32 stag; /* 2 */
- __be32 reserved3;
+ __be32 reserved;
};
struct t3_rdma_write_wr {
@@ -193,7 +242,8 @@ struct t3_rdma_read_wr {
struct fw_riwrh wrh; /* 0 */
union t3_wrid wrid; /* 1 */
u8 rdmaop; /* 2 */
- u8 reserved[3];
+ u8 local_inv;
+ u8 reserved[2];
__be32 rem_stag;
__be64 rem_to; /* 3 */
__be32 local_stag; /* 4 */
@@ -201,18 +251,6 @@ struct t3_rdma_read_wr {
__be64 local_to; /* 5 */
};
-enum t3_addr_type {
- T3_VA_BASED_TO = 0x0,
- T3_ZERO_BASED_TO = 0x1
-} __attribute__ ((packed));
-
-enum t3_mem_perms {
- T3_MEM_ACCESS_LOCAL_READ = 0x1,
- T3_MEM_ACCESS_LOCAL_WRITE = 0x2,
- T3_MEM_ACCESS_REM_READ = 0x4,
- T3_MEM_ACCESS_REM_WRITE = 0x8
-} __attribute__ ((packed));
-
struct t3_bind_mw_wr {
struct fw_riwrh wrh; /* 0 */
union t3_wrid wrid; /* 1 */
@@ -336,6 +374,11 @@ struct t3_genbit {
__be64 genbit;
};
+struct t3_wq_in_err {
+ u64 flit[13];
+ u64 err;
+};
+
enum rdma_init_wr_flags {
MPA_INITIATOR = (1<<0),
PRIV_QP = (1<<1),
@@ -346,13 +389,16 @@ union t3_wr {
struct t3_rdma_write_wr write;
struct t3_rdma_read_wr read;
struct t3_receive_wr recv;
+ struct t3_fastreg_wr fastreg;
+ struct t3_pbl_frag pbl_frag;
struct t3_local_inv_wr local_inv;
struct t3_bind_mw_wr bind;
struct t3_bypass_wr bypass;
struct t3_rdma_init_wr init;
struct t3_modify_qp_wr qp_mod;
struct t3_genbit genbit;
- u64 flit[16];
+ struct t3_wq_in_err wq_in_err;
+ __be64 flit[16];
};
#define T3_SQ_CQE_FLIT 13
@@ -366,12 +412,18 @@ static inline enum t3_wr_opcode fw_riwrh_opcode(struct fw_riwrh *wqe)
return G_FW_RIWR_OP(be32_to_cpu(wqe->op_seop_flags));
}
+enum t3_wr_hdr_bits {
+ T3_EOP = 1,
+ T3_SOP = 2,
+ T3_SOPEOP = T3_EOP|T3_SOP,
+};
+
static inline void build_fw_riwrh(struct fw_riwrh *wqe, enum t3_wr_opcode op,
enum t3_wr_flags flags, u8 genbit, u32 tid,
- u8 len)
+ u8 len, u8 sopeop)
{
wqe->op_seop_flags = cpu_to_be32(V_FW_RIWR_OP(op) |
- V_FW_RIWR_SOPEOP(M_FW_RIWR_SOPEOP) |
+ V_FW_RIWR_SOPEOP(sopeop) |
V_FW_RIWR_FLAGS(flags));
wmb();
wqe->gen_tid_len = cpu_to_be32(V_FW_RIWR_GEN(genbit) |
@@ -404,6 +456,7 @@ enum tpt_addr_type {
};
enum tpt_mem_perm {
+ TPT_MW_BIND = 0x10,
TPT_LOCAL_READ = 0x8,
TPT_LOCAL_WRITE = 0x4,
TPT_REMOTE_READ = 0x2,
@@ -615,6 +668,11 @@ struct t3_swsq {
int signaled;
};
+struct t3_swrq {
+ __u64 wr_id;
+ __u32 pbl_addr;
+};
+
/*
* A T3 WQ implements both the SQ and RQ.
*/
@@ -631,14 +689,15 @@ struct t3_wq {
u32 sq_wptr; /* sq_wptr - sq_rptr == count of */
u32 sq_rptr; /* pending wrs */
u32 sq_size_log2; /* sq size */
- u64 *rq; /* SW RQ (holds consumer wr_ids */
+ struct t3_swrq *rq; /* SW RQ (holds consumer wr_ids */
u32 rq_wptr; /* rq_wptr - rq_rptr == count of */
u32 rq_rptr; /* pending wrs */
- u64 *rq_oldest_wr; /* oldest wr on the SW RQ */
+ struct t3_swrq *rq_oldest_wr; /* oldest wr on the SW RQ */
u32 rq_size_log2; /* rq size */
u32 rq_addr; /* rq adapter address */
void __iomem *doorbell; /* kernel db */
u64 udb; /* user db if any */
+ struct cxio_rdev *rdev;
};
struct t3_cq {
@@ -659,7 +718,7 @@ struct t3_cq {
static inline void cxio_set_wq_in_error(struct t3_wq *wq)
{
- wq->queue->flit[13] = 1;
+ wq->queue->wq_in_err.err = 1;
}
static inline struct t3_cqe *cxio_next_hw_cqe(struct t3_cq *cq)
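How the new sopeop argument is used (an interpretation of the changes above, not text from the patch): work requests that fit in a single WQE are built with T3_SOPEOP, while a fastreg WR whose PBL spills into a second WQE marks the first WQE T3_SOP and the trailing t3_pbl_frag WQE T3_EOP; see build_fastreg() and iwch_post_send() in the iwch_qp.c hunks below.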
diff --git a/drivers/infiniband/hw/cxgb3/iwch.c b/drivers/infiniband/hw/cxgb3/iwch.c
index 71554eacb13c..4489c89d6710 100644
--- a/drivers/infiniband/hw/cxgb3/iwch.c
+++ b/drivers/infiniband/hw/cxgb3/iwch.c
@@ -71,18 +71,16 @@ static void rnic_init(struct iwch_dev *rnicp)
idr_init(&rnicp->mmidr);
spin_lock_init(&rnicp->lock);
- rnicp->attr.vendor_id = 0x168;
- rnicp->attr.vendor_part_id = 7;
rnicp->attr.max_qps = T3_MAX_NUM_QP - 32;
- rnicp->attr.max_wrs = (1UL << 24) - 1;
+ rnicp->attr.max_wrs = T3_MAX_QP_DEPTH;
rnicp->attr.max_sge_per_wr = T3_MAX_SGE;
rnicp->attr.max_sge_per_rdma_write_wr = T3_MAX_SGE;
rnicp->attr.max_cqs = T3_MAX_NUM_CQ - 1;
- rnicp->attr.max_cqes_per_cq = (1UL << 24) - 1;
+ rnicp->attr.max_cqes_per_cq = T3_MAX_CQ_DEPTH;
rnicp->attr.max_mem_regs = cxio_num_stags(&rnicp->rdev);
rnicp->attr.max_phys_buf_entries = T3_MAX_PBL_SIZE;
rnicp->attr.max_pds = T3_MAX_NUM_PD - 1;
- rnicp->attr.mem_pgsizes_bitmask = 0x7FFF; /* 4KB-128MB */
+ rnicp->attr.mem_pgsizes_bitmask = T3_PAGESIZE_MASK;
rnicp->attr.max_mr_size = T3_MAX_MR_SIZE;
rnicp->attr.can_resize_wq = 0;
rnicp->attr.max_rdma_reads_per_qp = 8;
diff --git a/drivers/infiniband/hw/cxgb3/iwch.h b/drivers/infiniband/hw/cxgb3/iwch.h
index d2409a505e8d..3773453b2cf0 100644
--- a/drivers/infiniband/hw/cxgb3/iwch.h
+++ b/drivers/infiniband/hw/cxgb3/iwch.h
@@ -48,8 +48,6 @@ struct iwch_qp;
struct iwch_mr;
struct iwch_rnic_attributes {
- u32 vendor_id;
- u32 vendor_part_id;
u32 max_qps;
u32 max_wrs; /* Max for any SQ/RQ */
u32 max_sge_per_wr;
diff --git a/drivers/infiniband/hw/cxgb3/iwch_cq.c b/drivers/infiniband/hw/cxgb3/iwch_cq.c
index 4ee8ccd0a9e5..cf5474ae68ff 100644
--- a/drivers/infiniband/hw/cxgb3/iwch_cq.c
+++ b/drivers/infiniband/hw/cxgb3/iwch_cq.c
@@ -81,6 +81,7 @@ static int iwch_poll_cq_one(struct iwch_dev *rhp, struct iwch_cq *chp,
wc->wr_id = cookie;
wc->qp = &qhp->ibqp;
wc->vendor_err = CQE_STATUS(cqe);
+ wc->wc_flags = 0;
PDBG("%s qpid 0x%x type %d opcode %d status 0x%x wrid hi 0x%x "
"lo 0x%x cookie 0x%llx\n", __func__,
@@ -94,6 +95,11 @@ static int iwch_poll_cq_one(struct iwch_dev *rhp, struct iwch_cq *chp,
else
wc->byte_len = 0;
wc->opcode = IB_WC_RECV;
+ if (CQE_OPCODE(cqe) == T3_SEND_WITH_INV ||
+ CQE_OPCODE(cqe) == T3_SEND_WITH_SE_INV) {
+ wc->ex.invalidate_rkey = CQE_WRID_STAG(cqe);
+ wc->wc_flags |= IB_WC_WITH_INVALIDATE;
+ }
} else {
switch (CQE_OPCODE(cqe)) {
case T3_RDMA_WRITE:
@@ -105,17 +111,20 @@ static int iwch_poll_cq_one(struct iwch_dev *rhp, struct iwch_cq *chp,
break;
case T3_SEND:
case T3_SEND_WITH_SE:
+ case T3_SEND_WITH_INV:
+ case T3_SEND_WITH_SE_INV:
wc->opcode = IB_WC_SEND;
break;
case T3_BIND_MW:
wc->opcode = IB_WC_BIND_MW;
break;
- /* these aren't supported yet */
- case T3_SEND_WITH_INV:
- case T3_SEND_WITH_SE_INV:
case T3_LOCAL_INV:
+ wc->opcode = IB_WC_LOCAL_INV;
+ break;
case T3_FAST_REGISTER:
+ wc->opcode = IB_WC_FAST_REG_MR;
+ break;
default:
printk(KERN_ERR MOD "Unexpected opcode %d "
"in the CQE received for QPID=0x%0x\n",
diff --git a/drivers/infiniband/hw/cxgb3/iwch_provider.c b/drivers/infiniband/hw/cxgb3/iwch_provider.c
index 95f82cfb6c54..b89640aa6e10 100644
--- a/drivers/infiniband/hw/cxgb3/iwch_provider.c
+++ b/drivers/infiniband/hw/cxgb3/iwch_provider.c
@@ -56,6 +56,7 @@
#include "iwch_provider.h"
#include "iwch_cm.h"
#include "iwch_user.h"
+#include "common.h"
static int iwch_modify_port(struct ib_device *ibdev,
u8 port, int port_modify_mask,
@@ -747,6 +748,7 @@ static struct ib_mw *iwch_alloc_mw(struct ib_pd *pd)
mhp->attr.type = TPT_MW;
mhp->attr.stag = stag;
mmid = (stag) >> 8;
+ mhp->ibmw.rkey = stag;
insert_handle(rhp, &rhp->mmidr, mhp, mmid);
PDBG("%s mmid 0x%x mhp %p stag 0x%x\n", __func__, mmid, mhp, stag);
return &(mhp->ibmw);
@@ -768,6 +770,68 @@ static int iwch_dealloc_mw(struct ib_mw *mw)
return 0;
}
+static struct ib_mr *iwch_alloc_fast_reg_mr(struct ib_pd *pd, int pbl_depth)
+{
+ struct iwch_dev *rhp;
+ struct iwch_pd *php;
+ struct iwch_mr *mhp;
+ u32 mmid;
+ u32 stag = 0;
+ int ret;
+
+ php = to_iwch_pd(pd);
+ rhp = php->rhp;
+ mhp = kzalloc(sizeof(*mhp), GFP_KERNEL);
+ if (!mhp)
+ return ERR_PTR(-ENOMEM);
+
+ mhp->rhp = rhp;
+ ret = iwch_alloc_pbl(mhp, pbl_depth);
+ if (ret) {
+ kfree(mhp);
+ return ERR_PTR(ret);
+ }
+ mhp->attr.pbl_size = pbl_depth;
+ ret = cxio_allocate_stag(&rhp->rdev, &stag, php->pdid,
+ mhp->attr.pbl_size, mhp->attr.pbl_addr);
+ if (ret) {
+ iwch_free_pbl(mhp);
+ kfree(mhp);
+ return ERR_PTR(ret);
+ }
+ mhp->attr.pdid = php->pdid;
+ mhp->attr.type = TPT_NON_SHARED_MR;
+ mhp->attr.stag = stag;
+ mhp->attr.state = 1;
+ mmid = (stag) >> 8;
+ mhp->ibmr.rkey = mhp->ibmr.lkey = stag;
+ insert_handle(rhp, &rhp->mmidr, mhp, mmid);
+ PDBG("%s mmid 0x%x mhp %p stag 0x%x\n", __func__, mmid, mhp, stag);
+ return &(mhp->ibmr);
+}
+
+static struct ib_fast_reg_page_list *iwch_alloc_fastreg_pbl(
+ struct ib_device *device,
+ int page_list_len)
+{
+ struct ib_fast_reg_page_list *page_list;
+
+ page_list = kmalloc(sizeof *page_list + page_list_len * sizeof(u64),
+ GFP_KERNEL);
+ if (!page_list)
+ return ERR_PTR(-ENOMEM);
+
+ page_list->page_list = (u64 *)(page_list + 1);
+ page_list->max_page_list_len = page_list_len;
+
+ return page_list;
+}
+
+static void iwch_free_fastreg_pbl(struct ib_fast_reg_page_list *page_list)
+{
+ kfree(page_list);
+}
+
static int iwch_destroy_qp(struct ib_qp *ib_qp)
{
struct iwch_dev *rhp;
@@ -843,6 +907,15 @@ static struct ib_qp *iwch_create_qp(struct ib_pd *pd,
*/
sqsize = roundup_pow_of_two(attrs->cap.max_send_wr);
wqsize = roundup_pow_of_two(rqsize + sqsize);
+
+ /*
+ * Kernel users need more wq space for fastreg WRs which can take
+ * 2 WR fragments.
+ */
+ ucontext = pd->uobject ? to_iwch_ucontext(pd->uobject->context) : NULL;
+ if (!ucontext && wqsize < (rqsize + (2 * sqsize)))
+ wqsize = roundup_pow_of_two(rqsize +
+ roundup_pow_of_two(attrs->cap.max_send_wr * 2));
PDBG("%s wqsize %d sqsize %d rqsize %d\n", __func__,
wqsize, sqsize, rqsize);
qhp = kzalloc(sizeof(*qhp), GFP_KERNEL);
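A worked example of the kernel-QP sizing above (an illustration, not from the patch): with attrs->cap.max_send_wr = 64 and rqsize = 64, sqsize = 64 and a user QP would get wqsize = 128; a kernel QP hits the wqsize < rqsize + 2*sqsize test (128 < 192), so wqsize = roundup_pow_of_two(64 + roundup_pow_of_two(128)) = 256, leaving room for fastreg WRs that occupy two WQE slots.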
@@ -851,7 +924,6 @@ static struct ib_qp *iwch_create_qp(struct ib_pd *pd,
qhp->wq.size_log2 = ilog2(wqsize);
qhp->wq.rq_size_log2 = ilog2(rqsize);
qhp->wq.sq_size_log2 = ilog2(sqsize);
- ucontext = pd->uobject ? to_iwch_ucontext(pd->uobject->context) : NULL;
if (cxio_create_qp(&rhp->rdev, !udata, &qhp->wq,
ucontext ? &ucontext->uctx : &rhp->rdev.uctx)) {
kfree(qhp);
@@ -935,10 +1007,10 @@ static struct ib_qp *iwch_create_qp(struct ib_pd *pd,
qhp->ibqp.qp_num = qhp->wq.qpid;
init_timer(&(qhp->timer));
PDBG("%s sq_num_entries %d, rq_num_entries %d "
- "qpid 0x%0x qhp %p dma_addr 0x%llx size %d\n",
+ "qpid 0x%0x qhp %p dma_addr 0x%llx size %d rq_addr 0x%x\n",
__func__, qhp->attr.sq_num_entries, qhp->attr.rq_num_entries,
qhp->wq.qpid, qhp, (unsigned long long) qhp->wq.dma_addr,
- 1 << qhp->wq.size_log2);
+ 1 << qhp->wq.size_log2, qhp->wq.rq_addr);
return &qhp->ibqp;
}
@@ -1023,6 +1095,29 @@ static int iwch_query_gid(struct ib_device *ibdev, u8 port,
return 0;
}
+static u64 fw_vers_string_to_u64(struct iwch_dev *iwch_dev)
+{
+ struct ethtool_drvinfo info;
+ struct net_device *lldev = iwch_dev->rdev.t3cdev_p->lldev;
+ char *cp, *next;
+ unsigned fw_maj, fw_min, fw_mic;
+
+ rtnl_lock();
+ lldev->ethtool_ops->get_drvinfo(lldev, &info);
+ rtnl_unlock();
+
+ next = info.fw_version + 1;
+ cp = strsep(&next, ".");
+ sscanf(cp, "%i", &fw_maj);
+ cp = strsep(&next, ".");
+ sscanf(cp, "%i", &fw_min);
+ cp = strsep(&next, ".");
+ sscanf(cp, "%i", &fw_mic);
+
+ return (((u64)fw_maj & 0xffff) << 32) | ((fw_min & 0xffff) << 16) |
+ (fw_mic & 0xffff);
+}
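For reference, a minimal user-space sketch of the packing done by fw_vers_string_to_u64() above (an illustration, not part of the patch; the firmware string "T7.1.0" is an assumed example):

#include <stdint.h>
#include <stdio.h>

/*
 * Sketch only: skip the leading type character, parse "maj.min.mic",
 * and pack the fields into bits 47:32, 31:16 and 15:0 of a u64, the
 * same layout the driver reports as props->fw_ver.
 */
static uint64_t pack_fw_version(const char *fw_version)
{
	unsigned int maj = 0, min = 0, mic = 0;

	sscanf(fw_version + 1, "%u.%u.%u", &maj, &min, &mic);
	return (((uint64_t)maj & 0xffff) << 32) | ((min & 0xffff) << 16) |
	       (mic & 0xffff);
}

int main(void)
{
	/* e.g. "T7.1.0" packs to 0x0000000700010000 */
	printf("0x%016llx\n", (unsigned long long)pack_fw_version("T7.1.0"));
	return 0;
}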
+
static int iwch_query_device(struct ib_device *ibdev,
struct ib_device_attr *props)
{
@@ -1033,7 +1128,10 @@ static int iwch_query_device(struct ib_device *ibdev,
dev = to_iwch_dev(ibdev);
memset(props, 0, sizeof *props);
memcpy(&props->sys_image_guid, dev->rdev.t3cdev_p->lldev->dev_addr, 6);
+ props->hw_ver = dev->rdev.t3cdev_p->type;
+ props->fw_ver = fw_vers_string_to_u64(dev);
props->device_cap_flags = dev->device_cap_flags;
+ props->page_size_cap = dev->attr.mem_pgsizes_bitmask;
props->vendor_id = (u32)dev->rdev.rnic_info.pdev->vendor;
props->vendor_part_id = (u32)dev->rdev.rnic_info.pdev->device;
props->max_mr_size = dev->attr.max_mr_size;
@@ -1048,6 +1146,7 @@ static int iwch_query_device(struct ib_device *ibdev,
props->max_mr = dev->attr.max_mem_regs;
props->max_pd = dev->attr.max_pds;
props->local_ca_ack_delay = 0;
+ props->max_fast_reg_page_list_len = T3_MAX_FASTREG_DEPTH;
return 0;
}
@@ -1088,6 +1187,28 @@ static ssize_t show_rev(struct device *dev, struct device_attribute *attr,
return sprintf(buf, "%d\n", iwch_dev->rdev.t3cdev_p->type);
}
+static int fw_supports_fastreg(struct iwch_dev *iwch_dev)
+{
+ struct ethtool_drvinfo info;
+ struct net_device *lldev = iwch_dev->rdev.t3cdev_p->lldev;
+ char *cp, *next;
+ unsigned fw_maj, fw_min;
+
+ rtnl_lock();
+ lldev->ethtool_ops->get_drvinfo(lldev, &info);
+ rtnl_unlock();
+
+ next = info.fw_version+1;
+ cp = strsep(&next, ".");
+ sscanf(cp, "%i", &fw_maj);
+ cp = strsep(&next, ".");
+ sscanf(cp, "%i", &fw_min);
+
+ PDBG("%s maj %u min %u\n", __func__, fw_maj, fw_min);
+
+ return fw_maj > 6 || (fw_maj == 6 && fw_min > 0);
+}
+
static ssize_t show_fw_ver(struct device *dev, struct device_attribute *attr, char *buf)
{
struct iwch_dev *iwch_dev = container_of(dev, struct iwch_dev,
@@ -1127,6 +1248,61 @@ static ssize_t show_board(struct device *dev, struct device_attribute *attr,
iwch_dev->rdev.rnic_info.pdev->device);
}
+static int iwch_get_mib(struct ib_device *ibdev,
+ union rdma_protocol_stats *stats)
+{
+ struct iwch_dev *dev;
+ struct tp_mib_stats m;
+ int ret;
+
+ PDBG("%s ibdev %p\n", __func__, ibdev);
+ dev = to_iwch_dev(ibdev);
+ ret = dev->rdev.t3cdev_p->ctl(dev->rdev.t3cdev_p, RDMA_GET_MIB, &m);
+ if (ret)
+ return -ENOSYS;
+
+ memset(stats, 0, sizeof *stats);
+ stats->iw.ipInReceives = ((u64) m.ipInReceive_hi << 32) +
+ m.ipInReceive_lo;
+ stats->iw.ipInHdrErrors = ((u64) m.ipInHdrErrors_hi << 32) +
+ m.ipInHdrErrors_lo;
+ stats->iw.ipInAddrErrors = ((u64) m.ipInAddrErrors_hi << 32) +
+ m.ipInAddrErrors_lo;
+ stats->iw.ipInUnknownProtos = ((u64) m.ipInUnknownProtos_hi << 32) +
+ m.ipInUnknownProtos_lo;
+ stats->iw.ipInDiscards = ((u64) m.ipInDiscards_hi << 32) +
+ m.ipInDiscards_lo;
+ stats->iw.ipInDelivers = ((u64) m.ipInDelivers_hi << 32) +
+ m.ipInDelivers_lo;
+ stats->iw.ipOutRequests = ((u64) m.ipOutRequests_hi << 32) +
+ m.ipOutRequests_lo;
+ stats->iw.ipOutDiscards = ((u64) m.ipOutDiscards_hi << 32) +
+ m.ipOutDiscards_lo;
+ stats->iw.ipOutNoRoutes = ((u64) m.ipOutNoRoutes_hi << 32) +
+ m.ipOutNoRoutes_lo;
+ stats->iw.ipReasmTimeout = (u64) m.ipReasmTimeout;
+ stats->iw.ipReasmReqds = (u64) m.ipReasmReqds;
+ stats->iw.ipReasmOKs = (u64) m.ipReasmOKs;
+ stats->iw.ipReasmFails = (u64) m.ipReasmFails;
+ stats->iw.tcpActiveOpens = (u64) m.tcpActiveOpens;
+ stats->iw.tcpPassiveOpens = (u64) m.tcpPassiveOpens;
+ stats->iw.tcpAttemptFails = (u64) m.tcpAttemptFails;
+ stats->iw.tcpEstabResets = (u64) m.tcpEstabResets;
+ stats->iw.tcpOutRsts = (u64) m.tcpOutRsts;
+ stats->iw.tcpCurrEstab = (u64) m.tcpCurrEstab;
+ stats->iw.tcpInSegs = ((u64) m.tcpInSegs_hi << 32) +
+ m.tcpInSegs_lo;
+ stats->iw.tcpOutSegs = ((u64) m.tcpOutSegs_hi << 32) +
+ m.tcpOutSegs_lo;
+ stats->iw.tcpRetransSegs = ((u64) m.tcpRetransSeg_hi << 32) +
+ m.tcpRetransSeg_lo;
+ stats->iw.tcpInErrs = ((u64) m.tcpInErrs_hi << 32) +
+ m.tcpInErrs_lo;
+ stats->iw.tcpRtoMin = (u64) m.tcpRtoMin;
+ stats->iw.tcpRtoMax = (u64) m.tcpRtoMax;
+ return 0;
+}
+
static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL);
static DEVICE_ATTR(fw_ver, S_IRUGO, show_fw_ver, NULL);
static DEVICE_ATTR(hca_type, S_IRUGO, show_hca, NULL);
@@ -1136,7 +1312,7 @@ static struct device_attribute *iwch_class_attributes[] = {
&dev_attr_hw_rev,
&dev_attr_fw_ver,
&dev_attr_hca_type,
- &dev_attr_board_id
+ &dev_attr_board_id,
};
int iwch_register_device(struct iwch_dev *dev)
@@ -1149,8 +1325,12 @@ int iwch_register_device(struct iwch_dev *dev)
memset(&dev->ibdev.node_guid, 0, sizeof(dev->ibdev.node_guid));
memcpy(&dev->ibdev.node_guid, dev->rdev.t3cdev_p->lldev->dev_addr, 6);
dev->ibdev.owner = THIS_MODULE;
- dev->device_cap_flags =
- (IB_DEVICE_ZERO_STAG | IB_DEVICE_MEM_WINDOW);
+ dev->device_cap_flags = IB_DEVICE_LOCAL_DMA_LKEY | IB_DEVICE_MEM_WINDOW;
+
+ /* cxgb3 supports STag 0. */
+ dev->ibdev.local_dma_lkey = 0;
+ if (fw_supports_fastreg(dev))
+ dev->device_cap_flags |= IB_DEVICE_MEM_MGT_EXTENSIONS;
dev->ibdev.uverbs_cmd_mask =
(1ull << IB_USER_VERBS_CMD_GET_CONTEXT) |
@@ -1202,15 +1382,16 @@ int iwch_register_device(struct iwch_dev *dev)
dev->ibdev.alloc_mw = iwch_alloc_mw;
dev->ibdev.bind_mw = iwch_bind_mw;
dev->ibdev.dealloc_mw = iwch_dealloc_mw;
-
+ dev->ibdev.alloc_fast_reg_mr = iwch_alloc_fast_reg_mr;
+ dev->ibdev.alloc_fast_reg_page_list = iwch_alloc_fastreg_pbl;
+ dev->ibdev.free_fast_reg_page_list = iwch_free_fastreg_pbl;
dev->ibdev.attach_mcast = iwch_multicast_attach;
dev->ibdev.detach_mcast = iwch_multicast_detach;
dev->ibdev.process_mad = iwch_process_mad;
-
dev->ibdev.req_notify_cq = iwch_arm_cq;
dev->ibdev.post_send = iwch_post_send;
dev->ibdev.post_recv = iwch_post_receive;
-
+ dev->ibdev.get_protocol_stats = iwch_get_mib;
dev->ibdev.iwcm = kmalloc(sizeof(struct iw_cm_verbs), GFP_KERNEL);
if (!dev->ibdev.iwcm)
diff --git a/drivers/infiniband/hw/cxgb3/iwch_provider.h b/drivers/infiniband/hw/cxgb3/iwch_provider.h
index 836163fc5429..f5ceca05c435 100644
--- a/drivers/infiniband/hw/cxgb3/iwch_provider.h
+++ b/drivers/infiniband/hw/cxgb3/iwch_provider.h
@@ -296,14 +296,6 @@ static inline u32 iwch_ib_to_tpt_access(int acc)
TPT_LOCAL_READ;
}
-static inline u32 iwch_ib_to_mwbind_access(int acc)
-{
- return (acc & IB_ACCESS_REMOTE_WRITE ? T3_MEM_ACCESS_REM_WRITE : 0) |
- (acc & IB_ACCESS_REMOTE_READ ? T3_MEM_ACCESS_REM_READ : 0) |
- (acc & IB_ACCESS_LOCAL_WRITE ? T3_MEM_ACCESS_LOCAL_WRITE : 0) |
- T3_MEM_ACCESS_LOCAL_READ;
-}
-
enum iwch_mmid_state {
IWCH_STAG_STATE_VALID,
IWCH_STAG_STATE_INVALID
diff --git a/drivers/infiniband/hw/cxgb3/iwch_qp.c b/drivers/infiniband/hw/cxgb3/iwch_qp.c
index 992613799228..9a3be3a9d5dc 100644
--- a/drivers/infiniband/hw/cxgb3/iwch_qp.c
+++ b/drivers/infiniband/hw/cxgb3/iwch_qp.c
@@ -33,10 +33,11 @@
#include "iwch.h"
#include "iwch_cm.h"
#include "cxio_hal.h"
+#include "cxio_resource.h"
#define NO_SUPPORT -1
-static int iwch_build_rdma_send(union t3_wr *wqe, struct ib_send_wr *wr,
+static int build_rdma_send(union t3_wr *wqe, struct ib_send_wr *wr,
u8 * flit_cnt)
{
int i;
@@ -44,59 +45,44 @@ static int iwch_build_rdma_send(union t3_wr *wqe, struct ib_send_wr *wr,
switch (wr->opcode) {
case IB_WR_SEND:
- case IB_WR_SEND_WITH_IMM:
if (wr->send_flags & IB_SEND_SOLICITED)
wqe->send.rdmaop = T3_SEND_WITH_SE;
else
wqe->send.rdmaop = T3_SEND;
wqe->send.rem_stag = 0;
break;
-#if 0 /* Not currently supported */
- case TYPE_SEND_INVALIDATE:
- case TYPE_SEND_INVALIDATE_IMMEDIATE:
- wqe->send.rdmaop = T3_SEND_WITH_INV;
- wqe->send.rem_stag = cpu_to_be32(wr->wr.rdma.rkey);
- break;
- case TYPE_SEND_SE_INVALIDATE:
- wqe->send.rdmaop = T3_SEND_WITH_SE_INV;
- wqe->send.rem_stag = cpu_to_be32(wr->wr.rdma.rkey);
+ case IB_WR_SEND_WITH_INV:
+ if (wr->send_flags & IB_SEND_SOLICITED)
+ wqe->send.rdmaop = T3_SEND_WITH_SE_INV;
+ else
+ wqe->send.rdmaop = T3_SEND_WITH_INV;
+ wqe->send.rem_stag = cpu_to_be32(wr->ex.invalidate_rkey);
break;
-#endif
default:
- break;
+ return -EINVAL;
}
if (wr->num_sge > T3_MAX_SGE)
return -EINVAL;
wqe->send.reserved[0] = 0;
wqe->send.reserved[1] = 0;
wqe->send.reserved[2] = 0;
- if (wr->opcode == IB_WR_SEND_WITH_IMM) {
- plen = 4;
- wqe->send.sgl[0].stag = wr->ex.imm_data;
- wqe->send.sgl[0].len = __constant_cpu_to_be32(0);
- wqe->send.num_sgle = __constant_cpu_to_be32(0);
- *flit_cnt = 5;
- } else {
- plen = 0;
- for (i = 0; i < wr->num_sge; i++) {
- if ((plen + wr->sg_list[i].length) < plen) {
- return -EMSGSIZE;
- }
- plen += wr->sg_list[i].length;
- wqe->send.sgl[i].stag =
- cpu_to_be32(wr->sg_list[i].lkey);
- wqe->send.sgl[i].len =
- cpu_to_be32(wr->sg_list[i].length);
- wqe->send.sgl[i].to = cpu_to_be64(wr->sg_list[i].addr);
- }
- wqe->send.num_sgle = cpu_to_be32(wr->num_sge);
- *flit_cnt = 4 + ((wr->num_sge) << 1);
+ plen = 0;
+ for (i = 0; i < wr->num_sge; i++) {
+ if ((plen + wr->sg_list[i].length) < plen)
+ return -EMSGSIZE;
+
+ plen += wr->sg_list[i].length;
+ wqe->send.sgl[i].stag = cpu_to_be32(wr->sg_list[i].lkey);
+ wqe->send.sgl[i].len = cpu_to_be32(wr->sg_list[i].length);
+ wqe->send.sgl[i].to = cpu_to_be64(wr->sg_list[i].addr);
}
+ wqe->send.num_sgle = cpu_to_be32(wr->num_sge);
+ *flit_cnt = 4 + ((wr->num_sge) << 1);
wqe->send.plen = cpu_to_be32(plen);
return 0;
}
-static int iwch_build_rdma_write(union t3_wr *wqe, struct ib_send_wr *wr,
+static int build_rdma_write(union t3_wr *wqe, struct ib_send_wr *wr,
u8 *flit_cnt)
{
int i;
@@ -137,15 +123,18 @@ static int iwch_build_rdma_write(union t3_wr *wqe, struct ib_send_wr *wr,
return 0;
}
-static int iwch_build_rdma_read(union t3_wr *wqe, struct ib_send_wr *wr,
+static int build_rdma_read(union t3_wr *wqe, struct ib_send_wr *wr,
u8 *flit_cnt)
{
if (wr->num_sge > 1)
return -EINVAL;
wqe->read.rdmaop = T3_READ_REQ;
+ if (wr->opcode == IB_WR_RDMA_READ_WITH_INV)
+ wqe->read.local_inv = 1;
+ else
+ wqe->read.local_inv = 0;
wqe->read.reserved[0] = 0;
wqe->read.reserved[1] = 0;
- wqe->read.reserved[2] = 0;
wqe->read.rem_stag = cpu_to_be32(wr->wr.rdma.rkey);
wqe->read.rem_to = cpu_to_be64(wr->wr.rdma.remote_addr);
wqe->read.local_stag = cpu_to_be32(wr->sg_list[0].lkey);
@@ -155,6 +144,57 @@ static int iwch_build_rdma_read(union t3_wr *wqe, struct ib_send_wr *wr,
return 0;
}
+static int build_fastreg(union t3_wr *wqe, struct ib_send_wr *wr,
+ u8 *flit_cnt, int *wr_cnt, struct t3_wq *wq)
+{
+ int i;
+ __be64 *p;
+
+ if (wr->wr.fast_reg.page_list_len > T3_MAX_FASTREG_DEPTH)
+ return -EINVAL;
+ *wr_cnt = 1;
+ wqe->fastreg.stag = cpu_to_be32(wr->wr.fast_reg.rkey);
+ wqe->fastreg.len = cpu_to_be32(wr->wr.fast_reg.length);
+ wqe->fastreg.va_base_hi = cpu_to_be32(wr->wr.fast_reg.iova_start >> 32);
+ wqe->fastreg.va_base_lo_fbo =
+ cpu_to_be32(wr->wr.fast_reg.iova_start & 0xffffffff);
+ wqe->fastreg.page_type_perms = cpu_to_be32(
+ V_FR_PAGE_COUNT(wr->wr.fast_reg.page_list_len) |
+ V_FR_PAGE_SIZE(wr->wr.fast_reg.page_shift-12) |
+ V_FR_TYPE(TPT_VATO) |
+ V_FR_PERMS(iwch_ib_to_tpt_access(wr->wr.fast_reg.access_flags)));
+ p = &wqe->fastreg.pbl_addrs[0];
+ for (i = 0; i < wr->wr.fast_reg.page_list_len; i++, p++) {
+
+ /* If we need a 2nd WR, then set it up */
+ if (i == T3_MAX_FASTREG_FRAG) {
+ *wr_cnt = 2;
+ wqe = (union t3_wr *)(wq->queue +
+ Q_PTR2IDX((wq->wptr+1), wq->size_log2));
+ build_fw_riwrh((void *)wqe, T3_WR_FASTREG, 0,
+ Q_GENBIT(wq->wptr + 1, wq->size_log2),
+ 0, 1 + wr->wr.fast_reg.page_list_len - T3_MAX_FASTREG_FRAG,
+ T3_EOP);
+
+ p = &wqe->pbl_frag.pbl_addrs[0];
+ }
+ *p = cpu_to_be64((u64)wr->wr.fast_reg.page_list->page_list[i]);
+ }
+ *flit_cnt = 5 + wr->wr.fast_reg.page_list_len;
+ if (*flit_cnt > 15)
+ *flit_cnt = 15;
+ return 0;
+}
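To make the two-WQE split in build_fastreg() above easier to follow, here is a small stand-alone sketch (an illustration with made-up inputs, not part of the patch) of how many WQEs and first-WQE flits a given page_list_len costs:

#include <stdio.h>

#define T3_MAX_FASTREG_FRAG	10	/* PBL addresses that fit in the first WQE */
#define T3_MAX_FASTREG_DEPTH	24	/* driver-imposed page_list_len limit */

/*
 * Mirrors the accounting in build_fastreg(): 5 header flits plus one
 * flit per PBL address, capped at 15; a second WQE (t3_pbl_frag) is
 * needed once more than T3_MAX_FASTREG_FRAG addresses are posted.
 */
static void fastreg_cost(int page_list_len)
{
	int wr_cnt = (page_list_len > T3_MAX_FASTREG_FRAG) ? 2 : 1;
	int flit_cnt = 5 + page_list_len;

	if (flit_cnt > 15)
		flit_cnt = 15;
	printf("page_list_len=%2d -> wr_cnt=%d, first-WQE flits=%d\n",
	       page_list_len, wr_cnt, flit_cnt);
}

int main(void)
{
	fastreg_cost(4);			/* 1 WQE,  9 flits */
	fastreg_cost(T3_MAX_FASTREG_DEPTH);	/* 2 WQEs, 15 flits */
	return 0;
}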
+
+static int build_inv_stag(union t3_wr *wqe, struct ib_send_wr *wr,
+ u8 *flit_cnt)
+{
+ wqe->local_inv.stag = cpu_to_be32(wr->ex.invalidate_rkey);
+ wqe->local_inv.reserved = 0;
+ *flit_cnt = sizeof(struct t3_local_inv_wr) >> 3;
+ return 0;
+}
+
/*
* TBD: this is going to be moved to firmware. Missing pdid/qpid check for now.
*/
@@ -205,23 +245,106 @@ static int iwch_sgl2pbl_map(struct iwch_dev *rhp, struct ib_sge *sg_list,
return 0;
}
-static int iwch_build_rdma_recv(struct iwch_dev *rhp, union t3_wr *wqe,
+static int build_rdma_recv(struct iwch_qp *qhp, union t3_wr *wqe,
struct ib_recv_wr *wr)
{
- int i;
- if (wr->num_sge > T3_MAX_SGE)
- return -EINVAL;
+ int i, err = 0;
+ u32 pbl_addr[T3_MAX_SGE];
+ u8 page_size[T3_MAX_SGE];
+
+ err = iwch_sgl2pbl_map(qhp->rhp, wr->sg_list, wr->num_sge, pbl_addr,
+ page_size);
+ if (err)
+ return err;
+ wqe->recv.pagesz[0] = page_size[0];
+ wqe->recv.pagesz[1] = page_size[1];
+ wqe->recv.pagesz[2] = page_size[2];
+ wqe->recv.pagesz[3] = page_size[3];
wqe->recv.num_sgle = cpu_to_be32(wr->num_sge);
for (i = 0; i < wr->num_sge; i++) {
wqe->recv.sgl[i].stag = cpu_to_be32(wr->sg_list[i].lkey);
wqe->recv.sgl[i].len = cpu_to_be32(wr->sg_list[i].length);
+
+ /* 'to' in the WQE is the offset into the page */
+ wqe->recv.sgl[i].to = cpu_to_be64(((u32) wr->sg_list[i].addr) %
+ (1UL << (12 + page_size[i])));
+
+ /* pbl_addr is the adapter's address in the PBL */
+ wqe->recv.pbl_addr[i] = cpu_to_be32(pbl_addr[i]);
+ }
+ for (; i < T3_MAX_SGE; i++) {
+ wqe->recv.sgl[i].stag = 0;
+ wqe->recv.sgl[i].len = 0;
+ wqe->recv.sgl[i].to = 0;
+ wqe->recv.pbl_addr[i] = 0;
+ }
+ qhp->wq.rq[Q_PTR2IDX(qhp->wq.rq_wptr,
+ qhp->wq.rq_size_log2)].wr_id = wr->wr_id;
+ qhp->wq.rq[Q_PTR2IDX(qhp->wq.rq_wptr,
+ qhp->wq.rq_size_log2)].pbl_addr = 0;
+ return 0;
+}
+
+static int build_zero_stag_recv(struct iwch_qp *qhp, union t3_wr *wqe,
+ struct ib_recv_wr *wr)
+{
+ int i;
+ u32 pbl_addr;
+ u32 pbl_offset;
+
+
+ /*
+ * The T3 HW requires the PBL in the HW recv descriptor to reference
+ * a PBL entry. So we allocate the max needed PBL memory here and pass
+ * it to the uP in the recv WR. The uP will build the PBL and set up
+ * the HW recv descriptor.
+ */
+ pbl_addr = cxio_hal_pblpool_alloc(&qhp->rhp->rdev, T3_STAG0_PBL_SIZE);
+ if (!pbl_addr)
+ return -ENOMEM;
+
+ /*
+ * Compute the 8B aligned offset.
+ */
+ pbl_offset = (pbl_addr - qhp->rhp->rdev.rnic_info.pbl_base) >> 3;
+
+ wqe->recv.num_sgle = cpu_to_be32(wr->num_sge);
+
+ for (i = 0; i < wr->num_sge; i++) {
+
+ /*
+ * Use a 128MB page size. This and an imposed 128MB
+ * sge length limit allow us to require only a 2-entry HW
+ * PBL for each SGE. This restriction is acceptable since
+ * it is not possible to allocate 128MB of contiguous
+ * DMA coherent memory!
+ */
+ if (wr->sg_list[i].length > T3_STAG0_MAX_PBE_LEN)
+ return -EINVAL;
+ wqe->recv.pagesz[i] = T3_STAG0_PAGE_SHIFT;
+
+ /*
+ * T3 restricts a recv to all zero-stag or all non-zero-stag.
+ */
+ if (wr->sg_list[i].lkey != 0)
+ return -EINVAL;
+ wqe->recv.sgl[i].stag = 0;
+ wqe->recv.sgl[i].len = cpu_to_be32(wr->sg_list[i].length);
wqe->recv.sgl[i].to = cpu_to_be64(wr->sg_list[i].addr);
+ wqe->recv.pbl_addr[i] = cpu_to_be32(pbl_offset);
+ pbl_offset += 2;
}
for (; i < T3_MAX_SGE; i++) {
+ wqe->recv.pagesz[i] = 0;
wqe->recv.sgl[i].stag = 0;
wqe->recv.sgl[i].len = 0;
wqe->recv.sgl[i].to = 0;
+ wqe->recv.pbl_addr[i] = 0;
}
+ qhp->wq.rq[Q_PTR2IDX(qhp->wq.rq_wptr,
+ qhp->wq.rq_size_log2)].wr_id = wr->wr_id;
+ qhp->wq.rq[Q_PTR2IDX(qhp->wq.rq_wptr,
+ qhp->wq.rq_size_log2)].pbl_addr = pbl_addr;
return 0;
}
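A quick arithmetic check on the zero-stag constants used above (an interpretation, not part of the patch): each SGE consumes two 8-byte PBL entries, so T3_STAG0_PBL_SIZE = (2 * T3_MAX_SGE) << 3 = 64 bytes covers the worst case of four SGEs per recv WR (pbl_offset advances by 2 per SGE); the pagesz field encodes a shift above 4KB, so T3_STAG0_PAGE_SHIFT = 15 selects a 2^(12+15) = 128MB page, matching the T3_STAG0_MAX_PBE_LEN limit enforced on each SGE length.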
@@ -238,6 +361,7 @@ int iwch_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
u32 num_wrs;
unsigned long flag;
struct t3_swsq *sqp;
+ int wr_cnt = 1;
qhp = to_iwch_qp(ibqp);
spin_lock_irqsave(&qhp->lock, flag);
@@ -262,33 +386,45 @@ int iwch_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
t3_wr_flags = 0;
if (wr->send_flags & IB_SEND_SOLICITED)
t3_wr_flags |= T3_SOLICITED_EVENT_FLAG;
- if (wr->send_flags & IB_SEND_FENCE)
- t3_wr_flags |= T3_READ_FENCE_FLAG;
if (wr->send_flags & IB_SEND_SIGNALED)
t3_wr_flags |= T3_COMPLETION_FLAG;
sqp = qhp->wq.sq +
Q_PTR2IDX(qhp->wq.sq_wptr, qhp->wq.sq_size_log2);
switch (wr->opcode) {
case IB_WR_SEND:
- case IB_WR_SEND_WITH_IMM:
+ case IB_WR_SEND_WITH_INV:
+ if (wr->send_flags & IB_SEND_FENCE)
+ t3_wr_flags |= T3_READ_FENCE_FLAG;
t3_wr_opcode = T3_WR_SEND;
- err = iwch_build_rdma_send(wqe, wr, &t3_wr_flit_cnt);
+ err = build_rdma_send(wqe, wr, &t3_wr_flit_cnt);
break;
case IB_WR_RDMA_WRITE:
case IB_WR_RDMA_WRITE_WITH_IMM:
t3_wr_opcode = T3_WR_WRITE;
- err = iwch_build_rdma_write(wqe, wr, &t3_wr_flit_cnt);
+ err = build_rdma_write(wqe, wr, &t3_wr_flit_cnt);
break;
case IB_WR_RDMA_READ:
+ case IB_WR_RDMA_READ_WITH_INV:
t3_wr_opcode = T3_WR_READ;
t3_wr_flags = 0; /* T3 reads are always signaled */
- err = iwch_build_rdma_read(wqe, wr, &t3_wr_flit_cnt);
+ err = build_rdma_read(wqe, wr, &t3_wr_flit_cnt);
if (err)
break;
sqp->read_len = wqe->read.local_len;
if (!qhp->wq.oldest_read)
qhp->wq.oldest_read = sqp;
break;
+ case IB_WR_FAST_REG_MR:
+ t3_wr_opcode = T3_WR_FASTREG;
+ err = build_fastreg(wqe, wr, &t3_wr_flit_cnt,
+ &wr_cnt, &qhp->wq);
+ break;
+ case IB_WR_LOCAL_INV:
+ if (wr->send_flags & IB_SEND_FENCE)
+ t3_wr_flags |= T3_LOCAL_FENCE_FLAG;
+ t3_wr_opcode = T3_WR_INV_STAG;
+ err = build_inv_stag(wqe, wr, &t3_wr_flit_cnt);
+ break;
default:
PDBG("%s post of type=%d TBD!\n", __func__,
wr->opcode);
@@ -307,14 +443,15 @@ int iwch_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
build_fw_riwrh((void *) wqe, t3_wr_opcode, t3_wr_flags,
Q_GENBIT(qhp->wq.wptr, qhp->wq.size_log2),
- 0, t3_wr_flit_cnt);
+ 0, t3_wr_flit_cnt,
+ (wr_cnt == 1) ? T3_SOPEOP : T3_SOP);
PDBG("%s cookie 0x%llx wq idx 0x%x swsq idx %ld opcode %d\n",
__func__, (unsigned long long) wr->wr_id, idx,
Q_PTR2IDX(qhp->wq.sq_wptr, qhp->wq.sq_size_log2),
sqp->opcode);
wr = wr->next;
num_wrs--;
- ++(qhp->wq.wptr);
+ qhp->wq.wptr += wr_cnt;
++(qhp->wq.sq_wptr);
}
spin_unlock_irqrestore(&qhp->lock, flag);
@@ -345,21 +482,27 @@ int iwch_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr,
return -EINVAL;
}
while (wr) {
+ if (wr->num_sge > T3_MAX_SGE) {
+ err = -EINVAL;
+ *bad_wr = wr;
+ break;
+ }
idx = Q_PTR2IDX(qhp->wq.wptr, qhp->wq.size_log2);
wqe = (union t3_wr *) (qhp->wq.queue + idx);
if (num_wrs)
- err = iwch_build_rdma_recv(qhp->rhp, wqe, wr);
+ if (wr->sg_list[0].lkey)
+ err = build_rdma_recv(qhp, wqe, wr);
+ else
+ err = build_zero_stag_recv(qhp, wqe, wr);
else
err = -ENOMEM;
if (err) {
*bad_wr = wr;
break;
}
- qhp->wq.rq[Q_PTR2IDX(qhp->wq.rq_wptr, qhp->wq.rq_size_log2)] =
- wr->wr_id;
build_fw_riwrh((void *) wqe, T3_WR_RCV, T3_COMPLETION_FLAG,
Q_GENBIT(qhp->wq.wptr, qhp->wq.size_log2),
- 0, sizeof(struct t3_receive_wr) >> 3);
+ 0, sizeof(struct t3_receive_wr) >> 3, T3_SOPEOP);
PDBG("%s cookie 0x%llx idx 0x%x rq_wptr 0x%x rw_rptr 0x%x "
"wqe %p \n", __func__, (unsigned long long) wr->wr_id,
idx, qhp->wq.rq_wptr, qhp->wq.rq_rptr, wqe);
@@ -419,10 +562,10 @@ int iwch_bind_mw(struct ib_qp *qp,
sgl.lkey = mw_bind->mr->lkey;
sgl.length = mw_bind->length;
wqe->bind.reserved = 0;
- wqe->bind.type = T3_VA_BASED_TO;
+ wqe->bind.type = TPT_VATO;
/* TBD: check perms */
- wqe->bind.perms = iwch_ib_to_mwbind_access(mw_bind->mw_access_flags);
+ wqe->bind.perms = iwch_ib_to_tpt_access(mw_bind->mw_access_flags);
wqe->bind.mr_stag = cpu_to_be32(mw_bind->mr->lkey);
wqe->bind.mw_stag = cpu_to_be32(mw->rkey);
wqe->bind.mw_len = cpu_to_be32(mw_bind->length);
@@ -430,7 +573,7 @@ int iwch_bind_mw(struct ib_qp *qp,
err = iwch_sgl2pbl_map(rhp, &sgl, 1, &pbl_addr, &page_size);
if (err) {
spin_unlock_irqrestore(&qhp->lock, flag);
- return err;
+ return err;
}
wqe->send.wrid.id0.hi = qhp->wq.sq_wptr;
sqp = qhp->wq.sq + Q_PTR2IDX(qhp->wq.sq_wptr, qhp->wq.sq_size_log2);
@@ -441,10 +584,9 @@ int iwch_bind_mw(struct ib_qp *qp,
sqp->signaled = (mw_bind->send_flags & IB_SEND_SIGNALED);
wqe->bind.mr_pbl_addr = cpu_to_be32(pbl_addr);
wqe->bind.mr_pagesz = page_size;
- wqe->flit[T3_SQ_COOKIE_FLIT] = mw_bind->wr_id;
build_fw_riwrh((void *)wqe, T3_WR_BIND, t3_wr_flags,
Q_GENBIT(qhp->wq.wptr, qhp->wq.size_log2), 0,
- sizeof(struct t3_bind_mw_wr) >> 3);
+ sizeof(struct t3_bind_mw_wr) >> 3, T3_SOPEOP);
++(qhp->wq.wptr);
++(qhp->wq.sq_wptr);
spin_unlock_irqrestore(&qhp->lock, flag);
@@ -758,7 +900,8 @@ static int rdma_init(struct iwch_dev *rhp, struct iwch_qp *qhp,
init_attr.qp_dma_size = (1UL << qhp->wq.size_log2);
init_attr.rqe_count = iwch_rqes_posted(qhp);
init_attr.flags = qhp->attr.mpa_attr.initiator ? MPA_INITIATOR : 0;
- init_attr.flags |= capable(CAP_NET_BIND_SERVICE) ? PRIV_QP : 0;
+ if (!qhp->ibqp.uobject)
+ init_attr.flags |= PRIV_QP;
if (peer2peer) {
init_attr.rtr_type = RTR_READ;
if (init_attr.ord == 0 && qhp->attr.mpa_attr.initiator)
diff --git a/drivers/infiniband/hw/ehca/ehca_irq.c b/drivers/infiniband/hw/ehca/ehca_irq.c
index ce1ab0571be3..0792d930c481 100644
--- a/drivers/infiniband/hw/ehca/ehca_irq.c
+++ b/drivers/infiniband/hw/ehca/ehca_irq.c
@@ -531,7 +531,7 @@ void ehca_process_eq(struct ehca_shca *shca, int is_irq)
{
struct ehca_eq *eq = &shca->eq;
struct ehca_eqe_cache_entry *eqe_cache = eq->eqe_cache;
- u64 eqe_value;
+ u64 eqe_value, ret;
unsigned long flags;
int eqe_cnt, i;
int eq_empty = 0;
@@ -583,8 +583,13 @@ void ehca_process_eq(struct ehca_shca *shca, int is_irq)
ehca_dbg(&shca->ib_device,
"No eqe found for irq event");
goto unlock_irq_spinlock;
- } else if (!is_irq)
+ } else if (!is_irq) {
+ ret = hipz_h_eoi(eq->ist);
+ if (ret != H_SUCCESS)
+ ehca_err(&shca->ib_device,
+ "bad return code EOI -rc = %ld\n", ret);
ehca_dbg(&shca->ib_device, "deadman found %x eqe", eqe_cnt);
+ }
if (unlikely(eqe_cnt == EHCA_EQE_CACHE_SIZE))
ehca_dbg(&shca->ib_device, "too many eqes for one irq event");
/* enable irq for new packets */
diff --git a/drivers/infiniband/hw/ehca/ehca_main.c b/drivers/infiniband/hw/ehca/ehca_main.c
index 482103eb6eac..598844d2edc9 100644
--- a/drivers/infiniband/hw/ehca/ehca_main.c
+++ b/drivers/infiniband/hw/ehca/ehca_main.c
@@ -923,6 +923,7 @@ static struct of_device_id ehca_device_table[] =
},
{},
};
+MODULE_DEVICE_TABLE(of, ehca_device_table);
static struct of_platform_driver ehca_driver = {
.name = "ehca",
diff --git a/drivers/infiniband/hw/ehca/ehca_reqs.c b/drivers/infiniband/hw/ehca/ehca_reqs.c
index f093b0033daf..dd9bc68f1c7b 100644
--- a/drivers/infiniband/hw/ehca/ehca_reqs.c
+++ b/drivers/infiniband/hw/ehca/ehca_reqs.c
@@ -544,8 +544,16 @@ int ehca_post_recv(struct ib_qp *qp,
struct ib_recv_wr *recv_wr,
struct ib_recv_wr **bad_recv_wr)
{
- return internal_post_recv(container_of(qp, struct ehca_qp, ib_qp),
- qp->device, recv_wr, bad_recv_wr);
+ struct ehca_qp *my_qp = container_of(qp, struct ehca_qp, ib_qp);
+
+ /* Reject WR if QP is in RESET state */
+ if (unlikely(my_qp->state == IB_QPS_RESET)) {
+ ehca_err(qp->device, "Invalid QP state qp_state=%d qpn=%x",
+ my_qp->state, qp->qp_num);
+ return -EINVAL;
+ }
+
+ return internal_post_recv(my_qp, qp->device, recv_wr, bad_recv_wr);
}
int ehca_post_srq_recv(struct ib_srq *srq,
@@ -681,7 +689,7 @@ poll_cq_one_read_cqe:
wc->dlid_path_bits = cqe->dlid;
wc->src_qp = cqe->remote_qp_number;
wc->wc_flags = cqe->w_completion_flags;
- wc->imm_data = cpu_to_be32(cqe->immediate_data);
+ wc->ex.imm_data = cpu_to_be32(cqe->immediate_data);
wc->sl = cqe->service_level;
poll_cq_one_exit0:
diff --git a/drivers/infiniband/hw/ehca/hcp_if.c b/drivers/infiniband/hw/ehca/hcp_if.c
index 5245e13c3a30..415d3a465de6 100644
--- a/drivers/infiniband/hw/ehca/hcp_if.c
+++ b/drivers/infiniband/hw/ehca/hcp_if.c
@@ -933,3 +933,13 @@ u64 hipz_h_error_data(const struct ipz_adapter_handle adapter_handle,
r_cb,
0, 0, 0, 0);
}
+
+u64 hipz_h_eoi(int irq)
+{
+ unsigned long xirr;
+
+ iosync();
+ xirr = (0xffULL << 24) | irq;
+
+ return plpar_hcall_norets(H_EOI, xirr);
+}
diff --git a/drivers/infiniband/hw/ehca/hcp_if.h b/drivers/infiniband/hw/ehca/hcp_if.h
index 60ce02b70663..2c3c6e0ea5c2 100644
--- a/drivers/infiniband/hw/ehca/hcp_if.h
+++ b/drivers/infiniband/hw/ehca/hcp_if.h
@@ -260,5 +260,6 @@ u64 hipz_h_error_data(const struct ipz_adapter_handle adapter_handle,
const u64 ressource_handle,
void *rblock,
unsigned long *byte_count);
+u64 hipz_h_eoi(int irq);
#endif /* __HCP_IF_H__ */
diff --git a/drivers/infiniband/hw/ipath/ipath_cq.c b/drivers/infiniband/hw/ipath/ipath_cq.c
index a03bd28d9b48..d385e4168c97 100644
--- a/drivers/infiniband/hw/ipath/ipath_cq.c
+++ b/drivers/infiniband/hw/ipath/ipath_cq.c
@@ -82,7 +82,7 @@ void ipath_cq_enter(struct ipath_cq *cq, struct ib_wc *entry, int solicited)
wc->uqueue[head].opcode = entry->opcode;
wc->uqueue[head].vendor_err = entry->vendor_err;
wc->uqueue[head].byte_len = entry->byte_len;
- wc->uqueue[head].imm_data = (__u32 __force)entry->imm_data;
+ wc->uqueue[head].ex.imm_data = (__u32 __force) entry->ex.imm_data;
wc->uqueue[head].qp_num = entry->qp->qp_num;
wc->uqueue[head].src_qp = entry->src_qp;
wc->uqueue[head].wc_flags = entry->wc_flags;
diff --git a/drivers/infiniband/hw/ipath/ipath_iba7220.c b/drivers/infiniband/hw/ipath/ipath_iba7220.c
index 8eee7830f042..fb70712ac85c 100644
--- a/drivers/infiniband/hw/ipath/ipath_iba7220.c
+++ b/drivers/infiniband/hw/ipath/ipath_iba7220.c
@@ -2228,8 +2228,8 @@ static void ipath_autoneg_send(struct ipath_devdata *dd, int which)
0xffffffff, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
0x40000001, 0x1388, 0x15e, /* rest 0's */
};
- dcnt = sizeof(madpayload_start)/sizeof(madpayload_start[0]);
- hcnt = sizeof(hdr)/sizeof(hdr[0]);
+ dcnt = ARRAY_SIZE(madpayload_start);
+ hcnt = ARRAY_SIZE(hdr);
if (!swapped) {
/* for maintainability, do it at runtime */
for (i = 0; i < hcnt; i++) {
diff --git a/drivers/infiniband/hw/ipath/ipath_mad.c b/drivers/infiniband/hw/ipath/ipath_mad.c
index 5f9315d77a43..be4fc9ada8e7 100644
--- a/drivers/infiniband/hw/ipath/ipath_mad.c
+++ b/drivers/infiniband/hw/ipath/ipath_mad.c
@@ -111,9 +111,9 @@ static int recv_subn_get_nodeinfo(struct ib_smp *smp,
nip->revision = cpu_to_be32((majrev << 16) | minrev);
nip->local_port_num = port;
vendor = dd->ipath_vendorid;
- nip->vendor_id[0] = 0;
- nip->vendor_id[1] = vendor >> 8;
- nip->vendor_id[2] = vendor;
+ nip->vendor_id[0] = IPATH_SRC_OUI_1;
+ nip->vendor_id[1] = IPATH_SRC_OUI_2;
+ nip->vendor_id[2] = IPATH_SRC_OUI_3;
return reply(smp);
}
diff --git a/drivers/infiniband/hw/ipath/ipath_rc.c b/drivers/infiniband/hw/ipath/ipath_rc.c
index 108df667d2ee..97710522624d 100644
--- a/drivers/infiniband/hw/ipath/ipath_rc.c
+++ b/drivers/infiniband/hw/ipath/ipath_rc.c
@@ -1703,11 +1703,11 @@ void ipath_rc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
case OP(SEND_LAST_WITH_IMMEDIATE):
send_last_imm:
if (header_in_data) {
- wc.imm_data = *(__be32 *) data;
+ wc.ex.imm_data = *(__be32 *) data;
data += sizeof(__be32);
} else {
/* Immediate data comes after BTH */
- wc.imm_data = ohdr->u.imm_data;
+ wc.ex.imm_data = ohdr->u.imm_data;
}
hdrsize += 4;
wc.wc_flags = IB_WC_WITH_IMM;
diff --git a/drivers/infiniband/hw/ipath/ipath_ruc.c b/drivers/infiniband/hw/ipath/ipath_ruc.c
index a4b5521567fe..af051f757663 100644
--- a/drivers/infiniband/hw/ipath/ipath_ruc.c
+++ b/drivers/infiniband/hw/ipath/ipath_ruc.c
@@ -331,7 +331,7 @@ again:
switch (wqe->wr.opcode) {
case IB_WR_SEND_WITH_IMM:
wc.wc_flags = IB_WC_WITH_IMM;
- wc.imm_data = wqe->wr.ex.imm_data;
+ wc.ex.imm_data = wqe->wr.ex.imm_data;
/* FALLTHROUGH */
case IB_WR_SEND:
if (!ipath_get_rwqe(qp, 0))
@@ -342,7 +342,7 @@ again:
if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE)))
goto inv_err;
wc.wc_flags = IB_WC_WITH_IMM;
- wc.imm_data = wqe->wr.ex.imm_data;
+ wc.ex.imm_data = wqe->wr.ex.imm_data;
if (!ipath_get_rwqe(qp, 1))
goto rnr_nak;
/* FALLTHROUGH */
diff --git a/drivers/infiniband/hw/ipath/ipath_uc.c b/drivers/infiniband/hw/ipath/ipath_uc.c
index 0596ec16fcbd..82cc588b8bf2 100644
--- a/drivers/infiniband/hw/ipath/ipath_uc.c
+++ b/drivers/infiniband/hw/ipath/ipath_uc.c
@@ -379,11 +379,11 @@ void ipath_uc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
case OP(SEND_LAST_WITH_IMMEDIATE):
send_last_imm:
if (header_in_data) {
- wc.imm_data = *(__be32 *) data;
+ wc.ex.imm_data = *(__be32 *) data;
data += sizeof(__be32);
} else {
/* Immediate data comes after BTH */
- wc.imm_data = ohdr->u.imm_data;
+ wc.ex.imm_data = ohdr->u.imm_data;
}
hdrsize += 4;
wc.wc_flags = IB_WC_WITH_IMM;
@@ -483,11 +483,11 @@ void ipath_uc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
case OP(RDMA_WRITE_LAST_WITH_IMMEDIATE):
rdma_last_imm:
if (header_in_data) {
- wc.imm_data = *(__be32 *) data;
+ wc.ex.imm_data = *(__be32 *) data;
data += sizeof(__be32);
} else {
/* Immediate data comes after BTH */
- wc.imm_data = ohdr->u.imm_data;
+ wc.ex.imm_data = ohdr->u.imm_data;
}
hdrsize += 4;
wc.wc_flags = IB_WC_WITH_IMM;
diff --git a/drivers/infiniband/hw/ipath/ipath_ud.c b/drivers/infiniband/hw/ipath/ipath_ud.c
index 77ca8ca74e78..36aa242c487c 100644
--- a/drivers/infiniband/hw/ipath/ipath_ud.c
+++ b/drivers/infiniband/hw/ipath/ipath_ud.c
@@ -96,7 +96,7 @@ static void ipath_ud_loopback(struct ipath_qp *sqp, struct ipath_swqe *swqe)
if (swqe->wr.opcode == IB_WR_SEND_WITH_IMM) {
wc.wc_flags = IB_WC_WITH_IMM;
- wc.imm_data = swqe->wr.ex.imm_data;
+ wc.ex.imm_data = swqe->wr.ex.imm_data;
}
/*
@@ -492,14 +492,14 @@ void ipath_ud_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
if (qp->ibqp.qp_num > 1 &&
opcode == IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE) {
if (header_in_data) {
- wc.imm_data = *(__be32 *) data;
+ wc.ex.imm_data = *(__be32 *) data;
data += sizeof(__be32);
} else
- wc.imm_data = ohdr->u.ud.imm_data;
+ wc.ex.imm_data = ohdr->u.ud.imm_data;
wc.wc_flags = IB_WC_WITH_IMM;
hdrsize += sizeof(u32);
} else if (opcode == IB_OPCODE_UD_SEND_ONLY) {
- wc.imm_data = 0;
+ wc.ex.imm_data = 0;
wc.wc_flags = 0;
} else {
dev->n_pkt_drops++;
diff --git a/drivers/infiniband/hw/ipath/ipath_verbs.c b/drivers/infiniband/hw/ipath/ipath_verbs.c
index 7779165b2c2c..9e23ab0b51a1 100644
--- a/drivers/infiniband/hw/ipath/ipath_verbs.c
+++ b/drivers/infiniband/hw/ipath/ipath_verbs.c
@@ -1497,7 +1497,8 @@ static int ipath_query_device(struct ib_device *ibdev,
IB_DEVICE_SYS_IMAGE_GUID | IB_DEVICE_RC_RNR_NAK_GEN |
IB_DEVICE_PORT_ACTIVE_EVENT | IB_DEVICE_SRQ_RESIZE;
props->page_size_cap = PAGE_SIZE;
- props->vendor_id = dev->dd->ipath_vendorid;
+ props->vendor_id =
+ IPATH_SRC_OUI_1 << 16 | IPATH_SRC_OUI_2 << 8 | IPATH_SRC_OUI_3;
props->vendor_part_id = dev->dd->ipath_deviceid;
props->hw_ver = dev->dd->ipath_pcirev;
diff --git a/drivers/infiniband/hw/mlx4/cq.c b/drivers/infiniband/hw/mlx4/cq.c
index 4521319b1406..299f20832ab6 100644
--- a/drivers/infiniband/hw/mlx4/cq.c
+++ b/drivers/infiniband/hw/mlx4/cq.c
@@ -663,18 +663,18 @@ repoll:
switch (cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) {
case MLX4_RECV_OPCODE_RDMA_WRITE_IMM:
- wc->opcode = IB_WC_RECV_RDMA_WITH_IMM;
- wc->wc_flags = IB_WC_WITH_IMM;
- wc->imm_data = cqe->immed_rss_invalid;
+ wc->opcode = IB_WC_RECV_RDMA_WITH_IMM;
+ wc->wc_flags = IB_WC_WITH_IMM;
+ wc->ex.imm_data = cqe->immed_rss_invalid;
break;
case MLX4_RECV_OPCODE_SEND:
wc->opcode = IB_WC_RECV;
wc->wc_flags = 0;
break;
case MLX4_RECV_OPCODE_SEND_IMM:
- wc->opcode = IB_WC_RECV;
- wc->wc_flags = IB_WC_WITH_IMM;
- wc->imm_data = cqe->immed_rss_invalid;
+ wc->opcode = IB_WC_RECV;
+ wc->wc_flags = IB_WC_WITH_IMM;
+ wc->ex.imm_data = cqe->immed_rss_invalid;
break;
}
diff --git a/drivers/infiniband/hw/mlx4/mad.c b/drivers/infiniband/hw/mlx4/mad.c
index 4c1e72fc8f57..cdca3a511e1c 100644
--- a/drivers/infiniband/hw/mlx4/mad.c
+++ b/drivers/infiniband/hw/mlx4/mad.c
@@ -255,7 +255,8 @@ int mlx4_ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num,
return IB_MAD_RESULT_SUCCESS;
} else if (in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_PERF_MGMT ||
in_mad->mad_hdr.mgmt_class == MLX4_IB_VENDOR_CLASS1 ||
- in_mad->mad_hdr.mgmt_class == MLX4_IB_VENDOR_CLASS2) {
+ in_mad->mad_hdr.mgmt_class == MLX4_IB_VENDOR_CLASS2 ||
+ in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_CONG_MGMT) {
if (in_mad->mad_hdr.method != IB_MGMT_METHOD_GET &&
in_mad->mad_hdr.method != IB_MGMT_METHOD_SET)
return IB_MAD_RESULT_SUCCESS;
diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c
index 4d61e32866c6..bcf50648fa18 100644
--- a/drivers/infiniband/hw/mlx4/main.c
+++ b/drivers/infiniband/hw/mlx4/main.c
@@ -90,7 +90,8 @@ static int mlx4_ib_query_device(struct ib_device *ibdev,
props->device_cap_flags = IB_DEVICE_CHANGE_PHY_PORT |
IB_DEVICE_PORT_ACTIVE_EVENT |
IB_DEVICE_SYS_IMAGE_GUID |
- IB_DEVICE_RC_RNR_NAK_GEN;
+ IB_DEVICE_RC_RNR_NAK_GEN |
+ IB_DEVICE_BLOCK_MULTICAST_LOOPBACK;
if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_BAD_PKEY_CNTR)
props->device_cap_flags |= IB_DEVICE_BAD_PKEY_CNTR;
if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_BAD_QKEY_CNTR)
@@ -437,7 +438,9 @@ static int mlx4_ib_dealloc_pd(struct ib_pd *pd)
static int mlx4_ib_mcg_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
{
return mlx4_multicast_attach(to_mdev(ibqp->device)->dev,
- &to_mqp(ibqp)->mqp, gid->raw);
+ &to_mqp(ibqp)->mqp, gid->raw,
+ !!(to_mqp(ibqp)->flags &
+ MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK));
}
static int mlx4_ib_mcg_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
diff --git a/drivers/infiniband/hw/mlx4/mlx4_ib.h b/drivers/infiniband/hw/mlx4/mlx4_ib.h
index 5cf994794d25..c4cf5b69eefa 100644
--- a/drivers/infiniband/hw/mlx4/mlx4_ib.h
+++ b/drivers/infiniband/hw/mlx4/mlx4_ib.h
@@ -101,7 +101,8 @@ struct mlx4_ib_wq {
};
enum mlx4_ib_qp_flags {
- MLX4_IB_QP_LSO = 1 << 0
+ MLX4_IB_QP_LSO = 1 << 0,
+ MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK = 1 << 1,
};
struct mlx4_ib_qp {
diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c
index a80df22deae8..89eb6cbe592e 100644
--- a/drivers/infiniband/hw/mlx4/qp.c
+++ b/drivers/infiniband/hw/mlx4/qp.c
@@ -129,9 +129,10 @@ static void stamp_send_wqe(struct mlx4_ib_qp *qp, int n, int size)
int ind;
void *buf;
__be32 stamp;
+ struct mlx4_wqe_ctrl_seg *ctrl;
- s = roundup(size, 1U << qp->sq.wqe_shift);
if (qp->sq_max_wqes_per_wr > 1) {
+ s = roundup(size, 1U << qp->sq.wqe_shift);
for (i = 0; i < s; i += 64) {
ind = (i >> qp->sq.wqe_shift) + n;
stamp = ind & qp->sq.wqe_cnt ? cpu_to_be32(0x7fffffff) :
@@ -141,7 +142,8 @@ static void stamp_send_wqe(struct mlx4_ib_qp *qp, int n, int size)
*wqe = stamp;
}
} else {
- buf = get_send_wqe(qp, n & (qp->sq.wqe_cnt - 1));
+ ctrl = buf = get_send_wqe(qp, n & (qp->sq.wqe_cnt - 1));
+ s = (ctrl->fence_size & 0x3f) << 4;
for (i = 64; i < s; i += 64) {
wqe = buf + i;
*wqe = cpu_to_be32(0xffffffff);
@@ -452,19 +454,8 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd,
spin_lock_init(&qp->rq.lock);
qp->state = IB_QPS_RESET;
- qp->atomic_rd_en = 0;
- qp->resp_depth = 0;
-
- qp->rq.head = 0;
- qp->rq.tail = 0;
- qp->sq.head = 0;
- qp->sq.tail = 0;
- qp->sq_next_wqe = 0;
-
if (init_attr->sq_sig_type == IB_SIGNAL_ALL_WR)
qp->sq_signal_bits = cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE);
- else
- qp->sq_signal_bits = 0;
err = set_rq_size(dev, &init_attr->cap, !!pd->uobject, !!init_attr->srq, qp);
if (err)
@@ -509,6 +500,9 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd,
} else {
qp->sq_no_prefetch = 0;
+ if (init_attr->create_flags & IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK)
+ qp->flags |= MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK;
+
if (init_attr->create_flags & IB_QP_CREATE_IPOIB_UD_LSO)
qp->flags |= MLX4_IB_QP_LSO;
@@ -682,10 +676,15 @@ struct ib_qp *mlx4_ib_create_qp(struct ib_pd *pd,
struct mlx4_ib_qp *qp;
int err;
- /* We only support LSO, and only for kernel UD QPs. */
- if (init_attr->create_flags & ~IB_QP_CREATE_IPOIB_UD_LSO)
+ /*
+ * We only support LSO and multicast loopback blocking, and
+ * only for kernel UD QPs.
+ */
+ if (init_attr->create_flags & ~(IB_QP_CREATE_IPOIB_UD_LSO |
+ IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK))
return ERR_PTR(-EINVAL);
- if (init_attr->create_flags & IB_QP_CREATE_IPOIB_UD_LSO &&
+
+ if (init_attr->create_flags &&
(pd->uobject || init_attr->qp_type != IB_QPT_UD))
return ERR_PTR(-EINVAL);
@@ -694,7 +693,7 @@ struct ib_qp *mlx4_ib_create_qp(struct ib_pd *pd,
case IB_QPT_UC:
case IB_QPT_UD:
{
- qp = kmalloc(sizeof *qp, GFP_KERNEL);
+ qp = kzalloc(sizeof *qp, GFP_KERNEL);
if (!qp)
return ERR_PTR(-ENOMEM);
@@ -715,7 +714,7 @@ struct ib_qp *mlx4_ib_create_qp(struct ib_pd *pd,
if (pd->uobject)
return ERR_PTR(-EINVAL);
- sqp = kmalloc(sizeof *sqp, GFP_KERNEL);
+ sqp = kzalloc(sizeof *sqp, GFP_KERNEL);
if (!sqp)
return ERR_PTR(-ENOMEM);
@@ -906,7 +905,8 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp,
attr->path_mtu);
goto out;
}
- context->mtu_msgmax = (attr->path_mtu << 5) | 31;
+ context->mtu_msgmax = (attr->path_mtu << 5) |
+ ilog2(dev->dev->caps.max_msg_sz);
}
if (qp->rq.wqe_cnt)
@@ -1063,6 +1063,8 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp,
for (i = 0; i < qp->sq.wqe_cnt; ++i) {
ctrl = get_send_wqe(qp, i);
ctrl->owner_opcode = cpu_to_be32(1 << 31);
+ if (qp->sq_max_wqes_per_wr == 1)
+ ctrl->fence_size = 1 << (qp->sq.wqe_shift - 4);
stamp_send_wqe(qp, i, 1 << qp->sq.wqe_shift);
}
@@ -1127,23 +1129,6 @@ out:
return err;
}
-static const struct ib_qp_attr mlx4_ib_qp_attr = { .port_num = 1 };
-static const int mlx4_ib_qp_attr_mask_table[IB_QPT_UD + 1] = {
- [IB_QPT_UD] = (IB_QP_PKEY_INDEX |
- IB_QP_PORT |
- IB_QP_QKEY),
- [IB_QPT_UC] = (IB_QP_PKEY_INDEX |
- IB_QP_PORT |
- IB_QP_ACCESS_FLAGS),
- [IB_QPT_RC] = (IB_QP_PKEY_INDEX |
- IB_QP_PORT |
- IB_QP_ACCESS_FLAGS),
- [IB_QPT_SMI] = (IB_QP_PKEY_INDEX |
- IB_QP_QKEY),
- [IB_QPT_GSI] = (IB_QP_PKEY_INDEX |
- IB_QP_QKEY),
-};
-
int mlx4_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
int attr_mask, struct ib_udata *udata)
{
@@ -1186,15 +1171,6 @@ int mlx4_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
goto out;
}
- if (cur_state == IB_QPS_RESET && new_state == IB_QPS_ERR) {
- err = __mlx4_ib_modify_qp(ibqp, &mlx4_ib_qp_attr,
- mlx4_ib_qp_attr_mask_table[ibqp->qp_type],
- IB_QPS_RESET, IB_QPS_INIT);
- if (err)
- goto out;
- cur_state = IB_QPS_INIT;
- }
-
err = __mlx4_ib_modify_qp(ibqp, attr, attr_mask, cur_state, new_state);
out:
@@ -1865,6 +1841,13 @@ done:
qp_init_attr->cap = qp_attr->cap;
+ qp_init_attr->create_flags = 0;
+ if (qp->flags & MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK)
+ qp_init_attr->create_flags |= IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK;
+
+ if (qp->flags & MLX4_IB_QP_LSO)
+ qp_init_attr->create_flags |= IB_QP_CREATE_IPOIB_UD_LSO;
+
out:
mutex_unlock(&qp->mutex);
return err;
diff --git a/drivers/infiniband/hw/mthca/mthca_allocator.c b/drivers/infiniband/hw/mthca/mthca_allocator.c
index a76306709618..c5ccc2daab60 100644
--- a/drivers/infiniband/hw/mthca/mthca_allocator.c
+++ b/drivers/infiniband/hw/mthca/mthca_allocator.c
@@ -28,8 +28,6 @@
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
- *
- * $Id: mthca_allocator.c 1349 2004-12-16 21:09:43Z roland $
*/
#include <linux/errno.h>
diff --git a/drivers/infiniband/hw/mthca/mthca_av.c b/drivers/infiniband/hw/mthca/mthca_av.c
index 4b111a852ff6..32f6c6315454 100644
--- a/drivers/infiniband/hw/mthca/mthca_av.c
+++ b/drivers/infiniband/hw/mthca/mthca_av.c
@@ -29,8 +29,6 @@
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
- *
- * $Id: mthca_av.c 1349 2004-12-16 21:09:43Z roland $
*/
#include <linux/string.h>
diff --git a/drivers/infiniband/hw/mthca/mthca_catas.c b/drivers/infiniband/hw/mthca/mthca_catas.c
index e948158a28d9..cc440f90000b 100644
--- a/drivers/infiniband/hw/mthca/mthca_catas.c
+++ b/drivers/infiniband/hw/mthca/mthca_catas.c
@@ -28,8 +28,6 @@
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
- *
- * $Id$
*/
#include <linux/jiffies.h>
@@ -128,7 +126,6 @@ static void handle_catas(struct mthca_dev *dev)
static void poll_catas(unsigned long dev_ptr)
{
struct mthca_dev *dev = (struct mthca_dev *) dev_ptr;
- unsigned long flags;
int i;
for (i = 0; i < dev->catas_err.size; ++i)
@@ -137,13 +134,8 @@ static void poll_catas(unsigned long dev_ptr)
return;
}
- spin_lock_irqsave(&catas_lock, flags);
- if (!dev->catas_err.stop)
- mod_timer(&dev->catas_err.timer,
- jiffies + MTHCA_CATAS_POLL_INTERVAL);
- spin_unlock_irqrestore(&catas_lock, flags);
-
- return;
+ mod_timer(&dev->catas_err.timer,
+ round_jiffies(jiffies + MTHCA_CATAS_POLL_INTERVAL));
}
void mthca_start_catas_poll(struct mthca_dev *dev)
@@ -151,7 +143,6 @@ void mthca_start_catas_poll(struct mthca_dev *dev)
unsigned long addr;
init_timer(&dev->catas_err.timer);
- dev->catas_err.stop = 0;
dev->catas_err.map = NULL;
addr = pci_resource_start(dev->pdev, 0) +
@@ -182,10 +173,6 @@ void mthca_start_catas_poll(struct mthca_dev *dev)
void mthca_stop_catas_poll(struct mthca_dev *dev)
{
- spin_lock_irq(&catas_lock);
- dev->catas_err.stop = 1;
- spin_unlock_irq(&catas_lock);
-
del_timer_sync(&dev->catas_err.timer);
if (dev->catas_err.map) {
diff --git a/drivers/infiniband/hw/mthca/mthca_cmd.c b/drivers/infiniband/hw/mthca/mthca_cmd.c
index 54d230ee7d63..c33e1c53c799 100644
--- a/drivers/infiniband/hw/mthca/mthca_cmd.c
+++ b/drivers/infiniband/hw/mthca/mthca_cmd.c
@@ -30,8 +30,6 @@
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
- *
- * $Id: mthca_cmd.c 1349 2004-12-16 21:09:43Z roland $
*/
#include <linux/completion.h>
diff --git a/drivers/infiniband/hw/mthca/mthca_cmd.h b/drivers/infiniband/hw/mthca/mthca_cmd.h
index 8928ca4a9325..6efd3265f248 100644
--- a/drivers/infiniband/hw/mthca/mthca_cmd.h
+++ b/drivers/infiniband/hw/mthca/mthca_cmd.h
@@ -30,8 +30,6 @@
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
- *
- * $Id: mthca_cmd.h 1349 2004-12-16 21:09:43Z roland $
*/
#ifndef MTHCA_CMD_H
diff --git a/drivers/infiniband/hw/mthca/mthca_config_reg.h b/drivers/infiniband/hw/mthca/mthca_config_reg.h
index afa56bfaab2e..75671f75cac4 100644
--- a/drivers/infiniband/hw/mthca/mthca_config_reg.h
+++ b/drivers/infiniband/hw/mthca/mthca_config_reg.h
@@ -29,8 +29,6 @@
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
- *
- * $Id: mthca_config_reg.h 1349 2004-12-16 21:09:43Z roland $
*/
#ifndef MTHCA_CONFIG_REG_H
diff --git a/drivers/infiniband/hw/mthca/mthca_cq.c b/drivers/infiniband/hw/mthca/mthca_cq.c
index 20401d2ba6b2..d9f4735c2b37 100644
--- a/drivers/infiniband/hw/mthca/mthca_cq.c
+++ b/drivers/infiniband/hw/mthca/mthca_cq.c
@@ -32,8 +32,6 @@
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
- *
- * $Id: mthca_cq.c 1369 2004-12-20 16:17:07Z roland $
*/
#include <linux/hardirq.h>
@@ -622,13 +620,13 @@ static inline int mthca_poll_one(struct mthca_dev *dev,
case IB_OPCODE_SEND_LAST_WITH_IMMEDIATE:
case IB_OPCODE_SEND_ONLY_WITH_IMMEDIATE:
entry->wc_flags = IB_WC_WITH_IMM;
- entry->imm_data = cqe->imm_etype_pkey_eec;
+ entry->ex.imm_data = cqe->imm_etype_pkey_eec;
entry->opcode = IB_WC_RECV;
break;
case IB_OPCODE_RDMA_WRITE_LAST_WITH_IMMEDIATE:
case IB_OPCODE_RDMA_WRITE_ONLY_WITH_IMMEDIATE:
entry->wc_flags = IB_WC_WITH_IMM;
- entry->imm_data = cqe->imm_etype_pkey_eec;
+ entry->ex.imm_data = cqe->imm_etype_pkey_eec;
entry->opcode = IB_WC_RECV_RDMA_WITH_IMM;
break;
default:
diff --git a/drivers/infiniband/hw/mthca/mthca_dev.h b/drivers/infiniband/hw/mthca/mthca_dev.h
index 7bc32f8e377e..ee4d073c889f 100644
--- a/drivers/infiniband/hw/mthca/mthca_dev.h
+++ b/drivers/infiniband/hw/mthca/mthca_dev.h
@@ -32,8 +32,6 @@
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
- *
- * $Id: mthca_dev.h 1349 2004-12-16 21:09:43Z roland $
*/
#ifndef MTHCA_DEV_H
@@ -279,7 +277,6 @@ struct mthca_mcg_table {
struct mthca_catas_err {
u64 addr;
u32 __iomem *map;
- unsigned long stop;
u32 size;
struct timer_list timer;
struct list_head list;
diff --git a/drivers/infiniband/hw/mthca/mthca_doorbell.h b/drivers/infiniband/hw/mthca/mthca_doorbell.h
index b374dc395be1..14f51ef97d7e 100644
--- a/drivers/infiniband/hw/mthca/mthca_doorbell.h
+++ b/drivers/infiniband/hw/mthca/mthca_doorbell.h
@@ -30,8 +30,6 @@
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
- *
- * $Id: mthca_doorbell.h 1349 2004-12-16 21:09:43Z roland $
*/
#include <linux/types.h>
diff --git a/drivers/infiniband/hw/mthca/mthca_eq.c b/drivers/infiniband/hw/mthca/mthca_eq.c
index 8bde7f98e58a..4e36aa7cb3d2 100644
--- a/drivers/infiniband/hw/mthca/mthca_eq.c
+++ b/drivers/infiniband/hw/mthca/mthca_eq.c
@@ -29,8 +29,6 @@
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
- *
- * $Id: mthca_eq.c 1382 2004-12-24 02:21:02Z roland $
*/
#include <linux/errno.h>
diff --git a/drivers/infiniband/hw/mthca/mthca_mad.c b/drivers/infiniband/hw/mthca/mthca_mad.c
index 8b7e83e6e88f..640449582aba 100644
--- a/drivers/infiniband/hw/mthca/mthca_mad.c
+++ b/drivers/infiniband/hw/mthca/mthca_mad.c
@@ -30,8 +30,6 @@
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
- *
- * $Id: mthca_mad.c 1349 2004-12-16 21:09:43Z roland $
*/
#include <linux/string.h>
diff --git a/drivers/infiniband/hw/mthca/mthca_main.c b/drivers/infiniband/hw/mthca/mthca_main.c
index 200cf13fc9bb..fb9f91b60f30 100644
--- a/drivers/infiniband/hw/mthca/mthca_main.c
+++ b/drivers/infiniband/hw/mthca/mthca_main.c
@@ -30,8 +30,6 @@
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
- *
- * $Id: mthca_main.c 1396 2004-12-28 04:10:27Z roland $
*/
#include <linux/module.h>
diff --git a/drivers/infiniband/hw/mthca/mthca_mcg.c b/drivers/infiniband/hw/mthca/mthca_mcg.c
index a8ad072be074..3f5f94879208 100644
--- a/drivers/infiniband/hw/mthca/mthca_mcg.c
+++ b/drivers/infiniband/hw/mthca/mthca_mcg.c
@@ -28,8 +28,6 @@
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
- *
- * $Id: mthca_mcg.c 1349 2004-12-16 21:09:43Z roland $
*/
#include <linux/string.h>
diff --git a/drivers/infiniband/hw/mthca/mthca_memfree.c b/drivers/infiniband/hw/mthca/mthca_memfree.c
index d5862e5d99a0..1f7d1a29d2a8 100644
--- a/drivers/infiniband/hw/mthca/mthca_memfree.c
+++ b/drivers/infiniband/hw/mthca/mthca_memfree.c
@@ -30,8 +30,6 @@
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
- *
- * $Id$
*/
#include <linux/mm.h>
diff --git a/drivers/infiniband/hw/mthca/mthca_memfree.h b/drivers/infiniband/hw/mthca/mthca_memfree.h
index a1ab06847b75..da9b8f9b884f 100644
--- a/drivers/infiniband/hw/mthca/mthca_memfree.h
+++ b/drivers/infiniband/hw/mthca/mthca_memfree.h
@@ -30,8 +30,6 @@
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
- *
- * $Id$
*/
#ifndef MTHCA_MEMFREE_H
diff --git a/drivers/infiniband/hw/mthca/mthca_mr.c b/drivers/infiniband/hw/mthca/mthca_mr.c
index 820205dec560..8489b1e81c0f 100644
--- a/drivers/infiniband/hw/mthca/mthca_mr.c
+++ b/drivers/infiniband/hw/mthca/mthca_mr.c
@@ -29,8 +29,6 @@
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
- *
- * $Id: mthca_mr.c 1349 2004-12-16 21:09:43Z roland $
*/
#include <linux/slab.h>
diff --git a/drivers/infiniband/hw/mthca/mthca_pd.c b/drivers/infiniband/hw/mthca/mthca_pd.c
index c1e950764bd8..266f14e47406 100644
--- a/drivers/infiniband/hw/mthca/mthca_pd.c
+++ b/drivers/infiniband/hw/mthca/mthca_pd.c
@@ -30,8 +30,6 @@
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
- *
- * $Id: mthca_pd.c 1349 2004-12-16 21:09:43Z roland $
*/
#include <linux/errno.h>
diff --git a/drivers/infiniband/hw/mthca/mthca_profile.c b/drivers/infiniband/hw/mthca/mthca_profile.c
index 605a8d57fac6..d168c2540611 100644
--- a/drivers/infiniband/hw/mthca/mthca_profile.c
+++ b/drivers/infiniband/hw/mthca/mthca_profile.c
@@ -29,8 +29,6 @@
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
- *
- * $Id: mthca_profile.c 1349 2004-12-16 21:09:43Z roland $
*/
#include <linux/module.h>
diff --git a/drivers/infiniband/hw/mthca/mthca_profile.h b/drivers/infiniband/hw/mthca/mthca_profile.h
index e76cb62d8e32..62b009cc8730 100644
--- a/drivers/infiniband/hw/mthca/mthca_profile.h
+++ b/drivers/infiniband/hw/mthca/mthca_profile.h
@@ -29,8 +29,6 @@
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
- *
- * $Id: mthca_profile.h 1349 2004-12-16 21:09:43Z roland $
*/
#ifndef MTHCA_PROFILE_H
diff --git a/drivers/infiniband/hw/mthca/mthca_provider.c b/drivers/infiniband/hw/mthca/mthca_provider.c
index be34f99ca625..87ad889e367b 100644
--- a/drivers/infiniband/hw/mthca/mthca_provider.c
+++ b/drivers/infiniband/hw/mthca/mthca_provider.c
@@ -32,8 +32,6 @@
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
- *
- * $Id: mthca_provider.c 4859 2006-01-09 21:55:10Z roland $
*/
#include <rdma/ib_smi.h>
diff --git a/drivers/infiniband/hw/mthca/mthca_provider.h b/drivers/infiniband/hw/mthca/mthca_provider.h
index 934bf9544037..c621f8794b88 100644
--- a/drivers/infiniband/hw/mthca/mthca_provider.h
+++ b/drivers/infiniband/hw/mthca/mthca_provider.h
@@ -30,8 +30,6 @@
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
- *
- * $Id: mthca_provider.h 1349 2004-12-16 21:09:43Z roland $
*/
#ifndef MTHCA_PROVIDER_H
diff --git a/drivers/infiniband/hw/mthca/mthca_qp.c b/drivers/infiniband/hw/mthca/mthca_qp.c
index 09dc3614cf2c..f5081bfde6db 100644
--- a/drivers/infiniband/hw/mthca/mthca_qp.c
+++ b/drivers/infiniband/hw/mthca/mthca_qp.c
@@ -31,8 +31,6 @@
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
- *
- * $Id: mthca_qp.c 1355 2004-12-17 15:23:43Z roland $
*/
#include <linux/string.h>
@@ -850,23 +848,6 @@ out:
return err;
}
-static const struct ib_qp_attr dummy_init_attr = { .port_num = 1 };
-static const int dummy_init_attr_mask[] = {
- [IB_QPT_UD] = (IB_QP_PKEY_INDEX |
- IB_QP_PORT |
- IB_QP_QKEY),
- [IB_QPT_UC] = (IB_QP_PKEY_INDEX |
- IB_QP_PORT |
- IB_QP_ACCESS_FLAGS),
- [IB_QPT_RC] = (IB_QP_PKEY_INDEX |
- IB_QP_PORT |
- IB_QP_ACCESS_FLAGS),
- [IB_QPT_SMI] = (IB_QP_PKEY_INDEX |
- IB_QP_QKEY),
- [IB_QPT_GSI] = (IB_QP_PKEY_INDEX |
- IB_QP_QKEY),
-};
-
int mthca_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask,
struct ib_udata *udata)
{
@@ -928,15 +909,6 @@ int mthca_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask,
goto out;
}
- if (cur_state == IB_QPS_RESET && new_state == IB_QPS_ERR) {
- err = __mthca_modify_qp(ibqp, &dummy_init_attr,
- dummy_init_attr_mask[ibqp->qp_type],
- IB_QPS_RESET, IB_QPS_INIT);
- if (err)
- goto out;
- cur_state = IB_QPS_INIT;
- }
-
err = __mthca_modify_qp(ibqp, attr, attr_mask, cur_state, new_state);
out:
@@ -1277,10 +1249,10 @@ static int mthca_set_qp_size(struct mthca_dev *dev, struct ib_qp_cap *cap,
return -EINVAL;
/*
- * For MLX transport we need 2 extra S/G entries:
+ * For MLX transport we need 2 extra send gather entries:
* one for the header and one for the checksum at the end
*/
- if (qp->transport == MLX && cap->max_recv_sge + 2 > dev->limits.max_sg)
+ if (qp->transport == MLX && cap->max_send_sge + 2 > dev->limits.max_sg)
return -EINVAL;
if (mthca_is_memfree(dev)) {
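
The last mthca_qp.c hunk above is a straight bug fix: the two extra gather entries that MLX-transport (special) QPs need for the software-built UD header and the trailing checksum are consumed on the send queue, so the limit has to be checked against cap->max_send_sge rather than cap->max_recv_sge.
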
diff --git a/drivers/infiniband/hw/mthca/mthca_reset.c b/drivers/infiniband/hw/mthca/mthca_reset.c
index 91934f2d9dba..acb6817f6060 100644
--- a/drivers/infiniband/hw/mthca/mthca_reset.c
+++ b/drivers/infiniband/hw/mthca/mthca_reset.c
@@ -28,8 +28,6 @@
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
- *
- * $Id: mthca_reset.c 1349 2004-12-16 21:09:43Z roland $
*/
#include <linux/init.h>
diff --git a/drivers/infiniband/hw/mthca/mthca_srq.c b/drivers/infiniband/hw/mthca/mthca_srq.c
index a5ffff6e1026..4fabe62aab8a 100644
--- a/drivers/infiniband/hw/mthca/mthca_srq.c
+++ b/drivers/infiniband/hw/mthca/mthca_srq.c
@@ -28,8 +28,6 @@
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
- *
- * $Id: mthca_srq.c 3047 2005-08-10 03:59:35Z roland $
*/
#include <linux/slab.h>
diff --git a/drivers/infiniband/hw/mthca/mthca_uar.c b/drivers/infiniband/hw/mthca/mthca_uar.c
index 8b728486410d..ca5900c96fcf 100644
--- a/drivers/infiniband/hw/mthca/mthca_uar.c
+++ b/drivers/infiniband/hw/mthca/mthca_uar.c
@@ -28,8 +28,6 @@
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
- *
- * $Id$
*/
#include <asm/page.h> /* PAGE_SHIFT */
diff --git a/drivers/infiniband/hw/mthca/mthca_user.h b/drivers/infiniband/hw/mthca/mthca_user.h
index e1262c942db8..5fe56e810739 100644
--- a/drivers/infiniband/hw/mthca/mthca_user.h
+++ b/drivers/infiniband/hw/mthca/mthca_user.h
@@ -29,7 +29,6 @@
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
- *
*/
#ifndef MTHCA_USER_H
diff --git a/drivers/infiniband/hw/mthca/mthca_wqe.h b/drivers/infiniband/hw/mthca/mthca_wqe.h
index b3551a8dea1d..341a5ae881c1 100644
--- a/drivers/infiniband/hw/mthca/mthca_wqe.h
+++ b/drivers/infiniband/hw/mthca/mthca_wqe.h
@@ -28,8 +28,6 @@
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
- *
- * $Id: mthca_wqe.h 3047 2005-08-10 03:59:35Z roland $
*/
#ifndef MTHCA_WQE_H
diff --git a/drivers/infiniband/hw/nes/nes.c b/drivers/infiniband/hw/nes/nes.c
index a4e9269a29bd..d2884e778098 100644
--- a/drivers/infiniband/hw/nes/nes.c
+++ b/drivers/infiniband/hw/nes/nes.c
@@ -328,7 +328,7 @@ void nes_rem_ref(struct ib_qp *ibqp)
set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_ID_IDX, nesqp->hwqp.qp_id);
u64temp = (u64)nesqp->nesqp_context_pbase;
set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_QP_WQE_CONTEXT_LOW_IDX, u64temp);
- nes_post_cqp_request(nesdev, cqp_request, NES_CQP_REQUEST_RING_DOORBELL);
+ nes_post_cqp_request(nesdev, cqp_request);
}
}
diff --git a/drivers/infiniband/hw/nes/nes.h b/drivers/infiniband/hw/nes/nes.h
index 61b46e9c7d2d..39bd897b40c6 100644
--- a/drivers/infiniband/hw/nes/nes.h
+++ b/drivers/infiniband/hw/nes/nes.h
@@ -94,9 +94,6 @@
#define MAX_DPC_ITERATIONS 128
-#define NES_CQP_REQUEST_NO_DOORBELL_RING 0
-#define NES_CQP_REQUEST_RING_DOORBELL 1
-
#define NES_DRV_OPT_ENABLE_MPA_VER_0 0x00000001
#define NES_DRV_OPT_DISABLE_MPA_CRC 0x00000002
#define NES_DRV_OPT_DISABLE_FIRST_WRITE 0x00000004
@@ -538,7 +535,11 @@ void nes_read_1G_phy_reg(struct nes_device *, u8, u8, u16 *);
void nes_write_10G_phy_reg(struct nes_device *, u16, u8, u16, u16);
void nes_read_10G_phy_reg(struct nes_device *, u8, u8, u16);
struct nes_cqp_request *nes_get_cqp_request(struct nes_device *);
-void nes_post_cqp_request(struct nes_device *, struct nes_cqp_request *, int);
+void nes_free_cqp_request(struct nes_device *nesdev,
+ struct nes_cqp_request *cqp_request);
+void nes_put_cqp_request(struct nes_device *nesdev,
+ struct nes_cqp_request *cqp_request);
+void nes_post_cqp_request(struct nes_device *, struct nes_cqp_request *);
int nes_arp_table(struct nes_device *, u32, u8 *, u32);
void nes_mh_fix(unsigned long);
void nes_clc(unsigned long);
diff --git a/drivers/infiniband/hw/nes/nes_cm.c b/drivers/infiniband/hw/nes/nes_cm.c
index 9a4b40fae40d..6aa531d5276d 100644
--- a/drivers/infiniband/hw/nes/nes_cm.c
+++ b/drivers/infiniband/hw/nes/nes_cm.c
@@ -1603,7 +1603,6 @@ static struct nes_cm_listener *mini_cm_listen(struct nes_cm_core *cm_core,
return NULL;
}
- memset(listener, 0, sizeof(struct nes_cm_listener));
listener->loc_addr = htonl(cm_info->loc_addr);
listener->loc_port = htons(cm_info->loc_port);
listener->reused_node = 0;
diff --git a/drivers/infiniband/hw/nes/nes_hw.c b/drivers/infiniband/hw/nes/nes_hw.c
index d3278f111ca7..85f26d19a32b 100644
--- a/drivers/infiniband/hw/nes/nes_hw.c
+++ b/drivers/infiniband/hw/nes/nes_hw.c
@@ -398,7 +398,7 @@ struct nes_adapter *nes_init_adapter(struct nes_device *nesdev, u8 hw_rev) {
nesadapter->base_pd = 1;
nesadapter->device_cap_flags =
- IB_DEVICE_ZERO_STAG | IB_DEVICE_MEM_WINDOW;
+ IB_DEVICE_LOCAL_DMA_LKEY | IB_DEVICE_MEM_WINDOW;
nesadapter->allocated_qps = (unsigned long *)&(((unsigned char *)nesadapter)
[(sizeof(struct nes_adapter)+(sizeof(unsigned long)-1))&(~(sizeof(unsigned long)-1))]);
@@ -2710,39 +2710,11 @@ static void nes_cqp_ce_handler(struct nes_device *nesdev, struct nes_hw_cq *cq)
barrier();
cqp_request->request_done = 1;
wake_up(&cqp_request->waitq);
- if (atomic_dec_and_test(&cqp_request->refcount)) {
- nes_debug(NES_DBG_CQP, "CQP request %p (opcode 0x%02X) freed.\n",
- cqp_request,
- le32_to_cpu(cqp_request->cqp_wqe.wqe_words[NES_CQP_WQE_OPCODE_IDX])&0x3f);
- if (cqp_request->dynamic) {
- kfree(cqp_request);
- } else {
- spin_lock_irqsave(&nesdev->cqp.lock, flags);
- list_add_tail(&cqp_request->list, &nesdev->cqp_avail_reqs);
- spin_unlock_irqrestore(&nesdev->cqp.lock, flags);
- }
- }
- } else if (cqp_request->callback) {
- /* Envoke the callback routine */
- cqp_request->cqp_callback(nesdev, cqp_request);
- if (cqp_request->dynamic) {
- kfree(cqp_request);
- } else {
- spin_lock_irqsave(&nesdev->cqp.lock, flags);
- list_add_tail(&cqp_request->list, &nesdev->cqp_avail_reqs);
- spin_unlock_irqrestore(&nesdev->cqp.lock, flags);
- }
+ nes_put_cqp_request(nesdev, cqp_request);
} else {
- nes_debug(NES_DBG_CQP, "CQP request %p (opcode 0x%02X) freed.\n",
- cqp_request,
- le32_to_cpu(cqp_request->cqp_wqe.wqe_words[NES_CQP_WQE_OPCODE_IDX]) & 0x3f);
- if (cqp_request->dynamic) {
- kfree(cqp_request);
- } else {
- spin_lock_irqsave(&nesdev->cqp.lock, flags);
- list_add_tail(&cqp_request->list, &nesdev->cqp_avail_reqs);
- spin_unlock_irqrestore(&nesdev->cqp.lock, flags);
- }
+ if (cqp_request->callback)
+ cqp_request->cqp_callback(nesdev, cqp_request);
+ nes_free_cqp_request(nesdev, cqp_request);
}
} else {
wake_up(&nesdev->cqp.waitq);
@@ -3149,7 +3121,6 @@ int nes_manage_apbvt(struct nes_vnic *nesvnic, u32 accel_local_port,
{
struct nes_device *nesdev = nesvnic->nesdev;
struct nes_hw_cqp_wqe *cqp_wqe;
- unsigned long flags;
struct nes_cqp_request *cqp_request;
int ret = 0;
u16 major_code;
@@ -3176,7 +3147,7 @@ int nes_manage_apbvt(struct nes_vnic *nesvnic, u32 accel_local_port,
nes_debug(NES_DBG_QP, "Waiting for CQP completion for APBVT.\n");
atomic_set(&cqp_request->refcount, 2);
- nes_post_cqp_request(nesdev, cqp_request, NES_CQP_REQUEST_RING_DOORBELL);
+ nes_post_cqp_request(nesdev, cqp_request);
if (add_port == NES_MANAGE_APBVT_ADD)
ret = wait_event_timeout(cqp_request->waitq, (cqp_request->request_done != 0),
@@ -3184,15 +3155,9 @@ int nes_manage_apbvt(struct nes_vnic *nesvnic, u32 accel_local_port,
nes_debug(NES_DBG_QP, "Completed, ret=%u, CQP Major:Minor codes = 0x%04X:0x%04X\n",
ret, cqp_request->major_code, cqp_request->minor_code);
major_code = cqp_request->major_code;
- if (atomic_dec_and_test(&cqp_request->refcount)) {
- if (cqp_request->dynamic) {
- kfree(cqp_request);
- } else {
- spin_lock_irqsave(&nesdev->cqp.lock, flags);
- list_add_tail(&cqp_request->list, &nesdev->cqp_avail_reqs);
- spin_unlock_irqrestore(&nesdev->cqp.lock, flags);
- }
- }
+
+ nes_put_cqp_request(nesdev, cqp_request);
+
if (!ret)
return -ETIME;
else if (major_code)
@@ -3252,7 +3217,7 @@ void nes_manage_arp_cache(struct net_device *netdev, unsigned char *mac_addr,
nesdev->cqp.sq_head, nesdev->cqp.sq_tail);
atomic_set(&cqp_request->refcount, 1);
- nes_post_cqp_request(nesdev, cqp_request, NES_CQP_REQUEST_RING_DOORBELL);
+ nes_post_cqp_request(nesdev, cqp_request);
}
@@ -3262,7 +3227,6 @@ void nes_manage_arp_cache(struct net_device *netdev, unsigned char *mac_addr,
void flush_wqes(struct nes_device *nesdev, struct nes_qp *nesqp,
u32 which_wq, u32 wait_completion)
{
- unsigned long flags;
struct nes_cqp_request *cqp_request;
struct nes_hw_cqp_wqe *cqp_wqe;
int ret;
@@ -3285,7 +3249,7 @@ void flush_wqes(struct nes_device *nesdev, struct nes_qp *nesqp,
cpu_to_le32(NES_CQP_FLUSH_WQES | which_wq);
cqp_wqe->wqe_words[NES_CQP_WQE_ID_IDX] = cpu_to_le32(nesqp->hwqp.qp_id);
- nes_post_cqp_request(nesdev, cqp_request, NES_CQP_REQUEST_RING_DOORBELL);
+ nes_post_cqp_request(nesdev, cqp_request);
if (wait_completion) {
/* Wait for CQP */
@@ -3294,14 +3258,6 @@ void flush_wqes(struct nes_device *nesdev, struct nes_qp *nesqp,
nes_debug(NES_DBG_QP, "Flush SQ QP WQEs completed, ret=%u,"
" CQP Major:Minor codes = 0x%04X:0x%04X\n",
ret, cqp_request->major_code, cqp_request->minor_code);
- if (atomic_dec_and_test(&cqp_request->refcount)) {
- if (cqp_request->dynamic) {
- kfree(cqp_request);
- } else {
- spin_lock_irqsave(&nesdev->cqp.lock, flags);
- list_add_tail(&cqp_request->list, &nesdev->cqp_avail_reqs);
- spin_unlock_irqrestore(&nesdev->cqp.lock, flags);
- }
- }
+ nes_put_cqp_request(nesdev, cqp_request);
}
}
diff --git a/drivers/infiniband/hw/nes/nes_hw.h b/drivers/infiniband/hw/nes/nes_hw.h
index 745bf94f3f07..7b81e0ae0076 100644
--- a/drivers/infiniband/hw/nes/nes_hw.h
+++ b/drivers/infiniband/hw/nes/nes_hw.h
@@ -1172,7 +1172,7 @@ struct nes_vnic {
u32 mcrq_qp_id;
struct nes_ucontext *mcrq_ucontext;
struct nes_cqp_request* (*get_cqp_request)(struct nes_device *nesdev);
- void (*post_cqp_request)(struct nes_device*, struct nes_cqp_request *, int);
+ void (*post_cqp_request)(struct nes_device*, struct nes_cqp_request *);
int (*mcrq_mcast_filter)( struct nes_vnic* nesvnic, __u8* dmi_addr );
struct net_device_stats netstats;
/* used to put the netdev on the adapters logical port list */
diff --git a/drivers/infiniband/hw/nes/nes_utils.c b/drivers/infiniband/hw/nes/nes_utils.c
index fe83d1b2b177..fb8cbd71a2ef 100644
--- a/drivers/infiniband/hw/nes/nes_utils.c
+++ b/drivers/infiniband/hw/nes/nes_utils.c
@@ -567,12 +567,36 @@ struct nes_cqp_request *nes_get_cqp_request(struct nes_device *nesdev)
return cqp_request;
}
+void nes_free_cqp_request(struct nes_device *nesdev,
+ struct nes_cqp_request *cqp_request)
+{
+ unsigned long flags;
+
+ nes_debug(NES_DBG_CQP, "CQP request %p (opcode 0x%02X) freed.\n",
+ cqp_request,
+ le32_to_cpu(cqp_request->cqp_wqe.wqe_words[NES_CQP_WQE_OPCODE_IDX]) & 0x3f);
+
+ if (cqp_request->dynamic) {
+ kfree(cqp_request);
+ } else {
+ spin_lock_irqsave(&nesdev->cqp.lock, flags);
+ list_add_tail(&cqp_request->list, &nesdev->cqp_avail_reqs);
+ spin_unlock_irqrestore(&nesdev->cqp.lock, flags);
+ }
+}
+
+void nes_put_cqp_request(struct nes_device *nesdev,
+ struct nes_cqp_request *cqp_request)
+{
+ if (atomic_dec_and_test(&cqp_request->refcount))
+ nes_free_cqp_request(nesdev, cqp_request);
+}
/**
* nes_post_cqp_request
*/
void nes_post_cqp_request(struct nes_device *nesdev,
- struct nes_cqp_request *cqp_request, int ring_doorbell)
+ struct nes_cqp_request *cqp_request)
{
struct nes_hw_cqp_wqe *cqp_wqe;
unsigned long flags;
@@ -600,10 +624,9 @@ void nes_post_cqp_request(struct nes_device *nesdev,
nesdev->cqp.sq_head, nesdev->cqp.sq_tail, nesdev->cqp.sq_size,
cqp_request->waiting, atomic_read(&cqp_request->refcount));
barrier();
- if (ring_doorbell) {
- /* Ring doorbell (1 WQEs) */
- nes_write32(nesdev->regs+NES_WQE_ALLOC, 0x01800000 | nesdev->cqp.qp_id);
- }
+
+ /* Ring doorbell (1 WQEs) */
+ nes_write32(nesdev->regs+NES_WQE_ALLOC, 0x01800000 | nesdev->cqp.qp_id);
barrier();
} else {
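
The new nes_free_cqp_request()/nes_put_cqp_request() helpers above replace the refcount-drop-and-free sequence that every CQP caller used to open-code. A minimal sketch of the resulting caller pattern, assuming the request fields shown above, the driver's usual NES_EVENT_TIMEOUT wait, and a hypothetical build_wqe() that fills in the command:

        static int issue_cqp_op(struct nes_device *nesdev)
        {
                struct nes_cqp_request *cqp_request;
                int ret;

                cqp_request = nes_get_cqp_request(nesdev);
                if (!cqp_request)
                        return -ENOMEM;

                cqp_request->waiting = 1;
                build_wqe(&cqp_request->cqp_wqe);        /* hypothetical: fill the CQP WQE */

                atomic_set(&cqp_request->refcount, 2);   /* one ref for us, one for the handler */
                nes_post_cqp_request(nesdev, cqp_request);

                ret = wait_event_timeout(cqp_request->waitq,
                                         cqp_request->request_done != 0,
                                         NES_EVENT_TIMEOUT);

                nes_put_cqp_request(nesdev, cqp_request); /* drop the caller's reference */
                return ret ? 0 : -ETIME;
        }

The completion handler drops the other reference, so whichever side finishes last returns the request to nesdev->cqp_avail_reqs, or kfree()s it if it was allocated dynamically.
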
diff --git a/drivers/infiniband/hw/nes/nes_verbs.c b/drivers/infiniband/hw/nes/nes_verbs.c
index d617da9bd351..e3939d13484e 100644
--- a/drivers/infiniband/hw/nes/nes_verbs.c
+++ b/drivers/infiniband/hw/nes/nes_verbs.c
@@ -55,7 +55,6 @@ static void nes_unregister_ofa_device(struct nes_ib_device *nesibdev);
* nes_alloc_mw
*/
static struct ib_mw *nes_alloc_mw(struct ib_pd *ibpd) {
- unsigned long flags;
struct nes_pd *nespd = to_nespd(ibpd);
struct nes_vnic *nesvnic = to_nesvnic(ibpd->device);
struct nes_device *nesdev = nesvnic->nesdev;
@@ -119,7 +118,7 @@ static struct ib_mw *nes_alloc_mw(struct ib_pd *ibpd) {
set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_STAG_IDX, stag);
atomic_set(&cqp_request->refcount, 2);
- nes_post_cqp_request(nesdev, cqp_request, NES_CQP_REQUEST_RING_DOORBELL);
+ nes_post_cqp_request(nesdev, cqp_request);
/* Wait for CQP */
ret = wait_event_timeout(cqp_request->waitq, (cqp_request->request_done != 0),
@@ -128,15 +127,7 @@ static struct ib_mw *nes_alloc_mw(struct ib_pd *ibpd) {
" CQP Major:Minor codes = 0x%04X:0x%04X.\n",
stag, ret, cqp_request->major_code, cqp_request->minor_code);
if ((!ret) || (cqp_request->major_code)) {
- if (atomic_dec_and_test(&cqp_request->refcount)) {
- if (cqp_request->dynamic) {
- kfree(cqp_request);
- } else {
- spin_lock_irqsave(&nesdev->cqp.lock, flags);
- list_add_tail(&cqp_request->list, &nesdev->cqp_avail_reqs);
- spin_unlock_irqrestore(&nesdev->cqp.lock, flags);
- }
- }
+ nes_put_cqp_request(nesdev, cqp_request);
kfree(nesmr);
nes_free_resource(nesadapter, nesadapter->allocated_mrs, stag_index);
if (!ret) {
@@ -144,17 +135,8 @@ static struct ib_mw *nes_alloc_mw(struct ib_pd *ibpd) {
} else {
return ERR_PTR(-ENOMEM);
}
- } else {
- if (atomic_dec_and_test(&cqp_request->refcount)) {
- if (cqp_request->dynamic) {
- kfree(cqp_request);
- } else {
- spin_lock_irqsave(&nesdev->cqp.lock, flags);
- list_add_tail(&cqp_request->list, &nesdev->cqp_avail_reqs);
- spin_unlock_irqrestore(&nesdev->cqp.lock, flags);
- }
- }
}
+ nes_put_cqp_request(nesdev, cqp_request);
nesmr->ibmw.rkey = stag;
nesmr->mode = IWNES_MEMREG_TYPE_MW;
@@ -178,7 +160,6 @@ static int nes_dealloc_mw(struct ib_mw *ibmw)
struct nes_hw_cqp_wqe *cqp_wqe;
struct nes_cqp_request *cqp_request;
int err = 0;
- unsigned long flags;
int ret;
/* Deallocate the window with the adapter */
@@ -194,7 +175,7 @@ static int nes_dealloc_mw(struct ib_mw *ibmw)
set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_STAG_IDX, ibmw->rkey);
atomic_set(&cqp_request->refcount, 2);
- nes_post_cqp_request(nesdev, cqp_request, NES_CQP_REQUEST_RING_DOORBELL);
+ nes_post_cqp_request(nesdev, cqp_request);
/* Wait for CQP */
nes_debug(NES_DBG_MR, "Waiting for deallocate STag 0x%08X to complete.\n",
@@ -204,32 +185,12 @@ static int nes_dealloc_mw(struct ib_mw *ibmw)
nes_debug(NES_DBG_MR, "Deallocate STag completed, wait_event_timeout ret = %u,"
" CQP Major:Minor codes = 0x%04X:0x%04X.\n",
ret, cqp_request->major_code, cqp_request->minor_code);
- if ((!ret) || (cqp_request->major_code)) {
- if (atomic_dec_and_test(&cqp_request->refcount)) {
- if (cqp_request->dynamic) {
- kfree(cqp_request);
- } else {
- spin_lock_irqsave(&nesdev->cqp.lock, flags);
- list_add_tail(&cqp_request->list, &nesdev->cqp_avail_reqs);
- spin_unlock_irqrestore(&nesdev->cqp.lock, flags);
- }
- }
- if (!ret) {
- err = -ETIME;
- } else {
- err = -EIO;
- }
- } else {
- if (atomic_dec_and_test(&cqp_request->refcount)) {
- if (cqp_request->dynamic) {
- kfree(cqp_request);
- } else {
- spin_lock_irqsave(&nesdev->cqp.lock, flags);
- list_add_tail(&cqp_request->list, &nesdev->cqp_avail_reqs);
- spin_unlock_irqrestore(&nesdev->cqp.lock, flags);
- }
- }
- }
+ if (!ret)
+ err = -ETIME;
+ else if (cqp_request->major_code)
+ err = -EIO;
+
+ nes_put_cqp_request(nesdev, cqp_request);
nes_free_resource(nesadapter, nesadapter->allocated_mrs,
(ibmw->rkey & 0x0fffff00) >> 8);
@@ -516,7 +477,7 @@ static struct ib_fmr *nes_alloc_fmr(struct ib_pd *ibpd,
(nesfmr->nesmr.pbls_used-1) : nesfmr->nesmr.pbls_used);
atomic_set(&cqp_request->refcount, 2);
- nes_post_cqp_request(nesdev, cqp_request, NES_CQP_REQUEST_RING_DOORBELL);
+ nes_post_cqp_request(nesdev, cqp_request);
/* Wait for CQP */
ret = wait_event_timeout(cqp_request->waitq, (cqp_request->request_done != 0),
@@ -526,29 +487,11 @@ static struct ib_fmr *nes_alloc_fmr(struct ib_pd *ibpd,
stag, ret, cqp_request->major_code, cqp_request->minor_code);
if ((!ret) || (cqp_request->major_code)) {
- if (atomic_dec_and_test(&cqp_request->refcount)) {
- if (cqp_request->dynamic) {
- kfree(cqp_request);
- } else {
- spin_lock_irqsave(&nesdev->cqp.lock, flags);
- list_add_tail(&cqp_request->list, &nesdev->cqp_avail_reqs);
- spin_unlock_irqrestore(&nesdev->cqp.lock, flags);
- }
- }
+ nes_put_cqp_request(nesdev, cqp_request);
ret = (!ret) ? -ETIME : -EIO;
goto failed_leaf_vpbl_pages_alloc;
- } else {
- if (atomic_dec_and_test(&cqp_request->refcount)) {
- if (cqp_request->dynamic) {
- kfree(cqp_request);
- } else {
- spin_lock_irqsave(&nesdev->cqp.lock, flags);
- list_add_tail(&cqp_request->list, &nesdev->cqp_avail_reqs);
- spin_unlock_irqrestore(&nesdev->cqp.lock, flags);
- }
- }
}
-
+ nes_put_cqp_request(nesdev, cqp_request);
nesfmr->nesmr.ibfmr.lkey = stag;
nesfmr->nesmr.ibfmr.rkey = stag;
nesfmr->attr = *ibfmr_attr;
@@ -1474,7 +1417,7 @@ static struct ib_qp *nes_create_qp(struct ib_pd *ibpd,
set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_QP_WQE_CONTEXT_LOW_IDX, u64temp);
atomic_set(&cqp_request->refcount, 2);
- nes_post_cqp_request(nesdev, cqp_request, NES_CQP_REQUEST_RING_DOORBELL);
+ nes_post_cqp_request(nesdev, cqp_request);
/* Wait for CQP */
nes_debug(NES_DBG_QP, "Waiting for create iWARP QP%u to complete.\n",
@@ -1487,15 +1430,7 @@ static struct ib_qp *nes_create_qp(struct ib_pd *ibpd,
nesqp->hwqp.qp_id, ret, nesdev->cqp.sq_head, nesdev->cqp.sq_tail,
cqp_request->major_code, cqp_request->minor_code);
if ((!ret) || (cqp_request->major_code)) {
- if (atomic_dec_and_test(&cqp_request->refcount)) {
- if (cqp_request->dynamic) {
- kfree(cqp_request);
- } else {
- spin_lock_irqsave(&nesdev->cqp.lock, flags);
- list_add_tail(&cqp_request->list, &nesdev->cqp_avail_reqs);
- spin_unlock_irqrestore(&nesdev->cqp.lock, flags);
- }
- }
+ nes_put_cqp_request(nesdev, cqp_request);
nes_free_resource(nesadapter, nesadapter->allocated_qps, qp_num);
nes_free_qp_mem(nesdev, nesqp,virt_wqs);
kfree(nesqp->allocated_buffer);
@@ -1504,18 +1439,10 @@ static struct ib_qp *nes_create_qp(struct ib_pd *ibpd,
} else {
return ERR_PTR(-EIO);
}
- } else {
- if (atomic_dec_and_test(&cqp_request->refcount)) {
- if (cqp_request->dynamic) {
- kfree(cqp_request);
- } else {
- spin_lock_irqsave(&nesdev->cqp.lock, flags);
- list_add_tail(&cqp_request->list, &nesdev->cqp_avail_reqs);
- spin_unlock_irqrestore(&nesdev->cqp.lock, flags);
- }
- }
}
+ nes_put_cqp_request(nesdev, cqp_request);
+
if (ibpd->uobject) {
uresp.mmap_sq_db_index = nesqp->mmap_sq_db_index;
uresp.actual_sq_size = sq_size;
@@ -1817,7 +1744,7 @@ static struct ib_cq *nes_create_cq(struct ib_device *ibdev, int entries,
cpu_to_le32(((u32)((u64temp) >> 33)) & 0x7FFFFFFF);
atomic_set(&cqp_request->refcount, 2);
- nes_post_cqp_request(nesdev, cqp_request, NES_CQP_REQUEST_RING_DOORBELL);
+ nes_post_cqp_request(nesdev, cqp_request);
/* Wait for CQP */
nes_debug(NES_DBG_CQ, "Waiting for create iWARP CQ%u to complete.\n",
@@ -1827,32 +1754,15 @@ static struct ib_cq *nes_create_cq(struct ib_device *ibdev, int entries,
nes_debug(NES_DBG_CQ, "Create iWARP CQ%u completed, wait_event_timeout ret = %d.\n",
nescq->hw_cq.cq_number, ret);
if ((!ret) || (cqp_request->major_code)) {
- if (atomic_dec_and_test(&cqp_request->refcount)) {
- if (cqp_request->dynamic) {
- kfree(cqp_request);
- } else {
- spin_lock_irqsave(&nesdev->cqp.lock, flags);
- list_add_tail(&cqp_request->list, &nesdev->cqp_avail_reqs);
- spin_unlock_irqrestore(&nesdev->cqp.lock, flags);
- }
- }
+ nes_put_cqp_request(nesdev, cqp_request);
if (!context)
pci_free_consistent(nesdev->pcidev, nescq->cq_mem_size, mem,
nescq->hw_cq.cq_pbase);
nes_free_resource(nesadapter, nesadapter->allocated_cqs, cq_num);
kfree(nescq);
return ERR_PTR(-EIO);
- } else {
- if (atomic_dec_and_test(&cqp_request->refcount)) {
- if (cqp_request->dynamic) {
- kfree(cqp_request);
- } else {
- spin_lock_irqsave(&nesdev->cqp.lock, flags);
- list_add_tail(&cqp_request->list, &nesdev->cqp_avail_reqs);
- spin_unlock_irqrestore(&nesdev->cqp.lock, flags);
- }
- }
}
+ nes_put_cqp_request(nesdev, cqp_request);
if (context) {
/* free the nespbl */
@@ -1931,7 +1841,7 @@ static int nes_destroy_cq(struct ib_cq *ib_cq)
(nescq->hw_cq.cq_number | ((u32)PCI_FUNC(nesdev->pcidev->devfn) << 16)));
nes_free_resource(nesadapter, nesadapter->allocated_cqs, nescq->hw_cq.cq_number);
atomic_set(&cqp_request->refcount, 2);
- nes_post_cqp_request(nesdev, cqp_request, NES_CQP_REQUEST_RING_DOORBELL);
+ nes_post_cqp_request(nesdev, cqp_request);
/* Wait for CQP */
nes_debug(NES_DBG_CQ, "Waiting for destroy iWARP CQ%u to complete.\n",
@@ -1942,37 +1852,18 @@ static int nes_destroy_cq(struct ib_cq *ib_cq)
" CQP Major:Minor codes = 0x%04X:0x%04X.\n",
nescq->hw_cq.cq_number, ret, cqp_request->major_code,
cqp_request->minor_code);
- if ((!ret) || (cqp_request->major_code)) {
- if (atomic_dec_and_test(&cqp_request->refcount)) {
- if (cqp_request->dynamic) {
- kfree(cqp_request);
- } else {
- spin_lock_irqsave(&nesdev->cqp.lock, flags);
- list_add_tail(&cqp_request->list, &nesdev->cqp_avail_reqs);
- spin_unlock_irqrestore(&nesdev->cqp.lock, flags);
- }
- }
- if (!ret) {
- nes_debug(NES_DBG_CQ, "iWARP CQ%u destroy timeout expired\n",
+ if (!ret) {
+ nes_debug(NES_DBG_CQ, "iWARP CQ%u destroy timeout expired\n",
nescq->hw_cq.cq_number);
- ret = -ETIME;
- } else {
- nes_debug(NES_DBG_CQ, "iWARP CQ%u destroy failed\n",
+ ret = -ETIME;
+ } else if (cqp_request->major_code) {
+ nes_debug(NES_DBG_CQ, "iWARP CQ%u destroy failed\n",
nescq->hw_cq.cq_number);
- ret = -EIO;
- }
+ ret = -EIO;
} else {
ret = 0;
- if (atomic_dec_and_test(&cqp_request->refcount)) {
- if (cqp_request->dynamic) {
- kfree(cqp_request);
- } else {
- spin_lock_irqsave(&nesdev->cqp.lock, flags);
- list_add_tail(&cqp_request->list, &nesdev->cqp_avail_reqs);
- spin_unlock_irqrestore(&nesdev->cqp.lock, flags);
- }
- }
}
+ nes_put_cqp_request(nesdev, cqp_request);
if (nescq->cq_mem_size)
pci_free_consistent(nesdev->pcidev, nescq->cq_mem_size,
@@ -2096,7 +1987,7 @@ static int nes_reg_mr(struct nes_device *nesdev, struct nes_pd *nespd,
barrier();
atomic_set(&cqp_request->refcount, 2);
- nes_post_cqp_request(nesdev, cqp_request, NES_CQP_REQUEST_RING_DOORBELL);
+ nes_post_cqp_request(nesdev, cqp_request);
/* Wait for CQP */
ret = wait_event_timeout(cqp_request->waitq, (0 != cqp_request->request_done),
@@ -2105,15 +1996,8 @@ static int nes_reg_mr(struct nes_device *nesdev, struct nes_pd *nespd,
" CQP Major:Minor codes = 0x%04X:0x%04X.\n",
stag, ret, cqp_request->major_code, cqp_request->minor_code);
major_code = cqp_request->major_code;
- if (atomic_dec_and_test(&cqp_request->refcount)) {
- if (cqp_request->dynamic) {
- kfree(cqp_request);
- } else {
- spin_lock_irqsave(&nesdev->cqp.lock, flags);
- list_add_tail(&cqp_request->list, &nesdev->cqp_avail_reqs);
- spin_unlock_irqrestore(&nesdev->cqp.lock, flags);
- }
- }
+ nes_put_cqp_request(nesdev, cqp_request);
+
if (!ret)
return -ETIME;
else if (major_code)
@@ -2754,7 +2638,7 @@ static int nes_dereg_mr(struct ib_mr *ib_mr)
set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_STAG_IDX, ib_mr->rkey);
atomic_set(&cqp_request->refcount, 2);
- nes_post_cqp_request(nesdev, cqp_request, NES_CQP_REQUEST_RING_DOORBELL);
+ nes_post_cqp_request(nesdev, cqp_request);
/* Wait for CQP */
nes_debug(NES_DBG_MR, "Waiting for deallocate STag 0x%08X completed\n", ib_mr->rkey);
@@ -2771,15 +2655,9 @@ static int nes_dereg_mr(struct ib_mr *ib_mr)
major_code = cqp_request->major_code;
minor_code = cqp_request->minor_code;
- if (atomic_dec_and_test(&cqp_request->refcount)) {
- if (cqp_request->dynamic) {
- kfree(cqp_request);
- } else {
- spin_lock_irqsave(&nesdev->cqp.lock, flags);
- list_add_tail(&cqp_request->list, &nesdev->cqp_avail_reqs);
- spin_unlock_irqrestore(&nesdev->cqp.lock, flags);
- }
- }
+
+ nes_put_cqp_request(nesdev, cqp_request);
+
if (!ret) {
nes_debug(NES_DBG_MR, "Timeout waiting to destroy STag,"
" ib_mr=%p, rkey = 0x%08X\n",
@@ -2904,7 +2782,6 @@ int nes_hw_modify_qp(struct nes_device *nesdev, struct nes_qp *nesqp,
/* struct iw_cm_id *cm_id = nesqp->cm_id; */
/* struct iw_cm_event cm_event; */
struct nes_cqp_request *cqp_request;
- unsigned long flags;
int ret;
u16 major_code;
@@ -2932,7 +2809,7 @@ int nes_hw_modify_qp(struct nes_device *nesdev, struct nes_qp *nesqp,
set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_QP_WQE_CONTEXT_LOW_IDX, (u64)nesqp->nesqp_context_pbase);
atomic_set(&cqp_request->refcount, 2);
- nes_post_cqp_request(nesdev, cqp_request, NES_CQP_REQUEST_RING_DOORBELL);
+ nes_post_cqp_request(nesdev, cqp_request);
/* Wait for CQP */
if (wait_completion) {
@@ -2950,15 +2827,9 @@ int nes_hw_modify_qp(struct nes_device *nesdev, struct nes_qp *nesqp,
nesqp->hwqp.qp_id, cqp_request->major_code,
cqp_request->minor_code, next_iwarp_state);
}
- if (atomic_dec_and_test(&cqp_request->refcount)) {
- if (cqp_request->dynamic) {
- kfree(cqp_request);
- } else {
- spin_lock_irqsave(&nesdev->cqp.lock, flags);
- list_add_tail(&cqp_request->list, &nesdev->cqp_avail_reqs);
- spin_unlock_irqrestore(&nesdev->cqp.lock, flags);
- }
- }
+
+ nes_put_cqp_request(nesdev, cqp_request);
+
if (!ret)
return -ETIME;
else if (major_code)
diff --git a/drivers/infiniband/ulp/ipoib/Kconfig b/drivers/infiniband/ulp/ipoib/Kconfig
index 1f76bad020f3..691525cf394a 100644
--- a/drivers/infiniband/ulp/ipoib/Kconfig
+++ b/drivers/infiniband/ulp/ipoib/Kconfig
@@ -1,6 +1,7 @@
config INFINIBAND_IPOIB
tristate "IP-over-InfiniBand"
depends on NETDEVICES && INET && (IPV6 || IPV6=n)
+ select INET_LRO
---help---
Support for the IP-over-InfiniBand protocol (IPoIB). This
transports IP packets over InfiniBand so you can use your IB
diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h
index ca126fc2b853..b0ffc9abe8c0 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib.h
+++ b/drivers/infiniband/ulp/ipoib/ipoib.h
@@ -30,8 +30,6 @@
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
- *
- * $Id: ipoib.h 1358 2004-12-17 22:00:11Z roland $
*/
#ifndef _IPOIB_H
@@ -52,9 +50,16 @@
#include <rdma/ib_verbs.h>
#include <rdma/ib_pack.h>
#include <rdma/ib_sa.h>
+#include <linux/inet_lro.h>
/* constants */
+enum ipoib_flush_level {
+ IPOIB_FLUSH_LIGHT,
+ IPOIB_FLUSH_NORMAL,
+ IPOIB_FLUSH_HEAVY
+};
+
enum {
IPOIB_ENCAP_LEN = 4,
@@ -65,8 +70,8 @@ enum {
IPOIB_CM_BUF_SIZE = IPOIB_CM_MTU + IPOIB_ENCAP_LEN,
IPOIB_CM_HEAD_SIZE = IPOIB_CM_BUF_SIZE % PAGE_SIZE,
IPOIB_CM_RX_SG = ALIGN(IPOIB_CM_BUF_SIZE, PAGE_SIZE) / PAGE_SIZE,
- IPOIB_RX_RING_SIZE = 128,
- IPOIB_TX_RING_SIZE = 64,
+ IPOIB_RX_RING_SIZE = 256,
+ IPOIB_TX_RING_SIZE = 128,
IPOIB_MAX_QUEUE_SIZE = 8192,
IPOIB_MIN_QUEUE_SIZE = 2,
IPOIB_CM_MAX_CONN_QP = 4096,
@@ -84,7 +89,6 @@ enum {
IPOIB_FLAG_SUBINTERFACE = 5,
IPOIB_MCAST_RUN = 6,
IPOIB_STOP_REAPER = 7,
- IPOIB_MCAST_STARTED = 8,
IPOIB_FLAG_ADMIN_CM = 9,
IPOIB_FLAG_UMCAST = 10,
IPOIB_FLAG_CSUM = 11,
@@ -96,7 +100,11 @@ enum {
IPOIB_MCAST_FLAG_BUSY = 2, /* joining or already joined */
IPOIB_MCAST_FLAG_ATTACHED = 3,
+ IPOIB_MAX_LRO_DESCRIPTORS = 8,
+ IPOIB_LRO_MAX_AGGR = 64,
+
MAX_SEND_CQE = 16,
+ IPOIB_CM_COPYBREAK = 256,
};
#define IPOIB_OP_RECV (1ul << 31)
@@ -149,6 +157,11 @@ struct ipoib_tx_buf {
u64 mapping[MAX_SKB_FRAGS + 1];
};
+struct ipoib_cm_tx_buf {
+ struct sk_buff *skb;
+ u64 mapping;
+};
+
struct ib_cm_id;
struct ipoib_cm_data {
@@ -207,7 +220,7 @@ struct ipoib_cm_tx {
struct net_device *dev;
struct ipoib_neigh *neigh;
struct ipoib_path *path;
- struct ipoib_tx_buf *tx_ring;
+ struct ipoib_cm_tx_buf *tx_ring;
unsigned tx_head;
unsigned tx_tail;
unsigned long flags;
@@ -249,6 +262,11 @@ struct ipoib_ethtool_st {
u16 max_coalesced_frames;
};
+struct ipoib_lro {
+ struct net_lro_mgr lro_mgr;
+ struct net_lro_desc lro_desc[IPOIB_MAX_LRO_DESCRIPTORS];
+};
+
/*
* Device private locking: tx_lock protects members used in TX fast
* path (and we use LLTX so upper layers don't do extra locking).
@@ -264,7 +282,6 @@ struct ipoib_dev_priv {
unsigned long flags;
- struct mutex mcast_mutex;
struct mutex vlan_mutex;
struct rb_root path_tree;
@@ -276,10 +293,11 @@ struct ipoib_dev_priv {
struct delayed_work pkey_poll_task;
struct delayed_work mcast_task;
- struct work_struct flush_task;
+ struct work_struct flush_light;
+ struct work_struct flush_normal;
+ struct work_struct flush_heavy;
struct work_struct restart_task;
struct delayed_work ah_reap_task;
- struct work_struct pkey_event_task;
struct ib_device *ca;
u8 port;
@@ -335,6 +353,8 @@ struct ipoib_dev_priv {
int hca_caps;
struct ipoib_ethtool_st ethtool;
struct timer_list poll_timer;
+
+ struct ipoib_lro lro;
};
struct ipoib_ah {
@@ -359,6 +379,7 @@ struct ipoib_path {
struct rb_node rb_node;
struct list_head list;
+ int valid;
};
struct ipoib_neigh {
@@ -423,11 +444,14 @@ void ipoib_send(struct net_device *dev, struct sk_buff *skb,
struct ipoib_ah *address, u32 qpn);
void ipoib_reap_ah(struct work_struct *work);
+void ipoib_mark_paths_invalid(struct net_device *dev);
void ipoib_flush_paths(struct net_device *dev);
struct ipoib_dev_priv *ipoib_intf_alloc(const char *format);
int ipoib_ib_dev_init(struct net_device *dev, struct ib_device *ca, int port);
-void ipoib_ib_dev_flush(struct work_struct *work);
+void ipoib_ib_dev_flush_light(struct work_struct *work);
+void ipoib_ib_dev_flush_normal(struct work_struct *work);
+void ipoib_ib_dev_flush_heavy(struct work_struct *work);
void ipoib_pkey_event(struct work_struct *work);
void ipoib_ib_dev_cleanup(struct net_device *dev);
@@ -466,9 +490,7 @@ void ipoib_path_iter_read(struct ipoib_path_iter *iter,
#endif
int ipoib_mcast_attach(struct net_device *dev, u16 mlid,
- union ib_gid *mgid);
-int ipoib_mcast_detach(struct net_device *dev, u16 mlid,
- union ib_gid *mgid);
+ union ib_gid *mgid, int set_qkey);
int ipoib_init_qp(struct net_device *dev);
int ipoib_transport_dev_init(struct net_device *dev, struct ib_device *ca);
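
Also visible in the header changes above: the default UD ring sizes double (RX 128 to 256, TX 64 to 128) while remaining tunable through the existing recv_queue_size/send_queue_size module parameters, and the connected-mode TX ring moves to a leaner per-WQE structure, since CM sends carry a single DMA mapping rather than a gather list.
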
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
index 97e67d36378f..0f2d3045061a 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_cm.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
@@ -28,8 +28,6 @@
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
- *
- * $Id$
*/
#include <rdma/ib_cm.h>
@@ -113,18 +111,20 @@ static int ipoib_cm_post_receive_srq(struct net_device *dev, int id)
}
static int ipoib_cm_post_receive_nonsrq(struct net_device *dev,
- struct ipoib_cm_rx *rx, int id)
+ struct ipoib_cm_rx *rx,
+ struct ib_recv_wr *wr,
+ struct ib_sge *sge, int id)
{
struct ipoib_dev_priv *priv = netdev_priv(dev);
struct ib_recv_wr *bad_wr;
int i, ret;
- priv->cm.rx_wr.wr_id = id | IPOIB_OP_CM | IPOIB_OP_RECV;
+ wr->wr_id = id | IPOIB_OP_CM | IPOIB_OP_RECV;
for (i = 0; i < IPOIB_CM_RX_SG; ++i)
- priv->cm.rx_sge[i].addr = rx->rx_ring[id].mapping[i];
+ sge[i].addr = rx->rx_ring[id].mapping[i];
- ret = ib_post_recv(rx->qp, &priv->cm.rx_wr, &bad_wr);
+ ret = ib_post_recv(rx->qp, wr, &bad_wr);
if (unlikely(ret)) {
ipoib_warn(priv, "post recv failed for buf %d (%d)\n", id, ret);
ipoib_cm_dma_unmap_rx(priv, IPOIB_CM_RX_SG - 1,
@@ -322,10 +322,33 @@ static int ipoib_cm_modify_rx_qp(struct net_device *dev,
return 0;
}
+static void ipoib_cm_init_rx_wr(struct net_device *dev,
+ struct ib_recv_wr *wr,
+ struct ib_sge *sge)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+ int i;
+
+ for (i = 0; i < priv->cm.num_frags; ++i)
+ sge[i].lkey = priv->mr->lkey;
+
+ sge[0].length = IPOIB_CM_HEAD_SIZE;
+ for (i = 1; i < priv->cm.num_frags; ++i)
+ sge[i].length = PAGE_SIZE;
+
+ wr->next = NULL;
+ wr->sg_list = priv->cm.rx_sge;
+ wr->num_sge = priv->cm.num_frags;
+}
+
static int ipoib_cm_nonsrq_init_rx(struct net_device *dev, struct ib_cm_id *cm_id,
struct ipoib_cm_rx *rx)
{
struct ipoib_dev_priv *priv = netdev_priv(dev);
+ struct {
+ struct ib_recv_wr wr;
+ struct ib_sge sge[IPOIB_CM_RX_SG];
+ } *t;
int ret;
int i;
@@ -333,6 +356,14 @@ static int ipoib_cm_nonsrq_init_rx(struct net_device *dev, struct ib_cm_id *cm_i
if (!rx->rx_ring)
return -ENOMEM;
+ t = kmalloc(sizeof *t, GFP_KERNEL);
+ if (!t) {
+ ret = -ENOMEM;
+ goto err_free;
+ }
+
+ ipoib_cm_init_rx_wr(dev, &t->wr, t->sge);
+
spin_lock_irq(&priv->lock);
if (priv->cm.nonsrq_conn_qp >= ipoib_max_conn_qp) {
@@ -351,8 +382,8 @@ static int ipoib_cm_nonsrq_init_rx(struct net_device *dev, struct ib_cm_id *cm_i
ipoib_warn(priv, "failed to allocate receive buffer %d\n", i);
ret = -ENOMEM;
goto err_count;
- }
- ret = ipoib_cm_post_receive_nonsrq(dev, rx, i);
+ }
+ ret = ipoib_cm_post_receive_nonsrq(dev, rx, &t->wr, t->sge, i);
if (ret) {
ipoib_warn(priv, "ipoib_cm_post_receive_nonsrq "
"failed for buf %d\n", i);
@@ -363,6 +394,8 @@ static int ipoib_cm_nonsrq_init_rx(struct net_device *dev, struct ib_cm_id *cm_i
rx->recv_count = ipoib_recvq_size;
+ kfree(t);
+
return 0;
err_count:
@@ -371,6 +404,7 @@ err_count:
spin_unlock_irq(&priv->lock);
err_free:
+ kfree(t);
ipoib_cm_free_rx_ring(dev, rx->rx_ring);
return ret;
@@ -525,6 +559,7 @@ void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc)
u64 mapping[IPOIB_CM_RX_SG];
int frags;
int has_srq;
+ struct sk_buff *small_skb;
ipoib_dbg_data(priv, "cm recv completion: id %d, status: %d\n",
wr_id, wc->status);
@@ -579,6 +614,23 @@ void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc)
}
}
+ if (wc->byte_len < IPOIB_CM_COPYBREAK) {
+ int dlen = wc->byte_len;
+
+ small_skb = dev_alloc_skb(dlen + 12);
+ if (small_skb) {
+ skb_reserve(small_skb, 12);
+ ib_dma_sync_single_for_cpu(priv->ca, rx_ring[wr_id].mapping[0],
+ dlen, DMA_FROM_DEVICE);
+ skb_copy_from_linear_data(skb, small_skb->data, dlen);
+ ib_dma_sync_single_for_device(priv->ca, rx_ring[wr_id].mapping[0],
+ dlen, DMA_FROM_DEVICE);
+ skb_put(small_skb, dlen);
+ skb = small_skb;
+ goto copied;
+ }
+ }
+
frags = PAGE_ALIGN(wc->byte_len - min(wc->byte_len,
(unsigned)IPOIB_CM_HEAD_SIZE)) / PAGE_SIZE;
@@ -601,6 +653,7 @@ void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc)
skb_put_frags(skb, IPOIB_CM_HEAD_SIZE, wc->byte_len, newskb);
+copied:
skb->protocol = ((struct ipoib_header *) skb->data)->proto;
skb_reset_mac_header(skb);
skb_pull(skb, IPOIB_ENCAP_LEN);
@@ -620,7 +673,10 @@ repost:
ipoib_warn(priv, "ipoib_cm_post_receive_srq failed "
"for buf %d\n", wr_id);
} else {
- if (unlikely(ipoib_cm_post_receive_nonsrq(dev, p, wr_id))) {
+ if (unlikely(ipoib_cm_post_receive_nonsrq(dev, p,
+ &priv->cm.rx_wr,
+ priv->cm.rx_sge,
+ wr_id))) {
--p->recv_count;
ipoib_warn(priv, "ipoib_cm_post_receive_nonsrq failed "
"for buf %d\n", wr_id);
@@ -647,7 +703,7 @@ static inline int post_send(struct ipoib_dev_priv *priv,
void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_cm_tx *tx)
{
struct ipoib_dev_priv *priv = netdev_priv(dev);
- struct ipoib_tx_buf *tx_req;
+ struct ipoib_cm_tx_buf *tx_req;
u64 addr;
if (unlikely(skb->len > tx->mtu)) {
@@ -678,7 +734,7 @@ void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_cm_
return;
}
- tx_req->mapping[0] = addr;
+ tx_req->mapping = addr;
if (unlikely(post_send(priv, tx, tx->tx_head & (ipoib_sendq_size - 1),
addr, skb->len))) {
@@ -703,7 +759,7 @@ void ipoib_cm_handle_tx_wc(struct net_device *dev, struct ib_wc *wc)
struct ipoib_dev_priv *priv = netdev_priv(dev);
struct ipoib_cm_tx *tx = wc->qp->qp_context;
unsigned int wr_id = wc->wr_id & ~IPOIB_OP_CM;
- struct ipoib_tx_buf *tx_req;
+ struct ipoib_cm_tx_buf *tx_req;
unsigned long flags;
ipoib_dbg_data(priv, "cm send completion: id %d, status: %d\n",
@@ -717,7 +773,7 @@ void ipoib_cm_handle_tx_wc(struct net_device *dev, struct ib_wc *wc)
tx_req = &tx->tx_ring[wr_id];
- ib_dma_unmap_single(priv->ca, tx_req->mapping[0], tx_req->skb->len, DMA_TO_DEVICE);
+ ib_dma_unmap_single(priv->ca, tx_req->mapping, tx_req->skb->len, DMA_TO_DEVICE);
/* FIXME: is this right? Shouldn't we only increment on success? */
++dev->stats.tx_packets;
@@ -1087,7 +1143,7 @@ err_tx:
static void ipoib_cm_tx_destroy(struct ipoib_cm_tx *p)
{
struct ipoib_dev_priv *priv = netdev_priv(p->dev);
- struct ipoib_tx_buf *tx_req;
+ struct ipoib_cm_tx_buf *tx_req;
unsigned long flags;
unsigned long begin;
@@ -1115,7 +1171,7 @@ timeout:
while ((int) p->tx_tail - (int) p->tx_head < 0) {
tx_req = &p->tx_ring[p->tx_tail & (ipoib_sendq_size - 1)];
- ib_dma_unmap_single(priv->ca, tx_req->mapping[0], tx_req->skb->len,
+ ib_dma_unmap_single(priv->ca, tx_req->mapping, tx_req->skb->len,
DMA_TO_DEVICE);
dev_kfree_skb_any(tx_req->skb);
++p->tx_tail;
@@ -1384,7 +1440,9 @@ static ssize_t set_mode(struct device *d, struct device_attribute *attr,
ipoib_warn(priv, "enabling connected mode "
"will cause multicast packet drops\n");
+ rtnl_lock();
dev->features &= ~(NETIF_F_IP_CSUM | NETIF_F_SG | NETIF_F_TSO);
+ rtnl_unlock();
priv->tx_wr.send_flags &= ~IB_SEND_IP_CSUM;
ipoib_flush_paths(dev);
@@ -1393,14 +1451,16 @@ static ssize_t set_mode(struct device *d, struct device_attribute *attr,
if (!strcmp(buf, "datagram\n")) {
clear_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags);
- dev->mtu = min(priv->mcast_mtu, dev->mtu);
- ipoib_flush_paths(dev);
+ rtnl_lock();
if (test_bit(IPOIB_FLAG_CSUM, &priv->flags)) {
dev->features |= NETIF_F_IP_CSUM | NETIF_F_SG;
if (priv->hca_caps & IB_DEVICE_UD_TSO)
dev->features |= NETIF_F_TSO;
}
+ dev_set_mtu(dev, min(priv->mcast_mtu, dev->mtu));
+ rtnl_unlock();
+ ipoib_flush_paths(dev);
return count;
}
@@ -1485,15 +1545,7 @@ int ipoib_cm_dev_init(struct net_device *dev)
priv->cm.num_frags = IPOIB_CM_RX_SG;
}
- for (i = 0; i < priv->cm.num_frags; ++i)
- priv->cm.rx_sge[i].lkey = priv->mr->lkey;
-
- priv->cm.rx_sge[0].length = IPOIB_CM_HEAD_SIZE;
- for (i = 1; i < priv->cm.num_frags; ++i)
- priv->cm.rx_sge[i].length = PAGE_SIZE;
- priv->cm.rx_wr.next = NULL;
- priv->cm.rx_wr.sg_list = priv->cm.rx_sge;
- priv->cm.rx_wr.num_sge = priv->cm.num_frags;
+ ipoib_cm_init_rx_wr(dev, &priv->cm.rx_wr, priv->cm.rx_sge);
if (ipoib_cm_has_srq(dev)) {
for (i = 0; i < ipoib_recvq_size; ++i) {
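
The IPOIB_CM_COPYBREAK path above copies small receives into a right-sized skb so the large multi-page CM buffer can be reposted immediately. A stripped-down sketch of the same copybreak idea, with illustrative names and the DMA sync calls omitted:

        #define COPYBREAK 256   /* mirrors IPOIB_CM_COPYBREAK */

        static struct sk_buff *maybe_copybreak(struct sk_buff *rx_skb, unsigned int len)
        {
                struct sk_buff *small;

                if (len >= COPYBREAK)
                        return rx_skb;                  /* hand the original buffer up */

                small = dev_alloc_skb(len + NET_IP_ALIGN);
                if (!small)
                        return rx_skb;                  /* fall back on the big buffer */

                skb_reserve(small, NET_IP_ALIGN);
                skb_copy_from_linear_data(rx_skb, small->data, len);
                skb_put(small, len);
                return small;                           /* rx_skb stays mapped, repost it */
        }

In the driver itself the copy is bracketed by ib_dma_sync_single_for_cpu()/_for_device() so the receive buffer stays DMA-mapped, and the copy only ever touches the linear head, hence the BUILD_BUG_ON against IPOIB_CM_HEAD_SIZE added in ipoib_main.c below.
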
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c b/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c
index 10279b79c44d..66af5c1a76e5 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c
@@ -86,11 +86,57 @@ static int ipoib_set_coalesce(struct net_device *dev,
return 0;
}
+static const char ipoib_stats_keys[][ETH_GSTRING_LEN] = {
+ "LRO aggregated", "LRO flushed",
+ "LRO avg aggr", "LRO no desc"
+};
+
+static void ipoib_get_strings(struct net_device *netdev, u32 stringset, u8 *data)
+{
+ switch (stringset) {
+ case ETH_SS_STATS:
+ memcpy(data, *ipoib_stats_keys, sizeof(ipoib_stats_keys));
+ break;
+ }
+}
+
+static int ipoib_get_sset_count(struct net_device *dev, int sset)
+{
+ switch (sset) {
+ case ETH_SS_STATS:
+ return ARRAY_SIZE(ipoib_stats_keys);
+ default:
+ return -EOPNOTSUPP;
+ }
+}
+
+static void ipoib_get_ethtool_stats(struct net_device *dev,
+ struct ethtool_stats *stats, uint64_t *data)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+ int index = 0;
+
+ /* Get LRO statistics */
+ data[index++] = priv->lro.lro_mgr.stats.aggregated;
+ data[index++] = priv->lro.lro_mgr.stats.flushed;
+ if (priv->lro.lro_mgr.stats.flushed)
+ data[index++] = priv->lro.lro_mgr.stats.aggregated /
+ priv->lro.lro_mgr.stats.flushed;
+ else
+ data[index++] = 0;
+ data[index++] = priv->lro.lro_mgr.stats.no_desc;
+}
+
static const struct ethtool_ops ipoib_ethtool_ops = {
.get_drvinfo = ipoib_get_drvinfo,
.get_tso = ethtool_op_get_tso,
.get_coalesce = ipoib_get_coalesce,
.set_coalesce = ipoib_set_coalesce,
+ .get_flags = ethtool_op_get_flags,
+ .set_flags = ethtool_op_set_flags,
+ .get_strings = ipoib_get_strings,
+ .get_sset_count = ipoib_get_sset_count,
+ .get_ethtool_stats = ipoib_get_ethtool_stats,
};
void ipoib_set_ethtool_ops(struct net_device *dev)
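
With get_strings/get_sset_count/get_ethtool_stats wired up, "ethtool -S ib0" (interface name illustrative) now lists the four LRO counters above, and the generic flags ops should let userspace toggle NETIF_F_LRO through the standard ethtool flags interface.
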
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_fs.c b/drivers/infiniband/ulp/ipoib/ipoib_fs.c
index 8b882bbd1d05..961c585da216 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_fs.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_fs.c
@@ -28,8 +28,6 @@
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
- *
- * $Id: ipoib_fs.c 1389 2004-12-27 22:56:47Z roland $
*/
#include <linux/err.h>
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
index f429bce24c20..66cafa20c246 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
@@ -31,8 +31,6 @@
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
- *
- * $Id: ipoib_ib.c 1386 2004-12-27 16:23:17Z roland $
*/
#include <linux/delay.h>
@@ -290,7 +288,10 @@ static void ipoib_ib_handle_rx_wc(struct net_device *dev, struct ib_wc *wc)
if (test_bit(IPOIB_FLAG_CSUM, &priv->flags) && likely(wc->csum_ok))
skb->ip_summed = CHECKSUM_UNNECESSARY;
- netif_receive_skb(skb);
+ if (dev->features & NETIF_F_LRO)
+ lro_receive_skb(&priv->lro.lro_mgr, skb, NULL);
+ else
+ netif_receive_skb(skb);
repost:
if (unlikely(ipoib_ib_post_receive(dev, wr_id)))
@@ -442,6 +443,9 @@ poll_more:
}
if (done < budget) {
+ if (dev->features & NETIF_F_LRO)
+ lro_flush_all(&priv->lro.lro_mgr);
+
netif_rx_complete(dev, napi);
if (unlikely(ib_req_notify_cq(priv->recv_cq,
IB_CQ_NEXT_COMP |
@@ -898,7 +902,8 @@ int ipoib_ib_dev_init(struct net_device *dev, struct ib_device *ca, int port)
return 0;
}
-static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv, int pkey_event)
+static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv,
+ enum ipoib_flush_level level)
{
struct ipoib_dev_priv *cpriv;
struct net_device *dev = priv->dev;
@@ -911,7 +916,7 @@ static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv, int pkey_event)
* the parent is down.
*/
list_for_each_entry(cpriv, &priv->child_intfs, list)
- __ipoib_ib_dev_flush(cpriv, pkey_event);
+ __ipoib_ib_dev_flush(cpriv, level);
mutex_unlock(&priv->vlan_mutex);
@@ -925,7 +930,7 @@ static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv, int pkey_event)
return;
}
- if (pkey_event) {
+ if (level == IPOIB_FLUSH_HEAVY) {
if (ib_find_pkey(priv->ca, priv->port, priv->pkey, &new_index)) {
clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags);
ipoib_ib_dev_down(dev, 0);
@@ -943,11 +948,15 @@ static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv, int pkey_event)
priv->pkey_index = new_index;
}
- ipoib_dbg(priv, "flushing\n");
+ if (level == IPOIB_FLUSH_LIGHT) {
+ ipoib_mark_paths_invalid(dev);
+ ipoib_mcast_dev_flush(dev);
+ }
- ipoib_ib_dev_down(dev, 0);
+ if (level >= IPOIB_FLUSH_NORMAL)
+ ipoib_ib_dev_down(dev, 0);
- if (pkey_event) {
+ if (level == IPOIB_FLUSH_HEAVY) {
ipoib_ib_dev_stop(dev, 0);
ipoib_ib_dev_open(dev);
}
@@ -957,27 +966,34 @@ static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv, int pkey_event)
* we get here, don't bring it back up if it's not configured up
*/
if (test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags)) {
- ipoib_ib_dev_up(dev);
+ if (level >= IPOIB_FLUSH_NORMAL)
+ ipoib_ib_dev_up(dev);
ipoib_mcast_restart_task(&priv->restart_task);
}
}
-void ipoib_ib_dev_flush(struct work_struct *work)
+void ipoib_ib_dev_flush_light(struct work_struct *work)
+{
+ struct ipoib_dev_priv *priv =
+ container_of(work, struct ipoib_dev_priv, flush_light);
+
+ __ipoib_ib_dev_flush(priv, IPOIB_FLUSH_LIGHT);
+}
+
+void ipoib_ib_dev_flush_normal(struct work_struct *work)
{
struct ipoib_dev_priv *priv =
- container_of(work, struct ipoib_dev_priv, flush_task);
+ container_of(work, struct ipoib_dev_priv, flush_normal);
- ipoib_dbg(priv, "Flushing %s\n", priv->dev->name);
- __ipoib_ib_dev_flush(priv, 0);
+ __ipoib_ib_dev_flush(priv, IPOIB_FLUSH_NORMAL);
}
-void ipoib_pkey_event(struct work_struct *work)
+void ipoib_ib_dev_flush_heavy(struct work_struct *work)
{
struct ipoib_dev_priv *priv =
- container_of(work, struct ipoib_dev_priv, pkey_event_task);
+ container_of(work, struct ipoib_dev_priv, flush_heavy);
- ipoib_dbg(priv, "Flushing %s and restarting its QP\n", priv->dev->name);
- __ipoib_ib_dev_flush(priv, 1);
+ __ipoib_ib_dev_flush(priv, IPOIB_FLUSH_HEAVY);
}
void ipoib_ib_dev_cleanup(struct net_device *dev)
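
Net effect of the split above: IPOIB_FLUSH_LIGHT only marks cached paths invalid and flushes multicast membership, IPOIB_FLUSH_NORMAL and IPOIB_FLUSH_HEAVY also take the IB side through ipoib_ib_dev_down()/ipoib_ib_dev_up(), and IPOIB_FLUSH_HEAVY alone re-checks the P_Key index and restarts the QP with ipoib_ib_dev_stop()/ipoib_ib_dev_open(), so lighter events no longer pay for a full restart.
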
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c
index 2442090ac8d1..8be9ea0436e6 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_main.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c
@@ -30,8 +30,6 @@
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
- *
- * $Id: ipoib_main.c 1377 2004-12-23 19:57:12Z roland $
*/
#include "ipoib.h"
@@ -62,6 +60,15 @@ MODULE_PARM_DESC(send_queue_size, "Number of descriptors in send queue");
module_param_named(recv_queue_size, ipoib_recvq_size, int, 0444);
MODULE_PARM_DESC(recv_queue_size, "Number of descriptors in receive queue");
+static int lro;
+module_param(lro, bool, 0444);
+MODULE_PARM_DESC(lro, "Enable LRO (Large Receive Offload)");
+
+static int lro_max_aggr = IPOIB_LRO_MAX_AGGR;
+module_param(lro_max_aggr, int, 0644);
+MODULE_PARM_DESC(lro_max_aggr, "LRO: Max packets to be aggregated "
+ "(default = 64)");
+
#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
int ipoib_debug_level;
@@ -350,6 +357,23 @@ void ipoib_path_iter_read(struct ipoib_path_iter *iter,
#endif /* CONFIG_INFINIBAND_IPOIB_DEBUG */
+void ipoib_mark_paths_invalid(struct net_device *dev)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+ struct ipoib_path *path, *tp;
+
+ spin_lock_irq(&priv->lock);
+
+ list_for_each_entry_safe(path, tp, &priv->path_list, list) {
+ ipoib_dbg(priv, "mark path LID 0x%04x GID " IPOIB_GID_FMT " invalid\n",
+ be16_to_cpu(path->pathrec.dlid),
+ IPOIB_GID_ARG(path->pathrec.dgid));
+ path->valid = 0;
+ }
+
+ spin_unlock_irq(&priv->lock);
+}
+
void ipoib_flush_paths(struct net_device *dev)
{
struct ipoib_dev_priv *priv = netdev_priv(dev);
@@ -386,6 +410,7 @@ static void path_rec_completion(int status,
struct net_device *dev = path->dev;
struct ipoib_dev_priv *priv = netdev_priv(dev);
struct ipoib_ah *ah = NULL;
+ struct ipoib_ah *old_ah;
struct ipoib_neigh *neigh, *tn;
struct sk_buff_head skqueue;
struct sk_buff *skb;
@@ -409,6 +434,7 @@ static void path_rec_completion(int status,
spin_lock_irqsave(&priv->lock, flags);
+ old_ah = path->ah;
path->ah = ah;
if (ah) {
@@ -421,6 +447,17 @@ static void path_rec_completion(int status,
__skb_queue_tail(&skqueue, skb);
list_for_each_entry_safe(neigh, tn, &path->neigh_list, list) {
+ if (neigh->ah) {
+ WARN_ON(neigh->ah != old_ah);
+ /*
+ * Dropping the ah reference inside
+ * priv->lock is safe here, because we
+ * will hold one more reference from
+ * the original value of path->ah (ie
+ * old_ah).
+ */
+ ipoib_put_ah(neigh->ah);
+ }
kref_get(&path->ah->ref);
neigh->ah = path->ah;
memcpy(&neigh->dgid.raw, &path->pathrec.dgid.raw,
@@ -443,6 +480,7 @@ static void path_rec_completion(int status,
while ((skb = __skb_dequeue(&neigh->queue)))
__skb_queue_tail(&skqueue, skb);
}
+ path->valid = 1;
}
path->query = NULL;
@@ -450,6 +488,9 @@ static void path_rec_completion(int status,
spin_unlock_irqrestore(&priv->lock, flags);
+ if (old_ah)
+ ipoib_put_ah(old_ah);
+
while ((skb = __skb_dequeue(&skqueue))) {
skb->dev = dev;
if (dev_queue_xmit(skb))
@@ -623,8 +664,9 @@ static void unicast_arp_send(struct sk_buff *skb, struct net_device *dev,
spin_lock(&priv->lock);
path = __path_find(dev, phdr->hwaddr + 4);
- if (!path) {
- path = path_rec_create(dev, phdr->hwaddr + 4);
+ if (!path || !path->valid) {
+ if (!path)
+ path = path_rec_create(dev, phdr->hwaddr + 4);
if (path) {
/* put pseudoheader back on for next time */
skb_push(skb, sizeof *phdr);
@@ -938,6 +980,54 @@ static const struct header_ops ipoib_header_ops = {
.create = ipoib_hard_header,
};
+static int get_skb_hdr(struct sk_buff *skb, void **iphdr,
+ void **tcph, u64 *hdr_flags, void *priv)
+{
+ unsigned int ip_len;
+ struct iphdr *iph;
+
+ if (unlikely(skb->protocol != htons(ETH_P_IP)))
+ return -1;
+
+ /*
+ * In the future we may add an else clause that verifies the
+ * checksum and allows devices which do not calculate checksum
+ * to use LRO.
+ */
+ if (unlikely(skb->ip_summed != CHECKSUM_UNNECESSARY))
+ return -1;
+
+ /* Check for non-TCP packet */
+ skb_reset_network_header(skb);
+ iph = ip_hdr(skb);
+ if (iph->protocol != IPPROTO_TCP)
+ return -1;
+
+ ip_len = ip_hdrlen(skb);
+ skb_set_transport_header(skb, ip_len);
+ *tcph = tcp_hdr(skb);
+
+ /* check if IP header and TCP header are complete */
+ if (ntohs(iph->tot_len) < ip_len + tcp_hdrlen(skb))
+ return -1;
+
+ *hdr_flags = LRO_IPV4 | LRO_TCP;
+ *iphdr = iph;
+
+ return 0;
+}
+
+static void ipoib_lro_setup(struct ipoib_dev_priv *priv)
+{
+ priv->lro.lro_mgr.max_aggr = lro_max_aggr;
+ priv->lro.lro_mgr.max_desc = IPOIB_MAX_LRO_DESCRIPTORS;
+ priv->lro.lro_mgr.lro_arr = priv->lro.lro_desc;
+ priv->lro.lro_mgr.get_skb_header = get_skb_hdr;
+ priv->lro.lro_mgr.features = LRO_F_NAPI;
+ priv->lro.lro_mgr.dev = priv->dev;
+ priv->lro.lro_mgr.ip_summed_aggr = CHECKSUM_UNNECESSARY;
+}
+
static void ipoib_setup(struct net_device *dev)
{
struct ipoib_dev_priv *priv = netdev_priv(dev);
@@ -977,10 +1067,11 @@ static void ipoib_setup(struct net_device *dev)
priv->dev = dev;
+ ipoib_lro_setup(priv);
+
spin_lock_init(&priv->lock);
spin_lock_init(&priv->tx_lock);
- mutex_init(&priv->mcast_mutex);
mutex_init(&priv->vlan_mutex);
INIT_LIST_HEAD(&priv->path_list);
@@ -989,9 +1080,10 @@ static void ipoib_setup(struct net_device *dev)
INIT_LIST_HEAD(&priv->multicast_list);
INIT_DELAYED_WORK(&priv->pkey_poll_task, ipoib_pkey_poll);
- INIT_WORK(&priv->pkey_event_task, ipoib_pkey_event);
INIT_DELAYED_WORK(&priv->mcast_task, ipoib_mcast_join_task);
- INIT_WORK(&priv->flush_task, ipoib_ib_dev_flush);
+ INIT_WORK(&priv->flush_light, ipoib_ib_dev_flush_light);
+ INIT_WORK(&priv->flush_normal, ipoib_ib_dev_flush_normal);
+ INIT_WORK(&priv->flush_heavy, ipoib_ib_dev_flush_heavy);
INIT_WORK(&priv->restart_task, ipoib_mcast_restart_task);
INIT_DELAYED_WORK(&priv->ah_reap_task, ipoib_reap_ah);
}
@@ -1154,6 +1246,9 @@ static struct net_device *ipoib_add_port(const char *format,
priv->dev->features |= NETIF_F_SG | NETIF_F_IP_CSUM;
}
+ if (lro)
+ priv->dev->features |= NETIF_F_LRO;
+
/*
* Set the full membership bit, so that we join the right
* broadcast group, etc.
@@ -1304,6 +1399,12 @@ static int __init ipoib_init_module(void)
ipoib_max_conn_qp = min(ipoib_max_conn_qp, IPOIB_CM_MAX_CONN_QP);
#endif
+ /*
+ * When copying small received packets, we only copy from the
+ * linear data part of the SKB, so we rely on this condition.
+ */
+ BUILD_BUG_ON(IPOIB_CM_COPYBREAK > IPOIB_CM_HEAD_SIZE);
+
ret = ipoib_register_debugfs();
if (ret)
return ret;
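
The BUILD_BUG_ON() added a few lines above turns the copybreak/receive-buffer size relationship into a compile-time check. A rough userspace stand-in for the same idea, using hypothetical constants (the real IPOIB_CM_COPYBREAK and IPOIB_CM_HEAD_SIZE are defined elsewhere in the driver):

/* Rough userspace stand-in for the kernel's BUILD_BUG_ON(): the typedef'd
 * array gets a negative size, and compilation fails, whenever the condition
 * is true. Constants here are hypothetical, not the driver's real values. */
#define STATIC_ASSERT(cond, name) typedef char name[(cond) ? -1 : 1]

#define COPYBREAK 256   /* hypothetical stand-in for IPOIB_CM_COPYBREAK */
#define HEAD_SIZE 512   /* hypothetical stand-in for IPOIB_CM_HEAD_SIZE */

/* Compiles only while COPYBREAK <= HEAD_SIZE, mirroring the check above. */
STATIC_ASSERT(COPYBREAK > HEAD_SIZE, copybreak_must_fit_in_head);

int main(void)
{
	return 0;
}
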
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
index 3f663fb852c1..1fcc9a898d81 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
@@ -30,8 +30,6 @@
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
- *
- * $Id: ipoib_multicast.c 1362 2004-12-18 15:56:29Z roland $
*/
#include <linux/skbuff.h>
@@ -188,6 +186,7 @@ static int ipoib_mcast_join_finish(struct ipoib_mcast *mcast,
struct ipoib_dev_priv *priv = netdev_priv(dev);
struct ipoib_ah *ah;
int ret;
+ int set_qkey = 0;
mcast->mcmember = *mcmember;
@@ -202,6 +201,7 @@ static int ipoib_mcast_join_finish(struct ipoib_mcast *mcast,
priv->qkey = be32_to_cpu(priv->broadcast->mcmember.qkey);
spin_unlock_irq(&priv->lock);
priv->tx_wr.wr.ud.remote_qkey = priv->qkey;
+ set_qkey = 1;
}
if (!test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags)) {
@@ -214,7 +214,7 @@ static int ipoib_mcast_join_finish(struct ipoib_mcast *mcast,
}
ret = ipoib_mcast_attach(dev, be16_to_cpu(mcast->mcmember.mlid),
- &mcast->mcmember.mgid);
+ &mcast->mcmember.mgid, set_qkey);
if (ret < 0) {
ipoib_warn(priv, "couldn't attach QP to multicast group "
IPOIB_GID_FMT "\n",
@@ -575,8 +575,11 @@ void ipoib_mcast_join_task(struct work_struct *work)
priv->mcast_mtu = IPOIB_UD_MTU(ib_mtu_enum_to_int(priv->broadcast->mcmember.mtu));
- if (!ipoib_cm_admin_enabled(dev))
- dev->mtu = min(priv->mcast_mtu, priv->admin_mtu);
+ if (!ipoib_cm_admin_enabled(dev)) {
+ rtnl_lock();
+ dev_set_mtu(dev, min(priv->mcast_mtu, priv->admin_mtu));
+ rtnl_unlock();
+ }
ipoib_dbg_mcast(priv, "successfully joined all multicast groups\n");
@@ -594,10 +597,6 @@ int ipoib_mcast_start_thread(struct net_device *dev)
queue_delayed_work(ipoib_workqueue, &priv->mcast_task, 0);
mutex_unlock(&mcast_mutex);
- spin_lock_irq(&priv->lock);
- set_bit(IPOIB_MCAST_STARTED, &priv->flags);
- spin_unlock_irq(&priv->lock);
-
return 0;
}
@@ -607,10 +606,6 @@ int ipoib_mcast_stop_thread(struct net_device *dev, int flush)
ipoib_dbg_mcast(priv, "stopping multicast thread\n");
- spin_lock_irq(&priv->lock);
- clear_bit(IPOIB_MCAST_STARTED, &priv->flags);
- spin_unlock_irq(&priv->lock);
-
mutex_lock(&mcast_mutex);
clear_bit(IPOIB_MCAST_RUN, &priv->flags);
cancel_delayed_work(&priv->mcast_task);
@@ -635,10 +630,10 @@ static int ipoib_mcast_leave(struct net_device *dev, struct ipoib_mcast *mcast)
IPOIB_GID_ARG(mcast->mcmember.mgid));
/* Remove ourselves from the multicast group */
- ret = ipoib_mcast_detach(dev, be16_to_cpu(mcast->mcmember.mlid),
- &mcast->mcmember.mgid);
+ ret = ib_detach_mcast(priv->qp, &mcast->mcmember.mgid,
+ be16_to_cpu(mcast->mcmember.mlid));
if (ret)
- ipoib_warn(priv, "ipoib_mcast_detach failed (result = %d)\n", ret);
+ ipoib_warn(priv, "ib_detach_mcast failed (result = %d)\n", ret);
}
return 0;
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c
index 8766d29ce3b7..68325119f740 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c
@@ -29,24 +29,17 @@
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
- *
- * $Id: ipoib_verbs.c 1349 2004-12-16 21:09:43Z roland $
*/
#include "ipoib.h"
-int ipoib_mcast_attach(struct net_device *dev, u16 mlid, union ib_gid *mgid)
+int ipoib_mcast_attach(struct net_device *dev, u16 mlid, union ib_gid *mgid, int set_qkey)
{
struct ipoib_dev_priv *priv = netdev_priv(dev);
- struct ib_qp_attr *qp_attr;
+ struct ib_qp_attr *qp_attr = NULL;
int ret;
u16 pkey_index;
- ret = -ENOMEM;
- qp_attr = kmalloc(sizeof *qp_attr, GFP_KERNEL);
- if (!qp_attr)
- goto out;
-
if (ib_find_pkey(priv->ca, priv->port, priv->pkey, &pkey_index)) {
clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags);
ret = -ENXIO;
@@ -54,18 +47,23 @@ int ipoib_mcast_attach(struct net_device *dev, u16 mlid, union ib_gid *mgid)
}
set_bit(IPOIB_PKEY_ASSIGNED, &priv->flags);
- /* set correct QKey for QP */
- qp_attr->qkey = priv->qkey;
- ret = ib_modify_qp(priv->qp, qp_attr, IB_QP_QKEY);
- if (ret) {
- ipoib_warn(priv, "failed to modify QP, ret = %d\n", ret);
- goto out;
+ if (set_qkey) {
+ ret = -ENOMEM;
+ qp_attr = kmalloc(sizeof *qp_attr, GFP_KERNEL);
+ if (!qp_attr)
+ goto out;
+
+ /* set correct QKey for QP */
+ qp_attr->qkey = priv->qkey;
+ ret = ib_modify_qp(priv->qp, qp_attr, IB_QP_QKEY);
+ if (ret) {
+ ipoib_warn(priv, "failed to modify QP, ret = %d\n", ret);
+ goto out;
+ }
}
/* attach QP to multicast group */
- mutex_lock(&priv->mcast_mutex);
ret = ib_attach_mcast(priv->qp, mgid, mlid);
- mutex_unlock(&priv->mcast_mutex);
if (ret)
ipoib_warn(priv, "failed to attach to multicast group, ret = %d\n", ret);
@@ -74,20 +72,6 @@ out:
return ret;
}
-int ipoib_mcast_detach(struct net_device *dev, u16 mlid, union ib_gid *mgid)
-{
- struct ipoib_dev_priv *priv = netdev_priv(dev);
- int ret;
-
- mutex_lock(&priv->mcast_mutex);
- ret = ib_detach_mcast(priv->qp, mgid, mlid);
- mutex_unlock(&priv->mcast_mutex);
- if (ret)
- ipoib_warn(priv, "ib_detach_mcast failed (result = %d)\n", ret);
-
- return ret;
-}
-
int ipoib_init_qp(struct net_device *dev)
{
struct ipoib_dev_priv *priv = netdev_priv(dev);
@@ -201,7 +185,10 @@ int ipoib_transport_dev_init(struct net_device *dev, struct ib_device *ca)
init_attr.recv_cq = priv->recv_cq;
if (priv->hca_caps & IB_DEVICE_UD_TSO)
- init_attr.create_flags = IB_QP_CREATE_IPOIB_UD_LSO;
+ init_attr.create_flags |= IB_QP_CREATE_IPOIB_UD_LSO;
+
+ if (priv->hca_caps & IB_DEVICE_BLOCK_MULTICAST_LOOPBACK)
+ init_attr.create_flags |= IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK;
if (dev->features & NETIF_F_SG)
init_attr.cap.max_send_sge = MAX_SKB_FRAGS + 1;
@@ -289,15 +276,17 @@ void ipoib_event(struct ib_event_handler *handler,
if (record->element.port_num != priv->port)
return;
- if (record->event == IB_EVENT_PORT_ERR ||
- record->event == IB_EVENT_PORT_ACTIVE ||
- record->event == IB_EVENT_LID_CHANGE ||
- record->event == IB_EVENT_SM_CHANGE ||
+ ipoib_dbg(priv, "Event %d on device %s port %d\n", record->event,
+ record->device->name, record->element.port_num);
+
+ if (record->event == IB_EVENT_SM_CHANGE ||
record->event == IB_EVENT_CLIENT_REREGISTER) {
- ipoib_dbg(priv, "Port state change event\n");
- queue_work(ipoib_workqueue, &priv->flush_task);
+ queue_work(ipoib_workqueue, &priv->flush_light);
+ } else if (record->event == IB_EVENT_PORT_ERR ||
+ record->event == IB_EVENT_PORT_ACTIVE ||
+ record->event == IB_EVENT_LID_CHANGE) {
+ queue_work(ipoib_workqueue, &priv->flush_normal);
} else if (record->event == IB_EVENT_PKEY_CHANGE) {
- ipoib_dbg(priv, "P_Key change event on port:%d\n", priv->port);
- queue_work(ipoib_workqueue, &priv->pkey_event_task);
+ queue_work(ipoib_workqueue, &priv->flush_heavy);
}
}
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_vlan.c b/drivers/infiniband/ulp/ipoib/ipoib_vlan.c
index 1cdb5cfb0ff1..b08eb56196d3 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_vlan.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_vlan.c
@@ -28,8 +28,6 @@
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
- *
- * $Id: ipoib_vlan.c 1349 2004-12-16 21:09:43Z roland $
*/
#include <linux/module.h>
diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.c b/drivers/infiniband/ulp/iser/iscsi_iser.c
index aeb58cae9a3f..356fac6d105a 100644
--- a/drivers/infiniband/ulp/iser/iscsi_iser.c
+++ b/drivers/infiniband/ulp/iser/iscsi_iser.c
@@ -42,9 +42,6 @@
* Zhenyu Wang
* Modified by:
* Erez Zilber
- *
- *
- * $Id: iscsi_iser.c 6965 2006-05-07 11:36:20Z ogerlitz $
*/
#include <linux/types.h>
diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.h b/drivers/infiniband/ulp/iser/iscsi_iser.h
index a8c1b300e34d..0e10703cf59e 100644
--- a/drivers/infiniband/ulp/iser/iscsi_iser.h
+++ b/drivers/infiniband/ulp/iser/iscsi_iser.h
@@ -36,8 +36,6 @@
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
- *
- * $Id: iscsi_iser.h 7051 2006-05-10 12:29:11Z ogerlitz $
*/
#ifndef __ISCSI_ISER_H__
#define __ISCSI_ISER_H__
diff --git a/drivers/infiniband/ulp/iser/iser_initiator.c b/drivers/infiniband/ulp/iser/iser_initiator.c
index 08dc81c46f41..31ad498bdc51 100644
--- a/drivers/infiniband/ulp/iser/iser_initiator.c
+++ b/drivers/infiniband/ulp/iser/iser_initiator.c
@@ -28,8 +28,6 @@
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
- *
- * $Id: iser_initiator.c 6964 2006-05-07 11:11:43Z ogerlitz $
*/
#include <linux/kernel.h>
#include <linux/slab.h>
diff --git a/drivers/infiniband/ulp/iser/iser_memory.c b/drivers/infiniband/ulp/iser/iser_memory.c
index cac50c4dc159..81e49cb10ed3 100644
--- a/drivers/infiniband/ulp/iser/iser_memory.c
+++ b/drivers/infiniband/ulp/iser/iser_memory.c
@@ -28,8 +28,6 @@
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
- *
- * $Id: iser_memory.c 6964 2006-05-07 11:11:43Z ogerlitz $
*/
#include <linux/module.h>
#include <linux/kernel.h>
diff --git a/drivers/infiniband/ulp/iser/iser_verbs.c b/drivers/infiniband/ulp/iser/iser_verbs.c
index d19cfe605ebb..77cabee7cc08 100644
--- a/drivers/infiniband/ulp/iser/iser_verbs.c
+++ b/drivers/infiniband/ulp/iser/iser_verbs.c
@@ -29,8 +29,6 @@
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
- *
- * $Id: iser_verbs.c 7051 2006-05-10 12:29:11Z ogerlitz $
*/
#include <linux/kernel.h>
#include <linux/module.h>
diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c
index 435145709dd6..ed7c5f72cb8b 100644
--- a/drivers/infiniband/ulp/srp/ib_srp.c
+++ b/drivers/infiniband/ulp/srp/ib_srp.c
@@ -28,8 +28,6 @@
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
- *
- * $Id: ib_srp.c 3932 2005-11-01 17:19:29Z roland $
*/
#include <linux/module.h>
@@ -49,8 +47,6 @@
#include <scsi/srp.h>
#include <scsi/scsi_transport_srp.h>
-#include <rdma/ib_cache.h>
-
#include "ib_srp.h"
#define DRV_NAME "ib_srp"
@@ -183,10 +179,10 @@ static int srp_init_qp(struct srp_target_port *target,
if (!attr)
return -ENOMEM;
- ret = ib_find_cached_pkey(target->srp_host->srp_dev->dev,
- target->srp_host->port,
- be16_to_cpu(target->path.pkey),
- &attr->pkey_index);
+ ret = ib_find_pkey(target->srp_host->srp_dev->dev,
+ target->srp_host->port,
+ be16_to_cpu(target->path.pkey),
+ &attr->pkey_index);
if (ret)
goto out;
@@ -1883,8 +1879,7 @@ static ssize_t srp_create_target(struct device *dev,
if (ret)
goto err;
- ib_get_cached_gid(host->srp_dev->dev, host->port, 0,
- &target->path.sgid);
+ ib_query_gid(host->srp_dev->dev, host->port, 0, &target->path.sgid);
shost_printk(KERN_DEBUG, target->scsi_host, PFX
"new target: id_ext %016llx ioc_guid %016llx pkey %04x "
diff --git a/drivers/infiniband/ulp/srp/ib_srp.h b/drivers/infiniband/ulp/srp/ib_srp.h
index 63d2ae724061..e185b907fc12 100644
--- a/drivers/infiniband/ulp/srp/ib_srp.h
+++ b/drivers/infiniband/ulp/srp/ib_srp.h
@@ -28,8 +28,6 @@
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
- *
- * $Id: ib_srp.h 3932 2005-11-01 17:19:29Z roland $
*/
#ifndef IB_SRP_H
diff --git a/drivers/net/cxgb3/cxgb3_ctl_defs.h b/drivers/net/cxgb3/cxgb3_ctl_defs.h
index 6c4f32066919..ed0ecd9679cb 100644
--- a/drivers/net/cxgb3/cxgb3_ctl_defs.h
+++ b/drivers/net/cxgb3/cxgb3_ctl_defs.h
@@ -54,6 +54,7 @@ enum {
RDMA_CQ_DISABLE = 16,
RDMA_CTRL_QP_SETUP = 17,
RDMA_GET_MEM = 18,
+ RDMA_GET_MIB = 19,
GET_RX_PAGE_INFO = 50,
};
diff --git a/drivers/net/cxgb3/cxgb3_offload.c b/drivers/net/cxgb3/cxgb3_offload.c
index ff9c013ce535..cf2696873796 100644
--- a/drivers/net/cxgb3/cxgb3_offload.c
+++ b/drivers/net/cxgb3/cxgb3_offload.c
@@ -303,6 +303,12 @@ static int cxgb_rdma_ctl(struct adapter *adapter, unsigned int req, void *data)
spin_unlock_irq(&adapter->sge.reg_lock);
break;
}
+ case RDMA_GET_MIB: {
+ spin_lock(&adapter->stats_lock);
+ t3_tp_get_mib_stats(adapter, (struct tp_mib_stats *)data);
+ spin_unlock(&adapter->stats_lock);
+ break;
+ }
default:
ret = -EOPNOTSUPP;
}
@@ -381,6 +387,7 @@ static int cxgb_offload_ctl(struct t3cdev *tdev, unsigned int req, void *data)
case RDMA_CQ_DISABLE:
case RDMA_CTRL_QP_SETUP:
case RDMA_GET_MEM:
+ case RDMA_GET_MIB:
if (!offload_running(adapter))
return -EAGAIN;
return cxgb_rdma_ctl(adapter, req, data);
diff --git a/drivers/net/cxgb3/version.h b/drivers/net/cxgb3/version.h
index a0177fc55e28..29db711303b9 100644
--- a/drivers/net/cxgb3/version.h
+++ b/drivers/net/cxgb3/version.h
@@ -38,7 +38,7 @@
#define DRV_VERSION "1.0-ko"
/* Firmware version */
-#define FW_VERSION_MAJOR 6
+#define FW_VERSION_MAJOR 7
#define FW_VERSION_MINOR 0
#define FW_VERSION_MICRO 0
#endif /* __CHELSIO_VERSION_H */
diff --git a/drivers/net/mlx4/fw.c b/drivers/net/mlx4/fw.c
index d82f2751d2c7..2b5006b9be67 100644
--- a/drivers/net/mlx4/fw.c
+++ b/drivers/net/mlx4/fw.c
@@ -101,6 +101,34 @@ static void dump_dev_cap_flags(struct mlx4_dev *dev, u32 flags)
mlx4_dbg(dev, " %s\n", fname[i]);
}
+int mlx4_MOD_STAT_CFG(struct mlx4_dev *dev, struct mlx4_mod_stat_cfg *cfg)
+{
+ struct mlx4_cmd_mailbox *mailbox;
+ u32 *inbox;
+ int err = 0;
+
+#define MOD_STAT_CFG_IN_SIZE 0x100
+
+#define MOD_STAT_CFG_PG_SZ_M_OFFSET 0x002
+#define MOD_STAT_CFG_PG_SZ_OFFSET 0x003
+
+ mailbox = mlx4_alloc_cmd_mailbox(dev);
+ if (IS_ERR(mailbox))
+ return PTR_ERR(mailbox);
+ inbox = mailbox->buf;
+
+ memset(inbox, 0, MOD_STAT_CFG_IN_SIZE);
+
+ MLX4_PUT(inbox, cfg->log_pg_sz, MOD_STAT_CFG_PG_SZ_OFFSET);
+ MLX4_PUT(inbox, cfg->log_pg_sz_m, MOD_STAT_CFG_PG_SZ_M_OFFSET);
+
+ err = mlx4_cmd(dev, mailbox->dma, 0, 0, MLX4_CMD_MOD_STAT_CFG,
+ MLX4_CMD_TIME_CLASS_A);
+
+ mlx4_free_cmd_mailbox(dev, mailbox);
+ return err;
+}
+
int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
{
struct mlx4_cmd_mailbox *mailbox;
diff --git a/drivers/net/mlx4/fw.h b/drivers/net/mlx4/fw.h
index 306cb9b0242d..a0e046c149b7 100644
--- a/drivers/net/mlx4/fw.h
+++ b/drivers/net/mlx4/fw.h
@@ -38,6 +38,11 @@
#include "mlx4.h"
#include "icm.h"
+struct mlx4_mod_stat_cfg {
+ u8 log_pg_sz;
+ u8 log_pg_sz_m;
+};
+
struct mlx4_dev_cap {
int max_srq_sz;
int max_qp_sz;
@@ -162,5 +167,6 @@ int mlx4_SET_ICM_SIZE(struct mlx4_dev *dev, u64 icm_size, u64 *aux_pages);
int mlx4_MAP_ICM_AUX(struct mlx4_dev *dev, struct mlx4_icm *icm);
int mlx4_UNMAP_ICM_AUX(struct mlx4_dev *dev);
int mlx4_NOP(struct mlx4_dev *dev);
+int mlx4_MOD_STAT_CFG(struct mlx4_dev *dev, struct mlx4_mod_stat_cfg *cfg);
#endif /* MLX4_FW_H */
diff --git a/drivers/net/mlx4/main.c b/drivers/net/mlx4/main.c
index a6aa49fc1d68..d3736013fe9b 100644
--- a/drivers/net/mlx4/main.c
+++ b/drivers/net/mlx4/main.c
@@ -485,6 +485,7 @@ static int mlx4_init_hca(struct mlx4_dev *dev)
struct mlx4_priv *priv = mlx4_priv(dev);
struct mlx4_adapter adapter;
struct mlx4_dev_cap dev_cap;
+ struct mlx4_mod_stat_cfg mlx4_cfg;
struct mlx4_profile profile;
struct mlx4_init_hca_param init_hca;
u64 icm_size;
@@ -502,6 +503,12 @@ static int mlx4_init_hca(struct mlx4_dev *dev)
return err;
}
+ mlx4_cfg.log_pg_sz_m = 1;
+ mlx4_cfg.log_pg_sz = 0;
+ err = mlx4_MOD_STAT_CFG(dev, &mlx4_cfg);
+ if (err)
+ mlx4_warn(dev, "Failed to override log_pg_sz parameter\n");
+
err = mlx4_dev_cap(dev, &dev_cap);
if (err) {
mlx4_err(dev, "QUERY_DEV_CAP command failed, aborting.\n");
diff --git a/drivers/net/mlx4/mcg.c b/drivers/net/mlx4/mcg.c
index 57f7f1f0d4ec..b4b57870ddfd 100644
--- a/drivers/net/mlx4/mcg.c
+++ b/drivers/net/mlx4/mcg.c
@@ -38,6 +38,9 @@
#include "mlx4.h"
+#define MGM_QPN_MASK 0x00FFFFFF
+#define MGM_BLCK_LB_BIT 30
+
struct mlx4_mgm {
__be32 next_gid_index;
__be32 members_count;
@@ -153,7 +156,8 @@ static int find_mgm(struct mlx4_dev *dev,
return err;
}
-int mlx4_multicast_attach(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16])
+int mlx4_multicast_attach(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16],
+ int block_mcast_loopback)
{
struct mlx4_priv *priv = mlx4_priv(dev);
struct mlx4_cmd_mailbox *mailbox;
@@ -202,13 +206,18 @@ int mlx4_multicast_attach(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16])
}
for (i = 0; i < members_count; ++i)
- if (mgm->qp[i] == cpu_to_be32(qp->qpn)) {
+ if ((be32_to_cpu(mgm->qp[i]) & MGM_QPN_MASK) == qp->qpn) {
mlx4_dbg(dev, "QP %06x already a member of MGM\n", qp->qpn);
err = 0;
goto out;
}
- mgm->qp[members_count++] = cpu_to_be32(qp->qpn);
+ if (block_mcast_loopback)
+ mgm->qp[members_count++] = cpu_to_be32((qp->qpn & MGM_QPN_MASK) |
+ (1 << MGM_BLCK_LB_BIT));
+ else
+ mgm->qp[members_count++] = cpu_to_be32(qp->qpn & MGM_QPN_MASK);
+
mgm->members_count = cpu_to_be32(members_count);
err = mlx4_WRITE_MCG(dev, index, mailbox);
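
Each MGM entry now carries the 24-bit QPN in its low bits and the block-multicast-loopback flag in bit 30, which is why the membership test masks with MGM_QPN_MASK. A small standalone sketch of that packing, assuming the same mask and bit position (userspace illustration only):

#include <stdio.h>
#include <stdint.h>

#define MGM_QPN_MASK    0x00FFFFFF
#define MGM_BLCK_LB_BIT 30

/* Pack a QPN and the block-multicast-loopback flag into one MGM word, as the
 * hunk above does before the cpu_to_be32() conversion. */
static uint32_t mgm_entry(uint32_t qpn, int block_lb)
{
	uint32_t v = qpn & MGM_QPN_MASK;

	if (block_lb)
		v |= 1u << MGM_BLCK_LB_BIT;
	return v;
}

int main(void)
{
	uint32_t e = mgm_entry(0x123, 1);

	/* The membership test masks the flag bit off again, which is why the
	 * comparison above uses (be32_to_cpu(mgm->qp[i]) & MGM_QPN_MASK). */
	printf("entry=0x%08x qpn=0x%06x\n", (unsigned)e,
	       (unsigned)(e & MGM_QPN_MASK));
	return 0;
}
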
@@ -283,7 +292,7 @@ int mlx4_multicast_detach(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16])
members_count = be32_to_cpu(mgm->members_count);
for (loc = -1, i = 0; i < members_count; ++i)
- if (mgm->qp[i] == cpu_to_be32(qp->qpn))
+ if ((be32_to_cpu(mgm->qp[i]) & MGM_QPN_MASK) == qp->qpn)
loc = i;
if (loc == -1) {
diff --git a/firmware/Makefile b/firmware/Makefile
index 809a52624bda..e4f2fb3d1917 100644
--- a/firmware/Makefile
+++ b/firmware/Makefile
@@ -34,8 +34,6 @@ fw-shipped-$(CONFIG_SND_SB16_CSP) += sb16/mulaw_main.csp sb16/alaw_main.csp \
sb16/ima_adpcm_capture.csp
fw-shipped-$(CONFIG_SND_YMFPCI) += yamaha/ds1_ctrl.fw yamaha/ds1_dsp.fw \
yamaha/ds1e_ctrl.fw
-fw-shipped-$(CONFIG_TIGON3) += tigon/tg3.bin tigon/tg3_tso.bin \
- tigon/tg3_tso5.bin
fw-shipped-$(CONFIG_USB_DABUSB) += dabusb/firmware.fw dabusb/bitstream.bin
fw-shipped-$(CONFIG_USB_EMI26) += emi26/loader.fw emi26/firmware.fw \
emi26/bitstream.fw
diff --git a/fs/buffer.c b/fs/buffer.c
index 0f51c0f7c266..5fa1512cd9a2 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -1691,11 +1691,13 @@ static int __block_write_full_page(struct inode *inode, struct page *page,
*/
clear_buffer_dirty(bh);
set_buffer_uptodate(bh);
- } else if (!buffer_mapped(bh) && buffer_dirty(bh)) {
+ } else if ((!buffer_mapped(bh) || buffer_delay(bh)) &&
+ buffer_dirty(bh)) {
WARN_ON(bh->b_size != blocksize);
err = get_block(inode, block, bh, 1);
if (err)
goto recover;
+ clear_buffer_delay(bh);
if (buffer_new(bh)) {
/* blockdev mappings never come here */
clear_buffer_new(bh);
@@ -1774,7 +1776,8 @@ recover:
bh = head;
/* Recovery: lock and submit the mapped buffers */
do {
- if (buffer_mapped(bh) && buffer_dirty(bh)) {
+ if (buffer_mapped(bh) && buffer_dirty(bh) &&
+ !buffer_delay(bh)) {
lock_buffer(bh);
mark_buffer_async_write(bh);
} else {
@@ -2061,6 +2064,7 @@ int generic_write_end(struct file *file, struct address_space *mapping,
struct page *page, void *fsdata)
{
struct inode *inode = mapping->host;
+ int i_size_changed = 0;
copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
@@ -2073,12 +2077,21 @@ int generic_write_end(struct file *file, struct address_space *mapping,
*/
if (pos+copied > inode->i_size) {
i_size_write(inode, pos+copied);
- mark_inode_dirty(inode);
+ i_size_changed = 1;
}
unlock_page(page);
page_cache_release(page);
+ /*
+ * Don't mark the inode dirty under page lock. First, it unnecessarily
+ * lengthens the time the page lock is held. Second, it forces lock
+ * ordering of page lock and transaction start for journaling
+ * filesystems.
+ */
+ if (i_size_changed)
+ mark_inode_dirty(inode);
+
return copied;
}
EXPORT_SYMBOL(generic_write_end);
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 9cc80b9cc8d8..495ab21b9832 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -47,7 +47,7 @@ static int ext4_block_in_group(struct super_block *sb, ext4_fsblk_t block,
ext4_group_t block_group)
{
ext4_group_t actual_group;
- ext4_get_group_no_and_offset(sb, block, &actual_group, 0);
+ ext4_get_group_no_and_offset(sb, block, &actual_group, NULL);
if (actual_group == block_group)
return 1;
return 0;
@@ -121,12 +121,7 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks);
}
} else { /* For META_BG_BLOCK_GROUPS */
- int group_rel = (block_group -
- le32_to_cpu(sbi->s_es->s_first_meta_bg)) %
- EXT4_DESC_PER_BLOCK(sb);
- if (group_rel == 0 || group_rel == 1 ||
- (group_rel == EXT4_DESC_PER_BLOCK(sb) - 1))
- bit_max += 1;
+ bit_max += ext4_bg_num_gdb(sb, block_group);
}
if (block_group == sbi->s_groups_count - 1) {
@@ -295,7 +290,7 @@ err_out:
return 0;
}
/**
- * read_block_bitmap()
+ * ext4_read_block_bitmap()
* @sb: super block
* @block_group: given block group
*
@@ -305,7 +300,7 @@ err_out:
* Return buffer_head on success or NULL in case of failure.
*/
struct buffer_head *
-read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
+ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
{
struct ext4_group_desc * desc;
struct buffer_head * bh = NULL;
@@ -409,8 +404,7 @@ restart:
prev = rsv;
}
printk("Window map complete.\n");
- if (bad)
- BUG();
+ BUG_ON(bad);
}
#define rsv_window_dump(root, verbose) \
__rsv_window_dump((root), (verbose), __func__)
@@ -694,7 +688,7 @@ do_more:
count -= overflow;
}
brelse(bitmap_bh);
- bitmap_bh = read_block_bitmap(sb, block_group);
+ bitmap_bh = ext4_read_block_bitmap(sb, block_group);
if (!bitmap_bh)
goto error_return;
desc = ext4_get_group_desc (sb, block_group, &gd_bh);
@@ -810,6 +804,13 @@ do_more:
spin_unlock(sb_bgl_lock(sbi, block_group));
percpu_counter_add(&sbi->s_freeblocks_counter, count);
+ if (sbi->s_log_groups_per_flex) {
+ ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
+ spin_lock(sb_bgl_lock(sbi, flex_group));
+ sbi->s_flex_groups[flex_group].free_blocks += count;
+ spin_unlock(sb_bgl_lock(sbi, flex_group));
+ }
+
/* We dirtied the bitmap block */
BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
err = ext4_journal_dirty_metadata(handle, bitmap_bh);
@@ -1598,23 +1599,35 @@ out:
/**
* ext4_has_free_blocks()
- * @sbi: in-core super block structure.
+ * @sbi: in-core super block structure.
+ * @nblocks: number of needed blocks
*
- * Check if filesystem has at least 1 free block available for allocation.
+ * Check if filesystem has free blocks available for allocation.
+ * Return the number of blocks available for allocation for this request;
+ * on success, return nblocks
*/
-static int ext4_has_free_blocks(struct ext4_sb_info *sbi)
+ext4_fsblk_t ext4_has_free_blocks(struct ext4_sb_info *sbi,
+ ext4_fsblk_t nblocks)
{
- ext4_fsblk_t free_blocks, root_blocks;
+ ext4_fsblk_t free_blocks;
+ ext4_fsblk_t root_blocks = 0;
free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter);
- root_blocks = ext4_r_blocks_count(sbi->s_es);
- if (free_blocks < root_blocks + 1 && !capable(CAP_SYS_RESOURCE) &&
+
+ if (!capable(CAP_SYS_RESOURCE) &&
sbi->s_resuid != current->fsuid &&
- (sbi->s_resgid == 0 || !in_group_p (sbi->s_resgid))) {
- return 0;
- }
- return 1;
-}
+ (sbi->s_resgid == 0 || !in_group_p(sbi->s_resgid)))
+ root_blocks = ext4_r_blocks_count(sbi->s_es);
+#ifdef CONFIG_SMP
+ if (free_blocks - root_blocks < FBC_BATCH)
+ free_blocks =
+ percpu_counter_sum_and_set(&sbi->s_freeblocks_counter);
+#endif
+ if (free_blocks - root_blocks < nblocks)
+ return free_blocks - root_blocks;
+ return nblocks;
+ }
+
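
The reworked check no longer answers a yes/no question; it clamps the request to what is actually available once the root reservation is taken out (privileged or reserved-uid/gid callers may dip into it). A simplified standalone model of that clamp, with made-up numbers:

#include <stdio.h>

typedef unsigned long long fsblk_t;

/* Simplified model of the clamp above: privileged callers may use the
 * root-reserved blocks, everyone else only gets what is left over. */
static fsblk_t has_free_blocks(fsblk_t free_blocks, fsblk_t root_blocks,
			       fsblk_t nblocks, int privileged)
{
	fsblk_t reserved = privileged ? 0 : root_blocks;

	if (free_blocks - reserved < nblocks)
		return free_blocks - reserved;
	return nblocks;
}

int main(void)
{
	/* 1000 blocks free, 50 reserved for root, an unprivileged caller asks
	 * for 2000: only 950 are granted (a partial allocation). */
	printf("%llu\n", has_free_blocks(1000, 50, 2000, 0));
	return 0;
}
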
/**
* ext4_should_retry_alloc()
@@ -1630,7 +1643,7 @@ static int ext4_has_free_blocks(struct ext4_sb_info *sbi)
*/
int ext4_should_retry_alloc(struct super_block *sb, int *retries)
{
- if (!ext4_has_free_blocks(EXT4_SB(sb)) || (*retries)++ > 3)
+ if (!ext4_has_free_blocks(EXT4_SB(sb), 1) || (*retries)++ > 3)
return 0;
jbd_debug(1, "%s: retrying operation after ENOSPC\n", sb->s_id);
@@ -1639,20 +1652,24 @@ int ext4_should_retry_alloc(struct super_block *sb, int *retries)
}
/**
- * ext4_new_blocks_old() -- core block(s) allocation function
+ * ext4_old_new_blocks() -- core block bitmap based block allocation function
+ *
* @handle: handle to this transaction
* @inode: file inode
* @goal: given target block(filesystem wide)
* @count: target number of blocks to allocate
* @errp: error code
*
- * ext4_new_blocks uses a goal block to assist allocation. It tries to
- * allocate block(s) from the block group contains the goal block first. If that
- * fails, it will try to allocate block(s) from other block groups without
- * any specific goal block.
+ * ext4_old_new_blocks uses a goal block to assist allocation and looks up
+ * the block bitmap directly to do block allocation. It tries to
+ * allocate block(s) from the block group containing the goal block first. If
+ * that fails, it will try to allocate block(s) from other block groups
+ * without any specific goal block.
+ *
+ * This function is called when the -o nomballoc mount option is enabled
*
*/
-ext4_fsblk_t ext4_new_blocks_old(handle_t *handle, struct inode *inode,
+ext4_fsblk_t ext4_old_new_blocks(handle_t *handle, struct inode *inode,
ext4_fsblk_t goal, unsigned long *count, int *errp)
{
struct buffer_head *bitmap_bh = NULL;
@@ -1676,13 +1693,26 @@ ext4_fsblk_t ext4_new_blocks_old(handle_t *handle, struct inode *inode,
ext4_group_t ngroups;
unsigned long num = *count;
- *errp = -ENOSPC;
sb = inode->i_sb;
if (!sb) {
+ *errp = -ENODEV;
printk("ext4_new_block: nonexistent device");
return 0;
}
+ sbi = EXT4_SB(sb);
+ if (!EXT4_I(inode)->i_delalloc_reserved_flag) {
+ /*
+ * With delalloc we already reserved the blocks
+ */
+ *count = ext4_has_free_blocks(sbi, *count);
+ }
+ if (*count == 0) {
+ *errp = -ENOSPC;
+ return 0; /* return with ENOSPC error */
+ }
+ num = *count;
+
/*
* Check quota for allocation of this block.
*/
@@ -1706,11 +1736,6 @@ ext4_fsblk_t ext4_new_blocks_old(handle_t *handle, struct inode *inode,
if (block_i && ((windowsz = block_i->rsv_window_node.rsv_goal_size) > 0))
my_rsv = &block_i->rsv_window_node;
- if (!ext4_has_free_blocks(sbi)) {
- *errp = -ENOSPC;
- goto out;
- }
-
/*
* First, test whether the goal block is free.
*/
@@ -1734,7 +1759,7 @@ retry_alloc:
my_rsv = NULL;
if (free_blocks > 0) {
- bitmap_bh = read_block_bitmap(sb, group_no);
+ bitmap_bh = ext4_read_block_bitmap(sb, group_no);
if (!bitmap_bh)
goto io_error;
grp_alloc_blk = ext4_try_to_allocate_with_rsv(sb, handle,
@@ -1770,7 +1795,7 @@ retry_alloc:
continue;
brelse(bitmap_bh);
- bitmap_bh = read_block_bitmap(sb, group_no);
+ bitmap_bh = ext4_read_block_bitmap(sb, group_no);
if (!bitmap_bh)
goto io_error;
/*
@@ -1882,7 +1907,15 @@ allocated:
le16_add_cpu(&gdp->bg_free_blocks_count, -num);
gdp->bg_checksum = ext4_group_desc_csum(sbi, group_no, gdp);
spin_unlock(sb_bgl_lock(sbi, group_no));
- percpu_counter_sub(&sbi->s_freeblocks_counter, num);
+ if (!EXT4_I(inode)->i_delalloc_reserved_flag)
+ percpu_counter_sub(&sbi->s_freeblocks_counter, num);
+
+ if (sbi->s_log_groups_per_flex) {
+ ext4_group_t flex_group = ext4_flex_group(sbi, group_no);
+ spin_lock(sb_bgl_lock(sbi, flex_group));
+ sbi->s_flex_groups[flex_group].free_blocks -= num;
+ spin_unlock(sb_bgl_lock(sbi, flex_group));
+ }
BUFFER_TRACE(gdp_bh, "journal_dirty_metadata for group descriptor");
err = ext4_journal_dirty_metadata(handle, gdp_bh);
@@ -1915,46 +1948,104 @@ out:
return 0;
}
-ext4_fsblk_t ext4_new_block(handle_t *handle, struct inode *inode,
- ext4_fsblk_t goal, int *errp)
+#define EXT4_META_BLOCK 0x1
+
+static ext4_fsblk_t do_blk_alloc(handle_t *handle, struct inode *inode,
+ ext4_lblk_t iblock, ext4_fsblk_t goal,
+ unsigned long *count, int *errp, int flags)
{
struct ext4_allocation_request ar;
ext4_fsblk_t ret;
if (!test_opt(inode->i_sb, MBALLOC)) {
- unsigned long count = 1;
- ret = ext4_new_blocks_old(handle, inode, goal, &count, errp);
- return ret;
+ return ext4_old_new_blocks(handle, inode, goal, count, errp);
}
memset(&ar, 0, sizeof(ar));
+ /* Fill with neighbour allocated blocks */
+
ar.inode = inode;
ar.goal = goal;
- ar.len = 1;
+ ar.len = *count;
+ ar.logical = iblock;
+
+ if (S_ISREG(inode->i_mode) && !(flags & EXT4_META_BLOCK))
+ /* enable in-core preallocation for data block allocation */
+ ar.flags = EXT4_MB_HINT_DATA;
+ else
+ /* disable in-core preallocation for non-regular files */
+ ar.flags = 0;
+
ret = ext4_mb_new_blocks(handle, &ar, errp);
+ *count = ar.len;
return ret;
}
-ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode,
+/*
+ * ext4_new_meta_blocks() -- allocate blocks for metadata (indexing) blocks
+ *
+ * @handle: handle to this transaction
+ * @inode: file inode
+ * @goal: given target block (filesystem wide)
+ * @count: total number of blocks needed
+ * @errp: error code
+ *
+ * Return the 1st allocated block number on success; *count stores the total
+ * number of blocks allocated, and errors are returned via the errp pointer
+ */
+ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
ext4_fsblk_t goal, unsigned long *count, int *errp)
{
- struct ext4_allocation_request ar;
ext4_fsblk_t ret;
-
- if (!test_opt(inode->i_sb, MBALLOC)) {
- ret = ext4_new_blocks_old(handle, inode, goal, count, errp);
- return ret;
+ ret = do_blk_alloc(handle, inode, 0, goal,
+ count, errp, EXT4_META_BLOCK);
+ /*
+ * Account for the allocated meta blocks
+ */
+ if (!(*errp)) {
+ spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
+ EXT4_I(inode)->i_allocated_meta_blocks += *count;
+ spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
}
-
- memset(&ar, 0, sizeof(ar));
- ar.inode = inode;
- ar.goal = goal;
- ar.len = *count;
- ret = ext4_mb_new_blocks(handle, &ar, errp);
- *count = ar.len;
return ret;
}
+/*
+ * ext4_new_meta_block() -- allocate a single block for metadata (indexing)
+ *
+ * @handle: handle to this transaction
+ * @inode: file inode
+ * @goal: given target block(filesystem wide)
+ * @errp: error code
+ *
+ * Return allocated block number on success
+ */
+ext4_fsblk_t ext4_new_meta_block(handle_t *handle, struct inode *inode,
+ ext4_fsblk_t goal, int *errp)
+{
+ unsigned long count = 1;
+ return ext4_new_meta_blocks(handle, inode, goal, &count, errp);
+}
+
+/*
+ * ext4_new_blocks() -- allocate data blocks
+ *
+ * @handle: handle to this transaction
+ * @inode: file inode
+ * @goal: given target block (filesystem wide)
+ * @count: total number of blocks needed
+ * @errp: error code
+ *
+ * Return the 1st allocated block number on success; *count stores the total
+ * number of blocks allocated, and errors are returned via the errp pointer
+ */
+
+ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode,
+ ext4_lblk_t iblock, ext4_fsblk_t goal,
+ unsigned long *count, int *errp)
+{
+ return do_blk_alloc(handle, inode, iblock, goal, count, errp, 0);
+}
/**
* ext4_count_free_blocks() -- count filesystem free blocks
@@ -1986,7 +2077,7 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb)
continue;
desc_count += le16_to_cpu(gdp->bg_free_blocks_count);
brelse(bitmap_bh);
- bitmap_bh = read_block_bitmap(sb, i);
+ bitmap_bh = ext4_read_block_bitmap(sb, i);
if (bitmap_bh == NULL)
continue;
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index 2bf0331ea194..d3d23d73c08b 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -129,7 +129,8 @@ static int ext4_readdir(struct file * filp,
struct buffer_head *bh = NULL;
map_bh.b_state = 0;
- err = ext4_get_blocks_wrap(NULL, inode, blk, 1, &map_bh, 0, 0);
+ err = ext4_get_blocks_wrap(NULL, inode, blk, 1, &map_bh,
+ 0, 0, 0);
if (err > 0) {
pgoff_t index = map_bh.b_blocknr >>
(PAGE_CACHE_SHIFT - inode->i_blkbits);
@@ -272,7 +273,7 @@ static void free_rb_tree_fname(struct rb_root *root)
while (n) {
/* Do the node's children first */
- if ((n)->rb_left) {
+ if (n->rb_left) {
n = n->rb_left;
continue;
}
@@ -301,24 +302,18 @@ static void free_rb_tree_fname(struct rb_root *root)
parent->rb_right = NULL;
n = parent;
}
- root->rb_node = NULL;
}
-static struct dir_private_info *create_dir_info(loff_t pos)
+static struct dir_private_info *ext4_htree_create_dir_info(loff_t pos)
{
struct dir_private_info *p;
- p = kmalloc(sizeof(struct dir_private_info), GFP_KERNEL);
+ p = kzalloc(sizeof(struct dir_private_info), GFP_KERNEL);
if (!p)
return NULL;
- p->root.rb_node = NULL;
- p->curr_node = NULL;
- p->extra_fname = NULL;
- p->last_pos = 0;
p->curr_hash = pos2maj_hash(pos);
p->curr_minor_hash = pos2min_hash(pos);
- p->next_hash = 0;
return p;
}
@@ -433,7 +428,7 @@ static int ext4_dx_readdir(struct file * filp,
int ret;
if (!info) {
- info = create_dir_info(filp->f_pos);
+ info = ext4_htree_create_dir_info(filp->f_pos);
if (!info)
return -ENOMEM;
filp->private_data = info;
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 8158083f7ac0..303e41cf7b14 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -22,7 +22,7 @@
#include "ext4_i.h"
/*
- * The second extended filesystem constants/structures
+ * The fourth extended filesystem constants/structures
*/
/*
@@ -45,7 +45,7 @@
#define ext4_debug(f, a...) \
do { \
printk (KERN_DEBUG "EXT4-fs DEBUG (%s, %d): %s:", \
- __FILE__, __LINE__, __FUNCTION__); \
+ __FILE__, __LINE__, __func__); \
printk (KERN_DEBUG f, ## a); \
} while (0)
#else
@@ -74,6 +74,9 @@
#define EXT4_MB_HINT_GOAL_ONLY 256
/* goal is meaningful */
#define EXT4_MB_HINT_TRY_GOAL 512
+/* blocks already pre-reserved by delayed allocation */
+#define EXT4_MB_DELALLOC_RESERVED 1024
+
struct ext4_allocation_request {
/* target inode for block we're allocating */
@@ -170,6 +173,15 @@ struct ext4_group_desc
__u32 bg_reserved2[3];
};
+/*
+ * Structure of a flex block group info
+ */
+
+struct flex_groups {
+ __u32 free_inodes;
+ __u32 free_blocks;
+};
+
#define EXT4_BG_INODE_UNINIT 0x0001 /* Inode table/bitmap not in use */
#define EXT4_BG_BLOCK_UNINIT 0x0002 /* Block bitmap not in use */
#define EXT4_BG_INODE_ZEROED 0x0004 /* On-disk itable initialized to zero */
@@ -527,6 +539,7 @@ do { \
#define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */
#define EXT4_MOUNT_I_VERSION 0x2000000 /* i_version support */
#define EXT4_MOUNT_MBALLOC 0x4000000 /* Buddy allocation support */
+#define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */
/* Compatibility, for having both ext2_fs.h and ext4_fs.h included at once */
#ifndef _LINUX_EXT2_FS_H
#define clear_opt(o, opt) o &= ~EXT4_MOUNT_##opt
@@ -647,7 +660,10 @@ struct ext4_super_block {
__le16 s_mmp_interval; /* # seconds to wait in MMP checking */
__le64 s_mmp_block; /* Block for multi-mount protection */
__le32 s_raid_stripe_width; /* blocks on all data disks (N*stride)*/
- __u32 s_reserved[163]; /* Padding to the end of the block */
+ __u8 s_log_groups_per_flex; /* FLEX_BG group size */
+ __u8 s_reserved_char_pad2;
+ __le16 s_reserved_pad;
+ __u32 s_reserved[162]; /* Padding to the end of the block */
};
#ifdef __KERNEL__
@@ -958,12 +974,17 @@ extern ext4_grpblk_t ext4_block_group_offset(struct super_block *sb,
extern int ext4_bg_has_super(struct super_block *sb, ext4_group_t group);
extern unsigned long ext4_bg_num_gdb(struct super_block *sb,
ext4_group_t group);
-extern ext4_fsblk_t ext4_new_block (handle_t *handle, struct inode *inode,
+extern ext4_fsblk_t ext4_new_meta_block(handle_t *handle, struct inode *inode,
ext4_fsblk_t goal, int *errp);
-extern ext4_fsblk_t ext4_new_blocks (handle_t *handle, struct inode *inode,
+extern ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
ext4_fsblk_t goal, unsigned long *count, int *errp);
-extern ext4_fsblk_t ext4_new_blocks_old(handle_t *handle, struct inode *inode,
+extern ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode,
+ ext4_lblk_t iblock, ext4_fsblk_t goal,
+ unsigned long *count, int *errp);
+extern ext4_fsblk_t ext4_old_new_blocks(handle_t *handle, struct inode *inode,
ext4_fsblk_t goal, unsigned long *count, int *errp);
+extern ext4_fsblk_t ext4_has_free_blocks(struct ext4_sb_info *sbi,
+ ext4_fsblk_t nblocks);
extern void ext4_free_blocks (handle_t *handle, struct inode *inode,
ext4_fsblk_t block, unsigned long count, int metadata);
extern void ext4_free_blocks_sb (handle_t *handle, struct super_block *sb,
@@ -1016,9 +1037,14 @@ extern int __init init_ext4_mballoc(void);
extern void exit_ext4_mballoc(void);
extern void ext4_mb_free_blocks(handle_t *, struct inode *,
unsigned long, unsigned long, int, unsigned long *);
+extern int ext4_mb_add_more_groupinfo(struct super_block *sb,
+ ext4_group_t i, struct ext4_group_desc *desc);
+extern void ext4_mb_update_group_info(struct ext4_group_info *grp,
+ ext4_grpblk_t add);
/* inode.c */
+void ext4_da_release_space(struct inode *inode, int used, int to_free);
int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode,
struct buffer_head *bh, ext4_fsblk_t blocknr);
struct buffer_head *ext4_getblk(handle_t *, struct inode *,
@@ -1033,19 +1059,23 @@ int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
extern struct inode *ext4_iget(struct super_block *, unsigned long);
extern int ext4_write_inode (struct inode *, int);
extern int ext4_setattr (struct dentry *, struct iattr *);
+extern int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,
+ struct kstat *stat);
extern void ext4_delete_inode (struct inode *);
extern int ext4_sync_inode (handle_t *, struct inode *);
extern void ext4_discard_reservation (struct inode *);
extern void ext4_dirty_inode(struct inode *);
extern int ext4_change_inode_journal_flag(struct inode *, int);
extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *);
+extern int ext4_can_truncate(struct inode *inode);
extern void ext4_truncate (struct inode *);
extern void ext4_set_inode_flags(struct inode *);
extern void ext4_get_inode_flags(struct ext4_inode_info *);
extern void ext4_set_aops(struct inode *inode);
extern int ext4_writepage_trans_blocks(struct inode *);
-extern int ext4_block_truncate_page(handle_t *handle, struct page *page,
+extern int ext4_block_truncate_page(handle_t *handle,
struct address_space *mapping, loff_t from);
+extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page);
/* ioctl.c */
extern long ext4_ioctl(struct file *, unsigned int, unsigned long);
@@ -1159,10 +1189,21 @@ struct ext4_group_info *ext4_get_group_info(struct super_block *sb,
}
+static inline ext4_group_t ext4_flex_group(struct ext4_sb_info *sbi,
+ ext4_group_t block_group)
+{
+ return block_group >> sbi->s_log_groups_per_flex;
+}
+
+static inline unsigned int ext4_flex_bg_size(struct ext4_sb_info *sbi)
+{
+ return 1 << sbi->s_log_groups_per_flex;
+}
+
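
With s_log_groups_per_flex set to N, a flex group is simply a run of 2^N consecutive block groups, so both helpers above reduce to shifts. A quick arithmetic illustration, assuming a hypothetical N of 4:

#include <stdio.h>

int main(void)
{
	unsigned int log_groups_per_flex = 4;  /* hypothetical value */
	unsigned int flex_bg_size = 1u << log_groups_per_flex;
	unsigned int block_group = 37;
	unsigned int flex_group = block_group >> log_groups_per_flex;

	/* 16 block groups per flex group; block group 37 falls in flex
	 * group 2, which covers groups 32..47. */
	printf("flex_bg_size=%u, group %u -> flex group %u\n",
	       flex_bg_size, block_group, flex_group);
	return 0;
}
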
#define ext4_std_error(sb, errno) \
do { \
if ((errno)) \
- __ext4_std_error((sb), __FUNCTION__, (errno)); \
+ __ext4_std_error((sb), __func__, (errno)); \
} while (0)
/*
@@ -1191,7 +1232,7 @@ extern int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
ext4_lblk_t iblock,
unsigned long max_blocks, struct buffer_head *bh_result,
int create, int extend_disksize);
-extern void ext4_ext_truncate(struct inode *, struct page *);
+extern void ext4_ext_truncate(struct inode *);
extern void ext4_ext_init(struct super_block *);
extern void ext4_ext_release(struct super_block *);
extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset,
@@ -1199,7 +1240,7 @@ extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset,
extern int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode,
sector_t block, unsigned long max_blocks,
struct buffer_head *bh, int create,
- int extend_disksize);
+ int extend_disksize, int flag);
#endif /* __KERNEL__ */
#endif /* _EXT4_H */
diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
index 75333b595fab..6c166c0a54b7 100644
--- a/fs/ext4/ext4_extents.h
+++ b/fs/ext4/ext4_extents.h
@@ -212,6 +212,7 @@ static inline int ext4_ext_get_actual_len(struct ext4_extent *ext)
(le16_to_cpu(ext->ee_len) - EXT_INIT_MAX_LEN));
}
+extern int ext4_ext_calc_metadata_amount(struct inode *inode, int blocks);
extern ext4_fsblk_t idx_pblock(struct ext4_extent_idx *);
extern void ext4_ext_store_pblock(struct ext4_extent *, ext4_fsblk_t);
extern int ext4_extent_tree_init(handle_t *, struct inode *);
diff --git a/fs/ext4/ext4_i.h b/fs/ext4/ext4_i.h
index 26a4ae255d79..ef7409f0e7e4 100644
--- a/fs/ext4/ext4_i.h
+++ b/fs/ext4/ext4_i.h
@@ -79,7 +79,7 @@ struct ext4_ext_cache {
};
/*
- * third extended file system inode data in memory
+ * fourth extended file system inode data in memory
*/
struct ext4_inode_info {
__le32 i_data[15]; /* unconverted */
@@ -150,6 +150,7 @@ struct ext4_inode_info {
*/
struct rw_semaphore i_data_sem;
struct inode vfs_inode;
+ struct jbd2_inode jinode;
unsigned long i_ext_generation;
struct ext4_ext_cache i_cached_extent;
@@ -162,6 +163,13 @@ struct ext4_inode_info {
/* mballoc */
struct list_head i_prealloc_list;
spinlock_t i_prealloc_lock;
+
+ /* allocation reservation info for delalloc */
+ unsigned long i_reserved_data_blocks;
+ unsigned long i_reserved_meta_blocks;
+ unsigned long i_allocated_meta_blocks;
+ unsigned short i_delalloc_reserved_flag;
+ spinlock_t i_block_reservation_lock;
};
#endif /* _EXT4_I */
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index 9255a7d28b24..eb8bc3afe6e9 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -142,19 +142,17 @@ int __ext4_journal_dirty_metadata(const char *where,
handle_t *handle, struct buffer_head *bh);
#define ext4_journal_get_undo_access(handle, bh) \
- __ext4_journal_get_undo_access(__FUNCTION__, (handle), (bh))
+ __ext4_journal_get_undo_access(__func__, (handle), (bh))
#define ext4_journal_get_write_access(handle, bh) \
- __ext4_journal_get_write_access(__FUNCTION__, (handle), (bh))
+ __ext4_journal_get_write_access(__func__, (handle), (bh))
#define ext4_journal_revoke(handle, blocknr, bh) \
- __ext4_journal_revoke(__FUNCTION__, (handle), (blocknr), (bh))
+ __ext4_journal_revoke(__func__, (handle), (blocknr), (bh))
#define ext4_journal_get_create_access(handle, bh) \
- __ext4_journal_get_create_access(__FUNCTION__, (handle), (bh))
+ __ext4_journal_get_create_access(__func__, (handle), (bh))
#define ext4_journal_dirty_metadata(handle, bh) \
- __ext4_journal_dirty_metadata(__FUNCTION__, (handle), (bh))
+ __ext4_journal_dirty_metadata(__func__, (handle), (bh))
#define ext4_journal_forget(handle, bh) \
- __ext4_journal_forget(__FUNCTION__, (handle), (bh))
-
-int ext4_journal_dirty_data(handle_t *handle, struct buffer_head *bh);
+ __ext4_journal_forget(__func__, (handle), (bh))
handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks);
int __ext4_journal_stop(const char *where, handle_t *handle);
@@ -165,7 +163,7 @@ static inline handle_t *ext4_journal_start(struct inode *inode, int nblocks)
}
#define ext4_journal_stop(handle) \
- __ext4_journal_stop(__FUNCTION__, (handle))
+ __ext4_journal_stop(__func__, (handle))
static inline handle_t *ext4_journal_current_handle(void)
{
@@ -192,6 +190,11 @@ static inline int ext4_journal_force_commit(journal_t *journal)
return jbd2_journal_force_commit(journal);
}
+static inline int ext4_jbd2_file_inode(handle_t *handle, struct inode *inode)
+{
+ return jbd2_journal_file_inode(handle, &EXT4_I(inode)->jinode);
+}
+
/* super.c */
int ext4_force_commit(struct super_block *sb);
diff --git a/fs/ext4/ext4_sb.h b/fs/ext4/ext4_sb.h
index 5802e69f2191..6300226d5531 100644
--- a/fs/ext4/ext4_sb.h
+++ b/fs/ext4/ext4_sb.h
@@ -25,7 +25,7 @@
#include <linux/rbtree.h>
/*
- * third extended-fs super-block data in memory
+ * fourth extended-fs super-block data in memory
*/
struct ext4_sb_info {
unsigned long s_desc_size; /* Size of a group descriptor in bytes */
@@ -143,6 +143,9 @@ struct ext4_sb_info {
/* locality groups */
struct ext4_locality_group *s_locality_groups;
+
+ unsigned int s_log_groups_per_flex;
+ struct flex_groups *s_flex_groups;
};
#endif /* _EXT4_SB */
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 47929c4e3dae..42c4c0c892ed 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -92,17 +92,16 @@ static void ext4_idx_store_pblock(struct ext4_extent_idx *ix, ext4_fsblk_t pb)
ix->ei_leaf_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff);
}
-static handle_t *ext4_ext_journal_restart(handle_t *handle, int needed)
+static int ext4_ext_journal_restart(handle_t *handle, int needed)
{
int err;
if (handle->h_buffer_credits > needed)
- return handle;
- if (!ext4_journal_extend(handle, needed))
- return handle;
- err = ext4_journal_restart(handle, needed);
-
- return handle;
+ return 0;
+ err = ext4_journal_extend(handle, needed);
+ if (err)
+ return err;
+ return ext4_journal_restart(handle, needed);
}
/*
@@ -180,15 +179,18 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
return bg_start + colour + block;
}
+/*
+ * Allocation for a meta data block
+ */
static ext4_fsblk_t
-ext4_ext_new_block(handle_t *handle, struct inode *inode,
+ext4_ext_new_meta_block(handle_t *handle, struct inode *inode,
struct ext4_ext_path *path,
struct ext4_extent *ex, int *err)
{
ext4_fsblk_t goal, newblock;
goal = ext4_ext_find_goal(inode, path, le32_to_cpu(ex->ee_block));
- newblock = ext4_new_block(handle, inode, goal, err);
+ newblock = ext4_new_meta_block(handle, inode, goal, err);
return newblock;
}
@@ -246,6 +248,36 @@ static int ext4_ext_space_root_idx(struct inode *inode)
return size;
}
+/*
+ * Calculate the number of metadata blocks needed
+ * to allocate @blocks
+ * Worse case is one block per extent
+ */
+int ext4_ext_calc_metadata_amount(struct inode *inode, int blocks)
+{
+ int lcap, icap, rcap, leafs, idxs, num;
+ int newextents = blocks;
+
+ rcap = ext4_ext_space_root_idx(inode);
+ lcap = ext4_ext_space_block(inode);
+ icap = ext4_ext_space_block_idx(inode);
+
+ /* number of new leaf blocks needed */
+ num = leafs = (newextents + lcap - 1) / lcap;
+
+ /*
+ * Worst case, we need separate index block(s)
+ * to link all new leaf blocks
+ */
+ idxs = (leafs + icap - 1) / icap;
+ do {
+ num += idxs;
+ idxs = (idxs + icap - 1) / icap;
+ } while (idxs > rcap);
+
+ return num;
+}
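
The loop above charges one leaf block per lcap new extents and then enough index blocks, level by level, until the remaining index entries fit in the inode's root. A standalone sketch of the same worst-case estimate, with hypothetical per-block capacities (the kernel derives lcap/icap/rcap from the block size and the in-inode extent header):

#include <stdio.h>

/* Same worst-case estimate as ext4_ext_calc_metadata_amount(), but with the
 * per-block capacities passed in explicitly. */
static int meta_blocks(int blocks, int lcap, int icap, int rcap)
{
	int newextents = blocks;                    /* one extent per block */
	int leafs = (newextents + lcap - 1) / lcap; /* leaf blocks needed */
	int num = leafs;
	int idxs = (leafs + icap - 1) / icap;       /* first index level */

	do {
		num += idxs;
		idxs = (idxs + icap - 1) / icap;
	} while (idxs > rcap);

	return num;
}

int main(void)
{
	/* Hypothetical capacities: 340 extents per leaf block, 340 entries per
	 * index block, 4 index slots in the inode root. Allocating 100000
	 * blocks then costs at most 295 leaves + 1 index block = 296. */
	printf("%d\n", meta_blocks(100000, 340, 340, 4));
	return 0;
}
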
+
static int
ext4_ext_max_entries(struct inode *inode, int depth)
{
@@ -524,6 +556,7 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
alloc = 1;
}
path[0].p_hdr = eh;
+ path[0].p_bh = NULL;
i = depth;
/* walk through the tree */
@@ -552,12 +585,14 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
}
path[ppos].p_depth = i;
- path[ppos].p_hdr = eh;
path[ppos].p_ext = NULL;
path[ppos].p_idx = NULL;
/* find extent */
ext4_ext_binsearch(inode, path + ppos, block);
+ /* if not an empty leaf */
+ if (path[ppos].p_ext)
+ path[ppos].p_block = ext_pblock(path[ppos].p_ext);
ext4_ext_show_path(inode, path);
@@ -688,7 +723,8 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
/* allocate all needed blocks */
ext_debug("allocate %d blocks for indexes/leaf\n", depth - at);
for (a = 0; a < depth - at; a++) {
- newblock = ext4_ext_new_block(handle, inode, path, newext, &err);
+ newblock = ext4_ext_new_meta_block(handle, inode, path,
+ newext, &err);
if (newblock == 0)
goto cleanup;
ablocks[a] = newblock;
@@ -884,7 +920,7 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
ext4_fsblk_t newblock;
int err = 0;
- newblock = ext4_ext_new_block(handle, inode, path, newext, &err);
+ newblock = ext4_ext_new_meta_block(handle, inode, path, newext, &err);
if (newblock == 0)
return err;
@@ -981,6 +1017,8 @@ repeat:
/* if we found index with free entry, then use that
* entry: create all needed subtree and add new leaf */
err = ext4_ext_split(handle, inode, path, newext, i);
+ if (err)
+ goto out;
/* refill path */
ext4_ext_drop_refs(path);
@@ -1883,11 +1921,9 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
credits += 2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb);
#endif
- handle = ext4_ext_journal_restart(handle, credits);
- if (IS_ERR(handle)) {
- err = PTR_ERR(handle);
+ err = ext4_ext_journal_restart(handle, credits);
+ if (err)
goto out;
- }
err = ext4_ext_get_access(handle, inode, path + depth);
if (err)
@@ -2529,6 +2565,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
int err = 0, depth, ret;
unsigned long allocated = 0;
struct ext4_allocation_request ar;
+ loff_t disksize;
__clear_bit(BH_New, &bh_result->b_state);
ext_debug("blocks %u/%lu requested for inode %u\n",
@@ -2616,8 +2653,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
*/
if (allocated > max_blocks)
allocated = max_blocks;
- /* mark the buffer unwritten */
- __set_bit(BH_Unwritten, &bh_result->b_state);
+ set_buffer_unwritten(bh_result);
goto out2;
}
@@ -2716,14 +2752,19 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
goto out2;
}
- if (extend_disksize && inode->i_size > EXT4_I(inode)->i_disksize)
- EXT4_I(inode)->i_disksize = inode->i_size;
-
/* previous routine could use block we allocated */
newblock = ext_pblock(&newex);
allocated = ext4_ext_get_actual_len(&newex);
outnew:
- __set_bit(BH_New, &bh_result->b_state);
+ if (extend_disksize) {
+ disksize = ((loff_t) iblock + ar.len) << inode->i_blkbits;
+ if (disksize > i_size_read(inode))
+ disksize = i_size_read(inode);
+ if (disksize > EXT4_I(inode)->i_disksize)
+ EXT4_I(inode)->i_disksize = disksize;
+ }
+
+ set_buffer_new(bh_result);
/* Cache only when it is _not_ an uninitialized extent */
if (create != EXT4_CREATE_UNINITIALIZED_EXT)
@@ -2733,7 +2774,7 @@ out:
if (allocated > max_blocks)
allocated = max_blocks;
ext4_ext_show_leaf(inode, path);
- __set_bit(BH_Mapped, &bh_result->b_state);
+ set_buffer_mapped(bh_result);
bh_result->b_bdev = inode->i_sb->s_bdev;
bh_result->b_blocknr = newblock;
out2:
@@ -2744,7 +2785,7 @@ out2:
return err ? err : allocated;
}
-void ext4_ext_truncate(struct inode * inode, struct page *page)
+void ext4_ext_truncate(struct inode *inode)
{
struct address_space *mapping = inode->i_mapping;
struct super_block *sb = inode->i_sb;
@@ -2757,18 +2798,14 @@ void ext4_ext_truncate(struct inode * inode, struct page *page)
*/
err = ext4_writepage_trans_blocks(inode) + 3;
handle = ext4_journal_start(inode, err);
- if (IS_ERR(handle)) {
- if (page) {
- clear_highpage(page);
- flush_dcache_page(page);
- unlock_page(page);
- page_cache_release(page);
- }
+ if (IS_ERR(handle))
return;
- }
- if (page)
- ext4_block_truncate_page(handle, page, mapping, inode->i_size);
+ if (inode->i_size & (sb->s_blocksize - 1))
+ ext4_block_truncate_page(handle, mapping, inode->i_size);
+
+ if (ext4_orphan_add(handle, inode))
+ goto out_stop;
down_write(&EXT4_I(inode)->i_data_sem);
ext4_ext_invalidate_cache(inode);
@@ -2780,8 +2817,6 @@ void ext4_ext_truncate(struct inode * inode, struct page *page)
* Probably we need not scan at all,
* because page truncation is enough.
*/
- if (ext4_orphan_add(handle, inode))
- goto out_stop;
/* we have to know where to truncate from in crash case */
EXT4_I(inode)->i_disksize = inode->i_size;
@@ -2798,6 +2833,7 @@ void ext4_ext_truncate(struct inode * inode, struct page *page)
handle->h_sync = 1;
out_stop:
+ up_write(&EXT4_I(inode)->i_data_sem);
/*
* If this was a simple ftruncate() and the file will remain alive,
* then we need to clear up the orphan record which we created above.
@@ -2808,7 +2844,6 @@ out_stop:
if (inode->i_nlink)
ext4_orphan_del(handle, inode);
- up_write(&EXT4_I(inode)->i_data_sem);
inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
ext4_mark_inode_dirty(handle, inode);
ext4_journal_stop(handle);
@@ -2911,7 +2946,7 @@ retry:
}
ret = ext4_get_blocks_wrap(handle, inode, block,
max_blocks, &map_bh,
- EXT4_CREATE_UNINITIALIZED_EXT, 0);
+ EXT4_CREATE_UNINITIALIZED_EXT, 0, 0);
if (ret <= 0) {
#ifdef EXT4FS_DEBUG
WARN_ON(ret <= 0);
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 4159be6366ab..430eb7978db4 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -123,6 +123,23 @@ force_commit:
return ret;
}
+static struct vm_operations_struct ext4_file_vm_ops = {
+ .fault = filemap_fault,
+ .page_mkwrite = ext4_page_mkwrite,
+};
+
+static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ struct address_space *mapping = file->f_mapping;
+
+ if (!mapping->a_ops->readpage)
+ return -ENOEXEC;
+ file_accessed(file);
+ vma->vm_ops = &ext4_file_vm_ops;
+ vma->vm_flags |= VM_CAN_NONLINEAR;
+ return 0;
+}
+
const struct file_operations ext4_file_operations = {
.llseek = generic_file_llseek,
.read = do_sync_read,
@@ -133,7 +150,7 @@ const struct file_operations ext4_file_operations = {
#ifdef CONFIG_COMPAT
.compat_ioctl = ext4_compat_ioctl,
#endif
- .mmap = generic_file_mmap,
+ .mmap = ext4_file_mmap,
.open = generic_file_open,
.release = ext4_release_file,
.fsync = ext4_sync_file,
@@ -144,6 +161,7 @@ const struct file_operations ext4_file_operations = {
const struct inode_operations ext4_file_inode_operations = {
.truncate = ext4_truncate,
.setattr = ext4_setattr,
+ .getattr = ext4_getattr,
#ifdef CONFIG_EXT4DEV_FS_XATTR
.setxattr = generic_setxattr,
.getxattr = generic_getxattr,
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index 1c8ba48d4f8d..a45c3737ad31 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -27,6 +27,7 @@
#include <linux/sched.h>
#include <linux/writeback.h>
#include <linux/jbd2.h>
+#include <linux/blkdev.h>
#include "ext4.h"
#include "ext4_jbd2.h"
@@ -45,6 +46,7 @@
int ext4_sync_file(struct file * file, struct dentry *dentry, int datasync)
{
struct inode *inode = dentry->d_inode;
+ journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
int ret = 0;
J_ASSERT(ext4_journal_current_handle() == NULL);
@@ -85,6 +87,8 @@ int ext4_sync_file(struct file * file, struct dentry *dentry, int datasync)
.nr_to_write = 0, /* sys_fsync did this */
};
ret = sync_inode(inode, &wbc);
+ if (journal && (journal->j_flags & JBD2_BARRIER))
+ blkdev_issue_flush(inode->i_sb->s_bdev, NULL);
}
out:
return ret;
diff --git a/fs/ext4/group.h b/fs/ext4/group.h
index 7eb0604e7eea..c2c0a8d06d0e 100644
--- a/fs/ext4/group.h
+++ b/fs/ext4/group.h
@@ -13,7 +13,7 @@ extern __le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 group,
struct ext4_group_desc *gdp);
extern int ext4_group_desc_csum_verify(struct ext4_sb_info *sbi, __u32 group,
struct ext4_group_desc *gdp);
-struct buffer_head *read_block_bitmap(struct super_block *sb,
+struct buffer_head *ext4_read_block_bitmap(struct super_block *sb,
ext4_group_t block_group);
extern unsigned ext4_init_block_bitmap(struct super_block *sb,
struct buffer_head *bh,
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index c6efbab0c801..a92eb305344f 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -157,6 +157,7 @@ void ext4_free_inode (handle_t *handle, struct inode * inode)
struct ext4_super_block * es;
struct ext4_sb_info *sbi;
int fatal = 0, err;
+ ext4_group_t flex_group;
if (atomic_read(&inode->i_count) > 1) {
printk ("ext4_free_inode: inode has count=%d\n",
@@ -232,6 +233,12 @@ void ext4_free_inode (handle_t *handle, struct inode * inode)
if (is_directory)
percpu_counter_dec(&sbi->s_dirs_counter);
+ if (sbi->s_log_groups_per_flex) {
+ flex_group = ext4_flex_group(sbi, block_group);
+ spin_lock(sb_bgl_lock(sbi, flex_group));
+ sbi->s_flex_groups[flex_group].free_inodes++;
+ spin_unlock(sb_bgl_lock(sbi, flex_group));
+ }
}
BUFFER_TRACE(bh2, "call ext4_journal_dirty_metadata");
err = ext4_journal_dirty_metadata(handle, bh2);
@@ -286,6 +293,80 @@ static int find_group_dir(struct super_block *sb, struct inode *parent,
return ret;
}
+#define free_block_ratio 10
+
+static int find_group_flex(struct super_block *sb, struct inode *parent,
+ ext4_group_t *best_group)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_group_desc *desc;
+ struct buffer_head *bh;
+ struct flex_groups *flex_group = sbi->s_flex_groups;
+ ext4_group_t parent_group = EXT4_I(parent)->i_block_group;
+ ext4_group_t parent_fbg_group = ext4_flex_group(sbi, parent_group);
+ ext4_group_t ngroups = sbi->s_groups_count;
+ int flex_size = ext4_flex_bg_size(sbi);
+ ext4_group_t best_flex = parent_fbg_group;
+ int blocks_per_flex = sbi->s_blocks_per_group * flex_size;
+ int flexbg_free_blocks;
+ int flex_freeb_ratio;
+ ext4_group_t n_fbg_groups;
+ ext4_group_t i;
+
+ n_fbg_groups = (sbi->s_groups_count + flex_size - 1) >>
+ sbi->s_log_groups_per_flex;
+
+find_close_to_parent:
+ flexbg_free_blocks = flex_group[best_flex].free_blocks;
+ flex_freeb_ratio = flexbg_free_blocks * 100 / blocks_per_flex;
+ if (flex_group[best_flex].free_inodes &&
+ flex_freeb_ratio > free_block_ratio)
+ goto found_flexbg;
+
+ if (best_flex && best_flex == parent_fbg_group) {
+ best_flex--;
+ goto find_close_to_parent;
+ }
+
+ for (i = 0; i < n_fbg_groups; i++) {
+ if (i == parent_fbg_group || i == parent_fbg_group - 1)
+ continue;
+
+ flexbg_free_blocks = flex_group[i].free_blocks;
+ flex_freeb_ratio = flexbg_free_blocks * 100 / blocks_per_flex;
+
+ if (flex_freeb_ratio > free_block_ratio &&
+ flex_group[i].free_inodes) {
+ best_flex = i;
+ goto found_flexbg;
+ }
+
+ if (best_flex < 0 ||
+ (flex_group[i].free_blocks >
+ flex_group[best_flex].free_blocks &&
+ flex_group[i].free_inodes))
+ best_flex = i;
+ }
+
+ if (!flex_group[best_flex].free_inodes ||
+ !flex_group[best_flex].free_blocks)
+ return -1;
+
+found_flexbg:
+ for (i = best_flex * flex_size; i < ngroups &&
+ i < (best_flex + 1) * flex_size; i++) {
+ desc = ext4_get_group_desc(sb, i, &bh);
+ if (le16_to_cpu(desc->bg_free_inodes_count)) {
+ *best_group = i;
+ goto out;
+ }
+ }
+
+ return -1;
+out:
+ return 0;
+}
+
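A stand-alone sketch of the free-block-ratio test used by find_group_flex() above, with made-up geometry (16 groups of 32768 blocks per flex group); it models only the arithmetic, not the on-disk structures:

#include <stdio.h>

#define FREE_BLOCK_RATIO 10	/* mirrors free_block_ratio above */

/* Returns non-zero if a flex group with the given free counts would pass
 * the "close to parent" acceptance check in find_group_flex(). */
static int flex_group_acceptable(long free_blocks, long free_inodes,
				 long blocks_per_flex)
{
	long ratio = free_blocks * 100 / blocks_per_flex;

	return free_inodes && ratio > FREE_BLOCK_RATIO;
}

int main(void)
{
	long blocks_per_flex = 16L * 32768;	/* hypothetical geometry */

	printf("%d\n", flex_group_acceptable(200000, 5000, blocks_per_flex)); /* 1 */
	printf("%d\n", flex_group_acceptable(10000, 5000, blocks_per_flex));  /* 0 */
	return 0;
}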
/*
* Orlov's allocator for directories.
*
@@ -501,6 +582,7 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode * dir, int mode)
struct inode *ret;
ext4_group_t i;
int free = 0;
+ ext4_group_t flex_group;
/* Cannot create files in a deleted directory */
if (!dir || !dir->i_nlink)
@@ -514,6 +596,12 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode * dir, int mode)
sbi = EXT4_SB(sb);
es = sbi->s_es;
+
+ if (sbi->s_log_groups_per_flex) {
+ ret2 = find_group_flex(sb, dir, &group);
+ goto got_group;
+ }
+
if (S_ISDIR(mode)) {
if (test_opt (sb, OLDALLOC))
ret2 = find_group_dir(sb, dir, &group);
@@ -522,6 +610,7 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode * dir, int mode)
} else
ret2 = find_group_other(sb, dir, &group);
+got_group:
err = -ENOSPC;
if (ret2 == -1)
goto out;
@@ -600,7 +689,7 @@ got:
/* We may have to initialize the block bitmap if it isn't already */
if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM) &&
gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
- struct buffer_head *block_bh = read_block_bitmap(sb, group);
+ struct buffer_head *block_bh = ext4_read_block_bitmap(sb, group);
BUFFER_TRACE(block_bh, "get block bitmap access");
err = ext4_journal_get_write_access(handle, block_bh);
@@ -676,6 +765,13 @@ got:
percpu_counter_inc(&sbi->s_dirs_counter);
sb->s_dirt = 1;
+ if (sbi->s_log_groups_per_flex) {
+ flex_group = ext4_flex_group(sbi, group);
+ spin_lock(sb_bgl_lock(sbi, flex_group));
+ sbi->s_flex_groups[flex_group].free_inodes--;
+ spin_unlock(sb_bgl_lock(sbi, flex_group));
+ }
+
inode->i_uid = current->fsuid;
if (test_opt (sb, GRPID))
inode->i_gid = dir->i_gid;
@@ -740,14 +836,10 @@ got:
goto fail_free_drop;
if (test_opt(sb, EXTENTS)) {
- /* set extent flag only for diretory, file and normal symlink*/
+ /* set extent flag only for directory, file and normal symlink*/
if (S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode)) {
EXT4_I(inode)->i_flags |= EXT4_EXTENTS_FL;
ext4_ext_tree_init(handle, inode);
- err = ext4_update_incompat_feature(handle, sb,
- EXT4_FEATURE_INCOMPAT_EXTENTS);
- if (err)
- goto fail_free_drop;
}
}
@@ -817,6 +909,14 @@ struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino)
if (IS_ERR(inode))
goto iget_failed;
+ /*
+	 * If the orphan has i_nlink > 0 then it should be able to be
+ * truncated, otherwise it won't be removed from the orphan list
+ * during processing and an infinite loop will result.
+ */
+ if (inode->i_nlink && !ext4_can_truncate(inode))
+ goto bad_orphan;
+
if (NEXT_ORPHAN(inode) > max_ino)
goto bad_orphan;
brelse(bitmap_bh);
@@ -838,6 +938,7 @@ bad_orphan:
printk(KERN_NOTICE "NEXT_ORPHAN(inode)=%u\n",
NEXT_ORPHAN(inode));
printk(KERN_NOTICE "max_ino=%lu\n", max_ino);
+ printk(KERN_NOTICE "i_nlink=%u\n", inode->i_nlink);
/* Avoid freeing blocks if we got a bad deleted inode */
if (inode->i_nlink == 0)
inode->i_blocks = 0;
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 8d9707746413..8ca2763df091 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -32,12 +32,23 @@
#include <linux/string.h>
#include <linux/buffer_head.h>
#include <linux/writeback.h>
+#include <linux/pagevec.h>
#include <linux/mpage.h>
#include <linux/uio.h>
#include <linux/bio.h>
#include "ext4_jbd2.h"
#include "xattr.h"
#include "acl.h"
+#include "ext4_extents.h"
+
+static inline int ext4_begin_ordered_truncate(struct inode *inode,
+ loff_t new_size)
+{
+ return jbd2_journal_begin_ordered_truncate(&EXT4_I(inode)->jinode,
+ new_size);
+}
+
+static void ext4_invalidatepage(struct page *page, unsigned long offset);
/*
* Test whether an inode is a fast symlink.
@@ -181,6 +192,8 @@ void ext4_delete_inode (struct inode * inode)
{
handle_t *handle;
+ if (ext4_should_order_data(inode))
+ ext4_begin_ordered_truncate(inode, 0);
truncate_inode_pages(&inode->i_data, 0);
if (is_bad_inode(inode))
@@ -508,11 +521,12 @@ static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned long blks,
* direct blocks
*/
static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
- ext4_fsblk_t goal, int indirect_blks, int blks,
- ext4_fsblk_t new_blocks[4], int *err)
+ ext4_lblk_t iblock, ext4_fsblk_t goal,
+ int indirect_blks, int blks,
+ ext4_fsblk_t new_blocks[4], int *err)
{
int target, i;
- unsigned long count = 0;
+ unsigned long count = 0, blk_allocated = 0;
int index = 0;
ext4_fsblk_t current_block = 0;
int ret = 0;
@@ -525,12 +539,13 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
* the first direct block of this branch. That's the
* minimum number of blocks need to allocate(required)
*/
- target = blks + indirect_blks;
-
- while (1) {
+ /* first we try to allocate the indirect blocks */
+ target = indirect_blks;
+ while (target > 0) {
count = target;
/* allocating blocks for indirect blocks and direct blocks */
- current_block = ext4_new_blocks(handle,inode,goal,&count,err);
+ current_block = ext4_new_meta_blocks(handle, inode,
+ goal, &count, err);
if (*err)
goto failed_out;
@@ -540,16 +555,48 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
new_blocks[index++] = current_block++;
count--;
}
-
- if (count > 0)
+ if (count > 0) {
+ /*
+ * save the new block number
+ * for the first direct block
+ */
+ new_blocks[index] = current_block;
+ printk(KERN_INFO "%s returned more blocks than "
+ "requested\n", __func__);
+ WARN_ON(1);
break;
+ }
}
- /* save the new block number for the first direct block */
- new_blocks[index] = current_block;
-
+	target = blks - count;
+ blk_allocated = count;
+ if (!target)
+ goto allocated;
+ /* Now allocate data blocks */
+ count = target;
+ /* allocating blocks for data blocks */
+ current_block = ext4_new_blocks(handle, inode, iblock,
+ goal, &count, err);
+ if (*err && (target == blks)) {
+ /*
+ * if the allocation failed and we didn't allocate
+ * any blocks before
+ */
+ goto failed_out;
+ }
+ if (!*err) {
+ if (target == blks) {
+ /*
+ * save the new block number
+ * for the first direct block
+ */
+ new_blocks[index] = current_block;
+ }
+ blk_allocated += count;
+ }
+allocated:
/* total number of blocks allocated for direct blocks */
- ret = count;
+ ret = blk_allocated;
*err = 0;
return ret;
failed_out:
@@ -584,8 +631,9 @@ failed_out:
* as described above and return 0.
*/
static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
- int indirect_blks, int *blks, ext4_fsblk_t goal,
- ext4_lblk_t *offsets, Indirect *branch)
+ ext4_lblk_t iblock, int indirect_blks,
+ int *blks, ext4_fsblk_t goal,
+ ext4_lblk_t *offsets, Indirect *branch)
{
int blocksize = inode->i_sb->s_blocksize;
int i, n = 0;
@@ -595,7 +643,7 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
ext4_fsblk_t new_blocks[4];
ext4_fsblk_t current_block;
- num = ext4_alloc_blocks(handle, inode, goal, indirect_blks,
+ num = ext4_alloc_blocks(handle, inode, iblock, goal, indirect_blks,
*blks, new_blocks, &err);
if (err)
return err;
@@ -799,6 +847,7 @@ int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
struct ext4_inode_info *ei = EXT4_I(inode);
int count = 0;
ext4_fsblk_t first_block = 0;
+ loff_t disksize;
J_ASSERT(!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL));
@@ -855,8 +904,9 @@ int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
/*
* Block out ext4_truncate while we alter the tree
*/
- err = ext4_alloc_branch(handle, inode, indirect_blks, &count, goal,
- offsets + (partial - chain), partial);
+ err = ext4_alloc_branch(handle, inode, iblock, indirect_blks,
+ &count, goal,
+ offsets + (partial - chain), partial);
/*
* The ext4_splice_branch call will free and forget any buffers
@@ -873,8 +923,13 @@ int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
* protect it if you're about to implement concurrent
* ext4_get_block() -bzzz
*/
- if (!err && extend_disksize && inode->i_size > ei->i_disksize)
- ei->i_disksize = inode->i_size;
+ if (!err && extend_disksize) {
+ disksize = ((loff_t) iblock + count) << inode->i_blkbits;
+ if (disksize > i_size_read(inode))
+ disksize = i_size_read(inode);
+ if (disksize > ei->i_disksize)
+ ei->i_disksize = disksize;
+ }
if (err)
goto cleanup;
@@ -934,7 +989,7 @@ out:
*/
int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block,
unsigned long max_blocks, struct buffer_head *bh,
- int create, int extend_disksize)
+ int create, int extend_disksize, int flag)
{
int retval;
@@ -975,6 +1030,15 @@ int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block,
* with create == 1 flag.
*/
down_write((&EXT4_I(inode)->i_data_sem));
+
+ /*
+	 * If the caller is from the delayed allocation writeout path,
+	 * we have already reserved fs blocks for the allocation; let
+	 * the underlying get_block() function know, so it can avoid
+	 * double accounting
+ */
+ if (flag)
+ EXT4_I(inode)->i_delalloc_reserved_flag = 1;
/*
* We need to check for EXT4 here because migrate
* could have changed the inode type in between
@@ -996,6 +1060,18 @@ int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block,
~EXT4_EXT_MIGRATE;
}
}
+
+ if (flag) {
+ EXT4_I(inode)->i_delalloc_reserved_flag = 0;
+ /*
+ * Update reserved blocks/metadata blocks
+ * after successful block allocation
+ * which were deferred till now
+ */
+ if ((retval > 0) && buffer_delay(bh))
+ ext4_da_release_space(inode, retval, 0);
+ }
+
up_write((&EXT4_I(inode)->i_data_sem));
return retval;
}
@@ -1021,7 +1097,7 @@ static int ext4_get_block(struct inode *inode, sector_t iblock,
}
ret = ext4_get_blocks_wrap(handle, inode, iblock,
- max_blocks, bh_result, create, 0);
+ max_blocks, bh_result, create, 0, 0);
if (ret > 0) {
bh_result->b_size = (ret << inode->i_blkbits);
ret = 0;
@@ -1047,7 +1123,7 @@ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
dummy.b_blocknr = -1000;
buffer_trace_init(&dummy.b_history);
err = ext4_get_blocks_wrap(handle, inode, block, 1,
- &dummy, create, 1);
+ &dummy, create, 1, 0);
/*
* ext4_get_blocks_handle() returns number of blocks
* mapped. 0 in case of a HOLE.
@@ -1203,19 +1279,20 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping,
to = from + len;
retry:
- page = __grab_cache_page(mapping, index);
- if (!page)
- return -ENOMEM;
- *pagep = page;
-
handle = ext4_journal_start(inode, needed_blocks);
if (IS_ERR(handle)) {
- unlock_page(page);
- page_cache_release(page);
ret = PTR_ERR(handle);
goto out;
}
+ page = __grab_cache_page(mapping, index);
+ if (!page) {
+ ext4_journal_stop(handle);
+ ret = -ENOMEM;
+ goto out;
+ }
+ *pagep = page;
+
ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
ext4_get_block);
@@ -1225,8 +1302,8 @@ retry:
}
if (ret) {
- ext4_journal_stop(handle);
unlock_page(page);
+ ext4_journal_stop(handle);
page_cache_release(page);
}
@@ -1236,15 +1313,6 @@ out:
return ret;
}
-int ext4_journal_dirty_data(handle_t *handle, struct buffer_head *bh)
-{
- int err = jbd2_journal_dirty_data(handle, bh);
- if (err)
- ext4_journal_abort_handle(__func__, __func__,
- bh, handle, err);
- return err;
-}
-
/* For write_end() in data=journal mode */
static int write_end_fn(handle_t *handle, struct buffer_head *bh)
{
@@ -1255,29 +1323,6 @@ static int write_end_fn(handle_t *handle, struct buffer_head *bh)
}
/*
- * Generic write_end handler for ordered and writeback ext4 journal modes.
- * We can't use generic_write_end, because that unlocks the page and we need to
- * unlock the page after ext4_journal_stop, but ext4_journal_stop must run
- * after block_write_end.
- */
-static int ext4_generic_write_end(struct file *file,
- struct address_space *mapping,
- loff_t pos, unsigned len, unsigned copied,
- struct page *page, void *fsdata)
-{
- struct inode *inode = file->f_mapping->host;
-
- copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
-
- if (pos+copied > inode->i_size) {
- i_size_write(inode, pos+copied);
- mark_inode_dirty(inode);
- }
-
- return copied;
-}
-
-/*
* We need to pick up the new inode size which generic_commit_write gave us
* `file' can be NULL - eg, when called from page_symlink().
*
@@ -1290,15 +1335,14 @@ static int ext4_ordered_write_end(struct file *file,
struct page *page, void *fsdata)
{
handle_t *handle = ext4_journal_current_handle();
- struct inode *inode = file->f_mapping->host;
+ struct inode *inode = mapping->host;
unsigned from, to;
int ret = 0, ret2;
from = pos & (PAGE_CACHE_SIZE - 1);
to = from + len;
- ret = walk_page_buffers(handle, page_buffers(page),
- from, to, NULL, ext4_journal_dirty_data);
+ ret = ext4_jbd2_file_inode(handle, inode);
if (ret == 0) {
/*
@@ -1311,7 +1355,7 @@ static int ext4_ordered_write_end(struct file *file,
new_i_size = pos + copied;
if (new_i_size > EXT4_I(inode)->i_disksize)
EXT4_I(inode)->i_disksize = new_i_size;
- ret2 = ext4_generic_write_end(file, mapping, pos, len, copied,
+ ret2 = generic_write_end(file, mapping, pos, len, copied,
page, fsdata);
copied = ret2;
if (ret2 < 0)
@@ -1320,8 +1364,6 @@ static int ext4_ordered_write_end(struct file *file,
ret2 = ext4_journal_stop(handle);
if (!ret)
ret = ret2;
- unlock_page(page);
- page_cache_release(page);
return ret ? ret : copied;
}
@@ -1332,7 +1374,7 @@ static int ext4_writeback_write_end(struct file *file,
struct page *page, void *fsdata)
{
handle_t *handle = ext4_journal_current_handle();
- struct inode *inode = file->f_mapping->host;
+ struct inode *inode = mapping->host;
int ret = 0, ret2;
loff_t new_i_size;
@@ -1340,7 +1382,7 @@ static int ext4_writeback_write_end(struct file *file,
if (new_i_size > EXT4_I(inode)->i_disksize)
EXT4_I(inode)->i_disksize = new_i_size;
- ret2 = ext4_generic_write_end(file, mapping, pos, len, copied,
+ ret2 = generic_write_end(file, mapping, pos, len, copied,
page, fsdata);
copied = ret2;
if (ret2 < 0)
@@ -1349,8 +1391,6 @@ static int ext4_writeback_write_end(struct file *file,
ret2 = ext4_journal_stop(handle);
if (!ret)
ret = ret2;
- unlock_page(page);
- page_cache_release(page);
return ret ? ret : copied;
}
@@ -1389,14 +1429,965 @@ static int ext4_journalled_write_end(struct file *file,
ret = ret2;
}
+ unlock_page(page);
ret2 = ext4_journal_stop(handle);
if (!ret)
ret = ret2;
- unlock_page(page);
page_cache_release(page);
return ret ? ret : copied;
}
+/*
+ * Calculate the number of metadata blocks that need to be reserved
+ * to allocate @blocks for a non-extent-based file
+ */
+static int ext4_indirect_calc_metadata_amount(struct inode *inode, int blocks)
+{
+ int icap = EXT4_ADDR_PER_BLOCK(inode->i_sb);
+ int ind_blks, dind_blks, tind_blks;
+
+ /* number of new indirect blocks needed */
+ ind_blks = (blocks + icap - 1) / icap;
+
+ dind_blks = (ind_blks + icap - 1) / icap;
+
+ tind_blks = 1;
+
+ return ind_blks + dind_blks + tind_blks;
+}
+
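As a rough worked example of the worst-case reservation above: with 4KiB blocks EXT4_ADDR_PER_BLOCK is 1024, so 2048 data blocks charge 2 indirect + 1 double-indirect + 1 triple-indirect = 4 metadata blocks. A stand-alone sketch of the same arithmetic (values are illustrative):

#include <stdio.h>

/* Mirrors ext4_indirect_calc_metadata_amount() for a given
 * addresses-per-block value (1024 for a 4KiB block size). */
static int indirect_metadata_amount(int blocks, int icap)
{
	int ind_blks = (blocks + icap - 1) / icap;
	int dind_blks = (ind_blks + icap - 1) / icap;
	int tind_blks = 1;

	return ind_blks + dind_blks + tind_blks;
}

int main(void)
{
	printf("%d\n", indirect_metadata_amount(1, 1024));	/* 3 */
	printf("%d\n", indirect_metadata_amount(2048, 1024));	/* 4 */
	return 0;
}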
+/*
+ * Calculate the number of metadata blocks that need to be reserved
+ * to allocate the given number of blocks
+ */
+static int ext4_calc_metadata_amount(struct inode *inode, int blocks)
+{
+ if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)
+ return ext4_ext_calc_metadata_amount(inode, blocks);
+
+ return ext4_indirect_calc_metadata_amount(inode, blocks);
+}
+
+static int ext4_da_reserve_space(struct inode *inode, int nrblocks)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ unsigned long md_needed, mdblocks, total = 0;
+
+ /*
+	 * recalculate the amount of metadata blocks to reserve
+	 * in order to allocate nrblocks; the worst case is one
+	 * extent per block
+ */
+ spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
+ total = EXT4_I(inode)->i_reserved_data_blocks + nrblocks;
+ mdblocks = ext4_calc_metadata_amount(inode, total);
+ BUG_ON(mdblocks < EXT4_I(inode)->i_reserved_meta_blocks);
+
+ md_needed = mdblocks - EXT4_I(inode)->i_reserved_meta_blocks;
+ total = md_needed + nrblocks;
+
+ if (ext4_has_free_blocks(sbi, total) < total) {
+ spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
+ return -ENOSPC;
+ }
+
+ /* reduce fs free blocks counter */
+ percpu_counter_sub(&sbi->s_freeblocks_counter, total);
+
+ EXT4_I(inode)->i_reserved_data_blocks += nrblocks;
+ EXT4_I(inode)->i_reserved_meta_blocks = mdblocks;
+
+ spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
+ return 0; /* success */
+}
+
+void ext4_da_release_space(struct inode *inode, int used, int to_free)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ int total, mdb, mdb_free, release;
+
+ spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
+	/* recalculate the number of metadata blocks that still need to be reserved */
+ total = EXT4_I(inode)->i_reserved_data_blocks - used - to_free;
+ mdb = ext4_calc_metadata_amount(inode, total);
+
+ /* figure out how many metablocks to release */
+ BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks);
+ mdb_free = EXT4_I(inode)->i_reserved_meta_blocks - mdb;
+
+ /* Account for allocated meta_blocks */
+ mdb_free -= EXT4_I(inode)->i_allocated_meta_blocks;
+
+ release = to_free + mdb_free;
+
+ /* update fs free blocks counter for truncate case */
+ percpu_counter_add(&sbi->s_freeblocks_counter, release);
+
+ /* update per-inode reservations */
+ BUG_ON(used + to_free > EXT4_I(inode)->i_reserved_data_blocks);
+ EXT4_I(inode)->i_reserved_data_blocks -= (used + to_free);
+
+ BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks);
+ EXT4_I(inode)->i_reserved_meta_blocks = mdb;
+ EXT4_I(inode)->i_allocated_meta_blocks = 0;
+ spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
+}
+
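A minimal stand-alone model of the accounting kept by the two helpers above: reserving recomputes the metadata estimate for the new total, while releasing re-derives it for the blocks still outstanding and returns the surplus. All names are local to the sketch and m() is only a stand-in for ext4_calc_metadata_amount():

#include <stdio.h>

static long reserved_data, reserved_meta, free_blocks = 1000;

/* Hypothetical stand-in for ext4_calc_metadata_amount(). */
static long m(long data_blocks)
{
	return data_blocks ? (data_blocks + 1023) / 1024 + 2 : 0;
}

/* Mirrors ext4_da_reserve_space(): charge data plus extra metadata up front. */
static int reserve(long nr)
{
	long mdblocks = m(reserved_data + nr);
	long total = nr + (mdblocks - reserved_meta);

	if (total > free_blocks)
		return -1;			/* -ENOSPC in the real code */
	free_blocks -= total;
	reserved_data += nr;
	reserved_meta = mdblocks;
	return 0;
}

/* Mirrors ext4_da_release_space(): re-derive the metadata estimate for the
 * blocks still outstanding and give the surplus back (the correction for
 * already-allocated metadata blocks is omitted in this sketch). */
static void release(long used, long to_free)
{
	long mdb = m(reserved_data - used - to_free);
	long mdb_free = reserved_meta - mdb;

	free_blocks += to_free + mdb_free;
	reserved_data -= used + to_free;
	reserved_meta = mdb;
}

int main(void)
{
	reserve(8);		/* e.g. eight delalloc blocks at write_begin */
	release(8, 0);		/* the eight blocks were really allocated */
	printf("%ld %ld %ld\n", reserved_data, reserved_meta, free_blocks);
	return 0;		/* prints "0 0 992": 8 data blocks consumed */
}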
+static void ext4_da_page_release_reservation(struct page *page,
+ unsigned long offset)
+{
+ int to_release = 0;
+ struct buffer_head *head, *bh;
+ unsigned int curr_off = 0;
+
+ head = page_buffers(page);
+ bh = head;
+ do {
+ unsigned int next_off = curr_off + bh->b_size;
+
+ if ((offset <= curr_off) && (buffer_delay(bh))) {
+ to_release++;
+ clear_buffer_delay(bh);
+ }
+ curr_off = next_off;
+ } while ((bh = bh->b_this_page) != head);
+ ext4_da_release_space(page->mapping->host, 0, to_release);
+}
+
+/*
+ * Delayed allocation stuff
+ */
+
+struct mpage_da_data {
+ struct inode *inode;
+ struct buffer_head lbh; /* extent of blocks */
+ unsigned long first_page, next_page; /* extent of pages */
+ get_block_t *get_block;
+ struct writeback_control *wbc;
+};
+
+/*
+ * mpage_da_submit_io - walks through the extent of pages and tries to write
+ * them with __mpage_writepage()
+ *
+ * @mpd->inode: inode
+ * @mpd->first_page: first page of the extent
+ * @mpd->next_page: page after the last page of the extent
+ * @mpd->get_block: the filesystem's block mapper function
+ *
+ * By the time mpage_da_submit_io() is called we expect all blocks
+ * to be allocated. This may be wrong if allocation failed.
+ *
+ * As pages are already locked by write_cache_pages(), we can't use it
+ */
+static int mpage_da_submit_io(struct mpage_da_data *mpd)
+{
+ struct address_space *mapping = mpd->inode->i_mapping;
+ struct mpage_data mpd_pp = {
+ .bio = NULL,
+ .last_block_in_bio = 0,
+ .get_block = mpd->get_block,
+ .use_writepage = 1,
+ };
+ int ret = 0, err, nr_pages, i;
+ unsigned long index, end;
+ struct pagevec pvec;
+
+ BUG_ON(mpd->next_page <= mpd->first_page);
+
+ pagevec_init(&pvec, 0);
+ index = mpd->first_page;
+ end = mpd->next_page - 1;
+
+ while (index <= end) {
+ /* XXX: optimize tail */
+ nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
+ if (nr_pages == 0)
+ break;
+ for (i = 0; i < nr_pages; i++) {
+ struct page *page = pvec.pages[i];
+
+ index = page->index;
+ if (index > end)
+ break;
+ index++;
+
+ err = __mpage_writepage(page, mpd->wbc, &mpd_pp);
+
+ /*
+ * In error case, we have to continue because
+ * remaining pages are still locked
+ * XXX: unlock and re-dirty them?
+ */
+ if (ret == 0)
+ ret = err;
+ }
+ pagevec_release(&pvec);
+ }
+ if (mpd_pp.bio)
+ mpage_bio_submit(WRITE, mpd_pp.bio);
+
+ return ret;
+}
+
+/*
+ * mpage_put_bnr_to_bhs - walk blocks and assign them actual numbers
+ *
+ * @mpd->inode - inode to walk through
+ * @exbh->b_blocknr - first block on a disk
+ * @exbh->b_size - amount of space in bytes
+ * @logical - first logical block to start assignment with
+ *
+ * the function goes through all the passed space and puts actual disk
+ * block numbers into buffer heads, dropping BH_Delay
+ */
+static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical,
+ struct buffer_head *exbh)
+{
+ struct inode *inode = mpd->inode;
+ struct address_space *mapping = inode->i_mapping;
+ int blocks = exbh->b_size >> inode->i_blkbits;
+ sector_t pblock = exbh->b_blocknr, cur_logical;
+ struct buffer_head *head, *bh;
+ unsigned long index, end;
+ struct pagevec pvec;
+ int nr_pages, i;
+
+ index = logical >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
+ end = (logical + blocks - 1) >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
+ cur_logical = index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
+
+ pagevec_init(&pvec, 0);
+
+ while (index <= end) {
+ /* XXX: optimize tail */
+ nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
+ if (nr_pages == 0)
+ break;
+ for (i = 0; i < nr_pages; i++) {
+ struct page *page = pvec.pages[i];
+
+ index = page->index;
+ if (index > end)
+ break;
+ index++;
+
+ BUG_ON(!PageLocked(page));
+ BUG_ON(PageWriteback(page));
+ BUG_ON(!page_has_buffers(page));
+
+ bh = page_buffers(page);
+ head = bh;
+
+ /* skip blocks out of the range */
+ do {
+ if (cur_logical >= logical)
+ break;
+ cur_logical++;
+ } while ((bh = bh->b_this_page) != head);
+
+ do {
+ if (cur_logical >= logical + blocks)
+ break;
+ if (buffer_delay(bh)) {
+ bh->b_blocknr = pblock;
+ clear_buffer_delay(bh);
+ } else if (buffer_mapped(bh))
+ BUG_ON(bh->b_blocknr != pblock);
+
+ cur_logical++;
+ pblock++;
+ } while ((bh = bh->b_this_page) != head);
+ }
+ pagevec_release(&pvec);
+ }
+}
+
+
+/*
+ * __unmap_underlying_blocks - just a helper function to unmap
+ * set of blocks described by @bh
+ */
+static inline void __unmap_underlying_blocks(struct inode *inode,
+ struct buffer_head *bh)
+{
+ struct block_device *bdev = inode->i_sb->s_bdev;
+ int blocks, i;
+
+ blocks = bh->b_size >> inode->i_blkbits;
+ for (i = 0; i < blocks; i++)
+ unmap_underlying_metadata(bdev, bh->b_blocknr + i);
+}
+
+/*
+ * mpage_da_map_blocks - go through given space
+ *
+ * @mpd->lbh - bh describing space
+ * @mpd->get_block - the filesystem's block mapper function
+ *
+ * The function skips space we know is already mapped to disk blocks.
+ *
+ * The function ignores errors ->get_block() returns, thus real
+ * error handling is postponed to __mpage_writepage()
+ */
+static void mpage_da_map_blocks(struct mpage_da_data *mpd)
+{
+ struct buffer_head *lbh = &mpd->lbh;
+ int err = 0, remain = lbh->b_size;
+ sector_t next = lbh->b_blocknr;
+ struct buffer_head new;
+
+ /*
+ * We consider only non-mapped and non-allocated blocks
+ */
+ if (buffer_mapped(lbh) && !buffer_delay(lbh))
+ return;
+
+ while (remain) {
+ new.b_state = lbh->b_state;
+ new.b_blocknr = 0;
+ new.b_size = remain;
+ err = mpd->get_block(mpd->inode, next, &new, 1);
+ if (err) {
+ /*
+			 * Rather than implement our own error handling
+ * here, we just leave remaining blocks
+ * unallocated and try again with ->writepage()
+ */
+ break;
+ }
+ BUG_ON(new.b_size == 0);
+
+ if (buffer_new(&new))
+ __unmap_underlying_blocks(mpd->inode, &new);
+
+ /*
+		 * If blocks are marked delayed, we need to put the
+		 * actual block number in place and drop the delayed bit
+ */
+ if (buffer_delay(lbh))
+ mpage_put_bnr_to_bhs(mpd, next, &new);
+
+ /* go for the remaining blocks */
+ next += new.b_size >> mpd->inode->i_blkbits;
+ remain -= new.b_size;
+ }
+}
+
+#define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | (1 << BH_Delay))
+
+/*
+ * mpage_add_bh_to_extent - try to add one more block to extent of blocks
+ *
+ * @mpd->lbh - extent of blocks
+ * @logical - logical number of the block in the file
+ * @bh - bh of the block (used to access block's state)
+ *
+ * the function is used to collect contiguous blocks in the same state
+ */
+static void mpage_add_bh_to_extent(struct mpage_da_data *mpd,
+ sector_t logical, struct buffer_head *bh)
+{
+ struct buffer_head *lbh = &mpd->lbh;
+ sector_t next;
+
+ next = lbh->b_blocknr + (lbh->b_size >> mpd->inode->i_blkbits);
+
+ /*
+ * First block in the extent
+ */
+ if (lbh->b_size == 0) {
+ lbh->b_blocknr = logical;
+ lbh->b_size = bh->b_size;
+ lbh->b_state = bh->b_state & BH_FLAGS;
+ return;
+ }
+
+ /*
+ * Can we merge the block to our big extent?
+ */
+ if (logical == next && (bh->b_state & BH_FLAGS) == lbh->b_state) {
+ lbh->b_size += bh->b_size;
+ return;
+ }
+
+ /*
+ * We couldn't merge the block to our extent, so we
+ * need to flush current extent and start new one
+ */
+ mpage_da_map_blocks(mpd);
+
+ /*
+ * Now start a new extent
+ */
+ lbh->b_size = bh->b_size;
+ lbh->b_state = bh->b_state & BH_FLAGS;
+ lbh->b_blocknr = logical;
+}
+
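The helper above accumulates adjacent blocks in the same state into one logical extent and flushes when it can no longer merge. A toy stand-alone model of that accumulation (block numbers and the state flag are invented):

#include <stdio.h>

struct extent { long start, len; unsigned state; };

static struct extent cur;		/* analogue of mpd->lbh */

static void flush(void)
{
	if (cur.len)
		printf("map+submit extent: start=%ld len=%ld state=%u\n",
		       cur.start, cur.len, cur.state);
	cur.len = 0;
}

/* Analogue of mpage_add_bh_to_extent(), one block at a time. */
static void add_block(long logical, unsigned state)
{
	if (cur.len == 0) {
		cur.start = logical;
		cur.len = 1;
		cur.state = state;
		return;
	}
	if (logical == cur.start + cur.len && state == cur.state) {
		cur.len++;		/* contiguous and same state: merge */
		return;
	}
	flush();			/* mpage_da_map_blocks() in the patch */
	cur.start = logical;
	cur.len = 1;
	cur.state = state;
}

int main(void)
{
	long blocks[] = { 10, 11, 12, 20, 21 };
	int i;

	for (i = 0; i < 5; i++)
		add_block(blocks[i], 1 /* e.g. BH_Delay set */);
	flush();	/* final extent, as mpage_da_writepages() does */
	return 0;
}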
+/*
+ * __mpage_da_writepage - finds extent of pages and blocks
+ *
+ * @page: page to consider
+ * @wbc: not used, we just follow rules
+ * @data: context
+ *
+ * The function finds extents of pages and scans them for all blocks.
+ */
+static int __mpage_da_writepage(struct page *page,
+ struct writeback_control *wbc, void *data)
+{
+ struct mpage_da_data *mpd = data;
+ struct inode *inode = mpd->inode;
+ struct buffer_head *bh, *head, fake;
+ sector_t logical;
+
+ /*
+ * Can we merge this page to current extent?
+ */
+ if (mpd->next_page != page->index) {
+ /*
+ * Nope, we can't. So, we map non-allocated blocks
+ * and start IO on them using __mpage_writepage()
+ */
+ if (mpd->next_page != mpd->first_page) {
+ mpage_da_map_blocks(mpd);
+ mpage_da_submit_io(mpd);
+ }
+
+ /*
+ * Start next extent of pages ...
+ */
+ mpd->first_page = page->index;
+
+ /*
+ * ... and blocks
+ */
+ mpd->lbh.b_size = 0;
+ mpd->lbh.b_state = 0;
+ mpd->lbh.b_blocknr = 0;
+ }
+
+ mpd->next_page = page->index + 1;
+ logical = (sector_t) page->index <<
+ (PAGE_CACHE_SHIFT - inode->i_blkbits);
+
+ if (!page_has_buffers(page)) {
+ /*
+		 * There are no attached buffer heads yet (mmap?);
+		 * we treat the page as full of dirty blocks
+ */
+ bh = &fake;
+ bh->b_size = PAGE_CACHE_SIZE;
+ bh->b_state = 0;
+ set_buffer_dirty(bh);
+ set_buffer_uptodate(bh);
+ mpage_add_bh_to_extent(mpd, logical, bh);
+ } else {
+ /*
+ * Page with regular buffer heads, just add all dirty ones
+ */
+ head = page_buffers(page);
+ bh = head;
+ do {
+ BUG_ON(buffer_locked(bh));
+ if (buffer_dirty(bh))
+ mpage_add_bh_to_extent(mpd, logical, bh);
+ logical++;
+ } while ((bh = bh->b_this_page) != head);
+ }
+
+ return 0;
+}
+
+/*
+ * mpage_da_writepages - walk the list of dirty pages of the given
+ * address space, allocates non-allocated blocks, maps newly-allocated
+ * blocks to existing bhs and issue IO them
+ *
+ * @mapping: address space structure to write
+ * @wbc: subtract the number of written pages from *@wbc->nr_to_write
+ * @get_block: the filesystem's block mapper function.
+ *
+ * This is a library function, which implements the writepages()
+ * address_space_operation.
+ *
+ * In order to avoid duplication of logic that deals with partial pages,
+ * multiple bio per page, etc, we find non-allocated blocks, allocate
+ * them with minimal calls to ->get_block() and re-use __mpage_writepage()
+ *
+ * It's important that we call __mpage_writepage() only once for each
+ * involved page, otherwise we'd have to implement more complicated logic
+ * to deal with pages w/o PG_lock or w/ PG_writeback and so on.
+ *
+ * See comments to mpage_writepages()
+ */
+static int mpage_da_writepages(struct address_space *mapping,
+ struct writeback_control *wbc,
+ get_block_t get_block)
+{
+ struct mpage_da_data mpd;
+ int ret;
+
+ if (!get_block)
+ return generic_writepages(mapping, wbc);
+
+ mpd.wbc = wbc;
+ mpd.inode = mapping->host;
+ mpd.lbh.b_size = 0;
+ mpd.lbh.b_state = 0;
+ mpd.lbh.b_blocknr = 0;
+ mpd.first_page = 0;
+ mpd.next_page = 0;
+ mpd.get_block = get_block;
+
+ ret = write_cache_pages(mapping, wbc, __mpage_da_writepage, &mpd);
+
+ /*
+ * Handle last extent of pages
+ */
+ if (mpd.next_page != mpd.first_page) {
+ mpage_da_map_blocks(&mpd);
+ mpage_da_submit_io(&mpd);
+ }
+
+ return ret;
+}
+
+/*
+ * this is a special callback for ->write_begin() only
+ * its intention is to return a mapped block or reserve space
+ */
+static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh_result, int create)
+{
+ int ret = 0;
+
+ BUG_ON(create == 0);
+ BUG_ON(bh_result->b_size != inode->i_sb->s_blocksize);
+
+ /*
+ * first, we need to know whether the block is allocated already
+	 * preallocated blocks are unmapped but should be treated
+ * the same as allocated blocks.
+ */
+ ret = ext4_get_blocks_wrap(NULL, inode, iblock, 1, bh_result, 0, 0, 0);
+ if ((ret == 0) && !buffer_delay(bh_result)) {
+ /* the block isn't (pre)allocated yet, let's reserve space */
+ /*
+ * XXX: __block_prepare_write() unmaps passed block,
+ * is it OK?
+ */
+ ret = ext4_da_reserve_space(inode, 1);
+ if (ret)
+ /* not enough space to reserve */
+ return ret;
+
+ map_bh(bh_result, inode->i_sb, 0);
+ set_buffer_new(bh_result);
+ set_buffer_delay(bh_result);
+ } else if (ret > 0) {
+ bh_result->b_size = (ret << inode->i_blkbits);
+ ret = 0;
+ }
+
+ return ret;
+}
+#define EXT4_DELALLOC_RSVED 1
+static int ext4_da_get_block_write(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh_result, int create)
+{
+ int ret;
+ unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
+ loff_t disksize = EXT4_I(inode)->i_disksize;
+ handle_t *handle = NULL;
+
+ handle = ext4_journal_current_handle();
+ if (!handle) {
+ ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks,
+ bh_result, 0, 0, 0);
+ BUG_ON(!ret);
+ } else {
+ ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks,
+ bh_result, create, 0, EXT4_DELALLOC_RSVED);
+ }
+
+ if (ret > 0) {
+ bh_result->b_size = (ret << inode->i_blkbits);
+
+ /*
+ * Update on-disk size along with block allocation
+ * we don't use 'extend_disksize' as size may change
+ * within already allocated block -bzzz
+ */
+ disksize = ((loff_t) iblock + ret) << inode->i_blkbits;
+ if (disksize > i_size_read(inode))
+ disksize = i_size_read(inode);
+ if (disksize > EXT4_I(inode)->i_disksize) {
+ /*
+ * XXX: replace with spinlock if seen contended -bzzz
+ */
+ down_write(&EXT4_I(inode)->i_data_sem);
+ if (disksize > EXT4_I(inode)->i_disksize)
+ EXT4_I(inode)->i_disksize = disksize;
+ up_write(&EXT4_I(inode)->i_data_sem);
+
+ if (EXT4_I(inode)->i_disksize == disksize) {
+ ret = ext4_mark_inode_dirty(handle, inode);
+ return ret;
+ }
+ }
+ ret = 0;
+ }
+ return ret;
+}
+
+static int ext4_bh_unmapped_or_delay(handle_t *handle, struct buffer_head *bh)
+{
+ /*
+ * unmapped buffer is possible for holes.
+ * delay buffer is possible with delayed allocation
+ */
+ return ((!buffer_mapped(bh) || buffer_delay(bh)) && buffer_dirty(bh));
+}
+
+static int ext4_normal_get_block_write(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh_result, int create)
+{
+ int ret = 0;
+ unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
+
+ /*
+ * we don't want to do block allocation in writepage
+ * so call get_block_wrap with create = 0
+ */
+ ret = ext4_get_blocks_wrap(NULL, inode, iblock, max_blocks,
+ bh_result, 0, 0, 0);
+ if (ret > 0) {
+ bh_result->b_size = (ret << inode->i_blkbits);
+ ret = 0;
+ }
+ return ret;
+}
+
+/*
+ * get called via ext4_da_writepages after taking page lock (have journal handle)
+ * get called via journal_submit_inode_data_buffers (no journal handle)
+ * get called via shrink_page_list via pdflush (no journal handle)
+ * or grab_page_cache when doing write_begin (have journal handle)
+ */
+static int ext4_da_writepage(struct page *page,
+ struct writeback_control *wbc)
+{
+ int ret = 0;
+ loff_t size;
+ unsigned long len;
+ struct buffer_head *page_bufs;
+ struct inode *inode = page->mapping->host;
+
+ size = i_size_read(inode);
+ if (page->index == size >> PAGE_CACHE_SHIFT)
+ len = size & ~PAGE_CACHE_MASK;
+ else
+ len = PAGE_CACHE_SIZE;
+
+ if (page_has_buffers(page)) {
+ page_bufs = page_buffers(page);
+ if (walk_page_buffers(NULL, page_bufs, 0, len, NULL,
+ ext4_bh_unmapped_or_delay)) {
+ /*
+			 * We don't want to do block allocation,
+			 * so redirty the page and return.
+			 * We may reach here when we do a journal commit
+			 * via journal_submit_inode_data_buffers.
+			 * If we don't have a mapping block we just ignore
+			 * them. We can also reach here via shrink_page_list.
+ */
+ redirty_page_for_writepage(wbc, page);
+ unlock_page(page);
+ return 0;
+ }
+ } else {
+ /*
+ * The test for page_has_buffers() is subtle:
+ * We know the page is dirty but it lost buffers. That means
+ * that at some moment in time after write_begin()/write_end()
+ * has been called all buffers have been clean and thus they
+ * must have been written at least once. So they are all
+ * mapped and we can happily proceed with mapping them
+ * and writing the page.
+ *
+ * Try to initialize the buffer_heads and check whether
+ * all are mapped and non delay. We don't want to
+ * do block allocation here.
+ */
+ ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE,
+ ext4_normal_get_block_write);
+ if (!ret) {
+ page_bufs = page_buffers(page);
+ /* check whether all are mapped and non delay */
+ if (walk_page_buffers(NULL, page_bufs, 0, len, NULL,
+ ext4_bh_unmapped_or_delay)) {
+ redirty_page_for_writepage(wbc, page);
+ unlock_page(page);
+ return 0;
+ }
+ } else {
+ /*
+ * We can't do block allocation here
+			 * so just redirty the page and unlock
+ * and return
+ */
+ redirty_page_for_writepage(wbc, page);
+ unlock_page(page);
+ return 0;
+ }
+ }
+
+ if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode))
+ ret = nobh_writepage(page, ext4_normal_get_block_write, wbc);
+ else
+ ret = block_write_full_page(page,
+ ext4_normal_get_block_write,
+ wbc);
+
+ return ret;
+}
+
+/*
+ * For now just follow the DIO way to estimate the max credits
+ * needed to write out EXT4_MAX_WRITEBACK_PAGES.
+ * TODO: we need to calculate the max credits needed for
+ * extent-based files; currently the DIO credits are based on
+ * the indirect-blocks mapping.
+ *
+ * Probably should have a generic way to calculate credits
+ * for DIO, writepages, and truncate
+ */
+#define EXT4_MAX_WRITEBACK_PAGES DIO_MAX_BLOCKS
+#define EXT4_MAX_WRITEBACK_CREDITS DIO_CREDITS
+
+static int ext4_da_writepages(struct address_space *mapping,
+ struct writeback_control *wbc)
+{
+ struct inode *inode = mapping->host;
+ handle_t *handle = NULL;
+ int needed_blocks;
+ int ret = 0;
+ long to_write;
+ loff_t range_start = 0;
+
+ /*
+ * No pages to write? This is mainly a kludge to avoid starting
+ * a transaction for special inodes like journal inode on last iput()
+ * because that could violate lock ordering on umount
+ */
+ if (!mapping->nrpages)
+ return 0;
+
+ /*
+	 * Estimate the worst case credits needed to write out
+	 * EXT4_MAX_WRITEBACK_PAGES pages
+ */
+ needed_blocks = EXT4_MAX_WRITEBACK_CREDITS;
+
+ to_write = wbc->nr_to_write;
+ if (!wbc->range_cyclic) {
+ /*
+		 * If range_cyclic is not set, force range_cont
+		 * and save the old range_start
+ */
+ wbc->range_cont = 1;
+ range_start = wbc->range_start;
+ }
+
+ while (!ret && to_write) {
+ /* start a new transaction*/
+ handle = ext4_journal_start(inode, needed_blocks);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ goto out_writepages;
+ }
+ if (ext4_should_order_data(inode)) {
+ /*
+ * With ordered mode we need to add
+ * the inode to the journal handle
+ * when we do block allocation.
+ */
+ ret = ext4_jbd2_file_inode(handle, inode);
+ if (ret) {
+ ext4_journal_stop(handle);
+ goto out_writepages;
+ }
+
+ }
+ /*
+		 * set the max dirty pages that can be written at a time
+ * to fit into the reserved transaction credits
+ */
+ if (wbc->nr_to_write > EXT4_MAX_WRITEBACK_PAGES)
+ wbc->nr_to_write = EXT4_MAX_WRITEBACK_PAGES;
+
+ to_write -= wbc->nr_to_write;
+ ret = mpage_da_writepages(mapping, wbc,
+ ext4_da_get_block_write);
+ ext4_journal_stop(handle);
+ if (wbc->nr_to_write) {
+ /*
+ * There is no more writeout needed
+			 * or we requested a non-blocking writeout
+ * and we found the device congested
+ */
+ to_write += wbc->nr_to_write;
+ break;
+ }
+ wbc->nr_to_write = to_write;
+ }
+
+out_writepages:
+ wbc->nr_to_write = to_write;
+ if (range_start)
+ wbc->range_start = range_start;
+ return ret;
+}
+
+static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned flags,
+ struct page **pagep, void **fsdata)
+{
+ int ret, retries = 0;
+ struct page *page;
+ pgoff_t index;
+ unsigned from, to;
+ struct inode *inode = mapping->host;
+ handle_t *handle;
+
+ index = pos >> PAGE_CACHE_SHIFT;
+ from = pos & (PAGE_CACHE_SIZE - 1);
+ to = from + len;
+
+retry:
+ /*
+ * With delayed allocation, we don't log the i_disksize update
+	 * if there is delayed block allocation. But we still need
+	 * to journal the i_disksize update if we write to the end
+	 * of a file which has an already mapped buffer.
+ */
+ handle = ext4_journal_start(inode, 1);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ goto out;
+ }
+
+ page = __grab_cache_page(mapping, index);
+ if (!page)
+ return -ENOMEM;
+ *pagep = page;
+
+ ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
+ ext4_da_get_block_prep);
+ if (ret < 0) {
+ unlock_page(page);
+ ext4_journal_stop(handle);
+ page_cache_release(page);
+ }
+
+ if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
+ goto retry;
+out:
+ return ret;
+}
+
+/*
+ * Check if we should update i_disksize
+ * when writing to the end of file without requiring block allocation
+ */
+static int ext4_da_should_update_i_disksize(struct page *page,
+ unsigned long offset)
+{
+ struct buffer_head *bh;
+ struct inode *inode = page->mapping->host;
+ unsigned int idx;
+ int i;
+
+ bh = page_buffers(page);
+ idx = offset >> inode->i_blkbits;
+
+	for (i = 0; i < idx; i++)
+ bh = bh->b_this_page;
+
+ if (!buffer_mapped(bh) || (buffer_delay(bh)))
+ return 0;
+ return 1;
+}
+
+static int ext4_da_write_end(struct file *file,
+ struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned copied,
+ struct page *page, void *fsdata)
+{
+ struct inode *inode = mapping->host;
+ int ret = 0, ret2;
+ handle_t *handle = ext4_journal_current_handle();
+ loff_t new_i_size;
+ unsigned long start, end;
+
+ start = pos & (PAGE_CACHE_SIZE - 1);
+	end = start + copied - 1;
+
+ /*
+ * generic_write_end() will run mark_inode_dirty() if i_size
+ * changes. So let's piggyback the i_disksize mark_inode_dirty
+ * into that.
+ */
+
+ new_i_size = pos + copied;
+ if (new_i_size > EXT4_I(inode)->i_disksize) {
+ if (ext4_da_should_update_i_disksize(page, end)) {
+ down_write(&EXT4_I(inode)->i_data_sem);
+ if (new_i_size > EXT4_I(inode)->i_disksize) {
+ /*
+ * Updating i_disksize when extending file
+ * without needing block allocation
+ */
+ if (ext4_should_order_data(inode))
+ ret = ext4_jbd2_file_inode(handle,
+ inode);
+
+ EXT4_I(inode)->i_disksize = new_i_size;
+ }
+ up_write(&EXT4_I(inode)->i_data_sem);
+ }
+ }
+ ret2 = generic_write_end(file, mapping, pos, len, copied,
+ page, fsdata);
+ copied = ret2;
+ if (ret2 < 0)
+ ret = ret2;
+ ret2 = ext4_journal_stop(handle);
+ if (!ret)
+ ret = ret2;
+
+ return ret ? ret : copied;
+}
+
+static void ext4_da_invalidatepage(struct page *page, unsigned long offset)
+{
+ /*
+ * Drop reserved blocks
+ */
+ BUG_ON(!PageLocked(page));
+ if (!page_has_buffers(page))
+ goto out;
+
+ ext4_da_page_release_reservation(page, offset);
+
+out:
+ ext4_invalidatepage(page, offset);
+
+ return;
+}
+
/*
* bmap() is special. It gets used by applications such as lilo and by
@@ -1418,6 +2409,16 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block)
journal_t *journal;
int err;
+ if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) &&
+ test_opt(inode->i_sb, DELALLOC)) {
+ /*
+ * With delalloc we want to sync the file
+ * so that we can make sure we allocate
+		 * blocks for the file
+ */
+ filemap_write_and_wait(mapping);
+ }
+
if (EXT4_I(inode)->i_state & EXT4_STATE_JDATA) {
/*
* This is a REALLY heavyweight approach, but the use of
@@ -1462,21 +2463,17 @@ static int bput_one(handle_t *handle, struct buffer_head *bh)
return 0;
}
-static int jbd2_journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh)
-{
- if (buffer_mapped(bh))
- return ext4_journal_dirty_data(handle, bh);
- return 0;
-}
-
/*
- * Note that we always start a transaction even if we're not journalling
- * data. This is to preserve ordering: any hole instantiation within
- * __block_write_full_page -> ext4_get_block() should be journalled
- * along with the data so we don't crash and then get metadata which
- * refers to old data.
+ * Note that we don't need to start a transaction unless we're journaling data
+ * because we should have holes filled from ext4_page_mkwrite(). We don't even
+ * need to add the inode to the transaction's list in ordered mode, because if
+ * we are writing back data added by write(), the inode is already there, and if
+ * we are writing back data modified via mmap(), no one guarantees in which
+ * transaction the data will hit the disk. In case we are journaling data, we
+ * cannot start a transaction directly because transaction start ranks above the
+ * page lock, so we have to do some magic.
*
- * In all journalling modes block_write_full_page() will start the I/O.
+ * In all journaling modes block_write_full_page() will start the I/O.
*
* Problem:
*
@@ -1518,105 +2515,103 @@ static int jbd2_journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh)
* disastrous. Any write() or metadata operation will sync the fs for
* us.
*
- * AKPM2: if all the page's buffers are mapped to disk and !data=journal,
- * we don't need to open a transaction here.
*/
-static int ext4_ordered_writepage(struct page *page,
+static int __ext4_normal_writepage(struct page *page,
struct writeback_control *wbc)
{
struct inode *inode = page->mapping->host;
- struct buffer_head *page_bufs;
- handle_t *handle = NULL;
- int ret = 0;
- int err;
-
- J_ASSERT(PageLocked(page));
-
- /*
- * We give up here if we're reentered, because it might be for a
- * different filesystem.
- */
- if (ext4_journal_current_handle())
- goto out_fail;
- handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode));
+ if (test_opt(inode->i_sb, NOBH))
+ return nobh_writepage(page,
+ ext4_normal_get_block_write, wbc);
+ else
+ return block_write_full_page(page,
+ ext4_normal_get_block_write,
+ wbc);
+}
- if (IS_ERR(handle)) {
- ret = PTR_ERR(handle);
- goto out_fail;
- }
+static int ext4_normal_writepage(struct page *page,
+ struct writeback_control *wbc)
+{
+ struct inode *inode = page->mapping->host;
+ loff_t size = i_size_read(inode);
+ loff_t len;
- if (!page_has_buffers(page)) {
- create_empty_buffers(page, inode->i_sb->s_blocksize,
- (1 << BH_Dirty)|(1 << BH_Uptodate));
+ J_ASSERT(PageLocked(page));
+ if (page->index == size >> PAGE_CACHE_SHIFT)
+ len = size & ~PAGE_CACHE_MASK;
+ else
+ len = PAGE_CACHE_SIZE;
+
+ if (page_has_buffers(page)) {
+		/* if the page has buffers they should all be mapped
+		 * and allocated. If there are no buffers attached
+ * to the page we know the page is dirty but it lost
+ * buffers. That means that at some moment in time
+ * after write_begin() / write_end() has been called
+ * all buffers have been clean and thus they must have been
+ * written at least once. So they are all mapped and we can
+ * happily proceed with mapping them and writing the page.
+ */
+ BUG_ON(walk_page_buffers(NULL, page_buffers(page), 0, len, NULL,
+ ext4_bh_unmapped_or_delay));
}
- page_bufs = page_buffers(page);
- walk_page_buffers(handle, page_bufs, 0,
- PAGE_CACHE_SIZE, NULL, bget_one);
-
- ret = block_write_full_page(page, ext4_get_block, wbc);
- /*
- * The page can become unlocked at any point now, and
- * truncate can then come in and change things. So we
- * can't touch *page from now on. But *page_bufs is
- * safe due to elevated refcount.
- */
+ if (!ext4_journal_current_handle())
+ return __ext4_normal_writepage(page, wbc);
- /*
- * And attach them to the current transaction. But only if
- * block_write_full_page() succeeded. Otherwise they are unmapped,
- * and generally junk.
- */
- if (ret == 0) {
- err = walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE,
- NULL, jbd2_journal_dirty_data_fn);
- if (!ret)
- ret = err;
- }
- walk_page_buffers(handle, page_bufs, 0,
- PAGE_CACHE_SIZE, NULL, bput_one);
- err = ext4_journal_stop(handle);
- if (!ret)
- ret = err;
- return ret;
-
-out_fail:
redirty_page_for_writepage(wbc, page);
unlock_page(page);
- return ret;
+ return 0;
}
-static int ext4_writeback_writepage(struct page *page,
+static int __ext4_journalled_writepage(struct page *page,
struct writeback_control *wbc)
{
- struct inode *inode = page->mapping->host;
+ struct address_space *mapping = page->mapping;
+ struct inode *inode = mapping->host;
+ struct buffer_head *page_bufs;
handle_t *handle = NULL;
int ret = 0;
int err;
- if (ext4_journal_current_handle())
- goto out_fail;
+ ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE,
+ ext4_normal_get_block_write);
+ if (ret != 0)
+ goto out_unlock;
+
+ page_bufs = page_buffers(page);
+ walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE, NULL,
+ bget_one);
+ /* As soon as we unlock the page, it can go away, but we have
+ * references to buffers so we are safe */
+ unlock_page(page);
handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode));
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
- goto out_fail;
+ goto out;
}
- if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode))
- ret = nobh_writepage(page, ext4_get_block, wbc);
- else
- ret = block_write_full_page(page, ext4_get_block, wbc);
+ ret = walk_page_buffers(handle, page_bufs, 0,
+ PAGE_CACHE_SIZE, NULL, do_journal_get_write_access);
+ err = walk_page_buffers(handle, page_bufs, 0,
+ PAGE_CACHE_SIZE, NULL, write_end_fn);
+ if (ret == 0)
+ ret = err;
err = ext4_journal_stop(handle);
if (!ret)
ret = err;
- return ret;
-out_fail:
- redirty_page_for_writepage(wbc, page);
+ walk_page_buffers(handle, page_bufs, 0,
+ PAGE_CACHE_SIZE, NULL, bput_one);
+ EXT4_I(inode)->i_state |= EXT4_STATE_JDATA;
+ goto out;
+
+out_unlock:
unlock_page(page);
+out:
return ret;
}
@@ -1624,59 +2619,53 @@ static int ext4_journalled_writepage(struct page *page,
struct writeback_control *wbc)
{
struct inode *inode = page->mapping->host;
- handle_t *handle = NULL;
- int ret = 0;
- int err;
+ loff_t size = i_size_read(inode);
+ loff_t len;
- if (ext4_journal_current_handle())
- goto no_write;
+ J_ASSERT(PageLocked(page));
+ if (page->index == size >> PAGE_CACHE_SHIFT)
+ len = size & ~PAGE_CACHE_MASK;
+ else
+ len = PAGE_CACHE_SIZE;
+
+ if (page_has_buffers(page)) {
+		/* if the page has buffers they should all be mapped
+		 * and allocated. If there are no buffers attached
+ * to the page we know the page is dirty but it lost
+ * buffers. That means that at some moment in time
+ * after write_begin() / write_end() has been called
+ * all buffers have been clean and thus they must have been
+ * written at least once. So they are all mapped and we can
+ * happily proceed with mapping them and writing the page.
+ */
+ BUG_ON(walk_page_buffers(NULL, page_buffers(page), 0, len, NULL,
+ ext4_bh_unmapped_or_delay));
+ }
- handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode));
- if (IS_ERR(handle)) {
- ret = PTR_ERR(handle);
+ if (ext4_journal_current_handle())
goto no_write;
- }
- if (!page_has_buffers(page) || PageChecked(page)) {
+ if (PageChecked(page)) {
/*
* It's mmapped pagecache. Add buffers and journal it. There
* doesn't seem much point in redirtying the page here.
*/
ClearPageChecked(page);
- ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE,
- ext4_get_block);
- if (ret != 0) {
- ext4_journal_stop(handle);
- goto out_unlock;
- }
- ret = walk_page_buffers(handle, page_buffers(page), 0,
- PAGE_CACHE_SIZE, NULL, do_journal_get_write_access);
-
- err = walk_page_buffers(handle, page_buffers(page), 0,
- PAGE_CACHE_SIZE, NULL, write_end_fn);
- if (ret == 0)
- ret = err;
- EXT4_I(inode)->i_state |= EXT4_STATE_JDATA;
- unlock_page(page);
+ return __ext4_journalled_writepage(page, wbc);
} else {
/*
* It may be a page full of checkpoint-mode buffers. We don't
* really know unless we go poke around in the buffer_heads.
* But block_write_full_page will do the right thing.
*/
- ret = block_write_full_page(page, ext4_get_block, wbc);
+ return block_write_full_page(page,
+ ext4_normal_get_block_write,
+ wbc);
}
- err = ext4_journal_stop(handle);
- if (!ret)
- ret = err;
-out:
- return ret;
-
no_write:
redirty_page_for_writepage(wbc, page);
-out_unlock:
unlock_page(page);
- goto out;
+ return 0;
}
static int ext4_readpage(struct file *file, struct page *page)
@@ -1819,7 +2808,7 @@ static int ext4_journalled_set_page_dirty(struct page *page)
static const struct address_space_operations ext4_ordered_aops = {
.readpage = ext4_readpage,
.readpages = ext4_readpages,
- .writepage = ext4_ordered_writepage,
+ .writepage = ext4_normal_writepage,
.sync_page = block_sync_page,
.write_begin = ext4_write_begin,
.write_end = ext4_ordered_write_end,
@@ -1833,7 +2822,7 @@ static const struct address_space_operations ext4_ordered_aops = {
static const struct address_space_operations ext4_writeback_aops = {
.readpage = ext4_readpage,
.readpages = ext4_readpages,
- .writepage = ext4_writeback_writepage,
+ .writepage = ext4_normal_writepage,
.sync_page = block_sync_page,
.write_begin = ext4_write_begin,
.write_end = ext4_writeback_write_end,
@@ -1857,10 +2846,31 @@ static const struct address_space_operations ext4_journalled_aops = {
.releasepage = ext4_releasepage,
};
+static const struct address_space_operations ext4_da_aops = {
+ .readpage = ext4_readpage,
+ .readpages = ext4_readpages,
+ .writepage = ext4_da_writepage,
+ .writepages = ext4_da_writepages,
+ .sync_page = block_sync_page,
+ .write_begin = ext4_da_write_begin,
+ .write_end = ext4_da_write_end,
+ .bmap = ext4_bmap,
+ .invalidatepage = ext4_da_invalidatepage,
+ .releasepage = ext4_releasepage,
+ .direct_IO = ext4_direct_IO,
+ .migratepage = buffer_migrate_page,
+};
+
void ext4_set_aops(struct inode *inode)
{
- if (ext4_should_order_data(inode))
+ if (ext4_should_order_data(inode) &&
+ test_opt(inode->i_sb, DELALLOC))
+ inode->i_mapping->a_ops = &ext4_da_aops;
+ else if (ext4_should_order_data(inode))
inode->i_mapping->a_ops = &ext4_ordered_aops;
+ else if (ext4_should_writeback_data(inode) &&
+ test_opt(inode->i_sb, DELALLOC))
+ inode->i_mapping->a_ops = &ext4_da_aops;
else if (ext4_should_writeback_data(inode))
inode->i_mapping->a_ops = &ext4_writeback_aops;
else
@@ -1873,7 +2883,7 @@ void ext4_set_aops(struct inode *inode)
* This required during truncate. We need to physically zero the tail end
* of that block so it doesn't yield old data if the file is later grown.
*/
-int ext4_block_truncate_page(handle_t *handle, struct page *page,
+int ext4_block_truncate_page(handle_t *handle,
struct address_space *mapping, loff_t from)
{
ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT;
@@ -1882,8 +2892,13 @@ int ext4_block_truncate_page(handle_t *handle, struct page *page,
ext4_lblk_t iblock;
struct inode *inode = mapping->host;
struct buffer_head *bh;
+ struct page *page;
int err = 0;
+ page = grab_cache_page(mapping, from >> PAGE_CACHE_SHIFT);
+ if (!page)
+ return -EINVAL;
+
blocksize = inode->i_sb->s_blocksize;
length = blocksize - (offset & (blocksize - 1));
iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
@@ -1956,7 +2971,7 @@ int ext4_block_truncate_page(handle_t *handle, struct page *page,
err = ext4_journal_dirty_metadata(handle, bh);
} else {
if (ext4_should_order_data(inode))
- err = ext4_journal_dirty_data(handle, bh);
+ err = ext4_jbd2_file_inode(handle, inode);
mark_buffer_dirty(bh);
}
@@ -2179,7 +3194,21 @@ static void ext4_free_data(handle_t *handle, struct inode *inode,
if (this_bh) {
BUFFER_TRACE(this_bh, "call ext4_journal_dirty_metadata");
- ext4_journal_dirty_metadata(handle, this_bh);
+
+ /*
+ * The buffer head should have an attached journal head at this
+ * point. However, if the data is corrupted and an indirect
+ * block pointed to itself, it would have been detached when
+ * the block was cleared. Check for this instead of OOPSing.
+ */
+ if (bh2jh(this_bh))
+ ext4_journal_dirty_metadata(handle, this_bh);
+ else
+ ext4_error(inode->i_sb, __func__,
+ "circular indirect block detected, "
+ "inode=%lu, block=%llu",
+ inode->i_ino,
+ (unsigned long long) this_bh->b_blocknr);
}
}
@@ -2305,6 +3334,19 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
}
}
+int ext4_can_truncate(struct inode *inode)
+{
+ if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
+ return 0;
+ if (S_ISREG(inode->i_mode))
+ return 1;
+ if (S_ISDIR(inode->i_mode))
+ return 1;
+ if (S_ISLNK(inode->i_mode))
+ return !ext4_inode_is_fast_symlink(inode);
+ return 0;
+}
+
/*
* ext4_truncate()
*
@@ -2347,51 +3389,25 @@ void ext4_truncate(struct inode *inode)
int n;
ext4_lblk_t last_block;
unsigned blocksize = inode->i_sb->s_blocksize;
- struct page *page;
- if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
- S_ISLNK(inode->i_mode)))
- return;
- if (ext4_inode_is_fast_symlink(inode))
- return;
- if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
+ if (!ext4_can_truncate(inode))
return;
- /*
- * We have to lock the EOF page here, because lock_page() nests
- * outside jbd2_journal_start().
- */
- if ((inode->i_size & (blocksize - 1)) == 0) {
- /* Block boundary? Nothing to do */
- page = NULL;
- } else {
- page = grab_cache_page(mapping,
- inode->i_size >> PAGE_CACHE_SHIFT);
- if (!page)
- return;
- }
-
if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
- ext4_ext_truncate(inode, page);
+ ext4_ext_truncate(inode);
return;
}
handle = start_transaction(inode);
- if (IS_ERR(handle)) {
- if (page) {
- clear_highpage(page);
- flush_dcache_page(page);
- unlock_page(page);
- page_cache_release(page);
- }
+ if (IS_ERR(handle))
return; /* AKPM: return what? */
- }
last_block = (inode->i_size + blocksize-1)
>> EXT4_BLOCK_SIZE_BITS(inode->i_sb);
- if (page)
- ext4_block_truncate_page(handle, page, mapping, inode->i_size);
+ if (inode->i_size & (blocksize - 1))
+ if (ext4_block_truncate_page(handle, mapping, inode->i_size))
+ goto out_stop;
n = ext4_block_to_path(inode, last_block, offsets, NULL);
if (n == 0)
@@ -2410,6 +3426,11 @@ void ext4_truncate(struct inode *inode)
goto out_stop;
/*
+ * From here we block out all ext4_get_block() callers who want to
+ * modify the block allocation tree.
+ */
+ down_write(&ei->i_data_sem);
+ /*
* The orphan list entry will now protect us from any crash which
* occurs before the truncate completes, so it is now safe to propagate
* the new, shorter inode size (held for now in i_size) into the
@@ -2418,12 +3439,6 @@ void ext4_truncate(struct inode *inode)
*/
ei->i_disksize = inode->i_size;
- /*
- * From here we block out all ext4_get_block() callers who want to
- * modify the block allocation tree.
- */
- down_write(&ei->i_data_sem);
-
if (n == 1) { /* direct blocks */
ext4_free_data(handle, inode, NULL, i_data+offsets[0],
i_data + EXT4_NDIR_BLOCKS);
@@ -3107,7 +4122,14 @@ int ext4_write_inode(struct inode *inode, int wait)
* be freed, so we have a strong guarantee that no future commit will
* leave these blocks visible to the user.)
*
- * Called with inode->sem down.
+ * Another thing we have to ensure is that if we are in ordered mode
+ * and the inode is still attached to the committing transaction, we
+ * must start writeout of all the dirty pages which are being truncated.
+ * This way we are sure that all the data written in the previous
+ * transaction are already on disk (truncate waits for pages under
+ * writeback).
+ *
+ * Called with inode->i_mutex down.
*/
int ext4_setattr(struct dentry *dentry, struct iattr *attr)
{
@@ -3173,6 +4195,22 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
if (!error)
error = rc;
ext4_journal_stop(handle);
+
+ if (ext4_should_order_data(inode)) {
+ error = ext4_begin_ordered_truncate(inode,
+ attr->ia_size);
+ if (error) {
+ /* Do as much error cleanup as possible */
+ handle = ext4_journal_start(inode, 3);
+ if (IS_ERR(handle)) {
+ ext4_orphan_del(NULL, inode);
+ goto err_out;
+ }
+ ext4_orphan_del(handle, inode);
+ ext4_journal_stop(handle);
+ goto err_out;
+ }
+ }
}
rc = inode_setattr(inode, attr);
@@ -3193,6 +4231,32 @@ err_out:
return error;
}
+int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,
+ struct kstat *stat)
+{
+ struct inode *inode;
+ unsigned long delalloc_blocks;
+
+ inode = dentry->d_inode;
+ generic_fillattr(inode, stat);
+
+ /*
+ * We can't update i_blocks if the block allocation is delayed;
+ * otherwise, in the case of a system crash before the real block
+ * allocation is done, we would have i_blocks inconsistent with the
+ * on-disk file blocks.
+ * We always keep i_blocks updated together with the real
+ * allocation. But to avoid confusing userspace, stat will return
+ * a block count that includes the delayed allocation blocks for
+ * this file.
+ */
+ spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
+ delalloc_blocks = EXT4_I(inode)->i_reserved_data_blocks;
+ spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
+
+ stat->blocks += (delalloc_blocks << inode->i_sb->s_blocksize_bits)>>9;
+ return 0;
+}
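
The arithmetic in the stat->blocks update above folds blocks that are only reserved (delayed allocation) into the 512-byte sector count that stat() reports. A minimal userspace sketch of that conversion, with hypothetical values for blocksize_bits and the reserved-block count (not kernel code):

#include <stdio.h>

int main(void)
{
	unsigned long delalloc_blocks = 3;	/* hypothetical reserved blocks */
	unsigned int blocksize_bits = 12;	/* 4 KiB filesystem blocks */
	unsigned long long stat_blocks = 8;	/* 512-byte sectors already on disk */

	/* blocks -> bytes -> 512-byte sectors, as in the stat->blocks update */
	stat_blocks += (delalloc_blocks << blocksize_bits) >> 9;

	printf("st_blocks = %llu\n", stat_blocks);	/* 8 + 3*4096/512 = 32 */
	return 0;
}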
/*
* How many blocks doth make a writepage()?
@@ -3506,3 +4570,64 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
return err;
}
+
+static int ext4_bh_unmapped(handle_t *handle, struct buffer_head *bh)
+{
+ return !buffer_mapped(bh);
+}
+
+int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page)
+{
+ loff_t size;
+ unsigned long len;
+ int ret = -EINVAL;
+ struct file *file = vma->vm_file;
+ struct inode *inode = file->f_path.dentry->d_inode;
+ struct address_space *mapping = inode->i_mapping;
+
+ /*
+ * Get i_alloc_sem to stop truncates messing with the inode. We cannot
+ * get i_mutex because we are already holding mmap_sem.
+ */
+ down_read(&inode->i_alloc_sem);
+ size = i_size_read(inode);
+ if (page->mapping != mapping || size <= page_offset(page)
+ || !PageUptodate(page)) {
+ /* page got truncated from under us? */
+ goto out_unlock;
+ }
+ ret = 0;
+ if (PageMappedToDisk(page))
+ goto out_unlock;
+
+ if (page->index == size >> PAGE_CACHE_SHIFT)
+ len = size & ~PAGE_CACHE_MASK;
+ else
+ len = PAGE_CACHE_SIZE;
+
+ if (page_has_buffers(page)) {
+ /* return if we have all the buffers mapped */
+ if (!walk_page_buffers(NULL, page_buffers(page), 0, len, NULL,
+ ext4_bh_unmapped))
+ goto out_unlock;
+ }
+ /*
+ * OK, we need to fill the hole... Do write_begin/write_end
+ * to do the block allocation/reservation. We are not holding
+ * inode->i_mutex here; that allows parallel write_begin and
+ * write_end calls. lock_page() prevents this from happening
+ * on the same page, though.
+ */
+ ret = mapping->a_ops->write_begin(file, mapping, page_offset(page),
+ len, AOP_FLAG_UNINTERRUPTIBLE, &page, NULL);
+ if (ret < 0)
+ goto out_unlock;
+ ret = mapping->a_ops->write_end(file, mapping, page_offset(page),
+ len, len, page, NULL);
+ if (ret < 0)
+ goto out_unlock;
+ ret = 0;
+out_unlock:
+ up_read(&inode->i_alloc_sem);
+ return ret;
+}
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index c9900aade150..8d141a25bbee 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -381,22 +381,28 @@ static inline void mb_clear_bit_atomic(spinlock_t *lock, int bit, void *addr)
static inline int mb_find_next_zero_bit(void *addr, int max, int start)
{
- int fix = 0;
+ int fix = 0, ret, tmpmax;
addr = mb_correct_addr_and_bit(&fix, addr);
- max += fix;
+ tmpmax = max + fix;
start += fix;
- return ext4_find_next_zero_bit(addr, max, start) - fix;
+ ret = ext4_find_next_zero_bit(addr, tmpmax, start) - fix;
+ if (ret > max)
+ return max;
+ return ret;
}
static inline int mb_find_next_bit(void *addr, int max, int start)
{
- int fix = 0;
+ int fix = 0, ret, tmpmax;
addr = mb_correct_addr_and_bit(&fix, addr);
- max += fix;
+ tmpmax = max + fix;
start += fix;
- return ext4_find_next_bit(addr, max, start) - fix;
+ ret = ext4_find_next_bit(addr, tmpmax, start) - fix;
+ if (ret > max)
+ return max;
+ return ret;
}
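
The clamp added above guards against bit-scan helpers that can report a position past the requested limit once the fix offset widens the range. A userspace sketch of the idea, assuming a hypothetical word-granular scanner that may overshoot (this is not the actual ext4_find_next_zero_bit implementation):

#include <stdio.h>

/* Hypothetical word-granular scanner that may return a bit >= size. */
static int find_next_zero_bit_wordy(const unsigned long *map, int size, int start)
{
	int words = (size + 63) / 64;
	int i;

	for (i = start; i < words * 64; i++)
		if (!((map[i / 64] >> (i % 64)) & 1UL))
			return i;		/* may overshoot 'size' */
	return size;
}

static int mb_find_next_zero_bit_sim(const unsigned long *map, int max, int start)
{
	int ret = find_next_zero_bit_wordy(map, max, start);

	return ret > max ? max : ret;		/* the fix: never report past max */
}

int main(void)
{
	unsigned long map[1] = { 0xfffffUL };	/* bits 0..19 set, bit 20 clear */

	/* The caller only cares about bits 0..15; the raw scan finds bit 20. */
	printf("raw=%d clamped=%d\n",
	       find_next_zero_bit_wordy(map, 16, 0),
	       mb_find_next_zero_bit_sim(map, 16, 0));	/* raw=20 clamped=16 */
	return 0;
}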
static void *mb_find_buddy(struct ext4_buddy *e4b, int order, int *max)
@@ -803,6 +809,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
if (!buffer_uptodate(bh[i]))
goto out;
+ err = 0;
first_block = page->index * blocks_per_page;
for (i = 0; i < blocks_per_page; i++) {
int group;
@@ -883,6 +890,7 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
int pnum;
int poff;
struct page *page;
+ int ret;
mb_debug("load group %lu\n", group);
@@ -914,15 +922,21 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
if (page) {
BUG_ON(page->mapping != inode->i_mapping);
if (!PageUptodate(page)) {
- ext4_mb_init_cache(page, NULL);
+ ret = ext4_mb_init_cache(page, NULL);
+ if (ret) {
+ unlock_page(page);
+ goto err;
+ }
mb_cmp_bitmaps(e4b, page_address(page) +
(poff * sb->s_blocksize));
}
unlock_page(page);
}
}
- if (page == NULL || !PageUptodate(page))
+ if (page == NULL || !PageUptodate(page)) {
+ ret = -EIO;
goto err;
+ }
e4b->bd_bitmap_page = page;
e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize);
mark_page_accessed(page);
@@ -938,14 +952,20 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
if (page) {
BUG_ON(page->mapping != inode->i_mapping);
- if (!PageUptodate(page))
- ext4_mb_init_cache(page, e4b->bd_bitmap);
-
+ if (!PageUptodate(page)) {
+ ret = ext4_mb_init_cache(page, e4b->bd_bitmap);
+ if (ret) {
+ unlock_page(page);
+ goto err;
+ }
+ }
unlock_page(page);
}
}
- if (page == NULL || !PageUptodate(page))
+ if (page == NULL || !PageUptodate(page)) {
+ ret = -EIO;
goto err;
+ }
e4b->bd_buddy_page = page;
e4b->bd_buddy = page_address(page) + (poff * sb->s_blocksize);
mark_page_accessed(page);
@@ -962,7 +982,7 @@ err:
page_cache_release(e4b->bd_buddy_page);
e4b->bd_buddy = NULL;
e4b->bd_bitmap = NULL;
- return -EIO;
+ return ret;
}
static void ext4_mb_release_desc(struct ext4_buddy *e4b)
@@ -1031,7 +1051,7 @@ static void mb_set_bits(spinlock_t *lock, void *bm, int cur, int len)
}
}
-static int mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
+static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
int first, int count)
{
int block = 0;
@@ -1071,11 +1091,12 @@ static int mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
blocknr += block;
blocknr +=
le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
-
+ ext4_unlock_group(sb, e4b->bd_group);
ext4_error(sb, __func__, "double-free of inode"
" %lu's block %llu(bit %u in group %lu)\n",
inode ? inode->i_ino : 0, blocknr, block,
e4b->bd_group);
+ ext4_lock_group(sb, e4b->bd_group);
}
mb_clear_bit(block, EXT4_MB_BITMAP(e4b));
e4b->bd_info->bb_counters[order]++;
@@ -1113,8 +1134,6 @@ static int mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
} while (1);
}
mb_check_buddy(e4b);
-
- return 0;
}
static int mb_find_extent(struct ext4_buddy *e4b, int order, int block,
@@ -1730,10 +1749,6 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
ac->ac_g_ex.fe_start = sbi->s_mb_last_start;
spin_unlock(&sbi->s_md_lock);
}
-
- /* searching for the right group start from the goal value specified */
- group = ac->ac_g_ex.fe_group;
-
/* Let's just scan groups to find more-less suitable blocks */
cr = ac->ac_2order ? 0 : 1;
/*
@@ -1743,6 +1758,12 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
repeat:
for (; cr < 4 && ac->ac_status == AC_STATUS_CONTINUE; cr++) {
ac->ac_criteria = cr;
+ /*
+ * searching for the right group start
+ * from the goal value specified
+ */
+ group = ac->ac_g_ex.fe_group;
+
for (i = 0; i < EXT4_SB(sb)->s_groups_count; group++, i++) {
struct ext4_group_info *grp;
struct ext4_group_desc *desc;
@@ -1963,6 +1984,8 @@ static int ext4_mb_seq_history_open(struct inode *inode, struct file *file)
int rc;
int size;
+ if (unlikely(sbi->s_mb_history == NULL))
+ return -ENOMEM;
s = kmalloc(sizeof(*s), GFP_KERNEL);
if (s == NULL)
return -ENOMEM;
@@ -2165,9 +2188,7 @@ static void ext4_mb_history_init(struct super_block *sb)
sbi->s_mb_history_cur = 0;
spin_lock_init(&sbi->s_mb_history_lock);
i = sbi->s_mb_history_max * sizeof(struct ext4_mb_history);
- sbi->s_mb_history = kmalloc(i, GFP_KERNEL);
- if (likely(sbi->s_mb_history != NULL))
- memset(sbi->s_mb_history, 0, i);
+ sbi->s_mb_history = kzalloc(i, GFP_KERNEL);
/* if we can't allocate history, then we simply won't use it */
}
@@ -2215,21 +2236,192 @@ ext4_mb_store_history(struct ext4_allocation_context *ac)
#define ext4_mb_history_init(sb)
#endif
+
+/* Create and initialize ext4_group_info data for the given group. */
+int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
+ struct ext4_group_desc *desc)
+{
+ int i, len;
+ int metalen = 0;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_group_info **meta_group_info;
+
+ /*
+ * First check if this group is the first of a reserved block.
+ * If so, we have to allocate a new table of pointers to
+ * ext4_group_info structures.
+ */
+ if (group % EXT4_DESC_PER_BLOCK(sb) == 0) {
+ metalen = sizeof(*meta_group_info) <<
+ EXT4_DESC_PER_BLOCK_BITS(sb);
+ meta_group_info = kmalloc(metalen, GFP_KERNEL);
+ if (meta_group_info == NULL) {
+ printk(KERN_ERR "EXT4-fs: can't allocate mem for a "
+ "buddy group\n");
+ goto exit_meta_group_info;
+ }
+ sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)] =
+ meta_group_info;
+ }
+
+ /*
+ * Calculate the needed size. If the bb_counters size changes,
+ * don't forget about ext4_mb_generate_buddy().
+ */
+ len = offsetof(typeof(**meta_group_info),
+ bb_counters[sb->s_blocksize_bits + 2]);
+
+ meta_group_info =
+ sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)];
+ i = group & (EXT4_DESC_PER_BLOCK(sb) - 1);
+
+ meta_group_info[i] = kzalloc(len, GFP_KERNEL);
+ if (meta_group_info[i] == NULL) {
+ printk(KERN_ERR "EXT4-fs: can't allocate buddy mem\n");
+ goto exit_group_info;
+ }
+ set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT,
+ &(meta_group_info[i]->bb_state));
+
+ /*
+ * initialize bb_free to be able to skip
+ * empty groups without initialization
+ */
+ if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
+ meta_group_info[i]->bb_free =
+ ext4_free_blocks_after_init(sb, group, desc);
+ } else {
+ meta_group_info[i]->bb_free =
+ le16_to_cpu(desc->bg_free_blocks_count);
+ }
+
+ INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list);
+
+#ifdef DOUBLE_CHECK
+ {
+ struct buffer_head *bh;
+ meta_group_info[i]->bb_bitmap =
+ kmalloc(sb->s_blocksize, GFP_KERNEL);
+ BUG_ON(meta_group_info[i]->bb_bitmap == NULL);
+ bh = ext4_read_block_bitmap(sb, group);
+ BUG_ON(bh == NULL);
+ memcpy(meta_group_info[i]->bb_bitmap, bh->b_data,
+ sb->s_blocksize);
+ put_bh(bh);
+ }
+#endif
+
+ return 0;
+
+exit_group_info:
+ /* If a meta_group_info table has been allocated, release it now */
+ if (group % EXT4_DESC_PER_BLOCK(sb) == 0)
+ kfree(sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)]);
+exit_meta_group_info:
+ return -ENOMEM;
+} /* ext4_mb_add_groupinfo */
+
+/*
+ * Add a group to the existing groups.
+ * This function is used for online resize
+ */
+int ext4_mb_add_more_groupinfo(struct super_block *sb, ext4_group_t group,
+ struct ext4_group_desc *desc)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct inode *inode = sbi->s_buddy_cache;
+ int blocks_per_page;
+ int block;
+ int pnum;
+ struct page *page;
+ int err;
+
+ /* Add group based on group descriptor*/
+ err = ext4_mb_add_groupinfo(sb, group, desc);
+ if (err)
+ return err;
+
+ /*
+ * Mark the pages caching dynamic mb_alloc data (buddy and bitmap)
+ * as not up to date so that they will be re-initialized during the
+ * next call to ext4_mb_load_buddy.
+ */
+
+ /* Set buddy page as not up to date */
+ blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
+ block = group * 2;
+ pnum = block / blocks_per_page;
+ page = find_get_page(inode->i_mapping, pnum);
+ if (page != NULL) {
+ ClearPageUptodate(page);
+ page_cache_release(page);
+ }
+
+ /* Set bitmap page as not up to date */
+ block++;
+ pnum = block / blocks_per_page;
+ page = find_get_page(inode->i_mapping, pnum);
+ if (page != NULL) {
+ ClearPageUptodate(page);
+ page_cache_release(page);
+ }
+
+ return 0;
+}
+
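
For reference, the page numbers being invalidated above come from the buddy-cache layout: each group owns two logical blocks in the buddy cache inode (buddy data at 2*group, bitmap at 2*group+1), and the page index is block / blocks_per_page. A small userspace sketch with hypothetical block and page sizes:

#include <stdio.h>

int main(void)
{
	int page_size = 4096, blocksize = 1024;		/* hypothetical sizes */
	int blocks_per_page = page_size / blocksize;	/* 4 */
	unsigned group = 5;				/* hypothetical group */
	int buddy_block = group * 2;			/* 10 */
	int bitmap_block = buddy_block + 1;		/* 11 */

	printf("buddy page %d, bitmap page %d\n",
	       buddy_block / blocks_per_page,		/* 2 */
	       bitmap_block / blocks_per_page);		/* 2 */
	return 0;
}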
+/*
+ * Update an existing group.
+ * This function is used for online resize
+ */
+void ext4_mb_update_group_info(struct ext4_group_info *grp, ext4_grpblk_t add)
+{
+ grp->bb_free += add;
+}
+
static int ext4_mb_init_backend(struct super_block *sb)
{
ext4_group_t i;
- int j, len, metalen;
+ int metalen;
struct ext4_sb_info *sbi = EXT4_SB(sb);
- int num_meta_group_infos =
- (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) >>
- EXT4_DESC_PER_BLOCK_BITS(sb);
+ struct ext4_super_block *es = sbi->s_es;
+ int num_meta_group_infos;
+ int num_meta_group_infos_max;
+ int array_size;
struct ext4_group_info **meta_group_info;
+ struct ext4_group_desc *desc;
+
+ /* This is the number of blocks used by GDT */
+ num_meta_group_infos = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) -
+ 1) >> EXT4_DESC_PER_BLOCK_BITS(sb);
+
+ /*
+ * This is the total number of blocks used by the GDT, including
+ * the blocks reserved for GDT growth.
+ * The s_group_info array is allocated with this value to allow
+ * a clean online resize without complex pointer manipulation.
+ * The drawback is the memory left unused when no resize occurs,
+ * but it is very low in terms of pages (see the comments below).
+ * This needs to be handled properly when META_BG resizing is
+ * allowed.
+ */
+ num_meta_group_infos_max = num_meta_group_infos +
+ le16_to_cpu(es->s_reserved_gdt_blocks);
+ /*
+ * array_size is the size of the s_group_info array. We round it
+ * up to the next power of two because kmalloc rounds allocations
+ * the same way internally, so the extra memory comes for free
+ * (e.g. it may be used for a META_BG resize).
+ */
+ array_size = 1;
+ while (array_size < sizeof(*sbi->s_group_info) *
+ num_meta_group_infos_max)
+ array_size = array_size << 1;
/* An 8TB filesystem with 64-bit pointers requires a 4096 byte
* kmalloc. A 128kb malloc should suffice for a 256TB filesystem.
* So a two level scheme suffices for now. */
- sbi->s_group_info = kmalloc(sizeof(*sbi->s_group_info) *
- num_meta_group_infos, GFP_KERNEL);
+ sbi->s_group_info = kmalloc(array_size, GFP_KERNEL);
if (sbi->s_group_info == NULL) {
printk(KERN_ERR "EXT4-fs: can't allocate buddy meta group\n");
return -ENOMEM;
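
A quick illustration of the power-of-two rounding described in the comment above; the numbers are hypothetical and simply show how kmalloc-style bucket rounding leaves slack that online resize can later use (userspace sketch, not kernel code):

#include <stdio.h>

int main(void)
{
	size_t ptr_size = sizeof(void *);		/* one slot per meta group */
	size_t num_meta_group_infos_max = 70;		/* hypothetical count */
	size_t needed = ptr_size * num_meta_group_infos_max;
	size_t array_size = 1;

	while (array_size < needed)
		array_size <<= 1;			/* round up to a power of two */

	printf("needed=%zu bytes, allocated=%zu bytes\n", needed, array_size);
	/* On a 64-bit build: needed=560, allocated=1024 */
	return 0;
}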
@@ -2256,63 +2448,15 @@ static int ext4_mb_init_backend(struct super_block *sb)
sbi->s_group_info[i] = meta_group_info;
}
- /*
- * calculate needed size. if change bb_counters size,
- * don't forget about ext4_mb_generate_buddy()
- */
- len = sizeof(struct ext4_group_info);
- len += sizeof(unsigned short) * (sb->s_blocksize_bits + 2);
for (i = 0; i < sbi->s_groups_count; i++) {
- struct ext4_group_desc *desc;
-
- meta_group_info =
- sbi->s_group_info[i >> EXT4_DESC_PER_BLOCK_BITS(sb)];
- j = i & (EXT4_DESC_PER_BLOCK(sb) - 1);
-
- meta_group_info[j] = kzalloc(len, GFP_KERNEL);
- if (meta_group_info[j] == NULL) {
- printk(KERN_ERR "EXT4-fs: can't allocate buddy mem\n");
- goto err_freebuddy;
- }
desc = ext4_get_group_desc(sb, i, NULL);
if (desc == NULL) {
printk(KERN_ERR
"EXT4-fs: can't read descriptor %lu\n", i);
- i++;
goto err_freebuddy;
}
- memset(meta_group_info[j], 0, len);
- set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT,
- &(meta_group_info[j]->bb_state));
-
- /*
- * initialize bb_free to be able to skip
- * empty groups without initialization
- */
- if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
- meta_group_info[j]->bb_free =
- ext4_free_blocks_after_init(sb, i, desc);
- } else {
- meta_group_info[j]->bb_free =
- le16_to_cpu(desc->bg_free_blocks_count);
- }
-
- INIT_LIST_HEAD(&meta_group_info[j]->bb_prealloc_list);
-
-#ifdef DOUBLE_CHECK
- {
- struct buffer_head *bh;
- meta_group_info[j]->bb_bitmap =
- kmalloc(sb->s_blocksize, GFP_KERNEL);
- BUG_ON(meta_group_info[j]->bb_bitmap == NULL);
- bh = read_block_bitmap(sb, i);
- BUG_ON(bh == NULL);
- memcpy(meta_group_info[j]->bb_bitmap, bh->b_data,
- sb->s_blocksize);
- put_bh(bh);
- }
-#endif
-
+ if (ext4_mb_add_groupinfo(sb, i, desc) != 0)
+ goto err_freebuddy;
}
return 0;
@@ -2336,6 +2480,7 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
unsigned i;
unsigned offset;
unsigned max;
+ int ret;
if (!test_opt(sb, MBALLOC))
return 0;
@@ -2370,12 +2515,12 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
} while (i <= sb->s_blocksize_bits + 1);
/* init file for buddy data */
- i = ext4_mb_init_backend(sb);
- if (i) {
+ ret = ext4_mb_init_backend(sb);
+ if (ret != 0) {
clear_opt(sbi->s_mount_opt, MBALLOC);
kfree(sbi->s_mb_offsets);
kfree(sbi->s_mb_maxs);
- return i;
+ return ret;
}
spin_lock_init(&sbi->s_md_lock);
@@ -2548,8 +2693,7 @@ ext4_mb_free_committed_blocks(struct super_block *sb)
ext4_lock_group(sb, md->group);
for (i = 0; i < md->num; i++) {
mb_debug(" %u", md->blocks[i]);
- err = mb_free_blocks(NULL, &e4b, md->blocks[i], 1);
- BUG_ON(err != 0);
+ mb_free_blocks(NULL, &e4b, md->blocks[i], 1);
}
mb_debug("\n");
ext4_unlock_group(sb, md->group);
@@ -2575,25 +2719,24 @@ ext4_mb_free_committed_blocks(struct super_block *sb)
-#define MB_PROC_VALUE_READ(name) \
-static int ext4_mb_read_##name(char *page, char **start, \
- off_t off, int count, int *eof, void *data) \
+#define MB_PROC_FOPS(name) \
+static int ext4_mb_##name##_proc_show(struct seq_file *m, void *v) \
{ \
- struct ext4_sb_info *sbi = data; \
- int len; \
- *eof = 1; \
- if (off != 0) \
- return 0; \
- len = sprintf(page, "%ld\n", sbi->s_mb_##name); \
- *start = page; \
- return len; \
-}
-
-#define MB_PROC_VALUE_WRITE(name) \
-static int ext4_mb_write_##name(struct file *file, \
- const char __user *buf, unsigned long cnt, void *data) \
+ struct ext4_sb_info *sbi = m->private; \
+ \
+ seq_printf(m, "%ld\n", sbi->s_mb_##name); \
+ return 0; \
+} \
+ \
+static int ext4_mb_##name##_proc_open(struct inode *inode, struct file *file)\
+{ \
+ return single_open(file, ext4_mb_##name##_proc_show, PDE(inode)->data);\
+} \
+ \
+static ssize_t ext4_mb_##name##_proc_write(struct file *file, \
+ const char __user *buf, size_t cnt, loff_t *ppos) \
{ \
- struct ext4_sb_info *sbi = data; \
+ struct ext4_sb_info *sbi = PDE(file->f_path.dentry->d_inode)->data;\
char str[32]; \
long value; \
if (cnt >= sizeof(str)) \
@@ -2605,31 +2748,32 @@ static int ext4_mb_write_##name(struct file *file, \
return -ERANGE; \
sbi->s_mb_##name = value; \
return cnt; \
-}
+} \
+ \
+static const struct file_operations ext4_mb_##name##_proc_fops = { \
+ .owner = THIS_MODULE, \
+ .open = ext4_mb_##name##_proc_open, \
+ .read = seq_read, \
+ .llseek = seq_lseek, \
+ .release = single_release, \
+ .write = ext4_mb_##name##_proc_write, \
+};
-MB_PROC_VALUE_READ(stats);
-MB_PROC_VALUE_WRITE(stats);
-MB_PROC_VALUE_READ(max_to_scan);
-MB_PROC_VALUE_WRITE(max_to_scan);
-MB_PROC_VALUE_READ(min_to_scan);
-MB_PROC_VALUE_WRITE(min_to_scan);
-MB_PROC_VALUE_READ(order2_reqs);
-MB_PROC_VALUE_WRITE(order2_reqs);
-MB_PROC_VALUE_READ(stream_request);
-MB_PROC_VALUE_WRITE(stream_request);
-MB_PROC_VALUE_READ(group_prealloc);
-MB_PROC_VALUE_WRITE(group_prealloc);
+MB_PROC_FOPS(stats);
+MB_PROC_FOPS(max_to_scan);
+MB_PROC_FOPS(min_to_scan);
+MB_PROC_FOPS(order2_reqs);
+MB_PROC_FOPS(stream_request);
+MB_PROC_FOPS(group_prealloc);
#define MB_PROC_HANDLER(name, var) \
do { \
- proc = create_proc_entry(name, mode, sbi->s_mb_proc); \
+ proc = proc_create_data(name, mode, sbi->s_mb_proc, \
+ &ext4_mb_##var##_proc_fops, sbi); \
if (proc == NULL) { \
printk(KERN_ERR "EXT4-fs: can't to create %s\n", name); \
goto err_out; \
} \
- proc->data = sbi; \
- proc->read_proc = ext4_mb_read_##var ; \
- proc->write_proc = ext4_mb_write_##var; \
} while (0)
static int ext4_mb_init_per_dev_proc(struct super_block *sb)
@@ -2639,6 +2783,10 @@ static int ext4_mb_init_per_dev_proc(struct super_block *sb)
struct proc_dir_entry *proc;
char devname[64];
+ if (proc_root_ext4 == NULL) {
+ sbi->s_mb_proc = NULL;
+ return -EINVAL;
+ }
bdevname(sb->s_bdev, devname);
sbi->s_mb_proc = proc_mkdir(devname, proc_root_ext4);
@@ -2747,7 +2895,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
err = -EIO;
- bitmap_bh = read_block_bitmap(sb, ac->ac_b_ex.fe_group);
+ bitmap_bh = ext4_read_block_bitmap(sb, ac->ac_b_ex.fe_group);
if (!bitmap_bh)
goto out_err;
@@ -2816,7 +2964,23 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
le16_add_cpu(&gdp->bg_free_blocks_count, -ac->ac_b_ex.fe_len);
gdp->bg_checksum = ext4_group_desc_csum(sbi, ac->ac_b_ex.fe_group, gdp);
spin_unlock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group));
- percpu_counter_sub(&sbi->s_freeblocks_counter, ac->ac_b_ex.fe_len);
+
+ /*
+ * The free blocks count has already been reduced/reserved
+ * at write_begin() time for delayed allocation, so do not
+ * account for it twice here.
+ */
+ if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED))
+ percpu_counter_sub(&sbi->s_freeblocks_counter,
+ ac->ac_b_ex.fe_len);
+
+ if (sbi->s_log_groups_per_flex) {
+ ext4_group_t flex_group = ext4_flex_group(sbi,
+ ac->ac_b_ex.fe_group);
+ spin_lock(sb_bgl_lock(sbi, flex_group));
+ sbi->s_flex_groups[flex_group].free_blocks -= ac->ac_b_ex.fe_len;
+ spin_unlock(sb_bgl_lock(sbi, flex_group));
+ }
err = ext4_journal_dirty_metadata(handle, bitmap_bh);
if (err)
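
The accounting rule in the comment above can be shown with a tiny userspace sketch: when delayed allocation already reserved the blocks at write_begin() time, the later on-disk allocation must not subtract them from the free-blocks counter again (hypothetical numbers, not kernel code):

#include <stdio.h>

int main(void)
{
	long free_blocks = 1000;
	int delalloc_reserved = 1;	/* reservation done at write_begin() time */
	int len = 8;			/* blocks now being allocated on disk */

	if (delalloc_reserved)
		free_blocks -= len;	/* charged once, at reservation time */

	/* ... later, when the blocks are really allocated ... */
	if (!delalloc_reserved)
		free_blocks -= len;	/* only charge now if not already reserved */

	printf("free blocks = %ld\n", free_blocks);	/* 992, not 984 */
	return 0;
}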
@@ -3473,8 +3637,6 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
if (bit >= end)
break;
next = mb_find_next_bit(bitmap_bh->b_data, end, bit);
- if (next > end)
- next = end;
start = group * EXT4_BLOCKS_PER_GROUP(sb) + bit +
le32_to_cpu(sbi->s_es->s_first_data_block);
mb_debug(" free preallocated %u/%u in group %u\n",
@@ -3569,7 +3731,7 @@ ext4_mb_discard_group_preallocations(struct super_block *sb,
if (list_empty(&grp->bb_prealloc_list))
return 0;
- bitmap_bh = read_block_bitmap(sb, group);
+ bitmap_bh = ext4_read_block_bitmap(sb, group);
if (bitmap_bh == NULL) {
/* error handling here */
ext4_mb_release_desc(&e4b);
@@ -3743,7 +3905,7 @@ repeat:
err = ext4_mb_load_buddy(sb, group, &e4b);
BUG_ON(err != 0); /* error handling here */
- bitmap_bh = read_block_bitmap(sb, group);
+ bitmap_bh = ext4_read_block_bitmap(sb, group);
if (bitmap_bh == NULL) {
/* error handling here */
ext4_mb_release_desc(&e4b);
@@ -4011,10 +4173,21 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
sbi = EXT4_SB(sb);
if (!test_opt(sb, MBALLOC)) {
- block = ext4_new_blocks_old(handle, ar->inode, ar->goal,
+ block = ext4_old_new_blocks(handle, ar->inode, ar->goal,
&(ar->len), errp);
return block;
}
+ if (!EXT4_I(ar->inode)->i_delalloc_reserved_flag) {
+ /*
+ * With delalloc we already reserved the blocks
+ */
+ ar->len = ext4_has_free_blocks(sbi, ar->len);
+ }
+
+ if (ar->len == 0) {
+ *errp = -ENOSPC;
+ return 0;
+ }
while (ar->len && DQUOT_ALLOC_BLOCK(ar->inode, ar->len)) {
ar->flags |= EXT4_MB_HINT_NOPREALLOC;
@@ -4026,10 +4199,14 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
}
inquota = ar->len;
+ if (EXT4_I(ar->inode)->i_delalloc_reserved_flag)
+ ar->flags |= EXT4_MB_DELALLOC_RESERVED;
+
ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
if (!ac) {
+ ar->len = 0;
*errp = -ENOMEM;
- return 0;
+ goto out1;
}
ext4_mb_poll_new_transaction(sb, handle);
@@ -4037,12 +4214,11 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
*errp = ext4_mb_initialize_context(ac, ar);
if (*errp) {
ar->len = 0;
- goto out;
+ goto out2;
}
ac->ac_op = EXT4_MB_HISTORY_PREALLOC;
if (!ext4_mb_use_preallocated(ac)) {
-
ac->ac_op = EXT4_MB_HISTORY_ALLOC;
ext4_mb_normalize_request(ac, ar);
repeat:
@@ -4085,11 +4261,12 @@ repeat:
ext4_mb_release_context(ac);
-out:
+out2:
+ kmem_cache_free(ext4_ac_cachep, ac);
+out1:
if (ar->len < inquota)
DQUOT_FREE_BLOCK(ar->inode, inquota - ar->len);
- kmem_cache_free(ext4_ac_cachep, ac);
return block;
}
static void ext4_mb_poll_new_transaction(struct super_block *sb,
@@ -4242,7 +4419,7 @@ do_more:
overflow = bit + count - EXT4_BLOCKS_PER_GROUP(sb);
count -= overflow;
}
- bitmap_bh = read_block_bitmap(sb, block_group);
+ bitmap_bh = ext4_read_block_bitmap(sb, block_group);
if (!bitmap_bh)
goto error_return;
gdp = ext4_get_group_desc(sb, block_group, &gd_bh);
@@ -4309,10 +4486,9 @@ do_more:
ext4_mb_free_metadata(handle, &e4b, block_group, bit, count);
} else {
ext4_lock_group(sb, block_group);
- err = mb_free_blocks(inode, &e4b, bit, count);
+ mb_free_blocks(inode, &e4b, bit, count);
ext4_mb_return_to_preallocation(inode, &e4b, block, count);
ext4_unlock_group(sb, block_group);
- BUG_ON(err != 0);
}
spin_lock(sb_bgl_lock(sbi, block_group));
@@ -4321,6 +4497,13 @@ do_more:
spin_unlock(sb_bgl_lock(sbi, block_group));
percpu_counter_add(&sbi->s_freeblocks_counter, count);
+ if (sbi->s_log_groups_per_flex) {
+ ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
+ spin_lock(sb_bgl_lock(sbi, flex_group));
+ sbi->s_flex_groups[flex_group].free_blocks += count;
+ spin_unlock(sb_bgl_lock(sbi, flex_group));
+ }
+
ext4_mb_release_desc(&e4b);
*freed += count;
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index ab16beaa830d..387ad98350c3 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -183,6 +183,16 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
struct inode *inode);
/*
+ * p is at least 6 bytes before the end of page
+ */
+static inline struct ext4_dir_entry_2 *
+ext4_next_entry(struct ext4_dir_entry_2 *p)
+{
+ return (struct ext4_dir_entry_2 *)((char *)p +
+ ext4_rec_len_from_disk(p->rec_len));
+}
+
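
ext4_next_entry() simply advances by the on-disk record length. A self-contained userspace sketch of walking variable-length directory entries this way (the struct layout and sizes here are made up for illustration and do not match the real ext4_dir_entry_2):

#include <stdio.h>
#include <string.h>
#include <stdint.h>

struct dirent_sim {
	uint16_t rec_len;	/* distance to the next entry, in bytes */
	uint8_t  name_len;
	char     name[13];
};

int main(void)
{
	union {
		unsigned char bytes[64];
		struct dirent_sim align;	/* force suitable alignment */
	} block;
	struct dirent_sim e1 = { 24, 1, "a" }, e2 = { 40, 3, "bcd" };
	struct dirent_sim *p;

	memcpy(block.bytes, &e1, sizeof(e1));
	memcpy(block.bytes + e1.rec_len, &e2, sizeof(e2));

	for (p = (struct dirent_sim *)block.bytes;
	     (unsigned char *)p < block.bytes + sizeof(block.bytes);
	     p = (struct dirent_sim *)((char *)p + p->rec_len))	/* "next entry" */
		printf("%.*s (rec_len=%u)\n", p->name_len, p->name,
		       (unsigned)p->rec_len);

	return 0;
}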
+/*
* Future: use high four bits of block for coalesce-on-delete flags
* Mask them off for now.
*/
@@ -231,13 +241,13 @@ static inline unsigned dx_root_limit (struct inode *dir, unsigned infosize)
{
unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(1) -
EXT4_DIR_REC_LEN(2) - infosize;
- return 0? 20: entry_space / sizeof(struct dx_entry);
+ return entry_space / sizeof(struct dx_entry);
}
static inline unsigned dx_node_limit (struct inode *dir)
{
unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(0);
- return 0? 22: entry_space / sizeof(struct dx_entry);
+ return entry_space / sizeof(struct dx_entry);
}
/*
@@ -554,15 +564,6 @@ static int ext4_htree_next_block(struct inode *dir, __u32 hash,
/*
- * p is at least 6 bytes before the end of page
- */
-static inline struct ext4_dir_entry_2 *ext4_next_entry(struct ext4_dir_entry_2 *p)
-{
- return (struct ext4_dir_entry_2 *)((char *)p +
- ext4_rec_len_from_disk(p->rec_len));
-}
-
-/*
* This function fills a red-black tree with information from a
* directory block. It returns the number directory entries loaded
* into the tree. If there is an error it is returned in err.
@@ -993,19 +994,21 @@ static struct buffer_head * ext4_dx_find_entry(struct dentry *dentry,
de = (struct ext4_dir_entry_2 *) bh->b_data;
top = (struct ext4_dir_entry_2 *) ((char *) de + sb->s_blocksize -
EXT4_DIR_REC_LEN(0));
- for (; de < top; de = ext4_next_entry(de))
- if (ext4_match (namelen, name, de)) {
- if (!ext4_check_dir_entry("ext4_find_entry",
- dir, de, bh,
- (block<<EXT4_BLOCK_SIZE_BITS(sb))
- +((char *)de - bh->b_data))) {
- brelse (bh);
+ for (; de < top; de = ext4_next_entry(de)) {
+ int off = (block << EXT4_BLOCK_SIZE_BITS(sb))
+ + ((char *) de - bh->b_data);
+
+ if (!ext4_check_dir_entry(__func__, dir, de, bh, off)) {
+ brelse(bh);
*err = ERR_BAD_DX_DIR;
goto errout;
}
- *res_dir = de;
- dx_release (frames);
- return bh;
+
+ if (ext4_match(namelen, name, de)) {
+ *res_dir = de;
+ dx_release(frames);
+ return bh;
+ }
}
brelse (bh);
/* Check to see if we should continue to search */
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 9ff7b1c04239..f000fbe2cd93 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -866,6 +866,15 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
gdp->bg_checksum = ext4_group_desc_csum(sbi, input->group, gdp);
/*
+ * We can allocate memory for mb_alloc based on the new group
+ * descriptor
+ */
+ if (test_opt(sb, MBALLOC)) {
+ err = ext4_mb_add_more_groupinfo(sb, input->group, gdp);
+ if (err)
+ goto exit_journal;
+ }
+ /*
* Make the new blocks and inodes valid next. We do this before
* increasing the group count so that once the group is enabled,
* all of its blocks and inodes are already valid.
@@ -957,6 +966,8 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
handle_t *handle;
int err;
unsigned long freed_blocks;
+ ext4_group_t group;
+ struct ext4_group_info *grp;
/* We don't need to worry about locking wrt other resizers just
* yet: we're going to revalidate es->s_blocks_count after
@@ -988,7 +999,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
}
/* Handle the remaining blocks in the last group only. */
- ext4_get_group_no_and_offset(sb, o_blocks_count, NULL, &last);
+ ext4_get_group_no_and_offset(sb, o_blocks_count, &group, &last);
if (last == 0) {
ext4_warning(sb, __func__,
@@ -1060,6 +1071,45 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
o_blocks_count + add);
if ((err = ext4_journal_stop(handle)))
goto exit_put;
+
+ /*
+ * Mark mballoc pages as not up to date so that they will be updated
+ * next time they are loaded by ext4_mb_load_buddy.
+ */
+ if (test_opt(sb, MBALLOC)) {
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct inode *inode = sbi->s_buddy_cache;
+ int blocks_per_page;
+ int block;
+ int pnum;
+ struct page *page;
+
+ /* Set buddy page as not up to date */
+ blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
+ block = group * 2;
+ pnum = block / blocks_per_page;
+ page = find_get_page(inode->i_mapping, pnum);
+ if (page != NULL) {
+ ClearPageUptodate(page);
+ page_cache_release(page);
+ }
+
+ /* Set bitmap page as not up to date */
+ block++;
+ pnum = block / blocks_per_page;
+ page = find_get_page(inode->i_mapping, pnum);
+ if (page != NULL) {
+ ClearPageUptodate(page);
+ page_cache_release(page);
+ }
+
+ /* Get the info on the last group */
+ grp = ext4_get_group_info(sb, group);
+
+ /* Update free blocks in group info */
+ ext4_mb_update_group_info(grp, add);
+ }
+
if (test_opt(sb, DEBUG))
printk(KERN_DEBUG "EXT4-fs: extended group to %llu blocks\n",
ext4_blocks_count(es));
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 02bf24343979..1cb371dcd609 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -506,6 +506,7 @@ static void ext4_put_super (struct super_block * sb)
ext4_ext_release(sb);
ext4_xattr_put_super(sb);
jbd2_journal_destroy(sbi->s_journal);
+ sbi->s_journal = NULL;
if (!(sb->s_flags & MS_RDONLY)) {
EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
es->s_state = cpu_to_le16(sbi->s_mount_state);
@@ -517,6 +518,7 @@ static void ext4_put_super (struct super_block * sb)
for (i = 0; i < sbi->s_gdb_count; i++)
brelse(sbi->s_group_desc[i]);
kfree(sbi->s_group_desc);
+ kfree(sbi->s_flex_groups);
percpu_counter_destroy(&sbi->s_freeblocks_counter);
percpu_counter_destroy(&sbi->s_freeinodes_counter);
percpu_counter_destroy(&sbi->s_dirs_counter);
@@ -571,6 +573,12 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
memset(&ei->i_cached_extent, 0, sizeof(struct ext4_ext_cache));
INIT_LIST_HEAD(&ei->i_prealloc_list);
spin_lock_init(&ei->i_prealloc_lock);
+ jbd2_journal_init_jbd_inode(&ei->jinode, &ei->vfs_inode);
+ ei->i_reserved_data_blocks = 0;
+ ei->i_reserved_meta_blocks = 0;
+ ei->i_allocated_meta_blocks = 0;
+ ei->i_delalloc_reserved_flag = 0;
+ spin_lock_init(&(ei->i_block_reservation_lock));
return &ei->vfs_inode;
}
@@ -635,6 +643,8 @@ static void ext4_clear_inode(struct inode *inode)
EXT4_I(inode)->i_block_alloc_info = NULL;
if (unlikely(rsv))
kfree(rsv);
+ jbd2_journal_release_jbd_inode(EXT4_SB(inode->i_sb)->s_journal,
+ &EXT4_I(inode)->jinode);
}
static inline void ext4_show_quota_options(struct seq_file *seq, struct super_block *sb)
@@ -671,7 +681,6 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
unsigned long def_mount_opts;
struct super_block *sb = vfs->mnt_sb;
struct ext4_sb_info *sbi = EXT4_SB(sb);
- journal_t *journal = sbi->s_journal;
struct ext4_super_block *es = sbi->s_es;
def_mount_opts = le32_to_cpu(es->s_default_mount_opts);
@@ -747,6 +756,9 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
seq_puts(seq, ",nomballoc");
if (test_opt(sb, I_VERSION))
seq_puts(seq, ",i_version");
+ if (!test_opt(sb, DELALLOC))
+ seq_puts(seq, ",nodelalloc");
+
if (sbi->s_stripe)
seq_printf(seq, ",stripe=%lu", sbi->s_stripe);
@@ -894,7 +906,7 @@ enum {
Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota,
Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota,
Opt_grpquota, Opt_extents, Opt_noextents, Opt_i_version,
- Opt_mballoc, Opt_nomballoc, Opt_stripe,
+ Opt_mballoc, Opt_nomballoc, Opt_stripe, Opt_delalloc, Opt_nodelalloc,
};
static match_table_t tokens = {
@@ -953,6 +965,8 @@ static match_table_t tokens = {
{Opt_nomballoc, "nomballoc"},
{Opt_stripe, "stripe=%u"},
{Opt_resize, "resize"},
+ {Opt_delalloc, "delalloc"},
+ {Opt_nodelalloc, "nodelalloc"},
{Opt_err, NULL},
};
@@ -990,6 +1004,7 @@ static int parse_options (char *options, struct super_block *sb,
int qtype, qfmt;
char *qname;
#endif
+ ext4_fsblk_t last_block;
if (!options)
return 1;
@@ -1309,15 +1324,39 @@ set_qf_format:
clear_opt(sbi->s_mount_opt, NOBH);
break;
case Opt_extents:
+ if (!EXT4_HAS_INCOMPAT_FEATURE(sb,
+ EXT4_FEATURE_INCOMPAT_EXTENTS)) {
+ ext4_warning(sb, __func__,
+ "extents feature not enabled "
+ "on this filesystem, use tune2fs\n");
+ return 0;
+ }
set_opt (sbi->s_mount_opt, EXTENTS);
break;
case Opt_noextents:
+ /*
+ * When e2fsprogs supports resizing an already existing ext3
+ * file system to more than 2**32 blocks, we will need to teach
+ * the block allocator to grow existing block-mapped inodes so
+ * that the blocks allocated for them stay within 2**32.
+ */
+ last_block = ext4_blocks_count(sbi->s_es) - 1;
+ if (last_block > 0xffffffffULL) {
+ printk(KERN_ERR "EXT4-fs: Filesystem too "
+ "large to mount with "
+ "-o noextents options\n");
+ return 0;
+ }
clear_opt (sbi->s_mount_opt, EXTENTS);
break;
case Opt_i_version:
set_opt(sbi->s_mount_opt, I_VERSION);
sb->s_flags |= MS_I_VERSION;
break;
+ case Opt_nodelalloc:
+ clear_opt(sbi->s_mount_opt, DELALLOC);
+ break;
case Opt_mballoc:
set_opt(sbi->s_mount_opt, MBALLOC);
break;
@@ -1331,6 +1370,9 @@ set_qf_format:
return 0;
sbi->s_stripe = option;
break;
+ case Opt_delalloc:
+ set_opt(sbi->s_mount_opt, DELALLOC);
+ break;
default:
printk (KERN_ERR
"EXT4-fs: Unrecognized mount option \"%s\" "
@@ -1443,6 +1485,54 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
return res;
}
+static int ext4_fill_flex_info(struct super_block *sb)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_group_desc *gdp = NULL;
+ struct buffer_head *bh;
+ ext4_group_t flex_group_count;
+ ext4_group_t flex_group;
+ int groups_per_flex = 0;
+ __u64 block_bitmap = 0;
+ int i;
+
+ if (!sbi->s_es->s_log_groups_per_flex) {
+ sbi->s_log_groups_per_flex = 0;
+ return 1;
+ }
+
+ sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex;
+ groups_per_flex = 1 << sbi->s_log_groups_per_flex;
+
+ flex_group_count = (sbi->s_groups_count + groups_per_flex - 1) /
+ groups_per_flex;
+ sbi->s_flex_groups = kmalloc(flex_group_count *
+ sizeof(struct flex_groups), GFP_KERNEL);
+ if (sbi->s_flex_groups == NULL) {
+ printk(KERN_ERR "EXT4-fs: not enough memory\n");
+ goto failed;
+ }
+ memset(sbi->s_flex_groups, 0, flex_group_count *
+ sizeof(struct flex_groups));
+
+ gdp = ext4_get_group_desc(sb, 1, &bh);
+ block_bitmap = ext4_block_bitmap(sb, gdp) - 1;
+
+ for (i = 0; i < sbi->s_groups_count; i++) {
+ gdp = ext4_get_group_desc(sb, i, &bh);
+
+ flex_group = ext4_flex_group(sbi, i);
+ sbi->s_flex_groups[flex_group].free_inodes +=
+ le16_to_cpu(gdp->bg_free_inodes_count);
+ sbi->s_flex_groups[flex_group].free_blocks +=
+ le16_to_cpu(gdp->bg_free_blocks_count);
+ }
+
+ return 1;
+failed:
+ return 0;
+}
+
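
The flex_bg bookkeeping above folds every 2^s_log_groups_per_flex block groups into one flex group and sums their per-group free counters. A userspace sketch with hypothetical group counts and free-block values (the flex group index is taken to be the group number shifted right by the log factor, as ext4_flex_group() does):

#include <stdio.h>

#define NGROUPS 10

int main(void)
{
	int log_groups_per_flex = 2;			/* hypothetical: 4 groups per flex group */
	int groups_per_flex = 1 << log_groups_per_flex;
	int flex_count = (NGROUPS + groups_per_flex - 1) / groups_per_flex;
	unsigned free_blocks[NGROUPS] = { 100, 80, 0, 50, 200, 10, 10, 10, 5, 5 };
	unsigned flex_free[8] = { 0 };
	int i, f;

	for (i = 0; i < NGROUPS; i++) {
		int flex = i >> log_groups_per_flex;	/* like ext4_flex_group() */
		flex_free[flex] += free_blocks[i];
	}

	for (f = 0; f < flex_count; f++)
		printf("flex group %d: %u free blocks\n", f, flex_free[f]);
	return 0;
}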
__le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 block_group,
struct ext4_group_desc *gdp)
{
@@ -1810,8 +1900,8 @@ static unsigned long ext4_get_stripe_size(struct ext4_sb_info *sbi)
}
static int ext4_fill_super (struct super_block *sb, void *data, int silent)
- __releases(kernel_sem)
- __acquires(kernel_sem)
+ __releases(kernel_lock)
+ __acquires(kernel_lock)
{
struct buffer_head * bh;
@@ -1851,11 +1941,6 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
goto out_fail;
}
- if (!sb_set_blocksize(sb, blocksize)) {
- printk(KERN_ERR "EXT4-fs: bad blocksize %d.\n", blocksize);
- goto out_fail;
- }
-
/*
* The ext4 superblock will not be buffer aligned for other than 1kB
* block sizes. We need to calculate the offset from buffer start.
@@ -1919,15 +2004,28 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
/*
* turn on extents feature by default in ext4 filesystem
- * User -o noextents to turn it off
+ * only if feature flag already set by mkfs or tune2fs.
+ * Use -o noextents to turn it off
*/
- set_opt(sbi->s_mount_opt, EXTENTS);
+ if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS))
+ set_opt(sbi->s_mount_opt, EXTENTS);
+ else
+ ext4_warning(sb, __func__,
+ "extents feature not enabled on this filesystem, "
+ "use tune2fs.\n");
/*
- * turn on mballoc feature by default in ext4 filesystem
- * User -o nomballoc to turn it off
+ * turn on mballoc code by default in ext4 filesystem
+ * Use -o nomballoc to turn it off
*/
set_opt(sbi->s_mount_opt, MBALLOC);
+ /*
+ * enable delayed allocation by default
+ * Use -o nodelalloc to turn it off
+ */
+ set_opt(sbi->s_mount_opt, DELALLOC);
+
+
if (!parse_options ((char *) data, sb, &journal_inum, &journal_devnum,
NULL, 0))
goto failed_mount;
@@ -2138,6 +2236,14 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
printk(KERN_ERR "EXT4-fs: group descriptors corrupted!\n");
goto failed_mount2;
}
+ if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG))
+ if (!ext4_fill_flex_info(sb)) {
+ printk(KERN_ERR
+ "EXT4-fs: unable to initialize "
+ "flex_bg meta info!\n");
+ goto failed_mount2;
+ }
+
sbi->s_gdb_count = db_count;
get_random_bytes(&sbi->s_next_generation, sizeof(u32));
spin_lock_init(&sbi->s_next_gen_lock);
@@ -2358,6 +2464,13 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
test_opt(sb,DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA ? "ordered":
"writeback");
+ if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
+ printk(KERN_WARNING "EXT4-fs: Ignoring delalloc option - "
+ "requested data journaling mode\n");
+ clear_opt(sbi->s_mount_opt, DELALLOC);
+ } else if (test_opt(sb, DELALLOC))
+ printk(KERN_INFO "EXT4-fs: delayed allocation enabled\n");
+
ext4_ext_init(sb);
ext4_mb_init(sb, needs_recovery);
@@ -2372,6 +2485,7 @@ cantfind_ext4:
failed_mount4:
jbd2_journal_destroy(sbi->s_journal);
+ sbi->s_journal = NULL;
failed_mount3:
percpu_counter_destroy(&sbi->s_freeblocks_counter);
percpu_counter_destroy(&sbi->s_freeinodes_counter);
@@ -3325,7 +3439,7 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,
err = ext4_journal_dirty_metadata(handle, bh);
else {
/* Always do at least ordered writes for quotas */
- err = ext4_journal_dirty_data(handle, bh);
+ err = ext4_jbd2_file_inode(handle, inode);
mark_buffer_dirty(bh);
}
brelse(bh);
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index ff08633f398e..93c5fdcdad2e 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -810,7 +810,7 @@ inserted:
/* We need to allocate a new block */
ext4_fsblk_t goal = ext4_group_first_block_no(sb,
EXT4_I(inode)->i_block_group);
- ext4_fsblk_t block = ext4_new_block(handle, inode,
+ ext4_fsblk_t block = ext4_new_meta_block(handle, inode,
goal, &error);
if (error)
goto cleanup;
diff --git a/fs/ext4/xattr_trusted.c b/fs/ext4/xattr_trusted.c
index fff33382cadc..ac1a52cf2a37 100644
--- a/fs/ext4/xattr_trusted.c
+++ b/fs/ext4/xattr_trusted.c
@@ -13,13 +13,11 @@
#include "ext4.h"
#include "xattr.h"
-#define XATTR_TRUSTED_PREFIX "trusted."
-
static size_t
ext4_xattr_trusted_list(struct inode *inode, char *list, size_t list_size,
const char *name, size_t name_len)
{
- const size_t prefix_len = sizeof(XATTR_TRUSTED_PREFIX)-1;
+ const size_t prefix_len = XATTR_TRUSTED_PREFIX_LEN;
const size_t total_len = prefix_len + name_len + 1;
if (!capable(CAP_SYS_ADMIN))
diff --git a/fs/ext4/xattr_user.c b/fs/ext4/xattr_user.c
index 67be723fcc4e..d91aa61b42aa 100644
--- a/fs/ext4/xattr_user.c
+++ b/fs/ext4/xattr_user.c
@@ -12,13 +12,11 @@
#include "ext4.h"
#include "xattr.h"
-#define XATTR_USER_PREFIX "user."
-
static size_t
ext4_xattr_user_list(struct inode *inode, char *list, size_t list_size,
const char *name, size_t name_len)
{
- const size_t prefix_len = sizeof(XATTR_USER_PREFIX)-1;
+ const size_t prefix_len = XATTR_USER_PREFIX_LEN;
const size_t total_len = prefix_len + name_len + 1;
if (!test_opt(inode->i_sb, XATTR_USER))
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 6914598022ce..91389c8aee8a 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -688,7 +688,6 @@ void __jbd2_journal_drop_transaction(journal_t *journal, transaction_t *transact
J_ASSERT(transaction->t_state == T_FINISHED);
J_ASSERT(transaction->t_buffers == NULL);
- J_ASSERT(transaction->t_sync_datalist == NULL);
J_ASSERT(transaction->t_forget == NULL);
J_ASSERT(transaction->t_iobuf_list == NULL);
J_ASSERT(transaction->t_shadow_list == NULL);
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index a2ed72f7ceee..f8b3be873226 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -22,6 +22,8 @@
#include <linux/pagemap.h>
#include <linux/jiffies.h>
#include <linux/crc32.h>
+#include <linux/writeback.h>
+#include <linux/backing-dev.h>
/*
* Default IO end handler for temporary BJ_IO buffer_heads.
@@ -37,8 +39,8 @@ static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
}
/*
- * When an ext3-ordered file is truncated, it is possible that many pages are
- * not sucessfully freed, because they are attached to a committing transaction.
+ * When an ext4 file is truncated, it is possible that some pages are not
+ * successfully freed, because they are attached to a committing transaction.
* After the transaction commits, these pages are left on the LRU, with no
* ->mapping, and with attached buffers. These pages are trivially reclaimable
* by the VM, but their apparent absence upsets the VM accounting, and it makes
@@ -80,21 +82,6 @@ nope:
}
/*
- * Try to acquire jbd_lock_bh_state() against the buffer, when j_list_lock is
- * held. For ranking reasons we must trylock. If we lose, schedule away and
- * return 0. j_list_lock is dropped in this case.
- */
-static int inverted_lock(journal_t *journal, struct buffer_head *bh)
-{
- if (!jbd_trylock_bh_state(bh)) {
- spin_unlock(&journal->j_list_lock);
- schedule();
- return 0;
- }
- return 1;
-}
-
-/*
* Done it all: now submit the commit record. We should have
* cleaned up our previous buffers by now, so if we are in abort
* mode we can now just skip the rest of the journal write
@@ -112,6 +99,7 @@ static int journal_submit_commit_record(journal_t *journal,
struct buffer_head *bh;
int ret;
int barrier_done = 0;
+ struct timespec now = current_kernel_time();
if (is_journal_aborted(journal))
return 0;
@@ -126,6 +114,8 @@ static int journal_submit_commit_record(journal_t *journal,
tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK);
tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid);
+ tmp->h_commit_sec = cpu_to_be64(now.tv_sec);
+ tmp->h_commit_nsec = cpu_to_be32(now.tv_nsec);
if (JBD2_HAS_COMPAT_FEATURE(journal,
JBD2_FEATURE_COMPAT_CHECKSUM)) {
@@ -197,159 +187,104 @@ static int journal_wait_on_commit_record(struct buffer_head *bh)
}
/*
- * Wait for all submitted IO to complete.
+ * Write the filemap data using writepage() address_space_operations.
+ * We don't do block allocation here, even for delalloc. We don't
+ * use writepages() because with delayed allocation we may be doing
+ * block allocation in writepages().
*/
-static int journal_wait_on_locked_list(journal_t *journal,
- transaction_t *commit_transaction)
+static int journal_submit_inode_data_buffers(struct address_space *mapping)
{
- int ret = 0;
- struct journal_head *jh;
-
- while (commit_transaction->t_locked_list) {
- struct buffer_head *bh;
-
- jh = commit_transaction->t_locked_list->b_tprev;
- bh = jh2bh(jh);
- get_bh(bh);
- if (buffer_locked(bh)) {
- spin_unlock(&journal->j_list_lock);
- wait_on_buffer(bh);
- if (unlikely(!buffer_uptodate(bh)))
- ret = -EIO;
- spin_lock(&journal->j_list_lock);
- }
- if (!inverted_lock(journal, bh)) {
- put_bh(bh);
- spin_lock(&journal->j_list_lock);
- continue;
- }
- if (buffer_jbd(bh) && jh->b_jlist == BJ_Locked) {
- __jbd2_journal_unfile_buffer(jh);
- jbd_unlock_bh_state(bh);
- jbd2_journal_remove_journal_head(bh);
- put_bh(bh);
- } else {
- jbd_unlock_bh_state(bh);
- }
- put_bh(bh);
- cond_resched_lock(&journal->j_list_lock);
- }
+ int ret;
+ struct writeback_control wbc = {
+ .sync_mode = WB_SYNC_ALL,
+ .nr_to_write = mapping->nrpages * 2,
+ .range_start = 0,
+ .range_end = i_size_read(mapping->host),
+ .for_writepages = 1,
+ };
+
+ ret = generic_writepages(mapping, &wbc);
return ret;
- }
+}
-static void journal_do_submit_data(struct buffer_head **wbuf, int bufs)
+/*
+ * Submit all the data buffers of inodes associated with the transaction
+ * to disk.
+ *
+ * We are in a committing transaction. Therefore no new inode can be added
+ * to our inode list. We use the JI_COMMIT_RUNNING flag to protect the inode
+ * we are currently operating on from being released while we write out pages.
+ */
+static int journal_submit_data_buffers(journal_t *journal,
+ transaction_t *commit_transaction)
{
- int i;
+ struct jbd2_inode *jinode;
+ int err, ret = 0;
+ struct address_space *mapping;
- for (i = 0; i < bufs; i++) {
- wbuf[i]->b_end_io = end_buffer_write_sync;
- /* We use-up our safety reference in submit_bh() */
- submit_bh(WRITE, wbuf[i]);
+ spin_lock(&journal->j_list_lock);
+ list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
+ mapping = jinode->i_vfs_inode->i_mapping;
+ jinode->i_flags |= JI_COMMIT_RUNNING;
+ spin_unlock(&journal->j_list_lock);
+ /*
+ * Submit the inode data buffers. We use writepage instead of
+ * writepages because writepages can do block allocation with
+ * delalloc; we need to write only already-allocated blocks here.
+ */
+ err = journal_submit_inode_data_buffers(mapping);
+ if (!ret)
+ ret = err;
+ spin_lock(&journal->j_list_lock);
+ J_ASSERT(jinode->i_transaction == commit_transaction);
+ jinode->i_flags &= ~JI_COMMIT_RUNNING;
+ wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
}
+ spin_unlock(&journal->j_list_lock);
+ return ret;
}
/*
- * Submit all the data buffers to disk
+ * Wait for the data submitted for writeout, and refile inodes to the
+ * proper transaction if needed.
+ *
*/
-static void journal_submit_data_buffers(journal_t *journal,
- transaction_t *commit_transaction)
+static int journal_finish_inode_data_buffers(journal_t *journal,
+ transaction_t *commit_transaction)
{
- struct journal_head *jh;
- struct buffer_head *bh;
- int locked;
- int bufs = 0;
- struct buffer_head **wbuf = journal->j_wbuf;
+ struct jbd2_inode *jinode, *next_i;
+ int err, ret = 0;
- /*
- * Whenever we unlock the journal and sleep, things can get added
- * onto ->t_sync_datalist, so we have to keep looping back to
- * write_out_data until we *know* that the list is empty.
- *
- * Cleanup any flushed data buffers from the data list. Even in
- * abort mode, we want to flush this out as soon as possible.
- */
-write_out_data:
- cond_resched();
+ /* For locking, see the comment in journal_submit_data_buffers() */
spin_lock(&journal->j_list_lock);
+ list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
+ jinode->i_flags |= JI_COMMIT_RUNNING;
+ spin_unlock(&journal->j_list_lock);
+ err = filemap_fdatawait(jinode->i_vfs_inode->i_mapping);
+ if (!ret)
+ ret = err;
+ spin_lock(&journal->j_list_lock);
+ jinode->i_flags &= ~JI_COMMIT_RUNNING;
+ wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
+ }
- while (commit_transaction->t_sync_datalist) {
- jh = commit_transaction->t_sync_datalist;
- bh = jh2bh(jh);
- locked = 0;
-
- /* Get reference just to make sure buffer does not disappear
- * when we are forced to drop various locks */
- get_bh(bh);
- /* If the buffer is dirty, we need to submit IO and hence
- * we need the buffer lock. We try to lock the buffer without
- * blocking. If we fail, we need to drop j_list_lock and do
- * blocking lock_buffer().
- */
- if (buffer_dirty(bh)) {
- if (test_set_buffer_locked(bh)) {
- BUFFER_TRACE(bh, "needs blocking lock");
- spin_unlock(&journal->j_list_lock);
- /* Write out all data to prevent deadlocks */
- journal_do_submit_data(wbuf, bufs);
- bufs = 0;
- lock_buffer(bh);
- spin_lock(&journal->j_list_lock);
- }
- locked = 1;
- }
- /* We have to get bh_state lock. Again out of order, sigh. */
- if (!inverted_lock(journal, bh)) {
- jbd_lock_bh_state(bh);
- spin_lock(&journal->j_list_lock);
- }
- /* Someone already cleaned up the buffer? */
- if (!buffer_jbd(bh)
- || jh->b_transaction != commit_transaction
- || jh->b_jlist != BJ_SyncData) {
- jbd_unlock_bh_state(bh);
- if (locked)
- unlock_buffer(bh);
- BUFFER_TRACE(bh, "already cleaned up");
- put_bh(bh);
- continue;
- }
- if (locked && test_clear_buffer_dirty(bh)) {
- BUFFER_TRACE(bh, "needs writeout, adding to array");
- wbuf[bufs++] = bh;
- __jbd2_journal_file_buffer(jh, commit_transaction,
- BJ_Locked);
- jbd_unlock_bh_state(bh);
- if (bufs == journal->j_wbufsize) {
- spin_unlock(&journal->j_list_lock);
- journal_do_submit_data(wbuf, bufs);
- bufs = 0;
- goto write_out_data;
- }
- } else if (!locked && buffer_locked(bh)) {
- __jbd2_journal_file_buffer(jh, commit_transaction,
- BJ_Locked);
- jbd_unlock_bh_state(bh);
- put_bh(bh);
+ /* Now refile inode to proper lists */
+ list_for_each_entry_safe(jinode, next_i,
+ &commit_transaction->t_inode_list, i_list) {
+ list_del(&jinode->i_list);
+ if (jinode->i_next_transaction) {
+ jinode->i_transaction = jinode->i_next_transaction;
+ jinode->i_next_transaction = NULL;
+ list_add(&jinode->i_list,
+ &jinode->i_transaction->t_inode_list);
} else {
- BUFFER_TRACE(bh, "writeout complete: unfile");
- __jbd2_journal_unfile_buffer(jh);
- jbd_unlock_bh_state(bh);
- if (locked)
- unlock_buffer(bh);
- jbd2_journal_remove_journal_head(bh);
- /* Once for our safety reference, once for
- * jbd2_journal_remove_journal_head() */
- put_bh(bh);
- put_bh(bh);
- }
-
- if (need_resched() || spin_needbreak(&journal->j_list_lock)) {
- spin_unlock(&journal->j_list_lock);
- goto write_out_data;
+ jinode->i_transaction = NULL;
}
}
spin_unlock(&journal->j_list_lock);
- journal_do_submit_data(wbuf, bufs);
+
+ return ret;
}
static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh)
@@ -524,21 +459,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
* Now start flushing things to disk, in the order they appear
* on the transaction lists. Data blocks go first.
*/
- err = 0;
- journal_submit_data_buffers(journal, commit_transaction);
-
- /*
- * Wait for all previously submitted IO to complete if commit
- * record is to be written synchronously.
- */
- spin_lock(&journal->j_list_lock);
- if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
- JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT))
- err = journal_wait_on_locked_list(journal,
- commit_transaction);
-
- spin_unlock(&journal->j_list_lock);
-
+ err = journal_submit_data_buffers(journal, commit_transaction);
if (err)
jbd2_journal_abort(journal, err);
@@ -547,16 +468,6 @@ void jbd2_journal_commit_transaction(journal_t *journal)
jbd_debug(3, "JBD: commit phase 2\n");
/*
- * If we found any dirty or locked buffers, then we should have
- * looped back up to the write_out_data label. If there weren't
- * any then journal_clean_data_list should have wiped the list
- * clean by now, so check that it is in fact empty.
- */
- J_ASSERT (commit_transaction->t_sync_datalist == NULL);
-
- jbd_debug (3, "JBD: commit phase 3\n");
-
- /*
* Way to go: we have now written out all of the data for a
* transaction! Now comes the tricky part: we need to write out
* metadata. Loop over the transaction's entire buffer list:
@@ -574,6 +485,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
J_ASSERT(commit_transaction->t_nr_buffers <=
commit_transaction->t_outstanding_credits);
+ err = 0;
descriptor = NULL;
bufs = 0;
while (commit_transaction->t_buffers) {
@@ -748,15 +660,19 @@ start_journal_io:
&cbh, crc32_sum);
if (err)
__jbd2_journal_abort_hard(journal);
-
- spin_lock(&journal->j_list_lock);
- err = journal_wait_on_locked_list(journal,
- commit_transaction);
- spin_unlock(&journal->j_list_lock);
- if (err)
- __jbd2_journal_abort_hard(journal);
}
+ /*
+ * This is the right place to wait for data buffers both for ASYNC
+ * and !ASYNC commit. If commit is ASYNC, we need to wait only after
+ * the commit block went to disk (which happens above). If commit is
+ * SYNC, we need to wait for data buffers before we start writing the
+ * commit block, which happens below in that case.
+ */
+ err = journal_finish_inode_data_buffers(journal, commit_transaction);
+ if (err)
+ jbd2_journal_abort(journal, err);
+
/* Lo and behold: we have just managed to send a transaction to
the log. Before we can commit it, wait for the IO so far to
complete. Control buffers being written are on the
@@ -768,7 +684,7 @@ start_journal_io:
so we incur less scheduling load.
*/
- jbd_debug(3, "JBD: commit phase 4\n");
+ jbd_debug(3, "JBD: commit phase 3\n");
/*
* akpm: these are BJ_IO, and j_list_lock is not needed.
@@ -827,7 +743,7 @@ wait_for_iobuf:
J_ASSERT (commit_transaction->t_shadow_list == NULL);
- jbd_debug(3, "JBD: commit phase 5\n");
+ jbd_debug(3, "JBD: commit phase 4\n");
/* Here we wait for the revoke record and descriptor record buffers */
wait_for_ctlbuf:
@@ -854,7 +770,7 @@ wait_for_iobuf:
/* AKPM: bforget here */
}
- jbd_debug(3, "JBD: commit phase 6\n");
+ jbd_debug(3, "JBD: commit phase 5\n");
if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
@@ -874,9 +790,9 @@ wait_for_iobuf:
transaction can be removed from any checkpoint list it was on
before. */
- jbd_debug(3, "JBD: commit phase 7\n");
+ jbd_debug(3, "JBD: commit phase 6\n");
- J_ASSERT(commit_transaction->t_sync_datalist == NULL);
+ J_ASSERT(list_empty(&commit_transaction->t_inode_list));
J_ASSERT(commit_transaction->t_buffers == NULL);
J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
J_ASSERT(commit_transaction->t_iobuf_list == NULL);
@@ -997,7 +913,7 @@ restart_loop:
/* Done with this transaction! */
- jbd_debug(3, "JBD: commit phase 8\n");
+ jbd_debug(3, "JBD: commit phase 7\n");
J_ASSERT(commit_transaction->t_state == T_COMMIT);
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 2e24567c4a79..b26c6d9fe6ae 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -50,7 +50,6 @@ EXPORT_SYMBOL(jbd2_journal_unlock_updates);
EXPORT_SYMBOL(jbd2_journal_get_write_access);
EXPORT_SYMBOL(jbd2_journal_get_create_access);
EXPORT_SYMBOL(jbd2_journal_get_undo_access);
-EXPORT_SYMBOL(jbd2_journal_dirty_data);
EXPORT_SYMBOL(jbd2_journal_dirty_metadata);
EXPORT_SYMBOL(jbd2_journal_release_buffer);
EXPORT_SYMBOL(jbd2_journal_forget);
@@ -82,6 +81,10 @@ EXPORT_SYMBOL(jbd2_journal_blocks_per_page);
EXPORT_SYMBOL(jbd2_journal_invalidatepage);
EXPORT_SYMBOL(jbd2_journal_try_to_free_buffers);
EXPORT_SYMBOL(jbd2_journal_force_commit);
+EXPORT_SYMBOL(jbd2_journal_file_inode);
+EXPORT_SYMBOL(jbd2_journal_init_jbd_inode);
+EXPORT_SYMBOL(jbd2_journal_release_jbd_inode);
+EXPORT_SYMBOL(jbd2_journal_begin_ordered_truncate);
static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *);
static void __journal_abort_soft (journal_t *journal, int errno);
@@ -2195,6 +2198,54 @@ void jbd2_journal_put_journal_head(struct journal_head *jh)
}
/*
+ * Initialize jbd inode head
+ */
+void jbd2_journal_init_jbd_inode(struct jbd2_inode *jinode, struct inode *inode)
+{
+ jinode->i_transaction = NULL;
+ jinode->i_next_transaction = NULL;
+ jinode->i_vfs_inode = inode;
+ jinode->i_flags = 0;
+ INIT_LIST_HEAD(&jinode->i_list);
+}
+
+/*
+ * Function to be called before we start removing the inode from memory
+ * (i.e., clear_inode() is a fine place to call it from). It removes the
+ * inode from the transaction's lists.
+ */
+void jbd2_journal_release_jbd_inode(journal_t *journal,
+ struct jbd2_inode *jinode)
+{
+ int writeout = 0;
+
+ if (!journal)
+ return;
+restart:
+ spin_lock(&journal->j_list_lock);
+	/* Is the commit code writing out this inode? Then we have to wait. */
+ if (jinode->i_flags & JI_COMMIT_RUNNING) {
+ wait_queue_head_t *wq;
+ DEFINE_WAIT_BIT(wait, &jinode->i_flags, __JI_COMMIT_RUNNING);
+ wq = bit_waitqueue(&jinode->i_flags, __JI_COMMIT_RUNNING);
+ prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
+ spin_unlock(&journal->j_list_lock);
+ schedule();
+ finish_wait(wq, &wait.wait);
+ goto restart;
+ }
+
+ /* Do we need to wait for data writeback? */
+ if (journal->j_committing_transaction == jinode->i_transaction)
+ writeout = 1;
+ if (jinode->i_transaction) {
+ list_del(&jinode->i_list);
+ jinode->i_transaction = NULL;
+ }
+ spin_unlock(&journal->j_list_lock);
+}
+
+/*
* debugfs tunables
*/
#ifdef CONFIG_JBD2_DEBUG
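A filesystem adopting these hooks is expected to embed a struct jbd2_inode in its per-inode information, initialize it when the inode is set up, and release it from clear_inode(). A minimal sketch, with myfs_* and journal_of() as invented placeholder names:

struct myfs_inode_info {
        struct jbd2_inode jinode;       /* ordered-mode tracking */
        struct inode vfs_inode;
};

static void myfs_setup_inode(struct myfs_inode_info *ei)
{
        jbd2_journal_init_jbd_inode(&ei->jinode, &ei->vfs_inode);
}

static void myfs_clear_inode(struct inode *inode)
{
        struct myfs_inode_info *ei = container_of(inode,
                        struct myfs_inode_info, vfs_inode);

        /* Drops the inode from any transaction list, waiting first if
         * commit is writing its data out (JI_COMMIT_RUNNING). */
        jbd2_journal_release_jbd_inode(journal_of(inode), &ei->jinode);
}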
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index d6e006e67804..4f7cadbb19fa 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -41,7 +41,6 @@ static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh);
* new transaction and we can't block without protecting against other
* processes trying to touch the journal while it is in transition.
*
- * Called under j_state_lock
*/
static transaction_t *
@@ -52,6 +51,7 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction)
transaction->t_tid = journal->j_transaction_sequence++;
transaction->t_expires = jiffies + journal->j_commit_interval;
spin_lock_init(&transaction->t_handle_lock);
+ INIT_LIST_HEAD(&transaction->t_inode_list);
/* Set up the commit timer for the new transaction. */
journal->j_commit_timer.expires = round_jiffies(transaction->t_expires);
@@ -943,183 +943,6 @@ out:
}
/**
- * int jbd2_journal_dirty_data() - mark a buffer as containing dirty data which
- * needs to be flushed before we can commit the
- * current transaction.
- * @handle: transaction
- * @bh: bufferhead to mark
- *
- * The buffer is placed on the transaction's data list and is marked as
- * belonging to the transaction.
- *
- * Returns error number or 0 on success.
- *
- * jbd2_journal_dirty_data() can be called via page_launder->ext3_writepage
- * by kswapd.
- */
-int jbd2_journal_dirty_data(handle_t *handle, struct buffer_head *bh)
-{
- journal_t *journal = handle->h_transaction->t_journal;
- int need_brelse = 0;
- struct journal_head *jh;
-
- if (is_handle_aborted(handle))
- return 0;
-
- jh = jbd2_journal_add_journal_head(bh);
- JBUFFER_TRACE(jh, "entry");
-
- /*
- * The buffer could *already* be dirty. Writeout can start
- * at any time.
- */
- jbd_debug(4, "jh: %p, tid:%d\n", jh, handle->h_transaction->t_tid);
-
- /*
- * What if the buffer is already part of a running transaction?
- *
- * There are two cases:
- * 1) It is part of the current running transaction. Refile it,
- * just in case we have allocated it as metadata, deallocated
- * it, then reallocated it as data.
- * 2) It is part of the previous, still-committing transaction.
- * If all we want to do is to guarantee that the buffer will be
- * written to disk before this new transaction commits, then
- * being sure that the *previous* transaction has this same
- * property is sufficient for us! Just leave it on its old
- * transaction.
- *
- * In case (2), the buffer must not already exist as metadata
- * --- that would violate write ordering (a transaction is free
- * to write its data at any point, even before the previous
- * committing transaction has committed). The caller must
- * never, ever allow this to happen: there's nothing we can do
- * about it in this layer.
- */
- jbd_lock_bh_state(bh);
- spin_lock(&journal->j_list_lock);
-
- /* Now that we have bh_state locked, are we really still mapped? */
- if (!buffer_mapped(bh)) {
- JBUFFER_TRACE(jh, "unmapped buffer, bailing out");
- goto no_journal;
- }
-
- if (jh->b_transaction) {
- JBUFFER_TRACE(jh, "has transaction");
- if (jh->b_transaction != handle->h_transaction) {
- JBUFFER_TRACE(jh, "belongs to older transaction");
- J_ASSERT_JH(jh, jh->b_transaction ==
- journal->j_committing_transaction);
-
- /* @@@ IS THIS TRUE ? */
- /*
- * Not any more. Scenario: someone does a write()
- * in data=journal mode. The buffer's transaction has
- * moved into commit. Then someone does another
- * write() to the file. We do the frozen data copyout
- * and set b_next_transaction to point to j_running_t.
- * And while we're in that state, someone does a
- * writepage() in an attempt to pageout the same area
- * of the file via a shared mapping. At present that
- * calls jbd2_journal_dirty_data(), and we get right here.
- * It may be too late to journal the data. Simply
- * falling through to the next test will suffice: the
- * data will be dirty and wil be checkpointed. The
- * ordering comments in the next comment block still
- * apply.
- */
- //J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
-
- /*
- * If we're journalling data, and this buffer was
- * subject to a write(), it could be metadata, forget
- * or shadow against the committing transaction. Now,
- * someone has dirtied the same darn page via a mapping
- * and it is being writepage()'d.
- * We *could* just steal the page from commit, with some
- * fancy locking there. Instead, we just skip it -
- * don't tie the page's buffers to the new transaction
- * at all.
- * Implication: if we crash before the writepage() data
- * is written into the filesystem, recovery will replay
- * the write() data.
- */
- if (jh->b_jlist != BJ_None &&
- jh->b_jlist != BJ_SyncData &&
- jh->b_jlist != BJ_Locked) {
- JBUFFER_TRACE(jh, "Not stealing");
- goto no_journal;
- }
-
- /*
- * This buffer may be undergoing writeout in commit. We
- * can't return from here and let the caller dirty it
- * again because that can cause the write-out loop in
- * commit to never terminate.
- */
- if (buffer_dirty(bh)) {
- get_bh(bh);
- spin_unlock(&journal->j_list_lock);
- jbd_unlock_bh_state(bh);
- need_brelse = 1;
- sync_dirty_buffer(bh);
- jbd_lock_bh_state(bh);
- spin_lock(&journal->j_list_lock);
- /* Since we dropped the lock... */
- if (!buffer_mapped(bh)) {
- JBUFFER_TRACE(jh, "buffer got unmapped");
- goto no_journal;
- }
- /* The buffer may become locked again at any
- time if it is redirtied */
- }
-
- /* journal_clean_data_list() may have got there first */
- if (jh->b_transaction != NULL) {
- JBUFFER_TRACE(jh, "unfile from commit");
- __jbd2_journal_temp_unlink_buffer(jh);
- /* It still points to the committing
- * transaction; move it to this one so
- * that the refile assert checks are
- * happy. */
- jh->b_transaction = handle->h_transaction;
- }
- /* The buffer will be refiled below */
-
- }
- /*
- * Special case --- the buffer might actually have been
- * allocated and then immediately deallocated in the previous,
- * committing transaction, so might still be left on that
- * transaction's metadata lists.
- */
- if (jh->b_jlist != BJ_SyncData && jh->b_jlist != BJ_Locked) {
- JBUFFER_TRACE(jh, "not on correct data list: unfile");
- J_ASSERT_JH(jh, jh->b_jlist != BJ_Shadow);
- __jbd2_journal_temp_unlink_buffer(jh);
- jh->b_transaction = handle->h_transaction;
- JBUFFER_TRACE(jh, "file as data");
- __jbd2_journal_file_buffer(jh, handle->h_transaction,
- BJ_SyncData);
- }
- } else {
- JBUFFER_TRACE(jh, "not on a transaction");
- __jbd2_journal_file_buffer(jh, handle->h_transaction, BJ_SyncData);
- }
-no_journal:
- spin_unlock(&journal->j_list_lock);
- jbd_unlock_bh_state(bh);
- if (need_brelse) {
- BUFFER_TRACE(bh, "brelse");
- __brelse(bh);
- }
- JBUFFER_TRACE(jh, "exit");
- jbd2_journal_put_journal_head(jh);
- return 0;
-}
-
-/**
* int jbd2_journal_dirty_metadata() - mark a buffer as containing dirty metadata
* @handle: transaction to add buffer to.
* @bh: buffer to mark
@@ -1541,10 +1364,10 @@ __blist_del_buffer(struct journal_head **list, struct journal_head *jh)
* Remove a buffer from the appropriate transaction list.
*
* Note that this function can *change* the value of
- * bh->b_transaction->t_sync_datalist, t_buffers, t_forget,
- * t_iobuf_list, t_shadow_list, t_log_list or t_reserved_list. If the caller
- * is holding onto a copy of one of thee pointers, it could go bad.
- * Generally the caller needs to re-read the pointer from the transaction_t.
+ * bh->b_transaction->t_buffers, t_forget, t_iobuf_list, t_shadow_list,
+ * t_log_list or t_reserved_list. If the caller is holding onto a copy of one
+ * of these pointers, it could go bad. Generally the caller needs to re-read
+ * the pointer from the transaction_t.
*
* Called under j_list_lock. The journal may not be locked.
*/
@@ -1566,9 +1389,6 @@ void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh)
switch (jh->b_jlist) {
case BJ_None:
return;
- case BJ_SyncData:
- list = &transaction->t_sync_datalist;
- break;
case BJ_Metadata:
transaction->t_nr_buffers--;
J_ASSERT_JH(jh, transaction->t_nr_buffers >= 0);
@@ -1589,9 +1409,6 @@ void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh)
case BJ_Reserved:
list = &transaction->t_reserved_list;
break;
- case BJ_Locked:
- list = &transaction->t_locked_list;
- break;
}
__blist_del_buffer(list, jh);
@@ -1634,15 +1451,7 @@ __journal_try_to_free_buffer(journal_t *journal, struct buffer_head *bh)
goto out;
spin_lock(&journal->j_list_lock);
- if (jh->b_transaction != NULL && jh->b_cp_transaction == NULL) {
- if (jh->b_jlist == BJ_SyncData || jh->b_jlist == BJ_Locked) {
- /* A written-back ordered data buffer */
- JBUFFER_TRACE(jh, "release data");
- __jbd2_journal_unfile_buffer(jh);
- jbd2_journal_remove_journal_head(bh);
- __brelse(bh);
- }
- } else if (jh->b_cp_transaction != NULL && jh->b_transaction == NULL) {
+ if (jh->b_cp_transaction != NULL && jh->b_transaction == NULL) {
/* written-back checkpointed metadata buffer */
if (jh->b_jlist == BJ_None) {
JBUFFER_TRACE(jh, "remove from checkpoint list");
@@ -1656,12 +1465,43 @@ out:
return;
}
+/*
+ * jbd2_journal_try_to_free_buffers() could race with
+ * jbd2_journal_commit_transaction(). The latter might still hold the
+ * reference count to the buffers when inspecting them on
+ * t_syncdata_list or t_locked_list.
+ *
+ * jbd2_journal_try_to_free_buffers() will call this function to
+ * wait for the current transaction to finish syncing data buffers before
+ * trying to free those buffers.
+ *
+ * Takes journal->j_state_lock itself; the caller must not hold it.
+ */
+static void jbd2_journal_wait_for_transaction_sync_data(journal_t *journal)
+{
+ transaction_t *transaction;
+ tid_t tid;
+
+ spin_lock(&journal->j_state_lock);
+ transaction = journal->j_committing_transaction;
+
+ if (!transaction) {
+ spin_unlock(&journal->j_state_lock);
+ return;
+ }
+
+ tid = transaction->t_tid;
+ spin_unlock(&journal->j_state_lock);
+ jbd2_log_wait_commit(journal, tid);
+}
/**
* int jbd2_journal_try_to_free_buffers() - try to free page buffers.
* @journal: journal for operation
* @page: to try and free
- * @unused_gfp_mask: unused
+ * @gfp_mask: we use the mask to detect how hard we should try to release
+ * buffers. If __GFP_WAIT and __GFP_FS are set, we wait for the commit code
+ * to release the buffers.
*
*
* For all the buffers on this page,
@@ -1690,9 +1530,11 @@ out:
* journal_try_to_free_buffer() is changing its state. But that
* cannot happen because we never reallocate freed data as metadata
* while the data is part of a transaction. Yes?
+ *
+ * Return 0 on failure, 1 on success
*/
int jbd2_journal_try_to_free_buffers(journal_t *journal,
- struct page *page, gfp_t unused_gfp_mask)
+ struct page *page, gfp_t gfp_mask)
{
struct buffer_head *head;
struct buffer_head *bh;
@@ -1708,7 +1550,8 @@ int jbd2_journal_try_to_free_buffers(journal_t *journal,
/*
* We take our own ref against the journal_head here to avoid
* having to add tons of locking around each instance of
- * jbd2_journal_remove_journal_head() and jbd2_journal_put_journal_head().
+ * jbd2_journal_remove_journal_head() and
+ * jbd2_journal_put_journal_head().
*/
jh = jbd2_journal_grab_journal_head(bh);
if (!jh)
@@ -1721,7 +1564,28 @@ int jbd2_journal_try_to_free_buffers(journal_t *journal,
if (buffer_jbd(bh))
goto busy;
} while ((bh = bh->b_this_page) != head);
+
ret = try_to_free_buffers(page);
+
+ /*
+	 * There are a number of places where jbd2_journal_try_to_free_buffers()
+	 * could race with jbd2_journal_commit_transaction(); the latter may
+	 * still hold a reference to the buffers while processing them, in
+	 * which case try_to_free_buffers() fails to free them. Some callers
+	 * of releasepage() require the page buffers to be dropped and
+	 * otherwise treat a failure to free as an error (such as
+	 * generic_file_direct_IO()).
+	 *
+	 * So, if the caller of try_to_release_page() wants the synchronous
+	 * behaviour (i.e. make sure buffers are dropped upon return),
+	 * wait for the current transaction to finish flushing its dirty data
+	 * buffers and then try to free those buffers again.
+ */
+ if (ret == 0 && (gfp_mask & __GFP_WAIT) && (gfp_mask & __GFP_FS)) {
+ jbd2_journal_wait_for_transaction_sync_data(journal);
+ ret = try_to_free_buffers(page);
+ }
+
busy:
return ret;
}
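The gfp_mask now tells jbd2 whether the caller may block: only with both __GFP_WAIT and __GFP_FS set does it wait for the committing transaction before retrying the free. A filesystem's ->releasepage hook would simply forward the mask; roughly as below, where myfs_page_journal() is an assumed helper, not part of the patch:

static int myfs_releasepage(struct page *page, gfp_t wait)
{
        journal_t *journal = myfs_page_journal(page);   /* assumed helper */

        /* Returns 1 if the buffers were freed; with __GFP_WAIT|__GFP_FS
         * it may block on the running commit and retry once. */
        return jbd2_journal_try_to_free_buffers(journal, page, wait);
}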
@@ -1823,6 +1687,7 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
if (!buffer_jbd(bh))
goto zap_buffer_unlocked;
+ /* OK, we have data buffer in journaled mode */
spin_lock(&journal->j_state_lock);
jbd_lock_bh_state(bh);
spin_lock(&journal->j_list_lock);
@@ -1886,15 +1751,6 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
}
} else if (transaction == journal->j_committing_transaction) {
JBUFFER_TRACE(jh, "on committing transaction");
- if (jh->b_jlist == BJ_Locked) {
- /*
- * The buffer is on the committing transaction's locked
- * list. We have the buffer locked, so I/O has
- * completed. So we can nail the buffer now.
- */
- may_free = __dispose_buffer(jh, transaction);
- goto zap_buffer;
- }
/*
* If it is committing, we simply cannot touch it. We
* can remove it's next_transaction pointer from the
@@ -2027,9 +1883,6 @@ void __jbd2_journal_file_buffer(struct journal_head *jh,
J_ASSERT_JH(jh, !jh->b_committed_data);
J_ASSERT_JH(jh, !jh->b_frozen_data);
return;
- case BJ_SyncData:
- list = &transaction->t_sync_datalist;
- break;
case BJ_Metadata:
transaction->t_nr_buffers++;
list = &transaction->t_buffers;
@@ -2049,9 +1902,6 @@ void __jbd2_journal_file_buffer(struct journal_head *jh,
case BJ_Reserved:
list = &transaction->t_reserved_list;
break;
- case BJ_Locked:
- list = &transaction->t_locked_list;
- break;
}
__blist_add_buffer(list, jh);
@@ -2141,3 +1991,88 @@ void jbd2_journal_refile_buffer(journal_t *journal, struct journal_head *jh)
spin_unlock(&journal->j_list_lock);
__brelse(bh);
}
+
+/*
+ * File inode in the inode list of the handle's transaction
+ */
+int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *jinode)
+{
+ transaction_t *transaction = handle->h_transaction;
+ journal_t *journal = transaction->t_journal;
+
+ if (is_handle_aborted(handle))
+ return -EIO;
+
+ jbd_debug(4, "Adding inode %lu, tid:%d\n", jinode->i_vfs_inode->i_ino,
+ transaction->t_tid);
+
+ /*
+	 * First check, without taking the lock, whether the inode is already
+	 * on the transaction's lists. Note that this check is safe
+	 * without the lock as we cannot race with somebody removing the inode
+	 * from the transaction. The reason is that we remove the inode from the
+	 * transaction only in jbd2_journal_release_jbd_inode() and when we commit
+ * the transaction. We are guarded from the first case by holding
+ * a reference to the inode. We are safe against the second case
+ * because if jinode->i_transaction == transaction, commit code
+ * cannot touch the transaction because we hold reference to it,
+ * and if jinode->i_next_transaction == transaction, commit code
+ * will only file the inode where we want it.
+ */
+ if (jinode->i_transaction == transaction ||
+ jinode->i_next_transaction == transaction)
+ return 0;
+
+ spin_lock(&journal->j_list_lock);
+
+ if (jinode->i_transaction == transaction ||
+ jinode->i_next_transaction == transaction)
+ goto done;
+
+ /* On some different transaction's list - should be
+ * the committing one */
+ if (jinode->i_transaction) {
+ J_ASSERT(jinode->i_next_transaction == NULL);
+ J_ASSERT(jinode->i_transaction ==
+ journal->j_committing_transaction);
+ jinode->i_next_transaction = transaction;
+ goto done;
+ }
+ /* Not on any transaction list... */
+ J_ASSERT(!jinode->i_next_transaction);
+ jinode->i_transaction = transaction;
+ list_add(&jinode->i_list, &transaction->t_inode_list);
+done:
+ spin_unlock(&journal->j_list_lock);
+
+ return 0;
+}
+
+/*
+ * This function must be called when the inode is journaled in ordered
+ * mode, before truncation happens. It starts writeout of the truncated
+ * part in case it is part of the committing transaction, so that we
+ * uphold the ordered mode consistency guarantees.
+ */
+int jbd2_journal_begin_ordered_truncate(struct jbd2_inode *inode,
+ loff_t new_size)
+{
+ journal_t *journal;
+ transaction_t *commit_trans;
+ int ret = 0;
+
+ if (!inode->i_transaction && !inode->i_next_transaction)
+ goto out;
+ journal = inode->i_transaction->t_journal;
+ spin_lock(&journal->j_state_lock);
+ commit_trans = journal->j_committing_transaction;
+ spin_unlock(&journal->j_state_lock);
+ if (inode->i_transaction == commit_trans) {
+ ret = filemap_fdatawrite_range(inode->i_vfs_inode->i_mapping,
+ new_size, LLONG_MAX);
+ if (ret)
+ jbd2_journal_abort(journal, ret);
+ }
+out:
+ return ret;
+}
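With jbd2_journal_dirty_data() gone, an ordered-mode filesystem files the whole inode on the running transaction and fences truncates against the committing one. A hedged sketch of both call sites (MYFS_I() and the myfs_* functions are invented for illustration):

/* In the write path, while still holding the handle: */
static int myfs_order_data(handle_t *handle, struct inode *inode)
{
        /* Put the inode on the running transaction's t_inode_list so
         * commit waits for its dirty pages before writing the commit
         * block. */
        return jbd2_journal_file_inode(handle, &MYFS_I(inode)->jinode);
}

/* Before shrinking i_size: */
static int myfs_begin_truncate(struct inode *inode, loff_t new_size)
{
        /* If the committing transaction still owns this inode's data,
         * start writeout of the range being truncated away. */
        return jbd2_journal_begin_ordered_truncate(&MYFS_I(inode)->jinode,
                                                   new_size);
}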
diff --git a/fs/mpage.c b/fs/mpage.c
index 235e4d3873a8..dbcc7af76a15 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -82,7 +82,7 @@ static void mpage_end_io_write(struct bio *bio, int err)
bio_put(bio);
}
-static struct bio *mpage_bio_submit(int rw, struct bio *bio)
+struct bio *mpage_bio_submit(int rw, struct bio *bio)
{
bio->bi_end_io = mpage_end_io_read;
if (rw == WRITE)
@@ -90,6 +90,7 @@ static struct bio *mpage_bio_submit(int rw, struct bio *bio)
submit_bio(rw, bio);
return NULL;
}
+EXPORT_SYMBOL(mpage_bio_submit);
static struct bio *
mpage_alloc(struct block_device *bdev,
@@ -435,15 +436,9 @@ EXPORT_SYMBOL(mpage_readpage);
* written, so it can intelligently allocate a suitably-sized BIO. For now,
* just allocate full-size (16-page) BIOs.
*/
-struct mpage_data {
- struct bio *bio;
- sector_t last_block_in_bio;
- get_block_t *get_block;
- unsigned use_writepage;
-};
-static int __mpage_writepage(struct page *page, struct writeback_control *wbc,
- void *data)
+int __mpage_writepage(struct page *page, struct writeback_control *wbc,
+ void *data)
{
struct mpage_data *mpd = data;
struct bio *bio = mpd->bio;
@@ -651,6 +646,7 @@ out:
mpd->bio = bio;
return ret;
}
+EXPORT_SYMBOL(__mpage_writepage);
/**
* mpage_writepages - walk the list of dirty pages of the given address space & writepage() all of them
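Exporting __mpage_writepage(), mpage_bio_submit() and struct mpage_data lets a filesystem drive the mpage machinery from its own writepages method, mirroring what mpage_writepages() does internally. A sketch (myfs_get_block is the filesystem's get_block_t; error handling trimmed):

static int myfs_writepages(struct address_space *mapping,
                           struct writeback_control *wbc)
{
        struct mpage_data mpd = {
                .bio                    = NULL,
                .last_block_in_bio      = 0,
                .get_block              = myfs_get_block,
                .use_writepage          = 1,
        };
        int ret;

        ret = write_cache_pages(mapping, wbc, __mpage_writepage, &mpd);
        if (mpd.bio)
                mpage_bio_submit(WRITE, mpd.bio);       /* flush last bio */
        return ret;
}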
diff --git a/include/drm/drmP.h b/include/drm/drmP.h
index 0764b662b339..1c1b13e29223 100644
--- a/include/drm/drmP.h
+++ b/include/drm/drmP.h
@@ -1089,6 +1089,7 @@ extern int drm_mm_remove_space_from_tail(struct drm_mm *mm, unsigned long size);
extern int drm_mm_add_space_to_tail(struct drm_mm *mm, unsigned long size);
extern void drm_core_ioremap(struct drm_map *map, struct drm_device *dev);
+extern void drm_core_ioremap_wc(struct drm_map *map, struct drm_device *dev);
extern void drm_core_ioremapfree(struct drm_map *map, struct drm_device *dev);
static __inline__ struct drm_map *drm_core_findmap(struct drm_device *dev,
diff --git a/include/linux/fs.h b/include/linux/fs.h
index faac13e2cc5c..52e510a0aec2 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1740,6 +1740,8 @@ extern int wait_on_page_writeback_range(struct address_space *mapping,
pgoff_t start, pgoff_t end);
extern int __filemap_fdatawrite_range(struct address_space *mapping,
loff_t start, loff_t end, int sync_mode);
+extern int filemap_fdatawrite_range(struct address_space *mapping,
+ loff_t start, loff_t end);
extern long do_fsync(struct file *file, int datasync);
extern void sync_supers(void);
diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index d147f0f90360..3dd209007098 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -168,6 +168,8 @@ struct commit_header {
unsigned char h_chksum_size;
unsigned char h_padding[2];
__be32 h_chksum[JBD2_CHECKSUM_BYTES];
+ __be64 h_commit_sec;
+ __be32 h_commit_nsec;
};
/*
@@ -379,6 +381,38 @@ static inline void jbd_unlock_bh_journal_head(struct buffer_head *bh)
bit_spin_unlock(BH_JournalHead, &bh->b_state);
}
+/* Flags in jbd_inode->i_flags */
+#define __JI_COMMIT_RUNNING 0
+/* Commit of the inode data is in progress. We use this flag to protect us
+ * from concurrent deletion of the inode. We cannot take a reference to the
+ * inode for this since we cannot afford doing the last iput() on behalf of
+ * kjournald.
+ */
+#define JI_COMMIT_RUNNING (1 << __JI_COMMIT_RUNNING)
+
+/**
+ * struct jbd2_inode is the structure linking inodes present in a
+ * transaction in ordered mode so that we can sync them during commit.
+ */
+struct jbd2_inode {
+ /* Which transaction does this inode belong to? Either the running
+ * transaction or the committing one. [j_list_lock] */
+ transaction_t *i_transaction;
+
+ /* Pointer to the running transaction modifying inode's data in case
+ * there is already a committing transaction touching it. [j_list_lock] */
+ transaction_t *i_next_transaction;
+
+ /* List of inodes in the i_transaction [j_list_lock] */
+ struct list_head i_list;
+
+ /* VFS inode this inode belongs to [constant during the lifetime
+ * of the structure] */
+ struct inode *i_vfs_inode;
+
+ /* Flags of inode [j_list_lock] */
+ unsigned int i_flags;
+};
+
struct jbd2_revoke_table_s;
/**
@@ -509,24 +543,12 @@ struct transaction_s
struct journal_head *t_reserved_list;
/*
- * Doubly-linked circular list of all buffers under writeout during
- * commit [j_list_lock]
- */
- struct journal_head *t_locked_list;
-
- /*
* Doubly-linked circular list of all metadata buffers owned by this
* transaction [j_list_lock]
*/
struct journal_head *t_buffers;
/*
- * Doubly-linked circular list of all data buffers still to be
- * flushed before this transaction can be committed [j_list_lock]
- */
- struct journal_head *t_sync_datalist;
-
- /*
* Doubly-linked circular list of all forget buffers (superseded
* buffers which we can un-checkpoint once this transaction commits)
* [j_list_lock]
@@ -565,6 +587,12 @@ struct transaction_s
struct journal_head *t_log_list;
/*
+ * List of inodes whose data we've modified in data=ordered mode.
+ * [j_list_lock]
+ */
+ struct list_head t_inode_list;
+
+ /*
* Protects info related to handles
*/
spinlock_t t_handle_lock;
@@ -1004,7 +1032,6 @@ extern int jbd2_journal_extend (handle_t *, int nblocks);
extern int jbd2_journal_get_write_access(handle_t *, struct buffer_head *);
extern int jbd2_journal_get_create_access (handle_t *, struct buffer_head *);
extern int jbd2_journal_get_undo_access(handle_t *, struct buffer_head *);
-extern int jbd2_journal_dirty_data (handle_t *, struct buffer_head *);
extern int jbd2_journal_dirty_metadata (handle_t *, struct buffer_head *);
extern void jbd2_journal_release_buffer (handle_t *, struct buffer_head *);
extern int jbd2_journal_forget (handle_t *, struct buffer_head *);
@@ -1044,6 +1071,10 @@ extern void jbd2_journal_ack_err (journal_t *);
extern int jbd2_journal_clear_err (journal_t *);
extern int jbd2_journal_bmap(journal_t *, unsigned long, unsigned long long *);
extern int jbd2_journal_force_commit(journal_t *);
+extern int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *inode);
+extern int jbd2_journal_begin_ordered_truncate(struct jbd2_inode *inode, loff_t new_size);
+extern void jbd2_journal_init_jbd_inode(struct jbd2_inode *jinode, struct inode *inode);
+extern void jbd2_journal_release_jbd_inode(journal_t *journal, struct jbd2_inode *jinode);
/*
* journal_head management
@@ -1179,15 +1210,13 @@ static inline int jbd_space_needed(journal_t *journal)
/* journaling buffer types */
#define BJ_None 0 /* Not journaled */
-#define BJ_SyncData 1 /* Normal data: flush before commit */
-#define BJ_Metadata 2 /* Normal journaled metadata */
-#define BJ_Forget 3 /* Buffer superseded by this transaction */
-#define BJ_IO 4 /* Buffer is for temporary IO use */
-#define BJ_Shadow 5 /* Buffer contents being shadowed to the log */
-#define BJ_LogCtl 6 /* Buffer contains log descriptors */
-#define BJ_Reserved 7 /* Buffer is reserved for access by journal */
-#define BJ_Locked 8 /* Locked for I/O during commit */
-#define BJ_Types 9
+#define BJ_Metadata 1 /* Normal journaled metadata */
+#define BJ_Forget 2 /* Buffer superseded by this transaction */
+#define BJ_IO 3 /* Buffer is for temporary IO use */
+#define BJ_Shadow 4 /* Buffer contents being shadowed to the log */
+#define BJ_LogCtl 5 /* Buffer contains log descriptors */
+#define BJ_Reserved 6 /* Buffer is reserved for access by journal */
+#define BJ_Types 7
extern int jbd_blocks_per_page(struct inode *inode);
diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h
index a744383d16e9..81b3dd5206e0 100644
--- a/include/linux/mlx4/device.h
+++ b/include/linux/mlx4/device.h
@@ -398,7 +398,8 @@ int mlx4_srq_query(struct mlx4_dev *dev, struct mlx4_srq *srq, int *limit_waterm
int mlx4_INIT_PORT(struct mlx4_dev *dev, int port);
int mlx4_CLOSE_PORT(struct mlx4_dev *dev, int port);
-int mlx4_multicast_attach(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16]);
+int mlx4_multicast_attach(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16],
+ int block_mcast_loopback);
int mlx4_multicast_detach(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16]);
int mlx4_map_phys_fmr(struct mlx4_dev *dev, struct mlx4_fmr *fmr, u64 *page_list,
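The extra argument lets the low-level driver ask the HCA not to deliver a QP's own multicast sends back to it; on the verbs side this is driven by the IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK flag defined later in this same diff (include/rdma/ib_verbs.h). A consumer might opt in roughly as below (QP attributes trimmed to the relevant fields; names illustrative):

static struct ib_qp *example_create_mcast_qp(struct ib_pd *pd,
                                             struct ib_cq *send_cq,
                                             struct ib_cq *recv_cq)
{
        struct ib_qp_init_attr init_attr = {
                .qp_type        = IB_QPT_UD,
                .send_cq        = send_cq,
                .recv_cq        = recv_cq,
                /* Ask the HCA not to loop our own multicast sends back. */
                .create_flags   = IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK,
        };

        return ib_create_qp(pd, &init_attr);
}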
diff --git a/include/linux/mpage.h b/include/linux/mpage.h
index 068a0c9946af..5c42821da2d1 100644
--- a/include/linux/mpage.h
+++ b/include/linux/mpage.h
@@ -11,11 +11,21 @@
*/
#ifdef CONFIG_BLOCK
+struct mpage_data {
+ struct bio *bio;
+ sector_t last_block_in_bio;
+ get_block_t *get_block;
+ unsigned use_writepage;
+};
+
struct writeback_control;
+struct bio *mpage_bio_submit(int rw, struct bio *bio);
int mpage_readpages(struct address_space *mapping, struct list_head *pages,
unsigned nr_pages, get_block_t get_block);
int mpage_readpage(struct page *page, get_block_t get_block);
+int __mpage_writepage(struct page *page, struct writeback_control *wbc,
+ void *data);
int mpage_writepages(struct address_space *mapping,
struct writeback_control *wbc, get_block_t get_block);
int mpage_writepage(struct page *page, get_block_t *get_block,
diff --git a/include/linux/percpu_counter.h b/include/linux/percpu_counter.h
index 9007ccdfc112..208388835357 100644
--- a/include/linux/percpu_counter.h
+++ b/include/linux/percpu_counter.h
@@ -35,7 +35,7 @@ int percpu_counter_init_irq(struct percpu_counter *fbc, s64 amount);
void percpu_counter_destroy(struct percpu_counter *fbc);
void percpu_counter_set(struct percpu_counter *fbc, s64 amount);
void __percpu_counter_add(struct percpu_counter *fbc, s64 amount, s32 batch);
-s64 __percpu_counter_sum(struct percpu_counter *fbc);
+s64 __percpu_counter_sum(struct percpu_counter *fbc, int set);
static inline void percpu_counter_add(struct percpu_counter *fbc, s64 amount)
{
@@ -44,13 +44,19 @@ static inline void percpu_counter_add(struct percpu_counter *fbc, s64 amount)
static inline s64 percpu_counter_sum_positive(struct percpu_counter *fbc)
{
- s64 ret = __percpu_counter_sum(fbc);
+ s64 ret = __percpu_counter_sum(fbc, 0);
return ret < 0 ? 0 : ret;
}
+static inline s64 percpu_counter_sum_and_set(struct percpu_counter *fbc)
+{
+ return __percpu_counter_sum(fbc, 1);
+}
+
+
static inline s64 percpu_counter_sum(struct percpu_counter *fbc)
{
- return __percpu_counter_sum(fbc);
+ return __percpu_counter_sum(fbc, 0);
}
static inline s64 percpu_counter_read(struct percpu_counter *fbc)
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index bd91987c065f..12b15c561a1f 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -63,6 +63,7 @@ struct writeback_control {
unsigned for_writepages:1; /* This is a writepages() call */
unsigned range_cyclic:1; /* range_start is cyclic */
unsigned more_io:1; /* more io to be dispatched */
+ unsigned range_cont:1;
};
/*
diff --git a/include/rdma/ib_addr.h b/include/rdma/ib_addr.h
index c36750ff6ae8..483057b2f4b4 100644
--- a/include/rdma/ib_addr.h
+++ b/include/rdma/ib_addr.h
@@ -2,29 +2,33 @@
* Copyright (c) 2005 Voltaire Inc. All rights reserved.
* Copyright (c) 2005 Intel Corporation. All rights reserved.
*
- * This Software is licensed under one of the following licenses:
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
*
- * 1) under the terms of the "Common Public License 1.0" a copy of which is
- * available from the Open Source Initiative, see
- * http://www.opensource.org/licenses/cpl.php.
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
*
- * 2) under the terms of the "The BSD License" a copy of which is
- * available from the Open Source Initiative, see
- * http://www.opensource.org/licenses/bsd-license.php.
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
*
- * 3) under the terms of the "GNU General Public License (GPL) Version 2" a
- * copy of which is available from the Open Source Initiative, see
- * http://www.opensource.org/licenses/gpl-license.php.
- *
- * Licensee has the right to choose one of the above licenses.
- *
- * Redistributions of source code must retain the above copyright
- * notice and one of the license notices.
- *
- * Redistributions in binary form must reproduce both the above copyright
- * notice, one of the license notices in the documentation
- * and/or other materials provided with the distribution.
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
*
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
*/
#if !defined(IB_ADDR_H)
@@ -57,6 +61,7 @@ struct rdma_dev_addr {
unsigned char dst_dev_addr[MAX_ADDR_LEN];
unsigned char broadcast[MAX_ADDR_LEN];
enum rdma_node_type dev_type;
+ struct net_device *src_dev;
};
/**
diff --git a/include/rdma/ib_cache.h b/include/rdma/ib_cache.h
index f179d233ffc3..00a2b8ec327f 100644
--- a/include/rdma/ib_cache.h
+++ b/include/rdma/ib_cache.h
@@ -30,8 +30,6 @@
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
- *
- * $Id: ib_cache.h 1349 2004-12-16 21:09:43Z roland $
*/
#ifndef _IB_CACHE_H
diff --git a/include/rdma/ib_cm.h b/include/rdma/ib_cm.h
index a627c8682d2f..ec7c6d99ed3f 100644
--- a/include/rdma/ib_cm.h
+++ b/include/rdma/ib_cm.h
@@ -31,8 +31,6 @@
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
- *
- * $Id: ib_cm.h 4311 2005-12-05 18:42:01Z sean.hefty $
*/
#if !defined(IB_CM_H)
#define IB_CM_H
diff --git a/include/rdma/ib_fmr_pool.h b/include/rdma/ib_fmr_pool.h
index 00dadbf94e1d..f62b842e6596 100644
--- a/include/rdma/ib_fmr_pool.h
+++ b/include/rdma/ib_fmr_pool.h
@@ -29,8 +29,6 @@
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
- *
- * $Id: ib_fmr_pool.h 2730 2005-06-28 16:43:03Z sean.hefty $
*/
#if !defined(IB_FMR_POOL_H)
@@ -61,7 +59,7 @@ struct ib_fmr_pool_param {
int pool_size;
int dirty_watermark;
void (*flush_function)(struct ib_fmr_pool *pool,
- void * arg);
+ void *arg);
void *flush_arg;
unsigned cache:1;
};
diff --git a/include/rdma/ib_mad.h b/include/rdma/ib_mad.h
index 7228c056b9e9..5f6c40fffcf4 100644
--- a/include/rdma/ib_mad.h
+++ b/include/rdma/ib_mad.h
@@ -32,11 +32,9 @@
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
- *
- * $Id: ib_mad.h 5596 2006-03-03 01:00:07Z sean.hefty $
*/
-#if !defined( IB_MAD_H )
+#if !defined(IB_MAD_H)
#define IB_MAD_H
#include <linux/list.h>
@@ -194,8 +192,7 @@ struct ib_vendor_mad {
u8 data[IB_MGMT_VENDOR_DATA];
};
-struct ib_class_port_info
-{
+struct ib_class_port_info {
u8 base_version;
u8 class_version;
__be16 capability_mask;
@@ -614,11 +611,11 @@ int ib_process_mad_wc(struct ib_mad_agent *mad_agent,
* any class specific header, and MAD data area.
* If @rmpp_active is set, the RMPP header will be initialized for sending.
*/
-struct ib_mad_send_buf * ib_create_send_mad(struct ib_mad_agent *mad_agent,
- u32 remote_qpn, u16 pkey_index,
- int rmpp_active,
- int hdr_len, int data_len,
- gfp_t gfp_mask);
+struct ib_mad_send_buf *ib_create_send_mad(struct ib_mad_agent *mad_agent,
+ u32 remote_qpn, u16 pkey_index,
+ int rmpp_active,
+ int hdr_len, int data_len,
+ gfp_t gfp_mask);
/**
* ib_is_mad_class_rmpp - returns whether given management class
diff --git a/include/rdma/ib_pack.h b/include/rdma/ib_pack.h
index f926020d6331..d7fc45c4eba9 100644
--- a/include/rdma/ib_pack.h
+++ b/include/rdma/ib_pack.h
@@ -28,8 +28,6 @@
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
- *
- * $Id: ib_pack.h 1349 2004-12-16 21:09:43Z roland $
*/
#ifndef IB_PACK_H
diff --git a/include/rdma/ib_sa.h b/include/rdma/ib_sa.h
index 942692b0b92e..3841c1aff692 100644
--- a/include/rdma/ib_sa.h
+++ b/include/rdma/ib_sa.h
@@ -30,8 +30,6 @@
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
- *
- * $Id: ib_sa.h 2811 2005-07-06 18:11:43Z halr $
*/
#ifndef IB_SA_H
diff --git a/include/rdma/ib_smi.h b/include/rdma/ib_smi.h
index f29af135ba83..aaca0878668f 100644
--- a/include/rdma/ib_smi.h
+++ b/include/rdma/ib_smi.h
@@ -32,11 +32,9 @@
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
- *
- * $Id: ib_smi.h 1389 2004-12-27 22:56:47Z roland $
*/
-#if !defined( IB_SMI_H )
+#if !defined(IB_SMI_H)
#define IB_SMI_H
#include <rdma/ib_mad.h>
diff --git a/include/rdma/ib_user_cm.h b/include/rdma/ib_user_cm.h
index 37650afb982c..bd3d380781e0 100644
--- a/include/rdma/ib_user_cm.h
+++ b/include/rdma/ib_user_cm.h
@@ -29,8 +29,6 @@
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
- *
- * $Id: ib_user_cm.h 4019 2005-11-11 00:33:09Z sean.hefty $
*/
#ifndef IB_USER_CM_H
diff --git a/include/rdma/ib_user_mad.h b/include/rdma/ib_user_mad.h
index 29d2c7205a90..d6fce1cbdb90 100644
--- a/include/rdma/ib_user_mad.h
+++ b/include/rdma/ib_user_mad.h
@@ -29,8 +29,6 @@
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
- *
- * $Id: ib_user_mad.h 2814 2005-07-06 19:14:09Z halr $
*/
#ifndef IB_USER_MAD_H
diff --git a/include/rdma/ib_user_verbs.h b/include/rdma/ib_user_verbs.h
index 8d65bf0a625b..a17f77106149 100644
--- a/include/rdma/ib_user_verbs.h
+++ b/include/rdma/ib_user_verbs.h
@@ -31,8 +31,6 @@
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
- *
- * $Id: ib_user_verbs.h 4019 2005-11-11 00:33:09Z sean.hefty $
*/
#ifndef IB_USER_VERBS_H
@@ -291,7 +289,10 @@ struct ib_uverbs_wc {
__u32 opcode;
__u32 vendor_err;
__u32 byte_len;
- __u32 imm_data;
+ union {
+ __u32 imm_data;
+ __u32 invalidate_rkey;
+ } ex;
__u32 qp_num;
__u32 src_qp;
__u32 wc_flags;
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 31d30b1852e8..90b529f7a154 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -34,8 +34,6 @@
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
- *
- * $Id: ib_verbs.h 1349 2004-12-16 21:09:43Z roland $
*/
#if !defined(IB_VERBS_H)
@@ -93,7 +91,7 @@ enum ib_device_cap_flags {
IB_DEVICE_RC_RNR_NAK_GEN = (1<<12),
IB_DEVICE_SRQ_RESIZE = (1<<13),
IB_DEVICE_N_NOTIFY_CQ = (1<<14),
- IB_DEVICE_ZERO_STAG = (1<<15),
+ IB_DEVICE_LOCAL_DMA_LKEY = (1<<15),
IB_DEVICE_RESERVED = (1<<16), /* old SEND_W_INV */
IB_DEVICE_MEM_WINDOW = (1<<17),
/*
@@ -105,6 +103,8 @@ enum ib_device_cap_flags {
*/
IB_DEVICE_UD_IP_CSUM = (1<<18),
IB_DEVICE_UD_TSO = (1<<19),
+ IB_DEVICE_MEM_MGT_EXTENSIONS = (1<<21),
+ IB_DEVICE_BLOCK_MULTICAST_LOOPBACK = (1<<22),
};
enum ib_atomic_cap {
@@ -150,6 +150,7 @@ struct ib_device_attr {
int max_srq;
int max_srq_wr;
int max_srq_sge;
+ unsigned int max_fast_reg_page_list_len;
u16 max_pkeys;
u8 local_ca_ack_delay;
};
@@ -226,6 +227,57 @@ static inline int ib_width_enum_to_int(enum ib_port_width width)
}
}
+struct ib_protocol_stats {
+ /* TBD... */
+};
+
+struct iw_protocol_stats {
+ u64 ipInReceives;
+ u64 ipInHdrErrors;
+ u64 ipInTooBigErrors;
+ u64 ipInNoRoutes;
+ u64 ipInAddrErrors;
+ u64 ipInUnknownProtos;
+ u64 ipInTruncatedPkts;
+ u64 ipInDiscards;
+ u64 ipInDelivers;
+ u64 ipOutForwDatagrams;
+ u64 ipOutRequests;
+ u64 ipOutDiscards;
+ u64 ipOutNoRoutes;
+ u64 ipReasmTimeout;
+ u64 ipReasmReqds;
+ u64 ipReasmOKs;
+ u64 ipReasmFails;
+ u64 ipFragOKs;
+ u64 ipFragFails;
+ u64 ipFragCreates;
+ u64 ipInMcastPkts;
+ u64 ipOutMcastPkts;
+ u64 ipInBcastPkts;
+ u64 ipOutBcastPkts;
+
+ u64 tcpRtoAlgorithm;
+ u64 tcpRtoMin;
+ u64 tcpRtoMax;
+ u64 tcpMaxConn;
+ u64 tcpActiveOpens;
+ u64 tcpPassiveOpens;
+ u64 tcpAttemptFails;
+ u64 tcpEstabResets;
+ u64 tcpCurrEstab;
+ u64 tcpInSegs;
+ u64 tcpOutSegs;
+ u64 tcpRetransSegs;
+ u64 tcpInErrs;
+ u64 tcpOutRsts;
+};
+
+union rdma_protocol_stats {
+ struct ib_protocol_stats ib;
+ struct iw_protocol_stats iw;
+};
+
struct ib_port_attr {
enum ib_port_state state;
enum ib_mtu max_mtu;
@@ -413,6 +465,8 @@ enum ib_wc_opcode {
IB_WC_FETCH_ADD,
IB_WC_BIND_MW,
IB_WC_LSO,
+ IB_WC_LOCAL_INV,
+ IB_WC_FAST_REG_MR,
/*
* Set value of IB_WC_RECV so consumers can test if a completion is a
* receive by testing (opcode & IB_WC_RECV).
@@ -423,7 +477,8 @@ enum ib_wc_opcode {
enum ib_wc_flags {
IB_WC_GRH = 1,
- IB_WC_WITH_IMM = (1<<1)
+ IB_WC_WITH_IMM = (1<<1),
+ IB_WC_WITH_INVALIDATE = (1<<2),
};
struct ib_wc {
@@ -433,7 +488,10 @@ struct ib_wc {
u32 vendor_err;
u32 byte_len;
struct ib_qp *qp;
- __be32 imm_data;
+ union {
+ __be32 imm_data;
+ u32 invalidate_rkey;
+ } ex;
u32 src_qp;
int wc_flags;
u16 pkey_index;
@@ -498,7 +556,8 @@ enum ib_qp_type {
};
enum ib_qp_create_flags {
- IB_QP_CREATE_IPOIB_UD_LSO = 1 << 0,
+ IB_QP_CREATE_IPOIB_UD_LSO = 1 << 0,
+ IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK = 1 << 1,
};
struct ib_qp_init_attr {
@@ -627,6 +686,9 @@ enum ib_wr_opcode {
IB_WR_ATOMIC_FETCH_AND_ADD,
IB_WR_LSO,
IB_WR_SEND_WITH_INV,
+ IB_WR_RDMA_READ_WITH_INV,
+ IB_WR_LOCAL_INV,
+ IB_WR_FAST_REG_MR,
};
enum ib_send_flags {
@@ -643,6 +705,12 @@ struct ib_sge {
u32 lkey;
};
+struct ib_fast_reg_page_list {
+ struct ib_device *device;
+ u64 *page_list;
+ unsigned int max_page_list_len;
+};
+
struct ib_send_wr {
struct ib_send_wr *next;
u64 wr_id;
@@ -675,6 +743,15 @@ struct ib_send_wr {
u16 pkey_index; /* valid for GSI only */
u8 port_num; /* valid for DR SMPs on switch only */
} ud;
+ struct {
+ u64 iova_start;
+ struct ib_fast_reg_page_list *page_list;
+ unsigned int page_shift;
+ unsigned int page_list_len;
+ u32 length;
+ int access_flags;
+ u32 rkey;
+ } fast_reg;
} wr;
};
@@ -777,7 +854,7 @@ struct ib_cq {
struct ib_uobject *uobject;
ib_comp_handler comp_handler;
void (*event_handler)(struct ib_event *, void *);
- void * cq_context;
+ void *cq_context;
int cqe;
atomic_t usecnt; /* count number of work queues */
};
@@ -883,7 +960,7 @@ struct ib_dma_mapping_ops {
void (*sync_single_for_cpu)(struct ib_device *dev,
u64 dma_handle,
size_t size,
- enum dma_data_direction dir);
+ enum dma_data_direction dir);
void (*sync_single_for_device)(struct ib_device *dev,
u64 dma_handle,
size_t size,
@@ -919,6 +996,8 @@ struct ib_device {
struct iw_cm_verbs *iwcm;
+ int (*get_protocol_stats)(struct ib_device *device,
+ union rdma_protocol_stats *stats);
int (*query_device)(struct ib_device *device,
struct ib_device_attr *device_attr);
int (*query_port)(struct ib_device *device,
@@ -1013,6 +1092,11 @@ struct ib_device {
int (*query_mr)(struct ib_mr *mr,
struct ib_mr_attr *mr_attr);
int (*dereg_mr)(struct ib_mr *mr);
+ struct ib_mr * (*alloc_fast_reg_mr)(struct ib_pd *pd,
+ int max_page_list_len);
+ struct ib_fast_reg_page_list * (*alloc_fast_reg_page_list)(struct ib_device *device,
+ int page_list_len);
+ void (*free_fast_reg_page_list)(struct ib_fast_reg_page_list *page_list);
int (*rereg_phys_mr)(struct ib_mr *mr,
int mr_rereg_mask,
struct ib_pd *pd,
@@ -1065,6 +1149,7 @@ struct ib_device {
char node_desc[64];
__be64 node_guid;
+ u32 local_dma_lkey;
u8 node_type;
u8 phys_port_cnt;
};
@@ -1807,6 +1892,54 @@ int ib_query_mr(struct ib_mr *mr, struct ib_mr_attr *mr_attr);
int ib_dereg_mr(struct ib_mr *mr);
/**
+ * ib_alloc_fast_reg_mr - Allocates memory region usable with the
+ * IB_WR_FAST_REG_MR send work request.
+ * @pd: The protection domain associated with the region.
+ * @max_page_list_len: requested max physical buffer list length to be
+ * used with fast register work requests for this MR.
+ */
+struct ib_mr *ib_alloc_fast_reg_mr(struct ib_pd *pd, int max_page_list_len);
+
+/**
+ * ib_alloc_fast_reg_page_list - Allocates a page list array
+ * @device - ib device pointer.
+ * @page_list_len - size of the page list array to be allocated.
+ *
+ * This allocates and returns a struct ib_fast_reg_page_list * and a
+ * page_list array that is at least page_list_len in size. The actual
+ * size is returned in max_page_list_len. The caller is responsible
+ * for initializing the contents of the page_list array before posting
+ * a send work request with the IB_WR_FAST_REG_MR opcode.
+ *
+ * The page_list array entries must be translated using one of the
+ * ib_dma_*() functions just like the addresses passed to
+ * ib_map_phys_fmr(). Once the ib_post_send() is issued, the struct
+ * ib_fast_reg_page_list must not be modified by the caller until the
+ * IB_WR_FAST_REG_MR work request completes.
+ */
+struct ib_fast_reg_page_list *ib_alloc_fast_reg_page_list(
+ struct ib_device *device, int page_list_len);
+
+/**
+ * ib_free_fast_reg_page_list - Deallocates a previously allocated
+ * page list array.
+ * @page_list - struct ib_fast_reg_page_list pointer to be deallocated.
+ */
+void ib_free_fast_reg_page_list(struct ib_fast_reg_page_list *page_list);
+
+/**
+ * ib_update_fast_reg_key - updates the key portion of the fast_reg MR
+ * R_Key and L_Key.
+ * @mr - struct ib_mr pointer to be updated.
+ * @newkey - new key to be used.
+ */
+static inline void ib_update_fast_reg_key(struct ib_mr *mr, u8 newkey)
+{
+ mr->lkey = (mr->lkey & 0xffffff00) | newkey;
+ mr->rkey = (mr->rkey & 0xffffff00) | newkey;
+}
+
+/**
* ib_alloc_mw - Allocates a memory window.
* @pd: The protection domain associated with the memory window.
*/
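Putting the fast-registration pieces together: a consumer allocates the MR and page list once, fills the page list with DMA addresses, bumps the key byte, and posts an IB_WR_FAST_REG_MR work request. A hedged sketch of the posting path (pd, qp, dma_addr[], npages, iova and key are assumed to exist; allocation error handling omitted):

static int example_fast_reg(struct ib_pd *pd, struct ib_qp *qp,
                            u64 *dma_addr, int npages, u64 iova, u8 key)
{
        struct ib_fast_reg_page_list *frpl;
        struct ib_send_wr wr, *bad_wr;
        struct ib_mr *mr;
        int i;

        mr = ib_alloc_fast_reg_mr(pd, npages);
        frpl = ib_alloc_fast_reg_page_list(pd->device, npages);

        for (i = 0; i < npages; i++)
                frpl->page_list[i] = dma_addr[i];   /* ib_dma_map_*() results */

        /* Refresh the key byte so stale rkeys from older registrations fail. */
        ib_update_fast_reg_key(mr, key);

        memset(&wr, 0, sizeof wr);
        wr.opcode                       = IB_WR_FAST_REG_MR;
        wr.send_flags                   = IB_SEND_SIGNALED;
        wr.wr.fast_reg.iova_start       = iova;
        wr.wr.fast_reg.page_list        = frpl;
        wr.wr.fast_reg.page_shift       = PAGE_SHIFT;
        wr.wr.fast_reg.page_list_len    = npages;
        wr.wr.fast_reg.length           = npages * PAGE_SIZE;
        wr.wr.fast_reg.access_flags     = IB_ACCESS_LOCAL_WRITE |
                                          IB_ACCESS_REMOTE_WRITE;
        wr.wr.fast_reg.rkey             = mr->rkey;

        return ib_post_send(qp, &wr, &bad_wr);
}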
diff --git a/include/rdma/iw_cm.h b/include/rdma/iw_cm.h
index aeefa9b740dc..cbb822e8d791 100644
--- a/include/rdma/iw_cm.h
+++ b/include/rdma/iw_cm.h
@@ -62,7 +62,7 @@ struct iw_cm_event {
struct sockaddr_in remote_addr;
void *private_data;
u8 private_data_len;
- void* provider_data;
+ void *provider_data;
};
/**
diff --git a/include/rdma/rdma_cm.h b/include/rdma/rdma_cm.h
index 010f876f41d8..22bb2e7bab1a 100644
--- a/include/rdma/rdma_cm.h
+++ b/include/rdma/rdma_cm.h
@@ -2,29 +2,33 @@
* Copyright (c) 2005 Voltaire Inc. All rights reserved.
* Copyright (c) 2005 Intel Corporation. All rights reserved.
*
- * This Software is licensed under one of the following licenses:
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
*
- * 1) under the terms of the "Common Public License 1.0" a copy of which is
- * available from the Open Source Initiative, see
- * http://www.opensource.org/licenses/cpl.php.
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
*
- * 2) under the terms of the "The BSD License" a copy of which is
- * available from the Open Source Initiative, see
- * http://www.opensource.org/licenses/bsd-license.php.
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
*
- * 3) under the terms of the "GNU General Public License (GPL) Version 2" a
- * copy of which is available from the Open Source Initiative, see
- * http://www.opensource.org/licenses/gpl-license.php.
- *
- * Licensee has the right to choose one of the above licenses.
- *
- * Redistributions of source code must retain the above copyright
- * notice and one of the license notices.
- *
- * Redistributions in binary form must reproduce both the above copyright
- * notice, one of the license notices in the documentation
- * and/or other materials provided with the distribution.
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
*
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
*/
#if !defined(RDMA_CM_H)
@@ -57,11 +61,11 @@ enum rdma_cm_event_type {
};
enum rdma_port_space {
- RDMA_PS_SDP = 0x0001,
- RDMA_PS_IPOIB= 0x0002,
- RDMA_PS_TCP = 0x0106,
- RDMA_PS_UDP = 0x0111,
- RDMA_PS_SCTP = 0x0183
+ RDMA_PS_SDP = 0x0001,
+ RDMA_PS_IPOIB = 0x0002,
+ RDMA_PS_TCP = 0x0106,
+ RDMA_PS_UDP = 0x0111,
+ RDMA_PS_SCTP = 0x0183
};
struct rdma_addr {
diff --git a/include/rdma/rdma_cm_ib.h b/include/rdma/rdma_cm_ib.h
index 950424b38f16..2389c3b45404 100644
--- a/include/rdma/rdma_cm_ib.h
+++ b/include/rdma/rdma_cm_ib.h
@@ -1,29 +1,33 @@
/*
* Copyright (c) 2006 Intel Corporation. All rights reserved.
*
- * This Software is licensed under one of the following licenses:
- *
- * 1) under the terms of the "Common Public License 1.0" a copy of which is
- * available from the Open Source Initiative, see
- * http://www.opensource.org/licenses/cpl.php.
- *
- * 2) under the terms of the "The BSD License" a copy of which is
- * available from the Open Source Initiative, see
- * http://www.opensource.org/licenses/bsd-license.php.
- *
- * 3) under the terms of the "GNU General Public License (GPL) Version 2" a
- * copy of which is available from the Open Source Initiative, see
- * http://www.opensource.org/licenses/gpl-license.php.
- *
- * Licensee has the right to choose one of the above licenses.
- *
- * Redistributions of source code must retain the above copyright
- * notice and one of the license notices.
- *
- * Redistributions in binary form must reproduce both the above copyright
- * notice, one of the license notices in the documentation
- * and/or other materials provided with the distribution.
- *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
*/
#if !defined(RDMA_CM_IB_H)
diff --git a/lib/percpu_counter.c b/lib/percpu_counter.c
index 119174494cb5..4a8ba4bf5f6f 100644
--- a/lib/percpu_counter.c
+++ b/lib/percpu_counter.c
@@ -52,7 +52,7 @@ EXPORT_SYMBOL(__percpu_counter_add);
* Add up all the per-cpu counts, return the result. This is a more accurate
* but much slower version of percpu_counter_read_positive()
*/
-s64 __percpu_counter_sum(struct percpu_counter *fbc)
+s64 __percpu_counter_sum(struct percpu_counter *fbc, int set)
{
s64 ret;
int cpu;
@@ -62,7 +62,12 @@ s64 __percpu_counter_sum(struct percpu_counter *fbc)
for_each_online_cpu(cpu) {
s32 *pcount = per_cpu_ptr(fbc->counters, cpu);
ret += *pcount;
+ if (set)
+ *pcount = 0;
}
+ if (set)
+ fbc->count = ret;
+
spin_unlock(&fbc->lock);
return ret;
}
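The new set argument folds the per-cpu deltas into fbc->count under the lock, so later cheap percpu_counter_read() calls start from a freshly exact value. The intended pattern looks roughly like this (an ext4-style free-blocks check; counter and threshold names are assumed, not from the patch):

static int example_has_free_blocks(struct percpu_counter *counter, s64 needed)
{
        if (percpu_counter_read(counter) < needed) {
                /* Slow path: take the exact sum and fold the per-cpu
                 * deltas into the main count while we are at it. */
                if (percpu_counter_sum_and_set(counter) < needed)
                        return 0;
        }
        return 1;
}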
diff --git a/mm/filemap.c b/mm/filemap.c
index 1e6a7d34874f..65d9d9e2b755 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -236,11 +236,12 @@ int filemap_fdatawrite(struct address_space *mapping)
}
EXPORT_SYMBOL(filemap_fdatawrite);
-static int filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
+int filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
loff_t end)
{
return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_ALL);
}
+EXPORT_SYMBOL(filemap_fdatawrite_range);
/**
* filemap_flush - mostly a non-blocking flush
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index b38f700825fc..94c6d8988ab3 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -960,6 +960,9 @@ retry:
}
if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
mapping->writeback_index = index;
+
+ if (wbc->range_cont)
+ wbc->range_start = index << PAGE_CACHE_SHIFT;
return ret;
}
EXPORT_SYMBOL(write_cache_pages);
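When range_cont is set, write_cache_pages() stores the next index back into wbc->range_start, so a caller writing bounded chunks can resume where the previous pass stopped. A sketch of that usage (chunk size and mapping supplied by the caller; generic_writepages() ends up in write_cache_pages()):

static int example_write_in_chunks(struct address_space *mapping,
                                   long chunk_pages)
{
        struct writeback_control wbc = {
                .sync_mode      = WB_SYNC_ALL,
                .range_start    = 0,
                .range_end      = LLONG_MAX,
                .range_cont     = 1,    /* advance range_start as we go */
        };
        int ret = 0;

        while (wbc.range_start < i_size_read(mapping->host)) {
                wbc.nr_to_write = chunk_pages;
                ret = generic_writepages(mapping, &wbc);
                if (ret || wbc.nr_to_write > 0)
                        break;          /* error, or nothing left in range */
        }
        return ret;
}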
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index 91200feb3f9c..63f131fc42e4 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -555,15 +555,13 @@ static int selinux_set_mnt_opts(struct super_block *sb,
struct task_security_struct *tsec = current->security;
struct superblock_security_struct *sbsec = sb->s_security;
const char *name = sb->s_type->name;
- struct dentry *root = sb->s_root;
- struct inode *root_inode = root->d_inode;
- struct inode_security_struct *root_isec = root_inode->i_security;
+ struct inode *inode = sbsec->sb->s_root->d_inode;
+ struct inode_security_struct *root_isec = inode->i_security;
u32 fscontext_sid = 0, context_sid = 0, rootcontext_sid = 0;
u32 defcontext_sid = 0;
char **mount_options = opts->mnt_opts;
int *flags = opts->mnt_opts_flags;
int num_opts = opts->num_mnt_opts;
- bool can_xattr = false;
mutex_lock(&sbsec->lock);
@@ -667,24 +665,14 @@ static int selinux_set_mnt_opts(struct super_block *sb,
goto out;
}
- if (strcmp(name, "proc") == 0)
+ if (strcmp(sb->s_type->name, "proc") == 0)
sbsec->proc = 1;
- /*
- * test if the fs supports xattrs, fs_use might make use of this if the
- * fs has no definition in policy.
- */
- if (root_inode->i_op->getxattr) {
- rc = root_inode->i_op->getxattr(root, XATTR_NAME_SELINUX, NULL, 0);
- if (rc >= 0 || rc == -ENODATA)
- can_xattr = true;
- }
-
/* Determine the labeling behavior to use for this filesystem type. */
- rc = security_fs_use(name, &sbsec->behavior, &sbsec->sid, can_xattr);
+ rc = security_fs_use(sb->s_type->name, &sbsec->behavior, &sbsec->sid);
if (rc) {
printk(KERN_WARNING "%s: security_fs_use(%s) returned %d\n",
- __func__, name, rc);
+ __func__, sb->s_type->name, rc);
goto out;
}
diff --git a/security/selinux/include/security.h b/security/selinux/include/security.h
index 44cba2e21dcf..7c543003d653 100644
--- a/security/selinux/include/security.h
+++ b/security/selinux/include/security.h
@@ -136,7 +136,7 @@ int security_get_allow_unknown(void);
#define SECURITY_FS_USE_MNTPOINT 6 /* use mountpoint labeling */
int security_fs_use(const char *fstype, unsigned int *behavior,
- u32 *sid, bool can_xattr);
+ u32 *sid);
int security_genfs_sid(const char *fstype, char *name, u16 sclass,
u32 *sid);
diff --git a/security/selinux/ss/services.c b/security/selinux/ss/services.c
index 8e42da120101..b52f923ce680 100644
--- a/security/selinux/ss/services.c
+++ b/security/selinux/ss/services.c
@@ -1934,8 +1934,7 @@ out:
int security_fs_use(
const char *fstype,
unsigned int *behavior,
- u32 *sid,
- bool can_xattr)
+ u32 *sid)
{
int rc = 0;
struct ocontext *c;
@@ -1949,7 +1948,6 @@ int security_fs_use(
c = c->next;
}
- /* look for labeling behavior defined in policy */
if (c) {
*behavior = c->v.behavior;
if (!c->sid[0]) {
@@ -1960,23 +1958,14 @@ int security_fs_use(
goto out;
}
*sid = c->sid[0];
- goto out;
- }
-
- /* labeling behavior not in policy, use xattrs if possible */
- if (can_xattr) {
- *behavior = SECURITY_FS_USE_XATTR;
- *sid = SECINITSID_FS;
- goto out;
- }
-
- /* no behavior in policy and can't use xattrs, try GENFS */
- rc = security_genfs_sid(fstype, "/", SECCLASS_DIR, sid);
- if (rc) {
- *behavior = SECURITY_FS_USE_NONE;
- rc = 0;
} else {
- *behavior = SECURITY_FS_USE_GENFS;
+ rc = security_genfs_sid(fstype, "/", SECCLASS_DIR, sid);
+ if (rc) {
+ *behavior = SECURITY_FS_USE_NONE;
+ rc = 0;
+ } else {
+ *behavior = SECURITY_FS_USE_GENFS;
+ }
}
out: