summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorDan Williams <dan.j.williams@intel.com>2016-05-21 12:33:04 -0700
committerDan Williams <dan.j.williams@intel.com>2016-05-21 12:33:04 -0700
commit36092ee8ba695fce023b2118ececa6c2a56b1331 (patch)
treeb9579893cdd559e7b72fa569003b19792de58fad
parent1b982baf75e7d9585967fcfccd05b77bf9054010 (diff)
parent03dca343afe080968d90c4d9196404b5bbbc8461 (diff)
downloadlinux-36092ee8ba695fce023b2118ececa6c2a56b1331.tar.bz2
Merge branch 'for-4.7/dax' into libnvdimm-for-next
-rw-r--r--block/ioctl.c32
-rw-r--r--drivers/Kconfig2
-rw-r--r--drivers/Makefile1
-rw-r--r--drivers/dax/Kconfig26
-rw-r--r--drivers/dax/Makefile4
-rw-r--r--drivers/dax/dax.c575
-rw-r--r--drivers/dax/dax.h24
-rw-r--r--drivers/dax/pmem.c158
-rw-r--r--drivers/nvdimm/bus.c9
-rw-r--r--drivers/nvdimm/claim.c23
-rw-r--r--drivers/nvdimm/core.c3
-rw-r--r--drivers/nvdimm/dax_devs.c35
-rw-r--r--drivers/nvdimm/dimm_devs.c5
-rw-r--r--drivers/nvdimm/nd-core.h3
-rw-r--r--drivers/nvdimm/nd.h11
-rw-r--r--drivers/nvdimm/pfn.h1
-rw-r--r--drivers/nvdimm/pfn_devs.c40
-rw-r--r--drivers/nvdimm/pmem.c3
-rw-r--r--drivers/nvdimm/region_devs.c5
-rw-r--r--fs/block_dev.c96
-rw-r--r--include/linux/fs.h8
-rw-r--r--include/uapi/linux/fs.h1
-rw-r--r--mm/huge_memory.c1
-rw-r--r--mm/hugetlb.c1
-rw-r--r--tools/testing/nvdimm/Kbuild9
-rw-r--r--tools/testing/nvdimm/config_check.c2
26 files changed, 935 insertions, 143 deletions
diff --git a/block/ioctl.c b/block/ioctl.c
index 4ff1f92f89ca..698c7933d582 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -407,35 +407,6 @@ static inline int is_unrecognized_ioctl(int ret)
ret == -ENOIOCTLCMD;
}
-#ifdef CONFIG_FS_DAX
-bool blkdev_dax_capable(struct block_device *bdev)
-{
- struct gendisk *disk = bdev->bd_disk;
-
- if (!disk->fops->direct_access)
- return false;
-
- /*
- * If the partition is not aligned on a page boundary, we can't
- * do dax I/O to it.
- */
- if ((bdev->bd_part->start_sect % (PAGE_SIZE / 512))
- || (bdev->bd_part->nr_sects % (PAGE_SIZE / 512)))
- return false;
-
- /*
- * If the device has known bad blocks, force all I/O through the
- * driver / page cache.
- *
- * TODO: support finer grained dax error handling
- */
- if (disk->bb && disk->bb->count)
- return false;
-
- return true;
-}
-#endif
-
static int blkdev_flushbuf(struct block_device *bdev, fmode_t mode,
unsigned cmd, unsigned long arg)
{
@@ -598,9 +569,6 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
case BLKTRACESETUP:
case BLKTRACETEARDOWN:
return blk_trace_ioctl(bdev, cmd, argp);
- case BLKDAXGET:
- return put_int(arg, !!(bdev->bd_inode->i_flags & S_DAX));
- break;
case IOC_PR_REGISTER:
return blkdev_pr_register(bdev, argp);
case IOC_PR_RESERVE:
diff --git a/drivers/Kconfig b/drivers/Kconfig
index d2ac339de85f..8298eab84a6f 100644
--- a/drivers/Kconfig
+++ b/drivers/Kconfig
@@ -190,6 +190,8 @@ source "drivers/android/Kconfig"
source "drivers/nvdimm/Kconfig"
+source "drivers/dax/Kconfig"
+
source "drivers/nvmem/Kconfig"
source "drivers/hwtracing/stm/Kconfig"
diff --git a/drivers/Makefile b/drivers/Makefile
index 8f5d076baeb0..0b6f3d60193d 100644
--- a/drivers/Makefile
+++ b/drivers/Makefile
@@ -66,6 +66,7 @@ obj-$(CONFIG_PARPORT) += parport/
obj-$(CONFIG_NVM) += lightnvm/
obj-y += base/ block/ misc/ mfd/ nfc/
obj-$(CONFIG_LIBNVDIMM) += nvdimm/
+obj-$(CONFIG_DEV_DAX) += dax/
obj-$(CONFIG_DMA_SHARED_BUFFER) += dma-buf/
obj-$(CONFIG_NUBUS) += nubus/
obj-y += macintosh/
diff --git a/drivers/dax/Kconfig b/drivers/dax/Kconfig
new file mode 100644
index 000000000000..cedab7572de3
--- /dev/null
+++ b/drivers/dax/Kconfig
@@ -0,0 +1,26 @@
+menuconfig DEV_DAX
+ tristate "DAX: direct access to differentiated memory"
+ default m if NVDIMM_DAX
+ depends on TRANSPARENT_HUGEPAGE
+ help
+ Support raw access to differentiated (persistence, bandwidth,
+ latency...) memory via an mmap(2) capable character
+ device. Platform firmware or a device driver may identify a
+ platform memory resource that is differentiated from the
+ baseline memory pool. Mappings of a /dev/daxX.Y device impose
+ restrictions that make the mapping behavior deterministic.
+
+if DEV_DAX
+
+config DEV_DAX_PMEM
+ tristate "PMEM DAX: direct access to persistent memory"
+ depends on NVDIMM_DAX
+ default DEV_DAX
+ help
+ Support raw access to persistent memory. Note that this
+ driver consumes memory ranges allocated and exported by the
+ libnvdimm sub-system.
+
+ Say Y if unsure
+
+endif
diff --git a/drivers/dax/Makefile b/drivers/dax/Makefile
new file mode 100644
index 000000000000..27c54e38478a
--- /dev/null
+++ b/drivers/dax/Makefile
@@ -0,0 +1,4 @@
+obj-$(CONFIG_DEV_DAX) += dax.o
+obj-$(CONFIG_DEV_DAX_PMEM) += dax_pmem.o
+
+dax_pmem-y := pmem.o
diff --git a/drivers/dax/dax.c b/drivers/dax/dax.c
new file mode 100644
index 000000000000..b891a129b275
--- /dev/null
+++ b/drivers/dax/dax.c
@@ -0,0 +1,575 @@
+/*
+ * Copyright(c) 2016 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+#include <linux/pagemap.h>
+#include <linux/module.h>
+#include <linux/device.h>
+#include <linux/pfn_t.h>
+#include <linux/slab.h>
+#include <linux/dax.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+
+static int dax_major;
+static struct class *dax_class;
+static DEFINE_IDA(dax_minor_ida);
+
+/**
+ * struct dax_region - mapping infrastructure for dax devices
+ * @id: kernel-wide unique region for a memory range
+ * @base: linear address corresponding to @res
+ * @kref: to pin while other agents have a need to do lookups
+ * @dev: parent device backing this region
+ * @align: allocation and mapping alignment for child dax devices
+ * @res: physical address range of the region
+ * @pfn_flags: identify whether the pfns are paged back or not
+ */
+struct dax_region {
+ int id;
+ struct ida ida;
+ void *base;
+ struct kref kref;
+ struct device *dev;
+ unsigned int align;
+ struct resource res;
+ unsigned long pfn_flags;
+};
+
+/**
+ * struct dax_dev - subdivision of a dax region
+ * @region - parent region
+ * @dev - device backing the character device
+ * @kref - enable this data to be tracked in filp->private_data
+ * @alive - !alive + rcu grace period == no new mappings can be established
+ * @id - child id in the region
+ * @num_resources - number of physical address extents in this device
+ * @res - array of physical address ranges
+ */
+struct dax_dev {
+ struct dax_region *region;
+ struct device *dev;
+ struct kref kref;
+ bool alive;
+ int id;
+ int num_resources;
+ struct resource res[0];
+};
+
+static void dax_region_free(struct kref *kref)
+{
+ struct dax_region *dax_region;
+
+ dax_region = container_of(kref, struct dax_region, kref);
+ kfree(dax_region);
+}
+
+void dax_region_put(struct dax_region *dax_region)
+{
+ kref_put(&dax_region->kref, dax_region_free);
+}
+EXPORT_SYMBOL_GPL(dax_region_put);
+
+static void dax_dev_free(struct kref *kref)
+{
+ struct dax_dev *dax_dev;
+
+ dax_dev = container_of(kref, struct dax_dev, kref);
+ dax_region_put(dax_dev->region);
+ kfree(dax_dev);
+}
+
+static void dax_dev_put(struct dax_dev *dax_dev)
+{
+ kref_put(&dax_dev->kref, dax_dev_free);
+}
+
+struct dax_region *alloc_dax_region(struct device *parent, int region_id,
+ struct resource *res, unsigned int align, void *addr,
+ unsigned long pfn_flags)
+{
+ struct dax_region *dax_region;
+
+ dax_region = kzalloc(sizeof(*dax_region), GFP_KERNEL);
+
+ if (!dax_region)
+ return NULL;
+
+ memcpy(&dax_region->res, res, sizeof(*res));
+ dax_region->pfn_flags = pfn_flags;
+ kref_init(&dax_region->kref);
+ dax_region->id = region_id;
+ ida_init(&dax_region->ida);
+ dax_region->align = align;
+ dax_region->dev = parent;
+ dax_region->base = addr;
+
+ return dax_region;
+}
+EXPORT_SYMBOL_GPL(alloc_dax_region);
+
+static ssize_t size_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct dax_dev *dax_dev = dev_get_drvdata(dev);
+ unsigned long long size = 0;
+ int i;
+
+ for (i = 0; i < dax_dev->num_resources; i++)
+ size += resource_size(&dax_dev->res[i]);
+
+ return sprintf(buf, "%llu\n", size);
+}
+static DEVICE_ATTR_RO(size);
+
+static struct attribute *dax_device_attributes[] = {
+ &dev_attr_size.attr,
+ NULL,
+};
+
+static const struct attribute_group dax_device_attribute_group = {
+ .attrs = dax_device_attributes,
+};
+
+static const struct attribute_group *dax_attribute_groups[] = {
+ &dax_device_attribute_group,
+ NULL,
+};
+
+static void unregister_dax_dev(void *_dev)
+{
+ struct device *dev = _dev;
+ struct dax_dev *dax_dev = dev_get_drvdata(dev);
+ struct dax_region *dax_region = dax_dev->region;
+
+ dev_dbg(dev, "%s\n", __func__);
+
+ /*
+ * Note, rcu is not protecting the liveness of dax_dev, rcu is
+ * ensuring that any fault handlers that might have seen
+ * dax_dev->alive == true, have completed. Any fault handlers
+ * that start after synchronize_rcu() has started will abort
+ * upon seeing dax_dev->alive == false.
+ */
+ dax_dev->alive = false;
+ synchronize_rcu();
+
+ get_device(dev);
+ device_unregister(dev);
+ ida_simple_remove(&dax_region->ida, dax_dev->id);
+ ida_simple_remove(&dax_minor_ida, MINOR(dev->devt));
+ put_device(dev);
+ dax_dev_put(dax_dev);
+}
+
+int devm_create_dax_dev(struct dax_region *dax_region, struct resource *res,
+ int count)
+{
+ struct device *parent = dax_region->dev;
+ struct dax_dev *dax_dev;
+ struct device *dev;
+ int rc, minor;
+ dev_t dev_t;
+
+ dax_dev = kzalloc(sizeof(*dax_dev) + sizeof(*res) * count, GFP_KERNEL);
+ if (!dax_dev)
+ return -ENOMEM;
+ memcpy(dax_dev->res, res, sizeof(*res) * count);
+ dax_dev->num_resources = count;
+ kref_init(&dax_dev->kref);
+ dax_dev->alive = true;
+ dax_dev->region = dax_region;
+ kref_get(&dax_region->kref);
+
+ dax_dev->id = ida_simple_get(&dax_region->ida, 0, 0, GFP_KERNEL);
+ if (dax_dev->id < 0) {
+ rc = dax_dev->id;
+ goto err_id;
+ }
+
+ minor = ida_simple_get(&dax_minor_ida, 0, 0, GFP_KERNEL);
+ if (minor < 0) {
+ rc = minor;
+ goto err_minor;
+ }
+
+ dev_t = MKDEV(dax_major, minor);
+ dev = device_create_with_groups(dax_class, parent, dev_t, dax_dev,
+ dax_attribute_groups, "dax%d.%d", dax_region->id,
+ dax_dev->id);
+ if (IS_ERR(dev)) {
+ rc = PTR_ERR(dev);
+ goto err_create;
+ }
+ dax_dev->dev = dev;
+
+ rc = devm_add_action(dax_region->dev, unregister_dax_dev, dev);
+ if (rc) {
+ unregister_dax_dev(dev);
+ return rc;
+ }
+
+ return 0;
+
+ err_create:
+ ida_simple_remove(&dax_minor_ida, minor);
+ err_minor:
+ ida_simple_remove(&dax_region->ida, dax_dev->id);
+ err_id:
+ dax_dev_put(dax_dev);
+
+ return rc;
+}
+EXPORT_SYMBOL_GPL(devm_create_dax_dev);
+
+/* return an unmapped area aligned to the dax region specified alignment */
+static unsigned long dax_dev_get_unmapped_area(struct file *filp,
+ unsigned long addr, unsigned long len, unsigned long pgoff,
+ unsigned long flags)
+{
+ unsigned long off, off_end, off_align, len_align, addr_align, align;
+ struct dax_dev *dax_dev = filp ? filp->private_data : NULL;
+ struct dax_region *dax_region;
+
+ if (!dax_dev || addr)
+ goto out;
+
+ dax_region = dax_dev->region;
+ align = dax_region->align;
+ off = pgoff << PAGE_SHIFT;
+ off_end = off + len;
+ off_align = round_up(off, align);
+
+ if ((off_end <= off_align) || ((off_end - off_align) < align))
+ goto out;
+
+ len_align = len + align;
+ if ((off + len_align) < off)
+ goto out;
+
+ addr_align = current->mm->get_unmapped_area(filp, addr, len_align,
+ pgoff, flags);
+ if (!IS_ERR_VALUE(addr_align)) {
+ addr_align += (off - addr_align) & (align - 1);
+ return addr_align;
+ }
+ out:
+ return current->mm->get_unmapped_area(filp, addr, len, pgoff, flags);
+}
+
+static int __match_devt(struct device *dev, const void *data)
+{
+ const dev_t *devt = data;
+
+ return dev->devt == *devt;
+}
+
+static struct device *dax_dev_find(dev_t dev_t)
+{
+ return class_find_device(dax_class, NULL, &dev_t, __match_devt);
+}
+
+static int dax_dev_open(struct inode *inode, struct file *filp)
+{
+ struct dax_dev *dax_dev = NULL;
+ struct device *dev;
+
+ dev = dax_dev_find(inode->i_rdev);
+ if (!dev)
+ return -ENXIO;
+
+ device_lock(dev);
+ dax_dev = dev_get_drvdata(dev);
+ if (dax_dev) {
+ dev_dbg(dev, "%s\n", __func__);
+ filp->private_data = dax_dev;
+ kref_get(&dax_dev->kref);
+ inode->i_flags = S_DAX;
+ }
+ device_unlock(dev);
+
+ if (!dax_dev) {
+ put_device(dev);
+ return -ENXIO;
+ }
+ return 0;
+}
+
+static int dax_dev_release(struct inode *inode, struct file *filp)
+{
+ struct dax_dev *dax_dev = filp->private_data;
+ struct device *dev = dax_dev->dev;
+
+ dev_dbg(dax_dev->dev, "%s\n", __func__);
+ dax_dev_put(dax_dev);
+ put_device(dev);
+
+ return 0;
+}
+
+static int check_vma(struct dax_dev *dax_dev, struct vm_area_struct *vma,
+ const char *func)
+{
+ struct dax_region *dax_region = dax_dev->region;
+ struct device *dev = dax_dev->dev;
+ unsigned long mask;
+
+ if (!dax_dev->alive)
+ return -ENXIO;
+
+ /* prevent private / writable mappings from being established */
+ if ((vma->vm_flags & (VM_NORESERVE|VM_SHARED|VM_WRITE)) == VM_WRITE) {
+ dev_info(dev, "%s: %s: fail, attempted private mapping\n",
+ current->comm, func);
+ return -EINVAL;
+ }
+
+ mask = dax_region->align - 1;
+ if (vma->vm_start & mask || vma->vm_end & mask) {
+ dev_info(dev, "%s: %s: fail, unaligned vma (%#lx - %#lx, %#lx)\n",
+ current->comm, func, vma->vm_start, vma->vm_end,
+ mask);
+ return -EINVAL;
+ }
+
+ if ((dax_region->pfn_flags & (PFN_DEV|PFN_MAP)) == PFN_DEV
+ && (vma->vm_flags & VM_DONTCOPY) == 0) {
+ dev_info(dev, "%s: %s: fail, dax range requires MADV_DONTFORK\n",
+ current->comm, func);
+ return -EINVAL;
+ }
+
+ if (!vma_is_dax(vma)) {
+ dev_info(dev, "%s: %s: fail, vma is not DAX capable\n",
+ current->comm, func);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static phys_addr_t pgoff_to_phys(struct dax_dev *dax_dev, pgoff_t pgoff,
+ unsigned long size)
+{
+ struct resource *res;
+ phys_addr_t phys;
+ int i;
+
+ for (i = 0; i < dax_dev->num_resources; i++) {
+ res = &dax_dev->res[i];
+ phys = pgoff * PAGE_SIZE + res->start;
+ if (phys >= res->start && phys <= res->end)
+ break;
+ pgoff -= PHYS_PFN(resource_size(res));
+ }
+
+ if (i < dax_dev->num_resources) {
+ res = &dax_dev->res[i];
+ if (phys + size - 1 <= res->end)
+ return phys;
+ }
+
+ return -1;
+}
+
+static int __dax_dev_fault(struct dax_dev *dax_dev, struct vm_area_struct *vma,
+ struct vm_fault *vmf)
+{
+ unsigned long vaddr = (unsigned long) vmf->virtual_address;
+ struct device *dev = dax_dev->dev;
+ struct dax_region *dax_region;
+ int rc = VM_FAULT_SIGBUS;
+ phys_addr_t phys;
+ pfn_t pfn;
+
+ if (check_vma(dax_dev, vma, __func__))
+ return VM_FAULT_SIGBUS;
+
+ dax_region = dax_dev->region;
+ if (dax_region->align > PAGE_SIZE) {
+ dev_dbg(dev, "%s: alignment > fault size\n", __func__);
+ return VM_FAULT_SIGBUS;
+ }
+
+ phys = pgoff_to_phys(dax_dev, vmf->pgoff, PAGE_SIZE);
+ if (phys == -1) {
+ dev_dbg(dev, "%s: phys_to_pgoff(%#lx) failed\n", __func__,
+ vmf->pgoff);
+ return VM_FAULT_SIGBUS;
+ }
+
+ pfn = phys_to_pfn_t(phys, dax_region->pfn_flags);
+
+ rc = vm_insert_mixed(vma, vaddr, pfn);
+
+ if (rc == -ENOMEM)
+ return VM_FAULT_OOM;
+ if (rc < 0 && rc != -EBUSY)
+ return VM_FAULT_SIGBUS;
+
+ return VM_FAULT_NOPAGE;
+}
+
+static int dax_dev_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+ int rc;
+ struct file *filp = vma->vm_file;
+ struct dax_dev *dax_dev = filp->private_data;
+
+ dev_dbg(dax_dev->dev, "%s: %s: %s (%#lx - %#lx)\n", __func__,
+ current->comm, (vmf->flags & FAULT_FLAG_WRITE)
+ ? "write" : "read", vma->vm_start, vma->vm_end);
+ rcu_read_lock();
+ rc = __dax_dev_fault(dax_dev, vma, vmf);
+ rcu_read_unlock();
+
+ return rc;
+}
+
+static int __dax_dev_pmd_fault(struct dax_dev *dax_dev,
+ struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd,
+ unsigned int flags)
+{
+ unsigned long pmd_addr = addr & PMD_MASK;
+ struct device *dev = dax_dev->dev;
+ struct dax_region *dax_region;
+ phys_addr_t phys;
+ pgoff_t pgoff;
+ pfn_t pfn;
+
+ if (check_vma(dax_dev, vma, __func__))
+ return VM_FAULT_SIGBUS;
+
+ dax_region = dax_dev->region;
+ if (dax_region->align > PMD_SIZE) {
+ dev_dbg(dev, "%s: alignment > fault size\n", __func__);
+ return VM_FAULT_SIGBUS;
+ }
+
+ /* dax pmd mappings require pfn_t_devmap() */
+ if ((dax_region->pfn_flags & (PFN_DEV|PFN_MAP)) != (PFN_DEV|PFN_MAP)) {
+ dev_dbg(dev, "%s: alignment > fault size\n", __func__);
+ return VM_FAULT_SIGBUS;
+ }
+
+ pgoff = linear_page_index(vma, pmd_addr);
+ phys = pgoff_to_phys(dax_dev, pgoff, PAGE_SIZE);
+ if (phys == -1) {
+ dev_dbg(dev, "%s: phys_to_pgoff(%#lx) failed\n", __func__,
+ pgoff);
+ return VM_FAULT_SIGBUS;
+ }
+
+ pfn = phys_to_pfn_t(phys, dax_region->pfn_flags);
+
+ return vmf_insert_pfn_pmd(vma, addr, pmd, pfn,
+ flags & FAULT_FLAG_WRITE);
+}
+
+static int dax_dev_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
+ pmd_t *pmd, unsigned int flags)
+{
+ int rc;
+ struct file *filp = vma->vm_file;
+ struct dax_dev *dax_dev = filp->private_data;
+
+ dev_dbg(dax_dev->dev, "%s: %s: %s (%#lx - %#lx)\n", __func__,
+ current->comm, (flags & FAULT_FLAG_WRITE)
+ ? "write" : "read", vma->vm_start, vma->vm_end);
+
+ rcu_read_lock();
+ rc = __dax_dev_pmd_fault(dax_dev, vma, addr, pmd, flags);
+ rcu_read_unlock();
+
+ return rc;
+}
+
+static void dax_dev_vm_open(struct vm_area_struct *vma)
+{
+ struct file *filp = vma->vm_file;
+ struct dax_dev *dax_dev = filp->private_data;
+
+ dev_dbg(dax_dev->dev, "%s\n", __func__);
+ kref_get(&dax_dev->kref);
+}
+
+static void dax_dev_vm_close(struct vm_area_struct *vma)
+{
+ struct file *filp = vma->vm_file;
+ struct dax_dev *dax_dev = filp->private_data;
+
+ dev_dbg(dax_dev->dev, "%s\n", __func__);
+ dax_dev_put(dax_dev);
+}
+
+static const struct vm_operations_struct dax_dev_vm_ops = {
+ .fault = dax_dev_fault,
+ .pmd_fault = dax_dev_pmd_fault,
+ .open = dax_dev_vm_open,
+ .close = dax_dev_vm_close,
+};
+
+static int dax_dev_mmap(struct file *filp, struct vm_area_struct *vma)
+{
+ struct dax_dev *dax_dev = filp->private_data;
+ int rc;
+
+ dev_dbg(dax_dev->dev, "%s\n", __func__);
+
+ rc = check_vma(dax_dev, vma, __func__);
+ if (rc)
+ return rc;
+
+ kref_get(&dax_dev->kref);
+ vma->vm_ops = &dax_dev_vm_ops;
+ vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE;
+ return 0;
+
+}
+
+static const struct file_operations dax_fops = {
+ .llseek = noop_llseek,
+ .owner = THIS_MODULE,
+ .open = dax_dev_open,
+ .release = dax_dev_release,
+ .get_unmapped_area = dax_dev_get_unmapped_area,
+ .mmap = dax_dev_mmap,
+};
+
+static int __init dax_init(void)
+{
+ int rc;
+
+ rc = register_chrdev(0, "dax", &dax_fops);
+ if (rc < 0)
+ return rc;
+ dax_major = rc;
+
+ dax_class = class_create(THIS_MODULE, "dax");
+ if (IS_ERR(dax_class)) {
+ unregister_chrdev(dax_major, "dax");
+ return PTR_ERR(dax_class);
+ }
+
+ return 0;
+}
+
+static void __exit dax_exit(void)
+{
+ class_destroy(dax_class);
+ unregister_chrdev(dax_major, "dax");
+ ida_destroy(&dax_minor_ida);
+}
+
+MODULE_AUTHOR("Intel Corporation");
+MODULE_LICENSE("GPL v2");
+subsys_initcall(dax_init);
+module_exit(dax_exit);
diff --git a/drivers/dax/dax.h b/drivers/dax/dax.h
new file mode 100644
index 000000000000..d8b8f1f25054
--- /dev/null
+++ b/drivers/dax/dax.h
@@ -0,0 +1,24 @@
+/*
+ * Copyright(c) 2016 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+#ifndef __DAX_H__
+#define __DAX_H__
+struct device;
+struct resource;
+struct dax_region;
+void dax_region_put(struct dax_region *dax_region);
+struct dax_region *alloc_dax_region(struct device *parent,
+ int region_id, struct resource *res, unsigned int align,
+ void *addr, unsigned long flags);
+int devm_create_dax_dev(struct dax_region *dax_region, struct resource *res,
+ int count);
+#endif /* __DAX_H__ */
diff --git a/drivers/dax/pmem.c b/drivers/dax/pmem.c
new file mode 100644
index 000000000000..55d510e36cd1
--- /dev/null
+++ b/drivers/dax/pmem.c
@@ -0,0 +1,158 @@
+/*
+ * Copyright(c) 2016 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+#include <linux/percpu-refcount.h>
+#include <linux/memremap.h>
+#include <linux/module.h>
+#include <linux/pfn_t.h>
+#include "../nvdimm/pfn.h"
+#include "../nvdimm/nd.h"
+#include "dax.h"
+
+struct dax_pmem {
+ struct device *dev;
+ struct percpu_ref ref;
+ struct completion cmp;
+};
+
+struct dax_pmem *to_dax_pmem(struct percpu_ref *ref)
+{
+ return container_of(ref, struct dax_pmem, ref);
+}
+
+static void dax_pmem_percpu_release(struct percpu_ref *ref)
+{
+ struct dax_pmem *dax_pmem = to_dax_pmem(ref);
+
+ dev_dbg(dax_pmem->dev, "%s\n", __func__);
+ complete(&dax_pmem->cmp);
+}
+
+static void dax_pmem_percpu_exit(void *data)
+{
+ struct percpu_ref *ref = data;
+ struct dax_pmem *dax_pmem = to_dax_pmem(ref);
+
+ dev_dbg(dax_pmem->dev, "%s\n", __func__);
+ percpu_ref_exit(ref);
+ wait_for_completion(&dax_pmem->cmp);
+}
+
+static void dax_pmem_percpu_kill(void *data)
+{
+ struct percpu_ref *ref = data;
+ struct dax_pmem *dax_pmem = to_dax_pmem(ref);
+
+ dev_dbg(dax_pmem->dev, "%s\n", __func__);
+ percpu_ref_kill(ref);
+}
+
+static int dax_pmem_probe(struct device *dev)
+{
+ int rc;
+ void *addr;
+ struct resource res;
+ struct nd_pfn_sb *pfn_sb;
+ struct dax_pmem *dax_pmem;
+ struct nd_region *nd_region;
+ struct nd_namespace_io *nsio;
+ struct dax_region *dax_region;
+ struct nd_namespace_common *ndns;
+ struct nd_dax *nd_dax = to_nd_dax(dev);
+ struct nd_pfn *nd_pfn = &nd_dax->nd_pfn;
+ struct vmem_altmap __altmap, *altmap = NULL;
+
+ ndns = nvdimm_namespace_common_probe(dev);
+ if (IS_ERR(ndns))
+ return PTR_ERR(ndns);
+ nsio = to_nd_namespace_io(&ndns->dev);
+
+ /* parse the 'pfn' info block via ->rw_bytes */
+ devm_nsio_enable(dev, nsio);
+ altmap = nvdimm_setup_pfn(nd_pfn, &res, &__altmap);
+ if (IS_ERR(altmap))
+ return PTR_ERR(altmap);
+ devm_nsio_disable(dev, nsio);
+
+ pfn_sb = nd_pfn->pfn_sb;
+
+ if (!devm_request_mem_region(dev, nsio->res.start,
+ resource_size(&nsio->res), dev_name(dev))) {
+ dev_warn(dev, "could not reserve region %pR\n", &nsio->res);
+ return -EBUSY;
+ }
+
+ dax_pmem = devm_kzalloc(dev, sizeof(*dax_pmem), GFP_KERNEL);
+ if (!dax_pmem)
+ return -ENOMEM;
+
+ dax_pmem->dev = dev;
+ init_completion(&dax_pmem->cmp);
+ rc = percpu_ref_init(&dax_pmem->ref, dax_pmem_percpu_release, 0,
+ GFP_KERNEL);
+ if (rc)
+ return rc;
+
+ rc = devm_add_action(dev, dax_pmem_percpu_exit, &dax_pmem->ref);
+ if (rc) {
+ dax_pmem_percpu_exit(&dax_pmem->ref);
+ return rc;
+ }
+
+ addr = devm_memremap_pages(dev, &res, &dax_pmem->ref, altmap);
+ if (IS_ERR(addr))
+ return PTR_ERR(addr);
+
+ rc = devm_add_action(dev, dax_pmem_percpu_kill, &dax_pmem->ref);
+ if (rc) {
+ dax_pmem_percpu_kill(&dax_pmem->ref);
+ return rc;
+ }
+
+ nd_region = to_nd_region(dev->parent);
+ dax_region = alloc_dax_region(dev, nd_region->id, &res,
+ le32_to_cpu(pfn_sb->align), addr, PFN_DEV|PFN_MAP);
+ if (!dax_region)
+ return -ENOMEM;
+
+ /* TODO: support for subdividing a dax region... */
+ rc = devm_create_dax_dev(dax_region, &res, 1);
+
+ /* child dax_dev instances now own the lifetime of the dax_region */
+ dax_region_put(dax_region);
+
+ return rc;
+}
+
+static struct nd_device_driver dax_pmem_driver = {
+ .probe = dax_pmem_probe,
+ .drv = {
+ .name = "dax_pmem",
+ },
+ .type = ND_DRIVER_DAX_PMEM,
+};
+
+static int __init dax_pmem_init(void)
+{
+ return nd_driver_register(&dax_pmem_driver);
+}
+module_init(dax_pmem_init);
+
+static void __exit dax_pmem_exit(void)
+{
+ driver_unregister(&dax_pmem_driver.drv);
+}
+module_exit(dax_pmem_exit);
+
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Intel Corporation");
+MODULE_ALIAS_ND_DEVICE(ND_DEVICE_DAX_PMEM);
diff --git a/drivers/nvdimm/bus.c b/drivers/nvdimm/bus.c
index 04c2c3fda1ab..f085f8bceae8 100644
--- a/drivers/nvdimm/bus.c
+++ b/drivers/nvdimm/bus.c
@@ -124,9 +124,10 @@ static int nvdimm_bus_remove(struct device *dev)
struct nd_device_driver *nd_drv = to_nd_device_driver(dev->driver);
struct module *provider = to_bus_provider(dev);
struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(dev);
- int rc;
+ int rc = 0;
- rc = nd_drv->remove(dev);
+ if (nd_drv->remove)
+ rc = nd_drv->remove(dev);
nd_region_disable(nvdimm_bus, dev);
dev_dbg(&nvdimm_bus->dev, "%s.remove(%s) = %d\n", dev->driver->name,
@@ -296,8 +297,8 @@ int __nd_driver_register(struct nd_device_driver *nd_drv, struct module *owner,
return -EINVAL;
}
- if (!nd_drv->probe || !nd_drv->remove) {
- pr_debug("->probe() and ->remove() must be specified\n");
+ if (!nd_drv->probe) {
+ pr_debug("%s ->probe() must be specified\n", mod_name);
return -EINVAL;
}
diff --git a/drivers/nvdimm/claim.c b/drivers/nvdimm/claim.c
index 5f53db59a058..8b2e3c4fb0ad 100644
--- a/drivers/nvdimm/claim.c
+++ b/drivers/nvdimm/claim.c
@@ -93,6 +93,25 @@ static bool is_idle(struct device *dev, struct nd_namespace_common *ndns)
return true;
}
+struct nd_pfn *to_nd_pfn_safe(struct device *dev)
+{
+ /*
+ * pfn device attributes are re-used by dax device instances, so we
+ * need to be careful to correct device-to-nd_pfn conversion.
+ */
+ if (is_nd_pfn(dev))
+ return to_nd_pfn(dev);
+
+ if (is_nd_dax(dev)) {
+ struct nd_dax *nd_dax = to_nd_dax(dev);
+
+ return &nd_dax->nd_pfn;
+ }
+
+ WARN_ON(1);
+ return NULL;
+}
+
static void nd_detach_and_reset(struct device *dev,
struct nd_namespace_common **_ndns)
{
@@ -106,8 +125,8 @@ static void nd_detach_and_reset(struct device *dev,
nd_btt->lbasize = 0;
kfree(nd_btt->uuid);
nd_btt->uuid = NULL;
- } else if (is_nd_pfn(dev)) {
- struct nd_pfn *nd_pfn = to_nd_pfn(dev);
+ } else if (is_nd_pfn(dev) || is_nd_dax(dev)) {
+ struct nd_pfn *nd_pfn = to_nd_pfn_safe(dev);
kfree(nd_pfn->uuid);
nd_pfn->uuid = NULL;
diff --git a/drivers/nvdimm/core.c b/drivers/nvdimm/core.c
index e8688a13cf4f..be89764315c2 100644
--- a/drivers/nvdimm/core.c
+++ b/drivers/nvdimm/core.c
@@ -648,6 +648,9 @@ static __exit void libnvdimm_exit(void)
nd_region_exit();
nvdimm_exit();
nvdimm_bus_exit();
+ nd_region_devs_exit();
+ nvdimm_devs_exit();
+ ida_destroy(&nd_ida);
}
MODULE_LICENSE("GPL v2");
diff --git a/drivers/nvdimm/dax_devs.c b/drivers/nvdimm/dax_devs.c
index f90f7549e7f4..45fa82cae87c 100644
--- a/drivers/nvdimm/dax_devs.c
+++ b/drivers/nvdimm/dax_devs.c
@@ -15,6 +15,7 @@
#include <linux/slab.h>
#include <linux/mm.h>
#include "nd-core.h"
+#include "pfn.h"
#include "nd.h"
static void nd_dax_release(struct device *dev)
@@ -97,3 +98,37 @@ struct device *nd_dax_create(struct nd_region *nd_region)
__nd_device_register(dev);
return dev;
}
+
+int nd_dax_probe(struct device *dev, struct nd_namespace_common *ndns)
+{
+ int rc;
+ struct nd_dax *nd_dax;
+ struct device *dax_dev;
+ struct nd_pfn *nd_pfn;
+ struct nd_pfn_sb *pfn_sb;
+ struct nd_region *nd_region = to_nd_region(ndns->dev.parent);
+
+ if (ndns->force_raw)
+ return -ENODEV;
+
+ nvdimm_bus_lock(&ndns->dev);
+ nd_dax = nd_dax_alloc(nd_region);
+ nd_pfn = &nd_dax->nd_pfn;
+ dax_dev = nd_pfn_devinit(nd_pfn, ndns);
+ nvdimm_bus_unlock(&ndns->dev);
+ if (!dax_dev)
+ return -ENOMEM;
+ pfn_sb = devm_kzalloc(dev, sizeof(*pfn_sb), GFP_KERNEL);
+ nd_pfn->pfn_sb = pfn_sb;
+ rc = nd_pfn_validate(nd_pfn, DAX_SIG);
+ dev_dbg(dev, "%s: dax: %s\n", __func__,
+ rc == 0 ? dev_name(dax_dev) : "<none>");
+ if (rc < 0) {
+ __nd_detach_ndns(dax_dev, &nd_pfn->ndns);
+ put_device(dax_dev);
+ } else
+ __nd_device_register(dax_dev);
+
+ return rc;
+}
+EXPORT_SYMBOL(nd_dax_probe);
diff --git a/drivers/nvdimm/dimm_devs.c b/drivers/nvdimm/dimm_devs.c
index 79a35a02053c..bbde28d3dec5 100644
--- a/drivers/nvdimm/dimm_devs.c
+++ b/drivers/nvdimm/dimm_devs.c
@@ -552,3 +552,8 @@ int nvdimm_bus_check_dimm_count(struct nvdimm_bus *nvdimm_bus, int dimm_count)
return 0;
}
EXPORT_SYMBOL_GPL(nvdimm_bus_check_dimm_count);
+
+void __exit nvdimm_devs_exit(void)
+{
+ ida_destroy(&dimm_ida);
+}
diff --git a/drivers/nvdimm/nd-core.h b/drivers/nvdimm/nd-core.h
index 86d985ccce82..284cdaa268cf 100644
--- a/drivers/nvdimm/nd-core.h
+++ b/drivers/nvdimm/nd-core.h
@@ -49,6 +49,8 @@ bool is_nd_blk(struct device *dev);
struct nvdimm_bus *walk_to_nvdimm_bus(struct device *nd_dev);
int __init nvdimm_bus_init(void);
void nvdimm_bus_exit(void);
+void nvdimm_devs_exit(void);
+void nd_region_devs_exit(void);
void nd_region_probe_success(struct nvdimm_bus *nvdimm_bus, struct device *dev);
struct nd_region;
void nd_region_create_blk_seed(struct nd_region *nd_region);
@@ -92,4 +94,5 @@ bool __nd_attach_ndns(struct device *dev, struct nd_namespace_common *attach,
ssize_t nd_namespace_store(struct device *dev,
struct nd_namespace_common **_ndns, const char *buf,
size_t len);
+struct nd_pfn *to_nd_pfn_safe(struct device *dev);
#endif /* __ND_CORE_H__ */
diff --git a/drivers/nvdimm/nd.h b/drivers/nvdimm/nd.h
index 46910b8f32b1..d0ac93c31dda 100644
--- a/drivers/nvdimm/nd.h
+++ b/drivers/nvdimm/nd.h
@@ -232,7 +232,7 @@ bool is_nd_pfn(struct device *dev);
struct device *nd_pfn_create(struct nd_region *nd_region);
struct device *nd_pfn_devinit(struct nd_pfn *nd_pfn,
struct nd_namespace_common *ndns);
-int nd_pfn_validate(struct nd_pfn *nd_pfn);
+int nd_pfn_validate(struct nd_pfn *nd_pfn, const char *sig);
extern struct attribute_group nd_pfn_attribute_group;
#else
static inline int nd_pfn_probe(struct device *dev,
@@ -251,7 +251,7 @@ static inline struct device *nd_pfn_create(struct nd_region *nd_region)
return NULL;
}
-static inline int nd_pfn_validate(struct nd_pfn *nd_pfn)
+static inline int nd_pfn_validate(struct nd_pfn *nd_pfn, const char *sig)
{
return -ENODEV;
}
@@ -259,9 +259,16 @@ static inline int nd_pfn_validate(struct nd_pfn *nd_pfn)
struct nd_dax *to_nd_dax(struct device *dev);
#if IS_ENABLED(CONFIG_NVDIMM_DAX)
+int nd_dax_probe(struct device *dev, struct nd_namespace_common *ndns);
bool is_nd_dax(struct device *dev);
struct device *nd_dax_create(struct nd_region *nd_region);
#else
+static inline int nd_dax_probe(struct device *dev,
+ struct nd_namespace_common *ndns)
+{
+ return -ENODEV;
+}
+
static inline bool is_nd_dax(struct device *dev)
{
return false;
diff --git a/drivers/nvdimm/pfn.h b/drivers/nvdimm/pfn.h
index 9d2704c83fa7..dde9853453d3 100644
--- a/drivers/nvdimm/pfn.h
+++ b/drivers/nvdimm/pfn.h
@@ -19,6 +19,7 @@
#define PFN_SIG_LEN 16
#define PFN_SIG "NVDIMM_PFN_INFO\0"
+#define DAX_SIG "NVDIMM_DAX_INFO\0"
struct nd_pfn_sb {
u8 signature[PFN_SIG_LEN];
diff --git a/drivers/nvdimm/pfn_devs.c b/drivers/nvdimm/pfn_devs.c
index 2248056d29e7..f7718ec685fa 100644
--- a/drivers/nvdimm/pfn_devs.c
+++ b/drivers/nvdimm/pfn_devs.c
@@ -54,25 +54,6 @@ struct nd_pfn *to_nd_pfn(struct device *dev)
}
EXPORT_SYMBOL(to_nd_pfn);
-static struct nd_pfn *to_nd_pfn_safe(struct device *dev)
-{
- /*
- * pfn device attributes are re-used by dax device instances, so we
- * need to be careful to correct device-to-nd_pfn conversion.
- */
- if (is_nd_pfn(dev))
- return to_nd_pfn(dev);
-
- if (is_nd_dax(dev)) {
- struct nd_dax *nd_dax = to_nd_dax(dev);
-
- return &nd_dax->nd_pfn;
- }
-
- WARN_ON(1);
- return NULL;
-}
-
static ssize_t mode_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
@@ -360,7 +341,7 @@ struct device *nd_pfn_create(struct nd_region *nd_region)
return dev;
}
-int nd_pfn_validate(struct nd_pfn *nd_pfn)
+int nd_pfn_validate(struct nd_pfn *nd_pfn, const char *sig)
{
u64 checksum, offset;
struct nd_namespace_io *nsio;
@@ -377,7 +358,7 @@ int nd_pfn_validate(struct nd_pfn *nd_pfn)
if (nvdimm_read_bytes(ndns, SZ_4K, pfn_sb, sizeof(*pfn_sb)))
return -ENXIO;
- if (memcmp(pfn_sb->signature, PFN_SIG, PFN_SIG_LEN) != 0)
+ if (memcmp(pfn_sb->signature, sig, PFN_SIG_LEN) != 0)
return -ENODEV;
checksum = le64_to_cpu(pfn_sb->checksum);
@@ -416,6 +397,8 @@ int nd_pfn_validate(struct nd_pfn *nd_pfn)
return -ENODEV;
}
+ if (nd_pfn->align == 0)
+ nd_pfn->align = le32_to_cpu(pfn_sb->align);
if (nd_pfn->align > nvdimm_namespace_capacity(ndns)) {
dev_err(&nd_pfn->dev, "alignment: %lx exceeds capacity %llx\n",
nd_pfn->align, nvdimm_namespace_capacity(ndns));
@@ -436,8 +419,8 @@ int nd_pfn_validate(struct nd_pfn *nd_pfn)
return -EBUSY;
}
- nd_pfn->align = le32_to_cpu(pfn_sb->align);
- if (!is_power_of_2(offset) || offset < PAGE_SIZE) {
+ if ((nd_pfn->align && !IS_ALIGNED(offset, nd_pfn->align))
+ || !IS_ALIGNED(offset, PAGE_SIZE)) {
dev_err(&nd_pfn->dev, "bad offset: %#llx dax disabled\n",
offset);
return -ENXIO;
@@ -467,7 +450,7 @@ int nd_pfn_probe(struct device *dev, struct nd_namespace_common *ndns)
pfn_sb = devm_kzalloc(dev, sizeof(*pfn_sb), GFP_KERNEL);
nd_pfn = to_nd_pfn(pfn_dev);
nd_pfn->pfn_sb = pfn_sb;
- rc = nd_pfn_validate(nd_pfn);
+ rc = nd_pfn_validate(nd_pfn, PFN_SIG);
dev_dbg(dev, "%s: pfn: %s\n", __func__,
rc == 0 ? dev_name(pfn_dev) : "<none>");
if (rc < 0) {
@@ -552,6 +535,7 @@ static int nd_pfn_init(struct nd_pfn *nd_pfn)
struct nd_pfn_sb *pfn_sb;
unsigned long npfns;
phys_addr_t offset;
+ const char *sig;
u64 checksum;
int rc;
@@ -560,7 +544,11 @@ static int nd_pfn_init(struct nd_pfn *nd_pfn)
return -ENOMEM;
nd_pfn->pfn_sb = pfn_sb;
- rc = nd_pfn_validate(nd_pfn);
+ if (is_nd_dax(&nd_pfn->dev))
+ sig = DAX_SIG;
+ else
+ sig = PFN_SIG;
+ rc = nd_pfn_validate(nd_pfn, sig);
if (rc != -ENODEV)
return rc;
@@ -635,7 +623,7 @@ static int nd_pfn_init(struct nd_pfn *nd_pfn)
pfn_sb->mode = cpu_to_le32(nd_pfn->mode);
pfn_sb->dataoff = cpu_to_le64(offset);
pfn_sb->npfns = cpu_to_le64(npfns);
- memcpy(pfn_sb->signature, PFN_SIG, PFN_SIG_LEN);
+ memcpy(pfn_sb->signature, sig, PFN_SIG_LEN);
memcpy(pfn_sb->uuid, nd_pfn->uuid, 16);
memcpy(pfn_sb->parent_uuid, nd_dev_to_uuid(&ndns->dev), 16);
pfn_sb->version_major = cpu_to_le16(1);
diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index d9a0dbc2d023..042baec56931 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -320,7 +320,8 @@ static int nd_pmem_probe(struct device *dev)
return pmem_attach_disk(dev, ndns);
/* if we find a valid info-block we'll come back as that personality */
- if (nd_btt_probe(dev, ndns) == 0 || nd_pfn_probe(dev, ndns) == 0)
+ if (nd_btt_probe(dev, ndns) == 0 || nd_pfn_probe(dev, ndns) == 0
+ || nd_dax_probe(dev, ndns) == 0)
return -ENXIO;
/* ...otherwise we're just a raw pmem device */
diff --git a/drivers/nvdimm/region_devs.c b/drivers/nvdimm/region_devs.c
index 9e1b054e0e61..40fcfea26fbb 100644
--- a/drivers/nvdimm/region_devs.c
+++ b/drivers/nvdimm/region_devs.c
@@ -793,3 +793,8 @@ struct nd_region *nvdimm_volatile_region_create(struct nvdimm_bus *nvdimm_bus,
__func__);
}
EXPORT_SYMBOL_GPL(nvdimm_volatile_region_create);
+
+void __exit nd_region_devs_exit(void)
+{
+ ida_destroy(&region_ida);
+}
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 20a2c02b77c4..36ee10ca503e 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -29,6 +29,7 @@
#include <linux/log2.h>
#include <linux/cleancache.h>
#include <linux/dax.h>
+#include <linux/badblocks.h>
#include <asm/uaccess.h>
#include "internal.h"
@@ -1159,6 +1160,33 @@ void bd_set_size(struct block_device *bdev, loff_t size)
}
EXPORT_SYMBOL(bd_set_size);
+static bool blkdev_dax_capable(struct block_device *bdev)
+{
+ struct gendisk *disk = bdev->bd_disk;
+
+ if (!disk->fops->direct_access || !IS_ENABLED(CONFIG_FS_DAX))
+ return false;
+
+ /*
+ * If the partition is not aligned on a page boundary, we can't
+ * do dax I/O to it.
+ */
+ if ((bdev->bd_part->start_sect % (PAGE_SIZE / 512))
+ || (bdev->bd_part->nr_sects % (PAGE_SIZE / 512)))
+ return false;
+
+ /*
+ * If the device has known bad blocks, force all I/O through the
+ * driver / page cache.
+ *
+ * TODO: support finer grained dax error handling
+ */
+ if (disk->bb && disk->bb->count)
+ return false;
+
+ return true;
+}
+
static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part);
/*
@@ -1724,79 +1752,13 @@ static const struct address_space_operations def_blk_aops = {
.is_dirty_writeback = buffer_check_dirty_writeback,
};
-#ifdef CONFIG_FS_DAX
-/*
- * In the raw block case we do not need to contend with truncation nor
- * unwritten file extents. Without those concerns there is no need for
- * additional locking beyond the mmap_sem context that these routines
- * are already executing under.
- *
- * Note, there is no protection if the block device is dynamically
- * resized (partition grow/shrink) during a fault. A stable block device
- * size is already not enforced in the blkdev_direct_IO path.
- *
- * For DAX, it is the responsibility of the block device driver to
- * ensure the whole-disk device size is stable while requests are in
- * flight.
- *
- * Finally, unlike the filemap_page_mkwrite() case there is no
- * filesystem superblock to sync against freezing. We still include a
- * pfn_mkwrite callback for dax drivers to receive write fault
- * notifications.
- */
-static int blkdev_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
-{
- return __dax_fault(vma, vmf, blkdev_get_block, NULL);
-}
-
-static int blkdev_dax_pfn_mkwrite(struct vm_area_struct *vma,
- struct vm_fault *vmf)
-{
- return dax_pfn_mkwrite(vma, vmf);
-}
-
-static int blkdev_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
- pmd_t *pmd, unsigned int flags)
-{
- return __dax_pmd_fault(vma, addr, pmd, flags, blkdev_get_block, NULL);
-}
-
-static const struct vm_operations_struct blkdev_dax_vm_ops = {
- .fault = blkdev_dax_fault,
- .pmd_fault = blkdev_dax_pmd_fault,
- .pfn_mkwrite = blkdev_dax_pfn_mkwrite,
-};
-
-static const struct vm_operations_struct blkdev_default_vm_ops = {
- .fault = filemap_fault,
- .map_pages = filemap_map_pages,
-};
-
-static int blkdev_mmap(struct file *file, struct vm_area_struct *vma)
-{
- struct inode *bd_inode = bdev_file_inode(file);
-
- file_accessed(file);
- if (IS_DAX(bd_inode)) {
- vma->vm_ops = &blkdev_dax_vm_ops;
- vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE;
- } else {
- vma->vm_ops = &blkdev_default_vm_ops;
- }
-
- return 0;
-}
-#else
-#define blkdev_mmap generic_file_mmap
-#endif
-
const struct file_operations def_blk_fops = {
.open = blkdev_open,
.release = blkdev_close,
.llseek = block_llseek,
.read_iter = blkdev_read_iter,
.write_iter = blkdev_write_iter,
- .mmap = blkdev_mmap,
+ .mmap = generic_file_mmap,
.fsync = blkdev_fsync,
.unlocked_ioctl = block_ioctl,
#ifdef CONFIG_COMPAT
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 70e61b58baaf..8363a10660f6 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2320,14 +2320,6 @@ extern struct super_block *freeze_bdev(struct block_device *);
extern void emergency_thaw_all(void);
extern int thaw_bdev(struct block_device *bdev, struct super_block *sb);
extern int fsync_bdev(struct block_device *);
-#ifdef CONFIG_FS_DAX
-extern bool blkdev_dax_capable(struct block_device *bdev);
-#else
-static inline bool blkdev_dax_capable(struct block_device *bdev)
-{
- return false;
-}
-#endif
extern struct super_block *blockdev_superblock;
diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
index a079d50376e1..fbff8b28aa35 100644
--- a/include/uapi/linux/fs.h
+++ b/include/uapi/linux/fs.h
@@ -222,7 +222,6 @@ struct fsxattr {
#define BLKSECDISCARD _IO(0x12,125)
#define BLKROTATIONAL _IO(0x12,126)
#define BLKZEROOUT _IO(0x12,127)
-#define BLKDAXGET _IO(0x12,129)
#define BMAP_IOCTL 1 /* obsolete - kept for compatibility */
#define FIBMAP _IO(0x00,1) /* bmap access */
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 86f9f8b82f8e..52ea012d8a80 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1013,6 +1013,7 @@ int vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
insert_pfn_pmd(vma, addr, pmd, pfn, pgprot, write);
return VM_FAULT_NOPAGE;
}
+EXPORT_SYMBOL_GPL(vmf_insert_pfn_pmd);
static void touch_pmd(struct vm_area_struct *vma, unsigned long addr,
pmd_t *pmd)
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 19d0d08b396f..b14e98129b07 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -624,6 +624,7 @@ pgoff_t linear_hugepage_index(struct vm_area_struct *vma,
{
return vma_hugecache_offset(hstate_vma(vma), vma, address);
}
+EXPORT_SYMBOL_GPL(linear_hugepage_index);
/*
* Return the size of the pages allocated when backing a VMA. In the majority
diff --git a/tools/testing/nvdimm/Kbuild b/tools/testing/nvdimm/Kbuild
index 5ff6d3c126a9..785985677159 100644
--- a/tools/testing/nvdimm/Kbuild
+++ b/tools/testing/nvdimm/Kbuild
@@ -16,6 +16,7 @@ ldflags-y += --wrap=phys_to_pfn_t
DRIVERS := ../../../drivers
NVDIMM_SRC := $(DRIVERS)/nvdimm
ACPI_SRC := $(DRIVERS)/acpi
+DAX_SRC := $(DRIVERS)/dax
obj-$(CONFIG_LIBNVDIMM) += libnvdimm.o
obj-$(CONFIG_BLK_DEV_PMEM) += nd_pmem.o
@@ -23,6 +24,8 @@ obj-$(CONFIG_ND_BTT) += nd_btt.o
obj-$(CONFIG_ND_BLK) += nd_blk.o
obj-$(CONFIG_X86_PMEM_LEGACY) += nd_e820.o
obj-$(CONFIG_ACPI_NFIT) += nfit.o
+obj-$(CONFIG_DEV_DAX) += dax.o
+obj-$(CONFIG_DEV_DAX_PMEM) += dax_pmem.o
nfit-y := $(ACPI_SRC)/nfit.o
nfit-y += config_check.o
@@ -39,6 +42,12 @@ nd_blk-y += config_check.o
nd_e820-y := $(NVDIMM_SRC)/e820.o
nd_e820-y += config_check.o
+dax-y := $(DAX_SRC)/dax.o
+dax-y += config_check.o
+
+dax_pmem-y := $(DAX_SRC)/pmem.o
+dax_pmem-y += config_check.o
+
libnvdimm-y := $(NVDIMM_SRC)/core.o
libnvdimm-y += $(NVDIMM_SRC)/bus.o
libnvdimm-y += $(NVDIMM_SRC)/dimm_devs.o
diff --git a/tools/testing/nvdimm/config_check.c b/tools/testing/nvdimm/config_check.c
index f2c7615554eb..adf18bfeca00 100644
--- a/tools/testing/nvdimm/config_check.c
+++ b/tools/testing/nvdimm/config_check.c
@@ -12,4 +12,6 @@ void check(void)
BUILD_BUG_ON(!IS_MODULE(CONFIG_ND_BTT));
BUILD_BUG_ON(!IS_MODULE(CONFIG_ND_BLK));
BUILD_BUG_ON(!IS_MODULE(CONFIG_ACPI_NFIT));
+ BUILD_BUG_ON(!IS_MODULE(CONFIG_DEV_DAX));
+ BUILD_BUG_ON(!IS_MODULE(CONFIG_DEV_DAX_PMEM));
}