From bd26d0d0ce7434a86dde61a7c65c94fe3801d8f6 Mon Sep 17 00:00:00 2001 From: Dmitry Krivenok Date: Wed, 2 Dec 2015 00:48:12 +0300 Subject: nvdimm: improve diagnosibility of namespaces In order to bind namespace to the driver user must first set all mandatory attributes in the following order: - uuid - size - sector_size (for blk namespace only) If the order is wrong, then user either won't be able to set the attribute or bind the namespace. This simple patch improves diagnosibility of common operations with namespaces by printing some details about the error instead of failing silently. Below are examples of error messages (assuming dyndbg is enabled for nvdimms): [/]# echo 4194304 > /sys/bus/nd/devices/region5/namespace5.0/size [ 288.372612] nd namespace5.0: __size_store: uuid not set [ 288.374839] nd namespace5.0: size_store: 400000 fail (-6) sh: write error: No such device or address [/]# [/]# echo namespace5.0 > /sys/bus/nd/drivers/nd_blk/bind [ 554.671648] nd_blk namespace5.0: nvdimm_namespace_common_probe: sector size not set [ 554.674688] ndbus1: nd_blk.probe(namespace5.0) = -19 sh: write error: No such device [/]# Signed-off-by: Dmitry V. Krivenok Signed-off-by: Dan Williams --- drivers/nvdimm/namespace_devs.c | 26 ++++++++++++++++++++++---- 1 file changed, 22 insertions(+), 4 deletions(-) (limited to 'drivers') diff --git a/drivers/nvdimm/namespace_devs.c b/drivers/nvdimm/namespace_devs.c index 0955b2cb10fe..f981177524a1 100644 --- a/drivers/nvdimm/namespace_devs.c +++ b/drivers/nvdimm/namespace_devs.c @@ -791,6 +791,15 @@ static void nd_namespace_pmem_set_size(struct nd_region *nd_region, res->end = nd_region->ndr_start + size - 1; } +static bool uuid_not_set(const u8 *uuid, struct device *dev, const char *where) +{ + if (!uuid) { + dev_dbg(dev, "%s: uuid not set\n", where); + return true; + } + return false; +} + static ssize_t __size_store(struct device *dev, unsigned long long val) { resource_size_t allocated = 0, available = 0; @@ -820,8 +829,12 @@ static ssize_t __size_store(struct device *dev, unsigned long long val) * We need a uuid for the allocation-label and dimm(s) on which * to store the label. */ - if (!uuid || nd_region->ndr_mappings == 0) + if (uuid_not_set(uuid, dev, __func__)) return -ENXIO; + if (nd_region->ndr_mappings == 0) { + dev_dbg(dev, "%s: not associated with dimm(s)\n", __func__); + return -ENXIO; + } div_u64_rem(val, SZ_4K * nd_region->ndr_mappings, &remainder); if (remainder) { @@ -1343,14 +1356,19 @@ struct nd_namespace_common *nvdimm_namespace_common_probe(struct device *dev) struct nd_namespace_pmem *nspm; nspm = to_nd_namespace_pmem(&ndns->dev); - if (!nspm->uuid) { - dev_dbg(&ndns->dev, "%s: uuid not set\n", __func__); + if (uuid_not_set(nspm->uuid, &ndns->dev, __func__)) return ERR_PTR(-ENODEV); - } } else if (is_namespace_blk(&ndns->dev)) { struct nd_namespace_blk *nsblk; nsblk = to_nd_namespace_blk(&ndns->dev); + if (uuid_not_set(nsblk->uuid, &ndns->dev, __func__)) + return ERR_PTR(-ENODEV); + if (!nsblk->lbasize) { + dev_dbg(&ndns->dev, "%s: sector size not set\n", + __func__); + return ERR_PTR(-ENODEV); + } if (!nd_namespace_blk_validate(nsblk)) return ERR_PTR(-ENODEV); } -- cgit v1.2.3 From 6bb691ac089c39bb0aa73bdcc21ffd8c846e4ba5 Mon Sep 17 00:00:00 2001 From: Dmitry Krivenok Date: Wed, 2 Dec 2015 09:39:29 +0300 Subject: nvdimm: do not show pfn_seed for non pmem regions This simple change hides pfn_seed attribute for non pmem regions because they don't support pfn anyway. Signed-off-by: Dmitry V. 
Krivenok Signed-off-by: Dan Williams --- drivers/nvdimm/region_devs.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'drivers') diff --git a/drivers/nvdimm/region_devs.c b/drivers/nvdimm/region_devs.c index 529f3f02e7b2..3d730d1f25db 100644 --- a/drivers/nvdimm/region_devs.c +++ b/drivers/nvdimm/region_devs.c @@ -406,6 +406,9 @@ static umode_t region_visible(struct kobject *kobj, struct attribute *a, int n) struct nd_interleave_set *nd_set = nd_region->nd_set; int type = nd_region_to_nstype(nd_region); + if (!is_nd_pmem(dev) && a == &dev_attr_pfn_seed.attr) + return 0; + if (a != &dev_attr_set_cookie.attr && a != &dev_attr_available_size.attr) return a->mode; -- cgit v1.2.3 From 9f1e8cee7742cadbe6b97f2c80b787b4ee067bae Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Thu, 10 Dec 2015 15:14:20 -0800 Subject: libnvdimm, pfn: kill ND_PFN_ALIGN The alignment constraint isn't necessary now that devm_memremap_pages() allows for unaligned mappings. Signed-off-by: Dan Williams --- drivers/nvdimm/nd.h | 7 ------- drivers/nvdimm/pfn_devs.c | 11 +---------- drivers/nvdimm/pmem.c | 15 --------------- 3 files changed, 1 insertion(+), 32 deletions(-) (limited to 'drivers') diff --git a/drivers/nvdimm/nd.h b/drivers/nvdimm/nd.h index 417e521d299c..2ce428e9b584 100644 --- a/drivers/nvdimm/nd.h +++ b/drivers/nvdimm/nd.h @@ -29,13 +29,6 @@ enum { ND_MAX_LANES = 256, SECTOR_SHIFT = 9, INT_LBASIZE_ALIGNMENT = 64, -#if IS_ENABLED(CONFIG_NVDIMM_PFN) - ND_PFN_ALIGN = PAGES_PER_SECTION * PAGE_SIZE, - ND_PFN_MASK = ND_PFN_ALIGN - 1, -#else - ND_PFN_ALIGN = 0, - ND_PFN_MASK = 0, -#endif }; struct nvdimm_drvdata { diff --git a/drivers/nvdimm/pfn_devs.c b/drivers/nvdimm/pfn_devs.c index 71805a1aa0f3..96c122918ee1 100644 --- a/drivers/nvdimm/pfn_devs.c +++ b/drivers/nvdimm/pfn_devs.c @@ -241,10 +241,6 @@ int nd_pfn_validate(struct nd_pfn *nd_pfn) if (!is_nd_pmem(nd_pfn->dev.parent)) return -ENODEV; - /* section alignment for simple hotplug */ - if (nvdimm_namespace_capacity(ndns) < ND_PFN_ALIGN) - return -ENODEV; - if (nvdimm_read_bytes(ndns, SZ_4K, pfn_sb, sizeof(*pfn_sb))) return -ENXIO; @@ -286,12 +282,7 @@ int nd_pfn_validate(struct nd_pfn *nd_pfn) */ offset = le64_to_cpu(pfn_sb->dataoff); nsio = to_nd_namespace_io(&ndns->dev); - if (nsio->res.start & ND_PFN_MASK) { - dev_err(&nd_pfn->dev, - "init failed: %s not section aligned\n", - dev_name(&ndns->dev)); - return -EBUSY; - } else if (offset >= resource_size(&nsio->res)) { + if (offset >= resource_size(&nsio->res)) { dev_err(&nd_pfn->dev, "pfn array size exceeds capacity of %s\n", dev_name(&ndns->dev)); return -EBUSY; diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index 8ee79893d2f5..520c00321dad 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -241,11 +241,6 @@ static int nd_pfn_init(struct nd_pfn *nd_pfn) if (rc == 0 || rc == -EBUSY) return rc; - /* section alignment for simple hotplug */ - if (nvdimm_namespace_capacity(ndns) < ND_PFN_ALIGN - || pmem->phys_addr & ND_PFN_MASK) - return -ENODEV; - nd_region = to_nd_region(nd_pfn->dev.parent); if (nd_region->ro) { dev_info(&nd_pfn->dev, @@ -326,16 +321,6 @@ static int nvdimm_namespace_attach_pfn(struct nd_namespace_common *ndns) if (rc) return rc; - if (PAGE_SIZE != SZ_4K) { - dev_err(dev, "only supported on systems with 4K PAGE_SIZE\n"); - return -ENXIO; - } - if (nsio->res.start & ND_PFN_MASK) { - dev_err(dev, "%s not memory hotplug section aligned\n", - dev_name(&ndns->dev)); - return -ENXIO; - } - pfn_sb = nd_pfn->pfn_sb; offset = le64_to_cpu(pfn_sb->dataoff); nd_pfn->mode = 
le32_to_cpu(nd_pfn->pfn_sb->mode); -- cgit v1.2.3 From f7c6ab80fa5fee3daccb83a3c1b3a9f39d7b931c Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Thu, 10 Dec 2015 16:11:59 -0800 Subject: libnvdimm, pfn: clean up pfn create parameters In all cases __nd_pfn_create is called with default parameters which are then overridden by values in the info block. Clean up pfn creation by dropping the parameters and setting default values internal to __nd_pfn_create. Signed-off-by: Dan Williams --- drivers/nvdimm/pfn_devs.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) (limited to 'drivers') diff --git a/drivers/nvdimm/pfn_devs.c b/drivers/nvdimm/pfn_devs.c index 96c122918ee1..613ffcca6ecb 100644 --- a/drivers/nvdimm/pfn_devs.c +++ b/drivers/nvdimm/pfn_devs.c @@ -179,7 +179,6 @@ static const struct attribute_group *nd_pfn_attribute_groups[] = { }; static struct device *__nd_pfn_create(struct nd_region *nd_region, - u8 *uuid, enum nd_pfn_mode mode, struct nd_namespace_common *ndns) { struct nd_pfn *nd_pfn; @@ -199,10 +198,7 @@ static struct device *__nd_pfn_create(struct nd_region *nd_region, return NULL; } - nd_pfn->mode = mode; - if (uuid) - uuid = kmemdup(uuid, 16, GFP_KERNEL); - nd_pfn->uuid = uuid; + nd_pfn->mode = PFN_MODE_NONE; dev = &nd_pfn->dev; dev_set_name(dev, "pfn%d.%d", nd_region->id, nd_pfn->id); dev->parent = &nd_region->dev; @@ -220,8 +216,7 @@ static struct device *__nd_pfn_create(struct nd_region *nd_region, struct device *nd_pfn_create(struct nd_region *nd_region) { - struct device *dev = __nd_pfn_create(nd_region, NULL, PFN_MODE_NONE, - NULL); + struct device *dev = __nd_pfn_create(nd_region, NULL); if (dev) __nd_device_register(dev); @@ -304,7 +299,7 @@ int nd_pfn_probe(struct nd_namespace_common *ndns, void *drvdata) return -ENODEV; nvdimm_bus_lock(&ndns->dev); - dev = __nd_pfn_create(nd_region, NULL, PFN_MODE_NONE, ndns); + dev = __nd_pfn_create(nd_region, ndns); nvdimm_bus_unlock(&ndns->dev); if (!dev) return -ENOMEM; -- cgit v1.2.3 From 315c562536c42aa4da9b6c5a2135dd6715a5e0b5 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Thu, 10 Dec 2015 14:45:23 -0800 Subject: libnvdimm, pfn: add 'align' attribute, default to HPAGE_SIZE When setting aside capacity for struct page it must be aligned to the largest mapping size that is to be made available via DAX. Make the alignment configurable to enable support for 1GiB page-size mappings. The offset for PFN_MODE_RAM may now be larger than SZ_8K, so fixup the offset check in nvdimm_namespace_attach_pfn(). 
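As a rough standalone illustration (not part of the patch) of the reservation arithmetic the pmem.c hunk below switches to, namely ALIGN(SZ_8K + 64 * npfns, align) for PFN_MODE_PMEM and ALIGN(SZ_8K, align) for PFN_MODE_RAM: the 16GiB namespace size and 4KiB page size here are assumptions chosen only to make the numbers concrete, while the 64 bytes per pfn comes from the hunk itself.

#include <stdio.h>

/* Round x up to a multiple of a (a power of two), like the kernel's ALIGN(). */
static unsigned long long align_up(unsigned long long x, unsigned long long a)
{
	return (x + a - 1) & ~(a - 1);
}

int main(void)
{
	unsigned long long ns_size = 16ULL << 30;	/* assumed 16GiB namespace */
	unsigned long long npfns = ns_size / 4096;	/* assumed 4KiB pages */
	unsigned long long aligns[] = { 1ULL << 21, 1ULL << 30 };	/* 2MiB (HPAGE_SIZE on x86), 1GiB */
	int i;

	for (i = 0; i < 2; i++)
		printf("align %llu: PFN_MODE_PMEM offset %llu, PFN_MODE_RAM offset %llu\n",
		       aligns[i],
		       align_up(8192 + 64 * npfns, aligns[i]),
		       align_up(8192, aligns[i]));
	return 0;
}

With a 1GiB alignment the PFN_MODE_RAM offset becomes 1GiB rather than SZ_8K, which is why the check in nvdimm_namespace_attach_pfn() is relaxed in the hunk below to reject only offsets smaller than SZ_8K.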
Reported-by: Toshi Kani Signed-off-by: Dan Williams --- drivers/nvdimm/nd.h | 1 + drivers/nvdimm/pfn_devs.c | 61 +++++++++++++++++++++++++++++++++++++++++++++++ drivers/nvdimm/pmem.c | 6 ++--- 3 files changed, 65 insertions(+), 3 deletions(-) (limited to 'drivers') diff --git a/drivers/nvdimm/nd.h b/drivers/nvdimm/nd.h index 2ce428e9b584..e4e9f9ae0cc8 100644 --- a/drivers/nvdimm/nd.h +++ b/drivers/nvdimm/nd.h @@ -146,6 +146,7 @@ struct nd_pfn { int id; u8 *uuid; struct device dev; + unsigned long align; unsigned long npfns; enum nd_pfn_mode mode; struct nd_pfn_sb *pfn_sb; diff --git a/drivers/nvdimm/pfn_devs.c b/drivers/nvdimm/pfn_devs.c index 613ffcca6ecb..95ecd7b0fab3 100644 --- a/drivers/nvdimm/pfn_devs.c +++ b/drivers/nvdimm/pfn_devs.c @@ -103,6 +103,52 @@ static ssize_t mode_store(struct device *dev, } static DEVICE_ATTR_RW(mode); +static ssize_t align_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nd_pfn *nd_pfn = to_nd_pfn(dev); + + return sprintf(buf, "%lx\n", nd_pfn->align); +} + +static ssize_t __align_store(struct nd_pfn *nd_pfn, const char *buf) +{ + unsigned long val; + int rc; + + rc = kstrtoul(buf, 0, &val); + if (rc) + return rc; + + if (!is_power_of_2(val) || val < PAGE_SIZE || val > SZ_1G) + return -EINVAL; + + if (nd_pfn->dev.driver) + return -EBUSY; + else + nd_pfn->align = val; + + return 0; +} + +static ssize_t align_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t len) +{ + struct nd_pfn *nd_pfn = to_nd_pfn(dev); + ssize_t rc; + + device_lock(dev); + nvdimm_bus_lock(dev); + rc = __align_store(nd_pfn, buf); + dev_dbg(dev, "%s: result: %zd wrote: %s%s", __func__, + rc, buf, buf[len - 1] == '\n' ? "" : "\n"); + nvdimm_bus_unlock(dev); + device_unlock(dev); + + return rc ? rc : len; +} +static DEVICE_ATTR_RW(align); + static ssize_t uuid_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -164,6 +210,7 @@ static struct attribute *nd_pfn_attributes[] = { &dev_attr_mode.attr, &dev_attr_namespace.attr, &dev_attr_uuid.attr, + &dev_attr_align.attr, NULL, }; @@ -199,6 +246,7 @@ static struct device *__nd_pfn_create(struct nd_region *nd_region, } nd_pfn->mode = PFN_MODE_NONE; + nd_pfn->align = HPAGE_SIZE; dev = &nd_pfn->dev; dev_set_name(dev, "pfn%d.%d", nd_region->id, nd_pfn->id); dev->parent = &nd_region->dev; @@ -269,6 +317,12 @@ int nd_pfn_validate(struct nd_pfn *nd_pfn) return -EINVAL; } + if (nd_pfn->align > nvdimm_namespace_capacity(ndns)) { + dev_err(&nd_pfn->dev, "alignment: %lx exceeds capacity %llx\n", + nd_pfn->align, nvdimm_namespace_capacity(ndns)); + return -EINVAL; + } + /* * These warnings are verbose because they can only trigger in * the case where the physical address alignment of the @@ -283,6 +337,13 @@ int nd_pfn_validate(struct nd_pfn *nd_pfn) return -EBUSY; } + nd_pfn->align = 1UL << ilog2(offset); + if (!is_power_of_2(offset) || offset < PAGE_SIZE) { + dev_err(&nd_pfn->dev, "bad offset: %#llx dax disabled\n", + offset); + return -ENXIO; + } + return 0; } EXPORT_SYMBOL(nd_pfn_validate); diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index 520c00321dad..5ba351e4f26a 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -258,9 +258,9 @@ static int nd_pfn_init(struct nd_pfn *nd_pfn) * ->direct_access() to those that are included in the memmap. 
*/ if (nd_pfn->mode == PFN_MODE_PMEM) - offset = ALIGN(SZ_8K + 64 * npfns, PMD_SIZE); + offset = ALIGN(SZ_8K + 64 * npfns, nd_pfn->align); else if (nd_pfn->mode == PFN_MODE_RAM) - offset = SZ_8K; + offset = ALIGN(SZ_8K, nd_pfn->align); else goto err; @@ -325,7 +325,7 @@ static int nvdimm_namespace_attach_pfn(struct nd_namespace_common *ndns) offset = le64_to_cpu(pfn_sb->dataoff); nd_pfn->mode = le32_to_cpu(nd_pfn->pfn_sb->mode); if (nd_pfn->mode == PFN_MODE_RAM) { - if (offset != SZ_8K) + if (offset < SZ_8K) return -EINVAL; nd_pfn->npfns = le64_to_cpu(pfn_sb->npfns); altmap = NULL; -- cgit v1.2.3 From a34d5e8a6ad1a31b186019c9c351777626698863 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Sat, 12 Dec 2015 16:09:14 -0800 Subject: libnvdimm, pfn: add parent uuid validation Track and check the uuid of the namespace hosting a pfn instance. This forces the pfn info block to be invalidated if the namespace is re-configured with a different uuid. Signed-off-by: Dan Williams --- drivers/nvdimm/pfn_devs.c | 10 +++++++--- drivers/nvdimm/pmem.c | 1 + 2 files changed, 8 insertions(+), 3 deletions(-) (limited to 'drivers') diff --git a/drivers/nvdimm/pfn_devs.c b/drivers/nvdimm/pfn_devs.c index 95ecd7b0fab3..f9b674bc49db 100644 --- a/drivers/nvdimm/pfn_devs.c +++ b/drivers/nvdimm/pfn_devs.c @@ -273,10 +273,11 @@ struct device *nd_pfn_create(struct nd_region *nd_region) int nd_pfn_validate(struct nd_pfn *nd_pfn) { - struct nd_namespace_common *ndns = nd_pfn->ndns; - struct nd_pfn_sb *pfn_sb = nd_pfn->pfn_sb; - struct nd_namespace_io *nsio; u64 checksum, offset; + struct nd_namespace_io *nsio; + struct nd_pfn_sb *pfn_sb = nd_pfn->pfn_sb; + struct nd_namespace_common *ndns = nd_pfn->ndns; + const u8 *parent_uuid = nd_dev_to_uuid(&ndns->dev); if (!pfn_sb || !ndns) return -ENODEV; @@ -296,6 +297,9 @@ int nd_pfn_validate(struct nd_pfn *nd_pfn) return -ENODEV; pfn_sb->checksum = cpu_to_le64(checksum); + if (memcmp(pfn_sb->parent_uuid, parent_uuid, 16) != 0) + return -ENODEV; + switch (le32_to_cpu(pfn_sb->mode)) { case PFN_MODE_RAM: break; diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index 5ba351e4f26a..dc6866734f70 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -270,6 +270,7 @@ static int nd_pfn_init(struct nd_pfn *nd_pfn) pfn_sb->npfns = cpu_to_le64(npfns); memcpy(pfn_sb->signature, PFN_SIG, PFN_SIG_LEN); memcpy(pfn_sb->uuid, nd_pfn->uuid, 16); + memcpy(pfn_sb->parent_uuid, nd_dev_to_uuid(&ndns->dev), 16); pfn_sb->version_major = cpu_to_le16(1); checksum = nd_sb_checksum((struct nd_gen_sb *) pfn_sb); pfn_sb->checksum = cpu_to_le64(checksum); -- cgit v1.2.3 From 2dc43331e34fa992a67f42ed44e5111cafafd6f3 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Sun, 13 Dec 2015 11:41:36 -0800 Subject: libnvdimm, pfn: fix pfn seed creation Similar to btt, plant a new pfn seed when the existing one is activated. 
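A condensed sketch of the invariant this establishes (the function name here is hypothetical; the real change is the region_devs.c hunk below): each region keeps one unconfigured pfn device around as its seed, and as soon as that seed is successfully probed a replacement is planted so userspace always has a fresh device to configure.

/* Illustrative only: keep exactly one unconfigured pfn seed per region. */
static void pfn_probe_succeeded(struct nd_region *nd_region, struct device *dev)
{
	nvdimm_bus_lock(dev);
	if (nd_region->pfn_seed == dev)		/* the seed was just consumed */
		nd_region_create_pfn_seed(nd_region);	/* plant the next one */
	nvdimm_bus_unlock(dev);
}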
Signed-off-by: Dan Williams --- drivers/nvdimm/namespace_devs.c | 12 ++++++++++++ drivers/nvdimm/nd-core.h | 1 + drivers/nvdimm/region_devs.c | 7 +++++++ 3 files changed, 20 insertions(+) (limited to 'drivers') diff --git a/drivers/nvdimm/namespace_devs.c b/drivers/nvdimm/namespace_devs.c index f981177524a1..ea8dd0cb28ca 100644 --- a/drivers/nvdimm/namespace_devs.c +++ b/drivers/nvdimm/namespace_devs.c @@ -1707,6 +1707,18 @@ void nd_region_create_blk_seed(struct nd_region *nd_region) nd_device_register(nd_region->ns_seed); } +void nd_region_create_pfn_seed(struct nd_region *nd_region) +{ + WARN_ON(!is_nvdimm_bus_locked(&nd_region->dev)); + nd_region->pfn_seed = nd_pfn_create(nd_region); + /* + * Seed creation failures are not fatal, provisioning is simply + * disabled until memory becomes available + */ + if (!nd_region->pfn_seed) + dev_err(&nd_region->dev, "failed to create pfn namespace\n"); +} + void nd_region_create_btt_seed(struct nd_region *nd_region) { WARN_ON(!is_nvdimm_bus_locked(&nd_region->dev)); diff --git a/drivers/nvdimm/nd-core.h b/drivers/nvdimm/nd-core.h index 159aed532042..3249c498892a 100644 --- a/drivers/nvdimm/nd-core.h +++ b/drivers/nvdimm/nd-core.h @@ -52,6 +52,7 @@ void nd_region_probe_success(struct nvdimm_bus *nvdimm_bus, struct device *dev); struct nd_region; void nd_region_create_blk_seed(struct nd_region *nd_region); void nd_region_create_btt_seed(struct nd_region *nd_region); +void nd_region_create_pfn_seed(struct nd_region *nd_region); void nd_region_disable(struct nvdimm_bus *nvdimm_bus, struct device *dev); int nvdimm_bus_create_ndctl(struct nvdimm_bus *nvdimm_bus); void nvdimm_bus_destroy_ndctl(struct nvdimm_bus *nvdimm_bus); diff --git a/drivers/nvdimm/region_devs.c b/drivers/nvdimm/region_devs.c index 3d730d1f25db..9c632f73915e 100644 --- a/drivers/nvdimm/region_devs.c +++ b/drivers/nvdimm/region_devs.c @@ -490,6 +490,13 @@ static void nd_region_notify_driver_action(struct nvdimm_bus *nvdimm_bus, nd_region_create_blk_seed(nd_region); nvdimm_bus_unlock(dev); } + if (is_nd_pfn(dev) && probe) { + nd_region = to_nd_region(dev->parent); + nvdimm_bus_lock(dev); + if (nd_region->pfn_seed == dev) + nd_region_create_pfn_seed(nd_region); + nvdimm_bus_unlock(dev); + } } void nd_region_probe_success(struct nvdimm_bus *nvdimm_bus, struct device *dev) -- cgit v1.2.3 From 3fa962686568a1617d174e3d2f5d522e963077c5 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Sun, 13 Dec 2015 11:35:52 -0800 Subject: libnvdimm, pfn: fix nd_pfn_validate() return value handling The -ENODEV case indicates that the info-block needs to established. All other return codes cause nd_pfn_init() to abort. Signed-off-by: Dan Williams --- drivers/nvdimm/pmem.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index dc6866734f70..ab689bca727d 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -238,7 +238,9 @@ static int nd_pfn_init(struct nd_pfn *nd_pfn) nd_pfn->pfn_sb = pfn_sb; rc = nd_pfn_validate(nd_pfn); - if (rc == 0 || rc == -EBUSY) + if (rc == -ENODEV) + /* no info block, do init */; + else return rc; nd_region = to_nd_region(nd_pfn->dev.parent); -- cgit v1.2.3 From 0731de0dd95b251ed6cfb5f132486e52357fce53 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Mon, 14 Dec 2015 15:34:15 -0800 Subject: libnvdimm, pfn: move 'memory mode' indication to sysfs 'Memory mode' is defined as the capability of a DAX mapping to be the source/target of DMA and other "direct I/O" scenarios. 
While it currently requires allocating 'struct page' for each page frame of persistent memory in the namespace it will not always be the case. Work continues on reducing the kernel's dependency on 'struct page'. Let's not maintain a suffix that is expected to lose meaning over time. In other words a future 'raw mode' pmem namespace may be as capable as today's 'memory mode' namespace. Undo the encoding of the mode in the device name and leave it to other tooling to determine the mode of the namespace from its attributes. Reported-by: Matthew Wilcox Signed-off-by: Dan Williams --- drivers/nvdimm/namespace_devs.c | 41 ++++++++++++++++++++++++++++------------- 1 file changed, 28 insertions(+), 13 deletions(-) (limited to 'drivers') diff --git a/drivers/nvdimm/namespace_devs.c b/drivers/nvdimm/namespace_devs.c index ea8dd0cb28ca..ee6dee41155a 100644 --- a/drivers/nvdimm/namespace_devs.c +++ b/drivers/nvdimm/namespace_devs.c @@ -104,20 +104,10 @@ const char *nvdimm_namespace_disk_name(struct nd_namespace_common *ndns, struct nd_region *nd_region = to_nd_region(ndns->dev.parent); const char *suffix = NULL; - if (ndns->claim) { - if (is_nd_btt(ndns->claim)) - suffix = "s"; - else if (is_nd_pfn(ndns->claim)) - suffix = "m"; - else - dev_WARN_ONCE(&ndns->dev, 1, - "unknown claim type by %s\n", - dev_name(ndns->claim)); - } + if (ndns->claim && is_nd_btt(ndns->claim)) + suffix = "s"; if (is_namespace_pmem(&ndns->dev) || is_namespace_io(&ndns->dev)) { - if (!suffix && pmem_should_map_pages(&ndns->dev)) - suffix = "m"; sprintf(name, "pmem%d%s", nd_region->id, suffix ? suffix : ""); } else if (is_namespace_blk(&ndns->dev)) { struct nd_namespace_blk *nsblk; @@ -1224,6 +1214,29 @@ static ssize_t holder_show(struct device *dev, } static DEVICE_ATTR_RO(holder); +static ssize_t mode_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nd_namespace_common *ndns = to_ndns(dev); + struct device *claim; + char *mode; + ssize_t rc; + + device_lock(dev); + claim = ndns->claim; + if (pmem_should_map_pages(dev) || (claim && is_nd_pfn(claim))) + mode = "memory"; + else if (claim && is_nd_btt(claim)) + mode = "safe"; + else + mode = "raw"; + rc = sprintf(buf, "%s\n", mode); + device_unlock(dev); + + return rc; +} +static DEVICE_ATTR_RO(mode); + static ssize_t force_raw_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) { @@ -1247,6 +1260,7 @@ static DEVICE_ATTR_RW(force_raw); static struct attribute *nd_namespace_attributes[] = { &dev_attr_nstype.attr, &dev_attr_size.attr, + &dev_attr_mode.attr, &dev_attr_uuid.attr, &dev_attr_holder.attr, &dev_attr_resource.attr, @@ -1280,7 +1294,8 @@ static umode_t namespace_visible(struct kobject *kobj, if (a == &dev_attr_nstype.attr || a == &dev_attr_size.attr || a == &dev_attr_holder.attr - || a == &dev_attr_force_raw.attr) + || a == &dev_attr_force_raw.attr + || a == &dev_attr_mode.attr) return a->mode; return 0; -- cgit v1.2.3 From e07ecd76d4db7bda1e9495395b2110a3fe28845a Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Tue, 5 Jan 2016 18:37:23 -0800 Subject: libnvdimm: fix namespace object confusion in is_uuid_busy() When btt devices were re-worked to be child devices of regions this routine was overlooked. It mistakenly attempts to_nd_namespace_pmem() or to_nd_namespace_blk() conversions on btt and pfn devices. 
By luck to date we have happened to be hitting valid memory leading to a uuid miscompare, but a recent change to struct nd_namespace_common causes: BUG: unable to handle kernel NULL pointer dereference at 0000000000000001 IP: [] memcmp+0xc/0x40 [..] Call Trace: [] is_uuid_busy+0xc1/0x2a0 [libnvdimm] [] ? to_nd_blk_region+0x50/0x50 [libnvdimm] [] device_for_each_child+0x50/0x90 Cc: Signed-off-by: Dan Williams --- drivers/nvdimm/namespace_devs.c | 53 ++++++++++++++++++++++++++++++++++++++ drivers/nvdimm/region_devs.c | 56 ----------------------------------------- 2 files changed, 53 insertions(+), 56 deletions(-) (limited to 'drivers') diff --git a/drivers/nvdimm/namespace_devs.c b/drivers/nvdimm/namespace_devs.c index ee6dee41155a..8ebfcaae3f5a 100644 --- a/drivers/nvdimm/namespace_devs.c +++ b/drivers/nvdimm/namespace_devs.c @@ -77,6 +77,59 @@ static bool is_namespace_io(struct device *dev) return dev ? dev->type == &namespace_io_device_type : false; } +static int is_uuid_busy(struct device *dev, void *data) +{ + u8 *uuid1 = data, *uuid2 = NULL; + + if (is_namespace_pmem(dev)) { + struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev); + + uuid2 = nspm->uuid; + } else if (is_namespace_blk(dev)) { + struct nd_namespace_blk *nsblk = to_nd_namespace_blk(dev); + + uuid2 = nsblk->uuid; + } else if (is_nd_btt(dev)) { + struct nd_btt *nd_btt = to_nd_btt(dev); + + uuid2 = nd_btt->uuid; + } else if (is_nd_pfn(dev)) { + struct nd_pfn *nd_pfn = to_nd_pfn(dev); + + uuid2 = nd_pfn->uuid; + } + + if (uuid2 && memcmp(uuid1, uuid2, NSLABEL_UUID_LEN) == 0) + return -EBUSY; + + return 0; +} + +static int is_namespace_uuid_busy(struct device *dev, void *data) +{ + if (is_nd_pmem(dev) || is_nd_blk(dev)) + return device_for_each_child(dev, data, is_uuid_busy); + return 0; +} + +/** + * nd_is_uuid_unique - verify that no other namespace has @uuid + * @dev: any device on a nvdimm_bus + * @uuid: uuid to check + */ +bool nd_is_uuid_unique(struct device *dev, u8 *uuid) +{ + struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(dev); + + if (!nvdimm_bus) + return false; + WARN_ON_ONCE(!is_nvdimm_bus_locked(&nvdimm_bus->dev)); + if (device_for_each_child(&nvdimm_bus->dev, uuid, + is_namespace_uuid_busy) != 0) + return false; + return true; +} + bool pmem_should_map_pages(struct device *dev) { struct nd_region *nd_region = to_nd_region(dev->parent); diff --git a/drivers/nvdimm/region_devs.c b/drivers/nvdimm/region_devs.c index 9c632f73915e..139bf71ca549 100644 --- a/drivers/nvdimm/region_devs.c +++ b/drivers/nvdimm/region_devs.c @@ -134,62 +134,6 @@ int nd_region_to_nstype(struct nd_region *nd_region) } EXPORT_SYMBOL(nd_region_to_nstype); -static int is_uuid_busy(struct device *dev, void *data) -{ - struct nd_region *nd_region = to_nd_region(dev->parent); - u8 *uuid = data; - - switch (nd_region_to_nstype(nd_region)) { - case ND_DEVICE_NAMESPACE_PMEM: { - struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev); - - if (!nspm->uuid) - break; - if (memcmp(uuid, nspm->uuid, NSLABEL_UUID_LEN) == 0) - return -EBUSY; - break; - } - case ND_DEVICE_NAMESPACE_BLK: { - struct nd_namespace_blk *nsblk = to_nd_namespace_blk(dev); - - if (!nsblk->uuid) - break; - if (memcmp(uuid, nsblk->uuid, NSLABEL_UUID_LEN) == 0) - return -EBUSY; - break; - } - default: - break; - } - - return 0; -} - -static int is_namespace_uuid_busy(struct device *dev, void *data) -{ - if (is_nd_pmem(dev) || is_nd_blk(dev)) - return device_for_each_child(dev, data, is_uuid_busy); - return 0; -} - -/** - * nd_is_uuid_unique - verify that no other namespace 
has @uuid - * @dev: any device on a nvdimm_bus - * @uuid: uuid to check - */ -bool nd_is_uuid_unique(struct device *dev, u8 *uuid) -{ - struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(dev); - - if (!nvdimm_bus) - return false; - WARN_ON_ONCE(!is_nvdimm_bus_locked(&nvdimm_bus->dev)); - if (device_for_each_child(&nvdimm_bus->dev, uuid, - is_namespace_uuid_busy) != 0) - return false; - return true; -} - static ssize_t size_show(struct device *dev, struct device_attribute *attr, char *buf) { -- cgit v1.2.3 From fc974ee2bffdde47d1e4b220cf326952cc2c4794 Mon Sep 17 00:00:00 2001 From: Vishal Verma Date: Thu, 24 Dec 2015 19:20:34 -0700 Subject: md: convert to use the generic badblocks code Retain badblocks as part of rdev, but use the accessor functions from include/linux/badblocks for all manipulation. Signed-off-by: Vishal Verma Signed-off-by: Dan Williams --- drivers/md/md.c | 516 +++----------------------------------------------------- drivers/md/md.h | 40 +---- 2 files changed, 28 insertions(+), 528 deletions(-) (limited to 'drivers') diff --git a/drivers/md/md.c b/drivers/md/md.c index 807095f4c793..1e48aa9de352 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -34,6 +34,7 @@ #include #include +#include #include #include #include @@ -709,8 +710,7 @@ void md_rdev_clear(struct md_rdev *rdev) put_page(rdev->bb_page); rdev->bb_page = NULL; } - kfree(rdev->badblocks.page); - rdev->badblocks.page = NULL; + badblocks_free(&rdev->badblocks); } EXPORT_SYMBOL_GPL(md_rdev_clear); @@ -1360,8 +1360,6 @@ static __le32 calc_sb_1_csum(struct mdp_superblock_1 *sb) return cpu_to_le32(csum); } -static int md_set_badblocks(struct badblocks *bb, sector_t s, int sectors, - int acknowledged); static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_version) { struct mdp_superblock_1 *sb; @@ -1486,8 +1484,7 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_ count <<= sb->bblog_shift; if (bb + 1 == 0) break; - if (md_set_badblocks(&rdev->badblocks, - sector, count, 1) == 0) + if (badblocks_set(&rdev->badblocks, sector, count, 1)) return -EINVAL; } } else if (sb->bblog_offset != 0) @@ -2319,7 +2316,7 @@ repeat: rdev_for_each(rdev, mddev) { if (rdev->badblocks.changed) { rdev->badblocks.changed = 0; - md_ack_all_badblocks(&rdev->badblocks); + ack_all_badblocks(&rdev->badblocks); md_error(mddev, rdev); } clear_bit(Blocked, &rdev->flags); @@ -2445,7 +2442,7 @@ repeat: clear_bit(Blocked, &rdev->flags); if (any_badblocks_changed) - md_ack_all_badblocks(&rdev->badblocks); + ack_all_badblocks(&rdev->badblocks); clear_bit(BlockedBadBlocks, &rdev->flags); wake_up(&rdev->blocked_wait); } @@ -3046,11 +3043,17 @@ static ssize_t recovery_start_store(struct md_rdev *rdev, const char *buf, size_ static struct rdev_sysfs_entry rdev_recovery_start = __ATTR(recovery_start, S_IRUGO|S_IWUSR, recovery_start_show, recovery_start_store); -static ssize_t -badblocks_show(struct badblocks *bb, char *page, int unack); -static ssize_t -badblocks_store(struct badblocks *bb, const char *page, size_t len, int unack); - +/* sysfs access to bad-blocks list. + * We present two files. + * 'bad-blocks' lists sector numbers and lengths of ranges that + * are recorded as bad. The list is truncated to fit within + * the one-page limit of sysfs. + * Writing "sector length" to this file adds an acknowledged + * bad block list. + * 'unacknowledged-bad-blocks' lists bad blocks that have not yet + * been acknowledged. Writing to this file adds bad blocks + * without acknowledging them. 
This is largely for testing. + */ static ssize_t bb_show(struct md_rdev *rdev, char *page) { return badblocks_show(&rdev->badblocks, page, 0); @@ -3165,14 +3168,7 @@ int md_rdev_init(struct md_rdev *rdev) * This reserves the space even on arrays where it cannot * be used - I wonder if that matters */ - rdev->badblocks.count = 0; - rdev->badblocks.shift = -1; /* disabled until explicitly enabled */ - rdev->badblocks.page = kmalloc(PAGE_SIZE, GFP_KERNEL); - seqlock_init(&rdev->badblocks.lock); - if (rdev->badblocks.page == NULL) - return -ENOMEM; - - return 0; + return badblocks_init(&rdev->badblocks, 0); } EXPORT_SYMBOL_GPL(md_rdev_init); /* @@ -8478,254 +8474,9 @@ void md_finish_reshape(struct mddev *mddev) } EXPORT_SYMBOL(md_finish_reshape); -/* Bad block management. - * We can record which blocks on each device are 'bad' and so just - * fail those blocks, or that stripe, rather than the whole device. - * Entries in the bad-block table are 64bits wide. This comprises: - * Length of bad-range, in sectors: 0-511 for lengths 1-512 - * Start of bad-range, sector offset, 54 bits (allows 8 exbibytes) - * A 'shift' can be set so that larger blocks are tracked and - * consequently larger devices can be covered. - * 'Acknowledged' flag - 1 bit. - the most significant bit. - * - * Locking of the bad-block table uses a seqlock so md_is_badblock - * might need to retry if it is very unlucky. - * We will sometimes want to check for bad blocks in a bi_end_io function, - * so we use the write_seqlock_irq variant. - * - * When looking for a bad block we specify a range and want to - * know if any block in the range is bad. So we binary-search - * to the last range that starts at-or-before the given endpoint, - * (or "before the sector after the target range") - * then see if it ends after the given start. - * We return - * 0 if there are no known bad blocks in the range - * 1 if there are known bad block which are all acknowledged - * -1 if there are bad blocks which have not yet been acknowledged in metadata. - * plus the start/length of the first bad section we overlap. - */ -int md_is_badblock(struct badblocks *bb, sector_t s, int sectors, - sector_t *first_bad, int *bad_sectors) -{ - int hi; - int lo; - u64 *p = bb->page; - int rv; - sector_t target = s + sectors; - unsigned seq; - - if (bb->shift > 0) { - /* round the start down, and the end up */ - s >>= bb->shift; - target += (1<shift) - 1; - target >>= bb->shift; - sectors = target - s; - } - /* 'target' is now the first block after the bad range */ - -retry: - seq = read_seqbegin(&bb->lock); - lo = 0; - rv = 0; - hi = bb->count; - - /* Binary search between lo and hi for 'target' - * i.e. for the last range that starts before 'target' - */ - /* INVARIANT: ranges before 'lo' and at-or-after 'hi' - * are known not to be the last range before target. - * VARIANT: hi-lo is the number of possible - * ranges, and decreases until it reaches 1 - */ - while (hi - lo > 1) { - int mid = (lo + hi) / 2; - sector_t a = BB_OFFSET(p[mid]); - if (a < target) - /* This could still be the one, earlier ranges - * could not. */ - lo = mid; - else - /* This and later ranges are definitely out. */ - hi = mid; - } - /* 'lo' might be the last that started before target, but 'hi' isn't */ - if (hi > lo) { - /* need to check all range that end after 's' to see if - * any are unacknowledged. 
- */ - while (lo >= 0 && - BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) { - if (BB_OFFSET(p[lo]) < target) { - /* starts before the end, and finishes after - * the start, so they must overlap - */ - if (rv != -1 && BB_ACK(p[lo])) - rv = 1; - else - rv = -1; - *first_bad = BB_OFFSET(p[lo]); - *bad_sectors = BB_LEN(p[lo]); - } - lo--; - } - } - - if (read_seqretry(&bb->lock, seq)) - goto retry; - - return rv; -} -EXPORT_SYMBOL_GPL(md_is_badblock); - -/* - * Add a range of bad blocks to the table. - * This might extend the table, or might contract it - * if two adjacent ranges can be merged. - * We binary-search to find the 'insertion' point, then - * decide how best to handle it. - */ -static int md_set_badblocks(struct badblocks *bb, sector_t s, int sectors, - int acknowledged) -{ - u64 *p; - int lo, hi; - int rv = 1; - unsigned long flags; - - if (bb->shift < 0) - /* badblocks are disabled */ - return 0; - - if (bb->shift) { - /* round the start down, and the end up */ - sector_t next = s + sectors; - s >>= bb->shift; - next += (1<shift) - 1; - next >>= bb->shift; - sectors = next - s; - } - - write_seqlock_irqsave(&bb->lock, flags); - - p = bb->page; - lo = 0; - hi = bb->count; - /* Find the last range that starts at-or-before 's' */ - while (hi - lo > 1) { - int mid = (lo + hi) / 2; - sector_t a = BB_OFFSET(p[mid]); - if (a <= s) - lo = mid; - else - hi = mid; - } - if (hi > lo && BB_OFFSET(p[lo]) > s) - hi = lo; - - if (hi > lo) { - /* we found a range that might merge with the start - * of our new range - */ - sector_t a = BB_OFFSET(p[lo]); - sector_t e = a + BB_LEN(p[lo]); - int ack = BB_ACK(p[lo]); - if (e >= s) { - /* Yes, we can merge with a previous range */ - if (s == a && s + sectors >= e) - /* new range covers old */ - ack = acknowledged; - else - ack = ack && acknowledged; - - if (e < s + sectors) - e = s + sectors; - if (e - a <= BB_MAX_LEN) { - p[lo] = BB_MAKE(a, e-a, ack); - s = e; - } else { - /* does not all fit in one range, - * make p[lo] maximal - */ - if (BB_LEN(p[lo]) != BB_MAX_LEN) - p[lo] = BB_MAKE(a, BB_MAX_LEN, ack); - s = a + BB_MAX_LEN; - } - sectors = e - s; - } - } - if (sectors && hi < bb->count) { - /* 'hi' points to the first range that starts after 's'. - * Maybe we can merge with the start of that range */ - sector_t a = BB_OFFSET(p[hi]); - sector_t e = a + BB_LEN(p[hi]); - int ack = BB_ACK(p[hi]); - if (a <= s + sectors) { - /* merging is possible */ - if (e <= s + sectors) { - /* full overlap */ - e = s + sectors; - ack = acknowledged; - } else - ack = ack && acknowledged; - - a = s; - if (e - a <= BB_MAX_LEN) { - p[hi] = BB_MAKE(a, e-a, ack); - s = e; - } else { - p[hi] = BB_MAKE(a, BB_MAX_LEN, ack); - s = a + BB_MAX_LEN; - } - sectors = e - s; - lo = hi; - hi++; - } - } - if (sectors == 0 && hi < bb->count) { - /* we might be able to combine lo and hi */ - /* Note: 's' is at the end of 'lo' */ - sector_t a = BB_OFFSET(p[hi]); - int lolen = BB_LEN(p[lo]); - int hilen = BB_LEN(p[hi]); - int newlen = lolen + hilen - (s - a); - if (s >= a && newlen < BB_MAX_LEN) { - /* yes, we can combine them */ - int ack = BB_ACK(p[lo]) && BB_ACK(p[hi]); - p[lo] = BB_MAKE(BB_OFFSET(p[lo]), newlen, ack); - memmove(p + hi, p + hi + 1, - (bb->count - hi - 1) * 8); - bb->count--; - } - } - while (sectors) { - /* didn't merge (it all). 
- * Need to add a range just before 'hi' */ - if (bb->count >= MD_MAX_BADBLOCKS) { - /* No room for more */ - rv = 0; - break; - } else { - int this_sectors = sectors; - memmove(p + hi + 1, p + hi, - (bb->count - hi) * 8); - bb->count++; - - if (this_sectors > BB_MAX_LEN) - this_sectors = BB_MAX_LEN; - p[hi] = BB_MAKE(s, this_sectors, acknowledged); - sectors -= this_sectors; - s += this_sectors; - } - } - - bb->changed = 1; - if (!acknowledged) - bb->unacked_exist = 1; - write_sequnlock_irqrestore(&bb->lock, flags); - - return rv; -} +/* Bad block management */ +/* Returns 1 on success, 0 on failure */ int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors, int is_new) { @@ -8734,114 +8485,19 @@ int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors, s += rdev->new_data_offset; else s += rdev->data_offset; - rv = md_set_badblocks(&rdev->badblocks, - s, sectors, 0); - if (rv) { + rv = badblocks_set(&rdev->badblocks, s, sectors, 0); + if (rv == 0) { /* Make sure they get written out promptly */ sysfs_notify_dirent_safe(rdev->sysfs_state); set_bit(MD_CHANGE_CLEAN, &rdev->mddev->flags); set_bit(MD_CHANGE_PENDING, &rdev->mddev->flags); md_wakeup_thread(rdev->mddev->thread); - } - return rv; + return 1; + } else + return 0; } EXPORT_SYMBOL_GPL(rdev_set_badblocks); -/* - * Remove a range of bad blocks from the table. - * This may involve extending the table if we spilt a region, - * but it must not fail. So if the table becomes full, we just - * drop the remove request. - */ -static int md_clear_badblocks(struct badblocks *bb, sector_t s, int sectors) -{ - u64 *p; - int lo, hi; - sector_t target = s + sectors; - int rv = 0; - - if (bb->shift > 0) { - /* When clearing we round the start up and the end down. - * This should not matter as the shift should align with - * the block size and no rounding should ever be needed. - * However it is better the think a block is bad when it - * isn't than to think a block is not bad when it is. - */ - s += (1<shift) - 1; - s >>= bb->shift; - target >>= bb->shift; - sectors = target - s; - } - - write_seqlock_irq(&bb->lock); - - p = bb->page; - lo = 0; - hi = bb->count; - /* Find the last range that starts before 'target' */ - while (hi - lo > 1) { - int mid = (lo + hi) / 2; - sector_t a = BB_OFFSET(p[mid]); - if (a < target) - lo = mid; - else - hi = mid; - } - if (hi > lo) { - /* p[lo] is the last range that could overlap the - * current range. Earlier ranges could also overlap, - * but only this one can overlap the end of the range. - */ - if (BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > target) { - /* Partial overlap, leave the tail of this range */ - int ack = BB_ACK(p[lo]); - sector_t a = BB_OFFSET(p[lo]); - sector_t end = a + BB_LEN(p[lo]); - - if (a < s) { - /* we need to split this range */ - if (bb->count >= MD_MAX_BADBLOCKS) { - rv = -ENOSPC; - goto out; - } - memmove(p+lo+1, p+lo, (bb->count - lo) * 8); - bb->count++; - p[lo] = BB_MAKE(a, s-a, ack); - lo++; - } - p[lo] = BB_MAKE(target, end - target, ack); - /* there is no longer an overlap */ - hi = lo; - lo--; - } - while (lo >= 0 && - BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) { - /* This range does overlap */ - if (BB_OFFSET(p[lo]) < s) { - /* Keep the early parts of this range. */ - int ack = BB_ACK(p[lo]); - sector_t start = BB_OFFSET(p[lo]); - p[lo] = BB_MAKE(start, s - start, ack); - /* now low doesn't overlap, so.. 
*/ - break; - } - lo--; - } - /* 'lo' is strictly before, 'hi' is strictly after, - * anything between needs to be discarded - */ - if (hi - lo > 1) { - memmove(p+lo+1, p+hi, (bb->count - hi) * 8); - bb->count -= (hi - lo - 1); - } - } - - bb->changed = 1; -out: - write_sequnlock_irq(&bb->lock); - return rv; -} - int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors, int is_new) { @@ -8849,133 +8505,11 @@ int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors, s += rdev->new_data_offset; else s += rdev->data_offset; - return md_clear_badblocks(&rdev->badblocks, + return badblocks_clear(&rdev->badblocks, s, sectors); } EXPORT_SYMBOL_GPL(rdev_clear_badblocks); -/* - * Acknowledge all bad blocks in a list. - * This only succeeds if ->changed is clear. It is used by - * in-kernel metadata updates - */ -void md_ack_all_badblocks(struct badblocks *bb) -{ - if (bb->page == NULL || bb->changed) - /* no point even trying */ - return; - write_seqlock_irq(&bb->lock); - - if (bb->changed == 0 && bb->unacked_exist) { - u64 *p = bb->page; - int i; - for (i = 0; i < bb->count ; i++) { - if (!BB_ACK(p[i])) { - sector_t start = BB_OFFSET(p[i]); - int len = BB_LEN(p[i]); - p[i] = BB_MAKE(start, len, 1); - } - } - bb->unacked_exist = 0; - } - write_sequnlock_irq(&bb->lock); -} -EXPORT_SYMBOL_GPL(md_ack_all_badblocks); - -/* sysfs access to bad-blocks list. - * We present two files. - * 'bad-blocks' lists sector numbers and lengths of ranges that - * are recorded as bad. The list is truncated to fit within - * the one-page limit of sysfs. - * Writing "sector length" to this file adds an acknowledged - * bad block list. - * 'unacknowledged-bad-blocks' lists bad blocks that have not yet - * been acknowledged. Writing to this file adds bad blocks - * without acknowledging them. This is largely for testing. - */ - -static ssize_t -badblocks_show(struct badblocks *bb, char *page, int unack) -{ - size_t len; - int i; - u64 *p = bb->page; - unsigned seq; - - if (bb->shift < 0) - return 0; - -retry: - seq = read_seqbegin(&bb->lock); - - len = 0; - i = 0; - - while (len < PAGE_SIZE && i < bb->count) { - sector_t s = BB_OFFSET(p[i]); - unsigned int length = BB_LEN(p[i]); - int ack = BB_ACK(p[i]); - i++; - - if (unack && ack) - continue; - - len += snprintf(page+len, PAGE_SIZE-len, "%llu %u\n", - (unsigned long long)s << bb->shift, - length << bb->shift); - } - if (unack && len == 0) - bb->unacked_exist = 0; - - if (read_seqretry(&bb->lock, seq)) - goto retry; - - return len; -} - -#define DO_DEBUG 1 - -static ssize_t -badblocks_store(struct badblocks *bb, const char *page, size_t len, int unack) -{ - unsigned long long sector; - int length; - char newline; -#ifdef DO_DEBUG - /* Allow clearing via sysfs *only* for testing/debugging. 
- * Normally only a successful write may clear a badblock - */ - int clear = 0; - if (page[0] == '-') { - clear = 1; - page++; - } -#endif /* DO_DEBUG */ - - switch (sscanf(page, "%llu %d%c", §or, &length, &newline)) { - case 3: - if (newline != '\n') - return -EINVAL; - case 2: - if (length <= 0) - return -EINVAL; - break; - default: - return -EINVAL; - } - -#ifdef DO_DEBUG - if (clear) { - md_clear_badblocks(bb, sector, length); - return len; - } -#endif /* DO_DEBUG */ - if (md_set_badblocks(bb, sector, length, !unack)) - return len; - else - return -ENOSPC; -} - static int md_notify_reboot(struct notifier_block *this, unsigned long code, void *x) { diff --git a/drivers/md/md.h b/drivers/md/md.h index 2bea51edfab7..389afc420db6 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -17,6 +17,7 @@ #include #include +#include #include #include #include @@ -28,13 +29,6 @@ #define MaxSector (~(sector_t)0) -/* Bad block numbers are stored sorted in a single page. - * 64bits is used for each block or extent. - * 54 bits are sector number, 9 bits are extent size, - * 1 bit is an 'acknowledged' flag. - */ -#define MD_MAX_BADBLOCKS (PAGE_SIZE/8) - /* * MD's 'extended' device */ @@ -117,22 +111,7 @@ struct md_rdev { struct kernfs_node *sysfs_state; /* handle for 'state' * sysfs entry */ - struct badblocks { - int count; /* count of bad blocks */ - int unacked_exist; /* there probably are unacknowledged - * bad blocks. This is only cleared - * when a read discovers none - */ - int shift; /* shift from sectors to block size - * a -ve shift means badblocks are - * disabled.*/ - u64 *page; /* badblock list */ - int changed; - seqlock_t lock; - - sector_t sector; - sector_t size; /* in sectors */ - } badblocks; + struct badblocks badblocks; }; enum flag_bits { Faulty, /* device is known to have a fault */ @@ -185,22 +164,11 @@ enum flag_bits { */ }; -#define BB_LEN_MASK (0x00000000000001FFULL) -#define BB_OFFSET_MASK (0x7FFFFFFFFFFFFE00ULL) -#define BB_ACK_MASK (0x8000000000000000ULL) -#define BB_MAX_LEN 512 -#define BB_OFFSET(x) (((x) & BB_OFFSET_MASK) >> 9) -#define BB_LEN(x) (((x) & BB_LEN_MASK) + 1) -#define BB_ACK(x) (!!((x) & BB_ACK_MASK)) -#define BB_MAKE(a, l, ack) (((a)<<9) | ((l)-1) | ((u64)(!!(ack)) << 63)) - -extern int md_is_badblock(struct badblocks *bb, sector_t s, int sectors, - sector_t *first_bad, int *bad_sectors); static inline int is_badblock(struct md_rdev *rdev, sector_t s, int sectors, sector_t *first_bad, int *bad_sectors) { if (unlikely(rdev->badblocks.count)) { - int rv = md_is_badblock(&rdev->badblocks, rdev->data_offset + s, + int rv = badblocks_check(&rdev->badblocks, rdev->data_offset + s, sectors, first_bad, bad_sectors); if (rv) @@ -213,8 +181,6 @@ extern int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors, int is_new); extern int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors, int is_new); -extern void md_ack_all_badblocks(struct badblocks *bb); - struct md_cluster_info; struct mddev { -- cgit v1.2.3 From 0caeef63e6d2f866d85bb507bf63e0ce8ec91cef Mon Sep 17 00:00:00 2001 From: Vishal Verma Date: Thu, 24 Dec 2015 19:21:43 -0700 Subject: libnvdimm: Add a poison list and export badblocks During region creation, perform Address Range Scrubs (ARS) for the SPA (System Physical Address) ranges to retrieve known poison locations from firmware. Add a new data structure 'nd_poison' which is used as a list in nvdimm_bus to store these poison locations. 
When creating a pmem namespace, if there is any known poison associated with its physical address space, convert the poison ranges to bad sectors that are exposed using the badblocks interface. Signed-off-by: Vishal Verma Signed-off-by: Dan Williams --- drivers/acpi/nfit.c | 203 ++++++++++++++++++++++++++++++++++++++++++++++ drivers/nvdimm/core.c | 187 ++++++++++++++++++++++++++++++++++++++++++ drivers/nvdimm/nd-core.h | 3 + drivers/nvdimm/nd.h | 6 ++ drivers/nvdimm/pmem.c | 6 ++ include/linux/libnvdimm.h | 1 + 6 files changed, 406 insertions(+) (limited to 'drivers') diff --git a/drivers/acpi/nfit.c b/drivers/acpi/nfit.c index e7ed39bab97d..e1dbc8da09b7 100644 --- a/drivers/acpi/nfit.c +++ b/drivers/acpi/nfit.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -1473,6 +1474,201 @@ static void acpi_nfit_blk_region_disable(struct nvdimm_bus *nvdimm_bus, /* devm will free nfit_blk */ } +static int ars_get_cap(struct nvdimm_bus_descriptor *nd_desc, + struct nd_cmd_ars_cap *cmd, u64 addr, u64 length) +{ + cmd->address = addr; + cmd->length = length; + + return nd_desc->ndctl(nd_desc, NULL, ND_CMD_ARS_CAP, cmd, + sizeof(*cmd)); +} + +static int ars_do_start(struct nvdimm_bus_descriptor *nd_desc, + struct nd_cmd_ars_start *cmd, u64 addr, u64 length) +{ + int rc; + + cmd->address = addr; + cmd->length = length; + cmd->type = ND_ARS_PERSISTENT; + + while (1) { + rc = nd_desc->ndctl(nd_desc, NULL, ND_CMD_ARS_START, cmd, + sizeof(*cmd)); + if (rc) + return rc; + switch (cmd->status) { + case 0: + return 0; + case 1: + /* ARS unsupported, but we should never get here */ + return 0; + case 2: + return -EINVAL; + case 3: + /* ARS is in progress */ + msleep(1000); + break; + default: + return -ENXIO; + } + } +} + +static int ars_get_status(struct nvdimm_bus_descriptor *nd_desc, + struct nd_cmd_ars_status *cmd) +{ + int rc; + + while (1) { + rc = nd_desc->ndctl(nd_desc, NULL, ND_CMD_ARS_STATUS, cmd, + sizeof(*cmd)); + if (rc || cmd->status & 0xffff) + return -ENXIO; + + /* Check extended status (Upper two bytes) */ + switch (cmd->status >> 16) { + case 0: + return 0; + case 1: + /* ARS is in progress */ + msleep(1000); + break; + case 2: + /* No ARS performed for the current boot */ + return 0; + default: + return -ENXIO; + } + } +} + +static int ars_status_process_records(struct nvdimm_bus *nvdimm_bus, + struct nd_cmd_ars_status *ars_status, u64 start) +{ + int rc; + u32 i; + + /* + * The address field returned by ars_status should be either + * less than or equal to the address we last started ARS for. + * The (start, length) returned by ars_status should also have + * non-zero overlap with the range we started ARS for. + * If this is not the case, bail. 
+ */ + if (ars_status->address > start || + (ars_status->address + ars_status->length < start)) + return -ENXIO; + + for (i = 0; i < ars_status->num_records; i++) { + rc = nvdimm_bus_add_poison(nvdimm_bus, + ars_status->records[i].err_address, + ars_status->records[i].length); + if (rc) + return rc; + } + + return 0; +} + +static int acpi_nfit_find_poison(struct acpi_nfit_desc *acpi_desc, + struct nd_region_desc *ndr_desc) +{ + struct nvdimm_bus_descriptor *nd_desc = &acpi_desc->nd_desc; + struct nvdimm_bus *nvdimm_bus = acpi_desc->nvdimm_bus; + struct nd_cmd_ars_status *ars_status = NULL; + struct nd_cmd_ars_start *ars_start = NULL; + struct nd_cmd_ars_cap *ars_cap = NULL; + u64 start, len, cur, remaining; + int rc; + + ars_cap = kzalloc(sizeof(*ars_cap), GFP_KERNEL); + if (!ars_cap) + return -ENOMEM; + + start = ndr_desc->res->start; + len = ndr_desc->res->end - ndr_desc->res->start + 1; + + rc = ars_get_cap(nd_desc, ars_cap, start, len); + if (rc) + goto out; + + /* + * If ARS is unsupported, or if the 'Persistent Memory Scrub' flag in + * extended status is not set, skip this but continue initialization + */ + if ((ars_cap->status & 0xffff) || + !(ars_cap->status >> 16 & ND_ARS_PERSISTENT)) { + dev_warn(acpi_desc->dev, + "ARS unsupported (status: 0x%x), won't create an error list\n", + ars_cap->status); + goto out; + } + + /* + * Check if a full-range ARS has been run. If so, use those results + * without having to start a new ARS. + */ + ars_status = kzalloc(ars_cap->max_ars_out + sizeof(*ars_status), + GFP_KERNEL); + if (!ars_status) { + rc = -ENOMEM; + goto out; + } + + rc = ars_get_status(nd_desc, ars_status); + if (rc) + goto out; + + if (ars_status->address <= start && + (ars_status->address + ars_status->length >= start + len)) { + rc = ars_status_process_records(nvdimm_bus, ars_status, start); + goto out; + } + + /* + * ARS_STATUS can overflow if the number of poison entries found is + * greater than the maximum buffer size (ars_cap->max_ars_out) + * To detect overflow, check if the length field of ars_status + * is less than the length we supplied. 
If so, process the + * error entries we got, adjust the start point, and start again + */ + ars_start = kzalloc(sizeof(*ars_start), GFP_KERNEL); + if (!ars_start) + return -ENOMEM; + + cur = start; + remaining = len; + do { + u64 done, end; + + rc = ars_do_start(nd_desc, ars_start, cur, remaining); + if (rc) + goto out; + + rc = ars_get_status(nd_desc, ars_status); + if (rc) + goto out; + + rc = ars_status_process_records(nvdimm_bus, ars_status, cur); + if (rc) + goto out; + + end = min(cur + remaining, + ars_status->address + ars_status->length); + done = end - cur; + cur += done; + remaining -= done; + } while (remaining); + + out: + kfree(ars_cap); + kfree(ars_start); + kfree(ars_status); + return rc; +} + static int acpi_nfit_init_mapping(struct acpi_nfit_desc *acpi_desc, struct nd_mapping *nd_mapping, struct nd_region_desc *ndr_desc, struct acpi_nfit_memory_map *memdev, @@ -1585,6 +1781,13 @@ static int acpi_nfit_register_region(struct acpi_nfit_desc *acpi_desc, nvdimm_bus = acpi_desc->nvdimm_bus; if (nfit_spa_type(spa) == NFIT_SPA_PM) { + rc = acpi_nfit_find_poison(acpi_desc, ndr_desc); + if (rc) { + dev_err(acpi_desc->dev, + "error while performing ARS to find poison: %d\n", + rc); + return rc; + } if (!nvdimm_pmem_region_create(nvdimm_bus, ndr_desc)) return -ENOMEM; } else if (nfit_spa_type(spa) == NFIT_SPA_VOLATILE) { diff --git a/drivers/nvdimm/core.c b/drivers/nvdimm/core.c index 82c49bb87055..21003b7f0b38 100644 --- a/drivers/nvdimm/core.c +++ b/drivers/nvdimm/core.c @@ -325,6 +325,7 @@ struct nvdimm_bus *__nvdimm_bus_register(struct device *parent, if (!nvdimm_bus) return NULL; INIT_LIST_HEAD(&nvdimm_bus->list); + INIT_LIST_HEAD(&nvdimm_bus->poison_list); init_waitqueue_head(&nvdimm_bus->probe_wait); nvdimm_bus->id = ida_simple_get(&nd_ida, 0, 0, GFP_KERNEL); mutex_init(&nvdimm_bus->reconfig_mutex); @@ -359,6 +360,191 @@ struct nvdimm_bus *__nvdimm_bus_register(struct device *parent, } EXPORT_SYMBOL_GPL(__nvdimm_bus_register); +/** + * __add_badblock_range() - Convert a physical address range to bad sectors + * @disk: the disk associated with the namespace + * @ns_offset: namespace offset where the error range begins (in bytes) + * @len: number of bytes of poison to be added + * + * This assumes that the range provided with (ns_offset, len) is within + * the bounds of physical addresses for this namespace, i.e. 
lies in the + * interval [ns_start, ns_start + ns_size) + */ +static int __add_badblock_range(struct gendisk *disk, u64 ns_offset, u64 len) +{ + unsigned int sector_size = queue_logical_block_size(disk->queue); + sector_t start_sector; + u64 num_sectors; + u32 rem; + int rc; + + start_sector = div_u64(ns_offset, sector_size); + num_sectors = div_u64_rem(len, sector_size, &rem); + if (rem) + num_sectors++; + + if (!disk->bb) { + rc = disk_alloc_badblocks(disk); + if (rc) + return rc; + } + + if (unlikely(num_sectors > (u64)INT_MAX)) { + u64 remaining = num_sectors; + sector_t s = start_sector; + + while (remaining) { + int done = min_t(u64, remaining, INT_MAX); + + rc = disk_set_badblocks(disk, s, done); + if (rc) + return rc; + remaining -= done; + s += done; + } + return 0; + } else + return disk_set_badblocks(disk, start_sector, num_sectors); +} + +/** + * nvdimm_namespace_add_poison() - Convert a list of poison ranges to badblocks + * @disk: the gendisk associated with the namespace where badblocks + * will be stored + * @offset: offset at the start of the namespace before 'sector 0' + * @ndns: the namespace containing poison ranges + * + * The poison list generated during NFIT initialization may contain multiple, + * possibly overlapping ranges in the SPA (System Physical Address) space. + * Compare each of these ranges to the namespace currently being initialized, + * and add badblocks to the gendisk for all matching sub-ranges + * + * Return: + * 0 - Success + */ +int nvdimm_namespace_add_poison(struct gendisk *disk, resource_size_t offset, + struct nd_namespace_common *ndns) +{ + struct nd_namespace_io *nsio = to_nd_namespace_io(&ndns->dev); + struct nd_region *nd_region = to_nd_region(ndns->dev.parent); + struct nvdimm_bus *nvdimm_bus; + struct list_head *poison_list; + u64 ns_start, ns_end, ns_size; + struct nd_poison *pl; + int rc; + + ns_size = nvdimm_namespace_capacity(ndns) - offset; + ns_start = nsio->res.start + offset; + ns_end = nsio->res.end; + + nvdimm_bus = to_nvdimm_bus(nd_region->dev.parent); + poison_list = &nvdimm_bus->poison_list; + if (list_empty(poison_list)) + return 0; + + list_for_each_entry(pl, poison_list, list) { + u64 pl_end = pl->start + pl->length - 1; + + /* Discard intervals with no intersection */ + if (pl_end < ns_start) + continue; + if (pl->start > ns_end) + continue; + /* Deal with any overlap after start of the namespace */ + if (pl->start >= ns_start) { + u64 start = pl->start; + u64 len; + + if (pl_end <= ns_end) + len = pl->length; + else + len = ns_start + ns_size - pl->start; + + rc = __add_badblock_range(disk, start - ns_start, len); + if (rc) + return rc; + dev_info(&nvdimm_bus->dev, + "Found a poison range (0x%llx, 0x%llx)\n", + start, len); + continue; + } + /* Deal with overlap for poison starting before the namespace */ + if (pl->start < ns_start) { + u64 len; + + if (pl_end < ns_end) + len = pl->start + pl->length - ns_start; + else + len = ns_size; + + rc = __add_badblock_range(disk, 0, len); + if (rc) + return rc; + dev_info(&nvdimm_bus->dev, + "Found a poison range (0x%llx, 0x%llx)\n", + pl->start, len); + } + } + + return 0; +} +EXPORT_SYMBOL_GPL(nvdimm_namespace_add_poison); + +static int __add_poison(struct nvdimm_bus *nvdimm_bus, u64 addr, u64 length) +{ + struct nd_poison *pl; + + pl = kzalloc(sizeof(*pl), GFP_KERNEL); + if (!pl) + return -ENOMEM; + + pl->start = addr; + pl->length = length; + list_add_tail(&pl->list, &nvdimm_bus->poison_list); + + return 0; +} + +int nvdimm_bus_add_poison(struct nvdimm_bus *nvdimm_bus, u64 
addr, u64 length) +{ + struct nd_poison *pl; + + if (list_empty(&nvdimm_bus->poison_list)) + return __add_poison(nvdimm_bus, addr, length); + + /* + * There is a chance this is a duplicate, check for those first. + * This will be the common case as ARS_STATUS returns all known + * errors in the SPA space, and we can't query it per region + */ + list_for_each_entry(pl, &nvdimm_bus->poison_list, list) + if (pl->start == addr) { + /* If length has changed, update this list entry */ + if (pl->length != length) + pl->length = length; + return 0; + } + + /* + * If not a duplicate or a simple length update, add the entry as is, + * as any overlapping ranges will get resolved when the list is consumed + * and converted to badblocks + */ + return __add_poison(nvdimm_bus, addr, length); +} +EXPORT_SYMBOL_GPL(nvdimm_bus_add_poison); + +static void free_poison_list(struct list_head *poison_list) +{ + struct nd_poison *pl, *next; + + list_for_each_entry_safe(pl, next, poison_list, list) { + list_del(&pl->list); + kfree(pl); + } + list_del_init(poison_list); +} + static int child_unregister(struct device *dev, void *data) { /* @@ -385,6 +571,7 @@ void nvdimm_bus_unregister(struct nvdimm_bus *nvdimm_bus) nd_synchronize(); device_for_each_child(&nvdimm_bus->dev, NULL, child_unregister); + free_poison_list(&nvdimm_bus->poison_list); nvdimm_bus_destroy_ndctl(nvdimm_bus); device_unregister(&nvdimm_bus->dev); diff --git a/drivers/nvdimm/nd-core.h b/drivers/nvdimm/nd-core.h index 159aed532042..d3b7ea78df96 100644 --- a/drivers/nvdimm/nd-core.h +++ b/drivers/nvdimm/nd-core.h @@ -30,6 +30,7 @@ struct nvdimm_bus { struct list_head list; struct device dev; int id, probe_active; + struct list_head poison_list; struct mutex reconfig_mutex; }; @@ -89,4 +90,6 @@ bool __nd_attach_ndns(struct device *dev, struct nd_namespace_common *attach, ssize_t nd_namespace_store(struct device *dev, struct nd_namespace_common **_ndns, const char *buf, size_t len); +int nvdimm_namespace_add_poison(struct gendisk *disk, resource_size_t offset, + struct nd_namespace_common *ndns); #endif /* __ND_CORE_H__ */ diff --git a/drivers/nvdimm/nd.h b/drivers/nvdimm/nd.h index 417e521d299c..ba91fcd5818d 100644 --- a/drivers/nvdimm/nd.h +++ b/drivers/nvdimm/nd.h @@ -38,6 +38,12 @@ enum { #endif }; +struct nd_poison { + u64 start; + u64 length; + struct list_head list; +}; + struct nvdimm_drvdata { struct device *dev; int nsindex_size; diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index 8ee79893d2f5..5b95043443a3 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -27,6 +27,7 @@ #include #include #include +#include "nd-core.h" #include "pfn.h" #include "nd.h" @@ -168,6 +169,7 @@ static int pmem_attach_disk(struct device *dev, { int nid = dev_to_node(dev); struct gendisk *disk; + int ret; pmem->pmem_queue = blk_alloc_queue_node(GFP_KERNEL, nid); if (!pmem->pmem_queue) @@ -196,6 +198,10 @@ static int pmem_attach_disk(struct device *dev, set_capacity(disk, (pmem->size - pmem->data_offset) / 512); pmem->pmem_disk = disk; + ret = nvdimm_namespace_add_poison(disk, pmem->data_offset, ndns); + if (ret) + return ret; + add_disk(disk); revalidate_disk(disk); diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h index 3f021dc5da8c..bed40dff0e86 100644 --- a/include/linux/libnvdimm.h +++ b/include/linux/libnvdimm.h @@ -116,6 +116,7 @@ static inline struct nd_blk_region_desc *to_blk_region_desc( } +int nvdimm_bus_add_poison(struct nvdimm_bus *nvdimm_bus, u64 addr, u64 length); struct nvdimm_bus 
*__nvdimm_bus_register(struct device *parent, struct nvdimm_bus_descriptor *nfit_desc, struct module *module); #define nvdimm_bus_register(parent, desc) \ -- cgit v1.2.3 From ad9a8bde2cb19f6876f964fc48acc8b6a2f325ff Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 6 Jan 2016 12:03:41 -0800 Subject: libnvdimm, pmem: move definition of nvdimm_namespace_add_poison to nd.h nd-core.h is private to the libnvdimm core internals and should not be used by drivers. Signed-off-by: Dan Williams --- drivers/nvdimm/nd-core.h | 2 -- drivers/nvdimm/nd.h | 2 ++ drivers/nvdimm/pmem.c | 1 - 3 files changed, 2 insertions(+), 3 deletions(-) (limited to 'drivers') diff --git a/drivers/nvdimm/nd-core.h b/drivers/nvdimm/nd-core.h index d3b7ea78df96..29acdaa757e2 100644 --- a/drivers/nvdimm/nd-core.h +++ b/drivers/nvdimm/nd-core.h @@ -90,6 +90,4 @@ bool __nd_attach_ndns(struct device *dev, struct nd_namespace_common *attach, ssize_t nd_namespace_store(struct device *dev, struct nd_namespace_common **_ndns, const char *buf, size_t len); -int nvdimm_namespace_add_poison(struct gendisk *disk, resource_size_t offset, - struct nd_namespace_common *ndns); #endif /* __ND_CORE_H__ */ diff --git a/drivers/nvdimm/nd.h b/drivers/nvdimm/nd.h index ba91fcd5818d..198933da83e5 100644 --- a/drivers/nvdimm/nd.h +++ b/drivers/nvdimm/nd.h @@ -268,6 +268,8 @@ int nvdimm_namespace_attach_btt(struct nd_namespace_common *ndns); int nvdimm_namespace_detach_btt(struct nd_namespace_common *ndns); const char *nvdimm_namespace_disk_name(struct nd_namespace_common *ndns, char *name); +int nvdimm_namespace_add_poison(struct gendisk *disk, resource_size_t offset, + struct nd_namespace_common *ndns); int nd_blk_region_init(struct nd_region *nd_region); void __nd_iostat_start(struct bio *bio, unsigned long *start); static inline bool nd_iostat_start(struct bio *bio, unsigned long *start) diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index 5b95043443a3..65b2056e7540 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -27,7 +27,6 @@ #include #include #include -#include "nd-core.h" #include "pfn.h" #include "nd.h" -- cgit v1.2.3 From d3b407fb3f782bd915db64e266010ea30a2d381e Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 6 Jan 2016 12:19:22 -0800 Subject: badblocks: rename badblocks_free to badblocks_exit For symmetry with badblocks_init() make it clear that this path only destroys incremental allocations of a badblocks instance, and does not free the badblocks instance itself. 
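For illustration, a minimal sketch of the intended pairing with an embedded instance follows; the foo_device wrapper and its probe/remove callbacks are hypothetical, and only badblocks_init()/badblocks_exit() are the APIs touched by this patch:

#include <linux/badblocks.h>

/* Hypothetical driver-private structure embedding a badblocks instance. */
struct foo_device {
	struct badblocks bb;
};

static int foo_probe(struct foo_device *foo)
{
	/* Allocates only the internal page used to record bad ranges. */
	return badblocks_init(&foo->bb, 1);
}

static void foo_remove(struct foo_device *foo)
{
	/*
	 * Releases what badblocks_init() allocated; the embedded instance
	 * (and foo_device itself) remains owned by the caller, hence
	 * "exit" rather than "free".
	 */
	badblocks_exit(&foo->bb);
}
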
Signed-off-by: Dan Williams --- block/badblocks.c | 6 +++--- block/genhd.c | 2 +- drivers/md/md.c | 2 +- include/linux/badblocks.h | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) (limited to 'drivers') diff --git a/block/badblocks.c b/block/badblocks.c index 96aeb9194a2e..fabf6b64c2d1 100644 --- a/block/badblocks.c +++ b/block/badblocks.c @@ -550,12 +550,12 @@ int badblocks_init(struct badblocks *bb, int enable) EXPORT_SYMBOL_GPL(badblocks_init); /** - * badblocks_free() - free the badblocks structure + * badblocks_exit() - free the badblocks structure * @bb: the badblocks structure that holds all badblock information */ -void badblocks_free(struct badblocks *bb) +void badblocks_exit(struct badblocks *bb) { kfree(bb->page); bb->page = NULL; } -EXPORT_SYMBOL_GPL(badblocks_free); +EXPORT_SYMBOL_GPL(badblocks_exit); diff --git a/block/genhd.c b/block/genhd.c index 88579cf373b8..f463c67e6ba2 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -671,7 +671,7 @@ void del_gendisk(struct gendisk *disk) blk_unregister_region(disk_devt(disk), disk->minors); if (disk->bb) { - badblocks_free(disk->bb); + badblocks_exit(disk->bb); kfree(disk->bb); } diff --git a/drivers/md/md.c b/drivers/md/md.c index 1e48aa9de352..96a991821ae6 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -710,7 +710,7 @@ void md_rdev_clear(struct md_rdev *rdev) put_page(rdev->bb_page); rdev->bb_page = NULL; } - badblocks_free(&rdev->badblocks); + badblocks_exit(&rdev->badblocks); } EXPORT_SYMBOL_GPL(md_rdev_clear); diff --git a/include/linux/badblocks.h b/include/linux/badblocks.h index 929344630b51..2d98c026c57f 100644 --- a/include/linux/badblocks.h +++ b/include/linux/badblocks.h @@ -48,6 +48,6 @@ ssize_t badblocks_show(struct badblocks *bb, char *page, int unack); ssize_t badblocks_store(struct badblocks *bb, const char *page, size_t len, int unack); int badblocks_init(struct badblocks *bb, int enable); -void badblocks_free(struct badblocks *bb); +void badblocks_exit(struct badblocks *bb); #endif -- cgit v1.2.3 From 87ba05dff3510f9e058b35d3c3fa222b6f406ecc Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Sat, 9 Jan 2016 07:48:43 -0800 Subject: libnvdimm: don't fail init for full badblocks list If the badblocks list runs out of space it simply means that software is unable to intercept all errors. This is no different than the latent discovery of new badblocks case and should not be an initialization failure condition. 
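The resulting policy can be summarized in a short sketch; it mirrors the set_badblock() helper added in the hunk below, written here against badblocks_set() (the API the series converts to shortly after), with a purely illustrative device pointer and log message:

#include <linux/badblocks.h>
#include <linux/device.h>

/* Record a poison range, treating a full badblocks table as a soft condition. */
static void record_poison(struct device *dev, struct badblocks *bb,
		sector_t s, int num)
{
	/*
	 * A non-zero return means the range could not be recorded, typically
	 * because the fixed, single-page table of entries is already full.
	 */
	if (badblocks_set(bb, s, num, 1))
		dev_info_once(dev, "badblocks full, not all media errors are tracked\n");
	/*
	 * Initialization proceeds either way: an unrecorded bad block is no
	 * worse than one discovered later at runtime, and the hardware still
	 * raises an exception when the poisoned address is accessed.
	 */
}
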
Signed-off-by: Dan Williams --- drivers/nvdimm/core.c | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) (limited to 'drivers') diff --git a/drivers/nvdimm/core.c b/drivers/nvdimm/core.c index 21003b7f0b38..e419d661e294 100644 --- a/drivers/nvdimm/core.c +++ b/drivers/nvdimm/core.c @@ -360,6 +360,18 @@ struct nvdimm_bus *__nvdimm_bus_register(struct device *parent, } EXPORT_SYMBOL_GPL(__nvdimm_bus_register); +static void set_badblock(struct gendisk *disk, sector_t s, int num) +{ + struct device *dev = disk->driverfs_dev; + + dev_dbg(dev, "Found a poison range (0x%llx, 0x%llx)\n", + (u64) s * 512, (u64) num * 512); + /* this isn't an error as the hardware will still throw an exception */ + if (disk_set_badblocks(disk, s, num)) + dev_info_once(dev, "%s: failed for sector %llx\n", + __func__, (u64) s); +} + /** * __add_badblock_range() - Convert a physical address range to bad sectors * @disk: the disk associated with the namespace @@ -396,15 +408,14 @@ static int __add_badblock_range(struct gendisk *disk, u64 ns_offset, u64 len) while (remaining) { int done = min_t(u64, remaining, INT_MAX); - rc = disk_set_badblocks(disk, s, done); - if (rc) - return rc; + set_badblock(disk, s, done); remaining -= done; s += done; } - return 0; } else - return disk_set_badblocks(disk, start_sector, num_sectors); + set_badblock(disk, start_sector, num_sectors); + + return 0; } /** @@ -463,9 +474,6 @@ int nvdimm_namespace_add_poison(struct gendisk *disk, resource_size_t offset, rc = __add_badblock_range(disk, start - ns_start, len); if (rc) return rc; - dev_info(&nvdimm_bus->dev, - "Found a poison range (0x%llx, 0x%llx)\n", - start, len); continue; } /* Deal with overlap for poison starting before the namespace */ @@ -480,9 +488,6 @@ int nvdimm_namespace_add_poison(struct gendisk *disk, resource_size_t offset, rc = __add_badblock_range(disk, 0, len); if (rc) return rc; - dev_info(&nvdimm_bus->dev, - "Found a poison range (0x%llx, 0x%llx)\n", - pl->start, len); } } -- cgit v1.2.3 From b95f5f4391fad65f1819c2404080b05ca95bdd92 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Mon, 4 Jan 2016 23:50:23 -0800 Subject: libnvdimm: convert to statically allocated badblocks If a device will ever have badblocks it should always have a badblocks instance available. So, similar to md, embed a badblocks instance in pmem_device. This reduces pointer chasing in the i/o fast path, and simplifies the init path. Reported-by: Vishal Verma Signed-off-by: Dan Williams --- drivers/nvdimm/core.c | 57 +++++++++++++++------------------------------------ drivers/nvdimm/nd.h | 4 ++-- drivers/nvdimm/pmem.c | 10 ++++----- 3 files changed, 24 insertions(+), 47 deletions(-) (limited to 'drivers') diff --git a/drivers/nvdimm/core.c b/drivers/nvdimm/core.c index e419d661e294..2e2832b83c93 100644 --- a/drivers/nvdimm/core.c +++ b/drivers/nvdimm/core.c @@ -11,6 +11,7 @@ * General Public License for more details. 
*/ #include +#include #include #include #include @@ -360,21 +361,19 @@ struct nvdimm_bus *__nvdimm_bus_register(struct device *parent, } EXPORT_SYMBOL_GPL(__nvdimm_bus_register); -static void set_badblock(struct gendisk *disk, sector_t s, int num) +static void set_badblock(struct badblocks *bb, sector_t s, int num) { - struct device *dev = disk->driverfs_dev; - - dev_dbg(dev, "Found a poison range (0x%llx, 0x%llx)\n", + dev_dbg(bb->dev, "Found a poison range (0x%llx, 0x%llx)\n", (u64) s * 512, (u64) num * 512); /* this isn't an error as the hardware will still throw an exception */ - if (disk_set_badblocks(disk, s, num)) - dev_info_once(dev, "%s: failed for sector %llx\n", + if (badblocks_set(bb, s, num, 1)) + dev_info_once(bb->dev, "%s: failed for sector %llx\n", __func__, (u64) s); } /** * __add_badblock_range() - Convert a physical address range to bad sectors - * @disk: the disk associated with the namespace + * @bb: badblocks instance to populate * @ns_offset: namespace offset where the error range begins (in bytes) * @len: number of bytes of poison to be added * @@ -382,25 +381,18 @@ static void set_badblock(struct gendisk *disk, sector_t s, int num) * the bounds of physical addresses for this namespace, i.e. lies in the * interval [ns_start, ns_start + ns_size) */ -static int __add_badblock_range(struct gendisk *disk, u64 ns_offset, u64 len) +static void __add_badblock_range(struct badblocks *bb, u64 ns_offset, u64 len) { - unsigned int sector_size = queue_logical_block_size(disk->queue); + const unsigned int sector_size = 512; sector_t start_sector; u64 num_sectors; u32 rem; - int rc; start_sector = div_u64(ns_offset, sector_size); num_sectors = div_u64_rem(len, sector_size, &rem); if (rem) num_sectors++; - if (!disk->bb) { - rc = disk_alloc_badblocks(disk); - if (rc) - return rc; - } - if (unlikely(num_sectors > (u64)INT_MAX)) { u64 remaining = num_sectors; sector_t s = start_sector; @@ -408,33 +400,27 @@ static int __add_badblock_range(struct gendisk *disk, u64 ns_offset, u64 len) while (remaining) { int done = min_t(u64, remaining, INT_MAX); - set_badblock(disk, s, done); + set_badblock(bb, s, done); remaining -= done; s += done; } } else - set_badblock(disk, start_sector, num_sectors); - - return 0; + set_badblock(bb, start_sector, num_sectors); } /** * nvdimm_namespace_add_poison() - Convert a list of poison ranges to badblocks - * @disk: the gendisk associated with the namespace where badblocks - * will be stored - * @offset: offset at the start of the namespace before 'sector 0' * @ndns: the namespace containing poison ranges + * @bb: badblocks instance to populate + * @offset: offset at the start of the namespace before 'sector 0' * * The poison list generated during NFIT initialization may contain multiple, * possibly overlapping ranges in the SPA (System Physical Address) space. 
* Compare each of these ranges to the namespace currently being initialized, * and add badblocks to the gendisk for all matching sub-ranges - * - * Return: - * 0 - Success */ -int nvdimm_namespace_add_poison(struct gendisk *disk, resource_size_t offset, - struct nd_namespace_common *ndns) +void nvdimm_namespace_add_poison(struct nd_namespace_common *ndns, + struct badblocks *bb, resource_size_t offset) { struct nd_namespace_io *nsio = to_nd_namespace_io(&ndns->dev); struct nd_region *nd_region = to_nd_region(ndns->dev.parent); @@ -442,7 +428,6 @@ int nvdimm_namespace_add_poison(struct gendisk *disk, resource_size_t offset, struct list_head *poison_list; u64 ns_start, ns_end, ns_size; struct nd_poison *pl; - int rc; ns_size = nvdimm_namespace_capacity(ndns) - offset; ns_start = nsio->res.start + offset; @@ -451,7 +436,7 @@ int nvdimm_namespace_add_poison(struct gendisk *disk, resource_size_t offset, nvdimm_bus = to_nvdimm_bus(nd_region->dev.parent); poison_list = &nvdimm_bus->poison_list; if (list_empty(poison_list)) - return 0; + return; list_for_each_entry(pl, poison_list, list) { u64 pl_end = pl->start + pl->length - 1; @@ -470,10 +455,7 @@ int nvdimm_namespace_add_poison(struct gendisk *disk, resource_size_t offset, len = pl->length; else len = ns_start + ns_size - pl->start; - - rc = __add_badblock_range(disk, start - ns_start, len); - if (rc) - return rc; + __add_badblock_range(bb, start - ns_start, len); continue; } /* Deal with overlap for poison starting before the namespace */ @@ -484,14 +466,9 @@ int nvdimm_namespace_add_poison(struct gendisk *disk, resource_size_t offset, len = pl->start + pl->length - ns_start; else len = ns_size; - - rc = __add_badblock_range(disk, 0, len); - if (rc) - return rc; + __add_badblock_range(bb, 0, len); } } - - return 0; } EXPORT_SYMBOL_GPL(nvdimm_namespace_add_poison); diff --git a/drivers/nvdimm/nd.h b/drivers/nvdimm/nd.h index 198933da83e5..288d96ec7233 100644 --- a/drivers/nvdimm/nd.h +++ b/drivers/nvdimm/nd.h @@ -268,8 +268,8 @@ int nvdimm_namespace_attach_btt(struct nd_namespace_common *ndns); int nvdimm_namespace_detach_btt(struct nd_namespace_common *ndns); const char *nvdimm_namespace_disk_name(struct nd_namespace_common *ndns, char *name); -int nvdimm_namespace_add_poison(struct gendisk *disk, resource_size_t offset, - struct nd_namespace_common *ndns); +void nvdimm_namespace_add_poison(struct nd_namespace_common *ndns, + struct badblocks *bb, resource_size_t offset); int nd_blk_region_init(struct nd_region *nd_region); void __nd_iostat_start(struct bio *bio, unsigned long *start); static inline bool nd_iostat_start(struct bio *bio, unsigned long *start) diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index 65b2056e7540..2b1f3009f827 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -41,6 +42,7 @@ struct pmem_device { phys_addr_t data_offset; void __pmem *virt_addr; size_t size; + struct badblocks bb; }; static int pmem_major; @@ -168,7 +170,6 @@ static int pmem_attach_disk(struct device *dev, { int nid = dev_to_node(dev); struct gendisk *disk; - int ret; pmem->pmem_queue = blk_alloc_queue_node(GFP_KERNEL, nid); if (!pmem->pmem_queue) @@ -196,10 +197,9 @@ static int pmem_attach_disk(struct device *dev, disk->driverfs_dev = dev; set_capacity(disk, (pmem->size - pmem->data_offset) / 512); pmem->pmem_disk = disk; - - ret = nvdimm_namespace_add_poison(disk, pmem->data_offset, ndns); - if (ret) - return ret; + if 
(devm_init_badblocks(dev, &pmem->bb)) + return -ENOMEM; + nvdimm_namespace_add_poison(ndns, &pmem->bb, pmem->data_offset); add_disk(disk); revalidate_disk(disk); -- cgit v1.2.3 From e10624f8c09710b3b0740ea3847627ea02f55c39 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 6 Jan 2016 12:03:41 -0800 Subject: pmem: fail io-requests to known bad blocks Check the sectors specified in a read bio to see if they hit a known bad block, and return an error code from pmem_do_bvec(). Note that the ->rw_page() path is not in a position to return errors. For now, copy the same layering violation present in zram_rw_page() to avoid crashes of the form: kernel BUG at mm/filemap.c:822! [..] Call Trace: [] page_endio+0x1e/0x60 [] mpage_end_io+0x39/0x60 [] bio_endio+0x3f/0x60 [] pmem_make_request+0x111/0x230 [nd_pmem] ...i.e. unlock a page that was already unlocked via pmem_rw_page() => page_endio(). Reported-by: Vishal Verma Signed-off-by: Dan Williams --- drivers/nvdimm/pmem.c | 46 +++++++++++++++++++++++++++++++++++++++------- 1 file changed, 39 insertions(+), 7 deletions(-) (limited to 'drivers') diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index 2b1f3009f827..d00c659d1304 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -47,7 +47,20 @@ struct pmem_device { static int pmem_major; -static void pmem_do_bvec(struct pmem_device *pmem, struct page *page, +static bool is_bad_pmem(struct badblocks *bb, sector_t sector, unsigned int len) +{ + if (bb->count) { + sector_t first_bad; + int num_bad; + + return !!badblocks_check(bb, sector, len / 512, &first_bad, + &num_bad); + } + + return false; +} + +static int pmem_do_bvec(struct pmem_device *pmem, struct page *page, unsigned int len, unsigned int off, int rw, sector_t sector) { @@ -56,6 +69,8 @@ static void pmem_do_bvec(struct pmem_device *pmem, struct page *page, void __pmem *pmem_addr = pmem->virt_addr + pmem_off; if (rw == READ) { + if (unlikely(is_bad_pmem(&pmem->bb, sector, len))) + return -EIO; memcpy_from_pmem(mem + off, pmem_addr, len); flush_dcache_page(page); } else { @@ -64,10 +79,12 @@ static void pmem_do_bvec(struct pmem_device *pmem, struct page *page, } kunmap_atomic(mem); + return 0; } static blk_qc_t pmem_make_request(struct request_queue *q, struct bio *bio) { + int rc = 0; bool do_acct; unsigned long start; struct bio_vec bvec; @@ -76,9 +93,15 @@ static blk_qc_t pmem_make_request(struct request_queue *q, struct bio *bio) struct pmem_device *pmem = bdev->bd_disk->private_data; do_acct = nd_iostat_start(bio, &start); - bio_for_each_segment(bvec, bio, iter) - pmem_do_bvec(pmem, bvec.bv_page, bvec.bv_len, bvec.bv_offset, - bio_data_dir(bio), iter.bi_sector); + bio_for_each_segment(bvec, bio, iter) { + rc = pmem_do_bvec(pmem, bvec.bv_page, bvec.bv_len, + bvec.bv_offset, bio_data_dir(bio), + iter.bi_sector); + if (rc) { + bio->bi_error = rc; + break; + } + } if (do_acct) nd_iostat_end(bio, start); @@ -93,13 +116,22 @@ static int pmem_rw_page(struct block_device *bdev, sector_t sector, struct page *page, int rw) { struct pmem_device *pmem = bdev->bd_disk->private_data; + int rc; - pmem_do_bvec(pmem, page, PAGE_CACHE_SIZE, 0, rw, sector); + rc = pmem_do_bvec(pmem, page, PAGE_CACHE_SIZE, 0, rw, sector); if (rw & WRITE) wmb_pmem(); - page_endio(page, rw & WRITE, 0); - return 0; + /* + * The ->rw_page interface is subtle and tricky. The core + * retries on any error, so we can only invoke page_endio() in + * the successful completion case. Otherwise, we'll see crashes + * caused by double completion. 
+ */ + if (rc == 0) + page_endio(page, rw & WRITE, 0); + + return rc; } static long pmem_direct_access(struct block_device *bdev, sector_t sector, -- cgit v1.2.3 From 57f7f317abdd07954cb116280c88d18378afb33e Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 6 Jan 2016 12:03:42 -0800 Subject: pmem, dax: disable dax in the presence of bad blocks Longer term teach dax to punch "error" holes in mapping requests and deliver SIGBUS to applications that consume a bad pmem page. For now, simply disable the dax performance optimization in the presence of known errors. Signed-off-by: Dan Williams --- block/ioctl.c | 10 ++++++++++ drivers/nvdimm/pmem.c | 1 + 2 files changed, 11 insertions(+) (limited to 'drivers') diff --git a/block/ioctl.c b/block/ioctl.c index 7a964d842913..2c84683aada5 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -4,6 +4,7 @@ #include #include #include +#include #include #include #include @@ -422,6 +423,15 @@ bool blkdev_dax_capable(struct block_device *bdev) || (bdev->bd_part->nr_sects % (PAGE_SIZE / 512))) return false; + /* + * If the device has known bad blocks, force all I/O through the + * driver / page cache. + * + * TODO: support finer grained dax error handling + */ + if (disk->bb && disk->bb->count) + return false; + return true; } diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index d00c659d1304..6a1832b3983c 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -233,6 +233,7 @@ static int pmem_attach_disk(struct device *dev, return -ENOMEM; nvdimm_namespace_add_poison(ndns, &pmem->bb, pmem->data_offset); + disk->bb = &pmem->bb; add_disk(disk); revalidate_disk(disk); -- cgit v1.2.3 From 710d69cc99507803ed91b4ec7368fbd66d59f014 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Mon, 4 Jan 2016 23:31:24 -0800 Subject: libnvdimm, pmem: nvdimm_read_bytes() badblocks support Support badblock checking in all the pmem read paths that do not go through the block layer. This protects info block reads (btt or pfn) as well as data reads to a pmem namespace via a btt instance. Signed-off-by: Dan Williams --- drivers/nvdimm/pmem.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'drivers') diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index 6a1832b3983c..a88762d0d086 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -229,6 +229,7 @@ static int pmem_attach_disk(struct device *dev, disk->driverfs_dev = dev; set_capacity(disk, (pmem->size - pmem->data_offset) / 512); pmem->pmem_disk = disk; + devm_exit_badblocks(dev, &pmem->bb); if (devm_init_badblocks(dev, &pmem->bb)) return -ENOMEM; nvdimm_namespace_add_poison(ndns, &pmem->bb, pmem->data_offset); @@ -250,9 +251,13 @@ static int pmem_rw_bytes(struct nd_namespace_common *ndns, return -EFAULT; } - if (rw == READ) + if (rw == READ) { + unsigned int sz_align = ALIGN(size + (offset & (512 - 1)), 512); + + if (unlikely(is_bad_pmem(&pmem->bb, offset / 512, sz_align))) + return -EIO; memcpy_from_pmem(buf, pmem->virt_addr + offset, size); - else { + } else { memcpy_to_pmem(pmem->virt_addr + offset, buf, size); wmb_pmem(); } @@ -427,6 +432,9 @@ static int nd_pmem_probe(struct device *dev) pmem->ndns = ndns; dev_set_drvdata(dev, pmem); ndns->rw_bytes = pmem_rw_bytes; + if (devm_init_badblocks(dev, &pmem->bb)) + return -ENOMEM; + nvdimm_namespace_add_poison(ndns, &pmem->bb, 0); if (is_nd_btt(dev)) return nvdimm_namespace_attach_btt(ndns); -- cgit v1.2.3
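To make the sector rounding in the pmem_rw_bytes() hunk above concrete, here is a small stand-alone sketch (plain userspace C, with purely illustrative offset and size values) of how a byte range is widened to the whole 512-byte sectors it touches before the badblocks lookup:

#include <stdio.h>
#include <stdint.h>

#define SECTOR_SIZE 512ULL
/* Round x up to the next multiple of a; a must be a power of two. */
#define ALIGN_UP(x, a) (((x) + (a) - 1) & ~((a) - 1))

int main(void)
{
	uint64_t offset = 1636;	/* illustrative byte offset of the access */
	uint64_t size = 100;	/* illustrative length in bytes */

	/* Same arithmetic as the sz_align computation in pmem_rw_bytes(). */
	uint64_t first_sector = offset / SECTOR_SIZE;
	uint64_t sz_align = ALIGN_UP(size + (offset & (SECTOR_SIZE - 1)),
				     SECTOR_SIZE);

	/*
	 * is_bad_pmem() divides the byte length by 512 again, so the range
	 * handed to badblocks_check() is
	 * [first_sector, first_sector + sz_align / 512).
	 */
	printf("bytes [%llu, %llu) -> sectors [%llu, %llu)\n",
	       (unsigned long long)offset,
	       (unsigned long long)(offset + size),
	       (unsigned long long)first_sector,
	       (unsigned long long)(first_sector + sz_align / SECTOR_SIZE));
	return 0;
}

Widening to sector granularity is what lets byte-granular accesses such as info-block reads share the same 512-byte badblocks bookkeeping used by the block I/O path.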