From 9f1e8cee7742cadbe6b97f2c80b787b4ee067bae Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Thu, 10 Dec 2015 15:14:20 -0800 Subject: libnvdimm, pfn: kill ND_PFN_ALIGN The alignment constraint isn't necessary now that devm_memremap_pages() allows for unaligned mappings. Signed-off-by: Dan Williams --- drivers/nvdimm/pmem.c | 15 --------------- 1 file changed, 15 deletions(-) (limited to 'drivers/nvdimm/pmem.c') diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index 8ee79893d2f5..520c00321dad 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -241,11 +241,6 @@ static int nd_pfn_init(struct nd_pfn *nd_pfn) if (rc == 0 || rc == -EBUSY) return rc; - /* section alignment for simple hotplug */ - if (nvdimm_namespace_capacity(ndns) < ND_PFN_ALIGN - || pmem->phys_addr & ND_PFN_MASK) - return -ENODEV; - nd_region = to_nd_region(nd_pfn->dev.parent); if (nd_region->ro) { dev_info(&nd_pfn->dev, @@ -326,16 +321,6 @@ static int nvdimm_namespace_attach_pfn(struct nd_namespace_common *ndns) if (rc) return rc; - if (PAGE_SIZE != SZ_4K) { - dev_err(dev, "only supported on systems with 4K PAGE_SIZE\n"); - return -ENXIO; - } - if (nsio->res.start & ND_PFN_MASK) { - dev_err(dev, "%s not memory hotplug section aligned\n", - dev_name(&ndns->dev)); - return -ENXIO; - } - pfn_sb = nd_pfn->pfn_sb; offset = le64_to_cpu(pfn_sb->dataoff); nd_pfn->mode = le32_to_cpu(nd_pfn->pfn_sb->mode); -- cgit v1.2.3 From 315c562536c42aa4da9b6c5a2135dd6715a5e0b5 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Thu, 10 Dec 2015 14:45:23 -0800 Subject: libnvdimm, pfn: add 'align' attribute, default to HPAGE_SIZE When setting aside capacity for struct page it must be aligned to the largest mapping size that is to be made available via DAX. Make the alignment configurable to enable support for 1GiB page-size mappings. The offset for PFN_MODE_RAM may now be larger than SZ_8K, so fixup the offset check in nvdimm_namespace_attach_pfn(). Reported-by: Toshi Kani Signed-off-by: Dan Williams --- drivers/nvdimm/nd.h | 1 + drivers/nvdimm/pfn_devs.c | 61 +++++++++++++++++++++++++++++++++++++++++++++++ drivers/nvdimm/pmem.c | 6 ++--- 3 files changed, 65 insertions(+), 3 deletions(-) (limited to 'drivers/nvdimm/pmem.c') diff --git a/drivers/nvdimm/nd.h b/drivers/nvdimm/nd.h index 2ce428e9b584..e4e9f9ae0cc8 100644 --- a/drivers/nvdimm/nd.h +++ b/drivers/nvdimm/nd.h @@ -146,6 +146,7 @@ struct nd_pfn { int id; u8 *uuid; struct device dev; + unsigned long align; unsigned long npfns; enum nd_pfn_mode mode; struct nd_pfn_sb *pfn_sb; diff --git a/drivers/nvdimm/pfn_devs.c b/drivers/nvdimm/pfn_devs.c index 613ffcca6ecb..95ecd7b0fab3 100644 --- a/drivers/nvdimm/pfn_devs.c +++ b/drivers/nvdimm/pfn_devs.c @@ -103,6 +103,52 @@ static ssize_t mode_store(struct device *dev, } static DEVICE_ATTR_RW(mode); +static ssize_t align_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nd_pfn *nd_pfn = to_nd_pfn(dev); + + return sprintf(buf, "%lx\n", nd_pfn->align); +} + +static ssize_t __align_store(struct nd_pfn *nd_pfn, const char *buf) +{ + unsigned long val; + int rc; + + rc = kstrtoul(buf, 0, &val); + if (rc) + return rc; + + if (!is_power_of_2(val) || val < PAGE_SIZE || val > SZ_1G) + return -EINVAL; + + if (nd_pfn->dev.driver) + return -EBUSY; + else + nd_pfn->align = val; + + return 0; +} + +static ssize_t align_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t len) +{ + struct nd_pfn *nd_pfn = to_nd_pfn(dev); + ssize_t rc; + + device_lock(dev); + nvdimm_bus_lock(dev); + rc = __align_store(nd_pfn, buf); + dev_dbg(dev, "%s: result: %zd wrote: %s%s", __func__, + rc, buf, buf[len - 1] == '\n' ? "" : "\n"); + nvdimm_bus_unlock(dev); + device_unlock(dev); + + return rc ? rc : len; +} +static DEVICE_ATTR_RW(align); + static ssize_t uuid_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -164,6 +210,7 @@ static struct attribute *nd_pfn_attributes[] = { &dev_attr_mode.attr, &dev_attr_namespace.attr, &dev_attr_uuid.attr, + &dev_attr_align.attr, NULL, }; @@ -199,6 +246,7 @@ static struct device *__nd_pfn_create(struct nd_region *nd_region, } nd_pfn->mode = PFN_MODE_NONE; + nd_pfn->align = HPAGE_SIZE; dev = &nd_pfn->dev; dev_set_name(dev, "pfn%d.%d", nd_region->id, nd_pfn->id); dev->parent = &nd_region->dev; @@ -269,6 +317,12 @@ int nd_pfn_validate(struct nd_pfn *nd_pfn) return -EINVAL; } + if (nd_pfn->align > nvdimm_namespace_capacity(ndns)) { + dev_err(&nd_pfn->dev, "alignment: %lx exceeds capacity %llx\n", + nd_pfn->align, nvdimm_namespace_capacity(ndns)); + return -EINVAL; + } + /* * These warnings are verbose because they can only trigger in * the case where the physical address alignment of the @@ -283,6 +337,13 @@ int nd_pfn_validate(struct nd_pfn *nd_pfn) return -EBUSY; } + nd_pfn->align = 1UL << ilog2(offset); + if (!is_power_of_2(offset) || offset < PAGE_SIZE) { + dev_err(&nd_pfn->dev, "bad offset: %#llx dax disabled\n", + offset); + return -ENXIO; + } + return 0; } EXPORT_SYMBOL(nd_pfn_validate); diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index 520c00321dad..5ba351e4f26a 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -258,9 +258,9 @@ static int nd_pfn_init(struct nd_pfn *nd_pfn) * ->direct_access() to those that are included in the memmap. */ if (nd_pfn->mode == PFN_MODE_PMEM) - offset = ALIGN(SZ_8K + 64 * npfns, PMD_SIZE); + offset = ALIGN(SZ_8K + 64 * npfns, nd_pfn->align); else if (nd_pfn->mode == PFN_MODE_RAM) - offset = SZ_8K; + offset = ALIGN(SZ_8K, nd_pfn->align); else goto err; @@ -325,7 +325,7 @@ static int nvdimm_namespace_attach_pfn(struct nd_namespace_common *ndns) offset = le64_to_cpu(pfn_sb->dataoff); nd_pfn->mode = le32_to_cpu(nd_pfn->pfn_sb->mode); if (nd_pfn->mode == PFN_MODE_RAM) { - if (offset != SZ_8K) + if (offset < SZ_8K) return -EINVAL; nd_pfn->npfns = le64_to_cpu(pfn_sb->npfns); altmap = NULL; -- cgit v1.2.3 From a34d5e8a6ad1a31b186019c9c351777626698863 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Sat, 12 Dec 2015 16:09:14 -0800 Subject: libnvdimm, pfn: add parent uuid validation Track and check the uuid of the namespace hosting a pfn instance. This forces the pfn info block to be invalidated if the namespace is re-configured with a different uuid. Signed-off-by: Dan Williams --- drivers/nvdimm/pfn_devs.c | 10 +++++++--- drivers/nvdimm/pmem.c | 1 + 2 files changed, 8 insertions(+), 3 deletions(-) (limited to 'drivers/nvdimm/pmem.c') diff --git a/drivers/nvdimm/pfn_devs.c b/drivers/nvdimm/pfn_devs.c index 95ecd7b0fab3..f9b674bc49db 100644 --- a/drivers/nvdimm/pfn_devs.c +++ b/drivers/nvdimm/pfn_devs.c @@ -273,10 +273,11 @@ struct device *nd_pfn_create(struct nd_region *nd_region) int nd_pfn_validate(struct nd_pfn *nd_pfn) { - struct nd_namespace_common *ndns = nd_pfn->ndns; - struct nd_pfn_sb *pfn_sb = nd_pfn->pfn_sb; - struct nd_namespace_io *nsio; u64 checksum, offset; + struct nd_namespace_io *nsio; + struct nd_pfn_sb *pfn_sb = nd_pfn->pfn_sb; + struct nd_namespace_common *ndns = nd_pfn->ndns; + const u8 *parent_uuid = nd_dev_to_uuid(&ndns->dev); if (!pfn_sb || !ndns) return -ENODEV; @@ -296,6 +297,9 @@ int nd_pfn_validate(struct nd_pfn *nd_pfn) return -ENODEV; pfn_sb->checksum = cpu_to_le64(checksum); + if (memcmp(pfn_sb->parent_uuid, parent_uuid, 16) != 0) + return -ENODEV; + switch (le32_to_cpu(pfn_sb->mode)) { case PFN_MODE_RAM: break; diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index 5ba351e4f26a..dc6866734f70 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -270,6 +270,7 @@ static int nd_pfn_init(struct nd_pfn *nd_pfn) pfn_sb->npfns = cpu_to_le64(npfns); memcpy(pfn_sb->signature, PFN_SIG, PFN_SIG_LEN); memcpy(pfn_sb->uuid, nd_pfn->uuid, 16); + memcpy(pfn_sb->parent_uuid, nd_dev_to_uuid(&ndns->dev), 16); pfn_sb->version_major = cpu_to_le16(1); checksum = nd_sb_checksum((struct nd_gen_sb *) pfn_sb); pfn_sb->checksum = cpu_to_le64(checksum); -- cgit v1.2.3 From 3fa962686568a1617d174e3d2f5d522e963077c5 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Sun, 13 Dec 2015 11:35:52 -0800 Subject: libnvdimm, pfn: fix nd_pfn_validate() return value handling The -ENODEV case indicates that the info-block needs to established. All other return codes cause nd_pfn_init() to abort. Signed-off-by: Dan Williams --- drivers/nvdimm/pmem.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'drivers/nvdimm/pmem.c') diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index dc6866734f70..ab689bca727d 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -238,7 +238,9 @@ static int nd_pfn_init(struct nd_pfn *nd_pfn) nd_pfn->pfn_sb = pfn_sb; rc = nd_pfn_validate(nd_pfn); - if (rc == 0 || rc == -EBUSY) + if (rc == -ENODEV) + /* no info block, do init */; + else return rc; nd_region = to_nd_region(nd_pfn->dev.parent); -- cgit v1.2.3 From 0caeef63e6d2f866d85bb507bf63e0ce8ec91cef Mon Sep 17 00:00:00 2001 From: Vishal Verma Date: Thu, 24 Dec 2015 19:21:43 -0700 Subject: libnvdimm: Add a poison list and export badblocks During region creation, perform Address Range Scrubs (ARS) for the SPA (System Physical Address) ranges to retrieve known poison locations from firmware. Add a new data structure 'nd_poison' which is used as a list in nvdimm_bus to store these poison locations. When creating a pmem namespace, if there is any known poison associated with its physical address space, convert the poison ranges to bad sectors that are exposed using the badblocks interface. Signed-off-by: Vishal Verma Signed-off-by: Dan Williams --- drivers/acpi/nfit.c | 203 ++++++++++++++++++++++++++++++++++++++++++++++ drivers/nvdimm/core.c | 187 ++++++++++++++++++++++++++++++++++++++++++ drivers/nvdimm/nd-core.h | 3 + drivers/nvdimm/nd.h | 6 ++ drivers/nvdimm/pmem.c | 6 ++ include/linux/libnvdimm.h | 1 + 6 files changed, 406 insertions(+) (limited to 'drivers/nvdimm/pmem.c') diff --git a/drivers/acpi/nfit.c b/drivers/acpi/nfit.c index e7ed39bab97d..e1dbc8da09b7 100644 --- a/drivers/acpi/nfit.c +++ b/drivers/acpi/nfit.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -1473,6 +1474,201 @@ static void acpi_nfit_blk_region_disable(struct nvdimm_bus *nvdimm_bus, /* devm will free nfit_blk */ } +static int ars_get_cap(struct nvdimm_bus_descriptor *nd_desc, + struct nd_cmd_ars_cap *cmd, u64 addr, u64 length) +{ + cmd->address = addr; + cmd->length = length; + + return nd_desc->ndctl(nd_desc, NULL, ND_CMD_ARS_CAP, cmd, + sizeof(*cmd)); +} + +static int ars_do_start(struct nvdimm_bus_descriptor *nd_desc, + struct nd_cmd_ars_start *cmd, u64 addr, u64 length) +{ + int rc; + + cmd->address = addr; + cmd->length = length; + cmd->type = ND_ARS_PERSISTENT; + + while (1) { + rc = nd_desc->ndctl(nd_desc, NULL, ND_CMD_ARS_START, cmd, + sizeof(*cmd)); + if (rc) + return rc; + switch (cmd->status) { + case 0: + return 0; + case 1: + /* ARS unsupported, but we should never get here */ + return 0; + case 2: + return -EINVAL; + case 3: + /* ARS is in progress */ + msleep(1000); + break; + default: + return -ENXIO; + } + } +} + +static int ars_get_status(struct nvdimm_bus_descriptor *nd_desc, + struct nd_cmd_ars_status *cmd) +{ + int rc; + + while (1) { + rc = nd_desc->ndctl(nd_desc, NULL, ND_CMD_ARS_STATUS, cmd, + sizeof(*cmd)); + if (rc || cmd->status & 0xffff) + return -ENXIO; + + /* Check extended status (Upper two bytes) */ + switch (cmd->status >> 16) { + case 0: + return 0; + case 1: + /* ARS is in progress */ + msleep(1000); + break; + case 2: + /* No ARS performed for the current boot */ + return 0; + default: + return -ENXIO; + } + } +} + +static int ars_status_process_records(struct nvdimm_bus *nvdimm_bus, + struct nd_cmd_ars_status *ars_status, u64 start) +{ + int rc; + u32 i; + + /* + * The address field returned by ars_status should be either + * less than or equal to the address we last started ARS for. + * The (start, length) returned by ars_status should also have + * non-zero overlap with the range we started ARS for. + * If this is not the case, bail. + */ + if (ars_status->address > start || + (ars_status->address + ars_status->length < start)) + return -ENXIO; + + for (i = 0; i < ars_status->num_records; i++) { + rc = nvdimm_bus_add_poison(nvdimm_bus, + ars_status->records[i].err_address, + ars_status->records[i].length); + if (rc) + return rc; + } + + return 0; +} + +static int acpi_nfit_find_poison(struct acpi_nfit_desc *acpi_desc, + struct nd_region_desc *ndr_desc) +{ + struct nvdimm_bus_descriptor *nd_desc = &acpi_desc->nd_desc; + struct nvdimm_bus *nvdimm_bus = acpi_desc->nvdimm_bus; + struct nd_cmd_ars_status *ars_status = NULL; + struct nd_cmd_ars_start *ars_start = NULL; + struct nd_cmd_ars_cap *ars_cap = NULL; + u64 start, len, cur, remaining; + int rc; + + ars_cap = kzalloc(sizeof(*ars_cap), GFP_KERNEL); + if (!ars_cap) + return -ENOMEM; + + start = ndr_desc->res->start; + len = ndr_desc->res->end - ndr_desc->res->start + 1; + + rc = ars_get_cap(nd_desc, ars_cap, start, len); + if (rc) + goto out; + + /* + * If ARS is unsupported, or if the 'Persistent Memory Scrub' flag in + * extended status is not set, skip this but continue initialization + */ + if ((ars_cap->status & 0xffff) || + !(ars_cap->status >> 16 & ND_ARS_PERSISTENT)) { + dev_warn(acpi_desc->dev, + "ARS unsupported (status: 0x%x), won't create an error list\n", + ars_cap->status); + goto out; + } + + /* + * Check if a full-range ARS has been run. If so, use those results + * without having to start a new ARS. + */ + ars_status = kzalloc(ars_cap->max_ars_out + sizeof(*ars_status), + GFP_KERNEL); + if (!ars_status) { + rc = -ENOMEM; + goto out; + } + + rc = ars_get_status(nd_desc, ars_status); + if (rc) + goto out; + + if (ars_status->address <= start && + (ars_status->address + ars_status->length >= start + len)) { + rc = ars_status_process_records(nvdimm_bus, ars_status, start); + goto out; + } + + /* + * ARS_STATUS can overflow if the number of poison entries found is + * greater than the maximum buffer size (ars_cap->max_ars_out) + * To detect overflow, check if the length field of ars_status + * is less than the length we supplied. If so, process the + * error entries we got, adjust the start point, and start again + */ + ars_start = kzalloc(sizeof(*ars_start), GFP_KERNEL); + if (!ars_start) + return -ENOMEM; + + cur = start; + remaining = len; + do { + u64 done, end; + + rc = ars_do_start(nd_desc, ars_start, cur, remaining); + if (rc) + goto out; + + rc = ars_get_status(nd_desc, ars_status); + if (rc) + goto out; + + rc = ars_status_process_records(nvdimm_bus, ars_status, cur); + if (rc) + goto out; + + end = min(cur + remaining, + ars_status->address + ars_status->length); + done = end - cur; + cur += done; + remaining -= done; + } while (remaining); + + out: + kfree(ars_cap); + kfree(ars_start); + kfree(ars_status); + return rc; +} + static int acpi_nfit_init_mapping(struct acpi_nfit_desc *acpi_desc, struct nd_mapping *nd_mapping, struct nd_region_desc *ndr_desc, struct acpi_nfit_memory_map *memdev, @@ -1585,6 +1781,13 @@ static int acpi_nfit_register_region(struct acpi_nfit_desc *acpi_desc, nvdimm_bus = acpi_desc->nvdimm_bus; if (nfit_spa_type(spa) == NFIT_SPA_PM) { + rc = acpi_nfit_find_poison(acpi_desc, ndr_desc); + if (rc) { + dev_err(acpi_desc->dev, + "error while performing ARS to find poison: %d\n", + rc); + return rc; + } if (!nvdimm_pmem_region_create(nvdimm_bus, ndr_desc)) return -ENOMEM; } else if (nfit_spa_type(spa) == NFIT_SPA_VOLATILE) { diff --git a/drivers/nvdimm/core.c b/drivers/nvdimm/core.c index 82c49bb87055..21003b7f0b38 100644 --- a/drivers/nvdimm/core.c +++ b/drivers/nvdimm/core.c @@ -325,6 +325,7 @@ struct nvdimm_bus *__nvdimm_bus_register(struct device *parent, if (!nvdimm_bus) return NULL; INIT_LIST_HEAD(&nvdimm_bus->list); + INIT_LIST_HEAD(&nvdimm_bus->poison_list); init_waitqueue_head(&nvdimm_bus->probe_wait); nvdimm_bus->id = ida_simple_get(&nd_ida, 0, 0, GFP_KERNEL); mutex_init(&nvdimm_bus->reconfig_mutex); @@ -359,6 +360,191 @@ struct nvdimm_bus *__nvdimm_bus_register(struct device *parent, } EXPORT_SYMBOL_GPL(__nvdimm_bus_register); +/** + * __add_badblock_range() - Convert a physical address range to bad sectors + * @disk: the disk associated with the namespace + * @ns_offset: namespace offset where the error range begins (in bytes) + * @len: number of bytes of poison to be added + * + * This assumes that the range provided with (ns_offset, len) is within + * the bounds of physical addresses for this namespace, i.e. lies in the + * interval [ns_start, ns_start + ns_size) + */ +static int __add_badblock_range(struct gendisk *disk, u64 ns_offset, u64 len) +{ + unsigned int sector_size = queue_logical_block_size(disk->queue); + sector_t start_sector; + u64 num_sectors; + u32 rem; + int rc; + + start_sector = div_u64(ns_offset, sector_size); + num_sectors = div_u64_rem(len, sector_size, &rem); + if (rem) + num_sectors++; + + if (!disk->bb) { + rc = disk_alloc_badblocks(disk); + if (rc) + return rc; + } + + if (unlikely(num_sectors > (u64)INT_MAX)) { + u64 remaining = num_sectors; + sector_t s = start_sector; + + while (remaining) { + int done = min_t(u64, remaining, INT_MAX); + + rc = disk_set_badblocks(disk, s, done); + if (rc) + return rc; + remaining -= done; + s += done; + } + return 0; + } else + return disk_set_badblocks(disk, start_sector, num_sectors); +} + +/** + * nvdimm_namespace_add_poison() - Convert a list of poison ranges to badblocks + * @disk: the gendisk associated with the namespace where badblocks + * will be stored + * @offset: offset at the start of the namespace before 'sector 0' + * @ndns: the namespace containing poison ranges + * + * The poison list generated during NFIT initialization may contain multiple, + * possibly overlapping ranges in the SPA (System Physical Address) space. + * Compare each of these ranges to the namespace currently being initialized, + * and add badblocks to the gendisk for all matching sub-ranges + * + * Return: + * 0 - Success + */ +int nvdimm_namespace_add_poison(struct gendisk *disk, resource_size_t offset, + struct nd_namespace_common *ndns) +{ + struct nd_namespace_io *nsio = to_nd_namespace_io(&ndns->dev); + struct nd_region *nd_region = to_nd_region(ndns->dev.parent); + struct nvdimm_bus *nvdimm_bus; + struct list_head *poison_list; + u64 ns_start, ns_end, ns_size; + struct nd_poison *pl; + int rc; + + ns_size = nvdimm_namespace_capacity(ndns) - offset; + ns_start = nsio->res.start + offset; + ns_end = nsio->res.end; + + nvdimm_bus = to_nvdimm_bus(nd_region->dev.parent); + poison_list = &nvdimm_bus->poison_list; + if (list_empty(poison_list)) + return 0; + + list_for_each_entry(pl, poison_list, list) { + u64 pl_end = pl->start + pl->length - 1; + + /* Discard intervals with no intersection */ + if (pl_end < ns_start) + continue; + if (pl->start > ns_end) + continue; + /* Deal with any overlap after start of the namespace */ + if (pl->start >= ns_start) { + u64 start = pl->start; + u64 len; + + if (pl_end <= ns_end) + len = pl->length; + else + len = ns_start + ns_size - pl->start; + + rc = __add_badblock_range(disk, start - ns_start, len); + if (rc) + return rc; + dev_info(&nvdimm_bus->dev, + "Found a poison range (0x%llx, 0x%llx)\n", + start, len); + continue; + } + /* Deal with overlap for poison starting before the namespace */ + if (pl->start < ns_start) { + u64 len; + + if (pl_end < ns_end) + len = pl->start + pl->length - ns_start; + else + len = ns_size; + + rc = __add_badblock_range(disk, 0, len); + if (rc) + return rc; + dev_info(&nvdimm_bus->dev, + "Found a poison range (0x%llx, 0x%llx)\n", + pl->start, len); + } + } + + return 0; +} +EXPORT_SYMBOL_GPL(nvdimm_namespace_add_poison); + +static int __add_poison(struct nvdimm_bus *nvdimm_bus, u64 addr, u64 length) +{ + struct nd_poison *pl; + + pl = kzalloc(sizeof(*pl), GFP_KERNEL); + if (!pl) + return -ENOMEM; + + pl->start = addr; + pl->length = length; + list_add_tail(&pl->list, &nvdimm_bus->poison_list); + + return 0; +} + +int nvdimm_bus_add_poison(struct nvdimm_bus *nvdimm_bus, u64 addr, u64 length) +{ + struct nd_poison *pl; + + if (list_empty(&nvdimm_bus->poison_list)) + return __add_poison(nvdimm_bus, addr, length); + + /* + * There is a chance this is a duplicate, check for those first. + * This will be the common case as ARS_STATUS returns all known + * errors in the SPA space, and we can't query it per region + */ + list_for_each_entry(pl, &nvdimm_bus->poison_list, list) + if (pl->start == addr) { + /* If length has changed, update this list entry */ + if (pl->length != length) + pl->length = length; + return 0; + } + + /* + * If not a duplicate or a simple length update, add the entry as is, + * as any overlapping ranges will get resolved when the list is consumed + * and converted to badblocks + */ + return __add_poison(nvdimm_bus, addr, length); +} +EXPORT_SYMBOL_GPL(nvdimm_bus_add_poison); + +static void free_poison_list(struct list_head *poison_list) +{ + struct nd_poison *pl, *next; + + list_for_each_entry_safe(pl, next, poison_list, list) { + list_del(&pl->list); + kfree(pl); + } + list_del_init(poison_list); +} + static int child_unregister(struct device *dev, void *data) { /* @@ -385,6 +571,7 @@ void nvdimm_bus_unregister(struct nvdimm_bus *nvdimm_bus) nd_synchronize(); device_for_each_child(&nvdimm_bus->dev, NULL, child_unregister); + free_poison_list(&nvdimm_bus->poison_list); nvdimm_bus_destroy_ndctl(nvdimm_bus); device_unregister(&nvdimm_bus->dev); diff --git a/drivers/nvdimm/nd-core.h b/drivers/nvdimm/nd-core.h index 159aed532042..d3b7ea78df96 100644 --- a/drivers/nvdimm/nd-core.h +++ b/drivers/nvdimm/nd-core.h @@ -30,6 +30,7 @@ struct nvdimm_bus { struct list_head list; struct device dev; int id, probe_active; + struct list_head poison_list; struct mutex reconfig_mutex; }; @@ -89,4 +90,6 @@ bool __nd_attach_ndns(struct device *dev, struct nd_namespace_common *attach, ssize_t nd_namespace_store(struct device *dev, struct nd_namespace_common **_ndns, const char *buf, size_t len); +int nvdimm_namespace_add_poison(struct gendisk *disk, resource_size_t offset, + struct nd_namespace_common *ndns); #endif /* __ND_CORE_H__ */ diff --git a/drivers/nvdimm/nd.h b/drivers/nvdimm/nd.h index 417e521d299c..ba91fcd5818d 100644 --- a/drivers/nvdimm/nd.h +++ b/drivers/nvdimm/nd.h @@ -38,6 +38,12 @@ enum { #endif }; +struct nd_poison { + u64 start; + u64 length; + struct list_head list; +}; + struct nvdimm_drvdata { struct device *dev; int nsindex_size; diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index 8ee79893d2f5..5b95043443a3 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -27,6 +27,7 @@ #include #include #include +#include "nd-core.h" #include "pfn.h" #include "nd.h" @@ -168,6 +169,7 @@ static int pmem_attach_disk(struct device *dev, { int nid = dev_to_node(dev); struct gendisk *disk; + int ret; pmem->pmem_queue = blk_alloc_queue_node(GFP_KERNEL, nid); if (!pmem->pmem_queue) @@ -196,6 +198,10 @@ static int pmem_attach_disk(struct device *dev, set_capacity(disk, (pmem->size - pmem->data_offset) / 512); pmem->pmem_disk = disk; + ret = nvdimm_namespace_add_poison(disk, pmem->data_offset, ndns); + if (ret) + return ret; + add_disk(disk); revalidate_disk(disk); diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h index 3f021dc5da8c..bed40dff0e86 100644 --- a/include/linux/libnvdimm.h +++ b/include/linux/libnvdimm.h @@ -116,6 +116,7 @@ static inline struct nd_blk_region_desc *to_blk_region_desc( } +int nvdimm_bus_add_poison(struct nvdimm_bus *nvdimm_bus, u64 addr, u64 length); struct nvdimm_bus *__nvdimm_bus_register(struct device *parent, struct nvdimm_bus_descriptor *nfit_desc, struct module *module); #define nvdimm_bus_register(parent, desc) \ -- cgit v1.2.3 From ad9a8bde2cb19f6876f964fc48acc8b6a2f325ff Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 6 Jan 2016 12:03:41 -0800 Subject: libnvdimm, pmem: move definition of nvdimm_namespace_add_poison to nd.h nd-core.h is private to the libnvdimm core internals and should not be used by drivers. Signed-off-by: Dan Williams --- drivers/nvdimm/nd-core.h | 2 -- drivers/nvdimm/nd.h | 2 ++ drivers/nvdimm/pmem.c | 1 - 3 files changed, 2 insertions(+), 3 deletions(-) (limited to 'drivers/nvdimm/pmem.c') diff --git a/drivers/nvdimm/nd-core.h b/drivers/nvdimm/nd-core.h index d3b7ea78df96..29acdaa757e2 100644 --- a/drivers/nvdimm/nd-core.h +++ b/drivers/nvdimm/nd-core.h @@ -90,6 +90,4 @@ bool __nd_attach_ndns(struct device *dev, struct nd_namespace_common *attach, ssize_t nd_namespace_store(struct device *dev, struct nd_namespace_common **_ndns, const char *buf, size_t len); -int nvdimm_namespace_add_poison(struct gendisk *disk, resource_size_t offset, - struct nd_namespace_common *ndns); #endif /* __ND_CORE_H__ */ diff --git a/drivers/nvdimm/nd.h b/drivers/nvdimm/nd.h index ba91fcd5818d..198933da83e5 100644 --- a/drivers/nvdimm/nd.h +++ b/drivers/nvdimm/nd.h @@ -268,6 +268,8 @@ int nvdimm_namespace_attach_btt(struct nd_namespace_common *ndns); int nvdimm_namespace_detach_btt(struct nd_namespace_common *ndns); const char *nvdimm_namespace_disk_name(struct nd_namespace_common *ndns, char *name); +int nvdimm_namespace_add_poison(struct gendisk *disk, resource_size_t offset, + struct nd_namespace_common *ndns); int nd_blk_region_init(struct nd_region *nd_region); void __nd_iostat_start(struct bio *bio, unsigned long *start); static inline bool nd_iostat_start(struct bio *bio, unsigned long *start) diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index 5b95043443a3..65b2056e7540 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -27,7 +27,6 @@ #include #include #include -#include "nd-core.h" #include "pfn.h" #include "nd.h" -- cgit v1.2.3 From b95f5f4391fad65f1819c2404080b05ca95bdd92 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Mon, 4 Jan 2016 23:50:23 -0800 Subject: libnvdimm: convert to statically allocated badblocks If a device will ever have badblocks it should always have a badblocks instance available. So, similar to md, embed a badblocks instance in pmem_device. This reduces pointer chasing in the i/o fast path, and simplifies the init path. Reported-by: Vishal Verma Signed-off-by: Dan Williams --- drivers/nvdimm/core.c | 57 +++++++++++++++------------------------------------ drivers/nvdimm/nd.h | 4 ++-- drivers/nvdimm/pmem.c | 10 ++++----- 3 files changed, 24 insertions(+), 47 deletions(-) (limited to 'drivers/nvdimm/pmem.c') diff --git a/drivers/nvdimm/core.c b/drivers/nvdimm/core.c index e419d661e294..2e2832b83c93 100644 --- a/drivers/nvdimm/core.c +++ b/drivers/nvdimm/core.c @@ -11,6 +11,7 @@ * General Public License for more details. */ #include +#include #include #include #include @@ -360,21 +361,19 @@ struct nvdimm_bus *__nvdimm_bus_register(struct device *parent, } EXPORT_SYMBOL_GPL(__nvdimm_bus_register); -static void set_badblock(struct gendisk *disk, sector_t s, int num) +static void set_badblock(struct badblocks *bb, sector_t s, int num) { - struct device *dev = disk->driverfs_dev; - - dev_dbg(dev, "Found a poison range (0x%llx, 0x%llx)\n", + dev_dbg(bb->dev, "Found a poison range (0x%llx, 0x%llx)\n", (u64) s * 512, (u64) num * 512); /* this isn't an error as the hardware will still throw an exception */ - if (disk_set_badblocks(disk, s, num)) - dev_info_once(dev, "%s: failed for sector %llx\n", + if (badblocks_set(bb, s, num, 1)) + dev_info_once(bb->dev, "%s: failed for sector %llx\n", __func__, (u64) s); } /** * __add_badblock_range() - Convert a physical address range to bad sectors - * @disk: the disk associated with the namespace + * @bb: badblocks instance to populate * @ns_offset: namespace offset where the error range begins (in bytes) * @len: number of bytes of poison to be added * @@ -382,25 +381,18 @@ static void set_badblock(struct gendisk *disk, sector_t s, int num) * the bounds of physical addresses for this namespace, i.e. lies in the * interval [ns_start, ns_start + ns_size) */ -static int __add_badblock_range(struct gendisk *disk, u64 ns_offset, u64 len) +static void __add_badblock_range(struct badblocks *bb, u64 ns_offset, u64 len) { - unsigned int sector_size = queue_logical_block_size(disk->queue); + const unsigned int sector_size = 512; sector_t start_sector; u64 num_sectors; u32 rem; - int rc; start_sector = div_u64(ns_offset, sector_size); num_sectors = div_u64_rem(len, sector_size, &rem); if (rem) num_sectors++; - if (!disk->bb) { - rc = disk_alloc_badblocks(disk); - if (rc) - return rc; - } - if (unlikely(num_sectors > (u64)INT_MAX)) { u64 remaining = num_sectors; sector_t s = start_sector; @@ -408,33 +400,27 @@ static int __add_badblock_range(struct gendisk *disk, u64 ns_offset, u64 len) while (remaining) { int done = min_t(u64, remaining, INT_MAX); - set_badblock(disk, s, done); + set_badblock(bb, s, done); remaining -= done; s += done; } } else - set_badblock(disk, start_sector, num_sectors); - - return 0; + set_badblock(bb, start_sector, num_sectors); } /** * nvdimm_namespace_add_poison() - Convert a list of poison ranges to badblocks - * @disk: the gendisk associated with the namespace where badblocks - * will be stored - * @offset: offset at the start of the namespace before 'sector 0' * @ndns: the namespace containing poison ranges + * @bb: badblocks instance to populate + * @offset: offset at the start of the namespace before 'sector 0' * * The poison list generated during NFIT initialization may contain multiple, * possibly overlapping ranges in the SPA (System Physical Address) space. * Compare each of these ranges to the namespace currently being initialized, * and add badblocks to the gendisk for all matching sub-ranges - * - * Return: - * 0 - Success */ -int nvdimm_namespace_add_poison(struct gendisk *disk, resource_size_t offset, - struct nd_namespace_common *ndns) +void nvdimm_namespace_add_poison(struct nd_namespace_common *ndns, + struct badblocks *bb, resource_size_t offset) { struct nd_namespace_io *nsio = to_nd_namespace_io(&ndns->dev); struct nd_region *nd_region = to_nd_region(ndns->dev.parent); @@ -442,7 +428,6 @@ int nvdimm_namespace_add_poison(struct gendisk *disk, resource_size_t offset, struct list_head *poison_list; u64 ns_start, ns_end, ns_size; struct nd_poison *pl; - int rc; ns_size = nvdimm_namespace_capacity(ndns) - offset; ns_start = nsio->res.start + offset; @@ -451,7 +436,7 @@ int nvdimm_namespace_add_poison(struct gendisk *disk, resource_size_t offset, nvdimm_bus = to_nvdimm_bus(nd_region->dev.parent); poison_list = &nvdimm_bus->poison_list; if (list_empty(poison_list)) - return 0; + return; list_for_each_entry(pl, poison_list, list) { u64 pl_end = pl->start + pl->length - 1; @@ -470,10 +455,7 @@ int nvdimm_namespace_add_poison(struct gendisk *disk, resource_size_t offset, len = pl->length; else len = ns_start + ns_size - pl->start; - - rc = __add_badblock_range(disk, start - ns_start, len); - if (rc) - return rc; + __add_badblock_range(bb, start - ns_start, len); continue; } /* Deal with overlap for poison starting before the namespace */ @@ -484,14 +466,9 @@ int nvdimm_namespace_add_poison(struct gendisk *disk, resource_size_t offset, len = pl->start + pl->length - ns_start; else len = ns_size; - - rc = __add_badblock_range(disk, 0, len); - if (rc) - return rc; + __add_badblock_range(bb, 0, len); } } - - return 0; } EXPORT_SYMBOL_GPL(nvdimm_namespace_add_poison); diff --git a/drivers/nvdimm/nd.h b/drivers/nvdimm/nd.h index 198933da83e5..288d96ec7233 100644 --- a/drivers/nvdimm/nd.h +++ b/drivers/nvdimm/nd.h @@ -268,8 +268,8 @@ int nvdimm_namespace_attach_btt(struct nd_namespace_common *ndns); int nvdimm_namespace_detach_btt(struct nd_namespace_common *ndns); const char *nvdimm_namespace_disk_name(struct nd_namespace_common *ndns, char *name); -int nvdimm_namespace_add_poison(struct gendisk *disk, resource_size_t offset, - struct nd_namespace_common *ndns); +void nvdimm_namespace_add_poison(struct nd_namespace_common *ndns, + struct badblocks *bb, resource_size_t offset); int nd_blk_region_init(struct nd_region *nd_region); void __nd_iostat_start(struct bio *bio, unsigned long *start); static inline bool nd_iostat_start(struct bio *bio, unsigned long *start) diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index 65b2056e7540..2b1f3009f827 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -41,6 +42,7 @@ struct pmem_device { phys_addr_t data_offset; void __pmem *virt_addr; size_t size; + struct badblocks bb; }; static int pmem_major; @@ -168,7 +170,6 @@ static int pmem_attach_disk(struct device *dev, { int nid = dev_to_node(dev); struct gendisk *disk; - int ret; pmem->pmem_queue = blk_alloc_queue_node(GFP_KERNEL, nid); if (!pmem->pmem_queue) @@ -196,10 +197,9 @@ static int pmem_attach_disk(struct device *dev, disk->driverfs_dev = dev; set_capacity(disk, (pmem->size - pmem->data_offset) / 512); pmem->pmem_disk = disk; - - ret = nvdimm_namespace_add_poison(disk, pmem->data_offset, ndns); - if (ret) - return ret; + if (devm_init_badblocks(dev, &pmem->bb)) + return -ENOMEM; + nvdimm_namespace_add_poison(ndns, &pmem->bb, pmem->data_offset); add_disk(disk); revalidate_disk(disk); -- cgit v1.2.3 From e10624f8c09710b3b0740ea3847627ea02f55c39 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 6 Jan 2016 12:03:41 -0800 Subject: pmem: fail io-requests to known bad blocks Check the sectors specified in a read bio to see if they hit a known bad block, and return an error code pmem_do_bvec(). Note that the ->rw_page() is not in a position to return errors. For now, copy the same layering violation present in zram_rw_page() to avoid crashes of the form: kernel BUG at mm/filemap.c:822! [..] Call Trace: [] page_endio+0x1e/0x60 [] mpage_end_io+0x39/0x60 [] bio_endio+0x3f/0x60 [] pmem_make_request+0x111/0x230 [nd_pmem] ...i.e. unlock a page that was already unlocked via pmem_rw_page() => page_endio(). Reported-by: Vishal Verma Signed-off-by: Dan Williams --- drivers/nvdimm/pmem.c | 46 +++++++++++++++++++++++++++++++++++++++------- 1 file changed, 39 insertions(+), 7 deletions(-) (limited to 'drivers/nvdimm/pmem.c') diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index 2b1f3009f827..d00c659d1304 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -47,7 +47,20 @@ struct pmem_device { static int pmem_major; -static void pmem_do_bvec(struct pmem_device *pmem, struct page *page, +static bool is_bad_pmem(struct badblocks *bb, sector_t sector, unsigned int len) +{ + if (bb->count) { + sector_t first_bad; + int num_bad; + + return !!badblocks_check(bb, sector, len / 512, &first_bad, + &num_bad); + } + + return false; +} + +static int pmem_do_bvec(struct pmem_device *pmem, struct page *page, unsigned int len, unsigned int off, int rw, sector_t sector) { @@ -56,6 +69,8 @@ static void pmem_do_bvec(struct pmem_device *pmem, struct page *page, void __pmem *pmem_addr = pmem->virt_addr + pmem_off; if (rw == READ) { + if (unlikely(is_bad_pmem(&pmem->bb, sector, len))) + return -EIO; memcpy_from_pmem(mem + off, pmem_addr, len); flush_dcache_page(page); } else { @@ -64,10 +79,12 @@ static void pmem_do_bvec(struct pmem_device *pmem, struct page *page, } kunmap_atomic(mem); + return 0; } static blk_qc_t pmem_make_request(struct request_queue *q, struct bio *bio) { + int rc = 0; bool do_acct; unsigned long start; struct bio_vec bvec; @@ -76,9 +93,15 @@ static blk_qc_t pmem_make_request(struct request_queue *q, struct bio *bio) struct pmem_device *pmem = bdev->bd_disk->private_data; do_acct = nd_iostat_start(bio, &start); - bio_for_each_segment(bvec, bio, iter) - pmem_do_bvec(pmem, bvec.bv_page, bvec.bv_len, bvec.bv_offset, - bio_data_dir(bio), iter.bi_sector); + bio_for_each_segment(bvec, bio, iter) { + rc = pmem_do_bvec(pmem, bvec.bv_page, bvec.bv_len, + bvec.bv_offset, bio_data_dir(bio), + iter.bi_sector); + if (rc) { + bio->bi_error = rc; + break; + } + } if (do_acct) nd_iostat_end(bio, start); @@ -93,13 +116,22 @@ static int pmem_rw_page(struct block_device *bdev, sector_t sector, struct page *page, int rw) { struct pmem_device *pmem = bdev->bd_disk->private_data; + int rc; - pmem_do_bvec(pmem, page, PAGE_CACHE_SIZE, 0, rw, sector); + rc = pmem_do_bvec(pmem, page, PAGE_CACHE_SIZE, 0, rw, sector); if (rw & WRITE) wmb_pmem(); - page_endio(page, rw & WRITE, 0); - return 0; + /* + * The ->rw_page interface is subtle and tricky. The core + * retries on any error, so we can only invoke page_endio() in + * the successful completion case. Otherwise, we'll see crashes + * caused by double completion. + */ + if (rc == 0) + page_endio(page, rw & WRITE, 0); + + return rc; } static long pmem_direct_access(struct block_device *bdev, sector_t sector, -- cgit v1.2.3 From 57f7f317abdd07954cb116280c88d18378afb33e Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 6 Jan 2016 12:03:42 -0800 Subject: pmem, dax: disable dax in the presence of bad blocks Longer term teach dax to punch "error" holes in mapping requests and deliver SIGBUS to applications that consume a bad pmem page. For now, simply disable the dax performance optimization in the presence of known errors. Signed-off-by: Dan Williams --- block/ioctl.c | 10 ++++++++++ drivers/nvdimm/pmem.c | 1 + 2 files changed, 11 insertions(+) (limited to 'drivers/nvdimm/pmem.c') diff --git a/block/ioctl.c b/block/ioctl.c index 7a964d842913..2c84683aada5 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -4,6 +4,7 @@ #include #include #include +#include #include #include #include @@ -422,6 +423,15 @@ bool blkdev_dax_capable(struct block_device *bdev) || (bdev->bd_part->nr_sects % (PAGE_SIZE / 512))) return false; + /* + * If the device has known bad blocks, force all I/O through the + * driver / page cache. + * + * TODO: support finer grained dax error handling + */ + if (disk->bb && disk->bb->count) + return false; + return true; } diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index d00c659d1304..6a1832b3983c 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -233,6 +233,7 @@ static int pmem_attach_disk(struct device *dev, return -ENOMEM; nvdimm_namespace_add_poison(ndns, &pmem->bb, pmem->data_offset); + disk->bb = &pmem->bb; add_disk(disk); revalidate_disk(disk); -- cgit v1.2.3 From 710d69cc99507803ed91b4ec7368fbd66d59f014 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Mon, 4 Jan 2016 23:31:24 -0800 Subject: libnvdimm, pmem: nvdimm_read_bytes() badblocks support Support badblock checking in all the pmem read paths that do not go through the block layer. This protects info block reads (btt or pfn) as well as data reads to a pmem namespace via a btt instance. Signed-off-by: Dan Williams --- drivers/nvdimm/pmem.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'drivers/nvdimm/pmem.c') diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index 6a1832b3983c..a88762d0d086 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -229,6 +229,7 @@ static int pmem_attach_disk(struct device *dev, disk->driverfs_dev = dev; set_capacity(disk, (pmem->size - pmem->data_offset) / 512); pmem->pmem_disk = disk; + devm_exit_badblocks(dev, &pmem->bb); if (devm_init_badblocks(dev, &pmem->bb)) return -ENOMEM; nvdimm_namespace_add_poison(ndns, &pmem->bb, pmem->data_offset); @@ -250,9 +251,13 @@ static int pmem_rw_bytes(struct nd_namespace_common *ndns, return -EFAULT; } - if (rw == READ) + if (rw == READ) { + unsigned int sz_align = ALIGN(size + (offset & (512 - 1)), 512); + + if (unlikely(is_bad_pmem(&pmem->bb, offset / 512, sz_align))) + return -EIO; memcpy_from_pmem(buf, pmem->virt_addr + offset, size); - else { + } else { memcpy_to_pmem(pmem->virt_addr + offset, buf, size); wmb_pmem(); } @@ -427,6 +432,9 @@ static int nd_pmem_probe(struct device *dev) pmem->ndns = ndns; dev_set_drvdata(dev, pmem); ndns->rw_bytes = pmem_rw_bytes; + if (devm_init_badblocks(dev, &pmem->bb)) + return -ENOMEM; + nvdimm_namespace_add_poison(ndns, &pmem->bb, 0); if (is_nd_btt(dev)) return nvdimm_namespace_attach_btt(ndns); -- cgit v1.2.3