diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2017-11-17 09:51:57 -0800 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2017-11-17 09:51:57 -0800 |
commit | a3841f94c7ecb3ede0f888d3fcfe8fb6368ddd7a (patch) | |
tree | 6625eedf10d0672068ee218bb893a5a0e1803df2 | |
parent | adeba81ac2a6451f44545874da3d181081f0ab04 (diff) | |
parent | 4247f24c23589bcc3bc3490515ef8c9497e9ae55 (diff) | |
download | linux-a3841f94c7ecb3ede0f888d3fcfe8fb6368ddd7a.tar.bz2 |
Merge tag 'libnvdimm-for-4.15' of git://git.kernel.org/pub/scm/linux/kernel/git/nvdimm/nvdimm
Pull libnvdimm and dax updates from Dan Williams:
"Save for a few late fixes, all of these commits have shipped in -next
releases since before the merge window opened, and 0day has given a
build success notification.
The ext4 touches came from Jan, and the xfs touches have Darrick's
reviewed-by. An xfstest for the MAP_SYNC feature has been through
a few round of reviews and is on track to be merged.
- Introduce MAP_SYNC and MAP_SHARED_VALIDATE, a mechanism to enable
'userspace flush' of persistent memory updates via filesystem-dax
mappings. It arranges for any filesystem metadata updates that may
be required to satisfy a write fault to also be flushed ("on disk")
before the kernel returns to userspace from the fault handler.
Effectively every write-fault that dirties metadata completes an
fsync() before returning from the fault handler. The new
MAP_SHARED_VALIDATE mapping type guarantees that the MAP_SYNC flag
is validated as supported by the filesystem's ->mmap() file
operation.
- Add support for the standard ACPI 6.2 label access methods that
replace the NVDIMM_FAMILY_INTEL (vendor specific) label methods.
This enables interoperability with environments that only implement
the standardized methods.
- Add support for the ACPI 6.2 NVDIMM media error injection methods.
- Add support for the NVDIMM_FAMILY_INTEL v1.6 DIMM commands for
latch last shutdown status, firmware update, SMART error injection,
and SMART alarm threshold control.
- Cleanup physical address information disclosures to be root-only.
- Fix revalidation of the DIMM "locked label area" status to support
dynamic unlock of the label area.
- Expand unit test infrastructure to mock the ACPI 6.2 Translate SPA
(system-physical-address) command and error injection commands.
Acknowledgements that came after the commits were pushed to -next:
- 957ac8c421ad ("dax: fix PMD faults on zero-length files"):
Reviewed-by: Ross Zwisler <ross.zwisler@linux.intel.com>
- a39e596baa07 ("xfs: support for synchronous DAX faults") and
7b565c9f965b ("xfs: Implement xfs_filemap_pfn_mkwrite() using __xfs_filemap_fault()")
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>"
* tag 'libnvdimm-for-4.15' of git://git.kernel.org/pub/scm/linux/kernel/git/nvdimm/nvdimm: (49 commits)
acpi, nfit: add 'Enable Latch System Shutdown Status' command support
dax: fix general protection fault in dax_alloc_inode
dax: fix PMD faults on zero-length files
dax: stop requiring a live device for dax_flush()
brd: remove dax support
dax: quiet bdev_dax_supported()
fs, dax: unify IOMAP_F_DIRTY read vs write handling policy in the dax core
tools/testing/nvdimm: unit test clear-error commands
acpi, nfit: validate commands against the device type
tools/testing/nvdimm: stricter bounds checking for error injection commands
xfs: support for synchronous DAX faults
xfs: Implement xfs_filemap_pfn_mkwrite() using __xfs_filemap_fault()
ext4: Support for synchronous DAX faults
ext4: Simplify error handling in ext4_dax_huge_fault()
dax: Implement dax_finish_sync_fault()
dax, iomap: Add support for synchronous faults
mm: Define MAP_SYNC and VM_SYNC flags
dax: Allow tuning whether dax_insert_mapping_entry() dirties entry
dax: Allow dax_iomap_fault() to return pfn
dax: Fix comment describing dax_iomap_fault()
...
48 files changed, 1405 insertions, 560 deletions
diff --git a/MAINTAINERS b/MAINTAINERS index 540762a62906..e04d108055f0 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -4208,7 +4208,7 @@ L: linux-i2c@vger.kernel.org S: Maintained F: drivers/i2c/busses/i2c-diolan-u2c.c -DIRECT ACCESS (DAX) +FILESYSTEM DIRECT ACCESS (DAX) M: Matthew Wilcox <mawilcox@microsoft.com> M: Ross Zwisler <ross.zwisler@linux.intel.com> L: linux-fsdevel@vger.kernel.org @@ -4217,6 +4217,12 @@ F: fs/dax.c F: include/linux/dax.h F: include/trace/events/fs_dax.h +DEVICE DIRECT ACCESS (DAX) +M: Dan Williams <dan.j.williams@intel.com> +L: linux-nvdimm@lists.01.org +S: Supported +F: drivers/dax/ + DIRECTORY NOTIFICATION (DNOTIFY) M: Jan Kara <jack@suse.cz> R: Amir Goldstein <amir73il@gmail.com> diff --git a/arch/alpha/include/uapi/asm/mman.h b/arch/alpha/include/uapi/asm/mman.h index 6bf730063e3f..2dbdf59258d9 100644 --- a/arch/alpha/include/uapi/asm/mman.h +++ b/arch/alpha/include/uapi/asm/mman.h @@ -12,6 +12,7 @@ #define MAP_SHARED 0x01 /* Share changes */ #define MAP_PRIVATE 0x02 /* Changes are private */ +#define MAP_SHARED_VALIDATE 0x03 /* share + validate extension flags */ #define MAP_TYPE 0x0f /* Mask for type of mapping (OSF/1 is _wrong_) */ #define MAP_FIXED 0x100 /* Interpret addr exactly */ #define MAP_ANONYMOUS 0x10 /* don't use a file */ diff --git a/arch/mips/include/uapi/asm/mman.h b/arch/mips/include/uapi/asm/mman.h index 20c3df7a8fdd..606e02ca4b6c 100644 --- a/arch/mips/include/uapi/asm/mman.h +++ b/arch/mips/include/uapi/asm/mman.h @@ -29,6 +29,7 @@ */ #define MAP_SHARED 0x001 /* Share changes */ #define MAP_PRIVATE 0x002 /* Changes are private */ +#define MAP_SHARED_VALIDATE 0x003 /* share + validate extension flags */ #define MAP_TYPE 0x00f /* Mask for type of mapping */ #define MAP_FIXED 0x010 /* Interpret addr exactly */ diff --git a/arch/parisc/include/uapi/asm/mman.h b/arch/parisc/include/uapi/asm/mman.h index d1af0d74a188..80510ba44c08 100644 --- a/arch/parisc/include/uapi/asm/mman.h +++ b/arch/parisc/include/uapi/asm/mman.h @@ -12,6 +12,7 @@ #define MAP_SHARED 0x01 /* Share changes */ #define MAP_PRIVATE 0x02 /* Changes are private */ +#define MAP_SHARED_VALIDATE 0x03 /* share + validate extension flags */ #define MAP_TYPE 0x03 /* Mask for type of mapping */ #define MAP_FIXED 0x04 /* Interpret addr exactly */ #define MAP_ANONYMOUS 0x10 /* don't use a file */ diff --git a/arch/xtensa/include/uapi/asm/mman.h b/arch/xtensa/include/uapi/asm/mman.h index 2bfe590694fc..3e9d01ada81f 100644 --- a/arch/xtensa/include/uapi/asm/mman.h +++ b/arch/xtensa/include/uapi/asm/mman.h @@ -36,6 +36,7 @@ */ #define MAP_SHARED 0x001 /* Share changes */ #define MAP_PRIVATE 0x002 /* Changes are private */ +#define MAP_SHARED_VALIDATE 0x003 /* share + validate extension flags */ #define MAP_TYPE 0x00f /* Mask for type of mapping */ #define MAP_FIXED 0x010 /* Interpret addr exactly */ diff --git a/drivers/acpi/nfit/core.c b/drivers/acpi/nfit/core.c index 9c2c49b6a240..ff2580e7611d 100644 --- a/drivers/acpi/nfit/core.c +++ b/drivers/acpi/nfit/core.c @@ -183,13 +183,33 @@ static int xlat_bus_status(void *buf, unsigned int cmd, u32 status) return 0; } -static int xlat_nvdimm_status(void *buf, unsigned int cmd, u32 status) +#define ACPI_LABELS_LOCKED 3 + +static int xlat_nvdimm_status(struct nvdimm *nvdimm, void *buf, unsigned int cmd, + u32 status) { + struct nfit_mem *nfit_mem = nvdimm_provider_data(nvdimm); + switch (cmd) { case ND_CMD_GET_CONFIG_SIZE: + /* + * In the _LSI, _LSR, _LSW case the locked status is + * communicated via the read/write commands + */ + if (nfit_mem->has_lsi) + break; + if (status >> 16 & ND_CONFIG_LOCKED) return -EACCES; break; + case ND_CMD_GET_CONFIG_DATA: + if (nfit_mem->has_lsr && status == ACPI_LABELS_LOCKED) + return -EACCES; + break; + case ND_CMD_SET_CONFIG_DATA: + if (nfit_mem->has_lsw && status == ACPI_LABELS_LOCKED) + return -EACCES; + break; default: break; } @@ -205,13 +225,182 @@ static int xlat_status(struct nvdimm *nvdimm, void *buf, unsigned int cmd, { if (!nvdimm) return xlat_bus_status(buf, cmd, status); - return xlat_nvdimm_status(buf, cmd, status); + return xlat_nvdimm_status(nvdimm, buf, cmd, status); +} + +/* convert _LS{I,R} packages to the buffer object acpi_nfit_ctl expects */ +static union acpi_object *pkg_to_buf(union acpi_object *pkg) +{ + int i; + void *dst; + size_t size = 0; + union acpi_object *buf = NULL; + + if (pkg->type != ACPI_TYPE_PACKAGE) { + WARN_ONCE(1, "BIOS bug, unexpected element type: %d\n", + pkg->type); + goto err; + } + + for (i = 0; i < pkg->package.count; i++) { + union acpi_object *obj = &pkg->package.elements[i]; + + if (obj->type == ACPI_TYPE_INTEGER) + size += 4; + else if (obj->type == ACPI_TYPE_BUFFER) + size += obj->buffer.length; + else { + WARN_ONCE(1, "BIOS bug, unexpected element type: %d\n", + obj->type); + goto err; + } + } + + buf = ACPI_ALLOCATE(sizeof(*buf) + size); + if (!buf) + goto err; + + dst = buf + 1; + buf->type = ACPI_TYPE_BUFFER; + buf->buffer.length = size; + buf->buffer.pointer = dst; + for (i = 0; i < pkg->package.count; i++) { + union acpi_object *obj = &pkg->package.elements[i]; + + if (obj->type == ACPI_TYPE_INTEGER) { + memcpy(dst, &obj->integer.value, 4); + dst += 4; + } else if (obj->type == ACPI_TYPE_BUFFER) { + memcpy(dst, obj->buffer.pointer, obj->buffer.length); + dst += obj->buffer.length; + } + } +err: + ACPI_FREE(pkg); + return buf; +} + +static union acpi_object *int_to_buf(union acpi_object *integer) +{ + union acpi_object *buf = ACPI_ALLOCATE(sizeof(*buf) + 4); + void *dst = NULL; + + if (!buf) + goto err; + + if (integer->type != ACPI_TYPE_INTEGER) { + WARN_ONCE(1, "BIOS bug, unexpected element type: %d\n", + integer->type); + goto err; + } + + dst = buf + 1; + buf->type = ACPI_TYPE_BUFFER; + buf->buffer.length = 4; + buf->buffer.pointer = dst; + memcpy(dst, &integer->integer.value, 4); +err: + ACPI_FREE(integer); + return buf; +} + +static union acpi_object *acpi_label_write(acpi_handle handle, u32 offset, + u32 len, void *data) +{ + acpi_status rc; + struct acpi_buffer buf = { ACPI_ALLOCATE_BUFFER, NULL }; + struct acpi_object_list input = { + .count = 3, + .pointer = (union acpi_object []) { + [0] = { + .integer.type = ACPI_TYPE_INTEGER, + .integer.value = offset, + }, + [1] = { + .integer.type = ACPI_TYPE_INTEGER, + .integer.value = len, + }, + [2] = { + .buffer.type = ACPI_TYPE_BUFFER, + .buffer.pointer = data, + .buffer.length = len, + }, + }, + }; + + rc = acpi_evaluate_object(handle, "_LSW", &input, &buf); + if (ACPI_FAILURE(rc)) + return NULL; + return int_to_buf(buf.pointer); +} + +static union acpi_object *acpi_label_read(acpi_handle handle, u32 offset, + u32 len) +{ + acpi_status rc; + struct acpi_buffer buf = { ACPI_ALLOCATE_BUFFER, NULL }; + struct acpi_object_list input = { + .count = 2, + .pointer = (union acpi_object []) { + [0] = { + .integer.type = ACPI_TYPE_INTEGER, + .integer.value = offset, + }, + [1] = { + .integer.type = ACPI_TYPE_INTEGER, + .integer.value = len, + }, + }, + }; + + rc = acpi_evaluate_object(handle, "_LSR", &input, &buf); + if (ACPI_FAILURE(rc)) + return NULL; + return pkg_to_buf(buf.pointer); +} + +static union acpi_object *acpi_label_info(acpi_handle handle) +{ + acpi_status rc; + struct acpi_buffer buf = { ACPI_ALLOCATE_BUFFER, NULL }; + + rc = acpi_evaluate_object(handle, "_LSI", NULL, &buf); + if (ACPI_FAILURE(rc)) + return NULL; + return pkg_to_buf(buf.pointer); +} + +static u8 nfit_dsm_revid(unsigned family, unsigned func) +{ + static const u8 revid_table[NVDIMM_FAMILY_MAX+1][32] = { + [NVDIMM_FAMILY_INTEL] = { + [NVDIMM_INTEL_GET_MODES] = 2, + [NVDIMM_INTEL_GET_FWINFO] = 2, + [NVDIMM_INTEL_START_FWUPDATE] = 2, + [NVDIMM_INTEL_SEND_FWUPDATE] = 2, + [NVDIMM_INTEL_FINISH_FWUPDATE] = 2, + [NVDIMM_INTEL_QUERY_FWUPDATE] = 2, + [NVDIMM_INTEL_SET_THRESHOLD] = 2, + [NVDIMM_INTEL_INJECT_ERROR] = 2, + }, + }; + u8 id; + + if (family > NVDIMM_FAMILY_MAX) + return 0; + if (func > 31) + return 0; + id = revid_table[family][func]; + if (id == 0) + return 1; /* default */ + return id; } int acpi_nfit_ctl(struct nvdimm_bus_descriptor *nd_desc, struct nvdimm *nvdimm, unsigned int cmd, void *buf, unsigned int buf_len, int *cmd_rc) { struct acpi_nfit_desc *acpi_desc = to_acpi_nfit_desc(nd_desc); + struct nfit_mem *nfit_mem = nvdimm_provider_data(nvdimm); union acpi_object in_obj, in_buf, *out_obj; const struct nd_cmd_desc *desc = NULL; struct device *dev = acpi_desc->dev; @@ -235,7 +424,6 @@ int acpi_nfit_ctl(struct nvdimm_bus_descriptor *nd_desc, struct nvdimm *nvdimm, } if (nvdimm) { - struct nfit_mem *nfit_mem = nvdimm_provider_data(nvdimm); struct acpi_device *adev = nfit_mem->adev; if (!adev) @@ -294,7 +482,29 @@ int acpi_nfit_ctl(struct nvdimm_bus_descriptor *nd_desc, struct nvdimm *nvdimm, in_buf.buffer.pointer, min_t(u32, 256, in_buf.buffer.length), true); - out_obj = acpi_evaluate_dsm(handle, guid, 1, func, &in_obj); + /* call the BIOS, prefer the named methods over _DSM if available */ + if (nvdimm && cmd == ND_CMD_GET_CONFIG_SIZE && nfit_mem->has_lsi) + out_obj = acpi_label_info(handle); + else if (nvdimm && cmd == ND_CMD_GET_CONFIG_DATA && nfit_mem->has_lsr) { + struct nd_cmd_get_config_data_hdr *p = buf; + + out_obj = acpi_label_read(handle, p->in_offset, p->in_length); + } else if (nvdimm && cmd == ND_CMD_SET_CONFIG_DATA + && nfit_mem->has_lsw) { + struct nd_cmd_set_config_hdr *p = buf; + + out_obj = acpi_label_write(handle, p->in_offset, p->in_length, + p->in_buf); + } else { + u8 revid; + + if (nvdimm) + revid = nfit_dsm_revid(nfit_mem->family, func); + else + revid = 1; + out_obj = acpi_evaluate_dsm(handle, guid, revid, func, &in_obj); + } + if (!out_obj) { dev_dbg(dev, "%s:%s _DSM failed cmd: %s\n", __func__, dimm_name, cmd_name); @@ -356,8 +566,10 @@ int acpi_nfit_ctl(struct nvdimm_bus_descriptor *nd_desc, struct nvdimm *nvdimm, * Set fw_status for all the commands with a known format to be * later interpreted by xlat_status(). */ - if (i >= 1 && ((cmd >= ND_CMD_ARS_CAP && cmd <= ND_CMD_CLEAR_ERROR) - || (cmd >= ND_CMD_SMART && cmd <= ND_CMD_VENDOR))) + if (i >= 1 && ((!nvdimm && cmd >= ND_CMD_ARS_CAP + && cmd <= ND_CMD_CLEAR_ERROR) + || (nvdimm && cmd >= ND_CMD_SMART + && cmd <= ND_CMD_VENDOR))) fw_status = *(u32 *) out_obj->buffer.pointer; if (offset + in_buf.buffer.length < buf_len) { @@ -1431,6 +1643,7 @@ static int acpi_nfit_add_dimm(struct acpi_nfit_desc *acpi_desc, { struct acpi_device *adev, *adev_dimm; struct device *dev = acpi_desc->dev; + union acpi_object *obj; unsigned long dsm_mask; const guid_t *guid; int i; @@ -1463,7 +1676,7 @@ static int acpi_nfit_add_dimm(struct acpi_nfit_desc *acpi_desc, * different command sets. Note, that checking for function0 (bit0) * tells us if any commands are reachable through this GUID. */ - for (i = NVDIMM_FAMILY_INTEL; i <= NVDIMM_FAMILY_MSFT; i++) + for (i = 0; i <= NVDIMM_FAMILY_MAX; i++) if (acpi_check_dsm(adev_dimm->handle, to_nfit_uuid(i), 1, 1)) if (family < 0 || i == default_dsm_family) family = i; @@ -1473,7 +1686,7 @@ static int acpi_nfit_add_dimm(struct acpi_nfit_desc *acpi_desc, if (override_dsm_mask && !disable_vendor_specific) dsm_mask = override_dsm_mask; else if (nfit_mem->family == NVDIMM_FAMILY_INTEL) { - dsm_mask = 0x3fe; + dsm_mask = NVDIMM_INTEL_CMDMASK; if (disable_vendor_specific) dsm_mask &= ~(1 << ND_CMD_VENDOR); } else if (nfit_mem->family == NVDIMM_FAMILY_HPE1) { @@ -1493,9 +1706,32 @@ static int acpi_nfit_add_dimm(struct acpi_nfit_desc *acpi_desc, guid = to_nfit_uuid(nfit_mem->family); for_each_set_bit(i, &dsm_mask, BITS_PER_LONG) - if (acpi_check_dsm(adev_dimm->handle, guid, 1, 1ULL << i)) + if (acpi_check_dsm(adev_dimm->handle, guid, + nfit_dsm_revid(nfit_mem->family, i), + 1ULL << i)) set_bit(i, &nfit_mem->dsm_mask); + obj = acpi_label_info(adev_dimm->handle); + if (obj) { + ACPI_FREE(obj); + nfit_mem->has_lsi = 1; + dev_dbg(dev, "%s: has _LSI\n", dev_name(&adev_dimm->dev)); + } + + obj = acpi_label_read(adev_dimm->handle, 0, 0); + if (obj) { + ACPI_FREE(obj); + nfit_mem->has_lsr = 1; + dev_dbg(dev, "%s: has _LSR\n", dev_name(&adev_dimm->dev)); + } + + obj = acpi_label_write(adev_dimm->handle, 0, 0, NULL); + if (obj) { + ACPI_FREE(obj); + nfit_mem->has_lsw = 1; + dev_dbg(dev, "%s: has _LSW\n", dev_name(&adev_dimm->dev)); + } + return 0; } @@ -1571,8 +1807,21 @@ static int acpi_nfit_register_dimms(struct acpi_nfit_desc *acpi_desc) * userspace interface. */ cmd_mask = 1UL << ND_CMD_CALL; - if (nfit_mem->family == NVDIMM_FAMILY_INTEL) - cmd_mask |= nfit_mem->dsm_mask; + if (nfit_mem->family == NVDIMM_FAMILY_INTEL) { + /* + * These commands have a 1:1 correspondence + * between DSM payload and libnvdimm ioctl + * payload format. + */ + cmd_mask |= nfit_mem->dsm_mask & NVDIMM_STANDARD_CMDMASK; + } + + if (nfit_mem->has_lsi) + set_bit(ND_CMD_GET_CONFIG_SIZE, &cmd_mask); + if (nfit_mem->has_lsr) + set_bit(ND_CMD_GET_CONFIG_DATA, &cmd_mask); + if (nfit_mem->has_lsw) + set_bit(ND_CMD_SET_CONFIG_DATA, &cmd_mask); flush = nfit_mem->nfit_flush ? nfit_mem->nfit_flush->flush : NULL; @@ -1645,6 +1894,7 @@ static void acpi_nfit_init_dsms(struct acpi_nfit_desc *acpi_desc) int i; nd_desc->cmd_mask = acpi_desc->bus_cmd_force_en; + nd_desc->bus_dsm_mask = acpi_desc->bus_nfit_cmd_force_en; adev = to_acpi_dev(acpi_desc); if (!adev) return; @@ -2239,7 +2489,7 @@ static int ars_status_process_records(struct acpi_nfit_desc *acpi_desc, if (ars_status->out_length < 44 + sizeof(struct nd_ars_record) * (i + 1)) break; - rc = nvdimm_bus_add_poison(nvdimm_bus, + rc = nvdimm_bus_add_badrange(nvdimm_bus, ars_status->records[i].err_address, ars_status->records[i].length); if (rc) diff --git a/drivers/acpi/nfit/mce.c b/drivers/acpi/nfit/mce.c index feeb95d574fa..b92921439657 100644 --- a/drivers/acpi/nfit/mce.c +++ b/drivers/acpi/nfit/mce.c @@ -67,7 +67,7 @@ static int nfit_handle_mce(struct notifier_block *nb, unsigned long val, continue; /* If this fails due to an -ENOMEM, there is little we can do */ - nvdimm_bus_add_poison(acpi_desc->nvdimm_bus, + nvdimm_bus_add_badrange(acpi_desc->nvdimm_bus, ALIGN(mce->addr, L1_CACHE_BYTES), L1_CACHE_BYTES); nvdimm_region_notify(nfit_spa->nd_region, diff --git a/drivers/acpi/nfit/nfit.h b/drivers/acpi/nfit/nfit.h index 54292db61262..f0cf18b2da8b 100644 --- a/drivers/acpi/nfit/nfit.h +++ b/drivers/acpi/nfit/nfit.h @@ -24,7 +24,7 @@ /* ACPI 6.1 */ #define UUID_NFIT_BUS "2f10e7a4-9e91-11e4-89d3-123b93f75cba" -/* http://pmem.io/documents/NVDIMM_DSM_Interface_Example.pdf */ +/* http://pmem.io/documents/NVDIMM_DSM_Interface-V1.6.pdf */ #define UUID_NFIT_DIMM "4309ac30-0d11-11e4-9191-0800200c9a66" /* https://github.com/HewlettPackard/hpe-nvm/blob/master/Documentation/ */ @@ -38,6 +38,37 @@ | ACPI_NFIT_MEM_RESTORE_FAILED | ACPI_NFIT_MEM_FLUSH_FAILED \ | ACPI_NFIT_MEM_NOT_ARMED | ACPI_NFIT_MEM_MAP_FAILED) +#define NVDIMM_FAMILY_MAX NVDIMM_FAMILY_MSFT + +#define NVDIMM_STANDARD_CMDMASK \ +(1 << ND_CMD_SMART | 1 << ND_CMD_SMART_THRESHOLD | 1 << ND_CMD_DIMM_FLAGS \ + | 1 << ND_CMD_GET_CONFIG_SIZE | 1 << ND_CMD_GET_CONFIG_DATA \ + | 1 << ND_CMD_SET_CONFIG_DATA | 1 << ND_CMD_VENDOR_EFFECT_LOG_SIZE \ + | 1 << ND_CMD_VENDOR_EFFECT_LOG | 1 << ND_CMD_VENDOR) + +/* + * Command numbers that the kernel needs to know about to handle + * non-default DSM revision ids + */ +enum nvdimm_family_cmds { + NVDIMM_INTEL_LATCH_SHUTDOWN = 10, + NVDIMM_INTEL_GET_MODES = 11, + NVDIMM_INTEL_GET_FWINFO = 12, + NVDIMM_INTEL_START_FWUPDATE = 13, + NVDIMM_INTEL_SEND_FWUPDATE = 14, + NVDIMM_INTEL_FINISH_FWUPDATE = 15, + NVDIMM_INTEL_QUERY_FWUPDATE = 16, + NVDIMM_INTEL_SET_THRESHOLD = 17, + NVDIMM_INTEL_INJECT_ERROR = 18, +}; + +#define NVDIMM_INTEL_CMDMASK \ +(NVDIMM_STANDARD_CMDMASK | 1 << NVDIMM_INTEL_GET_MODES \ + | 1 << NVDIMM_INTEL_GET_FWINFO | 1 << NVDIMM_INTEL_START_FWUPDATE \ + | 1 << NVDIMM_INTEL_SEND_FWUPDATE | 1 << NVDIMM_INTEL_FINISH_FWUPDATE \ + | 1 << NVDIMM_INTEL_QUERY_FWUPDATE | 1 << NVDIMM_INTEL_SET_THRESHOLD \ + | 1 << NVDIMM_INTEL_INJECT_ERROR | 1 << NVDIMM_INTEL_LATCH_SHUTDOWN) + enum nfit_uuids { /* for simplicity alias the uuid index with the family id */ NFIT_DEV_DIMM = NVDIMM_FAMILY_INTEL, @@ -140,6 +171,9 @@ struct nfit_mem { struct resource *flush_wpq; unsigned long dsm_mask; int family; + u32 has_lsi:1; + u32 has_lsr:1; + u32 has_lsw:1; }; struct acpi_nfit_desc { @@ -167,6 +201,7 @@ struct acpi_nfit_desc { unsigned int init_complete:1; unsigned long dimm_cmd_force_en; unsigned long bus_cmd_force_en; + unsigned long bus_nfit_cmd_force_en; int (*blk_do_io)(struct nd_blk_region *ndbr, resource_size_t dpa, void *iobuf, u64 len, int rw); }; diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig index 923b417eaf4c..40579d0cb3d1 100644 --- a/drivers/block/Kconfig +++ b/drivers/block/Kconfig @@ -302,7 +302,6 @@ config BLK_DEV_SX8 config BLK_DEV_RAM tristate "RAM block device support" - select DAX if BLK_DEV_RAM_DAX ---help--- Saying Y here will allow you to use a portion of your RAM memory as a block device, so that you can make file systems on it, read and @@ -338,17 +337,6 @@ config BLK_DEV_RAM_SIZE The default value is 4096 kilobytes. Only change this if you know what you are doing. -config BLK_DEV_RAM_DAX - bool "Support Direct Access (DAX) to RAM block devices" - depends on BLK_DEV_RAM && FS_DAX - default n - help - Support filesystems using DAX to access RAM block devices. This - avoids double-buffering data in the page cache before copying it - to the block device. Answering Y will slightly enlarge the kernel, - and will prevent RAM block device backing store memory from being - allocated from highmem (only a problem for highmem systems). - config CDROM_PKTCDVD tristate "Packet writing on CD/DVD media (DEPRECATED)" depends on !UML diff --git a/drivers/block/brd.c b/drivers/block/brd.c index 588360d79fca..8028a3a7e7fd 100644 --- a/drivers/block/brd.c +++ b/drivers/block/brd.c @@ -21,11 +21,6 @@ #include <linux/fs.h> #include <linux/slab.h> #include <linux/backing-dev.h> -#ifdef CONFIG_BLK_DEV_RAM_DAX -#include <linux/pfn_t.h> -#include <linux/dax.h> -#include <linux/uio.h> -#endif #include <linux/uaccess.h> @@ -45,9 +40,6 @@ struct brd_device { struct request_queue *brd_queue; struct gendisk *brd_disk; -#ifdef CONFIG_BLK_DEV_RAM_DAX - struct dax_device *dax_dev; -#endif struct list_head brd_list; /* @@ -112,9 +104,6 @@ static struct page *brd_insert_page(struct brd_device *brd, sector_t sector) * restriction might be able to be lifted. */ gfp_flags = GFP_NOIO | __GFP_ZERO; -#ifndef CONFIG_BLK_DEV_RAM_DAX - gfp_flags |= __GFP_HIGHMEM; -#endif page = alloc_page(gfp_flags); if (!page) return NULL; @@ -334,43 +323,6 @@ static int brd_rw_page(struct block_device *bdev, sector_t sector, return err; } -#ifdef CONFIG_BLK_DEV_RAM_DAX -static long __brd_direct_access(struct brd_device *brd, pgoff_t pgoff, - long nr_pages, void **kaddr, pfn_t *pfn) -{ - struct page *page; - - if (!brd) - return -ENODEV; - page = brd_insert_page(brd, (sector_t)pgoff << PAGE_SECTORS_SHIFT); - if (!page) - return -ENOSPC; - *kaddr = page_address(page); - *pfn = page_to_pfn_t(page); - - return 1; -} - -static long brd_dax_direct_access(struct dax_device *dax_dev, - pgoff_t pgoff, long nr_pages, void **kaddr, pfn_t *pfn) -{ - struct brd_device *brd = dax_get_private(dax_dev); - - return __brd_direct_access(brd, pgoff, nr_pages, kaddr, pfn); -} - -static size_t brd_dax_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff, - void *addr, size_t bytes, struct iov_iter *i) -{ - return copy_from_iter(addr, bytes, i); -} - -static const struct dax_operations brd_dax_ops = { - .direct_access = brd_dax_direct_access, - .copy_from_iter = brd_dax_copy_from_iter, -}; -#endif - static const struct block_device_operations brd_fops = { .owner = THIS_MODULE, .rw_page = brd_rw_page, @@ -451,21 +403,8 @@ static struct brd_device *brd_alloc(int i) set_capacity(disk, rd_size * 2); disk->queue->backing_dev_info->capabilities |= BDI_CAP_SYNCHRONOUS_IO; -#ifdef CONFIG_BLK_DEV_RAM_DAX - queue_flag_set_unlocked(QUEUE_FLAG_DAX, brd->brd_queue); - brd->dax_dev = alloc_dax(brd, disk->disk_name, &brd_dax_ops); - if (!brd->dax_dev) - goto out_free_inode; -#endif - - return brd; -#ifdef CONFIG_BLK_DEV_RAM_DAX -out_free_inode: - kill_dax(brd->dax_dev); - put_dax(brd->dax_dev); -#endif out_free_queue: blk_cleanup_queue(brd->brd_queue); out_free_dev: @@ -505,10 +444,6 @@ out: static void brd_del_one(struct brd_device *brd) { list_del(&brd->brd_list); -#ifdef CONFIG_BLK_DEV_RAM_DAX - kill_dax(brd->dax_dev); - put_dax(brd->dax_dev); -#endif del_gendisk(brd->brd_disk); brd_free(brd); } diff --git a/drivers/dax/device.c b/drivers/dax/device.c index e9f3b3e4bbf4..6833ada237ab 100644 --- a/drivers/dax/device.c +++ b/drivers/dax/device.c @@ -222,7 +222,8 @@ __weak phys_addr_t dax_pgoff_to_phys(struct dev_dax *dev_dax, pgoff_t pgoff, unsigned long size) { struct resource *res; - phys_addr_t phys; + /* gcc-4.6.3-nolibc for i386 complains that this is uninitialized */ + phys_addr_t uninitialized_var(phys); int i; for (i = 0; i < dev_dax->num_resources; i++) { diff --git a/drivers/dax/super.c b/drivers/dax/super.c index 557b93703532..3ec804672601 100644 --- a/drivers/dax/super.c +++ b/drivers/dax/super.c @@ -92,21 +92,21 @@ int __bdev_dax_supported(struct super_block *sb, int blocksize) long len; if (blocksize != PAGE_SIZE) { - pr_err("VFS (%s): error: unsupported blocksize for dax\n", + pr_debug("VFS (%s): error: unsupported blocksize for dax\n", sb->s_id); return -EINVAL; } err = bdev_dax_pgoff(bdev, 0, PAGE_SIZE, &pgoff); if (err) { - pr_err("VFS (%s): error: unaligned partition for dax\n", + pr_debug("VFS (%s): error: unaligned partition for dax\n", sb->s_id); return err; } dax_dev = dax_get_by_host(bdev->bd_disk->disk_name); if (!dax_dev) { - pr_err("VFS (%s): error: device does not support dax\n", + pr_debug("VFS (%s): error: device does not support dax\n", sb->s_id); return -EOPNOTSUPP; } @@ -118,7 +118,7 @@ int __bdev_dax_supported(struct super_block *sb, int blocksize) put_dax(dax_dev); if (len < 1) { - pr_err("VFS (%s): error: dax access failed (%ld)", + pr_debug("VFS (%s): error: dax access failed (%ld)\n", sb->s_id, len); return len < 0 ? len : -EIO; } @@ -273,9 +273,6 @@ EXPORT_SYMBOL_GPL(dax_copy_from_iter); void arch_wb_cache_pmem(void *addr, size_t size); void dax_flush(struct dax_device *dax_dev, void *addr, size_t size) { - if (unlikely(!dax_alive(dax_dev))) - return; - if (unlikely(!test_bit(DAXDEV_WRITE_CACHE, &dax_dev->flags))) return; @@ -344,6 +341,9 @@ static struct inode *dax_alloc_inode(struct super_block *sb) struct inode *inode; dax_dev = kmem_cache_alloc(dax_cache, GFP_KERNEL); + if (!dax_dev) + return NULL; + inode = &dax_dev->inode; inode->i_rdev = 0; return inode; diff --git a/drivers/nvdimm/Makefile b/drivers/nvdimm/Makefile index 447e0e14f3b6..70d5f3ad9909 100644 --- a/drivers/nvdimm/Makefile +++ b/drivers/nvdimm/Makefile @@ -21,6 +21,7 @@ libnvdimm-y += region_devs.o libnvdimm-y += region.o libnvdimm-y += namespace_devs.o libnvdimm-y += label.o +libnvdimm-y += badrange.o libnvdimm-$(CONFIG_ND_CLAIM) += claim.o libnvdimm-$(CONFIG_BTT) += btt_devs.o libnvdimm-$(CONFIG_NVDIMM_PFN) += pfn_devs.o diff --git a/drivers/nvdimm/badrange.c b/drivers/nvdimm/badrange.c new file mode 100644 index 000000000000..e068d72b4357 --- /dev/null +++ b/drivers/nvdimm/badrange.c @@ -0,0 +1,293 @@ +/* + * Copyright(c) 2017 Intel Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ +#include <linux/libnvdimm.h> +#include <linux/badblocks.h> +#include <linux/export.h> +#include <linux/module.h> +#include <linux/blkdev.h> +#include <linux/device.h> +#include <linux/ctype.h> +#include <linux/ndctl.h> +#include <linux/mutex.h> +#include <linux/slab.h> +#include <linux/io.h> +#include "nd-core.h" +#include "nd.h" + +void badrange_init(struct badrange *badrange) +{ + INIT_LIST_HEAD(&badrange->list); + spin_lock_init(&badrange->lock); +} +EXPORT_SYMBOL_GPL(badrange_init); + +static void append_badrange_entry(struct badrange *badrange, + struct badrange_entry *bre, u64 addr, u64 length) +{ + lockdep_assert_held(&badrange->lock); + bre->start = addr; + bre->length = length; + list_add_tail(&bre->list, &badrange->list); +} + +static int alloc_and_append_badrange_entry(struct badrange *badrange, + u64 addr, u64 length, gfp_t flags) +{ + struct badrange_entry *bre; + + bre = kzalloc(sizeof(*bre), flags); + if (!bre) + return -ENOMEM; + + append_badrange_entry(badrange, bre, addr, length); + return 0; +} + +static int add_badrange(struct badrange *badrange, u64 addr, u64 length) +{ + struct badrange_entry *bre, *bre_new; + + spin_unlock(&badrange->lock); + bre_new = kzalloc(sizeof(*bre_new), GFP_KERNEL); + spin_lock(&badrange->lock); + + if (list_empty(&badrange->list)) { + if (!bre_new) + return -ENOMEM; + append_badrange_entry(badrange, bre_new, addr, length); + return 0; + } + + /* + * There is a chance this is a duplicate, check for those first. + * This will be the common case as ARS_STATUS returns all known + * errors in the SPA space, and we can't query it per region + */ + list_for_each_entry(bre, &badrange->list, list) + if (bre->start == addr) { + /* If length has changed, update this list entry */ + if (bre->length != length) + bre->length = length; + kfree(bre_new); + return 0; + } + + /* + * If not a duplicate or a simple length update, add the entry as is, + * as any overlapping ranges will get resolved when the list is consumed + * and converted to badblocks + */ + if (!bre_new) + return -ENOMEM; + append_badrange_entry(badrange, bre_new, addr, length); + + return 0; +} + +int badrange_add(struct badrange *badrange, u64 addr, u64 length) +{ + int rc; + + spin_lock(&badrange->lock); + rc = add_badrange(badrange, addr, length); + spin_unlock(&badrange->lock); + + return rc; +} +EXPORT_SYMBOL_GPL(badrange_add); + +void badrange_forget(struct badrange *badrange, phys_addr_t start, + unsigned int len) +{ + struct list_head *badrange_list = &badrange->list; + u64 clr_end = start + len - 1; + struct badrange_entry *bre, *next; + + spin_lock(&badrange->lock); + + /* + * [start, clr_end] is the badrange interval being cleared. + * [bre->start, bre_end] is the badrange_list entry we're comparing + * the above interval against. The badrange list entry may need + * to be modified (update either start or length), deleted, or + * split into two based on the overlap characteristics + */ + + list_for_each_entry_safe(bre, next, badrange_list, list) { + u64 bre_end = bre->start + bre->length - 1; + + /* Skip intervals with no intersection */ + if (bre_end < start) + continue; + if (bre->start > clr_end) + continue; + /* Delete completely overlapped badrange entries */ + if ((bre->start >= start) && (bre_end <= clr_end)) { + list_del(&bre->list); + kfree(bre); + continue; + } + /* Adjust start point of partially cleared entries */ + if ((start <= bre->start) && (clr_end > bre->start)) { + bre->length -= clr_end - bre->start + 1; + bre->start = clr_end + 1; + continue; + } + /* Adjust bre->length for partial clearing at the tail end */ + if ((bre->start < start) && (bre_end <= clr_end)) { + /* bre->start remains the same */ + bre->length = start - bre->start; + continue; + } + /* + * If clearing in the middle of an entry, we split it into + * two by modifying the current entry to represent one half of + * the split, and adding a new entry for the second half. + */ + if ((bre->start < start) && (bre_end > clr_end)) { + u64 new_start = clr_end + 1; + u64 new_len = bre_end - new_start + 1; + + /* Add new entry covering the right half */ + alloc_and_append_badrange_entry(badrange, new_start, + new_len, GFP_NOWAIT); + /* Adjust this entry to cover the left half */ + bre->length = start - bre->start; + continue; + } + } + spin_unlock(&badrange->lock); +} +EXPORT_SYMBOL_GPL(badrange_forget); + +static void set_badblock(struct badblocks *bb, sector_t s, int num) +{ + dev_dbg(bb->dev, "Found a bad range (0x%llx, 0x%llx)\n", + (u64) s * 512, (u64) num * 512); + /* this isn't an error as the hardware will still throw an exception */ + if (badblocks_set(bb, s, num, 1)) + dev_info_once(bb->dev, "%s: failed for sector %llx\n", + __func__, (u64) s); +} + +/** + * __add_badblock_range() - Convert a physical address range to bad sectors + * @bb: badblocks instance to populate + * @ns_offset: namespace offset where the error range begins (in bytes) + * @len: number of bytes of badrange to be added + * + * This assumes that the range provided with (ns_offset, len) is within + * the bounds of physical addresses for this namespace, i.e. lies in the + * interval [ns_start, ns_start + ns_size) + */ +static void __add_badblock_range(struct badblocks *bb, u64 ns_offset, u64 len) +{ + const unsigned int sector_size = 512; + sector_t start_sector, end_sector; + u64 num_sectors; + u32 rem; + + start_sector = div_u64(ns_offset, sector_size); + end_sector = div_u64_rem(ns_offset + len, sector_size, &rem); + if (rem) + end_sector++; + num_sectors = end_sector - start_sector; + + if (unlikely(num_sectors > (u64)INT_MAX)) { + u64 remaining = num_sectors; + sector_t s = start_sector; + + while (remaining) { + int done = min_t(u64, remaining, INT_MAX); + + set_badblock(bb, s, done); + remaining -= done; + s += done; + } + } else + set_badblock(bb, start_sector, num_sectors); +} + +static void badblocks_populate(struct badrange *badrange, + struct badblocks *bb, const struct resource *res) +{ + struct badrange_entry *bre; + + if (list_empty(&badrange->list)) + return; + + list_for_each_entry(bre, &badrange->list, list) { + u64 bre_end = bre->start + bre->length - 1; + + /* Discard intervals with no intersection */ + if (bre_end < res->start) + continue; + if (bre->start > res->end) + continue; + /* Deal with any overlap after start of the namespace */ + if (bre->start >= res->start) { + u64 start = bre->start; + u64 len; + + if (bre_end <= res->end) + len = bre->length; + else + len = res->start + resource_size(res) + - bre->start; + __add_badblock_range(bb, start - res->start, len); + continue; + } + /* + * Deal with overlap for badrange starting before + * the namespace. + */ + if (bre->start < res->start) { + u64 len; + + if (bre_end < res->end) + len = bre->start + bre->length - res->start; + else + len = resource_size(res); + __add_badblock_range(bb, 0, len); + } + } +} + +/** + * nvdimm_badblocks_populate() - Convert a list of badranges to badblocks + * @region: parent region of the range to interrogate + * @bb: badblocks instance to populate + * @res: resource range to consider + * + * The badrange list generated during bus initialization may contain + * multiple, possibly overlapping physical address ranges. Compare each + * of these ranges to the resource range currently being initialized, + * and add badblocks entries for all matching sub-ranges + */ +void nvdimm_badblocks_populate(struct nd_region *nd_region, + struct badblocks *bb, const struct resource *res) +{ + struct nvdimm_bus *nvdimm_bus; + + if (!is_memory(&nd_region->dev)) { + dev_WARN_ONCE(&nd_region->dev, 1, + "%s only valid for pmem regions\n", __func__); + return; + } + nvdimm_bus = walk_to_nvdimm_bus(&nd_region->dev); + + nvdimm_bus_lock(&nvdimm_bus->dev); + badblocks_populate(&nvdimm_bus->badrange, bb, res); + nvdimm_bus_unlock(&nvdimm_bus->dev); +} +EXPORT_SYMBOL_GPL(nvdimm_badblocks_populate); diff --git a/drivers/nvdimm/bus.c b/drivers/nvdimm/bus.c index baf283986a7e..0a5e6cd758fe 100644 --- a/drivers/nvdimm/bus.c +++ b/drivers/nvdimm/bus.c @@ -11,6 +11,7 @@ * General Public License for more details. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <linux/libnvdimm.h> #include <linux/sched/mm.h> #include <linux/vmalloc.h> #include <linux/uaccess.h> @@ -221,7 +222,7 @@ static void nvdimm_account_cleared_poison(struct nvdimm_bus *nvdimm_bus, phys_addr_t phys, u64 cleared) { if (cleared > 0) - nvdimm_forget_poison(nvdimm_bus, phys, cleared); + badrange_forget(&nvdimm_bus->badrange, phys, cleared); if (cleared > 0 && cleared / 512) nvdimm_clear_badblocks_regions(nvdimm_bus, phys, cleared); @@ -344,11 +345,10 @@ struct nvdimm_bus *nvdimm_bus_register(struct device *parent, return NULL; INIT_LIST_HEAD(&nvdimm_bus->list); INIT_LIST_HEAD(&nvdimm_bus->mapping_list); - INIT_LIST_HEAD(&nvdimm_bus->poison_list); init_waitqueue_head(&nvdimm_bus->probe_wait); nvdimm_bus->id = ida_simple_get(&nd_ida, 0, 0, GFP_KERNEL); mutex_init(&nvdimm_bus->reconfig_mutex); - spin_lock_init(&nvdimm_bus->poison_lock); + badrange_init(&nvdimm_bus->badrange); if (nvdimm_bus->id < 0) { kfree(nvdimm_bus); return NULL; @@ -395,15 +395,15 @@ static int child_unregister(struct device *dev, void *data) return 0; } -static void free_poison_list(struct list_head *poison_list) +static void free_badrange_list(struct list_head *badrange_list) { - struct nd_poison *pl, *next; + struct badrange_entry *bre, *next; - list_for_each_entry_safe(pl, next, poison_list, list) { - list_del(&pl->list); - kfree(pl); + list_for_each_entry_safe(bre, next, badrange_list, list) { + list_del(&bre->list); + kfree(bre); } - list_del_init(poison_list); + list_del_init(badrange_list); } static int nd_bus_remove(struct device *dev) @@ -417,9 +417,9 @@ static int nd_bus_remove(struct device *dev) nd_synchronize(); device_for_each_child(&nvdimm_bus->dev, NULL, child_unregister); - spin_lock(&nvdimm_bus->poison_lock); - free_poison_list(&nvdimm_bus->poison_list); - spin_unlock(&nvdimm_bus->poison_lock); + spin_lock(&nvdimm_bus->badrange.lock); + free_badrange_list(&nvdimm_bus->badrange.list); + spin_unlock(&nvdimm_bus->badrange.lock); nvdimm_bus_destroy_ndctl(nvdimm_bus); diff --git a/drivers/nvdimm/core.c b/drivers/nvdimm/core.c index bb71f0cf8f5d..1dc527660637 100644 --- a/drivers/nvdimm/core.c +++ b/drivers/nvdimm/core.c @@ -398,265 +398,11 @@ struct attribute_group nvdimm_bus_attribute_group = { }; EXPORT_SYMBOL_GPL(nvdimm_bus_attribute_group); -static void set_badblock(struct badblocks *bb, sector_t s, int num) +int nvdimm_bus_add_badrange(struct nvdimm_bus *nvdimm_bus, u64 addr, u64 length) { - dev_dbg(bb->dev, "Found a poison range (0x%llx, 0x%llx)\n", - (u64) s * 512, (u64) num * 512); - /* this isn't an error as the hardware will still throw an exception */ - if (badblocks_set(bb, s, num, 1)) - dev_info_once(bb->dev, "%s: failed for sector %llx\n", - __func__, (u64) s); + return badrange_add(&nvdimm_bus->badrange, addr, length); } - -/** - * __add_badblock_range() - Convert a physical address range to bad sectors - * @bb: badblocks instance to populate - * @ns_offset: namespace offset where the error range begins (in bytes) - * @len: number of bytes of poison to be added - * - * This assumes that the range provided with (ns_offset, len) is within - * the bounds of physical addresses for this namespace, i.e. lies in the - * interval [ns_start, ns_start + ns_size) - */ -static void __add_badblock_range(struct badblocks *bb, u64 ns_offset, u64 len) -{ - const unsigned int sector_size = 512; - sector_t start_sector, end_sector; - u64 num_sectors; - u32 rem; - - start_sector = div_u64(ns_offset, sector_size); - end_sector = div_u64_rem(ns_offset + len, sector_size, &rem); - if (rem) - end_sector++; - num_sectors = end_sector - start_sector; - - if (unlikely(num_sectors > (u64)INT_MAX)) { - u64 remaining = num_sectors; - sector_t s = start_sector; - - while (remaining) { - int done = min_t(u64, remaining, INT_MAX); - - set_badblock(bb, s, done); - remaining -= done; - s += done; - } - } else - set_badblock(bb, start_sector, num_sectors); -} - -static void badblocks_populate(struct list_head *poison_list, - struct badblocks *bb, const struct resource *res) -{ - struct nd_poison *pl; - - if (list_empty(poison_list)) - return; - - list_for_each_entry(pl, poison_list, list) { - u64 pl_end = pl->start + pl->length - 1; - - /* Discard intervals with no intersection */ - if (pl_end < res->start) - continue; - if (pl->start > res->end) - continue; - /* Deal with any overlap after start of the namespace */ - if (pl->start >= res->start) { - u64 start = pl->start; - u64 len; - - if (pl_end <= res->end) - len = pl->length; - else - len = res->start + resource_size(res) - - pl->start; - __add_badblock_range(bb, start - res->start, len); - continue; - } - /* Deal with overlap for poison starting before the namespace */ - if (pl->start < res->start) { - u64 len; - - if (pl_end < res->end) - len = pl->start + pl->length - res->start; - else - len = resource_size(res); - __add_badblock_range(bb, 0, len); - } - } -} - -/** - * nvdimm_badblocks_populate() - Convert a list of poison ranges to badblocks - * @region: parent region of the range to interrogate - * @bb: badblocks instance to populate - * @res: resource range to consider - * - * The poison list generated during bus initialization may contain - * multiple, possibly overlapping physical address ranges. Compare each - * of these ranges to the resource range currently being initialized, - * and add badblocks entries for all matching sub-ranges - */ -void nvdimm_badblocks_populate(struct nd_region *nd_region, - struct badblocks *bb, const struct resource *res) -{ - struct nvdimm_bus *nvdimm_bus; - struct list_head *poison_list; - - if (!is_memory(&nd_region->dev)) { - dev_WARN_ONCE(&nd_region->dev, 1, - "%s only valid for pmem regions\n", __func__); - return; - } - nvdimm_bus = walk_to_nvdimm_bus(&nd_region->dev); - poison_list = &nvdimm_bus->poison_list; - - nvdimm_bus_lock(&nvdimm_bus->dev); - badblocks_populate(poison_list, bb, res); - nvdimm_bus_unlock(&nvdimm_bus->dev); -} -EXPORT_SYMBOL_GPL(nvdimm_badblocks_populate); - -static void append_poison_entry(struct nvdimm_bus *nvdimm_bus, - struct nd_poison *pl, u64 addr, u64 length) -{ - lockdep_assert_held(&nvdimm_bus->poison_lock); - pl->start = addr; - pl->length = length; - list_add_tail(&pl->list, &nvdimm_bus->poison_list); -} - -static int add_poison(struct nvdimm_bus *nvdimm_bus, u64 addr, u64 length, - gfp_t flags) -{ - struct nd_poison *pl; - - pl = kzalloc(sizeof(*pl), flags); - if (!pl) - return -ENOMEM; - - append_poison_entry(nvdimm_bus, pl, addr, length); - return 0; -} - -static int bus_add_poison(struct nvdimm_bus *nvdimm_bus, u64 addr, u64 length) -{ - struct nd_poison *pl, *pl_new; - - spin_unlock(&nvdimm_bus->poison_lock); - pl_new = kzalloc(sizeof(*pl_new), GFP_KERNEL); - spin_lock(&nvdimm_bus->poison_lock); - - if (list_empty(&nvdimm_bus->poison_list)) { - if (!pl_new) - return -ENOMEM; - append_poison_entry(nvdimm_bus, pl_new, addr, length); - return 0; - } - - /* - * There is a chance this is a duplicate, check for those first. - * This will be the common case as ARS_STATUS returns all known - * errors in the SPA space, and we can't query it per region - */ - list_for_each_entry(pl, &nvdimm_bus->poison_list, list) - if (pl->start == addr) { - /* If length has changed, update this list entry */ - if (pl->length != length) - pl->length = length; - kfree(pl_new); - return 0; - } - - /* - * If not a duplicate or a simple length update, add the entry as is, - * as any overlapping ranges will get resolved when the list is consumed - * and converted to badblocks - */ - if (!pl_new) - return -ENOMEM; - append_poison_entry(nvdimm_bus, pl_new, addr, length); - - return 0; -} - -int nvdimm_bus_add_poison(struct nvdimm_bus *nvdimm_bus, u64 addr, u64 length) -{ - int rc; - - spin_lock(&nvdimm_bus->poison_lock); - rc = bus_add_poison(nvdimm_bus, addr, length); - spin_unlock(&nvdimm_bus->poison_lock); - - return rc; -} -EXPORT_SYMBOL_GPL(nvdimm_bus_add_poison); - -void nvdimm_forget_poison(struct nvdimm_bus *nvdimm_bus, phys_addr_t start, - unsigned int len) -{ - struct list_head *poison_list = &nvdimm_bus->poison_list; - u64 clr_end = start + len - 1; - struct nd_poison *pl, *next; - - spin_lock(&nvdimm_bus->poison_lock); - WARN_ON_ONCE(list_empty(poison_list)); - - /* - * [start, clr_end] is the poison interval being cleared. - * [pl->start, pl_end] is the poison_list entry we're comparing - * the above interval against. The poison list entry may need - * to be modified (update either start or length), deleted, or - * split into two based on the overlap characteristics - */ - - list_for_each_entry_safe(pl, next, poison_list, list) { - u64 pl_end = pl->start + pl->length - 1; - - /* Skip intervals with no intersection */ - if (pl_end < start) - continue; - if (pl->start > clr_end) - continue; - /* Delete completely overlapped poison entries */ - if ((pl->start >= start) && (pl_end <= clr_end)) { - list_del(&pl->list); - kfree(pl); - continue; - } - /* Adjust start point of partially cleared entries */ - if ((start <= pl->start) && (clr_end > pl->start)) { - pl->length -= clr_end - pl->start + 1; - pl->start = clr_end + 1; - continue; - } - /* Adjust pl->length for partial clearing at the tail end */ - if ((pl->start < start) && (pl_end <= clr_end)) { - /* pl->start remains the same */ - pl->length = start - pl->start; - continue; - } - /* - * If clearing in the middle of an entry, we split it into - * two by modifying the current entry to represent one half of - * the split, and adding a new entry for the second half. - */ - if ((pl->start < start) && (pl_end > clr_end)) { - u64 new_start = clr_end + 1; - u64 new_len = pl_end - new_start + 1; - - /* Add new entry covering the right half */ - add_poison(nvdimm_bus, new_start, new_len, GFP_NOWAIT); - /* Adjust this entry to cover the left half */ - pl->length = start - pl->start; - continue; - } - } - spin_unlock(&nvdimm_bus->poison_lock); -} -EXPORT_SYMBOL_GPL(nvdimm_forget_poison); +EXPORT_SYMBOL_GPL(nvdimm_bus_add_badrange); #ifdef CONFIG_BLK_DEV_INTEGRITY int nd_integrity_init(struct gendisk *disk, unsigned long meta_size) diff --git a/drivers/nvdimm/dimm.c b/drivers/nvdimm/dimm.c index e0f0e3ce1a32..f8913b8124b6 100644 --- a/drivers/nvdimm/dimm.c +++ b/drivers/nvdimm/dimm.c @@ -55,6 +55,8 @@ static int nvdimm_probe(struct device *dev) goto err; rc = nvdimm_init_config_data(ndd); + if (rc == -EACCES) + nvdimm_set_locked(dev); if (rc) goto err; @@ -68,6 +70,7 @@ static int nvdimm_probe(struct device *dev) rc = nd_label_reserve_dpa(ndd); if (ndd->ns_current >= 0) nvdimm_set_aliasing(dev); + nvdimm_clear_locked(dev); nvdimm_bus_unlock(dev); if (rc) diff --git a/drivers/nvdimm/dimm_devs.c b/drivers/nvdimm/dimm_devs.c index f0d1b7e5de01..097794d9f786 100644 --- a/drivers/nvdimm/dimm_devs.c +++ b/drivers/nvdimm/dimm_devs.c @@ -200,6 +200,13 @@ void nvdimm_set_locked(struct device *dev) set_bit(NDD_LOCKED, &nvdimm->flags); } +void nvdimm_clear_locked(struct device *dev) +{ + struct nvdimm *nvdimm = to_nvdimm(dev); + + clear_bit(NDD_LOCKED, &nvdimm->flags); +} + static void nvdimm_release(struct device *dev) { struct nvdimm *nvdimm = to_nvdimm(dev); @@ -324,6 +331,17 @@ static ssize_t commands_show(struct device *dev, } static DEVICE_ATTR_RO(commands); +static ssize_t flags_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nvdimm *nvdimm = to_nvdimm(dev); + + return sprintf(buf, "%s%s\n", + test_bit(NDD_ALIASING, &nvdimm->flags) ? "alias " : "", + test_bit(NDD_LOCKED, &nvdimm->flags) ? "lock " : ""); +} +static DEVICE_ATTR_RO(flags); + static ssize_t state_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -365,6 +383,7 @@ static DEVICE_ATTR_RO(available_slots); static struct attribute *nvdimm_attributes[] = { &dev_attr_state.attr, + &dev_attr_flags.attr, &dev_attr_commands.attr, &dev_attr_available_slots.attr, NULL, diff --git a/drivers/nvdimm/label.c b/drivers/nvdimm/label.c index 9c5f108910e3..de66c02f6140 100644 --- a/drivers/nvdimm/label.c +++ b/drivers/nvdimm/label.c @@ -1050,7 +1050,7 @@ static int init_labels(struct nd_mapping *nd_mapping, int num_labels) nsindex = to_namespace_index(ndd, 0); memset(nsindex, 0, ndd->nsarea.config_size); for (i = 0; i < 2; i++) { - int rc = nd_label_write_index(ndd, i, i*2, ND_NSINDEX_INIT); + int rc = nd_label_write_index(ndd, i, 3 - i, ND_NSINDEX_INIT); if (rc) return rc; diff --git a/drivers/nvdimm/namespace_devs.c b/drivers/nvdimm/namespace_devs.c index 3e4d1e7998da..bb3ba8cf24d4 100644 --- a/drivers/nvdimm/namespace_devs.c +++ b/drivers/nvdimm/namespace_devs.c @@ -1620,7 +1620,7 @@ static umode_t namespace_visible(struct kobject *kobj, if (a == &dev_attr_resource.attr) { if (is_namespace_blk(dev)) return 0; - return a->mode; + return 0400; } if (is_namespace_pmem(dev) || is_namespace_blk(dev)) { @@ -1875,7 +1875,7 @@ static int select_pmem_id(struct nd_region *nd_region, u8 *pmem_id) * @nspm: target namespace to create * @nd_label: target pmem namespace label to evaluate */ -struct device *create_namespace_pmem(struct nd_region *nd_region, +static struct device *create_namespace_pmem(struct nd_region *nd_region, struct nd_namespace_index *nsindex, struct nd_namespace_label *nd_label) { @@ -2186,7 +2186,7 @@ static int add_namespace_resource(struct nd_region *nd_region, return i; } -struct device *create_namespace_blk(struct nd_region *nd_region, +static struct device *create_namespace_blk(struct nd_region *nd_region, struct nd_namespace_label *nd_label, int count) { diff --git a/drivers/nvdimm/nd-core.h b/drivers/nvdimm/nd-core.h index 86bc19ae30da..79274ead54fb 100644 --- a/drivers/nvdimm/nd-core.h +++ b/drivers/nvdimm/nd-core.h @@ -29,10 +29,9 @@ struct nvdimm_bus { struct list_head list; struct device dev; int id, probe_active; - struct list_head poison_list; struct list_head mapping_list; struct mutex reconfig_mutex; - spinlock_t poison_lock; + struct badrange badrange; }; struct nvdimm { diff --git a/drivers/nvdimm/nd.h b/drivers/nvdimm/nd.h index 9c758a91372b..e958f3724c41 100644 --- a/drivers/nvdimm/nd.h +++ b/drivers/nvdimm/nd.h @@ -34,12 +34,6 @@ enum { NVDIMM_IO_ATOMIC = 1, }; -struct nd_poison { - u64 start; - u64 length; - struct list_head list; -}; - struct nvdimm_drvdata { struct device *dev; int nslabel_size; @@ -254,6 +248,7 @@ long nvdimm_clear_poison(struct device *dev, phys_addr_t phys, unsigned int len); void nvdimm_set_aliasing(struct device *dev); void nvdimm_set_locked(struct device *dev); +void nvdimm_clear_locked(struct device *dev); struct nd_btt *to_nd_btt(struct device *dev); struct nd_gen_sb { diff --git a/drivers/nvdimm/pfn_devs.c b/drivers/nvdimm/pfn_devs.c index 9576c444f0ab..65cc171c721d 100644 --- a/drivers/nvdimm/pfn_devs.c +++ b/drivers/nvdimm/pfn_devs.c @@ -282,8 +282,16 @@ static struct attribute *nd_pfn_attributes[] = { NULL, }; +static umode_t pfn_visible(struct kobject *kobj, struct attribute *a, int n) +{ + if (a == &dev_attr_resource.attr) + return 0400; + return a->mode; +} + struct attribute_group nd_pfn_attribute_group = { .attrs = nd_pfn_attributes, + .is_visible = pfn_visible, }; static const struct attribute_group *nd_pfn_attribute_groups[] = { diff --git a/drivers/nvdimm/region_devs.c b/drivers/nvdimm/region_devs.c index 829d760f651c..abaf38c61220 100644 --- a/drivers/nvdimm/region_devs.c +++ b/drivers/nvdimm/region_devs.c @@ -562,8 +562,12 @@ static umode_t region_visible(struct kobject *kobj, struct attribute *a, int n) if (!is_nd_pmem(dev) && a == &dev_attr_badblocks.attr) return 0; - if (!is_nd_pmem(dev) && a == &dev_attr_resource.attr) - return 0; + if (a == &dev_attr_resource.attr) { + if (is_nd_pmem(dev)) + return 0400; + else + return 0; + } if (a == &dev_attr_deep_flush.attr) { int has_flush = nvdimm_has_flush(nd_region); @@ -526,13 +526,13 @@ static int copy_user_dax(struct block_device *bdev, struct dax_device *dax_dev, static void *dax_insert_mapping_entry(struct address_space *mapping, struct vm_fault *vmf, void *entry, sector_t sector, - unsigned long flags) + unsigned long flags, bool dirty) { struct radix_tree_root *page_tree = &mapping->page_tree; void *new_entry; pgoff_t index = vmf->pgoff; - if (vmf->flags & FAULT_FLAG_WRITE) + if (dirty) __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); if (dax_is_zero_entry(entry) && !(flags & RADIX_DAX_ZERO_PAGE)) { @@ -569,7 +569,7 @@ static void *dax_insert_mapping_entry(struct address_space *mapping, entry = new_entry; } - if (vmf->flags & FAULT_FLAG_WRITE) + if (dirty) radix_tree_tag_set(page_tree, index, PAGECACHE_TAG_DIRTY); spin_unlock_irq(&mapping->tree_lock); @@ -825,38 +825,42 @@ out: } EXPORT_SYMBOL_GPL(dax_writeback_mapping_range); -static int dax_insert_mapping(struct address_space *mapping, - struct block_device *bdev, struct dax_device *dax_dev, - sector_t sector, size_t size, void *entry, - struct vm_area_struct *vma, struct vm_fault *vmf) +static sector_t dax_iomap_sector(struct iomap *iomap, loff_t pos) { - unsigned long vaddr = vmf->address; - void *ret, *kaddr; + return (iomap->addr + (pos & PAGE_MASK) - iomap->offset) >> 9; +} + +static int dax_iomap_pfn(struct iomap *iomap, loff_t pos, size_t size, + pfn_t *pfnp) +{ + const sector_t sector = dax_iomap_sector(iomap, pos); pgoff_t pgoff; + void *kaddr; int id, rc; - pfn_t pfn; + long length; - rc = bdev_dax_pgoff(bdev, sector, size, &pgoff); + rc = bdev_dax_pgoff(iomap->bdev, sector, size, &pgoff); if (rc) return rc; - id = dax_read_lock(); - rc = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size), &kaddr, &pfn); - if (rc < 0) { - dax_read_unlock(id); - return rc; + length = dax_direct_access(iomap->dax_dev, pgoff, PHYS_PFN(size), + &kaddr, pfnp); + if (length < 0) { + rc = length; + goto out; } + rc = -EINVAL; + if (PFN_PHYS(length) < size) + goto out; + if (pfn_t_to_pfn(*pfnp) & (PHYS_PFN(size)-1)) + goto out; + /* For larger pages we need devmap */ + if (length > 1 && !pfn_t_devmap(*pfnp)) + goto out; + rc = 0; +out: dax_read_unlock(id); - - ret = dax_insert_mapping_entry(mapping, vmf, entry, sector, 0); - if (IS_ERR(ret)) - return PTR_ERR(ret); - - trace_dax_insert_mapping(mapping->host, vmf, ret); - if (vmf->flags & FAULT_FLAG_WRITE) - return vm_insert_mixed_mkwrite(vma, vaddr, pfn); - else - return vm_insert_mixed(vma, vaddr, pfn); + return rc; } /* @@ -882,7 +886,7 @@ static int dax_load_hole(struct address_space *mapping, void *entry, } entry2 = dax_insert_mapping_entry(mapping, vmf, entry, 0, - RADIX_DAX_ZERO_PAGE); + RADIX_DAX_ZERO_PAGE, false); if (IS_ERR(entry2)) { ret = VM_FAULT_SIGBUS; goto out; @@ -941,11 +945,6 @@ int __dax_zero_page_range(struct block_device *bdev, } EXPORT_SYMBOL_GPL(__dax_zero_page_range); -static sector_t dax_iomap_sector(struct iomap *iomap, loff_t pos) -{ - return (iomap->addr + (pos & PAGE_MASK) - iomap->offset) >> 9; -} - static loff_t dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data, struct iomap *iomap) @@ -1085,19 +1084,33 @@ static int dax_fault_return(int error) return VM_FAULT_SIGBUS; } -static int dax_iomap_pte_fault(struct vm_fault *vmf, +/* + * MAP_SYNC on a dax mapping guarantees dirty metadata is + * flushed on write-faults (non-cow), but not read-faults. + */ +static bool dax_fault_is_synchronous(unsigned long flags, + struct vm_area_struct *vma, struct iomap *iomap) +{ + return (flags & IOMAP_WRITE) && (vma->vm_flags & VM_SYNC) + && (iomap->flags & IOMAP_F_DIRTY); +} + +static int dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp, const struct iomap_ops *ops) { - struct address_space *mapping = vmf->vma->vm_file->f_mapping; + struct vm_area_struct *vma = vmf->vma; + struct address_space *mapping = vma->vm_file->f_mapping; struct inode *inode = mapping->host; unsigned long vaddr = vmf->address; loff_t pos = (loff_t)vmf->pgoff << PAGE_SHIFT; - sector_t sector; struct iomap iomap = { 0 }; unsigned flags = IOMAP_FAULT; int error, major = 0; + bool write = vmf->flags & FAULT_FLAG_WRITE; + bool sync; int vmf_ret = 0; void *entry; + pfn_t pfn; trace_dax_pte_fault(inode, vmf, vmf_ret); /* @@ -1110,7 +1123,7 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, goto out; } - if ((vmf->flags & FAULT_FLAG_WRITE) && !vmf->cow_page) + if (write && !vmf->cow_page) flags |= IOMAP_WRITE; entry = grab_mapping_entry(mapping, vmf->pgoff, 0); @@ -1145,9 +1158,9 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, goto error_finish_iomap; } - sector = dax_iomap_sector(&iomap, pos); - if (vmf->cow_page) { + sector_t sector = dax_iomap_sector(&iomap, pos); + switch (iomap.type) { case IOMAP_HOLE: case IOMAP_UNWRITTEN: @@ -1173,22 +1186,55 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, goto finish_iomap; } + sync = dax_fault_is_synchronous(flags, vma, &iomap); + switch (iomap.type) { case IOMAP_MAPPED: if (iomap.flags & IOMAP_F_NEW) { count_vm_event(PGMAJFAULT); - count_memcg_event_mm(vmf->vma->vm_mm, PGMAJFAULT); + count_memcg_event_mm(vma->vm_mm, PGMAJFAULT); major = VM_FAULT_MAJOR; } - error = dax_insert_mapping(mapping, iomap.bdev, iomap.dax_dev, - sector, PAGE_SIZE, entry, vmf->vma, vmf); + error = dax_iomap_pfn(&iomap, pos, PAGE_SIZE, &pfn); + if (error < 0) + goto error_finish_iomap; + + entry = dax_insert_mapping_entry(mapping, vmf, entry, + dax_iomap_sector(&iomap, pos), + 0, write && !sync); + if (IS_ERR(entry)) { + error = PTR_ERR(entry); + goto error_finish_iomap; + } + + /* + * If we are doing synchronous page fault and inode needs fsync, + * we can insert PTE into page tables only after that happens. + * Skip insertion for now and return the pfn so that caller can + * insert it after fsync is done. + */ + if (sync) { + if (WARN_ON_ONCE(!pfnp)) { + error = -EIO; + goto error_finish_iomap; + } + *pfnp = pfn; + vmf_ret = VM_FAULT_NEEDDSYNC | major; + goto finish_iomap; + } + trace_dax_insert_mapping(inode, vmf, entry); + if (write) + error = vm_insert_mixed_mkwrite(vma, vaddr, pfn); + else + error = vm_insert_mixed(vma, vaddr, pfn); + /* -EBUSY is fine, somebody else faulted on the same PTE */ if (error == -EBUSY) error = 0; break; case IOMAP_UNWRITTEN: case IOMAP_HOLE: - if (!(vmf->flags & FAULT_FLAG_WRITE)) { + if (!write) { vmf_ret = dax_load_hole(mapping, entry, vmf); goto finish_iomap; } @@ -1223,53 +1269,11 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, } #ifdef CONFIG_FS_DAX_PMD -static int dax_pmd_insert_mapping(struct vm_fault *vmf, struct iomap *iomap, - loff_t pos, void *entry) -{ - struct address_space *mapping = vmf->vma->vm_file->f_mapping; - const sector_t sector = dax_iomap_sector(iomap, pos); - struct dax_device *dax_dev = iomap->dax_dev; - struct block_device *bdev = iomap->bdev; - struct inode *inode = mapping->host; - const size_t size = PMD_SIZE; - void *ret = NULL, *kaddr; - long length = 0; - pgoff_t pgoff; - pfn_t pfn = {}; - int id; - - if (bdev_dax_pgoff(bdev, sector, size, &pgoff) != 0) - goto fallback; - - id = dax_read_lock(); - length = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size), &kaddr, &pfn); - if (length < 0) - goto unlock_fallback; - length = PFN_PHYS(length); - - if (length < size) - goto unlock_fallback; - if (pfn_t_to_pfn(pfn) & PG_PMD_COLOUR) - goto unlock_fallback; - if (!pfn_t_devmap(pfn)) - goto unlock_fallback; - dax_read_unlock(id); - - ret = dax_insert_mapping_entry(mapping, vmf, entry, sector, - RADIX_DAX_PMD); - if (IS_ERR(ret)) - goto fallback; - - trace_dax_pmd_insert_mapping(inode, vmf, length, pfn, ret); - return vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd, - pfn, vmf->flags & FAULT_FLAG_WRITE); - -unlock_fallback: - dax_read_unlock(id); -fallback: - trace_dax_pmd_insert_mapping_fallback(inode, vmf, length, pfn, ret); - return VM_FAULT_FALLBACK; -} +/* + * The 'colour' (ie low bits) within a PMD of a page offset. This comes up + * more often than one might expect in the below functions. + */ +#define PG_PMD_COLOUR ((PMD_SIZE >> PAGE_SHIFT) - 1) static int dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap, void *entry) @@ -1288,7 +1292,7 @@ static int dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap, goto fallback; ret = dax_insert_mapping_entry(mapping, vmf, entry, 0, - RADIX_DAX_PMD | RADIX_DAX_ZERO_PAGE); + RADIX_DAX_PMD | RADIX_DAX_ZERO_PAGE, false); if (IS_ERR(ret)) goto fallback; @@ -1310,13 +1314,14 @@ fallback: return VM_FAULT_FALLBACK; } -static int dax_iomap_pmd_fault(struct vm_fault *vmf, +static int dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, const struct iomap_ops *ops) { struct vm_area_struct *vma = vmf->vma; struct address_space *mapping = vma->vm_file->f_mapping; unsigned long pmd_addr = vmf->address & PMD_MASK; bool write = vmf->flags & FAULT_FLAG_WRITE; + bool sync; unsigned int iomap_flags = (write ? IOMAP_WRITE : 0) | IOMAP_FAULT; struct inode *inode = mapping->host; int result = VM_FAULT_FALLBACK; @@ -1325,6 +1330,7 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf, void *entry; loff_t pos; int error; + pfn_t pfn; /* * Check whether offset isn't beyond end of file now. Caller is @@ -1332,7 +1338,7 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf, * this is a reliable test. */ pgoff = linear_page_index(vma, pmd_addr); - max_pgoff = (i_size_read(inode) - 1) >> PAGE_SHIFT; + max_pgoff = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); trace_dax_pmd_fault(inode, vmf, max_pgoff, 0); @@ -1356,13 +1362,13 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf, if ((pmd_addr + PMD_SIZE) > vma->vm_end) goto fallback; - if (pgoff > max_pgoff) { + if (pgoff >= max_pgoff) { result = VM_FAULT_SIGBUS; goto out; } /* If the PMD would extend beyond the file size */ - if ((pgoff | PG_PMD_COLOUR) > max_pgoff) + if ((pgoff | PG_PMD_COLOUR) >= max_pgoff) goto fallback; /* @@ -1400,9 +1406,37 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf, if (iomap.offset + iomap.length < pos + PMD_SIZE) goto finish_iomap; + sync = dax_fault_is_synchronous(iomap_flags, vma, &iomap); + switch (iomap.type) { case IOMAP_MAPPED: - result = dax_pmd_insert_mapping(vmf, &iomap, pos, entry); + error = dax_iomap_pfn(&iomap, pos, PMD_SIZE, &pfn); + if (error < 0) + goto finish_iomap; + + entry = dax_insert_mapping_entry(mapping, vmf, entry, + dax_iomap_sector(&iomap, pos), + RADIX_DAX_PMD, write && !sync); + if (IS_ERR(entry)) + goto finish_iomap; + + /* + * If we are doing synchronous page fault and inode needs fsync, + * we can insert PMD into page tables only after that happens. + * Skip insertion for now and return the pfn so that caller can + * insert it after fsync is done. + */ + if (sync) { + if (WARN_ON_ONCE(!pfnp)) + goto finish_iomap; + *pfnp = pfn; + result = VM_FAULT_NEEDDSYNC; + goto finish_iomap; + } + + trace_dax_pmd_insert_mapping(inode, vmf, PMD_SIZE, pfn, entry); + result = vmf_insert_pfn_pmd(vma, vmf->address, vmf->pmd, pfn, + write); break; case IOMAP_UNWRITTEN: case IOMAP_HOLE: @@ -1442,7 +1476,7 @@ out: return result; } #else -static int dax_iomap_pmd_fault(struct vm_fault *vmf, +static int dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, const struct iomap_ops *ops) { return VM_FAULT_FALLBACK; @@ -1452,7 +1486,9 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf, /** * dax_iomap_fault - handle a page fault on a DAX file * @vmf: The description of the fault - * @ops: iomap ops passed from the file system + * @pe_size: Size of the page to fault in + * @pfnp: PFN to insert for synchronous faults if fsync is required + * @ops: Iomap ops passed from the file system * * When a page fault occurs, filesystems may call this helper in * their fault handler for DAX files. dax_iomap_fault() assumes the caller @@ -1460,15 +1496,98 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf, * successfully. */ int dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size, - const struct iomap_ops *ops) + pfn_t *pfnp, const struct iomap_ops *ops) { switch (pe_size) { case PE_SIZE_PTE: - return dax_iomap_pte_fault(vmf, ops); + return dax_iomap_pte_fault(vmf, pfnp, ops); case PE_SIZE_PMD: - return dax_iomap_pmd_fault(vmf, ops); + return dax_iomap_pmd_fault(vmf, pfnp, ops); default: return VM_FAULT_FALLBACK; } } EXPORT_SYMBOL_GPL(dax_iomap_fault); + +/** + * dax_insert_pfn_mkwrite - insert PTE or PMD entry into page tables + * @vmf: The description of the fault + * @pe_size: Size of entry to be inserted + * @pfn: PFN to insert + * + * This function inserts writeable PTE or PMD entry into page tables for mmaped + * DAX file. It takes care of marking corresponding radix tree entry as dirty + * as well. + */ +static int dax_insert_pfn_mkwrite(struct vm_fault *vmf, + enum page_entry_size pe_size, + pfn_t pfn) +{ + struct address_space *mapping = vmf->vma->vm_file->f_mapping; + void *entry, **slot; + pgoff_t index = vmf->pgoff; + int vmf_ret, error; + + spin_lock_irq(&mapping->tree_lock); + entry = get_unlocked_mapping_entry(mapping, index, &slot); + /* Did we race with someone splitting entry or so? */ + if (!entry || + (pe_size == PE_SIZE_PTE && !dax_is_pte_entry(entry)) || + (pe_size == PE_SIZE_PMD && !dax_is_pmd_entry(entry))) { + put_unlocked_mapping_entry(mapping, index, entry); + spin_unlock_irq(&mapping->tree_lock); + trace_dax_insert_pfn_mkwrite_no_entry(mapping->host, vmf, + VM_FAULT_NOPAGE); + return VM_FAULT_NOPAGE; + } + radix_tree_tag_set(&mapping->page_tree, index, PAGECACHE_TAG_DIRTY); + entry = lock_slot(mapping, slot); + spin_unlock_irq(&mapping->tree_lock); + switch (pe_size) { + case PE_SIZE_PTE: + error = vm_insert_mixed_mkwrite(vmf->vma, vmf->address, pfn); + vmf_ret = dax_fault_return(error); + break; +#ifdef CONFIG_FS_DAX_PMD + case PE_SIZE_PMD: + vmf_ret = vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd, + pfn, true); + break; +#endif + default: + vmf_ret = VM_FAULT_FALLBACK; + } + put_locked_mapping_entry(mapping, index); + trace_dax_insert_pfn_mkwrite(mapping->host, vmf, vmf_ret); + return vmf_ret; +} + +/** + * dax_finish_sync_fault - finish synchronous page fault + * @vmf: The description of the fault + * @pe_size: Size of entry to be inserted + * @pfn: PFN to insert + * + * This function ensures that the file range touched by the page fault is + * stored persistently on the media and handles inserting of appropriate page + * table entry. + */ +int dax_finish_sync_fault(struct vm_fault *vmf, enum page_entry_size pe_size, + pfn_t pfn) +{ + int err; + loff_t start = ((loff_t)vmf->pgoff) << PAGE_SHIFT; + size_t len = 0; + + if (pe_size == PE_SIZE_PTE) + len = PAGE_SIZE; + else if (pe_size == PE_SIZE_PMD) + len = PMD_SIZE; + else + WARN_ON_ONCE(1); + err = vfs_fsync_range(vmf->vma->vm_file, start, start + len - 1, 1); + if (err) + return VM_FAULT_SIGBUS; + return dax_insert_pfn_mkwrite(vmf, pe_size, pfn); +} +EXPORT_SYMBOL_GPL(dax_finish_sync_fault); diff --git a/fs/ext2/file.c b/fs/ext2/file.c index c67b486488fd..2da67699dc33 100644 --- a/fs/ext2/file.c +++ b/fs/ext2/file.c @@ -100,7 +100,7 @@ static int ext2_dax_fault(struct vm_fault *vmf) } down_read(&ei->dax_sem); - ret = dax_iomap_fault(vmf, PE_SIZE_PTE, &ext2_iomap_ops); + ret = dax_iomap_fault(vmf, PE_SIZE_PTE, NULL, &ext2_iomap_ops); up_read(&ei->dax_sem); if (vmf->flags & FAULT_FLAG_WRITE) diff --git a/fs/ext4/file.c b/fs/ext4/file.c index ad204d2724ac..a0ae27b1bc66 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -28,6 +28,7 @@ #include <linux/quotaops.h> #include <linux/pagevec.h> #include <linux/uio.h> +#include <linux/mman.h> #include "ext4.h" #include "ext4_jbd2.h" #include "xattr.h" @@ -297,6 +298,7 @@ static int ext4_dax_huge_fault(struct vm_fault *vmf, */ bool write = (vmf->flags & FAULT_FLAG_WRITE) && (vmf->vma->vm_flags & VM_SHARED); + pfn_t pfn; if (write) { sb_start_pagefault(sb); @@ -304,16 +306,20 @@ static int ext4_dax_huge_fault(struct vm_fault *vmf, down_read(&EXT4_I(inode)->i_mmap_sem); handle = ext4_journal_start_sb(sb, EXT4_HT_WRITE_PAGE, EXT4_DATA_TRANS_BLOCKS(sb)); + if (IS_ERR(handle)) { + up_read(&EXT4_I(inode)->i_mmap_sem); + sb_end_pagefault(sb); + return VM_FAULT_SIGBUS; + } } else { down_read(&EXT4_I(inode)->i_mmap_sem); } - if (!IS_ERR(handle)) - result = dax_iomap_fault(vmf, pe_size, &ext4_iomap_ops); - else - result = VM_FAULT_SIGBUS; + result = dax_iomap_fault(vmf, pe_size, &pfn, &ext4_iomap_ops); if (write) { - if (!IS_ERR(handle)) - ext4_journal_stop(handle); + ext4_journal_stop(handle); + /* Handling synchronous page fault? */ + if (result & VM_FAULT_NEEDDSYNC) + result = dax_finish_sync_fault(vmf, pe_size, pfn); up_read(&EXT4_I(inode)->i_mmap_sem); sb_end_pagefault(sb); } else { @@ -351,6 +357,13 @@ static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma) if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) return -EIO; + /* + * We don't support synchronous mappings for non-DAX files. At least + * until someone comes with a sensible use case. + */ + if (!IS_DAX(file_inode(file)) && (vma->vm_flags & VM_SYNC)) + return -EOPNOTSUPP; + file_accessed(file); if (IS_DAX(file_inode(file))) { vma->vm_ops = &ext4_dax_vm_ops; @@ -469,6 +482,7 @@ const struct file_operations ext4_file_operations = { .compat_ioctl = ext4_compat_ioctl, #endif .mmap = ext4_file_mmap, + .mmap_supported_flags = MAP_SYNC, .open = ext4_file_open, .release = ext4_release_file, .fsync = ext4_sync_file, diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 8d2b582fb141..0992d76f7ab1 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3384,6 +3384,19 @@ static int ext4_releasepage(struct page *page, gfp_t wait) return try_to_free_buffers(page); } +static bool ext4_inode_datasync_dirty(struct inode *inode) +{ + journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; + + if (journal) + return !jbd2_transaction_committed(journal, + EXT4_I(inode)->i_datasync_tid); + /* Any metadata buffers to write? */ + if (!list_empty(&inode->i_mapping->private_list)) + return true; + return inode->i_state & I_DIRTY_DATASYNC; +} + static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length, unsigned flags, struct iomap *iomap) { @@ -3497,6 +3510,8 @@ retry: } iomap->flags = 0; + if (ext4_inode_datasync_dirty(inode)) + iomap->flags |= IOMAP_F_DIRTY; iomap->bdev = inode->i_sb->s_bdev; iomap->dax_dev = sbi->s_daxdev; iomap->offset = first_block << blkbits; diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index d2a85c9720e9..67546c7ad473 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -737,6 +737,23 @@ int jbd2_log_wait_commit(journal_t *journal, tid_t tid) return err; } +/* Return 1 when transaction with given tid has already committed. */ +int jbd2_transaction_committed(journal_t *journal, tid_t tid) +{ + int ret = 1; + + read_lock(&journal->j_state_lock); + if (journal->j_running_transaction && + journal->j_running_transaction->t_tid == tid) + ret = 0; + if (journal->j_committing_transaction && + journal->j_committing_transaction->t_tid == tid) + ret = 0; + read_unlock(&journal->j_state_lock); + return ret; +} +EXPORT_SYMBOL(jbd2_transaction_committed); + /* * When this function returns the transaction corresponding to tid * will be completed. If the transaction has currently running, start diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 875231c36cb3..339e4c1c044d 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -661,6 +661,7 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma) [ilog2(VM_ACCOUNT)] = "ac", [ilog2(VM_NORESERVE)] = "nr", [ilog2(VM_HUGETLB)] = "ht", + [ilog2(VM_SYNC)] = "sf", [ilog2(VM_ARCH_1)] = "ar", [ilog2(VM_WIPEONFORK)] = "wf", [ilog2(VM_DONTDUMP)] = "dd", diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 18146873a8b3..8601275cc5e6 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -44,6 +44,7 @@ #include <linux/falloc.h> #include <linux/pagevec.h> #include <linux/backing-dev.h> +#include <linux/mman.h> static const struct vm_operations_struct xfs_file_vm_ops; @@ -1045,7 +1046,11 @@ __xfs_filemap_fault( xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); if (IS_DAX(inode)) { - ret = dax_iomap_fault(vmf, pe_size, &xfs_iomap_ops); + pfn_t pfn; + + ret = dax_iomap_fault(vmf, pe_size, &pfn, &xfs_iomap_ops); + if (ret & VM_FAULT_NEEDDSYNC) + ret = dax_finish_sync_fault(vmf, pe_size, pfn); } else { if (write_fault) ret = iomap_page_mkwrite(vmf, &xfs_iomap_ops); @@ -1090,37 +1095,16 @@ xfs_filemap_page_mkwrite( } /* - * pfn_mkwrite was originally inteneded to ensure we capture time stamp - * updates on write faults. In reality, it's need to serialise against - * truncate similar to page_mkwrite. Hence we cycle the XFS_MMAPLOCK_SHARED - * to ensure we serialise the fault barrier in place. + * pfn_mkwrite was originally intended to ensure we capture time stamp updates + * on write faults. In reality, it needs to serialise against truncate and + * prepare memory for writing so handle is as standard write fault. */ static int xfs_filemap_pfn_mkwrite( struct vm_fault *vmf) { - struct inode *inode = file_inode(vmf->vma->vm_file); - struct xfs_inode *ip = XFS_I(inode); - int ret = VM_FAULT_NOPAGE; - loff_t size; - - trace_xfs_filemap_pfn_mkwrite(ip); - - sb_start_pagefault(inode->i_sb); - file_update_time(vmf->vma->vm_file); - - /* check if the faulting page hasn't raced with truncate */ - xfs_ilock(ip, XFS_MMAPLOCK_SHARED); - size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; - if (vmf->pgoff >= size) - ret = VM_FAULT_SIGBUS; - else if (IS_DAX(inode)) - ret = dax_iomap_fault(vmf, PE_SIZE_PTE, &xfs_iomap_ops); - xfs_iunlock(ip, XFS_MMAPLOCK_SHARED); - sb_end_pagefault(inode->i_sb); - return ret; - + return __xfs_filemap_fault(vmf, PE_SIZE_PTE, true); } static const struct vm_operations_struct xfs_file_vm_ops = { @@ -1136,6 +1120,13 @@ xfs_file_mmap( struct file *filp, struct vm_area_struct *vma) { + /* + * We don't support synchronous mappings for non-DAX files. At least + * until someone comes with a sensible use case. + */ + if (!IS_DAX(file_inode(filp)) && (vma->vm_flags & VM_SYNC)) + return -EOPNOTSUPP; + file_accessed(filp); vma->vm_ops = &xfs_file_vm_ops; if (IS_DAX(file_inode(filp))) @@ -1154,6 +1145,7 @@ const struct file_operations xfs_file_operations = { .compat_ioctl = xfs_file_compat_ioctl, #endif .mmap = xfs_file_mmap, + .mmap_supported_flags = MAP_SYNC, .open = xfs_file_open, .release = xfs_file_release, .fsync = xfs_file_fsync, diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 18077e2189a9..33eb4fb2e3fd 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -34,6 +34,7 @@ #include "xfs_error.h" #include "xfs_trans.h" #include "xfs_trans_space.h" +#include "xfs_inode_item.h" #include "xfs_iomap.h" #include "xfs_trace.h" #include "xfs_icache.h" @@ -1089,6 +1090,10 @@ xfs_file_iomap_begin( trace_xfs_iomap_found(ip, offset, length, 0, &imap); } + if (xfs_ipincount(ip) && (ip->i_itemp->ili_fsync_fields + & ~XFS_ILOG_TIMESTAMP)) + iomap->flags |= IOMAP_F_DIRTY; + xfs_bmbt_to_iomap(ip, iomap, &imap); if (shared) diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 515ba042d75c..d718a10c2271 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -654,8 +654,6 @@ DEFINE_INODE_EVENT(xfs_inode_set_cowblocks_tag); DEFINE_INODE_EVENT(xfs_inode_clear_cowblocks_tag); DEFINE_INODE_EVENT(xfs_inode_free_cowblocks_invalid); -DEFINE_INODE_EVENT(xfs_filemap_pfn_mkwrite); - TRACE_EVENT(xfs_filemap_fault, TP_PROTO(struct xfs_inode *ip, enum page_entry_size pe_size, bool write_fault), diff --git a/include/linux/dax.h b/include/linux/dax.h index 895e16fcc62d..5258346c558c 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -96,7 +96,9 @@ bool dax_write_cache_enabled(struct dax_device *dax_dev); ssize_t dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter, const struct iomap_ops *ops); int dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size, - const struct iomap_ops *ops); + pfn_t *pfnp, const struct iomap_ops *ops); +int dax_finish_sync_fault(struct vm_fault *vmf, enum page_entry_size pe_size, + pfn_t pfn); int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index); int dax_invalidate_mapping_entry_sync(struct address_space *mapping, pgoff_t index); diff --git a/include/linux/fs.h b/include/linux/fs.h index 269086440071..a2b5d64ea503 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1702,6 +1702,7 @@ struct file_operations { long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long); long (*compat_ioctl) (struct file *, unsigned int, unsigned long); int (*mmap) (struct file *, struct vm_area_struct *); + unsigned long mmap_supported_flags; int (*open) (struct inode *, struct file *); int (*flush) (struct file *, fl_owner_t id); int (*release) (struct inode *, struct file *); diff --git a/include/linux/iomap.h b/include/linux/iomap.h index ca10767ab73d..19a07de28212 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -21,9 +21,13 @@ struct vm_fault; /* * Flags for all iomap mappings: + * + * IOMAP_F_DIRTY indicates the inode has uncommitted metadata needed to access + * written data and requires fdatasync to commit them to persistent storage. */ #define IOMAP_F_NEW 0x01 /* blocks have been newly allocated */ #define IOMAP_F_BOUNDARY 0x02 /* mapping ends at metadata boundary */ +#define IOMAP_F_DIRTY 0x04 /* uncommitted metadata */ /* * Flags that only need to be reported for IOMAP_REPORT requests: diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index 606b6bce3a5b..296d1e0ea87b 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -1367,6 +1367,7 @@ int jbd2_log_start_commit(journal_t *journal, tid_t tid); int __jbd2_log_start_commit(journal_t *journal, tid_t tid); int jbd2_journal_start_commit(journal_t *journal, tid_t *tid); int jbd2_log_wait_commit(journal_t *journal, tid_t tid); +int jbd2_transaction_committed(journal_t *journal, tid_t tid); int jbd2_complete_transaction(journal_t *journal, tid_t tid); int jbd2_log_do_checkpoint(journal_t *journal); int jbd2_trans_will_send_data_barrier(journal_t *journal, tid_t tid); diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h index 3eaad2fbf284..f8109ddb5ef1 100644 --- a/include/linux/libnvdimm.h +++ b/include/linux/libnvdimm.h @@ -18,6 +18,18 @@ #include <linux/sizes.h> #include <linux/types.h> #include <linux/uuid.h> +#include <linux/spinlock.h> + +struct badrange_entry { + u64 start; + u64 length; + struct list_head list; +}; + +struct badrange { + struct list_head list; + spinlock_t lock; +}; enum { /* when a dimm supports both PMEM and BLK access a label is required */ @@ -129,9 +141,12 @@ static inline struct nd_blk_region_desc *to_blk_region_desc( } -int nvdimm_bus_add_poison(struct nvdimm_bus *nvdimm_bus, u64 addr, u64 length); -void nvdimm_forget_poison(struct nvdimm_bus *nvdimm_bus, - phys_addr_t start, unsigned int len); +void badrange_init(struct badrange *badrange); +int badrange_add(struct badrange *badrange, u64 addr, u64 length); +void badrange_forget(struct badrange *badrange, phys_addr_t start, + unsigned int len); +int nvdimm_bus_add_badrange(struct nvdimm_bus *nvdimm_bus, u64 addr, + u64 length); struct nvdimm_bus *nvdimm_bus_register(struct device *parent, struct nvdimm_bus_descriptor *nfit_desc); void nvdimm_bus_unregister(struct nvdimm_bus *nvdimm_bus); diff --git a/include/linux/mm.h b/include/linux/mm.h index c7b1d617dff6..ee073146aaa7 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -199,6 +199,7 @@ extern unsigned int kobjsize(const void *objp); #define VM_ACCOUNT 0x00100000 /* Is a VM accounted object */ #define VM_NORESERVE 0x00200000 /* should the VM suppress accounting */ #define VM_HUGETLB 0x00400000 /* Huge TLB Page VM */ +#define VM_SYNC 0x00800000 /* Synchronous page faults */ #define VM_ARCH_1 0x01000000 /* Architecture-specific flag */ #define VM_WIPEONFORK 0x02000000 /* Wipe VMA contents in child. */ #define VM_DONTDUMP 0x04000000 /* Do not include in the core dump */ @@ -1191,8 +1192,9 @@ static inline void clear_page_pfmemalloc(struct page *page) #define VM_FAULT_RETRY 0x0400 /* ->fault blocked, must retry */ #define VM_FAULT_FALLBACK 0x0800 /* huge page fault failed, fall back to small */ #define VM_FAULT_DONE_COW 0x1000 /* ->fault has fully handled COW */ - -#define VM_FAULT_HWPOISON_LARGE_MASK 0xf000 /* encodes hpage index for large hwpoison */ +#define VM_FAULT_NEEDDSYNC 0x2000 /* ->fault did not modify page tables + * and needs fsync() to complete (for + * synchronous page faults in DAX) */ #define VM_FAULT_ERROR (VM_FAULT_OOM | VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV | \ VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE | \ @@ -1210,7 +1212,8 @@ static inline void clear_page_pfmemalloc(struct page *page) { VM_FAULT_LOCKED, "LOCKED" }, \ { VM_FAULT_RETRY, "RETRY" }, \ { VM_FAULT_FALLBACK, "FALLBACK" }, \ - { VM_FAULT_DONE_COW, "DONE_COW" } + { VM_FAULT_DONE_COW, "DONE_COW" }, \ + { VM_FAULT_NEEDDSYNC, "NEEDDSYNC" } /* Encode hstate index for a hwpoisoned large page */ #define VM_FAULT_SET_HINDEX(x) ((x) << 12) diff --git a/include/linux/mman.h b/include/linux/mman.h index 7c87b6652244..6a4d1caaff5c 100644 --- a/include/linux/mman.h +++ b/include/linux/mman.h @@ -8,6 +8,48 @@ #include <linux/atomic.h> #include <uapi/linux/mman.h> +/* + * Arrange for legacy / undefined architecture specific flags to be + * ignored by mmap handling code. + */ +#ifndef MAP_32BIT +#define MAP_32BIT 0 +#endif +#ifndef MAP_HUGE_2MB +#define MAP_HUGE_2MB 0 +#endif +#ifndef MAP_HUGE_1GB +#define MAP_HUGE_1GB 0 +#endif +#ifndef MAP_UNINITIALIZED +#define MAP_UNINITIALIZED 0 +#endif +#ifndef MAP_SYNC +#define MAP_SYNC 0 +#endif + +/* + * The historical set of flags that all mmap implementations implicitly + * support when a ->mmap_validate() op is not provided in file_operations. + */ +#define LEGACY_MAP_MASK (MAP_SHARED \ + | MAP_PRIVATE \ + | MAP_FIXED \ + | MAP_ANONYMOUS \ + | MAP_DENYWRITE \ + | MAP_EXECUTABLE \ + | MAP_UNINITIALIZED \ + | MAP_GROWSDOWN \ + | MAP_LOCKED \ + | MAP_NORESERVE \ + | MAP_POPULATE \ + | MAP_NONBLOCK \ + | MAP_STACK \ + | MAP_HUGETLB \ + | MAP_32BIT \ + | MAP_HUGE_2MB \ + | MAP_HUGE_1GB) + extern int sysctl_overcommit_memory; extern int sysctl_overcommit_ratio; extern unsigned long sysctl_overcommit_kbytes; @@ -64,8 +106,9 @@ static inline bool arch_validate_prot(unsigned long prot) * ("bit1" and "bit2" must be single bits) */ #define _calc_vm_trans(x, bit1, bit2) \ + ((!(bit1) || !(bit2)) ? 0 : \ ((bit1) <= (bit2) ? ((x) & (bit1)) * ((bit2) / (bit1)) \ - : ((x) & (bit1)) / ((bit1) / (bit2))) + : ((x) & (bit1)) / ((bit1) / (bit2)))) /* * Combine the mmap "prot" argument into "vm_flags" used internally. @@ -87,7 +130,8 @@ calc_vm_flag_bits(unsigned long flags) { return _calc_vm_trans(flags, MAP_GROWSDOWN, VM_GROWSDOWN ) | _calc_vm_trans(flags, MAP_DENYWRITE, VM_DENYWRITE ) | - _calc_vm_trans(flags, MAP_LOCKED, VM_LOCKED ); + _calc_vm_trans(flags, MAP_LOCKED, VM_LOCKED ) | + _calc_vm_trans(flags, MAP_SYNC, VM_SYNC ); } unsigned long vm_commit_limit(void); diff --git a/include/trace/events/fs_dax.h b/include/trace/events/fs_dax.h index 8a8df5423dca..97b09fcf7e52 100644 --- a/include/trace/events/fs_dax.h +++ b/include/trace/events/fs_dax.h @@ -149,7 +149,6 @@ DEFINE_EVENT(dax_pmd_insert_mapping_class, name, \ TP_ARGS(inode, vmf, length, pfn, radix_entry)) DEFINE_PMD_INSERT_MAPPING_EVENT(dax_pmd_insert_mapping); -DEFINE_PMD_INSERT_MAPPING_EVENT(dax_pmd_insert_mapping_fallback); DECLARE_EVENT_CLASS(dax_pte_fault_class, TP_PROTO(struct inode *inode, struct vm_fault *vmf, int result), @@ -192,6 +191,8 @@ DEFINE_EVENT(dax_pte_fault_class, name, \ DEFINE_PTE_FAULT_EVENT(dax_pte_fault); DEFINE_PTE_FAULT_EVENT(dax_pte_fault_done); DEFINE_PTE_FAULT_EVENT(dax_load_hole); +DEFINE_PTE_FAULT_EVENT(dax_insert_pfn_mkwrite_no_entry); +DEFINE_PTE_FAULT_EVENT(dax_insert_pfn_mkwrite); TRACE_EVENT(dax_insert_mapping, TP_PROTO(struct inode *inode, struct vm_fault *vmf, void *radix_entry), diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h index 6d319c46fd90..f8b134f5608f 100644 --- a/include/uapi/asm-generic/mman-common.h +++ b/include/uapi/asm-generic/mman-common.h @@ -17,6 +17,7 @@ #define MAP_SHARED 0x01 /* Share changes */ #define MAP_PRIVATE 0x02 /* Changes are private */ +#define MAP_SHARED_VALIDATE 0x03 /* share + validate extension flags */ #define MAP_TYPE 0x0f /* Mask for type of mapping */ #define MAP_FIXED 0x10 /* Interpret addr exactly */ #define MAP_ANONYMOUS 0x20 /* don't use a file */ diff --git a/include/uapi/asm-generic/mman.h b/include/uapi/asm-generic/mman.h index 2dffcbf705b3..653687d9771b 100644 --- a/include/uapi/asm-generic/mman.h +++ b/include/uapi/asm-generic/mman.h @@ -13,6 +13,7 @@ #define MAP_NONBLOCK 0x10000 /* do not block on IO */ #define MAP_STACK 0x20000 /* give out an address that is best suited for process/thread stacks */ #define MAP_HUGETLB 0x40000 /* create a huge page mapping */ +#define MAP_SYNC 0x80000 /* perform synchronous page faults for the mapping */ /* Bits [26:31] are reserved, see mman-common.h for MAP_HUGETLB usage */ diff --git a/mm/mmap.c b/mm/mmap.c index 680506faceae..924839fac0e6 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1387,9 +1387,24 @@ unsigned long do_mmap(struct file *file, unsigned long addr, if (file) { struct inode *inode = file_inode(file); + unsigned long flags_mask; + + flags_mask = LEGACY_MAP_MASK | file->f_op->mmap_supported_flags; switch (flags & MAP_TYPE) { case MAP_SHARED: + /* + * Force use of MAP_SHARED_VALIDATE with non-legacy + * flags. E.g. MAP_SYNC is dangerous to use with + * MAP_SHARED as you don't know which consistency model + * you will get. We silently ignore unsupported flags + * with MAP_SHARED to preserve backward compatibility. + */ + flags &= LEGACY_MAP_MASK; + /* fall through */ + case MAP_SHARED_VALIDATE: + if (flags & ~flags_mask) + return -EOPNOTSUPP; if ((prot&PROT_WRITE) && !(file->f_mode&FMODE_WRITE)) return -EACCES; diff --git a/tools/include/uapi/asm-generic/mman-common.h b/tools/include/uapi/asm-generic/mman-common.h index 6d319c46fd90..f8b134f5608f 100644 --- a/tools/include/uapi/asm-generic/mman-common.h +++ b/tools/include/uapi/asm-generic/mman-common.h @@ -17,6 +17,7 @@ #define MAP_SHARED 0x01 /* Share changes */ #define MAP_PRIVATE 0x02 /* Changes are private */ +#define MAP_SHARED_VALIDATE 0x03 /* share + validate extension flags */ #define MAP_TYPE 0x0f /* Mask for type of mapping */ #define MAP_FIXED 0x10 /* Interpret addr exactly */ #define MAP_ANONYMOUS 0x20 /* don't use a file */ diff --git a/tools/testing/nvdimm/Kbuild b/tools/testing/nvdimm/Kbuild index 65368d9027f5..db33b28c5ef3 100644 --- a/tools/testing/nvdimm/Kbuild +++ b/tools/testing/nvdimm/Kbuild @@ -70,6 +70,7 @@ libnvdimm-y += $(NVDIMM_SRC)/region_devs.o libnvdimm-y += $(NVDIMM_SRC)/region.o libnvdimm-y += $(NVDIMM_SRC)/namespace_devs.o libnvdimm-y += $(NVDIMM_SRC)/label.o +libnvdimm-y += $(NVDIMM_SRC)/badrange.o libnvdimm-$(CONFIG_ND_CLAIM) += $(NVDIMM_SRC)/claim.o libnvdimm-$(CONFIG_BTT) += $(NVDIMM_SRC)/btt_devs.o libnvdimm-$(CONFIG_NVDIMM_PFN) += $(NVDIMM_SRC)/pfn_devs.o diff --git a/tools/testing/nvdimm/test/nfit.c b/tools/testing/nvdimm/test/nfit.c index bef419d4266d..7217b2b953b5 100644 --- a/tools/testing/nvdimm/test/nfit.c +++ b/tools/testing/nvdimm/test/nfit.c @@ -168,8 +168,12 @@ struct nfit_test { spinlock_t lock; } ars_state; struct device *dimm_dev[NUM_DCR]; + struct badrange badrange; + struct work_struct work; }; +static struct workqueue_struct *nfit_wq; + static struct nfit_test *to_nfit_test(struct device *dev) { struct platform_device *pdev = to_platform_device(dev); @@ -234,48 +238,68 @@ static int nfit_test_cmd_set_config_data(struct nd_cmd_set_config_hdr *nd_cmd, return rc; } -#define NFIT_TEST_ARS_RECORDS 4 #define NFIT_TEST_CLEAR_ERR_UNIT 256 static int nfit_test_cmd_ars_cap(struct nd_cmd_ars_cap *nd_cmd, unsigned int buf_len) { + int ars_recs; + if (buf_len < sizeof(*nd_cmd)) return -EINVAL; + /* for testing, only store up to n records that fit within 4k */ + ars_recs = SZ_4K / sizeof(struct nd_ars_record); + nd_cmd->max_ars_out = sizeof(struct nd_cmd_ars_status) - + NFIT_TEST_ARS_RECORDS * sizeof(struct nd_ars_record); + + ars_recs * sizeof(struct nd_ars_record); nd_cmd->status = (ND_ARS_PERSISTENT | ND_ARS_VOLATILE) << 16; nd_cmd->clear_err_unit = NFIT_TEST_CLEAR_ERR_UNIT; return 0; } -/* - * Initialize the ars_state to return an ars_result 1 second in the future with - * a 4K error range in the middle of the requested address range. - */ -static void post_ars_status(struct ars_state *ars_state, u64 addr, u64 len) +static void post_ars_status(struct ars_state *ars_state, + struct badrange *badrange, u64 addr, u64 len) { struct nd_cmd_ars_status *ars_status; struct nd_ars_record *ars_record; + struct badrange_entry *be; + u64 end = addr + len - 1; + int i = 0; ars_state->deadline = jiffies + 1*HZ; ars_status = ars_state->ars_status; ars_status->status = 0; - ars_status->out_length = sizeof(struct nd_cmd_ars_status) - + sizeof(struct nd_ars_record); ars_status->address = addr; ars_status->length = len; ars_status->type = ND_ARS_PERSISTENT; - ars_status->num_records = 1; - ars_record = &ars_status->records[0]; - ars_record->handle = 0; - ars_record->err_address = addr + len / 2; - ars_record->length = SZ_4K; + + spin_lock(&badrange->lock); + list_for_each_entry(be, &badrange->list, list) { + u64 be_end = be->start + be->length - 1; + u64 rstart, rend; + + /* skip entries outside the range */ + if (be_end < addr || be->start > end) + continue; + + rstart = (be->start < addr) ? addr : be->start; + rend = (be_end < end) ? be_end : end; + ars_record = &ars_status->records[i]; + ars_record->handle = 0; + ars_record->err_address = rstart; + ars_record->length = rend - rstart + 1; + i++; + } + spin_unlock(&badrange->lock); + ars_status->num_records = i; + ars_status->out_length = sizeof(struct nd_cmd_ars_status) + + i * sizeof(struct nd_ars_record); } -static int nfit_test_cmd_ars_start(struct ars_state *ars_state, +static int nfit_test_cmd_ars_start(struct nfit_test *t, + struct ars_state *ars_state, struct nd_cmd_ars_start *ars_start, unsigned int buf_len, int *cmd_rc) { @@ -289,7 +313,7 @@ static int nfit_test_cmd_ars_start(struct ars_state *ars_state, } else { ars_start->status = 0; ars_start->scrub_time = 1; - post_ars_status(ars_state, ars_start->address, + post_ars_status(ars_state, &t->badrange, ars_start->address, ars_start->length); *cmd_rc = 0; } @@ -320,7 +344,8 @@ static int nfit_test_cmd_ars_status(struct ars_state *ars_state, return 0; } -static int nfit_test_cmd_clear_error(struct nd_cmd_clear_error *clear_err, +static int nfit_test_cmd_clear_error(struct nfit_test *t, + struct nd_cmd_clear_error *clear_err, unsigned int buf_len, int *cmd_rc) { const u64 mask = NFIT_TEST_CLEAR_ERR_UNIT - 1; @@ -330,18 +355,91 @@ static int nfit_test_cmd_clear_error(struct nd_cmd_clear_error *clear_err, if ((clear_err->address & mask) || (clear_err->length & mask)) return -EINVAL; - /* - * Report 'all clear' success for all commands even though a new - * scrub will find errors again. This is enough to have the - * error removed from the 'badblocks' tracking in the pmem - * driver. - */ + badrange_forget(&t->badrange, clear_err->address, clear_err->length); clear_err->status = 0; clear_err->cleared = clear_err->length; *cmd_rc = 0; return 0; } +struct region_search_spa { + u64 addr; + struct nd_region *region; +}; + +static int is_region_device(struct device *dev) +{ + return !strncmp(dev->kobj.name, "region", 6); +} + +static int nfit_test_search_region_spa(struct device *dev, void *data) +{ + struct region_search_spa *ctx = data; + struct nd_region *nd_region; + resource_size_t ndr_end; + + if (!is_region_device(dev)) + return 0; + + nd_region = to_nd_region(dev); + ndr_end = nd_region->ndr_start + nd_region->ndr_size; + + if (ctx->addr >= nd_region->ndr_start && ctx->addr < ndr_end) { + ctx->region = nd_region; + return 1; + } + + return 0; +} + +static int nfit_test_search_spa(struct nvdimm_bus *bus, + struct nd_cmd_translate_spa *spa) +{ + int ret; + struct nd_region *nd_region = NULL; + struct nvdimm *nvdimm = NULL; + struct nd_mapping *nd_mapping = NULL; + struct region_search_spa ctx = { + .addr = spa->spa, + .region = NULL, + }; + u64 dpa; + + ret = device_for_each_child(&bus->dev, &ctx, + nfit_test_search_region_spa); + + if (!ret) + return -ENODEV; + + nd_region = ctx.region; + + dpa = ctx.addr - nd_region->ndr_start; + + /* + * last dimm is selected for test + */ + nd_mapping = &nd_region->mapping[nd_region->ndr_mappings - 1]; + nvdimm = nd_mapping->nvdimm; + + spa->devices[0].nfit_device_handle = handle[nvdimm->id]; + spa->num_nvdimms = 1; + spa->devices[0].dpa = dpa; + + return 0; +} + +static int nfit_test_cmd_translate_spa(struct nvdimm_bus *bus, + struct nd_cmd_translate_spa *spa, unsigned int buf_len) +{ + if (buf_len < spa->translate_length) + return -EINVAL; + + if (nfit_test_search_spa(bus, spa) < 0 || !spa->num_nvdimms) + spa->status = 2; + + return 0; +} + static int nfit_test_cmd_smart(struct nd_cmd_smart *smart, unsigned int buf_len) { static const struct nd_smart_payload smart_data = { @@ -378,6 +476,93 @@ static int nfit_test_cmd_smart_threshold(struct nd_cmd_smart_threshold *smart_t, return 0; } +static void uc_error_notify(struct work_struct *work) +{ + struct nfit_test *t = container_of(work, typeof(*t), work); + + __acpi_nfit_notify(&t->pdev.dev, t, NFIT_NOTIFY_UC_MEMORY_ERROR); +} + +static int nfit_test_cmd_ars_error_inject(struct nfit_test *t, + struct nd_cmd_ars_err_inj *err_inj, unsigned int buf_len) +{ + int rc; + + if (buf_len != sizeof(*err_inj)) { + rc = -EINVAL; + goto err; + } + + if (err_inj->err_inj_spa_range_length <= 0) { + rc = -EINVAL; + goto err; + } + + rc = badrange_add(&t->badrange, err_inj->err_inj_spa_range_base, + err_inj->err_inj_spa_range_length); + if (rc < 0) + goto err; + + if (err_inj->err_inj_options & (1 << ND_ARS_ERR_INJ_OPT_NOTIFY)) + queue_work(nfit_wq, &t->work); + + err_inj->status = 0; + return 0; + +err: + err_inj->status = NFIT_ARS_INJECT_INVALID; + return rc; +} + +static int nfit_test_cmd_ars_inject_clear(struct nfit_test *t, + struct nd_cmd_ars_err_inj_clr *err_clr, unsigned int buf_len) +{ + int rc; + + if (buf_len != sizeof(*err_clr)) { + rc = -EINVAL; + goto err; + } + + if (err_clr->err_inj_clr_spa_range_length <= 0) { + rc = -EINVAL; + goto err; + } + + badrange_forget(&t->badrange, err_clr->err_inj_clr_spa_range_base, + err_clr->err_inj_clr_spa_range_length); + + err_clr->status = 0; + return 0; + +err: + err_clr->status = NFIT_ARS_INJECT_INVALID; + return rc; +} + +static int nfit_test_cmd_ars_inject_status(struct nfit_test *t, + struct nd_cmd_ars_err_inj_stat *err_stat, + unsigned int buf_len) +{ + struct badrange_entry *be; + int max = SZ_4K / sizeof(struct nd_error_stat_query_record); + int i = 0; + + err_stat->status = 0; + spin_lock(&t->badrange.lock); + list_for_each_entry(be, &t->badrange.list, list) { + err_stat->record[i].err_inj_stat_spa_range_base = be->start; + err_stat->record[i].err_inj_stat_spa_range_length = be->length; + i++; + if (i > max) + break; + } + spin_unlock(&t->badrange.lock); + err_stat->inj_err_rec_count = i; + + return 0; +} + static int nfit_test_ctl(struct nvdimm_bus_descriptor *nd_desc, struct nvdimm *nvdimm, unsigned int cmd, void *buf, unsigned int buf_len, int *cmd_rc) @@ -449,6 +634,38 @@ static int nfit_test_ctl(struct nvdimm_bus_descriptor *nd_desc, } } else { struct ars_state *ars_state = &t->ars_state; + struct nd_cmd_pkg *call_pkg = buf; + + if (!nd_desc) + return -ENOTTY; + + if (cmd == ND_CMD_CALL) { + func = call_pkg->nd_command; + + buf_len = call_pkg->nd_size_in + call_pkg->nd_size_out; + buf = (void *) call_pkg->nd_payload; + + switch (func) { + case NFIT_CMD_TRANSLATE_SPA: + rc = nfit_test_cmd_translate_spa( + acpi_desc->nvdimm_bus, buf, buf_len); + return rc; + case NFIT_CMD_ARS_INJECT_SET: + rc = nfit_test_cmd_ars_error_inject(t, buf, + buf_len); + return rc; + case NFIT_CMD_ARS_INJECT_CLEAR: + rc = nfit_test_cmd_ars_inject_clear(t, buf, + buf_len); + return rc; + case NFIT_CMD_ARS_INJECT_GET: + rc = nfit_test_cmd_ars_inject_status(t, buf, + buf_len); + return rc; + default: + return -ENOTTY; + } + } if (!nd_desc || !test_bit(cmd, &nd_desc->cmd_mask)) return -ENOTTY; @@ -458,15 +675,15 @@ static int nfit_test_ctl(struct nvdimm_bus_descriptor *nd_desc, rc = nfit_test_cmd_ars_cap(buf, buf_len); break; case ND_CMD_ARS_START: - rc = nfit_test_cmd_ars_start(ars_state, buf, buf_len, - cmd_rc); + rc = nfit_test_cmd_ars_start(t, ars_state, buf, + buf_len, cmd_rc); break; case ND_CMD_ARS_STATUS: rc = nfit_test_cmd_ars_status(ars_state, buf, buf_len, cmd_rc); break; case ND_CMD_CLEAR_ERROR: - rc = nfit_test_cmd_clear_error(buf, buf_len, cmd_rc); + rc = nfit_test_cmd_clear_error(t, buf, buf_len, cmd_rc); break; default: return -ENOTTY; @@ -566,10 +783,9 @@ static struct nfit_test_resource *nfit_test_lookup(resource_size_t addr) static int ars_state_init(struct device *dev, struct ars_state *ars_state) { + /* for testing, only store up to n records that fit within 4k */ ars_state->ars_status = devm_kzalloc(dev, - sizeof(struct nd_cmd_ars_status) - + sizeof(struct nd_ars_record) * NFIT_TEST_ARS_RECORDS, - GFP_KERNEL); + sizeof(struct nd_cmd_ars_status) + SZ_4K, GFP_KERNEL); if (!ars_state->ars_status) return -ENOMEM; spin_lock_init(&ars_state->lock); @@ -1419,7 +1635,8 @@ static void nfit_test0_setup(struct nfit_test *t) + i * sizeof(u64); } - post_ars_status(&t->ars_state, t->spa_set_dma[0], SPA0_SIZE); + post_ars_status(&t->ars_state, &t->badrange, t->spa_set_dma[0], + SPA0_SIZE); acpi_desc = &t->acpi_desc; set_bit(ND_CMD_GET_CONFIG_SIZE, &acpi_desc->dimm_cmd_force_en); @@ -1430,7 +1647,12 @@ static void nfit_test0_setup(struct nfit_test *t) set_bit(ND_CMD_ARS_START, &acpi_desc->bus_cmd_force_en); set_bit(ND_CMD_ARS_STATUS, &acpi_desc->bus_cmd_force_en); set_bit(ND_CMD_CLEAR_ERROR, &acpi_desc->bus_cmd_force_en); + set_bit(ND_CMD_CALL, &acpi_desc->bus_cmd_force_en); set_bit(ND_CMD_SMART_THRESHOLD, &acpi_desc->dimm_cmd_force_en); + set_bit(NFIT_CMD_TRANSLATE_SPA, &acpi_desc->bus_nfit_cmd_force_en); + set_bit(NFIT_CMD_ARS_INJECT_SET, &acpi_desc->bus_nfit_cmd_force_en); + set_bit(NFIT_CMD_ARS_INJECT_CLEAR, &acpi_desc->bus_nfit_cmd_force_en); + set_bit(NFIT_CMD_ARS_INJECT_GET, &acpi_desc->bus_nfit_cmd_force_en); } static void nfit_test1_setup(struct nfit_test *t) @@ -1520,7 +1742,8 @@ static void nfit_test1_setup(struct nfit_test *t) dcr->code = NFIT_FIC_BYTE; dcr->windows = 0; - post_ars_status(&t->ars_state, t->spa_set_dma[0], SPA2_SIZE); + post_ars_status(&t->ars_state, &t->badrange, t->spa_set_dma[0], + SPA2_SIZE); acpi_desc = &t->acpi_desc; set_bit(ND_CMD_ARS_CAP, &acpi_desc->bus_cmd_force_en); @@ -1589,6 +1812,7 @@ static int nfit_ctl_test(struct device *dev) unsigned long mask, cmd_size, offset; union { struct nd_cmd_get_config_size cfg_size; + struct nd_cmd_clear_error clear_err; struct nd_cmd_ars_status ars_stat; struct nd_cmd_ars_cap ars_cap; char buf[sizeof(struct nd_cmd_ars_status) @@ -1613,10 +1837,15 @@ static int nfit_ctl_test(struct device *dev) .cmd_mask = 1UL << ND_CMD_ARS_CAP | 1UL << ND_CMD_ARS_START | 1UL << ND_CMD_ARS_STATUS - | 1UL << ND_CMD_CLEAR_ERROR, + | 1UL << ND_CMD_CLEAR_ERROR + | 1UL << ND_CMD_CALL, .module = THIS_MODULE, .provider_name = "ACPI.NFIT", .ndctl = acpi_nfit_ctl, + .bus_dsm_mask = 1UL << NFIT_CMD_TRANSLATE_SPA + | 1UL << NFIT_CMD_ARS_INJECT_SET + | 1UL << NFIT_CMD_ARS_INJECT_CLEAR + | 1UL << NFIT_CMD_ARS_INJECT_GET, }, .dev = &adev->dev, }; @@ -1767,6 +1996,23 @@ static int nfit_ctl_test(struct device *dev) return -EIO; } + /* test clear error */ + cmd_size = sizeof(cmds.clear_err); + cmds.clear_err = (struct nd_cmd_clear_error) { + .length = 512, + .cleared = 512, + }; + rc = setup_result(cmds.buf, cmd_size); + if (rc) + return rc; + rc = acpi_nfit_ctl(&acpi_desc->nd_desc, NULL, ND_CMD_CLEAR_ERROR, + cmds.buf, cmd_size, &cmd_rc); + if (rc < 0 || cmd_rc) { + dev_dbg(dev, "%s: failed at: %d rc: %d cmd_rc: %d\n", + __func__, __LINE__, rc, cmd_rc); + return -EIO; + } + return 0; } @@ -1915,6 +2161,10 @@ static __init int nfit_test_init(void) nfit_test_setup(nfit_test_lookup, nfit_test_evaluate_dsm); + nfit_wq = create_singlethread_workqueue("nfit"); + if (!nfit_wq) + return -ENOMEM; + nfit_test_dimm = class_create(THIS_MODULE, "nfit_test_dimm"); if (IS_ERR(nfit_test_dimm)) { rc = PTR_ERR(nfit_test_dimm); @@ -1931,6 +2181,7 @@ static __init int nfit_test_init(void) goto err_register; } INIT_LIST_HEAD(&nfit_test->resources); + badrange_init(&nfit_test->badrange); switch (i) { case 0: nfit_test->num_pm = NUM_PM; @@ -1966,6 +2217,7 @@ static __init int nfit_test_init(void) goto err_register; instances[i] = nfit_test; + INIT_WORK(&nfit_test->work, uc_error_notify); } rc = platform_driver_register(&nfit_test_driver); @@ -1974,6 +2226,7 @@ static __init int nfit_test_init(void) return 0; err_register: + destroy_workqueue(nfit_wq); for (i = 0; i < NUM_NFITS; i++) if (instances[i]) platform_device_unregister(&instances[i]->pdev); @@ -1989,6 +2242,8 @@ static __exit void nfit_test_exit(void) { int i; + flush_workqueue(nfit_wq); + destroy_workqueue(nfit_wq); for (i = 0; i < NUM_NFITS; i++) platform_device_unregister(&instances[i]->pdev); platform_driver_unregister(&nfit_test_driver); diff --git a/tools/testing/nvdimm/test/nfit_test.h b/tools/testing/nvdimm/test/nfit_test.h index d3d63dd5ed38..113b44675a71 100644 --- a/tools/testing/nvdimm/test/nfit_test.h +++ b/tools/testing/nvdimm/test/nfit_test.h @@ -32,6 +32,58 @@ struct nfit_test_resource { void *buf; }; +#define ND_TRANSLATE_SPA_STATUS_INVALID_SPA 2 +#define NFIT_ARS_INJECT_INVALID 2 + +enum err_inj_options { + ND_ARS_ERR_INJ_OPT_NOTIFY = 0, +}; + +/* nfit commands */ +enum nfit_cmd_num { + NFIT_CMD_TRANSLATE_SPA = 5, + NFIT_CMD_ARS_INJECT_SET = 7, + NFIT_CMD_ARS_INJECT_CLEAR = 8, + NFIT_CMD_ARS_INJECT_GET = 9, +}; + +struct nd_cmd_translate_spa { + __u64 spa; + __u32 status; + __u8 flags; + __u8 _reserved[3]; + __u64 translate_length; + __u32 num_nvdimms; + struct nd_nvdimm_device { + __u32 nfit_device_handle; + __u32 _reserved; + __u64 dpa; + } __packed devices[0]; + +} __packed; + +struct nd_cmd_ars_err_inj { + __u64 err_inj_spa_range_base; + __u64 err_inj_spa_range_length; + __u8 err_inj_options; + __u32 status; +} __packed; + +struct nd_cmd_ars_err_inj_clr { + __u64 err_inj_clr_spa_range_base; + __u64 err_inj_clr_spa_range_length; + __u32 status; +} __packed; + +struct nd_cmd_ars_err_inj_stat { + __u32 status; + __u32 inj_err_rec_count; + struct nd_error_stat_query_record { + __u64 err_inj_stat_spa_range_base; + __u64 err_inj_stat_spa_range_length; + } __packed record[0]; +} __packed; + union acpi_object; typedef void *acpi_handle; |