summaryrefslogtreecommitdiffstats
path: root/drivers/nvdimm
diff options
context:
space:
mode:
author: Dan Williams <dan.j.williams@intel.com> 2017-04-28 10:23:37 -0700
committer: Ingo Molnar <mingo@kernel.org> 2017-05-01 09:15:53 +0200
commit: 71389703839ebe9cb426c72d5f0bd549592e583c (patch)
tree: ebae9604d3f43ce673a103c4b897ebac57d9aa57 /drivers/nvdimm
parent: dbd68d8e84c606673ebbcf15862f8c155fa92326 (diff)
download: linux-71389703839ebe9cb426c72d5f0bd549592e583c.tar.bz2
mm, zone_device: Replace {get, put}_zone_device_page() with a single reference to fix pmem crash
The x86 conversion to the generic GUP code included a small change which causes crashes and data corruption in the pmem code - not good. The root cause is that the /dev/pmem driver code implicitly relies on the x86 get_user_pages() implementation doing a get_page() on the page refcount, because get_page() does a get_zone_device_page() which properly refcounts pmem's separate page struct arrays that are not present in the regular page struct structures. (The pmem driver does this because it can cover huge memory areas.) But the x86 conversion to the generic GUP code changed the get_page() to page_cache_get_speculative() which is faster but doesn't do the get_zone_device_page() call the pmem code relies on. One way to solve the regression would be to change the generic GUP code to use get_page(), but that would slow things down a bit and punish other generic-GUP using architectures for an x86-ism they did not care about. (Arguably the pmem driver was probably not working reliably for them: but nvdimm is an Intel feature, so non-x86 exposure is probably still limited.) So restructure the pmem code's interface with the MM instead: get rid of the get/put_zone_device_page() distinction, integrate put_zone_device_page() into __put_page() and restructure the pmem completion-wait and teardown machinery: Kirill points out that the calls to {get,put}_dev_pagemap() can be removed from the mm fast path if we take a single get_dev_pagemap() reference to signify that the page is alive and use the final put of the page to drop that reference. This does require some care to make sure that any waits for the percpu_ref to drop to zero occur *after* devm_memremap_page_release(), since it now maintains its own elevated reference. This speeds up things while also making the pmem refcounting more robust going forward.
Suggested-by: Kirill Shutemov <kirill.shutemov@linux.intel.com> Tested-by: Kirill Shutemov <kirill.shutemov@linux.intel.com> Signed-off-by: Dan Williams <dan.j.williams@intel.com> Reviewed-by: Logan Gunthorpe <logang@deltatee.com> Cc: Andrew Morton <akpm@linux-foundation.org> Cc: Andy Lutomirski <luto@kernel.org> Cc: Borislav Petkov <bp@alien8.de> Cc: Brian Gerst <brgerst@gmail.com> Cc: Denys Vlasenko <dvlasenk@redhat.com> Cc: H. Peter Anvin <hpa@zytor.com> Cc: Josh Poimboeuf <jpoimboe@redhat.com> Cc: Jérôme Glisse <jglisse@redhat.com> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/149339998297.24933.1129582806028305912.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Ingo Molnar <mingo@kernel.org>
Diffstat (limited to 'drivers/nvdimm')
-rw-r--r-- drivers/nvdimm/pmem.c 13
1 files changed, 11 insertions, 2 deletions
diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index 5b536be5a12e..fb7bbc79ac26 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -25,6 +25,7 @@
#include <linux/badblocks.h>
#include <linux/memremap.h>
#include <linux/vmalloc.h>
+#include <linux/blk-mq.h>
#include <linux/pfn_t.h>
#include <linux/slab.h>
#include <linux/pmem.h>
@@ -231,6 +232,11 @@ static void pmem_release_queue(void *q)
blk_cleanup_queue(q);
}
+static void pmem_freeze_queue(void *q)
+{
+ blk_mq_freeze_queue_start(q);
+}
+
static void pmem_release_disk(void *disk)
{
del_gendisk(disk);
@@ -284,6 +290,9 @@ static int pmem_attach_disk(struct device *dev,
if (!q)
return -ENOMEM;
+ if (devm_add_action_or_reset(dev, pmem_release_queue, q))
+ return -ENOMEM;
+
pmem->pfn_flags = PFN_DEV;
if (is_nd_pfn(dev)) {
addr = devm_memremap_pages(dev, &pfn_res, &q->q_usage_counter,
@@ -303,10 +312,10 @@ static int pmem_attach_disk(struct device *dev,
pmem->size, ARCH_MEMREMAP_PMEM);
/*
- * At release time the queue must be dead before
+ * At release time the queue must be frozen before
* devm_memremap_pages is unwound
*/
- if (devm_add_action_or_reset(dev, pmem_release_queue, q))
+ if (devm_add_action_or_reset(dev, pmem_freeze_queue, q))
return -ENOMEM;
if (IS_ERR(addr))