From 8b9ab62662048a3274361c7e5f64037c2c133e2c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 19 Jun 2022 08:05:52 +0200 Subject: block: remove blk_cleanup_disk blk_cleanup_disk is nothing but a trivial wrapper for put_disk now, so remove it. Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Link: https://lore.kernel.org/r/20220619060552.1850436-7-hch@lst.de Signed-off-by: Jens Axboe --- drivers/nvdimm/btt.c | 4 ++-- drivers/nvdimm/pmem.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'drivers/nvdimm') diff --git a/drivers/nvdimm/btt.c b/drivers/nvdimm/btt.c index 9613e54c7a67..5e622c0d4b66 100644 --- a/drivers/nvdimm/btt.c +++ b/drivers/nvdimm/btt.c @@ -1548,14 +1548,14 @@ static int btt_blk_init(struct btt *btt) return 0; out_cleanup_disk: - blk_cleanup_disk(btt->btt_disk); + put_disk(btt->btt_disk); return rc; } static void btt_blk_cleanup(struct btt *btt) { del_gendisk(btt->btt_disk); - blk_cleanup_disk(btt->btt_disk); + put_disk(btt->btt_disk); } /** diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index 629d10fcf53b..a72b81fa3242 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -450,7 +450,7 @@ static void pmem_release_disk(void *__pmem) put_dax(pmem->dax_dev); del_gendisk(pmem->disk); - blk_cleanup_disk(pmem->disk); + put_disk(pmem->disk); } static int pmem_attach_disk(struct device *dev, @@ -596,7 +596,7 @@ out_cleanup_dax: kill_dax(pmem->dax_dev); put_dax(pmem->dax_dev); out: - blk_cleanup_disk(pmem->disk); + put_disk(pmem->disk); return rc; } -- cgit v1.2.3 From 86947df3a9236481276e8baadde50a403b02b4d4 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 14 Jul 2022 11:06:29 -0700 Subject: block: Change the type of the last .rw_page() argument All .rw_page() callers pass an enum req_op value as last argument. Make this explicit by changing the type of the last argument into enum req_op. See also commit 3f289dcb4b26 ("block: make bdev_ops->rw_page() take a REQ_OP instead of bool"). Cc: Tejun Heo Cc: Minchan Kim Cc: Dan Williams Cc: Hannes Reinecke Cc: Damien Le Moal Cc: Johannes Thumshirn Signed-off-by: Bart Van Assche Link: https://lore.kernel.org/r/20220714180729.1065367-4-bvanassche@acm.org Signed-off-by: Jens Axboe --- drivers/block/brd.c | 2 +- drivers/block/zram/zram_drv.c | 2 +- drivers/nvdimm/btt.c | 2 +- drivers/nvdimm/pmem.c | 2 +- include/linux/blkdev.h | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) (limited to 'drivers/nvdimm') diff --git a/drivers/block/brd.c b/drivers/block/brd.c index 9e26d5e769f3..7b82876af36e 100644 --- a/drivers/block/brd.c +++ b/drivers/block/brd.c @@ -310,7 +310,7 @@ static void brd_submit_bio(struct bio *bio) } static int brd_rw_page(struct block_device *bdev, sector_t sector, - struct page *page, unsigned int op) + struct page *page, enum req_op op) { struct brd_device *brd = bdev->bd_disk->private_data; int err; diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index e5233c911e43..a35b86c58aa2 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -1631,7 +1631,7 @@ static void zram_slot_free_notify(struct block_device *bdev, } static int zram_rw_page(struct block_device *bdev, sector_t sector, - struct page *page, unsigned int op) + struct page *page, enum req_op op) { int offset, ret; u32 index; diff --git a/drivers/nvdimm/btt.c b/drivers/nvdimm/btt.c index 5e622c0d4b66..dfbf73145d16 100644 --- a/drivers/nvdimm/btt.c +++ b/drivers/nvdimm/btt.c @@ -1483,7 +1483,7 @@ static void btt_submit_bio(struct bio *bio) } static int btt_rw_page(struct block_device *bdev, sector_t sector, - struct page *page, unsigned int op) + struct page *page, enum req_op op) { struct btt *btt = bdev->bd_disk->private_data; int rc; diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index a72b81fa3242..f36efcc11f67 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -239,7 +239,7 @@ static void pmem_submit_bio(struct bio *bio) } static int pmem_rw_page(struct block_device *bdev, sector_t sector, - struct page *page, unsigned int op) + struct page *page, enum req_op op) { struct pmem_device *pmem = bdev->bd_disk->private_data; blk_status_t rc; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 2f13f0062192..ca2ff113ea00 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1381,7 +1381,7 @@ struct block_device_operations { unsigned int flags); int (*open) (struct block_device *, fmode_t); void (*release) (struct gendisk *, fmode_t); - int (*rw_page)(struct block_device *, sector_t, struct page *, unsigned int); + int (*rw_page)(struct block_device *, sector_t, struct page *, enum req_op); int (*ioctl) (struct block_device *, fmode_t, unsigned, unsigned long); int (*compat_ioctl) (struct block_device *, fmode_t, unsigned, unsigned long); unsigned int (*check_events) (struct gendisk *disk, -- cgit v1.2.3 From ba229aa8f2494bb76aa3f0c80e8a6c0023c829d7 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 14 Jul 2022 11:06:45 -0700 Subject: nvdimm-btt: Use the enum req_op type Improve static type checking by using the enum req_op type where appropriate. Cc: Dan Williams Signed-off-by: Bart Van Assche Link: https://lore.kernel.org/r/20220714180729.1065367-20-bvanassche@acm.org Signed-off-by: Jens Axboe --- drivers/nvdimm/btt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/nvdimm') diff --git a/drivers/nvdimm/btt.c b/drivers/nvdimm/btt.c index dfbf73145d16..0297b7882e33 100644 --- a/drivers/nvdimm/btt.c +++ b/drivers/nvdimm/btt.c @@ -1422,7 +1422,7 @@ static int btt_write_pg(struct btt *btt, struct bio_integrity_payload *bip, static int btt_do_bvec(struct btt *btt, struct bio_integrity_payload *bip, struct page *page, unsigned int len, unsigned int off, - unsigned int op, sector_t sector) + enum req_op op, sector_t sector) { int ret; -- cgit v1.2.3 From 33a8f7f2b3a3437d016d1b4047a4fd37eb6951b3 Mon Sep 17 00:00:00 2001 From: Shiyang Ruan Date: Fri, 3 Jun 2022 13:37:27 +0800 Subject: pagemap,pmem: introduce ->memory_failure() When memory-failure occurs, we call this function which is implemented by each kind of devices. For the fsdax case, pmem device driver implements it. Pmem device driver will find out the filesystem in which the corrupted page located in. With dax_holder notify support, we are able to notify the memory failure from pmem driver to upper layers. If there is something not support in the notify routine, memory_failure will fall back to the generic hanlder. Link: https://lkml.kernel.org/r/20220603053738.1218681-4-ruansy.fnst@fujitsu.com Signed-off-by: Shiyang Ruan Reviewed-by: Christoph Hellwig Reviewed-by: Dan Williams Reviewed-by: Darrick J. Wong Reviewed-by: Naoya Horiguchi Cc: Al Viro Cc: Dan Williams Cc: Dave Chinner Cc: Goldwyn Rodrigues Cc: Goldwyn Rodrigues Cc: Jane Chu Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Ritesh Harjani Signed-off-by: Andrew Morton --- drivers/nvdimm/pmem.c | 17 +++++++++++++++++ include/linux/memremap.h | 12 ++++++++++++ mm/memory-failure.c | 14 ++++++++++++++ 3 files changed, 43 insertions(+) (limited to 'drivers/nvdimm') diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index 629d10fcf53b..107c9cb3d57d 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -453,6 +453,21 @@ static void pmem_release_disk(void *__pmem) blk_cleanup_disk(pmem->disk); } +static int pmem_pagemap_memory_failure(struct dev_pagemap *pgmap, + unsigned long pfn, unsigned long nr_pages, int mf_flags) +{ + struct pmem_device *pmem = + container_of(pgmap, struct pmem_device, pgmap); + u64 offset = PFN_PHYS(pfn) - pmem->phys_addr - pmem->data_offset; + u64 len = nr_pages << PAGE_SHIFT; + + return dax_holder_notify_failure(pmem->dax_dev, offset, len, mf_flags); +} + +static const struct dev_pagemap_ops fsdax_pagemap_ops = { + .memory_failure = pmem_pagemap_memory_failure, +}; + static int pmem_attach_disk(struct device *dev, struct nd_namespace_common *ndns) { @@ -514,6 +529,7 @@ static int pmem_attach_disk(struct device *dev, pmem->pfn_flags = PFN_DEV; if (is_nd_pfn(dev)) { pmem->pgmap.type = MEMORY_DEVICE_FS_DAX; + pmem->pgmap.ops = &fsdax_pagemap_ops; addr = devm_memremap_pages(dev, &pmem->pgmap); pfn_sb = nd_pfn->pfn_sb; pmem->data_offset = le64_to_cpu(pfn_sb->dataoff); @@ -527,6 +543,7 @@ static int pmem_attach_disk(struct device *dev, pmem->pgmap.range.end = res->end; pmem->pgmap.nr_range = 1; pmem->pgmap.type = MEMORY_DEVICE_FS_DAX; + pmem->pgmap.ops = &fsdax_pagemap_ops; addr = devm_memremap_pages(dev, &pmem->pgmap); pmem->pfn_flags |= PFN_MAP; bb_range = pmem->pgmap.range; diff --git a/include/linux/memremap.h b/include/linux/memremap.h index 09320b7f706c..19010491a603 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -87,6 +87,18 @@ struct dev_pagemap_ops { * the page back to a CPU accessible page. */ vm_fault_t (*migrate_to_ram)(struct vm_fault *vmf); + + /* + * Handle the memory failure happens on a range of pfns. Notify the + * processes who are using these pfns, and try to recover the data on + * them if necessary. The mf_flags is finally passed to the recover + * function through the whole notify routine. + * + * When this is not implemented, or it returns -EOPNOTSUPP, the caller + * will fall back to a common handler called mf_generic_kill_procs(). + */ + int (*memory_failure)(struct dev_pagemap *pgmap, unsigned long pfn, + unsigned long nr_pages, int mf_flags); }; #define PGMAP_ALTMAP_VALID (1 << 0) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index f8a8a5d45eba..46c77151f726 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1748,6 +1748,20 @@ static int memory_failure_dev_pagemap(unsigned long pfn, int flags, if (!pgmap_pfn_valid(pgmap, pfn)) goto out; + /* + * Call driver's implementation to handle the memory failure, otherwise + * fall back to generic handler. + */ + if (pgmap->ops->memory_failure) { + rc = pgmap->ops->memory_failure(pgmap, pfn, 1, flags); + /* + * Fall back to generic handler too if operation is not + * supported inside the driver/device/filesystem. + */ + if (rc != -EOPNOTSUPP) + goto out; + } + rc = mf_generic_kill_procs(pfn, flags, pgmap); out: /* drop pgmap ref acquired in caller */ -- cgit v1.2.3 From 04ad63f086d1a9649b8b082748cbc7a570ade461 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Tue, 11 Jan 2022 08:06:40 -0800 Subject: cxl/region: Introduce cxl_pmem_region objects The LIBNVDIMM subsystem is a platform agnostic representation of system NVDIMM / persistent memory resources. To date, the CXL subsystem's interaction with LIBNVDIMM has been to register an nvdimm-bridge device and cxl_nvdimm objects to proxy CXL capabilities into existing LIBNVDIMM subsystem mechanics. With regions the approach is the same. Create a new cxl_pmem_region object to proxy CXL region details into a LIBNVDIMM definition. With this enabling LIBNVDIMM can partition CXL persistent memory regions with legacy namespace labels. A follow-on patch will add CXL region label and CXL namespace label support to persist region configurations across driver reload / system-reset events. Co-developed-by: Ben Widawsky Signed-off-by: Ben Widawsky Reviewed-by: Jonathan Cameron Link: https://lore.kernel.org/r/165784340111.1758207.3036498385188290968.stgit@dwillia2-xfh.jf.intel.com Signed-off-by: Dan Williams --- drivers/cxl/core/core.h | 3 + drivers/cxl/core/pmem.c | 4 +- drivers/cxl/core/port.c | 2 + drivers/cxl/core/region.c | 142 +++++++++++++++++++++++++- drivers/cxl/cxl.h | 36 ++++++- drivers/cxl/pmem.c | 238 ++++++++++++++++++++++++++++++++++++++++++- drivers/nvdimm/region_devs.c | 28 +++-- include/linux/libnvdimm.h | 5 + 8 files changed, 446 insertions(+), 12 deletions(-) (limited to 'drivers/nvdimm') diff --git a/drivers/cxl/core/core.h b/drivers/cxl/core/core.h index 391aadf9e7fa..1d8f87be283f 100644 --- a/drivers/cxl/core/core.h +++ b/drivers/cxl/core/core.h @@ -13,11 +13,13 @@ extern struct attribute_group cxl_base_attribute_group; extern struct device_attribute dev_attr_create_pmem_region; extern struct device_attribute dev_attr_delete_region; extern struct device_attribute dev_attr_region; +extern const struct device_type cxl_pmem_region_type; extern const struct device_type cxl_region_type; void cxl_decoder_kill_region(struct cxl_endpoint_decoder *cxled); #define CXL_REGION_ATTR(x) (&dev_attr_##x.attr) #define CXL_REGION_TYPE(x) (&cxl_region_type) #define SET_CXL_REGION_ATTR(x) (&dev_attr_##x.attr), +#define CXL_PMEM_REGION_TYPE(x) (&cxl_pmem_region_type) int cxl_region_init(void); void cxl_region_exit(void); #else @@ -34,6 +36,7 @@ static inline void cxl_region_exit(void) #define CXL_REGION_ATTR(x) NULL #define CXL_REGION_TYPE(x) NULL #define SET_CXL_REGION_ATTR(x) +#define CXL_PMEM_REGION_TYPE(x) NULL #endif struct cxl_send_command; diff --git a/drivers/cxl/core/pmem.c b/drivers/cxl/core/pmem.c index bec7cfb54ebf..1d12a8206444 100644 --- a/drivers/cxl/core/pmem.c +++ b/drivers/cxl/core/pmem.c @@ -62,9 +62,9 @@ static int match_nvdimm_bridge(struct device *dev, void *data) return is_cxl_nvdimm_bridge(dev); } -struct cxl_nvdimm_bridge *cxl_find_nvdimm_bridge(struct cxl_nvdimm *cxl_nvd) +struct cxl_nvdimm_bridge *cxl_find_nvdimm_bridge(struct device *start) { - struct cxl_port *port = find_cxl_root(&cxl_nvd->dev); + struct cxl_port *port = find_cxl_root(start); struct device *dev; if (!port) diff --git a/drivers/cxl/core/port.c b/drivers/cxl/core/port.c index ba66ce25de30..3d2d0119cc3d 100644 --- a/drivers/cxl/core/port.c +++ b/drivers/cxl/core/port.c @@ -44,6 +44,8 @@ static int cxl_device_id(struct device *dev) return CXL_DEVICE_NVDIMM_BRIDGE; if (dev->type == &cxl_nvdimm_type) return CXL_DEVICE_NVDIMM; + if (dev->type == CXL_PMEM_REGION_TYPE()) + return CXL_DEVICE_PMEM_REGION; if (is_cxl_port(dev)) { if (is_cxl_root(to_cxl_port(dev))) return CXL_DEVICE_ROOT; diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c index bf4e7f5499c1..dc71ec457608 100644 --- a/drivers/cxl/core/region.c +++ b/drivers/cxl/core/region.c @@ -1650,6 +1650,139 @@ static ssize_t delete_region_store(struct device *dev, } DEVICE_ATTR_WO(delete_region); +static void cxl_pmem_region_release(struct device *dev) +{ + struct cxl_pmem_region *cxlr_pmem = to_cxl_pmem_region(dev); + int i; + + for (i = 0; i < cxlr_pmem->nr_mappings; i++) { + struct cxl_memdev *cxlmd = cxlr_pmem->mapping[i].cxlmd; + + put_device(&cxlmd->dev); + } + + kfree(cxlr_pmem); +} + +static const struct attribute_group *cxl_pmem_region_attribute_groups[] = { + &cxl_base_attribute_group, + NULL, +}; + +const struct device_type cxl_pmem_region_type = { + .name = "cxl_pmem_region", + .release = cxl_pmem_region_release, + .groups = cxl_pmem_region_attribute_groups, +}; + +bool is_cxl_pmem_region(struct device *dev) +{ + return dev->type == &cxl_pmem_region_type; +} +EXPORT_SYMBOL_NS_GPL(is_cxl_pmem_region, CXL); + +struct cxl_pmem_region *to_cxl_pmem_region(struct device *dev) +{ + if (dev_WARN_ONCE(dev, !is_cxl_pmem_region(dev), + "not a cxl_pmem_region device\n")) + return NULL; + return container_of(dev, struct cxl_pmem_region, dev); +} +EXPORT_SYMBOL_NS_GPL(to_cxl_pmem_region, CXL); + +static struct lock_class_key cxl_pmem_region_key; + +static struct cxl_pmem_region *cxl_pmem_region_alloc(struct cxl_region *cxlr) +{ + struct cxl_region_params *p = &cxlr->params; + struct cxl_pmem_region *cxlr_pmem; + struct device *dev; + int i; + + down_read(&cxl_region_rwsem); + if (p->state != CXL_CONFIG_COMMIT) { + cxlr_pmem = ERR_PTR(-ENXIO); + goto out; + } + + cxlr_pmem = kzalloc(struct_size(cxlr_pmem, mapping, p->nr_targets), + GFP_KERNEL); + if (!cxlr_pmem) { + cxlr_pmem = ERR_PTR(-ENOMEM); + goto out; + } + + cxlr_pmem->hpa_range.start = p->res->start; + cxlr_pmem->hpa_range.end = p->res->end; + + /* Snapshot the region configuration underneath the cxl_region_rwsem */ + cxlr_pmem->nr_mappings = p->nr_targets; + for (i = 0; i < p->nr_targets; i++) { + struct cxl_endpoint_decoder *cxled = p->targets[i]; + struct cxl_memdev *cxlmd = cxled_to_memdev(cxled); + struct cxl_pmem_region_mapping *m = &cxlr_pmem->mapping[i]; + + m->cxlmd = cxlmd; + get_device(&cxlmd->dev); + m->start = cxled->dpa_res->start; + m->size = resource_size(cxled->dpa_res); + m->position = i; + } + + dev = &cxlr_pmem->dev; + cxlr_pmem->cxlr = cxlr; + device_initialize(dev); + lockdep_set_class(&dev->mutex, &cxl_pmem_region_key); + device_set_pm_not_required(dev); + dev->parent = &cxlr->dev; + dev->bus = &cxl_bus_type; + dev->type = &cxl_pmem_region_type; +out: + up_read(&cxl_region_rwsem); + + return cxlr_pmem; +} + +static void cxlr_pmem_unregister(void *dev) +{ + device_unregister(dev); +} + +/** + * devm_cxl_add_pmem_region() - add a cxl_region-to-nd_region bridge + * @cxlr: parent CXL region for this pmem region bridge device + * + * Return: 0 on success negative error code on failure. + */ +static int devm_cxl_add_pmem_region(struct cxl_region *cxlr) +{ + struct cxl_pmem_region *cxlr_pmem; + struct device *dev; + int rc; + + cxlr_pmem = cxl_pmem_region_alloc(cxlr); + if (IS_ERR(cxlr_pmem)) + return PTR_ERR(cxlr_pmem); + + dev = &cxlr_pmem->dev; + rc = dev_set_name(dev, "pmem_region%d", cxlr->id); + if (rc) + goto err; + + rc = device_add(dev); + if (rc) + goto err; + + dev_dbg(&cxlr->dev, "%s: register %s\n", dev_name(dev->parent), + dev_name(dev)); + + return devm_add_action_or_reset(&cxlr->dev, cxlr_pmem_unregister, dev); + +err: + put_device(dev); + return rc; +} + static int cxl_region_probe(struct device *dev) { struct cxl_region *cxlr = to_cxl_region(dev); @@ -1673,7 +1806,14 @@ static int cxl_region_probe(struct device *dev) */ up_read(&cxl_region_rwsem); - return rc; + switch (cxlr->mode) { + case CXL_DECODER_PMEM: + return devm_cxl_add_pmem_region(cxlr); + default: + dev_dbg(&cxlr->dev, "unsupported region mode: %d\n", + cxlr->mode); + return -ENXIO; + } } static struct cxl_driver cxl_region_driver = { diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h index 46d3f173a700..75674400cc8d 100644 --- a/drivers/cxl/cxl.h +++ b/drivers/cxl/cxl.h @@ -419,6 +419,25 @@ struct cxl_nvdimm { struct device dev; struct cxl_memdev *cxlmd; struct cxl_nvdimm_bridge *bridge; + struct cxl_pmem_region *region; +}; + +struct cxl_pmem_region_mapping { + struct cxl_memdev *cxlmd; + struct cxl_nvdimm *cxl_nvd; + u64 start; + u64 size; + int position; +}; + +struct cxl_pmem_region { + struct device dev; + struct cxl_region *cxlr; + struct nd_region *nd_region; + struct cxl_nvdimm_bridge *bridge; + struct range hpa_range; + int nr_mappings; + struct cxl_pmem_region_mapping mapping[]; }; /** @@ -601,6 +620,7 @@ void cxl_driver_unregister(struct cxl_driver *cxl_drv); #define CXL_DEVICE_ROOT 4 #define CXL_DEVICE_MEMORY_EXPANDER 5 #define CXL_DEVICE_REGION 6 +#define CXL_DEVICE_PMEM_REGION 7 #define MODULE_ALIAS_CXL(type) MODULE_ALIAS("cxl:t" __stringify(type) "*") #define CXL_MODALIAS_FMT "cxl:t%d" @@ -612,7 +632,21 @@ struct cxl_nvdimm *to_cxl_nvdimm(struct device *dev); bool is_cxl_nvdimm(struct device *dev); bool is_cxl_nvdimm_bridge(struct device *dev); int devm_cxl_add_nvdimm(struct device *host, struct cxl_memdev *cxlmd); -struct cxl_nvdimm_bridge *cxl_find_nvdimm_bridge(struct cxl_nvdimm *cxl_nvd); +struct cxl_nvdimm_bridge *cxl_find_nvdimm_bridge(struct device *dev); + +#ifdef CONFIG_CXL_REGION +bool is_cxl_pmem_region(struct device *dev); +struct cxl_pmem_region *to_cxl_pmem_region(struct device *dev); +#else +static inline bool is_cxl_pmem_region(struct device *dev) +{ + return false; +} +static inline struct cxl_pmem_region *to_cxl_pmem_region(struct device *dev) +{ + return NULL; +} +#endif /* * Unit test builds overrides this to __weak, find the 'strong' version diff --git a/drivers/cxl/pmem.c b/drivers/cxl/pmem.c index b271f6e90b91..e69f99a0747d 100644 --- a/drivers/cxl/pmem.c +++ b/drivers/cxl/pmem.c @@ -7,6 +7,7 @@ #include #include #include +#include #include "cxlmem.h" #include "cxl.h" @@ -27,6 +28,19 @@ static void clear_exclusive(void *cxlds) static void unregister_nvdimm(void *nvdimm) { struct cxl_nvdimm *cxl_nvd = nvdimm_provider_data(nvdimm); + struct cxl_nvdimm_bridge *cxl_nvb = cxl_nvd->bridge; + struct cxl_pmem_region *cxlr_pmem; + + device_lock(&cxl_nvb->dev); + cxlr_pmem = cxl_nvd->region; + dev_set_drvdata(&cxl_nvd->dev, NULL); + cxl_nvd->region = NULL; + device_unlock(&cxl_nvb->dev); + + if (cxlr_pmem) { + device_release_driver(&cxlr_pmem->dev); + put_device(&cxlr_pmem->dev); + } nvdimm_delete(nvdimm); cxl_nvd->bridge = NULL; @@ -42,7 +56,7 @@ static int cxl_nvdimm_probe(struct device *dev) struct nvdimm *nvdimm; int rc; - cxl_nvb = cxl_find_nvdimm_bridge(cxl_nvd); + cxl_nvb = cxl_find_nvdimm_bridge(dev); if (!cxl_nvb) return -ENXIO; @@ -223,6 +237,21 @@ static int cxl_nvdimm_release_driver(struct device *dev, void *cxl_nvb) return 0; } +static int cxl_pmem_region_release_driver(struct device *dev, void *cxl_nvb) +{ + struct cxl_pmem_region *cxlr_pmem; + + if (!is_cxl_pmem_region(dev)) + return 0; + + cxlr_pmem = to_cxl_pmem_region(dev); + if (cxlr_pmem->bridge != cxl_nvb) + return 0; + + device_release_driver(dev); + return 0; +} + static void offline_nvdimm_bus(struct cxl_nvdimm_bridge *cxl_nvb, struct nvdimm_bus *nvdimm_bus) { @@ -234,6 +263,8 @@ static void offline_nvdimm_bus(struct cxl_nvdimm_bridge *cxl_nvb, * nvdimm_bus_unregister() rips the nvdimm objects out from * underneath them. */ + bus_for_each_dev(&cxl_bus_type, NULL, cxl_nvb, + cxl_pmem_region_release_driver); bus_for_each_dev(&cxl_bus_type, NULL, cxl_nvb, cxl_nvdimm_release_driver); nvdimm_bus_unregister(nvdimm_bus); @@ -328,6 +359,203 @@ static struct cxl_driver cxl_nvdimm_bridge_driver = { .id = CXL_DEVICE_NVDIMM_BRIDGE, }; +static int match_cxl_nvdimm(struct device *dev, void *data) +{ + return is_cxl_nvdimm(dev); +} + +static void unregister_nvdimm_region(void *nd_region) +{ + struct cxl_nvdimm_bridge *cxl_nvb; + struct cxl_pmem_region *cxlr_pmem; + int i; + + cxlr_pmem = nd_region_provider_data(nd_region); + cxl_nvb = cxlr_pmem->bridge; + device_lock(&cxl_nvb->dev); + for (i = 0; i < cxlr_pmem->nr_mappings; i++) { + struct cxl_pmem_region_mapping *m = &cxlr_pmem->mapping[i]; + struct cxl_nvdimm *cxl_nvd = m->cxl_nvd; + + if (cxl_nvd->region) { + put_device(&cxlr_pmem->dev); + cxl_nvd->region = NULL; + } + } + device_unlock(&cxl_nvb->dev); + + nvdimm_region_delete(nd_region); +} + +static void cxlr_pmem_remove_resource(void *res) +{ + remove_resource(res); +} + +struct cxl_pmem_region_info { + u64 offset; + u64 serial; +}; + +static int cxl_pmem_region_probe(struct device *dev) +{ + struct nd_mapping_desc mappings[CXL_DECODER_MAX_INTERLEAVE]; + struct cxl_pmem_region *cxlr_pmem = to_cxl_pmem_region(dev); + struct cxl_region *cxlr = cxlr_pmem->cxlr; + struct cxl_pmem_region_info *info = NULL; + struct cxl_nvdimm_bridge *cxl_nvb; + struct nd_interleave_set *nd_set; + struct nd_region_desc ndr_desc; + struct cxl_nvdimm *cxl_nvd; + struct nvdimm *nvdimm; + struct resource *res; + int rc, i = 0; + + cxl_nvb = cxl_find_nvdimm_bridge(&cxlr_pmem->mapping[0].cxlmd->dev); + if (!cxl_nvb) { + dev_dbg(dev, "bridge not found\n"); + return -ENXIO; + } + cxlr_pmem->bridge = cxl_nvb; + + device_lock(&cxl_nvb->dev); + if (!cxl_nvb->nvdimm_bus) { + dev_dbg(dev, "nvdimm bus not found\n"); + rc = -ENXIO; + goto err; + } + + memset(&mappings, 0, sizeof(mappings)); + memset(&ndr_desc, 0, sizeof(ndr_desc)); + + res = devm_kzalloc(dev, sizeof(*res), GFP_KERNEL); + if (!res) { + rc = -ENOMEM; + goto err; + } + + res->name = "Persistent Memory"; + res->start = cxlr_pmem->hpa_range.start; + res->end = cxlr_pmem->hpa_range.end; + res->flags = IORESOURCE_MEM; + res->desc = IORES_DESC_PERSISTENT_MEMORY; + + rc = insert_resource(&iomem_resource, res); + if (rc) + goto err; + + rc = devm_add_action_or_reset(dev, cxlr_pmem_remove_resource, res); + if (rc) + goto err; + + ndr_desc.res = res; + ndr_desc.provider_data = cxlr_pmem; + + ndr_desc.numa_node = memory_add_physaddr_to_nid(res->start); + ndr_desc.target_node = phys_to_target_node(res->start); + if (ndr_desc.target_node == NUMA_NO_NODE) { + ndr_desc.target_node = ndr_desc.numa_node; + dev_dbg(&cxlr->dev, "changing target node from %d to %d", + NUMA_NO_NODE, ndr_desc.target_node); + } + + nd_set = devm_kzalloc(dev, sizeof(*nd_set), GFP_KERNEL); + if (!nd_set) { + rc = -ENOMEM; + goto err; + } + + ndr_desc.memregion = cxlr->id; + set_bit(ND_REGION_CXL, &ndr_desc.flags); + set_bit(ND_REGION_PERSIST_MEMCTRL, &ndr_desc.flags); + + info = kmalloc_array(cxlr_pmem->nr_mappings, sizeof(*info), GFP_KERNEL); + if (!info) { + rc = -ENOMEM; + goto err; + } + + for (i = 0; i < cxlr_pmem->nr_mappings; i++) { + struct cxl_pmem_region_mapping *m = &cxlr_pmem->mapping[i]; + struct cxl_memdev *cxlmd = m->cxlmd; + struct cxl_dev_state *cxlds = cxlmd->cxlds; + struct device *d; + + d = device_find_child(&cxlmd->dev, NULL, match_cxl_nvdimm); + if (!d) { + dev_dbg(dev, "[%d]: %s: no cxl_nvdimm found\n", i, + dev_name(&cxlmd->dev)); + rc = -ENODEV; + goto err; + } + + /* safe to drop ref now with bridge lock held */ + put_device(d); + + cxl_nvd = to_cxl_nvdimm(d); + nvdimm = dev_get_drvdata(&cxl_nvd->dev); + if (!nvdimm) { + dev_dbg(dev, "[%d]: %s: no nvdimm found\n", i, + dev_name(&cxlmd->dev)); + rc = -ENODEV; + goto err; + } + cxl_nvd->region = cxlr_pmem; + get_device(&cxlr_pmem->dev); + m->cxl_nvd = cxl_nvd; + mappings[i] = (struct nd_mapping_desc) { + .nvdimm = nvdimm, + .start = m->start, + .size = m->size, + .position = i, + }; + info[i].offset = m->start; + info[i].serial = cxlds->serial; + } + ndr_desc.num_mappings = cxlr_pmem->nr_mappings; + ndr_desc.mapping = mappings; + + /* + * TODO enable CXL labels which skip the need for 'interleave-set cookie' + */ + nd_set->cookie1 = + nd_fletcher64(info, sizeof(*info) * cxlr_pmem->nr_mappings, 0); + nd_set->cookie2 = nd_set->cookie1; + ndr_desc.nd_set = nd_set; + + cxlr_pmem->nd_region = + nvdimm_pmem_region_create(cxl_nvb->nvdimm_bus, &ndr_desc); + if (IS_ERR(cxlr_pmem->nd_region)) { + rc = PTR_ERR(cxlr_pmem->nd_region); + goto err; + } + + rc = devm_add_action_or_reset(dev, unregister_nvdimm_region, + cxlr_pmem->nd_region); +out: + kfree(info); + device_unlock(&cxl_nvb->dev); + put_device(&cxl_nvb->dev); + + return rc; + +err: + dev_dbg(dev, "failed to create nvdimm region\n"); + for (i--; i >= 0; i--) { + nvdimm = mappings[i].nvdimm; + cxl_nvd = nvdimm_provider_data(nvdimm); + put_device(&cxl_nvd->region->dev); + cxl_nvd->region = NULL; + } + goto out; +} + +static struct cxl_driver cxl_pmem_region_driver = { + .name = "cxl_pmem_region", + .probe = cxl_pmem_region_probe, + .id = CXL_DEVICE_PMEM_REGION, +}; + /* * Return all bridges to the CXL_NVB_NEW state to invalidate any * ->state_work referring to the now destroyed cxl_pmem_wq. @@ -372,8 +600,14 @@ static __init int cxl_pmem_init(void) if (rc) goto err_nvdimm; + rc = cxl_driver_register(&cxl_pmem_region_driver); + if (rc) + goto err_region; + return 0; +err_region: + cxl_driver_unregister(&cxl_nvdimm_driver); err_nvdimm: cxl_driver_unregister(&cxl_nvdimm_bridge_driver); err_bridge: @@ -383,6 +617,7 @@ err_bridge: static __exit void cxl_pmem_exit(void) { + cxl_driver_unregister(&cxl_pmem_region_driver); cxl_driver_unregister(&cxl_nvdimm_driver); cxl_driver_unregister(&cxl_nvdimm_bridge_driver); destroy_cxl_pmem_wq(); @@ -394,3 +629,4 @@ module_exit(cxl_pmem_exit); MODULE_IMPORT_NS(CXL); MODULE_ALIAS_CXL(CXL_DEVICE_NVDIMM_BRIDGE); MODULE_ALIAS_CXL(CXL_DEVICE_NVDIMM); +MODULE_ALIAS_CXL(CXL_DEVICE_PMEM_REGION); diff --git a/drivers/nvdimm/region_devs.c b/drivers/nvdimm/region_devs.c index d976260eca7a..473a71bbd9c9 100644 --- a/drivers/nvdimm/region_devs.c +++ b/drivers/nvdimm/region_devs.c @@ -133,7 +133,8 @@ static void nd_region_release(struct device *dev) put_device(&nvdimm->dev); } free_percpu(nd_region->lane); - memregion_free(nd_region->id); + if (!test_bit(ND_REGION_CXL, &nd_region->flags)) + memregion_free(nd_region->id); kfree(nd_region); } @@ -982,9 +983,14 @@ static struct nd_region *nd_region_create(struct nvdimm_bus *nvdimm_bus, if (!nd_region) return NULL; - nd_region->id = memregion_alloc(GFP_KERNEL); - if (nd_region->id < 0) - goto err_id; + /* CXL pre-assigns memregion ids before creating nvdimm regions */ + if (test_bit(ND_REGION_CXL, &ndr_desc->flags)) { + nd_region->id = ndr_desc->memregion; + } else { + nd_region->id = memregion_alloc(GFP_KERNEL); + if (nd_region->id < 0) + goto err_id; + } nd_region->lane = alloc_percpu(struct nd_percpu_lane); if (!nd_region->lane) @@ -1043,9 +1049,10 @@ static struct nd_region *nd_region_create(struct nvdimm_bus *nvdimm_bus, return nd_region; - err_percpu: - memregion_free(nd_region->id); - err_id: +err_percpu: + if (!test_bit(ND_REGION_CXL, &ndr_desc->flags)) + memregion_free(nd_region->id); +err_id: kfree(nd_region); return NULL; } @@ -1068,6 +1075,13 @@ struct nd_region *nvdimm_volatile_region_create(struct nvdimm_bus *nvdimm_bus, } EXPORT_SYMBOL_GPL(nvdimm_volatile_region_create); +void nvdimm_region_delete(struct nd_region *nd_region) +{ + if (nd_region) + nd_device_unregister(&nd_region->dev, ND_SYNC); +} +EXPORT_SYMBOL_GPL(nvdimm_region_delete); + int nvdimm_flush(struct nd_region *nd_region, struct bio *bio) { int rc = 0; diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h index 0d61e07b6827..c74acfa1a3fe 100644 --- a/include/linux/libnvdimm.h +++ b/include/linux/libnvdimm.h @@ -59,6 +59,9 @@ enum { /* Platform provides asynchronous flush mechanism */ ND_REGION_ASYNC = 3, + /* Region was created by CXL subsystem */ + ND_REGION_CXL = 4, + /* mark newly adjusted resources as requiring a label update */ DPA_RESOURCE_ADJUSTED = 1 << 0, }; @@ -122,6 +125,7 @@ struct nd_region_desc { int numa_node; int target_node; unsigned long flags; + int memregion; struct device_node *of_node; int (*flush)(struct nd_region *nd_region, struct bio *bio); }; @@ -259,6 +263,7 @@ static inline struct nvdimm *nvdimm_create(struct nvdimm_bus *nvdimm_bus, cmd_mask, num_flush, flush_wpq, NULL, NULL, NULL); } void nvdimm_delete(struct nvdimm *nvdimm); +void nvdimm_region_delete(struct nd_region *nd_region); const struct nd_cmd_desc *nd_cmd_dimm_desc(int cmd); const struct nd_cmd_desc *nd_cmd_bus_desc(int cmd); -- cgit v1.2.3 From 020e1aede7ee31d8b0e24a46ee364c7bb0939f02 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Tue, 28 Jun 2022 16:34:29 +0800 Subject: virtio_pmem: initialize provider_data through nd_region_desc We used to initialize the provider_data manually after nvdimm_pemm_region_create(). This seems to be racy if the flush is issued before the initialization of provider_data[1]. Fixing this by initializing the provider_data through nd_region_desc to make sure the provider_data is ready after the pmem is created. [1]: [ 80.152281] nd_pmem namespace0.0: unable to guarantee persistence of writes [ 92.393956] BUG: kernel NULL pointer dereference, address: 0000000000000318 [ 92.394551] #PF: supervisor read access in kernel mode [ 92.394955] #PF: error_code(0x0000) - not-present page [ 92.395365] PGD 0 P4D 0 [ 92.395566] Oops: 0000 [#1] PREEMPT SMP PTI [ 92.395867] CPU: 2 PID: 506 Comm: mkfs.ext4 Not tainted 5.19.0-rc1+ #453 [ 92.396365] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.16.0-0-gd239552ce722-prebuilt.qemu.org 04/01/2014 [ 92.397178] RIP: 0010:virtio_pmem_flush+0x2f/0x1f0 [ 92.397521] Code: 55 41 54 55 53 48 81 ec a0 00 00 00 65 48 8b 04 25 28 00 00 00 48 89 84 24 98 00 00 00 31 c0 48 8b 87 78 03 00 00 48 89 04 24 <48> 8b 98 18 03 00 00 e8 85 bf 6b 00 ba 58 00 00 00 be c0 0c 00 00 [ 92.398982] RSP: 0018:ffff9a7380aefc88 EFLAGS: 00010246 [ 92.399349] RAX: 0000000000000000 RBX: ffff8e77c3f86f00 RCX: 0000000000000000 [ 92.399833] RDX: ffffffffad4ea720 RSI: ffff8e77c41e39c0 RDI: ffff8e77c41c5c00 [ 92.400388] RBP: ffff8e77c41e39c0 R08: ffff8e77c19f0600 R09: 0000000000000000 [ 92.400874] R10: 0000000000000000 R11: 0000000000000000 R12: ffff8e77c0814e28 [ 92.401364] R13: 0000000000000000 R14: 0000000000000000 R15: ffff8e77c41e39c0 [ 92.401849] FS: 00007f3cd75b2780(0000) GS:ffff8e7937d00000(0000) knlGS:0000000000000000 [ 92.402423] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 92.402821] CR2: 0000000000000318 CR3: 0000000103c80002 CR4: 0000000000370ee0 [ 92.403307] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 92.403793] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 92.404278] Call Trace: [ 92.404481] [ 92.404654] ? mempool_alloc+0x5d/0x160 [ 92.404939] ? terminate_walk+0x5f/0xf0 [ 92.405226] ? bio_alloc_bioset+0xbb/0x3f0 [ 92.405525] async_pmem_flush+0x17/0x80 [ 92.405806] nvdimm_flush+0x11/0x30 [ 92.406067] pmem_submit_bio+0x1e9/0x200 [ 92.406354] __submit_bio+0x80/0x120 [ 92.406621] submit_bio_noacct_nocheck+0xdc/0x2a0 [ 92.406958] submit_bio_wait+0x4e/0x80 [ 92.407234] blkdev_issue_flush+0x31/0x50 [ 92.407526] ? punt_bios_to_rescuer+0x230/0x230 [ 92.407852] blkdev_fsync+0x1e/0x30 [ 92.408112] do_fsync+0x33/0x70 [ 92.408354] __x64_sys_fsync+0xb/0x10 [ 92.408625] do_syscall_64+0x43/0x90 [ 92.408895] entry_SYSCALL_64_after_hwframe+0x46/0xb0 [ 92.409257] RIP: 0033:0x7f3cd76c6c44 Fixes 6e84200c0a29 ("virtio-pmem: Add virtio pmem driver") Acked-by: Pankaj Gupta Reviewed-by: Dan Williams Signed-off-by: Jason Wang Message-Id: <20220628083430.61856-1-jasowang@redhat.com> Signed-off-by: Michael S. Tsirkin --- drivers/nvdimm/virtio_pmem.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/nvdimm') diff --git a/drivers/nvdimm/virtio_pmem.c b/drivers/nvdimm/virtio_pmem.c index 995b6cdc67ed..48f8327d0431 100644 --- a/drivers/nvdimm/virtio_pmem.c +++ b/drivers/nvdimm/virtio_pmem.c @@ -81,6 +81,7 @@ static int virtio_pmem_probe(struct virtio_device *vdev) ndr_desc.res = &res; ndr_desc.numa_node = nid; ndr_desc.flush = async_pmem_flush; + ndr_desc.provider_data = vdev; set_bit(ND_REGION_PAGEMAP, &ndr_desc.flags); set_bit(ND_REGION_ASYNC, &ndr_desc.flags); nd_region = nvdimm_pmem_region_create(vpmem->nvdimm_bus, &ndr_desc); @@ -89,7 +90,6 @@ static int virtio_pmem_probe(struct virtio_device *vdev) err = -ENXIO; goto out_nd; } - nd_region->provider_data = dev_to_virtio(nd_region->dev.parent->parent); return 0; out_nd: nvdimm_bus_unregister(vpmem->nvdimm_bus); -- cgit v1.2.3 From 5d66322b2b23507e9907a68e9c504181ffccd62f Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Tue, 28 Jun 2022 16:34:30 +0800 Subject: virtio_pmem: set device ready in probe() The NVDIMM region could be available before the virtio_device_ready() that is called by virtio_dev_probe(). This means the driver tries to use device before DRIVER_OK which violates the spec, fixing this by set device ready before the nvdimm_pmem_region_create(). Note that this means the virtio_pmem_host_ack() could be triggered before the creation of the nd region, this is safe since the pmem_lock has been initialized and whether or not any available buffer is added before is validated by virtio_pmem_host_ack(). Fixes 6e84200c0a29 ("virtio-pmem: Add virtio pmem driver") Acked-by: Pankaj Gupta Signed-off-by: Jason Wang Message-Id: <20220628083430.61856-2-jasowang@redhat.com> Signed-off-by: Michael S. Tsirkin --- drivers/nvdimm/virtio_pmem.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'drivers/nvdimm') diff --git a/drivers/nvdimm/virtio_pmem.c b/drivers/nvdimm/virtio_pmem.c index 48f8327d0431..20da455d2ef6 100644 --- a/drivers/nvdimm/virtio_pmem.c +++ b/drivers/nvdimm/virtio_pmem.c @@ -84,6 +84,12 @@ static int virtio_pmem_probe(struct virtio_device *vdev) ndr_desc.provider_data = vdev; set_bit(ND_REGION_PAGEMAP, &ndr_desc.flags); set_bit(ND_REGION_ASYNC, &ndr_desc.flags); + /* + * The NVDIMM region could be available before the + * virtio_device_ready() that is called by + * virtio_dev_probe(), so we set device ready here. + */ + virtio_device_ready(vdev); nd_region = nvdimm_pmem_region_create(vpmem->nvdimm_bus, &ndr_desc); if (!nd_region) { dev_err(&vdev->dev, "failed to create nvdimm region\n"); @@ -92,6 +98,7 @@ static int virtio_pmem_probe(struct virtio_device *vdev) } return 0; out_nd: + virtio_reset_device(vdev); nvdimm_bus_unregister(vpmem->nvdimm_bus); out_vq: vdev->config->del_vqs(vdev); -- cgit v1.2.3