Diffstat (limited to 'drivers')
-rw-r--r--  drivers/acpi/nfit/core.c  |   9
-rw-r--r--  drivers/nvdimm/btt.c      | 201
-rw-r--r--  drivers/nvdimm/btt.h      |  47
-rw-r--r--  drivers/nvdimm/pfn_devs.c |  20
4 files changed, 236 insertions(+), 41 deletions(-)
diff --git a/drivers/acpi/nfit/core.c b/drivers/acpi/nfit/core.c
index ff2580e7611d..abeb4df4f22e 100644
--- a/drivers/acpi/nfit/core.c
+++ b/drivers/acpi/nfit/core.c
@@ -1670,6 +1670,11 @@ static int acpi_nfit_add_dimm(struct acpi_nfit_desc *acpi_desc,
 				dev_name(&adev_dimm->dev));
 		return -ENXIO;
 	}
+	/*
+	 * Record nfit_mem for the notification path to track back to
+	 * the nfit sysfs attributes for this dimm device object.
+	 */
+	dev_set_drvdata(&adev_dimm->dev, nfit_mem);
 
 	/*
 	 * Until standardization materializes we need to consider 4
@@ -1752,9 +1757,11 @@ static void shutdown_dimm_notify(void *data)
 			sysfs_put(nfit_mem->flags_attr);
 			nfit_mem->flags_attr = NULL;
 		}
-		if (adev_dimm)
+		if (adev_dimm) {
 			acpi_remove_notify_handler(adev_dimm->handle,
 					ACPI_DEVICE_NOTIFY, acpi_nvdimm_notify);
+			dev_set_drvdata(&adev_dimm->dev, NULL);
+		}
 	}
 	mutex_unlock(&acpi_desc->init_mutex);
 }
diff --git a/drivers/nvdimm/btt.c b/drivers/nvdimm/btt.c
index e949e3302af4..c586bcdb5190 100644
--- a/drivers/nvdimm/btt.c
+++ b/drivers/nvdimm/btt.c
@@ -211,12 +211,12 @@ static int btt_map_read(struct arena_info *arena, u32 lba, u32 *mapping,
 	return ret;
 }
 
-static int btt_log_read_pair(struct arena_info *arena, u32 lane,
-		struct log_entry *ent)
+static int btt_log_group_read(struct arena_info *arena, u32 lane,
+		struct log_group *log)
 {
 	return arena_read_bytes(arena,
-			arena->logoff + (2 * lane * LOG_ENT_SIZE), ent,
-			2 * LOG_ENT_SIZE, 0);
+			arena->logoff + (lane * LOG_GRP_SIZE), log,
+			LOG_GRP_SIZE, 0);
 }
 
 static struct dentry *debugfs_root;
@@ -256,6 +256,8 @@ static void arena_debugfs_init(struct arena_info *a, struct dentry *parent,
 	debugfs_create_x64("logoff", S_IRUGO, d, &a->logoff);
 	debugfs_create_x64("info2off", S_IRUGO, d, &a->info2off);
 	debugfs_create_x32("flags", S_IRUGO, d, &a->flags);
+	debugfs_create_u32("log_index_0", S_IRUGO, d, &a->log_index[0]);
+	debugfs_create_u32("log_index_1", S_IRUGO, d, &a->log_index[1]);
 }
 
 static void btt_debugfs_init(struct btt *btt)
@@ -274,6 +276,11 @@ static void btt_debugfs_init(struct btt *btt)
 	}
 }
 
+static u32 log_seq(struct log_group *log, int log_idx)
+{
+	return le32_to_cpu(log->ent[log_idx].seq);
+}
+
 /*
  * This function accepts two log entries, and uses the
  * sequence number to find the 'older' entry.
@@ -283,8 +290,10 @@ static void btt_debugfs_init(struct btt *btt)
  *
  * TODO The logic feels a bit kludge-y. make it better..
  */
-static int btt_log_get_old(struct log_entry *ent)
+static int btt_log_get_old(struct arena_info *a, struct log_group *log)
 {
+	int idx0 = a->log_index[0];
+	int idx1 = a->log_index[1];
 	int old;
 
 	/*
@@ -292,23 +301,23 @@ static int btt_log_get_old(struct log_entry *ent)
 	 * the next time, the following logic works out to put this
 	 * (next) entry into [1]
 	 */
-	if (ent[0].seq == 0) {
-		ent[0].seq = cpu_to_le32(1);
+	if (log_seq(log, idx0) == 0) {
+		log->ent[idx0].seq = cpu_to_le32(1);
 		return 0;
 	}
 
-	if (ent[0].seq == ent[1].seq)
+	if (log_seq(log, idx0) == log_seq(log, idx1))
 		return -EINVAL;
-	if (le32_to_cpu(ent[0].seq) + le32_to_cpu(ent[1].seq) > 5)
+	if (log_seq(log, idx0) + log_seq(log, idx1) > 5)
 		return -EINVAL;
 
-	if (le32_to_cpu(ent[0].seq) < le32_to_cpu(ent[1].seq)) {
-		if (le32_to_cpu(ent[1].seq) - le32_to_cpu(ent[0].seq) == 1)
+	if (log_seq(log, idx0) < log_seq(log, idx1)) {
+		if ((log_seq(log, idx1) - log_seq(log, idx0)) == 1)
 			old = 0;
 		else
 			old = 1;
 	} else {
-		if (le32_to_cpu(ent[0].seq) - le32_to_cpu(ent[1].seq) == 1)
+		if ((log_seq(log, idx0) - log_seq(log, idx1)) == 1)
 			old = 1;
 		else
 			old = 0;
@@ -328,17 +337,18 @@ static int btt_log_read(struct arena_info *arena, u32 lane,
 {
 	int ret;
 	int old_ent, ret_ent;
-	struct log_entry log[2];
+	struct log_group log;
 
-	ret = btt_log_read_pair(arena, lane, log);
+	ret = btt_log_group_read(arena, lane, &log);
 	if (ret)
 		return -EIO;
 
-	old_ent = btt_log_get_old(log);
+	old_ent = btt_log_get_old(arena, &log);
 	if (old_ent < 0 || old_ent > 1) {
 		dev_err(to_dev(arena),
 				"log corruption (%d): lane %d seq [%d, %d]\n",
-				old_ent, lane, log[0].seq, log[1].seq);
+				old_ent, lane, log.ent[arena->log_index[0]].seq,
+				log.ent[arena->log_index[1]].seq);
 		/* TODO set error state? */
 		return -EIO;
 	}
@@ -346,7 +356,7 @@ static int btt_log_read(struct arena_info *arena, u32 lane,
 	ret_ent = (old_flag ? old_ent : (1 - old_ent));
 
 	if (ent != NULL)
-		memcpy(ent, &log[ret_ent], LOG_ENT_SIZE);
+		memcpy(ent, &log.ent[arena->log_index[ret_ent]], LOG_ENT_SIZE);
 
 	return ret_ent;
 }
@@ -360,17 +370,13 @@ static int __btt_log_write(struct arena_info *arena, u32 lane,
 		u32 sub, struct log_entry *ent, unsigned long flags)
 {
 	int ret;
-	/*
-	 * Ignore the padding in log_entry for calculating log_half.
-	 * The entry is 'committed' when we write the sequence number,
-	 * and we want to ensure that that is the last thing written.
-	 * We don't bother writing the padding as that would be extra
-	 * media wear and write amplification
-	 */
-	unsigned int log_half = (LOG_ENT_SIZE - 2 * sizeof(u64)) / 2;
-	u64 ns_off = arena->logoff + (((2 * lane) + sub) * LOG_ENT_SIZE);
+	u32 group_slot = arena->log_index[sub];
+	unsigned int log_half = LOG_ENT_SIZE / 2;
 	void *src = ent;
+	u64 ns_off;
 
+	ns_off = arena->logoff + (lane * LOG_GRP_SIZE) +
+		(group_slot * LOG_ENT_SIZE);
 	/* split the 16B write into atomic, durable halves */
 	ret = arena_write_bytes(arena, ns_off, src, log_half, flags);
 	if (ret)
@@ -453,7 +459,7 @@ static int btt_log_init(struct arena_info *arena)
 {
 	size_t logsize = arena->info2off - arena->logoff;
 	size_t chunk_size = SZ_4K, offset = 0;
-	struct log_entry log;
+	struct log_entry ent;
 	void *zerobuf;
 	int ret;
 	u32 i;
@@ -485,11 +491,11 @@ static int btt_log_init(struct arena_info *arena)
 	}
 
 	for (i = 0; i < arena->nfree; i++) {
-		log.lba = cpu_to_le32(i);
-		log.old_map = cpu_to_le32(arena->external_nlba + i);
-		log.new_map = cpu_to_le32(arena->external_nlba + i);
-		log.seq = cpu_to_le32(LOG_SEQ_INIT);
-		ret = __btt_log_write(arena, i, 0, &log, 0);
+		ent.lba = cpu_to_le32(i);
+		ent.old_map = cpu_to_le32(arena->external_nlba + i);
+		ent.new_map = cpu_to_le32(arena->external_nlba + i);
+		ent.seq = cpu_to_le32(LOG_SEQ_INIT);
+		ret = __btt_log_write(arena, i, 0, &ent, 0);
 		if (ret)
 			goto free;
 	}
@@ -594,6 +600,123 @@ static int btt_freelist_init(struct arena_info *arena)
 	return 0;
 }
 
+static bool ent_is_padding(struct log_entry *ent)
+{
+	return (ent->lba == 0) && (ent->old_map == 0) && (ent->new_map == 0)
+		&& (ent->seq == 0);
+}
+
+/*
+ * Detecting valid log indices: We read a log group (see the comments in btt.h
+ * for a description of a 'log_group' and its 'slots'), and iterate over its
+ * four slots. We expect that a padding slot will be all-zeroes, and use this
+ * to detect a padding slot vs. an actual entry.
+ *
+ * If a log_group is in the initial state, i.e. hasn't been used since the
+ * creation of this BTT layout, it will have three of the four slots with
+ * zeroes. We skip over these log_groups for the detection of log_index. If
+ * all log_groups are in the initial state (i.e. the BTT has never been
+ * written to), it is safe to assume the 'new format' of log entries in slots
+ * (0, 1).
+ */
+static int log_set_indices(struct arena_info *arena)
+{
+	bool idx_set = false, initial_state = true;
+	int ret, log_index[2] = {-1, -1};
+	u32 i, j, next_idx = 0;
+	struct log_group log;
+	u32 pad_count = 0;
+
+	for (i = 0; i < arena->nfree; i++) {
+		ret = btt_log_group_read(arena, i, &log);
+		if (ret < 0)
+			return ret;
+
+		for (j = 0; j < 4; j++) {
+			if (!idx_set) {
+				if (ent_is_padding(&log.ent[j])) {
+					pad_count++;
+					continue;
+				} else {
+					/* Skip if index has been recorded */
+					if ((next_idx == 1) &&
+							(j == log_index[0]))
+						continue;
+					/* valid entry, record index */
+					log_index[next_idx] = j;
+					next_idx++;
+				}
+				if (next_idx == 2) {
+					/* two valid entries found */
+					idx_set = true;
+				} else if (next_idx > 2) {
+					/* too many valid indices */
+					return -ENXIO;
+				}
+			} else {
+				/*
+				 * once the indices have been set, just verify
+				 * that all subsequent log groups are either in
+				 * their initial state or follow the same
+				 * indices.
+				 */
+				if (j == log_index[0]) {
+					/* entry must be 'valid' */
+					if (ent_is_padding(&log.ent[j]))
+						return -ENXIO;
+				} else if (j == log_index[1]) {
+					;
+					/*
+					 * log_index[1] can be padding if the
+					 * lane never got used and it is still
+					 * in the initial state (three 'padding'
+					 * entries)
+					 */
+				} else {
+					/* entry must be invalid (padding) */
+					if (!ent_is_padding(&log.ent[j]))
+						return -ENXIO;
+				}
+			}
+		}
+		/*
+		 * If any of the log_groups have more than one valid,
+		 * non-padding entry, then we are no longer in the
+		 * initial_state
+		 */
+		if (pad_count < 3)
+			initial_state = false;
+		pad_count = 0;
+	}
+
+	if (!initial_state && !idx_set)
+		return -ENXIO;
+
+	/*
+	 * If all the entries in the log were in the initial state,
+	 * assume new padding scheme
+	 */
+	if (initial_state)
+		log_index[1] = 1;
+
+	/*
+	 * Only allow the known permutations of log/padding indices,
+	 * i.e. (0, 1), and (0, 2)
+	 */
+	if ((log_index[0] == 0) && ((log_index[1] == 1) || (log_index[1] == 2)))
+		; /* known index possibilities */
+	else {
+		dev_err(to_dev(arena), "Found an unknown padding scheme\n");
+		return -ENXIO;
+	}
+
+	arena->log_index[0] = log_index[0];
+	arena->log_index[1] = log_index[1];
+	dev_dbg(to_dev(arena), "log_index_0 = %d\n", log_index[0]);
+	dev_dbg(to_dev(arena), "log_index_1 = %d\n", log_index[1]);
+	return 0;
+}
+
 static int btt_rtt_init(struct arena_info *arena)
 {
 	arena->rtt = kcalloc(arena->nfree, sizeof(u32), GFP_KERNEL);
@@ -650,8 +773,7 @@ static struct arena_info *alloc_arena(struct btt *btt, size_t size,
 	available -= 2 * BTT_PG_SIZE;
 
 	/* The log takes a fixed amount of space based on nfree */
-	logsize = roundup(2 * arena->nfree * sizeof(struct log_entry),
-			BTT_PG_SIZE);
+	logsize = roundup(arena->nfree * LOG_GRP_SIZE, BTT_PG_SIZE);
 	available -= logsize;
 
 	/* Calculate optimal split between map and data area */
@@ -668,6 +790,10 @@ static struct arena_info *alloc_arena(struct btt *btt, size_t size,
 	arena->mapoff = arena->dataoff + datasize;
 	arena->logoff = arena->mapoff + mapsize;
 	arena->info2off = arena->logoff + logsize;
+
+	/* Default log indices are (0,1) */
+	arena->log_index[0] = 0;
+	arena->log_index[1] = 1;
 	return arena;
 }
 
@@ -758,6 +884,13 @@ static int discover_arenas(struct btt *btt)
 		arena->external_lba_start = cur_nlba;
 		parse_arena_meta(arena, super, cur_off);
 
+		ret = log_set_indices(arena);
+		if (ret) {
+			dev_err(to_dev(arena),
+					"Unable to deduce log/padding indices\n");
+			goto out;
+		}
+
 		mutex_init(&arena->err_lock);
 		ret = btt_freelist_init(arena);
 		if (ret)
diff --git a/drivers/nvdimm/btt.h b/drivers/nvdimm/btt.h
index 578c2057524d..db3cb6d4d0d4 100644
--- a/drivers/nvdimm/btt.h
+++ b/drivers/nvdimm/btt.h
@@ -27,6 +27,7 @@
 #define MAP_ERR_MASK (1 << MAP_ERR_SHIFT)
 #define MAP_LBA_MASK (~((1 << MAP_TRIM_SHIFT) | (1 << MAP_ERR_SHIFT)))
 #define MAP_ENT_NORMAL 0xC0000000
+#define LOG_GRP_SIZE sizeof(struct log_group)
 #define LOG_ENT_SIZE sizeof(struct log_entry)
 #define ARENA_MIN_SIZE (1UL << 24)	/* 16 MB */
 #define ARENA_MAX_SIZE (1ULL << 39)	/* 512 GB */
@@ -50,12 +51,52 @@ enum btt_init_state {
 	INIT_READY
 };
 
+/*
+ * A log group represents one log 'lane', and consists of four log entries.
+ * Two of the four entries are valid entries, and the remaining two are
+ * padding. Due to an old bug in the padding location, we need to perform a
+ * test to determine the padding scheme being used, and use that scheme
+ * thereafter.
+ *
+ * In kernels prior to 4.15, 'log group' would have actual log entries at
+ * indices (0, 2) and padding at indices (1, 3), whereas the correct/updated
+ * format has log entries at indices (0, 1) and padding at indices (2, 3).
+ *
+ * Old (pre 4.15) format:
+ * +-----------------+-----------------+
+ * |      ent[0]     |      ent[1]     |
+ * |       16B       |       16B       |
+ * | lba/old/new/seq |       pad       |
+ * +-----------------------------------+
+ * |      ent[2]     |      ent[3]     |
+ * |       16B       |       16B       |
+ * | lba/old/new/seq |       pad       |
+ * +-----------------+-----------------+
+ *
+ * New format:
+ * +-----------------+-----------------+
+ * |      ent[0]     |      ent[1]     |
+ * |       16B       |       16B       |
+ * | lba/old/new/seq | lba/old/new/seq |
+ * +-----------------------------------+
+ * |      ent[2]     |      ent[3]     |
+ * |       16B       |       16B       |
+ * |       pad       |       pad       |
+ * +-----------------+-----------------+
+ *
+ * We detect during start-up which format is in use, and set
+ * arena->log_index[(0, 1)] with the detected format.
+ */
+
 struct log_entry {
 	__le32 lba;
 	__le32 old_map;
 	__le32 new_map;
 	__le32 seq;
-	__le64 padding[2];
+};
+
+struct log_group {
+	struct log_entry ent[4];
 };
 
 struct btt_sb {
@@ -125,6 +166,8 @@ struct aligned_lock {
  * @list:		List head for list of arenas
  * @debugfs_dir:	Debugfs dentry
  * @flags:		Arena flags - may signify error states.
+ * @err_lock:		Mutex for synchronizing error clearing.
+ * @log_index:		Indices of the valid log entries in a log_group
  *
  * arena_info is a per-arena handle. Once an arena is narrowed down for an
  * IO, this struct is passed around for the duration of the IO.
@@ -157,6 +200,7 @@ struct arena_info {
 	/* Arena flags */
 	u32 flags;
 	struct mutex err_lock;
+	int log_index[2];
 };
 
 /**
@@ -176,6 +220,7 @@ struct arena_info {
 * @init_lock:		Mutex used for the BTT initialization
 * @init_state:	Flag describing the initialization state for the BTT
 * @num_arenas:	Number of arenas in the BTT instance
+ * @phys_bb:		Pointer to the namespace's badblocks structure
 */
 struct btt {
 	struct gendisk *btt_disk;
diff --git a/drivers/nvdimm/pfn_devs.c b/drivers/nvdimm/pfn_devs.c
index 65cc171c721d..2adada1a5855 100644
--- a/drivers/nvdimm/pfn_devs.c
+++ b/drivers/nvdimm/pfn_devs.c
@@ -364,9 +364,9 @@ struct device *nd_pfn_create(struct nd_region *nd_region)
 int nd_pfn_validate(struct nd_pfn *nd_pfn, const char *sig)
 {
 	u64 checksum, offset;
-	unsigned long align;
 	enum nd_pfn_mode mode;
 	struct nd_namespace_io *nsio;
+	unsigned long align, start_pad;
 	struct nd_pfn_sb *pfn_sb = nd_pfn->pfn_sb;
 	struct nd_namespace_common *ndns = nd_pfn->ndns;
 	const u8 *parent_uuid = nd_dev_to_uuid(&ndns->dev);
@@ -410,6 +410,7 @@ int nd_pfn_validate(struct nd_pfn *nd_pfn, const char *sig)
 
 	align = le32_to_cpu(pfn_sb->align);
 	offset = le64_to_cpu(pfn_sb->dataoff);
+	start_pad = le32_to_cpu(pfn_sb->start_pad);
 	if (align == 0)
 		align = 1UL << ilog2(offset);
 	mode = le32_to_cpu(pfn_sb->mode);
@@ -468,7 +469,7 @@ int nd_pfn_validate(struct nd_pfn *nd_pfn, const char *sig)
 		return -EBUSY;
 	}
 
-	if ((align && !IS_ALIGNED(offset, align))
+	if ((align && !IS_ALIGNED(nsio->res.start + offset + start_pad, align))
 			|| !IS_ALIGNED(offset, PAGE_SIZE)) {
 		dev_err(&nd_pfn->dev,
 				"bad offset: %#llx dax disabled align: %#lx\n",
@@ -582,6 +583,12 @@ static struct vmem_altmap *__nvdimm_setup_pfn(struct nd_pfn *nd_pfn,
 	return altmap;
 }
 
+static u64 phys_pmem_align_down(struct nd_pfn *nd_pfn, u64 phys)
+{
+	return min_t(u64, PHYS_SECTION_ALIGN_DOWN(phys),
+			ALIGN_DOWN(phys, nd_pfn->align));
+}
+
 static int nd_pfn_init(struct nd_pfn *nd_pfn)
 {
 	u32 dax_label_reserve = is_nd_dax(&nd_pfn->dev) ? SZ_128K : 0;
@@ -637,13 +644,16 @@ static int nd_pfn_init(struct nd_pfn *nd_pfn)
 	start = nsio->res.start;
 	size = PHYS_SECTION_ALIGN_UP(start + size) - start;
 	if (region_intersects(start, size, IORESOURCE_SYSTEM_RAM,
-				IORES_DESC_NONE) == REGION_MIXED) {
+				IORES_DESC_NONE) == REGION_MIXED
+			|| !IS_ALIGNED(start + resource_size(&nsio->res),
+				nd_pfn->align)) {
 		size = resource_size(&nsio->res);
-		end_trunc = start + size - PHYS_SECTION_ALIGN_DOWN(start + size);
+		end_trunc = start + size - phys_pmem_align_down(nd_pfn,
+				start + size);
 	}
 
 	if (start_pad + end_trunc)
-		dev_info(&nd_pfn->dev, "%s section collision, truncate %d bytes\n",
+		dev_info(&nd_pfn->dev, "%s alignment collision, truncate %d bytes\n",
 				dev_name(&ndns->dev), start_pad + end_trunc);
 
 	/*
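
A few runnable userspace sketches follow to illustrate the mechanisms in the hunks above; in each one, any name that does not appear in the diff is invented for illustration.

The nfit/core.c change pairs a dev_set_drvdata() at DIMM-setup time with a dev_set_drvdata(..., NULL) when the notify handler is removed, so a late health event cannot chase a stale pointer. A minimal stand-alone model of that lifecycle (struct device and the drvdata helpers here are simplified stand-ins for the kernel's):

#include <stdio.h>

/*
 * Userspace analog of the nfit drvdata fix: the notify path looks up
 * per-device context via drvdata, and teardown clears the pointer so a
 * late notification sees NULL instead of a stale context.
 */
struct device { void *drvdata; };

static void *dev_get_drvdata(struct device *dev) { return dev->drvdata; }
static void dev_set_drvdata(struct device *dev, void *d) { dev->drvdata = d; }

static void nvdimm_notify(struct device *dev)
{
	if (!dev_get_drvdata(dev)) {
		puts("notify after teardown: ignored");
		return;
	}
	puts("notify: refresh health flags");
}

int main(void)
{
	struct device dimm = { 0 };
	int ctx = 42;

	dev_set_drvdata(&dimm, &ctx);	/* as in acpi_nfit_add_dimm() */
	nvdimm_notify(&dimm);		/* event arrives: handled */
	dev_set_drvdata(&dimm, NULL);	/* as in shutdown_dimm_notify() */
	nvdimm_notify(&dimm);		/* late event: safely ignored */
	return 0;
}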
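The core of the btt change is addressing log entries as slots inside a fixed 64-byte log group rather than as a flat pair, with arena->log_index[] naming the two live slots. A sketch of the offset arithmetic from __btt_log_write(), with structs mirroring the patched btt.h (userspace types, host-endian for brevity):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Mirrors the patched btt.h: a 16B entry, four slots per 64B group */
struct log_entry {
	uint32_t lba;
	uint32_t old_map;
	uint32_t new_map;
	uint32_t seq;
};

struct log_group {
	struct log_entry ent[4];
};

#define LOG_ENT_SIZE sizeof(struct log_entry)
#define LOG_GRP_SIZE sizeof(struct log_group)

/*
 * Media offset of log slot 'sub' (0 or 1) for a lane, as in the patched
 * __btt_log_write(): the lane selects a group, log_index[] selects the
 * slot within it. Old kernels effectively had log_index = {0, 2}.
 */
static uint64_t log_slot_off(uint64_t logoff, uint32_t lane, int sub,
		const int log_index[2])
{
	return logoff + (uint64_t)lane * LOG_GRP_SIZE +
		(uint64_t)log_index[sub] * LOG_ENT_SIZE;
}

int main(void)
{
	const int new_fmt[2] = { 0, 1 };	/* 4.15+ layout */
	const int old_fmt[2] = { 0, 2 };	/* pre-4.15 layout */

	assert(LOG_ENT_SIZE == 16 && LOG_GRP_SIZE == 64);

	/* lane 3, second slot: the two formats land 16B apart */
	printf("new: %llu\n", (unsigned long long)
			log_slot_off(0, 3, 1, new_fmt));	/* 208 */
	printf("old: %llu\n", (unsigned long long)
			log_slot_off(0, 3, 1, old_fmt));	/* 224 */
	return 0;
}

Seen this way, the pre-4.15 bug is nothing more than a different log_index permutation, (0, 2) instead of (0, 1), which shifts the second live slot by 16 bytes on the media.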
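btt_log_get_old() depends on the BTT sequence-number rules: valid seq values cycle 1 -> 2 -> 3 -> 1, so two live entries always differ, sum to at most 5, and the newer one is the one that is the cyclic successor of the other. A host-endian sketch of just that comparison:

#include <stdio.h>

/*
 * Return the index (0 or 1) of the older of two sequence numbers, or -1
 * on corruption. Mirrors the checks in btt_log_get_old(): seq values
 * cycle 1 -> 2 -> 3 -> 1; equal or out-of-range pairs are invalid.
 */
static int older_of(unsigned seq0, unsigned seq1)
{
	if (seq0 == seq1 || seq0 + seq1 > 5)
		return -1;
	if (seq0 < seq1)
		return (seq1 - seq0 == 1) ? 0 : 1;
	return (seq0 - seq1 == 1) ? 1 : 0;
}

int main(void)
{
	/* (3, 1): 1 is 3's successor after the wrap, so 3 is older */
	printf("%d\n", older_of(3, 1));	/* 0 */
	printf("%d\n", older_of(1, 2));	/* 0 */
	printf("%d\n", older_of(2, 2));	/* -1: corrupt */
	return 0;
}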
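log_set_indices() recovers the slot permutation from the media by treating all-zero slots as padding. The condensed model below shows only the detection idea; the kernel version additionally verifies every group against the detected indices, tolerates groups still in their initial one-valid-entry state, and rejects any permutation other than (0, 1) or (0, 2):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct log_entry { uint32_t lba, old_map, new_map, seq; };
struct log_group { struct log_entry ent[4]; };

static bool is_pad(const struct log_entry *e)
{
	return !e->lba && !e->old_map && !e->new_map && !e->seq;
}

/*
 * Condensed model of log_set_indices(): find the first group that has
 * two non-padding slots and report them. Groups still in the initial
 * state (one valid slot, three padding) are skipped; an all-initial
 * BTT defaults to the new (0, 1) layout.
 */
static void detect(const struct log_group *grp, int n, int idx[2])
{
	for (int i = 0; i < n; i++) {
		int found = 0, slot[2] = { -1, -1 };

		for (int j = 0; j < 4; j++)
			if (!is_pad(&grp[i].ent[j]) && found < 2)
				slot[found++] = j;
		if (found == 2) {
			idx[0] = slot[0];
			idx[1] = slot[1];
			return;
		}
	}
	idx[0] = 0;		/* initial state: assume new format */
	idx[1] = 1;
}

int main(void)
{
	struct log_group g[1];
	int idx[2];

	memset(g, 0, sizeof(g));
	g[0].ent[0].seq = 1;	/* valid entry in slot 0 */
	g[0].ent[2].seq = 2;	/* valid entry in slot 2: pre-4.15 layout */
	detect(g, 1, idx);
	printf("log_index = (%d, %d)\n", idx[0], idx[1]);	/* (0, 2) */
	return 0;
}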
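On the pfn side, phys_pmem_align_down() truncates a namespace end address to whichever is lower of the memory-hotplug section boundary and the namespace's own alignment, i.e. whichever truncates more. A sketch with a hard-coded 128M section size (the x86_64 value; an assumption here, since the kernel derives it per-arch):

#include <stdint.h>
#include <stdio.h>

#define SECTION_SIZE	(128ULL << 20)	/* assumed: x86_64 section size */
#define ALIGN_DOWN(x, a)	((x) & ~((a) - 1))	/* a: power of two */

/*
 * Mirrors phys_pmem_align_down(): clamp an end address down to both the
 * section boundary and the namespace alignment, keeping the lower result.
 */
static uint64_t phys_pmem_align_down(uint64_t phys, uint64_t align)
{
	uint64_t a = ALIGN_DOWN(phys, SECTION_SIZE);
	uint64_t b = ALIGN_DOWN(phys, align);

	return a < b ? a : b;
}

int main(void)
{
	/* end of a namespace at 4G + 130M, with a 2M device alignment */
	uint64_t end = (4ULL << 30) + (130ULL << 20);

	/* prints 0x108000000: the 128M section boundary truncates more */
	printf("%#llx\n", (unsigned long long)
			phys_pmem_align_down(end, 2ULL << 20));
	return 0;
}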