Diffstat (limited to 'lib')
-rw-r--r--  lib/Kconfig                        43
-rw-r--r--  lib/Kconfig.debug                  19
-rw-r--r--  lib/Makefile                        3
-rw-r--r--  lib/dma-debug.c                    65
-rw-r--r--  lib/dma-direct.c                   32
-rw-r--r--  lib/dma-noncoherent.c             102
-rw-r--r--  lib/errseq.c                       23
-rw-r--r--  lib/find_bit_benchmark.c            7
-rw-r--r--  lib/iommu-common.c                267
-rw-r--r--  lib/iommu-helper.c                 14
-rw-r--r--  lib/iov_iter.c                     65
-rw-r--r--  lib/kobject.c                      11
-rw-r--r--  lib/radix-tree.c                   10
-rw-r--r--  lib/reed_solomon/decode_rs.c       34
-rw-r--r--  lib/reed_solomon/encode_rs.c       15
-rw-r--r--  lib/reed_solomon/reed_solomon.c   240
-rw-r--r--  lib/sbitmap.c                     113
-rw-r--r--  lib/swiotlb.c                      15
-rw-r--r--  lib/test_bitmap.c                  21
-rw-r--r--  lib/textsearch.c                   40
-rw-r--r--  lib/vsprintf.c                     26
21 files changed, 617 insertions, 548 deletions
diff --git a/lib/Kconfig b/lib/Kconfig
index 5fe577673b98..7a913937888b 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -429,15 +429,50 @@ config SGL_ALLOC
bool
default n
+config NEED_SG_DMA_LENGTH
+ bool
+
+config NEED_DMA_MAP_STATE
+ bool
+
+config ARCH_DMA_ADDR_T_64BIT
+ def_bool 64BIT || PHYS_ADDR_T_64BIT
+
+config IOMMU_HELPER
+ bool
+
+config ARCH_HAS_SYNC_DMA_FOR_DEVICE
+ bool
+
+config ARCH_HAS_SYNC_DMA_FOR_CPU
+ bool
+ select NEED_DMA_MAP_STATE
+
config DMA_DIRECT_OPS
bool
- depends on HAS_DMA && (!64BIT || ARCH_DMA_ADDR_T_64BIT)
- default n
+ depends on HAS_DMA
+
+config DMA_NONCOHERENT_OPS
+ bool
+ depends on HAS_DMA
+ select DMA_DIRECT_OPS
+
+config DMA_NONCOHERENT_MMAP
+ bool
+ depends on DMA_NONCOHERENT_OPS
+
+config DMA_NONCOHERENT_CACHE_SYNC
+ bool
+ depends on DMA_NONCOHERENT_OPS
config DMA_VIRT_OPS
bool
- depends on HAS_DMA && (!64BIT || ARCH_DMA_ADDR_T_64BIT)
- default n
+ depends on HAS_DMA
+
+config SWIOTLB
+ bool
+ select DMA_DIRECT_OPS
+ select NEED_DMA_MAP_STATE
config CHECK_SIGNATURE
bool
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index c40c7b734cd1..76555479ae36 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -1634,7 +1634,7 @@ config PROVIDE_OHCI1394_DMA_INIT
config DMA_API_DEBUG
bool "Enable debugging of DMA-API usage"
- depends on HAVE_DMA_API_DEBUG
+ select NEED_DMA_MAP_STATE
help
Enable this option to debug the use of the DMA API by device drivers.
With this option you will be able to detect common bugs in device
@@ -1651,6 +1651,23 @@ config DMA_API_DEBUG
If unsure, say N.
+config DMA_API_DEBUG_SG
+ bool "Debug DMA scatter-gather usage"
+ default y
+ depends on DMA_API_DEBUG
+ help
+ Perform extra checking that callers of dma_map_sg() have respected the
+ appropriate segment length/boundary limits for the given device when
+ preparing DMA scatterlists.
+
+ This is particularly likely to have been overlooked in cases where the
+ dma_map_sg() API is used for general bulk mapping of pages rather than
+ preparing literal scatter-gather descriptors, where there is a risk of
+ unexpected behaviour from DMA API implementations if the scatterlist
+ is technically out-of-spec.
+
+ If unsure, say N.
+
menuconfig RUNTIME_TESTING_MENU
bool "Runtime Testing"
def_bool y
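
The segment checks enabled by DMA_API_DEBUG_SG compare each mapped scatterlist element against the limits a driver advertises through its device's dma_parms. For illustration only (not part of this series; the device and the limits are hypothetical), a PCI driver would declare those limits at probe time roughly like this:

    #include <linux/dma-mapping.h>
    #include <linux/errno.h>
    #include <linux/pci.h>
    #include <linux/sizes.h>

    /* Hypothetical hardware limits: 64K per segment, no segment may cross 4G */
    static int example_declare_dma_limits(struct pci_dev *pdev)
    {
            if (dma_set_max_seg_size(&pdev->dev, SZ_64K))
                    return -EIO;
            if (dma_set_seg_boundary(&pdev->dev, DMA_BIT_MASK(32)))
                    return -EIO;
            return 0;
    }

Without such limits being set, the new debug check only has the defaults to compare against.
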
diff --git a/lib/Makefile b/lib/Makefile
index ce20696d5a92..9f18c8152281 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -30,6 +30,7 @@ lib-$(CONFIG_PRINTK) += dump_stack.o
lib-$(CONFIG_MMU) += ioremap.o
lib-$(CONFIG_SMP) += cpumask.o
lib-$(CONFIG_DMA_DIRECT_OPS) += dma-direct.o
+lib-$(CONFIG_DMA_NONCOHERENT_OPS) += dma-noncoherent.o
lib-$(CONFIG_DMA_VIRT_OPS) += dma-virt.o
lib-y += kobject.o klist.o
@@ -147,7 +148,7 @@ obj-$(CONFIG_AUDIT_GENERIC) += audit.o
obj-$(CONFIG_AUDIT_COMPAT_GENERIC) += compat_audit.o
obj-$(CONFIG_SWIOTLB) += swiotlb.o
-obj-$(CONFIG_IOMMU_HELPER) += iommu-helper.o iommu-common.o
+obj-$(CONFIG_IOMMU_HELPER) += iommu-helper.o
obj-$(CONFIG_FAULT_INJECTION) += fault-inject.o
obj-$(CONFIG_NOTIFIER_ERROR_INJECTION) += notifier-error-inject.o
obj-$(CONFIG_PM_NOTIFIER_ERROR_INJECT) += pm-notifier-error-inject.o
diff --git a/lib/dma-debug.c b/lib/dma-debug.c
index 7f5cdc1e6b29..c007d25bee09 100644
--- a/lib/dma-debug.c
+++ b/lib/dma-debug.c
@@ -41,6 +41,11 @@
#define HASH_FN_SHIFT 13
#define HASH_FN_MASK (HASH_SIZE - 1)
+/* allow architectures to override this if absolutely required */
+#ifndef PREALLOC_DMA_DEBUG_ENTRIES
+#define PREALLOC_DMA_DEBUG_ENTRIES (1 << 16)
+#endif
+
enum {
dma_debug_single,
dma_debug_page,
@@ -127,7 +132,7 @@ static u32 min_free_entries;
static u32 nr_total_entries;
/* number of preallocated entries requested by kernel cmdline */
-static u32 req_entries;
+static u32 nr_prealloc_entries = PREALLOC_DMA_DEBUG_ENTRIES;
/* debugfs dentry's for the stuff above */
static struct dentry *dma_debug_dent __read_mostly;
@@ -439,7 +444,6 @@ void debug_dma_dump_mappings(struct device *dev)
spin_unlock_irqrestore(&bucket->lock, flags);
}
}
-EXPORT_SYMBOL(debug_dma_dump_mappings);
/*
* For each mapping (initial cacheline in the case of
@@ -748,7 +752,6 @@ int dma_debug_resize_entries(u32 num_entries)
return ret;
}
-EXPORT_SYMBOL(dma_debug_resize_entries);
/*
* DMA-API debugging init code
@@ -1004,10 +1007,7 @@ void dma_debug_add_bus(struct bus_type *bus)
bus_register_notifier(bus, nb);
}
-/*
- * Let the architectures decide how many entries should be preallocated.
- */
-void dma_debug_init(u32 num_entries)
+static int dma_debug_init(void)
{
int i;
@@ -1015,7 +1015,7 @@ void dma_debug_init(u32 num_entries)
* called to set dma_debug_initialized
*/
if (global_disable)
- return;
+ return 0;
for (i = 0; i < HASH_SIZE; ++i) {
INIT_LIST_HEAD(&dma_entry_hash[i].list);
@@ -1026,17 +1026,14 @@ void dma_debug_init(u32 num_entries)
pr_err("DMA-API: error creating debugfs entries - disabling\n");
global_disable = true;
- return;
+ return 0;
}
- if (req_entries)
- num_entries = req_entries;
-
- if (prealloc_memory(num_entries) != 0) {
+ if (prealloc_memory(nr_prealloc_entries) != 0) {
pr_err("DMA-API: debugging out of memory error - disabled\n");
global_disable = true;
- return;
+ return 0;
}
nr_total_entries = num_free_entries;
@@ -1044,7 +1041,9 @@ void dma_debug_init(u32 num_entries)
dma_debug_initialized = true;
pr_info("DMA-API: debugging enabled by kernel config\n");
+ return 0;
}
+core_initcall(dma_debug_init);
static __init int dma_debug_cmdline(char *str)
{
@@ -1061,16 +1060,10 @@ static __init int dma_debug_cmdline(char *str)
static __init int dma_debug_entries_cmdline(char *str)
{
- int res;
-
if (!str)
return -EINVAL;
-
- res = get_option(&str, &req_entries);
-
- if (!res)
- req_entries = 0;
-
+ if (!get_option(&str, &nr_prealloc_entries))
+ nr_prealloc_entries = PREALLOC_DMA_DEBUG_ENTRIES;
return 0;
}
@@ -1293,6 +1286,32 @@ out:
put_hash_bucket(bucket, &flags);
}
+static void check_sg_segment(struct device *dev, struct scatterlist *sg)
+{
+#ifdef CONFIG_DMA_API_DEBUG_SG
+ unsigned int max_seg = dma_get_max_seg_size(dev);
+ u64 start, end, boundary = dma_get_seg_boundary(dev);
+
+ /*
+ * Either the driver forgot to set dma_parms appropriately, or
+ * whoever generated the list forgot to check them.
+ */
+ if (sg->length > max_seg)
+ err_printk(dev, NULL, "DMA-API: mapping sg segment longer than device claims to support [len=%u] [max=%u]\n",
+ sg->length, max_seg);
+ /*
+ * In some cases this could potentially be the DMA API
+ * implementation's fault, but it would usually imply that
+ * the scatterlist was built inappropriately to begin with.
+ */
+ start = sg_dma_address(sg);
+ end = start + sg_dma_len(sg) - 1;
+ if ((start ^ end) & ~boundary)
+ err_printk(dev, NULL, "DMA-API: mapping sg segment across boundary [start=0x%016llx] [end=0x%016llx] [boundary=0x%016llx]\n",
+ start, end, boundary);
+#endif
+}
+
void debug_dma_map_page(struct device *dev, struct page *page, size_t offset,
size_t size, int direction, dma_addr_t dma_addr,
bool map_single)
@@ -1423,6 +1442,8 @@ void debug_dma_map_sg(struct device *dev, struct scatterlist *sg,
check_for_illegal_area(dev, sg_virt(s), sg_dma_len(s));
}
+ check_sg_segment(dev, s);
+
add_dma_entry(entry);
}
}
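
The boundary test in check_sg_segment() relies on the fact that a segment crosses a power-of-two boundary exactly when its first and last byte addresses differ in a bit above the boundary mask. A standalone sketch of the same test (plain userspace C, for illustration only):

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    /* True if the segment [start, end] crosses a (mask + 1) sized boundary */
    static bool crosses_boundary(uint64_t start, uint64_t end, uint64_t mask)
    {
            return ((start ^ end) & ~mask) != 0;
    }

    int main(void)
    {
            uint64_t mask = 0xffff;                                   /* 64K boundary */

            printf("%d\n", crosses_boundary(0xfff0, 0x10010, mask));  /* 1: crosses 64K */
            printf("%d\n", crosses_boundary(0x10000, 0x1ffff, mask)); /* 0: stays within */
            return 0;
    }
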
diff --git a/lib/dma-direct.c b/lib/dma-direct.c
index c0bba30fef0a..8be8106270c2 100644
--- a/lib/dma-direct.c
+++ b/lib/dma-direct.c
@@ -34,6 +34,13 @@ check_addr(struct device *dev, dma_addr_t dma_addr, size_t size,
const char *caller)
{
if (unlikely(dev && !dma_capable(dev, dma_addr, size))) {
+ if (!dev->dma_mask) {
+ dev_err(dev,
+ "%s: call on device without dma_mask\n",
+ caller);
+ return false;
+ }
+
if (*dev->dma_mask >= DMA_BIT_MASK(32)) {
dev_err(dev,
"%s: overflow %pad+%zu of device mask %llx\n",
@@ -84,7 +91,15 @@ again:
__free_pages(page, page_order);
page = NULL;
- if (dev->coherent_dma_mask < DMA_BIT_MASK(32) &&
+ if (IS_ENABLED(CONFIG_ZONE_DMA32) &&
+ dev->coherent_dma_mask < DMA_BIT_MASK(64) &&
+ !(gfp & (GFP_DMA32 | GFP_DMA))) {
+ gfp |= GFP_DMA32;
+ goto again;
+ }
+
+ if (IS_ENABLED(CONFIG_ZONE_DMA) &&
+ dev->coherent_dma_mask < DMA_BIT_MASK(32) &&
!(gfp & GFP_DMA)) {
gfp = (gfp & ~GFP_DMA32) | GFP_DMA;
goto again;
@@ -120,7 +135,7 @@ void dma_direct_free(struct device *dev, size_t size, void *cpu_addr,
free_pages((unsigned long)cpu_addr, page_order);
}
-static dma_addr_t dma_direct_map_page(struct device *dev, struct page *page,
+dma_addr_t dma_direct_map_page(struct device *dev, struct page *page,
unsigned long offset, size_t size, enum dma_data_direction dir,
unsigned long attrs)
{
@@ -131,8 +146,8 @@ static dma_addr_t dma_direct_map_page(struct device *dev, struct page *page,
return dma_addr;
}
-static int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl,
- int nents, enum dma_data_direction dir, unsigned long attrs)
+int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents,
+ enum dma_data_direction dir, unsigned long attrs)
{
int i;
struct scatterlist *sg;
@@ -164,10 +179,16 @@ int dma_direct_supported(struct device *dev, u64 mask)
if (mask < DMA_BIT_MASK(32))
return 0;
#endif
+ /*
+ * Various PCI/PCIe bridges have broken support for > 32bit DMA even
+ * if the device itself might support it.
+ */
+ if (dev->dma_32bit_limit && mask > DMA_BIT_MASK(32))
+ return 0;
return 1;
}
-static int dma_direct_mapping_error(struct device *dev, dma_addr_t dma_addr)
+int dma_direct_mapping_error(struct device *dev, dma_addr_t dma_addr)
{
return dma_addr == DIRECT_MAPPING_ERROR;
}
@@ -179,6 +200,5 @@ const struct dma_map_ops dma_direct_ops = {
.map_sg = dma_direct_map_sg,
.dma_supported = dma_direct_supported,
.mapping_error = dma_direct_mapping_error,
- .is_phys = 1,
};
EXPORT_SYMBOL(dma_direct_ops);
diff --git a/lib/dma-noncoherent.c b/lib/dma-noncoherent.c
new file mode 100644
index 000000000000..79e9a757387f
--- /dev/null
+++ b/lib/dma-noncoherent.c
@@ -0,0 +1,102 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2018 Christoph Hellwig.
+ *
+ * DMA operations that map physical memory directly without providing cache
+ * coherence.
+ */
+#include <linux/export.h>
+#include <linux/mm.h>
+#include <linux/dma-direct.h>
+#include <linux/dma-noncoherent.h>
+#include <linux/scatterlist.h>
+
+static void dma_noncoherent_sync_single_for_device(struct device *dev,
+ dma_addr_t addr, size_t size, enum dma_data_direction dir)
+{
+ arch_sync_dma_for_device(dev, dma_to_phys(dev, addr), size, dir);
+}
+
+static void dma_noncoherent_sync_sg_for_device(struct device *dev,
+ struct scatterlist *sgl, int nents, enum dma_data_direction dir)
+{
+ struct scatterlist *sg;
+ int i;
+
+ for_each_sg(sgl, sg, nents, i)
+ arch_sync_dma_for_device(dev, sg_phys(sg), sg->length, dir);
+}
+
+static dma_addr_t dma_noncoherent_map_page(struct device *dev, struct page *page,
+ unsigned long offset, size_t size, enum dma_data_direction dir,
+ unsigned long attrs)
+{
+ dma_addr_t addr;
+
+ addr = dma_direct_map_page(dev, page, offset, size, dir, attrs);
+ if (!dma_mapping_error(dev, addr) && !(attrs & DMA_ATTR_SKIP_CPU_SYNC))
+ arch_sync_dma_for_device(dev, page_to_phys(page) + offset,
+ size, dir);
+ return addr;
+}
+
+static int dma_noncoherent_map_sg(struct device *dev, struct scatterlist *sgl,
+ int nents, enum dma_data_direction dir, unsigned long attrs)
+{
+ nents = dma_direct_map_sg(dev, sgl, nents, dir, attrs);
+ if (nents > 0 && !(attrs & DMA_ATTR_SKIP_CPU_SYNC))
+ dma_noncoherent_sync_sg_for_device(dev, sgl, nents, dir);
+ return nents;
+}
+
+#ifdef CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU
+static void dma_noncoherent_sync_single_for_cpu(struct device *dev,
+ dma_addr_t addr, size_t size, enum dma_data_direction dir)
+{
+ arch_sync_dma_for_cpu(dev, dma_to_phys(dev, addr), size, dir);
+}
+
+static void dma_noncoherent_sync_sg_for_cpu(struct device *dev,
+ struct scatterlist *sgl, int nents, enum dma_data_direction dir)
+{
+ struct scatterlist *sg;
+ int i;
+
+ for_each_sg(sgl, sg, nents, i)
+ arch_sync_dma_for_cpu(dev, sg_phys(sg), sg->length, dir);
+}
+
+static void dma_noncoherent_unmap_page(struct device *dev, dma_addr_t addr,
+ size_t size, enum dma_data_direction dir, unsigned long attrs)
+{
+ if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
+ dma_noncoherent_sync_single_for_cpu(dev, addr, size, dir);
+}
+
+static void dma_noncoherent_unmap_sg(struct device *dev, struct scatterlist *sgl,
+ int nents, enum dma_data_direction dir, unsigned long attrs)
+{
+ if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
+ dma_noncoherent_sync_sg_for_cpu(dev, sgl, nents, dir);
+}
+#endif
+
+const struct dma_map_ops dma_noncoherent_ops = {
+ .alloc = arch_dma_alloc,
+ .free = arch_dma_free,
+ .mmap = arch_dma_mmap,
+ .sync_single_for_device = dma_noncoherent_sync_single_for_device,
+ .sync_sg_for_device = dma_noncoherent_sync_sg_for_device,
+ .map_page = dma_noncoherent_map_page,
+ .map_sg = dma_noncoherent_map_sg,
+#ifdef CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU
+ .sync_single_for_cpu = dma_noncoherent_sync_single_for_cpu,
+ .sync_sg_for_cpu = dma_noncoherent_sync_sg_for_cpu,
+ .unmap_page = dma_noncoherent_unmap_page,
+ .unmap_sg = dma_noncoherent_unmap_sg,
+#endif
+ .dma_supported = dma_direct_supported,
+ .mapping_error = dma_direct_mapping_error,
+ .cache_sync = arch_dma_cache_sync,
+};
+EXPORT_SYMBOL(dma_noncoherent_ops);
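
dma_noncoherent_ops only provides the generic plumbing; an architecture that selects DMA_NONCOHERENT_OPS still has to supply arch_dma_alloc()/arch_dma_free()/arch_dma_mmap(), arch_dma_cache_sync(), and the sync hooks (arch_sync_dma_for_cpu() only if it also selects ARCH_HAS_SYNC_DMA_FOR_CPU). A minimal sketch of the sync hooks for a hypothetical write-back cache architecture; the cpu_dcache_*_range() primitives are invented for the example:

    #include <linux/dma-direction.h>
    #include <linux/dma-noncoherent.h>

    /* Hypothetical low-level cache primitives provided by the architecture */
    extern void cpu_dcache_wb_range(phys_addr_t start, size_t size);
    extern void cpu_dcache_inv_range(phys_addr_t start, size_t size);

    void arch_sync_dma_for_device(struct device *dev, phys_addr_t paddr,
                    size_t size, enum dma_data_direction dir)
    {
            /* make CPU writes visible before the device touches the buffer */
            if (dir == DMA_FROM_DEVICE)
                    cpu_dcache_inv_range(paddr, size);
            else
                    cpu_dcache_wb_range(paddr, size);
    }

    void arch_sync_dma_for_cpu(struct device *dev, phys_addr_t paddr,
                    size_t size, enum dma_data_direction dir)
    {
            /* drop stale lines before the CPU reads device-written data */
            if (dir != DMA_TO_DEVICE)
                    cpu_dcache_inv_range(paddr, size);
    }
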
diff --git a/lib/errseq.c b/lib/errseq.c
index df782418b333..81f9e33aa7e7 100644
--- a/lib/errseq.c
+++ b/lib/errseq.c
@@ -111,27 +111,22 @@ EXPORT_SYMBOL(errseq_set);
* errseq_sample() - Grab current errseq_t value.
* @eseq: Pointer to errseq_t to be sampled.
*
- * This function allows callers to sample an errseq_t value, marking it as
- * "seen" if required.
+ * This function allows callers to initialise their errseq_t variable.
+ * If the error has been "seen", new callers will not see an old error.
+ * If there is an unseen error in @eseq, the caller of this function will
+ * see it the next time it checks for an error.
*
+ * Context: Any context.
* Return: The current errseq value.
*/
errseq_t errseq_sample(errseq_t *eseq)
{
errseq_t old = READ_ONCE(*eseq);
- errseq_t new = old;
- /*
- * For the common case of no errors ever having been set, we can skip
- * marking the SEEN bit. Once an error has been set, the value will
- * never go back to zero.
- */
- if (old != 0) {
- new |= ERRSEQ_SEEN;
- if (old != new)
- cmpxchg(eseq, old, new);
- }
- return new;
+ /* If nobody has seen this error yet, then we can be the first. */
+ if (!(old & ERRSEQ_SEEN))
+ old = 0;
+ return old;
}
EXPORT_SYMBOL(errseq_sample);
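
The reworked errseq_sample() no longer marks pending errors as seen: it returns 0 while an unseen error is outstanding, so a freshly sampled cursor still reports that error on its first check. A usage sketch of the sample/check pattern (the context structure and helpers are hypothetical; the real user is the file->f_wb_err writeback error tracking):

    #include <linux/errseq.h>

    struct example_ctx {
            errseq_t since;         /* error cursor captured at open time */
    };

    static void example_open(struct example_ctx *ctx, errseq_t *wb_err)
    {
            ctx->since = errseq_sample(wb_err);
    }

    static int example_fsync(struct example_ctx *ctx, errseq_t *wb_err)
    {
            /* reports only errors recorded since the last check/advance */
            return errseq_check_and_advance(wb_err, &ctx->since);
    }
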
diff --git a/lib/find_bit_benchmark.c b/lib/find_bit_benchmark.c
index 5985a25e6cbc..5367ffa5c18f 100644
--- a/lib/find_bit_benchmark.c
+++ b/lib/find_bit_benchmark.c
@@ -132,7 +132,12 @@ static int __init find_bit_test(void)
test_find_next_bit(bitmap, BITMAP_LEN);
test_find_next_zero_bit(bitmap, BITMAP_LEN);
test_find_last_bit(bitmap, BITMAP_LEN);
- test_find_first_bit(bitmap, BITMAP_LEN);
+
+ /*
+ * test_find_first_bit() may take some time, so
+	 * traverse only part of the bitmap to avoid soft lockup.
+ */
+ test_find_first_bit(bitmap, BITMAP_LEN / 10);
test_find_next_and_bit(bitmap, bitmap2, BITMAP_LEN);
pr_err("\nStart testing find_bit() with sparse bitmap\n");
diff --git a/lib/iommu-common.c b/lib/iommu-common.c
deleted file mode 100644
index 55b00de106b5..000000000000
--- a/lib/iommu-common.c
+++ /dev/null
@@ -1,267 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * IOMMU mmap management and range allocation functions.
- * Based almost entirely upon the powerpc iommu allocator.
- */
-
-#include <linux/export.h>
-#include <linux/bitmap.h>
-#include <linux/bug.h>
-#include <linux/iommu-helper.h>
-#include <linux/iommu-common.h>
-#include <linux/dma-mapping.h>
-#include <linux/hash.h>
-
-static unsigned long iommu_large_alloc = 15;
-
-static DEFINE_PER_CPU(unsigned int, iommu_hash_common);
-
-static inline bool need_flush(struct iommu_map_table *iommu)
-{
- return ((iommu->flags & IOMMU_NEED_FLUSH) != 0);
-}
-
-static inline void set_flush(struct iommu_map_table *iommu)
-{
- iommu->flags |= IOMMU_NEED_FLUSH;
-}
-
-static inline void clear_flush(struct iommu_map_table *iommu)
-{
- iommu->flags &= ~IOMMU_NEED_FLUSH;
-}
-
-static void setup_iommu_pool_hash(void)
-{
- unsigned int i;
- static bool do_once;
-
- if (do_once)
- return;
- do_once = true;
- for_each_possible_cpu(i)
- per_cpu(iommu_hash_common, i) = hash_32(i, IOMMU_POOL_HASHBITS);
-}
-
-/*
- * Initialize iommu_pool entries for the iommu_map_table. `num_entries'
- * is the number of table entries. If `large_pool' is set to true,
- * the top 1/4 of the table will be set aside for pool allocations
- * of more than iommu_large_alloc pages.
- */
-void iommu_tbl_pool_init(struct iommu_map_table *iommu,
- unsigned long num_entries,
- u32 table_shift,
- void (*lazy_flush)(struct iommu_map_table *),
- bool large_pool, u32 npools,
- bool skip_span_boundary_check)
-{
- unsigned int start, i;
- struct iommu_pool *p = &(iommu->large_pool);
-
- setup_iommu_pool_hash();
- if (npools == 0)
- iommu->nr_pools = IOMMU_NR_POOLS;
- else
- iommu->nr_pools = npools;
- BUG_ON(npools > IOMMU_NR_POOLS);
-
- iommu->table_shift = table_shift;
- iommu->lazy_flush = lazy_flush;
- start = 0;
- if (skip_span_boundary_check)
- iommu->flags |= IOMMU_NO_SPAN_BOUND;
- if (large_pool)
- iommu->flags |= IOMMU_HAS_LARGE_POOL;
-
- if (!large_pool)
- iommu->poolsize = num_entries/iommu->nr_pools;
- else
- iommu->poolsize = (num_entries * 3 / 4)/iommu->nr_pools;
- for (i = 0; i < iommu->nr_pools; i++) {
- spin_lock_init(&(iommu->pools[i].lock));
- iommu->pools[i].start = start;
- iommu->pools[i].hint = start;
- start += iommu->poolsize; /* start for next pool */
- iommu->pools[i].end = start - 1;
- }
- if (!large_pool)
- return;
- /* initialize large_pool */
- spin_lock_init(&(p->lock));
- p->start = start;
- p->hint = p->start;
- p->end = num_entries;
-}
-EXPORT_SYMBOL(iommu_tbl_pool_init);
-
-unsigned long iommu_tbl_range_alloc(struct device *dev,
- struct iommu_map_table *iommu,
- unsigned long npages,
- unsigned long *handle,
- unsigned long mask,
- unsigned int align_order)
-{
- unsigned int pool_hash = __this_cpu_read(iommu_hash_common);
- unsigned long n, end, start, limit, boundary_size;
- struct iommu_pool *pool;
- int pass = 0;
- unsigned int pool_nr;
- unsigned int npools = iommu->nr_pools;
- unsigned long flags;
- bool large_pool = ((iommu->flags & IOMMU_HAS_LARGE_POOL) != 0);
- bool largealloc = (large_pool && npages > iommu_large_alloc);
- unsigned long shift;
- unsigned long align_mask = 0;
-
- if (align_order > 0)
- align_mask = ~0ul >> (BITS_PER_LONG - align_order);
-
- /* Sanity check */
- if (unlikely(npages == 0)) {
- WARN_ON_ONCE(1);
- return IOMMU_ERROR_CODE;
- }
-
- if (largealloc) {
- pool = &(iommu->large_pool);
- pool_nr = 0; /* to keep compiler happy */
- } else {
- /* pick out pool_nr */
- pool_nr = pool_hash & (npools - 1);
- pool = &(iommu->pools[pool_nr]);
- }
- spin_lock_irqsave(&pool->lock, flags);
-
- again:
- if (pass == 0 && handle && *handle &&
- (*handle >= pool->start) && (*handle < pool->end))
- start = *handle;
- else
- start = pool->hint;
-
- limit = pool->end;
-
- /* The case below can happen if we have a small segment appended
- * to a large, or when the previous alloc was at the very end of
- * the available space. If so, go back to the beginning. If a
- * flush is needed, it will get done based on the return value
- * from iommu_area_alloc() below.
- */
- if (start >= limit)
- start = pool->start;
- shift = iommu->table_map_base >> iommu->table_shift;
- if (limit + shift > mask) {
- limit = mask - shift + 1;
- /* If we're constrained on address range, first try
- * at the masked hint to avoid O(n) search complexity,
- * but on second pass, start at 0 in pool 0.
- */
- if ((start & mask) >= limit || pass > 0) {
- spin_unlock(&(pool->lock));
- pool = &(iommu->pools[0]);
- spin_lock(&(pool->lock));
- start = pool->start;
- } else {
- start &= mask;
- }
- }
-
- if (dev)
- boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1,
- 1 << iommu->table_shift);
- else
- boundary_size = ALIGN(1ULL << 32, 1 << iommu->table_shift);
-
- boundary_size = boundary_size >> iommu->table_shift;
- /*
- * if the skip_span_boundary_check had been set during init, we set
- * things up so that iommu_is_span_boundary() merely checks if the
- * (index + npages) < num_tsb_entries
- */
- if ((iommu->flags & IOMMU_NO_SPAN_BOUND) != 0) {
- shift = 0;
- boundary_size = iommu->poolsize * iommu->nr_pools;
- }
- n = iommu_area_alloc(iommu->map, limit, start, npages, shift,
- boundary_size, align_mask);
- if (n == -1) {
- if (likely(pass == 0)) {
- /* First failure, rescan from the beginning. */
- pool->hint = pool->start;
- set_flush(iommu);
- pass++;
- goto again;
- } else if (!largealloc && pass <= iommu->nr_pools) {
- spin_unlock(&(pool->lock));
- pool_nr = (pool_nr + 1) & (iommu->nr_pools - 1);
- pool = &(iommu->pools[pool_nr]);
- spin_lock(&(pool->lock));
- pool->hint = pool->start;
- set_flush(iommu);
- pass++;
- goto again;
- } else {
- /* give up */
- n = IOMMU_ERROR_CODE;
- goto bail;
- }
- }
- if (iommu->lazy_flush &&
- (n < pool->hint || need_flush(iommu))) {
- clear_flush(iommu);
- iommu->lazy_flush(iommu);
- }
-
- end = n + npages;
- pool->hint = end;
-
- /* Update handle for SG allocations */
- if (handle)
- *handle = end;
-bail:
- spin_unlock_irqrestore(&(pool->lock), flags);
-
- return n;
-}
-EXPORT_SYMBOL(iommu_tbl_range_alloc);
-
-static struct iommu_pool *get_pool(struct iommu_map_table *tbl,
- unsigned long entry)
-{
- struct iommu_pool *p;
- unsigned long largepool_start = tbl->large_pool.start;
- bool large_pool = ((tbl->flags & IOMMU_HAS_LARGE_POOL) != 0);
-
- /* The large pool is the last pool at the top of the table */
- if (large_pool && entry >= largepool_start) {
- p = &tbl->large_pool;
- } else {
- unsigned int pool_nr = entry / tbl->poolsize;
-
- BUG_ON(pool_nr >= tbl->nr_pools);
- p = &tbl->pools[pool_nr];
- }
- return p;
-}
-
-/* Caller supplies the index of the entry into the iommu map table
- * itself when the mapping from dma_addr to the entry is not the
- * default addr->entry mapping below.
- */
-void iommu_tbl_range_free(struct iommu_map_table *iommu, u64 dma_addr,
- unsigned long npages, unsigned long entry)
-{
- struct iommu_pool *pool;
- unsigned long flags;
- unsigned long shift = iommu->table_shift;
-
- if (entry == IOMMU_ERROR_CODE) /* use default addr->entry mapping */
- entry = (dma_addr - iommu->table_map_base) >> shift;
- pool = get_pool(iommu, entry);
-
- spin_lock_irqsave(&(pool->lock), flags);
- bitmap_clear(iommu->map, entry, npages);
- spin_unlock_irqrestore(&(pool->lock), flags);
-}
-EXPORT_SYMBOL(iommu_tbl_range_free);
diff --git a/lib/iommu-helper.c b/lib/iommu-helper.c
index 23633c0fda4a..92a9f243c0e2 100644
--- a/lib/iommu-helper.c
+++ b/lib/iommu-helper.c
@@ -3,19 +3,8 @@
* IOMMU helper functions for the free area management
*/
-#include <linux/export.h>
#include <linux/bitmap.h>
-#include <linux/bug.h>
-
-int iommu_is_span_boundary(unsigned int index, unsigned int nr,
- unsigned long shift,
- unsigned long boundary_size)
-{
- BUG_ON(!is_power_of_2(boundary_size));
-
- shift = (shift + index) & (boundary_size - 1);
- return shift + nr > boundary_size;
-}
+#include <linux/iommu-helper.h>
unsigned long iommu_area_alloc(unsigned long *map, unsigned long size,
unsigned long start, unsigned int nr,
@@ -38,4 +27,3 @@ again:
}
return -1;
}
-EXPORT_SYMBOL(iommu_area_alloc);
diff --git a/lib/iov_iter.c b/lib/iov_iter.c
index 970212670b6a..7e43cd54c84c 100644
--- a/lib/iov_iter.c
+++ b/lib/iov_iter.c
@@ -573,6 +573,67 @@ size_t _copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i)
}
EXPORT_SYMBOL(_copy_to_iter);
+#ifdef CONFIG_ARCH_HAS_UACCESS_MCSAFE
+static int copyout_mcsafe(void __user *to, const void *from, size_t n)
+{
+ if (access_ok(VERIFY_WRITE, to, n)) {
+ kasan_check_read(from, n);
+ n = copy_to_user_mcsafe((__force void *) to, from, n);
+ }
+ return n;
+}
+
+static unsigned long memcpy_mcsafe_to_page(struct page *page, size_t offset,
+ const char *from, size_t len)
+{
+ unsigned long ret;
+ char *to;
+
+ to = kmap_atomic(page);
+ ret = memcpy_mcsafe(to + offset, from, len);
+ kunmap_atomic(to);
+
+ return ret;
+}
+
+size_t _copy_to_iter_mcsafe(const void *addr, size_t bytes, struct iov_iter *i)
+{
+ const char *from = addr;
+ unsigned long rem, curr_addr, s_addr = (unsigned long) addr;
+
+ if (unlikely(i->type & ITER_PIPE)) {
+ WARN_ON(1);
+ return 0;
+ }
+ if (iter_is_iovec(i))
+ might_fault();
+ iterate_and_advance(i, bytes, v,
+ copyout_mcsafe(v.iov_base, (from += v.iov_len) - v.iov_len, v.iov_len),
+ ({
+ rem = memcpy_mcsafe_to_page(v.bv_page, v.bv_offset,
+ (from += v.bv_len) - v.bv_len, v.bv_len);
+ if (rem) {
+ curr_addr = (unsigned long) from;
+ bytes = curr_addr - s_addr - rem;
+ return bytes;
+ }
+ }),
+ ({
+ rem = memcpy_mcsafe(v.iov_base, (from += v.iov_len) - v.iov_len,
+ v.iov_len);
+ if (rem) {
+ curr_addr = (unsigned long) from;
+ bytes = curr_addr - s_addr - rem;
+ return bytes;
+ }
+ })
+ )
+
+ return bytes;
+}
+EXPORT_SYMBOL_GPL(_copy_to_iter_mcsafe);
+#endif /* CONFIG_ARCH_HAS_UACCESS_MCSAFE */
+
size_t _copy_from_iter(void *addr, size_t bytes, struct iov_iter *i)
{
char *to = addr;
@@ -1012,7 +1073,7 @@ unsigned long iov_iter_gap_alignment(const struct iov_iter *i)
}
EXPORT_SYMBOL(iov_iter_gap_alignment);
-static inline size_t __pipe_get_pages(struct iov_iter *i,
+static inline ssize_t __pipe_get_pages(struct iov_iter *i,
size_t maxsize,
struct page **pages,
int idx,
@@ -1102,7 +1163,7 @@ static ssize_t pipe_get_pages_alloc(struct iov_iter *i,
size_t *start)
{
struct page **p;
- size_t n;
+ ssize_t n;
int idx;
int npages;
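
_copy_to_iter_mcsafe() returns the number of bytes actually transferred, stopping short when memcpy_mcsafe() hits a poisoned (uncorrectable) region instead of letting the machine check take the caller down. A minimal caller sketch, assuming CONFIG_ARCH_HAS_UACCESS_MCSAFE and a hypothetical read path:

    #include <linux/errno.h>
    #include <linux/uio.h>

    static ssize_t example_read_mcsafe(void *src, size_t len, struct iov_iter *iter)
    {
            size_t copied = _copy_to_iter_mcsafe(src, len, iter);

            if (copied < len)       /* stopped at a poisoned cache line */
                    return copied ? copied : -EIO;
            return copied;
    }
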
diff --git a/lib/kobject.c b/lib/kobject.c
index e1d1f290bf35..18989b5b3b56 100644
--- a/lib/kobject.c
+++ b/lib/kobject.c
@@ -233,13 +233,12 @@ static int kobject_add_internal(struct kobject *kobj)
/* be noisy on error issues */
if (error == -EEXIST)
- WARN(1,
- "%s failed for %s with -EEXIST, don't try to register things with the same name in the same directory.\n",
- __func__, kobject_name(kobj));
+ pr_err("%s failed for %s with -EEXIST, don't try to register things with the same name in the same directory.\n",
+ __func__, kobject_name(kobj));
else
- WARN(1, "%s failed for %s (error: %d parent: %s)\n",
- __func__, kobject_name(kobj), error,
- parent ? kobject_name(parent) : "'none'");
+ pr_err("%s failed for %s (error: %d parent: %s)\n",
+ __func__, kobject_name(kobj), error,
+ parent ? kobject_name(parent) : "'none'");
} else
kobj->state_in_sysfs = 1;
diff --git a/lib/radix-tree.c b/lib/radix-tree.c
index da9e10c827df..a9e41aed6de4 100644
--- a/lib/radix-tree.c
+++ b/lib/radix-tree.c
@@ -1612,11 +1612,9 @@ static void set_iter_tags(struct radix_tree_iter *iter,
static void __rcu **skip_siblings(struct radix_tree_node **nodep,
void __rcu **slot, struct radix_tree_iter *iter)
{
- void *sib = node_to_entry(slot - 1);
-
while (iter->index < iter->next_index) {
*nodep = rcu_dereference_raw(*slot);
- if (*nodep && *nodep != sib)
+ if (*nodep && !is_sibling_entry(iter->node, *nodep))
return slot;
slot++;
iter->index = __radix_tree_iter_add(iter, 1);
@@ -1631,7 +1629,7 @@ void __rcu **__radix_tree_next_slot(void __rcu **slot,
struct radix_tree_iter *iter, unsigned flags)
{
unsigned tag = flags & RADIX_TREE_ITER_TAG_MASK;
- struct radix_tree_node *node = rcu_dereference_raw(*slot);
+ struct radix_tree_node *node;
slot = skip_siblings(&node, slot, iter);
@@ -2036,10 +2034,12 @@ void *radix_tree_delete_item(struct radix_tree_root *root,
unsigned long index, void *item)
{
struct radix_tree_node *node = NULL;
- void __rcu **slot;
+ void __rcu **slot = NULL;
void *entry;
entry = __radix_tree_lookup(root, index, &node, &slot);
+ if (!slot)
+ return NULL;
if (!entry && (!is_idr(root) || node_tag_get(root, node, IDR_FREE,
get_slot_offset(node, slot))))
return NULL;
diff --git a/lib/reed_solomon/decode_rs.c b/lib/reed_solomon/decode_rs.c
index 0ec3f257ffdf..1db74eb098d0 100644
--- a/lib/reed_solomon/decode_rs.c
+++ b/lib/reed_solomon/decode_rs.c
@@ -1,22 +1,16 @@
+// SPDX-License-Identifier: GPL-2.0
/*
- * lib/reed_solomon/decode_rs.c
- *
- * Overview:
- * Generic Reed Solomon encoder / decoder library
+ * Generic Reed Solomon encoder / decoder library
*
* Copyright 2002, Phil Karn, KA9Q
* May be used under the terms of the GNU General Public License (GPL)
*
* Adaption to the kernel by Thomas Gleixner (tglx@linutronix.de)
*
- * $Id: decode_rs.c,v 1.7 2005/11/07 11:14:59 gleixner Exp $
- *
- */
-
-/* Generic data width independent code which is included by the
- * wrappers.
+ * Generic data width independent code which is included by the wrappers.
*/
{
+ struct rs_codec *rs = rsc->codec;
int deg_lambda, el, deg_omega;
int i, j, r, k, pad;
int nn = rs->nn;
@@ -27,16 +21,22 @@
uint16_t *alpha_to = rs->alpha_to;
uint16_t *index_of = rs->index_of;
uint16_t u, q, tmp, num1, num2, den, discr_r, syn_error;
- /* Err+Eras Locator poly and syndrome poly The maximum value
- * of nroots is 8. So the necessary stack size will be about
- * 220 bytes max.
- */
- uint16_t lambda[nroots + 1], syn[nroots];
- uint16_t b[nroots + 1], t[nroots + 1], omega[nroots + 1];
- uint16_t root[nroots], reg[nroots + 1], loc[nroots];
int count = 0;
uint16_t msk = (uint16_t) rs->nn;
+ /*
+ * The decoder buffers are in the rs control struct. They are
+ * arrays sized [nroots + 1]
+ */
+ uint16_t *lambda = rsc->buffers + RS_DECODE_LAMBDA * (nroots + 1);
+ uint16_t *syn = rsc->buffers + RS_DECODE_SYN * (nroots + 1);
+ uint16_t *b = rsc->buffers + RS_DECODE_B * (nroots + 1);
+ uint16_t *t = rsc->buffers + RS_DECODE_T * (nroots + 1);
+ uint16_t *omega = rsc->buffers + RS_DECODE_OMEGA * (nroots + 1);
+ uint16_t *root = rsc->buffers + RS_DECODE_ROOT * (nroots + 1);
+ uint16_t *reg = rsc->buffers + RS_DECODE_REG * (nroots + 1);
+ uint16_t *loc = rsc->buffers + RS_DECODE_LOC * (nroots + 1);
+
/* Check length parameter for validity */
pad = nn - nroots - len;
BUG_ON(pad < 0 || pad >= nn);
diff --git a/lib/reed_solomon/encode_rs.c b/lib/reed_solomon/encode_rs.c
index 0b5b1a6728ec..9112d46e869e 100644
--- a/lib/reed_solomon/encode_rs.c
+++ b/lib/reed_solomon/encode_rs.c
@@ -1,23 +1,16 @@
+// SPDX-License-Identifier: GPL-2.0
/*
- * lib/reed_solomon/encode_rs.c
- *
- * Overview:
- * Generic Reed Solomon encoder / decoder library
+ * Generic Reed Solomon encoder / decoder library
*
* Copyright 2002, Phil Karn, KA9Q
* May be used under the terms of the GNU General Public License (GPL)
*
* Adaption to the kernel by Thomas Gleixner (tglx@linutronix.de)
*
- * $Id: encode_rs.c,v 1.5 2005/11/07 11:14:59 gleixner Exp $
- *
- */
-
-/* Generic data width independent code which is included by the
- * wrappers.
- * int encode_rsX (struct rs_control *rs, uintX_t *data, int len, uintY_t *par)
+ * Generic data width independent code which is included by the wrappers.
*/
{
+ struct rs_codec *rs = rsc->codec;
int i, j, pad;
int nn = rs->nn;
int nroots = rs->nroots;
diff --git a/lib/reed_solomon/reed_solomon.c b/lib/reed_solomon/reed_solomon.c
index 06d04cfa9339..dfcf54242fb9 100644
--- a/lib/reed_solomon/reed_solomon.c
+++ b/lib/reed_solomon/reed_solomon.c
@@ -1,43 +1,34 @@
+// SPDX-License-Identifier: GPL-2.0
/*
- * lib/reed_solomon/reed_solomon.c
- *
- * Overview:
- * Generic Reed Solomon encoder / decoder library
+ * Generic Reed Solomon encoder / decoder library
*
* Copyright (C) 2004 Thomas Gleixner (tglx@linutronix.de)
*
* Reed Solomon code lifted from reed solomon library written by Phil Karn
* Copyright 2002 Phil Karn, KA9Q
*
- * $Id: rslib.c,v 1.7 2005/11/07 11:14:59 gleixner Exp $
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
* Description:
*
* The generic Reed Solomon library provides runtime configurable
* encoding / decoding of RS codes.
- * Each user must call init_rs to get a pointer to a rs_control
- * structure for the given rs parameters. This structure is either
- * generated or a already available matching control structure is used.
- * If a structure is generated then the polynomial arrays for
- * fast encoding / decoding are built. This can take some time so
- * make sure not to call this function from a time critical path.
- * Usually a module / driver should initialize the necessary
- * rs_control structure on module / driver init and release it
- * on exit.
- * The encoding puts the calculated syndrome into a given syndrome
- * buffer.
- * The decoding is a two step process. The first step calculates
- * the syndrome over the received (data + syndrome) and calls the
- * second stage, which does the decoding / error correction itself.
- * Many hw encoders provide a syndrome calculation over the received
- * data + syndrome and can call the second stage directly.
*
+ * Each user must call init_rs to get a pointer to a rs_control structure
+ * for the given rs parameters. The control struct is unique per instance.
+ * It points to a codec which can be shared by multiple control structures.
+ * If a codec is newly allocated then the polynomial arrays for fast
+ * encoding / decoding are built. This can take some time so make sure not
+ * to call this function from a time critical path. Usually a module /
+ * driver should initialize the necessary rs_control structure on module /
+ * driver init and release it on exit.
+ *
+ * The encoding puts the calculated syndrome into a given syndrome buffer.
+ *
+ * The decoding is a two step process. The first step calculates the
+ * syndrome over the received (data + syndrome) and calls the second stage,
+ * which does the decoding / error correction itself. Many hw encoders
+ * provide a syndrome calculation over the received data + syndrome and can
+ * call the second stage directly.
*/
-
#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/init.h>
@@ -46,32 +37,44 @@
#include <linux/slab.h>
#include <linux/mutex.h>
-/* This list holds all currently allocated rs control structures */
-static LIST_HEAD (rslist);
+enum {
+ RS_DECODE_LAMBDA,
+ RS_DECODE_SYN,
+ RS_DECODE_B,
+ RS_DECODE_T,
+ RS_DECODE_OMEGA,
+ RS_DECODE_ROOT,
+ RS_DECODE_REG,
+ RS_DECODE_LOC,
+ RS_DECODE_NUM_BUFFERS
+};
+
+/* This list holds all currently allocated rs codec structures */
+static LIST_HEAD(codec_list);
/* Protection for the list */
static DEFINE_MUTEX(rslistlock);
/**
- * rs_init - Initialize a Reed-Solomon codec
+ * codec_init - Initialize a Reed-Solomon codec
* @symsize: symbol size, bits (1-8)
* @gfpoly: Field generator polynomial coefficients
* @gffunc: Field generator function
* @fcr: first root of RS code generator polynomial, index form
* @prim: primitive element to generate polynomial roots
* @nroots: RS code generator polynomial degree (number of roots)
+ * @gfp: GFP_ flags for allocations
*
- * Allocate a control structure and the polynom arrays for faster
+ * Allocate a codec structure and the polynom arrays for faster
* en/decoding. Fill the arrays according to the given parameters.
*/
-static struct rs_control *rs_init(int symsize, int gfpoly, int (*gffunc)(int),
- int fcr, int prim, int nroots)
+static struct rs_codec *codec_init(int symsize, int gfpoly, int (*gffunc)(int),
+ int fcr, int prim, int nroots, gfp_t gfp)
{
- struct rs_control *rs;
int i, j, sr, root, iprim;
+ struct rs_codec *rs;
- /* Allocate the control structure */
- rs = kmalloc(sizeof (struct rs_control), GFP_KERNEL);
- if (rs == NULL)
+ rs = kzalloc(sizeof(*rs), gfp);
+ if (!rs)
return NULL;
INIT_LIST_HEAD(&rs->list);
@@ -85,17 +88,17 @@ static struct rs_control *rs_init(int symsize, int gfpoly, int (*gffunc)(int),
rs->gffunc = gffunc;
/* Allocate the arrays */
- rs->alpha_to = kmalloc(sizeof(uint16_t) * (rs->nn + 1), GFP_KERNEL);
+ rs->alpha_to = kmalloc(sizeof(uint16_t) * (rs->nn + 1), gfp);
if (rs->alpha_to == NULL)
- goto errrs;
+ goto err;
- rs->index_of = kmalloc(sizeof(uint16_t) * (rs->nn + 1), GFP_KERNEL);
+ rs->index_of = kmalloc(sizeof(uint16_t) * (rs->nn + 1), gfp);
if (rs->index_of == NULL)
- goto erralp;
+ goto err;
- rs->genpoly = kmalloc(sizeof(uint16_t) * (rs->nroots + 1), GFP_KERNEL);
+ rs->genpoly = kmalloc(sizeof(uint16_t) * (rs->nroots + 1), gfp);
if(rs->genpoly == NULL)
- goto erridx;
+ goto err;
/* Generate Galois field lookup tables */
rs->index_of[0] = rs->nn; /* log(zero) = -inf */
@@ -120,7 +123,7 @@ static struct rs_control *rs_init(int symsize, int gfpoly, int (*gffunc)(int),
}
/* If it's not primitive, exit */
if(sr != rs->alpha_to[0])
- goto errpol;
+ goto err;
/* Find prim-th root of 1, used in decoding */
for(iprim = 1; (iprim % prim) != 0; iprim += rs->nn);
@@ -148,42 +151,52 @@ static struct rs_control *rs_init(int symsize, int gfpoly, int (*gffunc)(int),
/* convert rs->genpoly[] to index form for quicker encoding */
for (i = 0; i <= nroots; i++)
rs->genpoly[i] = rs->index_of[rs->genpoly[i]];
+
+ rs->users = 1;
+ list_add(&rs->list, &codec_list);
return rs;
- /* Error exit */
-errpol:
+err:
kfree(rs->genpoly);
-erridx:
kfree(rs->index_of);
-erralp:
kfree(rs->alpha_to);
-errrs:
kfree(rs);
return NULL;
}
/**
- * free_rs - Free the rs control structure, if it is no longer used
- * @rs: the control structure which is not longer used by the
+ * free_rs - Free the rs control structure
+ * @rs:	The control structure which is no longer used by the
* caller
+ *
+ * Free the control structure. If @rs is the last user of the associated
+ * codec, free the codec as well.
*/
void free_rs(struct rs_control *rs)
{
+ struct rs_codec *cd;
+
+ if (!rs)
+ return;
+
+ cd = rs->codec;
mutex_lock(&rslistlock);
- rs->users--;
- if(!rs->users) {
- list_del(&rs->list);
- kfree(rs->alpha_to);
- kfree(rs->index_of);
- kfree(rs->genpoly);
- kfree(rs);
+ cd->users--;
+ if(!cd->users) {
+ list_del(&cd->list);
+ kfree(cd->alpha_to);
+ kfree(cd->index_of);
+ kfree(cd->genpoly);
+ kfree(cd);
}
mutex_unlock(&rslistlock);
+ kfree(rs);
}
+EXPORT_SYMBOL_GPL(free_rs);
/**
- * init_rs_internal - Find a matching or allocate a new rs control structure
+ * init_rs_internal - Allocate rs control, find a matching codec or allocate a new one
* @symsize: the symbol size (number of bits)
* @gfpoly: the extended Galois field generator polynomial coefficients,
* with the 0th coefficient in the low order bit. The polynomial
@@ -191,55 +204,69 @@ void free_rs(struct rs_control *rs)
* @gffunc: pointer to function to generate the next field element,
* or the multiplicative identity element if given 0. Used
* instead of gfpoly if gfpoly is 0
- * @fcr: the first consecutive root of the rs code generator polynomial
+ * @fcr: the first consecutive root of the rs code generator polynomial
* in index form
* @prim: primitive element to generate polynomial roots
* @nroots: RS code generator polynomial degree (number of roots)
+ * @gfp: GFP_ flags for allocations
*/
static struct rs_control *init_rs_internal(int symsize, int gfpoly,
- int (*gffunc)(int), int fcr,
- int prim, int nroots)
+ int (*gffunc)(int), int fcr,
+ int prim, int nroots, gfp_t gfp)
{
- struct list_head *tmp;
- struct rs_control *rs;
+ struct list_head *tmp;
+ struct rs_control *rs;
+ unsigned int bsize;
/* Sanity checks */
if (symsize < 1)
return NULL;
if (fcr < 0 || fcr >= (1<<symsize))
- return NULL;
+ return NULL;
if (prim <= 0 || prim >= (1<<symsize))
- return NULL;
+ return NULL;
if (nroots < 0 || nroots >= (1<<symsize))
return NULL;
+ /*
+ * The decoder needs buffers in each control struct instance to
+ * avoid variable size or large fixed size allocations on
+ * stack. Size the buffers to arrays of [nroots + 1].
+ */
+ bsize = sizeof(uint16_t) * RS_DECODE_NUM_BUFFERS * (nroots + 1);
+ rs = kzalloc(sizeof(*rs) + bsize, gfp);
+ if (!rs)
+ return NULL;
+
mutex_lock(&rslistlock);
/* Walk through the list and look for a matching entry */
- list_for_each(tmp, &rslist) {
- rs = list_entry(tmp, struct rs_control, list);
- if (symsize != rs->mm)
+ list_for_each(tmp, &codec_list) {
+ struct rs_codec *cd = list_entry(tmp, struct rs_codec, list);
+
+ if (symsize != cd->mm)
continue;
- if (gfpoly != rs->gfpoly)
+ if (gfpoly != cd->gfpoly)
continue;
- if (gffunc != rs->gffunc)
+ if (gffunc != cd->gffunc)
continue;
- if (fcr != rs->fcr)
+ if (fcr != cd->fcr)
continue;
- if (prim != rs->prim)
+ if (prim != cd->prim)
continue;
- if (nroots != rs->nroots)
+ if (nroots != cd->nroots)
continue;
/* We have a matching one already */
- rs->users++;
+ cd->users++;
+ rs->codec = cd;
goto out;
}
/* Create a new one */
- rs = rs_init(symsize, gfpoly, gffunc, fcr, prim, nroots);
- if (rs) {
- rs->users = 1;
- list_add(&rs->list, &rslist);
+ rs->codec = codec_init(symsize, gfpoly, gffunc, fcr, prim, nroots, gfp);
+ if (!rs->codec) {
+ kfree(rs);
+ rs = NULL;
}
out:
mutex_unlock(&rslistlock);
@@ -247,45 +274,48 @@ out:
}
/**
- * init_rs - Find a matching or allocate a new rs control structure
+ * init_rs_gfp - Create a RS control struct and initialize it
* @symsize: the symbol size (number of bits)
* @gfpoly: the extended Galois field generator polynomial coefficients,
* with the 0th coefficient in the low order bit. The polynomial
* must be primitive;
- * @fcr: the first consecutive root of the rs code generator polynomial
+ * @fcr: the first consecutive root of the rs code generator polynomial
* in index form
* @prim: primitive element to generate polynomial roots
* @nroots: RS code generator polynomial degree (number of roots)
+ * @gfp: GFP_ flags for allocations
*/
-struct rs_control *init_rs(int symsize, int gfpoly, int fcr, int prim,
- int nroots)
+struct rs_control *init_rs_gfp(int symsize, int gfpoly, int fcr, int prim,
+ int nroots, gfp_t gfp)
{
- return init_rs_internal(symsize, gfpoly, NULL, fcr, prim, nroots);
+ return init_rs_internal(symsize, gfpoly, NULL, fcr, prim, nroots, gfp);
}
+EXPORT_SYMBOL_GPL(init_rs_gfp);
/**
- * init_rs_non_canonical - Find a matching or allocate a new rs control
- * structure, for fields with non-canonical
- * representation
+ * init_rs_non_canonical - Allocate rs control struct for fields with
+ * non-canonical representation
* @symsize: the symbol size (number of bits)
* @gffunc: pointer to function to generate the next field element,
* or the multiplicative identity element if given 0. Used
* instead of gfpoly if gfpoly is 0
- * @fcr: the first consecutive root of the rs code generator polynomial
+ * @fcr: the first consecutive root of the rs code generator polynomial
* in index form
* @prim: primitive element to generate polynomial roots
* @nroots: RS code generator polynomial degree (number of roots)
*/
struct rs_control *init_rs_non_canonical(int symsize, int (*gffunc)(int),
- int fcr, int prim, int nroots)
+ int fcr, int prim, int nroots)
{
- return init_rs_internal(symsize, 0, gffunc, fcr, prim, nroots);
+ return init_rs_internal(symsize, 0, gffunc, fcr, prim, nroots,
+ GFP_KERNEL);
}
+EXPORT_SYMBOL_GPL(init_rs_non_canonical);
#ifdef CONFIG_REED_SOLOMON_ENC8
/**
* encode_rs8 - Calculate the parity for data values (8bit data width)
- * @rs: the rs control structure
+ * @rsc: the rs control structure
* @data: data field of a given type
* @len: data length
* @par: parity data, must be initialized by caller (usually all 0)
@@ -295,7 +325,7 @@ struct rs_control *init_rs_non_canonical(int symsize, int (*gffunc)(int),
* symbol size > 8. The calling code must take care of encoding of the
* syndrome result for storage itself.
*/
-int encode_rs8(struct rs_control *rs, uint8_t *data, int len, uint16_t *par,
+int encode_rs8(struct rs_control *rsc, uint8_t *data, int len, uint16_t *par,
uint16_t invmsk)
{
#include "encode_rs.c"
@@ -306,7 +336,7 @@ EXPORT_SYMBOL_GPL(encode_rs8);
#ifdef CONFIG_REED_SOLOMON_DEC8
/**
* decode_rs8 - Decode codeword (8bit data width)
- * @rs: the rs control structure
+ * @rsc: the rs control structure
* @data: data field of a given type
* @par: received parity data field
* @len: data length
@@ -319,9 +349,14 @@ EXPORT_SYMBOL_GPL(encode_rs8);
* The syndrome and parity uses a uint16_t data type to enable
* symbol size > 8. The calling code must take care of decoding of the
* syndrome result and the received parity before calling this code.
+ *
+ * Note: The rs_control struct @rsc contains buffers which are used for
+ * decoding, so the caller has to ensure that decoder invocations are
+ * serialized.
+ *
* Returns the number of corrected bits or -EBADMSG for uncorrectable errors.
*/
-int decode_rs8(struct rs_control *rs, uint8_t *data, uint16_t *par, int len,
+int decode_rs8(struct rs_control *rsc, uint8_t *data, uint16_t *par, int len,
uint16_t *s, int no_eras, int *eras_pos, uint16_t invmsk,
uint16_t *corr)
{
@@ -333,7 +368,7 @@ EXPORT_SYMBOL_GPL(decode_rs8);
#ifdef CONFIG_REED_SOLOMON_ENC16
/**
* encode_rs16 - Calculate the parity for data values (16bit data width)
- * @rs: the rs control structure
+ * @rsc: the rs control structure
* @data: data field of a given type
* @len: data length
* @par: parity data, must be initialized by caller (usually all 0)
@@ -341,7 +376,7 @@ EXPORT_SYMBOL_GPL(decode_rs8);
*
* Each field in the data array contains up to symbol size bits of valid data.
*/
-int encode_rs16(struct rs_control *rs, uint16_t *data, int len, uint16_t *par,
+int encode_rs16(struct rs_control *rsc, uint16_t *data, int len, uint16_t *par,
uint16_t invmsk)
{
#include "encode_rs.c"
@@ -352,7 +387,7 @@ EXPORT_SYMBOL_GPL(encode_rs16);
#ifdef CONFIG_REED_SOLOMON_DEC16
/**
* decode_rs16 - Decode codeword (16bit data width)
- * @rs: the rs control structure
+ * @rsc: the rs control structure
* @data: data field of a given type
* @par: received parity data field
* @len: data length
@@ -363,9 +398,14 @@ EXPORT_SYMBOL_GPL(encode_rs16);
* @corr: buffer to store correction bitmask on eras_pos
*
* Each field in the data array contains up to symbol size bits of valid data.
+ *
+ * Note: The rs_control struct @rsc contains buffers which are used for
+ * decoding, so the caller has to ensure that decoder invocations are
+ * serialized.
+ *
* Returns the number of corrected bits or -EBADMSG for uncorrectable errors.
*/
-int decode_rs16(struct rs_control *rs, uint16_t *data, uint16_t *par, int len,
+int decode_rs16(struct rs_control *rsc, uint16_t *data, uint16_t *par, int len,
uint16_t *s, int no_eras, int *eras_pos, uint16_t invmsk,
uint16_t *corr)
{
@@ -374,10 +414,6 @@ int decode_rs16(struct rs_control *rs, uint16_t *data, uint16_t *par, int len,
EXPORT_SYMBOL_GPL(decode_rs16);
#endif
-EXPORT_SYMBOL_GPL(init_rs);
-EXPORT_SYMBOL_GPL(init_rs_non_canonical);
-EXPORT_SYMBOL_GPL(free_rs);
-
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Reed Solomon encoder/decoder");
MODULE_AUTHOR("Phil Karn, Thomas Gleixner");
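
With this rework, each caller gets its own rs_control (which now carries the decoder buffers), while the expensive rs_codec tables are shared and refcounted behind the scenes; decode calls on one rs_control must therefore be serialized by the caller. A usage sketch with invented parameters (GF(2^8), generator polynomial 0x11d, 8 parity symbols):

    #include <linux/gfp.h>
    #include <linux/rslib.h>
    #include <linux/string.h>

    #define EXAMPLE_ECC_SYMS 8      /* hypothetical: 8 parity symbols */

    static struct rs_control *example_rs_init(void)
    {
            /* symsize=8, gfpoly=0x11d, fcr=0, prim=1, nroots=EXAMPLE_ECC_SYMS */
            return init_rs_gfp(8, 0x11d, 0, 1, EXAMPLE_ECC_SYMS, GFP_KERNEL);
    }

    static int example_encode(struct rs_control *rsc, uint8_t *data, int len,
                              uint16_t *par)
    {
            memset(par, 0, EXAMPLE_ECC_SYMS * sizeof(*par)); /* caller zeroes parity */
            return encode_rs8(rsc, data, len, par, 0);
    }

    static void example_rs_exit(struct rs_control *rsc)
    {
            free_rs(rsc);   /* frees the codec once the last user is gone */
    }
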
diff --git a/lib/sbitmap.c b/lib/sbitmap.c
index e6a9c06ec70c..6fdc6267f4a8 100644
--- a/lib/sbitmap.c
+++ b/lib/sbitmap.c
@@ -270,18 +270,33 @@ void sbitmap_bitmap_show(struct sbitmap *sb, struct seq_file *m)
}
EXPORT_SYMBOL_GPL(sbitmap_bitmap_show);
-static unsigned int sbq_calc_wake_batch(unsigned int depth)
+static unsigned int sbq_calc_wake_batch(struct sbitmap_queue *sbq,
+ unsigned int depth)
{
unsigned int wake_batch;
+ unsigned int shallow_depth;
/*
* For each batch, we wake up one queue. We need to make sure that our
- * batch size is small enough that the full depth of the bitmap is
- * enough to wake up all of the queues.
+ * batch size is small enough that the full depth of the bitmap,
+ * potentially limited by a shallow depth, is enough to wake up all of
+ * the queues.
+ *
+ * Each full word of the bitmap has bits_per_word bits, and there might
+ * be a partial word. There are depth / bits_per_word full words and
+ * depth % bits_per_word bits left over. In bitwise arithmetic:
+ *
+ * bits_per_word = 1 << shift
+ * depth / bits_per_word = depth >> shift
+ * depth % bits_per_word = depth & ((1 << shift) - 1)
+ *
+ * Each word can be limited to sbq->min_shallow_depth bits.
*/
- wake_batch = SBQ_WAKE_BATCH;
- if (wake_batch > depth / SBQ_WAIT_QUEUES)
- wake_batch = max(1U, depth / SBQ_WAIT_QUEUES);
+ shallow_depth = min(1U << sbq->sb.shift, sbq->min_shallow_depth);
+ depth = ((depth >> sbq->sb.shift) * shallow_depth +
+ min(depth & ((1U << sbq->sb.shift) - 1), shallow_depth));
+ wake_batch = clamp_t(unsigned int, depth / SBQ_WAIT_QUEUES, 1,
+ SBQ_WAKE_BATCH);
return wake_batch;
}
@@ -307,7 +322,8 @@ int sbitmap_queue_init_node(struct sbitmap_queue *sbq, unsigned int depth,
*per_cpu_ptr(sbq->alloc_hint, i) = prandom_u32() % depth;
}
- sbq->wake_batch = sbq_calc_wake_batch(depth);
+ sbq->min_shallow_depth = UINT_MAX;
+ sbq->wake_batch = sbq_calc_wake_batch(sbq, depth);
atomic_set(&sbq->wake_index, 0);
sbq->ws = kzalloc_node(SBQ_WAIT_QUEUES * sizeof(*sbq->ws), flags, node);
@@ -327,21 +343,28 @@ int sbitmap_queue_init_node(struct sbitmap_queue *sbq, unsigned int depth,
}
EXPORT_SYMBOL_GPL(sbitmap_queue_init_node);
-void sbitmap_queue_resize(struct sbitmap_queue *sbq, unsigned int depth)
+static void sbitmap_queue_update_wake_batch(struct sbitmap_queue *sbq,
+ unsigned int depth)
{
- unsigned int wake_batch = sbq_calc_wake_batch(depth);
+ unsigned int wake_batch = sbq_calc_wake_batch(sbq, depth);
int i;
if (sbq->wake_batch != wake_batch) {
WRITE_ONCE(sbq->wake_batch, wake_batch);
/*
- * Pairs with the memory barrier in sbq_wake_up() to ensure that
- * the batch size is updated before the wait counts.
+ * Pairs with the memory barrier in sbitmap_queue_wake_up()
+ * to ensure that the batch size is updated before the wait
+ * counts.
*/
smp_mb__before_atomic();
for (i = 0; i < SBQ_WAIT_QUEUES; i++)
atomic_set(&sbq->ws[i].wait_cnt, 1);
}
+}
+
+void sbitmap_queue_resize(struct sbitmap_queue *sbq, unsigned int depth)
+{
+ sbitmap_queue_update_wake_batch(sbq, depth);
sbitmap_resize(&sbq->sb, depth);
}
EXPORT_SYMBOL_GPL(sbitmap_queue_resize);
@@ -380,6 +403,8 @@ int __sbitmap_queue_get_shallow(struct sbitmap_queue *sbq,
unsigned int hint, depth;
int nr;
+ WARN_ON_ONCE(shallow_depth < sbq->min_shallow_depth);
+
hint = this_cpu_read(*sbq->alloc_hint);
depth = READ_ONCE(sbq->sb.depth);
if (unlikely(hint >= depth)) {
@@ -403,6 +428,14 @@ int __sbitmap_queue_get_shallow(struct sbitmap_queue *sbq,
}
EXPORT_SYMBOL_GPL(__sbitmap_queue_get_shallow);
+void sbitmap_queue_min_shallow_depth(struct sbitmap_queue *sbq,
+ unsigned int min_shallow_depth)
+{
+ sbq->min_shallow_depth = min_shallow_depth;
+ sbitmap_queue_update_wake_batch(sbq, sbq->sb.depth);
+}
+EXPORT_SYMBOL_GPL(sbitmap_queue_min_shallow_depth);
+
static struct sbq_wait_state *sbq_wake_ptr(struct sbitmap_queue *sbq)
{
int i, wake_index;
@@ -425,52 +458,67 @@ static struct sbq_wait_state *sbq_wake_ptr(struct sbitmap_queue *sbq)
return NULL;
}
-static void sbq_wake_up(struct sbitmap_queue *sbq)
+static bool __sbq_wake_up(struct sbitmap_queue *sbq)
{
struct sbq_wait_state *ws;
unsigned int wake_batch;
int wait_cnt;
- /*
- * Pairs with the memory barrier in set_current_state() to ensure the
- * proper ordering of clear_bit()/waitqueue_active() in the waker and
- * test_and_set_bit_lock()/prepare_to_wait()/finish_wait() in the
- * waiter. See the comment on waitqueue_active(). This is __after_atomic
- * because we just did clear_bit_unlock() in the caller.
- */
- smp_mb__after_atomic();
-
ws = sbq_wake_ptr(sbq);
if (!ws)
- return;
+ return false;
wait_cnt = atomic_dec_return(&ws->wait_cnt);
if (wait_cnt <= 0) {
+ int ret;
+
wake_batch = READ_ONCE(sbq->wake_batch);
+
/*
* Pairs with the memory barrier in sbitmap_queue_resize() to
* ensure that we see the batch size update before the wait
* count is reset.
*/
smp_mb__before_atomic();
+
/*
- * If there are concurrent callers to sbq_wake_up(), the last
- * one to decrement the wait count below zero will bump it back
- * up. If there is a concurrent resize, the count reset will
- * either cause the cmpxchg to fail or overwrite after the
- * cmpxchg.
+ * For concurrent callers of this, the one that failed the
+	 * atomic_cmpxchg() race should call this function again
+ * to wakeup a new batch on a different 'ws'.
*/
- atomic_cmpxchg(&ws->wait_cnt, wait_cnt, wait_cnt + wake_batch);
- sbq_index_atomic_inc(&sbq->wake_index);
- wake_up_nr(&ws->wait, wake_batch);
+ ret = atomic_cmpxchg(&ws->wait_cnt, wait_cnt, wake_batch);
+ if (ret == wait_cnt) {
+ sbq_index_atomic_inc(&sbq->wake_index);
+ wake_up_nr(&ws->wait, wake_batch);
+ return false;
+ }
+
+ return true;
}
+
+ return false;
+}
+
+void sbitmap_queue_wake_up(struct sbitmap_queue *sbq)
+{
+ while (__sbq_wake_up(sbq))
+ ;
}
+EXPORT_SYMBOL_GPL(sbitmap_queue_wake_up);
void sbitmap_queue_clear(struct sbitmap_queue *sbq, unsigned int nr,
unsigned int cpu)
{
sbitmap_clear_bit_unlock(&sbq->sb, nr);
- sbq_wake_up(sbq);
+ /*
+ * Pairs with the memory barrier in set_current_state() to ensure the
+ * proper ordering of clear_bit_unlock()/waitqueue_active() in the waker
+ * and test_and_set_bit_lock()/prepare_to_wait()/finish_wait() in the
+ * waiter. See the comment on waitqueue_active().
+ */
+ smp_mb__after_atomic();
+ sbitmap_queue_wake_up(sbq);
+
if (likely(!sbq->round_robin && nr < sbq->sb.depth))
*per_cpu_ptr(sbq->alloc_hint, cpu) = nr;
}
@@ -482,7 +530,7 @@ void sbitmap_queue_wake_all(struct sbitmap_queue *sbq)
/*
* Pairs with the memory barrier in set_current_state() like in
- * sbq_wake_up().
+ * sbitmap_queue_wake_up().
*/
smp_mb();
wake_index = atomic_read(&sbq->wake_index);
@@ -528,5 +576,6 @@ void sbitmap_queue_show(struct sbitmap_queue *sbq, struct seq_file *m)
seq_puts(m, "}\n");
seq_printf(m, "round_robin=%d\n", sbq->round_robin);
+ seq_printf(m, "min_shallow_depth=%u\n", sbq->min_shallow_depth);
}
EXPORT_SYMBOL_GPL(sbitmap_queue_show);
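
Worked example of the new wake batch arithmetic as a standalone userspace sketch (SBQ_WAIT_QUEUES and SBQ_WAKE_BATCH are both 8 in include/linux/sbitmap.h at the time of this change): with a 256-bit map, 64 bits per word and min_shallow_depth = 4, only 16 bits are usable, so the batch drops from 8 to 2 and shallow allocators can still wake every queue.

    #include <stdio.h>

    #define SBQ_WAIT_QUEUES 8
    #define SBQ_WAKE_BATCH  8

    static unsigned int min_u(unsigned int a, unsigned int b)
    {
            return a < b ? a : b;
    }

    /* same arithmetic as sbq_calc_wake_batch() above */
    static unsigned int calc_wake_batch(unsigned int depth, unsigned int shift,
                                        unsigned int min_shallow_depth)
    {
            unsigned int shallow = min_u(1U << shift, min_shallow_depth);
            unsigned int usable = (depth >> shift) * shallow +
                                  min_u(depth & ((1U << shift) - 1), shallow);
            unsigned int batch = usable / SBQ_WAIT_QUEUES;

            if (batch < 1)
                    batch = 1;
            if (batch > SBQ_WAKE_BATCH)
                    batch = SBQ_WAKE_BATCH;
            return batch;
    }

    int main(void)
    {
            printf("%u\n", calc_wake_batch(256, 6, 4));     /* prints 2 */
            printf("%u\n", calc_wake_batch(256, 6, ~0U));   /* prints 8 */
            return 0;
    }
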
diff --git a/lib/swiotlb.c b/lib/swiotlb.c
index fece57566d45..04b68d9dffac 100644
--- a/lib/swiotlb.c
+++ b/lib/swiotlb.c
@@ -593,9 +593,8 @@ found:
}
/*
- * Allocates bounce buffer and returns its kernel virtual address.
+ * Allocates bounce buffer and returns its physical address.
*/
-
static phys_addr_t
map_single(struct device *hwdev, phys_addr_t phys, size_t size,
enum dma_data_direction dir, unsigned long attrs)
@@ -614,7 +613,7 @@ map_single(struct device *hwdev, phys_addr_t phys, size_t size,
}
/*
- * dma_addr is the kernel virtual address of the bounce buffer to unmap.
+ * tlb_addr is the physical address of the bounce buffer to unmap.
*/
void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr,
size_t size, enum dma_data_direction dir,
@@ -692,7 +691,6 @@ void swiotlb_tbl_sync_single(struct device *hwdev, phys_addr_t tlb_addr,
}
}
-#ifdef CONFIG_DMA_DIRECT_OPS
static inline bool dma_coherent_ok(struct device *dev, dma_addr_t addr,
size_t size)
{
@@ -714,7 +712,7 @@ swiotlb_alloc_buffer(struct device *dev, size_t size, dma_addr_t *dma_handle,
phys_addr = swiotlb_tbl_map_single(dev,
__phys_to_dma(dev, io_tlb_start),
- 0, size, DMA_FROM_DEVICE, 0);
+ 0, size, DMA_FROM_DEVICE, attrs);
if (phys_addr == SWIOTLB_MAP_ERROR)
goto out_warn;
@@ -727,7 +725,7 @@ swiotlb_alloc_buffer(struct device *dev, size_t size, dma_addr_t *dma_handle,
out_unmap:
dev_warn(dev, "hwdev DMA mask = 0x%016Lx, dev_addr = 0x%016Lx\n",
- (unsigned long long)(dev ? dev->coherent_dma_mask : 0),
+ (unsigned long long)dev->coherent_dma_mask,
(unsigned long long)*dma_handle);
/*
@@ -737,7 +735,7 @@ out_unmap:
swiotlb_tbl_unmap_single(dev, phys_addr, size, DMA_TO_DEVICE,
DMA_ATTR_SKIP_CPU_SYNC);
out_warn:
- if ((attrs & DMA_ATTR_NO_WARN) && printk_ratelimit()) {
+ if (!(attrs & DMA_ATTR_NO_WARN) && printk_ratelimit()) {
dev_warn(dev,
"swiotlb: coherent allocation failed, size=%zu\n",
size);
@@ -764,7 +762,6 @@ static bool swiotlb_free_buffer(struct device *dev, size_t size,
DMA_ATTR_SKIP_CPU_SYNC);
return true;
}
-#endif
static void
swiotlb_full(struct device *dev, size_t size, enum dma_data_direction dir,
@@ -1045,7 +1042,6 @@ swiotlb_dma_supported(struct device *hwdev, u64 mask)
return __phys_to_dma(hwdev, io_tlb_end - 1) <= mask;
}
-#ifdef CONFIG_DMA_DIRECT_OPS
void *swiotlb_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle,
gfp_t gfp, unsigned long attrs)
{
@@ -1089,4 +1085,3 @@ const struct dma_map_ops swiotlb_dma_ops = {
.unmap_page = swiotlb_unmap_page,
.dma_supported = dma_direct_supported,
};
-#endif /* CONFIG_DMA_DIRECT_OPS */
diff --git a/lib/test_bitmap.c b/lib/test_bitmap.c
index de16f7869fb1..6cd7d0740005 100644
--- a/lib/test_bitmap.c
+++ b/lib/test_bitmap.c
@@ -331,23 +331,32 @@ static void noinline __init test_mem_optimisations(void)
unsigned int start, nbits;
for (start = 0; start < 1024; start += 8) {
- memset(bmap1, 0x5a, sizeof(bmap1));
- memset(bmap2, 0x5a, sizeof(bmap2));
for (nbits = 0; nbits < 1024 - start; nbits += 8) {
+ memset(bmap1, 0x5a, sizeof(bmap1));
+ memset(bmap2, 0x5a, sizeof(bmap2));
+
bitmap_set(bmap1, start, nbits);
__bitmap_set(bmap2, start, nbits);
- if (!bitmap_equal(bmap1, bmap2, 1024))
+ if (!bitmap_equal(bmap1, bmap2, 1024)) {
printk("set not equal %d %d\n", start, nbits);
- if (!__bitmap_equal(bmap1, bmap2, 1024))
+ failed_tests++;
+ }
+ if (!__bitmap_equal(bmap1, bmap2, 1024)) {
printk("set not __equal %d %d\n", start, nbits);
+ failed_tests++;
+ }
bitmap_clear(bmap1, start, nbits);
__bitmap_clear(bmap2, start, nbits);
- if (!bitmap_equal(bmap1, bmap2, 1024))
+ if (!bitmap_equal(bmap1, bmap2, 1024)) {
printk("clear not equal %d %d\n", start, nbits);
- if (!__bitmap_equal(bmap1, bmap2, 1024))
+ failed_tests++;
+ }
+ if (!__bitmap_equal(bmap1, bmap2, 1024)) {
printk("clear not __equal %d %d\n", start,
nbits);
+ failed_tests++;
+ }
}
}
}
diff --git a/lib/textsearch.c b/lib/textsearch.c
index 0b79908dfe89..5939549c0e7b 100644
--- a/lib/textsearch.c
+++ b/lib/textsearch.c
@@ -10,7 +10,10 @@
* Pablo Neira Ayuso <pablo@netfilter.org>
*
* ==========================================================================
- *
+ */
+
+/**
+ * DOC: ts_intro
* INTRODUCTION
*
* The textsearch infrastructure provides text searching facilities for
@@ -19,7 +22,9 @@
*
* ARCHITECTURE
*
- * User
+ * .. code-block:: none
+ *
+ * User
* +----------------+
* | finish()|<--------------(6)-----------------+
* |get_next_block()|<--------------(5)---------------+ |
@@ -33,21 +38,21 @@
* | (3)|----->| find()/next() |-----------+ |
* | (7)|----->| destroy() |----------------------+
* +----------------+ +---------------+
- *
- * (1) User configures a search by calling _prepare() specifying the
- * search parameters such as the pattern and algorithm name.
+ *
+ * (1) User configures a search by calling textsearch_prepare() specifying
+ * the search parameters such as the pattern and algorithm name.
* (2) Core requests the algorithm to allocate and initialize a search
* configuration according to the specified parameters.
- * (3) User starts the search(es) by calling _find() or _next() to
- * fetch subsequent occurrences. A state variable is provided
- * to the algorithm to store persistent variables.
+ * (3) User starts the search(es) by calling textsearch_find() or
+ * textsearch_next() to fetch subsequent occurrences. A state variable
+ * is provided to the algorithm to store persistent variables.
* (4) Core eventually resets the search offset and forwards the find()
* request to the algorithm.
* (5) Algorithm calls get_next_block() provided by the user continuously
* to fetch the data to be searched in block by block.
* (6) Algorithm invokes finish() after the last call to get_next_block
* to clean up any leftovers from get_next_block. (Optional)
- * (7) User destroys the configuration by calling _destroy().
+ * (7) User destroys the configuration by calling textsearch_destroy().
* (8) Core notifies the algorithm to destroy algorithm specific
* allocations. (Optional)
*
@@ -62,9 +67,10 @@
* amount of times and even in parallel as long as a separate struct
* ts_state variable is provided to every instance.
*
- * The actual search is performed by either calling textsearch_find_-
- * continuous() for linear data or by providing an own get_next_block()
- * implementation and calling textsearch_find(). Both functions return
+ * The actual search is performed by either calling
+ * textsearch_find_continuous() for linear data or by providing
+ * an own get_next_block() implementation and
+ * calling textsearch_find(). Both functions return
* the position of the first occurrence of the pattern or UINT_MAX if
* no match was found. Subsequent occurrences can be found by calling
* textsearch_next() regardless of the linearity of the data.
@@ -72,7 +78,7 @@
* Once you're done using a configuration it must be given back via
* textsearch_destroy.
*
- * EXAMPLE
+ * EXAMPLE::
*
* int pos;
* struct ts_config *conf;
@@ -87,13 +93,13 @@
* goto errout;
* }
*
- * pos = textsearch_find_continuous(conf, &state, example, strlen(example));
+ * pos = textsearch_find_continuous(conf, \&state, example, strlen(example));
* if (pos != UINT_MAX)
- * panic("Oh my god, dancing chickens at %d\n", pos);
+ * panic("Oh my god, dancing chickens at \%d\n", pos);
*
* textsearch_destroy(conf);
- * ==========================================================================
*/
+/* ========================================================================== */
#include <linux/module.h>
#include <linux/types.h>
@@ -225,7 +231,7 @@ static unsigned int get_linear_data(unsigned int consumed, const u8 **dst,
*
* Returns the position of first occurrence of the pattern or
* %UINT_MAX if no occurrence was found.
- */
+ */
unsigned int textsearch_find_continuous(struct ts_config *conf,
struct ts_state *state,
const void *data, unsigned int len)
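
For reference, the kernel-doc example above expanded into a self-contained sketch (pattern, algorithm choice and error handling are illustrative, not prescriptive):

    #include <linux/err.h>
    #include <linux/errno.h>
    #include <linux/gfp.h>
    #include <linux/kernel.h>
    #include <linux/string.h>
    #include <linux/textsearch.h>

    static int example_search(const char *haystack)
    {
            struct ts_config *conf;
            struct ts_state state;
            unsigned int pos;

            conf = textsearch_prepare("kmp", "chicken", 7, GFP_KERNEL, TS_AUTOLOAD);
            if (IS_ERR(conf))
                    return PTR_ERR(conf);

            pos = textsearch_find_continuous(conf, &state, haystack, strlen(haystack));
            textsearch_destroy(conf);

            return pos == UINT_MAX ? -ENOENT : (int)pos;
    }
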
diff --git a/lib/vsprintf.c b/lib/vsprintf.c
index 30c0cb8cc9bc..23920c5ff728 100644
--- a/lib/vsprintf.c
+++ b/lib/vsprintf.c
@@ -1669,19 +1669,22 @@ char *pointer_string(char *buf, char *end, const void *ptr,
return number(buf, end, (unsigned long int)ptr, spec);
}
-static bool have_filled_random_ptr_key __read_mostly;
+static DEFINE_STATIC_KEY_TRUE(not_filled_random_ptr_key);
static siphash_key_t ptr_key __read_mostly;
-static void fill_random_ptr_key(struct random_ready_callback *unused)
+static void enable_ptr_key_workfn(struct work_struct *work)
{
get_random_bytes(&ptr_key, sizeof(ptr_key));
- /*
- * have_filled_random_ptr_key==true is dependent on get_random_bytes().
- * ptr_to_id() needs to see have_filled_random_ptr_key==true
- * after get_random_bytes() returns.
- */
- smp_mb();
- WRITE_ONCE(have_filled_random_ptr_key, true);
+ /* Needs to run from preemptible context */
+ static_branch_disable(&not_filled_random_ptr_key);
+}
+
+static DECLARE_WORK(enable_ptr_key_work, enable_ptr_key_workfn);
+
+static void fill_random_ptr_key(struct random_ready_callback *unused)
+{
+ /* This may be in an interrupt handler. */
+ queue_work(system_unbound_wq, &enable_ptr_key_work);
}
static struct random_ready_callback random_ready = {
@@ -1695,7 +1698,8 @@ static int __init initialize_ptr_random(void)
if (!ret) {
return 0;
} else if (ret == -EALREADY) {
- fill_random_ptr_key(&random_ready);
+ /* This is in preemptible context */
+ enable_ptr_key_workfn(&enable_ptr_key_work);
return 0;
}
@@ -1709,7 +1713,7 @@ static char *ptr_to_id(char *buf, char *end, void *ptr, struct printf_spec spec)
unsigned long hashval;
const int default_width = 2 * sizeof(ptr);
- if (unlikely(!have_filled_random_ptr_key)) {
+ if (static_branch_unlikely(&not_filled_random_ptr_key)) {
spec.field_width = default_width;
/* string length must be less than default_width */
return string(buf, end, "(ptrval)", spec);
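
The vsprintf change above is an instance of a more general pattern: a static key that must be flipped from preemptible context, triggered by a notifier that may run in interrupt context, so the flip is deferred to a workqueue. A generic restatement of that pattern with hypothetical names:

    #include <linux/jump_label.h>
    #include <linux/workqueue.h>

    static DEFINE_STATIC_KEY_TRUE(feature_not_ready);

    static void feature_enable_workfn(struct work_struct *work)
    {
            /* static_branch_disable() may sleep, so run it from a workqueue */
            static_branch_disable(&feature_not_ready);
    }
    static DECLARE_WORK(feature_enable_work, feature_enable_workfn);

    /* may be called from interrupt context */
    static void feature_ready_notifier(void)
    {
            queue_work(system_unbound_wq, &feature_enable_work);
    }

    static bool feature_is_ready(void)
    {
            return !static_branch_unlikely(&feature_not_ready);
    }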