From 45615a9baf4661f953887988eeb4de69fd6cd96e Mon Sep 17 00:00:00 2001 From: Xiaotong Lu Date: Fri, 1 Jun 2018 11:36:22 -0700 Subject: Input: add Spreadtrum vibrator driver This patch adds the Spreadtrum vibrator driver, which is embedded in the Spreadtrum SC27xx series PMICs. Signed-off-by: Xiaotong Lu Signed-off-by: Baolin Wang Signed-off-by: Dmitry Torokhov --- .../bindings/input/sprd,sc27xx-vibra.txt | 23 ++++++++++++++++++++++ 1 file changed, 23 insertions(+) create mode 100644 Documentation/devicetree/bindings/input/sprd,sc27xx-vibra.txt (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/input/sprd,sc27xx-vibra.txt b/Documentation/devicetree/bindings/input/sprd,sc27xx-vibra.txt new file mode 100644 index 000000000000..f2ec0d4f2dff --- /dev/null +++ b/Documentation/devicetree/bindings/input/sprd,sc27xx-vibra.txt @@ -0,0 +1,23 @@ +Spreadtrum SC27xx PMIC Vibrator + +Required properties: +- compatible: should be "sprd,sc2731-vibrator". +- reg: address of vibrator control register. + +Example : + + sc2731_pmic: pmic@0 { + compatible = "sprd,sc2731"; + reg = <0>; + spi-max-frequency = <26000000>; + interrupts = ; + interrupt-controller; + #interrupt-cells = <2>; + #address-cells = <1>; + #size-cells = <0>; + + vibrator@eb4 { + compatible = "sprd,sc2731-vibrator"; + reg = <0xeb4>; + }; + }; -- cgit v1.2.3 From cf65a0f6f6ff7631ba0ac0513a14ca5b65320d80 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 12 Jun 2018 19:01:45 +0200 Subject: dma-mapping: move all DMA mapping code to kernel/dma Currently the code is split over various files with dma- prefixes in the lib/ and drivers/base directories, and the number of files keeps growing. Move them into a single directory to keep the code together and remove the file name prefixes. To match the irq infrastructure this directory is placed under the kernel/ directory. 
Signed-off-by: Christoph Hellwig --- Documentation/driver-api/infrastructure.rst | 4 +- MAINTAINERS | 9 +- drivers/base/Makefile | 3 - drivers/base/dma-coherent.c | 434 ------- drivers/base/dma-contiguous.c | 278 ----- drivers/base/dma-mapping.c | 345 ------ include/linux/dma-contiguous.h | 2 +- init/Kconfig | 4 - kernel/Makefile | 1 + kernel/dma/Kconfig | 50 + kernel/dma/Makefile | 11 + kernel/dma/coherent.c | 434 +++++++ kernel/dma/contiguous.c | 278 +++++ kernel/dma/debug.c | 1773 +++++++++++++++++++++++++++ kernel/dma/direct.c | 204 +++ kernel/dma/mapping.c | 345 ++++++ kernel/dma/noncoherent.c | 102 ++ kernel/dma/swiotlb.c | 1087 ++++++++++++++++ kernel/dma/virt.c | 59 + lib/Kconfig | 47 +- lib/Makefile | 6 - lib/dma-debug.c | 1773 --------------------------- lib/dma-direct.c | 204 --- lib/dma-noncoherent.c | 102 -- lib/dma-virt.c | 61 - lib/swiotlb.c | 1087 ---------------- 26 files changed, 4350 insertions(+), 4353 deletions(-) delete mode 100644 drivers/base/dma-coherent.c delete mode 100644 drivers/base/dma-contiguous.c delete mode 100644 drivers/base/dma-mapping.c create mode 100644 kernel/dma/Kconfig create mode 100644 kernel/dma/Makefile create mode 100644 kernel/dma/coherent.c create mode 100644 kernel/dma/contiguous.c create mode 100644 kernel/dma/debug.c create mode 100644 kernel/dma/direct.c create mode 100644 kernel/dma/mapping.c create mode 100644 kernel/dma/noncoherent.c create mode 100644 kernel/dma/swiotlb.c create mode 100644 kernel/dma/virt.c delete mode 100644 lib/dma-debug.c delete mode 100644 lib/dma-direct.c delete mode 100644 lib/dma-noncoherent.c delete mode 100644 lib/dma-virt.c delete mode 100644 lib/swiotlb.c (limited to 'Documentation') diff --git a/Documentation/driver-api/infrastructure.rst b/Documentation/driver-api/infrastructure.rst index bee1b9a1702f..6172f3cc3d0b 100644 --- a/Documentation/driver-api/infrastructure.rst +++ b/Documentation/driver-api/infrastructure.rst @@ -49,10 +49,10 @@ Device Drivers Base Device Drivers DMA 
Management ----------------------------- -.. kernel-doc:: drivers/base/dma-coherent.c +.. kernel-doc:: kernel/dma/coherent.c :export: -.. kernel-doc:: drivers/base/dma-mapping.c +.. kernel-doc:: kernel/dma/mapping.c :export: Device drivers PnP support diff --git a/MAINTAINERS b/MAINTAINERS index c13b9fb3be0b..a6844a9e2f64 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -4359,12 +4359,7 @@ L: iommu@lists.linux-foundation.org T: git git://git.infradead.org/users/hch/dma-mapping.git W: http://git.infradead.org/users/hch/dma-mapping.git S: Supported -F: lib/dma-debug.c -F: lib/dma-direct.c -F: lib/dma-noncoherent.c -F: lib/dma-virt.c -F: drivers/base/dma-mapping.c -F: drivers/base/dma-coherent.c +F: kernel/dma/ F: include/asm-generic/dma-mapping.h F: include/linux/dma-direct.h F: include/linux/dma-mapping.h @@ -13642,7 +13637,7 @@ M: Konrad Rzeszutek Wilk L: iommu@lists.linux-foundation.org T: git git://git.kernel.org/pub/scm/linux/kernel/git/konrad/swiotlb.git S: Supported -F: lib/swiotlb.c +F: kernel/dma/swiotlb.c F: arch/*/kernel/pci-swiotlb.c F: include/linux/swiotlb.h diff --git a/drivers/base/Makefile b/drivers/base/Makefile index b074f242a435..704f44295810 100644 --- a/drivers/base/Makefile +++ b/drivers/base/Makefile @@ -8,10 +8,7 @@ obj-y := component.o core.o bus.o dd.o syscore.o \ topology.o container.o property.o cacheinfo.o \ devcon.o obj-$(CONFIG_DEVTMPFS) += devtmpfs.o -obj-$(CONFIG_DMA_CMA) += dma-contiguous.o obj-y += power/ -obj-$(CONFIG_HAS_DMA) += dma-mapping.o -obj-$(CONFIG_HAVE_GENERIC_DMA_COHERENT) += dma-coherent.o obj-$(CONFIG_ISA_BUS_API) += isa.o obj-y += firmware_loader/ obj-$(CONFIG_NUMA) += node.o diff --git a/drivers/base/dma-coherent.c b/drivers/base/dma-coherent.c deleted file mode 100644 index 597d40893862..000000000000 --- a/drivers/base/dma-coherent.c +++ /dev/null @@ -1,434 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Coherent per-device memory handling. 
- * Borrowed from i386 - */ -#include -#include -#include -#include -#include - -struct dma_coherent_mem { - void *virt_base; - dma_addr_t device_base; - unsigned long pfn_base; - int size; - int flags; - unsigned long *bitmap; - spinlock_t spinlock; - bool use_dev_dma_pfn_offset; -}; - -static struct dma_coherent_mem *dma_coherent_default_memory __ro_after_init; - -static inline struct dma_coherent_mem *dev_get_coherent_memory(struct device *dev) -{ - if (dev && dev->dma_mem) - return dev->dma_mem; - return NULL; -} - -static inline dma_addr_t dma_get_device_base(struct device *dev, - struct dma_coherent_mem * mem) -{ - if (mem->use_dev_dma_pfn_offset) - return (mem->pfn_base - dev->dma_pfn_offset) << PAGE_SHIFT; - else - return mem->device_base; -} - -static int dma_init_coherent_memory( - phys_addr_t phys_addr, dma_addr_t device_addr, size_t size, int flags, - struct dma_coherent_mem **mem) -{ - struct dma_coherent_mem *dma_mem = NULL; - void __iomem *mem_base = NULL; - int pages = size >> PAGE_SHIFT; - int bitmap_size = BITS_TO_LONGS(pages) * sizeof(long); - int ret; - - if (!size) { - ret = -EINVAL; - goto out; - } - - mem_base = memremap(phys_addr, size, MEMREMAP_WC); - if (!mem_base) { - ret = -EINVAL; - goto out; - } - dma_mem = kzalloc(sizeof(struct dma_coherent_mem), GFP_KERNEL); - if (!dma_mem) { - ret = -ENOMEM; - goto out; - } - dma_mem->bitmap = kzalloc(bitmap_size, GFP_KERNEL); - if (!dma_mem->bitmap) { - ret = -ENOMEM; - goto out; - } - - dma_mem->virt_base = mem_base; - dma_mem->device_base = device_addr; - dma_mem->pfn_base = PFN_DOWN(phys_addr); - dma_mem->size = pages; - dma_mem->flags = flags; - spin_lock_init(&dma_mem->spinlock); - - *mem = dma_mem; - return 0; - -out: - kfree(dma_mem); - if (mem_base) - memunmap(mem_base); - return ret; -} - -static void dma_release_coherent_memory(struct dma_coherent_mem *mem) -{ - if (!mem) - return; - - memunmap(mem->virt_base); - kfree(mem->bitmap); - kfree(mem); -} - -static int 
dma_assign_coherent_memory(struct device *dev, - struct dma_coherent_mem *mem) -{ - if (!dev) - return -ENODEV; - - if (dev->dma_mem) - return -EBUSY; - - dev->dma_mem = mem; - return 0; -} - -int dma_declare_coherent_memory(struct device *dev, phys_addr_t phys_addr, - dma_addr_t device_addr, size_t size, int flags) -{ - struct dma_coherent_mem *mem; - int ret; - - ret = dma_init_coherent_memory(phys_addr, device_addr, size, flags, &mem); - if (ret) - return ret; - - ret = dma_assign_coherent_memory(dev, mem); - if (ret) - dma_release_coherent_memory(mem); - return ret; -} -EXPORT_SYMBOL(dma_declare_coherent_memory); - -void dma_release_declared_memory(struct device *dev) -{ - struct dma_coherent_mem *mem = dev->dma_mem; - - if (!mem) - return; - dma_release_coherent_memory(mem); - dev->dma_mem = NULL; -} -EXPORT_SYMBOL(dma_release_declared_memory); - -void *dma_mark_declared_memory_occupied(struct device *dev, - dma_addr_t device_addr, size_t size) -{ - struct dma_coherent_mem *mem = dev->dma_mem; - unsigned long flags; - int pos, err; - - size += device_addr & ~PAGE_MASK; - - if (!mem) - return ERR_PTR(-EINVAL); - - spin_lock_irqsave(&mem->spinlock, flags); - pos = PFN_DOWN(device_addr - dma_get_device_base(dev, mem)); - err = bitmap_allocate_region(mem->bitmap, pos, get_order(size)); - spin_unlock_irqrestore(&mem->spinlock, flags); - - if (err != 0) - return ERR_PTR(err); - return mem->virt_base + (pos << PAGE_SHIFT); -} -EXPORT_SYMBOL(dma_mark_declared_memory_occupied); - -static void *__dma_alloc_from_coherent(struct dma_coherent_mem *mem, - ssize_t size, dma_addr_t *dma_handle) -{ - int order = get_order(size); - unsigned long flags; - int pageno; - void *ret; - - spin_lock_irqsave(&mem->spinlock, flags); - - if (unlikely(size > (mem->size << PAGE_SHIFT))) - goto err; - - pageno = bitmap_find_free_region(mem->bitmap, mem->size, order); - if (unlikely(pageno < 0)) - goto err; - - /* - * Memory was found in the coherent area. 
- */ - *dma_handle = mem->device_base + (pageno << PAGE_SHIFT); - ret = mem->virt_base + (pageno << PAGE_SHIFT); - spin_unlock_irqrestore(&mem->spinlock, flags); - memset(ret, 0, size); - return ret; -err: - spin_unlock_irqrestore(&mem->spinlock, flags); - return NULL; -} - -/** - * dma_alloc_from_dev_coherent() - allocate memory from device coherent pool - * @dev: device from which we allocate memory - * @size: size of requested memory area - * @dma_handle: This will be filled with the correct dma handle - * @ret: This pointer will be filled with the virtual address - * to allocated area. - * - * This function should be only called from per-arch dma_alloc_coherent() - * to support allocation from per-device coherent memory pools. - * - * Returns 0 if dma_alloc_coherent should continue with allocating from - * generic memory areas, or !0 if dma_alloc_coherent should return @ret. - */ -int dma_alloc_from_dev_coherent(struct device *dev, ssize_t size, - dma_addr_t *dma_handle, void **ret) -{ - struct dma_coherent_mem *mem = dev_get_coherent_memory(dev); - - if (!mem) - return 0; - - *ret = __dma_alloc_from_coherent(mem, size, dma_handle); - if (*ret) - return 1; - - /* - * In the case where the allocation can not be satisfied from the - * per-device area, try to fall back to generic memory if the - * constraints allow it. 
- */ - return mem->flags & DMA_MEMORY_EXCLUSIVE; -} -EXPORT_SYMBOL(dma_alloc_from_dev_coherent); - -void *dma_alloc_from_global_coherent(ssize_t size, dma_addr_t *dma_handle) -{ - if (!dma_coherent_default_memory) - return NULL; - - return __dma_alloc_from_coherent(dma_coherent_default_memory, size, - dma_handle); -} - -static int __dma_release_from_coherent(struct dma_coherent_mem *mem, - int order, void *vaddr) -{ - if (mem && vaddr >= mem->virt_base && vaddr < - (mem->virt_base + (mem->size << PAGE_SHIFT))) { - int page = (vaddr - mem->virt_base) >> PAGE_SHIFT; - unsigned long flags; - - spin_lock_irqsave(&mem->spinlock, flags); - bitmap_release_region(mem->bitmap, page, order); - spin_unlock_irqrestore(&mem->spinlock, flags); - return 1; - } - return 0; -} - -/** - * dma_release_from_dev_coherent() - free memory to device coherent memory pool - * @dev: device from which the memory was allocated - * @order: the order of pages allocated - * @vaddr: virtual address of allocated pages - * - * This checks whether the memory was allocated from the per-device - * coherent memory pool and if so, releases that memory. - * - * Returns 1 if we correctly released the memory, or 0 if the caller should - * proceed with releasing memory from generic pools. 
- */ -int dma_release_from_dev_coherent(struct device *dev, int order, void *vaddr) -{ - struct dma_coherent_mem *mem = dev_get_coherent_memory(dev); - - return __dma_release_from_coherent(mem, order, vaddr); -} -EXPORT_SYMBOL(dma_release_from_dev_coherent); - -int dma_release_from_global_coherent(int order, void *vaddr) -{ - if (!dma_coherent_default_memory) - return 0; - - return __dma_release_from_coherent(dma_coherent_default_memory, order, - vaddr); -} - -static int __dma_mmap_from_coherent(struct dma_coherent_mem *mem, - struct vm_area_struct *vma, void *vaddr, size_t size, int *ret) -{ - if (mem && vaddr >= mem->virt_base && vaddr + size <= - (mem->virt_base + (mem->size << PAGE_SHIFT))) { - unsigned long off = vma->vm_pgoff; - int start = (vaddr - mem->virt_base) >> PAGE_SHIFT; - int user_count = vma_pages(vma); - int count = PAGE_ALIGN(size) >> PAGE_SHIFT; - - *ret = -ENXIO; - if (off < count && user_count <= count - off) { - unsigned long pfn = mem->pfn_base + start + off; - *ret = remap_pfn_range(vma, vma->vm_start, pfn, - user_count << PAGE_SHIFT, - vma->vm_page_prot); - } - return 1; - } - return 0; -} - -/** - * dma_mmap_from_dev_coherent() - mmap memory from the device coherent pool - * @dev: device from which the memory was allocated - * @vma: vm_area for the userspace memory - * @vaddr: cpu address returned by dma_alloc_from_dev_coherent - * @size: size of the memory buffer allocated - * @ret: result from remap_pfn_range() - * - * This checks whether the memory was allocated from the per-device - * coherent memory pool and if so, maps that memory to the provided vma. - * - * Returns 1 if @vaddr belongs to the device coherent pool and the caller - * should return @ret, or 0 if they should proceed with mapping memory from - * generic areas. 
- */ -int dma_mmap_from_dev_coherent(struct device *dev, struct vm_area_struct *vma, - void *vaddr, size_t size, int *ret) -{ - struct dma_coherent_mem *mem = dev_get_coherent_memory(dev); - - return __dma_mmap_from_coherent(mem, vma, vaddr, size, ret); -} -EXPORT_SYMBOL(dma_mmap_from_dev_coherent); - -int dma_mmap_from_global_coherent(struct vm_area_struct *vma, void *vaddr, - size_t size, int *ret) -{ - if (!dma_coherent_default_memory) - return 0; - - return __dma_mmap_from_coherent(dma_coherent_default_memory, vma, - vaddr, size, ret); -} - -/* - * Support for reserved memory regions defined in device tree - */ -#ifdef CONFIG_OF_RESERVED_MEM -#include -#include -#include - -static struct reserved_mem *dma_reserved_default_memory __initdata; - -static int rmem_dma_device_init(struct reserved_mem *rmem, struct device *dev) -{ - struct dma_coherent_mem *mem = rmem->priv; - int ret; - - if (!mem) { - ret = dma_init_coherent_memory(rmem->base, rmem->base, - rmem->size, - DMA_MEMORY_EXCLUSIVE, &mem); - if (ret) { - pr_err("Reserved memory: failed to init DMA memory pool at %pa, size %ld MiB\n", - &rmem->base, (unsigned long)rmem->size / SZ_1M); - return ret; - } - } - mem->use_dev_dma_pfn_offset = true; - rmem->priv = mem; - dma_assign_coherent_memory(dev, mem); - return 0; -} - -static void rmem_dma_device_release(struct reserved_mem *rmem, - struct device *dev) -{ - if (dev) - dev->dma_mem = NULL; -} - -static const struct reserved_mem_ops rmem_dma_ops = { - .device_init = rmem_dma_device_init, - .device_release = rmem_dma_device_release, -}; - -static int __init rmem_dma_setup(struct reserved_mem *rmem) -{ - unsigned long node = rmem->fdt_node; - - if (of_get_flat_dt_prop(node, "reusable", NULL)) - return -EINVAL; - -#ifdef CONFIG_ARM - if (!of_get_flat_dt_prop(node, "no-map", NULL)) { - pr_err("Reserved memory: regions without no-map are not yet supported\n"); - return -EINVAL; - } - - if (of_get_flat_dt_prop(node, "linux,dma-default", NULL)) { - 
WARN(dma_reserved_default_memory, - "Reserved memory: region for default DMA coherent area is redefined\n"); - dma_reserved_default_memory = rmem; - } -#endif - - rmem->ops = &rmem_dma_ops; - pr_info("Reserved memory: created DMA memory pool at %pa, size %ld MiB\n", - &rmem->base, (unsigned long)rmem->size / SZ_1M); - return 0; -} - -static int __init dma_init_reserved_memory(void) -{ - const struct reserved_mem_ops *ops; - int ret; - - if (!dma_reserved_default_memory) - return -ENOMEM; - - ops = dma_reserved_default_memory->ops; - - /* - * We rely on rmem_dma_device_init() does not propagate error of - * dma_assign_coherent_memory() for "NULL" device. - */ - ret = ops->device_init(dma_reserved_default_memory, NULL); - - if (!ret) { - dma_coherent_default_memory = dma_reserved_default_memory->priv; - pr_info("DMA: default coherent area is set\n"); - } - - return ret; -} - -core_initcall(dma_init_reserved_memory); - -RESERVEDMEM_OF_DECLARE(dma, "shared-dma-pool", rmem_dma_setup); -#endif diff --git a/drivers/base/dma-contiguous.c b/drivers/base/dma-contiguous.c deleted file mode 100644 index d987dcd1bd56..000000000000 --- a/drivers/base/dma-contiguous.c +++ /dev/null @@ -1,278 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0+ -/* - * Contiguous Memory Allocator for DMA mapping framework - * Copyright (c) 2010-2011 by Samsung Electronics. - * Written by: - * Marek Szyprowski - * Michal Nazarewicz - */ - -#define pr_fmt(fmt) "cma: " fmt - -#ifdef CONFIG_CMA_DEBUG -#ifndef DEBUG -# define DEBUG -#endif -#endif - -#include -#include - -#include -#include -#include -#include -#include - -#ifdef CONFIG_CMA_SIZE_MBYTES -#define CMA_SIZE_MBYTES CONFIG_CMA_SIZE_MBYTES -#else -#define CMA_SIZE_MBYTES 0 -#endif - -struct cma *dma_contiguous_default_area; - -/* - * Default global CMA area size can be defined in kernel's .config. - * This is useful mainly for distro maintainers to create a kernel - * that works correctly for most supported systems. 
- * The size can be set in bytes or as a percentage of the total memory - * in the system. - * - * Users, who want to set the size of global CMA area for their system - * should use cma= kernel parameter. - */ -static const phys_addr_t size_bytes = (phys_addr_t)CMA_SIZE_MBYTES * SZ_1M; -static phys_addr_t size_cmdline = -1; -static phys_addr_t base_cmdline; -static phys_addr_t limit_cmdline; - -static int __init early_cma(char *p) -{ - pr_debug("%s(%s)\n", __func__, p); - size_cmdline = memparse(p, &p); - if (*p != '@') - return 0; - base_cmdline = memparse(p + 1, &p); - if (*p != '-') { - limit_cmdline = base_cmdline + size_cmdline; - return 0; - } - limit_cmdline = memparse(p + 1, &p); - - return 0; -} -early_param("cma", early_cma); - -#ifdef CONFIG_CMA_SIZE_PERCENTAGE - -static phys_addr_t __init __maybe_unused cma_early_percent_memory(void) -{ - struct memblock_region *reg; - unsigned long total_pages = 0; - - /* - * We cannot use memblock_phys_mem_size() here, because - * memblock_analyze() has not been called yet. - */ - for_each_memblock(memory, reg) - total_pages += memblock_region_memory_end_pfn(reg) - - memblock_region_memory_base_pfn(reg); - - return (total_pages * CONFIG_CMA_SIZE_PERCENTAGE / 100) << PAGE_SHIFT; -} - -#else - -static inline __maybe_unused phys_addr_t cma_early_percent_memory(void) -{ - return 0; -} - -#endif - -/** - * dma_contiguous_reserve() - reserve area(s) for contiguous memory handling - * @limit: End address of the reserved memory (optional, 0 for any). - * - * This function reserves memory from early allocator. It should be - * called by arch specific code once the early allocator (memblock or bootmem) - * has been activated and all other subsystems have already allocated/reserved - * memory. 
- */ -void __init dma_contiguous_reserve(phys_addr_t limit) -{ - phys_addr_t selected_size = 0; - phys_addr_t selected_base = 0; - phys_addr_t selected_limit = limit; - bool fixed = false; - - pr_debug("%s(limit %08lx)\n", __func__, (unsigned long)limit); - - if (size_cmdline != -1) { - selected_size = size_cmdline; - selected_base = base_cmdline; - selected_limit = min_not_zero(limit_cmdline, limit); - if (base_cmdline + size_cmdline == limit_cmdline) - fixed = true; - } else { -#ifdef CONFIG_CMA_SIZE_SEL_MBYTES - selected_size = size_bytes; -#elif defined(CONFIG_CMA_SIZE_SEL_PERCENTAGE) - selected_size = cma_early_percent_memory(); -#elif defined(CONFIG_CMA_SIZE_SEL_MIN) - selected_size = min(size_bytes, cma_early_percent_memory()); -#elif defined(CONFIG_CMA_SIZE_SEL_MAX) - selected_size = max(size_bytes, cma_early_percent_memory()); -#endif - } - - if (selected_size && !dma_contiguous_default_area) { - pr_debug("%s: reserving %ld MiB for global area\n", __func__, - (unsigned long)selected_size / SZ_1M); - - dma_contiguous_reserve_area(selected_size, selected_base, - selected_limit, - &dma_contiguous_default_area, - fixed); - } -} - -/** - * dma_contiguous_reserve_area() - reserve custom contiguous area - * @size: Size of the reserved area (in bytes), - * @base: Base address of the reserved area optional, use 0 for any - * @limit: End address of the reserved memory (optional, 0 for any). - * @res_cma: Pointer to store the created cma region. - * @fixed: hint about where to place the reserved area - * - * This function reserves memory from early allocator. It should be - * called by arch specific code once the early allocator (memblock or bootmem) - * has been activated and all other subsystems have already allocated/reserved - * memory. This function allows to create custom reserved areas for specific - * devices. - * - * If @fixed is true, reserve contiguous area at exactly @base. If false, - * reserve in range from @base to @limit. 
- */ -int __init dma_contiguous_reserve_area(phys_addr_t size, phys_addr_t base, - phys_addr_t limit, struct cma **res_cma, - bool fixed) -{ - int ret; - - ret = cma_declare_contiguous(base, size, limit, 0, 0, fixed, - "reserved", res_cma); - if (ret) - return ret; - - /* Architecture specific contiguous memory fixup. */ - dma_contiguous_early_fixup(cma_get_base(*res_cma), - cma_get_size(*res_cma)); - - return 0; -} - -/** - * dma_alloc_from_contiguous() - allocate pages from contiguous area - * @dev: Pointer to device for which the allocation is performed. - * @count: Requested number of pages. - * @align: Requested alignment of pages (in PAGE_SIZE order). - * @gfp_mask: GFP flags to use for this allocation. - * - * This function allocates memory buffer for specified device. It uses - * device specific contiguous memory area if available or the default - * global one. Requires architecture specific dev_get_cma_area() helper - * function. - */ -struct page *dma_alloc_from_contiguous(struct device *dev, size_t count, - unsigned int align, gfp_t gfp_mask) -{ - if (align > CONFIG_CMA_ALIGNMENT) - align = CONFIG_CMA_ALIGNMENT; - - return cma_alloc(dev_get_cma_area(dev), count, align, gfp_mask); -} - -/** - * dma_release_from_contiguous() - release allocated pages - * @dev: Pointer to device for which the pages were allocated. - * @pages: Allocated pages. - * @count: Number of allocated pages. - * - * This function releases memory allocated by dma_alloc_from_contiguous(). - * It returns false when provided pages do not belong to contiguous area and - * true otherwise. 
- */ -bool dma_release_from_contiguous(struct device *dev, struct page *pages, - int count) -{ - return cma_release(dev_get_cma_area(dev), pages, count); -} - -/* - * Support for reserved memory regions defined in device tree - */ -#ifdef CONFIG_OF_RESERVED_MEM -#include -#include -#include - -#undef pr_fmt -#define pr_fmt(fmt) fmt - -static int rmem_cma_device_init(struct reserved_mem *rmem, struct device *dev) -{ - dev_set_cma_area(dev, rmem->priv); - return 0; -} - -static void rmem_cma_device_release(struct reserved_mem *rmem, - struct device *dev) -{ - dev_set_cma_area(dev, NULL); -} - -static const struct reserved_mem_ops rmem_cma_ops = { - .device_init = rmem_cma_device_init, - .device_release = rmem_cma_device_release, -}; - -static int __init rmem_cma_setup(struct reserved_mem *rmem) -{ - phys_addr_t align = PAGE_SIZE << max(MAX_ORDER - 1, pageblock_order); - phys_addr_t mask = align - 1; - unsigned long node = rmem->fdt_node; - struct cma *cma; - int err; - - if (!of_get_flat_dt_prop(node, "reusable", NULL) || - of_get_flat_dt_prop(node, "no-map", NULL)) - return -EINVAL; - - if ((rmem->base & mask) || (rmem->size & mask)) { - pr_err("Reserved memory: incorrect alignment of CMA region\n"); - return -EINVAL; - } - - err = cma_init_reserved_mem(rmem->base, rmem->size, 0, rmem->name, &cma); - if (err) { - pr_err("Reserved memory: unable to setup CMA region\n"); - return err; - } - /* Architecture specific contiguous memory fixup. 
*/ - dma_contiguous_early_fixup(rmem->base, rmem->size); - - if (of_get_flat_dt_prop(node, "linux,cma-default", NULL)) - dma_contiguous_set_default(cma); - - rmem->ops = &rmem_cma_ops; - rmem->priv = cma; - - pr_info("Reserved memory: created CMA memory pool at %pa, size %ld MiB\n", - &rmem->base, (unsigned long)rmem->size / SZ_1M); - - return 0; -} -RESERVEDMEM_OF_DECLARE(cma, "shared-dma-pool", rmem_cma_setup); -#endif diff --git a/drivers/base/dma-mapping.c b/drivers/base/dma-mapping.c deleted file mode 100644 index f831a582209c..000000000000 --- a/drivers/base/dma-mapping.c +++ /dev/null @@ -1,345 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * drivers/base/dma-mapping.c - arch-independent dma-mapping routines - * - * Copyright (c) 2006 SUSE Linux Products GmbH - * Copyright (c) 2006 Tejun Heo - */ - -#include -#include -#include -#include -#include -#include -#include - -/* - * Managed DMA API - */ -struct dma_devres { - size_t size; - void *vaddr; - dma_addr_t dma_handle; - unsigned long attrs; -}; - -static void dmam_release(struct device *dev, void *res) -{ - struct dma_devres *this = res; - - dma_free_attrs(dev, this->size, this->vaddr, this->dma_handle, - this->attrs); -} - -static int dmam_match(struct device *dev, void *res, void *match_data) -{ - struct dma_devres *this = res, *match = match_data; - - if (this->vaddr == match->vaddr) { - WARN_ON(this->size != match->size || - this->dma_handle != match->dma_handle); - return 1; - } - return 0; -} - -/** - * dmam_alloc_coherent - Managed dma_alloc_coherent() - * @dev: Device to allocate coherent memory for - * @size: Size of allocation - * @dma_handle: Out argument for allocated DMA handle - * @gfp: Allocation flags - * - * Managed dma_alloc_coherent(). Memory allocated using this function - * will be automatically released on driver detach. - * - * RETURNS: - * Pointer to allocated memory on success, NULL on failure. 
- */ -void *dmam_alloc_coherent(struct device *dev, size_t size, - dma_addr_t *dma_handle, gfp_t gfp) -{ - struct dma_devres *dr; - void *vaddr; - - dr = devres_alloc(dmam_release, sizeof(*dr), gfp); - if (!dr) - return NULL; - - vaddr = dma_alloc_coherent(dev, size, dma_handle, gfp); - if (!vaddr) { - devres_free(dr); - return NULL; - } - - dr->vaddr = vaddr; - dr->dma_handle = *dma_handle; - dr->size = size; - - devres_add(dev, dr); - - return vaddr; -} -EXPORT_SYMBOL(dmam_alloc_coherent); - -/** - * dmam_free_coherent - Managed dma_free_coherent() - * @dev: Device to free coherent memory for - * @size: Size of allocation - * @vaddr: Virtual address of the memory to free - * @dma_handle: DMA handle of the memory to free - * - * Managed dma_free_coherent(). - */ -void dmam_free_coherent(struct device *dev, size_t size, void *vaddr, - dma_addr_t dma_handle) -{ - struct dma_devres match_data = { size, vaddr, dma_handle }; - - dma_free_coherent(dev, size, vaddr, dma_handle); - WARN_ON(devres_destroy(dev, dmam_release, dmam_match, &match_data)); -} -EXPORT_SYMBOL(dmam_free_coherent); - -/** - * dmam_alloc_attrs - Managed dma_alloc_attrs() - * @dev: Device to allocate non_coherent memory for - * @size: Size of allocation - * @dma_handle: Out argument for allocated DMA handle - * @gfp: Allocation flags - * @attrs: Flags in the DMA_ATTR_* namespace. - * - * Managed dma_alloc_attrs(). Memory allocated using this function will be - * automatically released on driver detach. - * - * RETURNS: - * Pointer to allocated memory on success, NULL on failure. 
- */ -void *dmam_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle, - gfp_t gfp, unsigned long attrs) -{ - struct dma_devres *dr; - void *vaddr; - - dr = devres_alloc(dmam_release, sizeof(*dr), gfp); - if (!dr) - return NULL; - - vaddr = dma_alloc_attrs(dev, size, dma_handle, gfp, attrs); - if (!vaddr) { - devres_free(dr); - return NULL; - } - - dr->vaddr = vaddr; - dr->dma_handle = *dma_handle; - dr->size = size; - dr->attrs = attrs; - - devres_add(dev, dr); - - return vaddr; -} -EXPORT_SYMBOL(dmam_alloc_attrs); - -#ifdef CONFIG_HAVE_GENERIC_DMA_COHERENT - -static void dmam_coherent_decl_release(struct device *dev, void *res) -{ - dma_release_declared_memory(dev); -} - -/** - * dmam_declare_coherent_memory - Managed dma_declare_coherent_memory() - * @dev: Device to declare coherent memory for - * @phys_addr: Physical address of coherent memory to be declared - * @device_addr: Device address of coherent memory to be declared - * @size: Size of coherent memory to be declared - * @flags: Flags - * - * Managed dma_declare_coherent_memory(). - * - * RETURNS: - * 0 on success, -errno on failure. - */ -int dmam_declare_coherent_memory(struct device *dev, phys_addr_t phys_addr, - dma_addr_t device_addr, size_t size, int flags) -{ - void *res; - int rc; - - res = devres_alloc(dmam_coherent_decl_release, 0, GFP_KERNEL); - if (!res) - return -ENOMEM; - - rc = dma_declare_coherent_memory(dev, phys_addr, device_addr, size, - flags); - if (!rc) - devres_add(dev, res); - else - devres_free(res); - - return rc; -} -EXPORT_SYMBOL(dmam_declare_coherent_memory); - -/** - * dmam_release_declared_memory - Managed dma_release_declared_memory(). - * @dev: Device to release declared coherent memory for - * - * Managed dmam_release_declared_memory(). 
- */ -void dmam_release_declared_memory(struct device *dev) -{ - WARN_ON(devres_destroy(dev, dmam_coherent_decl_release, NULL, NULL)); -} -EXPORT_SYMBOL(dmam_release_declared_memory); - -#endif - -/* - * Create scatter-list for the already allocated DMA buffer. - */ -int dma_common_get_sgtable(struct device *dev, struct sg_table *sgt, - void *cpu_addr, dma_addr_t handle, size_t size) -{ - struct page *page = virt_to_page(cpu_addr); - int ret; - - ret = sg_alloc_table(sgt, 1, GFP_KERNEL); - if (unlikely(ret)) - return ret; - - sg_set_page(sgt->sgl, page, PAGE_ALIGN(size), 0); - return 0; -} -EXPORT_SYMBOL(dma_common_get_sgtable); - -/* - * Create userspace mapping for the DMA-coherent memory. - */ -int dma_common_mmap(struct device *dev, struct vm_area_struct *vma, - void *cpu_addr, dma_addr_t dma_addr, size_t size) -{ - int ret = -ENXIO; -#ifndef CONFIG_ARCH_NO_COHERENT_DMA_MMAP - unsigned long user_count = vma_pages(vma); - unsigned long count = PAGE_ALIGN(size) >> PAGE_SHIFT; - unsigned long off = vma->vm_pgoff; - - vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); - - if (dma_mmap_from_dev_coherent(dev, vma, cpu_addr, size, &ret)) - return ret; - - if (off < count && user_count <= (count - off)) - ret = remap_pfn_range(vma, vma->vm_start, - page_to_pfn(virt_to_page(cpu_addr)) + off, - user_count << PAGE_SHIFT, - vma->vm_page_prot); -#endif /* !CONFIG_ARCH_NO_COHERENT_DMA_MMAP */ - - return ret; -} -EXPORT_SYMBOL(dma_common_mmap); - -#ifdef CONFIG_MMU -static struct vm_struct *__dma_common_pages_remap(struct page **pages, - size_t size, unsigned long vm_flags, pgprot_t prot, - const void *caller) -{ - struct vm_struct *area; - - area = get_vm_area_caller(size, vm_flags, caller); - if (!area) - return NULL; - - if (map_vm_area(area, prot, pages)) { - vunmap(area->addr); - return NULL; - } - - return area; -} - -/* - * remaps an array of PAGE_SIZE pages into another vm_area - * Cannot be used in non-sleeping contexts - */ -void *dma_common_pages_remap(struct 
page **pages, size_t size, - unsigned long vm_flags, pgprot_t prot, - const void *caller) -{ - struct vm_struct *area; - - area = __dma_common_pages_remap(pages, size, vm_flags, prot, caller); - if (!area) - return NULL; - - area->pages = pages; - - return area->addr; -} - -/* - * remaps an allocated contiguous region into another vm_area. - * Cannot be used in non-sleeping contexts - */ - -void *dma_common_contiguous_remap(struct page *page, size_t size, - unsigned long vm_flags, - pgprot_t prot, const void *caller) -{ - int i; - struct page **pages; - struct vm_struct *area; - - pages = kmalloc(sizeof(struct page *) << get_order(size), GFP_KERNEL); - if (!pages) - return NULL; - - for (i = 0; i < (size >> PAGE_SHIFT); i++) - pages[i] = nth_page(page, i); - - area = __dma_common_pages_remap(pages, size, vm_flags, prot, caller); - - kfree(pages); - - if (!area) - return NULL; - return area->addr; -} - -/* - * unmaps a range previously mapped by dma_common_*_remap - */ -void dma_common_free_remap(void *cpu_addr, size_t size, unsigned long vm_flags) -{ - struct vm_struct *area = find_vm_area(cpu_addr); - - if (!area || (area->flags & vm_flags) != vm_flags) { - WARN(1, "trying to free invalid coherent area: %p\n", cpu_addr); - return; - } - - unmap_kernel_range((unsigned long)cpu_addr, PAGE_ALIGN(size)); - vunmap(cpu_addr); -} -#endif - -/* - * enables DMA API use for a device - */ -int dma_configure(struct device *dev) -{ - if (dev->bus->dma_configure) - return dev->bus->dma_configure(dev); - return 0; -} - -void dma_deconfigure(struct device *dev) -{ - of_dma_deconfigure(dev); - acpi_dma_deconfigure(dev); -} diff --git a/include/linux/dma-contiguous.h b/include/linux/dma-contiguous.h index b67bf6ac907d..3c5a4cb3eb95 100644 --- a/include/linux/dma-contiguous.h +++ b/include/linux/dma-contiguous.h @@ -48,7 +48,7 @@ * CMA should not be used by the device drivers directly. It is * only a helper framework for dma-mapping subsystem. 
* - * For more information, see kernel-docs in drivers/base/dma-contiguous.c + * For more information, see kernel-docs in kernel/dma/contiguous.c */ #ifdef __KERNEL__ diff --git a/init/Kconfig b/init/Kconfig index 5a52f07259a2..fde3d09e8b27 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1719,10 +1719,6 @@ source "arch/Kconfig" endmenu # General setup -config HAVE_GENERIC_DMA_COHERENT - bool - default n - config RT_MUTEXES bool diff --git a/kernel/Makefile b/kernel/Makefile index d2001624fe7a..04bc07c2b42a 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -41,6 +41,7 @@ obj-y += printk/ obj-y += irq/ obj-y += rcu/ obj-y += livepatch/ +obj-y += dma/ obj-$(CONFIG_CHECKPOINT_RESTORE) += kcmp.o obj-$(CONFIG_FREEZER) += freezer.o diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig new file mode 100644 index 000000000000..9bd54304446f --- /dev/null +++ b/kernel/dma/Kconfig @@ -0,0 +1,50 @@ + +config HAS_DMA + bool + depends on !NO_DMA + default y + +config NEED_SG_DMA_LENGTH + bool + +config NEED_DMA_MAP_STATE + bool + +config ARCH_DMA_ADDR_T_64BIT + def_bool 64BIT || PHYS_ADDR_T_64BIT + +config HAVE_GENERIC_DMA_COHERENT + bool + +config ARCH_HAS_SYNC_DMA_FOR_DEVICE + bool + +config ARCH_HAS_SYNC_DMA_FOR_CPU + bool + select NEED_DMA_MAP_STATE + +config DMA_DIRECT_OPS + bool + depends on HAS_DMA + +config DMA_NONCOHERENT_OPS + bool + depends on HAS_DMA + select DMA_DIRECT_OPS + +config DMA_NONCOHERENT_MMAP + bool + depends on DMA_NONCOHERENT_OPS + +config DMA_NONCOHERENT_CACHE_SYNC + bool + depends on DMA_NONCOHERENT_OPS + +config DMA_VIRT_OPS + bool + depends on HAS_DMA + +config SWIOTLB + bool + select DMA_DIRECT_OPS + select NEED_DMA_MAP_STATE diff --git a/kernel/dma/Makefile b/kernel/dma/Makefile new file mode 100644 index 000000000000..6de44e4eb454 --- /dev/null +++ b/kernel/dma/Makefile @@ -0,0 +1,11 @@ +# SPDX-License-Identifier: GPL-2.0 + +obj-$(CONFIG_HAS_DMA) += mapping.o +obj-$(CONFIG_DMA_CMA) += contiguous.o +obj-$(CONFIG_HAVE_GENERIC_DMA_COHERENT) += 
coherent.o +obj-$(CONFIG_DMA_DIRECT_OPS) += direct.o +obj-$(CONFIG_DMA_NONCOHERENT_OPS) += noncoherent.o +obj-$(CONFIG_DMA_VIRT_OPS) += virt.o +obj-$(CONFIG_DMA_API_DEBUG) += debug.o +obj-$(CONFIG_SWIOTLB) += swiotlb.o + diff --git a/kernel/dma/coherent.c b/kernel/dma/coherent.c new file mode 100644 index 000000000000..597d40893862 --- /dev/null +++ b/kernel/dma/coherent.c @@ -0,0 +1,434 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Coherent per-device memory handling. + * Borrowed from i386 + */ +#include +#include +#include +#include +#include + +struct dma_coherent_mem { + void *virt_base; + dma_addr_t device_base; + unsigned long pfn_base; + int size; + int flags; + unsigned long *bitmap; + spinlock_t spinlock; + bool use_dev_dma_pfn_offset; +}; + +static struct dma_coherent_mem *dma_coherent_default_memory __ro_after_init; + +static inline struct dma_coherent_mem *dev_get_coherent_memory(struct device *dev) +{ + if (dev && dev->dma_mem) + return dev->dma_mem; + return NULL; +} + +static inline dma_addr_t dma_get_device_base(struct device *dev, + struct dma_coherent_mem * mem) +{ + if (mem->use_dev_dma_pfn_offset) + return (mem->pfn_base - dev->dma_pfn_offset) << PAGE_SHIFT; + else + return mem->device_base; +} + +static int dma_init_coherent_memory( + phys_addr_t phys_addr, dma_addr_t device_addr, size_t size, int flags, + struct dma_coherent_mem **mem) +{ + struct dma_coherent_mem *dma_mem = NULL; + void __iomem *mem_base = NULL; + int pages = size >> PAGE_SHIFT; + int bitmap_size = BITS_TO_LONGS(pages) * sizeof(long); + int ret; + + if (!size) { + ret = -EINVAL; + goto out; + } + + mem_base = memremap(phys_addr, size, MEMREMAP_WC); + if (!mem_base) { + ret = -EINVAL; + goto out; + } + dma_mem = kzalloc(sizeof(struct dma_coherent_mem), GFP_KERNEL); + if (!dma_mem) { + ret = -ENOMEM; + goto out; + } + dma_mem->bitmap = kzalloc(bitmap_size, GFP_KERNEL); + if (!dma_mem->bitmap) { + ret = -ENOMEM; + goto out; + } + + dma_mem->virt_base = mem_base; + 
dma_mem->device_base = device_addr; + dma_mem->pfn_base = PFN_DOWN(phys_addr); + dma_mem->size = pages; + dma_mem->flags = flags; + spin_lock_init(&dma_mem->spinlock); + + *mem = dma_mem; + return 0; + +out: + kfree(dma_mem); + if (mem_base) + memunmap(mem_base); + return ret; +} + +static void dma_release_coherent_memory(struct dma_coherent_mem *mem) +{ + if (!mem) + return; + + memunmap(mem->virt_base); + kfree(mem->bitmap); + kfree(mem); +} + +static int dma_assign_coherent_memory(struct device *dev, + struct dma_coherent_mem *mem) +{ + if (!dev) + return -ENODEV; + + if (dev->dma_mem) + return -EBUSY; + + dev->dma_mem = mem; + return 0; +} + +int dma_declare_coherent_memory(struct device *dev, phys_addr_t phys_addr, + dma_addr_t device_addr, size_t size, int flags) +{ + struct dma_coherent_mem *mem; + int ret; + + ret = dma_init_coherent_memory(phys_addr, device_addr, size, flags, &mem); + if (ret) + return ret; + + ret = dma_assign_coherent_memory(dev, mem); + if (ret) + dma_release_coherent_memory(mem); + return ret; +} +EXPORT_SYMBOL(dma_declare_coherent_memory); + +void dma_release_declared_memory(struct device *dev) +{ + struct dma_coherent_mem *mem = dev->dma_mem; + + if (!mem) + return; + dma_release_coherent_memory(mem); + dev->dma_mem = NULL; +} +EXPORT_SYMBOL(dma_release_declared_memory); + +void *dma_mark_declared_memory_occupied(struct device *dev, + dma_addr_t device_addr, size_t size) +{ + struct dma_coherent_mem *mem = dev->dma_mem; + unsigned long flags; + int pos, err; + + size += device_addr & ~PAGE_MASK; + + if (!mem) + return ERR_PTR(-EINVAL); + + spin_lock_irqsave(&mem->spinlock, flags); + pos = PFN_DOWN(device_addr - dma_get_device_base(dev, mem)); + err = bitmap_allocate_region(mem->bitmap, pos, get_order(size)); + spin_unlock_irqrestore(&mem->spinlock, flags); + + if (err != 0) + return ERR_PTR(err); + return mem->virt_base + (pos << PAGE_SHIFT); +} +EXPORT_SYMBOL(dma_mark_declared_memory_occupied); + +static void 
*__dma_alloc_from_coherent(struct dma_coherent_mem *mem, + ssize_t size, dma_addr_t *dma_handle) +{ + int order = get_order(size); + unsigned long flags; + int pageno; + void *ret; + + spin_lock_irqsave(&mem->spinlock, flags); + + if (unlikely(size > (mem->size << PAGE_SHIFT))) + goto err; + + pageno = bitmap_find_free_region(mem->bitmap, mem->size, order); + if (unlikely(pageno < 0)) + goto err; + + /* + * Memory was found in the coherent area. + */ + *dma_handle = mem->device_base + (pageno << PAGE_SHIFT); + ret = mem->virt_base + (pageno << PAGE_SHIFT); + spin_unlock_irqrestore(&mem->spinlock, flags); + memset(ret, 0, size); + return ret; +err: + spin_unlock_irqrestore(&mem->spinlock, flags); + return NULL; +} + +/** + * dma_alloc_from_dev_coherent() - allocate memory from device coherent pool + * @dev: device from which we allocate memory + * @size: size of requested memory area + * @dma_handle: This will be filled with the correct dma handle + * @ret: This pointer will be filled with the virtual address + * to allocated area. + * + * This function should be only called from per-arch dma_alloc_coherent() + * to support allocation from per-device coherent memory pools. + * + * Returns 0 if dma_alloc_coherent should continue with allocating from + * generic memory areas, or !0 if dma_alloc_coherent should return @ret. + */ +int dma_alloc_from_dev_coherent(struct device *dev, ssize_t size, + dma_addr_t *dma_handle, void **ret) +{ + struct dma_coherent_mem *mem = dev_get_coherent_memory(dev); + + if (!mem) + return 0; + + *ret = __dma_alloc_from_coherent(mem, size, dma_handle); + if (*ret) + return 1; + + /* + * In the case where the allocation can not be satisfied from the + * per-device area, try to fall back to generic memory if the + * constraints allow it. 
+ */ + return mem->flags & DMA_MEMORY_EXCLUSIVE; +} +EXPORT_SYMBOL(dma_alloc_from_dev_coherent); + +void *dma_alloc_from_global_coherent(ssize_t size, dma_addr_t *dma_handle) +{ + if (!dma_coherent_default_memory) + return NULL; + + return __dma_alloc_from_coherent(dma_coherent_default_memory, size, + dma_handle); +} + +static int __dma_release_from_coherent(struct dma_coherent_mem *mem, + int order, void *vaddr) +{ + if (mem && vaddr >= mem->virt_base && vaddr < + (mem->virt_base + (mem->size << PAGE_SHIFT))) { + int page = (vaddr - mem->virt_base) >> PAGE_SHIFT; + unsigned long flags; + + spin_lock_irqsave(&mem->spinlock, flags); + bitmap_release_region(mem->bitmap, page, order); + spin_unlock_irqrestore(&mem->spinlock, flags); + return 1; + } + return 0; +} + +/** + * dma_release_from_dev_coherent() - free memory to device coherent memory pool + * @dev: device from which the memory was allocated + * @order: the order of pages allocated + * @vaddr: virtual address of allocated pages + * + * This checks whether the memory was allocated from the per-device + * coherent memory pool and if so, releases that memory. + * + * Returns 1 if we correctly released the memory, or 0 if the caller should + * proceed with releasing memory from generic pools. 
+ */ +int dma_release_from_dev_coherent(struct device *dev, int order, void *vaddr) +{ + struct dma_coherent_mem *mem = dev_get_coherent_memory(dev); + + return __dma_release_from_coherent(mem, order, vaddr); +} +EXPORT_SYMBOL(dma_release_from_dev_coherent); + +int dma_release_from_global_coherent(int order, void *vaddr) +{ + if (!dma_coherent_default_memory) + return 0; + + return __dma_release_from_coherent(dma_coherent_default_memory, order, + vaddr); +} + +static int __dma_mmap_from_coherent(struct dma_coherent_mem *mem, + struct vm_area_struct *vma, void *vaddr, size_t size, int *ret) +{ + if (mem && vaddr >= mem->virt_base && vaddr + size <= + (mem->virt_base + (mem->size << PAGE_SHIFT))) { + unsigned long off = vma->vm_pgoff; + int start = (vaddr - mem->virt_base) >> PAGE_SHIFT; + int user_count = vma_pages(vma); + int count = PAGE_ALIGN(size) >> PAGE_SHIFT; + + *ret = -ENXIO; + if (off < count && user_count <= count - off) { + unsigned long pfn = mem->pfn_base + start + off; + *ret = remap_pfn_range(vma, vma->vm_start, pfn, + user_count << PAGE_SHIFT, + vma->vm_page_prot); + } + return 1; + } + return 0; +} + +/** + * dma_mmap_from_dev_coherent() - mmap memory from the device coherent pool + * @dev: device from which the memory was allocated + * @vma: vm_area for the userspace memory + * @vaddr: cpu address returned by dma_alloc_from_dev_coherent + * @size: size of the memory buffer allocated + * @ret: result from remap_pfn_range() + * + * This checks whether the memory was allocated from the per-device + * coherent memory pool and if so, maps that memory to the provided vma. + * + * Returns 1 if @vaddr belongs to the device coherent pool and the caller + * should return @ret, or 0 if they should proceed with mapping memory from + * generic areas. 
+ */ +int dma_mmap_from_dev_coherent(struct device *dev, struct vm_area_struct *vma, + void *vaddr, size_t size, int *ret) +{ + struct dma_coherent_mem *mem = dev_get_coherent_memory(dev); + + return __dma_mmap_from_coherent(mem, vma, vaddr, size, ret); +} +EXPORT_SYMBOL(dma_mmap_from_dev_coherent); + +int dma_mmap_from_global_coherent(struct vm_area_struct *vma, void *vaddr, + size_t size, int *ret) +{ + if (!dma_coherent_default_memory) + return 0; + + return __dma_mmap_from_coherent(dma_coherent_default_memory, vma, + vaddr, size, ret); +} + +/* + * Support for reserved memory regions defined in device tree + */ +#ifdef CONFIG_OF_RESERVED_MEM +#include +#include +#include + +static struct reserved_mem *dma_reserved_default_memory __initdata; + +static int rmem_dma_device_init(struct reserved_mem *rmem, struct device *dev) +{ + struct dma_coherent_mem *mem = rmem->priv; + int ret; + + if (!mem) { + ret = dma_init_coherent_memory(rmem->base, rmem->base, + rmem->size, + DMA_MEMORY_EXCLUSIVE, &mem); + if (ret) { + pr_err("Reserved memory: failed to init DMA memory pool at %pa, size %ld MiB\n", + &rmem->base, (unsigned long)rmem->size / SZ_1M); + return ret; + } + } + mem->use_dev_dma_pfn_offset = true; + rmem->priv = mem; + dma_assign_coherent_memory(dev, mem); + return 0; +} + +static void rmem_dma_device_release(struct reserved_mem *rmem, + struct device *dev) +{ + if (dev) + dev->dma_mem = NULL; +} + +static const struct reserved_mem_ops rmem_dma_ops = { + .device_init = rmem_dma_device_init, + .device_release = rmem_dma_device_release, +}; + +static int __init rmem_dma_setup(struct reserved_mem *rmem) +{ + unsigned long node = rmem->fdt_node; + + if (of_get_flat_dt_prop(node, "reusable", NULL)) + return -EINVAL; + +#ifdef CONFIG_ARM + if (!of_get_flat_dt_prop(node, "no-map", NULL)) { + pr_err("Reserved memory: regions without no-map are not yet supported\n"); + return -EINVAL; + } + + if (of_get_flat_dt_prop(node, "linux,dma-default", NULL)) { + 
WARN(dma_reserved_default_memory, + "Reserved memory: region for default DMA coherent area is redefined\n"); + dma_reserved_default_memory = rmem; + } +#endif + + rmem->ops = &rmem_dma_ops; + pr_info("Reserved memory: created DMA memory pool at %pa, size %ld MiB\n", + &rmem->base, (unsigned long)rmem->size / SZ_1M); + return 0; +} + +static int __init dma_init_reserved_memory(void) +{ + const struct reserved_mem_ops *ops; + int ret; + + if (!dma_reserved_default_memory) + return -ENOMEM; + + ops = dma_reserved_default_memory->ops; + + /* + * We rely on rmem_dma_device_init() does not propagate error of + * dma_assign_coherent_memory() for "NULL" device. + */ + ret = ops->device_init(dma_reserved_default_memory, NULL); + + if (!ret) { + dma_coherent_default_memory = dma_reserved_default_memory->priv; + pr_info("DMA: default coherent area is set\n"); + } + + return ret; +} + +core_initcall(dma_init_reserved_memory); + +RESERVEDMEM_OF_DECLARE(dma, "shared-dma-pool", rmem_dma_setup); +#endif diff --git a/kernel/dma/contiguous.c b/kernel/dma/contiguous.c new file mode 100644 index 000000000000..d987dcd1bd56 --- /dev/null +++ b/kernel/dma/contiguous.c @@ -0,0 +1,278 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Contiguous Memory Allocator for DMA mapping framework + * Copyright (c) 2010-2011 by Samsung Electronics. + * Written by: + * Marek Szyprowski + * Michal Nazarewicz + */ + +#define pr_fmt(fmt) "cma: " fmt + +#ifdef CONFIG_CMA_DEBUG +#ifndef DEBUG +# define DEBUG +#endif +#endif + +#include +#include + +#include +#include +#include +#include +#include + +#ifdef CONFIG_CMA_SIZE_MBYTES +#define CMA_SIZE_MBYTES CONFIG_CMA_SIZE_MBYTES +#else +#define CMA_SIZE_MBYTES 0 +#endif + +struct cma *dma_contiguous_default_area; + +/* + * Default global CMA area size can be defined in kernel's .config. + * This is useful mainly for distro maintainers to create a kernel + * that works correctly for most supported systems. 
+ * The size can be set in bytes or as a percentage of the total memory + * in the system. + * + * Users, who want to set the size of global CMA area for their system + * should use cma= kernel parameter. + */ +static const phys_addr_t size_bytes = (phys_addr_t)CMA_SIZE_MBYTES * SZ_1M; +static phys_addr_t size_cmdline = -1; +static phys_addr_t base_cmdline; +static phys_addr_t limit_cmdline; + +static int __init early_cma(char *p) +{ + pr_debug("%s(%s)\n", __func__, p); + size_cmdline = memparse(p, &p); + if (*p != '@') + return 0; + base_cmdline = memparse(p + 1, &p); + if (*p != '-') { + limit_cmdline = base_cmdline + size_cmdline; + return 0; + } + limit_cmdline = memparse(p + 1, &p); + + return 0; +} +early_param("cma", early_cma); + +#ifdef CONFIG_CMA_SIZE_PERCENTAGE + +static phys_addr_t __init __maybe_unused cma_early_percent_memory(void) +{ + struct memblock_region *reg; + unsigned long total_pages = 0; + + /* + * We cannot use memblock_phys_mem_size() here, because + * memblock_analyze() has not been called yet. + */ + for_each_memblock(memory, reg) + total_pages += memblock_region_memory_end_pfn(reg) - + memblock_region_memory_base_pfn(reg); + + return (total_pages * CONFIG_CMA_SIZE_PERCENTAGE / 100) << PAGE_SHIFT; +} + +#else + +static inline __maybe_unused phys_addr_t cma_early_percent_memory(void) +{ + return 0; +} + +#endif + +/** + * dma_contiguous_reserve() - reserve area(s) for contiguous memory handling + * @limit: End address of the reserved memory (optional, 0 for any). + * + * This function reserves memory from early allocator. It should be + * called by arch specific code once the early allocator (memblock or bootmem) + * has been activated and all other subsystems have already allocated/reserved + * memory. 
+ */ +void __init dma_contiguous_reserve(phys_addr_t limit) +{ + phys_addr_t selected_size = 0; + phys_addr_t selected_base = 0; + phys_addr_t selected_limit = limit; + bool fixed = false; + + pr_debug("%s(limit %08lx)\n", __func__, (unsigned long)limit); + + if (size_cmdline != -1) { + selected_size = size_cmdline; + selected_base = base_cmdline; + selected_limit = min_not_zero(limit_cmdline, limit); + if (base_cmdline + size_cmdline == limit_cmdline) + fixed = true; + } else { +#ifdef CONFIG_CMA_SIZE_SEL_MBYTES + selected_size = size_bytes; +#elif defined(CONFIG_CMA_SIZE_SEL_PERCENTAGE) + selected_size = cma_early_percent_memory(); +#elif defined(CONFIG_CMA_SIZE_SEL_MIN) + selected_size = min(size_bytes, cma_early_percent_memory()); +#elif defined(CONFIG_CMA_SIZE_SEL_MAX) + selected_size = max(size_bytes, cma_early_percent_memory()); +#endif + } + + if (selected_size && !dma_contiguous_default_area) { + pr_debug("%s: reserving %ld MiB for global area\n", __func__, + (unsigned long)selected_size / SZ_1M); + + dma_contiguous_reserve_area(selected_size, selected_base, + selected_limit, + &dma_contiguous_default_area, + fixed); + } +} + +/** + * dma_contiguous_reserve_area() - reserve custom contiguous area + * @size: Size of the reserved area (in bytes), + * @base: Base address of the reserved area optional, use 0 for any + * @limit: End address of the reserved memory (optional, 0 for any). + * @res_cma: Pointer to store the created cma region. + * @fixed: hint about where to place the reserved area + * + * This function reserves memory from early allocator. It should be + * called by arch specific code once the early allocator (memblock or bootmem) + * has been activated and all other subsystems have already allocated/reserved + * memory. This function allows to create custom reserved areas for specific + * devices. + * + * If @fixed is true, reserve contiguous area at exactly @base. If false, + * reserve in range from @base to @limit. 
+ */ +int __init dma_contiguous_reserve_area(phys_addr_t size, phys_addr_t base, + phys_addr_t limit, struct cma **res_cma, + bool fixed) +{ + int ret; + + ret = cma_declare_contiguous(base, size, limit, 0, 0, fixed, + "reserved", res_cma); + if (ret) + return ret; + + /* Architecture specific contiguous memory fixup. */ + dma_contiguous_early_fixup(cma_get_base(*res_cma), + cma_get_size(*res_cma)); + + return 0; +} + +/** + * dma_alloc_from_contiguous() - allocate pages from contiguous area + * @dev: Pointer to device for which the allocation is performed. + * @count: Requested number of pages. + * @align: Requested alignment of pages (in PAGE_SIZE order). + * @gfp_mask: GFP flags to use for this allocation. + * + * This function allocates memory buffer for specified device. It uses + * device specific contiguous memory area if available or the default + * global one. Requires architecture specific dev_get_cma_area() helper + * function. + */ +struct page *dma_alloc_from_contiguous(struct device *dev, size_t count, + unsigned int align, gfp_t gfp_mask) +{ + if (align > CONFIG_CMA_ALIGNMENT) + align = CONFIG_CMA_ALIGNMENT; + + return cma_alloc(dev_get_cma_area(dev), count, align, gfp_mask); +} + +/** + * dma_release_from_contiguous() - release allocated pages + * @dev: Pointer to device for which the pages were allocated. + * @pages: Allocated pages. + * @count: Number of allocated pages. + * + * This function releases memory allocated by dma_alloc_from_contiguous(). + * It returns false when provided pages do not belong to contiguous area and + * true otherwise. 
+ */ +bool dma_release_from_contiguous(struct device *dev, struct page *pages, + int count) +{ + return cma_release(dev_get_cma_area(dev), pages, count); +} + +/* + * Support for reserved memory regions defined in device tree + */ +#ifdef CONFIG_OF_RESERVED_MEM +#include +#include +#include + +#undef pr_fmt +#define pr_fmt(fmt) fmt + +static int rmem_cma_device_init(struct reserved_mem *rmem, struct device *dev) +{ + dev_set_cma_area(dev, rmem->priv); + return 0; +} + +static void rmem_cma_device_release(struct reserved_mem *rmem, + struct device *dev) +{ + dev_set_cma_area(dev, NULL); +} + +static const struct reserved_mem_ops rmem_cma_ops = { + .device_init = rmem_cma_device_init, + .device_release = rmem_cma_device_release, +}; + +static int __init rmem_cma_setup(struct reserved_mem *rmem) +{ + phys_addr_t align = PAGE_SIZE << max(MAX_ORDER - 1, pageblock_order); + phys_addr_t mask = align - 1; + unsigned long node = rmem->fdt_node; + struct cma *cma; + int err; + + if (!of_get_flat_dt_prop(node, "reusable", NULL) || + of_get_flat_dt_prop(node, "no-map", NULL)) + return -EINVAL; + + if ((rmem->base & mask) || (rmem->size & mask)) { + pr_err("Reserved memory: incorrect alignment of CMA region\n"); + return -EINVAL; + } + + err = cma_init_reserved_mem(rmem->base, rmem->size, 0, rmem->name, &cma); + if (err) { + pr_err("Reserved memory: unable to setup CMA region\n"); + return err; + } + /* Architecture specific contiguous memory fixup. 
*/ + dma_contiguous_early_fixup(rmem->base, rmem->size); + + if (of_get_flat_dt_prop(node, "linux,cma-default", NULL)) + dma_contiguous_set_default(cma); + + rmem->ops = &rmem_cma_ops; + rmem->priv = cma; + + pr_info("Reserved memory: created CMA memory pool at %pa, size %ld MiB\n", + &rmem->base, (unsigned long)rmem->size / SZ_1M); + + return 0; +} +RESERVEDMEM_OF_DECLARE(cma, "shared-dma-pool", rmem_cma_setup); +#endif diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c new file mode 100644 index 000000000000..c007d25bee09 --- /dev/null +++ b/kernel/dma/debug.c @@ -0,0 +1,1773 @@ +/* + * Copyright (C) 2008 Advanced Micro Devices, Inc. + * + * Author: Joerg Roedel + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#define HASH_SIZE 1024ULL +#define HASH_FN_SHIFT 13 +#define HASH_FN_MASK (HASH_SIZE - 1) + +/* allow architectures to override this if absolutely required */ +#ifndef PREALLOC_DMA_DEBUG_ENTRIES +#define PREALLOC_DMA_DEBUG_ENTRIES (1 << 16) +#endif + +enum { + dma_debug_single, + dma_debug_page, + dma_debug_sg, + dma_debug_coherent, + dma_debug_resource, +}; + +enum map_err_types { + MAP_ERR_CHECK_NOT_APPLICABLE, + MAP_ERR_NOT_CHECKED, + MAP_ERR_CHECKED, +}; + +#define DMA_DEBUG_STACKTRACE_ENTRIES 5 + +/** + * struct dma_debug_entry - track a dma_map* or dma_alloc_coherent mapping + * @list: node on pre-allocated free_entries list + * @dev: 'dev' argument to dma_map_{page|single|sg} or dma_alloc_coherent + * @type: single, page, sg, coherent + * @pfn: page frame of the start address + * @offset: offset of mapping relative to pfn + * @size: length of the mapping + * @direction: enum dma_data_direction + * @sg_call_ents: 'nents' from dma_map_sg + * @sg_mapped_ents: 'mapped_ents' from dma_map_sg + * @map_err_type: track whether dma_mapping_error() was checked + * @stacktrace: support backtraces when a violation is detected + */ +struct dma_debug_entry { + struct list_head list; + struct device *dev; + int type; + unsigned long pfn; + size_t offset; + u64 dev_addr; + u64 size; + int direction; + int sg_call_ents; + int sg_mapped_ents; + enum map_err_types map_err_type; +#ifdef CONFIG_STACKTRACE + struct stack_trace stacktrace; + unsigned long st_entries[DMA_DEBUG_STACKTRACE_ENTRIES]; +#endif +}; + +typedef bool (*match_fn)(struct dma_debug_entry *, struct dma_debug_entry 
*); + +struct hash_bucket { + struct list_head list; + spinlock_t lock; +} ____cacheline_aligned_in_smp; + +/* Hash list to save the allocated dma addresses */ +static struct hash_bucket dma_entry_hash[HASH_SIZE]; +/* List of pre-allocated dma_debug_entry's */ +static LIST_HEAD(free_entries); +/* Lock for the list above */ +static DEFINE_SPINLOCK(free_entries_lock); + +/* Global disable flag - will be set in case of an error */ +static bool global_disable __read_mostly; + +/* Early initialization disable flag, set at the end of dma_debug_init */ +static bool dma_debug_initialized __read_mostly; + +static inline bool dma_debug_disabled(void) +{ + return global_disable || !dma_debug_initialized; +} + +/* Global error count */ +static u32 error_count; + +/* Global error show enable*/ +static u32 show_all_errors __read_mostly; +/* Number of errors to show */ +static u32 show_num_errors = 1; + +static u32 num_free_entries; +static u32 min_free_entries; +static u32 nr_total_entries; + +/* number of preallocated entries requested by kernel cmdline */ +static u32 nr_prealloc_entries = PREALLOC_DMA_DEBUG_ENTRIES; + +/* debugfs dentry's for the stuff above */ +static struct dentry *dma_debug_dent __read_mostly; +static struct dentry *global_disable_dent __read_mostly; +static struct dentry *error_count_dent __read_mostly; +static struct dentry *show_all_errors_dent __read_mostly; +static struct dentry *show_num_errors_dent __read_mostly; +static struct dentry *num_free_entries_dent __read_mostly; +static struct dentry *min_free_entries_dent __read_mostly; +static struct dentry *filter_dent __read_mostly; + +/* per-driver filter related state */ + +#define NAME_MAX_LEN 64 + +static char current_driver_name[NAME_MAX_LEN] __read_mostly; +static struct device_driver *current_driver __read_mostly; + +static DEFINE_RWLOCK(driver_name_lock); + +static const char *const maperr2str[] = { + [MAP_ERR_CHECK_NOT_APPLICABLE] = "dma map error check not applicable", + [MAP_ERR_NOT_CHECKED] 
= "dma map error not checked", + [MAP_ERR_CHECKED] = "dma map error checked", +}; + +static const char *type2name[5] = { "single", "page", + "scather-gather", "coherent", + "resource" }; + +static const char *dir2name[4] = { "DMA_BIDIRECTIONAL", "DMA_TO_DEVICE", + "DMA_FROM_DEVICE", "DMA_NONE" }; + +/* + * The access to some variables in this macro is racy. We can't use atomic_t + * here because all these variables are exported to debugfs. Some of them even + * writeable. This is also the reason why a lock won't help much. But anyway, + * the races are no big deal. Here is why: + * + * error_count: the addition is racy, but the worst thing that can happen is + * that we don't count some errors + * show_num_errors: the subtraction is racy. Also no big deal because in + * worst case this will result in one warning more in the + * system log than the user configured. This variable is + * writeable via debugfs. + */ +static inline void dump_entry_trace(struct dma_debug_entry *entry) +{ +#ifdef CONFIG_STACKTRACE + if (entry) { + pr_warning("Mapped at:\n"); + print_stack_trace(&entry->stacktrace, 0); + } +#endif +} + +static bool driver_filter(struct device *dev) +{ + struct device_driver *drv; + unsigned long flags; + bool ret; + + /* driver filter off */ + if (likely(!current_driver_name[0])) + return true; + + /* driver filter on and initialized */ + if (current_driver && dev && dev->driver == current_driver) + return true; + + /* driver filter on, but we can't filter on a NULL device... 
*/ + if (!dev) + return false; + + if (current_driver || !current_driver_name[0]) + return false; + + /* driver filter on but not yet initialized */ + drv = dev->driver; + if (!drv) + return false; + + /* lock to protect against change of current_driver_name */ + read_lock_irqsave(&driver_name_lock, flags); + + ret = false; + if (drv->name && + strncmp(current_driver_name, drv->name, NAME_MAX_LEN - 1) == 0) { + current_driver = drv; + ret = true; + } + + read_unlock_irqrestore(&driver_name_lock, flags); + + return ret; +} + +#define err_printk(dev, entry, format, arg...) do { \ + error_count += 1; \ + if (driver_filter(dev) && \ + (show_all_errors || show_num_errors > 0)) { \ + WARN(1, "%s %s: " format, \ + dev ? dev_driver_string(dev) : "NULL", \ + dev ? dev_name(dev) : "NULL", ## arg); \ + dump_entry_trace(entry); \ + } \ + if (!show_all_errors && show_num_errors > 0) \ + show_num_errors -= 1; \ + } while (0); + +/* + * Hash related functions + * + * Every DMA-API request is saved into a struct dma_debug_entry. To + * have quick access to these structs they are stored into a hash. + */ +static int hash_fn(struct dma_debug_entry *entry) +{ + /* + * Hash function is based on the dma address. + * We use bits 20-27 here as the index into the hash + */ + return (entry->dev_addr >> HASH_FN_SHIFT) & HASH_FN_MASK; +} + +/* + * Request exclusive access to a hash bucket for a given dma_debug_entry. 
+ */ +static struct hash_bucket *get_hash_bucket(struct dma_debug_entry *entry, + unsigned long *flags) + __acquires(&dma_entry_hash[idx].lock) +{ + int idx = hash_fn(entry); + unsigned long __flags; + + spin_lock_irqsave(&dma_entry_hash[idx].lock, __flags); + *flags = __flags; + return &dma_entry_hash[idx]; +} + +/* + * Give up exclusive access to the hash bucket + */ +static void put_hash_bucket(struct hash_bucket *bucket, + unsigned long *flags) + __releases(&bucket->lock) +{ + unsigned long __flags = *flags; + + spin_unlock_irqrestore(&bucket->lock, __flags); +} + +static bool exact_match(struct dma_debug_entry *a, struct dma_debug_entry *b) +{ + return ((a->dev_addr == b->dev_addr) && + (a->dev == b->dev)) ? true : false; +} + +static bool containing_match(struct dma_debug_entry *a, + struct dma_debug_entry *b) +{ + if (a->dev != b->dev) + return false; + + if ((b->dev_addr <= a->dev_addr) && + ((b->dev_addr + b->size) >= (a->dev_addr + a->size))) + return true; + + return false; +} + +/* + * Search a given entry in the hash bucket list + */ +static struct dma_debug_entry *__hash_bucket_find(struct hash_bucket *bucket, + struct dma_debug_entry *ref, + match_fn match) +{ + struct dma_debug_entry *entry, *ret = NULL; + int matches = 0, match_lvl, last_lvl = -1; + + list_for_each_entry(entry, &bucket->list, list) { + if (!match(ref, entry)) + continue; + + /* + * Some drivers map the same physical address multiple + * times. Without a hardware IOMMU this results in the + * same device addresses being put into the dma-debug + * hash multiple times too. This can result in false + * positives being reported. Therefore we implement a + * best-fit algorithm here which returns the entry from + * the hash which fits best to the reference value + * instead of the first-fit. + */ + matches += 1; + match_lvl = 0; + entry->size == ref->size ? ++match_lvl : 0; + entry->type == ref->type ? ++match_lvl : 0; + entry->direction == ref->direction ? 
++match_lvl : 0; + entry->sg_call_ents == ref->sg_call_ents ? ++match_lvl : 0; + + if (match_lvl == 4) { + /* perfect-fit - return the result */ + return entry; + } else if (match_lvl > last_lvl) { + /* + * We found an entry that fits better then the + * previous one or it is the 1st match. + */ + last_lvl = match_lvl; + ret = entry; + } + } + + /* + * If we have multiple matches but no perfect-fit, just return + * NULL. + */ + ret = (matches == 1) ? ret : NULL; + + return ret; +} + +static struct dma_debug_entry *bucket_find_exact(struct hash_bucket *bucket, + struct dma_debug_entry *ref) +{ + return __hash_bucket_find(bucket, ref, exact_match); +} + +static struct dma_debug_entry *bucket_find_contain(struct hash_bucket **bucket, + struct dma_debug_entry *ref, + unsigned long *flags) +{ + + unsigned int max_range = dma_get_max_seg_size(ref->dev); + struct dma_debug_entry *entry, index = *ref; + unsigned int range = 0; + + while (range <= max_range) { + entry = __hash_bucket_find(*bucket, ref, containing_match); + + if (entry) + return entry; + + /* + * Nothing found, go back a hash bucket + */ + put_hash_bucket(*bucket, flags); + range += (1 << HASH_FN_SHIFT); + index.dev_addr -= (1 << HASH_FN_SHIFT); + *bucket = get_hash_bucket(&index, flags); + } + + return NULL; +} + +/* + * Add an entry to a hash bucket + */ +static void hash_bucket_add(struct hash_bucket *bucket, + struct dma_debug_entry *entry) +{ + list_add_tail(&entry->list, &bucket->list); +} + +/* + * Remove entry from a hash bucket list + */ +static void hash_bucket_del(struct dma_debug_entry *entry) +{ + list_del(&entry->list); +} + +static unsigned long long phys_addr(struct dma_debug_entry *entry) +{ + if (entry->type == dma_debug_resource) + return __pfn_to_phys(entry->pfn) + entry->offset; + + return page_to_phys(pfn_to_page(entry->pfn)) + entry->offset; +} + +/* + * Dump mapping entries for debugging purposes + */ +void debug_dma_dump_mappings(struct device *dev) +{ + int idx; + + for (idx = 0; 
idx < HASH_SIZE; idx++) { + struct hash_bucket *bucket = &dma_entry_hash[idx]; + struct dma_debug_entry *entry; + unsigned long flags; + + spin_lock_irqsave(&bucket->lock, flags); + + list_for_each_entry(entry, &bucket->list, list) { + if (!dev || dev == entry->dev) { + dev_info(entry->dev, + "%s idx %d P=%Lx N=%lx D=%Lx L=%Lx %s %s\n", + type2name[entry->type], idx, + phys_addr(entry), entry->pfn, + entry->dev_addr, entry->size, + dir2name[entry->direction], + maperr2str[entry->map_err_type]); + } + } + + spin_unlock_irqrestore(&bucket->lock, flags); + } +} + +/* + * For each mapping (initial cacheline in the case of + * dma_alloc_coherent/dma_map_page, initial cacheline in each page of a + * scatterlist, or the cacheline specified in dma_map_single) insert + * into this tree using the cacheline as the key. At + * dma_unmap_{single|sg|page} or dma_free_coherent delete the entry. If + * the entry already exists at insertion time add a tag as a reference + * count for the overlapping mappings. For now, the overlap tracking + * just ensures that 'unmaps' balance 'maps' before marking the + * cacheline idle, but we should also be flagging overlaps as an API + * violation. + * + * Memory usage is mostly constrained by the maximum number of available + * dma-debug entries in that we need a free dma_debug_entry before + * inserting into the tree. In the case of dma_map_page and + * dma_alloc_coherent there is only one dma_debug_entry and one + * dma_active_cacheline entry to track per event. dma_map_sg(), on the + * other hand, consumes a single dma_debug_entry, but inserts 'nents' + * entries into the tree. + * + * At any time debug_dma_assert_idle() can be called to trigger a + * warning if any cachelines in the given page are in the active set. 
+ */ +static RADIX_TREE(dma_active_cacheline, GFP_NOWAIT); +static DEFINE_SPINLOCK(radix_lock); +#define ACTIVE_CACHELINE_MAX_OVERLAP ((1 << RADIX_TREE_MAX_TAGS) - 1) +#define CACHELINE_PER_PAGE_SHIFT (PAGE_SHIFT - L1_CACHE_SHIFT) +#define CACHELINES_PER_PAGE (1 << CACHELINE_PER_PAGE_SHIFT) + +static phys_addr_t to_cacheline_number(struct dma_debug_entry *entry) +{ + return (entry->pfn << CACHELINE_PER_PAGE_SHIFT) + + (entry->offset >> L1_CACHE_SHIFT); +} + +static int active_cacheline_read_overlap(phys_addr_t cln) +{ + int overlap = 0, i; + + for (i = RADIX_TREE_MAX_TAGS - 1; i >= 0; i--) + if (radix_tree_tag_get(&dma_active_cacheline, cln, i)) + overlap |= 1 << i; + return overlap; +} + +static int active_cacheline_set_overlap(phys_addr_t cln, int overlap) +{ + int i; + + if (overlap > ACTIVE_CACHELINE_MAX_OVERLAP || overlap < 0) + return overlap; + + for (i = RADIX_TREE_MAX_TAGS - 1; i >= 0; i--) + if (overlap & 1 << i) + radix_tree_tag_set(&dma_active_cacheline, cln, i); + else + radix_tree_tag_clear(&dma_active_cacheline, cln, i); + + return overlap; +} + +static void active_cacheline_inc_overlap(phys_addr_t cln) +{ + int overlap = active_cacheline_read_overlap(cln); + + overlap = active_cacheline_set_overlap(cln, ++overlap); + + /* If we overflowed the overlap counter then we're potentially + * leaking dma-mappings. Otherwise, if maps and unmaps are + * balanced then this overflow may cause false negatives in + * debug_dma_assert_idle() as the cacheline may be marked idle + * prematurely. 
+ */ + WARN_ONCE(overlap > ACTIVE_CACHELINE_MAX_OVERLAP, + "DMA-API: exceeded %d overlapping mappings of cacheline %pa\n", + ACTIVE_CACHELINE_MAX_OVERLAP, &cln); +} + +static int active_cacheline_dec_overlap(phys_addr_t cln) +{ + int overlap = active_cacheline_read_overlap(cln); + + return active_cacheline_set_overlap(cln, --overlap); +} + +static int active_cacheline_insert(struct dma_debug_entry *entry) +{ + phys_addr_t cln = to_cacheline_number(entry); + unsigned long flags; + int rc; + + /* If the device is not writing memory then we don't have any + * concerns about the cpu consuming stale data. This mitigates + * legitimate usages of overlapping mappings. + */ + if (entry->direction == DMA_TO_DEVICE) + return 0; + + spin_lock_irqsave(&radix_lock, flags); + rc = radix_tree_insert(&dma_active_cacheline, cln, entry); + if (rc == -EEXIST) + active_cacheline_inc_overlap(cln); + spin_unlock_irqrestore(&radix_lock, flags); + + return rc; +} + +static void active_cacheline_remove(struct dma_debug_entry *entry) +{ + phys_addr_t cln = to_cacheline_number(entry); + unsigned long flags; + + /* ...mirror the insert case */ + if (entry->direction == DMA_TO_DEVICE) + return; + + spin_lock_irqsave(&radix_lock, flags); + /* since we are counting overlaps the final put of the + * cacheline will occur when the overlap count is 0. + * active_cacheline_dec_overlap() returns -1 in that case + */ + if (active_cacheline_dec_overlap(cln) < 0) + radix_tree_delete(&dma_active_cacheline, cln); + spin_unlock_irqrestore(&radix_lock, flags); +} + +/** + * debug_dma_assert_idle() - assert that a page is not undergoing dma + * @page: page to lookup in the dma_active_cacheline tree + * + * Place a call to this routine in cases where the cpu touching the page + * before the dma completes (page is dma_unmapped) will lead to data + * corruption. 
+ */ +void debug_dma_assert_idle(struct page *page) +{ + static struct dma_debug_entry *ents[CACHELINES_PER_PAGE]; + struct dma_debug_entry *entry = NULL; + void **results = (void **) &ents; + unsigned int nents, i; + unsigned long flags; + phys_addr_t cln; + + if (dma_debug_disabled()) + return; + + if (!page) + return; + + cln = (phys_addr_t) page_to_pfn(page) << CACHELINE_PER_PAGE_SHIFT; + spin_lock_irqsave(&radix_lock, flags); + nents = radix_tree_gang_lookup(&dma_active_cacheline, results, cln, + CACHELINES_PER_PAGE); + for (i = 0; i < nents; i++) { + phys_addr_t ent_cln = to_cacheline_number(ents[i]); + + if (ent_cln == cln) { + entry = ents[i]; + break; + } else if (ent_cln >= cln + CACHELINES_PER_PAGE) + break; + } + spin_unlock_irqrestore(&radix_lock, flags); + + if (!entry) + return; + + cln = to_cacheline_number(entry); + err_printk(entry->dev, entry, + "DMA-API: cpu touching an active dma mapped cacheline [cln=%pa]\n", + &cln); +} + +/* + * Wrapper function for adding an entry to the hash. + * This function takes care of locking itself. 
+ */ +static void add_dma_entry(struct dma_debug_entry *entry) +{ + struct hash_bucket *bucket; + unsigned long flags; + int rc; + + bucket = get_hash_bucket(entry, &flags); + hash_bucket_add(bucket, entry); + put_hash_bucket(bucket, &flags); + + rc = active_cacheline_insert(entry); + if (rc == -ENOMEM) { + pr_err("DMA-API: cacheline tracking ENOMEM, dma-debug disabled\n"); + global_disable = true; + } + + /* TODO: report -EEXIST errors here as overlapping mappings are + * not supported by the DMA API + */ +} + +static struct dma_debug_entry *__dma_entry_alloc(void) +{ + struct dma_debug_entry *entry; + + entry = list_entry(free_entries.next, struct dma_debug_entry, list); + list_del(&entry->list); + memset(entry, 0, sizeof(*entry)); + + num_free_entries -= 1; + if (num_free_entries < min_free_entries) + min_free_entries = num_free_entries; + + return entry; +} + +/* struct dma_entry allocator + * + * The next two functions implement the allocator for + * struct dma_debug_entries. + */ +static struct dma_debug_entry *dma_entry_alloc(void) +{ + struct dma_debug_entry *entry; + unsigned long flags; + + spin_lock_irqsave(&free_entries_lock, flags); + + if (list_empty(&free_entries)) { + global_disable = true; + spin_unlock_irqrestore(&free_entries_lock, flags); + pr_err("DMA-API: debugging out of memory - disabling\n"); + return NULL; + } + + entry = __dma_entry_alloc(); + + spin_unlock_irqrestore(&free_entries_lock, flags); + +#ifdef CONFIG_STACKTRACE + entry->stacktrace.max_entries = DMA_DEBUG_STACKTRACE_ENTRIES; + entry->stacktrace.entries = entry->st_entries; + entry->stacktrace.skip = 2; + save_stack_trace(&entry->stacktrace); +#endif + + return entry; +} + +static void dma_entry_free(struct dma_debug_entry *entry) +{ + unsigned long flags; + + active_cacheline_remove(entry); + + /* + * add to beginning of the list - this way the entries are + * more likely cache hot when they are reallocated. 
+ */ + spin_lock_irqsave(&free_entries_lock, flags); + list_add(&entry->list, &free_entries); + num_free_entries += 1; + spin_unlock_irqrestore(&free_entries_lock, flags); +} + +int dma_debug_resize_entries(u32 num_entries) +{ + int i, delta, ret = 0; + unsigned long flags; + struct dma_debug_entry *entry; + LIST_HEAD(tmp); + + spin_lock_irqsave(&free_entries_lock, flags); + + if (nr_total_entries < num_entries) { + delta = num_entries - nr_total_entries; + + spin_unlock_irqrestore(&free_entries_lock, flags); + + for (i = 0; i < delta; i++) { + entry = kzalloc(sizeof(*entry), GFP_KERNEL); + if (!entry) + break; + + list_add_tail(&entry->list, &tmp); + } + + spin_lock_irqsave(&free_entries_lock, flags); + + list_splice(&tmp, &free_entries); + nr_total_entries += i; + num_free_entries += i; + } else { + delta = nr_total_entries - num_entries; + + for (i = 0; i < delta && !list_empty(&free_entries); i++) { + entry = __dma_entry_alloc(); + kfree(entry); + } + + nr_total_entries -= i; + } + + if (nr_total_entries != num_entries) + ret = 1; + + spin_unlock_irqrestore(&free_entries_lock, flags); + + return ret; +} + +/* + * DMA-API debugging init code + * + * The init code does two things: + * 1. Initialize core data structures + * 2. 
Preallocate a given number of dma_debug_entry structs + */ + +static int prealloc_memory(u32 num_entries) +{ + struct dma_debug_entry *entry, *next_entry; + int i; + + for (i = 0; i < num_entries; ++i) { + entry = kzalloc(sizeof(*entry), GFP_KERNEL); + if (!entry) + goto out_err; + + list_add_tail(&entry->list, &free_entries); + } + + num_free_entries = num_entries; + min_free_entries = num_entries; + + pr_info("DMA-API: preallocated %d debug entries\n", num_entries); + + return 0; + +out_err: + + list_for_each_entry_safe(entry, next_entry, &free_entries, list) { + list_del(&entry->list); + kfree(entry); + } + + return -ENOMEM; +} + +static ssize_t filter_read(struct file *file, char __user *user_buf, + size_t count, loff_t *ppos) +{ + char buf[NAME_MAX_LEN + 1]; + unsigned long flags; + int len; + + if (!current_driver_name[0]) + return 0; + + /* + * We can't copy to userspace directly because current_driver_name can + * only be read under the driver_name_lock with irqs disabled. So + * create a temporary copy first. + */ + read_lock_irqsave(&driver_name_lock, flags); + len = scnprintf(buf, NAME_MAX_LEN + 1, "%s\n", current_driver_name); + read_unlock_irqrestore(&driver_name_lock, flags); + + return simple_read_from_buffer(user_buf, count, ppos, buf, len); +} + +static ssize_t filter_write(struct file *file, const char __user *userbuf, + size_t count, loff_t *ppos) +{ + char buf[NAME_MAX_LEN]; + unsigned long flags; + size_t len; + int i; + + /* + * We can't copy from userspace directly. Access to + * current_driver_name is protected with a write_lock with irqs + * disabled. Since copy_from_user can fault and may sleep we + * need to copy to temporary buffer first + */ + len = min(count, (size_t)(NAME_MAX_LEN - 1)); + if (copy_from_user(buf, userbuf, len)) + return -EFAULT; + + buf[len] = 0; + + write_lock_irqsave(&driver_name_lock, flags); + + /* + * Now handle the string we got from userspace very carefully. 
+ * The rules are: + * - only use the first token we got + * - token delimiter is everything looking like a space + * character (' ', '\n', '\t' ...) + * + */ + if (!isalnum(buf[0])) { + /* + * If the first character userspace gave us is not + * alphanumerical then assume the filter should be + * switched off. + */ + if (current_driver_name[0]) + pr_info("DMA-API: switching off dma-debug driver filter\n"); + current_driver_name[0] = 0; + current_driver = NULL; + goto out_unlock; + } + + /* + * Now parse out the first token and use it as the name for the + * driver to filter for. + */ + for (i = 0; i < NAME_MAX_LEN - 1; ++i) { + current_driver_name[i] = buf[i]; + if (isspace(buf[i]) || buf[i] == ' ' || buf[i] == 0) + break; + } + current_driver_name[i] = 0; + current_driver = NULL; + + pr_info("DMA-API: enable driver filter for driver [%s]\n", + current_driver_name); + +out_unlock: + write_unlock_irqrestore(&driver_name_lock, flags); + + return count; +} + +static const struct file_operations filter_fops = { + .read = filter_read, + .write = filter_write, + .llseek = default_llseek, +}; + +static int dma_debug_fs_init(void) +{ + dma_debug_dent = debugfs_create_dir("dma-api", NULL); + if (!dma_debug_dent) { + pr_err("DMA-API: can not create debugfs directory\n"); + return -ENOMEM; + } + + global_disable_dent = debugfs_create_bool("disabled", 0444, + dma_debug_dent, + &global_disable); + if (!global_disable_dent) + goto out_err; + + error_count_dent = debugfs_create_u32("error_count", 0444, + dma_debug_dent, &error_count); + if (!error_count_dent) + goto out_err; + + show_all_errors_dent = debugfs_create_u32("all_errors", 0644, + dma_debug_dent, + &show_all_errors); + if (!show_all_errors_dent) + goto out_err; + + show_num_errors_dent = debugfs_create_u32("num_errors", 0644, + dma_debug_dent, + &show_num_errors); + if (!show_num_errors_dent) + goto out_err; + + num_free_entries_dent = debugfs_create_u32("num_free_entries", 0444, + dma_debug_dent, + &num_free_entries); 
+ if (!num_free_entries_dent) + goto out_err; + + min_free_entries_dent = debugfs_create_u32("min_free_entries", 0444, + dma_debug_dent, + &min_free_entries); + if (!min_free_entries_dent) + goto out_err; + + filter_dent = debugfs_create_file("driver_filter", 0644, + dma_debug_dent, NULL, &filter_fops); + if (!filter_dent) + goto out_err; + + return 0; + +out_err: + debugfs_remove_recursive(dma_debug_dent); + + return -ENOMEM; +} + +static int device_dma_allocations(struct device *dev, struct dma_debug_entry **out_entry) +{ + struct dma_debug_entry *entry; + unsigned long flags; + int count = 0, i; + + for (i = 0; i < HASH_SIZE; ++i) { + spin_lock_irqsave(&dma_entry_hash[i].lock, flags); + list_for_each_entry(entry, &dma_entry_hash[i].list, list) { + if (entry->dev == dev) { + count += 1; + *out_entry = entry; + } + } + spin_unlock_irqrestore(&dma_entry_hash[i].lock, flags); + } + + return count; +} + +static int dma_debug_device_change(struct notifier_block *nb, unsigned long action, void *data) +{ + struct device *dev = data; + struct dma_debug_entry *uninitialized_var(entry); + int count; + + if (dma_debug_disabled()) + return 0; + + switch (action) { + case BUS_NOTIFY_UNBOUND_DRIVER: + count = device_dma_allocations(dev, &entry); + if (count == 0) + break; + err_printk(dev, entry, "DMA-API: device driver has pending " + "DMA allocations while released from device " + "[count=%d]\n" + "One of leaked entries details: " + "[device address=0x%016llx] [size=%llu bytes] " + "[mapped with %s] [mapped as %s]\n", + count, entry->dev_addr, entry->size, + dir2name[entry->direction], type2name[entry->type]); + break; + default: + break; + } + + return 0; +} + +void dma_debug_add_bus(struct bus_type *bus) +{ + struct notifier_block *nb; + + if (dma_debug_disabled()) + return; + + nb = kzalloc(sizeof(struct notifier_block), GFP_KERNEL); + if (nb == NULL) { + pr_err("dma_debug_add_bus: out of memory\n"); + return; + } + + nb->notifier_call = dma_debug_device_change; + + 
bus_register_notifier(bus, nb); +} + +static int dma_debug_init(void) +{ + int i; + + /* Do not use dma_debug_initialized here, since we really want to be + * called to set dma_debug_initialized + */ + if (global_disable) + return 0; + + for (i = 0; i < HASH_SIZE; ++i) { + INIT_LIST_HEAD(&dma_entry_hash[i].list); + spin_lock_init(&dma_entry_hash[i].lock); + } + + if (dma_debug_fs_init() != 0) { + pr_err("DMA-API: error creating debugfs entries - disabling\n"); + global_disable = true; + + return 0; + } + + if (prealloc_memory(nr_prealloc_entries) != 0) { + pr_err("DMA-API: debugging out of memory error - disabled\n"); + global_disable = true; + + return 0; + } + + nr_total_entries = num_free_entries; + + dma_debug_initialized = true; + + pr_info("DMA-API: debugging enabled by kernel config\n"); + return 0; +} +core_initcall(dma_debug_init); + +static __init int dma_debug_cmdline(char *str) +{ + if (!str) + return -EINVAL; + + if (strncmp(str, "off", 3) == 0) { + pr_info("DMA-API: debugging disabled on kernel command line\n"); + global_disable = true; + } + + return 0; +} + +static __init int dma_debug_entries_cmdline(char *str) +{ + if (!str) + return -EINVAL; + if (!get_option(&str, &nr_prealloc_entries)) + nr_prealloc_entries = PREALLOC_DMA_DEBUG_ENTRIES; + return 0; +} + +__setup("dma_debug=", dma_debug_cmdline); +__setup("dma_debug_entries=", dma_debug_entries_cmdline); + +static void check_unmap(struct dma_debug_entry *ref) +{ + struct dma_debug_entry *entry; + struct hash_bucket *bucket; + unsigned long flags; + + bucket = get_hash_bucket(ref, &flags); + entry = bucket_find_exact(bucket, ref); + + if (!entry) { + /* must drop lock before calling dma_mapping_error */ + put_hash_bucket(bucket, &flags); + + if (dma_mapping_error(ref->dev, ref->dev_addr)) { + err_printk(ref->dev, NULL, + "DMA-API: device driver tries to free an " + "invalid DMA memory address\n"); + } else { + err_printk(ref->dev, NULL, + "DMA-API: device driver tries to free DMA " + "memory it 
has not allocated [device " + "address=0x%016llx] [size=%llu bytes]\n", + ref->dev_addr, ref->size); + } + return; + } + + if (ref->size != entry->size) { + err_printk(ref->dev, entry, "DMA-API: device driver frees " + "DMA memory with different size " + "[device address=0x%016llx] [map size=%llu bytes] " + "[unmap size=%llu bytes]\n", + ref->dev_addr, entry->size, ref->size); + } + + if (ref->type != entry->type) { + err_printk(ref->dev, entry, "DMA-API: device driver frees " + "DMA memory with wrong function " + "[device address=0x%016llx] [size=%llu bytes] " + "[mapped as %s] [unmapped as %s]\n", + ref->dev_addr, ref->size, + type2name[entry->type], type2name[ref->type]); + } else if ((entry->type == dma_debug_coherent) && + (phys_addr(ref) != phys_addr(entry))) { + err_printk(ref->dev, entry, "DMA-API: device driver frees " + "DMA memory with different CPU address " + "[device address=0x%016llx] [size=%llu bytes] " + "[cpu alloc address=0x%016llx] " + "[cpu free address=0x%016llx]", + ref->dev_addr, ref->size, + phys_addr(entry), + phys_addr(ref)); + } + + if (ref->sg_call_ents && ref->type == dma_debug_sg && + ref->sg_call_ents != entry->sg_call_ents) { + err_printk(ref->dev, entry, "DMA-API: device driver frees " + "DMA sg list with different entry count " + "[map count=%d] [unmap count=%d]\n", + entry->sg_call_ents, ref->sg_call_ents); + } + + /* + * This may be no bug in reality - but most implementations of the + * DMA API don't handle this properly, so check for it here + */ + if (ref->direction != entry->direction) { + err_printk(ref->dev, entry, "DMA-API: device driver frees " + "DMA memory with different direction " + "[device address=0x%016llx] [size=%llu bytes] " + "[mapped with %s] [unmapped with %s]\n", + ref->dev_addr, ref->size, + dir2name[entry->direction], + dir2name[ref->direction]); + } + + /* + * Drivers should use dma_mapping_error() to check the returned + * addresses of dma_map_single() and dma_map_page(). 
+ * If not, print this warning message. See Documentation/DMA-API.txt. + */ + if (entry->map_err_type == MAP_ERR_NOT_CHECKED) { + err_printk(ref->dev, entry, + "DMA-API: device driver failed to check map error" + "[device address=0x%016llx] [size=%llu bytes] " + "[mapped as %s]", + ref->dev_addr, ref->size, + type2name[entry->type]); + } + + hash_bucket_del(entry); + dma_entry_free(entry); + + put_hash_bucket(bucket, &flags); +} + +static void check_for_stack(struct device *dev, + struct page *page, size_t offset) +{ + void *addr; + struct vm_struct *stack_vm_area = task_stack_vm_area(current); + + if (!stack_vm_area) { + /* Stack is direct-mapped. */ + if (PageHighMem(page)) + return; + addr = page_address(page) + offset; + if (object_is_on_stack(addr)) + err_printk(dev, NULL, "DMA-API: device driver maps memory from stack [addr=%p]\n", addr); + } else { + /* Stack is vmalloced. */ + int i; + + for (i = 0; i < stack_vm_area->nr_pages; i++) { + if (page != stack_vm_area->pages[i]) + continue; + + addr = (u8 *)current->stack + i * PAGE_SIZE + offset; + err_printk(dev, NULL, "DMA-API: device driver maps memory from stack [probable addr=%p]\n", addr); + break; + } + } +} + +static inline bool overlap(void *addr, unsigned long len, void *start, void *end) +{ + unsigned long a1 = (unsigned long)addr; + unsigned long b1 = a1 + len; + unsigned long a2 = (unsigned long)start; + unsigned long b2 = (unsigned long)end; + + return !(b1 <= a2 || a1 >= b2); +} + +static void check_for_illegal_area(struct device *dev, void *addr, unsigned long len) +{ + if (overlap(addr, len, _stext, _etext) || + overlap(addr, len, __start_rodata, __end_rodata)) + err_printk(dev, NULL, "DMA-API: device driver maps memory from kernel text or rodata [addr=%p] [len=%lu]\n", addr, len); +} + +static void check_sync(struct device *dev, + struct dma_debug_entry *ref, + bool to_cpu) +{ + struct dma_debug_entry *entry; + struct hash_bucket *bucket; + unsigned long flags; + + bucket = get_hash_bucket(ref, 
&flags); + + entry = bucket_find_contain(&bucket, ref, &flags); + + if (!entry) { + err_printk(dev, NULL, "DMA-API: device driver tries " + "to sync DMA memory it has not allocated " + "[device address=0x%016llx] [size=%llu bytes]\n", + (unsigned long long)ref->dev_addr, ref->size); + goto out; + } + + if (ref->size > entry->size) { + err_printk(dev, entry, "DMA-API: device driver syncs" + " DMA memory outside allocated range " + "[device address=0x%016llx] " + "[allocation size=%llu bytes] " + "[sync offset+size=%llu]\n", + entry->dev_addr, entry->size, + ref->size); + } + + if (entry->direction == DMA_BIDIRECTIONAL) + goto out; + + if (ref->direction != entry->direction) { + err_printk(dev, entry, "DMA-API: device driver syncs " + "DMA memory with different direction " + "[device address=0x%016llx] [size=%llu bytes] " + "[mapped with %s] [synced with %s]\n", + (unsigned long long)ref->dev_addr, entry->size, + dir2name[entry->direction], + dir2name[ref->direction]); + } + + if (to_cpu && !(entry->direction == DMA_FROM_DEVICE) && + !(ref->direction == DMA_TO_DEVICE)) + err_printk(dev, entry, "DMA-API: device driver syncs " + "device read-only DMA memory for cpu " + "[device address=0x%016llx] [size=%llu bytes] " + "[mapped with %s] [synced with %s]\n", + (unsigned long long)ref->dev_addr, entry->size, + dir2name[entry->direction], + dir2name[ref->direction]); + + if (!to_cpu && !(entry->direction == DMA_TO_DEVICE) && + !(ref->direction == DMA_FROM_DEVICE)) + err_printk(dev, entry, "DMA-API: device driver syncs " + "device write-only DMA memory to device " + "[device address=0x%016llx] [size=%llu bytes] " + "[mapped with %s] [synced with %s]\n", + (unsigned long long)ref->dev_addr, entry->size, + dir2name[entry->direction], + dir2name[ref->direction]); + + if (ref->sg_call_ents && ref->type == dma_debug_sg && + ref->sg_call_ents != entry->sg_call_ents) { + err_printk(ref->dev, entry, "DMA-API: device driver syncs " + "DMA sg list with different entry count " + "[map 
count=%d] [sync count=%d]\n", + entry->sg_call_ents, ref->sg_call_ents); + } + +out: + put_hash_bucket(bucket, &flags); +} + +static void check_sg_segment(struct device *dev, struct scatterlist *sg) +{ +#ifdef CONFIG_DMA_API_DEBUG_SG + unsigned int max_seg = dma_get_max_seg_size(dev); + u64 start, end, boundary = dma_get_seg_boundary(dev); + + /* + * Either the driver forgot to set dma_parms appropriately, or + * whoever generated the list forgot to check them. + */ + if (sg->length > max_seg) + err_printk(dev, NULL, "DMA-API: mapping sg segment longer than device claims to support [len=%u] [max=%u]\n", + sg->length, max_seg); + /* + * In some cases this could potentially be the DMA API + * implementation's fault, but it would usually imply that + * the scatterlist was built inappropriately to begin with. + */ + start = sg_dma_address(sg); + end = start + sg_dma_len(sg) - 1; + if ((start ^ end) & ~boundary) + err_printk(dev, NULL, "DMA-API: mapping sg segment across boundary [start=0x%016llx] [end=0x%016llx] [boundary=0x%016llx]\n", + start, end, boundary); +#endif +} + +void debug_dma_map_page(struct device *dev, struct page *page, size_t offset, + size_t size, int direction, dma_addr_t dma_addr, + bool map_single) +{ + struct dma_debug_entry *entry; + + if (unlikely(dma_debug_disabled())) + return; + + if (dma_mapping_error(dev, dma_addr)) + return; + + entry = dma_entry_alloc(); + if (!entry) + return; + + entry->dev = dev; + entry->type = dma_debug_page; + entry->pfn = page_to_pfn(page); + entry->offset = offset, + entry->dev_addr = dma_addr; + entry->size = size; + entry->direction = direction; + entry->map_err_type = MAP_ERR_NOT_CHECKED; + + if (map_single) + entry->type = dma_debug_single; + + check_for_stack(dev, page, offset); + + if (!PageHighMem(page)) { + void *addr = page_address(page) + offset; + + check_for_illegal_area(dev, addr, size); + } + + add_dma_entry(entry); +} +EXPORT_SYMBOL(debug_dma_map_page); + +void debug_dma_mapping_error(struct device 
*dev, dma_addr_t dma_addr) +{ + struct dma_debug_entry ref; + struct dma_debug_entry *entry; + struct hash_bucket *bucket; + unsigned long flags; + + if (unlikely(dma_debug_disabled())) + return; + + ref.dev = dev; + ref.dev_addr = dma_addr; + bucket = get_hash_bucket(&ref, &flags); + + list_for_each_entry(entry, &bucket->list, list) { + if (!exact_match(&ref, entry)) + continue; + + /* + * The same physical address can be mapped multiple + * times. Without a hardware IOMMU this results in the + * same device addresses being put into the dma-debug + * hash multiple times too. This can result in false + * positives being reported. Therefore we implement a + * best-fit algorithm here which updates the first entry + * from the hash which fits the reference value and is + * not currently listed as being checked. + */ + if (entry->map_err_type == MAP_ERR_NOT_CHECKED) { + entry->map_err_type = MAP_ERR_CHECKED; + break; + } + } + + put_hash_bucket(bucket, &flags); +} +EXPORT_SYMBOL(debug_dma_mapping_error); + +void debug_dma_unmap_page(struct device *dev, dma_addr_t addr, + size_t size, int direction, bool map_single) +{ + struct dma_debug_entry ref = { + .type = dma_debug_page, + .dev = dev, + .dev_addr = addr, + .size = size, + .direction = direction, + }; + + if (unlikely(dma_debug_disabled())) + return; + + if (map_single) + ref.type = dma_debug_single; + + check_unmap(&ref); +} +EXPORT_SYMBOL(debug_dma_unmap_page); + +void debug_dma_map_sg(struct device *dev, struct scatterlist *sg, + int nents, int mapped_ents, int direction) +{ + struct dma_debug_entry *entry; + struct scatterlist *s; + int i; + + if (unlikely(dma_debug_disabled())) + return; + + for_each_sg(sg, s, mapped_ents, i) { + entry = dma_entry_alloc(); + if (!entry) + return; + + entry->type = dma_debug_sg; + entry->dev = dev; + entry->pfn = page_to_pfn(sg_page(s)); + entry->offset = s->offset, + entry->size = sg_dma_len(s); + entry->dev_addr = sg_dma_address(s); + entry->direction = direction; + 
entry->sg_call_ents = nents; + entry->sg_mapped_ents = mapped_ents; + + check_for_stack(dev, sg_page(s), s->offset); + + if (!PageHighMem(sg_page(s))) { + check_for_illegal_area(dev, sg_virt(s), sg_dma_len(s)); + } + + check_sg_segment(dev, s); + + add_dma_entry(entry); + } +} +EXPORT_SYMBOL(debug_dma_map_sg); + +static int get_nr_mapped_entries(struct device *dev, + struct dma_debug_entry *ref) +{ + struct dma_debug_entry *entry; + struct hash_bucket *bucket; + unsigned long flags; + int mapped_ents; + + bucket = get_hash_bucket(ref, &flags); + entry = bucket_find_exact(bucket, ref); + mapped_ents = 0; + + if (entry) + mapped_ents = entry->sg_mapped_ents; + put_hash_bucket(bucket, &flags); + + return mapped_ents; +} + +void debug_dma_unmap_sg(struct device *dev, struct scatterlist *sglist, + int nelems, int dir) +{ + struct scatterlist *s; + int mapped_ents = 0, i; + + if (unlikely(dma_debug_disabled())) + return; + + for_each_sg(sglist, s, nelems, i) { + + struct dma_debug_entry ref = { + .type = dma_debug_sg, + .dev = dev, + .pfn = page_to_pfn(sg_page(s)), + .offset = s->offset, + .dev_addr = sg_dma_address(s), + .size = sg_dma_len(s), + .direction = dir, + .sg_call_ents = nelems, + }; + + if (mapped_ents && i >= mapped_ents) + break; + + if (!i) + mapped_ents = get_nr_mapped_entries(dev, &ref); + + check_unmap(&ref); + } +} +EXPORT_SYMBOL(debug_dma_unmap_sg); + +void debug_dma_alloc_coherent(struct device *dev, size_t size, + dma_addr_t dma_addr, void *virt) +{ + struct dma_debug_entry *entry; + + if (unlikely(dma_debug_disabled())) + return; + + if (unlikely(virt == NULL)) + return; + + /* handle vmalloc and linear addresses */ + if (!is_vmalloc_addr(virt) && !virt_addr_valid(virt)) + return; + + entry = dma_entry_alloc(); + if (!entry) + return; + + entry->type = dma_debug_coherent; + entry->dev = dev; + entry->offset = offset_in_page(virt); + entry->size = size; + entry->dev_addr = dma_addr; + entry->direction = DMA_BIDIRECTIONAL; + + if 
(is_vmalloc_addr(virt)) + entry->pfn = vmalloc_to_pfn(virt); + else + entry->pfn = page_to_pfn(virt_to_page(virt)); + + add_dma_entry(entry); +} +EXPORT_SYMBOL(debug_dma_alloc_coherent); + +void debug_dma_free_coherent(struct device *dev, size_t size, + void *virt, dma_addr_t addr) +{ + struct dma_debug_entry ref = { + .type = dma_debug_coherent, + .dev = dev, + .offset = offset_in_page(virt), + .dev_addr = addr, + .size = size, + .direction = DMA_BIDIRECTIONAL, + }; + + /* handle vmalloc and linear addresses */ + if (!is_vmalloc_addr(virt) && !virt_addr_valid(virt)) + return; + + if (is_vmalloc_addr(virt)) + ref.pfn = vmalloc_to_pfn(virt); + else + ref.pfn = page_to_pfn(virt_to_page(virt)); + + if (unlikely(dma_debug_disabled())) + return; + + check_unmap(&ref); +} +EXPORT_SYMBOL(debug_dma_free_coherent); + +void debug_dma_map_resource(struct device *dev, phys_addr_t addr, size_t size, + int direction, dma_addr_t dma_addr) +{ + struct dma_debug_entry *entry; + + if (unlikely(dma_debug_disabled())) + return; + + entry = dma_entry_alloc(); + if (!entry) + return; + + entry->type = dma_debug_resource; + entry->dev = dev; + entry->pfn = PHYS_PFN(addr); + entry->offset = offset_in_page(addr); + entry->size = size; + entry->dev_addr = dma_addr; + entry->direction = direction; + entry->map_err_type = MAP_ERR_NOT_CHECKED; + + add_dma_entry(entry); +} +EXPORT_SYMBOL(debug_dma_map_resource); + +void debug_dma_unmap_resource(struct device *dev, dma_addr_t dma_addr, + size_t size, int direction) +{ + struct dma_debug_entry ref = { + .type = dma_debug_resource, + .dev = dev, + .dev_addr = dma_addr, + .size = size, + .direction = direction, + }; + + if (unlikely(dma_debug_disabled())) + return; + + check_unmap(&ref); +} +EXPORT_SYMBOL(debug_dma_unmap_resource); + +void debug_dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle, + size_t size, int direction) +{ + struct dma_debug_entry ref; + + if (unlikely(dma_debug_disabled())) + return; + + ref.type = 
dma_debug_single; + ref.dev = dev; + ref.dev_addr = dma_handle; + ref.size = size; + ref.direction = direction; + ref.sg_call_ents = 0; + + check_sync(dev, &ref, true); +} +EXPORT_SYMBOL(debug_dma_sync_single_for_cpu); + +void debug_dma_sync_single_for_device(struct device *dev, + dma_addr_t dma_handle, size_t size, + int direction) +{ + struct dma_debug_entry ref; + + if (unlikely(dma_debug_disabled())) + return; + + ref.type = dma_debug_single; + ref.dev = dev; + ref.dev_addr = dma_handle; + ref.size = size; + ref.direction = direction; + ref.sg_call_ents = 0; + + check_sync(dev, &ref, false); +} +EXPORT_SYMBOL(debug_dma_sync_single_for_device); + +void debug_dma_sync_single_range_for_cpu(struct device *dev, + dma_addr_t dma_handle, + unsigned long offset, size_t size, + int direction) +{ + struct dma_debug_entry ref; + + if (unlikely(dma_debug_disabled())) + return; + + ref.type = dma_debug_single; + ref.dev = dev; + ref.dev_addr = dma_handle; + ref.size = offset + size; + ref.direction = direction; + ref.sg_call_ents = 0; + + check_sync(dev, &ref, true); +} +EXPORT_SYMBOL(debug_dma_sync_single_range_for_cpu); + +void debug_dma_sync_single_range_for_device(struct device *dev, + dma_addr_t dma_handle, + unsigned long offset, + size_t size, int direction) +{ + struct dma_debug_entry ref; + + if (unlikely(dma_debug_disabled())) + return; + + ref.type = dma_debug_single; + ref.dev = dev; + ref.dev_addr = dma_handle; + ref.size = offset + size; + ref.direction = direction; + ref.sg_call_ents = 0; + + check_sync(dev, &ref, false); +} +EXPORT_SYMBOL(debug_dma_sync_single_range_for_device); + +void debug_dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, + int nelems, int direction) +{ + struct scatterlist *s; + int mapped_ents = 0, i; + + if (unlikely(dma_debug_disabled())) + return; + + for_each_sg(sg, s, nelems, i) { + + struct dma_debug_entry ref = { + .type = dma_debug_sg, + .dev = dev, + .pfn = page_to_pfn(sg_page(s)), + .offset = s->offset, + 
.dev_addr = sg_dma_address(s), + .size = sg_dma_len(s), + .direction = direction, + .sg_call_ents = nelems, + }; + + if (!i) + mapped_ents = get_nr_mapped_entries(dev, &ref); + + if (i >= mapped_ents) + break; + + check_sync(dev, &ref, true); + } +} +EXPORT_SYMBOL(debug_dma_sync_sg_for_cpu); + +void debug_dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, + int nelems, int direction) +{ + struct scatterlist *s; + int mapped_ents = 0, i; + + if (unlikely(dma_debug_disabled())) + return; + + for_each_sg(sg, s, nelems, i) { + + struct dma_debug_entry ref = { + .type = dma_debug_sg, + .dev = dev, + .pfn = page_to_pfn(sg_page(s)), + .offset = s->offset, + .dev_addr = sg_dma_address(s), + .size = sg_dma_len(s), + .direction = direction, + .sg_call_ents = nelems, + }; + if (!i) + mapped_ents = get_nr_mapped_entries(dev, &ref); + + if (i >= mapped_ents) + break; + + check_sync(dev, &ref, false); + } +} +EXPORT_SYMBOL(debug_dma_sync_sg_for_device); + +static int __init dma_debug_driver_setup(char *str) +{ + int i; + + for (i = 0; i < NAME_MAX_LEN - 1; ++i, ++str) { + current_driver_name[i] = *str; + if (*str == 0) + break; + } + + if (current_driver_name[0]) + pr_info("DMA-API: enable driver filter for driver [%s]\n", + current_driver_name); + + + return 1; +} +__setup("dma_debug_driver=", dma_debug_driver_setup); diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c new file mode 100644 index 000000000000..8be8106270c2 --- /dev/null +++ b/kernel/dma/direct.c @@ -0,0 +1,204 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * DMA operations that map physical memory directly without using an IOMMU or + * flushing caches. 
+ */ +#include +#include +#include +#include +#include +#include +#include + +#define DIRECT_MAPPING_ERROR 0 + +/* + * Most architectures use ZONE_DMA for the first 16 Megabytes, but + * some use it for entirely different regions: + */ +#ifndef ARCH_ZONE_DMA_BITS +#define ARCH_ZONE_DMA_BITS 24 +#endif + +/* + * For AMD SEV all DMA must be to unencrypted addresses. + */ +static inline bool force_dma_unencrypted(void) +{ + return sev_active(); +} + +static bool +check_addr(struct device *dev, dma_addr_t dma_addr, size_t size, + const char *caller) +{ + if (unlikely(dev && !dma_capable(dev, dma_addr, size))) { + if (!dev->dma_mask) { + dev_err(dev, + "%s: call on device without dma_mask\n", + caller); + return false; + } + + if (*dev->dma_mask >= DMA_BIT_MASK(32)) { + dev_err(dev, + "%s: overflow %pad+%zu of device mask %llx\n", + caller, &dma_addr, size, *dev->dma_mask); + } + return false; + } + return true; +} + +static bool dma_coherent_ok(struct device *dev, phys_addr_t phys, size_t size) +{ + dma_addr_t addr = force_dma_unencrypted() ? 
+ __phys_to_dma(dev, phys) : phys_to_dma(dev, phys); + return addr + size - 1 <= dev->coherent_dma_mask; +} + +void *dma_direct_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle, + gfp_t gfp, unsigned long attrs) +{ + unsigned int count = PAGE_ALIGN(size) >> PAGE_SHIFT; + int page_order = get_order(size); + struct page *page = NULL; + void *ret; + + /* we always manually zero the memory once we are done: */ + gfp &= ~__GFP_ZERO; + + /* GFP_DMA32 and GFP_DMA are no ops without the corresponding zones: */ + if (dev->coherent_dma_mask <= DMA_BIT_MASK(ARCH_ZONE_DMA_BITS)) + gfp |= GFP_DMA; + if (dev->coherent_dma_mask <= DMA_BIT_MASK(32) && !(gfp & GFP_DMA)) + gfp |= GFP_DMA32; + +again: + /* CMA can be used only in the context which permits sleeping */ + if (gfpflags_allow_blocking(gfp)) { + page = dma_alloc_from_contiguous(dev, count, page_order, gfp); + if (page && !dma_coherent_ok(dev, page_to_phys(page), size)) { + dma_release_from_contiguous(dev, page, count); + page = NULL; + } + } + if (!page) + page = alloc_pages_node(dev_to_node(dev), gfp, page_order); + + if (page && !dma_coherent_ok(dev, page_to_phys(page), size)) { + __free_pages(page, page_order); + page = NULL; + + if (IS_ENABLED(CONFIG_ZONE_DMA32) && + dev->coherent_dma_mask < DMA_BIT_MASK(64) && + !(gfp & (GFP_DMA32 | GFP_DMA))) { + gfp |= GFP_DMA32; + goto again; + } + + if (IS_ENABLED(CONFIG_ZONE_DMA) && + dev->coherent_dma_mask < DMA_BIT_MASK(32) && + !(gfp & GFP_DMA)) { + gfp = (gfp & ~GFP_DMA32) | GFP_DMA; + goto again; + } + } + + if (!page) + return NULL; + ret = page_address(page); + if (force_dma_unencrypted()) { + set_memory_decrypted((unsigned long)ret, 1 << page_order); + *dma_handle = __phys_to_dma(dev, page_to_phys(page)); + } else { + *dma_handle = phys_to_dma(dev, page_to_phys(page)); + } + memset(ret, 0, size); + return ret; +} + +/* + * NOTE: this function must never look at the dma_addr argument, because we want + * to be able to use it as a helper for iommu 
implementations as well. + */ +void dma_direct_free(struct device *dev, size_t size, void *cpu_addr, + dma_addr_t dma_addr, unsigned long attrs) +{ + unsigned int count = PAGE_ALIGN(size) >> PAGE_SHIFT; + unsigned int page_order = get_order(size); + + if (force_dma_unencrypted()) + set_memory_encrypted((unsigned long)cpu_addr, 1 << page_order); + if (!dma_release_from_contiguous(dev, virt_to_page(cpu_addr), count)) + free_pages((unsigned long)cpu_addr, page_order); +} + +dma_addr_t dma_direct_map_page(struct device *dev, struct page *page, + unsigned long offset, size_t size, enum dma_data_direction dir, + unsigned long attrs) +{ + dma_addr_t dma_addr = phys_to_dma(dev, page_to_phys(page)) + offset; + + if (!check_addr(dev, dma_addr, size, __func__)) + return DIRECT_MAPPING_ERROR; + return dma_addr; +} + +int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents, + enum dma_data_direction dir, unsigned long attrs) +{ + int i; + struct scatterlist *sg; + + for_each_sg(sgl, sg, nents, i) { + BUG_ON(!sg_page(sg)); + + sg_dma_address(sg) = phys_to_dma(dev, sg_phys(sg)); + if (!check_addr(dev, sg_dma_address(sg), sg->length, __func__)) + return 0; + sg_dma_len(sg) = sg->length; + } + + return nents; +} + +int dma_direct_supported(struct device *dev, u64 mask) +{ +#ifdef CONFIG_ZONE_DMA + if (mask < DMA_BIT_MASK(ARCH_ZONE_DMA_BITS)) + return 0; +#else + /* + * Because 32-bit DMA masks are so common we expect every architecture + * to be able to satisfy them - either by not supporting more physical + * memory, or by providing a ZONE_DMA32. If neither is the case, the + * architecture needs to use an IOMMU instead of the direct mapping. + */ + if (mask < DMA_BIT_MASK(32)) + return 0; +#endif + /* + * Various PCI/PCIe bridges have broken support for > 32bit DMA even + * if the device itself might support it. 
+ */ + if (dev->dma_32bit_limit && mask > DMA_BIT_MASK(32)) + return 0; + return 1; +} + +int dma_direct_mapping_error(struct device *dev, dma_addr_t dma_addr) +{ + return dma_addr == DIRECT_MAPPING_ERROR; +} + +const struct dma_map_ops dma_direct_ops = { + .alloc = dma_direct_alloc, + .free = dma_direct_free, + .map_page = dma_direct_map_page, + .map_sg = dma_direct_map_sg, + .dma_supported = dma_direct_supported, + .mapping_error = dma_direct_mapping_error, +}; +EXPORT_SYMBOL(dma_direct_ops); diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c new file mode 100644 index 000000000000..d2a92ddaac4d --- /dev/null +++ b/kernel/dma/mapping.c @@ -0,0 +1,345 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * arch-independent dma-mapping routines + * + * Copyright (c) 2006 SUSE Linux Products GmbH + * Copyright (c) 2006 Tejun Heo + */ + +#include +#include +#include +#include +#include +#include +#include + +/* + * Managed DMA API + */ +struct dma_devres { + size_t size; + void *vaddr; + dma_addr_t dma_handle; + unsigned long attrs; +}; + +static void dmam_release(struct device *dev, void *res) +{ + struct dma_devres *this = res; + + dma_free_attrs(dev, this->size, this->vaddr, this->dma_handle, + this->attrs); +} + +static int dmam_match(struct device *dev, void *res, void *match_data) +{ + struct dma_devres *this = res, *match = match_data; + + if (this->vaddr == match->vaddr) { + WARN_ON(this->size != match->size || + this->dma_handle != match->dma_handle); + return 1; + } + return 0; +} + +/** + * dmam_alloc_coherent - Managed dma_alloc_coherent() + * @dev: Device to allocate coherent memory for + * @size: Size of allocation + * @dma_handle: Out argument for allocated DMA handle + * @gfp: Allocation flags + * + * Managed dma_alloc_coherent(). Memory allocated using this function + * will be automatically released on driver detach. + * + * RETURNS: + * Pointer to allocated memory on success, NULL on failure. 
+ */ +void *dmam_alloc_coherent(struct device *dev, size_t size, + dma_addr_t *dma_handle, gfp_t gfp) +{ + struct dma_devres *dr; + void *vaddr; + + dr = devres_alloc(dmam_release, sizeof(*dr), gfp); + if (!dr) + return NULL; + + vaddr = dma_alloc_coherent(dev, size, dma_handle, gfp); + if (!vaddr) { + devres_free(dr); + return NULL; + } + + dr->vaddr = vaddr; + dr->dma_handle = *dma_handle; + dr->size = size; + + devres_add(dev, dr); + + return vaddr; +} +EXPORT_SYMBOL(dmam_alloc_coherent); + +/** + * dmam_free_coherent - Managed dma_free_coherent() + * @dev: Device to free coherent memory for + * @size: Size of allocation + * @vaddr: Virtual address of the memory to free + * @dma_handle: DMA handle of the memory to free + * + * Managed dma_free_coherent(). + */ +void dmam_free_coherent(struct device *dev, size_t size, void *vaddr, + dma_addr_t dma_handle) +{ + struct dma_devres match_data = { size, vaddr, dma_handle }; + + dma_free_coherent(dev, size, vaddr, dma_handle); + WARN_ON(devres_destroy(dev, dmam_release, dmam_match, &match_data)); +} +EXPORT_SYMBOL(dmam_free_coherent); + +/** + * dmam_alloc_attrs - Managed dma_alloc_attrs() + * @dev: Device to allocate non_coherent memory for + * @size: Size of allocation + * @dma_handle: Out argument for allocated DMA handle + * @gfp: Allocation flags + * @attrs: Flags in the DMA_ATTR_* namespace. + * + * Managed dma_alloc_attrs(). Memory allocated using this function will be + * automatically released on driver detach. + * + * RETURNS: + * Pointer to allocated memory on success, NULL on failure. 
+ */ +void *dmam_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle, + gfp_t gfp, unsigned long attrs) +{ + struct dma_devres *dr; + void *vaddr; + + dr = devres_alloc(dmam_release, sizeof(*dr), gfp); + if (!dr) + return NULL; + + vaddr = dma_alloc_attrs(dev, size, dma_handle, gfp, attrs); + if (!vaddr) { + devres_free(dr); + return NULL; + } + + dr->vaddr = vaddr; + dr->dma_handle = *dma_handle; + dr->size = size; + dr->attrs = attrs; + + devres_add(dev, dr); + + return vaddr; +} +EXPORT_SYMBOL(dmam_alloc_attrs); + +#ifdef CONFIG_HAVE_GENERIC_DMA_COHERENT + +static void dmam_coherent_decl_release(struct device *dev, void *res) +{ + dma_release_declared_memory(dev); +} + +/** + * dmam_declare_coherent_memory - Managed dma_declare_coherent_memory() + * @dev: Device to declare coherent memory for + * @phys_addr: Physical address of coherent memory to be declared + * @device_addr: Device address of coherent memory to be declared + * @size: Size of coherent memory to be declared + * @flags: Flags + * + * Managed dma_declare_coherent_memory(). + * + * RETURNS: + * 0 on success, -errno on failure. + */ +int dmam_declare_coherent_memory(struct device *dev, phys_addr_t phys_addr, + dma_addr_t device_addr, size_t size, int flags) +{ + void *res; + int rc; + + res = devres_alloc(dmam_coherent_decl_release, 0, GFP_KERNEL); + if (!res) + return -ENOMEM; + + rc = dma_declare_coherent_memory(dev, phys_addr, device_addr, size, + flags); + if (!rc) + devres_add(dev, res); + else + devres_free(res); + + return rc; +} +EXPORT_SYMBOL(dmam_declare_coherent_memory); + +/** + * dmam_release_declared_memory - Managed dma_release_declared_memory(). + * @dev: Device to release declared coherent memory for + * + * Managed dma_release_declared_memory().
+ */ +void dmam_release_declared_memory(struct device *dev) +{ + WARN_ON(devres_destroy(dev, dmam_coherent_decl_release, NULL, NULL)); +} +EXPORT_SYMBOL(dmam_release_declared_memory); + +#endif + +/* + * Create scatter-list for the already allocated DMA buffer. + */ +int dma_common_get_sgtable(struct device *dev, struct sg_table *sgt, + void *cpu_addr, dma_addr_t handle, size_t size) +{ + struct page *page = virt_to_page(cpu_addr); + int ret; + + ret = sg_alloc_table(sgt, 1, GFP_KERNEL); + if (unlikely(ret)) + return ret; + + sg_set_page(sgt->sgl, page, PAGE_ALIGN(size), 0); + return 0; +} +EXPORT_SYMBOL(dma_common_get_sgtable); + +/* + * Create userspace mapping for the DMA-coherent memory. + */ +int dma_common_mmap(struct device *dev, struct vm_area_struct *vma, + void *cpu_addr, dma_addr_t dma_addr, size_t size) +{ + int ret = -ENXIO; +#ifndef CONFIG_ARCH_NO_COHERENT_DMA_MMAP + unsigned long user_count = vma_pages(vma); + unsigned long count = PAGE_ALIGN(size) >> PAGE_SHIFT; + unsigned long off = vma->vm_pgoff; + + vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); + + if (dma_mmap_from_dev_coherent(dev, vma, cpu_addr, size, &ret)) + return ret; + + if (off < count && user_count <= (count - off)) + ret = remap_pfn_range(vma, vma->vm_start, + page_to_pfn(virt_to_page(cpu_addr)) + off, + user_count << PAGE_SHIFT, + vma->vm_page_prot); +#endif /* !CONFIG_ARCH_NO_COHERENT_DMA_MMAP */ + + return ret; +} +EXPORT_SYMBOL(dma_common_mmap); + +#ifdef CONFIG_MMU +static struct vm_struct *__dma_common_pages_remap(struct page **pages, + size_t size, unsigned long vm_flags, pgprot_t prot, + const void *caller) +{ + struct vm_struct *area; + + area = get_vm_area_caller(size, vm_flags, caller); + if (!area) + return NULL; + + if (map_vm_area(area, prot, pages)) { + vunmap(area->addr); + return NULL; + } + + return area; +} + +/* + * remaps an array of PAGE_SIZE pages into another vm_area + * Cannot be used in non-sleeping contexts + */ +void *dma_common_pages_remap(struct 
page **pages, size_t size, + unsigned long vm_flags, pgprot_t prot, + const void *caller) +{ + struct vm_struct *area; + + area = __dma_common_pages_remap(pages, size, vm_flags, prot, caller); + if (!area) + return NULL; + + area->pages = pages; + + return area->addr; +} + +/* + * remaps an allocated contiguous region into another vm_area. + * Cannot be used in non-sleeping contexts + */ + +void *dma_common_contiguous_remap(struct page *page, size_t size, + unsigned long vm_flags, + pgprot_t prot, const void *caller) +{ + int i; + struct page **pages; + struct vm_struct *area; + + pages = kmalloc(sizeof(struct page *) << get_order(size), GFP_KERNEL); + if (!pages) + return NULL; + + for (i = 0; i < (size >> PAGE_SHIFT); i++) + pages[i] = nth_page(page, i); + + area = __dma_common_pages_remap(pages, size, vm_flags, prot, caller); + + kfree(pages); + + if (!area) + return NULL; + return area->addr; +} + +/* + * unmaps a range previously mapped by dma_common_*_remap + */ +void dma_common_free_remap(void *cpu_addr, size_t size, unsigned long vm_flags) +{ + struct vm_struct *area = find_vm_area(cpu_addr); + + if (!area || (area->flags & vm_flags) != vm_flags) { + WARN(1, "trying to free invalid coherent area: %p\n", cpu_addr); + return; + } + + unmap_kernel_range((unsigned long)cpu_addr, PAGE_ALIGN(size)); + vunmap(cpu_addr); +} +#endif + +/* + * enables DMA API use for a device + */ +int dma_configure(struct device *dev) +{ + if (dev->bus->dma_configure) + return dev->bus->dma_configure(dev); + return 0; +} + +void dma_deconfigure(struct device *dev) +{ + of_dma_deconfigure(dev); + acpi_dma_deconfigure(dev); +} diff --git a/kernel/dma/noncoherent.c b/kernel/dma/noncoherent.c new file mode 100644 index 000000000000..79e9a757387f --- /dev/null +++ b/kernel/dma/noncoherent.c @@ -0,0 +1,102 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2018 Christoph Hellwig. + * + * DMA operations that map physical memory directly without providing cache + * coherence. 
+ */ +#include +#include +#include +#include +#include + +static void dma_noncoherent_sync_single_for_device(struct device *dev, + dma_addr_t addr, size_t size, enum dma_data_direction dir) +{ + arch_sync_dma_for_device(dev, dma_to_phys(dev, addr), size, dir); +} + +static void dma_noncoherent_sync_sg_for_device(struct device *dev, + struct scatterlist *sgl, int nents, enum dma_data_direction dir) +{ + struct scatterlist *sg; + int i; + + for_each_sg(sgl, sg, nents, i) + arch_sync_dma_for_device(dev, sg_phys(sg), sg->length, dir); +} + +static dma_addr_t dma_noncoherent_map_page(struct device *dev, struct page *page, + unsigned long offset, size_t size, enum dma_data_direction dir, + unsigned long attrs) +{ + dma_addr_t addr; + + addr = dma_direct_map_page(dev, page, offset, size, dir, attrs); + if (!dma_mapping_error(dev, addr) && !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) + arch_sync_dma_for_device(dev, page_to_phys(page) + offset, + size, dir); + return addr; +} + +static int dma_noncoherent_map_sg(struct device *dev, struct scatterlist *sgl, + int nents, enum dma_data_direction dir, unsigned long attrs) +{ + nents = dma_direct_map_sg(dev, sgl, nents, dir, attrs); + if (nents > 0 && !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) + dma_noncoherent_sync_sg_for_device(dev, sgl, nents, dir); + return nents; +} + +#ifdef CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU +static void dma_noncoherent_sync_single_for_cpu(struct device *dev, + dma_addr_t addr, size_t size, enum dma_data_direction dir) +{ + arch_sync_dma_for_cpu(dev, dma_to_phys(dev, addr), size, dir); +} + +static void dma_noncoherent_sync_sg_for_cpu(struct device *dev, + struct scatterlist *sgl, int nents, enum dma_data_direction dir) +{ + struct scatterlist *sg; + int i; + + for_each_sg(sgl, sg, nents, i) + arch_sync_dma_for_cpu(dev, sg_phys(sg), sg->length, dir); +} + +static void dma_noncoherent_unmap_page(struct device *dev, dma_addr_t addr, + size_t size, enum dma_data_direction dir, unsigned long attrs) +{ + if (!(attrs & 
DMA_ATTR_SKIP_CPU_SYNC)) + dma_noncoherent_sync_single_for_cpu(dev, addr, size, dir); +} + +static void dma_noncoherent_unmap_sg(struct device *dev, struct scatterlist *sgl, + int nents, enum dma_data_direction dir, unsigned long attrs) +{ + if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC)) + dma_noncoherent_sync_sg_for_cpu(dev, sgl, nents, dir); +} +#endif + +const struct dma_map_ops dma_noncoherent_ops = { + .alloc = arch_dma_alloc, + .free = arch_dma_free, + .mmap = arch_dma_mmap, + .sync_single_for_device = dma_noncoherent_sync_single_for_device, + .sync_sg_for_device = dma_noncoherent_sync_sg_for_device, + .map_page = dma_noncoherent_map_page, + .map_sg = dma_noncoherent_map_sg, +#ifdef CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU + .sync_single_for_cpu = dma_noncoherent_sync_single_for_cpu, + .sync_sg_for_cpu = dma_noncoherent_sync_sg_for_cpu, + .unmap_page = dma_noncoherent_unmap_page, + .unmap_sg = dma_noncoherent_unmap_sg, +#endif + .dma_supported = dma_direct_supported, + .mapping_error = dma_direct_mapping_error, + .cache_sync = arch_dma_cache_sync, +}; +EXPORT_SYMBOL(dma_noncoherent_ops); diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c new file mode 100644 index 000000000000..04b68d9dffac --- /dev/null +++ b/kernel/dma/swiotlb.c @@ -0,0 +1,1087 @@ +/* + * Dynamic DMA mapping support. + * + * This implementation is a fallback for platforms that do not support + * I/O TLBs (aka DMA address translation hardware). + * Copyright (C) 2000 Asit Mallick + * Copyright (C) 2000 Goutham Rao + * Copyright (C) 2000, 2003 Hewlett-Packard Co + * David Mosberger-Tang + * + * 03/05/07 davidm Switch from PCI-DMA to generic device DMA API. + * 00/12/13 davidm Rename to swiotlb.c and add mark_clean() to avoid + * unnecessary i-cache flushing. + * 04/07/.. ak Better overflow handling. Assorted fixes. + * 05/09/10 linville Add support for syncing ranges, support syncing for + * DMA_BIDIRECTIONAL mappings, miscellaneous cleanup. 
+ * 08/12/11 beckyb Add highmem support + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include + +#define CREATE_TRACE_POINTS +#include + +#define OFFSET(val,align) ((unsigned long) \ + ( (val) & ( (align) - 1))) + +#define SLABS_PER_PAGE (1 << (PAGE_SHIFT - IO_TLB_SHIFT)) + +/* + * Minimum IO TLB size to bother booting with. Systems with mainly + * 64bit capable cards will only lightly use the swiotlb. If we can't + * allocate a contiguous 1MB, we're probably in trouble anyway. + */ +#define IO_TLB_MIN_SLABS ((1<<20) >> IO_TLB_SHIFT) + +enum swiotlb_force swiotlb_force; + +/* + * Used to do a quick range check in swiotlb_tbl_unmap_single and + * swiotlb_tbl_sync_single_*, to see if the memory was in fact allocated by this + * API. + */ +static phys_addr_t io_tlb_start, io_tlb_end; + +/* + * The number of IO TLB blocks (in groups of 64) between io_tlb_start and + * io_tlb_end. This is command line adjustable via setup_io_tlb_npages. + */ +static unsigned long io_tlb_nslabs; + +/* + * When the IOMMU overflows we return a fallback buffer. This sets the size. + */ +static unsigned long io_tlb_overflow = 32*1024; + +static phys_addr_t io_tlb_overflow_buffer; + +/* + * This is a free list describing the number of free entries available from + * each index + */ +static unsigned int *io_tlb_list; +static unsigned int io_tlb_index; + +/* + * Max segment that we can provide which (if pages are contiguous) will + * not be bounced (unless SWIOTLB_FORCE is set). + */ +unsigned int max_segment; + +/* + * We need to save away the original address corresponding to a mapped entry + * for the sync operations.
+ */ +#define INVALID_PHYS_ADDR (~(phys_addr_t)0) +static phys_addr_t *io_tlb_orig_addr; + +/* + * Protect the above data structures in the map and unmap calls + */ +static DEFINE_SPINLOCK(io_tlb_lock); + +static int late_alloc; + +static int __init +setup_io_tlb_npages(char *str) +{ + if (isdigit(*str)) { + io_tlb_nslabs = simple_strtoul(str, &str, 0); + /* avoid tail segment of size < IO_TLB_SEGSIZE */ + io_tlb_nslabs = ALIGN(io_tlb_nslabs, IO_TLB_SEGSIZE); + } + if (*str == ',') + ++str; + if (!strcmp(str, "force")) { + swiotlb_force = SWIOTLB_FORCE; + } else if (!strcmp(str, "noforce")) { + swiotlb_force = SWIOTLB_NO_FORCE; + io_tlb_nslabs = 1; + } + + return 0; +} +early_param("swiotlb", setup_io_tlb_npages); +/* make io_tlb_overflow tunable too? */ + +unsigned long swiotlb_nr_tbl(void) +{ + return io_tlb_nslabs; +} +EXPORT_SYMBOL_GPL(swiotlb_nr_tbl); + +unsigned int swiotlb_max_segment(void) +{ + return max_segment; +} +EXPORT_SYMBOL_GPL(swiotlb_max_segment); + +void swiotlb_set_max_segment(unsigned int val) +{ + if (swiotlb_force == SWIOTLB_FORCE) + max_segment = 1; + else + max_segment = rounddown(val, PAGE_SIZE); +} + +/* default to 64MB */ +#define IO_TLB_DEFAULT_SIZE (64UL<<20) +unsigned long swiotlb_size_or_default(void) +{ + unsigned long size; + + size = io_tlb_nslabs << IO_TLB_SHIFT; + + return size ? 
size : (IO_TLB_DEFAULT_SIZE); +} + +static bool no_iotlb_memory; + +void swiotlb_print_info(void) +{ + unsigned long bytes = io_tlb_nslabs << IO_TLB_SHIFT; + unsigned char *vstart, *vend; + + if (no_iotlb_memory) { + pr_warn("software IO TLB: No low mem\n"); + return; + } + + vstart = phys_to_virt(io_tlb_start); + vend = phys_to_virt(io_tlb_end); + + printk(KERN_INFO "software IO TLB [mem %#010llx-%#010llx] (%luMB) mapped at [%p-%p]\n", + (unsigned long long)io_tlb_start, + (unsigned long long)io_tlb_end, + bytes >> 20, vstart, vend - 1); +} + +/* + * Early SWIOTLB allocation may be too early to allow an architecture to + * perform the desired operations. This function allows the architecture to + * call SWIOTLB when the operations are possible. It needs to be called + * before the SWIOTLB memory is used. + */ +void __init swiotlb_update_mem_attributes(void) +{ + void *vaddr; + unsigned long bytes; + + if (no_iotlb_memory || late_alloc) + return; + + vaddr = phys_to_virt(io_tlb_start); + bytes = PAGE_ALIGN(io_tlb_nslabs << IO_TLB_SHIFT); + set_memory_decrypted((unsigned long)vaddr, bytes >> PAGE_SHIFT); + memset(vaddr, 0, bytes); + + vaddr = phys_to_virt(io_tlb_overflow_buffer); + bytes = PAGE_ALIGN(io_tlb_overflow); + set_memory_decrypted((unsigned long)vaddr, bytes >> PAGE_SHIFT); + memset(vaddr, 0, bytes); +} + +int __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose) +{ + void *v_overflow_buffer; + unsigned long i, bytes; + + bytes = nslabs << IO_TLB_SHIFT; + + io_tlb_nslabs = nslabs; + io_tlb_start = __pa(tlb); + io_tlb_end = io_tlb_start + bytes; + + /* + * Get the overflow emergency buffer + */ + v_overflow_buffer = memblock_virt_alloc_low_nopanic( + PAGE_ALIGN(io_tlb_overflow), + PAGE_SIZE); + if (!v_overflow_buffer) + return -ENOMEM; + + io_tlb_overflow_buffer = __pa(v_overflow_buffer); + + /* + * Allocate and initialize the free list array. 
This array is used + * to find contiguous free memory regions of size up to IO_TLB_SEGSIZE + * between io_tlb_start and io_tlb_end. + */ + io_tlb_list = memblock_virt_alloc( + PAGE_ALIGN(io_tlb_nslabs * sizeof(int)), + PAGE_SIZE); + io_tlb_orig_addr = memblock_virt_alloc( + PAGE_ALIGN(io_tlb_nslabs * sizeof(phys_addr_t)), + PAGE_SIZE); + for (i = 0; i < io_tlb_nslabs; i++) { + io_tlb_list[i] = IO_TLB_SEGSIZE - OFFSET(i, IO_TLB_SEGSIZE); + io_tlb_orig_addr[i] = INVALID_PHYS_ADDR; + } + io_tlb_index = 0; + + if (verbose) + swiotlb_print_info(); + + swiotlb_set_max_segment(io_tlb_nslabs << IO_TLB_SHIFT); + return 0; +} + +/* + * Statically reserve bounce buffer space and initialize bounce buffer data + * structures for the software IO TLB used to implement the DMA API. + */ +void __init +swiotlb_init(int verbose) +{ + size_t default_size = IO_TLB_DEFAULT_SIZE; + unsigned char *vstart; + unsigned long bytes; + + if (!io_tlb_nslabs) { + io_tlb_nslabs = (default_size >> IO_TLB_SHIFT); + io_tlb_nslabs = ALIGN(io_tlb_nslabs, IO_TLB_SEGSIZE); + } + + bytes = io_tlb_nslabs << IO_TLB_SHIFT; + + /* Get IO TLB memory from the low pages */ + vstart = memblock_virt_alloc_low_nopanic(PAGE_ALIGN(bytes), PAGE_SIZE); + if (vstart && !swiotlb_init_with_tbl(vstart, io_tlb_nslabs, verbose)) + return; + + if (io_tlb_start) + memblock_free_early(io_tlb_start, + PAGE_ALIGN(io_tlb_nslabs << IO_TLB_SHIFT)); + pr_warn("Cannot allocate SWIOTLB buffer"); + no_iotlb_memory = true; +} + +/* + * Systems with larger DMA zones (those that don't support ISA) can + * initialize the swiotlb later using the slab allocator if needed. + * This should be just like above, but with some error catching. 
+ */ +int +swiotlb_late_init_with_default_size(size_t default_size) +{ + unsigned long bytes, req_nslabs = io_tlb_nslabs; + unsigned char *vstart = NULL; + unsigned int order; + int rc = 0; + + if (!io_tlb_nslabs) { + io_tlb_nslabs = (default_size >> IO_TLB_SHIFT); + io_tlb_nslabs = ALIGN(io_tlb_nslabs, IO_TLB_SEGSIZE); + } + + /* + * Get IO TLB memory from the low pages + */ + order = get_order(io_tlb_nslabs << IO_TLB_SHIFT); + io_tlb_nslabs = SLABS_PER_PAGE << order; + bytes = io_tlb_nslabs << IO_TLB_SHIFT; + + while ((SLABS_PER_PAGE << order) > IO_TLB_MIN_SLABS) { + vstart = (void *)__get_free_pages(GFP_DMA | __GFP_NOWARN, + order); + if (vstart) + break; + order--; + } + + if (!vstart) { + io_tlb_nslabs = req_nslabs; + return -ENOMEM; + } + if (order != get_order(bytes)) { + printk(KERN_WARNING "Warning: only able to allocate %ld MB " + "for software IO TLB\n", (PAGE_SIZE << order) >> 20); + io_tlb_nslabs = SLABS_PER_PAGE << order; + } + rc = swiotlb_late_init_with_tbl(vstart, io_tlb_nslabs); + if (rc) + free_pages((unsigned long)vstart, order); + + return rc; +} + +int +swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs) +{ + unsigned long i, bytes; + unsigned char *v_overflow_buffer; + + bytes = nslabs << IO_TLB_SHIFT; + + io_tlb_nslabs = nslabs; + io_tlb_start = virt_to_phys(tlb); + io_tlb_end = io_tlb_start + bytes; + + set_memory_decrypted((unsigned long)tlb, bytes >> PAGE_SHIFT); + memset(tlb, 0, bytes); + + /* + * Get the overflow emergency buffer + */ + v_overflow_buffer = (void *)__get_free_pages(GFP_DMA, + get_order(io_tlb_overflow)); + if (!v_overflow_buffer) + goto cleanup2; + + set_memory_decrypted((unsigned long)v_overflow_buffer, + io_tlb_overflow >> PAGE_SHIFT); + memset(v_overflow_buffer, 0, io_tlb_overflow); + io_tlb_overflow_buffer = virt_to_phys(v_overflow_buffer); + + /* + * Allocate and initialize the free list array. 
This array is used + * to find contiguous free memory regions of size up to IO_TLB_SEGSIZE + * between io_tlb_start and io_tlb_end. + */ + io_tlb_list = (unsigned int *)__get_free_pages(GFP_KERNEL, + get_order(io_tlb_nslabs * sizeof(int))); + if (!io_tlb_list) + goto cleanup3; + + io_tlb_orig_addr = (phys_addr_t *) + __get_free_pages(GFP_KERNEL, + get_order(io_tlb_nslabs * + sizeof(phys_addr_t))); + if (!io_tlb_orig_addr) + goto cleanup4; + + for (i = 0; i < io_tlb_nslabs; i++) { + io_tlb_list[i] = IO_TLB_SEGSIZE - OFFSET(i, IO_TLB_SEGSIZE); + io_tlb_orig_addr[i] = INVALID_PHYS_ADDR; + } + io_tlb_index = 0; + + swiotlb_print_info(); + + late_alloc = 1; + + swiotlb_set_max_segment(io_tlb_nslabs << IO_TLB_SHIFT); + + return 0; + +cleanup4: + free_pages((unsigned long)io_tlb_list, get_order(io_tlb_nslabs * + sizeof(int))); + io_tlb_list = NULL; +cleanup3: + free_pages((unsigned long)v_overflow_buffer, + get_order(io_tlb_overflow)); + io_tlb_overflow_buffer = 0; +cleanup2: + io_tlb_end = 0; + io_tlb_start = 0; + io_tlb_nslabs = 0; + max_segment = 0; + return -ENOMEM; +} + +void __init swiotlb_exit(void) +{ + if (!io_tlb_orig_addr) + return; + + if (late_alloc) { + free_pages((unsigned long)phys_to_virt(io_tlb_overflow_buffer), + get_order(io_tlb_overflow)); + free_pages((unsigned long)io_tlb_orig_addr, + get_order(io_tlb_nslabs * sizeof(phys_addr_t))); + free_pages((unsigned long)io_tlb_list, get_order(io_tlb_nslabs * + sizeof(int))); + free_pages((unsigned long)phys_to_virt(io_tlb_start), + get_order(io_tlb_nslabs << IO_TLB_SHIFT)); + } else { + memblock_free_late(io_tlb_overflow_buffer, + PAGE_ALIGN(io_tlb_overflow)); + memblock_free_late(__pa(io_tlb_orig_addr), + PAGE_ALIGN(io_tlb_nslabs * sizeof(phys_addr_t))); + memblock_free_late(__pa(io_tlb_list), + PAGE_ALIGN(io_tlb_nslabs * sizeof(int))); + memblock_free_late(io_tlb_start, + PAGE_ALIGN(io_tlb_nslabs << IO_TLB_SHIFT)); + } + io_tlb_nslabs = 0; + max_segment = 0; +} + +int is_swiotlb_buffer(phys_addr_t paddr) +{ 
+ return paddr >= io_tlb_start && paddr < io_tlb_end; +} + +/* + * Bounce: copy the swiotlb buffer back to the original dma location + */ +static void swiotlb_bounce(phys_addr_t orig_addr, phys_addr_t tlb_addr, + size_t size, enum dma_data_direction dir) +{ + unsigned long pfn = PFN_DOWN(orig_addr); + unsigned char *vaddr = phys_to_virt(tlb_addr); + + if (PageHighMem(pfn_to_page(pfn))) { + /* The buffer does not have a mapping. Map it in and copy */ + unsigned int offset = orig_addr & ~PAGE_MASK; + char *buffer; + unsigned int sz = 0; + unsigned long flags; + + while (size) { + sz = min_t(size_t, PAGE_SIZE - offset, size); + + local_irq_save(flags); + buffer = kmap_atomic(pfn_to_page(pfn)); + if (dir == DMA_TO_DEVICE) + memcpy(vaddr, buffer + offset, sz); + else + memcpy(buffer + offset, vaddr, sz); + kunmap_atomic(buffer); + local_irq_restore(flags); + + size -= sz; + pfn++; + vaddr += sz; + offset = 0; + } + } else if (dir == DMA_TO_DEVICE) { + memcpy(vaddr, phys_to_virt(orig_addr), size); + } else { + memcpy(phys_to_virt(orig_addr), vaddr, size); + } +} + +phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, + dma_addr_t tbl_dma_addr, + phys_addr_t orig_addr, size_t size, + enum dma_data_direction dir, + unsigned long attrs) +{ + unsigned long flags; + phys_addr_t tlb_addr; + unsigned int nslots, stride, index, wrap; + int i; + unsigned long mask; + unsigned long offset_slots; + unsigned long max_slots; + + if (no_iotlb_memory) + panic("Can not allocate SWIOTLB buffer earlier and can't now provide you with the DMA bounce buffer"); + + if (mem_encrypt_active()) + pr_warn_once("%s is active and system is using DMA bounce buffers\n", + sme_active() ? "SME" : "SEV"); + + mask = dma_get_seg_boundary(hwdev); + + tbl_dma_addr &= mask; + + offset_slots = ALIGN(tbl_dma_addr, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT; + + /* + * Carefully handle integer overflow which can occur when mask == ~0UL. + */ + max_slots = mask + 1 + ? 
ALIGN(mask + 1, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT + : 1UL << (BITS_PER_LONG - IO_TLB_SHIFT); + + /* + * For mappings greater than or equal to a page, we limit the stride + * (and hence alignment) to a page size. + */ + nslots = ALIGN(size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT; + if (size >= PAGE_SIZE) + stride = (1 << (PAGE_SHIFT - IO_TLB_SHIFT)); + else + stride = 1; + + BUG_ON(!nslots); + + /* + * Find suitable number of IO TLB entries size that will fit this + * request and allocate a buffer from that IO TLB pool. + */ + spin_lock_irqsave(&io_tlb_lock, flags); + index = ALIGN(io_tlb_index, stride); + if (index >= io_tlb_nslabs) + index = 0; + wrap = index; + + do { + while (iommu_is_span_boundary(index, nslots, offset_slots, + max_slots)) { + index += stride; + if (index >= io_tlb_nslabs) + index = 0; + if (index == wrap) + goto not_found; + } + + /* + * If we find a slot that indicates we have 'nslots' number of + * contiguous buffers, we allocate the buffers from that slot + * and mark the entries as '0' indicating unavailable. + */ + if (io_tlb_list[index] >= nslots) { + int count = 0; + + for (i = index; i < (int) (index + nslots); i++) + io_tlb_list[i] = 0; + for (i = index - 1; (OFFSET(i, IO_TLB_SEGSIZE) != IO_TLB_SEGSIZE - 1) && io_tlb_list[i]; i--) + io_tlb_list[i] = ++count; + tlb_addr = io_tlb_start + (index << IO_TLB_SHIFT); + + /* + * Update the indices to avoid searching in the next + * round. + */ + io_tlb_index = ((index + nslots) < io_tlb_nslabs + ? (index + nslots) : 0); + + goto found; + } + index += stride; + if (index >= io_tlb_nslabs) + index = 0; + } while (index != wrap); + +not_found: + spin_unlock_irqrestore(&io_tlb_lock, flags); + if (!(attrs & DMA_ATTR_NO_WARN) && printk_ratelimit()) + dev_warn(hwdev, "swiotlb buffer is full (sz: %zd bytes)\n", size); + return SWIOTLB_MAP_ERROR; +found: + spin_unlock_irqrestore(&io_tlb_lock, flags); + + /* + * Save away the mapping from the original address to the DMA address. 
+ * This is needed when we sync the memory. Then we sync the buffer if + * needed. + */ + for (i = 0; i < nslots; i++) + io_tlb_orig_addr[index+i] = orig_addr + (i << IO_TLB_SHIFT); + if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) && + (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL)) + swiotlb_bounce(orig_addr, tlb_addr, size, DMA_TO_DEVICE); + + return tlb_addr; +} + +/* + * Allocates bounce buffer and returns its physical address. + */ +static phys_addr_t +map_single(struct device *hwdev, phys_addr_t phys, size_t size, + enum dma_data_direction dir, unsigned long attrs) +{ + dma_addr_t start_dma_addr; + + if (swiotlb_force == SWIOTLB_NO_FORCE) { + dev_warn_ratelimited(hwdev, "Cannot do DMA to address %pa\n", + &phys); + return SWIOTLB_MAP_ERROR; + } + + start_dma_addr = __phys_to_dma(hwdev, io_tlb_start); + return swiotlb_tbl_map_single(hwdev, start_dma_addr, phys, size, + dir, attrs); +} + +/* + * tlb_addr is the physical address of the bounce buffer to unmap. + */ +void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr, + size_t size, enum dma_data_direction dir, + unsigned long attrs) +{ + unsigned long flags; + int i, count, nslots = ALIGN(size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT; + int index = (tlb_addr - io_tlb_start) >> IO_TLB_SHIFT; + phys_addr_t orig_addr = io_tlb_orig_addr[index]; + + /* + * First, sync the memory before unmapping the entry + */ + if (orig_addr != INVALID_PHYS_ADDR && + !(attrs & DMA_ATTR_SKIP_CPU_SYNC) && + ((dir == DMA_FROM_DEVICE) || (dir == DMA_BIDIRECTIONAL))) + swiotlb_bounce(orig_addr, tlb_addr, size, DMA_FROM_DEVICE); + + /* + * Return the buffer to the free list by setting the corresponding + * entries to indicate the number of contiguous entries available. + * While returning the entries to the free list, we merge the entries + * with slots below and above the pool being returned. + */ + spin_lock_irqsave(&io_tlb_lock, flags); + { + count = ((index + nslots) < ALIGN(index + 1, IO_TLB_SEGSIZE) ? 
+ io_tlb_list[index + nslots] : 0); + /* + * Step 1: return the slots to the free list, merging the + * slots with superceeding slots + */ + for (i = index + nslots - 1; i >= index; i--) { + io_tlb_list[i] = ++count; + io_tlb_orig_addr[i] = INVALID_PHYS_ADDR; + } + /* + * Step 2: merge the returned slots with the preceding slots, + * if available (non zero) + */ + for (i = index - 1; (OFFSET(i, IO_TLB_SEGSIZE) != IO_TLB_SEGSIZE -1) && io_tlb_list[i]; i--) + io_tlb_list[i] = ++count; + } + spin_unlock_irqrestore(&io_tlb_lock, flags); +} + +void swiotlb_tbl_sync_single(struct device *hwdev, phys_addr_t tlb_addr, + size_t size, enum dma_data_direction dir, + enum dma_sync_target target) +{ + int index = (tlb_addr - io_tlb_start) >> IO_TLB_SHIFT; + phys_addr_t orig_addr = io_tlb_orig_addr[index]; + + if (orig_addr == INVALID_PHYS_ADDR) + return; + orig_addr += (unsigned long)tlb_addr & ((1 << IO_TLB_SHIFT) - 1); + + switch (target) { + case SYNC_FOR_CPU: + if (likely(dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)) + swiotlb_bounce(orig_addr, tlb_addr, + size, DMA_FROM_DEVICE); + else + BUG_ON(dir != DMA_TO_DEVICE); + break; + case SYNC_FOR_DEVICE: + if (likely(dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL)) + swiotlb_bounce(orig_addr, tlb_addr, + size, DMA_TO_DEVICE); + else + BUG_ON(dir != DMA_FROM_DEVICE); + break; + default: + BUG(); + } +} + +static inline bool dma_coherent_ok(struct device *dev, dma_addr_t addr, + size_t size) +{ + u64 mask = DMA_BIT_MASK(32); + + if (dev && dev->coherent_dma_mask) + mask = dev->coherent_dma_mask; + return addr + size - 1 <= mask; +} + +static void * +swiotlb_alloc_buffer(struct device *dev, size_t size, dma_addr_t *dma_handle, + unsigned long attrs) +{ + phys_addr_t phys_addr; + + if (swiotlb_force == SWIOTLB_NO_FORCE) + goto out_warn; + + phys_addr = swiotlb_tbl_map_single(dev, + __phys_to_dma(dev, io_tlb_start), + 0, size, DMA_FROM_DEVICE, attrs); + if (phys_addr == SWIOTLB_MAP_ERROR) + goto out_warn; + + *dma_handle = 
__phys_to_dma(dev, phys_addr); + if (!dma_coherent_ok(dev, *dma_handle, size)) + goto out_unmap; + + memset(phys_to_virt(phys_addr), 0, size); + return phys_to_virt(phys_addr); + +out_unmap: + dev_warn(dev, "hwdev DMA mask = 0x%016Lx, dev_addr = 0x%016Lx\n", + (unsigned long long)dev->coherent_dma_mask, + (unsigned long long)*dma_handle); + + /* + * DMA_TO_DEVICE to avoid memcpy in unmap_single. + * DMA_ATTR_SKIP_CPU_SYNC is optional. + */ + swiotlb_tbl_unmap_single(dev, phys_addr, size, DMA_TO_DEVICE, + DMA_ATTR_SKIP_CPU_SYNC); +out_warn: + if (!(attrs & DMA_ATTR_NO_WARN) && printk_ratelimit()) { + dev_warn(dev, + "swiotlb: coherent allocation failed, size=%zu\n", + size); + dump_stack(); + } + return NULL; +} + +static bool swiotlb_free_buffer(struct device *dev, size_t size, + dma_addr_t dma_addr) +{ + phys_addr_t phys_addr = dma_to_phys(dev, dma_addr); + + WARN_ON_ONCE(irqs_disabled()); + + if (!is_swiotlb_buffer(phys_addr)) + return false; + + /* + * DMA_TO_DEVICE to avoid memcpy in swiotlb_tbl_unmap_single. + * DMA_ATTR_SKIP_CPU_SYNC is optional. + */ + swiotlb_tbl_unmap_single(dev, phys_addr, size, DMA_TO_DEVICE, + DMA_ATTR_SKIP_CPU_SYNC); + return true; +} + +static void +swiotlb_full(struct device *dev, size_t size, enum dma_data_direction dir, + int do_panic) +{ + if (swiotlb_force == SWIOTLB_NO_FORCE) + return; + + /* + * Ran out of IOMMU space for this operation. This is very bad. + * Unfortunately the drivers cannot handle this operation properly. + * unless they check for dma_mapping_error (most don't) + * When the mapping is small enough return a static buffer to limit + * the damage, or panic when the transfer is too big. 
+ */ + dev_err_ratelimited(dev, "DMA: Out of SW-IOMMU space for %zu bytes\n", + size); + + if (size <= io_tlb_overflow || !do_panic) + return; + + if (dir == DMA_BIDIRECTIONAL) + panic("DMA: Random memory could be DMA accessed\n"); + if (dir == DMA_FROM_DEVICE) + panic("DMA: Random memory could be DMA written\n"); + if (dir == DMA_TO_DEVICE) + panic("DMA: Random memory could be DMA read\n"); +} + +/* + * Map a single buffer of the indicated size for DMA in streaming mode. The + * physical address to use is returned. + * + * Once the device is given the dma address, the device owns this memory until + * either swiotlb_unmap_page or swiotlb_dma_sync_single is performed. + */ +dma_addr_t swiotlb_map_page(struct device *dev, struct page *page, + unsigned long offset, size_t size, + enum dma_data_direction dir, + unsigned long attrs) +{ + phys_addr_t map, phys = page_to_phys(page) + offset; + dma_addr_t dev_addr = phys_to_dma(dev, phys); + + BUG_ON(dir == DMA_NONE); + /* + * If the address happens to be in the device's DMA window, + * we can safely return the device addr and not worry about bounce + * buffering it. + */ + if (dma_capable(dev, dev_addr, size) && swiotlb_force != SWIOTLB_FORCE) + return dev_addr; + + trace_swiotlb_bounced(dev, dev_addr, size, swiotlb_force); + + /* Oh well, have to allocate and map a bounce buffer. */ + map = map_single(dev, phys, size, dir, attrs); + if (map == SWIOTLB_MAP_ERROR) { + swiotlb_full(dev, size, dir, 1); + return __phys_to_dma(dev, io_tlb_overflow_buffer); + } + + dev_addr = __phys_to_dma(dev, map); + + /* Ensure that the address returned is DMA'ble */ + if (dma_capable(dev, dev_addr, size)) + return dev_addr; + + attrs |= DMA_ATTR_SKIP_CPU_SYNC; + swiotlb_tbl_unmap_single(dev, map, size, dir, attrs); + + return __phys_to_dma(dev, io_tlb_overflow_buffer); +} + +/* + * Unmap a single streaming mode DMA translation. The dma_addr and size must + * match what was provided for in a previous swiotlb_map_page call. 
All + * other usages are undefined. + * + * After this call, reads by the cpu to the buffer are guaranteed to see + * whatever the device wrote there. + */ +static void unmap_single(struct device *hwdev, dma_addr_t dev_addr, + size_t size, enum dma_data_direction dir, + unsigned long attrs) +{ + phys_addr_t paddr = dma_to_phys(hwdev, dev_addr); + + BUG_ON(dir == DMA_NONE); + + if (is_swiotlb_buffer(paddr)) { + swiotlb_tbl_unmap_single(hwdev, paddr, size, dir, attrs); + return; + } + + if (dir != DMA_FROM_DEVICE) + return; + + /* + * phys_to_virt doesn't work with hihgmem page but we could + * call dma_mark_clean() with hihgmem page here. However, we + * are fine since dma_mark_clean() is null on POWERPC. We can + * make dma_mark_clean() take a physical address if necessary. + */ + dma_mark_clean(phys_to_virt(paddr), size); +} + +void swiotlb_unmap_page(struct device *hwdev, dma_addr_t dev_addr, + size_t size, enum dma_data_direction dir, + unsigned long attrs) +{ + unmap_single(hwdev, dev_addr, size, dir, attrs); +} + +/* + * Make physical memory consistent for a single streaming mode DMA translation + * after a transfer. + * + * If you perform a swiotlb_map_page() but wish to interrogate the buffer + * using the cpu, yet do not wish to teardown the dma mapping, you must + * call this function before doing so. 
At the next point you give the dma + * address back to the card, you must first perform a + * swiotlb_dma_sync_for_device, and then the device again owns the buffer + */ +static void +swiotlb_sync_single(struct device *hwdev, dma_addr_t dev_addr, + size_t size, enum dma_data_direction dir, + enum dma_sync_target target) +{ + phys_addr_t paddr = dma_to_phys(hwdev, dev_addr); + + BUG_ON(dir == DMA_NONE); + + if (is_swiotlb_buffer(paddr)) { + swiotlb_tbl_sync_single(hwdev, paddr, size, dir, target); + return; + } + + if (dir != DMA_FROM_DEVICE) + return; + + dma_mark_clean(phys_to_virt(paddr), size); +} + +void +swiotlb_sync_single_for_cpu(struct device *hwdev, dma_addr_t dev_addr, + size_t size, enum dma_data_direction dir) +{ + swiotlb_sync_single(hwdev, dev_addr, size, dir, SYNC_FOR_CPU); +} + +void +swiotlb_sync_single_for_device(struct device *hwdev, dma_addr_t dev_addr, + size_t size, enum dma_data_direction dir) +{ + swiotlb_sync_single(hwdev, dev_addr, size, dir, SYNC_FOR_DEVICE); +} + +/* + * Map a set of buffers described by scatterlist in streaming mode for DMA. + * This is the scatter-gather version of the above swiotlb_map_page + * interface. Here the scatter gather list elements are each tagged with the + * appropriate dma address and length. They are obtained via + * sg_dma_{address,length}(SG). + * + * NOTE: An implementation may be able to use a smaller number of + * DMA address/length pairs than there are SG table elements. + * (for example via virtual mapping capabilities) + * The routine returns the number of addr/length pairs actually + * used, at most nents. + * + * Device ownership issues as mentioned above for swiotlb_map_page are the + * same here. 
+ */ +int +swiotlb_map_sg_attrs(struct device *hwdev, struct scatterlist *sgl, int nelems, + enum dma_data_direction dir, unsigned long attrs) +{ + struct scatterlist *sg; + int i; + + BUG_ON(dir == DMA_NONE); + + for_each_sg(sgl, sg, nelems, i) { + phys_addr_t paddr = sg_phys(sg); + dma_addr_t dev_addr = phys_to_dma(hwdev, paddr); + + if (swiotlb_force == SWIOTLB_FORCE || + !dma_capable(hwdev, dev_addr, sg->length)) { + phys_addr_t map = map_single(hwdev, sg_phys(sg), + sg->length, dir, attrs); + if (map == SWIOTLB_MAP_ERROR) { + /* Don't panic here, we expect map_sg users + to do proper error handling. */ + swiotlb_full(hwdev, sg->length, dir, 0); + attrs |= DMA_ATTR_SKIP_CPU_SYNC; + swiotlb_unmap_sg_attrs(hwdev, sgl, i, dir, + attrs); + sg_dma_len(sgl) = 0; + return 0; + } + sg->dma_address = __phys_to_dma(hwdev, map); + } else + sg->dma_address = dev_addr; + sg_dma_len(sg) = sg->length; + } + return nelems; +} + +/* + * Unmap a set of streaming mode DMA translations. Again, cpu read rules + * concerning calls here are the same as for swiotlb_unmap_page() above. + */ +void +swiotlb_unmap_sg_attrs(struct device *hwdev, struct scatterlist *sgl, + int nelems, enum dma_data_direction dir, + unsigned long attrs) +{ + struct scatterlist *sg; + int i; + + BUG_ON(dir == DMA_NONE); + + for_each_sg(sgl, sg, nelems, i) + unmap_single(hwdev, sg->dma_address, sg_dma_len(sg), dir, + attrs); +} + +/* + * Make physical memory consistent for a set of streaming mode DMA translations + * after a transfer. + * + * The same as swiotlb_sync_single_* but for a scatter-gather list, same rules + * and usage. 
+ */ +static void +swiotlb_sync_sg(struct device *hwdev, struct scatterlist *sgl, + int nelems, enum dma_data_direction dir, + enum dma_sync_target target) +{ + struct scatterlist *sg; + int i; + + for_each_sg(sgl, sg, nelems, i) + swiotlb_sync_single(hwdev, sg->dma_address, + sg_dma_len(sg), dir, target); +} + +void +swiotlb_sync_sg_for_cpu(struct device *hwdev, struct scatterlist *sg, + int nelems, enum dma_data_direction dir) +{ + swiotlb_sync_sg(hwdev, sg, nelems, dir, SYNC_FOR_CPU); +} + +void +swiotlb_sync_sg_for_device(struct device *hwdev, struct scatterlist *sg, + int nelems, enum dma_data_direction dir) +{ + swiotlb_sync_sg(hwdev, sg, nelems, dir, SYNC_FOR_DEVICE); +} + +int +swiotlb_dma_mapping_error(struct device *hwdev, dma_addr_t dma_addr) +{ + return (dma_addr == __phys_to_dma(hwdev, io_tlb_overflow_buffer)); +} + +/* + * Return whether the given device DMA address mask can be supported + * properly. For example, if your device can only drive the low 24-bits + * during bus mastering, then you would pass 0x00ffffff as the mask to + * this function. + */ +int +swiotlb_dma_supported(struct device *hwdev, u64 mask) +{ + return __phys_to_dma(hwdev, io_tlb_end - 1) <= mask; +} + +void *swiotlb_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle, + gfp_t gfp, unsigned long attrs) +{ + void *vaddr; + + /* temporary workaround: */ + if (gfp & __GFP_NOWARN) + attrs |= DMA_ATTR_NO_WARN; + + /* + * Don't print a warning when the first allocation attempt fails. + * swiotlb_alloc_coherent() will print a warning when the DMA memory + * allocation ultimately failed. 
+ */ + gfp |= __GFP_NOWARN; + + vaddr = dma_direct_alloc(dev, size, dma_handle, gfp, attrs); + if (!vaddr) + vaddr = swiotlb_alloc_buffer(dev, size, dma_handle, attrs); + return vaddr; +} + +void swiotlb_free(struct device *dev, size_t size, void *vaddr, + dma_addr_t dma_addr, unsigned long attrs) +{ + if (!swiotlb_free_buffer(dev, size, dma_addr)) + dma_direct_free(dev, size, vaddr, dma_addr, attrs); +} + +const struct dma_map_ops swiotlb_dma_ops = { + .mapping_error = swiotlb_dma_mapping_error, + .alloc = swiotlb_alloc, + .free = swiotlb_free, + .sync_single_for_cpu = swiotlb_sync_single_for_cpu, + .sync_single_for_device = swiotlb_sync_single_for_device, + .sync_sg_for_cpu = swiotlb_sync_sg_for_cpu, + .sync_sg_for_device = swiotlb_sync_sg_for_device, + .map_sg = swiotlb_map_sg_attrs, + .unmap_sg = swiotlb_unmap_sg_attrs, + .map_page = swiotlb_map_page, + .unmap_page = swiotlb_unmap_page, + .dma_supported = dma_direct_supported, +}; diff --git a/kernel/dma/virt.c b/kernel/dma/virt.c new file mode 100644 index 000000000000..631ddec4b60a --- /dev/null +++ b/kernel/dma/virt.c @@ -0,0 +1,59 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * DMA operations that map to virtual addresses without flushing memory. 
+ */ +#include +#include +#include +#include + +static void *dma_virt_alloc(struct device *dev, size_t size, + dma_addr_t *dma_handle, gfp_t gfp, + unsigned long attrs) +{ + void *ret; + + ret = (void *)__get_free_pages(gfp, get_order(size)); + if (ret) + *dma_handle = (uintptr_t)ret; + return ret; +} + +static void dma_virt_free(struct device *dev, size_t size, + void *cpu_addr, dma_addr_t dma_addr, + unsigned long attrs) +{ + free_pages((unsigned long)cpu_addr, get_order(size)); +} + +static dma_addr_t dma_virt_map_page(struct device *dev, struct page *page, + unsigned long offset, size_t size, + enum dma_data_direction dir, + unsigned long attrs) +{ + return (uintptr_t)(page_address(page) + offset); +} + +static int dma_virt_map_sg(struct device *dev, struct scatterlist *sgl, + int nents, enum dma_data_direction dir, + unsigned long attrs) +{ + int i; + struct scatterlist *sg; + + for_each_sg(sgl, sg, nents, i) { + BUG_ON(!sg_page(sg)); + sg_dma_address(sg) = (uintptr_t)sg_virt(sg); + sg_dma_len(sg) = sg->length; + } + + return nents; +} + +const struct dma_map_ops dma_virt_ops = { + .alloc = dma_virt_alloc, + .free = dma_virt_free, + .map_page = dma_virt_map_page, + .map_sg = dma_virt_map_sg, +}; +EXPORT_SYMBOL(dma_virt_ops); diff --git a/lib/Kconfig b/lib/Kconfig index 809fdd155739..803fcbced729 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -420,60 +420,15 @@ config HAS_IOPORT_MAP depends on HAS_IOMEM && !NO_IOPORT_MAP default y -config HAS_DMA - bool - depends on !NO_DMA - default y +source "kernel/dma/Kconfig" config SGL_ALLOC bool default n -config NEED_SG_DMA_LENGTH - bool - -config NEED_DMA_MAP_STATE - bool - -config ARCH_DMA_ADDR_T_64BIT - def_bool 64BIT || PHYS_ADDR_T_64BIT - config IOMMU_HELPER bool -config ARCH_HAS_SYNC_DMA_FOR_DEVICE - bool - -config ARCH_HAS_SYNC_DMA_FOR_CPU - bool - select NEED_DMA_MAP_STATE - -config DMA_DIRECT_OPS - bool - depends on HAS_DMA - -config DMA_NONCOHERENT_OPS - bool - depends on HAS_DMA - select DMA_DIRECT_OPS - 
-config DMA_NONCOHERENT_MMAP - bool - depends on DMA_NONCOHERENT_OPS - -config DMA_NONCOHERENT_CACHE_SYNC - bool - depends on DMA_NONCOHERENT_OPS - -config DMA_VIRT_OPS - bool - depends on HAS_DMA - -config SWIOTLB - bool - select DMA_DIRECT_OPS - select NEED_DMA_MAP_STATE - config CHECK_SIGNATURE bool diff --git a/lib/Makefile b/lib/Makefile index 5e0e160c9242..8153fdab287f 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -29,9 +29,6 @@ lib-y := ctype.o string.o vsprintf.o cmdline.o \ lib-$(CONFIG_PRINTK) += dump_stack.o lib-$(CONFIG_MMU) += ioremap.o lib-$(CONFIG_SMP) += cpumask.o -obj-$(CONFIG_DMA_DIRECT_OPS) += dma-direct.o -obj-$(CONFIG_DMA_NONCOHERENT_OPS) += dma-noncoherent.o -obj-$(CONFIG_DMA_VIRT_OPS) += dma-virt.o lib-y += kobject.o klist.o obj-y += lockref.o @@ -148,7 +145,6 @@ obj-$(CONFIG_SMP) += percpu_counter.o obj-$(CONFIG_AUDIT_GENERIC) += audit.o obj-$(CONFIG_AUDIT_COMPAT_GENERIC) += compat_audit.o -obj-$(CONFIG_SWIOTLB) += swiotlb.o obj-$(CONFIG_IOMMU_HELPER) += iommu-helper.o obj-$(CONFIG_FAULT_INJECTION) += fault-inject.o obj-$(CONFIG_NOTIFIER_ERROR_INJECTION) += notifier-error-inject.o @@ -169,8 +165,6 @@ obj-$(CONFIG_NLATTR) += nlattr.o obj-$(CONFIG_LRU_CACHE) += lru_cache.o -obj-$(CONFIG_DMA_API_DEBUG) += dma-debug.o - obj-$(CONFIG_GENERIC_CSUM) += checksum.o obj-$(CONFIG_GENERIC_ATOMIC64) += atomic64.o diff --git a/lib/dma-debug.c b/lib/dma-debug.c deleted file mode 100644 index c007d25bee09..000000000000 --- a/lib/dma-debug.c +++ /dev/null @@ -1,1773 +0,0 @@ -/* - * Copyright (C) 2008 Advanced Micro Devices, Inc. - * - * Author: Joerg Roedel - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License version 2 as published - * by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#define HASH_SIZE 1024ULL -#define HASH_FN_SHIFT 13 -#define HASH_FN_MASK (HASH_SIZE - 1) - -/* allow architectures to override this if absolutely required */ -#ifndef PREALLOC_DMA_DEBUG_ENTRIES -#define PREALLOC_DMA_DEBUG_ENTRIES (1 << 16) -#endif - -enum { - dma_debug_single, - dma_debug_page, - dma_debug_sg, - dma_debug_coherent, - dma_debug_resource, -}; - -enum map_err_types { - MAP_ERR_CHECK_NOT_APPLICABLE, - MAP_ERR_NOT_CHECKED, - MAP_ERR_CHECKED, -}; - -#define DMA_DEBUG_STACKTRACE_ENTRIES 5 - -/** - * struct dma_debug_entry - track a dma_map* or dma_alloc_coherent mapping - * @list: node on pre-allocated free_entries list - * @dev: 'dev' argument to dma_map_{page|single|sg} or dma_alloc_coherent - * @type: single, page, sg, coherent - * @pfn: page frame of the start address - * @offset: offset of mapping relative to pfn - * @size: length of the mapping - * @direction: enum dma_data_direction - * @sg_call_ents: 'nents' from dma_map_sg - * @sg_mapped_ents: 'mapped_ents' from dma_map_sg - * @map_err_type: track whether dma_mapping_error() was checked - * @stacktrace: support backtraces when a violation is detected - */ -struct dma_debug_entry { - struct list_head list; - struct device *dev; - int type; - unsigned long pfn; - size_t offset; - u64 dev_addr; - u64 size; - int direction; - int sg_call_ents; - int sg_mapped_ents; - enum map_err_types map_err_type; -#ifdef CONFIG_STACKTRACE - struct stack_trace stacktrace; - unsigned long st_entries[DMA_DEBUG_STACKTRACE_ENTRIES]; -#endif -}; - -typedef bool 
(*match_fn)(struct dma_debug_entry *, struct dma_debug_entry *); - -struct hash_bucket { - struct list_head list; - spinlock_t lock; -} ____cacheline_aligned_in_smp; - -/* Hash list to save the allocated dma addresses */ -static struct hash_bucket dma_entry_hash[HASH_SIZE]; -/* List of pre-allocated dma_debug_entry's */ -static LIST_HEAD(free_entries); -/* Lock for the list above */ -static DEFINE_SPINLOCK(free_entries_lock); - -/* Global disable flag - will be set in case of an error */ -static bool global_disable __read_mostly; - -/* Early initialization disable flag, set at the end of dma_debug_init */ -static bool dma_debug_initialized __read_mostly; - -static inline bool dma_debug_disabled(void) -{ - return global_disable || !dma_debug_initialized; -} - -/* Global error count */ -static u32 error_count; - -/* Global error show enable*/ -static u32 show_all_errors __read_mostly; -/* Number of errors to show */ -static u32 show_num_errors = 1; - -static u32 num_free_entries; -static u32 min_free_entries; -static u32 nr_total_entries; - -/* number of preallocated entries requested by kernel cmdline */ -static u32 nr_prealloc_entries = PREALLOC_DMA_DEBUG_ENTRIES; - -/* debugfs dentry's for the stuff above */ -static struct dentry *dma_debug_dent __read_mostly; -static struct dentry *global_disable_dent __read_mostly; -static struct dentry *error_count_dent __read_mostly; -static struct dentry *show_all_errors_dent __read_mostly; -static struct dentry *show_num_errors_dent __read_mostly; -static struct dentry *num_free_entries_dent __read_mostly; -static struct dentry *min_free_entries_dent __read_mostly; -static struct dentry *filter_dent __read_mostly; - -/* per-driver filter related state */ - -#define NAME_MAX_LEN 64 - -static char current_driver_name[NAME_MAX_LEN] __read_mostly; -static struct device_driver *current_driver __read_mostly; - -static DEFINE_RWLOCK(driver_name_lock); - -static const char *const maperr2str[] = { - [MAP_ERR_CHECK_NOT_APPLICABLE] = 
"dma map error check not applicable", - [MAP_ERR_NOT_CHECKED] = "dma map error not checked", - [MAP_ERR_CHECKED] = "dma map error checked", -}; - -static const char *type2name[5] = { "single", "page", - "scather-gather", "coherent", - "resource" }; - -static const char *dir2name[4] = { "DMA_BIDIRECTIONAL", "DMA_TO_DEVICE", - "DMA_FROM_DEVICE", "DMA_NONE" }; - -/* - * The access to some variables in this macro is racy. We can't use atomic_t - * here because all these variables are exported to debugfs. Some of them even - * writeable. This is also the reason why a lock won't help much. But anyway, - * the races are no big deal. Here is why: - * - * error_count: the addition is racy, but the worst thing that can happen is - * that we don't count some errors - * show_num_errors: the subtraction is racy. Also no big deal because in - * worst case this will result in one warning more in the - * system log than the user configured. This variable is - * writeable via debugfs. - */ -static inline void dump_entry_trace(struct dma_debug_entry *entry) -{ -#ifdef CONFIG_STACKTRACE - if (entry) { - pr_warning("Mapped at:\n"); - print_stack_trace(&entry->stacktrace, 0); - } -#endif -} - -static bool driver_filter(struct device *dev) -{ - struct device_driver *drv; - unsigned long flags; - bool ret; - - /* driver filter off */ - if (likely(!current_driver_name[0])) - return true; - - /* driver filter on and initialized */ - if (current_driver && dev && dev->driver == current_driver) - return true; - - /* driver filter on, but we can't filter on a NULL device... 
*/ - if (!dev) - return false; - - if (current_driver || !current_driver_name[0]) - return false; - - /* driver filter on but not yet initialized */ - drv = dev->driver; - if (!drv) - return false; - - /* lock to protect against change of current_driver_name */ - read_lock_irqsave(&driver_name_lock, flags); - - ret = false; - if (drv->name && - strncmp(current_driver_name, drv->name, NAME_MAX_LEN - 1) == 0) { - current_driver = drv; - ret = true; - } - - read_unlock_irqrestore(&driver_name_lock, flags); - - return ret; -} - -#define err_printk(dev, entry, format, arg...) do { \ - error_count += 1; \ - if (driver_filter(dev) && \ - (show_all_errors || show_num_errors > 0)) { \ - WARN(1, "%s %s: " format, \ - dev ? dev_driver_string(dev) : "NULL", \ - dev ? dev_name(dev) : "NULL", ## arg); \ - dump_entry_trace(entry); \ - } \ - if (!show_all_errors && show_num_errors > 0) \ - show_num_errors -= 1; \ - } while (0); - -/* - * Hash related functions - * - * Every DMA-API request is saved into a struct dma_debug_entry. To - * have quick access to these structs they are stored into a hash. - */ -static int hash_fn(struct dma_debug_entry *entry) -{ - /* - * Hash function is based on the dma address. - * We use bits 20-27 here as the index into the hash - */ - return (entry->dev_addr >> HASH_FN_SHIFT) & HASH_FN_MASK; -} - -/* - * Request exclusive access to a hash bucket for a given dma_debug_entry. 
- */ -static struct hash_bucket *get_hash_bucket(struct dma_debug_entry *entry, - unsigned long *flags) - __acquires(&dma_entry_hash[idx].lock) -{ - int idx = hash_fn(entry); - unsigned long __flags; - - spin_lock_irqsave(&dma_entry_hash[idx].lock, __flags); - *flags = __flags; - return &dma_entry_hash[idx]; -} - -/* - * Give up exclusive access to the hash bucket - */ -static void put_hash_bucket(struct hash_bucket *bucket, - unsigned long *flags) - __releases(&bucket->lock) -{ - unsigned long __flags = *flags; - - spin_unlock_irqrestore(&bucket->lock, __flags); -} - -static bool exact_match(struct dma_debug_entry *a, struct dma_debug_entry *b) -{ - return ((a->dev_addr == b->dev_addr) && - (a->dev == b->dev)) ? true : false; -} - -static bool containing_match(struct dma_debug_entry *a, - struct dma_debug_entry *b) -{ - if (a->dev != b->dev) - return false; - - if ((b->dev_addr <= a->dev_addr) && - ((b->dev_addr + b->size) >= (a->dev_addr + a->size))) - return true; - - return false; -} - -/* - * Search a given entry in the hash bucket list - */ -static struct dma_debug_entry *__hash_bucket_find(struct hash_bucket *bucket, - struct dma_debug_entry *ref, - match_fn match) -{ - struct dma_debug_entry *entry, *ret = NULL; - int matches = 0, match_lvl, last_lvl = -1; - - list_for_each_entry(entry, &bucket->list, list) { - if (!match(ref, entry)) - continue; - - /* - * Some drivers map the same physical address multiple - * times. Without a hardware IOMMU this results in the - * same device addresses being put into the dma-debug - * hash multiple times too. This can result in false - * positives being reported. Therefore we implement a - * best-fit algorithm here which returns the entry from - * the hash which fits best to the reference value - * instead of the first-fit. - */ - matches += 1; - match_lvl = 0; - entry->size == ref->size ? ++match_lvl : 0; - entry->type == ref->type ? ++match_lvl : 0; - entry->direction == ref->direction ? 
++match_lvl : 0; - entry->sg_call_ents == ref->sg_call_ents ? ++match_lvl : 0; - - if (match_lvl == 4) { - /* perfect-fit - return the result */ - return entry; - } else if (match_lvl > last_lvl) { - /* - * We found an entry that fits better then the - * previous one or it is the 1st match. - */ - last_lvl = match_lvl; - ret = entry; - } - } - - /* - * If we have multiple matches but no perfect-fit, just return - * NULL. - */ - ret = (matches == 1) ? ret : NULL; - - return ret; -} - -static struct dma_debug_entry *bucket_find_exact(struct hash_bucket *bucket, - struct dma_debug_entry *ref) -{ - return __hash_bucket_find(bucket, ref, exact_match); -} - -static struct dma_debug_entry *bucket_find_contain(struct hash_bucket **bucket, - struct dma_debug_entry *ref, - unsigned long *flags) -{ - - unsigned int max_range = dma_get_max_seg_size(ref->dev); - struct dma_debug_entry *entry, index = *ref; - unsigned int range = 0; - - while (range <= max_range) { - entry = __hash_bucket_find(*bucket, ref, containing_match); - - if (entry) - return entry; - - /* - * Nothing found, go back a hash bucket - */ - put_hash_bucket(*bucket, flags); - range += (1 << HASH_FN_SHIFT); - index.dev_addr -= (1 << HASH_FN_SHIFT); - *bucket = get_hash_bucket(&index, flags); - } - - return NULL; -} - -/* - * Add an entry to a hash bucket - */ -static void hash_bucket_add(struct hash_bucket *bucket, - struct dma_debug_entry *entry) -{ - list_add_tail(&entry->list, &bucket->list); -} - -/* - * Remove entry from a hash bucket list - */ -static void hash_bucket_del(struct dma_debug_entry *entry) -{ - list_del(&entry->list); -} - -static unsigned long long phys_addr(struct dma_debug_entry *entry) -{ - if (entry->type == dma_debug_resource) - return __pfn_to_phys(entry->pfn) + entry->offset; - - return page_to_phys(pfn_to_page(entry->pfn)) + entry->offset; -} - -/* - * Dump mapping entries for debugging purposes - */ -void debug_dma_dump_mappings(struct device *dev) -{ - int idx; - - for (idx = 0; 
idx < HASH_SIZE; idx++) { - struct hash_bucket *bucket = &dma_entry_hash[idx]; - struct dma_debug_entry *entry; - unsigned long flags; - - spin_lock_irqsave(&bucket->lock, flags); - - list_for_each_entry(entry, &bucket->list, list) { - if (!dev || dev == entry->dev) { - dev_info(entry->dev, - "%s idx %d P=%Lx N=%lx D=%Lx L=%Lx %s %s\n", - type2name[entry->type], idx, - phys_addr(entry), entry->pfn, - entry->dev_addr, entry->size, - dir2name[entry->direction], - maperr2str[entry->map_err_type]); - } - } - - spin_unlock_irqrestore(&bucket->lock, flags); - } -} - -/* - * For each mapping (initial cacheline in the case of - * dma_alloc_coherent/dma_map_page, initial cacheline in each page of a - * scatterlist, or the cacheline specified in dma_map_single) insert - * into this tree using the cacheline as the key. At - * dma_unmap_{single|sg|page} or dma_free_coherent delete the entry. If - * the entry already exists at insertion time add a tag as a reference - * count for the overlapping mappings. For now, the overlap tracking - * just ensures that 'unmaps' balance 'maps' before marking the - * cacheline idle, but we should also be flagging overlaps as an API - * violation. - * - * Memory usage is mostly constrained by the maximum number of available - * dma-debug entries in that we need a free dma_debug_entry before - * inserting into the tree. In the case of dma_map_page and - * dma_alloc_coherent there is only one dma_debug_entry and one - * dma_active_cacheline entry to track per event. dma_map_sg(), on the - * other hand, consumes a single dma_debug_entry, but inserts 'nents' - * entries into the tree. - * - * At any time debug_dma_assert_idle() can be called to trigger a - * warning if any cachelines in the given page are in the active set. 
- */ -static RADIX_TREE(dma_active_cacheline, GFP_NOWAIT); -static DEFINE_SPINLOCK(radix_lock); -#define ACTIVE_CACHELINE_MAX_OVERLAP ((1 << RADIX_TREE_MAX_TAGS) - 1) -#define CACHELINE_PER_PAGE_SHIFT (PAGE_SHIFT - L1_CACHE_SHIFT) -#define CACHELINES_PER_PAGE (1 << CACHELINE_PER_PAGE_SHIFT) - -static phys_addr_t to_cacheline_number(struct dma_debug_entry *entry) -{ - return (entry->pfn << CACHELINE_PER_PAGE_SHIFT) + - (entry->offset >> L1_CACHE_SHIFT); -} - -static int active_cacheline_read_overlap(phys_addr_t cln) -{ - int overlap = 0, i; - - for (i = RADIX_TREE_MAX_TAGS - 1; i >= 0; i--) - if (radix_tree_tag_get(&dma_active_cacheline, cln, i)) - overlap |= 1 << i; - return overlap; -} - -static int active_cacheline_set_overlap(phys_addr_t cln, int overlap) -{ - int i; - - if (overlap > ACTIVE_CACHELINE_MAX_OVERLAP || overlap < 0) - return overlap; - - for (i = RADIX_TREE_MAX_TAGS - 1; i >= 0; i--) - if (overlap & 1 << i) - radix_tree_tag_set(&dma_active_cacheline, cln, i); - else - radix_tree_tag_clear(&dma_active_cacheline, cln, i); - - return overlap; -} - -static void active_cacheline_inc_overlap(phys_addr_t cln) -{ - int overlap = active_cacheline_read_overlap(cln); - - overlap = active_cacheline_set_overlap(cln, ++overlap); - - /* If we overflowed the overlap counter then we're potentially - * leaking dma-mappings. Otherwise, if maps and unmaps are - * balanced then this overflow may cause false negatives in - * debug_dma_assert_idle() as the cacheline may be marked idle - * prematurely. 
- */ - WARN_ONCE(overlap > ACTIVE_CACHELINE_MAX_OVERLAP, - "DMA-API: exceeded %d overlapping mappings of cacheline %pa\n", - ACTIVE_CACHELINE_MAX_OVERLAP, &cln); -} - -static int active_cacheline_dec_overlap(phys_addr_t cln) -{ - int overlap = active_cacheline_read_overlap(cln); - - return active_cacheline_set_overlap(cln, --overlap); -} - -static int active_cacheline_insert(struct dma_debug_entry *entry) -{ - phys_addr_t cln = to_cacheline_number(entry); - unsigned long flags; - int rc; - - /* If the device is not writing memory then we don't have any - * concerns about the cpu consuming stale data. This mitigates - * legitimate usages of overlapping mappings. - */ - if (entry->direction == DMA_TO_DEVICE) - return 0; - - spin_lock_irqsave(&radix_lock, flags); - rc = radix_tree_insert(&dma_active_cacheline, cln, entry); - if (rc == -EEXIST) - active_cacheline_inc_overlap(cln); - spin_unlock_irqrestore(&radix_lock, flags); - - return rc; -} - -static void active_cacheline_remove(struct dma_debug_entry *entry) -{ - phys_addr_t cln = to_cacheline_number(entry); - unsigned long flags; - - /* ...mirror the insert case */ - if (entry->direction == DMA_TO_DEVICE) - return; - - spin_lock_irqsave(&radix_lock, flags); - /* since we are counting overlaps the final put of the - * cacheline will occur when the overlap count is 0. - * active_cacheline_dec_overlap() returns -1 in that case - */ - if (active_cacheline_dec_overlap(cln) < 0) - radix_tree_delete(&dma_active_cacheline, cln); - spin_unlock_irqrestore(&radix_lock, flags); -} - -/** - * debug_dma_assert_idle() - assert that a page is not undergoing dma - * @page: page to lookup in the dma_active_cacheline tree - * - * Place a call to this routine in cases where the cpu touching the page - * before the dma completes (page is dma_unmapped) will lead to data - * corruption. 
- */ -void debug_dma_assert_idle(struct page *page) -{ - static struct dma_debug_entry *ents[CACHELINES_PER_PAGE]; - struct dma_debug_entry *entry = NULL; - void **results = (void **) &ents; - unsigned int nents, i; - unsigned long flags; - phys_addr_t cln; - - if (dma_debug_disabled()) - return; - - if (!page) - return; - - cln = (phys_addr_t) page_to_pfn(page) << CACHELINE_PER_PAGE_SHIFT; - spin_lock_irqsave(&radix_lock, flags); - nents = radix_tree_gang_lookup(&dma_active_cacheline, results, cln, - CACHELINES_PER_PAGE); - for (i = 0; i < nents; i++) { - phys_addr_t ent_cln = to_cacheline_number(ents[i]); - - if (ent_cln == cln) { - entry = ents[i]; - break; - } else if (ent_cln >= cln + CACHELINES_PER_PAGE) - break; - } - spin_unlock_irqrestore(&radix_lock, flags); - - if (!entry) - return; - - cln = to_cacheline_number(entry); - err_printk(entry->dev, entry, - "DMA-API: cpu touching an active dma mapped cacheline [cln=%pa]\n", - &cln); -} - -/* - * Wrapper function for adding an entry to the hash. - * This function takes care of locking itself. 
- */ -static void add_dma_entry(struct dma_debug_entry *entry) -{ - struct hash_bucket *bucket; - unsigned long flags; - int rc; - - bucket = get_hash_bucket(entry, &flags); - hash_bucket_add(bucket, entry); - put_hash_bucket(bucket, &flags); - - rc = active_cacheline_insert(entry); - if (rc == -ENOMEM) { - pr_err("DMA-API: cacheline tracking ENOMEM, dma-debug disabled\n"); - global_disable = true; - } - - /* TODO: report -EEXIST errors here as overlapping mappings are - * not supported by the DMA API - */ -} - -static struct dma_debug_entry *__dma_entry_alloc(void) -{ - struct dma_debug_entry *entry; - - entry = list_entry(free_entries.next, struct dma_debug_entry, list); - list_del(&entry->list); - memset(entry, 0, sizeof(*entry)); - - num_free_entries -= 1; - if (num_free_entries < min_free_entries) - min_free_entries = num_free_entries; - - return entry; -} - -/* struct dma_entry allocator - * - * The next two functions implement the allocator for - * struct dma_debug_entries. - */ -static struct dma_debug_entry *dma_entry_alloc(void) -{ - struct dma_debug_entry *entry; - unsigned long flags; - - spin_lock_irqsave(&free_entries_lock, flags); - - if (list_empty(&free_entries)) { - global_disable = true; - spin_unlock_irqrestore(&free_entries_lock, flags); - pr_err("DMA-API: debugging out of memory - disabling\n"); - return NULL; - } - - entry = __dma_entry_alloc(); - - spin_unlock_irqrestore(&free_entries_lock, flags); - -#ifdef CONFIG_STACKTRACE - entry->stacktrace.max_entries = DMA_DEBUG_STACKTRACE_ENTRIES; - entry->stacktrace.entries = entry->st_entries; - entry->stacktrace.skip = 2; - save_stack_trace(&entry->stacktrace); -#endif - - return entry; -} - -static void dma_entry_free(struct dma_debug_entry *entry) -{ - unsigned long flags; - - active_cacheline_remove(entry); - - /* - * add to beginning of the list - this way the entries are - * more likely cache hot when they are reallocated. 
- */ - spin_lock_irqsave(&free_entries_lock, flags); - list_add(&entry->list, &free_entries); - num_free_entries += 1; - spin_unlock_irqrestore(&free_entries_lock, flags); -} - -int dma_debug_resize_entries(u32 num_entries) -{ - int i, delta, ret = 0; - unsigned long flags; - struct dma_debug_entry *entry; - LIST_HEAD(tmp); - - spin_lock_irqsave(&free_entries_lock, flags); - - if (nr_total_entries < num_entries) { - delta = num_entries - nr_total_entries; - - spin_unlock_irqrestore(&free_entries_lock, flags); - - for (i = 0; i < delta; i++) { - entry = kzalloc(sizeof(*entry), GFP_KERNEL); - if (!entry) - break; - - list_add_tail(&entry->list, &tmp); - } - - spin_lock_irqsave(&free_entries_lock, flags); - - list_splice(&tmp, &free_entries); - nr_total_entries += i; - num_free_entries += i; - } else { - delta = nr_total_entries - num_entries; - - for (i = 0; i < delta && !list_empty(&free_entries); i++) { - entry = __dma_entry_alloc(); - kfree(entry); - } - - nr_total_entries -= i; - } - - if (nr_total_entries != num_entries) - ret = 1; - - spin_unlock_irqrestore(&free_entries_lock, flags); - - return ret; -} - -/* - * DMA-API debugging init code - * - * The init code does two things: - * 1. Initialize core data structures - * 2. 
Preallocate a given number of dma_debug_entry structs - */ - -static int prealloc_memory(u32 num_entries) -{ - struct dma_debug_entry *entry, *next_entry; - int i; - - for (i = 0; i < num_entries; ++i) { - entry = kzalloc(sizeof(*entry), GFP_KERNEL); - if (!entry) - goto out_err; - - list_add_tail(&entry->list, &free_entries); - } - - num_free_entries = num_entries; - min_free_entries = num_entries; - - pr_info("DMA-API: preallocated %d debug entries\n", num_entries); - - return 0; - -out_err: - - list_for_each_entry_safe(entry, next_entry, &free_entries, list) { - list_del(&entry->list); - kfree(entry); - } - - return -ENOMEM; -} - -static ssize_t filter_read(struct file *file, char __user *user_buf, - size_t count, loff_t *ppos) -{ - char buf[NAME_MAX_LEN + 1]; - unsigned long flags; - int len; - - if (!current_driver_name[0]) - return 0; - - /* - * We can't copy to userspace directly because current_driver_name can - * only be read under the driver_name_lock with irqs disabled. So - * create a temporary copy first. - */ - read_lock_irqsave(&driver_name_lock, flags); - len = scnprintf(buf, NAME_MAX_LEN + 1, "%s\n", current_driver_name); - read_unlock_irqrestore(&driver_name_lock, flags); - - return simple_read_from_buffer(user_buf, count, ppos, buf, len); -} - -static ssize_t filter_write(struct file *file, const char __user *userbuf, - size_t count, loff_t *ppos) -{ - char buf[NAME_MAX_LEN]; - unsigned long flags; - size_t len; - int i; - - /* - * We can't copy from userspace directly. Access to - * current_driver_name is protected with a write_lock with irqs - * disabled. Since copy_from_user can fault and may sleep we - * need to copy to temporary buffer first - */ - len = min(count, (size_t)(NAME_MAX_LEN - 1)); - if (copy_from_user(buf, userbuf, len)) - return -EFAULT; - - buf[len] = 0; - - write_lock_irqsave(&driver_name_lock, flags); - - /* - * Now handle the string we got from userspace very carefully. 
- * The rules are: - * - only use the first token we got - * - token delimiter is everything looking like a space - * character (' ', '\n', '\t' ...) - * - */ - if (!isalnum(buf[0])) { - /* - * If the first character userspace gave us is not - * alphanumerical then assume the filter should be - * switched off. - */ - if (current_driver_name[0]) - pr_info("DMA-API: switching off dma-debug driver filter\n"); - current_driver_name[0] = 0; - current_driver = NULL; - goto out_unlock; - } - - /* - * Now parse out the first token and use it as the name for the - * driver to filter for. - */ - for (i = 0; i < NAME_MAX_LEN - 1; ++i) { - current_driver_name[i] = buf[i]; - if (isspace(buf[i]) || buf[i] == ' ' || buf[i] == 0) - break; - } - current_driver_name[i] = 0; - current_driver = NULL; - - pr_info("DMA-API: enable driver filter for driver [%s]\n", - current_driver_name); - -out_unlock: - write_unlock_irqrestore(&driver_name_lock, flags); - - return count; -} - -static const struct file_operations filter_fops = { - .read = filter_read, - .write = filter_write, - .llseek = default_llseek, -}; - -static int dma_debug_fs_init(void) -{ - dma_debug_dent = debugfs_create_dir("dma-api", NULL); - if (!dma_debug_dent) { - pr_err("DMA-API: can not create debugfs directory\n"); - return -ENOMEM; - } - - global_disable_dent = debugfs_create_bool("disabled", 0444, - dma_debug_dent, - &global_disable); - if (!global_disable_dent) - goto out_err; - - error_count_dent = debugfs_create_u32("error_count", 0444, - dma_debug_dent, &error_count); - if (!error_count_dent) - goto out_err; - - show_all_errors_dent = debugfs_create_u32("all_errors", 0644, - dma_debug_dent, - &show_all_errors); - if (!show_all_errors_dent) - goto out_err; - - show_num_errors_dent = debugfs_create_u32("num_errors", 0644, - dma_debug_dent, - &show_num_errors); - if (!show_num_errors_dent) - goto out_err; - - num_free_entries_dent = debugfs_create_u32("num_free_entries", 0444, - dma_debug_dent, - &num_free_entries); 
- if (!num_free_entries_dent) - goto out_err; - - min_free_entries_dent = debugfs_create_u32("min_free_entries", 0444, - dma_debug_dent, - &min_free_entries); - if (!min_free_entries_dent) - goto out_err; - - filter_dent = debugfs_create_file("driver_filter", 0644, - dma_debug_dent, NULL, &filter_fops); - if (!filter_dent) - goto out_err; - - return 0; - -out_err: - debugfs_remove_recursive(dma_debug_dent); - - return -ENOMEM; -} - -static int device_dma_allocations(struct device *dev, struct dma_debug_entry **out_entry) -{ - struct dma_debug_entry *entry; - unsigned long flags; - int count = 0, i; - - for (i = 0; i < HASH_SIZE; ++i) { - spin_lock_irqsave(&dma_entry_hash[i].lock, flags); - list_for_each_entry(entry, &dma_entry_hash[i].list, list) { - if (entry->dev == dev) { - count += 1; - *out_entry = entry; - } - } - spin_unlock_irqrestore(&dma_entry_hash[i].lock, flags); - } - - return count; -} - -static int dma_debug_device_change(struct notifier_block *nb, unsigned long action, void *data) -{ - struct device *dev = data; - struct dma_debug_entry *uninitialized_var(entry); - int count; - - if (dma_debug_disabled()) - return 0; - - switch (action) { - case BUS_NOTIFY_UNBOUND_DRIVER: - count = device_dma_allocations(dev, &entry); - if (count == 0) - break; - err_printk(dev, entry, "DMA-API: device driver has pending " - "DMA allocations while released from device " - "[count=%d]\n" - "One of leaked entries details: " - "[device address=0x%016llx] [size=%llu bytes] " - "[mapped with %s] [mapped as %s]\n", - count, entry->dev_addr, entry->size, - dir2name[entry->direction], type2name[entry->type]); - break; - default: - break; - } - - return 0; -} - -void dma_debug_add_bus(struct bus_type *bus) -{ - struct notifier_block *nb; - - if (dma_debug_disabled()) - return; - - nb = kzalloc(sizeof(struct notifier_block), GFP_KERNEL); - if (nb == NULL) { - pr_err("dma_debug_add_bus: out of memory\n"); - return; - } - - nb->notifier_call = dma_debug_device_change; - - 
bus_register_notifier(bus, nb); -} - -static int dma_debug_init(void) -{ - int i; - - /* Do not use dma_debug_initialized here, since we really want to be - * called to set dma_debug_initialized - */ - if (global_disable) - return 0; - - for (i = 0; i < HASH_SIZE; ++i) { - INIT_LIST_HEAD(&dma_entry_hash[i].list); - spin_lock_init(&dma_entry_hash[i].lock); - } - - if (dma_debug_fs_init() != 0) { - pr_err("DMA-API: error creating debugfs entries - disabling\n"); - global_disable = true; - - return 0; - } - - if (prealloc_memory(nr_prealloc_entries) != 0) { - pr_err("DMA-API: debugging out of memory error - disabled\n"); - global_disable = true; - - return 0; - } - - nr_total_entries = num_free_entries; - - dma_debug_initialized = true; - - pr_info("DMA-API: debugging enabled by kernel config\n"); - return 0; -} -core_initcall(dma_debug_init); - -static __init int dma_debug_cmdline(char *str) -{ - if (!str) - return -EINVAL; - - if (strncmp(str, "off", 3) == 0) { - pr_info("DMA-API: debugging disabled on kernel command line\n"); - global_disable = true; - } - - return 0; -} - -static __init int dma_debug_entries_cmdline(char *str) -{ - if (!str) - return -EINVAL; - if (!get_option(&str, &nr_prealloc_entries)) - nr_prealloc_entries = PREALLOC_DMA_DEBUG_ENTRIES; - return 0; -} - -__setup("dma_debug=", dma_debug_cmdline); -__setup("dma_debug_entries=", dma_debug_entries_cmdline); - -static void check_unmap(struct dma_debug_entry *ref) -{ - struct dma_debug_entry *entry; - struct hash_bucket *bucket; - unsigned long flags; - - bucket = get_hash_bucket(ref, &flags); - entry = bucket_find_exact(bucket, ref); - - if (!entry) { - /* must drop lock before calling dma_mapping_error */ - put_hash_bucket(bucket, &flags); - - if (dma_mapping_error(ref->dev, ref->dev_addr)) { - err_printk(ref->dev, NULL, - "DMA-API: device driver tries to free an " - "invalid DMA memory address\n"); - } else { - err_printk(ref->dev, NULL, - "DMA-API: device driver tries to free DMA " - "memory it 
has not allocated [device " - "address=0x%016llx] [size=%llu bytes]\n", - ref->dev_addr, ref->size); - } - return; - } - - if (ref->size != entry->size) { - err_printk(ref->dev, entry, "DMA-API: device driver frees " - "DMA memory with different size " - "[device address=0x%016llx] [map size=%llu bytes] " - "[unmap size=%llu bytes]\n", - ref->dev_addr, entry->size, ref->size); - } - - if (ref->type != entry->type) { - err_printk(ref->dev, entry, "DMA-API: device driver frees " - "DMA memory with wrong function " - "[device address=0x%016llx] [size=%llu bytes] " - "[mapped as %s] [unmapped as %s]\n", - ref->dev_addr, ref->size, - type2name[entry->type], type2name[ref->type]); - } else if ((entry->type == dma_debug_coherent) && - (phys_addr(ref) != phys_addr(entry))) { - err_printk(ref->dev, entry, "DMA-API: device driver frees " - "DMA memory with different CPU address " - "[device address=0x%016llx] [size=%llu bytes] " - "[cpu alloc address=0x%016llx] " - "[cpu free address=0x%016llx]", - ref->dev_addr, ref->size, - phys_addr(entry), - phys_addr(ref)); - } - - if (ref->sg_call_ents && ref->type == dma_debug_sg && - ref->sg_call_ents != entry->sg_call_ents) { - err_printk(ref->dev, entry, "DMA-API: device driver frees " - "DMA sg list with different entry count " - "[map count=%d] [unmap count=%d]\n", - entry->sg_call_ents, ref->sg_call_ents); - } - - /* - * This may be no bug in reality - but most implementations of the - * DMA API don't handle this properly, so check for it here - */ - if (ref->direction != entry->direction) { - err_printk(ref->dev, entry, "DMA-API: device driver frees " - "DMA memory with different direction " - "[device address=0x%016llx] [size=%llu bytes] " - "[mapped with %s] [unmapped with %s]\n", - ref->dev_addr, ref->size, - dir2name[entry->direction], - dir2name[ref->direction]); - } - - /* - * Drivers should use dma_mapping_error() to check the returned - * addresses of dma_map_single() and dma_map_page(). 
- * If not, print this warning message. See Documentation/DMA-API.txt. - */ - if (entry->map_err_type == MAP_ERR_NOT_CHECKED) { - err_printk(ref->dev, entry, - "DMA-API: device driver failed to check map error" - "[device address=0x%016llx] [size=%llu bytes] " - "[mapped as %s]", - ref->dev_addr, ref->size, - type2name[entry->type]); - } - - hash_bucket_del(entry); - dma_entry_free(entry); - - put_hash_bucket(bucket, &flags); -} - -static void check_for_stack(struct device *dev, - struct page *page, size_t offset) -{ - void *addr; - struct vm_struct *stack_vm_area = task_stack_vm_area(current); - - if (!stack_vm_area) { - /* Stack is direct-mapped. */ - if (PageHighMem(page)) - return; - addr = page_address(page) + offset; - if (object_is_on_stack(addr)) - err_printk(dev, NULL, "DMA-API: device driver maps memory from stack [addr=%p]\n", addr); - } else { - /* Stack is vmalloced. */ - int i; - - for (i = 0; i < stack_vm_area->nr_pages; i++) { - if (page != stack_vm_area->pages[i]) - continue; - - addr = (u8 *)current->stack + i * PAGE_SIZE + offset; - err_printk(dev, NULL, "DMA-API: device driver maps memory from stack [probable addr=%p]\n", addr); - break; - } - } -} - -static inline bool overlap(void *addr, unsigned long len, void *start, void *end) -{ - unsigned long a1 = (unsigned long)addr; - unsigned long b1 = a1 + len; - unsigned long a2 = (unsigned long)start; - unsigned long b2 = (unsigned long)end; - - return !(b1 <= a2 || a1 >= b2); -} - -static void check_for_illegal_area(struct device *dev, void *addr, unsigned long len) -{ - if (overlap(addr, len, _stext, _etext) || - overlap(addr, len, __start_rodata, __end_rodata)) - err_printk(dev, NULL, "DMA-API: device driver maps memory from kernel text or rodata [addr=%p] [len=%lu]\n", addr, len); -} - -static void check_sync(struct device *dev, - struct dma_debug_entry *ref, - bool to_cpu) -{ - struct dma_debug_entry *entry; - struct hash_bucket *bucket; - unsigned long flags; - - bucket = get_hash_bucket(ref, 
&flags); - - entry = bucket_find_contain(&bucket, ref, &flags); - - if (!entry) { - err_printk(dev, NULL, "DMA-API: device driver tries " - "to sync DMA memory it has not allocated " - "[device address=0x%016llx] [size=%llu bytes]\n", - (unsigned long long)ref->dev_addr, ref->size); - goto out; - } - - if (ref->size > entry->size) { - err_printk(dev, entry, "DMA-API: device driver syncs" - " DMA memory outside allocated range " - "[device address=0x%016llx] " - "[allocation size=%llu bytes] " - "[sync offset+size=%llu]\n", - entry->dev_addr, entry->size, - ref->size); - } - - if (entry->direction == DMA_BIDIRECTIONAL) - goto out; - - if (ref->direction != entry->direction) { - err_printk(dev, entry, "DMA-API: device driver syncs " - "DMA memory with different direction " - "[device address=0x%016llx] [size=%llu bytes] " - "[mapped with %s] [synced with %s]\n", - (unsigned long long)ref->dev_addr, entry->size, - dir2name[entry->direction], - dir2name[ref->direction]); - } - - if (to_cpu && !(entry->direction == DMA_FROM_DEVICE) && - !(ref->direction == DMA_TO_DEVICE)) - err_printk(dev, entry, "DMA-API: device driver syncs " - "device read-only DMA memory for cpu " - "[device address=0x%016llx] [size=%llu bytes] " - "[mapped with %s] [synced with %s]\n", - (unsigned long long)ref->dev_addr, entry->size, - dir2name[entry->direction], - dir2name[ref->direction]); - - if (!to_cpu && !(entry->direction == DMA_TO_DEVICE) && - !(ref->direction == DMA_FROM_DEVICE)) - err_printk(dev, entry, "DMA-API: device driver syncs " - "device write-only DMA memory to device " - "[device address=0x%016llx] [size=%llu bytes] " - "[mapped with %s] [synced with %s]\n", - (unsigned long long)ref->dev_addr, entry->size, - dir2name[entry->direction], - dir2name[ref->direction]); - - if (ref->sg_call_ents && ref->type == dma_debug_sg && - ref->sg_call_ents != entry->sg_call_ents) { - err_printk(ref->dev, entry, "DMA-API: device driver syncs " - "DMA sg list with different entry count " - "[map 
count=%d] [sync count=%d]\n", - entry->sg_call_ents, ref->sg_call_ents); - } - -out: - put_hash_bucket(bucket, &flags); -} - -static void check_sg_segment(struct device *dev, struct scatterlist *sg) -{ -#ifdef CONFIG_DMA_API_DEBUG_SG - unsigned int max_seg = dma_get_max_seg_size(dev); - u64 start, end, boundary = dma_get_seg_boundary(dev); - - /* - * Either the driver forgot to set dma_parms appropriately, or - * whoever generated the list forgot to check them. - */ - if (sg->length > max_seg) - err_printk(dev, NULL, "DMA-API: mapping sg segment longer than device claims to support [len=%u] [max=%u]\n", - sg->length, max_seg); - /* - * In some cases this could potentially be the DMA API - * implementation's fault, but it would usually imply that - * the scatterlist was built inappropriately to begin with. - */ - start = sg_dma_address(sg); - end = start + sg_dma_len(sg) - 1; - if ((start ^ end) & ~boundary) - err_printk(dev, NULL, "DMA-API: mapping sg segment across boundary [start=0x%016llx] [end=0x%016llx] [boundary=0x%016llx]\n", - start, end, boundary); -#endif -} - -void debug_dma_map_page(struct device *dev, struct page *page, size_t offset, - size_t size, int direction, dma_addr_t dma_addr, - bool map_single) -{ - struct dma_debug_entry *entry; - - if (unlikely(dma_debug_disabled())) - return; - - if (dma_mapping_error(dev, dma_addr)) - return; - - entry = dma_entry_alloc(); - if (!entry) - return; - - entry->dev = dev; - entry->type = dma_debug_page; - entry->pfn = page_to_pfn(page); - entry->offset = offset, - entry->dev_addr = dma_addr; - entry->size = size; - entry->direction = direction; - entry->map_err_type = MAP_ERR_NOT_CHECKED; - - if (map_single) - entry->type = dma_debug_single; - - check_for_stack(dev, page, offset); - - if (!PageHighMem(page)) { - void *addr = page_address(page) + offset; - - check_for_illegal_area(dev, addr, size); - } - - add_dma_entry(entry); -} -EXPORT_SYMBOL(debug_dma_map_page); - -void debug_dma_mapping_error(struct device 
*dev, dma_addr_t dma_addr) -{ - struct dma_debug_entry ref; - struct dma_debug_entry *entry; - struct hash_bucket *bucket; - unsigned long flags; - - if (unlikely(dma_debug_disabled())) - return; - - ref.dev = dev; - ref.dev_addr = dma_addr; - bucket = get_hash_bucket(&ref, &flags); - - list_for_each_entry(entry, &bucket->list, list) { - if (!exact_match(&ref, entry)) - continue; - - /* - * The same physical address can be mapped multiple - * times. Without a hardware IOMMU this results in the - * same device addresses being put into the dma-debug - * hash multiple times too. This can result in false - * positives being reported. Therefore we implement a - * best-fit algorithm here which updates the first entry - * from the hash which fits the reference value and is - * not currently listed as being checked. - */ - if (entry->map_err_type == MAP_ERR_NOT_CHECKED) { - entry->map_err_type = MAP_ERR_CHECKED; - break; - } - } - - put_hash_bucket(bucket, &flags); -} -EXPORT_SYMBOL(debug_dma_mapping_error); - -void debug_dma_unmap_page(struct device *dev, dma_addr_t addr, - size_t size, int direction, bool map_single) -{ - struct dma_debug_entry ref = { - .type = dma_debug_page, - .dev = dev, - .dev_addr = addr, - .size = size, - .direction = direction, - }; - - if (unlikely(dma_debug_disabled())) - return; - - if (map_single) - ref.type = dma_debug_single; - - check_unmap(&ref); -} -EXPORT_SYMBOL(debug_dma_unmap_page); - -void debug_dma_map_sg(struct device *dev, struct scatterlist *sg, - int nents, int mapped_ents, int direction) -{ - struct dma_debug_entry *entry; - struct scatterlist *s; - int i; - - if (unlikely(dma_debug_disabled())) - return; - - for_each_sg(sg, s, mapped_ents, i) { - entry = dma_entry_alloc(); - if (!entry) - return; - - entry->type = dma_debug_sg; - entry->dev = dev; - entry->pfn = page_to_pfn(sg_page(s)); - entry->offset = s->offset, - entry->size = sg_dma_len(s); - entry->dev_addr = sg_dma_address(s); - entry->direction = direction; - 
entry->sg_call_ents = nents; - entry->sg_mapped_ents = mapped_ents; - - check_for_stack(dev, sg_page(s), s->offset); - - if (!PageHighMem(sg_page(s))) { - check_for_illegal_area(dev, sg_virt(s), sg_dma_len(s)); - } - - check_sg_segment(dev, s); - - add_dma_entry(entry); - } -} -EXPORT_SYMBOL(debug_dma_map_sg); - -static int get_nr_mapped_entries(struct device *dev, - struct dma_debug_entry *ref) -{ - struct dma_debug_entry *entry; - struct hash_bucket *bucket; - unsigned long flags; - int mapped_ents; - - bucket = get_hash_bucket(ref, &flags); - entry = bucket_find_exact(bucket, ref); - mapped_ents = 0; - - if (entry) - mapped_ents = entry->sg_mapped_ents; - put_hash_bucket(bucket, &flags); - - return mapped_ents; -} - -void debug_dma_unmap_sg(struct device *dev, struct scatterlist *sglist, - int nelems, int dir) -{ - struct scatterlist *s; - int mapped_ents = 0, i; - - if (unlikely(dma_debug_disabled())) - return; - - for_each_sg(sglist, s, nelems, i) { - - struct dma_debug_entry ref = { - .type = dma_debug_sg, - .dev = dev, - .pfn = page_to_pfn(sg_page(s)), - .offset = s->offset, - .dev_addr = sg_dma_address(s), - .size = sg_dma_len(s), - .direction = dir, - .sg_call_ents = nelems, - }; - - if (mapped_ents && i >= mapped_ents) - break; - - if (!i) - mapped_ents = get_nr_mapped_entries(dev, &ref); - - check_unmap(&ref); - } -} -EXPORT_SYMBOL(debug_dma_unmap_sg); - -void debug_dma_alloc_coherent(struct device *dev, size_t size, - dma_addr_t dma_addr, void *virt) -{ - struct dma_debug_entry *entry; - - if (unlikely(dma_debug_disabled())) - return; - - if (unlikely(virt == NULL)) - return; - - /* handle vmalloc and linear addresses */ - if (!is_vmalloc_addr(virt) && !virt_addr_valid(virt)) - return; - - entry = dma_entry_alloc(); - if (!entry) - return; - - entry->type = dma_debug_coherent; - entry->dev = dev; - entry->offset = offset_in_page(virt); - entry->size = size; - entry->dev_addr = dma_addr; - entry->direction = DMA_BIDIRECTIONAL; - - if 
(is_vmalloc_addr(virt)) - entry->pfn = vmalloc_to_pfn(virt); - else - entry->pfn = page_to_pfn(virt_to_page(virt)); - - add_dma_entry(entry); -} -EXPORT_SYMBOL(debug_dma_alloc_coherent); - -void debug_dma_free_coherent(struct device *dev, size_t size, - void *virt, dma_addr_t addr) -{ - struct dma_debug_entry ref = { - .type = dma_debug_coherent, - .dev = dev, - .offset = offset_in_page(virt), - .dev_addr = addr, - .size = size, - .direction = DMA_BIDIRECTIONAL, - }; - - /* handle vmalloc and linear addresses */ - if (!is_vmalloc_addr(virt) && !virt_addr_valid(virt)) - return; - - if (is_vmalloc_addr(virt)) - ref.pfn = vmalloc_to_pfn(virt); - else - ref.pfn = page_to_pfn(virt_to_page(virt)); - - if (unlikely(dma_debug_disabled())) - return; - - check_unmap(&ref); -} -EXPORT_SYMBOL(debug_dma_free_coherent); - -void debug_dma_map_resource(struct device *dev, phys_addr_t addr, size_t size, - int direction, dma_addr_t dma_addr) -{ - struct dma_debug_entry *entry; - - if (unlikely(dma_debug_disabled())) - return; - - entry = dma_entry_alloc(); - if (!entry) - return; - - entry->type = dma_debug_resource; - entry->dev = dev; - entry->pfn = PHYS_PFN(addr); - entry->offset = offset_in_page(addr); - entry->size = size; - entry->dev_addr = dma_addr; - entry->direction = direction; - entry->map_err_type = MAP_ERR_NOT_CHECKED; - - add_dma_entry(entry); -} -EXPORT_SYMBOL(debug_dma_map_resource); - -void debug_dma_unmap_resource(struct device *dev, dma_addr_t dma_addr, - size_t size, int direction) -{ - struct dma_debug_entry ref = { - .type = dma_debug_resource, - .dev = dev, - .dev_addr = dma_addr, - .size = size, - .direction = direction, - }; - - if (unlikely(dma_debug_disabled())) - return; - - check_unmap(&ref); -} -EXPORT_SYMBOL(debug_dma_unmap_resource); - -void debug_dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle, - size_t size, int direction) -{ - struct dma_debug_entry ref; - - if (unlikely(dma_debug_disabled())) - return; - - ref.type = 
dma_debug_single; - ref.dev = dev; - ref.dev_addr = dma_handle; - ref.size = size; - ref.direction = direction; - ref.sg_call_ents = 0; - - check_sync(dev, &ref, true); -} -EXPORT_SYMBOL(debug_dma_sync_single_for_cpu); - -void debug_dma_sync_single_for_device(struct device *dev, - dma_addr_t dma_handle, size_t size, - int direction) -{ - struct dma_debug_entry ref; - - if (unlikely(dma_debug_disabled())) - return; - - ref.type = dma_debug_single; - ref.dev = dev; - ref.dev_addr = dma_handle; - ref.size = size; - ref.direction = direction; - ref.sg_call_ents = 0; - - check_sync(dev, &ref, false); -} -EXPORT_SYMBOL(debug_dma_sync_single_for_device); - -void debug_dma_sync_single_range_for_cpu(struct device *dev, - dma_addr_t dma_handle, - unsigned long offset, size_t size, - int direction) -{ - struct dma_debug_entry ref; - - if (unlikely(dma_debug_disabled())) - return; - - ref.type = dma_debug_single; - ref.dev = dev; - ref.dev_addr = dma_handle; - ref.size = offset + size; - ref.direction = direction; - ref.sg_call_ents = 0; - - check_sync(dev, &ref, true); -} -EXPORT_SYMBOL(debug_dma_sync_single_range_for_cpu); - -void debug_dma_sync_single_range_for_device(struct device *dev, - dma_addr_t dma_handle, - unsigned long offset, - size_t size, int direction) -{ - struct dma_debug_entry ref; - - if (unlikely(dma_debug_disabled())) - return; - - ref.type = dma_debug_single; - ref.dev = dev; - ref.dev_addr = dma_handle; - ref.size = offset + size; - ref.direction = direction; - ref.sg_call_ents = 0; - - check_sync(dev, &ref, false); -} -EXPORT_SYMBOL(debug_dma_sync_single_range_for_device); - -void debug_dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, - int nelems, int direction) -{ - struct scatterlist *s; - int mapped_ents = 0, i; - - if (unlikely(dma_debug_disabled())) - return; - - for_each_sg(sg, s, nelems, i) { - - struct dma_debug_entry ref = { - .type = dma_debug_sg, - .dev = dev, - .pfn = page_to_pfn(sg_page(s)), - .offset = s->offset, - 
.dev_addr = sg_dma_address(s), - .size = sg_dma_len(s), - .direction = direction, - .sg_call_ents = nelems, - }; - - if (!i) - mapped_ents = get_nr_mapped_entries(dev, &ref); - - if (i >= mapped_ents) - break; - - check_sync(dev, &ref, true); - } -} -EXPORT_SYMBOL(debug_dma_sync_sg_for_cpu); - -void debug_dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, - int nelems, int direction) -{ - struct scatterlist *s; - int mapped_ents = 0, i; - - if (unlikely(dma_debug_disabled())) - return; - - for_each_sg(sg, s, nelems, i) { - - struct dma_debug_entry ref = { - .type = dma_debug_sg, - .dev = dev, - .pfn = page_to_pfn(sg_page(s)), - .offset = s->offset, - .dev_addr = sg_dma_address(s), - .size = sg_dma_len(s), - .direction = direction, - .sg_call_ents = nelems, - }; - if (!i) - mapped_ents = get_nr_mapped_entries(dev, &ref); - - if (i >= mapped_ents) - break; - - check_sync(dev, &ref, false); - } -} -EXPORT_SYMBOL(debug_dma_sync_sg_for_device); - -static int __init dma_debug_driver_setup(char *str) -{ - int i; - - for (i = 0; i < NAME_MAX_LEN - 1; ++i, ++str) { - current_driver_name[i] = *str; - if (*str == 0) - break; - } - - if (current_driver_name[0]) - pr_info("DMA-API: enable driver filter for driver [%s]\n", - current_driver_name); - - - return 1; -} -__setup("dma_debug_driver=", dma_debug_driver_setup); diff --git a/lib/dma-direct.c b/lib/dma-direct.c deleted file mode 100644 index 8be8106270c2..000000000000 --- a/lib/dma-direct.c +++ /dev/null @@ -1,204 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * DMA operations that map physical memory directly without using an IOMMU or - * flushing caches. 
- */ -#include -#include -#include -#include -#include -#include -#include - -#define DIRECT_MAPPING_ERROR 0 - -/* - * Most architectures use ZONE_DMA for the first 16 Megabytes, but - * some use it for entirely different regions: - */ -#ifndef ARCH_ZONE_DMA_BITS -#define ARCH_ZONE_DMA_BITS 24 -#endif - -/* - * For AMD SEV all DMA must be to unencrypted addresses. - */ -static inline bool force_dma_unencrypted(void) -{ - return sev_active(); -} - -static bool -check_addr(struct device *dev, dma_addr_t dma_addr, size_t size, - const char *caller) -{ - if (unlikely(dev && !dma_capable(dev, dma_addr, size))) { - if (!dev->dma_mask) { - dev_err(dev, - "%s: call on device without dma_mask\n", - caller); - return false; - } - - if (*dev->dma_mask >= DMA_BIT_MASK(32)) { - dev_err(dev, - "%s: overflow %pad+%zu of device mask %llx\n", - caller, &dma_addr, size, *dev->dma_mask); - } - return false; - } - return true; -} - -static bool dma_coherent_ok(struct device *dev, phys_addr_t phys, size_t size) -{ - dma_addr_t addr = force_dma_unencrypted() ? 
- __phys_to_dma(dev, phys) : phys_to_dma(dev, phys); - return addr + size - 1 <= dev->coherent_dma_mask; -} - -void *dma_direct_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle, - gfp_t gfp, unsigned long attrs) -{ - unsigned int count = PAGE_ALIGN(size) >> PAGE_SHIFT; - int page_order = get_order(size); - struct page *page = NULL; - void *ret; - - /* we always manually zero the memory once we are done: */ - gfp &= ~__GFP_ZERO; - - /* GFP_DMA32 and GFP_DMA are no ops without the corresponding zones: */ - if (dev->coherent_dma_mask <= DMA_BIT_MASK(ARCH_ZONE_DMA_BITS)) - gfp |= GFP_DMA; - if (dev->coherent_dma_mask <= DMA_BIT_MASK(32) && !(gfp & GFP_DMA)) - gfp |= GFP_DMA32; - -again: - /* CMA can be used only in the context which permits sleeping */ - if (gfpflags_allow_blocking(gfp)) { - page = dma_alloc_from_contiguous(dev, count, page_order, gfp); - if (page && !dma_coherent_ok(dev, page_to_phys(page), size)) { - dma_release_from_contiguous(dev, page, count); - page = NULL; - } - } - if (!page) - page = alloc_pages_node(dev_to_node(dev), gfp, page_order); - - if (page && !dma_coherent_ok(dev, page_to_phys(page), size)) { - __free_pages(page, page_order); - page = NULL; - - if (IS_ENABLED(CONFIG_ZONE_DMA32) && - dev->coherent_dma_mask < DMA_BIT_MASK(64) && - !(gfp & (GFP_DMA32 | GFP_DMA))) { - gfp |= GFP_DMA32; - goto again; - } - - if (IS_ENABLED(CONFIG_ZONE_DMA) && - dev->coherent_dma_mask < DMA_BIT_MASK(32) && - !(gfp & GFP_DMA)) { - gfp = (gfp & ~GFP_DMA32) | GFP_DMA; - goto again; - } - } - - if (!page) - return NULL; - ret = page_address(page); - if (force_dma_unencrypted()) { - set_memory_decrypted((unsigned long)ret, 1 << page_order); - *dma_handle = __phys_to_dma(dev, page_to_phys(page)); - } else { - *dma_handle = phys_to_dma(dev, page_to_phys(page)); - } - memset(ret, 0, size); - return ret; -} - -/* - * NOTE: this function must never look at the dma_addr argument, because we want - * to be able to use it as a helper for iommu 
implementations as well. - */ -void dma_direct_free(struct device *dev, size_t size, void *cpu_addr, - dma_addr_t dma_addr, unsigned long attrs) -{ - unsigned int count = PAGE_ALIGN(size) >> PAGE_SHIFT; - unsigned int page_order = get_order(size); - - if (force_dma_unencrypted()) - set_memory_encrypted((unsigned long)cpu_addr, 1 << page_order); - if (!dma_release_from_contiguous(dev, virt_to_page(cpu_addr), count)) - free_pages((unsigned long)cpu_addr, page_order); -} - -dma_addr_t dma_direct_map_page(struct device *dev, struct page *page, - unsigned long offset, size_t size, enum dma_data_direction dir, - unsigned long attrs) -{ - dma_addr_t dma_addr = phys_to_dma(dev, page_to_phys(page)) + offset; - - if (!check_addr(dev, dma_addr, size, __func__)) - return DIRECT_MAPPING_ERROR; - return dma_addr; -} - -int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents, - enum dma_data_direction dir, unsigned long attrs) -{ - int i; - struct scatterlist *sg; - - for_each_sg(sgl, sg, nents, i) { - BUG_ON(!sg_page(sg)); - - sg_dma_address(sg) = phys_to_dma(dev, sg_phys(sg)); - if (!check_addr(dev, sg_dma_address(sg), sg->length, __func__)) - return 0; - sg_dma_len(sg) = sg->length; - } - - return nents; -} - -int dma_direct_supported(struct device *dev, u64 mask) -{ -#ifdef CONFIG_ZONE_DMA - if (mask < DMA_BIT_MASK(ARCH_ZONE_DMA_BITS)) - return 0; -#else - /* - * Because 32-bit DMA masks are so common we expect every architecture - * to be able to satisfy them - either by not supporting more physical - * memory, or by providing a ZONE_DMA32. If neither is the case, the - * architecture needs to use an IOMMU instead of the direct mapping. - */ - if (mask < DMA_BIT_MASK(32)) - return 0; -#endif - /* - * Various PCI/PCIe bridges have broken support for > 32bit DMA even - * if the device itself might support it. 
- */ - if (dev->dma_32bit_limit && mask > DMA_BIT_MASK(32)) - return 0; - return 1; -} - -int dma_direct_mapping_error(struct device *dev, dma_addr_t dma_addr) -{ - return dma_addr == DIRECT_MAPPING_ERROR; -} - -const struct dma_map_ops dma_direct_ops = { - .alloc = dma_direct_alloc, - .free = dma_direct_free, - .map_page = dma_direct_map_page, - .map_sg = dma_direct_map_sg, - .dma_supported = dma_direct_supported, - .mapping_error = dma_direct_mapping_error, -}; -EXPORT_SYMBOL(dma_direct_ops); diff --git a/lib/dma-noncoherent.c b/lib/dma-noncoherent.c deleted file mode 100644 index 79e9a757387f..000000000000 --- a/lib/dma-noncoherent.c +++ /dev/null @@ -1,102 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (C) 2018 Christoph Hellwig. - * - * DMA operations that map physical memory directly without providing cache - * coherence. - */ -#include -#include -#include -#include -#include - -static void dma_noncoherent_sync_single_for_device(struct device *dev, - dma_addr_t addr, size_t size, enum dma_data_direction dir) -{ - arch_sync_dma_for_device(dev, dma_to_phys(dev, addr), size, dir); -} - -static void dma_noncoherent_sync_sg_for_device(struct device *dev, - struct scatterlist *sgl, int nents, enum dma_data_direction dir) -{ - struct scatterlist *sg; - int i; - - for_each_sg(sgl, sg, nents, i) - arch_sync_dma_for_device(dev, sg_phys(sg), sg->length, dir); -} - -static dma_addr_t dma_noncoherent_map_page(struct device *dev, struct page *page, - unsigned long offset, size_t size, enum dma_data_direction dir, - unsigned long attrs) -{ - dma_addr_t addr; - - addr = dma_direct_map_page(dev, page, offset, size, dir, attrs); - if (!dma_mapping_error(dev, addr) && !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) - arch_sync_dma_for_device(dev, page_to_phys(page) + offset, - size, dir); - return addr; -} - -static int dma_noncoherent_map_sg(struct device *dev, struct scatterlist *sgl, - int nents, enum dma_data_direction dir, unsigned long attrs) -{ - nents = 
dma_direct_map_sg(dev, sgl, nents, dir, attrs); - if (nents > 0 && !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) - dma_noncoherent_sync_sg_for_device(dev, sgl, nents, dir); - return nents; -} - -#ifdef CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU -static void dma_noncoherent_sync_single_for_cpu(struct device *dev, - dma_addr_t addr, size_t size, enum dma_data_direction dir) -{ - arch_sync_dma_for_cpu(dev, dma_to_phys(dev, addr), size, dir); -} - -static void dma_noncoherent_sync_sg_for_cpu(struct device *dev, - struct scatterlist *sgl, int nents, enum dma_data_direction dir) -{ - struct scatterlist *sg; - int i; - - for_each_sg(sgl, sg, nents, i) - arch_sync_dma_for_cpu(dev, sg_phys(sg), sg->length, dir); -} - -static void dma_noncoherent_unmap_page(struct device *dev, dma_addr_t addr, - size_t size, enum dma_data_direction dir, unsigned long attrs) -{ - if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC)) - dma_noncoherent_sync_single_for_cpu(dev, addr, size, dir); -} - -static void dma_noncoherent_unmap_sg(struct device *dev, struct scatterlist *sgl, - int nents, enum dma_data_direction dir, unsigned long attrs) -{ - if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC)) - dma_noncoherent_sync_sg_for_cpu(dev, sgl, nents, dir); -} -#endif - -const struct dma_map_ops dma_noncoherent_ops = { - .alloc = arch_dma_alloc, - .free = arch_dma_free, - .mmap = arch_dma_mmap, - .sync_single_for_device = dma_noncoherent_sync_single_for_device, - .sync_sg_for_device = dma_noncoherent_sync_sg_for_device, - .map_page = dma_noncoherent_map_page, - .map_sg = dma_noncoherent_map_sg, -#ifdef CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU - .sync_single_for_cpu = dma_noncoherent_sync_single_for_cpu, - .sync_sg_for_cpu = dma_noncoherent_sync_sg_for_cpu, - .unmap_page = dma_noncoherent_unmap_page, - .unmap_sg = dma_noncoherent_unmap_sg, -#endif - .dma_supported = dma_direct_supported, - .mapping_error = dma_direct_mapping_error, - .cache_sync = arch_dma_cache_sync, -}; -EXPORT_SYMBOL(dma_noncoherent_ops); diff --git a/lib/dma-virt.c b/lib/dma-virt.c 
deleted file mode 100644 index 8e61a02ef9ca..000000000000 --- a/lib/dma-virt.c +++ /dev/null @@ -1,61 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * lib/dma-virt.c - * - * DMA operations that map to virtual addresses without flushing memory. - */ -#include -#include -#include -#include - -static void *dma_virt_alloc(struct device *dev, size_t size, - dma_addr_t *dma_handle, gfp_t gfp, - unsigned long attrs) -{ - void *ret; - - ret = (void *)__get_free_pages(gfp, get_order(size)); - if (ret) - *dma_handle = (uintptr_t)ret; - return ret; -} - -static void dma_virt_free(struct device *dev, size_t size, - void *cpu_addr, dma_addr_t dma_addr, - unsigned long attrs) -{ - free_pages((unsigned long)cpu_addr, get_order(size)); -} - -static dma_addr_t dma_virt_map_page(struct device *dev, struct page *page, - unsigned long offset, size_t size, - enum dma_data_direction dir, - unsigned long attrs) -{ - return (uintptr_t)(page_address(page) + offset); -} - -static int dma_virt_map_sg(struct device *dev, struct scatterlist *sgl, - int nents, enum dma_data_direction dir, - unsigned long attrs) -{ - int i; - struct scatterlist *sg; - - for_each_sg(sgl, sg, nents, i) { - BUG_ON(!sg_page(sg)); - sg_dma_address(sg) = (uintptr_t)sg_virt(sg); - sg_dma_len(sg) = sg->length; - } - - return nents; -} - -const struct dma_map_ops dma_virt_ops = { - .alloc = dma_virt_alloc, - .free = dma_virt_free, - .map_page = dma_virt_map_page, - .map_sg = dma_virt_map_sg, -}; -EXPORT_SYMBOL(dma_virt_ops); diff --git a/lib/swiotlb.c b/lib/swiotlb.c deleted file mode 100644 index 04b68d9dffac..000000000000 --- a/lib/swiotlb.c +++ /dev/null @@ -1,1087 +0,0 @@ -/* - * Dynamic DMA mapping support. - * - * This implementation is a fallback for platforms that do not support - * I/O TLBs (aka DMA address translation hardware). 
- * Copyright (C) 2000 Asit Mallick - * Copyright (C) 2000 Goutham Rao - * Copyright (C) 2000, 2003 Hewlett-Packard Co - * David Mosberger-Tang - * - * 03/05/07 davidm Switch from PCI-DMA to generic device DMA API. - * 00/12/13 davidm Rename to swiotlb.c and add mark_clean() to avoid - * unnecessary i-cache flushing. - * 04/07/.. ak Better overflow handling. Assorted fixes. - * 05/09/10 linville Add support for syncing ranges, support syncing for - * DMA_BIDIRECTIONAL mappings, miscellaneous cleanup. - * 08/12/11 beckyb Add highmem support - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#include -#include -#include - -#define CREATE_TRACE_POINTS -#include - -#define OFFSET(val,align) ((unsigned long) \ - ( (val) & ( (align) - 1))) - -#define SLABS_PER_PAGE (1 << (PAGE_SHIFT - IO_TLB_SHIFT)) - -/* - * Minimum IO TLB size to bother booting with. Systems with mainly - * 64bit capable cards will only lightly use the swiotlb. If we can't - * allocate a contiguous 1MB, we're probably in trouble anyway. - */ -#define IO_TLB_MIN_SLABS ((1<<20) >> IO_TLB_SHIFT) - -enum swiotlb_force swiotlb_force; - -/* - * Used to do a quick range check in swiotlb_tbl_unmap_single and - * swiotlb_tbl_sync_single_*, to see if the memory was in fact allocated by this - * API. - */ -static phys_addr_t io_tlb_start, io_tlb_end; - -/* - * The number of IO TLB blocks (in groups of 64) between io_tlb_start and - * io_tlb_end. This is command line adjustable via setup_io_tlb_npages. - */ -static unsigned long io_tlb_nslabs; - -/* - * When the IOMMU overflows we return a fallback buffer. This sets the size. 
- */ -static unsigned long io_tlb_overflow = 32*1024; - -static phys_addr_t io_tlb_overflow_buffer; - -/* - * This is a free list describing the number of free entries available from - * each index - */ -static unsigned int *io_tlb_list; -static unsigned int io_tlb_index; - -/* - * Max segment that we can provide which (if pages are contingous) will - * not be bounced (unless SWIOTLB_FORCE is set). - */ -unsigned int max_segment; - -/* - * We need to save away the original address corresponding to a mapped entry - * for the sync operations. - */ -#define INVALID_PHYS_ADDR (~(phys_addr_t)0) -static phys_addr_t *io_tlb_orig_addr; - -/* - * Protect the above data structures in the map and unmap calls - */ -static DEFINE_SPINLOCK(io_tlb_lock); - -static int late_alloc; - -static int __init -setup_io_tlb_npages(char *str) -{ - if (isdigit(*str)) { - io_tlb_nslabs = simple_strtoul(str, &str, 0); - /* avoid tail segment of size < IO_TLB_SEGSIZE */ - io_tlb_nslabs = ALIGN(io_tlb_nslabs, IO_TLB_SEGSIZE); - } - if (*str == ',') - ++str; - if (!strcmp(str, "force")) { - swiotlb_force = SWIOTLB_FORCE; - } else if (!strcmp(str, "noforce")) { - swiotlb_force = SWIOTLB_NO_FORCE; - io_tlb_nslabs = 1; - } - - return 0; -} -early_param("swiotlb", setup_io_tlb_npages); -/* make io_tlb_overflow tunable too? */ - -unsigned long swiotlb_nr_tbl(void) -{ - return io_tlb_nslabs; -} -EXPORT_SYMBOL_GPL(swiotlb_nr_tbl); - -unsigned int swiotlb_max_segment(void) -{ - return max_segment; -} -EXPORT_SYMBOL_GPL(swiotlb_max_segment); - -void swiotlb_set_max_segment(unsigned int val) -{ - if (swiotlb_force == SWIOTLB_FORCE) - max_segment = 1; - else - max_segment = rounddown(val, PAGE_SIZE); -} - -/* default to 64MB */ -#define IO_TLB_DEFAULT_SIZE (64UL<<20) -unsigned long swiotlb_size_or_default(void) -{ - unsigned long size; - - size = io_tlb_nslabs << IO_TLB_SHIFT; - - return size ? 
size : (IO_TLB_DEFAULT_SIZE); -} - -static bool no_iotlb_memory; - -void swiotlb_print_info(void) -{ - unsigned long bytes = io_tlb_nslabs << IO_TLB_SHIFT; - unsigned char *vstart, *vend; - - if (no_iotlb_memory) { - pr_warn("software IO TLB: No low mem\n"); - return; - } - - vstart = phys_to_virt(io_tlb_start); - vend = phys_to_virt(io_tlb_end); - - printk(KERN_INFO "software IO TLB [mem %#010llx-%#010llx] (%luMB) mapped at [%p-%p]\n", - (unsigned long long)io_tlb_start, - (unsigned long long)io_tlb_end, - bytes >> 20, vstart, vend - 1); -} - -/* - * Early SWIOTLB allocation may be too early to allow an architecture to - * perform the desired operations. This function allows the architecture to - * call SWIOTLB when the operations are possible. It needs to be called - * before the SWIOTLB memory is used. - */ -void __init swiotlb_update_mem_attributes(void) -{ - void *vaddr; - unsigned long bytes; - - if (no_iotlb_memory || late_alloc) - return; - - vaddr = phys_to_virt(io_tlb_start); - bytes = PAGE_ALIGN(io_tlb_nslabs << IO_TLB_SHIFT); - set_memory_decrypted((unsigned long)vaddr, bytes >> PAGE_SHIFT); - memset(vaddr, 0, bytes); - - vaddr = phys_to_virt(io_tlb_overflow_buffer); - bytes = PAGE_ALIGN(io_tlb_overflow); - set_memory_decrypted((unsigned long)vaddr, bytes >> PAGE_SHIFT); - memset(vaddr, 0, bytes); -} - -int __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose) -{ - void *v_overflow_buffer; - unsigned long i, bytes; - - bytes = nslabs << IO_TLB_SHIFT; - - io_tlb_nslabs = nslabs; - io_tlb_start = __pa(tlb); - io_tlb_end = io_tlb_start + bytes; - - /* - * Get the overflow emergency buffer - */ - v_overflow_buffer = memblock_virt_alloc_low_nopanic( - PAGE_ALIGN(io_tlb_overflow), - PAGE_SIZE); - if (!v_overflow_buffer) - return -ENOMEM; - - io_tlb_overflow_buffer = __pa(v_overflow_buffer); - - /* - * Allocate and initialize the free list array. 
This array is used - * to find contiguous free memory regions of size up to IO_TLB_SEGSIZE - * between io_tlb_start and io_tlb_end. - */ - io_tlb_list = memblock_virt_alloc( - PAGE_ALIGN(io_tlb_nslabs * sizeof(int)), - PAGE_SIZE); - io_tlb_orig_addr = memblock_virt_alloc( - PAGE_ALIGN(io_tlb_nslabs * sizeof(phys_addr_t)), - PAGE_SIZE); - for (i = 0; i < io_tlb_nslabs; i++) { - io_tlb_list[i] = IO_TLB_SEGSIZE - OFFSET(i, IO_TLB_SEGSIZE); - io_tlb_orig_addr[i] = INVALID_PHYS_ADDR; - } - io_tlb_index = 0; - - if (verbose) - swiotlb_print_info(); - - swiotlb_set_max_segment(io_tlb_nslabs << IO_TLB_SHIFT); - return 0; -} - -/* - * Statically reserve bounce buffer space and initialize bounce buffer data - * structures for the software IO TLB used to implement the DMA API. - */ -void __init -swiotlb_init(int verbose) -{ - size_t default_size = IO_TLB_DEFAULT_SIZE; - unsigned char *vstart; - unsigned long bytes; - - if (!io_tlb_nslabs) { - io_tlb_nslabs = (default_size >> IO_TLB_SHIFT); - io_tlb_nslabs = ALIGN(io_tlb_nslabs, IO_TLB_SEGSIZE); - } - - bytes = io_tlb_nslabs << IO_TLB_SHIFT; - - /* Get IO TLB memory from the low pages */ - vstart = memblock_virt_alloc_low_nopanic(PAGE_ALIGN(bytes), PAGE_SIZE); - if (vstart && !swiotlb_init_with_tbl(vstart, io_tlb_nslabs, verbose)) - return; - - if (io_tlb_start) - memblock_free_early(io_tlb_start, - PAGE_ALIGN(io_tlb_nslabs << IO_TLB_SHIFT)); - pr_warn("Cannot allocate SWIOTLB buffer"); - no_iotlb_memory = true; -} - -/* - * Systems with larger DMA zones (those that don't support ISA) can - * initialize the swiotlb later using the slab allocator if needed. - * This should be just like above, but with some error catching. 
- */ -int -swiotlb_late_init_with_default_size(size_t default_size) -{ - unsigned long bytes, req_nslabs = io_tlb_nslabs; - unsigned char *vstart = NULL; - unsigned int order; - int rc = 0; - - if (!io_tlb_nslabs) { - io_tlb_nslabs = (default_size >> IO_TLB_SHIFT); - io_tlb_nslabs = ALIGN(io_tlb_nslabs, IO_TLB_SEGSIZE); - } - - /* - * Get IO TLB memory from the low pages - */ - order = get_order(io_tlb_nslabs << IO_TLB_SHIFT); - io_tlb_nslabs = SLABS_PER_PAGE << order; - bytes = io_tlb_nslabs << IO_TLB_SHIFT; - - while ((SLABS_PER_PAGE << order) > IO_TLB_MIN_SLABS) { - vstart = (void *)__get_free_pages(GFP_DMA | __GFP_NOWARN, - order); - if (vstart) - break; - order--; - } - - if (!vstart) { - io_tlb_nslabs = req_nslabs; - return -ENOMEM; - } - if (order != get_order(bytes)) { - printk(KERN_WARNING "Warning: only able to allocate %ld MB " - "for software IO TLB\n", (PAGE_SIZE << order) >> 20); - io_tlb_nslabs = SLABS_PER_PAGE << order; - } - rc = swiotlb_late_init_with_tbl(vstart, io_tlb_nslabs); - if (rc) - free_pages((unsigned long)vstart, order); - - return rc; -} - -int -swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs) -{ - unsigned long i, bytes; - unsigned char *v_overflow_buffer; - - bytes = nslabs << IO_TLB_SHIFT; - - io_tlb_nslabs = nslabs; - io_tlb_start = virt_to_phys(tlb); - io_tlb_end = io_tlb_start + bytes; - - set_memory_decrypted((unsigned long)tlb, bytes >> PAGE_SHIFT); - memset(tlb, 0, bytes); - - /* - * Get the overflow emergency buffer - */ - v_overflow_buffer = (void *)__get_free_pages(GFP_DMA, - get_order(io_tlb_overflow)); - if (!v_overflow_buffer) - goto cleanup2; - - set_memory_decrypted((unsigned long)v_overflow_buffer, - io_tlb_overflow >> PAGE_SHIFT); - memset(v_overflow_buffer, 0, io_tlb_overflow); - io_tlb_overflow_buffer = virt_to_phys(v_overflow_buffer); - - /* - * Allocate and initialize the free list array. 
This array is used - * to find contiguous free memory regions of size up to IO_TLB_SEGSIZE - * between io_tlb_start and io_tlb_end. - */ - io_tlb_list = (unsigned int *)__get_free_pages(GFP_KERNEL, - get_order(io_tlb_nslabs * sizeof(int))); - if (!io_tlb_list) - goto cleanup3; - - io_tlb_orig_addr = (phys_addr_t *) - __get_free_pages(GFP_KERNEL, - get_order(io_tlb_nslabs * - sizeof(phys_addr_t))); - if (!io_tlb_orig_addr) - goto cleanup4; - - for (i = 0; i < io_tlb_nslabs; i++) { - io_tlb_list[i] = IO_TLB_SEGSIZE - OFFSET(i, IO_TLB_SEGSIZE); - io_tlb_orig_addr[i] = INVALID_PHYS_ADDR; - } - io_tlb_index = 0; - - swiotlb_print_info(); - - late_alloc = 1; - - swiotlb_set_max_segment(io_tlb_nslabs << IO_TLB_SHIFT); - - return 0; - -cleanup4: - free_pages((unsigned long)io_tlb_list, get_order(io_tlb_nslabs * - sizeof(int))); - io_tlb_list = NULL; -cleanup3: - free_pages((unsigned long)v_overflow_buffer, - get_order(io_tlb_overflow)); - io_tlb_overflow_buffer = 0; -cleanup2: - io_tlb_end = 0; - io_tlb_start = 0; - io_tlb_nslabs = 0; - max_segment = 0; - return -ENOMEM; -} - -void __init swiotlb_exit(void) -{ - if (!io_tlb_orig_addr) - return; - - if (late_alloc) { - free_pages((unsigned long)phys_to_virt(io_tlb_overflow_buffer), - get_order(io_tlb_overflow)); - free_pages((unsigned long)io_tlb_orig_addr, - get_order(io_tlb_nslabs * sizeof(phys_addr_t))); - free_pages((unsigned long)io_tlb_list, get_order(io_tlb_nslabs * - sizeof(int))); - free_pages((unsigned long)phys_to_virt(io_tlb_start), - get_order(io_tlb_nslabs << IO_TLB_SHIFT)); - } else { - memblock_free_late(io_tlb_overflow_buffer, - PAGE_ALIGN(io_tlb_overflow)); - memblock_free_late(__pa(io_tlb_orig_addr), - PAGE_ALIGN(io_tlb_nslabs * sizeof(phys_addr_t))); - memblock_free_late(__pa(io_tlb_list), - PAGE_ALIGN(io_tlb_nslabs * sizeof(int))); - memblock_free_late(io_tlb_start, - PAGE_ALIGN(io_tlb_nslabs << IO_TLB_SHIFT)); - } - io_tlb_nslabs = 0; - max_segment = 0; -} - -int is_swiotlb_buffer(phys_addr_t paddr) -{ 
- return paddr >= io_tlb_start && paddr < io_tlb_end; -} - -/* - * Bounce: copy the swiotlb buffer back to the original dma location - */ -static void swiotlb_bounce(phys_addr_t orig_addr, phys_addr_t tlb_addr, - size_t size, enum dma_data_direction dir) -{ - unsigned long pfn = PFN_DOWN(orig_addr); - unsigned char *vaddr = phys_to_virt(tlb_addr); - - if (PageHighMem(pfn_to_page(pfn))) { - /* The buffer does not have a mapping. Map it in and copy */ - unsigned int offset = orig_addr & ~PAGE_MASK; - char *buffer; - unsigned int sz = 0; - unsigned long flags; - - while (size) { - sz = min_t(size_t, PAGE_SIZE - offset, size); - - local_irq_save(flags); - buffer = kmap_atomic(pfn_to_page(pfn)); - if (dir == DMA_TO_DEVICE) - memcpy(vaddr, buffer + offset, sz); - else - memcpy(buffer + offset, vaddr, sz); - kunmap_atomic(buffer); - local_irq_restore(flags); - - size -= sz; - pfn++; - vaddr += sz; - offset = 0; - } - } else if (dir == DMA_TO_DEVICE) { - memcpy(vaddr, phys_to_virt(orig_addr), size); - } else { - memcpy(phys_to_virt(orig_addr), vaddr, size); - } -} - -phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, - dma_addr_t tbl_dma_addr, - phys_addr_t orig_addr, size_t size, - enum dma_data_direction dir, - unsigned long attrs) -{ - unsigned long flags; - phys_addr_t tlb_addr; - unsigned int nslots, stride, index, wrap; - int i; - unsigned long mask; - unsigned long offset_slots; - unsigned long max_slots; - - if (no_iotlb_memory) - panic("Can not allocate SWIOTLB buffer earlier and can't now provide you with the DMA bounce buffer"); - - if (mem_encrypt_active()) - pr_warn_once("%s is active and system is using DMA bounce buffers\n", - sme_active() ? "SME" : "SEV"); - - mask = dma_get_seg_boundary(hwdev); - - tbl_dma_addr &= mask; - - offset_slots = ALIGN(tbl_dma_addr, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT; - - /* - * Carefully handle integer overflow which can occur when mask == ~0UL. - */ - max_slots = mask + 1 - ? 
ALIGN(mask + 1, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT - : 1UL << (BITS_PER_LONG - IO_TLB_SHIFT); - - /* - * For mappings greater than or equal to a page, we limit the stride - * (and hence alignment) to a page size. - */ - nslots = ALIGN(size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT; - if (size >= PAGE_SIZE) - stride = (1 << (PAGE_SHIFT - IO_TLB_SHIFT)); - else - stride = 1; - - BUG_ON(!nslots); - - /* - * Find suitable number of IO TLB entries size that will fit this - * request and allocate a buffer from that IO TLB pool. - */ - spin_lock_irqsave(&io_tlb_lock, flags); - index = ALIGN(io_tlb_index, stride); - if (index >= io_tlb_nslabs) - index = 0; - wrap = index; - - do { - while (iommu_is_span_boundary(index, nslots, offset_slots, - max_slots)) { - index += stride; - if (index >= io_tlb_nslabs) - index = 0; - if (index == wrap) - goto not_found; - } - - /* - * If we find a slot that indicates we have 'nslots' number of - * contiguous buffers, we allocate the buffers from that slot - * and mark the entries as '0' indicating unavailable. - */ - if (io_tlb_list[index] >= nslots) { - int count = 0; - - for (i = index; i < (int) (index + nslots); i++) - io_tlb_list[i] = 0; - for (i = index - 1; (OFFSET(i, IO_TLB_SEGSIZE) != IO_TLB_SEGSIZE - 1) && io_tlb_list[i]; i--) - io_tlb_list[i] = ++count; - tlb_addr = io_tlb_start + (index << IO_TLB_SHIFT); - - /* - * Update the indices to avoid searching in the next - * round. - */ - io_tlb_index = ((index + nslots) < io_tlb_nslabs - ? (index + nslots) : 0); - - goto found; - } - index += stride; - if (index >= io_tlb_nslabs) - index = 0; - } while (index != wrap); - -not_found: - spin_unlock_irqrestore(&io_tlb_lock, flags); - if (!(attrs & DMA_ATTR_NO_WARN) && printk_ratelimit()) - dev_warn(hwdev, "swiotlb buffer is full (sz: %zd bytes)\n", size); - return SWIOTLB_MAP_ERROR; -found: - spin_unlock_irqrestore(&io_tlb_lock, flags); - - /* - * Save away the mapping from the original address to the DMA address. 
- * This is needed when we sync the memory. Then we sync the buffer if - * needed. - */ - for (i = 0; i < nslots; i++) - io_tlb_orig_addr[index+i] = orig_addr + (i << IO_TLB_SHIFT); - if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) && - (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL)) - swiotlb_bounce(orig_addr, tlb_addr, size, DMA_TO_DEVICE); - - return tlb_addr; -} - -/* - * Allocates bounce buffer and returns its physical address. - */ -static phys_addr_t -map_single(struct device *hwdev, phys_addr_t phys, size_t size, - enum dma_data_direction dir, unsigned long attrs) -{ - dma_addr_t start_dma_addr; - - if (swiotlb_force == SWIOTLB_NO_FORCE) { - dev_warn_ratelimited(hwdev, "Cannot do DMA to address %pa\n", - &phys); - return SWIOTLB_MAP_ERROR; - } - - start_dma_addr = __phys_to_dma(hwdev, io_tlb_start); - return swiotlb_tbl_map_single(hwdev, start_dma_addr, phys, size, - dir, attrs); -} - -/* - * tlb_addr is the physical address of the bounce buffer to unmap. - */ -void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr, - size_t size, enum dma_data_direction dir, - unsigned long attrs) -{ - unsigned long flags; - int i, count, nslots = ALIGN(size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT; - int index = (tlb_addr - io_tlb_start) >> IO_TLB_SHIFT; - phys_addr_t orig_addr = io_tlb_orig_addr[index]; - - /* - * First, sync the memory before unmapping the entry - */ - if (orig_addr != INVALID_PHYS_ADDR && - !(attrs & DMA_ATTR_SKIP_CPU_SYNC) && - ((dir == DMA_FROM_DEVICE) || (dir == DMA_BIDIRECTIONAL))) - swiotlb_bounce(orig_addr, tlb_addr, size, DMA_FROM_DEVICE); - - /* - * Return the buffer to the free list by setting the corresponding - * entries to indicate the number of contiguous entries available. - * While returning the entries to the free list, we merge the entries - * with slots below and above the pool being returned. - */ - spin_lock_irqsave(&io_tlb_lock, flags); - { - count = ((index + nslots) < ALIGN(index + 1, IO_TLB_SEGSIZE) ? 
- io_tlb_list[index + nslots] : 0); - /* - * Step 1: return the slots to the free list, merging the - * slots with superceeding slots - */ - for (i = index + nslots - 1; i >= index; i--) { - io_tlb_list[i] = ++count; - io_tlb_orig_addr[i] = INVALID_PHYS_ADDR; - } - /* - * Step 2: merge the returned slots with the preceding slots, - * if available (non zero) - */ - for (i = index - 1; (OFFSET(i, IO_TLB_SEGSIZE) != IO_TLB_SEGSIZE -1) && io_tlb_list[i]; i--) - io_tlb_list[i] = ++count; - } - spin_unlock_irqrestore(&io_tlb_lock, flags); -} - -void swiotlb_tbl_sync_single(struct device *hwdev, phys_addr_t tlb_addr, - size_t size, enum dma_data_direction dir, - enum dma_sync_target target) -{ - int index = (tlb_addr - io_tlb_start) >> IO_TLB_SHIFT; - phys_addr_t orig_addr = io_tlb_orig_addr[index]; - - if (orig_addr == INVALID_PHYS_ADDR) - return; - orig_addr += (unsigned long)tlb_addr & ((1 << IO_TLB_SHIFT) - 1); - - switch (target) { - case SYNC_FOR_CPU: - if (likely(dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)) - swiotlb_bounce(orig_addr, tlb_addr, - size, DMA_FROM_DEVICE); - else - BUG_ON(dir != DMA_TO_DEVICE); - break; - case SYNC_FOR_DEVICE: - if (likely(dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL)) - swiotlb_bounce(orig_addr, tlb_addr, - size, DMA_TO_DEVICE); - else - BUG_ON(dir != DMA_FROM_DEVICE); - break; - default: - BUG(); - } -} - -static inline bool dma_coherent_ok(struct device *dev, dma_addr_t addr, - size_t size) -{ - u64 mask = DMA_BIT_MASK(32); - - if (dev && dev->coherent_dma_mask) - mask = dev->coherent_dma_mask; - return addr + size - 1 <= mask; -} - -static void * -swiotlb_alloc_buffer(struct device *dev, size_t size, dma_addr_t *dma_handle, - unsigned long attrs) -{ - phys_addr_t phys_addr; - - if (swiotlb_force == SWIOTLB_NO_FORCE) - goto out_warn; - - phys_addr = swiotlb_tbl_map_single(dev, - __phys_to_dma(dev, io_tlb_start), - 0, size, DMA_FROM_DEVICE, attrs); - if (phys_addr == SWIOTLB_MAP_ERROR) - goto out_warn; - - *dma_handle = 
__phys_to_dma(dev, phys_addr); - if (!dma_coherent_ok(dev, *dma_handle, size)) - goto out_unmap; - - memset(phys_to_virt(phys_addr), 0, size); - return phys_to_virt(phys_addr); - -out_unmap: - dev_warn(dev, "hwdev DMA mask = 0x%016Lx, dev_addr = 0x%016Lx\n", - (unsigned long long)dev->coherent_dma_mask, - (unsigned long long)*dma_handle); - - /* - * DMA_TO_DEVICE to avoid memcpy in unmap_single. - * DMA_ATTR_SKIP_CPU_SYNC is optional. - */ - swiotlb_tbl_unmap_single(dev, phys_addr, size, DMA_TO_DEVICE, - DMA_ATTR_SKIP_CPU_SYNC); -out_warn: - if (!(attrs & DMA_ATTR_NO_WARN) && printk_ratelimit()) { - dev_warn(dev, - "swiotlb: coherent allocation failed, size=%zu\n", - size); - dump_stack(); - } - return NULL; -} - -static bool swiotlb_free_buffer(struct device *dev, size_t size, - dma_addr_t dma_addr) -{ - phys_addr_t phys_addr = dma_to_phys(dev, dma_addr); - - WARN_ON_ONCE(irqs_disabled()); - - if (!is_swiotlb_buffer(phys_addr)) - return false; - - /* - * DMA_TO_DEVICE to avoid memcpy in swiotlb_tbl_unmap_single. - * DMA_ATTR_SKIP_CPU_SYNC is optional. - */ - swiotlb_tbl_unmap_single(dev, phys_addr, size, DMA_TO_DEVICE, - DMA_ATTR_SKIP_CPU_SYNC); - return true; -} - -static void -swiotlb_full(struct device *dev, size_t size, enum dma_data_direction dir, - int do_panic) -{ - if (swiotlb_force == SWIOTLB_NO_FORCE) - return; - - /* - * Ran out of IOMMU space for this operation. This is very bad. - * Unfortunately the drivers cannot handle this operation properly. - * unless they check for dma_mapping_error (most don't) - * When the mapping is small enough return a static buffer to limit - * the damage, or panic when the transfer is too big. 
- */ - dev_err_ratelimited(dev, "DMA: Out of SW-IOMMU space for %zu bytes\n", - size); - - if (size <= io_tlb_overflow || !do_panic) - return; - - if (dir == DMA_BIDIRECTIONAL) - panic("DMA: Random memory could be DMA accessed\n"); - if (dir == DMA_FROM_DEVICE) - panic("DMA: Random memory could be DMA written\n"); - if (dir == DMA_TO_DEVICE) - panic("DMA: Random memory could be DMA read\n"); -} - -/* - * Map a single buffer of the indicated size for DMA in streaming mode. The - * physical address to use is returned. - * - * Once the device is given the dma address, the device owns this memory until - * either swiotlb_unmap_page or swiotlb_dma_sync_single is performed. - */ -dma_addr_t swiotlb_map_page(struct device *dev, struct page *page, - unsigned long offset, size_t size, - enum dma_data_direction dir, - unsigned long attrs) -{ - phys_addr_t map, phys = page_to_phys(page) + offset; - dma_addr_t dev_addr = phys_to_dma(dev, phys); - - BUG_ON(dir == DMA_NONE); - /* - * If the address happens to be in the device's DMA window, - * we can safely return the device addr and not worry about bounce - * buffering it. - */ - if (dma_capable(dev, dev_addr, size) && swiotlb_force != SWIOTLB_FORCE) - return dev_addr; - - trace_swiotlb_bounced(dev, dev_addr, size, swiotlb_force); - - /* Oh well, have to allocate and map a bounce buffer. */ - map = map_single(dev, phys, size, dir, attrs); - if (map == SWIOTLB_MAP_ERROR) { - swiotlb_full(dev, size, dir, 1); - return __phys_to_dma(dev, io_tlb_overflow_buffer); - } - - dev_addr = __phys_to_dma(dev, map); - - /* Ensure that the address returned is DMA'ble */ - if (dma_capable(dev, dev_addr, size)) - return dev_addr; - - attrs |= DMA_ATTR_SKIP_CPU_SYNC; - swiotlb_tbl_unmap_single(dev, map, size, dir, attrs); - - return __phys_to_dma(dev, io_tlb_overflow_buffer); -} - -/* - * Unmap a single streaming mode DMA translation. The dma_addr and size must - * match what was provided for in a previous swiotlb_map_page call. 
All - * other usages are undefined. - * - * After this call, reads by the cpu to the buffer are guaranteed to see - * whatever the device wrote there. - */ -static void unmap_single(struct device *hwdev, dma_addr_t dev_addr, - size_t size, enum dma_data_direction dir, - unsigned long attrs) -{ - phys_addr_t paddr = dma_to_phys(hwdev, dev_addr); - - BUG_ON(dir == DMA_NONE); - - if (is_swiotlb_buffer(paddr)) { - swiotlb_tbl_unmap_single(hwdev, paddr, size, dir, attrs); - return; - } - - if (dir != DMA_FROM_DEVICE) - return; - - /* - * phys_to_virt doesn't work with hihgmem page but we could - * call dma_mark_clean() with hihgmem page here. However, we - * are fine since dma_mark_clean() is null on POWERPC. We can - * make dma_mark_clean() take a physical address if necessary. - */ - dma_mark_clean(phys_to_virt(paddr), size); -} - -void swiotlb_unmap_page(struct device *hwdev, dma_addr_t dev_addr, - size_t size, enum dma_data_direction dir, - unsigned long attrs) -{ - unmap_single(hwdev, dev_addr, size, dir, attrs); -} - -/* - * Make physical memory consistent for a single streaming mode DMA translation - * after a transfer. - * - * If you perform a swiotlb_map_page() but wish to interrogate the buffer - * using the cpu, yet do not wish to teardown the dma mapping, you must - * call this function before doing so. 
At the next point you give the dma - * address back to the card, you must first perform a - * swiotlb_dma_sync_for_device, and then the device again owns the buffer - */ -static void -swiotlb_sync_single(struct device *hwdev, dma_addr_t dev_addr, - size_t size, enum dma_data_direction dir, - enum dma_sync_target target) -{ - phys_addr_t paddr = dma_to_phys(hwdev, dev_addr); - - BUG_ON(dir == DMA_NONE); - - if (is_swiotlb_buffer(paddr)) { - swiotlb_tbl_sync_single(hwdev, paddr, size, dir, target); - return; - } - - if (dir != DMA_FROM_DEVICE) - return; - - dma_mark_clean(phys_to_virt(paddr), size); -} - -void -swiotlb_sync_single_for_cpu(struct device *hwdev, dma_addr_t dev_addr, - size_t size, enum dma_data_direction dir) -{ - swiotlb_sync_single(hwdev, dev_addr, size, dir, SYNC_FOR_CPU); -} - -void -swiotlb_sync_single_for_device(struct device *hwdev, dma_addr_t dev_addr, - size_t size, enum dma_data_direction dir) -{ - swiotlb_sync_single(hwdev, dev_addr, size, dir, SYNC_FOR_DEVICE); -} - -/* - * Map a set of buffers described by scatterlist in streaming mode for DMA. - * This is the scatter-gather version of the above swiotlb_map_page - * interface. Here the scatter gather list elements are each tagged with the - * appropriate dma address and length. They are obtained via - * sg_dma_{address,length}(SG). - * - * NOTE: An implementation may be able to use a smaller number of - * DMA address/length pairs than there are SG table elements. - * (for example via virtual mapping capabilities) - * The routine returns the number of addr/length pairs actually - * used, at most nents. - * - * Device ownership issues as mentioned above for swiotlb_map_page are the - * same here. 
- */ -int -swiotlb_map_sg_attrs(struct device *hwdev, struct scatterlist *sgl, int nelems, - enum dma_data_direction dir, unsigned long attrs) -{ - struct scatterlist *sg; - int i; - - BUG_ON(dir == DMA_NONE); - - for_each_sg(sgl, sg, nelems, i) { - phys_addr_t paddr = sg_phys(sg); - dma_addr_t dev_addr = phys_to_dma(hwdev, paddr); - - if (swiotlb_force == SWIOTLB_FORCE || - !dma_capable(hwdev, dev_addr, sg->length)) { - phys_addr_t map = map_single(hwdev, sg_phys(sg), - sg->length, dir, attrs); - if (map == SWIOTLB_MAP_ERROR) { - /* Don't panic here, we expect map_sg users - to do proper error handling. */ - swiotlb_full(hwdev, sg->length, dir, 0); - attrs |= DMA_ATTR_SKIP_CPU_SYNC; - swiotlb_unmap_sg_attrs(hwdev, sgl, i, dir, - attrs); - sg_dma_len(sgl) = 0; - return 0; - } - sg->dma_address = __phys_to_dma(hwdev, map); - } else - sg->dma_address = dev_addr; - sg_dma_len(sg) = sg->length; - } - return nelems; -} - -/* - * Unmap a set of streaming mode DMA translations. Again, cpu read rules - * concerning calls here are the same as for swiotlb_unmap_page() above. - */ -void -swiotlb_unmap_sg_attrs(struct device *hwdev, struct scatterlist *sgl, - int nelems, enum dma_data_direction dir, - unsigned long attrs) -{ - struct scatterlist *sg; - int i; - - BUG_ON(dir == DMA_NONE); - - for_each_sg(sgl, sg, nelems, i) - unmap_single(hwdev, sg->dma_address, sg_dma_len(sg), dir, - attrs); -} - -/* - * Make physical memory consistent for a set of streaming mode DMA translations - * after a transfer. - * - * The same as swiotlb_sync_single_* but for a scatter-gather list, same rules - * and usage. 
- */ -static void -swiotlb_sync_sg(struct device *hwdev, struct scatterlist *sgl, - int nelems, enum dma_data_direction dir, - enum dma_sync_target target) -{ - struct scatterlist *sg; - int i; - - for_each_sg(sgl, sg, nelems, i) - swiotlb_sync_single(hwdev, sg->dma_address, - sg_dma_len(sg), dir, target); -} - -void -swiotlb_sync_sg_for_cpu(struct device *hwdev, struct scatterlist *sg, - int nelems, enum dma_data_direction dir) -{ - swiotlb_sync_sg(hwdev, sg, nelems, dir, SYNC_FOR_CPU); -} - -void -swiotlb_sync_sg_for_device(struct device *hwdev, struct scatterlist *sg, - int nelems, enum dma_data_direction dir) -{ - swiotlb_sync_sg(hwdev, sg, nelems, dir, SYNC_FOR_DEVICE); -} - -int -swiotlb_dma_mapping_error(struct device *hwdev, dma_addr_t dma_addr) -{ - return (dma_addr == __phys_to_dma(hwdev, io_tlb_overflow_buffer)); -} - -/* - * Return whether the given device DMA address mask can be supported - * properly. For example, if your device can only drive the low 24-bits - * during bus mastering, then you would pass 0x00ffffff as the mask to - * this function. - */ -int -swiotlb_dma_supported(struct device *hwdev, u64 mask) -{ - return __phys_to_dma(hwdev, io_tlb_end - 1) <= mask; -} - -void *swiotlb_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle, - gfp_t gfp, unsigned long attrs) -{ - void *vaddr; - - /* temporary workaround: */ - if (gfp & __GFP_NOWARN) - attrs |= DMA_ATTR_NO_WARN; - - /* - * Don't print a warning when the first allocation attempt fails. - * swiotlb_alloc_coherent() will print a warning when the DMA memory - * allocation ultimately failed. 
- */ - gfp |= __GFP_NOWARN; - - vaddr = dma_direct_alloc(dev, size, dma_handle, gfp, attrs); - if (!vaddr) - vaddr = swiotlb_alloc_buffer(dev, size, dma_handle, attrs); - return vaddr; -} - -void swiotlb_free(struct device *dev, size_t size, void *vaddr, - dma_addr_t dma_addr, unsigned long attrs) -{ - if (!swiotlb_free_buffer(dev, size, dma_addr)) - dma_direct_free(dev, size, vaddr, dma_addr, attrs); -} - -const struct dma_map_ops swiotlb_dma_ops = { - .mapping_error = swiotlb_dma_mapping_error, - .alloc = swiotlb_alloc, - .free = swiotlb_free, - .sync_single_for_cpu = swiotlb_sync_single_for_cpu, - .sync_single_for_device = swiotlb_sync_single_for_device, - .sync_sg_for_cpu = swiotlb_sync_sg_for_cpu, - .sync_sg_for_device = swiotlb_sync_sg_for_device, - .map_sg = swiotlb_map_sg_attrs, - .unmap_sg = swiotlb_unmap_sg_attrs, - .map_page = swiotlb_map_page, - .unmap_page = swiotlb_unmap_page, - .dma_supported = dma_direct_supported, -}; -- cgit v1.2.3 From 290c3982f66ab750e85863efcb1fdd736985e5d2 Mon Sep 17 00:00:00 2001 From: Steve French Date: Wed, 13 Jun 2018 16:46:56 -0500 Subject: cifs: minor documentation updates Various minor cifs/smb3 documentation updates Signed-off-by: Steve French Reviewed-by: Ronnie Sahlberg --- Documentation/filesystems/cifs/AUTHORS | 7 ++++--- Documentation/filesystems/cifs/CHANGES | 3 +++ Documentation/filesystems/cifs/TODO | 17 +++++++++-------- 3 files changed, 16 insertions(+), 11 deletions(-) (limited to 'Documentation') diff --git a/Documentation/filesystems/cifs/AUTHORS b/Documentation/filesystems/cifs/AUTHORS index 9f4f87e16240..75865da2ce14 100644 --- a/Documentation/filesystems/cifs/AUTHORS +++ b/Documentation/filesystems/cifs/AUTHORS @@ -42,9 +42,11 @@ Jeff Layton (many, many fixes, as well as great work on the cifs Kerberos code) Scott Lovenberg Pavel Shilovsky (for great work adding SMB2 support, and various SMB3 features) Aurelien Aptel (for DFS SMB3 work and some key bug fixes) -Ronnie Sahlberg (for SMB3 xattr work and 
bug fixes) +Ronnie Sahlberg (for SMB3 xattr work, bug fixes, and lots of great work on compounding) Shirish Pargaonkar (for many ACL patches over the years) Sachin Prabhu (many bug fixes, including for reconnect, copy offload and security) +Paulo Alcantara +Long Li (some great work on RDMA, SMB Direct) Test case and Bug Report contributors @@ -58,5 +60,4 @@ mention to the Stanford Checker (SWAT) which pointed out many minor bugs in error paths. Valuable suggestions also have come from Al Viro and Dave Miller. -And thanks to the IBM LTC and Power test teams and SuSE testers for -finding multiple bugs during excellent stress test runs. +And thanks to the IBM LTC and Power test teams and SuSE and Citrix and RedHat testers for finding multiple bugs during excellent stress test runs. diff --git a/Documentation/filesystems/cifs/CHANGES b/Documentation/filesystems/cifs/CHANGES index bc0025cdd1c9..455e1cc494a9 100644 --- a/Documentation/filesystems/cifs/CHANGES +++ b/Documentation/filesystems/cifs/CHANGES @@ -1,3 +1,6 @@ +See https://wiki.samba.org/index.php/LinuxCIFSKernel for +more current information. + Version 1.62 ------------ Add sockopt=TCP_NODELAY mount option. 
EA (xattr) routines hardened diff --git a/Documentation/filesystems/cifs/TODO b/Documentation/filesystems/cifs/TODO index c5adf149b57f..852499aed64b 100644 --- a/Documentation/filesystems/cifs/TODO +++ b/Documentation/filesystems/cifs/TODO @@ -9,14 +9,14 @@ is a partial list of the known problems and missing features: a) SMB3 (and SMB3.02) missing optional features: - multichannel (started), integration with RDMA - - directory leases (improved metadata caching) - - T10 copy offload (copy chunk, and "Duplicate Extents" ioctl + - directory leases (improved metadata caching), started (root dir only) + - T10 copy offload ie "ODX" (copy chunk, and "Duplicate Extents" ioctl currently the only two server side copy mechanisms supported) b) improved sparse file support c) Directory entry caching relies on a 1 second timer, rather than -using Directory Leases +using Directory Leases, currently only the root file handle is cached longer d) quota support (needs minor kernel change since quota calls to make it to network filesystems or deviceless filesystems) @@ -42,6 +42,8 @@ mount or a per server basis to client UIDs or nobody if no mapping exists. Also better integration with winbind for resolving SID owners k) Add tools to take advantage of more smb3 specific ioctls and features +(passthrough ioctl/fsctl for sending various SMB3 fsctls to the server +is in progress) l) encrypted file support @@ -71,9 +73,8 @@ t) split cifs and smb3 support into separate modules so legacy (and less secure) CIFS dialect can be disabled in environments that don't need it and simplify the code. -u) Finish up SMB3.1.1 dialect support - -v) POSIX Extensions for SMB3.1.1 +v) POSIX Extensions for SMB3.1.1 (started, create and mkdir support added +so far). KNOWN BUGS ==================================== @@ -92,8 +93,8 @@ Misc testing to do 1) check out max path names and max path name components against various server types. Try nested symlinks (8 deep). 
Return max path name in stat -f information -2) Improve xfstest's cifs enablement and adapt xfstests where needed to test -cifs better +2) Improve xfstest's cifs/smb3 enablement and adapt xfstests where needed to test +cifs/smb3 better 3) Additional performance testing and optimization using iozone and similar - there are some easy changes that can be done to parallelize sequential writes, -- cgit v1.2.3 From 3ed1d012ac3e60e0e95cda6fbd59352ec6dcbb88 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sat, 16 Jun 2018 17:09:41 -0700 Subject: Fix Documentation build due to rename of main.c to mtrr.c This fixes this documentation build error that is due to a file rename: Error: Cannot open file ../arch/x86/kernel/cpu/mtrr/main.c Fixes: 0afe832e55a7 ("Merge branch 'x86-cleanups-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip") Signed-off-by: Randy Dunlap Signed-off-by: Linus Torvalds --- Documentation/core-api/kernel-api.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'Documentation') diff --git a/Documentation/core-api/kernel-api.rst b/Documentation/core-api/kernel-api.rst index 8e44aea366c2..76fe2d0f5e7d 100644 --- a/Documentation/core-api/kernel-api.rst +++ b/Documentation/core-api/kernel-api.rst @@ -284,7 +284,7 @@ Resources Management MTRR Handling ------------- -.. kernel-doc:: arch/x86/kernel/cpu/mtrr/main.c +.. kernel-doc:: arch/x86/kernel/cpu/mtrr/mtrr.c :export: Security Framework -- cgit v1.2.3 From 3ec148ebe3112b40c9a2c0c543bcb0cd1a3abd43 Mon Sep 17 00:00:00 2001 From: Jaejoong Kim Date: Thu, 14 Jun 2018 18:56:31 +0900 Subject: doc: usb: Fix typo in gadget_configfs documentation Fix the directory name from 'configfs' to 'configs'. 
Signed-off-by: Jaejoong Kim Signed-off-by: Felipe Balbi --- Documentation/usb/gadget_configfs.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'Documentation') diff --git a/Documentation/usb/gadget_configfs.txt b/Documentation/usb/gadget_configfs.txt index 635e57493709..b8cb38a98c19 100644 --- a/Documentation/usb/gadget_configfs.txt +++ b/Documentation/usb/gadget_configfs.txt @@ -226,7 +226,7 @@ $ rm configs/./ where . specify the configuration and is a symlink to a function being removed from the configuration, e.g.: -$ rm configfs/c.1/ncm.usb0 +$ rm configs/c.1/ncm.usb0 ... ... -- cgit v1.2.3 From 7a0f9d1eb51ff25d119b48fe7cc6aa0433cd6621 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 20 Jun 2018 10:42:07 +0200 Subject: Documentation: intel_pstate: Fix typo Fix a typo in the intel_pstate admin-guide documentation. Signed-off-by: Rafael J. Wysocki --- Documentation/admin-guide/pm/intel_pstate.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'Documentation') diff --git a/Documentation/admin-guide/pm/intel_pstate.rst b/Documentation/admin-guide/pm/intel_pstate.rst index ab2fe0eda1d7..8b9164990956 100644 --- a/Documentation/admin-guide/pm/intel_pstate.rst +++ b/Documentation/admin-guide/pm/intel_pstate.rst @@ -410,7 +410,7 @@ argument is passed to the kernel in the command line. That only is supported in some configurations, though (for example, if the `HWP feature is enabled in the processor `_, the operation mode of the driver cannot be changed), and if it is not - supported in the current configuration, writes to this attribute with + supported in the current configuration, writes to this attribute will fail with an appropriate error. 
Interpretation of Policy Attributes -- cgit v1.2.3 From 064f35a952246c60e956717dfc5782c48f174e74 Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Thu, 14 Jun 2018 15:48:59 -0700 Subject: tracing: Fix some errors in histogram documentation Fix typos, inconsistencies in using quotes, incorrect section number, etc. in the trace histogram documentation. Link: http://lkml.kernel.org/r/20180614224859.55864-1-joel@joelfernandes.org Reviewed-by: Masami Hiramatsu Acked-by: Tom Zanussi Signed-off-by: Joel Fernandes (Google) Signed-off-by: Steven Rostedt (VMware) --- Documentation/trace/histogram.txt | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) (limited to 'Documentation') diff --git a/Documentation/trace/histogram.txt b/Documentation/trace/histogram.txt index e73bcf9cb5f3..7ffea6aa22e3 100644 --- a/Documentation/trace/histogram.txt +++ b/Documentation/trace/histogram.txt @@ -1729,35 +1729,35 @@ If a variable isn't a key variable or prefixed with 'vals=', the associated event field will be saved in a variable but won't be summed as a value: - # echo 'hist:keys=next_pid:ts1=common_timestamp ... >> event/trigger + # echo 'hist:keys=next_pid:ts1=common_timestamp ...' >> event/trigger Multiple variables can be assigned at the same time. The below would result in both ts0 and b being created as variables, with both common_timestamp and field1 additionally being summed as values: - # echo 'hist:keys=pid:vals=$ts0,$b:ts0=common_timestamp,b=field1 ... >> \ + # echo 'hist:keys=pid:vals=$ts0,$b:ts0=common_timestamp,b=field1 ...' >> \ event/trigger Note that variable assignments can appear either preceding or following their use. The command below behaves identically to the command above: - # echo 'hist:keys=pid:ts0=common_timestamp,b=field1:vals=$ts0,$b ... >> \ + # echo 'hist:keys=pid:ts0=common_timestamp,b=field1:vals=$ts0,$b ...' 
>> \ event/trigger Any number of variables not bound to a 'vals=' prefix can also be assigned by simply separating them with colons. Below is the same thing but without the values being summed in the histogram: - # echo 'hist:keys=pid:ts0=common_timestamp:b=field1 ... >> event/trigger + # echo 'hist:keys=pid:ts0=common_timestamp:b=field1 ...' >> event/trigger Variables set as above can be referenced and used in expressions on another event. For example, here's how a latency can be calculated: - # echo 'hist:keys=pid,prio:ts0=common_timestamp ... >> event1/trigger - # echo 'hist:keys=next_pid:wakeup_lat=common_timestamp-$ts0 ... >> event2/trigger + # echo 'hist:keys=pid,prio:ts0=common_timestamp ...' >> event1/trigger + # echo 'hist:keys=next_pid:wakeup_lat=common_timestamp-$ts0 ...' >> event2/trigger In the first line above, the event's timetamp is saved into the variable ts0. In the next line, ts0 is subtracted from the second @@ -1766,7 +1766,7 @@ yet another variable, 'wakeup_lat'. The hist trigger below in turn makes use of the wakeup_lat variable to compute a combined latency using the same key and variable from yet another event: - # echo 'hist:key=pid:wakeupswitch_lat=$wakeup_lat+$switchtime_lat ... >> event3/trigger + # echo 'hist:key=pid:wakeupswitch_lat=$wakeup_lat+$switchtime_lat ...' >> event3/trigger 2.2.2 Synthetic Events ---------------------- @@ -1807,10 +1807,11 @@ the command that defined it with a '!': At this point, there isn't yet an actual 'wakeup_latency' event instantiated in the event subsytem - for this to happen, a 'hist trigger action' needs to be instantiated and bound to actual fields -and variables defined on other events (see Section 6.3.3 below). +and variables defined on other events (see Section 2.2.3 below on +how that is done using hist trigger 'onmatch' action). Once that is +done, the 'wakeup_latency' synthetic event instance is created. 
-Once that is done, an event instance is created, and a histogram can -be defined using it: +A histogram can now be defined for the new synthetic event: # echo 'hist:keys=pid,prio,lat.log2:sort=pid,lat' >> \ /sys/kernel/debug/tracing/events/synthetic/wakeup_latency/trigger @@ -1960,7 +1961,7 @@ hist trigger specification. back to that pid, the timestamp difference is calculated. If the resulting latency, stored in wakeup_lat, exceeds the current maximum latency, the values specified in the save() fields are - recoreded: + recorded: # echo 'hist:keys=pid:ts0=common_timestamp.usecs \ if comm=="cyclictest"' >> \ -- cgit v1.2.3 From 2ddc649810133fcf8e5282eea898ee7ececf161e Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Fri, 22 Jun 2018 16:56:14 +0200 Subject: KVM: fix KVM_CAP_HYPERV_TLBFLUSH paragraph number MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit KVM_CAP_HYPERV_TLBFLUSH collided with KVM_CAP_S390_PSW-BPB, its paragraph number should now be 8.18. Signed-off-by: Vitaly Kuznetsov Signed-off-by: Radim Krčmář --- Documentation/virtual/kvm/api.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'Documentation') diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 495b7742ab58..d10944e619d3 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -4610,7 +4610,7 @@ This capability indicates that kvm will implement the interfaces to handle reset, migration and nested KVM for branch prediction blocking. The stfle facility 82 should not be provided to the guest without this capability. -8.14 KVM_CAP_HYPERV_TLBFLUSH +8.18 KVM_CAP_HYPERV_TLBFLUSH Architectures: x86 -- cgit v1.2.3 From 32e6996ca697099a4be5cd0dbd4b436699ca81a9 Mon Sep 17 00:00:00 2001 From: "Tobin C. Harding" Date: Fri, 22 Jun 2018 10:37:05 +1000 Subject: Documentation: e100: Use correct heading adornment Recently documentation file was converted to rst. 
The document title has the incorrect heading adornment. From kernel docs: * Please stick to this order of heading adornments: 1. ``=`` with overline for document title:: ============== Document title ============== Add overline heading adornment to document title. Fixes commit (85d63445f411 Documentation: e100: Update the Intel 10/100 driver doc) CC: Jeff Kirsher Signed-off-by: Tobin C. Harding Acked-by: Jeff Kirsher Signed-off-by: David S. Miller --- Documentation/networking/e100.rst | 1 + 1 file changed, 1 insertion(+) (limited to 'Documentation') diff --git a/Documentation/networking/e100.rst b/Documentation/networking/e100.rst index d4d837027925..59b80608e27d 100644 --- a/Documentation/networking/e100.rst +++ b/Documentation/networking/e100.rst @@ -1,3 +1,4 @@ +============================================================== Linux* Base Driver for the Intel(R) PRO/100 Family of Adapters ============================================================== -- cgit v1.2.3 From 3be40e54764d3ac9c004dc1646ab2c4cc0f0905e Mon Sep 17 00:00:00 2001 From: "Tobin C. Harding" Date: Fri, 22 Jun 2018 10:37:06 +1000 Subject: Documentation: e1000: Use correct heading adornment Recently documentation file was converted to rst. The document title has the incorrect heading adornment. From kernel docs: * Please stick to this order of heading adornments: 1. ``=`` with overline for document title:: ============== Document title ============== Add overline heading adornment to document title. Fixes commit (228046e76189 Documentation: e1000: Update kernel documentation) CC: Jeff Kirsher Signed-off-by: Tobin C. Harding Acked-by: Jeff Kirsher Signed-off-by: David S. 
Miller --- Documentation/networking/e1000.rst | 1 + 1 file changed, 1 insertion(+) (limited to 'Documentation') diff --git a/Documentation/networking/e1000.rst b/Documentation/networking/e1000.rst index 616848940e63..55f28e5043b6 100644 --- a/Documentation/networking/e1000.rst +++ b/Documentation/networking/e1000.rst @@ -1,3 +1,4 @@ +=========================================================== Linux* Base Driver for Intel(R) Ethernet Network Connection =========================================================== -- cgit v1.2.3 From 3b0c3ebe2a42ce18a59828acc4578166367dc7b5 Mon Sep 17 00:00:00 2001 From: "Tobin C. Harding" Date: Fri, 22 Jun 2018 10:37:07 +1000 Subject: Documentation: e100: Fix docs build error Recent patch updated e100 docs to rst format. Docs build (`make htmldocs`) is currently failing due to this file with error: (SEVERE/4) Unexpected section title. This is because a section of the file is indented 2 spaces. Build error can be cleared by aligning the text with column 0. While we are changing these lines we can make sure line length does not exceed 72, that newlines following headings are uniform, and that full stops are followed by two spaces. Align text with column 0, limit line length to 72, ensure two spaces follow all full stops, ensure uniform use of newlines after heading. Fixes commit (85d63445f411 Documentation: e100: Update the Intel 10/100 driver doc) CC: Jeff Kirsher Signed-off-by: Tobin C. Harding Acked-by: Jeff Kirsher Signed-off-by: David S. 
Miller --- Documentation/networking/e100.rst | 115 +++++++++++++++++++------------------- 1 file changed, 58 insertions(+), 57 deletions(-) (limited to 'Documentation') diff --git a/Documentation/networking/e100.rst b/Documentation/networking/e100.rst index 59b80608e27d..9708f5fa76de 100644 --- a/Documentation/networking/e100.rst +++ b/Documentation/networking/e100.rst @@ -87,83 +87,84 @@ Event Log Message Level: The driver uses the message level flag to log events Additional Configurations ========================= - Configuring the Driver on Different Distributions - ------------------------------------------------- - - Configuring a network driver to load properly when the system is started is - distribution dependent. Typically, the configuration process involves adding - an alias line to /etc/modprobe.d/*.conf as well as editing other system - startup scripts and/or configuration files. Many popular Linux - distributions ship with tools to make these changes for you. To learn the - proper way to configure a network device for your system, refer to your - distribution documentation. If during this process you are asked for the - driver or module name, the name for the Linux Base Driver for the Intel - PRO/100 Family of Adapters is e100. - - As an example, if you install the e100 driver for two PRO/100 adapters - (eth0 and eth1), add the following to a configuration file in /etc/modprobe.d/ +Configuring the Driver on Different Distributions +------------------------------------------------- + +Configuring a network driver to load properly when the system is started +is distribution dependent. Typically, the configuration process involves +adding an alias line to /etc/modprobe.d/*.conf as well as editing other +system startup scripts and/or configuration files. Many popular Linux +distributions ship with tools to make these changes for you. To learn +the proper way to configure a network device for your system, refer to +your distribution documentation. 
If during this process you are asked +for the driver or module name, the name for the Linux Base Driver for +the Intel PRO/100 Family of Adapters is e100. + +As an example, if you install the e100 driver for two PRO/100 adapters +(eth0 and eth1), add the following to a configuration file in +/etc/modprobe.d/:: alias eth0 e100 alias eth1 e100 - Viewing Link Messages - --------------------- - In order to see link messages and other Intel driver information on your - console, you must set the dmesg level up to six. This can be done by - entering the following on the command line before loading the e100 driver:: +Viewing Link Messages +--------------------- - dmesg -n 6 - - If you wish to see all messages issued by the driver, including debug - messages, set the dmesg level to eight. +In order to see link messages and other Intel driver information on your +console, you must set the dmesg level up to six. This can be done by +entering the following on the command line before loading the e100 +driver:: - NOTE: This setting is not saved across reboots. + dmesg -n 6 +If you wish to see all messages issued by the driver, including debug +messages, set the dmesg level to eight. - ethtool - ------- +NOTE: This setting is not saved across reboots. - The driver utilizes the ethtool interface for driver configuration and - diagnostics, as well as displaying statistical information. The ethtool - version 1.6 or later is required for this functionality. +ethtool +------- - The latest release of ethtool can be found from - https://www.kernel.org/pub/software/network/ethtool/ +The driver utilizes the ethtool interface for driver configuration and +diagnostics, as well as displaying statistical information. The ethtool +version 1.6 or later is required for this functionality. - Enabling Wake on LAN* (WoL) - --------------------------- - WoL is provided through the ethtool* utility. For instructions on enabling - WoL with ethtool, refer to the ethtool man page. 
+The latest release of ethtool can be found from +https://www.kernel.org/pub/software/network/ethtool/ - WoL will be enabled on the system during the next shut down or reboot. For - this driver version, in order to enable WoL, the e100 driver must be - loaded when shutting down or rebooting the system. +Enabling Wake on LAN* (WoL) +--------------------------- +WoL is provided through the ethtool* utility. For instructions on +enabling WoL with ethtool, refer to the ethtool man page. WoL will be +enabled on the system during the next shut down or reboot. For this +driver version, in order to enable WoL, the e100 driver must be loaded +when shutting down or rebooting the system. - NAPI - ---- +NAPI +---- - NAPI (Rx polling mode) is supported in the e100 driver. +NAPI (Rx polling mode) is supported in the e100 driver. - See https://wiki.linuxfoundation.org/networking/napi for more information - on NAPI. +See https://wiki.linuxfoundation.org/networking/napi for more +information on NAPI. - Multiple Interfaces on Same Ethernet Broadcast Network - ------------------------------------------------------ +Multiple Interfaces on Same Ethernet Broadcast Network +------------------------------------------------------ - Due to the default ARP behavior on Linux, it is not possible to have - one system on two IP networks in the same Ethernet broadcast domain - (non-partitioned switch) behave as expected. All Ethernet interfaces - will respond to IP traffic for any IP address assigned to the system. - This results in unbalanced receive traffic. +Due to the default ARP behavior on Linux, it is not possible to have one +system on two IP networks in the same Ethernet broadcast domain +(non-partitioned switch) behave as expected. All Ethernet interfaces +will respond to IP traffic for any IP address assigned to the system. +This results in unbalanced receive traffic. 
- If you have multiple interfaces in a server, either turn on ARP - filtering by +If you have multiple interfaces in a server, either turn on ARP +filtering by - (1) entering:: echo 1 > /proc/sys/net/ipv4/conf/all/arp_filter - (this only works if your kernel's version is higher than 2.4.5), or +(1) entering:: echo 1 > /proc/sys/net/ipv4/conf/all/arp_filter + (this only works if your kernel's version is higher than 2.4.5), or - (2) installing the interfaces in separate broadcast domains (either - in different switches or in a switch partitioned to VLANs). +(2) installing the interfaces in separate broadcast domains (either + in different switches or in a switch partitioned to VLANs). Support -- cgit v1.2.3 From 805f16a5f12fd68e10841013ccfaceb2f4d7066a Mon Sep 17 00:00:00 2001 From: "Tobin C. Harding" Date: Fri, 22 Jun 2018 10:37:08 +1000 Subject: Documentation: e1000: Fix docs build error Recent patch updated e1000 docs to rst format. Docs build (`make htmldocs`) is currently failing due to this file with error: (SEVERE/4) Unexpected section title. This is because a section of the file is indented 2 spaces. Build error can be cleared by aligning the text with column 0. While we are changing these lines we can make sure line length does not exceed 72, that newlines following headings are uniform, and that full stops are followed by two spaces. Align text with column 0, limit line length to 72, ensure two spaces follow all full stops, ensure uniform use of newlines after heading. Fixes commit (228046e76189 Documentation: e1000: Update kernel documentation) CC: Jeff Kirsher Signed-off-by: Tobin C. Harding Acked-by: Jeff Kirsher Signed-off-by: David S. 
Miller --- Documentation/networking/e1000.rst | 75 +++++++++++++++++++------------------- 1 file changed, 38 insertions(+), 37 deletions(-) (limited to 'Documentation') diff --git a/Documentation/networking/e1000.rst b/Documentation/networking/e1000.rst index 55f28e5043b6..144b87eef153 100644 --- a/Documentation/networking/e1000.rst +++ b/Documentation/networking/e1000.rst @@ -355,57 +355,58 @@ previously mentioned to force the adapter to the same speed and duplex. Additional Configurations ========================= - Jumbo Frames - ------------ - Jumbo Frames support is enabled by changing the MTU to a value larger than - the default of 1500. Use the ifconfig command to increase the MTU size. - For example:: +Jumbo Frames +------------ +Jumbo Frames support is enabled by changing the MTU to a value larger +than the default of 1500. Use the ifconfig command to increase the MTU +size. For example:: ifconfig eth<x> mtu 9000 up - This setting is not saved across reboots. It can be made permanent if - you add:: +This setting is not saved across reboots. It can be made permanent if +you add:: MTU=9000 - to the file /etc/sysconfig/network-scripts/ifcfg-eth<x>. This example - applies to the Red Hat distributions; other distributions may store this - setting in a different location. +to the file /etc/sysconfig/network-scripts/ifcfg-eth<x>. This example +applies to the Red Hat distributions; other distributions may store this +setting in a different location. - Notes: - Degradation in throughput performance may be observed in some Jumbo frames - environments. If this is observed, increasing the application's socket buffer - size and/or increasing the /proc/sys/net/ipv4/tcp_*mem entry values may help. - See the specific application manual and /usr/src/linux*/Documentation/ - networking/ip-sysctl.txt for more details. +Notes: Degradation in throughput performance may be observed in some +Jumbo frames environments.
If this is observed, increasing the +application's socket buffer size and/or increasing the +/proc/sys/net/ipv4/tcp_*mem entry values may help. See the specific +application manual and /usr/src/linux*/Documentation/ +networking/ip-sysctl.txt for more details. - - The maximum MTU setting for Jumbo Frames is 16110. This value coincides - with the maximum Jumbo Frames size of 16128. +- The maximum MTU setting for Jumbo Frames is 16110. This value + coincides with the maximum Jumbo Frames size of 16128. - - Using Jumbo frames at 10 or 100 Mbps is not supported and may result in - poor performance or loss of link. +- Using Jumbo frames at 10 or 100 Mbps is not supported and may result + in poor performance or loss of link. - - Adapters based on the Intel(R) 82542 and 82573V/E controller do not - support Jumbo Frames. These correspond to the following product names: - Intel(R) PRO/1000 Gigabit Server Adapter - Intel(R) PRO/1000 PM Network Connection +- Adapters based on the Intel(R) 82542 and 82573V/E controller do not + support Jumbo Frames. These correspond to the following product names: + Intel(R) PRO/1000 Gigabit Server Adapter Intel(R) PRO/1000 PM Network + Connection - ethtool - ------- - The driver utilizes the ethtool interface for driver configuration and - diagnostics, as well as displaying statistical information. The ethtool - version 1.6 or later is required for this functionality. +ethtool +------- +The driver utilizes the ethtool interface for driver configuration and +diagnostics, as well as displaying statistical information. The ethtool +version 1.6 or later is required for this functionality. + +The latest release of ethtool can be found from +https://www.kernel.org/pub/software/network/ethtool/ - The latest release of ethtool can be found from - https://www.kernel.org/pub/software/network/ethtool/ +Enabling Wake on LAN* (WoL) +--------------------------- +WoL is configured through the ethtool* utility. 
- Enabling Wake on LAN* (WoL) - --------------------------- - WoL is configured through the ethtool* utility. +WoL will be enabled on the system during the next shut down or reboot. +For this driver version, in order to enable WoL, the e1000 driver must be +loaded when shutting down or rebooting the system. - WoL will be enabled on the system during the next shut down or reboot. - For this driver version, in order to enable WoL, the e1000 driver must be - loaded when shutting down or rebooting the system. Support ======= -- cgit v1.2.3 From 3531456aba6c8a8c905730af96dbb83608538b71 Mon Sep 17 00:00:00 2001 From: Vakul Garg Date: Sun, 24 Jun 2018 18:14:01 +0530 Subject: strparser: Corrected typo in documentation. Replaced strp_pause() with strp_unpause() to correct a seemingly copy paste documentation mistake. Signed-off-by: Vakul Garg Signed-off-by: David S. Miller --- Documentation/networking/strparser.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'Documentation') diff --git a/Documentation/networking/strparser.txt b/Documentation/networking/strparser.txt index 13081b3decef..a7d354ddda7b 100644 --- a/Documentation/networking/strparser.txt +++ b/Documentation/networking/strparser.txt @@ -48,7 +48,7 @@ void strp_pause(struct strparser *strp) Temporarily pause a stream parser. Message parsing is suspended and no new messages are delivered to the upper layer. -void strp_pause(struct strparser *strp) +void strp_unpause(struct strparser *strp) Unpause a paused stream parser. -- cgit v1.2.3 From bdb60101df4a2999608430112a5abfb78628db1e Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 22 Jun 2018 20:08:21 -0700 Subject: kconfig: document Kconfig source file comments I saw this type of Kconfig construct on LKML: config SYMBOOL #bool "prompt string" default y and wondered what it does. Then I wondered if '#' comments are even documented. They aren't, so add a little doc for that. Ah, good. 
kconfig says: arch/x86/Kconfig:2942:warning: config symbol defined without type Signed-off-by: Randy Dunlap Signed-off-by: Masahiro Yamada --- Documentation/kbuild/kconfig-language.txt | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'Documentation') diff --git a/Documentation/kbuild/kconfig-language.txt b/Documentation/kbuild/kconfig-language.txt index 3534a84d206c..64e0775a62d4 100644 --- a/Documentation/kbuild/kconfig-language.txt +++ b/Documentation/kbuild/kconfig-language.txt @@ -430,6 +430,12 @@ This sets the config program's title bar if the config program chooses to use it. It should be placed at the top of the configuration, before any other statement. +'#' Kconfig source file comment: + +An unquoted '#' character anywhere in a source file line indicates +the beginning of a source file comment. The remainder of that line +is a comment. + Kconfig hints ------------- -- cgit v1.2.3 From 9e421b8fffb92bfd1c274f1deae611a6ab99d8a7 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 26 Jun 2018 12:09:51 +0200 Subject: Documentation: admin-guide: intel_pstate: Fix sysfs path Fix an incorrect sysfs path in the intel_pstate admin-guide documentation. Fixes: 33fc30b47098 (cpufreq: intel_pstate: Document the current behavior and user interface) Reported-by: Pawit Pornkitprasan Signed-off-by: Rafael J. Wysocki --- Documentation/admin-guide/pm/intel_pstate.rst | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'Documentation') diff --git a/Documentation/admin-guide/pm/intel_pstate.rst b/Documentation/admin-guide/pm/intel_pstate.rst index 8b9164990956..d74fb572f6cc 100644 --- a/Documentation/admin-guide/pm/intel_pstate.rst +++ b/Documentation/admin-guide/pm/intel_pstate.rst @@ -324,8 +324,7 @@ Global Attributes ``intel_pstate`` exposes several global attributes (files) in ``sysfs`` to control its functionality at the system level. They are located in the -``/sys/devices/system/cpu/cpufreq/intel_pstate/`` directory and affect all -CPUs. 
+``/sys/devices/system/cpu/intel_pstate/`` directory and affect all CPUs. Some of them are not present if the ``intel_pstate=per_cpu_perf_limits`` argument is passed to the kernel in the command line. -- cgit v1.2.3 From 649f53a3e4cf6873741673b9271275e484c56194 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 26 Jun 2018 17:20:43 +0200 Subject: Documentation: intel_pstate: Describe hwp_dynamic_boost sysfs knob Document the recently introduced hwp_dynamic_boost sysfs knob allowing user space to tell intel_pstate to use iowait boosting in the active mode with HWP enabled (to improve performance). Signed-off-by: Rafael J. Wysocki Acked-by: Srinivas Pandruvada --- Documentation/admin-guide/pm/intel_pstate.rst | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'Documentation') diff --git a/Documentation/admin-guide/pm/intel_pstate.rst b/Documentation/admin-guide/pm/intel_pstate.rst index d74fb572f6cc..8f1d3de449b5 100644 --- a/Documentation/admin-guide/pm/intel_pstate.rst +++ b/Documentation/admin-guide/pm/intel_pstate.rst @@ -378,6 +378,17 @@ argument is passed to the kernel in the command line. but it affects the maximum possible value of per-policy P-state limits (see `Interpretation of Policy Attributes`_ below for details). +``hwp_dynamic_boost`` + This attribute is only present if ``intel_pstate`` works in the + `active mode with the HWP feature enabled <Active Mode With HWP_>`_ in + the processor. If set (equal to 1), it causes the minimum P-state limit + to be increased dynamically for a short time whenever a task previously + waiting on I/O is selected to run on a given logical CPU (the purpose + of this mechanism is to improve performance). + + This setting has no effect on logical CPUs whose minimum P-state limit + is directly set to the highest non-turbo P-state or above it. + ..
_status_attr: ``status`` -- cgit v1.2.3 From a11e1d432b51f63ba698d044441284a661f01144 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 28 Jun 2018 09:43:44 -0700 Subject: Revert changes to convert to ->poll_mask() and aio IOCB_CMD_POLL The poll() changes were not well thought out, and completely unexplained. They also caused a huge performance regression, because "->poll()" was no longer a trivial file operation that just called down to the underlying file operations, but instead did at least two indirect calls. Indirect calls are sadly slow now with the Spectre mitigation, but the performance problem could at least be largely mitigated by changing the "->get_poll_head()" operation to just have a per-file-descriptor pointer to the poll head instead. That gets rid of one of the new indirections. But that doesn't fix the new complexity that is completely unwarranted for the regular case. The (undocumented) reason for the poll() changes was some alleged AIO poll race fixing, but we don't make the common case slower and more complex for some uncommon special case, so this all really needs way more explanations and most likely a fundamental redesign. 
[ This revert is a revert of about 30 different commits, not reverted individually because that would just be unnecessarily messy - Linus ] Cc: Al Viro Cc: Christoph Hellwig Signed-off-by: Linus Torvalds --- Documentation/filesystems/Locking | 7 +- Documentation/filesystems/vfs.txt | 13 ---- crypto/af_alg.c | 13 +++- crypto/algif_aead.c | 4 +- crypto/algif_skcipher.c | 4 +- drivers/char/random.c | 29 ++++---- drivers/isdn/mISDN/socket.c | 2 +- drivers/net/ppp/pppoe.c | 2 +- fs/aio.c | 148 +------------------------------------- fs/eventfd.c | 19 ++--- fs/eventpoll.c | 15 ++-- fs/pipe.c | 22 +++--- fs/select.c | 23 ------ fs/timerfd.c | 22 +++--- include/crypto/if_alg.h | 3 +- include/linux/fs.h | 2 - include/linux/net.h | 1 - include/linux/poll.h | 12 ++-- include/linux/skbuff.h | 3 +- include/net/bluetooth/bluetooth.h | 2 +- include/net/iucv/af_iucv.h | 2 + include/net/sctp/sctp.h | 3 +- include/net/tcp.h | 3 +- include/net/tls.h | 6 +- include/net/udp.h | 2 +- include/uapi/linux/aio_abi.h | 8 ++- net/appletalk/ddp.c | 2 +- net/atm/common.c | 11 ++- net/atm/common.h | 2 +- net/atm/pvc.c | 2 +- net/atm/svc.c | 2 +- net/ax25/af_ax25.c | 2 +- net/bluetooth/af_bluetooth.c | 7 +- net/bluetooth/hci_sock.c | 2 +- net/bluetooth/l2cap_sock.c | 2 +- net/bluetooth/rfcomm/sock.c | 2 +- net/bluetooth/sco.c | 2 +- net/caif/caif_socket.c | 12 ++-- net/can/bcm.c | 2 +- net/can/raw.c | 2 +- net/core/datagram.c | 13 ++-- net/dccp/dccp.h | 3 +- net/dccp/ipv4.c | 2 +- net/dccp/ipv6.c | 2 +- net/dccp/proto.c | 13 +++- net/decnet/af_decnet.c | 6 +- net/ieee802154/socket.c | 4 +- net/ipv4/af_inet.c | 8 +-- net/ipv4/tcp.c | 23 ++++-- net/ipv4/udp.c | 10 +-- net/ipv6/af_inet6.c | 4 +- net/ipv6/raw.c | 4 +- net/iucv/af_iucv.c | 7 +- net/kcm/kcmsock.c | 10 +-- net/key/af_key.c | 2 +- net/l2tp/l2tp_ip.c | 2 +- net/l2tp/l2tp_ip6.c | 2 +- net/l2tp/l2tp_ppp.c | 2 +- net/llc/af_llc.c | 2 +- net/netlink/af_netlink.c | 2 +- net/netrom/af_netrom.c | 2 +- net/nfc/llcp_sock.c | 9 ++- net/nfc/rawsock.c 
| 4 +- net/packet/af_packet.c | 9 +-- net/phonet/socket.c | 9 ++- net/qrtr/qrtr.c | 2 +- net/rose/af_rose.c | 2 +- net/rxrpc/af_rxrpc.c | 10 ++- net/sctp/ipv6.c | 2 +- net/sctp/protocol.c | 2 +- net/sctp/socket.c | 4 +- net/smc/af_smc.c | 12 +++- net/socket.c | 48 ++----------- net/tipc/socket.c | 14 ++-- net/tls/tls_main.c | 2 +- net/tls/tls_sw.c | 19 ++--- net/unix/af_unix.c | 30 +++++--- net/vmw_vsock/af_vsock.c | 19 +++-- net/x25/af_x25.c | 2 +- net/xdp/xsk.c | 7 +- 80 files changed, 301 insertions(+), 450 deletions(-) (limited to 'Documentation') diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index 2c391338c675..37bf0a9de75c 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking @@ -441,8 +441,6 @@ prototypes: int (*iterate) (struct file *, struct dir_context *); int (*iterate_shared) (struct file *, struct dir_context *); __poll_t (*poll) (struct file *, struct poll_table_struct *); - struct wait_queue_head * (*get_poll_head)(struct file *, __poll_t); - __poll_t (*poll_mask) (struct file *, __poll_t); long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long); long (*compat_ioctl) (struct file *, unsigned int, unsigned long); int (*mmap) (struct file *, struct vm_area_struct *); @@ -473,7 +471,7 @@ prototypes: }; locking rules: - All except for ->poll_mask may block. + All may block. ->llseek() locking has moved from llseek to the individual llseek implementations. If your fs is not using generic_file_llseek, you @@ -505,9 +503,6 @@ in sys_read() and friends. the lease within the individual filesystem to record the result of the operation -->poll_mask can be called with or without the waitqueue lock for the waitqueue -returned from ->get_poll_head. 
- --------------------------- dquot_operations ------------------------------- prototypes: int (*write_dquot) (struct dquot *); diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt index 829a7b7857a4..f608180ad59d 100644 --- a/Documentation/filesystems/vfs.txt +++ b/Documentation/filesystems/vfs.txt @@ -857,8 +857,6 @@ struct file_operations { ssize_t (*write_iter) (struct kiocb *, struct iov_iter *); int (*iterate) (struct file *, struct dir_context *); __poll_t (*poll) (struct file *, struct poll_table_struct *); - struct wait_queue_head * (*get_poll_head)(struct file *, __poll_t); - __poll_t (*poll_mask) (struct file *, __poll_t); long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long); long (*compat_ioctl) (struct file *, unsigned int, unsigned long); int (*mmap) (struct file *, struct vm_area_struct *); @@ -903,17 +901,6 @@ otherwise noted. activity on this file and (optionally) go to sleep until there is activity. Called by the select(2) and poll(2) system calls - get_poll_head: Returns the struct wait_queue_head that callers can - wait on. Callers need to check the returned events using ->poll_mask - once woken. Can return NULL to indicate polling is not supported, - or any error code using the ERR_PTR convention to indicate that a - grave error occured and ->poll_mask shall not be called. - - poll_mask: return the mask of EPOLL* values describing the file descriptor - state. Called either before going to sleep on the waitqueue returned by - get_poll_head, or after it has been woken. If ->get_poll_head and - ->poll_mask are implemented ->poll does not need to be implement. - unlocked_ioctl: called by the ioctl(2) system call. 
compat_ioctl: called by the ioctl(2) system call when 32 bit system calls diff --git a/crypto/af_alg.c b/crypto/af_alg.c index 49fa8582138b..314c52c967e5 100644 --- a/crypto/af_alg.c +++ b/crypto/af_alg.c @@ -1060,12 +1060,19 @@ void af_alg_async_cb(struct crypto_async_request *_req, int err) } EXPORT_SYMBOL_GPL(af_alg_async_cb); -__poll_t af_alg_poll_mask(struct socket *sock, __poll_t events) +/** + * af_alg_poll - poll system call handler + */ +__poll_t af_alg_poll(struct file *file, struct socket *sock, + poll_table *wait) { struct sock *sk = sock->sk; struct alg_sock *ask = alg_sk(sk); struct af_alg_ctx *ctx = ask->private; - __poll_t mask = 0; + __poll_t mask; + + sock_poll_wait(file, sk_sleep(sk), wait); + mask = 0; if (!ctx->more || ctx->used) mask |= EPOLLIN | EPOLLRDNORM; @@ -1075,7 +1082,7 @@ __poll_t af_alg_poll_mask(struct socket *sock, __poll_t events) return mask; } -EXPORT_SYMBOL_GPL(af_alg_poll_mask); +EXPORT_SYMBOL_GPL(af_alg_poll); /** * af_alg_alloc_areq - allocate struct af_alg_async_req diff --git a/crypto/algif_aead.c b/crypto/algif_aead.c index 825524f27438..c40a8c7ee8ae 100644 --- a/crypto/algif_aead.c +++ b/crypto/algif_aead.c @@ -375,7 +375,7 @@ static struct proto_ops algif_aead_ops = { .sendmsg = aead_sendmsg, .sendpage = af_alg_sendpage, .recvmsg = aead_recvmsg, - .poll_mask = af_alg_poll_mask, + .poll = af_alg_poll, }; static int aead_check_key(struct socket *sock) @@ -471,7 +471,7 @@ static struct proto_ops algif_aead_ops_nokey = { .sendmsg = aead_sendmsg_nokey, .sendpage = aead_sendpage_nokey, .recvmsg = aead_recvmsg_nokey, - .poll_mask = af_alg_poll_mask, + .poll = af_alg_poll, }; static void *aead_bind(const char *name, u32 type, u32 mask) diff --git a/crypto/algif_skcipher.c b/crypto/algif_skcipher.c index 4c04eb9888ad..cfdaab2b7d76 100644 --- a/crypto/algif_skcipher.c +++ b/crypto/algif_skcipher.c @@ -206,7 +206,7 @@ static struct proto_ops algif_skcipher_ops = { .sendmsg = skcipher_sendmsg, .sendpage = af_alg_sendpage, .recvmsg 
= skcipher_recvmsg, - .poll_mask = af_alg_poll_mask, + .poll = af_alg_poll, }; static int skcipher_check_key(struct socket *sock) @@ -302,7 +302,7 @@ static struct proto_ops algif_skcipher_ops_nokey = { .sendmsg = skcipher_sendmsg_nokey, .sendpage = skcipher_sendpage_nokey, .recvmsg = skcipher_recvmsg_nokey, - .poll_mask = af_alg_poll_mask, + .poll = af_alg_poll, }; static void *skcipher_bind(const char *name, u32 type, u32 mask) diff --git a/drivers/char/random.c b/drivers/char/random.c index a8fb0020ba5c..cd888d4ee605 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -402,7 +402,8 @@ static struct poolinfo { /* * Static global variables */ -static DECLARE_WAIT_QUEUE_HEAD(random_wait); +static DECLARE_WAIT_QUEUE_HEAD(random_read_wait); +static DECLARE_WAIT_QUEUE_HEAD(random_write_wait); static struct fasync_struct *fasync; static DEFINE_SPINLOCK(random_ready_list_lock); @@ -721,8 +722,8 @@ retry: /* should we wake readers? */ if (entropy_bits >= random_read_wakeup_bits && - wq_has_sleeper(&random_wait)) { - wake_up_interruptible_poll(&random_wait, POLLIN); + wq_has_sleeper(&random_read_wait)) { + wake_up_interruptible(&random_read_wait); kill_fasync(&fasync, SIGIO, POLL_IN); } /* If the input pool is getting full, send some @@ -1396,7 +1397,7 @@ retry: trace_debit_entropy(r->name, 8 * ibytes); if (ibytes && (r->entropy_count >> ENTROPY_SHIFT) < random_write_wakeup_bits) { - wake_up_interruptible_poll(&random_wait, POLLOUT); + wake_up_interruptible(&random_write_wait); kill_fasync(&fasync, SIGIO, POLL_OUT); } @@ -1838,7 +1839,7 @@ _random_read(int nonblock, char __user *buf, size_t nbytes) if (nonblock) return -EAGAIN; - wait_event_interruptible(random_wait, + wait_event_interruptible(random_read_wait, ENTROPY_BITS(&input_pool) >= random_read_wakeup_bits); if (signal_pending(current)) @@ -1875,17 +1876,14 @@ urandom_read(struct file *file, char __user *buf, size_t nbytes, loff_t *ppos) return ret; } -static struct wait_queue_head * 
-random_get_poll_head(struct file *file, __poll_t events) -{ - return &random_wait; -} - static __poll_t -random_poll_mask(struct file *file, __poll_t events) +random_poll(struct file *file, poll_table * wait) { - __poll_t mask = 0; + __poll_t mask; + poll_wait(file, &random_read_wait, wait); + poll_wait(file, &random_write_wait, wait); + mask = 0; if (ENTROPY_BITS(&input_pool) >= random_read_wakeup_bits) mask |= EPOLLIN | EPOLLRDNORM; if (ENTROPY_BITS(&input_pool) < random_write_wakeup_bits) @@ -1992,8 +1990,7 @@ static int random_fasync(int fd, struct file *filp, int on) const struct file_operations random_fops = { .read = random_read, .write = random_write, - .get_poll_head = random_get_poll_head, - .poll_mask = random_poll_mask, + .poll = random_poll, .unlocked_ioctl = random_ioctl, .fasync = random_fasync, .llseek = noop_llseek, @@ -2326,7 +2323,7 @@ void add_hwgenerator_randomness(const char *buffer, size_t count, * We'll be woken up again once below random_write_wakeup_thresh, * or when the calling thread is about to terminate. 
*/ - wait_event_interruptible(random_wait, kthread_should_stop() || + wait_event_interruptible(random_write_wait, kthread_should_stop() || ENTROPY_BITS(&input_pool) <= random_write_wakeup_bits); mix_pool_bytes(poolp, buffer, count); credit_entropy_bits(poolp, entropy); diff --git a/drivers/isdn/mISDN/socket.c b/drivers/isdn/mISDN/socket.c index 98f90aadd141..18c0a1281914 100644 --- a/drivers/isdn/mISDN/socket.c +++ b/drivers/isdn/mISDN/socket.c @@ -588,7 +588,7 @@ static const struct proto_ops data_sock_ops = { .getname = data_sock_getname, .sendmsg = mISDN_sock_sendmsg, .recvmsg = mISDN_sock_recvmsg, - .poll_mask = datagram_poll_mask, + .poll = datagram_poll, .listen = sock_no_listen, .shutdown = sock_no_shutdown, .setsockopt = data_sock_setsockopt, diff --git a/drivers/net/ppp/pppoe.c b/drivers/net/ppp/pppoe.c index de51e8f70f44..ce61231e96ea 100644 --- a/drivers/net/ppp/pppoe.c +++ b/drivers/net/ppp/pppoe.c @@ -1107,7 +1107,7 @@ static const struct proto_ops pppoe_ops = { .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = pppoe_getname, - .poll_mask = datagram_poll_mask, + .poll = datagram_poll, .listen = sock_no_listen, .shutdown = sock_no_shutdown, .setsockopt = sock_no_setsockopt, diff --git a/fs/aio.c b/fs/aio.c index e1d20124ec0e..210df9da1283 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -5,7 +5,6 @@ * Implements an efficient asynchronous io interface. * * Copyright 2000, 2001, 2002 Red Hat, Inc. All Rights Reserved. - * Copyright 2018 Christoph Hellwig. * * See ../COPYING for licensing terms. 
*/ @@ -165,22 +164,10 @@ struct fsync_iocb { bool datasync; }; -struct poll_iocb { - struct file *file; - __poll_t events; - struct wait_queue_head *head; - - union { - struct wait_queue_entry wait; - struct work_struct work; - }; -}; - struct aio_kiocb { union { struct kiocb rw; struct fsync_iocb fsync; - struct poll_iocb poll; }; struct kioctx *ki_ctx; @@ -1590,6 +1577,7 @@ static int aio_fsync(struct fsync_iocb *req, struct iocb *iocb, bool datasync) if (unlikely(iocb->aio_buf || iocb->aio_offset || iocb->aio_nbytes || iocb->aio_rw_flags)) return -EINVAL; + req->file = fget(iocb->aio_fildes); if (unlikely(!req->file)) return -EBADF; @@ -1604,137 +1592,6 @@ static int aio_fsync(struct fsync_iocb *req, struct iocb *iocb, bool datasync) return 0; } -/* need to use list_del_init so we can check if item was present */ -static inline bool __aio_poll_remove(struct poll_iocb *req) -{ - if (list_empty(&req->wait.entry)) - return false; - list_del_init(&req->wait.entry); - return true; -} - -static inline void __aio_poll_complete(struct aio_kiocb *iocb, __poll_t mask) -{ - fput(iocb->poll.file); - aio_complete(iocb, mangle_poll(mask), 0); -} - -static void aio_poll_work(struct work_struct *work) -{ - struct aio_kiocb *iocb = container_of(work, struct aio_kiocb, poll.work); - - if (!list_empty_careful(&iocb->ki_list)) - aio_remove_iocb(iocb); - __aio_poll_complete(iocb, iocb->poll.events); -} - -static int aio_poll_cancel(struct kiocb *iocb) -{ - struct aio_kiocb *aiocb = container_of(iocb, struct aio_kiocb, rw); - struct poll_iocb *req = &aiocb->poll; - struct wait_queue_head *head = req->head; - bool found = false; - - spin_lock(&head->lock); - found = __aio_poll_remove(req); - spin_unlock(&head->lock); - - if (found) { - req->events = 0; - INIT_WORK(&req->work, aio_poll_work); - schedule_work(&req->work); - } - return 0; -} - -static int aio_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, - void *key) -{ - struct poll_iocb *req = container_of(wait, 
struct poll_iocb, wait); - struct aio_kiocb *iocb = container_of(req, struct aio_kiocb, poll); - struct file *file = req->file; - __poll_t mask = key_to_poll(key); - - assert_spin_locked(&req->head->lock); - - /* for instances that support it check for an event match first: */ - if (mask && !(mask & req->events)) - return 0; - - mask = file->f_op->poll_mask(file, req->events) & req->events; - if (!mask) - return 0; - - __aio_poll_remove(req); - - /* - * Try completing without a context switch if we can acquire ctx_lock - * without spinning. Otherwise we need to defer to a workqueue to - * avoid a deadlock due to the lock order. - */ - if (spin_trylock(&iocb->ki_ctx->ctx_lock)) { - list_del_init(&iocb->ki_list); - spin_unlock(&iocb->ki_ctx->ctx_lock); - - __aio_poll_complete(iocb, mask); - } else { - req->events = mask; - INIT_WORK(&req->work, aio_poll_work); - schedule_work(&req->work); - } - - return 1; -} - -static ssize_t aio_poll(struct aio_kiocb *aiocb, struct iocb *iocb) -{ - struct kioctx *ctx = aiocb->ki_ctx; - struct poll_iocb *req = &aiocb->poll; - __poll_t mask; - - /* reject any unknown events outside the normal event mask. 
*/ - if ((u16)iocb->aio_buf != iocb->aio_buf) - return -EINVAL; - /* reject fields that are not defined for poll */ - if (iocb->aio_offset || iocb->aio_nbytes || iocb->aio_rw_flags) - return -EINVAL; - - req->events = demangle_poll(iocb->aio_buf) | EPOLLERR | EPOLLHUP; - req->file = fget(iocb->aio_fildes); - if (unlikely(!req->file)) - return -EBADF; - if (!file_has_poll_mask(req->file)) - goto out_fail; - - req->head = req->file->f_op->get_poll_head(req->file, req->events); - if (!req->head) - goto out_fail; - if (IS_ERR(req->head)) { - mask = EPOLLERR; - goto done; - } - - init_waitqueue_func_entry(&req->wait, aio_poll_wake); - aiocb->ki_cancel = aio_poll_cancel; - - spin_lock_irq(&ctx->ctx_lock); - spin_lock(&req->head->lock); - mask = req->file->f_op->poll_mask(req->file, req->events) & req->events; - if (!mask) { - __add_wait_queue(req->head, &req->wait); - list_add_tail(&aiocb->ki_list, &ctx->active_reqs); - } - spin_unlock(&req->head->lock); - spin_unlock_irq(&ctx->ctx_lock); -done: - if (mask) - __aio_poll_complete(aiocb, mask); - return 0; -out_fail: - fput(req->file); - return -EINVAL; /* same as no support for IOCB_CMD_POLL */ -} - static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, bool compat) { @@ -1808,9 +1665,6 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, case IOCB_CMD_FDSYNC: ret = aio_fsync(&req->fsync, &iocb, true); break; - case IOCB_CMD_POLL: - ret = aio_poll(req, &iocb); - break; default: pr_debug("invalid aio operation %d\n", iocb.aio_lio_opcode); ret = -EINVAL; diff --git a/fs/eventfd.c b/fs/eventfd.c index ceb1031f1cac..08d3bd602f73 100644 --- a/fs/eventfd.c +++ b/fs/eventfd.c @@ -101,20 +101,14 @@ static int eventfd_release(struct inode *inode, struct file *file) return 0; } -static struct wait_queue_head * -eventfd_get_poll_head(struct file *file, __poll_t events) -{ - struct eventfd_ctx *ctx = file->private_data; - - return &ctx->wqh; -} - -static __poll_t eventfd_poll_mask(struct 
file *file, __poll_t eventmask) +static __poll_t eventfd_poll(struct file *file, poll_table *wait) { struct eventfd_ctx *ctx = file->private_data; __poll_t events = 0; u64 count; + poll_wait(file, &ctx->wqh, wait); + /* * All writes to ctx->count occur within ctx->wqh.lock. This read * can be done outside ctx->wqh.lock because we know that poll_wait @@ -156,11 +150,11 @@ static __poll_t eventfd_poll_mask(struct file *file, __poll_t eventmask) count = READ_ONCE(ctx->count); if (count > 0) - events |= (EPOLLIN & eventmask); + events |= EPOLLIN; if (count == ULLONG_MAX) events |= EPOLLERR; if (ULLONG_MAX - 1 > count) - events |= (EPOLLOUT & eventmask); + events |= EPOLLOUT; return events; } @@ -311,8 +305,7 @@ static const struct file_operations eventfd_fops = { .show_fdinfo = eventfd_show_fdinfo, #endif .release = eventfd_release, - .get_poll_head = eventfd_get_poll_head, - .poll_mask = eventfd_poll_mask, + .poll = eventfd_poll, .read = eventfd_read, .write = eventfd_write, .llseek = noop_llseek, diff --git a/fs/eventpoll.c b/fs/eventpoll.c index ea4436f409fb..67db22fe99c5 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -922,18 +922,14 @@ static __poll_t ep_read_events_proc(struct eventpoll *ep, struct list_head *head return 0; } -static struct wait_queue_head *ep_eventpoll_get_poll_head(struct file *file, - __poll_t eventmask) -{ - struct eventpoll *ep = file->private_data; - return &ep->poll_wait; -} - -static __poll_t ep_eventpoll_poll_mask(struct file *file, __poll_t eventmask) +static __poll_t ep_eventpoll_poll(struct file *file, poll_table *wait) { struct eventpoll *ep = file->private_data; int depth = 0; + /* Insert inside our poll wait queue */ + poll_wait(file, &ep->poll_wait, wait); + /* * Proceed to find out if wanted events are really available inside * the ready list. 
@@ -972,8 +968,7 @@ static const struct file_operations eventpoll_fops = { .show_fdinfo = ep_show_fdinfo, #endif .release = ep_eventpoll_release, - .get_poll_head = ep_eventpoll_get_poll_head, - .poll_mask = ep_eventpoll_poll_mask, + .poll = ep_eventpoll_poll, .llseek = noop_llseek, }; diff --git a/fs/pipe.c b/fs/pipe.c index bb0840e234f3..39d6f431da83 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -509,22 +509,19 @@ static long pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) } } -static struct wait_queue_head * -pipe_get_poll_head(struct file *filp, __poll_t events) -{ - struct pipe_inode_info *pipe = filp->private_data; - - return &pipe->wait; -} - /* No kernel lock held - fine */ -static __poll_t pipe_poll_mask(struct file *filp, __poll_t events) +static __poll_t +pipe_poll(struct file *filp, poll_table *wait) { + __poll_t mask; struct pipe_inode_info *pipe = filp->private_data; - int nrbufs = pipe->nrbufs; - __poll_t mask = 0; + int nrbufs; + + poll_wait(filp, &pipe->wait, wait); /* Reading only -- no need for acquiring the semaphore. */ + nrbufs = pipe->nrbufs; + mask = 0; if (filp->f_mode & FMODE_READ) { mask = (nrbufs > 0) ? 
EPOLLIN | EPOLLRDNORM : 0; if (!pipe->writers && filp->f_version != pipe->w_counter) @@ -1023,8 +1020,7 @@ const struct file_operations pipefifo_fops = { .llseek = no_llseek, .read_iter = pipe_read, .write_iter = pipe_write, - .get_poll_head = pipe_get_poll_head, - .poll_mask = pipe_poll_mask, + .poll = pipe_poll, .unlocked_ioctl = pipe_ioctl, .release = pipe_release, .fasync = pipe_fasync, diff --git a/fs/select.c b/fs/select.c index 317891ff8165..4a6b6e4b21cb 100644 --- a/fs/select.c +++ b/fs/select.c @@ -34,29 +34,6 @@ #include -__poll_t vfs_poll(struct file *file, struct poll_table_struct *pt) -{ - if (file->f_op->poll) { - return file->f_op->poll(file, pt); - } else if (file_has_poll_mask(file)) { - unsigned int events = poll_requested_events(pt); - struct wait_queue_head *head; - - if (pt && pt->_qproc) { - head = file->f_op->get_poll_head(file, events); - if (!head) - return DEFAULT_POLLMASK; - if (IS_ERR(head)) - return EPOLLERR; - pt->_qproc(file, head, pt); - } - - return file->f_op->poll_mask(file, events); - } else { - return DEFAULT_POLLMASK; - } -} -EXPORT_SYMBOL_GPL(vfs_poll); /* * Estimate expected accuracy in ns from a timeval. 
diff --git a/fs/timerfd.c b/fs/timerfd.c index d84a2bee4f82..cdad49da3ff7 100644 --- a/fs/timerfd.c +++ b/fs/timerfd.c @@ -226,20 +226,21 @@ static int timerfd_release(struct inode *inode, struct file *file) kfree_rcu(ctx, rcu); return 0; } - -static struct wait_queue_head *timerfd_get_poll_head(struct file *file, - __poll_t eventmask) + +static __poll_t timerfd_poll(struct file *file, poll_table *wait) { struct timerfd_ctx *ctx = file->private_data; + __poll_t events = 0; + unsigned long flags; - return &ctx->wqh; -} + poll_wait(file, &ctx->wqh, wait); -static __poll_t timerfd_poll_mask(struct file *file, __poll_t eventmask) -{ - struct timerfd_ctx *ctx = file->private_data; + spin_lock_irqsave(&ctx->wqh.lock, flags); + if (ctx->ticks) + events |= EPOLLIN; + spin_unlock_irqrestore(&ctx->wqh.lock, flags); - return ctx->ticks ? EPOLLIN : 0; + return events; } static ssize_t timerfd_read(struct file *file, char __user *buf, size_t count, @@ -363,8 +364,7 @@ static long timerfd_ioctl(struct file *file, unsigned int cmd, unsigned long arg static const struct file_operations timerfd_fops = { .release = timerfd_release, - .get_poll_head = timerfd_get_poll_head, - .poll_mask = timerfd_poll_mask, + .poll = timerfd_poll, .read = timerfd_read, .llseek = noop_llseek, .show_fdinfo = timerfd_show, diff --git a/include/crypto/if_alg.h b/include/crypto/if_alg.h index cc414db9da0a..482461d8931d 100644 --- a/include/crypto/if_alg.h +++ b/include/crypto/if_alg.h @@ -245,7 +245,8 @@ ssize_t af_alg_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags); void af_alg_free_resources(struct af_alg_async_req *areq); void af_alg_async_cb(struct crypto_async_request *_req, int err); -__poll_t af_alg_poll_mask(struct socket *sock, __poll_t events); +__poll_t af_alg_poll(struct file *file, struct socket *sock, + poll_table *wait); struct af_alg_async_req *af_alg_alloc_areq(struct sock *sk, unsigned int areqlen); int af_alg_get_rsgl(struct sock *sk, struct msghdr 
*msg, int flags, diff --git a/include/linux/fs.h b/include/linux/fs.h index 5c91108846db..d78d146a98da 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1720,8 +1720,6 @@ struct file_operations { int (*iterate) (struct file *, struct dir_context *); int (*iterate_shared) (struct file *, struct dir_context *); __poll_t (*poll) (struct file *, struct poll_table_struct *); - struct wait_queue_head * (*get_poll_head)(struct file *, __poll_t); - __poll_t (*poll_mask) (struct file *, __poll_t); long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long); long (*compat_ioctl) (struct file *, unsigned int, unsigned long); int (*mmap) (struct file *, struct vm_area_struct *); diff --git a/include/linux/net.h b/include/linux/net.h index 08b6eb964dd6..6554d3ba4396 100644 --- a/include/linux/net.h +++ b/include/linux/net.h @@ -147,7 +147,6 @@ struct proto_ops { int (*getname) (struct socket *sock, struct sockaddr *addr, int peer); - __poll_t (*poll_mask) (struct socket *sock, __poll_t events); __poll_t (*poll) (struct file *file, struct socket *sock, struct poll_table_struct *wait); int (*ioctl) (struct socket *sock, unsigned int cmd, diff --git a/include/linux/poll.h b/include/linux/poll.h index fdf86b4cbc71..7e0fdcf905d2 100644 --- a/include/linux/poll.h +++ b/include/linux/poll.h @@ -74,18 +74,18 @@ static inline void init_poll_funcptr(poll_table *pt, poll_queue_proc qproc) pt->_key = ~(__poll_t)0; /* all events enabled */ } -static inline bool file_has_poll_mask(struct file *file) +static inline bool file_can_poll(struct file *file) { - return file->f_op->get_poll_head && file->f_op->poll_mask; + return file->f_op->poll; } -static inline bool file_can_poll(struct file *file) +static inline __poll_t vfs_poll(struct file *file, struct poll_table_struct *pt) { - return file->f_op->poll || file_has_poll_mask(file); + if (unlikely(!file->f_op->poll)) + return DEFAULT_POLLMASK; + return file->f_op->poll(file, pt); } -__poll_t vfs_poll(struct file *file, struct 
poll_table_struct *pt); - struct poll_table_entry { struct file *filp; __poll_t key; diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index c86885954994..164cdedf6012 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -3252,7 +3252,8 @@ struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned flags, int *peeked, int *off, int *err); struct sk_buff *skb_recv_datagram(struct sock *sk, unsigned flags, int noblock, int *err); -__poll_t datagram_poll_mask(struct socket *sock, __poll_t events); +__poll_t datagram_poll(struct file *file, struct socket *sock, + struct poll_table_struct *wait); int skb_copy_datagram_iter(const struct sk_buff *from, int offset, struct iov_iter *to, int size); static inline int skb_copy_datagram_msg(const struct sk_buff *from, int offset, diff --git a/include/net/bluetooth/bluetooth.h b/include/net/bluetooth/bluetooth.h index 53ce8176c313..ec9d6bc65855 100644 --- a/include/net/bluetooth/bluetooth.h +++ b/include/net/bluetooth/bluetooth.h @@ -271,7 +271,7 @@ int bt_sock_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, int flags); int bt_sock_stream_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, int flags); -__poll_t bt_sock_poll_mask(struct socket *sock, __poll_t events); +__poll_t bt_sock_poll(struct file *file, struct socket *sock, poll_table *wait); int bt_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg); int bt_sock_wait_state(struct sock *sk, int state, unsigned long timeo); int bt_sock_wait_ready(struct sock *sk, unsigned long flags); diff --git a/include/net/iucv/af_iucv.h b/include/net/iucv/af_iucv.h index b0eaeb02d46d..f4c21b5a1242 100644 --- a/include/net/iucv/af_iucv.h +++ b/include/net/iucv/af_iucv.h @@ -153,6 +153,8 @@ struct iucv_sock_list { atomic_t autobind_name; }; +__poll_t iucv_sock_poll(struct file *file, struct socket *sock, + poll_table *wait); void iucv_sock_link(struct iucv_sock_list *l, struct sock *s); void iucv_sock_unlink(struct 
iucv_sock_list *l, struct sock *s); void iucv_accept_enqueue(struct sock *parent, struct sock *sk); diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h index 30b3e2fe240a..8c2caa370e0f 100644 --- a/include/net/sctp/sctp.h +++ b/include/net/sctp/sctp.h @@ -109,7 +109,8 @@ int sctp_backlog_rcv(struct sock *sk, struct sk_buff *skb); int sctp_inet_listen(struct socket *sock, int backlog); void sctp_write_space(struct sock *sk); void sctp_data_ready(struct sock *sk); -__poll_t sctp_poll_mask(struct socket *sock, __poll_t events); +__poll_t sctp_poll(struct file *file, struct socket *sock, + poll_table *wait); void sctp_sock_rfree(struct sk_buff *skb); void sctp_copy_sock(struct sock *newsk, struct sock *sk, struct sctp_association *asoc); diff --git a/include/net/tcp.h b/include/net/tcp.h index 0448e7c5d2b4..800582b5dd54 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -388,7 +388,8 @@ bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst); void tcp_close(struct sock *sk, long timeout); void tcp_init_sock(struct sock *sk); void tcp_init_transfer(struct sock *sk, int bpf_op); -__poll_t tcp_poll_mask(struct socket *sock, __poll_t events); +__poll_t tcp_poll(struct file *file, struct socket *sock, + struct poll_table_struct *wait); int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen); int tcp_setsockopt(struct sock *sk, int level, int optname, diff --git a/include/net/tls.h b/include/net/tls.h index 7f84ea3e217c..70c273777fe9 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -109,7 +109,8 @@ struct tls_sw_context_rx { struct strparser strp; void (*saved_data_ready)(struct sock *sk); - __poll_t (*sk_poll_mask)(struct socket *sock, __poll_t events); + unsigned int (*sk_poll)(struct file *file, struct socket *sock, + struct poll_table_struct *wait); struct sk_buff *recv_pkt; u8 control; bool decrypted; @@ -224,7 +225,8 @@ void tls_sw_free_resources_tx(struct sock *sk); void 
tls_sw_free_resources_rx(struct sock *sk); int tls_sw_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, int flags, int *addr_len); -__poll_t tls_sw_poll_mask(struct socket *sock, __poll_t events); +unsigned int tls_sw_poll(struct file *file, struct socket *sock, + struct poll_table_struct *wait); ssize_t tls_sw_splice_read(struct socket *sock, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags); diff --git a/include/net/udp.h b/include/net/udp.h index b1ea8b0f5e6a..81afdacd4fff 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -285,7 +285,7 @@ int udp_init_sock(struct sock *sk); int udp_pre_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len); int __udp_disconnect(struct sock *sk, int flags); int udp_disconnect(struct sock *sk, int flags); -__poll_t udp_poll_mask(struct socket *sock, __poll_t events); +__poll_t udp_poll(struct file *file, struct socket *sock, poll_table *wait); struct sk_buff *skb_udp_tunnel_segment(struct sk_buff *skb, netdev_features_t features, bool is_ipv6); diff --git a/include/uapi/linux/aio_abi.h b/include/uapi/linux/aio_abi.h index d00221345c19..d4e768d55d14 100644 --- a/include/uapi/linux/aio_abi.h +++ b/include/uapi/linux/aio_abi.h @@ -39,8 +39,10 @@ enum { IOCB_CMD_PWRITE = 1, IOCB_CMD_FSYNC = 2, IOCB_CMD_FDSYNC = 3, - /* 4 was the experimental IOCB_CMD_PREADX */ - IOCB_CMD_POLL = 5, + /* These two are experimental. 
+ * IOCB_CMD_PREADX = 4, + * IOCB_CMD_POLL = 5, + */ IOCB_CMD_NOOP = 6, IOCB_CMD_PREADV = 7, IOCB_CMD_PWRITEV = 8, @@ -109,7 +111,7 @@ struct iocb { #undef IFLITTLE struct __aio_sigset { - const sigset_t __user *sigmask; + sigset_t __user *sigmask; size_t sigsetsize; }; diff --git a/net/appletalk/ddp.c b/net/appletalk/ddp.c index 55fdba05d7d9..9b6bc5abe946 100644 --- a/net/appletalk/ddp.c +++ b/net/appletalk/ddp.c @@ -1869,7 +1869,7 @@ static const struct proto_ops atalk_dgram_ops = { .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = atalk_getname, - .poll_mask = datagram_poll_mask, + .poll = datagram_poll, .ioctl = atalk_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = atalk_compat_ioctl, diff --git a/net/atm/common.c b/net/atm/common.c index ff5748b2190f..a7a68e509628 100644 --- a/net/atm/common.c +++ b/net/atm/common.c @@ -647,11 +647,16 @@ out: return error; } -__poll_t vcc_poll_mask(struct socket *sock, __poll_t events) +__poll_t vcc_poll(struct file *file, struct socket *sock, poll_table *wait) { struct sock *sk = sock->sk; - struct atm_vcc *vcc = ATM_SD(sock); - __poll_t mask = 0; + struct atm_vcc *vcc; + __poll_t mask; + + sock_poll_wait(file, sk_sleep(sk), wait); + mask = 0; + + vcc = ATM_SD(sock); /* exceptional events */ if (sk->sk_err) diff --git a/net/atm/common.h b/net/atm/common.h index 526796ad230f..5850649068bb 100644 --- a/net/atm/common.h +++ b/net/atm/common.h @@ -17,7 +17,7 @@ int vcc_connect(struct socket *sock, int itf, short vpi, int vci); int vcc_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, int flags); int vcc_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len); -__poll_t vcc_poll_mask(struct socket *sock, __poll_t events); +__poll_t vcc_poll(struct file *file, struct socket *sock, poll_table *wait); int vcc_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg); int vcc_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg); int vcc_setsockopt(struct socket *sock, int 
level, int optname, diff --git a/net/atm/pvc.c b/net/atm/pvc.c index 9f75092fe778..2cb10af16afc 100644 --- a/net/atm/pvc.c +++ b/net/atm/pvc.c @@ -113,7 +113,7 @@ static const struct proto_ops pvc_proto_ops = { .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = pvc_getname, - .poll_mask = vcc_poll_mask, + .poll = vcc_poll, .ioctl = vcc_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = vcc_compat_ioctl, diff --git a/net/atm/svc.c b/net/atm/svc.c index 53f4ad7087b1..2f91b766ac42 100644 --- a/net/atm/svc.c +++ b/net/atm/svc.c @@ -636,7 +636,7 @@ static const struct proto_ops svc_proto_ops = { .socketpair = sock_no_socketpair, .accept = svc_accept, .getname = svc_getname, - .poll_mask = vcc_poll_mask, + .poll = vcc_poll, .ioctl = svc_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = svc_compat_ioctl, diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c index d1d2442ce573..c603d33d5410 100644 --- a/net/ax25/af_ax25.c +++ b/net/ax25/af_ax25.c @@ -1941,7 +1941,7 @@ static const struct proto_ops ax25_proto_ops = { .socketpair = sock_no_socketpair, .accept = ax25_accept, .getname = ax25_getname, - .poll_mask = datagram_poll_mask, + .poll = datagram_poll, .ioctl = ax25_ioctl, .listen = ax25_listen, .shutdown = ax25_shutdown, diff --git a/net/bluetooth/af_bluetooth.c b/net/bluetooth/af_bluetooth.c index 510ab4f55df5..3264e1873219 100644 --- a/net/bluetooth/af_bluetooth.c +++ b/net/bluetooth/af_bluetooth.c @@ -437,13 +437,16 @@ static inline __poll_t bt_accept_poll(struct sock *parent) return 0; } -__poll_t bt_sock_poll_mask(struct socket *sock, __poll_t events) +__poll_t bt_sock_poll(struct file *file, struct socket *sock, + poll_table *wait) { struct sock *sk = sock->sk; __poll_t mask = 0; BT_DBG("sock %p, sk %p", sock, sk); + poll_wait(file, sk_sleep(sk), wait); + if (sk->sk_state == BT_LISTEN) return bt_accept_poll(sk); @@ -475,7 +478,7 @@ __poll_t bt_sock_poll_mask(struct socket *sock, __poll_t events) return mask; } -EXPORT_SYMBOL(bt_sock_poll_mask); 
+EXPORT_SYMBOL(bt_sock_poll); int bt_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c index d6c099861538..1506e1632394 100644 --- a/net/bluetooth/hci_sock.c +++ b/net/bluetooth/hci_sock.c @@ -1975,7 +1975,7 @@ static const struct proto_ops hci_sock_ops = { .sendmsg = hci_sock_sendmsg, .recvmsg = hci_sock_recvmsg, .ioctl = hci_sock_ioctl, - .poll_mask = datagram_poll_mask, + .poll = datagram_poll, .listen = sock_no_listen, .shutdown = sock_no_shutdown, .setsockopt = hci_sock_setsockopt, diff --git a/net/bluetooth/l2cap_sock.c b/net/bluetooth/l2cap_sock.c index 742a190034e6..686bdc6b35b0 100644 --- a/net/bluetooth/l2cap_sock.c +++ b/net/bluetooth/l2cap_sock.c @@ -1653,7 +1653,7 @@ static const struct proto_ops l2cap_sock_ops = { .getname = l2cap_sock_getname, .sendmsg = l2cap_sock_sendmsg, .recvmsg = l2cap_sock_recvmsg, - .poll_mask = bt_sock_poll_mask, + .poll = bt_sock_poll, .ioctl = bt_sock_ioctl, .mmap = sock_no_mmap, .socketpair = sock_no_socketpair, diff --git a/net/bluetooth/rfcomm/sock.c b/net/bluetooth/rfcomm/sock.c index 1cf57622473a..d606e9212291 100644 --- a/net/bluetooth/rfcomm/sock.c +++ b/net/bluetooth/rfcomm/sock.c @@ -1049,7 +1049,7 @@ static const struct proto_ops rfcomm_sock_ops = { .setsockopt = rfcomm_sock_setsockopt, .getsockopt = rfcomm_sock_getsockopt, .ioctl = rfcomm_sock_ioctl, - .poll_mask = bt_sock_poll_mask, + .poll = bt_sock_poll, .socketpair = sock_no_socketpair, .mmap = sock_no_mmap }; diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c index d60dbc61d170..413b8ee49fec 100644 --- a/net/bluetooth/sco.c +++ b/net/bluetooth/sco.c @@ -1197,7 +1197,7 @@ static const struct proto_ops sco_sock_ops = { .getname = sco_sock_getname, .sendmsg = sco_sock_sendmsg, .recvmsg = sco_sock_recvmsg, - .poll_mask = bt_sock_poll_mask, + .poll = bt_sock_poll, .ioctl = bt_sock_ioctl, .mmap = sock_no_mmap, .socketpair = sock_no_socketpair, diff --git 
a/net/caif/caif_socket.c b/net/caif/caif_socket.c index c7991867d622..a6fb1b3bcad9 100644 --- a/net/caif/caif_socket.c +++ b/net/caif/caif_socket.c @@ -934,11 +934,15 @@ static int caif_release(struct socket *sock) } /* Copied from af_unix.c:unix_poll(), added CAIF tx_flow handling */ -static __poll_t caif_poll_mask(struct socket *sock, __poll_t events) +static __poll_t caif_poll(struct file *file, + struct socket *sock, poll_table *wait) { struct sock *sk = sock->sk; + __poll_t mask; struct caifsock *cf_sk = container_of(sk, struct caifsock, sk); - __poll_t mask = 0; + + sock_poll_wait(file, sk_sleep(sk), wait); + mask = 0; /* exceptional events? */ if (sk->sk_err) @@ -972,7 +976,7 @@ static const struct proto_ops caif_seqpacket_ops = { .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = sock_no_getname, - .poll_mask = caif_poll_mask, + .poll = caif_poll, .ioctl = sock_no_ioctl, .listen = sock_no_listen, .shutdown = sock_no_shutdown, @@ -993,7 +997,7 @@ static const struct proto_ops caif_stream_ops = { .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = sock_no_getname, - .poll_mask = caif_poll_mask, + .poll = caif_poll, .ioctl = sock_no_ioctl, .listen = sock_no_listen, .shutdown = sock_no_shutdown, diff --git a/net/can/bcm.c b/net/can/bcm.c index 9393f25df08d..0af8f0db892a 100644 --- a/net/can/bcm.c +++ b/net/can/bcm.c @@ -1660,7 +1660,7 @@ static const struct proto_ops bcm_ops = { .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = sock_no_getname, - .poll_mask = datagram_poll_mask, + .poll = datagram_poll, .ioctl = can_ioctl, /* use can_ioctl() from af_can.c */ .listen = sock_no_listen, .shutdown = sock_no_shutdown, diff --git a/net/can/raw.c b/net/can/raw.c index fd7e2f49ea6a..1051eee82581 100644 --- a/net/can/raw.c +++ b/net/can/raw.c @@ -843,7 +843,7 @@ static const struct proto_ops raw_ops = { .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = raw_getname, - .poll_mask = 
datagram_poll_mask, + .poll = datagram_poll, .ioctl = can_ioctl, /* use can_ioctl() from af_can.c */ .listen = sock_no_listen, .shutdown = sock_no_shutdown, diff --git a/net/core/datagram.c b/net/core/datagram.c index f19bf3dc2bd6..9938952c5c78 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -819,8 +819,9 @@ EXPORT_SYMBOL(skb_copy_and_csum_datagram_msg); /** * datagram_poll - generic datagram poll + * @file: file struct * @sock: socket - * @events to wait for + * @wait: poll table * * Datagram poll: Again totally generic. This also handles * sequenced packet sockets providing the socket receive queue @@ -830,10 +831,14 @@ EXPORT_SYMBOL(skb_copy_and_csum_datagram_msg); * and you use a different write policy from sock_writeable() * then please supply your own write_space callback. */ -__poll_t datagram_poll_mask(struct socket *sock, __poll_t events) +__poll_t datagram_poll(struct file *file, struct socket *sock, + poll_table *wait) { struct sock *sk = sock->sk; - __poll_t mask = 0; + __poll_t mask; + + sock_poll_wait(file, sk_sleep(sk), wait); + mask = 0; /* exceptional events? 
*/ if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue)) @@ -866,4 +871,4 @@ __poll_t datagram_poll_mask(struct socket *sock, __poll_t events) return mask; } -EXPORT_SYMBOL(datagram_poll_mask); +EXPORT_SYMBOL(datagram_poll); diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h index 0ea2ee56ac1b..f91e3816806b 100644 --- a/net/dccp/dccp.h +++ b/net/dccp/dccp.h @@ -316,7 +316,8 @@ int dccp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, int flags, int *addr_len); void dccp_shutdown(struct sock *sk, int how); int inet_dccp_listen(struct socket *sock, int backlog); -__poll_t dccp_poll_mask(struct socket *sock, __poll_t events); +__poll_t dccp_poll(struct file *file, struct socket *sock, + poll_table *wait); int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len); void dccp_req_err(struct sock *sk, u64 seq); diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index a9e478cd3787..b08feb219b44 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -984,7 +984,7 @@ static const struct proto_ops inet_dccp_ops = { .accept = inet_accept, .getname = inet_getname, /* FIXME: work on tcp_poll to rename it to inet_csk_poll */ - .poll_mask = dccp_poll_mask, + .poll = dccp_poll, .ioctl = inet_ioctl, /* FIXME: work on inet_listen to rename it to sock_common_listen */ .listen = inet_dccp_listen, diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index 17fc4e0166ba..6344f1b18a6a 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -1070,7 +1070,7 @@ static const struct proto_ops inet6_dccp_ops = { .socketpair = sock_no_socketpair, .accept = inet_accept, .getname = inet6_getname, - .poll_mask = dccp_poll_mask, + .poll = dccp_poll, .ioctl = inet6_ioctl, .listen = inet_dccp_listen, .shutdown = inet_shutdown, diff --git a/net/dccp/proto.c b/net/dccp/proto.c index ca21c1c76da0..0d56e36a6db7 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -312,11 +312,20 @@ int dccp_disconnect(struct sock *sk, int flags) EXPORT_SYMBOL_GPL(dccp_disconnect); -__poll_t 
dccp_poll_mask(struct socket *sock, __poll_t events) +/* + * Wait for a DCCP event. + * + * Note that we don't need to lock the socket, as the upper poll layers + * take care of normal races (between the test and the event) and we don't + * go look at any of the socket buffers directly. + */ +__poll_t dccp_poll(struct file *file, struct socket *sock, + poll_table *wait) { __poll_t mask; struct sock *sk = sock->sk; + sock_poll_wait(file, sk_sleep(sk), wait); if (sk->sk_state == DCCP_LISTEN) return inet_csk_listen_poll(sk); @@ -358,7 +367,7 @@ __poll_t dccp_poll_mask(struct socket *sock, __poll_t events) return mask; } -EXPORT_SYMBOL_GPL(dccp_poll_mask); +EXPORT_SYMBOL_GPL(dccp_poll); int dccp_ioctl(struct sock *sk, int cmd, unsigned long arg) { diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c index 9a686d890bfa..7d6ff983ba2c 100644 --- a/net/decnet/af_decnet.c +++ b/net/decnet/af_decnet.c @@ -1207,11 +1207,11 @@ static int dn_getname(struct socket *sock, struct sockaddr *uaddr,int peer) } -static __poll_t dn_poll_mask(struct socket *sock, __poll_t events) +static __poll_t dn_poll(struct file *file, struct socket *sock, poll_table *wait) { struct sock *sk = sock->sk; struct dn_scp *scp = DN_SK(sk); - __poll_t mask = datagram_poll_mask(sock, events); + __poll_t mask = datagram_poll(file, sock, wait); if (!skb_queue_empty(&scp->other_receive_queue)) mask |= EPOLLRDBAND; @@ -2331,7 +2331,7 @@ static const struct proto_ops dn_proto_ops = { .socketpair = sock_no_socketpair, .accept = dn_accept, .getname = dn_getname, - .poll_mask = dn_poll_mask, + .poll = dn_poll, .ioctl = dn_ioctl, .listen = dn_listen, .shutdown = dn_shutdown, diff --git a/net/ieee802154/socket.c b/net/ieee802154/socket.c index a0768d2759b8..a60658c85a9a 100644 --- a/net/ieee802154/socket.c +++ b/net/ieee802154/socket.c @@ -423,7 +423,7 @@ static const struct proto_ops ieee802154_raw_ops = { .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = sock_no_getname, - .poll_mask = 
datagram_poll_mask, + .poll = datagram_poll, .ioctl = ieee802154_sock_ioctl, .listen = sock_no_listen, .shutdown = sock_no_shutdown, @@ -969,7 +969,7 @@ static const struct proto_ops ieee802154_dgram_ops = { .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = sock_no_getname, - .poll_mask = datagram_poll_mask, + .poll = datagram_poll, .ioctl = ieee802154_sock_ioctl, .listen = sock_no_listen, .shutdown = sock_no_shutdown, diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 15e125558c76..b403499fdabe 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -986,7 +986,7 @@ const struct proto_ops inet_stream_ops = { .socketpair = sock_no_socketpair, .accept = inet_accept, .getname = inet_getname, - .poll_mask = tcp_poll_mask, + .poll = tcp_poll, .ioctl = inet_ioctl, .listen = inet_listen, .shutdown = inet_shutdown, @@ -1021,7 +1021,7 @@ const struct proto_ops inet_dgram_ops = { .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = inet_getname, - .poll_mask = udp_poll_mask, + .poll = udp_poll, .ioctl = inet_ioctl, .listen = sock_no_listen, .shutdown = inet_shutdown, @@ -1042,7 +1042,7 @@ EXPORT_SYMBOL(inet_dgram_ops); /* * For SOCK_RAW sockets; should be the same as inet_dgram_ops but without - * udp_poll_mask + * udp_poll */ static const struct proto_ops inet_sockraw_ops = { .family = PF_INET, @@ -1053,7 +1053,7 @@ static const struct proto_ops inet_sockraw_ops = { .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = inet_getname, - .poll_mask = datagram_poll_mask, + .poll = datagram_poll, .ioctl = inet_ioctl, .listen = sock_no_listen, .shutdown = inet_shutdown, diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 141acd92e58a..e7b53d2a971f 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -494,21 +494,32 @@ static inline bool tcp_stream_is_readable(const struct tcp_sock *tp, } /* - * Socket is not locked. 
We are protected from async events by poll logic and - * correct handling of state changes made by other threads is impossible in - * any case. + * Wait for a TCP event. + * + * Note that we don't need to lock the socket, as the upper poll layers + * take care of normal races (between the test and the event) and we don't + * go look at any of the socket buffers directly. */ -__poll_t tcp_poll_mask(struct socket *sock, __poll_t events) +__poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait) { + __poll_t mask; struct sock *sk = sock->sk; const struct tcp_sock *tp = tcp_sk(sk); - __poll_t mask = 0; int state; + sock_poll_wait(file, sk_sleep(sk), wait); + state = inet_sk_state_load(sk); if (state == TCP_LISTEN) return inet_csk_listen_poll(sk); + /* Socket is not locked. We are protected from async events + * by poll logic and correct handling of state changes + * made by other threads is impossible in any case. + */ + + mask = 0; + /* * EPOLLHUP is certainly not done right. But poll() doesn't * have a notion of HUP in just one direction, and for a @@ -589,7 +600,7 @@ __poll_t tcp_poll_mask(struct socket *sock, __poll_t events) return mask; } -EXPORT_SYMBOL(tcp_poll_mask); +EXPORT_SYMBOL(tcp_poll); int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg) { diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 9bb27df4dac5..24e116ddae79 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -2591,7 +2591,7 @@ int compat_udp_getsockopt(struct sock *sk, int level, int optname, * udp_poll - wait for a UDP event. * @file - file struct * @sock - socket - * @events - events to wait for + * @wait - poll table * * This is same as datagram poll, except for the special case of * blocking sockets. If application is using a blocking fd @@ -2600,23 +2600,23 @@ int compat_udp_getsockopt(struct sock *sk, int level, int optname, * but then block when reading it. Add special case code * to work around these arguably broken applications. 
*/ -__poll_t udp_poll_mask(struct socket *sock, __poll_t events) +__poll_t udp_poll(struct file *file, struct socket *sock, poll_table *wait) { - __poll_t mask = datagram_poll_mask(sock, events); + __poll_t mask = datagram_poll(file, sock, wait); struct sock *sk = sock->sk; if (!skb_queue_empty(&udp_sk(sk)->reader_queue)) mask |= EPOLLIN | EPOLLRDNORM; /* Check for false positives due to checksum errors */ - if ((mask & EPOLLRDNORM) && !(sock->file->f_flags & O_NONBLOCK) && + if ((mask & EPOLLRDNORM) && !(file->f_flags & O_NONBLOCK) && !(sk->sk_shutdown & RCV_SHUTDOWN) && first_packet_length(sk) == -1) mask &= ~(EPOLLIN | EPOLLRDNORM); return mask; } -EXPORT_SYMBOL(udp_poll_mask); +EXPORT_SYMBOL(udp_poll); int udp_abort(struct sock *sk, int err) { diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 74f2a261e8df..9ed0eae91758 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -570,7 +570,7 @@ const struct proto_ops inet6_stream_ops = { .socketpair = sock_no_socketpair, /* a do nothing */ .accept = inet_accept, /* ok */ .getname = inet6_getname, - .poll_mask = tcp_poll_mask, /* ok */ + .poll = tcp_poll, /* ok */ .ioctl = inet6_ioctl, /* must change */ .listen = inet_listen, /* ok */ .shutdown = inet_shutdown, /* ok */ @@ -603,7 +603,7 @@ const struct proto_ops inet6_dgram_ops = { .socketpair = sock_no_socketpair, /* a do nothing */ .accept = sock_no_accept, /* a do nothing */ .getname = inet6_getname, - .poll_mask = udp_poll_mask, /* ok */ + .poll = udp_poll, /* ok */ .ioctl = inet6_ioctl, /* must change */ .listen = sock_no_listen, /* ok */ .shutdown = inet_shutdown, /* ok */ diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index ce6f0d15b5dd..afc307c89d1a 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -1334,7 +1334,7 @@ void raw6_proc_exit(void) } #endif /* CONFIG_PROC_FS */ -/* Same as inet6_dgram_ops, sans udp_poll_mask. */ +/* Same as inet6_dgram_ops, sans udp_poll. 
*/ const struct proto_ops inet6_sockraw_ops = { .family = PF_INET6, .owner = THIS_MODULE, @@ -1344,7 +1344,7 @@ const struct proto_ops inet6_sockraw_ops = { .socketpair = sock_no_socketpair, /* a do nothing */ .accept = sock_no_accept, /* a do nothing */ .getname = inet6_getname, - .poll_mask = datagram_poll_mask, /* ok */ + .poll = datagram_poll, /* ok */ .ioctl = inet6_ioctl, /* must change */ .listen = sock_no_listen, /* ok */ .shutdown = inet_shutdown, /* ok */ diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c index 68e86257a549..893a022f9620 100644 --- a/net/iucv/af_iucv.c +++ b/net/iucv/af_iucv.c @@ -1488,11 +1488,14 @@ static inline __poll_t iucv_accept_poll(struct sock *parent) return 0; } -static __poll_t iucv_sock_poll_mask(struct socket *sock, __poll_t events) +__poll_t iucv_sock_poll(struct file *file, struct socket *sock, + poll_table *wait) { struct sock *sk = sock->sk; __poll_t mask = 0; + sock_poll_wait(file, sk_sleep(sk), wait); + if (sk->sk_state == IUCV_LISTEN) return iucv_accept_poll(sk); @@ -2385,7 +2388,7 @@ static const struct proto_ops iucv_sock_ops = { .getname = iucv_sock_getname, .sendmsg = iucv_sock_sendmsg, .recvmsg = iucv_sock_recvmsg, - .poll_mask = iucv_sock_poll_mask, + .poll = iucv_sock_poll, .ioctl = sock_no_ioctl, .mmap = sock_no_mmap, .socketpair = sock_no_socketpair, diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c index 84b7d5c6fec8..d3601d421571 100644 --- a/net/kcm/kcmsock.c +++ b/net/kcm/kcmsock.c @@ -1336,9 +1336,9 @@ static void init_kcm_sock(struct kcm_sock *kcm, struct kcm_mux *mux) struct list_head *head; int index = 0; - /* For SOCK_SEQPACKET sock type, datagram_poll_mask checks the sk_state, - * so we set sk_state, otherwise epoll_wait always returns right away - * with EPOLLHUP + /* For SOCK_SEQPACKET sock type, datagram_poll checks the sk_state, so + * we set sk_state, otherwise epoll_wait always returns right away with + * EPOLLHUP */ kcm->sk.sk_state = TCP_ESTABLISHED; @@ -1903,7 +1903,7 @@ static const 
struct proto_ops kcm_dgram_ops = { .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = sock_no_getname, - .poll_mask = datagram_poll_mask, + .poll = datagram_poll, .ioctl = kcm_ioctl, .listen = sock_no_listen, .shutdown = sock_no_shutdown, @@ -1924,7 +1924,7 @@ static const struct proto_ops kcm_seqpacket_ops = { .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = sock_no_getname, - .poll_mask = datagram_poll_mask, + .poll = datagram_poll, .ioctl = kcm_ioctl, .listen = sock_no_listen, .shutdown = sock_no_shutdown, diff --git a/net/key/af_key.c b/net/key/af_key.c index 8bdc1cbe490a..5e1d2946ffbf 100644 --- a/net/key/af_key.c +++ b/net/key/af_key.c @@ -3751,7 +3751,7 @@ static const struct proto_ops pfkey_ops = { /* Now the operations that really occur. */ .release = pfkey_release, - .poll_mask = datagram_poll_mask, + .poll = datagram_poll, .sendmsg = pfkey_sendmsg, .recvmsg = pfkey_recvmsg, }; diff --git a/net/l2tp/l2tp_ip.c b/net/l2tp/l2tp_ip.c index 181073bf6925..a9c05b2bc1b0 100644 --- a/net/l2tp/l2tp_ip.c +++ b/net/l2tp/l2tp_ip.c @@ -613,7 +613,7 @@ static const struct proto_ops l2tp_ip_ops = { .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = l2tp_ip_getname, - .poll_mask = datagram_poll_mask, + .poll = datagram_poll, .ioctl = inet_ioctl, .listen = sock_no_listen, .shutdown = inet_shutdown, diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c index 336e4c00abbc..957369192ca1 100644 --- a/net/l2tp/l2tp_ip6.c +++ b/net/l2tp/l2tp_ip6.c @@ -754,7 +754,7 @@ static const struct proto_ops l2tp_ip6_ops = { .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = l2tp_ip6_getname, - .poll_mask = datagram_poll_mask, + .poll = datagram_poll, .ioctl = inet6_ioctl, .listen = sock_no_listen, .shutdown = inet_shutdown, diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c index 55188382845c..e398797878a9 100644 --- a/net/l2tp/l2tp_ppp.c +++ b/net/l2tp/l2tp_ppp.c @@ -1818,7 +1818,7 @@ static const 
struct proto_ops pppol2tp_ops = { .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = pppol2tp_getname, - .poll_mask = datagram_poll_mask, + .poll = datagram_poll, .listen = sock_no_listen, .shutdown = sock_no_shutdown, .setsockopt = pppol2tp_setsockopt, diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c index 804de8490186..1beeea9549fa 100644 --- a/net/llc/af_llc.c +++ b/net/llc/af_llc.c @@ -1192,7 +1192,7 @@ static const struct proto_ops llc_ui_ops = { .socketpair = sock_no_socketpair, .accept = llc_ui_accept, .getname = llc_ui_getname, - .poll_mask = datagram_poll_mask, + .poll = datagram_poll, .ioctl = llc_ui_ioctl, .listen = llc_ui_listen, .shutdown = llc_ui_shutdown, diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 1189b84413d5..393573a99a5a 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -2658,7 +2658,7 @@ static const struct proto_ops netlink_ops = { .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = netlink_getname, - .poll_mask = datagram_poll_mask, + .poll = datagram_poll, .ioctl = netlink_ioctl, .listen = sock_no_listen, .shutdown = sock_no_shutdown, diff --git a/net/netrom/af_netrom.c b/net/netrom/af_netrom.c index 93fbcafbf388..03f37c4e64fe 100644 --- a/net/netrom/af_netrom.c +++ b/net/netrom/af_netrom.c @@ -1355,7 +1355,7 @@ static const struct proto_ops nr_proto_ops = { .socketpair = sock_no_socketpair, .accept = nr_accept, .getname = nr_getname, - .poll_mask = datagram_poll_mask, + .poll = datagram_poll, .ioctl = nr_ioctl, .listen = nr_listen, .shutdown = sock_no_shutdown, diff --git a/net/nfc/llcp_sock.c b/net/nfc/llcp_sock.c index ab5bb14b49af..ea0c0c6f1874 100644 --- a/net/nfc/llcp_sock.c +++ b/net/nfc/llcp_sock.c @@ -548,13 +548,16 @@ static inline __poll_t llcp_accept_poll(struct sock *parent) return 0; } -static __poll_t llcp_sock_poll_mask(struct socket *sock, __poll_t events) +static __poll_t llcp_sock_poll(struct file *file, struct socket *sock, + poll_table 
*wait) { struct sock *sk = sock->sk; __poll_t mask = 0; pr_debug("%p\n", sk); + sock_poll_wait(file, sk_sleep(sk), wait); + if (sk->sk_state == LLCP_LISTEN) return llcp_accept_poll(sk); @@ -896,7 +899,7 @@ static const struct proto_ops llcp_sock_ops = { .socketpair = sock_no_socketpair, .accept = llcp_sock_accept, .getname = llcp_sock_getname, - .poll_mask = llcp_sock_poll_mask, + .poll = llcp_sock_poll, .ioctl = sock_no_ioctl, .listen = llcp_sock_listen, .shutdown = sock_no_shutdown, @@ -916,7 +919,7 @@ static const struct proto_ops llcp_rawsock_ops = { .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = llcp_sock_getname, - .poll_mask = llcp_sock_poll_mask, + .poll = llcp_sock_poll, .ioctl = sock_no_ioctl, .listen = sock_no_listen, .shutdown = sock_no_shutdown, diff --git a/net/nfc/rawsock.c b/net/nfc/rawsock.c index 60c322531c49..e2188deb08dc 100644 --- a/net/nfc/rawsock.c +++ b/net/nfc/rawsock.c @@ -284,7 +284,7 @@ static const struct proto_ops rawsock_ops = { .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = sock_no_getname, - .poll_mask = datagram_poll_mask, + .poll = datagram_poll, .ioctl = sock_no_ioctl, .listen = sock_no_listen, .shutdown = sock_no_shutdown, @@ -304,7 +304,7 @@ static const struct proto_ops rawsock_raw_ops = { .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = sock_no_getname, - .poll_mask = datagram_poll_mask, + .poll = datagram_poll, .ioctl = sock_no_ioctl, .listen = sock_no_listen, .shutdown = sock_no_shutdown, diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index ff8e7e245c37..57634bc3da74 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -4076,11 +4076,12 @@ static int packet_ioctl(struct socket *sock, unsigned int cmd, return 0; } -static __poll_t packet_poll_mask(struct socket *sock, __poll_t events) +static __poll_t packet_poll(struct file *file, struct socket *sock, + poll_table *wait) { struct sock *sk = sock->sk; struct packet_sock *po = 
pkt_sk(sk); - __poll_t mask = datagram_poll_mask(sock, events); + __poll_t mask = datagram_poll(file, sock, wait); spin_lock_bh(&sk->sk_receive_queue.lock); if (po->rx_ring.pg_vec) { @@ -4422,7 +4423,7 @@ static const struct proto_ops packet_ops_spkt = { .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = packet_getname_spkt, - .poll_mask = datagram_poll_mask, + .poll = datagram_poll, .ioctl = packet_ioctl, .listen = sock_no_listen, .shutdown = sock_no_shutdown, @@ -4443,7 +4444,7 @@ static const struct proto_ops packet_ops = { .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = packet_getname, - .poll_mask = packet_poll_mask, + .poll = packet_poll, .ioctl = packet_ioctl, .listen = sock_no_listen, .shutdown = sock_no_shutdown, diff --git a/net/phonet/socket.c b/net/phonet/socket.c index c295c4e20f01..30187990257f 100644 --- a/net/phonet/socket.c +++ b/net/phonet/socket.c @@ -340,12 +340,15 @@ static int pn_socket_getname(struct socket *sock, struct sockaddr *addr, return sizeof(struct sockaddr_pn); } -static __poll_t pn_socket_poll_mask(struct socket *sock, __poll_t events) +static __poll_t pn_socket_poll(struct file *file, struct socket *sock, + poll_table *wait) { struct sock *sk = sock->sk; struct pep_sock *pn = pep_sk(sk); __poll_t mask = 0; + poll_wait(file, sk_sleep(sk), wait); + if (sk->sk_state == TCP_CLOSE) return EPOLLERR; if (!skb_queue_empty(&sk->sk_receive_queue)) @@ -445,7 +448,7 @@ const struct proto_ops phonet_dgram_ops = { .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = pn_socket_getname, - .poll_mask = datagram_poll_mask, + .poll = datagram_poll, .ioctl = pn_socket_ioctl, .listen = sock_no_listen, .shutdown = sock_no_shutdown, @@ -470,7 +473,7 @@ const struct proto_ops phonet_stream_ops = { .socketpair = sock_no_socketpair, .accept = pn_socket_accept, .getname = pn_socket_getname, - .poll_mask = pn_socket_poll_mask, + .poll = pn_socket_poll, .ioctl = pn_socket_ioctl, .listen = 
pn_socket_listen, .shutdown = sock_no_shutdown, diff --git a/net/qrtr/qrtr.c b/net/qrtr/qrtr.c index 1b5025ea5b04..2aa07b547b16 100644 --- a/net/qrtr/qrtr.c +++ b/net/qrtr/qrtr.c @@ -1023,7 +1023,7 @@ static const struct proto_ops qrtr_proto_ops = { .recvmsg = qrtr_recvmsg, .getname = qrtr_getname, .ioctl = qrtr_ioctl, - .poll_mask = datagram_poll_mask, + .poll = datagram_poll, .shutdown = sock_no_shutdown, .setsockopt = sock_no_setsockopt, .getsockopt = sock_no_getsockopt, diff --git a/net/rose/af_rose.c b/net/rose/af_rose.c index ebe42e7eb456..d00a0ef39a56 100644 --- a/net/rose/af_rose.c +++ b/net/rose/af_rose.c @@ -1470,7 +1470,7 @@ static const struct proto_ops rose_proto_ops = { .socketpair = sock_no_socketpair, .accept = rose_accept, .getname = rose_getname, - .poll_mask = datagram_poll_mask, + .poll = datagram_poll, .ioctl = rose_ioctl, .listen = rose_listen, .shutdown = sock_no_shutdown, diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index 3b1ac93efee2..2b463047dd7b 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -734,11 +734,15 @@ static int rxrpc_getsockopt(struct socket *sock, int level, int optname, /* * permit an RxRPC socket to be polled */ -static __poll_t rxrpc_poll_mask(struct socket *sock, __poll_t events) +static __poll_t rxrpc_poll(struct file *file, struct socket *sock, + poll_table *wait) { struct sock *sk = sock->sk; struct rxrpc_sock *rx = rxrpc_sk(sk); - __poll_t mask = 0; + __poll_t mask; + + sock_poll_wait(file, sk_sleep(sk), wait); + mask = 0; /* the socket is readable if there are any messages waiting on the Rx * queue */ @@ -945,7 +949,7 @@ static const struct proto_ops rxrpc_rpc_ops = { .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = sock_no_getname, - .poll_mask = rxrpc_poll_mask, + .poll = rxrpc_poll, .ioctl = sock_no_ioctl, .listen = rxrpc_listen, .shutdown = rxrpc_shutdown, diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c index 7339918a805d..0cd2e764f47f 100644 --- a/net/sctp/ipv6.c 
+++ b/net/sctp/ipv6.c @@ -1010,7 +1010,7 @@ static const struct proto_ops inet6_seqpacket_ops = { .socketpair = sock_no_socketpair, .accept = inet_accept, .getname = sctp_getname, - .poll_mask = sctp_poll_mask, + .poll = sctp_poll, .ioctl = inet6_ioctl, .listen = sctp_inet_listen, .shutdown = inet_shutdown, diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index 5dffbc493008..67f73d3a1356 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -1016,7 +1016,7 @@ static const struct proto_ops inet_seqpacket_ops = { .socketpair = sock_no_socketpair, .accept = inet_accept, .getname = inet_getname, /* Semantics are different. */ - .poll_mask = sctp_poll_mask, + .poll = sctp_poll, .ioctl = inet_ioctl, .listen = sctp_inet_listen, .shutdown = inet_shutdown, /* Looks harmless. */ diff --git a/net/sctp/socket.c b/net/sctp/socket.c index d20f7addee19..ce620e878538 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -7717,12 +7717,14 @@ out: * here, again, by modeling the current TCP/UDP code. We don't have * a good way to test with it yet. 
*/ -__poll_t sctp_poll_mask(struct socket *sock, __poll_t events) +__poll_t sctp_poll(struct file *file, struct socket *sock, poll_table *wait) { struct sock *sk = sock->sk; struct sctp_sock *sp = sctp_sk(sk); __poll_t mask; + poll_wait(file, sk_sleep(sk), wait); + sock_rps_record_flow(sk); /* A TCP-style listening socket becomes readable when the accept queue diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index da7f02edcd37..973b4471b532 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -1273,7 +1273,8 @@ static __poll_t smc_accept_poll(struct sock *parent) return mask; } -static __poll_t smc_poll_mask(struct socket *sock, __poll_t events) +static __poll_t smc_poll(struct file *file, struct socket *sock, + poll_table *wait) { struct sock *sk = sock->sk; __poll_t mask = 0; @@ -1289,7 +1290,7 @@ static __poll_t smc_poll_mask(struct socket *sock, __poll_t events) if ((sk->sk_state == SMC_INIT) || smc->use_fallback) { /* delegate to CLC child sock */ release_sock(sk); - mask = smc->clcsock->ops->poll_mask(smc->clcsock, events); + mask = smc->clcsock->ops->poll(file, smc->clcsock, wait); lock_sock(sk); sk->sk_err = smc->clcsock->sk->sk_err; if (sk->sk_err) { @@ -1307,6 +1308,11 @@ static __poll_t smc_poll_mask(struct socket *sock, __poll_t events) } } } else { + if (sk->sk_state != SMC_CLOSED) { + release_sock(sk); + sock_poll_wait(file, sk_sleep(sk), wait); + lock_sock(sk); + } if (sk->sk_err) mask |= EPOLLERR; if ((sk->sk_shutdown == SHUTDOWN_MASK) || @@ -1619,7 +1625,7 @@ static const struct proto_ops smc_sock_ops = { .socketpair = sock_no_socketpair, .accept = smc_accept, .getname = smc_getname, - .poll_mask = smc_poll_mask, + .poll = smc_poll, .ioctl = smc_ioctl, .listen = smc_listen, .shutdown = smc_shutdown, diff --git a/net/socket.c b/net/socket.c index 8a109012608a..a564c6ed19d5 100644 --- a/net/socket.c +++ b/net/socket.c @@ -117,10 +117,8 @@ static ssize_t sock_write_iter(struct kiocb *iocb, struct iov_iter *from); static int sock_mmap(struct file 
*file, struct vm_area_struct *vma); static int sock_close(struct inode *inode, struct file *file); -static struct wait_queue_head *sock_get_poll_head(struct file *file, - __poll_t events); -static __poll_t sock_poll_mask(struct file *file, __poll_t); -static __poll_t sock_poll(struct file *file, struct poll_table_struct *wait); +static __poll_t sock_poll(struct file *file, + struct poll_table_struct *wait); static long sock_ioctl(struct file *file, unsigned int cmd, unsigned long arg); #ifdef CONFIG_COMPAT static long compat_sock_ioctl(struct file *file, @@ -143,8 +141,6 @@ static const struct file_operations socket_file_ops = { .llseek = no_llseek, .read_iter = sock_read_iter, .write_iter = sock_write_iter, - .get_poll_head = sock_get_poll_head, - .poll_mask = sock_poll_mask, .poll = sock_poll, .unlocked_ioctl = sock_ioctl, #ifdef CONFIG_COMPAT @@ -1130,48 +1126,14 @@ out_release: } EXPORT_SYMBOL(sock_create_lite); -static struct wait_queue_head *sock_get_poll_head(struct file *file, - __poll_t events) -{ - struct socket *sock = file->private_data; - - if (!sock->ops->poll_mask) - return NULL; - sock_poll_busy_loop(sock, events); - return sk_sleep(sock->sk); -} - -static __poll_t sock_poll_mask(struct file *file, __poll_t events) -{ - struct socket *sock = file->private_data; - - /* - * We need to be sure we are in sync with the socket flags modification. - * - * This memory barrier is paired in the wq_has_sleeper. - */ - smp_mb(); - - /* this socket can poll_ll so tell the system call */ - return sock->ops->poll_mask(sock, events) | - (sk_can_busy_loop(sock->sk) ? 
POLL_BUSY_LOOP : 0); -} - /* No kernel lock held - perfect */ static __poll_t sock_poll(struct file *file, poll_table *wait) { struct socket *sock = file->private_data; - __poll_t events = poll_requested_events(wait), mask = 0; - - if (sock->ops->poll) { - sock_poll_busy_loop(sock, events); - mask = sock->ops->poll(file, sock, wait); - } else if (sock->ops->poll_mask) { - sock_poll_wait(file, sock_get_poll_head(file, events), wait); - mask = sock->ops->poll_mask(sock, events); - } + __poll_t events = poll_requested_events(wait); - return mask | sock_poll_busy_flag(sock); + sock_poll_busy_loop(sock, events); + return sock->ops->poll(file, sock, wait) | sock_poll_busy_flag(sock); } static int sock_mmap(struct file *file, struct vm_area_struct *vma) diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 14a5d055717d..930852c54d7a 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -692,9 +692,10 @@ static int tipc_getname(struct socket *sock, struct sockaddr *uaddr, } /** - * tipc_poll - read pollmask + * tipc_poll - read and possibly block on pollmask * @file: file structure associated with the socket * @sock: socket for which to calculate the poll bits + * @wait: ??? * * Returns pollmask value * @@ -708,12 +709,15 @@ static int tipc_getname(struct socket *sock, struct sockaddr *uaddr, * imply that the operation will succeed, merely that it should be performed * and will not block. 
*/ -static __poll_t tipc_poll_mask(struct socket *sock, __poll_t events) +static __poll_t tipc_poll(struct file *file, struct socket *sock, + poll_table *wait) { struct sock *sk = sock->sk; struct tipc_sock *tsk = tipc_sk(sk); __poll_t revents = 0; + sock_poll_wait(file, sk_sleep(sk), wait); + if (sk->sk_shutdown & RCV_SHUTDOWN) revents |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM; if (sk->sk_shutdown == SHUTDOWN_MASK) @@ -3033,7 +3037,7 @@ static const struct proto_ops msg_ops = { .socketpair = tipc_socketpair, .accept = sock_no_accept, .getname = tipc_getname, - .poll_mask = tipc_poll_mask, + .poll = tipc_poll, .ioctl = tipc_ioctl, .listen = sock_no_listen, .shutdown = tipc_shutdown, @@ -3054,7 +3058,7 @@ static const struct proto_ops packet_ops = { .socketpair = tipc_socketpair, .accept = tipc_accept, .getname = tipc_getname, - .poll_mask = tipc_poll_mask, + .poll = tipc_poll, .ioctl = tipc_ioctl, .listen = tipc_listen, .shutdown = tipc_shutdown, @@ -3075,7 +3079,7 @@ static const struct proto_ops stream_ops = { .socketpair = tipc_socketpair, .accept = tipc_accept, .getname = tipc_getname, - .poll_mask = tipc_poll_mask, + .poll = tipc_poll, .ioctl = tipc_ioctl, .listen = tipc_listen, .shutdown = tipc_shutdown, diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c index a127d61e8af9..301f22430469 100644 --- a/net/tls/tls_main.c +++ b/net/tls/tls_main.c @@ -712,7 +712,7 @@ static int __init tls_register(void) build_protos(tls_prots[TLSV4], &tcp_prot); tls_sw_proto_ops = inet_stream_ops; - tls_sw_proto_ops.poll_mask = tls_sw_poll_mask; + tls_sw_proto_ops.poll = tls_sw_poll; tls_sw_proto_ops.splice_read = tls_sw_splice_read; #ifdef CONFIG_TLS_DEVICE diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index f127fac88acf..d2380548f8f6 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -919,22 +919,23 @@ splice_read_end: return copied ? 
: err; } -__poll_t tls_sw_poll_mask(struct socket *sock, __poll_t events) +unsigned int tls_sw_poll(struct file *file, struct socket *sock, + struct poll_table_struct *wait) { + unsigned int ret; struct sock *sk = sock->sk; struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx); - __poll_t mask; - /* Grab EPOLLOUT and EPOLLHUP from the underlying socket */ - mask = ctx->sk_poll_mask(sock, events); + /* Grab POLLOUT and POLLHUP from the underlying socket */ + ret = ctx->sk_poll(file, sock, wait); - /* Clear EPOLLIN bits, and set based on recv_pkt */ - mask &= ~(EPOLLIN | EPOLLRDNORM); + /* Clear POLLIN bits, and set based on recv_pkt */ + ret &= ~(POLLIN | POLLRDNORM); if (ctx->recv_pkt) - mask |= EPOLLIN | EPOLLRDNORM; + ret |= POLLIN | POLLRDNORM; - return mask; + return ret; } static int tls_read_size(struct strparser *strp, struct sk_buff *skb) @@ -1191,7 +1192,7 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx) sk->sk_data_ready = tls_data_ready; write_unlock_bh(&sk->sk_callback_lock); - sw_ctx_rx->sk_poll_mask = sk->sk_socket->ops->poll_mask; + sw_ctx_rx->sk_poll = sk->sk_socket->ops->poll; strp_check_rcv(&sw_ctx_rx->strp); } diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 95b02a71fd47..e5473c03d667 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -638,8 +638,9 @@ static int unix_stream_connect(struct socket *, struct sockaddr *, static int unix_socketpair(struct socket *, struct socket *); static int unix_accept(struct socket *, struct socket *, int, bool); static int unix_getname(struct socket *, struct sockaddr *, int); -static __poll_t unix_poll_mask(struct socket *, __poll_t); -static __poll_t unix_dgram_poll_mask(struct socket *, __poll_t); +static __poll_t unix_poll(struct file *, struct socket *, poll_table *); +static __poll_t unix_dgram_poll(struct file *, struct socket *, + poll_table *); static int unix_ioctl(struct socket *, unsigned int, unsigned long); 
static int unix_shutdown(struct socket *, int); static int unix_stream_sendmsg(struct socket *, struct msghdr *, size_t); @@ -680,7 +681,7 @@ static const struct proto_ops unix_stream_ops = { .socketpair = unix_socketpair, .accept = unix_accept, .getname = unix_getname, - .poll_mask = unix_poll_mask, + .poll = unix_poll, .ioctl = unix_ioctl, .listen = unix_listen, .shutdown = unix_shutdown, @@ -703,7 +704,7 @@ static const struct proto_ops unix_dgram_ops = { .socketpair = unix_socketpair, .accept = sock_no_accept, .getname = unix_getname, - .poll_mask = unix_dgram_poll_mask, + .poll = unix_dgram_poll, .ioctl = unix_ioctl, .listen = sock_no_listen, .shutdown = unix_shutdown, @@ -725,7 +726,7 @@ static const struct proto_ops unix_seqpacket_ops = { .socketpair = unix_socketpair, .accept = unix_accept, .getname = unix_getname, - .poll_mask = unix_dgram_poll_mask, + .poll = unix_dgram_poll, .ioctl = unix_ioctl, .listen = unix_listen, .shutdown = unix_shutdown, @@ -2629,10 +2630,13 @@ static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) return err; } -static __poll_t unix_poll_mask(struct socket *sock, __poll_t events) +static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table *wait) { struct sock *sk = sock->sk; - __poll_t mask = 0; + __poll_t mask; + + sock_poll_wait(file, sk_sleep(sk), wait); + mask = 0; /* exceptional events? */ if (sk->sk_err) @@ -2661,11 +2665,15 @@ static __poll_t unix_poll_mask(struct socket *sock, __poll_t events) return mask; } -static __poll_t unix_dgram_poll_mask(struct socket *sock, __poll_t events) +static __poll_t unix_dgram_poll(struct file *file, struct socket *sock, + poll_table *wait) { struct sock *sk = sock->sk, *other; - int writable; - __poll_t mask = 0; + unsigned int writable; + __poll_t mask; + + sock_poll_wait(file, sk_sleep(sk), wait); + mask = 0; /* exceptional events? 
*/ if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue)) @@ -2691,7 +2699,7 @@ static __poll_t unix_dgram_poll_mask(struct socket *sock, __poll_t events) } /* No write status requested, avoid expensive OUT tests. */ - if (!(events & (EPOLLWRBAND|EPOLLWRNORM|EPOLLOUT))) + if (!(poll_requested_events(wait) & (EPOLLWRBAND|EPOLLWRNORM|EPOLLOUT))) return mask; writable = unix_writable(sk); diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index bb5d5fa68c35..c1076c19b858 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -850,11 +850,18 @@ static int vsock_shutdown(struct socket *sock, int mode) return err; } -static __poll_t vsock_poll_mask(struct socket *sock, __poll_t events) +static __poll_t vsock_poll(struct file *file, struct socket *sock, + poll_table *wait) { - struct sock *sk = sock->sk; - struct vsock_sock *vsk = vsock_sk(sk); - __poll_t mask = 0; + struct sock *sk; + __poll_t mask; + struct vsock_sock *vsk; + + sk = sock->sk; + vsk = vsock_sk(sk); + + poll_wait(file, sk_sleep(sk), wait); + mask = 0; if (sk->sk_err) /* Signify that there has been an error on this socket. 
*/ @@ -1084,7 +1091,7 @@ static const struct proto_ops vsock_dgram_ops = { .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = vsock_getname, - .poll_mask = vsock_poll_mask, + .poll = vsock_poll, .ioctl = sock_no_ioctl, .listen = sock_no_listen, .shutdown = vsock_shutdown, @@ -1842,7 +1849,7 @@ static const struct proto_ops vsock_stream_ops = { .socketpair = sock_no_socketpair, .accept = vsock_accept, .getname = vsock_getname, - .poll_mask = vsock_poll_mask, + .poll = vsock_poll, .ioctl = sock_no_ioctl, .listen = vsock_listen, .shutdown = vsock_shutdown, diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c index f93365ae0fdd..d49aa79b7997 100644 --- a/net/x25/af_x25.c +++ b/net/x25/af_x25.c @@ -1750,7 +1750,7 @@ static const struct proto_ops x25_proto_ops = { .socketpair = sock_no_socketpair, .accept = x25_accept, .getname = x25_getname, - .poll_mask = datagram_poll_mask, + .poll = datagram_poll, .ioctl = x25_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = compat_x25_ioctl, diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 3b3410ada097..59fb7d3c36a3 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -303,9 +303,10 @@ static int xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len) return (xs->zc) ? xsk_zc_xmit(sk) : xsk_generic_xmit(sk, m, total_len); } -static __poll_t xsk_poll_mask(struct socket *sock, __poll_t events) +static unsigned int xsk_poll(struct file *file, struct socket *sock, + struct poll_table_struct *wait) { - __poll_t mask = datagram_poll_mask(sock, events); + unsigned int mask = datagram_poll(file, sock, wait); struct sock *sk = sock->sk; struct xdp_sock *xs = xdp_sk(sk); @@ -696,7 +697,7 @@ static const struct proto_ops xsk_proto_ops = { .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = sock_no_getname, - .poll_mask = xsk_poll_mask, + .poll = xsk_poll, .ioctl = sock_no_ioctl, .listen = sock_no_listen, .shutdown = sock_no_shutdown, -- cgit v1.2.3