From a95a9e5f0fdf9ef7c1b1cbf2788cb0df28a97bfb Mon Sep 17 00:00:00 2001 From: Jagadeesh Pagadala Date: Thu, 28 Mar 2019 02:58:45 +0530 Subject: arch:sparc:kernel/uprobes.c : Remove duplicate header Remove duplicate header which is included twice. Signed-off-by: Jagadeesh Pagadala Reviewed-by: Mukesh Ojha Signed-off-by: David S. Miller --- arch/sparc/kernel/uprobes.c | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/sparc/kernel/uprobes.c b/arch/sparc/kernel/uprobes.c index d852ae56ddc1..c44bf5b85de8 100644 --- a/arch/sparc/kernel/uprobes.c +++ b/arch/sparc/kernel/uprobes.c @@ -29,7 +29,6 @@ #include #include -#include /* Compute the address of the breakpoint instruction and return it. * -- cgit v1.2.3 From 269fe56551c68cde57e477a6810ed57921dfe54f Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Wed, 3 Apr 2019 17:32:24 +0900 Subject: sparc: vdso: add FORCE to the build rule of %.so $(call if_changed,...) must have FORCE as a prerequisite. Signed-off-by: Masahiro Yamada Reviewed-by: Nick Desaulniers Signed-off-by: David S. Miller --- arch/sparc/vdso/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/sparc/vdso/Makefile b/arch/sparc/vdso/Makefile index 74e97f77e23b..83c4b463cb3d 100644 --- a/arch/sparc/vdso/Makefile +++ b/arch/sparc/vdso/Makefile @@ -68,7 +68,7 @@ CFLAGS_REMOVE_vdso-note.o = -pg CFLAGS_REMOVE_vclock_gettime.o = -pg $(obj)/%.so: OBJCOPYFLAGS := -S -$(obj)/%.so: $(obj)/%.so.dbg +$(obj)/%.so: $(obj)/%.so.dbg FORCE $(call if_changed,objcopy) CPPFLAGS_vdso32.lds = $(CPPFLAGS_vdso.lds) -- cgit v1.2.3 From 031abf0b70cb6804eefb11340463a2277e52f853 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 16 Apr 2019 20:23:39 +0200 Subject: sparc/iommu: use !PageHighMem to check if a page has a kernel mapping This deobfuscates the check a bit, and prepares for future changes. Signed-off-by: Christoph Hellwig Reported-by: Guenter Roeck Signed-off-by: David S. Miller --- arch/sparc/mm/iommu.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/sparc/mm/iommu.c b/arch/sparc/mm/iommu.c index e8d5d73ca40d..dcdadac03fdf 100644 --- a/arch/sparc/mm/iommu.c +++ b/arch/sparc/mm/iommu.c @@ -273,7 +273,8 @@ static int sbus_iommu_map_sg_pflush(struct device *dev, struct scatterlist *sgl, * XXX Is this a good assumption? * XXX What if someone else unmaps it here and races us? */ - if ((page = (unsigned long) page_address(sg_page(sg))) != 0) { + if (!PageHighMem(sg_page(sg))) { + page = (unsigned long)page_address(sg_page(sg)); for (i = 0; i < n; i++) { if (page != oldpage) { /* Already flushed? */ flush_page_for_dma(page); -- cgit v1.2.3 From a7fce1f7ca2f092fe44a17cb158deda97060aab4 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 16 Apr 2019 20:23:40 +0200 Subject: sparc/iommu: use sbus_iommu_unmap_page in sbus_iommu_unmap_sg Use the page-level helper instead of duplicating the logic, while also fixing the incorrect handling of larger than page sized offsets in the sg variant. Signed-off-by: Christoph Hellwig Reported-by: Guenter Roeck Signed-off-by: David S.
Miller --- arch/sparc/mm/iommu.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/sparc/mm/iommu.c b/arch/sparc/mm/iommu.c index dcdadac03fdf..f47a6ce0acaa 100644 --- a/arch/sparc/mm/iommu.c +++ b/arch/sparc/mm/iommu.c @@ -321,11 +321,11 @@ static void sbus_iommu_unmap_sg(struct device *dev, struct scatterlist *sgl, int nents, enum dma_data_direction dir, unsigned long attrs) { struct scatterlist *sg; - int i, n; + int i; for_each_sg(sgl, sg, nents, i) { - n = (sg->length + sg->offset + PAGE_SIZE-1) >> PAGE_SHIFT; - iommu_release_one(dev, sg->dma_address & PAGE_MASK, n); + sbus_iommu_unmap_page(dev, sg->dma_address, sg->length, dir, + attrs); sg->dma_address = 0x21212121; } } -- cgit v1.2.3 From f25b23bc156fef3211fe4adf9692eca5ce2fd082 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 16 Apr 2019 20:23:41 +0200 Subject: sparc/iommu: merge iommu_release_one and sbus_iommu_unmap_page There is only one caller of iommu_release_one left, so merge it into that one to clean things up a bit. Signed-off-by: Christoph Hellwig Reported-by: Guenter Roeck Signed-off-by: David S. Miller --- arch/sparc/mm/iommu.c | 21 +++++++-------------- 1 file changed, 7 insertions(+), 14 deletions(-) diff --git a/arch/sparc/mm/iommu.c b/arch/sparc/mm/iommu.c index f47a6ce0acaa..7cb9ddda7531 100644 --- a/arch/sparc/mm/iommu.c +++ b/arch/sparc/mm/iommu.c @@ -291,14 +291,17 @@ static int sbus_iommu_map_sg_pflush(struct device *dev, struct scatterlist *sgl, return nents; } -static void iommu_release_one(struct device *dev, u32 busa, int npages) +static void sbus_iommu_unmap_page(struct device *dev, dma_addr_t dma_addr, + size_t len, enum dma_data_direction dir, unsigned long attrs) { struct iommu_struct *iommu = dev->archdata.iommu; - int ioptex; - int i; + unsigned int busa = dma_addr & PAGE_MASK; + unsigned long off = dma_addr & ~PAGE_MASK; + unsigned int npages = (off + len + PAGE_SIZE-1) >> PAGE_SHIFT; + unsigned int ioptex = (busa - iommu->start) >> PAGE_SHIFT; + unsigned int i; BUG_ON(busa < iommu->start); - ioptex = (busa - iommu->start) >> PAGE_SHIFT; for (i = 0; i < npages; i++) { iopte_val(iommu->page_table[ioptex + i]) = 0; iommu_invalidate_page(iommu->regs, busa); @@ -307,16 +310,6 @@ static void iommu_release_one(struct device *dev, u32 busa, int npages) bit_map_clear(&iommu->usemap, ioptex, npages); } -static void sbus_iommu_unmap_page(struct device *dev, dma_addr_t dma_addr, - size_t len, enum dma_data_direction dir, unsigned long attrs) -{ - unsigned long off = dma_addr & ~PAGE_MASK; - int npages; - - npages = (off + len + PAGE_SIZE-1) >> PAGE_SHIFT; - iommu_release_one(dev, dma_addr & PAGE_MASK, npages); -} - static void sbus_iommu_unmap_sg(struct device *dev, struct scatterlist *sgl, int nents, enum dma_data_direction dir, unsigned long attrs) { -- cgit v1.2.3 From ff5cbec0c3ea8b96c4cb7bcd9f484d8665d394e6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 16 Apr 2019 20:23:42 +0200 Subject: sparc/iommu: create a common helper for map_sg Share the code for the global and per-page flush map_sg loops using a simple bool parameter to disable the per-page flush for the former variant. Signed-off-by: Christoph Hellwig Reported-by: Guenter Roeck Signed-off-by: David S. 
Miller --- arch/sparc/mm/iommu.c | 37 +++++++++++++++++-------------------- 1 file changed, 17 insertions(+), 20 deletions(-) diff --git a/arch/sparc/mm/iommu.c b/arch/sparc/mm/iommu.c index 7cb9ddda7531..f90d943a3a27 100644 --- a/arch/sparc/mm/iommu.c +++ b/arch/sparc/mm/iommu.c @@ -241,25 +241,9 @@ static dma_addr_t sbus_iommu_map_page_pflush(struct device *dev, return __sbus_iommu_map_page(dev, page, offset, len); } -static int sbus_iommu_map_sg_gflush(struct device *dev, struct scatterlist *sgl, - int nents, enum dma_data_direction dir, unsigned long attrs) -{ - struct scatterlist *sg; - int i, n; - - flush_page_for_dma(0); - - for_each_sg(sgl, sg, nents, i) { - n = (sg->length + sg->offset + PAGE_SIZE-1) >> PAGE_SHIFT; - sg->dma_address = iommu_get_one(dev, sg_page(sg), n) + sg->offset; - sg->dma_length = sg->length; - } - - return nents; -} - -static int sbus_iommu_map_sg_pflush(struct device *dev, struct scatterlist *sgl, - int nents, enum dma_data_direction dir, unsigned long attrs) +static int __sbus_iommu_map_sg(struct device *dev, struct scatterlist *sgl, + int nents, enum dma_data_direction dir, unsigned long attrs, + bool per_page_flush) { unsigned long page, oldpage = 0; struct scatterlist *sg; @@ -273,7 +257,7 @@ static int sbus_iommu_map_sg_pflush(struct device *dev, struct scatterlist *sgl, * XXX Is this a good assumption? * XXX What if someone else unmaps it here and races us? */ - if (!PageHighMem(sg_page(sg))) { + if (per_page_flush && !PageHighMem(sg_page(sg))) { page = (unsigned long)page_address(sg_page(sg)); for (i = 0; i < n; i++) { if (page != oldpage) { /* Already flushed? */ @@ -291,6 +275,19 @@ static int sbus_iommu_map_sg_pflush(struct device *dev, struct scatterlist *sgl, return nents; } +static int sbus_iommu_map_sg_gflush(struct device *dev, struct scatterlist *sgl, + int nents, enum dma_data_direction dir, unsigned long attrs) +{ + flush_page_for_dma(0); + return __sbus_iommu_map_sg(dev, sgl, nents, dir, attrs, false); +} + +static int sbus_iommu_map_sg_pflush(struct device *dev, struct scatterlist *sgl, + int nents, enum dma_data_direction dir, unsigned long attrs) +{ + return __sbus_iommu_map_sg(dev, sgl, nents, dir, attrs, true); +} + static void sbus_iommu_unmap_page(struct device *dev, dma_addr_t dma_addr, size_t len, enum dma_data_direction dir, unsigned long attrs) { -- cgit v1.2.3 From b82059428c0577c2ec082974d7956291d5eae2cf Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 16 Apr 2019 20:23:43 +0200 Subject: sparc/iommu: pass a physical address to iommu_get_one No need for the page structure, just the paddr / pfn. This is going to simplify fixes to the callers. Signed-off-by: Christoph Hellwig Reported-by: Guenter Roeck Signed-off-by: David S. 
Miller --- arch/sparc/mm/iommu.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/arch/sparc/mm/iommu.c b/arch/sparc/mm/iommu.c index f90d943a3a27..19d9266e4049 100644 --- a/arch/sparc/mm/iommu.c +++ b/arch/sparc/mm/iommu.c @@ -175,16 +175,17 @@ static void iommu_flush_iotlb(iopte_t *iopte, unsigned int niopte) } } -static u32 iommu_get_one(struct device *dev, struct page *page, int npages) +static u32 iommu_get_one(struct device *dev, phys_addr_t paddr, int npages) { struct iommu_struct *iommu = dev->archdata.iommu; int ioptex; iopte_t *iopte, *iopte0; unsigned int busa, busa0; + unsigned long pfn = __phys_to_pfn(paddr); int i; /* page color = pfn of page */ - ioptex = bit_map_string_get(&iommu->usemap, npages, page_to_pfn(page)); + ioptex = bit_map_string_get(&iommu->usemap, npages, pfn); if (ioptex < 0) panic("iommu out"); busa0 = iommu->start + (ioptex << PAGE_SHIFT); @@ -193,11 +194,11 @@ static u32 iommu_get_one(struct device *dev, struct page *page, int npages) busa = busa0; iopte = iopte0; for (i = 0; i < npages; i++) { - iopte_val(*iopte) = MKIOPTE(page_to_pfn(page), IOPERM); + iopte_val(*iopte) = MKIOPTE(pfn, IOPERM); iommu_invalidate_page(iommu->regs, busa); busa += PAGE_SIZE; iopte++; - page++; + pfn++; } iommu_flush_iotlb(iopte0, npages); @@ -215,7 +216,7 @@ static dma_addr_t __sbus_iommu_map_page(struct device *dev, struct page *page, /* XXX So what is maxphys for us and how do drivers know it? */ if (!len || len > 256 * 1024) return DMA_MAPPING_ERROR; - return iommu_get_one(dev, virt_to_page(vaddr), npages) + off; + return iommu_get_one(dev, virt_to_phys(vaddr), npages) + off; } static dma_addr_t sbus_iommu_map_page_gflush(struct device *dev, @@ -268,7 +269,7 @@ static int __sbus_iommu_map_sg(struct device *dev, struct scatterlist *sgl, } } - sg->dma_address = iommu_get_one(dev, sg_page(sg), n) + sg->offset; + sg->dma_address = iommu_get_one(dev, sg_phys(sg), n) + sg->offset; sg->dma_length = sg->length; } -- cgit v1.2.3 From 8668b38c1c7720baf76da15a7a7eef43ae0c65a4 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 16 Apr 2019 20:23:44 +0200 Subject: sparc/iommu: move per-page flushing into __sbus_iommu_map_page This prepares for reusing __sbus_iommu_map_page in the map_sg path. Signed-off-by: Christoph Hellwig Reported-by: Guenter Roeck Signed-off-by: David S. Miller --- arch/sparc/mm/iommu.c | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/arch/sparc/mm/iommu.c b/arch/sparc/mm/iommu.c index 19d9266e4049..7e191c8ae46a 100644 --- a/arch/sparc/mm/iommu.c +++ b/arch/sparc/mm/iommu.c @@ -207,15 +207,25 @@ static u32 iommu_get_one(struct device *dev, phys_addr_t paddr, int npages) } static dma_addr_t __sbus_iommu_map_page(struct device *dev, struct page *page, - unsigned long offset, size_t len) + unsigned long offset, size_t len, bool per_page_flush) { void *vaddr = page_address(page) + offset; unsigned long off = (unsigned long)vaddr & ~PAGE_MASK; unsigned long npages = (off + len + PAGE_SIZE - 1) >> PAGE_SHIFT; - + /* XXX So what is maxphys for us and how do drivers know it? 
*/ if (!len || len > 256 * 1024) return DMA_MAPPING_ERROR; + + if (per_page_flush) { + unsigned long p = (unsigned long)vaddr & PAGE_MASK; + + while (p < (unsigned long)vaddr + len) { + flush_page_for_dma(p); + p += PAGE_SIZE; + } + } + return iommu_get_one(dev, virt_to_phys(vaddr), npages) + off; } @@ -224,22 +234,14 @@ static dma_addr_t sbus_iommu_map_page_gflush(struct device *dev, enum dma_data_direction dir, unsigned long attrs) { flush_page_for_dma(0); - return __sbus_iommu_map_page(dev, page, offset, len); + return __sbus_iommu_map_page(dev, page, offset, len, false); } static dma_addr_t sbus_iommu_map_page_pflush(struct device *dev, struct page *page, unsigned long offset, size_t len, enum dma_data_direction dir, unsigned long attrs) { - void *vaddr = page_address(page) + offset; - unsigned long p = ((unsigned long)vaddr) & PAGE_MASK; - - while (p < (unsigned long)vaddr + len) { - flush_page_for_dma(p); - p += PAGE_SIZE; - } - - return __sbus_iommu_map_page(dev, page, offset, len); + return __sbus_iommu_map_page(dev, page, offset, len, true); } static int __sbus_iommu_map_sg(struct device *dev, struct scatterlist *sgl, -- cgit v1.2.3 From 7e996890b88078011bfb55ce072712d464207dad Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 16 Apr 2019 20:23:45 +0200 Subject: sparc/iommu: fix __sbus_iommu_map_page for highmem pages __sbus_iommu_map_page currently assumes all pages are mapped into the kernel direct mapping. Switch to using physical address instead of virtual ones for all the normal mapping operations, and only use the virtual addresses for cache flushing when not operating on a highmem page. Signed-off-by: Christoph Hellwig Reported-by: Guenter Roeck Signed-off-by: David S. Miller --- arch/sparc/mm/iommu.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/arch/sparc/mm/iommu.c b/arch/sparc/mm/iommu.c index 7e191c8ae46a..37b5ce7657f6 100644 --- a/arch/sparc/mm/iommu.c +++ b/arch/sparc/mm/iommu.c @@ -209,24 +209,23 @@ static u32 iommu_get_one(struct device *dev, phys_addr_t paddr, int npages) static dma_addr_t __sbus_iommu_map_page(struct device *dev, struct page *page, unsigned long offset, size_t len, bool per_page_flush) { - void *vaddr = page_address(page) + offset; - unsigned long off = (unsigned long)vaddr & ~PAGE_MASK; + phys_addr_t paddr = page_to_phys(page) + offset; + unsigned long off = paddr & ~PAGE_MASK; unsigned long npages = (off + len + PAGE_SIZE - 1) >> PAGE_SHIFT; /* XXX So what is maxphys for us and how do drivers know it? */ if (!len || len > 256 * 1024) return DMA_MAPPING_ERROR; - if (per_page_flush) { - unsigned long p = (unsigned long)vaddr & PAGE_MASK; + if (per_page_flush && !PageHighMem(page)) { + unsigned long vaddr, p; - while (p < (unsigned long)vaddr + len) { + vaddr = (unsigned long)page_address(page) + offset; + for (p = vaddr & PAGE_MASK; p < vaddr + len; p += PAGE_SIZE) flush_page_for_dma(p); - p += PAGE_SIZE; - } } - return iommu_get_one(dev, virt_to_phys(vaddr), npages) + off; + return iommu_get_one(dev, paddr, npages) + off; } static dma_addr_t sbus_iommu_map_page_gflush(struct device *dev, -- cgit v1.2.3 From edb1f07203ba8856b24bcddf8326386ba6a03291 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 16 Apr 2019 20:23:46 +0200 Subject: sparc/iommu: use __sbus_iommu_map_page to implement the map_sg path This means we handle > PAGE_SIZE offsets fine, and grow the size check so far only performed in the map_page path. 
We lose the optimization to not double flush a page if it apears in multiple consecutive SG list entries. But at least for block I/O those don't happen anymore since we properly merge in higher layers anyway. Signed-off-by: Christoph Hellwig Reported-by: Guenter Roeck Signed-off-by: David S. Miller --- arch/sparc/mm/iommu.c | 31 ++++++++++--------------------- 1 file changed, 10 insertions(+), 21 deletions(-) diff --git a/arch/sparc/mm/iommu.c b/arch/sparc/mm/iommu.c index 37b5ce7657f6..8fbc08d14836 100644 --- a/arch/sparc/mm/iommu.c +++ b/arch/sparc/mm/iommu.c @@ -217,6 +217,11 @@ static dma_addr_t __sbus_iommu_map_page(struct device *dev, struct page *page, if (!len || len > 256 * 1024) return DMA_MAPPING_ERROR; + /* + * We expect unmapped highmem pages to be not in the cache. + * XXX Is this a good assumption? + * XXX What if someone else unmaps it here and races us? + */ if (per_page_flush && !PageHighMem(page)) { unsigned long vaddr, p; @@ -247,30 +252,14 @@ static int __sbus_iommu_map_sg(struct device *dev, struct scatterlist *sgl, int nents, enum dma_data_direction dir, unsigned long attrs, bool per_page_flush) { - unsigned long page, oldpage = 0; struct scatterlist *sg; - int i, j, n; + int j; for_each_sg(sgl, sg, nents, j) { - n = (sg->length + sg->offset + PAGE_SIZE-1) >> PAGE_SHIFT; - - /* - * We expect unmapped highmem pages to be not in the cache. - * XXX Is this a good assumption? - * XXX What if someone else unmaps it here and races us? - */ - if (per_page_flush && !PageHighMem(sg_page(sg))) { - page = (unsigned long)page_address(sg_page(sg)); - for (i = 0; i < n; i++) { - if (page != oldpage) { /* Already flushed? */ - flush_page_for_dma(page); - oldpage = page; - } - page += PAGE_SIZE; - } - } - - sg->dma_address = iommu_get_one(dev, sg_phys(sg), n) + sg->offset; + sg->dma_address =__sbus_iommu_map_page(dev, sg_page(sg), + sg->offset, sg->length, per_page_flush); + if (sg->dma_address == DMA_MAPPING_ERROR) + return 0; sg->dma_length = sg->length; } -- cgit v1.2.3 From 376b1371a9f29112ae000cc0cade174a9a670053 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 16 Apr 2019 20:23:47 +0200 Subject: sparc/iommu: merge iommu_get_one and __sbus_iommu_map_page There is only one caller of iommu_get_one left, so merge it into that one to clean things up a bit. Signed-off-by: Christoph Hellwig Signed-off-by: David S. 
Miller --- arch/sparc/mm/iommu.c | 56 ++++++++++++++++++++++----------------------------- 1 file changed, 24 insertions(+), 32 deletions(-) diff --git a/arch/sparc/mm/iommu.c b/arch/sparc/mm/iommu.c index 8fbc08d14836..71ac353032b6 100644 --- a/arch/sparc/mm/iommu.c +++ b/arch/sparc/mm/iommu.c @@ -175,43 +175,17 @@ static void iommu_flush_iotlb(iopte_t *iopte, unsigned int niopte) } } -static u32 iommu_get_one(struct device *dev, phys_addr_t paddr, int npages) -{ - struct iommu_struct *iommu = dev->archdata.iommu; - int ioptex; - iopte_t *iopte, *iopte0; - unsigned int busa, busa0; - unsigned long pfn = __phys_to_pfn(paddr); - int i; - - /* page color = pfn of page */ - ioptex = bit_map_string_get(&iommu->usemap, npages, pfn); - if (ioptex < 0) - panic("iommu out"); - busa0 = iommu->start + (ioptex << PAGE_SHIFT); - iopte0 = &iommu->page_table[ioptex]; - - busa = busa0; - iopte = iopte0; - for (i = 0; i < npages; i++) { - iopte_val(*iopte) = MKIOPTE(pfn, IOPERM); - iommu_invalidate_page(iommu->regs, busa); - busa += PAGE_SIZE; - iopte++; - pfn++; - } - - iommu_flush_iotlb(iopte0, npages); - - return busa0; -} - static dma_addr_t __sbus_iommu_map_page(struct device *dev, struct page *page, unsigned long offset, size_t len, bool per_page_flush) { + struct iommu_struct *iommu = dev->archdata.iommu; phys_addr_t paddr = page_to_phys(page) + offset; unsigned long off = paddr & ~PAGE_MASK; unsigned long npages = (off + len + PAGE_SIZE - 1) >> PAGE_SHIFT; + unsigned long pfn = __phys_to_pfn(paddr); + unsigned int busa, busa0; + iopte_t *iopte, *iopte0; + int ioptex, i; /* XXX So what is maxphys for us and how do drivers know it? */ if (!len || len > 256 * 1024) @@ -230,7 +204,25 @@ static dma_addr_t __sbus_iommu_map_page(struct device *dev, struct page *page, flush_page_for_dma(p); } - return iommu_get_one(dev, paddr, npages) + off; + /* page color = pfn of page */ + ioptex = bit_map_string_get(&iommu->usemap, npages, pfn); + if (ioptex < 0) + panic("iommu out"); + busa0 = iommu->start + (ioptex << PAGE_SHIFT); + iopte0 = &iommu->page_table[ioptex]; + + busa = busa0; + iopte = iopte0; + for (i = 0; i < npages; i++) { + iopte_val(*iopte) = MKIOPTE(pfn, IOPERM); + iommu_invalidate_page(iommu->regs, busa); + busa += PAGE_SIZE; + iopte++; + pfn++; + } + + iommu_flush_iotlb(iopte0, npages); + return busa0 + off; } static dma_addr_t sbus_iommu_map_page_gflush(struct device *dev, -- cgit v1.2.3 From 5d5db1c94f8c412d1c9755bd13194701301b9046 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Mon, 22 Apr 2019 10:28:02 -0300 Subject: docs: sparc: convert to ReST Rename the sparc documentation files to ReST, add an index for them and adjust in order to produce a nice html output via the Sphinx build system. There is an except from a document under oradax dir. It doesn't seem to make much sense to convert this one to ReST, so let's add it as an included document. At its new index.rst, let's add a :orphan: while this is not linked to the main index.rst file, in order to avoid build warnings. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: David S. 
Miller --- Documentation/sparc/adi.rst | 286 +++++++++++++++++++ Documentation/sparc/adi.txt | 278 ------------------- Documentation/sparc/console.rst | 9 + Documentation/sparc/console.txt | 9 - Documentation/sparc/index.rst | 13 + Documentation/sparc/oradax/oracle-dax.rst | 445 ++++++++++++++++++++++++++++++ Documentation/sparc/oradax/oracle-dax.txt | 429 ---------------------------- drivers/sbus/char/oradax.c | 2 +- 8 files changed, 754 insertions(+), 717 deletions(-) create mode 100644 Documentation/sparc/adi.rst delete mode 100644 Documentation/sparc/adi.txt create mode 100644 Documentation/sparc/console.rst delete mode 100644 Documentation/sparc/console.txt create mode 100644 Documentation/sparc/index.rst create mode 100644 Documentation/sparc/oradax/oracle-dax.rst delete mode 100644 Documentation/sparc/oradax/oracle-dax.txt diff --git a/Documentation/sparc/adi.rst b/Documentation/sparc/adi.rst new file mode 100644 index 000000000000..857ad30f9569 --- /dev/null +++ b/Documentation/sparc/adi.rst @@ -0,0 +1,286 @@ +================================ +Application Data Integrity (ADI) +================================ + +SPARC M7 processor adds the Application Data Integrity (ADI) feature. +ADI allows a task to set version tags on any subset of its address +space. Once ADI is enabled and version tags are set for ranges of +address space of a task, the processor will compare the tag in pointers +to memory in these ranges to the version set by the application +previously. Access to memory is granted only if the tag in given pointer +matches the tag set by the application. In case of mismatch, processor +raises an exception. + +Following steps must be taken by a task to enable ADI fully: + +1. Set the user mode PSTATE.mcde bit. This acts as master switch for + the task's entire address space to enable/disable ADI for the task. + +2. Set TTE.mcd bit on any TLB entries that correspond to the range of + addresses ADI is being enabled on. MMU checks the version tag only + on the pages that have TTE.mcd bit set. + +3. Set the version tag for virtual addresses using stxa instruction + and one of the MCD specific ASIs. Each stxa instruction sets the + given tag for one ADI block size number of bytes. This step must + be repeated for entire page to set tags for entire page. + +ADI block size for the platform is provided by the hypervisor to kernel +in machine description tables. Hypervisor also provides the number of +top bits in the virtual address that specify the version tag. Once +version tag has been set for a memory location, the tag is stored in the +physical memory and the same tag must be present in the ADI version tag +bits of the virtual address being presented to the MMU. For example on +SPARC M7 processor, MMU uses bits 63-60 for version tags and ADI block +size is same as cacheline size which is 64 bytes. A task that sets ADI +version to, say 10, on a range of memory, must access that memory using +virtual addresses that contain 0xa in bits 63-60. + +ADI is enabled on a set of pages using mprotect() with PROT_ADI flag. +When ADI is enabled on a set of pages by a task for the first time, +kernel sets the PSTATE.mcde bit fot the task. Version tags for memory +addresses are set with an stxa instruction on the addresses using +ASI_MCD_PRIMARY or ASI_MCD_ST_BLKINIT_PRIMARY. ADI block size is +provided by the hypervisor to the kernel. Kernel returns the value of +ADI block size to userspace using auxiliary vector along with other ADI +info. 
Following auxiliary vectors are provided by the kernel: + + ============ =========================================== + AT_ADI_BLKSZ ADI block size. This is the granularity and + alignment, in bytes, of ADI versioning. + AT_ADI_NBITS Number of ADI version bits in the VA + ============ =========================================== + + +IMPORTANT NOTES +=============== + +- Version tag values of 0x0 and 0xf are reserved. These values match any + tag in virtual address and never generate a mismatch exception. + +- Version tags are set on virtual addresses from userspace even though + tags are stored in physical memory. Tags are set on a physical page + after it has been allocated to a task and a pte has been created for + it. + +- When a task frees a memory page it had set version tags on, the page + goes back to free page pool. When this page is re-allocated to a task, + kernel clears the page using block initialization ASI which clears the + version tags as well for the page. If a page allocated to a task is + freed and allocated back to the same task, old version tags set by the + task on that page will no longer be present. + +- ADI tag mismatches are not detected for non-faulting loads. + +- Kernel does not set any tags for user pages and it is entirely a + task's responsibility to set any version tags. Kernel does ensure the + version tags are preserved if a page is swapped out to the disk and + swapped back in. It also preserves that version tags if a page is + migrated. + +- ADI works for any size pages. A userspace task need not be aware of + page size when using ADI. It can simply select a virtual address + range, enable ADI on the range using mprotect() and set version tags + for the entire range. mprotect() ensures range is aligned to page size + and is a multiple of page size. + +- ADI tags can only be set on writable memory. For example, ADI tags can + not be set on read-only mappings. + + + +ADI related traps +================= + +With ADI enabled, following new traps may occur: + +Disrupting memory corruption +---------------------------- + + When a store accesses a memory localtion that has TTE.mcd=1, + the task is running with ADI enabled (PSTATE.mcde=1), and the ADI + tag in the address used (bits 63:60) does not match the tag set on + the corresponding cacheline, a memory corruption trap occurs. By + default, it is a disrupting trap and is sent to the hypervisor + first. Hypervisor creates a sun4v error report and sends a + resumable error (TT=0x7e) trap to the kernel. The kernel sends + a SIGSEGV to the task that resulted in this trap with the following + info:: + + siginfo.si_signo = SIGSEGV; + siginfo.errno = 0; + siginfo.si_code = SEGV_ADIDERR; + siginfo.si_addr = addr; /* PC where first mismatch occurred */ + siginfo.si_trapno = 0; + + +Precise memory corruption +------------------------- + + When a store accesses a memory location that has TTE.mcd=1, + the task is running with ADI enabled (PSTATE.mcde=1), and the ADI + tag in the address used (bits 63:60) does not match the tag set on + the corresponding cacheline, a memory corruption trap occurs. If + MCD precise exception is enabled (MCDPERR=1), a precise + exception is sent to the kernel with TT=0x1a. 
The kernel sends + a SIGSEGV to the task that resulted in this trap with the following + info:: + + siginfo.si_signo = SIGSEGV; + siginfo.errno = 0; + siginfo.si_code = SEGV_ADIPERR; + siginfo.si_addr = addr; /* address that caused trap */ + siginfo.si_trapno = 0; + + NOTE: + ADI tag mismatch on a load always results in precise trap. + + +MCD disabled +------------ + + When a task has not enabled ADI and attempts to set ADI version + on a memory address, processor sends an MCD disabled trap. This + trap is handled by hypervisor first and the hypervisor vectors this + trap through to the kernel as Data Access Exception trap with + fault type set to 0xa (invalid ASI). When this occurs, the kernel + sends the task SIGSEGV signal with following info:: + + siginfo.si_signo = SIGSEGV; + siginfo.errno = 0; + siginfo.si_code = SEGV_ACCADI; + siginfo.si_addr = addr; /* address that caused trap */ + siginfo.si_trapno = 0; + + +Sample program to use ADI +------------------------- + +Following sample program is meant to illustrate how to use the ADI +functionality:: + + #include + #include + #include + #include + #include + #include + #include + #include + + #ifndef AT_ADI_BLKSZ + #define AT_ADI_BLKSZ 48 + #endif + #ifndef AT_ADI_NBITS + #define AT_ADI_NBITS 49 + #endif + + #ifndef PROT_ADI + #define PROT_ADI 0x10 + #endif + + #define BUFFER_SIZE 32*1024*1024UL + + main(int argc, char* argv[], char* envp[]) + { + unsigned long i, mcde, adi_blksz, adi_nbits; + char *shmaddr, *tmp_addr, *end, *veraddr, *clraddr; + int shmid, version; + Elf64_auxv_t *auxv; + + adi_blksz = 0; + + while(*envp++ != NULL); + for (auxv = (Elf64_auxv_t *)envp; auxv->a_type != AT_NULL; auxv++) { + switch (auxv->a_type) { + case AT_ADI_BLKSZ: + adi_blksz = auxv->a_un.a_val; + break; + case AT_ADI_NBITS: + adi_nbits = auxv->a_un.a_val; + break; + } + } + if (adi_blksz == 0) { + fprintf(stderr, "Oops! 
ADI is not supported\n"); + exit(1); + } + + printf("ADI capabilities:\n"); + printf("\tBlock size = %ld\n", adi_blksz); + printf("\tNumber of bits = %ld\n", adi_nbits); + + if ((shmid = shmget(2, BUFFER_SIZE, + IPC_CREAT | SHM_R | SHM_W)) < 0) { + perror("shmget failed"); + exit(1); + } + + shmaddr = shmat(shmid, NULL, 0); + if (shmaddr == (char *)-1) { + perror("shm attach failed"); + shmctl(shmid, IPC_RMID, NULL); + exit(1); + } + + if (mprotect(shmaddr, BUFFER_SIZE, PROT_READ|PROT_WRITE|PROT_ADI)) { + perror("mprotect failed"); + goto err_out; + } + + /* Set the ADI version tag on the shm segment + */ + version = 10; + tmp_addr = shmaddr; + end = shmaddr + BUFFER_SIZE; + while (tmp_addr < end) { + asm volatile( + "stxa %1, [%0]0x90\n\t" + : + : "r" (tmp_addr), "r" (version)); + tmp_addr += adi_blksz; + } + asm volatile("membar #Sync\n\t"); + + /* Create a versioned address from the normal address by placing + * version tag in the upper adi_nbits bits + */ + tmp_addr = (void *) ((unsigned long)shmaddr << adi_nbits); + tmp_addr = (void *) ((unsigned long)tmp_addr >> adi_nbits); + veraddr = (void *) (((unsigned long)version << (64-adi_nbits)) + | (unsigned long)tmp_addr); + + printf("Starting the writes:\n"); + for (i = 0; i < BUFFER_SIZE; i++) { + veraddr[i] = (char)(i); + if (!(i % (1024 * 1024))) + printf("."); + } + printf("\n"); + + printf("Verifying data..."); + fflush(stdout); + for (i = 0; i < BUFFER_SIZE; i++) + if (veraddr[i] != (char)i) + printf("\nIndex %lu mismatched\n", i); + printf("Done.\n"); + + /* Disable ADI and clean up + */ + if (mprotect(shmaddr, BUFFER_SIZE, PROT_READ|PROT_WRITE)) { + perror("mprotect failed"); + goto err_out; + } + + if (shmdt((const void *)shmaddr) != 0) + perror("Detach failure"); + shmctl(shmid, IPC_RMID, NULL); + + exit(0); + + err_out: + if (shmdt((const void *)shmaddr) != 0) + perror("Detach failure"); + shmctl(shmid, IPC_RMID, NULL); + exit(1); + } diff --git a/Documentation/sparc/adi.txt b/Documentation/sparc/adi.txt deleted file mode 100644 index e1aed155fb89..000000000000 --- a/Documentation/sparc/adi.txt +++ /dev/null @@ -1,278 +0,0 @@ -Application Data Integrity (ADI) -================================ - -SPARC M7 processor adds the Application Data Integrity (ADI) feature. -ADI allows a task to set version tags on any subset of its address -space. Once ADI is enabled and version tags are set for ranges of -address space of a task, the processor will compare the tag in pointers -to memory in these ranges to the version set by the application -previously. Access to memory is granted only if the tag in given pointer -matches the tag set by the application. In case of mismatch, processor -raises an exception. - -Following steps must be taken by a task to enable ADI fully: - -1. Set the user mode PSTATE.mcde bit. This acts as master switch for - the task's entire address space to enable/disable ADI for the task. - -2. Set TTE.mcd bit on any TLB entries that correspond to the range of - addresses ADI is being enabled on. MMU checks the version tag only - on the pages that have TTE.mcd bit set. - -3. Set the version tag for virtual addresses using stxa instruction - and one of the MCD specific ASIs. Each stxa instruction sets the - given tag for one ADI block size number of bytes. This step must - be repeated for entire page to set tags for entire page. - -ADI block size for the platform is provided by the hypervisor to kernel -in machine description tables. 
Hypervisor also provides the number of -top bits in the virtual address that specify the version tag. Once -version tag has been set for a memory location, the tag is stored in the -physical memory and the same tag must be present in the ADI version tag -bits of the virtual address being presented to the MMU. For example on -SPARC M7 processor, MMU uses bits 63-60 for version tags and ADI block -size is same as cacheline size which is 64 bytes. A task that sets ADI -version to, say 10, on a range of memory, must access that memory using -virtual addresses that contain 0xa in bits 63-60. - -ADI is enabled on a set of pages using mprotect() with PROT_ADI flag. -When ADI is enabled on a set of pages by a task for the first time, -kernel sets the PSTATE.mcde bit fot the task. Version tags for memory -addresses are set with an stxa instruction on the addresses using -ASI_MCD_PRIMARY or ASI_MCD_ST_BLKINIT_PRIMARY. ADI block size is -provided by the hypervisor to the kernel. Kernel returns the value of -ADI block size to userspace using auxiliary vector along with other ADI -info. Following auxiliary vectors are provided by the kernel: - - AT_ADI_BLKSZ ADI block size. This is the granularity and - alignment, in bytes, of ADI versioning. - AT_ADI_NBITS Number of ADI version bits in the VA - - -IMPORTANT NOTES: - -- Version tag values of 0x0 and 0xf are reserved. These values match any - tag in virtual address and never generate a mismatch exception. - -- Version tags are set on virtual addresses from userspace even though - tags are stored in physical memory. Tags are set on a physical page - after it has been allocated to a task and a pte has been created for - it. - -- When a task frees a memory page it had set version tags on, the page - goes back to free page pool. When this page is re-allocated to a task, - kernel clears the page using block initialization ASI which clears the - version tags as well for the page. If a page allocated to a task is - freed and allocated back to the same task, old version tags set by the - task on that page will no longer be present. - -- ADI tag mismatches are not detected for non-faulting loads. - -- Kernel does not set any tags for user pages and it is entirely a - task's responsibility to set any version tags. Kernel does ensure the - version tags are preserved if a page is swapped out to the disk and - swapped back in. It also preserves that version tags if a page is - migrated. - -- ADI works for any size pages. A userspace task need not be aware of - page size when using ADI. It can simply select a virtual address - range, enable ADI on the range using mprotect() and set version tags - for the entire range. mprotect() ensures range is aligned to page size - and is a multiple of page size. - -- ADI tags can only be set on writable memory. For example, ADI tags can - not be set on read-only mappings. - - - -ADI related traps ------------------ - -With ADI enabled, following new traps may occur: - -Disrupting memory corruption - - When a store accesses a memory localtion that has TTE.mcd=1, - the task is running with ADI enabled (PSTATE.mcde=1), and the ADI - tag in the address used (bits 63:60) does not match the tag set on - the corresponding cacheline, a memory corruption trap occurs. By - default, it is a disrupting trap and is sent to the hypervisor - first. Hypervisor creates a sun4v error report and sends a - resumable error (TT=0x7e) trap to the kernel. 
The kernel sends - a SIGSEGV to the task that resulted in this trap with the following - info: - - siginfo.si_signo = SIGSEGV; - siginfo.errno = 0; - siginfo.si_code = SEGV_ADIDERR; - siginfo.si_addr = addr; /* PC where first mismatch occurred */ - siginfo.si_trapno = 0; - - -Precise memory corruption - - When a store accesses a memory location that has TTE.mcd=1, - the task is running with ADI enabled (PSTATE.mcde=1), and the ADI - tag in the address used (bits 63:60) does not match the tag set on - the corresponding cacheline, a memory corruption trap occurs. If - MCD precise exception is enabled (MCDPERR=1), a precise - exception is sent to the kernel with TT=0x1a. The kernel sends - a SIGSEGV to the task that resulted in this trap with the following - info: - - siginfo.si_signo = SIGSEGV; - siginfo.errno = 0; - siginfo.si_code = SEGV_ADIPERR; - siginfo.si_addr = addr; /* address that caused trap */ - siginfo.si_trapno = 0; - - NOTE: ADI tag mismatch on a load always results in precise trap. - - -MCD disabled - - When a task has not enabled ADI and attempts to set ADI version - on a memory address, processor sends an MCD disabled trap. This - trap is handled by hypervisor first and the hypervisor vectors this - trap through to the kernel as Data Access Exception trap with - fault type set to 0xa (invalid ASI). When this occurs, the kernel - sends the task SIGSEGV signal with following info: - - siginfo.si_signo = SIGSEGV; - siginfo.errno = 0; - siginfo.si_code = SEGV_ACCADI; - siginfo.si_addr = addr; /* address that caused trap */ - siginfo.si_trapno = 0; - - -Sample program to use ADI -------------------------- - -Following sample program is meant to illustrate how to use the ADI -functionality. - -#include -#include -#include -#include -#include -#include -#include -#include - -#ifndef AT_ADI_BLKSZ -#define AT_ADI_BLKSZ 48 -#endif -#ifndef AT_ADI_NBITS -#define AT_ADI_NBITS 49 -#endif - -#ifndef PROT_ADI -#define PROT_ADI 0x10 -#endif - -#define BUFFER_SIZE 32*1024*1024UL - -main(int argc, char* argv[], char* envp[]) -{ - unsigned long i, mcde, adi_blksz, adi_nbits; - char *shmaddr, *tmp_addr, *end, *veraddr, *clraddr; - int shmid, version; - Elf64_auxv_t *auxv; - - adi_blksz = 0; - - while(*envp++ != NULL); - for (auxv = (Elf64_auxv_t *)envp; auxv->a_type != AT_NULL; auxv++) { - switch (auxv->a_type) { - case AT_ADI_BLKSZ: - adi_blksz = auxv->a_un.a_val; - break; - case AT_ADI_NBITS: - adi_nbits = auxv->a_un.a_val; - break; - } - } - if (adi_blksz == 0) { - fprintf(stderr, "Oops! 
ADI is not supported\n"); - exit(1); - } - - printf("ADI capabilities:\n"); - printf("\tBlock size = %ld\n", adi_blksz); - printf("\tNumber of bits = %ld\n", adi_nbits); - - if ((shmid = shmget(2, BUFFER_SIZE, - IPC_CREAT | SHM_R | SHM_W)) < 0) { - perror("shmget failed"); - exit(1); - } - - shmaddr = shmat(shmid, NULL, 0); - if (shmaddr == (char *)-1) { - perror("shm attach failed"); - shmctl(shmid, IPC_RMID, NULL); - exit(1); - } - - if (mprotect(shmaddr, BUFFER_SIZE, PROT_READ|PROT_WRITE|PROT_ADI)) { - perror("mprotect failed"); - goto err_out; - } - - /* Set the ADI version tag on the shm segment - */ - version = 10; - tmp_addr = shmaddr; - end = shmaddr + BUFFER_SIZE; - while (tmp_addr < end) { - asm volatile( - "stxa %1, [%0]0x90\n\t" - : - : "r" (tmp_addr), "r" (version)); - tmp_addr += adi_blksz; - } - asm volatile("membar #Sync\n\t"); - - /* Create a versioned address from the normal address by placing - * version tag in the upper adi_nbits bits - */ - tmp_addr = (void *) ((unsigned long)shmaddr << adi_nbits); - tmp_addr = (void *) ((unsigned long)tmp_addr >> adi_nbits); - veraddr = (void *) (((unsigned long)version << (64-adi_nbits)) - | (unsigned long)tmp_addr); - - printf("Starting the writes:\n"); - for (i = 0; i < BUFFER_SIZE; i++) { - veraddr[i] = (char)(i); - if (!(i % (1024 * 1024))) - printf("."); - } - printf("\n"); - - printf("Verifying data..."); - fflush(stdout); - for (i = 0; i < BUFFER_SIZE; i++) - if (veraddr[i] != (char)i) - printf("\nIndex %lu mismatched\n", i); - printf("Done.\n"); - - /* Disable ADI and clean up - */ - if (mprotect(shmaddr, BUFFER_SIZE, PROT_READ|PROT_WRITE)) { - perror("mprotect failed"); - goto err_out; - } - - if (shmdt((const void *)shmaddr) != 0) - perror("Detach failure"); - shmctl(shmid, IPC_RMID, NULL); - - exit(0); - -err_out: - if (shmdt((const void *)shmaddr) != 0) - perror("Detach failure"); - shmctl(shmid, IPC_RMID, NULL); - exit(1); -} diff --git a/Documentation/sparc/console.rst b/Documentation/sparc/console.rst new file mode 100644 index 000000000000..73132db83ece --- /dev/null +++ b/Documentation/sparc/console.rst @@ -0,0 +1,9 @@ +Steps for sending 'break' on sunhv console +========================================== + +On Baremetal: + 1. press Esc + 'B' + +On LDOM: + 1. press Ctrl + ']' + 2. telnet> send break diff --git a/Documentation/sparc/console.txt b/Documentation/sparc/console.txt deleted file mode 100644 index 5aa735a44e02..000000000000 --- a/Documentation/sparc/console.txt +++ /dev/null @@ -1,9 +0,0 @@ -Steps for sending 'break' on sunhv console: -=========================================== - -On Baremetal: - 1. press Esc + 'B' - -On LDOM: - 1. press Ctrl + ']' - 2. telnet> send break diff --git a/Documentation/sparc/index.rst b/Documentation/sparc/index.rst new file mode 100644 index 000000000000..91f7d6643dd5 --- /dev/null +++ b/Documentation/sparc/index.rst @@ -0,0 +1,13 @@ +:orphan: + +================== +Sparc Architecture +================== + +.. 
toctree:: + :maxdepth: 1 + + console + adi + + oradax/oracle-dax diff --git a/Documentation/sparc/oradax/oracle-dax.rst b/Documentation/sparc/oradax/oracle-dax.rst new file mode 100644 index 000000000000..d1e14d572918 --- /dev/null +++ b/Documentation/sparc/oradax/oracle-dax.rst @@ -0,0 +1,445 @@ +======================================= +Oracle Data Analytics Accelerator (DAX) +======================================= + +DAX is a coprocessor which resides on the SPARC M7 (DAX1) and M8 +(DAX2) processor chips, and has direct access to the CPU's L3 caches +as well as physical memory. It can perform several operations on data +streams with various input and output formats. A driver provides a +transport mechanism and has limited knowledge of the various opcodes +and data formats. A user space library provides high level services +and translates these into low level commands which are then passed +into the driver and subsequently the Hypervisor and the coprocessor. +The library is the recommended way for applications to use the +coprocessor, and the driver interface is not intended for general use. +This document describes the general flow of the driver, its +structures, and its programmatic interface. It also provides example +code sufficient to write user or kernel applications that use DAX +functionality. + +The user library is open source and available at: + + https://oss.oracle.com/git/gitweb.cgi?p=libdax.git + +The Hypervisor interface to the coprocessor is described in detail in +the accompanying document, dax-hv-api.txt, which is a plain text +excerpt of the (Oracle internal) "UltraSPARC Virtual Machine +Specification" version 3.0.20+15, dated 2017-09-25. + + +High Level Overview +=================== + +A coprocessor request is described by a Command Control Block +(CCB). The CCB contains an opcode and various parameters. The opcode +specifies what operation is to be done, and the parameters specify +options, flags, sizes, and addresses. The CCB (or an array of CCBs) +is passed to the Hypervisor, which handles queueing and scheduling of +requests to the available coprocessor execution units. A status code +returned indicates if the request was submitted successfully or if +there was an error. One of the addresses given in each CCB is a +pointer to a "completion area", which is a 128 byte memory block that +is written by the coprocessor to provide execution status. No +interrupt is generated upon completion; the completion area must be +polled by software to find out when a transaction has finished, but +the M7 and later processors provide a mechanism to pause the virtual +processor until the completion status has been updated by the +coprocessor. This is done using the monitored load and mwait +instructions, which are described in more detail later. The DAX +coprocessor was designed so that after a request is submitted, the +kernel is no longer involved in the processing of it. The polling is +done at the user level, which results in almost zero latency between +completion of a request and resumption of execution of the requesting +thread. + + +Addressing Memory +================= + +The kernel does not have access to physical memory in the Sun4v +architecture, as there is an additional level of memory virtualization +present. This intermediate level is called "real" memory, and the +kernel treats this as if it were physical. 
The Hypervisor handles the +translations between real memory and physical so that each logical +domain (LDOM) can have a partition of physical memory that is isolated +from that of other LDOMs. When the kernel sets up a virtual mapping, +it specifies a virtual address and the real address to which it should +be mapped. + +The DAX coprocessor can only operate on physical memory, so before a +request can be fed to the coprocessor, all the addresses in a CCB must +be converted into physical addresses. The kernel cannot do this since +it has no visibility into physical addresses. So a CCB may contain +either the virtual or real addresses of the buffers or a combination +of them. An "address type" field is available for each address that +may be given in the CCB. In all cases, the Hypervisor will translate +all the addresses to physical before dispatching to hardware. Address +translations are performed using the context of the process initiating +the request. + + +The Driver API +============== + +An application makes requests to the driver via the write() system +call, and gets results (if any) via read(). The completion areas are +made accessible via mmap(), and are read-only for the application. + +The request may either be an immediate command or an array of CCBs to +be submitted to the hardware. + +Each open instance of the device is exclusive to the thread that +opened it, and must be used by that thread for all subsequent +operations. The driver open function creates a new context for the +thread and initializes it for use. This context contains pointers and +values used internally by the driver to keep track of submitted +requests. The completion area buffer is also allocated, and this is +large enough to contain the completion areas for many concurrent +requests. When the device is closed, any outstanding transactions are +flushed and the context is cleaned up. + +On a DAX1 system (M7), the device will be called "oradax1", while on a +DAX2 system (M8) it will be "oradax2". If an application requires one +or the other, it should simply attempt to open the appropriate +device. Only one of the devices will exist on any given system, so the +name can be used to determine what the platform supports. + +The immediate commands are CCB_DEQUEUE, CCB_KILL, and CCB_INFO. For +all of these, success is indicated by a return value from write() +equal to the number of bytes given in the call. Otherwise -1 is +returned and errno is set. + +CCB_DEQUEUE +----------- + +Tells the driver to clean up resources associated with past +requests. Since no interrupt is generated upon the completion of a +request, the driver must be told when it may reclaim resources. No +further status information is returned, so the user should not +subsequently call read(). + +CCB_KILL +-------- + +Kills a CCB during execution. The CCB is guaranteed to not continue +executing once this call returns successfully. On success, read() must +be called to retrieve the result of the action. + +CCB_INFO +-------- + +Retrieves information about a currently executing CCB. Note that some +Hypervisors might return 'notfound' when the CCB is in 'inprogress' +state. To ensure a CCB in the 'notfound' state will never be executed, +CCB_KILL must be invoked on that CCB. Upon success, read() must be +called to retrieve the details of the action. + +Submission of an array of CCBs for execution +--------------------------------------------- + +A write() whose length is a multiple of the CCB size is treated as a +submit operation. 
The file offset is treated as the index of the +completion area to use, and may be set via lseek() or using the +pwrite() system call. If -1 is returned then errno is set to indicate +the error. Otherwise, the return value is the length of the array that +was actually accepted by the coprocessor. If the accepted length is +equal to the requested length, then the submission was completely +successful and there is no further status needed; hence, the user +should not subsequently call read(). Partial acceptance of the CCB +array is indicated by a return value less than the requested length, +and read() must be called to retrieve further status information. The +status will reflect the error caused by the first CCB that was not +accepted, and status_data will provide additional data in some cases. + +MMAP +---- + +The mmap() function provides access to the completion area allocated +in the driver. Note that the completion area is not writeable by the +user process, and the mmap call must not specify PROT_WRITE. + + +Completion of a Request +======================= + +The first byte in each completion area is the command status which is +updated by the coprocessor hardware. Software may take advantage of +new M7/M8 processor capabilities to efficiently poll this status byte. +First, a "monitored load" is achieved via a Load from Alternate Space +(ldxa, lduba, etc.) with ASI 0x84 (ASI_MONITOR_PRIMARY). Second, a +"monitored wait" is achieved via the mwait instruction (a write to +%asr28). This instruction is like pause in that it suspends execution +of the virtual processor for the given number of nanoseconds, but in +addition will terminate early when one of several events occur. If the +block of data containing the monitored location is modified, then the +mwait terminates. This causes software to resume execution immediately +(without a context switch or kernel to user transition) after a +transaction completes. Thus the latency between transaction completion +and resumption of execution may be just a few nanoseconds. + + +Application Life Cycle of a DAX Submission +========================================== + + - open dax device + - call mmap() to get the completion area address + - allocate a CCB and fill in the opcode, flags, parameters, addresses, etc. + - submit CCB via write() or pwrite() + - go into a loop executing monitored load + monitored wait and + terminate when the command status indicates the request is complete + (CCB_KILL or CCB_INFO may be used any time as necessary) + - perform a CCB_DEQUEUE + - call munmap() for completion area + - close the dax device + + +Memory Constraints +================== + +The DAX hardware operates only on physical addresses. Therefore, it is +not aware of virtual memory mappings and the discontiguities that may +exist in the physical memory that a virtual buffer maps to. There is +no I/O TLB or any scatter/gather mechanism. All buffers, whether input +or output, must reside in a physically contiguous region of memory. + +The Hypervisor translates all addresses within a CCB to physical +before handing off the CCB to DAX. The Hypervisor determines the +virtual page size for each virtual address given, and uses this to +program a size limit for each address. This prevents the coprocessor +from reading or writing beyond the bound of the virtual page, even +though it is accessing physical memory directly. A simpler way of +saying this is that a DAX operation will never "cross" a virtual page +boundary. 
If an 8k virtual page is used, then the data is strictly +limited to 8k. If a user's buffer is larger than 8k, then a larger +page size must be used, or the transaction size will be truncated to +8k. + +Huge pages. A user may allocate huge pages using standard interfaces. +Memory buffers residing on huge pages may be used to achieve much +larger DAX transaction sizes, but the rules must still be followed, +and no transaction will cross a page boundary, even a huge page. A +major caveat is that Linux on Sparc presents 8Mb as one of the huge +page sizes. Sparc does not actually provide a 8Mb hardware page size, +and this size is synthesized by pasting together two 4Mb pages. The +reasons for this are historical, and it creates an issue because only +half of this 8Mb page can actually be used for any given buffer in a +DAX request, and it must be either the first half or the second half; +it cannot be a 4Mb chunk in the middle, since that crosses a +(hardware) page boundary. Note that this entire issue may be hidden by +higher level libraries. + + +CCB Structure +------------- +A CCB is an array of 8 64-bit words. Several of these words provide +command opcodes, parameters, flags, etc., and the rest are addresses +for the completion area, output buffer, and various inputs:: + + struct ccb { + u64 control; + u64 completion; + u64 input0; + u64 access; + u64 input1; + u64 op_data; + u64 output; + u64 table; + }; + +See libdax/common/sys/dax1/dax1_ccb.h for a detailed description of +each of these fields, and see dax-hv-api.txt for a complete description +of the Hypervisor API available to the guest OS (ie, Linux kernel). + +The first word (control) is examined by the driver for the following: + - CCB version, which must be consistent with hardware version + - Opcode, which must be one of the documented allowable commands + - Address types, which must be set to "virtual" for all the addresses + given by the user, thereby ensuring that the application can + only access memory that it owns + + +Example Code +============ + +The DAX is accessible to both user and kernel code. The kernel code +can make hypercalls directly while the user code must use wrappers +provided by the driver. The setup of the CCB is nearly identical for +both; the only difference is in preparation of the completion area. An +example of user code is given now, with kernel code afterwards. + +In order to program using the driver API, the file +arch/sparc/include/uapi/asm/oradax.h must be included. + +First, the proper device must be opened. For M7 it will be +/dev/oradax1 and for M8 it will be /dev/oradax2. The simplest +procedure is to attempt to open both, as only one will succeed:: + + fd = open("/dev/oradax1", O_RDWR); + if (fd < 0) + fd = open("/dev/oradax2", O_RDWR); + if (fd < 0) + /* No DAX found */ + +Next, the completion area must be mapped:: + + completion_area = mmap(NULL, DAX_MMAP_LEN, PROT_READ, MAP_SHARED, fd, 0); + +All input and output buffers must be fully contained in one hardware +page, since as explained above, the DAX is strictly constrained by +virtual page boundaries. In addition, the output buffer must be +64-byte aligned and its size must be a multiple of 64 bytes because +the coprocessor writes in units of cache lines. + +This example demonstrates the DAX Scan command, which takes as input a +vector and a match value, and produces a bitmap as the output. For +each input element that matches the value, the corresponding bit is +set in the output. 
+ +In this example, the input vector consists of a series of single bits, +and the match value is 0. So each 0 bit in the input will produce a 1 +in the output, and vice versa, which produces an output bitmap which +is the input bitmap inverted. + +For details of all the parameters and bits used in this CCB, please +refer to section 36.2.1.3 of the DAX Hypervisor API document, which +describes the Scan command in detail:: + + ccb->control = /* Table 36.1, CCB Header Format */ + (2L << 48) /* command = Scan Value */ + | (3L << 40) /* output address type = primary virtual */ + | (3L << 34) /* primary input address type = primary virtual */ + /* Section 36.2.1, Query CCB Command Formats */ + | (1 << 28) /* 36.2.1.1.1 primary input format = fixed width bit packed */ + | (0 << 23) /* 36.2.1.1.2 primary input element size = 0 (1 bit) */ + | (8 << 10) /* 36.2.1.1.6 output format = bit vector */ + | (0 << 5) /* 36.2.1.3 First scan criteria size = 0 (1 byte) */ + | (31 << 0); /* 36.2.1.3 Disable second scan criteria */ + + ccb->completion = 0; /* Completion area address, to be filled in by driver */ + + ccb->input0 = (unsigned long) input; /* primary input address */ + + ccb->access = /* Section 36.2.1.2, Data Access Control */ + (2 << 24) /* Primary input length format = bits */ + | (nbits - 1); /* number of bits in primary input stream, minus 1 */ + + ccb->input1 = 0; /* secondary input address, unused */ + + ccb->op_data = 0; /* scan criteria (value to be matched) */ + + ccb->output = (unsigned long) output; /* output address */ + + ccb->table = 0; /* table address, unused */ + +The CCB submission is a write() or pwrite() system call to the +driver. If the call fails, then a read() must be used to retrieve the +status:: + + if (pwrite(fd, ccb, 64, 0) != 64) { + struct ccb_exec_result status; + read(fd, &status, sizeof(status)); + /* bail out */ + } + +After a successful submission of the CCB, the completion area may be +polled to determine when the DAX is finished. Detailed information on +the contents of the completion area can be found in section 36.2.2 of +the DAX HV API document:: + + while (1) { + /* Monitored Load */ + __asm__ __volatile__("lduba [%1] 0x84, %0\n" + : "=r" (status) + : "r" (completion_area)); + + if (status) /* 0 indicates command in progress */ + break; + + /* MWAIT */ + __asm__ __volatile__("wr %%g0, 1000, %%asr28\n" ::); /* 1000 ns */ + } + +A completion area status of 1 indicates successful completion of the +CCB and validity of the output bitmap, which may be used immediately. +All other non-zero values indicate error conditions which are +described in section 36.2.2:: + + if (completion_area[0] != 1) { /* section 36.2.2, 1 = command ran and succeeded */ + /* completion_area[0] contains the completion status */ + /* completion_area[1] contains an error code, see 36.2.2 */ + } + +After the completion area has been processed, the driver must be +notified that it can release any resources associated with the +request. This is done via the dequeue operation:: + + struct dax_command cmd; + cmd.command = CCB_DEQUEUE; + if (write(fd, &cmd, sizeof(cmd)) != sizeof(cmd)) { + /* bail out */ + } + +Finally, normal program cleanup should be done, i.e., unmapping +completion area, closing the dax device, freeing memory etc. + +Kernel example +-------------- + +The only difference in using the DAX in kernel code is the treatment +of the completion area. 
Unlike user applications which mmap the +completion area allocated by the driver, kernel code must allocate its +own memory to use for the completion area, and this address and its +type must be given in the CCB:: + + ccb->control |= /* Table 36.1, CCB Header Format */ + (3L << 32); /* completion area address type = primary virtual */ + + ccb->completion = (unsigned long) completion_area; /* Completion area address */ + +The dax submit hypercall is made directly. The flags used in the +ccb_submit call are documented in the DAX HV API in section 36.3.1.

::

 #include

 hv_rv = sun4v_ccb_submit((unsigned long)ccb, 64,
 HV_CCB_QUERY_CMD |
 HV_CCB_ARG0_PRIVILEGED | HV_CCB_ARG0_TYPE_PRIMARY |
 HV_CCB_VA_PRIVILEGED,
 0, &bytes_accepted, &status_data);

 if (hv_rv != HV_EOK) {
 /* hv_rv is an error code, status_data contains */
 /* potential additional status, see 36.3.1.1 */
 }

After the submission, the completion area polling code is identical to
that in user land::

 while (1) {
 /* Monitored Load */
 __asm__ __volatile__("lduba [%1] 0x84, %0\n"
 : "=r" (status)
 : "r" (completion_area));

 if (status) /* 0 indicates command in progress */
 break;

 /* MWAIT */
 __asm__ __volatile__("wr %%g0, 1000, %%asr28\n" ::); /* 1000 ns */
 }

 if (completion_area[0] != 1) { /* section 36.2.2, 1 = command ran and succeeded */
 /* completion_area[0] contains the completion status */
 /* completion_area[1] contains an error code, see 36.2.2 */
 }

The output bitmap is ready for consumption immediately after the
completion status indicates success.

Excerpt from UltraSPARC Virtual Machine Specification
=====================================================

 .. include:: dax-hv-api.txt
 :literal: diff --git a/Documentation/sparc/oradax/oracle-dax.txt b/Documentation/sparc/oradax/oracle-dax.txt deleted file mode 100644 index 9d53ac93286f..000000000000 --- a/Documentation/sparc/oradax/oracle-dax.txt +++ /dev/null @@ -1,429 +0,0 @@ -Oracle Data Analytics Accelerator (DAX) ---------------------------------------- - -DAX is a coprocessor which resides on the SPARC M7 (DAX1) and M8 -(DAX2) processor chips, and has direct access to the CPU's L3 caches -as well as physical memory. It can perform several operations on data -streams with various input and output formats. A driver provides a -transport mechanism and has limited knowledge of the various opcodes -and data formats. A user space library provides high level services -and translates these into low level commands which are then passed -into the driver and subsequently the Hypervisor and the coprocessor. -The library is the recommended way for applications to use the -coprocessor, and the driver interface is not intended for general use. -This document describes the general flow of the driver, its -structures, and its programmatic interface. It also provides example -code sufficient to write user or kernel applications that use DAX -functionality. - -The user library is open source and available at: - https://oss.oracle.com/git/gitweb.cgi?p=libdax.git - -The Hypervisor interface to the coprocessor is described in detail in -the accompanying document, dax-hv-api.txt, which is a plain text -excerpt of the (Oracle internal) "UltraSPARC Virtual Machine -Specification" version 3.0.20+15, dated 2017-09-25. - - -High Level Overview ------------------- - -A coprocessor request is described by a Command Control Block -(CCB). The CCB contains an opcode and various parameters.
The opcode -specifies what operation is to be done, and the parameters specify -options, flags, sizes, and addresses. The CCB (or an array of CCBs) -is passed to the Hypervisor, which handles queueing and scheduling of -requests to the available coprocessor execution units. A status code -returned indicates if the request was submitted successfully or if -there was an error. One of the addresses given in each CCB is a -pointer to a "completion area", which is a 128 byte memory block that -is written by the coprocessor to provide execution status. No -interrupt is generated upon completion; the completion area must be -polled by software to find out when a transaction has finished, but -the M7 and later processors provide a mechanism to pause the virtual -processor until the completion status has been updated by the -coprocessor. This is done using the monitored load and mwait -instructions, which are described in more detail later. The DAX -coprocessor was designed so that after a request is submitted, the -kernel is no longer involved in the processing of it. The polling is -done at the user level, which results in almost zero latency between -completion of a request and resumption of execution of the requesting -thread. - - -Addressing Memory ------------------ - -The kernel does not have access to physical memory in the Sun4v -architecture, as there is an additional level of memory virtualization -present. This intermediate level is called "real" memory, and the -kernel treats this as if it were physical. The Hypervisor handles the -translations between real memory and physical so that each logical -domain (LDOM) can have a partition of physical memory that is isolated -from that of other LDOMs. When the kernel sets up a virtual mapping, -it specifies a virtual address and the real address to which it should -be mapped. - -The DAX coprocessor can only operate on physical memory, so before a -request can be fed to the coprocessor, all the addresses in a CCB must -be converted into physical addresses. The kernel cannot do this since -it has no visibility into physical addresses. So a CCB may contain -either the virtual or real addresses of the buffers or a combination -of them. An "address type" field is available for each address that -may be given in the CCB. In all cases, the Hypervisor will translate -all the addresses to physical before dispatching to hardware. Address -translations are performed using the context of the process initiating -the request. - - -The Driver API --------------- - -An application makes requests to the driver via the write() system -call, and gets results (if any) via read(). The completion areas are -made accessible via mmap(), and are read-only for the application. - -The request may either be an immediate command or an array of CCBs to -be submitted to the hardware. - -Each open instance of the device is exclusive to the thread that -opened it, and must be used by that thread for all subsequent -operations. The driver open function creates a new context for the -thread and initializes it for use. This context contains pointers and -values used internally by the driver to keep track of submitted -requests. The completion area buffer is also allocated, and this is -large enough to contain the completion areas for many concurrent -requests. When the device is closed, any outstanding transactions are -flushed and the context is cleaned up. - -On a DAX1 system (M7), the device will be called "oradax1", while on a -DAX2 system (M8) it will be "oradax2". 
If an application requires one -or the other, it should simply attempt to open the appropriate -device. Only one of the devices will exist on any given system, so the -name can be used to determine what the platform supports. - -The immediate commands are CCB_DEQUEUE, CCB_KILL, and CCB_INFO. For -all of these, success is indicated by a return value from write() -equal to the number of bytes given in the call. Otherwise -1 is -returned and errno is set. - -CCB_DEQUEUE - -Tells the driver to clean up resources associated with past -requests. Since no interrupt is generated upon the completion of a -request, the driver must be told when it may reclaim resources. No -further status information is returned, so the user should not -subsequently call read(). - -CCB_KILL - -Kills a CCB during execution. The CCB is guaranteed to not continue -executing once this call returns successfully. On success, read() must -be called to retrieve the result of the action. - -CCB_INFO - -Retrieves information about a currently executing CCB. Note that some -Hypervisors might return 'notfound' when the CCB is in 'inprogress' -state. To ensure a CCB in the 'notfound' state will never be executed, -CCB_KILL must be invoked on that CCB. Upon success, read() must be -called to retrieve the details of the action. - -Submission of an array of CCBs for execution - -A write() whose length is a multiple of the CCB size is treated as a -submit operation. The file offset is treated as the index of the -completion area to use, and may be set via lseek() or using the -pwrite() system call. If -1 is returned then errno is set to indicate -the error. Otherwise, the return value is the length of the array that -was actually accepted by the coprocessor. If the accepted length is -equal to the requested length, then the submission was completely -successful and there is no further status needed; hence, the user -should not subsequently call read(). Partial acceptance of the CCB -array is indicated by a return value less than the requested length, -and read() must be called to retrieve further status information. The -status will reflect the error caused by the first CCB that was not -accepted, and status_data will provide additional data in some cases. - -MMAP - -The mmap() function provides access to the completion area allocated -in the driver. Note that the completion area is not writeable by the -user process, and the mmap call must not specify PROT_WRITE. - - -Completion of a Request ------------------------ - -The first byte in each completion area is the command status which is -updated by the coprocessor hardware. Software may take advantage of -new M7/M8 processor capabilities to efficiently poll this status byte. -First, a "monitored load" is achieved via a Load from Alternate Space -(ldxa, lduba, etc.) with ASI 0x84 (ASI_MONITOR_PRIMARY). Second, a -"monitored wait" is achieved via the mwait instruction (a write to -%asr28). This instruction is like pause in that it suspends execution -of the virtual processor for the given number of nanoseconds, but in -addition will terminate early when one of several events occur. If the -block of data containing the monitored location is modified, then the -mwait terminates. This causes software to resume execution immediately -(without a context switch or kernel to user transition) after a -transaction completes. Thus the latency between transaction completion -and resumption of execution may be just a few nanoseconds. 
- - -Application Life Cycle of a DAX Submission ------------------------------------------- - - - open dax device - - call mmap() to get the completion area address - - allocate a CCB and fill in the opcode, flags, parameters, addresses, etc. - - submit CCB via write() or pwrite() - - go into a loop executing monitored load + monitored wait and - terminate when the command status indicates the request is complete - (CCB_KILL or CCB_INFO may be used any time as necessary) - - perform a CCB_DEQUEUE - - call munmap() for completion area - - close the dax device - - -Memory Constraints ------------------- - -The DAX hardware operates only on physical addresses. Therefore, it is -not aware of virtual memory mappings and the discontiguities that may -exist in the physical memory that a virtual buffer maps to. There is -no I/O TLB or any scatter/gather mechanism. All buffers, whether input -or output, must reside in a physically contiguous region of memory. - -The Hypervisor translates all addresses within a CCB to physical -before handing off the CCB to DAX. The Hypervisor determines the -virtual page size for each virtual address given, and uses this to -program a size limit for each address. This prevents the coprocessor -from reading or writing beyond the bound of the virtual page, even -though it is accessing physical memory directly. A simpler way of -saying this is that a DAX operation will never "cross" a virtual page -boundary. If an 8k virtual page is used, then the data is strictly -limited to 8k. If a user's buffer is larger than 8k, then a larger -page size must be used, or the transaction size will be truncated to -8k. - -Huge pages. A user may allocate huge pages using standard interfaces. -Memory buffers residing on huge pages may be used to achieve much -larger DAX transaction sizes, but the rules must still be followed, -and no transaction will cross a page boundary, even a huge page. A -major caveat is that Linux on Sparc presents 8Mb as one of the huge -page sizes. Sparc does not actually provide a 8Mb hardware page size, -and this size is synthesized by pasting together two 4Mb pages. The -reasons for this are historical, and it creates an issue because only -half of this 8Mb page can actually be used for any given buffer in a -DAX request, and it must be either the first half or the second half; -it cannot be a 4Mb chunk in the middle, since that crosses a -(hardware) page boundary. Note that this entire issue may be hidden by -higher level libraries. - - -CCB Structure -------------- -A CCB is an array of 8 64-bit words. Several of these words provide -command opcodes, parameters, flags, etc., and the rest are addresses -for the completion area, output buffer, and various inputs: - - struct ccb { - u64 control; - u64 completion; - u64 input0; - u64 access; - u64 input1; - u64 op_data; - u64 output; - u64 table; - }; - -See libdax/common/sys/dax1/dax1_ccb.h for a detailed description of -each of these fields, and see dax-hv-api.txt for a complete description -of the Hypervisor API available to the guest OS (ie, Linux kernel). 
- -The first word (control) is examined by the driver for the following: - - CCB version, which must be consistent with hardware version - - Opcode, which must be one of the documented allowable commands - - Address types, which must be set to "virtual" for all the addresses - given by the user, thereby ensuring that the application can - only access memory that it owns - - -Example Code ------------- - -The DAX is accessible to both user and kernel code. The kernel code -can make hypercalls directly while the user code must use wrappers -provided by the driver. The setup of the CCB is nearly identical for -both; the only difference is in preparation of the completion area. An -example of user code is given now, with kernel code afterwards. - -In order to program using the driver API, the file -arch/sparc/include/uapi/asm/oradax.h must be included. - -First, the proper device must be opened. For M7 it will be -/dev/oradax1 and for M8 it will be /dev/oradax2. The simplest -procedure is to attempt to open both, as only one will succeed: - - fd = open("/dev/oradax1", O_RDWR); - if (fd < 0) - fd = open("/dev/oradax2", O_RDWR); - if (fd < 0) - /* No DAX found */ - -Next, the completion area must be mapped: - - completion_area = mmap(NULL, DAX_MMAP_LEN, PROT_READ, MAP_SHARED, fd, 0); - -All input and output buffers must be fully contained in one hardware -page, since as explained above, the DAX is strictly constrained by -virtual page boundaries. In addition, the output buffer must be -64-byte aligned and its size must be a multiple of 64 bytes because -the coprocessor writes in units of cache lines. - -This example demonstrates the DAX Scan command, which takes as input a -vector and a match value, and produces a bitmap as the output. For -each input element that matches the value, the corresponding bit is -set in the output. - -In this example, the input vector consists of a series of single bits, -and the match value is 0. So each 0 bit in the input will produce a 1 -in the output, and vice versa, which produces an output bitmap which -is the input bitmap inverted. - -For details of all the parameters and bits used in this CCB, please -refer to section 36.2.1.3 of the DAX Hypervisor API document, which -describes the Scan command in detail. - - ccb->control = /* Table 36.1, CCB Header Format */ - (2L << 48) /* command = Scan Value */ - | (3L << 40) /* output address type = primary virtual */ - | (3L << 34) /* primary input address type = primary virtual */ - /* Section 36.2.1, Query CCB Command Formats */ - | (1 << 28) /* 36.2.1.1.1 primary input format = fixed width bit packed */ - | (0 << 23) /* 36.2.1.1.2 primary input element size = 0 (1 bit) */ - | (8 << 10) /* 36.2.1.1.6 output format = bit vector */ - | (0 << 5) /* 36.2.1.3 First scan criteria size = 0 (1 byte) */ - | (31 << 0); /* 36.2.1.3 Disable second scan criteria */ - - ccb->completion = 0; /* Completion area address, to be filled in by driver */ - - ccb->input0 = (unsigned long) input; /* primary input address */ - - ccb->access = /* Section 36.2.1.2, Data Access Control */ - (2 << 24) /* Primary input length format = bits */ - | (nbits - 1); /* number of bits in primary input stream, minus 1 */ - - ccb->input1 = 0; /* secondary input address, unused */ - - ccb->op_data = 0; /* scan criteria (value to be matched) */ - - ccb->output = (unsigned long) output; /* output address */ - - ccb->table = 0; /* table address, unused */ - -The CCB submission is a write() or pwrite() system call to the -driver. 
If the call fails, then a read() must be used to retrieve the -status: - - if (pwrite(fd, ccb, 64, 0) != 64) { - struct ccb_exec_result status; - read(fd, &status, sizeof(status)); - /* bail out */ - } - -After a successful submission of the CCB, the completion area may be -polled to determine when the DAX is finished. Detailed information on -the contents of the completion area can be found in section 36.2.2 of -the DAX HV API document. - - while (1) { - /* Monitored Load */ - __asm__ __volatile__("lduba [%1] 0x84, %0\n" - : "=r" (status) - : "r" (completion_area)); - - if (status) /* 0 indicates command in progress */ - break; - - /* MWAIT */ - __asm__ __volatile__("wr %%g0, 1000, %%asr28\n" ::); /* 1000 ns */ - } - -A completion area status of 1 indicates successful completion of the -CCB and validity of the output bitmap, which may be used immediately. -All other non-zero values indicate error conditions which are -described in section 36.2.2. - - if (completion_area[0] != 1) { /* section 36.2.2, 1 = command ran and succeeded */ - /* completion_area[0] contains the completion status */ - /* completion_area[1] contains an error code, see 36.2.2 */ - } - -After the completion area has been processed, the driver must be -notified that it can release any resources associated with the -request. This is done via the dequeue operation: - - struct dax_command cmd; - cmd.command = CCB_DEQUEUE; - if (write(fd, &cmd, sizeof(cmd)) != sizeof(cmd)) { - /* bail out */ - } - -Finally, normal program cleanup should be done, i.e., unmapping -completion area, closing the dax device, freeing memory etc. - -[Kernel example] - -The only difference in using the DAX in kernel code is the treatment -of the completion area. Unlike user applications which mmap the -completion area allocated by the driver, kernel code must allocate its -own memory to use for the completion area, and this address and its -type must be given in the CCB: - - ccb->control |= /* Table 36.1, CCB Header Format */ - (3L << 32); /* completion area address type = primary virtual */ - - ccb->completion = (unsigned long) completion_area; /* Completion area address */ - -The dax submit hypercall is made directly. The flags used in the -ccb_submit call are documented in the DAX HV API in section 36.3.1. - -#include - - hv_rv = sun4v_ccb_submit((unsigned long)ccb, 64, - HV_CCB_QUERY_CMD | - HV_CCB_ARG0_PRIVILEGED | HV_CCB_ARG0_TYPE_PRIMARY | - HV_CCB_VA_PRIVILEGED, - 0, &bytes_accepted, &status_data); - - if (hv_rv != HV_EOK) { - /* hv_rv is an error code, status_data contains */ - /* potential additional status, see 36.3.1.1 */ - } - -After the submission, the completion area polling code is identical to -that in user land: - - while (1) { - /* Monitored Load */ - __asm__ __volatile__("lduba [%1] 0x84, %0\n" - : "=r" (status) - : "r" (completion_area)); - - if (status) /* 0 indicates command in progress */ - break; - - /* MWAIT */ - __asm__ __volatile__("wr %%g0, 1000, %%asr28\n" ::); /* 1000 ns */ - } - - if (completion_area[0] != 1) { /* section 36.2.2, 1 = command ran and succeeded */ - /* completion_area[0] contains the completion status */ - /* completion_area[1] contains an error code, see 36.2.2 */ - } - -The output bitmap is ready for consumption immediately after the -completion status indicates success. 
diff --git a/drivers/sbus/char/oradax.c b/drivers/sbus/char/oradax.c index 6516bc3cb58b..acd9ba40eabe 100644 --- a/drivers/sbus/char/oradax.c +++ b/drivers/sbus/char/oradax.c @@ -30,7 +30,7 @@ * the recommended way for applications to use the coprocessor, and * the driver interface is not intended for general use. * - * See Documentation/sparc/oradax/oracle-dax.txt for more details. + * See Documentation/sparc/oradax/oracle-dax.rst for more details. */ #include -- cgit v1.2.3 From bc0025b6107c011e8f9411a275d8442a56bd573a Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Tue, 8 Jan 2019 10:13:56 -0600 Subject: sparc: use struct_size() in kzalloc() One of the more common cases of allocation size calculations is finding the size of a structure that has a zero-sized array at the end, along with memory for some number of elements for that array. For example: struct foo { int stuff; void *entry[]; }; instance = kzalloc(sizeof(struct foo) + sizeof(void *) * count, GFP_KERNEL); Instead of leaving these open-coded and prone to type mistakes, we can now use the new struct_size() helper: instance = kzalloc(struct_size(instance, entry, count), GFP_KERNEL); This code was detected with the help of Coccinelle. Signed-off-by: Gustavo A. R. Silva Signed-off-by: David S. Miller --- arch/sparc/kernel/cpumap.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/sparc/kernel/cpumap.c b/arch/sparc/kernel/cpumap.c index d1d52822603d..1cb62bfeaa1f 100644 --- a/arch/sparc/kernel/cpumap.c +++ b/arch/sparc/kernel/cpumap.c @@ -194,8 +194,7 @@ static struct cpuinfo_tree *build_cpuinfo_tree(void) n = enumerate_cpuinfo_nodes(tmp_level); - new_tree = kzalloc(sizeof(struct cpuinfo_tree) + - (sizeof(struct cpuinfo_node) * n), GFP_ATOMIC); + new_tree = kzalloc(struct_size(new_tree, nodes, n), GFP_ATOMIC); if (!new_tree) return NULL; -- cgit v1.2.3 From f4d9a23d3dad0252f375901bf4ff6523a2c97241 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Tue, 12 Feb 2019 11:32:36 +0200 Subject: sparc64: simplify reduce_memory() function The reduce_memory() function clamps the available memory to a limit defined by the "mem=" command line parameter. It takes into account the amount of already reserved memory and excludes it from the limit calculations. Rather than traverse memblocks and remove them by hand, use memblock_reserved_size() to account for the reserved memory and memblock_enforce_memory_limit() to clamp the available memory. Signed-off-by: Mike Rapoport Acked-by: David S. Miller Signed-off-by: David S.
Miller --- arch/sparc/mm/init_64.c | 42 ++---------------------------------------- 1 file changed, 2 insertions(+), 40 deletions(-) diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c index f2d70ff7a284..bc2aaa47bc8a 100644 --- a/arch/sparc/mm/init_64.c +++ b/arch/sparc/mm/init_64.c @@ -2269,19 +2269,6 @@ static unsigned long last_valid_pfn; static void sun4u_pgprot_init(void); static void sun4v_pgprot_init(void); -static phys_addr_t __init available_memory(void) -{ - phys_addr_t available = 0ULL; - phys_addr_t pa_start, pa_end; - u64 i; - - for_each_free_mem_range(i, NUMA_NO_NODE, MEMBLOCK_NONE, &pa_start, - &pa_end, NULL) - available = available + (pa_end - pa_start); - - return available; -} - #define _PAGE_CACHE_4U (_PAGE_CP_4U | _PAGE_CV_4U) #define _PAGE_CACHE_4V (_PAGE_CP_4V | _PAGE_CV_4V) #define __DIRTY_BITS_4U (_PAGE_MODIFIED_4U | _PAGE_WRITE_4U | _PAGE_W_4U) @@ -2295,33 +2282,8 @@ static phys_addr_t __init available_memory(void) */ static void __init reduce_memory(phys_addr_t limit_ram) { - phys_addr_t avail_ram = available_memory(); - phys_addr_t pa_start, pa_end; - u64 i; - - if (limit_ram >= avail_ram) - return; - - for_each_free_mem_range(i, NUMA_NO_NODE, MEMBLOCK_NONE, &pa_start, - &pa_end, NULL) { - phys_addr_t region_size = pa_end - pa_start; - phys_addr_t clip_start = pa_start; - - avail_ram = avail_ram - region_size; - /* Are we consuming too much? */ - if (avail_ram < limit_ram) { - phys_addr_t give_back = limit_ram - avail_ram; - - region_size = region_size - give_back; - clip_start = clip_start + give_back; - } - - memblock_remove(clip_start, region_size); - - if (avail_ram <= limit_ram) - break; - i = 0UL; - } + limit_ram += memblock_reserved_size(); + memblock_enforce_memory_limit(limit_ram); } void __init paging_init(void) -- cgit v1.2.3
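As a rough illustration of the arithmetic behind adding the reserved size
back before clamping, consider the following sketch; the figures are
invented for this example and are not taken from any real machine:

    /* Hypothetical boot with "mem=16G" on a machine whose early
     * reservations (kernel image, initrd, firmware tables) total 512M.
     */
    phys_addr_t limit_ram = 16UL << 30;         /* from "mem=" */

    limit_ram += memblock_reserved_size();      /* + 512M -> 16.5G */
    memblock_enforce_memory_limit(limit_ram);   /* memblock drops everything
                                                 * above the lowest 16.5G */

    /* The 512M of reservations still come out of that 16.5G, so roughly
     * 16G is left for general use, which matches the result the old loop
     * achieved by clipping only the free memory ranges.
     */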