diff options
Diffstat (limited to 'arch/tile/lib')
-rw-r--r-- | arch/tile/lib/atomic_asm_32.S | 2 | ||||
-rw-r--r-- | arch/tile/lib/cacheflush.c | 18 | ||||
-rw-r--r-- | arch/tile/lib/memchr_64.c | 71 | ||||
-rw-r--r-- | arch/tile/lib/memcpy_64.c | 220 | ||||
-rw-r--r-- | arch/tile/lib/memcpy_user_64.c | 86 | ||||
-rw-r--r-- | arch/tile/lib/memset_64.c | 145 | ||||
-rw-r--r-- | arch/tile/lib/spinlock_64.c | 104 | ||||
-rw-r--r-- | arch/tile/lib/strchr_64.c | 67 | ||||
-rw-r--r-- | arch/tile/lib/strlen_64.c | 38 | ||||
-rw-r--r-- | arch/tile/lib/usercopy_64.S | 196 |
10 files changed, 946 insertions, 1 deletions
diff --git a/arch/tile/lib/atomic_asm_32.S b/arch/tile/lib/atomic_asm_32.S index 82f64cc63658..24448734f6f1 100644 --- a/arch/tile/lib/atomic_asm_32.S +++ b/arch/tile/lib/atomic_asm_32.S @@ -59,7 +59,7 @@ * bad kernel addresses). * * Note that if the value we would store is the same as what we - * loaded, we bypass the load. Other platforms with true atomics can + * loaded, we bypass the store. Other platforms with true atomics can * make the guarantee that a non-atomic __clear_bit(), for example, * can safely race with an atomic test_and_set_bit(); this example is * from bit_spinlock.h in slub_lock() / slub_unlock(). We can't do diff --git a/arch/tile/lib/cacheflush.c b/arch/tile/lib/cacheflush.c index 35c1d8ca5f38..8928aace7a64 100644 --- a/arch/tile/lib/cacheflush.c +++ b/arch/tile/lib/cacheflush.c @@ -15,6 +15,7 @@ #include <asm/page.h> #include <asm/cacheflush.h> #include <arch/icache.h> +#include <arch/spr_def.h> void __flush_icache_range(unsigned long start, unsigned long end) @@ -39,6 +40,18 @@ void finv_buffer_remote(void *buffer, size_t size, int hfh) char *p, *base; size_t step_size, load_count; const unsigned long STRIPE_WIDTH = 8192; +#ifdef __tilegx__ + /* + * On TILE-Gx, we must disable the dstream prefetcher before doing + * a cache flush; otherwise, we could end up with data in the cache + * that we don't want there. Note that normally we'd do an mf + * after the SPR write to disabling the prefetcher, but we do one + * below, before any further loads, so there's no need to do it + * here. + */ + uint_reg_t old_dstream_pf = __insn_mfspr(SPR_DSTREAM_PF); + __insn_mtspr(SPR_DSTREAM_PF, 0); +#endif /* * Flush and invalidate the buffer out of the local L1/L2 @@ -122,4 +135,9 @@ void finv_buffer_remote(void *buffer, size_t size, int hfh) /* Wait for the load+inv's (and thus finvs) to have completed. */ __insn_mf(); + +#ifdef __tilegx__ + /* Reenable the prefetcher. */ + __insn_mtspr(SPR_DSTREAM_PF, old_dstream_pf); +#endif } diff --git a/arch/tile/lib/memchr_64.c b/arch/tile/lib/memchr_64.c new file mode 100644 index 000000000000..84fdc8d8e735 --- /dev/null +++ b/arch/tile/lib/memchr_64.c @@ -0,0 +1,71 @@ +/* + * Copyright 2011 Tilera Corporation. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation, version 2. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT. See the GNU General Public License for + * more details. + */ + +#include <linux/types.h> +#include <linux/string.h> +#include <linux/module.h> + +void *memchr(const void *s, int c, size_t n) +{ + const uint64_t *last_word_ptr; + const uint64_t *p; + const char *last_byte_ptr; + uintptr_t s_int; + uint64_t goal, before_mask, v, bits; + char *ret; + + if (__builtin_expect(n == 0, 0)) { + /* Don't dereference any memory if the array is empty. */ + return NULL; + } + + /* Get an aligned pointer. */ + s_int = (uintptr_t) s; + p = (const uint64_t *)(s_int & -8); + + /* Create eight copies of the byte for which we are looking. */ + goal = 0x0101010101010101ULL * (uint8_t) c; + + /* Read the first word, but munge it so that bytes before the array + * will not match goal. + * + * Note that this shift count expression works because we know + * shift counts are taken mod 64. + */ + before_mask = (1ULL << (s_int << 3)) - 1; + v = (*p | before_mask) ^ (goal & before_mask); + + /* Compute the address of the last byte. */ + last_byte_ptr = (const char *)s + n - 1; + + /* Compute the address of the word containing the last byte. */ + last_word_ptr = (const uint64_t *)((uintptr_t) last_byte_ptr & -8); + + while ((bits = __insn_v1cmpeq(v, goal)) == 0) { + if (__builtin_expect(p == last_word_ptr, 0)) { + /* We already read the last word in the array, + * so give up. + */ + return NULL; + } + v = *++p; + } + + /* We found a match, but it might be in a byte past the end + * of the array. + */ + ret = ((char *)p) + (__insn_ctz(bits) >> 3); + return (ret <= last_byte_ptr) ? ret : NULL; +} +EXPORT_SYMBOL(memchr); diff --git a/arch/tile/lib/memcpy_64.c b/arch/tile/lib/memcpy_64.c new file mode 100644 index 000000000000..3fab9a6a2bbe --- /dev/null +++ b/arch/tile/lib/memcpy_64.c @@ -0,0 +1,220 @@ +/* + * Copyright 2011 Tilera Corporation. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation, version 2. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT. See the GNU General Public License for + * more details. + */ + +#include <linux/types.h> +#include <linux/string.h> +#include <linux/module.h> +#define __memcpy memcpy +/* EXPORT_SYMBOL() is in arch/tile/lib/exports.c since this should be asm. */ + +/* Must be 8 bytes in size. */ +#define word_t uint64_t + +#if CHIP_L2_LINE_SIZE() != 64 && CHIP_L2_LINE_SIZE() != 128 +#error "Assumes 64 or 128 byte line size" +#endif + +/* How many cache lines ahead should we prefetch? */ +#define PREFETCH_LINES_AHEAD 3 + +/* + * Provide "base versions" of load and store for the normal code path. + * The kernel provides other versions for userspace copies. + */ +#define ST(p, v) (*(p) = (v)) +#define LD(p) (*(p)) + +#ifndef USERCOPY_FUNC +#define ST1 ST +#define ST2 ST +#define ST4 ST +#define ST8 ST +#define LD1 LD +#define LD2 LD +#define LD4 LD +#define LD8 LD +#define RETVAL dstv +void *memcpy(void *__restrict dstv, const void *__restrict srcv, size_t n) +#else +/* + * Special kernel version will provide implementation of the LDn/STn + * macros to return a count of uncopied bytes due to mm fault. + */ +#define RETVAL 0 +int USERCOPY_FUNC(void *__restrict dstv, const void *__restrict srcv, size_t n) +#endif +{ + char *__restrict dst1 = (char *)dstv; + const char *__restrict src1 = (const char *)srcv; + const char *__restrict src1_end; + const char *__restrict prefetch; + word_t *__restrict dst8; /* 8-byte pointer to destination memory. */ + word_t final; /* Final bytes to write to trailing word, if any */ + long i; + + if (n < 16) { + for (; n; n--) + ST1(dst1++, LD1(src1++)); + return RETVAL; + } + + /* + * Locate the end of source memory we will copy. Don't + * prefetch past this. + */ + src1_end = src1 + n - 1; + + /* Prefetch ahead a few cache lines, but not past the end. */ + prefetch = src1; + for (i = 0; i < PREFETCH_LINES_AHEAD; i++) { + __insn_prefetch(prefetch); + prefetch += CHIP_L2_LINE_SIZE(); + prefetch = (prefetch > src1_end) ? prefetch : src1; + } + + /* Copy bytes until dst is word-aligned. */ + for (; (uintptr_t)dst1 & (sizeof(word_t) - 1); n--) + ST1(dst1++, LD1(src1++)); + + /* 8-byte pointer to destination memory. */ + dst8 = (word_t *)dst1; + + if (__builtin_expect((uintptr_t)src1 & (sizeof(word_t) - 1), 0)) { + /* + * Misaligned copy. Copy 8 bytes at a time, but don't + * bother with other fanciness. + * + * TODO: Consider prefetching and using wh64 as well. + */ + + /* Create an aligned src8. */ + const word_t *__restrict src8 = + (const word_t *)((uintptr_t)src1 & -sizeof(word_t)); + word_t b; + + word_t a = LD8(src8++); + for (; n >= sizeof(word_t); n -= sizeof(word_t)) { + b = LD8(src8++); + a = __insn_dblalign(a, b, src1); + ST8(dst8++, a); + a = b; + } + + if (n == 0) + return RETVAL; + + b = ((const char *)src8 <= src1_end) ? *src8 : 0; + + /* + * Final source bytes to write to trailing partial + * word, if any. + */ + final = __insn_dblalign(a, b, src1); + } else { + /* Aligned copy. */ + + const word_t* __restrict src8 = (const word_t *)src1; + + /* src8 and dst8 are both word-aligned. */ + if (n >= CHIP_L2_LINE_SIZE()) { + /* Copy until 'dst' is cache-line-aligned. */ + for (; (uintptr_t)dst8 & (CHIP_L2_LINE_SIZE() - 1); + n -= sizeof(word_t)) + ST8(dst8++, LD8(src8++)); + + for (; n >= CHIP_L2_LINE_SIZE(); ) { + __insn_wh64(dst8); + + /* + * Prefetch and advance to next line + * to prefetch, but don't go past the end + */ + __insn_prefetch(prefetch); + prefetch += CHIP_L2_LINE_SIZE(); + prefetch = (prefetch > src1_end) ? prefetch : + (const char *)src8; + + /* + * Copy an entire cache line. Manually + * unrolled to avoid idiosyncracies of + * compiler unrolling. + */ +#define COPY_WORD(offset) ({ ST8(dst8+offset, LD8(src8+offset)); n -= 8; }) + COPY_WORD(0); + COPY_WORD(1); + COPY_WORD(2); + COPY_WORD(3); + COPY_WORD(4); + COPY_WORD(5); + COPY_WORD(6); + COPY_WORD(7); +#if CHIP_L2_LINE_SIZE() == 128 + COPY_WORD(8); + COPY_WORD(9); + COPY_WORD(10); + COPY_WORD(11); + COPY_WORD(12); + COPY_WORD(13); + COPY_WORD(14); + COPY_WORD(15); +#elif CHIP_L2_LINE_SIZE() != 64 +# error Fix code that assumes particular L2 cache line sizes +#endif + + dst8 += CHIP_L2_LINE_SIZE() / sizeof(word_t); + src8 += CHIP_L2_LINE_SIZE() / sizeof(word_t); + } + } + + for (; n >= sizeof(word_t); n -= sizeof(word_t)) + ST8(dst8++, LD8(src8++)); + + if (__builtin_expect(n == 0, 1)) + return RETVAL; + + final = LD8(src8); + } + + /* n != 0 if we get here. Write out any trailing bytes. */ + dst1 = (char *)dst8; + if (n & 4) { + ST4((uint32_t *)dst1, final); + dst1 += 4; + final >>= 32; + n &= 3; + } + if (n & 2) { + ST2((uint16_t *)dst1, final); + dst1 += 2; + final >>= 16; + n &= 1; + } + if (n) + ST1((uint8_t *)dst1, final); + + return RETVAL; +} + + +#ifdef USERCOPY_FUNC +#undef ST1 +#undef ST2 +#undef ST4 +#undef ST8 +#undef LD1 +#undef LD2 +#undef LD4 +#undef LD8 +#undef USERCOPY_FUNC +#endif diff --git a/arch/tile/lib/memcpy_user_64.c b/arch/tile/lib/memcpy_user_64.c new file mode 100644 index 000000000000..4763b3aff1cc --- /dev/null +++ b/arch/tile/lib/memcpy_user_64.c @@ -0,0 +1,86 @@ +/* + * Copyright 2011 Tilera Corporation. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation, version 2. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT. See the GNU General Public License for + * more details. + * + * Do memcpy(), but trap and return "n" when a load or store faults. + * + * Note: this idiom only works when memcpy() compiles to a leaf function. + * If "sp" is updated during memcpy, the "jrp lr" will be incorrect. + * + * Also note that we are capturing "n" from the containing scope here. + */ + +#define _ST(p, inst, v) \ + ({ \ + asm("1: " #inst " %0, %1;" \ + ".pushsection .coldtext.memcpy,\"ax\";" \ + "2: { move r0, %2; jrp lr };" \ + ".section __ex_table,\"a\";" \ + ".quad 1b, 2b;" \ + ".popsection" \ + : "=m" (*(p)) : "r" (v), "r" (n)); \ + }) + +#define _LD(p, inst) \ + ({ \ + unsigned long __v; \ + asm("1: " #inst " %0, %1;" \ + ".pushsection .coldtext.memcpy,\"ax\";" \ + "2: { move r0, %2; jrp lr };" \ + ".section __ex_table,\"a\";" \ + ".quad 1b, 2b;" \ + ".popsection" \ + : "=r" (__v) : "m" (*(p)), "r" (n)); \ + __v; \ + }) + +#define USERCOPY_FUNC __copy_to_user_inatomic +#define ST1(p, v) _ST((p), st1, (v)) +#define ST2(p, v) _ST((p), st2, (v)) +#define ST4(p, v) _ST((p), st4, (v)) +#define ST8(p, v) _ST((p), st, (v)) +#define LD1 LD +#define LD2 LD +#define LD4 LD +#define LD8 LD +#include "memcpy_64.c" + +#define USERCOPY_FUNC __copy_from_user_inatomic +#define ST1 ST +#define ST2 ST +#define ST4 ST +#define ST8 ST +#define LD1(p) _LD((p), ld1u) +#define LD2(p) _LD((p), ld2u) +#define LD4(p) _LD((p), ld4u) +#define LD8(p) _LD((p), ld) +#include "memcpy_64.c" + +#define USERCOPY_FUNC __copy_in_user_inatomic +#define ST1(p, v) _ST((p), st1, (v)) +#define ST2(p, v) _ST((p), st2, (v)) +#define ST4(p, v) _ST((p), st4, (v)) +#define ST8(p, v) _ST((p), st, (v)) +#define LD1(p) _LD((p), ld1u) +#define LD2(p) _LD((p), ld2u) +#define LD4(p) _LD((p), ld4u) +#define LD8(p) _LD((p), ld) +#include "memcpy_64.c" + +unsigned long __copy_from_user_zeroing(void *to, const void __user *from, + unsigned long n) +{ + unsigned long rc = __copy_from_user_inatomic(to, from, n); + if (unlikely(rc)) + memset(to + n - rc, 0, rc); + return rc; +} diff --git a/arch/tile/lib/memset_64.c b/arch/tile/lib/memset_64.c new file mode 100644 index 000000000000..3873085711d5 --- /dev/null +++ b/arch/tile/lib/memset_64.c @@ -0,0 +1,145 @@ +/* + * Copyright 2011 Tilera Corporation. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation, version 2. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT. See the GNU General Public License for + * more details. + */ + +#include <arch/chip.h> + +#include <linux/types.h> +#include <linux/string.h> +#include <linux/module.h> + +#undef memset + +void *memset(void *s, int c, size_t n) +{ + uint64_t *out64; + int n64, to_align64; + uint64_t v64; + uint8_t *out8 = s; + + /* Experimentation shows that a trivial tight loop is a win up until + * around a size of 20, where writing a word at a time starts to win. + */ +#define BYTE_CUTOFF 20 + +#if BYTE_CUTOFF < 7 + /* This must be at least at least this big, or some code later + * on doesn't work. + */ +#error "BYTE_CUTOFF is too small" +#endif + + if (n < BYTE_CUTOFF) { + /* Strangely, this turns out to be the tightest way to + * write this loop. + */ + if (n != 0) { + do { + /* Strangely, combining these into one line + * performs worse. + */ + *out8 = c; + out8++; + } while (--n != 0); + } + + return s; + } + + /* Align 'out8'. We know n >= 7 so this won't write past the end. */ + while (((uintptr_t) out8 & 7) != 0) { + *out8++ = c; + --n; + } + + /* Align 'n'. */ + while (n & 7) + out8[--n] = c; + + out64 = (uint64_t *) out8; + n64 = n >> 3; + + /* Tile input byte out to 64 bits. */ + /* KLUDGE */ + v64 = 0x0101010101010101ULL * (uint8_t)c; + + /* This must be at least 8 or the following loop doesn't work. */ +#define CACHE_LINE_SIZE_IN_DOUBLEWORDS (CHIP_L2_LINE_SIZE() / 8) + + /* Determine how many words we need to emit before the 'out32' + * pointer becomes aligned modulo the cache line size. + */ + to_align64 = (-((uintptr_t)out64 >> 3)) & + (CACHE_LINE_SIZE_IN_DOUBLEWORDS - 1); + + /* Only bother aligning and using wh64 if there is at least + * one full cache line to process. This check also prevents + * overrunning the end of the buffer with alignment words. + */ + if (to_align64 <= n64 - CACHE_LINE_SIZE_IN_DOUBLEWORDS) { + int lines_left; + + /* Align out64 mod the cache line size so we can use wh64. */ + n64 -= to_align64; + for (; to_align64 != 0; to_align64--) { + *out64 = v64; + out64++; + } + + /* Use unsigned divide to turn this into a right shift. */ + lines_left = (unsigned)n64 / CACHE_LINE_SIZE_IN_DOUBLEWORDS; + + do { + /* Only wh64 a few lines at a time, so we don't + * exceed the maximum number of victim lines. + */ + int x = ((lines_left < CHIP_MAX_OUTSTANDING_VICTIMS()) + ? lines_left + : CHIP_MAX_OUTSTANDING_VICTIMS()); + uint64_t *wh = out64; + int i = x; + int j; + + lines_left -= x; + + do { + __insn_wh64(wh); + wh += CACHE_LINE_SIZE_IN_DOUBLEWORDS; + } while (--i); + + for (j = x * (CACHE_LINE_SIZE_IN_DOUBLEWORDS / 4); + j != 0; j--) { + *out64++ = v64; + *out64++ = v64; + *out64++ = v64; + *out64++ = v64; + } + } while (lines_left != 0); + + /* We processed all full lines above, so only this many + * words remain to be processed. + */ + n64 &= CACHE_LINE_SIZE_IN_DOUBLEWORDS - 1; + } + + /* Now handle any leftover values. */ + if (n64 != 0) { + do { + *out64 = v64; + out64++; + } while (--n64 != 0); + } + + return s; +} +EXPORT_SYMBOL(memset); diff --git a/arch/tile/lib/spinlock_64.c b/arch/tile/lib/spinlock_64.c new file mode 100644 index 000000000000..d6fb9581e980 --- /dev/null +++ b/arch/tile/lib/spinlock_64.c @@ -0,0 +1,104 @@ +/* + * Copyright 2011 Tilera Corporation. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation, version 2. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT. See the GNU General Public License for + * more details. + */ + +#include <linux/spinlock.h> +#include <linux/module.h> +#include <asm/processor.h> + +#include "spinlock_common.h" + +/* + * Read the spinlock value without allocating in our cache and without + * causing an invalidation to another cpu with a copy of the cacheline. + * This is important when we are spinning waiting for the lock. + */ +static inline u32 arch_spin_read_noalloc(void *lock) +{ + return atomic_cmpxchg((atomic_t *)lock, -1, -1); +} + +/* + * Wait until the high bits (current) match my ticket. + * If we notice the overflow bit set on entry, we clear it. + */ +void arch_spin_lock_slow(arch_spinlock_t *lock, u32 my_ticket) +{ + if (unlikely(my_ticket & __ARCH_SPIN_NEXT_OVERFLOW)) { + __insn_fetchand4(&lock->lock, ~__ARCH_SPIN_NEXT_OVERFLOW); + my_ticket &= ~__ARCH_SPIN_NEXT_OVERFLOW; + } + + for (;;) { + u32 val = arch_spin_read_noalloc(lock); + u32 delta = my_ticket - arch_spin_current(val); + if (delta == 0) + return; + relax((128 / CYCLES_PER_RELAX_LOOP) * delta); + } +} +EXPORT_SYMBOL(arch_spin_lock_slow); + +/* + * Check the lock to see if it is plausible, and try to get it with cmpxchg(). + */ +int arch_spin_trylock(arch_spinlock_t *lock) +{ + u32 val = arch_spin_read_noalloc(lock); + if (unlikely(arch_spin_current(val) != arch_spin_next(val))) + return 0; + return cmpxchg(&lock->lock, val, (val + 1) & ~__ARCH_SPIN_NEXT_OVERFLOW) + == val; +} +EXPORT_SYMBOL(arch_spin_trylock); + +void arch_spin_unlock_wait(arch_spinlock_t *lock) +{ + u32 iterations = 0; + while (arch_spin_is_locked(lock)) + delay_backoff(iterations++); +} +EXPORT_SYMBOL(arch_spin_unlock_wait); + +/* + * If the read lock fails due to a writer, we retry periodically + * until the value is positive and we write our incremented reader count. + */ +void __read_lock_failed(arch_rwlock_t *rw) +{ + u32 val; + int iterations = 0; + do { + delay_backoff(iterations++); + val = __insn_fetchaddgez4(&rw->lock, 1); + } while (unlikely(arch_write_val_locked(val))); +} +EXPORT_SYMBOL(__read_lock_failed); + +/* + * If we failed because there were readers, clear the "writer" bit + * so we don't block additional readers. Otherwise, there was another + * writer anyway, so our "fetchor" made no difference. Then wait, + * issuing periodic fetchor instructions, till we get the lock. + */ +void __write_lock_failed(arch_rwlock_t *rw, u32 val) +{ + int iterations = 0; + do { + if (!arch_write_val_locked(val)) + val = __insn_fetchand4(&rw->lock, ~__WRITE_LOCK_BIT); + delay_backoff(iterations++); + val = __insn_fetchor4(&rw->lock, __WRITE_LOCK_BIT); + } while (val != 0); +} +EXPORT_SYMBOL(__write_lock_failed); diff --git a/arch/tile/lib/strchr_64.c b/arch/tile/lib/strchr_64.c new file mode 100644 index 000000000000..617a9273aaa8 --- /dev/null +++ b/arch/tile/lib/strchr_64.c @@ -0,0 +1,67 @@ +/* + * Copyright 2011 Tilera Corporation. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation, version 2. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT. See the GNU General Public License for + * more details. + */ + +#include <linux/types.h> +#include <linux/string.h> +#include <linux/module.h> + +#undef strchr + +char *strchr(const char *s, int c) +{ + int z, g; + + /* Get an aligned pointer. */ + const uintptr_t s_int = (uintptr_t) s; + const uint64_t *p = (const uint64_t *)(s_int & -8); + + /* Create eight copies of the byte for which we are looking. */ + const uint64_t goal = 0x0101010101010101ULL * (uint8_t) c; + + /* Read the first aligned word, but force bytes before the string to + * match neither zero nor goal (we make sure the high bit of each + * byte is 1, and the low 7 bits are all the opposite of the goal + * byte). + * + * Note that this shift count expression works because we know shift + * counts are taken mod 64. + */ + const uint64_t before_mask = (1ULL << (s_int << 3)) - 1; + uint64_t v = (*p | before_mask) ^ + (goal & __insn_v1shrsi(before_mask, 1)); + + uint64_t zero_matches, goal_matches; + while (1) { + /* Look for a terminating '\0'. */ + zero_matches = __insn_v1cmpeqi(v, 0); + + /* Look for the goal byte. */ + goal_matches = __insn_v1cmpeq(v, goal); + + if (__builtin_expect((zero_matches | goal_matches) != 0, 0)) + break; + + v = *++p; + } + + z = __insn_ctz(zero_matches); + g = __insn_ctz(goal_matches); + + /* If we found c before '\0' we got a match. Note that if c == '\0' + * then g == z, and we correctly return the address of the '\0' + * rather than NULL. + */ + return (g <= z) ? ((char *)p) + (g >> 3) : NULL; +} +EXPORT_SYMBOL(strchr); diff --git a/arch/tile/lib/strlen_64.c b/arch/tile/lib/strlen_64.c new file mode 100644 index 000000000000..1c92d46202a8 --- /dev/null +++ b/arch/tile/lib/strlen_64.c @@ -0,0 +1,38 @@ +/* + * Copyright 2011 Tilera Corporation. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation, version 2. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT. See the GNU General Public License for + * more details. + */ + +#include <linux/types.h> +#include <linux/string.h> +#include <linux/module.h> + +#undef strlen + +size_t strlen(const char *s) +{ + /* Get an aligned pointer. */ + const uintptr_t s_int = (uintptr_t) s; + const uint64_t *p = (const uint64_t *)(s_int & -8); + + /* Read the first word, but force bytes before the string to be nonzero. + * This expression works because we know shift counts are taken mod 64. + */ + uint64_t v = *p | ((1ULL << (s_int << 3)) - 1); + + uint64_t bits; + while ((bits = __insn_v1cmpeqi(v, 0)) == 0) + v = *++p; + + return ((const char *)p) + (__insn_ctz(bits) >> 3) - s; +} +EXPORT_SYMBOL(strlen); diff --git a/arch/tile/lib/usercopy_64.S b/arch/tile/lib/usercopy_64.S new file mode 100644 index 000000000000..2ff44f87b78e --- /dev/null +++ b/arch/tile/lib/usercopy_64.S @@ -0,0 +1,196 @@ +/* + * Copyright 2011 Tilera Corporation. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation, version 2. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT. See the GNU General Public License for + * more details. + */ + +#include <linux/linkage.h> +#include <asm/errno.h> +#include <asm/cache.h> +#include <arch/chip.h> + +/* Access user memory, but use MMU to avoid propagating kernel exceptions. */ + + .pushsection .fixup,"ax" + +get_user_fault: + { movei r1, -EFAULT; move r0, zero } + jrp lr + ENDPROC(get_user_fault) + +put_user_fault: + { movei r0, -EFAULT; jrp lr } + ENDPROC(put_user_fault) + + .popsection + +/* + * __get_user_N functions take a pointer in r0, and return 0 in r1 + * on success, with the value in r0; or else -EFAULT in r1. + */ +#define __get_user_N(bytes, LOAD) \ + STD_ENTRY(__get_user_##bytes); \ +1: { LOAD r0, r0; move r1, zero }; \ + jrp lr; \ + STD_ENDPROC(__get_user_##bytes); \ + .pushsection __ex_table,"a"; \ + .quad 1b, get_user_fault; \ + .popsection + +__get_user_N(1, ld1u) +__get_user_N(2, ld2u) +__get_user_N(4, ld4u) +__get_user_N(8, ld) + +/* + * __put_user_N functions take a value in r0 and a pointer in r1, + * and return 0 in r0 on success or -EFAULT on failure. + */ +#define __put_user_N(bytes, STORE) \ + STD_ENTRY(__put_user_##bytes); \ +1: { STORE r1, r0; move r0, zero }; \ + jrp lr; \ + STD_ENDPROC(__put_user_##bytes); \ + .pushsection __ex_table,"a"; \ + .quad 1b, put_user_fault; \ + .popsection + +__put_user_N(1, st1) +__put_user_N(2, st2) +__put_user_N(4, st4) +__put_user_N(8, st) + +/* + * strnlen_user_asm takes the pointer in r0, and the length bound in r1. + * It returns the length, including the terminating NUL, or zero on exception. + * If length is greater than the bound, returns one plus the bound. + */ +STD_ENTRY(strnlen_user_asm) + { beqz r1, 2f; addi r3, r0, -1 } /* bias down to include NUL */ +1: { ld1u r4, r0; addi r1, r1, -1 } + beqz r4, 2f + { bnezt r1, 1b; addi r0, r0, 1 } +2: { sub r0, r0, r3; jrp lr } + STD_ENDPROC(strnlen_user_asm) + .pushsection .fixup,"ax" +strnlen_user_fault: + { move r0, zero; jrp lr } + ENDPROC(strnlen_user_fault) + .section __ex_table,"a" + .quad 1b, strnlen_user_fault + .popsection + +/* + * strncpy_from_user_asm takes the kernel target pointer in r0, + * the userspace source pointer in r1, and the length bound (including + * the trailing NUL) in r2. On success, it returns the string length + * (not including the trailing NUL), or -EFAULT on failure. + */ +STD_ENTRY(strncpy_from_user_asm) + { beqz r2, 2f; move r3, r0 } +1: { ld1u r4, r1; addi r1, r1, 1; addi r2, r2, -1 } + { st1 r0, r4; addi r0, r0, 1 } + beqz r2, 2f + bnezt r4, 1b + addi r0, r0, -1 /* don't count the trailing NUL */ +2: { sub r0, r0, r3; jrp lr } + STD_ENDPROC(strncpy_from_user_asm) + .pushsection .fixup,"ax" +strncpy_from_user_fault: + { movei r0, -EFAULT; jrp lr } + ENDPROC(strncpy_from_user_fault) + .section __ex_table,"a" + .quad 1b, strncpy_from_user_fault + .popsection + +/* + * clear_user_asm takes the user target address in r0 and the + * number of bytes to zero in r1. + * It returns the number of uncopiable bytes (hopefully zero) in r0. + * Note that we don't use a separate .fixup section here since we fall + * through into the "fixup" code as the last straight-line bundle anyway. + */ +STD_ENTRY(clear_user_asm) + { beqz r1, 2f; or r2, r0, r1 } + andi r2, r2, 7 + beqzt r2, .Lclear_aligned_user_asm +1: { st1 r0, zero; addi r0, r0, 1; addi r1, r1, -1 } + bnezt r1, 1b +2: { move r0, r1; jrp lr } + .pushsection __ex_table,"a" + .quad 1b, 2b + .popsection + +.Lclear_aligned_user_asm: +1: { st r0, zero; addi r0, r0, 8; addi r1, r1, -8 } + bnezt r1, 1b +2: { move r0, r1; jrp lr } + STD_ENDPROC(clear_user_asm) + .pushsection __ex_table,"a" + .quad 1b, 2b + .popsection + +/* + * flush_user_asm takes the user target address in r0 and the + * number of bytes to flush in r1. + * It returns the number of unflushable bytes (hopefully zero) in r0. + */ +STD_ENTRY(flush_user_asm) + beqz r1, 2f + { movei r2, L2_CACHE_BYTES; add r1, r0, r1 } + { sub r2, zero, r2; addi r1, r1, L2_CACHE_BYTES-1 } + { and r0, r0, r2; and r1, r1, r2 } + { sub r1, r1, r0 } +1: { flush r0; addi r1, r1, -CHIP_FLUSH_STRIDE() } + { addi r0, r0, CHIP_FLUSH_STRIDE(); bnezt r1, 1b } +2: { move r0, r1; jrp lr } + STD_ENDPROC(flush_user_asm) + .pushsection __ex_table,"a" + .quad 1b, 2b + .popsection + +/* + * inv_user_asm takes the user target address in r0 and the + * number of bytes to invalidate in r1. + * It returns the number of not inv'able bytes (hopefully zero) in r0. + */ +STD_ENTRY(inv_user_asm) + beqz r1, 2f + { movei r2, L2_CACHE_BYTES; add r1, r0, r1 } + { sub r2, zero, r2; addi r1, r1, L2_CACHE_BYTES-1 } + { and r0, r0, r2; and r1, r1, r2 } + { sub r1, r1, r0 } +1: { inv r0; addi r1, r1, -CHIP_INV_STRIDE() } + { addi r0, r0, CHIP_INV_STRIDE(); bnezt r1, 1b } +2: { move r0, r1; jrp lr } + STD_ENDPROC(inv_user_asm) + .pushsection __ex_table,"a" + .quad 1b, 2b + .popsection + +/* + * finv_user_asm takes the user target address in r0 and the + * number of bytes to flush-invalidate in r1. + * It returns the number of not finv'able bytes (hopefully zero) in r0. + */ +STD_ENTRY(finv_user_asm) + beqz r1, 2f + { movei r2, L2_CACHE_BYTES; add r1, r0, r1 } + { sub r2, zero, r2; addi r1, r1, L2_CACHE_BYTES-1 } + { and r0, r0, r2; and r1, r1, r2 } + { sub r1, r1, r0 } +1: { finv r0; addi r1, r1, -CHIP_FINV_STRIDE() } + { addi r0, r0, CHIP_FINV_STRIDE(); bnezt r1, 1b } +2: { move r0, r1; jrp lr } + STD_ENDPROC(finv_user_asm) + .pushsection __ex_table,"a" + .quad 1b, 2b + .popsection |