diff options
Diffstat (limited to 'arch/sh/lib')
-rw-r--r-- | arch/sh/lib/Makefile | 13 | ||||
-rw-r--r-- | arch/sh/lib/checksum.S | 385 | ||||
-rw-r--r-- | arch/sh/lib/delay.c | 41 | ||||
-rw-r--r-- | arch/sh/lib/div64-generic.c | 19 | ||||
-rw-r--r-- | arch/sh/lib/div64.S | 46 | ||||
-rw-r--r-- | arch/sh/lib/memchr.S | 26 | ||||
-rw-r--r-- | arch/sh/lib/memcpy-sh4.S | 800 | ||||
-rw-r--r-- | arch/sh/lib/memcpy.S | 227 | ||||
-rw-r--r-- | arch/sh/lib/memmove.S | 254 | ||||
-rw-r--r-- | arch/sh/lib/memset.S | 57 | ||||
-rw-r--r-- | arch/sh/lib/strcasecmp.c | 26 | ||||
-rw-r--r-- | arch/sh/lib/strlen.S | 70 | ||||
-rw-r--r-- | arch/sh/lib/udivdi3.c | 16 |
13 files changed, 1980 insertions, 0 deletions
diff --git a/arch/sh/lib/Makefile b/arch/sh/lib/Makefile new file mode 100644 index 000000000000..b5681e3f9684 --- /dev/null +++ b/arch/sh/lib/Makefile @@ -0,0 +1,13 @@ +# +# Makefile for SuperH-specific library files.. +# + +lib-y = delay.o memset.o memmove.o memchr.o \ + checksum.o strcasecmp.o strlen.o div64.o udivdi3.o \ + div64-generic.o + +memcpy-y := memcpy.o +memcpy-$(CONFIG_CPU_SH4) := memcpy-sh4.o + +lib-y += $(memcpy-y) + diff --git a/arch/sh/lib/checksum.S b/arch/sh/lib/checksum.S new file mode 100644 index 000000000000..7c50dfe68c07 --- /dev/null +++ b/arch/sh/lib/checksum.S @@ -0,0 +1,385 @@ +/* $Id: checksum.S,v 1.10 2001/07/06 13:11:32 gniibe Exp $ + * + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * IP/TCP/UDP checksumming routines + * + * Authors: Jorge Cwik, <jorge@laser.satlink.net> + * Arnt Gulbrandsen, <agulbra@nvg.unit.no> + * Tom May, <ftom@netcom.com> + * Pentium Pro/II routines: + * Alexander Kjeldaas <astor@guardian.no> + * Finn Arne Gangstad <finnag@guardian.no> + * Lots of code moved from tcp.c and ip.c; see those files + * for more names. + * + * Changes: Ingo Molnar, converted csum_partial_copy() to 2.1 exception + * handling. + * Andi Kleen, add zeroing on error + * converted to pure assembler + * + * SuperH version: Copyright (C) 1999 Niibe Yutaka + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <asm/errno.h> +#include <linux/linkage.h> + +/* + * computes a partial checksum, e.g. for TCP/UDP fragments + */ + +/* + * unsigned int csum_partial(const unsigned char *buf, int len, + * unsigned int sum); + */ + +.text +ENTRY(csum_partial) + /* + * Experiments with Ethernet and SLIP connections show that buff + * is aligned on either a 2-byte or 4-byte boundary. We get at + * least a twofold speedup on 486 and Pentium if it is 4-byte aligned. + * Fortunately, it is easy to convert 2-byte alignment to 4-byte + * alignment for the unrolled loop. + */ + mov r5, r1 + mov r4, r0 + tst #2, r0 ! Check alignment. + bt 2f ! Jump if alignment is ok. + ! + add #-2, r5 ! Alignment uses up two bytes. + cmp/pz r5 ! + bt/s 1f ! Jump if we had at least two bytes. + clrt + bra 6f + add #2, r5 ! r5 was < 2. Deal with it. +1: + mov r5, r1 ! Save new len for later use. + mov.w @r4+, r0 + extu.w r0, r0 + addc r0, r6 + bf 2f + add #1, r6 +2: + mov #-5, r0 + shld r0, r5 + tst r5, r5 + bt/s 4f ! if it's =0, go to 4f + clrt + .align 2 +3: + mov.l @r4+, r0 + mov.l @r4+, r2 + mov.l @r4+, r3 + addc r0, r6 + mov.l @r4+, r0 + addc r2, r6 + mov.l @r4+, r2 + addc r3, r6 + mov.l @r4+, r3 + addc r0, r6 + mov.l @r4+, r0 + addc r2, r6 + mov.l @r4+, r2 + addc r3, r6 + addc r0, r6 + addc r2, r6 + movt r0 + dt r5 + bf/s 3b + cmp/eq #1, r0 + ! here, we know r5==0 + addc r5, r6 ! add carry to r6 +4: + mov r1, r0 + and #0x1c, r0 + tst r0, r0 + bt/s 6f + mov r0, r5 + shlr2 r5 + mov #0, r2 +5: + addc r2, r6 + mov.l @r4+, r2 + movt r0 + dt r5 + bf/s 5b + cmp/eq #1, r0 + addc r2, r6 + addc r5, r6 ! r5==0 here, so it means add carry-bit +6: + mov r1, r5 + mov #3, r0 + and r0, r5 + tst r5, r5 + bt 9f ! if it's =0 go to 9f + mov #2, r1 + cmp/hs r1, r5 + bf 7f + mov.w @r4+, r0 + extu.w r0, r0 + cmp/eq r1, r5 + bt/s 8f + clrt + shll16 r0 + addc r0, r6 +7: + mov.b @r4+, r0 + extu.b r0, r0 +#ifndef __LITTLE_ENDIAN__ + shll8 r0 +#endif +8: + addc r0, r6 + mov #0, r0 + addc r0, r6 +9: + rts + mov r6, r0 + +/* +unsigned int csum_partial_copy_generic (const char *src, char *dst, int len, + int sum, int *src_err_ptr, int *dst_err_ptr) + */ + +/* + * Copy from ds while checksumming, otherwise like csum_partial + * + * The macros SRC and DST specify the type of access for the instruction. + * thus we can call a custom exception handler for all access types. + * + * FIXME: could someone double-check whether I haven't mixed up some SRC and + * DST definitions? It's damn hard to trigger all cases. I hope I got + * them all but there's no guarantee. + */ + +#define SRC(...) \ + 9999: __VA_ARGS__ ; \ + .section __ex_table, "a"; \ + .long 9999b, 6001f ; \ + .previous + +#define DST(...) \ + 9999: __VA_ARGS__ ; \ + .section __ex_table, "a"; \ + .long 9999b, 6002f ; \ + .previous + +! +! r4: const char *SRC +! r5: char *DST +! r6: int LEN +! r7: int SUM +! +! on stack: +! int *SRC_ERR_PTR +! int *DST_ERR_PTR +! +ENTRY(csum_partial_copy_generic) + mov.l r5,@-r15 + mov.l r6,@-r15 + + mov #3,r0 ! Check src and dest are equally aligned + mov r4,r1 + and r0,r1 + and r5,r0 + cmp/eq r1,r0 + bf 3f ! Different alignments, use slow version + tst #1,r0 ! Check dest word aligned + bf 3f ! If not, do it the slow way + + mov #2,r0 + tst r0,r5 ! Check dest alignment. + bt 2f ! Jump if alignment is ok. + add #-2,r6 ! Alignment uses up two bytes. + cmp/pz r6 ! Jump if we had at least two bytes. + bt/s 1f + clrt + bra 4f + add #2,r6 ! r6 was < 2. Deal with it. + +3: ! Handle different src and dest alignments. + ! This is not common, so simple byte by byte copy will do. + mov r6,r2 + shlr r6 + tst r6,r6 + bt 4f + clrt + .align 2 +5: +SRC( mov.b @r4+,r1 ) +SRC( mov.b @r4+,r0 ) + extu.b r1,r1 +DST( mov.b r1,@r5 ) +DST( mov.b r0,@(1,r5) ) + extu.b r0,r0 + add #2,r5 + +#ifdef __LITTLE_ENDIAN__ + shll8 r0 +#else + shll8 r1 +#endif + or r1,r0 + + addc r0,r7 + movt r0 + dt r6 + bf/s 5b + cmp/eq #1,r0 + mov #0,r0 + addc r0, r7 + + mov r2, r0 + tst #1, r0 + bt 7f + bra 5f + clrt + + ! src and dest equally aligned, but to a two byte boundary. + ! Handle first two bytes as a special case + .align 2 +1: +SRC( mov.w @r4+,r0 ) +DST( mov.w r0,@r5 ) + add #2,r5 + extu.w r0,r0 + addc r0,r7 + mov #0,r0 + addc r0,r7 +2: + mov r6,r2 + mov #-5,r0 + shld r0,r6 + tst r6,r6 + bt/s 2f + clrt + .align 2 +1: +SRC( mov.l @r4+,r0 ) +SRC( mov.l @r4+,r1 ) + addc r0,r7 +DST( mov.l r0,@r5 ) +DST( mov.l r1,@(4,r5) ) + addc r1,r7 + +SRC( mov.l @r4+,r0 ) +SRC( mov.l @r4+,r1 ) + addc r0,r7 +DST( mov.l r0,@(8,r5) ) +DST( mov.l r1,@(12,r5) ) + addc r1,r7 + +SRC( mov.l @r4+,r0 ) +SRC( mov.l @r4+,r1 ) + addc r0,r7 +DST( mov.l r0,@(16,r5) ) +DST( mov.l r1,@(20,r5) ) + addc r1,r7 + +SRC( mov.l @r4+,r0 ) +SRC( mov.l @r4+,r1 ) + addc r0,r7 +DST( mov.l r0,@(24,r5) ) +DST( mov.l r1,@(28,r5) ) + addc r1,r7 + add #32,r5 + movt r0 + dt r6 + bf/s 1b + cmp/eq #1,r0 + mov #0,r0 + addc r0,r7 + +2: mov r2,r6 + mov #0x1c,r0 + and r0,r6 + cmp/pl r6 + bf/s 4f + clrt + shlr2 r6 +3: +SRC( mov.l @r4+,r0 ) + addc r0,r7 +DST( mov.l r0,@r5 ) + add #4,r5 + movt r0 + dt r6 + bf/s 3b + cmp/eq #1,r0 + mov #0,r0 + addc r0,r7 +4: mov r2,r6 + mov #3,r0 + and r0,r6 + cmp/pl r6 + bf 7f + mov #2,r1 + cmp/hs r1,r6 + bf 5f +SRC( mov.w @r4+,r0 ) +DST( mov.w r0,@r5 ) + extu.w r0,r0 + add #2,r5 + cmp/eq r1,r6 + bt/s 6f + clrt + shll16 r0 + addc r0,r7 +5: +SRC( mov.b @r4+,r0 ) +DST( mov.b r0,@r5 ) + extu.b r0,r0 +#ifndef __LITTLE_ENDIAN__ + shll8 r0 +#endif +6: addc r0,r7 + mov #0,r0 + addc r0,r7 +7: +5000: + +# Exception handler: +.section .fixup, "ax" + +6001: + mov.l @(8,r15),r0 ! src_err_ptr + mov #-EFAULT,r1 + mov.l r1,@r0 + + ! zero the complete destination - computing the rest + ! is too much work + mov.l @(4,r15),r5 ! dst + mov.l @r15,r6 ! len + mov #0,r7 +1: mov.b r7,@r5 + dt r6 + bf/s 1b + add #1,r5 + mov.l 8000f,r0 + jmp @r0 + nop + .align 2 +8000: .long 5000b + +6002: + mov.l @(12,r15),r0 ! dst_err_ptr + mov #-EFAULT,r1 + mov.l r1,@r0 + mov.l 8001f,r0 + jmp @r0 + nop + .align 2 +8001: .long 5000b + +.previous + add #8,r15 + rts + mov r7,r0 diff --git a/arch/sh/lib/delay.c b/arch/sh/lib/delay.c new file mode 100644 index 000000000000..50b36037d86b --- /dev/null +++ b/arch/sh/lib/delay.c @@ -0,0 +1,41 @@ +/* + * Precise Delay Loops for SuperH + * + * Copyright (C) 1999 Niibe Yutaka & Kaz Kojima + */ + +#include <linux/sched.h> +#include <linux/delay.h> + +void __delay(unsigned long loops) +{ + __asm__ __volatile__( + "tst %0, %0\n\t" + "1:\t" + "bf/s 1b\n\t" + " dt %0" + : "=r" (loops) + : "0" (loops) + : "t"); +} + +inline void __const_udelay(unsigned long xloops) +{ + __asm__("dmulu.l %0, %2\n\t" + "sts mach, %0" + : "=r" (xloops) + : "0" (xloops), "r" (cpu_data[_smp_processor_id()].loops_per_jiffy) + : "macl", "mach"); + __delay(xloops * HZ); +} + +void __udelay(unsigned long usecs) +{ + __const_udelay(usecs * 0x000010c6); /* 2**32 / 1000000 */ +} + +void __ndelay(unsigned long nsecs) +{ + __const_udelay(nsecs * 0x00000005); +} + diff --git a/arch/sh/lib/div64-generic.c b/arch/sh/lib/div64-generic.c new file mode 100644 index 000000000000..c02473afd581 --- /dev/null +++ b/arch/sh/lib/div64-generic.c @@ -0,0 +1,19 @@ +/* + * Generic __div64_32 wrapper for __xdiv64_32. + */ + +#include <linux/types.h> + +extern u64 __xdiv64_32(u64 n, u32 d); + +u64 __div64_32(u64 *xp, u32 y) +{ + u64 rem; + u64 q = __xdiv64_32(*xp, y); + + rem = *xp - q * y; + *xp = q; + + return rem; +} + diff --git a/arch/sh/lib/div64.S b/arch/sh/lib/div64.S new file mode 100644 index 000000000000..eefc275d64a7 --- /dev/null +++ b/arch/sh/lib/div64.S @@ -0,0 +1,46 @@ +/* + * unsigned long long __xdiv64_32(unsigned long long n, unsigned long d); + */ + +#include <linux/linkage.h> + +.text +ENTRY(__xdiv64_32) +#ifdef __LITTLE_ENDIAN__ + mov r4, r0 + mov r5, r1 +#else + mov r4, r1 + mov r5, r0 +#endif + cmp/hs r6, r1 + bf.s 1f + mov #0, r2 + + mov r1, r2 + mov #0, r3 + div0u + .rept 32 + rotcl r2 + div1 r6, r3 + .endr + rotcl r2 + mul.l r6, r2 + sts macl, r3 + sub r3, r1 +1: + div0u + .rept 32 + rotcl r0 + div1 r6, r1 + .endr +#ifdef __LITTLE_ENDIAN__ + mov r2, r1 + rts + rotcl r0 +#else + rotcl r0 + mov r0, r1 + rts + mov r2, r0 +#endif diff --git a/arch/sh/lib/memchr.S b/arch/sh/lib/memchr.S new file mode 100644 index 000000000000..bc6036ad5706 --- /dev/null +++ b/arch/sh/lib/memchr.S @@ -0,0 +1,26 @@ +/* $Id: memchr.S,v 1.1 2000/04/14 16:49:01 mjd Exp $ + * + * "memchr" implementation of SuperH + * + * Copyright (C) 1999 Niibe Yutaka + * + */ + +/* + * void *memchr(const void *s, int c, size_t n); + */ + +#include <linux/linkage.h> +ENTRY(memchr) + tst r6,r6 + bt/s 2f + exts.b r5,r5 +1: mov.b @r4,r1 + cmp/eq r1,r5 + bt/s 3f + dt r6 + bf/s 1b + add #1,r4 +2: mov #0,r4 +3: rts + mov r4,r0 diff --git a/arch/sh/lib/memcpy-sh4.S b/arch/sh/lib/memcpy-sh4.S new file mode 100644 index 000000000000..55f227441f9e --- /dev/null +++ b/arch/sh/lib/memcpy-sh4.S @@ -0,0 +1,800 @@ +/* + * "memcpy" implementation of SuperH + * + * Copyright (C) 1999 Niibe Yutaka + * Copyright (c) 2002 STMicroelectronics Ltd + * Modified from memcpy.S and micro-optimised for SH4 + * Stuart Menefy (stuart.menefy@st.com) + * + */ +#include <linux/linkage.h> +#include <linux/config.h> + +/* + * void *memcpy(void *dst, const void *src, size_t n); + * + * It is assumed that there is no overlap between src and dst. + * If there is an overlap, then the results are undefined. + */ + + ! + ! GHIJ KLMN OPQR --> ...G HIJK LMNO PQR. + ! + + ! Size is 16 or greater, and may have trailing bytes + + .balign 32 +.Lcase1: + ! Read a long word and write a long word at once + ! At the start of each iteration, r7 contains last long load + add #-1,r5 ! 79 EX + mov r4,r2 ! 5 MT (0 cycles latency) + + mov.l @(r0,r5),r7 ! 21 LS (2 cycles latency) + add #-4,r5 ! 50 EX + + add #7,r2 ! 79 EX + ! +#ifdef CONFIG_CPU_LITTLE_ENDIAN + ! 6 cycles, 4 bytes per iteration +3: mov.l @(r0,r5),r1 ! 21 LS (latency=2) ! NMLK + mov r7, r3 ! 5 MT (latency=0) ! RQPO + + cmp/hi r2,r0 ! 57 MT + shll16 r3 ! 103 EX + + mov r1,r6 ! 5 MT (latency=0) + shll8 r3 ! 102 EX ! Oxxx + + shlr8 r6 ! 106 EX ! xNML + mov r1, r7 ! 5 MT (latency=0) + + or r6,r3 ! 82 EX ! ONML + bt/s 3b ! 109 BR + + mov.l r3,@-r0 ! 30 LS +#else +3: mov.l @(r0,r5),r1 ! 21 LS (latency=2) ! KLMN + mov r7,r3 ! 5 MT (latency=0) ! OPQR + + cmp/hi r2,r0 ! 57 MT + shlr16 r3 ! 107 EX + + shlr8 r3 ! 106 EX ! xxxO + mov r1,r6 ! 5 MT (latency=0) + + shll8 r6 ! 102 EX ! LMNx + mov r1,r7 ! 5 MT (latency=0) + + or r6,r3 ! 82 EX ! LMNO + bt/s 3b ! 109 BR + + mov.l r3,@-r0 ! 30 LS +#endif + ! Finally, copy a byte at once, if necessary + + add #4,r5 ! 50 EX + cmp/eq r4,r0 ! 54 MT + + add #-6,r2 ! 50 EX + bt 9f ! 109 BR + +8: cmp/hi r2,r0 ! 57 MT + mov.b @(r0,r5),r1 ! 20 LS (latency=2) + + bt/s 8b ! 109 BR + + mov.b r1,@-r0 ! 29 LS + +9: rts + nop + + + ! + ! GHIJ KLMN OPQR --> .GHI JKLM NOPQ R... + ! + + ! Size is 16 or greater, and may have trailing bytes + + .balign 32 +.Lcase3: + ! Read a long word and write a long word at once + ! At the start of each iteration, r7 contains last long load + add #-3,r5 ! 79 EX + mov r4,r2 ! 5 MT (0 cycles latency) + + mov.l @(r0,r5),r7 ! 21 LS (2 cycles latency) + add #-4,r5 ! 50 EX + + add #7,r2 ! 79 EX + ! +#ifdef CONFIG_CPU_LITTLE_ENDIAN + ! 6 cycles, 4 bytes per iteration +3: mov.l @(r0,r5),r1 ! 21 LS (latency=2) ! NMLK + mov r7, r3 ! 5 MT (latency=0) ! RQPO + + cmp/hi r2,r0 ! 57 MT + shll8 r3 ! 102 EX ! QPOx + + mov r1,r6 ! 5 MT (latency=0) + shlr16 r6 ! 107 EX + + shlr8 r6 ! 106 EX ! xxxN + mov r1, r7 ! 5 MT (latency=0) + + or r6,r3 ! 82 EX ! QPON + bt/s 3b ! 109 BR + + mov.l r3,@-r0 ! 30 LS +#else +3: mov r1,r3 ! OPQR + shlr8 r3 ! xOPQ + mov.l @(r0,r5),r1 ! KLMN + mov r1,r6 + shll16 r6 + shll8 r6 ! Nxxx + or r6,r3 ! NOPQ + cmp/hi r2,r0 + bt/s 3b + mov.l r3,@-r0 +#endif + + ! Finally, copy a byte at once, if necessary + + add #6,r5 ! 50 EX + cmp/eq r4,r0 ! 54 MT + + add #-6,r2 ! 50 EX + bt 9f ! 109 BR + +8: cmp/hi r2,r0 ! 57 MT + mov.b @(r0,r5),r1 ! 20 LS (latency=2) + + bt/s 8b ! 109 BR + + mov.b r1,@-r0 ! 29 LS + +9: rts + nop + +ENTRY(memcpy) + + ! Calculate the invariants which will be used in the remainder + ! of the code: + ! + ! r4 --> [ ... ] DST [ ... ] SRC + ! [ ... ] [ ... ] + ! : : + ! r0 --> [ ... ] r0+r5 --> [ ... ] + ! + ! + + ! Short circuit the common case of src, dst and len being 32 bit aligned + ! and test for zero length move + + mov r6, r0 ! 5 MT (0 cycle latency) + or r4, r0 ! 82 EX + + or r5, r0 ! 82 EX + tst r6, r6 ! 86 MT + + bt/s 99f ! 111 BR (zero len) + tst #3, r0 ! 87 MT + + mov r4, r0 ! 5 MT (0 cycle latency) + add r6, r0 ! 49 EX + + mov #16, r1 ! 6 EX + bt/s .Lcase00 ! 111 BR (aligned) + + sub r4, r5 ! 75 EX + + ! Arguments are not nicely long word aligned or zero len. + ! Check for small copies, and if so do a simple byte at a time copy. + ! + ! Deciding on an exact value of 'small' is not easy, as the point at which + ! using the optimised routines become worthwhile varies (these are the + ! cycle counts for differnet sizes using byte-at-a-time vs. optimised): + ! size byte-at-time long word byte + ! 16 42 39-40 46-50 50-55 + ! 24 58 43-44 54-58 62-67 + ! 36 82 49-50 66-70 80-85 + ! However the penalty for getting it 'wrong' is much higher for long word + ! aligned data (and this is more common), so use a value of 16. + + cmp/gt r6,r1 ! 56 MT + + add #-1,r5 ! 50 EX + bf/s 6f ! 108 BR (not small) + + mov r5, r3 ! 5 MT (latency=0) + shlr r6 ! 104 EX + + mov.b @(r0,r5),r1 ! 20 LS (latency=2) + bf/s 4f ! 111 BR + + add #-1,r3 ! 50 EX + tst r6, r6 ! 86 MT + + bt/s 98f ! 110 BR + mov.b r1,@-r0 ! 29 LS + + ! 4 cycles, 2 bytes per iteration +3: mov.b @(r0,r5),r1 ! 20 LS (latency=2) + +4: mov.b @(r0,r3),r2 ! 20 LS (latency=2) + dt r6 ! 67 EX + + mov.b r1,@-r0 ! 29 LS + bf/s 3b ! 111 BR + + mov.b r2,@-r0 ! 29 LS +98: + rts + nop + +99: rts + mov r4, r0 + + ! Size is not small, so its worthwhile looking for optimisations. + ! First align destination to a long word boundary. + ! + ! r5 = normal value -1 + +6: tst #3, r0 ! 87 MT + mov #3, r3 ! 6 EX + + bt/s 2f ! 111 BR + and r0,r3 ! 78 EX + + ! 3 cycles, 1 byte per iteration +1: dt r3 ! 67 EX + mov.b @(r0,r5),r1 ! 19 LS (latency=2) + + add #-1, r6 ! 79 EX + bf/s 1b ! 109 BR + + mov.b r1,@-r0 ! 28 LS + +2: add #1, r5 ! 79 EX + + ! Now select the appropriate bulk transfer code based on relative + ! alignment of src and dst. + + mov r0, r3 ! 5 MT (latency=0) + + mov r5, r0 ! 5 MT (latency=0) + tst #1, r0 ! 87 MT + + bf/s 1f ! 111 BR + mov #64, r7 ! 6 EX + + ! bit 0 clear + + cmp/ge r7, r6 ! 55 MT + + bt/s 2f ! 111 BR + tst #2, r0 ! 87 MT + + ! small + bt/s .Lcase0 + mov r3, r0 + + bra .Lcase2 + nop + + ! big +2: bt/s .Lcase0b + mov r3, r0 + + bra .Lcase2b + nop + + ! bit 0 set +1: tst #2, r0 ! 87 MT + + bt/s .Lcase1 + mov r3, r0 + + bra .Lcase3 + nop + + + ! + ! GHIJ KLMN OPQR --> GHIJ KLMN OPQR + ! + + ! src, dst and size are all long word aligned + ! size is non-zero + + .balign 32 +.Lcase00: + mov #64, r1 ! 6 EX + mov r5, r3 ! 5 MT (latency=0) + + cmp/gt r6, r1 ! 56 MT + add #-4, r5 ! 50 EX + + bf .Lcase00b ! 108 BR (big loop) + shlr2 r6 ! 105 EX + + shlr r6 ! 104 EX + mov.l @(r0, r5), r1 ! 21 LS (latency=2) + + bf/s 4f ! 111 BR + add #-8, r3 ! 50 EX + + tst r6, r6 ! 86 MT + bt/s 5f ! 110 BR + + mov.l r1,@-r0 ! 30 LS + + ! 4 cycles, 2 long words per iteration +3: mov.l @(r0, r5), r1 ! 21 LS (latency=2) + +4: mov.l @(r0, r3), r2 ! 21 LS (latency=2) + dt r6 ! 67 EX + + mov.l r1, @-r0 ! 30 LS + bf/s 3b ! 109 BR + + mov.l r2, @-r0 ! 30 LS + +5: rts + nop + + + ! Size is 16 or greater and less than 64, but may have trailing bytes + + .balign 32 +.Lcase0: + add #-4, r5 ! 50 EX + mov r4, r7 ! 5 MT (latency=0) + + mov.l @(r0, r5), r1 ! 21 LS (latency=2) + mov #4, r2 ! 6 EX + + add #11, r7 ! 50 EX + tst r2, r6 ! 86 MT + + mov r5, r3 ! 5 MT (latency=0) + bt/s 4f ! 111 BR + + add #-4, r3 ! 50 EX + mov.l r1,@-r0 ! 30 LS + + ! 4 cycles, 2 long words per iteration +3: mov.l @(r0, r5), r1 ! 21 LS (latency=2) + +4: mov.l @(r0, r3), r2 ! 21 LS (latency=2) + cmp/hi r7, r0 + + mov.l r1, @-r0 ! 30 LS + bt/s 3b ! 109 BR + + mov.l r2, @-r0 ! 30 LS + + ! Copy the final 0-3 bytes + + add #3,r5 ! 50 EX + + cmp/eq r0, r4 ! 54 MT + add #-10, r7 ! 50 EX + + bt 9f ! 110 BR + + ! 3 cycles, 1 byte per iteration +1: mov.b @(r0,r5),r1 ! 19 LS + cmp/hi r7,r0 ! 57 MT + + bt/s 1b ! 111 BR + mov.b r1,@-r0 ! 28 LS + +9: rts + nop + + ! Size is at least 64 bytes, so will be going round the big loop at least once. + ! + ! r2 = rounded up r4 + ! r3 = rounded down r0 + + .balign 32 +.Lcase0b: + add #-4, r5 ! 50 EX + +.Lcase00b: + mov r0, r3 ! 5 MT (latency=0) + mov #(~0x1f), r1 ! 6 EX + + and r1, r3 ! 78 EX + mov r4, r2 ! 5 MT (latency=0) + + cmp/eq r3, r0 ! 54 MT + add #0x1f, r2 ! 50 EX + + bt/s 1f ! 110 BR + and r1, r2 ! 78 EX + + ! copy initial words until cache line aligned + + mov.l @(r0, r5), r1 ! 21 LS (latency=2) + tst #4, r0 ! 87 MT + + mov r5, r6 ! 5 MT (latency=0) + add #-4, r6 ! 50 EX + + bt/s 4f ! 111 BR + add #8, r3 ! 50 EX + + tst #0x18, r0 ! 87 MT + + bt/s 1f ! 109 BR + mov.l r1,@-r0 ! 30 LS + + ! 4 cycles, 2 long words per iteration +3: mov.l @(r0, r5), r1 ! 21 LS (latency=2) + +4: mov.l @(r0, r6), r7 ! 21 LS (latency=2) + cmp/eq r3, r0 ! 54 MT + + mov.l r1, @-r0 ! 30 LS + bf/s 3b ! 109 BR + + mov.l r7, @-r0 ! 30 LS + + ! Copy the cache line aligned blocks + ! + ! In use: r0, r2, r4, r5 + ! Scratch: r1, r3, r6, r7 + ! + ! We could do this with the four scratch registers, but if src + ! and dest hit the same cache line, this will thrash, so make + ! use of additional registers. + ! + ! We also need r0 as a temporary (for movca), so 'undo' the invariant: + ! r5: src (was r0+r5) + ! r1: dest (was r0) + ! this can be reversed at the end, so we don't need to save any extra + ! state. + ! +1: mov.l r8, @-r15 ! 30 LS + add r0, r5 ! 49 EX + + mov.l r9, @-r15 ! 30 LS + mov r0, r1 ! 5 MT (latency=0) + + mov.l r10, @-r15 ! 30 LS + add #-0x1c, r5 ! 50 EX + + mov.l r11, @-r15 ! 30 LS + + ! 16 cycles, 32 bytes per iteration +2: mov.l @(0x00,r5),r0 ! 18 LS (latency=2) + add #-0x20, r1 ! 50 EX + mov.l @(0x04,r5),r3 ! 18 LS (latency=2) + mov.l @(0x08,r5),r6 ! 18 LS (latency=2) + mov.l @(0x0c,r5),r7 ! 18 LS (latency=2) + mov.l @(0x10,r5),r8 ! 18 LS (latency=2) + mov.l @(0x14,r5),r9 ! 18 LS (latency=2) + mov.l @(0x18,r5),r10 ! 18 LS (latency=2) + mov.l @(0x1c,r5),r11 ! 18 LS (latency=2) + movca.l r0,@r1 ! 40 LS (latency=3-7) + mov.l r3,@(0x04,r1) ! 33 LS + mov.l r6,@(0x08,r1) ! 33 LS + mov.l r7,@(0x0c,r1) ! 33 LS + + mov.l r8,@(0x10,r1) ! 33 LS + add #-0x20, r5 ! 50 EX + + mov.l r9,@(0x14,r1) ! 33 LS + cmp/eq r2,r1 ! 54 MT + + mov.l r10,@(0x18,r1) ! 33 LS + bf/s 2b ! 109 BR + + mov.l r11,@(0x1c,r1) ! 33 LS + + mov r1, r0 ! 5 MT (latency=0) + + mov.l @r15+, r11 ! 15 LS + sub r1, r5 ! 75 EX + + mov.l @r15+, r10 ! 15 LS + cmp/eq r4, r0 ! 54 MT + + bf/s 1f ! 109 BR + mov.l @r15+, r9 ! 15 LS + + rts +1: mov.l @r15+, r8 ! 15 LS + sub r4, r1 ! 75 EX (len remaining) + + ! number of trailing bytes is non-zero + ! + ! invariants restored (r5 already decremented by 4) + ! also r1=num bytes remaining + + mov #4, r2 ! 6 EX + mov r4, r7 ! 5 MT (latency=0) + + add #0x1c, r5 ! 50 EX (back to -4) + cmp/hs r2, r1 ! 58 MT + + bf/s 5f ! 108 BR + add #11, r7 ! 50 EX + + mov.l @(r0, r5), r6 ! 21 LS (latency=2) + tst r2, r1 ! 86 MT + + mov r5, r3 ! 5 MT (latency=0) + bt/s 4f ! 111 BR + + add #-4, r3 ! 50 EX + cmp/hs r2, r1 ! 58 MT + + bt/s 5f ! 111 BR + mov.l r6,@-r0 ! 30 LS + + ! 4 cycles, 2 long words per iteration +3: mov.l @(r0, r5), r6 ! 21 LS (latency=2) + +4: mov.l @(r0, r3), r2 ! 21 LS (latency=2) + cmp/hi r7, r0 + + mov.l r6, @-r0 ! 30 LS + bt/s 3b ! 109 BR + + mov.l r2, @-r0 ! 30 LS + + ! Copy the final 0-3 bytes + +5: cmp/eq r0, r4 ! 54 MT + add #-10, r7 ! 50 EX + + bt 9f ! 110 BR + add #3,r5 ! 50 EX + + ! 3 cycles, 1 byte per iteration +1: mov.b @(r0,r5),r1 ! 19 LS + cmp/hi r7,r0 ! 57 MT + + bt/s 1b ! 111 BR + mov.b r1,@-r0 ! 28 LS + +9: rts + nop + + ! + ! GHIJ KLMN OPQR --> ..GH IJKL MNOP QR.. + ! + + .balign 32 +.Lcase2: + ! Size is 16 or greater and less then 64, but may have trailing bytes + +2: mov r5, r6 ! 5 MT (latency=0) + add #-2,r5 ! 50 EX + + mov r4,r2 ! 5 MT (latency=0) + add #-4,r6 ! 50 EX + + add #7,r2 ! 50 EX +3: mov.w @(r0,r5),r1 ! 20 LS (latency=2) + + mov.w @(r0,r6),r3 ! 20 LS (latency=2) + cmp/hi r2,r0 ! 57 MT + + mov.w r1,@-r0 ! 29 LS + bt/s 3b ! 111 BR + + mov.w r3,@-r0 ! 29 LS + + bra 10f + nop + + + .balign 32 +.Lcase2b: + ! Size is at least 64 bytes, so will be going round the big loop at least once. + ! + ! r2 = rounded up r4 + ! r3 = rounded down r0 + + mov r0, r3 ! 5 MT (latency=0) + mov #(~0x1f), r1 ! 6 EX + + and r1, r3 ! 78 EX + mov r4, r2 ! 5 MT (latency=0) + + cmp/eq r3, r0 ! 54 MT + add #0x1f, r2 ! 50 EX + + add #-2, r5 ! 50 EX + bt/s 1f ! 110 BR + and r1, r2 ! 78 EX + + ! Copy a short word one at a time until we are cache line aligned + ! Normal values: r0, r2, r3, r4 + ! Unused: r1, r6, r7 + ! Mod: r5 (=r5-2) + ! + add #2, r3 ! 50 EX + +2: mov.w @(r0,r5),r1 ! 20 LS (latency=2) + cmp/eq r3,r0 ! 54 MT + + bf/s 2b ! 111 BR + + mov.w r1,@-r0 ! 29 LS + + ! Copy the cache line aligned blocks + ! + ! In use: r0, r2, r4, r5 (=r5-2) + ! Scratch: r1, r3, r6, r7 + ! + ! We could do this with the four scratch registers, but if src + ! and dest hit the same cache line, this will thrash, so make + ! use of additional registers. + ! + ! We also need r0 as a temporary (for movca), so 'undo' the invariant: + ! r5: src (was r0+r5) + ! r1: dest (was r0) + ! this can be reversed at the end, so we don't need to save any extra + ! state. + ! +1: mov.l r8, @-r15 ! 30 LS + add r0, r5 ! 49 EX + + mov.l r9, @-r15 ! 30 LS + mov r0, r1 ! 5 MT (latency=0) + + mov.l r10, @-r15 ! 30 LS + add #-0x1e, r5 ! 50 EX + + mov.l r11, @-r15 ! 30 LS + + mov.l r12, @-r15 ! 30 LS + + ! 17 cycles, 32 bytes per iteration +#ifdef CONFIG_CPU_LITTLE_ENDIAN +2: mov.w @r5+, r0 ! 14 LS (latency=2) ..JI + add #-0x20, r1 ! 50 EX + + mov.l @r5+, r3 ! 15 LS (latency=2) NMLK + + mov.l @r5+, r6 ! 15 LS (latency=2) RQPO + shll16 r0 ! 103 EX JI.. + + mov.l @r5+, r7 ! 15 LS (latency=2) + xtrct r3, r0 ! 48 EX LKJI + + mov.l @r5+, r8 ! 15 LS (latency=2) + xtrct r6, r3 ! 48 EX PONM + + mov.l @r5+, r9 ! 15 LS (latency=2) + xtrct r7, r6 ! 48 EX + + mov.l @r5+, r10 ! 15 LS (latency=2) + xtrct r8, r7 ! 48 EX + + mov.l @r5+, r11 ! 15 LS (latency=2) + xtrct r9, r8 ! 48 EX + + mov.w @r5+, r12 ! 15 LS (latency=2) + xtrct r10, r9 ! 48 EX + + movca.l r0,@r1 ! 40 LS (latency=3-7) + xtrct r11, r10 ! 48 EX + + mov.l r3, @(0x04,r1) ! 33 LS + xtrct r12, r11 ! 48 EX + + mov.l r6, @(0x08,r1) ! 33 LS + + mov.l r7, @(0x0c,r1) ! 33 LS + + mov.l r8, @(0x10,r1) ! 33 LS + add #-0x40, r5 ! 50 EX + + mov.l r9, @(0x14,r1) ! 33 LS + cmp/eq r2,r1 ! 54 MT + + mov.l r10, @(0x18,r1) ! 33 LS + bf/s 2b ! 109 BR + + mov.l r11, @(0x1c,r1) ! 33 LS +#else +2: mov.w @(0x1e,r5), r0 ! 17 LS (latency=2) + add #-2, r5 ! 50 EX + + mov.l @(0x1c,r5), r3 ! 18 LS (latency=2) + add #-4, r1 ! 50 EX + + mov.l @(0x18,r5), r6 ! 18 LS (latency=2) + shll16 r0 ! 103 EX + + mov.l @(0x14,r5), r7 ! 18 LS (latency=2) + xtrct r3, r0 ! 48 EX + + mov.l @(0x10,r5), r8 ! 18 LS (latency=2) + xtrct r6, r3 ! 48 EX + + mov.l @(0x0c,r5), r9 ! 18 LS (latency=2) + xtrct r7, r6 ! 48 EX + + mov.l @(0x08,r5), r10 ! 18 LS (latency=2) + xtrct r8, r7 ! 48 EX + + mov.l @(0x04,r5), r11 ! 18 LS (latency=2) + xtrct r9, r8 ! 48 EX + + mov.w @(0x02,r5), r12 ! 18 LS (latency=2) + xtrct r10, r9 ! 48 EX + + movca.l r0,@r1 ! 40 LS (latency=3-7) + add #-0x1c, r1 ! 50 EX + + mov.l r3, @(0x1c,r1) ! 33 LS + xtrct r11, r10 ! 48 EX + + mov.l r6, @(0x18,r1) ! 33 LS + xtrct r12, r11 ! 48 EX + + mov.l r7, @(0x14,r1) ! 33 LS + + mov.l r8, @(0x10,r1) ! 33 LS + add #-0x3e, r5 ! 50 EX + + mov.l r9, @(0x0c,r1) ! 33 LS + cmp/eq r2,r1 ! 54 MT + + mov.l r10, @(0x08,r1) ! 33 LS + bf/s 2b ! 109 BR + + mov.l r11, @(0x04,r1) ! 33 LS +#endif + + mov.l @r15+, r12 + mov r1, r0 ! 5 MT (latency=0) + + mov.l @r15+, r11 ! 15 LS + sub r1, r5 ! 75 EX + + mov.l @r15+, r10 ! 15 LS + cmp/eq r4, r0 ! 54 MT + + bf/s 1f ! 109 BR + mov.l @r15+, r9 ! 15 LS + + rts +1: mov.l @r15+, r8 ! 15 LS + + add #0x1e, r5 ! 50 EX + + ! Finish off a short word at a time + ! r5 must be invariant - 2 +10: mov r4,r2 ! 5 MT (latency=0) + add #1,r2 ! 50 EX + + cmp/hi r2, r0 ! 57 MT + bf/s 1f ! 109 BR + + add #2, r2 ! 50 EX + +3: mov.w @(r0,r5),r1 ! 20 LS + cmp/hi r2,r0 ! 57 MT + + bt/s 3b ! 109 BR + + mov.w r1,@-r0 ! 29 LS +1: + + ! + ! Finally, copy the last byte if necessary + cmp/eq r4,r0 ! 54 MT + bt/s 9b + add #1,r5 + mov.b @(r0,r5),r1 + rts + mov.b r1,@-r0 + diff --git a/arch/sh/lib/memcpy.S b/arch/sh/lib/memcpy.S new file mode 100644 index 000000000000..232fab34c261 --- /dev/null +++ b/arch/sh/lib/memcpy.S @@ -0,0 +1,227 @@ +/* $Id: memcpy.S,v 1.3 2001/07/27 11:50:52 gniibe Exp $ + * + * "memcpy" implementation of SuperH + * + * Copyright (C) 1999 Niibe Yutaka + * + */ + +/* + * void *memcpy(void *dst, const void *src, size_t n); + * No overlap between the memory of DST and of SRC are assumed. + */ + +#include <linux/linkage.h> +ENTRY(memcpy) + tst r6,r6 + bt/s 9f ! if n=0, do nothing + mov r4,r0 + sub r4,r5 ! From here, r5 has the distance to r0 + add r6,r0 ! From here, r0 points the end of copying point + mov #12,r1 + cmp/gt r6,r1 + bt/s 7f ! if it's too small, copy a byte at once + add #-1,r5 + add #1,r5 + ! From here, r6 is free + ! + ! r4 --> [ ... ] DST [ ... ] SRC + ! [ ... ] [ ... ] + ! : : + ! r0 --> [ ... ] r0+r5 --> [ ... ] + ! + ! + mov r5,r1 + mov #3,r2 + and r2,r1 + shll2 r1 + mov r0,r3 ! Save the value on R0 to R3 + mova jmptable,r0 + add r1,r0 + mov.l @r0,r1 + jmp @r1 + mov r3,r0 ! and back to R0 + .balign 4 +jmptable: + .long case0 + .long case1 + .long case2 + .long case3 + + ! copy a byte at once +7: mov r4,r2 + add #1,r2 +8: + cmp/hi r2,r0 + mov.b @(r0,r5),r1 + bt/s 8b ! while (r0>r2) + mov.b r1,@-r0 +9: + rts + nop + +case0: + ! + ! GHIJ KLMN OPQR --> GHIJ KLMN OPQR + ! + ! First, align to long word boundary + mov r0,r3 + and r2,r3 + tst r3,r3 + bt/s 2f + add #-4,r5 + add #3,r5 +1: dt r3 + mov.b @(r0,r5),r1 + bf/s 1b + mov.b r1,@-r0 + ! + add #-3,r5 +2: ! Second, copy a long word at once + mov r4,r2 + add #7,r2 +3: mov.l @(r0,r5),r1 + cmp/hi r2,r0 + bt/s 3b + mov.l r1,@-r0 + ! + ! Third, copy a byte at once, if necessary + cmp/eq r4,r0 + bt/s 9b + add #3,r5 + bra 8b + add #-6,r2 + +case1: + ! + ! GHIJ KLMN OPQR --> ...G HIJK LMNO PQR. + ! + ! First, align to long word boundary + mov r0,r3 + and r2,r3 + tst r3,r3 + bt/s 2f + add #-1,r5 +1: dt r3 + mov.b @(r0,r5),r1 + bf/s 1b + mov.b r1,@-r0 + ! +2: ! Second, read a long word and write a long word at once + mov.l @(r0,r5),r1 + add #-4,r5 + mov r4,r2 + add #7,r2 + ! +#ifdef __LITTLE_ENDIAN__ +3: mov r1,r3 ! RQPO + shll16 r3 + shll8 r3 ! Oxxx + mov.l @(r0,r5),r1 ! NMLK + mov r1,r6 + shlr8 r6 ! xNML + or r6,r3 ! ONML + cmp/hi r2,r0 + bt/s 3b + mov.l r3,@-r0 +#else +3: mov r1,r3 ! OPQR + shlr16 r3 + shlr8 r3 ! xxxO + mov.l @(r0,r5),r1 ! KLMN + mov r1,r6 + shll8 r6 ! LMNx + or r6,r3 ! LMNO + cmp/hi r2,r0 + bt/s 3b + mov.l r3,@-r0 +#endif + ! + ! Third, copy a byte at once, if necessary + cmp/eq r4,r0 + bt/s 9b + add #4,r5 + bra 8b + add #-6,r2 + +case2: + ! + ! GHIJ KLMN OPQR --> ..GH IJKL MNOP QR.. + ! + ! First, align to word boundary + tst #1,r0 + bt/s 2f + add #-1,r5 + mov.b @(r0,r5),r1 + mov.b r1,@-r0 + ! +2: ! Second, read a word and write a word at once + add #-1,r5 + mov r4,r2 + add #3,r2 + ! +3: mov.w @(r0,r5),r1 + cmp/hi r2,r0 + bt/s 3b + mov.w r1,@-r0 + ! + ! Third, copy a byte at once, if necessary + cmp/eq r4,r0 + bt/s 9b + add #1,r5 + mov.b @(r0,r5),r1 + rts + mov.b r1,@-r0 + +case3: + ! + ! GHIJ KLMN OPQR --> .GHI JKLM NOPQ R... + ! + ! First, align to long word boundary + mov r0,r3 + and r2,r3 + tst r3,r3 + bt/s 2f + add #-1,r5 +1: dt r3 + mov.b @(r0,r5),r1 + bf/s 1b + mov.b r1,@-r0 + ! +2: ! Second, read a long word and write a long word at once + add #-2,r5 + mov.l @(r0,r5),r1 + add #-4,r5 + mov r4,r2 + add #7,r2 + ! +#ifdef __LITTLE_ENDIAN__ +3: mov r1,r3 ! RQPO + shll8 r3 ! QPOx + mov.l @(r0,r5),r1 ! NMLK + mov r1,r6 + shlr16 r6 + shlr8 r6 ! xxxN + or r6,r3 ! QPON + cmp/hi r2,r0 + bt/s 3b + mov.l r3,@-r0 +#else +3: mov r1,r3 ! OPQR + shlr8 r3 ! xOPQ + mov.l @(r0,r5),r1 ! KLMN + mov r1,r6 + shll16 r6 + shll8 r6 ! Nxxx + or r6,r3 ! NOPQ + cmp/hi r2,r0 + bt/s 3b + mov.l r3,@-r0 +#endif + ! + ! Third, copy a byte at once, if necessary + cmp/eq r4,r0 + bt/s 9b + add #6,r5 + bra 8b + add #-6,r2 diff --git a/arch/sh/lib/memmove.S b/arch/sh/lib/memmove.S new file mode 100644 index 000000000000..5a2211f09202 --- /dev/null +++ b/arch/sh/lib/memmove.S @@ -0,0 +1,254 @@ +/* $Id: memmove.S,v 1.2 2001/07/27 11:51:09 gniibe Exp $ + * + * "memmove" implementation of SuperH + * + * Copyright (C) 1999 Niibe Yutaka + * + */ + +/* + * void *memmove(void *dst, const void *src, size_t n); + * The memory areas may overlap. + */ + +#include <linux/linkage.h> +ENTRY(memmove) + ! if dest > src, call memcpy (it copies in decreasing order) + cmp/hi r5,r4 + bf 1f + mov.l 2f,r0 + jmp @r0 + nop + .balign 4 +2: .long memcpy +1: + sub r5,r4 ! From here, r4 has the distance to r0 + tst r6,r6 + bt/s 9f ! if n=0, do nothing + mov r5,r0 + add r6,r5 + mov #12,r1 + cmp/gt r6,r1 + bt/s 8f ! if it's too small, copy a byte at once + add #-1,r4 + add #1,r4 + ! + ! [ ... ] DST [ ... ] SRC + ! [ ... ] [ ... ] + ! : : + ! r0+r4--> [ ... ] r0 --> [ ... ] + ! : : + ! [ ... ] [ ... ] + ! r5 --> + ! + mov r4,r1 + mov #3,r2 + and r2,r1 + shll2 r1 + mov r0,r3 ! Save the value on R0 to R3 + mova jmptable,r0 + add r1,r0 + mov.l @r0,r1 + jmp @r1 + mov r3,r0 ! and back to R0 + .balign 4 +jmptable: + .long case0 + .long case1 + .long case2 + .long case3 + + ! copy a byte at once +8: mov.b @r0+,r1 + cmp/hs r5,r0 + bf/s 8b ! while (r0<r5) + mov.b r1,@(r0,r4) + add #1,r4 +9: + add r4,r0 + rts + sub r6,r0 + +case_none: + bra 8b + add #-1,r4 + +case0: + ! + ! GHIJ KLMN OPQR --> GHIJ KLMN OPQR + ! + ! First, align to long word boundary + mov r0,r3 + and r2,r3 + tst r3,r3 + bt/s 2f + add #-1,r4 + mov #4,r2 + sub r3,r2 +1: dt r2 + mov.b @r0+,r1 + bf/s 1b + mov.b r1,@(r0,r4) + ! +2: ! Second, copy a long word at once + add #-3,r4 + add #-3,r5 +3: mov.l @r0+,r1 + cmp/hs r5,r0 + bf/s 3b + mov.l r1,@(r0,r4) + add #3,r5 + ! + ! Third, copy a byte at once, if necessary + cmp/eq r5,r0 + bt/s 9b + add #4,r4 + bra 8b + add #-1,r4 + +case3: + ! + ! GHIJ KLMN OPQR --> ...G HIJK LMNO PQR. + ! + ! First, align to long word boundary + mov r0,r3 + and r2,r3 + tst r3,r3 + bt/s 2f + add #-1,r4 + mov #4,r2 + sub r3,r2 +1: dt r2 + mov.b @r0+,r1 + bf/s 1b + mov.b r1,@(r0,r4) + ! +2: ! Second, read a long word and write a long word at once + add #-2,r4 + mov.l @(r0,r4),r1 + add #-7,r5 + add #-4,r4 + ! +#ifdef __LITTLE_ENDIAN__ + shll8 r1 +3: mov r1,r3 ! JIHG + shlr8 r3 ! xJIH + mov.l @r0+,r1 ! NMLK + mov r1,r2 + shll16 r2 + shll8 r2 ! Kxxx + or r2,r3 ! KJIH + cmp/hs r5,r0 + bf/s 3b + mov.l r3,@(r0,r4) +#else + shlr8 r1 +3: mov r1,r3 ! GHIJ + shll8 r3 ! HIJx + mov.l @r0+,r1 ! KLMN + mov r1,r2 + shlr16 r2 + shlr8 r2 ! xxxK + or r2,r3 ! HIJK + cmp/hs r5,r0 + bf/s 3b + mov.l r3,@(r0,r4) +#endif + add #7,r5 + ! + ! Third, copy a byte at once, if necessary + cmp/eq r5,r0 + bt/s 9b + add #7,r4 + add #-3,r0 + bra 8b + add #-1,r4 + +case2: + ! + ! GHIJ KLMN OPQR --> ..GH IJKL MNOP QR.. + ! + ! First, align to word boundary + tst #1,r0 + bt/s 2f + add #-1,r4 + mov.b @r0+,r1 + mov.b r1,@(r0,r4) + ! +2: ! Second, read a word and write a word at once + add #-1,r4 + add #-1,r5 + ! +3: mov.w @r0+,r1 + cmp/hs r5,r0 + bf/s 3b + mov.w r1,@(r0,r4) + add #1,r5 + ! + ! Third, copy a byte at once, if necessary + cmp/eq r5,r0 + bt/s 9b + add #2,r4 + mov.b @r0,r1 + mov.b r1,@(r0,r4) + bra 9b + add #1,r0 + +case1: + ! + ! GHIJ KLMN OPQR --> .GHI JKLM NOPQ R... + ! + ! First, align to long word boundary + mov r0,r3 + and r2,r3 + tst r3,r3 + bt/s 2f + add #-1,r4 + mov #4,r2 + sub r3,r2 +1: dt r2 + mov.b @r0+,r1 + bf/s 1b + mov.b r1,@(r0,r4) + ! +2: ! Second, read a long word and write a long word at once + mov.l @(r0,r4),r1 + add #-7,r5 + add #-4,r4 + ! +#ifdef __LITTLE_ENDIAN__ + shll16 r1 + shll8 r1 +3: mov r1,r3 ! JIHG + shlr16 r3 + shlr8 r3 ! xxxJ + mov.l @r0+,r1 ! NMLK + mov r1,r2 + shll8 r2 ! MLKx + or r2,r3 ! MLKJ + cmp/hs r5,r0 + bf/s 3b + mov.l r3,@(r0,r4) +#else + shlr16 r1 + shlr8 r1 +3: mov r1,r3 ! GHIJ + shll16 r3 + shll8 r3 ! Jxxx + mov.l @r0+,r1 ! KLMN + mov r1,r2 + shlr8 r2 ! xKLM + or r2,r3 ! JKLM + cmp/hs r5,r0 + bf/s 3b ! while(r0<r5) + mov.l r3,@(r0,r4) +#endif + add #7,r5 + ! + ! Third, copy a byte at once, if necessary + cmp/eq r5,r0 + bt/s 9b + add #5,r4 + add #-3,r0 + bra 8b + add #-1,r4 diff --git a/arch/sh/lib/memset.S b/arch/sh/lib/memset.S new file mode 100644 index 000000000000..95670090680e --- /dev/null +++ b/arch/sh/lib/memset.S @@ -0,0 +1,57 @@ +/* $Id: memset.S,v 1.1 2000/04/14 16:49:01 mjd Exp $ + * + * "memset" implementation of SuperH + * + * Copyright (C) 1999 Niibe Yutaka + * + */ + +/* + * void *memset(void *s, int c, size_t n); + */ + +#include <linux/linkage.h> + +ENTRY(memset) + tst r6,r6 + bt/s 5f ! if n=0, do nothing + add r6,r4 + mov #12,r0 + cmp/gt r6,r0 + bt/s 4f ! if it's too small, set a byte at once + mov r4,r0 + and #3,r0 + cmp/eq #0,r0 + bt/s 2f ! It's aligned + sub r0,r6 +1: + dt r0 + bf/s 1b + mov.b r5,@-r4 +2: ! make VVVV + swap.b r5,r0 ! V0 + or r0,r5 ! VV + swap.w r5,r0 ! VV00 + or r0,r5 ! VVVV + ! + mov r6,r0 + shlr2 r0 + shlr r0 ! r0 = r6 >> 3 +3: + dt r0 + mov.l r5,@-r4 ! set 8-byte at once + bf/s 3b + mov.l r5,@-r4 + ! + mov #7,r0 + and r0,r6 + tst r6,r6 + bt 5f + ! fill bytes +4: + dt r6 + bf/s 4b + mov.b r5,@-r4 +5: + rts + mov r4,r0 diff --git a/arch/sh/lib/strcasecmp.c b/arch/sh/lib/strcasecmp.c new file mode 100644 index 000000000000..4e57a216feaf --- /dev/null +++ b/arch/sh/lib/strcasecmp.c @@ -0,0 +1,26 @@ +/* + * linux/arch/alpha/lib/strcasecmp.c + */ + +#include <linux/string.h> + + +/* We handle nothing here except the C locale. Since this is used in + only one place, on strings known to contain only 7 bit ASCII, this + is ok. */ + +int strcasecmp(const char *a, const char *b) +{ + int ca, cb; + + do { + ca = *a++ & 0xff; + cb = *b++ & 0xff; + if (ca >= 'A' && ca <= 'Z') + ca += 'a' - 'A'; + if (cb >= 'A' && cb <= 'Z') + cb += 'a' - 'A'; + } while (ca == cb && ca != '\0'); + + return ca - cb; +} diff --git a/arch/sh/lib/strlen.S b/arch/sh/lib/strlen.S new file mode 100644 index 000000000000..f8ab296047b3 --- /dev/null +++ b/arch/sh/lib/strlen.S @@ -0,0 +1,70 @@ +/* $Id: strlen.S,v 1.2 2001/06/29 14:07:15 gniibe Exp $ + * + * "strlen" implementation of SuperH + * + * Copyright (C) 1999 Kaz Kojima + * + */ + +/* size_t strlen (const char *s) */ + +#include <linux/linkage.h> +ENTRY(strlen) + mov r4,r0 + and #3,r0 + tst r0,r0 + bt/s 1f + mov #0,r2 + + add #-1,r0 + shll2 r0 + shll r0 + braf r0 + nop + + mov.b @r4+,r1 + tst r1,r1 + bt 8f + add #1,r2 + + mov.b @r4+,r1 + tst r1,r1 + bt 8f + add #1,r2 + + mov.b @r4+,r1 + tst r1,r1 + bt 8f + add #1,r2 + +1: + mov #0,r3 +2: + mov.l @r4+,r1 + cmp/str r3,r1 + bf/s 2b + add #4,r2 + + add #-4,r2 +#ifndef __LITTLE_ENDIAN__ + swap.b r1,r1 + swap.w r1,r1 + swap.b r1,r1 +#endif + extu.b r1,r0 + tst r0,r0 + bt/s 8f + shlr8 r1 + add #1,r2 + extu.b r1,r0 + tst r0,r0 + bt/s 8f + shlr8 r1 + add #1,r2 + extu.b r1,r0 + tst r0,r0 + bt 8f + add #1,r2 +8: + rts + mov r2,r0 diff --git a/arch/sh/lib/udivdi3.c b/arch/sh/lib/udivdi3.c new file mode 100644 index 000000000000..68f038bf3c50 --- /dev/null +++ b/arch/sh/lib/udivdi3.c @@ -0,0 +1,16 @@ +/* + * Simple __udivdi3 function which doesn't use FPU. + */ + +#include <linux/types.h> + +extern u64 __xdiv64_32(u64 n, u32 d); +extern void panic(const char * fmt, ...); + +u64 __udivdi3(u64 n, u64 d) +{ + if (d & ~0xffffffff) + panic("Need true 64-bit/64-bit division"); + return __xdiv64_32(n, (u32)d); +} + |