summaryrefslogtreecommitdiffstats
path: root/arch/x86_64/lib
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@ppc970.osdl.org>2005-04-16 15:20:36 -0700
committerLinus Torvalds <torvalds@ppc970.osdl.org>2005-04-16 15:20:36 -0700
commit1da177e4c3f41524e886b7f1b8a0c1fc7321cac2 (patch)
tree0bba044c4ce775e45a88a51686b5d9f90697ea9d /arch/x86_64/lib
downloadlinux-1da177e4c3f41524e886b7f1b8a0c1fc7321cac2.tar.bz2
Linux-2.6.12-rc2v2.6.12-rc2
Initial git repository build. I'm not bothering with the full history, even though we have it. We can create a separate "historical" git archive of that later if we want to, and in the meantime it's about 3.2GB when imported into git - space that would just make the early git days unnecessarily complicated, when we don't have a lot of good infrastructure for it. Let it rip!
Diffstat (limited to 'arch/x86_64/lib')
-rw-r--r--arch/x86_64/lib/Makefile14
-rw-r--r--arch/x86_64/lib/bitops.c141
-rw-r--r--arch/x86_64/lib/bitstr.c28
-rw-r--r--arch/x86_64/lib/clear_page.S50
-rw-r--r--arch/x86_64/lib/copy_page.S101
-rw-r--r--arch/x86_64/lib/copy_user.S294
-rw-r--r--arch/x86_64/lib/csum-copy.S233
-rw-r--r--arch/x86_64/lib/csum-partial.c150
-rw-r--r--arch/x86_64/lib/csum-wrappers.c129
-rw-r--r--arch/x86_64/lib/dec_and_lock.c40
-rw-r--r--arch/x86_64/lib/delay.c48
-rw-r--r--arch/x86_64/lib/getuser.S101
-rw-r--r--arch/x86_64/lib/io.c23
-rw-r--r--arch/x86_64/lib/memcpy.S121
-rw-r--r--arch/x86_64/lib/memmove.c19
-rw-r--r--arch/x86_64/lib/memset.S125
-rw-r--r--arch/x86_64/lib/putuser.S89
-rw-r--r--arch/x86_64/lib/thunk.S95
-rw-r--r--arch/x86_64/lib/usercopy.c153
19 files changed, 1954 insertions, 0 deletions
diff --git a/arch/x86_64/lib/Makefile b/arch/x86_64/lib/Makefile
new file mode 100644
index 000000000000..6b26a1c1e9ff
--- /dev/null
+++ b/arch/x86_64/lib/Makefile
@@ -0,0 +1,14 @@
+#
+# Makefile for x86_64-specific library files.
+#
+
+CFLAGS_csum-partial.o := -funroll-loops
+
+obj-y := io.o
+
+lib-y := csum-partial.o csum-copy.o csum-wrappers.o delay.o \
+ usercopy.o getuser.o putuser.o \
+ thunk.o clear_page.o copy_page.o bitstr.o bitops.o
+lib-y += memcpy.o memmove.o memset.o copy_user.o
+
+lib-$(CONFIG_HAVE_DEC_LOCK) += dec_and_lock.o
diff --git a/arch/x86_64/lib/bitops.c b/arch/x86_64/lib/bitops.c
new file mode 100644
index 000000000000..a29fb75b33ac
--- /dev/null
+++ b/arch/x86_64/lib/bitops.c
@@ -0,0 +1,141 @@
+#include <linux/bitops.h>
+
+#undef find_first_zero_bit
+#undef find_next_zero_bit
+#undef find_first_bit
+#undef find_next_bit
+
+/**
+ * find_first_zero_bit - find the first zero bit in a memory region
+ * @addr: The address to start the search at
+ * @size: The maximum size to search
+ *
+ * Returns the bit-number of the first zero bit, not the number of the byte
+ * containing a bit.
+ */
+inline long find_first_zero_bit(const unsigned long * addr, unsigned long size)
+{
+ long d0, d1, d2;
+ long res;
+
+ if (!size)
+ return 0;
+ asm volatile(
+ " repe; scasq\n"
+ " je 1f\n"
+ " xorq -8(%%rdi),%%rax\n"
+ " subq $8,%%rdi\n"
+ " bsfq %%rax,%%rdx\n"
+ "1: subq %[addr],%%rdi\n"
+ " shlq $3,%%rdi\n"
+ " addq %%rdi,%%rdx"
+ :"=d" (res), "=&c" (d0), "=&D" (d1), "=&a" (d2)
+ :"0" (0ULL), "1" ((size + 63) >> 6), "2" (addr), "3" (-1ULL),
+ [addr] "r" (addr) : "memory");
+ return res;
+}
+
+/**
+ * find_next_zero_bit - find the first zero bit in a memory region
+ * @addr: The address to base the search on
+ * @offset: The bitnumber to start searching at
+ * @size: The maximum size to search
+ */
+long find_next_zero_bit (const unsigned long * addr, long size, long offset)
+{
+ unsigned long * p = ((unsigned long *) addr) + (offset >> 6);
+ unsigned long set = 0;
+ unsigned long res, bit = offset&63;
+
+ if (bit) {
+ /*
+ * Look for zero in first word
+ */
+ asm("bsfq %1,%0\n\t"
+ "cmoveq %2,%0"
+ : "=r" (set)
+ : "r" (~(*p >> bit)), "r"(64L));
+ if (set < (64 - bit))
+ return set + offset;
+ set = 64 - bit;
+ p++;
+ }
+ /*
+ * No zero yet, search remaining full words for a zero
+ */
+ res = find_first_zero_bit ((const unsigned long *)p,
+ size - 64 * (p - (unsigned long *) addr));
+ return (offset + set + res);
+}
+
+static inline long
+__find_first_bit(const unsigned long * addr, unsigned long size)
+{
+ long d0, d1;
+ long res;
+
+ asm volatile(
+ " repe; scasq\n"
+ " jz 1f\n"
+ " subq $8,%%rdi\n"
+ " bsfq (%%rdi),%%rax\n"
+ "1: subq %[addr],%%rdi\n"
+ " shlq $3,%%rdi\n"
+ " addq %%rdi,%%rax"
+ :"=a" (res), "=&c" (d0), "=&D" (d1)
+ :"0" (0ULL),
+ "1" ((size + 63) >> 6), "2" (addr),
+ [addr] "r" (addr) : "memory");
+ return res;
+}
+
+/**
+ * find_first_bit - find the first set bit in a memory region
+ * @addr: The address to start the search at
+ * @size: The maximum size to search
+ *
+ * Returns the bit-number of the first set bit, not the number of the byte
+ * containing a bit.
+ */
+long find_first_bit(const unsigned long * addr, unsigned long size)
+{
+ return __find_first_bit(addr,size);
+}
+
+/**
+ * find_next_bit - find the first set bit in a memory region
+ * @addr: The address to base the search on
+ * @offset: The bitnumber to start searching at
+ * @size: The maximum size to search
+ */
+long find_next_bit(const unsigned long * addr, long size, long offset)
+{
+ const unsigned long * p = addr + (offset >> 6);
+ unsigned long set = 0, bit = offset & 63, res;
+
+ if (bit) {
+ /*
+ * Look for nonzero in the first 64 bits:
+ */
+ asm("bsfq %1,%0\n\t"
+ "cmoveq %2,%0\n\t"
+ : "=r" (set)
+ : "r" (*p >> bit), "r" (64L));
+ if (set < (64 - bit))
+ return set + offset;
+ set = 64 - bit;
+ p++;
+ }
+ /*
+ * No set bit yet, search remaining full words for a bit
+ */
+ res = __find_first_bit (p, size - 64 * (p - addr));
+ return (offset + set + res);
+}
+
+#include <linux/module.h>
+
+EXPORT_SYMBOL(find_next_bit);
+EXPORT_SYMBOL(find_first_bit);
+EXPORT_SYMBOL(find_first_zero_bit);
+EXPORT_SYMBOL(find_next_zero_bit);
diff --git a/arch/x86_64/lib/bitstr.c b/arch/x86_64/lib/bitstr.c
new file mode 100644
index 000000000000..24676609a6ac
--- /dev/null
+++ b/arch/x86_64/lib/bitstr.c
@@ -0,0 +1,28 @@
+#include <linux/module.h>
+#include <linux/bitops.h>
+
+/* Find string of zero bits in a bitmap */
+unsigned long
+find_next_zero_string(unsigned long *bitmap, long start, long nbits, int len)
+{
+ unsigned long n, end, i;
+
+ again:
+ n = find_next_zero_bit(bitmap, nbits, start);
+ if (n == -1)
+ return -1;
+
+ /* could test bitsliced, but it's hardly worth it */
+ end = n+len;
+ if (end >= nbits)
+ return -1;
+ for (i = n+1; i < end; i++) {
+ if (test_bit(i, bitmap)) {
+ start = i+1;
+ goto again;
+ }
+ }
+ return n;
+}
+
+EXPORT_SYMBOL(find_next_zero_string);
diff --git a/arch/x86_64/lib/clear_page.S b/arch/x86_64/lib/clear_page.S
new file mode 100644
index 000000000000..30a9da458c15
--- /dev/null
+++ b/arch/x86_64/lib/clear_page.S
@@ -0,0 +1,50 @@
+/*
+ * Zero a page.
+ * rdi page
+ */
+ .globl clear_page
+ .p2align 4
+clear_page:
+ xorl %eax,%eax
+ movl $4096/64,%ecx
+ .p2align 4
+.Lloop:
+ decl %ecx
+#define PUT(x) movq %rax,x*8(%rdi)
+ movq %rax,(%rdi)
+ PUT(1)
+ PUT(2)
+ PUT(3)
+ PUT(4)
+ PUT(5)
+ PUT(6)
+ PUT(7)
+ leaq 64(%rdi),%rdi
+ jnz .Lloop
+ nop
+ ret
+clear_page_end:
+
+ /* C stepping K8 run faster using the string instructions.
+ It is also a lot simpler. Use this when possible */
+
+#include <asm/cpufeature.h>
+
+ .section .altinstructions,"a"
+ .align 8
+ .quad clear_page
+ .quad clear_page_c
+ .byte X86_FEATURE_K8_C
+ .byte clear_page_end-clear_page
+ .byte clear_page_c_end-clear_page_c
+ .previous
+
+ .section .altinstr_replacement,"ax"
+clear_page_c:
+ movl $4096/8,%ecx
+ xorl %eax,%eax
+ rep
+ stosq
+ ret
+clear_page_c_end:
+ .previous
diff --git a/arch/x86_64/lib/copy_page.S b/arch/x86_64/lib/copy_page.S
new file mode 100644
index 000000000000..dd3aa47b6bf5
--- /dev/null
+++ b/arch/x86_64/lib/copy_page.S
@@ -0,0 +1,101 @@
+/* Written 2003 by Andi Kleen, based on a kernel by Evandro Menezes */
+
+/* Don't use streaming store because it's better when the target
+ ends up in cache. */
+
+/* Could vary the prefetch distance based on SMP/UP */
+
+ .globl copy_page
+ .p2align 4
+copy_page:
+ subq $3*8,%rsp
+ movq %rbx,(%rsp)
+ movq %r12,1*8(%rsp)
+ movq %r13,2*8(%rsp)
+
+ movl $(4096/64)-5,%ecx
+ .p2align 4
+.Loop64:
+ dec %rcx
+
+ movq (%rsi), %rax
+ movq 8 (%rsi), %rbx
+ movq 16 (%rsi), %rdx
+ movq 24 (%rsi), %r8
+ movq 32 (%rsi), %r9
+ movq 40 (%rsi), %r10
+ movq 48 (%rsi), %r11
+ movq 56 (%rsi), %r12
+
+ prefetcht0 5*64(%rsi)
+
+ movq %rax, (%rdi)
+ movq %rbx, 8 (%rdi)
+ movq %rdx, 16 (%rdi)
+ movq %r8, 24 (%rdi)
+ movq %r9, 32 (%rdi)
+ movq %r10, 40 (%rdi)
+ movq %r11, 48 (%rdi)
+ movq %r12, 56 (%rdi)
+
+ leaq 64 (%rsi), %rsi
+ leaq 64 (%rdi), %rdi
+
+ jnz .Loop64
+
+ movl $5,%ecx
+ .p2align 4
+.Loop2:
+ decl %ecx
+
+ movq (%rsi), %rax
+ movq 8 (%rsi), %rbx
+ movq 16 (%rsi), %rdx
+ movq 24 (%rsi), %r8
+ movq 32 (%rsi), %r9
+ movq 40 (%rsi), %r10
+ movq 48 (%rsi), %r11
+ movq 56 (%rsi), %r12
+
+ movq %rax, (%rdi)
+ movq %rbx, 8 (%rdi)
+ movq %rdx, 16 (%rdi)
+ movq %r8, 24 (%rdi)
+ movq %r9, 32 (%rdi)
+ movq %r10, 40 (%rdi)
+ movq %r11, 48 (%rdi)
+ movq %r12, 56 (%rdi)
+
+ leaq 64(%rdi),%rdi
+ leaq 64(%rsi),%rsi
+
+ jnz .Loop2
+
+ movq (%rsp),%rbx
+ movq 1*8(%rsp),%r12
+ movq 2*8(%rsp),%r13
+ addq $3*8,%rsp
+ ret
+
+ /* C stepping K8 run faster using the string copy instructions.
+ It is also a lot simpler. Use this when possible */
+
+#include <asm/cpufeature.h>
+
+ .section .altinstructions,"a"
+ .align 8
+ .quad copy_page
+ .quad copy_page_c
+ .byte X86_FEATURE_K8_C
+ .byte copy_page_c_end-copy_page_c
+ .byte copy_page_c_end-copy_page_c
+ .previous
+
+ .section .altinstr_replacement,"ax"
+copy_page_c:
+ movl $4096/8,%ecx
+ rep
+ movsq
+ ret
+copy_page_c_end:
+ .previous
diff --git a/arch/x86_64/lib/copy_user.S b/arch/x86_64/lib/copy_user.S
new file mode 100644
index 000000000000..bd556c804248
--- /dev/null
+++ b/arch/x86_64/lib/copy_user.S
@@ -0,0 +1,294 @@
+/* Copyright 2002 Andi Kleen, SuSE Labs.
+ * Subject to the GNU Public License v2.
+ *
+ * Functions to copy from and to user space.
+ */
+
+#define FIX_ALIGNMENT 1
+
+ #include <asm/current.h>
+ #include <asm/offset.h>
+ #include <asm/thread_info.h>
+ #include <asm/cpufeature.h>
+
+/* Standard copy_to_user with segment limit checking */
+ .globl copy_to_user
+ .p2align 4
+copy_to_user:
+ GET_THREAD_INFO(%rax)
+ movq %rdi,%rcx
+ addq %rdx,%rcx
+ jc bad_to_user
+ cmpq threadinfo_addr_limit(%rax),%rcx
+ jae bad_to_user
+2:
+ .byte 0xe9 /* 32bit jump */
+ .long .Lcug-1f
+1:
+
+ .section .altinstr_replacement,"ax"
+3: .byte 0xe9 /* replacement jmp with 8 bit immediate */
+ .long copy_user_generic_c-1b /* offset */
+ .previous
+ .section .altinstructions,"a"
+ .align 8
+ .quad 2b
+ .quad 3b
+ .byte X86_FEATURE_K8_C
+ .byte 5
+ .byte 5
+ .previous
+
+/* Standard copy_from_user with segment limit checking */
+ .globl copy_from_user
+ .p2align 4
+copy_from_user:
+ GET_THREAD_INFO(%rax)
+ movq %rsi,%rcx
+ addq %rdx,%rcx
+ jc bad_from_user
+ cmpq threadinfo_addr_limit(%rax),%rcx
+ jae bad_from_user
+ /* FALL THROUGH to copy_user_generic */
+
+ .section .fixup,"ax"
+ /* must zero dest */
+bad_from_user:
+ movl %edx,%ecx
+ xorl %eax,%eax
+ rep
+ stosb
+bad_to_user:
+ movl %edx,%eax
+ ret
+ .previous
+
+
+/*
+ * copy_user_generic - memory copy with exception handling.
+ *
+ * Input:
+ * rdi destination
+ * rsi source
+ * rdx count
+ *
+ * Output:
+ * eax uncopied bytes or 0 if successful.
+ */
+ .globl copy_user_generic
+ .p2align 4
+copy_user_generic:
+ .byte 0x66,0x66,0x90 /* 5 byte nop for replacement jump */
+ .byte 0x66,0x90
+1:
+ .section .altinstr_replacement,"ax"
+2: .byte 0xe9 /* near jump with 32bit immediate */
+ .long copy_user_generic_c-1b /* offset */
+ .previous
+ .section .altinstructions,"a"
+ .align 8
+ .quad copy_user_generic
+ .quad 2b
+ .byte X86_FEATURE_K8_C
+ .byte 5
+ .byte 5
+ .previous
+.Lcug:
+ pushq %rbx
+ xorl %eax,%eax /*zero for the exception handler */
+
+#ifdef FIX_ALIGNMENT
+ /* check for bad alignment of destination */
+ movl %edi,%ecx
+ andl $7,%ecx
+ jnz .Lbad_alignment
+.Lafter_bad_alignment:
+#endif
+
+ movq %rdx,%rcx
+
+ movl $64,%ebx
+ shrq $6,%rdx
+ decq %rdx
+ js .Lhandle_tail
+
+ .p2align 4
+.Lloop:
+.Ls1: movq (%rsi),%r11
+.Ls2: movq 1*8(%rsi),%r8
+.Ls3: movq 2*8(%rsi),%r9
+.Ls4: movq 3*8(%rsi),%r10
+.Ld1: movq %r11,(%rdi)
+.Ld2: movq %r8,1*8(%rdi)
+.Ld3: movq %r9,2*8(%rdi)
+.Ld4: movq %r10,3*8(%rdi)
+
+.Ls5: movq 4*8(%rsi),%r11
+.Ls6: movq 5*8(%rsi),%r8
+.Ls7: movq 6*8(%rsi),%r9
+.Ls8: movq 7*8(%rsi),%r10
+.Ld5: movq %r11,4*8(%rdi)
+.Ld6: movq %r8,5*8(%rdi)
+.Ld7: movq %r9,6*8(%rdi)
+.Ld8: movq %r10,7*8(%rdi)
+
+ decq %rdx
+
+ leaq 64(%rsi),%rsi
+ leaq 64(%rdi),%rdi
+
+ jns .Lloop
+
+ .p2align 4
+.Lhandle_tail:
+ movl %ecx,%edx
+ andl $63,%ecx
+ shrl $3,%ecx
+ jz .Lhandle_7
+ movl $8,%ebx
+ .p2align 4
+.Lloop_8:
+.Ls9: movq (%rsi),%r8
+.Ld9: movq %r8,(%rdi)
+ decl %ecx
+ leaq 8(%rdi),%rdi
+ leaq 8(%rsi),%rsi
+ jnz .Lloop_8
+
+.Lhandle_7:
+ movl %edx,%ecx
+ andl $7,%ecx
+ jz .Lende
+ .p2align 4
+.Lloop_1:
+.Ls10: movb (%rsi),%bl
+.Ld10: movb %bl,(%rdi)
+ incq %rdi
+ incq %rsi
+ decl %ecx
+ jnz .Lloop_1
+
+.Lende:
+ popq %rbx
+ ret
+
+#ifdef FIX_ALIGNMENT
+ /* align destination */
+ .p2align 4
+.Lbad_alignment:
+ movl $8,%r9d
+ subl %ecx,%r9d
+ movl %r9d,%ecx
+ cmpq %r9,%rdx
+ jz .Lhandle_7
+ js .Lhandle_7
+.Lalign_1:
+.Ls11: movb (%rsi),%bl
+.Ld11: movb %bl,(%rdi)
+ incq %rsi
+ incq %rdi
+ decl %ecx
+ jnz .Lalign_1
+ subq %r9,%rdx
+ jmp .Lafter_bad_alignment
+#endif
+
+ /* table sorted by exception address */
+ .section __ex_table,"a"
+ .align 8
+ .quad .Ls1,.Ls1e
+ .quad .Ls2,.Ls2e
+ .quad .Ls3,.Ls3e
+ .quad .Ls4,.Ls4e
+ .quad .Ld1,.Ls1e
+ .quad .Ld2,.Ls2e
+ .quad .Ld3,.Ls3e
+ .quad .Ld4,.Ls4e
+ .quad .Ls5,.Ls5e
+ .quad .Ls6,.Ls6e
+ .quad .Ls7,.Ls7e
+ .quad .Ls8,.Ls8e
+ .quad .Ld5,.Ls5e
+ .quad .Ld6,.Ls6e
+ .quad .Ld7,.Ls7e
+ .quad .Ld8,.Ls8e
+ .quad .Ls9,.Le_quad
+ .quad .Ld9,.Le_quad
+ .quad .Ls10,.Le_byte
+ .quad .Ld10,.Le_byte
+#ifdef FIX_ALIGNMENT
+ .quad .Ls11,.Lzero_rest
+ .quad .Ld11,.Lzero_rest
+#endif
+ .quad .Le5,.Le_zero
+ .previous
+
+ /* compute 64-offset for main loop. 8 bytes accuracy with error on the
+ pessimistic side. this is gross. it would be better to fix the
+ interface. */
+ /* eax: zero, ebx: 64 */
+.Ls1e: addl $8,%eax
+.Ls2e: addl $8,%eax
+.Ls3e: addl $8,%eax
+.Ls4e: addl $8,%eax
+.Ls5e: addl $8,%eax
+.Ls6e: addl $8,%eax
+.Ls7e: addl $8,%eax
+.Ls8e: addl $8,%eax
+ addq %rbx,%rdi /* +64 */
+ subq %rax,%rdi /* correct destination with computed offset */
+
+ shlq $6,%rdx /* loop counter * 64 (stride length) */
+ addq %rax,%rdx /* add offset to loopcnt */
+ andl $63,%ecx /* remaining bytes */
+ addq %rcx,%rdx /* add them */
+ jmp .Lzero_rest
+
+ /* exception on quad word loop in tail handling */
+ /* ecx: loopcnt/8, %edx: length, rdi: correct */
+.Le_quad:
+ shll $3,%ecx
+ andl $7,%edx
+ addl %ecx,%edx
+ /* edx: bytes to zero, rdi: dest, eax:zero */
+.Lzero_rest:
+ movq %rdx,%rcx
+.Le_byte:
+ xorl %eax,%eax
+.Le5: rep
+ stosb
+ /* when there is another exception while zeroing the rest just return */
+.Le_zero:
+ movq %rdx,%rax
+ jmp .Lende
+
+ /* C stepping K8 run faster using the string copy instructions.
+ This is also a lot simpler. Use them when possible.
+ Patch in jmps to this code instead of copying it fully
+ to avoid unwanted aliasing in the exception tables. */
+
+ /* rdi destination
+ * rsi source
+ * rdx count
+ *
+ * Output:
+ * eax uncopied bytes or 0 if successfull.
+ */
+copy_user_generic_c:
+ movl %edx,%ecx
+ shrl $3,%ecx
+ andl $7,%edx
+1: rep
+ movsq
+ movl %edx,%ecx
+2: rep
+ movsb
+4: movl %ecx,%eax
+ ret
+3: lea (%rdx,%rcx,8),%rax
+ ret
+
+ .section __ex_table,"a"
+ .quad 1b,3b
+ .quad 2b,4b
+ .previous
diff --git a/arch/x86_64/lib/csum-copy.S b/arch/x86_64/lib/csum-copy.S
new file mode 100644
index 000000000000..01808ec37836
--- /dev/null
+++ b/arch/x86_64/lib/csum-copy.S
@@ -0,0 +1,233 @@
+/*
+ * Copyright 2002,2003 Andi Kleen, SuSE Labs.
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License. See the file COPYING in the main directory of this archive
+ * for more details. No warranty for anything given at all.
+ */
+ #include <linux/linkage.h>
+ #include <asm/errno.h>
+
+/*
+ * Checksum copy with exception handling.
+ * On exceptions src_err_ptr or dst_err_ptr is set to -EFAULT and the
+ * destination is zeroed.
+ *
+ * Input
+ * rdi source
+ * rsi destination
+ * edx len (32bit)
+ * ecx sum (32bit)
+ * r8 src_err_ptr (int)
+ * r9 dst_err_ptr (int)
+ *
+ * Output
+ * eax 64bit sum. undefined in case of exception.
+ *
+ * Wrappers need to take care of valid exception sum and zeroing.
+ * They also should align source or destination to 8 bytes.
+ */
+
+ .macro source
+10:
+ .section __ex_table,"a"
+ .align 8
+ .quad 10b,.Lbad_source
+ .previous
+ .endm
+
+ .macro dest
+20:
+ .section __ex_table,"a"
+ .align 8
+ .quad 20b,.Lbad_dest
+ .previous
+ .endm
+
+ .macro ignore L=.Lignore
+30:
+ .section __ex_table,"a"
+ .align 8
+ .quad 30b,\L
+ .previous
+ .endm
+
+
+ .globl csum_partial_copy_generic
+ .p2align 4
+csum_partial_copy_generic:
+ cmpl $3*64,%edx
+ jle .Lignore
+
+.Lignore:
+ subq $7*8,%rsp
+ movq %rbx,2*8(%rsp)
+ movq %r12,3*8(%rsp)
+ movq %r14,4*8(%rsp)
+ movq %r13,5*8(%rsp)
+ movq %rbp,6*8(%rsp)
+
+ movq %r8,(%rsp)
+ movq %r9,1*8(%rsp)
+
+ movl %ecx,%eax
+ movl %edx,%ecx
+
+ xorl %r9d,%r9d
+ movq %rcx,%r12
+
+ shrq $6,%r12
+ jz .Lhandle_tail /* < 64 */
+
+ clc
+
+ /* main loop. clear in 64 byte blocks */
+ /* r9: zero, r8: temp2, rbx: temp1, rax: sum, rcx: saved length */
+ /* r11: temp3, rdx: temp4, r12 loopcnt */
+ /* r10: temp5, rbp: temp6, r14 temp7, r13 temp8 */
+ .p2align 4
+.Lloop:
+ source
+ movq (%rdi),%rbx
+ source
+ movq 8(%rdi),%r8
+ source
+ movq 16(%rdi),%r11
+ source
+ movq 24(%rdi),%rdx
+
+ source
+ movq 32(%rdi),%r10
+ source
+ movq 40(%rdi),%rbp
+ source
+ movq 48(%rdi),%r14
+ source
+ movq 56(%rdi),%r13
+
+ ignore 2f
+ prefetcht0 5*64(%rdi)
+2:
+ adcq %rbx,%rax
+ adcq %r8,%rax
+ adcq %r11,%rax
+ adcq %rdx,%rax
+ adcq %r10,%rax
+ adcq %rbp,%rax
+ adcq %r14,%rax
+ adcq %r13,%rax
+
+ decl %r12d
+
+ dest
+ movq %rbx,(%rsi)
+ dest
+ movq %r8,8(%rsi)
+ dest
+ movq %r11,16(%rsi)
+ dest
+ movq %rdx,24(%rsi)
+
+ dest
+ movq %r10,32(%rsi)
+ dest
+ movq %rbp,40(%rsi)
+ dest
+ movq %r14,48(%rsi)
+ dest
+ movq %r13,56(%rsi)
+
+3:
+
+ leaq 64(%rdi),%rdi
+ leaq 64(%rsi),%rsi
+
+ jnz .Lloop
+
+ adcq %r9,%rax
+
+ /* do last upto 56 bytes */
+.Lhandle_tail:
+ /* ecx: count */
+ movl %ecx,%r10d
+ andl $63,%ecx
+ shrl $3,%ecx
+ jz .Lfold
+ clc
+ .p2align 4
+.Lloop_8:
+ source
+ movq (%rdi),%rbx
+ adcq %rbx,%rax
+ decl %ecx
+ dest
+ movq %rbx,(%rsi)
+ leaq 8(%rsi),%rsi /* preserve carry */
+ leaq 8(%rdi),%rdi
+ jnz .Lloop_8
+ adcq %r9,%rax /* add in carry */
+
+.Lfold:
+ /* reduce checksum to 32bits */
+ movl %eax,%ebx
+ shrq $32,%rax
+ addl %ebx,%eax
+ adcl %r9d,%eax
+
+ /* do last upto 6 bytes */
+.Lhandle_7:
+ movl %r10d,%ecx
+ andl $7,%ecx
+ shrl $1,%ecx
+ jz .Lhandle_1
+ movl $2,%edx
+ xorl %ebx,%ebx
+ clc
+ .p2align 4
+.Lloop_1:
+ source
+ movw (%rdi),%bx
+ adcl %ebx,%eax
+ dest
+ decl %ecx
+ movw %bx,(%rsi)
+ leaq 2(%rdi),%rdi
+ leaq 2(%rsi),%rsi
+ jnz .Lloop_1
+ adcl %r9d,%eax /* add in carry */
+
+ /* handle last odd byte */
+.Lhandle_1:
+ testl $1,%r10d
+ jz .Lende
+ xorl %ebx,%ebx
+ source
+ movb (%rdi),%bl
+ dest
+ movb %bl,(%rsi)
+ addl %ebx,%eax
+ adcl %r9d,%eax /* carry */
+
+.Lende:
+ movq 2*8(%rsp),%rbx
+ movq 3*8(%rsp),%r12
+ movq 4*8(%rsp),%r14
+ movq 5*8(%rsp),%r13
+ movq 6*8(%rsp),%rbp
+ addq $7*8,%rsp
+ ret
+
+ /* Exception handlers. Very simple, zeroing is done in the wrappers */
+.Lbad_source:
+ movq (%rsp),%rax
+ testq %rax,%rax
+ jz .Lende
+ movl $-EFAULT,(%rax)
+ jmp .Lende
+
+.Lbad_dest:
+ movq 8(%rsp),%rax
+ testq %rax,%rax
+ jz .Lende
+ movl $-EFAULT,(%rax)
+ jmp .Lende
diff --git a/arch/x86_64/lib/csum-partial.c b/arch/x86_64/lib/csum-partial.c
new file mode 100644
index 000000000000..5384e227cdf6
--- /dev/null
+++ b/arch/x86_64/lib/csum-partial.c
@@ -0,0 +1,150 @@
+/*
+ * arch/x86_64/lib/csum-partial.c
+ *
+ * This file contains network checksum routines that are better done
+ * in an architecture-specific manner due to speed.
+ */
+
+#include <linux/compiler.h>
+#include <linux/module.h>
+#include <asm/checksum.h>
+
+#define __force_inline inline __attribute__((always_inline))
+
+static inline unsigned short from32to16(unsigned a)
+{
+ unsigned short b = a >> 16;
+ asm("addw %w2,%w0\n\t"
+ "adcw $0,%w0\n"
+ : "=r" (b)
+ : "0" (b), "r" (a));
+ return b;
+}
+
+/*
+ * Do a 64-bit checksum on an arbitrary memory area.
+ * Returns a 32bit checksum.
+ *
+ * This isn't as time critical as it used to be because many NICs
+ * do hardware checksumming these days.
+ *
+ * Things tried and found to not make it faster:
+ * Manual Prefetching
+ * Unrolling to an 128 bytes inner loop.
+ * Using interleaving with more registers to break the carry chains.
+ */
+static __force_inline unsigned do_csum(const unsigned char *buff, unsigned len)
+{
+ unsigned odd, count;
+ unsigned long result = 0;
+
+ if (unlikely(len == 0))
+ return result;
+ odd = 1 & (unsigned long) buff;
+ if (unlikely(odd)) {
+ result = *buff << 8;
+ len--;
+ buff++;
+ }
+ count = len >> 1; /* nr of 16-bit words.. */
+ if (count) {
+ if (2 & (unsigned long) buff) {
+ result += *(unsigned short *)buff;
+ count--;
+ len -= 2;
+ buff += 2;
+ }
+ count >>= 1; /* nr of 32-bit words.. */
+ if (count) {
+ unsigned long zero;
+ unsigned count64;
+ if (4 & (unsigned long) buff) {
+ result += *(unsigned int *) buff;
+ count--;
+ len -= 4;
+ buff += 4;
+ }
+ count >>= 1; /* nr of 64-bit words.. */
+
+ /* main loop using 64byte blocks */
+ zero = 0;
+ count64 = count >> 3;
+ while (count64) {
+ asm("addq 0*8(%[src]),%[res]\n\t"
+ "adcq 1*8(%[src]),%[res]\n\t"
+ "adcq 2*8(%[src]),%[res]\n\t"
+ "adcq 3*8(%[src]),%[res]\n\t"
+ "adcq 4*8(%[src]),%[res]\n\t"
+ "adcq 5*8(%[src]),%[res]\n\t"
+ "adcq 6*8(%[src]),%[res]\n\t"
+ "adcq 7*8(%[src]),%[res]\n\t"
+ "adcq %[zero],%[res]"
+ : [res] "=r" (result)
+ : [src] "r" (buff), [zero] "r" (zero),
+ "[res]" (result));
+ buff += 64;
+ count64--;
+ }
+
+ /* last upto 7 8byte blocks */
+ count %= 8;
+ while (count) {
+ asm("addq %1,%0\n\t"
+ "adcq %2,%0\n"
+ : "=r" (result)
+ : "m" (*(unsigned long *)buff),
+ "r" (zero), "0" (result));
+ --count;
+ buff += 8;
+ }
+ result = add32_with_carry(result>>32,
+ result&0xffffffff);
+
+ if (len & 4) {
+ result += *(unsigned int *) buff;
+ buff += 4;
+ }
+ }
+ if (len & 2) {
+ result += *(unsigned short *) buff;
+ buff += 2;
+ }
+ }
+ if (len & 1)
+ result += *buff;
+ result = add32_with_carry(result>>32, result & 0xffffffff);
+ if (unlikely(odd)) {
+ result = from32to16(result);
+ result = ((result >> 8) & 0xff) | ((result & 0xff) << 8);
+ }
+ return result;
+}
+
+/*
+ * computes the checksum of a memory block at buff, length len,
+ * and adds in "sum" (32-bit)
+ *
+ * returns a 32-bit number suitable for feeding into itself
+ * or csum_tcpudp_magic
+ *
+ * this function must be called with even lengths, except
+ * for the last fragment, which may be odd
+ *
+ * it's best to have buff aligned on a 64-bit boundary
+ */
+unsigned csum_partial(const unsigned char *buff, unsigned len, unsigned sum)
+{
+ return add32_with_carry(do_csum(buff, len), sum);
+}
+
+EXPORT_SYMBOL(csum_partial);
+
+/*
+ * this routine is used for miscellaneous IP-like checksums, mainly
+ * in icmp.c
+ */
+unsigned short ip_compute_csum(unsigned char * buff, int len)
+{
+ return csum_fold(csum_partial(buff,len,0));
+}
+
diff --git a/arch/x86_64/lib/csum-wrappers.c b/arch/x86_64/lib/csum-wrappers.c
new file mode 100644
index 000000000000..94323f20816e
--- /dev/null
+++ b/arch/x86_64/lib/csum-wrappers.c
@@ -0,0 +1,129 @@
+/* Copyright 2002,2003 Andi Kleen, SuSE Labs.
+ * Subject to the GNU Public License v.2
+ *
+ * Wrappers of assembly checksum functions for x86-64.
+ */
+
+#include <asm/checksum.h>
+#include <linux/module.h>
+
+/**
+ * csum_partial_copy_from_user - Copy and checksum from user space.
+ * @src: source address (user space)
+ * @dst: destination address
+ * @len: number of bytes to be copied.
+ * @isum: initial sum that is added into the result (32bit unfolded)
+ * @errp: set to -EFAULT for an bad source address.
+ *
+ * Returns an 32bit unfolded checksum of the buffer.
+ * src and dst are best aligned to 64bits.
+ */
+unsigned int
+csum_partial_copy_from_user(const unsigned char __user *src, unsigned char *dst,
+ int len, unsigned int isum, int *errp)
+{
+ might_sleep();
+ *errp = 0;
+ if (likely(access_ok(VERIFY_READ,src, len))) {
+ /* Why 6, not 7? To handle odd addresses aligned we
+ would need to do considerable complications to fix the
+ checksum which is defined as an 16bit accumulator. The
+ fix alignment code is primarily for performance
+ compatibility with 32bit and that will handle odd
+ addresses slowly too. */
+ if (unlikely((unsigned long)src & 6)) {
+ while (((unsigned long)src & 6) && len >= 2) {
+ __u16 val16;
+ *errp = __get_user(val16, (__u16 __user *)src);
+ if (*errp)
+ return isum;
+ *(__u16 *)dst = val16;
+ isum = add32_with_carry(isum, val16);
+ src += 2;
+ dst += 2;
+ len -= 2;
+ }
+ }
+ isum = csum_partial_copy_generic((__force void *)src,dst,len,isum,errp,NULL);
+ if (likely(*errp == 0))
+ return isum;
+ }
+ *errp = -EFAULT;
+ memset(dst,0,len);
+ return isum;
+}
+
+EXPORT_SYMBOL(csum_partial_copy_from_user);
+
+/**
+ * csum_partial_copy_to_user - Copy and checksum to user space.
+ * @src: source address
+ * @dst: destination address (user space)
+ * @len: number of bytes to be copied.
+ * @isum: initial sum that is added into the result (32bit unfolded)
+ * @errp: set to -EFAULT for an bad destination address.
+ *
+ * Returns an 32bit unfolded checksum of the buffer.
+ * src and dst are best aligned to 64bits.
+ */
+unsigned int
+csum_partial_copy_to_user(unsigned const char *src, unsigned char __user *dst,
+ int len, unsigned int isum, int *errp)
+{
+ might_sleep();
+ if (unlikely(!access_ok(VERIFY_WRITE, dst, len))) {
+ *errp = -EFAULT;
+ return 0;
+ }
+
+ if (unlikely((unsigned long)dst & 6)) {
+ while (((unsigned long)dst & 6) && len >= 2) {
+ __u16 val16 = *(__u16 *)src;
+ isum = add32_with_carry(isum, val16);
+ *errp = __put_user(val16, (__u16 __user *)dst);
+ if (*errp)
+ return isum;
+ src += 2;
+ dst += 2;
+ len -= 2;
+ }
+ }
+
+ *errp = 0;
+ return csum_partial_copy_generic(src, (void __force *)dst,len,isum,NULL,errp);
+}
+
+EXPORT_SYMBOL(csum_partial_copy_to_user);
+
+/**
+ * csum_partial_copy_nocheck - Copy and checksum.
+ * @src: source address
+ * @dst: destination address
+ * @len: number of bytes to be copied.
+ * @isum: initial sum that is added into the result (32bit unfolded)
+ *
+ * Returns an 32bit unfolded checksum of the buffer.
+ */
+unsigned int
+csum_partial_copy_nocheck(const unsigned char *src, unsigned char *dst, int len, unsigned int sum)
+{
+ return csum_partial_copy_generic(src,dst,len,sum,NULL,NULL);
+}
+
+unsigned short csum_ipv6_magic(struct in6_addr *saddr, struct in6_addr *daddr,
+ __u32 len, unsigned short proto, unsigned int sum)
+{
+ __u64 rest, sum64;
+
+ rest = (__u64)htonl(len) + (__u64)htons(proto) + (__u64)sum;
+ asm(" addq (%[saddr]),%[sum]\n"
+ " adcq 8(%[saddr]),%[sum]\n"
+ " adcq (%[daddr]),%[sum]\n"
+ " adcq 8(%[daddr]),%[sum]\n"
+ " adcq $0,%[sum]\n"
+ : [sum] "=r" (sum64)
+ : "[sum]" (rest),[saddr] "r" (saddr), [daddr] "r" (daddr));
+ return csum_fold(add32_with_carry(sum64 & 0xffffffff, sum64>>32));
+}
+
+EXPORT_SYMBOL(csum_ipv6_magic);
diff --git a/arch/x86_64/lib/dec_and_lock.c b/arch/x86_64/lib/dec_and_lock.c
new file mode 100644
index 000000000000..ab43394dc775
--- /dev/null
+++ b/arch/x86_64/lib/dec_and_lock.c
@@ -0,0 +1,40 @@
+/*
+ * x86 version of "atomic_dec_and_lock()" using
+ * the atomic "cmpxchg" instruction.
+ *
+ * (For CPU's lacking cmpxchg, we use the slow
+ * generic version, and this one never even gets
+ * compiled).
+ */
+
+#include <linux/spinlock.h>
+#include <asm/atomic.h>
+
+int _atomic_dec_and_lock(atomic_t *atomic, spinlock_t *lock)
+{
+ int counter;
+ int newcount;
+
+repeat:
+ counter = atomic_read(atomic);
+ newcount = counter-1;
+
+ if (!newcount)
+ goto slow_path;
+
+ asm volatile("lock; cmpxchgl %1,%2"
+ :"=a" (newcount)
+ :"r" (newcount), "m" (atomic->counter), "0" (counter));
+
+ /* If the above failed, "eax" will have changed */
+ if (newcount != counter)
+ goto repeat;
+ return 0;
+
+slow_path:
+ spin_lock(lock);
+ if (atomic_dec_and_test(atomic))
+ return 1;
+ spin_unlock(lock);
+ return 0;
+}
diff --git a/arch/x86_64/lib/delay.c b/arch/x86_64/lib/delay.c
new file mode 100644
index 000000000000..6e2d66472eb1
--- /dev/null
+++ b/arch/x86_64/lib/delay.c
@@ -0,0 +1,48 @@
+/*
+ * Precise Delay Loops for x86-64
+ *
+ * Copyright (C) 1993 Linus Torvalds
+ * Copyright (C) 1997 Martin Mares <mj@atrey.karlin.mff.cuni.cz>
+ *
+ * The __delay function must _NOT_ be inlined as its execution time
+ * depends wildly on alignment on many x86 processors.
+ */
+
+#include <linux/config.h>
+#include <linux/sched.h>
+#include <linux/delay.h>
+#include <asm/delay.h>
+
+#ifdef CONFIG_SMP
+#include <asm/smp.h>
+#endif
+
+int x86_udelay_tsc = 0; /* Delay via TSC */
+
+void __delay(unsigned long loops)
+{
+ unsigned bclock, now;
+
+ rdtscl(bclock);
+ do
+ {
+ rep_nop();
+ rdtscl(now);
+ }
+ while((now-bclock) < loops);
+}
+
+inline void __const_udelay(unsigned long xloops)
+{
+ __delay(((xloops * cpu_data[_smp_processor_id()].loops_per_jiffy) >> 32) * HZ);
+}
+
+void __udelay(unsigned long usecs)
+{
+ __const_udelay(usecs * 0x000010c6); /* 2**32 / 1000000 */
+}
+
+void __ndelay(unsigned long nsecs)
+{
+ __const_udelay(nsecs * 0x00005); /* 2**32 / 1000000000 (rounded up) */
+}
diff --git a/arch/x86_64/lib/getuser.S b/arch/x86_64/lib/getuser.S
new file mode 100644
index 000000000000..f94ea8a44051
--- /dev/null
+++ b/arch/x86_64/lib/getuser.S
@@ -0,0 +1,101 @@
+/*
+ * __get_user functions.
+ *
+ * (C) Copyright 1998 Linus Torvalds
+ * (C) Copyright 2005 Andi Kleen
+ *
+ * These functions have a non-standard call interface
+ * to make them more efficient, especially as they
+ * return an error value in addition to the "real"
+ * return value.
+ */
+
+/*
+ * __get_user_X
+ *
+ * Inputs: %rcx contains the address.
+ * The register is modified, but all changes are undone
+ * before returning because the C code doesn't know about it.
+ *
+ * Outputs: %rax is error code (0 or -EFAULT)
+ * %rdx contains zero-extended value
+ *
+ * %r8 is destroyed.
+ *
+ * These functions should not modify any other registers,
+ * as they get called from within inline assembly.
+ */
+
+#include <linux/linkage.h>
+#include <asm/page.h>
+#include <asm/errno.h>
+#include <asm/offset.h>
+#include <asm/thread_info.h>
+
+ .text
+ .p2align 4
+.globl __get_user_1
+__get_user_1:
+ GET_THREAD_INFO(%r8)
+ cmpq threadinfo_addr_limit(%r8),%rcx
+ jae bad_get_user
+1: movzb (%rcx),%edx
+ xorl %eax,%eax
+ ret
+
+ .p2align 4
+.globl __get_user_2
+__get_user_2:
+ GET_THREAD_INFO(%r8)
+ addq $1,%rcx
+ jc 20f
+ cmpq threadinfo_addr_limit(%r8),%rcx
+ jae 20f
+ decq %rcx
+2: movzwl (%rcx),%edx
+ xorl %eax,%eax
+ ret
+20: decq %rcx
+ jmp bad_get_user
+
+ .p2align 4
+.globl __get_user_4
+__get_user_4:
+ GET_THREAD_INFO(%r8)
+ addq $3,%rcx
+ jc 30f
+ cmpq threadinfo_addr_limit(%r8),%rcx
+ jae 30f
+ subq $3,%rcx
+3: movl (%rcx),%edx
+ xorl %eax,%eax
+ ret
+30: subq $3,%rcx
+ jmp bad_get_user
+
+ .p2align 4
+.globl __get_user_8
+__get_user_8:
+ GET_THREAD_INFO(%r8)
+ addq $7,%rcx
+ jc bad_get_user
+ cmpq threadinfo_addr_limit(%r8),%rcx
+ jae bad_get_user
+ subq $7,%rcx
+4: movq (%rcx),%rdx
+ xorl %eax,%eax
+ ret
+40: subq $7,%rcx
+ jmp bad_get_user
+
+bad_get_user:
+ xorl %edx,%edx
+ movq $(-EFAULT),%rax
+ ret
+
+.section __ex_table,"a"
+ .quad 1b,bad_get_user
+ .quad 2b,bad_get_user
+ .quad 3b,bad_get_user
+ .quad 4b,bad_get_user
+.previous
diff --git a/arch/x86_64/lib/io.c b/arch/x86_64/lib/io.c
new file mode 100644
index 000000000000..87b4a4e18039
--- /dev/null
+++ b/arch/x86_64/lib/io.c
@@ -0,0 +1,23 @@
+#include <linux/string.h>
+#include <asm/io.h>
+#include <linux/module.h>
+
+void __memcpy_toio(unsigned long dst,const void*src,unsigned len)
+{
+ __inline_memcpy((void *) dst,src,len);
+}
+EXPORT_SYMBOL(__memcpy_toio);
+
+void __memcpy_fromio(void *dst,unsigned long src,unsigned len)
+{
+ __inline_memcpy(dst,(const void *) src,len);
+}
+EXPORT_SYMBOL(__memcpy_fromio);
+
+void memset_io(volatile void __iomem *a, int b, size_t c)
+{
+ /* XXX: memset can mangle the IO patterns quite a bit.
+ perhaps it would be better to use a dumb one */
+ memset((void *)a,b,c);
+}
+EXPORT_SYMBOL(memset_io);
diff --git a/arch/x86_64/lib/memcpy.S b/arch/x86_64/lib/memcpy.S
new file mode 100644
index 000000000000..c6c46494fef5
--- /dev/null
+++ b/arch/x86_64/lib/memcpy.S
@@ -0,0 +1,121 @@
+/* Copyright 2002 Andi Kleen */
+
+ #include <asm/cpufeature.h>
+/*
+ * memcpy - Copy a memory block.
+ *
+ * Input:
+ * rdi destination
+ * rsi source
+ * rdx count
+ *
+ * Output:
+ * rax original destination
+ */
+
+ .globl __memcpy
+ .globl memcpy
+ .p2align 4
+__memcpy:
+memcpy:
+ pushq %rbx
+ movq %rdi,%rax
+
+ movl %edx,%ecx
+ shrl $6,%ecx
+ jz .Lhandle_tail
+
+ .p2align 4
+.Lloop_64:
+ decl %ecx
+
+ movq (%rsi),%r11
+ movq 8(%rsi),%r8
+
+ movq %r11,(%rdi)
+ movq %r8,1*8(%rdi)
+
+ movq 2*8(%rsi),%r9
+ movq 3*8(%rsi),%r10
+
+ movq %r9,2*8(%rdi)
+ movq %r10,3*8(%rdi)
+
+ movq 4*8(%rsi),%r11
+ movq 5*8(%rsi),%r8
+
+ movq %r11,4*8(%rdi)
+ movq %r8,5*8(%rdi)
+
+ movq 6*8(%rsi),%r9
+ movq 7*8(%rsi),%r10
+
+ movq %r9,6*8(%rdi)
+ movq %r10,7*8(%rdi)
+
+ leaq 64(%rsi),%rsi
+ leaq 64(%rdi),%rdi
+ jnz .Lloop_64
+
+.Lhandle_tail:
+ movl %edx,%ecx
+ andl $63,%ecx
+ shrl $3,%ecx
+ jz .Lhandle_7
+ .p2align 4
+.Lloop_8:
+ decl %ecx
+ movq (%rsi),%r8
+ movq %r8,(%rdi)
+ leaq 8(%rdi),%rdi
+ leaq 8(%rsi),%rsi
+ jnz .Lloop_8
+
+.Lhandle_7:
+ movl %edx,%ecx
+ andl $7,%ecx
+ jz .Lende
+ .p2align 4
+.Lloop_1:
+ movb (%rsi),%r8b
+ movb %r8b,(%rdi)
+ incq %rdi
+ incq %rsi
+ decl %ecx
+ jnz .Lloop_1
+
+.Lende:
+ popq %rbx
+ ret
+.Lfinal:
+
+ /* C stepping K8 run faster using the string copy instructions.
+ It is also a lot simpler. Use this when possible */
+
+ .section .altinstructions,"a"
+ .align 8
+ .quad memcpy
+ .quad memcpy_c
+ .byte X86_FEATURE_K8_C
+ .byte .Lfinal-memcpy
+ .byte memcpy_c_end-memcpy_c
+ .previous
+
+ .section .altinstr_replacement,"ax"
+ /* rdi destination
+ * rsi source
+ * rdx count
+ */
+memcpy_c:
+ movq %rdi,%rax
+ movl %edx,%ecx
+ shrl $3,%ecx
+ andl $7,%edx
+ rep
+ movsq
+ movl %edx,%ecx
+ rep
+ movsb
+ ret
+memcpy_c_end:
+ .previous
diff --git a/arch/x86_64/lib/memmove.c b/arch/x86_64/lib/memmove.c
new file mode 100644
index 000000000000..e93d5255fdc9
--- /dev/null
+++ b/arch/x86_64/lib/memmove.c
@@ -0,0 +1,19 @@
+/* Normally compiler builtins are used, but sometimes the compiler calls out
+ of line code. Based on asm-i386/string.h.
+ */
+#define _STRING_C
+#include <linux/string.h>
+
+#undef memmove
+void *memmove(void * dest,const void *src,size_t count)
+{
+ if (dest < src) {
+ __inline_memcpy(dest,src,count);
+ } else {
+ char *p = (char *) dest + count;
+ char *s = (char *) src + count;
+ while (count--)
+ *--p = *--s;
+ }
+ return dest;
+}
diff --git a/arch/x86_64/lib/memset.S b/arch/x86_64/lib/memset.S
new file mode 100644
index 000000000000..4b4c40638640
--- /dev/null
+++ b/arch/x86_64/lib/memset.S
@@ -0,0 +1,125 @@
+/* Copyright 2002 Andi Kleen, SuSE Labs */
+/*
+ * ISO C memset - set a memory block to a byte value.
+ *
+ * rdi destination
+ * rsi value (char)
+ * rdx count (bytes)
+ *
+ * rax original destination
+ */
+ .globl __memset
+ .globl memset
+ .p2align 4
+memset:
+__memset:
+ movq %rdi,%r10
+ movq %rdx,%r11
+
+ /* expand byte value */
+ movzbl %sil,%ecx
+ movabs $0x0101010101010101,%rax
+ mul %rcx /* with rax, clobbers rdx */
+
+ /* align dst */
+ movl %edi,%r9d
+ andl $7,%r9d
+ jnz .Lbad_alignment
+.Lafter_bad_alignment:
+
+ movl %r11d,%ecx
+ shrl $6,%ecx
+ jz .Lhandle_tail
+
+ .p2align 4
+.Lloop_64:
+ decl %ecx
+ movq %rax,(%rdi)
+ movq %rax,8(%rdi)
+ movq %rax,16(%rdi)
+ movq %rax,24(%rdi)
+ movq %rax,32(%rdi)
+ movq %rax,40(%rdi)
+ movq %rax,48(%rdi)
+ movq %rax,56(%rdi)
+ leaq 64(%rdi),%rdi
+ jnz .Lloop_64
+
+ /* Handle tail in loops. The loops should be faster than hard
+ to predict jump tables. */
+ .p2align 4
+.Lhandle_tail:
+ movl %r11d,%ecx
+ andl $63&(~7),%ecx
+ jz .Lhandle_7
+ shrl $3,%ecx
+ .p2align 4
+.Lloop_8:
+ decl %ecx
+ movq %rax,(%rdi)
+ leaq 8(%rdi),%rdi
+ jnz .Lloop_8
+
+.Lhandle_7:
+ movl %r11d,%ecx
+ andl $7,%ecx
+ jz .Lende
+ .p2align 4
+.Lloop_1:
+ decl %ecx
+ movb %al,(%rdi)
+ leaq 1(%rdi),%rdi
+ jnz .Lloop_1
+
+.Lende:
+ movq %r10,%rax
+ ret
+
+.Lbad_alignment:
+ cmpq $7,%r11
+ jbe .Lhandle_7
+ movq %rax,(%rdi) /* unaligned store */
+ movq $8,%r8
+ subq %r9,%r8
+ addq %r8,%rdi
+ subq %r8,%r11
+ jmp .Lafter_bad_alignment
+
+ /* C stepping K8 run faster using the string instructions.
+ It is also a lot simpler. Use this when possible */
+
+#include <asm/cpufeature.h>
+
+ .section .altinstructions,"a"
+ .align 8
+ .quad memset
+ .quad memset_c
+ .byte X86_FEATURE_K8_C
+ .byte memset_c_end-memset_c
+ .byte memset_c_end-memset_c
+ .previous
+
+ .section .altinstr_replacement,"ax"
+ /* rdi destination
+ * rsi value
+ * rdx count
+ */
+memset_c:
+ movq %rdi,%r9
+ movl %edx,%r8d
+ andl $7,%r8d
+ movl %edx,%ecx
+ shrl $3,%ecx
+ /* expand byte value */
+ movzbl %sil,%esi
+ movabs $0x0101010101010101,%rax
+ mulq %rsi /* with rax, clobbers rdx */
+ rep
+ stosq
+ movl %r8d,%ecx
+ rep
+ stosb
+ movq %r9,%rax
+ ret
+memset_c_end:
+ .previous
diff --git a/arch/x86_64/lib/putuser.S b/arch/x86_64/lib/putuser.S
new file mode 100644
index 000000000000..0dee1fdcb162
--- /dev/null
+++ b/arch/x86_64/lib/putuser.S
@@ -0,0 +1,89 @@
+/*
+ * __put_user functions.
+ *
+ * (C) Copyright 1998 Linus Torvalds
+ * (C) Copyright 2005 Andi Kleen
+ *
+ * These functions have a non-standard call interface
+ * to make them more efficient, especially as they
+ * return an error value in addition to the "real"
+ * return value.
+ */
+
+/*
+ * __put_user_X
+ *
+ * Inputs: %rcx contains the address
+ * %rdx contains new value
+ *
+ * Outputs: %rax is error code (0 or -EFAULT)
+ *
+ * %r8 is destroyed.
+ *
+ * These functions should not modify any other registers,
+ * as they get called from within inline assembly.
+ */
+
+#include <linux/linkage.h>
+#include <asm/page.h>
+#include <asm/errno.h>
+#include <asm/offset.h>
+#include <asm/thread_info.h>
+
+ .text
+ .p2align 4
+.globl __put_user_1
+__put_user_1:
+ GET_THREAD_INFO(%r8)
+ cmpq threadinfo_addr_limit(%r8),%rcx
+ jae bad_put_user
+1: movb %dl,(%rcx)
+ xorl %eax,%eax
+ ret
+
+ .p2align 4
+.globl __put_user_2
+__put_user_2:
+ GET_THREAD_INFO(%r8)
+ addq $1,%rcx
+ jc bad_put_user
+ cmpq threadinfo_addr_limit(%r8),%rcx
+ jae bad_put_user
+2: movw %dx,-1(%rcx)
+ xorl %eax,%eax
+ ret
+
+ .p2align 4
+.globl __put_user_4
+__put_user_4:
+ GET_THREAD_INFO(%r8)
+ addq $3,%rcx
+ jc bad_put_user
+ cmpq threadinfo_addr_limit(%r8),%rcx
+ jae bad_put_user
+3: movl %edx,-3(%rcx)
+ xorl %eax,%eax
+ ret
+
+ .p2align 4
+.globl __put_user_8
+__put_user_8:
+ GET_THREAD_INFO(%r8)
+ addq $7,%rcx
+ jc bad_put_user
+ cmpq threadinfo_addr_limit(%r8),%rcx
+ jae bad_put_user
+4: movq %rdx,-7(%rcx)
+ xorl %eax,%eax
+ ret
+
+bad_put_user:
+ movq $(-EFAULT),%rax
+ ret
+
+.section __ex_table,"a"
+ .quad 1b,bad_put_user
+ .quad 2b,bad_put_user
+ .quad 3b,bad_put_user
+ .quad 4b,bad_put_user
+.previous
diff --git a/arch/x86_64/lib/thunk.S b/arch/x86_64/lib/thunk.S
new file mode 100644
index 000000000000..acc1e2ca7ed7
--- /dev/null
+++ b/arch/x86_64/lib/thunk.S
@@ -0,0 +1,95 @@
+ /*
+ * Save registers before calling assembly functions. This avoids
+ * disturbance of register allocation in some inline assembly constructs.
+ * Copyright 2001,2002 by Andi Kleen, SuSE Labs.
+ * Subject to the GNU public license, v.2. No warranty of any kind.
+ * $Id: thunk.S,v 1.2 2002/03/13 20:06:58 ak Exp $
+ */
+
+ #include <linux/config.h>
+ #include <linux/linkage.h>
+ #include <asm/dwarf2.h>
+ #include <asm/calling.h>
+ #include <asm/rwlock.h>
+
+ /* rdi: arg1 ... normal C conventions. rax is saved/restored. */
+ .macro thunk name,func
+ .globl \name
+\name:
+ CFI_STARTPROC
+ SAVE_ARGS
+ call \func
+ jmp restore
+ CFI_ENDPROC
+ .endm
+
+ /* rdi: arg1 ... normal C conventions. rax is passed from C. */
+ .macro thunk_retrax name,func
+ .globl \name
+\name:
+ CFI_STARTPROC
+ SAVE_ARGS
+ call \func
+ jmp restore_norax
+ CFI_ENDPROC
+ .endm
+
+
+ .section .sched.text
+#ifdef CONFIG_RWSEM_XCHGADD_ALGORITHM
+ thunk rwsem_down_read_failed_thunk,rwsem_down_read_failed
+ thunk rwsem_down_write_failed_thunk,rwsem_down_write_failed
+ thunk rwsem_wake_thunk,rwsem_wake
+ thunk rwsem_downgrade_thunk,rwsem_downgrade_wake
+#endif
+ thunk do_softirq_thunk,do_softirq
+
+ thunk __down_failed,__down
+ thunk_retrax __down_failed_interruptible,__down_interruptible
+ thunk_retrax __down_failed_trylock,__down_trylock
+ thunk __up_wakeup,__up
+
+ /* SAVE_ARGS below is used only for the .cfi directives it contains. */
+ CFI_STARTPROC
+ SAVE_ARGS
+restore:
+ RESTORE_ARGS
+ ret
+ CFI_ENDPROC
+
+ CFI_STARTPROC
+ SAVE_ARGS
+restore_norax:
+ RESTORE_ARGS 1
+ ret
+ CFI_ENDPROC
+
+#ifdef CONFIG_SMP
+/* Support for read/write spinlocks. */
+ .text
+/* rax: pointer to rwlock_t */
+ENTRY(__write_lock_failed)
+ lock
+ addl $RW_LOCK_BIAS,(%rax)
+1: rep
+ nop
+ cmpl $RW_LOCK_BIAS,(%rax)
+ jne 1b
+ lock
+ subl $RW_LOCK_BIAS,(%rax)
+ jnz __write_lock_failed
+ ret
+
+/* rax: pointer to rwlock_t */
+ENTRY(__read_lock_failed)
+ lock
+ incl (%rax)
+1: rep
+ nop
+ cmpl $1,(%rax)
+ js 1b
+ lock
+ decl (%rax)
+ js __read_lock_failed
+ ret
+#endif
diff --git a/arch/x86_64/lib/usercopy.c b/arch/x86_64/lib/usercopy.c
new file mode 100644
index 000000000000..db8abba1ad81
--- /dev/null
+++ b/arch/x86_64/lib/usercopy.c
@@ -0,0 +1,153 @@
+/*
+ * User address space access functions.
+ *
+ * Copyright 1997 Andi Kleen <ak@muc.de>
+ * Copyright 1997 Linus Torvalds
+ * Copyright 2002 Andi Kleen <ak@suse.de>
+ */
+#include <asm/uaccess.h>
+
+/*
+ * Copy a null terminated string from userspace.
+ */
+
+#define __do_strncpy_from_user(dst,src,count,res) \
+do { \
+ long __d0, __d1, __d2; \
+ might_sleep(); \
+ __asm__ __volatile__( \
+ " testq %1,%1\n" \
+ " jz 2f\n" \
+ "0: lodsb\n" \
+ " stosb\n" \
+ " testb %%al,%%al\n" \
+ " jz 1f\n" \
+ " decq %1\n" \
+ " jnz 0b\n" \
+ "1: subq %1,%0\n" \
+ "2:\n" \
+ ".section .fixup,\"ax\"\n" \
+ "3: movq %5,%0\n" \
+ " jmp 2b\n" \
+ ".previous\n" \
+ ".section __ex_table,\"a\"\n" \
+ " .align 8\n" \
+ " .quad 0b,3b\n" \
+ ".previous" \
+ : "=r"(res), "=c"(count), "=&a" (__d0), "=&S" (__d1), \
+ "=&D" (__d2) \
+ : "i"(-EFAULT), "0"(count), "1"(count), "3"(src), "4"(dst) \
+ : "memory"); \
+} while (0)
+
+long
+__strncpy_from_user(char *dst, const char __user *src, long count)
+{
+ long res;
+ __do_strncpy_from_user(dst, src, count, res);
+ return res;
+}
+
+long
+strncpy_from_user(char *dst, const char __user *src, long count)
+{
+ long res = -EFAULT;
+ if (access_ok(VERIFY_READ, src, 1))
+ __do_strncpy_from_user(dst, src, count, res);
+ return res;
+}
+
+/*
+ * Zero Userspace
+ */
+
+unsigned long __clear_user(void __user *addr, unsigned long size)
+{
+ long __d0;
+ might_sleep();
+ /* no memory constraint because it doesn't change any memory gcc knows
+ about */
+ asm volatile(
+ " testq %[size8],%[size8]\n"
+ " jz 4f\n"
+ "0: movq %[zero],(%[dst])\n"
+ " addq %[eight],%[dst]\n"
+ " decl %%ecx ; jnz 0b\n"
+ "4: movq %[size1],%%rcx\n"
+ " testl %%ecx,%%ecx\n"
+ " jz 2f\n"
+ "1: movb %b[zero],(%[dst])\n"
+ " incq %[dst]\n"
+ " decl %%ecx ; jnz 1b\n"
+ "2:\n"
+ ".section .fixup,\"ax\"\n"
+ "3: lea 0(%[size1],%[size8],8),%[size8]\n"
+ " jmp 2b\n"
+ ".previous\n"
+ ".section __ex_table,\"a\"\n"
+ " .align 8\n"
+ " .quad 0b,3b\n"
+ " .quad 1b,2b\n"
+ ".previous"
+ : [size8] "=c"(size), [dst] "=&D" (__d0)
+ : [size1] "r"(size & 7), "[size8]" (size / 8), "[dst]"(addr),
+ [zero] "r" (0UL), [eight] "r" (8UL));
+ return size;
+}
+
+
+unsigned long clear_user(void __user *to, unsigned long n)
+{
+ if (access_ok(VERIFY_WRITE, to, n))
+ return __clear_user(to, n);
+ return n;
+}
+
+/*
+ * Return the size of a string (including the ending 0)
+ *
+ * Return 0 on exception, a value greater than N if too long
+ */
+
+long strnlen_user(const char __user *s, long n)
+{
+ long res = 0;
+ char c;
+
+ if (!access_ok(VERIFY_READ, s, n))
+ return 0;
+
+ while (1) {
+ if (res>n)
+ return n+1;
+ if (__get_user(c, s))
+ return 0;
+ if (!c)
+ return res+1;
+ res++;
+ s++;
+ }
+}
+
+long strlen_user(const char __user *s)
+{
+ long res = 0;
+ char c;
+
+ for (;;) {
+ if (get_user(c, s))
+ return 0;
+ if (!c)
+ return res+1;
+ res++;
+ s++;
+ }
+}
+
+unsigned long copy_in_user(void __user *to, const void __user *from, unsigned len)
+{
+ if (access_ok(VERIFY_WRITE, to, len) && access_ok(VERIFY_READ, from, len)) {
+ return copy_user_generic((__force void *)to, (__force void *)from, len);
+ }
+ return len;
+}