Diffstat (limited to 'arch/sparc64/lib')
-rw-r--r--   arch/sparc64/lib/Makefile               56
-rw-r--r--   arch/sparc64/lib/blockops.S            138
-rw-r--r--   arch/sparc64/lib/checksum.S            565
-rw-r--r--   arch/sparc64/lib/copy_from_user.S      456
-rw-r--r--   arch/sparc64/lib/copy_to_user.S        456
-rw-r--r--   arch/sparc64/lib/locks.S                77
-rw-r--r--   arch/sparc64/lib/memcmp.S               29
-rw-r--r--   arch/sparc64/lib/memcpy.S              526
-rw-r--r--   arch/sparc64/lib/memscan.S             116
-rw-r--r--   arch/sparc64/lib/memset.S              196
-rw-r--r--   arch/sparc64/lib/strlen.S               77
-rw-r--r--   arch/sparc64/lib/strlen_user.S          99
-rw-r--r--   arch/sparc64/lib/strncmp.S              31
-rw-r--r--   arch/sparc64/lib/strncpy_from_user.S    54
14 files changed, 2876 insertions, 0 deletions
diff --git a/arch/sparc64/lib/Makefile b/arch/sparc64/lib/Makefile
new file mode 100644
index 000000000..56c506507
--- /dev/null
+++ b/arch/sparc64/lib/Makefile
@@ -0,0 +1,56 @@
+# $Id: Makefile,v 1.7 1997/04/07 18:57:05 jj Exp $
+# Makefile for Sparc library files..
+#
+
+CFLAGS := $(CFLAGS) -ansi
+
+OBJS = memset.o blockops.o locks.o memcpy.o strlen.o strncmp.o \
+ memscan.o strncpy_from_user.o strlen_user.o memcmp.o checksum.o \
+ copy_to_user.o copy_from_user.o
+
+lib.a: $(OBJS)
+ $(AR) rcs lib.a $(OBJS)
+ sync
+
+blockops.o: blockops.S
+ $(CC) -ansi -c -o blockops.o blockops.S
+
+memset.o: memset.S
+ $(CC) -D__ASSEMBLY__ -ansi -c -o memset.o memset.S
+
+copy_to_user.o: copy_to_user.S
+ $(CC) -D__ASSEMBLY__ -ansi -c -o copy_to_user.o copy_to_user.S
+
+copy_from_user.o: copy_from_user.S
+ $(CC) -D__ASSEMBLY__ -ansi -c -o copy_from_user.o copy_from_user.S
+
+memcpy.o: memcpy.S
+ $(CC) -D__ASSEMBLY__ -ansi -c -o memcpy.o memcpy.S
+
+strlen.o: strlen.S
+ $(CC) -D__ASSEMBLY__ -ansi -c -o strlen.o strlen.S
+
+strncmp.o: strncmp.S
+ $(CC) -D__ASSEMBLY__ -ansi -c -o strncmp.o strncmp.S
+
+memcmp.o: memcmp.S
+ $(CC) -D__ASSEMBLY__ -ansi -c -o memcmp.o memcmp.S
+
+locks.o: locks.S
+ $(CC) -D__ASSEMBLY__ -ansi -c -o locks.o locks.S
+
+checksum.o: checksum.S
+ $(CC) -D__ASSEMBLY__ -ansi -c -o checksum.o checksum.S
+
+memscan.o: memscan.S
+ $(CC) -D__ASSEMBLY__ -ansi -c -o memscan.o memscan.S
+
+strncpy_from_user.o: strncpy_from_user.S
+ $(CC) -D__ASSEMBLY__ -ansi -c -o strncpy_from_user.o strncpy_from_user.S
+
+strlen_user.o: strlen_user.S
+ $(CC) -D__ASSEMBLY__ -ansi -c -o strlen_user.o strlen_user.S
+
+dep:
+
+include $(TOPDIR)/Rules.make
diff --git a/arch/sparc64/lib/blockops.S b/arch/sparc64/lib/blockops.S
new file mode 100644
index 000000000..b3f06c18d
--- /dev/null
+++ b/arch/sparc64/lib/blockops.S
@@ -0,0 +1,138 @@
+/* $Id: blockops.S,v 1.5 1997/03/26 18:34:28 jj Exp $
+ * arch/sparc64/lib/blockops.S: UltraSparc block zero optimized routines.
+ *
+ * Copyright (C) 1996 David S. Miller (davem@caip.rutgers.edu)
+ * Copyright (C) 1997 Jakub Jelinek (jj@sunsite.mff.cuni.cz)
+ */
+
+#include <asm/asi.h>
+
+ /* Zero out 256 bytes of memory at (buf + offset). */
+#define BLAST_BLOCK(buf, offset) \
+ stda %f48, [buf + offset + 0x00] %asi; \
+ stda %f48, [buf + offset + 0x40] %asi; \
+ stda %f48, [buf + offset + 0x80] %asi; \
+ stda %f48, [buf + offset + 0xc0] %asi;
+
+ /* Copy 256 bytes of memory at (src + offset) to
+ * (dst + offset).
+ */
+#define MIRROR_BLOCK(dst, src, offset, sync) \
+ ldda [src + offset + 0x000] %asi, %f0; \
+ ldda [src + offset + 0x040] %asi, %f16; \
+ ldda [src + offset + 0x080] %asi, %f32; \
+ ldda [src + offset + 0x0c0] %asi, %f48; \
+ membar sync; \
+ stda %f0, [dst + offset + 0x000] %asi; \
+ stda %f16, [dst + offset + 0x040] %asi; \
+ stda %f32, [dst + offset + 0x080] %asi; \
+ stda %f48, [dst + offset + 0x0c0] %asi;
+
+ .text
+ .align 4
+
+ .globl bzero_2page, bzero_1page
+bzero_2page:
+ /* %o0 = buf */
+ mov %o0, %o1
+ wr %g0, ASI_BLK_P, %asi
+ mov 0x10, %g2
+
+ membar #Sync|#StoreLoad
+
+ fzero %f48
+ fzero %f50
+ fzero %f52
+ fzero %f54
+ fzero %f56
+ fzero %f58
+ fzero %f60
+ fzero %f62
+1:
+ BLAST_BLOCK(%o0, 0x000)
+ BLAST_BLOCK(%o0, 0x100)
+ BLAST_BLOCK(%o0, 0x200)
+ BLAST_BLOCK(%o0, 0x300)
+ subcc %g2, 1, %g2
+ bne,pt %icc, 1b
+ add %o0, 0x400, %o0
+
+ membar #Sync|#LoadStore|#StoreStore
+
+ retl
+ mov %o1, %o0
+
+bzero_1page:
+ /* %o0 = buf */
+ mov %o0, %o1
+ wr %g0, ASI_BLK_P, %asi
+ mov 0x08, %g2
+ membar #Sync|#StoreLoad
+ fzero %f48
+ fzero %f50
+ fzero %f52
+ fzero %f54
+ fzero %f56
+ fzero %f58
+ fzero %f60
+ fzero %f62
+1:
+ BLAST_BLOCK(%o0, 0x000)
+ BLAST_BLOCK(%o0, 0x100)
+ BLAST_BLOCK(%o0, 0x200)
+ BLAST_BLOCK(%o0, 0x300)
+ subcc %g2, 1, %g2
+ bne,pt %icc, 1b
+ add %o0, 0x400, %o0
+
+ membar #Sync|#LoadStore|#StoreStore
+
+ retl
+ mov %o1, %o0
+
+ .globl __bfill64
+__bfill64:
+ /* %o0 = buf */
+ stx %o1, [%sp + 0x7ff + 128]
+ wr %g0, ASI_BLK_P, %asi
+ mov 0x08, %g2
+ ldd [%sp + 0x7ff + 128], %f48
+ membar #Sync|#StoreLoad
+ fmovd %f48, %f50
+ fmovd %f48, %f52
+ fmovd %f48, %f54
+ fmovd %f48, %f56
+ fmovd %f48, %f58
+ fmovd %f48, %f60
+ fmovd %f48, %f62
+1:
+ BLAST_BLOCK(%o0, 0x000)
+ BLAST_BLOCK(%o0, 0x100)
+ BLAST_BLOCK(%o0, 0x200)
+ BLAST_BLOCK(%o0, 0x300)
+ subcc %g2, 1, %g2
+ bne,pt %icc, 1b
+ add %o0, 0x400, %o0
+
+ retl
+ membar #Sync|#LoadStore|#StoreStore
+
+ .globl __copy_1page
+__copy_1page:
+ /* %o0 = dst, %o1 = src */
+ or %g0, 0x08, %g1
+ wr %g0, ASI_BLK_P, %asi
+ membar #Sync|#StoreLoad
+1:
+ MIRROR_BLOCK(%o0, %o1, 0x000, #Sync)
+ MIRROR_BLOCK(%o0, %o1, 0x100, #Sync)
+ MIRROR_BLOCK(%o0, %o1, 0x200, #Sync)
+ MIRROR_BLOCK(%o0, %o1, 0x300, #Sync)
+ subcc %g1, 1, %g1
+ add %o0, 0x400, %o0
+ bne,pt %icc, 1b
+ add %o1, 0x400, %o1
+
+ retl
+ membar #Sync|#LoadStore|#StoreStore
+
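
For reference, the effect of these block routines expressed in plain C, as a minimal sketch only: it assumes the 8 KB sparc64 page size and uses ordinary stores, while the assembly works through the %f48-%f62 register block and 64-byte ASI_BLK_P block transfers.

#include <stdint.h>
#include <string.h>

#define PAGE_SIZE 8192                  /* assumed sparc64 page size */

/* bzero_1page / bzero_2page: clear one or two pages; the assembly does
 * each kilobyte as sixteen 64-byte block stores of the zeroed %f48 set. */
static void bzero_pages_ref(void *buf, int npages)
{
        memset(buf, 0, (size_t)npages * PAGE_SIZE);
}

/* __bfill64: fill one page with a repeated 64-bit pattern (%o1). */
static void bfill64_ref(void *buf, uint64_t pattern)
{
        uint64_t *p = buf;
        for (size_t i = 0; i < PAGE_SIZE / sizeof(uint64_t); i++)
                p[i] = pattern;
}

/* __copy_1page: copy one page, 1 KB per loop pass in the assembly. */
static void copy_1page_ref(void *dst, const void *src)
{
        memcpy(dst, src, PAGE_SIZE);
}
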
diff --git a/arch/sparc64/lib/checksum.S b/arch/sparc64/lib/checksum.S
new file mode 100644
index 000000000..8a06003ee
--- /dev/null
+++ b/arch/sparc64/lib/checksum.S
@@ -0,0 +1,565 @@
+/* checksum.S: Sparc V9 optimized checksum code.
+ *
+ * Copyright(C) 1995 Linus Torvalds
+ * Copyright(C) 1995 Miguel de Icaza
+ * Copyright(C) 1996 David S. Miller
+ * Copyright(C) 1997 Jakub Jelinek
+ *
+ * derived from:
+ * Linux/Alpha checksum c-code
+ * Linux/ix86 inline checksum assembly
+ * RFC1071 Computing the Internet Checksum (esp. Jacobsons m68k code)
+ * David Mosberger-Tang for optimized reference c-code
+ * BSD4.4 portable checksum routine
+ */
+
+#include <asm/errno.h>
+#include <asm/head.h>
+#include <asm/ptrace.h>
+#include <asm/asi.h>
+
+#define CSUM_BIGCHUNK(buf, offset, sum, t0, t1, t2, t3, t4, t5) \
+ ldd [buf + offset + 0x00], t0; \
+ ldd [buf + offset + 0x08], t2; \
+ addccc t0, sum, sum; \
+ addccc t1, sum, sum; \
+ ldd [buf + offset + 0x10], t4; \
+ addccc t2, sum, sum; \
+ addccc t3, sum, sum; \
+ ldd [buf + offset + 0x18], t0; \
+ addccc t4, sum, sum; \
+ addccc t5, sum, sum; \
+ addccc t0, sum, sum; \
+ addccc t1, sum, sum;
+
+#define CSUM_LASTCHUNK(buf, offset, sum, t0, t1, t2, t3) \
+ ldd [buf - offset - 0x08], t0; \
+ ldd [buf - offset - 0x00], t2; \
+ addccc t0, sum, sum; \
+ addccc t1, sum, sum; \
+ addccc t2, sum, sum; \
+ addccc t3, sum, sum;
+
+ /* Do end cruft out of band to get better cache patterns. */
+csum_partial_end_cruft:
+ andcc %o1, 8, %g0 ! check how much
+ be,pn %icc, 1f ! caller asks %o1 & 0x8
+ and %o1, 4, %g3 ! nope, check for word remaining
+ ldd [%o0], %g2 ! load two
+ addcc %g2, %o2, %o2 ! add first word to sum
+ addccc %g3, %o2, %o2 ! add second word as well
+ add %o0, 8, %o0 ! advance buf ptr
+ addc %g0, %o2, %o2 ! add in final carry
+1: brz,pn %g3, 1f ! nope, skip this code
+ andcc %o1, 3, %o1 ! check for trailing bytes
+ ld [%o0], %g2 ! load it
+ addcc %g2, %o2, %o2 ! add to sum
+ add %o0, 4, %o0 ! advance buf ptr
+ addc %g0, %o2, %o2 ! add in final carry
+1: brz,pn %o1, 1f ! no trailing bytes, return
+ addcc %o1, -1, %g0 ! only one byte remains?
+ bne,pn %icc, 2f ! at least two bytes more
+ subcc %o1, 2, %o1 ! only two bytes more?
+ ba,pt %xcc, 4f ! only one byte remains
+ clr %o4 ! clear fake hword value
+2: lduh [%o0], %o4 ! get hword
+ be,pn %icc, 6f ! jmp if only hword remains
+ add %o0, 2, %o0 ! advance buf ptr either way
+ sll %o4, 16, %o4 ! create upper hword
+4: ldub [%o0], %o5 ! get final byte
+ sll %o5, 8, %o5 ! put into place
+	or	%o5, %o4, %o4	! coalesce with hword (if any)
+6: addcc %o4, %o2, %o2 ! add to sum
+1: sllx %g4, 32, %g4 ! give gfp back
+ retl ! get outta here
+ addc %g0, %o2, %o0 ! add final carry into retval
+
+ /* Also do alignment out of band to get better cache patterns. */
+csum_partial_fix_alignment:
+
+ /* The common case is to get called with a nicely aligned
+ * buffer of size 0x20. Follow the code path for that case.
+ */
+ .globl csum_partial
+csum_partial: /* %o0=buf, %o1=len, %o2=sum */
+ andcc %o0, 0x7, %g0 ! alignment problems?
+ be,pt %icc, csum_partial_fix_aligned ! yep, handle it
+ andn %o1, 0x7f, %o3 ! num loop iterations
+ cmp %o1, 6
+ bl,pn %icc, cpte - 0x4
+ andcc %o0, 0x2, %g0
+ be,pn %icc, 1f
+ and %o0, 0x4, %g7
+ lduh [%o0 + 0x00], %g2
+ sub %o1, 2, %o1
+ add %o0, 2, %o0
+ sll %g2, 16, %g2
+ addcc %g2, %o2, %o2
+ srl %o2, 16, %g3
+ addc %g0, %g3, %g2
+ sll %o2, 16, %o2
+ sll %g2, 16, %g3
+ srl %o2, 16, %o2
+ or %g3, %o2, %o2
+1: brz,pn %g7, csum_partial_fix_aligned
+ nop
+ ld [%o0 + 0x00], %g2
+ sub %o1, 4, %o1
+ addcc %g2, %o2, %o2
+ add %o0, 4, %o0
+ addc %g0, %o2, %o2
+csum_partial_fix_aligned:
+ brz,pt %o3, 3f ! none to do
+ andcc %o1, 0x70, %g1 ! clears carry flag too
+5: CSUM_BIGCHUNK(%o0, 0x00, %o2, %o4, %o5, %g2, %g3, %g4, %g5)
+ CSUM_BIGCHUNK(%o0, 0x20, %o2, %o4, %o5, %g2, %g3, %g4, %g5)
+ CSUM_BIGCHUNK(%o0, 0x40, %o2, %o4, %o5, %g2, %g3, %g4, %g5)
+ CSUM_BIGCHUNK(%o0, 0x60, %o2, %o4, %o5, %g2, %g3, %g4, %g5)
+ sub %o3, 128, %o3 ! detract from loop iters
+ addc %g0, %o2, %o2 ! sink in final carry
+ brnz,pt %o3, 5b ! more to do
+ add %o0, 128, %o0 ! advance buf ptr
+3: brz,pn %g1, cpte ! nope
+ andcc %o1, 0xf, %o3 ! anything left at all?
+10: rd %pc, %g7 ! get pc
+ srl %g1, 1, %o4 ! compute offset
+ sub %g7, %g1, %g7 ! adjust jmp ptr
+ sub %g7, %o4, %g7 ! final jmp ptr adjust
+ jmp %g7 + (cpte - 8 - 10b) ! enter the table
+ add %o0, %g1, %o0 ! advance buf ptr
+cptbl: CSUM_LASTCHUNK(%o0, 0x68, %o2, %g2, %g3, %g4, %g5)
+ CSUM_LASTCHUNK(%o0, 0x58, %o2, %g2, %g3, %g4, %g5)
+ CSUM_LASTCHUNK(%o0, 0x48, %o2, %g2, %g3, %g4, %g5)
+ CSUM_LASTCHUNK(%o0, 0x38, %o2, %g2, %g3, %g4, %g5)
+ CSUM_LASTCHUNK(%o0, 0x28, %o2, %g2, %g3, %g4, %g5)
+ CSUM_LASTCHUNK(%o0, 0x18, %o2, %g2, %g3, %g4, %g5)
+ CSUM_LASTCHUNK(%o0, 0x08, %o2, %g2, %g3, %g4, %g5)
+ addc %g0, %o2, %o2 ! fetch final carry
+ andcc %o1, 0xf, %g0 ! anything left at all?
+cpte: brnz,pn %o3, csum_partial_end_cruft ! yep, handle it
+ sethi %uhi(KERNBASE), %g4
+ mov %o2, %o0 ! return computed csum
+ retl ! get outta here
+ sllx %g4, 32, %g4 ! give gfp back
+
+ .globl __csum_partial_copy_start, __csum_partial_copy_end
+__csum_partial_copy_start:
+
+#define EX(x,y,a,b,z) \
+98: x,y; \
+ .section .fixup,z##alloc,z##execinstr; \
+ .align 4; \
+99: ba,pt %xcc, 30f; \
+ a, b, %o3; \
+ .section __ex_table,z##alloc; \
+ .align 4; \
+ .word 98b, 99b; \
+ .text; \
+ .align 4
+
+#define EX2(x,y,z) \
+98: x,y; \
+ .section __ex_table,z##alloc; \
+ .align 4; \
+ .word 98b, 30f; \
+ .text; \
+ .align 4
+
+#define EX3(x,y,z) \
+98: x,y; \
+ .section __ex_table,z##alloc; \
+ .align 4; \
+ .word 98b, 96f; \
+ .text; \
+ .align 4
+
+#define EXT(start,end,handler,z) \
+ .section __ex_table,z##alloc; \
+ .align 4; \
+ .word start, 0, end, handler; \
+ .text; \
+ .align 4
+
+	/* This aligned version typically executes in 8.5 superscalar cycles; this
+ * is the best I can do. I say 8.5 because the final add will pair with
+ * the next ldd in the main unrolled loop. Thus the pipe is always full.
+ * If you change these macros (including order of instructions),
+ * please check the fixup code below as well.
+ */
+#define CSUMCOPY_BIGCHUNK_ALIGNED(src, dst, sum, off, t0, t1, t2, t3, t4, t5, t6, t7) \
+ ldd [src + off + 0x00], t0; \
+ ldd [src + off + 0x08], t2; \
+ addccc t0, sum, sum; \
+ ldd [src + off + 0x10], t4; \
+ addccc t1, sum, sum; \
+ ldd [src + off + 0x18], t6; \
+ addccc t2, sum, sum; \
+ std t0, [dst + off + 0x00]; \
+ addccc t3, sum, sum; \
+ std t2, [dst + off + 0x08]; \
+ addccc t4, sum, sum; \
+ std t4, [dst + off + 0x10]; \
+ addccc t5, sum, sum; \
+ std t6, [dst + off + 0x18]; \
+ addccc t6, sum, sum; \
+ addccc t7, sum, sum;
+
+ /* 12 superscalar cycles seems to be the limit for this case,
+	 * because of this we do all the ldd's together to get
+ * Viking MXCC into streaming mode. Ho hum...
+ */
+#define CSUMCOPY_BIGCHUNK(src, dst, sum, off, t0, t1, t2, t3, t4, t5, t6, t7) \
+ ldd [src + off + 0x00], t0; \
+ ldd [src + off + 0x08], t2; \
+ ldd [src + off + 0x10], t4; \
+ ldd [src + off + 0x18], t6; \
+ st t0, [dst + off + 0x00]; \
+ addccc t0, sum, sum; \
+ st t1, [dst + off + 0x04]; \
+ addccc t1, sum, sum; \
+ st t2, [dst + off + 0x08]; \
+ addccc t2, sum, sum; \
+ st t3, [dst + off + 0x0c]; \
+ addccc t3, sum, sum; \
+ st t4, [dst + off + 0x10]; \
+ addccc t4, sum, sum; \
+ st t5, [dst + off + 0x14]; \
+ addccc t5, sum, sum; \
+ st t6, [dst + off + 0x18]; \
+ addccc t6, sum, sum; \
+ st t7, [dst + off + 0x1c]; \
+ addccc t7, sum, sum;
+
+ /* Yuck, 6 superscalar cycles... */
+#define CSUMCOPY_LASTCHUNK(src, dst, sum, off, t0, t1, t2, t3) \
+ ldd [src - off - 0x08], t0; \
+ ldd [src - off - 0x00], t2; \
+ addccc t0, sum, sum; \
+ st t0, [dst - off - 0x08]; \
+ addccc t1, sum, sum; \
+ st t1, [dst - off - 0x04]; \
+ addccc t2, sum, sum; \
+ st t2, [dst - off - 0x00]; \
+ addccc t3, sum, sum; \
+ st t3, [dst - off + 0x04];
+
+ /* Handle the end cruft code out of band for better cache patterns. */
+cc_end_cruft:
+ andcc %o3, 8, %g0 ! begin checks for that code
+ be,pn %icc, 1f
+ and %o3, 4, %g5
+ EX(ldd [%o0 + 0x00], %g2, and %o3, 0xf,#)
+ add %o1, 8, %o1
+ addcc %g2, %g7, %g7
+ add %o0, 8, %o0
+ addccc %g3, %g7, %g7
+ EX2(st %g2, [%o1 - 0x08],#)
+ addc %g0, %g7, %g7
+ EX2(st %g3, [%o1 - 0x04],#)
+1: brz,pt %g5, 1f
+ andcc %o3, 3, %o3
+ EX(ld [%o0 + 0x00], %g2, add %o3, 4,#)
+ add %o1, 4, %o1
+ addcc %g2, %g7, %g7
+ EX2(st %g2, [%o1 - 0x04],#)
+ addc %g0, %g7, %g7
+ add %o0, 4, %o0
+1: brz,pn %o3, 1f
+ addcc %o3, -1, %g0
+ bne,pn %icc, 2f
+ subcc %o3, 2, %o3
+ ba,pt %xcc, 4f
+ clr %o4
+2: EX(lduh [%o0 + 0x00], %o4, add %o3, 2,#)
+ add %o0, 2, %o0
+ EX2(sth %o4, [%o1 + 0x00],#)
+ be,pn %icc, 6f
+ add %o1, 2, %o1
+ sll %o4, 16, %o4
+4: EX(ldub [%o0 + 0x00], %o5, add %g0, 1,#)
+ EX2(stb %o5, [%o1 + 0x00],#)
+ sll %o5, 8, %o5
+ or %o5, %o4, %o4
+6: addcc %o4, %g7, %g7
+1: sllx %g4, 32, %g4
+ retl
+ addc %g0, %g7, %o0
+
+ /* Sun, you just can't beat me, you just can't. Stop trying,
+ * give up. I'm serious, I am going to kick the living shit
+ * out of you, game over, lights out.
+ */
+ .align 8
+ .globl __csum_partial_copy_sparc_generic
+__csum_partial_copy_sparc_generic:
+ /* %o0=src, %o1=dest, %g1=len, %g7=sum */
+ xor %o0, %o1, %o4 ! get changing bits
+ andcc %o4, 3, %g0 ! check for mismatched alignment
+ bne,pn %icc, ccslow ! better this than unaligned/fixups
+ andcc %o0, 7, %g0 ! need to align things?
+ be,pt %icc, cc_dword_aligned ! yes, we check for short lengths there
+ andn %g1, 0x7f, %g2 ! can we use unrolled loop?
+ cmp %g1, 6
+ bl,a,pn %icc, ccte
+ andcc %g1, 0xf, %o3
+ andcc %o0, 0x1, %g0
+ bne,pn %icc, ccslow
+ andcc %o0, 0x2, %g0
+ be,pn %icc, 1f
+ andcc %o0, 0x4, %g0
+ EX(lduh [%o0 + 0x00], %g4, add %g1, 0,#)
+ sub %g1, 2, %g1
+ EX2(sth %g4, [%o1 + 0x00],#)
+ add %o0, 2, %o0
+ sll %g4, 16, %g4
+ addcc %g4, %g7, %g7
+ add %o1, 2, %o1
+ srl %g7, 16, %g3
+ addc %g0, %g3, %g4
+ sll %g7, 16, %g7
+ sll %g4, 16, %g3
+ srl %g7, 16, %g7
+ andcc %o0, 0x4, %g0
+ or %g3, %g7, %g7
+1: be,pt %icc, 3f
+ andn %g1, 0x7f, %g0
+ EX(ld [%o0 + 0x00], %g4, add %g1, 0,#)
+ sub %g1, 4, %g1
+ EX2(st %g4, [%o1 + 0x00],#)
+ add %o0, 4, %o0
+ addcc %g4, %g7, %g7
+ add %o1, 4, %o1
+ addc %g0, %g7, %g7
+cc_dword_aligned:
+3: brz,pn %g2, 3f ! nope, less than one loop remains
+	andcc	%o1, 4, %g0			! dest aligned on 4 or 8 byte boundary?
+ be,pn %icc, ccdbl + 4 ! 8 byte aligned, kick ass
+5: CSUMCOPY_BIGCHUNK(%o0,%o1,%g7,0x00,%o4,%o5,%g2,%g3,%g4,%g5,%o2,%o3)
+ CSUMCOPY_BIGCHUNK(%o0,%o1,%g7,0x20,%o4,%o5,%g2,%g3,%g4,%g5,%o2,%o3)
+ CSUMCOPY_BIGCHUNK(%o0,%o1,%g7,0x40,%o4,%o5,%g2,%g3,%g4,%g5,%o2,%o3)
+ CSUMCOPY_BIGCHUNK(%o0,%o1,%g7,0x60,%o4,%o5,%g2,%g3,%g4,%g5,%o2,%o3)
+10: EXT(5b, 10b, 20f,#) ! note for exception handling
+ sub %g1, 128, %g1 ! detract from length
+ addc %g0, %g7, %g7 ! add in last carry bit
+ andncc %g1, 0x7f, %g0 ! more to csum?
+ add %o0, 128, %o0 ! advance src ptr
+ bne,pt %icc, 5b ! we did not go negative, continue looping
+ add %o1, 128, %o1 ! advance dest ptr
+3: andcc %g1, 0x70, %o2 ! can use table?
+ccmerge:be,pn %icc, ccte ! nope, go and check for end cruft
+ andcc %g1, 0xf, %o3 ! get low bits of length (clears carry btw)
+ srl %o2, 1, %o4 ! begin negative offset computation
+13: rd %pc, %o5 ! set up table ptr end
+ add %o0, %o2, %o0 ! advance src ptr
+ sub %o5, %o4, %o5 ! continue table calculation
+ sll %o2, 1, %g2 ! constant multiplies are fun...
+ sub %o5, %g2, %o5 ! some more adjustments
+ jmpl %o5 + (12f-13b), %g0 ! jump into it, duff style, wheee...
+ add %o1, %o2, %o1 ! advance dest ptr (carry is clear btw)
+cctbl: CSUMCOPY_LASTCHUNK(%o0,%o1,%g7,0x68,%g2,%g3,%g4,%g5)
+ CSUMCOPY_LASTCHUNK(%o0,%o1,%g7,0x58,%g2,%g3,%g4,%g5)
+ CSUMCOPY_LASTCHUNK(%o0,%o1,%g7,0x48,%g2,%g3,%g4,%g5)
+ CSUMCOPY_LASTCHUNK(%o0,%o1,%g7,0x38,%g2,%g3,%g4,%g5)
+ CSUMCOPY_LASTCHUNK(%o0,%o1,%g7,0x28,%g2,%g3,%g4,%g5)
+ CSUMCOPY_LASTCHUNK(%o0,%o1,%g7,0x18,%g2,%g3,%g4,%g5)
+ CSUMCOPY_LASTCHUNK(%o0,%o1,%g7,0x08,%g2,%g3,%g4,%g5)
+12: EXT(cctbl, 12b, 22f,#) ! note for exception table handling
+ addc %g0, %g7, %g7
+ andcc %o3, 0xf, %g0 ! check for low bits set
+ccte: bne,pn %icc, cc_end_cruft ! something left, handle it out of band
+ sethi %uhi(KERNBASE), %g4 ! restore gfp
+ mov %g7, %o0 ! give em the computed checksum
+ retl ! return
+ sllx %g4, 32, %g4 ! finish gfp restoration
+ccdbl: CSUMCOPY_BIGCHUNK_ALIGNED(%o0,%o1,%g7,0x00,%o4,%o5,%g2,%g3,%g4,%g5,%o2,%o3)
+ CSUMCOPY_BIGCHUNK_ALIGNED(%o0,%o1,%g7,0x20,%o4,%o5,%g2,%g3,%g4,%g5,%o2,%o3)
+ CSUMCOPY_BIGCHUNK_ALIGNED(%o0,%o1,%g7,0x40,%o4,%o5,%g2,%g3,%g4,%g5,%o2,%o3)
+ CSUMCOPY_BIGCHUNK_ALIGNED(%o0,%o1,%g7,0x60,%o4,%o5,%g2,%g3,%g4,%g5,%o2,%o3)
+11: EXT(ccdbl, 11b, 21f,#) ! note for exception table handling
+ sub %g1, 128, %g1 ! detract from length
+ addc %g0, %g7, %g7 ! add in last carry bit
+ andncc %g1, 0x7f, %g0 ! more to csum?
+ add %o0, 128, %o0 ! advance src ptr
+ bne,pt %icc, ccdbl ! we did not go negative, continue looping
+ add %o1, 128, %o1 ! advance dest ptr
+ ba,pt %xcc, ccmerge ! finish it off, above
+ andcc %g1, 0x70, %o2 ! can use table? (clears carry btw)
+
+ccslow: mov 0, %g5
+ brlez,pn %g1, 4f
+ andcc %o0, 1, %o5
+ be,a,pt %icc, 1f
+ srl %g1, 1, %o3
+ sub %g1, 1, %g1
+ EX(ldub [%o0], %g5, add %g1, 1,#)
+ add %o0, 1, %o0
+ EX2(stb %g5, [%o1],#)
+ srl %g1, 1, %o3
+ add %o1, 1, %o1
+1: brz,a,pn %o3, 3f
+ andcc %g1, 1, %g0
+ andcc %o0, 2, %g0
+ be,a,pt %icc, 1f
+ srl %o3, 1, %o3
+ EX(lduh [%o0], %o4, add %g1, 0,#)
+ sub %g1, 2, %g1
+ srl %o4, 8, %g2
+ sub %o3, 1, %o3
+ EX2(stb %g2, [%o1],#)
+ add %o4, %g5, %g5
+ EX2(stb %o4, [%o1 + 1],#)
+ add %o0, 2, %o0
+ srl %o3, 1, %o3
+ add %o1, 2, %o1
+1: brz,a,pn %o3, 2f
+ andcc %g1, 2, %g0
+ EX3(ld [%o0], %o4,#)
+5: srl %o4, 24, %g2
+ srl %o4, 16, %g3
+ EX2(stb %g2, [%o1],#)
+ srl %o4, 8, %g2
+ EX2(stb %g3, [%o1 + 1],#)
+ add %o0, 4, %o0
+ EX2(stb %g2, [%o1 + 2],#)
+ addcc %o4, %g5, %g5
+ EX2(stb %o4, [%o1 + 3],#)
+	addc	%g5, %g0, %g5	! I am now too lazy to optimize this (question is if it
+	add	%o1, 4, %o1	! is worth it). Maybe some day - with the sll/srl
+ subcc %o3, 1, %o3 ! tricks
+ bne,a,pt %icc, 5b
+ EX3(ld [%o0], %o4,#)
+ sll %g5, 16, %g2
+ srl %g5, 16, %g5
+ srl %g2, 16, %g2
+ andcc %g1, 2, %g0
+ add %g2, %g5, %g5
+2: be,a,pt %icc, 3f
+ andcc %g1, 1, %g0
+ EX(lduh [%o0], %o4, and %g1, 3,#)
+ andcc %g1, 1, %g0
+ srl %o4, 8, %g2
+ add %o0, 2, %o0
+ EX2(stb %g2, [%o1],#)
+ add %g5, %o4, %g5
+ EX2(stb %o4, [%o1 + 1],#)
+ add %o1, 2, %o1
+3: be,a,pt %icc, 1f
+ sll %g5, 16, %o4
+ EX(ldub [%o0], %g2, add %g0, 1,#)
+ sll %g2, 8, %o4
+ EX2(stb %g2, [%o1],#)
+ add %g5, %o4, %g5
+ sll %g5, 16, %o4
+1: addcc %o4, %g5, %g5
+ srl %g5, 16, %o4
+ addc %g0, %o4, %g5
+ brz,pt %o5, 4f
+ srl %g5, 8, %o4
+ and %g5, 0xff, %g2
+ and %o4, 0xff, %o4
+ sll %g2, 8, %g2
+ or %g2, %o4, %g5
+4: addcc %g7, %g5, %g7
+ retl
+ addc %g0, %g7, %o0
+__csum_partial_copy_end:
+
+ .section .fixup,#alloc,#execinstr
+ .align 4
+/* We do these strange calculations for the csum_*_from_user case only, i.e.
+ * we only bother with faults on loads... */
+
+/* o2 = ((g2%20)&3)*8
+ * o3 = g1 - (g2/20)*32 - o2 */
+20:
+ cmp %g2, 20
+ blu,a,pn %icc, 1f
+ and %g2, 3, %o2
+ sub %g1, 32, %g1
+ ba,pt %xcc, 20b
+ sub %g2, 20, %g2
+1:
+ sll %o2, 3, %o2
+ ba,pt %xcc, 31f
+ sub %g1, %o2, %o3
+
+/* o2 = (!(g2 & 15) ? 0 : (((g2 & 15) + 1) & ~1)*8)
+ * o3 = g1 - (g2/16)*32 - o2 */
+21:
+ andcc %g2, 15, %o3
+ srl %g2, 4, %g2
+ be,a,pn %icc, 1f
+ clr %o2
+ add %o3, 1, %o3
+ and %o3, 14, %o3
+ sll %o3, 3, %o2
+1:
+ sll %g2, 5, %g2
+ sub %g1, %g2, %o3
+ ba,pt %xcc, 31f
+ sub %o3, %o2, %o3
+
+/* o0 += (g2/10)*16 - 0x70
+ * o1 += (g2/10)*16 - 0x70
+ * o2 = (g2 % 10) ? 8 : 0
+ * o3 += 0x70 - (g2/10)*16 - o2 */
+22:
+ cmp %g2, 10
+ blu,a,pt %xcc, 1f
+ sub %o0, 0x70, %o0
+ add %o0, 16, %o0
+ add %o1, 16, %o1
+ sub %o3, 16, %o3
+ ba,pt %xcc, 22b
+ sub %g2, 10, %g2
+1:
+ sub %o1, 0x70, %o1
+ add %o3, 0x70, %o3
+ clr %o2
+ movrnz %g2, 8, %o2
+ ba,pt %xcc, 31f
+ sub %o3, %o2, %o3
+96:
+ and %g1, 3, %g1
+ sll %o3, 2, %o3
+ add %g1, %o3, %o3
+30:
+/* %o1 is dst
+ * %o3 is # bytes to zero out
+ * %o4 is faulting address
+ * %o5 is %pc where fault occurred */
+ clr %o2
+31:
+/* %o0 is src
+ * %o1 is dst
+ * %o2 is # of bytes to copy from src to dst
+ * %o3 is # bytes to zero out
+ * %o4 is faulting address
+ * %o5 is %pc where fault occurred */
+ save %sp, -136, %sp
+ mov %i5, %o0
+ mov %i7, %o1
+ mov %i4, %o2
+ call lookup_fault
+ mov %g7, %i4
+ cmp %o0, 2
+ bne,pn %icc, 1f
+ add %g0, -EFAULT, %i5
+ brz,pn %i2, 2f
+ mov %i0, %o1
+ mov %i1, %o0
+ call __copy_from_user
+ mov %i2, %o2
+ brnz,a,pn %o0, 2f
+ add %i3, %i2, %i3
+ add %i1, %i2, %i1
+2:
+ mov %i1, %o0
+	wr	%g0, ASI_S, %asi
+ call __bzero_noasi
+ mov %i3, %o1
+1:
+ ldx [%sp + STACK_BIAS + 264], %o2 ! struct_ptr of parent
+ st %i5, [%o2]
+ ret
+ restore
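
As a reference for what csum_partial above computes: the addccc chains accumulate the buffer into a 32-bit ones-complement partial sum with end-around carry. Roughly, in C (a sketch that assumes big-endian byte order and glosses over the alignment and odd-length handling the assembly does out of band):

#include <stdint.h>
#include <stddef.h>

static uint32_t csum_partial_ref(const unsigned char *buf, size_t len,
                                 uint32_t sum)
{
        uint64_t acc = sum;
        size_t i;

        for (i = 0; i + 1 < len; i += 2)        /* 16-bit big-endian words */
                acc += ((uint32_t)buf[i] << 8) | buf[i + 1];
        if (i < len)                            /* trailing odd byte */
                acc += (uint32_t)buf[i] << 8;

        while (acc >> 32)                       /* end-around carry */
                acc = (acc & 0xffffffffu) + (acc >> 32);
        return (uint32_t)acc;
}
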
diff --git a/arch/sparc64/lib/copy_from_user.S b/arch/sparc64/lib/copy_from_user.S
new file mode 100644
index 000000000..ba26a1c01
--- /dev/null
+++ b/arch/sparc64/lib/copy_from_user.S
@@ -0,0 +1,456 @@
+/* copy_user.S: Sparc optimized copy_from_user code.
+ *
+ * Copyright(C) 1995 Linus Torvalds
+ * Copyright(C) 1996 David S. Miller
+ * Copyright(C) 1996 Eddie C. Dost
+ * Copyright(C) 1996,1997 Jakub Jelinek
+ *
+ * derived from:
+ * e-mail between David and Eddie.
+ *
+ * Returns 0 if successful, otherwise count of bytes not copied yet
+ *
+ * FIXME: This code should be optimized for sparc64... -jj
+ */
+
+#include <asm/ptrace.h>
+#include <asm/asi.h>
+
+#define EX(x,y,a,b,z) \
+98: x,y; \
+ .section .fixup,z##alloc,z##execinstr; \
+ .align 4; \
+99: retl; \
+ a, b, %o0; \
+ .section __ex_table,z##alloc; \
+ .align 4; \
+ .word 98b, 99b; \
+ .text; \
+ .align 4
+
+#define EX2(x,y,c,d,e,a,b,z) \
+98: x,y; \
+ .section .fixup,z##alloc,z##execinstr; \
+ .align 4; \
+99: c, d, e; \
+ retl; \
+ a, b, %o0; \
+ .section __ex_table,z##alloc; \
+ .align 4; \
+ .word 98b, 99b; \
+ .text; \
+ .align 4
+
+#define EXO2(x,y,z) \
+98: x,##y; \
+ .section __ex_table,z##alloc; \
+ .align 4; \
+ .word 98b, 97f; \
+ .text; \
+ .align 4
+
+#define EXT(start,end,handler,z) \
+ .section __ex_table,z##alloc; \
+ .align 4; \
+ .word start, 0, end, handler; \
+ .text; \
+ .align 4
+
+/* Please do not change the following macros unless you change the logic used
+ * in .fixup at the end of this file as well
+ */
+
+/* Both these macros have to start with exactly the same insn */
+#define MOVE_BIGCHUNK(src, dst, offset, t0, t1, t2, t3, t4, t5, t6, t7) \
+ ldda [%src + offset + 0x00] %asi, %t0; \
+ ldda [%src + offset + 0x08] %asi, %t2; \
+ ldda [%src + offset + 0x10] %asi, %t4; \
+ ldda [%src + offset + 0x18] %asi, %t6; \
+ st %t0, [%dst + offset + 0x00]; \
+ st %t1, [%dst + offset + 0x04]; \
+ st %t2, [%dst + offset + 0x08]; \
+ st %t3, [%dst + offset + 0x0c]; \
+ st %t4, [%dst + offset + 0x10]; \
+ st %t5, [%dst + offset + 0x14]; \
+ st %t6, [%dst + offset + 0x18]; \
+ st %t7, [%dst + offset + 0x1c];
+
+#define MOVE_BIGALIGNCHUNK(src, dst, offset, t0, t1, t2, t3, t4, t5, t6, t7) \
+ ldda [%src + offset + 0x00] %asi, %t0; \
+ ldda [%src + offset + 0x08] %asi, %t2; \
+ ldda [%src + offset + 0x10] %asi, %t4; \
+ ldda [%src + offset + 0x18] %asi, %t6; \
+ std %t0, [%dst + offset + 0x00]; \
+ std %t2, [%dst + offset + 0x08]; \
+ std %t4, [%dst + offset + 0x10]; \
+ std %t6, [%dst + offset + 0x18];
+
+#define MOVE_LASTCHUNK(src, dst, offset, t0, t1, t2, t3) \
+ ldda [%src - offset - 0x10] %asi, %t0; \
+ ldda [%src - offset - 0x08] %asi, %t2; \
+ st %t0, [%dst - offset - 0x10]; \
+ st %t1, [%dst - offset - 0x0c]; \
+ st %t2, [%dst - offset - 0x08]; \
+ st %t3, [%dst - offset - 0x04];
+
+#define MOVE_HALFCHUNK(src, dst, offset, t0, t1, t2, t3) \
+ lduha [%src + offset + 0x00] %asi, %t0; \
+ lduha [%src + offset + 0x02] %asi, %t1; \
+ lduha [%src + offset + 0x04] %asi, %t2; \
+ lduha [%src + offset + 0x06] %asi, %t3; \
+ sth %t0, [%dst + offset + 0x00]; \
+ sth %t1, [%dst + offset + 0x02]; \
+ sth %t2, [%dst + offset + 0x04]; \
+ sth %t3, [%dst + offset + 0x06];
+
+#define MOVE_SHORTCHUNK(src, dst, offset, t0, t1) \
+ lduba [%src - offset - 0x02] %asi, %t0; \
+ lduba [%src - offset - 0x01] %asi, %t1; \
+ stb %t0, [%dst - offset - 0x02]; \
+ stb %t1, [%dst - offset - 0x01];
+
+ .text
+ .align 4
+
+ .globl __copy_from_user
+dword_align:
+ andcc %o1, 1, %g0
+ be 4f
+ andcc %o1, 2, %g0
+
+ EXO2(lduba [%o1] %asi, %g2,#)
+ add %o1, 1, %o1
+ stb %g2, [%o0]
+ sub %o2, 1, %o2
+ bne 3f
+ add %o0, 1, %o0
+
+ EXO2(lduha [%o1] %asi, %g2,#)
+ add %o1, 2, %o1
+ sth %g2, [%o0]
+ sub %o2, 2, %o2
+ ba,pt %xcc, 3f
+ add %o0, 2, %o0
+4:
+ EXO2(lduha [%o1] %asi, %g2,#)
+ add %o1, 2, %o1
+ sth %g2, [%o0]
+ sub %o2, 2, %o2
+ ba,pt %xcc, 3f
+ add %o0, 2, %o0
+
+__copy_from_user: /* %o0=dst %o1=src %o2=len */
+ wr %g0, ASI_S, %asi
+ xor %o0, %o1, %o4
+1:
+ andcc %o4, 3, %o5
+2:
+ bne,pn %icc, cannot_optimize
+ cmp %o2, 15
+
+ bleu,pn %xcc, short_aligned_end
+ andcc %o1, 3, %g0
+
+ bne,pn %icc, dword_align
+3:
+ andcc %o1, 4, %g0
+
+ be,pt %icc, 2f
+ mov %o2, %g1
+
+ EXO2(lda [%o1] %asi, %o4,#)
+ sub %g1, 4, %g1
+ st %o4, [%o0]
+ add %o1, 4, %o1
+ add %o0, 4, %o0
+2:
+ andcc %g1, 0xffffffffffffff80, %g7
+ be,pn %xcc, 3f
+ andcc %o0, 4, %g0
+
+ be,pn %icc, ldd_std + 4
+5:
+ MOVE_BIGCHUNK(o1, o0, 0x00, o2, o3, o4, o5, g2, g3, g4, g5)
+ MOVE_BIGCHUNK(o1, o0, 0x20, o2, o3, o4, o5, g2, g3, g4, g5)
+ MOVE_BIGCHUNK(o1, o0, 0x40, o2, o3, o4, o5, g2, g3, g4, g5)
+ MOVE_BIGCHUNK(o1, o0, 0x60, o2, o3, o4, o5, g2, g3, g4, g5)
+80:
+ EXT(5b, 80b, 50f,#)
+ subcc %g7, 128, %g7
+ add %o1, 128, %o1
+ bne,pt %xcc, 5b
+ add %o0, 128, %o0
+3:
+ andcc %g1, 0x70, %g7
+ be,pn %icc, copy_user_table_end
+ andcc %g1, 8, %g0
+100:
+ rd %pc, %o5
+ srl %g7, 1, %o4
+ add %g7, %o4, %o4
+ add %o1, %g7, %o1
+ sub %o5, %o4, %o5
+ jmpl %o5 + (copy_user_table_end - 100b), %g0
+ add %o0, %g7, %o0
+
+copy_user_table:
+ MOVE_LASTCHUNK(o1, o0, 0x60, g2, g3, g4, g5)
+ MOVE_LASTCHUNK(o1, o0, 0x50, g2, g3, g4, g5)
+ MOVE_LASTCHUNK(o1, o0, 0x40, g2, g3, g4, g5)
+ MOVE_LASTCHUNK(o1, o0, 0x30, g2, g3, g4, g5)
+ MOVE_LASTCHUNK(o1, o0, 0x20, g2, g3, g4, g5)
+ MOVE_LASTCHUNK(o1, o0, 0x10, g2, g3, g4, g5)
+ MOVE_LASTCHUNK(o1, o0, 0x00, g2, g3, g4, g5)
+copy_user_table_end:
+ EXT(copy_user_table, copy_user_table_end, 51f,#)
+ be,pt %icc, copy_user_last7
+ andcc %g1, 4, %g0
+
+ EX(ldda [%o1] %asi, %g2, and %g1, 0xf,#)
+ add %o0, 8, %o0
+ add %o1, 8, %o1
+ st %g2, [%o0 - 0x08]
+ st %g3, [%o0 - 0x04]
+copy_user_last7:
+ be,pn %icc, 1f
+ andcc %g1, 2, %g0
+
+ EX(lda [%o1] %asi, %g2, and %g1, 7,#)
+ add %o1, 4, %o1
+ st %g2, [%o0]
+ add %o0, 4, %o0
+1:
+ be,pn %icc, 1f
+ andcc %g1, 1, %g0
+
+ EX(lduha [%o1] %asi, %g2, and %g1, 3,#)
+ add %o1, 2, %o1
+ sth %g2, [%o0]
+ add %o0, 2, %o0
+1:
+ be,pn %icc, 1f
+ nop
+
+ EX(lduba [%o1] %asi, %g2, add %g0, 1,#)
+ stb %g2, [%o0]
+1:
+ retl
+ clr %o0
+
+ldd_std:
+ MOVE_BIGALIGNCHUNK(o1, o0, 0x00, o2, o3, o4, o5, g2, g3, g4, g5)
+ MOVE_BIGALIGNCHUNK(o1, o0, 0x20, o2, o3, o4, o5, g2, g3, g4, g5)
+ MOVE_BIGALIGNCHUNK(o1, o0, 0x40, o2, o3, o4, o5, g2, g3, g4, g5)
+ MOVE_BIGALIGNCHUNK(o1, o0, 0x60, o2, o3, o4, o5, g2, g3, g4, g5)
+81:
+ EXT(ldd_std, 81b, 52f,#)
+ subcc %g7, 128, %g7
+ add %o1, 128, %o1
+ bne,pt %xcc, ldd_std
+ add %o0, 128, %o0
+
+ andcc %g1, 0x70, %g7
+ be,pn %icc, copy_user_table_end
+ andcc %g1, 8, %g0
+101:
+ rd %pc, %o5
+ srl %g7, 1, %o4
+ add %g7, %o4, %o4
+ add %o1, %g7, %o1
+ sub %o5, %o4, %o5
+ jmpl %o5 + (copy_user_table_end - 101b), %g0
+ add %o0, %g7, %o0
+
+cannot_optimize:
+ bleu short_end
+ cmp %o5, 2
+
+ bne byte_chunk
+ and %o2, 0xfffffffffffffff0, %o3
+
+ andcc %o1, 1, %g0
+ be 10f
+ nop
+
+ EXO2(lduba [%o1] %asi, %g2,#)
+ add %o1, 1, %o1
+ stb %g2, [%o0]
+ sub %o2, 1, %o2
+ andcc %o2, 0xfffffffffffffff0, %o3
+ be short_end
+ add %o0, 1, %o0
+10:
+ MOVE_HALFCHUNK(o1, o0, 0x00, g2, g3, g4, g5)
+ MOVE_HALFCHUNK(o1, o0, 0x08, g2, g3, g4, g5)
+82:
+ EXT(10b, 82b, 53f,#)
+ subcc %o3, 0x10, %o3
+ add %o1, 0x10, %o1
+ bne 10b
+ add %o0, 0x10, %o0
+ ba,pt %xcc, 2f
+ and %o2, 0xe, %o3
+
+byte_chunk:
+ MOVE_SHORTCHUNK(o1, o0, -0x02, g2, g3)
+ MOVE_SHORTCHUNK(o1, o0, -0x04, g2, g3)
+ MOVE_SHORTCHUNK(o1, o0, -0x06, g2, g3)
+ MOVE_SHORTCHUNK(o1, o0, -0x08, g2, g3)
+ MOVE_SHORTCHUNK(o1, o0, -0x0a, g2, g3)
+ MOVE_SHORTCHUNK(o1, o0, -0x0c, g2, g3)
+ MOVE_SHORTCHUNK(o1, o0, -0x0e, g2, g3)
+ MOVE_SHORTCHUNK(o1, o0, -0x10, g2, g3)
+83:
+ EXT(byte_chunk, 83b, 54f,#)
+ subcc %o3, 0x10, %o3
+ add %o1, 0x10, %o1
+ bne,pt %xcc, byte_chunk
+ add %o0, 0x10, %o0
+
+short_end:
+ and %o2, 0xe, %o3
+2:
+ rd %pc, %o5
+ sll %o3, 3, %o4
+ add %o0, %o3, %o0
+ sub %o5, %o4, %o5
+ add %o1, %o3, %o1
+ jmpl %o5 + (short_table_end - 2b), %g0
+ andcc %o2, 1, %g0
+84:
+ MOVE_SHORTCHUNK(o1, o0, 0x0c, g2, g3)
+ MOVE_SHORTCHUNK(o1, o0, 0x0a, g2, g3)
+ MOVE_SHORTCHUNK(o1, o0, 0x08, g2, g3)
+ MOVE_SHORTCHUNK(o1, o0, 0x06, g2, g3)
+ MOVE_SHORTCHUNK(o1, o0, 0x04, g2, g3)
+ MOVE_SHORTCHUNK(o1, o0, 0x02, g2, g3)
+ MOVE_SHORTCHUNK(o1, o0, 0x00, g2, g3)
+short_table_end:
+ EXT(84b, short_table_end, 55f,#)
+ be 1f
+ nop
+ EX(lduba [%o1] %asi, %g2, add %g0, 1,#)
+ stb %g2, [%o0]
+1:
+ retl
+ clr %o0
+
+short_aligned_end:
+ bne short_end
+ andcc %o2, 8, %g0
+
+ be 1f
+ andcc %o2, 4, %g0
+
+ EXO2(lda [%o1 + 0x00] %asi, %g2,#)
+ EX(lda [%o1 + 0x04] %asi, %g3, sub %o2, 4,#)
+ add %o1, 8, %o1
+ st %g2, [%o0 + 0x00]
+ st %g3, [%o0 + 0x04]
+ add %o0, 8, %o0
+1:
+ ba,pt %xcc, copy_user_last7
+ mov %o2, %g1
+
+ .section .fixup,#alloc,#execinstr
+ .align 4
+97:
+ retl
+ mov %o2, %o0
+/* exception routine sets %g2 to (broken_insn - first_insn)>>2 */
+50:
+/* This magic counts how many bytes are left when a crash in MOVE_BIGCHUNK
+ * happens. This is derived from the amount ldd reads, st stores, etc.
+ * x = g2 % 12;
+ * o0 = g1 + g7 - ((g2 / 12) * 32 + (x < 4) ? x * 8 : (x - 4) * 4)
+ */
+ cmp %g2, 12
+ bcs 1f
+ cmp %g2, 24
+ bcs 2f
+ cmp %g2, 36
+ bcs 3f
+ nop
+ sub %g2, 12, %g2
+ sub %g7, 32, %g7
+3:
+ sub %g2, 12, %g2
+ sub %g7, 32, %g7
+2:
+ sub %g2, 12, %g2
+ sub %g7, 32, %g7
+1:
+ cmp %g2, 4
+ bcs,a 1f
+ sll %g2, 3, %g2
+ sub %g2, 4, %g2
+ sll %g2, 2, %g2
+1:
+ and %g1, 0x7f, %o0
+ add %o0, %g7, %o0
+ retl
+ sub %o0, %g2, %o0
+51:
+/* i = 41 - g2; j = i % 6;
+ * o0 = (g1 & 15) + (i / 6) * 16 + (j < 4) ? (j + 1) * 4 : (j - 3) * 8;
+ */
+ neg %g2
+ and %g1, 0xf, %g1
+ add %g2, 41, %g2
+1:
+ cmp %g2, 6
+ bcs,a 2f
+ cmp %g2, 4
+ add %g1, 16, %g1
+ b 1b
+ sub %g2, 6, %g2
+2:
+ bcs,a 3f
+ inc %g2
+ sub %g2, 3, %g2
+ b 2f
+ sll %g2, 3, %g2
+3:
+ sll %g2, 2, %g2
+2:
+ retl
+ add %g1, %g2, %o0
+52:
+/* o0 = g1 + g7 - (g2 / 8) * 32 + (x & 3) * 8 */
+ and %g2, 0xfffffffffffffff8, %g4
+ and %g2, 3, %g2
+ sll %g4, 2, %g4
+ sll %g2, 3, %g2
+ add %g2, %g4, %g2
+ b,a 1b
+53:
+/* o0 = o3 + (o2 & 15) - (g2 & 8) - (g2 & 3) * 2 */
+ and %g2, 3, %g4
+ and %g2, 0xfffffffffffffff8, %g2
+ sll %g4, 1, %g4
+ add %g2, %g4, %g2
+ and %o2, 0xf, %o0
+ add %o0, %o3, %o0
+ retl
+ sub %o0, %g2, %o0
+54:
+/* o0 = o3 + (o2 & 15) - (g2 / 4) * 2 - (g2 & 1) */
+ srl %g2, 2, %o4
+ and %g2, 1, %o1
+ sll %o4, 1, %o4
+ and %o2, 0xf, %o2
+ sub %o3, %o1, %o3
+ sub %o2, %o4, %o2
+ retl
+ add %o2, %o3, %o0
+55:
+/* o0 = (o2 & 1) + (27 - g2)/4 * 2 + ((27 - g2) & 1) */
+ neg %g2
+ and %o2, 1, %o2
+ add %g2, 27, %g2
+ srl %g2, 2, %o1
+ and %g2, 1, %g2
+ sll %o1, 1, %o1
+ add %o2, %g2, %o0
+ retl
+ add %o0, %o1, %o0
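
The EX/EX2/EXO2/EXT macros above attach __ex_table entries to every user-space load, so a fault lands in the .fixup arithmetic that reconstructs how much was left to copy. The contract itself, modeled in C (a sketch in which user_byte_readable() is a hypothetical stand-in for the MMU fault the real code catches via the exception table):

#include <stddef.h>

extern int user_byte_readable(const void *uaddr);       /* assumed helper */

/* Copy len bytes and return 0, or stop at the first faulting byte and
 * return the number of bytes NOT copied, which is the value the .fixup
 * code computes from the faulting instruction's position. */
static size_t copy_from_user_model(void *to, const void *from, size_t len)
{
        const unsigned char *src = from;
        unsigned char *dst = to;
        size_t i;

        for (i = 0; i < len; i++) {
                if (!user_byte_readable(src + i))
                        return len - i;
                dst[i] = src[i];
        }
        return 0;
}
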
diff --git a/arch/sparc64/lib/copy_to_user.S b/arch/sparc64/lib/copy_to_user.S
new file mode 100644
index 000000000..47a6bd337
--- /dev/null
+++ b/arch/sparc64/lib/copy_to_user.S
@@ -0,0 +1,456 @@
+/* copy_user.S: Sparc optimized copy_to_user code.
+ *
+ * Copyright(C) 1995 Linus Torvalds
+ * Copyright(C) 1996 David S. Miller
+ * Copyright(C) 1996 Eddie C. Dost
+ * Copyright(C) 1996,1997 Jakub Jelinek
+ *
+ * derived from:
+ * e-mail between David and Eddie.
+ *
+ * Returns 0 if successful, otherwise count of bytes not copied yet
+ *
+ * FIXME: This code should be optimized for sparc64... -jj
+ */
+
+#include <asm/ptrace.h>
+#include <asm/asi.h>
+
+#define EX(x,y,a,b,z) \
+98: x,y; \
+ .section .fixup,z##alloc,z##execinstr; \
+ .align 4; \
+99: retl; \
+ a, b, %o0; \
+ .section __ex_table,z##alloc; \
+ .align 4; \
+ .word 98b, 99b; \
+ .text; \
+ .align 4
+
+#define EX2(x,y,c,d,e,a,b,z) \
+98: x,y; \
+ .section .fixup,z##alloc,z##execinstr; \
+ .align 4; \
+99: c, d, e; \
+ retl; \
+ a, b, %o0; \
+ .section __ex_table,z##alloc; \
+ .align 4; \
+ .word 98b, 99b; \
+ .text; \
+ .align 4
+
+#define EXO2(x,y,z) \
+98: x,##y; \
+ .section __ex_table,z##alloc; \
+ .align 4; \
+ .word 98b, 97f; \
+ .text; \
+ .align 4
+
+#define EXT(start,end,handler,z) \
+ .section __ex_table,z##alloc; \
+ .align 4; \
+ .word start, 0, end, handler; \
+ .text; \
+ .align 4
+
+/* Please do not change the following macros unless you change the logic used
+ * in .fixup at the end of this file as well
+ */
+
+/* Both these macros have to start with exactly the same insn */
+#define MOVE_BIGCHUNK(src, dst, offset, t0, t1, t2, t3, t4, t5, t6, t7) \
+ ldd [%src + offset + 0x00], %t0; \
+ ldd [%src + offset + 0x08], %t2; \
+ ldd [%src + offset + 0x10], %t4; \
+ ldd [%src + offset + 0x18], %t6; \
+ sta %t0, [%dst + offset + 0x00] %asi; \
+ sta %t1, [%dst + offset + 0x04] %asi; \
+ sta %t2, [%dst + offset + 0x08] %asi; \
+ sta %t3, [%dst + offset + 0x0c] %asi; \
+ sta %t4, [%dst + offset + 0x10] %asi; \
+ sta %t5, [%dst + offset + 0x14] %asi; \
+ sta %t6, [%dst + offset + 0x18] %asi; \
+ sta %t7, [%dst + offset + 0x1c] %asi;
+
+#define MOVE_BIGALIGNCHUNK(src, dst, offset, t0, t1, t2, t3, t4, t5, t6, t7) \
+ ldd [%src + offset + 0x00], %t0; \
+ ldd [%src + offset + 0x08], %t2; \
+ ldd [%src + offset + 0x10], %t4; \
+ ldd [%src + offset + 0x18], %t6; \
+ stda %t0, [%dst + offset + 0x00] %asi; \
+ stda %t2, [%dst + offset + 0x08] %asi; \
+ stda %t4, [%dst + offset + 0x10] %asi; \
+ stda %t6, [%dst + offset + 0x18] %asi;
+
+#define MOVE_LASTCHUNK(src, dst, offset, t0, t1, t2, t3) \
+ ldd [%src - offset - 0x10], %t0; \
+ ldd [%src - offset - 0x08], %t2; \
+ sta %t0, [%dst - offset - 0x10] %asi; \
+ sta %t1, [%dst - offset - 0x0c] %asi; \
+ sta %t2, [%dst - offset - 0x08] %asi; \
+ sta %t3, [%dst - offset - 0x04] %asi;
+
+#define MOVE_HALFCHUNK(src, dst, offset, t0, t1, t2, t3) \
+ lduh [%src + offset + 0x00], %t0; \
+ lduh [%src + offset + 0x02], %t1; \
+ lduh [%src + offset + 0x04], %t2; \
+ lduh [%src + offset + 0x06], %t3; \
+ stha %t0, [%dst + offset + 0x00] %asi; \
+ stha %t1, [%dst + offset + 0x02] %asi; \
+ stha %t2, [%dst + offset + 0x04] %asi; \
+ stha %t3, [%dst + offset + 0x06] %asi;
+
+#define MOVE_SHORTCHUNK(src, dst, offset, t0, t1) \
+ ldub [%src - offset - 0x02], %t0; \
+ ldub [%src - offset - 0x01], %t1; \
+ stba %t0, [%dst - offset - 0x02] %asi; \
+ stba %t1, [%dst - offset - 0x01] %asi;
+
+ .text
+ .align 4
+
+ .globl __copy_to_user
+dword_align:
+ andcc %o1, 1, %g0
+ be 4f
+ andcc %o1, 2, %g0
+
+ ldub [%o1], %g2
+ add %o1, 1, %o1
+ EXO2(stba %g2, [%o0] %asi,#)
+ sub %o2, 1, %o2
+ bne 3f
+ add %o0, 1, %o0
+
+ lduh [%o1], %g2
+ add %o1, 2, %o1
+ EXO2(stha %g2, [%o0] %asi,#)
+ sub %o2, 2, %o2
+ ba,pt %xcc, 3f
+ add %o0, 2, %o0
+4:
+ lduh [%o1], %g2
+ add %o1, 2, %o1
+ EXO2(stha %g2, [%o0] %asi,#)
+ sub %o2, 2, %o2
+ ba,pt %xcc, 3f
+ add %o0, 2, %o0
+
+__copy_to_user: /* %o0=dst %o1=src %o2=len */
+ wr %g0, ASI_S, %asi
+ xor %o0, %o1, %o4
+1:
+ andcc %o4, 3, %o5
+2:
+ bne,pn %icc, cannot_optimize
+ cmp %o2, 15
+
+ bleu,pn %xcc, short_aligned_end
+ andcc %o1, 3, %g0
+
+ bne,pn %icc, dword_align
+3:
+ andcc %o1, 4, %g0
+
+ be,pt %icc, 2f
+ mov %o2, %g1
+
+ ld [%o1], %o4
+ sub %g1, 4, %g1
+ EXO2(sta %o4, [%o0] %asi,#)
+ add %o1, 4, %o1
+ add %o0, 4, %o0
+2:
+ andcc %g1, 0xffffffffffffff80, %g7
+ be,pn %xcc, 3f
+ andcc %o0, 4, %g0
+
+ be,pn %icc, ldd_std + 4
+5:
+ MOVE_BIGCHUNK(o1, o0, 0x00, o2, o3, o4, o5, g2, g3, g4, g5)
+ MOVE_BIGCHUNK(o1, o0, 0x20, o2, o3, o4, o5, g2, g3, g4, g5)
+ MOVE_BIGCHUNK(o1, o0, 0x40, o2, o3, o4, o5, g2, g3, g4, g5)
+ MOVE_BIGCHUNK(o1, o0, 0x60, o2, o3, o4, o5, g2, g3, g4, g5)
+80:
+ EXT(5b, 80b, 50f,#)
+ subcc %g7, 128, %g7
+ add %o1, 128, %o1
+ bne,pt %xcc, 5b
+ add %o0, 128, %o0
+3:
+ andcc %g1, 0x70, %g7
+ be,pn %icc, copy_user_table_end
+ andcc %g1, 8, %g0
+100:
+ rd %pc, %o5
+ srl %g7, 1, %o4
+ add %g7, %o4, %o4
+ add %o1, %g7, %o1
+ sub %o5, %o4, %o5
+ jmpl %o5 + (copy_user_table_end - 100b), %g0
+ add %o0, %g7, %o0
+
+copy_user_table:
+ MOVE_LASTCHUNK(o1, o0, 0x60, g2, g3, g4, g5)
+ MOVE_LASTCHUNK(o1, o0, 0x50, g2, g3, g4, g5)
+ MOVE_LASTCHUNK(o1, o0, 0x40, g2, g3, g4, g5)
+ MOVE_LASTCHUNK(o1, o0, 0x30, g2, g3, g4, g5)
+ MOVE_LASTCHUNK(o1, o0, 0x20, g2, g3, g4, g5)
+ MOVE_LASTCHUNK(o1, o0, 0x10, g2, g3, g4, g5)
+ MOVE_LASTCHUNK(o1, o0, 0x00, g2, g3, g4, g5)
+copy_user_table_end:
+ EXT(copy_user_table, copy_user_table_end, 51f,#)
+ be,pt %icc, copy_user_last7
+ andcc %g1, 4, %g0
+
+ ldd [%o1], %g2
+ add %o0, 8, %o0
+ add %o1, 8, %o1
+ EX(sta %g2, [%o0 - 0x08] %asi, and %g1, 0xf,#)
+ EX2(sta %g3, [%o0 - 0x04] %asi, and %g1, 0xf, %g1, sub %g1, 4,#)
+copy_user_last7:
+ be,pn %icc, 1f
+ andcc %g1, 2, %g0
+
+ ld [%o1], %g2
+ add %o1, 4, %o1
+ EX(sta %g2, [%o0] %asi, and %g1, 7,#)
+ add %o0, 4, %o0
+1:
+ be,pn %icc, 1f
+ andcc %g1, 1, %g0
+
+ lduh [%o1], %g2
+ add %o1, 2, %o1
+ EX(stha %g2, [%o0] %asi, and %g1, 3,#)
+ add %o0, 2, %o0
+1:
+ be,pn %icc, 1f
+ nop
+
+ ldub [%o1], %g2
+ EX(stba %g2, [%o0] %asi, add %g0, 1,#)
+1:
+ retl
+ clr %o0
+
+ldd_std:
+ MOVE_BIGALIGNCHUNK(o1, o0, 0x00, o2, o3, o4, o5, g2, g3, g4, g5)
+ MOVE_BIGALIGNCHUNK(o1, o0, 0x20, o2, o3, o4, o5, g2, g3, g4, g5)
+ MOVE_BIGALIGNCHUNK(o1, o0, 0x40, o2, o3, o4, o5, g2, g3, g4, g5)
+ MOVE_BIGALIGNCHUNK(o1, o0, 0x60, o2, o3, o4, o5, g2, g3, g4, g5)
+81:
+ EXT(ldd_std, 81b, 52f,#)
+ subcc %g7, 128, %g7
+ add %o1, 128, %o1
+ bne,pt %xcc, ldd_std
+ add %o0, 128, %o0
+
+ andcc %g1, 0x70, %g7
+ be,pn %icc, copy_user_table_end
+ andcc %g1, 8, %g0
+101:
+ rd %pc, %o5
+ srl %g7, 1, %o4
+ add %g7, %o4, %o4
+ add %o1, %g7, %o1
+ sub %o5, %o4, %o5
+ jmpl %o5 + (copy_user_table_end - 101b), %g0
+ add %o0, %g7, %o0
+
+cannot_optimize:
+ bleu short_end
+ cmp %o5, 2
+
+ bne byte_chunk
+ and %o2, 0xfffffffffffffff0, %o3
+
+ andcc %o1, 1, %g0
+ be 10f
+ nop
+
+ ldub [%o1], %g2
+ add %o1, 1, %o1
+ EXO2(stba %g2, [%o0] %asi,#)
+ sub %o2, 1, %o2
+ andcc %o2, 0xfffffffffffffff0, %o3
+ be short_end
+ add %o0, 1, %o0
+10:
+ MOVE_HALFCHUNK(o1, o0, 0x00, g2, g3, g4, g5)
+ MOVE_HALFCHUNK(o1, o0, 0x08, g2, g3, g4, g5)
+82:
+ EXT(10b, 82b, 53f,#)
+ subcc %o3, 0x10, %o3
+ add %o1, 0x10, %o1
+ bne 10b
+ add %o0, 0x10, %o0
+ ba,pt %xcc, 2f
+ and %o2, 0xe, %o3
+
+byte_chunk:
+ MOVE_SHORTCHUNK(o1, o0, -0x02, g2, g3)
+ MOVE_SHORTCHUNK(o1, o0, -0x04, g2, g3)
+ MOVE_SHORTCHUNK(o1, o0, -0x06, g2, g3)
+ MOVE_SHORTCHUNK(o1, o0, -0x08, g2, g3)
+ MOVE_SHORTCHUNK(o1, o0, -0x0a, g2, g3)
+ MOVE_SHORTCHUNK(o1, o0, -0x0c, g2, g3)
+ MOVE_SHORTCHUNK(o1, o0, -0x0e, g2, g3)
+ MOVE_SHORTCHUNK(o1, o0, -0x10, g2, g3)
+83:
+ EXT(byte_chunk, 83b, 54f,#)
+ subcc %o3, 0x10, %o3
+ add %o1, 0x10, %o1
+ bne,pt %xcc, byte_chunk
+ add %o0, 0x10, %o0
+
+short_end:
+ and %o2, 0xe, %o3
+2:
+ rd %pc, %o5
+ sll %o3, 3, %o4
+ add %o0, %o3, %o0
+ sub %o5, %o4, %o5
+ add %o1, %o3, %o1
+ jmpl %o5 + (short_table_end - 2b), %g0
+ andcc %o2, 1, %g0
+84:
+ MOVE_SHORTCHUNK(o1, o0, 0x0c, g2, g3)
+ MOVE_SHORTCHUNK(o1, o0, 0x0a, g2, g3)
+ MOVE_SHORTCHUNK(o1, o0, 0x08, g2, g3)
+ MOVE_SHORTCHUNK(o1, o0, 0x06, g2, g3)
+ MOVE_SHORTCHUNK(o1, o0, 0x04, g2, g3)
+ MOVE_SHORTCHUNK(o1, o0, 0x02, g2, g3)
+ MOVE_SHORTCHUNK(o1, o0, 0x00, g2, g3)
+short_table_end:
+ EXT(84b, short_table_end, 55f,#)
+ be 1f
+ nop
+ ldub [%o1], %g2
+ EX(stba %g2, [%o0] %asi, add %g0, 1,#)
+1:
+ retl
+ clr %o0
+
+short_aligned_end:
+ bne short_end
+ andcc %o2, 8, %g0
+
+ be 1f
+ andcc %o2, 4, %g0
+
+ ld [%o1 + 0x00], %g2
+ ld [%o1 + 0x04], %g3
+ add %o1, 8, %o1
+ EXO2(sta %g2, [%o0 + 0x00] %asi,#)
+ EX(sta %g3, [%o0 + 0x04] %asi, sub %o2, 4,#)
+ add %o0, 8, %o0
+1:
+ ba,pt %xcc, copy_user_last7
+ mov %o2, %g1
+
+ .section .fixup,#alloc,#execinstr
+ .align 4
+97:
+ retl
+ mov %o2, %o0
+/* exception routine sets %g2 to (broken_insn - first_insn)>>2 */
+50:
+/* This magic counts how many bytes are left when a crash in MOVE_BIGCHUNK
+ * happens. This is derived from the amount ldd reads, st stores, etc.
+ * x = g2 % 12;
+ * o0 = g1 + g7 - ((g2 / 12) * 32 + (x < 4) ? x * 8 : (x - 4) * 4)
+ */
+ cmp %g2, 12
+ bcs 1f
+ cmp %g2, 24
+ bcs 2f
+ cmp %g2, 36
+ bcs 3f
+ nop
+ sub %g2, 12, %g2
+ sub %g7, 32, %g7
+3:
+ sub %g2, 12, %g2
+ sub %g7, 32, %g7
+2:
+ sub %g2, 12, %g2
+ sub %g7, 32, %g7
+1:
+ cmp %g2, 4
+ bcs,a 1f
+ sll %g2, 3, %g2
+ sub %g2, 4, %g2
+ sll %g2, 2, %g2
+1:
+ and %g1, 0x7f, %o0
+ add %o0, %g7, %o0
+ retl
+ sub %o0, %g2, %o0
+51:
+/* i = 41 - g2; j = i % 6;
+ * o0 = (g1 & 15) + (i / 6) * 16 + (j < 4) ? (j + 1) * 4 : (j - 3) * 8;
+ */
+ neg %g2
+ and %g1, 0xf, %g1
+ add %g2, 41, %g2
+1:
+ cmp %g2, 6
+ bcs,a 2f
+ cmp %g2, 4
+ add %g1, 16, %g1
+ b 1b
+ sub %g2, 6, %g2
+2:
+ bcs,a 3f
+ inc %g2
+ sub %g2, 3, %g2
+ b 2f
+ sll %g2, 3, %g2
+3:
+ sll %g2, 2, %g2
+2:
+ retl
+ add %g1, %g2, %o0
+52:
+/* o0 = g1 + g7 - (g2 / 8) * 32 + (x & 3) * 8 */
+ and %g2, 0xfffffffffffffff8, %g4
+ and %g2, 3, %g2
+ sll %g4, 2, %g4
+ sll %g2, 3, %g2
+ add %g2, %g4, %g2
+ b,a 1b
+53:
+/* o0 = o3 + (o2 & 15) - (g2 & 8) - (g2 & 3) * 2 */
+ and %g2, 3, %g4
+ and %g2, 0xfffffffffffffff8, %g2
+ sll %g4, 1, %g4
+ add %g2, %g4, %g2
+ and %o2, 0xf, %o0
+ add %o0, %o3, %o0
+ retl
+ sub %o0, %g2, %o0
+54:
+/* o0 = o3 + (o2 & 15) - (g2 / 4) * 2 - (g2 & 1) */
+ srl %g2, 2, %o4
+ and %g2, 1, %o1
+ sll %o4, 1, %o4
+ and %o2, 0xf, %o2
+ sub %o3, %o1, %o3
+ sub %o2, %o4, %o2
+ retl
+ add %o2, %o3, %o0
+55:
+/* o0 = (o2 & 1) + (27 - g2)/4 * 2 + ((27 - g2) & 1) */
+ neg %g2
+ and %o2, 1, %o2
+ add %g2, 27, %g2
+ srl %g2, 2, %o1
+ and %g2, 1, %g2
+ sll %o1, 1, %o1
+ add %o2, %g2, %o0
+ retl
+ add %o0, %o1, %o0
diff --git a/arch/sparc64/lib/locks.S b/arch/sparc64/lib/locks.S
new file mode 100644
index 000000000..a1154cb6d
--- /dev/null
+++ b/arch/sparc64/lib/locks.S
@@ -0,0 +1,77 @@
+/* $Id: locks.S,v 1.2 1997/03/10 12:28:02 jj Exp $
+ * locks.S: SMP low-level lock primitives on Sparc64.
+ *
+ * Copyright (C) 1996 David S. Miller (davem@caip.rutgers.edu)
+ */
+
+#include <asm/ptrace.h>
+
+ .text
+ .align 4
+
+ .globl __spinlock_waitfor
+__spinlock_waitfor:
+1: orcc %g2, 0x0, %g0
+ bne 1b
+ ldub [%g1], %g2
+ ldstub [%g1], %g2
+ jmpl %o7 - 12, %g0
+ mov %g5, %o7
+
+ .globl ___become_idt
+___become_idt:
+#if 0 /* Don't know how to do this on the Ultra yet... */
+#endif
+ jmpl %o7 + 8, %g0
+ mov %g5, %o7
+
+___lk_busy_spin:
+ orcc %g2, 0, %g0
+ bne ___lk_busy_spin
+ ldub [%g1 + 0], %g2
+ b 1f
+ ldstub [%g1 + 0], %g2
+
+ .globl ___lock_kernel
+___lock_kernel:
+ addcc %g2, -1, %g2
+ rdpr %pil, %g3
+ bcs,a 9f
+ st %g2, [%g6 + AOFF_task_lock_depth]
+ wrpr 15, %pil
+ ldstub [%g1 + 0], %g2
+1: orcc %g2, 0, %g0
+ bne,a ___lk_busy_spin
+ ldub [%g1 + 0], %g2
+ ldub [%g1 + 2], %g2
+ cmp %g2, %g5
+ be 2f
+ stb %g5, [%g1 + 1]
+ stb %g5, [%g1 + 2]
+#ifdef __SMP__
+ /* XXX Figure out how to become interrupt receiver in SMP system. */
+#endif
+2: mov -1, %g2
+ st %g2, [%g6 + AOFF_task_lock_depth]
+ wrpr %g3, %pil
+9: jmpl %o7 + 0x8, %g0
+ mov %g5, %o7
+
+#undef NO_PROC_ID
+#define NO_PROC_ID 0xff
+
+ .globl ___unlock_kernel
+___unlock_kernel:
+ addcc %g2, 1, %g2
+ rdpr %pil, %g3
+ bne,a 1f
+ st %g2, [%g6 + AOFF_task_lock_depth]
+ wrpr 15, %pil
+ mov NO_PROC_ID, %g2
+ stb %g2, [%g1 + 1]
+ stb %g0, [%g1 + 0]
+ st %g0, [%g6 + AOFF_task_lock_depth]
+ wrpr %g3, %pil
+1: jmpl %o7 + 0x8, %g0
+ mov %g5, %o7
+
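
__spinlock_waitfor and ___lk_busy_spin are the usual test-and-test-and-set idiom: spin on plain loads of the lock byte until it reads zero, then retry the atomic ldstub, which sets the byte to 0xff and returns its previous value. Roughly, in C (a sketch using a compiler builtin where the assembly uses the ldstub instruction):

static void spin_lock_byte(volatile unsigned char *lock)
{
        /* ldstub equivalent: atomically set to 0xff, get the old value. */
        while (__atomic_exchange_n(lock, 0xff, __ATOMIC_ACQUIRE) != 0) {
                while (*lock != 0)
                        ;       /* spin on reads only until it looks free */
        }
}
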
diff --git a/arch/sparc64/lib/memcmp.S b/arch/sparc64/lib/memcmp.S
new file mode 100644
index 000000000..4c08d57c3
--- /dev/null
+++ b/arch/sparc64/lib/memcmp.S
@@ -0,0 +1,29 @@
+/* $Id: memcmp.S,v 1.2 1997/04/01 03:43:18 davem Exp $
+ * Sparc64 optimized memcmp code.
+ *
+ * Copyright (C) 1997 Jakub Jelinek (jj@sunsite.mff.cuni.cz)
+ */
+
+ .text
+ .align 4
+ .globl __memcmp, memcmp
+__memcmp:
+memcmp:
+ brlez,pn %o2, 2f
+ sub %g0, %o2, %o3
+ add %o0, %o2, %o0
+ add %o1, %o2, %o1
+ ldub [%o0 + %o3], %o4
+1:
+ ldub [%o1 + %o3], %o5
+ sub %o4, %o5, %o4
+ brnz,pn %o4, 3f
+ addcc %o3, 1, %o3
+ bne,a,pt %xcc, 1b
+ ldub [%o0 + %o3], %o4
+2:
+ retl
+ clr %o0
+3:
+ retl
+ mov %o4, %o0
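
The memcmp above advances both pointers past the end of the buffers and walks one negative index up to zero, so the loop counter doubles as the termination test; the return value is the raw difference of the first mismatching bytes. The same structure in C (a sketch):

static int memcmp_ref(const void *s1, const void *s2, long n)
{
        const unsigned char *a = (const unsigned char *)s1 + n;
        const unsigned char *b = (const unsigned char *)s2 + n;
        long i;

        for (i = -n; i < 0; i++) {              /* no-op when n <= 0 */
                int d = a[i] - b[i];
                if (d != 0)
                        return d;
        }
        return 0;
}
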
diff --git a/arch/sparc64/lib/memcpy.S b/arch/sparc64/lib/memcpy.S
new file mode 100644
index 000000000..e9462345a
--- /dev/null
+++ b/arch/sparc64/lib/memcpy.S
@@ -0,0 +1,526 @@
+/* memcpy.S: Sparc optimized memcpy, bcopy and memmove code
+ * Hand optimized from GNU libc's memcpy, bcopy and memmove
+ * for UltraSparc
+ * Copyright (C) 1991,1996 Free Software Foundation
+ * Copyright (C) 1995 Linus Torvalds (Linus.Torvalds@helsinki.fi)
+ * Copyright (C) 1996 David S. Miller (davem@caip.rutgers.edu)
+ * Copyright (C) 1996 Eddie C. Dost (ecd@skynet.be)
+ * Copyright (C) 1996,1997 Jakub Jelinek (jj@sunsite.mff.cuni.cz)
+ */
+
+#include <asm/asi.h>
+#include <asm/head.h>
+
+#ifdef __KERNEL__
+
+#define FUNC(x) \
+ .globl x; \
+ .type x,@function; \
+ .align 4; \
+x:
+
+#define FASTER_ALIGNED
+
+/* In the kernel these functions don't return a value.
+ * One should use macros in asm/string.h for that purpose.
+ * We return 0, so that bugs are more apparent.
+ */
+#define SETUP_RETL
+#define PRE_RETL sethi %uhi(KERNBASE), %g4; clr %o0
+#define RETL_INSN sllx %g4, 32, %g4
+
+#else
+
+/* libc */
+
+#define FASTER_ALIGNED
+
+#ifdef DEBUG
+#define FUNC(x) \
+ .globl jj##x##1; \
+ .type jj##x##1,@function; \
+ .align 4; \
+jj##x##1:
+#else
+#include "DEFS.h"
+#endif
+
+#define SETUP_RETL mov %o0, %g6
+#define PRE_RETL
+#define RETL_INSN mov %g6, %o0
+
+#endif
+
+#define MOVE_BIGCHUNK(src, dst, offset, t0, t1, t2, t3, t4, t5, t6, t7) \
+ ldd [%src + offset + 0x00], %t0; \
+ ldd [%src + offset + 0x08], %t2; \
+ ldd [%src + offset + 0x10], %t4; \
+ ldd [%src + offset + 0x18], %t6; \
+ stw %t0, [%dst + offset + 0x00]; \
+ stw %t1, [%dst + offset + 0x04]; \
+ stw %t2, [%dst + offset + 0x08]; \
+ stw %t3, [%dst + offset + 0x0c]; \
+ stw %t4, [%dst + offset + 0x10]; \
+ stw %t5, [%dst + offset + 0x14]; \
+ stw %t6, [%dst + offset + 0x18]; \
+ stw %t7, [%dst + offset + 0x1c];
+
+#define MOVE_BIGALIGNCHUNK(src, dst, offset, t0, t1, t2, t3, t4, t5, t6, t7) \
+ ldx [%src + offset + 0x00], %t0; \
+ ldx [%src + offset + 0x08], %t1; \
+ ldx [%src + offset + 0x10], %t2; \
+ ldx [%src + offset + 0x18], %t3; \
+ ldx [%src + offset + 0x20], %t4; \
+ ldx [%src + offset + 0x28], %t5; \
+ ldx [%src + offset + 0x30], %t6; \
+ ldx [%src + offset + 0x38], %t7; \
+ stx %t0, [%dst + offset + 0x00]; \
+ stx %t1, [%dst + offset + 0x08]; \
+ stx %t2, [%dst + offset + 0x10]; \
+ stx %t3, [%dst + offset + 0x18]; \
+ stx %t4, [%dst + offset + 0x20]; \
+ stx %t5, [%dst + offset + 0x28]; \
+ stx %t6, [%dst + offset + 0x30]; \
+ stx %t7, [%dst + offset + 0x38];
+
+#define MOVE_LASTCHUNK(src, dst, offset, t0, t1, t2, t3) \
+ ldd [%src - offset - 0x10], %t0; \
+ ldd [%src - offset - 0x08], %t2; \
+ stw %t0, [%dst - offset - 0x10]; \
+ stw %t1, [%dst - offset - 0x0c]; \
+ stw %t2, [%dst - offset - 0x08]; \
+ stw %t3, [%dst - offset - 0x04];
+
+#define MOVE_LASTALIGNCHUNK(src, dst, offset, t0, t1) \
+ ldx [%src - offset - 0x10], %t0; \
+ ldx [%src - offset - 0x08], %t1; \
+ stx %t0, [%dst - offset - 0x10]; \
+ stx %t1, [%dst - offset - 0x08];
+
+#define MOVE_SHORTCHUNK(src, dst, offset, t0, t1) \
+ ldub [%src - offset - 0x02], %t0; \
+ ldub [%src - offset - 0x01], %t1; \
+ stb %t0, [%dst - offset - 0x02]; \
+ stb %t1, [%dst - offset - 0x01];
+
+ .text
+ .align 4
+
+FUNC(bcopy)
+
+ mov %o0, %o3
+ mov %o1, %o0
+ mov %o3, %o1
+ brgez,a,pt %o2, 1f
+ cmp %o0, %o1
+
+ retl
+	 nop		! Only bcopy returns here and it returns void...
+
+#ifdef __KERNEL__
+FUNC(amemmove)
+FUNC(__memmove)
+#endif
+FUNC(memmove)
+
+ cmp %o0, %o1
+1:
+ SETUP_RETL
+ bleu,pt %xcc, 9f
+ sub %o0, %o1, %o4
+
+ add %o1, %o2, %o3
+ cmp %o3, %o0
+ bleu,pt %xcc, 0f
+ andcc %o4, 3, %o5
+
+ add %o1, %o2, %o1
+ add %o0, %o2, %o0
+ sub %o1, 1, %o1
+ sub %o0, 1, %o0
+
+1:
+ ldub [%o1], %o4
+ subcc %o2, 1, %o2
+ sub %o1, 1, %o1
+ stb %o4, [%o0]
+ bne,pt %icc, 1b
+ sub %o0, 1, %o0
+
+ PRE_RETL
+ retl
+ RETL_INSN
+
+#ifdef __KERNEL__
+FUNC(__memcpy)
+#endif
+FUNC(memcpy) /* %o0=dst %o1=src %o2=len */
+
+ sub %o0, %o1, %o4
+ SETUP_RETL
+9:
+ andcc %o4, 3, %o5
+0:
+ bne,pn %icc, 86f
+ cmp %o2, 15
+
+ bleu,pn %xcc, 90f
+ andcc %o1, 3, %g0
+
+ be,a,pt %icc, 3f ! check if we need to align
+ andcc %o1, 4, %g0
+
+ andcc %o1, 1, %g0
+ be,pn %icc, 4f
+ andcc %o1, 2, %g0
+
+ ldub [%o1], %g2
+ add %o1, 1, %o1
+ sub %o2, 1, %o2
+ stb %g2, [%o0]
+ bne,pn %icc, 5f
+ add %o0, 1, %o0
+4:
+ lduh [%o1], %g2
+ add %o1, 2, %o1
+ sub %o2, 2, %o2
+ sth %g2, [%o0]
+ add %o0, 2, %o0
+5:
+ andcc %o1, 4, %g0
+3:
+ be,pn %icc, 2f
+ mov %o2, %g1
+
+ lduw [%o1], %o4
+ sub %g1, 4, %g1
+ stw %o4, [%o0]
+ add %o1, 4, %o1
+ add %o0, 4, %o0
+2:
+ andcc %g1, -128, %g7
+ be,pn %xcc, 3f
+ andcc %o0, 4, %g0
+
+ be,a,pn %icc, 82f + 4
+ ldx [%o1], %o2
+5:
+ MOVE_BIGCHUNK(o1, o0, 0x00, o2, o3, o4, o5, g2, g3, g4, g5)
+ MOVE_BIGCHUNK(o1, o0, 0x20, o2, o3, o4, o5, g2, g3, g4, g5)
+ MOVE_BIGCHUNK(o1, o0, 0x40, o2, o3, o4, o5, g2, g3, g4, g5)
+ MOVE_BIGCHUNK(o1, o0, 0x60, o2, o3, o4, o5, g2, g3, g4, g5)
+ subcc %g7, 128, %g7
+ add %o1, 128, %o1
+ bne,pt %xcc, 5b
+ add %o0, 128, %o0
+3:
+ andcc %g1, 0x70, %g7
+ be,pn %icc, 80f
+ andcc %g1, 8, %g0
+79:
+ rd %pc, %o5
+ srl %g7, 1, %o4
+ add %g7, %o4, %o4
+ add %o1, %g7, %o1
+ sub %o5, %o4, %o5
+ jmpl %o5 + %lo(80f-79b), %g0
+ add %o0, %g7, %o0
+
+ MOVE_LASTCHUNK(o1, o0, 0x60, g2, g3, g4, g5)
+ MOVE_LASTCHUNK(o1, o0, 0x50, g2, g3, g4, g5)
+ MOVE_LASTCHUNK(o1, o0, 0x40, g2, g3, g4, g5)
+ MOVE_LASTCHUNK(o1, o0, 0x30, g2, g3, g4, g5)
+ MOVE_LASTCHUNK(o1, o0, 0x20, g2, g3, g4, g5)
+ MOVE_LASTCHUNK(o1, o0, 0x10, g2, g3, g4, g5)
+ MOVE_LASTCHUNK(o1, o0, 0x00, g2, g3, g4, g5)
+
+80: /* memcpy_table_end */
+ be,pt %icc, 81f
+ andcc %g1, 4, %g0
+
+ ldd [%o1], %g2
+ add %o0, 8, %o0
+ stw %g2, [%o0 - 0x08]
+ add %o1, 8, %o1
+ stw %g3, [%o0 - 0x04]
+
+81: /* memcpy_last7 */
+
+ be,pt %icc, 1f
+ andcc %g1, 2, %g0
+
+ lduw [%o1], %g2
+ add %o1, 4, %o1
+ stw %g2, [%o0]
+ add %o0, 4, %o0
+1:
+ be,pt %icc, 1f
+ andcc %g1, 1, %g0
+
+ lduh [%o1], %g2
+ add %o1, 2, %o1
+ sth %g2, [%o0]
+ add %o0, 2, %o0
+1:
+ be,pt %icc, 1f
+ nop
+
+ ldub [%o1], %g2
+ stb %g2, [%o0]
+1:
+ PRE_RETL
+ retl
+ RETL_INSN
+
+82: /* ldx_stx */
+ MOVE_BIGALIGNCHUNK(o1, o0, 0x00, o2, o3, o4, o5, g2, g3, g4, g5)
+ MOVE_BIGALIGNCHUNK(o1, o0, 0x40, o2, o3, o4, o5, g2, g3, g4, g5)
+ subcc %g7, 128, %g7
+ add %o1, 128, %o1
+ bne,pt %xcc, 82b
+ add %o0, 128, %o0
+
+#ifndef FASTER_ALIGNED
+
+ andcc %g1, 0x70, %g7
+ be,pn %icc, 80b
+ andcc %g1, 8, %g0
+83:
+ rd %pc, %o5
+ srl %g7, 1, %o4
+ add %g7, %o4, %o4
+ add %o1, %g7, %o1
+ sub %o5, %o4, %o5
+ jmpl %o5 + %lo(80b - 83b), %g0
+ add %o0, %g7, %o0
+
+#else /* FASTER_ALIGNED */
+
+ andcc %g1, 0x70, %g7
+ be,pn %icc, 84f
+ andcc %g1, 8, %g0
+83:
+ rd %pc, %o5
+ add %o1, %g7, %o1
+ sub %o5, %g7, %o5
+ jmpl %o5 + %lo(84f - 83b), %g0
+ add %o0, %g7, %o0
+
+ MOVE_LASTALIGNCHUNK(o1, o0, 0x60, g2, g3)
+ MOVE_LASTALIGNCHUNK(o1, o0, 0x50, g2, g3)
+ MOVE_LASTALIGNCHUNK(o1, o0, 0x40, g2, g3)
+ MOVE_LASTALIGNCHUNK(o1, o0, 0x30, g2, g3)
+ MOVE_LASTALIGNCHUNK(o1, o0, 0x20, g2, g3)
+ MOVE_LASTALIGNCHUNK(o1, o0, 0x10, g2, g3)
+ MOVE_LASTALIGNCHUNK(o1, o0, 0x00, g2, g3)
+
+84: /* amemcpy_table_end */
+ be,pt %icc, 85f
+ andcc %g1, 4, %g0
+
+ ldx [%o1], %g2
+ add %o1, 8, %o1
+ stx %g2, [%o0]
+ add %o0, 8, %o0
+85: /* amemcpy_last7 */
+ be,pt %icc, 1f
+ andcc %g1, 2, %g0
+
+ lduw [%o1], %g2
+ add %o1, 4, %o1
+ stw %g2, [%o0]
+ add %o0, 4, %o0
+1:
+ be,pt %icc, 1f
+ andcc %g1, 1, %g0
+
+ lduh [%o1], %g2
+ add %o1, 2, %o1
+ sth %g2, [%o0]
+ add %o0, 2, %o0
+1:
+ be,pt %icc, 1f
+ nop
+
+ ldub [%o1], %g2
+ stb %g2, [%o0]
+1:
+ PRE_RETL
+ retl
+ RETL_INSN
+
+#endif /* FASTER_ALIGNED */
+
+86: /* non_aligned */
+ cmp %o2, 15
+ bleu,pn %xcc, 88f
+
+ andcc %o0, 3, %g0
+ be,pn %icc, 61f
+ andcc %o0, 1, %g0
+ be,pn %icc, 60f
+ andcc %o0, 2, %g0
+
+ ldub [%o1], %g5
+ add %o1, 1, %o1
+ stb %g5, [%o0]
+ sub %o2, 1, %o2
+ bne,pn %icc, 61f
+ add %o0, 1, %o0
+60:
+ ldub [%o1], %g3
+ add %o1, 2, %o1
+ stb %g3, [%o0]
+ sub %o2, 2, %o2
+ ldub [%o1 - 1], %g3
+ add %o0, 2, %o0
+ stb %g3, [%o0 - 1]
+61:
+ and %o1, 3, %g2
+ and %o2, 0xc, %g3
+ and %o1, -4, %o1
+ cmp %g3, 4
+ sll %g2, 3, %g4
+ mov 32, %g2
+ be,pn %icc, 4f
+ sub %g2, %g4, %g7
+
+ blu,pn %icc, 3f
+ cmp %g3, 0x8
+
+ be,pn %icc, 2f
+ srl %o2, 2, %g3
+
+ lduw [%o1], %o3
+ add %o0, -8, %o0
+ lduw [%o1 + 4], %o4
+ ba,pt %xcc, 8f
+ add %g3, 1, %g3
+2:
+ lduw [%o1], %o4
+ add %o0, -12, %o0
+ lduw [%o1 + 4], %o5
+ add %g3, 2, %g3
+ ba,pt %xcc, 9f
+ add %o1, -4, %o1
+3:
+ lduw [%o1], %g1
+ add %o0, -4, %o0
+ lduw [%o1 + 4], %o3
+ srl %o2, 2, %g3
+ ba,pt %xcc, 7f
+ add %o1, 4, %o1
+4:
+ lduw [%o1], %o5
+ cmp %o2, 7
+ lduw [%o1 + 4], %g1
+ srl %o2, 2, %g3
+ bleu,pn %xcc, 10f
+ add %o1, 8, %o1
+
+ lduw [%o1], %o3
+ add %g3, -1, %g3
+5:
+ sll %o5, %g4, %g2
+ srl %g1, %g7, %g5
+ or %g2, %g5, %g2
+ stw %g2, [%o0]
+7:
+ lduw [%o1 + 4], %o4
+ sll %g1, %g4, %g2
+ srl %o3, %g7, %g5
+ or %g2, %g5, %g2
+ stw %g2, [%o0 + 4]
+8:
+ lduw [%o1 + 8], %o5
+ sll %o3, %g4, %g2
+ srl %o4, %g7, %g5
+ or %g2, %g5, %g2
+ stw %g2, [%o0 + 8]
+9:
+ lduw [%o1 + 12], %g1
+ sll %o4, %g4, %g2
+ srl %o5, %g7, %g5
+ addcc %g3, -4, %g3
+ or %g2, %g5, %g2
+ add %o1, 16, %o1
+ stw %g2, [%o0 + 12]
+ add %o0, 16, %o0
+ bne,a,pt %xcc, 5b
+ lduw [%o1], %o3
+10:
+ sll %o5, %g4, %g2
+ srl %g1, %g7, %g5
+ srl %g7, 3, %g3
+ or %g2, %g5, %g2
+ sub %o1, %g3, %o1
+ andcc %o2, 2, %g0
+ stw %g2, [%o0]
+ be,pt %icc, 1f
+ andcc %o2, 1, %g0
+
+ ldub [%o1], %g2
+ add %o1, 2, %o1
+ stb %g2, [%o0 + 4]
+ add %o0, 2, %o0
+ ldub [%o1 - 1], %g2
+ stb %g2, [%o0 + 3]
+1:
+ be,pt %icc, 1f
+ nop
+
+ ldub [%o1], %g2
+ stb %g2, [%o0 + 4]
+1:
+ PRE_RETL
+ retl
+ RETL_INSN
+
+88: /* short_end */
+
+ and %o2, 0xe, %o3
+20:
+ rd %pc, %o5
+ sll %o3, 3, %o4
+ add %o0, %o3, %o0
+ sub %o5, %o4, %o5
+ add %o1, %o3, %o1
+ jmpl %o5 + %lo(89f - 20b), %g0
+ andcc %o2, 1, %g0
+
+ MOVE_SHORTCHUNK(o1, o0, 0x0c, g2, g3)
+ MOVE_SHORTCHUNK(o1, o0, 0x0a, g2, g3)
+ MOVE_SHORTCHUNK(o1, o0, 0x08, g2, g3)
+ MOVE_SHORTCHUNK(o1, o0, 0x06, g2, g3)
+ MOVE_SHORTCHUNK(o1, o0, 0x04, g2, g3)
+ MOVE_SHORTCHUNK(o1, o0, 0x02, g2, g3)
+ MOVE_SHORTCHUNK(o1, o0, 0x00, g2, g3)
+
+89: /* short_table_end */
+
+ be,pt %icc, 1f
+ nop
+
+ ldub [%o1], %g2
+ stb %g2, [%o0]
+1:
+ PRE_RETL
+ retl
+ RETL_INSN
+
+90: /* short_aligned_end */
+ bne,pn %xcc, 88b
+ andcc %o2, 8, %g0
+
+ be,pt %icc, 1f
+ andcc %o2, 4, %g0
+
+ lduw [%o1 + 0x00], %g2
+ lduw [%o1 + 0x04], %g3
+ add %o1, 8, %o1
+ stw %g2, [%o0 + 0x00]
+ stw %g3, [%o0 + 0x04]
+ add %o0, 8, %o0
+1:
+ ba,pt %xcc, 81b
+ mov %o2, %g1
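
The non_aligned path (label 86 onward) still copies a word at a time when source and destination disagree in sub-word alignment: every 32-bit store is assembled from two adjacent aligned source words with the sll/srl/or pairs, using the shift counts prepared in %g4 and %g7. The idea in C, as a big-endian sketch of just the steady-state loop, with k the source byte offset within a word (1..3):

#include <stdint.h>
#include <stddef.h>

static void copy_shift_merge(uint32_t *dst, const uint32_t *src_aligned,
                             unsigned k, size_t words)
{
        unsigned lsh = 8 * k;           /* the %g4 shift */
        unsigned rsh = 32 - lsh;        /* the %g7 shift */
        uint32_t hi = src_aligned[0];
        size_t i;

        /* Reads one aligned word beyond the last output, as the assembly
         * does: each output merges the tail of one source word with the
         * head of the next. */
        for (i = 0; i < words; i++) {
                uint32_t lo = src_aligned[i + 1];
                dst[i] = (hi << lsh) | (lo >> rsh);
                hi = lo;
        }
}
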
diff --git a/arch/sparc64/lib/memscan.S b/arch/sparc64/lib/memscan.S
new file mode 100644
index 000000000..83abe4040
--- /dev/null
+++ b/arch/sparc64/lib/memscan.S
@@ -0,0 +1,116 @@
+/* $Id: memscan.S,v 1.1 1997/03/14 21:04:24 jj Exp $
+ * memscan.S: Optimized memscan for the Sparc64.
+ *
+ * Copyright (C) 1997 Jakub Jelinek (jj@sunsite.mff.cuni.cz)
+ */
+
+/* In essence, this is just a fancy strlen. */
+
+#define LO_MAGIC 0x01010101
+#define HI_MAGIC 0x80808080
+
+ .text
+ .align 4
+ .globl __memscan_zero, __memscan_generic
+ .globl memscan
+__memscan_zero:
+ /* %o0 = addr, %o1 = size */
+ brlez,pn %o1, 0f
+ andcc %o0, 3, %g0
+ be,pt %icc, 9f
+ sethi %hi(HI_MAGIC), %o4
+ ldub [%o0], %o5
+ subcc %o1, 1, %o1
+ brz,pn %o5, 10f
+ add %o0, 1, %o0
+ be,pn %xcc, 0f
+ andcc %o0, 3, %g0
+ be,pn %icc, 4f
+ or %o4, %lo(HI_MAGIC), %o3
+ ldub [%o0], %o5
+ subcc %o1, 1, %o1
+ brz,pn %o5, 10f
+ add %o0, 1, %o0
+ be,pn %xcc, 0f
+ andcc %o0, 3, %g0
+ be,pt %icc, 5f
+ sethi %hi(LO_MAGIC), %o4
+ ldub [%o0], %o5
+ subcc %o1, 1, %o1
+ brz,pn %o5, 10f
+ add %o0, 1, %o0
+ be,pn %xcc, 0f
+ or %o4, %lo(LO_MAGIC), %o2
+ ba,pt %xcc, 2f
+ ld [%o0], %o5
+9:
+ or %o4, %lo(HI_MAGIC), %o3
+4:
+ sethi %hi(LO_MAGIC), %o4
+5:
+ or %o4, %lo(LO_MAGIC), %o2
+ ld [%o0], %o5
+2:
+ sub %o5, %o2, %o4
+ sub %o1, 4, %o1
+ andcc %o4, %o3, %g0
+ be,pn %icc, 1f
+ add %o0, 4, %o0
+ brgz,pt %o1, 2b
+ ld [%o0], %o5
+
+ retl
+ add %o0, %o1, %o0
+1:
+ /* Check every byte. */
+ srl %o5, 24, %g5
+ andcc %g5, 0xff, %g0
+ be,pn %icc, 1f
+ add %o0, -4, %o4
+ srl %o5, 16, %g5
+ andcc %g5, 0xff, %g0
+ be,pn %icc, 1f
+ add %o4, 1, %o4
+ srl %o5, 8, %g5
+ andcc %g5, 0xff, %g0
+ be,pn %icc, 1f
+ add %o4, 1, %o4
+ andcc %o5, 0xff, %g0
+ be,pn %icc, 1f
+ add %o4, 1, %o4
+ brgz,pt %o1, 2b
+ ld [%o0], %o5
+1:
+ add %o0, %o1, %o0
+ cmp %o4, %o0
+ retl
+ movle %xcc, %o4, %o0
+0:
+ retl
+ nop
+10:
+ retl
+ sub %o0, 1, %o0
+
+memscan:
+__memscan_generic:
+ /* %o0 = addr, %o1 = c, %o2 = size */
+ brz,pn %o2, 3f
+ add %o0, %o2, %o3
+ ldub [%o0], %o5
+ sub %g0, %o2, %o4
+1:
+ cmp %o5, %o1
+ be,pn %icc, 2f
+ addcc %o4, 1, %o4
+ bne,a,pt %xcc, 1b
+ ldub [%o3 + %o4], %o5
+ retl
+	/* The delay slot is shared with the insn at 2: below; this is just to make it look more awful. */
+2:
+ add %o3, %o4, %o0
+ retl
+ sub %o0, 1, %o0
+3:
+ retl
+ nop
diff --git a/arch/sparc64/lib/memset.S b/arch/sparc64/lib/memset.S
new file mode 100644
index 000000000..55de4ea9d
--- /dev/null
+++ b/arch/sparc64/lib/memset.S
@@ -0,0 +1,196 @@
+/* linux/arch/sparc64/lib/memset.S: Sparc optimized memset, bzero and clear_user code
+ * Copyright (C) 1991,1996 Free Software Foundation
+ * Copyright (C) 1996 David S. Miller (davem@caip.rutgers.edu)
+ * Copyright (C) 1996,1997 Jakub Jelinek (jj@sunsite.mff.cuni.cz)
+ *
+ * Returns 0 if ok, or the number of bytes not yet set if an exception
+ * occurs and we were called as clear_user.
+ */
+
+#include <asm/asi.h>
+#include <asm/ptrace.h>
+
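+/* The contract stated above, in C terms (illustrative only, prototype
+ * assumed rather than taken from this patch):
+ *
+ *	// returns 0 on success, or the number of bytes left unset
+ *	// if a fault is taken while clearing a user buffer
+ *	unsigned long clear_user(void *addr, unsigned long size);
+ */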
+#define EX(x,y,a,b,z) \
+98: x,y; \
+ .section .fixup,z##alloc,z##execinstr; \
+ .align 4; \
+99: ba,pt %xcc, 30f; \
+ a, b, %o0; \
+ .section __ex_table,z##alloc; \
+ .align 4; \
+ .word 98b, 99b; \
+ .text; \
+ .align 4
+
+#define EXT(start,end,handler,z) \
+ .section __ex_table,z##alloc; \
+ .align 4; \
+ .word start, 0, end, handler; \
+ .text; \
+ .align 4
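+/* EX wraps a single store to user space: if the store at local label 98
+ * faults, the __ex_table entry sends the trap handler to the fixup at 99,
+ * whose delay slot computes the "bytes not yet set" return value in %o0
+ * before branching to the common exit at 30f.  EXT instead registers a
+ * whole range of instructions (the unrolled loop 10b..11b below) with one
+ * handler; the fixup at 20f reconstructs the remaining count from %g2 and
+ * the loop counters.
+ */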
+
+/* Please don't change these macros unless you change the logic
+ * in the .fixup section below as well.
+ * ZERO_BIG_BLOCK stores 64 bytes at (BASE + OFFSET) using value SOURCE. */
+#define ZERO_BIG_BLOCK(base, offset, source) \
+ stxa source, [base + offset + 0x00] %asi; \
+ stxa source, [base + offset + 0x08] %asi; \
+ stxa source, [base + offset + 0x10] %asi; \
+ stxa source, [base + offset + 0x18] %asi; \
+ stxa source, [base + offset + 0x20] %asi; \
+ stxa source, [base + offset + 0x28] %asi; \
+ stxa source, [base + offset + 0x30] %asi; \
+ stxa source, [base + offset + 0x38] %asi;
+
+#define ZERO_LAST_BLOCKS(base, offset, source) \
+ stxa source, [base - offset - 0x38] %asi; \
+ stxa source, [base - offset - 0x30] %asi; \
+ stxa source, [base - offset - 0x28] %asi; \
+ stxa source, [base - offset - 0x20] %asi; \
+ stxa source, [base - offset - 0x18] %asi; \
+ stxa source, [base - offset - 0x10] %asi; \
+ stxa source, [base - offset - 0x08] %asi; \
+ stxa source, [base - offset - 0x00] %asi;
+
+ .text
+ .align 4
+
+ .globl __bzero, __memset, __bzero_noasi
+ .globl memset, __memset_start, __memset_end
+__memset_start:
+__memset:
+memset:
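+	/* Replicate the fill byte into all eight bytes of %g3 (the mov/wr
+	 * in between pick up the length in %o1 and set the default ASI).
+	 * In C terms (illustrative only):
+	 *
+	 *	unsigned long p = c & 0xff;
+	 *	p |= p << 8;	// both bytes of the low half-word
+	 *	p |= p << 16;	// all four bytes of the low word
+	 *	p |= p << 32;	// all eight bytes
+	 */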
+ and %o1, 0xff, %g3
+ sll %g3, 8, %g2
+ or %g3, %g2, %g3
+ sll %g3, 16, %g2
+ or %g3, %g2, %g3
+ mov %o2, %o1
+ wr %g0, ASI_P, %asi
+ sllx %g3, 32, %g2
+ ba,pt %xcc, 1f
+ or %g3, %g2, %g3
+__bzero:
+ wr %g0, ASI_P, %asi
+__bzero_noasi:
+ mov %g0, %g3
+1:
+ cmp %o1, 7
+ bleu,pn %xcc, 7f
+ andcc %o0, 3, %o2
+
+ be,a,pt %icc, 4f
+ andcc %o0, 4, %g0
+
+ cmp %o2, 3
+ be,pn %icc, 2f
+ EX(stba %g3, [%o0] %asi, sub %o1, 0,#)
+
+ cmp %o2, 2
+ be,pt %icc, 2f
+ EX(stba %g3, [%o0 + 0x01] %asi, sub %o1, 1,#)
+
+ EX(stba %g3, [%o0 + 0x02] %asi, sub %o1, 2,#)
+2:
+ sub %o2, 4, %o2
+ sub %o0, %o2, %o0
+ add %o1, %o2, %o1
+ andcc %o0, 4, %g0
+4:
+ be,a,pt %icc, 2f
+ andncc %o1, 0x7f, %o3
+
+ EX(sta %g3, [%o0] %asi, sub %o1, 0,#)
+ sub %o1, 4, %o1
+ add %o0, 4, %o0
+	andncc	%o1, 0x7f, %o3	! Now everything is 8-byte aligned and %o1 is the length left to run
+2:
+ be,pn %xcc, 9f
+ andcc %o1, 0x78, %o2
+10:
+ ZERO_BIG_BLOCK(%o0, 0x00, %g3)
+ subcc %o3, 128, %o3
+ ZERO_BIG_BLOCK(%o0, 0x40, %g3)
+11:
+ EXT(10b, 11b, 20f,#)
+ bne,pt %xcc, 10b
+ add %o0, 128, %o0
+
+ tst %o2
+9:
+ be,pn %xcc, 13f
+ andcc %o1, 7, %o1
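+	/* Branch into the ZERO_LAST_BLOCKS expansions below: %o2 holds the
+	 * remaining multiple-of-8 count (%o1 & 0x78), each stxa clears 8
+	 * bytes and occupies 4 bytes of code, so jumping %o2/2 bytes back
+	 * from label 13 runs exactly the stores still needed.  The delay
+	 * slot advances %o0 past the bytes those stores will clear.
+	 */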
+14:
+ rd %pc, %o4
+ srl %o2, 1, %o3
+ sub %o4, %o3, %o4
+ jmpl %o4 + (13f - 14b), %g0
+ add %o0, %o2, %o0
+12:
+ ZERO_LAST_BLOCKS(%o0, 0x48, %g3)
+ ZERO_LAST_BLOCKS(%o0, 0x08, %g3)
+13:
+ be,pn %icc, 8f
+ andcc %o1, 4, %g0
+
+ be,pn %icc, 1f
+ andcc %o1, 2, %g0
+
+ EX(sta %g3, [%o0] %asi, and %o1, 7,#)
+ add %o0, 4, %o0
+1:
+ be,pn %icc, 1f
+ andcc %o1, 1, %g0
+
+ EX(stha %g3, [%o0] %asi, and %o1, 3,#)
+ add %o0, 2, %o0
+1:
+ bne,a,pn %icc, 8f
+ EX(stba %g3, [%o0] %asi, and %o1, 1,#)
+8:
+ retl
+ clr %o0
+7:
+ be,pn %icc, 13b
+ orcc %o1, 0, %g0
+
+ be,pn %icc, 0f
+8:
+ add %o0, 1, %o0
+ subcc %o1, 1, %o1
+ bne,a,pt %icc, 8b
+ EX(stba %g3, [%o0 - 1] %asi, add %o1, 1,#)
+0:
+ retl
+ clr %o0
+__memset_end:
+
+ .section .fixup,#alloc,#execinstr
+ .align 4
+20:
+ cmp %g2, 8
+ bleu,pn %xcc, 1f
+ and %o1, 0x7f, %o1
+ sub %g2, 9, %g2
+ add %o3, 64, %o3
+1:
+ sll %g2, 3, %g2
+ add %o3, %o1, %o0
+ ba,pt %xcc, 30f
+ sub %o0, %g2, %o0
+21:
+ mov 8, %o0
+ and %o1, 7, %o1
+ sub %o0, %g2, %o0
+ sll %o0, 3, %o0
+ ba,pt %xcc, 30f
+ add %o0, %o1, %o0
+30:
+/* %o4 is the faulting address, %o5 is the %pc where the fault occurred */
+ save %sp, -160, %sp
+ mov %i5, %o0
+ mov %i7, %o1
+ call lookup_fault
+ mov %i4, %o2
+ ret
+ restore
diff --git a/arch/sparc64/lib/strlen.S b/arch/sparc64/lib/strlen.S
new file mode 100644
index 000000000..5f2ec6bb4
--- /dev/null
+++ b/arch/sparc64/lib/strlen.S
@@ -0,0 +1,77 @@
+/* strlen.S: Sparc64 optimized strlen code
+ * Hand optimized from GNU libc's strlen
+ * Copyright (C) 1991,1996 Free Software Foundation
+ * Copyright (C) 1996 David S. Miller (davem@caip.rutgers.edu)
+ * Copyright (C) 1996, 1997 Jakub Jelinek (jj@sunsite.mff.cuni.cz)
+ */
+
+#define LO_MAGIC 0x01010101
+#define HI_MAGIC 0x80808080
+
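+/* The LO_MAGIC/HI_MAGIC word-at-a-time zero-byte test below is the same
+ * one used (and sketched in C) in memscan.S. */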
+ .align 4
+ .global strlen
+strlen:
+ mov %o0, %o1
+ andcc %o0, 3, %g0
+ be,pt %icc, 9f
+ sethi %hi(HI_MAGIC), %o4
+ ldub [%o0], %o5
+ brz,pn %o5, 11f
+ add %o0, 1, %o0
+ andcc %o0, 3, %g0
+ be,pn %icc, 4f
+ or %o4, %lo(HI_MAGIC), %o3
+ ldub [%o0], %o5
+ brz,pn %o5, 12f
+ add %o0, 1, %o0
+ andcc %o0, 3, %g0
+ be,pt %icc, 5f
+ sethi %hi(LO_MAGIC), %o4
+ ldub [%o0], %o5
+ brz,pn %o5, 13f
+ add %o0, 1, %o0
+ ba,pt %icc, 8f
+ or %o4, %lo(LO_MAGIC), %o2
+9:
+ or %o4, %lo(HI_MAGIC), %o3
+4:
+ sethi %hi(LO_MAGIC), %o4
+5:
+ or %o4, %lo(LO_MAGIC), %o2
+8:
+ ld [%o0], %o5
+2:
+ sub %o5, %o2, %o4
+ andcc %o4, %o3, %g0
+ be,pt %icc, 8b
+ add %o0, 4, %o0
+
+ /* Check every byte. */
+ srl %o5, 24, %g5
+ andcc %g5, 0xff, %g0
+ be,pn %icc, 1f
+ add %o0, -4, %o4
+ srl %o5, 16, %g5
+ andcc %g5, 0xff, %g0
+ be,pn %icc, 1f
+ add %o4, 1, %o4
+ srl %o5, 8, %g5
+ andcc %g5, 0xff, %g0
+ be,pn %icc, 1f
+ add %o4, 1, %o4
+ andcc %o5, 0xff, %g0
+ bne,a,pt %icc, 2b
+ ld [%o0], %o5
+ add %o4, 1, %o4
+1:
+ retl
+ sub %o4, %o1, %o0
+11:
+ retl
+ mov 0, %o0
+12:
+ retl
+ mov 1, %o0
+13:
+ retl
+ mov 2, %o0
diff --git a/arch/sparc64/lib/strlen_user.S b/arch/sparc64/lib/strlen_user.S
new file mode 100644
index 000000000..24bea73fd
--- /dev/null
+++ b/arch/sparc64/lib/strlen_user.S
@@ -0,0 +1,99 @@
+/* strlen_user.S: Sparc64 optimized strlen_user code
+ *
+ * Return length of string in userspace including terminating 0
+ * or 0 for error
+ *
+ * Copyright (C) 1991,1996 Free Software Foundation
+ * Copyright (C) 1996 David S. Miller (davem@caip.rutgers.edu)
+ * Copyright (C) 1996,1997 Jakub Jelinek (jj@sunsite.mff.cuni.cz)
+ */
+
+#define LO_MAGIC 0x01010101
+#define HI_MAGIC 0x80808080
+
+ .align 4
+ .global __strlen_user
+__strlen_user:
+ mov %o0, %o1
+ andcc %o0, 3, %g0
+ be,pt %icc, 9f
+ sethi %hi(HI_MAGIC), %o4
+10:
+ ldub [%o0], %o5
+ brz,pn %o5, 21f
+ add %o0, 1, %o0
+ andcc %o0, 3, %g0
+ be,pn %icc, 4f
+ or %o4, %lo(HI_MAGIC), %o3
+11:
+ ldub [%o0], %o5
+ brz,pn %o5, 22f
+ add %o0, 1, %o0
+ andcc %o0, 3, %g0
+ be,pt %icc, 5f
+ sethi %hi(LO_MAGIC), %o4
+12:
+ ldub [%o0], %o5
+ brz,pn %o5, 23f
+ add %o0, 1, %o0
+ ba,pt %icc, 13f
+ or %o4, %lo(LO_MAGIC), %o2
+9:
+ or %o4, %lo(HI_MAGIC), %o3
+4:
+ sethi %hi(LO_MAGIC), %o4
+5:
+ or %o4, %lo(LO_MAGIC), %o2
+13:
+ ld [%o0], %o5
+2:
+ sub %o5, %o2, %o4
+ andcc %o4, %o3, %g0
+ be,pt %icc, 13b
+ add %o0, 4, %o0
+
+ /* Check every byte. */
+ srl %o5, 24, %g5
+ andcc %g5, 0xff, %g0
+ be,pn %icc, 1f
+ add %o0, -3, %o4
+ srl %o5, 16, %g5
+ andcc %g5, 0xff, %g0
+ be,pn %icc, 1f
+ add %o4, 1, %o4
+ srl %o5, 8, %g5
+ andcc %g5, 0xff, %g0
+ be,pn %icc, 1f
+ add %o4, 1, %o4
+ andcc %o5, 0xff, %g0
+ bne,a,pt %icc, 2b
+14:
+ ld [%o0], %o5
+ add %o4, 1, %o4
+1:
+ retl
+ sub %o4, %o1, %o0
+21:
+ retl
+ mov 1, %o0
+22:
+ retl
+ mov 2, %o0
+23:
+ retl
+ mov 3, %o0
+
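+	/* If any of the user loads above (labels 10b-14b) faults, the
+	 * __ex_table entries below send us to the fixup at 30:, which
+	 * returns 0, the error value documented in the header. */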
+ .section .fixup,#alloc,#execinstr
+ .align 4
+30:
+ retl
+ clr %o0
+
+ .section __ex_table,#alloc
+ .align 4
+
+ .word 10b, 30b
+ .word 11b, 30b
+ .word 12b, 30b
+ .word 13b, 30b
+ .word 14b, 30b
diff --git a/arch/sparc64/lib/strncmp.S b/arch/sparc64/lib/strncmp.S
new file mode 100644
index 000000000..474ba7296
--- /dev/null
+++ b/arch/sparc64/lib/strncmp.S
@@ -0,0 +1,31 @@
+/* $Id: strncmp.S,v 1.2 1997/03/11 17:51:44 jj Exp $
+ * Sparc64 optimized strncmp code.
+ *
+ * Copyright (C) 1997 Jakub Jelinek (jj@sunsite.mff.cuni.cz)
+ */
+
+#include <asm/asi.h>
+
+ .text
+ .align 4
+ .global __strncmp, strncmp
+__strncmp:
+strncmp:
+ brlez,pn %o2, 3f
+ lduba [%o0] (ASI_PNF), %o3
+1:
+ add %o0, 1, %o0
+ ldub [%o1], %o4
+ brz,pn %o3, 2f
+ add %o1, 1, %o1
+ cmp %o3, %o4
+ bne,pn %icc, 2f
+ subcc %o2, 1, %o2
+ bne,a,pt %xcc, 1b
+ ldub [%o0], %o3
+2:
+ retl
+ sub %o3, %o4, %o0
+3:
+ retl
+ clr %o0
diff --git a/arch/sparc64/lib/strncpy_from_user.S b/arch/sparc64/lib/strncpy_from_user.S
new file mode 100644
index 000000000..05a48eb5a
--- /dev/null
+++ b/arch/sparc64/lib/strncpy_from_user.S
@@ -0,0 +1,54 @@
+/* strncpy_from_user.S: Sparc64 strncpy from userspace.
+ *
+ * Copyright (C) 1997 Jakub Jelinek (jj@sunsite.mff.cuni.cz)
+ */
+
+#include <asm/asi.h>
+#include <asm/errno.h>
+
+ .text
+ .align 4
+
+ /* Must return:
+ *
+ * -EFAULT for an exception
+ * count if we hit the buffer limit
+ * bytes copied if we hit a null byte
+ */
+
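+	/* A rough C equivalent of the contract above (illustrative only;
+	 * note that the terminating null byte is stored and counted, as the
+	 * code below does):
+	 *
+	 *	long __strncpy_from_user(char *dst, const char *src, long count)
+	 *	{
+	 *		long n;
+	 *		if (count <= 0)
+	 *			return 0;
+	 *		for (n = 0; n < count; n++) {
+	 *			char c = src[n];	// may fault -> -EFAULT
+	 *			dst[n] = c;
+	 *			if (c == '\0')
+	 *				return n + 1;	// bytes copied
+	 *		}
+	 *		return count;			// hit the buffer limit
+	 *	}
+	 */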
+ .globl __strncpy_from_user
+__strncpy_from_user:
+ /* %o0=dest, %o1=src, %o2=count */
+ brlez,pn %o2, 3f
+ add %o1, %o2, %o1
+ sub %g0, %o2, %o3
+ add %o0, %o2, %o0
+10:
+ ldub [%o1 + %o3], %o4
+1:
+ brz,pn %o4, 2f
+ stb %o4, [%o0 + %o3]
+ addcc %o3, 1, %o3
+ bne,pt %xcc, 1b
+11:
+ ldub [%o1 + %o3], %o4
+ retl
+ mov %o2, %o0
+2:
+ add %o3, 1, %o3
+ retl
+ add %o2, %o3, %o0
+3:
+ retl
+ clr %o0
+
+ .section .fixup,#alloc,#execinstr
+ .align 4
+4:
+ retl
+ mov -EFAULT, %o0
+
+ .section __ex_table,#alloc
+ .align 4
+ .word 10b, 4b
+ .word 11b, 4b