path: root/arch/sparc/lib
author	Ralf Baechle <ralf@linux-mips.org>	1997-01-07 02:33:00 +0000
committer	<ralf@linux-mips.org>	1997-01-07 02:33:00 +0000
commit	beb116954b9b7f3bb56412b2494b562f02b864b1 (patch)
tree	120e997879884e1b9d93b265221b939d2ef1ade1 /arch/sparc/lib
parent	908d4681a1dc3792ecafbe64265783a86c4cccb6 (diff)
Import of Linux/MIPS 2.1.14
Diffstat (limited to 'arch/sparc/lib')
-rw-r--r--	arch/sparc/lib/Makefile	47
-rw-r--r--	arch/sparc/lib/ashrdi3.S	24
-rw-r--r--	arch/sparc/lib/blockops.S	103
-rw-r--r--	arch/sparc/lib/checksum.S	439
-rw-r--r--	arch/sparc/lib/memcmp.S	314
-rw-r--r--	arch/sparc/lib/memcpy.S	364
-rw-r--r--	arch/sparc/lib/memscan.S	135
-rw-r--r--	arch/sparc/lib/memset.S	166
-rw-r--r--	arch/sparc/lib/memset.c	71
-rw-r--r--	arch/sparc/lib/mul.S	22
-rw-r--r--	arch/sparc/lib/rem.S	221
-rw-r--r--	arch/sparc/lib/sdiv.S	222
-rw-r--r--	arch/sparc/lib/strlen.S	88
-rw-r--r--	arch/sparc/lib/strncmp.S	120
-rw-r--r--	arch/sparc/lib/strncpy_from_user.S	49
-rw-r--r--	arch/sparc/lib/udiv.S	209
-rw-r--r--	arch/sparc/lib/umul.S	25
-rw-r--r--	arch/sparc/lib/urem.S	207
18 files changed, 2389 insertions, 437 deletions
diff --git a/arch/sparc/lib/Makefile b/arch/sparc/lib/Makefile
index 1f2ce0e1c..2cb74336f 100644
--- a/arch/sparc/lib/Makefile
+++ b/arch/sparc/lib/Makefile
@@ -1,22 +1,44 @@
-#
+# $Id: Makefile,v 1.12 1996/10/27 08:36:26 davem Exp $
# Makefile for Sparc library files..
#
CFLAGS := $(CFLAGS) -ansi
-.c.s:
- $(CC) $(CFLAGS) -S $<
-.s.o:
- $(AS) -c -o $*.o $<
-.c.o:
- $(CC) $(CFLAGS) -c $<
-
-OBJS = mul.o rem.o sdiv.o udiv.o umul.o urem.o ashrdi3.o
+OBJS = mul.o rem.o sdiv.o udiv.o umul.o urem.o ashrdi3.o memcpy.o memset.o \
+ strlen.o checksum.o blockops.o memscan.o memcmp.o strncmp.o \
+ strncpy_from_user.o
lib.a: $(OBJS)
$(AR) rcs lib.a $(OBJS)
sync
+checksum.o: checksum.S
+ $(CC) -ansi -c -o checksum.o checksum.S
+
+memcpy.o: memcpy.S
+ $(CC) -D__ASSEMBLY__ -ansi -c -o memcpy.o memcpy.S
+
+memcmp.o: memcmp.S
+ $(CC) -ansi -c -o memcmp.o memcmp.S
+
+memscan.o: memscan.S
+ $(CC) -ansi -c -o memscan.o memscan.S
+
+strncmp.o: strncmp.S
+ $(CC) -ansi -c -o strncmp.o strncmp.S
+
+strncpy_from_user.o: strncpy_from_user.S
+ $(CC) -D__ASSEMBLY__ -ansi -c -o strncpy_from_user.o strncpy_from_user.S
+
+blockops.o: blockops.S
+ $(CC) -ansi -c -o blockops.o blockops.S
+
+memset.o: memset.S
+ $(CC) -D__ASSEMBLY__ -ansi -c -o memset.o memset.S
+
+strlen.o: strlen.S
+ $(CC) -ansi -c -o strlen.o strlen.S
+
mul.o: mul.S
$(CC) -c -o mul.o mul.S
@@ -40,9 +62,4 @@ ashrdi3.o: ashrdi3.S
dep:
-#
-# include a dependency file if one exists
-#
-ifeq (.depend,$(wildcard .depend))
-include .depend
-endif
+include $(TOPDIR)/Rules.make
diff --git a/arch/sparc/lib/ashrdi3.S b/arch/sparc/lib/ashrdi3.S
index c672d2c9f..bf589c283 100644
--- a/arch/sparc/lib/ashrdi3.S
+++ b/arch/sparc/lib/ashrdi3.S
@@ -1,4 +1,5 @@
-/* ashrdi3.S: The filesystem code creates all kinds of references to
+/* $Id: ashrdi3.S,v 1.3 1996/09/07 23:18:10 davem Exp $
+ * ashrdi3.S: The filesystem code creates all kinds of references to
* this little routine on the sparc with gcc.
*
* Copyright (C) 1995 David S. Miller (davem@caip.rutgers.edu)
@@ -10,19 +11,26 @@
C_LABEL(__ashrdi3):
tst %o2
be 3f
- or %g0, 32, %g2
+ or %g0, 32, %g2
+
sub %g2, %o2, %g2
+
tst %g2
bg 1f
- sra %o0, %o2, %o4
+ sra %o0, %o2, %o4
+
sra %o0, 31, %o4
sub %g0, %g2, %g2
ba 2f
- sra %o0, %g2, %o5
-1: sll %o0, %g2, %g3
+ sra %o0, %g2, %o5
+
+1:
+ sll %o0, %g2, %g3
srl %o1, %o2, %g2
or %g2, %g3, %o5
-2: or %g0, %o4, %o0
+2:
+ or %g0, %o4, %o0
or %g0, %o5, %o1
-3: jmpl %o7 + 8, %g0
- nop
+3:
+ jmpl %o7 + 8, %g0
+ nop
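
For reference, what gcc expects __ashrdi3 to compute is a 64-bit arithmetic right shift built from 32-bit operations; the 64-bit value arrives as %o0 = high word, %o1 = low word, with the shift count in %o2. A minimal C sketch of the same logic, assuming arithmetic >> of negative signed values as SPARC's sra gives (names here are illustrative, not from the kernel):

long long sketch_ashrdi3(long long value, int count)
{
        int hi = (int)(value >> 32);            /* %o0 in the assembly */
        unsigned int lo = (unsigned int)value;  /* %o1 */
        unsigned int new_hi, new_lo;

        if (count == 0)
                return value;                   /* the early exit at label 3 */
        if (count < 32) {
                new_lo = (lo >> count) | ((unsigned int)hi << (32 - count));
                new_hi = (unsigned int)(hi >> count);   /* arithmetic shift keeps the sign */
        } else {                                /* low word comes from the high word */
                new_lo = (unsigned int)(hi >> (count - 32));
                new_hi = (unsigned int)(hi >> 31);      /* pure sign fill */
        }
        return (long long)(((unsigned long long)new_hi << 32) | new_lo);
}
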
diff --git a/arch/sparc/lib/blockops.S b/arch/sparc/lib/blockops.S
new file mode 100644
index 000000000..f8a9e80df
--- /dev/null
+++ b/arch/sparc/lib/blockops.S
@@ -0,0 +1,103 @@
+/* $Id: blockops.S,v 1.5 1996/09/24 05:22:56 davem Exp $
+ * blockops.S: Common block zero optimized routines.
+ *
+ * Copyright (C) 1996 David S. Miller (davem@caip.rutgers.edu)
+ */
+
+#include <asm/cprefix.h>
+
+ /* Zero out 64 bytes of memory at (buf + offset).
+ * Assumes %g1 contains zero.
+ */
+#define BLAST_BLOCK(buf, offset) \
+ std %g0, [buf + offset + 0x38]; \
+ std %g0, [buf + offset + 0x30]; \
+ std %g0, [buf + offset + 0x28]; \
+ std %g0, [buf + offset + 0x20]; \
+ std %g0, [buf + offset + 0x18]; \
+ std %g0, [buf + offset + 0x10]; \
+ std %g0, [buf + offset + 0x08]; \
+ std %g0, [buf + offset + 0x00];
+
+ /* Copy 32 bytes of memory at (src + offset) to
+ * (dst + offset).
+ */
+#define MIRROR_BLOCK(dst, src, offset, t0, t1, t2, t3, t4, t5, t6, t7) \
+ ldd [src + offset + 0x18], t0; \
+ ldd [src + offset + 0x10], t2; \
+ ldd [src + offset + 0x08], t4; \
+ ldd [src + offset + 0x00], t6; \
+ std t0, [dst + offset + 0x18]; \
+ std t2, [dst + offset + 0x10]; \
+ std t4, [dst + offset + 0x08]; \
+ std t6, [dst + offset + 0x00];
+
+ /* Profiling evidence indicates that memset() is
+ * commonly called for blocks of size PAGE_SIZE,
+ * and (2 * PAGE_SIZE) (for kernel stacks)
+ * and with a second arg of zero. We assume in
+ * all of these cases that the buffer is aligned
+ * on at least an 8 byte boundry.
+ *
+ * Therefore we special case them to make them
+ * as fast as possible.
+ */
+
+ .text
+ .align 4
+
+ .globl C_LABEL(bzero_2page), C_LABEL(bzero_1page)
+C_LABEL(bzero_2page):
+ /* %o0 = buf */
+ or %g0, %g0, %g1
+ or %o0, %g0, %o1
+ or %g0, 0x20, %g2
+1:
+ BLAST_BLOCK(%o0, 0x00)
+ BLAST_BLOCK(%o0, 0x40)
+ BLAST_BLOCK(%o0, 0x80)
+ BLAST_BLOCK(%o0, 0xc0)
+ subcc %g2, 1, %g2
+ bne 1b
+ add %o0, 0x100, %o0
+
+ retl
+ mov %o1, %o0
+
+C_LABEL(bzero_1page):
+ /* %o0 = buf */
+ or %g0, %g0, %g1
+ or %o0, %g0, %o1
+ or %g0, 0x10, %g2
+1:
+ BLAST_BLOCK(%o0, 0x00)
+ BLAST_BLOCK(%o0, 0x40)
+ BLAST_BLOCK(%o0, 0x80)
+ BLAST_BLOCK(%o0, 0xc0)
+ subcc %g2, 1, %g2
+ bne 1b
+ add %o0, 0x100, %o0
+
+ retl
+ mov %o1, %o0
+
+ .globl C_LABEL(__copy_1page)
+C_LABEL(__copy_1page):
+ /* %o0 = dst, %o1 = src */
+ or %g0, 0x10, %g1
+1:
+ MIRROR_BLOCK(%o0, %o1, 0x00, %o2, %o3, %o4, %o5, %g2, %g3, %g4, %g5)
+ MIRROR_BLOCK(%o0, %o1, 0x20, %o2, %o3, %o4, %o5, %g2, %g3, %g4, %g5)
+ MIRROR_BLOCK(%o0, %o1, 0x40, %o2, %o3, %o4, %o5, %g2, %g3, %g4, %g5)
+ MIRROR_BLOCK(%o0, %o1, 0x60, %o2, %o3, %o4, %o5, %g2, %g3, %g4, %g5)
+ MIRROR_BLOCK(%o0, %o1, 0x80, %o2, %o3, %o4, %o5, %g2, %g3, %g4, %g5)
+ MIRROR_BLOCK(%o0, %o1, 0xa0, %o2, %o3, %o4, %o5, %g2, %g3, %g4, %g5)
+ MIRROR_BLOCK(%o0, %o1, 0xc0, %o2, %o3, %o4, %o5, %g2, %g3, %g4, %g5)
+ MIRROR_BLOCK(%o0, %o1, 0xe0, %o2, %o3, %o4, %o5, %g2, %g3, %g4, %g5)
+ subcc %g1, 1, %g1
+ add %o0, 0x100, %o0
+ bne 1b
+ add %o1, 0x100, %o1
+
+ retl
+ nop
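
In portable terms, bzero_1page clears one page and __copy_1page copies one, both in 256-byte strides built from 64-bit ldd/std pairs. A rough C equivalent of bzero_1page (a sketch only, assuming PAGE_SIZE == 4096 and the 8-byte alignment the comment above requires):

void sketch_bzero_1page(void *page)
{
        unsigned long long *p = page;           /* one 'std' stores 8 bytes */
        int i, j;

        for (i = 0; i < 16; i++)                /* 16 iterations of 256 bytes = 4096 */
                for (j = 0; j < 32; j++)        /* four BLAST_BLOCKs of 8 stores each */
                        *p++ = 0;
}

bzero_2page is the same loop with 32 outer iterations, and __copy_1page replaces the stores with MIRROR_BLOCK load/store pairs.
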
diff --git a/arch/sparc/lib/checksum.S b/arch/sparc/lib/checksum.S
new file mode 100644
index 000000000..a71371bf8
--- /dev/null
+++ b/arch/sparc/lib/checksum.S
@@ -0,0 +1,439 @@
+/* checksum.S: Sparc optimized checksum code.
+ *
+ * Copyright(C) 1995 Linus Torvalds
+ * Copyright(C) 1995 Miguel de Icaza
+ * Copyright(C) 1996 David S. Miller
+ *
+ * derived from:
+ * Linux/Alpha checksum c-code
+ * Linux/ix86 inline checksum assembly
+ * RFC1071 Computing the Internet Checksum (esp. Jacobsons m68k code)
+ * David Mosberger-Tang for optimized reference c-code
+ * BSD4.4 portable checksum routine
+ */
+
+#include <asm-sparc/cprefix.h>
+
+#define CSUM_BIGCHUNK(buf, offset, sum, t0, t1, t2, t3, t4, t5) \
+ ldd [buf + offset + 0x00], t0; \
+ ldd [buf + offset + 0x08], t2; \
+ addxcc t0, sum, sum; \
+ addxcc t1, sum, sum; \
+ ldd [buf + offset + 0x10], t4; \
+ addxcc t2, sum, sum; \
+ addxcc t3, sum, sum; \
+ ldd [buf + offset + 0x18], t0; \
+ addxcc t4, sum, sum; \
+ addxcc t5, sum, sum; \
+ addxcc t0, sum, sum; \
+ addxcc t1, sum, sum;
+
+#define CSUM_LASTCHUNK(buf, offset, sum, t0, t1, t2, t3) \
+ ldd [buf - offset - 0x08], t0; \
+ ldd [buf - offset - 0x00], t2; \
+ addxcc t0, sum, sum; \
+ addxcc t1, sum, sum; \
+ addxcc t2, sum, sum; \
+ addxcc t3, sum, sum;
+
+ /* Do end cruft out of band to get better cache patterns. */
+csum_partial_end_cruft:
+ be 1f ! caller asks %o1 & 0x8
+ andcc %o1, 4, %g0 ! nope, check for word remaining
+ ldd [%o0], %g2 ! load two
+ addcc %g2, %o2, %o2 ! add first word to sum
+ addxcc %g3, %o2, %o2 ! add second word as well
+ add %o0, 8, %o0 ! advance buf ptr
+ addx %g0, %o2, %o2 ! add in final carry
+ andcc %o1, 4, %g0 ! check again for word remaining
+1: be 1f ! nope, skip this code
+ andcc %o1, 3, %o1 ! check for trailing bytes
+ ld [%o0], %g2 ! load it
+ addcc %g2, %o2, %o2 ! add to sum
+ add %o0, 4, %o0 ! advance buf ptr
+ addx %g0, %o2, %o2 ! add in final carry
+ andcc %o1, 3, %g0 ! check again for trailing bytes
+1: be 1f ! no trailing bytes, return
+ addcc %o1, -1, %g0 ! only one byte remains?
+ bne 2f ! at least two bytes more
+ subcc %o1, 2, %o1 ! only two bytes more?
+ b 4f ! only one byte remains
+ or %g0, %g0, %o4 ! clear fake hword value
+2: lduh [%o0], %o4 ! get hword
+ be 6f ! jmp if only hword remains
+ add %o0, 2, %o0 ! advance buf ptr either way
+ sll %o4, 16, %o4 ! create upper hword
+4: ldub [%o0], %o5 ! get final byte
+ sll %o5, 8, %o5 ! put into place
+ or %o5, %o4, %o4 ! coalese with hword (if any)
+6: addcc %o4, %o2, %o2 ! add to sum
+1: retl ! get outta here
+ addx %g0, %o2, %o0 ! add final carry into retval
+
+ /* Also do alignment out of band to get better cache patterns. */
+csum_partial_fix_alignment:
+ cmp %o1, 6
+ bl cpte - 0x4
+ andcc %o0, 0x2, %g0
+ be 1f
+ andcc %o0, 0x4, %g0
+ lduh [%o0 + 0x00], %g2
+ sub %o1, 2, %o1
+ add %o0, 2, %o0
+ sll %g2, 16, %g2
+ addcc %g2, %o2, %o2
+ srl %o2, 16, %g3
+ addx %g0, %g3, %g2
+ sll %o2, 16, %o2
+ sll %g2, 16, %g3
+ srl %o2, 16, %o2
+ andcc %o0, 0x4, %g0
+ or %g3, %o2, %o2
+1: be cpa
+ andcc %o1, 0xffffff80, %o3
+ ld [%o0 + 0x00], %g2
+ sub %o1, 4, %o1
+ addcc %g2, %o2, %o2
+ add %o0, 4, %o0
+ addx %g0, %o2, %o2
+ b cpa
+ andcc %o1, 0xffffff80, %o3
+
+ /* The common case is to get called with a nicely aligned
+ * buffer of size 0x20. Follow the code path for that case.
+ */
+ .globl C_LABEL(csum_partial)
+C_LABEL(csum_partial): /* %o0=buf, %o1=len, %o2=sum */
+ andcc %o0, 0x7, %g0 ! alignment problems?
+ bne csum_partial_fix_alignment ! yep, handle it
+ sethi %hi(cpte - 8), %g7 ! prepare table jmp ptr
+ andcc %o1, 0xffffff80, %o3 ! num loop iterations
+cpa: be 3f ! none to do
+ andcc %o1, 0x70, %g1 ! clears carry flag too
+5: CSUM_BIGCHUNK(%o0, 0x00, %o2, %o4, %o5, %g2, %g3, %g4, %g5)
+ CSUM_BIGCHUNK(%o0, 0x20, %o2, %o4, %o5, %g2, %g3, %g4, %g5)
+ CSUM_BIGCHUNK(%o0, 0x40, %o2, %o4, %o5, %g2, %g3, %g4, %g5)
+ CSUM_BIGCHUNK(%o0, 0x60, %o2, %o4, %o5, %g2, %g3, %g4, %g5)
+ addx %g0, %o2, %o2 ! sink in final carry
+ subcc %o3, 128, %o3 ! detract from loop iters
+ bne 5b ! more to do
+ add %o0, 128, %o0 ! advance buf ptr
+ andcc %o1, 0x70, %g1 ! clears carry flag too
+3: be cpte ! nope
+ andcc %o1, 0xf, %g0 ! anything left at all?
+ srl %g1, 1, %o4 ! compute offset
+ sub %g7, %g1, %g7 ! adjust jmp ptr
+ sub %g7, %o4, %g7 ! final jmp ptr adjust
+ jmp %g7 + %lo(cpte - 8) ! enter the table
+ add %o0, %g1, %o0 ! advance buf ptr
+cptbl: CSUM_LASTCHUNK(%o0, 0x68, %o2, %g2, %g3, %g4, %g5)
+ CSUM_LASTCHUNK(%o0, 0x58, %o2, %g2, %g3, %g4, %g5)
+ CSUM_LASTCHUNK(%o0, 0x48, %o2, %g2, %g3, %g4, %g5)
+ CSUM_LASTCHUNK(%o0, 0x38, %o2, %g2, %g3, %g4, %g5)
+ CSUM_LASTCHUNK(%o0, 0x28, %o2, %g2, %g3, %g4, %g5)
+ CSUM_LASTCHUNK(%o0, 0x18, %o2, %g2, %g3, %g4, %g5)
+ CSUM_LASTCHUNK(%o0, 0x08, %o2, %g2, %g3, %g4, %g5)
+ addx %g0, %o2, %o2 ! fetch final carry
+ andcc %o1, 0xf, %g0 ! anything left at all?
+cpte: bne csum_partial_end_cruft ! yep, handle it
+ andcc %o1, 8, %g0 ! check how much
+cpout: retl ! get outta here
+ mov %o2, %o0 ! return computed csum
+
+ /* This aligned version executes typically in 8.5 superscalar cycles, this
+ * is the best I can do. I say 8.5 because the final add will pair with
+ * the next ldd in the main unrolled loop. Thus the pipe is always full.
+ */
+#define CSUMCOPY_BIGCHUNK_ALIGNED(src, dst, sum, off, t0, t1, t2, t3, t4, t5, t6, t7) \
+ ldd [src + off + 0x00], t0; \
+ ldd [src + off + 0x08], t2; \
+ addxcc t0, sum, sum; \
+ ldd [src + off + 0x10], t4; \
+ addxcc t1, sum, sum; \
+ ldd [src + off + 0x18], t6; \
+ addxcc t2, sum, sum; \
+ std t0, [dst + off + 0x00]; \
+ addxcc t3, sum, sum; \
+ std t2, [dst + off + 0x08]; \
+ addxcc t4, sum, sum; \
+ std t4, [dst + off + 0x10]; \
+ addxcc t5, sum, sum; \
+ std t6, [dst + off + 0x18]; \
+ addxcc t6, sum, sum; \
+ addxcc t7, sum, sum;
+
+ /* 12 superscalar cycles seems to be the limit for this case,
+ * because of this we thus do all the ldd's together to get
+ * Viking MXCC into streaming mode. Ho hum...
+ */
+#define CSUMCOPY_BIGCHUNK(src, dst, sum, off, t0, t1, t2, t3, t4, t5, t6, t7) \
+ ldd [src + off + 0x00], t0; \
+ ldd [src + off + 0x08], t2; \
+ ldd [src + off + 0x10], t4; \
+ ldd [src + off + 0x18], t6; \
+ st t0, [dst + off + 0x00]; \
+ addxcc t0, sum, sum; \
+ st t1, [dst + off + 0x04]; \
+ addxcc t1, sum, sum; \
+ st t2, [dst + off + 0x08]; \
+ addxcc t2, sum, sum; \
+ st t3, [dst + off + 0x0c]; \
+ addxcc t3, sum, sum; \
+ st t4, [dst + off + 0x10]; \
+ addxcc t4, sum, sum; \
+ st t5, [dst + off + 0x14]; \
+ addxcc t5, sum, sum; \
+ st t6, [dst + off + 0x18]; \
+ addxcc t6, sum, sum; \
+ st t7, [dst + off + 0x1c]; \
+ addxcc t7, sum, sum;
+
+ /* Yuck, 6 superscalar cycles... */
+#define CSUMCOPY_LASTCHUNK(src, dst, sum, off, t0, t1, t2, t3) \
+ ldd [src - off - 0x08], t0; \
+ ldd [src - off - 0x00], t2; \
+ addxcc t0, sum, sum; \
+ st t0, [dst - off - 0x08]; \
+ addxcc t1, sum, sum; \
+ st t1, [dst - off - 0x04]; \
+ addxcc t2, sum, sum; \
+ st t2, [dst - off - 0x00]; \
+ addxcc t3, sum, sum; \
+ st t3, [dst - off + 0x04];
+
+ /* Handle the end cruft code out of band for better cache patterns. */
+cc_end_cruft:
+ be 1f
+ andcc %o3, 4, %g0
+ ldd [%o0 + 0x00], %g2
+ add %o1, 8, %o1
+ addcc %g2, %g7, %g7
+ add %o0, 8, %o0
+ addxcc %g3, %g7, %g7
+ st %g2, [%o1 - 0x08]
+ addx %g0, %g7, %g7
+ andcc %o3, 4, %g0
+ st %g3, [%o1 - 0x04]
+1: be 1f
+ andcc %o3, 3, %o3
+ ld [%o0 + 0x00], %g2
+ add %o1, 4, %o1
+ addcc %g2, %g7, %g7
+ st %g2, [%o1 - 0x04]
+ addx %g0, %g7, %g7
+ add %o0, 4, %o0
+ andcc %o3, 3, %g0
+1: be 1f
+ addcc %o3, -1, %g0
+ bne 2f
+ subcc %o3, 2, %o3
+ b 4f
+ or %g0, %g0, %o4
+2: lduh [%o0 + 0x00], %o4
+ add %o0, 2, %o0
+ sth %o4, [%o1 + 0x00]
+ be 6f
+ add %o1, 2, %o1
+ sll %o4, 16, %o4
+4: ldub [%o0 + 0x00], %o5
+ stb %o5, [%o1 + 0x00]
+ sll %o5, 8, %o5
+ or %o5, %o4, %o4
+6: addcc %o4, %g7, %g7
+1: retl
+ addx %g0, %g7, %o0
+
+ /* Also, handle the alignment code out of band. */
+cc_dword_align:
+ cmp %g1, 6
+ bl,a ccte
+ andcc %g1, 0xf, %o3
+ andcc %o0, 0x1, %g0
+ bne ccslow
+ andcc %o0, 0x2, %g0
+ be 1f
+ andcc %o0, 0x4, %g0
+ lduh [%o0 + 0x00], %g2
+ sub %g1, 2, %g1
+ sth %g2, [%o1 + 0x00]
+ add %o0, 2, %o0
+ sll %g2, 16, %g2
+ addcc %g2, %g7, %g7
+ add %o1, 2, %o1
+ srl %g7, 16, %g3
+ addx %g0, %g3, %g2
+ sll %g7, 16, %g7
+ sll %g2, 16, %g3
+ srl %g7, 16, %g7
+ andcc %o0, 0x4, %g0
+ or %g3, %g7, %g7
+1: be 3f
+ andcc %g1, 0xffffff80, %g0
+ ld [%o0 + 0x00], %g2
+ sub %g1, 4, %g1
+ st %g2, [%o1 + 0x00]
+ add %o0, 4, %o0
+ addcc %g2, %g7, %g7
+ add %o1, 4, %o1
+ addx %g0, %g7, %g7
+ b 3f
+ andcc %g1, 0xffffff80, %g0
+
+ /* Sun, you just can't beat me, you just can't. Stop trying,
+ * give up. I'm serious, I am going to kick the living shit
+ * out of you, game over, lights out.
+ */
+ .align 8
+ .globl C_LABEL(csum_partial_copy)
+C_LABEL(csum_partial_copy): /* %o0=src, %o1=dest, %o2=len, %o3=sum */
+ xor %o0, %o1, %o4 ! get changing bits
+ mov %o2, %g1 ! free up %o2
+ andcc %o4, 3, %g0 ! check for mismatched alignment
+ bne ccslow ! better this than unaligned/fixups
+ andcc %o0, 7, %g0 ! need to align things?
+ mov %o3, %g7 ! free up %o3
+ bne cc_dword_align ! yes, we check for short lengths there
+ andcc %g1, 0xffffff80, %g0 ! can we use unrolled loop?
+3: be 3f ! nope, less than one loop remains
+ andcc %o1, 4, %g0 ! dest aligned on 4 or 8 byte boundry?
+ be ccdbl + 4 ! 8 byte aligned, kick ass
+5: CSUMCOPY_BIGCHUNK(%o0,%o1,%g7,0x00,%o4,%o5,%g2,%g3,%g4,%g5,%o2,%o3)
+ CSUMCOPY_BIGCHUNK(%o0,%o1,%g7,0x20,%o4,%o5,%g2,%g3,%g4,%g5,%o2,%o3)
+ CSUMCOPY_BIGCHUNK(%o0,%o1,%g7,0x40,%o4,%o5,%g2,%g3,%g4,%g5,%o2,%o3)
+ CSUMCOPY_BIGCHUNK(%o0,%o1,%g7,0x60,%o4,%o5,%g2,%g3,%g4,%g5,%o2,%o3)
+ sub %g1, 128, %g1 ! detract from length
+ addx %g0, %g7, %g7 ! add in last carry bit
+ andcc %g1, 0xffffff80, %g0 ! more to csum?
+ add %o0, 128, %o0 ! advance src ptr
+ bne 5b ! we did not go negative, continue looping
+ add %o1, 128, %o1 ! advance dest ptr
+3: andcc %g1, 0x70, %o2 ! can use table?
+ccmerge:be ccte ! nope, go and check for end cruft
+ andcc %g1, 0xf, %o3 ! get low bits of length (clears carry btw)
+ srl %o2, 1, %o4 ! begin negative offset computation
+ sethi %hi(ccte - 8), %o5 ! set up table ptr end
+ add %o0, %o2, %o0 ! advance src ptr
+ sub %o5, %o4, %o5 ! continue table calculation
+ sll %o2, 1, %g2 ! constant multiplies are fun...
+ sub %o5, %g2, %o5 ! some more adjustments
+ jmp %o5 + %lo(ccte - 8) ! jump into it, duff style, wheee...
+ add %o1, %o2, %o1 ! advance dest ptr (carry is clear btw)
+cctbl: CSUMCOPY_LASTCHUNK(%o0,%o1,%g7,0x68,%g2,%g3,%g4,%g5)
+ CSUMCOPY_LASTCHUNK(%o0,%o1,%g7,0x58,%g2,%g3,%g4,%g5)
+ CSUMCOPY_LASTCHUNK(%o0,%o1,%g7,0x48,%g2,%g3,%g4,%g5)
+ CSUMCOPY_LASTCHUNK(%o0,%o1,%g7,0x38,%g2,%g3,%g4,%g5)
+ CSUMCOPY_LASTCHUNK(%o0,%o1,%g7,0x28,%g2,%g3,%g4,%g5)
+ CSUMCOPY_LASTCHUNK(%o0,%o1,%g7,0x18,%g2,%g3,%g4,%g5)
+ CSUMCOPY_LASTCHUNK(%o0,%o1,%g7,0x08,%g2,%g3,%g4,%g5)
+ addx %g0, %g7, %g7
+ andcc %o3, 0xf, %g0 ! check for low bits set
+ccte: bne cc_end_cruft ! something left, handle it out of band
+ andcc %o3, 8, %g0 ! begin checks for that code
+ retl ! return
+ mov %g7, %o0 ! give em the computed checksum
+ccdbl: CSUMCOPY_BIGCHUNK_ALIGNED(%o0,%o1,%g7,0x00,%o4,%o5,%g2,%g3,%g4,%g5,%o2,%o3)
+ CSUMCOPY_BIGCHUNK_ALIGNED(%o0,%o1,%g7,0x20,%o4,%o5,%g2,%g3,%g4,%g5,%o2,%o3)
+ CSUMCOPY_BIGCHUNK_ALIGNED(%o0,%o1,%g7,0x40,%o4,%o5,%g2,%g3,%g4,%g5,%o2,%o3)
+ CSUMCOPY_BIGCHUNK_ALIGNED(%o0,%o1,%g7,0x60,%o4,%o5,%g2,%g3,%g4,%g5,%o2,%o3)
+ sub %g1, 128, %g1 ! detract from length
+ addx %g0, %g7, %g7 ! add in last carry bit
+ andcc %g1, 0xffffff80, %g0 ! more to csum?
+ add %o0, 128, %o0 ! advance src ptr
+ bne ccdbl ! we did not go negative, continue looping
+ add %o1, 128, %o1 ! advance dest ptr
+ b ccmerge ! finish it off, above
+ andcc %g1, 0x70, %o2 ! can use table? (clears carry btw)
+
+ccslow:
+ save %sp, -104, %sp
+ mov %i0, %g2
+ mov %g2, %o4
+ orcc %i2, %g0, %o5
+ ble .LL37
+ mov 0, %o3
+ andcc %g2, 1, %g3
+ be .LL50
+ sra %o5, 1, %o1
+ ldub [%g2], %o3
+ add %i2, -1, %o5
+ add %g2, 1, %o4
+ sra %o5, 1, %o1
+.LL50:
+ cmp %o1, 0
+ be .LL39
+ andcc %o4, 2, %g0
+ be,a .LL51
+ sra %o1, 1, %o1
+ add %o1, -1, %o1
+ lduh [%o4], %o0
+ add %o5, -2, %o5
+ add %o3, %o0, %o3
+ add %o4, 2, %o4
+ sra %o1, 1, %o1
+.LL51:
+ cmp %o1, 0
+ be .LL41
+ mov 0, %o2
+.LL42:
+ ld [%o4], %o0
+ add %o3, %o2, %o3
+ add %o3, %o0, %o3
+ cmp %o3, %o0
+ addx %g0, 0, %o2
+ addcc %o1, -1, %o1
+ bne .LL42
+ add %o4, 4, %o4
+ add %o3, %o2, %o3
+ sethi %hi(65535), %o0
+ or %o0, %lo(65535), %o0
+ and %o3, %o0, %o0
+ srl %o3, 16, %o1
+ add %o0, %o1, %o3
+.LL41:
+ andcc %o5, 2, %g0
+ be .LL52
+ andcc %o5, 1, %g0
+ lduh [%o4], %o0
+ add %o3, %o0, %o3
+ add %o4, 2, %o4
+.LL39:
+ andcc %o5, 1, %g0
+.LL52:
+ be .LL53
+ sethi %hi(65535), %o0
+ ldub [%o4], %o0
+ sll %o0, 8, %o0
+ add %o3, %o0, %o3
+ sethi %hi(65535), %o0
+.LL53:
+ or %o0, %lo(65535), %o0
+ and %o3, %o0, %o2
+ srl %o3, 16, %o1
+ add %o2, %o1, %o1
+ and %o1, %o0, %o2
+ srl %o1, 16, %o1
+ add %o2, %o1, %o1
+ and %o1, %o0, %o0
+ srl %o1, 16, %o1
+ add %o0, %o1, %o1
+ sll %o1, 16, %o0
+ cmp %g3, 0
+ be .LL37
+ srl %o0, 16, %o3
+ srl %o0, 24, %o1
+ and %o3, 255, %o0
+ sll %o0, 8, %o0
+ or %o1, %o0, %o3
+.LL37:
+ add %o3, %i3, %o1
+ sethi %hi(65535), %o0
+ or %o0, %lo(65535), %o0
+ and %o1, %o0, %o0
+ srl %o1, 16, %o1
+ add %o0, %o1, %i0
+ mov %i1, %o0
+ mov %g2, %o1
+ call C_LABEL(__memcpy)
+ mov %i2, %o2
+ ret
+ restore
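
For readers following the unrolled ldd/addxcc chains: csum_partial accumulates a 32-bit end-around-carry sum over the buffer, the partial Internet checksum of RFC 1071 (the final fold to 16 bits is done by the callers). A portable sketch of the same computation; the intermediate 32-bit value is not bit-for-bit what the assembly keeps, but it folds to the same 16-bit checksum:

unsigned int sketch_csum_partial(const unsigned char *buf, int len, unsigned int sum)
{
        unsigned long long acc = sum;
        int i;

        for (i = 0; i + 1 < len; i += 2)        /* big-endian 16-bit words, as on SPARC */
                acc += (unsigned int)((buf[i] << 8) | buf[i + 1]);
        if (i < len)                            /* trailing byte goes in the high half */
                acc += (unsigned int)(buf[i] << 8);
        while (acc >> 32)                       /* wrap carries back in (end-around carry) */
                acc = (acc & 0xffffffffULL) + (acc >> 32);
        return (unsigned int)acc;
}

csum_partial_copy computes the same sum while the data is being copied to the destination, which is why the ccslow fallback ends with a call to __memcpy.
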
diff --git a/arch/sparc/lib/memcmp.S b/arch/sparc/lib/memcmp.S
new file mode 100644
index 000000000..bf22e492c
--- /dev/null
+++ b/arch/sparc/lib/memcmp.S
@@ -0,0 +1,314 @@
+#include <asm/cprefix.h>
+
+ .text
+ .align 4
+ .global C_LABEL(__memcmp), C_LABEL(memcmp)
+C_LABEL(__memcmp):
+C_LABEL(memcmp):
+#if 1
+ cmp %o2, 0
+ ble L3
+ mov 0, %g3
+L5:
+ ldub [%o0], %g2
+ ldub [%o1], %g3
+ sub %g2, %g3, %g2
+ mov %g2, %g3
+ sll %g2, 24, %g2
+
+ cmp %g2, 0
+ bne L3
+ add %o0, 1, %o0
+
+ add %o2, -1, %o2
+
+ cmp %o2, 0
+ bg L5
+ add %o1, 1, %o1
+L3:
+ sll %g3, 24, %o0
+ sra %o0, 24, %o0
+
+ retl
+ nop
+#else
+ save %sp, -104, %sp
+ mov %i2, %o4
+ mov %i0, %o0
+
+ cmp %o4, 15
+ ble L72
+ mov %i1, %i2
+
+ andcc %i2, 3, %g0
+ be L161
+ andcc %o0, 3, %g2
+L75:
+ ldub [%o0], %g3
+ ldub [%i2], %g2
+ add %o0,1, %o0
+
+ subcc %g3, %g2, %i0
+ bne L156
+ add %i2, 1, %i2
+
+ andcc %i2, 3, %g0
+ bne L75
+ add %o4, -1, %o4
+
+ andcc %o0, 3, %g2
+L161:
+ bne,a L78
+ mov %i2, %i1
+
+ mov %o0, %i5
+ mov %i2, %i3
+ srl %o4, 2, %i4
+
+ cmp %i4, 0
+ bge L93
+ mov %i4, %g2
+
+ add %i4, 3, %g2
+L93:
+ sra %g2, 2, %g2
+ sll %g2, 2, %g2
+ sub %i4, %g2, %g2
+
+ cmp %g2, 1
+ be,a L88
+ add %o0, 4, %i5
+
+ bg L94
+ cmp %g2, 2
+
+ cmp %g2, 0
+ be,a L86
+ ld [%o0], %g3
+
+ b L162
+ ld [%i5], %g3
+L94:
+ be L81
+ cmp %g2, 3
+
+ be,a L83
+ add %o0, -4, %i5
+
+ b L162
+ ld [%i5], %g3
+L81:
+ add %o0, -8, %i5
+ ld [%o0], %g3
+ add %i2, -8, %i3
+ ld [%i2], %g2
+
+ b L82
+ add %i4, 2, %i4
+L83:
+ ld [%o0], %g4
+ add %i2, -4, %i3
+ ld [%i2], %g1
+
+ b L84
+ add %i4, 1, %i4
+L86:
+ b L87
+ ld [%i2], %g2
+L88:
+ add %i2, 4, %i3
+ ld [%o0], %g4
+ add %i4, -1, %i4
+ ld [%i2], %g1
+L95:
+ ld [%i5], %g3
+L162:
+ cmp %g4, %g1
+ be L87
+ ld [%i3], %g2
+
+ cmp %g4, %g1
+L163:
+ bleu L114
+ mov -1, %i0
+
+ b L114
+ mov 1, %i0
+L87:
+ ld [%i5 + 4], %g4
+ cmp %g3, %g2
+ bne L163
+ ld [%i3 + 4], %g1
+L84:
+ ld [%i5 + 8], %g3
+
+ cmp %g4, %g1
+ bne L163
+ ld [%i3 + 8], %g2
+L82:
+ ld [%i5 + 12], %g4
+ cmp %g3, %g2
+ bne L163
+ ld [%i3 + 12], %g1
+
+ add %i5, 16, %i5
+
+ addcc %i4, -4, %i4
+ bne L95
+ add %i3, 16, %i3
+
+ cmp %g4, %g1
+ bne L163
+ nop
+
+ b L114
+ mov 0, %i0
+L78:
+ srl %o4, 2, %i0
+ and %o0, -4, %i3
+ orcc %i0, %g0, %g3
+ sll %g2, 3, %o7
+ mov 32, %g2
+
+ bge L129
+ sub %g2, %o7, %o1
+
+ add %i0, 3, %g3
+L129:
+ sra %g3, 2, %g2
+ sll %g2, 2, %g2
+ sub %i0, %g2, %g2
+
+ cmp %g2, 1
+ be,a L124
+ ld [%i3], %o3
+
+ bg L130
+ cmp %g2, 2
+
+ cmp %g2, 0
+ be,a L122
+ ld [%i3], %o2
+
+ b L164
+ sll %o3, %o7, %g3
+L130:
+ be L117
+ cmp %g2, 3
+
+ be,a L119
+ ld [%i3], %g1
+
+ b L164
+ sll %o3, %o7, %g3
+L117:
+ ld [%i3], %g4
+ add %i2, -8, %i1
+ ld [%i3 + 4], %o3
+ add %i0, 2, %i0
+ ld [%i2], %i4
+
+ b L118
+ add %i3, -4, %i3
+L119:
+ ld [%i3 + 4], %g4
+ add %i2, -4, %i1
+ ld [%i2], %i5
+
+ b L120
+ add %i0, 1, %i0
+L122:
+ ld [%i3 + 4], %g1
+ ld [%i2], %i4
+
+ b L123
+ add %i3, 4, %i3
+L124:
+ add %i2, 4, %i1
+ ld [%i3 + 4], %o2
+ add %i0, -1, %i0
+ ld [%i2], %i5
+ add %i3, 8, %i3
+L131:
+ sll %o3, %o7, %g3
+L164:
+ srl %o2, %o1, %g2
+ ld [%i3], %g1
+ or %g3, %g2, %g3
+
+ cmp %g3, %i5
+ bne L163
+ ld [%i1], %i4
+L123:
+ sll %o2, %o7, %g3
+ srl %g1, %o1, %g2
+ ld [%i3 + 4], %g4
+ or %g3, %g2, %g3
+
+ cmp %g3, %i4
+ bne L163
+ ld [%i1 + 4], %i5
+L120:
+ sll %g1, %o7, %g3
+ srl %g4, %o1, %g2
+ ld [%i3 + 8], %o3
+ or %g3, %g2, %g3
+
+ cmp %g3, %i5
+ bne L163
+ ld [%i1 + 8], %i4
+L118:
+ sll %g4, %o7, %g3
+ srl %o3, %o1, %g2
+ ld [%i3 + 12], %o2
+ or %g3, %g2, %g3
+
+ cmp %g3, %i4
+ bne L163
+ ld [%i1 + 12], %i5
+
+ add %i3, 16, %i3
+ addcc %i0, -4, %i0
+ bne L131
+ add %i1, 16, %i1
+
+ sll %o3, %o7, %g3
+ srl %o2, %o1, %g2
+ or %g3, %g2, %g3
+
+ cmp %g3, %i5
+ be,a L114
+ mov 0, %i0
+
+ b,a L163
+L114:
+ cmp %i0, 0
+ bne L156
+ and %o4, -4, %g2
+
+ add %o0, %g2, %o0
+ add %i2, %g2, %i2
+ and %o4, 3, %o4
+L72:
+ cmp %o4, 0
+ be L156
+ mov 0, %i0
+
+ ldub [%o0], %g3
+L165:
+ ldub [%i2], %g2
+ add %o0, 1, %o0
+
+ subcc %g3, %g2, %i0
+ bne L156
+ add %i2, 1, %i2
+
+ addcc %o4, -1, %o4
+ bne,a L165
+ ldub [%o0], %g3
+
+ mov 0, %i0
+L156:
+ ret
+ restore
+#endif
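
Only the first half of this file is assembled (the #if 1 path); the #else body is a word-at-a-time version kept under the conditional. The live path is just a byte loop; roughly, in C (a sketch — note the count is treated as signed and the return value is the sign-extended low byte of the first difference, which is what the final sll/sra pair produces):

int sketch_memcmp(const void *s1, const void *s2, long n)
{
        const unsigned char *a = s1, *b = s2;

        while (n-- > 0) {
                int diff = *a++ - *b++;
                if (diff != 0)
                        return (signed char)diff;       /* sign-extend the low 8 bits */
        }
        return 0;
}
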
diff --git a/arch/sparc/lib/memcpy.S b/arch/sparc/lib/memcpy.S
new file mode 100644
index 000000000..c4f0394a4
--- /dev/null
+++ b/arch/sparc/lib/memcpy.S
@@ -0,0 +1,364 @@
+/* memcpy.S: Sparc optimized memcpy code.
+ *
+ * Copyright(C) 1995 Linus Torvalds
+ * Copyright(C) 1996 David S. Miller
+ * Copyright(C) 1996 Eddie C. Dost
+ * Copyright(C) 1996 Jakub Jelinek
+ *
+ * derived from:
+ * e-mail between David and Eddie.
+ */
+
+#include <asm/cprefix.h>
+#include <asm/ptrace.h>
+
+/* Both these macros have to start with exactly the same insn */
+#define MOVE_BIGCHUNK(src, dst, offset, t0, t1, t2, t3, t4, t5, t6, t7) \
+ ldd [%src + offset + 0x00], %t0; \
+ ldd [%src + offset + 0x08], %t2; \
+ ldd [%src + offset + 0x10], %t4; \
+ ldd [%src + offset + 0x18], %t6; \
+ st %t0, [%dst + offset + 0x00]; \
+ st %t1, [%dst + offset + 0x04]; \
+ st %t2, [%dst + offset + 0x08]; \
+ st %t3, [%dst + offset + 0x0c]; \
+ st %t4, [%dst + offset + 0x10]; \
+ st %t5, [%dst + offset + 0x14]; \
+ st %t6, [%dst + offset + 0x18]; \
+ st %t7, [%dst + offset + 0x1c];
+
+#define MOVE_BIGALIGNCHUNK(src, dst, offset, t0, t1, t2, t3, t4, t5, t6, t7) \
+ ldd [%src + offset + 0x00], %t0; \
+ ldd [%src + offset + 0x08], %t2; \
+ ldd [%src + offset + 0x10], %t4; \
+ ldd [%src + offset + 0x18], %t6; \
+ std %t0, [%dst + offset + 0x00]; \
+ std %t2, [%dst + offset + 0x08]; \
+ std %t4, [%dst + offset + 0x10]; \
+ std %t6, [%dst + offset + 0x18];
+
+#define MOVE_LASTCHUNK(src, dst, offset, t0, t1, t2, t3) \
+ ldd [%src - offset - 0x10], %t0; \
+ ldd [%src - offset - 0x08], %t2; \
+ st %t0, [%dst - offset - 0x10]; \
+ st %t1, [%dst - offset - 0x0c]; \
+ st %t2, [%dst - offset - 0x08]; \
+ st %t3, [%dst - offset - 0x04];
+
+#define MOVE_HALFCHUNK(src, dst, offset, t0, t1, t2, t3) \
+ lduh [%src + offset + 0x00], %t0; \
+ lduh [%src + offset + 0x02], %t1; \
+ lduh [%src + offset + 0x04], %t2; \
+ lduh [%src + offset + 0x06], %t3; \
+ sth %t0, [%dst + offset + 0x00]; \
+ sth %t1, [%dst + offset + 0x02]; \
+ sth %t2, [%dst + offset + 0x04]; \
+ sth %t3, [%dst + offset + 0x06];
+
+#define MOVE_SHORTCHUNK(src, dst, offset, t0, t1) \
+ ldub [%src - offset - 0x02], %t0; \
+ ldub [%src - offset - 0x01], %t1; \
+ stb %t0, [%dst - offset - 0x02]; \
+ stb %t1, [%dst - offset - 0x01];
+
+ .text
+ .align 4
+
+ .globl C_LABEL(__memcpy), C_LABEL(memcpy), C_LABEL(bcopy)
+ .globl C_LABEL(amemmove), C_LABEL(memmove)
+C_LABEL(bcopy):
+ mov %o0, %o3
+ mov %o1, %o0
+ mov %o3, %o1
+C_LABEL(amemmove):
+C_LABEL(memmove):
+/* This should be kept as optimized as possible */
+ cmp %o0, %o1
+ bleu 1f
+ xor %o0, %o1, %o4
+
+ add %o1, %o2, %o3
+ cmp %o3, %o0
+ bleu 2f
+ andcc %o4, 3, %g0
+
+/* But I think from now on, we can hold on. Or tell me, is memmoving
+ * overlapping regions such a nice game? */
+
+ mov %o0, %g1
+ add %o1, %o2, %o1
+ add %o0, %o2, %o0
+ sub %o1, 1, %o1
+ sub %o0, 1, %o0
+
+reverse_bytes:
+ ldub [%o1], %o4
+ subcc %o2, 1, %o2
+ stb %o4, [%o0]
+ sub %o1, 1, %o1
+ bne reverse_bytes
+ sub %o0, 1, %o0
+
+ retl
+ mov %g1, %o0
+
+/* And here start optimizing again... */
+
+dword_align:
+ andcc %o1, 1, %g0
+ be 4f
+ andcc %o1, 2, %g0
+
+ ldub [%o1], %g2
+ add %o1, 1, %o1
+ stb %g2, [%o0]
+ sub %o2, 1, %o2
+ bne 3f
+ add %o0, 1, %o0
+
+ lduh [%o1], %g2
+ add %o1, 2, %o1
+ sth %g2, [%o0]
+ sub %o2, 2, %o2
+ b 3f
+ add %o0, 2, %o0
+4:
+ lduh [%o1], %g2
+ add %o1, 2, %o1
+ sth %g2, [%o0]
+ sub %o2, 2, %o2
+ b 3f
+ add %o0, 2, %o0
+
+C_LABEL(__memcpy):
+C_LABEL(memcpy): /* %o0=dst %o1=src %o2=len */
+ xor %o0, %o1, %o4
+1:
+ andcc %o4, 3, %o5
+2:
+ bne cannot_optimize
+ cmp %o2, 15
+
+ bleu short_aligned_end
+ andcc %o1, 3, %g0
+
+ bne dword_align
+3:
+ andcc %o1, 4, %g0
+
+ be 2f
+ mov %o2, %g1
+
+ ld [%o1], %o4
+ sub %g1, 4, %g1
+ st %o4, [%o0]
+ add %o1, 4, %o1
+ add %o0, 4, %o0
+2:
+ andcc %g1, 0xffffff80, %g7
+ be 3f
+ andcc %o0, 4, %g0
+
+ be ldd_std + 4
+5:
+ MOVE_BIGCHUNK(o1, o0, 0x00, o2, o3, o4, o5, g2, g3, g4, g5)
+ MOVE_BIGCHUNK(o1, o0, 0x20, o2, o3, o4, o5, g2, g3, g4, g5)
+ MOVE_BIGCHUNK(o1, o0, 0x40, o2, o3, o4, o5, g2, g3, g4, g5)
+ MOVE_BIGCHUNK(o1, o0, 0x60, o2, o3, o4, o5, g2, g3, g4, g5)
+ subcc %g7, 128, %g7
+ add %o1, 128, %o1
+ bne 5b
+ add %o0, 128, %o0
+3:
+ andcc %g1, 0x70, %g7
+ be memcpy_table_end
+ andcc %g1, 8, %g0
+
+ sethi %hi(memcpy_table_end), %o5
+ srl %g7, 1, %o4
+ add %g7, %o4, %o4
+ add %o1, %g7, %o1
+ sub %o5, %o4, %o5
+ jmpl %o5 + %lo(memcpy_table_end), %g0
+ add %o0, %g7, %o0
+
+memcpy_table:
+ MOVE_LASTCHUNK(o1, o0, 0x60, g2, g3, g4, g5)
+ MOVE_LASTCHUNK(o1, o0, 0x50, g2, g3, g4, g5)
+ MOVE_LASTCHUNK(o1, o0, 0x40, g2, g3, g4, g5)
+ MOVE_LASTCHUNK(o1, o0, 0x30, g2, g3, g4, g5)
+ MOVE_LASTCHUNK(o1, o0, 0x20, g2, g3, g4, g5)
+ MOVE_LASTCHUNK(o1, o0, 0x10, g2, g3, g4, g5)
+ MOVE_LASTCHUNK(o1, o0, 0x00, g2, g3, g4, g5)
+
+memcpy_table_end:
+ be memcpy_last7
+ andcc %g1, 4, %g0
+
+ ldd [%o1], %g2
+ add %o0, 8, %o0
+ add %o1, 8, %o1
+ st %g2, [%o0 - 0x08]
+ st %g3, [%o0 - 0x04]
+memcpy_last7:
+ be 1f
+ andcc %g1, 2, %g0
+
+ ld [%o1], %g2
+ add %o1, 4, %o1
+ st %g2, [%o0]
+ add %o0, 4, %o0
+1:
+ be 1f
+ andcc %g1, 1, %g0
+
+ lduh [%o1], %g2
+ add %o1, 2, %o1
+ sth %g2, [%o0]
+ add %o0, 2, %o0
+1:
+ be 1f
+ nop
+
+ ldub [%o1], %g2
+ stb %g2, [%o0]
+1:
+ retl
+ nop
+
+ /* Placed here for cache reasons. */
+ .globl C_LABEL(__copy_to_user), C_LABEL(__copy_from_user)
+C_LABEL(__copy_to_user):
+ b copy_user_common
+ st %o0, [%g6 + THREAD_EX_ADDR]
+
+C_LABEL(__copy_from_user):
+ st %o1, [%g6 + THREAD_EX_ADDR]
+
+copy_user_common:
+ ld [%g6 + THREAD_EX_COUNT], %g1
+ set copy_user_failure, %g2
+ add %g1, 1, %g1
+ st %o7, [%g6 + THREAD_EX_PC]
+ st %g1, [%g6 + THREAD_EX_COUNT]
+ call C_LABEL(__memcpy)
+ st %g2, [%g6 + THREAD_EX_EXPC]
+
+copy_user_success:
+ ldd [%g6 + THREAD_EX_COUNT], %g2
+ mov 0, %o0
+ sub %g2, 1, %g1
+ jmpl %g3 + 0x8, %g0
+ st %g1, [%g6 + THREAD_EX_COUNT]
+
+copy_user_failure:
+ jmpl %g3 + 0x8, %g0
+ mov %g2, %o0
+
+ldd_std:
+ MOVE_BIGALIGNCHUNK(o1, o0, 0x00, o2, o3, o4, o5, g2, g3, g4, g5)
+ MOVE_BIGALIGNCHUNK(o1, o0, 0x20, o2, o3, o4, o5, g2, g3, g4, g5)
+ MOVE_BIGALIGNCHUNK(o1, o0, 0x40, o2, o3, o4, o5, g2, g3, g4, g5)
+ MOVE_BIGALIGNCHUNK(o1, o0, 0x60, o2, o3, o4, o5, g2, g3, g4, g5)
+ subcc %g7, 128, %g7
+ add %o1, 128, %o1
+ bne ldd_std
+ add %o0, 128, %o0
+
+ andcc %g1, 0x70, %g7
+ be memcpy_table_end
+ andcc %g1, 8, %g0
+
+ sethi %hi(memcpy_table_end), %o5
+ srl %g7, 1, %o4
+ add %g7, %o4, %o4
+ add %o1, %g7, %o1
+ sub %o5, %o4, %o5
+ jmpl %o5 + %lo(memcpy_table_end), %g0
+ add %o0, %g7, %o0
+
+cannot_optimize:
+ bleu short_end
+ cmp %o5, 2
+
+ bne byte_chunk
+ and %o2, 0xfffffff0, %o3
+
+ andcc %o1, 1, %g0
+ be 1f
+ nop
+
+ ldub [%o1], %g2
+ add %o1, 1, %o1
+ sub %o2, 1, %o2
+ stb %g2, [%o0]
+ andcc %o2, 0xfffffff0, %o3
+ be short_end
+ add %o0, 1, %o0
+1:
+ MOVE_HALFCHUNK(o1, o0, 0x00, g2, g3, g4, g5)
+ MOVE_HALFCHUNK(o1, o0, 0x08, g2, g3, g4, g5)
+ subcc %o3, 0x10, %o3
+ add %o1, 0x10, %o1
+ bne 1b
+ add %o0, 0x10, %o0
+ b 2f
+ and %o2, 0xe, %o3
+
+byte_chunk:
+ MOVE_SHORTCHUNK(o1, o0, -0x02, g2, g3)
+ MOVE_SHORTCHUNK(o1, o0, -0x04, g2, g3)
+ MOVE_SHORTCHUNK(o1, o0, -0x06, g2, g3)
+ MOVE_SHORTCHUNK(o1, o0, -0x08, g2, g3)
+ MOVE_SHORTCHUNK(o1, o0, -0x0a, g2, g3)
+ MOVE_SHORTCHUNK(o1, o0, -0x0c, g2, g3)
+ MOVE_SHORTCHUNK(o1, o0, -0x0e, g2, g3)
+ MOVE_SHORTCHUNK(o1, o0, -0x10, g2, g3)
+ subcc %o3, 0x10, %o3
+ add %o1, 0x10, %o1
+ bne byte_chunk
+ add %o0, 0x10, %o0
+
+short_end:
+ and %o2, 0xe, %o3
+2:
+ sethi %hi(short_table_end), %o5
+ sll %o3, 3, %o4
+ add %o0, %o3, %o0
+ sub %o5, %o4, %o5
+ add %o1, %o3, %o1
+ jmpl %o5 + %lo(short_table_end), %g0
+ andcc %o2, 1, %g0
+
+ MOVE_SHORTCHUNK(o1, o0, 0x0c, g2, g3)
+ MOVE_SHORTCHUNK(o1, o0, 0x0a, g2, g3)
+ MOVE_SHORTCHUNK(o1, o0, 0x08, g2, g3)
+ MOVE_SHORTCHUNK(o1, o0, 0x06, g2, g3)
+ MOVE_SHORTCHUNK(o1, o0, 0x04, g2, g3)
+ MOVE_SHORTCHUNK(o1, o0, 0x02, g2, g3)
+ MOVE_SHORTCHUNK(o1, o0, 0x00, g2, g3)
+short_table_end:
+ be 1f
+ nop
+ ldub [%o1], %g2
+ stb %g2, [%o0]
+1:
+ retl
+ nop
+
+short_aligned_end:
+ bne short_end
+ andcc %o2, 8, %g0
+
+ be 1f
+ andcc %o2, 4, %g0
+
+ ld [%o1 + 0x00], %g2
+ ld [%o1 + 0x04], %g3
+ add %o1, 8, %o1
+ st %g2, [%o0 + 0x00]
+ st %g3, [%o0 + 0x04]
+ add %o0, 8, %o0
+1:
+ b memcpy_last7
+ mov %o2, %g1
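
The memmove/bcopy entry above decides between the optimized forward copy and a simple backward byte copy; bcopy merely swaps the first two arguments before falling through. In C terms (a sketch only — the real forward path is the unrolled memcpy in this file):

#include <string.h>

void *sketch_memmove(void *dst, const void *src, unsigned long len)
{
        unsigned char *d = dst;
        const unsigned char *s = src;

        if (d <= s || s + len <= d)             /* no harmful overlap: forward copy is safe */
                return memcpy(dst, src, len);

        d += len;                               /* dst overlaps the tail of src: go backwards */
        s += len;
        while (len-- > 0)
                *--d = *--s;
        return dst;
}
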
diff --git a/arch/sparc/lib/memscan.S b/arch/sparc/lib/memscan.S
new file mode 100644
index 000000000..f334751c2
--- /dev/null
+++ b/arch/sparc/lib/memscan.S
@@ -0,0 +1,135 @@
+/* $Id: memscan.S,v 1.4 1996/09/08 02:01:20 davem Exp $
+ * memscan.S: Optimized memscan for the Sparc.
+ *
+ * Copyright (C) 1996 David S. Miller (davem@caip.rutgers.edu)
+ */
+
+#include <asm/cprefix.h>
+
+/* In essence, this is just a fancy strlen. */
+
+#define LO_MAGIC 0x01010101
+#define HI_MAGIC 0x80808080
+
+ .text
+ .align 4
+ .globl C_LABEL(__memscan_zero), C_LABEL(__memscan_generic)
+ .globl C_LABEL(memscan)
+C_LABEL(__memscan_zero):
+ /* %o0 = addr, %o1 = size */
+ cmp %o1, 0
+ bne,a 1f
+ andcc %o0, 3, %g0
+
+ retl
+ nop
+
+1:
+ be mzero_scan_word
+ sethi %hi(HI_MAGIC), %g2
+
+ ldsb [%o0], %g3
+mzero_still_not_word_aligned:
+ cmp %g3, 0
+ bne 1f
+ add %o0, 1, %o0
+
+ retl
+ sub %o0, 1, %o0
+
+1:
+ subcc %o1, 1, %o1
+ bne,a 1f
+ andcc %o0, 3, %g0
+
+ retl
+ nop
+
+1:
+ bne,a mzero_still_not_word_aligned
+ ldsb [%o0], %g3
+
+ sethi %hi(HI_MAGIC), %g2
+mzero_scan_word:
+ or %g2, %lo(HI_MAGIC), %o3
+ sethi %hi(LO_MAGIC), %g3
+ or %g3, %lo(LO_MAGIC), %o2
+mzero_next_word:
+ ld [%o0], %g2
+mzero_next_word_preloaded:
+ sub %g2, %o2, %g2
+mzero_next_word_preloaded_next:
+ andcc %g2, %o3, %g0
+ bne mzero_byte_zero
+ add %o0, 4, %o0
+
+mzero_check_out_of_fuel:
+ subcc %o1, 4, %o1
+ bg,a 1f
+ ld [%o0], %g2
+
+ retl
+ nop
+
+1:
+ b mzero_next_word_preloaded_next
+ sub %g2, %o2, %g2
+
+ /* Check every byte. */
+mzero_byte_zero:
+ ldsb [%o0 - 4], %g2
+ cmp %g2, 0
+ bne mzero_byte_one
+ sub %o0, 4, %g3
+
+ retl
+ mov %g3, %o0
+
+mzero_byte_one:
+ ldsb [%o0 - 3], %g2
+ cmp %g2, 0
+ bne,a mzero_byte_two_and_three
+ ldsb [%o0 - 2], %g2
+
+ retl
+ sub %o0, 3, %o0
+
+mzero_byte_two_and_three:
+ cmp %g2, 0
+ bne,a 1f
+ ldsb [%o0 - 1], %g2
+
+ retl
+ sub %o0, 2, %o0
+
+1:
+ cmp %g2, 0
+ bne,a mzero_next_word_preloaded
+ ld [%o0], %g2
+
+ retl
+ sub %o0, 1, %o0
+
+mzero_found_it:
+ retl
+ sub %o0, 2, %o0
+
+C_LABEL(memscan):
+C_LABEL(__memscan_generic):
+ /* %o0 = addr, %o1 = c, %o2 = size */
+ cmp %o2, 0
+ bne,a 0f
+ ldub [%o0], %g2
+
+ b,a 2f
+1:
+ ldub [%o0], %g2
+0:
+ cmp %g2, %o1
+ be 2f
+ addcc %o2, -1, %o2
+ bne 1b
+ add %o0, 1, %o0
+2:
+ retl
+ nop
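
Delay-slot games aside, __memscan_generic at the bottom behaves like this simple loop (a sketch): it returns a pointer to the first byte equal to c, or to the end of the area when there is no match. __memscan_zero is the same search specialised for c == 0, using the word-at-a-time magic-constant test that strlen.S also relies on.

void *sketch_memscan(void *addr, int c, unsigned long size)
{
        unsigned char *p = addr;

        while (size-- > 0) {
                if (*p == (unsigned char)c)
                        return p;               /* first match */
                p++;
        }
        return p;                               /* no match: one past the last byte */
}
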
diff --git a/arch/sparc/lib/memset.S b/arch/sparc/lib/memset.S
new file mode 100644
index 000000000..95691debb
--- /dev/null
+++ b/arch/sparc/lib/memset.S
@@ -0,0 +1,166 @@
+/* linux/arch/sparc/lib/memset.S: Sparc optimized memset and bzero code
+ * Hand optimized from GNU libc's memset
+ * Copyright (C) 1991,1996 Free Software Foundation
+ * Copyright (C) 1996 Jakub Jelinek (jj@sunsite.mff.cuni.cz)
+ * Copyright (C) 1996 David S. Miller (davem@caip.rutgers.edu)
+ */
+
+#include <asm/cprefix.h>
+#include <asm/ptrace.h>
+
+#define HANDLE_UNALIGNED 1
+
+ /* Store 64 bytes at (BASE + OFFSET) using value SOURCE. */
+#define ZERO_BIG_BLOCK(base, offset, source) \
+ std source, [base + offset + 0x00]; \
+ std source, [base + offset + 0x08]; \
+ std source, [base + offset + 0x10]; \
+ std source, [base + offset + 0x18]; \
+ std source, [base + offset + 0x20]; \
+ std source, [base + offset + 0x28]; \
+ std source, [base + offset + 0x30]; \
+ std source, [base + offset + 0x38];
+
+#define ZERO_LAST_BLOCKS(base, offset, source) \
+ std source, [base - offset - 0x38]; \
+ std source, [base - offset - 0x30]; \
+ std source, [base - offset - 0x28]; \
+ std source, [base - offset - 0x20]; \
+ std source, [base - offset - 0x18]; \
+ std source, [base - offset - 0x10]; \
+ std source, [base - offset - 0x08]; \
+ std source, [base - offset - 0x00];
+
+ .text
+ .align 4
+
+ .globl C_LABEL(__bzero), C_LABEL(__memset), C_LABEL(memset)
+C_LABEL(__memset):
+C_LABEL(memset):
+ and %o1, 0xff, %g3
+ sll %g3, 8, %g2
+ or %g3, %g2, %g3
+ sll %g3, 16, %g2
+ or %g3, %g2, %g3
+ b 1f
+ mov %o2, %o1
+
+#if HANDLE_UNALIGNED
+/* As this is highly unprobable, we optimize the other case (4 aligned)
+ * Define HANDLE_UNALIGNED to 0, if all the alignment work is done by
+ * the trap. Then we have to hope nobody will memset something unaligned
+ * with large counts, as this would lead to a lot of traps...
+ */
+3:
+ cmp %o2, 3
+ be 2f
+ stb %g3, [%o0]
+
+ cmp %o2, 2
+ be 2f
+ stb %g3, [%o0 + 0x01]
+
+ stb %g3, [%o0 + 0x02]
+2:
+ sub %o2, 4, %o2
+ add %o1, %o2, %o1
+ b 4f
+ sub %o0, %o2, %o0
+#endif /* HANDLE_UNALIGNED */
+
+ .globl C_LABEL(__clear_user)
+C_LABEL(__clear_user):
+ st %o0, [%g6 + THREAD_EX_ADDR]
+ ld [%g6 + THREAD_EX_COUNT], %g1
+ set clear_user_failure, %g2
+ add %g1, 1, %g1
+ st %o7, [%g6 + THREAD_EX_PC]
+ st %g1, [%g6 + THREAD_EX_COUNT]
+ call C_LABEL(__bzero)
+ st %g2, [%g6 + THREAD_EX_EXPC]
+
+clear_user_success:
+ ldd [%g6 + THREAD_EX_COUNT], %g2
+ mov 0, %o0
+ sub %g2, 1, %g1
+ jmpl %g3 + 0x8, %g0
+ st %g1, [%g6 + THREAD_EX_COUNT]
+
+clear_user_failure:
+ jmpl %g3 + 0x8, %g0
+ mov %g2, %o0
+
+C_LABEL(__bzero):
+ mov %g0, %g3
+1:
+ cmp %o1, 7
+ bleu 7f
+ mov %o0, %g1
+
+#if HANDLE_UNALIGNED
+ andcc %o0, 3, %o2
+ bne 3b
+#endif /* HANDLE_UNALIGNED */
+4:
+ andcc %o0, 4, %g0
+
+ be 2f
+ mov %g3, %g2
+
+ st %g3, [%o0]
+ sub %o1, 4, %o1
+ add %o0, 4, %o0
+2:
+ andcc %o1, 0xffffff80, %o3 ! Now everything is 8 aligned and o1 is len to run
+ be 9f
+ andcc %o1, 0x78, %o2
+4:
+ ZERO_BIG_BLOCK(%o0, 0x00, %g2)
+ subcc %o3, 128, %o3
+ ZERO_BIG_BLOCK(%o0, 0x40, %g2)
+ bne 4b
+ add %o0, 128, %o0
+
+ orcc %o2, %g0, %g0
+9:
+ be 6f
+ andcc %o1, 7, %o1
+
+ srl %o2, 1, %o3
+ set bzero_table + 64, %o4
+ sub %o4, %o3, %o4
+ jmp %o4
+ add %o0, %o2, %o0
+
+bzero_table:
+ ZERO_LAST_BLOCKS(%o0, 0x48, %g2)
+ ZERO_LAST_BLOCKS(%o0, 0x08, %g2)
+
+6:
+ be 8f
+ andcc %o1, 4, %g0
+
+ be 1f
+ andcc %o1, 2, %g0
+
+ st %g3, [%o0]
+ add %o0, 4, %o0
+1:
+ be 1f
+ andcc %o1, 1, %g0
+
+ sth %g3, [%o0]
+ add %o0, 2, %o0
+1:
+ bne,a 8f
+ stb %g3, [%o0]
+8:
+ retl
+ mov %g1,%o0
+
+/* Don't care about alignment here. It is highly
+ * unprobable and at most two traps may happen
+ */
+7:
+ b 6b
+ orcc %o1, 0, %g0
diff --git a/arch/sparc/lib/memset.c b/arch/sparc/lib/memset.c
new file mode 100644
index 000000000..1e81dff49
--- /dev/null
+++ b/arch/sparc/lib/memset.c
@@ -0,0 +1,71 @@
+/* linux/arch/sparc/lib/memset.c
+ *
+ * This is from GNU libc.
+ */
+
+#include <linux/types.h>
+
+#define op_t unsigned long int
+#define OPSIZ (sizeof(op_t))
+
+typedef unsigned char byte;
+
+void *memset(void *dstpp, char c, size_t len)
+{
+ long int dstp = (long int) dstpp;
+
+ if (len >= 8) {
+ size_t xlen;
+ op_t cccc;
+
+ cccc = (unsigned char) c;
+ cccc |= cccc << 8;
+ cccc |= cccc << 16;
+
+ /* There are at least some bytes to set.
+ No need to test for LEN == 0 in this alignment loop. */
+ while (dstp % OPSIZ != 0) {
+ ((byte *) dstp)[0] = c;
+ dstp += 1;
+ len -= 1;
+ }
+
+ /* Write 8 `op_t' per iteration until less
+ * than 8 `op_t' remain.
+ */
+ xlen = len / (OPSIZ * 8);
+ while (xlen > 0) {
+ ((op_t *) dstp)[0] = cccc;
+ ((op_t *) dstp)[1] = cccc;
+ ((op_t *) dstp)[2] = cccc;
+ ((op_t *) dstp)[3] = cccc;
+ ((op_t *) dstp)[4] = cccc;
+ ((op_t *) dstp)[5] = cccc;
+ ((op_t *) dstp)[6] = cccc;
+ ((op_t *) dstp)[7] = cccc;
+ dstp += 8 * OPSIZ;
+ xlen -= 1;
+ }
+ len %= OPSIZ * 8;
+
+ /* Write 1 `op_t' per iteration until less than
+ * OPSIZ bytes remain.
+ */
+ xlen = len / OPSIZ;
+ while (xlen > 0) {
+ ((op_t *) dstp)[0] = cccc;
+ dstp += OPSIZ;
+ xlen -= 1;
+ }
+ len %= OPSIZ;
+ }
+
+ /* Write the last few bytes. */
+ while (len > 0) {
+ ((byte *) dstp)[0] = c;
+ dstp += 1;
+ len -= 1;
+ }
+
+ return dstpp;
+}
diff --git a/arch/sparc/lib/mul.S b/arch/sparc/lib/mul.S
index e6d78f85f..83dffbc2f 100644
--- a/arch/sparc/lib/mul.S
+++ b/arch/sparc/lib/mul.S
@@ -1,4 +1,5 @@
-/* mul.S: This routine was taken from glibc-1.09 and is covered
+/* $Id: mul.S,v 1.4 1996/09/30 02:22:32 davem Exp $
+ * mul.S: This routine was taken from glibc-1.09 and is covered
* by the GNU Library General Public License Version 2.
*/
@@ -19,7 +20,7 @@
mov %o0, %y ! multiplier -> Y
andncc %o0, 0xfff, %g0 ! test bits 12..31
be Lmul_shortway ! if zero, can do it the short way
- andcc %g0, %g0, %o4 ! zero the partial product and clear N and V
+ andcc %g0, %g0, %o4 ! zero the partial product and clear N and V
/*
* Long multiply. 32 steps, followed by a final shift step.
@@ -65,23 +66,23 @@
#if 0
tst %o0
bge 1f
- rd %y, %o0
+ rd %y, %o0
! %o0 was indeed negative; fix upper 32 bits of result by subtracting
! %o1 (i.e., return %o4 - %o1 in %o1).
retl
- sub %o4, %o1, %o1
+ sub %o4, %o1, %o1
1:
retl
- mov %o4, %o1
+ mov %o4, %o1
#else
/* Faster code adapted from tege@sics.se's code for umul.S. */
sra %o0, 31, %o2 ! make mask from sign bit
and %o1, %o2, %o2 ! %o2 = 0 or %o1, depending on sign of %o0
rd %y, %o0 ! get lower half of product
retl
- sub %o4, %o2, %o1 ! subtract compensation
+ sub %o4, %o2, %o1 ! subtract compensation
! and put upper half in place
#endif
@@ -124,4 +125,11 @@ Lmul_shortway:
srl %o5, 20, %o5 ! shift low bits right 20, zero fill at left
or %o5, %o0, %o0 ! construct low part of result
retl
- sra %o4, 20, %o1 ! ... and extract high part of result
+ sra %o4, 20, %o1 ! ... and extract high part of result
+
+ .globl .mul_patch
+.mul_patch:
+ smul %o0, %o1, %o0
+ retl
+ rd %y, %o1
+ nop
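
The new .mul_patch at the end is evidently meant to be patched in on CPUs with a hardware multiplier: smul leaves the low 32 bits in %o0 and the high 32 bits in %y, read back into %o1. The sign fix-up in the "faster code" above does in two instructions what the commented-out branch did: subtract the multiplicand from the raw high word when the multiplier was negative. A sketch of both ideas in C (assuming arithmetic >> of a negative int, as sra gives; names are illustrative):

/* What .mul returns, as a widening signed multiply: low half in %o0,
 * high half in %o1. */
long long sketch_mul(int a, int b)
{
        return (long long)a * (long long)b;
}

/* The branch-free correction mirroring the sra/and/sub sequence. */
unsigned int sketch_fix_high(unsigned int raw_high, int a, int b)
{
        unsigned int mask = (unsigned int)(a >> 31);    /* 0, or all ones if a < 0 */
        return raw_high - (mask & (unsigned int)b);     /* subtract b only when a < 0 */
}
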
diff --git a/arch/sparc/lib/rem.S b/arch/sparc/lib/rem.S
index 3c0cc579b..44508148d 100644
--- a/arch/sparc/lib/rem.S
+++ b/arch/sparc/lib/rem.S
@@ -1,4 +1,5 @@
-/* rem.S: This routine was taken from glibc-1.09 and is covered
+/* $Id: rem.S,v 1.7 1996/09/30 02:22:34 davem Exp $
+ * rem.S: This routine was taken from glibc-1.09 and is covered
* by the GNU Library General Public License Version 2.
*/
@@ -46,13 +47,14 @@
! compute sign of result; if neither is negative, no problem
orcc %o1, %o0, %g0 ! either negative?
bge 2f ! no, go do the divide
- xor %o1, %o0, %g6 ! compute sign in any case
+ mov %o0, %g2 ! compute sign in any case
+
tst %o1
bge 1f
- tst %o0
+ tst %o0
! %o1 is definitely negative; %o0 might also be negative
bge 2f ! if %o0 not negative...
- sub %g0, %o1, %o1 ! in any case, make %o1 nonneg
+ sub %g0, %o1, %o1 ! in any case, make %o1 nonneg
1: ! %o0 is negative, %o1 is nonnegative
sub %g0, %o0, %o0 ! make %o0 nonnegative
2:
@@ -60,22 +62,24 @@
! Ready to divide. Compute size of quotient; scale comparand.
orcc %o1, %g0, %o5
bne 1f
- mov %o0, %o3
+ mov %o0, %o3
! Divide by zero trap. If it returns, return 0 (about as
! wrong as possible, but that is what SunOS does...).
ta ST_DIV0
retl
- clr %o0
+ clr %o0
1:
cmp %o3, %o5 ! if %o1 exceeds %o0, done
blu Lgot_result ! (and algorithm fails otherwise)
- clr %o2
+ clr %o2
+
sethi %hi(1 << (32 - 4 - 1)), %g1
+
cmp %o3, %g1
blu Lnot_really_big
- clr %o4
+ clr %o4
! Here the dividend is >= 2**(31-N) or so. We must be careful here,
! as our usual N-at-a-shot divide step will cause overflow and havoc.
@@ -85,15 +89,19 @@
1:
cmp %o5, %g1
bgeu 3f
- mov 1, %g7
+ mov 1, %g7
+
sll %o5, 4, %o5
+
b 1b
- add %o4, 1, %o4
+ add %o4, 1, %o4
! Now compute %g7.
- 2: addcc %o5, %o5, %o5
+ 2:
+ addcc %o5, %o5, %o5
+
bcc Lnot_too_big
- add %g7, 1, %g7
+ add %g7, 1, %g7
! We get here if the %o1 overflowed while shifting.
! This means that %o3 has the high-order bit set.
@@ -101,15 +109,18 @@
sll %g1, 4, %g1 ! high order bit
srl %o5, 1, %o5 ! rest of %o5
add %o5, %g1, %o5
+
b Ldo_single_div
- sub %g7, 1, %g7
+ sub %g7, 1, %g7
Lnot_too_big:
- 3: cmp %o5, %o3
+ 3:
+ cmp %o5, %o3
blu 2b
- nop
+ nop
+
be Ldo_single_div
- nop
+ nop
/* NB: these are commented out in the V8-Sparc manual as well */
/* (I do not understand this) */
! %o5 > %o3: went too far: back up 1 step
@@ -126,19 +137,23 @@
Ldo_single_div:
subcc %g7, 1, %g7
bl Lend_regular_divide
- nop
+ nop
+
sub %o3, %o5, %o3
mov 1, %o2
+
b Lend_single_divloop
- nop
+ nop
Lsingle_divloop:
sll %o2, 1, %o2
+
bl 1f
- srl %o5, 1, %o5
+ srl %o5, 1, %o5
! %o3 >= 0
sub %o3, %o5, %o3
+
b 2f
- add %o2, 1, %o2
+ add %o2, 1, %o2
1: ! %o3 < 0
add %o3, %o5, %o3
sub %o2, 1, %o2
@@ -146,7 +161,8 @@
Lend_single_divloop:
subcc %g7, 1, %g7
bge Lsingle_divloop
- tst %o3
+ tst %o3
+
b,a Lend_regular_divide
Lnot_really_big:
@@ -154,206 +170,213 @@ Lnot_really_big:
sll %o5, 4, %o5
cmp %o5, %o3
bleu 1b
- addcc %o4, 1, %o4
+ addcc %o4, 1, %o4
be Lgot_result
- sub %o4, 1, %o4
+ sub %o4, 1, %o4
tst %o3 ! set up for initial iteration
Ldivloop:
sll %o2, 4, %o2
! depth 1, accumulated bits 0
bl L.1.16
- srl %o5,1,%o5
+ srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
! depth 2, accumulated bits 1
bl L.2.17
- srl %o5,1,%o5
+ srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
! depth 3, accumulated bits 3
bl L.3.19
- srl %o5,1,%o5
+ srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
! depth 4, accumulated bits 7
bl L.4.23
- srl %o5,1,%o5
+ srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
- b 9f
- add %o2, (7*2+1), %o2
+
+ b 9f
+ add %o2, (7*2+1), %o2
L.4.23:
! remainder is negative
addcc %o3,%o5,%o3
- b 9f
- add %o2, (7*2-1), %o2
-
+ b 9f
+ add %o2, (7*2-1), %o2
L.3.19:
! remainder is negative
addcc %o3,%o5,%o3
! depth 4, accumulated bits 5
bl L.4.21
- srl %o5,1,%o5
+ srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
- b 9f
- add %o2, (5*2+1), %o2
+ b 9f
+ add %o2, (5*2+1), %o2
L.4.21:
! remainder is negative
addcc %o3,%o5,%o3
- b 9f
- add %o2, (5*2-1), %o2
-
-
+ b 9f
+ add %o2, (5*2-1), %o2
L.2.17:
! remainder is negative
addcc %o3,%o5,%o3
! depth 3, accumulated bits 1
bl L.3.17
- srl %o5,1,%o5
+ srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
! depth 4, accumulated bits 3
bl L.4.19
- srl %o5,1,%o5
+ srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
- b 9f
- add %o2, (3*2+1), %o2
-
+ b 9f
+ add %o2, (3*2+1), %o2
+
L.4.19:
! remainder is negative
addcc %o3,%o5,%o3
- b 9f
- add %o2, (3*2-1), %o2
-
-
+ b 9f
+ add %o2, (3*2-1), %o2
+
L.3.17:
! remainder is negative
addcc %o3,%o5,%o3
! depth 4, accumulated bits 1
bl L.4.17
- srl %o5,1,%o5
+ srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
- b 9f
- add %o2, (1*2+1), %o2
-
+ b 9f
+ add %o2, (1*2+1), %o2
+
L.4.17:
! remainder is negative
addcc %o3,%o5,%o3
- b 9f
- add %o2, (1*2-1), %o2
-
-
-
-
+ b 9f
+ add %o2, (1*2-1), %o2
+
L.1.16:
! remainder is negative
addcc %o3,%o5,%o3
! depth 2, accumulated bits -1
bl L.2.15
- srl %o5,1,%o5
+ srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
! depth 3, accumulated bits -1
bl L.3.15
- srl %o5,1,%o5
+ srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
! depth 4, accumulated bits -1
bl L.4.15
- srl %o5,1,%o5
+ srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
- b 9f
- add %o2, (-1*2+1), %o2
-
+ b 9f
+ add %o2, (-1*2+1), %o2
+
L.4.15:
! remainder is negative
addcc %o3,%o5,%o3
- b 9f
- add %o2, (-1*2-1), %o2
-
-
+ b 9f
+ add %o2, (-1*2-1), %o2
+
L.3.15:
! remainder is negative
addcc %o3,%o5,%o3
! depth 4, accumulated bits -3
bl L.4.13
- srl %o5,1,%o5
+ srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
- b 9f
- add %o2, (-3*2+1), %o2
-
+ b 9f
+ add %o2, (-3*2+1), %o2
+
L.4.13:
! remainder is negative
addcc %o3,%o5,%o3
- b 9f
- add %o2, (-3*2-1), %o2
-
-
-
+ b 9f
+ add %o2, (-3*2-1), %o2
+
L.2.15:
! remainder is negative
addcc %o3,%o5,%o3
! depth 3, accumulated bits -3
bl L.3.13
- srl %o5,1,%o5
+ srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
! depth 4, accumulated bits -5
bl L.4.11
- srl %o5,1,%o5
+ srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
- b 9f
- add %o2, (-5*2+1), %o2
-
+ b 9f
+ add %o2, (-5*2+1), %o2
+
L.4.11:
! remainder is negative
addcc %o3,%o5,%o3
- b 9f
- add %o2, (-5*2-1), %o2
-
-
+ b 9f
+ add %o2, (-5*2-1), %o2
+
+
L.3.13:
! remainder is negative
addcc %o3,%o5,%o3
! depth 4, accumulated bits -7
bl L.4.9
- srl %o5,1,%o5
+ srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
- b 9f
- add %o2, (-7*2+1), %o2
-
+ b 9f
+ add %o2, (-7*2+1), %o2
+
L.4.9:
! remainder is negative
addcc %o3,%o5,%o3
- b 9f
- add %o2, (-7*2-1), %o2
-
-
-
-
+ b 9f
+ add %o2, (-7*2-1), %o2
+
9:
Lend_regular_divide:
subcc %o4, 1, %o4
bge Ldivloop
- tst %o3
+ tst %o3
+
bl,a Lgot_result
! non-restoring fixup here (one instruction only!)
add %o3, %o1, %o3
-
Lgot_result:
+ ! check to see if answer should be < 0
+ tst %g2
+ bl,a 1f
+ sub %g0, %o3, %o3
+1:
+ retl
+ mov %o3, %o0
+ .globl .rem_patch
+.rem_patch:
+ sra %o0, 0x1f, %o4
+ wr %o4, 0x0, %y
+ nop
+ nop
+ nop
+ sdivcc %o0, %o1, %o2
+ bvs,a 1f
+ xnor %o2, %g0, %o2
+1: smul %o2, %o1, %o2
retl
- mov %o3, %o0
+ sub %o0, %o2, %o0
+ nop
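
Two things change in rem.S: the saved sign is now the dividend's (%o0 kept in %g2, instead of the xor of both signs in %g6), because a remainder takes the sign of the dividend, and a .rem_patch sequence is added, apparently for CPUs with hardware divide, which derives the remainder from the quotient. A C sketch of that patched path (b != 0 and no INT_MIN / -1 overflow assumed):

int sketch_rem(int a, int b)
{
        int q = a / b;          /* what sdivcc computes, truncating toward zero */
        return a - q * b;       /* the smul/sub pair: remainder with the sign of a */
}

For example, sketch_rem(-7, 2) is -1 and sketch_rem(7, -2) is 1, matching the fixup at Lgot_result that negates %o3 only when the original dividend was negative.
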
diff --git a/arch/sparc/lib/sdiv.S b/arch/sparc/lib/sdiv.S
index 2fa7a9794..e0ad80b6f 100644
--- a/arch/sparc/lib/sdiv.S
+++ b/arch/sparc/lib/sdiv.S
@@ -1,4 +1,5 @@
-/* sdiv.S: This routine was taken from glibc-1.09 and is covered
+/* $Id: sdiv.S,v 1.6 1996/10/02 17:37:00 davem Exp $
+ * sdiv.S: This routine was taken from glibc-1.09 and is covered
* by the GNU Library General Public License Version 2.
*/
@@ -46,13 +47,14 @@
! compute sign of result; if neither is negative, no problem
orcc %o1, %o0, %g0 ! either negative?
bge 2f ! no, go do the divide
- xor %o1, %o0, %g6 ! compute sign in any case
+ xor %o1, %o0, %g2 ! compute sign in any case
+
tst %o1
bge 1f
- tst %o0
+ tst %o0
! %o1 is definitely negative; %o0 might also be negative
bge 2f ! if %o0 not negative...
- sub %g0, %o1, %o1 ! in any case, make %o1 nonneg
+ sub %g0, %o1, %o1 ! in any case, make %o1 nonneg
1: ! %o0 is negative, %o1 is nonnegative
sub %g0, %o0, %o0 ! make %o0 nonnegative
2:
@@ -60,22 +62,24 @@
! Ready to divide. Compute size of quotient; scale comparand.
orcc %o1, %g0, %o5
bne 1f
- mov %o0, %o3
+ mov %o0, %o3
! Divide by zero trap. If it returns, return 0 (about as
! wrong as possible, but that is what SunOS does...).
ta ST_DIV0
retl
- clr %o0
+ clr %o0
1:
cmp %o3, %o5 ! if %o1 exceeds %o0, done
blu Lgot_result ! (and algorithm fails otherwise)
- clr %o2
+ clr %o2
+
sethi %hi(1 << (32 - 4 - 1)), %g1
+
cmp %o3, %g1
blu Lnot_really_big
- clr %o4
+ clr %o4
! Here the dividend is >= 2**(31-N) or so. We must be careful here,
! as our usual N-at-a-shot divide step will cause overflow and havoc.
@@ -85,15 +89,18 @@
1:
cmp %o5, %g1
bgeu 3f
- mov 1, %g7
+ mov 1, %g7
+
sll %o5, 4, %o5
+
b 1b
- add %o4, 1, %o4
+ add %o4, 1, %o4
! Now compute %g7.
- 2: addcc %o5, %o5, %o5
+ 2:
+ addcc %o5, %o5, %o5
bcc Lnot_too_big
- add %g7, 1, %g7
+ add %g7, 1, %g7
! We get here if the %o1 overflowed while shifting.
! This means that %o3 has the high-order bit set.
@@ -101,15 +108,18 @@
sll %g1, 4, %g1 ! high order bit
srl %o5, 1, %o5 ! rest of %o5
add %o5, %g1, %o5
+
b Ldo_single_div
- sub %g7, 1, %g7
+ sub %g7, 1, %g7
Lnot_too_big:
- 3: cmp %o5, %o3
+ 3:
+ cmp %o5, %o3
blu 2b
- nop
+ nop
+
be Ldo_single_div
- nop
+ nop
/* NB: these are commented out in the V8-Sparc manual as well */
/* (I do not understand this) */
! %o5 > %o3: went too far: back up 1 step
@@ -126,19 +136,23 @@
Ldo_single_div:
subcc %g7, 1, %g7
bl Lend_regular_divide
- nop
+ nop
+
sub %o3, %o5, %o3
mov 1, %o2
+
b Lend_single_divloop
- nop
+ nop
Lsingle_divloop:
sll %o2, 1, %o2
+
bl 1f
- srl %o5, 1, %o5
+ srl %o5, 1, %o5
! %o3 >= 0
sub %o3, %o5, %o3
+
b 2f
- add %o2, 1, %o2
+ add %o2, 1, %o2
1: ! %o3 < 0
add %o3, %o5, %o3
sub %o2, 1, %o2
@@ -146,7 +160,8 @@
Lend_single_divloop:
subcc %g7, 1, %g7
bge Lsingle_divloop
- tst %o3
+ tst %o3
+
b,a Lend_regular_divide
Lnot_really_big:
@@ -154,83 +169,81 @@ Lnot_really_big:
sll %o5, 4, %o5
cmp %o5, %o3
bleu 1b
- addcc %o4, 1, %o4
+ addcc %o4, 1, %o4
+
be Lgot_result
- sub %o4, 1, %o4
+ sub %o4, 1, %o4
tst %o3 ! set up for initial iteration
Ldivloop:
sll %o2, 4, %o2
! depth 1, accumulated bits 0
bl L.1.16
- srl %o5,1,%o5
+ srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
! depth 2, accumulated bits 1
bl L.2.17
- srl %o5,1,%o5
+ srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
! depth 3, accumulated bits 3
bl L.3.19
- srl %o5,1,%o5
+ srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
! depth 4, accumulated bits 7
bl L.4.23
- srl %o5,1,%o5
+ srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
- b 9f
- add %o2, (7*2+1), %o2
-
+ b 9f
+ add %o2, (7*2+1), %o2
+
L.4.23:
! remainder is negative
addcc %o3,%o5,%o3
- b 9f
- add %o2, (7*2-1), %o2
-
-
+ b 9f
+ add %o2, (7*2-1), %o2
+
L.3.19:
! remainder is negative
addcc %o3,%o5,%o3
! depth 4, accumulated bits 5
bl L.4.21
- srl %o5,1,%o5
+ srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
- b 9f
- add %o2, (5*2+1), %o2
-
+ b 9f
+ add %o2, (5*2+1), %o2
+
L.4.21:
! remainder is negative
addcc %o3,%o5,%o3
- b 9f
- add %o2, (5*2-1), %o2
-
-
-
+ b 9f
+ add %o2, (5*2-1), %o2
+
L.2.17:
! remainder is negative
addcc %o3,%o5,%o3
! depth 3, accumulated bits 1
bl L.3.17
- srl %o5,1,%o5
+ srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
! depth 4, accumulated bits 3
bl L.4.19
- srl %o5,1,%o5
+ srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
- b 9f
- add %o2, (3*2+1), %o2
-
+ b 9f
+ add %o2, (3*2+1), %o2
+
L.4.19:
! remainder is negative
addcc %o3,%o5,%o3
- b 9f
- add %o2, (3*2-1), %o2
+ b 9f
+ add %o2, (3*2-1), %o2
L.3.17:
@@ -238,126 +251,129 @@ L.3.17:
addcc %o3,%o5,%o3
! depth 4, accumulated bits 1
bl L.4.17
- srl %o5,1,%o5
+ srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
- b 9f
- add %o2, (1*2+1), %o2
-
+ b 9f
+ add %o2, (1*2+1), %o2
+
L.4.17:
! remainder is negative
addcc %o3,%o5,%o3
- b 9f
- add %o2, (1*2-1), %o2
-
-
-
-
+ b 9f
+ add %o2, (1*2-1), %o2
+
L.1.16:
! remainder is negative
addcc %o3,%o5,%o3
! depth 2, accumulated bits -1
bl L.2.15
- srl %o5,1,%o5
+ srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
! depth 3, accumulated bits -1
bl L.3.15
- srl %o5,1,%o5
+ srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
! depth 4, accumulated bits -1
bl L.4.15
- srl %o5,1,%o5
+ srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
- b 9f
- add %o2, (-1*2+1), %o2
-
+ b 9f
+ add %o2, (-1*2+1), %o2
+
L.4.15:
! remainder is negative
addcc %o3,%o5,%o3
- b 9f
- add %o2, (-1*2-1), %o2
-
-
+ b 9f
+ add %o2, (-1*2-1), %o2
+
L.3.15:
! remainder is negative
addcc %o3,%o5,%o3
! depth 4, accumulated bits -3
bl L.4.13
- srl %o5,1,%o5
+ srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
- b 9f
- add %o2, (-3*2+1), %o2
-
+ b 9f
+ add %o2, (-3*2+1), %o2
+
L.4.13:
! remainder is negative
addcc %o3,%o5,%o3
- b 9f
- add %o2, (-3*2-1), %o2
-
-
-
+ b 9f
+ add %o2, (-3*2-1), %o2
+
L.2.15:
! remainder is negative
addcc %o3,%o5,%o3
! depth 3, accumulated bits -3
bl L.3.13
- srl %o5,1,%o5
+ srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
! depth 4, accumulated bits -5
bl L.4.11
- srl %o5,1,%o5
+ srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
- b 9f
- add %o2, (-5*2+1), %o2
-
+ b 9f
+ add %o2, (-5*2+1), %o2
+
L.4.11:
! remainder is negative
addcc %o3,%o5,%o3
- b 9f
- add %o2, (-5*2-1), %o2
-
-
+ b 9f
+ add %o2, (-5*2-1), %o2
+
L.3.13:
! remainder is negative
addcc %o3,%o5,%o3
! depth 4, accumulated bits -7
bl L.4.9
- srl %o5,1,%o5
+ srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
- b 9f
- add %o2, (-7*2+1), %o2
-
+ b 9f
+ add %o2, (-7*2+1), %o2
+
L.4.9:
! remainder is negative
addcc %o3,%o5,%o3
- b 9f
- add %o2, (-7*2-1), %o2
-
-
-
-
+ b 9f
+ add %o2, (-7*2-1), %o2
+
9:
Lend_regular_divide:
subcc %o4, 1, %o4
bge Ldivloop
- tst %o3
+ tst %o3
+
bl,a Lgot_result
! non-restoring fixup here (one instruction only!)
sub %o2, 1, %o2
-
Lgot_result:
! check to see if answer should be < 0
- tst %g6
+ tst %g2
bl,a 1f
- sub %g0, %o2, %o2
+ sub %g0, %o2, %o2
1:
retl
- mov %o2, %o0
+ mov %o2, %o0
+
+ .globl .div_patch
+.div_patch:
+ sra %o0, 0x1f, %o2
+ wr %o2, 0x0, %y
+ nop
+ nop
+ nop
+ sdivcc %o0, %o1, %o0
+ bvs,a 1f
+ xnor %o0, %g0, %o0
+1: retl
+ nop
diff --git a/arch/sparc/lib/strlen.S b/arch/sparc/lib/strlen.S
new file mode 100644
index 000000000..95321d4c5
--- /dev/null
+++ b/arch/sparc/lib/strlen.S
@@ -0,0 +1,88 @@
+/* strlen.S: Sparc optimized strlen().
+ *
+ * This was hand optimized by davem@caip.rutgers.edu from
+ * the C-code in GNU-libc.
+ */
+
+#include <asm/cprefix.h>
+
+#define LO_MAGIC 0x01010101
+#define HI_MAGIC 0x80808080
+
+ .align 4
+ .global C_LABEL(strlen)
+C_LABEL(strlen):
+ mov %o0, %o1
+ andcc %o0, 3, %g0 ! and with %o0 so no dependency problems
+ be scan_words
+ sethi %hi(HI_MAGIC), %g2 ! common case and most Sparcs predict taken
+
+ ldsb [%o0], %g2
+still_not_word_aligned:
+ cmp %g2, 0
+ bne,a 1f
+ add %o0, 1, %o0
+
+ /* Ok, so there are tons of quick interlocks above for the
+ * < 4 length string unaligned... not too common so I'm not
+ * very concerned.
+ */
+ retl
+ sub %o0, %o1, %o0
+
+1:
+ andcc %o0, 3, %g0
+ bne,a still_not_word_aligned
+ ldsb [%o0], %g2
+
+ /* HyperSparc executes each sethi/or pair in 1 cycle. */
+ sethi %hi(HI_MAGIC), %g2
+scan_words:
+ or %g2, %lo(HI_MAGIC), %o3
+ sethi %hi(LO_MAGIC), %g3
+ or %g3, %lo(LO_MAGIC), %o2
+next_word:
+ ld [%o0], %g2 ! no dependencies
+next_word_preloaded:
+ sub %g2, %o2, %g2 ! lots of locks here
+	andcc	%g2, %o3, %g0	! and I don't like it...
+ be next_word
+ add %o0, 4, %o0
+
+ /* Check every byte. */
+byte_zero:
+ ldsb [%o0 - 0x4], %g2
+ cmp %g2, 0
+ bne byte_one
+ add %o0, -4, %g3
+
+ retl
+ sub %g3, %o1, %o0
+
+byte_one:
+ ldsb [%o0 - 0x3], %g2
+ cmp %g2, 0
+ bne,a byte_two_and_three
+ ldsb [%o0 - 0x2], %g2
+
+ sub %g3, %o1, %o0
+ retl
+ add %o0, 1, %o0
+
+byte_two_and_three:
+ cmp %g2, 0
+ be,a found_it
+ sub %g3, %o1, %o0
+
+ ldsb [%o0 - 0x1], %g2
+ cmp %g2, 0
+ bne,a next_word_preloaded
+ ld [%o0], %g2
+
+ sub %g3, %o1, %o0
+ retl
+ add %o0, 3, %o0
+
+found_it:
+ retl
+ add %o0, 2, %o0
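
The word-at-a-time scan above is the classic LO_MAGIC/HI_MAGIC trick: subtracting 0x01010101 from a word borrows out of any all-zero byte and lights up that byte's 0x80 bit, so "(w - LO_MAGIC) & HI_MAGIC" can only be zero when no byte of w is zero. As in the assembly, the test can also fire on bytes of 0x81 and above, which is why the per-byte checks make the final call. A rough C rendering under those assumptions (strlen_sketch is my name, not kernel code):

	#include <stddef.h>
	#include <stdint.h>
	#include <string.h>

	#define LO_MAGIC 0x01010101u
	#define HI_MAGIC 0x80808080u

	static size_t strlen_sketch(const char *s)
	{
		const char *p = s;
		uint32_t w;

		while ((uintptr_t)p & 3) {	/* get word aligned, as still_not_word_aligned does */
			if (*p == '\0')
				return p - s;
			p++;
		}

		for (;;) {
			memcpy(&w, p, 4);			/* one aligned word per iteration */
			if ((w - LO_MAGIC) & HI_MAGIC) {	/* possible zero byte in this word */
				int i;
				for (i = 0; i < 4; i++)		/* byte_zero .. byte_two_and_three */
					if (p[i] == '\0')
						return (size_t)(p - s) + i;
			}
			p += 4;		/* false alarm (e.g. a 0x81 byte): keep scanning */
		}
	}
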
diff --git a/arch/sparc/lib/strncmp.S b/arch/sparc/lib/strncmp.S
new file mode 100644
index 000000000..2f26b1b4a
--- /dev/null
+++ b/arch/sparc/lib/strncmp.S
@@ -0,0 +1,120 @@
+/* $Id: strncmp.S,v 1.2 1996/09/09 02:47:20 davem Exp $
+ * strncmp.S: Hand optimized Sparc assembly of GCC output from GNU libc
+ * generic strncmp routine.
+ */
+
+#include <asm/cprefix.h>
+
+ .text
+ .align 4
+ .global C_LABEL(__strncmp), C_LABEL(strncmp)
+C_LABEL(__strncmp):
+C_LABEL(strncmp):
+ mov %o0, %g3
+ mov 0, %o3
+
+ cmp %o2, 3
+ ble 7f
+ mov 0, %g2
+
+ sra %o2, 2, %o4
+ ldub [%g3], %o3
+
+0:
+ ldub [%o1], %g2
+ add %g3, 1, %g3
+ and %o3, 0xff, %o0
+
+ cmp %o0, 0
+ be 8f
+ add %o1, 1, %o1
+
+ cmp %o0, %g2
+ be,a 1f
+ ldub [%g3], %o3
+
+ retl
+ sub %o0, %g2, %o0
+
+1:
+ ldub [%o1], %g2
+	add	%g3, 1, %g3
+ and %o3, 0xff, %o0
+
+ cmp %o0, 0
+ be 8f
+ add %o1, 1, %o1
+
+ cmp %o0, %g2
+ be,a 1f
+ ldub [%g3], %o3
+
+ retl
+ sub %o0, %g2, %o0
+
+1:
+ ldub [%o1], %g2
+ add %g3, 1, %g3
+ and %o3, 0xff, %o0
+
+ cmp %o0, 0
+ be 8f
+ add %o1, 1, %o1
+
+ cmp %o0, %g2
+ be,a 1f
+ ldub [%g3], %o3
+
+ retl
+ sub %o0, %g2, %o0
+
+1:
+ ldub [%o1], %g2
+ add %g3, 1, %g3
+ and %o3, 0xff, %o0
+
+ cmp %o0, 0
+ be 8f
+ add %o1, 1, %o1
+
+ cmp %o0, %g2
+ be 1f
+ add %o4, -1, %o4
+
+ retl
+ sub %o0, %g2, %o0
+
+1:
+
+ cmp %o4, 0
+ bg,a 0b
+ ldub [%g3], %o3
+
+ b 7f
+ and %o2, 3, %o2
+
+9:
+ ldub [%o1], %g2
+ add %g3, 1, %g3
+ and %o3, 0xff, %o0
+
+ cmp %o0, 0
+ be 8f
+ add %o1, 1, %o1
+
+ cmp %o0, %g2
+ be 7f
+ add %o2, -1, %o2
+
+8:
+ retl
+ sub %o0, %g2, %o0
+
+7:
+ cmp %o2, 0
+ bg,a 9b
+ ldub [%g3], %o3
+
+ and %g2, 0xff, %o0
+ retl
+ sub %o3, %o0, %o0
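
Behaviourally this is the generic GNU libc strncmp the header mentions, just unrolled four bytes per pass, with the leftover bytes handled at labels 9 and 7. The plain C contract it preserves (sketch, my naming):

	#include <stddef.h>

	/* Compare at most n bytes; stop at the first difference or at a
	 * terminating NUL, returning the difference of the mismatching
	 * bytes as unsigned chars. */
	static int strncmp_sketch(const char *s1, const char *s2, size_t n)
	{
		while (n--) {
			unsigned char c1 = (unsigned char)*s1++;
			unsigned char c2 = (unsigned char)*s2++;

			if (c1 != c2)
				return c1 - c2;
			if (c1 == '\0')
				return 0;	/* both strings ended together */
		}
		return 0;			/* first n bytes were equal */
	}
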
diff --git a/arch/sparc/lib/strncpy_from_user.S b/arch/sparc/lib/strncpy_from_user.S
new file mode 100644
index 000000000..3dd2bd71c
--- /dev/null
+++ b/arch/sparc/lib/strncpy_from_user.S
@@ -0,0 +1,49 @@
+/* strncpy_from_user.S: Sparc strncpy from userspace.
+ *
+ * Copyright (C) 1996 David S. Miller
+ */
+
+#include <asm/cprefix.h>
+#include <asm/ptrace.h>
+
+ .text
+ .align 4
+
+ /* Must return:
+ *
+ * -EFAULT for an exception
+ * count if we hit the buffer limit
+ * bytes copied if we hit a null byte
+ */
+
+ .globl C_LABEL(__strncpy_from_user)
+C_LABEL(__strncpy_from_user):
+ /* %o0=dest, %o1=src, %o2=count */
+ ld [%g6 + THREAD_EX_COUNT], %g1
+ set strncpy_user_failure, %g2
+ add %g1, 1, %g3
+ st %o7, [%g6 + THREAD_EX_PC]
+ st %g3, [%g6 + THREAD_EX_COUNT]
+ st %g2, [%g6 + THREAD_EX_EXPC]
+
+ mov %o2, %o3
+1:
+ subcc %o2, 1, %o2
+ bneg 2f
+ nop
+
+ ldub [%o1], %o4
+ add %o0, 1, %o0
+ cmp %o4, 0
+ add %o1, 1, %o1
+ bne 1b
+ stb %o4, [%o0 - 1]
+2:
+ add %o2, 1, %o0
+ st %g1, [%g6 + THREAD_EX_COUNT]
+ retl
+ sub %o3, %o0, %o0
+
+strncpy_user_failure:
+ jmpl %g3 + 0x8, %g0
+ mov %g1, %o0
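
The three-way return contract in the comment block is the whole point of this routine; the THREAD_EX_* stores register a fixup address so that a faulting user load resumes at strncpy_user_failure instead of oopsing. A C sketch of the same contract, where fetch_user_byte() is a hypothetical helper standing in for that exception machinery (it is not a real kernel interface):

	#include <errno.h>
	#include <stddef.h>

	/* Hypothetical helper: returns the byte at a user address,
	 * or a negative value if the access faults. */
	extern int fetch_user_byte(const char *uaddr);

	static long strncpy_from_user_sketch(char *dst, const char *src, long count)
	{
		long copied = 0;

		while (count--) {
			int c = fetch_user_byte(src + copied);

			if (c < 0)
				return -EFAULT;		/* exception while reading user memory */
			dst[copied] = (char)c;
			if (c == '\0')
				return copied;		/* bytes copied, NUL not counted */
			copied++;
		}
		return copied;				/* buffer limit hit: equals count */
	}
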
diff --git a/arch/sparc/lib/udiv.S b/arch/sparc/lib/udiv.S
index 53cfeac90..2abfc6b0f 100644
--- a/arch/sparc/lib/udiv.S
+++ b/arch/sparc/lib/udiv.S
@@ -1,4 +1,5 @@
-/* udiv.S: This routine was taken from glibc-1.09 and is covered
+/* $Id: udiv.S,v 1.4 1996/09/30 02:22:38 davem Exp $
+ * udiv.S: This routine was taken from glibc-1.09 and is covered
* by the GNU Library General Public License Version 2.
*/
@@ -47,22 +48,24 @@
! Ready to divide. Compute size of quotient; scale comparand.
orcc %o1, %g0, %o5
bne 1f
- mov %o0, %o3
+ mov %o0, %o3
! Divide by zero trap. If it returns, return 0 (about as
! wrong as possible, but that is what SunOS does...).
ta ST_DIV0
retl
- clr %o0
+ clr %o0
1:
cmp %o3, %o5 ! if %o1 exceeds %o0, done
blu Lgot_result ! (and algorithm fails otherwise)
- clr %o2
+ clr %o2
+
sethi %hi(1 << (32 - 4 - 1)), %g1
+
cmp %o3, %g1
blu Lnot_really_big
- clr %o4
+ clr %o4
! Here the dividend is >= 2**(31-N) or so. We must be careful here,
! as our usual N-at-a-shot divide step will cause overflow and havoc.
@@ -72,15 +75,18 @@
1:
cmp %o5, %g1
bgeu 3f
- mov 1, %g7
+ mov 1, %g7
+
sll %o5, 4, %o5
+
b 1b
- add %o4, 1, %o4
+ add %o4, 1, %o4
! Now compute %g7.
- 2: addcc %o5, %o5, %o5
+ 2:
+ addcc %o5, %o5, %o5
bcc Lnot_too_big
- add %g7, 1, %g7
+ add %g7, 1, %g7
! We get here if the %o1 overflowed while shifting.
! This means that %o3 has the high-order bit set.
@@ -88,15 +94,18 @@
sll %g1, 4, %g1 ! high order bit
srl %o5, 1, %o5 ! rest of %o5
add %o5, %g1, %o5
+
b Ldo_single_div
- sub %g7, 1, %g7
+ sub %g7, 1, %g7
Lnot_too_big:
- 3: cmp %o5, %o3
+ 3:
+ cmp %o5, %o3
blu 2b
- nop
+ nop
+
be Ldo_single_div
- nop
+ nop
/* NB: these are commented out in the V8-Sparc manual as well */
/* (I do not understand this) */
! %o5 > %o3: went too far: back up 1 step
@@ -113,19 +122,21 @@
Ldo_single_div:
subcc %g7, 1, %g7
bl Lend_regular_divide
- nop
+ nop
+
sub %o3, %o5, %o3
mov 1, %o2
+
b Lend_single_divloop
- nop
+ nop
Lsingle_divloop:
sll %o2, 1, %o2
bl 1f
- srl %o5, 1, %o5
+ srl %o5, 1, %o5
! %o3 >= 0
sub %o3, %o5, %o3
b 2f
- add %o2, 1, %o2
+ add %o2, 1, %o2
1: ! %o3 < 0
add %o3, %o5, %o3
sub %o2, 1, %o2
@@ -133,214 +144,212 @@
Lend_single_divloop:
subcc %g7, 1, %g7
bge Lsingle_divloop
- tst %o3
+ tst %o3
+
b,a Lend_regular_divide
Lnot_really_big:
1:
sll %o5, 4, %o5
+
cmp %o5, %o3
bleu 1b
- addcc %o4, 1, %o4
+ addcc %o4, 1, %o4
+
be Lgot_result
- sub %o4, 1, %o4
+ sub %o4, 1, %o4
tst %o3 ! set up for initial iteration
Ldivloop:
sll %o2, 4, %o2
! depth 1, accumulated bits 0
bl L.1.16
- srl %o5,1,%o5
+ srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
! depth 2, accumulated bits 1
bl L.2.17
- srl %o5,1,%o5
+ srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
! depth 3, accumulated bits 3
bl L.3.19
- srl %o5,1,%o5
+ srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
! depth 4, accumulated bits 7
bl L.4.23
- srl %o5,1,%o5
+ srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
- b 9f
- add %o2, (7*2+1), %o2
-
+ b 9f
+ add %o2, (7*2+1), %o2
+
L.4.23:
! remainder is negative
addcc %o3,%o5,%o3
- b 9f
- add %o2, (7*2-1), %o2
-
-
+ b 9f
+ add %o2, (7*2-1), %o2
+
L.3.19:
! remainder is negative
addcc %o3,%o5,%o3
! depth 4, accumulated bits 5
bl L.4.21
- srl %o5,1,%o5
+ srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
- b 9f
- add %o2, (5*2+1), %o2
-
+ b 9f
+ add %o2, (5*2+1), %o2
+
L.4.21:
! remainder is negative
addcc %o3,%o5,%o3
- b 9f
- add %o2, (5*2-1), %o2
-
-
-
+ b 9f
+ add %o2, (5*2-1), %o2
+
L.2.17:
! remainder is negative
addcc %o3,%o5,%o3
! depth 3, accumulated bits 1
bl L.3.17
- srl %o5,1,%o5
+ srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
! depth 4, accumulated bits 3
bl L.4.19
- srl %o5,1,%o5
+ srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
- b 9f
- add %o2, (3*2+1), %o2
-
+ b 9f
+ add %o2, (3*2+1), %o2
+
L.4.19:
! remainder is negative
addcc %o3,%o5,%o3
- b 9f
- add %o2, (3*2-1), %o2
-
-
+ b 9f
+ add %o2, (3*2-1), %o2
+
L.3.17:
! remainder is negative
addcc %o3,%o5,%o3
! depth 4, accumulated bits 1
bl L.4.17
- srl %o5,1,%o5
+ srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
- b 9f
- add %o2, (1*2+1), %o2
-
+ b 9f
+ add %o2, (1*2+1), %o2
+
L.4.17:
! remainder is negative
addcc %o3,%o5,%o3
- b 9f
- add %o2, (1*2-1), %o2
-
-
-
-
+ b 9f
+ add %o2, (1*2-1), %o2
+
L.1.16:
! remainder is negative
addcc %o3,%o5,%o3
! depth 2, accumulated bits -1
bl L.2.15
- srl %o5,1,%o5
+ srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
! depth 3, accumulated bits -1
bl L.3.15
- srl %o5,1,%o5
+ srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
! depth 4, accumulated bits -1
bl L.4.15
- srl %o5,1,%o5
+ srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
- b 9f
- add %o2, (-1*2+1), %o2
-
+ b 9f
+ add %o2, (-1*2+1), %o2
+
L.4.15:
! remainder is negative
addcc %o3,%o5,%o3
- b 9f
- add %o2, (-1*2-1), %o2
-
-
+ b 9f
+ add %o2, (-1*2-1), %o2
+
L.3.15:
! remainder is negative
addcc %o3,%o5,%o3
! depth 4, accumulated bits -3
bl L.4.13
- srl %o5,1,%o5
+ srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
- b 9f
- add %o2, (-3*2+1), %o2
-
+ b 9f
+ add %o2, (-3*2+1), %o2
+
L.4.13:
! remainder is negative
addcc %o3,%o5,%o3
- b 9f
- add %o2, (-3*2-1), %o2
-
-
-
+ b 9f
+ add %o2, (-3*2-1), %o2
+
L.2.15:
! remainder is negative
addcc %o3,%o5,%o3
! depth 3, accumulated bits -3
bl L.3.13
- srl %o5,1,%o5
+ srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
! depth 4, accumulated bits -5
bl L.4.11
- srl %o5,1,%o5
+ srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
- b 9f
- add %o2, (-5*2+1), %o2
-
+ b 9f
+ add %o2, (-5*2+1), %o2
+
L.4.11:
! remainder is negative
addcc %o3,%o5,%o3
- b 9f
- add %o2, (-5*2-1), %o2
-
-
+ b 9f
+ add %o2, (-5*2-1), %o2
+
L.3.13:
! remainder is negative
addcc %o3,%o5,%o3
! depth 4, accumulated bits -7
bl L.4.9
- srl %o5,1,%o5
+ srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
- b 9f
- add %o2, (-7*2+1), %o2
-
+ b 9f
+ add %o2, (-7*2+1), %o2
+
L.4.9:
! remainder is negative
addcc %o3,%o5,%o3
- b 9f
- add %o2, (-7*2-1), %o2
-
-
-
-
+ b 9f
+ add %o2, (-7*2-1), %o2
+
9:
Lend_regular_divide:
subcc %o4, 1, %o4
bge Ldivloop
- tst %o3
+ tst %o3
+
bl,a Lgot_result
! non-restoring fixup here (one instruction only!)
sub %o2, 1, %o2
-
Lgot_result:
retl
- mov %o2, %o0
+ mov %o2, %o0
+
+ .globl .udiv_patch
+.udiv_patch:
+ wr %g0, 0x0, %y
+ nop
+ nop
+ retl
+ udiv %o0, %o1, %o0
+ nop
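
The big Ldivloop above is a non-restoring divide that retires four quotient bits per pass: instead of undoing an overshoot it keeps going with additions, records +1/-1 quotient digits, and applies the single "sub %o2, 1" fixup at the end if the running remainder finished negative. A one-bit-at-a-time C sketch of the same idea (udiv_sketch is my name; divisor must be nonzero, the zero case being the ST_DIV0 trap above):

	#include <stdint.h>

	static uint32_t udiv_sketch(uint32_t dividend, uint32_t divisor)
	{
		int64_t r = dividend;
		int64_t d = divisor;
		uint32_t q = 0;
		int steps = 0;

		/* Scale the divisor up past the dividend, as the shift-by-4
		 * loop around Lnot_really_big does. */
		while (d <= r) {
			d <<= 1;
			steps++;
		}

		/* Non-restoring steps: never restore after an overshoot; just
		 * alternate subtract/add and record +1 or -1 in the quotient. */
		while (steps-- > 0) {
			d >>= 1;
			if (r >= 0) {
				r -= d;
				q = (q << 1) + 1;
			} else {
				r += d;
				q = (q << 1) - 1;
			}
		}

		if (r < 0)
			q -= 1;	/* the one-instruction fixup before Lgot_result */
		return q;
	}
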
diff --git a/arch/sparc/lib/umul.S b/arch/sparc/lib/umul.S
index 24f7c3cda..a784720a8 100644
--- a/arch/sparc/lib/umul.S
+++ b/arch/sparc/lib/umul.S
@@ -1,4 +1,5 @@
-/* umul.S: This routine was taken from glibc-1.09 and is covered
+/* $Id: umul.S,v 1.4 1996/09/30 02:22:39 davem Exp $
+ * umul.S: This routine was taken from glibc-1.09 and is covered
* by the GNU Library General Public License Version 2.
*/
@@ -23,9 +24,10 @@
.umul:
or %o0, %o1, %o4
mov %o0, %y ! multiplier -> Y
+
andncc %o4, 0xfff, %g0 ! test bits 12..31 of *both* args
be Lmul_shortway ! if zero, can do it the short way
- andcc %g0, %g0, %o4 ! zero the partial product and clear N and V
+ andcc %g0, %g0, %o4 ! zero the partial product and clear N and V
/*
* Long multiply. 32 steps, followed by a final shift step.
@@ -102,17 +104,19 @@
#if 0
tst %o1
bl,a 1f ! if %o1 < 0 (high order bit = 1),
- add %o4, %o0, %o4 ! %o4 += %o0 (add y to upper half)
-1: rd %y, %o0 ! get lower half of product
+ add %o4, %o0, %o4 ! %o4 += %o0 (add y to upper half)
+
+1:
+ rd %y, %o0 ! get lower half of product
retl
- addcc %o4, %g0, %o1 ! put upper half in place and set Z for %o1==0
+ addcc %o4, %g0, %o1 ! put upper half in place and set Z for %o1==0
#else
/* Faster code from tege@sics.se. */
sra %o1, 31, %o2 ! make mask from sign bit
and %o0, %o2, %o2 ! %o2 = 0 or %o0, depending on sign of %o1
rd %y, %o0 ! get lower half of product
retl
- addcc %o4, %o2, %o1 ! add compensation and put upper half in place
+ addcc %o4, %o2, %o1 ! add compensation and put upper half in place
#endif
Lmul_shortway:
@@ -155,4 +159,11 @@ Lmul_shortway:
srl %o5, 20, %o5 ! shift low bits right 20
or %o5, %o0, %o0
retl
- addcc %g0, %g0, %o1 ! %o1 = zero, and set Z
+ addcc %g0, %g0, %o1 ! %o1 = zero, and set Z
+
+ .globl .umul_patch
+.umul_patch:
+ umul %o0, %o1, %o0
+ retl
+ rd %y, %o1
+ nop
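
The compensation step commented above ("add y to upper half" / "add compensation") exists because the 32 multiply-step instructions effectively form %o0 times %o1 taken as a signed value; adding %o0 back into the upper half whenever %o1 has its top bit set turns that into the full unsigned 64-bit product. In C terms (sketch, my naming):

	#include <stdint.h>

	static uint64_t umul_sketch(uint32_t a, uint32_t b)
	{
		/* What the multiply-step loop delivers: a times b-as-signed. */
		int64_t signed_part = (int64_t)a * (int32_t)b;

		/* sra/and compensation: add a << 32 when b's sign bit is set. */
		uint64_t fixup = (b & 0x80000000u) ? (uint64_t)a << 32 : 0;

		return (uint64_t)signed_part + fixup;	/* low word -> %o0, high word -> %o1 */
	}
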
diff --git a/arch/sparc/lib/urem.S b/arch/sparc/lib/urem.S
index c84aa81e5..ec7f0c502 100644
--- a/arch/sparc/lib/urem.S
+++ b/arch/sparc/lib/urem.S
@@ -1,4 +1,5 @@
-/* urem.S: This routine was taken from glibc-1.09 and is covered
+/* $Id: urem.S,v 1.4 1996/09/30 02:22:42 davem Exp $
+ * urem.S: This routine was taken from glibc-1.09 and is covered
* by the GNU Library General Public License Version 2.
*/
@@ -45,22 +46,24 @@
! Ready to divide. Compute size of quotient; scale comparand.
orcc %o1, %g0, %o5
bne 1f
- mov %o0, %o3
+ mov %o0, %o3
! Divide by zero trap. If it returns, return 0 (about as
! wrong as possible, but that is what SunOS does...).
ta ST_DIV0
retl
- clr %o0
+ clr %o0
1:
cmp %o3, %o5 ! if %o1 exceeds %o0, done
blu Lgot_result ! (and algorithm fails otherwise)
- clr %o2
+ clr %o2
+
sethi %hi(1 << (32 - 4 - 1)), %g1
+
cmp %o3, %g1
blu Lnot_really_big
- clr %o4
+ clr %o4
! Here the dividend is >= 2**(31-N) or so. We must be careful here,
! as our usual N-at-a-shot divide step will cause overflow and havoc.
@@ -70,15 +73,18 @@
1:
cmp %o5, %g1
bgeu 3f
- mov 1, %g7
+ mov 1, %g7
+
sll %o5, 4, %o5
+
b 1b
- add %o4, 1, %o4
+ add %o4, 1, %o4
! Now compute %g7.
- 2: addcc %o5, %o5, %o5
+ 2:
+ addcc %o5, %o5, %o5
bcc Lnot_too_big
- add %g7, 1, %g7
+ add %g7, 1, %g7
! We get here if the %o1 overflowed while shifting.
! This means that %o3 has the high-order bit set.
@@ -86,15 +92,18 @@
sll %g1, 4, %g1 ! high order bit
srl %o5, 1, %o5 ! rest of %o5
add %o5, %g1, %o5
+
b Ldo_single_div
- sub %g7, 1, %g7
+ sub %g7, 1, %g7
Lnot_too_big:
- 3: cmp %o5, %o3
+ 3:
+ cmp %o5, %o3
blu 2b
- nop
+ nop
+
be Ldo_single_div
- nop
+ nop
/* NB: these are commented out in the V8-Sparc manual as well */
/* (I do not understand this) */
! %o5 > %o3: went too far: back up 1 step
@@ -111,19 +120,21 @@
Ldo_single_div:
subcc %g7, 1, %g7
bl Lend_regular_divide
- nop
+ nop
+
sub %o3, %o5, %o3
mov 1, %o2
+
b Lend_single_divloop
- nop
+ nop
Lsingle_divloop:
sll %o2, 1, %o2
bl 1f
- srl %o5, 1, %o5
+ srl %o5, 1, %o5
! %o3 >= 0
sub %o3, %o5, %o3
b 2f
- add %o2, 1, %o2
+ add %o2, 1, %o2
1: ! %o3 < 0
add %o3, %o5, %o3
sub %o2, 1, %o2
@@ -131,214 +142,214 @@
Lend_single_divloop:
subcc %g7, 1, %g7
bge Lsingle_divloop
- tst %o3
+ tst %o3
+
b,a Lend_regular_divide
Lnot_really_big:
1:
sll %o5, 4, %o5
+
cmp %o5, %o3
bleu 1b
- addcc %o4, 1, %o4
+ addcc %o4, 1, %o4
+
be Lgot_result
- sub %o4, 1, %o4
+ sub %o4, 1, %o4
tst %o3 ! set up for initial iteration
Ldivloop:
sll %o2, 4, %o2
! depth 1, accumulated bits 0
bl L.1.16
- srl %o5,1,%o5
+ srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
! depth 2, accumulated bits 1
bl L.2.17
- srl %o5,1,%o5
+ srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
! depth 3, accumulated bits 3
bl L.3.19
- srl %o5,1,%o5
+ srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
! depth 4, accumulated bits 7
bl L.4.23
- srl %o5,1,%o5
+ srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
- b 9f
- add %o2, (7*2+1), %o2
-
+ b 9f
+ add %o2, (7*2+1), %o2
+
L.4.23:
! remainder is negative
addcc %o3,%o5,%o3
- b 9f
- add %o2, (7*2-1), %o2
-
-
+ b 9f
+ add %o2, (7*2-1), %o2
+
L.3.19:
! remainder is negative
addcc %o3,%o5,%o3
! depth 4, accumulated bits 5
bl L.4.21
- srl %o5,1,%o5
+ srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
- b 9f
- add %o2, (5*2+1), %o2
-
+ b 9f
+ add %o2, (5*2+1), %o2
+
L.4.21:
! remainder is negative
addcc %o3,%o5,%o3
- b 9f
- add %o2, (5*2-1), %o2
-
-
-
+ b 9f
+ add %o2, (5*2-1), %o2
+
L.2.17:
! remainder is negative
addcc %o3,%o5,%o3
! depth 3, accumulated bits 1
bl L.3.17
- srl %o5,1,%o5
+ srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
! depth 4, accumulated bits 3
bl L.4.19
- srl %o5,1,%o5
+ srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
- b 9f
- add %o2, (3*2+1), %o2
-
+ b 9f
+ add %o2, (3*2+1), %o2
+
L.4.19:
! remainder is negative
addcc %o3,%o5,%o3
- b 9f
- add %o2, (3*2-1), %o2
-
-
+ b 9f
+ add %o2, (3*2-1), %o2
+
L.3.17:
! remainder is negative
addcc %o3,%o5,%o3
! depth 4, accumulated bits 1
bl L.4.17
- srl %o5,1,%o5
+ srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
- b 9f
- add %o2, (1*2+1), %o2
+ b 9f
+ add %o2, (1*2+1), %o2
L.4.17:
! remainder is negative
addcc %o3,%o5,%o3
- b 9f
- add %o2, (1*2-1), %o2
-
-
-
-
+ b 9f
+ add %o2, (1*2-1), %o2
+
L.1.16:
! remainder is negative
addcc %o3,%o5,%o3
! depth 2, accumulated bits -1
bl L.2.15
- srl %o5,1,%o5
+ srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
! depth 3, accumulated bits -1
bl L.3.15
- srl %o5,1,%o5
+ srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
! depth 4, accumulated bits -1
bl L.4.15
- srl %o5,1,%o5
+ srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
- b 9f
- add %o2, (-1*2+1), %o2
-
+ b 9f
+ add %o2, (-1*2+1), %o2
+
L.4.15:
! remainder is negative
addcc %o3,%o5,%o3
- b 9f
- add %o2, (-1*2-1), %o2
-
-
+ b 9f
+ add %o2, (-1*2-1), %o2
+
L.3.15:
! remainder is negative
addcc %o3,%o5,%o3
! depth 4, accumulated bits -3
bl L.4.13
- srl %o5,1,%o5
+ srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
- b 9f
- add %o2, (-3*2+1), %o2
-
+ b 9f
+ add %o2, (-3*2+1), %o2
+
L.4.13:
! remainder is negative
addcc %o3,%o5,%o3
- b 9f
- add %o2, (-3*2-1), %o2
-
-
-
+ b 9f
+ add %o2, (-3*2-1), %o2
+
L.2.15:
! remainder is negative
addcc %o3,%o5,%o3
! depth 3, accumulated bits -3
bl L.3.13
- srl %o5,1,%o5
+ srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
! depth 4, accumulated bits -5
bl L.4.11
- srl %o5,1,%o5
+ srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
- b 9f
- add %o2, (-5*2+1), %o2
+ b 9f
+ add %o2, (-5*2+1), %o2
L.4.11:
! remainder is negative
addcc %o3,%o5,%o3
- b 9f
- add %o2, (-5*2-1), %o2
-
-
+ b 9f
+ add %o2, (-5*2-1), %o2
+
L.3.13:
! remainder is negative
addcc %o3,%o5,%o3
! depth 4, accumulated bits -7
bl L.4.9
- srl %o5,1,%o5
+ srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
- b 9f
- add %o2, (-7*2+1), %o2
-
+ b 9f
+ add %o2, (-7*2+1), %o2
+
L.4.9:
! remainder is negative
addcc %o3,%o5,%o3
- b 9f
- add %o2, (-7*2-1), %o2
-
-
-
-
+ b 9f
+ add %o2, (-7*2-1), %o2
+
9:
Lend_regular_divide:
subcc %o4, 1, %o4
bge Ldivloop
- tst %o3
+ tst %o3
+
bl,a Lgot_result
! non-restoring fixup here (one instruction only!)
add %o3, %o1, %o3
-
Lgot_result:
retl
- mov %o3, %o0
+ mov %o3, %o0
+
+ .globl .urem_patch
+.urem_patch:
+ wr %g0, 0x0, %y
+ nop
+ nop
+ nop
+ udiv %o0, %o1, %o2
+ umul %o2, %o1, %o2
+ retl
+ sub %o0, %o2, %o0
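
Two details worth spelling out at the end: in the software path the non-restoring fixup for the remainder is "add %o3, %o1" (add the divisor back in when the running remainder finished negative), and .urem_patch simply reconstructs the remainder on hardware as dividend minus quotient times divisor. The latter in C (sketch, my naming; divisor must be nonzero, matching the ST_DIV0 trap):

	#include <stdint.h>

	static uint32_t urem_sketch(uint32_t dividend, uint32_t divisor)
	{
		uint32_t q = dividend / divisor;	/* udiv %o0, %o1, %o2 */

		return dividend - q * divisor;		/* umul %o2, %o1, %o2 ; sub %o0, %o2, %o0 */
	}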