path: root/arch/sparc/lib
author	Ralf Baechle <ralf@linux-mips.org>	1997-01-07 02:33:00 +0000
committer	<ralf@linux-mips.org>	1997-01-07 02:33:00 +0000
commit	beb116954b9b7f3bb56412b2494b562f02b864b1 (patch)
tree	120e997879884e1b9d93b265221b939d2ef1ade1 /arch/sparc/lib
parent	908d4681a1dc3792ecafbe64265783a86c4cccb6 (diff)
Import of Linux/MIPS 2.1.14
Diffstat (limited to 'arch/sparc/lib')
-rw-r--r--	arch/sparc/lib/Makefile	47
-rw-r--r--	arch/sparc/lib/ashrdi3.S	24
-rw-r--r--	arch/sparc/lib/blockops.S	103
-rw-r--r--	arch/sparc/lib/checksum.S	439
-rw-r--r--	arch/sparc/lib/memcmp.S	314
-rw-r--r--	arch/sparc/lib/memcpy.S	364
-rw-r--r--	arch/sparc/lib/memscan.S	135
-rw-r--r--	arch/sparc/lib/memset.S	166
-rw-r--r--	arch/sparc/lib/memset.c	71
-rw-r--r--	arch/sparc/lib/mul.S	22
-rw-r--r--	arch/sparc/lib/rem.S	221
-rw-r--r--	arch/sparc/lib/sdiv.S	222
-rw-r--r--	arch/sparc/lib/strlen.S	88
-rw-r--r--	arch/sparc/lib/strncmp.S	120
-rw-r--r--	arch/sparc/lib/strncpy_from_user.S	49
-rw-r--r--	arch/sparc/lib/udiv.S	209
-rw-r--r--	arch/sparc/lib/umul.S	25
-rw-r--r--	arch/sparc/lib/urem.S	207
18 files changed, 2389 insertions, 437 deletions
diff --git a/arch/sparc/lib/Makefile b/arch/sparc/lib/Makefile
index 1f2ce0e1c..2cb74336f 100644
--- a/arch/sparc/lib/Makefile
+++ b/arch/sparc/lib/Makefile
@@ -1,22 +1,44 @@
-#
+# $Id: Makefile,v 1.12 1996/10/27 08:36:26 davem Exp $
# Makefile for Sparc library files..
#
CFLAGS := $(CFLAGS) -ansi
-.c.s:
- $(CC) $(CFLAGS) -S $<
-.s.o:
- $(AS) -c -o $*.o $<
-.c.o:
- $(CC) $(CFLAGS) -c $<
-
-OBJS = mul.o rem.o sdiv.o udiv.o umul.o urem.o ashrdi3.o
+OBJS = mul.o rem.o sdiv.o udiv.o umul.o urem.o ashrdi3.o memcpy.o memset.o \
+ strlen.o checksum.o blockops.o memscan.o memcmp.o strncmp.o \
+ strncpy_from_user.o
lib.a: $(OBJS)
$(AR) rcs lib.a $(OBJS)
sync
+checksum.o: checksum.S
+ $(CC) -ansi -c -o checksum.o checksum.S
+
+memcpy.o: memcpy.S
+ $(CC) -D__ASSEMBLY__ -ansi -c -o memcpy.o memcpy.S
+
+memcmp.o: memcmp.S
+ $(CC) -ansi -c -o memcmp.o memcmp.S
+
+memscan.o: memscan.S
+ $(CC) -ansi -c -o memscan.o memscan.S
+
+strncmp.o: strncmp.S
+ $(CC) -ansi -c -o strncmp.o strncmp.S
+
+strncpy_from_user.o: strncpy_from_user.S
+ $(CC) -D__ASSEMBLY__ -ansi -c -o strncpy_from_user.o strncpy_from_user.S
+
+blockops.o: blockops.S
+ $(CC) -ansi -c -o blockops.o blockops.S
+
+memset.o: memset.S
+ $(CC) -D__ASSEMBLY__ -ansi -c -o memset.o memset.S
+
+strlen.o: strlen.S
+ $(CC) -ansi -c -o strlen.o strlen.S
+
mul.o: mul.S
$(CC) -c -o mul.o mul.S
@@ -40,9 +62,4 @@ ashrdi3.o: ashrdi3.S
dep:
-#
-# include a dependency file if one exists
-#
-ifeq (.depend,$(wildcard .depend))
-include .depend
-endif
+include $(TOPDIR)/Rules.make
diff --git a/arch/sparc/lib/ashrdi3.S b/arch/sparc/lib/ashrdi3.S
index c672d2c9f..bf589c283 100644
--- a/arch/sparc/lib/ashrdi3.S
+++ b/arch/sparc/lib/ashrdi3.S
@@ -1,4 +1,5 @@
-/* ashrdi3.S: The filesystem code creates all kinds of references to
+/* $Id: ashrdi3.S,v 1.3 1996/09/07 23:18:10 davem Exp $
+ * ashrdi3.S: The filesystem code creates all kinds of references to
* this little routine on the sparc with gcc.
*
* Copyright (C) 1995 David S. Miller (davem@caip.rutgers.edu)
@@ -10,19 +11,26 @@
C_LABEL(__ashrdi3):
tst %o2
be 3f
- or %g0, 32, %g2
+ or %g0, 32, %g2
+
sub %g2, %o2, %g2
+
tst %g2
bg 1f
- sra %o0, %o2, %o4
+ sra %o0, %o2, %o4
+
sra %o0, 31, %o4
sub %g0, %g2, %g2
ba 2f
- sra %o0, %g2, %o5
-1: sll %o0, %g2, %g3
+ sra %o0, %g2, %o5
+
+1:
+ sll %o0, %g2, %g3
srl %o1, %o2, %g2
or %g2, %g3, %o5
-2: or %g0, %o4, %o0
+2:
+ or %g0, %o4, %o0
or %g0, %o5, %o1
-3: jmpl %o7 + 8, %g0
- nop
+3:
+ jmpl %o7 + 8, %g0
+ nop
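
For reference, what gcc expects __ashrdi3 to compute is a 64-bit arithmetic right shift built from 32-bit operations; the 64-bit value arrives as %o0 = high word, %o1 = low word, with the shift count in %o2. A minimal C sketch of the same logic, assuming arithmetic >> of negative signed values as SPARC's sra gives (names here are illustrative, not from the kernel):

long long sketch_ashrdi3(long long value, int count)
{
        int hi = (int)(value >> 32);            /* %o0 in the assembly */
        unsigned int lo = (unsigned int)value;  /* %o1 */
        unsigned int new_hi, new_lo;

        if (count == 0)
                return value;                   /* the early exit at label 3 */
        if (count < 32) {
                new_lo = (lo >> count) | ((unsigned int)hi << (32 - count));
                new_hi = (unsigned int)(hi >> count);   /* arithmetic shift keeps the sign */
        } else {                                /* low word comes from the high word */
                new_lo = (unsigned int)(hi >> (count - 32));
                new_hi = (unsigned int)(hi >> 31);      /* pure sign fill */
        }
        return (long long)(((unsigned long long)new_hi << 32) | new_lo);
}
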
diff --git a/arch/sparc/lib/blockops.S b/arch/sparc/lib/blockops.S
new file mode 100644
index 000000000..f8a9e80df
--- /dev/null
+++ b/arch/sparc/lib/blockops.S
@@ -0,0 +1,103 @@
+/* $Id: blockops.S,v 1.5 1996/09/24 05:22:56 davem Exp $
+ * blockops.S: Common block zero optimized routines.
+ *
+ * Copyright (C) 1996 David S. Miller (davem@caip.rutgers.edu)
+ */
+
+#include <asm/cprefix.h>
+
+ /* Zero out 64 bytes of memory at (buf + offset).
+ * Assumes %g1 contains zero.
+ */
+#define BLAST_BLOCK(buf, offset) \
+ std %g0, [buf + offset + 0x38]; \
+ std %g0, [buf + offset + 0x30]; \
+ std %g0, [buf + offset + 0x28]; \
+ std %g0, [buf + offset + 0x20]; \
+ std %g0, [buf + offset + 0x18]; \
+ std %g0, [buf + offset + 0x10]; \
+ std %g0, [buf + offset + 0x08]; \
+ std %g0, [buf + offset + 0x00];
+
+ /* Copy 32 bytes of memory at (src + offset) to
+ * (dst + offset).
+ */
+#define MIRROR_BLOCK(dst, src, offset, t0, t1, t2, t3, t4, t5, t6, t7) \
+ ldd [src + offset + 0x18], t0; \
+ ldd [src + offset + 0x10], t2; \
+ ldd [src + offset + 0x08], t4; \
+ ldd [src + offset + 0x00], t6; \
+ std t0, [dst + offset + 0x18]; \
+ std t2, [dst + offset + 0x10]; \
+ std t4, [dst + offset + 0x08]; \
+ std t6, [dst + offset + 0x00];
+
+ /* Profiling evidence indicates that memset() is
+ * commonly called for blocks of size PAGE_SIZE,
+ * and (2 * PAGE_SIZE) (for kernel stacks)
+ * and with a second arg of zero. We assume in
+ * all of these cases that the buffer is aligned
+ * on at least an 8 byte boundry.
+ *
+ * Therefore we special case them to make them
+ * as fast as possible.
+ */
+
+ .text
+ .align 4
+
+ .globl C_LABEL(bzero_2page), C_LABEL(bzero_1page)
+C_LABEL(bzero_2page):
+ /* %o0 = buf */
+ or %g0, %g0, %g1
+ or %o0, %g0, %o1
+ or %g0, 0x20, %g2
+1:
+ BLAST_BLOCK(%o0, 0x00)
+ BLAST_BLOCK(%o0, 0x40)
+ BLAST_BLOCK(%o0, 0x80)
+ BLAST_BLOCK(%o0, 0xc0)
+ subcc %g2, 1, %g2
+ bne 1b
+ add %o0, 0x100, %o0
+
+ retl
+ mov %o1, %o0
+
+C_LABEL(bzero_1page):
+ /* %o0 = buf */
+ or %g0, %g0, %g1
+ or %o0, %g0, %o1
+ or %g0, 0x10, %g2
+1:
+ BLAST_BLOCK(%o0, 0x00)
+ BLAST_BLOCK(%o0, 0x40)
+ BLAST_BLOCK(%o0, 0x80)
+ BLAST_BLOCK(%o0, 0xc0)
+ subcc %g2, 1, %g2
+ bne 1b
+ add %o0, 0x100, %o0
+
+ retl
+ mov %o1, %o0
+
+ .globl C_LABEL(__copy_1page)
+C_LABEL(__copy_1page):
+ /* %o0 = dst, %o1 = src */
+ or %g0, 0x10, %g1
+1:
+ MIRROR_BLOCK(%o0, %o1, 0x00, %o2, %o3, %o4, %o5, %g2, %g3, %g4, %g5)
+ MIRROR_BLOCK(%o0, %o1, 0x20, %o2, %o3, %o4, %o5, %g2, %g3, %g4, %g5)
+ MIRROR_BLOCK(%o0, %o1, 0x40, %o2, %o3, %o4, %o5, %g2, %g3, %g4, %g5)
+ MIRROR_BLOCK(%o0, %o1, 0x60, %o2, %o3, %o4, %o5, %g2, %g3, %g4, %g5)
+ MIRROR_BLOCK(%o0, %o1, 0x80, %o2, %o3, %o4, %o5, %g2, %g3, %g4, %g5)
+ MIRROR_BLOCK(%o0, %o1, 0xa0, %o2, %o3, %o4, %o5, %g2, %g3, %g4, %g5)
+ MIRROR_BLOCK(%o0, %o1, 0xc0, %o2, %o3, %o4, %o5, %g2, %g3, %g4, %g5)
+ MIRROR_BLOCK(%o0, %o1, 0xe0, %o2, %o3, %o4, %o5, %g2, %g3, %g4, %g5)
+ subcc %g1, 1, %g1
+ add %o0, 0x100, %o0
+ bne 1b
+ add %o1, 0x100, %o1
+
+ retl
+ nop
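
In portable terms, bzero_1page clears one page and __copy_1page copies one, both in 256-byte strides built from 64-bit ldd/std pairs. A rough C equivalent of bzero_1page (a sketch only, assuming PAGE_SIZE == 4096 and the 8-byte alignment the comment above requires):

void sketch_bzero_1page(void *page)
{
        unsigned long long *p = page;           /* one 'std' stores 8 bytes */
        int i, j;

        for (i = 0; i < 16; i++)                /* 16 iterations of 256 bytes = 4096 */
                for (j = 0; j < 32; j++)        /* four BLAST_BLOCKs of 8 stores each */
                        *p++ = 0;
}

bzero_2page is the same loop with 32 outer iterations, and __copy_1page replaces the stores with MIRROR_BLOCK load/store pairs.
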
diff --git a/arch/sparc/lib/checksum.S b/arch/sparc/lib/checksum.S
new file mode 100644
index 000000000..a71371bf8
--- /dev/null
+++ b/arch/sparc/lib/checksum.S
@@ -0,0 +1,439 @@
+/* checksum.S: Sparc optimized checksum code.
+ *
+ * Copyright(C) 1995 Linus Torvalds
+ * Copyright(C) 1995 Miguel de Icaza
+ * Copyright(C) 1996 David S. Miller
+ *
+ * derived from:
+ * Linux/Alpha checksum c-code
+ * Linux/ix86 inline checksum assembly
+ * RFC1071 Computing the Internet Checksum (esp. Jacobsons m68k code)
+ * David Mosberger-Tang for optimized reference c-code
+ * BSD4.4 portable checksum routine
+ */
+
+#include <asm-sparc/cprefix.h>
+
+#define CSUM_BIGCHUNK(buf, offset, sum, t0, t1, t2, t3, t4, t5) \
+ ldd [buf + offset + 0x00], t0; \
+ ldd [buf + offset + 0x08], t2; \
+ addxcc t0, sum, sum; \
+ addxcc t1, sum, sum; \
+ ldd [buf + offset + 0x10], t4; \
+ addxcc t2, sum, sum; \
+ addxcc t3, sum, sum; \
+ ldd [buf + offset + 0x18], t0; \
+ addxcc t4, sum, sum; \
+ addxcc t5, sum, sum; \
+ addxcc t0, sum, sum; \
+ addxcc t1, sum, sum;
+
+#define CSUM_LASTCHUNK(buf, offset, sum, t0, t1, t2, t3) \
+ ldd [buf - offset - 0x08], t0; \
+ ldd [buf - offset - 0x00], t2; \
+ addxcc t0, sum, sum; \
+ addxcc t1, sum, sum; \
+ addxcc t2, sum, sum; \
+ addxcc t3, sum, sum;
+
+ /* Do end cruft out of band to get better cache patterns. */
+csum_partial_end_cruft:
+ be 1f ! caller asks %o1 & 0x8
+ andcc %o1, 4, %g0 ! nope, check for word remaining
+ ldd [%o0], %g2 ! load two
+ addcc %g2, %o2, %o2 ! add first word to sum
+ addxcc %g3, %o2, %o2 ! add second word as well
+ add %o0, 8, %o0 ! advance buf ptr
+ addx %g0, %o2, %o2 ! add in final carry
+ andcc %o1, 4, %g0 ! check again for word remaining
+1: be 1f ! nope, skip this code
+ andcc %o1, 3, %o1 ! check for trailing bytes
+ ld [%o0], %g2 ! load it
+ addcc %g2, %o2, %o2 ! add to sum
+ add %o0, 4, %o0 ! advance buf ptr
+ addx %g0, %o2, %o2 ! add in final carry
+ andcc %o1, 3, %g0 ! check again for trailing bytes
+1: be 1f ! no trailing bytes, return
+ addcc %o1, -1, %g0 ! only one byte remains?
+ bne 2f ! at least two bytes more
+ subcc %o1, 2, %o1 ! only two bytes more?
+ b 4f ! only one byte remains
+ or %g0, %g0, %o4 ! clear fake hword value
+2: lduh [%o0], %o4 ! get hword
+ be 6f ! jmp if only hword remains
+ add %o0, 2, %o0 ! advance buf ptr either way
+ sll %o4, 16, %o4 ! create upper hword
+4: ldub [%o0], %o5 ! get final byte
+ sll %o5, 8, %o5 ! put into place
+ or %o5, %o4, %o4 ! coalese with hword (if any)
+6: addcc %o4, %o2, %o2 ! add to sum
+1: retl ! get outta here
+ addx %g0, %o2, %o0 ! add final carry into retval
+
+ /* Also do alignment out of band to get better cache patterns. */
+csum_partial_fix_alignment:
+ cmp %o1, 6
+ bl cpte - 0x4
+ andcc %o0, 0x2, %g0
+ be 1f
+ andcc %o0, 0x4, %g0
+ lduh [%o0 + 0x00], %g2
+ sub %o1, 2, %o1
+ add %o0, 2, %o0
+ sll %g2, 16, %g2
+ addcc %g2, %o2, %o2
+ srl %o2, 16, %g3
+ addx %g0, %g3, %g2
+ sll %o2, 16, %o2
+ sll %g2, 16, %g3
+ srl %o2, 16, %o2
+ andcc %o0, 0x4, %g0
+ or %g3, %o2, %o2
+1: be cpa
+ andcc %o1, 0xffffff80, %o3
+ ld [%o0 + 0x00], %g2
+ sub %o1, 4, %o1
+ addcc %g2, %o2, %o2
+ add %o0, 4, %o0
+ addx %g0, %o2, %o2
+ b cpa
+ andcc %o1, 0xffffff80, %o3
+
+ /* The common case is to get called with a nicely aligned
+ * buffer of size 0x20. Follow the code path for that case.
+ */
+ .globl C_LABEL(csum_partial)
+C_LABEL(csum_partial): /* %o0=buf, %o1=len, %o2=sum */
+ andcc %o0, 0x7, %g0 ! alignment problems?
+ bne csum_partial_fix_alignment ! yep, handle it
+ sethi %hi(cpte - 8), %g7 ! prepare table jmp ptr
+ andcc %o1, 0xffffff80, %o3 ! num loop iterations
+cpa: be 3f ! none to do
+ andcc %o1, 0x70, %g1 ! clears carry flag too
+5: CSUM_BIGCHUNK(%o0, 0x00, %o2, %o4, %o5, %g2, %g3, %g4, %g5)
+ CSUM_BIGCHUNK(%o0, 0x20, %o2, %o4, %o5, %g2, %g3, %g4, %g5)
+ CSUM_BIGCHUNK(%o0, 0x40, %o2, %o4, %o5, %g2, %g3, %g4, %g5)
+ CSUM_BIGCHUNK(%o0, 0x60, %o2, %o4, %o5, %g2, %g3, %g4, %g5)
+ addx %g0, %o2, %o2 ! sink in final carry
+ subcc %o3, 128, %o3 ! detract from loop iters
+ bne 5b ! more to do
+ add %o0, 128, %o0 ! advance buf ptr
+ andcc %o1, 0x70, %g1 ! clears carry flag too
+3: be cpte ! nope
+ andcc %o1, 0xf, %g0 ! anything left at all?
+ srl %g1, 1, %o4 ! compute offset
+ sub %g7, %g1, %g7 ! adjust jmp ptr
+ sub %g7, %o4, %g7 ! final jmp ptr adjust
+ jmp %g7 + %lo(cpte - 8) ! enter the table
+ add %o0, %g1, %o0 ! advance buf ptr
+cptbl: CSUM_LASTCHUNK(%o0, 0x68, %o2, %g2, %g3, %g4, %g5)
+ CSUM_LASTCHUNK(%o0, 0x58, %o2, %g2, %g3, %g4, %g5)
+ CSUM_LASTCHUNK(%o0, 0x48, %o2, %g2, %g3, %g4, %g5)
+ CSUM_LASTCHUNK(%o0, 0x38, %o2, %g2, %g3, %g4, %g5)
+ CSUM_LASTCHUNK(%o0, 0x28, %o2, %g2, %g3, %g4, %g5)
+ CSUM_LASTCHUNK(%o0, 0x18, %o2, %g2, %g3, %g4, %g5)
+ CSUM_LASTCHUNK(%o0, 0x08, %o2, %g2, %g3, %g4, %g5)
+ addx %g0, %o2, %o2 ! fetch final carry
+ andcc %o1, 0xf, %g0 ! anything left at all?
+cpte: bne csum_partial_end_cruft ! yep, handle it
+ andcc %o1, 8, %g0 ! check how much
+cpout: retl ! get outta here
+ mov %o2, %o0 ! return computed csum
+
+ /* This aligned version executes typically in 8.5 superscalar cycles, this
+ * is the best I can do. I say 8.5 because the final add will pair with
+ * the next ldd in the main unrolled loop. Thus the pipe is always full.
+ */
+#define CSUMCOPY_BIGCHUNK_ALIGNED(src, dst, sum, off, t0, t1, t2, t3, t4, t5, t6, t7) \
+ ldd [src + off + 0x00], t0; \
+ ldd [src + off + 0x08], t2; \
+ addxcc t0, sum, sum; \
+ ldd [src + off + 0x10], t4; \
+ addxcc t1, sum, sum; \
+ ldd [src + off + 0x18], t6; \
+ addxcc t2, sum, sum; \
+ std t0, [dst + off + 0x00]; \
+ addxcc t3, sum, sum; \
+ std t2, [dst + off + 0x08]; \
+ addxcc t4, sum, sum; \
+ std t4, [dst + off + 0x10]; \
+ addxcc t5, sum, sum; \
+ std t6, [dst + off + 0x18]; \
+ addxcc t6, sum, sum; \
+ addxcc t7, sum, sum;
+
+ /* 12 superscalar cycles seems to be the limit for this case,
+ * because of this we thus do all the ldd's together to get
+ * Viking MXCC into streaming mode. Ho hum...
+ */
+#define CSUMCOPY_BIGCHUNK(src, dst, sum, off, t0, t1, t2, t3, t4, t5, t6, t7) \
+ ldd [src + off + 0x00], t0; \
+ ldd [src + off + 0x08], t2; \
+ ldd [src + off + 0x10], t4; \
+ ldd [src + off + 0x18], t6; \
+ st t0, [dst + off + 0x00]; \
+ addxcc t0, sum, sum; \
+ st t1, [dst + off + 0x04]; \
+ addxcc t1, sum, sum; \
+ st t2, [dst + off + 0x08]; \
+ addxcc t2, sum, sum; \
+ st t3, [dst + off + 0x0c]; \
+ addxcc t3, sum, sum; \
+ st t4, [dst + off + 0x10]; \
+ addxcc t4, sum, sum; \
+ st t5, [dst + off + 0x14]; \
+ addxcc t5, sum, sum; \
+ st t6, [dst + off + 0x18]; \
+ addxcc t6, sum, sum; \
+ st t7, [dst + off + 0x1c]; \
+ addxcc t7, sum, sum;
+
+ /* Yuck, 6 superscalar cycles... */
+#define CSUMCOPY_LASTCHUNK(src, dst, sum, off, t0, t1, t2, t3) \
+ ldd [src - off - 0x08], t0; \
+ ldd [src - off - 0x00], t2; \
+ addxcc t0, sum, sum; \
+ st t0, [dst - off - 0x08]; \
+ addxcc t1, sum, sum; \
+ st t1, [dst - off - 0x04]; \
+ addxcc t2, sum, sum; \
+ st t2, [dst - off - 0x00]; \
+ addxcc t3, sum, sum; \
+ st t3, [dst - off + 0x04];
+
+ /* Handle the end cruft code out of band for better cache patterns. */
+cc_end_cruft:
+ be 1f
+ andcc %o3, 4, %g0
+ ldd [%o0 + 0x00], %g2
+ add %o1, 8, %o1
+ addcc %g2, %g7, %g7
+ add %o0, 8, %o0
+ addxcc %g3, %g7, %g7
+ st %g2, [%o1 - 0x08]
+ addx %g0, %g7, %g7
+ andcc %o3, 4, %g0
+ st %g3, [%o1 - 0x04]
+1: be 1f
+ andcc %o3, 3, %o3
+ ld [%o0 + 0x00], %g2
+ add %o1, 4, %o1
+ addcc %g2, %g7, %g7
+ st %g2, [%o1 - 0x04]
+ addx %g0, %g7, %g7
+ add %o0, 4, %o0
+ andcc %o3, 3, %g0
+1: be 1f
+ addcc %o3, -1, %g0
+ bne 2f
+ subcc %o3, 2, %o3
+ b 4f
+ or %g0, %g0, %o4
+2: lduh [%o0 + 0x00], %o4
+ add %o0, 2, %o0
+ sth %o4, [%o1 + 0x00]
+ be 6f
+ add %o1, 2, %o1
+ sll %o4, 16, %o4
+4: ldub [%o0 + 0x00], %o5
+ stb %o5, [%o1 + 0x00]
+ sll %o5, 8, %o5
+ or %o5, %o4, %o4
+6: addcc %o4, %g7, %g7
+1: retl
+ addx %g0, %g7, %o0
+
+ /* Also, handle the alignment code out of band. */
+cc_dword_align:
+ cmp %g1, 6
+ bl,a ccte
+ andcc %g1, 0xf, %o3
+ andcc %o0, 0x1, %g0
+ bne ccslow
+ andcc %o0, 0x2, %g0
+ be 1f
+ andcc %o0, 0x4, %g0
+ lduh [%o0 + 0x00], %g2
+ sub %g1, 2, %g1
+ sth %g2, [%o1 + 0x00]
+ add %o0, 2, %o0
+ sll %g2, 16, %g2
+ addcc %g2, %g7, %g7
+ add %o1, 2, %o1
+ srl %g7, 16, %g3
+ addx %g0, %g3, %g2
+ sll %g7, 16, %g7
+ sll %g2, 16, %g3
+ srl %g7, 16, %g7
+ andcc %o0, 0x4, %g0
+ or %g3, %g7, %g7
+1: be 3f
+ andcc %g1, 0xffffff80, %g0
+ ld [%o0 + 0x00], %g2
+ sub %g1, 4, %g1
+ st %g2, [%o1 + 0x00]
+ add %o0, 4, %o0
+ addcc %g2, %g7, %g7
+ add %o1, 4, %o1
+ addx %g0, %g7, %g7
+ b 3f
+ andcc %g1, 0xffffff80, %g0
+
+ /* Sun, you just can't beat me, you just can't. Stop trying,
+ * give up. I'm serious, I am going to kick the living shit
+ * out of you, game over, lights out.
+ */
+ .align 8
+ .globl C_LABEL(csum_partial_copy)
+C_LABEL(csum_partial_copy): /* %o0=src, %o1=dest, %o2=len, %o3=sum */
+ xor %o0, %o1, %o4 ! get changing bits
+ mov %o2, %g1 ! free up %o2
+ andcc %o4, 3, %g0 ! check for mismatched alignment
+ bne ccslow ! better this than unaligned/fixups
+ andcc %o0, 7, %g0 ! need to align things?
+ mov %o3, %g7 ! free up %o3
+ bne cc_dword_align ! yes, we check for short lengths there
+ andcc %g1, 0xffffff80, %g0 ! can we use unrolled loop?
+3: be 3f ! nope, less than one loop remains
+ andcc %o1, 4, %g0 ! dest aligned on 4 or 8 byte boundry?
+ be ccdbl + 4 ! 8 byte aligned, kick ass
+5: CSUMCOPY_BIGCHUNK(%o0,%o1,%g7,0x00,%o4,%o5,%g2,%g3,%g4,%g5,%o2,%o3)
+ CSUMCOPY_BIGCHUNK(%o0,%o1,%g7,0x20,%o4,%o5,%g2,%g3,%g4,%g5,%o2,%o3)
+ CSUMCOPY_BIGCHUNK(%o0,%o1,%g7,0x40,%o4,%o5,%g2,%g3,%g4,%g5,%o2,%o3)
+ CSUMCOPY_BIGCHUNK(%o0,%o1,%g7,0x60,%o4,%o5,%g2,%g3,%g4,%g5,%o2,%o3)
+ sub %g1, 128, %g1 ! detract from length
+ addx %g0, %g7, %g7 ! add in last carry bit
+ andcc %g1, 0xffffff80, %g0 ! more to csum?
+ add %o0, 128, %o0 ! advance src ptr
+ bne 5b ! we did not go negative, continue looping
+ add %o1, 128, %o1 ! advance dest ptr
+3: andcc %g1, 0x70, %o2 ! can use table?
+ccmerge:be ccte ! nope, go and check for end cruft
+ andcc %g1, 0xf, %o3 ! get low bits of length (clears carry btw)
+ srl %o2, 1, %o4 ! begin negative offset computation
+ sethi %hi(ccte - 8), %o5 ! set up table ptr end
+ add %o0, %o2, %o0 ! advance src ptr
+ sub %o5, %o4, %o5 ! continue table calculation
+ sll %o2, 1, %g2 ! constant multiplies are fun...
+ sub %o5, %g2, %o5 ! some more adjustments
+ jmp %o5 + %lo(ccte - 8) ! jump into it, duff style, wheee...
+ add %o1, %o2, %o1 ! advance dest ptr (carry is clear btw)
+cctbl: CSUMCOPY_LASTCHUNK(%o0,%o1,%g7,0x68,%g2,%g3,%g4,%g5)
+ CSUMCOPY_LASTCHUNK(%o0,%o1,%g7,0x58,%g2,%g3,%g4,%g5)
+ CSUMCOPY_LASTCHUNK(%o0,%o1,%g7,0x48,%g2,%g3,%g4,%g5)
+ CSUMCOPY_LASTCHUNK(%o0,%o1,%g7,0x38,%g2,%g3,%g4,%g5)
+ CSUMCOPY_LASTCHUNK(%o0,%o1,%g7,0x28,%g2,%g3,%g4,%g5)
+ CSUMCOPY_LASTCHUNK(%o0,%o1,%g7,0x18,%g2,%g3,%g4,%g5)
+ CSUMCOPY_LASTCHUNK(%o0,%o1,%g7,0x08,%g2,%g3,%g4,%g5)
+ addx %g0, %g7, %g7
+ andcc %o3, 0xf, %g0 ! check for low bits set
+ccte: bne cc_end_cruft ! something left, handle it out of band
+ andcc %o3, 8, %g0 ! begin checks for that code
+ retl ! return
+ mov %g7, %o0 ! give em the computed checksum
+ccdbl: CSUMCOPY_BIGCHUNK_ALIGNED(%o0,%o1,%g7,0x00,%o4,%o5,%g2,%g3,%g4,%g5,%o2,%o3)
+ CSUMCOPY_BIGCHUNK_ALIGNED(%o0,%o1,%g7,0x20,%o4,%o5,%g2,%g3,%g4,%g5,%o2,%o3)
+ CSUMCOPY_BIGCHUNK_ALIGNED(%o0,%o1,%g7,0x40,%o4,%o5,%g2,%g3,%g4,%g5,%o2,%o3)
+ CSUMCOPY_BIGCHUNK_ALIGNED(%o0,%o1,%g7,0x60,%o4,%o5,%g2,%g3,%g4,%g5,%o2,%o3)
+ sub %g1, 128, %g1 ! detract from length
+ addx %g0, %g7, %g7 ! add in last carry bit
+ andcc %g1, 0xffffff80, %g0 ! more to csum?
+ add %o0, 128, %o0 ! advance src ptr
+ bne ccdbl ! we did not go negative, continue looping
+ add %o1, 128, %o1 ! advance dest ptr
+ b ccmerge ! finish it off, above
+ andcc %g1, 0x70, %o2 ! can use table? (clears carry btw)
+
+ccslow:
+ save %sp, -104, %sp
+ mov %i0, %g2
+ mov %g2, %o4
+ orcc %i2, %g0, %o5
+ ble .LL37
+ mov 0, %o3
+ andcc %g2, 1, %g3
+ be .LL50
+ sra %o5, 1, %o1
+ ldub [%g2], %o3
+ add %i2, -1, %o5
+ add %g2, 1, %o4
+ sra %o5, 1, %o1
+.LL50:
+ cmp %o1, 0
+ be .LL39
+ andcc %o4, 2, %g0
+ be,a .LL51
+ sra %o1, 1, %o1
+ add %o1, -1, %o1
+ lduh [%o4], %o0
+ add %o5, -2, %o5
+ add %o3, %o0, %o3
+ add %o4, 2, %o4
+ sra %o1, 1, %o1
+.LL51:
+ cmp %o1, 0
+ be .LL41
+ mov 0, %o2
+.LL42:
+ ld [%o4], %o0
+ add %o3, %o2, %o3
+ add %o3, %o0, %o3
+ cmp %o3, %o0
+ addx %g0, 0, %o2
+ addcc %o1, -1, %o1
+ bne .LL42
+ add %o4, 4, %o4
+ add %o3, %o2, %o3
+ sethi %hi(65535), %o0
+ or %o0, %lo(65535), %o0
+ and %o3, %o0, %o0
+ srl %o3, 16, %o1
+ add %o0, %o1, %o3
+.LL41:
+ andcc %o5, 2, %g0
+ be .LL52
+ andcc %o5, 1, %g0
+ lduh [%o4], %o0
+ add %o3, %o0, %o3
+ add %o4, 2, %o4
+.LL39:
+ andcc %o5, 1, %g0
+.LL52:
+ be .LL53
+ sethi %hi(65535), %o0
+ ldub [%o4], %o0
+ sll %o0, 8, %o0
+ add %o3, %o0, %o3
+ sethi %hi(65535), %o0
+.LL53:
+ or %o0, %lo(65535), %o0
+ and %o3, %o0, %o2
+ srl %o3, 16, %o1
+ add %o2, %o1, %o1
+ and %o1, %o0, %o2
+ srl %o1, 16, %o1
+ add %o2, %o1, %o1
+ and %o1, %o0, %o0
+ srl %o1, 16, %o1
+ add %o0, %o1, %o1
+ sll %o1, 16, %o0
+ cmp %g3, 0
+ be .LL37
+ srl %o0, 16, %o3
+ srl %o0, 24, %o1
+ and %o3, 255, %o0
+ sll %o0, 8, %o0
+ or %o1, %o0, %o3
+.LL37:
+ add %o3, %i3, %o1
+ sethi %hi(65535), %o0
+ or %o0, %lo(65535), %o0
+ and %o1, %o0, %o0
+ srl %o1, 16, %o1
+ add %o0, %o1, %i0
+ mov %i1, %o0
+ mov %g2, %o1
+ call C_LABEL(__memcpy)
+ mov %i2, %o2
+ ret
+ restore
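
For readers following the unrolled ldd/addxcc chains: csum_partial accumulates a 32-bit end-around-carry sum over the buffer, the partial Internet checksum of RFC 1071 (the final fold to 16 bits is done by the callers). A portable sketch of the same computation; the intermediate 32-bit value is not bit-for-bit what the assembly keeps, but it folds to the same 16-bit checksum:

unsigned int sketch_csum_partial(const unsigned char *buf, int len, unsigned int sum)
{
        unsigned long long acc = sum;
        int i;

        for (i = 0; i + 1 < len; i += 2)        /* big-endian 16-bit words, as on SPARC */
                acc += (unsigned int)((buf[i] << 8) | buf[i + 1]);
        if (i < len)                            /* trailing byte goes in the high half */
                acc += (unsigned int)(buf[i] << 8);
        while (acc >> 32)                       /* wrap carries back in (end-around carry) */
                acc = (acc & 0xffffffffULL) + (acc >> 32);
        return (unsigned int)acc;
}

csum_partial_copy computes the same sum while the data is being copied to the destination, which is why the ccslow fallback ends with a call to __memcpy.
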
diff --git a/arch/sparc/lib/memcmp.S b/arch/sparc/lib/memcmp.S
new file mode 100644
index 000000000..bf22e492c
--- /dev/null
+++ b/arch/sparc/lib/memcmp.S
@@ -0,0 +1,314 @@
+#include <asm/cprefix.h>
+
+ .text
+ .align 4
+ .global C_LABEL(__memcmp), C_LABEL(memcmp)
+C_LABEL(__memcmp):
+C_LABEL(memcmp):
+#if 1
+ cmp %o2, 0
+ ble L3
+ mov 0, %g3
+L5:
+ ldub [%o0], %g2
+ ldub [%o1], %g3
+ sub %g2, %g3, %g2
+ mov %g2, %g3
+ sll %g2, 24, %g2
+
+ cmp %g2, 0
+ bne L3
+ add %o0, 1, %o0
+
+ add %o2, -1, %o2
+
+ cmp %o2, 0
+ bg L5
+ add %o1, 1, %o1
+L3:
+ sll %g3, 24, %o0
+ sra %o0, 24, %o0
+
+ retl
+ nop
+#else
+ save %sp, -104, %sp
+ mov %i2, %o4
+ mov %i0, %o0
+
+ cmp %o4, 15
+ ble L72
+ mov %i1, %i2
+
+ andcc %i2, 3, %g0
+ be L161
+ andcc %o0, 3, %g2
+L75:
+ ldub [%o0], %g3
+ ldub [%i2], %g2
+ add %o0,1, %o0
+
+ subcc %g3, %g2, %i0
+ bne L156
+ add %i2, 1, %i2
+
+ andcc %i2, 3, %g0
+ bne L75
+ add %o4, -1, %o4
+
+ andcc %o0, 3, %g2
+L161:
+ bne,a L78
+ mov %i2, %i1
+
+ mov %o0, %i5
+ mov %i2, %i3
+ srl %o4, 2, %i4
+
+ cmp %i4, 0
+ bge L93
+ mov %i4, %g2
+
+ add %i4, 3, %g2
+L93:
+ sra %g2, 2, %g2
+ sll %g2, 2, %g2
+ sub %i4, %g2, %g2
+
+ cmp %g2, 1
+ be,a L88
+ add %o0, 4, %i5
+
+ bg L94
+ cmp %g2, 2
+
+ cmp %g2, 0
+ be,a L86
+ ld [%o0], %g3
+
+ b L162
+ ld [%i5], %g3
+L94:
+ be L81
+ cmp %g2, 3
+
+ be,a L83
+ add %o0, -4, %i5
+
+ b L162
+ ld [%i5], %g3
+L81:
+ add %o0, -8, %i5
+ ld [%o0], %g3
+ add %i2, -8, %i3
+ ld [%i2], %g2
+
+ b L82
+ add %i4, 2, %i4
+L83:
+ ld [%o0], %g4
+ add %i2, -4, %i3
+ ld [%i2], %g1
+
+ b L84
+ add %i4, 1, %i4
+L86:
+ b L87
+ ld [%i2], %g2
+L88:
+ add %i2, 4, %i3
+ ld [%o0], %g4
+ add %i4, -1, %i4
+ ld [%i2], %g1
+L95:
+ ld [%i5], %g3
+L162:
+ cmp %g4, %g1
+ be L87
+ ld [%i3], %g2
+
+ cmp %g4, %g1
+L163:
+ bleu L114
+ mov -1, %i0
+
+ b L114
+ mov 1, %i0
+L87:
+ ld [%i5 + 4], %g4
+ cmp %g3, %g2
+ bne L163
+ ld [%i3 + 4], %g1
+L84:
+ ld [%i5 + 8], %g3
+
+ cmp %g4, %g1
+ bne L163
+ ld [%i3 + 8], %g2
+L82:
+ ld [%i5 + 12], %g4
+ cmp %g3, %g2
+ bne L163
+ ld [%i3 + 12], %g1
+
+ add %i5, 16, %i5
+
+ addcc %i4, -4, %i4
+ bne L95
+ add %i3, 16, %i3
+
+ cmp %g4, %g1
+ bne L163
+ nop
+
+ b L114
+ mov 0, %i0
+L78:
+ srl %o4, 2, %i0
+ and %o0, -4, %i3
+ orcc %i0, %g0, %g3
+ sll %g2, 3, %o7
+ mov 32, %g2
+
+ bge L129
+ sub %g2, %o7, %o1
+
+ add %i0, 3, %g3
+L129:
+ sra %g3, 2, %g2
+ sll %g2, 2, %g2
+ sub %i0, %g2, %g2
+
+ cmp %g2, 1
+ be,a L124
+ ld [%i3], %o3
+
+ bg L130
+ cmp %g2, 2
+
+ cmp %g2, 0
+ be,a L122
+ ld [%i3], %o2
+
+ b L164
+ sll %o3, %o7, %g3
+L130:
+ be L117
+ cmp %g2, 3
+
+ be,a L119
+ ld [%i3], %g1
+
+ b L164
+ sll %o3, %o7, %g3
+L117:
+ ld [%i3], %g4
+ add %i2, -8, %i1
+ ld [%i3 + 4], %o3
+ add %i0, 2, %i0
+ ld [%i2], %i4
+
+ b L118
+ add %i3, -4, %i3
+L119:
+ ld [%i3 + 4], %g4
+ add %i2, -4, %i1
+ ld [%i2], %i5
+
+ b L120
+ add %i0, 1, %i0
+L122:
+ ld [%i3 + 4], %g1
+ ld [%i2], %i4
+
+ b L123
+ add %i3, 4, %i3
+L124:
+ add %i2, 4, %i1
+ ld [%i3 + 4], %o2
+ add %i0, -1, %i0
+ ld [%i2], %i5
+ add %i3, 8, %i3
+L131:
+ sll %o3, %o7, %g3
+L164:
+ srl %o2, %o1, %g2
+ ld [%i3], %g1
+ or %g3, %g2, %g3
+
+ cmp %g3, %i5
+ bne L163
+ ld [%i1], %i4
+L123:
+ sll %o2, %o7, %g3
+ srl %g1, %o1, %g2
+ ld [%i3 + 4], %g4
+ or %g3, %g2, %g3
+
+ cmp %g3, %i4
+ bne L163
+ ld [%i1 + 4], %i5
+L120:
+ sll %g1, %o7, %g3
+ srl %g4, %o1, %g2
+ ld [%i3 + 8], %o3
+ or %g3, %g2, %g3
+
+ cmp %g3, %i5
+ bne L163
+ ld [%i1 + 8], %i4
+L118:
+ sll %g4, %o7, %g3
+ srl %o3, %o1, %g2
+ ld [%i3 + 12], %o2
+ or %g3, %g2, %g3
+
+ cmp %g3, %i4
+ bne L163
+ ld [%i1 + 12], %i5
+
+ add %i3, 16, %i3
+ addcc %i0, -4, %i0
+ bne L131
+ add %i1, 16, %i1
+
+ sll %o3, %o7, %g3
+ srl %o2, %o1, %g2
+ or %g3, %g2, %g3
+
+ cmp %g3, %i5
+ be,a L114
+ mov 0, %i0
+
+ b,a L163
+L114:
+ cmp %i0, 0
+ bne L156
+ and %o4, -4, %g2
+
+ add %o0, %g2, %o0
+ add %i2, %g2, %i2
+ and %o4, 3, %o4
+L72:
+ cmp %o4, 0
+ be L156
+ mov 0, %i0
+
+ ldub [%o0], %g3
+L165:
+ ldub [%i2], %g2
+ add %o0, 1, %o0
+
+ subcc %g3, %g2, %i0
+ bne L156
+ add %i2, 1, %i2
+
+ addcc %o4, -1, %o4
+ bne,a L165
+ ldub [%o0], %g3
+
+ mov 0, %i0
+L156:
+ ret
+ restore
+#endif
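
Only the first half of this file is assembled (the #if 1 path); the #else body is a word-at-a-time version kept under the conditional. The live path is just a byte loop; roughly, in C (a sketch — note the count is treated as signed and the return value is the sign-extended low byte of the first difference, which is what the final sll/sra pair produces):

int sketch_memcmp(const void *s1, const void *s2, long n)
{
        const unsigned char *a = s1, *b = s2;

        while (n-- > 0) {
                int diff = *a++ - *b++;
                if (diff != 0)
                        return (signed char)diff;       /* sign-extend the low 8 bits */
        }
        return 0;
}
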
diff --git a/arch/sparc/lib/memcpy.S b/arch/sparc/lib/memcpy.S
new file mode 100644
index 000000000..c4f0394a4
--- /dev/null
+++ b/arch/sparc/lib/memcpy.S
@@ -0,0 +1,364 @@
+/* memcpy.S: Sparc optimized memcpy code.
+ *
+ * Copyright(C) 1995 Linus Torvalds
+ * Copyright(C) 1996 David S. Miller
+ * Copyright(C) 1996 Eddie C. Dost
+ * Copyright(C) 1996 Jakub Jelinek
+ *
+ * derived from:
+ * e-mail between David and Eddie.
+ */
+
+#include <asm/cprefix.h>
+#include <asm/ptrace.h>
+
+/* Both these macros have to start with exactly the same insn */
+#define MOVE_BIGCHUNK(src, dst, offset, t0, t1, t2, t3, t4, t5, t6, t7) \
+ ldd [%src + offset + 0x00], %t0; \
+ ldd [%src + offset + 0x08], %t2; \
+ ldd [%src + offset + 0x10], %t4; \
+ ldd [%src + offset + 0x18], %t6; \
+ st %t0, [%dst + offset + 0x00]; \
+ st %t1, [%dst + offset + 0x04]; \
+ st %t2, [%dst + offset + 0x08]; \
+ st %t3, [%dst + offset + 0x0c]; \
+ st %t4, [%dst + offset + 0x10]; \
+ st %t5, [%dst + offset + 0x14]; \
+ st %t6, [%dst + offset + 0x18]; \
+ st %t7, [%dst + offset + 0x1c];
+
+#define MOVE_BIGALIGNCHUNK(src, dst, offset, t0, t1, t2, t3, t4, t5, t6, t7) \
+ ldd [%src + offset + 0x00], %t0; \
+ ldd [%src + offset + 0x08], %t2; \
+ ldd [%src + offset + 0x10], %t4; \
+ ldd [%src + offset + 0x18], %t6; \
+ std %t0, [%dst + offset + 0x00]; \
+ std %t2, [%dst + offset + 0x08]; \
+ std %t4, [%dst + offset + 0x10]; \
+ std %t6, [%dst + offset + 0x18];
+
+#define MOVE_LASTCHUNK(src, dst, offset, t0, t1, t2, t3) \
+ ldd [%src - offset - 0x10], %t0; \
+ ldd [%src - offset - 0x08], %t2; \
+ st %t0, [%dst - offset - 0x10]; \
+ st %t1, [%dst - offset - 0x0c]; \
+ st %t2, [%dst - offset - 0x08]; \
+ st %t3, [%dst - offset - 0x04];
+
+#define MOVE_HALFCHUNK(src, dst, offset, t0, t1, t2, t3) \
+ lduh [%src + offset + 0x00], %t0; \
+ lduh [%src + offset + 0x02], %t1; \
+ lduh [%src + offset + 0x04], %t2; \
+ lduh [%src + offset + 0x06], %t3; \
+ sth %t0, [%dst + offset + 0x00]; \
+ sth %t1, [%dst + offset + 0x02]; \
+ sth %t2, [%dst + offset + 0x04]; \
+ sth %t3, [%dst + offset + 0x06];
+
+#define MOVE_SHORTCHUNK(src, dst, offset, t0, t1) \
+ ldub [%src - offset - 0x02], %t0; \
+ ldub [%src - offset - 0x01], %t1; \
+ stb %t0, [%dst - offset - 0x02]; \
+ stb %t1, [%dst - offset - 0x01];
+
+ .text
+ .align 4
+
+ .globl C_LABEL(__memcpy), C_LABEL(memcpy), C_LABEL(bcopy)
+ .globl C_LABEL(amemmove), C_LABEL(memmove)
+C_LABEL(bcopy):
+ mov %o0, %o3
+ mov %o1, %o0
+ mov %o3, %o1
+C_LABEL(amemmove):
+C_LABEL(memmove):
+/* This should be kept as optimized as possible */
+ cmp %o0, %o1
+ bleu 1f
+ xor %o0, %o1, %o4
+
+ add %o1, %o2, %o3
+ cmp %o3, %o0
+ bleu 2f
+ andcc %o4, 3, %g0
+
+/* But I think from now on, we can hold on. Or tell me, is memmoving
+ * overlapping regions such a nice game? */
+
+ mov %o0, %g1
+ add %o1, %o2, %o1
+ add %o0, %o2, %o0
+ sub %o1, 1, %o1
+ sub %o0, 1, %o0
+
+reverse_bytes:
+ ldub [%o1], %o4
+ subcc %o2, 1, %o2
+ stb %o4, [%o0]
+ sub %o1, 1, %o1
+ bne reverse_bytes
+ sub %o0, 1, %o0
+
+ retl
+ mov %g1, %o0
+
+/* And here start optimizing again... */
+
+dword_align:
+ andcc %o1, 1, %g0
+ be 4f
+ andcc %o1, 2, %g0
+
+ ldub [%o1], %g2
+ add %o1, 1, %o1
+ stb %g2, [%o0]
+ sub %o2, 1, %o2
+ bne 3f
+ add %o0, 1, %o0
+
+ lduh [%o1], %g2
+ add %o1, 2, %o1
+ sth %g2, [%o0]
+ sub %o2, 2, %o2
+ b 3f
+ add %o0, 2, %o0
+4:
+ lduh [%o1], %g2
+ add %o1, 2, %o1
+ sth %g2, [%o0]
+ sub %o2, 2, %o2
+ b 3f
+ add %o0, 2, %o0
+
+C_LABEL(__memcpy):
+C_LABEL(memcpy): /* %o0=dst %o1=src %o2=len */
+ xor %o0, %o1, %o4
+1:
+ andcc %o4, 3, %o5
+2:
+ bne cannot_optimize
+ cmp %o2, 15
+
+ bleu short_aligned_end
+ andcc %o1, 3, %g0
+
+ bne dword_align
+3:
+ andcc %o1, 4, %g0
+
+ be 2f
+ mov %o2, %g1
+
+ ld [%o1], %o4
+ sub %g1, 4, %g1
+ st %o4, [%o0]
+ add %o1, 4, %o1
+ add %o0, 4, %o0
+2:
+ andcc %g1, 0xffffff80, %g7
+ be 3f
+ andcc %o0, 4, %g0
+
+ be ldd_std + 4
+5:
+ MOVE_BIGCHUNK(o1, o0, 0x00, o2, o3, o4, o5, g2, g3, g4, g5)
+ MOVE_BIGCHUNK(o1, o0, 0x20, o2, o3, o4, o5, g2, g3, g4, g5)
+ MOVE_BIGCHUNK(o1, o0, 0x40, o2, o3, o4, o5, g2, g3, g4, g5)
+ MOVE_BIGCHUNK(o1, o0, 0x60, o2, o3, o4, o5, g2, g3, g4, g5)
+ subcc %g7, 128, %g7
+ add %o1, 128, %o1
+ bne 5b
+ add %o0, 128, %o0
+3:
+ andcc %g1, 0x70, %g7
+ be memcpy_table_end
+ andcc %g1, 8, %g0
+
+ sethi %hi(memcpy_table_end), %o5
+ srl %g7, 1, %o4
+ add %g7, %o4, %o4
+ add %o1, %g7, %o1
+ sub %o5, %o4, %o5
+ jmpl %o5 + %lo(memcpy_table_end), %g0
+ add %o0, %g7, %o0
+
+memcpy_table:
+ MOVE_LASTCHUNK(o1, o0, 0x60, g2, g3, g4, g5)
+ MOVE_LASTCHUNK(o1, o0, 0x50, g2, g3, g4, g5)
+ MOVE_LASTCHUNK(o1, o0, 0x40, g2, g3, g4, g5)
+ MOVE_LASTCHUNK(o1, o0, 0x30, g2, g3, g4, g5)
+ MOVE_LASTCHUNK(o1, o0, 0x20, g2, g3, g4, g5)
+ MOVE_LASTCHUNK(o1, o0, 0x10, g2, g3, g4, g5)
+ MOVE_LASTCHUNK(o1, o0, 0x00, g2, g3, g4, g5)
+
+memcpy_table_end:
+ be memcpy_last7
+ andcc %g1, 4, %g0
+
+ ldd [%o1], %g2
+ add %o0, 8, %o0
+ add %o1, 8, %o1
+ st %g2, [%o0 - 0x08]
+ st %g3, [%o0 - 0x04]
+memcpy_last7:
+ be 1f
+ andcc %g1, 2, %g0
+
+ ld [%o1], %g2
+ add %o1, 4, %o1
+ st %g2, [%o0]
+ add %o0, 4, %o0
+1:
+ be 1f
+ andcc %g1, 1, %g0
+
+ lduh [%o1], %g2
+ add %o1, 2, %o1
+ sth %g2, [%o0]
+ add %o0, 2, %o0
+1:
+ be 1f
+ nop
+
+ ldub [%o1], %g2
+ stb %g2, [%o0]
+1:
+ retl
+ nop
+
+ /* Placed here for cache reasons. */
+ .globl C_LABEL(__copy_to_user), C_LABEL(__copy_from_user)
+C_LABEL(__copy_to_user):
+ b copy_user_common
+ st %o0, [%g6 + THREAD_EX_ADDR]
+
+C_LABEL(__copy_from_user):
+ st %o1, [%g6 + THREAD_EX_ADDR]
+
+copy_user_common:
+ ld [%g6 + THREAD_EX_COUNT], %g1
+ set copy_user_failure, %g2
+ add %g1, 1, %g1
+ st %o7, [%g6 + THREAD_EX_PC]
+ st %g1, [%g6 + THREAD_EX_COUNT]
+ call C_LABEL(__memcpy)
+ st %g2, [%g6 + THREAD_EX_EXPC]
+
+copy_user_success:
+ ldd [%g6 + THREAD_EX_COUNT], %g2
+ mov 0, %o0
+ sub %g2, 1, %g1
+ jmpl %g3 + 0x8, %g0
+ st %g1, [%g6 + THREAD_EX_COUNT]
+
+copy_user_failure:
+ jmpl %g3 + 0x8, %g0
+ mov %g2, %o0
+
+ldd_std:
+ MOVE_BIGALIGNCHUNK(o1, o0, 0x00, o2, o3, o4, o5, g2, g3, g4, g5)
+ MOVE_BIGALIGNCHUNK(o1, o0, 0x20, o2, o3, o4, o5, g2, g3, g4, g5)
+ MOVE_BIGALIGNCHUNK(o1, o0, 0x40, o2, o3, o4, o5, g2, g3, g4, g5)
+ MOVE_BIGALIGNCHUNK(o1, o0, 0x60, o2, o3, o4, o5, g2, g3, g4, g5)
+ subcc %g7, 128, %g7
+ add %o1, 128, %o1
+ bne ldd_std
+ add %o0, 128, %o0
+
+ andcc %g1, 0x70, %g7
+ be memcpy_table_end
+ andcc %g1, 8, %g0
+
+ sethi %hi(memcpy_table_end), %o5
+ srl %g7, 1, %o4
+ add %g7, %o4, %o4
+ add %o1, %g7, %o1
+ sub %o5, %o4, %o5
+ jmpl %o5 + %lo(memcpy_table_end), %g0
+ add %o0, %g7, %o0
+
+cannot_optimize:
+ bleu short_end
+ cmp %o5, 2
+
+ bne byte_chunk
+ and %o2, 0xfffffff0, %o3
+
+ andcc %o1, 1, %g0
+ be 1f
+ nop
+
+ ldub [%o1], %g2
+ add %o1, 1, %o1
+ sub %o2, 1, %o2
+ stb %g2, [%o0]
+ andcc %o2, 0xfffffff0, %o3
+ be short_end
+ add %o0, 1, %o0
+1:
+ MOVE_HALFCHUNK(o1, o0, 0x00, g2, g3, g4, g5)
+ MOVE_HALFCHUNK(o1, o0, 0x08, g2, g3, g4, g5)
+ subcc %o3, 0x10, %o3
+ add %o1, 0x10, %o1
+ bne 1b
+ add %o0, 0x10, %o0
+ b 2f
+ and %o2, 0xe, %o3
+
+byte_chunk:
+ MOVE_SHORTCHUNK(o1, o0, -0x02, g2, g3)
+ MOVE_SHORTCHUNK(o1, o0, -0x04, g2, g3)
+ MOVE_SHORTCHUNK(o1, o0, -0x06, g2, g3)
+ MOVE_SHORTCHUNK(o1, o0, -0x08, g2, g3)
+ MOVE_SHORTCHUNK(o1, o0, -0x0a, g2, g3)
+ MOVE_SHORTCHUNK(o1, o0, -0x0c, g2, g3)
+ MOVE_SHORTCHUNK(o1, o0, -0x0e, g2, g3)
+ MOVE_SHORTCHUNK(o1, o0, -0x10, g2, g3)
+ subcc %o3, 0x10, %o3
+ add %o1, 0x10, %o1
+ bne byte_chunk
+ add %o0, 0x10, %o0
+
+short_end:
+ and %o2, 0xe, %o3
+2:
+ sethi %hi(short_table_end), %o5
+ sll %o3, 3, %o4
+ add %o0, %o3, %o0
+ sub %o5, %o4, %o5
+ add %o1, %o3, %o1
+ jmpl %o5 + %lo(short_table_end), %g0
+ andcc %o2, 1, %g0
+
+ MOVE_SHORTCHUNK(o1, o0, 0x0c, g2, g3)
+ MOVE_SHORTCHUNK(o1, o0, 0x0a, g2, g3)
+ MOVE_SHORTCHUNK(o1, o0, 0x08, g2, g3)
+ MOVE_SHORTCHUNK(o1, o0, 0x06, g2, g3)
+ MOVE_SHORTCHUNK(o1, o0, 0x04, g2, g3)
+ MOVE_SHORTCHUNK(o1, o0, 0x02, g2, g3)
+ MOVE_SHORTCHUNK(o1, o0, 0x00, g2, g3)
+short_table_end:
+ be 1f
+ nop
+ ldub [%o1], %g2
+ stb %g2, [%o0]
+1:
+ retl
+ nop
+
+short_aligned_end:
+ bne short_end
+ andcc %o2, 8, %g0
+
+ be 1f
+ andcc %o2, 4, %g0
+
+ ld [%o1 + 0x00], %g2
+ ld [%o1 + 0x04], %g3
+ add %o1, 8, %o1
+ st %g2, [%o0 + 0x00]
+ st %g3, [%o0 + 0x04]
+ add %o0, 8, %o0
+1:
+ b memcpy_last7
+ mov %o2, %g1
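
The memmove/bcopy entry above decides between the optimized forward copy and a simple backward byte copy; bcopy merely swaps the first two arguments before falling through. In C terms (a sketch only — the real forward path is the unrolled memcpy in this file):

#include <string.h>

void *sketch_memmove(void *dst, const void *src, unsigned long len)
{
        unsigned char *d = dst;
        const unsigned char *s = src;

        if (d <= s || s + len <= d)             /* no harmful overlap: forward copy is safe */
                return memcpy(dst, src, len);

        d += len;                               /* dst overlaps the tail of src: go backwards */
        s += len;
        while (len-- > 0)
                *--d = *--s;
        return dst;
}
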
diff --git a/arch/sparc/lib/memscan.S b/arch/sparc/lib/memscan.S
new file mode 100644
index 000000000..f334751c2
--- /dev/null
+++ b/arch/sparc/lib/memscan.S
@@ -0,0 +1,135 @@
+/* $Id: memscan.S,v 1.4 1996/09/08 02:01:20 davem Exp $
+ * memscan.S: Optimized memscan for the Sparc.
+ *
+ * Copyright (C) 1996 David S. Miller (davem@caip.rutgers.edu)
+ */
+
+#include <asm/cprefix.h>
+
+/* In essence, this is just a fancy strlen. */
+
+#define LO_MAGIC 0x01010101
+#define HI_MAGIC 0x80808080
+
+ .text
+ .align 4
+ .globl C_LABEL(__memscan_zero), C_LABEL(__memscan_generic)
+ .globl C_LABEL(memscan)
+C_LABEL(__memscan_zero):
+ /* %o0 = addr, %o1 = size */
+ cmp %o1, 0
+ bne,a 1f
+ andcc %o0, 3, %g0
+
+ retl
+ nop
+
+1:
+ be mzero_scan_word
+ sethi %hi(HI_MAGIC), %g2
+
+ ldsb [%o0], %g3
+mzero_still_not_word_aligned:
+ cmp %g3, 0
+ bne 1f
+ add %o0, 1, %o0
+
+ retl
+ sub %o0, 1, %o0
+
+1:
+ subcc %o1, 1, %o1
+ bne,a 1f
+ andcc %o0, 3, %g0
+
+ retl
+ nop
+
+1:
+ bne,a mzero_still_not_word_aligned
+ ldsb [%o0], %g3
+
+ sethi %hi(HI_MAGIC), %g2
+mzero_scan_word:
+ or %g2, %lo(HI_MAGIC), %o3
+ sethi %hi(LO_MAGIC), %g3
+ or %g3, %lo(LO_MAGIC), %o2
+mzero_next_word:
+ ld [%o0], %g2
+mzero_next_word_preloaded:
+ sub %g2, %o2, %g2
+mzero_next_word_preloaded_next:
+ andcc %g2, %o3, %g0
+ bne mzero_byte_zero
+ add %o0, 4, %o0
+
+mzero_check_out_of_fuel:
+ subcc %o1, 4, %o1
+ bg,a 1f
+ ld [%o0], %g2
+
+ retl
+ nop
+
+1:
+ b mzero_next_word_preloaded_next
+ sub %g2, %o2, %g2
+
+ /* Check every byte. */
+mzero_byte_zero:
+ ldsb [%o0 - 4], %g2
+ cmp %g2, 0
+ bne mzero_byte_one
+ sub %o0, 4, %g3
+
+ retl
+ mov %g3, %o0
+
+mzero_byte_one:
+ ldsb [%o0 - 3], %g2
+ cmp %g2, 0
+ bne,a mzero_byte_two_and_three
+ ldsb [%o0 - 2], %g2
+
+ retl
+ sub %o0, 3, %o0
+
+mzero_byte_two_and_three:
+ cmp %g2, 0
+ bne,a 1f
+ ldsb [%o0 - 1], %g2
+
+ retl
+ sub %o0, 2, %o0
+
+1:
+ cmp %g2, 0
+ bne,a mzero_next_word_preloaded
+ ld [%o0], %g2
+
+ retl
+ sub %o0, 1, %o0
+
+mzero_found_it:
+ retl
+ sub %o0, 2, %o0
+
+C_LABEL(memscan):
+C_LABEL(__memscan_generic):
+ /* %o0 = addr, %o1 = c, %o2 = size */
+ cmp %o2, 0
+ bne,a 0f
+ ldub [%o0], %g2
+
+ b,a 2f
+1:
+ ldub [%o0], %g2
+0:
+ cmp %g2, %o1
+ be 2f
+ addcc %o2, -1, %o2
+ bne 1b
+ add %o0, 1, %o0
+2:
+ retl
+ nop
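
Delay-slot games aside, __memscan_generic at the bottom behaves like this simple loop (a sketch): it returns a pointer to the first byte equal to c, or to the end of the area when there is no match. __memscan_zero is the same search specialised for c == 0, using the word-at-a-time magic-constant test that strlen.S also relies on.

void *sketch_memscan(void *addr, int c, unsigned long size)
{
        unsigned char *p = addr;

        while (size-- > 0) {
                if (*p == (unsigned char)c)
                        return p;               /* first match */
                p++;
        }
        return p;                               /* no match: one past the last byte */
}
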
diff --git a/arch/sparc/lib/memset.S b/arch/sparc/lib/memset.S
new file mode 100644
index 000000000..95691debb
--- /dev/null
+++ b/arch/sparc/lib/memset.S
@@ -0,0 +1,166 @@
+/* linux/arch/sparc/lib/memset.S: Sparc optimized memset and bzero code
+ * Hand optimized from GNU libc's memset
+ * Copyright (C) 1991,1996 Free Software Foundation
+ * Copyright (C) 1996 Jakub Jelinek (jj@sunsite.mff.cuni.cz)
+ * Copyright (C) 1996 David S. Miller (davem@caip.rutgers.edu)
+ */
+
+#include <asm/cprefix.h>
+#include <asm/ptrace.h>
+
+#define HANDLE_UNALIGNED 1
+
+ /* Store 64 bytes at (BASE + OFFSET) using value SOURCE. */
+#define ZERO_BIG_BLOCK(base, offset, source) \
+ std source, [base + offset + 0x00]; \
+ std source, [base + offset + 0x08]; \
+ std source, [base + offset + 0x10]; \
+ std source, [base + offset + 0x18]; \
+ std source, [base + offset + 0x20]; \
+ std source, [base + offset + 0x28]; \
+ std source, [base + offset + 0x30]; \
+ std source, [base + offset + 0x38];
+
+#define ZERO_LAST_BLOCKS(base, offset, source) \
+ std source, [base - offset - 0x38]; \
+ std source, [base - offset - 0x30]; \
+ std source, [base - offset - 0x28]; \
+ std source, [base - offset - 0x20]; \
+ std source, [base - offset - 0x18]; \
+ std source, [base - offset - 0x10]; \
+ std source, [base - offset - 0x08]; \
+ std source, [base - offset - 0x00];
+
+ .text
+ .align 4
+
+ .globl C_LABEL(__bzero), C_LABEL(__memset), C_LABEL(memset)
+C_LABEL(__memset):
+C_LABEL(memset):
+ and %o1, 0xff, %g3
+ sll %g3, 8, %g2
+ or %g3, %g2, %g3
+ sll %g3, 16, %g2
+ or %g3, %g2, %g3
+ b 1f
+ mov %o2, %o1
+
+#if HANDLE_UNALIGNED
+/* As this is highly unprobable, we optimize the other case (4 aligned)
+ * Define HANDLE_UNALIGNED to 0, if all the alignment work is done by
+ * the trap. Then we have to hope nobody will memset something unaligned
+ * with large counts, as this would lead to a lot of traps...
+ */
+3:
+ cmp %o2, 3
+ be 2f
+ stb %g3, [%o0]
+
+ cmp %o2, 2
+ be 2f
+ stb %g3, [%o0 + 0x01]
+
+ stb %g3, [%o0 + 0x02]
+2:
+ sub %o2, 4, %o2
+ add %o1, %o2, %o1
+ b 4f
+ sub %o0, %o2, %o0
+#endif /* HANDLE_UNALIGNED */
+
+ .globl C_LABEL(__clear_user)
+C_LABEL(__clear_user):
+ st %o0, [%g6 + THREAD_EX_ADDR]
+ ld [%g6 + THREAD_EX_COUNT], %g1
+ set clear_user_failure, %g2
+ add %g1, 1, %g1
+ st %o7, [%g6 + THREAD_EX_PC]
+ st %g1, [%g6 + THREAD_EX_COUNT]
+ call C_LABEL(__bzero)
+ st %g2, [%g6 + THREAD_EX_EXPC]
+
+clear_user_success:
+ ldd [%g6 + THREAD_EX_COUNT], %g2
+ mov 0, %o0
+ sub %g2, 1, %g1
+ jmpl %g3 + 0x8, %g0
+ st %g1, [%g6 + THREAD_EX_COUNT]
+
+clear_user_failure:
+ jmpl %g3 + 0x8, %g0
+ mov %g2, %o0
+
+C_LABEL(__bzero):
+ mov %g0, %g3
+1:
+ cmp %o1, 7
+ bleu 7f
+ mov %o0, %g1
+
+#if HANDLE_UNALIGNED
+ andcc %o0, 3, %o2
+ bne 3b
+#endif /* HANDLE_UNALIGNED */
+4:
+ andcc %o0, 4, %g0
+
+ be 2f
+ mov %g3, %g2
+
+ st %g3, [%o0]
+ sub %o1, 4, %o1
+ add %o0, 4, %o0
+2:
+ andcc %o1, 0xffffff80, %o3 ! Now everything is 8 aligned and o1 is len to run
+ be 9f
+ andcc %o1, 0x78, %o2
+4:
+ ZERO_BIG_BLOCK(%o0, 0x00, %g2)
+ subcc %o3, 128, %o3
+ ZERO_BIG_BLOCK(%o0, 0x40, %g2)
+ bne 4b
+ add %o0, 128, %o0
+
+ orcc %o2, %g0, %g0
+9:
+ be 6f
+ andcc %o1, 7, %o1
+
+ srl %o2, 1, %o3
+ set bzero_table + 64, %o4
+ sub %o4, %o3, %o4
+ jmp %o4
+ add %o0, %o2, %o0
+
+bzero_table:
+ ZERO_LAST_BLOCKS(%o0, 0x48, %g2)
+ ZERO_LAST_BLOCKS(%o0, 0x08, %g2)
+
+6:
+ be 8f
+ andcc %o1, 4, %g0
+
+ be 1f
+ andcc %o1, 2, %g0
+
+ st %g3, [%o0]
+ add %o0, 4, %o0
+1:
+ be 1f
+ andcc %o1, 1, %g0
+
+ sth %g3, [%o0]
+ add %o0, 2, %o0
+1:
+ bne,a 8f
+ stb %g3, [%o0]
+8:
+ retl
+ mov %g1,%o0
+
+/* Don't care about alignment here. It is highly
+ * unprobable and at most two traps may happen
+ */
+7:
+ b 6b
+ orcc %o1, 0, %g0
diff --git a/arch/sparc/lib/memset.c b/arch/sparc/lib/memset.c
new file mode 100644
index 000000000..1e81dff49
--- /dev/null
+++ b/arch/sparc/lib/memset.c
@@ -0,0 +1,71 @@
+/* linux/arch/sparc/lib/memset.c
+ *
+ * This is from GNU libc.
+ */
+
+#include <linux/types.h>
+
+#define op_t unsigned long int
+#define OPSIZ (sizeof(op_t))
+
+typedef unsigned char byte;
+
+void *memset(void *dstpp, char c, size_t len)
+{
+ long int dstp = (long int) dstpp;
+
+ if (len >= 8) {
+ size_t xlen;
+ op_t cccc;
+
+ cccc = (unsigned char) c;
+ cccc |= cccc << 8;
+ cccc |= cccc << 16;
+
+ /* There are at least some bytes to set.
+ No need to test for LEN == 0 in this alignment loop. */
+ while (dstp % OPSIZ != 0) {
+ ((byte *) dstp)[0] = c;
+ dstp += 1;
+ len -= 1;
+ }
+
+ /* Write 8 `op_t' per iteration until less
+ * than 8 `op_t' remain.
+ */
+ xlen = len / (OPSIZ * 8);
+ while (xlen > 0) {
+ ((op_t *) dstp)[0] = cccc;
+ ((op_t *) dstp)[1] = cccc;
+ ((op_t *) dstp)[2] = cccc;
+ ((op_t *) dstp)[3] = cccc;
+ ((op_t *) dstp)[4] = cccc;
+ ((op_t *) dstp)[5] = cccc;
+ ((op_t *) dstp)[6] = cccc;
+ ((op_t *) dstp)[7] = cccc;
+ dstp += 8 * OPSIZ;
+ xlen -= 1;
+ }
+ len %= OPSIZ * 8;
+
+ /* Write 1 `op_t' per iteration until less than
+ * OPSIZ bytes remain.
+ */
+ xlen = len / OPSIZ;
+ while (xlen > 0) {
+ ((op_t *) dstp)[0] = cccc;
+ dstp += OPSIZ;
+ xlen -= 1;
+ }
+ len %= OPSIZ;
+ }
+
+ /* Write the last few bytes. */
+ while (len > 0) {
+ ((byte *) dstp)[0] = c;
+ dstp += 1;
+ len -= 1;
+ }
+
+ return dstpp;
+}
diff --git a/arch/sparc/lib/mul.S b/arch/sparc/lib/mul.S
index e6d78f85f..83dffbc2f 100644
--- a/arch/sparc/lib/mul.S
+++ b/arch/sparc/lib/mul.S
@@ -1,4 +1,5 @@
-/* mul.S: This routine was taken from glibc-1.09 and is covered
+/* $Id: mul.S,v 1.4 1996/09/30 02:22:32 davem Exp $
+ * mul.S: This routine was taken from glibc-1.09 and is covered
* by the GNU Library General Public License Version 2.
*/
@@ -19,7 +20,7 @@
mov %o0, %y ! multiplier -> Y
andncc %o0, 0xfff, %g0 ! test bits 12..31
be Lmul_shortway ! if zero, can do it the short way
- andcc %g0, %g0, %o4 ! zero the partial product and clear N and V
+ andcc %g0, %g0, %o4 ! zero the partial product and clear N and V
/*
* Long multiply. 32 steps, followed by a final shift step.
@@ -65,23 +66,23 @@
#if 0
tst %o0
bge 1f
- rd %y, %o0
+ rd %y, %o0
! %o0 was indeed negative; fix upper 32 bits of result by subtracting
! %o1 (i.e., return %o4 - %o1 in %o1).
retl
- sub %o4, %o1, %o1
+ sub %o4, %o1, %o1
1:
retl
- mov %o4, %o1
+ mov %o4, %o1
#else
/* Faster code adapted from tege@sics.se's code for umul.S. */
sra %o0, 31, %o2 ! make mask from sign bit
and %o1, %o2, %o2 ! %o2 = 0 or %o1, depending on sign of %o0
rd %y, %o0 ! get lower half of product
retl
- sub %o4, %o2, %o1 ! subtract compensation
+ sub %o4, %o2, %o1 ! subtract compensation
! and put upper half in place
#endif
@@ -124,4 +125,11 @@ Lmul_shortway:
srl %o5, 20, %o5 ! shift low bits right 20, zero fill at left
or %o5, %o0, %o0 ! construct low part of result
retl
- sra %o4, 20, %o1 ! ... and extract high part of result
+ sra %o4, 20, %o1 ! ... and extract high part of result
+
+ .globl .mul_patch
+.mul_patch:
+ smul %o0, %o1, %o0
+ retl
+ rd %y, %o1
+ nop
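
The new .mul_patch at the end is evidently meant to be patched in on CPUs with a hardware multiplier: smul leaves the low 32 bits in %o0 and the high 32 bits in %y, read back into %o1. The sign fix-up in the "faster code" above does in two instructions what the commented-out branch did: subtract the multiplicand from the raw high word when the multiplier was negative. A sketch of both ideas in C (assuming arithmetic >> of a negative int, as sra gives; names are illustrative):

/* What .mul returns, as a widening signed multiply: low half in %o0,
 * high half in %o1. */
long long sketch_mul(int a, int b)
{
        return (long long)a * (long long)b;
}

/* The branch-free correction mirroring the sra/and/sub sequence. */
unsigned int sketch_fix_high(unsigned int raw_high, int a, int b)
{
        unsigned int mask = (unsigned int)(a >> 31);    /* 0, or all ones if a < 0 */
        return raw_high - (mask & (unsigned int)b);     /* subtract b only when a < 0 */
}
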
diff --git a/arch/sparc/lib/rem.S b/arch/sparc/lib/rem.S
index 3c0cc579b..44508148d 100644
--- a/arch/sparc/lib/rem.S
+++ b/arch/sparc/lib/rem.S
@@ -1,4 +1,5 @@
-/* rem.S: This routine was taken from glibc-1.09 and is covered
+/* $Id: rem.S,v 1.7 1996/09/30 02:22:34 davem Exp $
+ * rem.S: This routine was taken from glibc-1.09 and is covered
* by the GNU Library General Public License Version 2.
*/
@@ -46,13 +47,14 @@
! compute sign of result; if neither is negative, no problem
orcc %o1, %o0, %g0 ! either negative?
bge 2f ! no, go do the divide
- xor %o1, %o0, %g6 ! compute sign in any case
+ mov %o0, %g2 ! compute sign in any case
+
tst %o1
bge 1f
- tst %o0
+ tst %o0
! %o1 is definitely negative; %o0 might also be negative
bge 2f ! if %o0 not negative...
- sub %g0, %o1, %o1 ! in any case, make %o1 nonneg
+ sub %g0, %o1, %o1 ! in any case, make %o1 nonneg
1: ! %o0 is negative, %o1 is nonnegative
sub %g0, %o0, %o0 ! make %o0 nonnegative
2:
@@ -60,22 +62,24 @@
! Ready to divide. Compute size of quotient; scale comparand.
orcc %o1, %g0, %o5
bne 1f
- mov %o0, %o3
+ mov %o0, %o3
! Divide by zero trap. If it returns, return 0 (about as
! wrong as possible, but that is what SunOS does...).
ta ST_DIV0
retl
- clr %o0
+ clr %o0
1:
cmp %o3, %o5 ! if %o1 exceeds %o0, done
blu Lgot_result ! (and algorithm fails otherwise)
- clr %o2
+ clr %o2
+
sethi %hi(1 << (32 - 4 - 1)), %g1
+
cmp %o3, %g1
blu Lnot_really_big
- clr %o4
+ clr %o4
! Here the dividend is >= 2**(31-N) or so. We must be careful here,
! as our usual N-at-a-shot divide step will cause overflow and havoc.
@@ -85,15 +89,19 @@
1:
cmp %o5, %g1
bgeu 3f
- mov 1, %g7
+ mov 1, %g7
+
sll %o5, 4, %o5
+
b 1b
- add %o4, 1, %o4
+ add %o4, 1, %o4
! Now compute %g7.
- 2: addcc %o5, %o5, %o5
+ 2:
+ addcc %o5, %o5, %o5
+
bcc Lnot_too_big
- add %g7, 1, %g7
+ add %g7, 1, %g7
! We get here if the %o1 overflowed while shifting.
! This means that %o3 has the high-order bit set.
@@ -101,15 +109,18 @@
sll %g1, 4, %g1 ! high order bit
srl %o5, 1, %o5 ! rest of %o5
add %o5, %g1, %o5
+
b Ldo_single_div
- sub %g7, 1, %g7
+ sub %g7, 1, %g7
Lnot_too_big:
- 3: cmp %o5, %o3
+ 3:
+ cmp %o5, %o3
blu 2b
- nop
+ nop
+
be Ldo_single_div
- nop
+ nop
/* NB: these are commented out in the V8-Sparc manual as well */
/* (I do not understand this) */
! %o5 > %o3: went too far: back up 1 step
@@ -126,19 +137,23 @@
Ldo_single_div:
subcc %g7, 1, %g7
bl Lend_regular_divide
- nop
+ nop
+
sub %o3, %o5, %o3
mov 1, %o2
+
b Lend_single_divloop
- nop
+ nop
Lsingle_divloop:
sll %o2, 1, %o2
+
bl 1f
- srl %o5, 1, %o5
+ srl %o5, 1, %o5
! %o3 >= 0
sub %o3, %o5, %o3
+
b 2f
- add %o2, 1, %o2
+ add %o2, 1, %o2
1: ! %o3 < 0
add %o3, %o5, %o3
sub %o2, 1, %o2
@@ -146,7 +161,8 @@
Lend_single_divloop:
subcc %g7, 1, %g7
bge Lsingle_divloop
- tst %o3
+ tst %o3
+
b,a Lend_regular_divide
Lnot_really_big:
@@ -154,206 +170,213 @@ Lnot_really_big:
sll %o5, 4, %o5
cmp %o5, %o3
bleu 1b
- addcc %o4, 1, %o4
+ addcc %o4, 1, %o4
be Lgot_result
- sub %o4, 1, %o4
+ sub %o4, 1, %o4
tst %o3 ! set up for initial iteration
Ldivloop:
sll %o2, 4, %o2
! depth 1, accumulated bits 0
bl L.1.16
- srl %o5,1,%o5
+ srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
! depth 2, accumulated bits 1
bl L.2.17
- srl %o5,1,%o5
+ srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
! depth 3, accumulated bits 3
bl L.3.19
- srl %o5,1,%o5
+ srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
! depth 4, accumulated bits 7
bl L.4.23
- srl %o5,1,%o5
+ srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
- b 9f
- add %o2, (7*2+1), %o2
+
+ b 9f
+ add %o2, (7*2+1), %o2
L.4.23:
! remainder is negative
addcc %o3,%o5,%o3
- b 9f
- add %o2, (7*2-1), %o2
-
+ b 9f
+ add %o2, (7*2-1), %o2
L.3.19:
! remainder is negative
addcc %o3,%o5,%o3
! depth 4, accumulated bits 5
bl L.4.21
- srl %o5,1,%o5
+ srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
- b 9f
- add %o2, (5*2+1), %o2
+ b 9f
+ add %o2, (5*2+1), %o2
L.4.21:
! remainder is negative
addcc %o3,%o5,%o3
- b 9f
- add %o2, (5*2-1), %o2
-
-
+ b 9f
+ add %o2, (5*2-1), %o2
L.2.17:
! remainder is negative
addcc %o3,%o5,%o3
! depth 3, accumulated bits 1
bl L.3.17
- srl %o5,1,%o5
+ srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
! depth 4, accumulated bits 3
bl L.4.19
- srl %o5,1,%o5
+ srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
- b 9f
- add %o2, (3*2+1), %o2
-
+ b 9f
+ add %o2, (3*2+1), %o2
+
L.4.19:
! remainder is negative
addcc %o3,%o5,%o3
- b 9f
- add %o2, (3*2-1), %o2
-
-
+ b 9f
+ add %o2, (3*2-1), %o2
+
L.3.17:
! remainder is negative
addcc %o3,%o5,%o3
! depth 4, accumulated bits 1
bl L.4.17
- srl %o5,1,%o5
+ srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
- b 9f
- add %o2, (1*2+1), %o2
-
+ b 9f
+ add %o2, (1*2+1), %o2
+
L.4.17:
! remainder is negative
addcc %o3,%o5,%o3
- b 9f
- add %o2, (1*2-1), %o2
-
-
-
-
+ b 9f
+ add %o2, (1*2-1), %o2
+
L.1.16:
! remainder is negative
addcc %o3,%o5,%o3
! depth 2, accumulated bits -1
bl L.2.15
- srl %o5,1,%o5
+ srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
! depth 3, accumulated bits -1
bl L.3.15
- srl %o5,1,%o5
+ srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
! depth 4, accumulated bits -1
bl L.4.15
- srl %o5,1,%o5
+ srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
- b 9f
- add %o2, (-1*2+1), %o2
-
+ b 9f
+ add %o2, (-1*2+1), %o2
+
L.4.15:
! remainder is negative
addcc %o3,%o5,%o3
- b 9f
- add %o2, (-1*2-1), %o2
-
-
+ b 9f
+ add %o2, (-1*2-1), %o2
+
L.3.15:
! remainder is negative
addcc %o3,%o5,%o3
! depth 4, accumulated bits -3
bl L.4.13
- srl %o5,1,%o5
+ srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
- b 9f
- add %o2, (-3*2+1), %o2
-
+ b 9f
+ add %o2, (-3*2+1), %o2
+
L.4.13:
! remainder is negative
addcc %o3,%o5,%o3
- b 9f
- add %o2, (-3*2-1), %o2
-
-
-
+ b 9f
+ add %o2, (-3*2-1), %o2
+
L.2.15:
! remainder is negative
addcc %o3,%o5,%o3
! depth 3, accumulated bits -3
bl L.3.13
- srl %o5,1,%o5
+ srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
! depth 4, accumulated bits -5
bl L.4.11
- srl %o5,1,%o5
+ srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
- b 9f
- add %o2, (-5*2+1), %o2
-
+ b 9f
+ add %o2, (-5*2+1), %o2
+
L.4.11:
! remainder is negative
addcc %o3,%o5,%o3
- b 9f
- add %o2, (-5*2-1), %o2
-
-
+ b 9f
+ add %o2, (-5*2-1), %o2
+
+
L.3.13:
! remainder is negative
addcc %o3,%o5,%o3
! depth 4, accumulated bits -7
bl L.4.9
- srl %o5,1,%o5
+ srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
- b 9f
- add %o2, (-7*2+1), %o2
-
+ b 9f
+ add %o2, (-7*2+1), %o2
+
L.4.9:
! remainder is negative
addcc %o3,%o5,%o3
- b 9f
- add %o2, (-7*2-1), %o2
-
-
-
-
+ b 9f
+ add %o2, (-7*2-1), %o2
+
9:
Lend_regular_divide:
subcc %o4, 1, %o4
bge Ldivloop
- tst %o3
+ tst %o3
+
bl,a Lgot_result
! non-restoring fixup here (one instruction only!)
add %o3, %o1, %o3
-
Lgot_result:
+ ! check to see if answer should be < 0
+ tst %g2
+ bl,a 1f
+ sub %g0, %o3, %o3
+1:
+ retl
+ mov %o3, %o0
+ .globl .rem_patch
+.rem_patch:
+ sra %o0, 0x1f, %o4
+ wr %o4, 0x0, %y
+ nop
+ nop
+ nop
+ sdivcc %o0, %o1, %o2
+ bvs,a 1f
+ xnor %o2, %g0, %o2
+1: smul %o2, %o1, %o2
retl
- mov %o3, %o0
+ sub %o0, %o2, %o0
+ nop
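
Two things change in rem.S: the saved sign is now the dividend's (%o0 kept in %g2, instead of the xor of both signs in %g6), because a remainder takes the sign of the dividend, and a .rem_patch sequence is added, apparently for CPUs with hardware divide, which derives the remainder from the quotient. A C sketch of that patched path (b != 0 and no INT_MIN / -1 overflow assumed):

int sketch_rem(int a, int b)
{
        int q = a / b;          /* what sdivcc computes, truncating toward zero */
        return a - q * b;       /* the smul/sub pair: remainder with the sign of a */
}

For example, sketch_rem(-7, 2) is -1 and sketch_rem(7, -2) is 1, matching the fixup at Lgot_result that negates %o3 only when the original dividend was negative.
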
diff --git a/arch/sparc/lib/sdiv.S b/arch/sparc/lib/sdiv.S
index 2fa7a9794..e0ad80b6f 100644
--- a/arch/sparc/lib/sdiv.S
+++ b/arch/sparc/lib/sdiv.S
@@ -1,4 +1,5 @@
-/* sdiv.S: This routine was taken from glibc-1.09 and is covered
+/* $Id: sdiv.S,v 1.6 1996/10/02 17:37:00 davem Exp $
+ * sdiv.S: This routine was taken from glibc-1.09 and is covered
* by the GNU Library General Public License Version 2.
*/
@@ -46,13 +47,14 @@
! compute sign of result; if neither is negative, no problem
orcc %o1, %o0, %g0 ! either negative?
bge 2f ! no, go do the divide
- xor %o1, %o0, %g6 ! compute sign in any case
+ xor %o1, %o0, %g2 ! compute sign in any case
+
tst %o1
bge 1f
- tst %o0
+ tst %o0
! %o1 is definitely negative; %o0 might also be negative
bge 2f ! if %o0 not negative...
- sub %g0, %o1, %o1 ! in any case, make %o1 nonneg
+ sub %g0, %o1, %o1 ! in any case, make %o1 nonneg
1: ! %o0 is negative, %o1 is nonnegative
sub %g0, %o0, %o0 ! make %o0 nonnegative
2:
@@ -60,22 +62,24 @@
! Ready to divide. Compute size of quotient; scale comparand.
orcc %o1, %g0, %o5
bne 1f
- mov %o0, %o3
+ mov %o0, %o3
! Divide by zero trap. If it returns, return 0 (about as
! wrong as possible, but that is what SunOS does...).
ta ST_DIV0
retl
- clr %o0
+ clr %o0
1:
cmp %o3, %o5 ! if %o1 exceeds %o0, done
blu Lgot_result ! (and algorithm fails otherwise)
- clr %o2
+ clr %o2
+
sethi %hi(1 << (32 - 4 - 1)), %g1
+
cmp %o3, %g1
blu Lnot_really_big
- clr %o4
+ clr %o4
! Here the dividend is >= 2**(31-N) or so. We must be careful here,
! as our usual N-at-a-shot divide step will cause overflow and havoc.
@@ -85,15 +89,18 @@
1:
cmp %o5, %g1
bgeu 3f
- mov 1, %g7
+ mov 1, %g7
+
sll %o5, 4, %o5
+
b 1b
- add %o4, 1, %o4
+ add %o4, 1, %o4
! Now compute %g7.
- 2: addcc %o5, %o5, %o5
+ 2:
+ addcc %o5, %o5, %o5
bcc Lnot_too_big
- add %g7, 1, %g7
+ add %g7, 1, %g7
! We get here if the %o1 overflowed while shifting.
! This means that %o3 has the high-order bit set.
@@ -101,15 +108,18 @@
sll %g1, 4, %g1 ! high order bit
srl %o5, 1, %o5 ! rest of %o5
add %o5, %g1, %o5
+
b Ldo_single_div
- sub %g7, 1, %g7
+ sub %g7, 1, %g7
Lnot_too_big:
- 3: cmp %o5, %o3
+ 3:
+ cmp %o5, %o3
blu 2b
- nop
+ nop
+
be Ldo_single_div
- nop
+ nop
/* NB: these are commented out in the V8-Sparc manual as well */
/* (I do not understand this) */
! %o5 > %o3: went too far: back up 1 step
@@ -126,19 +136,23 @@
Ldo_single_div:
subcc %g7, 1, %g7
bl Lend_regular_divide
- nop
+ nop
+
sub %o3, %o5, %o3
mov 1, %o2
+
b Lend_single_divloop
- nop
+ nop
Lsingle_divloop:
sll %o2, 1, %o2
+
bl 1f
- srl %o5, 1, %o5
+ srl %o5, 1, %o5
! %o3 >= 0
sub %o3, %o5, %o3
+
b 2f
- add %o2, 1, %o2
+ add %o2, 1, %o2
1: ! %o3 < 0
add %o3, %o5, %o3
sub %o2, 1, %o2
@@ -146,7 +160,8 @@
Lend_single_divloop:
subcc %g7, 1, %g7
bge Lsingle_divloop
- tst %o3
+ tst %o3
+
b,a Lend_regular_divide
Lnot_really_big:
@@ -154,83 +169,81 @@ Lnot_really_big:
sll %o5, 4, %o5
cmp %o5, %o3
bleu 1b
- addcc %o4, 1, %o4
+ addcc %o4, 1, %o4
+
be Lgot_result
- sub %o4, 1, %o4
+ sub %o4, 1, %o4
tst %o3 ! set up for initial iteration
Ldivloop:
sll %o2, 4, %o2
! depth 1, accumulated bits 0
bl L.1.16
- srl %o5,1,%o5
+ srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
! depth 2, accumulated bits 1
bl L.2.17
- srl %o5,1,%o5
+ srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
! depth 3, accumulated bits 3
bl L.3.19
- srl %o5,1,%o5
+ srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
! depth 4, accumulated bits 7
bl L.4.23
- srl %o5,1,%o5
+ srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
- b 9f
- add %o2, (7*2+1), %o2
-
+ b 9f
+ add %o2, (7*2+1), %o2
+
L.4.23:
! remainder is negative
addcc %o3,%o5,%o3
- b 9f
- add %o2, (7*2-1), %o2
-
-
+ b 9f
+ add %o2, (7*2-1), %o2
+
L.3.19:
! remainder is negative
addcc %o3,%o5,%o3
! depth 4, accumulated bits 5
bl L.4.21
- srl %o5,1,%o5
+ srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
- b 9f
- add %o2, (5*2+1), %o2
-
+ b 9f
+ add %o2, (5*2+1), %o2
+
L.4.21:
! remainder is negative
addcc %o3,%o5,%o3
- b 9f
- add %o2, (5*2-1), %o2
-
-
-
+ b 9f
+ add %o2, (5*2-1), %o2
+
L.2.17:
! remainder is negative
addcc %o3,%o5,%o3
! depth 3, accumulated bits 1
bl L.3.17
- srl %o5,1,%o5
+ srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
! depth 4, accumulated bits 3
bl L.4.19
- srl %o5,1,%o5
+ srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
- b 9f
- add %o2, (3*2+1), %o2
-
+ b 9f
+ add %o2, (3*2+1), %o2
+
L.4.19:
! remainder is negative
addcc %o3,%o5,%o3
- b 9f
- add %o2, (3*2-1), %o2
+ b 9f
+ add %o2, (3*2-1), %o2
L.3.17:
@@ -238,126 +251,129 @@ L.3.17:
addcc %o3,%o5,%o3
! depth 4, accumulated bits 1
bl L.4.17
- srl %o5,1,%o5
+ srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
- b 9f
- add %o2, (1*2+1), %o2
-
+ b 9f
+ add %o2, (1*2+1), %o2
+
L.4.17:
! remainder is negative
addcc %o3,%o5,%o3
- b 9f
- add %o2, (1*2-1), %o2
-
-
-
-
+ b 9f
+ add %o2, (1*2-1), %o2
+
L.1.16:
! remainder is negative
addcc %o3,%o5,%o3
! depth 2, accumulated bits -1
bl L.2.15
- srl %o5,1,%o5
+ srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
! depth 3, accumulated bits -1
bl L.3.15
- srl %o5,1,%o5
+ srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
! depth 4, accumulated bits -1
bl L.4.15
- srl %o5,1,%o5
+ srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
- b 9f
- add %o2, (-1*2+1), %o2
-
+ b 9f
+ add %o2, (-1*2+1), %o2
+
L.4.15:
! remainder is negative
addcc %o3,%o5,%o3
- b 9f
- add %o2, (-1*2-1), %o2
-
-
+ b 9f
+ add %o2, (-1*2-1), %o2
+
L.3.15:
! remainder is negative
addcc %o3,%o5,%o3
! depth 4, accumulated bits -3
bl L.4.13
- srl %o5,1,%o5
+ srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
- b 9f
- add %o2, (-3*2+1), %o2
-
+ b 9f
+ add %o2, (-3*2+1), %o2
+
L.4.13:
! remainder is negative
addcc %o3,%o5,%o3
- b 9f
- add %o2, (-3*2-1), %o2
-
-
-
+ b 9f
+ add %o2, (-3*2-1), %o2
+
L.2.15:
! remainder is negative
addcc %o3,%o5,%o3
! depth 3, accumulated bits -3
bl L.3.13
- srl %o5,1,%o5
+ srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
! depth 4, accumulated bits -5
bl L.4.11
- srl %o5,1,%o5
+ srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
- b 9f
- add %o2, (-5*2+1), %o2
-
+ b 9f
+ add %o2, (-5*2+1), %o2
+
L.4.11:
! remainder is negative
addcc %o3,%o5,%o3
- b 9f
- add %o2, (-5*2-1), %o2
-
-
+ b 9f
+ add %o2, (-5*2-1), %o2
+
L.3.13:
! remainder is negative
addcc %o3,%o5,%o3
! depth 4, accumulated bits -7
bl L.4.9
- srl %o5,1,%o5
+ srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
- b 9f
- add %o2, (-7*2+1), %o2
-
+ b 9f
+ add %o2, (-7*2+1), %o2
+
L.4.9:
! remainder is negative
addcc %o3,%o5,%o3
- b 9f
- add %o2, (-7*2-1), %o2
-
-
-
-
+ b 9f
+ add %o2, (-7*2-1), %o2
+
9:
Lend_regular_divide:
subcc %o4, 1, %o4
bge Ldivloop
- tst %o3
+ tst %o3
+
bl,a Lgot_result
! non-restoring fixup here (one instruction only!)
sub %o2, 1, %o2
-
Lgot_result:
! check to see if answer should be < 0
- tst %g6
+ tst %g2
bl,a 1f
- sub %g0, %o2, %o2
+ sub %g0, %o2, %o2
1:
retl
- mov %o2, %o0
+ mov %o2, %o0
+
+ .globl .div_patch
+.div_patch:
+ sra %o0, 0x1f, %o2
+ wr %o2, 0x0, %y
+ nop
+ nop
+ nop
+ sdivcc %o0, %o1, %o0
+ bvs,a 1f
+ xnor %o0, %g0, %o0
+1: retl
+ nop
diff --git a/arch/sparc/lib/strlen.S b/arch/sparc/lib/strlen.S
new file mode 100644
index 000000000..95321d4c5
--- /dev/null
+++ b/arch/sparc/lib/strlen.S
@@ -0,0 +1,88 @@
+/* strlen.S: Sparc optimized strlen().
+ *
+ * This was hand optimized by davem@caip.rutgers.edu from
+ * the C-code in GNU-libc.
+ */
+
+#include <asm/cprefix.h>
+
+#define LO_MAGIC 0x01010101
+#define HI_MAGIC 0x80808080
+
+ .align 4
+ .global C_LABEL(strlen)
+C_LABEL(strlen):
+ mov %o0, %o1
+ andcc %o0, 3, %g0 ! and with %o0 so no dependency problems
+ be scan_words
+ sethi %hi(HI_MAGIC), %g2 ! common case and most Sparcs predict taken
+
+ ldsb [%o0], %g2
+still_not_word_aligned:
+ cmp %g2, 0
+ bne,a 1f
+ add %o0, 1, %o0
+
+ /* Ok, so there are tons of quick interlocks above for the
+ * < 4 length string unaligned... not too common so I'm not
+ * very concerned.
+ */
+ retl
+ sub %o0, %o1, %o0
+
+1:
+ andcc %o0, 3, %g0
+ bne,a still_not_word_aligned
+ ldsb [%o0], %g2
+
+ /* HyperSparc executes each sethi/or pair in 1 cycle. */
+ sethi %hi(HI_MAGIC), %g2
+scan_words:
+ or %g2, %lo(HI_MAGIC), %o3
+ sethi %hi(LO_MAGIC), %g3
+ or %g3, %lo(LO_MAGIC), %o2
+next_word:
+ ld [%o0], %g2 ! no dependencies
+next_word_preloaded:
+ sub %g2, %o2, %g2 ! lots of locks here
+	andcc	%g2, %o3, %g0	! and I don't like it...
+ be next_word
+ add %o0, 4, %o0
+
+ /* Check every byte. */
+byte_zero:
+ ldsb [%o0 - 0x4], %g2
+ cmp %g2, 0
+ bne byte_one
+ add %o0, -4, %g3
+
+ retl
+ sub %g3, %o1, %o0
+
+byte_one:
+ ldsb [%o0 - 0x3], %g2
+ cmp %g2, 0
+ bne,a byte_two_and_three
+ ldsb [%o0 - 0x2], %g2
+
+ sub %g3, %o1, %o0
+ retl
+ add %o0, 1, %o0
+
+byte_two_and_three:
+ cmp %g2, 0
+ be,a found_it
+ sub %g3, %o1, %o0
+
+ ldsb [%o0 - 0x1], %g2
+ cmp %g2, 0
+ bne,a next_word_preloaded
+ ld [%o0], %g2
+
+ sub %g3, %o1, %o0
+ retl
+ add %o0, 3, %o0
+
+found_it:
+ retl
+ add %o0, 2, %o0
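
The word-at-a-time scan above is the classic LO_MAGIC/HI_MAGIC trick: subtracting 0x01010101 from a word borrows out of any all-zero byte and lights up that byte's 0x80 bit, so "(w - LO_MAGIC) & HI_MAGIC" can only be zero when no byte of w is zero. As in the assembly, the test can also fire on bytes of 0x81 and above, which is why the per-byte checks make the final call. A rough C rendering under those assumptions (strlen_sketch is my name, not kernel code):

	#include <stddef.h>
	#include <stdint.h>
	#include <string.h>

	#define LO_MAGIC 0x01010101u
	#define HI_MAGIC 0x80808080u

	static size_t strlen_sketch(const char *s)
	{
		const char *p = s;
		uint32_t w;

		while ((uintptr_t)p & 3) {	/* get word aligned, as still_not_word_aligned does */
			if (*p == '\0')
				return p - s;
			p++;
		}

		for (;;) {
			memcpy(&w, p, 4);			/* one aligned word per iteration */
			if ((w - LO_MAGIC) & HI_MAGIC) {	/* possible zero byte in this word */
				int i;
				for (i = 0; i < 4; i++)		/* byte_zero .. byte_two_and_three */
					if (p[i] == '\0')
						return (size_t)(p - s) + i;
			}
			p += 4;		/* false alarm (e.g. a 0x81 byte): keep scanning */
		}
	}
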
diff --git a/arch/sparc/lib/strncmp.S b/arch/sparc/lib/strncmp.S
new file mode 100644
index 000000000..2f26b1b4a
--- /dev/null
+++ b/arch/sparc/lib/strncmp.S
@@ -0,0 +1,120 @@
+/* $Id: strncmp.S,v 1.2 1996/09/09 02:47:20 davem Exp $
+ * strncmp.S: Hand optimized Sparc assembly of GCC output from GNU libc
+ * generic strncmp routine.
+ */
+
+#include <asm/cprefix.h>
+
+ .text
+ .align 4
+ .global C_LABEL(__strncmp), C_LABEL(strncmp)
+C_LABEL(__strncmp):
+C_LABEL(strncmp):
+ mov %o0, %g3
+ mov 0, %o3
+
+ cmp %o2, 3
+ ble 7f
+ mov 0, %g2
+
+ sra %o2, 2, %o4
+ ldub [%g3], %o3
+
+0:
+ ldub [%o1], %g2
+ add %g3, 1, %g3
+ and %o3, 0xff, %o0
+
+ cmp %o0, 0
+ be 8f
+ add %o1, 1, %o1
+
+ cmp %o0, %g2
+ be,a 1f
+ ldub [%g3], %o3
+
+ retl
+ sub %o0, %g2, %o0
+
+1:
+ ldub [%o1], %g2
+	add	%g3, 1, %g3
+ and %o3, 0xff, %o0
+
+ cmp %o0, 0
+ be 8f
+ add %o1, 1, %o1
+
+ cmp %o0, %g2
+ be,a 1f
+ ldub [%g3], %o3
+
+ retl
+ sub %o0, %g2, %o0
+
+1:
+ ldub [%o1], %g2
+ add %g3, 1, %g3
+ and %o3, 0xff, %o0
+
+ cmp %o0, 0
+ be 8f
+ add %o1, 1, %o1
+
+ cmp %o0, %g2
+ be,a 1f
+ ldub [%g3], %o3
+
+ retl
+ sub %o0, %g2, %o0
+
+1:
+ ldub [%o1], %g2
+ add %g3, 1, %g3
+ and %o3, 0xff, %o0
+
+ cmp %o0, 0
+ be 8f
+ add %o1, 1, %o1
+
+ cmp %o0, %g2
+ be 1f
+ add %o4, -1, %o4
+
+ retl
+ sub %o0, %g2, %o0
+
+1:
+
+ cmp %o4, 0
+ bg,a 0b
+ ldub [%g3], %o3
+
+ b 7f
+ and %o2, 3, %o2
+
+9:
+ ldub [%o1], %g2
+ add %g3, 1, %g3
+ and %o3, 0xff, %o0
+
+ cmp %o0, 0
+ be 8f
+ add %o1, 1, %o1
+
+ cmp %o0, %g2
+ be 7f
+ add %o2, -1, %o2
+
+8:
+ retl
+ sub %o0, %g2, %o0
+
+7:
+ cmp %o2, 0
+ bg,a 9b
+ ldub [%g3], %o3
+
+ and %g2, 0xff, %o0
+ retl
+ sub %o3, %o0, %o0
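
Behaviourally this is the generic GNU libc strncmp the header mentions, just unrolled four bytes per pass, with the leftover bytes handled at labels 9 and 7. The plain C contract it preserves (sketch, my naming):

	#include <stddef.h>

	/* Compare at most n bytes; stop at the first difference or at a
	 * terminating NUL, returning the difference of the mismatching
	 * bytes as unsigned chars. */
	static int strncmp_sketch(const char *s1, const char *s2, size_t n)
	{
		while (n--) {
			unsigned char c1 = (unsigned char)*s1++;
			unsigned char c2 = (unsigned char)*s2++;

			if (c1 != c2)
				return c1 - c2;
			if (c1 == '\0')
				return 0;	/* both strings ended together */
		}
		return 0;			/* first n bytes were equal */
	}
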
diff --git a/arch/sparc/lib/strncpy_from_user.S b/arch/sparc/lib/strncpy_from_user.S
new file mode 100644
index 000000000..3dd2bd71c
--- /dev/null
+++ b/arch/sparc/lib/strncpy_from_user.S
@@ -0,0 +1,49 @@
+/* strncpy_from_user.S: Sparc strncpy from userspace.
+ *
+ * Copyright (C) 1996 David S. Miller
+ */
+
+#include <asm/cprefix.h>
+#include <asm/ptrace.h>
+
+ .text
+ .align 4
+
+ /* Must return:
+ *
+ * -EFAULT for an exception
+ * count if we hit the buffer limit
+ * bytes copied if we hit a null byte
+ */
+
+ .globl C_LABEL(__strncpy_from_user)
+C_LABEL(__strncpy_from_user):
+ /* %o0=dest, %o1=src, %o2=count */
+ ld [%g6 + THREAD_EX_COUNT], %g1
+ set strncpy_user_failure, %g2
+ add %g1, 1, %g3
+ st %o7, [%g6 + THREAD_EX_PC]
+ st %g3, [%g6 + THREAD_EX_COUNT]
+ st %g2, [%g6 + THREAD_EX_EXPC]
+
+ mov %o2, %o3
+1:
+ subcc %o2, 1, %o2
+ bneg 2f
+ nop
+
+ ldub [%o1], %o4
+ add %o0, 1, %o0
+ cmp %o4, 0
+ add %o1, 1, %o1
+ bne 1b
+ stb %o4, [%o0 - 1]
+2:
+ add %o2, 1, %o0
+ st %g1, [%g6 + THREAD_EX_COUNT]
+ retl
+ sub %o3, %o0, %o0
+
+strncpy_user_failure:
+ jmpl %g3 + 0x8, %g0
+ mov %g1, %o0
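
The three-way return contract in the comment block is the whole point of this routine; the THREAD_EX_* stores register a fixup address so that a faulting user load resumes at strncpy_user_failure instead of oopsing. A C sketch of the same contract, where fetch_user_byte() is a hypothetical helper standing in for that exception machinery (it is not a real kernel interface):

	#include <errno.h>
	#include <stddef.h>

	/* Hypothetical helper: returns the byte at a user address,
	 * or a negative value if the access faults. */
	extern int fetch_user_byte(const char *uaddr);

	static long strncpy_from_user_sketch(char *dst, const char *src, long count)
	{
		long copied = 0;

		while (count--) {
			int c = fetch_user_byte(src + copied);

			if (c < 0)
				return -EFAULT;		/* exception while reading user memory */
			dst[copied] = (char)c;
			if (c == '\0')
				return copied;		/* bytes copied, NUL not counted */
			copied++;
		}
		return copied;				/* buffer limit hit: equals count */
	}
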
diff --git a/arch/sparc/lib/udiv.S b/arch/sparc/lib/udiv.S
index 53cfeac90..2abfc6b0f 100644
--- a/arch/sparc/lib/udiv.S
+++ b/arch/sparc/lib/udiv.S
@@ -1,4 +1,5 @@
-/* udiv.S: This routine was taken from glibc-1.09 and is covered
+/* $Id: udiv.S,v 1.4 1996/09/30 02:22:38 davem Exp $
+ * udiv.S: This routine was taken from glibc-1.09 and is covered
* by the GNU Library General Public License Version 2.
*/
@@ -47,22 +48,24 @@
! Ready to divide. Compute size of quotient; scale comparand.
orcc %o1, %g0, %o5
bne 1f
- mov %o0, %o3
+ mov %o0, %o3
! Divide by zero trap. If it returns, return 0 (about as
! wrong as possible, but that is what SunOS does...).
ta ST_DIV0
retl
- clr %o0
+ clr %o0
1:
cmp %o3, %o5 ! if %o1 exceeds %o0, done
blu Lgot_result ! (and algorithm fails otherwise)
- clr %o2
+ clr %o2
+
sethi %hi(1 << (32 - 4 - 1)), %g1
+
cmp %o3, %g1
blu Lnot_really_big
- clr %o4
+ clr %o4
! Here the dividend is >= 2**(31-N) or so. We must be careful here,
! as our usual N-at-a-shot divide step will cause overflow and havoc.
@@ -72,15 +75,18 @@
1:
cmp %o5, %g1
bgeu 3f
- mov 1, %g7
+ mov 1, %g7
+
sll %o5, 4, %o5
+
b 1b
- add %o4, 1, %o4
+ add %o4, 1, %o4
! Now compute %g7.
- 2: addcc %o5, %o5, %o5
+ 2:
+ addcc %o5, %o5, %o5
bcc Lnot_too_big
- add %g7, 1, %g7
+ add %g7, 1, %g7
! We get here if the %o1 overflowed while shifting.
! This means that %o3 has the high-order bit set.
@@ -88,15 +94,18 @@
sll %g1, 4, %g1 ! high order bit
srl %o5, 1, %o5 ! rest of %o5
add %o5, %g1, %o5
+
b Ldo_single_div
- sub %g7, 1, %g7
+ sub %g7, 1, %g7
Lnot_too_big:
- 3: cmp %o5, %o3
+ 3:
+ cmp %o5, %o3
blu 2b
- nop
+ nop
+
be Ldo_single_div
- nop
+ nop
/* NB: these are commented out in the V8-Sparc manual as well */
/* (I do not understand this) */
! %o5 > %o3: went too far: back up 1 step
@@ -113,19 +122,21 @@
Ldo_single_div:
subcc %g7, 1, %g7
bl Lend_regular_divide
- nop
+ nop
+
sub %o3, %o5, %o3
mov 1, %o2
+
b Lend_single_divloop
- nop
+ nop
Lsingle_divloop:
sll %o2, 1, %o2
bl 1f
- srl %o5, 1, %o5
+ srl %o5, 1, %o5
! %o3 >= 0
sub %o3, %o5, %o3
b 2f
- add %o2, 1, %o2
+ add %o2, 1, %o2
1: ! %o3 < 0
add %o3, %o5, %o3
sub %o2, 1, %o2
@@ -133,214 +144,212 @@
Lend_single_divloop:
subcc %g7, 1, %g7
bge Lsingle_divloop
- tst %o3
+ tst %o3
+
b,a Lend_regular_divide
Lnot_really_big:
1:
sll %o5, 4, %o5
+
cmp %o5, %o3
bleu 1b
- addcc %o4, 1, %o4
+ addcc %o4, 1, %o4
+
be Lgot_result
- sub %o4, 1, %o4
+ sub %o4, 1, %o4
tst %o3 ! set up for initial iteration
Ldivloop:
sll %o2, 4, %o2
! depth 1, accumulated bits 0
bl L.1.16
- srl %o5,1,%o5
+ srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
! depth 2, accumulated bits 1
bl L.2.17
- srl %o5,1,%o5
+ srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
! depth 3, accumulated bits 3
bl L.3.19
- srl %o5,1,%o5
+ srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
! depth 4, accumulated bits 7
bl L.4.23
- srl %o5,1,%o5
+ srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
- b 9f
- add %o2, (7*2+1), %o2
-
+ b 9f
+ add %o2, (7*2+1), %o2
+
L.4.23:
! remainder is negative
addcc %o3,%o5,%o3
- b 9f
- add %o2, (7*2-1), %o2
-
-
+ b 9f
+ add %o2, (7*2-1), %o2
+
L.3.19:
! remainder is negative
addcc %o3,%o5,%o3
! depth 4, accumulated bits 5
bl L.4.21
- srl %o5,1,%o5
+ srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
- b 9f
- add %o2, (5*2+1), %o2
-
+ b 9f
+ add %o2, (5*2+1), %o2
+
L.4.21:
! remainder is negative
addcc %o3,%o5,%o3
- b 9f
- add %o2, (5*2-1), %o2
-
-
-
+ b 9f
+ add %o2, (5*2-1), %o2
+
L.2.17:
! remainder is negative
addcc %o3,%o5,%o3
! depth 3, accumulated bits 1
bl L.3.17
- srl %o5,1,%o5
+ srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
! depth 4, accumulated bits 3
bl L.4.19
- srl %o5,1,%o5
+ srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
- b 9f
- add %o2, (3*2+1), %o2
-
+ b 9f
+ add %o2, (3*2+1), %o2
+
L.4.19:
! remainder is negative
addcc %o3,%o5,%o3
- b 9f
- add %o2, (3*2-1), %o2
-
-
+ b 9f
+ add %o2, (3*2-1), %o2
+
L.3.17:
! remainder is negative
addcc %o3,%o5,%o3
! depth 4, accumulated bits 1
bl L.4.17
- srl %o5,1,%o5
+ srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
- b 9f
- add %o2, (1*2+1), %o2
-
+ b 9f
+ add %o2, (1*2+1), %o2
+
L.4.17:
! remainder is negative
addcc %o3,%o5,%o3
- b 9f
- add %o2, (1*2-1), %o2
-
-
-
-
+ b 9f
+ add %o2, (1*2-1), %o2
+
L.1.16:
! remainder is negative
addcc %o3,%o5,%o3
! depth 2, accumulated bits -1
bl L.2.15
- srl %o5,1,%o5
+ srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
! depth 3, accumulated bits -1
bl L.3.15
- srl %o5,1,%o5
+ srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
! depth 4, accumulated bits -1
bl L.4.15
- srl %o5,1,%o5
+ srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
- b 9f
- add %o2, (-1*2+1), %o2
-
+ b 9f
+ add %o2, (-1*2+1), %o2
+
L.4.15:
! remainder is negative
addcc %o3,%o5,%o3
- b 9f
- add %o2, (-1*2-1), %o2
-
-
+ b 9f
+ add %o2, (-1*2-1), %o2
+
L.3.15:
! remainder is negative
addcc %o3,%o5,%o3
! depth 4, accumulated bits -3
bl L.4.13
- srl %o5,1,%o5
+ srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
- b 9f
- add %o2, (-3*2+1), %o2
-
+ b 9f
+ add %o2, (-3*2+1), %o2
+
L.4.13:
! remainder is negative
addcc %o3,%o5,%o3
- b 9f
- add %o2, (-3*2-1), %o2
-
-
-
+ b 9f
+ add %o2, (-3*2-1), %o2
+
L.2.15:
! remainder is negative
addcc %o3,%o5,%o3
! depth 3, accumulated bits -3
bl L.3.13
- srl %o5,1,%o5
+ srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
! depth 4, accumulated bits -5
bl L.4.11
- srl %o5,1,%o5
+ srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
- b 9f
- add %o2, (-5*2+1), %o2
-
+ b 9f
+ add %o2, (-5*2+1), %o2
+
L.4.11:
! remainder is negative
addcc %o3,%o5,%o3
- b 9f
- add %o2, (-5*2-1), %o2
-
-
+ b 9f
+ add %o2, (-5*2-1), %o2
+
L.3.13:
! remainder is negative
addcc %o3,%o5,%o3
! depth 4, accumulated bits -7
bl L.4.9
- srl %o5,1,%o5
+ srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
- b 9f
- add %o2, (-7*2+1), %o2
-
+ b 9f
+ add %o2, (-7*2+1), %o2
+
L.4.9:
! remainder is negative
addcc %o3,%o5,%o3
- b 9f
- add %o2, (-7*2-1), %o2
-
-
-
-
+ b 9f
+ add %o2, (-7*2-1), %o2
+
9:
Lend_regular_divide:
subcc %o4, 1, %o4
bge Ldivloop
- tst %o3
+ tst %o3
+
bl,a Lgot_result
! non-restoring fixup here (one instruction only!)
sub %o2, 1, %o2
-
Lgot_result:
retl
- mov %o2, %o0
+ mov %o2, %o0
+
+ .globl .udiv_patch
+.udiv_patch:
+ wr %g0, 0x0, %y
+ nop
+ nop
+ retl
+ udiv %o0, %o1, %o0
+ nop
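
The big Ldivloop above is a non-restoring divide that retires four quotient bits per pass: instead of undoing an overshoot it keeps going with additions, records +1/-1 quotient digits, and applies the single "sub %o2, 1" fixup at the end if the running remainder finished negative. A one-bit-at-a-time C sketch of the same idea (udiv_sketch is my name; divisor must be nonzero, the zero case being the ST_DIV0 trap above):

	#include <stdint.h>

	static uint32_t udiv_sketch(uint32_t dividend, uint32_t divisor)
	{
		int64_t r = dividend;
		int64_t d = divisor;
		uint32_t q = 0;
		int steps = 0;

		/* Scale the divisor up past the dividend, as the shift-by-4
		 * loop around Lnot_really_big does. */
		while (d <= r) {
			d <<= 1;
			steps++;
		}

		/* Non-restoring steps: never restore after an overshoot; just
		 * alternate subtract/add and record +1 or -1 in the quotient. */
		while (steps-- > 0) {
			d >>= 1;
			if (r >= 0) {
				r -= d;
				q = (q << 1) + 1;
			} else {
				r += d;
				q = (q << 1) - 1;
			}
		}

		if (r < 0)
			q -= 1;	/* the one-instruction fixup before Lgot_result */
		return q;
	}
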
diff --git a/arch/sparc/lib/umul.S b/arch/sparc/lib/umul.S
index 24f7c3cda..a784720a8 100644
--- a/arch/sparc/lib/umul.S
+++ b/arch/sparc/lib/umul.S
@@ -1,4 +1,5 @@
-/* umul.S: This routine was taken from glibc-1.09 and is covered
+/* $Id: umul.S,v 1.4 1996/09/30 02:22:39 davem Exp $
+ * umul.S: This routine was taken from glibc-1.09 and is covered
* by the GNU Library General Public License Version 2.
*/
@@ -23,9 +24,10 @@
.umul:
or %o0, %o1, %o4
mov %o0, %y ! multiplier -> Y
+
andncc %o4, 0xfff, %g0 ! test bits 12..31 of *both* args
be Lmul_shortway ! if zero, can do it the short way
- andcc %g0, %g0, %o4 ! zero the partial product and clear N and V
+ andcc %g0, %g0, %o4 ! zero the partial product and clear N and V
/*
* Long multiply. 32 steps, followed by a final shift step.
@@ -102,17 +104,19 @@
#if 0
tst %o1
bl,a 1f ! if %o1 < 0 (high order bit = 1),
- add %o4, %o0, %o4 ! %o4 += %o0 (add y to upper half)
-1: rd %y, %o0 ! get lower half of product
+ add %o4, %o0, %o4 ! %o4 += %o0 (add y to upper half)
+
+1:
+ rd %y, %o0 ! get lower half of product
retl
- addcc %o4, %g0, %o1 ! put upper half in place and set Z for %o1==0
+ addcc %o4, %g0, %o1 ! put upper half in place and set Z for %o1==0
#else
/* Faster code from tege@sics.se. */
sra %o1, 31, %o2 ! make mask from sign bit
and %o0, %o2, %o2 ! %o2 = 0 or %o0, depending on sign of %o1
rd %y, %o0 ! get lower half of product
retl
- addcc %o4, %o2, %o1 ! add compensation and put upper half in place
+ addcc %o4, %o2, %o1 ! add compensation and put upper half in place
#endif
Lmul_shortway:
@@ -155,4 +159,11 @@ Lmul_shortway:
srl %o5, 20, %o5 ! shift low bits right 20
or %o5, %o0, %o0
retl
- addcc %g0, %g0, %o1 ! %o1 = zero, and set Z
+ addcc %g0, %g0, %o1 ! %o1 = zero, and set Z
+
+ .globl .umul_patch
+.umul_patch:
+ umul %o0, %o1, %o0
+ retl
+ rd %y, %o1
+ nop
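
The compensation step commented above ("add y to upper half" / "add compensation") exists because the 32 multiply-step instructions effectively form %o0 times %o1 taken as a signed value; adding %o0 back into the upper half whenever %o1 has its top bit set turns that into the full unsigned 64-bit product. In C terms (sketch, my naming):

	#include <stdint.h>

	static uint64_t umul_sketch(uint32_t a, uint32_t b)
	{
		/* What the multiply-step loop delivers: a times b-as-signed. */
		int64_t signed_part = (int64_t)a * (int32_t)b;

		/* sra/and compensation: add a << 32 when b's sign bit is set. */
		uint64_t fixup = (b & 0x80000000u) ? (uint64_t)a << 32 : 0;

		return (uint64_t)signed_part + fixup;	/* low word -> %o0, high word -> %o1 */
	}
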
diff --git a/arch/sparc/lib/urem.S b/arch/sparc/lib/urem.S
index c84aa81e5..ec7f0c502 100644
--- a/arch/sparc/lib/urem.S
+++ b/arch/sparc/lib/urem.S
@@ -1,4 +1,5 @@
-/* urem.S: This routine was taken from glibc-1.09 and is covered
+/* $Id: urem.S,v 1.4 1996/09/30 02:22:42 davem Exp $
+ * urem.S: This routine was taken from glibc-1.09 and is covered
* by the GNU Library General Public License Version 2.
*/
@@ -45,22 +46,24 @@
! Ready to divide. Compute size of quotient; scale comparand.
orcc %o1, %g0, %o5
bne 1f
- mov %o0, %o3
+ mov %o0, %o3
! Divide by zero trap. If it returns, return 0 (about as
! wrong as possible, but that is what SunOS does...).
ta ST_DIV0
retl
- clr %o0
+ clr %o0
1:
cmp %o3, %o5 ! if %o1 exceeds %o0, done
blu Lgot_result ! (and algorithm fails otherwise)
- clr %o2
+ clr %o2
+
sethi %hi(1 << (32 - 4 - 1)), %g1
+
cmp %o3, %g1
blu Lnot_really_big
- clr %o4
+ clr %o4
! Here the dividend is >= 2**(31-N) or so. We must be careful here,
! as our usual N-at-a-shot divide step will cause overflow and havoc.
@@ -70,15 +73,18 @@
1:
cmp %o5, %g1
bgeu 3f
- mov 1, %g7
+ mov 1, %g7
+
sll %o5, 4, %o5
+
b 1b
- add %o4, 1, %o4
+ add %o4, 1, %o4
! Now compute %g7.
- 2: addcc %o5, %o5, %o5
+ 2:
+ addcc %o5, %o5, %o5
bcc Lnot_too_big
- add %g7, 1, %g7
+ add %g7, 1, %g7
! We get here if the %o1 overflowed while shifting.
! This means that %o3 has the high-order bit set.
@@ -86,15 +92,18 @@
sll %g1, 4, %g1 ! high order bit
srl %o5, 1, %o5 ! rest of %o5
add %o5, %g1, %o5
+
b Ldo_single_div
- sub %g7, 1, %g7
+ sub %g7, 1, %g7
Lnot_too_big:
- 3: cmp %o5, %o3
+ 3:
+ cmp %o5, %o3
blu 2b
- nop
+ nop
+
be Ldo_single_div
- nop
+ nop
/* NB: these are commented out in the V8-Sparc manual as well */
/* (I do not understand this) */
! %o5 > %o3: went too far: back up 1 step
@@ -111,19 +120,21 @@
Ldo_single_div:
subcc %g7, 1, %g7
bl Lend_regular_divide
- nop
+ nop
+
sub %o3, %o5, %o3
mov 1, %o2
+
b Lend_single_divloop
- nop
+ nop
Lsingle_divloop:
sll %o2, 1, %o2
bl 1f
- srl %o5, 1, %o5
+ srl %o5, 1, %o5
! %o3 >= 0
sub %o3, %o5, %o3
b 2f
- add %o2, 1, %o2
+ add %o2, 1, %o2
1: ! %o3 < 0
add %o3, %o5, %o3
sub %o2, 1, %o2
@@ -131,214 +142,214 @@
Lend_single_divloop:
subcc %g7, 1, %g7
bge Lsingle_divloop
- tst %o3
+ tst %o3
+
b,a Lend_regular_divide
Lnot_really_big:
1:
sll %o5, 4, %o5
+
cmp %o5, %o3
bleu 1b
- addcc %o4, 1, %o4
+ addcc %o4, 1, %o4
+
be Lgot_result
- sub %o4, 1, %o4
+ sub %o4, 1, %o4
tst %o3 ! set up for initial iteration
Ldivloop:
sll %o2, 4, %o2
! depth 1, accumulated bits 0
bl L.1.16
- srl %o5,1,%o5
+ srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
! depth 2, accumulated bits 1
bl L.2.17
- srl %o5,1,%o5
+ srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
! depth 3, accumulated bits 3
bl L.3.19
- srl %o5,1,%o5
+ srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
! depth 4, accumulated bits 7
bl L.4.23
- srl %o5,1,%o5
+ srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
- b 9f
- add %o2, (7*2+1), %o2
-
+ b 9f
+ add %o2, (7*2+1), %o2
+
L.4.23:
! remainder is negative
addcc %o3,%o5,%o3
- b 9f
- add %o2, (7*2-1), %o2
-
-
+ b 9f
+ add %o2, (7*2-1), %o2
+
L.3.19:
! remainder is negative
addcc %o3,%o5,%o3
! depth 4, accumulated bits 5
bl L.4.21
- srl %o5,1,%o5
+ srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
- b 9f
- add %o2, (5*2+1), %o2
-
+ b 9f
+ add %o2, (5*2+1), %o2
+
L.4.21:
! remainder is negative
addcc %o3,%o5,%o3
- b 9f
- add %o2, (5*2-1), %o2
-
-
-
+ b 9f
+ add %o2, (5*2-1), %o2
+
L.2.17:
! remainder is negative
addcc %o3,%o5,%o3
! depth 3, accumulated bits 1
bl L.3.17
- srl %o5,1,%o5
+ srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
! depth 4, accumulated bits 3
bl L.4.19
- srl %o5,1,%o5
+ srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
- b 9f
- add %o2, (3*2+1), %o2
-
+ b 9f
+ add %o2, (3*2+1), %o2
+
L.4.19:
! remainder is negative
addcc %o3,%o5,%o3
- b 9f
- add %o2, (3*2-1), %o2
-
-
+ b 9f
+ add %o2, (3*2-1), %o2
+
L.3.17:
! remainder is negative
addcc %o3,%o5,%o3
! depth 4, accumulated bits 1
bl L.4.17
- srl %o5,1,%o5
+ srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
- b 9f
- add %o2, (1*2+1), %o2
+ b 9f
+ add %o2, (1*2+1), %o2
L.4.17:
! remainder is negative
addcc %o3,%o5,%o3
- b 9f
- add %o2, (1*2-1), %o2
-
-
-
-
+ b 9f
+ add %o2, (1*2-1), %o2
+
L.1.16:
! remainder is negative
addcc %o3,%o5,%o3
! depth 2, accumulated bits -1
bl L.2.15
- srl %o5,1,%o5
+ srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
! depth 3, accumulated bits -1
bl L.3.15
- srl %o5,1,%o5
+ srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
! depth 4, accumulated bits -1
bl L.4.15
- srl %o5,1,%o5
+ srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
- b 9f
- add %o2, (-1*2+1), %o2
-
+ b 9f
+ add %o2, (-1*2+1), %o2
+
L.4.15:
! remainder is negative
addcc %o3,%o5,%o3
- b 9f
- add %o2, (-1*2-1), %o2
-
-
+ b 9f
+ add %o2, (-1*2-1), %o2
+
L.3.15:
! remainder is negative
addcc %o3,%o5,%o3
! depth 4, accumulated bits -3
bl L.4.13
- srl %o5,1,%o5
+ srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
- b 9f
- add %o2, (-3*2+1), %o2
-
+ b 9f
+ add %o2, (-3*2+1), %o2
+
L.4.13:
! remainder is negative
addcc %o3,%o5,%o3
- b 9f
- add %o2, (-3*2-1), %o2
-
-
-
+ b 9f
+ add %o2, (-3*2-1), %o2
+
L.2.15:
! remainder is negative
addcc %o3,%o5,%o3
! depth 3, accumulated bits -3
bl L.3.13
- srl %o5,1,%o5
+ srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
! depth 4, accumulated bits -5
bl L.4.11
- srl %o5,1,%o5
+ srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
- b 9f
- add %o2, (-5*2+1), %o2
+ b 9f
+ add %o2, (-5*2+1), %o2
L.4.11:
! remainder is negative
addcc %o3,%o5,%o3
- b 9f
- add %o2, (-5*2-1), %o2
-
-
+ b 9f
+ add %o2, (-5*2-1), %o2
+
L.3.13:
! remainder is negative
addcc %o3,%o5,%o3
! depth 4, accumulated bits -7
bl L.4.9
- srl %o5,1,%o5
+ srl %o5,1,%o5
! remainder is positive
subcc %o3,%o5,%o3
- b 9f
- add %o2, (-7*2+1), %o2
-
+ b 9f
+ add %o2, (-7*2+1), %o2
+
L.4.9:
! remainder is negative
addcc %o3,%o5,%o3
- b 9f
- add %o2, (-7*2-1), %o2
-
-
-
-
+ b 9f
+ add %o2, (-7*2-1), %o2
+
9:
Lend_regular_divide:
subcc %o4, 1, %o4
bge Ldivloop
- tst %o3
+ tst %o3
+
bl,a Lgot_result
! non-restoring fixup here (one instruction only!)
add %o3, %o1, %o3
-
Lgot_result:
retl
- mov %o3, %o0
+ mov %o3, %o0
+
+ .globl .urem_patch
+.urem_patch:
+ wr %g0, 0x0, %y
+ nop
+ nop
+ nop
+ udiv %o0, %o1, %o2
+ umul %o2, %o1, %o2
+ retl
+ sub %o0, %o2, %o0
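
Two details worth spelling out at the end: in the software path the non-restoring fixup for the remainder is "add %o3, %o1" (add the divisor back in when the running remainder finished negative), and .urem_patch simply reconstructs the remainder on hardware as dividend minus quotient times divisor. The latter in C (sketch, my naming; divisor must be nonzero, matching the ST_DIV0 trap):

	#include <stdint.h>

	static uint32_t urem_sketch(uint32_t dividend, uint32_t divisor)
	{
		uint32_t q = dividend / divisor;	/* udiv %o0, %o1, %o2 */

		return dividend - q * divisor;		/* umul %o2, %o1, %o2 ; sub %o0, %o2, %o0 */
	}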