author | Ralf Baechle <ralf@linux-mips.org> | 1997-01-07 02:33:00 +0000
committer | <ralf@linux-mips.org> | 1997-01-07 02:33:00 +0000
commit | beb116954b9b7f3bb56412b2494b562f02b864b1
tree | 120e997879884e1b9d93b265221b939d2ef1ade1 /arch/sparc/lib
parent | 908d4681a1dc3792ecafbe64265783a86c4cccb6
Import of Linux/MIPS 2.1.14
Diffstat (limited to 'arch/sparc/lib')
-rw-r--r-- | arch/sparc/lib/Makefile | 47
-rw-r--r-- | arch/sparc/lib/ashrdi3.S | 24
-rw-r--r-- | arch/sparc/lib/blockops.S | 103
-rw-r--r-- | arch/sparc/lib/checksum.S | 439
-rw-r--r-- | arch/sparc/lib/memcmp.S | 314
-rw-r--r-- | arch/sparc/lib/memcpy.S | 364
-rw-r--r-- | arch/sparc/lib/memscan.S | 135
-rw-r--r-- | arch/sparc/lib/memset.S | 166
-rw-r--r-- | arch/sparc/lib/memset.c | 71
-rw-r--r-- | arch/sparc/lib/mul.S | 22
-rw-r--r-- | arch/sparc/lib/rem.S | 221
-rw-r--r-- | arch/sparc/lib/sdiv.S | 222
-rw-r--r-- | arch/sparc/lib/strlen.S | 88
-rw-r--r-- | arch/sparc/lib/strncmp.S | 120
-rw-r--r-- | arch/sparc/lib/strncpy_from_user.S | 49
-rw-r--r-- | arch/sparc/lib/udiv.S | 209
-rw-r--r-- | arch/sparc/lib/umul.S | 25
-rw-r--r-- | arch/sparc/lib/urem.S | 207
18 files changed, 2389 insertions, 437 deletions
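Most of this import is new hand-tuned SPARC assembly (checksum.S, memcpy.S, memset.S, strlen.S, memscan.S). As a reading aid only, the C sketch below models what two of those routines compute; it is not part of the patch, and the names csum_partial_ref, csum_fold_ref and may_contain_zero_byte are illustrative, not kernel symbols. The first part is a simplified byte-wise model of the RFC 1071 ones'-complement sum that csum_partial accumulates with its unrolled ldd/addxcc loops (not a bit-exact transcription), plus the fold-and-invert step that callers typically apply to the partial sum. The second part shows the word-at-a-time zero-byte filter behind the LO_MAGIC/HI_MAGIC constants used by strlen.S and __memscan_zero; the filter can report false positives, which is why the assembly falls back to per-byte checks after it fires.

```c
#include <stddef.h>
#include <stdint.h>

/* Illustrative RFC 1071 checksum: accumulate big-endian 16-bit words into a
 * 32-bit sum (SPARC is big-endian), then fold the carries back in.  The
 * assembly keeps the running sum in a register across 8-byte ldd loads;
 * this is the plain-C equivalent of that accumulation. */
static uint32_t csum_partial_ref(const uint8_t *buf, size_t len, uint32_t sum)
{
	while (len > 1) {
		sum += (uint32_t)(buf[0] << 8 | buf[1]);  /* next 16-bit word */
		buf += 2;
		len -= 2;
	}
	if (len)                                          /* trailing odd byte */
		sum += (uint32_t)buf[0] << 8;
	while (sum >> 16)                                 /* fold carries back in */
		sum = (sum & 0xffff) + (sum >> 16);
	return sum;
}

/* Fold-and-invert helper of the kind callers apply to the partial sum to
 * obtain the final 16-bit Internet checksum. */
static uint16_t csum_fold_ref(uint32_t sum)
{
	sum = (sum & 0xffff) + (sum >> 16);   /* fold high half into low half */
	sum = (sum & 0xffff) + (sum >> 16);   /* absorb the carry from the fold */
	return (uint16_t)~sum;
}

/* Illustrative word-at-a-time zero-byte probe, as in strlen.S/memscan.S:
 * (w - 0x01010101) & 0x80808080 is nonzero whenever some byte of w is zero,
 * and may also fire when a byte is >= 0x80, so a per-byte recheck follows. */
static int may_contain_zero_byte(uint32_t w)
{
	return ((w - 0x01010101u) & 0x80808080u) != 0;
}
```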
diff --git a/arch/sparc/lib/Makefile b/arch/sparc/lib/Makefile index 1f2ce0e1c..2cb74336f 100644 --- a/arch/sparc/lib/Makefile +++ b/arch/sparc/lib/Makefile @@ -1,22 +1,44 @@ -# +# $Id: Makefile,v 1.12 1996/10/27 08:36:26 davem Exp $ # Makefile for Sparc library files.. # CFLAGS := $(CFLAGS) -ansi -.c.s: - $(CC) $(CFLAGS) -S $< -.s.o: - $(AS) -c -o $*.o $< -.c.o: - $(CC) $(CFLAGS) -c $< - -OBJS = mul.o rem.o sdiv.o udiv.o umul.o urem.o ashrdi3.o +OBJS = mul.o rem.o sdiv.o udiv.o umul.o urem.o ashrdi3.o memcpy.o memset.o \ + strlen.o checksum.o blockops.o memscan.o memcmp.o strncmp.o \ + strncpy_from_user.o lib.a: $(OBJS) $(AR) rcs lib.a $(OBJS) sync +checksum.o: checksum.S + $(CC) -ansi -c -o checksum.o checksum.S + +memcpy.o: memcpy.S + $(CC) -D__ASSEMBLY__ -ansi -c -o memcpy.o memcpy.S + +memcmp.o: memcmp.S + $(CC) -ansi -c -o memcmp.o memcmp.S + +memscan.o: memscan.S + $(CC) -ansi -c -o memscan.o memscan.S + +strncmp.o: strncmp.S + $(CC) -ansi -c -o strncmp.o strncmp.S + +strncpy_from_user.o: strncpy_from_user.S + $(CC) -D__ASSEMBLY__ -ansi -c -o strncpy_from_user.o strncpy_from_user.S + +blockops.o: blockops.S + $(CC) -ansi -c -o blockops.o blockops.S + +memset.o: memset.S + $(CC) -D__ASSEMBLY__ -ansi -c -o memset.o memset.S + +strlen.o: strlen.S + $(CC) -ansi -c -o strlen.o strlen.S + mul.o: mul.S $(CC) -c -o mul.o mul.S @@ -40,9 +62,4 @@ ashrdi3.o: ashrdi3.S dep: -# -# include a dependency file if one exists -# -ifeq (.depend,$(wildcard .depend)) -include .depend -endif +include $(TOPDIR)/Rules.make diff --git a/arch/sparc/lib/ashrdi3.S b/arch/sparc/lib/ashrdi3.S index c672d2c9f..bf589c283 100644 --- a/arch/sparc/lib/ashrdi3.S +++ b/arch/sparc/lib/ashrdi3.S @@ -1,4 +1,5 @@ -/* ashrdi3.S: The filesystem code creates all kinds of references to +/* $Id: ashrdi3.S,v 1.3 1996/09/07 23:18:10 davem Exp $ + * ashrdi3.S: The filesystem code creates all kinds of references to * this little routine on the sparc with gcc. * * Copyright (C) 1995 David S. Miller (davem@caip.rutgers.edu) @@ -10,19 +11,26 @@ C_LABEL(__ashrdi3): tst %o2 be 3f - or %g0, 32, %g2 + or %g0, 32, %g2 + sub %g2, %o2, %g2 + tst %g2 bg 1f - sra %o0, %o2, %o4 + sra %o0, %o2, %o4 + sra %o0, 31, %o4 sub %g0, %g2, %g2 ba 2f - sra %o0, %g2, %o5 -1: sll %o0, %g2, %g3 + sra %o0, %g2, %o5 + +1: + sll %o0, %g2, %g3 srl %o1, %o2, %g2 or %g2, %g3, %o5 -2: or %g0, %o4, %o0 +2: + or %g0, %o4, %o0 or %g0, %o5, %o1 -3: jmpl %o7 + 8, %g0 - nop +3: + jmpl %o7 + 8, %g0 + nop diff --git a/arch/sparc/lib/blockops.S b/arch/sparc/lib/blockops.S new file mode 100644 index 000000000..f8a9e80df --- /dev/null +++ b/arch/sparc/lib/blockops.S @@ -0,0 +1,103 @@ +/* $Id: blockops.S,v 1.5 1996/09/24 05:22:56 davem Exp $ + * blockops.S: Common block zero optimized routines. + * + * Copyright (C) 1996 David S. Miller (davem@caip.rutgers.edu) + */ + +#include <asm/cprefix.h> + + /* Zero out 64 bytes of memory at (buf + offset). + * Assumes %g1 contains zero. + */ +#define BLAST_BLOCK(buf, offset) \ + std %g0, [buf + offset + 0x38]; \ + std %g0, [buf + offset + 0x30]; \ + std %g0, [buf + offset + 0x28]; \ + std %g0, [buf + offset + 0x20]; \ + std %g0, [buf + offset + 0x18]; \ + std %g0, [buf + offset + 0x10]; \ + std %g0, [buf + offset + 0x08]; \ + std %g0, [buf + offset + 0x00]; + + /* Copy 32 bytes of memory at (src + offset) to + * (dst + offset). 
+ */ +#define MIRROR_BLOCK(dst, src, offset, t0, t1, t2, t3, t4, t5, t6, t7) \ + ldd [src + offset + 0x18], t0; \ + ldd [src + offset + 0x10], t2; \ + ldd [src + offset + 0x08], t4; \ + ldd [src + offset + 0x00], t6; \ + std t0, [dst + offset + 0x18]; \ + std t2, [dst + offset + 0x10]; \ + std t4, [dst + offset + 0x08]; \ + std t6, [dst + offset + 0x00]; + + /* Profiling evidence indicates that memset() is + * commonly called for blocks of size PAGE_SIZE, + * and (2 * PAGE_SIZE) (for kernel stacks) + * and with a second arg of zero. We assume in + * all of these cases that the buffer is aligned + * on at least an 8 byte boundry. + * + * Therefore we special case them to make them + * as fast as possible. + */ + + .text + .align 4 + + .globl C_LABEL(bzero_2page), C_LABEL(bzero_1page) +C_LABEL(bzero_2page): + /* %o0 = buf */ + or %g0, %g0, %g1 + or %o0, %g0, %o1 + or %g0, 0x20, %g2 +1: + BLAST_BLOCK(%o0, 0x00) + BLAST_BLOCK(%o0, 0x40) + BLAST_BLOCK(%o0, 0x80) + BLAST_BLOCK(%o0, 0xc0) + subcc %g2, 1, %g2 + bne 1b + add %o0, 0x100, %o0 + + retl + mov %o1, %o0 + +C_LABEL(bzero_1page): + /* %o0 = buf */ + or %g0, %g0, %g1 + or %o0, %g0, %o1 + or %g0, 0x10, %g2 +1: + BLAST_BLOCK(%o0, 0x00) + BLAST_BLOCK(%o0, 0x40) + BLAST_BLOCK(%o0, 0x80) + BLAST_BLOCK(%o0, 0xc0) + subcc %g2, 1, %g2 + bne 1b + add %o0, 0x100, %o0 + + retl + mov %o1, %o0 + + .globl C_LABEL(__copy_1page) +C_LABEL(__copy_1page): + /* %o0 = dst, %o1 = src */ + or %g0, 0x10, %g1 +1: + MIRROR_BLOCK(%o0, %o1, 0x00, %o2, %o3, %o4, %o5, %g2, %g3, %g4, %g5) + MIRROR_BLOCK(%o0, %o1, 0x20, %o2, %o3, %o4, %o5, %g2, %g3, %g4, %g5) + MIRROR_BLOCK(%o0, %o1, 0x40, %o2, %o3, %o4, %o5, %g2, %g3, %g4, %g5) + MIRROR_BLOCK(%o0, %o1, 0x60, %o2, %o3, %o4, %o5, %g2, %g3, %g4, %g5) + MIRROR_BLOCK(%o0, %o1, 0x80, %o2, %o3, %o4, %o5, %g2, %g3, %g4, %g5) + MIRROR_BLOCK(%o0, %o1, 0xa0, %o2, %o3, %o4, %o5, %g2, %g3, %g4, %g5) + MIRROR_BLOCK(%o0, %o1, 0xc0, %o2, %o3, %o4, %o5, %g2, %g3, %g4, %g5) + MIRROR_BLOCK(%o0, %o1, 0xe0, %o2, %o3, %o4, %o5, %g2, %g3, %g4, %g5) + subcc %g1, 1, %g1 + add %o0, 0x100, %o0 + bne 1b + add %o1, 0x100, %o1 + + retl + nop diff --git a/arch/sparc/lib/checksum.S b/arch/sparc/lib/checksum.S new file mode 100644 index 000000000..a71371bf8 --- /dev/null +++ b/arch/sparc/lib/checksum.S @@ -0,0 +1,439 @@ +/* checksum.S: Sparc optimized checksum code. + * + * Copyright(C) 1995 Linus Torvalds + * Copyright(C) 1995 Miguel de Icaza + * Copyright(C) 1996 David S. Miller + * + * derived from: + * Linux/Alpha checksum c-code + * Linux/ix86 inline checksum assembly + * RFC1071 Computing the Internet Checksum (esp. Jacobsons m68k code) + * David Mosberger-Tang for optimized reference c-code + * BSD4.4 portable checksum routine + */ + +#include <asm-sparc/cprefix.h> + +#define CSUM_BIGCHUNK(buf, offset, sum, t0, t1, t2, t3, t4, t5) \ + ldd [buf + offset + 0x00], t0; \ + ldd [buf + offset + 0x08], t2; \ + addxcc t0, sum, sum; \ + addxcc t1, sum, sum; \ + ldd [buf + offset + 0x10], t4; \ + addxcc t2, sum, sum; \ + addxcc t3, sum, sum; \ + ldd [buf + offset + 0x18], t0; \ + addxcc t4, sum, sum; \ + addxcc t5, sum, sum; \ + addxcc t0, sum, sum; \ + addxcc t1, sum, sum; + +#define CSUM_LASTCHUNK(buf, offset, sum, t0, t1, t2, t3) \ + ldd [buf - offset - 0x08], t0; \ + ldd [buf - offset - 0x00], t2; \ + addxcc t0, sum, sum; \ + addxcc t1, sum, sum; \ + addxcc t2, sum, sum; \ + addxcc t3, sum, sum; + + /* Do end cruft out of band to get better cache patterns. */ +csum_partial_end_cruft: + be 1f ! caller asks %o1 & 0x8 + andcc %o1, 4, %g0 ! 
nope, check for word remaining + ldd [%o0], %g2 ! load two + addcc %g2, %o2, %o2 ! add first word to sum + addxcc %g3, %o2, %o2 ! add second word as well + add %o0, 8, %o0 ! advance buf ptr + addx %g0, %o2, %o2 ! add in final carry + andcc %o1, 4, %g0 ! check again for word remaining +1: be 1f ! nope, skip this code + andcc %o1, 3, %o1 ! check for trailing bytes + ld [%o0], %g2 ! load it + addcc %g2, %o2, %o2 ! add to sum + add %o0, 4, %o0 ! advance buf ptr + addx %g0, %o2, %o2 ! add in final carry + andcc %o1, 3, %g0 ! check again for trailing bytes +1: be 1f ! no trailing bytes, return + addcc %o1, -1, %g0 ! only one byte remains? + bne 2f ! at least two bytes more + subcc %o1, 2, %o1 ! only two bytes more? + b 4f ! only one byte remains + or %g0, %g0, %o4 ! clear fake hword value +2: lduh [%o0], %o4 ! get hword + be 6f ! jmp if only hword remains + add %o0, 2, %o0 ! advance buf ptr either way + sll %o4, 16, %o4 ! create upper hword +4: ldub [%o0], %o5 ! get final byte + sll %o5, 8, %o5 ! put into place + or %o5, %o4, %o4 ! coalese with hword (if any) +6: addcc %o4, %o2, %o2 ! add to sum +1: retl ! get outta here + addx %g0, %o2, %o0 ! add final carry into retval + + /* Also do alignment out of band to get better cache patterns. */ +csum_partial_fix_alignment: + cmp %o1, 6 + bl cpte - 0x4 + andcc %o0, 0x2, %g0 + be 1f + andcc %o0, 0x4, %g0 + lduh [%o0 + 0x00], %g2 + sub %o1, 2, %o1 + add %o0, 2, %o0 + sll %g2, 16, %g2 + addcc %g2, %o2, %o2 + srl %o2, 16, %g3 + addx %g0, %g3, %g2 + sll %o2, 16, %o2 + sll %g2, 16, %g3 + srl %o2, 16, %o2 + andcc %o0, 0x4, %g0 + or %g3, %o2, %o2 +1: be cpa + andcc %o1, 0xffffff80, %o3 + ld [%o0 + 0x00], %g2 + sub %o1, 4, %o1 + addcc %g2, %o2, %o2 + add %o0, 4, %o0 + addx %g0, %o2, %o2 + b cpa + andcc %o1, 0xffffff80, %o3 + + /* The common case is to get called with a nicely aligned + * buffer of size 0x20. Follow the code path for that case. + */ + .globl C_LABEL(csum_partial) +C_LABEL(csum_partial): /* %o0=buf, %o1=len, %o2=sum */ + andcc %o0, 0x7, %g0 ! alignment problems? + bne csum_partial_fix_alignment ! yep, handle it + sethi %hi(cpte - 8), %g7 ! prepare table jmp ptr + andcc %o1, 0xffffff80, %o3 ! num loop iterations +cpa: be 3f ! none to do + andcc %o1, 0x70, %g1 ! clears carry flag too +5: CSUM_BIGCHUNK(%o0, 0x00, %o2, %o4, %o5, %g2, %g3, %g4, %g5) + CSUM_BIGCHUNK(%o0, 0x20, %o2, %o4, %o5, %g2, %g3, %g4, %g5) + CSUM_BIGCHUNK(%o0, 0x40, %o2, %o4, %o5, %g2, %g3, %g4, %g5) + CSUM_BIGCHUNK(%o0, 0x60, %o2, %o4, %o5, %g2, %g3, %g4, %g5) + addx %g0, %o2, %o2 ! sink in final carry + subcc %o3, 128, %o3 ! detract from loop iters + bne 5b ! more to do + add %o0, 128, %o0 ! advance buf ptr + andcc %o1, 0x70, %g1 ! clears carry flag too +3: be cpte ! nope + andcc %o1, 0xf, %g0 ! anything left at all? + srl %g1, 1, %o4 ! compute offset + sub %g7, %g1, %g7 ! adjust jmp ptr + sub %g7, %o4, %g7 ! final jmp ptr adjust + jmp %g7 + %lo(cpte - 8) ! enter the table + add %o0, %g1, %o0 ! advance buf ptr +cptbl: CSUM_LASTCHUNK(%o0, 0x68, %o2, %g2, %g3, %g4, %g5) + CSUM_LASTCHUNK(%o0, 0x58, %o2, %g2, %g3, %g4, %g5) + CSUM_LASTCHUNK(%o0, 0x48, %o2, %g2, %g3, %g4, %g5) + CSUM_LASTCHUNK(%o0, 0x38, %o2, %g2, %g3, %g4, %g5) + CSUM_LASTCHUNK(%o0, 0x28, %o2, %g2, %g3, %g4, %g5) + CSUM_LASTCHUNK(%o0, 0x18, %o2, %g2, %g3, %g4, %g5) + CSUM_LASTCHUNK(%o0, 0x08, %o2, %g2, %g3, %g4, %g5) + addx %g0, %o2, %o2 ! fetch final carry + andcc %o1, 0xf, %g0 ! anything left at all? +cpte: bne csum_partial_end_cruft ! yep, handle it + andcc %o1, 8, %g0 ! check how much +cpout: retl ! 
get outta here + mov %o2, %o0 ! return computed csum + + /* This aligned version executes typically in 8.5 superscalar cycles, this + * is the best I can do. I say 8.5 because the final add will pair with + * the next ldd in the main unrolled loop. Thus the pipe is always full. + */ +#define CSUMCOPY_BIGCHUNK_ALIGNED(src, dst, sum, off, t0, t1, t2, t3, t4, t5, t6, t7) \ + ldd [src + off + 0x00], t0; \ + ldd [src + off + 0x08], t2; \ + addxcc t0, sum, sum; \ + ldd [src + off + 0x10], t4; \ + addxcc t1, sum, sum; \ + ldd [src + off + 0x18], t6; \ + addxcc t2, sum, sum; \ + std t0, [dst + off + 0x00]; \ + addxcc t3, sum, sum; \ + std t2, [dst + off + 0x08]; \ + addxcc t4, sum, sum; \ + std t4, [dst + off + 0x10]; \ + addxcc t5, sum, sum; \ + std t6, [dst + off + 0x18]; \ + addxcc t6, sum, sum; \ + addxcc t7, sum, sum; + + /* 12 superscalar cycles seems to be the limit for this case, + * because of this we thus do all the ldd's together to get + * Viking MXCC into streaming mode. Ho hum... + */ +#define CSUMCOPY_BIGCHUNK(src, dst, sum, off, t0, t1, t2, t3, t4, t5, t6, t7) \ + ldd [src + off + 0x00], t0; \ + ldd [src + off + 0x08], t2; \ + ldd [src + off + 0x10], t4; \ + ldd [src + off + 0x18], t6; \ + st t0, [dst + off + 0x00]; \ + addxcc t0, sum, sum; \ + st t1, [dst + off + 0x04]; \ + addxcc t1, sum, sum; \ + st t2, [dst + off + 0x08]; \ + addxcc t2, sum, sum; \ + st t3, [dst + off + 0x0c]; \ + addxcc t3, sum, sum; \ + st t4, [dst + off + 0x10]; \ + addxcc t4, sum, sum; \ + st t5, [dst + off + 0x14]; \ + addxcc t5, sum, sum; \ + st t6, [dst + off + 0x18]; \ + addxcc t6, sum, sum; \ + st t7, [dst + off + 0x1c]; \ + addxcc t7, sum, sum; + + /* Yuck, 6 superscalar cycles... */ +#define CSUMCOPY_LASTCHUNK(src, dst, sum, off, t0, t1, t2, t3) \ + ldd [src - off - 0x08], t0; \ + ldd [src - off - 0x00], t2; \ + addxcc t0, sum, sum; \ + st t0, [dst - off - 0x08]; \ + addxcc t1, sum, sum; \ + st t1, [dst - off - 0x04]; \ + addxcc t2, sum, sum; \ + st t2, [dst - off - 0x00]; \ + addxcc t3, sum, sum; \ + st t3, [dst - off + 0x04]; + + /* Handle the end cruft code out of band for better cache patterns. */ +cc_end_cruft: + be 1f + andcc %o3, 4, %g0 + ldd [%o0 + 0x00], %g2 + add %o1, 8, %o1 + addcc %g2, %g7, %g7 + add %o0, 8, %o0 + addxcc %g3, %g7, %g7 + st %g2, [%o1 - 0x08] + addx %g0, %g7, %g7 + andcc %o3, 4, %g0 + st %g3, [%o1 - 0x04] +1: be 1f + andcc %o3, 3, %o3 + ld [%o0 + 0x00], %g2 + add %o1, 4, %o1 + addcc %g2, %g7, %g7 + st %g2, [%o1 - 0x04] + addx %g0, %g7, %g7 + add %o0, 4, %o0 + andcc %o3, 3, %g0 +1: be 1f + addcc %o3, -1, %g0 + bne 2f + subcc %o3, 2, %o3 + b 4f + or %g0, %g0, %o4 +2: lduh [%o0 + 0x00], %o4 + add %o0, 2, %o0 + sth %o4, [%o1 + 0x00] + be 6f + add %o1, 2, %o1 + sll %o4, 16, %o4 +4: ldub [%o0 + 0x00], %o5 + stb %o5, [%o1 + 0x00] + sll %o5, 8, %o5 + or %o5, %o4, %o4 +6: addcc %o4, %g7, %g7 +1: retl + addx %g0, %g7, %o0 + + /* Also, handle the alignment code out of band. 
*/ +cc_dword_align: + cmp %g1, 6 + bl,a ccte + andcc %g1, 0xf, %o3 + andcc %o0, 0x1, %g0 + bne ccslow + andcc %o0, 0x2, %g0 + be 1f + andcc %o0, 0x4, %g0 + lduh [%o0 + 0x00], %g2 + sub %g1, 2, %g1 + sth %g2, [%o1 + 0x00] + add %o0, 2, %o0 + sll %g2, 16, %g2 + addcc %g2, %g7, %g7 + add %o1, 2, %o1 + srl %g7, 16, %g3 + addx %g0, %g3, %g2 + sll %g7, 16, %g7 + sll %g2, 16, %g3 + srl %g7, 16, %g7 + andcc %o0, 0x4, %g0 + or %g3, %g7, %g7 +1: be 3f + andcc %g1, 0xffffff80, %g0 + ld [%o0 + 0x00], %g2 + sub %g1, 4, %g1 + st %g2, [%o1 + 0x00] + add %o0, 4, %o0 + addcc %g2, %g7, %g7 + add %o1, 4, %o1 + addx %g0, %g7, %g7 + b 3f + andcc %g1, 0xffffff80, %g0 + + /* Sun, you just can't beat me, you just can't. Stop trying, + * give up. I'm serious, I am going to kick the living shit + * out of you, game over, lights out. + */ + .align 8 + .globl C_LABEL(csum_partial_copy) +C_LABEL(csum_partial_copy): /* %o0=src, %o1=dest, %o2=len, %o3=sum */ + xor %o0, %o1, %o4 ! get changing bits + mov %o2, %g1 ! free up %o2 + andcc %o4, 3, %g0 ! check for mismatched alignment + bne ccslow ! better this than unaligned/fixups + andcc %o0, 7, %g0 ! need to align things? + mov %o3, %g7 ! free up %o3 + bne cc_dword_align ! yes, we check for short lengths there + andcc %g1, 0xffffff80, %g0 ! can we use unrolled loop? +3: be 3f ! nope, less than one loop remains + andcc %o1, 4, %g0 ! dest aligned on 4 or 8 byte boundry? + be ccdbl + 4 ! 8 byte aligned, kick ass +5: CSUMCOPY_BIGCHUNK(%o0,%o1,%g7,0x00,%o4,%o5,%g2,%g3,%g4,%g5,%o2,%o3) + CSUMCOPY_BIGCHUNK(%o0,%o1,%g7,0x20,%o4,%o5,%g2,%g3,%g4,%g5,%o2,%o3) + CSUMCOPY_BIGCHUNK(%o0,%o1,%g7,0x40,%o4,%o5,%g2,%g3,%g4,%g5,%o2,%o3) + CSUMCOPY_BIGCHUNK(%o0,%o1,%g7,0x60,%o4,%o5,%g2,%g3,%g4,%g5,%o2,%o3) + sub %g1, 128, %g1 ! detract from length + addx %g0, %g7, %g7 ! add in last carry bit + andcc %g1, 0xffffff80, %g0 ! more to csum? + add %o0, 128, %o0 ! advance src ptr + bne 5b ! we did not go negative, continue looping + add %o1, 128, %o1 ! advance dest ptr +3: andcc %g1, 0x70, %o2 ! can use table? +ccmerge:be ccte ! nope, go and check for end cruft + andcc %g1, 0xf, %o3 ! get low bits of length (clears carry btw) + srl %o2, 1, %o4 ! begin negative offset computation + sethi %hi(ccte - 8), %o5 ! set up table ptr end + add %o0, %o2, %o0 ! advance src ptr + sub %o5, %o4, %o5 ! continue table calculation + sll %o2, 1, %g2 ! constant multiplies are fun... + sub %o5, %g2, %o5 ! some more adjustments + jmp %o5 + %lo(ccte - 8) ! jump into it, duff style, wheee... + add %o1, %o2, %o1 ! advance dest ptr (carry is clear btw) +cctbl: CSUMCOPY_LASTCHUNK(%o0,%o1,%g7,0x68,%g2,%g3,%g4,%g5) + CSUMCOPY_LASTCHUNK(%o0,%o1,%g7,0x58,%g2,%g3,%g4,%g5) + CSUMCOPY_LASTCHUNK(%o0,%o1,%g7,0x48,%g2,%g3,%g4,%g5) + CSUMCOPY_LASTCHUNK(%o0,%o1,%g7,0x38,%g2,%g3,%g4,%g5) + CSUMCOPY_LASTCHUNK(%o0,%o1,%g7,0x28,%g2,%g3,%g4,%g5) + CSUMCOPY_LASTCHUNK(%o0,%o1,%g7,0x18,%g2,%g3,%g4,%g5) + CSUMCOPY_LASTCHUNK(%o0,%o1,%g7,0x08,%g2,%g3,%g4,%g5) + addx %g0, %g7, %g7 + andcc %o3, 0xf, %g0 ! check for low bits set +ccte: bne cc_end_cruft ! something left, handle it out of band + andcc %o3, 8, %g0 ! begin checks for that code + retl ! return + mov %g7, %o0 ! give em the computed checksum +ccdbl: CSUMCOPY_BIGCHUNK_ALIGNED(%o0,%o1,%g7,0x00,%o4,%o5,%g2,%g3,%g4,%g5,%o2,%o3) + CSUMCOPY_BIGCHUNK_ALIGNED(%o0,%o1,%g7,0x20,%o4,%o5,%g2,%g3,%g4,%g5,%o2,%o3) + CSUMCOPY_BIGCHUNK_ALIGNED(%o0,%o1,%g7,0x40,%o4,%o5,%g2,%g3,%g4,%g5,%o2,%o3) + CSUMCOPY_BIGCHUNK_ALIGNED(%o0,%o1,%g7,0x60,%o4,%o5,%g2,%g3,%g4,%g5,%o2,%o3) + sub %g1, 128, %g1 ! 
detract from length + addx %g0, %g7, %g7 ! add in last carry bit + andcc %g1, 0xffffff80, %g0 ! more to csum? + add %o0, 128, %o0 ! advance src ptr + bne ccdbl ! we did not go negative, continue looping + add %o1, 128, %o1 ! advance dest ptr + b ccmerge ! finish it off, above + andcc %g1, 0x70, %o2 ! can use table? (clears carry btw) + +ccslow: + save %sp, -104, %sp + mov %i0, %g2 + mov %g2, %o4 + orcc %i2, %g0, %o5 + ble .LL37 + mov 0, %o3 + andcc %g2, 1, %g3 + be .LL50 + sra %o5, 1, %o1 + ldub [%g2], %o3 + add %i2, -1, %o5 + add %g2, 1, %o4 + sra %o5, 1, %o1 +.LL50: + cmp %o1, 0 + be .LL39 + andcc %o4, 2, %g0 + be,a .LL51 + sra %o1, 1, %o1 + add %o1, -1, %o1 + lduh [%o4], %o0 + add %o5, -2, %o5 + add %o3, %o0, %o3 + add %o4, 2, %o4 + sra %o1, 1, %o1 +.LL51: + cmp %o1, 0 + be .LL41 + mov 0, %o2 +.LL42: + ld [%o4], %o0 + add %o3, %o2, %o3 + add %o3, %o0, %o3 + cmp %o3, %o0 + addx %g0, 0, %o2 + addcc %o1, -1, %o1 + bne .LL42 + add %o4, 4, %o4 + add %o3, %o2, %o3 + sethi %hi(65535), %o0 + or %o0, %lo(65535), %o0 + and %o3, %o0, %o0 + srl %o3, 16, %o1 + add %o0, %o1, %o3 +.LL41: + andcc %o5, 2, %g0 + be .LL52 + andcc %o5, 1, %g0 + lduh [%o4], %o0 + add %o3, %o0, %o3 + add %o4, 2, %o4 +.LL39: + andcc %o5, 1, %g0 +.LL52: + be .LL53 + sethi %hi(65535), %o0 + ldub [%o4], %o0 + sll %o0, 8, %o0 + add %o3, %o0, %o3 + sethi %hi(65535), %o0 +.LL53: + or %o0, %lo(65535), %o0 + and %o3, %o0, %o2 + srl %o3, 16, %o1 + add %o2, %o1, %o1 + and %o1, %o0, %o2 + srl %o1, 16, %o1 + add %o2, %o1, %o1 + and %o1, %o0, %o0 + srl %o1, 16, %o1 + add %o0, %o1, %o1 + sll %o1, 16, %o0 + cmp %g3, 0 + be .LL37 + srl %o0, 16, %o3 + srl %o0, 24, %o1 + and %o3, 255, %o0 + sll %o0, 8, %o0 + or %o1, %o0, %o3 +.LL37: + add %o3, %i3, %o1 + sethi %hi(65535), %o0 + or %o0, %lo(65535), %o0 + and %o1, %o0, %o0 + srl %o1, 16, %o1 + add %o0, %o1, %i0 + mov %i1, %o0 + mov %g2, %o1 + call C_LABEL(__memcpy) + mov %i2, %o2 + ret + restore diff --git a/arch/sparc/lib/memcmp.S b/arch/sparc/lib/memcmp.S new file mode 100644 index 000000000..bf22e492c --- /dev/null +++ b/arch/sparc/lib/memcmp.S @@ -0,0 +1,314 @@ +#include <asm/cprefix.h> + + .text + .align 4 + .global C_LABEL(__memcmp), C_LABEL(memcmp) +C_LABEL(__memcmp): +C_LABEL(memcmp): +#if 1 + cmp %o2, 0 + ble L3 + mov 0, %g3 +L5: + ldub [%o0], %g2 + ldub [%o1], %g3 + sub %g2, %g3, %g2 + mov %g2, %g3 + sll %g2, 24, %g2 + + cmp %g2, 0 + bne L3 + add %o0, 1, %o0 + + add %o2, -1, %o2 + + cmp %o2, 0 + bg L5 + add %o1, 1, %o1 +L3: + sll %g3, 24, %o0 + sra %o0, 24, %o0 + + retl + nop +#else + save %sp, -104, %sp + mov %i2, %o4 + mov %i0, %o0 + + cmp %o4, 15 + ble L72 + mov %i1, %i2 + + andcc %i2, 3, %g0 + be L161 + andcc %o0, 3, %g2 +L75: + ldub [%o0], %g3 + ldub [%i2], %g2 + add %o0,1, %o0 + + subcc %g3, %g2, %i0 + bne L156 + add %i2, 1, %i2 + + andcc %i2, 3, %g0 + bne L75 + add %o4, -1, %o4 + + andcc %o0, 3, %g2 +L161: + bne,a L78 + mov %i2, %i1 + + mov %o0, %i5 + mov %i2, %i3 + srl %o4, 2, %i4 + + cmp %i4, 0 + bge L93 + mov %i4, %g2 + + add %i4, 3, %g2 +L93: + sra %g2, 2, %g2 + sll %g2, 2, %g2 + sub %i4, %g2, %g2 + + cmp %g2, 1 + be,a L88 + add %o0, 4, %i5 + + bg L94 + cmp %g2, 2 + + cmp %g2, 0 + be,a L86 + ld [%o0], %g3 + + b L162 + ld [%i5], %g3 +L94: + be L81 + cmp %g2, 3 + + be,a L83 + add %o0, -4, %i5 + + b L162 + ld [%i5], %g3 +L81: + add %o0, -8, %i5 + ld [%o0], %g3 + add %i2, -8, %i3 + ld [%i2], %g2 + + b L82 + add %i4, 2, %i4 +L83: + ld [%o0], %g4 + add %i2, -4, %i3 + ld [%i2], %g1 + + b L84 + add %i4, 1, %i4 +L86: + b L87 + ld [%i2], %g2 +L88: + add %i2, 4, %i3 + ld [%o0], %g4 + 
add %i4, -1, %i4 + ld [%i2], %g1 +L95: + ld [%i5], %g3 +L162: + cmp %g4, %g1 + be L87 + ld [%i3], %g2 + + cmp %g4, %g1 +L163: + bleu L114 + mov -1, %i0 + + b L114 + mov 1, %i0 +L87: + ld [%i5 + 4], %g4 + cmp %g3, %g2 + bne L163 + ld [%i3 + 4], %g1 +L84: + ld [%i5 + 8], %g3 + + cmp %g4, %g1 + bne L163 + ld [%i3 + 8], %g2 +L82: + ld [%i5 + 12], %g4 + cmp %g3, %g2 + bne L163 + ld [%i3 + 12], %g1 + + add %i5, 16, %i5 + + addcc %i4, -4, %i4 + bne L95 + add %i3, 16, %i3 + + cmp %g4, %g1 + bne L163 + nop + + b L114 + mov 0, %i0 +L78: + srl %o4, 2, %i0 + and %o0, -4, %i3 + orcc %i0, %g0, %g3 + sll %g2, 3, %o7 + mov 32, %g2 + + bge L129 + sub %g2, %o7, %o1 + + add %i0, 3, %g3 +L129: + sra %g3, 2, %g2 + sll %g2, 2, %g2 + sub %i0, %g2, %g2 + + cmp %g2, 1 + be,a L124 + ld [%i3], %o3 + + bg L130 + cmp %g2, 2 + + cmp %g2, 0 + be,a L122 + ld [%i3], %o2 + + b L164 + sll %o3, %o7, %g3 +L130: + be L117 + cmp %g2, 3 + + be,a L119 + ld [%i3], %g1 + + b L164 + sll %o3, %o7, %g3 +L117: + ld [%i3], %g4 + add %i2, -8, %i1 + ld [%i3 + 4], %o3 + add %i0, 2, %i0 + ld [%i2], %i4 + + b L118 + add %i3, -4, %i3 +L119: + ld [%i3 + 4], %g4 + add %i2, -4, %i1 + ld [%i2], %i5 + + b L120 + add %i0, 1, %i0 +L122: + ld [%i3 + 4], %g1 + ld [%i2], %i4 + + b L123 + add %i3, 4, %i3 +L124: + add %i2, 4, %i1 + ld [%i3 + 4], %o2 + add %i0, -1, %i0 + ld [%i2], %i5 + add %i3, 8, %i3 +L131: + sll %o3, %o7, %g3 +L164: + srl %o2, %o1, %g2 + ld [%i3], %g1 + or %g3, %g2, %g3 + + cmp %g3, %i5 + bne L163 + ld [%i1], %i4 +L123: + sll %o2, %o7, %g3 + srl %g1, %o1, %g2 + ld [%i3 + 4], %g4 + or %g3, %g2, %g3 + + cmp %g3, %i4 + bne L163 + ld [%i1 + 4], %i5 +L120: + sll %g1, %o7, %g3 + srl %g4, %o1, %g2 + ld [%i3 + 8], %o3 + or %g3, %g2, %g3 + + cmp %g3, %i5 + bne L163 + ld [%i1 + 8], %i4 +L118: + sll %g4, %o7, %g3 + srl %o3, %o1, %g2 + ld [%i3 + 12], %o2 + or %g3, %g2, %g3 + + cmp %g3, %i4 + bne L163 + ld [%i1 + 12], %i5 + + add %i3, 16, %i3 + addcc %i0, -4, %i0 + bne L131 + add %i1, 16, %i1 + + sll %o3, %o7, %g3 + srl %o2, %o1, %g2 + or %g3, %g2, %g3 + + cmp %g3, %i5 + be,a L114 + mov 0, %i0 + + b,a L163 +L114: + cmp %i0, 0 + bne L156 + and %o4, -4, %g2 + + add %o0, %g2, %o0 + add %i2, %g2, %i2 + and %o4, 3, %o4 +L72: + cmp %o4, 0 + be L156 + mov 0, %i0 + + ldub [%o0], %g3 +L165: + ldub [%i2], %g2 + add %o0, 1, %o0 + + subcc %g3, %g2, %i0 + bne L156 + add %i2, 1, %i2 + + addcc %o4, -1, %o4 + bne,a L165 + ldub [%o0], %g3 + + mov 0, %i0 +L156: + ret + restore +#endif diff --git a/arch/sparc/lib/memcpy.S b/arch/sparc/lib/memcpy.S new file mode 100644 index 000000000..c4f0394a4 --- /dev/null +++ b/arch/sparc/lib/memcpy.S @@ -0,0 +1,364 @@ +/* memcpy.S: Sparc optimized memcpy code. + * + * Copyright(C) 1995 Linus Torvalds + * Copyright(C) 1996 David S. Miller + * Copyright(C) 1996 Eddie C. Dost + * Copyright(C) 1996 Jakub Jelinek + * + * derived from: + * e-mail between David and Eddie. 
+ */ + +#include <asm/cprefix.h> +#include <asm/ptrace.h> + +/* Both these macros have to start with exactly the same insn */ +#define MOVE_BIGCHUNK(src, dst, offset, t0, t1, t2, t3, t4, t5, t6, t7) \ + ldd [%src + offset + 0x00], %t0; \ + ldd [%src + offset + 0x08], %t2; \ + ldd [%src + offset + 0x10], %t4; \ + ldd [%src + offset + 0x18], %t6; \ + st %t0, [%dst + offset + 0x00]; \ + st %t1, [%dst + offset + 0x04]; \ + st %t2, [%dst + offset + 0x08]; \ + st %t3, [%dst + offset + 0x0c]; \ + st %t4, [%dst + offset + 0x10]; \ + st %t5, [%dst + offset + 0x14]; \ + st %t6, [%dst + offset + 0x18]; \ + st %t7, [%dst + offset + 0x1c]; + +#define MOVE_BIGALIGNCHUNK(src, dst, offset, t0, t1, t2, t3, t4, t5, t6, t7) \ + ldd [%src + offset + 0x00], %t0; \ + ldd [%src + offset + 0x08], %t2; \ + ldd [%src + offset + 0x10], %t4; \ + ldd [%src + offset + 0x18], %t6; \ + std %t0, [%dst + offset + 0x00]; \ + std %t2, [%dst + offset + 0x08]; \ + std %t4, [%dst + offset + 0x10]; \ + std %t6, [%dst + offset + 0x18]; + +#define MOVE_LASTCHUNK(src, dst, offset, t0, t1, t2, t3) \ + ldd [%src - offset - 0x10], %t0; \ + ldd [%src - offset - 0x08], %t2; \ + st %t0, [%dst - offset - 0x10]; \ + st %t1, [%dst - offset - 0x0c]; \ + st %t2, [%dst - offset - 0x08]; \ + st %t3, [%dst - offset - 0x04]; + +#define MOVE_HALFCHUNK(src, dst, offset, t0, t1, t2, t3) \ + lduh [%src + offset + 0x00], %t0; \ + lduh [%src + offset + 0x02], %t1; \ + lduh [%src + offset + 0x04], %t2; \ + lduh [%src + offset + 0x06], %t3; \ + sth %t0, [%dst + offset + 0x00]; \ + sth %t1, [%dst + offset + 0x02]; \ + sth %t2, [%dst + offset + 0x04]; \ + sth %t3, [%dst + offset + 0x06]; + +#define MOVE_SHORTCHUNK(src, dst, offset, t0, t1) \ + ldub [%src - offset - 0x02], %t0; \ + ldub [%src - offset - 0x01], %t1; \ + stb %t0, [%dst - offset - 0x02]; \ + stb %t1, [%dst - offset - 0x01]; + + .text + .align 4 + + .globl C_LABEL(__memcpy), C_LABEL(memcpy), C_LABEL(bcopy) + .globl C_LABEL(amemmove), C_LABEL(memmove) +C_LABEL(bcopy): + mov %o0, %o3 + mov %o1, %o0 + mov %o3, %o1 +C_LABEL(amemmove): +C_LABEL(memmove): +/* This should be kept as optimized as possible */ + cmp %o0, %o1 + bleu 1f + xor %o0, %o1, %o4 + + add %o1, %o2, %o3 + cmp %o3, %o0 + bleu 2f + andcc %o4, 3, %g0 + +/* But I think from now on, we can hold on. Or tell me, is memmoving + * overlapping regions such a nice game? */ + + mov %o0, %g1 + add %o1, %o2, %o1 + add %o0, %o2, %o0 + sub %o1, 1, %o1 + sub %o0, 1, %o0 + +reverse_bytes: + ldub [%o1], %o4 + subcc %o2, 1, %o2 + stb %o4, [%o0] + sub %o1, 1, %o1 + bne reverse_bytes + sub %o0, 1, %o0 + + retl + mov %g1, %o0 + +/* And here start optimizing again... 
*/ + +dword_align: + andcc %o1, 1, %g0 + be 4f + andcc %o1, 2, %g0 + + ldub [%o1], %g2 + add %o1, 1, %o1 + stb %g2, [%o0] + sub %o2, 1, %o2 + bne 3f + add %o0, 1, %o0 + + lduh [%o1], %g2 + add %o1, 2, %o1 + sth %g2, [%o0] + sub %o2, 2, %o2 + b 3f + add %o0, 2, %o0 +4: + lduh [%o1], %g2 + add %o1, 2, %o1 + sth %g2, [%o0] + sub %o2, 2, %o2 + b 3f + add %o0, 2, %o0 + +C_LABEL(__memcpy): +C_LABEL(memcpy): /* %o0=dst %o1=src %o2=len */ + xor %o0, %o1, %o4 +1: + andcc %o4, 3, %o5 +2: + bne cannot_optimize + cmp %o2, 15 + + bleu short_aligned_end + andcc %o1, 3, %g0 + + bne dword_align +3: + andcc %o1, 4, %g0 + + be 2f + mov %o2, %g1 + + ld [%o1], %o4 + sub %g1, 4, %g1 + st %o4, [%o0] + add %o1, 4, %o1 + add %o0, 4, %o0 +2: + andcc %g1, 0xffffff80, %g7 + be 3f + andcc %o0, 4, %g0 + + be ldd_std + 4 +5: + MOVE_BIGCHUNK(o1, o0, 0x00, o2, o3, o4, o5, g2, g3, g4, g5) + MOVE_BIGCHUNK(o1, o0, 0x20, o2, o3, o4, o5, g2, g3, g4, g5) + MOVE_BIGCHUNK(o1, o0, 0x40, o2, o3, o4, o5, g2, g3, g4, g5) + MOVE_BIGCHUNK(o1, o0, 0x60, o2, o3, o4, o5, g2, g3, g4, g5) + subcc %g7, 128, %g7 + add %o1, 128, %o1 + bne 5b + add %o0, 128, %o0 +3: + andcc %g1, 0x70, %g7 + be memcpy_table_end + andcc %g1, 8, %g0 + + sethi %hi(memcpy_table_end), %o5 + srl %g7, 1, %o4 + add %g7, %o4, %o4 + add %o1, %g7, %o1 + sub %o5, %o4, %o5 + jmpl %o5 + %lo(memcpy_table_end), %g0 + add %o0, %g7, %o0 + +memcpy_table: + MOVE_LASTCHUNK(o1, o0, 0x60, g2, g3, g4, g5) + MOVE_LASTCHUNK(o1, o0, 0x50, g2, g3, g4, g5) + MOVE_LASTCHUNK(o1, o0, 0x40, g2, g3, g4, g5) + MOVE_LASTCHUNK(o1, o0, 0x30, g2, g3, g4, g5) + MOVE_LASTCHUNK(o1, o0, 0x20, g2, g3, g4, g5) + MOVE_LASTCHUNK(o1, o0, 0x10, g2, g3, g4, g5) + MOVE_LASTCHUNK(o1, o0, 0x00, g2, g3, g4, g5) + +memcpy_table_end: + be memcpy_last7 + andcc %g1, 4, %g0 + + ldd [%o1], %g2 + add %o0, 8, %o0 + add %o1, 8, %o1 + st %g2, [%o0 - 0x08] + st %g3, [%o0 - 0x04] +memcpy_last7: + be 1f + andcc %g1, 2, %g0 + + ld [%o1], %g2 + add %o1, 4, %o1 + st %g2, [%o0] + add %o0, 4, %o0 +1: + be 1f + andcc %g1, 1, %g0 + + lduh [%o1], %g2 + add %o1, 2, %o1 + sth %g2, [%o0] + add %o0, 2, %o0 +1: + be 1f + nop + + ldub [%o1], %g2 + stb %g2, [%o0] +1: + retl + nop + + /* Placed here for cache reasons. 
*/ + .globl C_LABEL(__copy_to_user), C_LABEL(__copy_from_user) +C_LABEL(__copy_to_user): + b copy_user_common + st %o0, [%g6 + THREAD_EX_ADDR] + +C_LABEL(__copy_from_user): + st %o1, [%g6 + THREAD_EX_ADDR] + +copy_user_common: + ld [%g6 + THREAD_EX_COUNT], %g1 + set copy_user_failure, %g2 + add %g1, 1, %g1 + st %o7, [%g6 + THREAD_EX_PC] + st %g1, [%g6 + THREAD_EX_COUNT] + call C_LABEL(__memcpy) + st %g2, [%g6 + THREAD_EX_EXPC] + +copy_user_success: + ldd [%g6 + THREAD_EX_COUNT], %g2 + mov 0, %o0 + sub %g2, 1, %g1 + jmpl %g3 + 0x8, %g0 + st %g1, [%g6 + THREAD_EX_COUNT] + +copy_user_failure: + jmpl %g3 + 0x8, %g0 + mov %g2, %o0 + +ldd_std: + MOVE_BIGALIGNCHUNK(o1, o0, 0x00, o2, o3, o4, o5, g2, g3, g4, g5) + MOVE_BIGALIGNCHUNK(o1, o0, 0x20, o2, o3, o4, o5, g2, g3, g4, g5) + MOVE_BIGALIGNCHUNK(o1, o0, 0x40, o2, o3, o4, o5, g2, g3, g4, g5) + MOVE_BIGALIGNCHUNK(o1, o0, 0x60, o2, o3, o4, o5, g2, g3, g4, g5) + subcc %g7, 128, %g7 + add %o1, 128, %o1 + bne ldd_std + add %o0, 128, %o0 + + andcc %g1, 0x70, %g7 + be memcpy_table_end + andcc %g1, 8, %g0 + + sethi %hi(memcpy_table_end), %o5 + srl %g7, 1, %o4 + add %g7, %o4, %o4 + add %o1, %g7, %o1 + sub %o5, %o4, %o5 + jmpl %o5 + %lo(memcpy_table_end), %g0 + add %o0, %g7, %o0 + +cannot_optimize: + bleu short_end + cmp %o5, 2 + + bne byte_chunk + and %o2, 0xfffffff0, %o3 + + andcc %o1, 1, %g0 + be 1f + nop + + ldub [%o1], %g2 + add %o1, 1, %o1 + sub %o2, 1, %o2 + stb %g2, [%o0] + andcc %o2, 0xfffffff0, %o3 + be short_end + add %o0, 1, %o0 +1: + MOVE_HALFCHUNK(o1, o0, 0x00, g2, g3, g4, g5) + MOVE_HALFCHUNK(o1, o0, 0x08, g2, g3, g4, g5) + subcc %o3, 0x10, %o3 + add %o1, 0x10, %o1 + bne 1b + add %o0, 0x10, %o0 + b 2f + and %o2, 0xe, %o3 + +byte_chunk: + MOVE_SHORTCHUNK(o1, o0, -0x02, g2, g3) + MOVE_SHORTCHUNK(o1, o0, -0x04, g2, g3) + MOVE_SHORTCHUNK(o1, o0, -0x06, g2, g3) + MOVE_SHORTCHUNK(o1, o0, -0x08, g2, g3) + MOVE_SHORTCHUNK(o1, o0, -0x0a, g2, g3) + MOVE_SHORTCHUNK(o1, o0, -0x0c, g2, g3) + MOVE_SHORTCHUNK(o1, o0, -0x0e, g2, g3) + MOVE_SHORTCHUNK(o1, o0, -0x10, g2, g3) + subcc %o3, 0x10, %o3 + add %o1, 0x10, %o1 + bne byte_chunk + add %o0, 0x10, %o0 + +short_end: + and %o2, 0xe, %o3 +2: + sethi %hi(short_table_end), %o5 + sll %o3, 3, %o4 + add %o0, %o3, %o0 + sub %o5, %o4, %o5 + add %o1, %o3, %o1 + jmpl %o5 + %lo(short_table_end), %g0 + andcc %o2, 1, %g0 + + MOVE_SHORTCHUNK(o1, o0, 0x0c, g2, g3) + MOVE_SHORTCHUNK(o1, o0, 0x0a, g2, g3) + MOVE_SHORTCHUNK(o1, o0, 0x08, g2, g3) + MOVE_SHORTCHUNK(o1, o0, 0x06, g2, g3) + MOVE_SHORTCHUNK(o1, o0, 0x04, g2, g3) + MOVE_SHORTCHUNK(o1, o0, 0x02, g2, g3) + MOVE_SHORTCHUNK(o1, o0, 0x00, g2, g3) +short_table_end: + be 1f + nop + ldub [%o1], %g2 + stb %g2, [%o0] +1: + retl + nop + +short_aligned_end: + bne short_end + andcc %o2, 8, %g0 + + be 1f + andcc %o2, 4, %g0 + + ld [%o1 + 0x00], %g2 + ld [%o1 + 0x04], %g3 + add %o1, 8, %o1 + st %g2, [%o0 + 0x00] + st %g3, [%o0 + 0x04] + add %o0, 8, %o0 +1: + b memcpy_last7 + mov %o2, %g1 diff --git a/arch/sparc/lib/memscan.S b/arch/sparc/lib/memscan.S new file mode 100644 index 000000000..f334751c2 --- /dev/null +++ b/arch/sparc/lib/memscan.S @@ -0,0 +1,135 @@ +/* $Id: memscan.S,v 1.4 1996/09/08 02:01:20 davem Exp $ + * memscan.S: Optimized memscan for the Sparc. + * + * Copyright (C) 1996 David S. Miller (davem@caip.rutgers.edu) + */ + +#include <asm/cprefix.h> + +/* In essence, this is just a fancy strlen. 
*/ + +#define LO_MAGIC 0x01010101 +#define HI_MAGIC 0x80808080 + + .text + .align 4 + .globl C_LABEL(__memscan_zero), C_LABEL(__memscan_generic) + .globl C_LABEL(memscan) +C_LABEL(__memscan_zero): + /* %o0 = addr, %o1 = size */ + cmp %o1, 0 + bne,a 1f + andcc %o0, 3, %g0 + + retl + nop + +1: + be mzero_scan_word + sethi %hi(HI_MAGIC), %g2 + + ldsb [%o0], %g3 +mzero_still_not_word_aligned: + cmp %g3, 0 + bne 1f + add %o0, 1, %o0 + + retl + sub %o0, 1, %o0 + +1: + subcc %o1, 1, %o1 + bne,a 1f + andcc %o0, 3, %g0 + + retl + nop + +1: + bne,a mzero_still_not_word_aligned + ldsb [%o0], %g3 + + sethi %hi(HI_MAGIC), %g2 +mzero_scan_word: + or %g2, %lo(HI_MAGIC), %o3 + sethi %hi(LO_MAGIC), %g3 + or %g3, %lo(LO_MAGIC), %o2 +mzero_next_word: + ld [%o0], %g2 +mzero_next_word_preloaded: + sub %g2, %o2, %g2 +mzero_next_word_preloaded_next: + andcc %g2, %o3, %g0 + bne mzero_byte_zero + add %o0, 4, %o0 + +mzero_check_out_of_fuel: + subcc %o1, 4, %o1 + bg,a 1f + ld [%o0], %g2 + + retl + nop + +1: + b mzero_next_word_preloaded_next + sub %g2, %o2, %g2 + + /* Check every byte. */ +mzero_byte_zero: + ldsb [%o0 - 4], %g2 + cmp %g2, 0 + bne mzero_byte_one + sub %o0, 4, %g3 + + retl + mov %g3, %o0 + +mzero_byte_one: + ldsb [%o0 - 3], %g2 + cmp %g2, 0 + bne,a mzero_byte_two_and_three + ldsb [%o0 - 2], %g2 + + retl + sub %o0, 3, %o0 + +mzero_byte_two_and_three: + cmp %g2, 0 + bne,a 1f + ldsb [%o0 - 1], %g2 + + retl + sub %o0, 2, %o0 + +1: + cmp %g2, 0 + bne,a mzero_next_word_preloaded + ld [%o0], %g2 + + retl + sub %o0, 1, %o0 + +mzero_found_it: + retl + sub %o0, 2, %o0 + +C_LABEL(memscan): +C_LABEL(__memscan_generic): + /* %o0 = addr, %o1 = c, %o2 = size */ + cmp %o2, 0 + bne,a 0f + ldub [%o0], %g2 + + b,a 2f +1: + ldub [%o0], %g2 +0: + cmp %g2, %o1 + be 2f + addcc %o2, -1, %o2 + bne 1b + add %o0, 1, %o0 +2: + retl + nop diff --git a/arch/sparc/lib/memset.S b/arch/sparc/lib/memset.S new file mode 100644 index 000000000..95691debb --- /dev/null +++ b/arch/sparc/lib/memset.S @@ -0,0 +1,166 @@ +/* linux/arch/sparc/lib/memset.S: Sparc optimized memset and bzero code + * Hand optimized from GNU libc's memset + * Copyright (C) 1991,1996 Free Software Foundation + * Copyright (C) 1996 Jakub Jelinek (jj@sunsite.mff.cuni.cz) + * Copyright (C) 1996 David S. Miller (davem@caip.rutgers.edu) + */ + +#include <asm/cprefix.h> +#include <asm/ptrace.h> + +#define HANDLE_UNALIGNED 1 + + /* Store 64 bytes at (BASE + OFFSET) using value SOURCE. 
*/ +#define ZERO_BIG_BLOCK(base, offset, source) \ + std source, [base + offset + 0x00]; \ + std source, [base + offset + 0x08]; \ + std source, [base + offset + 0x10]; \ + std source, [base + offset + 0x18]; \ + std source, [base + offset + 0x20]; \ + std source, [base + offset + 0x28]; \ + std source, [base + offset + 0x30]; \ + std source, [base + offset + 0x38]; + +#define ZERO_LAST_BLOCKS(base, offset, source) \ + std source, [base - offset - 0x38]; \ + std source, [base - offset - 0x30]; \ + std source, [base - offset - 0x28]; \ + std source, [base - offset - 0x20]; \ + std source, [base - offset - 0x18]; \ + std source, [base - offset - 0x10]; \ + std source, [base - offset - 0x08]; \ + std source, [base - offset - 0x00]; + + .text + .align 4 + + .globl C_LABEL(__bzero), C_LABEL(__memset), C_LABEL(memset) +C_LABEL(__memset): +C_LABEL(memset): + and %o1, 0xff, %g3 + sll %g3, 8, %g2 + or %g3, %g2, %g3 + sll %g3, 16, %g2 + or %g3, %g2, %g3 + b 1f + mov %o2, %o1 + +#if HANDLE_UNALIGNED +/* As this is highly unprobable, we optimize the other case (4 aligned) + * Define HANDLE_UNALIGNED to 0, if all the alignment work is done by + * the trap. Then we have to hope nobody will memset something unaligned + * with large counts, as this would lead to a lot of traps... + */ +3: + cmp %o2, 3 + be 2f + stb %g3, [%o0] + + cmp %o2, 2 + be 2f + stb %g3, [%o0 + 0x01] + + stb %g3, [%o0 + 0x02] +2: + sub %o2, 4, %o2 + add %o1, %o2, %o1 + b 4f + sub %o0, %o2, %o0 +#endif /* HANDLE_UNALIGNED */ + + .globl C_LABEL(__clear_user) +C_LABEL(__clear_user): + st %o0, [%g6 + THREAD_EX_ADDR] + ld [%g6 + THREAD_EX_COUNT], %g1 + set clear_user_failure, %g2 + add %g1, 1, %g1 + st %o7, [%g6 + THREAD_EX_PC] + st %g1, [%g6 + THREAD_EX_COUNT] + call C_LABEL(__bzero) + st %g2, [%g6 + THREAD_EX_EXPC] + +clear_user_success: + ldd [%g6 + THREAD_EX_COUNT], %g2 + mov 0, %o0 + sub %g2, 1, %g1 + jmpl %g3 + 0x8, %g0 + st %g1, [%g6 + THREAD_EX_COUNT] + +clear_user_failure: + jmpl %g3 + 0x8, %g0 + mov %g2, %o0 + +C_LABEL(__bzero): + mov %g0, %g3 +1: + cmp %o1, 7 + bleu 7f + mov %o0, %g1 + +#if HANDLE_UNALIGNED + andcc %o0, 3, %o2 + bne 3b +#endif /* HANDLE_UNALIGNED */ +4: + andcc %o0, 4, %g0 + + be 2f + mov %g3, %g2 + + st %g3, [%o0] + sub %o1, 4, %o1 + add %o0, 4, %o0 +2: + andcc %o1, 0xffffff80, %o3 ! Now everything is 8 aligned and o1 is len to run + be 9f + andcc %o1, 0x78, %o2 +4: + ZERO_BIG_BLOCK(%o0, 0x00, %g2) + subcc %o3, 128, %o3 + ZERO_BIG_BLOCK(%o0, 0x40, %g2) + bne 4b + add %o0, 128, %o0 + + orcc %o2, %g0, %g0 +9: + be 6f + andcc %o1, 7, %o1 + + srl %o2, 1, %o3 + set bzero_table + 64, %o4 + sub %o4, %o3, %o4 + jmp %o4 + add %o0, %o2, %o0 + +bzero_table: + ZERO_LAST_BLOCKS(%o0, 0x48, %g2) + ZERO_LAST_BLOCKS(%o0, 0x08, %g2) + +6: + be 8f + andcc %o1, 4, %g0 + + be 1f + andcc %o1, 2, %g0 + + st %g3, [%o0] + add %o0, 4, %o0 +1: + be 1f + andcc %o1, 1, %g0 + + sth %g3, [%o0] + add %o0, 2, %o0 +1: + bne,a 8f + stb %g3, [%o0] +8: + retl + mov %g1,%o0 + +/* Don't care about alignment here. It is highly + * unprobable and at most two traps may happen + */ +7: + b 6b + orcc %o1, 0, %g0 diff --git a/arch/sparc/lib/memset.c b/arch/sparc/lib/memset.c new file mode 100644 index 000000000..1e81dff49 --- /dev/null +++ b/arch/sparc/lib/memset.c @@ -0,0 +1,71 @@ +/* linux/arch/sparc/lib/memset.c + * + * This is from GNU libc. 
+ */ + +#include <linux/types.h> + +#define op_t unsigned long int +#define OPSIZ (sizeof(op_t)) + +typedef unsigned char byte; + +void *memset(void *dstpp, char c, size_t len) +{ + long int dstp = (long int) dstpp; + + if (len >= 8) { + size_t xlen; + op_t cccc; + + cccc = (unsigned char) c; + cccc |= cccc << 8; + cccc |= cccc << 16; + + /* There are at least some bytes to set. + No need to test for LEN == 0 in this alignment loop. */ + while (dstp % OPSIZ != 0) { + ((byte *) dstp)[0] = c; + dstp += 1; + len -= 1; + } + + /* Write 8 `op_t' per iteration until less + * than 8 `op_t' remain. + */ + xlen = len / (OPSIZ * 8); + while (xlen > 0) { + ((op_t *) dstp)[0] = cccc; + ((op_t *) dstp)[1] = cccc; + ((op_t *) dstp)[2] = cccc; + ((op_t *) dstp)[3] = cccc; + ((op_t *) dstp)[4] = cccc; + ((op_t *) dstp)[5] = cccc; + ((op_t *) dstp)[6] = cccc; + ((op_t *) dstp)[7] = cccc; + dstp += 8 * OPSIZ; + xlen -= 1; + } + len %= OPSIZ * 8; + + /* Write 1 `op_t' per iteration until less than + * OPSIZ bytes remain. + */ + xlen = len / OPSIZ; + while (xlen > 0) { + ((op_t *) dstp)[0] = cccc; + dstp += OPSIZ; + xlen -= 1; + } + len %= OPSIZ; + } + + /* Write the last few bytes. */ + while (len > 0) { + ((byte *) dstp)[0] = c; + dstp += 1; + len -= 1; + } + + return dstpp; +} diff --git a/arch/sparc/lib/mul.S b/arch/sparc/lib/mul.S index e6d78f85f..83dffbc2f 100644 --- a/arch/sparc/lib/mul.S +++ b/arch/sparc/lib/mul.S @@ -1,4 +1,5 @@ -/* mul.S: This routine was taken from glibc-1.09 and is covered +/* $Id: mul.S,v 1.4 1996/09/30 02:22:32 davem Exp $ + * mul.S: This routine was taken from glibc-1.09 and is covered * by the GNU Library General Public License Version 2. */ @@ -19,7 +20,7 @@ mov %o0, %y ! multiplier -> Y andncc %o0, 0xfff, %g0 ! test bits 12..31 be Lmul_shortway ! if zero, can do it the short way - andcc %g0, %g0, %o4 ! zero the partial product and clear N and V + andcc %g0, %g0, %o4 ! zero the partial product and clear N and V /* * Long multiply. 32 steps, followed by a final shift step. @@ -65,23 +66,23 @@ #if 0 tst %o0 bge 1f - rd %y, %o0 + rd %y, %o0 ! %o0 was indeed negative; fix upper 32 bits of result by subtracting ! %o1 (i.e., return %o4 - %o1 in %o1). retl - sub %o4, %o1, %o1 + sub %o4, %o1, %o1 1: retl - mov %o4, %o1 + mov %o4, %o1 #else /* Faster code adapted from tege@sics.se's code for umul.S. */ sra %o0, 31, %o2 ! make mask from sign bit and %o1, %o2, %o2 ! %o2 = 0 or %o1, depending on sign of %o0 rd %y, %o0 ! get lower half of product retl - sub %o4, %o2, %o1 ! subtract compensation + sub %o4, %o2, %o1 ! subtract compensation ! and put upper half in place #endif @@ -124,4 +125,11 @@ Lmul_shortway: srl %o5, 20, %o5 ! shift low bits right 20, zero fill at left or %o5, %o0, %o0 ! construct low part of result retl - sra %o4, 20, %o1 ! ... and extract high part of result + sra %o4, 20, %o1 ! ... and extract high part of result + + .globl .mul_patch +.mul_patch: + smul %o0, %o1, %o0 + retl + rd %y, %o1 + nop diff --git a/arch/sparc/lib/rem.S b/arch/sparc/lib/rem.S index 3c0cc579b..44508148d 100644 --- a/arch/sparc/lib/rem.S +++ b/arch/sparc/lib/rem.S @@ -1,4 +1,5 @@ -/* rem.S: This routine was taken from glibc-1.09 and is covered +/* $Id: rem.S,v 1.7 1996/09/30 02:22:34 davem Exp $ + * rem.S: This routine was taken from glibc-1.09 and is covered * by the GNU Library General Public License Version 2. */ @@ -46,13 +47,14 @@ ! compute sign of result; if neither is negative, no problem orcc %o1, %o0, %g0 ! either negative? bge 2f ! no, go do the divide - xor %o1, %o0, %g6 ! 
compute sign in any case + mov %o0, %g2 ! compute sign in any case + tst %o1 bge 1f - tst %o0 + tst %o0 ! %o1 is definitely negative; %o0 might also be negative bge 2f ! if %o0 not negative... - sub %g0, %o1, %o1 ! in any case, make %o1 nonneg + sub %g0, %o1, %o1 ! in any case, make %o1 nonneg 1: ! %o0 is negative, %o1 is nonnegative sub %g0, %o0, %o0 ! make %o0 nonnegative 2: @@ -60,22 +62,24 @@ ! Ready to divide. Compute size of quotient; scale comparand. orcc %o1, %g0, %o5 bne 1f - mov %o0, %o3 + mov %o0, %o3 ! Divide by zero trap. If it returns, return 0 (about as ! wrong as possible, but that is what SunOS does...). ta ST_DIV0 retl - clr %o0 + clr %o0 1: cmp %o3, %o5 ! if %o1 exceeds %o0, done blu Lgot_result ! (and algorithm fails otherwise) - clr %o2 + clr %o2 + sethi %hi(1 << (32 - 4 - 1)), %g1 + cmp %o3, %g1 blu Lnot_really_big - clr %o4 + clr %o4 ! Here the dividend is >= 2**(31-N) or so. We must be careful here, ! as our usual N-at-a-shot divide step will cause overflow and havoc. @@ -85,15 +89,19 @@ 1: cmp %o5, %g1 bgeu 3f - mov 1, %g7 + mov 1, %g7 + sll %o5, 4, %o5 + b 1b - add %o4, 1, %o4 + add %o4, 1, %o4 ! Now compute %g7. - 2: addcc %o5, %o5, %o5 + 2: + addcc %o5, %o5, %o5 + bcc Lnot_too_big - add %g7, 1, %g7 + add %g7, 1, %g7 ! We get here if the %o1 overflowed while shifting. ! This means that %o3 has the high-order bit set. @@ -101,15 +109,18 @@ sll %g1, 4, %g1 ! high order bit srl %o5, 1, %o5 ! rest of %o5 add %o5, %g1, %o5 + b Ldo_single_div - sub %g7, 1, %g7 + sub %g7, 1, %g7 Lnot_too_big: - 3: cmp %o5, %o3 + 3: + cmp %o5, %o3 blu 2b - nop + nop + be Ldo_single_div - nop + nop /* NB: these are commented out in the V8-Sparc manual as well */ /* (I do not understand this) */ ! %o5 > %o3: went too far: back up 1 step @@ -126,19 +137,23 @@ Ldo_single_div: subcc %g7, 1, %g7 bl Lend_regular_divide - nop + nop + sub %o3, %o5, %o3 mov 1, %o2 + b Lend_single_divloop - nop + nop Lsingle_divloop: sll %o2, 1, %o2 + bl 1f - srl %o5, 1, %o5 + srl %o5, 1, %o5 ! %o3 >= 0 sub %o3, %o5, %o3 + b 2f - add %o2, 1, %o2 + add %o2, 1, %o2 1: ! %o3 < 0 add %o3, %o5, %o3 sub %o2, 1, %o2 @@ -146,7 +161,8 @@ Lend_single_divloop: subcc %g7, 1, %g7 bge Lsingle_divloop - tst %o3 + tst %o3 + b,a Lend_regular_divide Lnot_really_big: @@ -154,206 +170,213 @@ Lnot_really_big: sll %o5, 4, %o5 cmp %o5, %o3 bleu 1b - addcc %o4, 1, %o4 + addcc %o4, 1, %o4 be Lgot_result - sub %o4, 1, %o4 + sub %o4, 1, %o4 tst %o3 ! set up for initial iteration Ldivloop: sll %o2, 4, %o2 ! depth 1, accumulated bits 0 bl L.1.16 - srl %o5,1,%o5 + srl %o5,1,%o5 ! remainder is positive subcc %o3,%o5,%o3 ! depth 2, accumulated bits 1 bl L.2.17 - srl %o5,1,%o5 + srl %o5,1,%o5 ! remainder is positive subcc %o3,%o5,%o3 ! depth 3, accumulated bits 3 bl L.3.19 - srl %o5,1,%o5 + srl %o5,1,%o5 ! remainder is positive subcc %o3,%o5,%o3 ! depth 4, accumulated bits 7 bl L.4.23 - srl %o5,1,%o5 + srl %o5,1,%o5 ! remainder is positive subcc %o3,%o5,%o3 - b 9f - add %o2, (7*2+1), %o2 + + b 9f + add %o2, (7*2+1), %o2 L.4.23: ! remainder is negative addcc %o3,%o5,%o3 - b 9f - add %o2, (7*2-1), %o2 - + b 9f + add %o2, (7*2-1), %o2 L.3.19: ! remainder is negative addcc %o3,%o5,%o3 ! depth 4, accumulated bits 5 bl L.4.21 - srl %o5,1,%o5 + srl %o5,1,%o5 ! remainder is positive subcc %o3,%o5,%o3 - b 9f - add %o2, (5*2+1), %o2 + b 9f + add %o2, (5*2+1), %o2 L.4.21: ! remainder is negative addcc %o3,%o5,%o3 - b 9f - add %o2, (5*2-1), %o2 - - + b 9f + add %o2, (5*2-1), %o2 L.2.17: ! remainder is negative addcc %o3,%o5,%o3 ! 
depth 3, accumulated bits 1 bl L.3.17 - srl %o5,1,%o5 + srl %o5,1,%o5 ! remainder is positive subcc %o3,%o5,%o3 ! depth 4, accumulated bits 3 bl L.4.19 - srl %o5,1,%o5 + srl %o5,1,%o5 ! remainder is positive subcc %o3,%o5,%o3 - b 9f - add %o2, (3*2+1), %o2 - + b 9f + add %o2, (3*2+1), %o2 + L.4.19: ! remainder is negative addcc %o3,%o5,%o3 - b 9f - add %o2, (3*2-1), %o2 - - + b 9f + add %o2, (3*2-1), %o2 + L.3.17: ! remainder is negative addcc %o3,%o5,%o3 ! depth 4, accumulated bits 1 bl L.4.17 - srl %o5,1,%o5 + srl %o5,1,%o5 ! remainder is positive subcc %o3,%o5,%o3 - b 9f - add %o2, (1*2+1), %o2 - + b 9f + add %o2, (1*2+1), %o2 + L.4.17: ! remainder is negative addcc %o3,%o5,%o3 - b 9f - add %o2, (1*2-1), %o2 - - - - + b 9f + add %o2, (1*2-1), %o2 + L.1.16: ! remainder is negative addcc %o3,%o5,%o3 ! depth 2, accumulated bits -1 bl L.2.15 - srl %o5,1,%o5 + srl %o5,1,%o5 ! remainder is positive subcc %o3,%o5,%o3 ! depth 3, accumulated bits -1 bl L.3.15 - srl %o5,1,%o5 + srl %o5,1,%o5 ! remainder is positive subcc %o3,%o5,%o3 ! depth 4, accumulated bits -1 bl L.4.15 - srl %o5,1,%o5 + srl %o5,1,%o5 ! remainder is positive subcc %o3,%o5,%o3 - b 9f - add %o2, (-1*2+1), %o2 - + b 9f + add %o2, (-1*2+1), %o2 + L.4.15: ! remainder is negative addcc %o3,%o5,%o3 - b 9f - add %o2, (-1*2-1), %o2 - - + b 9f + add %o2, (-1*2-1), %o2 + L.3.15: ! remainder is negative addcc %o3,%o5,%o3 ! depth 4, accumulated bits -3 bl L.4.13 - srl %o5,1,%o5 + srl %o5,1,%o5 ! remainder is positive subcc %o3,%o5,%o3 - b 9f - add %o2, (-3*2+1), %o2 - + b 9f + add %o2, (-3*2+1), %o2 + L.4.13: ! remainder is negative addcc %o3,%o5,%o3 - b 9f - add %o2, (-3*2-1), %o2 - - - + b 9f + add %o2, (-3*2-1), %o2 + L.2.15: ! remainder is negative addcc %o3,%o5,%o3 ! depth 3, accumulated bits -3 bl L.3.13 - srl %o5,1,%o5 + srl %o5,1,%o5 ! remainder is positive subcc %o3,%o5,%o3 ! depth 4, accumulated bits -5 bl L.4.11 - srl %o5,1,%o5 + srl %o5,1,%o5 ! remainder is positive subcc %o3,%o5,%o3 - b 9f - add %o2, (-5*2+1), %o2 - + b 9f + add %o2, (-5*2+1), %o2 + L.4.11: ! remainder is negative addcc %o3,%o5,%o3 - b 9f - add %o2, (-5*2-1), %o2 - - + b 9f + add %o2, (-5*2-1), %o2 + + L.3.13: ! remainder is negative addcc %o3,%o5,%o3 ! depth 4, accumulated bits -7 bl L.4.9 - srl %o5,1,%o5 + srl %o5,1,%o5 ! remainder is positive subcc %o3,%o5,%o3 - b 9f - add %o2, (-7*2+1), %o2 - + b 9f + add %o2, (-7*2+1), %o2 + L.4.9: ! remainder is negative addcc %o3,%o5,%o3 - b 9f - add %o2, (-7*2-1), %o2 - - - - + b 9f + add %o2, (-7*2-1), %o2 + 9: Lend_regular_divide: subcc %o4, 1, %o4 bge Ldivloop - tst %o3 + tst %o3 + bl,a Lgot_result ! non-restoring fixup here (one instruction only!) add %o3, %o1, %o3 - Lgot_result: + ! check to see if answer should be < 0 + tst %g2 + bl,a 1f + sub %g0, %o3, %o3 +1: + retl + mov %o3, %o0 + .globl .rem_patch +.rem_patch: + sra %o0, 0x1f, %o4 + wr %o4, 0x0, %y + nop + nop + nop + sdivcc %o0, %o1, %o2 + bvs,a 1f + xnor %o2, %g0, %o2 +1: smul %o2, %o1, %o2 retl - mov %o3, %o0 + sub %o0, %o2, %o0 + nop diff --git a/arch/sparc/lib/sdiv.S b/arch/sparc/lib/sdiv.S index 2fa7a9794..e0ad80b6f 100644 --- a/arch/sparc/lib/sdiv.S +++ b/arch/sparc/lib/sdiv.S @@ -1,4 +1,5 @@ -/* sdiv.S: This routine was taken from glibc-1.09 and is covered +/* $Id: sdiv.S,v 1.6 1996/10/02 17:37:00 davem Exp $ + * sdiv.S: This routine was taken from glibc-1.09 and is covered * by the GNU Library General Public License Version 2. */ @@ -46,13 +47,14 @@ ! compute sign of result; if neither is negative, no problem orcc %o1, %o0, %g0 ! either negative? 
bge 2f ! no, go do the divide - xor %o1, %o0, %g6 ! compute sign in any case + xor %o1, %o0, %g2 ! compute sign in any case + tst %o1 bge 1f - tst %o0 + tst %o0 ! %o1 is definitely negative; %o0 might also be negative bge 2f ! if %o0 not negative... - sub %g0, %o1, %o1 ! in any case, make %o1 nonneg + sub %g0, %o1, %o1 ! in any case, make %o1 nonneg 1: ! %o0 is negative, %o1 is nonnegative sub %g0, %o0, %o0 ! make %o0 nonnegative 2: @@ -60,22 +62,24 @@ ! Ready to divide. Compute size of quotient; scale comparand. orcc %o1, %g0, %o5 bne 1f - mov %o0, %o3 + mov %o0, %o3 ! Divide by zero trap. If it returns, return 0 (about as ! wrong as possible, but that is what SunOS does...). ta ST_DIV0 retl - clr %o0 + clr %o0 1: cmp %o3, %o5 ! if %o1 exceeds %o0, done blu Lgot_result ! (and algorithm fails otherwise) - clr %o2 + clr %o2 + sethi %hi(1 << (32 - 4 - 1)), %g1 + cmp %o3, %g1 blu Lnot_really_big - clr %o4 + clr %o4 ! Here the dividend is >= 2**(31-N) or so. We must be careful here, ! as our usual N-at-a-shot divide step will cause overflow and havoc. @@ -85,15 +89,18 @@ 1: cmp %o5, %g1 bgeu 3f - mov 1, %g7 + mov 1, %g7 + sll %o5, 4, %o5 + b 1b - add %o4, 1, %o4 + add %o4, 1, %o4 ! Now compute %g7. - 2: addcc %o5, %o5, %o5 + 2: + addcc %o5, %o5, %o5 bcc Lnot_too_big - add %g7, 1, %g7 + add %g7, 1, %g7 ! We get here if the %o1 overflowed while shifting. ! This means that %o3 has the high-order bit set. @@ -101,15 +108,18 @@ sll %g1, 4, %g1 ! high order bit srl %o5, 1, %o5 ! rest of %o5 add %o5, %g1, %o5 + b Ldo_single_div - sub %g7, 1, %g7 + sub %g7, 1, %g7 Lnot_too_big: - 3: cmp %o5, %o3 + 3: + cmp %o5, %o3 blu 2b - nop + nop + be Ldo_single_div - nop + nop /* NB: these are commented out in the V8-Sparc manual as well */ /* (I do not understand this) */ ! %o5 > %o3: went too far: back up 1 step @@ -126,19 +136,23 @@ Ldo_single_div: subcc %g7, 1, %g7 bl Lend_regular_divide - nop + nop + sub %o3, %o5, %o3 mov 1, %o2 + b Lend_single_divloop - nop + nop Lsingle_divloop: sll %o2, 1, %o2 + bl 1f - srl %o5, 1, %o5 + srl %o5, 1, %o5 ! %o3 >= 0 sub %o3, %o5, %o3 + b 2f - add %o2, 1, %o2 + add %o2, 1, %o2 1: ! %o3 < 0 add %o3, %o5, %o3 sub %o2, 1, %o2 @@ -146,7 +160,8 @@ Lend_single_divloop: subcc %g7, 1, %g7 bge Lsingle_divloop - tst %o3 + tst %o3 + b,a Lend_regular_divide Lnot_really_big: @@ -154,83 +169,81 @@ Lnot_really_big: sll %o5, 4, %o5 cmp %o5, %o3 bleu 1b - addcc %o4, 1, %o4 + addcc %o4, 1, %o4 + be Lgot_result - sub %o4, 1, %o4 + sub %o4, 1, %o4 tst %o3 ! set up for initial iteration Ldivloop: sll %o2, 4, %o2 ! depth 1, accumulated bits 0 bl L.1.16 - srl %o5,1,%o5 + srl %o5,1,%o5 ! remainder is positive subcc %o3,%o5,%o3 ! depth 2, accumulated bits 1 bl L.2.17 - srl %o5,1,%o5 + srl %o5,1,%o5 ! remainder is positive subcc %o3,%o5,%o3 ! depth 3, accumulated bits 3 bl L.3.19 - srl %o5,1,%o5 + srl %o5,1,%o5 ! remainder is positive subcc %o3,%o5,%o3 ! depth 4, accumulated bits 7 bl L.4.23 - srl %o5,1,%o5 + srl %o5,1,%o5 ! remainder is positive subcc %o3,%o5,%o3 - b 9f - add %o2, (7*2+1), %o2 - + b 9f + add %o2, (7*2+1), %o2 + L.4.23: ! remainder is negative addcc %o3,%o5,%o3 - b 9f - add %o2, (7*2-1), %o2 - - + b 9f + add %o2, (7*2-1), %o2 + L.3.19: ! remainder is negative addcc %o3,%o5,%o3 ! depth 4, accumulated bits 5 bl L.4.21 - srl %o5,1,%o5 + srl %o5,1,%o5 ! remainder is positive subcc %o3,%o5,%o3 - b 9f - add %o2, (5*2+1), %o2 - + b 9f + add %o2, (5*2+1), %o2 + L.4.21: ! remainder is negative addcc %o3,%o5,%o3 - b 9f - add %o2, (5*2-1), %o2 - - - + b 9f + add %o2, (5*2-1), %o2 + L.2.17: ! 
remainder is negative addcc %o3,%o5,%o3 ! depth 3, accumulated bits 1 bl L.3.17 - srl %o5,1,%o5 + srl %o5,1,%o5 ! remainder is positive subcc %o3,%o5,%o3 ! depth 4, accumulated bits 3 bl L.4.19 - srl %o5,1,%o5 + srl %o5,1,%o5 ! remainder is positive subcc %o3,%o5,%o3 - b 9f - add %o2, (3*2+1), %o2 - + b 9f + add %o2, (3*2+1), %o2 + L.4.19: ! remainder is negative addcc %o3,%o5,%o3 - b 9f - add %o2, (3*2-1), %o2 + b 9f + add %o2, (3*2-1), %o2 L.3.17: @@ -238,126 +251,129 @@ L.3.17: addcc %o3,%o5,%o3 ! depth 4, accumulated bits 1 bl L.4.17 - srl %o5,1,%o5 + srl %o5,1,%o5 ! remainder is positive subcc %o3,%o5,%o3 - b 9f - add %o2, (1*2+1), %o2 - + b 9f + add %o2, (1*2+1), %o2 + L.4.17: ! remainder is negative addcc %o3,%o5,%o3 - b 9f - add %o2, (1*2-1), %o2 - - - - + b 9f + add %o2, (1*2-1), %o2 + L.1.16: ! remainder is negative addcc %o3,%o5,%o3 ! depth 2, accumulated bits -1 bl L.2.15 - srl %o5,1,%o5 + srl %o5,1,%o5 ! remainder is positive subcc %o3,%o5,%o3 ! depth 3, accumulated bits -1 bl L.3.15 - srl %o5,1,%o5 + srl %o5,1,%o5 ! remainder is positive subcc %o3,%o5,%o3 ! depth 4, accumulated bits -1 bl L.4.15 - srl %o5,1,%o5 + srl %o5,1,%o5 ! remainder is positive subcc %o3,%o5,%o3 - b 9f - add %o2, (-1*2+1), %o2 - + b 9f + add %o2, (-1*2+1), %o2 + L.4.15: ! remainder is negative addcc %o3,%o5,%o3 - b 9f - add %o2, (-1*2-1), %o2 - - + b 9f + add %o2, (-1*2-1), %o2 + L.3.15: ! remainder is negative addcc %o3,%o5,%o3 ! depth 4, accumulated bits -3 bl L.4.13 - srl %o5,1,%o5 + srl %o5,1,%o5 ! remainder is positive subcc %o3,%o5,%o3 - b 9f - add %o2, (-3*2+1), %o2 - + b 9f + add %o2, (-3*2+1), %o2 + L.4.13: ! remainder is negative addcc %o3,%o5,%o3 - b 9f - add %o2, (-3*2-1), %o2 - - - + b 9f + add %o2, (-3*2-1), %o2 + L.2.15: ! remainder is negative addcc %o3,%o5,%o3 ! depth 3, accumulated bits -3 bl L.3.13 - srl %o5,1,%o5 + srl %o5,1,%o5 ! remainder is positive subcc %o3,%o5,%o3 ! depth 4, accumulated bits -5 bl L.4.11 - srl %o5,1,%o5 + srl %o5,1,%o5 ! remainder is positive subcc %o3,%o5,%o3 - b 9f - add %o2, (-5*2+1), %o2 - + b 9f + add %o2, (-5*2+1), %o2 + L.4.11: ! remainder is negative addcc %o3,%o5,%o3 - b 9f - add %o2, (-5*2-1), %o2 - - + b 9f + add %o2, (-5*2-1), %o2 + L.3.13: ! remainder is negative addcc %o3,%o5,%o3 ! depth 4, accumulated bits -7 bl L.4.9 - srl %o5,1,%o5 + srl %o5,1,%o5 ! remainder is positive subcc %o3,%o5,%o3 - b 9f - add %o2, (-7*2+1), %o2 - + b 9f + add %o2, (-7*2+1), %o2 + L.4.9: ! remainder is negative addcc %o3,%o5,%o3 - b 9f - add %o2, (-7*2-1), %o2 - - - - + b 9f + add %o2, (-7*2-1), %o2 + 9: Lend_regular_divide: subcc %o4, 1, %o4 bge Ldivloop - tst %o3 + tst %o3 + bl,a Lgot_result ! non-restoring fixup here (one instruction only!) sub %o2, 1, %o2 - Lgot_result: ! check to see if answer should be < 0 - tst %g6 + tst %g2 bl,a 1f - sub %g0, %o2, %o2 + sub %g0, %o2, %o2 1: retl - mov %o2, %o0 + mov %o2, %o0 + + .globl .div_patch +.div_patch: + sra %o0, 0x1f, %o2 + wr %o2, 0x0, %y + nop + nop + nop + sdivcc %o0, %o1, %o0 + bvs,a 1f + xnor %o0, %g0, %o0 +1: retl + nop diff --git a/arch/sparc/lib/strlen.S b/arch/sparc/lib/strlen.S new file mode 100644 index 000000000..95321d4c5 --- /dev/null +++ b/arch/sparc/lib/strlen.S @@ -0,0 +1,88 @@ +/* strlen.S: Sparc optimized strlen(). + * + * This was hand optimized by davem@caip.rutgers.edu from + * the C-code in GNU-libc. + */ + +#include <asm/cprefix.h> + +#define LO_MAGIC 0x01010101 +#define HI_MAGIC 0x80808080 + + .align 4 + .global C_LABEL(strlen) +C_LABEL(strlen): + mov %o0, %o1 + andcc %o0, 3, %g0 ! 
and with %o0 so no dependency problems + be scan_words + sethi %hi(HI_MAGIC), %g2 ! common case and most Sparcs predict taken + + ldsb [%o0], %g2 +still_not_word_aligned: + cmp %g2, 0 + bne,a 1f + add %o0, 1, %o0 + + /* Ok, so there are tons of quick interlocks above for the + * < 4 length string unaligned... not too common so I'm not + * very concerned. + */ + retl + sub %o0, %o1, %o0 + +1: + andcc %o0, 3, %g0 + bne,a still_not_word_aligned + ldsb [%o0], %g2 + + /* HyperSparc executes each sethi/or pair in 1 cycle. */ + sethi %hi(HI_MAGIC), %g2 +scan_words: + or %g2, %lo(HI_MAGIC), %o3 + sethi %hi(LO_MAGIC), %g3 + or %g3, %lo(LO_MAGIC), %o2 +next_word: + ld [%o0], %g2 ! no dependencies +next_word_preloaded: + sub %g2, %o2, %g2 ! lots of locks here + andcc %g2, %o3, %g0 ! and I dont like it... + be next_word + add %o0, 4, %o0 + + /* Check every byte. */ +byte_zero: + ldsb [%o0 - 0x4], %g2 + cmp %g2, 0 + bne byte_one + add %o0, -4, %g3 + + retl + sub %g3, %o1, %o0 + +byte_one: + ldsb [%o0 - 0x3], %g2 + cmp %g2, 0 + bne,a byte_two_and_three + ldsb [%o0 - 0x2], %g2 + + sub %g3, %o1, %o0 + retl + add %o0, 1, %o0 + +byte_two_and_three: + cmp %g2, 0 + be,a found_it + sub %g3, %o1, %o0 + + ldsb [%o0 - 0x1], %g2 + cmp %g2, 0 + bne,a next_word_preloaded + ld [%o0], %g2 + + sub %g3, %o1, %o0 + retl + add %o0, 3, %o0 + +found_it: + retl + add %o0, 2, %o0 diff --git a/arch/sparc/lib/strncmp.S b/arch/sparc/lib/strncmp.S new file mode 100644 index 000000000..2f26b1b4a --- /dev/null +++ b/arch/sparc/lib/strncmp.S @@ -0,0 +1,120 @@ +/* $Id: strncmp.S,v 1.2 1996/09/09 02:47:20 davem Exp $ + * strncmp.S: Hand optimized Sparc assembly of GCC output from GNU libc + * generic strncmp routine. + */ + +#include <asm/cprefix.h> + + .text + .align 4 + .global C_LABEL(__strncmp), C_LABEL(strncmp) +C_LABEL(__strncmp): +C_LABEL(strncmp): + mov %o0, %g3 + mov 0, %o3 + + cmp %o2, 3 + ble 7f + mov 0, %g2 + + sra %o2, 2, %o4 + ldub [%g3], %o3 + +0: + ldub [%o1], %g2 + add %g3, 1, %g3 + and %o3, 0xff, %o0 + + cmp %o0, 0 + be 8f + add %o1, 1, %o1 + + cmp %o0, %g2 + be,a 1f + ldub [%g3], %o3 + + retl + sub %o0, %g2, %o0 + +1: + ldub [%o1], %g2 + add %g3,1, %g3 + and %o3, 0xff, %o0 + + cmp %o0, 0 + be 8f + add %o1, 1, %o1 + + cmp %o0, %g2 + be,a 1f + ldub [%g3], %o3 + + retl + sub %o0, %g2, %o0 + +1: + ldub [%o1], %g2 + add %g3, 1, %g3 + and %o3, 0xff, %o0 + + cmp %o0, 0 + be 8f + add %o1, 1, %o1 + + cmp %o0, %g2 + be,a 1f + ldub [%g3], %o3 + + retl + sub %o0, %g2, %o0 + +1: + ldub [%o1], %g2 + add %g3, 1, %g3 + and %o3, 0xff, %o0 + + cmp %o0, 0 + be 8f + add %o1, 1, %o1 + + cmp %o0, %g2 + be 1f + add %o4, -1, %o4 + + retl + sub %o0, %g2, %o0 + +1: + + cmp %o4, 0 + bg,a 0b + ldub [%g3], %o3 + + b 7f + and %o2, 3, %o2 + +9: + ldub [%o1], %g2 + add %g3, 1, %g3 + and %o3, 0xff, %o0 + + cmp %o0, 0 + be 8f + add %o1, 1, %o1 + + cmp %o0, %g2 + be 7f + add %o2, -1, %o2 + +8: + retl + sub %o0, %g2, %o0 + +7: + cmp %o2, 0 + bg,a 9b + ldub [%g3], %o3 + + and %g2, 0xff, %o0 + retl + sub %o3, %o0, %o0 diff --git a/arch/sparc/lib/strncpy_from_user.S b/arch/sparc/lib/strncpy_from_user.S new file mode 100644 index 000000000..3dd2bd71c --- /dev/null +++ b/arch/sparc/lib/strncpy_from_user.S @@ -0,0 +1,49 @@ +/* strncpy_from_user.S: Sparc strncpy from userspace. + * + * Copyright(C) 1996 David S. 
Miller + */ + +#include <asm/cprefix.h> +#include <asm/ptrace.h> + + .text + .align 4 + + /* Must return: + * + * -EFAULT for an exception + * count if we hit the buffer limit + * bytes copied if we hit a null byte + */ + + .globl C_LABEL(__strncpy_from_user) +C_LABEL(__strncpy_from_user): + /* %o0=dest, %o1=src, %o2=count */ + ld [%g6 + THREAD_EX_COUNT], %g1 + set strncpy_user_failure, %g2 + add %g1, 1, %g3 + st %o7, [%g6 + THREAD_EX_PC] + st %g3, [%g6 + THREAD_EX_COUNT] + st %g2, [%g6 + THREAD_EX_EXPC] + + mov %o2, %o3 +1: + subcc %o2, 1, %o2 + bneg 2f + nop + + ldub [%o1], %o4 + add %o0, 1, %o0 + cmp %o4, 0 + add %o1, 1, %o1 + bne 1b + stb %o4, [%o0 - 1] +2: + add %o2, 1, %o0 + st %g1, [%g6 + THREAD_EX_COUNT] + retl + sub %o3, %o0, %o0 + +strncpy_user_failure: + jmpl %g3 + 0x8, %g0 + mov %g1, %o0 diff --git a/arch/sparc/lib/udiv.S b/arch/sparc/lib/udiv.S index 53cfeac90..2abfc6b0f 100644 --- a/arch/sparc/lib/udiv.S +++ b/arch/sparc/lib/udiv.S @@ -1,4 +1,5 @@ -/* udiv.S: This routine was taken from glibc-1.09 and is covered +/* $Id: udiv.S,v 1.4 1996/09/30 02:22:38 davem Exp $ + * udiv.S: This routine was taken from glibc-1.09 and is covered * by the GNU Library General Public License Version 2. */ @@ -47,22 +48,24 @@ ! Ready to divide. Compute size of quotient; scale comparand. orcc %o1, %g0, %o5 bne 1f - mov %o0, %o3 + mov %o0, %o3 ! Divide by zero trap. If it returns, return 0 (about as ! wrong as possible, but that is what SunOS does...). ta ST_DIV0 retl - clr %o0 + clr %o0 1: cmp %o3, %o5 ! if %o1 exceeds %o0, done blu Lgot_result ! (and algorithm fails otherwise) - clr %o2 + clr %o2 + sethi %hi(1 << (32 - 4 - 1)), %g1 + cmp %o3, %g1 blu Lnot_really_big - clr %o4 + clr %o4 ! Here the dividend is >= 2**(31-N) or so. We must be careful here, ! as our usual N-at-a-shot divide step will cause overflow and havoc. @@ -72,15 +75,18 @@ 1: cmp %o5, %g1 bgeu 3f - mov 1, %g7 + mov 1, %g7 + sll %o5, 4, %o5 + b 1b - add %o4, 1, %o4 + add %o4, 1, %o4 ! Now compute %g7. - 2: addcc %o5, %o5, %o5 + 2: + addcc %o5, %o5, %o5 bcc Lnot_too_big - add %g7, 1, %g7 + add %g7, 1, %g7 ! We get here if the %o1 overflowed while shifting. ! This means that %o3 has the high-order bit set. @@ -88,15 +94,18 @@ sll %g1, 4, %g1 ! high order bit srl %o5, 1, %o5 ! rest of %o5 add %o5, %g1, %o5 + b Ldo_single_div - sub %g7, 1, %g7 + sub %g7, 1, %g7 Lnot_too_big: - 3: cmp %o5, %o3 + 3: + cmp %o5, %o3 blu 2b - nop + nop + be Ldo_single_div - nop + nop /* NB: these are commented out in the V8-Sparc manual as well */ /* (I do not understand this) */ ! %o5 > %o3: went too far: back up 1 step @@ -113,19 +122,21 @@ Ldo_single_div: subcc %g7, 1, %g7 bl Lend_regular_divide - nop + nop + sub %o3, %o5, %o3 mov 1, %o2 + b Lend_single_divloop - nop + nop Lsingle_divloop: sll %o2, 1, %o2 bl 1f - srl %o5, 1, %o5 + srl %o5, 1, %o5 ! %o3 >= 0 sub %o3, %o5, %o3 b 2f - add %o2, 1, %o2 + add %o2, 1, %o2 1: ! %o3 < 0 add %o3, %o5, %o3 sub %o2, 1, %o2 @@ -133,214 +144,212 @@ Lend_single_divloop: subcc %g7, 1, %g7 bge Lsingle_divloop - tst %o3 + tst %o3 + b,a Lend_regular_divide Lnot_really_big: 1: sll %o5, 4, %o5 + cmp %o5, %o3 bleu 1b - addcc %o4, 1, %o4 + addcc %o4, 1, %o4 + be Lgot_result - sub %o4, 1, %o4 + sub %o4, 1, %o4 tst %o3 ! set up for initial iteration Ldivloop: sll %o2, 4, %o2 ! depth 1, accumulated bits 0 bl L.1.16 - srl %o5,1,%o5 + srl %o5,1,%o5 ! remainder is positive subcc %o3,%o5,%o3 ! depth 2, accumulated bits 1 bl L.2.17 - srl %o5,1,%o5 + srl %o5,1,%o5 ! remainder is positive subcc %o3,%o5,%o3 ! 
depth 3, accumulated bits 3 bl L.3.19 - srl %o5,1,%o5 + srl %o5,1,%o5 ! remainder is positive subcc %o3,%o5,%o3 ! depth 4, accumulated bits 7 bl L.4.23 - srl %o5,1,%o5 + srl %o5,1,%o5 ! remainder is positive subcc %o3,%o5,%o3 - b 9f - add %o2, (7*2+1), %o2 - + b 9f + add %o2, (7*2+1), %o2 + L.4.23: ! remainder is negative addcc %o3,%o5,%o3 - b 9f - add %o2, (7*2-1), %o2 - - + b 9f + add %o2, (7*2-1), %o2 + L.3.19: ! remainder is negative addcc %o3,%o5,%o3 ! depth 4, accumulated bits 5 bl L.4.21 - srl %o5,1,%o5 + srl %o5,1,%o5 ! remainder is positive subcc %o3,%o5,%o3 - b 9f - add %o2, (5*2+1), %o2 - + b 9f + add %o2, (5*2+1), %o2 + L.4.21: ! remainder is negative addcc %o3,%o5,%o3 - b 9f - add %o2, (5*2-1), %o2 - - - + b 9f + add %o2, (5*2-1), %o2 + L.2.17: ! remainder is negative addcc %o3,%o5,%o3 ! depth 3, accumulated bits 1 bl L.3.17 - srl %o5,1,%o5 + srl %o5,1,%o5 ! remainder is positive subcc %o3,%o5,%o3 ! depth 4, accumulated bits 3 bl L.4.19 - srl %o5,1,%o5 + srl %o5,1,%o5 ! remainder is positive subcc %o3,%o5,%o3 - b 9f - add %o2, (3*2+1), %o2 - + b 9f + add %o2, (3*2+1), %o2 + L.4.19: ! remainder is negative addcc %o3,%o5,%o3 - b 9f - add %o2, (3*2-1), %o2 - - + b 9f + add %o2, (3*2-1), %o2 + L.3.17: ! remainder is negative addcc %o3,%o5,%o3 ! depth 4, accumulated bits 1 bl L.4.17 - srl %o5,1,%o5 + srl %o5,1,%o5 ! remainder is positive subcc %o3,%o5,%o3 - b 9f - add %o2, (1*2+1), %o2 - + b 9f + add %o2, (1*2+1), %o2 + L.4.17: ! remainder is negative addcc %o3,%o5,%o3 - b 9f - add %o2, (1*2-1), %o2 - - - - + b 9f + add %o2, (1*2-1), %o2 + L.1.16: ! remainder is negative addcc %o3,%o5,%o3 ! depth 2, accumulated bits -1 bl L.2.15 - srl %o5,1,%o5 + srl %o5,1,%o5 ! remainder is positive subcc %o3,%o5,%o3 ! depth 3, accumulated bits -1 bl L.3.15 - srl %o5,1,%o5 + srl %o5,1,%o5 ! remainder is positive subcc %o3,%o5,%o3 ! depth 4, accumulated bits -1 bl L.4.15 - srl %o5,1,%o5 + srl %o5,1,%o5 ! remainder is positive subcc %o3,%o5,%o3 - b 9f - add %o2, (-1*2+1), %o2 - + b 9f + add %o2, (-1*2+1), %o2 + L.4.15: ! remainder is negative addcc %o3,%o5,%o3 - b 9f - add %o2, (-1*2-1), %o2 - - + b 9f + add %o2, (-1*2-1), %o2 + L.3.15: ! remainder is negative addcc %o3,%o5,%o3 ! depth 4, accumulated bits -3 bl L.4.13 - srl %o5,1,%o5 + srl %o5,1,%o5 ! remainder is positive subcc %o3,%o5,%o3 - b 9f - add %o2, (-3*2+1), %o2 - + b 9f + add %o2, (-3*2+1), %o2 + L.4.13: ! remainder is negative addcc %o3,%o5,%o3 - b 9f - add %o2, (-3*2-1), %o2 - - - + b 9f + add %o2, (-3*2-1), %o2 + L.2.15: ! remainder is negative addcc %o3,%o5,%o3 ! depth 3, accumulated bits -3 bl L.3.13 - srl %o5,1,%o5 + srl %o5,1,%o5 ! remainder is positive subcc %o3,%o5,%o3 ! depth 4, accumulated bits -5 bl L.4.11 - srl %o5,1,%o5 + srl %o5,1,%o5 ! remainder is positive subcc %o3,%o5,%o3 - b 9f - add %o2, (-5*2+1), %o2 - + b 9f + add %o2, (-5*2+1), %o2 + L.4.11: ! remainder is negative addcc %o3,%o5,%o3 - b 9f - add %o2, (-5*2-1), %o2 - - + b 9f + add %o2, (-5*2-1), %o2 + L.3.13: ! remainder is negative addcc %o3,%o5,%o3 ! depth 4, accumulated bits -7 bl L.4.9 - srl %o5,1,%o5 + srl %o5,1,%o5 ! remainder is positive subcc %o3,%o5,%o3 - b 9f - add %o2, (-7*2+1), %o2 - + b 9f + add %o2, (-7*2+1), %o2 + L.4.9: ! remainder is negative addcc %o3,%o5,%o3 - b 9f - add %o2, (-7*2-1), %o2 - - - - + b 9f + add %o2, (-7*2-1), %o2 + 9: Lend_regular_divide: subcc %o4, 1, %o4 bge Ldivloop - tst %o3 + tst %o3 + bl,a Lgot_result ! non-restoring fixup here (one instruction only!) 
sub %o2, 1, %o2 - Lgot_result: retl - mov %o2, %o0 + mov %o2, %o0 + + .globl .udiv_patch +.udiv_patch: + wr %g0, 0x0, %y + nop + nop + retl + udiv %o0, %o1, %o0 + nop diff --git a/arch/sparc/lib/umul.S b/arch/sparc/lib/umul.S index 24f7c3cda..a784720a8 100644 --- a/arch/sparc/lib/umul.S +++ b/arch/sparc/lib/umul.S @@ -1,4 +1,5 @@ -/* umul.S: This routine was taken from glibc-1.09 and is covered +/* $Id: umul.S,v 1.4 1996/09/30 02:22:39 davem Exp $ + * umul.S: This routine was taken from glibc-1.09 and is covered * by the GNU Library General Public License Version 2. */ @@ -23,9 +24,10 @@ .umul: or %o0, %o1, %o4 mov %o0, %y ! multiplier -> Y + andncc %o4, 0xfff, %g0 ! test bits 12..31 of *both* args be Lmul_shortway ! if zero, can do it the short way - andcc %g0, %g0, %o4 ! zero the partial product and clear N and V + andcc %g0, %g0, %o4 ! zero the partial product and clear N and V /* * Long multiply. 32 steps, followed by a final shift step. @@ -102,17 +104,19 @@ #if 0 tst %o1 bl,a 1f ! if %o1 < 0 (high order bit = 1), - add %o4, %o0, %o4 ! %o4 += %o0 (add y to upper half) -1: rd %y, %o0 ! get lower half of product + add %o4, %o0, %o4 ! %o4 += %o0 (add y to upper half) + +1: + rd %y, %o0 ! get lower half of product retl - addcc %o4, %g0, %o1 ! put upper half in place and set Z for %o1==0 + addcc %o4, %g0, %o1 ! put upper half in place and set Z for %o1==0 #else /* Faster code from tege@sics.se. */ sra %o1, 31, %o2 ! make mask from sign bit and %o0, %o2, %o2 ! %o2 = 0 or %o0, depending on sign of %o1 rd %y, %o0 ! get lower half of product retl - addcc %o4, %o2, %o1 ! add compensation and put upper half in place + addcc %o4, %o2, %o1 ! add compensation and put upper half in place #endif Lmul_shortway: @@ -155,4 +159,11 @@ Lmul_shortway: srl %o5, 20, %o5 ! shift low bits right 20 or %o5, %o0, %o0 retl - addcc %g0, %g0, %o1 ! %o1 = zero, and set Z + addcc %g0, %g0, %o1 ! %o1 = zero, and set Z + + .globl .umul_patch +.umul_patch: + umul %o0, %o1, %o0 + retl + rd %y, %o1 + nop diff --git a/arch/sparc/lib/urem.S b/arch/sparc/lib/urem.S index c84aa81e5..ec7f0c502 100644 --- a/arch/sparc/lib/urem.S +++ b/arch/sparc/lib/urem.S @@ -1,4 +1,5 @@ -/* urem.S: This routine was taken from glibc-1.09 and is covered +/* $Id: urem.S,v 1.4 1996/09/30 02:22:42 davem Exp $ + * urem.S: This routine was taken from glibc-1.09 and is covered * by the GNU Library General Public License Version 2. */ @@ -45,22 +46,24 @@ ! Ready to divide. Compute size of quotient; scale comparand. orcc %o1, %g0, %o5 bne 1f - mov %o0, %o3 + mov %o0, %o3 ! Divide by zero trap. If it returns, return 0 (about as ! wrong as possible, but that is what SunOS does...). ta ST_DIV0 retl - clr %o0 + clr %o0 1: cmp %o3, %o5 ! if %o1 exceeds %o0, done blu Lgot_result ! (and algorithm fails otherwise) - clr %o2 + clr %o2 + sethi %hi(1 << (32 - 4 - 1)), %g1 + cmp %o3, %g1 blu Lnot_really_big - clr %o4 + clr %o4 ! Here the dividend is >= 2**(31-N) or so. We must be careful here, ! as our usual N-at-a-shot divide step will cause overflow and havoc. @@ -70,15 +73,18 @@ 1: cmp %o5, %g1 bgeu 3f - mov 1, %g7 + mov 1, %g7 + sll %o5, 4, %o5 + b 1b - add %o4, 1, %o4 + add %o4, 1, %o4 ! Now compute %g7. - 2: addcc %o5, %o5, %o5 + 2: + addcc %o5, %o5, %o5 bcc Lnot_too_big - add %g7, 1, %g7 + add %g7, 1, %g7 ! We get here if the %o1 overflowed while shifting. ! This means that %o3 has the high-order bit set. @@ -86,15 +92,18 @@ sll %g1, 4, %g1 ! high order bit srl %o5, 1, %o5 ! 
rest of %o5 add %o5, %g1, %o5 + b Ldo_single_div - sub %g7, 1, %g7 + sub %g7, 1, %g7 Lnot_too_big: - 3: cmp %o5, %o3 + 3: + cmp %o5, %o3 blu 2b - nop + nop + be Ldo_single_div - nop + nop /* NB: these are commented out in the V8-Sparc manual as well */ /* (I do not understand this) */ ! %o5 > %o3: went too far: back up 1 step @@ -111,19 +120,21 @@ Ldo_single_div: subcc %g7, 1, %g7 bl Lend_regular_divide - nop + nop + sub %o3, %o5, %o3 mov 1, %o2 + b Lend_single_divloop - nop + nop Lsingle_divloop: sll %o2, 1, %o2 bl 1f - srl %o5, 1, %o5 + srl %o5, 1, %o5 ! %o3 >= 0 sub %o3, %o5, %o3 b 2f - add %o2, 1, %o2 + add %o2, 1, %o2 1: ! %o3 < 0 add %o3, %o5, %o3 sub %o2, 1, %o2 @@ -131,214 +142,214 @@ Lend_single_divloop: subcc %g7, 1, %g7 bge Lsingle_divloop - tst %o3 + tst %o3 + b,a Lend_regular_divide Lnot_really_big: 1: sll %o5, 4, %o5 + cmp %o5, %o3 bleu 1b - addcc %o4, 1, %o4 + addcc %o4, 1, %o4 + be Lgot_result - sub %o4, 1, %o4 + sub %o4, 1, %o4 tst %o3 ! set up for initial iteration Ldivloop: sll %o2, 4, %o2 ! depth 1, accumulated bits 0 bl L.1.16 - srl %o5,1,%o5 + srl %o5,1,%o5 ! remainder is positive subcc %o3,%o5,%o3 ! depth 2, accumulated bits 1 bl L.2.17 - srl %o5,1,%o5 + srl %o5,1,%o5 ! remainder is positive subcc %o3,%o5,%o3 ! depth 3, accumulated bits 3 bl L.3.19 - srl %o5,1,%o5 + srl %o5,1,%o5 ! remainder is positive subcc %o3,%o5,%o3 ! depth 4, accumulated bits 7 bl L.4.23 - srl %o5,1,%o5 + srl %o5,1,%o5 ! remainder is positive subcc %o3,%o5,%o3 - b 9f - add %o2, (7*2+1), %o2 - + b 9f + add %o2, (7*2+1), %o2 + L.4.23: ! remainder is negative addcc %o3,%o5,%o3 - b 9f - add %o2, (7*2-1), %o2 - - + b 9f + add %o2, (7*2-1), %o2 + L.3.19: ! remainder is negative addcc %o3,%o5,%o3 ! depth 4, accumulated bits 5 bl L.4.21 - srl %o5,1,%o5 + srl %o5,1,%o5 ! remainder is positive subcc %o3,%o5,%o3 - b 9f - add %o2, (5*2+1), %o2 - + b 9f + add %o2, (5*2+1), %o2 + L.4.21: ! remainder is negative addcc %o3,%o5,%o3 - b 9f - add %o2, (5*2-1), %o2 - - - + b 9f + add %o2, (5*2-1), %o2 + L.2.17: ! remainder is negative addcc %o3,%o5,%o3 ! depth 3, accumulated bits 1 bl L.3.17 - srl %o5,1,%o5 + srl %o5,1,%o5 ! remainder is positive subcc %o3,%o5,%o3 ! depth 4, accumulated bits 3 bl L.4.19 - srl %o5,1,%o5 + srl %o5,1,%o5 ! remainder is positive subcc %o3,%o5,%o3 - b 9f - add %o2, (3*2+1), %o2 - + b 9f + add %o2, (3*2+1), %o2 + L.4.19: ! remainder is negative addcc %o3,%o5,%o3 - b 9f - add %o2, (3*2-1), %o2 - - + b 9f + add %o2, (3*2-1), %o2 + L.3.17: ! remainder is negative addcc %o3,%o5,%o3 ! depth 4, accumulated bits 1 bl L.4.17 - srl %o5,1,%o5 + srl %o5,1,%o5 ! remainder is positive subcc %o3,%o5,%o3 - b 9f - add %o2, (1*2+1), %o2 + b 9f + add %o2, (1*2+1), %o2 L.4.17: ! remainder is negative addcc %o3,%o5,%o3 - b 9f - add %o2, (1*2-1), %o2 - - - - + b 9f + add %o2, (1*2-1), %o2 + L.1.16: ! remainder is negative addcc %o3,%o5,%o3 ! depth 2, accumulated bits -1 bl L.2.15 - srl %o5,1,%o5 + srl %o5,1,%o5 ! remainder is positive subcc %o3,%o5,%o3 ! depth 3, accumulated bits -1 bl L.3.15 - srl %o5,1,%o5 + srl %o5,1,%o5 ! remainder is positive subcc %o3,%o5,%o3 ! depth 4, accumulated bits -1 bl L.4.15 - srl %o5,1,%o5 + srl %o5,1,%o5 ! remainder is positive subcc %o3,%o5,%o3 - b 9f - add %o2, (-1*2+1), %o2 - + b 9f + add %o2, (-1*2+1), %o2 + L.4.15: ! remainder is negative addcc %o3,%o5,%o3 - b 9f - add %o2, (-1*2-1), %o2 - - + b 9f + add %o2, (-1*2-1), %o2 + L.3.15: ! remainder is negative addcc %o3,%o5,%o3 ! depth 4, accumulated bits -3 bl L.4.13 - srl %o5,1,%o5 + srl %o5,1,%o5 ! 
remainder is positive subcc %o3,%o5,%o3 - b 9f - add %o2, (-3*2+1), %o2 - + b 9f + add %o2, (-3*2+1), %o2 + L.4.13: ! remainder is negative addcc %o3,%o5,%o3 - b 9f - add %o2, (-3*2-1), %o2 - - - + b 9f + add %o2, (-3*2-1), %o2 + L.2.15: ! remainder is negative addcc %o3,%o5,%o3 ! depth 3, accumulated bits -3 bl L.3.13 - srl %o5,1,%o5 + srl %o5,1,%o5 ! remainder is positive subcc %o3,%o5,%o3 ! depth 4, accumulated bits -5 bl L.4.11 - srl %o5,1,%o5 + srl %o5,1,%o5 ! remainder is positive subcc %o3,%o5,%o3 - b 9f - add %o2, (-5*2+1), %o2 + b 9f + add %o2, (-5*2+1), %o2 L.4.11: ! remainder is negative addcc %o3,%o5,%o3 - b 9f - add %o2, (-5*2-1), %o2 - - + b 9f + add %o2, (-5*2-1), %o2 + L.3.13: ! remainder is negative addcc %o3,%o5,%o3 ! depth 4, accumulated bits -7 bl L.4.9 - srl %o5,1,%o5 + srl %o5,1,%o5 ! remainder is positive subcc %o3,%o5,%o3 - b 9f - add %o2, (-7*2+1), %o2 - + b 9f + add %o2, (-7*2+1), %o2 + L.4.9: ! remainder is negative addcc %o3,%o5,%o3 - b 9f - add %o2, (-7*2-1), %o2 - - - - + b 9f + add %o2, (-7*2-1), %o2 + 9: Lend_regular_divide: subcc %o4, 1, %o4 bge Ldivloop - tst %o3 + tst %o3 + bl,a Lgot_result ! non-restoring fixup here (one instruction only!) add %o3, %o1, %o3 - Lgot_result: retl - mov %o3, %o0 + mov %o3, %o0 + + .globl .urem_patch +.urem_patch: + wr %g0, 0x0, %y + nop + nop + nop + udiv %o0, %o1, %o2 + umul %o2, %o1, %o2 + retl + sub %o0, %o2, %o0 |
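
Editorial note on the new strlen.S above: it scans a word at a time using the LO_MAGIC/LO and HI_MAGIC constants. The test ((w - 0x01010101) & 0x80808080) is zero only when no byte of w is zero, so whole words can be skipped; when the test fires, the routine re-checks the four bytes individually (byte_zero and friends) because bytes with the high bit set can trip the test spuriously. Below is a minimal C model of that probe, for illustration only; strlen_model is an invented name and this is not the kernel's code.

	#include <stddef.h>
	#include <stdint.h>
	#include <string.h>

	#define LO_MAGIC 0x01010101u
	#define HI_MAGIC 0x80808080u

	size_t strlen_model(const char *s)
	{
		const char *p = s;

		/* Byte-step until word aligned (still_not_word_aligned). */
		while ((uintptr_t)p & 3) {
			if (*p == '\0')
				return (size_t)(p - s);
			p++;
		}

		for (;;) {
			uint32_t w;

			/* Aligned 32-bit load; like the assembly, this may read
			 * a few bytes past the terminator, but it never leaves
			 * the aligned word, so it cannot cross a page boundary. */
			memcpy(&w, p, sizeof(w));

			if (((w - LO_MAGIC) & HI_MAGIC) == 0) {
				p += 4;		/* surely no zero byte here */
				continue;
			}
			/* Possible zero byte: verify each of the four bytes,
			 * since bytes >= 0x80 can also trip the word test. */
			for (int i = 0; i < 4; i++)
				if (p[i] == '\0')
					return (size_t)(p - s) + (size_t)i;
			p += 4;			/* false positive, keep scanning */
		}
	}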
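
Editorial note on udiv.S, urem.S and sdiv.S: the software divide is a non-restoring loop that retires four quotient bits per iteration (the L.x.y ladder), lets the partial remainder go negative, and repairs the result with a single instruction at Lend_regular_divide ("non-restoring fixup here (one instruction only!)"); the new .udiv_patch/.urem_patch/.div_patch stubs give the kernel something to patch in when the CPU has the V8 udiv/sdiv instructions. The following radix-2 C model of the same idea is only a sketch; nonrestoring_udiv is an invented name and the real code works in radix 16.

	#include <assert.h>
	#include <stdint.h>

	static uint32_t nonrestoring_udiv(uint32_t n, uint32_t d, uint32_t *rem)
	{
		int64_t r = n;		/* partial remainder, may go negative */
		int64_t q = 0;		/* quotient built from +1/-1 digits   */
		int shift = 0;

		assert(d != 0);		/* the assembly traps with ST_DIV0    */

		/* Scale the divisor just past the dividend (the assembly
		 * does this four bits at a time with sll %o5, 4, %o5). */
		while (((uint64_t)d << shift) <= n)
			shift++;

		for (int i = shift; i >= 0; i--) {
			if (r >= 0) {
				r -= (int64_t)d << i;	/* subcc %o3,%o5,%o3 */
				q = (q << 1) + 1;	/* accumulated bit +1 */
			} else {
				r += (int64_t)d << i;	/* addcc %o3,%o5,%o3 */
				q = (q << 1) - 1;	/* accumulated bit -1 */
			}
		}

		/* Final non-restoring fixup. */
		if (r < 0) {
			q -= 1;
			r += d;
		}
		if (rem)
			*rem = (uint32_t)r;
		return (uint32_t)q;
	}

As a quick sanity check, nonrestoring_udiv(1000, 7, &r) yields 142 with r == 6, matching 1000 = 142*7 + 6.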
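
Editorial note on the "faster code from tege@sics.se" in umul.S: the 32-step mulscc sequence effectively multiplies one operand as a signed 32-bit value, so when %o1 has its top bit set the raw 64-bit result is short by %o0 * 2^32; adding (%o1 >> 31) & %o0 to the upper word repairs this without a branch, which is exactly what the sra/and/addcc triple does. A small C check of that identity follows; umul_model is an invented name, not the kernel's interface, and an arithmetic right shift is assumed (as sra provides).

	#include <assert.h>
	#include <stdint.h>

	static uint64_t umul_model(uint32_t a, uint32_t b)
	{
		/* What the mulscc sequence yields: a times b taken as a
		 * signed 32-bit value, i.e. short by a * 2^32 whenever b
		 * has its top bit set. */
		uint64_t raw = (uint64_t)a * (uint64_t)(int64_t)(int32_t)b;

		/* sra %o1, 31, %o2 ; and %o0, %o2, %o2 ; add to upper half. */
		uint32_t mask = (uint32_t)((int32_t)b >> 31);	/* 0 or ~0 */
		uint32_t comp = a & mask;

		return raw + ((uint64_t)comp << 32);
	}

	int main(void)
	{
		assert(umul_model(3, 0x80000000u) == 3ull * 0x80000000u);
		assert(umul_model(0xdeadbeefu, 0xcafebabeu) ==
		       (uint64_t)0xdeadbeefu * 0xcafebabeu);
		return 0;
	}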