Diffstat (limited to 'arch/sparc64/lib')
-rw-r--r-- | arch/sparc64/lib/Makefile             |  56
-rw-r--r-- | arch/sparc64/lib/blockops.S           | 138
-rw-r--r-- | arch/sparc64/lib/checksum.S           | 565
-rw-r--r-- | arch/sparc64/lib/copy_from_user.S     | 456
-rw-r--r-- | arch/sparc64/lib/copy_to_user.S       | 456
-rw-r--r-- | arch/sparc64/lib/locks.S              |  77
-rw-r--r-- | arch/sparc64/lib/memcmp.S             |  29
-rw-r--r-- | arch/sparc64/lib/memcpy.S             | 526
-rw-r--r-- | arch/sparc64/lib/memscan.S            | 116
-rw-r--r-- | arch/sparc64/lib/memset.S             | 196
-rw-r--r-- | arch/sparc64/lib/strlen.S             |  77
-rw-r--r-- | arch/sparc64/lib/strlen_user.S        |  99
-rw-r--r-- | arch/sparc64/lib/strncmp.S            |  31
-rw-r--r-- | arch/sparc64/lib/strncpy_from_user.S  |  54
14 files changed, 2876 insertions, 0 deletions
diff --git a/arch/sparc64/lib/Makefile b/arch/sparc64/lib/Makefile new file mode 100644 index 000000000..56c506507 --- /dev/null +++ b/arch/sparc64/lib/Makefile @@ -0,0 +1,56 @@ +# $Id: Makefile,v 1.7 1997/04/07 18:57:05 jj Exp $ +# Makefile for Sparc library files.. +# + +CFLAGS := $(CFLAGS) -ansi + +OBJS = memset.o blockops.o locks.o memcpy.o strlen.o strncmp.o \ + memscan.o strncpy_from_user.o strlen_user.o memcmp.o checksum.o \ + copy_to_user.o copy_from_user.o + +lib.a: $(OBJS) + $(AR) rcs lib.a $(OBJS) + sync + +blockops.o: blockops.S + $(CC) -ansi -c -o blockops.o blockops.S + +memset.o: memset.S + $(CC) -D__ASSEMBLY__ -ansi -c -o memset.o memset.S + +copy_to_user.o: copy_to_user.S + $(CC) -D__ASSEMBLY__ -ansi -c -o copy_to_user.o copy_to_user.S + +copy_from_user.o: copy_from_user.S + $(CC) -D__ASSEMBLY__ -ansi -c -o copy_from_user.o copy_from_user.S + +memcpy.o: memcpy.S + $(CC) -D__ASSEMBLY__ -ansi -c -o memcpy.o memcpy.S + +strlen.o: strlen.S + $(CC) -D__ASSEMBLY__ -ansi -c -o strlen.o strlen.S + +strncmp.o: strncmp.S + $(CC) -D__ASSEMBLY__ -ansi -c -o strncmp.o strncmp.S + +memcmp.o: memcmp.S + $(CC) -D__ASSEMBLY__ -ansi -c -o memcmp.o memcmp.S + +locks.o: locks.S + $(CC) -D__ASSEMBLY__ -ansi -c -o locks.o locks.S + +checksum.o: checksum.S + $(CC) -D__ASSEMBLY__ -ansi -c -o checksum.o checksum.S + +memscan.o: memscan.S + $(CC) -D__ASSEMBLY__ -ansi -c -o memscan.o memscan.S + +strncpy_from_user.o: strncpy_from_user.S + $(CC) -D__ASSEMBLY__ -ansi -c -o strncpy_from_user.o strncpy_from_user.S + +strlen_user.o: strlen_user.S + $(CC) -D__ASSEMBLY__ -ansi -c -o strlen_user.o strlen_user.S + +dep: + +include $(TOPDIR)/Rules.make diff --git a/arch/sparc64/lib/blockops.S b/arch/sparc64/lib/blockops.S new file mode 100644 index 000000000..b3f06c18d --- /dev/null +++ b/arch/sparc64/lib/blockops.S @@ -0,0 +1,138 @@ +/* $Id: blockops.S,v 1.5 1997/03/26 18:34:28 jj Exp $ + * arch/sparc64/lib/blockops.S: UltraSparc block zero optimized routines. + * + * Copyright (C) 1996 David S. Miller (davem@caip.rutgers.edu) + * Copyright (C) 1997 Jakub Jelinek (jj@sunsite.mff.cuni.cz) + */ + +#include <asm/asi.h> + + /* Zero out 256 bytes of memory at (buf + offset). */ +#define BLAST_BLOCK(buf, offset) \ + stda %f48, [buf + offset + 0x00] %asi; \ + stda %f48, [buf + offset + 0x40] %asi; \ + stda %f48, [buf + offset + 0x80] %asi; \ + stda %f48, [buf + offset + 0xc0] %asi; + + /* Copy 256 bytes of memory at (src + offset) to + * (dst + offset). 
+ */ +#define MIRROR_BLOCK(dst, src, offset, sync) \ + ldda [src + offset + 0x000] %asi, %f0; \ + ldda [src + offset + 0x040] %asi, %f16; \ + ldda [src + offset + 0x080] %asi, %f32; \ + ldda [src + offset + 0x0c0] %asi, %f48; \ + membar sync; \ + stda %f0, [dst + offset + 0x000] %asi; \ + stda %f16, [dst + offset + 0x040] %asi; \ + stda %f32, [dst + offset + 0x080] %asi; \ + stda %f48, [dst + offset + 0x0c0] %asi; + + .text + .align 4 + + .globl bzero_2page, bzero_1page +bzero_2page: + /* %o0 = buf */ + mov %o0, %o1 + wr %g0, ASI_BLK_P, %asi + mov 0x10, %g2 + + membar #Sync|#StoreLoad + + fzero %f48 + fzero %f50 + fzero %f52 + fzero %f54 + fzero %f56 + fzero %f58 + fzero %f60 + fzero %f62 +1: + BLAST_BLOCK(%o0, 0x000) + BLAST_BLOCK(%o0, 0x100) + BLAST_BLOCK(%o0, 0x200) + BLAST_BLOCK(%o0, 0x300) + subcc %g2, 1, %g2 + bne,pt %icc, 1b + add %o0, 0x400, %o0 + + membar #Sync|#LoadStore|#StoreStore + + retl + mov %o1, %o0 + +bzero_1page: + /* %o0 = buf */ + mov %o0, %o1 + wr %g0, ASI_BLK_P, %asi + mov 0x08, %g2 + membar #Sync|#StoreLoad + fzero %f48 + fzero %f50 + fzero %f52 + fzero %f54 + fzero %f56 + fzero %f58 + fzero %f60 + fzero %f62 +1: + BLAST_BLOCK(%o0, 0x000) + BLAST_BLOCK(%o0, 0x100) + BLAST_BLOCK(%o0, 0x200) + BLAST_BLOCK(%o0, 0x300) + subcc %g2, 1, %g2 + bne,pt %icc, 1b + add %o0, 0x400, %o0 + + membar #Sync|#LoadStore|#StoreStore + + retl + mov %o1, %o0 + + .globl __bfill64 +__bfill64: + /* %o0 = buf */ + stx %o1, [%sp + 0x7ff + 128] + wr %g0, ASI_BLK_P, %asi + mov 0x08, %g2 + ldd [%sp + 0x7ff + 128], %f48 + membar #Sync|#StoreLoad + fmovd %f48, %f50 + fmovd %f48, %f52 + fmovd %f48, %f54 + fmovd %f48, %f56 + fmovd %f48, %f58 + fmovd %f48, %f60 + fmovd %f48, %f62 +1: + BLAST_BLOCK(%o0, 0x000) + BLAST_BLOCK(%o0, 0x100) + BLAST_BLOCK(%o0, 0x200) + BLAST_BLOCK(%o0, 0x300) + subcc %g2, 1, %g2 + bne,pt %icc, 1b + add %o0, 0x400, %o0 + + retl + membar #Sync|#LoadStore|#StoreStore + + .globl __copy_1page +__copy_1page: + /* %o0 = dst, %o1 = src */ + or %g0, 0x08, %g1 + wr %g0, ASI_BLK_P, %asi + membar #Sync|#StoreLoad +1: + MIRROR_BLOCK(%o0, %o1, 0x000, #Sync) + MIRROR_BLOCK(%o0, %o1, 0x100, #Sync) + MIRROR_BLOCK(%o0, %o1, 0x200, #Sync) + MIRROR_BLOCK(%o0, %o1, 0x300, #Sync) + subcc %g1, 1, %g1 + add %o0, 0x400, %o0 + bne,pt %icc, 1b + add %o1, 0x400, %o1 + + retl + membar #Sync|#LoadStore|#StoreStore + diff --git a/arch/sparc64/lib/checksum.S b/arch/sparc64/lib/checksum.S new file mode 100644 index 000000000..8a06003ee --- /dev/null +++ b/arch/sparc64/lib/checksum.S @@ -0,0 +1,565 @@ +/* checksum.S: Sparc V9 optimized checksum code. + * + * Copyright(C) 1995 Linus Torvalds + * Copyright(C) 1995 Miguel de Icaza + * Copyright(C) 1996 David S. Miller + * Copyright(C) 1997 Jakub Jelinek + * + * derived from: + * Linux/Alpha checksum c-code + * Linux/ix86 inline checksum assembly + * RFC1071 Computing the Internet Checksum (esp. 
Jacobsons m68k code) + * David Mosberger-Tang for optimized reference c-code + * BSD4.4 portable checksum routine + */ + +#include <asm/errno.h> +#include <asm/head.h> +#include <asm/ptrace.h> +#include <asm/asi.h> + +#define CSUM_BIGCHUNK(buf, offset, sum, t0, t1, t2, t3, t4, t5) \ + ldd [buf + offset + 0x00], t0; \ + ldd [buf + offset + 0x08], t2; \ + addccc t0, sum, sum; \ + addccc t1, sum, sum; \ + ldd [buf + offset + 0x10], t4; \ + addccc t2, sum, sum; \ + addccc t3, sum, sum; \ + ldd [buf + offset + 0x18], t0; \ + addccc t4, sum, sum; \ + addccc t5, sum, sum; \ + addccc t0, sum, sum; \ + addccc t1, sum, sum; + +#define CSUM_LASTCHUNK(buf, offset, sum, t0, t1, t2, t3) \ + ldd [buf - offset - 0x08], t0; \ + ldd [buf - offset - 0x00], t2; \ + addccc t0, sum, sum; \ + addccc t1, sum, sum; \ + addccc t2, sum, sum; \ + addccc t3, sum, sum; + + /* Do end cruft out of band to get better cache patterns. */ +csum_partial_end_cruft: + andcc %o1, 8, %g0 ! check how much + be,pn %icc, 1f ! caller asks %o1 & 0x8 + and %o1, 4, %g3 ! nope, check for word remaining + ldd [%o0], %g2 ! load two + addcc %g2, %o2, %o2 ! add first word to sum + addccc %g3, %o2, %o2 ! add second word as well + add %o0, 8, %o0 ! advance buf ptr + addc %g0, %o2, %o2 ! add in final carry +1: brz,pn %g3, 1f ! nope, skip this code + andcc %o1, 3, %o1 ! check for trailing bytes + ld [%o0], %g2 ! load it + addcc %g2, %o2, %o2 ! add to sum + add %o0, 4, %o0 ! advance buf ptr + addc %g0, %o2, %o2 ! add in final carry +1: brz,pn %o1, 1f ! no trailing bytes, return + addcc %o1, -1, %g0 ! only one byte remains? + bne,pn %icc, 2f ! at least two bytes more + subcc %o1, 2, %o1 ! only two bytes more? + ba,pt %xcc, 4f ! only one byte remains + clr %o4 ! clear fake hword value +2: lduh [%o0], %o4 ! get hword + be,pn %icc, 6f ! jmp if only hword remains + add %o0, 2, %o0 ! advance buf ptr either way + sll %o4, 16, %o4 ! create upper hword +4: ldub [%o0], %o5 ! get final byte + sll %o5, 8, %o5 ! put into place + or %o5, %o4, %o4 ! coalese with hword (if any) +6: addcc %o4, %o2, %o2 ! add to sum +1: sllx %g4, 32, %g4 ! give gfp back + retl ! get outta here + addc %g0, %o2, %o0 ! add final carry into retval + + /* Also do alignment out of band to get better cache patterns. */ +csum_partial_fix_alignment: + + /* The common case is to get called with a nicely aligned + * buffer of size 0x20. Follow the code path for that case. + */ + .globl csum_partial +csum_partial: /* %o0=buf, %o1=len, %o2=sum */ + andcc %o0, 0x7, %g0 ! alignment problems? + be,pt %icc, csum_partial_fix_aligned ! yep, handle it + andn %o1, 0x7f, %o3 ! num loop iterations + cmp %o1, 6 + bl,pn %icc, cpte - 0x4 + andcc %o0, 0x2, %g0 + be,pn %icc, 1f + and %o0, 0x4, %g7 + lduh [%o0 + 0x00], %g2 + sub %o1, 2, %o1 + add %o0, 2, %o0 + sll %g2, 16, %g2 + addcc %g2, %o2, %o2 + srl %o2, 16, %g3 + addc %g0, %g3, %g2 + sll %o2, 16, %o2 + sll %g2, 16, %g3 + srl %o2, 16, %o2 + or %g3, %o2, %o2 +1: brz,pn %g7, csum_partial_fix_aligned + nop + ld [%o0 + 0x00], %g2 + sub %o1, 4, %o1 + addcc %g2, %o2, %o2 + add %o0, 4, %o0 + addc %g0, %o2, %o2 +csum_partial_fix_aligned: + brz,pt %o3, 3f ! none to do + andcc %o1, 0x70, %g1 ! clears carry flag too +5: CSUM_BIGCHUNK(%o0, 0x00, %o2, %o4, %o5, %g2, %g3, %g4, %g5) + CSUM_BIGCHUNK(%o0, 0x20, %o2, %o4, %o5, %g2, %g3, %g4, %g5) + CSUM_BIGCHUNK(%o0, 0x40, %o2, %o4, %o5, %g2, %g3, %g4, %g5) + CSUM_BIGCHUNK(%o0, 0x60, %o2, %o4, %o5, %g2, %g3, %g4, %g5) + sub %o3, 128, %o3 ! detract from loop iters + addc %g0, %o2, %o2 ! 
sink in final carry + brnz,pt %o3, 5b ! more to do + add %o0, 128, %o0 ! advance buf ptr +3: brz,pn %g1, cpte ! nope + andcc %o1, 0xf, %o3 ! anything left at all? +10: rd %pc, %g7 ! get pc + srl %g1, 1, %o4 ! compute offset + sub %g7, %g1, %g7 ! adjust jmp ptr + sub %g7, %o4, %g7 ! final jmp ptr adjust + jmp %g7 + (cpte - 8 - 10b) ! enter the table + add %o0, %g1, %o0 ! advance buf ptr +cptbl: CSUM_LASTCHUNK(%o0, 0x68, %o2, %g2, %g3, %g4, %g5) + CSUM_LASTCHUNK(%o0, 0x58, %o2, %g2, %g3, %g4, %g5) + CSUM_LASTCHUNK(%o0, 0x48, %o2, %g2, %g3, %g4, %g5) + CSUM_LASTCHUNK(%o0, 0x38, %o2, %g2, %g3, %g4, %g5) + CSUM_LASTCHUNK(%o0, 0x28, %o2, %g2, %g3, %g4, %g5) + CSUM_LASTCHUNK(%o0, 0x18, %o2, %g2, %g3, %g4, %g5) + CSUM_LASTCHUNK(%o0, 0x08, %o2, %g2, %g3, %g4, %g5) + addc %g0, %o2, %o2 ! fetch final carry + andcc %o1, 0xf, %g0 ! anything left at all? +cpte: brnz,pn %o3, csum_partial_end_cruft ! yep, handle it + sethi %uhi(KERNBASE), %g4 + mov %o2, %o0 ! return computed csum + retl ! get outta here + sllx %g4, 32, %g4 ! give gfp back + + .globl __csum_partial_copy_start, __csum_partial_copy_end +__csum_partial_copy_start: + +#define EX(x,y,a,b,z) \ +98: x,y; \ + .section .fixup,z##alloc,z##execinstr; \ + .align 4; \ +99: ba,pt %xcc, 30f; \ + a, b, %o3; \ + .section __ex_table,z##alloc; \ + .align 4; \ + .word 98b, 99b; \ + .text; \ + .align 4 + +#define EX2(x,y,z) \ +98: x,y; \ + .section __ex_table,z##alloc; \ + .align 4; \ + .word 98b, 30f; \ + .text; \ + .align 4 + +#define EX3(x,y,z) \ +98: x,y; \ + .section __ex_table,z##alloc; \ + .align 4; \ + .word 98b, 96f; \ + .text; \ + .align 4 + +#define EXT(start,end,handler,z) \ + .section __ex_table,z##alloc; \ + .align 4; \ + .word start, 0, end, handler; \ + .text; \ + .align 4 + + /* This aligned version executes typically in 8.5 superscalar cycles, this + * is the best I can do. I say 8.5 because the final add will pair with + * the next ldd in the main unrolled loop. Thus the pipe is always full. + * If you change these macros (including order of instructions), + * please check the fixup code below as well. + */ +#define CSUMCOPY_BIGCHUNK_ALIGNED(src, dst, sum, off, t0, t1, t2, t3, t4, t5, t6, t7) \ + ldd [src + off + 0x00], t0; \ + ldd [src + off + 0x08], t2; \ + addccc t0, sum, sum; \ + ldd [src + off + 0x10], t4; \ + addccc t1, sum, sum; \ + ldd [src + off + 0x18], t6; \ + addccc t2, sum, sum; \ + std t0, [dst + off + 0x00]; \ + addccc t3, sum, sum; \ + std t2, [dst + off + 0x08]; \ + addccc t4, sum, sum; \ + std t4, [dst + off + 0x10]; \ + addccc t5, sum, sum; \ + std t6, [dst + off + 0x18]; \ + addccc t6, sum, sum; \ + addccc t7, sum, sum; + + /* 12 superscalar cycles seems to be the limit for this case, + * because of this we thus do all the ldd's together to get + * Viking MXCC into streaming mode. Ho hum... + */ +#define CSUMCOPY_BIGCHUNK(src, dst, sum, off, t0, t1, t2, t3, t4, t5, t6, t7) \ + ldd [src + off + 0x00], t0; \ + ldd [src + off + 0x08], t2; \ + ldd [src + off + 0x10], t4; \ + ldd [src + off + 0x18], t6; \ + st t0, [dst + off + 0x00]; \ + addccc t0, sum, sum; \ + st t1, [dst + off + 0x04]; \ + addccc t1, sum, sum; \ + st t2, [dst + off + 0x08]; \ + addccc t2, sum, sum; \ + st t3, [dst + off + 0x0c]; \ + addccc t3, sum, sum; \ + st t4, [dst + off + 0x10]; \ + addccc t4, sum, sum; \ + st t5, [dst + off + 0x14]; \ + addccc t5, sum, sum; \ + st t6, [dst + off + 0x18]; \ + addccc t6, sum, sum; \ + st t7, [dst + off + 0x1c]; \ + addccc t7, sum, sum; + + /* Yuck, 6 superscalar cycles... 
*/ +#define CSUMCOPY_LASTCHUNK(src, dst, sum, off, t0, t1, t2, t3) \ + ldd [src - off - 0x08], t0; \ + ldd [src - off - 0x00], t2; \ + addccc t0, sum, sum; \ + st t0, [dst - off - 0x08]; \ + addccc t1, sum, sum; \ + st t1, [dst - off - 0x04]; \ + addccc t2, sum, sum; \ + st t2, [dst - off - 0x00]; \ + addccc t3, sum, sum; \ + st t3, [dst - off + 0x04]; + + /* Handle the end cruft code out of band for better cache patterns. */ +cc_end_cruft: + andcc %o3, 8, %g0 ! begin checks for that code + be,pn %icc, 1f + and %o3, 4, %g5 + EX(ldd [%o0 + 0x00], %g2, and %o3, 0xf,#) + add %o1, 8, %o1 + addcc %g2, %g7, %g7 + add %o0, 8, %o0 + addccc %g3, %g7, %g7 + EX2(st %g2, [%o1 - 0x08],#) + addc %g0, %g7, %g7 + EX2(st %g3, [%o1 - 0x04],#) +1: brz,pt %g5, 1f + andcc %o3, 3, %o3 + EX(ld [%o0 + 0x00], %g2, add %o3, 4,#) + add %o1, 4, %o1 + addcc %g2, %g7, %g7 + EX2(st %g2, [%o1 - 0x04],#) + addc %g0, %g7, %g7 + add %o0, 4, %o0 +1: brz,pn %o3, 1f + addcc %o3, -1, %g0 + bne,pn %icc, 2f + subcc %o3, 2, %o3 + ba,pt %xcc, 4f + clr %o4 +2: EX(lduh [%o0 + 0x00], %o4, add %o3, 2,#) + add %o0, 2, %o0 + EX2(sth %o4, [%o1 + 0x00],#) + be,pn %icc, 6f + add %o1, 2, %o1 + sll %o4, 16, %o4 +4: EX(ldub [%o0 + 0x00], %o5, add %g0, 1,#) + EX2(stb %o5, [%o1 + 0x00],#) + sll %o5, 8, %o5 + or %o5, %o4, %o4 +6: addcc %o4, %g7, %g7 +1: sllx %g4, 32, %g4 + retl + addc %g0, %g7, %o0 + + /* Sun, you just can't beat me, you just can't. Stop trying, + * give up. I'm serious, I am going to kick the living shit + * out of you, game over, lights out. + */ + .align 8 + .globl __csum_partial_copy_sparc_generic +__csum_partial_copy_sparc_generic: + /* %o0=src, %o1=dest, %g1=len, %g7=sum */ + xor %o0, %o1, %o4 ! get changing bits + andcc %o4, 3, %g0 ! check for mismatched alignment + bne,pn %icc, ccslow ! better this than unaligned/fixups + andcc %o0, 7, %g0 ! need to align things? + be,pt %icc, cc_dword_aligned ! yes, we check for short lengths there + andn %g1, 0x7f, %g2 ! can we use unrolled loop? + cmp %g1, 6 + bl,a,pn %icc, ccte + andcc %g1, 0xf, %o3 + andcc %o0, 0x1, %g0 + bne,pn %icc, ccslow + andcc %o0, 0x2, %g0 + be,pn %icc, 1f + andcc %o0, 0x4, %g0 + EX(lduh [%o0 + 0x00], %g4, add %g1, 0,#) + sub %g1, 2, %g1 + EX2(sth %g4, [%o1 + 0x00],#) + add %o0, 2, %o0 + sll %g4, 16, %g4 + addcc %g4, %g7, %g7 + add %o1, 2, %o1 + srl %g7, 16, %g3 + addc %g0, %g3, %g4 + sll %g7, 16, %g7 + sll %g4, 16, %g3 + srl %g7, 16, %g7 + andcc %o0, 0x4, %g0 + or %g3, %g7, %g7 +1: be,pt %icc, 3f + andn %g1, 0x7f, %g0 + EX(ld [%o0 + 0x00], %g4, add %g1, 0,#) + sub %g1, 4, %g1 + EX2(st %g4, [%o1 + 0x00],#) + add %o0, 4, %o0 + addcc %g4, %g7, %g7 + add %o1, 4, %o1 + addc %g0, %g7, %g7 +cc_dword_aligned: +3: brz,pn %g2, 3f ! nope, less than one loop remains + andcc %o1, 4, %g0 ! dest aligned on 4 or 8 byte boundry? + be,pn %icc, ccdbl + 4 ! 8 byte aligned, kick ass +5: CSUMCOPY_BIGCHUNK(%o0,%o1,%g7,0x00,%o4,%o5,%g2,%g3,%g4,%g5,%o2,%o3) + CSUMCOPY_BIGCHUNK(%o0,%o1,%g7,0x20,%o4,%o5,%g2,%g3,%g4,%g5,%o2,%o3) + CSUMCOPY_BIGCHUNK(%o0,%o1,%g7,0x40,%o4,%o5,%g2,%g3,%g4,%g5,%o2,%o3) + CSUMCOPY_BIGCHUNK(%o0,%o1,%g7,0x60,%o4,%o5,%g2,%g3,%g4,%g5,%o2,%o3) +10: EXT(5b, 10b, 20f,#) ! note for exception handling + sub %g1, 128, %g1 ! detract from length + addc %g0, %g7, %g7 ! add in last carry bit + andncc %g1, 0x7f, %g0 ! more to csum? + add %o0, 128, %o0 ! advance src ptr + bne,pt %icc, 5b ! we did not go negative, continue looping + add %o1, 128, %o1 ! advance dest ptr +3: andcc %g1, 0x70, %o2 ! can use table? +ccmerge:be,pn %icc, ccte ! 
nope, go and check for end cruft + andcc %g1, 0xf, %o3 ! get low bits of length (clears carry btw) + srl %o2, 1, %o4 ! begin negative offset computation +13: rd %pc, %o5 ! set up table ptr end + add %o0, %o2, %o0 ! advance src ptr + sub %o5, %o4, %o5 ! continue table calculation + sll %o2, 1, %g2 ! constant multiplies are fun... + sub %o5, %g2, %o5 ! some more adjustments + jmpl %o5 + (12f-13b), %g0 ! jump into it, duff style, wheee... + add %o1, %o2, %o1 ! advance dest ptr (carry is clear btw) +cctbl: CSUMCOPY_LASTCHUNK(%o0,%o1,%g7,0x68,%g2,%g3,%g4,%g5) + CSUMCOPY_LASTCHUNK(%o0,%o1,%g7,0x58,%g2,%g3,%g4,%g5) + CSUMCOPY_LASTCHUNK(%o0,%o1,%g7,0x48,%g2,%g3,%g4,%g5) + CSUMCOPY_LASTCHUNK(%o0,%o1,%g7,0x38,%g2,%g3,%g4,%g5) + CSUMCOPY_LASTCHUNK(%o0,%o1,%g7,0x28,%g2,%g3,%g4,%g5) + CSUMCOPY_LASTCHUNK(%o0,%o1,%g7,0x18,%g2,%g3,%g4,%g5) + CSUMCOPY_LASTCHUNK(%o0,%o1,%g7,0x08,%g2,%g3,%g4,%g5) +12: EXT(cctbl, 12b, 22f,#) ! note for exception table handling + addc %g0, %g7, %g7 + andcc %o3, 0xf, %g0 ! check for low bits set +ccte: bne,pn %icc, cc_end_cruft ! something left, handle it out of band + sethi %uhi(KERNBASE), %g4 ! restore gfp + mov %g7, %o0 ! give em the computed checksum + retl ! return + sllx %g4, 32, %g4 ! finish gfp restoration +ccdbl: CSUMCOPY_BIGCHUNK_ALIGNED(%o0,%o1,%g7,0x00,%o4,%o5,%g2,%g3,%g4,%g5,%o2,%o3) + CSUMCOPY_BIGCHUNK_ALIGNED(%o0,%o1,%g7,0x20,%o4,%o5,%g2,%g3,%g4,%g5,%o2,%o3) + CSUMCOPY_BIGCHUNK_ALIGNED(%o0,%o1,%g7,0x40,%o4,%o5,%g2,%g3,%g4,%g5,%o2,%o3) + CSUMCOPY_BIGCHUNK_ALIGNED(%o0,%o1,%g7,0x60,%o4,%o5,%g2,%g3,%g4,%g5,%o2,%o3) +11: EXT(ccdbl, 11b, 21f,#) ! note for exception table handling + sub %g1, 128, %g1 ! detract from length + addc %g0, %g7, %g7 ! add in last carry bit + andncc %g1, 0x7f, %g0 ! more to csum? + add %o0, 128, %o0 ! advance src ptr + bne,pt %icc, ccdbl ! we did not go negative, continue looping + add %o1, 128, %o1 ! advance dest ptr + ba,pt %xcc, ccmerge ! finish it off, above + andcc %g1, 0x70, %o2 ! can use table? (clears carry btw) + +ccslow: mov 0, %g5 + brlez,pn %g1, 4f + andcc %o0, 1, %o5 + be,a,pt %icc, 1f + srl %g1, 1, %o3 + sub %g1, 1, %g1 + EX(ldub [%o0], %g5, add %g1, 1,#) + add %o0, 1, %o0 + EX2(stb %g5, [%o1],#) + srl %g1, 1, %o3 + add %o1, 1, %o1 +1: brz,a,pn %o3, 3f + andcc %g1, 1, %g0 + andcc %o0, 2, %g0 + be,a,pt %icc, 1f + srl %o3, 1, %o3 + EX(lduh [%o0], %o4, add %g1, 0,#) + sub %g1, 2, %g1 + srl %o4, 8, %g2 + sub %o3, 1, %o3 + EX2(stb %g2, [%o1],#) + add %o4, %g5, %g5 + EX2(stb %o4, [%o1 + 1],#) + add %o0, 2, %o0 + srl %o3, 1, %o3 + add %o1, 2, %o1 +1: brz,a,pn %o3, 2f + andcc %g1, 2, %g0 + EX3(ld [%o0], %o4,#) +5: srl %o4, 24, %g2 + srl %o4, 16, %g3 + EX2(stb %g2, [%o1],#) + srl %o4, 8, %g2 + EX2(stb %g3, [%o1 + 1],#) + add %o0, 4, %o0 + EX2(stb %g2, [%o1 + 2],#) + addcc %o4, %g5, %g5 + EX2(stb %o4, [%o1 + 3],#) + addc %g5, %g0, %g5 ! I am now to lazy to optimize this (question is if it + add %o1, 4, %o1 ! is worthy). Maybe some day - with the sll/srl + subcc %o3, 1, %o3 ! 
tricks + bne,a,pt %icc, 5b + EX3(ld [%o0], %o4,#) + sll %g5, 16, %g2 + srl %g5, 16, %g5 + srl %g2, 16, %g2 + andcc %g1, 2, %g0 + add %g2, %g5, %g5 +2: be,a,pt %icc, 3f + andcc %g1, 1, %g0 + EX(lduh [%o0], %o4, and %g1, 3,#) + andcc %g1, 1, %g0 + srl %o4, 8, %g2 + add %o0, 2, %o0 + EX2(stb %g2, [%o1],#) + add %g5, %o4, %g5 + EX2(stb %o4, [%o1 + 1],#) + add %o1, 2, %o1 +3: be,a,pt %icc, 1f + sll %g5, 16, %o4 + EX(ldub [%o0], %g2, add %g0, 1,#) + sll %g2, 8, %o4 + EX2(stb %g2, [%o1],#) + add %g5, %o4, %g5 + sll %g5, 16, %o4 +1: addcc %o4, %g5, %g5 + srl %g5, 16, %o4 + addc %g0, %o4, %g5 + brz,pt %o5, 4f + srl %g5, 8, %o4 + and %g5, 0xff, %g2 + and %o4, 0xff, %o4 + sll %g2, 8, %g2 + or %g2, %o4, %g5 +4: addcc %g7, %g5, %g7 + retl + addc %g0, %g7, %o0 +__csum_partial_copy_end: + + .section .fixup,#alloc,#execinstr + .align 4 +/* We do these strange calculations for the csum_*_from_user case only, ie. + * we only bother with faults on loads... */ + +/* o2 = ((g2%20)&3)*8 + * o3 = g1 - (g2/20)*32 - o2 */ +20: + cmp %g2, 20 + blu,a,pn %icc, 1f + and %g2, 3, %o2 + sub %g1, 32, %g1 + ba,pt %xcc, 20b + sub %g2, 20, %g2 +1: + sll %o2, 3, %o2 + ba,pt %xcc, 31f + sub %g1, %o2, %o3 + +/* o2 = (!(g2 & 15) ? 0 : (((g2 & 15) + 1) & ~1)*8) + * o3 = g1 - (g2/16)*32 - o2 */ +21: + andcc %g2, 15, %o3 + srl %g2, 4, %g2 + be,a,pn %icc, 1f + clr %o2 + add %o3, 1, %o3 + and %o3, 14, %o3 + sll %o3, 3, %o2 +1: + sll %g2, 5, %g2 + sub %g1, %g2, %o3 + ba,pt %xcc, 31f + sub %o3, %o2, %o3 + +/* o0 += (g2/10)*16 - 0x70 + * 01 += (g2/10)*16 - 0x70 + * o2 = (g2 % 10) ? 8 : 0 + * o3 += 0x70 - (g2/10)*16 - o2 */ +22: + cmp %g2, 10 + blu,a,pt %xcc, 1f + sub %o0, 0x70, %o0 + add %o0, 16, %o0 + add %o1, 16, %o1 + sub %o3, 16, %o3 + ba,pt %xcc, 22b + sub %g2, 10, %g2 +1: + sub %o1, 0x70, %o1 + add %o3, 0x70, %o3 + clr %o2 + movrnz %g2, 8, %o2 + ba,pt %xcc, 31f + sub %o3, %o2, %o3 +96: + and %g1, 3, %g1 + sll %o3, 2, %o3 + add %g1, %o3, %o3 +30: +/* %o1 is dst + * %o3 is # bytes to zero out + * %o4 is faulting address + * %o5 is %pc where fault occured */ + clr %o2 +31: +/* %o0 is src + * %o1 is dst + * %o2 is # of bytes to copy from src to dst + * %o3 is # bytes to zero out + * %o4 is faulting address + * %o5 is %pc where fault occured */ + save %sp, -136, %sp + mov %i5, %o0 + mov %i7, %o1 + mov %i4, %o2 + call lookup_fault + mov %g7, %i4 + cmp %o0, 2 + bne,pn %icc, 1f + add %g0, -EFAULT, %i5 + brz,pn %i2, 2f + mov %i0, %o1 + mov %i1, %o0 + call __copy_from_user + mov %i2, %o2 + brnz,a,pn %o0, 2f + add %i3, %i2, %i3 + add %i1, %i2, %i1 +2: + mov %i1, %o0 + wr %%g0, ASI_S, %%asi + call __bzero_noasi + mov %i3, %o1 +1: + ldx [%sp + STACK_BIAS + 264], %o2 ! struct_ptr of parent + st %i5, [%o2] + ret + restore diff --git a/arch/sparc64/lib/copy_from_user.S b/arch/sparc64/lib/copy_from_user.S new file mode 100644 index 000000000..ba26a1c01 --- /dev/null +++ b/arch/sparc64/lib/copy_from_user.S @@ -0,0 +1,456 @@ +/* copy_user.S: Sparc optimized copy_from_user code. + * + * Copyright(C) 1995 Linus Torvalds + * Copyright(C) 1996 David S. Miller + * Copyright(C) 1996 Eddie C. Dost + * Copyright(C) 1996,1997 Jakub Jelinek + * + * derived from: + * e-mail between David and Eddie. + * + * Returns 0 if successful, otherwise count of bytes not copied yet + * + * FIXME: This code should be optimized for sparc64... 
-jj + */ + +#include <asm/ptrace.h> +#include <asm/asi.h> + +#define EX(x,y,a,b,z) \ +98: x,y; \ + .section .fixup,z##alloc,z##execinstr; \ + .align 4; \ +99: retl; \ + a, b, %o0; \ + .section __ex_table,z##alloc; \ + .align 4; \ + .word 98b, 99b; \ + .text; \ + .align 4 + +#define EX2(x,y,c,d,e,a,b,z) \ +98: x,y; \ + .section .fixup,z##alloc,z##execinstr; \ + .align 4; \ +99: c, d, e; \ + retl; \ + a, b, %o0; \ + .section __ex_table,z##alloc; \ + .align 4; \ + .word 98b, 99b; \ + .text; \ + .align 4 + +#define EXO2(x,y,z) \ +98: x,##y; \ + .section __ex_table,z##alloc; \ + .align 4; \ + .word 98b, 97f; \ + .text; \ + .align 4 + +#define EXT(start,end,handler,z) \ + .section __ex_table,z##alloc; \ + .align 4; \ + .word start, 0, end, handler; \ + .text; \ + .align 4 + +/* Please do not change following macros unless you change logic used + * in .fixup at the end of this file as well + */ + +/* Both these macros have to start with exactly the same insn */ +#define MOVE_BIGCHUNK(src, dst, offset, t0, t1, t2, t3, t4, t5, t6, t7) \ + ldda [%src + offset + 0x00] %asi, %t0; \ + ldda [%src + offset + 0x08] %asi, %t2; \ + ldda [%src + offset + 0x10] %asi, %t4; \ + ldda [%src + offset + 0x18] %asi, %t6; \ + st %t0, [%dst + offset + 0x00]; \ + st %t1, [%dst + offset + 0x04]; \ + st %t2, [%dst + offset + 0x08]; \ + st %t3, [%dst + offset + 0x0c]; \ + st %t4, [%dst + offset + 0x10]; \ + st %t5, [%dst + offset + 0x14]; \ + st %t6, [%dst + offset + 0x18]; \ + st %t7, [%dst + offset + 0x1c]; + +#define MOVE_BIGALIGNCHUNK(src, dst, offset, t0, t1, t2, t3, t4, t5, t6, t7) \ + ldda [%src + offset + 0x00] %asi, %t0; \ + ldda [%src + offset + 0x08] %asi, %t2; \ + ldda [%src + offset + 0x10] %asi, %t4; \ + ldda [%src + offset + 0x18] %asi, %t6; \ + std %t0, [%dst + offset + 0x00]; \ + std %t2, [%dst + offset + 0x08]; \ + std %t4, [%dst + offset + 0x10]; \ + std %t6, [%dst + offset + 0x18]; + +#define MOVE_LASTCHUNK(src, dst, offset, t0, t1, t2, t3) \ + ldda [%src - offset - 0x10] %asi, %t0; \ + ldda [%src - offset - 0x08] %asi, %t2; \ + st %t0, [%dst - offset - 0x10]; \ + st %t1, [%dst - offset - 0x0c]; \ + st %t2, [%dst - offset - 0x08]; \ + st %t3, [%dst - offset - 0x04]; + +#define MOVE_HALFCHUNK(src, dst, offset, t0, t1, t2, t3) \ + lduha [%src + offset + 0x00] %asi, %t0; \ + lduha [%src + offset + 0x02] %asi, %t1; \ + lduha [%src + offset + 0x04] %asi, %t2; \ + lduha [%src + offset + 0x06] %asi, %t3; \ + sth %t0, [%dst + offset + 0x00]; \ + sth %t1, [%dst + offset + 0x02]; \ + sth %t2, [%dst + offset + 0x04]; \ + sth %t3, [%dst + offset + 0x06]; + +#define MOVE_SHORTCHUNK(src, dst, offset, t0, t1) \ + lduba [%src - offset - 0x02] %asi, %t0; \ + lduba [%src - offset - 0x01] %asi, %t1; \ + stb %t0, [%dst - offset - 0x02]; \ + stb %t1, [%dst - offset - 0x01]; + + .text + .align 4 + + .globl __copy_from_user +dword_align: + andcc %o1, 1, %g0 + be 4f + andcc %o1, 2, %g0 + + EXO2(lduba [%o1] %asi, %g2,#) + add %o1, 1, %o1 + stb %g2, [%o0] + sub %o2, 1, %o2 + bne 3f + add %o0, 1, %o0 + + EXO2(lduha [%o1] %asi, %g2,#) + add %o1, 2, %o1 + sth %g2, [%o0] + sub %o2, 2, %o2 + ba,pt %xcc, 3f + add %o0, 2, %o0 +4: + EXO2(lduha [%o1] %asi, %g2,#) + add %o1, 2, %o1 + sth %g2, [%o0] + sub %o2, 2, %o2 + ba,pt %xcc, 3f + add %o0, 2, %o0 + +__copy_from_user: /* %o0=dst %o1=src %o2=len */ + wr %g0, ASI_S, %asi + xor %o0, %o1, %o4 +1: + andcc %o4, 3, %o5 +2: + bne,pn %icc, cannot_optimize + cmp %o2, 15 + + bleu,pn %xcc, short_aligned_end + andcc %o1, 3, %g0 + + bne,pn %icc, dword_align +3: + andcc %o1, 4, %g0 + + be,pt %icc, 
2f + mov %o2, %g1 + + EXO2(lda [%o1] %asi, %o4,#) + sub %g1, 4, %g1 + st %o4, [%o0] + add %o1, 4, %o1 + add %o0, 4, %o0 +2: + andcc %g1, 0xffffffffffffff80, %g7 + be,pn %xcc, 3f + andcc %o0, 4, %g0 + + be,pn %icc, ldd_std + 4 +5: + MOVE_BIGCHUNK(o1, o0, 0x00, o2, o3, o4, o5, g2, g3, g4, g5) + MOVE_BIGCHUNK(o1, o0, 0x20, o2, o3, o4, o5, g2, g3, g4, g5) + MOVE_BIGCHUNK(o1, o0, 0x40, o2, o3, o4, o5, g2, g3, g4, g5) + MOVE_BIGCHUNK(o1, o0, 0x60, o2, o3, o4, o5, g2, g3, g4, g5) +80: + EXT(5b, 80b, 50f,#) + subcc %g7, 128, %g7 + add %o1, 128, %o1 + bne,pt %xcc, 5b + add %o0, 128, %o0 +3: + andcc %g1, 0x70, %g7 + be,pn %icc, copy_user_table_end + andcc %g1, 8, %g0 +100: + rd %pc, %o5 + srl %g7, 1, %o4 + add %g7, %o4, %o4 + add %o1, %g7, %o1 + sub %o5, %o4, %o5 + jmpl %o5 + (copy_user_table_end - 100b), %g0 + add %o0, %g7, %o0 + +copy_user_table: + MOVE_LASTCHUNK(o1, o0, 0x60, g2, g3, g4, g5) + MOVE_LASTCHUNK(o1, o0, 0x50, g2, g3, g4, g5) + MOVE_LASTCHUNK(o1, o0, 0x40, g2, g3, g4, g5) + MOVE_LASTCHUNK(o1, o0, 0x30, g2, g3, g4, g5) + MOVE_LASTCHUNK(o1, o0, 0x20, g2, g3, g4, g5) + MOVE_LASTCHUNK(o1, o0, 0x10, g2, g3, g4, g5) + MOVE_LASTCHUNK(o1, o0, 0x00, g2, g3, g4, g5) +copy_user_table_end: + EXT(copy_user_table, copy_user_table_end, 51f,#) + be,pt %icc, copy_user_last7 + andcc %g1, 4, %g0 + + EX(ldda [%o1] %asi, %g2, and %g1, 0xf,#) + add %o0, 8, %o0 + add %o1, 8, %o1 + st %g2, [%o0 - 0x08] + st %g3, [%o0 - 0x04] +copy_user_last7: + be,pn %icc, 1f + andcc %g1, 2, %g0 + + EX(lda [%o1] %asi, %g2, and %g1, 7,#) + add %o1, 4, %o1 + st %g2, [%o0] + add %o0, 4, %o0 +1: + be,pn %icc, 1f + andcc %g1, 1, %g0 + + EX(lduha [%o1] %asi, %g2, and %g1, 3,#) + add %o1, 2, %o1 + sth %g2, [%o0] + add %o0, 2, %o0 +1: + be,pn %icc, 1f + nop + + EX(lduba [%o1] %asi, %g2, add %g0, 1,#) + stb %g2, [%o0] +1: + retl + clr %o0 + +ldd_std: + MOVE_BIGALIGNCHUNK(o1, o0, 0x00, o2, o3, o4, o5, g2, g3, g4, g5) + MOVE_BIGALIGNCHUNK(o1, o0, 0x20, o2, o3, o4, o5, g2, g3, g4, g5) + MOVE_BIGALIGNCHUNK(o1, o0, 0x40, o2, o3, o4, o5, g2, g3, g4, g5) + MOVE_BIGALIGNCHUNK(o1, o0, 0x60, o2, o3, o4, o5, g2, g3, g4, g5) +81: + EXT(ldd_std, 81b, 52f,#) + subcc %g7, 128, %g7 + add %o1, 128, %o1 + bne,pt %xcc, ldd_std + add %o0, 128, %o0 + + andcc %g1, 0x70, %g7 + be,pn %icc, copy_user_table_end + andcc %g1, 8, %g0 +101: + rd %pc, %o5 + srl %g7, 1, %o4 + add %g7, %o4, %o4 + add %o1, %g7, %o1 + sub %o5, %o4, %o5 + jmpl %o5 + (copy_user_table_end - 101b), %g0 + add %o0, %g7, %o0 + +cannot_optimize: + bleu short_end + cmp %o5, 2 + + bne byte_chunk + and %o2, 0xfffffffffffffff0, %o3 + + andcc %o1, 1, %g0 + be 10f + nop + + EXO2(lduba [%o1] %asi, %g2,#) + add %o1, 1, %o1 + stb %g2, [%o0] + sub %o2, 1, %o2 + andcc %o2, 0xfffffffffffffff0, %o3 + be short_end + add %o0, 1, %o0 +10: + MOVE_HALFCHUNK(o1, o0, 0x00, g2, g3, g4, g5) + MOVE_HALFCHUNK(o1, o0, 0x08, g2, g3, g4, g5) +82: + EXT(10b, 82b, 53f,#) + subcc %o3, 0x10, %o3 + add %o1, 0x10, %o1 + bne 10b + add %o0, 0x10, %o0 + ba,pt %xcc, 2f + and %o2, 0xe, %o3 + +byte_chunk: + MOVE_SHORTCHUNK(o1, o0, -0x02, g2, g3) + MOVE_SHORTCHUNK(o1, o0, -0x04, g2, g3) + MOVE_SHORTCHUNK(o1, o0, -0x06, g2, g3) + MOVE_SHORTCHUNK(o1, o0, -0x08, g2, g3) + MOVE_SHORTCHUNK(o1, o0, -0x0a, g2, g3) + MOVE_SHORTCHUNK(o1, o0, -0x0c, g2, g3) + MOVE_SHORTCHUNK(o1, o0, -0x0e, g2, g3) + MOVE_SHORTCHUNK(o1, o0, -0x10, g2, g3) +83: + EXT(byte_chunk, 83b, 54f,#) + subcc %o3, 0x10, %o3 + add %o1, 0x10, %o1 + bne,pt %xcc, byte_chunk + add %o0, 0x10, %o0 + +short_end: + and %o2, 0xe, %o3 +2: + rd %pc, %o5 + sll %o3, 3, %o4 + add %o0, 
%o3, %o0 + sub %o5, %o4, %o5 + add %o1, %o3, %o1 + jmpl %o5 + (short_table_end - 2b), %g0 + andcc %o2, 1, %g0 +84: + MOVE_SHORTCHUNK(o1, o0, 0x0c, g2, g3) + MOVE_SHORTCHUNK(o1, o0, 0x0a, g2, g3) + MOVE_SHORTCHUNK(o1, o0, 0x08, g2, g3) + MOVE_SHORTCHUNK(o1, o0, 0x06, g2, g3) + MOVE_SHORTCHUNK(o1, o0, 0x04, g2, g3) + MOVE_SHORTCHUNK(o1, o0, 0x02, g2, g3) + MOVE_SHORTCHUNK(o1, o0, 0x00, g2, g3) +short_table_end: + EXT(84b, short_table_end, 55f,#) + be 1f + nop + EX(lduba [%o1] %asi, %g2, add %g0, 1,#) + stb %g2, [%o0] +1: + retl + clr %o0 + +short_aligned_end: + bne short_end + andcc %o2, 8, %g0 + + be 1f + andcc %o2, 4, %g0 + + EXO2(lda [%o1 + 0x00] %asi, %g2,#) + EX(lda [%o1 + 0x04] %asi, %g3, sub %o2, 4,#) + add %o1, 8, %o1 + st %g2, [%o0 + 0x00] + st %g3, [%o0 + 0x04] + add %o0, 8, %o0 +1: + ba,pt %xcc, copy_user_last7 + mov %o2, %g1 + + .section .fixup,#alloc,#execinstr + .align 4 +97: + retl + mov %o2, %o0 +/* exception routine sets %g2 to (broken_insn - first_insn)>>2 */ +50: +/* This magic counts how many bytes are left when crash in MOVE_BIGCHUNK + * happens. This is derived from the amount ldd reads, st stores, etc. + * x = g2 % 12; + * o0 = g1 + g7 - ((g2 / 12) * 32 + (x < 4) ? x * 8 : (x - 4) * 4) + */ + cmp %g2, 12 + bcs 1f + cmp %g2, 24 + bcs 2f + cmp %g2, 36 + bcs 3f + nop + sub %g2, 12, %g2 + sub %g7, 32, %g7 +3: + sub %g2, 12, %g2 + sub %g7, 32, %g7 +2: + sub %g2, 12, %g2 + sub %g7, 32, %g7 +1: + cmp %g2, 4 + bcs,a 1f + sll %g2, 3, %g2 + sub %g2, 4, %g2 + sll %g2, 2, %g2 +1: + and %g1, 0x7f, %o0 + add %o0, %g7, %o0 + retl + sub %o0, %g2, %o0 +51: +/* i = 41 - g2; j = i % 6; + * o0 = (g1 & 15) + (i / 6) * 16 + (j < 4) ? (j + 1) * 4 : (j - 3) * 8; + */ + neg %g2 + and %g1, 0xf, %g1 + add %g2, 41, %g2 +1: + cmp %g2, 6 + bcs,a 2f + cmp %g2, 4 + add %g1, 16, %g1 + b 1b + sub %g2, 6, %g2 +2: + bcs,a 3f + inc %g2 + sub %g2, 3, %g2 + b 2f + sll %g2, 3, %g2 +3: + sll %g2, 2, %g2 +2: + retl + add %g1, %g2, %o0 +52: +/* o0 = g1 + g7 - (g2 / 8) * 32 + (x & 3) * 8 */ + and %g2, 0xfffffffffffffff8, %g4 + and %g2, 3, %g2 + sll %g4, 2, %g4 + sll %g2, 3, %g2 + add %g2, %g4, %g2 + b,a 1b +53: +/* o0 = o3 + (o2 & 15) - (g2 & 8) - (g2 & 3) * 2 */ + and %g2, 3, %g4 + and %g2, 0xfffffffffffffff8, %g2 + sll %g4, 1, %g4 + add %g2, %g4, %g2 + and %o2, 0xf, %o0 + add %o0, %o3, %o0 + retl + sub %o0, %g2, %o0 +54: +/* o0 = o3 + (o2 & 15) - (g2 / 4) * 2 - (g2 & 1) */ + srl %g2, 2, %o4 + and %g2, 1, %o1 + sll %o4, 1, %o4 + and %o2, 0xf, %o2 + sub %o3, %o1, %o3 + sub %o2, %o4, %o2 + retl + add %o2, %o3, %o0 +55: +/* o0 = (o2 & 1) + (27 - g2)/4 * 2 + ((27 - g2) & 1) */ + neg %g2 + and %o2, 1, %o2 + add %g2, 27, %g2 + srl %g2, 2, %o1 + and %g2, 1, %g2 + sll %o1, 1, %o1 + add %o2, %g2, %o0 + retl + add %o0, %o1, %o0 diff --git a/arch/sparc64/lib/copy_to_user.S b/arch/sparc64/lib/copy_to_user.S new file mode 100644 index 000000000..47a6bd337 --- /dev/null +++ b/arch/sparc64/lib/copy_to_user.S @@ -0,0 +1,456 @@ +/* copy_user.S: Sparc optimized copy_to_user code. + * + * Copyright(C) 1995 Linus Torvalds + * Copyright(C) 1996 David S. Miller + * Copyright(C) 1996 Eddie C. Dost + * Copyright(C) 1996,1997 Jakub Jelinek + * + * derived from: + * e-mail between David and Eddie. + * + * Returns 0 if successful, otherwise count of bytes not copied yet + * + * FIXME: This code should be optimized for sparc64... 
-jj + */ + +#include <asm/ptrace.h> +#include <asm/asi.h> + +#define EX(x,y,a,b,z) \ +98: x,y; \ + .section .fixup,z##alloc,z##execinstr; \ + .align 4; \ +99: retl; \ + a, b, %o0; \ + .section __ex_table,z##alloc; \ + .align 4; \ + .word 98b, 99b; \ + .text; \ + .align 4 + +#define EX2(x,y,c,d,e,a,b,z) \ +98: x,y; \ + .section .fixup,z##alloc,z##execinstr; \ + .align 4; \ +99: c, d, e; \ + retl; \ + a, b, %o0; \ + .section __ex_table,z##alloc; \ + .align 4; \ + .word 98b, 99b; \ + .text; \ + .align 4 + +#define EXO2(x,y,z) \ +98: x,##y; \ + .section __ex_table,z##alloc; \ + .align 4; \ + .word 98b, 97f; \ + .text; \ + .align 4 + +#define EXT(start,end,handler,z) \ + .section __ex_table,z##alloc; \ + .align 4; \ + .word start, 0, end, handler; \ + .text; \ + .align 4 + +/* Please do not change following macros unless you change logic used + * in .fixup at the end of this file as well + */ + +/* Both these macros have to start with exactly the same insn */ +#define MOVE_BIGCHUNK(src, dst, offset, t0, t1, t2, t3, t4, t5, t6, t7) \ + ldd [%src + offset + 0x00], %t0; \ + ldd [%src + offset + 0x08], %t2; \ + ldd [%src + offset + 0x10], %t4; \ + ldd [%src + offset + 0x18], %t6; \ + sta %t0, [%dst + offset + 0x00] %asi; \ + sta %t1, [%dst + offset + 0x04] %asi; \ + sta %t2, [%dst + offset + 0x08] %asi; \ + sta %t3, [%dst + offset + 0x0c] %asi; \ + sta %t4, [%dst + offset + 0x10] %asi; \ + sta %t5, [%dst + offset + 0x14] %asi; \ + sta %t6, [%dst + offset + 0x18] %asi; \ + sta %t7, [%dst + offset + 0x1c] %asi; + +#define MOVE_BIGALIGNCHUNK(src, dst, offset, t0, t1, t2, t3, t4, t5, t6, t7) \ + ldd [%src + offset + 0x00], %t0; \ + ldd [%src + offset + 0x08], %t2; \ + ldd [%src + offset + 0x10], %t4; \ + ldd [%src + offset + 0x18], %t6; \ + stda %t0, [%dst + offset + 0x00] %asi; \ + stda %t2, [%dst + offset + 0x08] %asi; \ + stda %t4, [%dst + offset + 0x10] %asi; \ + stda %t6, [%dst + offset + 0x18] %asi; + +#define MOVE_LASTCHUNK(src, dst, offset, t0, t1, t2, t3) \ + ldd [%src - offset - 0x10], %t0; \ + ldd [%src - offset - 0x08], %t2; \ + sta %t0, [%dst - offset - 0x10] %asi; \ + sta %t1, [%dst - offset - 0x0c] %asi; \ + sta %t2, [%dst - offset - 0x08] %asi; \ + sta %t3, [%dst - offset - 0x04] %asi; + +#define MOVE_HALFCHUNK(src, dst, offset, t0, t1, t2, t3) \ + lduh [%src + offset + 0x00], %t0; \ + lduh [%src + offset + 0x02], %t1; \ + lduh [%src + offset + 0x04], %t2; \ + lduh [%src + offset + 0x06], %t3; \ + stha %t0, [%dst + offset + 0x00] %asi; \ + stha %t1, [%dst + offset + 0x02] %asi; \ + stha %t2, [%dst + offset + 0x04] %asi; \ + stha %t3, [%dst + offset + 0x06] %asi; + +#define MOVE_SHORTCHUNK(src, dst, offset, t0, t1) \ + ldub [%src - offset - 0x02], %t0; \ + ldub [%src - offset - 0x01], %t1; \ + stba %t0, [%dst - offset - 0x02] %asi; \ + stba %t1, [%dst - offset - 0x01] %asi; + + .text + .align 4 + + .globl __copy_to_user +dword_align: + andcc %o1, 1, %g0 + be 4f + andcc %o1, 2, %g0 + + ldub [%o1], %g2 + add %o1, 1, %o1 + EXO2(stba %g2, [%o0] %asi,#) + sub %o2, 1, %o2 + bne 3f + add %o0, 1, %o0 + + lduh [%o1], %g2 + add %o1, 2, %o1 + EXO2(stha %g2, [%o0] %asi,#) + sub %o2, 2, %o2 + ba,pt %xcc, 3f + add %o0, 2, %o0 +4: + lduh [%o1], %g2 + add %o1, 2, %o1 + EXO2(stha %g2, [%o0] %asi,#) + sub %o2, 2, %o2 + ba,pt %xcc, 3f + add %o0, 2, %o0 + +__copy_to_user: /* %o0=dst %o1=src %o2=len */ + wr %g0, ASI_S, %asi + xor %o0, %o1, %o4 +1: + andcc %o4, 3, %o5 +2: + bne,pn %icc, cannot_optimize + cmp %o2, 15 + + bleu,pn %xcc, short_aligned_end + andcc %o1, 3, %g0 + + bne,pn %icc, dword_align +3: + 
andcc %o1, 4, %g0 + + be,pt %icc, 2f + mov %o2, %g1 + + ld [%o1], %o4 + sub %g1, 4, %g1 + EXO2(sta %o4, [%o0] %asi,#) + add %o1, 4, %o1 + add %o0, 4, %o0 +2: + andcc %g1, 0xffffffffffffff80, %g7 + be,pn %xcc, 3f + andcc %o0, 4, %g0 + + be,pn %icc, ldd_std + 4 +5: + MOVE_BIGCHUNK(o1, o0, 0x00, o2, o3, o4, o5, g2, g3, g4, g5) + MOVE_BIGCHUNK(o1, o0, 0x20, o2, o3, o4, o5, g2, g3, g4, g5) + MOVE_BIGCHUNK(o1, o0, 0x40, o2, o3, o4, o5, g2, g3, g4, g5) + MOVE_BIGCHUNK(o1, o0, 0x60, o2, o3, o4, o5, g2, g3, g4, g5) +80: + EXT(5b, 80b, 50f,#) + subcc %g7, 128, %g7 + add %o1, 128, %o1 + bne,pt %xcc, 5b + add %o0, 128, %o0 +3: + andcc %g1, 0x70, %g7 + be,pn %icc, copy_user_table_end + andcc %g1, 8, %g0 +100: + rd %pc, %o5 + srl %g7, 1, %o4 + add %g7, %o4, %o4 + add %o1, %g7, %o1 + sub %o5, %o4, %o5 + jmpl %o5 + (copy_user_table_end - 100b), %g0 + add %o0, %g7, %o0 + +copy_user_table: + MOVE_LASTCHUNK(o1, o0, 0x60, g2, g3, g4, g5) + MOVE_LASTCHUNK(o1, o0, 0x50, g2, g3, g4, g5) + MOVE_LASTCHUNK(o1, o0, 0x40, g2, g3, g4, g5) + MOVE_LASTCHUNK(o1, o0, 0x30, g2, g3, g4, g5) + MOVE_LASTCHUNK(o1, o0, 0x20, g2, g3, g4, g5) + MOVE_LASTCHUNK(o1, o0, 0x10, g2, g3, g4, g5) + MOVE_LASTCHUNK(o1, o0, 0x00, g2, g3, g4, g5) +copy_user_table_end: + EXT(copy_user_table, copy_user_table_end, 51f,#) + be,pt %icc, copy_user_last7 + andcc %g1, 4, %g0 + + ldd [%o1], %g2 + add %o0, 8, %o0 + add %o1, 8, %o1 + EX(sta %g2, [%o0 - 0x08] %asi, and %g1, 0xf,#) + EX2(sta %g3, [%o0 - 0x04] %asi, and %g1, 0xf, %g1, sub %g1, 4,#) +copy_user_last7: + be,pn %icc, 1f + andcc %g1, 2, %g0 + + ld [%o1], %g2 + add %o1, 4, %o1 + EX(sta %g2, [%o0] %asi, and %g1, 7,#) + add %o0, 4, %o0 +1: + be,pn %icc, 1f + andcc %g1, 1, %g0 + + lduh [%o1], %g2 + add %o1, 2, %o1 + EX(stha %g2, [%o0] %asi, and %g1, 3,#) + add %o0, 2, %o0 +1: + be,pn %icc, 1f + nop + + ldub [%o1], %g2 + EX(stba %g2, [%o0] %asi, add %g0, 1,#) +1: + retl + clr %o0 + +ldd_std: + MOVE_BIGALIGNCHUNK(o1, o0, 0x00, o2, o3, o4, o5, g2, g3, g4, g5) + MOVE_BIGALIGNCHUNK(o1, o0, 0x20, o2, o3, o4, o5, g2, g3, g4, g5) + MOVE_BIGALIGNCHUNK(o1, o0, 0x40, o2, o3, o4, o5, g2, g3, g4, g5) + MOVE_BIGALIGNCHUNK(o1, o0, 0x60, o2, o3, o4, o5, g2, g3, g4, g5) +81: + EXT(ldd_std, 81b, 52f,#) + subcc %g7, 128, %g7 + add %o1, 128, %o1 + bne,pt %xcc, ldd_std + add %o0, 128, %o0 + + andcc %g1, 0x70, %g7 + be,pn %icc, copy_user_table_end + andcc %g1, 8, %g0 +101: + rd %pc, %o5 + srl %g7, 1, %o4 + add %g7, %o4, %o4 + add %o1, %g7, %o1 + sub %o5, %o4, %o5 + jmpl %o5 + (copy_user_table_end - 101b), %g0 + add %o0, %g7, %o0 + +cannot_optimize: + bleu short_end + cmp %o5, 2 + + bne byte_chunk + and %o2, 0xfffffffffffffff0, %o3 + + andcc %o1, 1, %g0 + be 10f + nop + + ldub [%o1], %g2 + add %o1, 1, %o1 + EXO2(stba %g2, [%o0] %asi,#) + sub %o2, 1, %o2 + andcc %o2, 0xfffffffffffffff0, %o3 + be short_end + add %o0, 1, %o0 +10: + MOVE_HALFCHUNK(o1, o0, 0x00, g2, g3, g4, g5) + MOVE_HALFCHUNK(o1, o0, 0x08, g2, g3, g4, g5) +82: + EXT(10b, 82b, 53f,#) + subcc %o3, 0x10, %o3 + add %o1, 0x10, %o1 + bne 10b + add %o0, 0x10, %o0 + ba,pt %xcc, 2f + and %o2, 0xe, %o3 + +byte_chunk: + MOVE_SHORTCHUNK(o1, o0, -0x02, g2, g3) + MOVE_SHORTCHUNK(o1, o0, -0x04, g2, g3) + MOVE_SHORTCHUNK(o1, o0, -0x06, g2, g3) + MOVE_SHORTCHUNK(o1, o0, -0x08, g2, g3) + MOVE_SHORTCHUNK(o1, o0, -0x0a, g2, g3) + MOVE_SHORTCHUNK(o1, o0, -0x0c, g2, g3) + MOVE_SHORTCHUNK(o1, o0, -0x0e, g2, g3) + MOVE_SHORTCHUNK(o1, o0, -0x10, g2, g3) +83: + EXT(byte_chunk, 83b, 54f,#) + subcc %o3, 0x10, %o3 + add %o1, 0x10, %o1 + bne,pt %xcc, byte_chunk + add %o0, 0x10, %o0 + 
+short_end: + and %o2, 0xe, %o3 +2: + rd %pc, %o5 + sll %o3, 3, %o4 + add %o0, %o3, %o0 + sub %o5, %o4, %o5 + add %o1, %o3, %o1 + jmpl %o5 + (short_table_end - 2b), %g0 + andcc %o2, 1, %g0 +84: + MOVE_SHORTCHUNK(o1, o0, 0x0c, g2, g3) + MOVE_SHORTCHUNK(o1, o0, 0x0a, g2, g3) + MOVE_SHORTCHUNK(o1, o0, 0x08, g2, g3) + MOVE_SHORTCHUNK(o1, o0, 0x06, g2, g3) + MOVE_SHORTCHUNK(o1, o0, 0x04, g2, g3) + MOVE_SHORTCHUNK(o1, o0, 0x02, g2, g3) + MOVE_SHORTCHUNK(o1, o0, 0x00, g2, g3) +short_table_end: + EXT(84b, short_table_end, 55f,#) + be 1f + nop + ldub [%o1], %g2 + EX(stba %g2, [%o0] %asi, add %g0, 1,#) +1: + retl + clr %o0 + +short_aligned_end: + bne short_end + andcc %o2, 8, %g0 + + be 1f + andcc %o2, 4, %g0 + + ld [%o1 + 0x00], %g2 + ld [%o1 + 0x04], %g3 + add %o1, 8, %o1 + EXO2(sta %g2, [%o0 + 0x00] %asi,#) + EX(sta %g3, [%o0 + 0x04] %asi, sub %o2, 4,#) + add %o0, 8, %o0 +1: + ba,pt %xcc, copy_user_last7 + mov %o2, %g1 + + .section .fixup,#alloc,#execinstr + .align 4 +97: + retl + mov %o2, %o0 +/* exception routine sets %g2 to (broken_insn - first_insn)>>2 */ +50: +/* This magic counts how many bytes are left when crash in MOVE_BIGCHUNK + * happens. This is derived from the amount ldd reads, st stores, etc. + * x = g2 % 12; + * o0 = g1 + g7 - ((g2 / 12) * 32 + (x < 4) ? x * 8 : (x - 4) * 4) + */ + cmp %g2, 12 + bcs 1f + cmp %g2, 24 + bcs 2f + cmp %g2, 36 + bcs 3f + nop + sub %g2, 12, %g2 + sub %g7, 32, %g7 +3: + sub %g2, 12, %g2 + sub %g7, 32, %g7 +2: + sub %g2, 12, %g2 + sub %g7, 32, %g7 +1: + cmp %g2, 4 + bcs,a 1f + sll %g2, 3, %g2 + sub %g2, 4, %g2 + sll %g2, 2, %g2 +1: + and %g1, 0x7f, %o0 + add %o0, %g7, %o0 + retl + sub %o0, %g2, %o0 +51: +/* i = 41 - g2; j = i % 6; + * o0 = (g1 & 15) + (i / 6) * 16 + (j < 4) ? (j + 1) * 4 : (j - 3) * 8; + */ + neg %g2 + and %g1, 0xf, %g1 + add %g2, 41, %g2 +1: + cmp %g2, 6 + bcs,a 2f + cmp %g2, 4 + add %g1, 16, %g1 + b 1b + sub %g2, 6, %g2 +2: + bcs,a 3f + inc %g2 + sub %g2, 3, %g2 + b 2f + sll %g2, 3, %g2 +3: + sll %g2, 2, %g2 +2: + retl + add %g1, %g2, %o0 +52: +/* o0 = g1 + g7 - (g2 / 8) * 32 + (x & 3) * 8 */ + and %g2, 0xfffffffffffffff8, %g4 + and %g2, 3, %g2 + sll %g4, 2, %g4 + sll %g2, 3, %g2 + add %g2, %g4, %g2 + b,a 1b +53: +/* o0 = o3 + (o2 & 15) - (g2 & 8) - (g2 & 3) * 2 */ + and %g2, 3, %g4 + and %g2, 0xfffffffffffffff8, %g2 + sll %g4, 1, %g4 + add %g2, %g4, %g2 + and %o2, 0xf, %o0 + add %o0, %o3, %o0 + retl + sub %o0, %g2, %o0 +54: +/* o0 = o3 + (o2 & 15) - (g2 / 4) * 2 - (g2 & 1) */ + srl %g2, 2, %o4 + and %g2, 1, %o1 + sll %o4, 1, %o4 + and %o2, 0xf, %o2 + sub %o3, %o1, %o3 + sub %o2, %o4, %o2 + retl + add %o2, %o3, %o0 +55: +/* o0 = (o2 & 1) + (27 - g2)/4 * 2 + ((27 - g2) & 1) */ + neg %g2 + and %o2, 1, %o2 + add %g2, 27, %g2 + srl %g2, 2, %o1 + and %g2, 1, %g2 + sll %o1, 1, %o1 + add %o2, %g2, %o0 + retl + add %o0, %o1, %o0 diff --git a/arch/sparc64/lib/locks.S b/arch/sparc64/lib/locks.S new file mode 100644 index 000000000..a1154cb6d --- /dev/null +++ b/arch/sparc64/lib/locks.S @@ -0,0 +1,77 @@ +/* $Id: locks.S,v 1.2 1997/03/10 12:28:02 jj Exp $ + * locks.S: SMP low-level lock primitives on Sparc64. + * + * Copyright (C) 1996 David S. Miller (davem@caip.rutgers.edu) + */ + +#include <asm/ptrace.h> + + .text + .align 4 + + .globl __spinlock_waitfor +__spinlock_waitfor: +1: orcc %g2, 0x0, %g0 + bne 1b + ldub [%g1], %g2 + ldstub [%g1], %g2 + jmpl %o7 - 12, %g0 + mov %g5, %o7 + + .globl ___become_idt +___become_idt: +#if 0 /* Don't know how to do this on the Ultra yet... 
*/ +#endif + jmpl %o7 + 8, %g0 + mov %g5, %o7 + +___lk_busy_spin: + orcc %g2, 0, %g0 + bne ___lk_busy_spin + ldub [%g1 + 0], %g2 + b 1f + ldstub [%g1 + 0], %g2 + + .globl ___lock_kernel +___lock_kernel: + addcc %g2, -1, %g2 + rdpr %pil, %g3 + bcs,a 9f + st %g2, [%g6 + AOFF_task_lock_depth] + wrpr 15, %pil + ldstub [%g1 + 0], %g2 +1: orcc %g2, 0, %g0 + bne,a ___lk_busy_spin + ldub [%g1 + 0], %g2 + ldub [%g1 + 2], %g2 + cmp %g2, %g5 + be 2f + stb %g5, [%g1 + 1] + stb %g5, [%g1 + 2] +#ifdef __SMP__ + /* XXX Figure out how to become interrupt receiver in SMP system. */ +#endif +2: mov -1, %g2 + st %g2, [%g6 + AOFF_task_lock_depth] + wrpr %g3, %pil +9: jmpl %o7 + 0x8, %g0 + mov %g5, %o7 + +#undef NO_PROC_ID +#define NO_PROC_ID 0xff + + .globl ___unlock_kernel +___unlock_kernel: + addcc %g2, 1, %g2 + rdpr %pil, %g3 + bne,a 1f + st %g2, [%g6 + AOFF_task_lock_depth] + wrpr 15, %pil + mov NO_PROC_ID, %g2 + stb %g2, [%g1 + 1] + stb %g0, [%g1 + 0] + st %g0, [%g6 + AOFF_task_lock_depth] + wrpr %g3, %pil +1: jmpl %o7 + 0x8, %g0 + mov %g5, %o7 + diff --git a/arch/sparc64/lib/memcmp.S b/arch/sparc64/lib/memcmp.S new file mode 100644 index 000000000..4c08d57c3 --- /dev/null +++ b/arch/sparc64/lib/memcmp.S @@ -0,0 +1,29 @@ +/* $Id: memcmp.S,v 1.2 1997/04/01 03:43:18 davem Exp $ + * Sparc64 optimized memcmp code. + * + * Copyright (C) 1997 Jakub Jelinek (jj@sunsite.mff.cuni.cz) + */ + + .text + .align 4 + .globl __memcmp, memcmp +__memcmp: +memcmp: + brlez,pn %o2, 2f + sub %g0, %o2, %o3 + add %o0, %o2, %o0 + add %o1, %o2, %o1 + ldub [%o0 + %o3], %o4 +1: + ldub [%o1 + %o3], %o5 + sub %o4, %o5, %o4 + brnz,pn %o4, 3f + addcc %o3, 1, %o3 + bne,a,pt %xcc, 1b + ldub [%o0 + %o3], %o4 +2: + retl + clr %o0 +3: + retl + mov %o4, %o0 diff --git a/arch/sparc64/lib/memcpy.S b/arch/sparc64/lib/memcpy.S new file mode 100644 index 000000000..e9462345a --- /dev/null +++ b/arch/sparc64/lib/memcpy.S @@ -0,0 +1,526 @@ +/* memcpy.S: Sparc optimized memcpy, bcopy and memmove code + * Hand optimized from GNU libc's memcpy, bcopy and memmove + * for UltraSparc + * Copyright (C) 1991,1996 Free Software Foundation + * Copyright (C) 1995 Linus Torvalds (Linus.Torvalds@helsinki.fi) + * Copyright (C) 1996 David S. Miller (davem@caip.rutgers.edu) + * Copyright (C) 1996 Eddie C. Dost (ecd@skynet.be) + * Copyright (C) 1996,1997 Jakub Jelinek (jj@sunsite.mff.cuni.cz) + */ + +#include <asm/asi.h> +#include <asm/head.h> + +#ifdef __KERNEL__ + +#define FUNC(x) \ + .globl x; \ + .type x,@function; \ + .align 4; \ +x: + +#define FASTER_ALIGNED + +/* In kernel these functions don't return a value. + * One should use macros in asm/string.h for that purpose. + * We return 0, so that bugs are more apparent. 
+ */ +#define SETUP_RETL +#define PRE_RETL sethi %uhi(KERNBASE), %g4; clr %o0 +#define RETL_INSN sllx %g4, 32, %g4 + +#else + +/* libc */ + +#define FASTER_ALIGNED + +#ifdef DEBUG +#define FUNC(x) \ + .globl jj##x##1; \ + .type jj##x##1,@function; \ + .align 4; \ +jj##x##1: +#else +#include "DEFS.h" +#endif + +#define SETUP_RETL mov %o0, %g6 +#define PRE_RETL +#define RETL_INSN mov %g6, %o0 + +#endif + +#define MOVE_BIGCHUNK(src, dst, offset, t0, t1, t2, t3, t4, t5, t6, t7) \ + ldd [%src + offset + 0x00], %t0; \ + ldd [%src + offset + 0x08], %t2; \ + ldd [%src + offset + 0x10], %t4; \ + ldd [%src + offset + 0x18], %t6; \ + stw %t0, [%dst + offset + 0x00]; \ + stw %t1, [%dst + offset + 0x04]; \ + stw %t2, [%dst + offset + 0x08]; \ + stw %t3, [%dst + offset + 0x0c]; \ + stw %t4, [%dst + offset + 0x10]; \ + stw %t5, [%dst + offset + 0x14]; \ + stw %t6, [%dst + offset + 0x18]; \ + stw %t7, [%dst + offset + 0x1c]; + +#define MOVE_BIGALIGNCHUNK(src, dst, offset, t0, t1, t2, t3, t4, t5, t6, t7) \ + ldx [%src + offset + 0x00], %t0; \ + ldx [%src + offset + 0x08], %t1; \ + ldx [%src + offset + 0x10], %t2; \ + ldx [%src + offset + 0x18], %t3; \ + ldx [%src + offset + 0x20], %t4; \ + ldx [%src + offset + 0x28], %t5; \ + ldx [%src + offset + 0x30], %t6; \ + ldx [%src + offset + 0x38], %t7; \ + stx %t0, [%dst + offset + 0x00]; \ + stx %t1, [%dst + offset + 0x08]; \ + stx %t2, [%dst + offset + 0x10]; \ + stx %t3, [%dst + offset + 0x18]; \ + stx %t4, [%dst + offset + 0x20]; \ + stx %t5, [%dst + offset + 0x28]; \ + stx %t6, [%dst + offset + 0x30]; \ + stx %t7, [%dst + offset + 0x38]; + +#define MOVE_LASTCHUNK(src, dst, offset, t0, t1, t2, t3) \ + ldd [%src - offset - 0x10], %t0; \ + ldd [%src - offset - 0x08], %t2; \ + stw %t0, [%dst - offset - 0x10]; \ + stw %t1, [%dst - offset - 0x0c]; \ + stw %t2, [%dst - offset - 0x08]; \ + stw %t3, [%dst - offset - 0x04]; + +#define MOVE_LASTALIGNCHUNK(src, dst, offset, t0, t1) \ + ldx [%src - offset - 0x10], %t0; \ + ldx [%src - offset - 0x08], %t1; \ + stx %t0, [%dst - offset - 0x10]; \ + stx %t1, [%dst - offset - 0x08]; + +#define MOVE_SHORTCHUNK(src, dst, offset, t0, t1) \ + ldub [%src - offset - 0x02], %t0; \ + ldub [%src - offset - 0x01], %t1; \ + stb %t0, [%dst - offset - 0x02]; \ + stb %t1, [%dst - offset - 0x01]; + + .text + .align 4 + +FUNC(bcopy) + + mov %o0, %o3 + mov %o1, %o0 + mov %o3, %o1 + brgez,a,pt %o2, 1f + cmp %o0, %o1 + + retl + nop ! Only bcopy returns here and it retuns void... + +#ifdef __KERNEL__ +FUNC(amemmove) +FUNC(__memmove) +#endif +FUNC(memmove) + + cmp %o0, %o1 +1: + SETUP_RETL + bleu,pt %xcc, 9f + sub %o0, %o1, %o4 + + add %o1, %o2, %o3 + cmp %o3, %o0 + bleu,pt %xcc, 0f + andcc %o4, 3, %o5 + + add %o1, %o2, %o1 + add %o0, %o2, %o0 + sub %o1, 1, %o1 + sub %o0, 1, %o0 + +1: + ldub [%o1], %o4 + subcc %o2, 1, %o2 + sub %o1, 1, %o1 + stb %o4, [%o0] + bne,pt %icc, 1b + sub %o0, 1, %o0 + + PRE_RETL + retl + RETL_INSN + +#ifdef __KERNEL__ +FUNC(__memcpy) +#endif +FUNC(memcpy) /* %o0=dst %o1=src %o2=len */ + + sub %o0, %o1, %o4 + SETUP_RETL +9: + andcc %o4, 3, %o5 +0: + bne,pn %icc, 86f + cmp %o2, 15 + + bleu,pn %xcc, 90f + andcc %o1, 3, %g0 + + be,a,pt %icc, 3f ! 
check if we need to align + andcc %o1, 4, %g0 + + andcc %o1, 1, %g0 + be,pn %icc, 4f + andcc %o1, 2, %g0 + + ldub [%o1], %g2 + add %o1, 1, %o1 + sub %o2, 1, %o2 + stb %g2, [%o0] + bne,pn %icc, 5f + add %o0, 1, %o0 +4: + lduh [%o1], %g2 + add %o1, 2, %o1 + sub %o2, 2, %o2 + sth %g2, [%o0] + add %o0, 2, %o0 +5: + andcc %o1, 4, %g0 +3: + be,pn %icc, 2f + mov %o2, %g1 + + lduw [%o1], %o4 + sub %g1, 4, %g1 + stw %o4, [%o0] + add %o1, 4, %o1 + add %o0, 4, %o0 +2: + andcc %g1, -128, %g7 + be,pn %xcc, 3f + andcc %o0, 4, %g0 + + be,a,pn %icc, 82f + 4 + ldx [%o1], %o2 +5: + MOVE_BIGCHUNK(o1, o0, 0x00, o2, o3, o4, o5, g2, g3, g4, g5) + MOVE_BIGCHUNK(o1, o0, 0x20, o2, o3, o4, o5, g2, g3, g4, g5) + MOVE_BIGCHUNK(o1, o0, 0x40, o2, o3, o4, o5, g2, g3, g4, g5) + MOVE_BIGCHUNK(o1, o0, 0x60, o2, o3, o4, o5, g2, g3, g4, g5) + subcc %g7, 128, %g7 + add %o1, 128, %o1 + bne,pt %xcc, 5b + add %o0, 128, %o0 +3: + andcc %g1, 0x70, %g7 + be,pn %icc, 80f + andcc %g1, 8, %g0 +79: + rd %pc, %o5 + srl %g7, 1, %o4 + add %g7, %o4, %o4 + add %o1, %g7, %o1 + sub %o5, %o4, %o5 + jmpl %o5 + %lo(80f-79b), %g0 + add %o0, %g7, %o0 + + MOVE_LASTCHUNK(o1, o0, 0x60, g2, g3, g4, g5) + MOVE_LASTCHUNK(o1, o0, 0x50, g2, g3, g4, g5) + MOVE_LASTCHUNK(o1, o0, 0x40, g2, g3, g4, g5) + MOVE_LASTCHUNK(o1, o0, 0x30, g2, g3, g4, g5) + MOVE_LASTCHUNK(o1, o0, 0x20, g2, g3, g4, g5) + MOVE_LASTCHUNK(o1, o0, 0x10, g2, g3, g4, g5) + MOVE_LASTCHUNK(o1, o0, 0x00, g2, g3, g4, g5) + +80: /* memcpy_table_end */ + be,pt %icc, 81f + andcc %g1, 4, %g0 + + ldd [%o1], %g2 + add %o0, 8, %o0 + stw %g2, [%o0 - 0x08] + add %o1, 8, %o1 + stw %g3, [%o0 - 0x04] + +81: /* memcpy_last7 */ + + be,pt %icc, 1f + andcc %g1, 2, %g0 + + lduw [%o1], %g2 + add %o1, 4, %o1 + stw %g2, [%o0] + add %o0, 4, %o0 +1: + be,pt %icc, 1f + andcc %g1, 1, %g0 + + lduh [%o1], %g2 + add %o1, 2, %o1 + sth %g2, [%o0] + add %o0, 2, %o0 +1: + be,pt %icc, 1f + nop + + ldub [%o1], %g2 + stb %g2, [%o0] +1: + PRE_RETL + retl + RETL_INSN + +82: /* ldx_stx */ + MOVE_BIGALIGNCHUNK(o1, o0, 0x00, o2, o3, o4, o5, g2, g3, g4, g5) + MOVE_BIGALIGNCHUNK(o1, o0, 0x40, o2, o3, o4, o5, g2, g3, g4, g5) + subcc %g7, 128, %g7 + add %o1, 128, %o1 + bne,pt %xcc, 82b + add %o0, 128, %o0 + +#ifndef FASTER_ALIGNED + + andcc %g1, 0x70, %g7 + be,pn %icc, 80b + andcc %g1, 8, %g0 +83: + rd %pc, %o5 + srl %g7, 1, %o4 + add %g7, %o4, %o4 + add %o1, %g7, %o1 + sub %o5, %o4, %o5 + jmpl %o5 + %lo(80b - 83b), %g0 + add %o0, %g7, %o0 + +#else /* FASTER_ALIGNED */ + + andcc %g1, 0x70, %g7 + be,pn %icc, 84f + andcc %g1, 8, %g0 +83: + rd %pc, %o5 + add %o1, %g7, %o1 + sub %o5, %g7, %o5 + jmpl %o5 + %lo(84f - 83b), %g0 + add %o0, %g7, %o0 + + MOVE_LASTALIGNCHUNK(o1, o0, 0x60, g2, g3) + MOVE_LASTALIGNCHUNK(o1, o0, 0x50, g2, g3) + MOVE_LASTALIGNCHUNK(o1, o0, 0x40, g2, g3) + MOVE_LASTALIGNCHUNK(o1, o0, 0x30, g2, g3) + MOVE_LASTALIGNCHUNK(o1, o0, 0x20, g2, g3) + MOVE_LASTALIGNCHUNK(o1, o0, 0x10, g2, g3) + MOVE_LASTALIGNCHUNK(o1, o0, 0x00, g2, g3) + +84: /* amemcpy_table_end */ + be,pt %icc, 85f + andcc %g1, 4, %g0 + + ldx [%o1], %g2 + add %o1, 8, %o1 + stx %g2, [%o0] + add %o0, 8, %o0 +85: /* amemcpy_last7 */ + be,pt %icc, 1f + andcc %g1, 2, %g0 + + lduw [%o1], %g2 + add %o1, 4, %o1 + stw %g2, [%o0] + add %o0, 4, %o0 +1: + be,pt %icc, 1f + andcc %g1, 1, %g0 + + lduh [%o1], %g2 + add %o1, 2, %o1 + sth %g2, [%o0] + add %o0, 2, %o0 +1: + be,pt %icc, 1f + nop + + ldub [%o1], %g2 + stb %g2, [%o0] +1: + PRE_RETL + retl + RETL_INSN + +#endif /* FASTER_ALIGNED */ + +86: /* non_aligned */ + cmp %o2, 15 + bleu,pn %xcc, 88f + + andcc %o0, 3, %g0 + 
be,pn %icc, 61f + andcc %o0, 1, %g0 + be,pn %icc, 60f + andcc %o0, 2, %g0 + + ldub [%o1], %g5 + add %o1, 1, %o1 + stb %g5, [%o0] + sub %o2, 1, %o2 + bne,pn %icc, 61f + add %o0, 1, %o0 +60: + ldub [%o1], %g3 + add %o1, 2, %o1 + stb %g3, [%o0] + sub %o2, 2, %o2 + ldub [%o1 - 1], %g3 + add %o0, 2, %o0 + stb %g3, [%o0 - 1] +61: + and %o1, 3, %g2 + and %o2, 0xc, %g3 + and %o1, -4, %o1 + cmp %g3, 4 + sll %g2, 3, %g4 + mov 32, %g2 + be,pn %icc, 4f + sub %g2, %g4, %g7 + + blu,pn %icc, 3f + cmp %g3, 0x8 + + be,pn %icc, 2f + srl %o2, 2, %g3 + + lduw [%o1], %o3 + add %o0, -8, %o0 + lduw [%o1 + 4], %o4 + ba,pt %xcc, 8f + add %g3, 1, %g3 +2: + lduw [%o1], %o4 + add %o0, -12, %o0 + lduw [%o1 + 4], %o5 + add %g3, 2, %g3 + ba,pt %xcc, 9f + add %o1, -4, %o1 +3: + lduw [%o1], %g1 + add %o0, -4, %o0 + lduw [%o1 + 4], %o3 + srl %o2, 2, %g3 + ba,pt %xcc, 7f + add %o1, 4, %o1 +4: + lduw [%o1], %o5 + cmp %o2, 7 + lduw [%o1 + 4], %g1 + srl %o2, 2, %g3 + bleu,pn %xcc, 10f + add %o1, 8, %o1 + + lduw [%o1], %o3 + add %g3, -1, %g3 +5: + sll %o5, %g4, %g2 + srl %g1, %g7, %g5 + or %g2, %g5, %g2 + stw %g2, [%o0] +7: + lduw [%o1 + 4], %o4 + sll %g1, %g4, %g2 + srl %o3, %g7, %g5 + or %g2, %g5, %g2 + stw %g2, [%o0 + 4] +8: + lduw [%o1 + 8], %o5 + sll %o3, %g4, %g2 + srl %o4, %g7, %g5 + or %g2, %g5, %g2 + stw %g2, [%o0 + 8] +9: + lduw [%o1 + 12], %g1 + sll %o4, %g4, %g2 + srl %o5, %g7, %g5 + addcc %g3, -4, %g3 + or %g2, %g5, %g2 + add %o1, 16, %o1 + stw %g2, [%o0 + 12] + add %o0, 16, %o0 + bne,a,pt %xcc, 5b + lduw [%o1], %o3 +10: + sll %o5, %g4, %g2 + srl %g1, %g7, %g5 + srl %g7, 3, %g3 + or %g2, %g5, %g2 + sub %o1, %g3, %o1 + andcc %o2, 2, %g0 + stw %g2, [%o0] + be,pt %icc, 1f + andcc %o2, 1, %g0 + + ldub [%o1], %g2 + add %o1, 2, %o1 + stb %g2, [%o0 + 4] + add %o0, 2, %o0 + ldub [%o1 - 1], %g2 + stb %g2, [%o0 + 3] +1: + be,pt %icc, 1f + nop + + ldub [%o1], %g2 + stb %g2, [%o0 + 4] +1: + PRE_RETL + retl + RETL_INSN + +88: /* short_end */ + + and %o2, 0xe, %o3 +20: + rd %pc, %o5 + sll %o3, 3, %o4 + add %o0, %o3, %o0 + sub %o5, %o4, %o5 + add %o1, %o3, %o1 + jmpl %o5 + %lo(89f - 20b), %g0 + andcc %o2, 1, %g0 + + MOVE_SHORTCHUNK(o1, o0, 0x0c, g2, g3) + MOVE_SHORTCHUNK(o1, o0, 0x0a, g2, g3) + MOVE_SHORTCHUNK(o1, o0, 0x08, g2, g3) + MOVE_SHORTCHUNK(o1, o0, 0x06, g2, g3) + MOVE_SHORTCHUNK(o1, o0, 0x04, g2, g3) + MOVE_SHORTCHUNK(o1, o0, 0x02, g2, g3) + MOVE_SHORTCHUNK(o1, o0, 0x00, g2, g3) + +89: /* short_table_end */ + + be,pt %icc, 1f + nop + + ldub [%o1], %g2 + stb %g2, [%o0] +1: + PRE_RETL + retl + RETL_INSN + +90: /* short_aligned_end */ + bne,pn %xcc, 88b + andcc %o2, 8, %g0 + + be,pt %icc, 1f + andcc %o2, 4, %g0 + + lduw [%o1 + 0x00], %g2 + lduw [%o1 + 0x04], %g3 + add %o1, 8, %o1 + stw %g2, [%o0 + 0x00] + stw %g3, [%o0 + 0x04] + add %o0, 8, %o0 +1: + ba,pt %xcc, 81b + mov %o2, %g1 diff --git a/arch/sparc64/lib/memscan.S b/arch/sparc64/lib/memscan.S new file mode 100644 index 000000000..83abe4040 --- /dev/null +++ b/arch/sparc64/lib/memscan.S @@ -0,0 +1,116 @@ +/* $Id: memscan.S,v 1.1 1997/03/14 21:04:24 jj Exp $ + * memscan.S: Optimized memscan for the Sparc64. + * + * Copyright (C) 1997 Jakub Jelinek (jj@sunsite.mff.cuni.cz) + */ + +/* In essence, this is just a fancy strlen. 
*/ + +#define LO_MAGIC 0x01010101 +#define HI_MAGIC 0x80808080 + + .text + .align 4 + .globl __memscan_zero, __memscan_generic + .globl memscan +__memscan_zero: + /* %o0 = addr, %o1 = size */ + brlez,pn %o1, 0f + andcc %o0, 3, %g0 + be,pt %icc, 9f + sethi %hi(HI_MAGIC), %o4 + ldub [%o0], %o5 + subcc %o1, 1, %o1 + brz,pn %o5, 10f + add %o0, 1, %o0 + be,pn %xcc, 0f + andcc %o0, 3, %g0 + be,pn %icc, 4f + or %o4, %lo(HI_MAGIC), %o3 + ldub [%o0], %o5 + subcc %o1, 1, %o1 + brz,pn %o5, 10f + add %o0, 1, %o0 + be,pn %xcc, 0f + andcc %o0, 3, %g0 + be,pt %icc, 5f + sethi %hi(LO_MAGIC), %o4 + ldub [%o0], %o5 + subcc %o1, 1, %o1 + brz,pn %o5, 10f + add %o0, 1, %o0 + be,pn %xcc, 0f + or %o4, %lo(LO_MAGIC), %o2 + ba,pt %xcc, 2f + ld [%o0], %o5 +9: + or %o4, %lo(HI_MAGIC), %o3 +4: + sethi %hi(LO_MAGIC), %o4 +5: + or %o4, %lo(LO_MAGIC), %o2 + ld [%o0], %o5 +2: + sub %o5, %o2, %o4 + sub %o1, 4, %o1 + andcc %o4, %o3, %g0 + be,pn %icc, 1f + add %o0, 4, %o0 + brgz,pt %o1, 2b + ld [%o0], %o5 + + retl + add %o0, %o1, %o0 +1: + /* Check every byte. */ + srl %o5, 24, %g5 + andcc %g5, 0xff, %g0 + be,pn %icc, 1f + add %o0, -4, %o4 + srl %o5, 16, %g5 + andcc %g5, 0xff, %g0 + be,pn %icc, 1f + add %o4, 1, %o4 + srl %o5, 8, %g5 + andcc %g5, 0xff, %g0 + be,pn %icc, 1f + add %o4, 1, %o4 + andcc %o5, 0xff, %g0 + be,pn %icc, 1f + add %o4, 1, %o4 + brgz,pt %o1, 2b + ld [%o0], %o5 +1: + add %o0, %o1, %o0 + cmp %o4, %o0 + retl + movle %xcc, %o4, %o0 +0: + retl + nop +10: + retl + sub %o0, 1, %o0 + +memscan: +__memscan_generic: + /* %o0 = addr, %o1 = c, %o2 = size */ + brz,pn %o2, 3f + add %o0, %o2, %o3 + ldub [%o0], %o5 + sub %g0, %o2, %o4 +1: + cmp %o5, %o1 + be,pn %icc, 2f + addcc %o4, 1, %o4 + bne,a,pt %xcc, 1b + ldub [%o3 + %o4], %o5 + retl + /* The delay slot is the same as the next insn, this is just to make it look more awful */ +2: + add %o3, %o4, %o0 + retl + sub %o0, 1, %o0 +3: + retl + nop diff --git a/arch/sparc64/lib/memset.S b/arch/sparc64/lib/memset.S new file mode 100644 index 000000000..55de4ea9d --- /dev/null +++ b/arch/sparc64/lib/memset.S @@ -0,0 +1,196 @@ +/* linux/arch/sparc64/lib/memset.S: Sparc optimized memset, bzero and clear_user code + * Copyright (C) 1991,1996 Free Software Foundation + * Copyright (C) 1996 David S. Miller (davem@caip.rutgers.edu) + * Copyright (C) 1996,1997 Jakub Jelinek (jj@sunsite.mff.cuni.cz) + * + * Returns 0, if ok, and number of bytes not yet set if exception + * occurs and we were called as clear_user. + */ + +#include <asm/asi.h> +#include <asm/ptrace.h> + +#define EX(x,y,a,b,z) \ +98: x,y; \ + .section .fixup,z##alloc,z##execinstr; \ + .align 4; \ +99: ba,pt %xcc, 30f; \ + a, b, %o0; \ + .section __ex_table,z##alloc; \ + .align 4; \ + .word 98b, 99b; \ + .text; \ + .align 4 + +#define EXT(start,end,handler,z) \ + .section __ex_table,z##alloc; \ + .align 4; \ + .word start, 0, end, handler; \ + .text; \ + .align 4 + +/* Please don't change these macros, unless you change the logic + * in the .fixup section below as well. + * Store 64 bytes at (BASE + OFFSET) using value SOURCE. 
*/ +#define ZERO_BIG_BLOCK(base, offset, source) \ + stxa source, [base + offset + 0x00] %asi; \ + stxa source, [base + offset + 0x08] %asi; \ + stxa source, [base + offset + 0x10] %asi; \ + stxa source, [base + offset + 0x18] %asi; \ + stxa source, [base + offset + 0x20] %asi; \ + stxa source, [base + offset + 0x28] %asi; \ + stxa source, [base + offset + 0x30] %asi; \ + stxa source, [base + offset + 0x38] %asi; + +#define ZERO_LAST_BLOCKS(base, offset, source) \ + stxa source, [base - offset - 0x38] %asi; \ + stxa source, [base - offset - 0x30] %asi; \ + stxa source, [base - offset - 0x28] %asi; \ + stxa source, [base - offset - 0x20] %asi; \ + stxa source, [base - offset - 0x18] %asi; \ + stxa source, [base - offset - 0x10] %asi; \ + stxa source, [base - offset - 0x08] %asi; \ + stxa source, [base - offset - 0x00] %asi; + + .text + .align 4 + + .globl __bzero, __memset, __bzero_noasi + .globl memset, __memset_start, __memset_end +__memset_start: +__memset: +memset: + and %o1, 0xff, %g3 + sll %g3, 8, %g2 + or %g3, %g2, %g3 + sll %g3, 16, %g2 + or %g3, %g2, %g3 + mov %o2, %o1 + wr %g0, ASI_P, %asi + sllx %g3, 32, %g2 + ba,pt %xcc, 1f + or %g3, %g2, %g3 +__bzero: + wr %g0, ASI_P, %asi +__bzero_noasi: + mov %g0, %g3 +1: + cmp %o1, 7 + bleu,pn %xcc, 7f + andcc %o0, 3, %o2 + + be,a,pt %icc, 4f + andcc %o0, 4, %g0 + + cmp %o2, 3 + be,pn %icc, 2f + EX(stba %g3, [%o0] %asi, sub %o1, 0,#) + + cmp %o2, 2 + be,pt %icc, 2f + EX(stba %g3, [%o0 + 0x01] %asi, sub %o1, 1,#) + + EX(stba %g3, [%o0 + 0x02] %asi, sub %o1, 2,#) +2: + sub %o2, 4, %o2 + sub %o0, %o2, %o0 + add %o1, %o2, %o1 + andcc %o0, 4, %g0 +4: + be,a,pt %icc, 2f + andncc %o1, 0x7f, %o3 + + EX(sta %g3, [%o0] %asi, sub %o1, 0,#) + sub %o1, 4, %o1 + add %o0, 4, %o0 + andncc %o1, 0x7f, %o3 ! Now everything is 8 aligned and o1 is len to run +2: + be,pn %xcc, 9f + andcc %o1, 0x78, %o2 +10: + ZERO_BIG_BLOCK(%o0, 0x00, %g3) + subcc %o3, 128, %o3 + ZERO_BIG_BLOCK(%o0, 0x40, %g3) +11: + EXT(10b, 11b, 20f,#) + bne,pt %xcc, 10b + add %o0, 128, %o0 + + tst %o2 +9: + be,pn %xcc, 13f + andcc %o1, 7, %o1 +14: + rd %pc, %o4 + srl %o2, 1, %o3 + sub %o4, %o3, %o4 + jmpl %o4 + (13f - 14b), %g0 + add %o0, %o2, %o0 +12: + ZERO_LAST_BLOCKS(%o0, 0x48, %g3) + ZERO_LAST_BLOCKS(%o0, 0x08, %g3) +13: + be,pn %icc, 8f + andcc %o1, 4, %g0 + + be,pn %icc, 1f + andcc %o1, 2, %g0 + + EX(sta %g3, [%o0] %asi, and %o1, 7,#) + add %o0, 4, %o0 +1: + be,pn %icc, 1f + andcc %o1, 1, %g0 + + EX(stha %g3, [%o0] %asi, and %o1, 3,#) + add %o0, 2, %o0 +1: + bne,a,pn %icc, 8f + EX(stba %g3, [%o0] %asi, and %o1, 1,#) +8: + retl + clr %o0 +7: + be,pn %icc, 13b + orcc %o1, 0, %g0 + + be,pn %icc, 0f +8: + add %o0, 1, %o0 + subcc %o1, 1, %o1 + bne,a,pt %icc, 8b + EX(stba %g3, [%o0 - 1] %asi, add %o1, 1,#) +0: + retl + clr %o0 +__memset_end: + + .section .fixup,#alloc,#execinstr + .align 4 +20: + cmp %g2, 8 + bleu,pn %xcc, 1f + and %o1, 0x7f, %o1 + sub %g2, 9, %g2 + add %o3, 64, %o3 +1: + sll %g2, 3, %g2 + add %o3, %o1, %o0 + ba,pt %xcc, 30f + sub %o0, %g2, %o0 +21: + mov 8, %o0 + and %o1, 7, %o1 + sub %o0, %g2, %o0 + sll %o0, 3, %o0 + ba,pt %xcc, 30f + add %o0, %o1, %o0 +30: +/* %o4 is faulting address, %o5 is %pc where fault occured */ + save %sp, -160, %sp + mov %i5, %o0 + mov %i7, %o1 + call lookup_fault + mov %i4, %o2 + ret + restore diff --git a/arch/sparc64/lib/strlen.S b/arch/sparc64/lib/strlen.S new file mode 100644 index 000000000..5f2ec6bb4 --- /dev/null +++ b/arch/sparc64/lib/strlen.S @@ -0,0 +1,77 @@ +/* strlen.S: Sparc64 optimized strlen code + * Hand optimized from GNU libc's 
strlen + * Copyright (C) 1991,1996 Free Software Foundation + * Copyright (C) 1996 David S. Miller (davem@caip.rutgers.edu) + * Copyright (C) 1996, 1997 Jakub Jelinek (jj@sunsite.mff.cuni.cz) + */ + +#define LO_MAGIC 0x01010101 +#define HI_MAGIC 0x80808080 + + .align 4 + .global strlen +strlen: + mov %o0, %o1 + andcc %o0, 3, %g0 + be,pt %icc, 9f + sethi %hi(HI_MAGIC), %o4 + ldub [%o0], %o5 + brz,pn %o5, 11f + add %o0, 1, %o0 + andcc %o0, 3, %g0 + be,pn %icc, 4f + or %o4, %lo(HI_MAGIC), %o3 + ldub [%o0], %o5 + brz,pn %o5, 12f + add %o0, 1, %o0 + andcc %o0, 3, %g0 + be,pt %icc, 5f + sethi %hi(LO_MAGIC), %o4 + ldub [%o0], %o5 + brz,pn %o5, 13f + add %o0, 1, %o0 + ba,pt %icc, 8f + or %o4, %lo(LO_MAGIC), %o2 +9: + or %o4, %lo(HI_MAGIC), %o3 +4: + sethi %hi(LO_MAGIC), %o4 +5: + or %o4, %lo(LO_MAGIC), %o2 +8: + ld [%o0], %o5 +2: + sub %o5, %o2, %o4 + andcc %o4, %o3, %g0 + be,pt %icc, 8b + add %o0, 4, %o0 + + /* Check every byte. */ + srl %o5, 24, %g5 + andcc %g5, 0xff, %g0 + be,pn %icc, 1f + add %o0, -4, %o4 + srl %o5, 16, %g5 + andcc %g5, 0xff, %g0 + be,pn %icc, 1f + add %o4, 1, %o4 + srl %o5, 8, %g5 + andcc %g5, 0xff, %g0 + be,pn %icc, 1f + add %o4, 1, %o4 + andcc %o5, 0xff, %g0 + bne,a,pt %icc, 2b + ld [%o0], %o5 + add %o4, 1, %o4 +1: + retl + sub %o4, %o1, %o0 +11: + retl + mov 0, %o0 +12: + retl + mov 1, %o0 +13: + retl + mov 2, %o0 diff --git a/arch/sparc64/lib/strlen_user.S b/arch/sparc64/lib/strlen_user.S new file mode 100644 index 000000000..24bea73fd --- /dev/null +++ b/arch/sparc64/lib/strlen_user.S @@ -0,0 +1,99 @@ +/* strlen_user.S: Sparc64 optimized strlen_user code + * + * Return length of string in userspace including terminating 0 + * or 0 for error + * + * Copyright (C) 1991,1996 Free Software Foundation + * Copyright (C) 1996 David S. Miller (davem@caip.rutgers.edu) + * Copyright (C) 1996,1997 Jakub Jelinek (jj@sunsite.mff.cuni.cz) + */ + +#define LO_MAGIC 0x01010101 +#define HI_MAGIC 0x80808080 + + .align 4 + .global __strlen_user +__strlen_user: + mov %o0, %o1 + andcc %o0, 3, %g0 + be,pt %icc, 9f + sethi %hi(HI_MAGIC), %o4 +10: + ldub [%o0], %o5 + brz,pn %o5, 21f + add %o0, 1, %o0 + andcc %o0, 3, %g0 + be,pn %icc, 4f + or %o4, %lo(HI_MAGIC), %o3 +11: + ldub [%o0], %o5 + brz,pn %o5, 22f + add %o0, 1, %o0 + andcc %o0, 3, %g0 + be,pt %icc, 5f + sethi %hi(LO_MAGIC), %o4 +12: + ldub [%o0], %o5 + brz,pn %o5, 23f + add %o0, 1, %o0 + ba,pt %icc, 13f + or %o4, %lo(LO_MAGIC), %o2 +9: + or %o4, %lo(HI_MAGIC), %o3 +4: + sethi %hi(LO_MAGIC), %o4 +5: + or %o4, %lo(LO_MAGIC), %o2 +13: + ld [%o0], %o5 +2: + sub %o5, %o2, %o4 + andcc %o4, %o3, %g0 + be,pt %icc, 13b + add %o0, 4, %o0 + + /* Check every byte. */ + srl %o5, 24, %g5 + andcc %g5, 0xff, %g0 + be,pn %icc, 1f + add %o0, -3, %o4 + srl %o5, 16, %g5 + andcc %g5, 0xff, %g0 + be,pn %icc, 1f + add %o4, 1, %o4 + srl %o5, 8, %g5 + andcc %g5, 0xff, %g0 + be,pn %icc, 1f + add %o4, 1, %o4 + andcc %o5, 0xff, %g0 + bne,a,pt %icc, 2b +14: + ld [%o0], %o5 + add %o4, 1, %o4 +1: + retl + sub %o4, %o1, %o0 +21: + retl + mov 1, %o0 +22: + retl + mov 2, %o0 +23: + retl + mov 3, %o0 + + .section .fixup,#alloc,#execinstr + .align 4 +30: + retl + clr %o0 + + .section __ex_table,#alloc + .align 4 + + .word 10b, 30b + .word 11b, 30b + .word 12b, 30b + .word 13b, 30b + .word 14b, 30b diff --git a/arch/sparc64/lib/strncmp.S b/arch/sparc64/lib/strncmp.S new file mode 100644 index 000000000..474ba7296 --- /dev/null +++ b/arch/sparc64/lib/strncmp.S @@ -0,0 +1,31 @@ +/* $Id: strncmp.S,v 1.2 1997/03/11 17:51:44 jj Exp $ + * Sparc64 optimized strncmp code. 
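The strncmp loop below stops at the first mismatch or at a NUL in the first string and returns the difference of the bytes it stopped on; it returns zero when the count is not positive, or when the count runs out with every byte equal. A plain C rendering of that contract, offered as a sketch only; the name is illustrative:

#include <stddef.h>

/* Compare at most n bytes; stop at a mismatch or at a NUL in s1 and
 * return the difference of the bytes compared last, 0 if none differ. */
static int strncmp_sketch(const char *s1, const char *s2, size_t n)
{
        unsigned char a = 0, b = 0;

        while (n--) {
                a = (unsigned char)*s1++;
                b = (unsigned char)*s2++;
                if (a == '\0' || a != b)
                        break;
        }
        return a - b;
}

In the assembly the first byte of the first string is fetched through ASI_PNF in the delay slot of the count check, presumably so that a load issued before the count is known to be positive cannot fault.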
+ * + * Copyright (C) 1997 Jakub Jelinek (jj@sunsite.mff.cuni.cz) + */ + +#include <asm/asi.h> + + .text + .align 4 + .global __strncmp, strncmp +__strncmp: +strncmp: + brlez,pn %o2, 3f + lduba [%o0] (ASI_PNF), %o3 +1: + add %o0, 1, %o0 + ldub [%o1], %o4 + brz,pn %o3, 2f + add %o1, 1, %o1 + cmp %o3, %o4 + bne,pn %icc, 2f + subcc %o2, 1, %o2 + bne,a,pt %xcc, 1b + ldub [%o0], %o3 +2: + retl + sub %o3, %o4, %o0 +3: + retl + clr %o0 diff --git a/arch/sparc64/lib/strncpy_from_user.S b/arch/sparc64/lib/strncpy_from_user.S new file mode 100644 index 000000000..05a48eb5a --- /dev/null +++ b/arch/sparc64/lib/strncpy_from_user.S @@ -0,0 +1,54 @@ +/* strncpy_from_user.S: Sparc64 strncpy from userspace. + * + * Copyright (C) 1997 Jakub Jelinek (jj@sunsite.mff.cuni.cz) + */ + +#include <asm/asi.h> +#include <asm/errno.h> + + .text + .align 4 + + /* Must return: + * + * -EFAULT for an exception + * count if we hit the buffer limit + * bytes copied if we hit a null byte + */ + + .globl __strncpy_from_user +__strncpy_from_user: + /* %o0=dest, %o1=src, %o2=count */ + brlez,pn %o2, 3f + add %o1, %o2, %o1 + sub %g0, %o2, %o3 + add %o0, %o2, %o0 +10: + ldub [%o1 + %o3], %o4 +1: + brz,pn %o4, 2f + stb %o4, [%o0 + %o3] + addcc %o3, 1, %o3 + bne,pt %xcc, 1b +11: + ldub [%o1 + %o3], %o4 + retl + mov %o2, %o0 +2: + add %o3, 1, %o3 + retl + add %o2, %o3, %o0 +3: + retl + clr %o0 + + .section .fixup,#alloc,#execinstr + .align 4 +4: + retl + mov -EFAULT, %o0 + + .section __ex_table,#alloc + .align 4 + .word 10b, 4b + .word 11b, 4b |
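The return contract spelled out in the strncpy_from_user header above (count when the buffer limit is reached, bytes copied when a NUL terminator is found, -EFAULT when the user access faults) can be pictured in C as below. The fault test is a stand-in predicate, since the real routine gets its -EFAULT from the __ex_table fixup rather than from an explicit check, and the function and parameter names are illustrative:

#include <errno.h>
#include <stddef.h>

/* Copy at most 'count' bytes from 'src' to 'dst', modelling
 * __strncpy_from_user's return values:
 *   -EFAULT  a (simulated) faulting access to the source,
 *   count    the limit was reached before a NUL,
 *   n        otherwise, the number of bytes stored; the terminating
 *            NUL is stored and counted, mirroring the arithmetic in
 *            the routine above. */
static long strncpy_from_user_sketch(char *dst, const char *src, long count,
                                     int (*byte_ok)(const char *p))
{
        long copied = 0;

        if (count <= 0)
                return 0;

        while (copied < count) {
                char c;

                if (!byte_ok(src + copied))
                        return -EFAULT; /* real code: exception fixup */
                c = src[copied];
                dst[copied++] = c;
                if (c == '\0')
                        return copied;  /* NUL found and stored */
        }
        return count;                   /* buffer limit hit first */
}

The assembly manages with a single counter by biasing both pointers upward by count and walking one negative index up to zero.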