diff options
Diffstat (limited to 'arch/sparc64/lib')
-rw-r--r-- | arch/sparc64/lib/Makefile | 4 | ||||
-rw-r--r-- | arch/sparc64/lib/VISbzero.S | 14 | ||||
-rw-r--r-- | arch/sparc64/lib/VIScopy.S | 115 | ||||
-rw-r--r-- | arch/sparc64/lib/VIScsum.S | 15 | ||||
-rw-r--r-- | arch/sparc64/lib/VIScsumcopy.S | 75 | ||||
-rw-r--r-- | arch/sparc64/lib/VISmemset.S | 15 | ||||
-rw-r--r-- | arch/sparc64/lib/VISsave.S | 122 | ||||
-rw-r--r-- | arch/sparc64/lib/blockops.S | 94 | ||||
-rw-r--r-- | arch/sparc64/lib/memscan.S | 203 |
9 files changed, 415 insertions, 242 deletions
diff --git a/arch/sparc64/lib/Makefile b/arch/sparc64/lib/Makefile index 9f8729ee5..a580f7ae4 100644 --- a/arch/sparc64/lib/Makefile +++ b/arch/sparc64/lib/Makefile @@ -1,4 +1,4 @@ -# $Id: Makefile,v 1.15 1997/08/19 03:11:50 davem Exp $ +# $Id: Makefile,v 1.16 1998/06/12 14:53:53 jj Exp $ # Makefile for Sparc library files.. # @@ -6,7 +6,7 @@ CFLAGS := $(CFLAGS) OBJS = PeeCeeI.o blockops.o locks.o strlen.o strncmp.o \ memscan.o strncpy_from_user.o strlen_user.o memcmp.o checksum.o \ - VIScopy.o VISbzero.o VISmemset.o VIScsum.o VIScsumcopy.o + VIScopy.o VISbzero.o VISmemset.o VIScsum.o VIScsumcopy.o VISsave.o lib.a: $(OBJS) $(AR) rcs lib.a $(OBJS) diff --git a/arch/sparc64/lib/VISbzero.S b/arch/sparc64/lib/VISbzero.S index ede87843b..3992da997 100644 --- a/arch/sparc64/lib/VISbzero.S +++ b/arch/sparc64/lib/VISbzero.S @@ -1,4 +1,4 @@ -/* $Id: VISbzero.S,v 1.8 1997/08/22 15:54:50 jj Exp $ +/* $Id: VISbzero.S,v 1.9 1998/06/12 14:53:50 jj Exp $ * VISbzero.S: High speed clear operations utilizing the UltraSparc * Visual Instruction Set. * @@ -9,6 +9,8 @@ #include "VIS.h" #ifdef __KERNEL__ +#include <asm/visasm.h> + #define EXN(x,y,a,b,z) \ 98: x,y; \ .section .fixup; \ @@ -141,9 +143,9 @@ bzero: 6: andncc %o1, 0x3f, %o3 7: be,pn %xcc, 9f #ifdef __KERNEL__ - rd %asi, %g7 - wr %g0, FPRS_FEF, %fprs - wr %g7, ASI_BLK_XOR, %asi + rd %asi, %o4 + wr %o4, ASI_BLK_XOR, %asi + VISEntryHalf #else wr %g0, ASI_BLK_P, %asi #endif @@ -178,8 +180,8 @@ bzero: add %o0, 256, %o0 12: #ifdef __KERNEL__ - wr %g0, 0, %fprs - wr %g7, 0x0, %asi + VISExitHalf + wr %o4, 0x0, %asi #else #ifndef REGS_64BIT wr %g0, FPRS_FEF, %fprs diff --git a/arch/sparc64/lib/VIScopy.S b/arch/sparc64/lib/VIScopy.S index 40b781e73..7f2f497cd 100644 --- a/arch/sparc64/lib/VIScopy.S +++ b/arch/sparc64/lib/VIScopy.S @@ -1,9 +1,9 @@ -/* $Id: VIScopy.S,v 1.14 1997/08/22 15:54:53 jj Exp $ +/* $Id: VIScopy.S,v 1.18 1998/06/12 14:53:55 jj Exp $ * VIScopy.S: High speed copy operations utilizing the UltraSparc * Visual Instruction Set. * * Copyright (C) 1997 David S. Miller (davem@caip.rutgers.edu) - * Copyright (C) 1996, 1997 Jakub Jelinek (jj@sunsite.mff.cuni.cz) + * Copyright (C) 1996, 1997, 1998 Jakub Jelinek (jj@ultra.linux.cz) */ #include "VIS.h" @@ -24,12 +24,15 @@ */ #ifdef __KERNEL__ + +#include <asm/visasm.h> + #define FPU_CLEAN_RETL \ - wr %g0, 0, %fprs; \ + VISExit \ retl; \ clr %o0; #define FPU_RETL \ - wr %g0, 0, %fprs; \ + VISExit \ retl; \ clr %o0; #define NORMAL_RETL \ @@ -40,7 +43,7 @@ .section .fixup; \ .align 4; \ 99: ba VIScopyfixup_ret; \ - a, b, %o0; \ + a, b, %o1; \ .section __ex_table; \ .align 4; \ .word 98b, 99b; \ @@ -52,7 +55,7 @@ .align 4; \ 99: c, d, e; \ ba VIScopyfixup_ret; \ - a, b, %o0; \ + a, b, %o1; \ .section __ex_table; \ .align 4; \ .word 98b, 99b; \ @@ -298,10 +301,6 @@ .globl __memcpy_entry .type __memcpy_entry,@function - - .globl copy_page - .type copy_page,@function - memcpy_private: __memcpy: memcpy: mov ASI_BLK_P, asi_src ! IEU0 Group @@ -310,12 +309,6 @@ memcpy: mov ASI_BLK_P, asi_src ! IEU0 Group retl clr %o0 -copy_page: wr %g0, FPRS_FEF, %fprs ! FPU Group - sethi %hi(8192), %o2 ! IEU0 Group - mov ASI_BLK_P, asi_src ! IEU1 - b,pt %xcc, dest_is_64byte_aligned ! CTI - mov ASI_BLK_P, asi_dest ! IEU0 Group - .align 32 .globl __copy_from_user .type __copy_from_user,@function @@ -355,7 +348,11 @@ __memcpy_384plus: #endif VIS_enter: be,pt %xcc, dest_is_8byte_aligned ! CTI +#ifdef __KERNEL__ + nop ! IEU0 Group +#else andcc %o0, 0x38, %g5 ! IEU1 Group +#endif do_dest_8byte_align: mov 8, %g1 ! IEU0 sub %g1, %g2, %g2 ! IEU0 Group @@ -377,7 +374,8 @@ do_dest_8byte_align: EX(LDUB [%o1] ASINORMAL, %o5, add %o2, %g2) ! Load Group add %o0, 2, %o0 ! IEU0 - EX(LDUB [%o1 + 1] ASINORMAL, %g3, + EX2(LDUB [%o1 + 1] ASINORMAL, %g3, + sub %o0, 2, %o0, add %o2, %g2) ! Load Group ASI_SETDST_NOBLK ! LSU Group subcc %g2, 2, %g2 ! IEU1 Group @@ -389,17 +387,17 @@ do_dest_8byte_align: EX2(STB %g3, [%o0 - 1] ASINORMAL, add %g2, 1, %g2, add %o2, %g2) ! Store -3: andcc %o0, 0x38, %g5 ! IEU1 Group -dest_is_8byte_aligned: - be,pt %icc, dest_is_64byte_aligned ! CTI #ifdef __KERNEL__ - wr %g0, FPRS_FEF, %fprs ! FPU Group -do_dest_64byte_align: - mov 64, %g1 ! IEU0 Group +3: +dest_is_8byte_aligned: + VISEntry + andcc %o0, 0x38, %g5 ! IEU1 Group #else - mov 64, %g1 ! IEU0 Group -do_dest_64byte_align: +3: andcc %o0, 0x38, %g5 ! IEU1 Group +dest_is_8byte_aligned: #endif + be,pt %icc, dest_is_64byte_aligned ! CTI + mov 64, %g1 ! IEU0 fmovd %f0, %f2 ! FPU sub %g1, %g5, %g5 ! IEU0 Group ASI_SETSRC_NOBLK ! LSU Group @@ -646,7 +644,9 @@ __memcpy_short: 2: ASI_SETSRC_NOBLK ! LSU Group EXO2(LDUB [%o1] ASINORMAL, %g5) ! LOAD Group add %o0, 2, %o0 ! IEU0 - EXO2(LDUB [%o1 + 1] ASINORMAL, %o5) ! LOAD Group + EX2(LDUB [%o1 + 1] ASINORMAL, %o5, + sub %o0, 2, %o0, + add %o2, %g0) ! LOAD Group add %o1, 2, %o1 ! IEU0 ASI_SETDST_NOBLK ! LSU Group subcc %o2, 2, %o2 ! IEU1 Group @@ -866,9 +866,9 @@ normal_retl: ASI_SETSRC_NOBLK ! LSU Group EX(LDX [%o1] ASINORMAL, %g2, and %o2, 0xf) ! Load Group - add %o1, 8, %o1 ! IEU0 + add %o0, 8, %o0 ! IEU0 ASI_SETDST_NOBLK ! LSU Group - add %o0, 8, %o0 ! IEU0 Group + add %o1, 8, %o1 ! IEU0 Group EX(STX %g2, [%o0 - 0x8] ASINORMAL, and %o2, 0xf) ! Store 85: be,pt %xcc, 1f ! CTI @@ -876,9 +876,9 @@ normal_retl: ASI_SETSRC_NOBLK ! LSU Group EX(LDUW [%o1] ASINORMAL, %g2, and %o2, 0x7) ! Load Group - add %o1, 4, %o1 ! IEU0 + add %o0, 4, %o0 ! IEU0 ASI_SETDST_NOBLK ! LSU Group - add %o0, 4, %o0 ! IEU0 Group + add %o1, 4, %o1 ! IEU0 Group EX(STW %g2, [%o0 - 0x4] ASINORMAL, and %o2, 0x7) ! Store 1: be,pt %xcc, 1f ! CTI @@ -886,9 +886,9 @@ normal_retl: ASI_SETSRC_NOBLK ! LSU Group EX(LDUH [%o1] ASINORMAL, %g2, and %o2, 0x3) ! Load Group - add %o1, 2, %o1 ! IEU0 + add %o0, 2, %o0 ! IEU0 ASI_SETDST_NOBLK ! LSU Group - add %o0, 2, %o0 ! IEU0 Group + add %o1, 2, %o1 ! IEU0 Group EX(STH %g2, [%o0 - 0x2] ASINORMAL, and %o2, 0x3) ! Store 1: be,pt %xcc, 1f ! CTI @@ -920,7 +920,7 @@ memcpy_noVIS_misaligned: add %o2, 1) ! Store 2: #ifdef __KERNEL__ - wr %g0, FPRS_FEF, %fprs ! FPU Group + VISEntry #endif andn %o2, 7, %g5 ! IEU0 Group and %o2, 7, %o2 ! IEU1 @@ -976,16 +976,31 @@ fpu_retl: .section .fixup .align 4 VIScopyfixup_reto2: - mov %o2, %o0 + mov %o2, %o1 VIScopyfixup_ret: + /* If this is copy_from_user(), zero out the rest of the + * kernel buffer. + */ + andcc asi_src, 0x1, %g0 + be,pt %icc, 1f + andcc asi_dest, 0x1, %g0 + bne,pn %icc, 1f + VISExit + save %sp, -160, %sp + mov %i0, %o0 + call __bzero + mov %i1, %o1 + restore +1: mov %o1, %o0 retl - wr %g0, 0, %fprs + nop VIScopyfixup1: subcc %g2, 18, %g2 + add %o0, 32, %o0 bgeu,a,pt %icc, VIScopyfixup1 sub %g7, 32, %g7 + sub %o0, 32, %o0 rd %pc, %g5 - add %g2, 18, %g2 - add %g2, 20, %g2 + add %g2, (18 + 16), %g2 ldub [%g5 + %g2], %g2 ba,a,pt %xcc, 2f .byte 0, 0, 0, 0, 0, 0, 0, 4, 4, 8, 12, 12, 16, 20, 20, 24, 28, 28 @@ -994,41 +1009,43 @@ VIScopyfixup2: mov (7 * 16), %g7 1: subcc %g2, 10, %g2 bgeu,a,pt %icc, 1b sub %g7, 16, %g7 + sub %o0, %g7, %o0 rd %pc, %g5 - add %g2, 10, %g2 - add %g2, 20, %g2 + add %g2, (10 + 16), %g2 ldub [%g5 + %g2], %g2 ba,a,pt %xcc, 4f .byte 0, 0, 0, 0, 0, 4, 4, 8, 12, 12 .align 4 VIScopyfixup3: subcc %g2, 10, %g2 + add %o0, 32, %o0 bgeu,a,pt %icc, VIScopyfixup3 sub %g7, 32, %g7 + sub %o0, 32, %o0 rd %pc, %g5 - add %g2, 10, %g2 - add %g2, 20, %g2 + add %g2, (10 + 16), %g2 ldub [%g5 + %g2], %g2 ba,a,pt %xcc, 2f .byte 0, 0, 0, 0, 0, 0, 0, 8, 16, 24 .align 4 -2: and %g1, 0x7f, %g1 +2: and %o2, 0x7f, %o2 sub %g7, %g2, %g7 ba,pt %xcc, VIScopyfixup_ret - add %g7, %g1, %o0 + add %g7, %o2, %o1 VIScopyfixup4: mov (7 * 16), %g7 3: subcc %g2, 6, %g2 bgeu,a,pt %icc, 3b sub %g7, 16, %g7 + sub %o0, %g7, %o0 rd %pc, %g5 - add %g2, 6, %g2 - add %g2, 20, %g2 + add %g2, (6 + 16), %g2 ldub [%g5 + %g2], %g2 ba,a,pt %xcc, 4f .byte 0, 0, 0, 0, 0, 8 .align 4 -4: and %g1, 7, %g1 +4: and %o2, 0xf, %o2 + sub %g7, %g2, %g7 ba,pt %xcc, VIScopyfixup_ret - add %g7, %g1, %o0 + add %g7, %o2, %o1 VIScopyfixup_vis3: sub %o2, 0x80, %o2 VIScopyfixup_vis2: @@ -1038,13 +1055,13 @@ VIScopyfixup_vis0: VIScopyfixup_vis1: add %g7, %g3, %g7 ba,pt %xcc, VIScopyfixup_ret - add %o2, %g7, %o0 + add %o2, %g7, %o1 VIScopyfixup_vis5: add %g3, 8, %g3 VIScopyfixup_vis4: add %g3, 8, %g3 ba,pt %xcc, VIScopyfixup_ret - add %o2, %g3, %o0 + add %o2, %g3, %o1 #endif #ifdef __KERNEL__ diff --git a/arch/sparc64/lib/VIScsum.S b/arch/sparc64/lib/VIScsum.S index 81b020c49..a370bdff3 100644 --- a/arch/sparc64/lib/VIScsum.S +++ b/arch/sparc64/lib/VIScsum.S @@ -1,4 +1,4 @@ -/* $Id: VIScsum.S,v 1.2 1997/08/08 08:34:05 jj Exp $ +/* $Id: VIScsum.S,v 1.3 1998/06/12 14:53:57 jj Exp $ * VIScsum.S: High bandwidth IP checksumming utilizing the UltraSparc * Visual Instruction Set. * @@ -26,6 +26,7 @@ #ifdef __KERNEL__ #include <asm/head.h> #include <asm/asi.h> +#include <asm/visasm.h> #else #define ASI_BLK_P 0xf0 #define FRPS_FEF 0x04 @@ -278,13 +279,13 @@ csum_partial: add %o2, 1, %o2 /* IEU0 */ 3: cmp %o1, 0xc0 /* IEU1 Group */ blu,pn %icc, 20f /* CTI */ - sllx %o2, 32, %g1 /* IEU0 */ - addcc %o2, %g1, %o2 /* IEU1 Group */ - sub %o1, 0xc0, %o1 /* IEU0 */ - wr %g0, ASI_BLK_P, %asi /* LSU Group */ + sllx %o2, 32, %g5 /* IEU0 */ #ifdef __KERNEL__ - wr %g0, FPRS_FEF, %fprs /* LSU Group */ + VISEntry #endif + addcc %o2, %g5, %o2 /* IEU1 Group */ + sub %o1, 0xc0, %o1 /* IEU0 */ + wr %g0, ASI_BLK_P, %asi /* LSU Group */ membar #StoreLoad /* LSU Group */ srlx %o2, 32, %o2 /* IEU0 Group */ bcs,a,pn %xcc, 1f /* CTI */ @@ -340,7 +341,7 @@ csum_partial: END_THE_TRICK(f60,f62,f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30) and %o1, 0x3f, %o1 /* IEU0 Group */ #ifdef __KERNEL__ - wr %g0, 0, %fprs /* LSU Group */ + VISExit #endif 20: andcc %o1, 0xf0, %g1 /* IEU1 Group */ be,pn %icc, 23f /* CTI */ diff --git a/arch/sparc64/lib/VIScsumcopy.S b/arch/sparc64/lib/VIScsumcopy.S index fff41bab2..469b007fc 100644 --- a/arch/sparc64/lib/VIScsumcopy.S +++ b/arch/sparc64/lib/VIScsumcopy.S @@ -1,4 +1,4 @@ -/* $Id: VIScsumcopy.S,v 1.4 1998/04/01 08:29:52 davem Exp $ +/* $Id: VIScsumcopy.S,v 1.5 1998/06/12 14:53:48 jj Exp $ * VIScsumcopy.S: High bandwidth IP checksumming with simultaneous * copying utilizing the UltraSparc Visual Instruction Set. * @@ -27,6 +27,7 @@ #include <asm/head.h> #include <asm/asi.h> #include <asm/page.h> +#include <asm/visasm.h> #else #define ASI_P 0x80 #define ASI_BLK_P 0xf0 @@ -42,11 +43,11 @@ #define sum o3 #define x1 g1 #define x2 g2 -#define x3 g3 +#define x3 o4 #define x4 g4 #define x5 g5 #define x6 g7 -#define x7 o4 +#define x7 g3 #define x8 o5 /* Dobrou noc, SunSoft engineers. Spete sladce. @@ -248,7 +249,7 @@ csum_partial_copy_vis: andcc %dst, 7, %g0 /* IEU1 Group */ be,pt %icc, 4f /* CTI */ - and %dst, 0x38, %g3 /* IEU0 */ + and %dst, 0x38, %o4 /* IEU0 */ mov 1, %g5 /* IEU0 Group */ andcc %dst, 2, %g0 /* IEU1 */ be,pt %icc, 1f /* CTI */ @@ -266,18 +267,18 @@ csum_partial_copy_vis: add %sum, %g5, %sum /* IEU0 */ 1: lduwa [%src] %asi, %g2 /* Load */ brz,a,pn %g7, 4f /* CTI+IEU1 Group */ - and %dst, 0x38, %g3 /* IEU0 */ + and %dst, 0x38, %o4 /* IEU0 */ add %dst, 4, %dst /* IEU0 Group */ sub %len, 4, %len /* IEU1 */ addcc %g2, %sum, %sum /* IEU1 Group */ bcs,a,pn %icc, 1f /* CTI */ add %sum, 1, %sum /* IEU0 */ -1: and %dst, 0x38, %g3 /* IEU0 Group */ +1: and %dst, 0x38, %o4 /* IEU0 Group */ stw %g2, [%dst - 4] /* Store */ add %src, 4, %src /* IEU1 */ 4: #ifdef __KERNEL__ - wr %g0, FPRS_FEF, %fprs /* LSU Group */ + VISEntry #endif mov %src, %g7 /* IEU1 Group */ fzero %f48 /* FPA */ @@ -291,10 +292,10 @@ csum_partial_copy_vis: sub %sum, 1, %sum /* IEU0 */ 1: srl %sum, 0, %sum /* IEU0 Group */ clr %g5 /* IEU1 */ - brz,pn %g3, 3f /* CTI+IEU1 Group */ - sub %g1, %g3, %g1 /* IEU0 */ + brz,pn %o4, 3f /* CTI+IEU1 Group */ + sub %g1, %o4, %g1 /* IEU0 */ ldda [%src] %asi, %f0 /* Load */ - clr %g3 /* IEU0 Group */ + clr %o4 /* IEU0 Group */ andcc %dst, 8, %g0 /* IEU1 */ be,pn %icc, 1f /* CTI */ ldda [%src + 8] %asi, %f2 /* Load Group */ @@ -303,7 +304,7 @@ csum_partial_copy_vis: fpadd32 %f0, %f48, %f50 /* FPA */ addcc %dst, 8, %dst /* IEU1 Group */ faligndata %f0, %f2, %f16 /* FPA */ - fcmpgt32 %f48, %f50, %g3 /* FPM Group */ + fcmpgt32 %f48, %f50, %o4 /* FPM Group */ fmovd %f2, %f0 /* FPA Group */ ldda [%src + 8] %asi, %f2 /* Load */ std %f16, [%dst - 8] /* Store */ @@ -318,13 +319,13 @@ csum_partial_copy_vis: faligndata %f0, %f2, %f16 /* FPA */ fcmpgt32 %f48, %f50, %g5 /* FPM Group */ sub %len, 16, %len /* IEU0 */ - inc %g3 /* IEU1 */ + inc %o4 /* IEU1 */ std %f16, [%dst - 16] /* Store Group */ fpadd32 %f2, %f50, %f48 /* FPA */ - srl %g3, 1, %o5 /* IEU0 */ + srl %o4, 1, %o5 /* IEU0 */ faligndata %f2, %f4, %f18 /* FPA Group */ std %f18, [%dst - 8] /* Store */ - fcmpgt32 %f50, %f48, %g3 /* FPM Group */ + fcmpgt32 %f50, %f48, %o4 /* FPM Group */ add %o5, %sum, %sum /* IEU0 */ ldda [%src + 8] %asi, %f2 /* Load */ fmovd %f4, %f0 /* FPA */ @@ -337,18 +338,18 @@ csum_partial_copy_vis: add %dst, 32, %dst /* IEU1 */ faligndata %f0, %f2, %f16 /* FPA */ fcmpgt32 %f48, %f50, %o5 /* FPM Group */ - inc %g3 /* IEU0 */ + inc %o4 /* IEU0 */ ldda [%src + 24] %asi, %f6 /* Load */ - srl %g3, 1, %g3 /* IEU0 Group */ + srl %o4, 1, %o4 /* IEU0 Group */ add %g5, %sum, %sum /* IEU1 */ ldda [%src + 32] %asi, %f8 /* Load */ fpadd32 %f2, %f50, %f48 /* FPA */ faligndata %f2, %f4, %f18 /* FPA Group */ sub %len, 32, %len /* IEU0 */ std %f16, [%dst - 32] /* Store */ - fcmpgt32 %f50, %f48, %o4 /* FPM Group */ + fcmpgt32 %f50, %f48, %g3 /* FPM Group */ inc %o5 /* IEU0 */ - add %g3, %sum, %sum /* IEU1 */ + add %o4, %sum, %sum /* IEU1 */ fpadd32 %f4, %f48, %f50 /* FPA */ faligndata %f4, %f6, %f20 /* FPA Group */ srl %o5, 1, %o5 /* IEU0 */ @@ -356,14 +357,14 @@ csum_partial_copy_vis: add %o5, %sum, %sum /* IEU0 */ std %f18, [%dst - 24] /* Store */ fpadd32 %f6, %f50, %f48 /* FPA */ - inc %o4 /* IEU0 Group */ + inc %g3 /* IEU0 Group */ std %f20, [%dst - 16] /* Store */ add %src, 32, %src /* IEU1 */ faligndata %f6, %f8, %f22 /* FPA */ - fcmpgt32 %f50, %f48, %g3 /* FPM Group */ - srl %o4, 1, %o4 /* IEU0 */ + fcmpgt32 %f50, %f48, %o4 /* FPM Group */ + srl %g3, 1, %g3 /* IEU0 */ std %f22, [%dst - 8] /* Store */ - add %o4, %sum, %sum /* IEU0 Group */ + add %g3, %sum, %sum /* IEU0 Group */ 3: rd %asi, %g2 /* LSU Group + 4 bubbles */ #ifdef __KERNEL__ 4: sethi %hi(vis0s), %g7 /* IEU0 Group */ @@ -371,16 +372,16 @@ csum_partial_copy_vis: 4: rd %pc, %g7 /* LSU Group + 4 bubbles */ #endif inc %g5 /* IEU0 Group */ - and %src, 0x38, %o4 /* IEU1 */ + and %src, 0x38, %g3 /* IEU1 */ membar #StoreLoad /* LSU Group */ srl %g5, 1, %g5 /* IEU0 */ - inc %g3 /* IEU1 */ - sll %o4, 8, %o4 /* IEU0 Group */ + inc %o4 /* IEU1 */ + sll %g3, 8, %g3 /* IEU0 Group */ sub %len, 0xc0, %len /* IEU1 */ addcc %g5, %sum, %sum /* IEU1 Group */ - srl %g3, 1, %g3 /* IEU0 */ - add %g7, %o4, %g7 /* IEU0 Group */ - add %g3, %sum, %sum /* IEU1 */ + srl %o4, 1, %o4 /* IEU0 */ + add %g7, %g3, %g7 /* IEU0 Group */ + add %o4, %sum, %sum /* IEU1 */ #ifdef __KERNEL__ jmpl %g7 + %lo(vis0s), %g0 /* CTI+IEU1 Group */ #else @@ -815,7 +816,7 @@ ett: rd %gsr, %x3 /* LSU Group+4bubbles */ END_THE_TRICK2( f48,f50,f52,f54,f56,f58,f60,f10,f12,f62) membar #Sync /* LSU Group */ #ifdef __KERNEL__ - wr %g0, 0, %fprs /* LSU Group */ + VISExit add %sp, 8, %sp /* IEU0 Group */ #endif 23: brnz,pn %len, 26f /* CTI+IEU1 Group */ @@ -834,12 +835,12 @@ ett: rd %gsr, %x3 /* LSU Group+4bubbles */ #endif 26: andcc %len, 8, %g0 /* IEU1 Group */ be,pn %icc, 1f /* CTI */ - lduwa [%src] %asi, %g3 /* Load */ + lduwa [%src] %asi, %o4 /* Load */ lduwa [%src+4] %asi, %g2 /* Load Group */ add %src, 8, %src /* IEU0 */ add %dst, 8, %dst /* IEU1 */ - sllx %g3, 32, %g5 /* IEU0 Group */ - stw %g3, [%dst - 8] /* Store */ + sllx %o4, 32, %g5 /* IEU0 Group */ + stw %o4, [%dst - 8] /* Store */ or %g5, %g2, %g5 /* IEU0 Group */ stw %g2, [%dst - 4] /* Store */ addcc %g5, %sum, %sum /* IEU1 Group */ @@ -855,11 +856,11 @@ ett: rd %gsr, %x3 /* LSU Group+4bubbles */ stw %g7, [%dst - 4] /* Store */ 1: andcc %len, 2, %g0 /* IEU1 */ be,a,pn %icc, 1f /* CTI */ - clr %o4 /* IEU0 Group */ + clr %g3 /* IEU0 Group */ lduha [%src] %asi, %g7 /* Load */ add %src, 2, %src /* IEU1 */ add %dst, 2, %dst /* IEU0 Group */ - sll %g7, 16, %o4 /* IEU0 Group */ + sll %g7, 16, %g3 /* IEU0 Group */ sth %g7, [%dst - 2] /* Store */ 1: andcc %len, 1, %g0 /* IEU1 */ be,a,pn %icc, 1f /* CTI */ @@ -867,9 +868,9 @@ ett: rd %gsr, %x3 /* LSU Group+4bubbles */ lduba [%src] %asi, %g7 /* Load */ sll %g7, 8, %o5 /* IEU0 Group */ stb %g7, [%dst] /* Store */ -1: or %g2, %o4, %o4 /* IEU1 */ - or %o5, %o4, %o4 /* IEU0 Group (regdep) */ - addcc %o4, %sum, %sum /* IEU1 Group (regdep) */ +1: or %g2, %g3, %g3 /* IEU1 */ + or %o5, %g3, %g3 /* IEU0 Group (regdep) */ + addcc %g3, %sum, %sum /* IEU1 Group (regdep) */ bcs,a,pn %xcc, 1f /* CTI */ add %sum, 1, %sum /* IEU0 */ 1: ba,pt %xcc, 25b /* CTI Group */ diff --git a/arch/sparc64/lib/VISmemset.S b/arch/sparc64/lib/VISmemset.S index 4c24931ba..9be111134 100644 --- a/arch/sparc64/lib/VISmemset.S +++ b/arch/sparc64/lib/VISmemset.S @@ -1,4 +1,4 @@ -/* $Id: VISmemset.S,v 1.7 1997/08/22 15:54:56 jj Exp $ +/* $Id: VISmemset.S,v 1.8 1998/06/12 14:53:59 jj Exp $ * VISmemset.S: High speed memset operations utilizing the UltraSparc * Visual Instruction Set. * @@ -32,6 +32,9 @@ #endif #ifdef __KERNEL__ + +#include <asm/visasm.h> + #define RETL clr %o0 #else #define RETL mov %g3, %o0 @@ -135,8 +138,9 @@ memset: #endif add %o0, 32, %o0 7: be,pn %xcc, 9f + nop #ifdef __KERNEL__ - wr %g0, FPRS_FEF, %fprs + VISEntryHalf #endif ldd [%o0 - 8], %f0 18: wr %g0, ASI_BLK_P, %asi @@ -170,7 +174,7 @@ memset: add %o0, 256, %o0 12: #ifdef __KERNEL__ - wr %g0, 0, %fprs + VISExitHalf #else #ifndef REGS_64BIT wr %g0, FPRS_FEF, %fprs @@ -231,10 +235,9 @@ memset: #endif andncc %o2, 0x3f, %o3 be,pn %xcc, 9b -#ifdef __KERNEL__ - wr %g0, FPRS_FEF, %fprs -#else nop +#ifdef __KERNEL__ + VISEntryHalf #endif ba,pt %xcc, 18b ldd [%o0], %f0 diff --git a/arch/sparc64/lib/VISsave.S b/arch/sparc64/lib/VISsave.S new file mode 100644 index 000000000..10d127bb5 --- /dev/null +++ b/arch/sparc64/lib/VISsave.S @@ -0,0 +1,122 @@ +/* $Id: VISsave.S,v 1.2 1998/06/19 12:14:25 jj Exp $ + * VISsave.S: Code for saving FPU register state for + * VIS routines. One should not call this directly, + * but use macros provided in <asm/visasm.h>. + * + * Copyright (C) 1998 Jakub Jelinek (jj@ultra.linux.cz) + */ + +#include <asm/asi.h> +#include <asm/page.h> +#include <asm/ptrace.h> +#include <asm/visasm.h> + + .text + .globl VISenter, VISenterhalf + + /* On entry: %o5=current FPRS value, %g7 is callers address */ + /* May clobber %o5, %g1, %g2, %g3, %g7, %icc, %xcc */ + + .align 32 +VISenter: + ldub [%g6 + AOFF_task_tss + AOFF_thread_fpdepth], %g1 + brnz,a,pn %g1, 1f + cmp %g1, 1 + stb %g0, [%g6 + AOFF_task_tss + AOFF_thread_fpsaved] + stx %fsr, [%g6 + AOFF_task_tss + AOFF_thread_xfsr] +9: jmpl %g7 + %g0, %g0 + nop +1: bne,pn %icc, 2f + + srl %g1, 1, %g1 +vis1: ldub [%g6 + AOFF_task_tss + AOFF_thread_fpsaved], %g3 + stx %fsr, [%g6 + AOFF_task_tss + AOFF_thread_xfsr] + or %g3, %o5, %g3 + stb %g3, [%g6 + AOFF_task_tss + AOFF_thread_fpsaved] + rd %gsr, %g3 + clr %g1 + ba,pt %xcc, 3f + + stb %g3, [%g6 + AOFF_task_tss + AOFF_thread_gsr] +2: add %g6, %g1, %g3 + cmp %o5, FPRS_DU + be,pn %icc, 6f + sll %g1, 3, %g1 + stb %o5, [%g3 + AOFF_task_tss + AOFF_thread_fpsaved] + rd %gsr, %g2 + stb %g2, [%g3 + AOFF_task_tss + AOFF_thread_gsr] + + add %g6, %g1, %g2 + stx %fsr, [%g2 + AOFF_task_tss + AOFF_thread_xfsr] + sll %g1, 5, %g1 +3: andcc %o5, FPRS_DL|FPRS_DU, %g0 + be,pn %icc, 9b + add %g6, AOFF_task_fpregs, %g2 + andcc %o5, FPRS_DL, %g0 + membar #StoreStore | #LoadStore + + be,pn %icc, 4f + add %g6, AOFF_task_fpregs+0x40, %g3 + stda %f0, [%g2 + %g1] ASI_BLK_P + stda %f16, [%g3 + %g1] ASI_BLK_P + andcc %o5, FPRS_DU, %g0 + be,pn %icc, 5f +4: add %g1, 128, %g1 + stda %f32, [%g2 + %g1] ASI_BLK_P + + stda %f48, [%g3 + %g1] ASI_BLK_P +5: membar #Sync + jmpl %g7 + %g0, %g0 + nop + +6: ldub [%g3 + AOFF_task_tss + AOFF_thread_fpsaved], %o5 + or %o5, FPRS_DU, %o5 + add %g6, AOFF_task_fpregs+0x80, %g2 + stb %o5, [%g3 + AOFF_task_tss + AOFF_thread_fpsaved] + + sll %g1, 5, %g1 + add %g6, AOFF_task_fpregs+0xc0, %g3 + membar #StoreStore | #LoadStore + stda %f32, [%g2 + %g1] ASI_BLK_P + stda %f48, [%g3 + %g1] ASI_BLK_P + membar #Sync + jmpl %g7 + %g0, %g0 + nop + + .align 32 +VISenterhalf: + ldub [%g6 + AOFF_task_tss + AOFF_thread_fpdepth], %g1 + brnz,a,pn %g1, 1f + cmp %g1, 1 + stb %g0, [%g6 + AOFF_task_tss + AOFF_thread_fpsaved] + stx %fsr, [%g6 + AOFF_task_tss + AOFF_thread_xfsr] + clr %o5 + jmpl %g7 + %g0, %g0 + wr %g0, FPRS_FEF, %fprs + +1: bne,pn %icc, 2f + srl %g1, 1, %g1 + ba,pt %xcc, vis1 + sub %g7, 8, %g7 +2: addcc %g6, %g1, %g3 + sll %g1, 3, %g1 + andn %o5, FPRS_DU, %g2 + stb %g2, [%g3 + AOFF_task_tss + AOFF_thread_fpsaved] + + rd %gsr, %g2 + stb %g2, [%g3 + AOFF_task_tss + AOFF_thread_gsr] + add %g6, %g1, %g2 + stx %fsr, [%g2 + AOFF_task_tss + AOFF_thread_xfsr] + sll %g1, 5, %g1 +3: andcc %o5, FPRS_DL, %g0 + be,pn %icc, 4f + add %g6, AOFF_task_fpregs, %g2 + + membar #StoreStore | #LoadStore + add %g6, AOFF_task_fpregs+0x40, %g3 + stda %f0, [%g2 + %g1] ASI_BLK_P + stda %f16, [%g3 + %g1] ASI_BLK_P + membar #Sync +4: and %o5, FPRS_DU, %o5 + jmpl %g7 + %g0, %g0 + wr %o5, FPRS_FEF, %fprs diff --git a/arch/sparc64/lib/blockops.S b/arch/sparc64/lib/blockops.S index 7d5b240ad..c57f0aefc 100644 --- a/arch/sparc64/lib/blockops.S +++ b/arch/sparc64/lib/blockops.S @@ -1,52 +1,66 @@ -/* $Id: blockops.S,v 1.11 1997/07/29 09:35:36 davem Exp $ - * arch/sparc64/lib/blockops.S: UltraSparc block zero optimized routines. +/* $Id: blockops.S,v 1.14 1998/06/12 14:53:46 jj Exp $ + * blockops.S: UltraSparc block zero optimized routines. * - * Copyright (C) 1996 David S. Miller (davem@caip.rutgers.edu) + * Copyright (C) 1996,1998 David S. Miller (davem@caip.rutgers.edu) * Copyright (C) 1997 Jakub Jelinek (jj@sunsite.mff.cuni.cz) */ #include "VIS.h" +#include <asm/visasm.h> + +#define TOUCH(reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7) \ + fmovd %reg0, %f48; fmovd %reg1, %f50; \ + fmovd %reg2, %f52; fmovd %reg3, %f54; \ + fmovd %reg4, %f56; fmovd %reg5, %f58; \ + fmovd %reg6, %f60; fmovd %reg7, %f62; .text .align 32 - - .globl __bfill64 -__bfill64: /* %o0 = buf, %o1= ptr to pattern */ - wr %g0, FPRS_FEF, %fprs ! FPU Group - ldd [%o1], %f48 ! Load Group - wr %g0, ASI_BLK_P, %asi ! LSU Group - membar #StoreLoad | #StoreStore | #LoadStore ! LSU Group - mov 32, %g2 ! IEU0 Group - - /* Cannot perform real arithmatic on the pattern, that can - * lead to fp_exception_other ;-) - */ - fmovd %f48, %f50 ! FPA Group - fmovd %f48, %f52 ! FPA Group - fmovd %f48, %f54 ! FPA Group - fmovd %f48, %f56 ! FPA Group - fmovd %f48, %f58 ! FPA Group - fmovd %f48, %f60 ! FPA Group - fmovd %f48, %f62 ! FPA Group - -1: stda %f48, [%o0 + 0x00] %asi ! Store Group - stda %f48, [%o0 + 0x40] %asi ! Store Group - stda %f48, [%o0 + 0x80] %asi ! Store Group - stda %f48, [%o0 + 0xc0] %asi ! Store Group - subcc %g2, 1, %g2 ! IEU1 Group - bne,pt %icc, 1b ! CTI - add %o0, 0x100, %o0 ! IEU0 - membar #StoreLoad | #StoreStore ! LSU Group - - jmpl %o7 + 0x8, %g0 ! CTI Group brk forced - wr %g0, 0, %fprs ! FPU Group + .globl copy_page + .type copy_page,@function +copy_page: /* %o0=dest, %o1=src */ + VISEntry + membar #LoadStore | #StoreStore | #StoreLoad + ldda [%o1] ASI_BLK_P, %f0 + add %o1, 0x40, %o1 + ldda [%o1] ASI_BLK_P, %f16 + add %o1, 0x40, %o1 + sethi %hi(8192), %o2 +1: TOUCH(f0, f2, f4, f6, f8, f10, f12, f14) + ldda [%o1] ASI_BLK_P, %f32 + add %o1, 0x40, %o1 + sub %o2, 0x40, %o2 + stda %f48, [%o0] ASI_BLK_P + add %o0, 0x40, %o0 + TOUCH(f16, f18, f20, f22, f24, f26, f28, f30) + ldda [%o1] ASI_BLK_P, %f0 + add %o1, 0x40, %o1 + sub %o2, 0x40, %o2 + stda %f48, [%o0] ASI_BLK_P + add %o0, 0x40, %o0 + TOUCH(f32, f34, f36, f38, f40, f42, f44, f46) + ldda [%o1] ASI_BLK_P, %f16 + add %o1, 0x40, %o1 + sub %o2, 0x40, %o2 + stda %f48, [%o0] ASI_BLK_P + cmp %o2, 0x80 + bne,pt %xcc, 1b + add %o0, 0x40, %o0 + membar #Sync + stda %f0, [%o0] ASI_BLK_P + add %o0, 0x40, %o0 + stda %f16, [%o0] ASI_BLK_P + membar #StoreStore | #StoreLoad + jmpl %o7 + 0x8, %g0 + VISExit .align 32 .globl __bzero_1page -__bzero_1page: - wr %g0, FPRS_FEF, %fprs ! FPU Group + .type __bzero_1page,@function +__bzero_1page: /* %o0=dest */ + VISEntryHalf fzero %f0 ! FPA Group - mov 32, %g1 ! IEU0 + mov 32, %o1 ! IEU0 fzero %f2 ! FPA Group faddd %f0, %f2, %f4 ! FPA Group fmuld %f0, %f2, %f6 ! FPM @@ -62,9 +76,9 @@ __bzero_1page: stda %f0, [%o0 + 0x80] %asi ! Store Group stda %f0, [%o0 + 0xc0] %asi ! Store Group - subcc %g1, 1, %g1 ! IEU1 + subcc %o1, 1, %o1 ! IEU1 bne,pt %icc, 1b ! CTI add %o0, 0x100, %o0 ! IEU0 Group - membar #StoreLoad | #StoreStore ! LSU Group + membar #StoreStore | #StoreLoad ! LSU Group jmpl %o7 + 0x8, %g0 ! CTI Group brk forced - wr %g0, 0, %fprs ! FPU Group + VISExitHalf diff --git a/arch/sparc64/lib/memscan.S b/arch/sparc64/lib/memscan.S index 83abe4040..423bc1409 100644 --- a/arch/sparc64/lib/memscan.S +++ b/arch/sparc64/lib/memscan.S @@ -1,116 +1,129 @@ -/* $Id: memscan.S,v 1.1 1997/03/14 21:04:24 jj Exp $ - * memscan.S: Optimized memscan for the Sparc64. +/* $Id: memscan.S,v 1.2 1998/05/21 14:42:22 jj Exp $ + * memscan.S: Optimized memscan for Sparc64. * - * Copyright (C) 1997 Jakub Jelinek (jj@sunsite.mff.cuni.cz) + * Copyright (C) 1997,1998 Jakub Jelinek (jj@ultra.linux.cz) + * Copyright (C) 1998 David S. Miller (davem@dm.cobaltmicro.com) */ -/* In essence, this is just a fancy strlen. */ - -#define LO_MAGIC 0x01010101 -#define HI_MAGIC 0x80808080 +#define HI_MAGIC 0x8080808080808080 +#define LO_MAGIC 0x0101010101010101 +#define ASI_PL 0x88 .text - .align 4 - .globl __memscan_zero, __memscan_generic - .globl memscan + .align 32 + .globl __memscan_zero, __memscan_generic + .globl memscan + __memscan_zero: - /* %o0 = addr, %o1 = size */ - brlez,pn %o1, 0f - andcc %o0, 3, %g0 - be,pt %icc, 9f - sethi %hi(HI_MAGIC), %o4 - ldub [%o0], %o5 - subcc %o1, 1, %o1 - brz,pn %o5, 10f - add %o0, 1, %o0 - be,pn %xcc, 0f - andcc %o0, 3, %g0 - be,pn %icc, 4f - or %o4, %lo(HI_MAGIC), %o3 - ldub [%o0], %o5 - subcc %o1, 1, %o1 - brz,pn %o5, 10f - add %o0, 1, %o0 - be,pn %xcc, 0f - andcc %o0, 3, %g0 - be,pt %icc, 5f - sethi %hi(LO_MAGIC), %o4 - ldub [%o0], %o5 - subcc %o1, 1, %o1 - brz,pn %o5, 10f - add %o0, 1, %o0 - be,pn %xcc, 0f - or %o4, %lo(LO_MAGIC), %o2 - ba,pt %xcc, 2f - ld [%o0], %o5 -9: - or %o4, %lo(HI_MAGIC), %o3 -4: - sethi %hi(LO_MAGIC), %o4 -5: - or %o4, %lo(LO_MAGIC), %o2 - ld [%o0], %o5 -2: - sub %o5, %o2, %o4 - sub %o1, 4, %o1 - andcc %o4, %o3, %g0 - be,pn %icc, 1f - add %o0, 4, %o0 - brgz,pt %o1, 2b - ld [%o0], %o5 + /* %o0 = bufp, %o1 = size */ + brlez,pn %o1, szzero + andcc %o0, 7, %g0 + be,pt %icc, we_are_aligned + sethi %hi(HI_MAGIC), %o4 + ldub [%o0], %o5 +1: subcc %o1, 1, %o1 + brz,pn %o5, 10f + add %o0, 1, %o0 + be,pn %xcc, szzero + andcc %o0, 7, %g0 + bne,a,pn %icc, 1b + ldub [%o0], %o5 +we_are_aligned: + ldxa [%o0] ASI_PL, %o5 + or %o4, %lo(HI_MAGIC), %o3 + sllx %o3, 32, %o4 + or %o4, %o3, %o3 + + srlx %o3, 7, %o2 +msloop: + sub %o1, 8, %o1 + add %o0, 8, %o0 + sub %o5, %o2, %o4 + xor %o4, %o5, %o4 + andcc %o4, %o3, %g3 + bne,pn %xcc, check_bytes + srlx %o4, 32, %g3 + + brgz,a,pt %o1, msloop + ldxa [%o0] ASI_PL, %o5 +check_bytes: + bne,a,pn %icc, 2f + andcc %o5, 0xff, %g0 + add %o0, -5, %g2 + ba,pt %xcc, 3f + srlx %o5, 32, %g5 + +2: srlx %o5, 8, %g5 + be,pn %icc, 1f + add %o0, -8, %g2 + andcc %g5, 0xff, %g0 + srlx %g5, 8, %g5 + be,pn %icc, 1f + inc %g2 + andcc %g5, 0xff, %g0 + + srlx %g5, 8, %g5 + be,pn %icc, 1f + inc %g2 + andcc %g5, 0xff, %g0 + srlx %g5, 8, %g5 + be,pn %icc, 1f + inc %g2 + andcc %g3, %o3, %g0 + + be,a,pn %icc, 2f + mov %o0, %g2 +3: andcc %g5, 0xff, %g0 + srlx %g5, 8, %g5 + be,pn %icc, 1f + inc %g2 + andcc %g5, 0xff, %g0 + srlx %g5, 8, %g5 + + be,pn %icc, 1f + inc %g2 + andcc %g5, 0xff, %g0 + srlx %g5, 8, %g5 + be,pn %icc, 1f + inc %g2 + andcc %g5, 0xff, %g0 + srlx %g5, 8, %g5 + + be,pn %icc, 1f + inc %g2 +2: brgz,a,pt %o1, msloop + ldxa [%o0] ASI_PL, %o5 + inc %g2 +1: add %o0, %o1, %o0 + cmp %g2, %o0 retl - add %o0, %o1, %o0 -1: - /* Check every byte. */ - srl %o5, 24, %g5 - andcc %g5, 0xff, %g0 - be,pn %icc, 1f - add %o0, -4, %o4 - srl %o5, 16, %g5 - andcc %g5, 0xff, %g0 - be,pn %icc, 1f - add %o4, 1, %o4 - srl %o5, 8, %g5 - andcc %g5, 0xff, %g0 - be,pn %icc, 1f - add %o4, 1, %o4 - andcc %o5, 0xff, %g0 - be,pn %icc, 1f - add %o4, 1, %o4 - brgz,pt %o1, 2b - ld [%o0], %o5 -1: - add %o0, %o1, %o0 - cmp %o4, %o0 - retl - movle %xcc, %o4, %o0 -0: - retl + + movle %xcc, %g2, %o0 +10: retl + sub %o0, 1, %o0 +szzero: retl nop -10: - retl - sub %o0, 1, %o0 memscan: __memscan_generic: /* %o0 = addr, %o1 = c, %o2 = size */ - brz,pn %o2, 3f - add %o0, %o2, %o3 - ldub [%o0], %o5 - sub %g0, %o2, %o4 + brz,pn %o2, 3f + add %o0, %o2, %o3 + ldub [%o0], %o5 + sub %g0, %o2, %o4 1: - cmp %o5, %o1 - be,pn %icc, 2f - addcc %o4, 1, %o4 - bne,a,pt %xcc, 1b - ldub [%o3 + %o4], %o5 + cmp %o5, %o1 + be,pn %icc, 2f + addcc %o4, 1, %o4 + bne,a,pt %xcc, 1b + ldub [%o3 + %o4], %o5 retl /* The delay slot is the same as the next insn, this is just to make it look more awful */ 2: - add %o3, %o4, %o0 + add %o3, %o4, %o0 retl - sub %o0, 1, %o0 + sub %o0, 1, %o0 3: retl nop |