diff options
author | Ralf Baechle <ralf@linux-mips.org> | 2000-03-02 02:36:47 +0000 |
---|---|---|
committer | Ralf Baechle <ralf@linux-mips.org> | 2000-03-02 02:36:47 +0000 |
commit | 8624512aa908741ba2795200133eae0d7f4557ea (patch) | |
tree | d5d3036fccf2604f4c98dedc11e8adb929d6b52e /arch/sparc64/lib | |
parent | 7b8f5d6f1d45d9f9de1d26e7d3c32aa5af11b488 (diff) |
Merge with 2.3.48.
Diffstat (limited to 'arch/sparc64/lib')
-rw-r--r-- | arch/sparc64/lib/VIScsum.S | 693 | ||||
-rw-r--r-- | arch/sparc64/lib/VIScsumcopy.S | 1176 | ||||
-rw-r--r-- | arch/sparc64/lib/VIScsumcopyusr.S | 1162 |
3 files changed, 1518 insertions, 1513 deletions
diff --git a/arch/sparc64/lib/VIScsum.S b/arch/sparc64/lib/VIScsum.S index aad5d941a..9f77c8cb4 100644 --- a/arch/sparc64/lib/VIScsum.S +++ b/arch/sparc64/lib/VIScsum.S @@ -1,14 +1,15 @@ -/* $Id: VIScsum.S,v 1.5 1999/07/30 09:35:36 davem Exp $ +/* $Id: VIScsum.S,v 1.6 2000/02/20 23:21:39 davem Exp $ * VIScsum.S: High bandwidth IP checksumming utilizing the UltraSparc * Visual Instruction Set. * * Copyright (C) 1997 Jakub Jelinek (jj@sunsite.mff.cuni.cz) + * Copyright (C) 2000 David S. Miller (davem@redhat.com) * * Based on older sparc32/sparc64 checksum.S, which is: * * Copyright(C) 1995 Linus Torvalds * Copyright(C) 1995 Miguel de Icaza - * Copyright(C) 1996,1997 David S. Miller + * Copyright(C) 1996, 1997 David S. Miller * derived from: * Linux/Alpha checksum c-code * Linux/ix86 inline checksum assembly @@ -38,290 +39,290 @@ * tricks are UltraLinux trade secrets :)) */ -#define START_THE_TRICK(fz,f0,f2,f4,f6,f8,f10) \ - fcmpgt32 %fz, %f0, %g1 /* FPM Group */; \ - fcmpgt32 %fz, %f2, %g2 /* FPM Group */; \ - fcmpgt32 %fz, %f4, %g3 /* FPM Group */; \ - fcmpgt32 %fz, %f6, %g5 /* FPM Group */; \ - inc %g1 /* IEU0 */; \ - fcmpgt32 %fz, %f8, %g7 /* FPM Group */; \ - srl %g1, 1, %g1 /* IEU0 */; \ - inc %g2 /* IEU1 */; \ - fcmpgt32 %fz, %f10, %o3 /* FPM Group */; \ - srl %g2, 1, %g2 /* IEU0 */; \ - add %o2, %g1, %o2 /* IEU1 */; \ - add %g3, 1, %g3 /* IEU0 Group */; \ - srl %g3, 1, %g3 /* IEU0 Group */; \ - add %o2, %g2, %o2 /* IEU1 */; \ - inc %g5 /* IEU0 Group */; \ - add %o2, %g3, %o2 /* IEU1 */; +#define START_THE_TRICK(fz,f0,f2,f4,f6,f8,f10) \ + fcmpgt32 %fz, %f0, %g1 /* FPM Group */; \ + fcmpgt32 %fz, %f2, %g2 /* FPM Group */; \ + fcmpgt32 %fz, %f4, %g3 /* FPM Group */; \ + inc %g1 /* IEU0 Group */; \ + fcmpgt32 %fz, %f6, %g5 /* FPM */; \ + srl %g1, 1, %g1 /* IEU0 Group */; \ + fcmpgt32 %fz, %f8, %g7 /* FPM */; \ + inc %g2 /* IEU0 Group */; \ + fcmpgt32 %fz, %f10, %o3 /* FPM */; \ + srl %g2, 1, %g2 /* IEU0 Group */; \ + inc %g3 /* IEU1 */; \ + srl %g3, 1, %g3 /* IEU0 Group */; \ + add %o2, %g1, %o2 /* IEU1 */; \ + add %o2, %g2, %o2 /* IEU0 Group */; \ + inc %g5 /* IEU1 */; \ + add %o2, %g3, %o2 /* IEU0 Group */; -#define DO_THE_TRICK(O12,O14,f0,f2,f4,f6,f8,f10,f12,f14,F0,F2,F4,F6,F8,F10,F12,F14) \ - fcmpgt32 %O12, %f12, %o4 /* FPM Group */; \ - srl %g5, 1, %g5 /* IEU0 */; \ - inc %g7 /* IEU1 */; \ - fpadd32 %F0, %f0, %F0 /* FPA */; \ - fcmpgt32 %O14, %f14, %o5 /* FPM Group */; \ - srl %g7, 1, %g7 /* IEU0 */; \ - add %o2, %g5, %o2 /* IEU1 */; \ - fpadd32 %F2, %f2, %F2 /* FPA */; \ - inc %o3 /* IEU0 Group */; \ - add %o2, %g7, %o2 /* IEU1 */; \ - fcmpgt32 %f0, %F0, %g1 /* FPM Group */; \ - srl %o3, 1, %o3 /* IEU0 */; \ - inc %o4 /* IEU1 */; \ - fpadd32 %F4, %f4, %F4 /* FPA */; \ - fcmpgt32 %f2, %F2, %g2 /* FPM Group */; \ - srl %o4, 1, %o4 /* IEU0 */; \ - add %o2, %o3, %o2 /* IEU1 */; \ - fpadd32 %F6, %f6, %F6 /* FPA */; \ - inc %o5 /* IEU0 Group */; \ - add %o2, %o4, %o2 /* IEU1 */; \ - fcmpgt32 %f4, %F4, %g3 /* FPM Group */; \ - srl %o5, 1, %o5 /* IEU0 */; \ - inc %g1 /* IEU1 */; \ - fpadd32 %F8, %f8, %F8 /* FPA */; \ - fcmpgt32 %f6, %F6, %g5 /* FPM Group */; \ - srl %g1, 1, %g1 /* IEU0 */; \ - add %o2, %o5, %o2 /* IEU1 */; \ - fpadd32 %F10, %f10, %F10 /* FPA */; \ - inc %g2 /* IEU0 Group */; \ - add %o2, %g1, %o2 /* IEU1 */; \ - fcmpgt32 %f8, %F8, %g7 /* FPM Group */; \ - srl %g2, 1, %g2 /* IEU0 */; \ - inc %g3 /* IEU1 */; \ - fpadd32 %F12, %f12, %F12 /* FPA */; \ - fcmpgt32 %f10, %F10, %o3 /* FPM Group */; \ - srl %g3, 1, %g3 /* IEU0 */; \ - add %o2, %g2, %o2 /* IEU1 */; \ - fpadd32 %F14, %f14, %F14 /* FPA */; \ - inc %g5 /* IEU0 Group */; \ - add %o2, %g3, %o2 /* IEU1 */; +#define DO_THE_TRICK(O12,O14,f0,f2,f4,f6,f8,f10,f12,f14,F0,F2,F4,F6,F8,F10,F12,F14) \ + srl %g5, 1, %g5 /* IEU0 Group */; \ + fpadd32 %F0, %f0, %F0 /* FPA */; \ + fcmpgt32 %O12, %f12, %o4 /* FPM */; \ + inc %g7 /* IEU0 Group */; \ + fpadd32 %F2, %f2, %F2 /* FPA */; \ + fcmpgt32 %O14, %f14, %o5 /* FPM */; \ + add %o2, %g5, %o2 /* IEU1 Group */; \ + fpadd32 %F4, %f4, %F4 /* FPA */; \ + fcmpgt32 %f0, %F0, %g1 /* FPM */; \ + srl %g7, 1, %g7 /* IEU0 Group */; \ + fpadd32 %F6, %f6, %F6 /* FPA */; \ + fcmpgt32 %f2, %F2, %g2 /* FPM */; \ + add %o2, %g7, %o2 /* IEU0 Group */; \ + fpadd32 %F8, %f8, %F8 /* FPA */; \ + fcmpgt32 %f4, %F4, %g3 /* FPM */; \ + inc %o3 /* IEU0 Group */; \ + fpadd32 %F10, %f10, %F10 /* FPA */; \ + fcmpgt32 %f6, %F6, %g5 /* FPM */; \ + srl %o3, 1, %o3 /* IEU0 Group */; \ + fpadd32 %F12, %f12, %F12 /* FPA */; \ + fcmpgt32 %f8, %F8, %g7 /* FPM */; \ + add %o2, %o3, %o2 /* IEU0 Group */; \ + fpadd32 %F14, %f14, %F14 /* FPA */; \ + fcmpgt32 %f10, %F10, %o3 /* FPM */; \ + inc %o4 /* IEU0 Group */; \ + inc %o5 /* IEU1 */; \ + srl %o4, 1, %o4 /* IEU0 Group */; \ + inc %g1 /* IEU1 */; \ + srl %o5, 1, %o5 /* IEU0 Group */; \ + add %o2, %o4, %o2 /* IEU1 */; \ + srl %g1, 1, %g1 /* IEU0 Group */; \ + add %o2, %o5, %o2 /* IEU1 */; \ + inc %g2 /* IEU0 Group */; \ + add %o2, %g1, %o2 /* IEU1 */; \ + srl %g2, 1, %g2 /* IEU0 Group */; \ + inc %g3 /* IEU1 */; \ + srl %g3, 1, %g3 /* IEU0 Group */; \ + add %o2, %g2, %o2 /* IEU1 */; \ + inc %g5 /* IEU0 Group */; \ + add %o2, %g3, %o2 /* IEU0 */; -#define END_THE_TRICK(O12,O14,f0,f2,f4,f6,f8,f10,f12,f14,S0,S1,S2,S3,T0,T1,U0,fz) \ - fcmpgt32 %O12, %f12, %o4 /* FPM Group */; \ - srl %g5, 1, %g5 /* IEU0 */; \ - inc %g7 /* IEU1 */; \ - fpadd32 %f2, %f0, %S0 /* FPA */; \ - fcmpgt32 %O14, %f14, %o5 /* FPM Group */; \ - srl %g7, 1, %g7 /* IEU0 */; \ - add %o2, %g5, %o2 /* IEU1 */; \ - fpadd32 %f6, %f4, %S1 /* FPA */; \ - inc %o3 /* IEU0 Group */; \ - add %o2, %g7, %o2 /* IEU1 */; \ - fcmpgt32 %f0, %S0, %g1 /* FPM Group */; \ - srl %o3, 1, %o3 /* IEU0 */; \ - inc %o4 /* IEU1 */; \ - fpadd32 %f10, %f8, %S2 /* FPA */; \ - fcmpgt32 %f4, %S1, %g2 /* FPM Group */; \ - srl %o4, 1, %o4 /* IEU0 */; \ - add %o2, %o3, %o2 /* IEU1 */; \ - fpadd32 %f14, %f12, %S3 /* FPA */; \ - inc %o5 /* IEU0 Group */; \ - add %o2, %o4, %o2 /* IEU1 */; \ - fzero %fz /* FPA */; \ - fcmpgt32 %f8, %S2, %g3 /* FPM Group */; \ - srl %o5, 1, %o5 /* IEU0 */; \ - inc %g1 /* IEU1 */; \ - fpadd32 %S0, %S1, %T0 /* FPA */; \ - fcmpgt32 %f12, %S3, %g5 /* FPM Group */; \ - srl %g1, 1, %g1 /* IEU0 */; \ - add %o2, %o5, %o2 /* IEU1 */; \ - fpadd32 %S2, %S3, %T1 /* FPA */; \ - inc %g2 /* IEU0 Group */; \ - add %o2, %g1, %o2 /* IEU1 */; \ - fcmpgt32 %S0, %T0, %g7 /* FPM Group */; \ - srl %g2, 1, %g2 /* IEU0 */; \ - inc %g3 /* IEU1 */; \ - fcmpgt32 %S2, %T1, %o3 /* FPM Group */; \ - srl %g3, 1, %g3 /* IEU0 */; \ - add %o2, %g2, %o2 /* IEU1 */; \ - inc %g5 /* IEU0 Group */; \ - add %o2, %g3, %o2 /* IEU1 */; \ - fcmpgt32 %fz, %f2, %o4 /* FPM Group */; \ - srl %g5, 1, %g5 /* IEU0 */; \ - inc %g7 /* IEU1 */; \ - fpadd32 %T0, %T1, %U0 /* FPA */; \ - fcmpgt32 %fz, %f6, %o5 /* FPM Group */; \ - srl %g7, 1, %g7 /* IEU0 */; \ - add %o2, %g5, %o2 /* IEU1 */; \ - inc %o3 /* IEU0 Group */; \ - add %o2, %g7, %o2 /* IEU1 */; \ - fcmpgt32 %fz, %f10, %g1 /* FPM Group */; \ - srl %o3, 1, %o3 /* IEU0 */; \ - inc %o4 /* IEU1 */; \ - fcmpgt32 %fz, %f14, %g2 /* FPM Group */; \ - srl %o4, 1, %o4 /* IEU0 */; \ - add %o2, %o3, %o2 /* IEU1 */; \ - std %U0, [%sp + STACKOFF] /* Store Group */; \ - inc %o5 /* IEU0 */; \ - sub %o2, %o4, %o2 /* IEU1 */; \ - fcmpgt32 %fz, %S1, %g3 /* FPM Group */; \ - srl %o5, 1, %o5 /* IEU0 */; \ - inc %g1 /* IEU1 */; \ - fcmpgt32 %fz, %S3, %g5 /* FPM Group */; \ - srl %g1, 1, %g1 /* IEU0 */; \ - sub %o2, %o5, %o2 /* IEU1 */; \ - ldx [%sp + STACKOFF], %o5 /* Load Group */; \ - inc %g2 /* IEU0 */; \ - sub %o2, %g1, %o2 /* IEU1 */; \ - fcmpgt32 %fz, %T1, %g7 /* FPM Group */; \ - srl %g2, 1, %g2 /* IEU0 */; \ - inc %g3 /* IEU1 */; \ - fcmpgt32 %T0, %U0, %o3 /* FPM Group */; \ - srl %g3, 1, %g3 /* IEU0 */; \ - sub %o2, %g2, %o2 /* IEU1 */; \ - inc %g5 /* IEU0 Group */; \ - sub %o2, %g3, %o2 /* IEU1 */; \ - fcmpgt32 %fz, %U0, %o4 /* FPM Group */; \ - srl %g5, 1, %g5 /* IEU0 */; \ - inc %g7 /* IEU1 */; \ - srl %g7, 1, %g7 /* IEU0 Group */; \ - sub %o2, %g5, %o2 /* IEU1 */; \ - inc %o3 /* IEU0 Group */; \ - sub %o2, %g7, %o2 /* IEU1 */; \ - srl %o3, 1, %o3 /* IEU0 Group */; \ - inc %o4 /* IEU1 */; \ - srl %o4, 1, %o4 /* IEU0 Group */; \ - add %o2, %o3, %o2 /* IEU1 */; \ - sub %o2, %o4, %o2 /* IEU0 Group */; \ - addcc %o2, %o5, %o2 /* IEU1 Group */; \ - bcs,a,pn %xcc, 33f /* CTI */; \ - add %o2, 1, %o2 /* IEU0 */; \ -33: /* That's it */; +#define END_THE_TRICK(O12,O14,f0,f2,f4,f6,f8,f10,f12,f14,S0,S1,S2,S3,T0,T1,U0,fz) \ + srl %g5, 1, %g5 /* IEU0 Group */; \ + fpadd32 %f2, %f0, %S0 /* FPA */; \ + fcmpgt32 %O12, %f12, %o4 /* FPM */; \ + inc %g7 /* IEU0 Group */; \ + fpadd32 %f6, %f4, %S1 /* FPA */; \ + fcmpgt32 %O14, %f14, %o5 /* FPM */; \ + srl %g7, 1, %g7 /* IEU0 Group */; \ + fpadd32 %f10, %f8, %S2 /* FPA */; \ + fcmpgt32 %f0, %S0, %g1 /* FPM */; \ + inc %o3 /* IEU0 Group */; \ + fpadd32 %f14, %f12, %S3 /* FPA */; \ + fcmpgt32 %f4, %S1, %g2 /* FPM */; \ + add %o2, %g5, %o2 /* IEU0 Group */; \ + fpadd32 %S0, %S1, %T0 /* FPA */; \ + fcmpgt32 %f8, %S2, %g3 /* FPM */; \ + add %o2, %g7, %o2 /* IEU0 Group */; \ + fzero %fz /* FPA */; \ + fcmpgt32 %f12, %S3, %g5 /* FPM */; \ + srl %o3, 1, %o3 /* IEU0 Group */; \ + fpadd32 %S2, %S3, %T1 /* FPA */; \ + fcmpgt32 %S0, %T0, %g7 /* FPM */; \ + add %o2, %o3, %o2 /* IEU0 Group */; \ + fpadd32 %T0, %T1, %U0 /* FPA */; \ + fcmpgt32 %S2, %T1, %o3 /* FPM */; \ + inc %o4 /* IEU0 Group */; \ + inc %o5 /* IEU1 */; \ + srl %o4, 1, %o4 /* IEU0 Group */; \ + inc %g1 /* IEU1 */; \ + add %o2, %o4, %o2 /* IEU0 Group */; \ + fcmpgt32 %fz, %f2, %o4 /* FPM */; \ + srl %o5, 1, %o5 /* IEU0 Group */; \ + inc %g2 /* IEU1 */; \ + add %o2, %o5, %o2 /* IEU0 Group */; \ + fcmpgt32 %fz, %f6, %o5 /* FPM */; \ + srl %g1, 1, %g1 /* IEU0 Group */; \ + inc %g3 /* IEU1 */; \ + add %o2, %g1, %o2 /* IEU0 Group */; \ + fcmpgt32 %fz, %f10, %g1 /* FPM */; \ + srl %g2, 1, %g2 /* IEU0 Group */; \ + inc %g5 /* IEU1 */; \ + add %o2, %g2, %o2 /* IEU0 Group */; \ + fcmpgt32 %fz, %f14, %g2 /* FPM */; \ + srl %g3, 1, %g3 /* IEU0 Group */; \ + inc %g7 /* IEU1 */; \ + add %o2, %g3, %o2 /* IEU0 Group */; \ + fcmpgt32 %fz, %S1, %g3 /* FPM */; \ + srl %g5, 1, %g5 /* IEU0 Group */; \ + inc %o3 /* IEU1 */; \ + add %o2, %g5, %o2 /* IEU0 Group */; \ + fcmpgt32 %fz, %S3, %g5 /* FPM */; \ + srl %g7, 1, %g7 /* IEU0 Group */; \ + inc %o4 /* IEU1 */; \ + add %o2, %g7, %o2 /* IEU0 Group */; \ + fcmpgt32 %fz, %T1, %g7 /* FPM */; \ + srl %o3, 1, %o3 /* IEU0 Group */; \ + inc %o5 /* IEU1 */; \ + add %o2, %o3, %o2 /* IEU0 Group */; \ + fcmpgt32 %T0, %U0, %o3 /* FPM */; \ + srl %o4, 1, %o4 /* IEU0 Group */; \ + inc %g1 /* IEU1 */; \ + sub %o2, %o4, %o2 /* IEU0 Group */; \ + fcmpgt32 %fz, %U0, %o4 /* FPM */; \ + srl %o5, 1, %o5 /* IEU0 Group */; \ + inc %g2 /* IEU1 */; \ + srl %g1, 1, %g1 /* IEU0 Group */; \ + sub %o2, %o5, %o2 /* IEU1 */; \ + std %U0, [%sp + STACKOFF] /* Store */; \ + srl %g2, 1, %g2 /* IEU0 Group */; \ + sub %o2, %g1, %o2 /* IEU1 */; \ + inc %g3 /* IEU0 Group */; \ + sub %o2, %g2, %o2 /* IEU1 */; \ + srl %g3, 1, %g3 /* IEU0 Group */; \ + inc %g5 /* IEU1 */; \ + srl %g5, 1, %g5 /* IEU0 Group */; \ + sub %o2, %g3, %o2 /* IEU1 */; \ + ldx [%sp + STACKOFF], %o5 /* Load Group */; \ + inc %g7 /* IEU0 */; \ + sub %o2, %g5, %o2 /* IEU1 */; \ + srl %g7, 1, %g7 /* IEU0 Group */; \ + inc %o3 /* IEU1 */; \ + srl %o3, 1, %o3 /* IEU0 Group */; \ + sub %o2, %g7, %o2 /* IEU1 */; \ + inc %o4 /* IEU0 Group */; \ + add %o2, %o3, %o2 /* IEU1 */; \ + srl %o4, 1, %o4 /* IEU0 Group */; \ + sub %o2, %o4, %o2 /* IEU0 Group */; \ + addcc %o2, %o5, %o2 /* IEU1 Group */; \ + bcs,a,pn %xcc, 33f /* CTI */; \ + add %o2, 1, %o2 /* IEU0 */; \ +33: /* That's it */; -#define CSUM_LASTCHUNK(offset) \ - ldx [%o0 - offset - 0x10], %g2; \ - ldx [%o0 - offset - 0x08], %g3; \ - addcc %g2, %o2, %o2; \ - bcs,a,pn %xcc, 31f; \ - add %o2, 1, %o2; \ -31: addcc %g3, %o2, %o2; \ - bcs,a,pn %xcc, 32f; \ - add %o2, 1, %o2; \ +#define CSUM_LASTCHUNK(offset) \ + ldx [%o0 - offset - 0x10], %g2; \ + ldx [%o0 - offset - 0x08], %g3; \ + addcc %g2, %o2, %o2; \ + bcs,a,pn %xcc, 31f; \ + add %o2, 1, %o2; \ +31: addcc %g3, %o2, %o2; \ + bcs,a,pn %xcc, 32f; \ + add %o2, 1, %o2; \ 32: .text .globl csum_partial .align 32 csum_partial: - andcc %o0, 7, %g0 /* IEU1 Group */ - be,pt %icc, 4f /* CTI */ - andcc %o0, 0x38, %g3 /* IEU1 */ - mov 1, %g5 /* IEU0 Group */ - cmp %o1, 6 /* IEU1 */ - bl,pn %icc, 21f /* CTI */ - andcc %o0, 2, %g0 /* IEU1 Group */ - be,pt %icc, 1f /* CTI */ - and %o0, 4, %g7 /* IEU0 */ - lduh [%o0], %g2 /* Load */ - sub %o1, 2, %o1 /* IEU0 Group */ - add %o0, 2, %o0 /* IEU1 */ - andcc %o0, 4, %g7 /* IEU1 Group */ - sll %g5, 16, %g5 /* IEU0 */ - sll %g2, 16, %g2 /* IEU0 Group */ - addcc %g2, %o2, %o2 /* IEU1 Group (regdep) */ - bcs,a,pn %icc, 1f /* CTI */ - add %o2, %g5, %o2 /* IEU0 */ -1: ld [%o0], %g2 /* Load */ - brz,a,pn %g7, 4f /* CTI+IEU1 Group */ - and %o0, 0x38, %g3 /* IEU0 */ - add %o0, 4, %o0 /* IEU0 Group */ - sub %o1, 4, %o1 /* IEU1 */ - addcc %g2, %o2, %o2 /* IEU1 Group */ - bcs,a,pn %icc, 1f /* CTI */ - add %o2, 1, %o2 /* IEU0 */ -1: and %o0, 0x38, %g3 /* IEU1 Group */ -4: srl %o2, 0, %o2 /* IEU0 Group */ - mov 0x40, %g1 /* IEU1 */ - brz,pn %g3, 3f /* CTI+IEU1 Group */ - sub %g1, %g3, %g1 /* IEU0 */ - cmp %o1, 56 /* IEU1 Group */ - blu,pn %icc, 20f /* CTI */ - andcc %o0, 8, %g0 /* IEU1 Group */ - be,pn %icc, 1f /* CTI */ - ldx [%o0], %g2 /* Load */ - add %o0, 8, %o0 /* IEU0 Group */ - sub %o1, 8, %o1 /* IEU1 */ - addcc %g2, %o2, %o2 /* IEU1 Group */ - bcs,a,pn %xcc, 1f /* CTI */ - add %o2, 1, %o2 /* IEU0 */ -1: andcc %g1, 0x10, %g0 /* IEU1 Group */ - be,pn %icc, 2f /* CTI */ - and %g1, 0x20, %g1 /* IEU0 */ - ldx [%o0], %g2 /* Load */ - ldx [%o0+8], %g3 /* Load Group */ - add %o0, 16, %o0 /* IEU0 */ - sub %o1, 16, %o1 /* IEU1 */ - addcc %g2, %o2, %o2 /* IEU1 Group */ - bcs,a,pn %xcc, 1f /* CTI */ - add %o2, 1, %o2 /* IEU0 */ -1: addcc %g3, %o2, %o2 /* IEU1 Group */ - bcs,a,pn %xcc, 2f /* CTI */ - add %o2, 1, %o2 /* IEU0 */ -2: brz,pn %g1, 3f /* CTI+IEU1 Group */ - ldx [%o0], %g2 /* Load */ - ldx [%o0+8], %g3 /* Load Group */ - ldx [%o0+16], %g5 /* Load Group */ - ldx [%o0+24], %g7 /* Load Group */ - add %o0, 32, %o0 /* IEU0 */ - sub %o1, 32, %o1 /* IEU1 */ - addcc %g2, %o2, %o2 /* IEU1 Group */ - bcs,a,pn %xcc, 1f /* CTI */ - add %o2, 1, %o2 /* IEU0 */ -1: addcc %g3, %o2, %o2 /* IEU1 Group */ - bcs,a,pn %xcc, 1f /* CTI */ - add %o2, 1, %o2 /* IEU0 */ -1: addcc %g5, %o2, %o2 /* IEU1 Group */ - bcs,a,pn %xcc, 1f /* CTI */ - add %o2, 1, %o2 /* IEU0 */ -1: addcc %g7, %o2, %o2 /* IEU1 Group */ - bcs,a,pn %xcc, 3f /* CTI */ - add %o2, 1, %o2 /* IEU0 */ -3: cmp %o1, 0xc0 /* IEU1 Group */ - blu,pn %icc, 20f /* CTI */ - sllx %o2, 32, %g5 /* IEU0 */ + andcc %o0, 7, %g0 /* IEU1 Group */ + be,pt %icc, 4f /* CTI */ + andcc %o0, 0x38, %g3 /* IEU1 */ + mov 1, %g5 /* IEU0 Group */ + cmp %o1, 6 /* IEU1 */ + bl,pn %icc, 21f /* CTI */ + andcc %o0, 2, %g0 /* IEU1 Group */ + be,pt %icc, 1f /* CTI */ + and %o0, 4, %g7 /* IEU0 */ + lduh [%o0], %g2 /* Load */ + sub %o1, 2, %o1 /* IEU0 Group */ + add %o0, 2, %o0 /* IEU1 */ + andcc %o0, 4, %g7 /* IEU1 Group */ + sll %g5, 16, %g5 /* IEU0 */ + sll %g2, 16, %g2 /* IEU0 Group */ + addcc %g2, %o2, %o2 /* IEU1 Group (regdep) */ + bcs,a,pn %icc, 1f /* CTI */ + add %o2, %g5, %o2 /* IEU0 */ +1: ld [%o0], %g2 /* Load */ + brz,a,pn %g7, 4f /* CTI+IEU1 Group */ + and %o0, 0x38, %g3 /* IEU0 */ + add %o0, 4, %o0 /* IEU0 Group */ + sub %o1, 4, %o1 /* IEU1 */ + addcc %g2, %o2, %o2 /* IEU1 Group */ + bcs,a,pn %icc, 1f /* CTI */ + add %o2, 1, %o2 /* IEU0 */ +1: and %o0, 0x38, %g3 /* IEU1 Group */ +4: srl %o2, 0, %o2 /* IEU0 Group */ + mov 0x40, %g1 /* IEU1 */ + brz,pn %g3, 3f /* CTI+IEU1 Group */ + sub %g1, %g3, %g1 /* IEU0 */ + cmp %o1, 56 /* IEU1 Group */ + blu,pn %icc, 20f /* CTI */ + andcc %o0, 8, %g0 /* IEU1 Group */ + be,pn %icc, 1f /* CTI */ + ldx [%o0], %g2 /* Load */ + add %o0, 8, %o0 /* IEU0 Group */ + sub %o1, 8, %o1 /* IEU1 */ + addcc %g2, %o2, %o2 /* IEU1 Group */ + bcs,a,pn %xcc, 1f /* CTI */ + add %o2, 1, %o2 /* IEU0 */ +1: andcc %g1, 0x10, %g0 /* IEU1 Group */ + be,pn %icc, 2f /* CTI */ + and %g1, 0x20, %g1 /* IEU0 */ + ldx [%o0], %g2 /* Load */ + ldx [%o0+8], %g3 /* Load Group */ + add %o0, 16, %o0 /* IEU0 */ + sub %o1, 16, %o1 /* IEU1 */ + addcc %g2, %o2, %o2 /* IEU1 Group */ + bcs,a,pn %xcc, 1f /* CTI */ + add %o2, 1, %o2 /* IEU0 */ +1: addcc %g3, %o2, %o2 /* IEU1 Group */ + bcs,a,pn %xcc, 2f /* CTI */ + add %o2, 1, %o2 /* IEU0 */ +2: brz,pn %g1, 3f /* CTI+IEU1 Group */ + ldx [%o0], %g2 /* Load */ + ldx [%o0+8], %g3 /* Load Group */ + ldx [%o0+16], %g5 /* Load Group */ + ldx [%o0+24], %g7 /* Load Group */ + add %o0, 32, %o0 /* IEU0 */ + sub %o1, 32, %o1 /* IEU1 */ + addcc %g2, %o2, %o2 /* IEU1 Group */ + bcs,a,pn %xcc, 1f /* CTI */ + add %o2, 1, %o2 /* IEU0 */ +1: addcc %g3, %o2, %o2 /* IEU1 Group */ + bcs,a,pn %xcc, 1f /* CTI */ + add %o2, 1, %o2 /* IEU0 */ +1: addcc %g5, %o2, %o2 /* IEU1 Group */ + bcs,a,pn %xcc, 1f /* CTI */ + add %o2, 1, %o2 /* IEU0 */ +1: addcc %g7, %o2, %o2 /* IEU1 Group */ + bcs,a,pn %xcc, 3f /* CTI */ + add %o2, 1, %o2 /* IEU0 */ +3: cmp %o1, 0xc0 /* IEU1 Group */ + blu,pn %icc, 20f /* CTI */ + sllx %o2, 32, %g5 /* IEU0 */ #ifdef __KERNEL__ VISEntry #endif - addcc %o2, %g5, %o2 /* IEU1 Group */ - sub %o1, 0xc0, %o1 /* IEU0 */ - wr %g0, ASI_BLK_P, %asi /* LSU Group */ - membar #StoreLoad /* LSU Group */ - srlx %o2, 32, %o2 /* IEU0 Group */ - bcs,a,pn %xcc, 1f /* CTI */ - add %o2, 1, %o2 /* IEU1 */ -1: andcc %o1, 0x80, %g0 /* IEU1 Group */ - bne,pn %icc, 7f /* CTI */ - andcc %o1, 0x40, %g0 /* IEU1 Group */ - be,pn %icc, 6f /* CTI */ - fzero %f12 /* FPA */ - fzero %f14 /* FPA Group */ + addcc %o2, %g5, %o2 /* IEU1 Group */ + sub %o1, 0xc0, %o1 /* IEU0 */ + wr %g0, ASI_BLK_P, %asi /* LSU Group */ + membar #StoreLoad /* LSU Group */ + srlx %o2, 32, %o2 /* IEU0 Group */ + bcs,a,pn %xcc, 1f /* CTI */ + add %o2, 1, %o2 /* IEU1 */ +1: andcc %o1, 0x80, %g0 /* IEU1 Group */ + bne,pn %icc, 7f /* CTI */ + andcc %o1, 0x40, %g0 /* IEU1 Group */ + be,pn %icc, 6f /* CTI */ + fzero %f12 /* FPA */ + fzero %f14 /* FPA Group */ ldda [%o0 + 0x000] %asi, %f16 ldda [%o0 + 0x040] %asi, %f32 ldda [%o0 + 0x080] %asi, %f48 START_THE_TRICK(f12,f16,f18,f20,f22,f24,f26) ba,a,pt %xcc, 3f -6: sub %o0, 0x40, %o0 /* IEU0 Group */ - fzero %f28 /* FPA */ - fzero %f30 /* FPA Group */ +6: sub %o0, 0x40, %o0 /* IEU0 Group */ + fzero %f28 /* FPA */ + fzero %f30 /* FPA Group */ ldda [%o0 + 0x040] %asi, %f32 ldda [%o0 + 0x080] %asi, %f48 ldda [%o0 + 0x0c0] %asi, %f0 START_THE_TRICK(f28,f32,f34,f36,f38,f40,f42) ba,a,pt %xcc, 4f -7: bne,pt %icc, 8f /* CTI */ - fzero %f44 /* FPA */ - add %o0, 0x40, %o0 /* IEU0 Group */ - fzero %f60 /* FPA */ - fzero %f62 /* FPA Group */ +7: bne,pt %icc, 8f /* CTI */ + fzero %f44 /* FPA */ + add %o0, 0x40, %o0 /* IEU0 Group */ + fzero %f60 /* FPA */ + fzero %f62 /* FPA Group */ ldda [%o0 - 0x040] %asi, %f0 ldda [%o0 + 0x000] %asi, %f16 ldda [%o0 + 0x040] %asi, %f32 START_THE_TRICK(f60,f0,f2,f4,f6,f8,f10) ba,a,pt %xcc, 2f -8: add %o0, 0x80, %o0 /* IEU0 Group */ - fzero %f46 /* FPA */ +8: add %o0, 0x80, %o0 /* IEU0 Group */ + fzero %f46 /* FPA */ ldda [%o0 - 0x080] %asi, %f48 ldda [%o0 - 0x040] %asi, %f0 ldda [%o0 + 0x000] %asi, %f16 @@ -333,36 +334,36 @@ csum_partial: 3: DO_THE_TRICK(f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46) ldda [%o0 + 0x0c0] %asi, %f0 4: DO_THE_TRICK(f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,f48,f50,f52,f54,f56,f58,f60,f62) - add %o0, 0x100, %o0 /* IEU0 Group */ - subcc %o1, 0x100, %o1 /* IEU1 */ - bgeu,a,pt %icc, 1b /* CTI */ + add %o0, 0x100, %o0 /* IEU0 Group */ + subcc %o1, 0x100, %o1 /* IEU1 */ + bgeu,a,pt %icc, 1b /* CTI */ ldda [%o0 + 0x000] %asi, %f16 - membar #Sync /* LSU Group */ + membar #Sync /* LSU Group */ DO_THE_TRICK(f44,f46,f48,f50,f52,f54,f56,f58,f60,f62,f0,f2,f4,f6,f8,f10,f12,f14) END_THE_TRICK(f60,f62,f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30) #ifdef __KERNEL__ ldub [%g6 + AOFF_task_thread + AOFF_thread_current_ds], %g7 #endif - and %o1, 0x3f, %o1 /* IEU0 Group */ + and %o1, 0x3f, %o1 /* IEU0 Group */ #ifdef __KERNEL__ VISExit wr %g7, %g0, %asi #endif -20: andcc %o1, 0xf0, %g1 /* IEU1 Group */ - be,pn %icc, 23f /* CTI */ - and %o1, 0xf, %o3 /* IEU0 */ +20: andcc %o1, 0xf0, %g1 /* IEU1 Group */ + be,pn %icc, 23f /* CTI */ + and %o1, 0xf, %o3 /* IEU0 */ #ifdef __KERNEL__ -22: sll %g1, 1, %o4 /* IEU0 Group */ - sethi %hi(23f), %g7 /* IEU1 */ - sub %g7, %o4, %g7 /* IEU0 Group */ - jmpl %g7 + %lo(23f), %g0 /* CTI Group brk forced */ - add %o0, %g1, %o0 /* IEU0 */ +22: sll %g1, 1, %o4 /* IEU0 Group */ + sethi %hi(23f), %g7 /* IEU1 */ + sub %g7, %o4, %g7 /* IEU0 Group */ + jmpl %g7 + %lo(23f), %g0 /* CTI Group brk forced*/ + add %o0, %g1, %o0 /* IEU0 */ #else -22: rd %pc, %g7 /* LSU Group+4bubbles */ - sll %g1, 1, %o4 /* IEU0 Group */ - sub %g7, %o4, %g7 /* IEU0 Group (regdep) */ - jmpl %g7 + (23f - 22b), %g0 /* CTI Group brk forced */ - add %o0, %g1, %o0 /* IEU0 */ +22: rd %pc, %g7 /* LSU Group+4bubbles */ + sll %g1, 1, %o4 /* IEU0 Group */ + sub %g7, %o4, %g7 /* IEU0 Group (regdep) */ + jmpl %g7 + (23f - 22b), %g0 /* CTI Group brk forced*/ + add %o0, %g1, %o0 /* IEU0 */ #endif CSUM_LASTCHUNK(0xe0) CSUM_LASTCHUNK(0xd0) @@ -379,72 +380,72 @@ csum_partial: CSUM_LASTCHUNK(0x20) CSUM_LASTCHUNK(0x10) CSUM_LASTCHUNK(0x00) -23: brnz,pn %o3, 26f /* CTI+IEU1 Group */ -24: sllx %o2, 32, %g1 /* IEU0 */ -25: addcc %o2, %g1, %o0 /* IEU1 Group */ - srlx %o0, 32, %o0 /* IEU0 Group (regdep) */ - bcs,a,pn %xcc, 1f /* CTI */ - add %o0, 1, %o0 /* IEU1 */ -1: retl /* CTI Group brk forced */ - srl %o0, 0, %o0 /* IEU0 */ -26: andcc %o1, 8, %g0 /* IEU1 Group */ - be,pn %icc, 1f /* CTI */ - ldx [%o0], %g3 /* Load */ - add %o0, 8, %o0 /* IEU0 Group */ - addcc %g3, %o2, %o2 /* IEU1 Group */ - bcs,a,pn %xcc, 1f /* CTI */ - add %o2, 1, %o2 /* IEU0 */ -1: andcc %o1, 4, %g0 /* IEU1 Group */ - be,a,pn %icc, 1f /* CTI */ - clr %g2 /* IEU0 */ - ld [%o0], %g2 /* Load */ - add %o0, 4, %o0 /* IEU0 Group */ - sllx %g2, 32, %g2 /* IEU0 Group */ -1: andcc %o1, 2, %g0 /* IEU1 */ - be,a,pn %icc, 1f /* CTI */ - clr %o4 /* IEU0 Group */ - lduh [%o0], %o4 /* Load */ - add %o0, 2, %o0 /* IEU1 */ - sll %o4, 16, %o4 /* IEU0 Group */ -1: andcc %o1, 1, %g0 /* IEU1 */ - be,a,pn %icc, 1f /* CTI */ - clr %o5 /* IEU0 Group */ - ldub [%o0], %o5 /* Load */ - sll %o5, 8, %o5 /* IEU0 Group */ -1: or %g2, %o4, %o4 /* IEU1 */ - or %o5, %o4, %o4 /* IEU0 Group (regdep) */ - addcc %o4, %o2, %o2 /* IEU1 Group (regdep) */ - bcs,a,pn %xcc, 1f /* CTI */ - add %o2, 1, %o2 /* IEU0 */ -1: ba,pt %xcc, 25b /* CTI Group */ - sllx %o2, 32, %g1 /* IEU0 */ -21: srl %o2, 0, %o2 /* IEU0 Group */ - cmp %o1, 0 /* IEU1 */ - be,pn %icc, 24b /* CTI */ - andcc %o1, 4, %g0 /* IEU1 Group */ - be,a,pn %icc, 1f /* CTI */ - clr %g2 /* IEU0 */ - lduh [%o0], %g3 /* Load */ - lduh [%o0+2], %g2 /* Load Group */ - add %o0, 4, %o0 /* IEU0 Group */ - sllx %g3, 48, %g3 /* IEU0 Group */ - sllx %g2, 32, %g2 /* IEU0 Group */ - or %g3, %g2, %g2 /* IEU0 Group */ -1: andcc %o1, 2, %g0 /* IEU1 */ - be,a,pn %icc, 1f /* CTI */ - clr %o4 /* IEU0 Group */ - lduh [%o0], %o4 /* Load */ - add %o0, 2, %o0 /* IEU1 */ - sll %o4, 16, %o4 /* IEU0 Group */ -1: andcc %o1, 1, %g0 /* IEU1 */ - be,a,pn %icc, 1f /* CTI */ - clr %o5 /* IEU0 Group */ - ldub [%o0], %o5 /* Load */ - sll %o5, 8, %o5 /* IEU0 Group */ -1: or %g2, %o4, %o4 /* IEU1 */ - or %o5, %o4, %o4 /* IEU0 Group (regdep) */ - addcc %o4, %o2, %o2 /* IEU1 Group (regdep) */ - bcs,a,pn %xcc, 1f /* CTI */ - add %o2, 1, %o2 /* IEU0 */ -1: ba,pt %xcc, 25b /* CTI Group */ - sllx %o2, 32, %g1 /* IEU0 */ +23: brnz,pn %o3, 26f /* CTI+IEU1 Group */ +24: sllx %o2, 32, %g1 /* IEU0 */ +25: addcc %o2, %g1, %o0 /* IEU1 Group */ + srlx %o0, 32, %o0 /* IEU0 Group (regdep) */ + bcs,a,pn %xcc, 1f /* CTI */ + add %o0, 1, %o0 /* IEU1 */ +1: retl /* CTI Group brk forced*/ + srl %o0, 0, %o0 /* IEU0 */ +26: andcc %o1, 8, %g0 /* IEU1 Group */ + be,pn %icc, 1f /* CTI */ + ldx [%o0], %g3 /* Load */ + add %o0, 8, %o0 /* IEU0 Group */ + addcc %g3, %o2, %o2 /* IEU1 Group */ + bcs,a,pn %xcc, 1f /* CTI */ + add %o2, 1, %o2 /* IEU0 */ +1: andcc %o1, 4, %g0 /* IEU1 Group */ + be,a,pn %icc, 1f /* CTI */ + clr %g2 /* IEU0 */ + ld [%o0], %g2 /* Load */ + add %o0, 4, %o0 /* IEU0 Group */ + sllx %g2, 32, %g2 /* IEU0 Group */ +1: andcc %o1, 2, %g0 /* IEU1 */ + be,a,pn %icc, 1f /* CTI */ + clr %o4 /* IEU0 Group */ + lduh [%o0], %o4 /* Load */ + add %o0, 2, %o0 /* IEU1 */ + sll %o4, 16, %o4 /* IEU0 Group */ +1: andcc %o1, 1, %g0 /* IEU1 */ + be,a,pn %icc, 1f /* CTI */ + clr %o5 /* IEU0 Group */ + ldub [%o0], %o5 /* Load */ + sll %o5, 8, %o5 /* IEU0 Group */ +1: or %g2, %o4, %o4 /* IEU1 */ + or %o5, %o4, %o4 /* IEU0 Group (regdep) */ + addcc %o4, %o2, %o2 /* IEU1 Group (regdep) */ + bcs,a,pn %xcc, 1f /* CTI */ + add %o2, 1, %o2 /* IEU0 */ +1: ba,pt %xcc, 25b /* CTI Group */ + sllx %o2, 32, %g1 /* IEU0 */ +21: srl %o2, 0, %o2 /* IEU0 Group */ + cmp %o1, 0 /* IEU1 */ + be,pn %icc, 24b /* CTI */ + andcc %o1, 4, %g0 /* IEU1 Group */ + be,a,pn %icc, 1f /* CTI */ + clr %g2 /* IEU0 */ + lduh [%o0], %g3 /* Load */ + lduh [%o0+2], %g2 /* Load Group */ + add %o0, 4, %o0 /* IEU0 Group */ + sllx %g3, 48, %g3 /* IEU0 Group */ + sllx %g2, 32, %g2 /* IEU0 Group */ + or %g3, %g2, %g2 /* IEU0 Group */ +1: andcc %o1, 2, %g0 /* IEU1 */ + be,a,pn %icc, 1f /* CTI */ + clr %o4 /* IEU0 Group */ + lduh [%o0], %o4 /* Load */ + add %o0, 2, %o0 /* IEU1 */ + sll %o4, 16, %o4 /* IEU0 Group */ +1: andcc %o1, 1, %g0 /* IEU1 */ + be,a,pn %icc, 1f /* CTI */ + clr %o5 /* IEU0 Group */ + ldub [%o0], %o5 /* Load */ + sll %o5, 8, %o5 /* IEU0 Group */ +1: or %g2, %o4, %o4 /* IEU1 */ + or %o5, %o4, %o4 /* IEU0 Group (regdep) */ + addcc %o4, %o2, %o2 /* IEU1 Group (regdep) */ + bcs,a,pn %xcc, 1f /* CTI */ + add %o2, 1, %o2 /* IEU0 */ +1: ba,pt %xcc, 25b /* CTI Group */ + sllx %o2, 32, %g1 /* IEU0 */ diff --git a/arch/sparc64/lib/VIScsumcopy.S b/arch/sparc64/lib/VIScsumcopy.S index 3f89eea29..9b0193022 100644 --- a/arch/sparc64/lib/VIScsumcopy.S +++ b/arch/sparc64/lib/VIScsumcopy.S @@ -1,4 +1,4 @@ -/* $Id: VIScsumcopy.S,v 1.7 2000/01/19 04:06:03 davem Exp $ +/* $Id: VIScsumcopy.S,v 1.8 2000/02/20 23:21:39 davem Exp $ * VIScsumcopy.S: High bandwidth IP checksumming with simultaneous * copying utilizing the UltraSparc Visual Instruction Set. * @@ -62,384 +62,386 @@ * per 64bytes checksummed/copied. */ -#define LDBLK(O0) \ - ldda [%src] %asi, %O0 /* Load Group */ +#define LDBLK(O0) \ + ldda [%src] %asi, %O0 /* Load Group */ -#define STBLK \ - stda %f48, [%dst] ASI_BLK_P /* Store */ +#define STBLK \ + stda %f48, [%dst] ASI_BLK_P /* Store */ -#define ST(fx,off) \ - std %fx, [%dst + off] /* Store */ +#define ST(fx,off) \ + std %fx, [%dst + off] /* Store */ -#define SYNC \ +#define SYNC \ membar #Sync #define DO_THE_TRICK(f0,f2,f4,f6,f8,f10,f12,f14,F0,F2,F4,F6,F8,F10,F12,F14,DUMMY1,A0,A2,A4,A6,A8,A10,A12,A14,B14,DUMMY2,LOAD,STORE1,STORE2,STORE3,STORE4,STORE5,STORE6,STORE7,STORE8,DUMMY3,BRANCH...) \ - LOAD /* Load Group */; \ - faligndata %A14, %F0, %A14 /* FPA Group */; \ - inc %x5 /* IEU0 */; \ - STORE1 /* Store (optional) */; \ - faligndata %F0, %F2, %A0 /* FPA Group */; \ - srl %x5, 1, %x5 /* IEU0 */; \ - add %sum, %x4, %sum /* IEU1 */; \ - fpadd32 %F0, %f0, %F0 /* FPA Group */; \ - inc %x6 /* IEU0 */; \ - STORE2 /* Store (optional) */; \ - faligndata %F2, %F4, %A2 /* FPA Group */; \ - srl %x6, 1, %x6 /* IEU0 */; \ - add %sum, %x5, %sum /* IEU1 */; \ - fpadd32 %F2, %f2, %F2 /* FPA Group */; \ - add %src, 64, %src /* IEU0 */; \ - add %dst, 64, %dst /* IEU1 */; \ - fcmpgt32 %f0, %F0, %x1 /* FPM Group */; \ - inc %x7 /* IEU0 */; \ - STORE3 /* Store (optional) */; \ - faligndata %F4, %F6, %A4 /* FPA */; \ - srl %x7, 1, %x7 /* IEU0 Group */; \ - add %sum, %x6, %sum /* IEU1 */; \ - fpadd32 %F4, %f4, %F4 /* FPA */; \ - fcmpgt32 %f2, %F2, %x2 /* FPM Group */; \ - inc %x8 /* IEU0 */; \ - STORE4 /* Store (optional) */; \ - faligndata %F6, %F8, %A6 /* FPA */; \ - srl %x8, 1, %x8 /* IEU0 Group */; \ - add %sum, %x7, %sum /* IEU1 */; \ - fpadd32 %F6, %f6, %F6 /* FPA */; \ - fcmpgt32 %f4, %F4, %x3 /* FPM Group */; \ - inc %x1 /* IEU0 */; \ - STORE5 /* Store (optional) */; \ - faligndata %F8, %F10, %A8 /* FPA */; \ - srl %x1, 1, %x1 /* IEU0 Group */; \ - add %sum, %x8, %sum /* IEU1 */; \ - fpadd32 %F8, %f8, %F8 /* FPA */; \ - fcmpgt32 %f6, %F6, %x4 /* FPM Group */; \ - inc %x2 /* IEU0 */; \ - STORE6 /* Store (optional) */; \ - faligndata %F10, %F12, %A10 /* FPA */; \ - srl %x2, 1, %x2 /* IEU0 Group */; \ - add %sum, %x1, %sum /* IEU1 */; \ - fpadd32 %F10, %f10, %F10 /* FPA */; \ - fcmpgt32 %f8, %F8, %x5 /* FPM Group */; \ - inc %x3 /* IEU0 */; \ - STORE7 /* Store (optional) */; \ - faligndata %F12, %F14, %A12 /* FPA */; \ - srl %x3, 1, %x3 /* IEU0 Group */; \ - add %sum, %x2, %sum /* IEU1 */; \ - fpadd32 %F12, %f12, %F12 /* FPA */; \ - fcmpgt32 %f10, %F10, %x6 /* FPM Group */; \ - inc %x4 /* IEU0 */; \ - STORE8 /* Store (optional) */; \ - fmovd %F14, %B14 /* FPA */; \ - srl %x4, 1, %x4 /* IEU0 Group */; \ - add %sum, %x3, %sum /* IEU1 */; \ - fpadd32 %F14, %f14, %F14 /* FPA */; \ - fcmpgt32 %f12, %F12, %x7 /* FPM Group */; \ - subcc %len, 64, %len /* IEU1 */; \ - BRANCH /* CTI */; \ - fcmpgt32 %f14, %F14, %x8 /* FPM Group */; \ + LOAD /* Load (Group) */; \ + faligndata %A14, %F0, %A14 /* FPA Group */; \ + inc %x5 /* IEU0 */; \ + STORE1 /* Store (optional) */; \ + faligndata %F0, %F2, %A0 /* FPA Group */; \ + srl %x5, 1, %x5 /* IEU0 */; \ + add %sum, %x4, %sum /* IEU1 */; \ + fpadd32 %F0, %f0, %F0 /* FPA Group */; \ + inc %x6 /* IEU0 */; \ + STORE2 /* Store (optional) */; \ + faligndata %F2, %F4, %A2 /* FPA Group */; \ + srl %x6, 1, %x6 /* IEU0 */; \ + add %sum, %x5, %sum /* IEU1 */; \ + fpadd32 %F2, %f2, %F2 /* FPA Group */; \ + add %src, 64, %src /* IEU0 */; \ + fcmpgt32 %f0, %F0, %x1 /* FPM */; \ + add %dst, 64, %dst /* IEU1 Group */; \ + inc %x7 /* IEU0 */; \ + STORE3 /* Store (optional) */; \ + faligndata %F4, %F6, %A4 /* FPA */; \ + fpadd32 %F4, %f4, %F4 /* FPA Group */; \ + add %sum, %x6, %sum /* IEU1 */; \ + fcmpgt32 %f2, %F2, %x2 /* FPM */; \ + srl %x7, 1, %x7 /* IEU0 Group */; \ + inc %x8 /* IEU1 */; \ + STORE4 /* Store (optional) */; \ + faligndata %F6, %F8, %A6 /* FPA */; \ + fpadd32 %F6, %f6, %F6 /* FPA Group */; \ + srl %x8, 1, %x8 /* IEU0 */; \ + fcmpgt32 %f4, %F4, %x3 /* FPM */; \ + add %sum, %x7, %sum /* IEU0 Group */; \ + inc %x1 /* IEU1 */; \ + STORE5 /* Store (optional) */; \ + faligndata %F8, %F10, %A8 /* FPA */; \ + fpadd32 %F8, %f8, %F8 /* FPA Group */; \ + srl %x1, 1, %x1 /* IEU0 */; \ + fcmpgt32 %f6, %F6, %x4 /* FPM */; \ + add %sum, %x8, %sum /* IEU0 Group */; \ + inc %x2 /* IEU1 */; \ + STORE6 /* Store (optional) */; \ + faligndata %F10, %F12, %A10 /* FPA */; \ + fpadd32 %F10, %f10, %F10 /* FPA Group */; \ + srl %x2, 1, %x2 /* IEU0 */; \ + fcmpgt32 %f8, %F8, %x5 /* FPM */; \ + add %sum, %x1, %sum /* IEU0 Group */; \ + inc %x3 /* IEU1 */; \ + STORE7 /* Store (optional) */; \ + faligndata %F12, %F14, %A12 /* FPA */; \ + fpadd32 %F12, %f12, %F12 /* FPA Group */; \ + srl %x3, 1, %x3 /* IEU0 */; \ + fcmpgt32 %f10, %F10, %x6 /* FPM */; \ + add %sum, %x2, %sum /* IEU0 Group */; \ + inc %x4 /* IEU1 */; \ + STORE8 /* Store (optional) */; \ + fmovd %F14, %B14 /* FPA */; \ + fpadd32 %F14, %f14, %F14 /* FPA Group */; \ + srl %x4, 1, %x4 /* IEU0 */; \ + fcmpgt32 %f12, %F12, %x7 /* FPM */; \ + add %sum, %x3, %sum /* IEU0 Group */; \ + subcc %len, 64, %len /* IEU1 */; \ + BRANCH /* CTI */; \ + fcmpgt32 %f14, %F14, %x8 /* FPM Group */; #define END_THE_TRICK(f0,f2,f4,f6,f8,f10,f12,f14,FA,FB,S0,S1,S2,S3,T0,T1,U0,fz) \ - inc %x5 /* IEU0 Group */; \ - fpadd32 %f2, %f0, %S0 /* FPA */; \ - srl %x5, 1, %x5 /* IEU0 Group */; \ - add %sum, %x4, %sum /* IEU1 */; \ - fpadd32 %f6, %f4, %S1 /* FPA */; \ - inc %x6 /* IEU0 Group */; \ - add %sum, %x5, %sum /* IEU1 */; \ - fcmpgt32 %f0, %S0, %x1 /* FPM Group */; \ - srl %x6, 1, %x6 /* IEU0 */; \ - inc %x7 /* IEU1 */; \ - fpadd32 %f10, %f8, %S2 /* FPA */; \ - fcmpgt32 %f4, %S1, %x2 /* FPM Group */; \ - srl %x7, 1, %x7 /* IEU0 */; \ - add %sum, %x6, %sum /* IEU1 */; \ - fpadd32 %f14, %f12, %S3 /* FPA */; \ - inc %x8 /* IEU0 Group */; \ - add %sum, %x7, %sum /* IEU1 */; \ - fzero %fz /* FPA */; \ - fcmpgt32 %f8, %S2, %x3 /* FPM Group */; \ - srl %x8, 1, %x8 /* IEU0 */; \ - inc %x1 /* IEU1 */; \ - fpadd32 %S0, %S1, %T0 /* FPA */; \ - fcmpgt32 %f12, %S3, %x4 /* FPM Group */; \ - srl %x1, 1, %x1 /* IEU0 */; \ - add %sum, %x8, %sum /* IEU1 */; \ - fpadd32 %S2, %S3, %T1 /* FPA */; \ - inc %x2 /* IEU0 Group */; \ - add %sum, %x1, %sum /* IEU1 */; \ - fcmpgt32 %S0, %T0, %x5 /* FPM Group */; \ - srl %x2, 1, %x2 /* IEU0 */; \ - inc %x3 /* IEU1 */; \ - fcmpgt32 %S2, %T1, %x6 /* FPM Group */; \ - srl %x3, 1, %x3 /* IEU0 */; \ - add %sum, %x2, %sum /* IEU1 */; \ - inc %x4 /* IEU0 Group */; \ - add %sum, %x3, %sum /* IEU1 */; \ - fcmpgt32 %fz, %f2, %x7 /* FPM Group */; \ - srl %x4, 1, %x4 /* IEU0 */; \ - inc %x5 /* IEU1 */; \ - fpadd32 %T0, %T1, %U0 /* FPA */; \ - fcmpgt32 %fz, %f6, %x8 /* FPM Group */; \ - srl %x5, 1, %x5 /* IEU0 */; \ - add %sum, %x4, %sum /* IEU1 */; \ - inc %x6 /* IEU0 Group */; \ - add %sum, %x5, %sum /* IEU1 */; \ - fcmpgt32 %fz, %f10, %x1 /* FPM Group */; \ - srl %x6, 1, %x6 /* IEU0 */; \ - inc %x7 /* IEU1 */; \ - fcmpgt32 %fz, %f14, %x2 /* FPM Group */; \ - ba,pt %xcc, ett /* CTI */; \ - fmovd %FA, %FB /* FPA */; \ + inc %x5 /* IEU0 Group */; \ + fpadd32 %f2, %f0, %S0 /* FPA */; \ + add %sum, %x4, %sum /* IEU1 */; \ + srl %x5, 1, %x5 /* IEU0 Group */; \ + fpadd32 %f6, %f4, %S1 /* FPA */; \ + inc %x6 /* IEU1 */; \ + fpadd32 %f10, %f8, %S2 /* FPA Group */; \ + add %sum, %x5, %sum /* IEU0 */; \ + fcmpgt32 %f0, %S0, %x1 /* FPM */; \ + fpadd32 %f14, %f12, %S3 /* FPA Group */; \ + srl %x6, 1, %x6 /* IEU0 */; \ + fcmpgt32 %f4, %S1, %x2 /* FPM */; \ + add %sum, %x6, %sum /* IEU0 Group */; \ + fzero %fz /* FPA */; \ + fcmpgt32 %f8, %S2, %x3 /* FPM */; \ + inc %x7 /* IEU0 Group */; \ + inc %x8 /* IEU1 */; \ + srl %x7, 1, %x7 /* IEU0 Group */; \ + inc %x1 /* IEU1 */; \ + fpadd32 %S0, %S1, %T0 /* FPA */; \ + fpadd32 %S2, %S3, %T1 /* FPA Group */; \ + add %sum, %x7, %sum /* IEU0 */; \ + fcmpgt32 %f12, %S3, %x4 /* FPM */; \ + srl %x8, 1, %x8 /* IEU0 Group */; \ + inc %x2 /* IEU1 */; \ + srl %x1, 1, %x1 /* IEU0 Group */; \ + add %sum, %x8, %sum /* IEU1 */; \ + add %sum, %x1, %sum /* IEU0 Group */; \ + fcmpgt32 %S0, %T0, %x5 /* FPM */; \ + srl %x2, 1, %x2 /* IEU0 Group */; \ + fcmpgt32 %S2, %T1, %x6 /* FPM */; \ + inc %x3 /* IEU0 Group */; \ + add %sum, %x2, %sum /* IEU1 */; \ + srl %x3, 1, %x3 /* IEU0 Group */; \ + inc %x4 /* IEU1 */; \ + fpadd32 %T0, %T1, %U0 /* FPA Group */; \ + add %sum, %x3, %sum /* IEU0 */; \ + fcmpgt32 %fz, %f2, %x7 /* FPM */; \ + srl %x4, 1, %x4 /* IEU0 Group */; \ + fcmpgt32 %fz, %f6, %x8 /* FPM */; \ + inc %x5 /* IEU0 Group */; \ + add %sum, %x4, %sum /* IEU1 */; \ + srl %x5, 1, %x5 /* IEU0 Group */; \ + fcmpgt32 %fz, %f10, %x1 /* FPM */; \ + inc %x6 /* IEU0 Group */; \ + add %sum, %x5, %sum /* IEU1 */; \ + fmovd %FA, %FB /* FPA Group */; \ + fcmpgt32 %fz, %f14, %x2 /* FPM */; \ + srl %x6, 1, %x6 /* IEU0 Group */; \ + ba,pt %xcc, ett /* CTI */; \ + inc %x7 /* IEU1 */; -#define END_THE_TRICK1(f0,f2,f4,f6,f8,f10,f12,f14,FA,FB) \ +#define END_THE_TRICK1(f0,f2,f4,f6,f8,f10,f12,f14,FA,FB) \ END_THE_TRICK(f0,f2,f4,f6,f8,f10,f12,f14,FA,FB,f48,f50,f52,f54,f56,f58,f60,f62) -#define END_THE_TRICK2(S0,S1,S2,S3,T0,T1,U0,U1,V0,fz) \ - fpadd32 %U0, %U1, %V0 /* FPA Group */; \ - srl %x7, 1, %x7 /* IEU0 */; \ - add %sum, %x6, %sum /* IEU1 */; \ - std %V0, [%sp + STACKOFF] /* Store Group */; \ - inc %x8 /* IEU0 */; \ - sub %sum, %x7, %sum /* IEU1 */; \ - fcmpgt32 %fz, %S1, %x3 /* FPM Group */; \ - srl %x8, 1, %x8 /* IEU0 */; \ - inc %x1 /* IEU1 */; \ - fcmpgt32 %fz, %S3, %x4 /* FPM Group */; \ - srl %x1, 1, %x1 /* IEU0 */; \ - sub %sum, %x8, %sum /* IEU1 */; \ - ldx [%sp + STACKOFF], %x8 /* Load Group */; \ - inc %x2 /* IEU0 */; \ - sub %sum, %x1, %sum /* IEU1 */; \ - fcmpgt32 %fz, %T1, %x5 /* FPM Group */; \ - srl %x2, 1, %x2 /* IEU0 */; \ - inc %x3 /* IEU1 */; \ - fcmpgt32 %T0, %U0, %x6 /* FPM Group */; \ - srl %x3, 1, %x3 /* IEU0 */; \ - sub %sum, %x2, %sum /* IEU1 */; \ - inc %x4 /* IEU0 Group */; \ - sub %sum, %x3, %sum /* IEU1 */; \ - fcmpgt32 %fz, %U1, %x7 /* FPM Group */; \ - srl %x4, 1, %x4 /* IEU0 */; \ - inc %x5 /* IEU1 */; \ - fcmpgt32 %U0, %V0, %x1 /* FPM Group */; \ - srl %x5, 1, %x5 /* IEU0 */; \ - sub %sum, %x4, %sum /* IEU1 */; \ - fcmpgt32 %fz, %V0, %x2 /* FPM Group */; \ - inc %x6 /* IEU0 */; \ - sub %sum, %x5, %sum /* IEU1 */; \ - srl %x6, 1, %x6 /* IEU0 Group */; \ - inc %x7 /* IEU1 */; \ - srl %x7, 1, %x7 /* IEU0 Group */; \ - add %sum, %x6, %sum /* IEU1 */; \ - inc %x1 /* IEU0 Group */; \ - sub %sum, %x7, %sum /* IEU1 */; \ - srl %x1, 1, %x1 /* IEU0 Group */; \ - inc %x2 /* IEU1 */; \ - srl %x2, 1, %x2 /* IEU0 Group */; \ - add %sum, %x1, %sum /* IEU1 */; \ - sub %sum, %x2, %sum /* IEU0 Group */; \ - addcc %sum, %x8, %sum /* IEU Group */; \ - bcs,a,pn %xcc, 33f /* CTI */; \ - add %sum, 1, %sum /* IEU0 */; \ -33: /* That's it */; +#define END_THE_TRICK2(S0,S1,S2,S3,T0,T1,U0,U1,V0,fz) \ + fpadd32 %U0, %U1, %V0 /* FPA Group */; \ + srl %x7, 1, %x7 /* IEU0 */; \ + add %sum, %x6, %sum /* IEU1 */; \ + std %V0, [%sp + STACKOFF] /* Store Group */; \ + inc %x8 /* IEU0 */; \ + sub %sum, %x7, %sum /* IEU1 */; \ + srl %x8, 1, %x8 /* IEU0 Group */; \ + fcmpgt32 %fz, %S1, %x3 /* FPM */; \ + inc %x1 /* IEU0 Group */; \ + fcmpgt32 %fz, %S3, %x4 /* FPM */; \ + srl %x1, 1, %x1 /* IEU0 Group */; \ + sub %sum, %x8, %sum /* IEU1 */; \ + ldx [%sp + STACKOFF], %x8 /* Load Group */; \ + inc %x2 /* IEU0 */; \ + sub %sum, %x1, %sum /* IEU1 */; \ + srl %x2, 1, %x2 /* IEU0 Group */; \ + fcmpgt32 %fz, %T1, %x5 /* FPM */; \ + inc %x3 /* IEU0 Group */; \ + fcmpgt32 %T0, %U0, %x6 /* FPM */; \ + srl %x3, 1, %x3 /* IEU0 Group */; \ + sub %sum, %x2, %sum /* IEU1 */; \ + inc %x4 /* IEU0 Group */; \ + sub %sum, %x3, %sum /* IEU1 */; \ + srl %x4, 1, %x4 /* IEU0 Group */; \ + fcmpgt32 %fz, %U1, %x7 /* FPM */; \ + inc %x5 /* IEU0 Group */; \ + fcmpgt32 %U0, %V0, %x1 /* FPM */; \ + srl %x5, 1, %x5 /* IEU0 Group */; \ + sub %sum, %x4, %sum /* IEU1 */; \ + sub %sum, %x5, %sum /* IEU0 Group */; \ + fcmpgt32 %fz, %V0, %x2 /* FPM */; \ + inc %x6 /* IEU0 Group */; \ + inc %x7 /* IEU1 */; \ + srl %x6, 1, %x6 /* IEU0 Group */; \ + inc %x1 /* IEU1 */; \ + srl %x7, 1, %x7 /* IEU0 Group */; \ + add %sum, %x6, %sum /* IEU1 */; \ + srl %x1, 1, %x1 /* IEU0 Group */; \ + sub %sum, %x7, %sum /* IEU1 */; \ + inc %x2 /* IEU0 Group */; \ + add %sum, %x1, %sum /* IEU1 */; \ + srl %x2, 1, %x2 /* IEU0 Group */; \ + sub %sum, %x2, %sum /* IEU0 Group */; \ + addcc %sum, %x8, %sum /* IEU1 Group */; \ + bcs,a,pn %xcc, 33f /* CTI */; \ + add %sum, 1, %sum /* IEU0 (Group) */; \ +33: /* That's it */; .text .globl csum_partial_copy_vis .align 32 -/* %asi should be either ASI_P or ASI_AIUS for csum_partial_copy resp. csum_partial_copy_from_user */ -/* This assumes that !((%src^%dst)&3) && !((%src|%dst)&1) && %len >= 256 */ +/* %asi should be either ASI_P or ASI_AIUS for csum_partial_copy resp. + * csum_partial_copy_from_user + * This assumes that !((%src^%dst)&3) && !((%src|%dst)&1) && %len >= 256 + */ csum_partial_copy_vis: - andcc %dst, 7, %g0 /* IEU1 Group */ - be,pt %icc, 4f /* CTI */ - and %dst, 0x38, %o4 /* IEU0 */ - mov 1, %g5 /* IEU0 Group */ - andcc %dst, 2, %g0 /* IEU1 */ - be,pt %icc, 1f /* CTI */ - and %dst, 4, %g7 /* IEU0 Group */ - lduha [%src] %asi, %g2 /* Load */ - sub %len, 2, %len /* IEU0 Group */ - add %dst, 2, %dst /* IEU1 */ - andcc %dst, 4, %g7 /* IEU1 Group */ - sll %g5, 16, %g5 /* IEU0 */ - sth %g2, [%dst - 2] /* Store Group */ - sll %g2, 16, %g2 /* IEU0 */ - add %src, 2, %src /* IEU1 */ - addcc %g2, %sum, %sum /* IEU1 Group */ - bcs,a,pn %icc, 1f /* CTI */ - add %sum, %g5, %sum /* IEU0 */ -1: lduwa [%src] %asi, %g2 /* Load */ - brz,a,pn %g7, 4f /* CTI+IEU1 Group */ - and %dst, 0x38, %o4 /* IEU0 */ - add %dst, 4, %dst /* IEU0 Group */ - sub %len, 4, %len /* IEU1 */ - addcc %g2, %sum, %sum /* IEU1 Group */ - bcs,a,pn %icc, 1f /* CTI */ - add %sum, 1, %sum /* IEU0 */ -1: and %dst, 0x38, %o4 /* IEU0 Group */ - stw %g2, [%dst - 4] /* Store */ - add %src, 4, %src /* IEU1 */ + andcc %dst, 7, %g0 /* IEU1 Group */ + be,pt %icc, 4f /* CTI */ + and %dst, 0x38, %o4 /* IEU0 */ + mov 1, %g5 /* IEU0 Group */ + andcc %dst, 2, %g0 /* IEU1 */ + be,pt %icc, 1f /* CTI */ + and %dst, 4, %g7 /* IEU0 Group */ + lduha [%src] %asi, %g2 /* Load */ + sub %len, 2, %len /* IEU0 Group */ + add %dst, 2, %dst /* IEU1 */ + andcc %dst, 4, %g7 /* IEU1 Group */ + sll %g5, 16, %g5 /* IEU0 */ + sth %g2, [%dst - 2] /* Store Group */ + sll %g2, 16, %g2 /* IEU0 */ + add %src, 2, %src /* IEU1 */ + addcc %g2, %sum, %sum /* IEU1 Group */ + bcs,a,pn %icc, 1f /* CTI */ + add %sum, %g5, %sum /* IEU0 */ +1: lduwa [%src] %asi, %g2 /* Load */ + brz,a,pn %g7, 4f /* CTI+IEU1 Group */ + and %dst, 0x38, %o4 /* IEU0 */ + add %dst, 4, %dst /* IEU0 Group */ + sub %len, 4, %len /* IEU1 */ + addcc %g2, %sum, %sum /* IEU1 Group */ + bcs,a,pn %icc, 1f /* CTI */ + add %sum, 1, %sum /* IEU0 */ +1: and %dst, 0x38, %o4 /* IEU0 Group */ + stw %g2, [%dst - 4] /* Store */ + add %src, 4, %src /* IEU1 */ 4: #ifdef __KERNEL__ VISEntry #endif - mov %src, %g7 /* IEU1 Group */ - fzero %f48 /* FPA */ - alignaddr %src, %g0, %src /* Single Group */ - subcc %g7, %src, %g7 /* IEU1 Group */ - be,pt %xcc, 1f /* CTI */ - mov 0x40, %g1 /* IEU0 */ - lduwa [%src] %asi, %g2 /* Load Group */ - subcc %sum, %g2, %sum /* IEU1 Group+load stall */ - bcs,a,pn %icc, 1f /* CTI */ - sub %sum, 1, %sum /* IEU0 */ -1: srl %sum, 0, %sum /* IEU0 Group */ - clr %g5 /* IEU1 */ - brz,pn %o4, 3f /* CTI+IEU1 Group */ - sub %g1, %o4, %g1 /* IEU0 */ - ldda [%src] %asi, %f0 /* Load */ - clr %o4 /* IEU0 Group */ - andcc %dst, 8, %g0 /* IEU1 */ - be,pn %icc, 1f /* CTI */ - ldda [%src + 8] %asi, %f2 /* Load Group */ - add %src, 8, %src /* IEU0 */ - sub %len, 8, %len /* IEU1 */ - fpadd32 %f0, %f48, %f50 /* FPA */ - addcc %dst, 8, %dst /* IEU1 Group */ - faligndata %f0, %f2, %f16 /* FPA */ - fcmpgt32 %f48, %f50, %o4 /* FPM Group */ - fmovd %f2, %f0 /* FPA Group */ - ldda [%src + 8] %asi, %f2 /* Load */ - std %f16, [%dst - 8] /* Store */ - fmovd %f50, %f48 /* FPA */ -1: andcc %g1, 0x10, %g0 /* IEU1 Group */ - be,pn %icc, 1f /* CTI */ - and %g1, 0x20, %g1 /* IEU0 */ - fpadd32 %f0, %f48, %f50 /* FPA */ - ldda [%src + 16] %asi, %f4 /* Load Group */ - add %src, 16, %src /* IEU0 */ - add %dst, 16, %dst /* IEU1 */ - faligndata %f0, %f2, %f16 /* FPA */ - fcmpgt32 %f48, %f50, %g5 /* FPM Group */ - sub %len, 16, %len /* IEU0 */ - inc %o4 /* IEU1 */ - std %f16, [%dst - 16] /* Store Group */ - fpadd32 %f2, %f50, %f48 /* FPA */ - srl %o4, 1, %o5 /* IEU0 */ - faligndata %f2, %f4, %f18 /* FPA Group */ - std %f18, [%dst - 8] /* Store */ - fcmpgt32 %f50, %f48, %o4 /* FPM Group */ - add %o5, %sum, %sum /* IEU0 */ - ldda [%src + 8] %asi, %f2 /* Load */ - fmovd %f4, %f0 /* FPA */ -1: brz,a,pn %g1, 4f /* CTI+IEU1 Group */ - rd %asi, %g2 /* LSU Group + 4 bubbles */ - inc %g5 /* IEU0 */ - fpadd32 %f0, %f48, %f50 /* FPA */ - ldda [%src + 16] %asi, %f4 /* Load Group */ - srl %g5, 1, %g5 /* IEU0 */ - add %dst, 32, %dst /* IEU1 */ - faligndata %f0, %f2, %f16 /* FPA */ - fcmpgt32 %f48, %f50, %o5 /* FPM Group */ - inc %o4 /* IEU0 */ - ldda [%src + 24] %asi, %f6 /* Load */ - srl %o4, 1, %o4 /* IEU0 Group */ - add %g5, %sum, %sum /* IEU1 */ - ldda [%src + 32] %asi, %f8 /* Load */ - fpadd32 %f2, %f50, %f48 /* FPA */ - faligndata %f2, %f4, %f18 /* FPA Group */ - sub %len, 32, %len /* IEU0 */ - std %f16, [%dst - 32] /* Store */ - fcmpgt32 %f50, %f48, %g3 /* FPM Group */ - inc %o5 /* IEU0 */ - add %o4, %sum, %sum /* IEU1 */ - fpadd32 %f4, %f48, %f50 /* FPA */ - faligndata %f4, %f6, %f20 /* FPA Group */ - srl %o5, 1, %o5 /* IEU0 */ - fcmpgt32 %f48, %f50, %g5 /* FPM Group */ - add %o5, %sum, %sum /* IEU0 */ - std %f18, [%dst - 24] /* Store */ - fpadd32 %f6, %f50, %f48 /* FPA */ - inc %g3 /* IEU0 Group */ - std %f20, [%dst - 16] /* Store */ - add %src, 32, %src /* IEU1 */ - faligndata %f6, %f8, %f22 /* FPA */ - fcmpgt32 %f50, %f48, %o4 /* FPM Group */ - srl %g3, 1, %g3 /* IEU0 */ - std %f22, [%dst - 8] /* Store */ - add %g3, %sum, %sum /* IEU0 Group */ -3: rd %asi, %g2 /* LSU Group + 4 bubbles */ + mov %src, %g7 /* IEU1 Group */ + fzero %f48 /* FPA */ + alignaddr %src, %g0, %src /* Single Group */ + subcc %g7, %src, %g7 /* IEU1 Group */ + be,pt %xcc, 1f /* CTI */ + mov 0x40, %g1 /* IEU0 */ + lduwa [%src] %asi, %g2 /* Load Group */ + subcc %sum, %g2, %sum /* IEU1 Group+load stall*/ + bcs,a,pn %icc, 1f /* CTI */ + sub %sum, 1, %sum /* IEU0 */ +1: srl %sum, 0, %sum /* IEU0 Group */ + clr %g5 /* IEU1 */ + brz,pn %o4, 3f /* CTI+IEU1 Group */ + sub %g1, %o4, %g1 /* IEU0 */ + ldda [%src] %asi, %f0 /* Load */ + clr %o4 /* IEU0 Group */ + andcc %dst, 8, %g0 /* IEU1 */ + be,pn %icc, 1f /* CTI */ + ldda [%src + 8] %asi, %f2 /* Load Group */ + add %src, 8, %src /* IEU0 */ + sub %len, 8, %len /* IEU1 */ + fpadd32 %f0, %f48, %f50 /* FPA */ + addcc %dst, 8, %dst /* IEU1 Group */ + faligndata %f0, %f2, %f16 /* FPA */ + fcmpgt32 %f48, %f50, %o4 /* FPM Group */ + fmovd %f2, %f0 /* FPA Group */ + ldda [%src + 8] %asi, %f2 /* Load */ + std %f16, [%dst - 8] /* Store */ + fmovd %f50, %f48 /* FPA */ +1: andcc %g1, 0x10, %g0 /* IEU1 Group */ + be,pn %icc, 1f /* CTI */ + and %g1, 0x20, %g1 /* IEU0 */ + fpadd32 %f0, %f48, %f50 /* FPA */ + ldda [%src + 16] %asi, %f4 /* Load Group */ + add %src, 16, %src /* IEU0 */ + add %dst, 16, %dst /* IEU1 */ + faligndata %f0, %f2, %f16 /* FPA */ + fcmpgt32 %f48, %f50, %g5 /* FPM Group */ + sub %len, 16, %len /* IEU0 */ + inc %o4 /* IEU1 */ + std %f16, [%dst - 16] /* Store Group */ + fpadd32 %f2, %f50, %f48 /* FPA */ + srl %o4, 1, %o5 /* IEU0 */ + faligndata %f2, %f4, %f18 /* FPA Group */ + std %f18, [%dst - 8] /* Store */ + fcmpgt32 %f50, %f48, %o4 /* FPM Group */ + add %o5, %sum, %sum /* IEU0 */ + ldda [%src + 8] %asi, %f2 /* Load */ + fmovd %f4, %f0 /* FPA */ +1: brz,a,pn %g1, 4f /* CTI+IEU1 Group */ + rd %asi, %g2 /* LSU Group + 4 bubbles*/ + inc %g5 /* IEU0 */ + fpadd32 %f0, %f48, %f50 /* FPA */ + ldda [%src + 16] %asi, %f4 /* Load Group */ + srl %g5, 1, %g5 /* IEU0 */ + add %dst, 32, %dst /* IEU1 */ + faligndata %f0, %f2, %f16 /* FPA */ + fcmpgt32 %f48, %f50, %o5 /* FPM Group */ + inc %o4 /* IEU0 */ + ldda [%src + 24] %asi, %f6 /* Load */ + srl %o4, 1, %o4 /* IEU0 Group */ + add %g5, %sum, %sum /* IEU1 */ + ldda [%src + 32] %asi, %f8 /* Load */ + fpadd32 %f2, %f50, %f48 /* FPA */ + faligndata %f2, %f4, %f18 /* FPA Group */ + sub %len, 32, %len /* IEU0 */ + std %f16, [%dst - 32] /* Store */ + fcmpgt32 %f50, %f48, %g3 /* FPM Group */ + inc %o5 /* IEU0 */ + add %o4, %sum, %sum /* IEU1 */ + fpadd32 %f4, %f48, %f50 /* FPA */ + faligndata %f4, %f6, %f20 /* FPA Group */ + srl %o5, 1, %o5 /* IEU0 */ + fcmpgt32 %f48, %f50, %g5 /* FPM Group */ + add %o5, %sum, %sum /* IEU0 */ + std %f18, [%dst - 24] /* Store */ + fpadd32 %f6, %f50, %f48 /* FPA */ + inc %g3 /* IEU0 Group */ + std %f20, [%dst - 16] /* Store */ + add %src, 32, %src /* IEU1 */ + faligndata %f6, %f8, %f22 /* FPA */ + fcmpgt32 %f50, %f48, %o4 /* FPM Group */ + srl %g3, 1, %g3 /* IEU0 */ + std %f22, [%dst - 8] /* Store */ + add %g3, %sum, %sum /* IEU0 Group */ +3: rd %asi, %g2 /* LSU Group + 4 bubbles*/ #ifdef __KERNEL__ -4: sethi %hi(vis0s), %g7 /* IEU0 Group */ - or %g2, ASI_BLK_OR, %g2 /* IEU1 */ +4: sethi %hi(vis0s), %g7 /* IEU0 Group */ + or %g2, ASI_BLK_OR, %g2 /* IEU1 */ #else -4: rd %pc, %g7 /* LSU Group + 4 bubbles */ +4: rd %pc, %g7 /* LSU Group + 4 bubbles*/ #endif - inc %g5 /* IEU0 Group */ - and %src, 0x38, %g3 /* IEU1 */ - membar #StoreLoad /* LSU Group */ - srl %g5, 1, %g5 /* IEU0 */ - inc %o4 /* IEU1 */ - sll %g3, 8, %g3 /* IEU0 Group */ - sub %len, 0xc0, %len /* IEU1 */ - addcc %g5, %sum, %sum /* IEU1 Group */ - srl %o4, 1, %o4 /* IEU0 */ - add %g7, %g3, %g7 /* IEU0 Group */ - add %o4, %sum, %sum /* IEU1 */ + inc %g5 /* IEU0 Group */ + and %src, 0x38, %g3 /* IEU1 */ + membar #StoreLoad /* LSU Group */ + srl %g5, 1, %g5 /* IEU0 */ + inc %o4 /* IEU1 */ + sll %g3, 8, %g3 /* IEU0 Group */ + sub %len, 0xc0, %len /* IEU1 */ + addcc %g5, %sum, %sum /* IEU1 Group */ + srl %o4, 1, %o4 /* IEU0 */ + add %g7, %g3, %g7 /* IEU0 Group */ + add %o4, %sum, %sum /* IEU1 */ #ifdef __KERNEL__ - jmpl %g7 + %lo(vis0s), %g0 /* CTI+IEU1 Group */ + jmpl %g7 + %lo(vis0s), %g0 /* CTI+IEU1 Group */ #else - jmpl %g7 + (vis0s - 4b), %g0 /* CTI+IEU1 Group */ + jmpl %g7 + (vis0s - 4b), %g0 /* CTI+IEU1 Group */ #endif - fzero %f32 /* FPA */ + fzero %f32 /* FPA */ .align 2048 -vis0s: wr %g2, ASI_BLK_XOR, %asi /* LSU Group */ - add %src, 128, %src /* IEU0 Group */ - ldda [%src-128] %asi, %f0 /* Load Group */ - ldda [%src-64] %asi, %f16 /* Load Group */ - fmovd %f48, %f62 /* FPA Group f0 available */ - faligndata %f0, %f2, %f48 /* FPA Group f2 available */ - fcmpgt32 %f32, %f2, %x1 /* FPM Group f4 available */ - fpadd32 %f0, %f62, %f0 /* FPA */ - fcmpgt32 %f32, %f4, %x2 /* FPM Group f6 available */ - faligndata %f2, %f4, %f50 /* FPA */ - fcmpgt32 %f62, %f0, %x3 /* FPM Group f8 available */ - faligndata %f4, %f6, %f52 /* FPA */ - fcmpgt32 %f32, %f6, %x4 /* FPM Group f10 available */ - inc %x1 /* IEU0 */ - faligndata %f6, %f8, %f54 /* FPA */ - fcmpgt32 %f32, %f8, %x5 /* FPM Group f12 available */ - srl %x1, 1, %x1 /* IEU0 */ - inc %x2 /* IEU1 */ - faligndata %f8, %f10, %f56 /* FPA */ - fcmpgt32 %f32, %f10, %x6 /* FPM Group f14 available */ - srl %x2, 1, %x2 /* IEU0 */ - add %sum, %x1, %sum /* IEU1 */ - faligndata %f10, %f12, %f58 /* FPA */ - fcmpgt32 %f32, %f12, %x7 /* FPM Group */ - inc %x3 /* IEU0 */ - add %sum, %x2, %sum /* IEU1 */ - faligndata %f12, %f14, %f60 /* FPA */ - fcmpgt32 %f32, %f14, %x8 /* FPM Group */ - srl %x3, 1, %x3 /* IEU0 */ - inc %x4 /* IEU1 */ - fmovd %f14, %f62 /* FPA */ - srl %x4, 1, %x4 /* IEU0 Group */ - add %sum, %x3, %sum /* IEU1 */ +vis0s: wr %g2, ASI_BLK_XOR, %asi /* LSU Group */ + add %src, 128, %src /* IEU0 Group */ + ldda [%src-128] %asi, %f0 /* Load Group */ + ldda [%src-64] %asi, %f16 /* Load Group */ + fmovd %f48, %f62 /* FPA Group f0 available*/ + faligndata %f0, %f2, %f48 /* FPA Group f2 available*/ + fcmpgt32 %f32, %f2, %x1 /* FPM Group f4 available*/ + fpadd32 %f0, %f62, %f0 /* FPA */ + fcmpgt32 %f32, %f4, %x2 /* FPM Group f6 available*/ + faligndata %f2, %f4, %f50 /* FPA */ + fcmpgt32 %f62, %f0, %x3 /* FPM Group f8 available*/ + faligndata %f4, %f6, %f52 /* FPA */ + fcmpgt32 %f32, %f6, %x4 /* FPM Group f10 available*/ + inc %x1 /* IEU0 */ + faligndata %f6, %f8, %f54 /* FPA */ + fcmpgt32 %f32, %f8, %x5 /* FPM Group f12 available*/ + srl %x1, 1, %x1 /* IEU0 */ + inc %x2 /* IEU1 */ + faligndata %f8, %f10, %f56 /* FPA */ + fcmpgt32 %f32, %f10, %x6 /* FPM Group f14 available*/ + srl %x2, 1, %x2 /* IEU0 */ + add %sum, %x1, %sum /* IEU1 */ + faligndata %f10, %f12, %f58 /* FPA */ + fcmpgt32 %f32, %f12, %x7 /* FPM Group */ + inc %x3 /* IEU0 */ + add %sum, %x2, %sum /* IEU1 */ + faligndata %f12, %f14, %f60 /* FPA */ + fcmpgt32 %f32, %f14, %x8 /* FPM Group */ + srl %x3, 1, %x3 /* IEU0 */ + inc %x4 /* IEU1 */ + fmovd %f14, %f62 /* FPA */ + srl %x4, 1, %x4 /* IEU0 Group */ + add %sum, %x3, %sum /* IEU1 */ vis0: DO_THE_TRICK( f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30, - ,f48,f50,f52,f54,f56,f58,f60,f62,f62, - ,LDBLK(f32), STBLK,,,,,,,, + ,f48,f50,f52,f54,f56,f58,f60,f62,f62, + ,LDBLK(f32), STBLK,,,,,,,, ,bcs,pn %icc, vis0e1) DO_THE_TRICK( f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46, - ,f48,f50,f52,f54,f56,f58,f60,f62,f62, - ,LDBLK(f0), STBLK,,,,,,,, + ,f48,f50,f52,f54,f56,f58,f60,f62,f62, + ,LDBLK(f0), STBLK,,,,,,,, ,bcs,pn %icc, vis0e2) - DO_THE_TRICK( f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14, + DO_THE_TRICK( f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14, ,f48,f50,f52,f54,f56,f58,f60,f62,f62, ,LDBLK(f16), STBLK,,,,,,,, ,bcc,pt %icc, vis0) -vis0e3: DO_THE_TRICK( f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30, +vis0e3: DO_THE_TRICK( f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30, ,f48,f50,f52,f54,f56,f58,f60,f62,f32, ,SYNC, STBLK,ST(f48,64),ST(f50,8),ST(f52,16),ST(f54,24),ST(f56,32),ST(f58,40),ST(f60,48), ,add %dst, 56, %dst; add %len, 192 - 8*8, %len; ba,pt %icc, e2) @@ -447,39 +449,39 @@ vis0e1: DO_THE_TRICK( f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f4 ,f48,f50,f52,f54,f56,f58,f60,f62,f0, ,SYNC, STBLK,ST(f48,64),ST(f50,8),ST(f52,16),ST(f54,24),ST(f56,32),ST(f58,40),ST(f60,48), ,add %dst, 56, %dst; add %len, 192 - 8*8, %len; ba,pt %icc, e3) -vis0e2: DO_THE_TRICK( f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14, +vis0e2: DO_THE_TRICK( f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14, ,f48,f50,f52,f54,f56,f58,f60,f62,f16, ,SYNC, STBLK,ST(f48,64),ST(f50,8),ST(f52,16),ST(f54,24),ST(f56,32),ST(f58,40),ST(f60,48), ,add %dst, 56, %dst; add %len, 192 - 8*8, %len; ba,pt %icc, e1) .align 2048 -vis1s: wr %g2, ASI_BLK_XOR, %asi /* LSU Group */ - add %src, 128 - 8, %src /* IEU0 Group */ - ldda [%src-128] %asi, %f0 /* Load Group */ - ldda [%src-64] %asi, %f16 /* Load Group */ - fmovd %f0, %f58 /* FPA Group */ - fmovd %f48, %f0 /* FPA Group */ - fcmpgt32 %f32, %f2, %x2 /* FPM Group */ - faligndata %f2, %f4, %f48 /* FPA */ - fcmpgt32 %f32, %f4, %x3 /* FPM Group */ - faligndata %f4, %f6, %f50 /* FPA */ - fcmpgt32 %f32, %f6, %x4 /* FPM Group */ - faligndata %f6, %f8, %f52 /* FPA */ - fcmpgt32 %f32, %f8, %x5 /* FPM Group */ - inc %x2 /* IEU1 */ - faligndata %f8, %f10, %f54 /* FPA */ - fcmpgt32 %f32, %f10, %x6 /* FPM Group */ - srl %x2, 1, %x2 /* IEU0 */ - faligndata %f10, %f12, %f56 /* FPA */ - fcmpgt32 %f32, %f12, %x7 /* FPM Group */ - inc %x3 /* IEU0 */ - add %sum, %x2, %sum /* IEU1 */ - faligndata %f12, %f14, %f58 /* FPA */ - fcmpgt32 %f32, %f14, %x8 /* FPM Group */ - srl %x3, 1, %x3 /* IEU0 */ - inc %x4 /* IEU1 */ - fmovd %f14, %f60 /* FPA */ - srl %x4, 1, %x4 /* IEU0 Group */ - add %sum, %x3, %sum /* IEU1 */ +vis1s: wr %g2, ASI_BLK_XOR, %asi /* LSU Group */ + add %src, 128 - 8, %src /* IEU0 Group */ + ldda [%src-128] %asi, %f0 /* Load Group */ + ldda [%src-64] %asi, %f16 /* Load Group */ + fmovd %f0, %f58 /* FPA Group */ + fmovd %f48, %f0 /* FPA Group */ + fcmpgt32 %f32, %f2, %x2 /* FPM Group */ + faligndata %f2, %f4, %f48 /* FPA */ + fcmpgt32 %f32, %f4, %x3 /* FPM Group */ + faligndata %f4, %f6, %f50 /* FPA */ + fcmpgt32 %f32, %f6, %x4 /* FPM Group */ + faligndata %f6, %f8, %f52 /* FPA */ + fcmpgt32 %f32, %f8, %x5 /* FPM Group */ + inc %x2 /* IEU1 */ + faligndata %f8, %f10, %f54 /* FPA */ + fcmpgt32 %f32, %f10, %x6 /* FPM Group */ + srl %x2, 1, %x2 /* IEU0 */ + faligndata %f10, %f12, %f56 /* FPA */ + fcmpgt32 %f32, %f12, %x7 /* FPM Group */ + inc %x3 /* IEU0 */ + add %sum, %x2, %sum /* IEU1 */ + faligndata %f12, %f14, %f58 /* FPA */ + fcmpgt32 %f32, %f14, %x8 /* FPM Group */ + srl %x3, 1, %x3 /* IEU0 */ + inc %x4 /* IEU1 */ + fmovd %f14, %f60 /* FPA */ + srl %x4, 1, %x4 /* IEU0 Group */ + add %sum, %x3, %sum /* IEU1 */ vis1: DO_THE_TRICK( f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30, ,f62,f48,f50,f52,f54,f56,f58,f60,f60, ,LDBLK(f32), ,STBLK,,,,,,, @@ -505,31 +507,31 @@ vis1e2: DO_THE_TRICK( f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14 ,SYNC, ,STBLK,ST(f48,0),ST(f50,8),ST(f52,16),ST(f54,24),ST(f56,32),ST(f58,40), ,add %dst, 48, %dst; add %len, 192 - 7*8, %len; ba,pt %icc, e1) .align 2048 -vis2s: wr %g2, ASI_BLK_XOR, %asi /* LSU Group */ - add %src, 128 - 16, %src /* IEU0 Group */ - ldda [%src-128] %asi, %f0 /* Load Group */ - ldda [%src-64] %asi, %f16 /* Load Group */ - fmovd %f0, %f56 /* FPA Group */ - fmovd %f48, %f0 /* FPA Group */ - sub %dst, 64, %dst /* IEU0 */ - fpsub32 %f2, %f2, %f2 /* FPA Group */ - fcmpgt32 %f32, %f4, %x3 /* FPM Group */ - faligndata %f4, %f6, %f48 /* FPA */ - fcmpgt32 %f32, %f6, %x4 /* FPM Group */ - faligndata %f6, %f8, %f50 /* FPA */ - fcmpgt32 %f32, %f8, %x5 /* FPM Group */ - faligndata %f8, %f10, %f52 /* FPA */ - fcmpgt32 %f32, %f10, %x6 /* FPM Group */ - faligndata %f10, %f12, %f54 /* FPA */ - fcmpgt32 %f32, %f12, %x7 /* FPM Group */ - inc %x3 /* IEU0 */ - faligndata %f12, %f14, %f56 /* FPA */ - fcmpgt32 %f32, %f14, %x8 /* FPM Group */ - srl %x3, 1, %x3 /* IEU0 */ - inc %x4 /* IEU1 */ - fmovd %f14, %f58 /* FPA */ - srl %x4, 1, %x4 /* IEU0 Group */ - add %sum, %x3, %sum /* IEU1 */ +vis2s: wr %g2, ASI_BLK_XOR, %asi /* LSU Group */ + add %src, 128 - 16, %src /* IEU0 Group */ + ldda [%src-128] %asi, %f0 /* Load Group */ + ldda [%src-64] %asi, %f16 /* Load Group */ + fmovd %f0, %f56 /* FPA Group */ + fmovd %f48, %f0 /* FPA Group */ + sub %dst, 64, %dst /* IEU0 */ + fpsub32 %f2, %f2, %f2 /* FPA Group */ + fcmpgt32 %f32, %f4, %x3 /* FPM Group */ + faligndata %f4, %f6, %f48 /* FPA */ + fcmpgt32 %f32, %f6, %x4 /* FPM Group */ + faligndata %f6, %f8, %f50 /* FPA */ + fcmpgt32 %f32, %f8, %x5 /* FPM Group */ + faligndata %f8, %f10, %f52 /* FPA */ + fcmpgt32 %f32, %f10, %x6 /* FPM Group */ + faligndata %f10, %f12, %f54 /* FPA */ + fcmpgt32 %f32, %f12, %x7 /* FPM Group */ + inc %x3 /* IEU0 */ + faligndata %f12, %f14, %f56 /* FPA */ + fcmpgt32 %f32, %f14, %x8 /* FPM Group */ + srl %x3, 1, %x3 /* IEU0 */ + inc %x4 /* IEU1 */ + fmovd %f14, %f58 /* FPA */ + srl %x4, 1, %x4 /* IEU0 Group */ + add %sum, %x3, %sum /* IEU1 */ vis2: DO_THE_TRICK( f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30, ,f60,f62,f48,f50,f52,f54,f56,f58,f58, ,LDBLK(f32), ,,STBLK,,,,,, @@ -555,27 +557,27 @@ vis2e2: DO_THE_TRICK( f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14 ,SYNC, ,,STBLK,ST(f48,64),ST(f50,72),ST(f52,80),ST(f54,88),ST(f56,96), ,add %dst, 104, %dst; add %len, 192 - 6*8, %len; ba,pt %icc, e1) .align 2048 -vis3s: wr %g2, ASI_BLK_XOR, %asi /* LSU Group */ - add %src, 128 - 24, %src /* IEU0 Group */ - ldda [%src-128] %asi, %f0 /* Load Group */ - ldda [%src-64] %asi, %f16 /* Load Group */ - fmovd %f0, %f54 /* FPA Group */ - fmovd %f48, %f0 /* FPA Group */ - sub %dst, 64, %dst /* IEU0 */ - fpsub32 %f2, %f2, %f2 /* FPA Group */ - fpsub32 %f4, %f4, %f4 /* FPA Group */ - fcmpgt32 %f32, %f6, %x4 /* FPM Group */ - faligndata %f6, %f8, %f48 /* FPA */ - fcmpgt32 %f32, %f8, %x5 /* FPM Group */ - faligndata %f8, %f10, %f50 /* FPA */ - fcmpgt32 %f32, %f10, %x6 /* FPM Group */ - faligndata %f10, %f12, %f52 /* FPA */ - fcmpgt32 %f32, %f12, %x7 /* FPM Group */ - faligndata %f12, %f14, %f54 /* FPA */ - fcmpgt32 %f32, %f14, %x8 /* FPM Group */ - fmovd %f14, %f56 /* FPA */ - inc %x4 /* IEU0 */ - srl %x4, 1, %x4 /* IEU0 Group */ +vis3s: wr %g2, ASI_BLK_XOR, %asi /* LSU Group */ + add %src, 128 - 24, %src /* IEU0 Group */ + ldda [%src-128] %asi, %f0 /* Load Group */ + ldda [%src-64] %asi, %f16 /* Load Group */ + fmovd %f0, %f54 /* FPA Group */ + fmovd %f48, %f0 /* FPA Group */ + sub %dst, 64, %dst /* IEU0 */ + fpsub32 %f2, %f2, %f2 /* FPA Group */ + fpsub32 %f4, %f4, %f4 /* FPA Group */ + fcmpgt32 %f32, %f6, %x4 /* FPM Group */ + faligndata %f6, %f8, %f48 /* FPA */ + fcmpgt32 %f32, %f8, %x5 /* FPM Group */ + faligndata %f8, %f10, %f50 /* FPA */ + fcmpgt32 %f32, %f10, %x6 /* FPM Group */ + faligndata %f10, %f12, %f52 /* FPA */ + fcmpgt32 %f32, %f12, %x7 /* FPM Group */ + faligndata %f12, %f14, %f54 /* FPA */ + fcmpgt32 %f32, %f14, %x8 /* FPM Group */ + fmovd %f14, %f56 /* FPA */ + inc %x4 /* IEU0 */ + srl %x4, 1, %x4 /* IEU0 Group */ vis3: DO_THE_TRICK( f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30, ,f58,f60,f62,f48,f50,f52,f54,f56,f56, ,LDBLK(f32), ,,,STBLK,,,,, @@ -601,25 +603,25 @@ vis3e2: DO_THE_TRICK( f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14 ,SYNC, ,,,STBLK,ST(f48,64),ST(f50,72),ST(f52,80),ST(f54,88), ,add %dst, 96, %dst; add %len, 192 - 5*8, %len; ba,pt %icc, e1) .align 2048 -vis4s: wr %g2, ASI_BLK_XOR, %asi /* LSU Group */ - add %src, 128 - 32, %src /* IEU0 Group */ - ldda [%src-128] %asi, %f0 /* Load Group */ - ldda [%src-64] %asi, %f16 /* Load Group */ - fmovd %f0, %f52 /* FPA Group */ - fmovd %f48, %f0 /* FPA Group */ - sub %dst, 64, %dst /* IEU0 */ - fpsub32 %f2, %f2, %f2 /* FPA Group */ - fpsub32 %f4, %f4, %f4 /* FPA Group */ - fpsub32 %f6, %f6, %f6 /* FPA Group */ - clr %x4 /* IEU0 */ - fcmpgt32 %f32, %f8, %x5 /* FPM Group */ - faligndata %f8, %f10, %f48 /* FPA */ - fcmpgt32 %f32, %f10, %x6 /* FPM Group */ - faligndata %f10, %f12, %f50 /* FPA */ - fcmpgt32 %f32, %f12, %x7 /* FPM Group */ - faligndata %f12, %f14, %f52 /* FPA */ - fcmpgt32 %f32, %f14, %x8 /* FPM Group */ - fmovd %f14, %f54 /* FPA */ +vis4s: wr %g2, ASI_BLK_XOR, %asi /* LSU Group */ + add %src, 128 - 32, %src /* IEU0 Group */ + ldda [%src-128] %asi, %f0 /* Load Group */ + ldda [%src-64] %asi, %f16 /* Load Group */ + fmovd %f0, %f52 /* FPA Group */ + fmovd %f48, %f0 /* FPA Group */ + sub %dst, 64, %dst /* IEU0 */ + fpsub32 %f2, %f2, %f2 /* FPA Group */ + fpsub32 %f4, %f4, %f4 /* FPA Group */ + fpsub32 %f6, %f6, %f6 /* FPA Group */ + clr %x4 /* IEU0 */ + fcmpgt32 %f32, %f8, %x5 /* FPM Group */ + faligndata %f8, %f10, %f48 /* FPA */ + fcmpgt32 %f32, %f10, %x6 /* FPM Group */ + faligndata %f10, %f12, %f50 /* FPA */ + fcmpgt32 %f32, %f12, %x7 /* FPM Group */ + faligndata %f12, %f14, %f52 /* FPA */ + fcmpgt32 %f32, %f14, %x8 /* FPM Group */ + fmovd %f14, %f54 /* FPA */ vis4: DO_THE_TRICK( f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30, ,f56,f58,f60,f62,f48,f50,f52,f54,f54, ,LDBLK(f32), ,,,,STBLK,,,, @@ -645,26 +647,26 @@ vis4e2: DO_THE_TRICK( f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14 ,SYNC, ,,,,STBLK,ST(f48,64),ST(f50,72),ST(f52,80), ,add %dst, 88, %dst; add %len, 192 - 4*8, %len; ba,pt %icc, e1) .align 2048 -vis5s: add %src, 128 - 40, %src /* IEU0 Group */ - ldda [%src-88] %asi, %f10 /* Load Group */ - ldda [%src-80] %asi, %f12 /* Load Group */ - ldda [%src-72] %asi, %f14 /* Load Group */ - wr %g2, ASI_BLK_XOR, %asi /* LSU Group */ - ldda [%src-64] %asi, %f16 /* Load Group */ - fmovd %f48, %f0 /* FPA Group */ - fmuld %f32, %f32, %f2 /* FPM */ - clr %x4 /* IEU0 */ - faddd %f32, %f32, %f4 /* FPA Group */ - fmuld %f32, %f32, %f6 /* FPM */ - clr %x5 /* IEU0 */ - faddd %f32, %f32, %f8 /* FPA Group */ - fcmpgt32 %f32, %f10, %x6 /* FPM Group */ - sub %dst, 64, %dst /* IEU0 */ - faligndata %f10, %f12, %f48 /* FPA */ - fcmpgt32 %f32, %f12, %x7 /* FPM Group */ - faligndata %f12, %f14, %f50 /* FPA */ - fcmpgt32 %f32, %f14, %x8 /* FPM Group */ - fmovd %f14, %f52 /* FPA */ +vis5s: add %src, 128 - 40, %src /* IEU0 Group */ + ldda [%src-88] %asi, %f10 /* Load Group */ + ldda [%src-80] %asi, %f12 /* Load Group */ + ldda [%src-72] %asi, %f14 /* Load Group */ + wr %g2, ASI_BLK_XOR, %asi /* LSU Group */ + ldda [%src-64] %asi, %f16 /* Load Group */ + fmovd %f48, %f0 /* FPA Group */ + fmuld %f32, %f32, %f2 /* FPM */ + clr %x4 /* IEU0 */ + faddd %f32, %f32, %f4 /* FPA Group */ + fmuld %f32, %f32, %f6 /* FPM */ + clr %x5 /* IEU0 */ + faddd %f32, %f32, %f8 /* FPA Group */ + fcmpgt32 %f32, %f10, %x6 /* FPM Group */ + sub %dst, 64, %dst /* IEU0 */ + faligndata %f10, %f12, %f48 /* FPA */ + fcmpgt32 %f32, %f12, %x7 /* FPM Group */ + faligndata %f12, %f14, %f50 /* FPA */ + fcmpgt32 %f32, %f14, %x8 /* FPM Group */ + fmovd %f14, %f52 /* FPA */ vis5: DO_THE_TRICK( f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30, ,f54,f56,f58,f60,f62,f48,f50,f52,f52, ,LDBLK(f32), ,,,,,STBLK,,, @@ -690,25 +692,25 @@ vis5e2: DO_THE_TRICK( f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14 ,SYNC, ,,,,,STBLK,ST(f48,64),ST(f50,72), ,add %dst, 80, %dst; add %len, 192 - 3*8, %len; ba,pt %icc, e1) .align 2048 -vis6s: add %src, 128 - 48, %src /* IEU0 Group */ - ldda [%src-80] %asi, %f12 /* Load Group */ - ldda [%src-72] %asi, %f14 /* Load Group */ - wr %g2, ASI_BLK_XOR, %asi /* LSU Group */ - ldda [%src-64] %asi, %f16 /* Load Group */ - fmovd %f48, %f0 /* FPA Group */ - fmuld %f32, %f32, %f2 /* FPM */ - clr %x4 /* IEU0 */ - faddd %f32, %f32, %f4 /* FPA Group */ - fmuld %f32, %f32, %f6 /* FPM */ - clr %x5 /* IEU0 */ - faddd %f32, %f32, %f8 /* FPA Group */ - fmuld %f32, %f32, %f10 /* FPM */ - clr %x6 /* IEU0 */ - fcmpgt32 %f32, %f12, %x7 /* FPM Group */ - sub %dst, 64, %dst /* IEU0 */ - fcmpgt32 %f32, %f14, %x8 /* FPM Group */ - faligndata %f12, %f14, %f48 /* FPA */ - fmovd %f14, %f50 /* FPA Group */ +vis6s: add %src, 128 - 48, %src /* IEU0 Group */ + ldda [%src-80] %asi, %f12 /* Load Group */ + ldda [%src-72] %asi, %f14 /* Load Group */ + wr %g2, ASI_BLK_XOR, %asi /* LSU Group */ + ldda [%src-64] %asi, %f16 /* Load Group */ + fmovd %f48, %f0 /* FPA Group */ + fmuld %f32, %f32, %f2 /* FPM */ + clr %x4 /* IEU0 */ + faddd %f32, %f32, %f4 /* FPA Group */ + fmuld %f32, %f32, %f6 /* FPM */ + clr %x5 /* IEU0 */ + faddd %f32, %f32, %f8 /* FPA Group */ + fmuld %f32, %f32, %f10 /* FPM */ + clr %x6 /* IEU0 */ + fcmpgt32 %f32, %f12, %x7 /* FPM Group */ + sub %dst, 64, %dst /* IEU0 */ + fcmpgt32 %f32, %f14, %x8 /* FPM Group */ + faligndata %f12, %f14, %f48 /* FPA */ + fmovd %f14, %f50 /* FPA Group */ vis6: DO_THE_TRICK( f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30, ,f52,f54,f56,f58,f60,f62,f48,f50,f50, ,LDBLK(f32), ,,,,,,STBLK,, @@ -734,24 +736,24 @@ vis6e2: DO_THE_TRICK( f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14 ,SYNC, ,,,,,,STBLK,ST(f48,64), ,add %dst, 72, %dst; add %len, 192 - 2*8, %len; ba,pt %icc, e1) .align 2048 -vis7s: add %src, 128 - 56, %src /* IEU0 Group */ - ldda [%src-72] %asi, %f14 /* Load Group */ - wr %g2, ASI_BLK_XOR, %asi /* LSU Group */ - ldda [%src-64] %asi, %f16 /* Load Group */ - fmovd %f48, %f0 /* FPA Group */ - fmuld %f32, %f32, %f2 /* FPM */ - clr %x4 /* IEU0 */ - faddd %f32, %f32, %f4 /* FPA Group */ - fmuld %f32, %f32, %f6 /* FPM */ - clr %x5 /* IEU0 */ - faddd %f32, %f32, %f8 /* FPA Group */ - fmuld %f32, %f32, %f10 /* FPM */ - clr %x6 /* IEU0 */ - faddd %f32, %f32, %f12 /* FPA Group */ - clr %x7 /* IEU0 */ - fcmpgt32 %f32, %f14, %x8 /* FPM Group */ - sub %dst, 64, %dst /* IEU0 */ - fmovd %f14, %f48 /* FPA */ +vis7s: add %src, 128 - 56, %src /* IEU0 Group */ + ldda [%src-72] %asi, %f14 /* Load Group */ + wr %g2, ASI_BLK_XOR, %asi /* LSU Group */ + ldda [%src-64] %asi, %f16 /* Load Group */ + fmovd %f48, %f0 /* FPA Group */ + fmuld %f32, %f32, %f2 /* FPM */ + clr %x4 /* IEU0 */ + faddd %f32, %f32, %f4 /* FPA Group */ + fmuld %f32, %f32, %f6 /* FPM */ + clr %x5 /* IEU0 */ + faddd %f32, %f32, %f8 /* FPA Group */ + fmuld %f32, %f32, %f10 /* FPM */ + clr %x6 /* IEU0 */ + faddd %f32, %f32, %f12 /* FPA Group */ + clr %x7 /* IEU0 */ + fcmpgt32 %f32, %f14, %x8 /* FPM Group */ + sub %dst, 64, %dst /* IEU0 */ + fmovd %f14, %f48 /* FPA */ vis7: DO_THE_TRICK( f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30, ,f50,f52,f54,f56,f58,f60,f62,f48,f48, ,LDBLK(f32), ,,,,,,,STBLK, @@ -779,112 +781,112 @@ vis7e2: DO_THE_TRICK( f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14 e1: END_THE_TRICK1( f0,f2,f4,f6,f8,f10,f12,f14,f16,f6) e2: END_THE_TRICK1( f16,f18,f20,f22,f24,f26,f28,f30,f32,f6) e3: END_THE_TRICK1( f32,f34,f36,f38,f40,f42,f44,f46,f0,f6) -ett: rd %asi, %x4 /* LSU Group+4bubbles */ - rd %gsr, %x3 /* LSU Group+4bubbles */ +ett: rd %asi, %x4 /* LSU Group+4bubbles */ + rd %gsr, %x3 /* LSU Group+4bubbles */ #ifdef __KERNEL__ - srl %x4, 3, %x5 /* IEU0 Group */ - xor %x4, ASI_BLK_XOR1, %x4 /* IEU1 */ - wr %x4, %x5, %asi /* LSU Group+4bubbles */ + srl %x4, 3, %x5 /* IEU0 Group */ + xor %x4, ASI_BLK_XOR1, %x4 /* IEU1 */ + wr %x4, %x5, %asi /* LSU Group+4bubbles */ #else - wr %x4, ASI_BLK_XOR, %asi /* LSU Group+4bubbles */ + wr %x4, ASI_BLK_XOR, %asi /* LSU Group+4bubbles */ #endif - andcc %x3, 7, %x3 /* IEU1 Group */ - add %dst, 8, %dst /* IEU0 */ - bne,pn %icc, 1f /* CTI */ - fzero %f10 /* FPA */ - brz,a,pn %len, 2f /* CTI+IEU1 Group */ - std %f6, [%dst - 8] /* Store */ -1: cmp %len, 8 /* IEU1 */ - blu,pn %icc, 3f /* CTI */ - sub %src, 64, %src /* IEU0 Group */ -1: ldda [%src] %asi, %f2 /* Load Group */ - fpadd32 %f10, %f2, %f12 /* FPA Group+load stall */ - add %src, 8, %src /* IEU0 */ - add %dst, 8, %dst /* IEU1 */ - faligndata %f6, %f2, %f14 /* FPA Group */ - fcmpgt32 %f10, %f12, %x5 /* FPM Group */ - std %f14, [%dst - 16] /* Store */ - fmovd %f2, %f6 /* FPA */ - fmovd %f12, %f10 /* FPA Group */ - sub %len, 8, %len /* IEU1 */ - fzero %f16 /* FPA Group - FPU nop */ - fzero %f18 /* FPA Group - FPU nop */ - inc %x5 /* IEU0 */ - srl %x5, 1, %x5 /* IEU0 Group (regdep) */ - cmp %len, 8 /* IEU1 */ - bgeu,pt %icc, 1b /* CTI */ - add %x5, %sum, %sum /* IEU0 Group */ -3: brz,a,pt %x3, 2f /* CTI+IEU1 */ - std %f6, [%dst - 8] /* Store Group */ - st %f7, [%dst - 8] /* Store Group */ - sub %dst, 4, %dst /* IEU0 */ - add %len, 4, %len /* IEU1 */ + andcc %x3, 7, %x3 /* IEU1 Group */ + add %dst, 8, %dst /* IEU0 */ + bne,pn %icc, 1f /* CTI */ + fzero %f10 /* FPA */ + brz,a,pn %len, 2f /* CTI+IEU1 Group */ + std %f6, [%dst - 8] /* Store */ +1: cmp %len, 8 /* IEU1 */ + blu,pn %icc, 3f /* CTI */ + sub %src, 64, %src /* IEU0 Group */ +1: ldda [%src] %asi, %f2 /* Load Group */ + fpadd32 %f10, %f2, %f12 /* FPA Group+load stall*/ + add %src, 8, %src /* IEU0 */ + add %dst, 8, %dst /* IEU1 */ + faligndata %f6, %f2, %f14 /* FPA Group */ + fcmpgt32 %f10, %f12, %x5 /* FPM Group */ + std %f14, [%dst - 16] /* Store */ + fmovd %f2, %f6 /* FPA */ + fmovd %f12, %f10 /* FPA Group */ + sub %len, 8, %len /* IEU1 */ + fzero %f16 /* FPA Group - FPU nop */ + fzero %f18 /* FPA Group - FPU nop */ + inc %x5 /* IEU0 */ + srl %x5, 1, %x5 /* IEU0 Group (regdep) */ + cmp %len, 8 /* IEU1 */ + bgeu,pt %icc, 1b /* CTI */ + add %x5, %sum, %sum /* IEU0 Group */ +3: brz,a,pt %x3, 2f /* CTI+IEU1 */ + std %f6, [%dst - 8] /* Store Group */ + st %f7, [%dst - 8] /* Store Group */ + sub %dst, 4, %dst /* IEU0 */ + add %len, 4, %len /* IEU1 */ 2: #ifdef __KERNEL__ - sub %sp, 8, %sp /* IEU0 Group */ + sub %sp, 8, %sp /* IEU0 Group */ #endif END_THE_TRICK2( f48,f50,f52,f54,f56,f58,f60,f10,f12,f62) - membar #Sync /* LSU Group */ + membar #Sync /* LSU Group */ #ifdef __KERNEL__ VISExit - add %sp, 8, %sp /* IEU0 Group */ + add %sp, 8, %sp /* IEU0 Group */ #endif -23: brnz,pn %len, 26f /* CTI+IEU1 Group */ -24: sllx %sum, 32, %g1 /* IEU0 */ -25: addcc %sum, %g1, %src /* IEU1 Group */ - srlx %src, 32, %src /* IEU0 Group (regdep) */ - bcs,a,pn %xcc, 1f /* CTI */ - add %src, 1, %src /* IEU1 */ +23: brnz,pn %len, 26f /* CTI+IEU1 Group */ +24: sllx %sum, 32, %g1 /* IEU0 */ +25: addcc %sum, %g1, %src /* IEU1 Group */ + srlx %src, 32, %src /* IEU0 Group (regdep) */ + bcs,a,pn %xcc, 1f /* CTI */ + add %src, 1, %src /* IEU1 */ #ifndef __KERNEL__ -1: retl /* CTI Group brk forced */ - srl %src, 0, %src /* IEU0 */ +1: retl /* CTI Group brk forced*/ + srl %src, 0, %src /* IEU0 */ #else -1: sethi %uhi(PAGE_OFFSET), %g4 /* IEU0 Group */ - retl /* CTI Group brk forced */ - sllx %g4, 32, %g4 /* IEU0 */ +1: sethi %uhi(PAGE_OFFSET), %g4 /* IEU0 Group */ + retl /* CTI Group brk forced*/ + sllx %g4, 32, %g4 /* IEU0 */ #endif -26: andcc %len, 8, %g0 /* IEU1 Group */ - be,pn %icc, 1f /* CTI */ - lduwa [%src] %asi, %o4 /* Load */ - lduwa [%src+4] %asi, %g2 /* Load Group */ - add %src, 8, %src /* IEU0 */ - add %dst, 8, %dst /* IEU1 */ - sllx %o4, 32, %g5 /* IEU0 Group */ - stw %o4, [%dst - 8] /* Store */ - or %g5, %g2, %g5 /* IEU0 Group */ - stw %g2, [%dst - 4] /* Store */ - addcc %g5, %sum, %sum /* IEU1 Group */ - bcs,a,pn %xcc, 1f /* CTI */ - add %sum, 1, %sum /* IEU0 */ -1: andcc %len, 4, %g0 /* IEU1 Group */ - be,a,pn %icc, 1f /* CTI */ - clr %g2 /* IEU0 */ - lduwa [%src] %asi, %g7 /* Load */ - add %src, 4, %src /* IEU0 Group */ - add %dst, 4, %dst /* IEU1 */ - sllx %g7, 32, %g2 /* IEU0 Group */ - stw %g7, [%dst - 4] /* Store */ -1: andcc %len, 2, %g0 /* IEU1 */ - be,a,pn %icc, 1f /* CTI */ - clr %g3 /* IEU0 Group */ - lduha [%src] %asi, %g7 /* Load */ - add %src, 2, %src /* IEU1 */ - add %dst, 2, %dst /* IEU0 Group */ - sll %g7, 16, %g3 /* IEU0 Group */ - sth %g7, [%dst - 2] /* Store */ -1: andcc %len, 1, %g0 /* IEU1 */ - be,a,pn %icc, 1f /* CTI */ - clr %o5 /* IEU0 Group */ - lduba [%src] %asi, %g7 /* Load */ - sll %g7, 8, %o5 /* IEU0 Group */ - stb %g7, [%dst] /* Store */ -1: or %g2, %g3, %g3 /* IEU1 */ - or %o5, %g3, %g3 /* IEU0 Group (regdep) */ - addcc %g3, %sum, %sum /* IEU1 Group (regdep) */ - bcs,a,pn %xcc, 1f /* CTI */ - add %sum, 1, %sum /* IEU0 */ -1: ba,pt %xcc, 25b /* CTI Group */ - sllx %sum, 32, %g1 /* IEU0 */ +26: andcc %len, 8, %g0 /* IEU1 Group */ + be,pn %icc, 1f /* CTI */ + lduwa [%src] %asi, %o4 /* Load */ + lduwa [%src+4] %asi, %g2 /* Load Group */ + add %src, 8, %src /* IEU0 */ + add %dst, 8, %dst /* IEU1 */ + sllx %o4, 32, %g5 /* IEU0 Group */ + stw %o4, [%dst - 8] /* Store */ + or %g5, %g2, %g5 /* IEU0 Group */ + stw %g2, [%dst - 4] /* Store */ + addcc %g5, %sum, %sum /* IEU1 Group */ + bcs,a,pn %xcc, 1f /* CTI */ + add %sum, 1, %sum /* IEU0 */ +1: andcc %len, 4, %g0 /* IEU1 Group */ + be,a,pn %icc, 1f /* CTI */ + clr %g2 /* IEU0 */ + lduwa [%src] %asi, %g7 /* Load */ + add %src, 4, %src /* IEU0 Group */ + add %dst, 4, %dst /* IEU1 */ + sllx %g7, 32, %g2 /* IEU0 Group */ + stw %g7, [%dst - 4] /* Store */ +1: andcc %len, 2, %g0 /* IEU1 */ + be,a,pn %icc, 1f /* CTI */ + clr %g3 /* IEU0 Group */ + lduha [%src] %asi, %g7 /* Load */ + add %src, 2, %src /* IEU1 */ + add %dst, 2, %dst /* IEU0 Group */ + sll %g7, 16, %g3 /* IEU0 Group */ + sth %g7, [%dst - 2] /* Store */ +1: andcc %len, 1, %g0 /* IEU1 */ + be,a,pn %icc, 1f /* CTI */ + clr %o5 /* IEU0 Group */ + lduba [%src] %asi, %g7 /* Load */ + sll %g7, 8, %o5 /* IEU0 Group */ + stb %g7, [%dst] /* Store */ +1: or %g2, %g3, %g3 /* IEU1 */ + or %o5, %g3, %g3 /* IEU0 Group (regdep) */ + addcc %g3, %sum, %sum /* IEU1 Group (regdep) */ + bcs,a,pn %xcc, 1f /* CTI */ + add %sum, 1, %sum /* IEU0 */ +1: ba,pt %xcc, 25b /* CTI Group */ + sllx %sum, 32, %g1 /* IEU0 */ #ifdef __KERNEL__ end: diff --git a/arch/sparc64/lib/VIScsumcopyusr.S b/arch/sparc64/lib/VIScsumcopyusr.S index 17bbe78b1..4730a1c08 100644 --- a/arch/sparc64/lib/VIScsumcopyusr.S +++ b/arch/sparc64/lib/VIScsumcopyusr.S @@ -1,4 +1,4 @@ -/* $Id: VIScsumcopyusr.S,v 1.1 2000/01/19 04:06:04 davem Exp $ +/* $Id: VIScsumcopyusr.S,v 1.2 2000/02/20 23:21:40 davem Exp $ * VIScsumcopyusr.S: High bandwidth IP checksumming with simultaneous * copying utilizing the UltraSparc Visual Instruction Set. * @@ -91,358 +91,360 @@ #define DO_THE_TRICK(f0,f2,f4,f6,f8,f10,f12,f14,F0,F2,F4,F6,F8,F10,F12,F14,DUMMY1,A0,A2,A4,A6,A8,A10,A12,A14,B14,DUMMY2,LOAD,STORE1,STORE2,STORE3,STORE4,STORE5,STORE6,STORE7,STORE8,DUMMY3,BRANCH...) \ - LOAD /* Load Group */; \ - faligndata %A14, %F0, %A14 /* FPA Group */; \ - inc %x5 /* IEU0 */; \ - STORE1 /* Store (optional) */; \ - faligndata %F0, %F2, %A0 /* FPA Group */; \ - srl %x5, 1, %x5 /* IEU0 */; \ - add %sum, %x4, %sum /* IEU1 */; \ - fpadd32 %F0, %f0, %F0 /* FPA Group */; \ - inc %x6 /* IEU0 */; \ - STORE2 /* Store (optional) */; \ - faligndata %F2, %F4, %A2 /* FPA Group */; \ - srl %x6, 1, %x6 /* IEU0 */; \ - add %sum, %x5, %sum /* IEU1 */; \ - fpadd32 %F2, %f2, %F2 /* FPA Group */; \ - add %src, 64, %src /* IEU0 */; \ - add %dst, 64, %dst /* IEU1 */; \ - fcmpgt32 %f0, %F0, %x1 /* FPM Group */; \ - inc %x7 /* IEU0 */; \ - STORE3 /* Store (optional) */; \ - faligndata %F4, %F6, %A4 /* FPA */; \ - srl %x7, 1, %x7 /* IEU0 Group */; \ - add %sum, %x6, %sum /* IEU1 */; \ - fpadd32 %F4, %f4, %F4 /* FPA */; \ - fcmpgt32 %f2, %F2, %x2 /* FPM Group */; \ - inc %x8 /* IEU0 */; \ - STORE4 /* Store (optional) */; \ - faligndata %F6, %F8, %A6 /* FPA */; \ - srl %x8, 1, %x8 /* IEU0 Group */; \ - add %sum, %x7, %sum /* IEU1 */; \ - fpadd32 %F6, %f6, %F6 /* FPA */; \ - fcmpgt32 %f4, %F4, %x3 /* FPM Group */; \ - inc %x1 /* IEU0 */; \ - STORE5 /* Store (optional) */; \ - faligndata %F8, %F10, %A8 /* FPA */; \ - srl %x1, 1, %x1 /* IEU0 Group */; \ - add %sum, %x8, %sum /* IEU1 */; \ - fpadd32 %F8, %f8, %F8 /* FPA */; \ - fcmpgt32 %f6, %F6, %x4 /* FPM Group */; \ - inc %x2 /* IEU0 */; \ - STORE6 /* Store (optional) */; \ - faligndata %F10, %F12, %A10 /* FPA */; \ - srl %x2, 1, %x2 /* IEU0 Group */; \ - add %sum, %x1, %sum /* IEU1 */; \ - fpadd32 %F10, %f10, %F10 /* FPA */; \ - fcmpgt32 %f8, %F8, %x5 /* FPM Group */; \ - inc %x3 /* IEU0 */; \ - STORE7 /* Store (optional) */; \ - faligndata %F12, %F14, %A12 /* FPA */; \ - srl %x3, 1, %x3 /* IEU0 Group */; \ - add %sum, %x2, %sum /* IEU1 */; \ - fpadd32 %F12, %f12, %F12 /* FPA */; \ - fcmpgt32 %f10, %F10, %x6 /* FPM Group */; \ - inc %x4 /* IEU0 */; \ - STORE8 /* Store (optional) */; \ - fmovd %F14, %B14 /* FPA */; \ - srl %x4, 1, %x4 /* IEU0 Group */; \ - add %sum, %x3, %sum /* IEU1 */; \ - fpadd32 %F14, %f14, %F14 /* FPA */; \ - fcmpgt32 %f12, %F12, %x7 /* FPM Group */; \ - subcc %len, 64, %len /* IEU1 */; \ - BRANCH /* CTI */; \ - fcmpgt32 %f14, %F14, %x8 /* FPM Group */; \ + LOAD /* Load (Group) */; \ + faligndata %A14, %F0, %A14 /* FPA Group */; \ + inc %x5 /* IEU0 */; \ + STORE1 /* Store (optional) */; \ + faligndata %F0, %F2, %A0 /* FPA Group */; \ + srl %x5, 1, %x5 /* IEU0 */; \ + add %sum, %x4, %sum /* IEU1 */; \ + fpadd32 %F0, %f0, %F0 /* FPA Group */; \ + inc %x6 /* IEU0 */; \ + STORE2 /* Store (optional) */; \ + faligndata %F2, %F4, %A2 /* FPA Group */; \ + srl %x6, 1, %x6 /* IEU0 */; \ + add %sum, %x5, %sum /* IEU1 */; \ + fpadd32 %F2, %f2, %F2 /* FPA Group */; \ + add %src, 64, %src /* IEU0 */; \ + fcmpgt32 %f0, %F0, %x1 /* FPM */; \ + add %dst, 64, %dst /* IEU1 Group */; \ + inc %x7 /* IEU0 */; \ + STORE3 /* Store (optional) */; \ + faligndata %F4, %F6, %A4 /* FPA */; \ + fpadd32 %F4, %f4, %F4 /* FPA Group */; \ + add %sum, %x6, %sum /* IEU1 */; \ + fcmpgt32 %f2, %F2, %x2 /* FPM */; \ + srl %x7, 1, %x7 /* IEU0 Group */; \ + inc %x8 /* IEU1 */; \ + STORE4 /* Store (optional) */; \ + faligndata %F6, %F8, %A6 /* FPA */; \ + fpadd32 %F6, %f6, %F6 /* FPA Group */; \ + srl %x8, 1, %x8 /* IEU0 */; \ + fcmpgt32 %f4, %F4, %x3 /* FPM */; \ + add %sum, %x7, %sum /* IEU0 Group */; \ + inc %x1 /* IEU1 */; \ + STORE5 /* Store (optional) */; \ + faligndata %F8, %F10, %A8 /* FPA */; \ + fpadd32 %F8, %f8, %F8 /* FPA Group */; \ + srl %x1, 1, %x1 /* IEU0 */; \ + fcmpgt32 %f6, %F6, %x4 /* FPM */; \ + add %sum, %x8, %sum /* IEU0 Group */; \ + inc %x2 /* IEU1 */; \ + STORE6 /* Store (optional) */; \ + faligndata %F10, %F12, %A10 /* FPA */; \ + fpadd32 %F10, %f10, %F10 /* FPA Group */; \ + srl %x2, 1, %x2 /* IEU0 */; \ + fcmpgt32 %f8, %F8, %x5 /* FPM */; \ + add %sum, %x1, %sum /* IEU0 Group */; \ + inc %x3 /* IEU1 */; \ + STORE7 /* Store (optional) */; \ + faligndata %F12, %F14, %A12 /* FPA */; \ + fpadd32 %F12, %f12, %F12 /* FPA Group */; \ + srl %x3, 1, %x3 /* IEU0 */; \ + fcmpgt32 %f10, %F10, %x6 /* FPM */; \ + add %sum, %x2, %sum /* IEU0 Group */; \ + inc %x4 /* IEU1 */; \ + STORE8 /* Store (optional) */; \ + fmovd %F14, %B14 /* FPA */; \ + fpadd32 %F14, %f14, %F14 /* FPA Group */; \ + srl %x4, 1, %x4 /* IEU0 */; \ + fcmpgt32 %f12, %F12, %x7 /* FPM */; \ + add %sum, %x3, %sum /* IEU0 Group */; \ + subcc %len, 64, %len /* IEU1 */; \ + BRANCH /* CTI */; \ + fcmpgt32 %f14, %F14, %x8 /* FPM Group */; #define END_THE_TRICK(f0,f2,f4,f6,f8,f10,f12,f14,FA,FB,S0,S1,S2,S3,T0,T1,U0,fz) \ - inc %x5 /* IEU0 Group */; \ - fpadd32 %f2, %f0, %S0 /* FPA */; \ - srl %x5, 1, %x5 /* IEU0 Group */; \ - add %sum, %x4, %sum /* IEU1 */; \ - fpadd32 %f6, %f4, %S1 /* FPA */; \ - inc %x6 /* IEU0 Group */; \ - add %sum, %x5, %sum /* IEU1 */; \ - fcmpgt32 %f0, %S0, %x1 /* FPM Group */; \ - srl %x6, 1, %x6 /* IEU0 */; \ - inc %x7 /* IEU1 */; \ - fpadd32 %f10, %f8, %S2 /* FPA */; \ - fcmpgt32 %f4, %S1, %x2 /* FPM Group */; \ - srl %x7, 1, %x7 /* IEU0 */; \ - add %sum, %x6, %sum /* IEU1 */; \ - fpadd32 %f14, %f12, %S3 /* FPA */; \ - inc %x8 /* IEU0 Group */; \ - add %sum, %x7, %sum /* IEU1 */; \ - fzero %fz /* FPA */; \ - fcmpgt32 %f8, %S2, %x3 /* FPM Group */; \ - srl %x8, 1, %x8 /* IEU0 */; \ - inc %x1 /* IEU1 */; \ - fpadd32 %S0, %S1, %T0 /* FPA */; \ - fcmpgt32 %f12, %S3, %x4 /* FPM Group */; \ - srl %x1, 1, %x1 /* IEU0 */; \ - add %sum, %x8, %sum /* IEU1 */; \ - fpadd32 %S2, %S3, %T1 /* FPA */; \ - inc %x2 /* IEU0 Group */; \ - add %sum, %x1, %sum /* IEU1 */; \ - fcmpgt32 %S0, %T0, %x5 /* FPM Group */; \ - srl %x2, 1, %x2 /* IEU0 */; \ - inc %x3 /* IEU1 */; \ - fcmpgt32 %S2, %T1, %x6 /* FPM Group */; \ - srl %x3, 1, %x3 /* IEU0 */; \ - add %sum, %x2, %sum /* IEU1 */; \ - inc %x4 /* IEU0 Group */; \ - add %sum, %x3, %sum /* IEU1 */; \ - fcmpgt32 %fz, %f2, %x7 /* FPM Group */; \ - srl %x4, 1, %x4 /* IEU0 */; \ - inc %x5 /* IEU1 */; \ - fpadd32 %T0, %T1, %U0 /* FPA */; \ - fcmpgt32 %fz, %f6, %x8 /* FPM Group */; \ - srl %x5, 1, %x5 /* IEU0 */; \ - add %sum, %x4, %sum /* IEU1 */; \ - inc %x6 /* IEU0 Group */; \ - add %sum, %x5, %sum /* IEU1 */; \ - fcmpgt32 %fz, %f10, %x1 /* FPM Group */; \ - srl %x6, 1, %x6 /* IEU0 */; \ - inc %x7 /* IEU1 */; \ - fcmpgt32 %fz, %f14, %x2 /* FPM Group */; \ - ba,pt %xcc, ett /* CTI */; \ - fmovd %FA, %FB /* FPA */; \ + inc %x5 /* IEU0 Group */; \ + fpadd32 %f2, %f0, %S0 /* FPA */; \ + add %sum, %x4, %sum /* IEU1 */; \ + srl %x5, 1, %x5 /* IEU0 Group */; \ + fpadd32 %f6, %f4, %S1 /* FPA */; \ + inc %x6 /* IEU1 */; \ + fpadd32 %f10, %f8, %S2 /* FPA Group */; \ + add %sum, %x5, %sum /* IEU0 */; \ + fcmpgt32 %f0, %S0, %x1 /* FPM */; \ + fpadd32 %f14, %f12, %S3 /* FPA Group */; \ + srl %x6, 1, %x6 /* IEU0 */; \ + fcmpgt32 %f4, %S1, %x2 /* FPM */; \ + add %sum, %x6, %sum /* IEU0 Group */; \ + fzero %fz /* FPA */; \ + fcmpgt32 %f8, %S2, %x3 /* FPM */; \ + inc %x7 /* IEU0 Group */; \ + inc %x8 /* IEU1 */; \ + srl %x7, 1, %x7 /* IEU0 Group */; \ + inc %x1 /* IEU1 */; \ + fpadd32 %S0, %S1, %T0 /* FPA */; \ + fpadd32 %S2, %S3, %T1 /* FPA Group */; \ + add %sum, %x7, %sum /* IEU0 */; \ + fcmpgt32 %f12, %S3, %x4 /* FPM */; \ + srl %x8, 1, %x8 /* IEU0 Group */; \ + inc %x2 /* IEU1 */; \ + srl %x1, 1, %x1 /* IEU0 Group */; \ + add %sum, %x8, %sum /* IEU1 */; \ + add %sum, %x1, %sum /* IEU0 Group */; \ + fcmpgt32 %S0, %T0, %x5 /* FPM */; \ + srl %x2, 1, %x2 /* IEU0 Group */; \ + fcmpgt32 %S2, %T1, %x6 /* FPM */; \ + inc %x3 /* IEU0 Group */; \ + add %sum, %x2, %sum /* IEU1 */; \ + srl %x3, 1, %x3 /* IEU0 Group */; \ + inc %x4 /* IEU1 */; \ + fpadd32 %T0, %T1, %U0 /* FPA Group */; \ + add %sum, %x3, %sum /* IEU0 */; \ + fcmpgt32 %fz, %f2, %x7 /* FPM */; \ + srl %x4, 1, %x4 /* IEU0 Group */; \ + fcmpgt32 %fz, %f6, %x8 /* FPM */; \ + inc %x5 /* IEU0 Group */; \ + add %sum, %x4, %sum /* IEU1 */; \ + srl %x5, 1, %x5 /* IEU0 Group */; \ + fcmpgt32 %fz, %f10, %x1 /* FPM */; \ + inc %x6 /* IEU0 Group */; \ + add %sum, %x5, %sum /* IEU1 */; \ + fmovd %FA, %FB /* FPA Group */; \ + fcmpgt32 %fz, %f14, %x2 /* FPM */; \ + srl %x6, 1, %x6 /* IEU0 Group */; \ + ba,pt %xcc, ett /* CTI */; \ + inc %x7 /* IEU1 */; -#define END_THE_TRICK1(f0,f2,f4,f6,f8,f10,f12,f14,FA,FB) \ +#define END_THE_TRICK1(f0,f2,f4,f6,f8,f10,f12,f14,FA,FB) \ END_THE_TRICK(f0,f2,f4,f6,f8,f10,f12,f14,FA,FB,f48,f50,f52,f54,f56,f58,f60,f62) -#define END_THE_TRICK2(S0,S1,S2,S3,T0,T1,U0,U1,V0,fz) \ - fpadd32 %U0, %U1, %V0 /* FPA Group */; \ - srl %x7, 1, %x7 /* IEU0 */; \ - add %sum, %x6, %sum /* IEU1 */; \ - std %V0, [%sp + STACKOFF] /* Store Group */; \ - inc %x8 /* IEU0 */; \ - sub %sum, %x7, %sum /* IEU1 */; \ - fcmpgt32 %fz, %S1, %x3 /* FPM Group */; \ - srl %x8, 1, %x8 /* IEU0 */; \ - inc %x1 /* IEU1 */; \ - fcmpgt32 %fz, %S3, %x4 /* FPM Group */; \ - srl %x1, 1, %x1 /* IEU0 */; \ - sub %sum, %x8, %sum /* IEU1 */; \ - ldx [%sp + STACKOFF], %x8 /* Load Group */; \ - inc %x2 /* IEU0 */; \ - sub %sum, %x1, %sum /* IEU1 */; \ - fcmpgt32 %fz, %T1, %x5 /* FPM Group */; \ - srl %x2, 1, %x2 /* IEU0 */; \ - inc %x3 /* IEU1 */; \ - fcmpgt32 %T0, %U0, %x6 /* FPM Group */; \ - srl %x3, 1, %x3 /* IEU0 */; \ - sub %sum, %x2, %sum /* IEU1 */; \ - inc %x4 /* IEU0 Group */; \ - sub %sum, %x3, %sum /* IEU1 */; \ - fcmpgt32 %fz, %U1, %x7 /* FPM Group */; \ - srl %x4, 1, %x4 /* IEU0 */; \ - inc %x5 /* IEU1 */; \ - fcmpgt32 %U0, %V0, %x1 /* FPM Group */; \ - srl %x5, 1, %x5 /* IEU0 */; \ - sub %sum, %x4, %sum /* IEU1 */; \ - fcmpgt32 %fz, %V0, %x2 /* FPM Group */; \ - inc %x6 /* IEU0 */; \ - sub %sum, %x5, %sum /* IEU1 */; \ - srl %x6, 1, %x6 /* IEU0 Group */; \ - inc %x7 /* IEU1 */; \ - srl %x7, 1, %x7 /* IEU0 Group */; \ - add %sum, %x6, %sum /* IEU1 */; \ - inc %x1 /* IEU0 Group */; \ - sub %sum, %x7, %sum /* IEU1 */; \ - srl %x1, 1, %x1 /* IEU0 Group */; \ - inc %x2 /* IEU1 */; \ - srl %x2, 1, %x2 /* IEU0 Group */; \ - add %sum, %x1, %sum /* IEU1 */; \ - sub %sum, %x2, %sum /* IEU0 Group */; \ - addcc %sum, %x8, %sum /* IEU Group */; \ - bcs,a,pn %xcc, 33f /* CTI */; \ - add %sum, 1, %sum /* IEU0 */; \ -33: /* That's it */; +#define END_THE_TRICK2(S0,S1,S2,S3,T0,T1,U0,U1,V0,fz) \ + fpadd32 %U0, %U1, %V0 /* FPA Group */; \ + srl %x7, 1, %x7 /* IEU0 */; \ + add %sum, %x6, %sum /* IEU1 */; \ + std %V0, [%sp + STACKOFF] /* Store Group */; \ + inc %x8 /* IEU0 */; \ + sub %sum, %x7, %sum /* IEU1 */; \ + srl %x8, 1, %x8 /* IEU0 Group */; \ + fcmpgt32 %fz, %S1, %x3 /* FPM */; \ + inc %x1 /* IEU0 Group */; \ + fcmpgt32 %fz, %S3, %x4 /* FPM */; \ + srl %x1, 1, %x1 /* IEU0 Group */; \ + sub %sum, %x8, %sum /* IEU1 */; \ + ldx [%sp + STACKOFF], %x8 /* Load Group */; \ + inc %x2 /* IEU0 */; \ + sub %sum, %x1, %sum /* IEU1 */; \ + srl %x2, 1, %x2 /* IEU0 Group */; \ + fcmpgt32 %fz, %T1, %x5 /* FPM */; \ + inc %x3 /* IEU0 Group */; \ + fcmpgt32 %T0, %U0, %x6 /* FPM */; \ + srl %x3, 1, %x3 /* IEU0 Group */; \ + sub %sum, %x2, %sum /* IEU1 */; \ + inc %x4 /* IEU0 Group */; \ + sub %sum, %x3, %sum /* IEU1 */; \ + srl %x4, 1, %x4 /* IEU0 Group */; \ + fcmpgt32 %fz, %U1, %x7 /* FPM */; \ + inc %x5 /* IEU0 Group */; \ + fcmpgt32 %U0, %V0, %x1 /* FPM */; \ + srl %x5, 1, %x5 /* IEU0 Group */; \ + sub %sum, %x4, %sum /* IEU1 */; \ + sub %sum, %x5, %sum /* IEU0 Group */; \ + fcmpgt32 %fz, %V0, %x2 /* FPM */; \ + inc %x6 /* IEU0 Group */; \ + inc %x7 /* IEU1 */; \ + srl %x6, 1, %x6 /* IEU0 Group */; \ + inc %x1 /* IEU1 */; \ + srl %x7, 1, %x7 /* IEU0 Group */; \ + add %sum, %x6, %sum /* IEU1 */; \ + srl %x1, 1, %x1 /* IEU0 Group */; \ + sub %sum, %x7, %sum /* IEU1 */; \ + inc %x2 /* IEU0 Group */; \ + add %sum, %x1, %sum /* IEU1 */; \ + srl %x2, 1, %x2 /* IEU0 Group */; \ + sub %sum, %x2, %sum /* IEU0 Group */; \ + addcc %sum, %x8, %sum /* IEU1 Group */; \ + bcs,a,pn %xcc, 33f /* CTI */; \ + add %sum, 1, %sum /* IEU0 (Group) */; \ +33: /* That's it */; .text .globl csum_partial_copy_user_vis .align 32 -/* %asi should be either ASI_P or ASI_AIUS for csum_partial_copy resp. csum_partial_copy_from_user */ -/* This assumes that !((%src^%dst)&3) && !((%src|%dst)&1) && %len >= 256 */ +/* %asi should be either ASI_P or ASI_AIUS for csum_partial_copy resp. + * csum_partial_copy_from_user + * This assumes that !((%src^%dst)&3) && !((%src|%dst)&1) && %len >= 256 + */ csum_partial_copy_user_vis: - andcc %dst, 7, %g0 /* IEU1 Group */ - be,pt %icc, 4f /* CTI */ - and %dst, 0x38, %o4 /* IEU0 */ - mov 1, %g5 /* IEU0 Group */ - andcc %dst, 2, %g0 /* IEU1 */ - be,pt %icc, 1f /* CTI */ - and %dst, 4, %g7 /* IEU0 Group */ - lduh [%src], %g2 /* Load */ - sub %len, 2, %len /* IEU0 Group */ - add %dst, 2, %dst /* IEU1 */ - andcc %dst, 4, %g7 /* IEU1 Group */ - sll %g5, 16, %g5 /* IEU0 */ - stha %g2, [%dst - 2] %asi /* Store Group */ - sll %g2, 16, %g2 /* IEU0 */ - add %src, 2, %src /* IEU1 */ - addcc %g2, %sum, %sum /* IEU1 Group */ - bcs,a,pn %icc, 1f /* CTI */ - add %sum, %g5, %sum /* IEU0 */ -1: lduw [%src], %g2 /* Load */ - brz,a,pn %g7, 4f /* CTI+IEU1 Group */ - and %dst, 0x38, %o4 /* IEU0 */ - add %dst, 4, %dst /* IEU0 Group */ - sub %len, 4, %len /* IEU1 */ - addcc %g2, %sum, %sum /* IEU1 Group */ - bcs,a,pn %icc, 1f /* CTI */ - add %sum, 1, %sum /* IEU0 */ -1: and %dst, 0x38, %o4 /* IEU0 Group */ - stwa %g2, [%dst - 4] %asi /* Store */ - add %src, 4, %src /* IEU1 */ + andcc %dst, 7, %g0 /* IEU1 Group */ + be,pt %icc, 4f /* CTI */ + and %dst, 0x38, %o4 /* IEU0 */ + mov 1, %g5 /* IEU0 Group */ + andcc %dst, 2, %g0 /* IEU1 */ + be,pt %icc, 1f /* CTI */ + and %dst, 4, %g7 /* IEU0 Group */ + lduh [%src], %g2 /* Load */ + sub %len, 2, %len /* IEU0 Group */ + add %dst, 2, %dst /* IEU1 */ + andcc %dst, 4, %g7 /* IEU1 Group */ + sll %g5, 16, %g5 /* IEU0 */ + stha %g2, [%dst - 2] %asi /* Store Group */ + sll %g2, 16, %g2 /* IEU0 */ + add %src, 2, %src /* IEU1 */ + addcc %g2, %sum, %sum /* IEU1 Group */ + bcs,a,pn %icc, 1f /* CTI */ + add %sum, %g5, %sum /* IEU0 */ +1: lduw [%src], %g2 /* Load */ + brz,a,pn %g7, 4f /* CTI+IEU1 Group */ + and %dst, 0x38, %o4 /* IEU0 */ + add %dst, 4, %dst /* IEU0 Group */ + sub %len, 4, %len /* IEU1 */ + addcc %g2, %sum, %sum /* IEU1 Group */ + bcs,a,pn %icc, 1f /* CTI */ + add %sum, 1, %sum /* IEU0 */ +1: and %dst, 0x38, %o4 /* IEU0 Group */ + stwa %g2, [%dst - 4] %asi /* Store */ + add %src, 4, %src /* IEU1 */ 4: #ifdef __KERNEL__ VISEntry #endif - mov %src, %g7 /* IEU1 Group */ - fzero %f48 /* FPA */ - alignaddr %src, %g0, %src /* Single Group */ - subcc %g7, %src, %g7 /* IEU1 Group */ - be,pt %xcc, 1f /* CTI */ - mov 0x40, %g1 /* IEU0 */ - lduw [%src], %g2 /* Load Group */ - subcc %sum, %g2, %sum /* IEU1 Group+load stall */ - bcs,a,pn %icc, 1f /* CTI */ - sub %sum, 1, %sum /* IEU0 */ -1: srl %sum, 0, %sum /* IEU0 Group */ - clr %g5 /* IEU1 */ - brz,pn %o4, 3f /* CTI+IEU1 Group */ - sub %g1, %o4, %g1 /* IEU0 */ - ldd [%src], %f0 /* Load */ - clr %o4 /* IEU0 Group */ - andcc %dst, 8, %g0 /* IEU1 */ - be,pn %icc, 1f /* CTI */ - ldd [%src + 8], %f2 /* Load Group */ - add %src, 8, %src /* IEU0 */ - sub %len, 8, %len /* IEU1 */ - fpadd32 %f0, %f48, %f50 /* FPA */ - addcc %dst, 8, %dst /* IEU1 Group */ - faligndata %f0, %f2, %f16 /* FPA */ - fcmpgt32 %f48, %f50, %o4 /* FPM Group */ - fmovd %f2, %f0 /* FPA Group */ - ldd [%src + 8], %f2 /* Load */ - stda %f16, [%dst - 8] %asi /* Store */ - fmovd %f50, %f48 /* FPA */ -1: andcc %g1, 0x10, %g0 /* IEU1 Group */ - be,pn %icc, 1f /* CTI */ - and %g1, 0x20, %g1 /* IEU0 */ - fpadd32 %f0, %f48, %f50 /* FPA */ - ldd [%src + 16], %f4 /* Load Group */ - add %src, 16, %src /* IEU0 */ - add %dst, 16, %dst /* IEU1 */ - faligndata %f0, %f2, %f16 /* FPA */ - fcmpgt32 %f48, %f50, %g5 /* FPM Group */ - sub %len, 16, %len /* IEU0 */ - inc %o4 /* IEU1 */ - stda %f16, [%dst - 16] %asi /* Store Group */ - fpadd32 %f2, %f50, %f48 /* FPA */ - srl %o4, 1, %o5 /* IEU0 */ - faligndata %f2, %f4, %f18 /* FPA Group */ - stda %f18, [%dst - 8] %asi /* Store */ - fcmpgt32 %f50, %f48, %o4 /* FPM Group */ - add %o5, %sum, %sum /* IEU0 */ - ldd [%src + 8], %f2 /* Load */ - fmovd %f4, %f0 /* FPA */ -1: brz,a,pn %g1, 4f /* CTI+IEU1 Group */ - rd %asi, %g2 /* LSU Group + 4 bubbles */ - inc %g5 /* IEU0 */ - fpadd32 %f0, %f48, %f50 /* FPA */ - ldd [%src + 16], %f4 /* Load Group */ - srl %g5, 1, %g5 /* IEU0 */ - add %dst, 32, %dst /* IEU1 */ - faligndata %f0, %f2, %f16 /* FPA */ - fcmpgt32 %f48, %f50, %o5 /* FPM Group */ - inc %o4 /* IEU0 */ - ldd [%src + 24], %f6 /* Load */ - srl %o4, 1, %o4 /* IEU0 Group */ - add %g5, %sum, %sum /* IEU1 */ - ldd [%src + 32], %f8 /* Load */ - fpadd32 %f2, %f50, %f48 /* FPA */ - faligndata %f2, %f4, %f18 /* FPA Group */ - sub %len, 32, %len /* IEU0 */ - stda %f16, [%dst - 32] %asi /* Store */ - fcmpgt32 %f50, %f48, %g3 /* FPM Group */ - inc %o5 /* IEU0 */ - add %o4, %sum, %sum /* IEU1 */ - fpadd32 %f4, %f48, %f50 /* FPA */ - faligndata %f4, %f6, %f20 /* FPA Group */ - srl %o5, 1, %o5 /* IEU0 */ - fcmpgt32 %f48, %f50, %g5 /* FPM Group */ - add %o5, %sum, %sum /* IEU0 */ - stda %f18, [%dst - 24] %asi /* Store */ - fpadd32 %f6, %f50, %f48 /* FPA */ - inc %g3 /* IEU0 Group */ - stda %f20, [%dst - 16] %asi /* Store */ - add %src, 32, %src /* IEU1 */ - faligndata %f6, %f8, %f22 /* FPA */ - fcmpgt32 %f50, %f48, %o4 /* FPM Group */ - srl %g3, 1, %g3 /* IEU0 */ - stda %f22, [%dst - 8] %asi /* Store */ - add %g3, %sum, %sum /* IEU0 Group */ -3: rd %asi, %g2 /* LSU Group + 4 bubbles */ + mov %src, %g7 /* IEU1 Group */ + fzero %f48 /* FPA */ + alignaddr %src, %g0, %src /* Single Group */ + subcc %g7, %src, %g7 /* IEU1 Group */ + be,pt %xcc, 1f /* CTI */ + mov 0x40, %g1 /* IEU0 */ + lduw [%src], %g2 /* Load Group */ + subcc %sum, %g2, %sum /* IEU1 Group+load stall*/ + bcs,a,pn %icc, 1f /* CTI */ + sub %sum, 1, %sum /* IEU0 */ +1: srl %sum, 0, %sum /* IEU0 Group */ + clr %g5 /* IEU1 */ + brz,pn %o4, 3f /* CTI+IEU1 Group */ + sub %g1, %o4, %g1 /* IEU0 */ + ldd [%src], %f0 /* Load */ + clr %o4 /* IEU0 Group */ + andcc %dst, 8, %g0 /* IEU1 */ + be,pn %icc, 1f /* CTI */ + ldd [%src + 8], %f2 /* Load Group */ + add %src, 8, %src /* IEU0 */ + sub %len, 8, %len /* IEU1 */ + fpadd32 %f0, %f48, %f50 /* FPA */ + addcc %dst, 8, %dst /* IEU1 Group */ + faligndata %f0, %f2, %f16 /* FPA */ + fcmpgt32 %f48, %f50, %o4 /* FPM Group */ + fmovd %f2, %f0 /* FPA Group */ + ldd [%src + 8], %f2 /* Load */ + stda %f16, [%dst - 8] %asi /* Store */ + fmovd %f50, %f48 /* FPA */ +1: andcc %g1, 0x10, %g0 /* IEU1 Group */ + be,pn %icc, 1f /* CTI */ + and %g1, 0x20, %g1 /* IEU0 */ + fpadd32 %f0, %f48, %f50 /* FPA */ + ldd [%src + 16], %f4 /* Load Group */ + add %src, 16, %src /* IEU0 */ + add %dst, 16, %dst /* IEU1 */ + faligndata %f0, %f2, %f16 /* FPA */ + fcmpgt32 %f48, %f50, %g5 /* FPM Group */ + sub %len, 16, %len /* IEU0 */ + inc %o4 /* IEU1 */ + stda %f16, [%dst - 16] %asi /* Store Group */ + fpadd32 %f2, %f50, %f48 /* FPA */ + srl %o4, 1, %o5 /* IEU0 */ + faligndata %f2, %f4, %f18 /* FPA Group */ + stda %f18, [%dst - 8] %asi /* Store */ + fcmpgt32 %f50, %f48, %o4 /* FPM Group */ + add %o5, %sum, %sum /* IEU0 */ + ldd [%src + 8], %f2 /* Load */ + fmovd %f4, %f0 /* FPA */ +1: brz,a,pn %g1, 4f /* CTI+IEU1 Group */ + rd %asi, %g2 /* LSU Group + 4 bubbles*/ + inc %g5 /* IEU0 */ + fpadd32 %f0, %f48, %f50 /* FPA */ + ldd [%src + 16], %f4 /* Load Group */ + srl %g5, 1, %g5 /* IEU0 */ + add %dst, 32, %dst /* IEU1 */ + faligndata %f0, %f2, %f16 /* FPA */ + fcmpgt32 %f48, %f50, %o5 /* FPM Group */ + inc %o4 /* IEU0 */ + ldd [%src + 24], %f6 /* Load */ + srl %o4, 1, %o4 /* IEU0 Group */ + add %g5, %sum, %sum /* IEU1 */ + ldd [%src + 32], %f8 /* Load */ + fpadd32 %f2, %f50, %f48 /* FPA */ + faligndata %f2, %f4, %f18 /* FPA Group */ + sub %len, 32, %len /* IEU0 */ + stda %f16, [%dst - 32] %asi /* Store */ + fcmpgt32 %f50, %f48, %g3 /* FPM Group */ + inc %o5 /* IEU0 */ + add %o4, %sum, %sum /* IEU1 */ + fpadd32 %f4, %f48, %f50 /* FPA */ + faligndata %f4, %f6, %f20 /* FPA Group */ + srl %o5, 1, %o5 /* IEU0 */ + fcmpgt32 %f48, %f50, %g5 /* FPM Group */ + add %o5, %sum, %sum /* IEU0 */ + stda %f18, [%dst - 24] %asi /* Store */ + fpadd32 %f6, %f50, %f48 /* FPA */ + inc %g3 /* IEU0 Group */ + stda %f20, [%dst - 16] %asi /* Store */ + add %src, 32, %src /* IEU1 */ + faligndata %f6, %f8, %f22 /* FPA */ + fcmpgt32 %f50, %f48, %o4 /* FPM Group */ + srl %g3, 1, %g3 /* IEU0 */ + stda %f22, [%dst - 8] %asi /* Store */ + add %g3, %sum, %sum /* IEU0 Group */ +3: rd %asi, %g2 /* LSU Group + 4 bubbles*/ #ifdef __KERNEL__ -4: sethi %hi(vis0s), %g7 /* IEU0 Group */ - or %g2, ASI_BLK_OR, %g2 /* IEU1 */ +4: sethi %hi(vis0s), %g7 /* IEU0 Group */ + or %g2, ASI_BLK_OR, %g2 /* IEU1 */ #else -4: rd %pc, %g7 /* LSU Group + 4 bubbles */ +4: rd %pc, %g7 /* LSU Group + 4 bubbles*/ #endif - inc %g5 /* IEU0 Group */ - and %src, 0x38, %g3 /* IEU1 */ - membar #StoreLoad /* LSU Group */ - srl %g5, 1, %g5 /* IEU0 */ - inc %o4 /* IEU1 */ - sll %g3, 8, %g3 /* IEU0 Group */ - sub %len, 0xc0, %len /* IEU1 */ - addcc %g5, %sum, %sum /* IEU1 Group */ - srl %o4, 1, %o4 /* IEU0 */ - add %g7, %g3, %g7 /* IEU0 Group */ - add %o4, %sum, %sum /* IEU1 */ + inc %g5 /* IEU0 Group */ + and %src, 0x38, %g3 /* IEU1 */ + membar #StoreLoad /* LSU Group */ + srl %g5, 1, %g5 /* IEU0 */ + inc %o4 /* IEU1 */ + sll %g3, 8, %g3 /* IEU0 Group */ + sub %len, 0xc0, %len /* IEU1 */ + addcc %g5, %sum, %sum /* IEU1 Group */ + srl %o4, 1, %o4 /* IEU0 */ + add %g7, %g3, %g7 /* IEU0 Group */ + add %o4, %sum, %sum /* IEU1 */ #ifdef __KERNEL__ - jmpl %g7 + %lo(vis0s), %g0 /* CTI+IEU1 Group */ + jmpl %g7 + %lo(vis0s), %g0 /* CTI+IEU1 Group */ #else - jmpl %g7 + (vis0s - 4b), %g0 /* CTI+IEU1 Group */ + jmpl %g7 + (vis0s - 4b), %g0 /* CTI+IEU1 Group */ #endif - fzero %f32 /* FPA */ + fzero %f32 /* FPA */ .align 2048 -vis0s: wr %g2, ASI_BLK_XOR, %asi /* LSU Group */ - ldda [%src] ASI_BLK_P, %f0 /* Load Group */ - add %src, 64, %src /* IEU0 Group */ - ldda [%src] ASI_BLK_P, %f16 /* Load Group */ - add %src, 64, %src /* IEU0 Group */ - fmovd %f48, %f62 /* FPA Group f0 available */ - faligndata %f0, %f2, %f48 /* FPA Group f2 available */ - fcmpgt32 %f32, %f2, %x1 /* FPM Group f4 available */ - fpadd32 %f0, %f62, %f0 /* FPA */ - fcmpgt32 %f32, %f4, %x2 /* FPM Group f6 available */ - faligndata %f2, %f4, %f50 /* FPA */ - fcmpgt32 %f62, %f0, %x3 /* FPM Group f8 available */ - faligndata %f4, %f6, %f52 /* FPA */ - fcmpgt32 %f32, %f6, %x4 /* FPM Group f10 available */ - inc %x1 /* IEU0 */ - faligndata %f6, %f8, %f54 /* FPA */ - fcmpgt32 %f32, %f8, %x5 /* FPM Group f12 available */ - srl %x1, 1, %x1 /* IEU0 */ - inc %x2 /* IEU1 */ - faligndata %f8, %f10, %f56 /* FPA */ - fcmpgt32 %f32, %f10, %x6 /* FPM Group f14 available */ - srl %x2, 1, %x2 /* IEU0 */ - add %sum, %x1, %sum /* IEU1 */ - faligndata %f10, %f12, %f58 /* FPA */ - fcmpgt32 %f32, %f12, %x7 /* FPM Group */ - inc %x3 /* IEU0 */ - add %sum, %x2, %sum /* IEU1 */ - faligndata %f12, %f14, %f60 /* FPA */ - fcmpgt32 %f32, %f14, %x8 /* FPM Group */ - srl %x3, 1, %x3 /* IEU0 */ - inc %x4 /* IEU1 */ - fmovd %f14, %f62 /* FPA */ - srl %x4, 1, %x4 /* IEU0 Group */ - add %sum, %x3, %sum /* IEU1 */ +vis0s: wr %g2, ASI_BLK_XOR, %asi /* LSU Group */ + ldda [%src] ASI_BLK_P, %f0 /* Load Group */ + add %src, 64, %src /* IEU0 Group */ + ldda [%src] ASI_BLK_P, %f16 /* Load Group */ + add %src, 64, %src /* IEU0 Group */ + fmovd %f48, %f62 /* FPA Group f0 available*/ + faligndata %f0, %f2, %f48 /* FPA Group f2 available*/ + fcmpgt32 %f32, %f2, %x1 /* FPM Group f4 available*/ + fpadd32 %f0, %f62, %f0 /* FPA */ + fcmpgt32 %f32, %f4, %x2 /* FPM Group f6 available*/ + faligndata %f2, %f4, %f50 /* FPA */ + fcmpgt32 %f62, %f0, %x3 /* FPM Group f8 available*/ + faligndata %f4, %f6, %f52 /* FPA */ + fcmpgt32 %f32, %f6, %x4 /* FPM Group f10 available*/ + inc %x1 /* IEU0 */ + faligndata %f6, %f8, %f54 /* FPA */ + fcmpgt32 %f32, %f8, %x5 /* FPM Group f12 available*/ + srl %x1, 1, %x1 /* IEU0 */ + inc %x2 /* IEU1 */ + faligndata %f8, %f10, %f56 /* FPA */ + fcmpgt32 %f32, %f10, %x6 /* FPM Group f14 available*/ + srl %x2, 1, %x2 /* IEU0 */ + add %sum, %x1, %sum /* IEU1 */ + faligndata %f10, %f12, %f58 /* FPA */ + fcmpgt32 %f32, %f12, %x7 /* FPM Group */ + inc %x3 /* IEU0 */ + add %sum, %x2, %sum /* IEU1 */ + faligndata %f12, %f14, %f60 /* FPA */ + fcmpgt32 %f32, %f14, %x8 /* FPM Group */ + srl %x3, 1, %x3 /* IEU0 */ + inc %x4 /* IEU1 */ + fmovd %f14, %f62 /* FPA */ + srl %x4, 1, %x4 /* IEU0 Group */ + add %sum, %x3, %sum /* IEU1 */ vis0: DO_THE_TRICK( f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30, ,f48,f50,f52,f54,f56,f58,f60,f62,f62, ,LDBLK(f32), STBLK,,,,,,,, @@ -468,36 +470,36 @@ vis0e2: DO_THE_TRICK( f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14 ,SYNC, STBLK_XORASI(x1,x2),ST(f48,64),ST(f50,8),ST(f52,16),ST(f54,24),ST(f56,32),ST(f58,40),ST(f60,48), ,add %dst, 56, %dst; add %len, 192 - 8*8, %len; ba,pt %icc, e1) .align 2048 -vis1s: wr %g2, ASI_BLK_XOR, %asi /* LSU Group */ - sub %src, 8, %src /* IEU0 Group */ - ldda [%src] ASI_BLK_P, %f0 /* Load Group */ - add %src, 64, %src /* IEU0 Group */ - ldda [%src] ASI_BLK_P, %f16 /* Load Group */ - add %src, 64, %src /* IEU0 Group */ - fmovd %f0, %f58 /* FPA Group */ - fmovd %f48, %f0 /* FPA Group */ - fcmpgt32 %f32, %f2, %x2 /* FPM Group */ - faligndata %f2, %f4, %f48 /* FPA */ - fcmpgt32 %f32, %f4, %x3 /* FPM Group */ - faligndata %f4, %f6, %f50 /* FPA */ - fcmpgt32 %f32, %f6, %x4 /* FPM Group */ - faligndata %f6, %f8, %f52 /* FPA */ - fcmpgt32 %f32, %f8, %x5 /* FPM Group */ - inc %x2 /* IEU1 */ - faligndata %f8, %f10, %f54 /* FPA */ - fcmpgt32 %f32, %f10, %x6 /* FPM Group */ - srl %x2, 1, %x2 /* IEU0 */ - faligndata %f10, %f12, %f56 /* FPA */ - fcmpgt32 %f32, %f12, %x7 /* FPM Group */ - inc %x3 /* IEU0 */ - add %sum, %x2, %sum /* IEU1 */ - faligndata %f12, %f14, %f58 /* FPA */ - fcmpgt32 %f32, %f14, %x8 /* FPM Group */ - srl %x3, 1, %x3 /* IEU0 */ - inc %x4 /* IEU1 */ - fmovd %f14, %f60 /* FPA */ - srl %x4, 1, %x4 /* IEU0 Group */ - add %sum, %x3, %sum /* IEU1 */ +vis1s: wr %g2, ASI_BLK_XOR, %asi /* LSU Group */ + sub %src, 8, %src /* IEU0 Group */ + ldda [%src] ASI_BLK_P, %f0 /* Load Group */ + add %src, 64, %src /* IEU0 Group */ + ldda [%src] ASI_BLK_P, %f16 /* Load Group */ + add %src, 64, %src /* IEU0 Group */ + fmovd %f0, %f58 /* FPA Group */ + fmovd %f48, %f0 /* FPA Group */ + fcmpgt32 %f32, %f2, %x2 /* FPM Group */ + faligndata %f2, %f4, %f48 /* FPA */ + fcmpgt32 %f32, %f4, %x3 /* FPM Group */ + faligndata %f4, %f6, %f50 /* FPA */ + fcmpgt32 %f32, %f6, %x4 /* FPM Group */ + faligndata %f6, %f8, %f52 /* FPA */ + fcmpgt32 %f32, %f8, %x5 /* FPM Group */ + inc %x2 /* IEU1 */ + faligndata %f8, %f10, %f54 /* FPA */ + fcmpgt32 %f32, %f10, %x6 /* FPM Group */ + srl %x2, 1, %x2 /* IEU0 */ + faligndata %f10, %f12, %f56 /* FPA */ + fcmpgt32 %f32, %f12, %x7 /* FPM Group */ + inc %x3 /* IEU0 */ + add %sum, %x2, %sum /* IEU1 */ + faligndata %f12, %f14, %f58 /* FPA */ + fcmpgt32 %f32, %f14, %x8 /* FPM Group */ + srl %x3, 1, %x3 /* IEU0 */ + inc %x4 /* IEU1 */ + fmovd %f14, %f60 /* FPA */ + srl %x4, 1, %x4 /* IEU0 Group */ + add %sum, %x3, %sum /* IEU1 */ vis1: DO_THE_TRICK( f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30, ,f62,f48,f50,f52,f54,f56,f58,f60,f60, ,LDBLK(f32), ,STBLK,,,,,,, @@ -523,33 +525,33 @@ vis1e2: DO_THE_TRICK( f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14 ,SYNC, ,STBLK_XORASI(x1,x2),ST(f48,0),ST(f50,8),ST(f52,16),ST(f54,24),ST(f56,32),ST(f58,40), ,add %dst, 48, %dst; add %len, 192 - 7*8, %len; ba,pt %icc, e1) .align 2048 -vis2s: wr %g2, ASI_BLK_XOR, %asi /* LSU Group */ - sub %src, 16, %src /* IEU0 Group */ - ldda [%src] ASI_BLK_P, %f0 /* Load Group */ - add %src, 64, %src /* IEU0 Group */ - ldda [%src] ASI_BLK_P, %f16 /* Load Group */ - add %src, 64, %src /* IEU0 Group */ - fmovd %f0, %f56 /* FPA Group */ - fmovd %f48, %f0 /* FPA Group */ - sub %dst, 64, %dst /* IEU0 */ - fpsub32 %f2, %f2, %f2 /* FPA Group */ - fcmpgt32 %f32, %f4, %x3 /* FPM Group */ - faligndata %f4, %f6, %f48 /* FPA */ - fcmpgt32 %f32, %f6, %x4 /* FPM Group */ - faligndata %f6, %f8, %f50 /* FPA */ - fcmpgt32 %f32, %f8, %x5 /* FPM Group */ - faligndata %f8, %f10, %f52 /* FPA */ - fcmpgt32 %f32, %f10, %x6 /* FPM Group */ - faligndata %f10, %f12, %f54 /* FPA */ - fcmpgt32 %f32, %f12, %x7 /* FPM Group */ - inc %x3 /* IEU0 */ - faligndata %f12, %f14, %f56 /* FPA */ - fcmpgt32 %f32, %f14, %x8 /* FPM Group */ - srl %x3, 1, %x3 /* IEU0 */ - inc %x4 /* IEU1 */ - fmovd %f14, %f58 /* FPA */ - srl %x4, 1, %x4 /* IEU0 Group */ - add %sum, %x3, %sum /* IEU1 */ +vis2s: wr %g2, ASI_BLK_XOR, %asi /* LSU Group */ + sub %src, 16, %src /* IEU0 Group */ + ldda [%src] ASI_BLK_P, %f0 /* Load Group */ + add %src, 64, %src /* IEU0 Group */ + ldda [%src] ASI_BLK_P, %f16 /* Load Group */ + add %src, 64, %src /* IEU0 Group */ + fmovd %f0, %f56 /* FPA Group */ + fmovd %f48, %f0 /* FPA Group */ + sub %dst, 64, %dst /* IEU0 */ + fpsub32 %f2, %f2, %f2 /* FPA Group */ + fcmpgt32 %f32, %f4, %x3 /* FPM Group */ + faligndata %f4, %f6, %f48 /* FPA */ + fcmpgt32 %f32, %f6, %x4 /* FPM Group */ + faligndata %f6, %f8, %f50 /* FPA */ + fcmpgt32 %f32, %f8, %x5 /* FPM Group */ + faligndata %f8, %f10, %f52 /* FPA */ + fcmpgt32 %f32, %f10, %x6 /* FPM Group */ + faligndata %f10, %f12, %f54 /* FPA */ + fcmpgt32 %f32, %f12, %x7 /* FPM Group */ + inc %x3 /* IEU0 */ + faligndata %f12, %f14, %f56 /* FPA */ + fcmpgt32 %f32, %f14, %x8 /* FPM Group */ + srl %x3, 1, %x3 /* IEU0 */ + inc %x4 /* IEU1 */ + fmovd %f14, %f58 /* FPA */ + srl %x4, 1, %x4 /* IEU0 Group */ + add %sum, %x3, %sum /* IEU1 */ vis2: DO_THE_TRICK( f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30, ,f60,f62,f48,f50,f52,f54,f56,f58,f58, ,LDBLK(f32), ,,STBLK,,,,,, @@ -575,29 +577,29 @@ vis2e2: DO_THE_TRICK( f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14 ,SYNC, ,,STBLK_XORASI(x2,x3),ST(f48,64),ST(f50,72),ST(f52,80),ST(f54,88),ST(f56,96), ,add %dst, 104, %dst; add %len, 192 - 6*8, %len; ba,pt %icc, e1) .align 2048 -vis3s: wr %g2, ASI_BLK_XOR, %asi /* LSU Group */ - sub %src, 24, %src /* IEU0 Group */ - ldda [%src] ASI_BLK_P, %f0 /* Load Group */ - add %src, 64, %src /* IEU0 Group */ - ldda [%src] ASI_BLK_P, %f16 /* Load Group */ - add %src, 64, %src /* IEU0 Group */ - fmovd %f0, %f54 /* FPA Group */ - fmovd %f48, %f0 /* FPA Group */ - sub %dst, 64, %dst /* IEU0 */ - fpsub32 %f2, %f2, %f2 /* FPA Group */ - fpsub32 %f4, %f4, %f4 /* FPA Group */ - fcmpgt32 %f32, %f6, %x4 /* FPM Group */ - faligndata %f6, %f8, %f48 /* FPA */ - fcmpgt32 %f32, %f8, %x5 /* FPM Group */ - faligndata %f8, %f10, %f50 /* FPA */ - fcmpgt32 %f32, %f10, %x6 /* FPM Group */ - faligndata %f10, %f12, %f52 /* FPA */ - fcmpgt32 %f32, %f12, %x7 /* FPM Group */ - faligndata %f12, %f14, %f54 /* FPA */ - fcmpgt32 %f32, %f14, %x8 /* FPM Group */ - fmovd %f14, %f56 /* FPA */ - inc %x4 /* IEU0 */ - srl %x4, 1, %x4 /* IEU0 Group */ +vis3s: wr %g2, ASI_BLK_XOR, %asi /* LSU Group */ + sub %src, 24, %src /* IEU0 Group */ + ldda [%src] ASI_BLK_P, %f0 /* Load Group */ + add %src, 64, %src /* IEU0 Group */ + ldda [%src] ASI_BLK_P, %f16 /* Load Group */ + add %src, 64, %src /* IEU0 Group */ + fmovd %f0, %f54 /* FPA Group */ + fmovd %f48, %f0 /* FPA Group */ + sub %dst, 64, %dst /* IEU0 */ + fpsub32 %f2, %f2, %f2 /* FPA Group */ + fpsub32 %f4, %f4, %f4 /* FPA Group */ + fcmpgt32 %f32, %f6, %x4 /* FPM Group */ + faligndata %f6, %f8, %f48 /* FPA */ + fcmpgt32 %f32, %f8, %x5 /* FPM Group */ + faligndata %f8, %f10, %f50 /* FPA */ + fcmpgt32 %f32, %f10, %x6 /* FPM Group */ + faligndata %f10, %f12, %f52 /* FPA */ + fcmpgt32 %f32, %f12, %x7 /* FPM Group */ + faligndata %f12, %f14, %f54 /* FPA */ + fcmpgt32 %f32, %f14, %x8 /* FPM Group */ + fmovd %f14, %f56 /* FPA */ + inc %x4 /* IEU0 */ + srl %x4, 1, %x4 /* IEU0 Group */ vis3: DO_THE_TRICK( f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30, ,f58,f60,f62,f48,f50,f52,f54,f56,f56, ,LDBLK(f32), ,,,STBLK,,,,, @@ -623,27 +625,27 @@ vis3e2: DO_THE_TRICK( f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14 ,SYNC, ,,,STBLK_XORASI(x3,x4),ST(f48,64),ST(f50,72),ST(f52,80),ST(f54,88), ,add %dst, 96, %dst; add %len, 192 - 5*8, %len; ba,pt %icc, e1) .align 2048 -vis4s: wr %g2, ASI_BLK_XOR, %asi /* LSU Group */ - sub %src, 32, %src /* IEU0 Group */ - ldda [%src] ASI_BLK_P, %f0 /* Load Group */ - add %src, 64, %src /* IEU0 Group */ - ldda [%src] ASI_BLK_P, %f16 /* Load Group */ - add %src, 64, %src /* IEU0 Group */ - fmovd %f0, %f52 /* FPA Group */ - fmovd %f48, %f0 /* FPA Group */ - sub %dst, 64, %dst /* IEU0 */ - fpsub32 %f2, %f2, %f2 /* FPA Group */ - fpsub32 %f4, %f4, %f4 /* FPA Group */ - fpsub32 %f6, %f6, %f6 /* FPA Group */ - clr %x4 /* IEU0 */ - fcmpgt32 %f32, %f8, %x5 /* FPM Group */ - faligndata %f8, %f10, %f48 /* FPA */ - fcmpgt32 %f32, %f10, %x6 /* FPM Group */ - faligndata %f10, %f12, %f50 /* FPA */ - fcmpgt32 %f32, %f12, %x7 /* FPM Group */ - faligndata %f12, %f14, %f52 /* FPA */ - fcmpgt32 %f32, %f14, %x8 /* FPM Group */ - fmovd %f14, %f54 /* FPA */ +vis4s: wr %g2, ASI_BLK_XOR, %asi /* LSU Group */ + sub %src, 32, %src /* IEU0 Group */ + ldda [%src] ASI_BLK_P, %f0 /* Load Group */ + add %src, 64, %src /* IEU0 Group */ + ldda [%src] ASI_BLK_P, %f16 /* Load Group */ + add %src, 64, %src /* IEU0 Group */ + fmovd %f0, %f52 /* FPA Group */ + fmovd %f48, %f0 /* FPA Group */ + sub %dst, 64, %dst /* IEU0 */ + fpsub32 %f2, %f2, %f2 /* FPA Group */ + fpsub32 %f4, %f4, %f4 /* FPA Group */ + fpsub32 %f6, %f6, %f6 /* FPA Group */ + clr %x4 /* IEU0 */ + fcmpgt32 %f32, %f8, %x5 /* FPM Group */ + faligndata %f8, %f10, %f48 /* FPA */ + fcmpgt32 %f32, %f10, %x6 /* FPM Group */ + faligndata %f10, %f12, %f50 /* FPA */ + fcmpgt32 %f32, %f12, %x7 /* FPM Group */ + faligndata %f12, %f14, %f52 /* FPA */ + fcmpgt32 %f32, %f14, %x8 /* FPM Group */ + fmovd %f14, %f54 /* FPA */ vis4: DO_THE_TRICK( f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30, ,f56,f58,f60,f62,f48,f50,f52,f54,f54, ,LDBLK(f32), ,,,,STBLK,,,, @@ -669,27 +671,27 @@ vis4e2: DO_THE_TRICK( f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14 ,SYNC, ,,,,STBLK_XORASI(x4,x5),ST(f48,64),ST(f50,72),ST(f52,80), ,add %dst, 88, %dst; add %len, 192 - 4*8, %len; ba,pt %icc, e1) .align 2048 -vis5s: ldd [%src+0], %f10 /* Load Group */ - ldd [%src+8], %f12 /* Load Group */ - ldd [%src+16], %f14 /* Load Group */ - add %src, 24, %src /* IEU0 Group */ - wr %g2, ASI_BLK_XOR, %asi /* LSU Group */ - ldda [%src] ASI_BLK_P, %f16 /* Load Group */ - add %src, 64, %src /* IEU0 Group */ - fmovd %f48, %f0 /* FPA Group */ - fmuld %f32, %f32, %f2 /* FPM */ - clr %x4 /* IEU0 */ - faddd %f32, %f32, %f4 /* FPA Group */ - fmuld %f32, %f32, %f6 /* FPM */ - clr %x5 /* IEU0 */ - faddd %f32, %f32, %f8 /* FPA Group */ - fcmpgt32 %f32, %f10, %x6 /* FPM Group */ - sub %dst, 64, %dst /* IEU0 */ - faligndata %f10, %f12, %f48 /* FPA */ - fcmpgt32 %f32, %f12, %x7 /* FPM Group */ - faligndata %f12, %f14, %f50 /* FPA */ - fcmpgt32 %f32, %f14, %x8 /* FPM Group */ - fmovd %f14, %f52 /* FPA */ +vis5s: ldd [%src+0], %f10 /* Load Group */ + ldd [%src+8], %f12 /* Load Group */ + ldd [%src+16], %f14 /* Load Group */ + add %src, 24, %src /* IEU0 Group */ + wr %g2, ASI_BLK_XOR, %asi /* LSU Group */ + ldda [%src] ASI_BLK_P, %f16 /* Load Group */ + add %src, 64, %src /* IEU0 Group */ + fmovd %f48, %f0 /* FPA Group */ + fmuld %f32, %f32, %f2 /* FPM */ + clr %x4 /* IEU0 */ + faddd %f32, %f32, %f4 /* FPA Group */ + fmuld %f32, %f32, %f6 /* FPM */ + clr %x5 /* IEU0 */ + faddd %f32, %f32, %f8 /* FPA Group */ + fcmpgt32 %f32, %f10, %x6 /* FPM Group */ + sub %dst, 64, %dst /* IEU0 */ + faligndata %f10, %f12, %f48 /* FPA */ + fcmpgt32 %f32, %f12, %x7 /* FPM Group */ + faligndata %f12, %f14, %f50 /* FPA */ + fcmpgt32 %f32, %f14, %x8 /* FPM Group */ + fmovd %f14, %f52 /* FPA */ vis5: DO_THE_TRICK( f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30, ,f54,f56,f58,f60,f62,f48,f50,f52,f52, ,LDBLK(f32), ,,,,,STBLK,,, @@ -715,26 +717,26 @@ vis5e2: DO_THE_TRICK( f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14 ,SYNC, ,,,,,STBLK_XORASI(x5,x6),ST(f48,64),ST(f50,72), ,add %dst, 80, %dst; add %len, 192 - 3*8, %len; ba,pt %icc, e1) .align 2048 -vis6s: ldd [%src+0], %f12 /* Load Group */ - ldd [%src+8], %f14 /* Load Group */ - add %src, 16, %src /* IEU0 Group */ - wr %g2, ASI_BLK_XOR, %asi /* LSU Group */ - ldda [%src] ASI_BLK_P, %f16 /* Load Group */ - add %src, 64, %src /* IEU0 Group */ - fmovd %f48, %f0 /* FPA Group */ - fmuld %f32, %f32, %f2 /* FPM */ - clr %x4 /* IEU0 */ - faddd %f32, %f32, %f4 /* FPA Group */ - fmuld %f32, %f32, %f6 /* FPM */ - clr %x5 /* IEU0 */ - faddd %f32, %f32, %f8 /* FPA Group */ - fmuld %f32, %f32, %f10 /* FPM */ - clr %x6 /* IEU0 */ - fcmpgt32 %f32, %f12, %x7 /* FPM Group */ - sub %dst, 64, %dst /* IEU0 */ - fcmpgt32 %f32, %f14, %x8 /* FPM Group */ - faligndata %f12, %f14, %f48 /* FPA */ - fmovd %f14, %f50 /* FPA Group */ +vis6s: ldd [%src+0], %f12 /* Load Group */ + ldd [%src+8], %f14 /* Load Group */ + add %src, 16, %src /* IEU0 Group */ + wr %g2, ASI_BLK_XOR, %asi /* LSU Group */ + ldda [%src] ASI_BLK_P, %f16 /* Load Group */ + add %src, 64, %src /* IEU0 Group */ + fmovd %f48, %f0 /* FPA Group */ + fmuld %f32, %f32, %f2 /* FPM */ + clr %x4 /* IEU0 */ + faddd %f32, %f32, %f4 /* FPA Group */ + fmuld %f32, %f32, %f6 /* FPM */ + clr %x5 /* IEU0 */ + faddd %f32, %f32, %f8 /* FPA Group */ + fmuld %f32, %f32, %f10 /* FPM */ + clr %x6 /* IEU0 */ + fcmpgt32 %f32, %f12, %x7 /* FPM Group */ + sub %dst, 64, %dst /* IEU0 */ + fcmpgt32 %f32, %f14, %x8 /* FPM Group */ + faligndata %f12, %f14, %f48 /* FPA */ + fmovd %f14, %f50 /* FPA Group */ vis6: DO_THE_TRICK( f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30, ,f52,f54,f56,f58,f60,f62,f48,f50,f50, ,LDBLK(f32), ,,,,,,STBLK,, @@ -760,25 +762,25 @@ vis6e2: DO_THE_TRICK( f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14 ,SYNC, ,,,,,,STBLK_XORASI(x6,x7),ST(f48,64), ,add %dst, 72, %dst; add %len, 192 - 2*8, %len; ba,pt %icc, e1) .align 2048 -vis7s: ldd [%src+0], %f14 /* Load Group */ - add %src, 8, %src /* IEU0 Group */ - wr %g2, ASI_BLK_XOR, %asi /* LSU Group */ - ldda [%src] ASI_BLK_P, %f16 /* Load Group */ - add %src, 64, %src /* IEU0 Group */ - fmovd %f48, %f0 /* FPA Group */ - fmuld %f32, %f32, %f2 /* FPM */ - clr %x4 /* IEU0 */ - faddd %f32, %f32, %f4 /* FPA Group */ - fmuld %f32, %f32, %f6 /* FPM */ - clr %x5 /* IEU0 */ - faddd %f32, %f32, %f8 /* FPA Group */ - fmuld %f32, %f32, %f10 /* FPM */ - clr %x6 /* IEU0 */ - faddd %f32, %f32, %f12 /* FPA Group */ - clr %x7 /* IEU0 */ - fcmpgt32 %f32, %f14, %x8 /* FPM Group */ - sub %dst, 64, %dst /* IEU0 */ - fmovd %f14, %f48 /* FPA */ +vis7s: ldd [%src+0], %f14 /* Load Group */ + add %src, 8, %src /* IEU0 Group */ + wr %g2, ASI_BLK_XOR, %asi /* LSU Group */ + ldda [%src] ASI_BLK_P, %f16 /* Load Group */ + add %src, 64, %src /* IEU0 Group */ + fmovd %f48, %f0 /* FPA Group */ + fmuld %f32, %f32, %f2 /* FPM */ + clr %x4 /* IEU0 */ + faddd %f32, %f32, %f4 /* FPA Group */ + fmuld %f32, %f32, %f6 /* FPM */ + clr %x5 /* IEU0 */ + faddd %f32, %f32, %f8 /* FPA Group */ + fmuld %f32, %f32, %f10 /* FPM */ + clr %x6 /* IEU0 */ + faddd %f32, %f32, %f12 /* FPA Group */ + clr %x7 /* IEU0 */ + fcmpgt32 %f32, %f14, %x8 /* FPM Group */ + sub %dst, 64, %dst /* IEU0 */ + fmovd %f14, %f48 /* FPA */ vis7: DO_THE_TRICK( f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30, ,f50,f52,f54,f56,f58,f60,f62,f48,f48, ,LDBLK(f32), ,,,,,,,STBLK, @@ -806,104 +808,104 @@ vis7e2: DO_THE_TRICK( f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14 e1: END_THE_TRICK1( f0,f2,f4,f6,f8,f10,f12,f14,f16,f6) e2: END_THE_TRICK1( f16,f18,f20,f22,f24,f26,f28,f30,f32,f6) e3: END_THE_TRICK1( f32,f34,f36,f38,f40,f42,f44,f46,f0,f6) -ett: rd %gsr, %x3 /* LSU Group+4bubbles */ - andcc %x3, 7, %x3 /* IEU1 Group */ - add %dst, 8, %dst /* IEU0 */ - bne,pn %icc, 1f /* CTI */ - fzero %f10 /* FPA */ - brz,a,pn %len, 2f /* CTI+IEU1 Group */ - stda %f6, [%dst - 8] %asi /* Store */ -1: cmp %len, 8 /* IEU1 */ - blu,pn %icc, 3f /* CTI */ - sub %src, 64, %src /* IEU0 Group */ -1: ldd [%src], %f2 /* Load Group */ - fpadd32 %f10, %f2, %f12 /* FPA Group+load stall */ - add %src, 8, %src /* IEU0 */ - add %dst, 8, %dst /* IEU1 */ - faligndata %f6, %f2, %f14 /* FPA Group */ - fcmpgt32 %f10, %f12, %x5 /* FPM Group */ - stda %f14, [%dst - 16] %asi /* Store */ - fmovd %f2, %f6 /* FPA */ - fmovd %f12, %f10 /* FPA Group */ - sub %len, 8, %len /* IEU1 */ - fzero %f16 /* FPA Group - FPU nop */ - fzero %f18 /* FPA Group - FPU nop */ - inc %x5 /* IEU0 */ - srl %x5, 1, %x5 /* IEU0 Group (regdep) */ - cmp %len, 8 /* IEU1 */ - bgeu,pt %icc, 1b /* CTI */ - add %x5, %sum, %sum /* IEU0 Group */ -3: brz,a,pt %x3, 2f /* CTI+IEU1 */ - stda %f6, [%dst - 8] %asi /* Store Group */ - sta %f7, [%dst - 8] %asi /* Store Group */ - sub %dst, 4, %dst /* IEU0 */ - add %len, 4, %len /* IEU1 */ +ett: rd %gsr, %x3 /* LSU Group+4bubbles */ + andcc %x3, 7, %x3 /* IEU1 Group */ + add %dst, 8, %dst /* IEU0 */ + bne,pn %icc, 1f /* CTI */ + fzero %f10 /* FPA */ + brz,a,pn %len, 2f /* CTI+IEU1 Group */ + stda %f6, [%dst - 8] %asi /* Store */ +1: cmp %len, 8 /* IEU1 */ + blu,pn %icc, 3f /* CTI */ + sub %src, 64, %src /* IEU0 Group */ +1: ldd [%src], %f2 /* Load Group */ + fpadd32 %f10, %f2, %f12 /* FPA Group+load stall*/ + add %src, 8, %src /* IEU0 */ + add %dst, 8, %dst /* IEU1 */ + faligndata %f6, %f2, %f14 /* FPA Group */ + fcmpgt32 %f10, %f12, %x5 /* FPM Group */ + stda %f14, [%dst - 16] %asi /* Store */ + fmovd %f2, %f6 /* FPA */ + fmovd %f12, %f10 /* FPA Group */ + sub %len, 8, %len /* IEU1 */ + fzero %f16 /* FPA Group - FPU nop */ + fzero %f18 /* FPA Group - FPU nop */ + inc %x5 /* IEU0 */ + srl %x5, 1, %x5 /* IEU0 Group (regdep) */ + cmp %len, 8 /* IEU1 */ + bgeu,pt %icc, 1b /* CTI */ + add %x5, %sum, %sum /* IEU0 Group */ +3: brz,a,pt %x3, 2f /* CTI+IEU1 */ + stda %f6, [%dst - 8] %asi /* Store Group */ + sta %f7, [%dst - 8] %asi /* Store Group */ + sub %dst, 4, %dst /* IEU0 */ + add %len, 4, %len /* IEU1 */ 2: #ifdef __KERNEL__ - sub %sp, 8, %sp /* IEU0 Group */ + sub %sp, 8, %sp /* IEU0 Group */ #endif END_THE_TRICK2( f48,f50,f52,f54,f56,f58,f60,f10,f12,f62) - membar #Sync /* LSU Group */ + membar #Sync /* LSU Group */ #ifdef __KERNEL__ VISExit - add %sp, 8, %sp /* IEU0 Group */ + add %sp, 8, %sp /* IEU0 Group */ #endif -23: brnz,pn %len, 26f /* CTI+IEU1 Group */ -24: sllx %sum, 32, %g1 /* IEU0 */ -25: addcc %sum, %g1, %src /* IEU1 Group */ - srlx %src, 32, %src /* IEU0 Group (regdep) */ - bcs,a,pn %xcc, 1f /* CTI */ - add %src, 1, %src /* IEU1 */ +23: brnz,pn %len, 26f /* CTI+IEU1 Group */ +24: sllx %sum, 32, %g1 /* IEU0 */ +25: addcc %sum, %g1, %src /* IEU1 Group */ + srlx %src, 32, %src /* IEU0 Group (regdep) */ + bcs,a,pn %xcc, 1f /* CTI */ + add %src, 1, %src /* IEU1 */ #ifndef __KERNEL__ -1: retl /* CTI Group brk forced */ - srl %src, 0, %src /* IEU0 */ +1: retl /* CTI Group brk forced*/ + srl %src, 0, %src /* IEU0 */ #else -1: sethi %uhi(PAGE_OFFSET), %g4 /* IEU0 Group */ - retl /* CTI Group brk forced */ - sllx %g4, 32, %g4 /* IEU0 */ +1: sethi %uhi(PAGE_OFFSET), %g4 /* IEU0 Group */ + retl /* CTI Group brk forced*/ + sllx %g4, 32, %g4 /* IEU0 */ #endif -26: andcc %len, 8, %g0 /* IEU1 Group */ - be,pn %icc, 1f /* CTI */ - lduw [%src], %o4 /* Load */ - lduw [%src+4], %g2 /* Load Group */ - add %src, 8, %src /* IEU0 */ - add %dst, 8, %dst /* IEU1 */ - sllx %o4, 32, %g5 /* IEU0 Group */ - stwa %o4, [%dst - 8] %asi /* Store */ - or %g5, %g2, %g5 /* IEU0 Group */ - stwa %g2, [%dst - 4] %asi /* Store */ - addcc %g5, %sum, %sum /* IEU1 Group */ - bcs,a,pn %xcc, 1f /* CTI */ - add %sum, 1, %sum /* IEU0 */ -1: andcc %len, 4, %g0 /* IEU1 Group */ - be,a,pn %icc, 1f /* CTI */ - clr %g2 /* IEU0 */ - lduw [%src], %g7 /* Load */ - add %src, 4, %src /* IEU0 Group */ - add %dst, 4, %dst /* IEU1 */ - sllx %g7, 32, %g2 /* IEU0 Group */ - stwa %g7, [%dst - 4] %asi /* Store */ -1: andcc %len, 2, %g0 /* IEU1 */ - be,a,pn %icc, 1f /* CTI */ - clr %g3 /* IEU0 Group */ - lduh [%src], %g7 /* Load */ - add %src, 2, %src /* IEU1 */ - add %dst, 2, %dst /* IEU0 Group */ - sll %g7, 16, %g3 /* IEU0 Group */ - stha %g7, [%dst - 2] %asi /* Store */ -1: andcc %len, 1, %g0 /* IEU1 */ - be,a,pn %icc, 1f /* CTI */ - clr %o5 /* IEU0 Group */ - ldub [%src], %g7 /* Load */ - sll %g7, 8, %o5 /* IEU0 Group */ - stba %g7, [%dst] %asi /* Store */ -1: or %g2, %g3, %g3 /* IEU1 */ - or %o5, %g3, %g3 /* IEU0 Group (regdep) */ - addcc %g3, %sum, %sum /* IEU1 Group (regdep) */ - bcs,a,pn %xcc, 1f /* CTI */ - add %sum, 1, %sum /* IEU0 */ -1: ba,pt %xcc, 25b /* CTI Group */ - sllx %sum, 32, %g1 /* IEU0 */ +26: andcc %len, 8, %g0 /* IEU1 Group */ + be,pn %icc, 1f /* CTI */ + lduw [%src], %o4 /* Load */ + lduw [%src+4], %g2 /* Load Group */ + add %src, 8, %src /* IEU0 */ + add %dst, 8, %dst /* IEU1 */ + sllx %o4, 32, %g5 /* IEU0 Group */ + stwa %o4, [%dst - 8] %asi /* Store */ + or %g5, %g2, %g5 /* IEU0 Group */ + stwa %g2, [%dst - 4] %asi /* Store */ + addcc %g5, %sum, %sum /* IEU1 Group */ + bcs,a,pn %xcc, 1f /* CTI */ + add %sum, 1, %sum /* IEU0 */ +1: andcc %len, 4, %g0 /* IEU1 Group */ + be,a,pn %icc, 1f /* CTI */ + clr %g2 /* IEU0 */ + lduw [%src], %g7 /* Load */ + add %src, 4, %src /* IEU0 Group */ + add %dst, 4, %dst /* IEU1 */ + sllx %g7, 32, %g2 /* IEU0 Group */ + stwa %g7, [%dst - 4] %asi /* Store */ +1: andcc %len, 2, %g0 /* IEU1 */ + be,a,pn %icc, 1f /* CTI */ + clr %g3 /* IEU0 Group */ + lduh [%src], %g7 /* Load */ + add %src, 2, %src /* IEU1 */ + add %dst, 2, %dst /* IEU0 Group */ + sll %g7, 16, %g3 /* IEU0 Group */ + stha %g7, [%dst - 2] %asi /* Store */ +1: andcc %len, 1, %g0 /* IEU1 */ + be,a,pn %icc, 1f /* CTI */ + clr %o5 /* IEU0 Group */ + ldub [%src], %g7 /* Load */ + sll %g7, 8, %o5 /* IEU0 Group */ + stba %g7, [%dst] %asi /* Store */ +1: or %g2, %g3, %g3 /* IEU1 */ + or %o5, %g3, %g3 /* IEU0 Group (regdep) */ + addcc %g3, %sum, %sum /* IEU1 Group (regdep) */ + bcs,a,pn %xcc, 1f /* CTI */ + add %sum, 1, %sum /* IEU0 */ +1: ba,pt %xcc, 25b /* CTI Group */ + sllx %sum, 32, %g1 /* IEU0 */ #ifdef __KERNEL__ end: |