diff options
author | Ralf Baechle <ralf@linux-mips.org> | 2000-02-23 00:40:54 +0000 |
---|---|---|
committer | Ralf Baechle <ralf@linux-mips.org> | 2000-02-23 00:40:54 +0000 |
commit | 529c593ece216e4aaffd36bd940cb94f1fa63129 (patch) | |
tree | 78f1c0b805f5656aa7b0417a043c5346f700a2cf /arch/ia64/lib/memset.S | |
parent | 0bd079751d25808d1972baee5c4eaa1db2227257 (diff) |
Merge with 2.3.43. I did ignore all modifications to the qlogicisp.c
driver due to the Origin A64 hacks.
Diffstat (limited to 'arch/ia64/lib/memset.S')
-rw-r--r-- | arch/ia64/lib/memset.S | 111 |
1 files changed, 111 insertions, 0 deletions
diff --git a/arch/ia64/lib/memset.S b/arch/ia64/lib/memset.S new file mode 100644 index 000000000..595720a2d --- /dev/null +++ b/arch/ia64/lib/memset.S @@ -0,0 +1,111 @@ +/* + * + * Optimized version of the standard memset() function + * + * Return: none + * + * + * Inputs: + * in0: address of buffer + * in1: byte value to use for storing + * in2: length of the buffer + * + * Copyright (C) 1999 Hewlett-Packard Co + * Copyright (C) 1999 Stephane Eranian <eranian@hpl.hp.com> + */ + + +// arguments +// +#define buf r32 +#define val r33 +#define len r34 + +// +// local registers +// +#define saved_pfs r14 +#define cnt r18 +#define buf2 r19 +#define saved_lc r20 +#define saved_pr r21 +#define tmp r22 + + .text + .psr abi64 + .psr lsb + + .align 16 + .global memset + .proc memset + +memset: + alloc saved_pfs=ar.pfs,3,0,0,0 // cnt is sink here + cmp.eq p8,p0=r0,len // check for zero length + mov saved_lc=ar.lc // preserve ar.lc (slow) + ;; + adds tmp=-1,len // br.ctop is repeat/until + tbit.nz p6,p0=buf,0 // odd alignment +(p8) br.ret.spnt.few rp + + cmp.lt p7,p0=16,len // if len > 16 then long memset + mux1 val=val,@brcst // prepare value +(p7) br.cond.dptk.few long_memset + ;; + mov ar.lc=tmp // initialize lc for small count + ;; // avoid RAW and WAW on ar.lc +1: // worst case 15 cyles, avg 8 cycles + st1 [buf]=val,1 + br.cloop.dptk.few 1b + ;; // avoid RAW on ar.lc + mov ar.lc=saved_lc + mov ar.pfs=saved_pfs + br.ret.sptk.few rp // end of short memset + + // at this point we know we have more than 16 bytes to copy + // so we focus on alignment +long_memset: +(p6) st1 [buf]=val,1 // 1-byte aligned +(p6) adds len=-1,len;; // sync because buf is modified + tbit.nz p6,p0=buf,1 + ;; +(p6) st2 [buf]=val,2 // 2-byte aligned +(p6) adds len=-2,len;; + tbit.nz p6,p0=buf,2 + ;; +(p6) st4 [buf]=val,4 // 4-byte aligned +(p6) adds len=-4,len;; + tbit.nz p6,p0=buf,3 + ;; +(p6) st8 [buf]=val,8 // 8-byte aligned +(p6) adds len=-8,len;; + shr.u cnt=len,4 // number of 128-bit (2x64bit) words + ;; + cmp.eq p6,p0=r0,cnt + adds tmp=-1,cnt +(p6) br.cond.dpnt.few .dotail // we have less than 16 bytes left + ;; + adds buf2=8,buf // setup second base pointer + mov ar.lc=tmp + ;; +2: // 16bytes/iteration + st8 [buf]=val,16 + st8 [buf2]=val,16 + br.cloop.dptk.few 2b + ;; +.dotail: // tail correction based on len only + tbit.nz p6,p0=len,3 + ;; +(p6) st8 [buf]=val,8 // at least 8 bytes + tbit.nz p6,p0=len,2 + ;; +(p6) st4 [buf]=val,4 // at least 4 bytes + tbit.nz p6,p0=len,1 + ;; +(p6) st2 [buf]=val,2 // at least 2 bytes + tbit.nz p6,p0=len,0 + mov ar.lc=saved_lc + ;; +(p6) st1 [buf]=val // only 1 byte left + br.ret.dptk.few rp + .endp |