summaryrefslogtreecommitdiffstats
path: root/arch/ia64/lib/memset.S
diff options
context:
space:
mode:
author Ralf Baechle <ralf@linux-mips.org> 2000-02-23 00:40:54 +0000
committer Ralf Baechle <ralf@linux-mips.org> 2000-02-23 00:40:54 +0000
commit 529c593ece216e4aaffd36bd940cb94f1fa63129 (patch)
tree 78f1c0b805f5656aa7b0417a043c5346f700a2cf /arch/ia64/lib/memset.S
parent 0bd079751d25808d1972baee5c4eaa1db2227257 (diff)
Merge with 2.3.43. I did ignore all modifications to the qlogicisp.c
driver due to the Origin A64 hacks.
Diffstat (limited to 'arch/ia64/lib/memset.S')
-rw-r--r-- arch/ia64/lib/memset.S 111
1 files changed, 111 insertions, 0 deletions
diff --git a/arch/ia64/lib/memset.S b/arch/ia64/lib/memset.S
new file mode 100644
index 000000000..595720a2d
--- /dev/null
+++ b/arch/ia64/lib/memset.S
@@ -0,0 +1,111 @@
+/*
+ *
+ * Optimized version of the standard memset() function
+ *
+ * Return: original address of buffer (in0), in r8
+ *
+ * Inputs:
+ * in0: address of buffer
+ * in1: byte value to use for storing
+ * in2: length of the buffer in bytes (unsigned)
+ *
+ * Copyright (C) 1999 Hewlett-Packard Co
+ * Copyright (C) 1999 Stephane Eranian <eranian@hpl.hp.com>
+ */
+
+
+// arguments
+//
+#define buf r32
+#define val r33
+#define len r34
+
+//
+// return value and local registers
+//
+#define retval r8
+#define saved_pfs r14
+#define cnt r18
+#define buf2 r19
+#define saved_lc r20
+#define tmp r22
+
+        .text
+        .psr abi64
+        .psr lsb
+
+        .align 16
+        .global memset
+        .proc memset
+
+memset:
+        alloc saved_pfs=ar.pfs,3,0,0,0  // 3 inputs, no locals/outputs/rotating regs
+        cmp.eq p8,p0=r0,len             // p8 = (len == 0)
+        mov saved_lc=ar.lc              // preserve ar.lc (slow)
+        mov retval=buf                  // return the buffer address, captured before buf advances
+        ;;
+        adds tmp=-1,len                 // br.cloop is repeat/until: lc = iterations - 1
+        tbit.nz p6,p0=buf,0             // p6 = odd address (used first thing in long_memset)
+(p8)    br.ret.spnt.few rp              // nothing to store; ar.lc is still untouched here
+
+        cmp.ltu p7,p0=16,len            // if len > 16 (unsigned) then long memset
+        mux1 val=val,@brcst             // replicate the low byte into all 8 bytes of val
+(p7)    br.cond.dptk.few long_memset
+        ;;
+        mov ar.lc=tmp                   // initialize lc for small count (1..16 bytes)
+        ;;                              // avoid RAW and WAW on ar.lc
+1:                                      // worst case 15 cycles, avg 8 cycles
+        st1 [buf]=val,1                 // one byte per iteration, post-increment buf
+        br.cloop.dptk.few 1b
+        ;;                              // avoid RAW on ar.lc
+        mov ar.lc=saved_lc
+        mov ar.pfs=saved_pfs
+        br.ret.sptk.few rp              // end of short memset
+
+        // at this point we know we have more than 16 bytes to set
+        // so we focus on alignment: store 1, 2, 4, then 8 bytes as needed
+long_memset:
+(p6)    st1 [buf]=val,1                 // 1-byte aligned
+(p6)    adds len=-1,len;;               // sync because buf is modified
+        tbit.nz p6,p0=buf,1
+        ;;
+(p6)    st2 [buf]=val,2                 // 2-byte aligned
+(p6)    adds len=-2,len;;
+        tbit.nz p6,p0=buf,2
+        ;;
+(p6)    st4 [buf]=val,4                 // 4-byte aligned
+(p6)    adds len=-4,len;;
+        tbit.nz p6,p0=buf,3
+        ;;
+(p6)    st8 [buf]=val,8                 // 8-byte aligned
+(p6)    adds len=-8,len;;
+        shr.u cnt=len,4                 // number of 128-bit (2x64bit) words
+        ;;
+        cmp.eq p6,p0=r0,cnt
+        adds tmp=-1,cnt                 // loop count - 1 for br.cloop
+(p6)    br.cond.dpnt.few .dotail        // we have less than 16 bytes left
+        ;;
+        adds buf2=8,buf                 // setup second base pointer
+        mov ar.lc=tmp
+        ;;
+2:                                      // 16bytes/iteration
+        st8 [buf]=val,16                // low 8 bytes of the 16-byte chunk
+        st8 [buf2]=val,16               // high 8 bytes of the 16-byte chunk
+        br.cloop.dptk.few 2b
+        ;;
+.dotail:                                // tail correction based on len only (bits 3..0)
+        tbit.nz p6,p0=len,3
+        ;;
+(p6)    st8 [buf]=val,8                 // at least 8 bytes
+        tbit.nz p6,p0=len,2
+        ;;
+(p6)    st4 [buf]=val,4                 // at least 4 bytes
+        tbit.nz p6,p0=len,1
+        ;;
+(p6)    st2 [buf]=val,2                 // at least 2 bytes
+        tbit.nz p6,p0=len,0
+        mov ar.lc=saved_lc              // restore caller's loop counter
+        ;;
+(p6)    st1 [buf]=val                   // only 1 byte left
+        br.ret.dptk.few rp
+        .endp