Diffstat (limited to 'arch/ia64/lib')
-rw-r--r--  arch/ia64/lib/Makefile                 42
-rw-r--r--  arch/ia64/lib/checksum.c              110
-rw-r--r--  arch/ia64/lib/clear_page.S             42
-rw-r--r--  arch/ia64/lib/clear_user.S            224
-rw-r--r--  arch/ia64/lib/copy_page.S              87
-rw-r--r--  arch/ia64/lib/copy_user.S              71
-rw-r--r--  arch/ia64/lib/csum_partial_copy.c     165
-rw-r--r--  arch/ia64/lib/do_csum.S               230
-rw-r--r--  arch/ia64/lib/flush.S                  37
-rw-r--r--  arch/ia64/lib/idiv.S                  158
-rw-r--r--  arch/ia64/lib/memset.S                111
-rw-r--r--  arch/ia64/lib/strlen.S                197
-rw-r--r--  arch/ia64/lib/strlen_user.S           213
-rw-r--r--  arch/ia64/lib/strncpy_from_user.S      53
-rw-r--r--  arch/ia64/lib/strnlen_user.S           55
15 files changed, 1795 insertions, 0 deletions
diff --git a/arch/ia64/lib/Makefile b/arch/ia64/lib/Makefile
new file mode 100644
index 000000000..8a9581747
--- /dev/null
+++ b/arch/ia64/lib/Makefile
@@ -0,0 +1,42 @@
+#
+# Makefile for ia64-specific library routines..
+#
+
+.S.o:
+ $(CC) -D__ASSEMBLY__ $(AFLAGS) -traditional -c $< -o $@
+
+OBJS = __divdi3.o __divsi3.o __udivdi3.o __udivsi3.o \
+ __moddi3.o __modsi3.o __umoddi3.o __umodsi3.o \
+ checksum.o clear_page.o csum_partial_copy.o copy_page.o \
+ copy_user.o clear_user.o memset.o strncpy_from_user.o \
+ strlen.o strlen_user.o strnlen_user.o \
+ flush.o do_csum.o
+
+lib.a: $(OBJS)
+ $(AR) rcs lib.a $(OBJS)
+
+__divdi3.o: idiv.S
+ $(CC) $(AFLAGS) -c -o $@ $<
+
+__divsi3.o: idiv.S
+	$(CC) $(AFLAGS) -DSINGLE -c -o $@ $<
+
+__udivdi3.o: idiv.S
+	$(CC) $(AFLAGS) -DUNSIGNED -c -o $@ $<
+
+__udivsi3.o: idiv.S
+	$(CC) $(AFLAGS) -DUNSIGNED -DSINGLE -c -o $@ $<
+
+__moddi3.o: idiv.S
+	$(CC) $(AFLAGS) -DMODULO -c -o $@ $<
+
+__modsi3.o: idiv.S
+	$(CC) $(AFLAGS) -DMODULO -DSINGLE -c -o $@ $<
+
+__umoddi3.o: idiv.S
+	$(CC) $(AFLAGS) -DMODULO -DUNSIGNED -c -o $@ $<
+
+__umodsi3.o: idiv.S
+	$(CC) $(AFLAGS) -DMODULO -DUNSIGNED -DSINGLE -c -o $@ $<
+
+include $(TOPDIR)/Rules.make
diff --git a/arch/ia64/lib/checksum.c b/arch/ia64/lib/checksum.c
new file mode 100644
index 000000000..9c4a8af75
--- /dev/null
+++ b/arch/ia64/lib/checksum.c
@@ -0,0 +1,110 @@
+/*
+ * Network checksum routines
+ *
+ * Copyright (C) 1999 Hewlett-Packard Co
+ * Copyright (C) 1999 Stephane Eranian <eranian@hpl.hp.com>
+ *
+ * Most of the code comes from arch/alpha/lib/checksum.c
+ *
+ * This file contains network checksum routines that are better done
+ * in an architecture-specific manner due to speed.
+ */
+
+#include <linux/string.h>
+
+#include <asm/byteorder.h>
+
+static inline unsigned short
+from64to16(unsigned long x)
+{
+ /* add up 32-bit words for 33 bits */
+ x = (x & 0xffffffff) + (x >> 32);
+ /* add up 16-bit and 17-bit words for 17+c bits */
+ x = (x & 0xffff) + (x >> 16);
+ /* add up 16-bit and 2-bit for 16+c bit */
+ x = (x & 0xffff) + (x >> 16);
+ /* add up carry.. */
+ x = (x & 0xffff) + (x >> 16);
+ return x;
+}
+
+/*
+ * computes the checksum of the TCP/UDP pseudo-header
+ * returns a 16-bit checksum, already complemented.
+ */
+unsigned short int csum_tcpudp_magic(unsigned long saddr,
+ unsigned long daddr,
+ unsigned short len,
+ unsigned short proto,
+ unsigned int sum)
+{
+ return ~from64to16(saddr + daddr + sum +
+ ((unsigned long) ntohs(len) << 16) +
+ ((unsigned long) proto << 8));
+}
+
+unsigned int csum_tcpudp_nofold(unsigned long saddr,
+ unsigned long daddr,
+ unsigned short len,
+ unsigned short proto,
+ unsigned int sum)
+{
+ unsigned long result;
+
+ result = (saddr + daddr + sum +
+ ((unsigned long) ntohs(len) << 16) +
+ ((unsigned long) proto << 8));
+
+	/* Fold down to 32 bits so we don't lose in the typedef-less
+ network stack. */
+ /* 64 to 33 */
+ result = (result & 0xffffffff) + (result >> 32);
+ /* 33 to 32 */
+ result = (result & 0xffffffff) + (result >> 32);
+ return result;
+}
+
+extern unsigned long do_csum(const unsigned char *, unsigned int, unsigned int);
+extern unsigned long do_csum_c(const unsigned char *, unsigned int, unsigned int);
+
+/*
+ * This is a version of ip_compute_csum() optimized for IP headers,
+ * which are always checksummed on 4-octet boundaries.
+ */
+unsigned short ip_fast_csum(unsigned char * iph, unsigned int ihl)
+{
+ return ~do_csum(iph,ihl*4,0);
+}
+
+/*
+ * computes the checksum of a memory block at buff, length len,
+ * and adds in "sum" (32-bit)
+ *
+ * returns a 32-bit number suitable for feeding into itself
+ * or csum_tcpudp_magic
+ *
+ * this function must be called with even lengths, except
+ * for the last fragment, which may be odd
+ *
+ * it's best to have buff aligned on a 32-bit boundary
+ */
+unsigned int csum_partial(const unsigned char * buff, int len, unsigned int sum)
+{
+ unsigned long result = do_csum(buff, len, 0);
+
+ /* add in old sum, and carry.. */
+ result += sum;
+ /* 32+c bits -> 32 bits */
+ result = (result & 0xffffffff) + (result >> 32);
+ return result;
+}
+
+
+/*
+ * this routine is used for miscellaneous IP-like checksums, mainly
+ * in icmp.c
+ */
+unsigned short ip_compute_csum(unsigned char * buff, int len)
+{
+ return ~do_csum(buff,len, 0);
+}
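
The folding sequence in from64to16() above is the core of every routine in this file: 16-bit words are accumulated in a wide register and the carries are folded back in only at the end. A minimal stand-alone C sketch of that behaviour (not part of the patch; the helper name fold64to16 and the toy header words are made up for the illustration):

#include <stdio.h>

/* same folding steps as from64to16() in the patch above */
static unsigned short fold64to16(unsigned long x)
{
	x = (x & 0xffffffff) + (x >> 32);	/* 64 -> 33 bits     */
	x = (x & 0xffff) + (x >> 16);		/* 33 -> 17+c bits   */
	x = (x & 0xffff) + (x >> 16);		/* 17 -> 16+c bits   */
	x = (x & 0xffff) + (x >> 16);		/* absorb last carry */
	return x;
}

int main(void)
{
	/* 16-bit words of a toy header, summed into a wide accumulator;
	 * the complement of the folded sum is the Internet checksum */
	unsigned short w[] = { 0x4500, 0x0073, 0x0000, 0x4000 };
	unsigned long sum = 0;
	int i;

	for (i = 0; i < 4; i++)
		sum += w[i];
	printf("%04x\n", (unsigned)(~fold64to16(sum) & 0xffff));
	return 0;
}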
diff --git a/arch/ia64/lib/clear_page.S b/arch/ia64/lib/clear_page.S
new file mode 100644
index 000000000..314311c5c
--- /dev/null
+++ b/arch/ia64/lib/clear_page.S
@@ -0,0 +1,42 @@
+/*
+ *
+ * Optimized version of the standard clearpage() function
+ *
+ * Based on comments from ddd. Try not to overflow the write buffer.
+ *
+ * Inputs:
+ * in0: address of page
+ *
+ * Output:
+ * none
+ *
+ * Copyright (C) 1999 Hewlett-Packard Co
+ * Copyright (C) 1999 Stephane Eranian <eranian@hpl.hp.com>
+ * Copyright (C) 1999 David Mosberger-Tang <davidm@hpl.hp.com>
+ */
+#include <asm/page.h>
+
+ .text
+ .psr abi64
+ .psr lsb
+ .lsb
+
+ .align 32
+ .global clear_page
+ .proc clear_page
+clear_page:
+ alloc r11=ar.pfs,1,0,0,0
+ mov r16=ar.lc // slow
+ mov r17=PAGE_SIZE/32-1 // -1 = repeat/until
+ ;;
+ adds r18=16,in0
+ mov ar.lc=r17
+ ;;
+1: stf.spill.nta [in0]=f0,32
+ stf.spill.nta [r18]=f0,32
+ br.cloop.dptk.few 1b
+ ;;
+ mov ar.lc=r16 // restore lc
+ br.ret.sptk.few rp
+
+ .endp clear_page
diff --git a/arch/ia64/lib/clear_user.S b/arch/ia64/lib/clear_user.S
new file mode 100644
index 000000000..0db4a78f8
--- /dev/null
+++ b/arch/ia64/lib/clear_user.S
@@ -0,0 +1,224 @@
+/*
+ * This routine clears to zero a linear memory buffer in user space.
+ *
+ * Inputs:
+ * in0: address of buffer
+ * in1: length of buffer in bytes
+ * Outputs:
+ * r8: number of bytes that didn't get cleared due to a fault
+ *
+ * Copyright (C) 1998, 1999 Hewlett-Packard Co
+ * Copyright (C) 1999 Stephane Eranian <eranian@hpl.hp.com>
+ */
+
+//
+// arguments
+//
+#define buf r32
+#define len r33
+
+//
+// local registers
+//
+#define cnt r16
+#define buf2 r17
+#define saved_lc r18
+#define saved_pr r19
+#define saved_pfs r20
+#define tmp r21
+#define len2 r22
+#define len3 r23
+
+//
+// Theory of operations:
+// - we check whether or not the buffer is small, i.e., less than 17
+// in which case we do the byte by byte loop.
+//
+// - Otherwise we go progressively from 1-byte to 8-byte stores in
+// the head part, the body is a 16-byte store loop, and we finish with
+// the tail for the last 15 bytes.
+// The good point about this breakdown is that the long buffer handling
+// contains only 2 branches.
+//
+// The reason for not using shifting & masking for both the head and the
+// tail is to stay semantically correct. This routine is not supposed
+// to write bytes outside of the buffer. While most of the time this would
+// be ok, we can't tolerate a mistake. A classical example is the case
+// of multithreaded code where the extra bytes touched are actually owned
+// by another thread which runs concurrently to ours. Another, less likely,
+// example is with device drivers where reading an I/O mapped location may
+// have side effects (same thing for writing).
+//
+
+// The label comes first because our store instruction contains a comma
+// and would otherwise confuse the preprocessor
+//
+#define EX(y,x...) \
+ .section __ex_table,"a"; \
+ data4 @gprel(99f); \
+ data4 y-99f; \
+ .previous; \
+99: x
+
+ .text
+ .psr abi64
+ .psr lsb
+ .lsb
+
+ .align 32
+ .global __do_clear_user
+ .proc __do_clear_user
+
+__do_clear_user:
+ alloc saved_pfs=ar.pfs,2,0,0,0
+ cmp.eq p6,p0=r0,len // check for zero length
+ mov saved_lc=ar.lc // preserve ar.lc (slow)
+ ;; // avoid WAW on CFM
+ adds tmp=-1,len // br.ctop is repeat/until
+ mov ret0=len // return value is length at this point
+(p6) br.ret.spnt.few rp
+ ;;
+ cmp.lt p6,p0=16,len // if len > 16 then long memset
+ mov ar.lc=tmp // initialize lc for small count
+(p6) br.cond.dptk.few long_do_clear
+ ;; // WAR on ar.lc
+ //
+	// worst case 16 cycles, avg 8 cycles
+ //
+ // We could have played with the predicates to use the extra
+	// M slot for 2 stores/iteration, but the cost of initializing the
+	// various counters, compared to how long the loop lasts on average,
+	// does not make this solution viable.
+ //
+1:
+ EX( .Lexit1, st1 [buf]=r0,1 )
+ adds len=-1,len // countdown length using len
+ br.cloop.dptk.few 1b
+ ;; // avoid RAW on ar.lc
+ //
+	// .Lexit1: comes from the byte-by-byte loop
+ // len contains bytes left
+.Lexit1:
+ mov ret0=len // faster than using ar.lc
+ mov ar.lc=saved_lc
+ br.ret.sptk.few rp // end of short clear_user
+
+
+ //
+ // At this point we know we have more than 16 bytes to copy
+ // so we focus on alignment (no branches required)
+ //
+ // The use of len/len2 for countdown of the number of bytes left
+ // instead of ret0 is due to the fact that the exception code
+ // changes the values of r8.
+ //
+long_do_clear:
+ tbit.nz p6,p0=buf,0 // odd alignment (for long_do_clear)
+ ;;
+ EX( .Lexit3, (p6) st1 [buf]=r0,1 ) // 1-byte aligned
+(p6) adds len=-1,len;; // sync because buf is modified
+ tbit.nz p6,p0=buf,1
+ ;;
+ EX( .Lexit3, (p6) st2 [buf]=r0,2 ) // 2-byte aligned
+(p6) adds len=-2,len;;
+ tbit.nz p6,p0=buf,2
+ ;;
+ EX( .Lexit3, (p6) st4 [buf]=r0,4 ) // 4-byte aligned
+(p6) adds len=-4,len;;
+ tbit.nz p6,p0=buf,3
+ ;;
+ EX( .Lexit3, (p6) st8 [buf]=r0,8 ) // 8-byte aligned
+(p6) adds len=-8,len;;
+ shr.u cnt=len,4 // number of 128-bit (2x64bit) words
+ ;;
+ cmp.eq p6,p0=r0,cnt
+ adds tmp=-1,cnt
+(p6) br.cond.dpnt.few .dotail // we have less than 16 bytes left
+ ;;
+ adds buf2=8,buf // setup second base pointer
+ mov ar.lc=tmp
+ ;;
+
+ //
+ // 16bytes/iteration core loop
+ //
+ // The second store can never generate a fault because
+ // we come into the loop only when we are 16-byte aligned.
+ // This means that if we cross a page then it will always be
+ // in the first store and never in the second.
+ //
+ //
+ // We need to keep track of the remaining length. A possible (optimistic)
+	// way would be to use ar.lc and derive how many bytes were left by
+	// doing: left = 16*ar.lc + 16. This would avoid the addition at
+ // every iteration.
+ // However we need to keep the synchronization point. A template
+ // M;;MB does not exist and thus we can keep the addition at no
+ // extra cycle cost (use a nop slot anyway). It also simplifies the
+ // (unlikely) error recovery code
+ //
+
+2:
+
+ EX(.Lexit3, st8 [buf]=r0,16 )
+ ;; // needed to get len correct when error
+ st8 [buf2]=r0,16
+ adds len=-16,len
+ br.cloop.dptk.few 2b
+ ;;
+ mov ar.lc=saved_lc
+ //
+ // tail correction based on len only
+ //
+ // We alternate the use of len3,len2 to allow parallelism and correct
+ // error handling. We also reuse p6/p7 to return correct value.
+ // The addition of len2/len3 does not cost anything more compared to
+ // the regular memset as we had empty slots.
+ //
+.dotail:
+ mov len2=len // for parallelization of error handling
+ mov len3=len
+ tbit.nz p6,p0=len,3
+ ;;
+ EX( .Lexit2, (p6) st8 [buf]=r0,8 ) // at least 8 bytes
+(p6) adds len3=-8,len2
+ tbit.nz p7,p6=len,2
+ ;;
+ EX( .Lexit2, (p7) st4 [buf]=r0,4 ) // at least 4 bytes
+(p7) adds len2=-4,len3
+ tbit.nz p6,p7=len,1
+ ;;
+ EX( .Lexit2, (p6) st2 [buf]=r0,2 ) // at least 2 bytes
+(p6) adds len3=-2,len2
+ tbit.nz p7,p6=len,0
+ ;;
+ EX( .Lexit2, (p7) st1 [buf]=r0 ) // only 1 byte left
+ mov ret0=r0 // success
+ br.ret.dptk.few rp // end of most likely path
+
+ //
+ // Outlined error handling code
+ //
+
+ //
+ // .Lexit3: comes from core loop, need restore pr/lc
+ // len contains bytes left
+ //
+ //
+ // .Lexit2:
+ // if p6 -> coming from st8 or st2 : len2 contains what's left
+ // if p7 -> coming from st4 or st1 : len3 contains what's left
+ // We must restore lc/pr even though might not have been used.
+.Lexit2:
+(p6) mov len=len2
+(p7) mov len=len3
+ ;;
+ //
+	// .Lexit3: also comes from the head stores
+ // len contains bytes left
+ //
+.Lexit3:
+ mov ret0=len
+ mov ar.lc=saved_lc
+ br.ret.dptk.few rp
+ .endp
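
The head/body/tail scheme described in the comment block at the top of this file corresponds roughly to the C sketch below (illustration only: it assumes len > 16 on entry, as the long path does, and leaves out the fault handling that is the whole point of the real routine; the name clear_sketch is made up):

#include <stdint.h>
#include <stddef.h>

static void clear_sketch(char *buf, size_t len)
{
	/* head: align buf to a 16-byte boundary with 1/2/4/8-byte stores */
	if ((uintptr_t)buf & 1) { *buf = 0; buf += 1; len -= 1; }
	if ((uintptr_t)buf & 2) { *(uint16_t *)buf = 0; buf += 2; len -= 2; }
	if ((uintptr_t)buf & 4) { *(uint32_t *)buf = 0; buf += 4; len -= 4; }
	if ((uintptr_t)buf & 8) { *(uint64_t *)buf = 0; buf += 8; len -= 8; }

	/* body: 16 bytes per iteration through two 8-byte stores */
	while (len >= 16) {
		((uint64_t *)buf)[0] = 0;
		((uint64_t *)buf)[1] = 0;
		buf += 16;
		len -= 16;
	}

	/* tail: at most 15 bytes left, peel off 8/4/2/1 */
	if (len & 8) { *(uint64_t *)buf = 0; buf += 8; }
	if (len & 4) { *(uint32_t *)buf = 0; buf += 4; }
	if (len & 2) { *(uint16_t *)buf = 0; buf += 2; }
	if (len & 1) { *buf = 0; }
}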
diff --git a/arch/ia64/lib/copy_page.S b/arch/ia64/lib/copy_page.S
new file mode 100644
index 000000000..0a956e5a2
--- /dev/null
+++ b/arch/ia64/lib/copy_page.S
@@ -0,0 +1,87 @@
+/*
+ *
+ * Optimized version of the standard copy_page() function
+ *
+ * Based on comments from ddd. Try not to overflow write buffer.
+ *
+ * Inputs:
+ * in0: address of target page
+ * in1: address of source page
+ * Output:
+ * no return value
+ *
+ * Copyright (C) 1999 Hewlett-Packard Co
+ * Copyright (C) 1999 Stephane Eranian <eranian@hpl.hp.com>
+ */
+#include <asm/page.h>
+
+#define lcount r16
+#define saved_pr r17
+#define saved_lc r18
+#define saved_pfs r19
+#define src1 r20
+#define src2 r21
+#define tgt1 r22
+#define tgt2 r23
+
+ .text
+ .psr abi64
+ .psr lsb
+ .lsb
+
+ .align 32
+ .global copy_page
+ .proc copy_page
+
+copy_page:
+	alloc saved_pfs=ar.pfs,10,0,0,8		// we need 6 rotating (8 minimum)
+ // + 2 input
+
+ .rotr t1[4], t2[4] // our 2 pipelines with depth of 4 each
+
+ mov saved_lc=ar.lc // save ar.lc ahead of time
+ mov saved_pr=pr // rotating predicates are preserved
+						// registers we must save.
+ mov src1=in1 // initialize 1st stream source
+ adds src2=8,in1 // initialize 2nd stream source
+ mov lcount=PAGE_SIZE/16-1 // as many 16bytes as there are on a page
+ // -1 is because br.ctop is repeat/until
+
+ adds tgt2=8,in0 // initialize 2nd stream target
+ mov tgt1=in0 // initialize 1st stream target
+ ;;
+ mov pr.rot=1<<16 // pr16=1 & pr[17-63]=0 , 63 not modified
+
+ mov ar.lc=lcount // set loop counter
+ mov ar.ec=4 // ar.ec must match pipeline depth
+ ;;
+
+ // We need to preload the n-1 stages of the pipeline (n=depth).
+ // We do this during the "prolog" of the loop: we execute
+ // n-1 times the "load" bundle. Then both loads & stores are
+ // enabled until we reach the end of the last word of the page
+ // on the load side. Then, we enter the epilogue (controlled by ec)
+ // where we just do the stores and no loads n-1 times : drain the pipe.
+ //
+ // The initialization of the prolog is done via the predicate registers:
+ // the choice of pr19 DEPENDS on the depth of the pipeline (n).
+ // When lc > 0 pr63=1 and it is fed back into pr16 and pr16-pr62
+ // are then shifted right at every iteration,
+ // Thus by initializing pr16=1 and pr17-19=0 (19=16+4-1) before the loop
+ // we get pr19=1 after 4 iterations (n in our case).
+ //
+1: // engage loop now, let the magic happen...
+(p16) ld8 t1[0]=[src1],16 // new data on top of pipeline in 1st stream
+(p16) ld8 t2[0]=[src2],16 // new data on top of pipeline in 2nd stream
+ nop.i 0x0
+(p19) st8 [tgt1]=t1[3],16 // store top of 1st pipeline
+(p19) st8 [tgt2]=t2[3],16 // store top of 2nd pipeline
+ br.ctop.dptk.few 1b // once lc==0, ec-- & p16=0
+ // stores but no loads anymore
+ ;;
+ mov pr=saved_pr,0xffffffffffff0000 // restore predicates
+ mov ar.pfs=saved_pfs // restore ar.ec
+ mov ar.lc=saved_lc // restore saved lc
+ br.ret.sptk.few rp // bye...
+
+ .endp copy_page
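
Ignoring the rotating registers and the ar.ec-controlled prolog/epilogue, the data movement performed by the loop above is simply two interleaved 8-byte streams, 16 bytes per iteration; the modulo-scheduled pipeline exists only to hide the ld8 latency. A C sketch of that skeleton (PAGE_SIZE assumed to be 16KB here; copy_page_sketch is a made-up name, not the kernel interface):

#include <stdint.h>

#define PAGE_SIZE 16384			/* assumed page size for the sketch */

static void copy_page_sketch(void *to, const void *from)
{
	uint64_t *t1 = to, *t2 = (uint64_t *)to + 1;
	const uint64_t *s1 = from, *s2 = (const uint64_t *)from + 1;
	long i;

	for (i = 0; i < PAGE_SIZE / 16; i++) {
		*t1 = *s1;		/* 1st stream: bytes 0..7 of the chunk  */
		*t2 = *s2;		/* 2nd stream: bytes 8..15 of the chunk */
		t1 += 2; s1 += 2;
		t2 += 2; s2 += 2;
	}
}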
diff --git a/arch/ia64/lib/copy_user.S b/arch/ia64/lib/copy_user.S
new file mode 100644
index 000000000..03a540a80
--- /dev/null
+++ b/arch/ia64/lib/copy_user.S
@@ -0,0 +1,71 @@
+/*
+ * This routine copies a linear memory buffer across the user/kernel boundary. When
+ * reading a byte from the source causes a fault, the remainder of the destination
+ * buffer is zeroed out. Note that this can happen only when copying from user
+ * to kernel memory and we do this to absolutely guarantee that the
+ * kernel doesn't operate on random data.
+ *
+ * This file is derived from arch/alpha/lib/copy_user.S.
+ *
+ * Inputs:
+ * in0: address of destination buffer
+ * in1: address of source buffer
+ * in2: length of buffer in bytes
+ * Outputs:
+ * r8: number of bytes that didn't get copied due to a fault
+ *
+ * Copyright (C) 1999 Hewlett-Packard Co
+ * Copyright (C) 1998, 1999 David Mosberger-Tang <davidm@hpl.hp.com>
+ */
+
+#define EXI(x...) \
+99: x; \
+ .section __ex_table,"a"; \
+ data4 @gprel(99b); \
+ data4 .Lexit_in-99b; \
+ .previous
+
+#define EXO(x...) \
+99: x; \
+ .section __ex_table,"a"; \
+ data4 @gprel(99b); \
+ data4 .Lexit_out-99b; \
+ .previous
+
+ .text
+ .psr abi64
+ .psr lsb
+ .lsb
+
+ .align 32
+ .global __copy_user
+ .proc __copy_user
+__copy_user:
+ alloc r10=ar.pfs,3,0,0,0
+ mov r9=ar.lc // save ar.lc
+ mov ar.lc=in2 // set ar.lc to length of buffer
+ br.sptk.few .Lentr
+
+ // XXX braindead copy loop---this needs to be optimized
+.Loop1:
+ EXI(ld1 r8=[in1],1)
+ ;;
+ EXO(st1 [in0]=r8,1)
+.Lentr: br.cloop.dptk.few .Loop1 // repeat unless ar.lc--==0
+ ;; // avoid RAW on ar.lc
+.Lexit_out:
+ mov r8=ar.lc // return how many bytes we _didn't_ copy
+ mov ar.lc=r9
+ br.ret.sptk.few rp
+
+.Lexit_in:
+ // clear the remainder of the buffer:
+ mov r8=ar.lc // return how many bytes we _didn't_ copy
+.Loop2:
+ st1 [in0]=r0,1 // this cannot fault because we get here only on user->kernel copies
+ br.cloop.dptk.few .Loop2
+ ;; // avoid RAW on ar.lc
+ mov ar.lc=r9
+ br.ret.sptk.few rp
+
+ .endp __copy_user
diff --git a/arch/ia64/lib/csum_partial_copy.c b/arch/ia64/lib/csum_partial_copy.c
new file mode 100644
index 000000000..d09f11e21
--- /dev/null
+++ b/arch/ia64/lib/csum_partial_copy.c
@@ -0,0 +1,165 @@
+/*
+ * Network Checksum & Copy routine
+ *
+ * Copyright (C) 1999 Hewlett-Packard Co
+ * Copyright (C) 1999 Stephane Eranian <eranian@hpl.hp.com>
+ *
+ * Most of the code has been imported from Linux/Alpha
+ */
+
+#include <linux/types.h>
+#include <linux/string.h>
+
+#include <asm/uaccess.h>
+
+/*
+ * XXX Fixme: those 2 inlines are meant for debugging and will go away
+ */
+static inline unsigned short
+from64to16(unsigned long x)
+{
+ /* add up 32-bit words for 33 bits */
+ x = (x & 0xffffffff) + (x >> 32);
+ /* add up 16-bit and 17-bit words for 17+c bits */
+ x = (x & 0xffff) + (x >> 16);
+ /* add up 16-bit and 2-bit for 16+c bit */
+ x = (x & 0xffff) + (x >> 16);
+ /* add up carry.. */
+ x = (x & 0xffff) + (x >> 16);
+ return x;
+}
+
+static inline
+unsigned long do_csum_c(const unsigned char * buff, int len, unsigned int psum)
+{
+ int odd, count;
+ unsigned long result = (unsigned long)psum;
+
+ if (len <= 0)
+ goto out;
+ odd = 1 & (unsigned long) buff;
+ if (odd) {
+ result = *buff << 8;
+ len--;
+ buff++;
+ }
+ count = len >> 1; /* nr of 16-bit words.. */
+ if (count) {
+ if (2 & (unsigned long) buff) {
+ result += *(unsigned short *) buff;
+ count--;
+ len -= 2;
+ buff += 2;
+ }
+ count >>= 1; /* nr of 32-bit words.. */
+ if (count) {
+ if (4 & (unsigned long) buff) {
+ result += *(unsigned int *) buff;
+ count--;
+ len -= 4;
+ buff += 4;
+ }
+ count >>= 1; /* nr of 64-bit words.. */
+ if (count) {
+ unsigned long carry = 0;
+ do {
+ unsigned long w = *(unsigned long *) buff;
+ count--;
+ buff += 8;
+ result += carry;
+ result += w;
+ carry = (w > result);
+ } while (count);
+ result += carry;
+ result = (result & 0xffffffff) + (result >> 32);
+ }
+ if (len & 4) {
+ result += *(unsigned int *) buff;
+ buff += 4;
+ }
+ }
+ if (len & 2) {
+ result += *(unsigned short *) buff;
+ buff += 2;
+ }
+ }
+ if (len & 1)
+ result += *buff;
+
+ result = from64to16(result);
+
+ if (odd)
+ result = ((result >> 8) & 0xff) | ((result & 0xff) << 8);
+
+out:
+ return result;
+}
+
+/*
+ * XXX Fixme
+ *
+ * This is very ugly but temporary. THIS NEEDS SERIOUS ENHANCEMENTS.
+ * But it's very tricky to get right even in C.
+ */
+extern unsigned long do_csum(const unsigned char *, int);
+
+static unsigned int
+do_csum_partial_copy_from_user (const char *src, char *dst, int len,
+ unsigned int psum, int *errp)
+{
+ const unsigned char *psrc = src;
+ unsigned long result;
+ int cplen = len;
+ int r = 0;
+
+ /* XXX Fixme
+ * for now we separate the copy from checksum for obvious
+ * alignment difficulties. Look at the Alpha code and you'll be
+ * scared.
+ */
+
+	while (cplen--) r |= __get_user(*dst++, psrc++);
+
+	if (r && errp) *errp = r;
+
+ result = do_csum(src, len);
+
+ /* add in old sum, and carry.. */
+ result += psum;
+ /* 32+c bits -> 32 bits */
+ result = (result & 0xffffffff) + (result >> 32);
+ return result;
+}
+
+unsigned int
+csum_partial_copy_from_user(const char *src, char *dst, int len,
+ unsigned int sum, int *errp)
+{
+ if (!access_ok(src, len, VERIFY_READ)) {
+ *errp = -EFAULT;
+ memset(dst, 0, len);
+ return sum;
+ }
+
+ return do_csum_partial_copy_from_user(src, dst, len, sum, errp);
+}
+
+unsigned int
+csum_partial_copy_nocheck(const char *src, char *dst, int len, unsigned int sum)
+{
+ return do_csum_partial_copy_from_user(src, dst, len, sum, NULL);
+}
+
+unsigned int
+csum_partial_copy (const char *src, char *dst, int len, unsigned int sum)
+{
+ unsigned int ret;
+ int error = 0;
+
+ ret = do_csum_partial_copy_from_user(src, dst, len, sum, &error);
+ if (error)
+ printk("csum_partial_copy_old(): tell mingo to convert me!\n");
+
+ return ret;
+}
+
diff --git a/arch/ia64/lib/do_csum.S b/arch/ia64/lib/do_csum.S
new file mode 100644
index 000000000..d8174f10a
--- /dev/null
+++ b/arch/ia64/lib/do_csum.S
@@ -0,0 +1,230 @@
+/*
+ *
+ * Optimized version of the standard do_csum() function
+ *
+ * Return: a 64bit quantity containing the 16bit Internet checksum
+ *
+ * Inputs:
+ * in0: address of buffer to checksum (char *)
+ * in1: length of the buffer (int)
+ *
+ * Copyright (C) 1999 Hewlett-Packard Co
+ * Copyright (C) 1999 Stephane Eranian <eranian@hpl.hp.com>
+ *
+ */
+
+//
+// Theory of operations:
+// The goal is to go as quickly as possible to the point where
+// we can checksum 8 bytes/loop. Before reaching that point we must
+// take care of incorrect alignment of first byte.
+//
+// The code hereafter also takes care of the "tail" part of the buffer
+// before entering the core loop, if any. The checksum is a sum so it
+// allows us to commute operations. So we do do the "head" and "tail"
+// first to finish at full speed in the body. Once we get the head and
+// tail values, we feed them into the pipeline, very handy initialization.
+//
+// Of course we deal with the special case where the whole buffer fits
+// into one 8 byte word. In this case we have only one entry in the pipeline.
+//
+// We use a (3+1)-stage pipeline in the loop to account for possible
+// load latency and also to accommodate the head and tail.
+//
+// The end of the function deals with folding the checksum from 64bits
+// down to 16bits taking care of the carry.
+//
+// This version avoids synchronization in the core loop by also using a
+// pipeline for the accumulation of the checksum in result[].
+//
+// p[]
+// |---|
+// 0| | r32 : new value loaded in pipeline
+// |---|
+// 1| | r33 : in transit data
+// |---|
+// 2| | r34 : current value to add to checksum
+// |---|
+// 3| | r35 : previous value added to checksum (previous iteration)
+// |---|
+//
+// result[]
+// |---|
+// 0| | r36 : new checksum
+// |---|
+// 1| | r37 : previous value of checksum
+// |---|
+// 2| | r38 : final checksum when out of the loop (after 2 epilogue rots)
+// |---|
+//
+//
+// NOT YET DONE:
+// - Take advantage of the MMI bandwidth to load more than 8byte per loop
+// iteration
+// - use the lfetch instruction to augment the chances of the data being in
+// the cache when we need it.
+// - Maybe another algorithm which would take care of the folding at the
+// end in a different manner
+// - Work with people more knowledgeable than me on the network stack
+// to figure out if we could not split the function depending on the
+// type of packet or alignment we get. Like the ip_fast_csum() routine
+// where we know we have at least 20bytes worth of data to checksum.
+// - Look at RFCs about checksums to see whether or not we can do better
+//
+// - Do a better job of handling small packets.
+//
+#define saved_pfs r11
+#define hmask r16
+#define tmask r17
+#define first r18
+#define firstval r19
+#define firstoff r20
+#define last r21
+#define lastval r22
+#define lastoff r23
+#define saved_lc r24
+#define saved_pr r25
+#define tmp1 r26
+#define tmp2 r27
+#define tmp3 r28
+#define carry r29
+
+#define buf in0
+#define len in1
+
+
+ .text
+ .psr abi64
+ .psr lsb
+ .lsb
+
+// unsigned long do_csum(unsigned char *buf,int len)
+
+ .align 32
+ .global do_csum
+ .proc do_csum
+do_csum:
+ alloc saved_pfs=ar.pfs,2,8,0,8
+
+ .rotr p[4], result[3]
+ mov ret0=r0 // in case we have zero length
+ cmp4.lt p0,p6=r0,len // check for zero length or negative (32bit len)
+ ;; // avoid WAW on CFM
+ mov tmp3=0x7 // a temporary mask/value
+	add tmp1=buf,len	// address just past the last byte
+(p6) br.ret.spnt.few rp // return if true (hope we can avoid that)
+
+ and firstoff=7,buf // how many bytes off for first element
+ tbit.nz p10,p0=buf,0 // is buf an odd address ?
+	mov hmask=-1		// initialize head mask
+ ;;
+
+ andcm first=buf,tmp3 // 8byte aligned down address of first element
+ mov tmask=-1 // initialize tail mask
+ adds tmp2=-1,tmp1 // last-1
+ ;;
+ and lastoff=7,tmp1 // how many bytes off for last element
+ andcm last=tmp2,tmp3 // address of word containing last byte
+ mov saved_pr=pr // preserve predicates (rotation)
+ ;;
+ sub tmp3=last,first // tmp3=distance from first to last
+ cmp.eq p8,p9=last,first // everything fits in one word ?
+ sub tmp1=8,lastoff // complement to lastoff
+
+ ld8 firstval=[first],8 // load,ahead of time, "first" word
+ shl tmp2=firstoff,3 // number of bits
+ ;;
+ and tmp1=7, tmp1 // make sure that if tmp1==8 -> tmp1=0
+
+(p9) ld8 lastval=[last] // load,ahead of time, "last" word, if needed
+(p8) mov lastval=r0 // we don't need lastval if first==last
+ mov result[1]=r0 // initialize result
+ ;;
+
+ shl tmp1=tmp1,3 // number of bits
+ shl hmask=hmask,tmp2 // build head mask, mask off [0,firstoff[
+ ;;
+ shr.u tmask=tmask,tmp1 // build tail mask, mask off ]8,lastoff]
+ mov saved_lc=ar.lc // save lc
+ ;;
+(p8) and hmask=hmask,tmask // apply tail mask to head mask if 1 word only
+(p9)	and p[1]=lastval,tmask	// mask last value as appropriate
+ shr.u tmp3=tmp3,3 // we do 8 bytes per loop
+ ;;
+ cmp.lt p6,p7=2,tmp3 // tmp3 > 2 ?
+ and p[2]=firstval,hmask // and mask it as appropriate
+ add tmp1=-2,tmp3 // -2 = -1 (br.ctop) -1 (last-first)
+ ;;
+ // XXX Fixme: not very nice initialization here
+ //
+ // Setup loop control registers:
+ //
+ // tmp3=0 (1 word) : lc=0, ec=2, p16=F
+ // tmp3=1 (2 words) : lc=0, ec=3, p16=F
+ // tmp3=2 (3 words) : lc=0, ec=4, p16=T
+ // tmp3>2 (4 or more): lc=tmp3-2, ec=4, p16=T
+ //
+ cmp.eq p8,p9=r0,tmp3 // tmp3 == 0 ?
+(p6) mov ar.lc=tmp1
+(p7) mov ar.lc=0
+ ;;
+ cmp.lt p6,p7=1,tmp3 // tmp3 > 1 ?
+(p8) mov ar.ec=2 // we need the extra rotation on result[]
+(p9) mov ar.ec=3 // hard not to set it twice sometimes
+ ;;
+ mov carry=r0 // initialize carry
+(p6) mov ar.ec=4
+(p6) mov pr.rot=0xffffffffffff0000 // p16=T, p18=T
+
+ cmp.ne p8,p0=r0,r0 // p8 is false
+ mov p[3]=r0 // make sure first compare fails
+(p7) mov pr.rot=0xfffffffffffe0000 // p16=F, p18=T
+ ;;
+1:
+(p16) ld8 p[0]=[first],8 // load next
+(p8) adds carry=1,carry // add carry on prev_prev_value
+(p18) add result[0]=result[1],p[2] // new_res = prev_res + cur_val
+ cmp.ltu p8,p0=result[1],p[3] // p8= prev_result < prev_val
+ br.ctop.dptk.few 1b // loop until lc--==0
+ ;; // RAW on carry when loop exits
+ (p8) adds carry=1,carry;; // correct for carry on prev_value
+ add result[2]=carry,result[2];; // add carry to final result
+ cmp.ltu p6,p7=result[2], carry // check for new carry
+ ;;
+(p6) adds result[2]=1,result[1] // correct if required
+ movl tmp3=0xffffffff
+ ;;
+ // XXX Fixme
+ //
+ // now fold 64 into 16 bits taking care of carry
+ // that's not very good because it has lots of sequentiality
+ //
+ and tmp1=result[2],tmp3
+ shr.u tmp2=result[2],32
+ ;;
+ add result[2]=tmp1,tmp2
+ shr.u tmp3=tmp3,16
+ ;;
+ and tmp1=result[2],tmp3
+ shr.u tmp2=result[2],16
+ ;;
+ add result[2]=tmp1,tmp2
+ ;;
+ and tmp1=result[2],tmp3
+ shr.u tmp2=result[2],16
+ ;;
+ add result[2]=tmp1,tmp2
+ ;;
+ and tmp1=result[2],tmp3
+ shr.u tmp2=result[2],16
+ ;;
+ add ret0=tmp1,tmp2
+ mov pr=saved_pr,0xffffffffffff0000
+ ;;
+ // if buf was odd then swap bytes
+ mov ar.pfs=saved_pfs // restore ar.ec
+(p10) mux1 ret0=ret0,@rev // reverse word
+ ;;
+ mov ar.lc=saved_lc
+(p10) shr.u ret0=ret0,64-16 // + shift back to position = swap bytes
+ br.ret.sptk.few rp
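
The head/tail masking described in the comment block above, i.e. loading the 8-byte-aligned words that contain the first and last bytes of the buffer and masking off the bytes outside it, can be sketched in C as follows (little-endian layout assumed, matching the .lsb setting of the file; the helper names head_word/tail_word are made up for the sketch, and like the assembly it deliberately reads the whole aligned word):

#include <stdint.h>

/* 8-byte-aligned word containing p, with the bytes before p masked off */
static uint64_t head_word(const unsigned char *p)
{
	const uint64_t *first = (const uint64_t *)((uintptr_t)p & ~(uintptr_t)7);
	int firstoff = (uintptr_t)p & 7;			/* bytes to discard */
	uint64_t hmask = ~(uint64_t)0 << (firstoff * 8);	/* keep upper bytes */

	return *first & hmask;
}

/* aligned word containing the last byte, with the bytes after it masked off */
static uint64_t tail_word(const unsigned char *p, int len)
{
	const unsigned char *end = p + len;			/* one past last byte */
	const uint64_t *last = (const uint64_t *)(((uintptr_t)end - 1) & ~(uintptr_t)7);
	int lastoff = (uintptr_t)end & 7;			/* bytes to keep; 0 means 8 */
	uint64_t tmask = ~(uint64_t)0 >> (((8 - lastoff) & 7) * 8);

	return *last & tmask;
}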
diff --git a/arch/ia64/lib/flush.S b/arch/ia64/lib/flush.S
new file mode 100644
index 000000000..0195ae5f5
--- /dev/null
+++ b/arch/ia64/lib/flush.S
@@ -0,0 +1,37 @@
+/*
+ * Cache flushing routines.
+ *
+ * Copyright (C) 1999 Hewlett-Packard Co
+ * Copyright (C) 1999 David Mosberger-Tang <davidm@hpl.hp.com>
+ */
+#include <asm/page.h>
+
+ .text
+ .psr abi64
+ .psr lsb
+ .lsb
+
+ .align 16
+ .global ia64_flush_icache_page
+ .proc ia64_flush_icache_page
+ia64_flush_icache_page:
+ alloc r2=ar.pfs,1,0,0,0
+ mov r3=ar.lc // save ar.lc
+ mov r8=PAGE_SIZE/64-1 // repeat/until loop
+ ;;
+ mov ar.lc=r8
+ add r8=32,in0
+ ;;
+.Loop1: fc in0 // issuable on M0 only
+ add in0=64,in0
+ fc r8
+ add r8=64,r8
+ br.cloop.sptk.few .Loop1
+ ;;
+ sync.i
+ ;;
+ srlz.i
+ ;;
+ mov ar.lc=r3 // restore ar.lc
+ br.ret.sptk.few rp
+ .endp ia64_flush_icache_page
diff --git a/arch/ia64/lib/idiv.S b/arch/ia64/lib/idiv.S
new file mode 100644
index 000000000..a12097c94
--- /dev/null
+++ b/arch/ia64/lib/idiv.S
@@ -0,0 +1,158 @@
+/*
+ * Integer division routine.
+ *
+ * Copyright (C) 1999 Hewlett-Packard Co
+ * Copyright (C) 1999 David Mosberger-Tang <davidm@hpl.hp.com>
+ */
+/* Simple integer division. It uses the straightforward division
+ algorithm. This may not be the absolutely fastest way to do it,
+ but it's not horrible either. According to ski, the worst case
+ scenario of dividing 0xffffffffffffffff by 1 takes 133 cycles.
+
+ An alternative would be to use an algorithm similar to the
+ floating point division algorithm (Newton-Raphson iteration),
+ but that approach is rather tricky (one has to be very careful
+ to get the last bit right...).
+
+ While this algorithm is straight-forward, it does use a couple
+ of neat ia-64 specific tricks:
+
+ - it uses the floating point unit to determine the initial
+ shift amount (shift = floor(ld(x)) - floor(ld(y)))
+
+ - it uses predication to avoid a branch in the case where
+ x < y (this is what p8 is used for)
+
+ - it uses rotating registers and the br.ctop branch to
+ implement a software-pipelined loop that's unrolled
+ twice (without any code expansion!)
+
+ - the code is relatively well scheduled to avoid unnecessary
+ nops while maximizing parallelism
+*/
+
+#include <asm/break.h>
+
+ .text
+ .psr abi64
+#ifdef __BIG_ENDIAN__
+ .psr msb
+ .msb
+#else
+ .psr lsb
+ .lsb
+#endif
+
+#ifdef MODULO
+# define OP mod
+# define Q r9
+# define R r8
+#else
+# define OP div
+# define Q r8
+# define R r9
+#endif
+
+#ifdef SINGLE
+# define PREC si
+#else
+# define PREC di
+#endif
+
+#ifdef UNSIGNED
+# define SGN u
+# define INT_TO_FP(a,b) fma.s0 a=b,f1,f0
+# define FP_TO_INT(a,b) fcvt.fxu.trunc.s0 a=b
+#else
+# define SGN
+# define INT_TO_FP(a,b) fcvt.xf a=b
+# define FP_TO_INT(a,b) fcvt.fx.trunc.s0 a=b
+#endif
+
+#define PASTE1(a,b) a##b
+#define PASTE(a,b) PASTE1(a,b)
+#define NAME PASTE(PASTE(__,SGN),PASTE(OP,PASTE(PREC,3)))
+
+ .align 32
+ .global NAME
+ .proc NAME
+NAME:
+
+ alloc r2=ar.pfs,2,6,0,8
+ mov r18=pr
+#ifdef SINGLE
+# ifdef UNSIGNED
+ zxt4 in0=in0
+ zxt4 in1=in1
+# else
+ sxt4 in0=in0
+ sxt4 in1=in1
+# endif
+ ;;
+#endif
+
+#ifndef UNSIGNED
+ cmp.lt p6,p0=in0,r0 // x negative?
+ cmp.lt p7,p0=in1,r0 // y negative?
+ ;;
+(p6) sub in0=r0,in0 // make x positive
+(p7) sub in1=r0,in1 // ditto for y
+ ;;
+#endif
+
+ setf.sig f8=in0
+ mov r3=ar.lc // save ar.lc
+ setf.sig f9=in1
+ ;;
+ mov Q=0 // initialize q
+ mov R=in0 // stash away x in a static register
+ mov r16=1 // r16 = 1
+ INT_TO_FP(f8,f8)
+ cmp.eq p8,p0=0,in0 // x==0?
+ cmp.eq p9,p0=0,in1 // y==0?
+ ;;
+ INT_TO_FP(f9,f9)
+(p8) br.dpnt.few .L3
+(p9) break __IA64_BREAK_KDB // attempted division by zero (should never happen)
+ mov ar.ec=r0 // epilogue count = 0
+ ;;
+ getf.exp r14=f8 // r14 = exponent of x
+ getf.exp r15=f9 // r15 = exponent of y
+ mov ar.lc=r0 // loop count = 0
+ ;;
+ sub r17=r14,r15 // r17 = (exp of x - exp y) = shift amount
+ cmp.ge p8,p0=r14,r15
+ ;;
+
+ .rotr y[2], mask[2] // in0 and in1 may no longer be valid after
+ // the first write to a rotating register!
+
+(p8) shl y[1]=in1,r17 // y[1] = y<<shift
+(p8) shl mask[1]=r16,r17 // mask[1] = 1<<shift
+
+(p8) mov ar.lc=r17 // loop count = r17
+ ;;
+.L1:
+(p8) cmp.geu.unc p9,p0=R,y[1]// p9 = (x >= y[1])
+(p8) shr.u mask[0]=mask[1],1 // prepare mask[0] and y[0] for next
+(p8) shr.u y[0]=y[1],1 // iteration
+ ;;
+(p9) sub R=R,y[1] // if (x >= y[1]), subtract y[1] from x
+(p9) add Q=Q,mask[1] // and set corresponding bit in q (Q)
+ br.ctop.dptk.few .L1 // repeated unless ar.lc-- == 0
+ ;;
+.L2:
+#ifndef UNSIGNED
+# ifdef MODULO
+(p6) sub R=r0,R // set sign of remainder according to x
+# else
+(p6) sub Q=r0,Q // set sign of quotient
+ ;;
+(p7) sub Q=r0,Q
+# endif
+#endif
+.L3:
+ mov ar.pfs=r2 // restore ar.pfs
+ mov ar.lc=r3 // restore ar.lc
+ mov pr=r18,0xffffffffffff0000 // restore p16-p63
+ br.ret.sptk.few rp
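
The comment above describes plain restoring shift-and-subtract division. A C sketch of the unsigned 64-bit case, using __builtin_clzll to compute the initial shift where the assembly uses the getf.exp trick, and assuming y != 0 as the routine does (the name udiv64_sketch is made up):

#include <stdint.h>

static uint64_t udiv64_sketch(uint64_t x, uint64_t y, uint64_t *rem)
{
	uint64_t q = 0, bit;
	int shift;

	if (x >= y) {
		/* line y up under the most significant bit of x */
		shift = __builtin_clzll(y) - __builtin_clzll(x);
		y <<= shift;
		bit = (uint64_t)1 << shift;	/* quotient bit being tried */
		while (bit) {
			if (x >= y) {		/* does y fit at this position? */
				x -= y;
				q |= bit;
			}
			y >>= 1;
			bit >>= 1;
		}
	}
	*rem = x;				/* what is left is the remainder */
	return q;
}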
diff --git a/arch/ia64/lib/memset.S b/arch/ia64/lib/memset.S
new file mode 100644
index 000000000..595720a2d
--- /dev/null
+++ b/arch/ia64/lib/memset.S
@@ -0,0 +1,111 @@
+/*
+ *
+ * Optimized version of the standard memset() function
+ *
+ * Return: none
+ *
+ *
+ * Inputs:
+ * in0: address of buffer
+ * in1: byte value to use for storing
+ * in2: length of the buffer
+ *
+ * Copyright (C) 1999 Hewlett-Packard Co
+ * Copyright (C) 1999 Stephane Eranian <eranian@hpl.hp.com>
+ */
+
+
+// arguments
+//
+#define buf r32
+#define val r33
+#define len r34
+
+//
+// local registers
+//
+#define saved_pfs r14
+#define cnt r18
+#define buf2 r19
+#define saved_lc r20
+#define saved_pr r21
+#define tmp r22
+
+ .text
+ .psr abi64
+ .psr lsb
+
+ .align 16
+ .global memset
+ .proc memset
+
+memset:
+ alloc saved_pfs=ar.pfs,3,0,0,0 // cnt is sink here
+ cmp.eq p8,p0=r0,len // check for zero length
+ mov saved_lc=ar.lc // preserve ar.lc (slow)
+ ;;
+ adds tmp=-1,len // br.ctop is repeat/until
+ tbit.nz p6,p0=buf,0 // odd alignment
+(p8) br.ret.spnt.few rp
+
+ cmp.lt p7,p0=16,len // if len > 16 then long memset
+ mux1 val=val,@brcst // prepare value
+(p7) br.cond.dptk.few long_memset
+ ;;
+ mov ar.lc=tmp // initialize lc for small count
+ ;; // avoid RAW and WAW on ar.lc
+1:	// worst case 15 cycles, avg 8 cycles
+ st1 [buf]=val,1
+ br.cloop.dptk.few 1b
+ ;; // avoid RAW on ar.lc
+ mov ar.lc=saved_lc
+ mov ar.pfs=saved_pfs
+ br.ret.sptk.few rp // end of short memset
+
+ // at this point we know we have more than 16 bytes to copy
+ // so we focus on alignment
+long_memset:
+(p6) st1 [buf]=val,1 // 1-byte aligned
+(p6) adds len=-1,len;; // sync because buf is modified
+ tbit.nz p6,p0=buf,1
+ ;;
+(p6) st2 [buf]=val,2 // 2-byte aligned
+(p6) adds len=-2,len;;
+ tbit.nz p6,p0=buf,2
+ ;;
+(p6) st4 [buf]=val,4 // 4-byte aligned
+(p6) adds len=-4,len;;
+ tbit.nz p6,p0=buf,3
+ ;;
+(p6) st8 [buf]=val,8 // 8-byte aligned
+(p6) adds len=-8,len;;
+ shr.u cnt=len,4 // number of 128-bit (2x64bit) words
+ ;;
+ cmp.eq p6,p0=r0,cnt
+ adds tmp=-1,cnt
+(p6) br.cond.dpnt.few .dotail // we have less than 16 bytes left
+ ;;
+ adds buf2=8,buf // setup second base pointer
+ mov ar.lc=tmp
+ ;;
+2: // 16bytes/iteration
+ st8 [buf]=val,16
+ st8 [buf2]=val,16
+ br.cloop.dptk.few 2b
+ ;;
+.dotail: // tail correction based on len only
+ tbit.nz p6,p0=len,3
+ ;;
+(p6) st8 [buf]=val,8 // at least 8 bytes
+ tbit.nz p6,p0=len,2
+ ;;
+(p6) st4 [buf]=val,4 // at least 4 bytes
+ tbit.nz p6,p0=len,1
+ ;;
+(p6) st2 [buf]=val,2 // at least 2 bytes
+ tbit.nz p6,p0=len,0
+ mov ar.lc=saved_lc
+ ;;
+(p6) st1 [buf]=val // only 1 byte left
+ br.ret.dptk.few rp
+ .endp
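
The mux1 val=val,@brcst in the prologue replicates the fill byte into all eight byte lanes of val so that the st2/st4/st8 stores write the right pattern. The same broadcast can be written in C as a multiply (a sketch, not part of the patch; broadcast_byte is a made-up name):

#include <stdint.h>

/* replicate the low byte of c into all 8 bytes of a 64-bit word,
 * the same effect as mux1 val=val,@brcst in the routine above */
static uint64_t broadcast_byte(unsigned char c)
{
	return (uint64_t)c * 0x0101010101010101ULL;
}
/* e.g. broadcast_byte(0xab) == 0xababababababababULL */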
diff --git a/arch/ia64/lib/strlen.S b/arch/ia64/lib/strlen.S
new file mode 100644
index 000000000..3062716b1
--- /dev/null
+++ b/arch/ia64/lib/strlen.S
@@ -0,0 +1,197 @@
+/*
+ *
+ * Optimized version of the standard strlen() function
+ *
+ *
+ * Inputs:
+ * in0 address of string
+ *
+ * Outputs:
+ * ret0 the number of characters in the string (0 if empty string)
+ * does not count the \0
+ *
+ * Copyright (C) 1999 Hewlett-Packard Co
+ * Copyright (C) 1999 Stephane Eranian <eranian@hpl.hp.com>
+ *
+ * 09/24/99 S.Eranian add speculation recovery code
+ */
+
+//
+//
+// This is an enhanced version of the basic strlen. It includes a combination
+// of compute zero index (czx), parallel comparisons, speculative loads and
+// loop unroll using rotating registers.
+//
+// General Ideas about the algorithm:
+// The goal is to look at the string in chunks of 8 bytes.
+// so we need to do a few extra checks at the beginning because the
+// string may not be 8-byte aligned. In this case we load the 8byte
+// quantity which includes the start of the string and mask the unused
+// bytes with 0xff to avoid confusing czx.
+// We use speculative loads and software pipelining to hide memory
+// latency and do read ahead safely. This way we defer any exception.
+//
+// Because we don't want the kernel to be relying on particular
+// settings of the DCR register, we provide recovery code in case
+// speculation fails. The recovery code is going to "redo" the work using
+// only normal loads. If we still get a fault then we generate a
+// kernel panic. Otherwise we return the strlen as usual.
+//
+// The fact that speculation may fail can be caused, for instance, by
+// the DCR.dm bit being set. In this case TLB misses are deferred, i.e.,
+// a NaT bit will be set if the translation is not present. The normal
+// load, on the other hand, will cause the translation to be inserted
+// if the mapping exists.
+//
+// It should be noted that we execute recovery code only when we need
+// to use the data that has been speculatively loaded: we don't execute
+// recovery code on pure read ahead data.
+//
+// Remarks:
+// - the cmp r0,r0 is used as a fast way to initialize a predicate
+// register to 1. This is required to make sure that we get the parallel
+// compare correct.
+//
+// - we don't use the epilogue counter to exit the loop but we need to set
+// it to zero beforehand.
+//
+// - after the loop we must test for Nat values because neither the
+// czx nor cmp instruction raise a NaT consumption fault. We must be
+// careful not to look too far for a Nat for which we don't care.
+// For instance we don't need to look at a NaT in val2 if the zero byte
+// was in val1.
+//
+// - Clearly performance tuning is required.
+//
+//
+//
+#define saved_pfs r11
+#define tmp r10
+#define base r16
+#define orig r17
+#define saved_pr r18
+#define src r19
+#define mask r20
+#define val r21
+#define val1 r22
+#define val2 r23
+
+
+ .text
+ .psr abi64
+ .psr lsb
+ .lsb
+
+ .align 32
+ .global strlen
+ .proc strlen
+strlen:
+ alloc saved_pfs=ar.pfs,11,0,0,8 // rotating must be multiple of 8
+
+ .rotr v[2], w[2] // declares our 4 aliases
+
+ extr.u tmp=in0,0,3 // tmp=least significant 3 bits
+	mov orig=in0		// keep track of initial byte address
+ dep src=0,in0,0,3 // src=8byte-aligned in0 address
+ mov saved_pr=pr // preserve predicates (rotation)
+ ;;
+ ld8 v[1]=[src],8 // must not speculate: can fail here
+ shl tmp=tmp,3 // multiply by 8bits/byte
+ mov mask=-1 // our mask
+ ;;
+ ld8.s w[1]=[src],8 // speculatively load next
+ cmp.eq p6,p0=r0,r0 // sets p6 to true for cmp.and
+ sub tmp=64,tmp // how many bits to shift our mask on the right
+ ;;
+ shr.u mask=mask,tmp // zero enough bits to hold v[1] valuable part
+ mov ar.ec=r0 // clear epilogue counter (saved in ar.pfs)
+ ;;
+ add base=-16,src // keep track of aligned base
+ or v[1]=v[1],mask // now we have a safe initial byte pattern
+ ;;
+1:
+ ld8.s v[0]=[src],8 // speculatively load next
+ czx1.r val1=v[1] // search 0 byte from right
+ czx1.r val2=w[1] // search 0 byte from right following 8bytes
+ ;;
+ ld8.s w[0]=[src],8 // speculatively load next to next
+ cmp.eq.and p6,p0=8,val1 // p6 = p6 and val1==8
+ cmp.eq.and p6,p0=8,val2 // p6 = p6 and mask==8
+(p6) br.wtop.dptk.few 1b // loop until p6 == 0
+ ;;
+ //
+	// We must try the recovery code iff
+ // val1_is_nat || (val1==8 && val2_is_nat)
+ //
+ // XXX Fixme
+ // - there must be a better way of doing the test
+ //
+	cmp.eq p8,p9=8,val1	// p8 = no zero byte in val1 (disambiguate)
+#ifdef notyet
+ tnat.nz p6,p7=val1 // test NaT on val1
+#else
+ tnat.z p7,p6=val1 // test NaT on val1
+#endif
+(p6) br.cond.spnt.few recover// jump to recovery if val1 is NaT
+ ;;
+ //
+ // if we come here p7 is true, i.e., initialized for // cmp
+ //
+ cmp.eq.and p7,p0=8,val1// val1==8?
+ tnat.nz.and p7,p0=val2 // test NaT if val2
+(p7) br.cond.spnt.few recover// jump to recovery if val2 is NaT
+ ;;
+(p8) mov val1=val2 // the other test got us out of the loop
+(p8) adds src=-16,src // correct position when 3 ahead
+(p9) adds src=-24,src // correct position when 4 ahead
+ ;;
+ sub ret0=src,orig // distance from base
+ sub tmp=8,val1 // which byte in word
+ mov pr=saved_pr,0xffffffffffff0000
+ ;;
+ sub ret0=ret0,tmp // adjust
+ mov ar.pfs=saved_pfs // because of ar.ec, restore no matter what
+ br.ret.sptk.few rp // end of normal execution
+
+ //
+ // Outlined recovery code when speculation failed
+ //
+ // This time we don't use speculation and rely on the normal exception
+ // mechanism. that's why the loop is not as good as the previous one
+ // because read ahead is not possible
+ //
+ // IMPORTANT:
+ // Please note that in the case of strlen() as opposed to strlen_user()
+ // we don't use the exception mechanism, as this function is not
+ // supposed to fail. If that happens it means we have a bug and the
+// code will cause a kernel fault.
+ //
+ // XXX Fixme
+ // - today we restart from the beginning of the string instead
+ // of trying to continue where we left off.
+ //
+recover:
+ ld8 val=[base],8 // will fail if unrecoverable fault
+ ;;
+ or val=val,mask // remask first bytes
+ cmp.eq p0,p6=r0,r0 // nullify first ld8 in loop
+ ;;
+ //
+ // ar.ec is still zero here
+ //
+2:
+(p6) ld8 val=[base],8 // will fail if unrecoverable fault
+ ;;
+ czx1.r val1=val // search 0 byte from right
+ ;;
+ cmp.eq p6,p0=8,val1 // val1==8 ?
+(p6) br.wtop.dptk.few 2b // loop until p6 == 0
+ sub ret0=base,orig // distance from base
+ sub tmp=8,val1
+ mov pr=saved_pr,0xffffffffffff0000
+ ;;
+ sub ret0=ret0,tmp // length=now - back -1
+ mov ar.pfs=saved_pfs // because of ar.ec, restore no matter what
+	br.ret.sptk.few rp	// end of successful recovery code
+
+ .endp strlen
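
There is no direct C equivalent of czx1.r, but the word-at-a-time zero-byte scan that the comment block describes can be sketched with the classic bit trick below (illustration only: it assumes an 8-byte-aligned start and that reading the whole aligned word holding the terminator is permissible, and it has none of the speculation/recovery machinery; the helper names are made up):

#include <stdint.h>
#include <stddef.h>

/* does the 64-bit word contain a zero byte? (classic bit trick) */
static int has_zero_byte(uint64_t v)
{
	return ((v - 0x0101010101010101ULL) & ~v & 0x8080808080808080ULL) != 0;
}

/* word-at-a-time strlen sketch; p is assumed 8-byte aligned */
static size_t strlen_sketch(const char *p)
{
	const uint64_t *w = (const uint64_t *)p;
	size_t n = 0;

	while (!has_zero_byte(*w)) {	/* scan 8 bytes at a time */
		w++;
		n += 8;
	}
	while (p[n])			/* locate the zero byte within the word */
		n++;
	return n;
}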
diff --git a/arch/ia64/lib/strlen_user.S b/arch/ia64/lib/strlen_user.S
new file mode 100644
index 000000000..8149dde8a
--- /dev/null
+++ b/arch/ia64/lib/strlen_user.S
@@ -0,0 +1,213 @@
+/*
+ * Optimized version of the strlen_user() function
+ *
+ * Inputs:
+ * in0 address of buffer
+ *
+ * Outputs:
+ * ret0 0 in case of fault, strlen(buffer)+1 otherwise
+ *
+ * Copyright (C) 1998, 1999 Hewlett-Packard Co
+ * Copyright (C) 1998, 1999 David Mosberger-Tang <davidm@hpl.hp.com>
+ * Copyright (C) 1998, 1999 Stephane Eranian <eranian@hpl.hp.com>
+ *
+ * 01/19/99 S.Eranian heavily enhanced version (see details below)
+ * 09/24/99 S.Eranian added speculation recovery code
+ */
+
+//
+// int strlen_user(char *)
+// ------------------------
+// Returns:
+// - length of string + 1
+// - 0 in case an exception is raised
+//
+// This is an enhanced version of the basic strlen_user. It includes a
+// combination of compute zero index (czx), parallel comparisons, speculative
+// loads and loop unroll using rotating registers.
+//
+// General Ideas about the algorithm:
+// The goal is to look at the string in chunks of 8 bytes.
+// so we need to do a few extra checks at the beginning because the
+// string may not be 8-byte aligned. In this case we load the 8byte
+// quantity which includes the start of the string and mask the unused
+// bytes with 0xff to avoid confusing czx.
+// We use speculative loads and software pipelining to hide memory
+// latency and do read ahead safely. This way we defer any exception.
+//
+// Because we don't want the kernel to be relying on particular
+// settings of the DCR register, we provide recovery code in case
+// speculation fails. The recovery code is going to "redo" the work using
+// only normal loads. If we still get a fault then we return an
+// error (ret0=0). Otherwise we return the strlen+1 as usual.
+// The fact that speculation may fail can be caused, for instance, by
+// the DCR.dm bit being set. In this case TLB misses are deferred, i.e.,
+// a NaT bit will be set if the translation is not present. The normal
+// load, on the other hand, will cause the translation to be inserted
+// if the mapping exists.
+//
+// It should be noted that we execute recovery code only when we need
+// to use the data that has been speculatively loaded: we don't execute
+// recovery code on pure read ahead data.
+//
+// Remarks:
+// - the cmp r0,r0 is used as a fast way to initialize a predicate
+// register to 1. This is required to make sure that we get the parallel
+// compare correct.
+//
+// - we don't use the epilogue counter to exit the loop but we need to set
+// it to zero beforehand.
+//
+// - after the loop we must test for Nat values because neither the
+// czx nor cmp instruction raise a NaT consumption fault. We must be
+// careful not to look too far for a Nat for which we don't care.
+// For instance we don't need to look at a NaT in val2 if the zero byte
+// was in val1.
+//
+// - Clearly performance tuning is required.
+//
+//
+//
+
+#define EX(y,x...) \
+ .section __ex_table,"a"; \
+ data4 @gprel(99f); \
+ data4 y-99f; \
+ .previous; \
+99: x
+
+#define saved_pfs r11
+#define tmp r10
+#define base r16
+#define orig r17
+#define saved_pr r18
+#define src r19
+#define mask r20
+#define val r21
+#define val1 r22
+#define val2 r23
+
+
+ .text
+ .psr abi64
+ .psr lsb
+ .lsb
+
+ .align 32
+ .global __strlen_user
+ .proc __strlen_user
+__strlen_user:
+ alloc saved_pfs=ar.pfs,11,0,0,8
+
+ .rotr v[2], w[2] // declares our 4 aliases
+
+ extr.u tmp=in0,0,3 // tmp=least significant 3 bits
+	mov orig=in0		// keep track of initial byte address
+ dep src=0,in0,0,3 // src=8byte-aligned in0 address
+ mov saved_pr=pr // preserve predicates (rotation)
+ ;;
+ ld8.s v[1]=[src],8 // load the initial 8bytes (must speculate)
+ shl tmp=tmp,3 // multiply by 8bits/byte
+ mov mask=-1 // our mask
+ ;;
+ ld8.s w[1]=[src],8 // load next 8 bytes in 2nd pipeline
+ cmp.eq p6,p0=r0,r0 // sets p6 (required because of // cmp.and)
+ sub tmp=64,tmp // how many bits to shift our mask on the right
+ ;;
+ shr.u mask=mask,tmp // zero enough bits to hold v[1] valuable part
+ mov ar.ec=r0 // clear epilogue counter (saved in ar.pfs)
+ ;;
+ add base=-16,src // keep track of aligned base
+ chk.s v[1], recover // if already NaT, then directly skip to recover
+ or v[1]=v[1],mask // now we have a safe initial byte pattern
+ ;;
+1:
+ ld8.s v[0]=[src],8 // speculatively load next
+ czx1.r val1=v[1] // search 0 byte from right
+ czx1.r val2=w[1] // search 0 byte from right following 8bytes
+ ;;
+ ld8.s w[0]=[src],8 // speculatively load next to next
+ cmp.eq.and p6,p0=8,val1 // p6 = p6 and val1==8
+ cmp.eq.and p6,p0=8,val2 // p6 = p6 and mask==8
+(p6) br.wtop.dptk.few 1b // loop until p6 == 0
+ ;;
+ //
+	// We must try the recovery code iff
+ // val1_is_nat || (val1==8 && val2_is_nat)
+ //
+ // XXX Fixme
+ // - there must be a better way of doing the test
+ //
+	cmp.eq p8,p9=8,val1	// p8 = no zero byte in val1 (disambiguate)
+#ifdef notyet
+ tnat.nz p6,p7=val1 // test NaT on val1
+#else
+ tnat.z p7,p6=val1 // test NaT on val1
+#endif
+(p6) br.cond.spnt.few recover// jump to recovery if val1 is NaT
+ ;;
+ //
+ // if we come here p7 is true, i.e., initialized for // cmp
+ //
+ cmp.eq.and p7,p0=8,val1// val1==8?
+ tnat.nz.and p7,p0=val2 // test NaT if val2
+(p7) br.cond.spnt.few recover// jump to recovery if val2 is NaT
+ ;;
+(p8) mov val1=val2 // val2 contains the value
+(p8) adds src=-16,src // correct position when 3 ahead
+(p9) adds src=-24,src // correct position when 4 ahead
+ ;;
+ sub ret0=src,orig // distance from origin
+ sub tmp=7,val1 // 7=8-1 because this strlen returns strlen+1
+ mov pr=saved_pr,0xffffffffffff0000
+ ;;
+ sub ret0=ret0,tmp // length=now - back -1
+ mov ar.pfs=saved_pfs // because of ar.ec, restore no matter what
+ br.ret.sptk.few rp // end of normal execution
+
+ //
+ // Outlined recovery code when speculation failed
+ //
+ // This time we don't use speculation and rely on the normal exception
+ // mechanism. that's why the loop is not as good as the previous one
+ // because read ahead is not possible
+ //
+ // XXX Fixme
+ // - today we restart from the beginning of the string instead
+ // of trying to continue where we left off.
+ //
+recover:
+ EX(.Lexit1, ld8 val=[base],8) // load the initial bytes
+ ;;
+ or val=val,mask // remask first bytes
+ cmp.eq p0,p6=r0,r0 // nullify first ld8 in loop
+ ;;
+ //
+ // ar.ec is still zero here
+ //
+2:
+ EX(.Lexit1, (p6) ld8 val=[base],8)
+ ;;
+ czx1.r val1=val // search 0 byte from right
+ ;;
+ cmp.eq p6,p0=8,val1 // val1==8 ?
+(p6) br.wtop.dptk.few 2b // loop until p6 == 0
+ ;;
+ sub ret0=base,orig // distance from base
+ sub tmp=7,val1 // 7=8-1 because this strlen returns strlen+1
+ mov pr=saved_pr,0xffffffffffff0000
+ ;;
+ sub ret0=ret0,tmp // length=now - back -1
+ mov ar.pfs=saved_pfs // because of ar.ec, restore no matter what
+	br.ret.sptk.few rp	// end of successful recovery code
+
+ //
+ // We failed even on the normal load (called from exception handler)
+ //
+.Lexit1:
+ mov ret0=0
+ mov pr=saved_pr,0xffffffffffff0000
+ mov ar.pfs=saved_pfs // because of ar.ec, restore no matter what
+ br.ret.sptk.few rp
+
+ .endp __strlen_user
diff --git a/arch/ia64/lib/strncpy_from_user.S b/arch/ia64/lib/strncpy_from_user.S
new file mode 100644
index 000000000..17f71f1a0
--- /dev/null
+++ b/arch/ia64/lib/strncpy_from_user.S
@@ -0,0 +1,53 @@
+/*
+ * Just like strncpy() except for the return value. If no fault occurs during
+ * the copying, the number of bytes copied is returned. If a fault occurs,
+ * -EFAULT is returned.
+ *
+ * Inputs:
+ * in0: address of destination buffer
+ * in1: address of string to be copied
+ * in2: length of buffer in bytes
+ * Outputs:
+ * r8: -EFAULT in case of fault or number of bytes copied if no fault
+ *
+ * Copyright (C) 1998, 1999 Hewlett-Packard Co
+ * Copyright (C) 1998, 1999 David Mosberger-Tang <davidm@hpl.hp.com>
+ */
+
+#define EX(x...) \
+99: x; \
+ .section __ex_table,"a"; \
+ data4 @gprel(99b); \
+ data4 .Lexit-99b; \
+ .previous
+
+ .text
+ .psr abi64
+ .psr lsb
+ .lsb
+
+ .align 32
+ .global __strncpy_from_user
+ .proc __strncpy_from_user
+__strncpy_from_user:
+ alloc r11=ar.pfs,3,0,0,0
+ mov r9=in1
+ add r10=in1,in2
+
+ // XXX braindead copy loop---this needs to be optimized
+.Loop1:
+ EX(ld1 r8=[in1],1)
+ ;;
+ st1 [in0]=r8,1
+ cmp.ltu p6,p0=in1,r10
+ ;;
+(p6) cmp.ne.and p6,p0=r8,r0
+ ;;
+(p6) br.cond.dpnt.few .Loop1
+
+1: sub r8=in1,r9 // length of string (including NUL character)
+.Lexit:
+ mov ar.pfs=r11
+ br.ret.sptk.few rp
+
+ .endp __strncpy_from_user
diff --git a/arch/ia64/lib/strnlen_user.S b/arch/ia64/lib/strnlen_user.S
new file mode 100644
index 000000000..c227a9003
--- /dev/null
+++ b/arch/ia64/lib/strnlen_user.S
@@ -0,0 +1,55 @@
+/*
+ * Returns 0 if an exception occurs before a NUL byte or the supplied
+ * limit (N) is reached, a value greater than N if the string is longer
+ * than the limit, else strlen+1.
+ *
+ * Inputs:
+ * in0: address of buffer
+ * in1: string length limit N
+ * Outputs:
+ * r8: 0 in case of fault, strlen(buffer)+1 otherwise
+ *
+ * Copyright (C) 1999 David Mosberger-Tang <davidm@hpl.hp.com>
+ */
+
+/* If a fault occurs, r8 gets set to -EFAULT and r9 gets cleared. */
+#define EX(x...) \
+ .section __ex_table,"a"; \
+ data4 @gprel(99f); \
+ data4 (.Lexit-99f)|1; \
+	.previous;			\
+99: x;
+
+ .text
+ .psr abi64
+ .psr lsb
+ .lsb
+
+ .align 32
+ .global __strnlen_user
+ .proc __strnlen_user
+__strnlen_user:
+ alloc r2=ar.pfs,2,0,0,0
+ mov r16=ar.lc // preserve ar.lc
+ add r3=-1,in1
+ ;;
+ mov ar.lc=r3
+ mov r9=0
+
+ // XXX braindead strlen loop---this needs to be optimized
+.Loop1:
+ EX(ld1 r8=[in0],1)
+ add r9=1,r9
+ ;;
+ cmp.eq p6,p0=r8,r0
+(p6) br.dpnt.few .Lexit
+ br.cloop.dptk.few .Loop1
+
+ add r9=1,in1 // NUL not found---return N+1
+ ;;
+.Lexit:
+ mov r8=r9
+ mov ar.lc=r16 // restore ar.lc
+ br.ret.sptk.few rp
+
+ .endp __strnlen_user