author     Ralf Baechle <ralf@linux-mips.org>    2000-02-23 00:40:54 +0000
committer  Ralf Baechle <ralf@linux-mips.org>    2000-02-23 00:40:54 +0000
commit     529c593ece216e4aaffd36bd940cb94f1fa63129 (patch)
tree       78f1c0b805f5656aa7b0417a043c5346f700a2cf /arch/ia64/lib
parent     0bd079751d25808d1972baee5c4eaa1db2227257 (diff)
Merge with 2.3.43. I did ignore all modifications to the qlogicisp.c
driver due to the Origin A64 hacks.
Diffstat (limited to 'arch/ia64/lib')
-rw-r--r--   arch/ia64/lib/Makefile              |  42
-rw-r--r--   arch/ia64/lib/checksum.c            | 110
-rw-r--r--   arch/ia64/lib/clear_page.S          |  42
-rw-r--r--   arch/ia64/lib/clear_user.S          | 224
-rw-r--r--   arch/ia64/lib/copy_page.S           |  87
-rw-r--r--   arch/ia64/lib/copy_user.S           |  71
-rw-r--r--   arch/ia64/lib/csum_partial_copy.c   | 165
-rw-r--r--   arch/ia64/lib/do_csum.S             | 230
-rw-r--r--   arch/ia64/lib/flush.S               |  37
-rw-r--r--   arch/ia64/lib/idiv.S                | 158
-rw-r--r--   arch/ia64/lib/memset.S              | 111
-rw-r--r--   arch/ia64/lib/strlen.S              | 197
-rw-r--r--   arch/ia64/lib/strlen_user.S         | 213
-rw-r--r--   arch/ia64/lib/strncpy_from_user.S   |  53
-rw-r--r--   arch/ia64/lib/strnlen_user.S        |  55

15 files changed, 1795 insertions(+), 0 deletions(-)
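
Several of the new files (checksum.c, csum_partial_copy.c, do_csum.S) compute the Internet checksum by accumulating a wide partial sum and then folding it down to 16 bits with end-around carry. As a reference only, here is a standalone C sketch of that fold, mirroring the from64to16() helper that checksum.c introduces below; the main() driver is purely illustrative and not part of the patch.

```c
#include <stdio.h>

/*
 * Fold a 64-bit partial sum down to a 16-bit Internet checksum,
 * propagating the carries, as the from64to16() helper added in
 * arch/ia64/lib/checksum.c does.
 */
static unsigned short from64to16(unsigned long long x)
{
	/* add up 32-bit words; the result fits in 33 bits */
	x = (x & 0xffffffff) + (x >> 32);
	/* add up 16-bit and 17-bit words; 17+carry bits remain */
	x = (x & 0xffff) + (x >> 16);
	/* keep folding until every carry has been absorbed */
	x = (x & 0xffff) + (x >> 16);
	x = (x & 0xffff) + (x >> 16);
	return (unsigned short)x;
}

int main(void)
{
	/* example: a partial sum with bits set above bit 16 */
	unsigned long long sum = 0x00000001ffff0001ULL;
	printf("folded: 0x%04x\n", from64to16(sum));
	return 0;
}
```

csum_tcpudp_magic() in checksum.c then returns the one's complement (~) of this folded value.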
diff --git a/arch/ia64/lib/Makefile b/arch/ia64/lib/Makefile new file mode 100644 index 000000000..8a9581747 --- /dev/null +++ b/arch/ia64/lib/Makefile @@ -0,0 +1,42 @@ +# +# Makefile for ia64-specific library routines.. +# + +.S.o: + $(CC) -D__ASSEMBLY__ $(AFLAGS) -traditional -c $< -o $@ + +OBJS = __divdi3.o __divsi3.o __udivdi3.o __udivsi3.o \ + __moddi3.o __modsi3.o __umoddi3.o __umodsi3.o \ + checksum.o clear_page.o csum_partial_copy.o copy_page.o \ + copy_user.o clear_user.o memset.o strncpy_from_user.o \ + strlen.o strlen_user.o strnlen_user.o \ + flush.o do_csum.o + +lib.a: $(OBJS) + $(AR) rcs lib.a $(OBJS) + +__divdi3.o: idiv.S + $(CC) $(AFLAGS) -c -o $@ $< + +__divsi3.o: idiv.S + $(CC) $(AFLAGS) -c -DSINGLE -c -o $@ $< + +__udivdi3.o: idiv.S + $(CC) $(AFLAGS) -c -DUNSIGNED -c -o $@ $< + +__udivsi3.o: idiv.S + $(CC) $(AFLAGS) -c -DUNSIGNED -DSINGLE -c -o $@ $< + +__moddi3.o: idiv.S + $(CC) $(AFLAGS) -c -DMODULO -c -o $@ $< + +__modsi3.o: idiv.S + $(CC) $(AFLAGS) -c -DMODULO -DSINGLE -c -o $@ $< + +__umoddi3.o: idiv.S + $(CC) $(AFLAGS) -c -DMODULO -DUNSIGNED -c -o $@ $< + +__umodsi3.o: idiv.S + $(CC) $(AFLAGS) -c -DMODULO -DUNSIGNED -DSINGLE -c -o $@ $< + +include $(TOPDIR)/Rules.make diff --git a/arch/ia64/lib/checksum.c b/arch/ia64/lib/checksum.c new file mode 100644 index 000000000..9c4a8af75 --- /dev/null +++ b/arch/ia64/lib/checksum.c @@ -0,0 +1,110 @@ +/* + * Network checksum routines + * + * Copyright (C) 1999 Hewlett-Packard Co + * Copyright (C) 1999 Stephane Eranian <eranian@hpl.hp.com> + * + * Most of the code coming from arch/alpha/lib/checksum.c + * + * This file contains network checksum routines that are better done + * in an architecture-specific manner due to speed.. + */ + +#include <linux/string.h> + +#include <asm/byteorder.h> + +static inline unsigned short +from64to16(unsigned long x) +{ + /* add up 32-bit words for 33 bits */ + x = (x & 0xffffffff) + (x >> 32); + /* add up 16-bit and 17-bit words for 17+c bits */ + x = (x & 0xffff) + (x >> 16); + /* add up 16-bit and 2-bit for 16+c bit */ + x = (x & 0xffff) + (x >> 16); + /* add up carry.. */ + x = (x & 0xffff) + (x >> 16); + return x; +} + +/* + * computes the checksum of the TCP/UDP pseudo-header + * returns a 16-bit checksum, already complemented. + */ +unsigned short int csum_tcpudp_magic(unsigned long saddr, + unsigned long daddr, + unsigned short len, + unsigned short proto, + unsigned int sum) +{ + return ~from64to16(saddr + daddr + sum + + ((unsigned long) ntohs(len) << 16) + + ((unsigned long) proto << 8)); +} + +unsigned int csum_tcpudp_nofold(unsigned long saddr, + unsigned long daddr, + unsigned short len, + unsigned short proto, + unsigned int sum) +{ + unsigned long result; + + result = (saddr + daddr + sum + + ((unsigned long) ntohs(len) << 16) + + ((unsigned long) proto << 8)); + + /* Fold down to 32-bits so we don't loose in the typedef-less + network stack. */ + /* 64 to 33 */ + result = (result & 0xffffffff) + (result >> 32); + /* 33 to 32 */ + result = (result & 0xffffffff) + (result >> 32); + return result; +} + +extern unsigned long do_csum(const unsigned char *, unsigned int, unsigned int); +extern unsigned long do_csum_c(const unsigned char *, unsigned int, unsigned int); + +/* + * This is a version of ip_compute_csum() optimized for IP headers, + * which always checksum on 4 octet boundaries. 
+ */ +unsigned short ip_fast_csum(unsigned char * iph, unsigned int ihl) +{ + return ~do_csum(iph,ihl*4,0); +} + +/* + * computes the checksum of a memory block at buff, length len, + * and adds in "sum" (32-bit) + * + * returns a 32-bit number suitable for feeding into itself + * or csum_tcpudp_magic + * + * this function must be called with even lengths, except + * for the last fragment, which may be odd + * + * it's best to have buff aligned on a 32-bit boundary + */ +unsigned int csum_partial(const unsigned char * buff, int len, unsigned int sum) +{ + unsigned long result = do_csum(buff, len, 0); + + /* add in old sum, and carry.. */ + result += sum; + /* 32+c bits -> 32 bits */ + result = (result & 0xffffffff) + (result >> 32); + return result; +} + + +/* + * this routine is used for miscellaneous IP-like checksums, mainly + * in icmp.c + */ +unsigned short ip_compute_csum(unsigned char * buff, int len) +{ + return ~do_csum(buff,len, 0); +} diff --git a/arch/ia64/lib/clear_page.S b/arch/ia64/lib/clear_page.S new file mode 100644 index 000000000..314311c5c --- /dev/null +++ b/arch/ia64/lib/clear_page.S @@ -0,0 +1,42 @@ +/* + * + * Optimized version of the standard clearpage() function + * + * Based on comments from ddd. Try not to overflow the write buffer. + * + * Inputs: + * in0: address of page + * + * Output: + * none + * + * Copyright (C) 1999 Hewlett-Packard Co + * Copyright (C) 1999 Stephane Eranian <eranian@hpl.hp.com> + * Copyright (C) 1999 David Mosberger-Tang <davidm@hpl.hp.com> + */ +#include <asm/page.h> + + .text + .psr abi64 + .psr lsb + .lsb + + .align 32 + .global clear_page + .proc clear_page +clear_page: + alloc r11=ar.pfs,1,0,0,0 + mov r16=ar.lc // slow + mov r17=PAGE_SIZE/32-1 // -1 = repeat/until + ;; + adds r18=16,in0 + mov ar.lc=r17 + ;; +1: stf.spill.nta [in0]=f0,32 + stf.spill.nta [r18]=f0,32 + br.cloop.dptk.few 1b + ;; + mov ar.lc=r16 // restore lc + br.ret.sptk.few rp + + .endp clear_page diff --git a/arch/ia64/lib/clear_user.S b/arch/ia64/lib/clear_user.S new file mode 100644 index 000000000..0db4a78f8 --- /dev/null +++ b/arch/ia64/lib/clear_user.S @@ -0,0 +1,224 @@ +/* + * This routine clears to zero a linear memory buffer in user space. + * + * Inputs: + * in0: address of buffer + * in1: length of buffer in bytes + * Outputs: + * r8: number of bytes that didn't get cleared due to a fault + * + * Copyright (C) 1998, 1999 Hewlett-Packard Co + * Copyright (C) 1999 Stephane Eranian <eranian@hpl.hp.com> + */ + +// +// arguments +// +#define buf r32 +#define len r33 + +// +// local registers +// +#define cnt r16 +#define buf2 r17 +#define saved_lc r18 +#define saved_pr r19 +#define saved_pfs r20 +#define tmp r21 +#define len2 r22 +#define len3 r23 + +// +// Theory of operations: +// - we check whether or not the buffer is small, i.e., less than 17 +// in which case we do the byte by byte loop. +// +// - Otherwise we go progressively from 1 byte store to 8byte store in +// the head part, the body is a 16byte store loop and we finish we the +// tail for the last 15 bytes. +// The good point about this breakdown is that the long buffer handling +// contains only 2 branches. +// +// The reason for not using shifting & masking for both the head and the +// tail is to stay semantically correct. This routine is not supposed +// to write bytes outside of the buffer. While most of the time this would +// be ok, we can't tolerate a mistake. 
A classical example is the case +// of multithreaded code were to the extra bytes touched is actually owned +// by another thread which runs concurrently to ours. Another, less likely, +// example is with device drivers where reading an I/O mapped location may +// have side effects (same thing for writing). +// + +// The label comes first because our store instruction contains a comma +// and confuse the preprocessor otherwise +// +#define EX(y,x...) \ + .section __ex_table,"a"; \ + data4 @gprel(99f); \ + data4 y-99f; \ + .previous; \ +99: x + + .text + .psr abi64 + .psr lsb + .lsb + + .align 32 + .global __do_clear_user + .proc __do_clear_user + +__do_clear_user: + alloc saved_pfs=ar.pfs,2,0,0,0 + cmp.eq p6,p0=r0,len // check for zero length + mov saved_lc=ar.lc // preserve ar.lc (slow) + ;; // avoid WAW on CFM + adds tmp=-1,len // br.ctop is repeat/until + mov ret0=len // return value is length at this point +(p6) br.ret.spnt.few rp + ;; + cmp.lt p6,p0=16,len // if len > 16 then long memset + mov ar.lc=tmp // initialize lc for small count +(p6) br.cond.dptk.few long_do_clear + ;; // WAR on ar.lc + // + // worst case 16 cyles, avg 8 cycles + // + // We could have played with the predicates to use the extra + // M slot for 2 stores/iteration but the cost the initialization + // the various counters compared to how long the loop is supposed + // to last on average does not make this solution viable. + // +1: + EX( .Lexit1, st1 [buf]=r0,1 ) + adds len=-1,len // countdown length using len + br.cloop.dptk.few 1b + ;; // avoid RAW on ar.lc + // + // .Lexit4: comes from byte by byte loop + // len contains bytes left +.Lexit1: + mov ret0=len // faster than using ar.lc + mov ar.lc=saved_lc + br.ret.sptk.few rp // end of short clear_user + + + // + // At this point we know we have more than 16 bytes to copy + // so we focus on alignment (no branches required) + // + // The use of len/len2 for countdown of the number of bytes left + // instead of ret0 is due to the fact that the exception code + // changes the values of r8. + // +long_do_clear: + tbit.nz p6,p0=buf,0 // odd alignment (for long_do_clear) + ;; + EX( .Lexit3, (p6) st1 [buf]=r0,1 ) // 1-byte aligned +(p6) adds len=-1,len;; // sync because buf is modified + tbit.nz p6,p0=buf,1 + ;; + EX( .Lexit3, (p6) st2 [buf]=r0,2 ) // 2-byte aligned +(p6) adds len=-2,len;; + tbit.nz p6,p0=buf,2 + ;; + EX( .Lexit3, (p6) st4 [buf]=r0,4 ) // 4-byte aligned +(p6) adds len=-4,len;; + tbit.nz p6,p0=buf,3 + ;; + EX( .Lexit3, (p6) st8 [buf]=r0,8 ) // 8-byte aligned +(p6) adds len=-8,len;; + shr.u cnt=len,4 // number of 128-bit (2x64bit) words + ;; + cmp.eq p6,p0=r0,cnt + adds tmp=-1,cnt +(p6) br.cond.dpnt.few .dotail // we have less than 16 bytes left + ;; + adds buf2=8,buf // setup second base pointer + mov ar.lc=tmp + ;; + + // + // 16bytes/iteration core loop + // + // The second store can never generate a fault because + // we come into the loop only when we are 16-byte aligned. + // This means that if we cross a page then it will always be + // in the first store and never in the second. + // + // + // We need to keep track of the remaining length. A possible (optimistic) + // way would be to ue ar.lc and derive how many byte were left by + // doing : left= 16*ar.lc + 16. this would avoid the addition at + // every iteration. + // However we need to keep the synchronization point. A template + // M;;MB does not exist and thus we can keep the addition at no + // extra cycle cost (use a nop slot anyway). 
It also simplifies the + // (unlikely) error recovery code + // + +2: + + EX(.Lexit3, st8 [buf]=r0,16 ) + ;; // needed to get len correct when error + st8 [buf2]=r0,16 + adds len=-16,len + br.cloop.dptk.few 2b + ;; + mov ar.lc=saved_lc + // + // tail correction based on len only + // + // We alternate the use of len3,len2 to allow parallelism and correct + // error handling. We also reuse p6/p7 to return correct value. + // The addition of len2/len3 does not cost anything more compared to + // the regular memset as we had empty slots. + // +.dotail: + mov len2=len // for parallelization of error handling + mov len3=len + tbit.nz p6,p0=len,3 + ;; + EX( .Lexit2, (p6) st8 [buf]=r0,8 ) // at least 8 bytes +(p6) adds len3=-8,len2 + tbit.nz p7,p6=len,2 + ;; + EX( .Lexit2, (p7) st4 [buf]=r0,4 ) // at least 4 bytes +(p7) adds len2=-4,len3 + tbit.nz p6,p7=len,1 + ;; + EX( .Lexit2, (p6) st2 [buf]=r0,2 ) // at least 2 bytes +(p6) adds len3=-2,len2 + tbit.nz p7,p6=len,0 + ;; + EX( .Lexit2, (p7) st1 [buf]=r0 ) // only 1 byte left + mov ret0=r0 // success + br.ret.dptk.few rp // end of most likely path + + // + // Outlined error handling code + // + + // + // .Lexit3: comes from core loop, need restore pr/lc + // len contains bytes left + // + // + // .Lexit2: + // if p6 -> coming from st8 or st2 : len2 contains what's left + // if p7 -> coming from st4 or st1 : len3 contains what's left + // We must restore lc/pr even though might not have been used. +.Lexit2: +(p6) mov len=len2 +(p7) mov len=len3 + ;; + // + // .Lexit4: comes from head, need not restore pr/lc + // len contains bytes left + // +.Lexit3: + mov ret0=len + mov ar.lc=saved_lc + br.ret.dptk.few rp + .endp diff --git a/arch/ia64/lib/copy_page.S b/arch/ia64/lib/copy_page.S new file mode 100644 index 000000000..0a956e5a2 --- /dev/null +++ b/arch/ia64/lib/copy_page.S @@ -0,0 +1,87 @@ +/* + * + * Optimized version of the standard copy_page() function + * + * Based on comments from ddd. Try not to overflow write buffer. + * + * Inputs: + * in0: address of target page + * in1: address of source page + * Output: + * no return value + * + * Copyright (C) 1999 Hewlett-Packard Co + * Copyright (C) 1999 Stephane Eranian <eranian@hpl.hp.com> + */ +#include <asm/page.h> + +#define lcount r16 +#define saved_pr r17 +#define saved_lc r18 +#define saved_pfs r19 +#define src1 r20 +#define src2 r21 +#define tgt1 r22 +#define tgt2 r23 + + .text + .psr abi64 + .psr lsb + .lsb + + .align 32 + .global copy_page + .proc copy_page + +copy_page: + alloc saved_pfs=ar.pfs,10,0,0,8 // we need 6 roatating (8 minimum) + // + 2 input + + .rotr t1[4], t2[4] // our 2 pipelines with depth of 4 each + + mov saved_lc=ar.lc // save ar.lc ahead of time + mov saved_pr=pr // rotating predicates are preserved + // resgisters we must save. + mov src1=in1 // initialize 1st stream source + adds src2=8,in1 // initialize 2nd stream source + mov lcount=PAGE_SIZE/16-1 // as many 16bytes as there are on a page + // -1 is because br.ctop is repeat/until + + adds tgt2=8,in0 // initialize 2nd stream target + mov tgt1=in0 // initialize 1st stream target + ;; + mov pr.rot=1<<16 // pr16=1 & pr[17-63]=0 , 63 not modified + + mov ar.lc=lcount // set loop counter + mov ar.ec=4 // ar.ec must match pipeline depth + ;; + + // We need to preload the n-1 stages of the pipeline (n=depth). + // We do this during the "prolog" of the loop: we execute + // n-1 times the "load" bundle. Then both loads & stores are + // enabled until we reach the end of the last word of the page + // on the load side. 
Then, we enter the epilogue (controlled by ec) + // where we just do the stores and no loads n-1 times : drain the pipe. + // + // The initialization of the prolog is done via the predicate registers: + // the choice of pr19 DEPENDS on the depth of the pipeline (n). + // When lc > 0 pr63=1 and it is fed back into pr16 and pr16-pr62 + // are then shifted right at every iteration, + // Thus by initializing pr16=1 and pr17-19=0 (19=16+4-1) before the loop + // we get pr19=1 after 4 iterations (n in our case). + // +1: // engage loop now, let the magic happen... +(p16) ld8 t1[0]=[src1],16 // new data on top of pipeline in 1st stream +(p16) ld8 t2[0]=[src2],16 // new data on top of pipeline in 2nd stream + nop.i 0x0 +(p19) st8 [tgt1]=t1[3],16 // store top of 1st pipeline +(p19) st8 [tgt2]=t2[3],16 // store top of 2nd pipeline + br.ctop.dptk.few 1b // once lc==0, ec-- & p16=0 + // stores but no loads anymore + ;; + mov pr=saved_pr,0xffffffffffff0000 // restore predicates + mov ar.pfs=saved_pfs // restore ar.ec + mov ar.lc=saved_lc // restore saved lc + br.ret.sptk.few rp // bye... + + .endp copy_page diff --git a/arch/ia64/lib/copy_user.S b/arch/ia64/lib/copy_user.S new file mode 100644 index 000000000..03a540a80 --- /dev/null +++ b/arch/ia64/lib/copy_user.S @@ -0,0 +1,71 @@ +/* + * This routine copies a linear memory buffer across the user/kernel boundary. When + * reading a byte from the source causes a fault, the remainder of the destination + * buffer is zeroed out. Note that this can happen only when copying from user + * to kernel memory and we do this to absolutely guarantee that the + * kernel doesn't operate on random data. + * + * This file is derived from arch/alpha/lib/copy_user.S. + * + * Inputs: + * in0: address of destination buffer + * in1: address of source buffer + * in2: length of buffer in bytes + * Outputs: + * r8: number of bytes that didn't get copied due to a fault + * + * Copyright (C) 1999 Hewlett-Packard Co + * Copyright (C) 1998, 1999 David Mosberger-Tang <davidm@hpl.hp.com> + */ + +#define EXI(x...) \ +99: x; \ + .section __ex_table,"a"; \ + data4 @gprel(99b); \ + data4 .Lexit_in-99b; \ + .previous + +#define EXO(x...) 
\ +99: x; \ + .section __ex_table,"a"; \ + data4 @gprel(99b); \ + data4 .Lexit_out-99b; \ + .previous + + .text + .psr abi64 + .psr lsb + .lsb + + .align 32 + .global __copy_user + .proc __copy_user +__copy_user: + alloc r10=ar.pfs,3,0,0,0 + mov r9=ar.lc // save ar.lc + mov ar.lc=in2 // set ar.lc to length of buffer + br.sptk.few .Lentr + + // XXX braindead copy loop---this needs to be optimized +.Loop1: + EXI(ld1 r8=[in1],1) + ;; + EXO(st1 [in0]=r8,1) +.Lentr: br.cloop.dptk.few .Loop1 // repeat unless ar.lc--==0 + ;; // avoid RAW on ar.lc +.Lexit_out: + mov r8=ar.lc // return how many bytes we _didn't_ copy + mov ar.lc=r9 + br.ret.sptk.few rp + +.Lexit_in: + // clear the remainder of the buffer: + mov r8=ar.lc // return how many bytes we _didn't_ copy +.Loop2: + st1 [in0]=r0,1 // this cannot fault because we get here only on user->kernel copies + br.cloop.dptk.few .Loop2 + ;; // avoid RAW on ar.lc + mov ar.lc=r9 + br.ret.sptk.few rp + + .endp __copy_user diff --git a/arch/ia64/lib/csum_partial_copy.c b/arch/ia64/lib/csum_partial_copy.c new file mode 100644 index 000000000..d09f11e21 --- /dev/null +++ b/arch/ia64/lib/csum_partial_copy.c @@ -0,0 +1,165 @@ +/* + * Network Checksum & Copy routine + * + * Copyright (C) 1999 Hewlett-Packard Co + * Copyright (C) 1999 Stephane Eranian <eranian@hpl.hp.com> + * + * Most of the code has been imported from Linux/Alpha + */ + +#include <linux/types.h> +#include <linux/string.h> + +#include <asm/uaccess.h> + +/* + * XXX Fixme: those 2 inlines are meant for debugging and will go away + */ +static inline unsigned +short from64to16(unsigned long x) +{ + /* add up 32-bit words for 33 bits */ + x = (x & 0xffffffff) + (x >> 32); + /* add up 16-bit and 17-bit words for 17+c bits */ + x = (x & 0xffff) + (x >> 16); + /* add up 16-bit and 2-bit for 16+c bit */ + x = (x & 0xffff) + (x >> 16); + /* add up carry.. */ + x = (x & 0xffff) + (x >> 16); + return x; +} + +static inline +unsigned long do_csum_c(const unsigned char * buff, int len, unsigned int psum) +{ + int odd, count; + unsigned long result = (unsigned long)psum; + + if (len <= 0) + goto out; + odd = 1 & (unsigned long) buff; + if (odd) { + result = *buff << 8; + len--; + buff++; + } + count = len >> 1; /* nr of 16-bit words.. */ + if (count) { + if (2 & (unsigned long) buff) { + result += *(unsigned short *) buff; + count--; + len -= 2; + buff += 2; + } + count >>= 1; /* nr of 32-bit words.. */ + if (count) { + if (4 & (unsigned long) buff) { + result += *(unsigned int *) buff; + count--; + len -= 4; + buff += 4; + } + count >>= 1; /* nr of 64-bit words.. */ + if (count) { + unsigned long carry = 0; + do { + unsigned long w = *(unsigned long *) buff; + count--; + buff += 8; + result += carry; + result += w; + carry = (w > result); + } while (count); + result += carry; + result = (result & 0xffffffff) + (result >> 32); + } + if (len & 4) { + result += *(unsigned int *) buff; + buff += 4; + } + } + if (len & 2) { + result += *(unsigned short *) buff; + buff += 2; + } + } + if (len & 1) + result += *buff; + + result = from64to16(result); + + if (odd) + result = ((result >> 8) & 0xff) | ((result & 0xff) << 8); + +out: + return result; +} + +/* + * XXX Fixme + * + * This is very ugly but temporary. THIS NEEDS SERIOUS ENHANCEMENTS. + * But it's very tricky to get right even in C. 
+ */ +extern unsigned long do_csum(const unsigned char *, int); + +static unsigned int +do_csum_partial_copy_from_user (const char *src, char *dst, int len, + unsigned int psum, int *errp) +{ + const unsigned char *psrc = src; + unsigned long result; + int cplen = len; + int r = 0; + + /* XXX Fixme + * for now we separate the copy from checksum for obvious + * alignment difficulties. Look at the Alpha code and you'll be + * scared. + */ + + while ( cplen-- ) r |=__get_user(*dst++,psrc++); + + if ( r && errp ) *errp = r; + + result = do_csum(src, len); + + /* add in old sum, and carry.. */ + result += psum; + /* 32+c bits -> 32 bits */ + result = (result & 0xffffffff) + (result >> 32); + return result; +} + +unsigned int +csum_partial_copy_from_user(const char *src, char *dst, int len, + unsigned int sum, int *errp) +{ + if (!access_ok(src, len, VERIFY_READ)) { + *errp = -EFAULT; + memset(dst, 0, len); + return sum; + } + + return do_csum_partial_copy_from_user(src, dst, len, sum, errp); +} + +unsigned int +csum_partial_copy_nocheck(const char *src, char *dst, int len, unsigned int sum) +{ + return do_csum_partial_copy_from_user(src, dst, len, sum, NULL); +} + +unsigned int +csum_partial_copy (const char *src, char *dst, int len, unsigned int sum) +{ + unsigned int ret; + int error = 0; + + ret = do_csum_partial_copy_from_user(src, dst, len, sum, &error); + if (error) + printk("csum_partial_copy_old(): tell mingo to convert me!\n"); + + return ret; +} + diff --git a/arch/ia64/lib/do_csum.S b/arch/ia64/lib/do_csum.S new file mode 100644 index 000000000..d8174f10a --- /dev/null +++ b/arch/ia64/lib/do_csum.S @@ -0,0 +1,230 @@ +/* + * + * Optmized version of the standard do_csum() function + * + * Return: a 64bit quantity containing the 16bit Internet checksum + * + * Inputs: + * in0: address of buffer to checksum (char *) + * in1: length of the buffer (int) + * + * Copyright (C) 1999 Hewlett-Packard Co + * Copyright (C) 1999 Stephane Eranian <eranian@hpl.hp.com> + * + */ + +// +// Theory of operations: +// The goal is to go as quickly as possible to the point where +// we can checksum 8 bytes/loop. Before reaching that point we must +// take care of incorrect alignment of first byte. +// +// The code hereafter also takes care of the "tail" part of the buffer +// before entering the core loop, if any. The checksum is a sum so it +// allows us to commute operations. So we do do the "head" and "tail" +// first to finish at full speed in the body. Once we get the head and +// tail values, we feed them into the pipeline, very handy initialization. +// +// Of course we deal with the special case where the whole buffer fits +// into one 8 byte word. In this case we have only one entry in the pipeline. +// +// We use a (3+1)-stage pipeline in the loop to account for possible +// load latency and also to accomodate for head and tail. +// +// The end of the function deals with folding the checksum from 64bits +// down to 16bits taking care of the carry. +// +// This version avoids synchronization in the core loop by also using a +// pipeline for the accumulation of the checksum in result[]. 
+// +// p[] +// |---| +// 0| | r32 : new value loaded in pipeline +// |---| +// 1| | r33 : in transit data +// |---| +// 2| | r34 : current value to add to checksum +// |---| +// 3| | r35 : previous value added to checksum (previous iteration) +// |---| +// +// result[] +// |---| +// 0| | r36 : new checksum +// |---| +// 1| | r37 : previous value of checksum +// |---| +// 2| | r38 : final checksum when out of the loop (after 2 epilogue rots) +// |---| +// +// +// NOT YET DONE: +// - Take advantage of the MMI bandwidth to load more than 8byte per loop +// iteration +// - use the lfetch instruction to augment the chances of the data being in +// the cache when we need it. +// - Maybe another algorithm which would take care of the folding at the +// end in a different manner +// - Work with people more knowledgeable than me on the network stack +// to figure out if we could not split the function depending on the +// type of packet or alignment we get. Like the ip_fast_csum() routine +// where we know we have at least 20bytes worth of data to checksum. +// - Look at RFCs about checksums to see whether or not we can do better +// +// - Do a better job of handling small packets. +// +#define saved_pfs r11 +#define hmask r16 +#define tmask r17 +#define first r18 +#define firstval r19 +#define firstoff r20 +#define last r21 +#define lastval r22 +#define lastoff r23 +#define saved_lc r24 +#define saved_pr r25 +#define tmp1 r26 +#define tmp2 r27 +#define tmp3 r28 +#define carry r29 + +#define buf in0 +#define len in1 + + + .text + .psr abi64 + .psr lsb + .lsb + +// unsigned long do_csum(unsigned char *buf,int len) + + .align 32 + .global do_csum + .proc do_csum +do_csum: + alloc saved_pfs=ar.pfs,2,8,0,8 + + .rotr p[4], result[3] + mov ret0=r0 // in case we have zero length + cmp4.lt p0,p6=r0,len // check for zero length or negative (32bit len) + ;; // avoid WAW on CFM + mov tmp3=0x7 // a temporary mask/value + add tmp1=buf,len // last byte's address +(p6) br.ret.spnt.few rp // return if true (hope we can avoid that) + + and firstoff=7,buf // how many bytes off for first element + tbit.nz p10,p0=buf,0 // is buf an odd address ? + mov hmask=-1 // intialize head mask + ;; + + andcm first=buf,tmp3 // 8byte aligned down address of first element + mov tmask=-1 // initialize tail mask + adds tmp2=-1,tmp1 // last-1 + ;; + and lastoff=7,tmp1 // how many bytes off for last element + andcm last=tmp2,tmp3 // address of word containing last byte + mov saved_pr=pr // preserve predicates (rotation) + ;; + sub tmp3=last,first // tmp3=distance from first to last + cmp.eq p8,p9=last,first // everything fits in one word ? + sub tmp1=8,lastoff // complement to lastoff + + ld8 firstval=[first],8 // load,ahead of time, "first" word + shl tmp2=firstoff,3 // number of bits + ;; + and tmp1=7, tmp1 // make sure that if tmp1==8 -> tmp1=0 + +(p9) ld8 lastval=[last] // load,ahead of time, "last" word, if needed +(p8) mov lastval=r0 // we don't need lastval if first==last + mov result[1]=r0 // initialize result + ;; + + shl tmp1=tmp1,3 // number of bits + shl hmask=hmask,tmp2 // build head mask, mask off [0,firstoff[ + ;; + shr.u tmask=tmask,tmp1 // build tail mask, mask off ]8,lastoff] + mov saved_lc=ar.lc // save lc + ;; +(p8) and hmask=hmask,tmask // apply tail mask to head mask if 1 word only +(p9) and p[1]=lastval,tmask // mask last it as appropriate + shr.u tmp3=tmp3,3 // we do 8 bytes per loop + ;; + cmp.lt p6,p7=2,tmp3 // tmp3 > 2 ? 
+ and p[2]=firstval,hmask // and mask it as appropriate + add tmp1=-2,tmp3 // -2 = -1 (br.ctop) -1 (last-first) + ;; + // XXX Fixme: not very nice initialization here + // + // Setup loop control registers: + // + // tmp3=0 (1 word) : lc=0, ec=2, p16=F + // tmp3=1 (2 words) : lc=0, ec=3, p16=F + // tmp3=2 (3 words) : lc=0, ec=4, p16=T + // tmp3>2 (4 or more): lc=tmp3-2, ec=4, p16=T + // + cmp.eq p8,p9=r0,tmp3 // tmp3 == 0 ? +(p6) mov ar.lc=tmp1 +(p7) mov ar.lc=0 + ;; + cmp.lt p6,p7=1,tmp3 // tmp3 > 1 ? +(p8) mov ar.ec=2 // we need the extra rotation on result[] +(p9) mov ar.ec=3 // hard not to set it twice sometimes + ;; + mov carry=r0 // initialize carry +(p6) mov ar.ec=4 +(p6) mov pr.rot=0xffffffffffff0000 // p16=T, p18=T + + cmp.ne p8,p0=r0,r0 // p8 is false + mov p[3]=r0 // make sure first compare fails +(p7) mov pr.rot=0xfffffffffffe0000 // p16=F, p18=T + ;; +1: +(p16) ld8 p[0]=[first],8 // load next +(p8) adds carry=1,carry // add carry on prev_prev_value +(p18) add result[0]=result[1],p[2] // new_res = prev_res + cur_val + cmp.ltu p8,p0=result[1],p[3] // p8= prev_result < prev_val + br.ctop.dptk.few 1b // loop until lc--==0 + ;; // RAW on carry when loop exits + (p8) adds carry=1,carry;; // correct for carry on prev_value + add result[2]=carry,result[2];; // add carry to final result + cmp.ltu p6,p7=result[2], carry // check for new carry + ;; +(p6) adds result[2]=1,result[1] // correct if required + movl tmp3=0xffffffff + ;; + // XXX Fixme + // + // now fold 64 into 16 bits taking care of carry + // that's not very good because it has lots of sequentiality + // + and tmp1=result[2],tmp3 + shr.u tmp2=result[2],32 + ;; + add result[2]=tmp1,tmp2 + shr.u tmp3=tmp3,16 + ;; + and tmp1=result[2],tmp3 + shr.u tmp2=result[2],16 + ;; + add result[2]=tmp1,tmp2 + ;; + and tmp1=result[2],tmp3 + shr.u tmp2=result[2],16 + ;; + add result[2]=tmp1,tmp2 + ;; + and tmp1=result[2],tmp3 + shr.u tmp2=result[2],16 + ;; + add ret0=tmp1,tmp2 + mov pr=saved_pr,0xffffffffffff0000 + ;; + // if buf was odd then swap bytes + mov ar.pfs=saved_pfs // restore ar.ec +(p10) mux1 ret0=ret0,@rev // reverse word + ;; + mov ar.lc=saved_lc +(p10) shr.u ret0=ret0,64-16 // + shift back to position = swap bytes + br.ret.sptk.few rp diff --git a/arch/ia64/lib/flush.S b/arch/ia64/lib/flush.S new file mode 100644 index 000000000..0195ae5f5 --- /dev/null +++ b/arch/ia64/lib/flush.S @@ -0,0 +1,37 @@ +/* + * Cache flushing routines. + * + * Copyright (C) 1999 Hewlett-Packard Co + * Copyright (C) 1999 David Mosberger-Tang <davidm@hpl.hp.com> + */ +#include <asm/page.h> + + .text + .psr abi64 + .psr lsb + .lsb + + .align 16 + .global ia64_flush_icache_page + .proc ia64_flush_icache_page +ia64_flush_icache_page: + alloc r2=ar.pfs,1,0,0,0 + mov r3=ar.lc // save ar.lc + mov r8=PAGE_SIZE/64-1 // repeat/until loop + ;; + mov ar.lc=r8 + add r8=32,in0 + ;; +.Loop1: fc in0 // issuable on M0 only + add in0=64,in0 + fc r8 + add r8=64,r8 + br.cloop.sptk.few .Loop1 + ;; + sync.i + ;; + srlz.i + ;; + mov ar.lc=r3 // restore ar.lc + br.ret.sptk.few rp + .endp ia64_flush_icache_page diff --git a/arch/ia64/lib/idiv.S b/arch/ia64/lib/idiv.S new file mode 100644 index 000000000..a12097c94 --- /dev/null +++ b/arch/ia64/lib/idiv.S @@ -0,0 +1,158 @@ +/* + * Integer division routine. + * + * Copyright (C) 1999 Hewlett-Packard Co + * Copyright (C) 1999 David Mosberger-Tang <davidm@hpl.hp.com> + */ +/* Simple integer division. It uses the straight forward division + algorithm. 
This may not be the absolutely fastest way to do it, + but it's not horrible either. According to ski, the worst case + scenario of dividing 0xffffffffffffffff by 1 takes 133 cycles. + + An alternative would be to use an algorithm similar to the + floating point division algorithm (Newton-Raphson iteration), + but that approach is rather tricky (one has to be very careful + to get the last bit right...). + + While this algorithm is straight-forward, it does use a couple + of neat ia-64 specific tricks: + + - it uses the floating point unit to determine the initial + shift amount (shift = floor(ld(x)) - floor(ld(y))) + + - it uses predication to avoid a branch in the case where + x < y (this is what p8 is used for) + + - it uses rotating registers and the br.ctop branch to + implement a software-pipelined loop that's unrolled + twice (without any code expansion!) + + - the code is relatively well scheduled to avoid unnecessary + nops while maximizing parallelism +*/ + +#include <asm/break.h> + + .text + .psr abi64 +#ifdef __BIG_ENDIAN__ + .psr msb + .msb +#else + .psr lsb + .lsb +#endif + +#ifdef MODULO +# define OP mod +# define Q r9 +# define R r8 +#else +# define OP div +# define Q r8 +# define R r9 +#endif + +#ifdef SINGLE +# define PREC si +#else +# define PREC di +#endif + +#ifdef UNSIGNED +# define SGN u +# define INT_TO_FP(a,b) fma.s0 a=b,f1,f0 +# define FP_TO_INT(a,b) fcvt.fxu.trunc.s0 a=b +#else +# define SGN +# define INT_TO_FP(a,b) fcvt.xf a=b +# define FP_TO_INT(a,b) fcvt.fx.trunc.s0 a=b +#endif + +#define PASTE1(a,b) a##b +#define PASTE(a,b) PASTE1(a,b) +#define NAME PASTE(PASTE(__,SGN),PASTE(OP,PASTE(PREC,3))) + + .align 32 + .global NAME + .proc NAME +NAME: + + alloc r2=ar.pfs,2,6,0,8 + mov r18=pr +#ifdef SINGLE +# ifdef UNSIGNED + zxt4 in0=in0 + zxt4 in1=in1 +# else + sxt4 in0=in0 + sxt4 in1=in1 +# endif + ;; +#endif + +#ifndef UNSIGNED + cmp.lt p6,p0=in0,r0 // x negative? + cmp.lt p7,p0=in1,r0 // y negative? + ;; +(p6) sub in0=r0,in0 // make x positive +(p7) sub in1=r0,in1 // ditto for y + ;; +#endif + + setf.sig f8=in0 + mov r3=ar.lc // save ar.lc + setf.sig f9=in1 + ;; + mov Q=0 // initialize q + mov R=in0 // stash away x in a static register + mov r16=1 // r16 = 1 + INT_TO_FP(f8,f8) + cmp.eq p8,p0=0,in0 // x==0? + cmp.eq p9,p0=0,in1 // y==0? + ;; + INT_TO_FP(f9,f9) +(p8) br.dpnt.few .L3 +(p9) break __IA64_BREAK_KDB // attempted division by zero (should never happen) + mov ar.ec=r0 // epilogue count = 0 + ;; + getf.exp r14=f8 // r14 = exponent of x + getf.exp r15=f9 // r15 = exponent of y + mov ar.lc=r0 // loop count = 0 + ;; + sub r17=r14,r15 // r17 = (exp of x - exp y) = shift amount + cmp.ge p8,p0=r14,r15 + ;; + + .rotr y[2], mask[2] // in0 and in1 may no longer be valid after + // the first write to a rotating register! 
+ +(p8) shl y[1]=in1,r17 // y[1] = y<<shift +(p8) shl mask[1]=r16,r17 // mask[1] = 1<<shift + +(p8) mov ar.lc=r17 // loop count = r17 + ;; +.L1: +(p8) cmp.geu.unc p9,p0=R,y[1]// p9 = (x >= y[1]) +(p8) shr.u mask[0]=mask[1],1 // prepare mask[0] and y[0] for next +(p8) shr.u y[0]=y[1],1 // iteration + ;; +(p9) sub R=R,y[1] // if (x >= y[1]), subtract y[1] from x +(p9) add Q=Q,mask[1] // and set corresponding bit in q (Q) + br.ctop.dptk.few .L1 // repeated unless ar.lc-- == 0 + ;; +.L2: +#ifndef UNSIGNED +# ifdef MODULO +(p6) sub R=r0,R // set sign of remainder according to x +# else +(p6) sub Q=r0,Q // set sign of quotient + ;; +(p7) sub Q=r0,Q +# endif +#endif +.L3: + mov ar.pfs=r2 // restore ar.pfs + mov ar.lc=r3 // restore ar.lc + mov pr=r18,0xffffffffffff0000 // restore p16-p63 + br.ret.sptk.few rp diff --git a/arch/ia64/lib/memset.S b/arch/ia64/lib/memset.S new file mode 100644 index 000000000..595720a2d --- /dev/null +++ b/arch/ia64/lib/memset.S @@ -0,0 +1,111 @@ +/* + * + * Optimized version of the standard memset() function + * + * Return: none + * + * + * Inputs: + * in0: address of buffer + * in1: byte value to use for storing + * in2: length of the buffer + * + * Copyright (C) 1999 Hewlett-Packard Co + * Copyright (C) 1999 Stephane Eranian <eranian@hpl.hp.com> + */ + + +// arguments +// +#define buf r32 +#define val r33 +#define len r34 + +// +// local registers +// +#define saved_pfs r14 +#define cnt r18 +#define buf2 r19 +#define saved_lc r20 +#define saved_pr r21 +#define tmp r22 + + .text + .psr abi64 + .psr lsb + + .align 16 + .global memset + .proc memset + +memset: + alloc saved_pfs=ar.pfs,3,0,0,0 // cnt is sink here + cmp.eq p8,p0=r0,len // check for zero length + mov saved_lc=ar.lc // preserve ar.lc (slow) + ;; + adds tmp=-1,len // br.ctop is repeat/until + tbit.nz p6,p0=buf,0 // odd alignment +(p8) br.ret.spnt.few rp + + cmp.lt p7,p0=16,len // if len > 16 then long memset + mux1 val=val,@brcst // prepare value +(p7) br.cond.dptk.few long_memset + ;; + mov ar.lc=tmp // initialize lc for small count + ;; // avoid RAW and WAW on ar.lc +1: // worst case 15 cyles, avg 8 cycles + st1 [buf]=val,1 + br.cloop.dptk.few 1b + ;; // avoid RAW on ar.lc + mov ar.lc=saved_lc + mov ar.pfs=saved_pfs + br.ret.sptk.few rp // end of short memset + + // at this point we know we have more than 16 bytes to copy + // so we focus on alignment +long_memset: +(p6) st1 [buf]=val,1 // 1-byte aligned +(p6) adds len=-1,len;; // sync because buf is modified + tbit.nz p6,p0=buf,1 + ;; +(p6) st2 [buf]=val,2 // 2-byte aligned +(p6) adds len=-2,len;; + tbit.nz p6,p0=buf,2 + ;; +(p6) st4 [buf]=val,4 // 4-byte aligned +(p6) adds len=-4,len;; + tbit.nz p6,p0=buf,3 + ;; +(p6) st8 [buf]=val,8 // 8-byte aligned +(p6) adds len=-8,len;; + shr.u cnt=len,4 // number of 128-bit (2x64bit) words + ;; + cmp.eq p6,p0=r0,cnt + adds tmp=-1,cnt +(p6) br.cond.dpnt.few .dotail // we have less than 16 bytes left + ;; + adds buf2=8,buf // setup second base pointer + mov ar.lc=tmp + ;; +2: // 16bytes/iteration + st8 [buf]=val,16 + st8 [buf2]=val,16 + br.cloop.dptk.few 2b + ;; +.dotail: // tail correction based on len only + tbit.nz p6,p0=len,3 + ;; +(p6) st8 [buf]=val,8 // at least 8 bytes + tbit.nz p6,p0=len,2 + ;; +(p6) st4 [buf]=val,4 // at least 4 bytes + tbit.nz p6,p0=len,1 + ;; +(p6) st2 [buf]=val,2 // at least 2 bytes + tbit.nz p6,p0=len,0 + mov ar.lc=saved_lc + ;; +(p6) st1 [buf]=val // only 1 byte left + br.ret.dptk.few rp + .endp diff --git a/arch/ia64/lib/strlen.S b/arch/ia64/lib/strlen.S new file mode 100644 index 
000000000..3062716b1 --- /dev/null +++ b/arch/ia64/lib/strlen.S @@ -0,0 +1,197 @@ +/* + * + * Optimized version of the standard strlen() function + * + * + * Inputs: + * in0 address of string + * + * Outputs: + * ret0 the number of characters in the string (0 if empty string) + * does not count the \0 + * + * Copyright (C) 1999 Hewlett-Packard Co + * Copyright (C) 1999 Stephane Eranian <eranian@hpl.hp.com> + * + * 09/24/99 S.Eranian add speculation recovery code + */ + +// +// +// This is an enhanced version of the basic strlen. it includes a combination +// of compute zero index (czx), parallel comparisons, speculative loads and +// loop unroll using rotating registers. +// +// General Ideas about the algorithm: +// The goal is to look at the string in chunks of 8 bytes. +// so we need to do a few extra checks at the beginning because the +// string may not be 8-byte aligned. In this case we load the 8byte +// quantity which includes the start of the string and mask the unused +// bytes with 0xff to avoid confusing czx. +// We use speculative loads and software pipelining to hide memory +// latency and do read ahead safely. This way we defer any exception. +// +// Because we don't want the kernel to be relying on particular +// settings of the DCR register, we provide recovery code in case +// speculation fails. The recovery code is going to "redo" the work using +// only normal loads. If we still get a fault then we generate a +// kernel panic. Otherwise we return the strlen as usual. +// +// The fact that speculation may fail can be caused, for instance, by +// the DCR.dm bit being set. In this case TLB misses are deferred, i.e., +// a NaT bit will be set if the translation is not present. The normal +// load, on the other hand, will cause the translation to be inserted +// if the mapping exists. +// +// It should be noted that we execute recovery code only when we need +// to use the data that has been speculatively loaded: we don't execute +// recovery code on pure read ahead data. +// +// Remarks: +// - the cmp r0,r0 is used as a fast way to initialize a predicate +// register to 1. This is required to make sure that we get the parallel +// compare correct. +// +// - we don't use the epilogue counter to exit the loop but we need to set +// it to zero beforehand. +// +// - after the loop we must test for Nat values because neither the +// czx nor cmp instruction raise a NaT consumption fault. We must be +// careful not to look too far for a Nat for which we don't care. +// For instance we don't need to look at a NaT in val2 if the zero byte +// was in val1. +// +// - Clearly performance tuning is required. 
+// +// +// +#define saved_pfs r11 +#define tmp r10 +#define base r16 +#define orig r17 +#define saved_pr r18 +#define src r19 +#define mask r20 +#define val r21 +#define val1 r22 +#define val2 r23 + + + .text + .psr abi64 + .psr lsb + .lsb + + .align 32 + .global strlen + .proc strlen +strlen: + alloc saved_pfs=ar.pfs,11,0,0,8 // rotating must be multiple of 8 + + .rotr v[2], w[2] // declares our 4 aliases + + extr.u tmp=in0,0,3 // tmp=least significant 3 bits + mov orig=in0 // keep trackof initial byte address + dep src=0,in0,0,3 // src=8byte-aligned in0 address + mov saved_pr=pr // preserve predicates (rotation) + ;; + ld8 v[1]=[src],8 // must not speculate: can fail here + shl tmp=tmp,3 // multiply by 8bits/byte + mov mask=-1 // our mask + ;; + ld8.s w[1]=[src],8 // speculatively load next + cmp.eq p6,p0=r0,r0 // sets p6 to true for cmp.and + sub tmp=64,tmp // how many bits to shift our mask on the right + ;; + shr.u mask=mask,tmp // zero enough bits to hold v[1] valuable part + mov ar.ec=r0 // clear epilogue counter (saved in ar.pfs) + ;; + add base=-16,src // keep track of aligned base + or v[1]=v[1],mask // now we have a safe initial byte pattern + ;; +1: + ld8.s v[0]=[src],8 // speculatively load next + czx1.r val1=v[1] // search 0 byte from right + czx1.r val2=w[1] // search 0 byte from right following 8bytes + ;; + ld8.s w[0]=[src],8 // speculatively load next to next + cmp.eq.and p6,p0=8,val1 // p6 = p6 and val1==8 + cmp.eq.and p6,p0=8,val2 // p6 = p6 and mask==8 +(p6) br.wtop.dptk.few 1b // loop until p6 == 0 + ;; + // + // We must return try the recovery code iff + // val1_is_nat || (val1==8 && val2_is_nat) + // + // XXX Fixme + // - there must be a better way of doing the test + // + cmp.eq p8,p9=8,val1 // p6 = val1 had zero (disambiguate) +#ifdef notyet + tnat.nz p6,p7=val1 // test NaT on val1 +#else + tnat.z p7,p6=val1 // test NaT on val1 +#endif +(p6) br.cond.spnt.few recover// jump to recovery if val1 is NaT + ;; + // + // if we come here p7 is true, i.e., initialized for // cmp + // + cmp.eq.and p7,p0=8,val1// val1==8? + tnat.nz.and p7,p0=val2 // test NaT if val2 +(p7) br.cond.spnt.few recover// jump to recovery if val2 is NaT + ;; +(p8) mov val1=val2 // the other test got us out of the loop +(p8) adds src=-16,src // correct position when 3 ahead +(p9) adds src=-24,src // correct position when 4 ahead + ;; + sub ret0=src,orig // distance from base + sub tmp=8,val1 // which byte in word + mov pr=saved_pr,0xffffffffffff0000 + ;; + sub ret0=ret0,tmp // adjust + mov ar.pfs=saved_pfs // because of ar.ec, restore no matter what + br.ret.sptk.few rp // end of normal execution + + // + // Outlined recovery code when speculation failed + // + // This time we don't use speculation and rely on the normal exception + // mechanism. that's why the loop is not as good as the previous one + // because read ahead is not possible + // + // IMPORTANT: + // Please note that in the case of strlen() as opposed to strlen_user() + // we don't use the exception mechanism, as this function is not + // supposed to fail. If that happens it means we have a bug and the + // code will cause of kernel fault. + // + // XXX Fixme + // - today we restart from the beginning of the string instead + // of trying to continue where we left off. 
+ // +recover: + ld8 val=[base],8 // will fail if unrecoverable fault + ;; + or val=val,mask // remask first bytes + cmp.eq p0,p6=r0,r0 // nullify first ld8 in loop + ;; + // + // ar.ec is still zero here + // +2: +(p6) ld8 val=[base],8 // will fail if unrecoverable fault + ;; + czx1.r val1=val // search 0 byte from right + ;; + cmp.eq p6,p0=8,val1 // val1==8 ? +(p6) br.wtop.dptk.few 2b // loop until p6 == 0 + sub ret0=base,orig // distance from base + sub tmp=8,val1 + mov pr=saved_pr,0xffffffffffff0000 + ;; + sub ret0=ret0,tmp // length=now - back -1 + mov ar.pfs=saved_pfs // because of ar.ec, restore no matter what + br.ret.sptk.few rp // end of sucessful recovery code + + .endp strlen diff --git a/arch/ia64/lib/strlen_user.S b/arch/ia64/lib/strlen_user.S new file mode 100644 index 000000000..8149dde8a --- /dev/null +++ b/arch/ia64/lib/strlen_user.S @@ -0,0 +1,213 @@ +/* + * Optimized version of the strlen_user() function + * + * Inputs: + * in0 address of buffer + * + * Outputs: + * ret0 0 in case of fault, strlen(buffer)+1 otherwise + * + * Copyright (C) 1998, 1999 Hewlett-Packard Co + * Copyright (C) 1998, 1999 David Mosberger-Tang <davidm@hpl.hp.com> + * Copyright (C) 1998, 1999 Stephane Eranian <eranian@hpl.hp.com> + * + * 01/19/99 S.Eranian heavily enhanced version (see details below) + * 09/24/99 S.Eranian added speculation recovery code + */ + +// +// int strlen_user(char *) +// ------------------------ +// Returns: +// - length of string + 1 +// - 0 in case an exception is raised +// +// This is an enhanced version of the basic strlen_user. it includes a +// combination of compute zero index (czx), parallel comparisons, speculative +// loads and loop unroll using rotating registers. +// +// General Ideas about the algorithm: +// The goal is to look at the string in chunks of 8 bytes. +// so we need to do a few extra checks at the beginning because the +// string may not be 8-byte aligned. In this case we load the 8byte +// quantity which includes the start of the string and mask the unused +// bytes with 0xff to avoid confusing czx. +// We use speculative loads and software pipelining to hide memory +// latency and do read ahead safely. This way we defer any exception. +// +// Because we don't want the kernel to be relying on particular +// settings of the DCR register, we provide recovery code in case +// speculation fails. The recovery code is going to "redo" the work using +// only normal loads. If we still get a fault then we return an +// error (ret0=0). Otherwise we return the strlen+1 as usual. +// The fact that speculation may fail can be caused, for instance, by +// the DCR.dm bit being set. In this case TLB misses are deferred, i.e., +// a NaT bit will be set if the translation is not present. The normal +// load, on the other hand, will cause the translation to be inserted +// if the mapping exists. +// +// It should be noted that we execute recovery code only when we need +// to use the data that has been speculatively loaded: we don't execute +// recovery code on pure read ahead data. +// +// Remarks: +// - the cmp r0,r0 is used as a fast way to initialize a predicate +// register to 1. This is required to make sure that we get the parallel +// compare correct. +// +// - we don't use the epilogue counter to exit the loop but we need to set +// it to zero beforehand. +// +// - after the loop we must test for Nat values because neither the +// czx nor cmp instruction raise a NaT consumption fault. 
We must be +// careful not to look too far for a Nat for which we don't care. +// For instance we don't need to look at a NaT in val2 if the zero byte +// was in val1. +// +// - Clearly performance tuning is required. +// +// +// + +#define EX(y,x...) \ + .section __ex_table,"a"; \ + data4 @gprel(99f); \ + data4 y-99f; \ + .previous; \ +99: x + +#define saved_pfs r11 +#define tmp r10 +#define base r16 +#define orig r17 +#define saved_pr r18 +#define src r19 +#define mask r20 +#define val r21 +#define val1 r22 +#define val2 r23 + + + .text + .psr abi64 + .psr lsb + .lsb + + .align 32 + .global __strlen_user + .proc __strlen_user +__strlen_user: + alloc saved_pfs=ar.pfs,11,0,0,8 + + .rotr v[2], w[2] // declares our 4 aliases + + extr.u tmp=in0,0,3 // tmp=least significant 3 bits + mov orig=in0 // keep trackof initial byte address + dep src=0,in0,0,3 // src=8byte-aligned in0 address + mov saved_pr=pr // preserve predicates (rotation) + ;; + ld8.s v[1]=[src],8 // load the initial 8bytes (must speculate) + shl tmp=tmp,3 // multiply by 8bits/byte + mov mask=-1 // our mask + ;; + ld8.s w[1]=[src],8 // load next 8 bytes in 2nd pipeline + cmp.eq p6,p0=r0,r0 // sets p6 (required because of // cmp.and) + sub tmp=64,tmp // how many bits to shift our mask on the right + ;; + shr.u mask=mask,tmp // zero enough bits to hold v[1] valuable part + mov ar.ec=r0 // clear epilogue counter (saved in ar.pfs) + ;; + add base=-16,src // keep track of aligned base + chk.s v[1], recover // if already NaT, then directly skip to recover + or v[1]=v[1],mask // now we have a safe initial byte pattern + ;; +1: + ld8.s v[0]=[src],8 // speculatively load next + czx1.r val1=v[1] // search 0 byte from right + czx1.r val2=w[1] // search 0 byte from right following 8bytes + ;; + ld8.s w[0]=[src],8 // speculatively load next to next + cmp.eq.and p6,p0=8,val1 // p6 = p6 and val1==8 + cmp.eq.and p6,p0=8,val2 // p6 = p6 and mask==8 +(p6) br.wtop.dptk.few 1b // loop until p6 == 0 + ;; + // + // We must return try the recovery code iff + // val1_is_nat || (val1==8 && val2_is_nat) + // + // XXX Fixme + // - there must be a better way of doing the test + // + cmp.eq p8,p9=8,val1 // p6 = val1 had zero (disambiguate) +#ifdef notyet + tnat.nz p6,p7=val1 // test NaT on val1 +#else + tnat.z p7,p6=val1 // test NaT on val1 +#endif +(p6) br.cond.spnt.few recover// jump to recovery if val1 is NaT + ;; + // + // if we come here p7 is true, i.e., initialized for // cmp + // + cmp.eq.and p7,p0=8,val1// val1==8? + tnat.nz.and p7,p0=val2 // test NaT if val2 +(p7) br.cond.spnt.few recover// jump to recovery if val2 is NaT + ;; +(p8) mov val1=val2 // val2 contains the value +(p8) adds src=-16,src // correct position when 3 ahead +(p9) adds src=-24,src // correct position when 4 ahead + ;; + sub ret0=src,orig // distance from origin + sub tmp=7,val1 // 7=8-1 because this strlen returns strlen+1 + mov pr=saved_pr,0xffffffffffff0000 + ;; + sub ret0=ret0,tmp // length=now - back -1 + mov ar.pfs=saved_pfs // because of ar.ec, restore no matter what + br.ret.sptk.few rp // end of normal execution + + // + // Outlined recovery code when speculation failed + // + // This time we don't use speculation and rely on the normal exception + // mechanism. that's why the loop is not as good as the previous one + // because read ahead is not possible + // + // XXX Fixme + // - today we restart from the beginning of the string instead + // of trying to continue where we left off. 
+ // +recover: + EX(.Lexit1, ld8 val=[base],8) // load the initial bytes + ;; + or val=val,mask // remask first bytes + cmp.eq p0,p6=r0,r0 // nullify first ld8 in loop + ;; + // + // ar.ec is still zero here + // +2: + EX(.Lexit1, (p6) ld8 val=[base],8) + ;; + czx1.r val1=val // search 0 byte from right + ;; + cmp.eq p6,p0=8,val1 // val1==8 ? +(p6) br.wtop.dptk.few 2b // loop until p6 == 0 + ;; + sub ret0=base,orig // distance from base + sub tmp=7,val1 // 7=8-1 because this strlen returns strlen+1 + mov pr=saved_pr,0xffffffffffff0000 + ;; + sub ret0=ret0,tmp // length=now - back -1 + mov ar.pfs=saved_pfs // because of ar.ec, restore no matter what + br.ret.sptk.few rp // end of sucessful recovery code + + // + // We failed even on the normal load (called from exception handler) + // +.Lexit1: + mov ret0=0 + mov pr=saved_pr,0xffffffffffff0000 + mov ar.pfs=saved_pfs // because of ar.ec, restore no matter what + br.ret.sptk.few rp + + .endp __strlen_user diff --git a/arch/ia64/lib/strncpy_from_user.S b/arch/ia64/lib/strncpy_from_user.S new file mode 100644 index 000000000..17f71f1a0 --- /dev/null +++ b/arch/ia64/lib/strncpy_from_user.S @@ -0,0 +1,53 @@ +/* + * Just like strncpy() except for the return value. If no fault occurs during + * the copying, the number of bytes copied is returned. If a fault occurs, + * -EFAULT is returned. + * + * Inputs: + * in0: address of destination buffer + * in1: address of string to be copied + * in2: length of buffer in bytes + * Outputs: + * r8: -EFAULT in case of fault or number of bytes copied if no fault + * + * Copyright (C) 1998, 1999 Hewlett-Packard Co + * Copyright (C) 1998, 1999 David Mosberger-Tang <davidm@hpl.hp.com> + */ + +#define EX(x...) \ +99: x; \ + .section __ex_table,"a"; \ + data4 @gprel(99b); \ + data4 .Lexit-99b; \ + .previous + + .text + .psr abi64 + .psr lsb + .lsb + + .align 32 + .global __strncpy_from_user + .proc __strncpy_from_user +__strncpy_from_user: + alloc r11=ar.pfs,3,0,0,0 + mov r9=in1 + add r10=in1,in2 + + // XXX braindead copy loop---this needs to be optimized +.Loop1: + EX(ld1 r8=[in1],1) + ;; + st1 [in0]=r8,1 + cmp.ltu p6,p0=in1,r10 + ;; +(p6) cmp.ne.and p6,p0=r8,r0 + ;; +(p6) br.cond.dpnt.few .Loop1 + +1: sub r8=in1,r9 // length of string (including NUL character) +.Lexit: + mov ar.pfs=r11 + br.ret.sptk.few rp + + .endp __strncpy_from_user diff --git a/arch/ia64/lib/strnlen_user.S b/arch/ia64/lib/strnlen_user.S new file mode 100644 index 000000000..c227a9003 --- /dev/null +++ b/arch/ia64/lib/strnlen_user.S @@ -0,0 +1,55 @@ +/* + * Returns 0 if exception before NUL or reaching the supplied limit (N), + * a value greater than N if the string is longer than the limit, else + * strlen. + * + * Inputs: + * in0: address of buffer + * in1: string length limit N + * Outputs: + * r8: 0 in case of fault, strlen(buffer)+1 otherwise + * + * Copyright (C) 1999 David Mosberger-Tang <davidm@hpl.hp.com> + */ + +/* If a fault occurs, r8 gets set to -EFAULT and r9 gets cleared. */ +#define EX(x...) 
\ + .section __ex_table,"a"; \ + data4 @gprel(99f); \ + data4 (.Lexit-99f)|1; \ + .previous \ +99: x; + + .text + .psr abi64 + .psr lsb + .lsb + + .align 32 + .global __strnlen_user + .proc __strnlen_user +__strnlen_user: + alloc r2=ar.pfs,2,0,0,0 + mov r16=ar.lc // preserve ar.lc + add r3=-1,in1 + ;; + mov ar.lc=r3 + mov r9=0 + + // XXX braindead strlen loop---this needs to be optimized +.Loop1: + EX(ld1 r8=[in0],1) + add r9=1,r9 + ;; + cmp.eq p6,p0=r8,r0 +(p6) br.dpnt.few .Lexit + br.cloop.dptk.few .Loop1 + + add r9=1,in1 // NUL not found---return N+1 + ;; +.Lexit: + mov r8=r9 + mov ar.lc=r16 // restore ar.lc + br.ret.sptk.few rp + + .endp __strnlen_user |
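
A closing note on idiv.S above: its header comment describes a straightforward shift-and-subtract division, with the initial shift derived from the floating-point exponents of the operands and the loop software-pipelined through rotating registers. Below is a plain-C sketch of the same shift-and-subtract idea, written as a textbook bit-at-a-time loop rather than the pre-aligned loop the assembly uses; udiv_sketch() is a made-up name for illustration and is not part of the patch.

```c
#include <stdio.h>

/*
 * Textbook restoring shift-and-subtract division: the same basic
 * scheme idiv.S implements, minus the FP-based pre-alignment,
 * predication and rotating-register software pipelining.
 * Assumes a 64-bit unsigned long, as on ia64.
 */
static unsigned long udiv_sketch(unsigned long x, unsigned long y,
                                 unsigned long *rem)
{
	unsigned long q = 0, r = 0;
	int bit;

	if (y == 0) {			/* idiv.S raises a kernel break here */
		*rem = x;
		return 0;
	}
	for (bit = 63; bit >= 0; bit--) {
		r = (r << 1) | ((x >> bit) & 1);  /* bring down the next bit of x */
		if (r >= y) {			  /* divisor fits: subtract it and */
			r -= y;			  /* record a 1 in the quotient    */
			q |= 1UL << bit;
		}
	}
	*rem = r;
	return q;
}

int main(void)
{
	unsigned long r, q = udiv_sketch(0xffffffffffffffffUL, 10UL, &r);
	printf("q=%lu r=%lu\n", q, r);	/* prints q=1844674407370955161 r=5 */
	return 0;
}
```

The signed and modulo variants that the Makefile builds from idiv.S (__divdi3, __moddi3, and friends) reduce to this unsigned case by negating negative operands up front and correcting the sign of the result afterwards, as the (p6)/(p7) fix-ups at .L2 in the assembly do.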