4 files changed, 153 insertions, 145 deletions
diff --git a/arch/ia64/lib/Makefile b/arch/ia64/lib/Makefile
index 882bdaed9..318e314cc 100644
--- a/arch/ia64/lib/Makefile
+++ b/arch/ia64/lib/Makefile
@@ -7,40 +7,26 @@
 
 L_TARGET = lib.a
 
-L_OBJS  = __divdi3.o __divsi3.o __udivdi3.o __udivsi3.o		\
-	__moddi3.o __modsi3.o __umoddi3.o __umodsi3.o		\
-	checksum.o clear_page.o csum_partial_copy.o copy_page.o	\
-	copy_user.o clear_user.o memset.o strncpy_from_user.o	\
-	strlen.o strlen_user.o strnlen_user.o			\
+L_OBJS  = __divdi3.o __udivdi3.o __moddi3.o __umoddi3.o			\
+	checksum.o clear_page.o csum_partial_copy.o copy_page.o		\
+	copy_user.o clear_user.o memcpy.o memset.o strncpy_from_user.o	\
+	strlen.o strlen_user.o strnlen_user.o				\
 	flush.o do_csum.o
 
 LX_OBJS = io.o
 
-IGNORE_FLAGS_OBJS = __divdi3.o __divsi3.o __udivdi3.o __udivsi3.o \
-		    __moddi3.o __modsi3.o __umoddi3.o __umodsi3.o
+IGNORE_FLAGS_OBJS = __divdi3.o __udivdi3.o __moddi3.o __umoddi3.o
 
 include $(TOPDIR)/Rules.make
 
 __divdi3.o: idiv.S
 	$(CC) $(AFLAGS) -c -o $@ $<
 
-__divsi3.o: idiv.S
-	$(CC) $(AFLAGS) -c -DSINGLE -c -o $@ $<
-
 __udivdi3.o: idiv.S
 	$(CC) $(AFLAGS) -c -DUNSIGNED -c -o $@ $<
 
-__udivsi3.o: idiv.S
-	$(CC) $(AFLAGS) -c -DUNSIGNED -DSINGLE -c -o $@ $<
-
 __moddi3.o: idiv.S
 	$(CC) $(AFLAGS) -c -DMODULO -c -o $@ $<
 
-__modsi3.o: idiv.S
-	$(CC) $(AFLAGS) -c -DMODULO -DSINGLE -c -o $@ $<
-
 __umoddi3.o: idiv.S
 	$(CC) $(AFLAGS) -c -DMODULO -DUNSIGNED -c -o $@ $<
-
-__umodsi3.o: idiv.S
-	$(CC) $(AFLAGS) -c -DMODULO -DUNSIGNED -DSINGLE -c -o $@ $<
diff --git a/arch/ia64/lib/copy_user.S b/arch/ia64/lib/copy_user.S
index 4a0abaed0..e13febb22 100644
--- a/arch/ia64/lib/copy_user.S
+++ b/arch/ia64/lib/copy_user.S
@@ -116,7 +116,7 @@ GLOBAL_ENTRY(__copy_user)
 	cmp.lt p10,p7=COPY_BREAK,len	// if len > COPY_BREAK then long copy 
 
 	xor tmp=src,dst		// same alignment test prepare
-(p10)	br.cond.dptk.few long_memcpy
+(p10)	br.cond.dptk.few long_copy_user
 	;;			// RAW pr.rot/p16 ?
 	//
 	// Now we do the byte by byte loop with software pipeline
@@ -136,7 +136,7 @@ GLOBAL_ENTRY(__copy_user)
 	//
 	// Beginning of long mempcy (i.e. > 16 bytes)
 	//
-long_memcpy:
+long_copy_user:
 	tbit.nz p6,p7=src1,0	// odd alignement
 	and tmp=7,tmp
 	;;
diff --git a/arch/ia64/lib/idiv.S b/arch/ia64/lib/idiv.S
index af424c41b..da96863d3 100644
--- a/arch/ia64/lib/idiv.S
+++ b/arch/ia64/lib/idiv.S
@@ -1,162 +1,98 @@
 /*
  * Integer division routine.
  *
- * Copyright (C) 1999 Hewlett-Packard Co
- * Copyright (C) 1999 David Mosberger-Tang <davidm@hpl.hp.com>
+ * Copyright (C) 1999-2000 Hewlett-Packard Co
+ * Copyright (C) 1999-2000 David Mosberger-Tang <davidm@hpl.hp.com>
  */
-/* Simple integer division.  It uses the straight forward division
-   algorithm.  This may not be the absolutely fastest way to do it,
-   but it's not horrible either.  According to ski, the worst case
-   scenario of dividing 0xffffffffffffffff by 1 takes 133 cycles.
-
-   An alternative would be to use an algorithm similar to the
-   floating point division algorithm (Newton-Raphson iteration),
-   but that approach is rather tricky (one has to be very careful
-   to get the last bit right...).
-
-   While this algorithm is straight-forward, it does use a couple
-   of neat ia-64 specific tricks:
-
-	- it uses the floating point unit to determine the initial
-	  shift amount (shift = floor(ld(x)) - floor(ld(y)))
-
-	- it uses predication to avoid a branch in the case where
-	  x < y (this is what p8 is used for)
-
-	- it uses rotating registers and the br.ctop branch to
-	  implement a software-pipelined loop that's unrolled
-	  twice (without any code expansion!)
-
-	- the code is relatively well scheduled to avoid unnecessary
-	  nops while maximizing parallelism
-*/
 
 #include <asm/asmmacro.h>
-#include <asm/break.h>
 
-	.text
-	.psr abi64
-#ifdef __BIG_ENDIAN__
-	.psr msb
-	.msb
-#else
-	.psr lsb
-	.lsb
-#endif
+/*
+ * Compute a 64-bit signed or unsigned integer quotient or remainder.
+ *
+ * Use reciprocal approximation and Newton-Raphson iteration to compute the
+ * quotient.  frcpa gives 8.6 significant bits, so we need 3 iterations
+ * to get more than the 64 bits of precision that we need for DImode.
+ *
+ * Must use max precision for the reciprocal computations to get 64 bits of
+ * precision.
+ *
+ * r32 holds the dividend.  r33 holds the divisor.
+ */
 
 #ifdef MODULO
 # define OP	mod
-# define Q	r9
-# define R	r8
 #else
-# define OP div
-# define Q	r8
-# define R	r9
-#endif
-
-#ifdef SINGLE
-# define PREC si
-#else
-# define PREC di
+# define OP	div
 #endif
 
 #ifdef UNSIGNED
-# define SGN		u
-# define INT_TO_FP(a,b)	fma.s0 a=b,f1,f0
-# define FP_TO_INT(a,b)	fcvt.fxu.trunc.s0 a=b
+# define SGN	u
+# define INT_TO_FP(a,b)	fcvt.xuf.s1 a=b
+# define FP_TO_INT(a,b)	fcvt.fxu.trunc.s1 a=b
 #else
 # define SGN
 # define INT_TO_FP(a,b)	fcvt.xf a=b
-# define FP_TO_INT(a,b)	fcvt.fx.trunc.s0 a=b
+# define FP_TO_INT(a,b)	fcvt.fx.trunc.s1 a=b
 #endif
 
 #define PASTE1(a,b)	a##b
 #define PASTE(a,b)	PASTE1(a,b)
-#define NAME		PASTE(PASTE(__,SGN),PASTE(OP,PASTE(PREC,3)))
+#define NAME		PASTE(PASTE(__,SGN),PASTE(OP,di3))
 
 GLOBAL_ENTRY(NAME)
 	UNW(.prologue)
-	alloc r2=ar.pfs,2,6,0,8
-	UNW(.save pr, r18)
-	mov r18=pr
-#ifdef SINGLE
-# ifdef UNSIGNED
-	zxt4 in0=in0
-	zxt4 in1=in1
-# else
-	sxt4 in0=in0
-	sxt4 in1=in1
-# endif
-	;;
-#endif
-
-#ifndef UNSIGNED
-	cmp.lt p6,p0=in0,r0	// x negative?
-	cmp.lt p7,p0=in1,r0	// y negative?
+	.regstk 2,0,0,0			// 2 inputs; no locals, outputs or rotating registers
+	// Transfer inputs to FP registers.
+	setf.sig f8 = in0
+	setf.sig f9 = in1
+	UNW(.fframe 16)
+	UNW(.save.f 0x20)
+	stf.spill [sp] = f17,-16	// spill f17 at [sp], then sp -= 16
+
+	// Convert the inputs to FP, to avoid FP software-assist faults.
+	INT_TO_FP(f8, f8)
 	;;
-(p6)	sub in0=r0,in0		// make x positive
-(p7)	sub in1=r0,in1		// ditto for y
-	;;
-#endif
-
-	setf.sig f8=in0
-	UNW(.save ar.lc, r3)
 
+	UNW(.save.f 0x10)
+	stf.spill [sp] = f16		// spill f16 at the new [sp]
 	UNW(.body)
-
-	mov r3=ar.lc		// save ar.lc
-	setf.sig f9=in1
+	INT_TO_FP(f9, f9)
 	;;
-	mov Q=0			// initialize q
-	mov R=in0		// stash away x in a static register
-	mov r16=1		// r16 = 1
-	INT_TO_FP(f8,f8)
-	cmp.eq p8,p0=0,in0	// x==0?
-	cmp.eq p9,p0=0,in1	// y==0?
+	frcpa.s1 f17, p6 = f8, f9	// y = frcpa(b)
 	;;
-	INT_TO_FP(f9,f9)
-(p8)	br.dpnt.few .L3
-(p9)	break __IA64_BREAK_KDB	// attempted division by zero (should never happen)
-	mov ar.ec=r0		// epilogue count = 0
+	/*
+	 * This is the magic algorithm described in Section 8.6.2 of "IA-64
+	 * and Elementary Functions" by Peter Markstein; HP Professional Books
+	 * (http://www.hp.com/go/retailbooks/)
+	 */
+(p6)	fmpy.s1 f7 = f8, f17		// q = a*y
+(p6)	fnma.s1 f6 = f9, f17, f1	// e = -b*y + 1
 	;;
-	getf.exp r14=f8		// r14 = exponent of x
-	getf.exp r15=f9		// r15 = exponent of y
-	mov ar.lc=r0		// loop count = 0
+(p6)	fma.s1 f16 = f7, f6, f7		// q1 = q*e + q
+(p6)	fmpy.s1 f7 = f6, f6		// e1 = e*e
 	;;
-	sub r17=r14,r15		// r17 = (exp of x - exp y) = shift amount
-	cmp.ge p8,p0=r14,r15
+(p6)	fma.s1 f16 = f16, f7, f16	// q2 = q1*e1 + q1
+(p6)	fma.s1 f6 = f17, f6, f17	// y1 = y*e + y
 	;;
-
-	.rotr y[2], mask[2]	// in0 and in1 may no longer be valid after
-				// the first write to a rotating register!
-
-(p8)	shl y[1]=in1,r17	// y[1]    = y<<shift
-(p8)	shl mask[1]=r16,r17	// mask[1] = 1<<shift
-
-(p8)	mov ar.lc=r17		// loop count = r17
+(p6)	fma.s1 f6 = f6, f7, f6		// y2 = y1*e1 + y1
+(p6)	fnma.s1 f7 = f9, f16, f8	// r = -b*q2 + a
 	;;
-.L1:
-(p8)	cmp.geu.unc p9,p0=R,y[1]// p9 = (x >= y[1])
-(p8)	shr.u mask[0]=mask[1],1	// prepare mask[0] and y[0] for next
-(p8)	shr.u y[0]=y[1],1	// iteration
+(p6)	fma.s1 f17 = f7, f6, f16	// q3 = r*y2 + q2
 	;;
-(p9)	sub R=R,y[1]		// if (x >= y[1]), subtract y[1] from x
-(p9)	add Q=Q,mask[1]		// and set corresponding bit in q (Q)
-	br.ctop.dptk.few .L1	// repeated unless ar.lc-- == 0
+#ifdef MODULO
+	FP_TO_INT(f17, f17)		// truncate quotient to an (unsigned) integer
+	;;
+	INT_TO_FP(f17, f17)		// renormalize
 	;;
-.L2:
-#ifndef UNSIGNED
-# ifdef MODULO
-(p6)	sub R=r0,R		// set sign of remainder according to x
-# else
-(p6)	sub Q=r0,Q		// set sign of quotient
+	fnma.s1 f17 = f17, f9, f8	// remainder = a - q*b
 	;;
-(p7)	sub Q=r0,Q
-# endif
 #endif
-.L3:
-	mov ar.pfs=r2		// restore ar.pfs
-	mov ar.lc=r3		// restore ar.lc
-	mov pr=r18,0xffffffffffff0000	// restore p16-p63
-	br.ret.sptk.few rp
+	UNW(.restore sp)
+	ldf.fill f16 = [sp], 16		// reload f16, then sp += 16
+	FP_TO_INT(f8, f17)		// truncate result to an (unsigned) integer
+	;;
+	ldf.fill f17 = [sp]		// reload f17
+	getf.sig r8 = f8		// transfer result to result register
+	br.ret.sptk rp
 END(NAME)
diff --git a/arch/ia64/lib/memcpy.S b/arch/ia64/lib/memcpy.S
new file mode 100644
index 000000000..3b16916d0
--- /dev/null
+++ b/arch/ia64/lib/memcpy.S
@@ -0,0 +1,86 @@
+#include <asm/asmmacro.h>
+
+GLOBAL_ENTRY(bcopy)
+	.regstk 3,0,0,0
+	mov r8=in0		// r8 = original in0
+	mov in0=in1		// in0 = original in1
+	;;
+	mov in1=r8		// in1 = original in0: first two arguments are now swapped
+END(bcopy)
+	// FALL THROUGH into memcpy, which runs with the swapped arguments
+GLOBAL_ENTRY(memcpy)
+	// In: in0 = destination, in1 = source, in2 = byte count.  Out: r8 = in0.
+#	define MEM_LAT	4
+
+#	define N	MEM_LAT-1
+#	define Nrot	((MEM_LAT + 7) & ~7)
+
+#	define dst	r2
+#	define src	r3
+#	define len	r9
+#	define saved_pfs r10
+#	define saved_lc	r11
+#	define saved_pr	r16
+#	define t0	r17
+#	define cnt	r18
+
+	UNW(.prologue)
+	UNW(.save ar.pfs, saved_pfs)
+	alloc saved_pfs=ar.pfs,3,Nrot,0,Nrot	// 3 inputs, Nrot locals, all Nrot rotating
+	lfetch [in1]		// prefetch the start of the source
+
+	.rotr val[MEM_LAT]
+	.rotp p[MEM_LAT]
+
+	UNW(.save ar.lc, saved_lc)
+	mov saved_lc=ar.lc
+
+	or t0=in0,in1
+	UNW(.save pr, saved_pr)
+	mov saved_pr=pr
+
+	UNW(.body)
+
+	mov ar.ec=MEM_LAT	// epilogue count for the pipelined loops
+
+	mov r8=in0		// return dst
+	shr cnt=in2,3		// number of 8-byte words to copy
+	mov pr.rot=1<<16	// p16 = 1, remaining rotating predicates = 0
+	;;
+	cmp.eq p6,p0=in2,r0	// zero length?
+	or t0=t0,in2
+(p6)	br.cond.spnt.few .exit	// yes: leave via .exit so the saved pr is restored
+	// t0 = in0|in1|in2: low 3 bits all clear => ld8/st8 loop, else byte loop.
+	mov dst=in0		// copy because of rotation
+	mov src=in1		// copy because of rotation
+	adds cnt=-1,cnt		// br.ctop is repeat/until
+	;;
+	and t0=0x7,t0
+	mov ar.lc=cnt
+	;;
+	cmp.ne p6,p0=t0,r0
+(p6)	br.cond.spnt.few slow_memcpy
+	// Software-pipelined: each store trails its load by N rotations.
+1:
+(p[0])	ld8 val[0]=[src],8
+(p[N])	st8 [dst]=val[N],8
+	br.ctop.sptk.few 1b
+	;;
+.exit:
+	mov ar.lc=saved_lc
+	mov pr=saved_pr,0xffffffffffff0000
+	mov ar.pfs=saved_pfs
+	br.ret.sptk.many rp
+
+slow_memcpy:
+	adds cnt=-1,in2		// byte count - 1 (br.ctop is repeat/until)
+	;;
+	mov ar.lc=cnt
+	;;
+1:
+(p[0])	ld1 val[0]=[src],1
+(p[N])	st1 [dst]=val[N],1
+	br.ctop.sptk.few 1b
+	br.sptk.few .exit
+
+END(memcpy)