summaryrefslogtreecommitdiffstats
path: root/arch/ia64/lib
diff options
context:
space:
mode:
authorRalf Baechle <ralf@linux-mips.org>2000-07-18 23:31:08 +0000
committerRalf Baechle <ralf@linux-mips.org>2000-07-18 23:31:08 +0000
commite8f9b6396e79654ea5932f64131cafbe8caadfb1 (patch)
tree90cf8b3a2475a774369eda22247cab7874fd7bbf /arch/ia64/lib
parentd4c419dfaa853f5db107973976d4215ae648e976 (diff)
Merge with Linux 2.4.0-test5-pre2.
Diffstat (limited to 'arch/ia64/lib')
-rw-r--r--arch/ia64/lib/Makefile24
-rw-r--r--arch/ia64/lib/copy_user.S4
-rw-r--r--arch/ia64/lib/idiv.S184
-rw-r--r--arch/ia64/lib/memcpy.S86
4 files changed, 153 insertions, 145 deletions
diff --git a/arch/ia64/lib/Makefile b/arch/ia64/lib/Makefile
index 882bdaed9..318e314cc 100644
--- a/arch/ia64/lib/Makefile
+++ b/arch/ia64/lib/Makefile
@@ -7,40 +7,26 @@
L_TARGET = lib.a
-L_OBJS = __divdi3.o __divsi3.o __udivdi3.o __udivsi3.o \
- __moddi3.o __modsi3.o __umoddi3.o __umodsi3.o \
- checksum.o clear_page.o csum_partial_copy.o copy_page.o \
- copy_user.o clear_user.o memset.o strncpy_from_user.o \
- strlen.o strlen_user.o strnlen_user.o \
+L_OBJS = __divdi3.o __udivdi3.o __moddi3.o __umoddi3.o \
+ checksum.o clear_page.o csum_partial_copy.o copy_page.o \
+ copy_user.o clear_user.o memcpy.o memset.o strncpy_from_user.o \
+ strlen.o strlen_user.o strnlen_user.o \
flush.o do_csum.o
LX_OBJS = io.o
-IGNORE_FLAGS_OBJS = __divdi3.o __divsi3.o __udivdi3.o __udivsi3.o \
- __moddi3.o __modsi3.o __umoddi3.o __umodsi3.o
+IGNORE_FLAGS_OBJS = __divdi3.o __udivdi3.o __moddi3.o __umoddi3.o
include $(TOPDIR)/Rules.make
__divdi3.o: idiv.S
$(CC) $(AFLAGS) -c -o $@ $<
-__divsi3.o: idiv.S
- $(CC) $(AFLAGS) -c -DSINGLE -c -o $@ $<
-
__udivdi3.o: idiv.S
$(CC) $(AFLAGS) -c -DUNSIGNED -c -o $@ $<
-__udivsi3.o: idiv.S
- $(CC) $(AFLAGS) -c -DUNSIGNED -DSINGLE -c -o $@ $<
-
__moddi3.o: idiv.S
$(CC) $(AFLAGS) -c -DMODULO -c -o $@ $<
-__modsi3.o: idiv.S
- $(CC) $(AFLAGS) -c -DMODULO -DSINGLE -c -o $@ $<
-
__umoddi3.o: idiv.S
$(CC) $(AFLAGS) -c -DMODULO -DUNSIGNED -c -o $@ $<
-
-__umodsi3.o: idiv.S
- $(CC) $(AFLAGS) -c -DMODULO -DUNSIGNED -DSINGLE -c -o $@ $<
diff --git a/arch/ia64/lib/copy_user.S b/arch/ia64/lib/copy_user.S
index 4a0abaed0..e13febb22 100644
--- a/arch/ia64/lib/copy_user.S
+++ b/arch/ia64/lib/copy_user.S
@@ -116,7 +116,7 @@ GLOBAL_ENTRY(__copy_user)
cmp.lt p10,p7=COPY_BREAK,len // if len > COPY_BREAK then long copy
xor tmp=src,dst // same alignment test prepare
-(p10) br.cond.dptk.few long_memcpy
+(p10) br.cond.dptk.few long_copy_user
;; // RAW pr.rot/p16 ?
//
// Now we do the byte by byte loop with software pipeline
@@ -136,7 +136,7 @@ GLOBAL_ENTRY(__copy_user)
//
// Beginning of long mempcy (i.e. > 16 bytes)
//
-long_memcpy:
+long_copy_user:
tbit.nz p6,p7=src1,0 // odd alignement
and tmp=7,tmp
;;
diff --git a/arch/ia64/lib/idiv.S b/arch/ia64/lib/idiv.S
index af424c41b..da96863d3 100644
--- a/arch/ia64/lib/idiv.S
+++ b/arch/ia64/lib/idiv.S
@@ -1,162 +1,98 @@
/*
* Integer division routine.
*
- * Copyright (C) 1999 Hewlett-Packard Co
- * Copyright (C) 1999 David Mosberger-Tang <davidm@hpl.hp.com>
+ * Copyright (C) 1999-2000 Hewlett-Packard Co
+ * Copyright (C) 1999-2000 David Mosberger-Tang <davidm@hpl.hp.com>
*/
-/* Simple integer division. It uses the straight forward division
- algorithm. This may not be the absolutely fastest way to do it,
- but it's not horrible either. According to ski, the worst case
- scenario of dividing 0xffffffffffffffff by 1 takes 133 cycles.
-
- An alternative would be to use an algorithm similar to the
- floating point division algorithm (Newton-Raphson iteration),
- but that approach is rather tricky (one has to be very careful
- to get the last bit right...).
-
- While this algorithm is straight-forward, it does use a couple
- of neat ia-64 specific tricks:
-
- - it uses the floating point unit to determine the initial
- shift amount (shift = floor(ld(x)) - floor(ld(y)))
-
- - it uses predication to avoid a branch in the case where
- x < y (this is what p8 is used for)
-
- - it uses rotating registers and the br.ctop branch to
- implement a software-pipelined loop that's unrolled
- twice (without any code expansion!)
-
- - the code is relatively well scheduled to avoid unnecessary
- nops while maximizing parallelism
-*/
#include <asm/asmmacro.h>
-#include <asm/break.h>
- .text
- .psr abi64
-#ifdef __BIG_ENDIAN__
- .psr msb
- .msb
-#else
- .psr lsb
- .lsb
-#endif
+/*
+ * Compute a 64-bit unsigned integer quotient.
+ *
+ * Use reciprocal approximation and Newton-Raphson iteration to compute the
+ * quotient. frcpa gives 8.6 significant bits, so we need 3 iterations
+ * to get more than the 64 bits of precision that we need for DImode.
+ *
+ * Must use max precision for the reciprocal computations to get 64 bits of
+ * precision.
+ *
+ * r32 holds the dividend. r33 holds the divisor.
+ */
#ifdef MODULO
# define OP mod
-# define Q r9
-# define R r8
#else
-# define OP div
-# define Q r8
-# define R r9
-#endif
-
-#ifdef SINGLE
-# define PREC si
-#else
-# define PREC di
+# define OP div
#endif
#ifdef UNSIGNED
-# define SGN u
-# define INT_TO_FP(a,b) fma.s0 a=b,f1,f0
-# define FP_TO_INT(a,b) fcvt.fxu.trunc.s0 a=b
+# define SGN u
+# define INT_TO_FP(a,b) fcvt.xuf.s1 a=b
+# define FP_TO_INT(a,b) fcvt.fxu.trunc.s1 a=b
#else
# define SGN
# define INT_TO_FP(a,b) fcvt.xf a=b
-# define FP_TO_INT(a,b) fcvt.fx.trunc.s0 a=b
+# define FP_TO_INT(a,b) fcvt.fx.trunc.s1 a=b
#endif
#define PASTE1(a,b) a##b
#define PASTE(a,b) PASTE1(a,b)
-#define NAME PASTE(PASTE(__,SGN),PASTE(OP,PASTE(PREC,3)))
+#define NAME PASTE(PASTE(__,SGN),PASTE(OP,di3))
GLOBAL_ENTRY(NAME)
UNW(.prologue)
- alloc r2=ar.pfs,2,6,0,8
- UNW(.save pr, r18)
- mov r18=pr
-#ifdef SINGLE
-# ifdef UNSIGNED
- zxt4 in0=in0
- zxt4 in1=in1
-# else
- sxt4 in0=in0
- sxt4 in1=in1
-# endif
- ;;
-#endif
-
-#ifndef UNSIGNED
- cmp.lt p6,p0=in0,r0 // x negative?
- cmp.lt p7,p0=in1,r0 // y negative?
+ .regstk 2,0,0,0
+ // Transfer inputs to FP registers.
+ setf.sig f8 = in0
+ setf.sig f9 = in1
+ UNW(.fframe 16)
+ UNW(.save.f 0x20)
+ stf.spill [sp] = f17,-16
+
+ // Convert the inputs to FP, to avoid FP software-assist faults.
+ INT_TO_FP(f8, f8)
;;
-(p6) sub in0=r0,in0 // make x positive
-(p7) sub in1=r0,in1 // ditto for y
- ;;
-#endif
-
- setf.sig f8=in0
- UNW(.save ar.lc, r3)
+ UNW(.save.f 0x10)
+ stf.spill [sp] = f16
UNW(.body)
-
- mov r3=ar.lc // save ar.lc
- setf.sig f9=in1
+ INT_TO_FP(f9, f9)
;;
- mov Q=0 // initialize q
- mov R=in0 // stash away x in a static register
- mov r16=1 // r16 = 1
- INT_TO_FP(f8,f8)
- cmp.eq p8,p0=0,in0 // x==0?
- cmp.eq p9,p0=0,in1 // y==0?
+ frcpa.s1 f17, p6 = f8, f9 // y = frcpa(b)
;;
- INT_TO_FP(f9,f9)
-(p8) br.dpnt.few .L3
-(p9) break __IA64_BREAK_KDB // attempted division by zero (should never happen)
- mov ar.ec=r0 // epilogue count = 0
+ /*
+ * This is the magic algorithm described in Section 8.6.2 of "IA-64
+ * and Elementary Functions" by Peter Markstein; HP Professional Books
+ * (http://www.hp.com/go/retailbooks/)
+ */
+(p6) fmpy.s1 f7 = f8, f17 // q = a*y
+(p6) fnma.s1 f6 = f9, f17, f1 // e = -b*y + 1
;;
- getf.exp r14=f8 // r14 = exponent of x
- getf.exp r15=f9 // r15 = exponent of y
- mov ar.lc=r0 // loop count = 0
+(p6) fma.s1 f16 = f7, f6, f7 // q1 = q*e + q
+(p6) fmpy.s1 f7 = f6, f6 // e1 = e*e
;;
- sub r17=r14,r15 // r17 = (exp of x - exp y) = shift amount
- cmp.ge p8,p0=r14,r15
+(p6) fma.s1 f16 = f16, f7, f16 // q2 = q1*e1 + q1
+(p6) fma.s1 f6 = f17, f6, f17 // y1 = y*e + y
;;
-
- .rotr y[2], mask[2] // in0 and in1 may no longer be valid after
- // the first write to a rotating register!
-
-(p8) shl y[1]=in1,r17 // y[1] = y<<shift
-(p8) shl mask[1]=r16,r17 // mask[1] = 1<<shift
-
-(p8) mov ar.lc=r17 // loop count = r17
+(p6) fma.s1 f6 = f6, f7, f6 // y2 = y1*e1 + y1
+(p6) fnma.s1 f7 = f9, f16, f8 // r = -b*q2 + a
;;
-.L1:
-(p8) cmp.geu.unc p9,p0=R,y[1]// p9 = (x >= y[1])
-(p8) shr.u mask[0]=mask[1],1 // prepare mask[0] and y[0] for next
-(p8) shr.u y[0]=y[1],1 // iteration
+(p6) fma.s1 f17 = f7, f6, f16 // q3 = r*y2 + q2
;;
-(p9) sub R=R,y[1] // if (x >= y[1]), subtract y[1] from x
-(p9) add Q=Q,mask[1] // and set corresponding bit in q (Q)
- br.ctop.dptk.few .L1 // repeated unless ar.lc-- == 0
+#ifdef MODULO
+ FP_TO_INT(f17, f17) // round quotient to an unsigned integer
+ ;;
+ INT_TO_FP(f17, f17) // renormalize
;;
-.L2:
-#ifndef UNSIGNED
-# ifdef MODULO
-(p6) sub R=r0,R // set sign of remainder according to x
-# else
-(p6) sub Q=r0,Q // set sign of quotient
+ fnma.s1 f17 = f17, f9, f8 // compute remainder
;;
-(p7) sub Q=r0,Q
-# endif
#endif
-.L3:
- mov ar.pfs=r2 // restore ar.pfs
- mov ar.lc=r3 // restore ar.lc
- mov pr=r18,0xffffffffffff0000 // restore p16-p63
- br.ret.sptk.few rp
+ UNW(.restore sp)
+ ldf.fill f16 = [sp], 16
+ FP_TO_INT(f8, f17) // round result to an (unsigned) integer
+ ;;
+ ldf.fill f17 = [sp]
+ getf.sig r8 = f8 // transfer result to result register
+ br.ret.sptk rp
END(NAME)
diff --git a/arch/ia64/lib/memcpy.S b/arch/ia64/lib/memcpy.S
new file mode 100644
index 000000000..3b16916d0
--- /dev/null
+++ b/arch/ia64/lib/memcpy.S
@@ -0,0 +1,86 @@
+#include <asm/asmmacro.h>
+
+GLOBAL_ENTRY(bcopy)
+ .regstk 3,0,0,0
+ mov r8=in0
+ mov in0=in1
+ ;;
+ mov in1=r8
+END(bcopy)
+ // FALL THROUGH
+GLOBAL_ENTRY(memcpy)
+
+# define MEM_LAT 4
+
+# define N MEM_LAT-1
+# define Nrot ((MEM_LAT + 7) & ~7)
+
+# define dst r2
+# define src r3
+# define len r9
+# define saved_pfs r10
+# define saved_lc r11
+# define saved_pr r16
+# define t0 r17
+# define cnt r18
+
+ UNW(.prologue)
+ UNW(.save ar.pfs, saved_pfs)
+ alloc saved_pfs=ar.pfs,3,Nrot,0,Nrot
+ lfetch [in1]
+
+ .rotr val[MEM_LAT]
+ .rotp p[MEM_LAT]
+
+ UNW(.save ar.lc, saved_lc)
+ mov saved_lc=ar.lc
+
+ or t0=in0,in1
+ UNW(.save pr, saved_pr)
+ mov saved_pr=pr
+
+ UNW(.body)
+
+ mov ar.ec=MEM_LAT
+
+ mov r8=in0 // return dst
+ shr cnt=in2,3 // number of 8-byte words to copy
+ mov pr.rot=1<<16
+ ;;
+ cmp.eq p6,p0=in2,r0 // zero length?
+ or t0=t0,in2
+(p6) br.ret.spnt.many rp // yes, return immediately
+
+ mov dst=in0 // copy because of rotation
+ mov src=in1 // copy because of rotation
+ adds cnt=-1,cnt // br.ctop is repeat/until
+ ;;
+ and t0=0x7,t0
+ mov ar.lc=cnt
+ ;;
+ cmp.ne p6,p0=t0,r0
+(p6) br.cond.spnt.few slow_memcpy
+
+1:
+(p[0]) ld8 val[0]=[src],8
+(p[N]) st8 [dst]=val[N],8
+ br.ctop.sptk.few 1b
+ ;;
+.exit:
+ mov ar.lc=saved_lc
+ mov pr=saved_pr,0xffffffffffff0000
+ mov ar.pfs=saved_pfs
+ br.ret.sptk.many rp
+
+slow_memcpy:
+ adds cnt=-1,in2
+ ;;
+ mov ar.lc=cnt
+ ;;
+1:
+(p[0]) ld1 val[0]=[src],1
+(p[N]) st1 [dst]=val[N],1
+ br.ctop.sptk.few 1b
+ br.sptk.few .exit
+
+END(memcpy)