author     Ralf Baechle <ralf@linux-mips.org>    2000-02-23 00:40:54 +0000
committer  Ralf Baechle <ralf@linux-mips.org>    2000-02-23 00:40:54 +0000
commit     529c593ece216e4aaffd36bd940cb94f1fa63129 (patch)
tree       78f1c0b805f5656aa7b0417a043c5346f700a2cf /arch/ia64/lib
parent     0bd079751d25808d1972baee5c4eaa1db2227257 (diff)
Merge with 2.3.43. I did ignore all modifications to the qlogicisp.c
driver due to the Origin A64 hacks.
Diffstat (limited to 'arch/ia64/lib')
-rw-r--r--   arch/ia64/lib/Makefile              |  42
-rw-r--r--   arch/ia64/lib/checksum.c            | 110
-rw-r--r--   arch/ia64/lib/clear_page.S          |  42
-rw-r--r--   arch/ia64/lib/clear_user.S          | 224
-rw-r--r--   arch/ia64/lib/copy_page.S           |  87
-rw-r--r--   arch/ia64/lib/copy_user.S           |  71
-rw-r--r--   arch/ia64/lib/csum_partial_copy.c   | 165
-rw-r--r--   arch/ia64/lib/do_csum.S             | 230
-rw-r--r--   arch/ia64/lib/flush.S               |  37
-rw-r--r--   arch/ia64/lib/idiv.S                | 158
-rw-r--r--   arch/ia64/lib/memset.S              | 111
-rw-r--r--   arch/ia64/lib/strlen.S              | 197
-rw-r--r--   arch/ia64/lib/strlen_user.S         | 213
-rw-r--r--   arch/ia64/lib/strncpy_from_user.S   |  53
-rw-r--r--   arch/ia64/lib/strnlen_user.S        |  55

15 files changed, 1795 insertions(+), 0 deletions(-)
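
Several of the new files (checksum.c, csum_partial_copy.c, do_csum.S) compute the Internet checksum by accumulating a wide partial sum and then folding it down to 16 bits with end-around carry. As a reference only, here is a standalone C sketch of that fold, mirroring the from64to16() helper that checksum.c introduces below; the main() driver is purely illustrative and not part of the patch.

```c
#include <stdio.h>

/*
 * Fold a 64-bit partial sum down to a 16-bit Internet checksum,
 * propagating the carries, as the from64to16() helper added in
 * arch/ia64/lib/checksum.c does.
 */
static unsigned short from64to16(unsigned long long x)
{
	/* add up 32-bit words; the result fits in 33 bits */
	x = (x & 0xffffffff) + (x >> 32);
	/* add up 16-bit and 17-bit words; 17+carry bits remain */
	x = (x & 0xffff) + (x >> 16);
	/* keep folding until every carry has been absorbed */
	x = (x & 0xffff) + (x >> 16);
	x = (x & 0xffff) + (x >> 16);
	return (unsigned short)x;
}

int main(void)
{
	/* example: a partial sum with bits set above bit 16 */
	unsigned long long sum = 0x00000001ffff0001ULL;
	printf("folded: 0x%04x\n", from64to16(sum));
	return 0;
}
```

csum_tcpudp_magic() in checksum.c then returns the one's complement (~) of this folded value.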
diff --git a/arch/ia64/lib/Makefile b/arch/ia64/lib/Makefile new file mode 100644 index 000000000..8a9581747 --- /dev/null +++ b/arch/ia64/lib/Makefile @@ -0,0 +1,42 @@ +# +# Makefile for ia64-specific library routines.. +# + +.S.o: + $(CC) -D__ASSEMBLY__ $(AFLAGS) -traditional -c $< -o $@ + +OBJS = __divdi3.o __divsi3.o __udivdi3.o __udivsi3.o \ + __moddi3.o __modsi3.o __umoddi3.o __umodsi3.o \ + checksum.o clear_page.o csum_partial_copy.o copy_page.o \ + copy_user.o clear_user.o memset.o strncpy_from_user.o \ + strlen.o strlen_user.o strnlen_user.o \ + flush.o do_csum.o + +lib.a: $(OBJS) + $(AR) rcs lib.a $(OBJS) + +__divdi3.o: idiv.S + $(CC) $(AFLAGS) -c -o $@ $< + +__divsi3.o: idiv.S + $(CC) $(AFLAGS) -c -DSINGLE -c -o $@ $< + +__udivdi3.o: idiv.S + $(CC) $(AFLAGS) -c -DUNSIGNED -c -o $@ $< + +__udivsi3.o: idiv.S + $(CC) $(AFLAGS) -c -DUNSIGNED -DSINGLE -c -o $@ $< + +__moddi3.o: idiv.S + $(CC) $(AFLAGS) -c -DMODULO -c -o $@ $< + +__modsi3.o: idiv.S + $(CC) $(AFLAGS) -c -DMODULO -DSINGLE -c -o $@ $< + +__umoddi3.o: idiv.S + $(CC) $(AFLAGS) -c -DMODULO -DUNSIGNED -c -o $@ $< + +__umodsi3.o: idiv.S + $(CC) $(AFLAGS) -c -DMODULO -DUNSIGNED -DSINGLE -c -o $@ $< + +include $(TOPDIR)/Rules.make diff --git a/arch/ia64/lib/checksum.c b/arch/ia64/lib/checksum.c new file mode 100644 index 000000000..9c4a8af75 --- /dev/null +++ b/arch/ia64/lib/checksum.c @@ -0,0 +1,110 @@ +/* + * Network checksum routines + * + * Copyright (C) 1999 Hewlett-Packard Co + * Copyright (C) 1999 Stephane Eranian <eranian@hpl.hp.com> + * + * Most of the code coming from arch/alpha/lib/checksum.c + * + * This file contains network checksum routines that are better done + * in an architecture-specific manner due to speed.. + */ + +#include <linux/string.h> + +#include <asm/byteorder.h> + +static inline unsigned short +from64to16(unsigned long x) +{ + /* add up 32-bit words for 33 bits */ + x = (x & 0xffffffff) + (x >> 32); + /* add up 16-bit and 17-bit words for 17+c bits */ + x = (x & 0xffff) + (x >> 16); + /* add up 16-bit and 2-bit for 16+c bit */ + x = (x & 0xffff) + (x >> 16); + /* add up carry.. */ + x = (x & 0xffff) + (x >> 16); + return x; +} + +/* + * computes the checksum of the TCP/UDP pseudo-header + * returns a 16-bit checksum, already complemented. + */ +unsigned short int csum_tcpudp_magic(unsigned long saddr, + unsigned long daddr, + unsigned short len, + unsigned short proto, + unsigned int sum) +{ + return ~from64to16(saddr + daddr + sum + + ((unsigned long) ntohs(len) << 16) + + ((unsigned long) proto << 8)); +} + +unsigned int csum_tcpudp_nofold(unsigned long saddr, + unsigned long daddr, + unsigned short len, + unsigned short proto, + unsigned int sum) +{ + unsigned long result; + + result = (saddr + daddr + sum + + ((unsigned long) ntohs(len) << 16) + + ((unsigned long) proto << 8)); + + /* Fold down to 32-bits so we don't loose in the typedef-less + network stack. */ + /* 64 to 33 */ + result = (result & 0xffffffff) + (result >> 32); + /* 33 to 32 */ + result = (result & 0xffffffff) + (result >> 32); + return result; +} + +extern unsigned long do_csum(const unsigned char *, unsigned int, unsigned int); +extern unsigned long do_csum_c(const unsigned char *, unsigned int, unsigned int); + +/* + * This is a version of ip_compute_csum() optimized for IP headers, + * which always checksum on 4 octet boundaries. 
+ */ +unsigned short ip_fast_csum(unsigned char * iph, unsigned int ihl) +{ + return ~do_csum(iph,ihl*4,0); +} + +/* + * computes the checksum of a memory block at buff, length len, + * and adds in "sum" (32-bit) + * + * returns a 32-bit number suitable for feeding into itself + * or csum_tcpudp_magic + * + * this function must be called with even lengths, except + * for the last fragment, which may be odd + * + * it's best to have buff aligned on a 32-bit boundary + */ +unsigned int csum_partial(const unsigned char * buff, int len, unsigned int sum) +{ + unsigned long result = do_csum(buff, len, 0); + + /* add in old sum, and carry.. */ + result += sum; + /* 32+c bits -> 32 bits */ + result = (result & 0xffffffff) + (result >> 32); + return result; +} + + +/* + * this routine is used for miscellaneous IP-like checksums, mainly + * in icmp.c + */ +unsigned short ip_compute_csum(unsigned char * buff, int len) +{ + return ~do_csum(buff,len, 0); +} diff --git a/arch/ia64/lib/clear_page.S b/arch/ia64/lib/clear_page.S new file mode 100644 index 000000000..314311c5c --- /dev/null +++ b/arch/ia64/lib/clear_page.S @@ -0,0 +1,42 @@ +/* + * + * Optimized version of the standard clearpage() function + * + * Based on comments from ddd. Try not to overflow the write buffer. + * + * Inputs: + * in0: address of page + * + * Output: + * none + * + * Copyright (C) 1999 Hewlett-Packard Co + * Copyright (C) 1999 Stephane Eranian <eranian@hpl.hp.com> + * Copyright (C) 1999 David Mosberger-Tang <davidm@hpl.hp.com> + */ +#include <asm/page.h> + + .text + .psr abi64 + .psr lsb + .lsb + + .align 32 + .global clear_page + .proc clear_page +clear_page: + alloc r11=ar.pfs,1,0,0,0 + mov r16=ar.lc // slow + mov r17=PAGE_SIZE/32-1 // -1 = repeat/until + ;; + adds r18=16,in0 + mov ar.lc=r17 + ;; +1: stf.spill.nta [in0]=f0,32 + stf.spill.nta [r18]=f0,32 + br.cloop.dptk.few 1b + ;; + mov ar.lc=r16 // restore lc + br.ret.sptk.few rp + + .endp clear_page diff --git a/arch/ia64/lib/clear_user.S b/arch/ia64/lib/clear_user.S new file mode 100644 index 000000000..0db4a78f8 --- /dev/null +++ b/arch/ia64/lib/clear_user.S @@ -0,0 +1,224 @@ +/* + * This routine clears to zero a linear memory buffer in user space. + * + * Inputs: + * in0: address of buffer + * in1: length of buffer in bytes + * Outputs: + * r8: number of bytes that didn't get cleared due to a fault + * + * Copyright (C) 1998, 1999 Hewlett-Packard Co + * Copyright (C) 1999 Stephane Eranian <eranian@hpl.hp.com> + */ + +// +// arguments +// +#define buf r32 +#define len r33 + +// +// local registers +// +#define cnt r16 +#define buf2 r17 +#define saved_lc r18 +#define saved_pr r19 +#define saved_pfs r20 +#define tmp r21 +#define len2 r22 +#define len3 r23 + +// +// Theory of operations: +// - we check whether or not the buffer is small, i.e., less than 17 +// in which case we do the byte by byte loop. +// +// - Otherwise we go progressively from 1 byte store to 8byte store in +// the head part, the body is a 16byte store loop and we finish we the +// tail for the last 15 bytes. +// The good point about this breakdown is that the long buffer handling +// contains only 2 branches. +// +// The reason for not using shifting & masking for both the head and the +// tail is to stay semantically correct. This routine is not supposed +// to write bytes outside of the buffer. While most of the time this would +// be ok, we can't tolerate a mistake. 
A classical example is the case +// of multithreaded code were to the extra bytes touched is actually owned +// by another thread which runs concurrently to ours. Another, less likely, +// example is with device drivers where reading an I/O mapped location may +// have side effects (same thing for writing). +// + +// The label comes first because our store instruction contains a comma +// and confuse the preprocessor otherwise +// +#define EX(y,x...) \ + .section __ex_table,"a"; \ + data4 @gprel(99f); \ + data4 y-99f; \ + .previous; \ +99: x + + .text + .psr abi64 + .psr lsb + .lsb + + .align 32 + .global __do_clear_user + .proc __do_clear_user + +__do_clear_user: + alloc saved_pfs=ar.pfs,2,0,0,0 + cmp.eq p6,p0=r0,len // check for zero length + mov saved_lc=ar.lc // preserve ar.lc (slow) + ;; // avoid WAW on CFM + adds tmp=-1,len // br.ctop is repeat/until + mov ret0=len // return value is length at this point +(p6) br.ret.spnt.few rp + ;; + cmp.lt p6,p0=16,len // if len > 16 then long memset + mov ar.lc=tmp // initialize lc for small count +(p6) br.cond.dptk.few long_do_clear + ;; // WAR on ar.lc + // + // worst case 16 cyles, avg 8 cycles + // + // We could have played with the predicates to use the extra + // M slot for 2 stores/iteration but the cost the initialization + // the various counters compared to how long the loop is supposed + // to last on average does not make this solution viable. + // +1: + EX( .Lexit1, st1 [buf]=r0,1 ) + adds len=-1,len // countdown length using len + br.cloop.dptk.few 1b + ;; // avoid RAW on ar.lc + // + // .Lexit4: comes from byte by byte loop + // len contains bytes left +.Lexit1: + mov ret0=len // faster than using ar.lc + mov ar.lc=saved_lc + br.ret.sptk.few rp // end of short clear_user + + + // + // At this point we know we have more than 16 bytes to copy + // so we focus on alignment (no branches required) + // + // The use of len/len2 for countdown of the number of bytes left + // instead of ret0 is due to the fact that the exception code + // changes the values of r8. + // +long_do_clear: + tbit.nz p6,p0=buf,0 // odd alignment (for long_do_clear) + ;; + EX( .Lexit3, (p6) st1 [buf]=r0,1 ) // 1-byte aligned +(p6) adds len=-1,len;; // sync because buf is modified + tbit.nz p6,p0=buf,1 + ;; + EX( .Lexit3, (p6) st2 [buf]=r0,2 ) // 2-byte aligned +(p6) adds len=-2,len;; + tbit.nz p6,p0=buf,2 + ;; + EX( .Lexit3, (p6) st4 [buf]=r0,4 ) // 4-byte aligned +(p6) adds len=-4,len;; + tbit.nz p6,p0=buf,3 + ;; + EX( .Lexit3, (p6) st8 [buf]=r0,8 ) // 8-byte aligned +(p6) adds len=-8,len;; + shr.u cnt=len,4 // number of 128-bit (2x64bit) words + ;; + cmp.eq p6,p0=r0,cnt + adds tmp=-1,cnt +(p6) br.cond.dpnt.few .dotail // we have less than 16 bytes left + ;; + adds buf2=8,buf // setup second base pointer + mov ar.lc=tmp + ;; + + // + // 16bytes/iteration core loop + // + // The second store can never generate a fault because + // we come into the loop only when we are 16-byte aligned. + // This means that if we cross a page then it will always be + // in the first store and never in the second. + // + // + // We need to keep track of the remaining length. A possible (optimistic) + // way would be to ue ar.lc and derive how many byte were left by + // doing : left= 16*ar.lc + 16. this would avoid the addition at + // every iteration. + // However we need to keep the synchronization point. A template + // M;;MB does not exist and thus we can keep the addition at no + // extra cycle cost (use a nop slot anyway). 
It also simplifies the + // (unlikely) error recovery code + // + +2: + + EX(.Lexit3, st8 [buf]=r0,16 ) + ;; // needed to get len correct when error + st8 [buf2]=r0,16 + adds len=-16,len + br.cloop.dptk.few 2b + ;; + mov ar.lc=saved_lc + // + // tail correction based on len only + // + // We alternate the use of len3,len2 to allow parallelism and correct + // error handling. We also reuse p6/p7 to return correct value. + // The addition of len2/len3 does not cost anything more compared to + // the regular memset as we had empty slots. + // +.dotail: + mov len2=len // for parallelization of error handling + mov len3=len + tbit.nz p6,p0=len,3 + ;; + EX( .Lexit2, (p6) st8 [buf]=r0,8 ) // at least 8 bytes +(p6) adds len3=-8,len2 + tbit.nz p7,p6=len,2 + ;; + EX( .Lexit2, (p7) st4 [buf]=r0,4 ) // at least 4 bytes +(p7) adds len2=-4,len3 + tbit.nz p6,p7=len,1 + ;; + EX( .Lexit2, (p6) st2 [buf]=r0,2 ) // at least 2 bytes +(p6) adds len3=-2,len2 + tbit.nz p7,p6=len,0 + ;; + EX( .Lexit2, (p7) st1 [buf]=r0 ) // only 1 byte left + mov ret0=r0 // success + br.ret.dptk.few rp // end of most likely path + + // + // Outlined error handling code + // + + // + // .Lexit3: comes from core loop, need restore pr/lc + // len contains bytes left + // + // + // .Lexit2: + // if p6 -> coming from st8 or st2 : len2 contains what's left + // if p7 -> coming from st4 or st1 : len3 contains what's left + // We must restore lc/pr even though might not have been used. +.Lexit2: +(p6) mov len=len2 +(p7) mov len=len3 + ;; + // + // .Lexit4: comes from head, need not restore pr/lc + // len contains bytes left + // +.Lexit3: + mov ret0=len + mov ar.lc=saved_lc + br.ret.dptk.few rp + .endp diff --git a/arch/ia64/lib/copy_page.S b/arch/ia64/lib/copy_page.S new file mode 100644 index 000000000..0a956e5a2 --- /dev/null +++ b/arch/ia64/lib/copy_page.S @@ -0,0 +1,87 @@ +/* + * + * Optimized version of the standard copy_page() function + * + * Based on comments from ddd. Try not to overflow write buffer. + * + * Inputs: + * in0: address of target page + * in1: address of source page + * Output: + * no return value + * + * Copyright (C) 1999 Hewlett-Packard Co + * Copyright (C) 1999 Stephane Eranian <eranian@hpl.hp.com> + */ +#include <asm/page.h> + +#define lcount r16 +#define saved_pr r17 +#define saved_lc r18 +#define saved_pfs r19 +#define src1 r20 +#define src2 r21 +#define tgt1 r22 +#define tgt2 r23 + + .text + .psr abi64 + .psr lsb + .lsb + + .align 32 + .global copy_page + .proc copy_page + +copy_page: + alloc saved_pfs=ar.pfs,10,0,0,8 // we need 6 roatating (8 minimum) + // + 2 input + + .rotr t1[4], t2[4] // our 2 pipelines with depth of 4 each + + mov saved_lc=ar.lc // save ar.lc ahead of time + mov saved_pr=pr // rotating predicates are preserved + // resgisters we must save. + mov src1=in1 // initialize 1st stream source + adds src2=8,in1 // initialize 2nd stream source + mov lcount=PAGE_SIZE/16-1 // as many 16bytes as there are on a page + // -1 is because br.ctop is repeat/until + + adds tgt2=8,in0 // initialize 2nd stream target + mov tgt1=in0 // initialize 1st stream target + ;; + mov pr.rot=1<<16 // pr16=1 & pr[17-63]=0 , 63 not modified + + mov ar.lc=lcount // set loop counter + mov ar.ec=4 // ar.ec must match pipeline depth + ;; + + // We need to preload the n-1 stages of the pipeline (n=depth). + // We do this during the "prolog" of the loop: we execute + // n-1 times the "load" bundle. Then both loads & stores are + // enabled until we reach the end of the last word of the page + // on the load side. 
Then, we enter the epilogue (controlled by ec) + // where we just do the stores and no loads n-1 times : drain the pipe. + // + // The initialization of the prolog is done via the predicate registers: + // the choice of pr19 DEPENDS on the depth of the pipeline (n). + // When lc > 0 pr63=1 and it is fed back into pr16 and pr16-pr62 + // are then shifted right at every iteration, + // Thus by initializing pr16=1 and pr17-19=0 (19=16+4-1) before the loop + // we get pr19=1 after 4 iterations (n in our case). + // +1: // engage loop now, let the magic happen... +(p16) ld8 t1[0]=[src1],16 // new data on top of pipeline in 1st stream +(p16) ld8 t2[0]=[src2],16 // new data on top of pipeline in 2nd stream + nop.i 0x0 +(p19) st8 [tgt1]=t1[3],16 // store top of 1st pipeline +(p19) st8 [tgt2]=t2[3],16 // store top of 2nd pipeline + br.ctop.dptk.few 1b // once lc==0, ec-- & p16=0 + // stores but no loads anymore + ;; + mov pr=saved_pr,0xffffffffffff0000 // restore predicates + mov ar.pfs=saved_pfs // restore ar.ec + mov ar.lc=saved_lc // restore saved lc + br.ret.sptk.few rp // bye... + + .endp copy_page diff --git a/arch/ia64/lib/copy_user.S b/arch/ia64/lib/copy_user.S new file mode 100644 index 000000000..03a540a80 --- /dev/null +++ b/arch/ia64/lib/copy_user.S @@ -0,0 +1,71 @@ +/* + * This routine copies a linear memory buffer across the user/kernel boundary. When + * reading a byte from the source causes a fault, the remainder of the destination + * buffer is zeroed out. Note that this can happen only when copying from user + * to kernel memory and we do this to absolutely guarantee that the + * kernel doesn't operate on random data. + * + * This file is derived from arch/alpha/lib/copy_user.S. + * + * Inputs: + * in0: address of destination buffer + * in1: address of source buffer + * in2: length of buffer in bytes + * Outputs: + * r8: number of bytes that didn't get copied due to a fault + * + * Copyright (C) 1999 Hewlett-Packard Co + * Copyright (C) 1998, 1999 David Mosberger-Tang <davidm@hpl.hp.com> + */ + +#define EXI(x...) \ +99: x; \ + .section __ex_table,"a"; \ + data4 @gprel(99b); \ + data4 .Lexit_in-99b; \ + .previous + +#define EXO(x...) 
\ +99: x; \ + .section __ex_table,"a"; \ + data4 @gprel(99b); \ + data4 .Lexit_out-99b; \ + .previous + + .text + .psr abi64 + .psr lsb + .lsb + + .align 32 + .global __copy_user + .proc __copy_user +__copy_user: + alloc r10=ar.pfs,3,0,0,0 + mov r9=ar.lc // save ar.lc + mov ar.lc=in2 // set ar.lc to length of buffer + br.sptk.few .Lentr + + // XXX braindead copy loop---this needs to be optimized +.Loop1: + EXI(ld1 r8=[in1],1) + ;; + EXO(st1 [in0]=r8,1) +.Lentr: br.cloop.dptk.few .Loop1 // repeat unless ar.lc--==0 + ;; // avoid RAW on ar.lc +.Lexit_out: + mov r8=ar.lc // return how many bytes we _didn't_ copy + mov ar.lc=r9 + br.ret.sptk.few rp + +.Lexit_in: + // clear the remainder of the buffer: + mov r8=ar.lc // return how many bytes we _didn't_ copy +.Loop2: + st1 [in0]=r0,1 // this cannot fault because we get here only on user->kernel copies + br.cloop.dptk.few .Loop2 + ;; // avoid RAW on ar.lc + mov ar.lc=r9 + br.ret.sptk.few rp + + .endp __copy_user diff --git a/arch/ia64/lib/csum_partial_copy.c b/arch/ia64/lib/csum_partial_copy.c new file mode 100644 index 000000000..d09f11e21 --- /dev/null +++ b/arch/ia64/lib/csum_partial_copy.c @@ -0,0 +1,165 @@ +/* + * Network Checksum & Copy routine + * + * Copyright (C) 1999 Hewlett-Packard Co + * Copyright (C) 1999 Stephane Eranian <eranian@hpl.hp.com> + * + * Most of the code has been imported from Linux/Alpha + */ + +#include <linux/types.h> +#include <linux/string.h> + +#include <asm/uaccess.h> + +/* + * XXX Fixme: those 2 inlines are meant for debugging and will go away + */ +static inline unsigned +short from64to16(unsigned long x) +{ + /* add up 32-bit words for 33 bits */ + x = (x & 0xffffffff) + (x >> 32); + /* add up 16-bit and 17-bit words for 17+c bits */ + x = (x & 0xffff) + (x >> 16); + /* add up 16-bit and 2-bit for 16+c bit */ + x = (x & 0xffff) + (x >> 16); + /* add up carry.. */ + x = (x & 0xffff) + (x >> 16); + return x; +} + +static inline +unsigned long do_csum_c(const unsigned char * buff, int len, unsigned int psum) +{ + int odd, count; + unsigned long result = (unsigned long)psum; + + if (len <= 0) + goto out; + odd = 1 & (unsigned long) buff; + if (odd) { + result = *buff << 8; + len--; + buff++; + } + count = len >> 1; /* nr of 16-bit words.. */ + if (count) { + if (2 & (unsigned long) buff) { + result += *(unsigned short *) buff; + count--; + len -= 2; + buff += 2; + } + count >>= 1; /* nr of 32-bit words.. */ + if (count) { + if (4 & (unsigned long) buff) { + result += *(unsigned int *) buff; + count--; + len -= 4; + buff += 4; + } + count >>= 1; /* nr of 64-bit words.. */ + if (count) { + unsigned long carry = 0; + do { + unsigned long w = *(unsigned long *) buff; + count--; + buff += 8; + result += carry; + result += w; + carry = (w > result); + } while (count); + result += carry; + result = (result & 0xffffffff) + (result >> 32); + } + if (len & 4) { + result += *(unsigned int *) buff; + buff += 4; + } + } + if (len & 2) { + result += *(unsigned short *) buff; + buff += 2; + } + } + if (len & 1) + result += *buff; + + result = from64to16(result); + + if (odd) + result = ((result >> 8) & 0xff) | ((result & 0xff) << 8); + +out: + return result; +} + +/* + * XXX Fixme + * + * This is very ugly but temporary. THIS NEEDS SERIOUS ENHANCEMENTS. + * But it's very tricky to get right even in C. 
+ */ +extern unsigned long do_csum(const unsigned char *, int); + +static unsigned int +do_csum_partial_copy_from_user (const char *src, char *dst, int len, + unsigned int psum, int *errp) +{ + const unsigned char *psrc = src; + unsigned long result; + int cplen = len; + int r = 0; + + /* XXX Fixme + * for now we separate the copy from checksum for obvious + * alignment difficulties. Look at the Alpha code and you'll be + * scared. + */ + + while ( cplen-- ) r |=__get_user(*dst++,psrc++); + + if ( r && errp ) *errp = r; + + result = do_csum(src, len); + + /* add in old sum, and carry.. */ + result += psum; + /* 32+c bits -> 32 bits */ + result = (result & 0xffffffff) + (result >> 32); + return result; +} + +unsigned int +csum_partial_copy_from_user(const char *src, char *dst, int len, + unsigned int sum, int *errp) +{ + if (!access_ok(src, len, VERIFY_READ)) { + *errp = -EFAULT; + memset(dst, 0, len); + return sum; + } + + return do_csum_partial_copy_from_user(src, dst, len, sum, errp); +} + +unsigned int +csum_partial_copy_nocheck(const char *src, char *dst, int len, unsigned int sum) +{ + return do_csum_partial_copy_from_user(src, dst, len, sum, NULL); +} + +unsigned int +csum_partial_copy (const char *src, char *dst, int len, unsigned int sum) +{ + unsigned int ret; + int error = 0; + + ret = do_csum_partial_copy_from_user(src, dst, len, sum, &error); + if (error) + printk("csum_partial_copy_old(): tell mingo to convert me!\n"); + + return ret; +} + diff --git a/arch/ia64/lib/do_csum.S b/arch/ia64/lib/do_csum.S new file mode 100644 index 000000000..d8174f10a --- /dev/null +++ b/arch/ia64/lib/do_csum.S @@ -0,0 +1,230 @@ +/* + * + * Optmized version of the standard do_csum() function + * + * Return: a 64bit quantity containing the 16bit Internet checksum + * + * Inputs: + * in0: address of buffer to checksum (char *) + * in1: length of the buffer (int) + * + * Copyright (C) 1999 Hewlett-Packard Co + * Copyright (C) 1999 Stephane Eranian <eranian@hpl.hp.com> + * + */ + +// +// Theory of operations: +// The goal is to go as quickly as possible to the point where +// we can checksum 8 bytes/loop. Before reaching that point we must +// take care of incorrect alignment of first byte. +// +// The code hereafter also takes care of the "tail" part of the buffer +// before entering the core loop, if any. The checksum is a sum so it +// allows us to commute operations. So we do do the "head" and "tail" +// first to finish at full speed in the body. Once we get the head and +// tail values, we feed them into the pipeline, very handy initialization. +// +// Of course we deal with the special case where the whole buffer fits +// into one 8 byte word. In this case we have only one entry in the pipeline. +// +// We use a (3+1)-stage pipeline in the loop to account for possible +// load latency and also to accomodate for head and tail. +// +// The end of the function deals with folding the checksum from 64bits +// down to 16bits taking care of the carry. +// +// This version avoids synchronization in the core loop by also using a +// pipeline for the accumulation of the checksum in result[]. 
+// +// p[] +// |---| +// 0| | r32 : new value loaded in pipeline +// |---| +// 1| | r33 : in transit data +// |---| +// 2| | r34 : current value to add to checksum +// |---| +// 3| | r35 : previous value added to checksum (previous iteration) +// |---| +// +// result[] +// |---| +// 0| | r36 : new checksum +// |---| +// 1| | r37 : previous value of checksum +// |---| +// 2| | r38 : final checksum when out of the loop (after 2 epilogue rots) +// |---| +// +// +// NOT YET DONE: +// - Take advantage of the MMI bandwidth to load more than 8byte per loop +// iteration +// - use the lfetch instruction to augment the chances of the data being in +// the cache when we need it. +// - Maybe another algorithm which would take care of the folding at the +// end in a different manner +// - Work with people more knowledgeable than me on the network stack +// to figure out if we could not split the function depending on the +// type of packet or alignment we get. Like the ip_fast_csum() routine +// where we know we have at least 20bytes worth of data to checksum. +// - Look at RFCs about checksums to see whether or not we can do better +// +// - Do a better job of handling small packets. +// +#define saved_pfs r11 +#define hmask r16 +#define tmask r17 +#define first r18 +#define firstval r19 +#define firstoff r20 +#define last r21 +#define lastval r22 +#define lastoff r23 +#define saved_lc r24 +#define saved_pr r25 +#define tmp1 r26 +#define tmp2 r27 +#define tmp3 r28 +#define carry r29 + +#define buf in0 +#define len in1 + + + .text + .psr abi64 + .psr lsb + .lsb + +// unsigned long do_csum(unsigned char *buf,int len) + + .align 32 + .global do_csum + .proc do_csum +do_csum: + alloc saved_pfs=ar.pfs,2,8,0,8 + + .rotr p[4], result[3] + mov ret0=r0 // in case we have zero length + cmp4.lt p0,p6=r0,len // check for zero length or negative (32bit len) + ;; // avoid WAW on CFM + mov tmp3=0x7 // a temporary mask/value + add tmp1=buf,len // last byte's address +(p6) br.ret.spnt.few rp // return if true (hope we can avoid that) + + and firstoff=7,buf // how many bytes off for first element + tbit.nz p10,p0=buf,0 // is buf an odd address ? + mov hmask=-1 // intialize head mask + ;; + + andcm first=buf,tmp3 // 8byte aligned down address of first element + mov tmask=-1 // initialize tail mask + adds tmp2=-1,tmp1 // last-1 + ;; + and lastoff=7,tmp1 // how many bytes off for last element + andcm last=tmp2,tmp3 // address of word containing last byte + mov saved_pr=pr // preserve predicates (rotation) + ;; + sub tmp3=last,first // tmp3=distance from first to last + cmp.eq p8,p9=last,first // everything fits in one word ? + sub tmp1=8,lastoff // complement to lastoff + + ld8 firstval=[first],8 // load,ahead of time, "first" word + shl tmp2=firstoff,3 // number of bits + ;; + and tmp1=7, tmp1 // make sure that if tmp1==8 -> tmp1=0 + +(p9) ld8 lastval=[last] // load,ahead of time, "last" word, if needed +(p8) mov lastval=r0 // we don't need lastval if first==last + mov result[1]=r0 // initialize result + ;; + + shl tmp1=tmp1,3 // number of bits + shl hmask=hmask,tmp2 // build head mask, mask off [0,firstoff[ + ;; + shr.u tmask=tmask,tmp1 // build tail mask, mask off ]8,lastoff] + mov saved_lc=ar.lc // save lc + ;; +(p8) and hmask=hmask,tmask // apply tail mask to head mask if 1 word only +(p9) and p[1]=lastval,tmask // mask last it as appropriate + shr.u tmp3=tmp3,3 // we do 8 bytes per loop + ;; + cmp.lt p6,p7=2,tmp3 // tmp3 > 2 ? 
+ and p[2]=firstval,hmask // and mask it as appropriate + add tmp1=-2,tmp3 // -2 = -1 (br.ctop) -1 (last-first) + ;; + // XXX Fixme: not very nice initialization here + // + // Setup loop control registers: + // + // tmp3=0 (1 word) : lc=0, ec=2, p16=F + // tmp3=1 (2 words) : lc=0, ec=3, p16=F + // tmp3=2 (3 words) : lc=0, ec=4, p16=T + // tmp3>2 (4 or more): lc=tmp3-2, ec=4, p16=T + // + cmp.eq p8,p9=r0,tmp3 // tmp3 == 0 ? +(p6) mov ar.lc=tmp1 +(p7) mov ar.lc=0 + ;; + cmp.lt p6,p7=1,tmp3 // tmp3 > 1 ? +(p8) mov ar.ec=2 // we need the extra rotation on result[] +(p9) mov ar.ec=3 // hard not to set it twice sometimes + ;; + mov carry=r0 // initialize carry +(p6) mov ar.ec=4 +(p6) mov pr.rot=0xffffffffffff0000 // p16=T, p18=T + + cmp.ne p8,p0=r0,r0 // p8 is false + mov p[3]=r0 // make sure first compare fails +(p7) mov pr.rot=0xfffffffffffe0000 // p16=F, p18=T + ;; +1: +(p16) ld8 p[0]=[first],8 // load next +(p8) adds carry=1,carry // add carry on prev_prev_value +(p18) add result[0]=result[1],p[2] // new_res = prev_res + cur_val + cmp.ltu p8,p0=result[1],p[3] // p8= prev_result < prev_val + br.ctop.dptk.few 1b // loop until lc--==0 + ;; // RAW on carry when loop exits + (p8) adds carry=1,carry;; // correct for carry on prev_value + add result[2]=carry,result[2];; // add carry to final result + cmp.ltu p6,p7=result[2], carry // check for new carry + ;; +(p6) adds result[2]=1,result[1] // correct if required + movl tmp3=0xffffffff + ;; + // XXX Fixme + // + // now fold 64 into 16 bits taking care of carry + // that's not very good because it has lots of sequentiality + // + and tmp1=result[2],tmp3 + shr.u tmp2=result[2],32 + ;; + add result[2]=tmp1,tmp2 + shr.u tmp3=tmp3,16 + ;; + and tmp1=result[2],tmp3 + shr.u tmp2=result[2],16 + ;; + add result[2]=tmp1,tmp2 + ;; + and tmp1=result[2],tmp3 + shr.u tmp2=result[2],16 + ;; + add result[2]=tmp1,tmp2 + ;; + and tmp1=result[2],tmp3 + shr.u tmp2=result[2],16 + ;; + add ret0=tmp1,tmp2 + mov pr=saved_pr,0xffffffffffff0000 + ;; + // if buf was odd then swap bytes + mov ar.pfs=saved_pfs // restore ar.ec +(p10) mux1 ret0=ret0,@rev // reverse word + ;; + mov ar.lc=saved_lc +(p10) shr.u ret0=ret0,64-16 // + shift back to position = swap bytes + br.ret.sptk.few rp diff --git a/arch/ia64/lib/flush.S b/arch/ia64/lib/flush.S new file mode 100644 index 000000000..0195ae5f5 --- /dev/null +++ b/arch/ia64/lib/flush.S @@ -0,0 +1,37 @@ +/* + * Cache flushing routines. + * + * Copyright (C) 1999 Hewlett-Packard Co + * Copyright (C) 1999 David Mosberger-Tang <davidm@hpl.hp.com> + */ +#include <asm/page.h> + + .text + .psr abi64 + .psr lsb + .lsb + + .align 16 + .global ia64_flush_icache_page + .proc ia64_flush_icache_page +ia64_flush_icache_page: + alloc r2=ar.pfs,1,0,0,0 + mov r3=ar.lc // save ar.lc + mov r8=PAGE_SIZE/64-1 // repeat/until loop + ;; + mov ar.lc=r8 + add r8=32,in0 + ;; +.Loop1: fc in0 // issuable on M0 only + add in0=64,in0 + fc r8 + add r8=64,r8 + br.cloop.sptk.few .Loop1 + ;; + sync.i + ;; + srlz.i + ;; + mov ar.lc=r3 // restore ar.lc + br.ret.sptk.few rp + .endp ia64_flush_icache_page diff --git a/arch/ia64/lib/idiv.S b/arch/ia64/lib/idiv.S new file mode 100644 index 000000000..a12097c94 --- /dev/null +++ b/arch/ia64/lib/idiv.S @@ -0,0 +1,158 @@ +/* + * Integer division routine. + * + * Copyright (C) 1999 Hewlett-Packard Co + * Copyright (C) 1999 David Mosberger-Tang <davidm@hpl.hp.com> + */ +/* Simple integer division. It uses the straight forward division + algorithm. 
This may not be the absolutely fastest way to do it, + but it's not horrible either. According to ski, the worst case + scenario of dividing 0xffffffffffffffff by 1 takes 133 cycles. + + An alternative would be to use an algorithm similar to the + floating point division algorithm (Newton-Raphson iteration), + but that approach is rather tricky (one has to be very careful + to get the last bit right...). + + While this algorithm is straight-forward, it does use a couple + of neat ia-64 specific tricks: + + - it uses the floating point unit to determine the initial + shift amount (shift = floor(ld(x)) - floor(ld(y))) + + - it uses predication to avoid a branch in the case where + x < y (this is what p8 is used for) + + - it uses rotating registers and the br.ctop branch to + implement a software-pipelined loop that's unrolled + twice (without any code expansion!) + + - the code is relatively well scheduled to avoid unnecessary + nops while maximizing parallelism +*/ + +#include <asm/break.h> + + .text + .psr abi64 +#ifdef __BIG_ENDIAN__ + .psr msb + .msb +#else + .psr lsb + .lsb +#endif + +#ifdef MODULO +# define OP mod +# define Q r9 +# define R r8 +#else +# define OP div +# define Q r8 +# define R r9 +#endif + +#ifdef SINGLE +# define PREC si +#else +# define PREC di +#endif + +#ifdef UNSIGNED +# define SGN u +# define INT_TO_FP(a,b) fma.s0 a=b,f1,f0 +# define FP_TO_INT(a,b) fcvt.fxu.trunc.s0 a=b +#else +# define SGN +# define INT_TO_FP(a,b) fcvt.xf a=b +# define FP_TO_INT(a,b) fcvt.fx.trunc.s0 a=b +#endif + +#define PASTE1(a,b) a##b +#define PASTE(a,b) PASTE1(a,b) +#define NAME PASTE(PASTE(__,SGN),PASTE(OP,PASTE(PREC,3))) + + .align 32 + .global NAME + .proc NAME +NAME: + + alloc r2=ar.pfs,2,6,0,8 + mov r18=pr +#ifdef SINGLE +# ifdef UNSIGNED + zxt4 in0=in0 + zxt4 in1=in1 +# else + sxt4 in0=in0 + sxt4 in1=in1 +# endif + ;; +#endif + +#ifndef UNSIGNED + cmp.lt p6,p0=in0,r0 // x negative? + cmp.lt p7,p0=in1,r0 // y negative? + ;; +(p6) sub in0=r0,in0 // make x positive +(p7) sub in1=r0,in1 // ditto for y + ;; +#endif + + setf.sig f8=in0 + mov r3=ar.lc // save ar.lc + setf.sig f9=in1 + ;; + mov Q=0 // initialize q + mov R=in0 // stash away x in a static register + mov r16=1 // r16 = 1 + INT_TO_FP(f8,f8) + cmp.eq p8,p0=0,in0 // x==0? + cmp.eq p9,p0=0,in1 // y==0? + ;; + INT_TO_FP(f9,f9) +(p8) br.dpnt.few .L3 +(p9) break __IA64_BREAK_KDB // attempted division by zero (should never happen) + mov ar.ec=r0 // epilogue count = 0 + ;; + getf.exp r14=f8 // r14 = exponent of x + getf.exp r15=f9 // r15 = exponent of y + mov ar.lc=r0 // loop count = 0 + ;; + sub r17=r14,r15 // r17 = (exp of x - exp y) = shift amount + cmp.ge p8,p0=r14,r15 + ;; + + .rotr y[2], mask[2] // in0 and in1 may no longer be valid after + // the first write to a rotating register! 
+ +(p8) shl y[1]=in1,r17 // y[1] = y<<shift +(p8) shl mask[1]=r16,r17 // mask[1] = 1<<shift + +(p8) mov ar.lc=r17 // loop count = r17 + ;; +.L1: +(p8) cmp.geu.unc p9,p0=R,y[1]// p9 = (x >= y[1]) +(p8) shr.u mask[0]=mask[1],1 // prepare mask[0] and y[0] for next +(p8) shr.u y[0]=y[1],1 // iteration + ;; +(p9) sub R=R,y[1] // if (x >= y[1]), subtract y[1] from x +(p9) add Q=Q,mask[1] // and set corresponding bit in q (Q) + br.ctop.dptk.few .L1 // repeated unless ar.lc-- == 0 + ;; +.L2: +#ifndef UNSIGNED +# ifdef MODULO +(p6) sub R=r0,R // set sign of remainder according to x +# else +(p6) sub Q=r0,Q // set sign of quotient + ;; +(p7) sub Q=r0,Q +# endif +#endif +.L3: + mov ar.pfs=r2 // restore ar.pfs + mov ar.lc=r3 // restore ar.lc + mov pr=r18,0xffffffffffff0000 // restore p16-p63 + br.ret.sptk.few rp diff --git a/arch/ia64/lib/memset.S b/arch/ia64/lib/memset.S new file mode 100644 index 000000000..595720a2d --- /dev/null +++ b/arch/ia64/lib/memset.S @@ -0,0 +1,111 @@ +/* + * + * Optimized version of the standard memset() function + * + * Return: none + * + * + * Inputs: + * in0: address of buffer + * in1: byte value to use for storing + * in2: length of the buffer + * + * Copyright (C) 1999 Hewlett-Packard Co + * Copyright (C) 1999 Stephane Eranian <eranian@hpl.hp.com> + */ + + +// arguments +// +#define buf r32 +#define val r33 +#define len r34 + +// +// local registers +// +#define saved_pfs r14 +#define cnt r18 +#define buf2 r19 +#define saved_lc r20 +#define saved_pr r21 +#define tmp r22 + + .text + .psr abi64 + .psr lsb + + .align 16 + .global memset + .proc memset + +memset: + alloc saved_pfs=ar.pfs,3,0,0,0 // cnt is sink here + cmp.eq p8,p0=r0,len // check for zero length + mov saved_lc=ar.lc // preserve ar.lc (slow) + ;; + adds tmp=-1,len // br.ctop is repeat/until + tbit.nz p6,p0=buf,0 // odd alignment +(p8) br.ret.spnt.few rp + + cmp.lt p7,p0=16,len // if len > 16 then long memset + mux1 val=val,@brcst // prepare value +(p7) br.cond.dptk.few long_memset + ;; + mov ar.lc=tmp // initialize lc for small count + ;; // avoid RAW and WAW on ar.lc +1: // worst case 15 cyles, avg 8 cycles + st1 [buf]=val,1 + br.cloop.dptk.few 1b + ;; // avoid RAW on ar.lc + mov ar.lc=saved_lc + mov ar.pfs=saved_pfs + br.ret.sptk.few rp // end of short memset + + // at this point we know we have more than 16 bytes to copy + // so we focus on alignment +long_memset: +(p6) st1 [buf]=val,1 // 1-byte aligned +(p6) adds len=-1,len;; // sync because buf is modified + tbit.nz p6,p0=buf,1 + ;; +(p6) st2 [buf]=val,2 // 2-byte aligned +(p6) adds len=-2,len;; + tbit.nz p6,p0=buf,2 + ;; +(p6) st4 [buf]=val,4 // 4-byte aligned +(p6) adds len=-4,len;; + tbit.nz p6,p0=buf,3 + ;; +(p6) st8 [buf]=val,8 // 8-byte aligned +(p6) adds len=-8,len;; + shr.u cnt=len,4 // number of 128-bit (2x64bit) words + ;; + cmp.eq p6,p0=r0,cnt + adds tmp=-1,cnt +(p6) br.cond.dpnt.few .dotail // we have less than 16 bytes left + ;; + adds buf2=8,buf // setup second base pointer + mov ar.lc=tmp + ;; +2: // 16bytes/iteration + st8 [buf]=val,16 + st8 [buf2]=val,16 + br.cloop.dptk.few 2b + ;; +.dotail: // tail correction based on len only + tbit.nz p6,p0=len,3 + ;; +(p6) st8 [buf]=val,8 // at least 8 bytes + tbit.nz p6,p0=len,2 + ;; +(p6) st4 [buf]=val,4 // at least 4 bytes + tbit.nz p6,p0=len,1 + ;; +(p6) st2 [buf]=val,2 // at least 2 bytes + tbit.nz p6,p0=len,0 + mov ar.lc=saved_lc + ;; +(p6) st1 [buf]=val // only 1 byte left + br.ret.dptk.few rp + .endp diff --git a/arch/ia64/lib/strlen.S b/arch/ia64/lib/strlen.S new file mode 100644 index 
000000000..3062716b1 --- /dev/null +++ b/arch/ia64/lib/strlen.S @@ -0,0 +1,197 @@ +/* + * + * Optimized version of the standard strlen() function + * + * + * Inputs: + * in0 address of string + * + * Outputs: + * ret0 the number of characters in the string (0 if empty string) + * does not count the \0 + * + * Copyright (C) 1999 Hewlett-Packard Co + * Copyright (C) 1999 Stephane Eranian <eranian@hpl.hp.com> + * + * 09/24/99 S.Eranian add speculation recovery code + */ + +// +// +// This is an enhanced version of the basic strlen. it includes a combination +// of compute zero index (czx), parallel comparisons, speculative loads and +// loop unroll using rotating registers. +// +// General Ideas about the algorithm: +// The goal is to look at the string in chunks of 8 bytes. +// so we need to do a few extra checks at the beginning because the +// string may not be 8-byte aligned. In this case we load the 8byte +// quantity which includes the start of the string and mask the unused +// bytes with 0xff to avoid confusing czx. +// We use speculative loads and software pipelining to hide memory +// latency and do read ahead safely. This way we defer any exception. +// +// Because we don't want the kernel to be relying on particular +// settings of the DCR register, we provide recovery code in case +// speculation fails. The recovery code is going to "redo" the work using +// only normal loads. If we still get a fault then we generate a +// kernel panic. Otherwise we return the strlen as usual. +// +// The fact that speculation may fail can be caused, for instance, by +// the DCR.dm bit being set. In this case TLB misses are deferred, i.e., +// a NaT bit will be set if the translation is not present. The normal +// load, on the other hand, will cause the translation to be inserted +// if the mapping exists. +// +// It should be noted that we execute recovery code only when we need +// to use the data that has been speculatively loaded: we don't execute +// recovery code on pure read ahead data. +// +// Remarks: +// - the cmp r0,r0 is used as a fast way to initialize a predicate +// register to 1. This is required to make sure that we get the parallel +// compare correct. +// +// - we don't use the epilogue counter to exit the loop but we need to set +// it to zero beforehand. +// +// - after the loop we must test for Nat values because neither the +// czx nor cmp instruction raise a NaT consumption fault. We must be +// careful not to look too far for a Nat for which we don't care. +// For instance we don't need to look at a NaT in val2 if the zero byte +// was in val1. +// +// - Clearly performance tuning is required. 
+// +// +// +#define saved_pfs r11 +#define tmp r10 +#define base r16 +#define orig r17 +#define saved_pr r18 +#define src r19 +#define mask r20 +#define val r21 +#define val1 r22 +#define val2 r23 + + + .text + .psr abi64 + .psr lsb + .lsb + + .align 32 + .global strlen + .proc strlen +strlen: + alloc saved_pfs=ar.pfs,11,0,0,8 // rotating must be multiple of 8 + + .rotr v[2], w[2] // declares our 4 aliases + + extr.u tmp=in0,0,3 // tmp=least significant 3 bits + mov orig=in0 // keep trackof initial byte address + dep src=0,in0,0,3 // src=8byte-aligned in0 address + mov saved_pr=pr // preserve predicates (rotation) + ;; + ld8 v[1]=[src],8 // must not speculate: can fail here + shl tmp=tmp,3 // multiply by 8bits/byte + mov mask=-1 // our mask + ;; + ld8.s w[1]=[src],8 // speculatively load next + cmp.eq p6,p0=r0,r0 // sets p6 to true for cmp.and + sub tmp=64,tmp // how many bits to shift our mask on the right + ;; + shr.u mask=mask,tmp // zero enough bits to hold v[1] valuable part + mov ar.ec=r0 // clear epilogue counter (saved in ar.pfs) + ;; + add base=-16,src // keep track of aligned base + or v[1]=v[1],mask // now we have a safe initial byte pattern + ;; +1: + ld8.s v[0]=[src],8 // speculatively load next + czx1.r val1=v[1] // search 0 byte from right + czx1.r val2=w[1] // search 0 byte from right following 8bytes + ;; + ld8.s w[0]=[src],8 // speculatively load next to next + cmp.eq.and p6,p0=8,val1 // p6 = p6 and val1==8 + cmp.eq.and p6,p0=8,val2 // p6 = p6 and mask==8 +(p6) br.wtop.dptk.few 1b // loop until p6 == 0 + ;; + // + // We must return try the recovery code iff + // val1_is_nat || (val1==8 && val2_is_nat) + // + // XXX Fixme + // - there must be a better way of doing the test + // + cmp.eq p8,p9=8,val1 // p6 = val1 had zero (disambiguate) +#ifdef notyet + tnat.nz p6,p7=val1 // test NaT on val1 +#else + tnat.z p7,p6=val1 // test NaT on val1 +#endif +(p6) br.cond.spnt.few recover// jump to recovery if val1 is NaT + ;; + // + // if we come here p7 is true, i.e., initialized for // cmp + // + cmp.eq.and p7,p0=8,val1// val1==8? + tnat.nz.and p7,p0=val2 // test NaT if val2 +(p7) br.cond.spnt.few recover// jump to recovery if val2 is NaT + ;; +(p8) mov val1=val2 // the other test got us out of the loop +(p8) adds src=-16,src // correct position when 3 ahead +(p9) adds src=-24,src // correct position when 4 ahead + ;; + sub ret0=src,orig // distance from base + sub tmp=8,val1 // which byte in word + mov pr=saved_pr,0xffffffffffff0000 + ;; + sub ret0=ret0,tmp // adjust + mov ar.pfs=saved_pfs // because of ar.ec, restore no matter what + br.ret.sptk.few rp // end of normal execution + + // + // Outlined recovery code when speculation failed + // + // This time we don't use speculation and rely on the normal exception + // mechanism. that's why the loop is not as good as the previous one + // because read ahead is not possible + // + // IMPORTANT: + // Please note that in the case of strlen() as opposed to strlen_user() + // we don't use the exception mechanism, as this function is not + // supposed to fail. If that happens it means we have a bug and the + // code will cause of kernel fault. + // + // XXX Fixme + // - today we restart from the beginning of the string instead + // of trying to continue where we left off. 
+ // +recover: + ld8 val=[base],8 // will fail if unrecoverable fault + ;; + or val=val,mask // remask first bytes + cmp.eq p0,p6=r0,r0 // nullify first ld8 in loop + ;; + // + // ar.ec is still zero here + // +2: +(p6) ld8 val=[base],8 // will fail if unrecoverable fault + ;; + czx1.r val1=val // search 0 byte from right + ;; + cmp.eq p6,p0=8,val1 // val1==8 ? +(p6) br.wtop.dptk.few 2b // loop until p6 == 0 + sub ret0=base,orig // distance from base + sub tmp=8,val1 + mov pr=saved_pr,0xffffffffffff0000 + ;; + sub ret0=ret0,tmp // length=now - back -1 + mov ar.pfs=saved_pfs // because of ar.ec, restore no matter what + br.ret.sptk.few rp // end of sucessful recovery code + + .endp strlen diff --git a/arch/ia64/lib/strlen_user.S b/arch/ia64/lib/strlen_user.S new file mode 100644 index 000000000..8149dde8a --- /dev/null +++ b/arch/ia64/lib/strlen_user.S @@ -0,0 +1,213 @@ +/* + * Optimized version of the strlen_user() function + * + * Inputs: + * in0 address of buffer + * + * Outputs: + * ret0 0 in case of fault, strlen(buffer)+1 otherwise + * + * Copyright (C) 1998, 1999 Hewlett-Packard Co + * Copyright (C) 1998, 1999 David Mosberger-Tang <davidm@hpl.hp.com> + * Copyright (C) 1998, 1999 Stephane Eranian <eranian@hpl.hp.com> + * + * 01/19/99 S.Eranian heavily enhanced version (see details below) + * 09/24/99 S.Eranian added speculation recovery code + */ + +// +// int strlen_user(char *) +// ------------------------ +// Returns: +// - length of string + 1 +// - 0 in case an exception is raised +// +// This is an enhanced version of the basic strlen_user. it includes a +// combination of compute zero index (czx), parallel comparisons, speculative +// loads and loop unroll using rotating registers. +// +// General Ideas about the algorithm: +// The goal is to look at the string in chunks of 8 bytes. +// so we need to do a few extra checks at the beginning because the +// string may not be 8-byte aligned. In this case we load the 8byte +// quantity which includes the start of the string and mask the unused +// bytes with 0xff to avoid confusing czx. +// We use speculative loads and software pipelining to hide memory +// latency and do read ahead safely. This way we defer any exception. +// +// Because we don't want the kernel to be relying on particular +// settings of the DCR register, we provide recovery code in case +// speculation fails. The recovery code is going to "redo" the work using +// only normal loads. If we still get a fault then we return an +// error (ret0=0). Otherwise we return the strlen+1 as usual. +// The fact that speculation may fail can be caused, for instance, by +// the DCR.dm bit being set. In this case TLB misses are deferred, i.e., +// a NaT bit will be set if the translation is not present. The normal +// load, on the other hand, will cause the translation to be inserted +// if the mapping exists. +// +// It should be noted that we execute recovery code only when we need +// to use the data that has been speculatively loaded: we don't execute +// recovery code on pure read ahead data. +// +// Remarks: +// - the cmp r0,r0 is used as a fast way to initialize a predicate +// register to 1. This is required to make sure that we get the parallel +// compare correct. +// +// - we don't use the epilogue counter to exit the loop but we need to set +// it to zero beforehand. +// +// - after the loop we must test for Nat values because neither the +// czx nor cmp instruction raise a NaT consumption fault. 
We must be +// careful not to look too far for a Nat for which we don't care. +// For instance we don't need to look at a NaT in val2 if the zero byte +// was in val1. +// +// - Clearly performance tuning is required. +// +// +// + +#define EX(y,x...) \ + .section __ex_table,"a"; \ + data4 @gprel(99f); \ + data4 y-99f; \ + .previous; \ +99: x + +#define saved_pfs r11 +#define tmp r10 +#define base r16 +#define orig r17 +#define saved_pr r18 +#define src r19 +#define mask r20 +#define val r21 +#define val1 r22 +#define val2 r23 + + + .text + .psr abi64 + .psr lsb + .lsb + + .align 32 + .global __strlen_user + .proc __strlen_user +__strlen_user: + alloc saved_pfs=ar.pfs,11,0,0,8 + + .rotr v[2], w[2] // declares our 4 aliases + + extr.u tmp=in0,0,3 // tmp=least significant 3 bits + mov orig=in0 // keep trackof initial byte address + dep src=0,in0,0,3 // src=8byte-aligned in0 address + mov saved_pr=pr // preserve predicates (rotation) + ;; + ld8.s v[1]=[src],8 // load the initial 8bytes (must speculate) + shl tmp=tmp,3 // multiply by 8bits/byte + mov mask=-1 // our mask + ;; + ld8.s w[1]=[src],8 // load next 8 bytes in 2nd pipeline + cmp.eq p6,p0=r0,r0 // sets p6 (required because of // cmp.and) + sub tmp=64,tmp // how many bits to shift our mask on the right + ;; + shr.u mask=mask,tmp // zero enough bits to hold v[1] valuable part + mov ar.ec=r0 // clear epilogue counter (saved in ar.pfs) + ;; + add base=-16,src // keep track of aligned base + chk.s v[1], recover // if already NaT, then directly skip to recover + or v[1]=v[1],mask // now we have a safe initial byte pattern + ;; +1: + ld8.s v[0]=[src],8 // speculatively load next + czx1.r val1=v[1] // search 0 byte from right + czx1.r val2=w[1] // search 0 byte from right following 8bytes + ;; + ld8.s w[0]=[src],8 // speculatively load next to next + cmp.eq.and p6,p0=8,val1 // p6 = p6 and val1==8 + cmp.eq.and p6,p0=8,val2 // p6 = p6 and mask==8 +(p6) br.wtop.dptk.few 1b // loop until p6 == 0 + ;; + // + // We must return try the recovery code iff + // val1_is_nat || (val1==8 && val2_is_nat) + // + // XXX Fixme + // - there must be a better way of doing the test + // + cmp.eq p8,p9=8,val1 // p6 = val1 had zero (disambiguate) +#ifdef notyet + tnat.nz p6,p7=val1 // test NaT on val1 +#else + tnat.z p7,p6=val1 // test NaT on val1 +#endif +(p6) br.cond.spnt.few recover// jump to recovery if val1 is NaT + ;; + // + // if we come here p7 is true, i.e., initialized for // cmp + // + cmp.eq.and p7,p0=8,val1// val1==8? + tnat.nz.and p7,p0=val2 // test NaT if val2 +(p7) br.cond.spnt.few recover// jump to recovery if val2 is NaT + ;; +(p8) mov val1=val2 // val2 contains the value +(p8) adds src=-16,src // correct position when 3 ahead +(p9) adds src=-24,src // correct position when 4 ahead + ;; + sub ret0=src,orig // distance from origin + sub tmp=7,val1 // 7=8-1 because this strlen returns strlen+1 + mov pr=saved_pr,0xffffffffffff0000 + ;; + sub ret0=ret0,tmp // length=now - back -1 + mov ar.pfs=saved_pfs // because of ar.ec, restore no matter what + br.ret.sptk.few rp // end of normal execution + + // + // Outlined recovery code when speculation failed + // + // This time we don't use speculation and rely on the normal exception + // mechanism. that's why the loop is not as good as the previous one + // because read ahead is not possible + // + // XXX Fixme + // - today we restart from the beginning of the string instead + // of trying to continue where we left off. 
+ // +recover: + EX(.Lexit1, ld8 val=[base],8) // load the initial bytes + ;; + or val=val,mask // remask first bytes + cmp.eq p0,p6=r0,r0 // nullify first ld8 in loop + ;; + // + // ar.ec is still zero here + // +2: + EX(.Lexit1, (p6) ld8 val=[base],8) + ;; + czx1.r val1=val // search 0 byte from right + ;; + cmp.eq p6,p0=8,val1 // val1==8 ? +(p6) br.wtop.dptk.few 2b // loop until p6 == 0 + ;; + sub ret0=base,orig // distance from base + sub tmp=7,val1 // 7=8-1 because this strlen returns strlen+1 + mov pr=saved_pr,0xffffffffffff0000 + ;; + sub ret0=ret0,tmp // length=now - back -1 + mov ar.pfs=saved_pfs // because of ar.ec, restore no matter what + br.ret.sptk.few rp // end of sucessful recovery code + + // + // We failed even on the normal load (called from exception handler) + // +.Lexit1: + mov ret0=0 + mov pr=saved_pr,0xffffffffffff0000 + mov ar.pfs=saved_pfs // because of ar.ec, restore no matter what + br.ret.sptk.few rp + + .endp __strlen_user diff --git a/arch/ia64/lib/strncpy_from_user.S b/arch/ia64/lib/strncpy_from_user.S new file mode 100644 index 000000000..17f71f1a0 --- /dev/null +++ b/arch/ia64/lib/strncpy_from_user.S @@ -0,0 +1,53 @@ +/* + * Just like strncpy() except for the return value. If no fault occurs during + * the copying, the number of bytes copied is returned. If a fault occurs, + * -EFAULT is returned. + * + * Inputs: + * in0: address of destination buffer + * in1: address of string to be copied + * in2: length of buffer in bytes + * Outputs: + * r8: -EFAULT in case of fault or number of bytes copied if no fault + * + * Copyright (C) 1998, 1999 Hewlett-Packard Co + * Copyright (C) 1998, 1999 David Mosberger-Tang <davidm@hpl.hp.com> + */ + +#define EX(x...) \ +99: x; \ + .section __ex_table,"a"; \ + data4 @gprel(99b); \ + data4 .Lexit-99b; \ + .previous + + .text + .psr abi64 + .psr lsb + .lsb + + .align 32 + .global __strncpy_from_user + .proc __strncpy_from_user +__strncpy_from_user: + alloc r11=ar.pfs,3,0,0,0 + mov r9=in1 + add r10=in1,in2 + + // XXX braindead copy loop---this needs to be optimized +.Loop1: + EX(ld1 r8=[in1],1) + ;; + st1 [in0]=r8,1 + cmp.ltu p6,p0=in1,r10 + ;; +(p6) cmp.ne.and p6,p0=r8,r0 + ;; +(p6) br.cond.dpnt.few .Loop1 + +1: sub r8=in1,r9 // length of string (including NUL character) +.Lexit: + mov ar.pfs=r11 + br.ret.sptk.few rp + + .endp __strncpy_from_user diff --git a/arch/ia64/lib/strnlen_user.S b/arch/ia64/lib/strnlen_user.S new file mode 100644 index 000000000..c227a9003 --- /dev/null +++ b/arch/ia64/lib/strnlen_user.S @@ -0,0 +1,55 @@ +/* + * Returns 0 if exception before NUL or reaching the supplied limit (N), + * a value greater than N if the string is longer than the limit, else + * strlen. + * + * Inputs: + * in0: address of buffer + * in1: string length limit N + * Outputs: + * r8: 0 in case of fault, strlen(buffer)+1 otherwise + * + * Copyright (C) 1999 David Mosberger-Tang <davidm@hpl.hp.com> + */ + +/* If a fault occurs, r8 gets set to -EFAULT and r9 gets cleared. */ +#define EX(x...) 
\ + .section __ex_table,"a"; \ + data4 @gprel(99f); \ + data4 (.Lexit-99f)|1; \ + .previous \ +99: x; + + .text + .psr abi64 + .psr lsb + .lsb + + .align 32 + .global __strnlen_user + .proc __strnlen_user +__strnlen_user: + alloc r2=ar.pfs,2,0,0,0 + mov r16=ar.lc // preserve ar.lc + add r3=-1,in1 + ;; + mov ar.lc=r3 + mov r9=0 + + // XXX braindead strlen loop---this needs to be optimized +.Loop1: + EX(ld1 r8=[in0],1) + add r9=1,r9 + ;; + cmp.eq p6,p0=r8,r0 +(p6) br.dpnt.few .Lexit + br.cloop.dptk.few .Loop1 + + add r9=1,in1 // NUL not found---return N+1 + ;; +.Lexit: + mov r8=r9 + mov ar.lc=r16 // restore ar.lc + br.ret.sptk.few rp + + .endp __strnlen_user |
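
A closing note on idiv.S above: its header comment describes a straightforward shift-and-subtract division, with the initial shift derived from the floating-point exponents of the operands and the loop software-pipelined through rotating registers. Below is a plain-C sketch of the same shift-and-subtract idea, written as a textbook bit-at-a-time loop rather than the pre-aligned loop the assembly uses; udiv_sketch() is a made-up name for illustration and is not part of the patch.

```c
#include <stdio.h>

/*
 * Textbook restoring shift-and-subtract division: the same basic
 * scheme idiv.S implements, minus the FP-based pre-alignment,
 * predication and rotating-register software pipelining.
 * Assumes a 64-bit unsigned long, as on ia64.
 */
static unsigned long udiv_sketch(unsigned long x, unsigned long y,
                                 unsigned long *rem)
{
	unsigned long q = 0, r = 0;
	int bit;

	if (y == 0) {			/* idiv.S raises a kernel break here */
		*rem = x;
		return 0;
	}
	for (bit = 63; bit >= 0; bit--) {
		r = (r << 1) | ((x >> bit) & 1);  /* bring down the next bit of x */
		if (r >= y) {			  /* divisor fits: subtract it and */
			r -= y;			  /* record a 1 in the quotient    */
			q |= 1UL << bit;
		}
	}
	*rem = r;
	return q;
}

int main(void)
{
	unsigned long r, q = udiv_sketch(0xffffffffffffffffUL, 10UL, &r);
	printf("q=%lu r=%lu\n", q, r);	/* prints q=1844674407370955161 r=5 */
	return 0;
}
```

The signed and modulo variants that the Makefile builds from idiv.S (__divdi3, __moddi3, and friends) reduce to this unsigned case by negating negative operands up front and correcting the sign of the result afterwards, as the (p6)/(p7) fix-ups at .L2 in the assembly do.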