/*
 * Optimized version of the standard do_csum() function
 *
 * Return: a 64bit quantity containing the 16bit Internet checksum
 *
 * Inputs:
 *	in0: address of buffer to checksum (char *)
 *	in1: length of the buffer (int)
 *
 * Copyright (C) 1999 Hewlett-Packard Co
 * Copyright (C) 1999 Stephane Eranian <eranian@hpl.hp.com>
 */

#include <asm/asmmacro.h>	// header name lost in extraction; provides GLOBAL_ENTRY/END/UNW — verify

//
// Theory of operations:
//	The goal is to go as quickly as possible to the point where
//	we can checksum 8 bytes/loop. Before reaching that point we must
//	take care of incorrect alignment of first byte.
//
//	The code hereafter also takes care of the "tail" part of the buffer
//	before entering the core loop, if any. The checksum is a sum so it
//	allows us to commute operations. So we do the "head" and "tail"
//	first to finish at full speed in the body. Once we get the head and
//	tail values, we feed them into the pipeline, very handy initialization.
//
//	Of course we deal with the special case where the whole buffer fits
//	into one 8 byte word. In this case we have only one entry in the pipeline.
//
//	We use a (3+1)-stage pipeline in the loop to account for possible
//	load latency and also to accommodate for head and tail.
//
//	The end of the function deals with folding the checksum from 64bits
//	down to 16bits taking care of the carry.
//
//	This version avoids synchronization in the core loop by also using a
//	pipeline for the accumulation of the checksum in result[].
//
//	 p[]
//	|---|
//	0| |	r32 : new value loaded in pipeline
//	|---|
//	1| |	r33 : in transit data
//	|---|
//	2| |	r34 : current value to add to checksum
//	|---|
//	3| |	r35 : previous value added to checksum (previous iteration)
//	|---|
//
//	 result[]
//	|---|
//	0| |	r36 : new checksum
//	|---|
//	1| |	r37 : previous value of checksum
//	|---|
//	2| |	r38 : final checksum when out of the loop (after 2 epilogue rots)
//	|---|
//
//
// NOT YET DONE:
//	- Take advantage of the MMI bandwidth to load more than 8byte per loop
//	  iteration
//	- use the lfetch instruction to augment the chances of the data being in
//	  the cache when we need it.
//	- Maybe another algorithm which would take care of the folding at the
//	  end in a different manner
//	- Work with people more knowledgeable than me on the network stack
//	  to figure out if we could not split the function depending on the
//	  type of packet or alignment we get. Like the ip_fast_csum() routine
//	  where we know we have at least 20bytes worth of data to checksum.
//	- Look at RFCs about checksums to see whether or not we can do better
//
//	- Do a better job of handling small packets.
//
//
// Symbolic names for the registers used below.  saved_* hold caller state
// that must be restored before returning; tmp1-3 are scratch; the rest
// describe the head/tail alignment bookkeeping.
//
#define saved_pfs	r11
#define hmask		r16	// mask selecting valid bytes of the first word
#define tmask		r17	// mask selecting valid bytes of the last word
#define first		r18	// 8-byte aligned address of first word
#define firstval	r19	// contents of first word
#define firstoff	r20	// byte offset of buf within its word (0..7)
#define last		r21	// 8-byte aligned address of last word
#define lastval		r22	// contents of last word
#define lastoff		r23	// byte offset of last byte within its word
#define saved_lc	r24
#define saved_pr	r25
#define tmp1		r26
#define tmp2		r27
#define tmp3		r28
#define carry		r29	// accumulated end-around carries from the loop

#define buf		in0
#define len		in1

	.text

	.psr	abi64
	.psr	lsb
	.lsb

// unsigned long do_csum(unsigned char *buf,int len)
//
// Computes the (unfolded-to-16-bit) Internet checksum of buf[0..len-1]
// and returns it in ret0.  Returns 0 for len <= 0.  Uses a 4-deep
// rotating-register pipeline (p[]) for the data and a second rotation
// (result[]) for the running sum, so the core loop needs no stop bit
// between the add and the next load.
GLOBAL_ENTRY(do_csum)
	UNW(.prologue)
	UNW(.save ar.pfs, saved_pfs)
	// 2 input regs, 8 local (all rotating: p[4] + result[3] + 1 spare), 0 out
	alloc saved_pfs=ar.pfs,2,8,0,8
	.rotr p[4], result[3]
	mov ret0=r0		// in case we have zero length
	cmp4.lt p0,p6=r0,len	// check for zero length or negative (32bit len)
	;;			// avoid WAW on CFM
	mov tmp3=0x7		// a temporary mask/value
	add tmp1=buf,len	// last byte's address
(p6)	br.ret.spnt.few rp	// return if true (hope we can avoid that)

	and firstoff=7,buf	// how many bytes off for first element
	tbit.nz p10,p0=buf,0	// is buf an odd address ? (checksum must be byte-swapped at the end if so)
	mov hmask=-1		// initialize head mask
	;;
	andcm first=buf,tmp3	// 8byte aligned down address of first element
	mov tmask=-1		// initialize tail mask
	adds tmp2=-1,tmp1	// last-1
	;;
	and lastoff=7,tmp1	// how many bytes off for last element
	andcm last=tmp2,tmp3	// address of word containing last byte
	UNW(.save pr, saved_pr)
	mov saved_pr=pr		// preserve predicates (rotation)
	;;
	sub tmp3=last,first	// tmp3=distance from first to last
	cmp.eq p8,p9=last,first	// everything fits in one word ?
	sub tmp1=8,lastoff	// complement to lastoff
	ld8 firstval=[first],8	// load,ahead of time, "first" word
	shl tmp2=firstoff,3	// number of bits
	;;
	and tmp1=7, tmp1	// make sure that if tmp1==8 -> tmp1=0
(p9)	ld8 lastval=[last]	// load,ahead of time, "last" word, if needed
(p8)	mov lastval=r0		// we don't need lastval if first==last
	mov result[1]=r0	// initialize result
	;;
	shl tmp1=tmp1,3		// number of bits
	shl hmask=hmask,tmp2	// build head mask, mask off [0,firstoff[
	;;
	shr.u tmask=tmask,tmp1	// build tail mask, mask off ]8,lastoff]
	UNW(.save ar.lc, saved_lc)
	mov saved_lc=ar.lc	// save lc
	;;
	UNW(.body)
(p8)	and hmask=hmask,tmask	// apply tail mask to head mask if 1 word only
(p9)	and p[1]=lastval,tmask	// mask last it as appropriate
	shr.u tmp3=tmp3,3	// we do 8 bytes per loop
	;;
	cmp.lt p6,p7=2,tmp3	// tmp3 > 2 ?
	and p[2]=firstval,hmask	// and mask it as appropriate
	add tmp1=-2,tmp3	// -2 = -1 (br.ctop) -1 (last-first)
	;;
	// XXX Fixme: not very nice initialization here
	//
	// Setup loop control registers:
	//
	// tmp3=0 (1 word)    : lc=0, ec=2, p16=F
	// tmp3=1 (2 words)   : lc=0, ec=3, p16=F
	// tmp3=2 (3 words)   : lc=0, ec=4, p16=T
	// tmp3>2 (4 or more) : lc=tmp3-2, ec=4, p16=T
	//
	cmp.eq p8,p9=r0,tmp3	// tmp3 == 0 ?
(p6)	mov ar.lc=tmp1
(p7)	mov ar.lc=0
	;;
	cmp.lt p6,p7=1,tmp3	// tmp3 > 1 ?
(p8)	mov ar.ec=2		// we need the extra rotation on result[]
(p9)	mov ar.ec=3		// hard not to set it twice sometimes
	;;
	mov carry=r0		// initialize carry
(p6)	mov ar.ec=4
(p6)	mov pr.rot=0xffffffffffff0000	// p16=T, p18=T
	cmp.ne p8,p0=r0,r0	// p8 is false
	mov p[3]=r0		// make sure first compare fails
(p7)	mov pr.rot=0xfffffffffffe0000	// p16=F, p18=T
	;;
	//
	// Core software-pipelined loop: one 8-byte load and one 64-bit add
	// per iteration.  The carry-out of each add is detected one
	// iteration later (unsigned compare of the previous result against
	// the previous addend) and accumulated into "carry", which keeps
	// the add and the load free of a stop bit between them.
	//
1:
(p16)	ld8 p[0]=[first],8	// load next
(p8)	adds carry=1,carry	// add carry on prev_prev_value
(p18)	add result[0]=result[1],p[2]	// new_res = prev_res + cur_val
	cmp.ltu p8,p0=result[1],p[3]	// p8= prev_result < prev_val
	br.ctop.dptk.few 1b	// loop until lc--==0
	;;			// RAW on carry when loop exits
(p8)	adds carry=1,carry;;	// correct for carry on prev_value
	add result[2]=carry,result[2];;	// add carry to final result
	cmp.ltu p6,p7=result[2], carry	// check for new carry
	;;
	// NOTE(review): the correction below reads result[1] (the pre-carry
	// rotated value), not result[2] — verify against the rotation state
	// at loop exit that this is intentional and not a typo for result[2].
(p6)	adds result[2]=1,result[1]	// correct if required
	movl tmp3=0xffffffff
	;;
	// XXX Fixme
	//
	// now fold 64 into 16 bits taking care of carry
	// that's not very good because it has lots of sequentiality
	//
	and tmp1=result[2],tmp3	// fold step 1: low32 + high32
	shr.u tmp2=result[2],32
	;;
	add result[2]=tmp1,tmp2
	shr.u tmp3=tmp3,16	// tmp3 becomes the 16-bit mask 0xffff
	;;
	and tmp1=result[2],tmp3	// fold step 2: low16 + next16
	shr.u tmp2=result[2],16
	;;
	add result[2]=tmp1,tmp2
	;;
	and tmp1=result[2],tmp3	// fold step 3: absorb carry out of step 2
	shr.u tmp2=result[2],16
	;;
	add result[2]=tmp1,tmp2
	;;
	and tmp1=result[2],tmp3	// fold step 4: final 16-bit value
	shr.u tmp2=result[2],16
	;;
	add ret0=tmp1,tmp2
	mov pr=saved_pr,0xffffffffffff0000	// restore predicates clobbered by rotation
	;;
	// if buf was odd then swap bytes
	mov ar.pfs=saved_pfs	// restore ar.ec
(p10)	mux1 ret0=ret0,@rev	// reverse word
	;;
	mov ar.lc=saved_lc
(p10)	shr.u ret0=ret0,64-16	// + shift back to position = swap bytes
	br.ret.sptk.few rp
END(do_csum)