Diffstat (limited to 'arch/cris/lib')
-rw-r--r--  arch/cris/lib/Makefile       |  11
-rw-r--r--  arch/cris/lib/checksum.S     | 113
-rw-r--r--  arch/cris/lib/checksumcopy.S | 120
-rw-r--r--  arch/cris/lib/dmacopy.c      |  43
-rw-r--r--  arch/cris/lib/memset.c       | 245
-rw-r--r--  arch/cris/lib/old_checksum.c | 127
-rw-r--r--  arch/cris/lib/string.c       | 223
-rw-r--r--  arch/cris/lib/usercopy.c     | 501
8 files changed, 1383 insertions(+), 0 deletions(-)
diff --git a/arch/cris/lib/Makefile b/arch/cris/lib/Makefile new file mode 100644 index 000000000..6ede712e3 --- /dev/null +++ b/arch/cris/lib/Makefile @@ -0,0 +1,11 @@ +# +# Makefile for Etrax-specific library files.. +# + +.S.o: + $(CC) -D__ASSEMBLY__ $(AFLAGS) -traditional -c $< -o $*.o + +L_TARGET = lib.a +obj-y = checksum.o checksumcopy.o string.o usercopy.o memset.o + +include $(TOPDIR)/Rules.make diff --git a/arch/cris/lib/checksum.S b/arch/cris/lib/checksum.S new file mode 100644 index 000000000..4ee0daa0c --- /dev/null +++ b/arch/cris/lib/checksum.S @@ -0,0 +1,113 @@ + ;; $Id: checksum.S,v 1.1 2000/07/10 16:25:21 bjornw Exp $ + ;; A fast checksum routine using movem + ;; Copyright (c) 1998 Bjorn Wesen/Axis Communications AB + + ;; csum_partial(const unsigned char * buff, int len, unsigned int sum) + + .globl _csum_partial +_csum_partial: + + ;; check for breakeven length between movem and normal word looping versions + + cmpu.w 80,r11 + bcs no_movem + nop + + ;; need to save the registers we use below in the movem loop + ;; this overhead is why we have a check above for breakeven length + + subq 9*4,sp + movem r8,[sp] + + ;; do a movem checksum + + ;; r10 - src + ;; r11 - length + ;; r12 - checksum + + subq 10*4,r11 ; update length for the first loop + +mloop: movem [r10+],r9 ; read 10 longwords + + ;; perform dword checksumming on the 10 longwords + + add.d r0,r12 + ax + add.d r1,r12 + ax + add.d r2,r12 + ax + add.d r3,r12 + ax + add.d r4,r12 + ax + add.d r5,r12 + ax + add.d r6,r12 + ax + add.d r7,r12 + ax + add.d r8,r12 + ax + add.d r9,r12 + + ;; fold the carry into the checksum, to avoid having to loop the carry + ;; back into the top + + ax + addq 0,r12 + ax ; do it again, since we might have generated a carry + addq 0,r12 + + subq 10*4,r11 + bge mloop + nop + + addq 10*4,r11 ; compensate for last loop underflowing length + + ;; fold 32-bit checksum into a 16-bit checksum, to avoid carries below + + moveq -1,r1 ; put 0xffff in r1, faster than move.d 0xffff,r1 + lsrq 16,r1 + + move.d r12,r0 + lsrq 16,r0 ; r0 = checksum >> 16 + and.d r1,r12 ; checksum = checksum & 0xffff + add.d r0,r12 ; checksum += r0 + move.d r12,r0 ; do the same again, maybe we got a carry last add + lsrq 16,r0 + and.d r1,r12 + add.d r0,r12 + + movem [sp+],r8 ; restore regs + +no_movem: + cmpq 2,r11 + blt no_words + nop + + ;; checksum the rest of the words + + subq 2,r11 + +wloop: subq 2,r11 + bge wloop + addu.w [r10+],r12 + + addq 2,r11 + +no_words: + ;; see if we have one odd byte more + cmpq 1,r11 + beq do_byte + nop + ret + move.d r12, r10 + +do_byte: + ;; copy and checksum the last byte + addu.b [r10],r12 + ret + move.d r12, r10 + +
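[Editor's note] For readers not fluent in CRIS assembly: csum_partial() above computes the standard 16-bit ones'-complement partial checksum, with the ax/addq 0 pairs folding carries back into the accumulator. Below is a minimal portable C sketch of the same semantics; it is illustrative only (the name csum_partial_ref and the fold-at-the-end are mine; the kernel routine returns the running 32-bit sum and lets callers fold it).

	#include <stdint.h>
	#include <string.h>

	/* Reference semantics for csum_partial(): add the buffer as 16-bit
	 * little-endian words, let carries accumulate in the upper half,
	 * then fold.  Two fold rounds suffice, since the second add can
	 * carry at most once -- the same reason the movem loop does its
	 * "ax; addq 0" sequence twice. */
	static uint32_t csum_partial_ref(const unsigned char *buff, int len,
	                                 uint32_t sum)
	{
		while (len >= 2) {
			uint16_t w;
			memcpy(&w, buff, 2);	/* avoids unaligned 16-bit loads */
			sum += w;
			buff += 2;
			len -= 2;
		}
		if (len)			/* trailing odd byte, as in do_byte: */
			sum += *buff;

		sum = (sum & 0xffff) + (sum >> 16);
		sum = (sum & 0xffff) + (sum >> 16);
		return sum;
	}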
\ No newline at end of file diff --git a/arch/cris/lib/checksumcopy.S b/arch/cris/lib/checksumcopy.S new file mode 100644 index 000000000..eae9c7ace --- /dev/null +++ b/arch/cris/lib/checksumcopy.S @@ -0,0 +1,120 @@ + ;; $Id: checksumcopy.S,v 1.2 2000/08/08 16:57:31 bjornw Exp $ + ;; A fast checksum+copy routine using movem + ;; Copyright (c) 1998, 2000 Axis Communications AB + ;; + ;; Authors: Bjorn Wesen + ;; + ;; csum_partial_copy_nocheck(const char *src, char *dst, + ;; int len, unsigned int sum) + + .globl _csum_partial_copy_nocheck +_csum_partial_copy_nocheck: + + ;; check for breakeven length between movem and normal word looping versions + + cmpu.w 80,r12 + bcs no_movem + nop + + ;; need to save the registers we use below in the movem loop + ;; this overhead is why we have a check above for breakeven length + + subq 9*4,sp + movem r8,[sp] + + ;; do a movem copy and checksum + + ;; r10 - src + ;; r11 - dst + ;; r12 - length + ;; r13 - checksum + + subq 10*4,r12 ; update length for the first loop + +mloop: movem [r10+],r9 ; read 10 longwords + movem r9,[r11+] ; write 10 longwords + + ;; perform dword checksumming on the 10 longwords + + add.d r0,r13 + ax + add.d r1,r13 + ax + add.d r2,r13 + ax + add.d r3,r13 + ax + add.d r4,r13 + ax + add.d r5,r13 + ax + add.d r6,r13 + ax + add.d r7,r13 + ax + add.d r8,r13 + ax + add.d r9,r13 + + ;; fold the carry into the checksum, to avoid having to loop the carry + ;; back into the top + + ax + addq 0,r13 + + subq 10*4,r12 + bge mloop + nop + + addq 10*4,r12 ; compensate for last loop underflowing length + + ;; fold 32-bit checksum into a 16-bit checksum, to avoid carries below + + moveq -1,r1 ; put 0xffff in r1, faster than move.d 0xffff,r1 + lsrq 16,r1 + + move.d r13,r0 + lsrq 16,r0 ; r0 = checksum >> 16 + and.d r1,r13 ; checksum = checksum & 0xffff + add.d r0,r13 ; checksum += r0 + move.d r13,r0 ; do the same again, maybe we got a carry last add + lsrq 16,r0 + and.d r1,r13 + add.d r0,r13 + + movem [sp+],r8 ; restore regs + +no_movem: + cmpq 2,r12 + blt no_words + nop + + ;; copy and checksum the rest of the words + + subq 2,r12 + +wloop: move.w [r10+],r9 + addu.w r9,r13 + subq 2,r12 + bge wloop + move.w r9,[r11+] + + addq 2,r12 + +no_words: + ;; see if we have one odd byte more + cmpq 1,r12 + beq do_byte + nop + ret + move.d r13, r10 + +do_byte: + ;; copy and checksum the last byte + move.b [r10],r9 + addu.b r9,r13 + move.b r9,[r11] + ret + move.d r13, r10 + +
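[Editor's note] checksumcopy.S is the same loop fused with a copy: each 10-longword movem read is immediately written back out with a second movem, so the data is touched once for both the copy and the sum. A hedged C equivalent of csum_partial_copy_nocheck() follows ("nocheck" because no user-access validation is performed; the _ref helper name is illustrative).

	#include <stdint.h>
	#include <string.h>

	/* Copy len bytes from src to dst while checksumming them in the
	 * same pass.  As in the assembly, the 32-bit running sum is
	 * returned unfolded; callers fold it to 16 bits. */
	static uint32_t csum_copy_ref(const unsigned char *src,
	                              unsigned char *dst, int len,
	                              uint32_t sum)
	{
		while (len >= 2) {
			uint16_t w;
			memcpy(&w, src, 2);	/* one load feeds both... */
			memcpy(dst, &w, 2);	/* ...the copy and the sum */
			sum += w;
			src += 2;
			dst += 2;
			len -= 2;
		}
		if (len) {			/* odd trailing byte */
			*dst = *src;
			sum += *src;
		}
		return sum;
	}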
\ No newline at end of file diff --git a/arch/cris/lib/dmacopy.c b/arch/cris/lib/dmacopy.c new file mode 100644 index 000000000..318577a2d --- /dev/null +++ b/arch/cris/lib/dmacopy.c @@ -0,0 +1,43 @@ +/* $Id: dmacopy.c,v 1.1 2000/07/10 16:25:21 bjornw Exp $ + * + * memcpy for large blocks, using memory-memory DMA channels 6 and 7 in Etrax + */ + +#include <asm/svinto.h> +#include <asm/io.h> + +#define D(x) + +void *dma_memcpy(void *pdst, + const void *psrc, + unsigned int pn) +{ + static etrax_dma_descr indma, outdma; + + D(printk("dma_memcpy %d bytes... ", pn)); + +#if 0 + *R_GEN_CONFIG = genconfig_shadow = + (genconfig_shadow & ~0x3c0000) | + IO_STATE(R_GEN_CONFIG, dma6, intdma7) | + IO_STATE(R_GEN_CONFIG, dma7, intdma6); +#endif + indma.sw_len = outdma.sw_len = pn; + indma.ctrl = d_eol | d_eop; + outdma.ctrl = d_eol; + indma.buf = psrc; + outdma.buf = pdst; + + *R_DMA_CH6_FIRST = &indma; + *R_DMA_CH7_FIRST = &outdma; + *R_DMA_CH6_CMD = IO_STATE(R_DMA_CH6_CMD, cmd, start); + *R_DMA_CH7_CMD = IO_STATE(R_DMA_CH7_CMD, cmd, start); + + while(*R_DMA_CH7_CMD == 1) /* wait for completion */ ; + + D(printk("done\n")); + +} + + + diff --git a/arch/cris/lib/memset.c b/arch/cris/lib/memset.c new file mode 100644 index 000000000..2f9f3fe37 --- /dev/null +++ b/arch/cris/lib/memset.c @@ -0,0 +1,245 @@ +/*#************************************************************************#*/ +/*#-------------------------------------------------------------------------*/ +/*# */ +/*# FUNCTION NAME: memset() */ +/*# */ +/*# PARAMETERS: void* dst; Destination address. */ +/*# int c; Value of byte to write. */ +/*# int len; Number of bytes to write. */ +/*# */ +/*# RETURNS: dst. */ +/*# */ +/*# DESCRIPTION: Sets the memory dst of length len bytes to c, as standard. */ +/*# Framework taken from memcpy. This routine is */ +/*# very sensitive to compiler changes in register allocation. */ +/*# Should really be rewritten to avoid this problem. */ +/*# */ +/*#-------------------------------------------------------------------------*/ +/*# */ +/*# HISTORY */ +/*# */ +/*# DATE NAME CHANGES */ +/*# ---- ---- ------- */ +/*# 990713 HP Tired of watching this function (or */ +/*# really, the nonoptimized generic */ +/*# implementation) take up 90% of simulator */ +/*# output. Measurements needed. */ +/*# */ +/*#-------------------------------------------------------------------------*/ + +/* No, there's no macro saying 12*4, since it is "hard" to get it into + the asm in a good way. Thus better to expose the problem everywhere. + */ + +/* Assuming 1 cycle per dword written or read (ok, not really true), and + one per instruction, then 43+3*(n/48-1) <= 24+24*(n/48-1) + so n >= 45.7; n >= 0.9; we win on the first full 48-byte block to set. */ + +#define ZERO_BLOCK_SIZE (1*12*4) + +void *memset(void *pdst, + int c, + unsigned int plen) +{ + /* Ok. Now we want the parameters put in special registers. + Make sure the compiler is able to make something useful of this. */ + + register char *return_dst __asm__ ("r10") = pdst; + register int n __asm__ ("r12") = plen; + register int lc __asm__ ("r11") = c; + + /* Most apps use memset sanely. Only those memsetting about 3..4 + bytes or less get penalized compared to the generic implementation + - and that's not really sane use. */ + + /* Ugh. This is fragile at best. Check with newer GCC releases, if + they compile cascaded "x |= x << 8" sanely! 
*/ + __asm__("movu.b %0,r13\n\tlslq 8,r13\n\tmove.b %0,r13\n\tmove.d r13,%0\n\tlslq 16,r13\n\tor.d r13,%0" + : "=r" (lc) : "0" (lc) : "r13"); + + { + register char *dst __asm__ ("r13") = pdst; + + /* This is NONPORTABLE, but since this whole routine is */ + /* grossly nonportable that doesn't matter. */ + + if (((unsigned long) pdst & 3) != 0 + /* Oops! n=0 must be a legal call, regardless of alignment. */ + && n >= 3) + { + if ((unsigned long)dst & 1) + { + *dst = (char) lc; + n--; + dst++; + } + + if ((unsigned long)dst & 2) + { + *(short *)dst = lc; + n -= 2; + dst += 2; + } + } + + /* Now the fun part. For the threshold value of this, check the equation + above. */ + /* Decide which copying method to use. */ + if (n >= ZERO_BLOCK_SIZE) + { + /* For large copies we use 'movem' */ + + /* It is not optimal to tell the compiler about clobbering any + registers; that will move the saving/restoring of those registers + to the function prologue/epilogue, and make non-movem sizes + suboptimal. + + This method is not foolproof; it assumes that the "asm reg" + declarations at the beginning of the function really are used + here (beware: they may be moved to temporary registers). + This way, we do not have to save/move the registers around into + temporaries; we can safely use them straight away. + + If you want to check that the allocation was right; then + check the equalities in the first comment. It should say + "r13=r13, r12=r12, r11=r11" */ + __asm__ volatile (" + ;; Check that the following is true (same register names on + ;; both sides of equal sign, as in r8=r8): + ;; %0=r13, %1=r12, %4=r11 + ;; + ;; Save the registers we'll clobber in the movem process + ;; on the stack. Don't mention them to gcc, it will only be + ;; upset. + subq 11*4,sp + movem r10,[sp] + + move.d r11,r0 + move.d r11,r1 + move.d r11,r2 + move.d r11,r3 + move.d r11,r4 + move.d r11,r5 + move.d r11,r6 + move.d r11,r7 + move.d r11,r8 + move.d r11,r9 + move.d r11,r10 + + ;; Now we've got this: + ;; r13 - dst + ;; r12 - n + + ;; Update n for the first loop + subq 12*4,r12 +0: + subq 12*4,r12 + bge 0b + movem r11,[r13+] + + addq 12*4,r12 ;; compensate for last loop underflowing n + + ;; Restore registers from stack + movem [sp+],r10" + + /* Outputs */ : "=r" (dst), "=r" (n) + /* Inputs */ : "0" (dst), "1" (n), "r" (lc)); + + } + + /* Either we directly starts copying, using dword copying + in a loop, or we copy as much as possible with 'movem' + and then the last block (<44 bytes) is copied here. + This will work since 'movem' will have updated src,dst,n. */ + + while ( n >= 16 ) + { + *((long*)dst)++ = lc; + *((long*)dst)++ = lc; + *((long*)dst)++ = lc; + *((long*)dst)++ = lc; + n -= 16; + } + + /* A switch() is definitely the fastest although it takes a LOT of code. + * Particularly if you inline code this. 
+ */ + switch (n) + { + case 0: + break; + case 1: + *(char*)dst = (char) lc; + break; + case 2: + *(short*)dst = (short) lc; + break; + case 3: + *((short*)dst)++ = (short) lc; + *(char*)dst = (char) lc; + break; + case 4: + *((long*)dst)++ = lc; + break; + case 5: + *((long*)dst)++ = lc; + *(char*)dst = (char) lc; + break; + case 6: + *((long*)dst)++ = lc; + *(short*)dst = (short) lc; + break; + case 7: + *((long*)dst)++ = lc; + *((short*)dst)++ = (short) lc; + *(char*)dst = (char) lc; + break; + case 8: + *((long*)dst)++ = lc; + *((long*)dst)++ = lc; + break; + case 9: + *((long*)dst)++ = lc; + *((long*)dst)++ = lc; + *(char*)dst = (char) lc; + break; + case 10: + *((long*)dst)++ = lc; + *((long*)dst)++ = lc; + *(short*)dst = (short) lc; + break; + case 11: + *((long*)dst)++ = lc; + *((long*)dst)++ = lc; + *((short*)dst)++ = (short) lc; + *(char*)dst = (char) lc; + break; + case 12: + *((long*)dst)++ = lc; + *((long*)dst)++ = lc; + *((long*)dst)++ = lc; + break; + case 13: + *((long*)dst)++ = lc; + *((long*)dst)++ = lc; + *((long*)dst)++ = lc; + *(char*)dst = (char) lc; + break; + case 14: + *((long*)dst)++ = lc; + *((long*)dst)++ = lc; + *((long*)dst)++ = lc; + *(short*)dst = (short) lc; + break; + case 15: + *((long*)dst)++ = lc; + *((long*)dst)++ = lc; + *((long*)dst)++ = lc; + *((short*)dst)++ = (short) lc; + *(char*)dst = (char) lc; + break; + } + } + + return return_dst; /* destination pointer. */ +} /* memset() */ diff --git a/arch/cris/lib/old_checksum.c b/arch/cris/lib/old_checksum.c new file mode 100644 index 000000000..6035a48ae --- /dev/null +++ b/arch/cris/lib/old_checksum.c @@ -0,0 +1,127 @@ +/* $Id: old_checksum.c,v 1.1 2000/07/10 16:25:21 bjornw Exp $ + * + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * IP/TCP/UDP checksumming routines + * + * Authors: Jorge Cwik, <jorge@laser.satlink.net> + * Arnt Gulbrandsen, <agulbra@nvg.unit.no> + * Tom May, <ftom@netcom.com> + * Lots of code moved from tcp.c and ip.c; see those files + * for more names. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <net/checksum.h> + +#undef PROFILE_CHECKSUM + +#ifdef PROFILE_CHECKSUM +/* these are just for profiling the checksum code with an oscillioscope.. uh */ +#if 0 +#define BITOFF *((unsigned char *)0xb0000030) = 0xff +#define BITON *((unsigned char *)0xb0000030) = 0x0 +#endif +#include <asm/io.h> +#define CBITON LED_ACTIVE_SET(1) +#define CBITOFF LED_ACTIVE_SET(0) +#define BITOFF +#define BITON +#else +#define BITOFF +#define BITON +#define CBITOFF +#define CBITON +#endif + +/* + * computes a partial checksum, e.g. for TCP/UDP fragments + */ + +#include <asm/delay.h> + +unsigned int csum_partial(const unsigned char * buff, int len, unsigned int sum) +{ + /* + * Experiments with ethernet and slip connections show that buff + * is aligned on either a 2-byte or 4-byte boundary. 
+ */ + const unsigned char *endMarker = buff + len; + const unsigned char *marker = endMarker - (len % 16); +#if 0 + if((int)buff & 0x3) + printk("unaligned buff %p\n", buff); + __delay(900); /* extra delay of 90 us to test performance hit */ +#endif + BITON; + while (buff < marker) { + sum += *((unsigned short *)buff)++; + sum += *((unsigned short *)buff)++; + sum += *((unsigned short *)buff)++; + sum += *((unsigned short *)buff)++; + sum += *((unsigned short *)buff)++; + sum += *((unsigned short *)buff)++; + sum += *((unsigned short *)buff)++; + sum += *((unsigned short *)buff)++; + } + marker = endMarker - (len % 2); + while(buff < marker) { + sum += *((unsigned short *)buff)++; + } + if(endMarker - buff > 0) { + sum += *buff; /* add extra byte seperately */ + } + BITOFF; + return(sum); +} + +#if 0 + +/* + * copy while checksumming, otherwise like csum_partial + */ + +unsigned int csum_partial_copy(const unsigned char *src, unsigned char *dst, + int len, unsigned int sum) +{ + const unsigned char *endMarker; + const unsigned char *marker; + printk("csum_partial_copy len %d.\n", len); +#if 0 + if((int)src & 0x3) + printk("unaligned src %p\n", src); + if((int)dst & 0x3) + printk("unaligned dst %p\n", dst); + __delay(1800); /* extra delay of 90 us to test performance hit */ +#endif + endMarker = src + len; + marker = endMarker - (len % 16); + CBITON; + while(src < marker) { + sum += (*((unsigned short *)dst)++ = *((unsigned short *)src)++); + sum += (*((unsigned short *)dst)++ = *((unsigned short *)src)++); + sum += (*((unsigned short *)dst)++ = *((unsigned short *)src)++); + sum += (*((unsigned short *)dst)++ = *((unsigned short *)src)++); + sum += (*((unsigned short *)dst)++ = *((unsigned short *)src)++); + sum += (*((unsigned short *)dst)++ = *((unsigned short *)src)++); + sum += (*((unsigned short *)dst)++ = *((unsigned short *)src)++); + sum += (*((unsigned short *)dst)++ = *((unsigned short *)src)++); + } + marker = endMarker - (len % 2); + while(src < marker) { + sum += (*((unsigned short *)dst)++ = *((unsigned short *)src)++); + } + if(endMarker - src > 0) { + sum += (*dst = *src); /* add extra byte seperately */ + } + CBITOFF; + return(sum); +} + +#endif diff --git a/arch/cris/lib/string.c b/arch/cris/lib/string.c new file mode 100644 index 000000000..6218cad56 --- /dev/null +++ b/arch/cris/lib/string.c @@ -0,0 +1,223 @@ +/*#************************************************************************#*/ +/*#-------------------------------------------------------------------------*/ +/*# */ +/*# FUNCTION NAME: memcpy() */ +/*# */ +/*# PARAMETERS: void* dst; Destination address. */ +/*# void* src; Source address. */ +/*# int len; Number of bytes to copy. */ +/*# */ +/*# RETURNS: dst. */ +/*# */ +/*# DESCRIPTION: Copies len bytes of memory from src to dst. No guarantees */ +/*# about copying of overlapping memory areas. This routine is */ +/*# very sensitive to compiler changes in register allocation. */ +/*# Should really be rewritten to avoid this problem. */ +/*# */ +/*#-------------------------------------------------------------------------*/ +/*# */ +/*# HISTORY */ +/*# */ +/*# DATE NAME CHANGES */ +/*# ---- ---- ------- */ +/*# 941007 Kenny R Creation */ +/*# 941011 Kenny R Lots of optimizations and inlining. */ +/*# 941129 Ulf A Adapted for use in libc. */ +/*# 950216 HP N==0 forgotten if non-aligned src/dst. */ +/*# Added some optimizations. */ +/*# 001025 HP Make src and dst char *. Align dst to */ +/*# dword, not just word-if-both-src-and-dst- */ +/*# are-misaligned. 
*/ +/*# */ +/*#-------------------------------------------------------------------------*/ + +void *memcpy(void *pdst, + const void *psrc, + unsigned int pn) +{ + /* Ok. Now we want the parameters put in special registers. + Make sure the compiler is able to make something useful of this. + As it is now: r10 -> r13; r11 -> r11 (nop); r12 -> r12 (nop). + + If gcc was allright, it really would need no temporaries, and no + stack space to save stuff on. */ + + register void *return_dst __asm__ ("r10") = pdst; + register char *dst __asm__ ("r13") = pdst; + register const char *src __asm__ ("r11") = psrc; + register int n __asm__ ("r12") = pn; + + + /* When src is aligned but not dst, this makes a few extra needless + cycles. I believe it would take as many to check that the + re-alignment was unnecessary. */ + if (((unsigned long) dst & 3) != 0 + /* Don't align if we wouldn't copy more than a few bytes; so we + don't have to check further for overflows. */ + && n >= 3) + { + if ((unsigned long) dst & 1) + { + n--; + *(char*)dst = *(char*)src; + src++; + dst++; + } + + if ((unsigned long) dst & 2) + { + n -= 2; + *(short*)dst = *(short*)src; + src += 2; + dst += 2; + } + } + + /* Decide which copying method to use. */ + if (n >= 44*2) /* Break even between movem and + move16 is at 38.7*2, but modulo 44. */ + { + /* For large copies we use 'movem' */ + + /* It is not optimal to tell the compiler about clobbering any + registers; that will move the saving/restoring of those registers + to the function prologue/epilogue, and make non-movem sizes + suboptimal. + + This method is not foolproof; it assumes that the "asm reg" + declarations at the beginning of the function really are used + here (beware: they may be moved to temporary registers). + This way, we do not have to save/move the registers around into + temporaries; we can safely use them straight away. + + If you want to check that the allocation was right; then + check the equalities in the first comment. It should say + "r13=r13, r11=r11, r12=r12" */ + __asm__ volatile (" + ;; Check that the following is true (same register names on + ;; both sides of equal sign, as in r8=r8): + ;; %0=r13, %1=r11, %2=r12 + ;; + ;; Save the registers we'll use in the movem process + ;; on the stack. + subq 11*4,sp + movem r10,[sp] + + ;; Now we've got this: + ;; r11 - src + ;; r13 - dst + ;; r12 - n + + ;; Update n for the first loop + subq 44,r12 +0: + movem [r11+],r10 + subq 44,r12 + bge 0b + movem r10,[r13+] + + addq 44,r12 ;; compensate for last loop underflowing n + + ;; Restore registers from stack + movem [sp+],r10" + + /* Outputs */ : "=r" (dst), "=r" (src), "=r" (n) + /* Inputs */ : "0" (dst), "1" (src), "2" (n)); + + } + + /* Either we directly starts copying, using dword copying + in a loop, or we copy as much as possible with 'movem' + and then the last block (<44 bytes) is copied here. + This will work since 'movem' will have updated src,dst,n. */ + + while ( n >= 16 ) + { + *((long*)dst)++ = *((long*)src)++; + *((long*)dst)++ = *((long*)src)++; + *((long*)dst)++ = *((long*)src)++; + *((long*)dst)++ = *((long*)src)++; + n -= 16; + } + + /* A switch() is definitely the fastest although it takes a LOT of code. + * Particularly if you inline code this. 
+ */ + switch (n) + { + case 0: + break; + case 1: + *(char*)dst = *(char*)src; + break; + case 2: + *(short*)dst = *(short*)src; + break; + case 3: + *((short*)dst)++ = *((short*)src)++; + *(char*)dst = *(char*)src; + break; + case 4: + *((long*)dst)++ = *((long*)src)++; + break; + case 5: + *((long*)dst)++ = *((long*)src)++; + *(char*)dst = *(char*)src; + break; + case 6: + *((long*)dst)++ = *((long*)src)++; + *(short*)dst = *(short*)src; + break; + case 7: + *((long*)dst)++ = *((long*)src)++; + *((short*)dst)++ = *((short*)src)++; + *(char*)dst = *(char*)src; + break; + case 8: + *((long*)dst)++ = *((long*)src)++; + *((long*)dst)++ = *((long*)src)++; + break; + case 9: + *((long*)dst)++ = *((long*)src)++; + *((long*)dst)++ = *((long*)src)++; + *(char*)dst = *(char*)src; + break; + case 10: + *((long*)dst)++ = *((long*)src)++; + *((long*)dst)++ = *((long*)src)++; + *(short*)dst = *(short*)src; + break; + case 11: + *((long*)dst)++ = *((long*)src)++; + *((long*)dst)++ = *((long*)src)++; + *((short*)dst)++ = *((short*)src)++; + *(char*)dst = *(char*)src; + break; + case 12: + *((long*)dst)++ = *((long*)src)++; + *((long*)dst)++ = *((long*)src)++; + *((long*)dst)++ = *((long*)src)++; + break; + case 13: + *((long*)dst)++ = *((long*)src)++; + *((long*)dst)++ = *((long*)src)++; + *((long*)dst)++ = *((long*)src)++; + *(char*)dst = *(char*)src; + break; + case 14: + *((long*)dst)++ = *((long*)src)++; + *((long*)dst)++ = *((long*)src)++; + *((long*)dst)++ = *((long*)src)++; + *(short*)dst = *(short*)src; + break; + case 15: + *((long*)dst)++ = *((long*)src)++; + *((long*)dst)++ = *((long*)src)++; + *((long*)dst)++ = *((long*)src)++; + *((short*)dst)++ = *((short*)src)++; + *(char*)dst = *(char*)src; + break; + } + + return return_dst; /* destination pointer. */ +} /* memcpy() */ diff --git a/arch/cris/lib/usercopy.c b/arch/cris/lib/usercopy.c new file mode 100644 index 000000000..17eebf2ee --- /dev/null +++ b/arch/cris/lib/usercopy.c @@ -0,0 +1,501 @@ +/* + * User address space access functions. + * The non-inlined parts of asm-cris/uaccess.h are here. + * + * Copyright (C) 2000, Axis Communications AB. + * + * Written by Hans-Peter Nilsson. + * Pieces used from memcpy, originally by Kenny Ranerup long time ago. + */ + +#include <asm/uaccess.h> + +/* Asm:s have been tweaked (within the domain of correctness) to give + satisfactory results for "gcc version 2.96 20000427 (experimental)". + + Check regularly... + + Note that the PC saved at a bus-fault is the address *after* the + faulting instruction, which means the branch-target for instructions in + delay-slots for taken branches. Note also that the postincrement in + the instruction is performed regardless of bus-fault; the register is + seen updated in fault handlers. + + Oh, and on the code formatting issue, to whomever feels like "fixing + it" to Conformity: I'm too "lazy", but why don't you go ahead and "fix" + string.c too. I just don't think too many people will hack this file + for the code format to be an issue. */ + + +/* Copy to userspace. This is based on the memcpy used for + kernel-to-kernel copying; see "string.c". */ + +unsigned long +__copy_user (void *pdst, const void *psrc, unsigned long pn) +{ + /* We want the parameters put in special registers. + Make sure the compiler is able to make something useful of this. + As it is now: r10 -> r13; r11 -> r11 (nop); r12 -> r12 (nop). + + FIXME: Comment for old gcc version. Check. + If gcc was allright, it really would need no temporaries, and no + stack space to save stuff on. 
*/ + + register char *dst __asm__ ("r13") = pdst; + register const char *src __asm__ ("r11") = psrc; + register int n __asm__ ("r12") = pn; + register int retn __asm__ ("r10") = 0; + + + /* When src is aligned but not dst, this makes a few extra needless + cycles. I believe it would take as many to check that the + re-alignment was unnecessary. */ + if (((unsigned long) dst & 3) != 0 + /* Don't align if we wouldn't copy more than a few bytes; so we + don't have to check further for overflows. */ + && n >= 3) + { + if ((unsigned long) dst & 1) + { + __asm_copy_to_user_1 (dst, src, retn); + n--; + } + + if ((unsigned long) dst & 2) + { + __asm_copy_to_user_2 (dst, src, retn); + n -= 2; + } + } + + /* Decide which copying method to use. */ + if (n >= 44*2) /* Break even between movem and + move16 is at 38.7*2, but modulo 44. */ + { + /* For large copies we use 'movem'. */ + + /* It is not optimal to tell the compiler about clobbering any + registers; that will move the saving/restoring of those registers + to the function prologue/epilogue, and make non-movem sizes + suboptimal. + + This method is not foolproof; it assumes that the "asm reg" + declarations at the beginning of the function really are used + here (beware: they may be moved to temporary registers). + This way, we do not have to save/move the registers around into + temporaries; we can safely use them straight away. + + If you want to check that the allocation was right; then + check the equalities in the first comment. It should say + "r13=r13, r11=r11, r12=r12". */ + __asm__ volatile (" + ;; Check that the following is true (same register names on + ;; both sides of equal sign, as in r8=r8): + ;; %0=r13, %1=r11, %2=r12 %3=r10 + ;; + ;; Save the registers we'll use in the movem process + ;; on the stack. + subq 11*4,sp + movem r10,[sp] + + ;; Now we've got this: + ;; r11 - src + ;; r13 - dst + ;; r12 - n + + ;; Update n for the first loop + subq 44,r12 + +; Since the noted PC of a faulting instruction in a delay-slot of a taken +; branch, is that of the branch target, we actually point at the from-movem +; for this case. There is no ambiguity here; if there was a fault in that +; instruction (meaning a kernel oops), the faulted PC would be the address +; after *that* movem. + +0: + movem [r11+],r10 + subq 44,r12 + bge 0b + movem r10,[r13+] +1: + addq 44,r12 ;; compensate for last loop underflowing n + + ;; Restore registers from stack + movem [sp+],r10 +2: + .section .fixup,\"ax\" + +; To provide a correct count in r10 of bytes that failed to be copied, +; we jump back into the loop if the loop-branch was taken. There is no +; performance penalty for sany use; the program will segfault soon enough. + +3: + move.d [sp],r10 + addq 44,r10 + move.d r10,[sp] + jump 0b +4: + movem [sp+],r10 + addq 44,r10 + addq 44,r12 + jump 2b + + .previous + .section __ex_table,\"a\" + .dword 0b,3b + .dword 1b,4b + .previous" + + /* Outputs */ : "=r" (dst), "=r" (src), "=r" (n), "=r" (retn) + /* Inputs */ : "0" (dst), "1" (src), "2" (n), "3" (retn)); + + } + + /* Either we directly start copying, using dword copying in a loop, or + we copy as much as possible with 'movem' and then the last block (<44 + bytes) is copied here. This will work since 'movem' will have + updated SRC, DST and N. */ + + while (n >= 16) + { + __asm_copy_to_user_16 (dst, src, retn); + n -= 16; + } + + /* Having a separate by-four loops cuts down on cache footprint. + FIXME: Test with and without; increasing switch to be 0..15. 
*/ + while (n >= 4) + { + __asm_copy_to_user_4 (dst, src, retn); + n -= 4; + } + + switch (n) + { + case 0: + break; + case 1: + __asm_copy_to_user_1 (dst, src, retn); + break; + case 2: + __asm_copy_to_user_2 (dst, src, retn); + break; + case 3: + __asm_copy_to_user_3 (dst, src, retn); + break; + } + + return retn; +} + +/* Copy from user to kernel, zeroing the bytes that were inaccessible in + userland. */ + +unsigned long +__copy_user_zeroing (void *pdst, const void *psrc, unsigned long pn) +{ + /* We want the parameters put in special registers. + Make sure the compiler is able to make something useful of this. + As it is now: r10 -> r13; r11 -> r11 (nop); r12 -> r12 (nop). + + FIXME: Comment for old gcc version. Check. + If gcc was allright, it really would need no temporaries, and no + stack space to save stuff on. */ + + register char *dst __asm__ ("r13") = pdst; + register const char *src __asm__ ("r11") = psrc; + register int n __asm__ ("r12") = pn; + register int retn __asm__ ("r10") = 0; + + /* When src is aligned but not dst, this makes a few extra needless + cycles. I believe it would take as many to check that the + re-alignment was unnecessary. */ + if (((unsigned long) dst & 3) != 0 + /* Don't align if we wouldn't copy more than a few bytes; so we + don't have to check further for overflows. */ + && n >= 3) + { + if ((unsigned long) dst & 1) + { + __asm_copy_from_user_1 (dst, src, retn); + n--; + } + + if ((unsigned long) dst & 2) + { + __asm_copy_from_user_2 (dst, src, retn); + n -= 2; + } + } + + /* Decide which copying method to use. */ + if (n >= 44*2) /* Break even between movem and + move16 is at 38.7*2, but modulo 44. */ + { + /* For large copies we use 'movem' */ + + /* It is not optimal to tell the compiler about clobbering any + registers; that will move the saving/restoring of those registers + to the function prologue/epilogue, and make non-movem sizes + suboptimal. + + This method is not foolproof; it assumes that the "asm reg" + declarations at the beginning of the function really are used + here (beware: they may be moved to temporary registers). + This way, we do not have to save/move the registers around into + temporaries; we can safely use them straight away. + + If you want to check that the allocation was right; then + check the equalities in the first comment. It should say + "r13=r13, r11=r11, r12=r12" */ + __asm__ volatile (" + ;; Check that the following is true (same register names on + ;; both sides of equal sign, as in r8=r8): + ;; %0=r13, %1=r11, %2=r12 %3=r10 + ;; + ;; Save the registers we'll use in the movem process + ;; on the stack. + subq 11*4,sp + movem r10,[sp] + + ;; Now we've got this: + ;; r11 - src + ;; r13 - dst + ;; r12 - n + + ;; Update n for the first loop + subq 44,r12 +0: + movem [r11+],r10 +1: + subq 44,r12 + bge 0b + movem r10,[r13+] + + addq 44,r12 ;; compensate for last loop underflowing n + + ;; Restore registers from stack + movem [sp+],r10 + + .section .fixup,\"ax\" + +; To provide a correct count in r10 of bytes that failed to be copied, +; we jump back into the loop if the loop-branch was taken. +; There is no performance penalty; the program will segfault soon +; enough. 
+ +3: + move.d [sp],r10 + addq 44,r10 + move.d r10,[sp] + clear.d r0 + clear.d r1 + clear.d r2 + clear.d r3 + clear.d r4 + clear.d r5 + clear.d r6 + clear.d r7 + clear.d r8 + clear.d r9 + clear.d r10 + jump 1b + + .previous + .section __ex_table,\"a\" + .dword 1b,3b + .previous" + + /* Outputs */ : "=r" (dst), "=r" (src), "=r" (n), "=r" (retn) + /* Inputs */ : "0" (dst), "1" (src), "2" (n), "3" (retn)); + + } + + /* Either we directly start copying here, using dword copying in a loop, + or we copy as much as possible with 'movem' and then the last block + (<44 bytes) is copied here. This will work since 'movem' will have + updated src, dst and n. */ + + while (n >= 16) + { + __asm_copy_from_user_16 (dst, src, retn); + n -= 16; + } + + /* Having a separate by-four loops cuts down on cache footprint. + FIXME: Test with and without; increasing switch to be 0..15. */ + while (n >= 4) + { + __asm_copy_from_user_4 (dst, src, retn); + n -= 4; + } + + switch (n) + { + case 0: + break; + case 1: + __asm_copy_from_user_1 (dst, src, retn); + break; + case 2: + __asm_copy_from_user_2 (dst, src, retn); + break; + case 3: + __asm_copy_from_user_3 (dst, src, retn); + break; + } + + return retn; +} + +/* Zero userspace. */ + +unsigned long +__do_clear_user (void *pto, unsigned long pn) +{ + /* We want the parameters put in special registers. + Make sure the compiler is able to make something useful of this. + As it is now: r10 -> r13; r11 -> r11 (nop); r12 -> r12 (nop). + + FIXME: Comment for old gcc version. Check. + If gcc was allright, it really would need no temporaries, and no + stack space to save stuff on. */ + + register char *dst __asm__ ("r13") = pto; + register int n __asm__ ("r12") = pn; + register int retn __asm__ ("r10") = 0; + + + if (((unsigned long) dst & 3) != 0 + /* Don't align if we wouldn't copy more than a few bytes. */ + && n >= 3) + { + if ((unsigned long) dst & 1) + { + __asm_clear_1 (dst, retn); + n--; + } + + if ((unsigned long) dst & 2) + { + __asm_clear_2 (dst, retn); + n -= 2; + } + } + + /* Decide which copying method to use. + FIXME: This number is from the "ordinary" kernel memset. */ + if (n >= (1*48)) + { + /* For large clears we use 'movem' */ + + /* It is not optimal to tell the compiler about clobbering any + call-saved registers; that will move the saving/restoring of + those registers to the function prologue/epilogue, and make + non-movem sizes suboptimal. + + This method is not foolproof; it assumes that the "asm reg" + declarations at the beginning of the function really are used + here (beware: they may be moved to temporary registers). + This way, we do not have to save/move the registers around into + temporaries; we can safely use them straight away. + + If you want to check that the allocation was right; then + check the equalities in the first comment. It should say + something like "r13=r13, r11=r11, r12=r12". */ + __asm__ volatile (" + ;; Check that the following is true (same register names on + ;; both sides of equal sign, as in r8=r8): + ;; %0=r13, %1=r12 %2=r10 + ;; + ;; Save the registers we'll clobber in the movem process + ;; on the stack. Don't mention them to gcc, it will only be + ;; upset. 
+ subq 11*4,sp + movem r10,[sp] + + clear.d r0 + clear.d r1 + clear.d r2 + clear.d r3 + clear.d r4 + clear.d r5 + clear.d r6 + clear.d r7 + clear.d r8 + clear.d r9 + clear.d r10 + clear.d r11 + + ;; Now we've got this: + ;; r13 - dst + ;; r12 - n + + ;; Update n for the first loop + subq 12*4,r12 +0: + subq 12*4,r12 + bge 0b + movem r11,[r13+] +1: + addq 12*4,r12 ;; compensate for last loop underflowing n + + ;; Restore registers from stack + movem [sp+],r10 +2: + .section .fixup,\"ax\" +3: + move.d [sp],r10 + addq 12*4,r10 + move.d r10,[sp] + clear.d r10 + jump 0b + +4: + movem [sp+],r10 + addq 12*4,r10 + addq 12*4,r12 + jump 2b + + .previous + .section __ex_table,\"a\" + .dword 0b,3b + .dword 1b,4b + .previous" + + /* Outputs */ : "=r" (dst), "=r" (n), "=r" (retn) + /* Inputs */ : "0" (dst), "1" (n), "2" (retn) + /* Clobber */ : "r11"); + } + + while (n >= 16) + { + __asm_clear_16 (dst, retn); + n -= 16; + } + + /* Having a separate by-four loops cuts down on cache footprint. + FIXME: Test with and without; increasing switch to be 0..15. */ + while (n >= 4) + { + __asm_clear_4 (dst, retn); + n -= 4; + } + + switch (n) + { + case 0: + break; + case 1: + __asm_clear_1 (dst, retn); + break; + case 2: + __asm_clear_2 (dst, retn); + break; + case 3: + __asm_clear_3 (dst, retn); + break; + } + + return retn; +} |
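[Editor's note] What makes __copy_user, __copy_user_zeroing and __do_clear_user work is the .section __ex_table emitted in each asm: every ".dword fault_pc,fixup_pc" pair lets the bus-fault handler redirect a faulting userspace access to fixup code that computes the count of uncopied bytes instead of oopsing. A sketch of the lookup side is below, under the assumption of linker-collected section bounds; the real kernel keeps the table sorted and searches it via search_exception_table(), so the names and the linear scan here are illustrative only.

	#include <stdint.h>

	/* One entry per ".dword insn,fixup" pair collected into __ex_table. */
	struct exception_entry {
		uint32_t insn;	/* PC that may fault during a user access */
		uint32_t fixup;	/* address to resume at; its code fixes the count */
	};

	/* Hypothetical linker-provided bounds of the __ex_table section. */
	extern struct exception_entry __start___ex_table[];
	extern struct exception_entry __stop___ex_table[];

	/* From the bus-fault handler: if the faulting PC is guarded,
	 * return its fixup address; 0 means a genuine kernel fault. */
	static uint32_t search_fixup(uint32_t fault_pc)
	{
		struct exception_entry *e;

		for (e = __start___ex_table; e != __stop___ex_table; e++)
			if (e->insn == fault_pc)
				return e->fixup;
		return 0;
	}

Note the subtlety the header comment of usercopy.c calls out: the PC saved at a bus fault is the address *after* the faulting instruction, and for a faulting delay-slot instruction it is the branch target. That is why the fixup labels above (3:, 4:) jump back into the loop to reconstruct a correct residual count rather than simply resuming.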