/*
 * xor.c : Multiple Devices driver for Linux
 *
 * Copyright (C) 1996, 1997, 1998, 1999 Ingo Molnar, Matti Aarnio, Jakub Jelinek
 *
 *
 * optimized RAID-5 checksumming functions.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2, or (at your option)
 * any later version.
 *
 * You should have received a copy of the GNU General Public License
 * (for example /usr/src/linux/COPYING); if not, write to the Free
 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/*
 * NOTE: the header names after the '#include' directives were lost in this
 * copy of the file; the ones below are reconstructed and may not match the
 * original exactly.
 */
#include <linux/module.h>

#define BH_TRACE 0
#include <linux/module.h>
#include <linux/raid/md.h>
#ifdef __sparc_v9__
#include <asm/head.h>
#include <asm/asi.h>
#include <asm/visasm.h>
#endif

/*
 * we use the 'XOR function template' to register multiple xor
 * functions at runtime. The kernel measures their speed upon bootup
 * and decides which one to use. (compile-time registration is
 * not enough as certain CPU features like MMX can only be detected
 * at runtime)
 *
 * this architecture makes it pretty easy to add new routines
 * that are faster on certain CPUs, without killing other CPUs'
 * 'native' routines. The current routines are believed to be the
 * physically fastest ones on all CPUs tested, but feel free to
 * prove me wrong and add yet another routine =B-)
 * --mingo
 */

#define MAX_XOR_BLOCKS 5

#define XOR_ARGS (unsigned int count, struct buffer_head **bh_ptr)

typedef void (*xor_block_t) XOR_ARGS;
xor_block_t xor_block = NULL;

#ifndef __sparc_v9__

struct xor_block_template;

struct xor_block_template {
	char * name;
	xor_block_t xor_block;
	int speed;
	struct xor_block_template * next;
};

struct xor_block_template * xor_functions = NULL;

#define XORBLOCK_TEMPLATE(x) \
static void xor_block_##x XOR_ARGS; \
static struct xor_block_template t_xor_block_##x = \
	{ #x, xor_block_##x, 0, NULL }; \
static void xor_block_##x XOR_ARGS

#ifdef __i386__

#ifdef CONFIG_X86_XMM
/*
 * Cache avoiding checksumming functions utilizing KNI instructions
 * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo)
 */

XORBLOCK_TEMPLATE(pIII_kni)
{
	char xmm_save[16*4];
	int cr0;
	int lines = (bh_ptr[0]->b_size>>8);

	__asm__ __volatile__ (
		"movl %%cr0,%0		;\n\t"
		"clts			;\n\t"
		"movups %%xmm0,(%1)	;\n\t"
		"movups %%xmm1,0x10(%1)	;\n\t"
		"movups %%xmm2,0x20(%1)	;\n\t"
		"movups %%xmm3,0x30(%1)	;\n\t"
		: "=r" (cr0)
		: "r" (xmm_save)
		: "memory" );

#define OFFS(x) "8*("#x"*2)"
#define PF0(x) \
	"	prefetcht0  "OFFS(x)"(%1)		;\n"
#define LD(x,y) \
	"	movaps   "OFFS(x)"(%1), %%xmm"#y"	;\n"
#define ST(x,y) \
	"	movaps %%xmm"#y",   "OFFS(x)"(%1)	;\n"
#define PF1(x) \
	"	prefetchnta "OFFS(x)"(%2)		;\n"
#define PF2(x) \
	"	prefetchnta "OFFS(x)"(%3)		;\n"
#define PF3(x) \
	"	prefetchnta "OFFS(x)"(%4)		;\n"
#define PF4(x) \
	"	prefetchnta "OFFS(x)"(%5)		;\n"
#define PF5(x) \
	"	prefetchnta "OFFS(x)"(%6)		;\n"
#define XO1(x,y) \
	"	xorps   "OFFS(x)"(%2), %%xmm"#y"	;\n"
#define XO2(x,y) \
	"	xorps   "OFFS(x)"(%3), %%xmm"#y"	;\n"
#define XO3(x,y) \
	"	xorps   "OFFS(x)"(%4), %%xmm"#y"	;\n"
#define XO4(x,y) \
	"	xorps   "OFFS(x)"(%5), %%xmm"#y"	;\n"
#define XO5(x,y) \
	"	xorps   "OFFS(x)"(%6), %%xmm"#y"	;\n"

	switch(count) {
		case 2:
			__asm__ __volatile__ (
#undef BLOCK
#define BLOCK(i) \
		LD(i,0)					\
			LD(i+1,1)			\
		PF1(i)					\
				PF1(i+2)		\
				LD(i+2,2)		\
					LD(i+3,3)	\
		PF0(i+4)				\
				PF0(i+6)		\
		XO1(i,0)				\
			XO1(i+1,1)			\
				XO1(i+2,2)		\
					XO1(i+3,3)	\
		ST(i,0)					\
			ST(i+1,1)			\
				ST(i+2,2)		\
					ST(i+3,3)	\


		PF0(0)
				PF0(2)

	"	.align 32,0x90		;\n"
	" 1:				;\n"

		BLOCK(0)
		BLOCK(4)
		BLOCK(8)
		BLOCK(12)

	"	addl $256, %1		;\n"
	"	addl $256, %2		;\n"
	"	decl %0			;\n"
	"	jnz 1b			;\n"
	:
	: "r" (lines),
	  "r" (bh_ptr[0]->b_data),
"r" (bh_ptr[1]->b_data) : "memory" ); break; case 3: __asm__ __volatile__ ( #undef BLOCK #define BLOCK(i) \ PF1(i) \ PF1(i+2) \ LD(i,0) \ LD(i+1,1) \ LD(i+2,2) \ LD(i+3,3) \ PF2(i) \ PF2(i+2) \ PF0(i+4) \ PF0(i+6) \ XO1(i,0) \ XO1(i+1,1) \ XO1(i+2,2) \ XO1(i+3,3) \ XO2(i,0) \ XO2(i+1,1) \ XO2(i+2,2) \ XO2(i+3,3) \ ST(i,0) \ ST(i+1,1) \ ST(i+2,2) \ ST(i+3,3) \ PF0(0) PF0(2) " .align 32,0x90 ;\n" " 1: ;\n" BLOCK(0) BLOCK(4) BLOCK(8) BLOCK(12) " addl $256, %1 ;\n" " addl $256, %2 ;\n" " addl $256, %3 ;\n" " decl %0 ;\n" " jnz 1b ;\n" : : "r" (lines), "r" (bh_ptr[0]->b_data), "r" (bh_ptr[1]->b_data), "r" (bh_ptr[2]->b_data) : "memory" ); break; case 4: __asm__ __volatile__ ( #undef BLOCK #define BLOCK(i) \ PF1(i) \ PF1(i+2) \ LD(i,0) \ LD(i+1,1) \ LD(i+2,2) \ LD(i+3,3) \ PF2(i) \ PF2(i+2) \ XO1(i,0) \ XO1(i+1,1) \ XO1(i+2,2) \ XO1(i+3,3) \ PF3(i) \ PF3(i+2) \ PF0(i+4) \ PF0(i+6) \ XO2(i,0) \ XO2(i+1,1) \ XO2(i+2,2) \ XO2(i+3,3) \ XO3(i,0) \ XO3(i+1,1) \ XO3(i+2,2) \ XO3(i+3,3) \ ST(i,0) \ ST(i+1,1) \ ST(i+2,2) \ ST(i+3,3) \ PF0(0) PF0(2) " .align 32,0x90 ;\n" " 1: ;\n" BLOCK(0) BLOCK(4) BLOCK(8) BLOCK(12) " addl $256, %1 ;\n" " addl $256, %2 ;\n" " addl $256, %3 ;\n" " addl $256, %4 ;\n" " decl %0 ;\n" " jnz 1b ;\n" : : "r" (lines), "r" (bh_ptr[0]->b_data), "r" (bh_ptr[1]->b_data), "r" (bh_ptr[2]->b_data), "r" (bh_ptr[3]->b_data) : "memory" ); break; case 5: __asm__ __volatile__ ( #undef BLOCK #define BLOCK(i) \ PF1(i) \ PF1(i+2) \ LD(i,0) \ LD(i+1,1) \ LD(i+2,2) \ LD(i+3,3) \ PF2(i) \ PF2(i+2) \ XO1(i,0) \ XO1(i+1,1) \ XO1(i+2,2) \ XO1(i+3,3) \ PF3(i) \ PF3(i+2) \ XO2(i,0) \ XO2(i+1,1) \ XO2(i+2,2) \ XO2(i+3,3) \ PF4(i) \ PF4(i+2) \ PF0(i+4) \ PF0(i+6) \ XO3(i,0) \ XO3(i+1,1) \ XO3(i+2,2) \ XO3(i+3,3) \ XO4(i,0) \ XO4(i+1,1) \ XO4(i+2,2) \ XO4(i+3,3) \ ST(i,0) \ ST(i+1,1) \ ST(i+2,2) \ ST(i+3,3) \ PF0(0) PF0(2) " .align 32,0x90 ;\n" " 1: ;\n" BLOCK(0) BLOCK(4) BLOCK(8) BLOCK(12) " addl $256, %1 ;\n" " addl $256, %2 ;\n" " addl $256, %3 ;\n" " addl $256, %4 ;\n" " addl $256, %5 ;\n" " decl %0 ;\n" " jnz 1b ;\n" : : "r" (lines), "r" (bh_ptr[0]->b_data), "r" (bh_ptr[1]->b_data), "r" (bh_ptr[2]->b_data), "r" (bh_ptr[3]->b_data), "r" (bh_ptr[4]->b_data) : "memory"); break; } __asm__ __volatile__ ( "sfence ;\n\t" "movups (%1),%%xmm0 ;\n\t" "movups 0x10(%1),%%xmm1 ;\n\t" "movups 0x20(%1),%%xmm2 ;\n\t" "movups 0x30(%1),%%xmm3 ;\n\t" "movl %0,%%cr0 ;\n\t" : : "r" (cr0), "r" (xmm_save) : "memory" ); } #undef OFFS #undef LD #undef ST #undef PF0 #undef PF1 #undef PF2 #undef PF3 #undef PF4 #undef PF5 #undef XO1 #undef XO2 #undef XO3 #undef XO4 #undef XO5 #undef BLOCK #endif /* CONFIG_X86_XMM */ /* * high-speed RAID5 checksumming functions utilizing MMX instructions * Copyright (C) 1998 Ingo Molnar */ XORBLOCK_TEMPLATE(pII_mmx) { char fpu_save[108]; int lines = (bh_ptr[0]->b_size>>7); if (!(current->flags & PF_USEDFPU)) __asm__ __volatile__ ( " clts;\n"); __asm__ __volatile__ ( " fsave %0; fwait\n"::"m"(fpu_save[0]) ); #define LD(x,y) \ " movq 8*("#x")(%1), %%mm"#y" ;\n" #define ST(x,y) \ " movq %%mm"#y", 8*("#x")(%1) ;\n" #define XO1(x,y) \ " pxor 8*("#x")(%2), %%mm"#y" ;\n" #define XO2(x,y) \ " pxor 8*("#x")(%3), %%mm"#y" ;\n" #define XO3(x,y) \ " pxor 8*("#x")(%4), %%mm"#y" ;\n" #define XO4(x,y) \ " pxor 8*("#x")(%5), %%mm"#y" ;\n" switch(count) { case 2: __asm__ __volatile__ ( #undef BLOCK #define BLOCK(i) \ LD(i,0) \ LD(i+1,1) \ LD(i+2,2) \ LD(i+3,3) \ XO1(i,0) \ ST(i,0) \ XO1(i+1,1) \ ST(i+1,1) \ XO1(i+2,2) \ ST(i+2,2) \ XO1(i+3,3) \ ST(i+3,3) " .align 32,0x90 ;\n" " 1: ;\n" BLOCK(0) BLOCK(4) 
BLOCK(8) BLOCK(12) " addl $128, %1 ;\n" " addl $128, %2 ;\n" " decl %0 ;\n" " jnz 1b ;\n" : : "r" (lines), "r" (bh_ptr[0]->b_data), "r" (bh_ptr[1]->b_data) : "memory"); break; case 3: __asm__ __volatile__ ( #undef BLOCK #define BLOCK(i) \ LD(i,0) \ LD(i+1,1) \ LD(i+2,2) \ LD(i+3,3) \ XO1(i,0) \ XO1(i+1,1) \ XO1(i+2,2) \ XO1(i+3,3) \ XO2(i,0) \ ST(i,0) \ XO2(i+1,1) \ ST(i+1,1) \ XO2(i+2,2) \ ST(i+2,2) \ XO2(i+3,3) \ ST(i+3,3) " .align 32,0x90 ;\n" " 1: ;\n" BLOCK(0) BLOCK(4) BLOCK(8) BLOCK(12) " addl $128, %1 ;\n" " addl $128, %2 ;\n" " addl $128, %3 ;\n" " decl %0 ;\n" " jnz 1b ;\n" : : "r" (lines), "r" (bh_ptr[0]->b_data), "r" (bh_ptr[1]->b_data), "r" (bh_ptr[2]->b_data) : "memory"); break; case 4: __asm__ __volatile__ ( #undef BLOCK #define BLOCK(i) \ LD(i,0) \ LD(i+1,1) \ LD(i+2,2) \ LD(i+3,3) \ XO1(i,0) \ XO1(i+1,1) \ XO1(i+2,2) \ XO1(i+3,3) \ XO2(i,0) \ XO2(i+1,1) \ XO2(i+2,2) \ XO2(i+3,3) \ XO3(i,0) \ ST(i,0) \ XO3(i+1,1) \ ST(i+1,1) \ XO3(i+2,2) \ ST(i+2,2) \ XO3(i+3,3) \ ST(i+3,3) " .align 32,0x90 ;\n" " 1: ;\n" BLOCK(0) BLOCK(4) BLOCK(8) BLOCK(12) " addl $128, %1 ;\n" " addl $128, %2 ;\n" " addl $128, %3 ;\n" " addl $128, %4 ;\n" " decl %0 ;\n" " jnz 1b ;\n" : : "r" (lines), "r" (bh_ptr[0]->b_data), "r" (bh_ptr[1]->b_data), "r" (bh_ptr[2]->b_data), "r" (bh_ptr[3]->b_data) : "memory"); break; case 5: __asm__ __volatile__ ( #undef BLOCK #define BLOCK(i) \ LD(i,0) \ LD(i+1,1) \ LD(i+2,2) \ LD(i+3,3) \ XO1(i,0) \ XO1(i+1,1) \ XO1(i+2,2) \ XO1(i+3,3) \ XO2(i,0) \ XO2(i+1,1) \ XO2(i+2,2) \ XO2(i+3,3) \ XO3(i,0) \ XO3(i+1,1) \ XO3(i+2,2) \ XO3(i+3,3) \ XO4(i,0) \ ST(i,0) \ XO4(i+1,1) \ ST(i+1,1) \ XO4(i+2,2) \ ST(i+2,2) \ XO4(i+3,3) \ ST(i+3,3) " .align 32,0x90 ;\n" " 1: ;\n" BLOCK(0) BLOCK(4) BLOCK(8) BLOCK(12) " addl $128, %1 ;\n" " addl $128, %2 ;\n" " addl $128, %3 ;\n" " addl $128, %4 ;\n" " addl $128, %5 ;\n" " decl %0 ;\n" " jnz 1b ;\n" : : "g" (lines), "r" (bh_ptr[0]->b_data), "r" (bh_ptr[1]->b_data), "r" (bh_ptr[2]->b_data), "r" (bh_ptr[3]->b_data), "r" (bh_ptr[4]->b_data) : "memory"); break; } __asm__ __volatile__ ( " frstor %0;\n"::"m"(fpu_save[0]) ); if (!(current->flags & PF_USEDFPU)) stts(); } #undef LD #undef XO1 #undef XO2 #undef XO3 #undef XO4 #undef ST #undef BLOCK XORBLOCK_TEMPLATE(p5_mmx) { char fpu_save[108]; int lines = (bh_ptr[0]->b_size>>6); if (!(current->flags & PF_USEDFPU)) __asm__ __volatile__ ( " clts;\n"); __asm__ __volatile__ ( " fsave %0; fwait\n"::"m"(fpu_save[0]) ); switch(count) { case 2: __asm__ __volatile__ ( " .align 32,0x90 ;\n" " 1: ;\n" " movq (%1), %%mm0 ;\n" " movq 8(%1), %%mm1 ;\n" " pxor (%2), %%mm0 ;\n" " movq 16(%1), %%mm2 ;\n" " movq %%mm0, (%1) ;\n" " pxor 8(%2), %%mm1 ;\n" " movq 24(%1), %%mm3 ;\n" " movq %%mm1, 8(%1) ;\n" " pxor 16(%2), %%mm2 ;\n" " movq 32(%1), %%mm4 ;\n" " movq %%mm2, 16(%1) ;\n" " pxor 24(%2), %%mm3 ;\n" " movq 40(%1), %%mm5 ;\n" " movq %%mm3, 24(%1) ;\n" " pxor 32(%2), %%mm4 ;\n" " movq 48(%1), %%mm6 ;\n" " movq %%mm4, 32(%1) ;\n" " pxor 40(%2), %%mm5 ;\n" " movq 56(%1), %%mm7 ;\n" " movq %%mm5, 40(%1) ;\n" " pxor 48(%2), %%mm6 ;\n" " pxor 56(%2), %%mm7 ;\n" " movq %%mm6, 48(%1) ;\n" " movq %%mm7, 56(%1) ;\n" " addl $64, %1 ;\n" " addl $64, %2 ;\n" " decl %0 ;\n" " jnz 1b ;\n" : : "r" (lines), "r" (bh_ptr[0]->b_data), "r" (bh_ptr[1]->b_data) : "memory" ); break; case 3: __asm__ __volatile__ ( " .align 32,0x90 ;\n" " 1: ;\n" " movq (%1), %%mm0 ;\n" " movq 8(%1), %%mm1 ;\n" " pxor (%2), %%mm0 ;\n" " movq 16(%1), %%mm2 ;\n" " pxor 8(%2), %%mm1 ;\n" " pxor (%3), %%mm0 ;\n" " pxor 16(%2), %%mm2 ;\n" " movq %%mm0, (%1) 
;\n" " pxor 8(%3), %%mm1 ;\n" " pxor 16(%3), %%mm2 ;\n" " movq 24(%1), %%mm3 ;\n" " movq %%mm1, 8(%1) ;\n" " movq 32(%1), %%mm4 ;\n" " movq 40(%1), %%mm5 ;\n" " pxor 24(%2), %%mm3 ;\n" " movq %%mm2, 16(%1) ;\n" " pxor 32(%2), %%mm4 ;\n" " pxor 24(%3), %%mm3 ;\n" " pxor 40(%2), %%mm5 ;\n" " movq %%mm3, 24(%1) ;\n" " pxor 32(%3), %%mm4 ;\n" " pxor 40(%3), %%mm5 ;\n" " movq 48(%1), %%mm6 ;\n" " movq %%mm4, 32(%1) ;\n" " movq 56(%1), %%mm7 ;\n" " pxor 48(%2), %%mm6 ;\n" " movq %%mm5, 40(%1) ;\n" " pxor 56(%2), %%mm7 ;\n" " pxor 48(%3), %%mm6 ;\n" " pxor 56(%3), %%mm7 ;\n" " movq %%mm6, 48(%1) ;\n" " movq %%mm7, 56(%1) ;\n" " addl $64, %1 ;\n" " addl $64, %2 ;\n" " addl $64, %3 ;\n" " decl %0 ;\n" " jnz 1b ;\n" : : "r" (lines), "r" (bh_ptr[0]->b_data), "r" (bh_ptr[1]->b_data), "r" (bh_ptr[2]->b_data) : "memory" ); break; case 4: __asm__ __volatile__ ( " .align 32,0x90 ;\n" " 1: ;\n" " movq (%1), %%mm0 ;\n" " movq 8(%1), %%mm1 ;\n" " pxor (%2), %%mm0 ;\n" " movq 16(%1), %%mm2 ;\n" " pxor 8(%2), %%mm1 ;\n" " pxor (%3), %%mm0 ;\n" " pxor 16(%2), %%mm2 ;\n" " pxor 8(%3), %%mm1 ;\n" " pxor (%4), %%mm0 ;\n" " movq 24(%1), %%mm3 ;\n" " pxor 16(%3), %%mm2 ;\n" " pxor 8(%4), %%mm1 ;\n" " movq %%mm0, (%1) ;\n" " movq 32(%1), %%mm4 ;\n" " pxor 24(%2), %%mm3 ;\n" " pxor 16(%4), %%mm2 ;\n" " movq %%mm1, 8(%1) ;\n" " movq 40(%1), %%mm5 ;\n" " pxor 32(%2), %%mm4 ;\n" " pxor 24(%3), %%mm3 ;\n" " movq %%mm2, 16(%1) ;\n" " pxor 40(%2), %%mm5 ;\n" " pxor 32(%3), %%mm4 ;\n" " pxor 24(%4), %%mm3 ;\n" " movq %%mm3, 24(%1) ;\n" " movq 56(%1), %%mm7 ;\n" " movq 48(%1), %%mm6 ;\n" " pxor 40(%3), %%mm5 ;\n" " pxor 32(%4), %%mm4 ;\n" " pxor 48(%2), %%mm6 ;\n" " movq %%mm4, 32(%1) ;\n" " pxor 56(%2), %%mm7 ;\n" " pxor 40(%4), %%mm5 ;\n" " pxor 48(%3), %%mm6 ;\n" " pxor 56(%3), %%mm7 ;\n" " movq %%mm5, 40(%1) ;\n" " pxor 48(%4), %%mm6 ;\n" " pxor 56(%4), %%mm7 ;\n" " movq %%mm6, 48(%1) ;\n" " movq %%mm7, 56(%1) ;\n" " addl $64, %1 ;\n" " addl $64, %2 ;\n" " addl $64, %3 ;\n" " addl $64, %4 ;\n" " decl %0 ;\n" " jnz 1b ;\n" : : "r" (lines), "r" (bh_ptr[0]->b_data), "r" (bh_ptr[1]->b_data), "r" (bh_ptr[2]->b_data), "r" (bh_ptr[3]->b_data) : "memory" ); break; case 5: __asm__ __volatile__ ( " .align 32,0x90 ;\n" " 1: ;\n" " movq (%1), %%mm0 ;\n" " movq 8(%1), %%mm1 ;\n" " pxor (%2), %%mm0 ;\n" " pxor 8(%2), %%mm1 ;\n" " movq 16(%1), %%mm2 ;\n" " pxor (%3), %%mm0 ;\n" " pxor 8(%3), %%mm1 ;\n" " pxor 16(%2), %%mm2 ;\n" " pxor (%4), %%mm0 ;\n" " pxor 8(%4), %%mm1 ;\n" " pxor 16(%3), %%mm2 ;\n" " movq 24(%1), %%mm3 ;\n" " pxor (%5), %%mm0 ;\n" " pxor 8(%5), %%mm1 ;\n" " movq %%mm0, (%1) ;\n" " pxor 16(%4), %%mm2 ;\n" " pxor 24(%2), %%mm3 ;\n" " movq %%mm1, 8(%1) ;\n" " pxor 16(%5), %%mm2 ;\n" " pxor 24(%3), %%mm3 ;\n" " movq 32(%1), %%mm4 ;\n" " movq %%mm2, 16(%1) ;\n" " pxor 24(%4), %%mm3 ;\n" " pxor 32(%2), %%mm4 ;\n" " movq 40(%1), %%mm5 ;\n" " pxor 24(%5), %%mm3 ;\n" " pxor 32(%3), %%mm4 ;\n" " pxor 40(%2), %%mm5 ;\n" " movq %%mm3, 24(%1) ;\n" " pxor 32(%4), %%mm4 ;\n" " pxor 40(%3), %%mm5 ;\n" " movq 48(%1), %%mm6 ;\n" " movq 56(%1), %%mm7 ;\n" " pxor 32(%5), %%mm4 ;\n" " pxor 40(%4), %%mm5 ;\n" " pxor 48(%2), %%mm6 ;\n" " pxor 56(%2), %%mm7 ;\n" " movq %%mm4, 32(%1) ;\n" " pxor 48(%3), %%mm6 ;\n" " pxor 56(%3), %%mm7 ;\n" " pxor 40(%5), %%mm5 ;\n" " pxor 48(%4), %%mm6 ;\n" " pxor 56(%4), %%mm7 ;\n" " movq %%mm5, 40(%1) ;\n" " pxor 48(%5), %%mm6 ;\n" " pxor 56(%5), %%mm7 ;\n" " movq %%mm6, 48(%1) ;\n" " movq %%mm7, 56(%1) ;\n" " addl $64, %1 ;\n" " addl $64, %2 ;\n" " addl $64, %3 ;\n" " addl $64, %4 ;\n" " addl $64, %5 ;\n" 
" decl %0 ;\n" " jnz 1b ;\n" : : "g" (lines), "r" (bh_ptr[0]->b_data), "r" (bh_ptr[1]->b_data), "r" (bh_ptr[2]->b_data), "r" (bh_ptr[3]->b_data), "r" (bh_ptr[4]->b_data) : "memory" ); break; } __asm__ __volatile__ ( " frstor %0;\n"::"m"(fpu_save[0]) ); if (!(current->flags & PF_USEDFPU)) stts(); } #endif /* __i386__ */ #endif /* !__sparc_v9__ */ #ifdef __sparc_v9__ /* * High speed xor_block operation for RAID4/5 utilizing the * UltraSparc Visual Instruction Set. * * Copyright (C) 1997, 1999 Jakub Jelinek (jj@ultra.linux.cz) * * Requirements: * !(((long)dest | (long)sourceN) & (64 - 1)) && * !(len & 127) && len >= 256 * * It is done in pure assembly, as otherwise gcc makes it * a non-leaf function, which is not what we want. * Also, we don't measure the speeds as on other architectures, * as the measuring routine does not take into account cold caches * and the fact that xor_block_VIS bypasses the caches. * xor_block_32regs might be 5% faster for count 2 if caches are hot * and things just right (for count 3 VIS is about as fast as 32regs for * hot caches and for count 4 and 5 VIS is faster by good margin always), * but I think it is better not to pollute the caches. * Actually, if I'd just fight for speed for hot caches, I could * write a hybrid VIS/integer routine, which would do always two * 64B blocks in VIS and two in IEUs, but I really care more about * caches. */ extern void *VISenter(void); extern void xor_block_VIS XOR_ARGS; void __xor_block_VIS(void) { __asm__ (" .globl xor_block_VIS xor_block_VIS: ldx [%%o1 + 0], %%o4 ldx [%%o1 + 8], %%o3 ldx [%%o4 + %1], %%g5 ldx [%%o4 + %0], %%o4 ldx [%%o3 + %0], %%o3 rd %%fprs, %%o5 andcc %%o5, %2, %%g0 be,pt %%icc, 297f sethi %%hi(%5), %%g1 jmpl %%g1 + %%lo(%5), %%g7 add %%g7, 8, %%g7 297: wr %%g0, %4, %%fprs membar #LoadStore|#StoreLoad|#StoreStore sub %%g5, 64, %%g5 ldda [%%o4] %3, %%f0 ldda [%%o3] %3, %%f16 cmp %%o0, 4 bgeu,pt %%xcc, 10f cmp %%o0, 3 be,pn %%xcc, 13f mov -64, %%g1 sub %%g5, 64, %%g5 rd %%asi, %%g1 wr %%g0, %3, %%asi 2: ldda [%%o4 + 64] %%asi, %%f32 fxor %%f0, %%f16, %%f16 fxor %%f2, %%f18, %%f18 fxor %%f4, %%f20, %%f20 fxor %%f6, %%f22, %%f22 fxor %%f8, %%f24, %%f24 fxor %%f10, %%f26, %%f26 fxor %%f12, %%f28, %%f28 fxor %%f14, %%f30, %%f30 stda %%f16, [%%o4] %3 ldda [%%o3 + 64] %%asi, %%f48 ldda [%%o4 + 128] %%asi, %%f0 fxor %%f32, %%f48, %%f48 fxor %%f34, %%f50, %%f50 add %%o4, 128, %%o4 fxor %%f36, %%f52, %%f52 add %%o3, 128, %%o3 fxor %%f38, %%f54, %%f54 subcc %%g5, 128, %%g5 fxor %%f40, %%f56, %%f56 fxor %%f42, %%f58, %%f58 fxor %%f44, %%f60, %%f60 fxor %%f46, %%f62, %%f62 stda %%f48, [%%o4 - 64] %%asi bne,pt %%xcc, 2b ldda [%%o3] %3, %%f16 ldda [%%o4 + 64] %%asi, %%f32 fxor %%f0, %%f16, %%f16 fxor %%f2, %%f18, %%f18 fxor %%f4, %%f20, %%f20 fxor %%f6, %%f22, %%f22 fxor %%f8, %%f24, %%f24 fxor %%f10, %%f26, %%f26 fxor %%f12, %%f28, %%f28 fxor %%f14, %%f30, %%f30 stda %%f16, [%%o4] %3 ldda [%%o3 + 64] %%asi, %%f48 membar #Sync fxor %%f32, %%f48, %%f48 fxor %%f34, %%f50, %%f50 fxor %%f36, %%f52, %%f52 fxor %%f38, %%f54, %%f54 fxor %%f40, %%f56, %%f56 fxor %%f42, %%f58, %%f58 fxor %%f44, %%f60, %%f60 fxor %%f46, %%f62, %%f62 stda %%f48, [%%o4 + 64] %%asi membar #Sync|#StoreStore|#StoreLoad wr %%g0, 0, %%fprs retl wr %%g1, %%g0, %%asi 13: ldx [%%o1 + 16], %%o2 ldx [%%o2 + %0], %%o2 3: ldda [%%o2] %3, %%f32 fxor %%f0, %%f16, %%f48 fxor %%f2, %%f18, %%f50 add %%o4, 64, %%o4 fxor %%f4, %%f20, %%f52 fxor %%f6, %%f22, %%f54 add %%o3, 64, %%o3 fxor %%f8, %%f24, %%f56 fxor %%f10, %%f26, %%f58 fxor %%f12, %%f28, %%f60 fxor %%f14, 
%%f30, %%f62 ldda [%%o4] %3, %%f0 fxor %%f48, %%f32, %%f48 fxor %%f50, %%f34, %%f50 fxor %%f52, %%f36, %%f52 fxor %%f54, %%f38, %%f54 add %%o2, 64, %%o2 fxor %%f56, %%f40, %%f56 fxor %%f58, %%f42, %%f58 subcc %%g5, 64, %%g5 fxor %%f60, %%f44, %%f60 fxor %%f62, %%f46, %%f62 stda %%f48, [%%o4 + %%g1] %3 bne,pt %%xcc, 3b ldda [%%o3] %3, %%f16 ldda [%%o2] %3, %%f32 fxor %%f0, %%f16, %%f48 fxor %%f2, %%f18, %%f50 fxor %%f4, %%f20, %%f52 fxor %%f6, %%f22, %%f54 fxor %%f8, %%f24, %%f56 fxor %%f10, %%f26, %%f58 fxor %%f12, %%f28, %%f60 fxor %%f14, %%f30, %%f62 membar #Sync fxor %%f48, %%f32, %%f48 fxor %%f50, %%f34, %%f50 fxor %%f52, %%f36, %%f52 fxor %%f54, %%f38, %%f54 fxor %%f56, %%f40, %%f56 fxor %%f58, %%f42, %%f58 fxor %%f60, %%f44, %%f60 fxor %%f62, %%f46, %%f62 stda %%f48, [%%o4] %3 membar #Sync|#StoreStore|#StoreLoad retl wr %%g0, 0, %%fprs 10: cmp %%o0, 5 be,pt %%xcc, 15f mov -64, %%g1 14: ldx [%%o1 + 16], %%o2 ldx [%%o1 + 24], %%o0 ldx [%%o2 + %0], %%o2 ldx [%%o0 + %0], %%o0 4: ldda [%%o2] %3, %%f32 fxor %%f0, %%f16, %%f16 fxor %%f2, %%f18, %%f18 add %%o4, 64, %%o4 fxor %%f4, %%f20, %%f20 fxor %%f6, %%f22, %%f22 add %%o3, 64, %%o3 fxor %%f8, %%f24, %%f24 fxor %%f10, %%f26, %%f26 fxor %%f12, %%f28, %%f28 fxor %%f14, %%f30, %%f30 ldda [%%o0] %3, %%f48 fxor %%f16, %%f32, %%f32 fxor %%f18, %%f34, %%f34 fxor %%f20, %%f36, %%f36 fxor %%f22, %%f38, %%f38 add %%o2, 64, %%o2 fxor %%f24, %%f40, %%f40 fxor %%f26, %%f42, %%f42 fxor %%f28, %%f44, %%f44 fxor %%f30, %%f46, %%f46 ldda [%%o4] %3, %%f0 fxor %%f32, %%f48, %%f48 fxor %%f34, %%f50, %%f50 fxor %%f36, %%f52, %%f52 add %%o0, 64, %%o0 fxor %%f38, %%f54, %%f54 fxor %%f40, %%f56, %%f56 fxor %%f42, %%f58, %%f58 subcc %%g5, 64, %%g5 fxor %%f44, %%f60, %%f60 fxor %%f46, %%f62, %%f62 stda %%f48, [%%o4 + %%g1] %3 bne,pt %%xcc, 4b ldda [%%o3] %3, %%f16 ldda [%%o2] %3, %%f32 fxor %%f0, %%f16, %%f16 fxor %%f2, %%f18, %%f18 fxor %%f4, %%f20, %%f20 fxor %%f6, %%f22, %%f22 fxor %%f8, %%f24, %%f24 fxor %%f10, %%f26, %%f26 fxor %%f12, %%f28, %%f28 fxor %%f14, %%f30, %%f30 ldda [%%o0] %3, %%f48 fxor %%f16, %%f32, %%f32 fxor %%f18, %%f34, %%f34 fxor %%f20, %%f36, %%f36 fxor %%f22, %%f38, %%f38 fxor %%f24, %%f40, %%f40 fxor %%f26, %%f42, %%f42 fxor %%f28, %%f44, %%f44 fxor %%f30, %%f46, %%f46 membar #Sync fxor %%f32, %%f48, %%f48 fxor %%f34, %%f50, %%f50 fxor %%f36, %%f52, %%f52 fxor %%f38, %%f54, %%f54 fxor %%f40, %%f56, %%f56 fxor %%f42, %%f58, %%f58 fxor %%f44, %%f60, %%f60 fxor %%f46, %%f62, %%f62 stda %%f48, [%%o4] %3 membar #Sync|#StoreStore|#StoreLoad retl wr %%g0, 0, %%fprs 15: ldx [%%o1 + 16], %%o2 ldx [%%o1 + 24], %%o0 ldx [%%o1 + 32], %%o1 ldx [%%o2 + %0], %%o2 ldx [%%o0 + %0], %%o0 ldx [%%o1 + %0], %%o1 5: ldda [%%o2] %3, %%f32 fxor %%f0, %%f16, %%f48 fxor %%f2, %%f18, %%f50 add %%o4, 64, %%o4 fxor %%f4, %%f20, %%f52 fxor %%f6, %%f22, %%f54 add %%o3, 64, %%o3 fxor %%f8, %%f24, %%f56 fxor %%f10, %%f26, %%f58 fxor %%f12, %%f28, %%f60 fxor %%f14, %%f30, %%f62 ldda [%%o0] %3, %%f16 fxor %%f48, %%f32, %%f48 fxor %%f50, %%f34, %%f50 fxor %%f52, %%f36, %%f52 fxor %%f54, %%f38, %%f54 add %%o2, 64, %%o2 fxor %%f56, %%f40, %%f56 fxor %%f58, %%f42, %%f58 fxor %%f60, %%f44, %%f60 fxor %%f62, %%f46, %%f62 ldda [%%o1] %3, %%f32 fxor %%f48, %%f16, %%f48 fxor %%f50, %%f18, %%f50 add %%o0, 64, %%o0 fxor %%f52, %%f20, %%f52 fxor %%f54, %%f22, %%f54 add %%o1, 64, %%o1 fxor %%f56, %%f24, %%f56 fxor %%f58, %%f26, %%f58 fxor %%f60, %%f28, %%f60 fxor %%f62, %%f30, %%f62 ldda [%%o4] %3, %%f0 fxor %%f48, %%f32, %%f48 fxor %%f50, %%f34, %%f50 fxor %%f52, %%f36, %%f52 fxor 
%%f54, %%f38, %%f54 fxor %%f56, %%f40, %%f56 fxor %%f58, %%f42, %%f58 subcc %%g5, 64, %%g5 fxor %%f60, %%f44, %%f60 fxor %%f62, %%f46, %%f62 stda %%f48, [%%o4 + %%g1] %3 bne,pt %%xcc, 5b ldda [%%o3] %3, %%f16 ldda [%%o2] %3, %%f32 fxor %%f0, %%f16, %%f48 fxor %%f2, %%f18, %%f50 fxor %%f4, %%f20, %%f52 fxor %%f6, %%f22, %%f54 fxor %%f8, %%f24, %%f56 fxor %%f10, %%f26, %%f58 fxor %%f12, %%f28, %%f60 fxor %%f14, %%f30, %%f62 ldda [%%o0] %3, %%f16 fxor %%f48, %%f32, %%f48 fxor %%f50, %%f34, %%f50 fxor %%f52, %%f36, %%f52 fxor %%f54, %%f38, %%f54 fxor %%f56, %%f40, %%f56 fxor %%f58, %%f42, %%f58 fxor %%f60, %%f44, %%f60 fxor %%f62, %%f46, %%f62 ldda [%%o1] %3, %%f32 fxor %%f48, %%f16, %%f48 fxor %%f50, %%f18, %%f50 fxor %%f52, %%f20, %%f52 fxor %%f54, %%f22, %%f54 fxor %%f56, %%f24, %%f56 fxor %%f58, %%f26, %%f58 fxor %%f60, %%f28, %%f60 fxor %%f62, %%f30, %%f62 membar #Sync fxor %%f48, %%f32, %%f48 fxor %%f50, %%f34, %%f50 fxor %%f52, %%f36, %%f52 fxor %%f54, %%f38, %%f54 fxor %%f56, %%f40, %%f56 fxor %%f58, %%f42, %%f58 fxor %%f60, %%f44, %%f60 fxor %%f62, %%f46, %%f62 stda %%f48, [%%o4] %3 membar #Sync|#StoreStore|#StoreLoad retl wr %%g0, 0, %%fprs " : : "i" (&((struct buffer_head *)0)->b_data), "i" (&((struct buffer_head *)0)->b_size), "i" (FPRS_FEF|FPRS_DU), "i" (ASI_BLK_P), "i" (FPRS_FEF), "i" (VISenter)); } #endif /* __sparc_v9__ */ #if defined(__sparc__) && !defined(__sparc_v9__) /* * High speed xor_block operation for RAID4/5 utilizing the * ldd/std SPARC instructions. * * Copyright (C) 1999 Jakub Jelinek (jj@ultra.linux.cz) * */ XORBLOCK_TEMPLATE(SPARC) { int size = bh_ptr[0]->b_size; int lines = size / (sizeof (long)) / 8, i; long *destp = (long *) bh_ptr[0]->b_data; long *source1 = (long *) bh_ptr[1]->b_data; long *source2, *source3, *source4; switch (count) { case 2: for (i = lines; i > 0; i--) { __asm__ __volatile__(" ldd [%0 + 0x00], %%g2 ldd [%0 + 0x08], %%g4 ldd [%0 + 0x10], %%o0 ldd [%0 + 0x18], %%o2 ldd [%1 + 0x00], %%o4 ldd [%1 + 0x08], %%l0 ldd [%1 + 0x10], %%l2 ldd [%1 + 0x18], %%l4 xor %%g2, %%o4, %%g2 xor %%g3, %%o5, %%g3 xor %%g4, %%l0, %%g4 xor %%g5, %%l1, %%g5 xor %%o0, %%l2, %%o0 xor %%o1, %%l3, %%o1 xor %%o2, %%l4, %%o2 xor %%o3, %%l5, %%o3 std %%g2, [%0 + 0x00] std %%g4, [%0 + 0x08] std %%o0, [%0 + 0x10] std %%o2, [%0 + 0x18] " : : "r" (destp), "r" (source1) : "g2", "g3", "g4", "g5", "o0", "o1", "o2", "o3", "o4", "o5", "l0", "l1", "l2", "l3", "l4", "l5"); destp += 8; source1 += 8; } break; case 3: source2 = (long *) bh_ptr[2]->b_data; for (i = lines; i > 0; i--) { __asm__ __volatile__(" ldd [%0 + 0x00], %%g2 ldd [%0 + 0x08], %%g4 ldd [%0 + 0x10], %%o0 ldd [%0 + 0x18], %%o2 ldd [%1 + 0x00], %%o4 ldd [%1 + 0x08], %%l0 ldd [%1 + 0x10], %%l2 ldd [%1 + 0x18], %%l4 xor %%g2, %%o4, %%g2 xor %%g3, %%o5, %%g3 ldd [%2 + 0x00], %%o4 xor %%g4, %%l0, %%g4 xor %%g5, %%l1, %%g5 ldd [%2 + 0x08], %%l0 xor %%o0, %%l2, %%o0 xor %%o1, %%l3, %%o1 ldd [%2 + 0x10], %%l2 xor %%o2, %%l4, %%o2 xor %%o3, %%l5, %%o3 ldd [%2 + 0x18], %%l4 xor %%g2, %%o4, %%g2 xor %%g3, %%o5, %%g3 xor %%g4, %%l0, %%g4 xor %%g5, %%l1, %%g5 xor %%o0, %%l2, %%o0 xor %%o1, %%l3, %%o1 xor %%o2, %%l4, %%o2 xor %%o3, %%l5, %%o3 std %%g2, [%0 + 0x00] std %%g4, [%0 + 0x08] std %%o0, [%0 + 0x10] std %%o2, [%0 + 0x18] " : : "r" (destp), "r" (source1), "r" (source2) : "g2", "g3", "g4", "g5", "o0", "o1", "o2", "o3", "o4", "o5", "l0", "l1", "l2", "l3", "l4", "l5"); destp += 8; source1 += 8; source2 += 8; } break; case 4: source2 = (long *) bh_ptr[2]->b_data; source3 = (long *) bh_ptr[3]->b_data; for (i = lines; i > 0; i--) { 
__asm__ __volatile__(" ldd [%0 + 0x00], %%g2 ldd [%0 + 0x08], %%g4 ldd [%0 + 0x10], %%o0 ldd [%0 + 0x18], %%o2 ldd [%1 + 0x00], %%o4 ldd [%1 + 0x08], %%l0 ldd [%1 + 0x10], %%l2 ldd [%1 + 0x18], %%l4 xor %%g2, %%o4, %%g2 xor %%g3, %%o5, %%g3 ldd [%2 + 0x00], %%o4 xor %%g4, %%l0, %%g4 xor %%g5, %%l1, %%g5 ldd [%2 + 0x08], %%l0 xor %%o0, %%l2, %%o0 xor %%o1, %%l3, %%o1 ldd [%2 + 0x10], %%l2 xor %%o2, %%l4, %%o2 xor %%o3, %%l5, %%o3 ldd [%2 + 0x18], %%l4 xor %%g2, %%o4, %%g2 xor %%g3, %%o5, %%g3 ldd [%3 + 0x00], %%o4 xor %%g4, %%l0, %%g4 xor %%g5, %%l1, %%g5 ldd [%3 + 0x08], %%l0 xor %%o0, %%l2, %%o0 xor %%o1, %%l3, %%o1 ldd [%3 + 0x10], %%l2 xor %%o2, %%l4, %%o2 xor %%o3, %%l5, %%o3 ldd [%3 + 0x18], %%l4 xor %%g2, %%o4, %%g2 xor %%g3, %%o5, %%g3 xor %%g4, %%l0, %%g4 xor %%g5, %%l1, %%g5 xor %%o0, %%l2, %%o0 xor %%o1, %%l3, %%o1 xor %%o2, %%l4, %%o2 xor %%o3, %%l5, %%o3 std %%g2, [%0 + 0x00] std %%g4, [%0 + 0x08] std %%o0, [%0 + 0x10] std %%o2, [%0 + 0x18] " : : "r" (destp), "r" (source1), "r" (source2), "r" (source3) : "g2", "g3", "g4", "g5", "o0", "o1", "o2", "o3", "o4", "o5", "l0", "l1", "l2", "l3", "l4", "l5"); destp += 8; source1 += 8; source2 += 8; source3 += 8; } break; case 5: source2 = (long *) bh_ptr[2]->b_data; source3 = (long *) bh_ptr[3]->b_data; source4 = (long *) bh_ptr[4]->b_data; for (i = lines; i > 0; i--) { __asm__ __volatile__(" ldd [%0 + 0x00], %%g2 ldd [%0 + 0x08], %%g4 ldd [%0 + 0x10], %%o0 ldd [%0 + 0x18], %%o2 ldd [%1 + 0x00], %%o4 ldd [%1 + 0x08], %%l0 ldd [%1 + 0x10], %%l2 ldd [%1 + 0x18], %%l4 xor %%g2, %%o4, %%g2 xor %%g3, %%o5, %%g3 ldd [%2 + 0x00], %%o4 xor %%g4, %%l0, %%g4 xor %%g5, %%l1, %%g5 ldd [%2 + 0x08], %%l0 xor %%o0, %%l2, %%o0 xor %%o1, %%l3, %%o1 ldd [%2 + 0x10], %%l2 xor %%o2, %%l4, %%o2 xor %%o3, %%l5, %%o3 ldd [%2 + 0x18], %%l4 xor %%g2, %%o4, %%g2 xor %%g3, %%o5, %%g3 ldd [%3 + 0x00], %%o4 xor %%g4, %%l0, %%g4 xor %%g5, %%l1, %%g5 ldd [%3 + 0x08], %%l0 xor %%o0, %%l2, %%o0 xor %%o1, %%l3, %%o1 ldd [%3 + 0x10], %%l2 xor %%o2, %%l4, %%o2 xor %%o3, %%l5, %%o3 ldd [%3 + 0x18], %%l4 xor %%g2, %%o4, %%g2 xor %%g3, %%o5, %%g3 ldd [%4 + 0x00], %%o4 xor %%g4, %%l0, %%g4 xor %%g5, %%l1, %%g5 ldd [%4 + 0x08], %%l0 xor %%o0, %%l2, %%o0 xor %%o1, %%l3, %%o1 ldd [%4 + 0x10], %%l2 xor %%o2, %%l4, %%o2 xor %%o3, %%l5, %%o3 ldd [%4 + 0x18], %%l4 xor %%g2, %%o4, %%g2 xor %%g3, %%o5, %%g3 xor %%g4, %%l0, %%g4 xor %%g5, %%l1, %%g5 xor %%o0, %%l2, %%o0 xor %%o1, %%l3, %%o1 xor %%o2, %%l4, %%o2 xor %%o3, %%l5, %%o3 std %%g2, [%0 + 0x00] std %%g4, [%0 + 0x08] std %%o0, [%0 + 0x10] std %%o2, [%0 + 0x18] " : : "r" (destp), "r" (source1), "r" (source2), "r" (source3), "r" (source4) : "g2", "g3", "g4", "g5", "o0", "o1", "o2", "o3", "o4", "o5", "l0", "l1", "l2", "l3", "l4", "l5"); destp += 8; source1 += 8; source2 += 8; source3 += 8; source4 += 8; } break; } } #endif /* __sparc_v[78]__ */ #ifdef __alpha__ /* * High speed xor_block operation for RAID4/5 pipelined for Alpha EV5. * There is a second version using EV6 prefetch instructions. 
* * Copyright (C) 2000 Richard Henderson (rth@redhat.com) */ XORBLOCK_TEMPLATE(alpha) { long lines = bh_ptr[0]->b_size / sizeof (long) / 8; long *d = (long *) bh_ptr[0]->b_data; long *s1 = (long *) bh_ptr[1]->b_data; long *s2, *s3, *s4; if (count == 2) goto two_blocks; s2 = (long *) bh_ptr[2]->b_data; if (count == 3) goto three_blocks; s3 = (long *) bh_ptr[3]->b_data; if (count == 4) goto four_blocks; s4 = (long *) bh_ptr[4]->b_data; goto five_blocks; two_blocks: asm volatile (" .align 4 2: ldq $0,0(%0) ldq $1,0(%1) ldq $2,8(%0) ldq $3,8(%1) ldq $4,16(%0) ldq $5,16(%1) ldq $6,24(%0) ldq $7,24(%1) ldq $16,32(%0) ldq $17,32(%1) ldq $18,40(%0) ldq $19,40(%1) ldq $20,48(%0) ldq $21,48(%1) ldq $22,56(%0) xor $0,$1,$0 # 7 cycles from $1 load ldq $23,56(%1) xor $2,$3,$2 stq $0,0(%0) xor $4,$5,$4 stq $2,8(%0) xor $6,$7,$6 stq $4,16(%0) xor $16,$17,$16 stq $6,24(%0) xor $18,$19,$18 stq $16,32(%0) xor $20,$21,$20 stq $18,40(%0) xor $22,$23,$22 stq $20,48(%0) subq %2,1,%2 stq $22,56(%0) addq %0,64,%0 addq %1,64,%1 bgt %2,2b" : "=r"(d), "=r"(s1), "=r"(lines) : "0"(d), "1"(s1), "2"(lines) : "memory", "$0", "$1", "$2", "$3", "$4", "$5", "$6", "$7", "$16", "$17", "$18", "$19", "$20", "$21", "$22", "$23"); return; three_blocks: asm volatile (" .align 4 3: ldq $0,0(%0) ldq $1,0(%1) ldq $2,0(%2) ldq $3,8(%0) ldq $4,8(%1) ldq $6,16(%0) ldq $7,16(%1) ldq $17,24(%0) ldq $18,24(%1) ldq $20,32(%0) ldq $21,32(%1) ldq $5,8(%2) ldq $16,16(%2) ldq $19,24(%2) ldq $22,32(%2) nop xor $0,$1,$1 # 8 cycles from $0 load xor $3,$4,$4 # 6 cycles from $4 load xor $6,$7,$7 # 6 cycles from $7 load xor $17,$18,$18 # 5 cycles from $18 load xor $1,$2,$2 # 9 cycles from $2 load xor $20,$21,$21 # 5 cycles from $21 load stq $2,0(%0) xor $4,$5,$5 # 6 cycles from $5 load stq $5,8(%0) xor $7,$16,$16 # 7 cycles from $16 load stq $16,16(%0) xor $18,$19,$19 # 7 cycles from $19 load stq $19,24(%0) xor $21,$22,$22 # 7 cycles from $22 load stq $22,32(%0) nop ldq $0,40(%0) ldq $1,40(%1) ldq $3,48(%0) ldq $4,48(%1) ldq $6,56(%0) ldq $7,56(%1) ldq $2,40(%2) ldq $5,48(%2) ldq $16,56(%2) xor $0,$1,$1 # 4 cycles from $1 load xor $3,$4,$4 # 5 cycles from $4 load xor $6,$7,$7 # 5 cycles from $7 load xor $1,$2,$2 # 4 cycles from $2 load xor $4,$5,$5 # 5 cycles from $5 load stq $2,40(%0) xor $7,$16,$16 # 4 cycles from $16 load stq $5,48(%0) subq %3,1,%3 stq $16,56(%0) addq %2,64,%2 addq %1,64,%1 addq %0,64,%0 bgt %3,3b" : "=r"(d), "=r"(s1), "=r"(s2), "=r"(lines) : "0"(d), "1"(s1), "2"(s2), "3"(lines) : "memory", "$0", "$1", "$2", "$3", "$4", "$5", "$6", "$7", "$16", "$17", "$18", "$19", "$20", "$21", "$22"); return; four_blocks: asm volatile (" .align 4 4: ldq $0,0(%0) ldq $1,0(%1) ldq $2,0(%2) ldq $3,0(%3) ldq $4,8(%0) ldq $5,8(%1) ldq $6,8(%2) ldq $7,8(%3) ldq $16,16(%0) ldq $17,16(%1) ldq $18,16(%2) ldq $19,16(%3) ldq $20,24(%0) xor $0,$1,$1 # 6 cycles from $1 load ldq $21,24(%1) xor $2,$3,$3 # 6 cycles from $3 load ldq $0,24(%2) xor $1,$3,$3 ldq $1,24(%3) xor $4,$5,$5 # 7 cycles from $5 load stq $3,0(%0) xor $6,$7,$7 xor $16,$17,$17 # 7 cycles from $17 load xor $5,$7,$7 stq $7,8(%0) xor $18,$19,$19 # 7 cycles from $19 load ldq $2,32(%0) xor $17,$19,$19 ldq $3,32(%1) ldq $4,32(%2) ldq $5,32(%3) xor $20,$21,$21 # 8 cycles from $21 load ldq $6,40(%0) ldq $7,40(%1) ldq $16,40(%2) ldq $17,40(%3) stq $19,16(%0) xor $0,$1,$1 # 9 cycles from $1 load xor $2,$3,$3 # 5 cycles from $3 load xor $21,$1,$1 ldq $18,48(%0) xor $4,$5,$5 # 5 cycles from $5 load ldq $19,48(%1) xor $3,$5,$5 ldq $20,48(%2) ldq $21,48(%3) ldq $0,56(%0) ldq $1,56(%1) ldq $2,56(%2) xor 
$6,$7,$7 # 8 cycles from $6 load ldq $3,56(%3) xor $16,$17,$17 # 8 cycles from $17 load xor $7,$17,$17 xor $18,$19,$19 # 5 cycles from $19 load xor $20,$21,$21 # 5 cycles from $21 load xor $19,$21,$21 stq $1,24(%0) xor $0,$1,$1 # 5 cycles from $1 load stq $5,32(%0) xor $2,$3,$3 # 4 cycles from $3 load stq $17,40(%0) xor $1,$3,$3 stq $21,48(%0) subq %4,1,%4 stq $3,56(%0) addq %3,64,%3 addq %2,64,%2 addq %1,64,%1 addq %0,64,%0 bgt %4,4b" : "=r"(d), "=r"(s1), "=r"(s2), "=r"(s3), "=r"(lines) : "0"(d), "1"(s1), "2"(s2), "3"(s3), "4"(lines) : "memory", "$0", "$1", "$2", "$3", "$4", "$5", "$6", "$7", "$16", "$17", "$18", "$19", "$20", "$21"); return; five_blocks: asm volatile (" ldq %0,0(%6) ldq %1,8(%6) ldq %2,16(%6) ldq %3,24(%6) ldq %4,32(%6) ldq %0,%7(%0) ldq %1,%7(%1) ldq %2,%7(%2) ldq %3,%7(%3) ldq %4,%7(%4) .align 4 5: ldq $0,0(%0) ldq $1,0(%1) ldq $2,0(%2) ldq $3,0(%3) ldq $4,0(%4) ldq $5,8(%0) ldq $6,8(%1) ldq $7,8(%2) ldq $16,8(%3) ldq $17,8(%4) ldq $18,16(%0) ldq $19,16(%1) ldq $20,16(%2) xor $0,$1,$1 # 6 cycles from $1 load ldq $21,16(%3) xor $2,$3,$3 # 6 cycles from $3 load ldq $0,16(%4) xor $1,$3,$3 ldq $1,24(%0) xor $3,$4,$4 # 7 cycles from $4 load stq $4,0(%0) xor $5,$6,$6 # 7 cycles from $6 load xor $7,$16,$16 # 7 cycles from $16 load xor $6,$17,$17 # 7 cycles from $17 load ldq $2,24(%1) xor $16,$17,$17 ldq $3,24(%2) xor $18,$19,$19 # 8 cycles from $19 load stq $17,8(%0) xor $19,$20,$20 # 8 cycles from $20 load ldq $4,24(%3) xor $21,$0,$0 # 7 cycles from $0 load ldq $5,24(%4) xor $20,$0,$0 ldq $6,32(%0) ldq $7,32(%1) stq $0,16(%0) xor $1,$2,$2 # 6 cycles from $2 load ldq $16,32(%2) xor $3,$4,$4 # 4 cycles from $4 load ldq $17,32(%3) xor $2,$4,$4 ldq $18,32(%4) ldq $19,40(%0) ldq $20,40(%1) ldq $21,40(%2) ldq $0,40(%3) xor $4,$5,$5 # 7 cycles from $5 load stq $5,24(%0) xor $6,$7,$7 # 7 cycles from $7 load ldq $1,40(%4) ldq $2,48(%0) ldq $3,48(%1) xor $7,$16,$16 # 7 cycles from $16 load ldq $4,48(%2) xor $17,$18,$18 # 6 cycles from $18 load ldq $5,48(%3) xor $16,$18,$18 ldq $6,48(%4) xor $19,$20,$20 # 7 cycles from $20 load stq $18,32(%0) xor $20,$21,$21 # 8 cycles from $21 load ldq $7,56(%0) xor $0,$1,$1 # 6 cycles from $1 load ldq $16,56(%1) ldq $17,56(%2) ldq $18,56(%3) ldq $19,56(%4) xor $21,$1,$1 xor $2,$3,$3 # 9 cycles from $3 load xor $3,$4,$4 # 9 cycles from $4 load xor $5,$6,$6 # 8 cycles from $6 load unop xor $4,$6,$6 xor $7,$16,$16 # 7 cycles from $16 load xor $17,$18,$18 # 6 cycles from $18 load stq $6,48(%0) xor $16,$18,$18 subq %5,1,%5 xor $18,$19,$19 # 8 cycles from $19 load stq $19,56(%0) addq %4,64,%4 addq %3,64,%3 addq %2,64,%2 addq %1,64,%1 addq %0,64,%0 bgt %5,5b" : "=&r"(d), "=&r"(s1), "=&r"(s2), "=&r"(s3), "=r"(s4), "=r"(lines) /* ARG! We've run out of asm arguments! We've got to reload all those pointers we just loaded. 
*/ : "r"(bh_ptr), "i" (&((struct buffer_head *)0)->b_data), "5"(lines) : "memory", "$0", "$1", "$2", "$3", "$4", "$5", "$6", "$7", "$16", "$17", "$18", "$19", "$20", "$21"); return; } #define prefetch(base, ofs) \ asm("ldq $31,%2(%0)" : "=r"(base) : "0"(base), "i"(ofs)) XORBLOCK_TEMPLATE(alpha_prefetch) { long lines = bh_ptr[0]->b_size / sizeof (long) / 8; long *d = (long *) bh_ptr[0]->b_data; long *s1 = (long *) bh_ptr[1]->b_data; long *s2, *s3, *s4; long p; p = count == 2; prefetch(d, 0); prefetch(s1, 0); prefetch(d, 64); prefetch(s1, 64); prefetch(d, 128); prefetch(s1, 128); prefetch(d, 192); prefetch(s1, 192); if (p) goto two_blocks; s2 = (long *) bh_ptr[2]->b_data; p = count == 3; prefetch(s2, 0); prefetch(s2, 64); prefetch(s2, 128); prefetch(s2, 192); if (p) goto three_blocks; s3 = (long *) bh_ptr[3]->b_data; p = count == 4; prefetch(s3, 0); prefetch(s3, 64); prefetch(s3, 128); prefetch(s3, 192); if (p) goto four_blocks; s4 = (long *) bh_ptr[4]->b_data; prefetch(s4, 0); prefetch(s4, 64); prefetch(s4, 128); prefetch(s4, 192); goto five_blocks; two_blocks: asm volatile (" .align 4 2: ldq $0,0(%0) ldq $1,0(%1) ldq $2,8(%0) ldq $3,8(%1) ldq $4,16(%0) ldq $5,16(%1) ldq $6,24(%0) ldq $7,24(%1) ldq $16,32(%0) ldq $17,32(%1) ldq $18,40(%0) ldq $19,40(%1) ldq $20,48(%0) ldq $21,48(%1) ldq $22,56(%0) ldq $23,56(%1) ldq $31,256(%0) xor $0,$1,$0 # 8 cycles from $1 load ldq $31,256(%1) xor $2,$3,$2 stq $0,0(%0) xor $4,$5,$4 stq $2,8(%0) xor $6,$7,$6 stq $4,16(%0) xor $16,$17,$16 stq $6,24(%0) xor $18,$19,$18 stq $16,32(%0) xor $20,$21,$20 stq $18,40(%0) xor $22,$23,$22 stq $20,48(%0) subq %2,1,%2 stq $22,56(%0) addq %0,64,%0 addq %1,64,%1 bgt %2,2b" : "=r"(d), "=r"(s1), "=r"(lines) : "0"(d), "1"(s1), "2"(lines) : "memory", "$0", "$1", "$2", "$3", "$4", "$5", "$6", "$7", "$16", "$17", "$18", "$19", "$20", "$21", "$22", "$23"); return; three_blocks: asm volatile (" .align 4 3: ldq $0,0(%0) ldq $1,0(%1) ldq $2,0(%2) ldq $3,8(%0) ldq $4,8(%1) ldq $6,16(%0) ldq $7,16(%1) ldq $17,24(%0) ldq $18,24(%1) ldq $20,32(%0) ldq $21,32(%1) ldq $5,8(%2) ldq $16,16(%2) ldq $19,24(%2) ldq $22,32(%2) nop xor $0,$1,$1 # 8 cycles from $0 load xor $3,$4,$4 # 7 cycles from $4 load xor $6,$7,$7 # 6 cycles from $7 load xor $17,$18,$18 # 5 cycles from $18 load xor $1,$2,$2 # 9 cycles from $2 load xor $20,$21,$21 # 5 cycles from $21 load stq $2,0(%0) xor $4,$5,$5 # 6 cycles from $5 load stq $5,8(%0) xor $7,$16,$16 # 7 cycles from $16 load stq $16,16(%0) xor $18,$19,$19 # 7 cycles from $19 load stq $19,24(%0) xor $21,$22,$22 # 7 cycles from $22 load stq $22,32(%0) nop ldq $0,40(%0) ldq $1,40(%1) ldq $3,48(%0) ldq $4,48(%1) ldq $6,56(%0) ldq $7,56(%1) ldq $2,40(%2) ldq $5,48(%2) ldq $16,56(%2) ldq $31,256(%0) ldq $31,256(%1) ldq $31,256(%2) xor $0,$1,$1 # 6 cycles from $1 load xor $3,$4,$4 # 5 cycles from $4 load xor $6,$7,$7 # 5 cycles from $7 load xor $1,$2,$2 # 4 cycles from $2 load xor $4,$5,$5 # 5 cycles from $5 load xor $7,$16,$16 # 4 cycles from $16 load stq $2,40(%0) subq %3,1,%3 stq $5,48(%0) addq %2,64,%2 stq $16,56(%0) addq %1,64,%1 addq %0,64,%0 bgt %3,3b" : "=r"(d), "=r"(s1), "=r"(s2), "=r"(lines) : "0"(d), "1"(s1), "2"(s2), "3"(lines) : "memory", "$0", "$1", "$2", "$3", "$4", "$5", "$6", "$7", "$16", "$17", "$18", "$19", "$20", "$21", "$22"); return; four_blocks: asm volatile (" .align 4 4: ldq $0,0(%0) ldq $1,0(%1) ldq $2,0(%2) ldq $3,0(%3) ldq $4,8(%0) ldq $5,8(%1) ldq $6,8(%2) ldq $7,8(%3) ldq $16,16(%0) ldq $17,16(%1) ldq $18,16(%2) ldq $19,16(%3) ldq $20,24(%0) xor $0,$1,$1 # 6 cycles from $1 load ldq 
$21,24(%1) xor $2,$3,$3 # 6 cycles from $3 load ldq $0,24(%2) xor $1,$3,$3 ldq $1,24(%3) xor $4,$5,$5 # 7 cycles from $5 load stq $3,0(%0) xor $6,$7,$7 xor $16,$17,$17 # 7 cycles from $17 load xor $5,$7,$7 stq $7,8(%0) xor $18,$19,$19 # 7 cycles from $19 load ldq $2,32(%0) xor $17,$19,$19 ldq $3,32(%1) ldq $4,32(%2) ldq $5,32(%3) xor $20,$21,$21 # 8 cycles from $21 load ldq $6,40(%0) ldq $7,40(%1) ldq $16,40(%2) ldq $17,40(%3) stq $19,16(%0) xor $0,$1,$1 # 9 cycles from $1 load xor $2,$3,$3 # 5 cycles from $3 load xor $21,$1,$1 ldq $18,48(%0) xor $4,$5,$5 # 5 cycles from $5 load ldq $19,48(%1) xor $3,$5,$5 ldq $20,48(%2) ldq $21,48(%3) ldq $0,56(%0) ldq $1,56(%1) ldq $2,56(%2) xor $6,$7,$7 # 8 cycles from $6 load ldq $3,56(%3) xor $16,$17,$17 # 8 cycles from $17 load ldq $31,256(%0) xor $7,$17,$17 ldq $31,256(%1) xor $18,$19,$19 # 6 cycles from $19 load ldq $31,256(%2) xor $20,$21,$21 # 6 cycles from $21 load ldq $31,256(%3) xor $19,$21,$21 stq $1,24(%0) xor $0,$1,$1 # 7 cycles from $1 load stq $5,32(%0) xor $2,$3,$3 # 6 cycles from $3 load stq $17,40(%0) xor $1,$3,$3 stq $21,48(%0) subq %4,1,%4 stq $3,56(%0) addq %3,64,%3 addq %2,64,%2 addq %1,64,%1 addq %0,64,%0 bgt %4,4b" : "=r"(d), "=r"(s1), "=r"(s2), "=r"(s3), "=r"(lines) : "0"(d), "1"(s1), "2"(s2), "3"(s3), "4"(lines) : "memory", "$0", "$1", "$2", "$3", "$4", "$5", "$6", "$7", "$16", "$17", "$18", "$19", "$20", "$21"); return; five_blocks: asm volatile (" ldq %0,0(%6) ldq %1,8(%6) ldq %2,16(%6) ldq %3,24(%6) ldq %4,32(%6) ldq %0,%7(%0) ldq %1,%7(%1) ldq %2,%7(%2) ldq %3,%7(%3) ldq %4,%7(%4) .align 4 5: ldq $0,0(%0) ldq $1,0(%1) ldq $2,0(%2) ldq $3,0(%3) ldq $4,0(%4) ldq $5,8(%0) ldq $6,8(%1) ldq $7,8(%2) ldq $16,8(%3) ldq $17,8(%4) ldq $18,16(%0) ldq $19,16(%1) ldq $20,16(%2) xor $0,$1,$1 # 6 cycles from $1 load ldq $21,16(%3) xor $2,$3,$3 # 6 cycles from $3 load ldq $0,16(%4) xor $1,$3,$3 ldq $1,24(%0) xor $3,$4,$4 # 7 cycles from $4 load stq $4,0(%0) xor $5,$6,$6 # 7 cycles from $6 load xor $7,$16,$16 # 7 cycles from $16 load xor $6,$17,$17 # 7 cycles from $17 load ldq $2,24(%1) xor $16,$17,$17 ldq $3,24(%2) xor $18,$19,$19 # 8 cycles from $19 load stq $17,8(%0) xor $19,$20,$20 # 8 cycles from $20 load ldq $4,24(%3) xor $21,$0,$0 # 7 cycles from $0 load ldq $5,24(%4) xor $20,$0,$0 ldq $6,32(%0) ldq $7,32(%1) stq $0,16(%0) xor $1,$2,$2 # 6 cycles from $2 load ldq $16,32(%2) xor $3,$4,$4 # 4 cycles from $4 load ldq $17,32(%3) xor $2,$4,$4 ldq $18,32(%4) ldq $19,40(%0) ldq $20,40(%1) ldq $21,40(%2) ldq $0,40(%3) xor $4,$5,$5 # 7 cycles from $5 load stq $5,24(%0) xor $6,$7,$7 # 7 cycles from $7 load ldq $1,40(%4) ldq $2,48(%0) ldq $3,48(%1) xor $7,$16,$16 # 7 cycles from $16 load ldq $4,48(%2) xor $17,$18,$18 # 6 cycles from $18 load ldq $5,48(%3) xor $16,$18,$18 ldq $6,48(%4) xor $19,$20,$20 # 7 cycles from $20 load stq $18,32(%0) xor $20,$21,$21 # 8 cycles from $21 load ldq $7,56(%0) xor $0,$1,$1 # 6 cycles from $1 load ldq $16,56(%1) ldq $17,56(%2) ldq $18,56(%3) ldq $19,56(%4) ldq $31,256(%0) xor $21,$1,$1 ldq $31,256(%1) xor $2,$3,$3 # 9 cycles from $3 load ldq $31,256(%2) xor $3,$4,$4 # 9 cycles from $4 load ldq $31,256(%3) xor $5,$6,$6 # 8 cycles from $6 load ldq $31,256(%4) xor $4,$6,$6 xor $7,$16,$16 # 7 cycles from $16 load xor $17,$18,$18 # 6 cycles from $18 load stq $6,48(%0) xor $16,$18,$18 subq %5,1,%5 xor $18,$19,$19 # 8 cycles from $19 load stq $19,56(%0) addq %4,64,%4 addq %3,64,%3 addq %2,64,%2 addq %1,64,%1 addq %0,64,%0 bgt %5,5b" : "=&r"(d), "=&r"(s1), "=&r"(s2), "=&r"(s3), "=r"(s4), "=r"(lines) /* ARG! 
We've run out of asm arguments! We've got to reload all those pointers we just loaded. */ : "r"(bh_ptr), "i" (&((struct buffer_head *)0)->b_data), "5"(lines) : "memory", "$0", "$1", "$2", "$3", "$4", "$5", "$6", "$7", "$16", "$17", "$18", "$19", "$20", "$21"); return; } #undef prefetch #endif /* __alpha__ */ #ifndef __sparc_v9__ /* * this one works reasonably on any x86 CPU * (send me an assembly version for inclusion if you can make it faster) * * this one is just as fast as written in pure assembly on x86. * the reason for this separate version is that the * fast open-coded xor routine "32reg" produces suboptimal code * on x86, due to lack of registers. */ XORBLOCK_TEMPLATE(8regs) { int len = bh_ptr[0]->b_size; long *destp = (long *) bh_ptr[0]->b_data; long *source1, *source2, *source3, *source4; long lines = len / (sizeof (long)) / 8, i; switch(count) { case 2: source1 = (long *) bh_ptr[1]->b_data; for (i = lines; i > 0; i--) { *(destp + 0) ^= *(source1 + 0); *(destp + 1) ^= *(source1 + 1); *(destp + 2) ^= *(source1 + 2); *(destp + 3) ^= *(source1 + 3); *(destp + 4) ^= *(source1 + 4); *(destp + 5) ^= *(source1 + 5); *(destp + 6) ^= *(source1 + 6); *(destp + 7) ^= *(source1 + 7); source1 += 8; destp += 8; } break; case 3: source2 = (long *) bh_ptr[2]->b_data; source1 = (long *) bh_ptr[1]->b_data; for (i = lines; i > 0; i--) { *(destp + 0) ^= *(source1 + 0); *(destp + 0) ^= *(source2 + 0); *(destp + 1) ^= *(source1 + 1); *(destp + 1) ^= *(source2 + 1); *(destp + 2) ^= *(source1 + 2); *(destp + 2) ^= *(source2 + 2); *(destp + 3) ^= *(source1 + 3); *(destp + 3) ^= *(source2 + 3); *(destp + 4) ^= *(source1 + 4); *(destp + 4) ^= *(source2 + 4); *(destp + 5) ^= *(source1 + 5); *(destp + 5) ^= *(source2 + 5); *(destp + 6) ^= *(source1 + 6); *(destp + 6) ^= *(source2 + 6); *(destp + 7) ^= *(source1 + 7); *(destp + 7) ^= *(source2 + 7); source1 += 8; source2 += 8; destp += 8; } break; case 4: source3 = (long *) bh_ptr[3]->b_data; source2 = (long *) bh_ptr[2]->b_data; source1 = (long *) bh_ptr[1]->b_data; for (i = lines; i > 0; i--) { *(destp + 0) ^= *(source1 + 0); *(destp + 0) ^= *(source2 + 0); *(destp + 0) ^= *(source3 + 0); *(destp + 1) ^= *(source1 + 1); *(destp + 1) ^= *(source2 + 1); *(destp + 1) ^= *(source3 + 1); *(destp + 2) ^= *(source1 + 2); *(destp + 2) ^= *(source2 + 2); *(destp + 2) ^= *(source3 + 2); *(destp + 3) ^= *(source1 + 3); *(destp + 3) ^= *(source2 + 3); *(destp + 3) ^= *(source3 + 3); *(destp + 4) ^= *(source1 + 4); *(destp + 4) ^= *(source2 + 4); *(destp + 4) ^= *(source3 + 4); *(destp + 5) ^= *(source1 + 5); *(destp + 5) ^= *(source2 + 5); *(destp + 5) ^= *(source3 + 5); *(destp + 6) ^= *(source1 + 6); *(destp + 6) ^= *(source2 + 6); *(destp + 6) ^= *(source3 + 6); *(destp + 7) ^= *(source1 + 7); *(destp + 7) ^= *(source2 + 7); *(destp + 7) ^= *(source3 + 7); source1 += 8; source2 += 8; source3 += 8; destp += 8; } break; case 5: source4 = (long *) bh_ptr[4]->b_data; source3 = (long *) bh_ptr[3]->b_data; source2 = (long *) bh_ptr[2]->b_data; source1 = (long *) bh_ptr[1]->b_data; for (i = lines; i > 0; i--) { *(destp + 0) ^= *(source1 + 0); *(destp + 0) ^= *(source2 + 0); *(destp + 0) ^= *(source3 + 0); *(destp + 0) ^= *(source4 + 0); *(destp + 1) ^= *(source1 + 1); *(destp + 1) ^= *(source2 + 1); *(destp + 1) ^= *(source3 + 1); *(destp + 1) ^= *(source4 + 1); *(destp + 2) ^= *(source1 + 2); *(destp + 2) ^= *(source2 + 2); *(destp + 2) ^= *(source3 + 2); *(destp + 2) ^= *(source4 + 2); *(destp + 3) ^= *(source1 + 3); *(destp + 3) ^= *(source2 + 3); *(destp + 3) ^= 
*(source3 + 3); *(destp + 3) ^= *(source4 + 3); *(destp + 4) ^= *(source1 + 4); *(destp + 4) ^= *(source2 + 4); *(destp + 4) ^= *(source3 + 4); *(destp + 4) ^= *(source4 + 4); *(destp + 5) ^= *(source1 + 5); *(destp + 5) ^= *(source2 + 5); *(destp + 5) ^= *(source3 + 5); *(destp + 5) ^= *(source4 + 5); *(destp + 6) ^= *(source1 + 6); *(destp + 6) ^= *(source2 + 6); *(destp + 6) ^= *(source3 + 6); *(destp + 6) ^= *(source4 + 6); *(destp + 7) ^= *(source1 + 7); *(destp + 7) ^= *(source2 + 7); *(destp + 7) ^= *(source3 + 7); *(destp + 7) ^= *(source4 + 7); source1 += 8; source2 += 8; source3 += 8; source4 += 8; destp += 8; } break; } } /* * platform independent RAID5 checksum calculation, this should * be very fast on any platform that has a decent amount of * registers. (32 or more) */ XORBLOCK_TEMPLATE(32regs) { int size = bh_ptr[0]->b_size; int lines = size / (sizeof (long)) / 8, i; long *destp = (long *) bh_ptr[0]->b_data; long *source1, *source2, *source3, *source4; /* LOTS of registers available... We do explicite loop-unrolling here for code which favours RISC machines. In fact this is almoast direct RISC assembly on Alpha and SPARC :-) */ switch(count) { case 2: source1 = (long *) bh_ptr[1]->b_data; for (i = lines; i > 0; i--) { register long d0, d1, d2, d3, d4, d5, d6, d7; d0 = destp[0]; /* Pull the stuff into registers */ d1 = destp[1]; /* ... in bursts, if possible. */ d2 = destp[2]; d3 = destp[3]; d4 = destp[4]; d5 = destp[5]; d6 = destp[6]; d7 = destp[7]; d0 ^= source1[0]; d1 ^= source1[1]; d2 ^= source1[2]; d3 ^= source1[3]; d4 ^= source1[4]; d5 ^= source1[5]; d6 ^= source1[6]; d7 ^= source1[7]; destp[0] = d0; /* Store the result (in burts) */ destp[1] = d1; destp[2] = d2; destp[3] = d3; destp[4] = d4; /* Store the result (in burts) */ destp[5] = d5; destp[6] = d6; destp[7] = d7; source1 += 8; destp += 8; } break; case 3: source2 = (long *) bh_ptr[2]->b_data; source1 = (long *) bh_ptr[1]->b_data; for (i = lines; i > 0; i--) { register long d0, d1, d2, d3, d4, d5, d6, d7; d0 = destp[0]; /* Pull the stuff into registers */ d1 = destp[1]; /* ... in bursts, if possible. */ d2 = destp[2]; d3 = destp[3]; d4 = destp[4]; d5 = destp[5]; d6 = destp[6]; d7 = destp[7]; d0 ^= source1[0]; d1 ^= source1[1]; d2 ^= source1[2]; d3 ^= source1[3]; d4 ^= source1[4]; d5 ^= source1[5]; d6 ^= source1[6]; d7 ^= source1[7]; d0 ^= source2[0]; d1 ^= source2[1]; d2 ^= source2[2]; d3 ^= source2[3]; d4 ^= source2[4]; d5 ^= source2[5]; d6 ^= source2[6]; d7 ^= source2[7]; destp[0] = d0; /* Store the result (in burts) */ destp[1] = d1; destp[2] = d2; destp[3] = d3; destp[4] = d4; /* Store the result (in burts) */ destp[5] = d5; destp[6] = d6; destp[7] = d7; source1 += 8; source2 += 8; destp += 8; } break; case 4: source3 = (long *) bh_ptr[3]->b_data; source2 = (long *) bh_ptr[2]->b_data; source1 = (long *) bh_ptr[1]->b_data; for (i = lines; i > 0; i--) { register long d0, d1, d2, d3, d4, d5, d6, d7; d0 = destp[0]; /* Pull the stuff into registers */ d1 = destp[1]; /* ... in bursts, if possible. 
*/ d2 = destp[2]; d3 = destp[3]; d4 = destp[4]; d5 = destp[5]; d6 = destp[6]; d7 = destp[7]; d0 ^= source1[0]; d1 ^= source1[1]; d2 ^= source1[2]; d3 ^= source1[3]; d4 ^= source1[4]; d5 ^= source1[5]; d6 ^= source1[6]; d7 ^= source1[7]; d0 ^= source2[0]; d1 ^= source2[1]; d2 ^= source2[2]; d3 ^= source2[3]; d4 ^= source2[4]; d5 ^= source2[5]; d6 ^= source2[6]; d7 ^= source2[7]; d0 ^= source3[0]; d1 ^= source3[1]; d2 ^= source3[2]; d3 ^= source3[3]; d4 ^= source3[4]; d5 ^= source3[5]; d6 ^= source3[6]; d7 ^= source3[7]; destp[0] = d0; /* Store the result (in burts) */ destp[1] = d1; destp[2] = d2; destp[3] = d3; destp[4] = d4; /* Store the result (in burts) */ destp[5] = d5; destp[6] = d6; destp[7] = d7; source1 += 8; source2 += 8; source3 += 8; destp += 8; } break; case 5: source4 = (long *) bh_ptr[4]->b_data; source3 = (long *) bh_ptr[3]->b_data; source2 = (long *) bh_ptr[2]->b_data; source1 = (long *) bh_ptr[1]->b_data; for (i = lines; i > 0; i--) { register long d0, d1, d2, d3, d4, d5, d6, d7; d0 = destp[0]; /* Pull the stuff into registers */ d1 = destp[1]; /* ... in bursts, if possible. */ d2 = destp[2]; d3 = destp[3]; d4 = destp[4]; d5 = destp[5]; d6 = destp[6]; d7 = destp[7]; d0 ^= source1[0]; d1 ^= source1[1]; d2 ^= source1[2]; d3 ^= source1[3]; d4 ^= source1[4]; d5 ^= source1[5]; d6 ^= source1[6]; d7 ^= source1[7]; d0 ^= source2[0]; d1 ^= source2[1]; d2 ^= source2[2]; d3 ^= source2[3]; d4 ^= source2[4]; d5 ^= source2[5]; d6 ^= source2[6]; d7 ^= source2[7]; d0 ^= source3[0]; d1 ^= source3[1]; d2 ^= source3[2]; d3 ^= source3[3]; d4 ^= source3[4]; d5 ^= source3[5]; d6 ^= source3[6]; d7 ^= source3[7]; d0 ^= source4[0]; d1 ^= source4[1]; d2 ^= source4[2]; d3 ^= source4[3]; d4 ^= source4[4]; d5 ^= source4[5]; d6 ^= source4[6]; d7 ^= source4[7]; destp[0] = d0; /* Store the result (in burts) */ destp[1] = d1; destp[2] = d2; destp[3] = d3; destp[4] = d4; /* Store the result (in burts) */ destp[5] = d5; destp[6] = d6; destp[7] = d7; source1 += 8; source2 += 8; source3 += 8; source4 += 8; destp += 8; } break; } } /* * (the -6*32 shift factor colors the cache) */ #define SIZE (PAGE_SIZE-6*32) static void xor_speed ( struct xor_block_template * func, struct buffer_head *b1, struct buffer_head *b2) { int speed; unsigned long now; int i, count, max; struct buffer_head *bh_ptr[6]; func->next = xor_functions; xor_functions = func; bh_ptr[0] = b1; bh_ptr[1] = b2; /* * count the number of XORs done during a whole jiffy. * calculate the speed of checksumming from this. * (we use a 2-page allocation to have guaranteed * color L1-cache layout) */ max = 0; for (i = 0; i < 5; i++) { now = jiffies; count = 0; while (jiffies == now) { mb(); func->xor_block(2,bh_ptr); mb(); count++; mb(); } if (count > max) max = count; } speed = max * (HZ*SIZE/1024); func->speed = speed; printk( " %-10s: %5d.%03d MB/sec\n", func->name, speed / 1000, speed % 1000); } static inline void pick_fastest_function(void) { struct xor_block_template *f, *fastest; fastest = xor_functions; for (f = fastest; f; f = f->next) { if (f->speed > fastest->speed) fastest = f; } #ifdef CONFIG_X86_XMM if (cpu_has_xmm) { /* we force the use of the KNI xor block because it can write around l2. we may also be able to load into the l1 only depending on how the cpu deals with a load to a line that is being prefetched. */ fastest = &t_xor_block_pIII_kni; } #endif #ifdef __alpha__ if (implver() == IMPLVER_EV6) { /* Force the use of alpha_prefetch if EV6, as it is significantly faster in the cold cache case. 
*/ fastest = &t_xor_block_alpha_prefetch; } #endif xor_block = fastest->xor_block; printk( "using fastest function: %s (%d.%03d MB/sec)\n", fastest->name, fastest->speed / 1000, fastest->speed % 1000); } static struct buffer_head b1, b2; void calibrate_xor_block(void) { if (xor_block) return; memset(&b1,0,sizeof(b1)); b2 = b1; b1.b_data = (char *) md__get_free_pages(GFP_KERNEL,2); if (!b1.b_data) { pick_fastest_function(); return; } b2.b_data = b1.b_data + 2*PAGE_SIZE + SIZE; b1.b_size = SIZE; printk(KERN_INFO "raid5: measuring checksumming speed\n"); sti(); /* should be safe */ #if defined(__sparc__) && !defined(__sparc_v9__) printk(KERN_INFO "raid5: trying high-speed SPARC checksum routine\n"); xor_speed(&t_xor_block_SPARC,&b1,&b2); #endif #ifdef CONFIG_X86_XMM if (cpu_has_xmm) { printk(KERN_INFO "raid5: KNI detected, trying cache-avoiding KNI checksum routine\n"); xor_speed(&t_xor_block_pIII_kni,&b1,&b2); } #endif /* CONFIG_X86_XMM */ #ifdef __i386__ if (md_cpu_has_mmx()) { printk(KERN_INFO "raid5: MMX detected, trying high-speed MMX checksum routines\n"); xor_speed(&t_xor_block_pII_mmx,&b1,&b2); xor_speed(&t_xor_block_p5_mmx,&b1,&b2); } #endif /* __i386__ */ #ifdef __alpha__ xor_speed(&t_xor_block_alpha,&b1,&b2); xor_speed(&t_xor_block_alpha_prefetch,&b1,&b2); #endif xor_speed(&t_xor_block_8regs,&b1,&b2); xor_speed(&t_xor_block_32regs,&b1,&b2); free_pages((unsigned long)b1.b_data,2); pick_fastest_function(); } #else /* __sparc_v9__ */ void calibrate_xor_block(void) { if (xor_block) return; printk(KERN_INFO "raid5: using high-speed VIS checksum routine\n"); xor_block = xor_block_VIS; } #endif /* __sparc_v9__ */ MD_EXPORT_SYMBOL(xor_block); MD_EXPORT_SYMBOL(calibrate_xor_block); #ifdef MODULE int init_module(void) { calibrate_xor_block(); return 0; } #endif
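
#if 0
/*
 * Illustrative sketch only (not compiled in): what hooking a new routine
 * into the XOR function template mechanism looks like.  The name "naive"
 * and its body are made up for this example; a real addition would also
 * call xor_speed(&t_xor_block_naive,&b1,&b2) from calibrate_xor_block()
 * so the routine takes part in the boot-time benchmark and can be picked
 * by pick_fastest_function().
 *
 * XORBLOCK_TEMPLATE(naive) expands to a forward declaration of
 * xor_block_naive(), a struct xor_block_template t_xor_block_naive that
 * points at it, and the function header itself.
 */
XORBLOCK_TEMPLATE(naive)
{
	int lines = bh_ptr[0]->b_size / sizeof (long);
	long *dest = (long *) bh_ptr[0]->b_data;
	unsigned int i;
	int j;

	/* xor every source buffer into the destination, one word at a time */
	for (i = 1; i < count; i++) {
		long *src = (long *) bh_ptr[i]->b_data;

		for (j = 0; j < lines; j++)
			dest[j] ^= src[j];
	}
}
#endif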