From b63ad0882a16a5d28003e57f2b0b81dee3fb322b Mon Sep 17 00:00:00 2001 From: Ralf Baechle Date: Tue, 28 Nov 2000 03:58:46 +0000 Subject: Merge with 2.4.0-test11. --- drivers/md/xor.c | 2721 ++---------------------------------------------------- 1 file changed, 68 insertions(+), 2653 deletions(-) (limited to 'drivers/md/xor.c') diff --git a/drivers/md/xor.c b/drivers/md/xor.c index 4fe04fb89..f58463ebc 100644 --- a/drivers/md/xor.c +++ b/drivers/md/xor.c @@ -1,10 +1,10 @@ /* * xor.c : Multiple Devices driver for Linux * - * Copyright (C) 1996, 1997, 1998, 1999 Ingo Molnar, Matti Aarnio, Jakub Jelinek + * Copyright (C) 1996, 1997, 1998, 1999, 2000, + * Ingo Molnar, Matti Aarnio, Jakub Jelinek, Richard Henderson. * - * - * optimized RAID-5 checksumming functions. + * Dispatch optimized RAID-5 checksumming functions. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -15,2584 +15,66 @@ * (for example /usr/src/linux/COPYING); if not, write to the Free * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ + #include <linux/module.h> #define BH_TRACE 0 #include <linux/sched.h> #include <linux/raid/md.h> -#ifdef __sparc_v9__ -#include <asm/head.h> -#include <asm/asi.h> -#include <asm/visasm.h> -#endif - -/* - * we use the 'XOR function template' to register multiple xor - * functions runtime. The kernel measures their speed upon bootup - * and decides which one to use. (compile-time registration is - * not enough as certain CPU features like MMX can only be detected - * runtime) - * - * this architecture makes it pretty easy to add new routines - * that are faster on certain CPUs, without killing other CPU's - * 'native' routine. Although the current routines are belived - * to be the physically fastest ones on all CPUs tested, but - * feel free to prove me wrong and add yet another routine =B-) - * --mingo - */ - -#define MAX_XOR_BLOCKS 5 - -#define XOR_ARGS (unsigned int count, struct buffer_head **bh_ptr) - -typedef void (*xor_block_t) XOR_ARGS; -xor_block_t xor_block = NULL; - -#ifndef __sparc_v9__ - -struct xor_block_template; - -struct xor_block_template { - char * name; - xor_block_t xor_block; - int speed; - struct xor_block_template * next; -}; - -struct xor_block_template * xor_functions = NULL; - -#define XORBLOCK_TEMPLATE(x) \ -static void xor_block_##x XOR_ARGS; \ -static struct xor_block_template t_xor_block_##x = \ - { #x, xor_block_##x, 0, NULL }; \ -static void xor_block_##x XOR_ARGS - -#ifdef __i386__ - -#ifdef CONFIG_X86_XMM -/* - * Cache avoiding checksumming functions utilizing KNI instructions - * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo) - */ - -XORBLOCK_TEMPLATE(pIII_kni) -{ - char xmm_save[16*4]; - int cr0; - int lines = (bh_ptr[0]->b_size>>8); - - __asm__ __volatile__ ( - "movl %%cr0,%0 ;\n\t" - "clts ;\n\t" - "movups %%xmm0,(%1) ;\n\t" - "movups %%xmm1,0x10(%1) ;\n\t" - "movups %%xmm2,0x20(%1) ;\n\t" - "movups %%xmm3,0x30(%1) ;\n\t" - : "=r" (cr0) - : "r" (xmm_save) - : "memory" ); - -#define OFFS(x) "8*("#x"*2)" -#define PF0(x) \ - " prefetcht0 "OFFS(x)"(%1) ;\n" -#define LD(x,y) \ - " movaps "OFFS(x)"(%1), %%xmm"#y" ;\n" -#define ST(x,y) \ - " movaps %%xmm"#y", "OFFS(x)"(%1) ;\n" -#define PF1(x) \ - " prefetchnta "OFFS(x)"(%2) ;\n" -#define PF2(x) \ - " prefetchnta "OFFS(x)"(%3) ;\n" -#define PF3(x) \ - " prefetchnta "OFFS(x)"(%4) ;\n" -#define PF4(x) \ - " prefetchnta "OFFS(x)"(%5) ;\n" -#define PF5(x) \ - " prefetchnta "OFFS(x)"(%6) ;\n" -#define XO1(x,y) \ - " xorps "OFFS(x)"(%2), %%xmm"#y" ;\n" -#define XO2(x,y) \ - " xorps 
"OFFS(x)"(%3), %%xmm"#y" ;\n" -#define XO3(x,y) \ - " xorps "OFFS(x)"(%4), %%xmm"#y" ;\n" -#define XO4(x,y) \ - " xorps "OFFS(x)"(%5), %%xmm"#y" ;\n" -#define XO5(x,y) \ - " xorps "OFFS(x)"(%6), %%xmm"#y" ;\n" - - switch(count) { - case 2: - __asm__ __volatile__ ( -#undef BLOCK -#define BLOCK(i) \ - LD(i,0) \ - LD(i+1,1) \ - PF1(i) \ - PF1(i+2) \ - LD(i+2,2) \ - LD(i+3,3) \ - PF0(i+4) \ - PF0(i+6) \ - XO1(i,0) \ - XO1(i+1,1) \ - XO1(i+2,2) \ - XO1(i+3,3) \ - ST(i,0) \ - ST(i+1,1) \ - ST(i+2,2) \ - ST(i+3,3) \ - - - PF0(0) - PF0(2) - - " .align 32,0x90 ;\n" - " 1: ;\n" - - BLOCK(0) - BLOCK(4) - BLOCK(8) - BLOCK(12) - - " addl $256, %1 ;\n" - " addl $256, %2 ;\n" - " decl %0 ;\n" - " jnz 1b ;\n" - - : - : "r" (lines), - "r" (bh_ptr[0]->b_data), - "r" (bh_ptr[1]->b_data) - : "memory" ); - break; - case 3: - __asm__ __volatile__ ( -#undef BLOCK -#define BLOCK(i) \ - PF1(i) \ - PF1(i+2) \ - LD(i,0) \ - LD(i+1,1) \ - LD(i+2,2) \ - LD(i+3,3) \ - PF2(i) \ - PF2(i+2) \ - PF0(i+4) \ - PF0(i+6) \ - XO1(i,0) \ - XO1(i+1,1) \ - XO1(i+2,2) \ - XO1(i+3,3) \ - XO2(i,0) \ - XO2(i+1,1) \ - XO2(i+2,2) \ - XO2(i+3,3) \ - ST(i,0) \ - ST(i+1,1) \ - ST(i+2,2) \ - ST(i+3,3) \ - - - PF0(0) - PF0(2) - - " .align 32,0x90 ;\n" - " 1: ;\n" - - BLOCK(0) - BLOCK(4) - BLOCK(8) - BLOCK(12) - - " addl $256, %1 ;\n" - " addl $256, %2 ;\n" - " addl $256, %3 ;\n" - " decl %0 ;\n" - " jnz 1b ;\n" - : - : "r" (lines), - "r" (bh_ptr[0]->b_data), - "r" (bh_ptr[1]->b_data), - "r" (bh_ptr[2]->b_data) - : "memory" ); - break; - case 4: - __asm__ __volatile__ ( -#undef BLOCK -#define BLOCK(i) \ - PF1(i) \ - PF1(i+2) \ - LD(i,0) \ - LD(i+1,1) \ - LD(i+2,2) \ - LD(i+3,3) \ - PF2(i) \ - PF2(i+2) \ - XO1(i,0) \ - XO1(i+1,1) \ - XO1(i+2,2) \ - XO1(i+3,3) \ - PF3(i) \ - PF3(i+2) \ - PF0(i+4) \ - PF0(i+6) \ - XO2(i,0) \ - XO2(i+1,1) \ - XO2(i+2,2) \ - XO2(i+3,3) \ - XO3(i,0) \ - XO3(i+1,1) \ - XO3(i+2,2) \ - XO3(i+3,3) \ - ST(i,0) \ - ST(i+1,1) \ - ST(i+2,2) \ - ST(i+3,3) \ - - - PF0(0) - PF0(2) - - " .align 32,0x90 ;\n" - " 1: ;\n" - - BLOCK(0) - BLOCK(4) - BLOCK(8) - BLOCK(12) +#include +#include - " addl $256, %1 ;\n" - " addl $256, %2 ;\n" - " addl $256, %3 ;\n" - " addl $256, %4 ;\n" - " decl %0 ;\n" - " jnz 1b ;\n" +/* The xor routines to use. 
*/ +static struct xor_block_template *active_template; - : - : "r" (lines), - "r" (bh_ptr[0]->b_data), - "r" (bh_ptr[1]->b_data), - "r" (bh_ptr[2]->b_data), - "r" (bh_ptr[3]->b_data) - : "memory" ); - break; - case 5: - __asm__ __volatile__ ( -#undef BLOCK -#define BLOCK(i) \ - PF1(i) \ - PF1(i+2) \ - LD(i,0) \ - LD(i+1,1) \ - LD(i+2,2) \ - LD(i+3,3) \ - PF2(i) \ - PF2(i+2) \ - XO1(i,0) \ - XO1(i+1,1) \ - XO1(i+2,2) \ - XO1(i+3,3) \ - PF3(i) \ - PF3(i+2) \ - XO2(i,0) \ - XO2(i+1,1) \ - XO2(i+2,2) \ - XO2(i+3,3) \ - PF4(i) \ - PF4(i+2) \ - PF0(i+4) \ - PF0(i+6) \ - XO3(i,0) \ - XO3(i+1,1) \ - XO3(i+2,2) \ - XO3(i+3,3) \ - XO4(i,0) \ - XO4(i+1,1) \ - XO4(i+2,2) \ - XO4(i+3,3) \ - ST(i,0) \ - ST(i+1,1) \ - ST(i+2,2) \ - ST(i+3,3) \ - - - PF0(0) - PF0(2) - - " .align 32,0x90 ;\n" - " 1: ;\n" - - BLOCK(0) - BLOCK(4) - BLOCK(8) - BLOCK(12) - - " addl $256, %1 ;\n" - " addl $256, %2 ;\n" - " addl $256, %3 ;\n" - " addl $256, %4 ;\n" - " addl $256, %5 ;\n" - " decl %0 ;\n" - " jnz 1b ;\n" - - : - : "r" (lines), - "r" (bh_ptr[0]->b_data), - "r" (bh_ptr[1]->b_data), - "r" (bh_ptr[2]->b_data), - "r" (bh_ptr[3]->b_data), - "r" (bh_ptr[4]->b_data) - : "memory"); - break; - } - - __asm__ __volatile__ ( - "sfence ;\n\t" - "movups (%1),%%xmm0 ;\n\t" - "movups 0x10(%1),%%xmm1 ;\n\t" - "movups 0x20(%1),%%xmm2 ;\n\t" - "movups 0x30(%1),%%xmm3 ;\n\t" - "movl %0,%%cr0 ;\n\t" - : - : "r" (cr0), "r" (xmm_save) - : "memory" ); -} - -#undef OFFS -#undef LD -#undef ST -#undef PF0 -#undef PF1 -#undef PF2 -#undef PF3 -#undef PF4 -#undef PF5 -#undef XO1 -#undef XO2 -#undef XO3 -#undef XO4 -#undef XO5 -#undef BLOCK - -#endif /* CONFIG_X86_XMM */ - -/* - * high-speed RAID5 checksumming functions utilizing MMX instructions - * Copyright (C) 1998 Ingo Molnar - */ -XORBLOCK_TEMPLATE(pII_mmx) +void +xor_block(unsigned int count, struct buffer_head **bh_ptr) { - char fpu_save[108]; - int lines = (bh_ptr[0]->b_size>>7); - - if (!(current->flags & PF_USEDFPU)) - __asm__ __volatile__ ( " clts;\n"); - - __asm__ __volatile__ ( " fsave %0; fwait\n"::"m"(fpu_save[0]) ); - -#define LD(x,y) \ - " movq 8*("#x")(%1), %%mm"#y" ;\n" -#define ST(x,y) \ - " movq %%mm"#y", 8*("#x")(%1) ;\n" -#define XO1(x,y) \ - " pxor 8*("#x")(%2), %%mm"#y" ;\n" -#define XO2(x,y) \ - " pxor 8*("#x")(%3), %%mm"#y" ;\n" -#define XO3(x,y) \ - " pxor 8*("#x")(%4), %%mm"#y" ;\n" -#define XO4(x,y) \ - " pxor 8*("#x")(%5), %%mm"#y" ;\n" - - switch(count) { - case 2: - __asm__ __volatile__ ( -#undef BLOCK -#define BLOCK(i) \ - LD(i,0) \ - LD(i+1,1) \ - LD(i+2,2) \ - LD(i+3,3) \ - XO1(i,0) \ - ST(i,0) \ - XO1(i+1,1) \ - ST(i+1,1) \ - XO1(i+2,2) \ - ST(i+2,2) \ - XO1(i+3,3) \ - ST(i+3,3) - - " .align 32,0x90 ;\n" - " 1: ;\n" - - BLOCK(0) - BLOCK(4) - BLOCK(8) - BLOCK(12) + unsigned long *p0, *p1, *p2, *p3, *p4; + unsigned long bytes = bh_ptr[0]->b_size; - " addl $128, %1 ;\n" - " addl $128, %2 ;\n" - " decl %0 ;\n" - " jnz 1b ;\n" - : - : "r" (lines), - "r" (bh_ptr[0]->b_data), - "r" (bh_ptr[1]->b_data) - : "memory"); - break; - case 3: - __asm__ __volatile__ ( -#undef BLOCK -#define BLOCK(i) \ - LD(i,0) \ - LD(i+1,1) \ - LD(i+2,2) \ - LD(i+3,3) \ - XO1(i,0) \ - XO1(i+1,1) \ - XO1(i+2,2) \ - XO1(i+3,3) \ - XO2(i,0) \ - ST(i,0) \ - XO2(i+1,1) \ - ST(i+1,1) \ - XO2(i+2,2) \ - ST(i+2,2) \ - XO2(i+3,3) \ - ST(i+3,3) - - " .align 32,0x90 ;\n" - " 1: ;\n" - - BLOCK(0) - BLOCK(4) - BLOCK(8) - BLOCK(12) - - " addl $128, %1 ;\n" - " addl $128, %2 ;\n" - " addl $128, %3 ;\n" - " decl %0 ;\n" - " jnz 1b ;\n" - : - : "r" (lines), - "r" (bh_ptr[0]->b_data), - "r" 
(bh_ptr[1]->b_data), - "r" (bh_ptr[2]->b_data) - : "memory"); - break; - case 4: - __asm__ __volatile__ ( -#undef BLOCK -#define BLOCK(i) \ - LD(i,0) \ - LD(i+1,1) \ - LD(i+2,2) \ - LD(i+3,3) \ - XO1(i,0) \ - XO1(i+1,1) \ - XO1(i+2,2) \ - XO1(i+3,3) \ - XO2(i,0) \ - XO2(i+1,1) \ - XO2(i+2,2) \ - XO2(i+3,3) \ - XO3(i,0) \ - ST(i,0) \ - XO3(i+1,1) \ - ST(i+1,1) \ - XO3(i+2,2) \ - ST(i+2,2) \ - XO3(i+3,3) \ - ST(i+3,3) - - " .align 32,0x90 ;\n" - " 1: ;\n" - - BLOCK(0) - BLOCK(4) - BLOCK(8) - BLOCK(12) - - " addl $128, %1 ;\n" - " addl $128, %2 ;\n" - " addl $128, %3 ;\n" - " addl $128, %4 ;\n" - " decl %0 ;\n" - " jnz 1b ;\n" - : - : "r" (lines), - "r" (bh_ptr[0]->b_data), - "r" (bh_ptr[1]->b_data), - "r" (bh_ptr[2]->b_data), - "r" (bh_ptr[3]->b_data) - : "memory"); - break; - case 5: - __asm__ __volatile__ ( -#undef BLOCK -#define BLOCK(i) \ - LD(i,0) \ - LD(i+1,1) \ - LD(i+2,2) \ - LD(i+3,3) \ - XO1(i,0) \ - XO1(i+1,1) \ - XO1(i+2,2) \ - XO1(i+3,3) \ - XO2(i,0) \ - XO2(i+1,1) \ - XO2(i+2,2) \ - XO2(i+3,3) \ - XO3(i,0) \ - XO3(i+1,1) \ - XO3(i+2,2) \ - XO3(i+3,3) \ - XO4(i,0) \ - ST(i,0) \ - XO4(i+1,1) \ - ST(i+1,1) \ - XO4(i+2,2) \ - ST(i+2,2) \ - XO4(i+3,3) \ - ST(i+3,3) - - " .align 32,0x90 ;\n" - " 1: ;\n" - - BLOCK(0) - BLOCK(4) - BLOCK(8) - BLOCK(12) - - " addl $128, %1 ;\n" - " addl $128, %2 ;\n" - " addl $128, %3 ;\n" - " addl $128, %4 ;\n" - " addl $128, %5 ;\n" - " decl %0 ;\n" - " jnz 1b ;\n" - : - : "g" (lines), - "r" (bh_ptr[0]->b_data), - "r" (bh_ptr[1]->b_data), - "r" (bh_ptr[2]->b_data), - "r" (bh_ptr[3]->b_data), - "r" (bh_ptr[4]->b_data) - : "memory"); - break; + p0 = (unsigned long *) bh_ptr[0]->b_data; + p1 = (unsigned long *) bh_ptr[1]->b_data; + if (count == 2) { + active_template->do_2(bytes, p0, p1); + return; } - __asm__ __volatile__ ( " frstor %0;\n"::"m"(fpu_save[0]) ); - - if (!(current->flags & PF_USEDFPU)) - stts(); -} - -#undef LD -#undef XO1 -#undef XO2 -#undef XO3 -#undef XO4 -#undef ST -#undef BLOCK - -XORBLOCK_TEMPLATE(p5_mmx) -{ - char fpu_save[108]; - int lines = (bh_ptr[0]->b_size>>6); - - if (!(current->flags & PF_USEDFPU)) - __asm__ __volatile__ ( " clts;\n"); - - __asm__ __volatile__ ( " fsave %0; fwait\n"::"m"(fpu_save[0]) ); - - switch(count) { - case 2: - __asm__ __volatile__ ( - - " .align 32,0x90 ;\n" - " 1: ;\n" - " movq (%1), %%mm0 ;\n" - " movq 8(%1), %%mm1 ;\n" - " pxor (%2), %%mm0 ;\n" - " movq 16(%1), %%mm2 ;\n" - " movq %%mm0, (%1) ;\n" - " pxor 8(%2), %%mm1 ;\n" - " movq 24(%1), %%mm3 ;\n" - " movq %%mm1, 8(%1) ;\n" - " pxor 16(%2), %%mm2 ;\n" - " movq 32(%1), %%mm4 ;\n" - " movq %%mm2, 16(%1) ;\n" - " pxor 24(%2), %%mm3 ;\n" - " movq 40(%1), %%mm5 ;\n" - " movq %%mm3, 24(%1) ;\n" - " pxor 32(%2), %%mm4 ;\n" - " movq 48(%1), %%mm6 ;\n" - " movq %%mm4, 32(%1) ;\n" - " pxor 40(%2), %%mm5 ;\n" - " movq 56(%1), %%mm7 ;\n" - " movq %%mm5, 40(%1) ;\n" - " pxor 48(%2), %%mm6 ;\n" - " pxor 56(%2), %%mm7 ;\n" - " movq %%mm6, 48(%1) ;\n" - " movq %%mm7, 56(%1) ;\n" - - " addl $64, %1 ;\n" - " addl $64, %2 ;\n" - " decl %0 ;\n" - " jnz 1b ;\n" - - : - : "r" (lines), - "r" (bh_ptr[0]->b_data), - "r" (bh_ptr[1]->b_data) - : "memory" ); - break; - case 3: - __asm__ __volatile__ ( - - " .align 32,0x90 ;\n" - " 1: ;\n" - " movq (%1), %%mm0 ;\n" - " movq 8(%1), %%mm1 ;\n" - " pxor (%2), %%mm0 ;\n" - " movq 16(%1), %%mm2 ;\n" - " pxor 8(%2), %%mm1 ;\n" - " pxor (%3), %%mm0 ;\n" - " pxor 16(%2), %%mm2 ;\n" - " movq %%mm0, (%1) ;\n" - " pxor 8(%3), %%mm1 ;\n" - " pxor 16(%3), %%mm2 ;\n" - " movq 24(%1), %%mm3 ;\n" - " movq %%mm1, 8(%1) ;\n" - " movq 32(%1), 
%%mm4 ;\n" - " movq 40(%1), %%mm5 ;\n" - " pxor 24(%2), %%mm3 ;\n" - " movq %%mm2, 16(%1) ;\n" - " pxor 32(%2), %%mm4 ;\n" - " pxor 24(%3), %%mm3 ;\n" - " pxor 40(%2), %%mm5 ;\n" - " movq %%mm3, 24(%1) ;\n" - " pxor 32(%3), %%mm4 ;\n" - " pxor 40(%3), %%mm5 ;\n" - " movq 48(%1), %%mm6 ;\n" - " movq %%mm4, 32(%1) ;\n" - " movq 56(%1), %%mm7 ;\n" - " pxor 48(%2), %%mm6 ;\n" - " movq %%mm5, 40(%1) ;\n" - " pxor 56(%2), %%mm7 ;\n" - " pxor 48(%3), %%mm6 ;\n" - " pxor 56(%3), %%mm7 ;\n" - " movq %%mm6, 48(%1) ;\n" - " movq %%mm7, 56(%1) ;\n" - - " addl $64, %1 ;\n" - " addl $64, %2 ;\n" - " addl $64, %3 ;\n" - " decl %0 ;\n" - " jnz 1b ;\n" - - : - : "r" (lines), - "r" (bh_ptr[0]->b_data), - "r" (bh_ptr[1]->b_data), - "r" (bh_ptr[2]->b_data) - : "memory" ); - break; - case 4: - __asm__ __volatile__ ( - - " .align 32,0x90 ;\n" - " 1: ;\n" - " movq (%1), %%mm0 ;\n" - " movq 8(%1), %%mm1 ;\n" - " pxor (%2), %%mm0 ;\n" - " movq 16(%1), %%mm2 ;\n" - " pxor 8(%2), %%mm1 ;\n" - " pxor (%3), %%mm0 ;\n" - " pxor 16(%2), %%mm2 ;\n" - " pxor 8(%3), %%mm1 ;\n" - " pxor (%4), %%mm0 ;\n" - " movq 24(%1), %%mm3 ;\n" - " pxor 16(%3), %%mm2 ;\n" - " pxor 8(%4), %%mm1 ;\n" - " movq %%mm0, (%1) ;\n" - " movq 32(%1), %%mm4 ;\n" - " pxor 24(%2), %%mm3 ;\n" - " pxor 16(%4), %%mm2 ;\n" - " movq %%mm1, 8(%1) ;\n" - " movq 40(%1), %%mm5 ;\n" - " pxor 32(%2), %%mm4 ;\n" - " pxor 24(%3), %%mm3 ;\n" - " movq %%mm2, 16(%1) ;\n" - " pxor 40(%2), %%mm5 ;\n" - " pxor 32(%3), %%mm4 ;\n" - " pxor 24(%4), %%mm3 ;\n" - " movq %%mm3, 24(%1) ;\n" - " movq 56(%1), %%mm7 ;\n" - " movq 48(%1), %%mm6 ;\n" - " pxor 40(%3), %%mm5 ;\n" - " pxor 32(%4), %%mm4 ;\n" - " pxor 48(%2), %%mm6 ;\n" - " movq %%mm4, 32(%1) ;\n" - " pxor 56(%2), %%mm7 ;\n" - " pxor 40(%4), %%mm5 ;\n" - " pxor 48(%3), %%mm6 ;\n" - " pxor 56(%3), %%mm7 ;\n" - " movq %%mm5, 40(%1) ;\n" - " pxor 48(%4), %%mm6 ;\n" - " pxor 56(%4), %%mm7 ;\n" - " movq %%mm6, 48(%1) ;\n" - " movq %%mm7, 56(%1) ;\n" - - " addl $64, %1 ;\n" - " addl $64, %2 ;\n" - " addl $64, %3 ;\n" - " addl $64, %4 ;\n" - " decl %0 ;\n" - " jnz 1b ;\n" - - : - : "r" (lines), - "r" (bh_ptr[0]->b_data), - "r" (bh_ptr[1]->b_data), - "r" (bh_ptr[2]->b_data), - "r" (bh_ptr[3]->b_data) - : "memory" ); - break; - case 5: - __asm__ __volatile__ ( - - " .align 32,0x90 ;\n" - " 1: ;\n" - " movq (%1), %%mm0 ;\n" - " movq 8(%1), %%mm1 ;\n" - " pxor (%2), %%mm0 ;\n" - " pxor 8(%2), %%mm1 ;\n" - " movq 16(%1), %%mm2 ;\n" - " pxor (%3), %%mm0 ;\n" - " pxor 8(%3), %%mm1 ;\n" - " pxor 16(%2), %%mm2 ;\n" - " pxor (%4), %%mm0 ;\n" - " pxor 8(%4), %%mm1 ;\n" - " pxor 16(%3), %%mm2 ;\n" - " movq 24(%1), %%mm3 ;\n" - " pxor (%5), %%mm0 ;\n" - " pxor 8(%5), %%mm1 ;\n" - " movq %%mm0, (%1) ;\n" - " pxor 16(%4), %%mm2 ;\n" - " pxor 24(%2), %%mm3 ;\n" - " movq %%mm1, 8(%1) ;\n" - " pxor 16(%5), %%mm2 ;\n" - " pxor 24(%3), %%mm3 ;\n" - " movq 32(%1), %%mm4 ;\n" - " movq %%mm2, 16(%1) ;\n" - " pxor 24(%4), %%mm3 ;\n" - " pxor 32(%2), %%mm4 ;\n" - " movq 40(%1), %%mm5 ;\n" - " pxor 24(%5), %%mm3 ;\n" - " pxor 32(%3), %%mm4 ;\n" - " pxor 40(%2), %%mm5 ;\n" - " movq %%mm3, 24(%1) ;\n" - " pxor 32(%4), %%mm4 ;\n" - " pxor 40(%3), %%mm5 ;\n" - " movq 48(%1), %%mm6 ;\n" - " movq 56(%1), %%mm7 ;\n" - " pxor 32(%5), %%mm4 ;\n" - " pxor 40(%4), %%mm5 ;\n" - " pxor 48(%2), %%mm6 ;\n" - " pxor 56(%2), %%mm7 ;\n" - " movq %%mm4, 32(%1) ;\n" - " pxor 48(%3), %%mm6 ;\n" - " pxor 56(%3), %%mm7 ;\n" - " pxor 40(%5), %%mm5 ;\n" - " pxor 48(%4), %%mm6 ;\n" - " pxor 56(%4), %%mm7 ;\n" - " movq %%mm5, 40(%1) ;\n" - " pxor 48(%5), %%mm6 ;\n" - " pxor 
56(%5), %%mm7 ;\n" - " movq %%mm6, 48(%1) ;\n" - " movq %%mm7, 56(%1) ;\n" - - " addl $64, %1 ;\n" - " addl $64, %2 ;\n" - " addl $64, %3 ;\n" - " addl $64, %4 ;\n" - " addl $64, %5 ;\n" - " decl %0 ;\n" - " jnz 1b ;\n" - - : - : "g" (lines), - "r" (bh_ptr[0]->b_data), - "r" (bh_ptr[1]->b_data), - "r" (bh_ptr[2]->b_data), - "r" (bh_ptr[3]->b_data), - "r" (bh_ptr[4]->b_data) - : "memory" ); - break; + p2 = (unsigned long *) bh_ptr[2]->b_data; + if (count == 3) { + active_template->do_3(bytes, p0, p1, p2); + return; } - __asm__ __volatile__ ( " frstor %0;\n"::"m"(fpu_save[0]) ); - - if (!(current->flags & PF_USEDFPU)) - stts(); -} -#endif /* __i386__ */ -#endif /* !__sparc_v9__ */ - -#ifdef __sparc_v9__ -/* - * High speed xor_block operation for RAID4/5 utilizing the - * UltraSparc Visual Instruction Set. - * - * Copyright (C) 1997, 1999 Jakub Jelinek (jj@ultra.linux.cz) - * - * Requirements: - * !(((long)dest | (long)sourceN) & (64 - 1)) && - * !(len & 127) && len >= 256 - * - * It is done in pure assembly, as otherwise gcc makes it - * a non-leaf function, which is not what we want. - * Also, we don't measure the speeds as on other architectures, - * as the measuring routine does not take into account cold caches - * and the fact that xor_block_VIS bypasses the caches. - * xor_block_32regs might be 5% faster for count 2 if caches are hot - * and things just right (for count 3 VIS is about as fast as 32regs for - * hot caches and for count 4 and 5 VIS is faster by good margin always), - * but I think it is better not to pollute the caches. - * Actually, if I'd just fight for speed for hot caches, I could - * write a hybrid VIS/integer routine, which would do always two - * 64B blocks in VIS and two in IEUs, but I really care more about - * caches. - */ -extern void *VISenter(void); -extern void xor_block_VIS XOR_ARGS; - -void __xor_block_VIS(void) -{ -__asm__ (" - .globl xor_block_VIS -xor_block_VIS: - ldx [%%o1 + 0], %%o4 - ldx [%%o1 + 8], %%o3 - ldx [%%o4 + %1], %%g5 - ldx [%%o4 + %0], %%o4 - ldx [%%o3 + %0], %%o3 - rd %%fprs, %%o5 - andcc %%o5, %2, %%g0 - be,pt %%icc, 297f - sethi %%hi(%5), %%g1 - jmpl %%g1 + %%lo(%5), %%g7 - add %%g7, 8, %%g7 -297: wr %%g0, %4, %%fprs - membar #LoadStore|#StoreLoad|#StoreStore - sub %%g5, 64, %%g5 - ldda [%%o4] %3, %%f0 - ldda [%%o3] %3, %%f16 - cmp %%o0, 4 - bgeu,pt %%xcc, 10f - cmp %%o0, 3 - be,pn %%xcc, 13f - mov -64, %%g1 - sub %%g5, 64, %%g5 - rd %%asi, %%g1 - wr %%g0, %3, %%asi - -2: ldda [%%o4 + 64] %%asi, %%f32 - fxor %%f0, %%f16, %%f16 - fxor %%f2, %%f18, %%f18 - fxor %%f4, %%f20, %%f20 - fxor %%f6, %%f22, %%f22 - fxor %%f8, %%f24, %%f24 - fxor %%f10, %%f26, %%f26 - fxor %%f12, %%f28, %%f28 - fxor %%f14, %%f30, %%f30 - stda %%f16, [%%o4] %3 - ldda [%%o3 + 64] %%asi, %%f48 - ldda [%%o4 + 128] %%asi, %%f0 - fxor %%f32, %%f48, %%f48 - fxor %%f34, %%f50, %%f50 - add %%o4, 128, %%o4 - fxor %%f36, %%f52, %%f52 - add %%o3, 128, %%o3 - fxor %%f38, %%f54, %%f54 - subcc %%g5, 128, %%g5 - fxor %%f40, %%f56, %%f56 - fxor %%f42, %%f58, %%f58 - fxor %%f44, %%f60, %%f60 - fxor %%f46, %%f62, %%f62 - stda %%f48, [%%o4 - 64] %%asi - bne,pt %%xcc, 2b - ldda [%%o3] %3, %%f16 - - ldda [%%o4 + 64] %%asi, %%f32 - fxor %%f0, %%f16, %%f16 - fxor %%f2, %%f18, %%f18 - fxor %%f4, %%f20, %%f20 - fxor %%f6, %%f22, %%f22 - fxor %%f8, %%f24, %%f24 - fxor %%f10, %%f26, %%f26 - fxor %%f12, %%f28, %%f28 - fxor %%f14, %%f30, %%f30 - stda %%f16, [%%o4] %3 - ldda [%%o3 + 64] %%asi, %%f48 - membar #Sync - fxor %%f32, %%f48, %%f48 - fxor %%f34, %%f50, %%f50 - fxor %%f36, %%f52, %%f52 
- fxor %%f38, %%f54, %%f54 - fxor %%f40, %%f56, %%f56 - fxor %%f42, %%f58, %%f58 - fxor %%f44, %%f60, %%f60 - fxor %%f46, %%f62, %%f62 - stda %%f48, [%%o4 + 64] %%asi - membar #Sync|#StoreStore|#StoreLoad - wr %%g0, 0, %%fprs - retl - wr %%g1, %%g0, %%asi - -13: ldx [%%o1 + 16], %%o2 - ldx [%%o2 + %0], %%o2 - -3: ldda [%%o2] %3, %%f32 - fxor %%f0, %%f16, %%f48 - fxor %%f2, %%f18, %%f50 - add %%o4, 64, %%o4 - fxor %%f4, %%f20, %%f52 - fxor %%f6, %%f22, %%f54 - add %%o3, 64, %%o3 - fxor %%f8, %%f24, %%f56 - fxor %%f10, %%f26, %%f58 - fxor %%f12, %%f28, %%f60 - fxor %%f14, %%f30, %%f62 - ldda [%%o4] %3, %%f0 - fxor %%f48, %%f32, %%f48 - fxor %%f50, %%f34, %%f50 - fxor %%f52, %%f36, %%f52 - fxor %%f54, %%f38, %%f54 - add %%o2, 64, %%o2 - fxor %%f56, %%f40, %%f56 - fxor %%f58, %%f42, %%f58 - subcc %%g5, 64, %%g5 - fxor %%f60, %%f44, %%f60 - fxor %%f62, %%f46, %%f62 - stda %%f48, [%%o4 + %%g1] %3 - bne,pt %%xcc, 3b - ldda [%%o3] %3, %%f16 - - ldda [%%o2] %3, %%f32 - fxor %%f0, %%f16, %%f48 - fxor %%f2, %%f18, %%f50 - fxor %%f4, %%f20, %%f52 - fxor %%f6, %%f22, %%f54 - fxor %%f8, %%f24, %%f56 - fxor %%f10, %%f26, %%f58 - fxor %%f12, %%f28, %%f60 - fxor %%f14, %%f30, %%f62 - membar #Sync - fxor %%f48, %%f32, %%f48 - fxor %%f50, %%f34, %%f50 - fxor %%f52, %%f36, %%f52 - fxor %%f54, %%f38, %%f54 - fxor %%f56, %%f40, %%f56 - fxor %%f58, %%f42, %%f58 - fxor %%f60, %%f44, %%f60 - fxor %%f62, %%f46, %%f62 - stda %%f48, [%%o4] %3 - membar #Sync|#StoreStore|#StoreLoad - retl - wr %%g0, 0, %%fprs - -10: cmp %%o0, 5 - be,pt %%xcc, 15f - mov -64, %%g1 - -14: ldx [%%o1 + 16], %%o2 - ldx [%%o1 + 24], %%o0 - ldx [%%o2 + %0], %%o2 - ldx [%%o0 + %0], %%o0 - -4: ldda [%%o2] %3, %%f32 - fxor %%f0, %%f16, %%f16 - fxor %%f2, %%f18, %%f18 - add %%o4, 64, %%o4 - fxor %%f4, %%f20, %%f20 - fxor %%f6, %%f22, %%f22 - add %%o3, 64, %%o3 - fxor %%f8, %%f24, %%f24 - fxor %%f10, %%f26, %%f26 - fxor %%f12, %%f28, %%f28 - fxor %%f14, %%f30, %%f30 - ldda [%%o0] %3, %%f48 - fxor %%f16, %%f32, %%f32 - fxor %%f18, %%f34, %%f34 - fxor %%f20, %%f36, %%f36 - fxor %%f22, %%f38, %%f38 - add %%o2, 64, %%o2 - fxor %%f24, %%f40, %%f40 - fxor %%f26, %%f42, %%f42 - fxor %%f28, %%f44, %%f44 - fxor %%f30, %%f46, %%f46 - ldda [%%o4] %3, %%f0 - fxor %%f32, %%f48, %%f48 - fxor %%f34, %%f50, %%f50 - fxor %%f36, %%f52, %%f52 - add %%o0, 64, %%o0 - fxor %%f38, %%f54, %%f54 - fxor %%f40, %%f56, %%f56 - fxor %%f42, %%f58, %%f58 - subcc %%g5, 64, %%g5 - fxor %%f44, %%f60, %%f60 - fxor %%f46, %%f62, %%f62 - stda %%f48, [%%o4 + %%g1] %3 - bne,pt %%xcc, 4b - ldda [%%o3] %3, %%f16 - - ldda [%%o2] %3, %%f32 - fxor %%f0, %%f16, %%f16 - fxor %%f2, %%f18, %%f18 - fxor %%f4, %%f20, %%f20 - fxor %%f6, %%f22, %%f22 - fxor %%f8, %%f24, %%f24 - fxor %%f10, %%f26, %%f26 - fxor %%f12, %%f28, %%f28 - fxor %%f14, %%f30, %%f30 - ldda [%%o0] %3, %%f48 - fxor %%f16, %%f32, %%f32 - fxor %%f18, %%f34, %%f34 - fxor %%f20, %%f36, %%f36 - fxor %%f22, %%f38, %%f38 - fxor %%f24, %%f40, %%f40 - fxor %%f26, %%f42, %%f42 - fxor %%f28, %%f44, %%f44 - fxor %%f30, %%f46, %%f46 - membar #Sync - fxor %%f32, %%f48, %%f48 - fxor %%f34, %%f50, %%f50 - fxor %%f36, %%f52, %%f52 - fxor %%f38, %%f54, %%f54 - fxor %%f40, %%f56, %%f56 - fxor %%f42, %%f58, %%f58 - fxor %%f44, %%f60, %%f60 - fxor %%f46, %%f62, %%f62 - stda %%f48, [%%o4] %3 - membar #Sync|#StoreStore|#StoreLoad - retl - wr %%g0, 0, %%fprs - -15: ldx [%%o1 + 16], %%o2 - ldx [%%o1 + 24], %%o0 - ldx [%%o1 + 32], %%o1 - ldx [%%o2 + %0], %%o2 - ldx [%%o0 + %0], %%o0 - ldx [%%o1 + %0], %%o1 - -5: ldda [%%o2] %3, %%f32 - fxor %%f0, %%f16, 
%%f48 - fxor %%f2, %%f18, %%f50 - add %%o4, 64, %%o4 - fxor %%f4, %%f20, %%f52 - fxor %%f6, %%f22, %%f54 - add %%o3, 64, %%o3 - fxor %%f8, %%f24, %%f56 - fxor %%f10, %%f26, %%f58 - fxor %%f12, %%f28, %%f60 - fxor %%f14, %%f30, %%f62 - ldda [%%o0] %3, %%f16 - fxor %%f48, %%f32, %%f48 - fxor %%f50, %%f34, %%f50 - fxor %%f52, %%f36, %%f52 - fxor %%f54, %%f38, %%f54 - add %%o2, 64, %%o2 - fxor %%f56, %%f40, %%f56 - fxor %%f58, %%f42, %%f58 - fxor %%f60, %%f44, %%f60 - fxor %%f62, %%f46, %%f62 - ldda [%%o1] %3, %%f32 - fxor %%f48, %%f16, %%f48 - fxor %%f50, %%f18, %%f50 - add %%o0, 64, %%o0 - fxor %%f52, %%f20, %%f52 - fxor %%f54, %%f22, %%f54 - add %%o1, 64, %%o1 - fxor %%f56, %%f24, %%f56 - fxor %%f58, %%f26, %%f58 - fxor %%f60, %%f28, %%f60 - fxor %%f62, %%f30, %%f62 - ldda [%%o4] %3, %%f0 - fxor %%f48, %%f32, %%f48 - fxor %%f50, %%f34, %%f50 - fxor %%f52, %%f36, %%f52 - fxor %%f54, %%f38, %%f54 - fxor %%f56, %%f40, %%f56 - fxor %%f58, %%f42, %%f58 - subcc %%g5, 64, %%g5 - fxor %%f60, %%f44, %%f60 - fxor %%f62, %%f46, %%f62 - stda %%f48, [%%o4 + %%g1] %3 - bne,pt %%xcc, 5b - ldda [%%o3] %3, %%f16 - - ldda [%%o2] %3, %%f32 - fxor %%f0, %%f16, %%f48 - fxor %%f2, %%f18, %%f50 - fxor %%f4, %%f20, %%f52 - fxor %%f6, %%f22, %%f54 - fxor %%f8, %%f24, %%f56 - fxor %%f10, %%f26, %%f58 - fxor %%f12, %%f28, %%f60 - fxor %%f14, %%f30, %%f62 - ldda [%%o0] %3, %%f16 - fxor %%f48, %%f32, %%f48 - fxor %%f50, %%f34, %%f50 - fxor %%f52, %%f36, %%f52 - fxor %%f54, %%f38, %%f54 - fxor %%f56, %%f40, %%f56 - fxor %%f58, %%f42, %%f58 - fxor %%f60, %%f44, %%f60 - fxor %%f62, %%f46, %%f62 - ldda [%%o1] %3, %%f32 - fxor %%f48, %%f16, %%f48 - fxor %%f50, %%f18, %%f50 - fxor %%f52, %%f20, %%f52 - fxor %%f54, %%f22, %%f54 - fxor %%f56, %%f24, %%f56 - fxor %%f58, %%f26, %%f58 - fxor %%f60, %%f28, %%f60 - fxor %%f62, %%f30, %%f62 - membar #Sync - fxor %%f48, %%f32, %%f48 - fxor %%f50, %%f34, %%f50 - fxor %%f52, %%f36, %%f52 - fxor %%f54, %%f38, %%f54 - fxor %%f56, %%f40, %%f56 - fxor %%f58, %%f42, %%f58 - fxor %%f60, %%f44, %%f60 - fxor %%f62, %%f46, %%f62 - stda %%f48, [%%o4] %3 - membar #Sync|#StoreStore|#StoreLoad - retl - wr %%g0, 0, %%fprs - " : : - "i" (&((struct buffer_head *)0)->b_data), - "i" (&((struct buffer_head *)0)->b_size), - "i" (FPRS_FEF|FPRS_DU), "i" (ASI_BLK_P), - "i" (FPRS_FEF), "i" (VISenter)); -} -#endif /* __sparc_v9__ */ - -#if defined(__sparc__) && !defined(__sparc_v9__) -/* - * High speed xor_block operation for RAID4/5 utilizing the - * ldd/std SPARC instructions. 
- * - * Copyright (C) 1999 Jakub Jelinek (jj@ultra.linux.cz) - * - */ - -XORBLOCK_TEMPLATE(SPARC) -{ - int size = bh_ptr[0]->b_size; - int lines = size / (sizeof (long)) / 8, i; - long *destp = (long *) bh_ptr[0]->b_data; - long *source1 = (long *) bh_ptr[1]->b_data; - long *source2, *source3, *source4; - - switch (count) { - case 2: - for (i = lines; i > 0; i--) { - __asm__ __volatile__(" - ldd [%0 + 0x00], %%g2 - ldd [%0 + 0x08], %%g4 - ldd [%0 + 0x10], %%o0 - ldd [%0 + 0x18], %%o2 - ldd [%1 + 0x00], %%o4 - ldd [%1 + 0x08], %%l0 - ldd [%1 + 0x10], %%l2 - ldd [%1 + 0x18], %%l4 - xor %%g2, %%o4, %%g2 - xor %%g3, %%o5, %%g3 - xor %%g4, %%l0, %%g4 - xor %%g5, %%l1, %%g5 - xor %%o0, %%l2, %%o0 - xor %%o1, %%l3, %%o1 - xor %%o2, %%l4, %%o2 - xor %%o3, %%l5, %%o3 - std %%g2, [%0 + 0x00] - std %%g4, [%0 + 0x08] - std %%o0, [%0 + 0x10] - std %%o2, [%0 + 0x18] - " : : "r" (destp), "r" (source1) : "g2", "g3", "g4", "g5", "o0", - "o1", "o2", "o3", "o4", "o5", "l0", "l1", "l2", "l3", "l4", "l5"); - destp += 8; - source1 += 8; - } - break; - case 3: - source2 = (long *) bh_ptr[2]->b_data; - for (i = lines; i > 0; i--) { - __asm__ __volatile__(" - ldd [%0 + 0x00], %%g2 - ldd [%0 + 0x08], %%g4 - ldd [%0 + 0x10], %%o0 - ldd [%0 + 0x18], %%o2 - ldd [%1 + 0x00], %%o4 - ldd [%1 + 0x08], %%l0 - ldd [%1 + 0x10], %%l2 - ldd [%1 + 0x18], %%l4 - xor %%g2, %%o4, %%g2 - xor %%g3, %%o5, %%g3 - ldd [%2 + 0x00], %%o4 - xor %%g4, %%l0, %%g4 - xor %%g5, %%l1, %%g5 - ldd [%2 + 0x08], %%l0 - xor %%o0, %%l2, %%o0 - xor %%o1, %%l3, %%o1 - ldd [%2 + 0x10], %%l2 - xor %%o2, %%l4, %%o2 - xor %%o3, %%l5, %%o3 - ldd [%2 + 0x18], %%l4 - xor %%g2, %%o4, %%g2 - xor %%g3, %%o5, %%g3 - xor %%g4, %%l0, %%g4 - xor %%g5, %%l1, %%g5 - xor %%o0, %%l2, %%o0 - xor %%o1, %%l3, %%o1 - xor %%o2, %%l4, %%o2 - xor %%o3, %%l5, %%o3 - std %%g2, [%0 + 0x00] - std %%g4, [%0 + 0x08] - std %%o0, [%0 + 0x10] - std %%o2, [%0 + 0x18] - " : : "r" (destp), "r" (source1), "r" (source2) - : "g2", "g3", "g4", "g5", "o0", "o1", "o2", "o3", "o4", "o5", - "l0", "l1", "l2", "l3", "l4", "l5"); - destp += 8; - source1 += 8; - source2 += 8; - } - break; - case 4: - source2 = (long *) bh_ptr[2]->b_data; - source3 = (long *) bh_ptr[3]->b_data; - for (i = lines; i > 0; i--) { - __asm__ __volatile__(" - ldd [%0 + 0x00], %%g2 - ldd [%0 + 0x08], %%g4 - ldd [%0 + 0x10], %%o0 - ldd [%0 + 0x18], %%o2 - ldd [%1 + 0x00], %%o4 - ldd [%1 + 0x08], %%l0 - ldd [%1 + 0x10], %%l2 - ldd [%1 + 0x18], %%l4 - xor %%g2, %%o4, %%g2 - xor %%g3, %%o5, %%g3 - ldd [%2 + 0x00], %%o4 - xor %%g4, %%l0, %%g4 - xor %%g5, %%l1, %%g5 - ldd [%2 + 0x08], %%l0 - xor %%o0, %%l2, %%o0 - xor %%o1, %%l3, %%o1 - ldd [%2 + 0x10], %%l2 - xor %%o2, %%l4, %%o2 - xor %%o3, %%l5, %%o3 - ldd [%2 + 0x18], %%l4 - xor %%g2, %%o4, %%g2 - xor %%g3, %%o5, %%g3 - ldd [%3 + 0x00], %%o4 - xor %%g4, %%l0, %%g4 - xor %%g5, %%l1, %%g5 - ldd [%3 + 0x08], %%l0 - xor %%o0, %%l2, %%o0 - xor %%o1, %%l3, %%o1 - ldd [%3 + 0x10], %%l2 - xor %%o2, %%l4, %%o2 - xor %%o3, %%l5, %%o3 - ldd [%3 + 0x18], %%l4 - xor %%g2, %%o4, %%g2 - xor %%g3, %%o5, %%g3 - xor %%g4, %%l0, %%g4 - xor %%g5, %%l1, %%g5 - xor %%o0, %%l2, %%o0 - xor %%o1, %%l3, %%o1 - xor %%o2, %%l4, %%o2 - xor %%o3, %%l5, %%o3 - std %%g2, [%0 + 0x00] - std %%g4, [%0 + 0x08] - std %%o0, [%0 + 0x10] - std %%o2, [%0 + 0x18] - " : : "r" (destp), "r" (source1), "r" (source2), "r" (source3) - : "g2", "g3", "g4", "g5", "o0", "o1", "o2", "o3", "o4", "o5", - "l0", "l1", "l2", "l3", "l4", "l5"); - destp += 8; - source1 += 8; - source2 += 8; - source3 += 8; - } - break; - case 5: - 
source2 = (long *) bh_ptr[2]->b_data; - source3 = (long *) bh_ptr[3]->b_data; - source4 = (long *) bh_ptr[4]->b_data; - for (i = lines; i > 0; i--) { - __asm__ __volatile__(" - ldd [%0 + 0x00], %%g2 - ldd [%0 + 0x08], %%g4 - ldd [%0 + 0x10], %%o0 - ldd [%0 + 0x18], %%o2 - ldd [%1 + 0x00], %%o4 - ldd [%1 + 0x08], %%l0 - ldd [%1 + 0x10], %%l2 - ldd [%1 + 0x18], %%l4 - xor %%g2, %%o4, %%g2 - xor %%g3, %%o5, %%g3 - ldd [%2 + 0x00], %%o4 - xor %%g4, %%l0, %%g4 - xor %%g5, %%l1, %%g5 - ldd [%2 + 0x08], %%l0 - xor %%o0, %%l2, %%o0 - xor %%o1, %%l3, %%o1 - ldd [%2 + 0x10], %%l2 - xor %%o2, %%l4, %%o2 - xor %%o3, %%l5, %%o3 - ldd [%2 + 0x18], %%l4 - xor %%g2, %%o4, %%g2 - xor %%g3, %%o5, %%g3 - ldd [%3 + 0x00], %%o4 - xor %%g4, %%l0, %%g4 - xor %%g5, %%l1, %%g5 - ldd [%3 + 0x08], %%l0 - xor %%o0, %%l2, %%o0 - xor %%o1, %%l3, %%o1 - ldd [%3 + 0x10], %%l2 - xor %%o2, %%l4, %%o2 - xor %%o3, %%l5, %%o3 - ldd [%3 + 0x18], %%l4 - xor %%g2, %%o4, %%g2 - xor %%g3, %%o5, %%g3 - ldd [%4 + 0x00], %%o4 - xor %%g4, %%l0, %%g4 - xor %%g5, %%l1, %%g5 - ldd [%4 + 0x08], %%l0 - xor %%o0, %%l2, %%o0 - xor %%o1, %%l3, %%o1 - ldd [%4 + 0x10], %%l2 - xor %%o2, %%l4, %%o2 - xor %%o3, %%l5, %%o3 - ldd [%4 + 0x18], %%l4 - xor %%g2, %%o4, %%g2 - xor %%g3, %%o5, %%g3 - xor %%g4, %%l0, %%g4 - xor %%g5, %%l1, %%g5 - xor %%o0, %%l2, %%o0 - xor %%o1, %%l3, %%o1 - xor %%o2, %%l4, %%o2 - xor %%o3, %%l5, %%o3 - std %%g2, [%0 + 0x00] - std %%g4, [%0 + 0x08] - std %%o0, [%0 + 0x10] - std %%o2, [%0 + 0x18] - " : : "r" (destp), "r" (source1), "r" (source2), "r" (source3), "r" (source4) - : "g2", "g3", "g4", "g5", "o0", "o1", "o2", "o3", "o4", "o5", - "l0", "l1", "l2", "l3", "l4", "l5"); - destp += 8; - source1 += 8; - source2 += 8; - source3 += 8; - source4 += 8; - } - break; + p3 = (unsigned long *) bh_ptr[3]->b_data; + if (count == 4) { + active_template->do_4(bytes, p0, p1, p2, p3); + return; } -} -#endif /* __sparc_v[78]__ */ - -#ifdef __alpha__ -/* - * High speed xor_block operation for RAID4/5 pipelined for Alpha EV5. - * There is a second version using EV6 prefetch instructions. 
- * - * Copyright (C) 2000 Richard Henderson (rth@redhat.com) - */ - -XORBLOCK_TEMPLATE(alpha) -{ - long lines = bh_ptr[0]->b_size / sizeof (long) / 8; - long *d = (long *) bh_ptr[0]->b_data; - long *s1 = (long *) bh_ptr[1]->b_data; - long *s2, *s3, *s4; - - if (count == 2) goto two_blocks; - - s2 = (long *) bh_ptr[2]->b_data; - if (count == 3) goto three_blocks; - - s3 = (long *) bh_ptr[3]->b_data; - if (count == 4) goto four_blocks; - - s4 = (long *) bh_ptr[4]->b_data; - goto five_blocks; - -two_blocks: -asm volatile (" - .align 4 -2: - ldq $0,0(%0) - ldq $1,0(%1) - ldq $2,8(%0) - ldq $3,8(%1) - - ldq $4,16(%0) - ldq $5,16(%1) - ldq $6,24(%0) - ldq $7,24(%1) - - ldq $16,32(%0) - ldq $17,32(%1) - ldq $18,40(%0) - ldq $19,40(%1) - - ldq $20,48(%0) - ldq $21,48(%1) - ldq $22,56(%0) - xor $0,$1,$0 # 7 cycles from $1 load - - ldq $23,56(%1) - xor $2,$3,$2 - stq $0,0(%0) - xor $4,$5,$4 - - stq $2,8(%0) - xor $6,$7,$6 - stq $4,16(%0) - xor $16,$17,$16 - - stq $6,24(%0) - xor $18,$19,$18 - stq $16,32(%0) - xor $20,$21,$20 - - stq $18,40(%0) - xor $22,$23,$22 - stq $20,48(%0) - subq %2,1,%2 - - stq $22,56(%0) - addq %0,64,%0 - addq %1,64,%1 - bgt %2,2b" - : "=r"(d), "=r"(s1), "=r"(lines) - : "0"(d), "1"(s1), "2"(lines) - : "memory", "$0", "$1", "$2", "$3", "$4", "$5", "$6", "$7", - "$16", "$17", "$18", "$19", "$20", "$21", "$22", "$23"); - return; - -three_blocks: -asm volatile (" - .align 4 -3: - ldq $0,0(%0) - ldq $1,0(%1) - ldq $2,0(%2) - ldq $3,8(%0) - - ldq $4,8(%1) - ldq $6,16(%0) - ldq $7,16(%1) - ldq $17,24(%0) - - ldq $18,24(%1) - ldq $20,32(%0) - ldq $21,32(%1) - ldq $5,8(%2) - - ldq $16,16(%2) - ldq $19,24(%2) - ldq $22,32(%2) - nop - - xor $0,$1,$1 # 8 cycles from $0 load - xor $3,$4,$4 # 6 cycles from $4 load - xor $6,$7,$7 # 6 cycles from $7 load - xor $17,$18,$18 # 5 cycles from $18 load - - xor $1,$2,$2 # 9 cycles from $2 load - xor $20,$21,$21 # 5 cycles from $21 load - stq $2,0(%0) - xor $4,$5,$5 # 6 cycles from $5 load - - stq $5,8(%0) - xor $7,$16,$16 # 7 cycles from $16 load - stq $16,16(%0) - xor $18,$19,$19 # 7 cycles from $19 load - - stq $19,24(%0) - xor $21,$22,$22 # 7 cycles from $22 load - stq $22,32(%0) - nop - - ldq $0,40(%0) - ldq $1,40(%1) - ldq $3,48(%0) - ldq $4,48(%1) - - ldq $6,56(%0) - ldq $7,56(%1) - ldq $2,40(%2) - ldq $5,48(%2) - - ldq $16,56(%2) - xor $0,$1,$1 # 4 cycles from $1 load - xor $3,$4,$4 # 5 cycles from $4 load - xor $6,$7,$7 # 5 cycles from $7 load - - xor $1,$2,$2 # 4 cycles from $2 load - xor $4,$5,$5 # 5 cycles from $5 load - stq $2,40(%0) - xor $7,$16,$16 # 4 cycles from $16 load - - stq $5,48(%0) - subq %3,1,%3 - stq $16,56(%0) - addq %2,64,%2 - - addq %1,64,%1 - addq %0,64,%0 - bgt %3,3b" - : "=r"(d), "=r"(s1), "=r"(s2), "=r"(lines) - : "0"(d), "1"(s1), "2"(s2), "3"(lines) - : "memory", "$0", "$1", "$2", "$3", "$4", "$5", "$6", "$7", - "$16", "$17", "$18", "$19", "$20", "$21", "$22"); - return; - -four_blocks: -asm volatile (" - .align 4 -4: - ldq $0,0(%0) - ldq $1,0(%1) - ldq $2,0(%2) - ldq $3,0(%3) - - ldq $4,8(%0) - ldq $5,8(%1) - ldq $6,8(%2) - ldq $7,8(%3) - ldq $16,16(%0) - ldq $17,16(%1) - ldq $18,16(%2) - ldq $19,16(%3) - - ldq $20,24(%0) - xor $0,$1,$1 # 6 cycles from $1 load - ldq $21,24(%1) - xor $2,$3,$3 # 6 cycles from $3 load - - ldq $0,24(%2) - xor $1,$3,$3 - ldq $1,24(%3) - xor $4,$5,$5 # 7 cycles from $5 load - - stq $3,0(%0) - xor $6,$7,$7 - xor $16,$17,$17 # 7 cycles from $17 load - xor $5,$7,$7 - - stq $7,8(%0) - xor $18,$19,$19 # 7 cycles from $19 load - ldq $2,32(%0) - xor $17,$19,$19 - - ldq $3,32(%1) - ldq 
$4,32(%2) - ldq $5,32(%3) - xor $20,$21,$21 # 8 cycles from $21 load - - ldq $6,40(%0) - ldq $7,40(%1) - ldq $16,40(%2) - ldq $17,40(%3) - - stq $19,16(%0) - xor $0,$1,$1 # 9 cycles from $1 load - xor $2,$3,$3 # 5 cycles from $3 load - xor $21,$1,$1 - - ldq $18,48(%0) - xor $4,$5,$5 # 5 cycles from $5 load - ldq $19,48(%1) - xor $3,$5,$5 - - ldq $20,48(%2) - ldq $21,48(%3) - ldq $0,56(%0) - ldq $1,56(%1) - - ldq $2,56(%2) - xor $6,$7,$7 # 8 cycles from $6 load - ldq $3,56(%3) - xor $16,$17,$17 # 8 cycles from $17 load - - xor $7,$17,$17 - xor $18,$19,$19 # 5 cycles from $19 load - xor $20,$21,$21 # 5 cycles from $21 load - xor $19,$21,$21 - - stq $1,24(%0) - xor $0,$1,$1 # 5 cycles from $1 load - stq $5,32(%0) - xor $2,$3,$3 # 4 cycles from $3 load - - stq $17,40(%0) - xor $1,$3,$3 - stq $21,48(%0) - subq %4,1,%4 - - stq $3,56(%0) - addq %3,64,%3 - addq %2,64,%2 - addq %1,64,%1 - - addq %0,64,%0 - bgt %4,4b" - : "=r"(d), "=r"(s1), "=r"(s2), "=r"(s3), "=r"(lines) - : "0"(d), "1"(s1), "2"(s2), "3"(s3), "4"(lines) - : "memory", "$0", "$1", "$2", "$3", "$4", "$5", "$6", "$7", - "$16", "$17", "$18", "$19", "$20", "$21"); - return; - -five_blocks: -asm volatile (" - ldq %0,0(%6) - ldq %1,8(%6) - ldq %2,16(%6) - ldq %3,24(%6) - ldq %4,32(%6) - ldq %0,%7(%0) - ldq %1,%7(%1) - ldq %2,%7(%2) - ldq %3,%7(%3) - ldq %4,%7(%4) - .align 4 -5: - ldq $0,0(%0) - ldq $1,0(%1) - ldq $2,0(%2) - ldq $3,0(%3) - - ldq $4,0(%4) - ldq $5,8(%0) - ldq $6,8(%1) - ldq $7,8(%2) - - ldq $16,8(%3) - ldq $17,8(%4) - ldq $18,16(%0) - ldq $19,16(%1) - - ldq $20,16(%2) - xor $0,$1,$1 # 6 cycles from $1 load - ldq $21,16(%3) - xor $2,$3,$3 # 6 cycles from $3 load - - ldq $0,16(%4) - xor $1,$3,$3 - ldq $1,24(%0) - xor $3,$4,$4 # 7 cycles from $4 load - - stq $4,0(%0) - xor $5,$6,$6 # 7 cycles from $6 load - xor $7,$16,$16 # 7 cycles from $16 load - xor $6,$17,$17 # 7 cycles from $17 load - - ldq $2,24(%1) - xor $16,$17,$17 - ldq $3,24(%2) - xor $18,$19,$19 # 8 cycles from $19 load - - stq $17,8(%0) - xor $19,$20,$20 # 8 cycles from $20 load - ldq $4,24(%3) - xor $21,$0,$0 # 7 cycles from $0 load - - ldq $5,24(%4) - xor $20,$0,$0 - ldq $6,32(%0) - ldq $7,32(%1) - - stq $0,16(%0) - xor $1,$2,$2 # 6 cycles from $2 load - ldq $16,32(%2) - xor $3,$4,$4 # 4 cycles from $4 load - - ldq $17,32(%3) - xor $2,$4,$4 - ldq $18,32(%4) - ldq $19,40(%0) - - ldq $20,40(%1) - ldq $21,40(%2) - ldq $0,40(%3) - xor $4,$5,$5 # 7 cycles from $5 load - - stq $5,24(%0) - xor $6,$7,$7 # 7 cycles from $7 load - ldq $1,40(%4) - ldq $2,48(%0) - - ldq $3,48(%1) - xor $7,$16,$16 # 7 cycles from $16 load - ldq $4,48(%2) - xor $17,$18,$18 # 6 cycles from $18 load - - ldq $5,48(%3) - xor $16,$18,$18 - ldq $6,48(%4) - xor $19,$20,$20 # 7 cycles from $20 load - - stq $18,32(%0) - xor $20,$21,$21 # 8 cycles from $21 load - ldq $7,56(%0) - xor $0,$1,$1 # 6 cycles from $1 load - - ldq $16,56(%1) - ldq $17,56(%2) - ldq $18,56(%3) - ldq $19,56(%4) - - xor $21,$1,$1 - xor $2,$3,$3 # 9 cycles from $3 load - xor $3,$4,$4 # 9 cycles from $4 load - xor $5,$6,$6 # 8 cycles from $6 load - - unop - xor $4,$6,$6 - xor $7,$16,$16 # 7 cycles from $16 load - xor $17,$18,$18 # 6 cycles from $18 load - - stq $6,48(%0) - xor $16,$18,$18 - subq %5,1,%5 - xor $18,$19,$19 # 8 cycles from $19 load - - stq $19,56(%0) - addq %4,64,%4 - addq %3,64,%3 - addq %2,64,%2 - - addq %1,64,%1 - addq %0,64,%0 - bgt %5,5b" - : "=&r"(d), "=&r"(s1), "=&r"(s2), "=&r"(s3), "=r"(s4), "=r"(lines) - /* ARG! We've run out of asm arguments! We've got to reload - all those pointers we just loaded. 
*/ - : "r"(bh_ptr), "i" (&((struct buffer_head *)0)->b_data), "5"(lines) - : "memory", "$0", "$1", "$2", "$3", "$4", "$5", "$6", "$7", - "$16", "$17", "$18", "$19", "$20", "$21"); - return; + p4 = (unsigned long *) bh_ptr[4]->b_data; + active_template->do_5(bytes, p0, p1, p2, p3, p4); } -#define prefetch(base, ofs) \ - asm("ldq $31,%2(%0)" : "=r"(base) : "0"(base), "i"(ofs)) - -XORBLOCK_TEMPLATE(alpha_prefetch) -{ - long lines = bh_ptr[0]->b_size / sizeof (long) / 8; - long *d = (long *) bh_ptr[0]->b_data; - long *s1 = (long *) bh_ptr[1]->b_data; - long *s2, *s3, *s4; - long p; - - p = count == 2; - prefetch(d, 0); - prefetch(s1, 0); - prefetch(d, 64); - prefetch(s1, 64); - prefetch(d, 128); - prefetch(s1, 128); - prefetch(d, 192); - prefetch(s1, 192); - if (p) goto two_blocks; - - s2 = (long *) bh_ptr[2]->b_data; - p = count == 3; - prefetch(s2, 0); - prefetch(s2, 64); - prefetch(s2, 128); - prefetch(s2, 192); - if (p) goto three_blocks; - - s3 = (long *) bh_ptr[3]->b_data; - p = count == 4; - prefetch(s3, 0); - prefetch(s3, 64); - prefetch(s3, 128); - prefetch(s3, 192); - if (p) goto four_blocks; - - s4 = (long *) bh_ptr[4]->b_data; - prefetch(s4, 0); - prefetch(s4, 64); - prefetch(s4, 128); - prefetch(s4, 192); - goto five_blocks; - -two_blocks: -asm volatile (" - .align 4 -2: - ldq $0,0(%0) - ldq $1,0(%1) - ldq $2,8(%0) - ldq $3,8(%1) - - ldq $4,16(%0) - ldq $5,16(%1) - ldq $6,24(%0) - ldq $7,24(%1) - - ldq $16,32(%0) - ldq $17,32(%1) - ldq $18,40(%0) - ldq $19,40(%1) - - ldq $20,48(%0) - ldq $21,48(%1) - ldq $22,56(%0) - ldq $23,56(%1) - - ldq $31,256(%0) - xor $0,$1,$0 # 8 cycles from $1 load - ldq $31,256(%1) - xor $2,$3,$2 - - stq $0,0(%0) - xor $4,$5,$4 - stq $2,8(%0) - xor $6,$7,$6 - - stq $4,16(%0) - xor $16,$17,$16 - stq $6,24(%0) - xor $18,$19,$18 - - stq $16,32(%0) - xor $20,$21,$20 - stq $18,40(%0) - xor $22,$23,$22 - - stq $20,48(%0) - subq %2,1,%2 - stq $22,56(%0) - addq %0,64,%0 - - addq %1,64,%1 - bgt %2,2b" - : "=r"(d), "=r"(s1), "=r"(lines) - : "0"(d), "1"(s1), "2"(lines) - : "memory", "$0", "$1", "$2", "$3", "$4", "$5", "$6", "$7", - "$16", "$17", "$18", "$19", "$20", "$21", "$22", "$23"); - return; - -three_blocks: -asm volatile (" - .align 4 -3: - ldq $0,0(%0) - ldq $1,0(%1) - ldq $2,0(%2) - ldq $3,8(%0) - - ldq $4,8(%1) - ldq $6,16(%0) - ldq $7,16(%1) - ldq $17,24(%0) - - ldq $18,24(%1) - ldq $20,32(%0) - ldq $21,32(%1) - ldq $5,8(%2) - - ldq $16,16(%2) - ldq $19,24(%2) - ldq $22,32(%2) - nop - - xor $0,$1,$1 # 8 cycles from $0 load - xor $3,$4,$4 # 7 cycles from $4 load - xor $6,$7,$7 # 6 cycles from $7 load - xor $17,$18,$18 # 5 cycles from $18 load - - xor $1,$2,$2 # 9 cycles from $2 load - xor $20,$21,$21 # 5 cycles from $21 load - stq $2,0(%0) - xor $4,$5,$5 # 6 cycles from $5 load - - stq $5,8(%0) - xor $7,$16,$16 # 7 cycles from $16 load - stq $16,16(%0) - xor $18,$19,$19 # 7 cycles from $19 load - - stq $19,24(%0) - xor $21,$22,$22 # 7 cycles from $22 load - stq $22,32(%0) - nop - - ldq $0,40(%0) - ldq $1,40(%1) - ldq $3,48(%0) - ldq $4,48(%1) - - ldq $6,56(%0) - ldq $7,56(%1) - ldq $2,40(%2) - ldq $5,48(%2) - - ldq $16,56(%2) - ldq $31,256(%0) - ldq $31,256(%1) - ldq $31,256(%2) - - xor $0,$1,$1 # 6 cycles from $1 load - xor $3,$4,$4 # 5 cycles from $4 load - xor $6,$7,$7 # 5 cycles from $7 load - xor $1,$2,$2 # 4 cycles from $2 load - - xor $4,$5,$5 # 5 cycles from $5 load - xor $7,$16,$16 # 4 cycles from $16 load - stq $2,40(%0) - subq %3,1,%3 - - stq $5,48(%0) - addq %2,64,%2 - stq $16,56(%0) - addq %1,64,%1 - - addq %0,64,%0 - bgt %3,3b" - : "=r"(d), 
"=r"(s1), "=r"(s2), "=r"(lines) - : "0"(d), "1"(s1), "2"(s2), "3"(lines) - : "memory", "$0", "$1", "$2", "$3", "$4", "$5", "$6", "$7", - "$16", "$17", "$18", "$19", "$20", "$21", "$22"); - return; - -four_blocks: -asm volatile (" - .align 4 -4: - ldq $0,0(%0) - ldq $1,0(%1) - ldq $2,0(%2) - ldq $3,0(%3) - - ldq $4,8(%0) - ldq $5,8(%1) - ldq $6,8(%2) - ldq $7,8(%3) - - ldq $16,16(%0) - ldq $17,16(%1) - ldq $18,16(%2) - ldq $19,16(%3) - - ldq $20,24(%0) - xor $0,$1,$1 # 6 cycles from $1 load - ldq $21,24(%1) - xor $2,$3,$3 # 6 cycles from $3 load - - ldq $0,24(%2) - xor $1,$3,$3 - ldq $1,24(%3) - xor $4,$5,$5 # 7 cycles from $5 load - - stq $3,0(%0) - xor $6,$7,$7 - xor $16,$17,$17 # 7 cycles from $17 load - xor $5,$7,$7 - - stq $7,8(%0) - xor $18,$19,$19 # 7 cycles from $19 load - ldq $2,32(%0) - xor $17,$19,$19 - - ldq $3,32(%1) - ldq $4,32(%2) - ldq $5,32(%3) - xor $20,$21,$21 # 8 cycles from $21 load - - ldq $6,40(%0) - ldq $7,40(%1) - ldq $16,40(%2) - ldq $17,40(%3) - - stq $19,16(%0) - xor $0,$1,$1 # 9 cycles from $1 load - xor $2,$3,$3 # 5 cycles from $3 load - xor $21,$1,$1 - - ldq $18,48(%0) - xor $4,$5,$5 # 5 cycles from $5 load - ldq $19,48(%1) - xor $3,$5,$5 - - ldq $20,48(%2) - ldq $21,48(%3) - ldq $0,56(%0) - ldq $1,56(%1) - - ldq $2,56(%2) - xor $6,$7,$7 # 8 cycles from $6 load - ldq $3,56(%3) - xor $16,$17,$17 # 8 cycles from $17 load - - ldq $31,256(%0) - xor $7,$17,$17 - ldq $31,256(%1) - xor $18,$19,$19 # 6 cycles from $19 load +/* Set of all registered templates. */ +static struct xor_block_template *template_list; - ldq $31,256(%2) - xor $20,$21,$21 # 6 cycles from $21 load - ldq $31,256(%3) - xor $19,$21,$21 +/* The -6*32 shift factor colors the cache. */ +#define BENCH_SIZE (PAGE_SIZE-6*32) - stq $1,24(%0) - xor $0,$1,$1 # 7 cycles from $1 load - stq $5,32(%0) - xor $2,$3,$3 # 6 cycles from $3 load - - stq $17,40(%0) - xor $1,$3,$3 - stq $21,48(%0) - subq %4,1,%4 - - stq $3,56(%0) - addq %3,64,%3 - addq %2,64,%2 - addq %1,64,%1 - - addq %0,64,%0 - bgt %4,4b" - : "=r"(d), "=r"(s1), "=r"(s2), "=r"(s3), "=r"(lines) - : "0"(d), "1"(s1), "2"(s2), "3"(s3), "4"(lines) - : "memory", "$0", "$1", "$2", "$3", "$4", "$5", "$6", "$7", - "$16", "$17", "$18", "$19", "$20", "$21"); - return; - -five_blocks: -asm volatile (" - ldq %0,0(%6) - ldq %1,8(%6) - ldq %2,16(%6) - ldq %3,24(%6) - ldq %4,32(%6) - ldq %0,%7(%0) - ldq %1,%7(%1) - ldq %2,%7(%2) - ldq %3,%7(%3) - ldq %4,%7(%4) - .align 4 -5: - ldq $0,0(%0) - ldq $1,0(%1) - ldq $2,0(%2) - ldq $3,0(%3) - - ldq $4,0(%4) - ldq $5,8(%0) - ldq $6,8(%1) - ldq $7,8(%2) - - ldq $16,8(%3) - ldq $17,8(%4) - ldq $18,16(%0) - ldq $19,16(%1) - - ldq $20,16(%2) - xor $0,$1,$1 # 6 cycles from $1 load - ldq $21,16(%3) - xor $2,$3,$3 # 6 cycles from $3 load - - ldq $0,16(%4) - xor $1,$3,$3 - ldq $1,24(%0) - xor $3,$4,$4 # 7 cycles from $4 load - - stq $4,0(%0) - xor $5,$6,$6 # 7 cycles from $6 load - xor $7,$16,$16 # 7 cycles from $16 load - xor $6,$17,$17 # 7 cycles from $17 load - - ldq $2,24(%1) - xor $16,$17,$17 - ldq $3,24(%2) - xor $18,$19,$19 # 8 cycles from $19 load - - stq $17,8(%0) - xor $19,$20,$20 # 8 cycles from $20 load - ldq $4,24(%3) - xor $21,$0,$0 # 7 cycles from $0 load - - ldq $5,24(%4) - xor $20,$0,$0 - ldq $6,32(%0) - ldq $7,32(%1) - - stq $0,16(%0) - xor $1,$2,$2 # 6 cycles from $2 load - ldq $16,32(%2) - xor $3,$4,$4 # 4 cycles from $4 load - - ldq $17,32(%3) - xor $2,$4,$4 - ldq $18,32(%4) - ldq $19,40(%0) - - ldq $20,40(%1) - ldq $21,40(%2) - ldq $0,40(%3) - xor $4,$5,$5 # 7 cycles from $5 load - - stq $5,24(%0) - xor 
$6,$7,$7 # 7 cycles from $7 load - ldq $1,40(%4) - ldq $2,48(%0) - - ldq $3,48(%1) - xor $7,$16,$16 # 7 cycles from $16 load - ldq $4,48(%2) - xor $17,$18,$18 # 6 cycles from $18 load - - ldq $5,48(%3) - xor $16,$18,$18 - ldq $6,48(%4) - xor $19,$20,$20 # 7 cycles from $20 load - - stq $18,32(%0) - xor $20,$21,$21 # 8 cycles from $21 load - ldq $7,56(%0) - xor $0,$1,$1 # 6 cycles from $1 load - - ldq $16,56(%1) - ldq $17,56(%2) - ldq $18,56(%3) - ldq $19,56(%4) - - ldq $31,256(%0) - xor $21,$1,$1 - ldq $31,256(%1) - xor $2,$3,$3 # 9 cycles from $3 load - - ldq $31,256(%2) - xor $3,$4,$4 # 9 cycles from $4 load - ldq $31,256(%3) - xor $5,$6,$6 # 8 cycles from $6 load - - ldq $31,256(%4) - xor $4,$6,$6 - xor $7,$16,$16 # 7 cycles from $16 load - xor $17,$18,$18 # 6 cycles from $18 load - - stq $6,48(%0) - xor $16,$18,$18 - subq %5,1,%5 - xor $18,$19,$19 # 8 cycles from $19 load - - stq $19,56(%0) - addq %4,64,%4 - addq %3,64,%3 - addq %2,64,%2 - - addq %1,64,%1 - addq %0,64,%0 - bgt %5,5b" - : "=&r"(d), "=&r"(s1), "=&r"(s2), "=&r"(s3), "=r"(s4), "=r"(lines) - /* ARG! We've run out of asm arguments! We've got to reload - all those pointers we just loaded. */ - : "r"(bh_ptr), "i" (&((struct buffer_head *)0)->b_data), "5"(lines) - : "memory", "$0", "$1", "$2", "$3", "$4", "$5", "$6", "$7", - "$16", "$17", "$18", "$19", "$20", "$21"); - return; -} - -#undef prefetch - -#endif /* __alpha__ */ - -#ifndef __sparc_v9__ - -/* - * this one works reasonably on any x86 CPU - * (send me an assembly version for inclusion if you can make it faster) - * - * this one is just as fast as written in pure assembly on x86. - * the reason for this separate version is that the - * fast open-coded xor routine "32reg" produces suboptimal code - * on x86, due to lack of registers. 
- */ -XORBLOCK_TEMPLATE(8regs) -{ - int len = bh_ptr[0]->b_size; - long *destp = (long *) bh_ptr[0]->b_data; - long *source1, *source2, *source3, *source4; - long lines = len / (sizeof (long)) / 8, i; - - switch(count) { - case 2: - source1 = (long *) bh_ptr[1]->b_data; - for (i = lines; i > 0; i--) { - *(destp + 0) ^= *(source1 + 0); - *(destp + 1) ^= *(source1 + 1); - *(destp + 2) ^= *(source1 + 2); - *(destp + 3) ^= *(source1 + 3); - *(destp + 4) ^= *(source1 + 4); - *(destp + 5) ^= *(source1 + 5); - *(destp + 6) ^= *(source1 + 6); - *(destp + 7) ^= *(source1 + 7); - source1 += 8; - destp += 8; - } - break; - case 3: - source2 = (long *) bh_ptr[2]->b_data; - source1 = (long *) bh_ptr[1]->b_data; - for (i = lines; i > 0; i--) { - *(destp + 0) ^= *(source1 + 0); - *(destp + 0) ^= *(source2 + 0); - *(destp + 1) ^= *(source1 + 1); - *(destp + 1) ^= *(source2 + 1); - *(destp + 2) ^= *(source1 + 2); - *(destp + 2) ^= *(source2 + 2); - *(destp + 3) ^= *(source1 + 3); - *(destp + 3) ^= *(source2 + 3); - *(destp + 4) ^= *(source1 + 4); - *(destp + 4) ^= *(source2 + 4); - *(destp + 5) ^= *(source1 + 5); - *(destp + 5) ^= *(source2 + 5); - *(destp + 6) ^= *(source1 + 6); - *(destp + 6) ^= *(source2 + 6); - *(destp + 7) ^= *(source1 + 7); - *(destp + 7) ^= *(source2 + 7); - source1 += 8; - source2 += 8; - destp += 8; - } - break; - case 4: - source3 = (long *) bh_ptr[3]->b_data; - source2 = (long *) bh_ptr[2]->b_data; - source1 = (long *) bh_ptr[1]->b_data; - for (i = lines; i > 0; i--) { - *(destp + 0) ^= *(source1 + 0); - *(destp + 0) ^= *(source2 + 0); - *(destp + 0) ^= *(source3 + 0); - *(destp + 1) ^= *(source1 + 1); - *(destp + 1) ^= *(source2 + 1); - *(destp + 1) ^= *(source3 + 1); - *(destp + 2) ^= *(source1 + 2); - *(destp + 2) ^= *(source2 + 2); - *(destp + 2) ^= *(source3 + 2); - *(destp + 3) ^= *(source1 + 3); - *(destp + 3) ^= *(source2 + 3); - *(destp + 3) ^= *(source3 + 3); - *(destp + 4) ^= *(source1 + 4); - *(destp + 4) ^= *(source2 + 4); - *(destp + 4) ^= *(source3 + 4); - *(destp + 5) ^= *(source1 + 5); - *(destp + 5) ^= *(source2 + 5); - *(destp + 5) ^= *(source3 + 5); - *(destp + 6) ^= *(source1 + 6); - *(destp + 6) ^= *(source2 + 6); - *(destp + 6) ^= *(source3 + 6); - *(destp + 7) ^= *(source1 + 7); - *(destp + 7) ^= *(source2 + 7); - *(destp + 7) ^= *(source3 + 7); - source1 += 8; - source2 += 8; - source3 += 8; - destp += 8; - } - break; - case 5: - source4 = (long *) bh_ptr[4]->b_data; - source3 = (long *) bh_ptr[3]->b_data; - source2 = (long *) bh_ptr[2]->b_data; - source1 = (long *) bh_ptr[1]->b_data; - for (i = lines; i > 0; i--) { - *(destp + 0) ^= *(source1 + 0); - *(destp + 0) ^= *(source2 + 0); - *(destp + 0) ^= *(source3 + 0); - *(destp + 0) ^= *(source4 + 0); - *(destp + 1) ^= *(source1 + 1); - *(destp + 1) ^= *(source2 + 1); - *(destp + 1) ^= *(source3 + 1); - *(destp + 1) ^= *(source4 + 1); - *(destp + 2) ^= *(source1 + 2); - *(destp + 2) ^= *(source2 + 2); - *(destp + 2) ^= *(source3 + 2); - *(destp + 2) ^= *(source4 + 2); - *(destp + 3) ^= *(source1 + 3); - *(destp + 3) ^= *(source2 + 3); - *(destp + 3) ^= *(source3 + 3); - *(destp + 3) ^= *(source4 + 3); - *(destp + 4) ^= *(source1 + 4); - *(destp + 4) ^= *(source2 + 4); - *(destp + 4) ^= *(source3 + 4); - *(destp + 4) ^= *(source4 + 4); - *(destp + 5) ^= *(source1 + 5); - *(destp + 5) ^= *(source2 + 5); - *(destp + 5) ^= *(source3 + 5); - *(destp + 5) ^= *(source4 + 5); - *(destp + 6) ^= *(source1 + 6); - *(destp + 6) ^= *(source2 + 6); - *(destp + 6) ^= *(source3 + 6); - *(destp + 6) ^= *(source4 + 6); - 
*(destp + 7) ^= *(source1 + 7); - *(destp + 7) ^= *(source2 + 7); - *(destp + 7) ^= *(source3 + 7); - *(destp + 7) ^= *(source4 + 7); - source1 += 8; - source2 += 8; - source3 += 8; - source4 += 8; - destp += 8; - } - break; - } -} - -/* - * platform independent RAID5 checksum calculation, this should - * be very fast on any platform that has a decent amount of - * registers. (32 or more) - */ -XORBLOCK_TEMPLATE(32regs) -{ - int size = bh_ptr[0]->b_size; - int lines = size / (sizeof (long)) / 8, i; - long *destp = (long *) bh_ptr[0]->b_data; - long *source1, *source2, *source3, *source4; - - /* LOTS of registers available... - We do explicite loop-unrolling here for code which - favours RISC machines. In fact this is almoast direct - RISC assembly on Alpha and SPARC :-) */ - - - switch(count) { - case 2: - source1 = (long *) bh_ptr[1]->b_data; - for (i = lines; i > 0; i--) { - register long d0, d1, d2, d3, d4, d5, d6, d7; - d0 = destp[0]; /* Pull the stuff into registers */ - d1 = destp[1]; /* ... in bursts, if possible. */ - d2 = destp[2]; - d3 = destp[3]; - d4 = destp[4]; - d5 = destp[5]; - d6 = destp[6]; - d7 = destp[7]; - d0 ^= source1[0]; - d1 ^= source1[1]; - d2 ^= source1[2]; - d3 ^= source1[3]; - d4 ^= source1[4]; - d5 ^= source1[5]; - d6 ^= source1[6]; - d7 ^= source1[7]; - destp[0] = d0; /* Store the result (in burts) */ - destp[1] = d1; - destp[2] = d2; - destp[3] = d3; - destp[4] = d4; /* Store the result (in burts) */ - destp[5] = d5; - destp[6] = d6; - destp[7] = d7; - source1 += 8; - destp += 8; - } - break; - case 3: - source2 = (long *) bh_ptr[2]->b_data; - source1 = (long *) bh_ptr[1]->b_data; - for (i = lines; i > 0; i--) { - register long d0, d1, d2, d3, d4, d5, d6, d7; - d0 = destp[0]; /* Pull the stuff into registers */ - d1 = destp[1]; /* ... in bursts, if possible. */ - d2 = destp[2]; - d3 = destp[3]; - d4 = destp[4]; - d5 = destp[5]; - d6 = destp[6]; - d7 = destp[7]; - d0 ^= source1[0]; - d1 ^= source1[1]; - d2 ^= source1[2]; - d3 ^= source1[3]; - d4 ^= source1[4]; - d5 ^= source1[5]; - d6 ^= source1[6]; - d7 ^= source1[7]; - d0 ^= source2[0]; - d1 ^= source2[1]; - d2 ^= source2[2]; - d3 ^= source2[3]; - d4 ^= source2[4]; - d5 ^= source2[5]; - d6 ^= source2[6]; - d7 ^= source2[7]; - destp[0] = d0; /* Store the result (in burts) */ - destp[1] = d1; - destp[2] = d2; - destp[3] = d3; - destp[4] = d4; /* Store the result (in burts) */ - destp[5] = d5; - destp[6] = d6; - destp[7] = d7; - source1 += 8; - source2 += 8; - destp += 8; - } - break; - case 4: - source3 = (long *) bh_ptr[3]->b_data; - source2 = (long *) bh_ptr[2]->b_data; - source1 = (long *) bh_ptr[1]->b_data; - for (i = lines; i > 0; i--) { - register long d0, d1, d2, d3, d4, d5, d6, d7; - d0 = destp[0]; /* Pull the stuff into registers */ - d1 = destp[1]; /* ... in bursts, if possible. 
*/ - d2 = destp[2]; - d3 = destp[3]; - d4 = destp[4]; - d5 = destp[5]; - d6 = destp[6]; - d7 = destp[7]; - d0 ^= source1[0]; - d1 ^= source1[1]; - d2 ^= source1[2]; - d3 ^= source1[3]; - d4 ^= source1[4]; - d5 ^= source1[5]; - d6 ^= source1[6]; - d7 ^= source1[7]; - d0 ^= source2[0]; - d1 ^= source2[1]; - d2 ^= source2[2]; - d3 ^= source2[3]; - d4 ^= source2[4]; - d5 ^= source2[5]; - d6 ^= source2[6]; - d7 ^= source2[7]; - d0 ^= source3[0]; - d1 ^= source3[1]; - d2 ^= source3[2]; - d3 ^= source3[3]; - d4 ^= source3[4]; - d5 ^= source3[5]; - d6 ^= source3[6]; - d7 ^= source3[7]; - destp[0] = d0; /* Store the result (in burts) */ - destp[1] = d1; - destp[2] = d2; - destp[3] = d3; - destp[4] = d4; /* Store the result (in burts) */ - destp[5] = d5; - destp[6] = d6; - destp[7] = d7; - source1 += 8; - source2 += 8; - source3 += 8; - destp += 8; - } - break; - case 5: - source4 = (long *) bh_ptr[4]->b_data; - source3 = (long *) bh_ptr[3]->b_data; - source2 = (long *) bh_ptr[2]->b_data; - source1 = (long *) bh_ptr[1]->b_data; - for (i = lines; i > 0; i--) { - register long d0, d1, d2, d3, d4, d5, d6, d7; - d0 = destp[0]; /* Pull the stuff into registers */ - d1 = destp[1]; /* ... in bursts, if possible. */ - d2 = destp[2]; - d3 = destp[3]; - d4 = destp[4]; - d5 = destp[5]; - d6 = destp[6]; - d7 = destp[7]; - d0 ^= source1[0]; - d1 ^= source1[1]; - d2 ^= source1[2]; - d3 ^= source1[3]; - d4 ^= source1[4]; - d5 ^= source1[5]; - d6 ^= source1[6]; - d7 ^= source1[7]; - d0 ^= source2[0]; - d1 ^= source2[1]; - d2 ^= source2[2]; - d3 ^= source2[3]; - d4 ^= source2[4]; - d5 ^= source2[5]; - d6 ^= source2[6]; - d7 ^= source2[7]; - d0 ^= source3[0]; - d1 ^= source3[1]; - d2 ^= source3[2]; - d3 ^= source3[3]; - d4 ^= source3[4]; - d5 ^= source3[5]; - d6 ^= source3[6]; - d7 ^= source3[7]; - d0 ^= source4[0]; - d1 ^= source4[1]; - d2 ^= source4[2]; - d3 ^= source4[3]; - d4 ^= source4[4]; - d5 ^= source4[5]; - d6 ^= source4[6]; - d7 ^= source4[7]; - destp[0] = d0; /* Store the result (in burts) */ - destp[1] = d1; - destp[2] = d2; - destp[3] = d3; - destp[4] = d4; /* Store the result (in burts) */ - destp[5] = d5; - destp[6] = d6; - destp[7] = d7; - source1 += 8; - source2 += 8; - source3 += 8; - source4 += 8; - destp += 8; - } - break; - } -} - -/* - * (the -6*32 shift factor colors the cache) - */ -#define SIZE (PAGE_SIZE-6*32) - -static void xor_speed ( struct xor_block_template * func, - struct buffer_head *b1, struct buffer_head *b2) +static void +do_xor_speed(struct xor_block_template *tmpl, void *b1, void *b2) { int speed; unsigned long now; int i, count, max; - struct buffer_head *bh_ptr[6]; - func->next = xor_functions; - xor_functions = func; - bh_ptr[0] = b1; - bh_ptr[1] = b2; + tmpl->next = template_list; + template_list = tmpl; /* - * count the number of XORs done during a whole jiffy. - * calculate the speed of checksumming from this. - * (we use a 2-page allocation to have guaranteed - * color L1-cache layout) + * Count the number of XORs done during a whole jiffy, and use + * this to calculate the speed of checksumming. We use a 2-page + * allocation to have guaranteed color L1-cache layout. 
*/ max = 0; for (i = 0; i < 5; i++) { @@ -2600,7 +82,7 @@ static void xor_speed ( struct xor_block_template * func, count = 0; while (jiffies == now) { mb(); - func->xor_block(2,bh_ptr); + tmpl->do_2(BENCH_SIZE, b1, b2); mb(); count++; mb(); @@ -2609,120 +91,53 @@ static void xor_speed ( struct xor_block_template * func, max = count; } - speed = max * (HZ*SIZE/1024); - func->speed = speed; + speed = max * (HZ * BENCH_SIZE / 1024); + tmpl->speed = speed; - printk( " %-10s: %5d.%03d MB/sec\n", func->name, - speed / 1000, speed % 1000); + printk(" %-10s: %5d.%03d MB/sec\n", tmpl->name, + speed / 1000, speed % 1000); } -static inline void pick_fastest_function(void) +static int +calibrate_xor_block(void) { + void *b1, *b2; struct xor_block_template *f, *fastest; - fastest = xor_functions; - for (f = fastest; f; f = f->next) { - if (f->speed > fastest->speed) - fastest = f; - } -#ifdef CONFIG_X86_XMM - if (cpu_has_xmm) { - /* we force the use of the KNI xor block because it - can write around l2. we may also be able - to load into the l1 only depending on how - the cpu deals with a load to a line that is - being prefetched. - */ - fastest = &t_xor_block_pIII_kni; + b1 = (void *) md__get_free_pages(GFP_KERNEL, 2); + if (! b1) { + printk("raid5: Yikes! No memory available.\n"); + return -ENOMEM; } -#endif -#ifdef __alpha__ - if (implver() == IMPLVER_EV6) { - /* Force the use of alpha_prefetch if EV6, as it - is significantly faster in the cold cache case. */ - fastest = &t_xor_block_alpha_prefetch; - } -#endif - xor_block = fastest->xor_block; - printk( "using fastest function: %s (%d.%03d MB/sec)\n", fastest->name, - fastest->speed / 1000, fastest->speed % 1000); -} - -static struct buffer_head b1, b2; - -void calibrate_xor_block(void) -{ - if (xor_block) - return; - memset(&b1,0,sizeof(b1)); - b2 = b1; - - b1.b_data = (char *) md__get_free_pages(GFP_KERNEL,2); - if (!b1.b_data) { - pick_fastest_function(); - return; - } - b2.b_data = b1.b_data + 2*PAGE_SIZE + SIZE; - - b1.b_size = SIZE; + b2 = b1 + 2*PAGE_SIZE + BENCH_SIZE; printk(KERN_INFO "raid5: measuring checksumming speed\n"); + sti(); - sti(); /* should be safe */ +#define xor_speed(templ) do_xor_speed((templ), b1, b2) -#if defined(__sparc__) && !defined(__sparc_v9__) - printk(KERN_INFO "raid5: trying high-speed SPARC checksum routine\n"); - xor_speed(&t_xor_block_SPARC,&b1,&b2); -#endif + XOR_TRY_TEMPLATES; -#ifdef CONFIG_X86_XMM - if (cpu_has_xmm) { - printk(KERN_INFO - "raid5: KNI detected, trying cache-avoiding KNI checksum routine\n"); - xor_speed(&t_xor_block_pIII_kni,&b1,&b2); - } -#endif /* CONFIG_X86_XMM */ +#undef xor_speed -#ifdef __i386__ - if (md_cpu_has_mmx()) { - printk(KERN_INFO - "raid5: MMX detected, trying high-speed MMX checksum routines\n"); - xor_speed(&t_xor_block_pII_mmx,&b1,&b2); - xor_speed(&t_xor_block_p5_mmx,&b1,&b2); - } -#endif /* __i386__ */ + free_pages((unsigned long)b1, 2); -#ifdef __alpha__ - xor_speed(&t_xor_block_alpha,&b1,&b2); - xor_speed(&t_xor_block_alpha_prefetch,&b1,&b2); -#endif - - xor_speed(&t_xor_block_8regs,&b1,&b2); - xor_speed(&t_xor_block_32regs,&b1,&b2); + fastest = template_list; + for (f = fastest; f; f = f->next) + if (f->speed > fastest->speed) + fastest = f; - free_pages((unsigned long)b1.b_data,2); - pick_fastest_function(); -} +#ifdef XOR_SELECT_TEMPLATE + fastest = XOR_SELECT_TEMPLATE(fastest); +#endif -#else /* __sparc_v9__ */ + active_template = fastest; + printk("raid5: using function: %s (%d.%03d MB/sec)\n", + fastest->name, fastest->speed / 1000, fastest->speed % 1000); 
-void calibrate_xor_block(void) -{ - if (xor_block) - return; - printk(KERN_INFO "raid5: using high-speed VIS checksum routine\n"); - xor_block = xor_block_VIS; + return 0; } -#endif /* __sparc_v9__ */ - MD_EXPORT_SYMBOL(xor_block); -MD_EXPORT_SYMBOL(calibrate_xor_block); -#ifdef MODULE -int init_module(void) -{ - calibrate_xor_block(); - return 0; -} -#endif +module_init(calibrate_xor_block);
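A note on the new interface. The patch replaces compile-time registration of whole xor_block() implementations with a table of per-count routines. The struct behind active_template and template_list is declared in the new <linux/raid/xor.h>, which this page does not show; the sketch below is reconstructed purely from the field accesses visible in the diff (next, name, speed, do_2 through do_5) and may differ in detail from the real header.

	struct xor_block_template {
		struct xor_block_template *next; /* chained by do_xor_speed() */
		char *name;                      /* shown in the benchmark printout */
		int speed;                       /* measured throughput, KB/sec */
		void (*do_2)(unsigned long, unsigned long *, unsigned long *);
		void (*do_3)(unsigned long, unsigned long *, unsigned long *,
			     unsigned long *);
		void (*do_4)(unsigned long, unsigned long *, unsigned long *,
			     unsigned long *, unsigned long *);
		void (*do_5)(unsigned long, unsigned long *, unsigned long *,
			     unsigned long *, unsigned long *, unsigned long *);
	};

With this shape, the new xor_block() above reduces to a dispatch on the buffer count, and the five-way switch that every old routine duplicated disappears from this file.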
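Each architecture now supplies its candidates through <asm/xor.h>: that header defines XOR_TRY_TEMPLATES, which calibrate_xor_block() expands under its local xor_speed() macro to benchmark every candidate, and may define XOR_SELECT_TEMPLATE to override the raw benchmark winner (the role played by the forced KNI and EV6-prefetch choices removed above). The fragment below is a hypothetical minimal header, not kernel source: the names xor_generic_2 and xor_block_generic are invented for illustration, only do_2 is shown, and a real template must provide do_3 through do_5 as well.

	/* One "line" is eight longs, the same unrolling unit the removed
	   8regs/32regs routines used.  BENCH_SIZE and the RAID5 buffer
	   sizes are multiples of this, so no tail handling is needed. */
	static void
	xor_generic_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
	{
		long lines = bytes / (sizeof (long) * 8);

		do {
			p1[0] ^= p2[0];  p1[1] ^= p2[1];
			p1[2] ^= p2[2];  p1[3] ^= p2[3];
			p1[4] ^= p2[4];  p1[5] ^= p2[5];
			p1[6] ^= p2[6];  p1[7] ^= p2[7];
			p1 += 8;  p2 += 8;
		} while (--lines > 0);
	}

	static struct xor_block_template xor_block_generic = {
		name:	"generic",
		do_2:	xor_generic_2,
		/* do_3, do_4, do_5 omitted from this sketch */
	};

	/* Expanded by calibrate_xor_block() with xor_speed() mapped
	   onto do_xor_speed(). */
	#define XOR_TRY_TEMPLATES			\
		do {					\
			xor_speed(&xor_block_generic);	\
		} while (0)

The gcc "label:" initializer style matches what the 2.4 tree used at the time; a header offering several routines simply chains more xor_speed() calls inside XOR_TRY_TEMPLATES.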
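On the units in do_xor_speed(): max counts complete BENCH_SIZE checksums inside one jiffy, so max * (HZ * BENCH_SIZE / 1024) approximates KB per second, which the printk renders as MB/sec by splitting on 1000. As a worked example, assuming the common x86 values of the period (HZ = 100, PAGE_SIZE = 4096), BENCH_SIZE is 4096 - 192 = 3904 bytes and the per-iteration factor is 100 * 3904 / 1024 = 381; a routine managing max = 300 iterations per jiffy therefore scores 300 * 381 = 114300 and prints as 114.300 MB/sec.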