diff options
Diffstat (limited to 'drivers/block/xor.c')
-rw-r--r-- | drivers/block/xor.c | 1894 |
1 file changed, 1894 insertions, 0 deletions
diff --git a/drivers/block/xor.c b/drivers/block/xor.c
new file mode 100644
index 000000000..ca1bb1564
--- /dev/null
+++ b/drivers/block/xor.c
@@ -0,0 +1,1894 @@
+/*
+ * xor.c : Multiple Devices driver for Linux
+ *
+ * Copyright (C) 1996, 1997, 1998, 1999 Ingo Molnar, Matti Aarnio, Jakub Jelinek
+ *
+ *
+ * optimized RAID-5 checksumming functions.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * You should have received a copy of the GNU General Public License
+ * (for example /usr/src/linux/COPYING); if not, write to the Free
+ * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+#define BH_TRACE 0
+#include <linux/module.h>
+#include <linux/raid/md.h>
+#ifdef __sparc_v9__
+#include <asm/head.h>
+#include <asm/asi.h>
+#include <asm/visasm.h>
+#endif
+
+/*
+ * we use the 'XOR function template' to register multiple xor
+ * functions at runtime. The kernel measures their speed upon bootup
+ * and decides which one to use. (compile-time registration is
+ * not enough as certain CPU features like MMX can only be detected
+ * at runtime)
+ *
+ * this architecture makes it pretty easy to add new routines
+ * that are faster on certain CPUs, without killing other CPUs'
+ * 'native' routines. The current routines are believed to be the
+ * physically fastest ones on all CPUs tested, but feel free to
+ * prove me wrong and add yet another routine =B-)
+ * --mingo
+ */
+
+#define MAX_XOR_BLOCKS 5
+
+#define XOR_ARGS (unsigned int count, struct buffer_head **bh_ptr)
+
+typedef void (*xor_block_t) XOR_ARGS;
+xor_block_t xor_block = NULL;
+
+#ifndef __sparc_v9__
+
+struct xor_block_template;
+
+struct xor_block_template {
+	char * name;
+	xor_block_t xor_block;
+	int speed;
+	struct xor_block_template * next;
+};
+
+struct xor_block_template * xor_functions = NULL;
+
+#define XORBLOCK_TEMPLATE(x) \
+static void xor_block_##x XOR_ARGS; \
+static struct xor_block_template t_xor_block_##x = \
+	{ #x, xor_block_##x, 0, NULL }; \
+static void xor_block_##x XOR_ARGS
+
+#ifdef __i386__
+
+#ifdef CONFIG_X86_XMM
+/*
+ * Cache-avoiding checksumming functions utilizing KNI instructions
+ * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo)
+ */
+
+XORBLOCK_TEMPLATE(pIII_kni)
+{
+	char xmm_save[16*4];
+	int cr0;
+	int lines = (bh_ptr[0]->b_size>>8);
+
+	__asm__ __volatile__ (
+		"movl %%cr0,%0 ;\n\t"
+		"clts ;\n\t"
+		"movups %%xmm0,(%1) ;\n\t"
+		"movups %%xmm1,0x10(%1) ;\n\t"
+		"movups %%xmm2,0x20(%1) ;\n\t"
+		"movups %%xmm3,0x30(%1) ;\n\t"
+		: "=r" (cr0)
+		: "r" (xmm_save)
+		: "memory" );
+
+#define OFFS(x) "8*("#x"*2)"
+#define PF0(x) \
+	" prefetcht0 "OFFS(x)"(%1) ;\n"
+#define LD(x,y) \
+	" movaps "OFFS(x)"(%1), %%xmm"#y" ;\n"
+#define ST(x,y) \
+	" movaps %%xmm"#y", "OFFS(x)"(%1) ;\n"
+#define PF1(x) \
+	" prefetchnta "OFFS(x)"(%2) ;\n"
+#define PF2(x) \
+	" prefetchnta "OFFS(x)"(%3) ;\n"
+#define PF3(x) \
+	" prefetchnta "OFFS(x)"(%4) ;\n"
+#define PF4(x) \
+	" prefetchnta "OFFS(x)"(%5) ;\n"
+#define PF5(x) \
+	" prefetchnta "OFFS(x)"(%6) ;\n"
+#define XO1(x,y) \
+	" xorps "OFFS(x)"(%2), %%xmm"#y" ;\n"
+#define XO2(x,y) \
+	" xorps "OFFS(x)"(%3), %%xmm"#y" ;\n"
+#define XO3(x,y) \
+	" xorps "OFFS(x)"(%4), %%xmm"#y" ;\n"
+#define XO4(x,y) \
+	" xorps "OFFS(x)"(%5), %%xmm"#y" ;\n"
+#define XO5(x,y) \
+	" xorps "OFFS(x)"(%6), %%xmm"#y" ;\n"
+
+	switch(count) {
+		case 2:
+
__asm__ __volatile__ ( +#undef BLOCK +#define BLOCK(i) \ + LD(i,0) \ + LD(i+1,1) \ + PF1(i) \ + PF1(i+2) \ + LD(i+2,2) \ + LD(i+3,3) \ + PF0(i+4) \ + PF0(i+6) \ + XO1(i,0) \ + XO1(i+1,1) \ + XO1(i+2,2) \ + XO1(i+3,3) \ + ST(i,0) \ + ST(i+1,1) \ + ST(i+2,2) \ + ST(i+3,3) \ + + + PF0(0) + PF0(2) + + " .align 32,0x90 ;\n" + " 1: ;\n" + + BLOCK(0) + BLOCK(4) + BLOCK(8) + BLOCK(12) + + " addl $256, %1 ;\n" + " addl $256, %2 ;\n" + " decl %0 ;\n" + " jnz 1b ;\n" + + : + : "r" (lines), + "r" (bh_ptr[0]->b_data), + "r" (bh_ptr[1]->b_data) + : "memory" ); + break; + case 3: + __asm__ __volatile__ ( +#undef BLOCK +#define BLOCK(i) \ + PF1(i) \ + PF1(i+2) \ + LD(i,0) \ + LD(i+1,1) \ + LD(i+2,2) \ + LD(i+3,3) \ + PF2(i) \ + PF2(i+2) \ + PF0(i+4) \ + PF0(i+6) \ + XO1(i,0) \ + XO1(i+1,1) \ + XO1(i+2,2) \ + XO1(i+3,3) \ + XO2(i,0) \ + XO2(i+1,1) \ + XO2(i+2,2) \ + XO2(i+3,3) \ + ST(i,0) \ + ST(i+1,1) \ + ST(i+2,2) \ + ST(i+3,3) \ + + + PF0(0) + PF0(2) + + " .align 32,0x90 ;\n" + " 1: ;\n" + + BLOCK(0) + BLOCK(4) + BLOCK(8) + BLOCK(12) + + " addl $256, %1 ;\n" + " addl $256, %2 ;\n" + " addl $256, %3 ;\n" + " decl %0 ;\n" + " jnz 1b ;\n" + : + : "r" (lines), + "r" (bh_ptr[0]->b_data), + "r" (bh_ptr[1]->b_data), + "r" (bh_ptr[2]->b_data) + : "memory" ); + break; + case 4: + __asm__ __volatile__ ( +#undef BLOCK +#define BLOCK(i) \ + PF1(i) \ + PF1(i+2) \ + LD(i,0) \ + LD(i+1,1) \ + LD(i+2,2) \ + LD(i+3,3) \ + PF2(i) \ + PF2(i+2) \ + XO1(i,0) \ + XO1(i+1,1) \ + XO1(i+2,2) \ + XO1(i+3,3) \ + PF3(i) \ + PF3(i+2) \ + PF0(i+4) \ + PF0(i+6) \ + XO2(i,0) \ + XO2(i+1,1) \ + XO2(i+2,2) \ + XO2(i+3,3) \ + XO3(i,0) \ + XO3(i+1,1) \ + XO3(i+2,2) \ + XO3(i+3,3) \ + ST(i,0) \ + ST(i+1,1) \ + ST(i+2,2) \ + ST(i+3,3) \ + + + PF0(0) + PF0(2) + + " .align 32,0x90 ;\n" + " 1: ;\n" + + BLOCK(0) + BLOCK(4) + BLOCK(8) + BLOCK(12) + + " addl $256, %1 ;\n" + " addl $256, %2 ;\n" + " addl $256, %3 ;\n" + " addl $256, %4 ;\n" + " decl %0 ;\n" + " jnz 1b ;\n" + + : + : "r" (lines), + "r" (bh_ptr[0]->b_data), + "r" (bh_ptr[1]->b_data), + "r" (bh_ptr[2]->b_data), + "r" (bh_ptr[3]->b_data) + : "memory" ); + break; + case 5: + __asm__ __volatile__ ( +#undef BLOCK +#define BLOCK(i) \ + PF1(i) \ + PF1(i+2) \ + LD(i,0) \ + LD(i+1,1) \ + LD(i+2,2) \ + LD(i+3,3) \ + PF2(i) \ + PF2(i+2) \ + XO1(i,0) \ + XO1(i+1,1) \ + XO1(i+2,2) \ + XO1(i+3,3) \ + PF3(i) \ + PF3(i+2) \ + XO2(i,0) \ + XO2(i+1,1) \ + XO2(i+2,2) \ + XO2(i+3,3) \ + PF4(i) \ + PF4(i+2) \ + PF0(i+4) \ + PF0(i+6) \ + XO3(i,0) \ + XO3(i+1,1) \ + XO3(i+2,2) \ + XO3(i+3,3) \ + XO4(i,0) \ + XO4(i+1,1) \ + XO4(i+2,2) \ + XO4(i+3,3) \ + ST(i,0) \ + ST(i+1,1) \ + ST(i+2,2) \ + ST(i+3,3) \ + + + PF0(0) + PF0(2) + + " .align 32,0x90 ;\n" + " 1: ;\n" + + BLOCK(0) + BLOCK(4) + BLOCK(8) + BLOCK(12) + + " addl $256, %1 ;\n" + " addl $256, %2 ;\n" + " addl $256, %3 ;\n" + " addl $256, %4 ;\n" + " addl $256, %5 ;\n" + " decl %0 ;\n" + " jnz 1b ;\n" + + : + : "r" (lines), + "r" (bh_ptr[0]->b_data), + "r" (bh_ptr[1]->b_data), + "r" (bh_ptr[2]->b_data), + "r" (bh_ptr[3]->b_data), + "r" (bh_ptr[4]->b_data) + : "memory"); + break; + } + + __asm__ __volatile__ ( + "sfence ;\n\t" + "movups (%1),%%xmm0 ;\n\t" + "movups 0x10(%1),%%xmm1 ;\n\t" + "movups 0x20(%1),%%xmm2 ;\n\t" + "movups 0x30(%1),%%xmm3 ;\n\t" + "movl %0,%%cr0 ;\n\t" + : + : "r" (cr0), "r" (xmm_save) + : "memory" ); +} + +#undef OFFS +#undef LD +#undef ST +#undef PF0 +#undef PF1 +#undef PF2 +#undef PF3 +#undef PF4 +#undef PF5 +#undef XO1 +#undef XO2 +#undef XO3 +#undef XO4 +#undef XO5 +#undef BLOCK + +#endif /* CONFIG_X86_XMM */ + +/* + * 
high-speed RAID5 checksumming functions utilizing MMX instructions + * Copyright (C) 1998 Ingo Molnar + */ +XORBLOCK_TEMPLATE(pII_mmx) +{ + char fpu_save[108]; + int lines = (bh_ptr[0]->b_size>>7); + + if (!(current->flags & PF_USEDFPU)) + __asm__ __volatile__ ( " clts;\n"); + + __asm__ __volatile__ ( " fsave %0; fwait\n"::"m"(fpu_save[0]) ); + +#define LD(x,y) \ + " movq 8*("#x")(%1), %%mm"#y" ;\n" +#define ST(x,y) \ + " movq %%mm"#y", 8*("#x")(%1) ;\n" +#define XO1(x,y) \ + " pxor 8*("#x")(%2), %%mm"#y" ;\n" +#define XO2(x,y) \ + " pxor 8*("#x")(%3), %%mm"#y" ;\n" +#define XO3(x,y) \ + " pxor 8*("#x")(%4), %%mm"#y" ;\n" +#define XO4(x,y) \ + " pxor 8*("#x")(%5), %%mm"#y" ;\n" + + switch(count) { + case 2: + __asm__ __volatile__ ( +#undef BLOCK +#define BLOCK(i) \ + LD(i,0) \ + LD(i+1,1) \ + LD(i+2,2) \ + LD(i+3,3) \ + XO1(i,0) \ + ST(i,0) \ + XO1(i+1,1) \ + ST(i+1,1) \ + XO1(i+2,2) \ + ST(i+2,2) \ + XO1(i+3,3) \ + ST(i+3,3) + + " .align 32,0x90 ;\n" + " 1: ;\n" + + BLOCK(0) + BLOCK(4) + BLOCK(8) + BLOCK(12) + + " addl $128, %1 ;\n" + " addl $128, %2 ;\n" + " decl %0 ;\n" + " jnz 1b ;\n" + : + : "r" (lines), + "r" (bh_ptr[0]->b_data), + "r" (bh_ptr[1]->b_data) + : "memory"); + break; + case 3: + __asm__ __volatile__ ( +#undef BLOCK +#define BLOCK(i) \ + LD(i,0) \ + LD(i+1,1) \ + LD(i+2,2) \ + LD(i+3,3) \ + XO1(i,0) \ + XO1(i+1,1) \ + XO1(i+2,2) \ + XO1(i+3,3) \ + XO2(i,0) \ + ST(i,0) \ + XO2(i+1,1) \ + ST(i+1,1) \ + XO2(i+2,2) \ + ST(i+2,2) \ + XO2(i+3,3) \ + ST(i+3,3) + + " .align 32,0x90 ;\n" + " 1: ;\n" + + BLOCK(0) + BLOCK(4) + BLOCK(8) + BLOCK(12) + + " addl $128, %1 ;\n" + " addl $128, %2 ;\n" + " addl $128, %3 ;\n" + " decl %0 ;\n" + " jnz 1b ;\n" + : + : "r" (lines), + "r" (bh_ptr[0]->b_data), + "r" (bh_ptr[1]->b_data), + "r" (bh_ptr[2]->b_data) + : "memory"); + break; + case 4: + __asm__ __volatile__ ( +#undef BLOCK +#define BLOCK(i) \ + LD(i,0) \ + LD(i+1,1) \ + LD(i+2,2) \ + LD(i+3,3) \ + XO1(i,0) \ + XO1(i+1,1) \ + XO1(i+2,2) \ + XO1(i+3,3) \ + XO2(i,0) \ + XO2(i+1,1) \ + XO2(i+2,2) \ + XO2(i+3,3) \ + XO3(i,0) \ + ST(i,0) \ + XO3(i+1,1) \ + ST(i+1,1) \ + XO3(i+2,2) \ + ST(i+2,2) \ + XO3(i+3,3) \ + ST(i+3,3) + + " .align 32,0x90 ;\n" + " 1: ;\n" + + BLOCK(0) + BLOCK(4) + BLOCK(8) + BLOCK(12) + + " addl $128, %1 ;\n" + " addl $128, %2 ;\n" + " addl $128, %3 ;\n" + " addl $128, %4 ;\n" + " decl %0 ;\n" + " jnz 1b ;\n" + : + : "r" (lines), + "r" (bh_ptr[0]->b_data), + "r" (bh_ptr[1]->b_data), + "r" (bh_ptr[2]->b_data), + "r" (bh_ptr[3]->b_data) + : "memory"); + break; + case 5: + __asm__ __volatile__ ( +#undef BLOCK +#define BLOCK(i) \ + LD(i,0) \ + LD(i+1,1) \ + LD(i+2,2) \ + LD(i+3,3) \ + XO1(i,0) \ + XO1(i+1,1) \ + XO1(i+2,2) \ + XO1(i+3,3) \ + XO2(i,0) \ + XO2(i+1,1) \ + XO2(i+2,2) \ + XO2(i+3,3) \ + XO3(i,0) \ + XO3(i+1,1) \ + XO3(i+2,2) \ + XO3(i+3,3) \ + XO4(i,0) \ + ST(i,0) \ + XO4(i+1,1) \ + ST(i+1,1) \ + XO4(i+2,2) \ + ST(i+2,2) \ + XO4(i+3,3) \ + ST(i+3,3) + + " .align 32,0x90 ;\n" + " 1: ;\n" + + BLOCK(0) + BLOCK(4) + BLOCK(8) + BLOCK(12) + + " addl $128, %1 ;\n" + " addl $128, %2 ;\n" + " addl $128, %3 ;\n" + " addl $128, %4 ;\n" + " addl $128, %5 ;\n" + " decl %0 ;\n" + " jnz 1b ;\n" + : + : "g" (lines), + "r" (bh_ptr[0]->b_data), + "r" (bh_ptr[1]->b_data), + "r" (bh_ptr[2]->b_data), + "r" (bh_ptr[3]->b_data), + "r" (bh_ptr[4]->b_data) + : "memory"); + break; + } + + __asm__ __volatile__ ( " frstor %0;\n"::"m"(fpu_save[0]) ); + + if (!(current->flags & PF_USEDFPU)) + stts(); +} + +#undef LD +#undef XO1 +#undef XO2 +#undef XO3 +#undef XO4 +#undef ST +#undef BLOCK + 
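All the hand-scheduled variants in this file implement one contract: bh_ptr[0] is both a source and the destination, and the remaining count - 1 buffers are XORed into it, a cache line at a time. A minimal portable sketch of that contract, using plain byte arrays instead of buffer_head (illustrative names, not part of the patch):

/* Reference semantics of every xor_block variant (sketch; assumes
 * equal-sized buffers).  buf[0] is the destination and first source;
 * buffers 1..count-1 are XORed into it. */
static void xor_block_reference(unsigned int count, unsigned char **buf,
				unsigned long size)
{
	unsigned int i;
	unsigned long j;

	for (i = 1; i < count; i++)
		for (j = 0; j < size; j++)
			buf[0][j] ^= buf[i][j];
}

The asm routines are unrolled, prefetching versions of exactly this double loop; the differing b_size shifts reflect the bytes consumed per inner iteration (pIII_kni: 256, hence >>8; pII_mmx: 128, hence >>7; p5_mmx: 64, hence >>6).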
+XORBLOCK_TEMPLATE(p5_mmx) +{ + char fpu_save[108]; + int lines = (bh_ptr[0]->b_size>>6); + + if (!(current->flags & PF_USEDFPU)) + __asm__ __volatile__ ( " clts;\n"); + + __asm__ __volatile__ ( " fsave %0; fwait\n"::"m"(fpu_save[0]) ); + + switch(count) { + case 2: + __asm__ __volatile__ ( + + " .align 32,0x90 ;\n" + " 1: ;\n" + " movq (%1), %%mm0 ;\n" + " movq 8(%1), %%mm1 ;\n" + " pxor (%2), %%mm0 ;\n" + " movq 16(%1), %%mm2 ;\n" + " movq %%mm0, (%1) ;\n" + " pxor 8(%2), %%mm1 ;\n" + " movq 24(%1), %%mm3 ;\n" + " movq %%mm1, 8(%1) ;\n" + " pxor 16(%2), %%mm2 ;\n" + " movq 32(%1), %%mm4 ;\n" + " movq %%mm2, 16(%1) ;\n" + " pxor 24(%2), %%mm3 ;\n" + " movq 40(%1), %%mm5 ;\n" + " movq %%mm3, 24(%1) ;\n" + " pxor 32(%2), %%mm4 ;\n" + " movq 48(%1), %%mm6 ;\n" + " movq %%mm4, 32(%1) ;\n" + " pxor 40(%2), %%mm5 ;\n" + " movq 56(%1), %%mm7 ;\n" + " movq %%mm5, 40(%1) ;\n" + " pxor 48(%2), %%mm6 ;\n" + " pxor 56(%2), %%mm7 ;\n" + " movq %%mm6, 48(%1) ;\n" + " movq %%mm7, 56(%1) ;\n" + + " addl $64, %1 ;\n" + " addl $64, %2 ;\n" + " decl %0 ;\n" + " jnz 1b ;\n" + + : + : "r" (lines), + "r" (bh_ptr[0]->b_data), + "r" (bh_ptr[1]->b_data) + : "memory" ); + break; + case 3: + __asm__ __volatile__ ( + + " .align 32,0x90 ;\n" + " 1: ;\n" + " movq (%1), %%mm0 ;\n" + " movq 8(%1), %%mm1 ;\n" + " pxor (%2), %%mm0 ;\n" + " movq 16(%1), %%mm2 ;\n" + " pxor 8(%2), %%mm1 ;\n" + " pxor (%3), %%mm0 ;\n" + " pxor 16(%2), %%mm2 ;\n" + " movq %%mm0, (%1) ;\n" + " pxor 8(%3), %%mm1 ;\n" + " pxor 16(%3), %%mm2 ;\n" + " movq 24(%1), %%mm3 ;\n" + " movq %%mm1, 8(%1) ;\n" + " movq 32(%1), %%mm4 ;\n" + " movq 40(%1), %%mm5 ;\n" + " pxor 24(%2), %%mm3 ;\n" + " movq %%mm2, 16(%1) ;\n" + " pxor 32(%2), %%mm4 ;\n" + " pxor 24(%3), %%mm3 ;\n" + " pxor 40(%2), %%mm5 ;\n" + " movq %%mm3, 24(%1) ;\n" + " pxor 32(%3), %%mm4 ;\n" + " pxor 40(%3), %%mm5 ;\n" + " movq 48(%1), %%mm6 ;\n" + " movq %%mm4, 32(%1) ;\n" + " movq 56(%1), %%mm7 ;\n" + " pxor 48(%2), %%mm6 ;\n" + " movq %%mm5, 40(%1) ;\n" + " pxor 56(%2), %%mm7 ;\n" + " pxor 48(%3), %%mm6 ;\n" + " pxor 56(%3), %%mm7 ;\n" + " movq %%mm6, 48(%1) ;\n" + " movq %%mm7, 56(%1) ;\n" + + " addl $64, %1 ;\n" + " addl $64, %2 ;\n" + " addl $64, %3 ;\n" + " decl %0 ;\n" + " jnz 1b ;\n" + + : + : "r" (lines), + "r" (bh_ptr[0]->b_data), + "r" (bh_ptr[1]->b_data), + "r" (bh_ptr[2]->b_data) + : "memory" ); + break; + case 4: + __asm__ __volatile__ ( + + " .align 32,0x90 ;\n" + " 1: ;\n" + " movq (%1), %%mm0 ;\n" + " movq 8(%1), %%mm1 ;\n" + " pxor (%2), %%mm0 ;\n" + " movq 16(%1), %%mm2 ;\n" + " pxor 8(%2), %%mm1 ;\n" + " pxor (%3), %%mm0 ;\n" + " pxor 16(%2), %%mm2 ;\n" + " pxor 8(%3), %%mm1 ;\n" + " pxor (%4), %%mm0 ;\n" + " movq 24(%1), %%mm3 ;\n" + " pxor 16(%3), %%mm2 ;\n" + " pxor 8(%4), %%mm1 ;\n" + " movq %%mm0, (%1) ;\n" + " movq 32(%1), %%mm4 ;\n" + " pxor 24(%2), %%mm3 ;\n" + " pxor 16(%4), %%mm2 ;\n" + " movq %%mm1, 8(%1) ;\n" + " movq 40(%1), %%mm5 ;\n" + " pxor 32(%2), %%mm4 ;\n" + " pxor 24(%3), %%mm3 ;\n" + " movq %%mm2, 16(%1) ;\n" + " pxor 40(%2), %%mm5 ;\n" + " pxor 32(%3), %%mm4 ;\n" + " pxor 24(%4), %%mm3 ;\n" + " movq %%mm3, 24(%1) ;\n" + " movq 56(%1), %%mm7 ;\n" + " movq 48(%1), %%mm6 ;\n" + " pxor 40(%3), %%mm5 ;\n" + " pxor 32(%4), %%mm4 ;\n" + " pxor 48(%2), %%mm6 ;\n" + " movq %%mm4, 32(%1) ;\n" + " pxor 56(%2), %%mm7 ;\n" + " pxor 40(%4), %%mm5 ;\n" + " pxor 48(%3), %%mm6 ;\n" + " pxor 56(%3), %%mm7 ;\n" + " movq %%mm5, 40(%1) ;\n" + " pxor 48(%4), %%mm6 ;\n" + " pxor 56(%4), %%mm7 ;\n" + " movq %%mm6, 48(%1) ;\n" + " movq %%mm7, 56(%1) ;\n" + + " addl $64, 
%1 ;\n" + " addl $64, %2 ;\n" + " addl $64, %3 ;\n" + " addl $64, %4 ;\n" + " decl %0 ;\n" + " jnz 1b ;\n" + + : + : "r" (lines), + "r" (bh_ptr[0]->b_data), + "r" (bh_ptr[1]->b_data), + "r" (bh_ptr[2]->b_data), + "r" (bh_ptr[3]->b_data) + : "memory" ); + break; + case 5: + __asm__ __volatile__ ( + + " .align 32,0x90 ;\n" + " 1: ;\n" + " movq (%1), %%mm0 ;\n" + " movq 8(%1), %%mm1 ;\n" + " pxor (%2), %%mm0 ;\n" + " pxor 8(%2), %%mm1 ;\n" + " movq 16(%1), %%mm2 ;\n" + " pxor (%3), %%mm0 ;\n" + " pxor 8(%3), %%mm1 ;\n" + " pxor 16(%2), %%mm2 ;\n" + " pxor (%4), %%mm0 ;\n" + " pxor 8(%4), %%mm1 ;\n" + " pxor 16(%3), %%mm2 ;\n" + " movq 24(%1), %%mm3 ;\n" + " pxor (%5), %%mm0 ;\n" + " pxor 8(%5), %%mm1 ;\n" + " movq %%mm0, (%1) ;\n" + " pxor 16(%4), %%mm2 ;\n" + " pxor 24(%2), %%mm3 ;\n" + " movq %%mm1, 8(%1) ;\n" + " pxor 16(%5), %%mm2 ;\n" + " pxor 24(%3), %%mm3 ;\n" + " movq 32(%1), %%mm4 ;\n" + " movq %%mm2, 16(%1) ;\n" + " pxor 24(%4), %%mm3 ;\n" + " pxor 32(%2), %%mm4 ;\n" + " movq 40(%1), %%mm5 ;\n" + " pxor 24(%5), %%mm3 ;\n" + " pxor 32(%3), %%mm4 ;\n" + " pxor 40(%2), %%mm5 ;\n" + " movq %%mm3, 24(%1) ;\n" + " pxor 32(%4), %%mm4 ;\n" + " pxor 40(%3), %%mm5 ;\n" + " movq 48(%1), %%mm6 ;\n" + " movq 56(%1), %%mm7 ;\n" + " pxor 32(%5), %%mm4 ;\n" + " pxor 40(%4), %%mm5 ;\n" + " pxor 48(%2), %%mm6 ;\n" + " pxor 56(%2), %%mm7 ;\n" + " movq %%mm4, 32(%1) ;\n" + " pxor 48(%3), %%mm6 ;\n" + " pxor 56(%3), %%mm7 ;\n" + " pxor 40(%5), %%mm5 ;\n" + " pxor 48(%4), %%mm6 ;\n" + " pxor 56(%4), %%mm7 ;\n" + " movq %%mm5, 40(%1) ;\n" + " pxor 48(%5), %%mm6 ;\n" + " pxor 56(%5), %%mm7 ;\n" + " movq %%mm6, 48(%1) ;\n" + " movq %%mm7, 56(%1) ;\n" + + " addl $64, %1 ;\n" + " addl $64, %2 ;\n" + " addl $64, %3 ;\n" + " addl $64, %4 ;\n" + " addl $64, %5 ;\n" + " decl %0 ;\n" + " jnz 1b ;\n" + + : + : "g" (lines), + "r" (bh_ptr[0]->b_data), + "r" (bh_ptr[1]->b_data), + "r" (bh_ptr[2]->b_data), + "r" (bh_ptr[3]->b_data), + "r" (bh_ptr[4]->b_data) + : "memory" ); + break; + } + + __asm__ __volatile__ ( " frstor %0;\n"::"m"(fpu_save[0]) ); + + if (!(current->flags & PF_USEDFPU)) + stts(); +} +#endif /* __i386__ */ +#endif /* !__sparc_v9__ */ + +#ifdef __sparc_v9__ +/* + * High speed xor_block operation for RAID4/5 utilizing the + * UltraSparc Visual Instruction Set. + * + * Copyright (C) 1997, 1999 Jakub Jelinek (jj@ultra.linux.cz) + * + * Requirements: + * !(((long)dest | (long)sourceN) & (64 - 1)) && + * !(len & 127) && len >= 256 + * + * It is done in pure assembly, as otherwise gcc makes it + * a non-leaf function, which is not what we want. + * Also, we don't measure the speeds as on other architectures, + * as the measuring routine does not take into account cold caches + * and the fact that xor_block_VIS bypasses the caches. + * xor_block_32regs might be 5% faster for count 2 if caches are hot + * and things just right (for count 3 VIS is about as fast as 32regs for + * hot caches and for count 4 and 5 VIS is faster by good margin always), + * but I think it is better not to pollute the caches. + * Actually, if I'd just fight for speed for hot caches, I could + * write a hybrid VIS/integer routine, which would do always two + * 64B blocks in VIS and two in IEUs, but I really care more about + * caches. 
+ */ +extern void *VISenter(void); +extern void xor_block_VIS XOR_ARGS; + +void __xor_block_VIS(void) +{ +__asm__ (" + .globl xor_block_VIS +xor_block_VIS: + ldx [%%o1 + 0], %%o4 + ldx [%%o1 + 8], %%o3 + ldx [%%o4 + %1], %%g5 + ldx [%%o4 + %0], %%o4 + ldx [%%o3 + %0], %%o3 + rd %%fprs, %%o5 + andcc %%o5, %2, %%g0 + be,pt %%icc, 297f + sethi %%hi(%5), %%g1 + jmpl %%g1 + %%lo(%5), %%g7 + add %%g7, 8, %%g7 +297: wr %%g0, %4, %%fprs + membar #LoadStore|#StoreLoad|#StoreStore + sub %%g5, 64, %%g5 + ldda [%%o4] %3, %%f0 + ldda [%%o3] %3, %%f16 + cmp %%o0, 4 + bgeu,pt %%xcc, 10f + cmp %%o0, 3 + be,pn %%xcc, 13f + mov -64, %%g1 + sub %%g5, 64, %%g5 + rd %%asi, %%g1 + wr %%g0, %3, %%asi + +2: ldda [%%o4 + 64] %%asi, %%f32 + fxor %%f0, %%f16, %%f16 + fxor %%f2, %%f18, %%f18 + fxor %%f4, %%f20, %%f20 + fxor %%f6, %%f22, %%f22 + fxor %%f8, %%f24, %%f24 + fxor %%f10, %%f26, %%f26 + fxor %%f12, %%f28, %%f28 + fxor %%f14, %%f30, %%f30 + stda %%f16, [%%o4] %3 + ldda [%%o3 + 64] %%asi, %%f48 + ldda [%%o4 + 128] %%asi, %%f0 + fxor %%f32, %%f48, %%f48 + fxor %%f34, %%f50, %%f50 + add %%o4, 128, %%o4 + fxor %%f36, %%f52, %%f52 + add %%o3, 128, %%o3 + fxor %%f38, %%f54, %%f54 + subcc %%g5, 128, %%g5 + fxor %%f40, %%f56, %%f56 + fxor %%f42, %%f58, %%f58 + fxor %%f44, %%f60, %%f60 + fxor %%f46, %%f62, %%f62 + stda %%f48, [%%o4 - 64] %%asi + bne,pt %%xcc, 2b + ldda [%%o3] %3, %%f16 + + ldda [%%o4 + 64] %%asi, %%f32 + fxor %%f0, %%f16, %%f16 + fxor %%f2, %%f18, %%f18 + fxor %%f4, %%f20, %%f20 + fxor %%f6, %%f22, %%f22 + fxor %%f8, %%f24, %%f24 + fxor %%f10, %%f26, %%f26 + fxor %%f12, %%f28, %%f28 + fxor %%f14, %%f30, %%f30 + stda %%f16, [%%o4] %3 + ldda [%%o3 + 64] %%asi, %%f48 + membar #Sync + fxor %%f32, %%f48, %%f48 + fxor %%f34, %%f50, %%f50 + fxor %%f36, %%f52, %%f52 + fxor %%f38, %%f54, %%f54 + fxor %%f40, %%f56, %%f56 + fxor %%f42, %%f58, %%f58 + fxor %%f44, %%f60, %%f60 + fxor %%f46, %%f62, %%f62 + stda %%f48, [%%o4 + 64] %%asi + membar #Sync|#StoreStore|#StoreLoad + wr %%g0, 0, %%fprs + retl + wr %%g1, %%g0, %%asi + +13: ldx [%%o1 + 16], %%o2 + ldx [%%o2 + %0], %%o2 + +3: ldda [%%o2] %3, %%f32 + fxor %%f0, %%f16, %%f48 + fxor %%f2, %%f18, %%f50 + add %%o4, 64, %%o4 + fxor %%f4, %%f20, %%f52 + fxor %%f6, %%f22, %%f54 + add %%o3, 64, %%o3 + fxor %%f8, %%f24, %%f56 + fxor %%f10, %%f26, %%f58 + fxor %%f12, %%f28, %%f60 + fxor %%f14, %%f30, %%f62 + ldda [%%o4] %3, %%f0 + fxor %%f48, %%f32, %%f48 + fxor %%f50, %%f34, %%f50 + fxor %%f52, %%f36, %%f52 + fxor %%f54, %%f38, %%f54 + add %%o2, 64, %%o2 + fxor %%f56, %%f40, %%f56 + fxor %%f58, %%f42, %%f58 + subcc %%g5, 64, %%g5 + fxor %%f60, %%f44, %%f60 + fxor %%f62, %%f46, %%f62 + stda %%f48, [%%o4 + %%g1] %3 + bne,pt %%xcc, 3b + ldda [%%o3] %3, %%f16 + + ldda [%%o2] %3, %%f32 + fxor %%f0, %%f16, %%f48 + fxor %%f2, %%f18, %%f50 + fxor %%f4, %%f20, %%f52 + fxor %%f6, %%f22, %%f54 + fxor %%f8, %%f24, %%f56 + fxor %%f10, %%f26, %%f58 + fxor %%f12, %%f28, %%f60 + fxor %%f14, %%f30, %%f62 + membar #Sync + fxor %%f48, %%f32, %%f48 + fxor %%f50, %%f34, %%f50 + fxor %%f52, %%f36, %%f52 + fxor %%f54, %%f38, %%f54 + fxor %%f56, %%f40, %%f56 + fxor %%f58, %%f42, %%f58 + fxor %%f60, %%f44, %%f60 + fxor %%f62, %%f46, %%f62 + stda %%f48, [%%o4] %3 + membar #Sync|#StoreStore|#StoreLoad + retl + wr %%g0, 0, %%fprs + +10: cmp %%o0, 5 + be,pt %%xcc, 15f + mov -64, %%g1 + +14: ldx [%%o1 + 16], %%o2 + ldx [%%o1 + 24], %%o0 + ldx [%%o2 + %0], %%o2 + ldx [%%o0 + %0], %%o0 + +4: ldda [%%o2] %3, %%f32 + fxor %%f0, %%f16, %%f16 + fxor %%f2, %%f18, %%f18 + add %%o4, 64, %%o4 + fxor %%f4, 
%%f20, %%f20 + fxor %%f6, %%f22, %%f22 + add %%o3, 64, %%o3 + fxor %%f8, %%f24, %%f24 + fxor %%f10, %%f26, %%f26 + fxor %%f12, %%f28, %%f28 + fxor %%f14, %%f30, %%f30 + ldda [%%o0] %3, %%f48 + fxor %%f16, %%f32, %%f32 + fxor %%f18, %%f34, %%f34 + fxor %%f20, %%f36, %%f36 + fxor %%f22, %%f38, %%f38 + add %%o2, 64, %%o2 + fxor %%f24, %%f40, %%f40 + fxor %%f26, %%f42, %%f42 + fxor %%f28, %%f44, %%f44 + fxor %%f30, %%f46, %%f46 + ldda [%%o4] %3, %%f0 + fxor %%f32, %%f48, %%f48 + fxor %%f34, %%f50, %%f50 + fxor %%f36, %%f52, %%f52 + add %%o0, 64, %%o0 + fxor %%f38, %%f54, %%f54 + fxor %%f40, %%f56, %%f56 + fxor %%f42, %%f58, %%f58 + subcc %%g5, 64, %%g5 + fxor %%f44, %%f60, %%f60 + fxor %%f46, %%f62, %%f62 + stda %%f48, [%%o4 + %%g1] %3 + bne,pt %%xcc, 4b + ldda [%%o3] %3, %%f16 + + ldda [%%o2] %3, %%f32 + fxor %%f0, %%f16, %%f16 + fxor %%f2, %%f18, %%f18 + fxor %%f4, %%f20, %%f20 + fxor %%f6, %%f22, %%f22 + fxor %%f8, %%f24, %%f24 + fxor %%f10, %%f26, %%f26 + fxor %%f12, %%f28, %%f28 + fxor %%f14, %%f30, %%f30 + ldda [%%o0] %3, %%f48 + fxor %%f16, %%f32, %%f32 + fxor %%f18, %%f34, %%f34 + fxor %%f20, %%f36, %%f36 + fxor %%f22, %%f38, %%f38 + fxor %%f24, %%f40, %%f40 + fxor %%f26, %%f42, %%f42 + fxor %%f28, %%f44, %%f44 + fxor %%f30, %%f46, %%f46 + membar #Sync + fxor %%f32, %%f48, %%f48 + fxor %%f34, %%f50, %%f50 + fxor %%f36, %%f52, %%f52 + fxor %%f38, %%f54, %%f54 + fxor %%f40, %%f56, %%f56 + fxor %%f42, %%f58, %%f58 + fxor %%f44, %%f60, %%f60 + fxor %%f46, %%f62, %%f62 + stda %%f48, [%%o4] %3 + membar #Sync|#StoreStore|#StoreLoad + retl + wr %%g0, 0, %%fprs + +15: ldx [%%o1 + 16], %%o2 + ldx [%%o1 + 24], %%o0 + ldx [%%o1 + 32], %%o1 + ldx [%%o2 + %0], %%o2 + ldx [%%o0 + %0], %%o0 + ldx [%%o1 + %0], %%o1 + +5: ldda [%%o2] %3, %%f32 + fxor %%f0, %%f16, %%f48 + fxor %%f2, %%f18, %%f50 + add %%o4, 64, %%o4 + fxor %%f4, %%f20, %%f52 + fxor %%f6, %%f22, %%f54 + add %%o3, 64, %%o3 + fxor %%f8, %%f24, %%f56 + fxor %%f10, %%f26, %%f58 + fxor %%f12, %%f28, %%f60 + fxor %%f14, %%f30, %%f62 + ldda [%%o0] %3, %%f16 + fxor %%f48, %%f32, %%f48 + fxor %%f50, %%f34, %%f50 + fxor %%f52, %%f36, %%f52 + fxor %%f54, %%f38, %%f54 + add %%o2, 64, %%o2 + fxor %%f56, %%f40, %%f56 + fxor %%f58, %%f42, %%f58 + fxor %%f60, %%f44, %%f60 + fxor %%f62, %%f46, %%f62 + ldda [%%o1] %3, %%f32 + fxor %%f48, %%f16, %%f48 + fxor %%f50, %%f18, %%f50 + add %%o0, 64, %%o0 + fxor %%f52, %%f20, %%f52 + fxor %%f54, %%f22, %%f54 + add %%o1, 64, %%o1 + fxor %%f56, %%f24, %%f56 + fxor %%f58, %%f26, %%f58 + fxor %%f60, %%f28, %%f60 + fxor %%f62, %%f30, %%f62 + ldda [%%o4] %3, %%f0 + fxor %%f48, %%f32, %%f48 + fxor %%f50, %%f34, %%f50 + fxor %%f52, %%f36, %%f52 + fxor %%f54, %%f38, %%f54 + fxor %%f56, %%f40, %%f56 + fxor %%f58, %%f42, %%f58 + subcc %%g5, 64, %%g5 + fxor %%f60, %%f44, %%f60 + fxor %%f62, %%f46, %%f62 + stda %%f48, [%%o4 + %%g1] %3 + bne,pt %%xcc, 5b + ldda [%%o3] %3, %%f16 + + ldda [%%o2] %3, %%f32 + fxor %%f0, %%f16, %%f48 + fxor %%f2, %%f18, %%f50 + fxor %%f4, %%f20, %%f52 + fxor %%f6, %%f22, %%f54 + fxor %%f8, %%f24, %%f56 + fxor %%f10, %%f26, %%f58 + fxor %%f12, %%f28, %%f60 + fxor %%f14, %%f30, %%f62 + ldda [%%o0] %3, %%f16 + fxor %%f48, %%f32, %%f48 + fxor %%f50, %%f34, %%f50 + fxor %%f52, %%f36, %%f52 + fxor %%f54, %%f38, %%f54 + fxor %%f56, %%f40, %%f56 + fxor %%f58, %%f42, %%f58 + fxor %%f60, %%f44, %%f60 + fxor %%f62, %%f46, %%f62 + ldda [%%o1] %3, %%f32 + fxor %%f48, %%f16, %%f48 + fxor %%f50, %%f18, %%f50 + fxor %%f52, %%f20, %%f52 + fxor %%f54, %%f22, %%f54 + fxor %%f56, %%f24, %%f56 + fxor %%f58, %%f26, %%f58 
+	fxor %%f60, %%f28, %%f60
+	fxor %%f62, %%f30, %%f62
+	membar #Sync
+	fxor %%f48, %%f32, %%f48
+	fxor %%f50, %%f34, %%f50
+	fxor %%f52, %%f36, %%f52
+	fxor %%f54, %%f38, %%f54
+	fxor %%f56, %%f40, %%f56
+	fxor %%f58, %%f42, %%f58
+	fxor %%f60, %%f44, %%f60
+	fxor %%f62, %%f46, %%f62
+	stda %%f48, [%%o4] %3
+	membar #Sync|#StoreStore|#StoreLoad
+	retl
+	 wr %%g0, 0, %%fprs
+	" : :
+	"i" (&((struct buffer_head *)0)->b_data),
+	"i" (&((struct buffer_head *)0)->b_size),
+	"i" (FPRS_FEF|FPRS_DU), "i" (ASI_BLK_P),
+	"i" (FPRS_FEF), "i" (VISenter));
+}
+#endif /* __sparc_v9__ */
+
+#if defined(__sparc__) && !defined(__sparc_v9__)
+/*
+ * High speed xor_block operation for RAID4/5 utilizing the
+ * ldd/std SPARC instructions.
+ *
+ * Copyright (C) 1999 Jakub Jelinek (jj@ultra.linux.cz)
+ *
+ */
+
+XORBLOCK_TEMPLATE(SPARC)
+{
+	int size = bh_ptr[0]->b_size;
+	int lines = size / (sizeof (long)) / 8, i;
+	long *destp = (long *) bh_ptr[0]->b_data;
+	long *source1 = (long *) bh_ptr[1]->b_data;
+	long *source2, *source3, *source4;
+
+	switch (count) {
+	case 2:
+		for (i = lines; i > 0; i--) {
+			__asm__ __volatile__("
+			ldd [%0 + 0x00], %%g2
+			ldd [%0 + 0x08], %%g4
+			ldd [%0 + 0x10], %%o0
+			ldd [%0 + 0x18], %%o2
+			ldd [%1 + 0x00], %%o4
+			ldd [%1 + 0x08], %%l0
+			ldd [%1 + 0x10], %%l2
+			ldd [%1 + 0x18], %%l4
+			xor %%g2, %%o4, %%g2
+			xor %%g3, %%o5, %%g3
+			xor %%g4, %%l0, %%g4
+			xor %%g5, %%l1, %%g5
+			xor %%o0, %%l2, %%o0
+			xor %%o1, %%l3, %%o1
+			xor %%o2, %%l4, %%o2
+			xor %%o3, %%l5, %%o3
+			std %%g2, [%0 + 0x00]
+			std %%g4, [%0 + 0x08]
+			std %%o0, [%0 + 0x10]
+			std %%o2, [%0 + 0x18]
+			" : : "r" (destp), "r" (source1) : "g2", "g3", "g4", "g5", "o0",
+			"o1", "o2", "o3", "o4", "o5", "l0", "l1", "l2", "l3", "l4", "l5");
+			destp += 8;
+			source1 += 8;
+		}
+		break;
+	case 3:
+		source2 = (long *) bh_ptr[2]->b_data;
+		for (i = lines; i > 0; i--) {
+			__asm__ __volatile__("
+			ldd [%0 + 0x00], %%g2
+			ldd [%0 + 0x08], %%g4
+			ldd [%0 + 0x10], %%o0
+			ldd [%0 + 0x18], %%o2
+			ldd [%1 + 0x00], %%o4
+			ldd [%1 + 0x08], %%l0
+			ldd [%1 + 0x10], %%l2
+			ldd [%1 + 0x18], %%l4
+			xor %%g2, %%o4, %%g2
+			xor %%g3, %%o5, %%g3
+			ldd [%2 + 0x00], %%o4
+			xor %%g4, %%l0, %%g4
+			xor %%g5, %%l1, %%g5
+			ldd [%2 + 0x08], %%l0
+			xor %%o0, %%l2, %%o0
+			xor %%o1, %%l3, %%o1
+			ldd [%2 + 0x10], %%l2
+			xor %%o2, %%l4, %%o2
+			xor %%o3, %%l5, %%o3
+			ldd [%2 + 0x18], %%l4
+			xor %%g2, %%o4, %%g2
+			xor %%g3, %%o5, %%g3
+			xor %%g4, %%l0, %%g4
+			xor %%g5, %%l1, %%g5
+			xor %%o0, %%l2, %%o0
+			xor %%o1, %%l3, %%o1
+			xor %%o2, %%l4, %%o2
+			xor %%o3, %%l5, %%o3
+			std %%g2, [%0 + 0x00]
+			std %%g4, [%0 + 0x08]
+			std %%o0, [%0 + 0x10]
+			std %%o2, [%0 + 0x18]
+			" : : "r" (destp), "r" (source1), "r" (source2)
+			: "g2", "g3", "g4", "g5", "o0", "o1", "o2", "o3", "o4", "o5",
+			"l0", "l1", "l2", "l3", "l4", "l5");
+			destp += 8;
+			source1 += 8;
+			source2 += 8;
+		}
+		break;
+	case 4:
+		source2 = (long *) bh_ptr[2]->b_data;
+		source3 = (long *) bh_ptr[3]->b_data;
+		for (i = lines; i > 0; i--) {
+			__asm__ __volatile__("
+			ldd [%0 + 0x00], %%g2
+			ldd [%0 + 0x08], %%g4
+			ldd [%0 + 0x10], %%o0
+			ldd [%0 + 0x18], %%o2
+			ldd [%1 + 0x00], %%o4
+			ldd [%1 + 0x08], %%l0
+			ldd [%1 + 0x10], %%l2
+			ldd [%1 + 0x18], %%l4
+			xor %%g2, %%o4, %%g2
+			xor %%g3, %%o5, %%g3
+			ldd [%2 + 0x00], %%o4
+			xor %%g4, %%l0, %%g4
+			xor %%g5, %%l1, %%g5
+			ldd [%2 + 0x08], %%l0
+			xor %%o0, %%l2, %%o0
+			xor %%o1, %%l3, %%o1
+			ldd [%2 + 0x10], %%l2
+			xor %%o2, %%l4, %%o2
+			xor %%o3, %%l5, %%o3
+			ldd [%2 + 0x18], %%l4
+			xor %%g2, %%o4, %%g2
+			xor %%g3, %%o5, %%g3
+			ldd [%3 + 0x00], %%o4
+			xor %%g4, %%l0, %%g4
+ xor %%g5, %%l1, %%g5 + ldd [%3 + 0x08], %%l0 + xor %%o0, %%l2, %%o0 + xor %%o1, %%l3, %%o1 + ldd [%3 + 0x10], %%l2 + xor %%o2, %%l4, %%o2 + xor %%o3, %%l5, %%o3 + ldd [%3 + 0x18], %%l4 + xor %%g2, %%o4, %%g2 + xor %%g3, %%o5, %%g3 + xor %%g4, %%l0, %%g4 + xor %%g5, %%l1, %%g5 + xor %%o0, %%l2, %%o0 + xor %%o1, %%l3, %%o1 + xor %%o2, %%l4, %%o2 + xor %%o3, %%l5, %%o3 + std %%g2, [%0 + 0x00] + std %%g4, [%0 + 0x08] + std %%o0, [%0 + 0x10] + std %%o2, [%0 + 0x18] + " : : "r" (destp), "r" (source1), "r" (source2), "r" (source3) + : "g2", "g3", "g4", "g5", "o0", "o1", "o2", "o3", "o4", "o5", + "l0", "l1", "l2", "l3", "l4", "l5"); + destp += 8; + source1 += 8; + source2 += 8; + source3 += 8; + } + break; + case 5: + source2 = (long *) bh_ptr[2]->b_data; + source3 = (long *) bh_ptr[3]->b_data; + source4 = (long *) bh_ptr[4]->b_data; + for (i = lines; i > 0; i--) { + __asm__ __volatile__(" + ldd [%0 + 0x00], %%g2 + ldd [%0 + 0x08], %%g4 + ldd [%0 + 0x10], %%o0 + ldd [%0 + 0x18], %%o2 + ldd [%1 + 0x00], %%o4 + ldd [%1 + 0x08], %%l0 + ldd [%1 + 0x10], %%l2 + ldd [%1 + 0x18], %%l4 + xor %%g2, %%o4, %%g2 + xor %%g3, %%o5, %%g3 + ldd [%2 + 0x00], %%o4 + xor %%g4, %%l0, %%g4 + xor %%g5, %%l1, %%g5 + ldd [%2 + 0x08], %%l0 + xor %%o0, %%l2, %%o0 + xor %%o1, %%l3, %%o1 + ldd [%2 + 0x10], %%l2 + xor %%o2, %%l4, %%o2 + xor %%o3, %%l5, %%o3 + ldd [%2 + 0x18], %%l4 + xor %%g2, %%o4, %%g2 + xor %%g3, %%o5, %%g3 + ldd [%3 + 0x00], %%o4 + xor %%g4, %%l0, %%g4 + xor %%g5, %%l1, %%g5 + ldd [%3 + 0x08], %%l0 + xor %%o0, %%l2, %%o0 + xor %%o1, %%l3, %%o1 + ldd [%3 + 0x10], %%l2 + xor %%o2, %%l4, %%o2 + xor %%o3, %%l5, %%o3 + ldd [%3 + 0x18], %%l4 + xor %%g2, %%o4, %%g2 + xor %%g3, %%o5, %%g3 + ldd [%4 + 0x00], %%o4 + xor %%g4, %%l0, %%g4 + xor %%g5, %%l1, %%g5 + ldd [%4 + 0x08], %%l0 + xor %%o0, %%l2, %%o0 + xor %%o1, %%l3, %%o1 + ldd [%4 + 0x10], %%l2 + xor %%o2, %%l4, %%o2 + xor %%o3, %%l5, %%o3 + ldd [%4 + 0x18], %%l4 + xor %%g2, %%o4, %%g2 + xor %%g3, %%o5, %%g3 + xor %%g4, %%l0, %%g4 + xor %%g5, %%l1, %%g5 + xor %%o0, %%l2, %%o0 + xor %%o1, %%l3, %%o1 + xor %%o2, %%l4, %%o2 + xor %%o3, %%l5, %%o3 + std %%g2, [%0 + 0x00] + std %%g4, [%0 + 0x08] + std %%o0, [%0 + 0x10] + std %%o2, [%0 + 0x18] + " : : "r" (destp), "r" (source1), "r" (source2), "r" (source3), "r" (source4) + : "g2", "g3", "g4", "g5", "o0", "o1", "o2", "o3", "o4", "o5", + "l0", "l1", "l2", "l3", "l4", "l5"); + destp += 8; + source1 += 8; + source2 += 8; + source3 += 8; + source4 += 8; + } + break; + } +} +#endif /* __sparc_v[78]__ */ + +#ifndef __sparc_v9__ + +/* + * this one works reasonably on any x86 CPU + * (send me an assembly version for inclusion if you can make it faster) + * + * this one is just as fast as written in pure assembly on x86. + * the reason for this separate version is that the + * fast open-coded xor routine "32reg" produces suboptimal code + * on x86, due to lack of registers. 
+ */ +XORBLOCK_TEMPLATE(8regs) +{ + int len = bh_ptr[0]->b_size; + long *destp = (long *) bh_ptr[0]->b_data; + long *source1, *source2, *source3, *source4; + long lines = len / (sizeof (long)) / 8, i; + + switch(count) { + case 2: + source1 = (long *) bh_ptr[1]->b_data; + for (i = lines; i > 0; i--) { + *(destp + 0) ^= *(source1 + 0); + *(destp + 1) ^= *(source1 + 1); + *(destp + 2) ^= *(source1 + 2); + *(destp + 3) ^= *(source1 + 3); + *(destp + 4) ^= *(source1 + 4); + *(destp + 5) ^= *(source1 + 5); + *(destp + 6) ^= *(source1 + 6); + *(destp + 7) ^= *(source1 + 7); + source1 += 8; + destp += 8; + } + break; + case 3: + source2 = (long *) bh_ptr[2]->b_data; + source1 = (long *) bh_ptr[1]->b_data; + for (i = lines; i > 0; i--) { + *(destp + 0) ^= *(source1 + 0); + *(destp + 0) ^= *(source2 + 0); + *(destp + 1) ^= *(source1 + 1); + *(destp + 1) ^= *(source2 + 1); + *(destp + 2) ^= *(source1 + 2); + *(destp + 2) ^= *(source2 + 2); + *(destp + 3) ^= *(source1 + 3); + *(destp + 3) ^= *(source2 + 3); + *(destp + 4) ^= *(source1 + 4); + *(destp + 4) ^= *(source2 + 4); + *(destp + 5) ^= *(source1 + 5); + *(destp + 5) ^= *(source2 + 5); + *(destp + 6) ^= *(source1 + 6); + *(destp + 6) ^= *(source2 + 6); + *(destp + 7) ^= *(source1 + 7); + *(destp + 7) ^= *(source2 + 7); + source1 += 8; + source2 += 8; + destp += 8; + } + break; + case 4: + source3 = (long *) bh_ptr[3]->b_data; + source2 = (long *) bh_ptr[2]->b_data; + source1 = (long *) bh_ptr[1]->b_data; + for (i = lines; i > 0; i--) { + *(destp + 0) ^= *(source1 + 0); + *(destp + 0) ^= *(source2 + 0); + *(destp + 0) ^= *(source3 + 0); + *(destp + 1) ^= *(source1 + 1); + *(destp + 1) ^= *(source2 + 1); + *(destp + 1) ^= *(source3 + 1); + *(destp + 2) ^= *(source1 + 2); + *(destp + 2) ^= *(source2 + 2); + *(destp + 2) ^= *(source3 + 2); + *(destp + 3) ^= *(source1 + 3); + *(destp + 3) ^= *(source2 + 3); + *(destp + 3) ^= *(source3 + 3); + *(destp + 4) ^= *(source1 + 4); + *(destp + 4) ^= *(source2 + 4); + *(destp + 4) ^= *(source3 + 4); + *(destp + 5) ^= *(source1 + 5); + *(destp + 5) ^= *(source2 + 5); + *(destp + 5) ^= *(source3 + 5); + *(destp + 6) ^= *(source1 + 6); + *(destp + 6) ^= *(source2 + 6); + *(destp + 6) ^= *(source3 + 6); + *(destp + 7) ^= *(source1 + 7); + *(destp + 7) ^= *(source2 + 7); + *(destp + 7) ^= *(source3 + 7); + source1 += 8; + source2 += 8; + source3 += 8; + destp += 8; + } + break; + case 5: + source4 = (long *) bh_ptr[4]->b_data; + source3 = (long *) bh_ptr[3]->b_data; + source2 = (long *) bh_ptr[2]->b_data; + source1 = (long *) bh_ptr[1]->b_data; + for (i = lines; i > 0; i--) { + *(destp + 0) ^= *(source1 + 0); + *(destp + 0) ^= *(source2 + 0); + *(destp + 0) ^= *(source3 + 0); + *(destp + 0) ^= *(source4 + 0); + *(destp + 1) ^= *(source1 + 1); + *(destp + 1) ^= *(source2 + 1); + *(destp + 1) ^= *(source3 + 1); + *(destp + 1) ^= *(source4 + 1); + *(destp + 2) ^= *(source1 + 2); + *(destp + 2) ^= *(source2 + 2); + *(destp + 2) ^= *(source3 + 2); + *(destp + 2) ^= *(source4 + 2); + *(destp + 3) ^= *(source1 + 3); + *(destp + 3) ^= *(source2 + 3); + *(destp + 3) ^= *(source3 + 3); + *(destp + 3) ^= *(source4 + 3); + *(destp + 4) ^= *(source1 + 4); + *(destp + 4) ^= *(source2 + 4); + *(destp + 4) ^= *(source3 + 4); + *(destp + 4) ^= *(source4 + 4); + *(destp + 5) ^= *(source1 + 5); + *(destp + 5) ^= *(source2 + 5); + *(destp + 5) ^= *(source3 + 5); + *(destp + 5) ^= *(source4 + 5); + *(destp + 6) ^= *(source1 + 6); + *(destp + 6) ^= *(source2 + 6); + *(destp + 6) ^= *(source3 + 6); + *(destp + 6) ^= *(source4 + 6); + 
*(destp + 7) ^= *(source1 + 7);
+				*(destp + 7) ^= *(source2 + 7);
+				*(destp + 7) ^= *(source3 + 7);
+				*(destp + 7) ^= *(source4 + 7);
+				source1 += 8;
+				source2 += 8;
+				source3 += 8;
+				source4 += 8;
+				destp += 8;
+			}
+			break;
+	}
+}
+
+/*
+ * platform-independent RAID5 checksum calculation, this should
+ * be very fast on any platform that has a decent amount of
+ * registers. (32 or more)
+ */
+XORBLOCK_TEMPLATE(32regs)
+{
+	int size = bh_ptr[0]->b_size;
+	int lines = size / (sizeof (long)) / 8, i;
+	long *destp = (long *) bh_ptr[0]->b_data;
+	long *source1, *source2, *source3, *source4;
+
+	/* LOTS of registers available...
+	   We do explicit loop-unrolling here for code which
+	   favours RISC machines. In fact this is almost direct
+	   RISC assembly on Alpha and SPARC :-) */
+
+	switch(count) {
+		case 2:
+			source1 = (long *) bh_ptr[1]->b_data;
+			for (i = lines; i > 0; i--) {
+				register long d0, d1, d2, d3, d4, d5, d6, d7;
+				d0 = destp[0];	/* Pull the stuff into registers */
+				d1 = destp[1];	/*  ... in bursts, if possible.  */
+				d2 = destp[2];
+				d3 = destp[3];
+				d4 = destp[4];
+				d5 = destp[5];
+				d6 = destp[6];
+				d7 = destp[7];
+				d0 ^= source1[0];
+				d1 ^= source1[1];
+				d2 ^= source1[2];
+				d3 ^= source1[3];
+				d4 ^= source1[4];
+				d5 ^= source1[5];
+				d6 ^= source1[6];
+				d7 ^= source1[7];
+				destp[0] = d0;	/* Store the result (in bursts) */
+				destp[1] = d1;
+				destp[2] = d2;
+				destp[3] = d3;
+				destp[4] = d4;	/* Store the result (in bursts) */
+				destp[5] = d5;
+				destp[6] = d6;
+				destp[7] = d7;
+				source1 += 8;
+				destp += 8;
+			}
+			break;
+		case 3:
+			source2 = (long *) bh_ptr[2]->b_data;
+			source1 = (long *) bh_ptr[1]->b_data;
+			for (i = lines; i > 0; i--) {
+				register long d0, d1, d2, d3, d4, d5, d6, d7;
+				d0 = destp[0];	/* Pull the stuff into registers */
+				d1 = destp[1];	/*  ... in bursts, if possible.  */
+				d2 = destp[2];
+				d3 = destp[3];
+				d4 = destp[4];
+				d5 = destp[5];
+				d6 = destp[6];
+				d7 = destp[7];
+				d0 ^= source1[0];
+				d1 ^= source1[1];
+				d2 ^= source1[2];
+				d3 ^= source1[3];
+				d4 ^= source1[4];
+				d5 ^= source1[5];
+				d6 ^= source1[6];
+				d7 ^= source1[7];
+				d0 ^= source2[0];
+				d1 ^= source2[1];
+				d2 ^= source2[2];
+				d3 ^= source2[3];
+				d4 ^= source2[4];
+				d5 ^= source2[5];
+				d6 ^= source2[6];
+				d7 ^= source2[7];
+				destp[0] = d0;	/* Store the result (in bursts) */
+				destp[1] = d1;
+				destp[2] = d2;
+				destp[3] = d3;
+				destp[4] = d4;	/* Store the result (in bursts) */
+				destp[5] = d5;
+				destp[6] = d6;
+				destp[7] = d7;
+				source1 += 8;
+				source2 += 8;
+				destp += 8;
+			}
+			break;
+		case 4:
+			source3 = (long *) bh_ptr[3]->b_data;
+			source2 = (long *) bh_ptr[2]->b_data;
+			source1 = (long *) bh_ptr[1]->b_data;
+			for (i = lines; i > 0; i--) {
+				register long d0, d1, d2, d3, d4, d5, d6, d7;
+				d0 = destp[0];	/* Pull the stuff into registers */
+				d1 = destp[1];	/*  ... in bursts, if possible.  */
+				d2 = destp[2];
+				d3 = destp[3];
+				d4 = destp[4];
+				d5 = destp[5];
+				d6 = destp[6];
+				d7 = destp[7];
+				d0 ^= source1[0];
+				d1 ^= source1[1];
+				d2 ^= source1[2];
+				d3 ^= source1[3];
+				d4 ^= source1[4];
+				d5 ^= source1[5];
+				d6 ^= source1[6];
+				d7 ^= source1[7];
+				d0 ^= source2[0];
+				d1 ^= source2[1];
+				d2 ^= source2[2];
+				d3 ^= source2[3];
+				d4 ^= source2[4];
+				d5 ^= source2[5];
+				d6 ^= source2[6];
+				d7 ^= source2[7];
+				d0 ^= source3[0];
+				d1 ^= source3[1];
+				d2 ^= source3[2];
+				d3 ^= source3[3];
+				d4 ^= source3[4];
+				d5 ^= source3[5];
+				d6 ^= source3[6];
+				d7 ^= source3[7];
+				destp[0] = d0;	/* Store the result (in bursts) */
+				destp[1] = d1;
+				destp[2] = d2;
+				destp[3] = d3;
+				destp[4] = d4;	/* Store the result (in bursts) */
+				destp[5] = d5;
+				destp[6] = d6;
+				destp[7] = d7;
+				source1 += 8;
+				source2 += 8;
+				source3 += 8;
+				destp += 8;
+			}
+			break;
+		case 5:
+			source4 = (long *) bh_ptr[4]->b_data;
+			source3 = (long *) bh_ptr[3]->b_data;
+			source2 = (long *) bh_ptr[2]->b_data;
+			source1 = (long *) bh_ptr[1]->b_data;
+			for (i = lines; i > 0; i--) {
+				register long d0, d1, d2, d3, d4, d5, d6, d7;
+				d0 = destp[0];	/* Pull the stuff into registers */
+				d1 = destp[1];	/*  ... in bursts, if possible.  */
+				d2 = destp[2];
+				d3 = destp[3];
+				d4 = destp[4];
+				d5 = destp[5];
+				d6 = destp[6];
+				d7 = destp[7];
+				d0 ^= source1[0];
+				d1 ^= source1[1];
+				d2 ^= source1[2];
+				d3 ^= source1[3];
+				d4 ^= source1[4];
+				d5 ^= source1[5];
+				d6 ^= source1[6];
+				d7 ^= source1[7];
+				d0 ^= source2[0];
+				d1 ^= source2[1];
+				d2 ^= source2[2];
+				d3 ^= source2[3];
+				d4 ^= source2[4];
+				d5 ^= source2[5];
+				d6 ^= source2[6];
+				d7 ^= source2[7];
+				d0 ^= source3[0];
+				d1 ^= source3[1];
+				d2 ^= source3[2];
+				d3 ^= source3[3];
+				d4 ^= source3[4];
+				d5 ^= source3[5];
+				d6 ^= source3[6];
+				d7 ^= source3[7];
+				d0 ^= source4[0];
+				d1 ^= source4[1];
+				d2 ^= source4[2];
+				d3 ^= source4[3];
+				d4 ^= source4[4];
+				d5 ^= source4[5];
+				d6 ^= source4[6];
+				d7 ^= source4[7];
+				destp[0] = d0;	/* Store the result (in bursts) */
+				destp[1] = d1;
+				destp[2] = d2;
+				destp[3] = d3;
+				destp[4] = d4;	/* Store the result (in bursts) */
+				destp[5] = d5;
+				destp[6] = d6;
+				destp[7] = d7;
+				source1 += 8;
+				source2 += 8;
+				source3 += 8;
+				source4 += 8;
+				destp += 8;
+			}
+			break;
+	}
+}
+
+/*
+ * (the -6*32 shift factor colors the cache)
+ */
+#define SIZE (PAGE_SIZE-6*32)
+
+static void xor_speed ( struct xor_block_template * func,
+	struct buffer_head *b1, struct buffer_head *b2)
+{
+	int speed;
+	unsigned long now;
+	int i, count, max;
+	struct buffer_head *bh_ptr[6];
+
+	func->next = xor_functions;
+	xor_functions = func;
+	bh_ptr[0] = b1;
+	bh_ptr[1] = b2;
+
+	/*
+	 * count the number of XORs done during a whole jiffy.
+	 * calculate the speed of checksumming from this.
+	 * (we use a 2-page allocation to have guaranteed
+	 * color L1-cache layout)
+	 */
+	max = 0;
+	for (i = 0; i < 5; i++) {
+		now = jiffies;
+		count = 0;
+		while (jiffies == now) {
+			mb();
+			func->xor_block(2,bh_ptr);
+			mb();
+			count++;
+			mb();
+		}
+		if (count > max)
+			max = count;
+	}
+
+	speed = max * (HZ*SIZE/1024);
+	func->speed = speed;
+
+	printk( " %-10s: %5d.%03d MB/sec\n", func->name,
+		speed / 1000, speed % 1000);
+}
+
+static inline void pick_fastest_function(void)
+{
+	struct xor_block_template *f, *fastest;
+
+	fastest = xor_functions;
+	for (f = fastest; f; f = f->next) {
+		if (f->speed > fastest->speed)
+			fastest = f;
+	}
+#ifdef CONFIG_X86_XMM
+	if (boot_cpu_data.mmu_cr4_features & X86_CR4_OSXMMEXCPT) {
+		fastest = &t_xor_block_pIII_kni;
+	}
+#endif
+	xor_block = fastest->xor_block;
+	printk( "using fastest function: %s (%d.%03d MB/sec)\n", fastest->name,
+		fastest->speed / 1000, fastest->speed % 1000);
+}
+
+static struct buffer_head b1, b2;
+
+void calibrate_xor_block(void)
+{
+	memset(&b1,0,sizeof(b1));
+	b2 = b1;
+
+	b1.b_data = (char *) md__get_free_pages(GFP_KERNEL,2);
+	if (!b1.b_data) {
+		pick_fastest_function();
+		return;
+	}
+	b2.b_data = b1.b_data + 2*PAGE_SIZE + SIZE;
+
+	b1.b_size = SIZE;
+
+	printk(KERN_INFO "raid5: measuring checksumming speed\n");
+
+	sti(); /* should be safe */
+
+#if defined(__sparc__) && !defined(__sparc_v9__)
+	printk(KERN_INFO "raid5: trying high-speed SPARC checksum routine\n");
+	xor_speed(&t_xor_block_SPARC,&b1,&b2);
+#endif
+
+#ifdef CONFIG_X86_XMM
+	if (boot_cpu_data.mmu_cr4_features & X86_CR4_OSXMMEXCPT) {
+		printk(KERN_INFO
+			"raid5: KNI detected, trying cache-avoiding KNI checksum routine\n");
+		/* we force the use of the KNI xor block because it
+		   can write around L2.  we may also be able
+		   to load into the L1 only depending on how
+		   the CPU deals with a load to a line that is
+		   being prefetched.
+		*/
+		xor_speed(&t_xor_block_pIII_kni,&b1,&b2);
+	}
+#endif /* CONFIG_X86_XMM */
+
+#ifdef __i386__
+
+	if (md_cpu_has_mmx()) {
+		printk(KERN_INFO
+			"raid5: MMX detected, trying high-speed MMX checksum routines\n");
+		xor_speed(&t_xor_block_pII_mmx,&b1,&b2);
+		xor_speed(&t_xor_block_p5_mmx,&b1,&b2);
+	}
+
+#endif /* __i386__ */
+
+
+	xor_speed(&t_xor_block_8regs,&b1,&b2);
+	xor_speed(&t_xor_block_32regs,&b1,&b2);
+
+	free_pages((unsigned long)b1.b_data,2);
+	pick_fastest_function();
+}
+
+#else /* __sparc_v9__ */
+
+void calibrate_xor_block(void)
+{
+	printk(KERN_INFO "raid5: using high-speed VIS checksum routine\n");
+	xor_block = xor_block_VIS;
+}
+
+#endif /* __sparc_v9__ */
+
+MD_EXPORT_SYMBOL(xor_block);
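The figure printed by xor_speed() is derived as: best number of whole-buffer XOR passes in one jiffy, times HZ jiffies per second, times SIZE bytes per pass, divided by 1024. A worked example of the integer arithmetic, assuming typical i386 values (HZ = 100 and PAGE_SIZE = 4096, so SIZE = 3904); the pass count is made up for illustration:

#include <stdio.h>

int main(void)
{
	int hz = 100;			/* assumed i386 HZ value          */
	int size = 4096 - 6 * 32;	/* SIZE = PAGE_SIZE - 6*32 = 3904 */
	int max = 1000;			/* assumed best passes per jiffy  */
	int speed = max * (hz * size / 1024);	/* 1000 * 381 = 381000  */

	/* same format as the printk in xor_speed(); note the unit is
	 * KB/sec printed with a decimal point, so "MB/sec" here really
	 * means thousands of KB/sec */
	printf("%5d.%03d MB/sec\n", speed / 1000, speed % 1000);
	return 0;
}

This prints "  381.000 MB/sec": hz * size / 1024 truncates 381.25 to 381 KB per jiffy-pass, and multiplying by the pass count gives KB/sec.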
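The calibration pattern itself, register each candidate in a list, count iterations inside a fixed timing window, then scan for the best, is independent of the kernel environment. A self-contained model of the same scheme, with illustrative names and clock() standing in for jiffies:

#include <stdio.h>
#include <time.h>

struct xor_template {
	const char *name;
	void (*fn)(void);
	long speed;			/* iterations per timing window */
	struct xor_template *next;
};

static struct xor_template *templates;	/* list head, like xor_functions */

/* Register a candidate and count how often it runs in ~1/100 s,
 * mirroring xor_speed()'s jiffy-bounded loop. */
static void measure(struct xor_template *t)
{
	clock_t end = clock() + CLOCKS_PER_SEC / 100;
	long count = 0;

	t->next = templates;
	templates = t;
	while (clock() < end) {
		t->fn();
		count++;
	}
	t->speed = count;
}

/* Scan the measured list for the best result, as
 * pick_fastest_function() does. */
static struct xor_template *pick_fastest(void)
{
	struct xor_template *t, *best = templates;

	for (t = templates; t; t = t->next)
		if (t->speed > best->speed)
			best = t;
	return best;
}

static void candidate_a(void) { }	/* stand-ins for real routines */
static void candidate_b(void) { }

int main(void)
{
	static struct xor_template a = { "a", candidate_a, 0, NULL };
	static struct xor_template b = { "b", candidate_b, 0, NULL };

	measure(&a);
	measure(&b);
	printf("using fastest function: %s\n", pick_fastest()->name);
	return 0;
}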