From b63ad0882a16a5d28003e57f2b0b81dee3fb322b Mon Sep 17 00:00:00 2001 From: Ralf Baechle Date: Tue, 28 Nov 2000 03:58:46 +0000 Subject: Merge with 2.4.0-test11. --- drivers/md/xor.c | 2721 ++---------------------------------------------------- 1 file changed, 68 insertions(+), 2653 deletions(-) (limited to 'drivers/md/xor.c') diff --git a/drivers/md/xor.c b/drivers/md/xor.c index 4fe04fb89..f58463ebc 100644 --- a/drivers/md/xor.c +++ b/drivers/md/xor.c @@ -1,10 +1,10 @@ /* * xor.c : Multiple Devices driver for Linux * - * Copyright (C) 1996, 1997, 1998, 1999 Ingo Molnar, Matti Aarnio, Jakub Jelinek + * Copyright (C) 1996, 1997, 1998, 1999, 2000, + * Ingo Molnar, Matti Aarnio, Jakub Jelinek, Richard Henderson. * - * - * optimized RAID-5 checksumming functions. + * Dispatch optimized RAID-5 checksumming functions. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -15,2584 +15,66 @@ * (for example /usr/src/linux/COPYING); if not, write to the Free * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ + #include <linux/module.h> #define BH_TRACE 0 #include <linux/sched.h> #include <linux/raid/md.h> -#ifdef __sparc_v9__ -#include <asm/head.h> -#include <asm/asi.h> -#include <asm/visasm.h> -#endif - -/* - * we use the 'XOR function template' to register multiple xor - * functions runtime. The kernel measures their speed upon bootup - * and decides which one to use. (compile-time registration is - * not enough as certain CPU features like MMX can only be detected - * runtime) - * - * this architecture makes it pretty easy to add new routines - * that are faster on certain CPUs, without killing other CPU's - * 'native' routine. Although the current routines are belived - * to be the physically fastest ones on all CPUs tested, but - * feel free to prove me wrong and add yet another routine =B-) - * --mingo - */ - -#define MAX_XOR_BLOCKS 5 - -#define XOR_ARGS (unsigned int count, struct buffer_head **bh_ptr) - -typedef void (*xor_block_t) XOR_ARGS; -xor_block_t xor_block = NULL; - -#ifndef __sparc_v9__ - -struct xor_block_template; - -struct xor_block_template { - char * name; - xor_block_t xor_block; - int speed; - struct xor_block_template * next; -}; - -struct xor_block_template * xor_functions = NULL; - -#define XORBLOCK_TEMPLATE(x) \ -static void xor_block_##x XOR_ARGS; \ -static struct xor_block_template t_xor_block_##x = \ - { #x, xor_block_##x, 0, NULL }; \ -static void xor_block_##x XOR_ARGS - -#ifdef __i386__ - -#ifdef CONFIG_X86_XMM -/* - * Cache avoiding checksumming functions utilizing KNI instructions - * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo) - */ - -XORBLOCK_TEMPLATE(pIII_kni) -{ - char xmm_save[16*4]; - int cr0; - int lines = (bh_ptr[0]->b_size>>8); - - __asm__ __volatile__ ( - "movl %%cr0,%0 ;\n\t" - "clts ;\n\t" - "movups %%xmm0,(%1) ;\n\t" - "movups %%xmm1,0x10(%1) ;\n\t" - "movups %%xmm2,0x20(%1) ;\n\t" - "movups %%xmm3,0x30(%1) ;\n\t" - : "=r" (cr0) - : "r" (xmm_save) - : "memory" ); - -#define OFFS(x) "8*("#x"*2)" -#define PF0(x) \ - " prefetcht0 "OFFS(x)"(%1) ;\n" -#define LD(x,y) \ - " movaps "OFFS(x)"(%1), %%xmm"#y" ;\n" -#define ST(x,y) \ - " movaps %%xmm"#y", "OFFS(x)"(%1) ;\n" -#define PF1(x) \ - " prefetchnta "OFFS(x)"(%2) ;\n" -#define PF2(x) \ - " prefetchnta "OFFS(x)"(%3) ;\n" -#define PF3(x) \ - " prefetchnta "OFFS(x)"(%4) ;\n" -#define PF4(x) \ - " prefetchnta "OFFS(x)"(%5) ;\n" -#define PF5(x) \ - " prefetchnta "OFFS(x)"(%6) ;\n" -#define XO1(x,y) \ - " xorps "OFFS(x)"(%2), %%xmm"#y" ;\n" -#define XO2(x,y) \ - " xorps 
"OFFS(x)"(%3), %%xmm"#y" ;\n" -#define XO3(x,y) \ - " xorps "OFFS(x)"(%4), %%xmm"#y" ;\n" -#define XO4(x,y) \ - " xorps "OFFS(x)"(%5), %%xmm"#y" ;\n" -#define XO5(x,y) \ - " xorps "OFFS(x)"(%6), %%xmm"#y" ;\n" - - switch(count) { - case 2: - __asm__ __volatile__ ( -#undef BLOCK -#define BLOCK(i) \ - LD(i,0) \ - LD(i+1,1) \ - PF1(i) \ - PF1(i+2) \ - LD(i+2,2) \ - LD(i+3,3) \ - PF0(i+4) \ - PF0(i+6) \ - XO1(i,0) \ - XO1(i+1,1) \ - XO1(i+2,2) \ - XO1(i+3,3) \ - ST(i,0) \ - ST(i+1,1) \ - ST(i+2,2) \ - ST(i+3,3) \ - - - PF0(0) - PF0(2) - - " .align 32,0x90 ;\n" - " 1: ;\n" - - BLOCK(0) - BLOCK(4) - BLOCK(8) - BLOCK(12) - - " addl $256, %1 ;\n" - " addl $256, %2 ;\n" - " decl %0 ;\n" - " jnz 1b ;\n" - - : - : "r" (lines), - "r" (bh_ptr[0]->b_data), - "r" (bh_ptr[1]->b_data) - : "memory" ); - break; - case 3: - __asm__ __volatile__ ( -#undef BLOCK -#define BLOCK(i) \ - PF1(i) \ - PF1(i+2) \ - LD(i,0) \ - LD(i+1,1) \ - LD(i+2,2) \ - LD(i+3,3) \ - PF2(i) \ - PF2(i+2) \ - PF0(i+4) \ - PF0(i+6) \ - XO1(i,0) \ - XO1(i+1,1) \ - XO1(i+2,2) \ - XO1(i+3,3) \ - XO2(i,0) \ - XO2(i+1,1) \ - XO2(i+2,2) \ - XO2(i+3,3) \ - ST(i,0) \ - ST(i+1,1) \ - ST(i+2,2) \ - ST(i+3,3) \ - - - PF0(0) - PF0(2) - - " .align 32,0x90 ;\n" - " 1: ;\n" - - BLOCK(0) - BLOCK(4) - BLOCK(8) - BLOCK(12) - - " addl $256, %1 ;\n" - " addl $256, %2 ;\n" - " addl $256, %3 ;\n" - " decl %0 ;\n" - " jnz 1b ;\n" - : - : "r" (lines), - "r" (bh_ptr[0]->b_data), - "r" (bh_ptr[1]->b_data), - "r" (bh_ptr[2]->b_data) - : "memory" ); - break; - case 4: - __asm__ __volatile__ ( -#undef BLOCK -#define BLOCK(i) \ - PF1(i) \ - PF1(i+2) \ - LD(i,0) \ - LD(i+1,1) \ - LD(i+2,2) \ - LD(i+3,3) \ - PF2(i) \ - PF2(i+2) \ - XO1(i,0) \ - XO1(i+1,1) \ - XO1(i+2,2) \ - XO1(i+3,3) \ - PF3(i) \ - PF3(i+2) \ - PF0(i+4) \ - PF0(i+6) \ - XO2(i,0) \ - XO2(i+1,1) \ - XO2(i+2,2) \ - XO2(i+3,3) \ - XO3(i,0) \ - XO3(i+1,1) \ - XO3(i+2,2) \ - XO3(i+3,3) \ - ST(i,0) \ - ST(i+1,1) \ - ST(i+2,2) \ - ST(i+3,3) \ - - - PF0(0) - PF0(2) - - " .align 32,0x90 ;\n" - " 1: ;\n" - - BLOCK(0) - BLOCK(4) - BLOCK(8) - BLOCK(12) +#include +#include - " addl $256, %1 ;\n" - " addl $256, %2 ;\n" - " addl $256, %3 ;\n" - " addl $256, %4 ;\n" - " decl %0 ;\n" - " jnz 1b ;\n" +/* The xor routines to use. 
*/ +static struct xor_block_template *active_template; - : - : "r" (lines), - "r" (bh_ptr[0]->b_data), - "r" (bh_ptr[1]->b_data), - "r" (bh_ptr[2]->b_data), - "r" (bh_ptr[3]->b_data) - : "memory" ); - break; - case 5: - __asm__ __volatile__ ( -#undef BLOCK -#define BLOCK(i) \ - PF1(i) \ - PF1(i+2) \ - LD(i,0) \ - LD(i+1,1) \ - LD(i+2,2) \ - LD(i+3,3) \ - PF2(i) \ - PF2(i+2) \ - XO1(i,0) \ - XO1(i+1,1) \ - XO1(i+2,2) \ - XO1(i+3,3) \ - PF3(i) \ - PF3(i+2) \ - XO2(i,0) \ - XO2(i+1,1) \ - XO2(i+2,2) \ - XO2(i+3,3) \ - PF4(i) \ - PF4(i+2) \ - PF0(i+4) \ - PF0(i+6) \ - XO3(i,0) \ - XO3(i+1,1) \ - XO3(i+2,2) \ - XO3(i+3,3) \ - XO4(i,0) \ - XO4(i+1,1) \ - XO4(i+2,2) \ - XO4(i+3,3) \ - ST(i,0) \ - ST(i+1,1) \ - ST(i+2,2) \ - ST(i+3,3) \ - - - PF0(0) - PF0(2) - - " .align 32,0x90 ;\n" - " 1: ;\n" - - BLOCK(0) - BLOCK(4) - BLOCK(8) - BLOCK(12) - - " addl $256, %1 ;\n" - " addl $256, %2 ;\n" - " addl $256, %3 ;\n" - " addl $256, %4 ;\n" - " addl $256, %5 ;\n" - " decl %0 ;\n" - " jnz 1b ;\n" - - : - : "r" (lines), - "r" (bh_ptr[0]->b_data), - "r" (bh_ptr[1]->b_data), - "r" (bh_ptr[2]->b_data), - "r" (bh_ptr[3]->b_data), - "r" (bh_ptr[4]->b_data) - : "memory"); - break; - } - - __asm__ __volatile__ ( - "sfence ;\n\t" - "movups (%1),%%xmm0 ;\n\t" - "movups 0x10(%1),%%xmm1 ;\n\t" - "movups 0x20(%1),%%xmm2 ;\n\t" - "movups 0x30(%1),%%xmm3 ;\n\t" - "movl %0,%%cr0 ;\n\t" - : - : "r" (cr0), "r" (xmm_save) - : "memory" ); -} - -#undef OFFS -#undef LD -#undef ST -#undef PF0 -#undef PF1 -#undef PF2 -#undef PF3 -#undef PF4 -#undef PF5 -#undef XO1 -#undef XO2 -#undef XO3 -#undef XO4 -#undef XO5 -#undef BLOCK - -#endif /* CONFIG_X86_XMM */ - -/* - * high-speed RAID5 checksumming functions utilizing MMX instructions - * Copyright (C) 1998 Ingo Molnar - */ -XORBLOCK_TEMPLATE(pII_mmx) +void +xor_block(unsigned int count, struct buffer_head **bh_ptr) { - char fpu_save[108]; - int lines = (bh_ptr[0]->b_size>>7); - - if (!(current->flags & PF_USEDFPU)) - __asm__ __volatile__ ( " clts;\n"); - - __asm__ __volatile__ ( " fsave %0; fwait\n"::"m"(fpu_save[0]) ); - -#define LD(x,y) \ - " movq 8*("#x")(%1), %%mm"#y" ;\n" -#define ST(x,y) \ - " movq %%mm"#y", 8*("#x")(%1) ;\n" -#define XO1(x,y) \ - " pxor 8*("#x")(%2), %%mm"#y" ;\n" -#define XO2(x,y) \ - " pxor 8*("#x")(%3), %%mm"#y" ;\n" -#define XO3(x,y) \ - " pxor 8*("#x")(%4), %%mm"#y" ;\n" -#define XO4(x,y) \ - " pxor 8*("#x")(%5), %%mm"#y" ;\n" - - switch(count) { - case 2: - __asm__ __volatile__ ( -#undef BLOCK -#define BLOCK(i) \ - LD(i,0) \ - LD(i+1,1) \ - LD(i+2,2) \ - LD(i+3,3) \ - XO1(i,0) \ - ST(i,0) \ - XO1(i+1,1) \ - ST(i+1,1) \ - XO1(i+2,2) \ - ST(i+2,2) \ - XO1(i+3,3) \ - ST(i+3,3) - - " .align 32,0x90 ;\n" - " 1: ;\n" - - BLOCK(0) - BLOCK(4) - BLOCK(8) - BLOCK(12) + unsigned long *p0, *p1, *p2, *p3, *p4; + unsigned long bytes = bh_ptr[0]->b_size; - " addl $128, %1 ;\n" - " addl $128, %2 ;\n" - " decl %0 ;\n" - " jnz 1b ;\n" - : - : "r" (lines), - "r" (bh_ptr[0]->b_data), - "r" (bh_ptr[1]->b_data) - : "memory"); - break; - case 3: - __asm__ __volatile__ ( -#undef BLOCK -#define BLOCK(i) \ - LD(i,0) \ - LD(i+1,1) \ - LD(i+2,2) \ - LD(i+3,3) \ - XO1(i,0) \ - XO1(i+1,1) \ - XO1(i+2,2) \ - XO1(i+3,3) \ - XO2(i,0) \ - ST(i,0) \ - XO2(i+1,1) \ - ST(i+1,1) \ - XO2(i+2,2) \ - ST(i+2,2) \ - XO2(i+3,3) \ - ST(i+3,3) - - " .align 32,0x90 ;\n" - " 1: ;\n" - - BLOCK(0) - BLOCK(4) - BLOCK(8) - BLOCK(12) - - " addl $128, %1 ;\n" - " addl $128, %2 ;\n" - " addl $128, %3 ;\n" - " decl %0 ;\n" - " jnz 1b ;\n" - : - : "r" (lines), - "r" (bh_ptr[0]->b_data), - "r" 
(bh_ptr[1]->b_data), - "r" (bh_ptr[2]->b_data) - : "memory"); - break; - case 4: - __asm__ __volatile__ ( -#undef BLOCK -#define BLOCK(i) \ - LD(i,0) \ - LD(i+1,1) \ - LD(i+2,2) \ - LD(i+3,3) \ - XO1(i,0) \ - XO1(i+1,1) \ - XO1(i+2,2) \ - XO1(i+3,3) \ - XO2(i,0) \ - XO2(i+1,1) \ - XO2(i+2,2) \ - XO2(i+3,3) \ - XO3(i,0) \ - ST(i,0) \ - XO3(i+1,1) \ - ST(i+1,1) \ - XO3(i+2,2) \ - ST(i+2,2) \ - XO3(i+3,3) \ - ST(i+3,3) - - " .align 32,0x90 ;\n" - " 1: ;\n" - - BLOCK(0) - BLOCK(4) - BLOCK(8) - BLOCK(12) - - " addl $128, %1 ;\n" - " addl $128, %2 ;\n" - " addl $128, %3 ;\n" - " addl $128, %4 ;\n" - " decl %0 ;\n" - " jnz 1b ;\n" - : - : "r" (lines), - "r" (bh_ptr[0]->b_data), - "r" (bh_ptr[1]->b_data), - "r" (bh_ptr[2]->b_data), - "r" (bh_ptr[3]->b_data) - : "memory"); - break; - case 5: - __asm__ __volatile__ ( -#undef BLOCK -#define BLOCK(i) \ - LD(i,0) \ - LD(i+1,1) \ - LD(i+2,2) \ - LD(i+3,3) \ - XO1(i,0) \ - XO1(i+1,1) \ - XO1(i+2,2) \ - XO1(i+3,3) \ - XO2(i,0) \ - XO2(i+1,1) \ - XO2(i+2,2) \ - XO2(i+3,3) \ - XO3(i,0) \ - XO3(i+1,1) \ - XO3(i+2,2) \ - XO3(i+3,3) \ - XO4(i,0) \ - ST(i,0) \ - XO4(i+1,1) \ - ST(i+1,1) \ - XO4(i+2,2) \ - ST(i+2,2) \ - XO4(i+3,3) \ - ST(i+3,3) - - " .align 32,0x90 ;\n" - " 1: ;\n" - - BLOCK(0) - BLOCK(4) - BLOCK(8) - BLOCK(12) - - " addl $128, %1 ;\n" - " addl $128, %2 ;\n" - " addl $128, %3 ;\n" - " addl $128, %4 ;\n" - " addl $128, %5 ;\n" - " decl %0 ;\n" - " jnz 1b ;\n" - : - : "g" (lines), - "r" (bh_ptr[0]->b_data), - "r" (bh_ptr[1]->b_data), - "r" (bh_ptr[2]->b_data), - "r" (bh_ptr[3]->b_data), - "r" (bh_ptr[4]->b_data) - : "memory"); - break; + p0 = (unsigned long *) bh_ptr[0]->b_data; + p1 = (unsigned long *) bh_ptr[1]->b_data; + if (count == 2) { + active_template->do_2(bytes, p0, p1); + return; } - __asm__ __volatile__ ( " frstor %0;\n"::"m"(fpu_save[0]) ); - - if (!(current->flags & PF_USEDFPU)) - stts(); -} - -#undef LD -#undef XO1 -#undef XO2 -#undef XO3 -#undef XO4 -#undef ST -#undef BLOCK - -XORBLOCK_TEMPLATE(p5_mmx) -{ - char fpu_save[108]; - int lines = (bh_ptr[0]->b_size>>6); - - if (!(current->flags & PF_USEDFPU)) - __asm__ __volatile__ ( " clts;\n"); - - __asm__ __volatile__ ( " fsave %0; fwait\n"::"m"(fpu_save[0]) ); - - switch(count) { - case 2: - __asm__ __volatile__ ( - - " .align 32,0x90 ;\n" - " 1: ;\n" - " movq (%1), %%mm0 ;\n" - " movq 8(%1), %%mm1 ;\n" - " pxor (%2), %%mm0 ;\n" - " movq 16(%1), %%mm2 ;\n" - " movq %%mm0, (%1) ;\n" - " pxor 8(%2), %%mm1 ;\n" - " movq 24(%1), %%mm3 ;\n" - " movq %%mm1, 8(%1) ;\n" - " pxor 16(%2), %%mm2 ;\n" - " movq 32(%1), %%mm4 ;\n" - " movq %%mm2, 16(%1) ;\n" - " pxor 24(%2), %%mm3 ;\n" - " movq 40(%1), %%mm5 ;\n" - " movq %%mm3, 24(%1) ;\n" - " pxor 32(%2), %%mm4 ;\n" - " movq 48(%1), %%mm6 ;\n" - " movq %%mm4, 32(%1) ;\n" - " pxor 40(%2), %%mm5 ;\n" - " movq 56(%1), %%mm7 ;\n" - " movq %%mm5, 40(%1) ;\n" - " pxor 48(%2), %%mm6 ;\n" - " pxor 56(%2), %%mm7 ;\n" - " movq %%mm6, 48(%1) ;\n" - " movq %%mm7, 56(%1) ;\n" - - " addl $64, %1 ;\n" - " addl $64, %2 ;\n" - " decl %0 ;\n" - " jnz 1b ;\n" - - : - : "r" (lines), - "r" (bh_ptr[0]->b_data), - "r" (bh_ptr[1]->b_data) - : "memory" ); - break; - case 3: - __asm__ __volatile__ ( - - " .align 32,0x90 ;\n" - " 1: ;\n" - " movq (%1), %%mm0 ;\n" - " movq 8(%1), %%mm1 ;\n" - " pxor (%2), %%mm0 ;\n" - " movq 16(%1), %%mm2 ;\n" - " pxor 8(%2), %%mm1 ;\n" - " pxor (%3), %%mm0 ;\n" - " pxor 16(%2), %%mm2 ;\n" - " movq %%mm0, (%1) ;\n" - " pxor 8(%3), %%mm1 ;\n" - " pxor 16(%3), %%mm2 ;\n" - " movq 24(%1), %%mm3 ;\n" - " movq %%mm1, 8(%1) ;\n" - " movq 32(%1), 
%%mm4 ;\n" - " movq 40(%1), %%mm5 ;\n" - " pxor 24(%2), %%mm3 ;\n" - " movq %%mm2, 16(%1) ;\n" - " pxor 32(%2), %%mm4 ;\n" - " pxor 24(%3), %%mm3 ;\n" - " pxor 40(%2), %%mm5 ;\n" - " movq %%mm3, 24(%1) ;\n" - " pxor 32(%3), %%mm4 ;\n" - " pxor 40(%3), %%mm5 ;\n" - " movq 48(%1), %%mm6 ;\n" - " movq %%mm4, 32(%1) ;\n" - " movq 56(%1), %%mm7 ;\n" - " pxor 48(%2), %%mm6 ;\n" - " movq %%mm5, 40(%1) ;\n" - " pxor 56(%2), %%mm7 ;\n" - " pxor 48(%3), %%mm6 ;\n" - " pxor 56(%3), %%mm7 ;\n" - " movq %%mm6, 48(%1) ;\n" - " movq %%mm7, 56(%1) ;\n" - - " addl $64, %1 ;\n" - " addl $64, %2 ;\n" - " addl $64, %3 ;\n" - " decl %0 ;\n" - " jnz 1b ;\n" - - : - : "r" (lines), - "r" (bh_ptr[0]->b_data), - "r" (bh_ptr[1]->b_data), - "r" (bh_ptr[2]->b_data) - : "memory" ); - break; - case 4: - __asm__ __volatile__ ( - - " .align 32,0x90 ;\n" - " 1: ;\n" - " movq (%1), %%mm0 ;\n" - " movq 8(%1), %%mm1 ;\n" - " pxor (%2), %%mm0 ;\n" - " movq 16(%1), %%mm2 ;\n" - " pxor 8(%2), %%mm1 ;\n" - " pxor (%3), %%mm0 ;\n" - " pxor 16(%2), %%mm2 ;\n" - " pxor 8(%3), %%mm1 ;\n" - " pxor (%4), %%mm0 ;\n" - " movq 24(%1), %%mm3 ;\n" - " pxor 16(%3), %%mm2 ;\n" - " pxor 8(%4), %%mm1 ;\n" - " movq %%mm0, (%1) ;\n" - " movq 32(%1), %%mm4 ;\n" - " pxor 24(%2), %%mm3 ;\n" - " pxor 16(%4), %%mm2 ;\n" - " movq %%mm1, 8(%1) ;\n" - " movq 40(%1), %%mm5 ;\n" - " pxor 32(%2), %%mm4 ;\n" - " pxor 24(%3), %%mm3 ;\n" - " movq %%mm2, 16(%1) ;\n" - " pxor 40(%2), %%mm5 ;\n" - " pxor 32(%3), %%mm4 ;\n" - " pxor 24(%4), %%mm3 ;\n" - " movq %%mm3, 24(%1) ;\n" - " movq 56(%1), %%mm7 ;\n" - " movq 48(%1), %%mm6 ;\n" - " pxor 40(%3), %%mm5 ;\n" - " pxor 32(%4), %%mm4 ;\n" - " pxor 48(%2), %%mm6 ;\n" - " movq %%mm4, 32(%1) ;\n" - " pxor 56(%2), %%mm7 ;\n" - " pxor 40(%4), %%mm5 ;\n" - " pxor 48(%3), %%mm6 ;\n" - " pxor 56(%3), %%mm7 ;\n" - " movq %%mm5, 40(%1) ;\n" - " pxor 48(%4), %%mm6 ;\n" - " pxor 56(%4), %%mm7 ;\n" - " movq %%mm6, 48(%1) ;\n" - " movq %%mm7, 56(%1) ;\n" - - " addl $64, %1 ;\n" - " addl $64, %2 ;\n" - " addl $64, %3 ;\n" - " addl $64, %4 ;\n" - " decl %0 ;\n" - " jnz 1b ;\n" - - : - : "r" (lines), - "r" (bh_ptr[0]->b_data), - "r" (bh_ptr[1]->b_data), - "r" (bh_ptr[2]->b_data), - "r" (bh_ptr[3]->b_data) - : "memory" ); - break; - case 5: - __asm__ __volatile__ ( - - " .align 32,0x90 ;\n" - " 1: ;\n" - " movq (%1), %%mm0 ;\n" - " movq 8(%1), %%mm1 ;\n" - " pxor (%2), %%mm0 ;\n" - " pxor 8(%2), %%mm1 ;\n" - " movq 16(%1), %%mm2 ;\n" - " pxor (%3), %%mm0 ;\n" - " pxor 8(%3), %%mm1 ;\n" - " pxor 16(%2), %%mm2 ;\n" - " pxor (%4), %%mm0 ;\n" - " pxor 8(%4), %%mm1 ;\n" - " pxor 16(%3), %%mm2 ;\n" - " movq 24(%1), %%mm3 ;\n" - " pxor (%5), %%mm0 ;\n" - " pxor 8(%5), %%mm1 ;\n" - " movq %%mm0, (%1) ;\n" - " pxor 16(%4), %%mm2 ;\n" - " pxor 24(%2), %%mm3 ;\n" - " movq %%mm1, 8(%1) ;\n" - " pxor 16(%5), %%mm2 ;\n" - " pxor 24(%3), %%mm3 ;\n" - " movq 32(%1), %%mm4 ;\n" - " movq %%mm2, 16(%1) ;\n" - " pxor 24(%4), %%mm3 ;\n" - " pxor 32(%2), %%mm4 ;\n" - " movq 40(%1), %%mm5 ;\n" - " pxor 24(%5), %%mm3 ;\n" - " pxor 32(%3), %%mm4 ;\n" - " pxor 40(%2), %%mm5 ;\n" - " movq %%mm3, 24(%1) ;\n" - " pxor 32(%4), %%mm4 ;\n" - " pxor 40(%3), %%mm5 ;\n" - " movq 48(%1), %%mm6 ;\n" - " movq 56(%1), %%mm7 ;\n" - " pxor 32(%5), %%mm4 ;\n" - " pxor 40(%4), %%mm5 ;\n" - " pxor 48(%2), %%mm6 ;\n" - " pxor 56(%2), %%mm7 ;\n" - " movq %%mm4, 32(%1) ;\n" - " pxor 48(%3), %%mm6 ;\n" - " pxor 56(%3), %%mm7 ;\n" - " pxor 40(%5), %%mm5 ;\n" - " pxor 48(%4), %%mm6 ;\n" - " pxor 56(%4), %%mm7 ;\n" - " movq %%mm5, 40(%1) ;\n" - " pxor 48(%5), %%mm6 ;\n" - " pxor 
56(%5), %%mm7 ;\n" - " movq %%mm6, 48(%1) ;\n" - " movq %%mm7, 56(%1) ;\n" - - " addl $64, %1 ;\n" - " addl $64, %2 ;\n" - " addl $64, %3 ;\n" - " addl $64, %4 ;\n" - " addl $64, %5 ;\n" - " decl %0 ;\n" - " jnz 1b ;\n" - - : - : "g" (lines), - "r" (bh_ptr[0]->b_data), - "r" (bh_ptr[1]->b_data), - "r" (bh_ptr[2]->b_data), - "r" (bh_ptr[3]->b_data), - "r" (bh_ptr[4]->b_data) - : "memory" ); - break; + p2 = (unsigned long *) bh_ptr[2]->b_data; + if (count == 3) { + active_template->do_3(bytes, p0, p1, p2); + return; } - __asm__ __volatile__ ( " frstor %0;\n"::"m"(fpu_save[0]) ); - - if (!(current->flags & PF_USEDFPU)) - stts(); -} -#endif /* __i386__ */ -#endif /* !__sparc_v9__ */ - -#ifdef __sparc_v9__ -/* - * High speed xor_block operation for RAID4/5 utilizing the - * UltraSparc Visual Instruction Set. - * - * Copyright (C) 1997, 1999 Jakub Jelinek (jj@ultra.linux.cz) - * - * Requirements: - * !(((long)dest | (long)sourceN) & (64 - 1)) && - * !(len & 127) && len >= 256 - * - * It is done in pure assembly, as otherwise gcc makes it - * a non-leaf function, which is not what we want. - * Also, we don't measure the speeds as on other architectures, - * as the measuring routine does not take into account cold caches - * and the fact that xor_block_VIS bypasses the caches. - * xor_block_32regs might be 5% faster for count 2 if caches are hot - * and things just right (for count 3 VIS is about as fast as 32regs for - * hot caches and for count 4 and 5 VIS is faster by good margin always), - * but I think it is better not to pollute the caches. - * Actually, if I'd just fight for speed for hot caches, I could - * write a hybrid VIS/integer routine, which would do always two - * 64B blocks in VIS and two in IEUs, but I really care more about - * caches. - */ -extern void *VISenter(void); -extern void xor_block_VIS XOR_ARGS; - -void __xor_block_VIS(void) -{ -__asm__ (" - .globl xor_block_VIS -xor_block_VIS: - ldx [%%o1 + 0], %%o4 - ldx [%%o1 + 8], %%o3 - ldx [%%o4 + %1], %%g5 - ldx [%%o4 + %0], %%o4 - ldx [%%o3 + %0], %%o3 - rd %%fprs, %%o5 - andcc %%o5, %2, %%g0 - be,pt %%icc, 297f - sethi %%hi(%5), %%g1 - jmpl %%g1 + %%lo(%5), %%g7 - add %%g7, 8, %%g7 -297: wr %%g0, %4, %%fprs - membar #LoadStore|#StoreLoad|#StoreStore - sub %%g5, 64, %%g5 - ldda [%%o4] %3, %%f0 - ldda [%%o3] %3, %%f16 - cmp %%o0, 4 - bgeu,pt %%xcc, 10f - cmp %%o0, 3 - be,pn %%xcc, 13f - mov -64, %%g1 - sub %%g5, 64, %%g5 - rd %%asi, %%g1 - wr %%g0, %3, %%asi - -2: ldda [%%o4 + 64] %%asi, %%f32 - fxor %%f0, %%f16, %%f16 - fxor %%f2, %%f18, %%f18 - fxor %%f4, %%f20, %%f20 - fxor %%f6, %%f22, %%f22 - fxor %%f8, %%f24, %%f24 - fxor %%f10, %%f26, %%f26 - fxor %%f12, %%f28, %%f28 - fxor %%f14, %%f30, %%f30 - stda %%f16, [%%o4] %3 - ldda [%%o3 + 64] %%asi, %%f48 - ldda [%%o4 + 128] %%asi, %%f0 - fxor %%f32, %%f48, %%f48 - fxor %%f34, %%f50, %%f50 - add %%o4, 128, %%o4 - fxor %%f36, %%f52, %%f52 - add %%o3, 128, %%o3 - fxor %%f38, %%f54, %%f54 - subcc %%g5, 128, %%g5 - fxor %%f40, %%f56, %%f56 - fxor %%f42, %%f58, %%f58 - fxor %%f44, %%f60, %%f60 - fxor %%f46, %%f62, %%f62 - stda %%f48, [%%o4 - 64] %%asi - bne,pt %%xcc, 2b - ldda [%%o3] %3, %%f16 - - ldda [%%o4 + 64] %%asi, %%f32 - fxor %%f0, %%f16, %%f16 - fxor %%f2, %%f18, %%f18 - fxor %%f4, %%f20, %%f20 - fxor %%f6, %%f22, %%f22 - fxor %%f8, %%f24, %%f24 - fxor %%f10, %%f26, %%f26 - fxor %%f12, %%f28, %%f28 - fxor %%f14, %%f30, %%f30 - stda %%f16, [%%o4] %3 - ldda [%%o3 + 64] %%asi, %%f48 - membar #Sync - fxor %%f32, %%f48, %%f48 - fxor %%f34, %%f50, %%f50 - fxor %%f36, %%f52, %%f52 
- fxor %%f38, %%f54, %%f54 - fxor %%f40, %%f56, %%f56 - fxor %%f42, %%f58, %%f58 - fxor %%f44, %%f60, %%f60 - fxor %%f46, %%f62, %%f62 - stda %%f48, [%%o4 + 64] %%asi - membar #Sync|#StoreStore|#StoreLoad - wr %%g0, 0, %%fprs - retl - wr %%g1, %%g0, %%asi - -13: ldx [%%o1 + 16], %%o2 - ldx [%%o2 + %0], %%o2 - -3: ldda [%%o2] %3, %%f32 - fxor %%f0, %%f16, %%f48 - fxor %%f2, %%f18, %%f50 - add %%o4, 64, %%o4 - fxor %%f4, %%f20, %%f52 - fxor %%f6, %%f22, %%f54 - add %%o3, 64, %%o3 - fxor %%f8, %%f24, %%f56 - fxor %%f10, %%f26, %%f58 - fxor %%f12, %%f28, %%f60 - fxor %%f14, %%f30, %%f62 - ldda [%%o4] %3, %%f0 - fxor %%f48, %%f32, %%f48 - fxor %%f50, %%f34, %%f50 - fxor %%f52, %%f36, %%f52 - fxor %%f54, %%f38, %%f54 - add %%o2, 64, %%o2 - fxor %%f56, %%f40, %%f56 - fxor %%f58, %%f42, %%f58 - subcc %%g5, 64, %%g5 - fxor %%f60, %%f44, %%f60 - fxor %%f62, %%f46, %%f62 - stda %%f48, [%%o4 + %%g1] %3 - bne,pt %%xcc, 3b - ldda [%%o3] %3, %%f16 - - ldda [%%o2] %3, %%f32 - fxor %%f0, %%f16, %%f48 - fxor %%f2, %%f18, %%f50 - fxor %%f4, %%f20, %%f52 - fxor %%f6, %%f22, %%f54 - fxor %%f8, %%f24, %%f56 - fxor %%f10, %%f26, %%f58 - fxor %%f12, %%f28, %%f60 - fxor %%f14, %%f30, %%f62 - membar #Sync - fxor %%f48, %%f32, %%f48 - fxor %%f50, %%f34, %%f50 - fxor %%f52, %%f36, %%f52 - fxor %%f54, %%f38, %%f54 - fxor %%f56, %%f40, %%f56 - fxor %%f58, %%f42, %%f58 - fxor %%f60, %%f44, %%f60 - fxor %%f62, %%f46, %%f62 - stda %%f48, [%%o4] %3 - membar #Sync|#StoreStore|#StoreLoad - retl - wr %%g0, 0, %%fprs - -10: cmp %%o0, 5 - be,pt %%xcc, 15f - mov -64, %%g1 - -14: ldx [%%o1 + 16], %%o2 - ldx [%%o1 + 24], %%o0 - ldx [%%o2 + %0], %%o2 - ldx [%%o0 + %0], %%o0 - -4: ldda [%%o2] %3, %%f32 - fxor %%f0, %%f16, %%f16 - fxor %%f2, %%f18, %%f18 - add %%o4, 64, %%o4 - fxor %%f4, %%f20, %%f20 - fxor %%f6, %%f22, %%f22 - add %%o3, 64, %%o3 - fxor %%f8, %%f24, %%f24 - fxor %%f10, %%f26, %%f26 - fxor %%f12, %%f28, %%f28 - fxor %%f14, %%f30, %%f30 - ldda [%%o0] %3, %%f48 - fxor %%f16, %%f32, %%f32 - fxor %%f18, %%f34, %%f34 - fxor %%f20, %%f36, %%f36 - fxor %%f22, %%f38, %%f38 - add %%o2, 64, %%o2 - fxor %%f24, %%f40, %%f40 - fxor %%f26, %%f42, %%f42 - fxor %%f28, %%f44, %%f44 - fxor %%f30, %%f46, %%f46 - ldda [%%o4] %3, %%f0 - fxor %%f32, %%f48, %%f48 - fxor %%f34, %%f50, %%f50 - fxor %%f36, %%f52, %%f52 - add %%o0, 64, %%o0 - fxor %%f38, %%f54, %%f54 - fxor %%f40, %%f56, %%f56 - fxor %%f42, %%f58, %%f58 - subcc %%g5, 64, %%g5 - fxor %%f44, %%f60, %%f60 - fxor %%f46, %%f62, %%f62 - stda %%f48, [%%o4 + %%g1] %3 - bne,pt %%xcc, 4b - ldda [%%o3] %3, %%f16 - - ldda [%%o2] %3, %%f32 - fxor %%f0, %%f16, %%f16 - fxor %%f2, %%f18, %%f18 - fxor %%f4, %%f20, %%f20 - fxor %%f6, %%f22, %%f22 - fxor %%f8, %%f24, %%f24 - fxor %%f10, %%f26, %%f26 - fxor %%f12, %%f28, %%f28 - fxor %%f14, %%f30, %%f30 - ldda [%%o0] %3, %%f48 - fxor %%f16, %%f32, %%f32 - fxor %%f18, %%f34, %%f34 - fxor %%f20, %%f36, %%f36 - fxor %%f22, %%f38, %%f38 - fxor %%f24, %%f40, %%f40 - fxor %%f26, %%f42, %%f42 - fxor %%f28, %%f44, %%f44 - fxor %%f30, %%f46, %%f46 - membar #Sync - fxor %%f32, %%f48, %%f48 - fxor %%f34, %%f50, %%f50 - fxor %%f36, %%f52, %%f52 - fxor %%f38, %%f54, %%f54 - fxor %%f40, %%f56, %%f56 - fxor %%f42, %%f58, %%f58 - fxor %%f44, %%f60, %%f60 - fxor %%f46, %%f62, %%f62 - stda %%f48, [%%o4] %3 - membar #Sync|#StoreStore|#StoreLoad - retl - wr %%g0, 0, %%fprs - -15: ldx [%%o1 + 16], %%o2 - ldx [%%o1 + 24], %%o0 - ldx [%%o1 + 32], %%o1 - ldx [%%o2 + %0], %%o2 - ldx [%%o0 + %0], %%o0 - ldx [%%o1 + %0], %%o1 - -5: ldda [%%o2] %3, %%f32 - fxor %%f0, %%f16, 
%%f48 - fxor %%f2, %%f18, %%f50 - add %%o4, 64, %%o4 - fxor %%f4, %%f20, %%f52 - fxor %%f6, %%f22, %%f54 - add %%o3, 64, %%o3 - fxor %%f8, %%f24, %%f56 - fxor %%f10, %%f26, %%f58 - fxor %%f12, %%f28, %%f60 - fxor %%f14, %%f30, %%f62 - ldda [%%o0] %3, %%f16 - fxor %%f48, %%f32, %%f48 - fxor %%f50, %%f34, %%f50 - fxor %%f52, %%f36, %%f52 - fxor %%f54, %%f38, %%f54 - add %%o2, 64, %%o2 - fxor %%f56, %%f40, %%f56 - fxor %%f58, %%f42, %%f58 - fxor %%f60, %%f44, %%f60 - fxor %%f62, %%f46, %%f62 - ldda [%%o1] %3, %%f32 - fxor %%f48, %%f16, %%f48 - fxor %%f50, %%f18, %%f50 - add %%o0, 64, %%o0 - fxor %%f52, %%f20, %%f52 - fxor %%f54, %%f22, %%f54 - add %%o1, 64, %%o1 - fxor %%f56, %%f24, %%f56 - fxor %%f58, %%f26, %%f58 - fxor %%f60, %%f28, %%f60 - fxor %%f62, %%f30, %%f62 - ldda [%%o4] %3, %%f0 - fxor %%f48, %%f32, %%f48 - fxor %%f50, %%f34, %%f50 - fxor %%f52, %%f36, %%f52 - fxor %%f54, %%f38, %%f54 - fxor %%f56, %%f40, %%f56 - fxor %%f58, %%f42, %%f58 - subcc %%g5, 64, %%g5 - fxor %%f60, %%f44, %%f60 - fxor %%f62, %%f46, %%f62 - stda %%f48, [%%o4 + %%g1] %3 - bne,pt %%xcc, 5b - ldda [%%o3] %3, %%f16 - - ldda [%%o2] %3, %%f32 - fxor %%f0, %%f16, %%f48 - fxor %%f2, %%f18, %%f50 - fxor %%f4, %%f20, %%f52 - fxor %%f6, %%f22, %%f54 - fxor %%f8, %%f24, %%f56 - fxor %%f10, %%f26, %%f58 - fxor %%f12, %%f28, %%f60 - fxor %%f14, %%f30, %%f62 - ldda [%%o0] %3, %%f16 - fxor %%f48, %%f32, %%f48 - fxor %%f50, %%f34, %%f50 - fxor %%f52, %%f36, %%f52 - fxor %%f54, %%f38, %%f54 - fxor %%f56, %%f40, %%f56 - fxor %%f58, %%f42, %%f58 - fxor %%f60, %%f44, %%f60 - fxor %%f62, %%f46, %%f62 - ldda [%%o1] %3, %%f32 - fxor %%f48, %%f16, %%f48 - fxor %%f50, %%f18, %%f50 - fxor %%f52, %%f20, %%f52 - fxor %%f54, %%f22, %%f54 - fxor %%f56, %%f24, %%f56 - fxor %%f58, %%f26, %%f58 - fxor %%f60, %%f28, %%f60 - fxor %%f62, %%f30, %%f62 - membar #Sync - fxor %%f48, %%f32, %%f48 - fxor %%f50, %%f34, %%f50 - fxor %%f52, %%f36, %%f52 - fxor %%f54, %%f38, %%f54 - fxor %%f56, %%f40, %%f56 - fxor %%f58, %%f42, %%f58 - fxor %%f60, %%f44, %%f60 - fxor %%f62, %%f46, %%f62 - stda %%f48, [%%o4] %3 - membar #Sync|#StoreStore|#StoreLoad - retl - wr %%g0, 0, %%fprs - " : : - "i" (&((struct buffer_head *)0)->b_data), - "i" (&((struct buffer_head *)0)->b_size), - "i" (FPRS_FEF|FPRS_DU), "i" (ASI_BLK_P), - "i" (FPRS_FEF), "i" (VISenter)); -} -#endif /* __sparc_v9__ */ - -#if defined(__sparc__) && !defined(__sparc_v9__) -/* - * High speed xor_block operation for RAID4/5 utilizing the - * ldd/std SPARC instructions. 
- * - * Copyright (C) 1999 Jakub Jelinek (jj@ultra.linux.cz) - * - */ - -XORBLOCK_TEMPLATE(SPARC) -{ - int size = bh_ptr[0]->b_size; - int lines = size / (sizeof (long)) / 8, i; - long *destp = (long *) bh_ptr[0]->b_data; - long *source1 = (long *) bh_ptr[1]->b_data; - long *source2, *source3, *source4; - - switch (count) { - case 2: - for (i = lines; i > 0; i--) { - __asm__ __volatile__(" - ldd [%0 + 0x00], %%g2 - ldd [%0 + 0x08], %%g4 - ldd [%0 + 0x10], %%o0 - ldd [%0 + 0x18], %%o2 - ldd [%1 + 0x00], %%o4 - ldd [%1 + 0x08], %%l0 - ldd [%1 + 0x10], %%l2 - ldd [%1 + 0x18], %%l4 - xor %%g2, %%o4, %%g2 - xor %%g3, %%o5, %%g3 - xor %%g4, %%l0, %%g4 - xor %%g5, %%l1, %%g5 - xor %%o0, %%l2, %%o0 - xor %%o1, %%l3, %%o1 - xor %%o2, %%l4, %%o2 - xor %%o3, %%l5, %%o3 - std %%g2, [%0 + 0x00] - std %%g4, [%0 + 0x08] - std %%o0, [%0 + 0x10] - std %%o2, [%0 + 0x18] - " : : "r" (destp), "r" (source1) : "g2", "g3", "g4", "g5", "o0", - "o1", "o2", "o3", "o4", "o5", "l0", "l1", "l2", "l3", "l4", "l5"); - destp += 8; - source1 += 8; - } - break; - case 3: - source2 = (long *) bh_ptr[2]->b_data; - for (i = lines; i > 0; i--) { - __asm__ __volatile__(" - ldd [%0 + 0x00], %%g2 - ldd [%0 + 0x08], %%g4 - ldd [%0 + 0x10], %%o0 - ldd [%0 + 0x18], %%o2 - ldd [%1 + 0x00], %%o4 - ldd [%1 + 0x08], %%l0 - ldd [%1 + 0x10], %%l2 - ldd [%1 + 0x18], %%l4 - xor %%g2, %%o4, %%g2 - xor %%g3, %%o5, %%g3 - ldd [%2 + 0x00], %%o4 - xor %%g4, %%l0, %%g4 - xor %%g5, %%l1, %%g5 - ldd [%2 + 0x08], %%l0 - xor %%o0, %%l2, %%o0 - xor %%o1, %%l3, %%o1 - ldd [%2 + 0x10], %%l2 - xor %%o2, %%l4, %%o2 - xor %%o3, %%l5, %%o3 - ldd [%2 + 0x18], %%l4 - xor %%g2, %%o4, %%g2 - xor %%g3, %%o5, %%g3 - xor %%g4, %%l0, %%g4 - xor %%g5, %%l1, %%g5 - xor %%o0, %%l2, %%o0 - xor %%o1, %%l3, %%o1 - xor %%o2, %%l4, %%o2 - xor %%o3, %%l5, %%o3 - std %%g2, [%0 + 0x00] - std %%g4, [%0 + 0x08] - std %%o0, [%0 + 0x10] - std %%o2, [%0 + 0x18] - " : : "r" (destp), "r" (source1), "r" (source2) - : "g2", "g3", "g4", "g5", "o0", "o1", "o2", "o3", "o4", "o5", - "l0", "l1", "l2", "l3", "l4", "l5"); - destp += 8; - source1 += 8; - source2 += 8; - } - break; - case 4: - source2 = (long *) bh_ptr[2]->b_data; - source3 = (long *) bh_ptr[3]->b_data; - for (i = lines; i > 0; i--) { - __asm__ __volatile__(" - ldd [%0 + 0x00], %%g2 - ldd [%0 + 0x08], %%g4 - ldd [%0 + 0x10], %%o0 - ldd [%0 + 0x18], %%o2 - ldd [%1 + 0x00], %%o4 - ldd [%1 + 0x08], %%l0 - ldd [%1 + 0x10], %%l2 - ldd [%1 + 0x18], %%l4 - xor %%g2, %%o4, %%g2 - xor %%g3, %%o5, %%g3 - ldd [%2 + 0x00], %%o4 - xor %%g4, %%l0, %%g4 - xor %%g5, %%l1, %%g5 - ldd [%2 + 0x08], %%l0 - xor %%o0, %%l2, %%o0 - xor %%o1, %%l3, %%o1 - ldd [%2 + 0x10], %%l2 - xor %%o2, %%l4, %%o2 - xor %%o3, %%l5, %%o3 - ldd [%2 + 0x18], %%l4 - xor %%g2, %%o4, %%g2 - xor %%g3, %%o5, %%g3 - ldd [%3 + 0x00], %%o4 - xor %%g4, %%l0, %%g4 - xor %%g5, %%l1, %%g5 - ldd [%3 + 0x08], %%l0 - xor %%o0, %%l2, %%o0 - xor %%o1, %%l3, %%o1 - ldd [%3 + 0x10], %%l2 - xor %%o2, %%l4, %%o2 - xor %%o3, %%l5, %%o3 - ldd [%3 + 0x18], %%l4 - xor %%g2, %%o4, %%g2 - xor %%g3, %%o5, %%g3 - xor %%g4, %%l0, %%g4 - xor %%g5, %%l1, %%g5 - xor %%o0, %%l2, %%o0 - xor %%o1, %%l3, %%o1 - xor %%o2, %%l4, %%o2 - xor %%o3, %%l5, %%o3 - std %%g2, [%0 + 0x00] - std %%g4, [%0 + 0x08] - std %%o0, [%0 + 0x10] - std %%o2, [%0 + 0x18] - " : : "r" (destp), "r" (source1), "r" (source2), "r" (source3) - : "g2", "g3", "g4", "g5", "o0", "o1", "o2", "o3", "o4", "o5", - "l0", "l1", "l2", "l3", "l4", "l5"); - destp += 8; - source1 += 8; - source2 += 8; - source3 += 8; - } - break; - case 5: - 
source2 = (long *) bh_ptr[2]->b_data; - source3 = (long *) bh_ptr[3]->b_data; - source4 = (long *) bh_ptr[4]->b_data; - for (i = lines; i > 0; i--) { - __asm__ __volatile__(" - ldd [%0 + 0x00], %%g2 - ldd [%0 + 0x08], %%g4 - ldd [%0 + 0x10], %%o0 - ldd [%0 + 0x18], %%o2 - ldd [%1 + 0x00], %%o4 - ldd [%1 + 0x08], %%l0 - ldd [%1 + 0x10], %%l2 - ldd [%1 + 0x18], %%l4 - xor %%g2, %%o4, %%g2 - xor %%g3, %%o5, %%g3 - ldd [%2 + 0x00], %%o4 - xor %%g4, %%l0, %%g4 - xor %%g5, %%l1, %%g5 - ldd [%2 + 0x08], %%l0 - xor %%o0, %%l2, %%o0 - xor %%o1, %%l3, %%o1 - ldd [%2 + 0x10], %%l2 - xor %%o2, %%l4, %%o2 - xor %%o3, %%l5, %%o3 - ldd [%2 + 0x18], %%l4 - xor %%g2, %%o4, %%g2 - xor %%g3, %%o5, %%g3 - ldd [%3 + 0x00], %%o4 - xor %%g4, %%l0, %%g4 - xor %%g5, %%l1, %%g5 - ldd [%3 + 0x08], %%l0 - xor %%o0, %%l2, %%o0 - xor %%o1, %%l3, %%o1 - ldd [%3 + 0x10], %%l2 - xor %%o2, %%l4, %%o2 - xor %%o3, %%l5, %%o3 - ldd [%3 + 0x18], %%l4 - xor %%g2, %%o4, %%g2 - xor %%g3, %%o5, %%g3 - ldd [%4 + 0x00], %%o4 - xor %%g4, %%l0, %%g4 - xor %%g5, %%l1, %%g5 - ldd [%4 + 0x08], %%l0 - xor %%o0, %%l2, %%o0 - xor %%o1, %%l3, %%o1 - ldd [%4 + 0x10], %%l2 - xor %%o2, %%l4, %%o2 - xor %%o3, %%l5, %%o3 - ldd [%4 + 0x18], %%l4 - xor %%g2, %%o4, %%g2 - xor %%g3, %%o5, %%g3 - xor %%g4, %%l0, %%g4 - xor %%g5, %%l1, %%g5 - xor %%o0, %%l2, %%o0 - xor %%o1, %%l3, %%o1 - xor %%o2, %%l4, %%o2 - xor %%o3, %%l5, %%o3 - std %%g2, [%0 + 0x00] - std %%g4, [%0 + 0x08] - std %%o0, [%0 + 0x10] - std %%o2, [%0 + 0x18] - " : : "r" (destp), "r" (source1), "r" (source2), "r" (source3), "r" (source4) - : "g2", "g3", "g4", "g5", "o0", "o1", "o2", "o3", "o4", "o5", - "l0", "l1", "l2", "l3", "l4", "l5"); - destp += 8; - source1 += 8; - source2 += 8; - source3 += 8; - source4 += 8; - } - break; + p3 = (unsigned long *) bh_ptr[3]->b_data; + if (count == 4) { + active_template->do_4(bytes, p0, p1, p2, p3); + return; } -} -#endif /* __sparc_v[78]__ */ - -#ifdef __alpha__ -/* - * High speed xor_block operation for RAID4/5 pipelined for Alpha EV5. - * There is a second version using EV6 prefetch instructions. 
- * - * Copyright (C) 2000 Richard Henderson (rth@redhat.com) - */ - -XORBLOCK_TEMPLATE(alpha) -{ - long lines = bh_ptr[0]->b_size / sizeof (long) / 8; - long *d = (long *) bh_ptr[0]->b_data; - long *s1 = (long *) bh_ptr[1]->b_data; - long *s2, *s3, *s4; - - if (count == 2) goto two_blocks; - - s2 = (long *) bh_ptr[2]->b_data; - if (count == 3) goto three_blocks; - - s3 = (long *) bh_ptr[3]->b_data; - if (count == 4) goto four_blocks; - - s4 = (long *) bh_ptr[4]->b_data; - goto five_blocks; - -two_blocks: -asm volatile (" - .align 4 -2: - ldq $0,0(%0) - ldq $1,0(%1) - ldq $2,8(%0) - ldq $3,8(%1) - - ldq $4,16(%0) - ldq $5,16(%1) - ldq $6,24(%0) - ldq $7,24(%1) - - ldq $16,32(%0) - ldq $17,32(%1) - ldq $18,40(%0) - ldq $19,40(%1) - - ldq $20,48(%0) - ldq $21,48(%1) - ldq $22,56(%0) - xor $0,$1,$0 # 7 cycles from $1 load - - ldq $23,56(%1) - xor $2,$3,$2 - stq $0,0(%0) - xor $4,$5,$4 - - stq $2,8(%0) - xor $6,$7,$6 - stq $4,16(%0) - xor $16,$17,$16 - - stq $6,24(%0) - xor $18,$19,$18 - stq $16,32(%0) - xor $20,$21,$20 - - stq $18,40(%0) - xor $22,$23,$22 - stq $20,48(%0) - subq %2,1,%2 - - stq $22,56(%0) - addq %0,64,%0 - addq %1,64,%1 - bgt %2,2b" - : "=r"(d), "=r"(s1), "=r"(lines) - : "0"(d), "1"(s1), "2"(lines) - : "memory", "$0", "$1", "$2", "$3", "$4", "$5", "$6", "$7", - "$16", "$17", "$18", "$19", "$20", "$21", "$22", "$23"); - return; - -three_blocks: -asm volatile (" - .align 4 -3: - ldq $0,0(%0) - ldq $1,0(%1) - ldq $2,0(%2) - ldq $3,8(%0) - - ldq $4,8(%1) - ldq $6,16(%0) - ldq $7,16(%1) - ldq $17,24(%0) - - ldq $18,24(%1) - ldq $20,32(%0) - ldq $21,32(%1) - ldq $5,8(%2) - - ldq $16,16(%2) - ldq $19,24(%2) - ldq $22,32(%2) - nop - - xor $0,$1,$1 # 8 cycles from $0 load - xor $3,$4,$4 # 6 cycles from $4 load - xor $6,$7,$7 # 6 cycles from $7 load - xor $17,$18,$18 # 5 cycles from $18 load - - xor $1,$2,$2 # 9 cycles from $2 load - xor $20,$21,$21 # 5 cycles from $21 load - stq $2,0(%0) - xor $4,$5,$5 # 6 cycles from $5 load - - stq $5,8(%0) - xor $7,$16,$16 # 7 cycles from $16 load - stq $16,16(%0) - xor $18,$19,$19 # 7 cycles from $19 load - - stq $19,24(%0) - xor $21,$22,$22 # 7 cycles from $22 load - stq $22,32(%0) - nop - - ldq $0,40(%0) - ldq $1,40(%1) - ldq $3,48(%0) - ldq $4,48(%1) - - ldq $6,56(%0) - ldq $7,56(%1) - ldq $2,40(%2) - ldq $5,48(%2) - - ldq $16,56(%2) - xor $0,$1,$1 # 4 cycles from $1 load - xor $3,$4,$4 # 5 cycles from $4 load - xor $6,$7,$7 # 5 cycles from $7 load - - xor $1,$2,$2 # 4 cycles from $2 load - xor $4,$5,$5 # 5 cycles from $5 load - stq $2,40(%0) - xor $7,$16,$16 # 4 cycles from $16 load - - stq $5,48(%0) - subq %3,1,%3 - stq $16,56(%0) - addq %2,64,%2 - - addq %1,64,%1 - addq %0,64,%0 - bgt %3,3b" - : "=r"(d), "=r"(s1), "=r"(s2), "=r"(lines) - : "0"(d), "1"(s1), "2"(s2), "3"(lines) - : "memory", "$0", "$1", "$2", "$3", "$4", "$5", "$6", "$7", - "$16", "$17", "$18", "$19", "$20", "$21", "$22"); - return; - -four_blocks: -asm volatile (" - .align 4 -4: - ldq $0,0(%0) - ldq $1,0(%1) - ldq $2,0(%2) - ldq $3,0(%3) - - ldq $4,8(%0) - ldq $5,8(%1) - ldq $6,8(%2) - ldq $7,8(%3) - ldq $16,16(%0) - ldq $17,16(%1) - ldq $18,16(%2) - ldq $19,16(%3) - - ldq $20,24(%0) - xor $0,$1,$1 # 6 cycles from $1 load - ldq $21,24(%1) - xor $2,$3,$3 # 6 cycles from $3 load - - ldq $0,24(%2) - xor $1,$3,$3 - ldq $1,24(%3) - xor $4,$5,$5 # 7 cycles from $5 load - - stq $3,0(%0) - xor $6,$7,$7 - xor $16,$17,$17 # 7 cycles from $17 load - xor $5,$7,$7 - - stq $7,8(%0) - xor $18,$19,$19 # 7 cycles from $19 load - ldq $2,32(%0) - xor $17,$19,$19 - - ldq $3,32(%1) - ldq 
$4,32(%2) - ldq $5,32(%3) - xor $20,$21,$21 # 8 cycles from $21 load - - ldq $6,40(%0) - ldq $7,40(%1) - ldq $16,40(%2) - ldq $17,40(%3) - - stq $19,16(%0) - xor $0,$1,$1 # 9 cycles from $1 load - xor $2,$3,$3 # 5 cycles from $3 load - xor $21,$1,$1 - - ldq $18,48(%0) - xor $4,$5,$5 # 5 cycles from $5 load - ldq $19,48(%1) - xor $3,$5,$5 - - ldq $20,48(%2) - ldq $21,48(%3) - ldq $0,56(%0) - ldq $1,56(%1) - - ldq $2,56(%2) - xor $6,$7,$7 # 8 cycles from $6 load - ldq $3,56(%3) - xor $16,$17,$17 # 8 cycles from $17 load - - xor $7,$17,$17 - xor $18,$19,$19 # 5 cycles from $19 load - xor $20,$21,$21 # 5 cycles from $21 load - xor $19,$21,$21 - - stq $1,24(%0) - xor $0,$1,$1 # 5 cycles from $1 load - stq $5,32(%0) - xor $2,$3,$3 # 4 cycles from $3 load - - stq $17,40(%0) - xor $1,$3,$3 - stq $21,48(%0) - subq %4,1,%4 - - stq $3,56(%0) - addq %3,64,%3 - addq %2,64,%2 - addq %1,64,%1 - - addq %0,64,%0 - bgt %4,4b" - : "=r"(d), "=r"(s1), "=r"(s2), "=r"(s3), "=r"(lines) - : "0"(d), "1"(s1), "2"(s2), "3"(s3), "4"(lines) - : "memory", "$0", "$1", "$2", "$3", "$4", "$5", "$6", "$7", - "$16", "$17", "$18", "$19", "$20", "$21"); - return; - -five_blocks: -asm volatile (" - ldq %0,0(%6) - ldq %1,8(%6) - ldq %2,16(%6) - ldq %3,24(%6) - ldq %4,32(%6) - ldq %0,%7(%0) - ldq %1,%7(%1) - ldq %2,%7(%2) - ldq %3,%7(%3) - ldq %4,%7(%4) - .align 4 -5: - ldq $0,0(%0) - ldq $1,0(%1) - ldq $2,0(%2) - ldq $3,0(%3) - - ldq $4,0(%4) - ldq $5,8(%0) - ldq $6,8(%1) - ldq $7,8(%2) - - ldq $16,8(%3) - ldq $17,8(%4) - ldq $18,16(%0) - ldq $19,16(%1) - - ldq $20,16(%2) - xor $0,$1,$1 # 6 cycles from $1 load - ldq $21,16(%3) - xor $2,$3,$3 # 6 cycles from $3 load - - ldq $0,16(%4) - xor $1,$3,$3 - ldq $1,24(%0) - xor $3,$4,$4 # 7 cycles from $4 load - - stq $4,0(%0) - xor $5,$6,$6 # 7 cycles from $6 load - xor $7,$16,$16 # 7 cycles from $16 load - xor $6,$17,$17 # 7 cycles from $17 load - - ldq $2,24(%1) - xor $16,$17,$17 - ldq $3,24(%2) - xor $18,$19,$19 # 8 cycles from $19 load - - stq $17,8(%0) - xor $19,$20,$20 # 8 cycles from $20 load - ldq $4,24(%3) - xor $21,$0,$0 # 7 cycles from $0 load - - ldq $5,24(%4) - xor $20,$0,$0 - ldq $6,32(%0) - ldq $7,32(%1) - - stq $0,16(%0) - xor $1,$2,$2 # 6 cycles from $2 load - ldq $16,32(%2) - xor $3,$4,$4 # 4 cycles from $4 load - - ldq $17,32(%3) - xor $2,$4,$4 - ldq $18,32(%4) - ldq $19,40(%0) - - ldq $20,40(%1) - ldq $21,40(%2) - ldq $0,40(%3) - xor $4,$5,$5 # 7 cycles from $5 load - - stq $5,24(%0) - xor $6,$7,$7 # 7 cycles from $7 load - ldq $1,40(%4) - ldq $2,48(%0) - - ldq $3,48(%1) - xor $7,$16,$16 # 7 cycles from $16 load - ldq $4,48(%2) - xor $17,$18,$18 # 6 cycles from $18 load - - ldq $5,48(%3) - xor $16,$18,$18 - ldq $6,48(%4) - xor $19,$20,$20 # 7 cycles from $20 load - - stq $18,32(%0) - xor $20,$21,$21 # 8 cycles from $21 load - ldq $7,56(%0) - xor $0,$1,$1 # 6 cycles from $1 load - - ldq $16,56(%1) - ldq $17,56(%2) - ldq $18,56(%3) - ldq $19,56(%4) - - xor $21,$1,$1 - xor $2,$3,$3 # 9 cycles from $3 load - xor $3,$4,$4 # 9 cycles from $4 load - xor $5,$6,$6 # 8 cycles from $6 load - - unop - xor $4,$6,$6 - xor $7,$16,$16 # 7 cycles from $16 load - xor $17,$18,$18 # 6 cycles from $18 load - - stq $6,48(%0) - xor $16,$18,$18 - subq %5,1,%5 - xor $18,$19,$19 # 8 cycles from $19 load - - stq $19,56(%0) - addq %4,64,%4 - addq %3,64,%3 - addq %2,64,%2 - - addq %1,64,%1 - addq %0,64,%0 - bgt %5,5b" - : "=&r"(d), "=&r"(s1), "=&r"(s2), "=&r"(s3), "=r"(s4), "=r"(lines) - /* ARG! We've run out of asm arguments! We've got to reload - all those pointers we just loaded. 
*/ - : "r"(bh_ptr), "i" (&((struct buffer_head *)0)->b_data), "5"(lines) - : "memory", "$0", "$1", "$2", "$3", "$4", "$5", "$6", "$7", - "$16", "$17", "$18", "$19", "$20", "$21"); - return; + p4 = (unsigned long *) bh_ptr[4]->b_data; + active_template->do_5(bytes, p0, p1, p2, p3, p4); } -#define prefetch(base, ofs) \ - asm("ldq $31,%2(%0)" : "=r"(base) : "0"(base), "i"(ofs)) - -XORBLOCK_TEMPLATE(alpha_prefetch) -{ - long lines = bh_ptr[0]->b_size / sizeof (long) / 8; - long *d = (long *) bh_ptr[0]->b_data; - long *s1 = (long *) bh_ptr[1]->b_data; - long *s2, *s3, *s4; - long p; - - p = count == 2; - prefetch(d, 0); - prefetch(s1, 0); - prefetch(d, 64); - prefetch(s1, 64); - prefetch(d, 128); - prefetch(s1, 128); - prefetch(d, 192); - prefetch(s1, 192); - if (p) goto two_blocks; - - s2 = (long *) bh_ptr[2]->b_data; - p = count == 3; - prefetch(s2, 0); - prefetch(s2, 64); - prefetch(s2, 128); - prefetch(s2, 192); - if (p) goto three_blocks; - - s3 = (long *) bh_ptr[3]->b_data; - p = count == 4; - prefetch(s3, 0); - prefetch(s3, 64); - prefetch(s3, 128); - prefetch(s3, 192); - if (p) goto four_blocks; - - s4 = (long *) bh_ptr[4]->b_data; - prefetch(s4, 0); - prefetch(s4, 64); - prefetch(s4, 128); - prefetch(s4, 192); - goto five_blocks; - -two_blocks: -asm volatile (" - .align 4 -2: - ldq $0,0(%0) - ldq $1,0(%1) - ldq $2,8(%0) - ldq $3,8(%1) - - ldq $4,16(%0) - ldq $5,16(%1) - ldq $6,24(%0) - ldq $7,24(%1) - - ldq $16,32(%0) - ldq $17,32(%1) - ldq $18,40(%0) - ldq $19,40(%1) - - ldq $20,48(%0) - ldq $21,48(%1) - ldq $22,56(%0) - ldq $23,56(%1) - - ldq $31,256(%0) - xor $0,$1,$0 # 8 cycles from $1 load - ldq $31,256(%1) - xor $2,$3,$2 - - stq $0,0(%0) - xor $4,$5,$4 - stq $2,8(%0) - xor $6,$7,$6 - - stq $4,16(%0) - xor $16,$17,$16 - stq $6,24(%0) - xor $18,$19,$18 - - stq $16,32(%0) - xor $20,$21,$20 - stq $18,40(%0) - xor $22,$23,$22 - - stq $20,48(%0) - subq %2,1,%2 - stq $22,56(%0) - addq %0,64,%0 - - addq %1,64,%1 - bgt %2,2b" - : "=r"(d), "=r"(s1), "=r"(lines) - : "0"(d), "1"(s1), "2"(lines) - : "memory", "$0", "$1", "$2", "$3", "$4", "$5", "$6", "$7", - "$16", "$17", "$18", "$19", "$20", "$21", "$22", "$23"); - return; - -three_blocks: -asm volatile (" - .align 4 -3: - ldq $0,0(%0) - ldq $1,0(%1) - ldq $2,0(%2) - ldq $3,8(%0) - - ldq $4,8(%1) - ldq $6,16(%0) - ldq $7,16(%1) - ldq $17,24(%0) - - ldq $18,24(%1) - ldq $20,32(%0) - ldq $21,32(%1) - ldq $5,8(%2) - - ldq $16,16(%2) - ldq $19,24(%2) - ldq $22,32(%2) - nop - - xor $0,$1,$1 # 8 cycles from $0 load - xor $3,$4,$4 # 7 cycles from $4 load - xor $6,$7,$7 # 6 cycles from $7 load - xor $17,$18,$18 # 5 cycles from $18 load - - xor $1,$2,$2 # 9 cycles from $2 load - xor $20,$21,$21 # 5 cycles from $21 load - stq $2,0(%0) - xor $4,$5,$5 # 6 cycles from $5 load - - stq $5,8(%0) - xor $7,$16,$16 # 7 cycles from $16 load - stq $16,16(%0) - xor $18,$19,$19 # 7 cycles from $19 load - - stq $19,24(%0) - xor $21,$22,$22 # 7 cycles from $22 load - stq $22,32(%0) - nop - - ldq $0,40(%0) - ldq $1,40(%1) - ldq $3,48(%0) - ldq $4,48(%1) - - ldq $6,56(%0) - ldq $7,56(%1) - ldq $2,40(%2) - ldq $5,48(%2) - - ldq $16,56(%2) - ldq $31,256(%0) - ldq $31,256(%1) - ldq $31,256(%2) - - xor $0,$1,$1 # 6 cycles from $1 load - xor $3,$4,$4 # 5 cycles from $4 load - xor $6,$7,$7 # 5 cycles from $7 load - xor $1,$2,$2 # 4 cycles from $2 load - - xor $4,$5,$5 # 5 cycles from $5 load - xor $7,$16,$16 # 4 cycles from $16 load - stq $2,40(%0) - subq %3,1,%3 - - stq $5,48(%0) - addq %2,64,%2 - stq $16,56(%0) - addq %1,64,%1 - - addq %0,64,%0 - bgt %3,3b" - : "=r"(d), 
"=r"(s1), "=r"(s2), "=r"(lines) - : "0"(d), "1"(s1), "2"(s2), "3"(lines) - : "memory", "$0", "$1", "$2", "$3", "$4", "$5", "$6", "$7", - "$16", "$17", "$18", "$19", "$20", "$21", "$22"); - return; - -four_blocks: -asm volatile (" - .align 4 -4: - ldq $0,0(%0) - ldq $1,0(%1) - ldq $2,0(%2) - ldq $3,0(%3) - - ldq $4,8(%0) - ldq $5,8(%1) - ldq $6,8(%2) - ldq $7,8(%3) - - ldq $16,16(%0) - ldq $17,16(%1) - ldq $18,16(%2) - ldq $19,16(%3) - - ldq $20,24(%0) - xor $0,$1,$1 # 6 cycles from $1 load - ldq $21,24(%1) - xor $2,$3,$3 # 6 cycles from $3 load - - ldq $0,24(%2) - xor $1,$3,$3 - ldq $1,24(%3) - xor $4,$5,$5 # 7 cycles from $5 load - - stq $3,0(%0) - xor $6,$7,$7 - xor $16,$17,$17 # 7 cycles from $17 load - xor $5,$7,$7 - - stq $7,8(%0) - xor $18,$19,$19 # 7 cycles from $19 load - ldq $2,32(%0) - xor $17,$19,$19 - - ldq $3,32(%1) - ldq $4,32(%2) - ldq $5,32(%3) - xor $20,$21,$21 # 8 cycles from $21 load - - ldq $6,40(%0) - ldq $7,40(%1) - ldq $16,40(%2) - ldq $17,40(%3) - - stq $19,16(%0) - xor $0,$1,$1 # 9 cycles from $1 load - xor $2,$3,$3 # 5 cycles from $3 load - xor $21,$1,$1 - - ldq $18,48(%0) - xor $4,$5,$5 # 5 cycles from $5 load - ldq $19,48(%1) - xor $3,$5,$5 - - ldq $20,48(%2) - ldq $21,48(%3) - ldq $0,56(%0) - ldq $1,56(%1) - - ldq $2,56(%2) - xor $6,$7,$7 # 8 cycles from $6 load - ldq $3,56(%3) - xor $16,$17,$17 # 8 cycles from $17 load - - ldq $31,256(%0) - xor $7,$17,$17 - ldq $31,256(%1) - xor $18,$19,$19 # 6 cycles from $19 load +/* Set of all registered templates. */ +static struct xor_block_template *template_list; - ldq $31,256(%2) - xor $20,$21,$21 # 6 cycles from $21 load - ldq $31,256(%3) - xor $19,$21,$21 +/* The -6*32 shift factor colors the cache. */ +#define BENCH_SIZE (PAGE_SIZE-6*32) - stq $1,24(%0) - xor $0,$1,$1 # 7 cycles from $1 load - stq $5,32(%0) - xor $2,$3,$3 # 6 cycles from $3 load - - stq $17,40(%0) - xor $1,$3,$3 - stq $21,48(%0) - subq %4,1,%4 - - stq $3,56(%0) - addq %3,64,%3 - addq %2,64,%2 - addq %1,64,%1 - - addq %0,64,%0 - bgt %4,4b" - : "=r"(d), "=r"(s1), "=r"(s2), "=r"(s3), "=r"(lines) - : "0"(d), "1"(s1), "2"(s2), "3"(s3), "4"(lines) - : "memory", "$0", "$1", "$2", "$3", "$4", "$5", "$6", "$7", - "$16", "$17", "$18", "$19", "$20", "$21"); - return; - -five_blocks: -asm volatile (" - ldq %0,0(%6) - ldq %1,8(%6) - ldq %2,16(%6) - ldq %3,24(%6) - ldq %4,32(%6) - ldq %0,%7(%0) - ldq %1,%7(%1) - ldq %2,%7(%2) - ldq %3,%7(%3) - ldq %4,%7(%4) - .align 4 -5: - ldq $0,0(%0) - ldq $1,0(%1) - ldq $2,0(%2) - ldq $3,0(%3) - - ldq $4,0(%4) - ldq $5,8(%0) - ldq $6,8(%1) - ldq $7,8(%2) - - ldq $16,8(%3) - ldq $17,8(%4) - ldq $18,16(%0) - ldq $19,16(%1) - - ldq $20,16(%2) - xor $0,$1,$1 # 6 cycles from $1 load - ldq $21,16(%3) - xor $2,$3,$3 # 6 cycles from $3 load - - ldq $0,16(%4) - xor $1,$3,$3 - ldq $1,24(%0) - xor $3,$4,$4 # 7 cycles from $4 load - - stq $4,0(%0) - xor $5,$6,$6 # 7 cycles from $6 load - xor $7,$16,$16 # 7 cycles from $16 load - xor $6,$17,$17 # 7 cycles from $17 load - - ldq $2,24(%1) - xor $16,$17,$17 - ldq $3,24(%2) - xor $18,$19,$19 # 8 cycles from $19 load - - stq $17,8(%0) - xor $19,$20,$20 # 8 cycles from $20 load - ldq $4,24(%3) - xor $21,$0,$0 # 7 cycles from $0 load - - ldq $5,24(%4) - xor $20,$0,$0 - ldq $6,32(%0) - ldq $7,32(%1) - - stq $0,16(%0) - xor $1,$2,$2 # 6 cycles from $2 load - ldq $16,32(%2) - xor $3,$4,$4 # 4 cycles from $4 load - - ldq $17,32(%3) - xor $2,$4,$4 - ldq $18,32(%4) - ldq $19,40(%0) - - ldq $20,40(%1) - ldq $21,40(%2) - ldq $0,40(%3) - xor $4,$5,$5 # 7 cycles from $5 load - - stq $5,24(%0) - xor 
$6,$7,$7 # 7 cycles from $7 load - ldq $1,40(%4) - ldq $2,48(%0) - - ldq $3,48(%1) - xor $7,$16,$16 # 7 cycles from $16 load - ldq $4,48(%2) - xor $17,$18,$18 # 6 cycles from $18 load - - ldq $5,48(%3) - xor $16,$18,$18 - ldq $6,48(%4) - xor $19,$20,$20 # 7 cycles from $20 load - - stq $18,32(%0) - xor $20,$21,$21 # 8 cycles from $21 load - ldq $7,56(%0) - xor $0,$1,$1 # 6 cycles from $1 load - - ldq $16,56(%1) - ldq $17,56(%2) - ldq $18,56(%3) - ldq $19,56(%4) - - ldq $31,256(%0) - xor $21,$1,$1 - ldq $31,256(%1) - xor $2,$3,$3 # 9 cycles from $3 load - - ldq $31,256(%2) - xor $3,$4,$4 # 9 cycles from $4 load - ldq $31,256(%3) - xor $5,$6,$6 # 8 cycles from $6 load - - ldq $31,256(%4) - xor $4,$6,$6 - xor $7,$16,$16 # 7 cycles from $16 load - xor $17,$18,$18 # 6 cycles from $18 load - - stq $6,48(%0) - xor $16,$18,$18 - subq %5,1,%5 - xor $18,$19,$19 # 8 cycles from $19 load - - stq $19,56(%0) - addq %4,64,%4 - addq %3,64,%3 - addq %2,64,%2 - - addq %1,64,%1 - addq %0,64,%0 - bgt %5,5b" - : "=&r"(d), "=&r"(s1), "=&r"(s2), "=&r"(s3), "=r"(s4), "=r"(lines) - /* ARG! We've run out of asm arguments! We've got to reload - all those pointers we just loaded. */ - : "r"(bh_ptr), "i" (&((struct buffer_head *)0)->b_data), "5"(lines) - : "memory", "$0", "$1", "$2", "$3", "$4", "$5", "$6", "$7", - "$16", "$17", "$18", "$19", "$20", "$21"); - return; -} - -#undef prefetch - -#endif /* __alpha__ */ - -#ifndef __sparc_v9__ - -/* - * this one works reasonably on any x86 CPU - * (send me an assembly version for inclusion if you can make it faster) - * - * this one is just as fast as written in pure assembly on x86. - * the reason for this separate version is that the - * fast open-coded xor routine "32reg" produces suboptimal code - * on x86, due to lack of registers. 
- */ -XORBLOCK_TEMPLATE(8regs) -{ - int len = bh_ptr[0]->b_size; - long *destp = (long *) bh_ptr[0]->b_data; - long *source1, *source2, *source3, *source4; - long lines = len / (sizeof (long)) / 8, i; - - switch(count) { - case 2: - source1 = (long *) bh_ptr[1]->b_data; - for (i = lines; i > 0; i--) { - *(destp + 0) ^= *(source1 + 0); - *(destp + 1) ^= *(source1 + 1); - *(destp + 2) ^= *(source1 + 2); - *(destp + 3) ^= *(source1 + 3); - *(destp + 4) ^= *(source1 + 4); - *(destp + 5) ^= *(source1 + 5); - *(destp + 6) ^= *(source1 + 6); - *(destp + 7) ^= *(source1 + 7); - source1 += 8; - destp += 8; - } - break; - case 3: - source2 = (long *) bh_ptr[2]->b_data; - source1 = (long *) bh_ptr[1]->b_data; - for (i = lines; i > 0; i--) { - *(destp + 0) ^= *(source1 + 0); - *(destp + 0) ^= *(source2 + 0); - *(destp + 1) ^= *(source1 + 1); - *(destp + 1) ^= *(source2 + 1); - *(destp + 2) ^= *(source1 + 2); - *(destp + 2) ^= *(source2 + 2); - *(destp + 3) ^= *(source1 + 3); - *(destp + 3) ^= *(source2 + 3); - *(destp + 4) ^= *(source1 + 4); - *(destp + 4) ^= *(source2 + 4); - *(destp + 5) ^= *(source1 + 5); - *(destp + 5) ^= *(source2 + 5); - *(destp + 6) ^= *(source1 + 6); - *(destp + 6) ^= *(source2 + 6); - *(destp + 7) ^= *(source1 + 7); - *(destp + 7) ^= *(source2 + 7); - source1 += 8; - source2 += 8; - destp += 8; - } - break; - case 4: - source3 = (long *) bh_ptr[3]->b_data; - source2 = (long *) bh_ptr[2]->b_data; - source1 = (long *) bh_ptr[1]->b_data; - for (i = lines; i > 0; i--) { - *(destp + 0) ^= *(source1 + 0); - *(destp + 0) ^= *(source2 + 0); - *(destp + 0) ^= *(source3 + 0); - *(destp + 1) ^= *(source1 + 1); - *(destp + 1) ^= *(source2 + 1); - *(destp + 1) ^= *(source3 + 1); - *(destp + 2) ^= *(source1 + 2); - *(destp + 2) ^= *(source2 + 2); - *(destp + 2) ^= *(source3 + 2); - *(destp + 3) ^= *(source1 + 3); - *(destp + 3) ^= *(source2 + 3); - *(destp + 3) ^= *(source3 + 3); - *(destp + 4) ^= *(source1 + 4); - *(destp + 4) ^= *(source2 + 4); - *(destp + 4) ^= *(source3 + 4); - *(destp + 5) ^= *(source1 + 5); - *(destp + 5) ^= *(source2 + 5); - *(destp + 5) ^= *(source3 + 5); - *(destp + 6) ^= *(source1 + 6); - *(destp + 6) ^= *(source2 + 6); - *(destp + 6) ^= *(source3 + 6); - *(destp + 7) ^= *(source1 + 7); - *(destp + 7) ^= *(source2 + 7); - *(destp + 7) ^= *(source3 + 7); - source1 += 8; - source2 += 8; - source3 += 8; - destp += 8; - } - break; - case 5: - source4 = (long *) bh_ptr[4]->b_data; - source3 = (long *) bh_ptr[3]->b_data; - source2 = (long *) bh_ptr[2]->b_data; - source1 = (long *) bh_ptr[1]->b_data; - for (i = lines; i > 0; i--) { - *(destp + 0) ^= *(source1 + 0); - *(destp + 0) ^= *(source2 + 0); - *(destp + 0) ^= *(source3 + 0); - *(destp + 0) ^= *(source4 + 0); - *(destp + 1) ^= *(source1 + 1); - *(destp + 1) ^= *(source2 + 1); - *(destp + 1) ^= *(source3 + 1); - *(destp + 1) ^= *(source4 + 1); - *(destp + 2) ^= *(source1 + 2); - *(destp + 2) ^= *(source2 + 2); - *(destp + 2) ^= *(source3 + 2); - *(destp + 2) ^= *(source4 + 2); - *(destp + 3) ^= *(source1 + 3); - *(destp + 3) ^= *(source2 + 3); - *(destp + 3) ^= *(source3 + 3); - *(destp + 3) ^= *(source4 + 3); - *(destp + 4) ^= *(source1 + 4); - *(destp + 4) ^= *(source2 + 4); - *(destp + 4) ^= *(source3 + 4); - *(destp + 4) ^= *(source4 + 4); - *(destp + 5) ^= *(source1 + 5); - *(destp + 5) ^= *(source2 + 5); - *(destp + 5) ^= *(source3 + 5); - *(destp + 5) ^= *(source4 + 5); - *(destp + 6) ^= *(source1 + 6); - *(destp + 6) ^= *(source2 + 6); - *(destp + 6) ^= *(source3 + 6); - *(destp + 6) ^= *(source4 + 6); - 
*(destp + 7) ^= *(source1 + 7); - *(destp + 7) ^= *(source2 + 7); - *(destp + 7) ^= *(source3 + 7); - *(destp + 7) ^= *(source4 + 7); - source1 += 8; - source2 += 8; - source3 += 8; - source4 += 8; - destp += 8; - } - break; - } -} - -/* - * platform independent RAID5 checksum calculation, this should - * be very fast on any platform that has a decent amount of - * registers. (32 or more) - */ -XORBLOCK_TEMPLATE(32regs) -{ - int size = bh_ptr[0]->b_size; - int lines = size / (sizeof (long)) / 8, i; - long *destp = (long *) bh_ptr[0]->b_data; - long *source1, *source2, *source3, *source4; - - /* LOTS of registers available... - We do explicite loop-unrolling here for code which - favours RISC machines. In fact this is almoast direct - RISC assembly on Alpha and SPARC :-) */ - - - switch(count) { - case 2: - source1 = (long *) bh_ptr[1]->b_data; - for (i = lines; i > 0; i--) { - register long d0, d1, d2, d3, d4, d5, d6, d7; - d0 = destp[0]; /* Pull the stuff into registers */ - d1 = destp[1]; /* ... in bursts, if possible. */ - d2 = destp[2]; - d3 = destp[3]; - d4 = destp[4]; - d5 = destp[5]; - d6 = destp[6]; - d7 = destp[7]; - d0 ^= source1[0]; - d1 ^= source1[1]; - d2 ^= source1[2]; - d3 ^= source1[3]; - d4 ^= source1[4]; - d5 ^= source1[5]; - d6 ^= source1[6]; - d7 ^= source1[7]; - destp[0] = d0; /* Store the result (in burts) */ - destp[1] = d1; - destp[2] = d2; - destp[3] = d3; - destp[4] = d4; /* Store the result (in burts) */ - destp[5] = d5; - destp[6] = d6; - destp[7] = d7; - source1 += 8; - destp += 8; - } - break; - case 3: - source2 = (long *) bh_ptr[2]->b_data; - source1 = (long *) bh_ptr[1]->b_data; - for (i = lines; i > 0; i--) { - register long d0, d1, d2, d3, d4, d5, d6, d7; - d0 = destp[0]; /* Pull the stuff into registers */ - d1 = destp[1]; /* ... in bursts, if possible. */ - d2 = destp[2]; - d3 = destp[3]; - d4 = destp[4]; - d5 = destp[5]; - d6 = destp[6]; - d7 = destp[7]; - d0 ^= source1[0]; - d1 ^= source1[1]; - d2 ^= source1[2]; - d3 ^= source1[3]; - d4 ^= source1[4]; - d5 ^= source1[5]; - d6 ^= source1[6]; - d7 ^= source1[7]; - d0 ^= source2[0]; - d1 ^= source2[1]; - d2 ^= source2[2]; - d3 ^= source2[3]; - d4 ^= source2[4]; - d5 ^= source2[5]; - d6 ^= source2[6]; - d7 ^= source2[7]; - destp[0] = d0; /* Store the result (in burts) */ - destp[1] = d1; - destp[2] = d2; - destp[3] = d3; - destp[4] = d4; /* Store the result (in burts) */ - destp[5] = d5; - destp[6] = d6; - destp[7] = d7; - source1 += 8; - source2 += 8; - destp += 8; - } - break; - case 4: - source3 = (long *) bh_ptr[3]->b_data; - source2 = (long *) bh_ptr[2]->b_data; - source1 = (long *) bh_ptr[1]->b_data; - for (i = lines; i > 0; i--) { - register long d0, d1, d2, d3, d4, d5, d6, d7; - d0 = destp[0]; /* Pull the stuff into registers */ - d1 = destp[1]; /* ... in bursts, if possible. 
*/ - d2 = destp[2]; - d3 = destp[3]; - d4 = destp[4]; - d5 = destp[5]; - d6 = destp[6]; - d7 = destp[7]; - d0 ^= source1[0]; - d1 ^= source1[1]; - d2 ^= source1[2]; - d3 ^= source1[3]; - d4 ^= source1[4]; - d5 ^= source1[5]; - d6 ^= source1[6]; - d7 ^= source1[7]; - d0 ^= source2[0]; - d1 ^= source2[1]; - d2 ^= source2[2]; - d3 ^= source2[3]; - d4 ^= source2[4]; - d5 ^= source2[5]; - d6 ^= source2[6]; - d7 ^= source2[7]; - d0 ^= source3[0]; - d1 ^= source3[1]; - d2 ^= source3[2]; - d3 ^= source3[3]; - d4 ^= source3[4]; - d5 ^= source3[5]; - d6 ^= source3[6]; - d7 ^= source3[7]; - destp[0] = d0; /* Store the result (in burts) */ - destp[1] = d1; - destp[2] = d2; - destp[3] = d3; - destp[4] = d4; /* Store the result (in burts) */ - destp[5] = d5; - destp[6] = d6; - destp[7] = d7; - source1 += 8; - source2 += 8; - source3 += 8; - destp += 8; - } - break; - case 5: - source4 = (long *) bh_ptr[4]->b_data; - source3 = (long *) bh_ptr[3]->b_data; - source2 = (long *) bh_ptr[2]->b_data; - source1 = (long *) bh_ptr[1]->b_data; - for (i = lines; i > 0; i--) { - register long d0, d1, d2, d3, d4, d5, d6, d7; - d0 = destp[0]; /* Pull the stuff into registers */ - d1 = destp[1]; /* ... in bursts, if possible. */ - d2 = destp[2]; - d3 = destp[3]; - d4 = destp[4]; - d5 = destp[5]; - d6 = destp[6]; - d7 = destp[7]; - d0 ^= source1[0]; - d1 ^= source1[1]; - d2 ^= source1[2]; - d3 ^= source1[3]; - d4 ^= source1[4]; - d5 ^= source1[5]; - d6 ^= source1[6]; - d7 ^= source1[7]; - d0 ^= source2[0]; - d1 ^= source2[1]; - d2 ^= source2[2]; - d3 ^= source2[3]; - d4 ^= source2[4]; - d5 ^= source2[5]; - d6 ^= source2[6]; - d7 ^= source2[7]; - d0 ^= source3[0]; - d1 ^= source3[1]; - d2 ^= source3[2]; - d3 ^= source3[3]; - d4 ^= source3[4]; - d5 ^= source3[5]; - d6 ^= source3[6]; - d7 ^= source3[7]; - d0 ^= source4[0]; - d1 ^= source4[1]; - d2 ^= source4[2]; - d3 ^= source4[3]; - d4 ^= source4[4]; - d5 ^= source4[5]; - d6 ^= source4[6]; - d7 ^= source4[7]; - destp[0] = d0; /* Store the result (in burts) */ - destp[1] = d1; - destp[2] = d2; - destp[3] = d3; - destp[4] = d4; /* Store the result (in burts) */ - destp[5] = d5; - destp[6] = d6; - destp[7] = d7; - source1 += 8; - source2 += 8; - source3 += 8; - source4 += 8; - destp += 8; - } - break; - } -} - -/* - * (the -6*32 shift factor colors the cache) - */ -#define SIZE (PAGE_SIZE-6*32) - -static void xor_speed ( struct xor_block_template * func, - struct buffer_head *b1, struct buffer_head *b2) +static void +do_xor_speed(struct xor_block_template *tmpl, void *b1, void *b2) { int speed; unsigned long now; int i, count, max; - struct buffer_head *bh_ptr[6]; - func->next = xor_functions; - xor_functions = func; - bh_ptr[0] = b1; - bh_ptr[1] = b2; + tmpl->next = template_list; + template_list = tmpl; /* - * count the number of XORs done during a whole jiffy. - * calculate the speed of checksumming from this. - * (we use a 2-page allocation to have guaranteed - * color L1-cache layout) + * Count the number of XORs done during a whole jiffy, and use + * this to calculate the speed of checksumming. We use a 2-page + * allocation to have guaranteed color L1-cache layout. 
*/ max = 0; for (i = 0; i < 5; i++) { @@ -2600,7 +82,7 @@ static void xor_speed ( struct xor_block_template * func, count = 0; while (jiffies == now) { mb(); - func->xor_block(2,bh_ptr); + tmpl->do_2(BENCH_SIZE, b1, b2); mb(); count++; mb(); @@ -2609,120 +91,53 @@ static void xor_speed ( struct xor_block_template * func, max = count; } - speed = max * (HZ*SIZE/1024); - func->speed = speed; + speed = max * (HZ * BENCH_SIZE / 1024); + tmpl->speed = speed; - printk( " %-10s: %5d.%03d MB/sec\n", func->name, - speed / 1000, speed % 1000); + printk(" %-10s: %5d.%03d MB/sec\n", tmpl->name, + speed / 1000, speed % 1000); } -static inline void pick_fastest_function(void) +static int +calibrate_xor_block(void) { + void *b1, *b2; struct xor_block_template *f, *fastest; - fastest = xor_functions; - for (f = fastest; f; f = f->next) { - if (f->speed > fastest->speed) - fastest = f; - } -#ifdef CONFIG_X86_XMM - if (cpu_has_xmm) { - /* we force the use of the KNI xor block because it - can write around l2. we may also be able - to load into the l1 only depending on how - the cpu deals with a load to a line that is - being prefetched. - */ - fastest = &t_xor_block_pIII_kni; + b1 = (void *) md__get_free_pages(GFP_KERNEL, 2); + if (! b1) { + printk("raid5: Yikes! No memory available.\n"); + return -ENOMEM; } -#endif -#ifdef __alpha__ - if (implver() == IMPLVER_EV6) { - /* Force the use of alpha_prefetch if EV6, as it - is significantly faster in the cold cache case. */ - fastest = &t_xor_block_alpha_prefetch; - } -#endif - xor_block = fastest->xor_block; - printk( "using fastest function: %s (%d.%03d MB/sec)\n", fastest->name, - fastest->speed / 1000, fastest->speed % 1000); -} - -static struct buffer_head b1, b2; - -void calibrate_xor_block(void) -{ - if (xor_block) - return; - memset(&b1,0,sizeof(b1)); - b2 = b1; - - b1.b_data = (char *) md__get_free_pages(GFP_KERNEL,2); - if (!b1.b_data) { - pick_fastest_function(); - return; - } - b2.b_data = b1.b_data + 2*PAGE_SIZE + SIZE; - - b1.b_size = SIZE; + b2 = b1 + 2*PAGE_SIZE + BENCH_SIZE; printk(KERN_INFO "raid5: measuring checksumming speed\n"); + sti(); - sti(); /* should be safe */ +#define xor_speed(templ) do_xor_speed((templ), b1, b2) -#if defined(__sparc__) && !defined(__sparc_v9__) - printk(KERN_INFO "raid5: trying high-speed SPARC checksum routine\n"); - xor_speed(&t_xor_block_SPARC,&b1,&b2); -#endif + XOR_TRY_TEMPLATES; -#ifdef CONFIG_X86_XMM - if (cpu_has_xmm) { - printk(KERN_INFO - "raid5: KNI detected, trying cache-avoiding KNI checksum routine\n"); - xor_speed(&t_xor_block_pIII_kni,&b1,&b2); - } -#endif /* CONFIG_X86_XMM */ +#undef xor_speed -#ifdef __i386__ - if (md_cpu_has_mmx()) { - printk(KERN_INFO - "raid5: MMX detected, trying high-speed MMX checksum routines\n"); - xor_speed(&t_xor_block_pII_mmx,&b1,&b2); - xor_speed(&t_xor_block_p5_mmx,&b1,&b2); - } -#endif /* __i386__ */ + free_pages((unsigned long)b1, 2); -#ifdef __alpha__ - xor_speed(&t_xor_block_alpha,&b1,&b2); - xor_speed(&t_xor_block_alpha_prefetch,&b1,&b2); -#endif - - xor_speed(&t_xor_block_8regs,&b1,&b2); - xor_speed(&t_xor_block_32regs,&b1,&b2); + fastest = template_list; + for (f = fastest; f; f = f->next) + if (f->speed > fastest->speed) + fastest = f; - free_pages((unsigned long)b1.b_data,2); - pick_fastest_function(); -} +#ifdef XOR_SELECT_TEMPLATE + fastest = XOR_SELECT_TEMPLATE(fastest); +#endif -#else /* __sparc_v9__ */ + active_template = fastest; + printk("raid5: using function: %s (%d.%03d MB/sec)\n", + fastest->name, fastest->speed / 1000, fastest->speed % 1000); 
-void calibrate_xor_block(void) -{ - if (xor_block) - return; - printk(KERN_INFO "raid5: using high-speed VIS checksum routine\n"); - xor_block = xor_block_VIS; + return 0; } -#endif /* __sparc_v9__ */ - MD_EXPORT_SYMBOL(xor_block); -MD_EXPORT_SYMBOL(calibrate_xor_block); -#ifdef MODULE -int init_module(void) -{ - calibrate_xor_block(); - return 0; -} -#endif +module_init(calibrate_xor_block);
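A note on the new interface. The patch replaces compile-time registration of whole xor_block() implementations with a table of per-count routines. The struct behind active_template and template_list is declared in the new <linux/raid/xor.h>, which this page does not show; the sketch below is reconstructed purely from the field accesses visible in the diff (next, name, speed, do_2 through do_5) and may differ in detail from the real header.

	struct xor_block_template {
		struct xor_block_template *next; /* chained by do_xor_speed() */
		char *name;                      /* shown in the benchmark printout */
		int speed;                       /* measured throughput, KB/sec */
		void (*do_2)(unsigned long, unsigned long *, unsigned long *);
		void (*do_3)(unsigned long, unsigned long *, unsigned long *,
			     unsigned long *);
		void (*do_4)(unsigned long, unsigned long *, unsigned long *,
			     unsigned long *, unsigned long *);
		void (*do_5)(unsigned long, unsigned long *, unsigned long *,
			     unsigned long *, unsigned long *, unsigned long *);
	};

With this shape, the new xor_block() above reduces to a dispatch on the buffer count, and the five-way switch that every old routine duplicated disappears from this file.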
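Each architecture now supplies its candidates through <asm/xor.h>: that header defines XOR_TRY_TEMPLATES, which calibrate_xor_block() expands under its local xor_speed() macro to benchmark every candidate, and may define XOR_SELECT_TEMPLATE to override the raw benchmark winner (the role played by the forced KNI and EV6-prefetch choices removed above). The fragment below is a hypothetical minimal header, not kernel source: the names xor_generic_2 and xor_block_generic are invented for illustration, only do_2 is shown, and a real template must provide do_3 through do_5 as well.

	/* One "line" is eight longs, the same unrolling unit the removed
	   8regs/32regs routines used.  BENCH_SIZE and the RAID5 buffer
	   sizes are multiples of this, so no tail handling is needed. */
	static void
	xor_generic_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
	{
		long lines = bytes / (sizeof (long) * 8);

		do {
			p1[0] ^= p2[0];  p1[1] ^= p2[1];
			p1[2] ^= p2[2];  p1[3] ^= p2[3];
			p1[4] ^= p2[4];  p1[5] ^= p2[5];
			p1[6] ^= p2[6];  p1[7] ^= p2[7];
			p1 += 8;  p2 += 8;
		} while (--lines > 0);
	}

	static struct xor_block_template xor_block_generic = {
		name:	"generic",
		do_2:	xor_generic_2,
		/* do_3, do_4, do_5 omitted from this sketch */
	};

	/* Expanded by calibrate_xor_block() with xor_speed() mapped
	   onto do_xor_speed(). */
	#define XOR_TRY_TEMPLATES			\
		do {					\
			xor_speed(&xor_block_generic);	\
		} while (0)

The gcc "label:" initializer style matches what the 2.4 tree used at the time; a header offering several routines simply chains more xor_speed() calls inside XOR_TRY_TEMPLATES.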
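On the units in do_xor_speed(): max counts complete BENCH_SIZE checksums inside one jiffy, so max * (HZ * BENCH_SIZE / 1024) approximates KB per second, which the printk renders as MB/sec by splitting on 1000. As a worked example, assuming the common x86 values of the period (HZ = 100, PAGE_SIZE = 4096), BENCH_SIZE is 4096 - 192 = 3904 bytes and the per-iteration factor is 100 * 3904 / 1024 = 381; a routine managing max = 300 iterations per jiffy therefore scores 300 * 381 = 114300 and prints as 114.300 MB/sec.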