| author | Ralf Baechle <ralf@linux-mips.org> | 2001-01-10 17:17:53 +0000 |
|---|---|---|
| committer | Ralf Baechle <ralf@linux-mips.org> | 2001-01-10 17:17:53 +0000 |
| commit | b2ad5f821b1381492d792ca10b1eb7a107b48f14 (patch) | |
| tree | 954a648692e7da983db1d2470953705f6a729264 /drivers/md | |
| parent | c9c06167e7933d93a6e396174c68abf242294abb (diff) | |
Merge with Linux 2.4.0-prerelease. Big Makefile rewrite, test your
Makefiles.
Diffstat (limited to 'drivers/md')
-rw-r--r-- | drivers/md/Makefile | 17
-rw-r--r-- | drivers/md/lvm-snap.c | 216
-rw-r--r-- | drivers/md/lvm.c | 1398
-rw-r--r-- | drivers/md/raid5.c | 1755
4 files changed, 1883 insertions, 1503 deletions
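The Makefile diff that opens the series is the part the commit message warns about: the old Rules.make bookkeeping (SUB_DIRS, ALL_SUB_DIRS, MOD_SUB_DIRS, and the derived O_OBJS/M_OBJS/MIX_OBJS lists) is deleted, and the file keeps only the `obj-$(CONFIG_...) += foo.o` assignments, which the rewritten 2.4 Rules.make consumes directly. The link-order comment survives because it still matters: the raid personalities and xor.o must precede md.o.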
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index c37ce84db..041b18661 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -3,25 +3,15 @@ #

 O_TARGET := mddev.o
-SUB_DIRS :=
-ALL_SUB_DIRS :=
-MOD_SUB_DIRS :=

 export-objs := md.o xor.o
 list-multi := lvm-mod.o
 lvm-mod-objs := lvm.o lvm-snap.o

-obj-y :=
-obj-m :=
-obj-n :=
-obj- :=
-
 # Note: link order is important. All raid personalities
 # and xor.o must come before md.o, as they each initialise
 # themselves, and md.o may use the personalities when it
 # auto-initialised.
-# The use of MIX_OBJS allows link order to be maintained even
-# though some are export-objs and some aren't.

 obj-$(CONFIG_MD_LINEAR) += linear.o
 obj-$(CONFIG_MD_RAID0) += raid0.o
@@ -30,13 +20,6 @@ obj-$(CONFIG_MD_RAID5) += raid5.o xor.o
 obj-$(CONFIG_BLK_DEV_MD) += md.o
 obj-$(CONFIG_BLK_DEV_LVM) += lvm-mod.o

-# Translate to Rules.make lists.
-active-objs := $(sort $(obj-y) $(obj-m))
-
-O_OBJS := $(obj-y)
-M_OBJS := $(obj-m)
-MIX_OBJS := $(filter $(export-objs), $(active-objs))
-
 include $(TOPDIR)/Rules.make

 lvm-mod.o: $(lvm-mod-objs)
diff --git a/drivers/md/lvm-snap.c b/drivers/md/lvm-snap.c
index 04007c1be..980694ee3 100644
--- a/drivers/md/lvm-snap.c
+++ b/drivers/md/lvm-snap.c
@@ -2,13 +2,14 @@
  * kernel/lvm-snap.c
  *
  * Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE
+ *                    Heinz Mauelshagen, Sistina Software (persistent snapshots)
  *
  * LVM snapshot driver is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
  * the Free Software Foundation; either version 2, or (at your option)
  * any later version.
  *
- * LVM driver is distributed in the hope that it will be useful,
+ * LVM snapshot driver is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
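The next hunk replaces the 0.8 version string and adds lvm_pv_get_number(), which resolves a kdev_t back to the PV number that lvm_snapshot_fill_COW_page() and lvm_write_COW_table_block() later serialize into the on-disk COW table. As posted, the loop falls through to `return vg->pv[p]->pv_number;` even when no PV matched, indexing one slot past the array. A self-contained user-space sketch of the same lookup, with stand-in types (pv_t/vg_t here are simplified, not the kernel structures) and an error return instead of the fall-through:

```c
#include <stddef.h>

typedef unsigned int kdev_t;          /* stand-in for the kernel type */

typedef struct {
	kdev_t pv_dev;                /* device holding this PV */
	unsigned int pv_number;       /* number recorded in the VGDA */
} pv_t;

typedef struct {
	unsigned int pv_max;          /* size of the pv[] slot array */
	pv_t **pv;                    /* sparse: NULL means unused slot */
} vg_t;

/* Return 0 and the PV number via *number, or -1 if rdev is unknown. */
static int pv_get_number(const vg_t *vg, kdev_t rdev, unsigned int *number)
{
	unsigned int p;

	for (p = 0; p < vg->pv_max; p++) {
		if (vg->pv[p] == NULL)
			continue;                  /* skip unused slot */
		if (vg->pv[p]->pv_dev == rdev) {
			*number = vg->pv[p]->pv_number;
			return 0;
		}
	}
	return -1;   /* the patch as posted would index pv[pv_max] here */
}

int main(void)
{
	pv_t a = { 5, 1 };
	pv_t *slots[3] = { NULL, &a, NULL };
	vg_t vg = { 3, slots };
	unsigned int n;

	return (pv_get_number(&vg, 5, &n) == 0 && n == 1) ? 0 : 1;
}
```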
@@ -29,13 +30,27 @@ #include <linux/lvm.h> -static char *lvm_snap_version __attribute__ ((unused)) = "LVM 0.8final (15/02/2000)\n"; +static char *lvm_snap_version __attribute__ ((unused)) = "LVM 0.9 snapshot code (13/11/2000)\n"; extern const char *const lvm_name; extern int lvm_blocksizes[]; void lvm_snapshot_release(lv_t *); +uint lvm_pv_get_number(vg_t * vg, kdev_t rdev) +{ + uint p; + + for ( p = 0; p < vg->pv_max; p++) + { + if ( vg->pv[p] == NULL) continue; + if ( vg->pv[p]->pv_dev == rdev) break; + } + + return vg->pv[p]->pv_number; +} + + #define hashfn(dev,block,mask,chunk_size) \ ((HASHDEV(dev)^((block)/(chunk_size))) & (mask)) @@ -72,9 +87,9 @@ lvm_find_exception_table(kdev_t org_dev, unsigned long org_start, lv_t * lv) return ret; } -static inline void lvm_hash_link(lv_block_exception_t * exception, - kdev_t org_dev, unsigned long org_start, - lv_t * lv) +inline void lvm_hash_link(lv_block_exception_t * exception, + kdev_t org_dev, unsigned long org_start, + lv_t * lv) { struct list_head * hash_table = lv->lv_snapshot_hash_table; unsigned long mask = lv->lv_snapshot_hash_mask; @@ -97,7 +112,6 @@ int lvm_snapshot_remap_block(kdev_t * org_dev, unsigned long * org_sector, pe_adjustment = (*org_sector-pe_off) % chunk_size; __org_start = *org_sector - pe_adjustment; __org_dev = *org_dev; - ret = 0; exception = lvm_find_exception_table(__org_dev, __org_start, lv); if (exception) @@ -109,7 +123,7 @@ int lvm_snapshot_remap_block(kdev_t * org_dev, unsigned long * org_sector, return ret; } -static void lvm_drop_snapshot(lv_t * lv_snap, const char * reason) +void lvm_drop_snapshot(lv_t * lv_snap, const char * reason) { kdev_t last_dev; int i; @@ -118,8 +132,7 @@ static void lvm_drop_snapshot(lv_t * lv_snap, const char * reason) or error on this snapshot --> release it */ invalidate_buffers(lv_snap->lv_dev); - last_dev = 0; - for (i = 0; i < lv_snap->lv_remap_ptr; i++) { + for (i = last_dev = 0; i < lv_snap->lv_remap_ptr; i++) { if ( lv_snap->lv_block_exception[i].rdev_new != last_dev) { last_dev = lv_snap->lv_block_exception[i].rdev_new; invalidate_buffers(last_dev); @@ -149,7 +162,7 @@ static inline void lvm_snapshot_prepare_blocks(unsigned long * blocks, blocks[i] = start++; } -static inline int get_blksize(kdev_t dev) +inline int lvm_get_blksize(kdev_t dev) { int correct_size = BLOCK_SIZE, i, major; @@ -185,6 +198,133 @@ static inline void invalidate_snap_cache(unsigned long start, unsigned long nr, } #endif + +void lvm_snapshot_fill_COW_page(vg_t * vg, lv_t * lv_snap) +{ + int id = 0, is = lv_snap->lv_remap_ptr; + ulong blksize_snap; + lv_COW_table_disk_t * lv_COW_table = + ( lv_COW_table_disk_t *) page_address(lv_snap->lv_COW_table_page); + + if (is == 0) return; + is--; + blksize_snap = lvm_get_blksize(lv_snap->lv_block_exception[is].rdev_new); + is -= is % (blksize_snap / sizeof(lv_COW_table_disk_t)); + + memset(lv_COW_table, 0, blksize_snap); + for ( ; is < lv_snap->lv_remap_ptr; is++, id++) { + /* store new COW_table entry */ + lv_COW_table[id].pv_org_number = LVM_TO_DISK64(lvm_pv_get_number(vg, lv_snap->lv_block_exception[is].rdev_org)); + lv_COW_table[id].pv_org_rsector = LVM_TO_DISK64(lv_snap->lv_block_exception[is].rsector_org); + lv_COW_table[id].pv_snap_number = LVM_TO_DISK64(lvm_pv_get_number(vg, lv_snap->lv_block_exception[is].rdev_new)); + lv_COW_table[id].pv_snap_rsector = LVM_TO_DISK64(lv_snap->lv_block_exception[is].rsector_new); + } +} + + +/* + * writes a COW exception table sector to disk (HM) + * + */ + +int lvm_write_COW_table_block(vg_t * vg, + lv_t * lv_snap) 
+{ + int blksize_snap; + int end_of_table; + int idx = lv_snap->lv_remap_ptr, idx_COW_table; + int nr_pages_tmp; + int length_tmp; + ulong snap_pe_start, COW_table_sector_offset, + COW_entries_per_pe, COW_chunks_per_pe, COW_entries_per_block; + ulong blocks[1]; + const char * reason; + kdev_t snap_phys_dev; + struct kiobuf * iobuf = lv_snap->lv_iobuf; + struct page * page_tmp; + lv_COW_table_disk_t * lv_COW_table = + ( lv_COW_table_disk_t *) page_address(lv_snap->lv_COW_table_page); + + idx--; + + COW_chunks_per_pe = LVM_GET_COW_TABLE_CHUNKS_PER_PE(vg, lv_snap); + COW_entries_per_pe = LVM_GET_COW_TABLE_ENTRIES_PER_PE(vg, lv_snap); + + /* get physical addresse of destination chunk */ + snap_phys_dev = lv_snap->lv_block_exception[idx].rdev_new; + snap_pe_start = lv_snap->lv_block_exception[idx - (idx % COW_entries_per_pe)].rsector_new - lv_snap->lv_chunk_size; + + blksize_snap = lvm_get_blksize(snap_phys_dev); + + COW_entries_per_block = blksize_snap / sizeof(lv_COW_table_disk_t); + idx_COW_table = idx % COW_entries_per_pe % COW_entries_per_block; + + if ( idx_COW_table == 0) memset(lv_COW_table, 0, blksize_snap); + + /* sector offset into the on disk COW table */ + COW_table_sector_offset = (idx % COW_entries_per_pe) / (SECTOR_SIZE / sizeof(lv_COW_table_disk_t)); + + /* COW table block to write next */ + blocks[0] = (snap_pe_start + COW_table_sector_offset) >> (blksize_snap >> 10); + + /* store new COW_table entry */ + lv_COW_table[idx_COW_table].pv_org_number = LVM_TO_DISK64(lvm_pv_get_number(vg, lv_snap->lv_block_exception[idx].rdev_org)); + lv_COW_table[idx_COW_table].pv_org_rsector = LVM_TO_DISK64(lv_snap->lv_block_exception[idx].rsector_org); + lv_COW_table[idx_COW_table].pv_snap_number = LVM_TO_DISK64(lvm_pv_get_number(vg, snap_phys_dev)); + lv_COW_table[idx_COW_table].pv_snap_rsector = LVM_TO_DISK64(lv_snap->lv_block_exception[idx].rsector_new); + + length_tmp = iobuf->length; + iobuf->length = blksize_snap; + page_tmp = iobuf->maplist[0]; + iobuf->maplist[0] = lv_snap->lv_COW_table_page; + nr_pages_tmp = iobuf->nr_pages; + iobuf->nr_pages = 1; + + if (brw_kiovec(WRITE, 1, &iobuf, snap_phys_dev, + blocks, blksize_snap) != blksize_snap) + goto fail_raw_write; + + + /* initialization of next COW exception table block with zeroes */ + end_of_table = idx % COW_entries_per_pe == COW_entries_per_pe - 1; + if (idx_COW_table % COW_entries_per_block == COW_entries_per_block - 1 || end_of_table) + { + /* don't go beyond the end */ + if (idx + 1 >= lv_snap->lv_remap_end) goto good_out; + + memset(lv_COW_table, 0, blksize_snap); + + if (end_of_table) + { + idx++; + snap_phys_dev = lv_snap->lv_block_exception[idx].rdev_new; + snap_pe_start = lv_snap->lv_block_exception[idx - (idx % COW_entries_per_pe)].rsector_new - lv_snap->lv_chunk_size; + blksize_snap = lvm_get_blksize(snap_phys_dev); + blocks[0] = snap_pe_start >> (blksize_snap >> 10); + } else blocks[0]++; + + if (brw_kiovec(WRITE, 1, &iobuf, snap_phys_dev, + blocks, blksize_snap) != blksize_snap) + goto fail_raw_write; + } + + + good_out: + iobuf->length = length_tmp; + iobuf->maplist[0] = page_tmp; + iobuf->nr_pages = nr_pages_tmp; + return 0; + + /* slow path */ + out: + lvm_drop_snapshot(lv_snap, reason); + return 1; + + fail_raw_write: + reason = "write error"; + goto out; +} + /* * copy on write handler for one snapshot logical volume * @@ -200,9 +340,8 @@ int lvm_snapshot_COW(kdev_t org_phys_dev, lv_t * lv_snap) { const char * reason; - unsigned long org_start, snap_start, virt_start, pe_off; + unsigned long org_start, snap_start, 
snap_phys_dev, virt_start, pe_off; int idx = lv_snap->lv_remap_ptr, chunk_size = lv_snap->lv_chunk_size; - kdev_t snap_phys_dev; struct kiobuf * iobuf; unsigned long blocks[KIO_MAX_SECTORS]; int blksize_snap, blksize_org, min_blksize, max_blksize; @@ -238,8 +377,8 @@ int lvm_snapshot_COW(kdev_t org_phys_dev, iobuf = lv_snap->lv_iobuf; - blksize_org = get_blksize(org_phys_dev); - blksize_snap = get_blksize(snap_phys_dev); + blksize_org = lvm_get_blksize(org_phys_dev); + blksize_snap = lvm_get_blksize(snap_phys_dev); max_blksize = max(blksize_org, blksize_snap); min_blksize = min(blksize_org, blksize_snap); max_sectors = KIO_MAX_SECTORS * (min_blksize>>9); @@ -268,7 +407,7 @@ int lvm_snapshot_COW(kdev_t org_phys_dev, } #ifdef DEBUG_SNAPSHOT - /* invalidate the logcial snapshot buffer cache */ + /* invalidate the logical snapshot buffer cache */ invalidate_snap_cache(virt_start, lv_snap->lv_chunk_size, lv_snap->lv_dev); #endif @@ -277,15 +416,20 @@ int lvm_snapshot_COW(kdev_t org_phys_dev, so update the execption table */ lv_snap->lv_block_exception[idx].rdev_org = org_phys_dev; lv_snap->lv_block_exception[idx].rsector_org = org_start; + lvm_hash_link(lv_snap->lv_block_exception + idx, org_phys_dev, org_start, lv_snap); lv_snap->lv_remap_ptr = idx + 1; - return 1; + if (lv_snap->lv_snapshot_use_rate > 0) { + if (lv_snap->lv_remap_ptr * 100 / lv_snap->lv_remap_end >= lv_snap->lv_snapshot_use_rate) + wake_up_interruptible(&lv_snap->lv_snapshot_wait); + } + return 0; /* slow path */ out: lvm_drop_snapshot(lv_snap, reason); - return -1; + return 1; fail_out_of_space: reason = "out of space"; @@ -301,7 +445,7 @@ int lvm_snapshot_COW(kdev_t org_phys_dev, goto out; } -static int lvm_snapshot_alloc_iobuf_pages(struct kiobuf * iobuf, int sectors) +int lvm_snapshot_alloc_iobuf_pages(struct kiobuf * iobuf, int sectors) { int bytes, nr_pages, err, i; @@ -312,33 +456,17 @@ static int lvm_snapshot_alloc_iobuf_pages(struct kiobuf * iobuf, int sectors) goto out; err = -ENOMEM; - iobuf->locked = 1; + iobuf->locked = 0; iobuf->nr_pages = 0; for (i = 0; i < nr_pages; i++) { struct page * page; -#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,3,27) page = alloc_page(GFP_KERNEL); if (!page) goto out; -#else - { - unsigned long addr = __get_free_page(GFP_USER); - if (!addr) - goto out; - iobuf->pagelist[i] = addr; - page = virt_to_page(addr); - } -#endif iobuf->maplist[i] = page; - /* the only point to lock the page here is to be allowed - to share unmap_kiobuf() in the fail-path */ -#ifndef LockPage -#define LockPage(map) set_bit(PG_locked, &(map)->flags) -#endif - LockPage(page); iobuf->nr_pages++; } iobuf->offset = 0; @@ -360,7 +488,7 @@ static int calc_max_buckets(void) return mem; } -static int lvm_snapshot_alloc_hash_table(lv_t * lv) +int lvm_snapshot_alloc_hash_table(lv_t * lv) { int err; unsigned long buckets, max_buckets, size; @@ -380,6 +508,7 @@ static int lvm_snapshot_alloc_hash_table(lv_t * lv) if (!hash) goto out; + lv->lv_snapshot_hash_table_size = size; lv->lv_snapshot_hash_mask = buckets-1; while (buckets--) @@ -407,12 +536,20 @@ int lvm_snapshot_alloc(lv_t * lv_snap) err = lvm_snapshot_alloc_hash_table(lv_snap); if (err) goto out_free_kiovec; + + + lv_snap->lv_COW_table_page = alloc_page(GFP_KERNEL); + if (!lv_snap->lv_COW_table_page) + goto out_free_kiovec; + out: return err; out_free_kiovec: unmap_kiobuf(lv_snap->lv_iobuf); free_kiovec(1, &lv_snap->lv_iobuf); + vfree(lv_snap->lv_snapshot_hash_table); + lv_snap->lv_snapshot_hash_table = NULL; goto out; } @@ -427,10 +564,17 @@ void 
lvm_snapshot_release(lv_t * lv) { vfree(lv->lv_snapshot_hash_table); lv->lv_snapshot_hash_table = NULL; + lv->lv_snapshot_hash_table_size = 0; } if (lv->lv_iobuf) { + unmap_kiobuf(lv->lv_iobuf); free_kiovec(1, &lv->lv_iobuf); lv->lv_iobuf = NULL; } + if (lv->lv_COW_table_page) + { + free_page((ulong)lv->lv_COW_table_page); + lv->lv_COW_table_page = NULL; + } } diff --git a/drivers/md/lvm.c b/drivers/md/lvm.c index f9433232e..ea276c57c 100644 --- a/drivers/md/lvm.c +++ b/drivers/md/lvm.c @@ -1,12 +1,12 @@ /* * kernel/lvm.c * - * Copyright (C) 1997 - 2000 Heinz Mauelshagen, Germany + * Copyright (C) 1997 - 2000 Heinz Mauelshagen, Sistina Software * * February-November 1997 * April-May,July-August,November 1998 * January-March,May,July,September,October 1999 - * January,February 2000 + * January,February,July,September-November 2000 * * * LVM driver is free software; you can redistribute it and/or modify @@ -38,7 +38,7 @@ * lvm_status_byindex_req_t vars * 04/05/1998 - added multiple device support * 08/05/1998 - added support to set/clear extendable flag in volume group - * 09/05/1998 - changed output of lvm_proc_get_info() because of + * 09/05/1998 - changed output of lvm_proc_get_global_info() because of * support for free (eg. longer) logical volume names * 12/05/1998 - added spin_locks (thanks to Pascal van Dam * <pascal@ramoth.xs4all.nl>) @@ -122,18 +122,36 @@ * - avoided "/dev/" in proc filesystem output * - avoided inline strings functions lvm_strlen etc. * 14/02/2000 - support for 2.3.43 - * - integrated Andrea Arcangeli's snapshot code + * - integrated Andrea Arcagneli's snapshot code + * 25/06/2000 - james (chip) , IKKHAYD! roffl + * 26/06/2000 - enhanced lv_extend_reduce for snapshot logical volume support + * 06/09/2000 - added devfs support + * 07/09/2000 - changed IOP version to 9 + * - started to add new char ioctl LV_STATUS_BYDEV_T to support + * getting an lv_t based on the dev_t of the Logical Volume + * 14/09/2000 - enhanced lvm_do_lv_create to upcall VFS functions + * to sync and lock, activate snapshot and unlock the FS + * (to support journaled filesystems) + * 18/09/2000 - hardsector size support + * 27/09/2000 - implemented lvm_do_lv_rename() and lvm_do_vg_rename() + * 30/10/2000 - added Andi Kleen's LV_BMAP ioctl to support LILO + * 01/11/2000 - added memory information on hash tables to + * lvm_proc_get_global_info() + * 02/11/2000 - implemented /proc/lvm/ hierarchy * 07/12/2000 - make sure lvm_make_request_fn returns correct value - 0 or 1 - NeilBrown * */ -static char *lvm_version = "LVM version 0.8final by Heinz Mauelshagen (15/02/2000)\n"; -static char *lvm_short_version = "version 0.8final (15/02/2000)"; +static char *lvm_version = "LVM version 0.9 by Heinz Mauelshagen (13/11/2000)\n"; +static char *lvm_short_version = "version 0.9 (13/11/2000)"; #define MAJOR_NR LVM_BLK_MAJOR #define DEVICE_OFF(device) +/* lvm_do_lv_create calls fsync_dev_lockfs()/unlockfs() */ +/* #define LVM_VFS_ENHANCEMENT */ + #include <linux/config.h> #include <linux/version.h> @@ -166,17 +184,15 @@ static char *lvm_short_version = "version 0.8final (15/02/2000)"; #include <linux/kerneld.h> #endif -#define LOCAL_END_REQUEST - #include <linux/blk.h> #include <linux/blkpg.h> #include <linux/errno.h> #include <linux/lvm.h> -#define LVM_CORRECT_READ_AHEAD(a) \ - (((a) < LVM_MIN_READ_AHEAD || (a) > LVM_MAX_READ_AHEAD) \ - ? 
LVM_MAX_READ_AHEAD : (a)) +#define LVM_CORRECT_READ_AHEAD( a) \ + if ( a < LVM_MIN_READ_AHEAD || \ + a > LVM_MAX_READ_AHEAD) a = LVM_MAX_READ_AHEAD; #ifndef WRITEA # define WRITEA WRITE @@ -195,8 +211,7 @@ extern int lvm_init(void); static void lvm_dummy_device_request(request_queue_t *); #define DEVICE_REQUEST lvm_dummy_device_request -static int lvm_make_request_fn(request_queue_t *, int, struct buffer_head*); -static void lvm_plug_device_noop(request_queue_t *, kdev_t); +static int lvm_make_request_fn(request_queue_t*, int, struct buffer_head*); static int lvm_blk_ioctl(struct inode *, struct file *, uint, ulong); static int lvm_blk_open(struct inode *, struct file *); @@ -205,13 +220,21 @@ static int lvm_chr_open(struct inode *, struct file *); static int lvm_chr_close(struct inode *, struct file *); static int lvm_blk_close(struct inode *, struct file *); +static int lvm_user_bmap(struct inode *, struct lv_bmap *); static int lvm_chr_ioctl(struct inode *, struct file *, uint, ulong); #if defined CONFIG_LVM_PROC_FS && defined CONFIG_PROC_FS -static int lvm_proc_get_info(char *, char **, off_t, int); -static int (*lvm_proc_get_info_ptr) (char *, char **, off_t, int) = -&lvm_proc_get_info; +int lvm_proc_read_vg_info(char *, char **, off_t, int, int *, void *); +int lvm_proc_read_lv_info(char *, char **, off_t, int, int *, void *); +int lvm_proc_read_pv_info(char *, char **, off_t, int, int *, void *); +static int lvm_proc_get_global_info(char *, char **, off_t, int, int *, void *); +void lvm_do_create_proc_entry_of_vg ( vg_t *); +inline void lvm_do_remove_proc_entry_of_vg ( vg_t *); +inline void lvm_do_create_proc_entry_of_lv ( vg_t *, lv_t *); +inline void lvm_do_remove_proc_entry_of_lv ( vg_t *, lv_t *); +inline void lvm_do_create_proc_entry_of_pv ( vg_t *, pv_t *); +inline void lvm_do_remove_proc_entry_of_pv ( vg_t *, pv_t *); #endif #ifdef LVM_HD_NAME @@ -226,10 +249,16 @@ void lvm_hd_name(char *, int); static void lvm_init_vars(void); /* external snapshot calls */ -int lvm_snapshot_remap_block(kdev_t *, ulong *, ulong, lv_t *); -int lvm_snapshot_COW(kdev_t, ulong, ulong, ulong, lv_t *); -int lvm_snapshot_alloc(lv_t *); -void lvm_snapshot_release(lv_t *); +extern inline int lvm_get_blksize(kdev_t); +extern int lvm_snapshot_alloc(lv_t *); +extern void lvm_snapshot_fill_COW_page(vg_t *, lv_t *); +extern int lvm_snapshot_COW(kdev_t, ulong, ulong, ulong, lv_t *); +extern int lvm_snapshot_remap_block(kdev_t *, ulong *, ulong, lv_t *); +extern void lvm_snapshot_release(lv_t *); +extern int lvm_write_COW_table_block(vg_t *, lv_t *); +extern inline void lvm_hash_link(lv_block_exception_t *, kdev_t, ulong, lv_t *); +extern int lvm_snapshot_alloc_hash_table(lv_t *); +extern void lvm_drop_snapshot(lv_t *, char *); #ifdef LVM_HD_NAME extern void (*lvm_hd_name_ptr) (char *, int); @@ -237,21 +266,30 @@ extern void (*lvm_hd_name_ptr) (char *, int); static int lvm_map(struct buffer_head *, int); static int lvm_do_lock_lvm(void); static int lvm_do_le_remap(vg_t *, void *); -static int lvm_do_pe_lock_unlock(vg_t *r, void *); -static int lvm_do_vg_create(int, void *); -static int lvm_do_vg_extend(vg_t *, void *); -static int lvm_do_vg_reduce(vg_t *, void *); -static int lvm_do_vg_remove(int); + +static int lvm_do_pv_create(pv_t *, vg_t *, ulong); +static int lvm_do_pv_remove(vg_t *, ulong); static int lvm_do_lv_create(int, char *, lv_t *); -static int lvm_do_lv_remove(int, char *, int); static int lvm_do_lv_extend_reduce(int, char *, lv_t *); +static int lvm_do_lv_remove(int, char *, int); +static int 
lvm_do_lv_rename(vg_t *, lv_req_t *, lv_t *); static int lvm_do_lv_status_byname(vg_t *r, void *); -static int lvm_do_lv_status_byindex(vg_t *, void *arg); +static int lvm_do_lv_status_byindex(vg_t *, void *); +static int lvm_do_lv_status_bydev(vg_t *, void *); + +static int lvm_do_pe_lock_unlock(vg_t *r, void *); + static int lvm_do_pv_change(vg_t*, void*); static int lvm_do_pv_status(vg_t *, void *); + +static int lvm_do_vg_create(int, void *); +static int lvm_do_vg_extend(vg_t *, void *); +static int lvm_do_vg_reduce(vg_t *, void *); +static int lvm_do_vg_rename(vg_t *, void *); +static int lvm_do_vg_remove(int); static void lvm_geninit(struct gendisk *); #ifdef LVM_GET_INODE -static struct inode *lvm_get_inode(kdev_t); +static struct inode *lvm_get_inode(int); void lvm_clear_inode(struct inode *); #endif /* END Internal function prototypes */ @@ -259,10 +297,19 @@ void lvm_clear_inode(struct inode *); /* volume group descriptor area pointers */ static vg_t *vg[ABS_MAX_VG]; + +#ifdef CONFIG_DEVFS_FS +static devfs_handle_t lvm_devfs_handle; +static devfs_handle_t vg_devfs_handle[MAX_VG]; +static devfs_handle_t ch_devfs_handle[MAX_VG]; +static devfs_handle_t lv_devfs_handle[MAX_LV]; +#endif + static pv_t *pvp = NULL; static lv_t *lvp = NULL; static pe_t *pep = NULL; static pe_t *pep1 = NULL; +static char *basename = NULL; /* map from block minor number to VG and LV numbers */ @@ -287,7 +334,6 @@ static int lvm_reset_spindown = 0; static char pv_name[NAME_LEN]; /* static char rootvg[NAME_LEN] = { 0, }; */ -static uint lv_open = 0; const char *const lvm_name = LVM_NAME; static int lock = 0; static int loadtime = 0; @@ -299,27 +345,31 @@ static DECLARE_WAIT_QUEUE_HEAD(lvm_wait); static DECLARE_WAIT_QUEUE_HEAD(lvm_map_wait); static spinlock_t lvm_lock = SPIN_LOCK_UNLOCKED; +static spinlock_t lvm_snapshot_lock = SPIN_LOCK_UNLOCKED; -static devfs_handle_t lvm_devfs_handle; -static devfs_handle_t vg_devfs_handle[MAX_VG]; -static devfs_handle_t ch_devfs_handle[MAX_VG]; -static devfs_handle_t lv_devfs_handle[MAX_LV]; +#if defined CONFIG_LVM_PROC_FS && defined CONFIG_PROC_FS +static struct proc_dir_entry *lvm_proc_dir = NULL; +static struct proc_dir_entry *lvm_proc_vg_subdir = NULL; +struct proc_dir_entry *pde = NULL; +#endif static struct file_operations lvm_chr_fops = { - owner: THIS_MODULE, open: lvm_chr_open, release: lvm_chr_close, ioctl: lvm_chr_ioctl, }; +#define BLOCK_DEVICE_OPERATIONS +/* block device operations structure needed for 2.3.38? 
and above */ static struct block_device_operations lvm_blk_dops = { open: lvm_blk_open, release: lvm_blk_close, - ioctl: lvm_blk_ioctl + ioctl: lvm_blk_ioctl, }; + /* gendisk structures */ static struct hd_struct lvm_hd_struct[MAX_LV]; static int lvm_blocksizes[MAX_LV] = @@ -364,21 +414,32 @@ int __init lvm_init(void) printk(KERN_ERR "%s -- register_chrdev failed\n", lvm_name); return -EIO; } - if (register_blkdev(MAJOR_NR, lvm_name, &lvm_blk_dops) < 0) { +#ifdef BLOCK_DEVICE_OPERATIONS + if (register_blkdev(MAJOR_NR, lvm_name, &lvm_blk_dops) < 0) +#else + if (register_blkdev(MAJOR_NR, lvm_name, &lvm_blk_fops) < 0) +#endif + { printk("%s -- register_blkdev failed\n", lvm_name); if (unregister_chrdev(LVM_CHAR_MAJOR, lvm_name) < 0) printk(KERN_ERR "%s -- unregister_chrdev failed\n", lvm_name); return -EIO; } +#ifdef CONFIG_DEVFS_FS lvm_devfs_handle = devfs_register( 0 , "lvm", 0, 0, LVM_CHAR_MAJOR, S_IFCHR | S_IRUSR | S_IWUSR | S_IRGRP, &lvm_chr_fops, NULL); +#endif #if defined CONFIG_LVM_PROC_FS && defined CONFIG_PROC_FS - create_proc_info_entry(LVM_NAME, S_IFREG | S_IRUGO, - &proc_root, lvm_proc_get_info_ptr); + lvm_proc_dir = create_proc_entry (LVM_DIR, S_IFDIR, &proc_root); + if (lvm_proc_dir != NULL) { + lvm_proc_vg_subdir = create_proc_entry (LVM_VG_SUBDIR, S_IFDIR, lvm_proc_dir); + pde = create_proc_entry(LVM_GLOBAL, S_IFREG, lvm_proc_dir); + if ( pde != NULL) pde->read_proc = &lvm_proc_get_global_info; + } #endif lvm_init_vars(); @@ -405,7 +466,7 @@ int __init lvm_init(void) blk_init_queue(BLK_DEFAULT_QUEUE(MAJOR_NR), DEVICE_REQUEST); blk_queue_make_request(BLK_DEFAULT_QUEUE(MAJOR_NR), lvm_make_request_fn); - blk_queue_pluggable(BLK_DEFAULT_QUEUE(MAJOR_NR), lvm_plug_device_noop); + /* optional read root VGDA */ /* if ( *rootvg != 0) vg_read_with_pv_and_lv ( rootvg, &vg); @@ -433,7 +494,9 @@ void cleanup_module(void) { struct gendisk *gendisk_ptr = NULL, *gendisk_ptr_prev = NULL; +#ifdef CONFIG_DEVFS_FS devfs_unregister (lvm_devfs_handle); +#endif if (unregister_chrdev(LVM_CHAR_MAJOR, lvm_name) < 0) { printk(KERN_ERR "%s -- unregister_chrdev failed\n", lvm_name); @@ -456,9 +519,12 @@ void cleanup_module(void) blk_size[MAJOR_NR] = NULL; blksize_size[MAJOR_NR] = NULL; + hardsect_size[MAJOR_NR] = NULL; #if defined CONFIG_LVM_PROC_FS && defined CONFIG_PROC_FS - remove_proc_entry(LVM_NAME, &proc_root); + remove_proc_entry(LVM_GLOBAL, lvm_proc_dir); + remove_proc_entry(LVM_VG_SUBDIR, lvm_proc_dir); + remove_proc_entry(LVM_DIR, &proc_root); #endif #ifdef LVM_HD_NAME @@ -486,8 +552,11 @@ void __init lvm_init_vars(void) loadtime = CURRENT_TIME; + lvm_lock = lvm_snapshot_lock = SPIN_LOCK_UNLOCKED; + pe_lock_req.lock = UNLOCK_PE; - pe_lock_req.data.lv_dev = pe_lock_req.data.pv_dev = 0; + pe_lock_req.data.lv_dev = \ + pe_lock_req.data.pv_dev = \ pe_lock_req.data.pv_offset = 0; /* Initialize VG pointers */ @@ -531,6 +600,9 @@ static int lvm_chr_open(struct inode *inode, if (VG_CHR(minor) > MAX_VG) return -ENXIO; lvm_chr_open_count++; + + MOD_INC_USE_COUNT; + return 0; } /* lvm_chr_open() */ @@ -592,7 +664,7 @@ static int lvm_chr_ioctl(struct inode *inode, struct file *file, MOD_INC_USE_COUNT; while (GET_USE_COUNT(&__this_module) > 1) MOD_DEC_USE_COUNT; -#endif /* MODULE */ +#endif /* MODULE */ lock = 0; /* release lock */ wake_up_interruptible(&lvm_wait); return 0; @@ -612,17 +684,21 @@ static int lvm_chr_ioctl(struct inode *inode, struct file *file, /* create a VGDA */ return lvm_do_vg_create(minor, arg); - case VG_REMOVE: - /* remove an inactive VGDA */ - return lvm_do_vg_remove(minor); - case 
VG_EXTEND: /* extend a volume group */ - return lvm_do_vg_extend(vg_ptr,arg); + return lvm_do_vg_extend(vg_ptr, arg); case VG_REDUCE: /* reduce a volume group */ - return lvm_do_vg_reduce(vg_ptr,arg); + return lvm_do_vg_reduce(vg_ptr, arg); + + case VG_RENAME: + /* rename a volume group */ + return lvm_do_vg_rename(vg_ptr, arg); + + case VG_REMOVE: + /* remove an inactive VGDA */ + return lvm_do_vg_remove(minor); case VG_SET_EXTENDABLE: @@ -660,20 +736,22 @@ static int lvm_chr_ioctl(struct inode *inode, struct file *file, /* get volume group count */ for (l = v = 0; v < ABS_MAX_VG; v++) { if (vg[v] != NULL) { - if (copy_to_user(arg + l++ * NAME_LEN, + if (copy_to_user(arg + l * NAME_LEN, vg[v]->vg_name, NAME_LEN) != 0) return -EFAULT; + l++; } } return 0; case LV_CREATE: - case LV_REMOVE: case LV_EXTEND: case LV_REDUCE: - /* create, remove, extend or reduce a logical volume */ + case LV_REMOVE: + case LV_RENAME: + /* create, extend, reduce, remove or rename a logical volume */ if (vg_ptr == NULL) return -ENXIO; if (copy_from_user(&lv_req, arg, sizeof(lv_req)) != 0) return -EFAULT; @@ -686,52 +764,54 @@ static int lvm_chr_ioctl(struct inode *inode, struct file *file, case LV_CREATE: return lvm_do_lv_create(minor, lv_req.lv_name, &lv); - case LV_REMOVE: - return lvm_do_lv_remove(minor, lv_req.lv_name, -1); - case LV_EXTEND: case LV_REDUCE: return lvm_do_lv_extend_reduce(minor, lv_req.lv_name, &lv); + case LV_REMOVE: + return lvm_do_lv_remove(minor, lv_req.lv_name, -1); + + case LV_RENAME: + return lvm_do_lv_rename(vg_ptr, &lv_req, &lv); } + + case LV_STATUS_BYNAME: /* get status of a logical volume by name */ - return lvm_do_lv_status_byname(vg_ptr,arg); + return lvm_do_lv_status_byname(vg_ptr, arg); + case LV_STATUS_BYINDEX: /* get status of a logical volume by index */ - return lvm_do_lv_status_byindex(vg_ptr,arg); + return lvm_do_lv_status_byindex(vg_ptr, arg); + + + case LV_STATUS_BYDEV: + return lvm_do_lv_status_bydev(vg_ptr, arg); + case PV_CHANGE: /* change a physical volume */ return lvm_do_pv_change(vg_ptr,arg); + case PV_STATUS: /* get physical volume data (pv_t structure only) */ return lvm_do_pv_status(vg_ptr,arg); + case PV_FLUSH: /* physical volume buffer flush/invalidate */ if (copy_from_user(&pv_flush_req, arg, sizeof(pv_flush_req)) != 0) return -EFAULT; - for ( v = 0; v < ABS_MAX_VG; v++) { - unsigned int p; - if ( vg[v] == NULL) continue; - for ( p = 0; p < vg[v]->pv_max; p++) { - if ( vg[v]->pv[p] != NULL && - strcmp ( vg[v]->pv[p]->pv_name, - pv_flush_req.pv_name) == 0) { - fsync_dev ( vg[v]->pv[p]->pv_dev); - invalidate_buffers ( vg[v]->pv[p]->pv_dev); - return 0; - } - } - } + fsync_dev(pv_flush_req.pv_dev); + invalidate_buffers(pv_flush_req.pv_dev); return 0; + default: printk(KERN_WARNING "%s -- lvm_chr_ioctl: unknown command %x\n", @@ -754,11 +834,10 @@ static int lvm_chr_close(struct inode *inode, struct file *file) "%s -- lvm_chr_close VG#: %d\n", lvm_name, VG_CHR(minor)); #endif - lock_kernel(); #ifdef LVM_TOTAL_RESET if (lvm_reset_spindown > 0) { lvm_reset_spindown = 0; - lvm_chr_open_count = 1; + lvm_chr_open_count = 0; } #endif @@ -767,7 +846,8 @@ static int lvm_chr_close(struct inode *inode, struct file *file) lock = 0; /* release lock */ wake_up_interruptible(&lvm_wait); } - unlock_kernel(); + + MOD_DEC_USE_COUNT; return 0; } /* lvm_chr_close() */ @@ -815,6 +895,10 @@ static int lvm_blk_open(struct inode *inode, struct file *file) if (!(lv_ptr->lv_access & LV_WRITE)) return -EACCES; } +#ifndef BLOCK_DEVICE_OPERATIONS + file->f_op = &lvm_blk_fops; +#endif + /* 
be sure to increment VG counter */ if (lv_ptr->lv_open == 0) vg_ptr->lv_open++; lv_ptr->lv_open++; @@ -863,7 +947,7 @@ static int lvm_blk_ioctl(struct inode *inode, struct file *file, lvm_name, lv_ptr->lv_size); #endif if (put_user(lv_ptr->lv_size, (long *)arg)) - return -EFAULT; + return -EFAULT; break; @@ -892,7 +976,7 @@ static int lvm_blk_ioctl(struct inode *inode, struct file *file, if ((long) arg < LVM_MIN_READ_AHEAD || (long) arg > LVM_MAX_READ_AHEAD) return -EINVAL; - read_ahead[MAJOR_NR] = lv_ptr->lv_read_ahead = (long) arg; + lv_ptr->lv_read_ahead = (long) arg; break; @@ -944,6 +1028,10 @@ static int lvm_blk_ioctl(struct inode *inode, struct file *file, /* set access flags of a logical volume */ if (!capable(CAP_SYS_ADMIN)) return -EACCES; lv_ptr->lv_access = (ulong) arg; + if ( lv_ptr->lv_access & LV_WRITE) + set_device_ro(lv_ptr->lv_dev, 0); + else + set_device_ro(lv_ptr->lv_dev, 1); break; @@ -955,6 +1043,10 @@ static int lvm_blk_ioctl(struct inode *inode, struct file *file, lv_ptr->lv_status = (ulong) arg; break; + case LV_BMAP: + /* turn logical block into (dev_t, block). non privileged. */ + return lvm_user_bmap(inode, (struct lv_bmap *) arg); + break; case LV_SET_ALLOCATION: /* set allocation flags of a logical volume */ @@ -962,6 +1054,37 @@ static int lvm_blk_ioctl(struct inode *inode, struct file *file, lv_ptr->lv_allocation = (ulong) arg; break; + case LV_SNAPSHOT_USE_RATE: + if (!(lv_ptr->lv_access & LV_SNAPSHOT)) return -EPERM; + { + lv_snapshot_use_rate_req_t lv_snapshot_use_rate_req; + + if (copy_from_user(&lv_snapshot_use_rate_req, arg, + sizeof(lv_snapshot_use_rate_req_t))) + return -EFAULT; + if (lv_snapshot_use_rate_req.rate < 0 || + lv_snapshot_use_rate_req.rate > 100) return -EFAULT; + + switch (lv_snapshot_use_rate_req.block) + { + case 0: + lv_ptr->lv_snapshot_use_rate = lv_snapshot_use_rate_req.rate; + if (lv_ptr->lv_remap_ptr * 100 / lv_ptr->lv_remap_end < lv_ptr->lv_snapshot_use_rate) + interruptible_sleep_on (&lv_ptr->lv_snapshot_wait); + break; + + case O_NONBLOCK: + break; + + default: + return -EFAULT; + } + lv_snapshot_use_rate_req.rate = lv_ptr->lv_remap_ptr * 100 / lv_ptr->lv_remap_end; + if (copy_to_user(arg, &lv_snapshot_use_rate_req, + sizeof(lv_snapshot_use_rate_req_t))) + return -EFAULT; + } + break; default: printk(KERN_WARNING @@ -999,20 +1122,163 @@ static int lvm_blk_close(struct inode *inode, struct file *file) } /* lvm_blk_close() */ +static int lvm_user_bmap(struct inode *inode, struct lv_bmap *user_result) +{ + struct buffer_head bh; + unsigned long block; + int err; + + if (get_user(block, &user_result->lv_block)) + return -EFAULT; + + memset(&bh,0,sizeof bh); + bh.b_rsector = block; + bh.b_dev = bh.b_rdev = inode->i_dev; + bh.b_size = lvm_get_blksize(bh.b_dev); + if ((err=lvm_map(&bh, READ)) < 0) { + printk("lvm map failed: %d\n", err); + return -EINVAL; + } + + return put_user( kdev_t_to_nr(bh.b_rdev), &user_result->lv_dev) || + put_user(bh.b_rsector, &user_result->lv_block) ? 
-EFAULT : 0; +} + + +/* + * provide VG info for proc filesystem use (global) + */ +int lvm_vg_info(vg_t *vg_ptr, char *buf) { + int sz = 0; + char inactive_flag = ' '; + + if (!(vg_ptr->vg_status & VG_ACTIVE)) inactive_flag = 'I'; + sz = sprintf(buf, + "\nVG: %c%s [%d PV, %d LV/%d open] " + " PE Size: %d KB\n" + " Usage [KB/PE]: %d /%d total " + "%d /%d used %d /%d free", + inactive_flag, + vg_ptr->vg_name, + vg_ptr->pv_cur, + vg_ptr->lv_cur, + vg_ptr->lv_open, + vg_ptr->pe_size >> 1, + vg_ptr->pe_size * vg_ptr->pe_total >> 1, + vg_ptr->pe_total, + vg_ptr->pe_allocated * vg_ptr->pe_size >> 1, + vg_ptr->pe_allocated, + (vg_ptr->pe_total - vg_ptr->pe_allocated) * + vg_ptr->pe_size >> 1, + vg_ptr->pe_total - vg_ptr->pe_allocated); + return sz; +} + + +/* + * provide LV info for proc filesystem use (global) + */ +int lvm_lv_info(vg_t *vg_ptr, lv_t *lv_ptr, char *buf) { + int sz = 0; + char inactive_flag = 'A', allocation_flag = ' ', + stripes_flag = ' ', rw_flag = ' '; + + if (!(lv_ptr->lv_status & LV_ACTIVE)) + inactive_flag = 'I'; + rw_flag = 'R'; + if (lv_ptr->lv_access & LV_WRITE) + rw_flag = 'W'; + allocation_flag = 'D'; + if (lv_ptr->lv_allocation & LV_CONTIGUOUS) + allocation_flag = 'C'; + stripes_flag = 'L'; + if (lv_ptr->lv_stripes > 1) + stripes_flag = 'S'; + sz += sprintf(buf+sz, + "[%c%c%c%c", + inactive_flag, + rw_flag, + allocation_flag, + stripes_flag); + if (lv_ptr->lv_stripes > 1) + sz += sprintf(buf+sz, "%-2d", + lv_ptr->lv_stripes); + else + sz += sprintf(buf+sz, " "); + basename = strrchr(lv_ptr->lv_name, '/'); + if ( basename == 0) basename = lv_ptr->lv_name; + else basename++; + sz += sprintf(buf+sz, "] %-25s", basename); + if (strlen(basename) > 25) + sz += sprintf(buf+sz, + "\n "); + sz += sprintf(buf+sz, "%9d /%-6d ", + lv_ptr->lv_size >> 1, + lv_ptr->lv_size / vg_ptr->pe_size); + + if (lv_ptr->lv_open == 0) + sz += sprintf(buf+sz, "close"); + else + sz += sprintf(buf+sz, "%dx open", + lv_ptr->lv_open); + + return sz; +} + + +/* + * provide PV info for proc filesystem use (global) + */ +int lvm_pv_info(pv_t *pv_ptr, char *buf) { + int sz = 0; + char inactive_flag = 'A', allocation_flag = ' '; + char *pv_name = NULL; + + if (!(pv_ptr->pv_status & PV_ACTIVE)) + inactive_flag = 'I'; + allocation_flag = 'A'; + if (!(pv_ptr->pv_allocatable & PV_ALLOCATABLE)) + allocation_flag = 'N'; + pv_name = strrchr(pv_ptr->pv_name+1,'/'); + if ( pv_name == 0) pv_name = pv_ptr->pv_name; + else pv_name++; + sz = sprintf(buf, + "[%c%c] %-21s %8d /%-6d " + "%8d /%-6d %8d /%-6d", + inactive_flag, + allocation_flag, + pv_name, + pv_ptr->pe_total * + pv_ptr->pe_size >> 1, + pv_ptr->pe_total, + pv_ptr->pe_allocated * + pv_ptr->pe_size >> 1, + pv_ptr->pe_allocated, + (pv_ptr->pe_total - + pv_ptr->pe_allocated) * + pv_ptr->pe_size >> 1, + pv_ptr->pe_total - + pv_ptr->pe_allocated); + return sz; +} + + #if defined CONFIG_LVM_PROC_FS && defined CONFIG_PROC_FS /* - * Support function /proc-Filesystem + * Support functions /proc-Filesystem */ + #define LVM_PROC_BUF ( i == 0 ? 
dummy_buf : &buf[sz]) -static int lvm_proc_get_info(char *page, char **start, off_t pos, int count) +/* + * provide global LVM information + */ +static int lvm_proc_get_global_info(char *page, char **start, off_t pos, int count, int *eof, void *data) { int c, i, l, p, v, vg_counter, pv_counter, lv_counter, lv_open_counter, - lv_open_total, pe_t_bytes, lv_block_exception_t_bytes, seconds; + lv_open_total, pe_t_bytes, hash_table_bytes, lv_block_exception_t_bytes, seconds; static off_t sz; off_t sz_last; - char allocation_flag, inactive_flag, rw_flag, stripes_flag; - char *lv_name, *pv_name; static char *buf = NULL; static char dummy_buf[160]; /* sized for 2 lines */ vg_t *vg_ptr; @@ -1022,13 +1288,16 @@ static int lvm_proc_get_info(char *page, char **start, off_t pos, int count) #ifdef DEBUG_LVM_PROC_GET_INFO printk(KERN_DEBUG - "%s - lvm_proc_get_info CALLED pos: %lu count: %d whence: %d\n", + "%s - lvm_proc_get_global_info CALLED pos: %lu count: %d whence: %d\n", lvm_name, pos, count, whence); #endif + MOD_INC_USE_COUNT; + if (pos == 0 || buf == NULL) { sz_last = vg_counter = pv_counter = lv_counter = lv_open_counter = \ - lv_open_total = pe_t_bytes = lv_block_exception_t_bytes = 0; + lv_open_total = pe_t_bytes = hash_table_bytes = \ + lv_block_exception_t_bytes = 0; /* search for activity */ for (v = 0; v < ABS_MAX_VG; v++) { @@ -1040,6 +1309,7 @@ static int lvm_proc_get_info(char *page, char **start, off_t pos, int count) for (l = 0; l < vg[v]->lv_max; l++) { if ((lv_ptr = vg_ptr->lv[l]) != NULL) { pe_t_bytes += lv_ptr->lv_allocated_le; + hash_table_bytes += lv_ptr->lv_snapshot_hash_table_size; if (lv_ptr->lv_block_exception != NULL) lv_block_exception_t_bytes += lv_ptr->lv_remap_end; if (lv_ptr->lv_open > 0) { @@ -1057,9 +1327,11 @@ static int lvm_proc_get_info(char *page, char **start, off_t pos, int count) if (buf != NULL) { #ifdef DEBUG_KFREE printk(KERN_DEBUG - "%s -- kfree %d\n", lvm_name, __LINE__); + "%s -- vfree %d\n", lvm_name, __LINE__); #endif - kfree(buf); + lock_kernel(); + vfree(buf); + unlock_kernel(); buf = NULL; } /* 2 times: first to get size to allocate buffer, @@ -1094,7 +1366,7 @@ static int lvm_proc_get_info(char *page, char **start, off_t pos, int count) vg_counter * sizeof(vg_t) + pv_counter * sizeof(pv_t) + lv_counter * sizeof(lv_t) + - pe_t_bytes + lv_block_exception_t_bytes + sz_last, + pe_t_bytes + hash_table_bytes + lv_block_exception_t_bytes + sz_last, lvm_iop_version); seconds = CURRENT_TIME - loadtime; @@ -1115,26 +1387,7 @@ static int lvm_proc_get_info(char *page, char **start, off_t pos, int count) for (v = 0; v < ABS_MAX_VG; v++) { /* volume group */ if ((vg_ptr = vg[v]) != NULL) { - inactive_flag = ' '; - if (!(vg_ptr->vg_status & VG_ACTIVE)) inactive_flag = 'I'; - sz += sprintf(LVM_PROC_BUF, - "\nVG: %c%s [%d PV, %d LV/%d open] " - " PE Size: %d KB\n" - " Usage [KB/PE]: %d /%d total " - "%d /%d used %d /%d free", - inactive_flag, - vg_ptr->vg_name, - vg_ptr->pv_cur, - vg_ptr->lv_cur, - vg_ptr->lv_open, - vg_ptr->pe_size >> 1, - vg_ptr->pe_size * vg_ptr->pe_total >> 1, - vg_ptr->pe_total, - vg_ptr->pe_allocated * vg_ptr->pe_size >> 1, - vg_ptr->pe_allocated, - (vg_ptr->pe_total - vg_ptr->pe_allocated) * - vg_ptr->pe_size >> 1, - vg_ptr->pe_total - vg_ptr->pe_allocated); + sz += lvm_vg_info(vg_ptr, LVM_PROC_BUF); /* physical volumes */ sz += sprintf(LVM_PROC_BUF, @@ -1143,32 +1396,8 @@ static int lvm_proc_get_info(char *page, char **start, off_t pos, int count) c = 0; for (p = 0; p < vg_ptr->pv_max; p++) { if ((pv_ptr = vg_ptr->pv[p]) != NULL) { - 
inactive_flag = 'A'; - if (!(pv_ptr->pv_status & PV_ACTIVE)) - inactive_flag = 'I'; - allocation_flag = 'A'; - if (!(pv_ptr->pv_allocatable & PV_ALLOCATABLE)) - allocation_flag = 'N'; - pv_name = strchr(pv_ptr->pv_name+1,'/'); - if ( pv_name == 0) pv_name = pv_ptr->pv_name; - else pv_name++; - sz += sprintf(LVM_PROC_BUF, - "[%c%c] %-21s %8d /%-6d " - "%8d /%-6d %8d /%-6d", - inactive_flag, - allocation_flag, - pv_name, - pv_ptr->pe_total * - pv_ptr->pe_size >> 1, - pv_ptr->pe_total, - pv_ptr->pe_allocated * - pv_ptr->pe_size >> 1, - pv_ptr->pe_allocated, - (pv_ptr->pe_total - - pv_ptr->pe_allocated) * - pv_ptr->pe_size >> 1, - pv_ptr->pe_total - - pv_ptr->pe_allocated); + sz += lvm_pv_info(pv_ptr, LVM_PROC_BUF); + c++; if (c < vg_ptr->pv_cur) sz += sprintf(LVM_PROC_BUF, @@ -1181,47 +1410,9 @@ static int lvm_proc_get_info(char *page, char **start, off_t pos, int count) "\n LV%s ", vg_ptr->lv_cur == 1 ? ": " : "s:"); c = 0; - for (l = 0; l < vg[v]->lv_max; l++) { + for (l = 0; l < vg_ptr->lv_max; l++) { if ((lv_ptr = vg_ptr->lv[l]) != NULL) { - inactive_flag = 'A'; - if (!(lv_ptr->lv_status & LV_ACTIVE)) - inactive_flag = 'I'; - rw_flag = 'R'; - if (lv_ptr->lv_access & LV_WRITE) - rw_flag = 'W'; - allocation_flag = 'D'; - if (lv_ptr->lv_allocation & LV_CONTIGUOUS) - allocation_flag = 'C'; - stripes_flag = 'L'; - if (lv_ptr->lv_stripes > 1) - stripes_flag = 'S'; - sz += sprintf(LVM_PROC_BUF, - "[%c%c%c%c", - inactive_flag, - rw_flag, - allocation_flag, - stripes_flag); - if (lv_ptr->lv_stripes > 1) - sz += sprintf(LVM_PROC_BUF, "%-2d", - lv_ptr->lv_stripes); - else - sz += sprintf(LVM_PROC_BUF, " "); - lv_name = strrchr(lv_ptr->lv_name, '/'); - if ( lv_name == 0) lv_name = lv_ptr->lv_name; - else lv_name++; - sz += sprintf(LVM_PROC_BUF, "] %-25s", lv_name); - if (strlen(lv_name) > 25) - sz += sprintf(LVM_PROC_BUF, - "\n "); - sz += sprintf(LVM_PROC_BUF, "%9d /%-6d ", - lv_ptr->lv_size >> 1, - lv_ptr->lv_size / vg[v]->pe_size); - - if (lv_ptr->lv_open == 0) - sz += sprintf(LVM_PROC_BUF, "close"); - else - sz += sprintf(LVM_PROC_BUF, "%dx open", - lv_ptr->lv_open); + sz += lvm_lv_info(vg_ptr, lv_ptr, LVM_PROC_BUF); c++; if (c < vg_ptr->lv_cur) sz += sprintf(LVM_PROC_BUF, @@ -1234,8 +1425,12 @@ static int lvm_proc_get_info(char *page, char **start, off_t pos, int count) } } if (buf == NULL) { - if ((buf = vmalloc(sz)) == NULL) { + lock_kernel(); + buf = vmalloc(sz); + unlock_kernel(); + if (buf == NULL) { sz = 0; + MOD_DEC_USE_COUNT; return sprintf(page, "%s - vmalloc error at line %d\n", lvm_name, __LINE__); } @@ -1243,8 +1438,11 @@ static int lvm_proc_get_info(char *page, char **start, off_t pos, int count) sz_last = sz; } } + MOD_DEC_USE_COUNT; if (pos > sz - 1) { + lock_kernel(); vfree(buf); + unlock_kernel(); buf = NULL; return 0; } @@ -1253,47 +1451,111 @@ static int lvm_proc_get_info(char *page, char **start, off_t pos, int count) return sz - pos; else return count; -} /* lvm_proc_get_info() */ +} /* lvm_proc_get_global_info() */ #endif /* #if defined CONFIG_LVM_PROC_FS && defined CONFIG_PROC_FS */ /* + * provide VG information + */ +int lvm_proc_read_vg_info(char *page, char **start, off_t off, + int count, int *eof, void *data) { + int sz = 0; + vg_t *vg = data; + + sz += sprintf ( page+sz, "name: %s\n", vg->vg_name); + sz += sprintf ( page+sz, "size: %u\n", + vg->pe_total * vg->pe_size / 2); + sz += sprintf ( page+sz, "access: %u\n", vg->vg_access); + sz += sprintf ( page+sz, "status: %u\n", vg->vg_status); + sz += sprintf ( page+sz, "number: %u\n", vg->vg_number); + sz += sprintf ( 
page+sz, "LV max: %u\n", vg->lv_max); + sz += sprintf ( page+sz, "LV current: %u\n", vg->lv_cur); + sz += sprintf ( page+sz, "LV open: %u\n", vg->lv_open); + sz += sprintf ( page+sz, "PV max: %u\n", vg->pv_max); + sz += sprintf ( page+sz, "PV current: %u\n", vg->pv_cur); + sz += sprintf ( page+sz, "PV active: %u\n", vg->pv_act); + sz += sprintf ( page+sz, "PE size: %u\n", vg->pe_size / 2); + sz += sprintf ( page+sz, "PE total: %u\n", vg->pe_total); + sz += sprintf ( page+sz, "PE allocated: %u\n", vg->pe_allocated); + sz += sprintf ( page+sz, "uuid: %s\n", vg->vg_uuid); + + return sz; +} + + +/* + * provide LV information + */ +int lvm_proc_read_lv_info(char *page, char **start, off_t off, + int count, int *eof, void *data) { + int sz = 0; + lv_t *lv = data; + + sz += sprintf ( page+sz, "name: %s\n", lv->lv_name); + sz += sprintf ( page+sz, "size: %u\n", lv->lv_size); + sz += sprintf ( page+sz, "access: %u\n", lv->lv_access); + sz += sprintf ( page+sz, "status: %u\n", lv->lv_status); + sz += sprintf ( page+sz, "number: %u\n", lv->lv_number); + sz += sprintf ( page+sz, "open: %u\n", lv->lv_open); + sz += sprintf ( page+sz, "allocation: %u\n", lv->lv_allocation); + sz += sprintf ( page+sz, "device: %02u:%02u\n", + MAJOR(lv->lv_dev), MINOR(lv->lv_dev)); + + return sz; +} + + +/* + * provide PV information + */ +int lvm_proc_read_pv_info(char *page, char **start, off_t off, + int count, int *eof, void *data) { + int sz = 0; + pv_t *pv = data; + + sz += sprintf ( page+sz, "name: %s\n", pv->pv_name); + sz += sprintf ( page+sz, "size: %u\n", pv->pv_size); + sz += sprintf ( page+sz, "status: %u\n", pv->pv_status); + sz += sprintf ( page+sz, "number: %u\n", pv->pv_number); + sz += sprintf ( page+sz, "allocatable: %u\n", pv->pv_allocatable); + sz += sprintf ( page+sz, "LV current: %u\n", pv->lv_cur); + sz += sprintf ( page+sz, "PE size: %u\n", pv->pe_size / 2); + sz += sprintf ( page+sz, "PE total: %u\n", pv->pe_total); + sz += sprintf ( page+sz, "PE allocated: %u\n", pv->pe_allocated); + sz += sprintf ( page+sz, "device: %02u:%02u\n", + MAJOR(pv->pv_dev), MINOR(pv->pv_dev)); + sz += sprintf ( page+sz, "uuid: %s\n", pv->pv_uuid); + + + return sz; +} + + +/* * block device support function for /usr/src/linux/drivers/block/ll_rw_blk.c * (see init_module/lvm_init) */ static int lvm_map(struct buffer_head *bh, int rw) { - int minor = MINOR(bh->b_rdev); + int minor = MINOR(bh->b_dev); + int ret = 0; ulong index; ulong pe_start; ulong size = bh->b_size >> 9; - ulong rsector_tmp = bh->b_rsector; + ulong rsector_tmp = bh->b_blocknr * size; ulong rsector_sav; - kdev_t rdev_tmp = bh->b_rdev; + kdev_t rdev_tmp = bh->b_dev; kdev_t rdev_sav; - lv_t *lv = vg[VG_BLK(minor)]->lv[LV_BLK(minor)]; + vg_t *vg_this = vg[VG_BLK(minor)]; + lv_t *lv = vg_this->lv[LV_BLK(minor)]; if (!(lv->lv_status & LV_ACTIVE)) { printk(KERN_ALERT "%s - lvm_map: ll_rw_blk for inactive LV %s\n", lvm_name, lv->lv_name); - goto error; - } -/* - if ( lv->lv_access & LV_SNAPSHOT) - printk ( "%s -- %02d:%02d block: %lu rw: %d\n", lvm_name, MAJOR ( bh->b_dev), MINOR ( bh->b_dev), bh->b_blocknr, rw); - */ - - /* take care of snapshot chunk writes before - check for writable logical volume */ - if ((lv->lv_access & LV_SNAPSHOT) && - MAJOR(bh->b_rdev) != 0 && - MAJOR(bh->b_rdev) != MAJOR_NR && - (rw == WRITEA || rw == WRITE)) - { - printk ( "%s -- doing snapshot write for %02d:%02d[%02d:%02d] b_blocknr: %lu b_rsector: %lu\n", lvm_name, MAJOR ( bh->b_dev), MINOR ( bh->b_dev), MAJOR ( bh->b_rdev), MINOR ( bh->b_rdev), bh->b_blocknr, bh->b_rsector); - 
goto error; + return -1; } if ((rw == WRITE || rw == WRITEA) && @@ -1301,7 +1563,7 @@ static int lvm_map(struct buffer_head *bh, int rw) printk(KERN_CRIT "%s - lvm_map: ll_rw_blk write for readonly LV %s\n", lvm_name, lv->lv_name); - goto error; + return -1; } #ifdef DEBUG_MAP printk(KERN_DEBUG @@ -1315,9 +1577,10 @@ static int lvm_map(struct buffer_head *bh, int rw) if (rsector_tmp + size > lv->lv_size) { printk(KERN_ALERT - "%s - lvm_map *rsector: %lu or size: %lu wrong for" - " minor: %2d\n", lvm_name, rsector_tmp, size, minor); - goto error; + "%s - lvm_map access beyond end of device; *rsector: " + "%lu or size: %lu wrong for minor: %2d\n", + lvm_name, rsector_tmp, size, minor); + return -1; } rsector_sav = rsector_tmp; rdev_sav = rdev_tmp; @@ -1326,10 +1589,10 @@ lvm_second_remap: /* linear mapping */ if (lv->lv_stripes < 2) { /* get the index */ - index = rsector_tmp / vg[VG_BLK(minor)]->pe_size; + index = rsector_tmp / vg_this->pe_size; pe_start = lv->lv_current_pe[index].pe; rsector_tmp = lv->lv_current_pe[index].pe + - (rsector_tmp % vg[VG_BLK(minor)]->pe_size); + (rsector_tmp % vg_this->pe_size); rdev_tmp = lv->lv_current_pe[index].dev; #ifdef DEBUG_MAP @@ -1347,7 +1610,7 @@ lvm_second_remap: ulong stripe_index; ulong stripe_length; - stripe_length = vg[VG_BLK(minor)]->pe_size * lv->lv_stripes; + stripe_length = vg_this->pe_size * lv->lv_stripes; stripe_index = (rsector_tmp % stripe_length) / lv->lv_stripesize; index = rsector_tmp / stripe_length + (stripe_index % lv->lv_stripes) * @@ -1379,7 +1642,7 @@ lvm_second_remap: if (rdev_tmp == pe_lock_req.data.pv_dev && rsector_tmp >= pe_lock_req.data.pv_offset && rsector_tmp < (pe_lock_req.data.pv_offset + - vg[VG_BLK(minor)]->pe_size)) { + vg_this->pe_size)) { sleep_on(&lvm_map_wait); rsector_tmp = rsector_sav; rdev_tmp = rdev_sav; @@ -1393,7 +1656,7 @@ lvm_second_remap: lv->lv_current_pe[index].reads++; /* snapshot volume exception handling on physical device address base */ - if (lv->lv_access & (LV_SNAPSHOT | LV_SNAPSHOT_ORG)) { + if (lv->lv_access & (LV_SNAPSHOT|LV_SNAPSHOT_ORG)) { /* original logical volume */ if (lv->lv_access & LV_SNAPSHOT_ORG) { if (rw == WRITE || rw == WRITEA) @@ -1404,6 +1667,8 @@ lvm_second_remap: for (lv_ptr = lv->lv_snapshot_next; lv_ptr != NULL; lv_ptr = lv_ptr->lv_snapshot_next) { + /* Check for inactive snapshot */ + if (!(lv_ptr->lv_status & LV_ACTIVE)) continue; down(&lv->lv_snapshot_org->lv_snapshot_sem); /* do we still have exception storage for this snapshot free? 
*/ if (lv_ptr->lv_block_exception != NULL) { @@ -1414,11 +1679,13 @@ lvm_second_remap: pe_start, lv_ptr)) { /* create a new mapping */ - lvm_snapshot_COW(rdev_tmp, - rsector_tmp, - pe_start, - rsector_sav, - lv_ptr); + if (!(ret = lvm_snapshot_COW(rdev_tmp, + rsector_tmp, + pe_start, + rsector_sav, + lv_ptr))) + ret = lvm_write_COW_table_block(vg_this, + lv_ptr); } rdev_tmp = rdev_sav; rsector_tmp = rsector_sav; @@ -1437,11 +1704,7 @@ lvm_second_remap: bh->b_rdev = rdev_tmp; bh->b_rsector = rsector_tmp; - return 1; - - error: - buffer_IO_error(bh); - return -1; + return ret; } /* lvm_map() */ @@ -1487,7 +1750,9 @@ static void lvm_dummy_device_request(request_queue_t * t) /* * make request function */ -static int lvm_make_request_fn(request_queue_t *q, int rw, struct buffer_head *bh) +static int lvm_make_request_fn(request_queue_t *q, + int rw, + struct buffer_head *bh) { if (lvm_map(bh, rw)<0) return 0; /* failure, buffer_IO_error has been called, don't recurse */ @@ -1495,12 +1760,6 @@ static int lvm_make_request_fn(request_queue_t *q, int rw, struct buffer_head *b return 1; /* all ok, mapping done, call lower level driver */ } -/* - * plug device function is a noop because plugging has to happen - * in the queue of the physical blockdevice to allow the - * elevator to do a better job. - */ -static void lvm_plug_device_noop(request_queue_t *q, kdev_t dev) { } /******************************************************************** * @@ -1563,7 +1822,8 @@ static int lvm_do_pe_lock_unlock(vg_t *vg_ptr, void *arg) case UNLOCK_PE: pe_lock_req.lock = UNLOCK_PE; - pe_lock_req.data.lv_dev = pe_lock_req.data.pv_dev = 0; + pe_lock_req.data.lv_dev = \ + pe_lock_req.data.pv_dev = \ pe_lock_req.data.pv_offset = 0; wake_up(&lvm_map_wait); break; @@ -1593,8 +1853,7 @@ static int lvm_do_le_remap(vg_t *vg_ptr, void *arg) if (lv_ptr != NULL && strcmp(lv_ptr->lv_name, le_remap_req.lv_name) == 0) { - for (le = 0; le < lv_ptr->lv_allocated_le; - le++) { + for (le = 0; le < lv_ptr->lv_allocated_le; le++) { if (lv_ptr->lv_current_pe[le].dev == le_remap_req.old_dev && lv_ptr->lv_current_pe[le].pe == @@ -1618,12 +1877,11 @@ static int lvm_do_le_remap(vg_t *vg_ptr, void *arg) */ int lvm_do_vg_create(int minor, void *arg) { - int snaporg_minor = 0; - ulong l, p; + int ret = 0; + ulong l, ls = 0, p, size; lv_t lv; vg_t *vg_ptr; - pv_t *pv_ptr; - lv_t *lv_ptr; + lv_t **snap_lv_ptr; if (vg[VG_CHR(minor)] != NULL) return -EPERM; @@ -1639,18 +1897,11 @@ int lvm_do_vg_create(int minor, void *arg) return -EFAULT; } - vg_devfs_handle[vg_ptr->vg_number] = devfs_mk_dir(0, vg_ptr->vg_name, NULL); - ch_devfs_handle[vg_ptr->vg_number] = devfs_register( - vg_devfs_handle[vg_ptr->vg_number] , "group", - DEVFS_FL_DEFAULT, LVM_CHAR_MAJOR, vg_ptr->vg_number, - S_IFCHR | S_IRUSR | S_IWUSR | S_IRGRP, - &lvm_chr_fops, NULL); - /* we are not that active so far... 
*/ vg_ptr->vg_status &= ~VG_ACTIVE; vg[VG_CHR(minor)] = vg_ptr; - vg[VG_CHR(minor)]->pe_allocated = 0; + if (vg_ptr->pv_max > ABS_MAX_PV) { printk(KERN_WARNING "%s -- Can't activate VG: ABS_MAX_PV too small\n", @@ -1667,38 +1918,30 @@ int lvm_do_vg_create(int minor, void *arg) vg_ptr = NULL; return -EPERM; } + /* get the physical volume structures */ vg_ptr->pv_act = vg_ptr->pv_cur = 0; for (p = 0; p < vg_ptr->pv_max; p++) { /* user space address */ if ((pvp = vg_ptr->pv[p]) != NULL) { - pv_ptr = vg_ptr->pv[p] = kmalloc(sizeof(pv_t),GFP_KERNEL); - if (pv_ptr == NULL) { - printk(KERN_CRIT - "%s -- VG_CREATE: kmalloc error PV at line %d\n", - lvm_name, __LINE__); + ret = lvm_do_pv_create(pvp, vg_ptr, p); + if ( ret != 0) { lvm_do_vg_remove(minor); - return -ENOMEM; - } - if (copy_from_user(pv_ptr, pvp, sizeof(pv_t)) != 0) { - lvm_do_vg_remove(minor); - return -EFAULT; + return ret; } - /* We don't need the PE list - in kernel space as with LVs pe_t list (see below) */ - pv_ptr->pe = NULL; - pv_ptr->pe_allocated = 0; - pv_ptr->pv_status = PV_ACTIVE; - vg_ptr->pv_act++; - vg_ptr->pv_cur++; - -#ifdef LVM_GET_INODE - /* insert a dummy inode for fs_may_mount */ - pv_ptr->inode = lvm_get_inode(pv_ptr->pv_dev); -#endif } } + size = vg_ptr->lv_max * sizeof(lv_t *); + if ((snap_lv_ptr = vmalloc ( size)) == NULL) { + printk(KERN_CRIT + "%s -- VG_CREATE: vmalloc error snapshot LVs at line %d\n", + lvm_name, __LINE__); + lvm_do_vg_remove(minor); + return -EFAULT; + } + memset(snap_lv_ptr, 0, size); + /* get the logical volume structures */ vg_ptr->lv_cur = 0; for (l = 0; l < vg_ptr->lv_max; l++) { @@ -1708,7 +1951,14 @@ int lvm_do_vg_create(int minor, void *arg) lvm_do_vg_remove(minor); return -EFAULT; } + if ( lv.lv_access & LV_SNAPSHOT) { + snap_lv_ptr[ls] = lvp; + vg_ptr->lv[l] = NULL; + ls++; + continue; + } vg_ptr->lv[l] = NULL; + /* only create original logical volumes for now */ if (lvm_do_lv_create(minor, lv.lv_name, &lv) != 0) { lvm_do_vg_remove(minor); return -EFAULT; @@ -1718,55 +1968,41 @@ int lvm_do_vg_create(int minor, void *arg) /* Second path to correct snapshot logical volumes which are not in place during first path above */ - for (l = 0; l < vg_ptr->lv_max; l++) { - if ((lv_ptr = vg_ptr->lv[l]) != NULL && - vg_ptr->lv[l]->lv_access & LV_SNAPSHOT) { - snaporg_minor = lv_ptr->lv_snapshot_minor; - if (vg_ptr->lv[LV_BLK(snaporg_minor)] != NULL) { - /* get pointer to original logical volume */ - lv_ptr = vg_ptr->lv[l]->lv_snapshot_org = - vg_ptr->lv[LV_BLK(snaporg_minor)]; - - /* set necessary fields of original logical volume */ - lv_ptr->lv_access |= LV_SNAPSHOT_ORG; - lv_ptr->lv_snapshot_minor = 0; - lv_ptr->lv_snapshot_org = lv_ptr; - lv_ptr->lv_snapshot_prev = NULL; + for (l = 0; l < ls; l++) { + lvp = snap_lv_ptr[l]; + if (copy_from_user(&lv, lvp, sizeof(lv_t)) != 0) { + lvm_do_vg_remove(minor); + return -EFAULT; + } + if (lvm_do_lv_create(minor, lv.lv_name, &lv) != 0) { + lvm_do_vg_remove(minor); + return -EFAULT; + } + } - /* find last snapshot logical volume in the chain */ - while (lv_ptr->lv_snapshot_next != NULL) - lv_ptr = lv_ptr->lv_snapshot_next; +#ifdef CONFIG_DEVFS_FS + vg_devfs_handle[vg_ptr->vg_number] = devfs_mk_dir(0, vg_ptr->vg_name, NULL); + ch_devfs_handle[vg_ptr->vg_number] = devfs_register( + vg_devfs_handle[vg_ptr->vg_number] , "group", + DEVFS_FL_DEFAULT, LVM_CHAR_MAJOR, vg_ptr->vg_number, + S_IFCHR | S_IRUSR | S_IWUSR | S_IRGRP, + &lvm_chr_fops, NULL); +#endif - /* set back pointer to this last one in our new logical volume */ - vg_ptr->lv[l]->lv_snapshot_prev 
= lv_ptr; +#if defined CONFIG_LVM_PROC_FS && defined CONFIG_PROC_FS + lvm_do_create_proc_entry_of_vg ( vg_ptr); +#endif - /* last logical volume now points to our new snapshot volume */ - lv_ptr->lv_snapshot_next = vg_ptr->lv[l]; + vfree(snap_lv_ptr); - /* now point to the new one */ - lv_ptr = lv_ptr->lv_snapshot_next; + vg_count++; - /* set necessary fields of new snapshot logical volume */ - lv_ptr->lv_snapshot_next = NULL; - lv_ptr->lv_current_pe = - vg_ptr->lv[LV_BLK(snaporg_minor)]->lv_current_pe; - lv_ptr->lv_allocated_le = - vg_ptr->lv[LV_BLK(snaporg_minor)]->lv_allocated_le; - lv_ptr->lv_current_le = - vg_ptr->lv[LV_BLK(snaporg_minor)]->lv_current_le; - lv_ptr->lv_size = - vg_ptr->lv[LV_BLK(snaporg_minor)]->lv_size; - } - } - } - vg_count++; + MOD_INC_USE_COUNT; /* let's go active */ vg_ptr->vg_status |= VG_ACTIVE; - MOD_INC_USE_COUNT; - return 0; } /* lvm_do_vg_create() */ @@ -1776,26 +2012,18 @@ int lvm_do_vg_create(int minor, void *arg) */ static int lvm_do_vg_extend(vg_t *vg_ptr, void *arg) { + int ret = 0; uint p; pv_t *pv_ptr; if (vg_ptr == NULL) return -ENXIO; if (vg_ptr->pv_cur < vg_ptr->pv_max) { for (p = 0; p < vg_ptr->pv_max; p++) { - if (vg_ptr->pv[p] == NULL) { - if ((pv_ptr = vg_ptr->pv[p] = kmalloc(sizeof(pv_t),GFP_KERNEL)) == NULL) { - printk(KERN_CRIT - "%s -- VG_EXTEND: kmalloc error PV at line %d\n", - lvm_name, __LINE__); - return -ENOMEM; - } - if (copy_from_user(pv_ptr, arg, sizeof(pv_t)) != 0) { - kfree(pv_ptr); - vg_ptr->pv[p] = NULL; - return -EFAULT; - } + if ( ( pv_ptr = vg_ptr->pv[p]) == NULL) { + ret = lvm_do_pv_create(arg, vg_ptr, p); + lvm_do_create_proc_entry_of_pv ( vg_ptr, pv_ptr); + if ( ret != 0) return ret; - pv_ptr->pv_status = PV_ACTIVE; /* We don't need the PE list in kernel space like LVs pe_t list */ pv_ptr->pe = NULL; @@ -1818,8 +2046,7 @@ return -EPERM; /* * character device support function VGDA reduce */ -static int lvm_do_vg_reduce(vg_t *vg_ptr, void *arg) -{ +static int lvm_do_vg_reduce(vg_t *vg_ptr, void *arg) { uint p; pv_t *pv_ptr; @@ -1837,10 +2064,7 @@ static int lvm_do_vg_reduce(vg_t *vg_ptr, void *arg) pv_ptr->pe_total; vg_ptr->pv_cur--; vg_ptr->pv_act--; -#ifdef LVM_GET_INODE - lvm_clear_inode(pv_ptr->inode); -#endif - kfree(pv_ptr); + lvm_do_pv_remove(vg_ptr, p); /* Make PV pointer array contiguous */ for (; p < vg_ptr->pv_max - 1; p++) vg_ptr->pv[p] = vg_ptr->pv[p + 1]; @@ -1853,6 +2077,53 @@ static int lvm_do_vg_reduce(vg_t *vg_ptr, void *arg) /* + * character device support function VG rename + */ +static int lvm_do_vg_rename(vg_t *vg_ptr, void *arg) +{ + int l = 0, p = 0, len = 0; + char vg_name[NAME_LEN] = { 0,}; + char lv_name[NAME_LEN] = { 0,}; + char *ptr = NULL; + lv_t *lv_ptr = NULL; + pv_t *pv_ptr = NULL; + + if (copy_from_user(vg_name, arg, sizeof(vg_name)) != 0) + return -EFAULT; + +#if defined CONFIG_LVM_PROC_FS && defined CONFIG_PROC_FS + lvm_do_remove_proc_entry_of_vg ( vg_ptr); +#endif + + strncpy ( vg_ptr->vg_name, vg_name, sizeof ( vg_name)-1); + for ( l = 0; l < vg_ptr->lv_max; l++) + { + if ((lv_ptr = vg_ptr->lv[l]) == NULL) continue; + strncpy(lv_ptr->vg_name, vg_name, sizeof ( vg_name)); + ptr = strrchr(lv_ptr->lv_name, '/'); + if (ptr == NULL) ptr = lv_ptr->lv_name; + strncpy(lv_name, ptr, sizeof ( lv_name)); + len = sizeof(LVM_DIR_PREFIX); + strcpy(lv_ptr->lv_name, LVM_DIR_PREFIX); + strncat(lv_ptr->lv_name, vg_name, NAME_LEN - len); + len += strlen ( vg_name); + strncat(lv_ptr->lv_name, lv_name, NAME_LEN - len); + } + for ( p = 0; p < vg_ptr->pv_max; p++) + { + if ( (pv_ptr = vg_ptr->pv[p]) == NULL) 
continue;
+		strncpy(pv_ptr->vg_name, vg_name, NAME_LEN);
+	}
+
+#if defined CONFIG_LVM_PROC_FS && defined CONFIG_PROC_FS
+	lvm_do_create_proc_entry_of_vg ( vg_ptr);
+#endif
+
+	return 0;
+} /* lvm_do_vg_rename */
+
+
+/*
  * character device support function VGDA remove
  */
 static int lvm_do_vg_remove(int minor)
@@ -1873,9 +2144,6 @@ static int lvm_do_vg_remove(int minor)
 	/* let's go inactive */
 	vg_ptr->vg_status &= ~VG_ACTIVE;
 
-	devfs_unregister (ch_devfs_handle[vg_ptr->vg_number]);
-	devfs_unregister (vg_devfs_handle[vg_ptr->vg_number]);
-
 	/* free LVs */
 	/* first free snapshot logical volumes */
 	for (i = 0; i < vg_ptr->lv_max; i++) {
@@ -1902,17 +2170,23 @@ static int lvm_do_vg_remove(int minor)
 			printk(KERN_DEBUG
 			       "%s -- kfree %d\n", lvm_name, __LINE__);
 #endif
-#ifdef LVM_GET_INODE
-			lvm_clear_inode(pv_ptr->inode);
-#endif
-			kfree(pv_ptr);
-			vg[VG_CHR(minor)]->pv[i] = NULL;
+			lvm_do_pv_remove(vg_ptr, i);
 		}
 	}
 
+#ifdef CONFIG_DEVFS_FS
+	devfs_unregister (ch_devfs_handle[vg_ptr->vg_number]);
+	devfs_unregister (vg_devfs_handle[vg_ptr->vg_number]);
+#endif
+
+#if defined CONFIG_LVM_PROC_FS && defined CONFIG_PROC_FS
+	lvm_do_remove_proc_entry_of_vg ( vg_ptr);
+#endif
+
 #ifdef DEBUG_KFREE
 	printk(KERN_DEBUG "%s -- kfree %d\n", lvm_name, __LINE__);
 #endif
+
 	kfree(vg_ptr);
 	vg[VG_CHR(minor)] = NULL;
@@ -1925,13 +2199,68 @@
 /*
+ * character device support function physical volume create
+ */
+static int lvm_do_pv_create(pv_t *pvp, vg_t *vg_ptr, ulong p) {
+	pv_t *pv_ptr = NULL;
+
+	pv_ptr = vg_ptr->pv[p] = kmalloc(sizeof(pv_t),GFP_KERNEL);
+	if (pv_ptr == NULL) {
+		printk(KERN_CRIT
+		       "%s -- VG_CREATE: kmalloc error PV at line %d\n",
+		       lvm_name, __LINE__);
+		return -ENOMEM;
+	}
+	if (copy_from_user(pv_ptr, pvp, sizeof(pv_t)) != 0) {
+		return -EFAULT;
+	}
+	/* We don't need the PE list
+	   in kernel space as with LVs pe_t list (see below) */
+	pv_ptr->pe = NULL;
+	pv_ptr->pe_allocated = 0;
+	pv_ptr->pv_status = PV_ACTIVE;
+	vg_ptr->pv_act++;
+	vg_ptr->pv_cur++;
+
+#ifdef LVM_GET_INODE
+	/* insert a dummy inode for fs_may_mount */
+	pv_ptr->inode = lvm_get_inode(pv_ptr->pv_dev);
+#endif
+
+	return 0;
+} /* lvm_do_pv_create() */
+
+
+/*
+ * character device support function physical volume remove
+ */
+static int lvm_do_pv_remove(vg_t *vg_ptr, ulong p) {
+	pv_t *pv_ptr = vg_ptr->pv[p];
+
+#if defined CONFIG_LVM_PROC_FS && defined CONFIG_PROC_FS
+	lvm_do_remove_proc_entry_of_pv ( vg_ptr, pv_ptr);
+#endif
+	vg_ptr->pe_total -=
+		pv_ptr->pe_total;
+	vg_ptr->pv_cur--;
+	vg_ptr->pv_act--;
+#ifdef LVM_GET_INODE
+	lvm_clear_inode(pv_ptr->inode);
+#endif
+	kfree(pv_ptr);
+	vg_ptr->pv[p] = NULL;
+
+	return 0;
+}
+
+
+/*
  * character device support function logical volume create
  */
 static int lvm_do_lv_create(int minor, char *lv_name, lv_t *lv)
 {
-	int l, le, l_new, p, size;
+	int e, ret, l, le, l_new, p, size;
 	ulong lv_status_save;
-	char *lv_tmp, *lv_buf = NULL;
 	lv_block_exception_t *lvbe = lv->lv_block_exception;
 	vg_t *vg_ptr = vg[VG_CHR(minor)];
 	lv_t *lv_ptr = NULL;
@@ -1946,7 +2275,7 @@ static int lvm_do_lv_create(int minor, char *lv_name, lv_t *lv)
 		return -EEXIST;
 	}
 
-	/* in case of lv_remove(), lv_create() pair; for eg.
lvrename does this */ + /* in case of lv_remove(), lv_create() pair */ l_new = -1; if (vg_ptr->lv[lv->lv_number] == NULL) l_new = lv->lv_number; @@ -1957,7 +2286,7 @@ static int lvm_do_lv_create(int minor, char *lv_name, lv_t *lv) } } if (l_new == -1) return -EPERM; - else l = l_new; + else l = l_new; if ((lv_ptr = kmalloc(sizeof(lv_t),GFP_KERNEL)) == NULL) {; printk(KERN_CRIT "%s -- LV_CREATE: kmalloc error LV at line %d\n", @@ -1970,10 +2299,16 @@ static int lvm_do_lv_create(int minor, char *lv_name, lv_t *lv) lv_status_save = lv_ptr->lv_status; lv_ptr->lv_status &= ~LV_ACTIVE; lv_ptr->lv_snapshot_org = \ - lv_ptr->lv_snapshot_prev = \ - lv_ptr->lv_snapshot_next = NULL; + lv_ptr->lv_snapshot_prev = \ + lv_ptr->lv_snapshot_next = NULL; lv_ptr->lv_block_exception = NULL; + lv_ptr->lv_iobuf = NULL; + lv_ptr->lv_snapshot_hash_table = NULL; + lv_ptr->lv_snapshot_hash_table_size = 0; + lv_ptr->lv_snapshot_hash_mask = 0; + lv_ptr->lv_COW_table_page = NULL; init_MUTEX(&lv_ptr->lv_snapshot_sem); + lv_ptr->lv_snapshot_use_rate = 0; vg_ptr->lv[l] = lv_ptr; /* get the PE structures from user space if this @@ -2032,7 +2367,7 @@ static int lvm_do_lv_create(int minor, char *lv_name, lv_t *lv) vg[VG_CHR(minor)]->lv[l] = NULL; return -EFAULT; } - /* get pointer to original logical volume */ + /* point to the original logical volume */ lv_ptr = lv_ptr->lv_snapshot_org; lv_ptr->lv_snapshot_minor = 0; @@ -2043,7 +2378,8 @@ static int lvm_do_lv_create(int minor, char *lv_name, lv_t *lv) lv_ptr = lv_ptr->lv_snapshot_next; /* now lv_ptr points to the last existing snapshot in the chain */ vg_ptr->lv[l]->lv_snapshot_prev = lv_ptr; - /* our new one now back points to the previous last in the chain */ + /* our new one now back points to the previous last in the chain + which can be the original logical volume */ lv_ptr = vg_ptr->lv[l]; /* now lv_ptr points to our new last snapshot logical volume */ lv_ptr->lv_snapshot_org = lv_ptr->lv_snapshot_prev->lv_snapshot_org; @@ -2054,16 +2390,19 @@ static int lvm_do_lv_create(int minor, char *lv_name, lv_t *lv) lv_ptr->lv_size = lv_ptr->lv_snapshot_org->lv_size; lv_ptr->lv_stripes = lv_ptr->lv_snapshot_org->lv_stripes; lv_ptr->lv_stripesize = lv_ptr->lv_snapshot_org->lv_stripesize; + if ((ret = lvm_snapshot_alloc(lv_ptr)) != 0) { - int err = lvm_snapshot_alloc(lv_ptr); - if (err) - { - vfree(lv_ptr->lv_block_exception); - kfree(lv_ptr); - vg[VG_CHR(minor)]->lv[l] = NULL; - return err; - } + vfree(lv_ptr->lv_block_exception); + kfree(lv_ptr); + vg[VG_CHR(minor)]->lv[l] = NULL; + return ret; } + for ( e = 0; e < lv_ptr->lv_remap_ptr; e++) + lvm_hash_link (lv_ptr->lv_block_exception + e, lv_ptr->lv_block_exception[e].rdev_org, lv_ptr->lv_block_exception[e].rsector_org, lv_ptr); + /* need to fill the COW exception table data + into the page for disk i/o */ + lvm_snapshot_fill_COW_page(vg_ptr, lv_ptr); + init_waitqueue_head(&lv_ptr->lv_snapshot_wait); } else { vfree(lv_ptr->lv_block_exception); kfree(lv_ptr); @@ -2083,12 +2422,15 @@ static int lvm_do_lv_create(int minor, char *lv_name, lv_t *lv) lvm_size[MINOR(lv_ptr->lv_dev)] = lv_ptr->lv_size >> 1; vg_lv_map[MINOR(lv_ptr->lv_dev)].vg_number = vg_ptr->vg_number; vg_lv_map[MINOR(lv_ptr->lv_dev)].lv_number = lv_ptr->lv_number; - read_ahead[MAJOR_NR] = lv_ptr->lv_read_ahead = LVM_CORRECT_READ_AHEAD(lv_ptr->lv_read_ahead); + LVM_CORRECT_READ_AHEAD(lv_ptr->lv_read_ahead); vg_ptr->lv_cur++; lv_ptr->lv_status = lv_status_save; - strtok(lv->lv_name, "/"); /* /dev */ +#ifdef CONFIG_DEVFS_FS + { + char *lv_tmp, *lv_buf = NULL; + 
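For readers following the snapshot plumbing in lvm_do_lv_create() above: each snapshot LV is appended to a chain hanging off the original volume through lv_snapshot_org, lv_snapshot_prev and lv_snapshot_next, and the predecessor of the first snapshot is the origin itself, which is why the comment notes that lv_snapshot_prev "can be the original logical volume". A minimal userspace sketch of that append follows; struct lv and its field names are simplified stand-ins for lv_t, not the real structure.

#include <stdio.h>
#include <stddef.h>

/* stand-in for the lv_t snapshot links used in lvm_do_lv_create() */
struct lv {
	const char *name;
	struct lv *snap_org;   /* lv_snapshot_org  */
	struct lv *snap_prev;  /* lv_snapshot_prev */
	struct lv *snap_next;  /* lv_snapshot_next */
};

/* walk to the last snapshot in the chain and append the new one;
 * the predecessor may be the origin itself when the chain is empty */
static void snap_chain_append(struct lv *org, struct lv *snap)
{
	struct lv *last = org;

	while (last->snap_next != NULL)
		last = last->snap_next;
	snap->snap_prev = last;
	last->snap_next = snap;
	snap->snap_next = NULL;
	snap->snap_org = org;
}

int main(void)
{
	struct lv org = { .name = "lvol1" };
	struct lv s1 = { .name = "snap1" }, s2 = { .name = "snap2" };
	struct lv *p;

	snap_chain_append(&org, &s1);
	snap_chain_append(&org, &s2);
	for (p = org.snap_next; p; p = p->snap_next)
		printf("%s -> origin %s\n", p->name, p->snap_org->name);
	return 0;
}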
strtok(lv->lv_name, "/");	/* /dev */
 	while((lv_tmp = strtok(NULL, "/")) != NULL)
 		lv_buf = lv_tmp;
@@ -2097,15 +2439,43 @@ static int lvm_do_lv_create(int minor, char *lv_name, lv_t *lv)
 			    DEVFS_FL_DEFAULT, LVM_BLK_MAJOR, lv->lv_number,
 			    S_IFBLK | S_IRUSR | S_IWUSR | S_IRGRP,
 			    &lvm_blk_dops, NULL);
+	}
+#endif
+
+#if defined CONFIG_LVM_PROC_FS && defined CONFIG_PROC_FS
+	lvm_do_create_proc_entry_of_lv ( vg_ptr, lv_ptr);
+#endif
 
 	/* optionally add our new snapshot LV */
 	if (lv_ptr->lv_access & LV_SNAPSHOT) {
 		/* sync the original logical volume */
 		fsync_dev(lv_ptr->lv_snapshot_org->lv_dev);
+#ifdef	LVM_VFS_ENHANCEMENT
+		/* VFS function call to sync and lock the filesystem */
+		fsync_dev_lockfs(lv_ptr->lv_snapshot_org->lv_dev);
+#endif
+		lv_ptr->lv_snapshot_org->lv_access |= LV_SNAPSHOT_ORG;
+		lv_ptr->lv_access &= ~LV_SNAPSHOT_ORG;
 		/* put ourselves into the chain */
 		lv_ptr->lv_snapshot_prev->lv_snapshot_next = lv_ptr;
-		lv_ptr->lv_snapshot_org->lv_access |= LV_SNAPSHOT_ORG;
 	}
+
+	/* activate the logical volume */
+	lv_ptr->lv_status |= LV_ACTIVE;
+	if ( lv_ptr->lv_access & LV_WRITE)
+		set_device_ro(lv_ptr->lv_dev, 0);
+	else
+		set_device_ro(lv_ptr->lv_dev, 1);
+
+#ifdef	LVM_VFS_ENHANCEMENT
+/* VFS function call to unlock the filesystem */
+	if (lv_ptr->lv_access & LV_SNAPSHOT) {
+		unlockfs(lv_ptr->lv_snapshot_org->lv_dev);
+	}
+#endif
+
+	lv_ptr->vg = vg_ptr;
+
 	return 0;
 } /* lvm_do_lv_create() */
@@ -2176,7 +2546,7 @@ static int lvm_do_lv_remove(int minor, char *lv_name, int l)
 			}
 		}
 		vfree(lv_ptr->lv_current_pe);
-	/* LV_SNAPSHOT */
+		/* LV_SNAPSHOT */
 	} else {
 		/* remove this snapshot logical volume from the chain */
 		lv_ptr->lv_snapshot_prev->lv_snapshot_next = lv_ptr->lv_snapshot_next;
@@ -2190,7 +2560,13 @@ static int lvm_do_lv_remove(int minor, char *lv_name, int l)
 		lvm_snapshot_release(lv_ptr);
 	}
 
+#ifdef CONFIG_DEVFS_FS
 	devfs_unregister(lv_devfs_handle[lv_ptr->lv_number]);
+#endif
+
+#if defined CONFIG_LVM_PROC_FS && defined CONFIG_PROC_FS
+	lvm_do_remove_proc_entry_of_lv ( vg_ptr, lv_ptr);
+#endif
 
 #ifdef DEBUG_KFREE
 	printk(KERN_DEBUG "%s -- kfree %d\n", lvm_name, __LINE__);
@@ -2207,8 +2583,7 @@
  */
 static int lvm_do_lv_extend_reduce(int minor, char *lv_name, lv_t *lv)
 {
-	int l, le, p, size, old_allocated_le;
-	uint32_t end, lv_status_save;
+	ulong end, l, le, p, size, old_allocated_le;
 	vg_t *vg_ptr = vg[VG_CHR(minor)];
 	lv_t *lv_ptr;
 	pe_t *pe;
@@ -2224,12 +2599,75 @@ static int lvm_do_lv_extend_reduce(int minor, char *lv_name, lv_t *lv)
 	lv_ptr = vg_ptr->lv[l];
 
 	/* check for active snapshot */
-	if (lv->lv_access & (LV_SNAPSHOT | LV_SNAPSHOT_ORG)) return -EPERM;
+	if (lv->lv_access & LV_SNAPSHOT)
+	{
+		ulong e;
+		lv_block_exception_t *lvbe, *lvbe_old;
+		struct list_head * lvs_hash_table_old;
+
+		if (lv->lv_block_exception == NULL) return -ENXIO;
+		size = lv->lv_remap_end * sizeof ( lv_block_exception_t);
+		if ((lvbe = vmalloc(size)) == NULL)
+		{
+			printk(KERN_CRIT
+			       "%s -- lvm_do_lv_extend_reduce: vmalloc error LV_BLOCK_EXCEPTION "
+			       "of %lu Byte at line %d\n",
+			       lvm_name, size, __LINE__);
+			return -ENOMEM;
+		}
+		if (lv->lv_remap_end > lv_ptr->lv_remap_end)
+		{
+			if (copy_from_user(lvbe, lv->lv_block_exception, size))
+			{
+				vfree(lvbe);
+				return -EFAULT;
+			}
+		}
+
+		lvbe_old = lv_ptr->lv_block_exception;
+		lvs_hash_table_old = lv_ptr->lv_snapshot_hash_table;
+
+		/* we need to play on the safe side here...
*/ + down(&lv_ptr->lv_snapshot_org->lv_snapshot_sem); + if (lv_ptr->lv_block_exception == NULL || + lv_ptr->lv_remap_ptr > lv_ptr->lv_remap_end) + { + up(&lv_ptr->lv_snapshot_org->lv_snapshot_sem); + vfree(lvbe); + return -EPERM; + } + memcpy(lvbe, + lv_ptr->lv_block_exception, + (lv->lv_remap_end > lv_ptr->lv_remap_end ? lv_ptr->lv_remap_ptr : lv->lv_remap_end) * sizeof(lv_block_exception_t)); + + lv_ptr->lv_block_exception = lvbe; + lv_ptr->lv_remap_end = lv->lv_remap_end; + if (lvm_snapshot_alloc_hash_table(lv_ptr) != 0) + { + lvm_drop_snapshot(lv_ptr, "hash_alloc"); + up(&lv_ptr->lv_snapshot_org->lv_snapshot_sem); + vfree(lvbe_old); + vfree(lvs_hash_table_old); + return 1; + } + + for (e = 0; e < lv_ptr->lv_remap_ptr; e++) + lvm_hash_link (lv_ptr->lv_block_exception + e, lv_ptr->lv_block_exception[e].rdev_org, lv_ptr->lv_block_exception[e].rsector_org, lv_ptr); + + up(&lv_ptr->lv_snapshot_org->lv_snapshot_sem); + + vfree(lvbe_old); + vfree(lvs_hash_table_old); + + return 0; + } + + /* we drop in here in case it is an original logical volume */ if ((pe = vmalloc(size = lv->lv_current_le * sizeof(pe_t))) == NULL) { printk(KERN_CRIT "%s -- lvm_do_lv_extend_reduce: vmalloc error LV_CURRENT_PE " - "of %d Byte at line %d\n", + "of %lu Byte at line %d\n", lvm_name, size, __LINE__); return -ENOMEM; } @@ -2248,11 +2686,6 @@ static int lvm_do_lv_extend_reduce(int minor, char *lv_name, lv_t *lv) vg_ptr->vg_name); #endif - lv_ptr->lv_status |= LV_SPINDOWN; - fsync_dev(lv_ptr->lv_dev); - lv_ptr->lv_status &= ~LV_ACTIVE; - invalidate_buffers(lv_ptr->lv_dev); - /* reduce allocation counters on PV(s) */ for (le = 0; le < lv_ptr->lv_allocated_le; le++) { vg_ptr->pe_allocated--; @@ -2270,19 +2703,29 @@ static int lvm_do_lv_extend_reduce(int minor, char *lv_name, lv_t *lv) pep1 = lv_ptr->lv_current_pe; end = lv_ptr->lv_current_le; - /* save open counter */ - lv_open = lv_ptr->lv_open; + /* save open counter... 
*/
+	lv->lv_open = lv_ptr->lv_open;
+	lv->lv_snapshot_prev = lv_ptr->lv_snapshot_prev;
+	lv->lv_snapshot_next = lv_ptr->lv_snapshot_next;
+	lv->lv_snapshot_org = lv_ptr->lv_snapshot_org;
+
+	lv->lv_current_pe = pe;
 
 	/* save # of old allocated logical extents */
 	old_allocated_le = lv_ptr->lv_allocated_le;
 
+	/* in case of shrinking -> let's flush */
+	if ( end > lv->lv_current_le) fsync_dev(lv_ptr->lv_dev);
+
 	/* copy preloaded LV */
-	lv_status_save = lv->lv_status;
-	lv->lv_status |= LV_SPINDOWN;
-	lv->lv_status &= ~LV_ACTIVE;
 	memcpy((char *) lv_ptr, (char *) lv, sizeof(lv_t));
-	lv_ptr->lv_current_pe = pe;
-	lv_ptr->lv_open = lv_open;
+
+	lvm_gendisk.part[MINOR(lv_ptr->lv_dev)].start_sect = 0;
+	lvm_gendisk.part[MINOR(lv_ptr->lv_dev)].nr_sects = lv_ptr->lv_size;
+	lvm_size[MINOR(lv_ptr->lv_dev)] = lv_ptr->lv_size >> 1;
+	/* vg_lv_map array doesn't have to be changed here */
+
+	LVM_CORRECT_READ_AHEAD(lv_ptr->lv_read_ahead);
 
 	/* save available i/o statistic data */
 	/* linear logical volume */
@@ -2290,8 +2733,8 @@ static int lvm_do_lv_extend_reduce(int minor, char *lv_name, lv_t *lv)
 		/* Check what last LE shall be used */
 		if (end > lv_ptr->lv_current_le) end = lv_ptr->lv_current_le;
 		for (le = 0; le < end; le++) {
-			lv_ptr->lv_current_pe[le].reads = pep1[le].reads;
-			lv_ptr->lv_current_pe[le].writes = pep1[le].writes;
+			lv_ptr->lv_current_pe[le].reads += pep1[le].reads;
+			lv_ptr->lv_current_pe[le].writes += pep1[le].writes;
 		}
 		/* striped logical volume */
 	} else {
@@ -2304,38 +2747,44 @@ static int lvm_do_lv_extend_reduce(int minor, char *lv_name, lv_t *lv)
 		for (i = source = dest = 0; i < lv_ptr->lv_stripes; i++) {
 			for (j = 0; j < end; j++) {
-				lv_ptr->lv_current_pe[dest + j].reads =
+				lv_ptr->lv_current_pe[dest + j].reads +=
 					pep1[source + j].reads;
-				lv_ptr->lv_current_pe[dest + j].writes =
+				lv_ptr->lv_current_pe[dest + j].writes +=
 					pep1[source + j].writes;
 			}
 			source += old_stripe_size;
 			dest += new_stripe_size;
 		}
 	}
 
-	vfree(pep1);
-	pep1 = NULL;
-
 	/* extend the PE count in PVs */
 	for (le = 0; le < lv_ptr->lv_allocated_le; le++) {
 		vg_ptr->pe_allocated++;
 		for (p = 0; p < vg_ptr->pv_cur; p++) {
 			if (vg_ptr->pv[p]->pv_dev ==
-			    vg_ptr->lv[l]->lv_current_pe[le].dev) {
+			    lv_ptr->lv_current_pe[le].dev) {
 				vg_ptr->pv[p]->pe_allocated++;
 				break;
 			}
 		}
 	}
 
-	lvm_gendisk.part[MINOR(lv_ptr->lv_dev)].start_sect = 0;
-	lvm_gendisk.part[MINOR(lv_ptr->lv_dev)].nr_sects = lv_ptr->lv_size;
-	lvm_size[MINOR(lv_ptr->lv_dev)] = lv_ptr->lv_size >> 1;
-	/* vg_lv_map array doesn't have to be changed here */
+	vfree ( pep1);
+	pep1 = NULL;
 
-	read_ahead[MAJOR_NR] = lv_ptr->lv_read_ahead = LVM_CORRECT_READ_AHEAD(lv_ptr->lv_read_ahead);
-	lv_ptr->lv_status = lv_status_save;
+	if (lv->lv_access & LV_SNAPSHOT_ORG)
+	{
+		/* Correct the snapshot size information */
+		while ((lv_ptr = lv_ptr->lv_snapshot_next) != NULL)
+		{
+			lv_ptr->lv_current_pe = lv_ptr->lv_snapshot_org->lv_current_pe;
+			lv_ptr->lv_allocated_le = lv_ptr->lv_snapshot_org->lv_allocated_le;
+			lv_ptr->lv_current_le = lv_ptr->lv_snapshot_org->lv_current_le;
+			lv_ptr->lv_size = lv_ptr->lv_snapshot_org->lv_size;
+			lvm_gendisk.part[MINOR(lv_ptr->lv_dev)].nr_sects = lv_ptr->lv_size;
+			lvm_size[MINOR(lv_ptr->lv_dev)] = lv_ptr->lv_size >> 1;
+		}
+	}
 
 	return 0;
 } /* lvm_do_lv_extend_reduce() */
@@ -2425,6 +2874,65 @@ static int lvm_do_lv_status_byindex(vg_t *vg_ptr,void *arg)
 
 /*
+ * character device support function logical volume status by device number
+ */
+static int lvm_do_lv_status_bydev(vg_t * vg_ptr, void * arg) {
+	int l;
+	lv_status_bydev_req_t lv_status_bydev_req;
+
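The striped branch of lvm_do_lv_extend_reduce() above migrates the per-extent I/O counters across the resize: extent j of stripe i sits at index i*old_stripe_size + j in the old pe_t array and at i*new_stripe_size + j in the new one, and the counters are now accumulated with += rather than overwritten. A standalone sketch of that remap follows; struct pe_stat stands in for the reads/writes fields of pe_t, and the sizes in main() are made up for the demo.

#include <stdio.h>
#include <string.h>

/* stand-in for the reads/writes fields of pe_t */
struct pe_stat { unsigned reads, writes; };

/* carry i/o counters from the old layout to the new one; 'end' is the
 * number of extents per stripe that survive the resize (the smaller of
 * the old and new per-stripe extent counts) */
static void remap_striped_stats(struct pe_stat *new_pe, int new_stripe_size,
				const struct pe_stat *old_pe, int old_stripe_size,
				int stripes, int end)
{
	int i, j, source = 0, dest = 0;

	for (i = 0; i < stripes; i++) {
		for (j = 0; j < end; j++) {
			new_pe[dest + j].reads  += old_pe[source + j].reads;
			new_pe[dest + j].writes += old_pe[source + j].writes;
		}
		source += old_stripe_size;
		dest += new_stripe_size;
	}
}

int main(void)
{
	struct pe_stat old_pe[2 * 2] = { {1,1}, {2,2}, {3,3}, {4,4} };
	struct pe_stat new_pe[2 * 3];	/* 2 stripes grown to 3 extents each */
	int i;

	memset(new_pe, 0, sizeof(new_pe));
	remap_striped_stats(new_pe, 3, old_pe, 2, 2, 2);
	for (i = 0; i < 6; i++)
		printf("pe %d: r=%u w=%u\n", i, new_pe[i].reads, new_pe[i].writes);
	return 0;
}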
+ if (vg_ptr == NULL) return -ENXIO; + if (copy_from_user(&lv_status_bydev_req, arg, + sizeof(lv_status_bydev_req)) != 0) + return -EFAULT; + + for ( l = 0; l < vg_ptr->lv_max; l++) { + if ( vg_ptr->lv[l] == NULL) continue; + if ( vg_ptr->lv[l]->lv_dev == lv_status_bydev_req.dev) break; + } + + if ( l == vg_ptr->lv_max) return -ENXIO; + + if (copy_to_user(lv_status_bydev_req.lv, + vg_ptr->lv[l], sizeof(lv_t)) != 0) + return -EFAULT; + + return 0; +} /* lvm_do_lv_status_bydev() */ + + +/* + * character device support function rename a logical volume + */ +static int lvm_do_lv_rename(vg_t *vg_ptr, lv_req_t *lv_req, lv_t *lv) +{ + int l = 0; + int ret = 0; + lv_t *lv_ptr = NULL; + + for (l = 0; l < vg_ptr->lv_max; l++) + { + if ( (lv_ptr = vg_ptr->lv[l]) == NULL) continue; + if (lv_ptr->lv_dev == lv->lv_dev) + { +#if defined CONFIG_LVM_PROC_FS && defined CONFIG_PROC_FS + lvm_do_remove_proc_entry_of_lv ( vg_ptr, lv_ptr); +#endif + strncpy(lv_ptr->lv_name, + lv_req->lv_name, + NAME_LEN); +#if defined CONFIG_LVM_PROC_FS && defined CONFIG_PROC_FS + lvm_do_create_proc_entry_of_lv ( vg_ptr, lv_ptr); +#endif + break; + } + } + if (l == vg_ptr->lv_max) ret = -ENODEV; + + return ret; +} /* lvm_do_lv_rename */ + + +/* * character device support function physical volume change */ static int lvm_do_pv_change(vg_t *vg_ptr, void *arg) @@ -2494,6 +3002,140 @@ static int lvm_do_pv_status(vg_t *vg_ptr, void *arg) } /* lvm_do_pv_status() */ + +/* + * create a /proc entry for a logical volume + */ +inline void lvm_do_create_proc_entry_of_lv ( vg_t *vg_ptr, lv_t *lv_ptr) { + char *basename; + + if ( vg_ptr->lv_subdir_pde != NULL) { + basename = strrchr(lv_ptr->lv_name, '/'); + if (basename == NULL) basename = lv_ptr->lv_name; + else basename++; + pde = create_proc_entry(basename, S_IFREG, + vg_ptr->lv_subdir_pde); + if ( pde != NULL) { + pde->read_proc = lvm_proc_read_lv_info; + pde->data = lv_ptr; + } + } +} + + +/* + * remove a /proc entry for a logical volume + */ +inline void lvm_do_remove_proc_entry_of_lv ( vg_t *vg_ptr, lv_t *lv_ptr) { + char *basename; + + if ( vg_ptr->lv_subdir_pde != NULL) { + basename = strrchr(lv_ptr->lv_name, '/'); + if (basename == NULL) basename = lv_ptr->lv_name; + else basename++; + remove_proc_entry(basename, vg_ptr->lv_subdir_pde); + } +} + + +/* + * create a /proc entry for a physical volume + */ +inline void lvm_do_create_proc_entry_of_pv ( vg_t *vg_ptr, pv_t *pv_ptr) { + char *basename; + + basename = strrchr(pv_ptr->pv_name, '/'); + if (basename == NULL) basename = pv_ptr->pv_name; + else basename++; + pde = create_proc_entry(basename, S_IFREG, vg_ptr->pv_subdir_pde); + if ( pde != NULL) { + pde->read_proc = lvm_proc_read_pv_info; + pde->data = pv_ptr; + } +} + + +/* + * remove a /proc entry for a physical volume + */ +inline void lvm_do_remove_proc_entry_of_pv ( vg_t *vg_ptr, pv_t *pv_ptr) { + char *basename; + + basename = strrchr(pv_ptr->pv_name, '/'); + if ( vg_ptr->pv_subdir_pde != NULL) { + basename = strrchr(pv_ptr->pv_name, '/'); + if (basename == NULL) basename = pv_ptr->pv_name; + else basename++; + remove_proc_entry(basename, vg_ptr->pv_subdir_pde); + } +} + + +/* + * create a /proc entry for a volume group + */ +#if defined CONFIG_LVM_PROC_FS && defined CONFIG_PROC_FS +void lvm_do_create_proc_entry_of_vg ( vg_t *vg_ptr) { + int l, p; + pv_t *pv_ptr; + lv_t *lv_ptr; + + pde = create_proc_entry(vg_ptr->vg_name, S_IFDIR, + lvm_proc_vg_subdir); + if ( pde != NULL) { + vg_ptr->vg_dir_pde = pde; + pde = create_proc_entry("group", S_IFREG, + vg_ptr->vg_dir_pde); + if ( 
pde != NULL) { + pde->read_proc = lvm_proc_read_vg_info; + pde->data = vg_ptr; + } + vg_ptr->lv_subdir_pde = + create_proc_entry(LVM_LV_SUBDIR, S_IFDIR, + vg_ptr->vg_dir_pde); + vg_ptr->pv_subdir_pde = + create_proc_entry(LVM_PV_SUBDIR, S_IFDIR, + vg_ptr->vg_dir_pde); + } + + if ( vg_ptr->pv_subdir_pde != NULL) { + for ( l = 0; l < vg_ptr->lv_max; l++) { + if ( ( lv_ptr = vg_ptr->lv[l]) == NULL) continue; + lvm_do_create_proc_entry_of_lv ( vg_ptr, lv_ptr); + } + for ( p = 0; p < vg_ptr->pv_max; p++) { + if ( ( pv_ptr = vg_ptr->pv[p]) == NULL) continue; + lvm_do_create_proc_entry_of_pv ( vg_ptr, pv_ptr); + } + } +} + +/* + * remove a /proc entry for a volume group + */ +void lvm_do_remove_proc_entry_of_vg ( vg_t *vg_ptr) { + int l, p; + lv_t *lv_ptr; + pv_t *pv_ptr; + + for ( l = 0; l < vg_ptr->lv_max; l++) { + if ( ( lv_ptr = vg_ptr->lv[l]) == NULL) continue; + lvm_do_remove_proc_entry_of_lv ( vg_ptr, vg_ptr->lv[l]); + } + for ( p = 0; p < vg_ptr->pv_max; p++) { + if ( ( pv_ptr = vg_ptr->pv[p]) == NULL) continue; + lvm_do_remove_proc_entry_of_pv ( vg_ptr, vg_ptr->pv[p]); + } + if ( vg_ptr->vg_dir_pde != NULL) { + remove_proc_entry(LVM_LV_SUBDIR, vg_ptr->vg_dir_pde); + remove_proc_entry(LVM_PV_SUBDIR, vg_ptr->vg_dir_pde); + remove_proc_entry("group", vg_ptr->vg_dir_pde); + remove_proc_entry(vg_ptr->vg_name, lvm_proc_vg_subdir); + } +} +#endif + + /* * support function initialize gendisk variables */ @@ -2516,8 +3158,9 @@ void __init lvm_blocksizes[i] = BLOCK_SIZE; } - blksize_size[MAJOR_NR] = lvm_blocksizes; blk_size[MAJOR_NR] = lvm_size; + blksize_size[MAJOR_NR] = lvm_blocksizes; + hardsect_size[MAJOR_NR] = lvm_blocksizes; return; } /* lvm_gen_init() */ @@ -2533,17 +3176,8 @@ void __init * * Is this the real thing? * - * No, it's bollocks. md.c tries to do a bit different thing that might - * _somewhat_ work eons ago. Neither does any good these days. mount() couldn't - * care less for icache (it cares only for ->s_root->d_count and if we want - * loopback mounts even that will stop). BTW, with the form used here mount() - * would have to scan the _whole_ icache to detect the attempt - how on the - * Earth could it guess the i_ino of your dummy inode? Official line on the - * exclusion between mount()/swapon()/open()/etc. is Just Don't Do It(tm). - * If you can convince Linus that it's worth changing - fine, then you'll need - * to do blkdev_get()/blkdev_put(). Until then... 
*/ -struct inode *lvm_get_inode(kdev_t dev) +struct inode *lvm_get_inode(int dev) { struct inode *inode_this = NULL; diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 55c50c5e7..663dfd395 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -30,12 +30,12 @@ static mdk_personality_t raid5_personality; * Stripe cache */ -#define NR_STRIPES 128 +#define NR_STRIPES 256 #define HASH_PAGES 1 #define HASH_PAGES_ORDER 0 #define NR_HASH (HASH_PAGES * PAGE_SIZE / sizeof(struct stripe_head *)) #define HASH_MASK (NR_HASH - 1) -#define stripe_hash(conf, sect, size) ((conf)->stripe_hashtbl[((sect) / (size >> 9)) & HASH_MASK]) +#define stripe_hash(conf, sect) ((conf)->stripe_hashtbl[((sect) / ((conf)->buffer_size >> 9)) & HASH_MASK]) /* * The following can be used to debug the driver @@ -44,10 +44,8 @@ static mdk_personality_t raid5_personality; #define RAID5_PARANOIA 1 #if RAID5_PARANOIA && CONFIG_SMP # define CHECK_DEVLOCK() if (!spin_is_locked(&conf->device_lock)) BUG() -# define CHECK_SHLOCK(sh) if (!stripe_locked(sh)) BUG() #else # define CHECK_DEVLOCK() -# define CHECK_SHLOCK(unused) #endif #if RAID5_DEBUG @@ -60,196 +58,98 @@ static mdk_personality_t raid5_personality; static void print_raid5_conf (raid5_conf_t *conf); -static inline int stripe_locked(struct stripe_head *sh) +static inline void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh) { - return test_bit(STRIPE_LOCKED, &sh->state); -} - -static void __unlock_stripe(struct stripe_head *sh) -{ - if (!md_test_and_clear_bit(STRIPE_LOCKED, &sh->state)) - BUG(); - PRINTK("unlocking stripe %lu\n", sh->sector); - wake_up(&sh->wait); + if (atomic_dec_and_test(&sh->count)) { + if (!list_empty(&sh->lru)) + BUG(); + if (atomic_read(&conf->active_stripes)==0) + BUG(); + if (test_bit(STRIPE_HANDLE, &sh->state)) { + list_add_tail(&sh->lru, &conf->handle_list); + md_wakeup_thread(conf->thread); + } + else { + list_add_tail(&sh->lru, &conf->inactive_list); + atomic_dec(&conf->active_stripes); + wake_up(&conf->wait_for_stripe); + } + } } - -static void finish_unlock_stripe(struct stripe_head *sh) +static void release_stripe(struct stripe_head *sh) { raid5_conf_t *conf = sh->raid_conf; - sh->cmd = STRIPE_NONE; - sh->phase = PHASE_COMPLETE; - atomic_dec(&conf->nr_pending_stripes); - atomic_inc(&conf->nr_cached_stripes); - __unlock_stripe(sh); - atomic_dec(&sh->count); - wake_up(&conf->wait_for_stripe); + + spin_lock_irq(&conf->device_lock); + __release_stripe(conf, sh); + spin_unlock_irq(&conf->device_lock); } -static void remove_hash(raid5_conf_t *conf, struct stripe_head *sh) +static void remove_hash(struct stripe_head *sh) { PRINTK("remove_hash(), stripe %lu\n", sh->sector); - CHECK_DEVLOCK(); - CHECK_SHLOCK(sh); if (sh->hash_pprev) { if (sh->hash_next) sh->hash_next->hash_pprev = sh->hash_pprev; *sh->hash_pprev = sh->hash_next; sh->hash_pprev = NULL; - atomic_dec(&conf->nr_hashed_stripes); } } -static void lock_get_bh (struct buffer_head *bh) -{ - while (md_test_and_set_bit(BH_Lock, &bh->b_state)) - __wait_on_buffer(bh); - atomic_inc(&bh->b_count); -} - static __inline__ void insert_hash(raid5_conf_t *conf, struct stripe_head *sh) { - struct stripe_head **shp = &stripe_hash(conf, sh->sector, sh->size); + struct stripe_head **shp = &stripe_hash(conf, sh->sector); - PRINTK("insert_hash(), stripe %lu, nr_hashed_stripes %d\n", - sh->sector, atomic_read(&conf->nr_hashed_stripes)); + PRINTK("insert_hash(), stripe %lu\n",sh->sector); CHECK_DEVLOCK(); - CHECK_SHLOCK(sh); if ((sh->hash_next = *shp) != NULL) (*shp)->hash_pprev = 
&sh->hash_next; *shp = sh; sh->hash_pprev = shp; - atomic_inc(&conf->nr_hashed_stripes); } -static struct buffer_head *get_free_buffer(struct stripe_head *sh, int b_size) -{ - struct buffer_head *bh; - unsigned long flags; - - CHECK_SHLOCK(sh); - md_spin_lock_irqsave(&sh->stripe_lock, flags); - bh = sh->buffer_pool; - if (!bh) - goto out_unlock; - sh->buffer_pool = bh->b_next; - bh->b_size = b_size; - if (atomic_read(&bh->b_count)) - BUG(); -out_unlock: - md_spin_unlock_irqrestore(&sh->stripe_lock, flags); - - return bh; -} - -static struct buffer_head *get_free_bh(struct stripe_head *sh) -{ - struct buffer_head *bh; - unsigned long flags; - - CHECK_SHLOCK(sh); - md_spin_lock_irqsave(&sh->stripe_lock, flags); - bh = sh->bh_pool; - if (!bh) - goto out_unlock; - sh->bh_pool = bh->b_next; - if (atomic_read(&bh->b_count)) - BUG(); -out_unlock: - md_spin_unlock_irqrestore(&sh->stripe_lock, flags); - - return bh; -} - -static void put_free_buffer(struct stripe_head *sh, struct buffer_head *bh) -{ - unsigned long flags; - - if (atomic_read(&bh->b_count)) - BUG(); - CHECK_SHLOCK(sh); - md_spin_lock_irqsave(&sh->stripe_lock, flags); - bh->b_next = sh->buffer_pool; - sh->buffer_pool = bh; - md_spin_unlock_irqrestore(&sh->stripe_lock, flags); -} - -static void put_free_bh(struct stripe_head *sh, struct buffer_head *bh) -{ - unsigned long flags; - - if (atomic_read(&bh->b_count)) - BUG(); - CHECK_SHLOCK(sh); - md_spin_lock_irqsave(&sh->stripe_lock, flags); - bh->b_next = sh->bh_pool; - sh->bh_pool = bh; - md_spin_unlock_irqrestore(&sh->stripe_lock, flags); -} +/* find an idle stripe, make sure it is unhashed, and return it. */ static struct stripe_head *get_free_stripe(raid5_conf_t *conf) { - struct stripe_head *sh; + struct stripe_head *sh = NULL; + struct list_head *first; - md_spin_lock_irq(&conf->device_lock); - sh = conf->free_sh_list; - if (!sh) + CHECK_DEVLOCK(); + if (list_empty(&conf->inactive_list)) goto out; - conf->free_sh_list = sh->free_next; - atomic_dec(&conf->nr_free_sh); - if (!atomic_read(&conf->nr_free_sh) && conf->free_sh_list) - BUG(); - if (sh->hash_pprev || md_atomic_read(&sh->nr_pending) || - atomic_read(&sh->count)) - BUG(); + first = conf->inactive_list.next; + sh = list_entry(first, struct stripe_head, lru); + list_del_init(first); + remove_hash(sh); + atomic_inc(&conf->active_stripes); out: - md_spin_unlock_irq(&conf->device_lock); return sh; } -static void __put_free_stripe (raid5_conf_t *conf, struct stripe_head *sh) -{ - if (atomic_read(&sh->count) != 0) - BUG(); - CHECK_DEVLOCK(); - CHECK_SHLOCK(sh); - clear_bit(STRIPE_LOCKED, &sh->state); - sh->free_next = conf->free_sh_list; - conf->free_sh_list = sh; - atomic_inc(&conf->nr_free_sh); -} - static void shrink_buffers(struct stripe_head *sh, int num) { struct buffer_head *bh; + int i; - while (num--) { - bh = get_free_buffer(sh, -1); + for (i=0; i<num ; i++) { + bh = sh->bh_cache[i]; if (!bh) return; + sh->bh_cache[i] = NULL; free_page((unsigned long) bh->b_data); kfree(bh); } } -static void shrink_bh(struct stripe_head *sh, int num) -{ - struct buffer_head *bh; - - while (num--) { - bh = get_free_bh(sh); - if (!bh) - return; - kfree(bh); - } -} - -static int grow_raid5_buffers(struct stripe_head *sh, int num, int b_size, int priority) +static int grow_buffers(struct stripe_head *sh, int num, int b_size, int priority) { struct buffer_head *bh; + int i; - while (num--) { + for (i=0; i<num; i++) { struct page *page; bh = kmalloc(sizeof(struct buffer_head), priority); if (!bh) @@ -262,239 +162,155 @@ static int 
grow_raid5_buffers(struct stripe_head *sh, int num, int b_size, int p kfree(bh); return 1; } - bh->b_size = b_size; atomic_set(&bh->b_count, 0); bh->b_page = page; - put_free_buffer(sh, bh); - } - return 0; -} + sh->bh_cache[i] = bh; -static int grow_bh(struct stripe_head *sh, int num, int priority) -{ - struct buffer_head *bh; - - while (num--) { - bh = kmalloc(sizeof(struct buffer_head), priority); - if (!bh) - return 1; - memset(bh, 0, sizeof (struct buffer_head)); - init_waitqueue_head(&bh->b_wait); - put_free_bh(sh, bh); } return 0; } -static void raid5_free_buffer(struct stripe_head *sh, struct buffer_head *bh) -{ - put_free_buffer(sh, bh); -} +static struct buffer_head *raid5_build_block (struct stripe_head *sh, int i); -static void raid5_free_bh(struct stripe_head *sh, struct buffer_head *bh) -{ - put_free_bh(sh, bh); -} - -static void raid5_free_old_bh(struct stripe_head *sh, int i) -{ - CHECK_SHLOCK(sh); - if (!sh->bh_old[i]) - BUG(); - raid5_free_buffer(sh, sh->bh_old[i]); - sh->bh_old[i] = NULL; -} - -static void raid5_update_old_bh(struct stripe_head *sh, int i) -{ - CHECK_SHLOCK(sh); - PRINTK("stripe %lu, idx %d, updating cache copy\n", sh->sector, i); - if (!sh->bh_copy[i]) - BUG(); - if (sh->bh_old[i]) - raid5_free_old_bh(sh, i); - sh->bh_old[i] = sh->bh_copy[i]; - sh->bh_copy[i] = NULL; -} - -static void free_stripe(struct stripe_head *sh) +static inline void init_stripe(struct stripe_head *sh, unsigned long sector) { raid5_conf_t *conf = sh->raid_conf; - int disks = conf->raid_disks, j; + int disks = conf->raid_disks, i; if (atomic_read(&sh->count) != 0) BUG(); + if (test_bit(STRIPE_HANDLE, &sh->state)) + BUG(); + CHECK_DEVLOCK(); - CHECK_SHLOCK(sh); - PRINTK("free_stripe called, stripe %lu\n", sh->sector); - if (sh->phase != PHASE_COMPLETE || atomic_read(&sh->count)) { - PRINTK("raid5: free_stripe(), sector %lu, phase %d, count %d\n", sh->sector, sh->phase, atomic_read(&sh->count)); - return; - } - for (j = 0; j < disks; j++) { - if (sh->bh_old[j]) - raid5_free_old_bh(sh, j); - if (sh->bh_new[j] || sh->bh_copy[j]) - BUG(); - } - remove_hash(conf, sh); - __put_free_stripe(conf, sh); -} + PRINTK("init_stripe called, stripe %lu\n", sh->sector); -static int shrink_stripe_cache(raid5_conf_t *conf, int nr) -{ - struct stripe_head *sh; - int i, count = 0; - - PRINTK("shrink_stripe_cache called, %d/%d, clock %d\n", nr, atomic_read(&conf->nr_hashed_stripes), conf->clock); - md_spin_lock_irq(&conf->device_lock); - for (i = 0; i < NR_HASH; i++) { - sh = conf->stripe_hashtbl[(i + conf->clock) & HASH_MASK]; - for (; sh; sh = sh->hash_next) { - if (sh->phase != PHASE_COMPLETE) - continue; - if (atomic_read(&sh->count)) - continue; - /* - * Try to lock this stripe: - */ - if (md_test_and_set_bit(STRIPE_LOCKED, &sh->state)) - continue; - free_stripe(sh); - if (++count == nr) { - conf->clock = (i + conf->clock) & HASH_MASK; - goto out; - } + remove_hash(sh); + + sh->sector = sector; + sh->size = conf->buffer_size; + sh->state = 0; + + for (i=disks; i--; ) { + if (sh->bh_read[i] || sh->bh_write[i] || sh->bh_written[i] || + buffer_locked(sh->bh_cache[i])) { + printk("sector=%lx i=%d %p %p %p %d\n", + sh->sector, i, sh->bh_read[i], + sh->bh_write[i], sh->bh_written[i], + buffer_locked(sh->bh_cache[i])); + BUG(); } + clear_bit(BH_Uptodate, &sh->bh_cache[i]->b_state); + raid5_build_block(sh, i); } -out: - md_spin_unlock_irq(&conf->device_lock); - PRINTK("shrink completed, nr_hashed_stripes %d, nr_pending_strips %d\n", - atomic_read(&conf->nr_hashed_stripes), - 
atomic_read(&conf->nr_pending_stripes)); - return count; + insert_hash(conf, sh); } -void __wait_lock_stripe(struct stripe_head *sh) +/* the buffer size has changed, so unhash all stripes + * as active stripes complete, they will go onto inactive list + */ +static void shrink_stripe_cache(raid5_conf_t *conf) { - MD_DECLARE_WAITQUEUE(wait, current); - - PRINTK("wait_lock_stripe %lu\n", sh->sector); - if (!atomic_read(&sh->count)) + int i; + CHECK_DEVLOCK(); + if (atomic_read(&conf->active_stripes)) BUG(); - add_wait_queue(&sh->wait, &wait); -repeat: - set_current_state(TASK_UNINTERRUPTIBLE); - if (md_test_and_set_bit(STRIPE_LOCKED, &sh->state)) { - schedule(); - goto repeat; + for (i=0; i < NR_HASH; i++) { + struct stripe_head *sh; + while ((sh = conf->stripe_hashtbl[i])) + remove_hash(sh); } - PRINTK("wait_lock_stripe %lu done\n", sh->sector); - remove_wait_queue(&sh->wait, &wait); - current->state = TASK_RUNNING; } -static struct stripe_head *__find_stripe(raid5_conf_t *conf, unsigned long sector, int size) +static struct stripe_head *__find_stripe(raid5_conf_t *conf, unsigned long sector) { struct stripe_head *sh; + CHECK_DEVLOCK(); PRINTK("__find_stripe, sector %lu\n", sector); - for (sh = stripe_hash(conf, sector, size); sh; sh = sh->hash_next) { - if (sh->sector == sector && sh->raid_conf == conf) { - if (sh->size != size) - BUG(); + for (sh = stripe_hash(conf, sector); sh; sh = sh->hash_next) + if (sh->sector == sector) return sh; - } - } PRINTK("__stripe %lu not in cache\n", sector); return NULL; } -static inline struct stripe_head *alloc_stripe(raid5_conf_t *conf, unsigned long sector, int size) +static struct stripe_head *get_active_stripe(raid5_conf_t *conf, unsigned long sector, int size, int noblock) { struct stripe_head *sh; - struct buffer_head *buffer_pool, *bh_pool; - MD_DECLARE_WAITQUEUE(wait, current); - - PRINTK("alloc_stripe called\n"); - - - while ((sh = get_free_stripe(conf)) == NULL) { - int cnt; - add_wait_queue(&conf->wait_for_stripe, &wait); - set_current_state(TASK_UNINTERRUPTIBLE); - cnt = shrink_stripe_cache(conf, conf->max_nr_stripes / 8); - sh = get_free_stripe(conf); - if (!sh && cnt < (conf->max_nr_stripes/8)) { - md_wakeup_thread(conf->thread); - PRINTK("waiting for some stripes to complete - %d %d\n", cnt, conf->max_nr_stripes/8); - schedule(); - } - remove_wait_queue(&conf->wait_for_stripe, &wait); - current->state = TASK_RUNNING; - if (sh) - break; - } - buffer_pool = sh->buffer_pool; - bh_pool = sh->bh_pool; - memset(sh, 0, sizeof(*sh)); - sh->stripe_lock = MD_SPIN_LOCK_UNLOCKED; - md_init_waitqueue_head(&sh->wait); - sh->buffer_pool = buffer_pool; - sh->bh_pool = bh_pool; - sh->phase = PHASE_COMPLETE; - sh->cmd = STRIPE_NONE; - sh->raid_conf = conf; - sh->sector = sector; - sh->size = size; - atomic_inc(&conf->nr_cached_stripes); - - return sh; -} + PRINTK("get_stripe, sector %lu\n", sector); -static struct stripe_head *get_lock_stripe(raid5_conf_t *conf, unsigned long sector, int size) -{ - struct stripe_head *sh, *new = NULL; + md_spin_lock_irq(&conf->device_lock); - PRINTK("get_stripe, sector %lu\n", sector); + do { + if (conf->buffer_size == 0 || + (size && size != conf->buffer_size)) { + /* either the size is being changed (buffer_size==0) or + * we need to change it. + * If size==0, we can proceed as soon as buffer_size gets set. + * If size>0, we can proceed when active_stripes reaches 0, or + * when someone else sets the buffer_size to size. 
+ * If someone sets the buffer size to something else, we will need to + * assert that we want to change it again + */ + int oldsize = conf->buffer_size; + PRINTK("get_stripe %ld/%d buffer_size is %d, %d active\n", sector, size, conf->buffer_size, atomic_read(&conf->active_stripes)); + if (size==0) + wait_event_lock_irq(conf->wait_for_stripe, + conf->buffer_size, + conf->device_lock); + else { + while (conf->buffer_size != size && atomic_read(&conf->active_stripes)) { + conf->buffer_size = 0; + wait_event_lock_irq(conf->wait_for_stripe, + atomic_read(&conf->active_stripes)==0 || conf->buffer_size, + conf->device_lock); + PRINTK("waited and now %ld/%d buffer_size is %d - %d active\n", sector, size, + conf->buffer_size, atomic_read(&conf->active_stripes)); + } - /* - * Do this in set_blocksize()! - */ - if (conf->buffer_size != size) { - PRINTK("switching size, %d --> %d\n", conf->buffer_size, size); - shrink_stripe_cache(conf, conf->max_nr_stripes); - conf->buffer_size = size; - } + if (conf->buffer_size != size) { + printk("raid5: switching cache buffer size, %d --> %d\n", oldsize, size); + shrink_stripe_cache(conf); + if (size==0) BUG(); + conf->buffer_size = size; + PRINTK("size now %d\n", conf->buffer_size); + } + } + } + if (size == 0) + sector -= sector & ((conf->buffer_size>>9)-1); -repeat: - md_spin_lock_irq(&conf->device_lock); - sh = __find_stripe(conf, sector, size); - if (!sh) { - if (!new) { - md_spin_unlock_irq(&conf->device_lock); - new = alloc_stripe(conf, sector, size); - goto repeat; + sh = __find_stripe(conf, sector); + if (!sh) { + sh = get_free_stripe(conf); + if (noblock && sh == NULL) + break; + if (!sh) { + wait_event_lock_irq(conf->wait_for_stripe, + !list_empty(&conf->inactive_list), + conf->device_lock); + } else + init_stripe(sh, sector); + } else { + if (atomic_read(&sh->count)) { + if (!list_empty(&sh->lru)) + BUG(); + } else { + if (!test_bit(STRIPE_HANDLE, &sh->state)) + atomic_inc(&conf->active_stripes); + if (list_empty(&sh->lru)) + BUG(); + list_del_init(&sh->lru); + } } - sh = new; - new = NULL; - if (md_test_and_set_bit(STRIPE_LOCKED, &sh->state)) - BUG(); - insert_hash(conf, sh); - atomic_inc(&sh->count); - md_spin_unlock_irq(&conf->device_lock); - } else { + } while (sh == NULL); + + if (sh) atomic_inc(&sh->count); - if (new) { - if (md_test_and_set_bit(STRIPE_LOCKED, &new->state)) - BUG(); - __put_free_stripe(conf, new); - } - md_spin_unlock_irq(&conf->device_lock); - PRINTK("get_stripe, waiting, sector %lu\n", sector); - if (md_test_and_set_bit(STRIPE_LOCKED, &sh->state)) - __wait_lock_stripe(sh); - } + + md_spin_unlock_irq(&conf->device_lock); return sh; } @@ -508,26 +324,18 @@ static int grow_stripes(raid5_conf_t *conf, int num, int priority) return 1; memset(sh, 0, sizeof(*sh)); sh->raid_conf = conf; - sh->stripe_lock = MD_SPIN_LOCK_UNLOCKED; - md_init_waitqueue_head(&sh->wait); + sh->lock = SPIN_LOCK_UNLOCKED; - if (md_test_and_set_bit(STRIPE_LOCKED, &sh->state)) - BUG(); - if (grow_raid5_buffers(sh, 2 * conf->raid_disks, PAGE_SIZE, priority)) { - shrink_buffers(sh, 2 * conf->raid_disks); - kfree(sh); - return 1; - } - if (grow_bh(sh, conf->raid_disks, priority)) { - shrink_buffers(sh, 2 * conf->raid_disks); - shrink_bh(sh, conf->raid_disks); + if (grow_buffers(sh, conf->raid_disks, PAGE_SIZE, priority)) { + shrink_buffers(sh, conf->raid_disks); kfree(sh); return 1; } - md_spin_lock_irq(&conf->device_lock); - __put_free_stripe(conf, sh); - atomic_inc(&conf->nr_stripes); - md_spin_unlock_irq(&conf->device_lock); + /* we just created an active 
stripe so... */ + atomic_set(&sh->count, 1); + atomic_inc(&conf->active_stripes); + INIT_LIST_HEAD(&sh->lru); + release_stripe(sh); } return 0; } @@ -537,119 +345,124 @@ static void shrink_stripes(raid5_conf_t *conf, int num) struct stripe_head *sh; while (num--) { + spin_lock_irq(&conf->device_lock); sh = get_free_stripe(conf); + spin_unlock_irq(&conf->device_lock); if (!sh) break; - if (md_test_and_set_bit(STRIPE_LOCKED, &sh->state)) + if (atomic_read(&sh->count)) BUG(); - shrink_buffers(sh, conf->raid_disks * 2); - shrink_bh(sh, conf->raid_disks); + shrink_buffers(sh, conf->raid_disks); kfree(sh); - atomic_dec(&conf->nr_stripes); + atomic_dec(&conf->active_stripes); } } -static struct buffer_head *raid5_alloc_buffer(struct stripe_head *sh, int b_size) +static inline void raid5_end_buffer_read(struct buffer_head *blist, struct buffer_head *bh) { - struct buffer_head *bh; - - bh = get_free_buffer(sh, b_size); - if (!bh) - BUG(); - return bh; + while (blist) { + struct buffer_head *new = blist; + blist = new->b_reqnext; + memcpy(new->b_data, bh->b_data, bh->b_size); + new->b_end_io(new, 1); + } } -static struct buffer_head *raid5_alloc_bh(struct stripe_head *sh) +static void raid5_end_read_request (struct buffer_head * bh, int uptodate) { - struct buffer_head *bh; + struct stripe_head *sh = bh->b_private; + raid5_conf_t *conf = sh->raid_conf; + int disks = conf->raid_disks, i; + unsigned long flags; + struct buffer_head *buffers = NULL; - bh = get_free_bh(sh); - if (!bh) - BUG(); - return bh; -} + for (i=0 ; i<disks; i++) + if (bh == sh->bh_cache[i]) + break; -static void raid5_end_buffer_io (struct stripe_head *sh, int i, int uptodate) -{ - struct buffer_head *bh = sh->bh_new[i]; - - PRINTK("raid5_end_buffer_io %lu, uptodate: %d.\n", bh->b_blocknr, uptodate); - sh->bh_new[i] = NULL; - raid5_free_bh(sh, sh->bh_req[i]); - sh->bh_req[i] = NULL; - PRINTK("calling %p->end_io: %p.\n", bh, bh->b_end_io); - bh->b_end_io(bh, uptodate); - if (!uptodate) - printk(KERN_ALERT "raid5: %s: unrecoverable I/O error for " - "block %lu\n", - partition_name(mddev_to_kdev(sh->raid_conf->mddev)), - bh->b_blocknr); -} + PRINTK("end_read_request %lu/%d, %d, count: %d, uptodate %d.\n", sh->sector, i, atomic_read(&sh->count), uptodate); + if (i == disks) { + BUG(); + return; + } -static inline void raid5_mark_buffer_uptodate (struct buffer_head *bh, int uptodate) -{ - if (uptodate) + md_spin_lock_irqsave(&conf->device_lock, flags); + if (uptodate) { +#ifdef CONFIG_HIGHMEM + /* cannot map highmem bufferheads from irq, + * so leave it for stripe_handle if there might + * be a problem + */ + if (sh->bh_read[i] && + sh->bh_read[i]->b_reqnext == NULL && + !PageHighMem(sh->bh_read[i]->b_page)) { + /* it's safe */ + buffers = sh->bh_read[i]; + sh->bh_read[i] = NULL; + } +#else + buffers = sh->bh_read[i]; + sh->bh_read[i] = NULL; +#endif set_bit(BH_Uptodate, &bh->b_state); - else + if (buffers) { + spin_unlock_irqrestore(&conf->device_lock, flags); + raid5_end_buffer_read(buffers, bh); + spin_lock_irqsave(&conf->device_lock, flags); + } + } else { + md_error(mddev_to_kdev(conf->mddev), bh->b_dev); clear_bit(BH_Uptodate, &bh->b_state); + } + clear_bit(BH_Lock, &bh->b_state); + set_bit(STRIPE_HANDLE, &sh->state); + __release_stripe(conf, sh); + md_spin_unlock_irqrestore(&conf->device_lock, flags); } -static void raid5_end_request (struct buffer_head * bh, int uptodate) +static void raid5_end_write_request (struct buffer_head *bh, int uptodate) { struct stripe_head *sh = bh->b_private; raid5_conf_t *conf = sh->raid_conf; int 
disks = conf->raid_disks, i; unsigned long flags; - PRINTK("end_request %lu, nr_pending %d, uptodate: %d, (caller: %p,%p,%p,%p).\n", sh->sector, atomic_read(&sh->nr_pending), uptodate, __builtin_return_address(0),__builtin_return_address(1),__builtin_return_address(2), __builtin_return_address(3)); - md_spin_lock_irqsave(&sh->stripe_lock, flags); - raid5_mark_buffer_uptodate(bh, uptodate); - if (!uptodate) - md_error(mddev_to_kdev(conf->mddev), bh->b_dev); - if (conf->failed_disks) { - for (i = 0; i < disks; i++) { - if (conf->disks[i].operational) - continue; - if (bh != sh->bh_old[i] && bh != sh->bh_req[i] && bh != sh->bh_copy[i]) - continue; - if (bh->b_dev != conf->disks[i].dev) - continue; - set_bit(STRIPE_ERROR, &sh->state); - } - } - md_spin_unlock_irqrestore(&sh->stripe_lock, flags); + for (i=0 ; i<disks; i++) + if (bh == sh->bh_cache[i]) + break; - if (atomic_dec_and_test(&sh->nr_pending)) { - atomic_inc(&conf->nr_handle); - md_wakeup_thread(conf->thread); + PRINTK("end_write_request %lu/%d, count %d, uptodate: %d.\n", sh->sector, i, atomic_read(&sh->count), uptodate); + if (i == disks) { + BUG(); + return; } + + md_spin_lock_irqsave(&conf->device_lock, flags); + if (!uptodate) + md_error(mddev_to_kdev(conf->mddev), bh->b_dev); + clear_bit(BH_Lock, &bh->b_state); + set_bit(STRIPE_HANDLE, &sh->state); + __release_stripe(conf, sh); + md_spin_unlock_irqrestore(&conf->device_lock, flags); } + -static void raid5_build_block (struct stripe_head *sh, struct buffer_head *bh, int i) + +static struct buffer_head *raid5_build_block (struct stripe_head *sh, int i) { raid5_conf_t *conf = sh->raid_conf; - char *b_data; - struct page *b_page; + struct buffer_head *bh = sh->bh_cache[i]; unsigned long block = sh->sector / (sh->size >> 9); - b_data = bh->b_data; - b_page = bh->b_page; - memset (bh, 0, sizeof (struct buffer_head)); - init_waitqueue_head(&bh->b_wait); - init_buffer(bh, raid5_end_request, sh); - bh->b_dev = conf->disks[i].dev; - bh->b_blocknr = block; - - bh->b_data = b_data; - bh->b_page = b_page; - - bh->b_rdev = conf->disks[i].dev; - bh->b_rsector = sh->sector; + init_buffer(bh, raid5_end_read_request, sh); + bh->b_dev = conf->disks[i].dev; + bh->b_blocknr = block; bh->b_state = (1 << BH_Req) | (1 << BH_Mapped); bh->b_size = sh->size; bh->b_list = BUF_LOCKED; + return bh; } static int raid5_error (mddev_t *mddev, kdev_t dev) @@ -778,6 +591,7 @@ static unsigned long raid5_compute_sector(unsigned long r_sector, unsigned int r return new_sector; } +#if 0 static unsigned long compute_blocknr(struct stripe_head *sh, int i) { raid5_conf_t *conf = sh->raid_conf; @@ -816,38 +630,42 @@ static unsigned long compute_blocknr(struct stripe_head *sh, int i) } return blocknr; } +#endif + +#define check_xor() do { \ + if (count == MAX_XOR_BLOCKS) { \ + xor_block(count, bh_ptr); \ + count = 1; \ + } \ + } while(0) + static void compute_block(struct stripe_head *sh, int dd_idx) { raid5_conf_t *conf = sh->raid_conf; int i, count, disks = conf->raid_disks; - struct buffer_head *bh_ptr[MAX_XOR_BLOCKS]; + struct buffer_head *bh_ptr[MAX_XOR_BLOCKS], *bh; PRINTK("compute_block, stripe %lu, idx %d\n", sh->sector, dd_idx); - if (sh->bh_old[dd_idx] == NULL) - sh->bh_old[dd_idx] = raid5_alloc_buffer(sh, sh->size); - raid5_build_block(sh, sh->bh_old[dd_idx], dd_idx); - memset(sh->bh_old[dd_idx]->b_data, 0, sh->size); - bh_ptr[0] = sh->bh_old[dd_idx]; + memset(sh->bh_cache[dd_idx]->b_data, 0, sh->size); + bh_ptr[0] = sh->bh_cache[dd_idx]; count = 1; - for (i = 0; i < disks; i++) { + for (i = disks ; i--; ) { if 
(i == dd_idx) continue; - if (sh->bh_old[i]) { - bh_ptr[count++] = sh->bh_old[i]; - } else { + bh = sh->bh_cache[i]; + if (buffer_uptodate(bh)) + bh_ptr[count++] = bh; + else printk("compute_block() %d, stripe %lu, %d not present\n", dd_idx, sh->sector, i); - } - if (count == MAX_XOR_BLOCKS) { - xor_block(count, &bh_ptr[0]); - count = 1; - } + + check_xor(); } if (count != 1) - xor_block(count, &bh_ptr[0]); - raid5_mark_buffer_uptodate(sh->bh_old[dd_idx], 1); + xor_block(count, bh_ptr); + set_bit(BH_Uptodate, &sh->bh_cache[dd_idx]->b_state); } static void compute_parity(struct stripe_head *sh, int method) @@ -855,604 +673,432 @@ static void compute_parity(struct stripe_head *sh, int method) raid5_conf_t *conf = sh->raid_conf; int i, pd_idx = sh->pd_idx, disks = conf->raid_disks, count; struct buffer_head *bh_ptr[MAX_XOR_BLOCKS]; + struct buffer_head *chosen[MD_SB_DISKS]; PRINTK("compute_parity, stripe %lu, method %d\n", sh->sector, method); - for (i = 0; i < disks; i++) { - char *bdata; - if (i == pd_idx || !sh->bh_new[i]) - continue; - if (!sh->bh_copy[i]) - sh->bh_copy[i] = raid5_alloc_buffer(sh, sh->size); - raid5_build_block(sh, sh->bh_copy[i], i); - atomic_set_buffer_dirty(sh->bh_copy[i]); - bdata = bh_kmap(sh->bh_new[i]); - memcpy(sh->bh_copy[i]->b_data, bdata, sh->size); - bh_kunmap(sh->bh_new[i]); - } - if (sh->bh_copy[pd_idx] == NULL) { - sh->bh_copy[pd_idx] = raid5_alloc_buffer(sh, sh->size); - atomic_set_buffer_dirty(sh->bh_copy[pd_idx]); - } - raid5_build_block(sh, sh->bh_copy[pd_idx], sh->pd_idx); + memset(chosen, 0, sizeof(chosen)); - if (method == RECONSTRUCT_WRITE) { - memset(sh->bh_copy[pd_idx]->b_data, 0, sh->size); - bh_ptr[0] = sh->bh_copy[pd_idx]; - count = 1; - for (i = 0; i < disks; i++) { - if (i == sh->pd_idx) + count = 1; + bh_ptr[0] = sh->bh_cache[pd_idx]; + spin_lock_irq(&conf->device_lock); + switch(method) { + case READ_MODIFY_WRITE: + if (!buffer_uptodate(sh->bh_cache[pd_idx])) + BUG(); + for (i=disks ; i-- ;) { + if (i==pd_idx) continue; - if (sh->bh_new[i]) { - bh_ptr[count++] = sh->bh_copy[i]; - } else if (sh->bh_old[i]) { - bh_ptr[count++] = sh->bh_old[i]; - } - if (count == MAX_XOR_BLOCKS) { - xor_block(count, &bh_ptr[0]); - count = 1; + if (sh->bh_write[i] && + buffer_uptodate(sh->bh_cache[i])) { + bh_ptr[count++] = sh->bh_cache[i]; + chosen[i] = sh->bh_write[i]; + sh->bh_write[i] = sh->bh_write[i]->b_reqnext; + chosen[i]->b_reqnext = sh->bh_written[i]; + sh->bh_written[i] = chosen[i]; + check_xor(); } } - if (count != 1) { - xor_block(count, &bh_ptr[0]); - } - } else if (method == READ_MODIFY_WRITE) { - memcpy(sh->bh_copy[pd_idx]->b_data, sh->bh_old[pd_idx]->b_data, sh->size); - bh_ptr[0] = sh->bh_copy[pd_idx]; - count = 1; - for (i = 0; i < disks; i++) { - if (i == sh->pd_idx) - continue; - if (sh->bh_new[i] && sh->bh_old[i]) { - bh_ptr[count++] = sh->bh_copy[i]; - bh_ptr[count++] = sh->bh_old[i]; + break; + case RECONSTRUCT_WRITE: + memset(sh->bh_cache[pd_idx]->b_data, 0, sh->size); + for (i= disks; i-- ;) + if (i!=pd_idx && sh->bh_write[i]) { + chosen[i] = sh->bh_write[i]; + sh->bh_write[i] = sh->bh_write[i]->b_reqnext; + chosen[i]->b_reqnext = sh->bh_written[i]; + sh->bh_written[i] = chosen[i]; + check_xor(); } - if (count >= (MAX_XOR_BLOCKS - 1)) { - xor_block(count, &bh_ptr[0]); - count = 1; + break; + case CHECK_PARITY: + break; + } + spin_unlock_irq(&conf->device_lock); + for (i = disks; i--;) + if (chosen[i]) { + struct buffer_head *bh = sh->bh_cache[i]; + char *bdata; + mark_buffer_clean(chosen[i]); /* NO FIXME */ + bdata = 
bh_kmap(chosen[i]); + memcpy(bh->b_data, + bdata,sh->size); + bh_kunmap(chosen[i]); + set_bit(BH_Lock, &bh->b_state); + mark_buffer_uptodate(bh, 1); + } + + switch(method) { + case RECONSTRUCT_WRITE: + case CHECK_PARITY: + for (i=disks; i--;) + if (i != pd_idx) { + bh_ptr[count++] = sh->bh_cache[i]; + check_xor(); + } + break; + case READ_MODIFY_WRITE: + for (i = disks; i--;) + if (chosen[i]) { + bh_ptr[count++] = sh->bh_cache[i]; + check_xor(); } - } - if (count != 1) { - xor_block(count, &bh_ptr[0]); - } } - raid5_mark_buffer_uptodate(sh->bh_copy[pd_idx], 1); + if (count != 1) + xor_block(count, bh_ptr); + + if (method != CHECK_PARITY) { + mark_buffer_uptodate(sh->bh_cache[pd_idx], 1); + set_bit(BH_Lock, &sh->bh_cache[pd_idx]->b_state); + } else + mark_buffer_uptodate(sh->bh_cache[pd_idx], 0); } static void add_stripe_bh (struct stripe_head *sh, struct buffer_head *bh, int dd_idx, int rw) { + struct buffer_head **bhp; raid5_conf_t *conf = sh->raid_conf; - struct buffer_head *bh_req; PRINTK("adding bh b#%lu to stripe s#%lu\n", bh->b_blocknr, sh->sector); - CHECK_SHLOCK(sh); - if (sh->bh_new[dd_idx]) - BUG(); - bh_req = raid5_alloc_bh(sh); - raid5_build_block(sh, bh_req, dd_idx); - bh_req->b_data = bh->b_data; - bh_req->b_page = bh->b_page; - md_spin_lock_irq(&conf->device_lock); - if (sh->phase == PHASE_COMPLETE && sh->cmd == STRIPE_NONE) { - PRINTK("stripe s#%lu => PHASE_BEGIN (%s)\n", sh->sector, rw == READ ? "read" : "write"); - sh->phase = PHASE_BEGIN; - sh->cmd = (rw == READ) ? STRIPE_READ : STRIPE_WRITE; - atomic_inc(&conf->nr_pending_stripes); - atomic_inc(&conf->nr_handle); - PRINTK("# of pending stripes: %u, # of handle: %u\n", atomic_read(&conf->nr_pending_stripes), atomic_read(&conf->nr_handle)); + spin_lock_irq(&conf->device_lock); + bh->b_reqnext = NULL; + if (rw == READ) + bhp = &sh->bh_read[dd_idx]; + else + bhp = &sh->bh_write[dd_idx]; + while (*bhp) { + printk(KERN_NOTICE "raid5: multiple %d requests for sector %ld\n", rw, sh->sector); + bhp = & (*bhp)->b_reqnext; } - sh->bh_new[dd_idx] = bh; - sh->bh_req[dd_idx] = bh_req; - sh->cmd_new[dd_idx] = rw; - sh->new[dd_idx] = 1; - md_spin_unlock_irq(&conf->device_lock); + *bhp = bh; + spin_unlock_irq(&conf->device_lock); PRINTK("added bh b#%lu to stripe s#%lu, disk %d.\n", bh->b_blocknr, sh->sector, dd_idx); } -static void complete_stripe(struct stripe_head *sh) -{ - raid5_conf_t *conf = sh->raid_conf; - int disks = conf->raid_disks; - int i, new = 0; - - PRINTK("complete_stripe %lu\n", sh->sector); - for (i = 0; i < disks; i++) { - if (sh->cmd == STRIPE_SYNC && sh->bh_copy[i]) - raid5_update_old_bh(sh, i); - if (sh->cmd == STRIPE_WRITE && i == sh->pd_idx) - raid5_update_old_bh(sh, i); - if (sh->bh_new[i]) { - PRINTK("stripe %lu finishes new bh, sh->new == %d\n", sh->sector, sh->new[i]); - if (!sh->new[i]) { -#if 0 - if (sh->cmd == STRIPE_WRITE) { - char *bdata = bh_kmap(sh->bh_new[i]); - if (memcmp(bdata, sh->bh_copy[i]->b_data, sh->size)) { - printk("copy differs, %s, sector %lu ", - test_bit(BH_Dirty, &sh->bh_new[i]->b_state) ? 
"dirty" : "clean", - sh->sector); - } else if (test_bit(BH_Dirty, &sh->bh_new[i]->b_state)) - printk("sector %lu dirty\n", sh->sector); - bh_kunmap(sh->bh_new[i]); - } -#endif - if (sh->cmd == STRIPE_WRITE) - raid5_update_old_bh(sh, i); - raid5_end_buffer_io(sh, i, 1); - continue; - } else - new++; - } - if (new && sh->cmd == STRIPE_WRITE) - printk("raid5: bug, completed STRIPE_WRITE with new == %d\n", new); - } - if (sh->cmd == STRIPE_SYNC) - md_done_sync(conf->mddev, (sh->size>>10) - sh->sync_redone,1); - if (!new) - finish_unlock_stripe(sh); - else { - PRINTK("stripe %lu, new == %d\n", sh->sector, new); - sh->phase = PHASE_BEGIN; - } -} - - -static void handle_stripe_write (mddev_t *mddev , raid5_conf_t *conf, - struct stripe_head *sh, int nr_write, int * operational, int disks, - int parity, int parity_failed, int nr_cache, int nr_cache_other, - int nr_failed_other, int nr_cache_overwrite, int nr_failed_overwrite) -{ - int i; - unsigned int block; - struct buffer_head *bh; - int method1 = INT_MAX, method2 = INT_MAX; - - /* - * Attempt to add entries :-) - */ - if (nr_write != disks - 1) { - for (i = 0; i < disks; i++) { - if (i == sh->pd_idx) - continue; - if (sh->bh_new[i]) - continue; - block = (int) compute_blocknr(sh, i); - bh = get_hash_table(mddev_to_kdev(mddev), block, sh->size); - if (!bh) - continue; - if (buffer_dirty(bh) && !md_test_and_set_bit(BH_Lock, &bh->b_state)) { - PRINTK("Whee.. sector %lu, index %d (%d) found in the buffer cache!\n", sh->sector, i, block); - add_stripe_bh(sh, bh, i, WRITE); - sh->new[i] = 0; - nr_write++; - if (sh->bh_old[i]) { - nr_cache_overwrite++; - nr_cache_other--; - } else - if (!operational[i]) { - nr_failed_overwrite++; - nr_failed_other--; - } - } - atomic_dec(&bh->b_count); - } - } - PRINTK("handle_stripe() -- begin writing, stripe %lu\n", sh->sector); - /* - * Writing, need to update parity buffer. - * - * Compute the number of I/O requests in the "reconstruct - * write" and "read modify write" methods. - */ - if (!nr_failed_other) - method1 = (disks - 1) - (nr_write + nr_cache_other); - if (!nr_failed_overwrite && !parity_failed) - method2 = nr_write - nr_cache_overwrite + (1 - parity); - - if (method1 == INT_MAX && method2 == INT_MAX) - BUG(); - PRINTK("handle_stripe(), sector %lu, nr_write %d, method1 %d, method2 %d\n", sh->sector, nr_write, method1, method2); - if (!method1 || !method2) { - sh->phase = PHASE_WRITE; - compute_parity(sh, method1 <= method2 ? 
RECONSTRUCT_WRITE : READ_MODIFY_WRITE); - for (i = 0; i < disks; i++) { - if (!operational[i] && !conf->spare && !conf->resync_parity) - continue; - bh = sh->bh_copy[i]; - if (i != sh->pd_idx && ((bh == NULL) ^ (sh->bh_new[i] == NULL))) - printk("raid5: bug: bh == %p, bh_new[%d] == %p\n", bh, i, sh->bh_new[i]); - if (i == sh->pd_idx && !bh) - printk("raid5: bug: bh == NULL, i == pd_idx == %d\n", i); - if (bh) { - PRINTK("making request for buffer %d\n", i); - lock_get_bh(bh); - if (!operational[i] && !conf->resync_parity) { - PRINTK("writing spare %d\n", i); - atomic_inc(&sh->nr_pending); - bh->b_dev = bh->b_rdev = conf->spare->dev; - generic_make_request(WRITE, bh); - } else { - atomic_inc(&sh->nr_pending); - bh->b_dev = bh->b_rdev = conf->disks[i].dev; - generic_make_request(WRITE, bh); - } - atomic_dec(&bh->b_count); - } - } - PRINTK("handle_stripe() %lu, writing back %d buffers\n", sh->sector, md_atomic_read(&sh->nr_pending)); - return; - } - if (method1 < method2) { - sh->write_method = RECONSTRUCT_WRITE; - for (i = 0; i < disks; i++) { - if (i == sh->pd_idx) - continue; - if (sh->bh_new[i] || sh->bh_old[i]) - continue; - sh->bh_old[i] = raid5_alloc_buffer(sh, sh->size); - raid5_build_block(sh, sh->bh_old[i], i); - } - } else { - sh->write_method = READ_MODIFY_WRITE; - for (i = 0; i < disks; i++) { - if (sh->bh_old[i]) - continue; - if (!sh->bh_new[i] && i != sh->pd_idx) - continue; - sh->bh_old[i] = raid5_alloc_buffer(sh, sh->size); - raid5_build_block(sh, sh->bh_old[i], i); - } - } - sh->phase = PHASE_READ_OLD; - for (i = 0; i < disks; i++) { - if (!sh->bh_old[i]) - continue; - if (test_bit(BH_Uptodate, &sh->bh_old[i]->b_state)) - continue; - lock_get_bh(sh->bh_old[i]); - atomic_inc(&sh->nr_pending); - sh->bh_old[i]->b_dev = sh->bh_old[i]->b_rdev = conf->disks[i].dev; - generic_make_request(READ, sh->bh_old[i]); - atomic_dec(&sh->bh_old[i]->b_count); - } - PRINTK("handle_stripe() %lu, reading %d old buffers\n", sh->sector, md_atomic_read(&sh->nr_pending)); -} /* - * Reading + * handle_stripe - do things to a stripe. + * + * We lock the stripe and then examine the state of various bits + * to see what needs to be done. + * Possible results: + * return some read request which now have data + * return some write requests which are safely on disc + * schedule a read on some buffers + * schedule a write of some buffers + * return confirmation of parity correctness + * + * Parity calculations are done inside the stripe lock + * buffers are taken off read_list or write_list, and bh_cache buffers + * get BH_Lock set before the stripe lock is released. 
+ *
  */
-static void handle_stripe_read (mddev_t *mddev , raid5_conf_t *conf,
-    struct stripe_head *sh, int nr_read, int * operational, int disks,
-    int parity, int parity_failed, int nr_cache, int nr_cache_other,
-    int nr_failed_other, int nr_cache_overwrite, int nr_failed_overwrite)
+
+static void handle_stripe(struct stripe_head *sh)
 {
+    raid5_conf_t *conf = sh->raid_conf;
+    int disks = conf->raid_disks;
+    struct buffer_head *return_ok= NULL, *return_fail = NULL;
+    int action[MD_SB_DISKS];
     int i;
-    int method1 = INT_MAX;
-
-    method1 = nr_read - nr_cache_overwrite;
-
-    PRINTK("handle_stripe(), sector %lu, nr_read %d, nr_cache %d, method1 %d\n", sh->sector, nr_read, nr_cache, method1);
+    int syncing;
+    int locked=0, uptodate=0, to_read=0, to_write=0, failed=0, written=0;
+    int failed_num=0;
+    struct buffer_head *bh;
 
-    if (!method1 || (method1 == 1 && nr_cache == disks - 1)) {
-        PRINTK("read %lu completed from cache\n", sh->sector);
-        for (i = 0; i < disks; i++) {
-            char *bdata;
-            if (!sh->bh_new[i])
-                continue;
-            if (!sh->bh_old[i])
-                compute_block(sh, i);
-            bdata = bh_kmap(sh->bh_new[i]);
-            memcpy(bdata, sh->bh_old[i]->b_data, sh->size);
-            bh_kunmap(sh->bh_new[i]);
-        }
-        complete_stripe(sh);
-        return;
-    }
-    if (nr_failed_overwrite) {
-        sh->phase = PHASE_READ_OLD;
-        for (i = 0; i < disks; i++) {
-            if (sh->bh_old[i])
-                continue;
-            if (!operational[i])
-                continue;
-            sh->bh_old[i] = raid5_alloc_buffer(sh, sh->size);
-            raid5_build_block(sh, sh->bh_old[i], i);
-            lock_get_bh(sh->bh_old[i]);
-            atomic_inc(&sh->nr_pending);
-            sh->bh_old[i]->b_dev = sh->bh_old[i]->b_rdev = conf->disks[i].dev;
-            generic_make_request(READ, sh->bh_old[i]);
-            atomic_dec(&sh->bh_old[i]->b_count);
-        }
-        PRINTK("handle_stripe() %lu, phase READ_OLD, pending %d buffers\n", sh->sector, md_atomic_read(&sh->nr_pending));
-        return;
-    }
-    sh->phase = PHASE_READ;
-    for (i = 0; i < disks; i++) {
-        if (!sh->bh_new[i])
-            continue;
-        if (sh->bh_old[i]) {
-            char *bdata = bh_kmap(sh->bh_new[i]);
-            memcpy(bdata, sh->bh_old[i]->b_data, sh->size);
-            bh_kunmap(sh->bh_new[i]);
-            continue;
-        }
-#if RAID5_PARANOIA
-        if (sh->bh_req[i] == NULL || test_bit(BH_Lock, &sh->bh_req[i]->b_state)) {
-            int j;
-            printk("req %d is NULL! or locked \n", i);
-            for (j=0; j<disks; j++) {
-                printk("%d: new=%p old=%p req=%p new=%d cmd=%d\n",
-                    j, sh->bh_new[j], sh->bh_old[j], sh->bh_req[j],
-                    sh->new[j], sh->cmd_new[j]);
+    PRINTK("handling stripe %ld, cnt=%d, pd_idx=%d\n", sh->sector, atomic_read(&sh->count), sh->pd_idx);
+    memset(action, 0, sizeof(action));
+
+    spin_lock(&sh->lock);
+    clear_bit(STRIPE_HANDLE, &sh->state);
+
+    syncing = test_bit(STRIPE_SYNCING, &sh->state);
+    /* Now to look around and see what can be done */
+
+    for (i=disks; i--; ) {
+        bh = sh->bh_cache[i];
+        PRINTK("check %d: state %lx read %p write %p written %p\n", i, bh->b_state, sh->bh_read[i], sh->bh_write[i], sh->bh_written[i]);
+        /* maybe we can reply to a read */
+        if (buffer_uptodate(bh) && sh->bh_read[i]) {
+            struct buffer_head *rbh, *rbh2;
+            PRINTK("Return read for disc %d\n", i);
+            spin_lock_irq(&conf->device_lock);
+            rbh = sh->bh_read[i];
+            sh->bh_read[i] = NULL;
+            spin_unlock_irq(&conf->device_lock);
+            while (rbh) {
+                char *bdata;
+                bdata = bh_kmap(rbh);
+                memcpy(bdata, bh->b_data, bh->b_size);
+                bh_kunmap(rbh);
+                rbh2 = rbh->b_reqnext;
+                rbh->b_reqnext = return_ok;
+                return_ok = rbh;
+                rbh = rbh2;
             }
-        }
-#endif
-        lock_get_bh(sh->bh_req[i]);
-        atomic_inc(&sh->nr_pending);
-        sh->bh_req[i]->b_dev = sh->bh_req[i]->b_rdev = conf->disks[i].dev;
-        generic_make_request(READ, sh->bh_req[i]);
-        atomic_dec(&sh->bh_req[i]->b_count);
-    }
-    PRINTK("handle_stripe() %lu, phase READ, pending %d\n", sh->sector, md_atomic_read(&sh->nr_pending));
-}
-/*
- * Syncing
- */
-static void handle_stripe_sync (mddev_t *mddev , raid5_conf_t *conf,
-    struct stripe_head *sh, int * operational, int disks,
-    int parity, int parity_failed, int nr_cache, int nr_cache_other,
-    int nr_failed_other, int nr_cache_overwrite, int nr_failed_overwrite)
-{
-    struct buffer_head *bh;
-    int i, pd_idx;
-
-    /* firstly, we want to have data from all non-failed drives
-     * in bh_old
-     */
-    PRINTK("handle_stripe_sync: sec=%lu disks=%d nr_cache=%d\n", sh->sector, disks, nr_cache);
-    if ((nr_cache < disks-1) || ((nr_cache == disks-1) && !(parity_failed+nr_failed_other+nr_failed_overwrite))
-        ) {
-        sh->phase = PHASE_READ_OLD;
-        for (i = 0; i < disks; i++) {
-            if (sh->bh_old[i])
-                continue;
-            if (!conf->disks[i].operational)
-                continue;
+        /* now count some things */
+        if (buffer_locked(bh)) locked++;
+        if (buffer_uptodate(bh)) uptodate++;
 
-            bh = raid5_alloc_buffer(sh, sh->size);
-            sh->bh_old[i] = bh;
-            raid5_build_block(sh, bh, i);
-            lock_get_bh(bh);
-            atomic_inc(&sh->nr_pending);
-            bh->b_dev = bh->b_rdev = conf->disks[i].dev;
-            generic_make_request(READ, bh);
-            md_sync_acct(bh->b_rdev, bh->b_size/512);
-            atomic_dec(&sh->bh_old[i]->b_count);
+
+        if (sh->bh_read[i]) to_read++;
+        if (sh->bh_write[i]) to_write++;
+        if (sh->bh_written[i]) written++;
+        if (!conf->disks[i].operational) {
+            failed++;
+            failed_num = i;
         }
-        PRINTK("handle_stripe_sync() %lu, phase READ_OLD, pending %d buffers\n", sh->sector, md_atomic_read(&sh->nr_pending));
-
-        return;
     }
-    /* now, if there is a failed drive, rebuild and write to spare */
-    if (nr_cache == disks-1) {
-        sh->phase = PHASE_WRITE;
-        /* we can generate the missing block, which will be on the failed drive */
-        for (i=0; i<disks; i++) {
-            if (operational[i])
-                continue;
-            compute_block(sh, i);
-            if (conf->spare) {
-                bh = sh->bh_copy[i];
-                if (bh) {
-                    memcpy(bh->b_data, sh->bh_old[i]->b_data, sh->size);
-                    set_bit(BH_Uptodate, &bh->b_state);
-                } else {
-                    bh = sh->bh_old[i];
-                    sh->bh_old[i] = NULL;
-                    sh->bh_copy[i] = bh;
+    PRINTK("locked=%d uptodate=%d to_read=%d to_write=%d failed=%d failed_num=%d\n",
+           locked, uptodate, to_read, to_write, failed, failed_num);
+    /* check if the array has lost two devices and, if so, some requests might
+     * need to be failed
+     */
+    if (failed > 1 && to_read+to_write) {
+        spin_lock_irq(&conf->device_lock);
+        for (i=disks; i--; ) {
+            /* fail all writes first */
+            if (sh->bh_write[i]) to_write--;
+            while ((bh = sh->bh_write[i])) {
+                sh->bh_write[i] = bh->b_reqnext;
+                bh->b_reqnext = return_fail;
+                return_fail = bh;
+            }
+            /* fail any reads if this device is non-operational */
+            if (!conf->disks[i].operational) {
+                if (sh->bh_read[i]) to_read--;
+                while ((bh = sh->bh_read[i])) {
+                    sh->bh_read[i] = bh->b_reqnext;
+                    bh->b_reqnext = return_fail;
+                    return_fail = bh;
                 }
-                atomic_inc(&sh->nr_pending);
-                lock_get_bh(bh);
-                bh->b_dev = bh->b_rdev = conf->spare->dev;
-                generic_make_request(WRITE, bh);
-                md_sync_acct(bh->b_rdev, bh->b_size/512);
-                atomic_dec(&bh->b_count);
-                PRINTK("handle_stripe_sync() %lu, phase WRITE, pending %d buffers\n", sh->sector, md_atomic_read(&sh->nr_pending));
             }
-            break;
         }
-        return;
+        spin_unlock_irq(&conf->device_lock);
+        if (syncing) {
+            md_done_sync(conf->mddev, (sh->size>>10) - sh->sync_redone,0);
+            clear_bit(STRIPE_SYNCING, &sh->state);
+            syncing = 0;
+        }
     }
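[Editor's sketch, not part of the patch: requests in the new handle_stripe() hang off singly linked lists threaded through b_reqnext, and failing or completing them means detaching the chain and pushing each element onto a return list, reversing it in the process (completion order does not matter). A minimal standalone rendition, with a hypothetical struct req standing in for struct buffer_head:]

#include <stddef.h>

struct req { struct req *b_reqnext; };	/* stands in for struct buffer_head */

/* detach a request chain and push every element onto a return list */
static struct req *splice_to_return(struct req *chain, struct req *ret)
{
	while (chain) {
		struct req *next = chain->b_reqnext;
		chain->b_reqnext = ret;	/* push onto the return list */
		ret = chain;
		chain = next;
	}
	return ret;
}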
-    /* nr_cache == disks:
-     * check parity and compute/write if needed
+    /* might be able to return some write requests if the parity block
+     * is safe, or on a failed drive
      */
-
-    compute_parity(sh, RECONSTRUCT_WRITE);
-    pd_idx = sh->pd_idx;
-    if (!memcmp(sh->bh_copy[pd_idx]->b_data, sh->bh_old[pd_idx]->b_data, sh->size)) {
-        /* the parity is correct - Yay! */
-        complete_stripe(sh);
-    } else {
-        sh->phase = PHASE_WRITE;
-        bh = sh->bh_copy[pd_idx];
-        atomic_set_buffer_dirty(bh);
-        lock_get_bh(bh);
-        atomic_inc(&sh->nr_pending);
-        bh->b_dev = bh->b_rdev = conf->disks[pd_idx].dev;
-        generic_make_request(WRITE, bh);
-        md_sync_acct(bh->b_rdev, bh->b_size/512);
-        atomic_dec(&bh->b_count);
-        PRINTK("handle_stripe_sync() %lu phase WRITE, pending %d buffers\n",
-            sh->sector, md_atomic_read(&sh->nr_pending));
-    }
-}
-
-/*
- * handle_stripe() is our main logic routine. Note that:
- *
- * 1. lock_stripe() should be used whenever we can't accept additonal
- *    buffers, either during short sleeping in handle_stripe() or
- *    during io operations.
- *
- * 2. We should be careful to set sh->nr_pending whenever we sleep,
- *    to prevent re-entry of handle_stripe() for the same sh.
- *
- * 3. conf->failed_disks and disk->operational can be changed
- *    from an interrupt. This complicates things a bit, but it allows
- *    us to stop issuing requests for a failed drive as soon as possible.
- */
-static void handle_stripe(struct stripe_head *sh)
-{
-    raid5_conf_t *conf = sh->raid_conf;
-    mddev_t *mddev = conf->mddev;
-    int disks = conf->raid_disks;
-    int i, nr_read = 0, nr_write = 0, parity = 0;
-    int nr_cache = 0, nr_cache_other = 0, nr_cache_overwrite = 0;
-    int nr_failed_other = 0, nr_failed_overwrite = 0, parity_failed = 0;
-    int operational[MD_SB_DISKS], failed_disks = conf->failed_disks;
-
-    PRINTK("handle_stripe(), stripe %lu\n", sh->sector);
-    if (!stripe_locked(sh))
-        BUG();
-    if (md_atomic_read(&sh->nr_pending))
-        BUG();
-    if (sh->phase == PHASE_COMPLETE)
-        BUG();
-
-    atomic_dec(&conf->nr_handle);
-
-    if (md_test_and_clear_bit(STRIPE_ERROR, &sh->state)) {
-        printk("raid5: restarting stripe %lu\n", sh->sector);
-        sh->phase = PHASE_BEGIN;
-    }
-
-    if ((sh->cmd == STRIPE_WRITE && sh->phase == PHASE_WRITE) ||
-        (sh->cmd == STRIPE_READ && sh->phase == PHASE_READ) ||
-        (sh->cmd == STRIPE_SYNC && sh->phase == PHASE_WRITE)
-       ) {
-        /*
-         * Completed
-         */
-        complete_stripe(sh);
-        if (sh->phase == PHASE_COMPLETE)
-            return;
-    }
-
-    md_spin_lock_irq(&conf->device_lock);
-    for (i = 0; i < disks; i++) {
-        operational[i] = conf->disks[i].operational;
-        if (i == sh->pd_idx && conf->resync_parity)
-            operational[i] = 0;
-    }
-    failed_disks = conf->failed_disks;
-    md_spin_unlock_irq(&conf->device_lock);
-
-    /*
-     * Make this one more graceful?
-     */
-    if (failed_disks > 1) {
-        for (i = 0; i < disks; i++) {
-            if (sh->bh_new[i]) {
-                raid5_end_buffer_io(sh, i, 0);
-                continue;
+    bh = sh->bh_cache[sh->pd_idx];
+    if ( written &&
+         ( (conf->disks[sh->pd_idx].operational && !buffer_locked(bh) && buffer_uptodate(bh))
+           || (failed == 1 && failed_num == sh->pd_idx))
+        ) {
+        /* any written block on a uptodate or failed drive can be returned */
+        for (i=disks; i--; )
+            if (sh->bh_written[i]) {
+                bh = sh->bh_cache[i];
+                if (!conf->disks[sh->pd_idx].operational ||
+                    (!buffer_locked(bh) && buffer_uptodate(bh)) ) {
+                    /* maybe we can return some write requests */
+                    struct buffer_head *wbh, *wbh2;
+                    PRINTK("Return write for disc %d\n", i);
+                    spin_lock_irq(&conf->device_lock);
+                    wbh = sh->bh_written[i];
+                    sh->bh_written[i] = NULL;
+                    spin_unlock_irq(&conf->device_lock);
+                    while (wbh) {
+                        wbh2 = wbh->b_reqnext;
+                        wbh->b_reqnext = return_ok;
+                        return_ok = wbh;
+                        wbh = wbh2;
                     }
+                }
             }
-        if (sh->cmd == STRIPE_SYNC)
-            md_done_sync(conf->mddev, (sh->size>>10) - sh->sync_redone,1);
-        finish_unlock_stripe(sh);
-        return;
     }
-
-    PRINTK("=== stripe index START ===\n");
-    for (i = 0; i < disks; i++) {
-        PRINTK("disk %d, ", i);
-        if (sh->bh_old[i]) {
-            nr_cache++;
-            PRINTK(" (old cached, %d)", nr_cache);
-        }
-        if (i == sh->pd_idx) {
-            PRINTK(" PARITY.");
-            if (sh->bh_old[i]) {
-                PRINTK(" CACHED.");
-                parity = 1;
-            } else {
-                PRINTK(" UNCACHED.");
-                if (!operational[i]) {
-                    PRINTK(" FAILED.");
-                    parity_failed = 1;
+
+    /* Now we might consider reading some blocks, either to check/generate
+     * parity, or to satisfy requests
+     */
+    if (to_read || (syncing && (uptodate+failed < disks))) {
+        for (i=disks; i--;) {
+            bh = sh->bh_cache[i];
+            if (!buffer_locked(bh) && !buffer_uptodate(bh) &&
+                (sh->bh_read[i] || syncing || (failed && sh->bh_read[failed_num]))) {
+                /* we would like to get this block, possibly
+                 * by computing it, but we might not be able to
+                 */
+                if (uptodate == disks-1) {
+                    PRINTK("Computing block %d\n", i);
+                    compute_block(sh, i);
+                    uptodate++;
+                } else if (conf->disks[i].operational) {
+                    set_bit(BH_Lock, &bh->b_state);
+                    action[i] = READ+1;
+                    locked++;
+                    PRINTK("Reading block %d (sync=%d)\n", i, syncing);
+                    if (syncing)
+                        md_sync_acct(conf->disks[i].dev, bh->b_size>>9);
                 }
             }
-            PRINTK("\n");
-            continue;
         }
-        if (!sh->bh_new[i]) {
-            PRINTK(" (no new data block) ");
-            if (sh->bh_old[i]) {
-                PRINTK(" (but old block cached) ");
-                nr_cache_other++;
-            } else {
-                if (!operational[i]) {
-                    PRINTK(" (because failed disk) ");
-                    nr_failed_other++;
-                } else
-                    PRINTK(" (no old block either) ");
+        set_bit(STRIPE_HANDLE, &sh->state);
+    }
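[Editor's sketch, not part of the patch: the compute_block() call above relies on the RAID-5 identity that any single missing block equals the XOR of every other block in the stripe, which is why the code can rebuild a block in memory once uptodate == disks-1 instead of issuing a read. Illustrative only; xor_reconstruct is not a kernel function:]

#include <string.h>

/* rebuild the missing block as the XOR of the nblocks surviving ones */
static void xor_reconstruct(unsigned char *dest, unsigned char **blocks,
			    int nblocks, size_t size)
{
	size_t j;
	int i;

	memset(dest, 0, size);
	for (i = 0; i < nblocks; i++)
		for (j = 0; j < size; j++)
			dest[j] ^= blocks[i][j];
}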
+
+    /* now to consider writing and what else, if anything should be read */
+    if (to_write) {
+        int rmw=0, rcw=0;
+        for (i=disks ; i--;) {
+            /* would I have to read this buffer for read_modify_write */
+            bh = sh->bh_cache[i];
+            if ((sh->bh_write[i] || i == sh->pd_idx) &&
+                !buffer_locked(bh) && !buffer_uptodate(bh)) {
+                if (conf->disks[i].operational
+/*                  && !(conf->resync_parity && i == sh->pd_idx) */
+                    )
+                    rmw++;
+                else rmw += 2*disks;  /* cannot read it */
+            }
+            /* Would I have to read this buffer for reconstruct_write */
+            if (!sh->bh_write[i] && i != sh->pd_idx &&
+                !buffer_locked(bh) && !buffer_uptodate(bh)) {
+                if (conf->disks[i].operational) rcw++;
+                else rcw += 2*disks;
             }
-            PRINTK("\n");
-            continue;
-        }
-        sh->new[i] = 0;
-        if (sh->cmd_new[i] == READ) {
-            nr_read++;
-            PRINTK(" (new READ %d)", nr_read);
-        }
-        if (sh->cmd_new[i] == WRITE) {
-            nr_write++;
-            PRINTK(" (new WRITE %d)", nr_write);
         }
-        if (sh->bh_old[i]) {
-            nr_cache_overwrite++;
-            PRINTK(" (overwriting old %d)", nr_cache_overwrite);
-        } else {
-            if (!operational[i]) {
-                nr_failed_overwrite++;
-                PRINTK(" (overwriting failed %d)", nr_failed_overwrite);
+        PRINTK("for sector %ld, rmw=%d rcw=%d\n", sh->sector, rmw, rcw);
+        set_bit(STRIPE_HANDLE, &sh->state);
+        if (rmw < rcw && rmw > 0)
+            /* prefer read-modify-write, but need to get some data */
+            for (i=disks; i--;) {
+                bh = sh->bh_cache[i];
+                if ((sh->bh_write[i] || i == sh->pd_idx) &&
+                    !buffer_locked(bh) && !buffer_uptodate(bh) &&
+                    conf->disks[i].operational) {
+                    PRINTK("Read_old block %d for r-m-w\n", i);
+                    set_bit(BH_Lock, &bh->b_state);
+                    action[i] = READ+1;
+                    locked++;
+                }
             }
+        if (rcw <= rmw && rcw > 0)
+            /* want reconstruct write, but need to get some data */
+            for (i=disks; i--;) {
+                bh = sh->bh_cache[i];
+                if (!sh->bh_write[i] && i != sh->pd_idx &&
+                    !buffer_locked(bh) && !buffer_uptodate(bh) &&
+                    conf->disks[i].operational) {
+                    PRINTK("Read_old block %d for Reconstruct\n", i);
+                    set_bit(BH_Lock, &bh->b_state);
+                    action[i] = READ+1;
+                    locked++;
+                }
+            }
+        /* now if nothing is locked, and if we have enough data, we can start a write request */
+        if (locked == 0 && (rcw == 0 ||rmw == 0)) {
+            PRINTK("Computing parity...\n");
+            compute_parity(sh, rcw==0 ? RECONSTRUCT_WRITE : READ_MODIFY_WRITE);
+            /* now every locked buffer is ready to be written */
+            for (i=disks; i--;)
+                if (buffer_locked(sh->bh_cache[i])) {
+                    PRINTK("Writing block %d\n", i);
+                    locked++;
+                    action[i] = WRITE+1;
+                    if (!conf->disks[i].operational
+                        || (i==sh->pd_idx && failed == 0))
+                        set_bit(STRIPE_INSYNC, &sh->state);
+                }
         }
-        PRINTK("\n");
     }
-    PRINTK("=== stripe index END ===\n");
 
-    if (nr_write && nr_read)
-        BUG();
+    /* maybe we need to check and possibly fix the parity for this stripe
+     * Any reads will already have been scheduled, so we just see if enough data
+     * is available
+     */
+    if (syncing && locked == 0 &&
+        !test_bit(STRIPE_INSYNC, &sh->state) && failed <= 1) {
+        set_bit(STRIPE_HANDLE, &sh->state);
+        if (failed == 0) {
+            if (uptodate != disks)
+                BUG();
+            compute_parity(sh, CHECK_PARITY);
+            uptodate--;
+            bh = sh->bh_cache[sh->pd_idx];
+            if ((*(u32*)bh->b_data) == 0 &&
+                !memcmp(bh->b_data, bh->b_data+4, bh->b_size-4)) {
+                /* parity is correct (on disc, not in buffer any more) */
+                set_bit(STRIPE_INSYNC, &sh->state);
+            }
+        }
+        if (!test_bit(STRIPE_INSYNC, &sh->state)) {
+            if (failed==0)
+                failed_num = sh->pd_idx;
+            /* should be able to compute the missing block and write it to spare */
+            if (!buffer_uptodate(sh->bh_cache[failed_num])) {
+                if (uptodate+1 != disks)
+                    BUG();
+                compute_block(sh, failed_num);
+                uptodate++;
+            }
+            if (uptodate != disks)
+                BUG();
+            bh = sh->bh_cache[failed_num];
+            set_bit(BH_Lock, &bh->b_state);
+            action[failed_num] = WRITE+1;
+            locked++;
+            set_bit(STRIPE_INSYNC, &sh->state);
+            if (conf->disks[i].operational)
+                md_sync_acct(conf->disks[i].dev, bh->b_size>>9);
+            else if (conf->spare)
+                md_sync_acct(conf->spare->dev, bh->b_size>>9);
 
-    if (nr_write)
-        handle_stripe_write(
-            mddev, conf, sh, nr_write, operational, disks,
-            parity, parity_failed, nr_cache, nr_cache_other,
-            nr_failed_other, nr_cache_overwrite,
-            nr_failed_overwrite
-        );
-    else if (nr_read)
-        handle_stripe_read(
-            mddev, conf, sh, nr_read, operational, disks,
-            parity, parity_failed, nr_cache, nr_cache_other,
-            nr_failed_other, nr_cache_overwrite,
-            nr_failed_overwrite
-        );
-    else if (sh->cmd == STRIPE_SYNC)
-        handle_stripe_sync(
-            mddev, conf, sh, operational, disks,
-            parity, parity_failed, nr_cache, nr_cache_other,
-            nr_failed_other, nr_cache_overwrite, nr_failed_overwrite
-        );
+        }
+    }
+    if (syncing && locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) {
+        md_done_sync(conf->mddev, (sh->size>>10) - sh->sync_redone,1);
+        clear_bit(STRIPE_SYNCING, &sh->state);
+    }
+
+
+    spin_unlock(&sh->lock);
+
+    while ((bh=return_ok)) {
+        return_ok = bh->b_reqnext;
+        bh->b_reqnext = NULL;
+        bh->b_end_io(bh, 1);
+    }
+    while ((bh=return_fail)) {
+        return_ok = bh->b_reqnext;
+        bh->b_reqnext = NULL;
+        bh->b_end_io(bh, 0);
+    }
+    for (i=disks; i-- ;)
+        if (action[i]) {
+            struct buffer_head *bh = sh->bh_cache[i];
+            int skip = 0;
+            if (action[i] == READ+1)
+                bh->b_end_io = raid5_end_read_request;
+            else
+                bh->b_end_io = raid5_end_write_request;
+            if (conf->disks[i].operational)
+                bh->b_dev = conf->disks[i].dev;
+            else if (conf->spare && action[i] == WRITE+1)
+                bh->b_dev = conf->spare->dev;
+            else if (action[i] == READ+1)
+                BUG();
+            else skip=1;
+            if (!skip) {
+                PRINTK("for %ld schedule op %d on disc %d\n", sh->sector, action[i]-1, i);
+                atomic_inc(&sh->count);
+                bh->b_rdev = bh->b_dev;
+                bh->b_rsector = bh->b_blocknr * (bh->b_size>>9);
+                generic_make_request(action[i]-1, bh);
+            } else
+                PRINTK("skip op %d on disc %d for sector %ld\n", action[i]-1, i, sh->sector);
+        }
 }
 
 
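[Editor's sketch, not part of the patch: the read-modify-write versus reconstruct-write decision the new handle_stripe() makes, in miniature. Count the reads each method would need, price blocks on failed disks at 2*disks so that method is effectively never picked, and prefer read-modify-write only when strictly cheaper. All names below are illustrative, not kernel APIs:]

#include <stdio.h>

enum method { RMW, RCW };

static enum method pick_method(int disks, int pd_idx,
			       const int to_write[], const int operational[],
			       const int uptodate[])
{
	int i, rmw = 0, rcw = 0;

	for (i = 0; i < disks; i++) {
		/* read needed for read-modify-write: written blocks and parity */
		if ((to_write[i] || i == pd_idx) && !uptodate[i])
			rmw += operational[i] ? 1 : 2 * disks;
		/* read needed for reconstruct-write: everything else */
		if (!to_write[i] && i != pd_idx && !uptodate[i])
			rcw += operational[i] ? 1 : 2 * disks;
	}
	return (rmw < rcw) ? RMW : RCW;	/* ties go to reconstruct-write */
}

int main(void)
{
	/* 5 disks, parity on disk 4, writing block 0, nothing cached yet:
	 * rmw = 2 reads (old data + old parity), rcw = 3 reads */
	int to_write[5] = { 1, 0, 0, 0, 0 };
	int operational[5] = { 1, 1, 1, 1, 1 };
	int uptodate[5] = { 0, 0, 0, 0, 0 };

	printf("%s\n", pick_method(5, 4, to_write, operational, uptodate) == RMW
	       ? "read-modify-write" : "reconstruct-write");
	return 0;
}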
@@ -1463,34 +1109,28 @@ static int raid5_make_request (mddev_t *mddev, int rw, struct buffer_head * bh)
     const unsigned int data_disks = raid_disks - 1;
     unsigned int dd_idx, pd_idx;
     unsigned long new_sector;
+    int read_ahead = 0;
 
     struct stripe_head *sh;
 
-    if (rw == READA)
+    if (rw == READA) {
         rw = READ;
+        read_ahead=1;
+    }
 
     new_sector = raid5_compute_sector(bh->b_rsector, raid_disks, data_disks,
                                       &dd_idx, &pd_idx, conf);
 
     PRINTK("raid5_make_request, sector %lu\n", new_sector);
-    sh = get_lock_stripe(conf, new_sector, bh->b_size);
-#if 0
-    if ((rw == READ && sh->cmd == STRIPE_WRITE) || (rw == WRITE && sh->cmd == STRIPE_READ)) {
-        PRINTK("raid5: lock contention, rw == %d, sh->cmd == %d\n", rw, sh->cmd);
-        lock_stripe(sh);
-        if (!md_atomic_read(&sh->nr_pending))
-            handle_stripe(sh);
-        goto repeat;
-    }
-#endif
-    sh->pd_idx = pd_idx;
-    if (sh->phase != PHASE_COMPLETE && sh->phase != PHASE_BEGIN)
-        PRINTK("stripe %lu catching the bus!\n", sh->sector);
-    if (sh->bh_new[dd_idx])
-        BUG();
-    add_stripe_bh(sh, bh, dd_idx, rw);
+    sh = get_active_stripe(conf, new_sector, bh->b_size, read_ahead);
+    if (sh) {
+        sh->pd_idx = pd_idx;
 
-    md_wakeup_thread(conf->thread);
+        add_stripe_bh(sh, bh, dd_idx, rw);
+        handle_stripe(sh);
+        release_stripe(sh);
+    } else
+        bh->b_end_io(bh, test_bit(BH_Uptodate, &bh->b_state));
     return 0;
 }
 
@@ -1525,22 +1165,21 @@ static int raid5_sync_request (mddev_t *mddev, unsigned long block_nr)
     int redone = 0;
     int bufsize;
 
-    if (!conf->buffer_size)
-        conf->buffer_size = /* device_bsize(mddev_to_kdev(mddev))*/ PAGE_SIZE;
-    bufsize = conf->buffer_size;
-    /* Hmm... race on buffer_size ?? */
-    redone = block_nr% (bufsize>>10);
-    block_nr -= redone;
-    sh = get_lock_stripe(conf, block_nr<<1, bufsize);
+    sh = get_active_stripe(conf, block_nr<<1, 0, 0);
+    bufsize = sh->size;
+    redone = block_nr-(sh->sector>>1);
     first_sector = raid5_compute_sector(stripe*data_disks*sectors_per_chunk
         + chunk_offset, raid_disks, data_disks, &dd_idx, &pd_idx, conf);
     sh->pd_idx = pd_idx;
-    sh->cmd = STRIPE_SYNC;
-    sh->phase = PHASE_BEGIN;
+    spin_lock(&sh->lock);
+    set_bit(STRIPE_SYNCING, &sh->state);
+    clear_bit(STRIPE_INSYNC, &sh->state);
    sh->sync_redone = redone;
-    atomic_inc(&conf->nr_pending_stripes);
-    atomic_inc(&conf->nr_handle);
-    md_wakeup_thread(conf->thread);
+    spin_unlock(&sh->lock);
+
+    handle_stripe(sh);
+    release_stripe(sh);
+
    return (bufsize>>10)-redone;
 }
 
@@ -1556,46 +1195,35 @@ static void raid5d (void *data)
     struct stripe_head *sh;
     raid5_conf_t *conf = data;
     mddev_t *mddev = conf->mddev;
-    int i, handled;
+    int handled;
 
     PRINTK("+++ raid5d active\n");
 
     handled = 0;
-    md_spin_lock_irq(&conf->device_lock);
-    clear_bit(THREAD_WAKEUP, &conf->thread->flags);
-repeat_pass:
+
     if (mddev->sb_dirty) {
-        md_spin_unlock_irq(&conf->device_lock);
         mddev->sb_dirty = 0;
         md_update_sb(mddev);
-        md_spin_lock_irq(&conf->device_lock);
     }
-    for (i = 0; i < NR_HASH; i++) {
-repeat:
-        sh = conf->stripe_hashtbl[i];
-        for (; sh; sh = sh->hash_next) {
-            if (sh->raid_conf != conf)
-                continue;
-            if (sh->phase == PHASE_COMPLETE)
-                continue;
-            if (md_atomic_read(&sh->nr_pending))
-                continue;
-            md_spin_unlock_irq(&conf->device_lock);
-            if (!atomic_read(&sh->count))
-                BUG();
+    md_spin_lock_irq(&conf->device_lock);
+    while (!list_empty(&conf->handle_list)) {
+        struct list_head *first = conf->handle_list.next;
+        sh = list_entry(first, struct stripe_head, lru);
 
-            handled++;
-            handle_stripe(sh);
-            md_spin_lock_irq(&conf->device_lock);
-            goto repeat;
-        }
-    }
-    if (conf) {
-        PRINTK("%d stripes handled, nr_handle %d\n", handled, md_atomic_read(&conf->nr_handle));
-        if (test_and_clear_bit(THREAD_WAKEUP, &conf->thread->flags) &&
-            md_atomic_read(&conf->nr_handle))
-            goto repeat_pass;
+        list_del_init(first);
+        atomic_inc(&sh->count);
+        if (atomic_read(&sh->count)!= 1)
+            BUG();
+        md_spin_unlock_irq(&conf->device_lock);
+
+        handled++;
+        handle_stripe(sh);
+        release_stripe(sh);
+
+        md_spin_lock_irq(&conf->device_lock);
     }
+    PRINTK("%d stripes handled\n", handled);
+
     md_spin_unlock_irq(&conf->device_lock);
 
     PRINTK("--- raid5d inactive\n");
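[Editor's sketch, not part of the patch: what the list_entry() call in the new raid5d() loop does. The kernel macro recovers the structure containing an embedded list_head by subtracting the member's offset. A hypothetical userspace rendition; struct stripe_head_demo and demo_list_entry are illustrative stand-ins:]

#include <stddef.h>
#include <stdio.h>

struct list_head { struct list_head *next, *prev; };
struct stripe_head_demo { long sector; struct list_head lru; };

/* same shape as the kernel's list_entry(ptr, type, member) */
#define demo_list_entry(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

int main(void)
{
	struct stripe_head_demo sh = { .sector = 42 };
	struct list_head *first = &sh.lru;	/* as taken off handle_list */
	struct stripe_head_demo *back =
		demo_list_entry(first, struct stripe_head_demo, lru);

	printf("%ld\n", back->sector);	/* prints 42 */
	return 0;
}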
@@ -1727,6 +1355,11 @@ static int raid5_run (mddev_t *mddev)
     conf->device_lock = MD_SPIN_LOCK_UNLOCKED;
     md_init_waitqueue_head(&conf->wait_for_stripe);
 
+    INIT_LIST_HEAD(&conf->handle_list);
+    INIT_LIST_HEAD(&conf->inactive_list);
+    atomic_set(&conf->active_stripes, 0);
+    conf->buffer_size = PAGE_SIZE; /* good default for rebuild */
+
     PRINTK("raid5_run(md%d) called.\n", mdidx(mddev));
 
     ITERATE_RDEV(mddev,rdev,tmp) {
@@ -1867,8 +1500,7 @@ static int raid5_run (mddev_t *mddev)
     }
 
     memory = conf->max_nr_stripes * (sizeof(struct stripe_head) +
-         conf->raid_disks * (sizeof(struct buffer_head) +
-            2 * (sizeof(struct buffer_head) + PAGE_SIZE))) / 1024;
+         conf->raid_disks * ((sizeof(struct buffer_head) + PAGE_SIZE))) / 1024;
     if (grow_stripes(conf, conf->max_nr_stripes, GFP_KERNEL)) {
         printk(KERN_ERR "raid5: couldn't allocate %dkB for buffers\n", memory);
         shrink_stripes(conf, conf->max_nr_stripes);
@@ -1971,11 +1603,10 @@ static int raid5_stop (mddev_t *mddev)
 {
     raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
 
-    shrink_stripe_cache(conf, conf->max_nr_stripes);
-    shrink_stripes(conf, conf->max_nr_stripes);
-    md_unregister_thread(conf->thread);
     if (conf->resync_thread)
         md_unregister_thread(conf->resync_thread);
+    md_unregister_thread(conf->thread);
+    shrink_stripes(conf, conf->max_nr_stripes);
     free_pages((unsigned long) conf->stripe_hashtbl, HASH_PAGES_ORDER);
     kfree(conf);
     mddev->private = NULL;
@@ -1988,23 +1619,14 @@ static void print_sh (struct stripe_head *sh)
 {
     int i;
 
-    printk("sh %lu, phase %d, size %d, pd_idx %d, state %ld, cmd %d.\n", sh->sector, sh->phase, sh->size, sh->pd_idx, sh->state, sh->cmd);
-    printk("sh %lu, write_method %d, nr_pending %d, count %d.\n", sh->sector, sh->write_method, atomic_read(&sh->nr_pending), atomic_read(&sh->count));
+    printk("sh %lu, size %d, pd_idx %d, state %ld.\n", sh->sector, sh->size, sh->pd_idx, sh->state);
+    printk("sh %lu, count %d.\n", sh->sector, atomic_read(&sh->count));
     printk("sh %lu, ", sh->sector);
     for (i = 0; i < MD_SB_DISKS; i++) {
-        if (sh->bh_old[i])
-            printk("(old%d: %p) ", i, sh->bh_old[i]);
-        if (sh->bh_new[i])
-            printk("(new%d: %p) ", i, sh->bh_new[i]);
-        if (sh->bh_copy[i])
-            printk("(copy%d: %p) ", i, sh->bh_copy[i]);
-        if (sh->bh_req[i])
-            printk("(req%d: %p) ", i, sh->bh_req[i]);
+        if (sh->bh_cache[i])
+            printk("(cache%d: %p %ld) ", i, sh->bh_cache[i], sh->bh_cache[i]->b_state);
     }
     printk("\n");
-    for (i = 0; i < MD_SB_DISKS; i++)
-        printk("%d(%d/%d) ", i, sh->cmd_new[i], sh->new[i]);
-    printk("\n");
 }
 
 static void printall (raid5_conf_t *conf)
@@ -2041,13 +1663,6 @@ static int raid5_status (char *page, mddev_t *mddev)
 #if RAID5_DEBUG
 #define D(x) \
     sz += sprintf (page+sz, "<"#x":%d>", atomic_read(&conf->x))
-    D(nr_handle);
-    D(nr_stripes);
-    D(nr_hashed_stripes);
-    D(nr_locked_stripes);
-    D(nr_pending_stripes);
-    D(nr_cached_stripes);
-    D(nr_free_sh);
     printall(conf);
 #endif
     return sz;
@@ -2066,7 +1681,11 @@ static void print_raid5_conf (raid5_conf_t *conf)
     printk(" --- rd:%d wd:%d fd:%d\n", conf->raid_disks,
          conf->working_disks, conf->failed_disks);
 
+#if RAID5_DEBUG
     for (i = 0; i < MD_SB_DISKS; i++) {
+#else
+    for (i = 0; i < conf->working_disks+conf->failed_disks; i++) {
+#endif
         tmp = conf->disks + i;
         printk(" disk %d, s:%d, o:%d, n:%d rd:%d us:%d dev:%s\n",
             i, tmp->spare,tmp->operational,