| author | Ralf Baechle <ralf@linux-mips.org> | 2001-01-10 17:17:53 +0000 |
|---|---|---|
| committer | Ralf Baechle <ralf@linux-mips.org> | 2001-01-10 17:17:53 +0000 |
| commit | b2ad5f821b1381492d792ca10b1eb7a107b48f14 (patch) | |
| tree | 954a648692e7da983db1d2470953705f6a729264 /drivers/md | |
| parent | c9c06167e7933d93a6e396174c68abf242294abb (diff) | |
Merge with Linux 2.4.0-prerelease. Big Makefile rewrite, test your
Makefiles.
Diffstat (limited to 'drivers/md')
-rw-r--r-- | drivers/md/Makefile | 17
-rw-r--r-- | drivers/md/lvm-snap.c | 216
-rw-r--r-- | drivers/md/lvm.c | 1398
-rw-r--r-- | drivers/md/raid5.c | 1755
4 files changed, 1883 insertions, 1503 deletions
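The Makefile diff that opens the series is the part the commit message warns about: the old Rules.make bookkeeping (SUB_DIRS, ALL_SUB_DIRS, MOD_SUB_DIRS, and the derived O_OBJS/M_OBJS/MIX_OBJS lists) is deleted, and the file keeps only the `obj-$(CONFIG_...) += foo.o` assignments, which the rewritten 2.4 Rules.make consumes directly. The link-order comment survives because it still matters: the raid personalities and xor.o must precede md.o.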
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index c37ce84db..041b18661 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -3,25 +3,15 @@ #

 O_TARGET := mddev.o
-SUB_DIRS :=
-ALL_SUB_DIRS :=
-MOD_SUB_DIRS :=

 export-objs := md.o xor.o
 list-multi := lvm-mod.o
 lvm-mod-objs := lvm.o lvm-snap.o

-obj-y :=
-obj-m :=
-obj-n :=
-obj- :=
-
 # Note: link order is important. All raid personalities
 # and xor.o must come before md.o, as they each initialise
 # themselves, and md.o may use the personalities when it
 # auto-initialised.
-# The use of MIX_OBJS allows link order to be maintained even
-# though some are export-objs and some aren't.

 obj-$(CONFIG_MD_LINEAR) += linear.o
 obj-$(CONFIG_MD_RAID0) += raid0.o
@@ -30,13 +20,6 @@ obj-$(CONFIG_MD_RAID5) += raid5.o xor.o
 obj-$(CONFIG_BLK_DEV_MD) += md.o
 obj-$(CONFIG_BLK_DEV_LVM) += lvm-mod.o

-# Translate to Rules.make lists.
-active-objs := $(sort $(obj-y) $(obj-m))
-
-O_OBJS := $(obj-y)
-M_OBJS := $(obj-m)
-MIX_OBJS := $(filter $(export-objs), $(active-objs))
-
 include $(TOPDIR)/Rules.make

 lvm-mod.o: $(lvm-mod-objs)
diff --git a/drivers/md/lvm-snap.c b/drivers/md/lvm-snap.c
index 04007c1be..980694ee3 100644
--- a/drivers/md/lvm-snap.c
+++ b/drivers/md/lvm-snap.c
@@ -2,13 +2,14 @@
  * kernel/lvm-snap.c
  *
  * Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE
+ *                    Heinz Mauelshagen, Sistina Software (persistent snapshots)
  *
  * LVM snapshot driver is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
  * the Free Software Foundation; either version 2, or (at your option)
  * any later version.
  *
- * LVM driver is distributed in the hope that it will be useful,
+ * LVM snapshot driver is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
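The next hunk replaces the 0.8 version string and adds lvm_pv_get_number(), which resolves a kdev_t back to the PV number that lvm_snapshot_fill_COW_page() and lvm_write_COW_table_block() later serialize into the on-disk COW table. As posted, the loop falls through to `return vg->pv[p]->pv_number;` even when no PV matched, indexing one slot past the array. A self-contained user-space sketch of the same lookup, with stand-in types (pv_t/vg_t here are simplified, not the kernel structures) and an error return instead of the fall-through:

```c
#include <stddef.h>

typedef unsigned int kdev_t;          /* stand-in for the kernel type */

typedef struct {
	kdev_t pv_dev;                /* device holding this PV */
	unsigned int pv_number;       /* number recorded in the VGDA */
} pv_t;

typedef struct {
	unsigned int pv_max;          /* size of the pv[] slot array */
	pv_t **pv;                    /* sparse: NULL means unused slot */
} vg_t;

/* Return 0 and the PV number via *number, or -1 if rdev is unknown. */
static int pv_get_number(const vg_t *vg, kdev_t rdev, unsigned int *number)
{
	unsigned int p;

	for (p = 0; p < vg->pv_max; p++) {
		if (vg->pv[p] == NULL)
			continue;                  /* skip unused slot */
		if (vg->pv[p]->pv_dev == rdev) {
			*number = vg->pv[p]->pv_number;
			return 0;
		}
	}
	return -1;   /* the patch as posted would index pv[pv_max] here */
}

int main(void)
{
	pv_t a = { 5, 1 };
	pv_t *slots[3] = { NULL, &a, NULL };
	vg_t vg = { 3, slots };
	unsigned int n;

	return (pv_get_number(&vg, 5, &n) == 0 && n == 1) ? 0 : 1;
}
```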
@@ -29,13 +30,27 @@ #include <linux/lvm.h> -static char *lvm_snap_version __attribute__ ((unused)) = "LVM 0.8final (15/02/2000)\n"; +static char *lvm_snap_version __attribute__ ((unused)) = "LVM 0.9 snapshot code (13/11/2000)\n"; extern const char *const lvm_name; extern int lvm_blocksizes[]; void lvm_snapshot_release(lv_t *); +uint lvm_pv_get_number(vg_t * vg, kdev_t rdev) +{ + uint p; + + for ( p = 0; p < vg->pv_max; p++) + { + if ( vg->pv[p] == NULL) continue; + if ( vg->pv[p]->pv_dev == rdev) break; + } + + return vg->pv[p]->pv_number; +} + + #define hashfn(dev,block,mask,chunk_size) \ ((HASHDEV(dev)^((block)/(chunk_size))) & (mask)) @@ -72,9 +87,9 @@ lvm_find_exception_table(kdev_t org_dev, unsigned long org_start, lv_t * lv) return ret; } -static inline void lvm_hash_link(lv_block_exception_t * exception, - kdev_t org_dev, unsigned long org_start, - lv_t * lv) +inline void lvm_hash_link(lv_block_exception_t * exception, + kdev_t org_dev, unsigned long org_start, + lv_t * lv) { struct list_head * hash_table = lv->lv_snapshot_hash_table; unsigned long mask = lv->lv_snapshot_hash_mask; @@ -97,7 +112,6 @@ int lvm_snapshot_remap_block(kdev_t * org_dev, unsigned long * org_sector, pe_adjustment = (*org_sector-pe_off) % chunk_size; __org_start = *org_sector - pe_adjustment; __org_dev = *org_dev; - ret = 0; exception = lvm_find_exception_table(__org_dev, __org_start, lv); if (exception) @@ -109,7 +123,7 @@ int lvm_snapshot_remap_block(kdev_t * org_dev, unsigned long * org_sector, return ret; } -static void lvm_drop_snapshot(lv_t * lv_snap, const char * reason) +void lvm_drop_snapshot(lv_t * lv_snap, const char * reason) { kdev_t last_dev; int i; @@ -118,8 +132,7 @@ static void lvm_drop_snapshot(lv_t * lv_snap, const char * reason) or error on this snapshot --> release it */ invalidate_buffers(lv_snap->lv_dev); - last_dev = 0; - for (i = 0; i < lv_snap->lv_remap_ptr; i++) { + for (i = last_dev = 0; i < lv_snap->lv_remap_ptr; i++) { if ( lv_snap->lv_block_exception[i].rdev_new != last_dev) { last_dev = lv_snap->lv_block_exception[i].rdev_new; invalidate_buffers(last_dev); @@ -149,7 +162,7 @@ static inline void lvm_snapshot_prepare_blocks(unsigned long * blocks, blocks[i] = start++; } -static inline int get_blksize(kdev_t dev) +inline int lvm_get_blksize(kdev_t dev) { int correct_size = BLOCK_SIZE, i, major; @@ -185,6 +198,133 @@ static inline void invalidate_snap_cache(unsigned long start, unsigned long nr, } #endif + +void lvm_snapshot_fill_COW_page(vg_t * vg, lv_t * lv_snap) +{ + int id = 0, is = lv_snap->lv_remap_ptr; + ulong blksize_snap; + lv_COW_table_disk_t * lv_COW_table = + ( lv_COW_table_disk_t *) page_address(lv_snap->lv_COW_table_page); + + if (is == 0) return; + is--; + blksize_snap = lvm_get_blksize(lv_snap->lv_block_exception[is].rdev_new); + is -= is % (blksize_snap / sizeof(lv_COW_table_disk_t)); + + memset(lv_COW_table, 0, blksize_snap); + for ( ; is < lv_snap->lv_remap_ptr; is++, id++) { + /* store new COW_table entry */ + lv_COW_table[id].pv_org_number = LVM_TO_DISK64(lvm_pv_get_number(vg, lv_snap->lv_block_exception[is].rdev_org)); + lv_COW_table[id].pv_org_rsector = LVM_TO_DISK64(lv_snap->lv_block_exception[is].rsector_org); + lv_COW_table[id].pv_snap_number = LVM_TO_DISK64(lvm_pv_get_number(vg, lv_snap->lv_block_exception[is].rdev_new)); + lv_COW_table[id].pv_snap_rsector = LVM_TO_DISK64(lv_snap->lv_block_exception[is].rsector_new); + } +} + + +/* + * writes a COW exception table sector to disk (HM) + * + */ + +int lvm_write_COW_table_block(vg_t * vg, + lv_t * lv_snap) 
+{ + int blksize_snap; + int end_of_table; + int idx = lv_snap->lv_remap_ptr, idx_COW_table; + int nr_pages_tmp; + int length_tmp; + ulong snap_pe_start, COW_table_sector_offset, + COW_entries_per_pe, COW_chunks_per_pe, COW_entries_per_block; + ulong blocks[1]; + const char * reason; + kdev_t snap_phys_dev; + struct kiobuf * iobuf = lv_snap->lv_iobuf; + struct page * page_tmp; + lv_COW_table_disk_t * lv_COW_table = + ( lv_COW_table_disk_t *) page_address(lv_snap->lv_COW_table_page); + + idx--; + + COW_chunks_per_pe = LVM_GET_COW_TABLE_CHUNKS_PER_PE(vg, lv_snap); + COW_entries_per_pe = LVM_GET_COW_TABLE_ENTRIES_PER_PE(vg, lv_snap); + + /* get physical addresse of destination chunk */ + snap_phys_dev = lv_snap->lv_block_exception[idx].rdev_new; + snap_pe_start = lv_snap->lv_block_exception[idx - (idx % COW_entries_per_pe)].rsector_new - lv_snap->lv_chunk_size; + + blksize_snap = lvm_get_blksize(snap_phys_dev); + + COW_entries_per_block = blksize_snap / sizeof(lv_COW_table_disk_t); + idx_COW_table = idx % COW_entries_per_pe % COW_entries_per_block; + + if ( idx_COW_table == 0) memset(lv_COW_table, 0, blksize_snap); + + /* sector offset into the on disk COW table */ + COW_table_sector_offset = (idx % COW_entries_per_pe) / (SECTOR_SIZE / sizeof(lv_COW_table_disk_t)); + + /* COW table block to write next */ + blocks[0] = (snap_pe_start + COW_table_sector_offset) >> (blksize_snap >> 10); + + /* store new COW_table entry */ + lv_COW_table[idx_COW_table].pv_org_number = LVM_TO_DISK64(lvm_pv_get_number(vg, lv_snap->lv_block_exception[idx].rdev_org)); + lv_COW_table[idx_COW_table].pv_org_rsector = LVM_TO_DISK64(lv_snap->lv_block_exception[idx].rsector_org); + lv_COW_table[idx_COW_table].pv_snap_number = LVM_TO_DISK64(lvm_pv_get_number(vg, snap_phys_dev)); + lv_COW_table[idx_COW_table].pv_snap_rsector = LVM_TO_DISK64(lv_snap->lv_block_exception[idx].rsector_new); + + length_tmp = iobuf->length; + iobuf->length = blksize_snap; + page_tmp = iobuf->maplist[0]; + iobuf->maplist[0] = lv_snap->lv_COW_table_page; + nr_pages_tmp = iobuf->nr_pages; + iobuf->nr_pages = 1; + + if (brw_kiovec(WRITE, 1, &iobuf, snap_phys_dev, + blocks, blksize_snap) != blksize_snap) + goto fail_raw_write; + + + /* initialization of next COW exception table block with zeroes */ + end_of_table = idx % COW_entries_per_pe == COW_entries_per_pe - 1; + if (idx_COW_table % COW_entries_per_block == COW_entries_per_block - 1 || end_of_table) + { + /* don't go beyond the end */ + if (idx + 1 >= lv_snap->lv_remap_end) goto good_out; + + memset(lv_COW_table, 0, blksize_snap); + + if (end_of_table) + { + idx++; + snap_phys_dev = lv_snap->lv_block_exception[idx].rdev_new; + snap_pe_start = lv_snap->lv_block_exception[idx - (idx % COW_entries_per_pe)].rsector_new - lv_snap->lv_chunk_size; + blksize_snap = lvm_get_blksize(snap_phys_dev); + blocks[0] = snap_pe_start >> (blksize_snap >> 10); + } else blocks[0]++; + + if (brw_kiovec(WRITE, 1, &iobuf, snap_phys_dev, + blocks, blksize_snap) != blksize_snap) + goto fail_raw_write; + } + + + good_out: + iobuf->length = length_tmp; + iobuf->maplist[0] = page_tmp; + iobuf->nr_pages = nr_pages_tmp; + return 0; + + /* slow path */ + out: + lvm_drop_snapshot(lv_snap, reason); + return 1; + + fail_raw_write: + reason = "write error"; + goto out; +} + /* * copy on write handler for one snapshot logical volume * @@ -200,9 +340,8 @@ int lvm_snapshot_COW(kdev_t org_phys_dev, lv_t * lv_snap) { const char * reason; - unsigned long org_start, snap_start, virt_start, pe_off; + unsigned long org_start, snap_start, 
snap_phys_dev, virt_start, pe_off; int idx = lv_snap->lv_remap_ptr, chunk_size = lv_snap->lv_chunk_size; - kdev_t snap_phys_dev; struct kiobuf * iobuf; unsigned long blocks[KIO_MAX_SECTORS]; int blksize_snap, blksize_org, min_blksize, max_blksize; @@ -238,8 +377,8 @@ int lvm_snapshot_COW(kdev_t org_phys_dev, iobuf = lv_snap->lv_iobuf; - blksize_org = get_blksize(org_phys_dev); - blksize_snap = get_blksize(snap_phys_dev); + blksize_org = lvm_get_blksize(org_phys_dev); + blksize_snap = lvm_get_blksize(snap_phys_dev); max_blksize = max(blksize_org, blksize_snap); min_blksize = min(blksize_org, blksize_snap); max_sectors = KIO_MAX_SECTORS * (min_blksize>>9); @@ -268,7 +407,7 @@ int lvm_snapshot_COW(kdev_t org_phys_dev, } #ifdef DEBUG_SNAPSHOT - /* invalidate the logcial snapshot buffer cache */ + /* invalidate the logical snapshot buffer cache */ invalidate_snap_cache(virt_start, lv_snap->lv_chunk_size, lv_snap->lv_dev); #endif @@ -277,15 +416,20 @@ int lvm_snapshot_COW(kdev_t org_phys_dev, so update the execption table */ lv_snap->lv_block_exception[idx].rdev_org = org_phys_dev; lv_snap->lv_block_exception[idx].rsector_org = org_start; + lvm_hash_link(lv_snap->lv_block_exception + idx, org_phys_dev, org_start, lv_snap); lv_snap->lv_remap_ptr = idx + 1; - return 1; + if (lv_snap->lv_snapshot_use_rate > 0) { + if (lv_snap->lv_remap_ptr * 100 / lv_snap->lv_remap_end >= lv_snap->lv_snapshot_use_rate) + wake_up_interruptible(&lv_snap->lv_snapshot_wait); + } + return 0; /* slow path */ out: lvm_drop_snapshot(lv_snap, reason); - return -1; + return 1; fail_out_of_space: reason = "out of space"; @@ -301,7 +445,7 @@ int lvm_snapshot_COW(kdev_t org_phys_dev, goto out; } -static int lvm_snapshot_alloc_iobuf_pages(struct kiobuf * iobuf, int sectors) +int lvm_snapshot_alloc_iobuf_pages(struct kiobuf * iobuf, int sectors) { int bytes, nr_pages, err, i; @@ -312,33 +456,17 @@ static int lvm_snapshot_alloc_iobuf_pages(struct kiobuf * iobuf, int sectors) goto out; err = -ENOMEM; - iobuf->locked = 1; + iobuf->locked = 0; iobuf->nr_pages = 0; for (i = 0; i < nr_pages; i++) { struct page * page; -#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,3,27) page = alloc_page(GFP_KERNEL); if (!page) goto out; -#else - { - unsigned long addr = __get_free_page(GFP_USER); - if (!addr) - goto out; - iobuf->pagelist[i] = addr; - page = virt_to_page(addr); - } -#endif iobuf->maplist[i] = page; - /* the only point to lock the page here is to be allowed - to share unmap_kiobuf() in the fail-path */ -#ifndef LockPage -#define LockPage(map) set_bit(PG_locked, &(map)->flags) -#endif - LockPage(page); iobuf->nr_pages++; } iobuf->offset = 0; @@ -360,7 +488,7 @@ static int calc_max_buckets(void) return mem; } -static int lvm_snapshot_alloc_hash_table(lv_t * lv) +int lvm_snapshot_alloc_hash_table(lv_t * lv) { int err; unsigned long buckets, max_buckets, size; @@ -380,6 +508,7 @@ static int lvm_snapshot_alloc_hash_table(lv_t * lv) if (!hash) goto out; + lv->lv_snapshot_hash_table_size = size; lv->lv_snapshot_hash_mask = buckets-1; while (buckets--) @@ -407,12 +536,20 @@ int lvm_snapshot_alloc(lv_t * lv_snap) err = lvm_snapshot_alloc_hash_table(lv_snap); if (err) goto out_free_kiovec; + + + lv_snap->lv_COW_table_page = alloc_page(GFP_KERNEL); + if (!lv_snap->lv_COW_table_page) + goto out_free_kiovec; + out: return err; out_free_kiovec: unmap_kiobuf(lv_snap->lv_iobuf); free_kiovec(1, &lv_snap->lv_iobuf); + vfree(lv_snap->lv_snapshot_hash_table); + lv_snap->lv_snapshot_hash_table = NULL; goto out; } @@ -427,10 +564,17 @@ void 
lvm_snapshot_release(lv_t * lv) { vfree(lv->lv_snapshot_hash_table); lv->lv_snapshot_hash_table = NULL; + lv->lv_snapshot_hash_table_size = 0; } if (lv->lv_iobuf) { + unmap_kiobuf(lv->lv_iobuf); free_kiovec(1, &lv->lv_iobuf); lv->lv_iobuf = NULL; } + if (lv->lv_COW_table_page) + { + free_page((ulong)lv->lv_COW_table_page); + lv->lv_COW_table_page = NULL; + } } diff --git a/drivers/md/lvm.c b/drivers/md/lvm.c index f9433232e..ea276c57c 100644 --- a/drivers/md/lvm.c +++ b/drivers/md/lvm.c @@ -1,12 +1,12 @@ /* * kernel/lvm.c * - * Copyright (C) 1997 - 2000 Heinz Mauelshagen, Germany + * Copyright (C) 1997 - 2000 Heinz Mauelshagen, Sistina Software * * February-November 1997 * April-May,July-August,November 1998 * January-March,May,July,September,October 1999 - * January,February 2000 + * January,February,July,September-November 2000 * * * LVM driver is free software; you can redistribute it and/or modify @@ -38,7 +38,7 @@ * lvm_status_byindex_req_t vars * 04/05/1998 - added multiple device support * 08/05/1998 - added support to set/clear extendable flag in volume group - * 09/05/1998 - changed output of lvm_proc_get_info() because of + * 09/05/1998 - changed output of lvm_proc_get_global_info() because of * support for free (eg. longer) logical volume names * 12/05/1998 - added spin_locks (thanks to Pascal van Dam * <pascal@ramoth.xs4all.nl>) @@ -122,18 +122,36 @@ * - avoided "/dev/" in proc filesystem output * - avoided inline strings functions lvm_strlen etc. * 14/02/2000 - support for 2.3.43 - * - integrated Andrea Arcangeli's snapshot code + * - integrated Andrea Arcagneli's snapshot code + * 25/06/2000 - james (chip) , IKKHAYD! roffl + * 26/06/2000 - enhanced lv_extend_reduce for snapshot logical volume support + * 06/09/2000 - added devfs support + * 07/09/2000 - changed IOP version to 9 + * - started to add new char ioctl LV_STATUS_BYDEV_T to support + * getting an lv_t based on the dev_t of the Logical Volume + * 14/09/2000 - enhanced lvm_do_lv_create to upcall VFS functions + * to sync and lock, activate snapshot and unlock the FS + * (to support journaled filesystems) + * 18/09/2000 - hardsector size support + * 27/09/2000 - implemented lvm_do_lv_rename() and lvm_do_vg_rename() + * 30/10/2000 - added Andi Kleen's LV_BMAP ioctl to support LILO + * 01/11/2000 - added memory information on hash tables to + * lvm_proc_get_global_info() + * 02/11/2000 - implemented /proc/lvm/ hierarchy * 07/12/2000 - make sure lvm_make_request_fn returns correct value - 0 or 1 - NeilBrown * */ -static char *lvm_version = "LVM version 0.8final by Heinz Mauelshagen (15/02/2000)\n"; -static char *lvm_short_version = "version 0.8final (15/02/2000)"; +static char *lvm_version = "LVM version 0.9 by Heinz Mauelshagen (13/11/2000)\n"; +static char *lvm_short_version = "version 0.9 (13/11/2000)"; #define MAJOR_NR LVM_BLK_MAJOR #define DEVICE_OFF(device) +/* lvm_do_lv_create calls fsync_dev_lockfs()/unlockfs() */ +/* #define LVM_VFS_ENHANCEMENT */ + #include <linux/config.h> #include <linux/version.h> @@ -166,17 +184,15 @@ static char *lvm_short_version = "version 0.8final (15/02/2000)"; #include <linux/kerneld.h> #endif -#define LOCAL_END_REQUEST - #include <linux/blk.h> #include <linux/blkpg.h> #include <linux/errno.h> #include <linux/lvm.h> -#define LVM_CORRECT_READ_AHEAD(a) \ - (((a) < LVM_MIN_READ_AHEAD || (a) > LVM_MAX_READ_AHEAD) \ - ? 
LVM_MAX_READ_AHEAD : (a)) +#define LVM_CORRECT_READ_AHEAD( a) \ + if ( a < LVM_MIN_READ_AHEAD || \ + a > LVM_MAX_READ_AHEAD) a = LVM_MAX_READ_AHEAD; #ifndef WRITEA # define WRITEA WRITE @@ -195,8 +211,7 @@ extern int lvm_init(void); static void lvm_dummy_device_request(request_queue_t *); #define DEVICE_REQUEST lvm_dummy_device_request -static int lvm_make_request_fn(request_queue_t *, int, struct buffer_head*); -static void lvm_plug_device_noop(request_queue_t *, kdev_t); +static int lvm_make_request_fn(request_queue_t*, int, struct buffer_head*); static int lvm_blk_ioctl(struct inode *, struct file *, uint, ulong); static int lvm_blk_open(struct inode *, struct file *); @@ -205,13 +220,21 @@ static int lvm_chr_open(struct inode *, struct file *); static int lvm_chr_close(struct inode *, struct file *); static int lvm_blk_close(struct inode *, struct file *); +static int lvm_user_bmap(struct inode *, struct lv_bmap *); static int lvm_chr_ioctl(struct inode *, struct file *, uint, ulong); #if defined CONFIG_LVM_PROC_FS && defined CONFIG_PROC_FS -static int lvm_proc_get_info(char *, char **, off_t, int); -static int (*lvm_proc_get_info_ptr) (char *, char **, off_t, int) = -&lvm_proc_get_info; +int lvm_proc_read_vg_info(char *, char **, off_t, int, int *, void *); +int lvm_proc_read_lv_info(char *, char **, off_t, int, int *, void *); +int lvm_proc_read_pv_info(char *, char **, off_t, int, int *, void *); +static int lvm_proc_get_global_info(char *, char **, off_t, int, int *, void *); +void lvm_do_create_proc_entry_of_vg ( vg_t *); +inline void lvm_do_remove_proc_entry_of_vg ( vg_t *); +inline void lvm_do_create_proc_entry_of_lv ( vg_t *, lv_t *); +inline void lvm_do_remove_proc_entry_of_lv ( vg_t *, lv_t *); +inline void lvm_do_create_proc_entry_of_pv ( vg_t *, pv_t *); +inline void lvm_do_remove_proc_entry_of_pv ( vg_t *, pv_t *); #endif #ifdef LVM_HD_NAME @@ -226,10 +249,16 @@ void lvm_hd_name(char *, int); static void lvm_init_vars(void); /* external snapshot calls */ -int lvm_snapshot_remap_block(kdev_t *, ulong *, ulong, lv_t *); -int lvm_snapshot_COW(kdev_t, ulong, ulong, ulong, lv_t *); -int lvm_snapshot_alloc(lv_t *); -void lvm_snapshot_release(lv_t *); +extern inline int lvm_get_blksize(kdev_t); +extern int lvm_snapshot_alloc(lv_t *); +extern void lvm_snapshot_fill_COW_page(vg_t *, lv_t *); +extern int lvm_snapshot_COW(kdev_t, ulong, ulong, ulong, lv_t *); +extern int lvm_snapshot_remap_block(kdev_t *, ulong *, ulong, lv_t *); +extern void lvm_snapshot_release(lv_t *); +extern int lvm_write_COW_table_block(vg_t *, lv_t *); +extern inline void lvm_hash_link(lv_block_exception_t *, kdev_t, ulong, lv_t *); +extern int lvm_snapshot_alloc_hash_table(lv_t *); +extern void lvm_drop_snapshot(lv_t *, char *); #ifdef LVM_HD_NAME extern void (*lvm_hd_name_ptr) (char *, int); @@ -237,21 +266,30 @@ extern void (*lvm_hd_name_ptr) (char *, int); static int lvm_map(struct buffer_head *, int); static int lvm_do_lock_lvm(void); static int lvm_do_le_remap(vg_t *, void *); -static int lvm_do_pe_lock_unlock(vg_t *r, void *); -static int lvm_do_vg_create(int, void *); -static int lvm_do_vg_extend(vg_t *, void *); -static int lvm_do_vg_reduce(vg_t *, void *); -static int lvm_do_vg_remove(int); + +static int lvm_do_pv_create(pv_t *, vg_t *, ulong); +static int lvm_do_pv_remove(vg_t *, ulong); static int lvm_do_lv_create(int, char *, lv_t *); -static int lvm_do_lv_remove(int, char *, int); static int lvm_do_lv_extend_reduce(int, char *, lv_t *); +static int lvm_do_lv_remove(int, char *, int); +static int 
lvm_do_lv_rename(vg_t *, lv_req_t *, lv_t *); static int lvm_do_lv_status_byname(vg_t *r, void *); -static int lvm_do_lv_status_byindex(vg_t *, void *arg); +static int lvm_do_lv_status_byindex(vg_t *, void *); +static int lvm_do_lv_status_bydev(vg_t *, void *); + +static int lvm_do_pe_lock_unlock(vg_t *r, void *); + static int lvm_do_pv_change(vg_t*, void*); static int lvm_do_pv_status(vg_t *, void *); + +static int lvm_do_vg_create(int, void *); +static int lvm_do_vg_extend(vg_t *, void *); +static int lvm_do_vg_reduce(vg_t *, void *); +static int lvm_do_vg_rename(vg_t *, void *); +static int lvm_do_vg_remove(int); static void lvm_geninit(struct gendisk *); #ifdef LVM_GET_INODE -static struct inode *lvm_get_inode(kdev_t); +static struct inode *lvm_get_inode(int); void lvm_clear_inode(struct inode *); #endif /* END Internal function prototypes */ @@ -259,10 +297,19 @@ void lvm_clear_inode(struct inode *); /* volume group descriptor area pointers */ static vg_t *vg[ABS_MAX_VG]; + +#ifdef CONFIG_DEVFS_FS +static devfs_handle_t lvm_devfs_handle; +static devfs_handle_t vg_devfs_handle[MAX_VG]; +static devfs_handle_t ch_devfs_handle[MAX_VG]; +static devfs_handle_t lv_devfs_handle[MAX_LV]; +#endif + static pv_t *pvp = NULL; static lv_t *lvp = NULL; static pe_t *pep = NULL; static pe_t *pep1 = NULL; +static char *basename = NULL; /* map from block minor number to VG and LV numbers */ @@ -287,7 +334,6 @@ static int lvm_reset_spindown = 0; static char pv_name[NAME_LEN]; /* static char rootvg[NAME_LEN] = { 0, }; */ -static uint lv_open = 0; const char *const lvm_name = LVM_NAME; static int lock = 0; static int loadtime = 0; @@ -299,27 +345,31 @@ static DECLARE_WAIT_QUEUE_HEAD(lvm_wait); static DECLARE_WAIT_QUEUE_HEAD(lvm_map_wait); static spinlock_t lvm_lock = SPIN_LOCK_UNLOCKED; +static spinlock_t lvm_snapshot_lock = SPIN_LOCK_UNLOCKED; -static devfs_handle_t lvm_devfs_handle; -static devfs_handle_t vg_devfs_handle[MAX_VG]; -static devfs_handle_t ch_devfs_handle[MAX_VG]; -static devfs_handle_t lv_devfs_handle[MAX_LV]; +#if defined CONFIG_LVM_PROC_FS && defined CONFIG_PROC_FS +static struct proc_dir_entry *lvm_proc_dir = NULL; +static struct proc_dir_entry *lvm_proc_vg_subdir = NULL; +struct proc_dir_entry *pde = NULL; +#endif static struct file_operations lvm_chr_fops = { - owner: THIS_MODULE, open: lvm_chr_open, release: lvm_chr_close, ioctl: lvm_chr_ioctl, }; +#define BLOCK_DEVICE_OPERATIONS +/* block device operations structure needed for 2.3.38? 
and above */ static struct block_device_operations lvm_blk_dops = { open: lvm_blk_open, release: lvm_blk_close, - ioctl: lvm_blk_ioctl + ioctl: lvm_blk_ioctl, }; + /* gendisk structures */ static struct hd_struct lvm_hd_struct[MAX_LV]; static int lvm_blocksizes[MAX_LV] = @@ -364,21 +414,32 @@ int __init lvm_init(void) printk(KERN_ERR "%s -- register_chrdev failed\n", lvm_name); return -EIO; } - if (register_blkdev(MAJOR_NR, lvm_name, &lvm_blk_dops) < 0) { +#ifdef BLOCK_DEVICE_OPERATIONS + if (register_blkdev(MAJOR_NR, lvm_name, &lvm_blk_dops) < 0) +#else + if (register_blkdev(MAJOR_NR, lvm_name, &lvm_blk_fops) < 0) +#endif + { printk("%s -- register_blkdev failed\n", lvm_name); if (unregister_chrdev(LVM_CHAR_MAJOR, lvm_name) < 0) printk(KERN_ERR "%s -- unregister_chrdev failed\n", lvm_name); return -EIO; } +#ifdef CONFIG_DEVFS_FS lvm_devfs_handle = devfs_register( 0 , "lvm", 0, 0, LVM_CHAR_MAJOR, S_IFCHR | S_IRUSR | S_IWUSR | S_IRGRP, &lvm_chr_fops, NULL); +#endif #if defined CONFIG_LVM_PROC_FS && defined CONFIG_PROC_FS - create_proc_info_entry(LVM_NAME, S_IFREG | S_IRUGO, - &proc_root, lvm_proc_get_info_ptr); + lvm_proc_dir = create_proc_entry (LVM_DIR, S_IFDIR, &proc_root); + if (lvm_proc_dir != NULL) { + lvm_proc_vg_subdir = create_proc_entry (LVM_VG_SUBDIR, S_IFDIR, lvm_proc_dir); + pde = create_proc_entry(LVM_GLOBAL, S_IFREG, lvm_proc_dir); + if ( pde != NULL) pde->read_proc = &lvm_proc_get_global_info; + } #endif lvm_init_vars(); @@ -405,7 +466,7 @@ int __init lvm_init(void) blk_init_queue(BLK_DEFAULT_QUEUE(MAJOR_NR), DEVICE_REQUEST); blk_queue_make_request(BLK_DEFAULT_QUEUE(MAJOR_NR), lvm_make_request_fn); - blk_queue_pluggable(BLK_DEFAULT_QUEUE(MAJOR_NR), lvm_plug_device_noop); + /* optional read root VGDA */ /* if ( *rootvg != 0) vg_read_with_pv_and_lv ( rootvg, &vg); @@ -433,7 +494,9 @@ void cleanup_module(void) { struct gendisk *gendisk_ptr = NULL, *gendisk_ptr_prev = NULL; +#ifdef CONFIG_DEVFS_FS devfs_unregister (lvm_devfs_handle); +#endif if (unregister_chrdev(LVM_CHAR_MAJOR, lvm_name) < 0) { printk(KERN_ERR "%s -- unregister_chrdev failed\n", lvm_name); @@ -456,9 +519,12 @@ void cleanup_module(void) blk_size[MAJOR_NR] = NULL; blksize_size[MAJOR_NR] = NULL; + hardsect_size[MAJOR_NR] = NULL; #if defined CONFIG_LVM_PROC_FS && defined CONFIG_PROC_FS - remove_proc_entry(LVM_NAME, &proc_root); + remove_proc_entry(LVM_GLOBAL, lvm_proc_dir); + remove_proc_entry(LVM_VG_SUBDIR, lvm_proc_dir); + remove_proc_entry(LVM_DIR, &proc_root); #endif #ifdef LVM_HD_NAME @@ -486,8 +552,11 @@ void __init lvm_init_vars(void) loadtime = CURRENT_TIME; + lvm_lock = lvm_snapshot_lock = SPIN_LOCK_UNLOCKED; + pe_lock_req.lock = UNLOCK_PE; - pe_lock_req.data.lv_dev = pe_lock_req.data.pv_dev = 0; + pe_lock_req.data.lv_dev = \ + pe_lock_req.data.pv_dev = \ pe_lock_req.data.pv_offset = 0; /* Initialize VG pointers */ @@ -531,6 +600,9 @@ static int lvm_chr_open(struct inode *inode, if (VG_CHR(minor) > MAX_VG) return -ENXIO; lvm_chr_open_count++; + + MOD_INC_USE_COUNT; + return 0; } /* lvm_chr_open() */ @@ -592,7 +664,7 @@ static int lvm_chr_ioctl(struct inode *inode, struct file *file, MOD_INC_USE_COUNT; while (GET_USE_COUNT(&__this_module) > 1) MOD_DEC_USE_COUNT; -#endif /* MODULE */ +#endif /* MODULE */ lock = 0; /* release lock */ wake_up_interruptible(&lvm_wait); return 0; @@ -612,17 +684,21 @@ static int lvm_chr_ioctl(struct inode *inode, struct file *file, /* create a VGDA */ return lvm_do_vg_create(minor, arg); - case VG_REMOVE: - /* remove an inactive VGDA */ - return lvm_do_vg_remove(minor); - case 
VG_EXTEND: /* extend a volume group */ - return lvm_do_vg_extend(vg_ptr,arg); + return lvm_do_vg_extend(vg_ptr, arg); case VG_REDUCE: /* reduce a volume group */ - return lvm_do_vg_reduce(vg_ptr,arg); + return lvm_do_vg_reduce(vg_ptr, arg); + + case VG_RENAME: + /* rename a volume group */ + return lvm_do_vg_rename(vg_ptr, arg); + + case VG_REMOVE: + /* remove an inactive VGDA */ + return lvm_do_vg_remove(minor); case VG_SET_EXTENDABLE: @@ -660,20 +736,22 @@ static int lvm_chr_ioctl(struct inode *inode, struct file *file, /* get volume group count */ for (l = v = 0; v < ABS_MAX_VG; v++) { if (vg[v] != NULL) { - if (copy_to_user(arg + l++ * NAME_LEN, + if (copy_to_user(arg + l * NAME_LEN, vg[v]->vg_name, NAME_LEN) != 0) return -EFAULT; + l++; } } return 0; case LV_CREATE: - case LV_REMOVE: case LV_EXTEND: case LV_REDUCE: - /* create, remove, extend or reduce a logical volume */ + case LV_REMOVE: + case LV_RENAME: + /* create, extend, reduce, remove or rename a logical volume */ if (vg_ptr == NULL) return -ENXIO; if (copy_from_user(&lv_req, arg, sizeof(lv_req)) != 0) return -EFAULT; @@ -686,52 +764,54 @@ static int lvm_chr_ioctl(struct inode *inode, struct file *file, case LV_CREATE: return lvm_do_lv_create(minor, lv_req.lv_name, &lv); - case LV_REMOVE: - return lvm_do_lv_remove(minor, lv_req.lv_name, -1); - case LV_EXTEND: case LV_REDUCE: return lvm_do_lv_extend_reduce(minor, lv_req.lv_name, &lv); + case LV_REMOVE: + return lvm_do_lv_remove(minor, lv_req.lv_name, -1); + + case LV_RENAME: + return lvm_do_lv_rename(vg_ptr, &lv_req, &lv); } + + case LV_STATUS_BYNAME: /* get status of a logical volume by name */ - return lvm_do_lv_status_byname(vg_ptr,arg); + return lvm_do_lv_status_byname(vg_ptr, arg); + case LV_STATUS_BYINDEX: /* get status of a logical volume by index */ - return lvm_do_lv_status_byindex(vg_ptr,arg); + return lvm_do_lv_status_byindex(vg_ptr, arg); + + + case LV_STATUS_BYDEV: + return lvm_do_lv_status_bydev(vg_ptr, arg); + case PV_CHANGE: /* change a physical volume */ return lvm_do_pv_change(vg_ptr,arg); + case PV_STATUS: /* get physical volume data (pv_t structure only) */ return lvm_do_pv_status(vg_ptr,arg); + case PV_FLUSH: /* physical volume buffer flush/invalidate */ if (copy_from_user(&pv_flush_req, arg, sizeof(pv_flush_req)) != 0) return -EFAULT; - for ( v = 0; v < ABS_MAX_VG; v++) { - unsigned int p; - if ( vg[v] == NULL) continue; - for ( p = 0; p < vg[v]->pv_max; p++) { - if ( vg[v]->pv[p] != NULL && - strcmp ( vg[v]->pv[p]->pv_name, - pv_flush_req.pv_name) == 0) { - fsync_dev ( vg[v]->pv[p]->pv_dev); - invalidate_buffers ( vg[v]->pv[p]->pv_dev); - return 0; - } - } - } + fsync_dev(pv_flush_req.pv_dev); + invalidate_buffers(pv_flush_req.pv_dev); return 0; + default: printk(KERN_WARNING "%s -- lvm_chr_ioctl: unknown command %x\n", @@ -754,11 +834,10 @@ static int lvm_chr_close(struct inode *inode, struct file *file) "%s -- lvm_chr_close VG#: %d\n", lvm_name, VG_CHR(minor)); #endif - lock_kernel(); #ifdef LVM_TOTAL_RESET if (lvm_reset_spindown > 0) { lvm_reset_spindown = 0; - lvm_chr_open_count = 1; + lvm_chr_open_count = 0; } #endif @@ -767,7 +846,8 @@ static int lvm_chr_close(struct inode *inode, struct file *file) lock = 0; /* release lock */ wake_up_interruptible(&lvm_wait); } - unlock_kernel(); + + MOD_DEC_USE_COUNT; return 0; } /* lvm_chr_close() */ @@ -815,6 +895,10 @@ static int lvm_blk_open(struct inode *inode, struct file *file) if (!(lv_ptr->lv_access & LV_WRITE)) return -EACCES; } +#ifndef BLOCK_DEVICE_OPERATIONS + file->f_op = &lvm_blk_fops; +#endif + /* 
be sure to increment VG counter */ if (lv_ptr->lv_open == 0) vg_ptr->lv_open++; lv_ptr->lv_open++; @@ -863,7 +947,7 @@ static int lvm_blk_ioctl(struct inode *inode, struct file *file, lvm_name, lv_ptr->lv_size); #endif if (put_user(lv_ptr->lv_size, (long *)arg)) - return -EFAULT; + return -EFAULT; break; @@ -892,7 +976,7 @@ static int lvm_blk_ioctl(struct inode *inode, struct file *file, if ((long) arg < LVM_MIN_READ_AHEAD || (long) arg > LVM_MAX_READ_AHEAD) return -EINVAL; - read_ahead[MAJOR_NR] = lv_ptr->lv_read_ahead = (long) arg; + lv_ptr->lv_read_ahead = (long) arg; break; @@ -944,6 +1028,10 @@ static int lvm_blk_ioctl(struct inode *inode, struct file *file, /* set access flags of a logical volume */ if (!capable(CAP_SYS_ADMIN)) return -EACCES; lv_ptr->lv_access = (ulong) arg; + if ( lv_ptr->lv_access & LV_WRITE) + set_device_ro(lv_ptr->lv_dev, 0); + else + set_device_ro(lv_ptr->lv_dev, 1); break; @@ -955,6 +1043,10 @@ static int lvm_blk_ioctl(struct inode *inode, struct file *file, lv_ptr->lv_status = (ulong) arg; break; + case LV_BMAP: + /* turn logical block into (dev_t, block). non privileged. */ + return lvm_user_bmap(inode, (struct lv_bmap *) arg); + break; case LV_SET_ALLOCATION: /* set allocation flags of a logical volume */ @@ -962,6 +1054,37 @@ static int lvm_blk_ioctl(struct inode *inode, struct file *file, lv_ptr->lv_allocation = (ulong) arg; break; + case LV_SNAPSHOT_USE_RATE: + if (!(lv_ptr->lv_access & LV_SNAPSHOT)) return -EPERM; + { + lv_snapshot_use_rate_req_t lv_snapshot_use_rate_req; + + if (copy_from_user(&lv_snapshot_use_rate_req, arg, + sizeof(lv_snapshot_use_rate_req_t))) + return -EFAULT; + if (lv_snapshot_use_rate_req.rate < 0 || + lv_snapshot_use_rate_req.rate > 100) return -EFAULT; + + switch (lv_snapshot_use_rate_req.block) + { + case 0: + lv_ptr->lv_snapshot_use_rate = lv_snapshot_use_rate_req.rate; + if (lv_ptr->lv_remap_ptr * 100 / lv_ptr->lv_remap_end < lv_ptr->lv_snapshot_use_rate) + interruptible_sleep_on (&lv_ptr->lv_snapshot_wait); + break; + + case O_NONBLOCK: + break; + + default: + return -EFAULT; + } + lv_snapshot_use_rate_req.rate = lv_ptr->lv_remap_ptr * 100 / lv_ptr->lv_remap_end; + if (copy_to_user(arg, &lv_snapshot_use_rate_req, + sizeof(lv_snapshot_use_rate_req_t))) + return -EFAULT; + } + break; default: printk(KERN_WARNING @@ -999,20 +1122,163 @@ static int lvm_blk_close(struct inode *inode, struct file *file) } /* lvm_blk_close() */ +static int lvm_user_bmap(struct inode *inode, struct lv_bmap *user_result) +{ + struct buffer_head bh; + unsigned long block; + int err; + + if (get_user(block, &user_result->lv_block)) + return -EFAULT; + + memset(&bh,0,sizeof bh); + bh.b_rsector = block; + bh.b_dev = bh.b_rdev = inode->i_dev; + bh.b_size = lvm_get_blksize(bh.b_dev); + if ((err=lvm_map(&bh, READ)) < 0) { + printk("lvm map failed: %d\n", err); + return -EINVAL; + } + + return put_user( kdev_t_to_nr(bh.b_rdev), &user_result->lv_dev) || + put_user(bh.b_rsector, &user_result->lv_block) ? 
-EFAULT : 0; +} + + +/* + * provide VG info for proc filesystem use (global) + */ +int lvm_vg_info(vg_t *vg_ptr, char *buf) { + int sz = 0; + char inactive_flag = ' '; + + if (!(vg_ptr->vg_status & VG_ACTIVE)) inactive_flag = 'I'; + sz = sprintf(buf, + "\nVG: %c%s [%d PV, %d LV/%d open] " + " PE Size: %d KB\n" + " Usage [KB/PE]: %d /%d total " + "%d /%d used %d /%d free", + inactive_flag, + vg_ptr->vg_name, + vg_ptr->pv_cur, + vg_ptr->lv_cur, + vg_ptr->lv_open, + vg_ptr->pe_size >> 1, + vg_ptr->pe_size * vg_ptr->pe_total >> 1, + vg_ptr->pe_total, + vg_ptr->pe_allocated * vg_ptr->pe_size >> 1, + vg_ptr->pe_allocated, + (vg_ptr->pe_total - vg_ptr->pe_allocated) * + vg_ptr->pe_size >> 1, + vg_ptr->pe_total - vg_ptr->pe_allocated); + return sz; +} + + +/* + * provide LV info for proc filesystem use (global) + */ +int lvm_lv_info(vg_t *vg_ptr, lv_t *lv_ptr, char *buf) { + int sz = 0; + char inactive_flag = 'A', allocation_flag = ' ', + stripes_flag = ' ', rw_flag = ' '; + + if (!(lv_ptr->lv_status & LV_ACTIVE)) + inactive_flag = 'I'; + rw_flag = 'R'; + if (lv_ptr->lv_access & LV_WRITE) + rw_flag = 'W'; + allocation_flag = 'D'; + if (lv_ptr->lv_allocation & LV_CONTIGUOUS) + allocation_flag = 'C'; + stripes_flag = 'L'; + if (lv_ptr->lv_stripes > 1) + stripes_flag = 'S'; + sz += sprintf(buf+sz, + "[%c%c%c%c", + inactive_flag, + rw_flag, + allocation_flag, + stripes_flag); + if (lv_ptr->lv_stripes > 1) + sz += sprintf(buf+sz, "%-2d", + lv_ptr->lv_stripes); + else + sz += sprintf(buf+sz, " "); + basename = strrchr(lv_ptr->lv_name, '/'); + if ( basename == 0) basename = lv_ptr->lv_name; + else basename++; + sz += sprintf(buf+sz, "] %-25s", basename); + if (strlen(basename) > 25) + sz += sprintf(buf+sz, + "\n "); + sz += sprintf(buf+sz, "%9d /%-6d ", + lv_ptr->lv_size >> 1, + lv_ptr->lv_size / vg_ptr->pe_size); + + if (lv_ptr->lv_open == 0) + sz += sprintf(buf+sz, "close"); + else + sz += sprintf(buf+sz, "%dx open", + lv_ptr->lv_open); + + return sz; +} + + +/* + * provide PV info for proc filesystem use (global) + */ +int lvm_pv_info(pv_t *pv_ptr, char *buf) { + int sz = 0; + char inactive_flag = 'A', allocation_flag = ' '; + char *pv_name = NULL; + + if (!(pv_ptr->pv_status & PV_ACTIVE)) + inactive_flag = 'I'; + allocation_flag = 'A'; + if (!(pv_ptr->pv_allocatable & PV_ALLOCATABLE)) + allocation_flag = 'N'; + pv_name = strrchr(pv_ptr->pv_name+1,'/'); + if ( pv_name == 0) pv_name = pv_ptr->pv_name; + else pv_name++; + sz = sprintf(buf, + "[%c%c] %-21s %8d /%-6d " + "%8d /%-6d %8d /%-6d", + inactive_flag, + allocation_flag, + pv_name, + pv_ptr->pe_total * + pv_ptr->pe_size >> 1, + pv_ptr->pe_total, + pv_ptr->pe_allocated * + pv_ptr->pe_size >> 1, + pv_ptr->pe_allocated, + (pv_ptr->pe_total - + pv_ptr->pe_allocated) * + pv_ptr->pe_size >> 1, + pv_ptr->pe_total - + pv_ptr->pe_allocated); + return sz; +} + + #if defined CONFIG_LVM_PROC_FS && defined CONFIG_PROC_FS /* - * Support function /proc-Filesystem + * Support functions /proc-Filesystem */ + #define LVM_PROC_BUF ( i == 0 ? 
dummy_buf : &buf[sz]) -static int lvm_proc_get_info(char *page, char **start, off_t pos, int count) +/* + * provide global LVM information + */ +static int lvm_proc_get_global_info(char *page, char **start, off_t pos, int count, int *eof, void *data) { int c, i, l, p, v, vg_counter, pv_counter, lv_counter, lv_open_counter, - lv_open_total, pe_t_bytes, lv_block_exception_t_bytes, seconds; + lv_open_total, pe_t_bytes, hash_table_bytes, lv_block_exception_t_bytes, seconds; static off_t sz; off_t sz_last; - char allocation_flag, inactive_flag, rw_flag, stripes_flag; - char *lv_name, *pv_name; static char *buf = NULL; static char dummy_buf[160]; /* sized for 2 lines */ vg_t *vg_ptr; @@ -1022,13 +1288,16 @@ static int lvm_proc_get_info(char *page, char **start, off_t pos, int count) #ifdef DEBUG_LVM_PROC_GET_INFO printk(KERN_DEBUG - "%s - lvm_proc_get_info CALLED pos: %lu count: %d whence: %d\n", + "%s - lvm_proc_get_global_info CALLED pos: %lu count: %d whence: %d\n", lvm_name, pos, count, whence); #endif + MOD_INC_USE_COUNT; + if (pos == 0 || buf == NULL) { sz_last = vg_counter = pv_counter = lv_counter = lv_open_counter = \ - lv_open_total = pe_t_bytes = lv_block_exception_t_bytes = 0; + lv_open_total = pe_t_bytes = hash_table_bytes = \ + lv_block_exception_t_bytes = 0; /* search for activity */ for (v = 0; v < ABS_MAX_VG; v++) { @@ -1040,6 +1309,7 @@ static int lvm_proc_get_info(char *page, char **start, off_t pos, int count) for (l = 0; l < vg[v]->lv_max; l++) { if ((lv_ptr = vg_ptr->lv[l]) != NULL) { pe_t_bytes += lv_ptr->lv_allocated_le; + hash_table_bytes += lv_ptr->lv_snapshot_hash_table_size; if (lv_ptr->lv_block_exception != NULL) lv_block_exception_t_bytes += lv_ptr->lv_remap_end; if (lv_ptr->lv_open > 0) { @@ -1057,9 +1327,11 @@ static int lvm_proc_get_info(char *page, char **start, off_t pos, int count) if (buf != NULL) { #ifdef DEBUG_KFREE printk(KERN_DEBUG - "%s -- kfree %d\n", lvm_name, __LINE__); + "%s -- vfree %d\n", lvm_name, __LINE__); #endif - kfree(buf); + lock_kernel(); + vfree(buf); + unlock_kernel(); buf = NULL; } /* 2 times: first to get size to allocate buffer, @@ -1094,7 +1366,7 @@ static int lvm_proc_get_info(char *page, char **start, off_t pos, int count) vg_counter * sizeof(vg_t) + pv_counter * sizeof(pv_t) + lv_counter * sizeof(lv_t) + - pe_t_bytes + lv_block_exception_t_bytes + sz_last, + pe_t_bytes + hash_table_bytes + lv_block_exception_t_bytes + sz_last, lvm_iop_version); seconds = CURRENT_TIME - loadtime; @@ -1115,26 +1387,7 @@ static int lvm_proc_get_info(char *page, char **start, off_t pos, int count) for (v = 0; v < ABS_MAX_VG; v++) { /* volume group */ if ((vg_ptr = vg[v]) != NULL) { - inactive_flag = ' '; - if (!(vg_ptr->vg_status & VG_ACTIVE)) inactive_flag = 'I'; - sz += sprintf(LVM_PROC_BUF, - "\nVG: %c%s [%d PV, %d LV/%d open] " - " PE Size: %d KB\n" - " Usage [KB/PE]: %d /%d total " - "%d /%d used %d /%d free", - inactive_flag, - vg_ptr->vg_name, - vg_ptr->pv_cur, - vg_ptr->lv_cur, - vg_ptr->lv_open, - vg_ptr->pe_size >> 1, - vg_ptr->pe_size * vg_ptr->pe_total >> 1, - vg_ptr->pe_total, - vg_ptr->pe_allocated * vg_ptr->pe_size >> 1, - vg_ptr->pe_allocated, - (vg_ptr->pe_total - vg_ptr->pe_allocated) * - vg_ptr->pe_size >> 1, - vg_ptr->pe_total - vg_ptr->pe_allocated); + sz += lvm_vg_info(vg_ptr, LVM_PROC_BUF); /* physical volumes */ sz += sprintf(LVM_PROC_BUF, @@ -1143,32 +1396,8 @@ static int lvm_proc_get_info(char *page, char **start, off_t pos, int count) c = 0; for (p = 0; p < vg_ptr->pv_max; p++) { if ((pv_ptr = vg_ptr->pv[p]) != NULL) { - 
inactive_flag = 'A'; - if (!(pv_ptr->pv_status & PV_ACTIVE)) - inactive_flag = 'I'; - allocation_flag = 'A'; - if (!(pv_ptr->pv_allocatable & PV_ALLOCATABLE)) - allocation_flag = 'N'; - pv_name = strchr(pv_ptr->pv_name+1,'/'); - if ( pv_name == 0) pv_name = pv_ptr->pv_name; - else pv_name++; - sz += sprintf(LVM_PROC_BUF, - "[%c%c] %-21s %8d /%-6d " - "%8d /%-6d %8d /%-6d", - inactive_flag, - allocation_flag, - pv_name, - pv_ptr->pe_total * - pv_ptr->pe_size >> 1, - pv_ptr->pe_total, - pv_ptr->pe_allocated * - pv_ptr->pe_size >> 1, - pv_ptr->pe_allocated, - (pv_ptr->pe_total - - pv_ptr->pe_allocated) * - pv_ptr->pe_size >> 1, - pv_ptr->pe_total - - pv_ptr->pe_allocated); + sz += lvm_pv_info(pv_ptr, LVM_PROC_BUF); + c++; if (c < vg_ptr->pv_cur) sz += sprintf(LVM_PROC_BUF, @@ -1181,47 +1410,9 @@ static int lvm_proc_get_info(char *page, char **start, off_t pos, int count) "\n LV%s ", vg_ptr->lv_cur == 1 ? ": " : "s:"); c = 0; - for (l = 0; l < vg[v]->lv_max; l++) { + for (l = 0; l < vg_ptr->lv_max; l++) { if ((lv_ptr = vg_ptr->lv[l]) != NULL) { - inactive_flag = 'A'; - if (!(lv_ptr->lv_status & LV_ACTIVE)) - inactive_flag = 'I'; - rw_flag = 'R'; - if (lv_ptr->lv_access & LV_WRITE) - rw_flag = 'W'; - allocation_flag = 'D'; - if (lv_ptr->lv_allocation & LV_CONTIGUOUS) - allocation_flag = 'C'; - stripes_flag = 'L'; - if (lv_ptr->lv_stripes > 1) - stripes_flag = 'S'; - sz += sprintf(LVM_PROC_BUF, - "[%c%c%c%c", - inactive_flag, - rw_flag, - allocation_flag, - stripes_flag); - if (lv_ptr->lv_stripes > 1) - sz += sprintf(LVM_PROC_BUF, "%-2d", - lv_ptr->lv_stripes); - else - sz += sprintf(LVM_PROC_BUF, " "); - lv_name = strrchr(lv_ptr->lv_name, '/'); - if ( lv_name == 0) lv_name = lv_ptr->lv_name; - else lv_name++; - sz += sprintf(LVM_PROC_BUF, "] %-25s", lv_name); - if (strlen(lv_name) > 25) - sz += sprintf(LVM_PROC_BUF, - "\n "); - sz += sprintf(LVM_PROC_BUF, "%9d /%-6d ", - lv_ptr->lv_size >> 1, - lv_ptr->lv_size / vg[v]->pe_size); - - if (lv_ptr->lv_open == 0) - sz += sprintf(LVM_PROC_BUF, "close"); - else - sz += sprintf(LVM_PROC_BUF, "%dx open", - lv_ptr->lv_open); + sz += lvm_lv_info(vg_ptr, lv_ptr, LVM_PROC_BUF); c++; if (c < vg_ptr->lv_cur) sz += sprintf(LVM_PROC_BUF, @@ -1234,8 +1425,12 @@ static int lvm_proc_get_info(char *page, char **start, off_t pos, int count) } } if (buf == NULL) { - if ((buf = vmalloc(sz)) == NULL) { + lock_kernel(); + buf = vmalloc(sz); + unlock_kernel(); + if (buf == NULL) { sz = 0; + MOD_DEC_USE_COUNT; return sprintf(page, "%s - vmalloc error at line %d\n", lvm_name, __LINE__); } @@ -1243,8 +1438,11 @@ static int lvm_proc_get_info(char *page, char **start, off_t pos, int count) sz_last = sz; } } + MOD_DEC_USE_COUNT; if (pos > sz - 1) { + lock_kernel(); vfree(buf); + unlock_kernel(); buf = NULL; return 0; } @@ -1253,47 +1451,111 @@ static int lvm_proc_get_info(char *page, char **start, off_t pos, int count) return sz - pos; else return count; -} /* lvm_proc_get_info() */ +} /* lvm_proc_get_global_info() */ #endif /* #if defined CONFIG_LVM_PROC_FS && defined CONFIG_PROC_FS */ /* + * provide VG information + */ +int lvm_proc_read_vg_info(char *page, char **start, off_t off, + int count, int *eof, void *data) { + int sz = 0; + vg_t *vg = data; + + sz += sprintf ( page+sz, "name: %s\n", vg->vg_name); + sz += sprintf ( page+sz, "size: %u\n", + vg->pe_total * vg->pe_size / 2); + sz += sprintf ( page+sz, "access: %u\n", vg->vg_access); + sz += sprintf ( page+sz, "status: %u\n", vg->vg_status); + sz += sprintf ( page+sz, "number: %u\n", vg->vg_number); + sz += sprintf ( 
page+sz, "LV max: %u\n", vg->lv_max); + sz += sprintf ( page+sz, "LV current: %u\n", vg->lv_cur); + sz += sprintf ( page+sz, "LV open: %u\n", vg->lv_open); + sz += sprintf ( page+sz, "PV max: %u\n", vg->pv_max); + sz += sprintf ( page+sz, "PV current: %u\n", vg->pv_cur); + sz += sprintf ( page+sz, "PV active: %u\n", vg->pv_act); + sz += sprintf ( page+sz, "PE size: %u\n", vg->pe_size / 2); + sz += sprintf ( page+sz, "PE total: %u\n", vg->pe_total); + sz += sprintf ( page+sz, "PE allocated: %u\n", vg->pe_allocated); + sz += sprintf ( page+sz, "uuid: %s\n", vg->vg_uuid); + + return sz; +} + + +/* + * provide LV information + */ +int lvm_proc_read_lv_info(char *page, char **start, off_t off, + int count, int *eof, void *data) { + int sz = 0; + lv_t *lv = data; + + sz += sprintf ( page+sz, "name: %s\n", lv->lv_name); + sz += sprintf ( page+sz, "size: %u\n", lv->lv_size); + sz += sprintf ( page+sz, "access: %u\n", lv->lv_access); + sz += sprintf ( page+sz, "status: %u\n", lv->lv_status); + sz += sprintf ( page+sz, "number: %u\n", lv->lv_number); + sz += sprintf ( page+sz, "open: %u\n", lv->lv_open); + sz += sprintf ( page+sz, "allocation: %u\n", lv->lv_allocation); + sz += sprintf ( page+sz, "device: %02u:%02u\n", + MAJOR(lv->lv_dev), MINOR(lv->lv_dev)); + + return sz; +} + + +/* + * provide PV information + */ +int lvm_proc_read_pv_info(char *page, char **start, off_t off, + int count, int *eof, void *data) { + int sz = 0; + pv_t *pv = data; + + sz += sprintf ( page+sz, "name: %s\n", pv->pv_name); + sz += sprintf ( page+sz, "size: %u\n", pv->pv_size); + sz += sprintf ( page+sz, "status: %u\n", pv->pv_status); + sz += sprintf ( page+sz, "number: %u\n", pv->pv_number); + sz += sprintf ( page+sz, "allocatable: %u\n", pv->pv_allocatable); + sz += sprintf ( page+sz, "LV current: %u\n", pv->lv_cur); + sz += sprintf ( page+sz, "PE size: %u\n", pv->pe_size / 2); + sz += sprintf ( page+sz, "PE total: %u\n", pv->pe_total); + sz += sprintf ( page+sz, "PE allocated: %u\n", pv->pe_allocated); + sz += sprintf ( page+sz, "device: %02u:%02u\n", + MAJOR(pv->pv_dev), MINOR(pv->pv_dev)); + sz += sprintf ( page+sz, "uuid: %s\n", pv->pv_uuid); + + + return sz; +} + + +/* * block device support function for /usr/src/linux/drivers/block/ll_rw_blk.c * (see init_module/lvm_init) */ static int lvm_map(struct buffer_head *bh, int rw) { - int minor = MINOR(bh->b_rdev); + int minor = MINOR(bh->b_dev); + int ret = 0; ulong index; ulong pe_start; ulong size = bh->b_size >> 9; - ulong rsector_tmp = bh->b_rsector; + ulong rsector_tmp = bh->b_blocknr * size; ulong rsector_sav; - kdev_t rdev_tmp = bh->b_rdev; + kdev_t rdev_tmp = bh->b_dev; kdev_t rdev_sav; - lv_t *lv = vg[VG_BLK(minor)]->lv[LV_BLK(minor)]; + vg_t *vg_this = vg[VG_BLK(minor)]; + lv_t *lv = vg_this->lv[LV_BLK(minor)]; if (!(lv->lv_status & LV_ACTIVE)) { printk(KERN_ALERT "%s - lvm_map: ll_rw_blk for inactive LV %s\n", lvm_name, lv->lv_name); - goto error; - } -/* - if ( lv->lv_access & LV_SNAPSHOT) - printk ( "%s -- %02d:%02d block: %lu rw: %d\n", lvm_name, MAJOR ( bh->b_dev), MINOR ( bh->b_dev), bh->b_blocknr, rw); - */ - - /* take care of snapshot chunk writes before - check for writable logical volume */ - if ((lv->lv_access & LV_SNAPSHOT) && - MAJOR(bh->b_rdev) != 0 && - MAJOR(bh->b_rdev) != MAJOR_NR && - (rw == WRITEA || rw == WRITE)) - { - printk ( "%s -- doing snapshot write for %02d:%02d[%02d:%02d] b_blocknr: %lu b_rsector: %lu\n", lvm_name, MAJOR ( bh->b_dev), MINOR ( bh->b_dev), MAJOR ( bh->b_rdev), MINOR ( bh->b_rdev), bh->b_blocknr, bh->b_rsector); - 
goto error; + return -1; } if ((rw == WRITE || rw == WRITEA) && @@ -1301,7 +1563,7 @@ static int lvm_map(struct buffer_head *bh, int rw) printk(KERN_CRIT "%s - lvm_map: ll_rw_blk write for readonly LV %s\n", lvm_name, lv->lv_name); - goto error; + return -1; } #ifdef DEBUG_MAP printk(KERN_DEBUG @@ -1315,9 +1577,10 @@ static int lvm_map(struct buffer_head *bh, int rw) if (rsector_tmp + size > lv->lv_size) { printk(KERN_ALERT - "%s - lvm_map *rsector: %lu or size: %lu wrong for" - " minor: %2d\n", lvm_name, rsector_tmp, size, minor); - goto error; + "%s - lvm_map access beyond end of device; *rsector: " + "%lu or size: %lu wrong for minor: %2d\n", + lvm_name, rsector_tmp, size, minor); + return -1; } rsector_sav = rsector_tmp; rdev_sav = rdev_tmp; @@ -1326,10 +1589,10 @@ lvm_second_remap: /* linear mapping */ if (lv->lv_stripes < 2) { /* get the index */ - index = rsector_tmp / vg[VG_BLK(minor)]->pe_size; + index = rsector_tmp / vg_this->pe_size; pe_start = lv->lv_current_pe[index].pe; rsector_tmp = lv->lv_current_pe[index].pe + - (rsector_tmp % vg[VG_BLK(minor)]->pe_size); + (rsector_tmp % vg_this->pe_size); rdev_tmp = lv->lv_current_pe[index].dev; #ifdef DEBUG_MAP @@ -1347,7 +1610,7 @@ lvm_second_remap: ulong stripe_index; ulong stripe_length; - stripe_length = vg[VG_BLK(minor)]->pe_size * lv->lv_stripes; + stripe_length = vg_this->pe_size * lv->lv_stripes; stripe_index = (rsector_tmp % stripe_length) / lv->lv_stripesize; index = rsector_tmp / stripe_length + (stripe_index % lv->lv_stripes) * @@ -1379,7 +1642,7 @@ lvm_second_remap: if (rdev_tmp == pe_lock_req.data.pv_dev && rsector_tmp >= pe_lock_req.data.pv_offset && rsector_tmp < (pe_lock_req.data.pv_offset + - vg[VG_BLK(minor)]->pe_size)) { + vg_this->pe_size)) { sleep_on(&lvm_map_wait); rsector_tmp = rsector_sav; rdev_tmp = rdev_sav; @@ -1393,7 +1656,7 @@ lvm_second_remap: lv->lv_current_pe[index].reads++; /* snapshot volume exception handling on physical device address base */ - if (lv->lv_access & (LV_SNAPSHOT | LV_SNAPSHOT_ORG)) { + if (lv->lv_access & (LV_SNAPSHOT|LV_SNAPSHOT_ORG)) { /* original logical volume */ if (lv->lv_access & LV_SNAPSHOT_ORG) { if (rw == WRITE || rw == WRITEA) @@ -1404,6 +1667,8 @@ lvm_second_remap: for (lv_ptr = lv->lv_snapshot_next; lv_ptr != NULL; lv_ptr = lv_ptr->lv_snapshot_next) { + /* Check for inactive snapshot */ + if (!(lv_ptr->lv_status & LV_ACTIVE)) continue; down(&lv->lv_snapshot_org->lv_snapshot_sem); /* do we still have exception storage for this snapshot free? 
*/ if (lv_ptr->lv_block_exception != NULL) { @@ -1414,11 +1679,13 @@ lvm_second_remap: pe_start, lv_ptr)) { /* create a new mapping */ - lvm_snapshot_COW(rdev_tmp, - rsector_tmp, - pe_start, - rsector_sav, - lv_ptr); + if (!(ret = lvm_snapshot_COW(rdev_tmp, + rsector_tmp, + pe_start, + rsector_sav, + lv_ptr))) + ret = lvm_write_COW_table_block(vg_this, + lv_ptr); } rdev_tmp = rdev_sav; rsector_tmp = rsector_sav; @@ -1437,11 +1704,7 @@ lvm_second_remap: bh->b_rdev = rdev_tmp; bh->b_rsector = rsector_tmp; - return 1; - - error: - buffer_IO_error(bh); - return -1; + return ret; } /* lvm_map() */ @@ -1487,7 +1750,9 @@ static void lvm_dummy_device_request(request_queue_t * t) /* * make request function */ -static int lvm_make_request_fn(request_queue_t *q, int rw, struct buffer_head *bh) +static int lvm_make_request_fn(request_queue_t *q, + int rw, + struct buffer_head *bh) { if (lvm_map(bh, rw)<0) return 0; /* failure, buffer_IO_error has been called, don't recurse */ @@ -1495,12 +1760,6 @@ static int lvm_make_request_fn(request_queue_t *q, int rw, struct buffer_head *b return 1; /* all ok, mapping done, call lower level driver */ } -/* - * plug device function is a noop because plugging has to happen - * in the queue of the physical blockdevice to allow the - * elevator to do a better job. - */ -static void lvm_plug_device_noop(request_queue_t *q, kdev_t dev) { } /******************************************************************** * @@ -1563,7 +1822,8 @@ static int lvm_do_pe_lock_unlock(vg_t *vg_ptr, void *arg) case UNLOCK_PE: pe_lock_req.lock = UNLOCK_PE; - pe_lock_req.data.lv_dev = pe_lock_req.data.pv_dev = 0; + pe_lock_req.data.lv_dev = \ + pe_lock_req.data.pv_dev = \ pe_lock_req.data.pv_offset = 0; wake_up(&lvm_map_wait); break; @@ -1593,8 +1853,7 @@ static int lvm_do_le_remap(vg_t *vg_ptr, void *arg) if (lv_ptr != NULL && strcmp(lv_ptr->lv_name, le_remap_req.lv_name) == 0) { - for (le = 0; le < lv_ptr->lv_allocated_le; - le++) { + for (le = 0; le < lv_ptr->lv_allocated_le; le++) { if (lv_ptr->lv_current_pe[le].dev == le_remap_req.old_dev && lv_ptr->lv_current_pe[le].pe == @@ -1618,12 +1877,11 @@ static int lvm_do_le_remap(vg_t *vg_ptr, void *arg) */ int lvm_do_vg_create(int minor, void *arg) { - int snaporg_minor = 0; - ulong l, p; + int ret = 0; + ulong l, ls = 0, p, size; lv_t lv; vg_t *vg_ptr; - pv_t *pv_ptr; - lv_t *lv_ptr; + lv_t **snap_lv_ptr; if (vg[VG_CHR(minor)] != NULL) return -EPERM; @@ -1639,18 +1897,11 @@ int lvm_do_vg_create(int minor, void *arg) return -EFAULT; } - vg_devfs_handle[vg_ptr->vg_number] = devfs_mk_dir(0, vg_ptr->vg_name, NULL); - ch_devfs_handle[vg_ptr->vg_number] = devfs_register( - vg_devfs_handle[vg_ptr->vg_number] , "group", - DEVFS_FL_DEFAULT, LVM_CHAR_MAJOR, vg_ptr->vg_number, - S_IFCHR | S_IRUSR | S_IWUSR | S_IRGRP, - &lvm_chr_fops, NULL); - /* we are not that active so far... 
*/ vg_ptr->vg_status &= ~VG_ACTIVE; vg[VG_CHR(minor)] = vg_ptr; - vg[VG_CHR(minor)]->pe_allocated = 0; + if (vg_ptr->pv_max > ABS_MAX_PV) { printk(KERN_WARNING "%s -- Can't activate VG: ABS_MAX_PV too small\n", @@ -1667,38 +1918,30 @@ int lvm_do_vg_create(int minor, void *arg) vg_ptr = NULL; return -EPERM; } + /* get the physical volume structures */ vg_ptr->pv_act = vg_ptr->pv_cur = 0; for (p = 0; p < vg_ptr->pv_max; p++) { /* user space address */ if ((pvp = vg_ptr->pv[p]) != NULL) { - pv_ptr = vg_ptr->pv[p] = kmalloc(sizeof(pv_t),GFP_KERNEL); - if (pv_ptr == NULL) { - printk(KERN_CRIT - "%s -- VG_CREATE: kmalloc error PV at line %d\n", - lvm_name, __LINE__); + ret = lvm_do_pv_create(pvp, vg_ptr, p); + if ( ret != 0) { lvm_do_vg_remove(minor); - return -ENOMEM; - } - if (copy_from_user(pv_ptr, pvp, sizeof(pv_t)) != 0) { - lvm_do_vg_remove(minor); - return -EFAULT; + return ret; } - /* We don't need the PE list - in kernel space as with LVs pe_t list (see below) */ - pv_ptr->pe = NULL; - pv_ptr->pe_allocated = 0; - pv_ptr->pv_status = PV_ACTIVE; - vg_ptr->pv_act++; - vg_ptr->pv_cur++; - -#ifdef LVM_GET_INODE - /* insert a dummy inode for fs_may_mount */ - pv_ptr->inode = lvm_get_inode(pv_ptr->pv_dev); -#endif } } + size = vg_ptr->lv_max * sizeof(lv_t *); + if ((snap_lv_ptr = vmalloc ( size)) == NULL) { + printk(KERN_CRIT + "%s -- VG_CREATE: vmalloc error snapshot LVs at line %d\n", + lvm_name, __LINE__); + lvm_do_vg_remove(minor); + return -EFAULT; + } + memset(snap_lv_ptr, 0, size); + /* get the logical volume structures */ vg_ptr->lv_cur = 0; for (l = 0; l < vg_ptr->lv_max; l++) { @@ -1708,7 +1951,14 @@ int lvm_do_vg_create(int minor, void *arg) lvm_do_vg_remove(minor); return -EFAULT; } + if ( lv.lv_access & LV_SNAPSHOT) { + snap_lv_ptr[ls] = lvp; + vg_ptr->lv[l] = NULL; + ls++; + continue; + } vg_ptr->lv[l] = NULL; + /* only create original logical volumes for now */ if (lvm_do_lv_create(minor, lv.lv_name, &lv) != 0) { lvm_do_vg_remove(minor); return -EFAULT; @@ -1718,55 +1968,41 @@ int lvm_do_vg_create(int minor, void *arg) /* Second path to correct snapshot logical volumes which are not in place during first path above */ - for (l = 0; l < vg_ptr->lv_max; l++) { - if ((lv_ptr = vg_ptr->lv[l]) != NULL && - vg_ptr->lv[l]->lv_access & LV_SNAPSHOT) { - snaporg_minor = lv_ptr->lv_snapshot_minor; - if (vg_ptr->lv[LV_BLK(snaporg_minor)] != NULL) { - /* get pointer to original logical volume */ - lv_ptr = vg_ptr->lv[l]->lv_snapshot_org = - vg_ptr->lv[LV_BLK(snaporg_minor)]; - - /* set necessary fields of original logical volume */ - lv_ptr->lv_access |= LV_SNAPSHOT_ORG; - lv_ptr->lv_snapshot_minor = 0; - lv_ptr->lv_snapshot_org = lv_ptr; - lv_ptr->lv_snapshot_prev = NULL; + for (l = 0; l < ls; l++) { + lvp = snap_lv_ptr[l]; + if (copy_from_user(&lv, lvp, sizeof(lv_t)) != 0) { + lvm_do_vg_remove(minor); + return -EFAULT; + } + if (lvm_do_lv_create(minor, lv.lv_name, &lv) != 0) { + lvm_do_vg_remove(minor); + return -EFAULT; + } + } - /* find last snapshot logical volume in the chain */ - while (lv_ptr->lv_snapshot_next != NULL) - lv_ptr = lv_ptr->lv_snapshot_next; +#ifdef CONFIG_DEVFS_FS + vg_devfs_handle[vg_ptr->vg_number] = devfs_mk_dir(0, vg_ptr->vg_name, NULL); + ch_devfs_handle[vg_ptr->vg_number] = devfs_register( + vg_devfs_handle[vg_ptr->vg_number] , "group", + DEVFS_FL_DEFAULT, LVM_CHAR_MAJOR, vg_ptr->vg_number, + S_IFCHR | S_IRUSR | S_IWUSR | S_IRGRP, + &lvm_chr_fops, NULL); +#endif - /* set back pointer to this last one in our new logical volume */ - vg_ptr->lv[l]->lv_snapshot_prev 
= lv_ptr; +#if defined CONFIG_LVM_PROC_FS && defined CONFIG_PROC_FS + lvm_do_create_proc_entry_of_vg ( vg_ptr); +#endif - /* last logical volume now points to our new snapshot volume */ - lv_ptr->lv_snapshot_next = vg_ptr->lv[l]; + vfree(snap_lv_ptr); - /* now point to the new one */ - lv_ptr = lv_ptr->lv_snapshot_next; + vg_count++; - /* set necessary fields of new snapshot logical volume */ - lv_ptr->lv_snapshot_next = NULL; - lv_ptr->lv_current_pe = - vg_ptr->lv[LV_BLK(snaporg_minor)]->lv_current_pe; - lv_ptr->lv_allocated_le = - vg_ptr->lv[LV_BLK(snaporg_minor)]->lv_allocated_le; - lv_ptr->lv_current_le = - vg_ptr->lv[LV_BLK(snaporg_minor)]->lv_current_le; - lv_ptr->lv_size = - vg_ptr->lv[LV_BLK(snaporg_minor)]->lv_size; - } - } - } - vg_count++; + MOD_INC_USE_COUNT; /* let's go active */ vg_ptr->vg_status |= VG_ACTIVE; - MOD_INC_USE_COUNT; - return 0; } /* lvm_do_vg_create() */ @@ -1776,26 +2012,18 @@ int lvm_do_vg_create(int minor, void *arg) */ static int lvm_do_vg_extend(vg_t *vg_ptr, void *arg) { + int ret = 0; uint p; pv_t *pv_ptr; if (vg_ptr == NULL) return -ENXIO; if (vg_ptr->pv_cur < vg_ptr->pv_max) { for (p = 0; p < vg_ptr->pv_max; p++) { - if (vg_ptr->pv[p] == NULL) { - if ((pv_ptr = vg_ptr->pv[p] = kmalloc(sizeof(pv_t),GFP_KERNEL)) == NULL) { - printk(KERN_CRIT - "%s -- VG_EXTEND: kmalloc error PV at line %d\n", - lvm_name, __LINE__); - return -ENOMEM; - } - if (copy_from_user(pv_ptr, arg, sizeof(pv_t)) != 0) { - kfree(pv_ptr); - vg_ptr->pv[p] = NULL; - return -EFAULT; - } + if ( ( pv_ptr = vg_ptr->pv[p]) == NULL) { + ret = lvm_do_pv_create(arg, vg_ptr, p); + lvm_do_create_proc_entry_of_pv ( vg_ptr, pv_ptr); + if ( ret != 0) return ret; - pv_ptr->pv_status = PV_ACTIVE; /* We don't need the PE list in kernel space like LVs pe_t list */ pv_ptr->pe = NULL; @@ -1818,8 +2046,7 @@ return -EPERM; /* * character device support function VGDA reduce */ -static int lvm_do_vg_reduce(vg_t *vg_ptr, void *arg) -{ +static int lvm_do_vg_reduce(vg_t *vg_ptr, void *arg) { uint p; pv_t *pv_ptr; @@ -1837,10 +2064,7 @@ static int lvm_do_vg_reduce(vg_t *vg_ptr, void *arg) pv_ptr->pe_total; vg_ptr->pv_cur--; vg_ptr->pv_act--; -#ifdef LVM_GET_INODE - lvm_clear_inode(pv_ptr->inode); -#endif - kfree(pv_ptr); + lvm_do_pv_remove(vg_ptr, p); /* Make PV pointer array contiguous */ for (; p < vg_ptr->pv_max - 1; p++) vg_ptr->pv[p] = vg_ptr->pv[p + 1]; @@ -1853,6 +2077,53 @@ static int lvm_do_vg_reduce(vg_t *vg_ptr, void *arg) /* + * character device support function VG rename + */ +static int lvm_do_vg_rename(vg_t *vg_ptr, void *arg) +{ + int l = 0, p = 0, len = 0; + char vg_name[NAME_LEN] = { 0,}; + char lv_name[NAME_LEN] = { 0,}; + char *ptr = NULL; + lv_t *lv_ptr = NULL; + pv_t *pv_ptr = NULL; + + if (copy_from_user(vg_name, arg, sizeof(vg_name)) != 0) + return -EFAULT; + +#if defined CONFIG_LVM_PROC_FS && defined CONFIG_PROC_FS + lvm_do_remove_proc_entry_of_vg ( vg_ptr); +#endif + + strncpy ( vg_ptr->vg_name, vg_name, sizeof ( vg_name)-1); + for ( l = 0; l < vg_ptr->lv_max; l++) + { + if ((lv_ptr = vg_ptr->lv[l]) == NULL) continue; + strncpy(lv_ptr->vg_name, vg_name, sizeof ( vg_name)); + ptr = strrchr(lv_ptr->lv_name, '/'); + if (ptr == NULL) ptr = lv_ptr->lv_name; + strncpy(lv_name, ptr, sizeof ( lv_name)); + len = sizeof(LVM_DIR_PREFIX); + strcpy(lv_ptr->lv_name, LVM_DIR_PREFIX); + strncat(lv_ptr->lv_name, vg_name, NAME_LEN - len); + len += strlen ( vg_name); + strncat(lv_ptr->lv_name, lv_name, NAME_LEN - len); + } + for ( p = 0; p < vg_ptr->pv_max; p++) + { + if ( (pv_ptr = vg_ptr->pv[p]) == NULL) 
continue;
+		strncpy(pv_ptr->vg_name, vg_name, NAME_LEN);
+	}
+
+#if defined CONFIG_LVM_PROC_FS && defined CONFIG_PROC_FS
+	lvm_do_create_proc_entry_of_vg ( vg_ptr);
+#endif
+
+	return 0;
+} /* lvm_do_vg_rename */
+
+
+/*
  * character device support function VGDA remove
  */
 static int lvm_do_vg_remove(int minor)
@@ -1873,9 +2144,6 @@ static int lvm_do_vg_remove(int minor)
 	/* let's go inactive */
 	vg_ptr->vg_status &= ~VG_ACTIVE;
 
-	devfs_unregister (ch_devfs_handle[vg_ptr->vg_number]);
-	devfs_unregister (vg_devfs_handle[vg_ptr->vg_number]);
-
 	/* free LVs */
 	/* first free snapshot logical volumes */
 	for (i = 0; i < vg_ptr->lv_max; i++) {
@@ -1902,17 +2170,23 @@ static int lvm_do_vg_remove(int minor)
 			printk(KERN_DEBUG
 			       "%s -- kfree %d\n", lvm_name, __LINE__);
 #endif
-#ifdef LVM_GET_INODE
-			lvm_clear_inode(pv_ptr->inode);
-#endif
-			kfree(pv_ptr);
-			vg[VG_CHR(minor)]->pv[i] = NULL;
+			lvm_do_pv_remove(vg_ptr, i);
 		}
 	}
 
+#ifdef CONFIG_DEVFS_FS
+	devfs_unregister (ch_devfs_handle[vg_ptr->vg_number]);
+	devfs_unregister (vg_devfs_handle[vg_ptr->vg_number]);
+#endif
+
+#if defined CONFIG_LVM_PROC_FS && defined CONFIG_PROC_FS
+	lvm_do_remove_proc_entry_of_vg ( vg_ptr);
+#endif
+
 #ifdef DEBUG_KFREE
 	printk(KERN_DEBUG "%s -- kfree %d\n", lvm_name, __LINE__);
 #endif
+
 	kfree(vg_ptr);
 	vg[VG_CHR(minor)] = NULL;
@@ -1925,13 +2199,68 @@
 /*
+ * character device support function physical volume create
+ */
+static int lvm_do_pv_create(pv_t *pvp, vg_t *vg_ptr, ulong p) {
+	pv_t *pv_ptr = NULL;
+
+	pv_ptr = vg_ptr->pv[p] = kmalloc(sizeof(pv_t),GFP_KERNEL);
+	if (pv_ptr == NULL) {
+		printk(KERN_CRIT
+		       "%s -- VG_CREATE: kmalloc error PV at line %d\n",
+		       lvm_name, __LINE__);
+		return -ENOMEM;
+	}
+	if (copy_from_user(pv_ptr, pvp, sizeof(pv_t)) != 0) {
+		return -EFAULT;
+	}
+	/* We don't need the PE list
+	   in kernel space as with LVs pe_t list (see below) */
+	pv_ptr->pe = NULL;
+	pv_ptr->pe_allocated = 0;
+	pv_ptr->pv_status = PV_ACTIVE;
+	vg_ptr->pv_act++;
+	vg_ptr->pv_cur++;
+
+#ifdef LVM_GET_INODE
+	/* insert a dummy inode for fs_may_mount */
+	pv_ptr->inode = lvm_get_inode(pv_ptr->pv_dev);
+#endif
+
+	return 0;
+} /* lvm_do_pv_create() */
+
+
+/*
+ * character device support function physical volume remove
+ */
+static int lvm_do_pv_remove(vg_t *vg_ptr, ulong p) {
+	pv_t *pv_ptr = vg_ptr->pv[p];
+
+#if defined CONFIG_LVM_PROC_FS && defined CONFIG_PROC_FS
+	lvm_do_remove_proc_entry_of_pv ( vg_ptr, pv_ptr);
+#endif
+	vg_ptr->pe_total -=
+		pv_ptr->pe_total;
+	vg_ptr->pv_cur--;
+	vg_ptr->pv_act--;
+#ifdef LVM_GET_INODE
+	lvm_clear_inode(pv_ptr->inode);
+#endif
+	kfree(pv_ptr);
+	vg_ptr->pv[p] = NULL;
+
+	return 0;
+}
+
+
+/*
  * character device support function logical volume create
  */
 static int lvm_do_lv_create(int minor, char *lv_name, lv_t *lv)
 {
-	int l, le, l_new, p, size;
+	int e, ret, l, le, l_new, p, size;
 	ulong lv_status_save;
-	char *lv_tmp, *lv_buf = NULL;
 	lv_block_exception_t *lvbe = lv->lv_block_exception;
 	vg_t *vg_ptr = vg[VG_CHR(minor)];
 	lv_t *lv_ptr = NULL;
@@ -1946,7 +2275,7 @@ static int lvm_do_lv_create(int minor, char *lv_name, lv_t *lv)
 		return -EEXIST;
 	}
 
-	/* in case of lv_remove(), lv_create() pair; for eg.
lvrename does this */ + /* in case of lv_remove(), lv_create() pair */ l_new = -1; if (vg_ptr->lv[lv->lv_number] == NULL) l_new = lv->lv_number; @@ -1957,7 +2286,7 @@ static int lvm_do_lv_create(int minor, char *lv_name, lv_t *lv) } } if (l_new == -1) return -EPERM; - else l = l_new; + else l = l_new; if ((lv_ptr = kmalloc(sizeof(lv_t),GFP_KERNEL)) == NULL) {; printk(KERN_CRIT "%s -- LV_CREATE: kmalloc error LV at line %d\n", @@ -1970,10 +2299,16 @@ static int lvm_do_lv_create(int minor, char *lv_name, lv_t *lv) lv_status_save = lv_ptr->lv_status; lv_ptr->lv_status &= ~LV_ACTIVE; lv_ptr->lv_snapshot_org = \ - lv_ptr->lv_snapshot_prev = \ - lv_ptr->lv_snapshot_next = NULL; + lv_ptr->lv_snapshot_prev = \ + lv_ptr->lv_snapshot_next = NULL; lv_ptr->lv_block_exception = NULL; + lv_ptr->lv_iobuf = NULL; + lv_ptr->lv_snapshot_hash_table = NULL; + lv_ptr->lv_snapshot_hash_table_size = 0; + lv_ptr->lv_snapshot_hash_mask = 0; + lv_ptr->lv_COW_table_page = NULL; init_MUTEX(&lv_ptr->lv_snapshot_sem); + lv_ptr->lv_snapshot_use_rate = 0; vg_ptr->lv[l] = lv_ptr; /* get the PE structures from user space if this @@ -2032,7 +2367,7 @@ static int lvm_do_lv_create(int minor, char *lv_name, lv_t *lv) vg[VG_CHR(minor)]->lv[l] = NULL; return -EFAULT; } - /* get pointer to original logical volume */ + /* point to the original logical volume */ lv_ptr = lv_ptr->lv_snapshot_org; lv_ptr->lv_snapshot_minor = 0; @@ -2043,7 +2378,8 @@ static int lvm_do_lv_create(int minor, char *lv_name, lv_t *lv) lv_ptr = lv_ptr->lv_snapshot_next; /* now lv_ptr points to the last existing snapshot in the chain */ vg_ptr->lv[l]->lv_snapshot_prev = lv_ptr; - /* our new one now back points to the previous last in the chain */ + /* our new one now back points to the previous last in the chain + which can be the original logical volume */ lv_ptr = vg_ptr->lv[l]; /* now lv_ptr points to our new last snapshot logical volume */ lv_ptr->lv_snapshot_org = lv_ptr->lv_snapshot_prev->lv_snapshot_org; @@ -2054,16 +2390,19 @@ static int lvm_do_lv_create(int minor, char *lv_name, lv_t *lv) lv_ptr->lv_size = lv_ptr->lv_snapshot_org->lv_size; lv_ptr->lv_stripes = lv_ptr->lv_snapshot_org->lv_stripes; lv_ptr->lv_stripesize = lv_ptr->lv_snapshot_org->lv_stripesize; + if ((ret = lvm_snapshot_alloc(lv_ptr)) != 0) { - int err = lvm_snapshot_alloc(lv_ptr); - if (err) - { - vfree(lv_ptr->lv_block_exception); - kfree(lv_ptr); - vg[VG_CHR(minor)]->lv[l] = NULL; - return err; - } + vfree(lv_ptr->lv_block_exception); + kfree(lv_ptr); + vg[VG_CHR(minor)]->lv[l] = NULL; + return ret; } + for ( e = 0; e < lv_ptr->lv_remap_ptr; e++) + lvm_hash_link (lv_ptr->lv_block_exception + e, lv_ptr->lv_block_exception[e].rdev_org, lv_ptr->lv_block_exception[e].rsector_org, lv_ptr); + /* need to fill the COW exception table data + into the page for disk i/o */ + lvm_snapshot_fill_COW_page(vg_ptr, lv_ptr); + init_waitqueue_head(&lv_ptr->lv_snapshot_wait); } else { vfree(lv_ptr->lv_block_exception); kfree(lv_ptr); @@ -2083,12 +2422,15 @@ static int lvm_do_lv_create(int minor, char *lv_name, lv_t *lv) lvm_size[MINOR(lv_ptr->lv_dev)] = lv_ptr->lv_size >> 1; vg_lv_map[MINOR(lv_ptr->lv_dev)].vg_number = vg_ptr->vg_number; vg_lv_map[MINOR(lv_ptr->lv_dev)].lv_number = lv_ptr->lv_number; - read_ahead[MAJOR_NR] = lv_ptr->lv_read_ahead = LVM_CORRECT_READ_AHEAD(lv_ptr->lv_read_ahead); + LVM_CORRECT_READ_AHEAD(lv_ptr->lv_read_ahead); vg_ptr->lv_cur++; lv_ptr->lv_status = lv_status_save; - strtok(lv->lv_name, "/"); /* /dev */ +#ifdef CONFIG_DEVFS_FS + { + char *lv_tmp, *lv_buf = NULL; + 
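For readers following the snapshot plumbing in lvm_do_lv_create() above: each snapshot LV is appended to a chain hanging off the original volume through lv_snapshot_org, lv_snapshot_prev and lv_snapshot_next, and the predecessor of the first snapshot is the origin itself, which is why the comment notes that lv_snapshot_prev "can be the original logical volume". A minimal userspace sketch of that append follows; struct lv and its field names are simplified stand-ins for lv_t, not the real structure.

#include <stdio.h>
#include <stddef.h>

/* stand-in for the lv_t snapshot links used in lvm_do_lv_create() */
struct lv {
	const char *name;
	struct lv *snap_org;   /* lv_snapshot_org  */
	struct lv *snap_prev;  /* lv_snapshot_prev */
	struct lv *snap_next;  /* lv_snapshot_next */
};

/* walk to the last snapshot in the chain and append the new one;
 * the predecessor may be the origin itself when the chain is empty */
static void snap_chain_append(struct lv *org, struct lv *snap)
{
	struct lv *last = org;

	while (last->snap_next != NULL)
		last = last->snap_next;
	snap->snap_prev = last;
	last->snap_next = snap;
	snap->snap_next = NULL;
	snap->snap_org = org;
}

int main(void)
{
	struct lv org = { .name = "lvol1" };
	struct lv s1 = { .name = "snap1" }, s2 = { .name = "snap2" };
	struct lv *p;

	snap_chain_append(&org, &s1);
	snap_chain_append(&org, &s2);
	for (p = org.snap_next; p; p = p->snap_next)
		printf("%s -> origin %s\n", p->name, p->snap_org->name);
	return 0;
}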
strtok(lv->lv_name, "/");	/* /dev */
 	while((lv_tmp = strtok(NULL, "/")) != NULL)
 		lv_buf = lv_tmp;
@@ -2097,15 +2439,43 @@ static int lvm_do_lv_create(int minor, char *lv_name, lv_t *lv)
 			    DEVFS_FL_DEFAULT, LVM_BLK_MAJOR, lv->lv_number,
 			    S_IFBLK | S_IRUSR | S_IWUSR | S_IRGRP,
 			    &lvm_blk_dops, NULL);
+	}
+#endif
+
+#if defined CONFIG_LVM_PROC_FS && defined CONFIG_PROC_FS
+	lvm_do_create_proc_entry_of_lv ( vg_ptr, lv_ptr);
+#endif
 
 	/* optionally add our new snapshot LV */
 	if (lv_ptr->lv_access & LV_SNAPSHOT) {
 		/* sync the original logical volume */
 		fsync_dev(lv_ptr->lv_snapshot_org->lv_dev);
+#ifdef	LVM_VFS_ENHANCEMENT
+		/* VFS function call to sync and lock the filesystem */
+		fsync_dev_lockfs(lv_ptr->lv_snapshot_org->lv_dev);
+#endif
+		lv_ptr->lv_snapshot_org->lv_access |= LV_SNAPSHOT_ORG;
+		lv_ptr->lv_access &= ~LV_SNAPSHOT_ORG;
 		/* put ourselves into the chain */
 		lv_ptr->lv_snapshot_prev->lv_snapshot_next = lv_ptr;
-		lv_ptr->lv_snapshot_org->lv_access |= LV_SNAPSHOT_ORG;
 	}
+
+	/* activate the logical volume */
+	lv_ptr->lv_status |= LV_ACTIVE;
+	if ( lv_ptr->lv_access & LV_WRITE)
+		set_device_ro(lv_ptr->lv_dev, 0);
+	else
+		set_device_ro(lv_ptr->lv_dev, 1);
+
+#ifdef	LVM_VFS_ENHANCEMENT
+/* VFS function call to unlock the filesystem */
+	if (lv_ptr->lv_access & LV_SNAPSHOT) {
+		unlockfs(lv_ptr->lv_snapshot_org->lv_dev);
+	}
+#endif
+
+	lv_ptr->vg = vg_ptr;
+
 	return 0;
 } /* lvm_do_lv_create() */
@@ -2176,7 +2546,7 @@ static int lvm_do_lv_remove(int minor, char *lv_name, int l)
 			}
 		}
 		vfree(lv_ptr->lv_current_pe);
-	/* LV_SNAPSHOT */
+		/* LV_SNAPSHOT */
 	} else {
 		/* remove this snapshot logical volume from the chain */
 		lv_ptr->lv_snapshot_prev->lv_snapshot_next = lv_ptr->lv_snapshot_next;
@@ -2190,7 +2560,13 @@ static int lvm_do_lv_remove(int minor, char *lv_name, int l)
 		lvm_snapshot_release(lv_ptr);
 	}
 
+#ifdef CONFIG_DEVFS_FS
 	devfs_unregister(lv_devfs_handle[lv_ptr->lv_number]);
+#endif
+
+#if defined CONFIG_LVM_PROC_FS && defined CONFIG_PROC_FS
+	lvm_do_remove_proc_entry_of_lv ( vg_ptr, lv_ptr);
+#endif
 
 #ifdef DEBUG_KFREE
 	printk(KERN_DEBUG "%s -- kfree %d\n", lvm_name, __LINE__);
@@ -2207,8 +2583,7 @@
  */
 static int lvm_do_lv_extend_reduce(int minor, char *lv_name, lv_t *lv)
 {
-	int l, le, p, size, old_allocated_le;
-	uint32_t end, lv_status_save;
+	ulong end, l, le, p, size, old_allocated_le;
 	vg_t *vg_ptr = vg[VG_CHR(minor)];
 	lv_t *lv_ptr;
 	pe_t *pe;
@@ -2224,12 +2599,75 @@ static int lvm_do_lv_extend_reduce(int minor, char *lv_name, lv_t *lv)
 	lv_ptr = vg_ptr->lv[l];
 
 	/* check for active snapshot */
-	if (lv->lv_access & (LV_SNAPSHOT | LV_SNAPSHOT_ORG)) return -EPERM;
+	if (lv->lv_access & LV_SNAPSHOT)
+	{
+		ulong e;
+		lv_block_exception_t *lvbe, *lvbe_old;
+		struct list_head * lvs_hash_table_old;
+
+		if (lv->lv_block_exception == NULL) return -ENXIO;
+		size = lv->lv_remap_end * sizeof ( lv_block_exception_t);
+		if ((lvbe = vmalloc(size)) == NULL)
+		{
+			printk(KERN_CRIT
+			       "%s -- lvm_do_lv_extend_reduce: vmalloc error LV_BLOCK_EXCEPTION "
+			       "of %lu Byte at line %d\n",
+			       lvm_name, size, __LINE__);
+			return -ENOMEM;
+		}
+		if (lv->lv_remap_end > lv_ptr->lv_remap_end)
+		{
+			if (copy_from_user(lvbe, lv->lv_block_exception, size))
+			{
+				vfree(lvbe);
+				return -EFAULT;
+			}
+		}
+
+		lvbe_old = lv_ptr->lv_block_exception;
+		lvs_hash_table_old = lv_ptr->lv_snapshot_hash_table;
+
+		/* we need to play on the safe side here...
*/ + down(&lv_ptr->lv_snapshot_org->lv_snapshot_sem); + if (lv_ptr->lv_block_exception == NULL || + lv_ptr->lv_remap_ptr > lv_ptr->lv_remap_end) + { + up(&lv_ptr->lv_snapshot_org->lv_snapshot_sem); + vfree(lvbe); + return -EPERM; + } + memcpy(lvbe, + lv_ptr->lv_block_exception, + (lv->lv_remap_end > lv_ptr->lv_remap_end ? lv_ptr->lv_remap_ptr : lv->lv_remap_end) * sizeof(lv_block_exception_t)); + + lv_ptr->lv_block_exception = lvbe; + lv_ptr->lv_remap_end = lv->lv_remap_end; + if (lvm_snapshot_alloc_hash_table(lv_ptr) != 0) + { + lvm_drop_snapshot(lv_ptr, "hash_alloc"); + up(&lv_ptr->lv_snapshot_org->lv_snapshot_sem); + vfree(lvbe_old); + vfree(lvs_hash_table_old); + return 1; + } + + for (e = 0; e < lv_ptr->lv_remap_ptr; e++) + lvm_hash_link (lv_ptr->lv_block_exception + e, lv_ptr->lv_block_exception[e].rdev_org, lv_ptr->lv_block_exception[e].rsector_org, lv_ptr); + + up(&lv_ptr->lv_snapshot_org->lv_snapshot_sem); + + vfree(lvbe_old); + vfree(lvs_hash_table_old); + + return 0; + } + + /* we drop in here in case it is an original logical volume */ if ((pe = vmalloc(size = lv->lv_current_le * sizeof(pe_t))) == NULL) { printk(KERN_CRIT "%s -- lvm_do_lv_extend_reduce: vmalloc error LV_CURRENT_PE " - "of %d Byte at line %d\n", + "of %lu Byte at line %d\n", lvm_name, size, __LINE__); return -ENOMEM; } @@ -2248,11 +2686,6 @@ static int lvm_do_lv_extend_reduce(int minor, char *lv_name, lv_t *lv) vg_ptr->vg_name); #endif - lv_ptr->lv_status |= LV_SPINDOWN; - fsync_dev(lv_ptr->lv_dev); - lv_ptr->lv_status &= ~LV_ACTIVE; - invalidate_buffers(lv_ptr->lv_dev); - /* reduce allocation counters on PV(s) */ for (le = 0; le < lv_ptr->lv_allocated_le; le++) { vg_ptr->pe_allocated--; @@ -2270,19 +2703,29 @@ static int lvm_do_lv_extend_reduce(int minor, char *lv_name, lv_t *lv) pep1 = lv_ptr->lv_current_pe; end = lv_ptr->lv_current_le; - /* save open counter */ - lv_open = lv_ptr->lv_open; + /* save open counter... 
*/
+	lv->lv_open = lv_ptr->lv_open;
+	lv->lv_snapshot_prev = lv_ptr->lv_snapshot_prev;
+	lv->lv_snapshot_next = lv_ptr->lv_snapshot_next;
+	lv->lv_snapshot_org = lv_ptr->lv_snapshot_org;
+
+	lv->lv_current_pe = pe;
 
 	/* save # of old allocated logical extents */
 	old_allocated_le = lv_ptr->lv_allocated_le;
 
+	/* in case of shrinking -> let's flush */
+	if ( end > lv->lv_current_le) fsync_dev(lv_ptr->lv_dev);
+
 	/* copy preloaded LV */
-	lv_status_save = lv->lv_status;
-	lv->lv_status |= LV_SPINDOWN;
-	lv->lv_status &= ~LV_ACTIVE;
 	memcpy((char *) lv_ptr, (char *) lv, sizeof(lv_t));
-	lv_ptr->lv_current_pe = pe;
-	lv_ptr->lv_open = lv_open;
+
+	lvm_gendisk.part[MINOR(lv_ptr->lv_dev)].start_sect = 0;
+	lvm_gendisk.part[MINOR(lv_ptr->lv_dev)].nr_sects = lv_ptr->lv_size;
+	lvm_size[MINOR(lv_ptr->lv_dev)] = lv_ptr->lv_size >> 1;
+	/* vg_lv_map array doesn't have to be changed here */
+
+	LVM_CORRECT_READ_AHEAD(lv_ptr->lv_read_ahead);
 
 	/* save available i/o statistic data */
 	/* linear logical volume */
@@ -2290,8 +2733,8 @@ static int lvm_do_lv_extend_reduce(int minor, char *lv_name, lv_t *lv)
 		/* Check what last LE shall be used */
 		if (end > lv_ptr->lv_current_le) end = lv_ptr->lv_current_le;
 		for (le = 0; le < end; le++) {
-			lv_ptr->lv_current_pe[le].reads = pep1[le].reads;
-			lv_ptr->lv_current_pe[le].writes = pep1[le].writes;
+			lv_ptr->lv_current_pe[le].reads += pep1[le].reads;
+			lv_ptr->lv_current_pe[le].writes += pep1[le].writes;
 		}
 		/* striped logical volume */
 	} else {
@@ -2304,38 +2747,44 @@ static int lvm_do_lv_extend_reduce(int minor, char *lv_name, lv_t *lv)
 		for (i = source = dest = 0; i < lv_ptr->lv_stripes; i++) {
 			for (j = 0; j < end; j++) {
-				lv_ptr->lv_current_pe[dest + j].reads =
+				lv_ptr->lv_current_pe[dest + j].reads +=
 					pep1[source + j].reads;
-				lv_ptr->lv_current_pe[dest + j].writes =
+				lv_ptr->lv_current_pe[dest + j].writes +=
 					pep1[source + j].writes;
 			}
 			source += old_stripe_size;
 			dest += new_stripe_size;
 		}
 	}
 
-	vfree(pep1);
-	pep1 = NULL;
-
 	/* extend the PE count in PVs */
 	for (le = 0; le < lv_ptr->lv_allocated_le; le++) {
 		vg_ptr->pe_allocated++;
 		for (p = 0; p < vg_ptr->pv_cur; p++) {
 			if (vg_ptr->pv[p]->pv_dev ==
-			    vg_ptr->lv[l]->lv_current_pe[le].dev) {
+			    lv_ptr->lv_current_pe[le].dev) {
 				vg_ptr->pv[p]->pe_allocated++;
 				break;
 			}
 		}
 	}
 
-	lvm_gendisk.part[MINOR(lv_ptr->lv_dev)].start_sect = 0;
-	lvm_gendisk.part[MINOR(lv_ptr->lv_dev)].nr_sects = lv_ptr->lv_size;
-	lvm_size[MINOR(lv_ptr->lv_dev)] = lv_ptr->lv_size >> 1;
-	/* vg_lv_map array doesn't have to be changed here */
+	vfree ( pep1);
+	pep1 = NULL;
 
-	read_ahead[MAJOR_NR] = lv_ptr->lv_read_ahead = LVM_CORRECT_READ_AHEAD(lv_ptr->lv_read_ahead);
-	lv_ptr->lv_status = lv_status_save;
+	if (lv->lv_access & LV_SNAPSHOT_ORG)
+	{
+		/* Correct the snapshot size information */
+		while ((lv_ptr = lv_ptr->lv_snapshot_next) != NULL)
+		{
+			lv_ptr->lv_current_pe = lv_ptr->lv_snapshot_org->lv_current_pe;
+			lv_ptr->lv_allocated_le = lv_ptr->lv_snapshot_org->lv_allocated_le;
+			lv_ptr->lv_current_le = lv_ptr->lv_snapshot_org->lv_current_le;
+			lv_ptr->lv_size = lv_ptr->lv_snapshot_org->lv_size;
+			lvm_gendisk.part[MINOR(lv_ptr->lv_dev)].nr_sects = lv_ptr->lv_size;
+			lvm_size[MINOR(lv_ptr->lv_dev)] = lv_ptr->lv_size >> 1;
+		}
+	}
 
 	return 0;
 } /* lvm_do_lv_extend_reduce() */
@@ -2425,6 +2874,65 @@ static int lvm_do_lv_status_byindex(vg_t *vg_ptr,void *arg)
 
 /*
+ * character device support function logical volume status by device number
+ */
+static int lvm_do_lv_status_bydev(vg_t * vg_ptr, void * arg) {
+	int l;
+	lv_status_bydev_req_t lv_status_bydev_req;
+
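The striped branch of lvm_do_lv_extend_reduce() above migrates the per-extent I/O counters across the resize: extent j of stripe i sits at index i*old_stripe_size + j in the old pe_t array and at i*new_stripe_size + j in the new one, and the counters are now accumulated with += rather than overwritten. A standalone sketch of that remap follows; struct pe_stat stands in for the reads/writes fields of pe_t, and the sizes in main() are made up for the demo.

#include <stdio.h>
#include <string.h>

/* stand-in for the reads/writes fields of pe_t */
struct pe_stat { unsigned reads, writes; };

/* carry i/o counters from the old layout to the new one; 'end' is the
 * number of extents per stripe that survive the resize (the smaller of
 * the old and new per-stripe extent counts) */
static void remap_striped_stats(struct pe_stat *new_pe, int new_stripe_size,
				const struct pe_stat *old_pe, int old_stripe_size,
				int stripes, int end)
{
	int i, j, source = 0, dest = 0;

	for (i = 0; i < stripes; i++) {
		for (j = 0; j < end; j++) {
			new_pe[dest + j].reads  += old_pe[source + j].reads;
			new_pe[dest + j].writes += old_pe[source + j].writes;
		}
		source += old_stripe_size;
		dest += new_stripe_size;
	}
}

int main(void)
{
	struct pe_stat old_pe[2 * 2] = { {1,1}, {2,2}, {3,3}, {4,4} };
	struct pe_stat new_pe[2 * 3];	/* 2 stripes grown to 3 extents each */
	int i;

	memset(new_pe, 0, sizeof(new_pe));
	remap_striped_stats(new_pe, 3, old_pe, 2, 2, 2);
	for (i = 0; i < 6; i++)
		printf("pe %d: r=%u w=%u\n", i, new_pe[i].reads, new_pe[i].writes);
	return 0;
}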
+ if (vg_ptr == NULL) return -ENXIO; + if (copy_from_user(&lv_status_bydev_req, arg, + sizeof(lv_status_bydev_req)) != 0) + return -EFAULT; + + for ( l = 0; l < vg_ptr->lv_max; l++) { + if ( vg_ptr->lv[l] == NULL) continue; + if ( vg_ptr->lv[l]->lv_dev == lv_status_bydev_req.dev) break; + } + + if ( l == vg_ptr->lv_max) return -ENXIO; + + if (copy_to_user(lv_status_bydev_req.lv, + vg_ptr->lv[l], sizeof(lv_t)) != 0) + return -EFAULT; + + return 0; +} /* lvm_do_lv_status_bydev() */ + + +/* + * character device support function rename a logical volume + */ +static int lvm_do_lv_rename(vg_t *vg_ptr, lv_req_t *lv_req, lv_t *lv) +{ + int l = 0; + int ret = 0; + lv_t *lv_ptr = NULL; + + for (l = 0; l < vg_ptr->lv_max; l++) + { + if ( (lv_ptr = vg_ptr->lv[l]) == NULL) continue; + if (lv_ptr->lv_dev == lv->lv_dev) + { +#if defined CONFIG_LVM_PROC_FS && defined CONFIG_PROC_FS + lvm_do_remove_proc_entry_of_lv ( vg_ptr, lv_ptr); +#endif + strncpy(lv_ptr->lv_name, + lv_req->lv_name, + NAME_LEN); +#if defined CONFIG_LVM_PROC_FS && defined CONFIG_PROC_FS + lvm_do_create_proc_entry_of_lv ( vg_ptr, lv_ptr); +#endif + break; + } + } + if (l == vg_ptr->lv_max) ret = -ENODEV; + + return ret; +} /* lvm_do_lv_rename */ + + +/* * character device support function physical volume change */ static int lvm_do_pv_change(vg_t *vg_ptr, void *arg) @@ -2494,6 +3002,140 @@ static int lvm_do_pv_status(vg_t *vg_ptr, void *arg) } /* lvm_do_pv_status() */ + +/* + * create a /proc entry for a logical volume + */ +inline void lvm_do_create_proc_entry_of_lv ( vg_t *vg_ptr, lv_t *lv_ptr) { + char *basename; + + if ( vg_ptr->lv_subdir_pde != NULL) { + basename = strrchr(lv_ptr->lv_name, '/'); + if (basename == NULL) basename = lv_ptr->lv_name; + else basename++; + pde = create_proc_entry(basename, S_IFREG, + vg_ptr->lv_subdir_pde); + if ( pde != NULL) { + pde->read_proc = lvm_proc_read_lv_info; + pde->data = lv_ptr; + } + } +} + + +/* + * remove a /proc entry for a logical volume + */ +inline void lvm_do_remove_proc_entry_of_lv ( vg_t *vg_ptr, lv_t *lv_ptr) { + char *basename; + + if ( vg_ptr->lv_subdir_pde != NULL) { + basename = strrchr(lv_ptr->lv_name, '/'); + if (basename == NULL) basename = lv_ptr->lv_name; + else basename++; + remove_proc_entry(basename, vg_ptr->lv_subdir_pde); + } +} + + +/* + * create a /proc entry for a physical volume + */ +inline void lvm_do_create_proc_entry_of_pv ( vg_t *vg_ptr, pv_t *pv_ptr) { + char *basename; + + basename = strrchr(pv_ptr->pv_name, '/'); + if (basename == NULL) basename = pv_ptr->pv_name; + else basename++; + pde = create_proc_entry(basename, S_IFREG, vg_ptr->pv_subdir_pde); + if ( pde != NULL) { + pde->read_proc = lvm_proc_read_pv_info; + pde->data = pv_ptr; + } +} + + +/* + * remove a /proc entry for a physical volume + */ +inline void lvm_do_remove_proc_entry_of_pv ( vg_t *vg_ptr, pv_t *pv_ptr) { + char *basename; + + basename = strrchr(pv_ptr->pv_name, '/'); + if ( vg_ptr->pv_subdir_pde != NULL) { + basename = strrchr(pv_ptr->pv_name, '/'); + if (basename == NULL) basename = pv_ptr->pv_name; + else basename++; + remove_proc_entry(basename, vg_ptr->pv_subdir_pde); + } +} + + +/* + * create a /proc entry for a volume group + */ +#if defined CONFIG_LVM_PROC_FS && defined CONFIG_PROC_FS +void lvm_do_create_proc_entry_of_vg ( vg_t *vg_ptr) { + int l, p; + pv_t *pv_ptr; + lv_t *lv_ptr; + + pde = create_proc_entry(vg_ptr->vg_name, S_IFDIR, + lvm_proc_vg_subdir); + if ( pde != NULL) { + vg_ptr->vg_dir_pde = pde; + pde = create_proc_entry("group", S_IFREG, + vg_ptr->vg_dir_pde); + if ( 
pde != NULL) { + pde->read_proc = lvm_proc_read_vg_info; + pde->data = vg_ptr; + } + vg_ptr->lv_subdir_pde = + create_proc_entry(LVM_LV_SUBDIR, S_IFDIR, + vg_ptr->vg_dir_pde); + vg_ptr->pv_subdir_pde = + create_proc_entry(LVM_PV_SUBDIR, S_IFDIR, + vg_ptr->vg_dir_pde); + } + + if ( vg_ptr->pv_subdir_pde != NULL) { + for ( l = 0; l < vg_ptr->lv_max; l++) { + if ( ( lv_ptr = vg_ptr->lv[l]) == NULL) continue; + lvm_do_create_proc_entry_of_lv ( vg_ptr, lv_ptr); + } + for ( p = 0; p < vg_ptr->pv_max; p++) { + if ( ( pv_ptr = vg_ptr->pv[p]) == NULL) continue; + lvm_do_create_proc_entry_of_pv ( vg_ptr, pv_ptr); + } + } +} + +/* + * remove a /proc entry for a volume group + */ +void lvm_do_remove_proc_entry_of_vg ( vg_t *vg_ptr) { + int l, p; + lv_t *lv_ptr; + pv_t *pv_ptr; + + for ( l = 0; l < vg_ptr->lv_max; l++) { + if ( ( lv_ptr = vg_ptr->lv[l]) == NULL) continue; + lvm_do_remove_proc_entry_of_lv ( vg_ptr, vg_ptr->lv[l]); + } + for ( p = 0; p < vg_ptr->pv_max; p++) { + if ( ( pv_ptr = vg_ptr->pv[p]) == NULL) continue; + lvm_do_remove_proc_entry_of_pv ( vg_ptr, vg_ptr->pv[p]); + } + if ( vg_ptr->vg_dir_pde != NULL) { + remove_proc_entry(LVM_LV_SUBDIR, vg_ptr->vg_dir_pde); + remove_proc_entry(LVM_PV_SUBDIR, vg_ptr->vg_dir_pde); + remove_proc_entry("group", vg_ptr->vg_dir_pde); + remove_proc_entry(vg_ptr->vg_name, lvm_proc_vg_subdir); + } +} +#endif + + /* * support function initialize gendisk variables */ @@ -2516,8 +3158,9 @@ void __init lvm_blocksizes[i] = BLOCK_SIZE; } - blksize_size[MAJOR_NR] = lvm_blocksizes; blk_size[MAJOR_NR] = lvm_size; + blksize_size[MAJOR_NR] = lvm_blocksizes; + hardsect_size[MAJOR_NR] = lvm_blocksizes; return; } /* lvm_gen_init() */ @@ -2533,17 +3176,8 @@ void __init * * Is this the real thing? * - * No, it's bollocks. md.c tries to do a bit different thing that might - * _somewhat_ work eons ago. Neither does any good these days. mount() couldn't - * care less for icache (it cares only for ->s_root->d_count and if we want - * loopback mounts even that will stop). BTW, with the form used here mount() - * would have to scan the _whole_ icache to detect the attempt - how on the - * Earth could it guess the i_ino of your dummy inode? Official line on the - * exclusion between mount()/swapon()/open()/etc. is Just Don't Do It(tm). - * If you can convince Linus that it's worth changing - fine, then you'll need - * to do blkdev_get()/blkdev_put(). Until then... 
*/ -struct inode *lvm_get_inode(kdev_t dev) +struct inode *lvm_get_inode(int dev) { struct inode *inode_this = NULL; diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 55c50c5e7..663dfd395 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -30,12 +30,12 @@ static mdk_personality_t raid5_personality; * Stripe cache */ -#define NR_STRIPES 128 +#define NR_STRIPES 256 #define HASH_PAGES 1 #define HASH_PAGES_ORDER 0 #define NR_HASH (HASH_PAGES * PAGE_SIZE / sizeof(struct stripe_head *)) #define HASH_MASK (NR_HASH - 1) -#define stripe_hash(conf, sect, size) ((conf)->stripe_hashtbl[((sect) / (size >> 9)) & HASH_MASK]) +#define stripe_hash(conf, sect) ((conf)->stripe_hashtbl[((sect) / ((conf)->buffer_size >> 9)) & HASH_MASK]) /* * The following can be used to debug the driver @@ -44,10 +44,8 @@ static mdk_personality_t raid5_personality; #define RAID5_PARANOIA 1 #if RAID5_PARANOIA && CONFIG_SMP # define CHECK_DEVLOCK() if (!spin_is_locked(&conf->device_lock)) BUG() -# define CHECK_SHLOCK(sh) if (!stripe_locked(sh)) BUG() #else # define CHECK_DEVLOCK() -# define CHECK_SHLOCK(unused) #endif #if RAID5_DEBUG @@ -60,196 +58,98 @@ static mdk_personality_t raid5_personality; static void print_raid5_conf (raid5_conf_t *conf); -static inline int stripe_locked(struct stripe_head *sh) +static inline void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh) { - return test_bit(STRIPE_LOCKED, &sh->state); -} - -static void __unlock_stripe(struct stripe_head *sh) -{ - if (!md_test_and_clear_bit(STRIPE_LOCKED, &sh->state)) - BUG(); - PRINTK("unlocking stripe %lu\n", sh->sector); - wake_up(&sh->wait); + if (atomic_dec_and_test(&sh->count)) { + if (!list_empty(&sh->lru)) + BUG(); + if (atomic_read(&conf->active_stripes)==0) + BUG(); + if (test_bit(STRIPE_HANDLE, &sh->state)) { + list_add_tail(&sh->lru, &conf->handle_list); + md_wakeup_thread(conf->thread); + } + else { + list_add_tail(&sh->lru, &conf->inactive_list); + atomic_dec(&conf->active_stripes); + wake_up(&conf->wait_for_stripe); + } + } } - -static void finish_unlock_stripe(struct stripe_head *sh) +static void release_stripe(struct stripe_head *sh) { raid5_conf_t *conf = sh->raid_conf; - sh->cmd = STRIPE_NONE; - sh->phase = PHASE_COMPLETE; - atomic_dec(&conf->nr_pending_stripes); - atomic_inc(&conf->nr_cached_stripes); - __unlock_stripe(sh); - atomic_dec(&sh->count); - wake_up(&conf->wait_for_stripe); + + spin_lock_irq(&conf->device_lock); + __release_stripe(conf, sh); + spin_unlock_irq(&conf->device_lock); } -static void remove_hash(raid5_conf_t *conf, struct stripe_head *sh) +static void remove_hash(struct stripe_head *sh) { PRINTK("remove_hash(), stripe %lu\n", sh->sector); - CHECK_DEVLOCK(); - CHECK_SHLOCK(sh); if (sh->hash_pprev) { if (sh->hash_next) sh->hash_next->hash_pprev = sh->hash_pprev; *sh->hash_pprev = sh->hash_next; sh->hash_pprev = NULL; - atomic_dec(&conf->nr_hashed_stripes); } } -static void lock_get_bh (struct buffer_head *bh) -{ - while (md_test_and_set_bit(BH_Lock, &bh->b_state)) - __wait_on_buffer(bh); - atomic_inc(&bh->b_count); -} - static __inline__ void insert_hash(raid5_conf_t *conf, struct stripe_head *sh) { - struct stripe_head **shp = &stripe_hash(conf, sh->sector, sh->size); + struct stripe_head **shp = &stripe_hash(conf, sh->sector); - PRINTK("insert_hash(), stripe %lu, nr_hashed_stripes %d\n", - sh->sector, atomic_read(&conf->nr_hashed_stripes)); + PRINTK("insert_hash(), stripe %lu\n",sh->sector); CHECK_DEVLOCK(); - CHECK_SHLOCK(sh); if ((sh->hash_next = *shp) != NULL) (*shp)->hash_pprev = 
&sh->hash_next; *shp = sh; sh->hash_pprev = shp; - atomic_inc(&conf->nr_hashed_stripes); } -static struct buffer_head *get_free_buffer(struct stripe_head *sh, int b_size) -{ - struct buffer_head *bh; - unsigned long flags; - - CHECK_SHLOCK(sh); - md_spin_lock_irqsave(&sh->stripe_lock, flags); - bh = sh->buffer_pool; - if (!bh) - goto out_unlock; - sh->buffer_pool = bh->b_next; - bh->b_size = b_size; - if (atomic_read(&bh->b_count)) - BUG(); -out_unlock: - md_spin_unlock_irqrestore(&sh->stripe_lock, flags); - - return bh; -} - -static struct buffer_head *get_free_bh(struct stripe_head *sh) -{ - struct buffer_head *bh; - unsigned long flags; - - CHECK_SHLOCK(sh); - md_spin_lock_irqsave(&sh->stripe_lock, flags); - bh = sh->bh_pool; - if (!bh) - goto out_unlock; - sh->bh_pool = bh->b_next; - if (atomic_read(&bh->b_count)) - BUG(); -out_unlock: - md_spin_unlock_irqrestore(&sh->stripe_lock, flags); - - return bh; -} - -static void put_free_buffer(struct stripe_head *sh, struct buffer_head *bh) -{ - unsigned long flags; - - if (atomic_read(&bh->b_count)) - BUG(); - CHECK_SHLOCK(sh); - md_spin_lock_irqsave(&sh->stripe_lock, flags); - bh->b_next = sh->buffer_pool; - sh->buffer_pool = bh; - md_spin_unlock_irqrestore(&sh->stripe_lock, flags); -} - -static void put_free_bh(struct stripe_head *sh, struct buffer_head *bh) -{ - unsigned long flags; - - if (atomic_read(&bh->b_count)) - BUG(); - CHECK_SHLOCK(sh); - md_spin_lock_irqsave(&sh->stripe_lock, flags); - bh->b_next = sh->bh_pool; - sh->bh_pool = bh; - md_spin_unlock_irqrestore(&sh->stripe_lock, flags); -} +/* find an idle stripe, make sure it is unhashed, and return it. */ static struct stripe_head *get_free_stripe(raid5_conf_t *conf) { - struct stripe_head *sh; + struct stripe_head *sh = NULL; + struct list_head *first; - md_spin_lock_irq(&conf->device_lock); - sh = conf->free_sh_list; - if (!sh) + CHECK_DEVLOCK(); + if (list_empty(&conf->inactive_list)) goto out; - conf->free_sh_list = sh->free_next; - atomic_dec(&conf->nr_free_sh); - if (!atomic_read(&conf->nr_free_sh) && conf->free_sh_list) - BUG(); - if (sh->hash_pprev || md_atomic_read(&sh->nr_pending) || - atomic_read(&sh->count)) - BUG(); + first = conf->inactive_list.next; + sh = list_entry(first, struct stripe_head, lru); + list_del_init(first); + remove_hash(sh); + atomic_inc(&conf->active_stripes); out: - md_spin_unlock_irq(&conf->device_lock); return sh; } -static void __put_free_stripe (raid5_conf_t *conf, struct stripe_head *sh) -{ - if (atomic_read(&sh->count) != 0) - BUG(); - CHECK_DEVLOCK(); - CHECK_SHLOCK(sh); - clear_bit(STRIPE_LOCKED, &sh->state); - sh->free_next = conf->free_sh_list; - conf->free_sh_list = sh; - atomic_inc(&conf->nr_free_sh); -} - static void shrink_buffers(struct stripe_head *sh, int num) { struct buffer_head *bh; + int i; - while (num--) { - bh = get_free_buffer(sh, -1); + for (i=0; i<num ; i++) { + bh = sh->bh_cache[i]; if (!bh) return; + sh->bh_cache[i] = NULL; free_page((unsigned long) bh->b_data); kfree(bh); } } -static void shrink_bh(struct stripe_head *sh, int num) -{ - struct buffer_head *bh; - - while (num--) { - bh = get_free_bh(sh); - if (!bh) - return; - kfree(bh); - } -} - -static int grow_raid5_buffers(struct stripe_head *sh, int num, int b_size, int priority) +static int grow_buffers(struct stripe_head *sh, int num, int b_size, int priority) { struct buffer_head *bh; + int i; - while (num--) { + for (i=0; i<num; i++) { struct page *page; bh = kmalloc(sizeof(struct buffer_head), priority); if (!bh) @@ -262,239 +162,155 @@ static int 
grow_raid5_buffers(struct stripe_head *sh, int num, int b_size, int p kfree(bh); return 1; } - bh->b_size = b_size; atomic_set(&bh->b_count, 0); bh->b_page = page; - put_free_buffer(sh, bh); - } - return 0; -} + sh->bh_cache[i] = bh; -static int grow_bh(struct stripe_head *sh, int num, int priority) -{ - struct buffer_head *bh; - - while (num--) { - bh = kmalloc(sizeof(struct buffer_head), priority); - if (!bh) - return 1; - memset(bh, 0, sizeof (struct buffer_head)); - init_waitqueue_head(&bh->b_wait); - put_free_bh(sh, bh); } return 0; } -static void raid5_free_buffer(struct stripe_head *sh, struct buffer_head *bh) -{ - put_free_buffer(sh, bh); -} +static struct buffer_head *raid5_build_block (struct stripe_head *sh, int i); -static void raid5_free_bh(struct stripe_head *sh, struct buffer_head *bh) -{ - put_free_bh(sh, bh); -} - -static void raid5_free_old_bh(struct stripe_head *sh, int i) -{ - CHECK_SHLOCK(sh); - if (!sh->bh_old[i]) - BUG(); - raid5_free_buffer(sh, sh->bh_old[i]); - sh->bh_old[i] = NULL; -} - -static void raid5_update_old_bh(struct stripe_head *sh, int i) -{ - CHECK_SHLOCK(sh); - PRINTK("stripe %lu, idx %d, updating cache copy\n", sh->sector, i); - if (!sh->bh_copy[i]) - BUG(); - if (sh->bh_old[i]) - raid5_free_old_bh(sh, i); - sh->bh_old[i] = sh->bh_copy[i]; - sh->bh_copy[i] = NULL; -} - -static void free_stripe(struct stripe_head *sh) +static inline void init_stripe(struct stripe_head *sh, unsigned long sector) { raid5_conf_t *conf = sh->raid_conf; - int disks = conf->raid_disks, j; + int disks = conf->raid_disks, i; if (atomic_read(&sh->count) != 0) BUG(); + if (test_bit(STRIPE_HANDLE, &sh->state)) + BUG(); + CHECK_DEVLOCK(); - CHECK_SHLOCK(sh); - PRINTK("free_stripe called, stripe %lu\n", sh->sector); - if (sh->phase != PHASE_COMPLETE || atomic_read(&sh->count)) { - PRINTK("raid5: free_stripe(), sector %lu, phase %d, count %d\n", sh->sector, sh->phase, atomic_read(&sh->count)); - return; - } - for (j = 0; j < disks; j++) { - if (sh->bh_old[j]) - raid5_free_old_bh(sh, j); - if (sh->bh_new[j] || sh->bh_copy[j]) - BUG(); - } - remove_hash(conf, sh); - __put_free_stripe(conf, sh); -} + PRINTK("init_stripe called, stripe %lu\n", sh->sector); -static int shrink_stripe_cache(raid5_conf_t *conf, int nr) -{ - struct stripe_head *sh; - int i, count = 0; - - PRINTK("shrink_stripe_cache called, %d/%d, clock %d\n", nr, atomic_read(&conf->nr_hashed_stripes), conf->clock); - md_spin_lock_irq(&conf->device_lock); - for (i = 0; i < NR_HASH; i++) { - sh = conf->stripe_hashtbl[(i + conf->clock) & HASH_MASK]; - for (; sh; sh = sh->hash_next) { - if (sh->phase != PHASE_COMPLETE) - continue; - if (atomic_read(&sh->count)) - continue; - /* - * Try to lock this stripe: - */ - if (md_test_and_set_bit(STRIPE_LOCKED, &sh->state)) - continue; - free_stripe(sh); - if (++count == nr) { - conf->clock = (i + conf->clock) & HASH_MASK; - goto out; - } + remove_hash(sh); + + sh->sector = sector; + sh->size = conf->buffer_size; + sh->state = 0; + + for (i=disks; i--; ) { + if (sh->bh_read[i] || sh->bh_write[i] || sh->bh_written[i] || + buffer_locked(sh->bh_cache[i])) { + printk("sector=%lx i=%d %p %p %p %d\n", + sh->sector, i, sh->bh_read[i], + sh->bh_write[i], sh->bh_written[i], + buffer_locked(sh->bh_cache[i])); + BUG(); } + clear_bit(BH_Uptodate, &sh->bh_cache[i]->b_state); + raid5_build_block(sh, i); } -out: - md_spin_unlock_irq(&conf->device_lock); - PRINTK("shrink completed, nr_hashed_stripes %d, nr_pending_strips %d\n", - atomic_read(&conf->nr_hashed_stripes), - 
atomic_read(&conf->nr_pending_stripes)); - return count; + insert_hash(conf, sh); } -void __wait_lock_stripe(struct stripe_head *sh) +/* the buffer size has changed, so unhash all stripes + * as active stripes complete, they will go onto inactive list + */ +static void shrink_stripe_cache(raid5_conf_t *conf) { - MD_DECLARE_WAITQUEUE(wait, current); - - PRINTK("wait_lock_stripe %lu\n", sh->sector); - if (!atomic_read(&sh->count)) + int i; + CHECK_DEVLOCK(); + if (atomic_read(&conf->active_stripes)) BUG(); - add_wait_queue(&sh->wait, &wait); -repeat: - set_current_state(TASK_UNINTERRUPTIBLE); - if (md_test_and_set_bit(STRIPE_LOCKED, &sh->state)) { - schedule(); - goto repeat; + for (i=0; i < NR_HASH; i++) { + struct stripe_head *sh; + while ((sh = conf->stripe_hashtbl[i])) + remove_hash(sh); } - PRINTK("wait_lock_stripe %lu done\n", sh->sector); - remove_wait_queue(&sh->wait, &wait); - current->state = TASK_RUNNING; } -static struct stripe_head *__find_stripe(raid5_conf_t *conf, unsigned long sector, int size) +static struct stripe_head *__find_stripe(raid5_conf_t *conf, unsigned long sector) { struct stripe_head *sh; + CHECK_DEVLOCK(); PRINTK("__find_stripe, sector %lu\n", sector); - for (sh = stripe_hash(conf, sector, size); sh; sh = sh->hash_next) { - if (sh->sector == sector && sh->raid_conf == conf) { - if (sh->size != size) - BUG(); + for (sh = stripe_hash(conf, sector); sh; sh = sh->hash_next) + if (sh->sector == sector) return sh; - } - } PRINTK("__stripe %lu not in cache\n", sector); return NULL; } -static inline struct stripe_head *alloc_stripe(raid5_conf_t *conf, unsigned long sector, int size) +static struct stripe_head *get_active_stripe(raid5_conf_t *conf, unsigned long sector, int size, int noblock) { struct stripe_head *sh; - struct buffer_head *buffer_pool, *bh_pool; - MD_DECLARE_WAITQUEUE(wait, current); - - PRINTK("alloc_stripe called\n"); - - - while ((sh = get_free_stripe(conf)) == NULL) { - int cnt; - add_wait_queue(&conf->wait_for_stripe, &wait); - set_current_state(TASK_UNINTERRUPTIBLE); - cnt = shrink_stripe_cache(conf, conf->max_nr_stripes / 8); - sh = get_free_stripe(conf); - if (!sh && cnt < (conf->max_nr_stripes/8)) { - md_wakeup_thread(conf->thread); - PRINTK("waiting for some stripes to complete - %d %d\n", cnt, conf->max_nr_stripes/8); - schedule(); - } - remove_wait_queue(&conf->wait_for_stripe, &wait); - current->state = TASK_RUNNING; - if (sh) - break; - } - buffer_pool = sh->buffer_pool; - bh_pool = sh->bh_pool; - memset(sh, 0, sizeof(*sh)); - sh->stripe_lock = MD_SPIN_LOCK_UNLOCKED; - md_init_waitqueue_head(&sh->wait); - sh->buffer_pool = buffer_pool; - sh->bh_pool = bh_pool; - sh->phase = PHASE_COMPLETE; - sh->cmd = STRIPE_NONE; - sh->raid_conf = conf; - sh->sector = sector; - sh->size = size; - atomic_inc(&conf->nr_cached_stripes); - - return sh; -} + PRINTK("get_stripe, sector %lu\n", sector); -static struct stripe_head *get_lock_stripe(raid5_conf_t *conf, unsigned long sector, int size) -{ - struct stripe_head *sh, *new = NULL; + md_spin_lock_irq(&conf->device_lock); - PRINTK("get_stripe, sector %lu\n", sector); + do { + if (conf->buffer_size == 0 || + (size && size != conf->buffer_size)) { + /* either the size is being changed (buffer_size==0) or + * we need to change it. + * If size==0, we can proceed as soon as buffer_size gets set. + * If size>0, we can proceed when active_stripes reaches 0, or + * when someone else sets the buffer_size to size. 
+ * If someone sets the buffer size to something else, we will need to + * assert that we want to change it again + */ + int oldsize = conf->buffer_size; + PRINTK("get_stripe %ld/%d buffer_size is %d, %d active\n", sector, size, conf->buffer_size, atomic_read(&conf->active_stripes)); + if (size==0) + wait_event_lock_irq(conf->wait_for_stripe, + conf->buffer_size, + conf->device_lock); + else { + while (conf->buffer_size != size && atomic_read(&conf->active_stripes)) { + conf->buffer_size = 0; + wait_event_lock_irq(conf->wait_for_stripe, + atomic_read(&conf->active_stripes)==0 || conf->buffer_size, + conf->device_lock); + PRINTK("waited and now %ld/%d buffer_size is %d - %d active\n", sector, size, + conf->buffer_size, atomic_read(&conf->active_stripes)); + } - /* - * Do this in set_blocksize()! - */ - if (conf->buffer_size != size) { - PRINTK("switching size, %d --> %d\n", conf->buffer_size, size); - shrink_stripe_cache(conf, conf->max_nr_stripes); - conf->buffer_size = size; - } + if (conf->buffer_size != size) { + printk("raid5: switching cache buffer size, %d --> %d\n", oldsize, size); + shrink_stripe_cache(conf); + if (size==0) BUG(); + conf->buffer_size = size; + PRINTK("size now %d\n", conf->buffer_size); + } + } + } + if (size == 0) + sector -= sector & ((conf->buffer_size>>9)-1); -repeat: - md_spin_lock_irq(&conf->device_lock); - sh = __find_stripe(conf, sector, size); - if (!sh) { - if (!new) { - md_spin_unlock_irq(&conf->device_lock); - new = alloc_stripe(conf, sector, size); - goto repeat; + sh = __find_stripe(conf, sector); + if (!sh) { + sh = get_free_stripe(conf); + if (noblock && sh == NULL) + break; + if (!sh) { + wait_event_lock_irq(conf->wait_for_stripe, + !list_empty(&conf->inactive_list), + conf->device_lock); + } else + init_stripe(sh, sector); + } else { + if (atomic_read(&sh->count)) { + if (!list_empty(&sh->lru)) + BUG(); + } else { + if (!test_bit(STRIPE_HANDLE, &sh->state)) + atomic_inc(&conf->active_stripes); + if (list_empty(&sh->lru)) + BUG(); + list_del_init(&sh->lru); + } } - sh = new; - new = NULL; - if (md_test_and_set_bit(STRIPE_LOCKED, &sh->state)) - BUG(); - insert_hash(conf, sh); - atomic_inc(&sh->count); - md_spin_unlock_irq(&conf->device_lock); - } else { + } while (sh == NULL); + + if (sh) atomic_inc(&sh->count); - if (new) { - if (md_test_and_set_bit(STRIPE_LOCKED, &new->state)) - BUG(); - __put_free_stripe(conf, new); - } - md_spin_unlock_irq(&conf->device_lock); - PRINTK("get_stripe, waiting, sector %lu\n", sector); - if (md_test_and_set_bit(STRIPE_LOCKED, &sh->state)) - __wait_lock_stripe(sh); - } + + md_spin_unlock_irq(&conf->device_lock); return sh; } @@ -508,26 +324,18 @@ static int grow_stripes(raid5_conf_t *conf, int num, int priority) return 1; memset(sh, 0, sizeof(*sh)); sh->raid_conf = conf; - sh->stripe_lock = MD_SPIN_LOCK_UNLOCKED; - md_init_waitqueue_head(&sh->wait); + sh->lock = SPIN_LOCK_UNLOCKED; - if (md_test_and_set_bit(STRIPE_LOCKED, &sh->state)) - BUG(); - if (grow_raid5_buffers(sh, 2 * conf->raid_disks, PAGE_SIZE, priority)) { - shrink_buffers(sh, 2 * conf->raid_disks); - kfree(sh); - return 1; - } - if (grow_bh(sh, conf->raid_disks, priority)) { - shrink_buffers(sh, 2 * conf->raid_disks); - shrink_bh(sh, conf->raid_disks); + if (grow_buffers(sh, conf->raid_disks, PAGE_SIZE, priority)) { + shrink_buffers(sh, conf->raid_disks); kfree(sh); return 1; } - md_spin_lock_irq(&conf->device_lock); - __put_free_stripe(conf, sh); - atomic_inc(&conf->nr_stripes); - md_spin_unlock_irq(&conf->device_lock); + /* we just created an active 
stripe so... */ + atomic_set(&sh->count, 1); + atomic_inc(&conf->active_stripes); + INIT_LIST_HEAD(&sh->lru); + release_stripe(sh); } return 0; } @@ -537,119 +345,124 @@ static void shrink_stripes(raid5_conf_t *conf, int num) struct stripe_head *sh; while (num--) { + spin_lock_irq(&conf->device_lock); sh = get_free_stripe(conf); + spin_unlock_irq(&conf->device_lock); if (!sh) break; - if (md_test_and_set_bit(STRIPE_LOCKED, &sh->state)) + if (atomic_read(&sh->count)) BUG(); - shrink_buffers(sh, conf->raid_disks * 2); - shrink_bh(sh, conf->raid_disks); + shrink_buffers(sh, conf->raid_disks); kfree(sh); - atomic_dec(&conf->nr_stripes); + atomic_dec(&conf->active_stripes); } } -static struct buffer_head *raid5_alloc_buffer(struct stripe_head *sh, int b_size) +static inline void raid5_end_buffer_read(struct buffer_head *blist, struct buffer_head *bh) { - struct buffer_head *bh; - - bh = get_free_buffer(sh, b_size); - if (!bh) - BUG(); - return bh; + while (blist) { + struct buffer_head *new = blist; + blist = new->b_reqnext; + memcpy(new->b_data, bh->b_data, bh->b_size); + new->b_end_io(new, 1); + } } -static struct buffer_head *raid5_alloc_bh(struct stripe_head *sh) +static void raid5_end_read_request (struct buffer_head * bh, int uptodate) { - struct buffer_head *bh; + struct stripe_head *sh = bh->b_private; + raid5_conf_t *conf = sh->raid_conf; + int disks = conf->raid_disks, i; + unsigned long flags; + struct buffer_head *buffers = NULL; - bh = get_free_bh(sh); - if (!bh) - BUG(); - return bh; -} + for (i=0 ; i<disks; i++) + if (bh == sh->bh_cache[i]) + break; -static void raid5_end_buffer_io (struct stripe_head *sh, int i, int uptodate) -{ - struct buffer_head *bh = sh->bh_new[i]; - - PRINTK("raid5_end_buffer_io %lu, uptodate: %d.\n", bh->b_blocknr, uptodate); - sh->bh_new[i] = NULL; - raid5_free_bh(sh, sh->bh_req[i]); - sh->bh_req[i] = NULL; - PRINTK("calling %p->end_io: %p.\n", bh, bh->b_end_io); - bh->b_end_io(bh, uptodate); - if (!uptodate) - printk(KERN_ALERT "raid5: %s: unrecoverable I/O error for " - "block %lu\n", - partition_name(mddev_to_kdev(sh->raid_conf->mddev)), - bh->b_blocknr); -} + PRINTK("end_read_request %lu/%d, %d, count: %d, uptodate %d.\n", sh->sector, i, atomic_read(&sh->count), uptodate); + if (i == disks) { + BUG(); + return; + } -static inline void raid5_mark_buffer_uptodate (struct buffer_head *bh, int uptodate) -{ - if (uptodate) + md_spin_lock_irqsave(&conf->device_lock, flags); + if (uptodate) { +#ifdef CONFIG_HIGHMEM + /* cannot map highmem bufferheads from irq, + * so leave it for stripe_handle if there might + * be a problem + */ + if (sh->bh_read[i] && + sh->bh_read[i]->b_reqnext == NULL && + !PageHighMem(sh->bh_read[i]->b_page)) { + /* it's safe */ + buffers = sh->bh_read[i]; + sh->bh_read[i] = NULL; + } +#else + buffers = sh->bh_read[i]; + sh->bh_read[i] = NULL; +#endif set_bit(BH_Uptodate, &bh->b_state); - else + if (buffers) { + spin_unlock_irqrestore(&conf->device_lock, flags); + raid5_end_buffer_read(buffers, bh); + spin_lock_irqsave(&conf->device_lock, flags); + } + } else { + md_error(mddev_to_kdev(conf->mddev), bh->b_dev); clear_bit(BH_Uptodate, &bh->b_state); + } + clear_bit(BH_Lock, &bh->b_state); + set_bit(STRIPE_HANDLE, &sh->state); + __release_stripe(conf, sh); + md_spin_unlock_irqrestore(&conf->device_lock, flags); } -static void raid5_end_request (struct buffer_head * bh, int uptodate) +static void raid5_end_write_request (struct buffer_head *bh, int uptodate) { struct stripe_head *sh = bh->b_private; raid5_conf_t *conf = sh->raid_conf; int 
disks = conf->raid_disks, i; unsigned long flags; - PRINTK("end_request %lu, nr_pending %d, uptodate: %d, (caller: %p,%p,%p,%p).\n", sh->sector, atomic_read(&sh->nr_pending), uptodate, __builtin_return_address(0),__builtin_return_address(1),__builtin_return_address(2), __builtin_return_address(3)); - md_spin_lock_irqsave(&sh->stripe_lock, flags); - raid5_mark_buffer_uptodate(bh, uptodate); - if (!uptodate) - md_error(mddev_to_kdev(conf->mddev), bh->b_dev); - if (conf->failed_disks) { - for (i = 0; i < disks; i++) { - if (conf->disks[i].operational) - continue; - if (bh != sh->bh_old[i] && bh != sh->bh_req[i] && bh != sh->bh_copy[i]) - continue; - if (bh->b_dev != conf->disks[i].dev) - continue; - set_bit(STRIPE_ERROR, &sh->state); - } - } - md_spin_unlock_irqrestore(&sh->stripe_lock, flags); + for (i=0 ; i<disks; i++) + if (bh == sh->bh_cache[i]) + break; - if (atomic_dec_and_test(&sh->nr_pending)) { - atomic_inc(&conf->nr_handle); - md_wakeup_thread(conf->thread); + PRINTK("end_write_request %lu/%d, count %d, uptodate: %d.\n", sh->sector, i, atomic_read(&sh->count), uptodate); + if (i == disks) { + BUG(); + return; } + + md_spin_lock_irqsave(&conf->device_lock, flags); + if (!uptodate) + md_error(mddev_to_kdev(conf->mddev), bh->b_dev); + clear_bit(BH_Lock, &bh->b_state); + set_bit(STRIPE_HANDLE, &sh->state); + __release_stripe(conf, sh); + md_spin_unlock_irqrestore(&conf->device_lock, flags); } + -static void raid5_build_block (struct stripe_head *sh, struct buffer_head *bh, int i) + +static struct buffer_head *raid5_build_block (struct stripe_head *sh, int i) { raid5_conf_t *conf = sh->raid_conf; - char *b_data; - struct page *b_page; + struct buffer_head *bh = sh->bh_cache[i]; unsigned long block = sh->sector / (sh->size >> 9); - b_data = bh->b_data; - b_page = bh->b_page; - memset (bh, 0, sizeof (struct buffer_head)); - init_waitqueue_head(&bh->b_wait); - init_buffer(bh, raid5_end_request, sh); - bh->b_dev = conf->disks[i].dev; - bh->b_blocknr = block; - - bh->b_data = b_data; - bh->b_page = b_page; - - bh->b_rdev = conf->disks[i].dev; - bh->b_rsector = sh->sector; + init_buffer(bh, raid5_end_read_request, sh); + bh->b_dev = conf->disks[i].dev; + bh->b_blocknr = block; bh->b_state = (1 << BH_Req) | (1 << BH_Mapped); bh->b_size = sh->size; bh->b_list = BUF_LOCKED; + return bh; } static int raid5_error (mddev_t *mddev, kdev_t dev) @@ -778,6 +591,7 @@ static unsigned long raid5_compute_sector(unsigned long r_sector, unsigned int r return new_sector; } +#if 0 static unsigned long compute_blocknr(struct stripe_head *sh, int i) { raid5_conf_t *conf = sh->raid_conf; @@ -816,38 +630,42 @@ static unsigned long compute_blocknr(struct stripe_head *sh, int i) } return blocknr; } +#endif + +#define check_xor() do { \ + if (count == MAX_XOR_BLOCKS) { \ + xor_block(count, bh_ptr); \ + count = 1; \ + } \ + } while(0) + static void compute_block(struct stripe_head *sh, int dd_idx) { raid5_conf_t *conf = sh->raid_conf; int i, count, disks = conf->raid_disks; - struct buffer_head *bh_ptr[MAX_XOR_BLOCKS]; + struct buffer_head *bh_ptr[MAX_XOR_BLOCKS], *bh; PRINTK("compute_block, stripe %lu, idx %d\n", sh->sector, dd_idx); - if (sh->bh_old[dd_idx] == NULL) - sh->bh_old[dd_idx] = raid5_alloc_buffer(sh, sh->size); - raid5_build_block(sh, sh->bh_old[dd_idx], dd_idx); - memset(sh->bh_old[dd_idx]->b_data, 0, sh->size); - bh_ptr[0] = sh->bh_old[dd_idx]; + memset(sh->bh_cache[dd_idx]->b_data, 0, sh->size); + bh_ptr[0] = sh->bh_cache[dd_idx]; count = 1; - for (i = 0; i < disks; i++) { + for (i = disks ; i--; ) { if 
(i == dd_idx) continue; - if (sh->bh_old[i]) { - bh_ptr[count++] = sh->bh_old[i]; - } else { + bh = sh->bh_cache[i]; + if (buffer_uptodate(bh)) + bh_ptr[count++] = bh; + else printk("compute_block() %d, stripe %lu, %d not present\n", dd_idx, sh->sector, i); - } - if (count == MAX_XOR_BLOCKS) { - xor_block(count, &bh_ptr[0]); - count = 1; - } + + check_xor(); } if (count != 1) - xor_block(count, &bh_ptr[0]); - raid5_mark_buffer_uptodate(sh->bh_old[dd_idx], 1); + xor_block(count, bh_ptr); + set_bit(BH_Uptodate, &sh->bh_cache[dd_idx]->b_state); } static void compute_parity(struct stripe_head *sh, int method) @@ -855,604 +673,432 @@ static void compute_parity(struct stripe_head *sh, int method) raid5_conf_t *conf = sh->raid_conf; int i, pd_idx = sh->pd_idx, disks = conf->raid_disks, count; struct buffer_head *bh_ptr[MAX_XOR_BLOCKS]; + struct buffer_head *chosen[MD_SB_DISKS]; PRINTK("compute_parity, stripe %lu, method %d\n", sh->sector, method); - for (i = 0; i < disks; i++) { - char *bdata; - if (i == pd_idx || !sh->bh_new[i]) - continue; - if (!sh->bh_copy[i]) - sh->bh_copy[i] = raid5_alloc_buffer(sh, sh->size); - raid5_build_block(sh, sh->bh_copy[i], i); - atomic_set_buffer_dirty(sh->bh_copy[i]); - bdata = bh_kmap(sh->bh_new[i]); - memcpy(sh->bh_copy[i]->b_data, bdata, sh->size); - bh_kunmap(sh->bh_new[i]); - } - if (sh->bh_copy[pd_idx] == NULL) { - sh->bh_copy[pd_idx] = raid5_alloc_buffer(sh, sh->size); - atomic_set_buffer_dirty(sh->bh_copy[pd_idx]); - } - raid5_build_block(sh, sh->bh_copy[pd_idx], sh->pd_idx); + memset(chosen, 0, sizeof(chosen)); - if (method == RECONSTRUCT_WRITE) { - memset(sh->bh_copy[pd_idx]->b_data, 0, sh->size); - bh_ptr[0] = sh->bh_copy[pd_idx]; - count = 1; - for (i = 0; i < disks; i++) { - if (i == sh->pd_idx) + count = 1; + bh_ptr[0] = sh->bh_cache[pd_idx]; + spin_lock_irq(&conf->device_lock); + switch(method) { + case READ_MODIFY_WRITE: + if (!buffer_uptodate(sh->bh_cache[pd_idx])) + BUG(); + for (i=disks ; i-- ;) { + if (i==pd_idx) continue; - if (sh->bh_new[i]) { - bh_ptr[count++] = sh->bh_copy[i]; - } else if (sh->bh_old[i]) { - bh_ptr[count++] = sh->bh_old[i]; - } - if (count == MAX_XOR_BLOCKS) { - xor_block(count, &bh_ptr[0]); - count = 1; + if (sh->bh_write[i] && + buffer_uptodate(sh->bh_cache[i])) { + bh_ptr[count++] = sh->bh_cache[i]; + chosen[i] = sh->bh_write[i]; + sh->bh_write[i] = sh->bh_write[i]->b_reqnext; + chosen[i]->b_reqnext = sh->bh_written[i]; + sh->bh_written[i] = chosen[i]; + check_xor(); } } - if (count != 1) { - xor_block(count, &bh_ptr[0]); - } - } else if (method == READ_MODIFY_WRITE) { - memcpy(sh->bh_copy[pd_idx]->b_data, sh->bh_old[pd_idx]->b_data, sh->size); - bh_ptr[0] = sh->bh_copy[pd_idx]; - count = 1; - for (i = 0; i < disks; i++) { - if (i == sh->pd_idx) - continue; - if (sh->bh_new[i] && sh->bh_old[i]) { - bh_ptr[count++] = sh->bh_copy[i]; - bh_ptr[count++] = sh->bh_old[i]; + break; + case RECONSTRUCT_WRITE: + memset(sh->bh_cache[pd_idx]->b_data, 0, sh->size); + for (i= disks; i-- ;) + if (i!=pd_idx && sh->bh_write[i]) { + chosen[i] = sh->bh_write[i]; + sh->bh_write[i] = sh->bh_write[i]->b_reqnext; + chosen[i]->b_reqnext = sh->bh_written[i]; + sh->bh_written[i] = chosen[i]; + check_xor(); } - if (count >= (MAX_XOR_BLOCKS - 1)) { - xor_block(count, &bh_ptr[0]); - count = 1; + break; + case CHECK_PARITY: + break; + } + spin_unlock_irq(&conf->device_lock); + for (i = disks; i--;) + if (chosen[i]) { + struct buffer_head *bh = sh->bh_cache[i]; + char *bdata; + mark_buffer_clean(chosen[i]); /* NO FIXME */ + bdata = 
bh_kmap(chosen[i]); + memcpy(bh->b_data, + bdata,sh->size); + bh_kunmap(chosen[i]); + set_bit(BH_Lock, &bh->b_state); + mark_buffer_uptodate(bh, 1); + } + + switch(method) { + case RECONSTRUCT_WRITE: + case CHECK_PARITY: + for (i=disks; i--;) + if (i != pd_idx) { + bh_ptr[count++] = sh->bh_cache[i]; + check_xor(); + } + break; + case READ_MODIFY_WRITE: + for (i = disks; i--;) + if (chosen[i]) { + bh_ptr[count++] = sh->bh_cache[i]; + check_xor(); } - } - if (count != 1) { - xor_block(count, &bh_ptr[0]); - } } - raid5_mark_buffer_uptodate(sh->bh_copy[pd_idx], 1); + if (count != 1) + xor_block(count, bh_ptr); + + if (method != CHECK_PARITY) { + mark_buffer_uptodate(sh->bh_cache[pd_idx], 1); + set_bit(BH_Lock, &sh->bh_cache[pd_idx]->b_state); + } else + mark_buffer_uptodate(sh->bh_cache[pd_idx], 0); } static void add_stripe_bh (struct stripe_head *sh, struct buffer_head *bh, int dd_idx, int rw) { + struct buffer_head **bhp; raid5_conf_t *conf = sh->raid_conf; - struct buffer_head *bh_req; PRINTK("adding bh b#%lu to stripe s#%lu\n", bh->b_blocknr, sh->sector); - CHECK_SHLOCK(sh); - if (sh->bh_new[dd_idx]) - BUG(); - bh_req = raid5_alloc_bh(sh); - raid5_build_block(sh, bh_req, dd_idx); - bh_req->b_data = bh->b_data; - bh_req->b_page = bh->b_page; - md_spin_lock_irq(&conf->device_lock); - if (sh->phase == PHASE_COMPLETE && sh->cmd == STRIPE_NONE) { - PRINTK("stripe s#%lu => PHASE_BEGIN (%s)\n", sh->sector, rw == READ ? "read" : "write"); - sh->phase = PHASE_BEGIN; - sh->cmd = (rw == READ) ? STRIPE_READ : STRIPE_WRITE; - atomic_inc(&conf->nr_pending_stripes); - atomic_inc(&conf->nr_handle); - PRINTK("# of pending stripes: %u, # of handle: %u\n", atomic_read(&conf->nr_pending_stripes), atomic_read(&conf->nr_handle)); + spin_lock_irq(&conf->device_lock); + bh->b_reqnext = NULL; + if (rw == READ) + bhp = &sh->bh_read[dd_idx]; + else + bhp = &sh->bh_write[dd_idx]; + while (*bhp) { + printk(KERN_NOTICE "raid5: multiple %d requests for sector %ld\n", rw, sh->sector); + bhp = & (*bhp)->b_reqnext; } - sh->bh_new[dd_idx] = bh; - sh->bh_req[dd_idx] = bh_req; - sh->cmd_new[dd_idx] = rw; - sh->new[dd_idx] = 1; - md_spin_unlock_irq(&conf->device_lock); + *bhp = bh; + spin_unlock_irq(&conf->device_lock); PRINTK("added bh b#%lu to stripe s#%lu, disk %d.\n", bh->b_blocknr, sh->sector, dd_idx); } -static void complete_stripe(struct stripe_head *sh) -{ - raid5_conf_t *conf = sh->raid_conf; - int disks = conf->raid_disks; - int i, new = 0; - - PRINTK("complete_stripe %lu\n", sh->sector); - for (i = 0; i < disks; i++) { - if (sh->cmd == STRIPE_SYNC && sh->bh_copy[i]) - raid5_update_old_bh(sh, i); - if (sh->cmd == STRIPE_WRITE && i == sh->pd_idx) - raid5_update_old_bh(sh, i); - if (sh->bh_new[i]) { - PRINTK("stripe %lu finishes new bh, sh->new == %d\n", sh->sector, sh->new[i]); - if (!sh->new[i]) { -#if 0 - if (sh->cmd == STRIPE_WRITE) { - char *bdata = bh_kmap(sh->bh_new[i]); - if (memcmp(bdata, sh->bh_copy[i]->b_data, sh->size)) { - printk("copy differs, %s, sector %lu ", - test_bit(BH_Dirty, &sh->bh_new[i]->b_state) ? 
"dirty" : "clean", - sh->sector); - } else if (test_bit(BH_Dirty, &sh->bh_new[i]->b_state)) - printk("sector %lu dirty\n", sh->sector); - bh_kunmap(sh->bh_new[i]); - } -#endif - if (sh->cmd == STRIPE_WRITE) - raid5_update_old_bh(sh, i); - raid5_end_buffer_io(sh, i, 1); - continue; - } else - new++; - } - if (new && sh->cmd == STRIPE_WRITE) - printk("raid5: bug, completed STRIPE_WRITE with new == %d\n", new); - } - if (sh->cmd == STRIPE_SYNC) - md_done_sync(conf->mddev, (sh->size>>10) - sh->sync_redone,1); - if (!new) - finish_unlock_stripe(sh); - else { - PRINTK("stripe %lu, new == %d\n", sh->sector, new); - sh->phase = PHASE_BEGIN; - } -} - - -static void handle_stripe_write (mddev_t *mddev , raid5_conf_t *conf, - struct stripe_head *sh, int nr_write, int * operational, int disks, - int parity, int parity_failed, int nr_cache, int nr_cache_other, - int nr_failed_other, int nr_cache_overwrite, int nr_failed_overwrite) -{ - int i; - unsigned int block; - struct buffer_head *bh; - int method1 = INT_MAX, method2 = INT_MAX; - - /* - * Attempt to add entries :-) - */ - if (nr_write != disks - 1) { - for (i = 0; i < disks; i++) { - if (i == sh->pd_idx) - continue; - if (sh->bh_new[i]) - continue; - block = (int) compute_blocknr(sh, i); - bh = get_hash_table(mddev_to_kdev(mddev), block, sh->size); - if (!bh) - continue; - if (buffer_dirty(bh) && !md_test_and_set_bit(BH_Lock, &bh->b_state)) { - PRINTK("Whee.. sector %lu, index %d (%d) found in the buffer cache!\n", sh->sector, i, block); - add_stripe_bh(sh, bh, i, WRITE); - sh->new[i] = 0; - nr_write++; - if (sh->bh_old[i]) { - nr_cache_overwrite++; - nr_cache_other--; - } else - if (!operational[i]) { - nr_failed_overwrite++; - nr_failed_other--; - } - } - atomic_dec(&bh->b_count); - } - } - PRINTK("handle_stripe() -- begin writing, stripe %lu\n", sh->sector); - /* - * Writing, need to update parity buffer. - * - * Compute the number of I/O requests in the "reconstruct - * write" and "read modify write" methods. - */ - if (!nr_failed_other) - method1 = (disks - 1) - (nr_write + nr_cache_other); - if (!nr_failed_overwrite && !parity_failed) - method2 = nr_write - nr_cache_overwrite + (1 - parity); - - if (method1 == INT_MAX && method2 == INT_MAX) - BUG(); - PRINTK("handle_stripe(), sector %lu, nr_write %d, method1 %d, method2 %d\n", sh->sector, nr_write, method1, method2); - if (!method1 || !method2) { - sh->phase = PHASE_WRITE; - compute_parity(sh, method1 <= method2 ? 
RECONSTRUCT_WRITE : READ_MODIFY_WRITE); - for (i = 0; i < disks; i++) { - if (!operational[i] && !conf->spare && !conf->resync_parity) - continue; - bh = sh->bh_copy[i]; - if (i != sh->pd_idx && ((bh == NULL) ^ (sh->bh_new[i] == NULL))) - printk("raid5: bug: bh == %p, bh_new[%d] == %p\n", bh, i, sh->bh_new[i]); - if (i == sh->pd_idx && !bh) - printk("raid5: bug: bh == NULL, i == pd_idx == %d\n", i); - if (bh) { - PRINTK("making request for buffer %d\n", i); - lock_get_bh(bh); - if (!operational[i] && !conf->resync_parity) { - PRINTK("writing spare %d\n", i); - atomic_inc(&sh->nr_pending); - bh->b_dev = bh->b_rdev = conf->spare->dev; - generic_make_request(WRITE, bh); - } else { - atomic_inc(&sh->nr_pending); - bh->b_dev = bh->b_rdev = conf->disks[i].dev; - generic_make_request(WRITE, bh); - } - atomic_dec(&bh->b_count); - } - } - PRINTK("handle_stripe() %lu, writing back %d buffers\n", sh->sector, md_atomic_read(&sh->nr_pending)); - return; - } - if (method1 < method2) { - sh->write_method = RECONSTRUCT_WRITE; - for (i = 0; i < disks; i++) { - if (i == sh->pd_idx) - continue; - if (sh->bh_new[i] || sh->bh_old[i]) - continue; - sh->bh_old[i] = raid5_alloc_buffer(sh, sh->size); - raid5_build_block(sh, sh->bh_old[i], i); - } - } else { - sh->write_method = READ_MODIFY_WRITE; - for (i = 0; i < disks; i++) { - if (sh->bh_old[i]) - continue; - if (!sh->bh_new[i] && i != sh->pd_idx) - continue; - sh->bh_old[i] = raid5_alloc_buffer(sh, sh->size); - raid5_build_block(sh, sh->bh_old[i], i); - } - } - sh->phase = PHASE_READ_OLD; - for (i = 0; i < disks; i++) { - if (!sh->bh_old[i]) - continue; - if (test_bit(BH_Uptodate, &sh->bh_old[i]->b_state)) - continue; - lock_get_bh(sh->bh_old[i]); - atomic_inc(&sh->nr_pending); - sh->bh_old[i]->b_dev = sh->bh_old[i]->b_rdev = conf->disks[i].dev; - generic_make_request(READ, sh->bh_old[i]); - atomic_dec(&sh->bh_old[i]->b_count); - } - PRINTK("handle_stripe() %lu, reading %d old buffers\n", sh->sector, md_atomic_read(&sh->nr_pending)); -} /* - * Reading + * handle_stripe - do things to a stripe. + * + * We lock the stripe and then examine the state of various bits + * to see what needs to be done. + * Possible results: + * return some read request which now have data + * return some write requests which are safely on disc + * schedule a read on some buffers + * schedule a write of some buffers + * return confirmation of parity correctness + * + * Parity calculations are done inside the stripe lock + * buffers are taken off read_list or write_list, and bh_cache buffers + * get BH_Lock set before the stripe lock is released. 
+ *
  */
-static void handle_stripe_read (mddev_t *mddev , raid5_conf_t *conf,
-    struct stripe_head *sh, int nr_read, int * operational, int disks,
-    int parity, int parity_failed, int nr_cache, int nr_cache_other,
-    int nr_failed_other, int nr_cache_overwrite, int nr_failed_overwrite)
+
+static void handle_stripe(struct stripe_head *sh)
 {
+    raid5_conf_t *conf = sh->raid_conf;
+    int disks = conf->raid_disks;
+    struct buffer_head *return_ok= NULL, *return_fail = NULL;
+    int action[MD_SB_DISKS];
     int i;
-    int method1 = INT_MAX;
-
-    method1 = nr_read - nr_cache_overwrite;
-
-    PRINTK("handle_stripe(), sector %lu, nr_read %d, nr_cache %d, method1 %d\n", sh->sector, nr_read, nr_cache, method1);
+    int syncing;
+    int locked=0, uptodate=0, to_read=0, to_write=0, failed=0, written=0;
+    int failed_num=0;
+    struct buffer_head *bh;
 
-    if (!method1 || (method1 == 1 && nr_cache == disks - 1)) {
-        PRINTK("read %lu completed from cache\n", sh->sector);
-        for (i = 0; i < disks; i++) {
-            char *bdata;
-            if (!sh->bh_new[i])
-                continue;
-            if (!sh->bh_old[i])
-                compute_block(sh, i);
-            bdata = bh_kmap(sh->bh_new[i]);
-            memcpy(bdata, sh->bh_old[i]->b_data, sh->size);
-            bh_kunmap(sh->bh_new[i]);
-        }
-        complete_stripe(sh);
-        return;
-    }
-    if (nr_failed_overwrite) {
-        sh->phase = PHASE_READ_OLD;
-        for (i = 0; i < disks; i++) {
-            if (sh->bh_old[i])
-                continue;
-            if (!operational[i])
-                continue;
-            sh->bh_old[i] = raid5_alloc_buffer(sh, sh->size);
-            raid5_build_block(sh, sh->bh_old[i], i);
-            lock_get_bh(sh->bh_old[i]);
-            atomic_inc(&sh->nr_pending);
-            sh->bh_old[i]->b_dev = sh->bh_old[i]->b_rdev = conf->disks[i].dev;
-            generic_make_request(READ, sh->bh_old[i]);
-            atomic_dec(&sh->bh_old[i]->b_count);
-        }
-        PRINTK("handle_stripe() %lu, phase READ_OLD, pending %d buffers\n", sh->sector, md_atomic_read(&sh->nr_pending));
-        return;
-    }
-    sh->phase = PHASE_READ;
-    for (i = 0; i < disks; i++) {
-        if (!sh->bh_new[i])
-            continue;
-        if (sh->bh_old[i]) {
-            char *bdata = bh_kmap(sh->bh_new[i]);
-            memcpy(bdata, sh->bh_old[i]->b_data, sh->size);
-            bh_kunmap(sh->bh_new[i]);
-            continue;
-        }
-#if RAID5_PARANOIA
-        if (sh->bh_req[i] == NULL || test_bit(BH_Lock, &sh->bh_req[i]->b_state)) {
-            int j;
-            printk("req %d is NULL! or locked \n", i);
-            for (j=0; j<disks; j++) {
-                printk("%d: new=%p old=%p req=%p new=%d cmd=%d\n",
-                    j, sh->bh_new[j], sh->bh_old[j], sh->bh_req[j],
-                    sh->new[j], sh->cmd_new[j]);
+    PRINTK("handling stripe %ld, cnt=%d, pd_idx=%d\n", sh->sector, atomic_read(&sh->count), sh->pd_idx);
+    memset(action, 0, sizeof(action));
+
+    spin_lock(&sh->lock);
+    clear_bit(STRIPE_HANDLE, &sh->state);
+
+    syncing = test_bit(STRIPE_SYNCING, &sh->state);
+    /* Now to look around and see what can be done */
+
+    for (i=disks; i--; ) {
+        bh = sh->bh_cache[i];
+        PRINTK("check %d: state %lx read %p write %p written %p\n", i, bh->b_state, sh->bh_read[i], sh->bh_write[i], sh->bh_written[i]);
+        /* maybe we can reply to a read */
+        if (buffer_uptodate(bh) && sh->bh_read[i]) {
+            struct buffer_head *rbh, *rbh2;
+            PRINTK("Return read for disc %d\n", i);
+            spin_lock_irq(&conf->device_lock);
+            rbh = sh->bh_read[i];
+            sh->bh_read[i] = NULL;
+            spin_unlock_irq(&conf->device_lock);
+            while (rbh) {
+                char *bdata;
+                bdata = bh_kmap(rbh);
+                memcpy(bdata, bh->b_data, bh->b_size);
+                bh_kunmap(rbh);
+                rbh2 = rbh->b_reqnext;
+                rbh->b_reqnext = return_ok;
+                return_ok = rbh;
+                rbh = rbh2;
             }
-        }
-#endif
-        lock_get_bh(sh->bh_req[i]);
-        atomic_inc(&sh->nr_pending);
-        sh->bh_req[i]->b_dev = sh->bh_req[i]->b_rdev = conf->disks[i].dev;
-        generic_make_request(READ, sh->bh_req[i]);
-        atomic_dec(&sh->bh_req[i]->b_count);
-    }
-    PRINTK("handle_stripe() %lu, phase READ, pending %d\n", sh->sector, md_atomic_read(&sh->nr_pending));
-}
-/*
- * Syncing
- */
-static void handle_stripe_sync (mddev_t *mddev , raid5_conf_t *conf,
-    struct stripe_head *sh, int * operational, int disks,
-    int parity, int parity_failed, int nr_cache, int nr_cache_other,
-    int nr_failed_other, int nr_cache_overwrite, int nr_failed_overwrite)
-{
-    struct buffer_head *bh;
-    int i, pd_idx;
-
-    /* firstly, we want to have data from all non-failed drives
-     * in bh_old
-     */
-    PRINTK("handle_stripe_sync: sec=%lu disks=%d nr_cache=%d\n", sh->sector, disks, nr_cache);
-    if ((nr_cache < disks-1) || ((nr_cache == disks-1) && !(parity_failed+nr_failed_other+nr_failed_overwrite))
-        ) {
-        sh->phase = PHASE_READ_OLD;
-        for (i = 0; i < disks; i++) {
-            if (sh->bh_old[i])
-                continue;
-            if (!conf->disks[i].operational)
-                continue;
+        /* now count some things */
+        if (buffer_locked(bh)) locked++;
+        if (buffer_uptodate(bh)) uptodate++;
 
-            bh = raid5_alloc_buffer(sh, sh->size);
-            sh->bh_old[i] = bh;
-            raid5_build_block(sh, bh, i);
-            lock_get_bh(bh);
-            atomic_inc(&sh->nr_pending);
-            bh->b_dev = bh->b_rdev = conf->disks[i].dev;
-            generic_make_request(READ, bh);
-            md_sync_acct(bh->b_rdev, bh->b_size/512);
-            atomic_dec(&sh->bh_old[i]->b_count);
+
+        if (sh->bh_read[i]) to_read++;
+        if (sh->bh_write[i]) to_write++;
+        if (sh->bh_written[i]) written++;
+        if (!conf->disks[i].operational) {
+            failed++;
+            failed_num = i;
         }
-        PRINTK("handle_stripe_sync() %lu, phase READ_OLD, pending %d buffers\n", sh->sector, md_atomic_read(&sh->nr_pending));
-
-        return;
     }
-    /* now, if there is a failed drive, rebuild and write to spare */
-    if (nr_cache == disks-1) {
-        sh->phase = PHASE_WRITE;
-        /* we can generate the missing block, which will be on the failed drive */
-        for (i=0; i<disks; i++) {
-            if (operational[i])
-                continue;
-            compute_block(sh, i);
-            if (conf->spare) {
-                bh = sh->bh_copy[i];
-                if (bh) {
-                    memcpy(bh->b_data, sh->bh_old[i]->b_data, sh->size);
-                    set_bit(BH_Uptodate, &bh->b_state);
-                } else {
-                    bh = sh->bh_old[i];
-                    sh->bh_old[i] = NULL;
-                    sh->bh_copy[i] = bh;
+    PRINTK("locked=%d uptodate=%d to_read=%d to_write=%d failed=%d failed_num=%d\n",
+           locked, uptodate, to_read, to_write, failed, failed_num);
+    /* check if the array has lost two devices and, if so, some requests might
+     * need to be failed
+     */
+    if (failed > 1 && to_read+to_write) {
+        spin_lock_irq(&conf->device_lock);
+        for (i=disks; i--; ) {
+            /* fail all writes first */
+            if (sh->bh_write[i]) to_write--;
+            while ((bh = sh->bh_write[i])) {
+                sh->bh_write[i] = bh->b_reqnext;
+                bh->b_reqnext = return_fail;
+                return_fail = bh;
+            }
+            /* fail any reads if this device is non-operational */
+            if (!conf->disks[i].operational) {
+                if (sh->bh_read[i]) to_read--;
+                while ((bh = sh->bh_read[i])) {
+                    sh->bh_read[i] = bh->b_reqnext;
+                    bh->b_reqnext = return_fail;
+                    return_fail = bh;
                 }
-                atomic_inc(&sh->nr_pending);
-                lock_get_bh(bh);
-                bh->b_dev = bh->b_rdev = conf->spare->dev;
-                generic_make_request(WRITE, bh);
-                md_sync_acct(bh->b_rdev, bh->b_size/512);
-                atomic_dec(&bh->b_count);
-                PRINTK("handle_stripe_sync() %lu, phase WRITE, pending %d buffers\n", sh->sector, md_atomic_read(&sh->nr_pending));
             }
-            break;
         }
-        return;
+        spin_unlock_irq(&conf->device_lock);
+        if (syncing) {
+            md_done_sync(conf->mddev, (sh->size>>10) - sh->sync_redone,0);
+            clear_bit(STRIPE_SYNCING, &sh->state);
+            syncing = 0;
+        }
     }
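[Editor's sketch, not part of the patch: requests in the new handle_stripe() hang off singly linked lists threaded through b_reqnext, and failing or completing them means detaching the chain and pushing each element onto a return list, reversing it in the process (completion order does not matter). A minimal standalone rendition, with a hypothetical struct req standing in for struct buffer_head:]

#include <stddef.h>

struct req { struct req *b_reqnext; };	/* stands in for struct buffer_head */

/* detach a request chain and push every element onto a return list */
static struct req *splice_to_return(struct req *chain, struct req *ret)
{
	while (chain) {
		struct req *next = chain->b_reqnext;
		chain->b_reqnext = ret;	/* push onto the return list */
		ret = chain;
		chain = next;
	}
	return ret;
}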
-    /* nr_cache == disks:
-     * check parity and compute/write if needed
+    /* might be able to return some write requests if the parity block
+     * is safe, or on a failed drive
      */
-
-    compute_parity(sh, RECONSTRUCT_WRITE);
-    pd_idx = sh->pd_idx;
-    if (!memcmp(sh->bh_copy[pd_idx]->b_data, sh->bh_old[pd_idx]->b_data, sh->size)) {
-        /* the parity is correct - Yay! */
-        complete_stripe(sh);
-    } else {
-        sh->phase = PHASE_WRITE;
-        bh = sh->bh_copy[pd_idx];
-        atomic_set_buffer_dirty(bh);
-        lock_get_bh(bh);
-        atomic_inc(&sh->nr_pending);
-        bh->b_dev = bh->b_rdev = conf->disks[pd_idx].dev;
-        generic_make_request(WRITE, bh);
-        md_sync_acct(bh->b_rdev, bh->b_size/512);
-        atomic_dec(&bh->b_count);
-        PRINTK("handle_stripe_sync() %lu phase WRITE, pending %d buffers\n",
-            sh->sector, md_atomic_read(&sh->nr_pending));
-    }
-}
-
-/*
- * handle_stripe() is our main logic routine. Note that:
- *
- * 1. lock_stripe() should be used whenever we can't accept additonal
- *    buffers, either during short sleeping in handle_stripe() or
- *    during io operations.
- *
- * 2. We should be careful to set sh->nr_pending whenever we sleep,
- *    to prevent re-entry of handle_stripe() for the same sh.
- *
- * 3. conf->failed_disks and disk->operational can be changed
- *    from an interrupt. This complicates things a bit, but it allows
- *    us to stop issuing requests for a failed drive as soon as possible.
- */
-static void handle_stripe(struct stripe_head *sh)
-{
-    raid5_conf_t *conf = sh->raid_conf;
-    mddev_t *mddev = conf->mddev;
-    int disks = conf->raid_disks;
-    int i, nr_read = 0, nr_write = 0, parity = 0;
-    int nr_cache = 0, nr_cache_other = 0, nr_cache_overwrite = 0;
-    int nr_failed_other = 0, nr_failed_overwrite = 0, parity_failed = 0;
-    int operational[MD_SB_DISKS], failed_disks = conf->failed_disks;
-
-    PRINTK("handle_stripe(), stripe %lu\n", sh->sector);
-    if (!stripe_locked(sh))
-        BUG();
-    if (md_atomic_read(&sh->nr_pending))
-        BUG();
-    if (sh->phase == PHASE_COMPLETE)
-        BUG();
-
-    atomic_dec(&conf->nr_handle);
-
-    if (md_test_and_clear_bit(STRIPE_ERROR, &sh->state)) {
-        printk("raid5: restarting stripe %lu\n", sh->sector);
-        sh->phase = PHASE_BEGIN;
-    }
-
-    if ((sh->cmd == STRIPE_WRITE && sh->phase == PHASE_WRITE) ||
-        (sh->cmd == STRIPE_READ && sh->phase == PHASE_READ) ||
-        (sh->cmd == STRIPE_SYNC && sh->phase == PHASE_WRITE)
-       ) {
-        /*
-         * Completed
-         */
-        complete_stripe(sh);
-        if (sh->phase == PHASE_COMPLETE)
-            return;
-    }
-
-    md_spin_lock_irq(&conf->device_lock);
-    for (i = 0; i < disks; i++) {
-        operational[i] = conf->disks[i].operational;
-        if (i == sh->pd_idx && conf->resync_parity)
-            operational[i] = 0;
-    }
-    failed_disks = conf->failed_disks;
-    md_spin_unlock_irq(&conf->device_lock);
-
-    /*
-     * Make this one more graceful?
-     */
-    if (failed_disks > 1) {
-        for (i = 0; i < disks; i++) {
-            if (sh->bh_new[i]) {
-                raid5_end_buffer_io(sh, i, 0);
-                continue;
+    bh = sh->bh_cache[sh->pd_idx];
+    if ( written &&
+         ( (conf->disks[sh->pd_idx].operational && !buffer_locked(bh) && buffer_uptodate(bh))
+           || (failed == 1 && failed_num == sh->pd_idx))
+        ) {
+        /* any written block on a uptodate or failed drive can be returned */
+        for (i=disks; i--; )
+            if (sh->bh_written[i]) {
+                bh = sh->bh_cache[i];
+                if (!conf->disks[sh->pd_idx].operational ||
+                    (!buffer_locked(bh) && buffer_uptodate(bh)) ) {
+                    /* maybe we can return some write requests */
+                    struct buffer_head *wbh, *wbh2;
+                    PRINTK("Return write for disc %d\n", i);
+                    spin_lock_irq(&conf->device_lock);
+                    wbh = sh->bh_written[i];
+                    sh->bh_written[i] = NULL;
+                    spin_unlock_irq(&conf->device_lock);
+                    while (wbh) {
+                        wbh2 = wbh->b_reqnext;
+                        wbh->b_reqnext = return_ok;
+                        return_ok = wbh;
+                        wbh = wbh2;
                     }
+                }
             }
-        if (sh->cmd == STRIPE_SYNC)
-            md_done_sync(conf->mddev, (sh->size>>10) - sh->sync_redone,1);
-        finish_unlock_stripe(sh);
-        return;
     }
-
-    PRINTK("=== stripe index START ===\n");
-    for (i = 0; i < disks; i++) {
-        PRINTK("disk %d, ", i);
-        if (sh->bh_old[i]) {
-            nr_cache++;
-            PRINTK(" (old cached, %d)", nr_cache);
-        }
-        if (i == sh->pd_idx) {
-            PRINTK(" PARITY.");
-            if (sh->bh_old[i]) {
-                PRINTK(" CACHED.");
-                parity = 1;
-            } else {
-                PRINTK(" UNCACHED.");
-                if (!operational[i]) {
-                    PRINTK(" FAILED.");
-                    parity_failed = 1;
+
+    /* Now we might consider reading some blocks, either to check/generate
+     * parity, or to satisfy requests
+     */
+    if (to_read || (syncing && (uptodate+failed < disks))) {
+        for (i=disks; i--;) {
+            bh = sh->bh_cache[i];
+            if (!buffer_locked(bh) && !buffer_uptodate(bh) &&
+                (sh->bh_read[i] || syncing || (failed && sh->bh_read[failed_num]))) {
+                /* we would like to get this block, possibly
+                 * by computing it, but we might not be able to
+                 */
+                if (uptodate == disks-1) {
+                    PRINTK("Computing block %d\n", i);
+                    compute_block(sh, i);
+                    uptodate++;
+                } else if (conf->disks[i].operational) {
+                    set_bit(BH_Lock, &bh->b_state);
+                    action[i] = READ+1;
+                    locked++;
+                    PRINTK("Reading block %d (sync=%d)\n", i, syncing);
+                    if (syncing)
+                        md_sync_acct(conf->disks[i].dev, bh->b_size>>9);
                 }
             }
-            PRINTK("\n");
-            continue;
         }
-        if (!sh->bh_new[i]) {
-            PRINTK(" (no new data block) ");
-            if (sh->bh_old[i]) {
-                PRINTK(" (but old block cached) ");
-                nr_cache_other++;
-            } else {
-                if (!operational[i]) {
-                    PRINTK(" (because failed disk) ");
-                    nr_failed_other++;
-                } else
-                    PRINTK(" (no old block either) ");
+        set_bit(STRIPE_HANDLE, &sh->state);
+    }
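[Editor's sketch, not part of the patch: the compute_block() call above relies on the RAID-5 identity that any single missing block equals the XOR of every other block in the stripe, which is why the code can rebuild a block in memory once uptodate == disks-1 instead of issuing a read. Illustrative only; xor_reconstruct is not a kernel function:]

#include <string.h>

/* rebuild the missing block as the XOR of the nblocks surviving ones */
static void xor_reconstruct(unsigned char *dest, unsigned char **blocks,
			    int nblocks, size_t size)
{
	size_t j;
	int i;

	memset(dest, 0, size);
	for (i = 0; i < nblocks; i++)
		for (j = 0; j < size; j++)
			dest[j] ^= blocks[i][j];
}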
+
+    /* now to consider writing and what else, if anything should be read */
+    if (to_write) {
+        int rmw=0, rcw=0;
+        for (i=disks ; i--;) {
+            /* would I have to read this buffer for read_modify_write */
+            bh = sh->bh_cache[i];
+            if ((sh->bh_write[i] || i == sh->pd_idx) &&
+                !buffer_locked(bh) && !buffer_uptodate(bh)) {
+                if (conf->disks[i].operational
+/*                  && !(conf->resync_parity && i == sh->pd_idx) */
+                    )
+                    rmw++;
+                else rmw += 2*disks;  /* cannot read it */
+            }
+            /* Would I have to read this buffer for reconstruct_write */
+            if (!sh->bh_write[i] && i != sh->pd_idx &&
+                !buffer_locked(bh) && !buffer_uptodate(bh)) {
+                if (conf->disks[i].operational) rcw++;
+                else rcw += 2*disks;
             }
-            PRINTK("\n");
-            continue;
-        }
-        sh->new[i] = 0;
-        if (sh->cmd_new[i] == READ) {
-            nr_read++;
-            PRINTK(" (new READ %d)", nr_read);
-        }
-        if (sh->cmd_new[i] == WRITE) {
-            nr_write++;
-            PRINTK(" (new WRITE %d)", nr_write);
         }
-        if (sh->bh_old[i]) {
-            nr_cache_overwrite++;
-            PRINTK(" (overwriting old %d)", nr_cache_overwrite);
-        } else {
-            if (!operational[i]) {
-                nr_failed_overwrite++;
-                PRINTK(" (overwriting failed %d)", nr_failed_overwrite);
+        PRINTK("for sector %ld, rmw=%d rcw=%d\n", sh->sector, rmw, rcw);
+        set_bit(STRIPE_HANDLE, &sh->state);
+        if (rmw < rcw && rmw > 0)
+            /* prefer read-modify-write, but need to get some data */
+            for (i=disks; i--;) {
+                bh = sh->bh_cache[i];
+                if ((sh->bh_write[i] || i == sh->pd_idx) &&
+                    !buffer_locked(bh) && !buffer_uptodate(bh) &&
+                    conf->disks[i].operational) {
+                    PRINTK("Read_old block %d for r-m-w\n", i);
+                    set_bit(BH_Lock, &bh->b_state);
+                    action[i] = READ+1;
+                    locked++;
+                }
             }
+        if (rcw <= rmw && rcw > 0)
+            /* want reconstruct write, but need to get some data */
+            for (i=disks; i--;) {
+                bh = sh->bh_cache[i];
+                if (!sh->bh_write[i] && i != sh->pd_idx &&
+                    !buffer_locked(bh) && !buffer_uptodate(bh) &&
+                    conf->disks[i].operational) {
+                    PRINTK("Read_old block %d for Reconstruct\n", i);
+                    set_bit(BH_Lock, &bh->b_state);
+                    action[i] = READ+1;
+                    locked++;
+                }
+            }
+        /* now if nothing is locked, and if we have enough data, we can start a write request */
+        if (locked == 0 && (rcw == 0 ||rmw == 0)) {
+            PRINTK("Computing parity...\n");
+            compute_parity(sh, rcw==0 ? RECONSTRUCT_WRITE : READ_MODIFY_WRITE);
+            /* now every locked buffer is ready to be written */
+            for (i=disks; i--;)
+                if (buffer_locked(sh->bh_cache[i])) {
+                    PRINTK("Writing block %d\n", i);
+                    locked++;
+                    action[i] = WRITE+1;
+                    if (!conf->disks[i].operational
+                        || (i==sh->pd_idx && failed == 0))
+                        set_bit(STRIPE_INSYNC, &sh->state);
+                }
         }
-        PRINTK("\n");
     }
-    PRINTK("=== stripe index END ===\n");
 
-    if (nr_write && nr_read)
-        BUG();
+    /* maybe we need to check and possibly fix the parity for this stripe
+     * Any reads will already have been scheduled, so we just see if enough data
+     * is available
+     */
+    if (syncing && locked == 0 &&
+        !test_bit(STRIPE_INSYNC, &sh->state) && failed <= 1) {
+        set_bit(STRIPE_HANDLE, &sh->state);
+        if (failed == 0) {
+            if (uptodate != disks)
+                BUG();
+            compute_parity(sh, CHECK_PARITY);
+            uptodate--;
+            bh = sh->bh_cache[sh->pd_idx];
+            if ((*(u32*)bh->b_data) == 0 &&
+                !memcmp(bh->b_data, bh->b_data+4, bh->b_size-4)) {
+                /* parity is correct (on disc, not in buffer any more) */
+                set_bit(STRIPE_INSYNC, &sh->state);
+            }
+        }
+        if (!test_bit(STRIPE_INSYNC, &sh->state)) {
+            if (failed==0)
+                failed_num = sh->pd_idx;
+            /* should be able to compute the missing block and write it to spare */
+            if (!buffer_uptodate(sh->bh_cache[failed_num])) {
+                if (uptodate+1 != disks)
+                    BUG();
+                compute_block(sh, failed_num);
+                uptodate++;
+            }
+            if (uptodate != disks)
+                BUG();
+            bh = sh->bh_cache[failed_num];
+            set_bit(BH_Lock, &bh->b_state);
+            action[failed_num] = WRITE+1;
+            locked++;
+            set_bit(STRIPE_INSYNC, &sh->state);
+            if (conf->disks[i].operational)
+                md_sync_acct(conf->disks[i].dev, bh->b_size>>9);
+            else if (conf->spare)
+                md_sync_acct(conf->spare->dev, bh->b_size>>9);
 
-    if (nr_write)
-        handle_stripe_write(
-            mddev, conf, sh, nr_write, operational, disks,
-            parity, parity_failed, nr_cache, nr_cache_other,
-            nr_failed_other, nr_cache_overwrite,
-            nr_failed_overwrite
-        );
-    else if (nr_read)
-        handle_stripe_read(
-            mddev, conf, sh, nr_read, operational, disks,
-            parity, parity_failed, nr_cache, nr_cache_other,
-            nr_failed_other, nr_cache_overwrite,
-            nr_failed_overwrite
-        );
-    else if (sh->cmd == STRIPE_SYNC)
-        handle_stripe_sync(
-            mddev, conf, sh, operational, disks,
-            parity, parity_failed, nr_cache, nr_cache_other,
-            nr_failed_other, nr_cache_overwrite, nr_failed_overwrite
-        );
+        }
+    }
+    if (syncing && locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) {
+        md_done_sync(conf->mddev, (sh->size>>10) - sh->sync_redone,1);
+        clear_bit(STRIPE_SYNCING, &sh->state);
+    }
+
+
+    spin_unlock(&sh->lock);
+
+    while ((bh=return_ok)) {
+        return_ok = bh->b_reqnext;
+        bh->b_reqnext = NULL;
+        bh->b_end_io(bh, 1);
+    }
+    while ((bh=return_fail)) {
+        return_ok = bh->b_reqnext;
+        bh->b_reqnext = NULL;
+        bh->b_end_io(bh, 0);
+    }
+    for (i=disks; i-- ;)
+        if (action[i]) {
+            struct buffer_head *bh = sh->bh_cache[i];
+            int skip = 0;
+            if (action[i] == READ+1)
+                bh->b_end_io = raid5_end_read_request;
+            else
+                bh->b_end_io = raid5_end_write_request;
+            if (conf->disks[i].operational)
+                bh->b_dev = conf->disks[i].dev;
+            else if (conf->spare && action[i] == WRITE+1)
+                bh->b_dev = conf->spare->dev;
+            else if (action[i] == READ+1)
+                BUG();
+            else skip=1;
+            if (!skip) {
+                PRINTK("for %ld schedule op %d on disc %d\n", sh->sector, action[i]-1, i);
+                atomic_inc(&sh->count);
+                bh->b_rdev = bh->b_dev;
+                bh->b_rsector = bh->b_blocknr * (bh->b_size>>9);
+                generic_make_request(action[i]-1, bh);
+            } else
+                PRINTK("skip op %d on disc %d for sector %ld\n", action[i]-1, i, sh->sector);
+        }
 }
 
 
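[Editor's sketch, not part of the patch: the read-modify-write versus reconstruct-write decision the new handle_stripe() makes, in miniature. Count the reads each method would need, price blocks on failed disks at 2*disks so that method is effectively never picked, and prefer read-modify-write only when strictly cheaper. All names below are illustrative, not kernel APIs:]

#include <stdio.h>

enum method { RMW, RCW };

static enum method pick_method(int disks, int pd_idx,
			       const int to_write[], const int operational[],
			       const int uptodate[])
{
	int i, rmw = 0, rcw = 0;

	for (i = 0; i < disks; i++) {
		/* read needed for read-modify-write: written blocks and parity */
		if ((to_write[i] || i == pd_idx) && !uptodate[i])
			rmw += operational[i] ? 1 : 2 * disks;
		/* read needed for reconstruct-write: everything else */
		if (!to_write[i] && i != pd_idx && !uptodate[i])
			rcw += operational[i] ? 1 : 2 * disks;
	}
	return (rmw < rcw) ? RMW : RCW;	/* ties go to reconstruct-write */
}

int main(void)
{
	/* 5 disks, parity on disk 4, writing block 0, nothing cached yet:
	 * rmw = 2 reads (old data + old parity), rcw = 3 reads */
	int to_write[5] = { 1, 0, 0, 0, 0 };
	int operational[5] = { 1, 1, 1, 1, 1 };
	int uptodate[5] = { 0, 0, 0, 0, 0 };

	printf("%s\n", pick_method(5, 4, to_write, operational, uptodate) == RMW
	       ? "read-modify-write" : "reconstruct-write");
	return 0;
}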
@@ -1463,34 +1109,28 @@ static int raid5_make_request (mddev_t *mddev, int rw, struct buffer_head * bh)
     const unsigned int data_disks = raid_disks - 1;
     unsigned int dd_idx, pd_idx;
     unsigned long new_sector;
+    int read_ahead = 0;
 
     struct stripe_head *sh;
 
-    if (rw == READA)
+    if (rw == READA) {
         rw = READ;
+        read_ahead=1;
+    }
 
     new_sector = raid5_compute_sector(bh->b_rsector, raid_disks, data_disks,
                                       &dd_idx, &pd_idx, conf);
 
     PRINTK("raid5_make_request, sector %lu\n", new_sector);
-    sh = get_lock_stripe(conf, new_sector, bh->b_size);
-#if 0
-    if ((rw == READ && sh->cmd == STRIPE_WRITE) || (rw == WRITE && sh->cmd == STRIPE_READ)) {
-        PRINTK("raid5: lock contention, rw == %d, sh->cmd == %d\n", rw, sh->cmd);
-        lock_stripe(sh);
-        if (!md_atomic_read(&sh->nr_pending))
-            handle_stripe(sh);
-        goto repeat;
-    }
-#endif
-    sh->pd_idx = pd_idx;
-    if (sh->phase != PHASE_COMPLETE && sh->phase != PHASE_BEGIN)
-        PRINTK("stripe %lu catching the bus!\n", sh->sector);
-    if (sh->bh_new[dd_idx])
-        BUG();
-    add_stripe_bh(sh, bh, dd_idx, rw);
+    sh = get_active_stripe(conf, new_sector, bh->b_size, read_ahead);
+    if (sh) {
+        sh->pd_idx = pd_idx;
 
-    md_wakeup_thread(conf->thread);
+        add_stripe_bh(sh, bh, dd_idx, rw);
+        handle_stripe(sh);
+        release_stripe(sh);
+    } else
+        bh->b_end_io(bh, test_bit(BH_Uptodate, &bh->b_state));
     return 0;
 }
 
@@ -1525,22 +1165,21 @@ static int raid5_sync_request (mddev_t *mddev, unsigned long block_nr)
     int redone = 0;
     int bufsize;
 
-    if (!conf->buffer_size)
-        conf->buffer_size = /* device_bsize(mddev_to_kdev(mddev))*/ PAGE_SIZE;
-    bufsize = conf->buffer_size;
-    /* Hmm... race on buffer_size ?? */
-    redone = block_nr% (bufsize>>10);
-    block_nr -= redone;
-    sh = get_lock_stripe(conf, block_nr<<1, bufsize);
+    sh = get_active_stripe(conf, block_nr<<1, 0, 0);
+    bufsize = sh->size;
+    redone = block_nr-(sh->sector>>1);
     first_sector = raid5_compute_sector(stripe*data_disks*sectors_per_chunk
         + chunk_offset, raid_disks, data_disks, &dd_idx, &pd_idx, conf);
     sh->pd_idx = pd_idx;
-    sh->cmd = STRIPE_SYNC;
-    sh->phase = PHASE_BEGIN;
+    spin_lock(&sh->lock);
+    set_bit(STRIPE_SYNCING, &sh->state);
+    clear_bit(STRIPE_INSYNC, &sh->state);
    sh->sync_redone = redone;
-    atomic_inc(&conf->nr_pending_stripes);
-    atomic_inc(&conf->nr_handle);
-    md_wakeup_thread(conf->thread);
+    spin_unlock(&sh->lock);
+
+    handle_stripe(sh);
+    release_stripe(sh);
+
    return (bufsize>>10)-redone;
 }
 
@@ -1556,46 +1195,35 @@ static void raid5d (void *data)
     struct stripe_head *sh;
     raid5_conf_t *conf = data;
     mddev_t *mddev = conf->mddev;
-    int i, handled;
+    int handled;
 
     PRINTK("+++ raid5d active\n");
 
     handled = 0;
-    md_spin_lock_irq(&conf->device_lock);
-    clear_bit(THREAD_WAKEUP, &conf->thread->flags);
-repeat_pass:
+
     if (mddev->sb_dirty) {
-        md_spin_unlock_irq(&conf->device_lock);
         mddev->sb_dirty = 0;
         md_update_sb(mddev);
-        md_spin_lock_irq(&conf->device_lock);
     }
-    for (i = 0; i < NR_HASH; i++) {
-repeat:
-        sh = conf->stripe_hashtbl[i];
-        for (; sh; sh = sh->hash_next) {
-            if (sh->raid_conf != conf)
-                continue;
-            if (sh->phase == PHASE_COMPLETE)
-                continue;
-            if (md_atomic_read(&sh->nr_pending))
-                continue;
-            md_spin_unlock_irq(&conf->device_lock);
-            if (!atomic_read(&sh->count))
-                BUG();
+    md_spin_lock_irq(&conf->device_lock);
+    while (!list_empty(&conf->handle_list)) {
+        struct list_head *first = conf->handle_list.next;
+        sh = list_entry(first, struct stripe_head, lru);
 
-            handled++;
-            handle_stripe(sh);
-            md_spin_lock_irq(&conf->device_lock);
-            goto repeat;
-        }
-    }
-    if (conf) {
-        PRINTK("%d stripes handled, nr_handle %d\n", handled, md_atomic_read(&conf->nr_handle));
-        if (test_and_clear_bit(THREAD_WAKEUP, &conf->thread->flags) &&
-            md_atomic_read(&conf->nr_handle))
-            goto repeat_pass;
+        list_del_init(first);
+        atomic_inc(&sh->count);
+        if (atomic_read(&sh->count)!= 1)
+            BUG();
+        md_spin_unlock_irq(&conf->device_lock);
+
+        handled++;
+        handle_stripe(sh);
+        release_stripe(sh);
+
+        md_spin_lock_irq(&conf->device_lock);
     }
+    PRINTK("%d stripes handled\n", handled);
+
     md_spin_unlock_irq(&conf->device_lock);
 
     PRINTK("--- raid5d inactive\n");
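[Editor's sketch, not part of the patch: what the list_entry() call in the new raid5d() loop does. The kernel macro recovers the structure containing an embedded list_head by subtracting the member's offset. A hypothetical userspace rendition; struct stripe_head_demo and demo_list_entry are illustrative stand-ins:]

#include <stddef.h>
#include <stdio.h>

struct list_head { struct list_head *next, *prev; };
struct stripe_head_demo { long sector; struct list_head lru; };

/* same shape as the kernel's list_entry(ptr, type, member) */
#define demo_list_entry(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

int main(void)
{
	struct stripe_head_demo sh = { .sector = 42 };
	struct list_head *first = &sh.lru;	/* as taken off handle_list */
	struct stripe_head_demo *back =
		demo_list_entry(first, struct stripe_head_demo, lru);

	printf("%ld\n", back->sector);	/* prints 42 */
	return 0;
}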
@@ -1727,6 +1355,11 @@ static int raid5_run (mddev_t *mddev)
     conf->device_lock = MD_SPIN_LOCK_UNLOCKED;
     md_init_waitqueue_head(&conf->wait_for_stripe);
 
+    INIT_LIST_HEAD(&conf->handle_list);
+    INIT_LIST_HEAD(&conf->inactive_list);
+    atomic_set(&conf->active_stripes, 0);
+    conf->buffer_size = PAGE_SIZE; /* good default for rebuild */
+
     PRINTK("raid5_run(md%d) called.\n", mdidx(mddev));
 
     ITERATE_RDEV(mddev,rdev,tmp) {
@@ -1867,8 +1500,7 @@ static int raid5_run (mddev_t *mddev)
     }
 
     memory = conf->max_nr_stripes * (sizeof(struct stripe_head) +
-         conf->raid_disks * (sizeof(struct buffer_head) +
-            2 * (sizeof(struct buffer_head) + PAGE_SIZE))) / 1024;
+         conf->raid_disks * ((sizeof(struct buffer_head) + PAGE_SIZE))) / 1024;
     if (grow_stripes(conf, conf->max_nr_stripes, GFP_KERNEL)) {
         printk(KERN_ERR "raid5: couldn't allocate %dkB for buffers\n", memory);
         shrink_stripes(conf, conf->max_nr_stripes);
@@ -1971,11 +1603,10 @@ static int raid5_stop (mddev_t *mddev)
 {
     raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
 
-    shrink_stripe_cache(conf, conf->max_nr_stripes);
-    shrink_stripes(conf, conf->max_nr_stripes);
-    md_unregister_thread(conf->thread);
     if (conf->resync_thread)
         md_unregister_thread(conf->resync_thread);
+    md_unregister_thread(conf->thread);
+    shrink_stripes(conf, conf->max_nr_stripes);
     free_pages((unsigned long) conf->stripe_hashtbl, HASH_PAGES_ORDER);
     kfree(conf);
     mddev->private = NULL;
@@ -1988,23 +1619,14 @@ static void print_sh (struct stripe_head *sh)
 {
     int i;
 
-    printk("sh %lu, phase %d, size %d, pd_idx %d, state %ld, cmd %d.\n", sh->sector, sh->phase, sh->size, sh->pd_idx, sh->state, sh->cmd);
-    printk("sh %lu, write_method %d, nr_pending %d, count %d.\n", sh->sector, sh->write_method, atomic_read(&sh->nr_pending), atomic_read(&sh->count));
+    printk("sh %lu, size %d, pd_idx %d, state %ld.\n", sh->sector, sh->size, sh->pd_idx, sh->state);
+    printk("sh %lu, count %d.\n", sh->sector, atomic_read(&sh->count));
     printk("sh %lu, ", sh->sector);
     for (i = 0; i < MD_SB_DISKS; i++) {
-        if (sh->bh_old[i])
-            printk("(old%d: %p) ", i, sh->bh_old[i]);
-        if (sh->bh_new[i])
-            printk("(new%d: %p) ", i, sh->bh_new[i]);
-        if (sh->bh_copy[i])
-            printk("(copy%d: %p) ", i, sh->bh_copy[i]);
-        if (sh->bh_req[i])
-            printk("(req%d: %p) ", i, sh->bh_req[i]);
+        if (sh->bh_cache[i])
+            printk("(cache%d: %p %ld) ", i, sh->bh_cache[i], sh->bh_cache[i]->b_state);
     }
     printk("\n");
-    for (i = 0; i < MD_SB_DISKS; i++)
-        printk("%d(%d/%d) ", i, sh->cmd_new[i], sh->new[i]);
-    printk("\n");
 }
 
 static void printall (raid5_conf_t *conf)
@@ -2041,13 +1663,6 @@ static int raid5_status (char *page, mddev_t *mddev)
 #if RAID5_DEBUG
 #define D(x) \
     sz += sprintf (page+sz, "<"#x":%d>", atomic_read(&conf->x))
-    D(nr_handle);
-    D(nr_stripes);
-    D(nr_hashed_stripes);
-    D(nr_locked_stripes);
-    D(nr_pending_stripes);
-    D(nr_cached_stripes);
-    D(nr_free_sh);
     printall(conf);
 #endif
     return sz;
@@ -2066,7 +1681,11 @@ static void print_raid5_conf (raid5_conf_t *conf)
     printk(" --- rd:%d wd:%d fd:%d\n", conf->raid_disks,
          conf->working_disks, conf->failed_disks);
 
+#if RAID5_DEBUG
     for (i = 0; i < MD_SB_DISKS; i++) {
+#else
+    for (i = 0; i < conf->working_disks+conf->failed_disks; i++) {
+#endif
         tmp = conf->disks + i;
         printk(" disk %d, s:%d, o:%d, n:%d rd:%d us:%d dev:%s\n",
             i, tmp->spare,tmp->operational,