author | Ralf Baechle <ralf@linux-mips.org> | 2000-10-05 01:18:40 +0000 |
---|---|---|
committer | Ralf Baechle <ralf@linux-mips.org> | 2000-10-05 01:18:40 +0000 |
commit | 012bb3e61e5eced6c610f9e036372bf0c8def2d1 (patch) | |
tree | 87efc733f9b164e8c85c0336f92c8fb7eff6d183 /drivers/md | |
parent | 625a1589d3d6464b5d90b8a0918789e3afffd220 (diff) | |
Merge with Linux 2.4.0-test9. Please check DECstation; I had a number
of rejects to fix up while integrating Linus' patches. I also found
that this kernel will only boot SMP on Origin; the UP kernel freezes
soon after bootup with SCSI timeout messages. I'm committing this anyway,
since I found that the last CVS versions had the same problem.
Diffstat (limited to 'drivers/md')
-rw-r--r-- | drivers/md/Config.in | 22
-rw-r--r-- | drivers/md/Makefile | 35
-rw-r--r-- | drivers/md/linear.c | 213
-rw-r--r-- | drivers/md/lvm-snap.c | 436
-rw-r--r-- | drivers/md/lvm.c | 2567
-rw-r--r-- | drivers/md/md.c | 3878
-rw-r--r-- | drivers/md/raid0.c | 356
-rw-r--r-- | drivers/md/raid1.c | 1897
-rw-r--r-- | drivers/md/raid5.c | 2371
-rw-r--r-- | drivers/md/xor.c | 2728
10 files changed, 14503 insertions, 0 deletions
diff --git a/drivers/md/Config.in b/drivers/md/Config.in new file mode 100644 index 000000000..565055a68 --- /dev/null +++ b/drivers/md/Config.in @@ -0,0 +1,22 @@ +# +# Block device driver configuration +# +mainmenu_option next_comment +comment 'Multi-device support (RAID and LVM)' + +bool 'Multiple devices driver support (RAID and LVM)' CONFIG_MD + +dep_tristate ' RAID support' CONFIG_BLK_DEV_MD $CONFIG_MD +dep_tristate ' Linear (append) mode' CONFIG_MD_LINEAR $CONFIG_BLK_DEV_MD +dep_tristate ' RAID-0 (striping) mode' CONFIG_MD_RAID0 $CONFIG_BLK_DEV_MD +dep_tristate ' RAID-1 (mirroring) mode' CONFIG_MD_RAID1 $CONFIG_BLK_DEV_MD +dep_tristate ' RAID-4/RAID-5 mode' CONFIG_MD_RAID5 $CONFIG_BLK_DEV_MD +if [ "$CONFIG_MD_LINEAR" = "y" -o "$CONFIG_MD_RAID0" = "y" -o "$CONFIG_MD_RAID1" = "y" -o "$CONFIG_MD_RAID5" = "y" ]; then + bool ' Boot support' CONFIG_MD_BOOT + bool ' Auto Detect support' CONFIG_AUTODETECT_RAID +fi + +dep_tristate ' Logical volume manager (LVM) support' CONFIG_BLK_DEV_LVM $CONFIG_MD +dep_mbool ' LVM information in proc filesystem' CONFIG_LVM_PROC_FS $CONFIG_BLK_DEV_LVM + +endmenu diff --git a/drivers/md/Makefile b/drivers/md/Makefile new file mode 100644 index 000000000..69d65c2bd --- /dev/null +++ b/drivers/md/Makefile @@ -0,0 +1,35 @@ +# +# Makefile for the kernel software RAID and LVM drivers. +# + +O_TARGET := mddev.o +SUB_DIRS := +ALL_SUB_DIRS := +MOD_SUB_DIRS := + +export-objs := md.o xor.o +list-multi := lvm-mod.o +lvm-mod-objs := lvm.o lvm-snap.o + +obj-y := +obj-m := +obj-n := +obj- := + +obj-$(CONFIG_BLK_DEV_MD) += md.o +obj-$(CONFIG_MD_LINEAR) += linear.o +obj-$(CONFIG_MD_RAID0) += raid0.o +obj-$(CONFIG_MD_RAID1) += raid1.o +obj-$(CONFIG_MD_RAID5) += raid5.o xor.o +obj-$(CONFIG_BLK_DEV_LVM) += lvm-mod.o + +# Translate to Rules.make lists. +O_OBJS := $(filter-out $(export-objs), $(obj-y)) +OX_OBJS := $(filter $(export-objs), $(obj-y)) +M_OBJS := $(sort $(filter-out $(export-objs), $(obj-m))) +MX_OBJS := $(sort $(filter $(export-objs), $(obj-m))) + +include $(TOPDIR)/Rules.make + +lvm-mod.o: $(lvm-mod-objs) + $(LD) -r -o $@ $(lvm-mod-objs) diff --git a/drivers/md/linear.c b/drivers/md/linear.c new file mode 100644 index 000000000..855bc44dd --- /dev/null +++ b/drivers/md/linear.c @@ -0,0 +1,213 @@ +/* + linear.c : Multiple Devices driver for Linux + Copyright (C) 1994-96 Marc ZYNGIER + <zyngier@ufr-info-p7.ibp.fr> or + <maz@gloups.fdn.fr> + + Linear mode management functions. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + You should have received a copy of the GNU General Public License + (for example /usr/src/linux/COPYING); if not, write to the Free + Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. +*/ + +#include <linux/module.h> + +#include <linux/raid/md.h> +#include <linux/malloc.h> + +#include <linux/raid/linear.h> + +#define MAJOR_NR MD_MAJOR +#define MD_DRIVER +#define MD_PERSONALITY + +static int linear_run (mddev_t *mddev) +{ + linear_conf_t *conf; + struct linear_hash *table; + mdk_rdev_t *rdev; + int size, i, j, nb_zone; + unsigned int curr_offset; + + MOD_INC_USE_COUNT; + + conf = kmalloc (sizeof (*conf), GFP_KERNEL); + if (!conf) + goto out; + mddev->private = conf; + + if (md_check_ordering(mddev)) { + printk("linear: disks are not ordered, aborting!\n"); + goto out; + } + /* + * Find the smallest device. 
+ */ + + conf->smallest = NULL; + curr_offset = 0; + ITERATE_RDEV_ORDERED(mddev,rdev,j) { + dev_info_t *disk = conf->disks + j; + + disk->dev = rdev->dev; + disk->size = rdev->size; + disk->offset = curr_offset; + + curr_offset += disk->size; + + if (!conf->smallest || (disk->size < conf->smallest->size)) + conf->smallest = disk; + } + + nb_zone = conf->nr_zones = + md_size[mdidx(mddev)] / conf->smallest->size + + ((md_size[mdidx(mddev)] % conf->smallest->size) ? 1 : 0); + + conf->hash_table = kmalloc (sizeof (struct linear_hash) * nb_zone, + GFP_KERNEL); + if (!conf->hash_table) + goto out; + + /* + * Here we generate the linear hash table + */ + table = conf->hash_table; + i = 0; + size = 0; + for (j = 0; j < mddev->nb_dev; j++) { + dev_info_t *disk = conf->disks + j; + + if (size < 0) { + table->dev1 = disk; + table++; + } + size += disk->size; + + while (size) { + table->dev0 = disk; + size -= conf->smallest->size; + if (size < 0) + break; + table->dev1 = NULL; + table++; + } + } + table->dev1 = NULL; + + return 0; + +out: + if (conf) + kfree(conf); + MOD_DEC_USE_COUNT; + return 1; +} + +static int linear_stop (mddev_t *mddev) +{ + linear_conf_t *conf = mddev_to_conf(mddev); + + kfree(conf->hash_table); + kfree(conf); + + MOD_DEC_USE_COUNT; + + return 0; +} + +static int linear_make_request (mddev_t *mddev, + int rw, struct buffer_head * bh) +{ + linear_conf_t *conf = mddev_to_conf(mddev); + struct linear_hash *hash; + dev_info_t *tmp_dev; + long block; + + block = bh->b_rsector >> 1; + hash = conf->hash_table + (block / conf->smallest->size); + + if (block >= (hash->dev0->size + hash->dev0->offset)) { + if (!hash->dev1) { + printk ("linear_make_request : hash->dev1==NULL for block %ld\n", + block); + return -1; + } + tmp_dev = hash->dev1; + } else + tmp_dev = hash->dev0; + + if (block >= (tmp_dev->size + tmp_dev->offset) + || block < tmp_dev->offset) { + printk ("linear_make_request: Block %ld out of bounds on dev %s size %ld offset %ld\n", block, kdevname(tmp_dev->dev), tmp_dev->size, tmp_dev->offset); + return -1; + } + bh->b_rdev = tmp_dev->dev; + bh->b_rsector = bh->b_rsector - (tmp_dev->offset << 1); + + return 1; +} + +static int linear_status (char *page, mddev_t *mddev) +{ + int sz = 0; + +#undef MD_DEBUG +#ifdef MD_DEBUG + int j; + linear_conf_t *conf = mddev_to_conf(mddev); + + sz += sprintf(page+sz, " "); + for (j = 0; j < conf->nr_zones; j++) + { + sz += sprintf(page+sz, "[%s", + partition_name(conf->hash_table[j].dev0->dev)); + + if (conf->hash_table[j].dev1) + sz += sprintf(page+sz, "/%s] ", + partition_name(conf->hash_table[j].dev1->dev)); + else + sz += sprintf(page+sz, "] "); + } + sz += sprintf(page+sz, "\n"); +#endif + sz += sprintf(page+sz, " %dk rounding", mddev->param.chunk_size/1024); + return sz; +} + + +static mdk_personality_t linear_personality= +{ + name: "linear", + make_request: linear_make_request, + run: linear_run, + stop: linear_stop, + status: linear_status, +}; + +#ifndef MODULE + +void md__init linear_init (void) +{ + register_md_personality (LINEAR, &linear_personality); +} + +#else + +int init_module (void) +{ + return (register_md_personality (LINEAR, &linear_personality)); +} + +void cleanup_module (void) +{ + unregister_md_personality (LINEAR); +} + +#endif + diff --git a/drivers/md/lvm-snap.c b/drivers/md/lvm-snap.c new file mode 100644 index 000000000..04007c1be --- /dev/null +++ b/drivers/md/lvm-snap.c @@ -0,0 +1,436 @@ +/* + * kernel/lvm-snap.c + * + * Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE + * + * LVM snapshot driver is 
free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * LVM driver is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNU CC; see the file COPYING. If not, write to + * the Free Software Foundation, 59 Temple Place - Suite 330, + * Boston, MA 02111-1307, USA. + * + */ + +#include <linux/kernel.h> +#include <linux/vmalloc.h> +#include <linux/blkdev.h> +#include <linux/smp_lock.h> +#include <linux/types.h> +#include <linux/iobuf.h> +#include <linux/lvm.h> + + +static char *lvm_snap_version __attribute__ ((unused)) = "LVM 0.8final (15/02/2000)\n"; + +extern const char *const lvm_name; +extern int lvm_blocksizes[]; + +void lvm_snapshot_release(lv_t *); + +#define hashfn(dev,block,mask,chunk_size) \ + ((HASHDEV(dev)^((block)/(chunk_size))) & (mask)) + +static inline lv_block_exception_t * +lvm_find_exception_table(kdev_t org_dev, unsigned long org_start, lv_t * lv) +{ + struct list_head * hash_table = lv->lv_snapshot_hash_table, * next; + unsigned long mask = lv->lv_snapshot_hash_mask; + int chunk_size = lv->lv_chunk_size; + lv_block_exception_t * ret; + int i = 0; + + hash_table = &hash_table[hashfn(org_dev, org_start, mask, chunk_size)]; + ret = NULL; + for (next = hash_table->next; next != hash_table; next = next->next) + { + lv_block_exception_t * exception; + + exception = list_entry(next, lv_block_exception_t, hash); + if (exception->rsector_org == org_start && + exception->rdev_org == org_dev) + { + if (i) + { + /* fun, isn't it? 
:) */ + list_del(next); + list_add(next, hash_table); + } + ret = exception; + break; + } + i++; + } + return ret; +} + +static inline void lvm_hash_link(lv_block_exception_t * exception, + kdev_t org_dev, unsigned long org_start, + lv_t * lv) +{ + struct list_head * hash_table = lv->lv_snapshot_hash_table; + unsigned long mask = lv->lv_snapshot_hash_mask; + int chunk_size = lv->lv_chunk_size; + + hash_table = &hash_table[hashfn(org_dev, org_start, mask, chunk_size)]; + list_add(&exception->hash, hash_table); +} + +int lvm_snapshot_remap_block(kdev_t * org_dev, unsigned long * org_sector, + unsigned long pe_start, lv_t * lv) +{ + int ret; + unsigned long pe_off, pe_adjustment, __org_start; + kdev_t __org_dev; + int chunk_size = lv->lv_chunk_size; + lv_block_exception_t * exception; + + pe_off = pe_start % chunk_size; + pe_adjustment = (*org_sector-pe_off) % chunk_size; + __org_start = *org_sector - pe_adjustment; + __org_dev = *org_dev; + + ret = 0; + exception = lvm_find_exception_table(__org_dev, __org_start, lv); + if (exception) + { + *org_dev = exception->rdev_new; + *org_sector = exception->rsector_new + pe_adjustment; + ret = 1; + } + return ret; +} + +static void lvm_drop_snapshot(lv_t * lv_snap, const char * reason) +{ + kdev_t last_dev; + int i; + + /* no exception storage space available for this snapshot + or error on this snapshot --> release it */ + invalidate_buffers(lv_snap->lv_dev); + + last_dev = 0; + for (i = 0; i < lv_snap->lv_remap_ptr; i++) { + if ( lv_snap->lv_block_exception[i].rdev_new != last_dev) { + last_dev = lv_snap->lv_block_exception[i].rdev_new; + invalidate_buffers(last_dev); + } + } + + lvm_snapshot_release(lv_snap); + + printk(KERN_INFO + "%s -- giving up to snapshot %s on %s due %s\n", + lvm_name, lv_snap->lv_snapshot_org->lv_name, lv_snap->lv_name, + reason); +} + +static inline void lvm_snapshot_prepare_blocks(unsigned long * blocks, + unsigned long start, + int nr_sectors, + int blocksize) +{ + int i, sectors_per_block, nr_blocks; + + sectors_per_block = blocksize >> 9; + nr_blocks = nr_sectors / sectors_per_block; + start /= sectors_per_block; + + for (i = 0; i < nr_blocks; i++) + blocks[i] = start++; +} + +static inline int get_blksize(kdev_t dev) +{ + int correct_size = BLOCK_SIZE, i, major; + + major = MAJOR(dev); + if (blksize_size[major]) + { + i = blksize_size[major][MINOR(dev)]; + if (i) + correct_size = i; + } + return correct_size; +} + +#ifdef DEBUG_SNAPSHOT +static inline void invalidate_snap_cache(unsigned long start, unsigned long nr, + kdev_t dev) +{ + struct buffer_head * bh; + int sectors_per_block, i, blksize, minor; + + minor = MINOR(dev); + blksize = lvm_blocksizes[minor]; + sectors_per_block = blksize >> 9; + nr /= sectors_per_block; + start /= sectors_per_block; + + for (i = 0; i < nr; i++) + { + bh = get_hash_table(dev, start++, blksize); + if (bh) + bforget(bh); + } +} +#endif + +/* + * copy on write handler for one snapshot logical volume + * + * read the original blocks and store it/them on the new one(s). + * if there is no exception storage space free any longer --> release snapshot. + * + * this routine gets called for each _first_ write to a physical chunk. 
+ */ +int lvm_snapshot_COW(kdev_t org_phys_dev, + unsigned long org_phys_sector, + unsigned long org_pe_start, + unsigned long org_virt_sector, + lv_t * lv_snap) +{ + const char * reason; + unsigned long org_start, snap_start, virt_start, pe_off; + int idx = lv_snap->lv_remap_ptr, chunk_size = lv_snap->lv_chunk_size; + kdev_t snap_phys_dev; + struct kiobuf * iobuf; + unsigned long blocks[KIO_MAX_SECTORS]; + int blksize_snap, blksize_org, min_blksize, max_blksize; + int max_sectors, nr_sectors; + + /* check if we are out of snapshot space */ + if (idx >= lv_snap->lv_remap_end) + goto fail_out_of_space; + + /* calculate physical boundaries of source chunk */ + pe_off = org_pe_start % chunk_size; + org_start = org_phys_sector - ((org_phys_sector-pe_off) % chunk_size); + virt_start = org_virt_sector - (org_phys_sector - org_start); + + /* calculate physical boundaries of destination chunk */ + snap_phys_dev = lv_snap->lv_block_exception[idx].rdev_new; + snap_start = lv_snap->lv_block_exception[idx].rsector_new; + +#ifdef DEBUG_SNAPSHOT + printk(KERN_INFO + "%s -- COW: " + "org %02d:%02d faulting %lu start %lu, " + "snap %02d:%02d start %lu, " + "size %d, pe_start %lu pe_off %lu, virt_sec %lu\n", + lvm_name, + MAJOR(org_phys_dev), MINOR(org_phys_dev), org_phys_sector, + org_start, + MAJOR(snap_phys_dev), MINOR(snap_phys_dev), snap_start, + chunk_size, + org_pe_start, pe_off, + org_virt_sector); +#endif + + iobuf = lv_snap->lv_iobuf; + + blksize_org = get_blksize(org_phys_dev); + blksize_snap = get_blksize(snap_phys_dev); + max_blksize = max(blksize_org, blksize_snap); + min_blksize = min(blksize_org, blksize_snap); + max_sectors = KIO_MAX_SECTORS * (min_blksize>>9); + + if (chunk_size % (max_blksize>>9)) + goto fail_blksize; + + while (chunk_size) + { + nr_sectors = min(chunk_size, max_sectors); + chunk_size -= nr_sectors; + + iobuf->length = nr_sectors << 9; + + lvm_snapshot_prepare_blocks(blocks, org_start, + nr_sectors, blksize_org); + if (brw_kiovec(READ, 1, &iobuf, org_phys_dev, + blocks, blksize_org) != (nr_sectors<<9)) + goto fail_raw_read; + + lvm_snapshot_prepare_blocks(blocks, snap_start, + nr_sectors, blksize_snap); + if (brw_kiovec(WRITE, 1, &iobuf, snap_phys_dev, + blocks, blksize_snap) != (nr_sectors<<9)) + goto fail_raw_write; + } + +#ifdef DEBUG_SNAPSHOT + /* invalidate the logcial snapshot buffer cache */ + invalidate_snap_cache(virt_start, lv_snap->lv_chunk_size, + lv_snap->lv_dev); +#endif + + /* the original chunk is now stored on the snapshot volume + so update the execption table */ + lv_snap->lv_block_exception[idx].rdev_org = org_phys_dev; + lv_snap->lv_block_exception[idx].rsector_org = org_start; + lvm_hash_link(lv_snap->lv_block_exception + idx, + org_phys_dev, org_start, lv_snap); + lv_snap->lv_remap_ptr = idx + 1; + return 1; + + /* slow path */ + out: + lvm_drop_snapshot(lv_snap, reason); + return -1; + + fail_out_of_space: + reason = "out of space"; + goto out; + fail_raw_read: + reason = "read error"; + goto out; + fail_raw_write: + reason = "write error"; + goto out; + fail_blksize: + reason = "blocksize error"; + goto out; +} + +static int lvm_snapshot_alloc_iobuf_pages(struct kiobuf * iobuf, int sectors) +{ + int bytes, nr_pages, err, i; + + bytes = sectors << 9; + nr_pages = (bytes + ~PAGE_MASK) >> PAGE_SHIFT; + err = expand_kiobuf(iobuf, nr_pages); + if (err) + goto out; + + err = -ENOMEM; + iobuf->locked = 1; + iobuf->nr_pages = 0; + for (i = 0; i < nr_pages; i++) + { + struct page * page; + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,3,27) + page = 
alloc_page(GFP_KERNEL); + if (!page) + goto out; +#else + { + unsigned long addr = __get_free_page(GFP_USER); + if (!addr) + goto out; + iobuf->pagelist[i] = addr; + page = virt_to_page(addr); + } +#endif + + iobuf->maplist[i] = page; + /* the only point to lock the page here is to be allowed + to share unmap_kiobuf() in the fail-path */ +#ifndef LockPage +#define LockPage(map) set_bit(PG_locked, &(map)->flags) +#endif + LockPage(page); + iobuf->nr_pages++; + } + iobuf->offset = 0; + + err = 0; + out: + return err; +} + +static int calc_max_buckets(void) +{ + unsigned long mem; + + mem = num_physpages << PAGE_SHIFT; + mem /= 100; + mem *= 2; + mem /= sizeof(struct list_head); + + return mem; +} + +static int lvm_snapshot_alloc_hash_table(lv_t * lv) +{ + int err; + unsigned long buckets, max_buckets, size; + struct list_head * hash; + + buckets = lv->lv_remap_end; + max_buckets = calc_max_buckets(); + buckets = min(buckets, max_buckets); + while (buckets & (buckets-1)) + buckets &= (buckets-1); + + size = buckets * sizeof(struct list_head); + + err = -ENOMEM; + hash = vmalloc(size); + lv->lv_snapshot_hash_table = hash; + + if (!hash) + goto out; + + lv->lv_snapshot_hash_mask = buckets-1; + while (buckets--) + INIT_LIST_HEAD(hash+buckets); + err = 0; + out: + return err; +} + +int lvm_snapshot_alloc(lv_t * lv_snap) +{ + int err, blocksize, max_sectors; + + err = alloc_kiovec(1, &lv_snap->lv_iobuf); + if (err) + goto out; + + blocksize = lvm_blocksizes[MINOR(lv_snap->lv_dev)]; + max_sectors = KIO_MAX_SECTORS << (PAGE_SHIFT-9); + + err = lvm_snapshot_alloc_iobuf_pages(lv_snap->lv_iobuf, max_sectors); + if (err) + goto out_free_kiovec; + + err = lvm_snapshot_alloc_hash_table(lv_snap); + if (err) + goto out_free_kiovec; + out: + return err; + + out_free_kiovec: + unmap_kiobuf(lv_snap->lv_iobuf); + free_kiovec(1, &lv_snap->lv_iobuf); + goto out; +} + +void lvm_snapshot_release(lv_t * lv) +{ + if (lv->lv_block_exception) + { + vfree(lv->lv_block_exception); + lv->lv_block_exception = NULL; + } + if (lv->lv_snapshot_hash_table) + { + vfree(lv->lv_snapshot_hash_table); + lv->lv_snapshot_hash_table = NULL; + } + if (lv->lv_iobuf) + { + free_kiovec(1, &lv->lv_iobuf); + lv->lv_iobuf = NULL; + } +} diff --git a/drivers/md/lvm.c b/drivers/md/lvm.c new file mode 100644 index 000000000..239ed99aa --- /dev/null +++ b/drivers/md/lvm.c @@ -0,0 +1,2567 @@ +/* + * kernel/lvm.c + * + * Copyright (C) 1997 - 2000 Heinz Mauelshagen, Germany + * + * February-November 1997 + * April-May,July-August,November 1998 + * January-March,May,July,September,October 1999 + * January,February 2000 + * + * + * LVM driver is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * LVM driver is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNU CC; see the file COPYING. If not, write to + * the Free Software Foundation, 59 Temple Place - Suite 330, + * Boston, MA 02111-1307, USA. 
+ * + */ + +/* + * Changelog + * + * 09/11/1997 - added chr ioctls VG_STATUS_GET_COUNT + * and VG_STATUS_GET_NAMELIST + * 18/01/1998 - change lvm_chr_open/close lock handling + * 30/04/1998 - changed LV_STATUS ioctl to LV_STATUS_BYNAME and + * - added LV_STATUS_BYINDEX ioctl + * - used lvm_status_byname_req_t and + * lvm_status_byindex_req_t vars + * 04/05/1998 - added multiple device support + * 08/05/1998 - added support to set/clear extendable flag in volume group + * 09/05/1998 - changed output of lvm_proc_get_info() because of + * support for free (eg. longer) logical volume names + * 12/05/1998 - added spin_locks (thanks to Pascal van Dam + * <pascal@ramoth.xs4all.nl>) + * 25/05/1998 - fixed handling of locked PEs in lvm_map() and lvm_chr_ioctl() + * 26/05/1998 - reactivated verify_area by access_ok + * 07/06/1998 - used vmalloc/vfree instead of kmalloc/kfree to go + * beyond 128/256 KB max allocation limit per call + * - #ifdef blocked spin_lock calls to avoid compile errors + * with 2.0.x + * 11/06/1998 - another enhancement to spinlock code in lvm_chr_open() + * and use of LVM_VERSION_CODE instead of my own macros + * (thanks to Michael Marxmeier <mike@msede.com>) + * 07/07/1998 - added statistics in lvm_map() + * 08/07/1998 - saved statistics in lvm_do_lv_extend_reduce() + * 25/07/1998 - used __initfunc macro + * 02/08/1998 - changes for official char/block major numbers + * 07/08/1998 - avoided init_module() and cleanup_module() to be static + * 30/08/1998 - changed VG lv_open counter from sum of LV lv_open counters + * to sum of LVs open (no matter how often each is) + * 01/09/1998 - fixed lvm_gendisk.part[] index error + * 07/09/1998 - added copying of lv_current_pe-array + * in LV_STATUS_BYINDEX ioctl + * 17/11/1998 - added KERN_* levels to printk + * 13/01/1999 - fixed LV index bug in lvm_do_lv_create() which hit lvrename + * 07/02/1999 - fixed spinlock handling bug in case of LVM_RESET + * by moving spinlock code from lvm_chr_open() + * to lvm_chr_ioctl() + * - added LVM_LOCK_LVM ioctl to lvm_chr_ioctl() + * - allowed LVM_RESET and retrieval commands to go ahead; + * only other update ioctls are blocked now + * - fixed pv->pe to NULL for pv_status + * - using lv_req structure in lvm_chr_ioctl() now + * - fixed NULL ptr reference bug in lvm_do_lv_extend_reduce() + * caused by uncontiguous PV array in lvm_chr_ioctl(VG_REDUCE) + * 09/02/1999 - changed BLKRASET and BLKRAGET in lvm_chr_ioctl() to + * handle lgoical volume private read ahead sector + * - implemented LV read_ahead handling with lvm_blk_read() + * and lvm_blk_write() + * 10/02/1999 - implemented 2.[12].* support function lvm_hd_name() + * to be used in drivers/block/genhd.c by disk_name() + * 12/02/1999 - fixed index bug in lvm_blk_ioctl(), HDIO_GETGEO + * - enhanced gendisk insert/remove handling + * 16/02/1999 - changed to dynamic block minor number allocation to + * have as much as 99 volume groups with 256 logical volumes + * as the grand total; this allows having 1 volume group with + * up to 256 logical volumes in it + * 21/02/1999 - added LV open count information to proc filesystem + * - substituted redundant LVM_RESET code by calls + * to lvm_do_vg_remove() + * 22/02/1999 - used schedule_timeout() to be more responsive + * in case of lvm_do_vg_remove() with lots of logical volumes + * 19/03/1999 - fixed NULL pointer bug in module_init/lvm_init + * 17/05/1999 - used DECLARE_WAIT_QUEUE_HEAD macro (>2.3.0) + * - enhanced lvm_hd_name support + * 03/07/1999 - avoided use of KERNEL_VERSION macro based ifdefs and 
+ * memcpy_tofs/memcpy_fromfs macro redefinitions + * 06/07/1999 - corrected reads/writes statistic counter copy in case + * of striped logical volume + * 28/07/1999 - implemented snapshot logical volumes + * - lvm_chr_ioctl + * - LV_STATUS_BYINDEX + * - LV_STATUS_BYNAME + * - lvm_do_lv_create + * - lvm_do_lv_remove + * - lvm_map + * - new lvm_snapshot_remap_block + * - new lvm_snapshot_remap_new_block + * 08/10/1999 - implemented support for multiple snapshots per + * original logical volume + * 12/10/1999 - support for 2.3.19 + * 11/11/1999 - support for 2.3.28 + * 21/11/1999 - changed lvm_map() interface to buffer_head based + * 19/12/1999 - support for 2.3.33 + * 01/01/2000 - changed locking concept in lvm_map(), + * lvm_do_vg_create() and lvm_do_lv_remove() + * 15/01/2000 - fixed PV_FLUSH bug in lvm_chr_ioctl() + * 24/01/2000 - ported to 2.3.40 including Alan Cox's pointer changes etc. + * 29/01/2000 - used kmalloc/kfree again for all small structures + * 20/01/2000 - cleaned up lvm_chr_ioctl by moving code + * to seperated functions + * - avoided "/dev/" in proc filesystem output + * - avoided inline strings functions lvm_strlen etc. + * 14/02/2000 - support for 2.3.43 + * - integrated Andrea Arcangeli's snapshot code + * + */ + + +static char *lvm_version = "LVM version 0.8final by Heinz Mauelshagen (15/02/2000)\n"; +static char *lvm_short_version = "version 0.8final (15/02/2000)"; + +#define MAJOR_NR LVM_BLK_MAJOR +#define DEVICE_OFF(device) + +#include <linux/config.h> +#include <linux/version.h> + +#ifdef MODVERSIONS +#undef MODULE +#define MODULE +#include <linux/modversions.h> +#endif + +#include <linux/module.h> + +#include <linux/kernel.h> +#include <linux/vmalloc.h> +#include <linux/slab.h> +#include <linux/init.h> + +#include <linux/hdreg.h> +#include <linux/stat.h> +#include <linux/fs.h> +#include <linux/proc_fs.h> +#include <linux/blkdev.h> +#include <linux/genhd.h> +#include <linux/locks.h> +#include <linux/smp_lock.h> +#include <asm/ioctl.h> +#include <asm/segment.h> +#include <asm/uaccess.h> + +#ifdef CONFIG_KERNELD +#include <linux/kerneld.h> +#endif + +#define LOCAL_END_REQUEST + +#include <linux/blk.h> +#include <linux/blkpg.h> + +#include <linux/errno.h> +#include <linux/lvm.h> + +#define LVM_CORRECT_READ_AHEAD(a) \ + (((a) < LVM_MIN_READ_AHEAD || (a) > LVM_MAX_READ_AHEAD) \ + ? 
LVM_MAX_READ_AHEAD : (a)) + +#ifndef WRITEA +# define WRITEA WRITE +#endif + +/* + * External function prototypes + */ +#ifdef MODULE +int init_module(void); +void cleanup_module(void); +#else +extern int lvm_init(void); +#endif + +static void lvm_dummy_device_request(request_queue_t *); +#define DEVICE_REQUEST lvm_dummy_device_request + +static int lvm_make_request_fn(request_queue_t *, int, struct buffer_head*); +static void lvm_plug_device_noop(request_queue_t *, kdev_t); + +static int lvm_blk_ioctl(struct inode *, struct file *, uint, ulong); +static int lvm_blk_open(struct inode *, struct file *); + +static int lvm_chr_open(struct inode *, struct file *); + +static int lvm_chr_close(struct inode *, struct file *); +static int lvm_blk_close(struct inode *, struct file *); + +static int lvm_chr_ioctl(struct inode *, struct file *, uint, ulong); + +#if defined CONFIG_LVM_PROC_FS && defined CONFIG_PROC_FS +static int lvm_proc_get_info(char *, char **, off_t, int); +static int (*lvm_proc_get_info_ptr) (char *, char **, off_t, int) = +&lvm_proc_get_info; +#endif + +#ifdef LVM_HD_NAME +void lvm_hd_name(char *, int); +#endif +/* End external function prototypes */ + + +/* + * Internal function prototypes + */ +static void lvm_init_vars(void); + +/* external snapshot calls */ +int lvm_snapshot_remap_block(kdev_t *, ulong *, ulong, lv_t *); +int lvm_snapshot_COW(kdev_t, ulong, ulong, ulong, lv_t *); +int lvm_snapshot_alloc(lv_t *); +void lvm_snapshot_release(lv_t *); + +#ifdef LVM_HD_NAME +extern void (*lvm_hd_name_ptr) (char *, int); +#endif +static int lvm_map(struct buffer_head *, int); +static int lvm_do_lock_lvm(void); +static int lvm_do_le_remap(vg_t *, void *); +static int lvm_do_pe_lock_unlock(vg_t *r, void *); +static int lvm_do_vg_create(int, void *); +static int lvm_do_vg_extend(vg_t *, void *); +static int lvm_do_vg_reduce(vg_t *, void *); +static int lvm_do_vg_remove(int); +static int lvm_do_lv_create(int, char *, lv_t *); +static int lvm_do_lv_remove(int, char *, int); +static int lvm_do_lv_extend_reduce(int, char *, lv_t *); +static int lvm_do_lv_status_byname(vg_t *r, void *); +static int lvm_do_lv_status_byindex(vg_t *, void *arg); +static int lvm_do_pv_change(vg_t*, void*); +static int lvm_do_pv_status(vg_t *, void *); +static void lvm_geninit(struct gendisk *); +#ifdef LVM_GET_INODE +static struct inode *lvm_get_inode(kdev_t); +void lvm_clear_inode(struct inode *); +#endif +/* END Internal function prototypes */ + + +/* volume group descriptor area pointers */ +static vg_t *vg[ABS_MAX_VG]; +static pv_t *pvp = NULL; +static lv_t *lvp = NULL; +static pe_t *pep = NULL; +static pe_t *pep1 = NULL; + + +/* map from block minor number to VG and LV numbers */ +typedef struct { + int vg_number; + int lv_number; +} vg_lv_map_t; +static vg_lv_map_t vg_lv_map[ABS_MAX_LV]; + + +/* Request structures (lvm_chr_ioctl()) */ +static pv_change_req_t pv_change_req; +static pv_flush_req_t pv_flush_req; +static pv_status_req_t pv_status_req; +static pe_lock_req_t pe_lock_req; +static le_remap_req_t le_remap_req; +static lv_req_t lv_req; + +#ifdef LVM_TOTAL_RESET +static int lvm_reset_spindown = 0; +#endif + +static char pv_name[NAME_LEN]; +/* static char rootvg[NAME_LEN] = { 0, }; */ +static uint lv_open = 0; +const char *const lvm_name = LVM_NAME; +static int lock = 0; +static int loadtime = 0; +static uint vg_count = 0; +static long lvm_chr_open_count = 0; +static ushort lvm_iop_version = LVM_DRIVER_IOP_VERSION; +static DECLARE_WAIT_QUEUE_HEAD(lvm_snapshot_wait); +static 
DECLARE_WAIT_QUEUE_HEAD(lvm_wait); +static DECLARE_WAIT_QUEUE_HEAD(lvm_map_wait); + +static spinlock_t lvm_lock = SPIN_LOCK_UNLOCKED; + +static devfs_handle_t lvm_devfs_handle; +static devfs_handle_t vg_devfs_handle[MAX_VG]; +static devfs_handle_t ch_devfs_handle[MAX_VG]; +static devfs_handle_t lv_devfs_handle[MAX_LV]; + +static struct file_operations lvm_chr_fops = +{ + owner: THIS_MODULE, + open: lvm_chr_open, + release: lvm_chr_close, + ioctl: lvm_chr_ioctl, +}; + +static struct block_device_operations lvm_blk_dops = +{ + open: lvm_blk_open, + release: lvm_blk_close, + ioctl: lvm_blk_ioctl +}; + +/* gendisk structures */ +static struct hd_struct lvm_hd_struct[MAX_LV]; +static int lvm_blocksizes[MAX_LV] = +{0,}; +static int lvm_size[MAX_LV] = +{0,}; +static struct gendisk lvm_gendisk = +{ + MAJOR_NR, /* major # */ + LVM_NAME, /* name of major */ + 0, /* number of times minor is shifted + to get real minor */ + 1, /* maximum partitions per device */ + lvm_hd_struct, /* partition table */ + lvm_size, /* device size in blocks, copied + to block_size[] */ + MAX_LV, /* number or real devices */ + NULL, /* internal */ + NULL, /* pointer to next gendisk struct (internal) */ +}; + + +#ifdef MODULE +/* + * Module initialization... + */ +int init_module(void) +#else +/* + * Driver initialization... + */ +#ifdef __initfunc +__initfunc(int lvm_init(void)) +#else +int __init lvm_init(void) +#endif +#endif /* #ifdef MODULE */ +{ + struct gendisk *gendisk_ptr = NULL; + + if (register_chrdev(LVM_CHAR_MAJOR, lvm_name, &lvm_chr_fops) < 0) { + printk(KERN_ERR "%s -- register_chrdev failed\n", lvm_name); + return -EIO; + } + if (register_blkdev(MAJOR_NR, lvm_name, &lvm_blk_dops) < 0) { + printk("%s -- register_blkdev failed\n", lvm_name); + if (unregister_chrdev(LVM_CHAR_MAJOR, lvm_name) < 0) + printk(KERN_ERR "%s -- unregister_chrdev failed\n", lvm_name); + return -EIO; + } + + lvm_devfs_handle = devfs_register( + 0 , "lvm", 0, 0, LVM_CHAR_MAJOR, + S_IFCHR | S_IRUSR | S_IWUSR | S_IRGRP, + &lvm_chr_fops, NULL); + +#if defined CONFIG_LVM_PROC_FS && defined CONFIG_PROC_FS + create_proc_info_entry(LVM_NAME, S_IFREG | S_IRUGO, + &proc_root, lvm_proc_get_info_ptr); +#endif + + lvm_init_vars(); + lvm_geninit(&lvm_gendisk); + + /* insert our gendisk at the corresponding major */ + if (gendisk_head != NULL) { + gendisk_ptr = gendisk_head; + while (gendisk_ptr->next != NULL && + gendisk_ptr->major > lvm_gendisk.major) { + gendisk_ptr = gendisk_ptr->next; + } + lvm_gendisk.next = gendisk_ptr->next; + gendisk_ptr->next = &lvm_gendisk; + } else { + gendisk_head = &lvm_gendisk; + lvm_gendisk.next = NULL; + } + +#ifdef LVM_HD_NAME + /* reference from drivers/block/genhd.c */ + lvm_hd_name_ptr = lvm_hd_name; +#endif + + blk_init_queue(BLK_DEFAULT_QUEUE(MAJOR_NR), DEVICE_REQUEST); + blk_queue_make_request(BLK_DEFAULT_QUEUE(MAJOR_NR), lvm_make_request_fn); + blk_queue_pluggable(BLK_DEFAULT_QUEUE(MAJOR_NR), lvm_plug_device_noop); + /* optional read root VGDA */ +/* + if ( *rootvg != 0) vg_read_with_pv_and_lv ( rootvg, &vg); +*/ + + printk(KERN_INFO + "%s%s -- " +#ifdef MODULE + "Module" +#else + "Driver" +#endif + " successfully initialized\n", + lvm_version, lvm_name); + + return 0; +} /* init_module() / lvm_init() */ + + +#ifdef MODULE +/* + * Module cleanup... 
+ */ +void cleanup_module(void) +{ + struct gendisk *gendisk_ptr = NULL, *gendisk_ptr_prev = NULL; + + devfs_unregister (lvm_devfs_handle); + + if (unregister_chrdev(LVM_CHAR_MAJOR, lvm_name) < 0) { + printk(KERN_ERR "%s -- unregister_chrdev failed\n", lvm_name); + } + if (unregister_blkdev(MAJOR_NR, lvm_name) < 0) { + printk(KERN_ERR "%s -- unregister_blkdev failed\n", lvm_name); + } + blk_cleanup_queue(BLK_DEFAULT_QUEUE(MAJOR_NR)); + + gendisk_ptr = gendisk_ptr_prev = gendisk_head; + while (gendisk_ptr != NULL) { + if (gendisk_ptr == &lvm_gendisk) + break; + gendisk_ptr_prev = gendisk_ptr; + gendisk_ptr = gendisk_ptr->next; + } + /* delete our gendisk from chain */ + if (gendisk_ptr == &lvm_gendisk) + gendisk_ptr_prev->next = gendisk_ptr->next; + + blk_size[MAJOR_NR] = NULL; + blksize_size[MAJOR_NR] = NULL; + +#if defined CONFIG_LVM_PROC_FS && defined CONFIG_PROC_FS + remove_proc_entry(LVM_NAME, &proc_root); +#endif + +#ifdef LVM_HD_NAME + /* reference from linux/drivers/block/genhd.c */ + lvm_hd_name_ptr = NULL; +#endif + + printk(KERN_INFO "%s -- Module successfully deactivated\n", lvm_name); + + return; +} /* void cleanup_module() */ +#endif /* #ifdef MODULE */ + + +/* + * support function to initialize lvm variables + */ +#ifdef __initfunc +__initfunc(void lvm_init_vars(void)) +#else +void __init lvm_init_vars(void) +#endif +{ + int v; + + loadtime = CURRENT_TIME; + + pe_lock_req.lock = UNLOCK_PE; + pe_lock_req.data.lv_dev = pe_lock_req.data.pv_dev = 0; + pe_lock_req.data.pv_offset = 0; + + /* Initialize VG pointers */ + for (v = 0; v < ABS_MAX_VG; v++) vg[v] = NULL; + + /* Initialize LV -> VG association */ + for (v = 0; v < ABS_MAX_LV; v++) { + /* index ABS_MAX_VG never used for real VG */ + vg_lv_map[v].vg_number = ABS_MAX_VG; + vg_lv_map[v].lv_number = -1; + } + + return; +} /* lvm_init_vars() */ + + +/******************************************************************** + * + * Character device functions + * + ********************************************************************/ + +/* + * character device open routine + */ +static int lvm_chr_open(struct inode *inode, + struct file *file) +{ + int minor = MINOR(inode->i_rdev); + +#ifdef DEBUG + printk(KERN_DEBUG + "%s -- lvm_chr_open MINOR: %d VG#: %d mode: 0x%X lock: %d\n", + lvm_name, minor, VG_CHR(minor), file->f_mode, lock); +#endif + + /* super user validation */ + if (!capable(CAP_SYS_ADMIN)) return -EACCES; + + /* Group special file open */ + if (VG_CHR(minor) > MAX_VG) return -ENXIO; + + lvm_chr_open_count++; + return 0; +} /* lvm_chr_open() */ + + +/* + * character device i/o-control routine + * + * Only one changing process can do changing ioctl at one time, + * others will block. 
+ * + */ +static int lvm_chr_ioctl(struct inode *inode, struct file *file, + uint command, ulong a) +{ + int minor = MINOR(inode->i_rdev); + uint extendable, l, v; + void *arg = (void *) a; + lv_t lv; + vg_t* vg_ptr = vg[VG_CHR(minor)]; + + /* otherwise cc will complain about unused variables */ + (void) lvm_lock; + + +#ifdef DEBUG_IOCTL + printk(KERN_DEBUG + "%s -- lvm_chr_ioctl: command: 0x%X MINOR: %d " + "VG#: %d mode: 0x%X\n", + lvm_name, command, minor, VG_CHR(minor), file->f_mode); +#endif + +#ifdef LVM_TOTAL_RESET + if (lvm_reset_spindown > 0) return -EACCES; +#endif + + /* Main command switch */ + switch (command) { + case LVM_LOCK_LVM: + /* lock the LVM */ + return lvm_do_lock_lvm(); + + case LVM_GET_IOP_VERSION: + /* check lvm version to ensure driver/tools+lib + interoperability */ + if (copy_to_user(arg, &lvm_iop_version, sizeof(ushort)) != 0) + return -EFAULT; + return 0; + +#ifdef LVM_TOTAL_RESET + case LVM_RESET: + /* lock reset function */ + lvm_reset_spindown = 1; + for (v = 0; v < ABS_MAX_VG; v++) { + if (vg[v] != NULL) lvm_do_vg_remove(v); + } + +#ifdef MODULE + while (GET_USE_COUNT(&__this_module) < 1) + MOD_INC_USE_COUNT; + while (GET_USE_COUNT(&__this_module) > 1) + MOD_DEC_USE_COUNT; +#endif /* MODULE */ + lock = 0; /* release lock */ + wake_up_interruptible(&lvm_wait); + return 0; +#endif /* LVM_TOTAL_RESET */ + + + case LE_REMAP: + /* remap a logical extent (after moving the physical extent) */ + return lvm_do_le_remap(vg_ptr,arg); + + case PE_LOCK_UNLOCK: + /* lock/unlock i/o to a physical extent to move it to another + physical volume (move's done in user space's pvmove) */ + return lvm_do_pe_lock_unlock(vg_ptr,arg); + + case VG_CREATE: + /* create a VGDA */ + return lvm_do_vg_create(minor, arg); + + case VG_REMOVE: + /* remove an inactive VGDA */ + return lvm_do_vg_remove(minor); + + case VG_EXTEND: + /* extend a volume group */ + return lvm_do_vg_extend(vg_ptr,arg); + + case VG_REDUCE: + /* reduce a volume group */ + return lvm_do_vg_reduce(vg_ptr,arg); + + + case VG_SET_EXTENDABLE: + /* set/clear extendability flag of volume group */ + if (vg_ptr == NULL) return -ENXIO; + if (copy_from_user(&extendable, arg, sizeof(extendable)) != 0) + return -EFAULT; + + if (extendable == VG_EXTENDABLE || + extendable == ~VG_EXTENDABLE) { + if (extendable == VG_EXTENDABLE) + vg_ptr->vg_status |= VG_EXTENDABLE; + else + vg_ptr->vg_status &= ~VG_EXTENDABLE; + } else return -EINVAL; + return 0; + + + case VG_STATUS: + /* get volume group data (only the vg_t struct) */ + if (vg_ptr == NULL) return -ENXIO; + if (copy_to_user(arg, vg_ptr, sizeof(vg_t)) != 0) + return -EFAULT; + return 0; + + + case VG_STATUS_GET_COUNT: + /* get volume group count */ + if (copy_to_user(arg, &vg_count, sizeof(vg_count)) != 0) + return -EFAULT; + return 0; + + + case VG_STATUS_GET_NAMELIST: + /* get volume group count */ + for (l = v = 0; v < ABS_MAX_VG; v++) { + if (vg[v] != NULL) { + if (copy_to_user(arg + l++ * NAME_LEN, + vg[v]->vg_name, + NAME_LEN) != 0) + return -EFAULT; + } + } + return 0; + + + case LV_CREATE: + case LV_REMOVE: + case LV_EXTEND: + case LV_REDUCE: + /* create, remove, extend or reduce a logical volume */ + if (vg_ptr == NULL) return -ENXIO; + if (copy_from_user(&lv_req, arg, sizeof(lv_req)) != 0) + return -EFAULT; + + if (command != LV_REMOVE) { + if (copy_from_user(&lv, lv_req.lv, sizeof(lv_t)) != 0) + return -EFAULT; + } + switch (command) { + case LV_CREATE: + return lvm_do_lv_create(minor, lv_req.lv_name, &lv); + + case LV_REMOVE: + return lvm_do_lv_remove(minor, 
lv_req.lv_name, -1); + + case LV_EXTEND: + case LV_REDUCE: + return lvm_do_lv_extend_reduce(minor, lv_req.lv_name, &lv); + } + + + case LV_STATUS_BYNAME: + /* get status of a logical volume by name */ + return lvm_do_lv_status_byname(vg_ptr,arg); + + case LV_STATUS_BYINDEX: + /* get status of a logical volume by index */ + return lvm_do_lv_status_byindex(vg_ptr,arg); + + case PV_CHANGE: + /* change a physical volume */ + return lvm_do_pv_change(vg_ptr,arg); + + case PV_STATUS: + /* get physical volume data (pv_t structure only) */ + return lvm_do_pv_status(vg_ptr,arg); + + case PV_FLUSH: + /* physical volume buffer flush/invalidate */ + if (copy_from_user(&pv_flush_req, arg, + sizeof(pv_flush_req)) != 0) + return -EFAULT; + + for ( v = 0; v < ABS_MAX_VG; v++) { + unsigned int p; + if ( vg[v] == NULL) continue; + for ( p = 0; p < vg[v]->pv_max; p++) { + if ( vg[v]->pv[p] != NULL && + strcmp ( vg[v]->pv[p]->pv_name, + pv_flush_req.pv_name) == 0) { + fsync_dev ( vg[v]->pv[p]->pv_dev); + invalidate_buffers ( vg[v]->pv[p]->pv_dev); + return 0; + } + } + } + return 0; + + default: + printk(KERN_WARNING + "%s -- lvm_chr_ioctl: unknown command %x\n", + lvm_name, command); + return -EINVAL; + } + + return 0; +} /* lvm_chr_ioctl */ + + +/* + * character device close routine + */ +static int lvm_chr_close(struct inode *inode, struct file *file) +{ +#ifdef DEBUG + int minor = MINOR(inode->i_rdev); + printk(KERN_DEBUG + "%s -- lvm_chr_close VG#: %d\n", lvm_name, VG_CHR(minor)); +#endif + + lock_kernel(); +#ifdef LVM_TOTAL_RESET + if (lvm_reset_spindown > 0) { + lvm_reset_spindown = 0; + lvm_chr_open_count = 1; + } +#endif + + if (lvm_chr_open_count > 0) lvm_chr_open_count--; + if (lock == current->pid) { + lock = 0; /* release lock */ + wake_up_interruptible(&lvm_wait); + } + unlock_kernel(); + + return 0; +} /* lvm_chr_close() */ + + + +/******************************************************************** + * + * Block device functions + * + ********************************************************************/ + +/* + * block device open routine + */ +static int lvm_blk_open(struct inode *inode, struct file *file) +{ + int minor = MINOR(inode->i_rdev); + lv_t *lv_ptr; + vg_t *vg_ptr = vg[VG_BLK(minor)]; + +#ifdef DEBUG_LVM_BLK_OPEN + printk(KERN_DEBUG + "%s -- lvm_blk_open MINOR: %d VG#: %d LV#: %d mode: 0x%X\n", + lvm_name, minor, VG_BLK(minor), LV_BLK(minor), file->f_mode); +#endif + +#ifdef LVM_TOTAL_RESET + if (lvm_reset_spindown > 0) + return -EPERM; +#endif + + if (vg_ptr != NULL && + (vg_ptr->vg_status & VG_ACTIVE) && + (lv_ptr = vg_ptr->lv[LV_BLK(minor)]) != NULL && + LV_BLK(minor) >= 0 && + LV_BLK(minor) < vg_ptr->lv_max) { + + /* Check parallel LV spindown (LV remove) */ + if (lv_ptr->lv_status & LV_SPINDOWN) return -EPERM; + + /* Check inactive LV and open for read/write */ + if (file->f_mode & O_RDWR) { + if (!(lv_ptr->lv_status & LV_ACTIVE)) return -EPERM; + if (!(lv_ptr->lv_access & LV_WRITE)) return -EACCES; + } + + /* be sure to increment VG counter */ + if (lv_ptr->lv_open == 0) vg_ptr->lv_open++; + lv_ptr->lv_open++; + + MOD_INC_USE_COUNT; + +#ifdef DEBUG_LVM_BLK_OPEN + printk(KERN_DEBUG + "%s -- lvm_blk_open MINOR: %d VG#: %d LV#: %d size: %d\n", + lvm_name, minor, VG_BLK(minor), LV_BLK(minor), + lv_ptr->lv_size); +#endif + + return 0; + } + return -ENXIO; +} /* lvm_blk_open() */ + + +/* + * block device i/o-control routine + */ +static int lvm_blk_ioctl(struct inode *inode, struct file *file, + uint command, ulong a) +{ + int minor = MINOR(inode->i_rdev); + vg_t *vg_ptr = 
vg[VG_BLK(minor)]; + lv_t *lv_ptr = vg_ptr->lv[LV_BLK(minor)]; + void *arg = (void *) a; + struct hd_geometry *hd = (struct hd_geometry *) a; + +#ifdef DEBUG_IOCTL + printk(KERN_DEBUG + "%s -- lvm_blk_ioctl MINOR: %d command: 0x%X arg: %X " + "VG#: %dl LV#: %d\n", + lvm_name, minor, command, (ulong) arg, + VG_BLK(minor), LV_BLK(minor)); +#endif + + switch (command) { + case BLKGETSIZE: + /* return device size */ +#ifdef DEBUG_IOCTL + printk(KERN_DEBUG + "%s -- lvm_blk_ioctl -- BLKGETSIZE: %u\n", + lvm_name, lv_ptr->lv_size); +#endif + if (put_user(lv_ptr->lv_size, (long *)arg)) + return -EFAULT; + break; + + + case BLKFLSBUF: + /* flush buffer cache */ + if (!capable(CAP_SYS_ADMIN)) return -EACCES; + +#ifdef DEBUG_IOCTL + printk(KERN_DEBUG + "%s -- lvm_blk_ioctl -- BLKFLSBUF\n", lvm_name); +#endif + fsync_dev(inode->i_rdev); + invalidate_buffers(inode->i_rdev); + break; + + + case BLKRASET: + /* set read ahead for block device */ + if (!capable(CAP_SYS_ADMIN)) return -EACCES; + +#ifdef DEBUG_IOCTL + printk(KERN_DEBUG + "%s -- lvm_blk_ioctl -- BLKRASET: %d sectors for %02X:%02X\n", + lvm_name, (long) arg, MAJOR(inode->i_rdev), minor); +#endif + if ((long) arg < LVM_MIN_READ_AHEAD || + (long) arg > LVM_MAX_READ_AHEAD) + return -EINVAL; + read_ahead[MAJOR_NR] = lv_ptr->lv_read_ahead = (long) arg; + break; + + + case BLKRAGET: + /* get current read ahead setting */ +#ifdef DEBUG_IOCTL + printk(KERN_DEBUG + "%s -- lvm_blk_ioctl -- BLKRAGET\n", lvm_name); +#endif + if (put_user(lv_ptr->lv_read_ahead, (long *)arg)) + return -EFAULT; + break; + + + case HDIO_GETGEO: + /* get disk geometry */ +#ifdef DEBUG_IOCTL + printk(KERN_DEBUG + "%s -- lvm_blk_ioctl -- HDIO_GETGEO\n", lvm_name); +#endif + if (hd == NULL) + return -EINVAL; + { + unsigned char heads = 64; + unsigned char sectors = 32; + long start = 0; + short cylinders = lv_ptr->lv_size / heads / sectors; + + if (copy_to_user((char *) &hd->heads, &heads, + sizeof(heads)) != 0 || + copy_to_user((char *) &hd->sectors, §ors, + sizeof(sectors)) != 0 || + copy_to_user((short *) &hd->cylinders, + &cylinders, sizeof(cylinders)) != 0 || + copy_to_user((long *) &hd->start, &start, + sizeof(start)) != 0) + return -EFAULT; + } + +#ifdef DEBUG_IOCTL + printk(KERN_DEBUG + "%s -- lvm_blk_ioctl -- cylinders: %d\n", + lvm_name, lv_ptr->lv_size / heads / sectors); +#endif + break; + + + case LV_SET_ACCESS: + /* set access flags of a logical volume */ + if (!capable(CAP_SYS_ADMIN)) return -EACCES; + lv_ptr->lv_access = (ulong) arg; + break; + + + case LV_SET_STATUS: + /* set status flags of a logical volume */ + if (!capable(CAP_SYS_ADMIN)) return -EACCES; + if (!((ulong) arg & LV_ACTIVE) && lv_ptr->lv_open > 1) + return -EPERM; + lv_ptr->lv_status = (ulong) arg; + break; + + + case LV_SET_ALLOCATION: + /* set allocation flags of a logical volume */ + if (!capable(CAP_SYS_ADMIN)) return -EACCES; + lv_ptr->lv_allocation = (ulong) arg; + break; + + + default: + printk(KERN_WARNING + "%s -- lvm_blk_ioctl: unknown command %d\n", + lvm_name, command); + return -EINVAL; + } + + return 0; +} /* lvm_blk_ioctl() */ + + +/* + * block device close routine + */ +static int lvm_blk_close(struct inode *inode, struct file *file) +{ + int minor = MINOR(inode->i_rdev); + vg_t *vg_ptr = vg[VG_BLK(minor)]; + lv_t *lv_ptr = vg_ptr->lv[LV_BLK(minor)]; + +#ifdef DEBUG + printk(KERN_DEBUG + "%s -- lvm_blk_close MINOR: %d VG#: %d LV#: %d\n", + lvm_name, minor, VG_BLK(minor), LV_BLK(minor)); +#endif + + sync_dev(inode->i_rdev); + if (lv_ptr->lv_open == 1) vg_ptr->lv_open--; + 
lv_ptr->lv_open--; + + MOD_DEC_USE_COUNT; + + return 0; +} /* lvm_blk_close() */ + + +#if defined CONFIG_LVM_PROC_FS && defined CONFIG_PROC_FS +/* + * Support function /proc-Filesystem + */ +#define LVM_PROC_BUF ( i == 0 ? dummy_buf : &buf[sz]) + +static int lvm_proc_get_info(char *page, char **start, off_t pos, int count) +{ + int c, i, l, p, v, vg_counter, pv_counter, lv_counter, lv_open_counter, + lv_open_total, pe_t_bytes, lv_block_exception_t_bytes, seconds; + static off_t sz; + off_t sz_last; + char allocation_flag, inactive_flag, rw_flag, stripes_flag; + char *lv_name, *pv_name; + static char *buf = NULL; + static char dummy_buf[160]; /* sized for 2 lines */ + vg_t *vg_ptr; + lv_t *lv_ptr; + pv_t *pv_ptr; + + +#ifdef DEBUG_LVM_PROC_GET_INFO + printk(KERN_DEBUG + "%s - lvm_proc_get_info CALLED pos: %lu count: %d whence: %d\n", + lvm_name, pos, count, whence); +#endif + + if (pos == 0 || buf == NULL) { + sz_last = vg_counter = pv_counter = lv_counter = lv_open_counter = \ + lv_open_total = pe_t_bytes = lv_block_exception_t_bytes = 0; + + /* search for activity */ + for (v = 0; v < ABS_MAX_VG; v++) { + if ((vg_ptr = vg[v]) != NULL) { + vg_counter++; + pv_counter += vg_ptr->pv_cur; + lv_counter += vg_ptr->lv_cur; + if (vg_ptr->lv_cur > 0) { + for (l = 0; l < vg[v]->lv_max; l++) { + if ((lv_ptr = vg_ptr->lv[l]) != NULL) { + pe_t_bytes += lv_ptr->lv_allocated_le; + if (lv_ptr->lv_block_exception != NULL) + lv_block_exception_t_bytes += lv_ptr->lv_remap_end; + if (lv_ptr->lv_open > 0) { + lv_open_counter++; + lv_open_total += lv_ptr->lv_open; + } + } + } + } + } + } + pe_t_bytes *= sizeof(pe_t); + lv_block_exception_t_bytes *= sizeof(lv_block_exception_t); + + if (buf != NULL) { +#ifdef DEBUG_KFREE + printk(KERN_DEBUG + "%s -- kfree %d\n", lvm_name, __LINE__); +#endif + kfree(buf); + buf = NULL; + } + /* 2 times: first to get size to allocate buffer, + 2nd to fill the malloced buffer */ + for (i = 0; i < 2; i++) { + sz = 0; + sz += sprintf(LVM_PROC_BUF, + "LVM " +#ifdef MODULE + "module" +#else + "driver" +#endif + " %s\n\n" + "Total: %d VG%s %d PV%s %d LV%s ", + lvm_short_version, + vg_counter, vg_counter == 1 ? "" : "s", + pv_counter, pv_counter == 1 ? "" : "s", + lv_counter, lv_counter == 1 ? "" : "s"); + sz += sprintf(LVM_PROC_BUF, + "(%d LV%s open", + lv_open_counter, + lv_open_counter == 1 ? "" : "s"); + if (lv_open_total > 0) + sz += sprintf(LVM_PROC_BUF, + " %d times)\n", + lv_open_total); + else + sz += sprintf(LVM_PROC_BUF, ")"); + sz += sprintf(LVM_PROC_BUF, + "\nGlobal: %lu bytes malloced IOP version: %d ", + vg_counter * sizeof(vg_t) + + pv_counter * sizeof(pv_t) + + lv_counter * sizeof(lv_t) + + pe_t_bytes + lv_block_exception_t_bytes + sz_last, + lvm_iop_version); + + seconds = CURRENT_TIME - loadtime; + if (seconds < 0) + loadtime = CURRENT_TIME + seconds; + if (seconds / 86400 > 0) { + sz += sprintf(LVM_PROC_BUF, "%d day%s ", + seconds / 86400, + seconds / 86400 == 0 || + seconds / 86400 > 1 ? 
"s" : ""); + } + sz += sprintf(LVM_PROC_BUF, "%d:%02d:%02d active\n", + (seconds % 86400) / 3600, + (seconds % 3600) / 60, + seconds % 60); + + if (vg_counter > 0) { + for (v = 0; v < ABS_MAX_VG; v++) { + /* volume group */ + if ((vg_ptr = vg[v]) != NULL) { + inactive_flag = ' '; + if (!(vg_ptr->vg_status & VG_ACTIVE)) inactive_flag = 'I'; + sz += sprintf(LVM_PROC_BUF, + "\nVG: %c%s [%d PV, %d LV/%d open] " + " PE Size: %d KB\n" + " Usage [KB/PE]: %d /%d total " + "%d /%d used %d /%d free", + inactive_flag, + vg_ptr->vg_name, + vg_ptr->pv_cur, + vg_ptr->lv_cur, + vg_ptr->lv_open, + vg_ptr->pe_size >> 1, + vg_ptr->pe_size * vg_ptr->pe_total >> 1, + vg_ptr->pe_total, + vg_ptr->pe_allocated * vg_ptr->pe_size >> 1, + vg_ptr->pe_allocated, + (vg_ptr->pe_total - vg_ptr->pe_allocated) * + vg_ptr->pe_size >> 1, + vg_ptr->pe_total - vg_ptr->pe_allocated); + + /* physical volumes */ + sz += sprintf(LVM_PROC_BUF, + "\n PV%s ", + vg_ptr->pv_cur == 1 ? ": " : "s:"); + c = 0; + for (p = 0; p < vg_ptr->pv_max; p++) { + if ((pv_ptr = vg_ptr->pv[p]) != NULL) { + inactive_flag = 'A'; + if (!(pv_ptr->pv_status & PV_ACTIVE)) + inactive_flag = 'I'; + allocation_flag = 'A'; + if (!(pv_ptr->pv_allocatable & PV_ALLOCATABLE)) + allocation_flag = 'N'; + pv_name = strchr(pv_ptr->pv_name+1,'/'); + if ( pv_name == 0) pv_name = pv_ptr->pv_name; + else pv_name++; + sz += sprintf(LVM_PROC_BUF, + "[%c%c] %-21s %8d /%-6d " + "%8d /%-6d %8d /%-6d", + inactive_flag, + allocation_flag, + pv_name, + pv_ptr->pe_total * + pv_ptr->pe_size >> 1, + pv_ptr->pe_total, + pv_ptr->pe_allocated * + pv_ptr->pe_size >> 1, + pv_ptr->pe_allocated, + (pv_ptr->pe_total - + pv_ptr->pe_allocated) * + pv_ptr->pe_size >> 1, + pv_ptr->pe_total - + pv_ptr->pe_allocated); + c++; + if (c < vg_ptr->pv_cur) + sz += sprintf(LVM_PROC_BUF, + "\n "); + } + } + + /* logical volumes */ + sz += sprintf(LVM_PROC_BUF, + "\n LV%s ", + vg_ptr->lv_cur == 1 ? 
": " : "s:"); + c = 0; + for (l = 0; l < vg[v]->lv_max; l++) { + if ((lv_ptr = vg_ptr->lv[l]) != NULL) { + inactive_flag = 'A'; + if (!(lv_ptr->lv_status & LV_ACTIVE)) + inactive_flag = 'I'; + rw_flag = 'R'; + if (lv_ptr->lv_access & LV_WRITE) + rw_flag = 'W'; + allocation_flag = 'D'; + if (lv_ptr->lv_allocation & LV_CONTIGUOUS) + allocation_flag = 'C'; + stripes_flag = 'L'; + if (lv_ptr->lv_stripes > 1) + stripes_flag = 'S'; + sz += sprintf(LVM_PROC_BUF, + "[%c%c%c%c", + inactive_flag, + rw_flag, + allocation_flag, + stripes_flag); + if (lv_ptr->lv_stripes > 1) + sz += sprintf(LVM_PROC_BUF, "%-2d", + lv_ptr->lv_stripes); + else + sz += sprintf(LVM_PROC_BUF, " "); + lv_name = strrchr(lv_ptr->lv_name, '/'); + if ( lv_name == 0) lv_name = lv_ptr->lv_name; + else lv_name++; + sz += sprintf(LVM_PROC_BUF, "] %-25s", lv_name); + if (strlen(lv_name) > 25) + sz += sprintf(LVM_PROC_BUF, + "\n "); + sz += sprintf(LVM_PROC_BUF, "%9d /%-6d ", + lv_ptr->lv_size >> 1, + lv_ptr->lv_size / vg[v]->pe_size); + + if (lv_ptr->lv_open == 0) + sz += sprintf(LVM_PROC_BUF, "close"); + else + sz += sprintf(LVM_PROC_BUF, "%dx open", + lv_ptr->lv_open); + c++; + if (c < vg_ptr->lv_cur) + sz += sprintf(LVM_PROC_BUF, + "\n "); + } + } + if (vg_ptr->lv_cur == 0) sz += sprintf(LVM_PROC_BUF, "none"); + sz += sprintf(LVM_PROC_BUF, "\n"); + } + } + } + if (buf == NULL) { + if ((buf = vmalloc(sz)) == NULL) { + sz = 0; + return sprintf(page, "%s - vmalloc error at line %d\n", + lvm_name, __LINE__); + } + } + sz_last = sz; + } + } + if (pos > sz - 1) { + vfree(buf); + buf = NULL; + return 0; + } + *start = &buf[pos]; + if (sz - pos < count) + return sz - pos; + else + return count; +} /* lvm_proc_get_info() */ +#endif /* #if defined CONFIG_LVM_PROC_FS && defined CONFIG_PROC_FS */ + + +/* + * block device support function for /usr/src/linux/drivers/block/ll_rw_blk.c + * (see init_module/lvm_init) + */ +static int lvm_map(struct buffer_head *bh, int rw) +{ + int minor = MINOR(bh->b_rdev); + ulong index; + ulong pe_start; + ulong size = bh->b_size >> 9; + ulong rsector_tmp = bh->b_rsector; + ulong rsector_sav; + kdev_t rdev_tmp = bh->b_rdev; + kdev_t rdev_sav; + lv_t *lv = vg[VG_BLK(minor)]->lv[LV_BLK(minor)]; + + + if (!(lv->lv_status & LV_ACTIVE)) { + printk(KERN_ALERT + "%s - lvm_map: ll_rw_blk for inactive LV %s\n", + lvm_name, lv->lv_name); + goto error; + } +/* + if ( lv->lv_access & LV_SNAPSHOT) + printk ( "%s -- %02d:%02d block: %lu rw: %d\n", lvm_name, MAJOR ( bh->b_dev), MINOR ( bh->b_dev), bh->b_blocknr, rw); + */ + + /* take care of snapshot chunk writes before + check for writable logical volume */ + if ((lv->lv_access & LV_SNAPSHOT) && + MAJOR(bh->b_rdev) != 0 && + MAJOR(bh->b_rdev) != MAJOR_NR && + (rw == WRITEA || rw == WRITE)) + { + printk ( "%s -- doing snapshot write for %02d:%02d[%02d:%02d] b_blocknr: %lu b_rsector: %lu\n", lvm_name, MAJOR ( bh->b_dev), MINOR ( bh->b_dev), MAJOR ( bh->b_rdev), MINOR ( bh->b_rdev), bh->b_blocknr, bh->b_rsector); + goto error; + } + + if ((rw == WRITE || rw == WRITEA) && + !(lv->lv_access & LV_WRITE)) { + printk(KERN_CRIT + "%s - lvm_map: ll_rw_blk write for readonly LV %s\n", + lvm_name, lv->lv_name); + goto error; + } +#ifdef DEBUG_MAP + printk(KERN_DEBUG + "%s - lvm_map minor:%d *rdev: %02d:%02d *rsector: %lu " + "size:%lu\n", + lvm_name, minor, + MAJOR(rdev_tmp), + MINOR(rdev_tmp), + rsector_tmp, size); +#endif + + if (rsector_tmp + size > lv->lv_size) { + printk(KERN_ALERT + "%s - lvm_map *rsector: %lu or size: %lu wrong for" + " minor: %2d\n", lvm_name, rsector_tmp, size, 
minor); + goto error; + } + rsector_sav = rsector_tmp; + rdev_sav = rdev_tmp; + +lvm_second_remap: + /* linear mapping */ + if (lv->lv_stripes < 2) { + /* get the index */ + index = rsector_tmp / vg[VG_BLK(minor)]->pe_size; + pe_start = lv->lv_current_pe[index].pe; + rsector_tmp = lv->lv_current_pe[index].pe + + (rsector_tmp % vg[VG_BLK(minor)]->pe_size); + rdev_tmp = lv->lv_current_pe[index].dev; + +#ifdef DEBUG_MAP + printk(KERN_DEBUG + "lv_current_pe[%ld].pe: %ld rdev: %02d:%02d rsector:%ld\n", + index, + lv->lv_current_pe[index].pe, + MAJOR(rdev_tmp), + MINOR(rdev_tmp), + rsector_tmp); +#endif + + /* striped mapping */ + } else { + ulong stripe_index; + ulong stripe_length; + + stripe_length = vg[VG_BLK(minor)]->pe_size * lv->lv_stripes; + stripe_index = (rsector_tmp % stripe_length) / lv->lv_stripesize; + index = rsector_tmp / stripe_length + + (stripe_index % lv->lv_stripes) * + (lv->lv_allocated_le / lv->lv_stripes); + pe_start = lv->lv_current_pe[index].pe; + rsector_tmp = lv->lv_current_pe[index].pe + + (rsector_tmp % stripe_length) - + (stripe_index % lv->lv_stripes) * lv->lv_stripesize - + stripe_index / lv->lv_stripes * + (lv->lv_stripes - 1) * lv->lv_stripesize; + rdev_tmp = lv->lv_current_pe[index].dev; + } + +#ifdef DEBUG_MAP + printk(KERN_DEBUG + "lv_current_pe[%ld].pe: %ld rdev: %02d:%02d rsector:%ld\n" + "stripe_length: %ld stripe_index: %ld\n", + index, + lv->lv_current_pe[index].pe, + MAJOR(rdev_tmp), + MINOR(rdev_tmp), + rsector_tmp, + stripe_length, + stripe_index); +#endif + + /* handle physical extents on the move */ + if (pe_lock_req.lock == LOCK_PE) { + if (rdev_tmp == pe_lock_req.data.pv_dev && + rsector_tmp >= pe_lock_req.data.pv_offset && + rsector_tmp < (pe_lock_req.data.pv_offset + + vg[VG_BLK(minor)]->pe_size)) { + sleep_on(&lvm_map_wait); + rsector_tmp = rsector_sav; + rdev_tmp = rdev_sav; + goto lvm_second_remap; + } + } + /* statistic */ + if (rw == WRITE || rw == WRITEA) + lv->lv_current_pe[index].writes++; + else + lv->lv_current_pe[index].reads++; + + /* snapshot volume exception handling on physical device address base */ + if (lv->lv_access & (LV_SNAPSHOT | LV_SNAPSHOT_ORG)) { + /* original logical volume */ + if (lv->lv_access & LV_SNAPSHOT_ORG) { + if (rw == WRITE || rw == WRITEA) + { + lv_t *lv_ptr; + + /* start with first snapshot and loop thrugh all of them */ + for (lv_ptr = lv->lv_snapshot_next; + lv_ptr != NULL; + lv_ptr = lv_ptr->lv_snapshot_next) { + down(&lv->lv_snapshot_org->lv_snapshot_sem); + /* do we still have exception storage for this snapshot free? 
*/ + if (lv_ptr->lv_block_exception != NULL) { + rdev_sav = rdev_tmp; + rsector_sav = rsector_tmp; + if (!lvm_snapshot_remap_block(&rdev_tmp, + &rsector_tmp, + pe_start, + lv_ptr)) { + /* create a new mapping */ + lvm_snapshot_COW(rdev_tmp, + rsector_tmp, + pe_start, + rsector_sav, + lv_ptr); + } + rdev_tmp = rdev_sav; + rsector_tmp = rsector_sav; + } + up(&lv->lv_snapshot_org->lv_snapshot_sem); + } + } + } else { + /* remap snapshot logical volume */ + down(&lv->lv_snapshot_sem); + if (lv->lv_block_exception != NULL) + lvm_snapshot_remap_block(&rdev_tmp, &rsector_tmp, pe_start, lv); + up(&lv->lv_snapshot_sem); + } + } + bh->b_rdev = rdev_tmp; + bh->b_rsector = rsector_tmp; + + return 1; + + error: + buffer_IO_error(bh); + return -1; +} /* lvm_map() */ + + +/* + * internal support functions + */ + +#ifdef LVM_HD_NAME +/* + * generate "hard disk" name + */ +void lvm_hd_name(char *buf, int minor) +{ + int len = 0; + lv_t *lv_ptr; + + if (vg[VG_BLK(minor)] == NULL || + (lv_ptr = vg[VG_BLK(minor)]->lv[LV_BLK(minor)]) == NULL) + return; + len = strlen(lv_ptr->lv_name) - 5; + memcpy(buf, &lv_ptr->lv_name[5], len); + buf[len] = 0; + return; +} +#endif + + +/* + * this one never should be called... + */ +static void lvm_dummy_device_request(request_queue_t * t) +{ + printk(KERN_EMERG + "%s -- oops, got lvm request for %02d:%02d [sector: %lu]\n", + lvm_name, + MAJOR(CURRENT->rq_dev), + MINOR(CURRENT->rq_dev), + CURRENT->sector); + return; +} + + +/* + * make request function + */ +static int lvm_make_request_fn(request_queue_t *q, int rw, struct buffer_head *bh) +{ + lvm_map(bh, rw); + return 1; +} + +/* + * plug device function is a noop because plugging has to happen + * in the queue of the physical blockdevice to allow the + * elevator to do a better job. 
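+ * lvm_make_request_fn() above remaps each buffer head via lvm_map()
+ * and lets the generic block layer pass it on to the underlying
+ * physical device, so LVM itself keeps no request queue that could
+ * be plugged.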
+ */ +static void lvm_plug_device_noop(request_queue_t *q, kdev_t dev) { } + +/******************************************************************** + * + * Character device support functions + * + ********************************************************************/ +/* + * character device support function logical volume manager lock + */ +static int lvm_do_lock_lvm(void) +{ +lock_try_again: + spin_lock(&lvm_lock); + if (lock != 0 && lock != current->pid) { +#ifdef DEBUG_IOCTL + printk(KERN_INFO "lvm_do_lock_lvm: %s is locked by pid %d ...\n", + lvm_name, lock); +#endif + spin_unlock(&lvm_lock); + interruptible_sleep_on(&lvm_wait); + if (current->sigpending != 0) + return -EINTR; +#ifdef LVM_TOTAL_RESET + if (lvm_reset_spindown > 0) + return -EACCES; +#endif + goto lock_try_again; + } + lock = current->pid; + spin_unlock(&lvm_lock); + return 0; +} /* lvm_do_lock_lvm */ + + +/* + * character device support function lock/unlock physical extend + */ +static int lvm_do_pe_lock_unlock(vg_t *vg_ptr, void *arg) +{ + uint p; + + if (vg_ptr == NULL) return -ENXIO; + if (copy_from_user(&pe_lock_req, arg, + sizeof(pe_lock_req_t)) != 0) return -EFAULT; + + switch (pe_lock_req.lock) { + case LOCK_PE: + for (p = 0; p < vg_ptr->pv_max; p++) { + if (vg_ptr->pv[p] != NULL && + pe_lock_req.data.pv_dev == + vg_ptr->pv[p]->pv_dev) + break; + } + if (p == vg_ptr->pv_max) return -ENXIO; + + pe_lock_req.lock = UNLOCK_PE; + fsync_dev(pe_lock_req.data.lv_dev); + pe_lock_req.lock = LOCK_PE; + break; + + case UNLOCK_PE: + pe_lock_req.lock = UNLOCK_PE; + pe_lock_req.data.lv_dev = pe_lock_req.data.pv_dev = 0; + pe_lock_req.data.pv_offset = 0; + wake_up(&lvm_map_wait); + break; + + default: + return -EINVAL; + } + return 0; +} + + +/* + * character device support function logical extend remap + */ +static int lvm_do_le_remap(vg_t *vg_ptr, void *arg) +{ + uint l, le; + lv_t *lv_ptr; + + if (vg_ptr == NULL) return -ENXIO; + if (copy_from_user(&le_remap_req, arg, + sizeof(le_remap_req_t)) != 0) + return -EFAULT; + + for (l = 0; l < vg_ptr->lv_max; l++) { + lv_ptr = vg_ptr->lv[l]; + if (lv_ptr != NULL && + strcmp(lv_ptr->lv_name, + le_remap_req.lv_name) == 0) { + for (le = 0; le < lv_ptr->lv_allocated_le; + le++) { + if (lv_ptr->lv_current_pe[le].dev == + le_remap_req.old_dev && + lv_ptr->lv_current_pe[le].pe == + le_remap_req.old_pe) { + lv_ptr->lv_current_pe[le].dev = + le_remap_req.new_dev; + lv_ptr->lv_current_pe[le].pe = + le_remap_req.new_pe; + return 0; + } + } + return -EINVAL; + } + } + return -ENXIO; +} /* lvm_do_le_remap() */ + + +/* + * character device support function VGDA create + */ +int lvm_do_vg_create(int minor, void *arg) +{ + int snaporg_minor = 0; + ulong l, p; + lv_t lv; + vg_t *vg_ptr; + pv_t *pv_ptr; + lv_t *lv_ptr; + + if (vg[VG_CHR(minor)] != NULL) return -EPERM; + + if ((vg_ptr = kmalloc(sizeof(vg_t),GFP_KERNEL)) == NULL) { + printk(KERN_CRIT + "%s -- VG_CREATE: kmalloc error VG at line %d\n", + lvm_name, __LINE__); + return -ENOMEM; + } + /* get the volume group structure */ + if (copy_from_user(vg_ptr, arg, sizeof(vg_t)) != 0) { + kfree(vg_ptr); + return -EFAULT; + } + + vg_devfs_handle[vg_ptr->vg_number] = devfs_mk_dir(0, vg_ptr->vg_name, NULL); + ch_devfs_handle[vg_ptr->vg_number] = devfs_register( + vg_devfs_handle[vg_ptr->vg_number] , "group", + DEVFS_FL_DEFAULT, LVM_CHAR_MAJOR, vg_ptr->vg_number, + S_IFCHR | S_IRUSR | S_IWUSR | S_IRGRP, + &lvm_chr_fops, NULL); + + /* we are not that active so far... 
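+ VG_ACTIVE stays cleared while the PV and LV structures are copied
+ in from user space; it is set again at the end of this function
+ once the whole VGDA has been built.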
*/ + vg_ptr->vg_status &= ~VG_ACTIVE; + vg[VG_CHR(minor)] = vg_ptr; + + vg[VG_CHR(minor)]->pe_allocated = 0; + if (vg_ptr->pv_max > ABS_MAX_PV) { + printk(KERN_WARNING + "%s -- Can't activate VG: ABS_MAX_PV too small\n", + lvm_name); + kfree(vg_ptr); + vg[VG_CHR(minor)] = NULL; + return -EPERM; + } + if (vg_ptr->lv_max > ABS_MAX_LV) { + printk(KERN_WARNING + "%s -- Can't activate VG: ABS_MAX_LV too small for %u\n", + lvm_name, vg_ptr->lv_max); + kfree(vg_ptr); + vg_ptr = NULL; + return -EPERM; + } + /* get the physical volume structures */ + vg_ptr->pv_act = vg_ptr->pv_cur = 0; + for (p = 0; p < vg_ptr->pv_max; p++) { + /* user space address */ + if ((pvp = vg_ptr->pv[p]) != NULL) { + pv_ptr = vg_ptr->pv[p] = kmalloc(sizeof(pv_t),GFP_KERNEL); + if (pv_ptr == NULL) { + printk(KERN_CRIT + "%s -- VG_CREATE: kmalloc error PV at line %d\n", + lvm_name, __LINE__); + lvm_do_vg_remove(minor); + return -ENOMEM; + } + if (copy_from_user(pv_ptr, pvp, sizeof(pv_t)) != 0) { + lvm_do_vg_remove(minor); + return -EFAULT; + } + /* We don't need the PE list + in kernel space as with LVs pe_t list (see below) */ + pv_ptr->pe = NULL; + pv_ptr->pe_allocated = 0; + pv_ptr->pv_status = PV_ACTIVE; + vg_ptr->pv_act++; + vg_ptr->pv_cur++; + +#ifdef LVM_GET_INODE + /* insert a dummy inode for fs_may_mount */ + pv_ptr->inode = lvm_get_inode(pv_ptr->pv_dev); +#endif + } + } + + /* get the logical volume structures */ + vg_ptr->lv_cur = 0; + for (l = 0; l < vg_ptr->lv_max; l++) { + /* user space address */ + if ((lvp = vg_ptr->lv[l]) != NULL) { + if (copy_from_user(&lv, lvp, sizeof(lv_t)) != 0) { + lvm_do_vg_remove(minor); + return -EFAULT; + } + vg_ptr->lv[l] = NULL; + if (lvm_do_lv_create(minor, lv.lv_name, &lv) != 0) { + lvm_do_vg_remove(minor); + return -EFAULT; + } + } + } + + /* Second path to correct snapshot logical volumes which are not + in place during first path above */ + for (l = 0; l < vg_ptr->lv_max; l++) { + if ((lv_ptr = vg_ptr->lv[l]) != NULL && + vg_ptr->lv[l]->lv_access & LV_SNAPSHOT) { + snaporg_minor = lv_ptr->lv_snapshot_minor; + if (vg_ptr->lv[LV_BLK(snaporg_minor)] != NULL) { + /* get pointer to original logical volume */ + lv_ptr = vg_ptr->lv[l]->lv_snapshot_org = + vg_ptr->lv[LV_BLK(snaporg_minor)]; + + /* set necessary fields of original logical volume */ + lv_ptr->lv_access |= LV_SNAPSHOT_ORG; + lv_ptr->lv_snapshot_minor = 0; + lv_ptr->lv_snapshot_org = lv_ptr; + lv_ptr->lv_snapshot_prev = NULL; + + /* find last snapshot logical volume in the chain */ + while (lv_ptr->lv_snapshot_next != NULL) + lv_ptr = lv_ptr->lv_snapshot_next; + + /* set back pointer to this last one in our new logical volume */ + vg_ptr->lv[l]->lv_snapshot_prev = lv_ptr; + + /* last logical volume now points to our new snapshot volume */ + lv_ptr->lv_snapshot_next = vg_ptr->lv[l]; + + /* now point to the new one */ + lv_ptr = lv_ptr->lv_snapshot_next; + + /* set necessary fields of new snapshot logical volume */ + lv_ptr->lv_snapshot_next = NULL; + lv_ptr->lv_current_pe = + vg_ptr->lv[LV_BLK(snaporg_minor)]->lv_current_pe; + lv_ptr->lv_allocated_le = + vg_ptr->lv[LV_BLK(snaporg_minor)]->lv_allocated_le; + lv_ptr->lv_current_le = + vg_ptr->lv[LV_BLK(snaporg_minor)]->lv_current_le; + lv_ptr->lv_size = + vg_ptr->lv[LV_BLK(snaporg_minor)]->lv_size; + } + } + } + + vg_count++; + + /* let's go active */ + vg_ptr->vg_status |= VG_ACTIVE; + + MOD_INC_USE_COUNT; + + return 0; +} /* lvm_do_vg_create() */ + + +/* + * character device support function VGDA extend + */ +static int lvm_do_vg_extend(vg_t *vg_ptr, void *arg) +{ + uint 
p; + pv_t *pv_ptr; + + if (vg_ptr == NULL) return -ENXIO; + if (vg_ptr->pv_cur < vg_ptr->pv_max) { + for (p = 0; p < vg_ptr->pv_max; p++) { + if (vg_ptr->pv[p] == NULL) { + if ((pv_ptr = vg_ptr->pv[p] = kmalloc(sizeof(pv_t),GFP_KERNEL)) == NULL) { + printk(KERN_CRIT + "%s -- VG_EXTEND: kmalloc error PV at line %d\n", + lvm_name, __LINE__); + return -ENOMEM; + } + if (copy_from_user(pv_ptr, arg, sizeof(pv_t)) != 0) { + kfree(pv_ptr); + vg_ptr->pv[p] = NULL; + return -EFAULT; + } + + pv_ptr->pv_status = PV_ACTIVE; + /* We don't need the PE list + in kernel space like LVs pe_t list */ + pv_ptr->pe = NULL; + vg_ptr->pv_cur++; + vg_ptr->pv_act++; + vg_ptr->pe_total += + pv_ptr->pe_total; +#ifdef LVM_GET_INODE + /* insert a dummy inode for fs_may_mount */ + pv_ptr->inode = lvm_get_inode(pv_ptr->pv_dev); +#endif + return 0; + } + } + } +return -EPERM; +} /* lvm_do_vg_extend() */ + + +/* + * character device support function VGDA reduce + */ +static int lvm_do_vg_reduce(vg_t *vg_ptr, void *arg) +{ + uint p; + pv_t *pv_ptr; + + if (vg_ptr == NULL) return -ENXIO; + if (copy_from_user(pv_name, arg, sizeof(pv_name)) != 0) + return -EFAULT; + + for (p = 0; p < vg_ptr->pv_max; p++) { + pv_ptr = vg_ptr->pv[p]; + if (pv_ptr != NULL && + strcmp(pv_ptr->pv_name, + pv_name) == 0) { + if (pv_ptr->lv_cur > 0) return -EPERM; + vg_ptr->pe_total -= + pv_ptr->pe_total; + vg_ptr->pv_cur--; + vg_ptr->pv_act--; +#ifdef LVM_GET_INODE + lvm_clear_inode(pv_ptr->inode); +#endif + kfree(pv_ptr); + /* Make PV pointer array contiguous */ + for (; p < vg_ptr->pv_max - 1; p++) + vg_ptr->pv[p] = vg_ptr->pv[p + 1]; + vg_ptr->pv[p + 1] = NULL; + return 0; + } + } + return -ENXIO; +} /* lvm_do_vg_reduce */ + + +/* + * character device support function VGDA remove + */ +static int lvm_do_vg_remove(int minor) +{ + int i; + vg_t *vg_ptr = vg[VG_CHR(minor)]; + pv_t *pv_ptr; + + if (vg_ptr == NULL) return -ENXIO; + +#ifdef LVM_TOTAL_RESET + if (vg_ptr->lv_open > 0 && lvm_reset_spindown == 0) +#else + if (vg_ptr->lv_open > 0) +#endif + return -EPERM; + + /* let's go inactive */ + vg_ptr->vg_status &= ~VG_ACTIVE; + + devfs_unregister (ch_devfs_handle[vg_ptr->vg_number]); + devfs_unregister (vg_devfs_handle[vg_ptr->vg_number]); + + /* free LVs */ + /* first free snapshot logical volumes */ + for (i = 0; i < vg_ptr->lv_max; i++) { + if (vg_ptr->lv[i] != NULL && + vg_ptr->lv[i]->lv_access & LV_SNAPSHOT) { + lvm_do_lv_remove(minor, NULL, i); + current->state = TASK_UNINTERRUPTIBLE; + schedule_timeout(1); + } + } + /* then free the rest of the LVs */ + for (i = 0; i < vg_ptr->lv_max; i++) { + if (vg_ptr->lv[i] != NULL) { + lvm_do_lv_remove(minor, NULL, i); + current->state = TASK_UNINTERRUPTIBLE; + schedule_timeout(1); + } + } + + /* free PVs */ + for (i = 0; i < vg_ptr->pv_max; i++) { + if ((pv_ptr = vg_ptr->pv[i]) != NULL) { +#ifdef DEBUG_KFREE + printk(KERN_DEBUG + "%s -- kfree %d\n", lvm_name, __LINE__); +#endif +#ifdef LVM_GET_INODE + lvm_clear_inode(pv_ptr->inode); +#endif + kfree(pv_ptr); + vg[VG_CHR(minor)]->pv[i] = NULL; + } + } + +#ifdef DEBUG_KFREE + printk(KERN_DEBUG "%s -- kfree %d\n", lvm_name, __LINE__); +#endif + kfree(vg_ptr); + vg[VG_CHR(minor)] = NULL; + + vg_count--; + + MOD_DEC_USE_COUNT; + + return 0; +} /* lvm_do_vg_remove() */ + + +/* + * character device support function logical volume create + */ +static int lvm_do_lv_create(int minor, char *lv_name, lv_t *lv) +{ + int l, le, l_new, p, size; + ulong lv_status_save; + char *lv_tmp, *lv_buf; + lv_block_exception_t *lvbe = lv->lv_block_exception; + vg_t *vg_ptr = 
vg[VG_CHR(minor)]; + lv_t *lv_ptr = NULL; + + if ((pep = lv->lv_current_pe) == NULL) return -EINVAL; + if (lv->lv_chunk_size > LVM_SNAPSHOT_MAX_CHUNK) + return -EINVAL; + + for (l = 0; l < vg_ptr->lv_max; l++) { + if (vg_ptr->lv[l] != NULL && + strcmp(vg_ptr->lv[l]->lv_name, lv_name) == 0) + return -EEXIST; + } + + /* in case of lv_remove(), lv_create() pair; for eg. lvrename does this */ + l_new = -1; + if (vg_ptr->lv[lv->lv_number] == NULL) + l_new = lv->lv_number; + else { + for (l = 0; l < vg_ptr->lv_max; l++) { + if (vg_ptr->lv[l] == NULL) + if (l_new == -1) l_new = l; + } + } + if (l_new == -1) return -EPERM; + else l = l_new; + + if ((lv_ptr = kmalloc(sizeof(lv_t),GFP_KERNEL)) == NULL) {; + printk(KERN_CRIT "%s -- LV_CREATE: kmalloc error LV at line %d\n", + lvm_name, __LINE__); + return -ENOMEM; + } + /* copy preloaded LV */ + memcpy((char *) lv_ptr, (char *) lv, sizeof(lv_t)); + + lv_status_save = lv_ptr->lv_status; + lv_ptr->lv_status &= ~LV_ACTIVE; + lv_ptr->lv_snapshot_org = \ + lv_ptr->lv_snapshot_prev = \ + lv_ptr->lv_snapshot_next = NULL; + lv_ptr->lv_block_exception = NULL; + init_MUTEX(&lv_ptr->lv_snapshot_sem); + vg_ptr->lv[l] = lv_ptr; + + /* get the PE structures from user space if this + is no snapshot logical volume */ + if (!(lv_ptr->lv_access & LV_SNAPSHOT)) { + size = lv_ptr->lv_allocated_le * sizeof(pe_t); + if ((lv_ptr->lv_current_pe = vmalloc(size)) == NULL) { + printk(KERN_CRIT + "%s -- LV_CREATE: vmalloc error LV_CURRENT_PE of %d Byte " + "at line %d\n", + lvm_name, size, __LINE__); +#ifdef DEBUG_KFREE + printk(KERN_DEBUG "%s -- kfree %d\n", lvm_name, __LINE__); +#endif + kfree(lv_ptr); + vg[VG_CHR(minor)]->lv[l] = NULL; + return -ENOMEM; + } + if (copy_from_user(lv_ptr->lv_current_pe, pep, size)) { + vfree(lv_ptr->lv_current_pe); + kfree(lv_ptr); + vg_ptr->lv[l] = NULL; + return -EFAULT; + } + /* correct the PE count in PVs */ + for (le = 0; le < lv_ptr->lv_allocated_le; le++) { + vg_ptr->pe_allocated++; + for (p = 0; p < vg_ptr->pv_cur; p++) { + if (vg_ptr->pv[p]->pv_dev == + lv_ptr->lv_current_pe[le].dev) + vg_ptr->pv[p]->pe_allocated++; + } + } + } else { + /* Get snapshot exception data and block list */ + if (lvbe != NULL) { + lv_ptr->lv_snapshot_org = + vg_ptr->lv[LV_BLK(lv_ptr->lv_snapshot_minor)]; + if (lv_ptr->lv_snapshot_org != NULL) { + size = lv_ptr->lv_remap_end * sizeof(lv_block_exception_t); + if ((lv_ptr->lv_block_exception = vmalloc(size)) == NULL) { + printk(KERN_CRIT + "%s -- lvm_do_lv_create: vmalloc error LV_BLOCK_EXCEPTION " + "of %d byte at line %d\n", + lvm_name, size, __LINE__); +#ifdef DEBUG_KFREE + printk(KERN_DEBUG "%s -- kfree %d\n", lvm_name, __LINE__); +#endif + kfree(lv_ptr); + vg_ptr->lv[l] = NULL; + return -ENOMEM; + } + if (copy_from_user(lv_ptr->lv_block_exception, lvbe, size)) { + vfree(lv_ptr->lv_block_exception); + kfree(lv_ptr); + vg[VG_CHR(minor)]->lv[l] = NULL; + return -EFAULT; + } + /* get pointer to original logical volume */ + lv_ptr = lv_ptr->lv_snapshot_org; + + lv_ptr->lv_snapshot_minor = 0; + lv_ptr->lv_snapshot_org = lv_ptr; + lv_ptr->lv_snapshot_prev = NULL; + /* walk thrugh the snapshot list */ + while (lv_ptr->lv_snapshot_next != NULL) + lv_ptr = lv_ptr->lv_snapshot_next; + /* now lv_ptr points to the last existing snapshot in the chain */ + vg_ptr->lv[l]->lv_snapshot_prev = lv_ptr; + /* our new one now back points to the previous last in the chain */ + lv_ptr = vg_ptr->lv[l]; + /* now lv_ptr points to our new last snapshot logical volume */ + lv_ptr->lv_snapshot_org = 
lv_ptr->lv_snapshot_prev->lv_snapshot_org; + lv_ptr->lv_snapshot_next = NULL; + lv_ptr->lv_current_pe = lv_ptr->lv_snapshot_org->lv_current_pe; + lv_ptr->lv_allocated_le = lv_ptr->lv_snapshot_org->lv_allocated_le; + lv_ptr->lv_current_le = lv_ptr->lv_snapshot_org->lv_current_le; + lv_ptr->lv_size = lv_ptr->lv_snapshot_org->lv_size; + lv_ptr->lv_stripes = lv_ptr->lv_snapshot_org->lv_stripes; + lv_ptr->lv_stripesize = lv_ptr->lv_snapshot_org->lv_stripesize; + { + int err = lvm_snapshot_alloc(lv_ptr); + if (err) + { + vfree(lv_ptr->lv_block_exception); + kfree(lv_ptr); + vg[VG_CHR(minor)]->lv[l] = NULL; + return err; + } + } + } else { + vfree(lv_ptr->lv_block_exception); + kfree(lv_ptr); + vg_ptr->lv[l] = NULL; + return -EFAULT; + } + } else { + kfree(vg_ptr->lv[l]); + vg_ptr->lv[l] = NULL; + return -EINVAL; + } + } /* if ( vg[VG_CHR(minor)]->lv[l]->lv_access & LV_SNAPSHOT) */ + + lv_ptr = vg_ptr->lv[l]; + lvm_gendisk.part[MINOR(lv_ptr->lv_dev)].start_sect = 0; + lvm_gendisk.part[MINOR(lv_ptr->lv_dev)].nr_sects = lv_ptr->lv_size; + lvm_size[MINOR(lv_ptr->lv_dev)] = lv_ptr->lv_size >> 1; + vg_lv_map[MINOR(lv_ptr->lv_dev)].vg_number = vg_ptr->vg_number; + vg_lv_map[MINOR(lv_ptr->lv_dev)].lv_number = lv_ptr->lv_number; + read_ahead[MAJOR_NR] = lv_ptr->lv_read_ahead = LVM_CORRECT_READ_AHEAD(lv_ptr->lv_read_ahead); + vg_ptr->lv_cur++; + lv_ptr->lv_status = lv_status_save; + + strtok(lv->lv_name, "/"); /* /dev */ + + while((lv_tmp = strtok(NULL, "/")) != NULL) + lv_buf = lv_tmp; + + lv_devfs_handle[lv->lv_number] = devfs_register( + vg_devfs_handle[vg_ptr->vg_number], lv_buf, + DEVFS_FL_DEFAULT, LVM_BLK_MAJOR, lv->lv_number, + S_IFBLK | S_IRUSR | S_IWUSR | S_IRGRP, + &lvm_blk_dops, NULL); + + /* optionally add our new snapshot LV */ + if (lv_ptr->lv_access & LV_SNAPSHOT) { + /* sync the original logical volume */ + fsync_dev(lv_ptr->lv_snapshot_org->lv_dev); + /* put ourselve into the chain */ + lv_ptr->lv_snapshot_prev->lv_snapshot_next = lv_ptr; + lv_ptr->lv_snapshot_org->lv_access |= LV_SNAPSHOT_ORG; + } + return 0; +} /* lvm_do_lv_create() */ + + +/* + * character device support function logical volume remove + */ +static int lvm_do_lv_remove(int minor, char *lv_name, int l) +{ + uint le, p; + vg_t *vg_ptr = vg[VG_CHR(minor)]; + lv_t *lv_ptr; + + if (l == -1) { + for (l = 0; l < vg_ptr->lv_max; l++) { + if (vg_ptr->lv[l] != NULL && + strcmp(vg_ptr->lv[l]->lv_name, lv_name) == 0) { + break; + } + } + } + if (l == vg_ptr->lv_max) return -ENXIO; + + lv_ptr = vg_ptr->lv[l]; +#ifdef LVM_TOTAL_RESET + if (lv_ptr->lv_open > 0 && lvm_reset_spindown == 0) +#else + if (lv_ptr->lv_open > 0) +#endif + return -EBUSY; + + /* check for deletion of snapshot source while + snapshot volume still exists */ + if ((lv_ptr->lv_access & LV_SNAPSHOT_ORG) && + lv_ptr->lv_snapshot_next != NULL) + return -EPERM; + + lv_ptr->lv_status |= LV_SPINDOWN; + + /* sync the buffers */ + fsync_dev(lv_ptr->lv_dev); + + lv_ptr->lv_status &= ~LV_ACTIVE; + + /* invalidate the buffers */ + invalidate_buffers(lv_ptr->lv_dev); + + /* reset generic hd */ + lvm_gendisk.part[MINOR(lv_ptr->lv_dev)].start_sect = -1; + lvm_gendisk.part[MINOR(lv_ptr->lv_dev)].nr_sects = 0; + lvm_size[MINOR(lv_ptr->lv_dev)] = 0; + + /* reset VG/LV mapping */ + vg_lv_map[MINOR(lv_ptr->lv_dev)].vg_number = ABS_MAX_VG; + vg_lv_map[MINOR(lv_ptr->lv_dev)].lv_number = -1; + + /* correct the PE count in PVs if this is no snapshot logical volume */ + if (!(lv_ptr->lv_access & LV_SNAPSHOT)) { + /* only if this is no snapshot logical volume because + we share the 
lv_current_pe[] structs with the + original logical volume */ + for (le = 0; le < lv_ptr->lv_allocated_le; le++) { + vg_ptr->pe_allocated--; + for (p = 0; p < vg_ptr->pv_cur; p++) { + if (vg_ptr->pv[p]->pv_dev == + lv_ptr->lv_current_pe[le].dev) + vg_ptr->pv[p]->pe_allocated--; + } + } + vfree(lv_ptr->lv_current_pe); + /* LV_SNAPSHOT */ + } else { + /* remove this snapshot logical volume from the chain */ + lv_ptr->lv_snapshot_prev->lv_snapshot_next = lv_ptr->lv_snapshot_next; + if (lv_ptr->lv_snapshot_next != NULL) { + lv_ptr->lv_snapshot_next->lv_snapshot_prev = + lv_ptr->lv_snapshot_prev; + } + /* no more snapshots? */ + if (lv_ptr->lv_snapshot_org->lv_snapshot_next == NULL) + lv_ptr->lv_snapshot_org->lv_access &= ~LV_SNAPSHOT_ORG; + lvm_snapshot_release(lv_ptr); + } + + devfs_unregister(lv_devfs_handle[lv_ptr->lv_number]); + +#ifdef DEBUG_KFREE + printk(KERN_DEBUG "%s -- kfree %d\n", lvm_name, __LINE__); +#endif + kfree(lv_ptr); + vg_ptr->lv[l] = NULL; + vg_ptr->lv_cur--; + return 0; +} /* lvm_do_lv_remove() */ + + +/* + * character device support function logical volume extend / reduce + */ +static int lvm_do_lv_extend_reduce(int minor, char *lv_name, lv_t *lv) +{ + int l, le, p, size, old_allocated_le; + uint32_t end, lv_status_save; + vg_t *vg_ptr = vg[VG_CHR(minor)]; + lv_t *lv_ptr; + pe_t *pe; + + if ((pep = lv->lv_current_pe) == NULL) return -EINVAL; + + for (l = 0; l < vg_ptr->lv_max; l++) { + if (vg_ptr->lv[l] != NULL && + strcmp(vg_ptr->lv[l]->lv_name, lv_name) == 0) + break; + } + if (l == vg_ptr->lv_max) return -ENXIO; + lv_ptr = vg_ptr->lv[l]; + + /* check for active snapshot */ + if (lv->lv_access & (LV_SNAPSHOT | LV_SNAPSHOT_ORG)) return -EPERM; + + if ((pe = vmalloc(size = lv->lv_current_le * sizeof(pe_t))) == NULL) { + printk(KERN_CRIT + "%s -- lvm_do_lv_extend_reduce: vmalloc error LV_CURRENT_PE " + "of %d Byte at line %d\n", + lvm_name, size, __LINE__); + return -ENOMEM; + } + /* get the PE structures from user space */ + if (copy_from_user(pe, pep, size)) { + vfree(pe); + return -EFAULT; + } + +#ifdef DEBUG + printk(KERN_DEBUG + "%s -- fsync_dev and " + "invalidate_buffers for %s [%s] in %s\n", + lvm_name, lv_ptr->lv_name, + kdevname(lv_ptr->lv_dev), + vg_ptr->vg_name); +#endif + + lv_ptr->lv_status |= LV_SPINDOWN; + fsync_dev(lv_ptr->lv_dev); + lv_ptr->lv_status &= ~LV_ACTIVE; + invalidate_buffers(lv_ptr->lv_dev); + + /* reduce allocation counters on PV(s) */ + for (le = 0; le < lv_ptr->lv_allocated_le; le++) { + vg_ptr->pe_allocated--; + for (p = 0; p < vg_ptr->pv_cur; p++) { + if (vg_ptr->pv[p]->pv_dev == + lv_ptr->lv_current_pe[le].dev) { + vg_ptr->pv[p]->pe_allocated--; + break; + } + } + } + + + /* save pointer to "old" lv/pe pointer array */ + pep1 = lv_ptr->lv_current_pe; + end = lv_ptr->lv_current_le; + + /* save open counter */ + lv_open = lv_ptr->lv_open; + + /* save # of old allocated logical extents */ + old_allocated_le = lv_ptr->lv_allocated_le; + + /* copy preloaded LV */ + lv_status_save = lv->lv_status; + lv->lv_status |= LV_SPINDOWN; + lv->lv_status &= ~LV_ACTIVE; + memcpy((char *) lv_ptr, (char *) lv, sizeof(lv_t)); + lv_ptr->lv_current_pe = pe; + lv_ptr->lv_open = lv_open; + + /* save availiable i/o statistic data */ + /* linear logical volume */ + if (lv_ptr->lv_stripes < 2) { + /* Check what last LE shall be used */ + if (end > lv_ptr->lv_current_le) end = lv_ptr->lv_current_le; + for (le = 0; le < end; le++) { + lv_ptr->lv_current_pe[le].reads = pep1[le].reads; + lv_ptr->lv_current_pe[le].writes = pep1[le].writes; + } + /* striped logical 
volume */ + } else { + uint i, j, source, dest, end, old_stripe_size, new_stripe_size; + + old_stripe_size = old_allocated_le / lv_ptr->lv_stripes; + new_stripe_size = lv_ptr->lv_allocated_le / lv_ptr->lv_stripes; + end = old_stripe_size; + if (end > new_stripe_size) end = new_stripe_size; + for (i = source = dest = 0; + i < lv_ptr->lv_stripes; i++) { + for (j = 0; j < end; j++) { + lv_ptr->lv_current_pe[dest + j].reads = + pep1[source + j].reads; + lv_ptr->lv_current_pe[dest + j].writes = + pep1[source + j].writes; + } + source += old_stripe_size; + dest += new_stripe_size; + } + } + vfree(pep1); + pep1 = NULL; + + + /* extend the PE count in PVs */ + for (le = 0; le < lv_ptr->lv_allocated_le; le++) { + vg_ptr->pe_allocated++; + for (p = 0; p < vg_ptr->pv_cur; p++) { + if (vg_ptr->pv[p]->pv_dev == + vg_ptr->lv[l]->lv_current_pe[le].dev) { + vg_ptr->pv[p]->pe_allocated++; + break; + } + } + } + + lvm_gendisk.part[MINOR(lv_ptr->lv_dev)].start_sect = 0; + lvm_gendisk.part[MINOR(lv_ptr->lv_dev)].nr_sects = lv_ptr->lv_size; + lvm_size[MINOR(lv_ptr->lv_dev)] = lv_ptr->lv_size >> 1; + /* vg_lv_map array doesn't have to be changed here */ + + read_ahead[MAJOR_NR] = lv_ptr->lv_read_ahead = LVM_CORRECT_READ_AHEAD(lv_ptr->lv_read_ahead); + lv_ptr->lv_status = lv_status_save; + + return 0; +} /* lvm_do_lv_extend_reduce() */ + + +/* + * character device support function logical volume status by name + */ +static int lvm_do_lv_status_byname(vg_t *vg_ptr, void *arg) +{ + uint l; + ulong size; + lv_t lv; + lv_t *lv_ptr; + lv_status_byname_req_t lv_status_byname_req; + + if (vg_ptr == NULL) return -ENXIO; + if (copy_from_user(&lv_status_byname_req, arg, + sizeof(lv_status_byname_req_t)) != 0) + return -EFAULT; + + if (lv_status_byname_req.lv == NULL) return -EINVAL; + if (copy_from_user(&lv, lv_status_byname_req.lv, + sizeof(lv_t)) != 0) + return -EFAULT; + + for (l = 0; l < vg_ptr->lv_max; l++) { + lv_ptr = vg_ptr->lv[l]; + if (lv_ptr != NULL && + strcmp(lv_ptr->lv_name, + lv_status_byname_req.lv_name) == 0) { + if (copy_to_user(lv_status_byname_req.lv, + lv_ptr, + sizeof(lv_t)) != 0) + return -EFAULT; + + if (lv.lv_current_pe != NULL) { + size = lv_ptr->lv_allocated_le * + sizeof(pe_t); + if (copy_to_user(lv.lv_current_pe, + lv_ptr->lv_current_pe, + size) != 0) + return -EFAULT; + } + return 0; + } + } + return -ENXIO; +} /* lvm_do_lv_status_byname() */ + + +/* + * character device support function logical volume status by index + */ +static int lvm_do_lv_status_byindex(vg_t *vg_ptr,void *arg) +{ + ulong size; + lv_t lv; + lv_t *lv_ptr; + lv_status_byindex_req_t lv_status_byindex_req; + + if (vg_ptr == NULL) return -ENXIO; + if (copy_from_user(&lv_status_byindex_req, arg, + sizeof(lv_status_byindex_req)) != 0) + return -EFAULT; + + if ((lvp = lv_status_byindex_req.lv) == NULL) + return -EINVAL; + if ( ( lv_ptr = vg_ptr->lv[lv_status_byindex_req.lv_index]) == NULL) + return -ENXIO; + + if (copy_from_user(&lv, lvp, sizeof(lv_t)) != 0) + return -EFAULT; + + if (copy_to_user(lvp, lv_ptr, sizeof(lv_t)) != 0) + return -EFAULT; + + if (lv.lv_current_pe != NULL) { + size = lv_ptr->lv_allocated_le * sizeof(pe_t); + if (copy_to_user(lv.lv_current_pe, + lv_ptr->lv_current_pe, + size) != 0) + return -EFAULT; + } + return 0; +} /* lvm_do_lv_status_byindex() */ + + +/* + * character device support function physical volume change + */ +static int lvm_do_pv_change(vg_t *vg_ptr, void *arg) +{ + uint p; + pv_t *pv_ptr; +#ifdef LVM_GET_INODE + struct inode *inode_sav; +#endif + + if (vg_ptr == NULL) return -ENXIO; + if 
(copy_from_user(&pv_change_req, arg, + sizeof(pv_change_req)) != 0) + return -EFAULT; + + for (p = 0; p < vg_ptr->pv_max; p++) { + pv_ptr = vg_ptr->pv[p]; + if (pv_ptr != NULL && + strcmp(pv_ptr->pv_name, + pv_change_req.pv_name) == 0) { +#ifdef LVM_GET_INODE + inode_sav = pv_ptr->inode; +#endif + if (copy_from_user(pv_ptr, + pv_change_req.pv, + sizeof(pv_t)) != 0) + return -EFAULT; + + /* We don't need the PE list + in kernel space as with LVs pe_t list */ + pv_ptr->pe = NULL; +#ifdef LVM_GET_INODE + pv_ptr->inode = inode_sav; +#endif + return 0; + } + } + return -ENXIO; +} /* lvm_do_pv_change() */ + +/* + * character device support function get physical volume status + */ +static int lvm_do_pv_status(vg_t *vg_ptr, void *arg) +{ + uint p; + pv_t *pv_ptr; + + if (vg_ptr == NULL) return -ENXIO; + if (copy_from_user(&pv_status_req, arg, + sizeof(pv_status_req)) != 0) + return -EFAULT; + + for (p = 0; p < vg_ptr->pv_max; p++) { + pv_ptr = vg_ptr->pv[p]; + if (pv_ptr != NULL && + strcmp(pv_ptr->pv_name, + pv_status_req.pv_name) == 0) { + if (copy_to_user(pv_status_req.pv, + pv_ptr, + sizeof(pv_t)) != 0) + return -EFAULT; + return 0; + } + } + return -ENXIO; +} /* lvm_do_pv_status() */ + + +/* + * support function initialize gendisk variables + */ +#ifdef __initfunc +__initfunc(void lvm_geninit(struct gendisk *lvm_gdisk)) +#else +void __init + lvm_geninit(struct gendisk *lvm_gdisk) +#endif +{ + int i = 0; + +#ifdef DEBUG_GENDISK + printk(KERN_DEBUG "%s -- lvm_gendisk\n", lvm_name); +#endif + + for (i = 0; i < MAX_LV; i++) { + lvm_gendisk.part[i].start_sect = -1; /* avoid partition check */ + lvm_size[i] = lvm_gendisk.part[i].nr_sects = 0; + lvm_blocksizes[i] = BLOCK_SIZE; + } + + blksize_size[MAJOR_NR] = lvm_blocksizes; + blk_size[MAJOR_NR] = lvm_size; + + return; +} /* lvm_gen_init() */ + + +#ifdef LVM_GET_INODE +/* + * support function to get an empty inode + * + * Gets an empty inode to be inserted into the inode hash, + * so that a physical volume can't be mounted. + * This is analog to drivers/block/md.c + * + * Is this the real thing? + * + * No, it's bollocks. md.c tries to do a bit different thing that might + * _somewhat_ work eons ago. Neither does any good these days. mount() couldn't + * care less for icache (it cares only for ->s_root->d_count and if we want + * loopback mounts even that will stop). BTW, with the form used here mount() + * would have to scan the _whole_ icache to detect the attempt - how on the + * Earth could it guess the i_ino of your dummy inode? Official line on the + * exclusion between mount()/swapon()/open()/etc. is Just Don't Do It(tm). + * If you can convince Linus that it's worth changing - fine, then you'll need + * to do blkdev_get()/blkdev_put(). Until then... + */ +struct inode *lvm_get_inode(kdev_t dev) +{ + struct inode *inode_this = NULL; + + /* Lock the device by inserting a dummy inode. 
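+ The inode is hashed below so the device appears to be in use
+ (see the reservations above about how much that actually buys);
+ lvm_clear_inode() drops it again when the PV is released.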
*/ + inode_this = get_empty_inode(); + inode_this->i_dev = dev; + insert_inode_hash(inode_this); + return inode_this; +} + + +/* + * support function to clear an inode + * + */ +void lvm_clear_inode(struct inode *inode) +{ +#ifdef I_FREEING + inode->i_state |= I_FREEING; +#endif + clear_inode(inode); + return; +} +#endif /* #ifdef LVM_GET_INODE */ diff --git a/drivers/md/md.c b/drivers/md/md.c new file mode 100644 index 000000000..4e82a5814 --- /dev/null +++ b/drivers/md/md.c @@ -0,0 +1,3878 @@ +/* + md.c : Multiple Devices driver for Linux + Copyright (C) 1998, 1999, 2000 Ingo Molnar + + completely rewritten, based on the MD driver code from Marc Zyngier + + Changes: + + - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar + - boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net> + - kerneld support by Boris Tobotras <boris@xtalk.msk.su> + - kmod support by: Cyrus Durgin + - RAID0 bugfixes: Mark Anthony Lisher <markal@iname.com> + - Devfs support by Richard Gooch <rgooch@atnf.csiro.au> + + - lots of fixes and improvements to the RAID1/RAID5 and generic + RAID code (such as request based resynchronization): + + Neil Brown <neilb@cse.unsw.edu.au>. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + You should have received a copy of the GNU General Public License + (for example /usr/src/linux/COPYING); if not, write to the Free + Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. +*/ + +#include <linux/module.h> +#include <linux/config.h> +#include <linux/raid/md.h> +#include <linux/raid/xor.h> +#include <linux/devfs_fs_kernel.h> + +#include <linux/init.h> + +#ifdef CONFIG_KMOD +#include <linux/kmod.h> +#endif + +#define __KERNEL_SYSCALLS__ +#include <linux/unistd.h> + +#include <asm/unaligned.h> + +extern asmlinkage int sys_sched_yield(void); +extern asmlinkage long sys_setsid(void); + +#define MAJOR_NR MD_MAJOR +#define MD_DRIVER + +#include <linux/blk.h> + +#define DEBUG 0 +#if DEBUG +# define dprintk(x...) printk(x) +#else +# define dprintk(x...) do { } while(0) +#endif + +static mdk_personality_t *pers[MAX_PERSONALITY] = {NULL, }; + +/* + * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit' + * is 100 KB/sec, so the extra system load does not show up that much. + * Increase it if you want to have more _guaranteed_ speed. Note that + * the RAID driver will use the maximum available bandwith if the IO + * subsystem is idle. There is also an 'absolute maximum' reconstruction + * speed limit - in case reconstruction slows down your system despite + * idle IO detection. + * + * you can change it via /proc/sys/dev/raid/speed_limit_min and _max. 
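+ *
+ * Both limits are in KB/sec and are exported through the sysctl
+ * tables defined right below.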
+ */ + +static int sysctl_speed_limit_min = 100; +static int sysctl_speed_limit_max = 100000; + +static struct ctl_table_header *raid_table_header; + +static ctl_table raid_table[] = { + {DEV_RAID_SPEED_LIMIT_MIN, "speed_limit_min", + &sysctl_speed_limit_min, sizeof(int), 0644, NULL, &proc_dointvec}, + {DEV_RAID_SPEED_LIMIT_MAX, "speed_limit_max", + &sysctl_speed_limit_max, sizeof(int), 0644, NULL, &proc_dointvec}, + {0} +}; + +static ctl_table raid_dir_table[] = { + {DEV_RAID, "raid", NULL, 0, 0555, raid_table}, + {0} +}; + +static ctl_table raid_root_table[] = { + {CTL_DEV, "dev", NULL, 0, 0555, raid_dir_table}, + {0} +}; + +/* + * these have to be allocated separately because external + * subsystems want to have a pre-defined structure + */ +struct hd_struct md_hd_struct[MAX_MD_DEVS]; +static int md_blocksizes[MAX_MD_DEVS]; +static int md_hardsect_sizes[MAX_MD_DEVS]; +static int md_maxreadahead[MAX_MD_DEVS]; +static mdk_thread_t *md_recovery_thread = NULL; + +int md_size[MAX_MD_DEVS] = {0, }; + +extern struct block_device_operations md_fops; +static devfs_handle_t devfs_handle = NULL; + +static struct gendisk md_gendisk= +{ + major: MD_MAJOR, + major_name: "md", + minor_shift: 0, + max_p: 1, + part: md_hd_struct, + sizes: md_size, + nr_real: MAX_MD_DEVS, + real_devices: NULL, + next: NULL, + fops: &md_fops, +}; + +/* + * Enables to iterate over all existing md arrays + */ +static MD_LIST_HEAD(all_mddevs); + +/* + * The mapping between kdev and mddev is not necessary a simple + * one! Eg. HSM uses several sub-devices to implement Logical + * Volumes. All these sub-devices map to the same mddev. + */ +dev_mapping_t mddev_map [MAX_MD_DEVS] = { {NULL, 0}, }; + +void add_mddev_mapping (mddev_t * mddev, kdev_t dev, void *data) +{ + unsigned int minor = MINOR(dev); + + if (MAJOR(dev) != MD_MAJOR) { + MD_BUG(); + return; + } + if (mddev_map[minor].mddev != NULL) { + MD_BUG(); + return; + } + mddev_map[minor].mddev = mddev; + mddev_map[minor].data = data; +} + +void del_mddev_mapping (mddev_t * mddev, kdev_t dev) +{ + unsigned int minor = MINOR(dev); + + if (MAJOR(dev) != MD_MAJOR) { + MD_BUG(); + return; + } + if (mddev_map[minor].mddev != mddev) { + MD_BUG(); + return; + } + mddev_map[minor].mddev = NULL; + mddev_map[minor].data = NULL; +} + +static int md_make_request (request_queue_t *q, int rw, struct buffer_head * bh) +{ + mddev_t *mddev = kdev_to_mddev(bh->b_rdev); + + if (mddev && mddev->pers) + return mddev->pers->make_request(mddev, rw, bh); + else { + buffer_IO_error(bh); + return -1; + } +} + +static mddev_t * alloc_mddev (kdev_t dev) +{ + mddev_t *mddev; + + if (MAJOR(dev) != MD_MAJOR) { + MD_BUG(); + return 0; + } + mddev = (mddev_t *) kmalloc(sizeof(*mddev), GFP_KERNEL); + if (!mddev) + return NULL; + + memset(mddev, 0, sizeof(*mddev)); + + mddev->__minor = MINOR(dev); + init_MUTEX(&mddev->reconfig_sem); + init_MUTEX(&mddev->recovery_sem); + init_MUTEX(&mddev->resync_sem); + MD_INIT_LIST_HEAD(&mddev->disks); + MD_INIT_LIST_HEAD(&mddev->all_mddevs); + + /* + * The 'base' mddev is the one with data NULL. + * personalities can create additional mddevs + * if necessary. 
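+ * The mapping registered below is what md_make_request() uses,
+ * via kdev_to_mddev(), to route requests for this minor.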
+ */ + add_mddev_mapping(mddev, dev, 0); + md_list_add(&mddev->all_mddevs, &all_mddevs); + + MOD_INC_USE_COUNT; + + return mddev; +} + +struct gendisk * find_gendisk (kdev_t dev) +{ + struct gendisk *tmp = gendisk_head; + + while (tmp != NULL) { + if (tmp->major == MAJOR(dev)) + return (tmp); + tmp = tmp->next; + } + return (NULL); +} + +mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr) +{ + mdk_rdev_t * rdev; + struct md_list_head *tmp; + + ITERATE_RDEV(mddev,rdev,tmp) { + if (rdev->desc_nr == nr) + return rdev; + } + return NULL; +} + +mdk_rdev_t * find_rdev(mddev_t * mddev, kdev_t dev) +{ + struct md_list_head *tmp; + mdk_rdev_t *rdev; + + ITERATE_RDEV(mddev,rdev,tmp) { + if (rdev->dev == dev) + return rdev; + } + return NULL; +} + +static MD_LIST_HEAD(device_names); + +char * partition_name (kdev_t dev) +{ + struct gendisk *hd; + static char nomem [] = "<nomem>"; + dev_name_t *dname; + struct md_list_head *tmp = device_names.next; + + while (tmp != &device_names) { + dname = md_list_entry(tmp, dev_name_t, list); + if (dname->dev == dev) + return dname->name; + tmp = tmp->next; + } + + dname = (dev_name_t *) kmalloc(sizeof(*dname), GFP_KERNEL); + + if (!dname) + return nomem; + /* + * ok, add this new device name to the list + */ + hd = find_gendisk (dev); + dname->name = NULL; + if (hd) + dname->name = disk_name (hd, MINOR(dev), dname->namebuf); + if (!dname->name) { + sprintf (dname->namebuf, "[dev %s]", kdevname(dev)); + dname->name = dname->namebuf; + } + + dname->dev = dev; + MD_INIT_LIST_HEAD(&dname->list); + md_list_add(&dname->list, &device_names); + + return dname->name; +} + +static unsigned int calc_dev_sboffset (kdev_t dev, mddev_t *mddev, + int persistent) +{ + unsigned int size = 0; + + if (blk_size[MAJOR(dev)]) + size = blk_size[MAJOR(dev)][MINOR(dev)]; + if (persistent) + size = MD_NEW_SIZE_BLOCKS(size); + return size; +} + +static unsigned int calc_dev_size (kdev_t dev, mddev_t *mddev, int persistent) +{ + unsigned int size; + + size = calc_dev_sboffset(dev, mddev, persistent); + if (!mddev->sb) { + MD_BUG(); + return size; + } + if (mddev->sb->chunk_size) + size &= ~(mddev->sb->chunk_size/1024 - 1); + return size; +} + +static unsigned int zoned_raid_size (mddev_t *mddev) +{ + unsigned int mask; + mdk_rdev_t * rdev; + struct md_list_head *tmp; + + if (!mddev->sb) { + MD_BUG(); + return -EINVAL; + } + /* + * do size and offset calculations. + */ + mask = ~(mddev->sb->chunk_size/1024 - 1); + + ITERATE_RDEV(mddev,rdev,tmp) { + rdev->size &= mask; + md_size[mdidx(mddev)] += rdev->size; + } + return 0; +} + +/* + * We check wether all devices are numbered from 0 to nb_dev-1. The + * order is guaranteed even after device name changes. + * + * Some personalities (raid0, linear) use this. Personalities that + * provide data have to be able to deal with loss of individual + * disks, so they do their checking themselves. 
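+ *
+ * Three things are verified below: no device is faulty, the number
+ * of devices matches sb->raid_disks, and every desc_nr in the
+ * range 0..nb_dev-1 occurs exactly once.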
+ */ +int md_check_ordering (mddev_t *mddev) +{ + int i, c; + mdk_rdev_t *rdev; + struct md_list_head *tmp; + + /* + * First, all devices must be fully functional + */ + ITERATE_RDEV(mddev,rdev,tmp) { + if (rdev->faulty) { + printk("md: md%d's device %s faulty, aborting.\n", + mdidx(mddev), partition_name(rdev->dev)); + goto abort; + } + } + + c = 0; + ITERATE_RDEV(mddev,rdev,tmp) { + c++; + } + if (c != mddev->nb_dev) { + MD_BUG(); + goto abort; + } + if (mddev->nb_dev != mddev->sb->raid_disks) { + printk("md: md%d, array needs %d disks, has %d, aborting.\n", + mdidx(mddev), mddev->sb->raid_disks, mddev->nb_dev); + goto abort; + } + /* + * Now the numbering check + */ + for (i = 0; i < mddev->nb_dev; i++) { + c = 0; + ITERATE_RDEV(mddev,rdev,tmp) { + if (rdev->desc_nr == i) + c++; + } + if (!c) { + printk("md: md%d, missing disk #%d, aborting.\n", + mdidx(mddev), i); + goto abort; + } + if (c > 1) { + printk("md: md%d, too many disks #%d, aborting.\n", + mdidx(mddev), i); + goto abort; + } + } + return 0; +abort: + return 1; +} + +static void remove_descriptor (mdp_disk_t *disk, mdp_super_t *sb) +{ + if (disk_active(disk)) { + sb->working_disks--; + } else { + if (disk_spare(disk)) { + sb->spare_disks--; + sb->working_disks--; + } else { + sb->failed_disks--; + } + } + sb->nr_disks--; + disk->major = 0; + disk->minor = 0; + mark_disk_removed(disk); +} + +#define BAD_MAGIC KERN_ERR \ +"md: invalid raid superblock magic on %s\n" + +#define BAD_MINOR KERN_ERR \ +"md: %s: invalid raid minor (%x)\n" + +#define OUT_OF_MEM KERN_ALERT \ +"md: out of memory.\n" + +#define NO_SB KERN_ERR \ +"md: disabled device %s, could not read superblock.\n" + +#define BAD_CSUM KERN_WARNING \ +"md: invalid superblock checksum on %s\n" + +static int alloc_array_sb (mddev_t * mddev) +{ + if (mddev->sb) { + MD_BUG(); + return 0; + } + + mddev->sb = (mdp_super_t *) __get_free_page (GFP_KERNEL); + if (!mddev->sb) + return -ENOMEM; + md_clear_page(mddev->sb); + return 0; +} + +static int alloc_disk_sb (mdk_rdev_t * rdev) +{ + if (rdev->sb) + MD_BUG(); + + rdev->sb = (mdp_super_t *) __get_free_page(GFP_KERNEL); + if (!rdev->sb) { + printk (OUT_OF_MEM); + return -EINVAL; + } + md_clear_page(rdev->sb); + + return 0; +} + +static void free_disk_sb (mdk_rdev_t * rdev) +{ + if (rdev->sb) { + free_page((unsigned long) rdev->sb); + rdev->sb = NULL; + rdev->sb_offset = 0; + rdev->size = 0; + } else { + if (!rdev->faulty) + MD_BUG(); + } +} + +static void mark_rdev_faulty (mdk_rdev_t * rdev) +{ + if (!rdev) { + MD_BUG(); + return; + } + free_disk_sb(rdev); + rdev->faulty = 1; +} + +static int read_disk_sb (mdk_rdev_t * rdev) +{ + int ret = -EINVAL; + struct buffer_head *bh = NULL; + kdev_t dev = rdev->dev; + mdp_super_t *sb; + unsigned long sb_offset; + + if (!rdev->sb) { + MD_BUG(); + goto abort; + } + + /* + * Calculate the position of the superblock, + * it's at the end of the disk + */ + sb_offset = calc_dev_sboffset(rdev->dev, rdev->mddev, 1); + rdev->sb_offset = sb_offset; + printk("(read) %s's sb offset: %ld", partition_name(dev), sb_offset); + fsync_dev(dev); + set_blocksize (dev, MD_SB_BYTES); + bh = bread (dev, sb_offset / MD_SB_BLOCKS, MD_SB_BYTES); + + if (bh) { + sb = (mdp_super_t *) bh->b_data; + memcpy (rdev->sb, sb, MD_SB_BYTES); + } else { + printk (NO_SB,partition_name(rdev->dev)); + goto abort; + } + printk(" [events: %08lx]\n", (unsigned long)rdev->sb->events_lo); + ret = 0; +abort: + if (bh) + brelse (bh); + return ret; +} + +static unsigned int calc_sb_csum (mdp_super_t * sb) +{ + unsigned int disk_csum, 
csum; + + disk_csum = sb->sb_csum; + sb->sb_csum = 0; + csum = csum_partial((void *)sb, MD_SB_BYTES, 0); + sb->sb_csum = disk_csum; + return csum; +} + +/* + * Check one RAID superblock for generic plausibility + */ + +static int check_disk_sb (mdk_rdev_t * rdev) +{ + mdp_super_t *sb; + int ret = -EINVAL; + + sb = rdev->sb; + if (!sb) { + MD_BUG(); + goto abort; + } + + if (sb->md_magic != MD_SB_MAGIC) { + printk (BAD_MAGIC, partition_name(rdev->dev)); + goto abort; + } + + if (sb->md_minor >= MAX_MD_DEVS) { + printk (BAD_MINOR, partition_name(rdev->dev), + sb->md_minor); + goto abort; + } + + if (calc_sb_csum(sb) != sb->sb_csum) + printk(BAD_CSUM, partition_name(rdev->dev)); + ret = 0; +abort: + return ret; +} + +static kdev_t dev_unit(kdev_t dev) +{ + unsigned int mask; + struct gendisk *hd = find_gendisk(dev); + + if (!hd) + return 0; + mask = ~((1 << hd->minor_shift) - 1); + + return MKDEV(MAJOR(dev), MINOR(dev) & mask); +} + +static mdk_rdev_t * match_dev_unit(mddev_t *mddev, kdev_t dev) +{ + struct md_list_head *tmp; + mdk_rdev_t *rdev; + + ITERATE_RDEV(mddev,rdev,tmp) + if (dev_unit(rdev->dev) == dev_unit(dev)) + return rdev; + + return NULL; +} + +static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2) +{ + struct md_list_head *tmp; + mdk_rdev_t *rdev; + + ITERATE_RDEV(mddev1,rdev,tmp) + if (match_dev_unit(mddev2, rdev->dev)) + return 1; + + return 0; +} + +static MD_LIST_HEAD(all_raid_disks); +static MD_LIST_HEAD(pending_raid_disks); + +static void bind_rdev_to_array (mdk_rdev_t * rdev, mddev_t * mddev) +{ + mdk_rdev_t *same_pdev; + + if (rdev->mddev) { + MD_BUG(); + return; + } + same_pdev = match_dev_unit(mddev, rdev->dev); + if (same_pdev) + printk( KERN_WARNING +"md%d: WARNING: %s appears to be on the same physical disk as %s. True\n" +" protection against single-disk failure might be compromised.\n", + mdidx(mddev), partition_name(rdev->dev), + partition_name(same_pdev->dev)); + + md_list_add(&rdev->same_set, &mddev->disks); + rdev->mddev = mddev; + mddev->nb_dev++; + printk("bind<%s,%d>\n", partition_name(rdev->dev), mddev->nb_dev); +} + +static void unbind_rdev_from_array (mdk_rdev_t * rdev) +{ + if (!rdev->mddev) { + MD_BUG(); + return; + } + md_list_del(&rdev->same_set); + MD_INIT_LIST_HEAD(&rdev->same_set); + rdev->mddev->nb_dev--; + printk("unbind<%s,%d>\n", partition_name(rdev->dev), + rdev->mddev->nb_dev); + rdev->mddev = NULL; +} + +/* + * prevent the device from being mounted, repartitioned or + * otherwise reused by a RAID array (or any other kernel + * subsystem), by opening the device. [simply getting an + * inode is not enough, the SCSI module usage code needs + * an explicit open() on the device] + */ +static int lock_rdev (mdk_rdev_t *rdev) +{ + int err = 0; + + /* + * First insert a dummy inode. 
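+ * unlock_rdev() below undoes this again via iput() when the
+ * device is released from the array.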
+ */ + if (rdev->inode) + MD_BUG(); + rdev->inode = get_empty_inode(); + if (!rdev->inode) + return -ENOMEM; + /* + * we dont care about any other fields + */ + rdev->inode->i_dev = rdev->inode->i_rdev = rdev->dev; + insert_inode_hash(rdev->inode); + + memset(&rdev->filp, 0, sizeof(rdev->filp)); + rdev->filp.f_mode = 3; /* read write */ + return err; +} + +static void unlock_rdev (mdk_rdev_t *rdev) +{ + if (!rdev->inode) + MD_BUG(); + iput(rdev->inode); + rdev->inode = NULL; +} + +static void export_rdev (mdk_rdev_t * rdev) +{ + printk("export_rdev(%s)\n",partition_name(rdev->dev)); + if (rdev->mddev) + MD_BUG(); + unlock_rdev(rdev); + free_disk_sb(rdev); + md_list_del(&rdev->all); + MD_INIT_LIST_HEAD(&rdev->all); + if (rdev->pending.next != &rdev->pending) { + printk("(%s was pending)\n",partition_name(rdev->dev)); + md_list_del(&rdev->pending); + MD_INIT_LIST_HEAD(&rdev->pending); + } + rdev->dev = 0; + rdev->faulty = 0; + kfree(rdev); +} + +static void kick_rdev_from_array (mdk_rdev_t * rdev) +{ + unbind_rdev_from_array(rdev); + export_rdev(rdev); +} + +static void export_array (mddev_t *mddev) +{ + struct md_list_head *tmp; + mdk_rdev_t *rdev; + mdp_super_t *sb = mddev->sb; + + if (mddev->sb) { + mddev->sb = NULL; + free_page((unsigned long) sb); + } + + ITERATE_RDEV(mddev,rdev,tmp) { + if (!rdev->mddev) { + MD_BUG(); + continue; + } + kick_rdev_from_array(rdev); + } + if (mddev->nb_dev) + MD_BUG(); +} + +static void free_mddev (mddev_t *mddev) +{ + if (!mddev) { + MD_BUG(); + return; + } + + export_array(mddev); + md_size[mdidx(mddev)] = 0; + md_hd_struct[mdidx(mddev)].nr_sects = 0; + + /* + * Make sure nobody else is using this mddev + * (careful, we rely on the global kernel lock here) + */ + while (md_atomic_read(&mddev->resync_sem.count) != 1) + schedule(); + while (md_atomic_read(&mddev->recovery_sem.count) != 1) + schedule(); + + del_mddev_mapping(mddev, MKDEV(MD_MAJOR, mdidx(mddev))); + md_list_del(&mddev->all_mddevs); + MD_INIT_LIST_HEAD(&mddev->all_mddevs); + kfree(mddev); + MOD_DEC_USE_COUNT; +} + +#undef BAD_CSUM +#undef BAD_MAGIC +#undef OUT_OF_MEM +#undef NO_SB + +static void print_desc(mdp_disk_t *desc) +{ + printk(" DISK<N:%d,%s(%d,%d),R:%d,S:%d>\n", desc->number, + partition_name(MKDEV(desc->major,desc->minor)), + desc->major,desc->minor,desc->raid_disk,desc->state); +} + +static void print_sb(mdp_super_t *sb) +{ + int i; + + printk(" SB: (V:%d.%d.%d) ID:<%08x.%08x.%08x.%08x> CT:%08x\n", + sb->major_version, sb->minor_version, sb->patch_version, + sb->set_uuid0, sb->set_uuid1, sb->set_uuid2, sb->set_uuid3, + sb->ctime); + printk(" L%d S%08d ND:%d RD:%d md%d LO:%d CS:%d\n", sb->level, + sb->size, sb->nr_disks, sb->raid_disks, sb->md_minor, + sb->layout, sb->chunk_size); + printk(" UT:%08x ST:%d AD:%d WD:%d FD:%d SD:%d CSUM:%08x E:%08lx\n", + sb->utime, sb->state, sb->active_disks, sb->working_disks, + sb->failed_disks, sb->spare_disks, + sb->sb_csum, (unsigned long)sb->events_lo); + + for (i = 0; i < MD_SB_DISKS; i++) { + mdp_disk_t *desc; + + desc = sb->disks + i; + printk(" D %2d: ", i); + print_desc(desc); + } + printk(" THIS: "); + print_desc(&sb->this_disk); + +} + +static void print_rdev(mdk_rdev_t *rdev) +{ + printk(" rdev %s: O:%s, SZ:%08ld F:%d DN:%d ", + partition_name(rdev->dev), partition_name(rdev->old_dev), + rdev->size, rdev->faulty, rdev->desc_nr); + if (rdev->sb) { + printk("rdev superblock:\n"); + print_sb(rdev->sb); + } else + printk("no rdev superblock!\n"); +} + +void md_print_devices (void) +{ + struct md_list_head *tmp, *tmp2; + mdk_rdev_t *rdev; 
+ mddev_t *mddev; + + printk("\n"); + printk(" **********************************\n"); + printk(" * <COMPLETE RAID STATE PRINTOUT> *\n"); + printk(" **********************************\n"); + ITERATE_MDDEV(mddev,tmp) { + printk("md%d: ", mdidx(mddev)); + + ITERATE_RDEV(mddev,rdev,tmp2) + printk("<%s>", partition_name(rdev->dev)); + + if (mddev->sb) { + printk(" array superblock:\n"); + print_sb(mddev->sb); + } else + printk(" no array superblock.\n"); + + ITERATE_RDEV(mddev,rdev,tmp2) + print_rdev(rdev); + } + printk(" **********************************\n"); + printk("\n"); +} + +static int sb_equal ( mdp_super_t *sb1, mdp_super_t *sb2) +{ + int ret; + mdp_super_t *tmp1, *tmp2; + + tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL); + tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL); + + if (!tmp1 || !tmp2) { + ret = 0; + goto abort; + } + + *tmp1 = *sb1; + *tmp2 = *sb2; + + /* + * nr_disks is not constant + */ + tmp1->nr_disks = 0; + tmp2->nr_disks = 0; + + if (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4)) + ret = 0; + else + ret = 1; + +abort: + if (tmp1) + kfree(tmp1); + if (tmp2) + kfree(tmp2); + + return ret; +} + +static int uuid_equal(mdk_rdev_t *rdev1, mdk_rdev_t *rdev2) +{ + if ( (rdev1->sb->set_uuid0 == rdev2->sb->set_uuid0) && + (rdev1->sb->set_uuid1 == rdev2->sb->set_uuid1) && + (rdev1->sb->set_uuid2 == rdev2->sb->set_uuid2) && + (rdev1->sb->set_uuid3 == rdev2->sb->set_uuid3)) + + return 1; + + return 0; +} + +static mdk_rdev_t * find_rdev_all (kdev_t dev) +{ + struct md_list_head *tmp; + mdk_rdev_t *rdev; + + tmp = all_raid_disks.next; + while (tmp != &all_raid_disks) { + rdev = md_list_entry(tmp, mdk_rdev_t, all); + if (rdev->dev == dev) + return rdev; + tmp = tmp->next; + } + return NULL; +} + +#define GETBLK_FAILED KERN_ERR \ +"md: getblk failed for device %s\n" + +static int write_disk_sb(mdk_rdev_t * rdev) +{ + struct buffer_head *bh; + kdev_t dev; + unsigned long sb_offset, size; + mdp_super_t *sb; + + if (!rdev->sb) { + MD_BUG(); + return -1; + } + if (rdev->faulty) { + MD_BUG(); + return -1; + } + if (rdev->sb->md_magic != MD_SB_MAGIC) { + MD_BUG(); + return -1; + } + + dev = rdev->dev; + sb_offset = calc_dev_sboffset(dev, rdev->mddev, 1); + if (rdev->sb_offset != sb_offset) { + printk("%s's sb offset has changed from %ld to %ld, skipping\n", partition_name(dev), rdev->sb_offset, sb_offset); + goto skip; + } + /* + * If the disk went offline meanwhile and it's just a spare, then + * it's size has changed to zero silently, and the MD code does + * not yet know that it's faulty. 
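+ * Recalculating the size below and comparing it against rdev->size
+ * catches that case; on a mismatch the superblock write is skipped
+ * rather than being sent to a bogus offset.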
+ */ + size = calc_dev_size(dev, rdev->mddev, 1); + if (size != rdev->size) { + printk("%s's size has changed from %ld to %ld since import, skipping\n", partition_name(dev), rdev->size, size); + goto skip; + } + + printk("(write) %s's sb offset: %ld\n", partition_name(dev), sb_offset); + fsync_dev(dev); + set_blocksize(dev, MD_SB_BYTES); + bh = getblk(dev, sb_offset / MD_SB_BLOCKS, MD_SB_BYTES); + if (!bh) { + printk(GETBLK_FAILED, partition_name(dev)); + return 1; + } + memset(bh->b_data,0,bh->b_size); + sb = (mdp_super_t *) bh->b_data; + memcpy(sb, rdev->sb, MD_SB_BYTES); + + mark_buffer_uptodate(bh, 1); + mark_buffer_dirty(bh); + ll_rw_block(WRITE, 1, &bh); + wait_on_buffer(bh); + brelse(bh); + fsync_dev(dev); +skip: + return 0; +} +#undef GETBLK_FAILED + +static void set_this_disk(mddev_t *mddev, mdk_rdev_t *rdev) +{ + int i, ok = 0; + mdp_disk_t *desc; + + for (i = 0; i < MD_SB_DISKS; i++) { + desc = mddev->sb->disks + i; +#if 0 + if (disk_faulty(desc)) { + if (MKDEV(desc->major,desc->minor) == rdev->dev) + ok = 1; + continue; + } +#endif + if (MKDEV(desc->major,desc->minor) == rdev->dev) { + rdev->sb->this_disk = *desc; + rdev->desc_nr = desc->number; + ok = 1; + break; + } + } + + if (!ok) { + MD_BUG(); + } +} + +static int sync_sbs(mddev_t * mddev) +{ + mdk_rdev_t *rdev; + mdp_super_t *sb; + struct md_list_head *tmp; + + ITERATE_RDEV(mddev,rdev,tmp) { + if (rdev->faulty) + continue; + sb = rdev->sb; + *sb = *mddev->sb; + set_this_disk(mddev, rdev); + sb->sb_csum = calc_sb_csum(sb); + } + return 0; +} + +int md_update_sb(mddev_t * mddev) +{ + int first, err, count = 100; + struct md_list_head *tmp; + mdk_rdev_t *rdev; + +repeat: + mddev->sb->utime = CURRENT_TIME; + if ((++mddev->sb->events_lo)==0) + ++mddev->sb->events_hi; + + if ((mddev->sb->events_lo|mddev->sb->events_hi)==0) { + /* + * oops, this 64-bit counter should never wrap. + * Either we are in around ~1 trillion A.C., assuming + * 1 reboot per second, or we have a bug: + */ + MD_BUG(); + mddev->sb->events_lo = mddev->sb->events_hi = 0xffffffff; + } + sync_sbs(mddev); + + /* + * do not write anything to disk if using + * nonpersistent superblocks + */ + if (mddev->sb->not_persistent) + return 0; + + printk(KERN_INFO "md: updating md%d RAID superblock on device\n", + mdidx(mddev)); + + first = 1; + err = 0; + ITERATE_RDEV(mddev,rdev,tmp) { + if (!first) { + first = 0; + printk(", "); + } + if (rdev->faulty) + printk("(skipping faulty "); + printk("%s ", partition_name(rdev->dev)); + if (!rdev->faulty) { + printk("[events: %08lx]", + (unsigned long)rdev->sb->events_lo); + err += write_disk_sb(rdev); + } else + printk(")\n"); + } + printk(".\n"); + if (err) { + printk("errors occured during superblock update, repeating\n"); + if (--count) + goto repeat; + printk("excessive errors occured during superblock update, exiting\n"); + } + return 0; +} + +/* + * Import a device. If 'on_disk', then sanity check the superblock + * + * mark the device faulty if: + * + * - the device is nonexistent (zero size) + * - the device has no valid superblock + * + * a faulty rdev _never_ has rdev->sb set. 
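+ *
+ * On any failure below we jump to abort_free, which releases
+ * whatever was already set up (device lock, sb page) and
+ * kfree()s the rdev again.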
+ */ +static int md_import_device (kdev_t newdev, int on_disk) +{ + int err; + mdk_rdev_t *rdev; + unsigned int size; + + if (find_rdev_all(newdev)) + return -EEXIST; + + rdev = (mdk_rdev_t *) kmalloc(sizeof(*rdev), GFP_KERNEL); + if (!rdev) { + printk("could not alloc mem for %s!\n", partition_name(newdev)); + return -ENOMEM; + } + memset(rdev, 0, sizeof(*rdev)); + + if (get_super(newdev)) { + printk("md: can not import %s, has active inodes!\n", + partition_name(newdev)); + err = -EBUSY; + goto abort_free; + } + + if ((err = alloc_disk_sb(rdev))) + goto abort_free; + + rdev->dev = newdev; + if (lock_rdev(rdev)) { + printk("md: could not lock %s, zero-size? Marking faulty.\n", + partition_name(newdev)); + err = -EINVAL; + goto abort_free; + } + rdev->desc_nr = -1; + rdev->faulty = 0; + + size = 0; + if (blk_size[MAJOR(newdev)]) + size = blk_size[MAJOR(newdev)][MINOR(newdev)]; + if (!size) { + printk("md: %s has zero size, marking faulty!\n", + partition_name(newdev)); + err = -EINVAL; + goto abort_free; + } + + if (on_disk) { + if ((err = read_disk_sb(rdev))) { + printk("md: could not read %s's sb, not importing!\n", + partition_name(newdev)); + goto abort_free; + } + if ((err = check_disk_sb(rdev))) { + printk("md: %s has invalid sb, not importing!\n", + partition_name(newdev)); + goto abort_free; + } + + rdev->old_dev = MKDEV(rdev->sb->this_disk.major, + rdev->sb->this_disk.minor); + rdev->desc_nr = rdev->sb->this_disk.number; + } + md_list_add(&rdev->all, &all_raid_disks); + MD_INIT_LIST_HEAD(&rdev->pending); + + if (rdev->faulty && rdev->sb) + free_disk_sb(rdev); + return 0; + +abort_free: + if (rdev->sb) { + if (rdev->inode) + unlock_rdev(rdev); + free_disk_sb(rdev); + } + kfree(rdev); + return err; +} + +/* + * Check a full RAID array for plausibility + */ + +#define INCONSISTENT KERN_ERR \ +"md: fatal superblock inconsistency in %s -- removing from array\n" + +#define OUT_OF_DATE KERN_ERR \ +"md: superblock update time inconsistency -- using the most recent one\n" + +#define OLD_VERSION KERN_ALERT \ +"md: md%d: unsupported raid array version %d.%d.%d\n" + +#define NOT_CLEAN_IGNORE KERN_ERR \ +"md: md%d: raid array is not clean -- starting background reconstruction\n" + +#define UNKNOWN_LEVEL KERN_ERR \ +"md: md%d: unsupported raid level %d\n" + +static int analyze_sbs (mddev_t * mddev) +{ + int out_of_date = 0, i; + struct md_list_head *tmp, *tmp2; + mdk_rdev_t *rdev, *rdev2, *freshest; + mdp_super_t *sb; + + /* + * Verify the RAID superblock on each real device + */ + ITERATE_RDEV(mddev,rdev,tmp) { + if (rdev->faulty) { + MD_BUG(); + goto abort; + } + if (!rdev->sb) { + MD_BUG(); + goto abort; + } + if (check_disk_sb(rdev)) + goto abort; + } + + /* + * The superblock constant part has to be the same + * for all disks in the array. + */ + sb = NULL; + + ITERATE_RDEV(mddev,rdev,tmp) { + if (!sb) { + sb = rdev->sb; + continue; + } + if (!sb_equal(sb, rdev->sb)) { + printk (INCONSISTENT, partition_name(rdev->dev)); + kick_rdev_from_array(rdev); + continue; + } + } + + /* + * OK, we have all disks and the array is ready to run. Let's + * find the freshest superblock, that one will be the superblock + * that represents the whole array. + */ + if (!mddev->sb) + if (alloc_array_sb(mddev)) + goto abort; + sb = mddev->sb; + freshest = NULL; + + ITERATE_RDEV(mddev,rdev,tmp) { + __u64 ev1, ev2; + /* + * if the checksum is invalid, use the superblock + * only as a last resort. 
(decrease it's age by + * one event) + */ + if (calc_sb_csum(rdev->sb) != rdev->sb->sb_csum) { + if (rdev->sb->events_lo || rdev->sb->events_hi) + if ((rdev->sb->events_lo--)==0) + rdev->sb->events_hi--; + } + + printk("%s's event counter: %08lx\n", partition_name(rdev->dev), + (unsigned long)rdev->sb->events_lo); + if (!freshest) { + freshest = rdev; + continue; + } + /* + * Find the newest superblock version + */ + ev1 = md_event(rdev->sb); + ev2 = md_event(freshest->sb); + if (ev1 != ev2) { + out_of_date = 1; + if (ev1 > ev2) + freshest = rdev; + } + } + if (out_of_date) { + printk(OUT_OF_DATE); + printk("freshest: %s\n", partition_name(freshest->dev)); + } + memcpy (sb, freshest->sb, sizeof(*sb)); + + /* + * at this point we have picked the 'best' superblock + * from all available superblocks. + * now we validate this superblock and kick out possibly + * failed disks. + */ + ITERATE_RDEV(mddev,rdev,tmp) { + /* + * Kick all non-fresh devices faulty + */ + __u64 ev1, ev2; + ev1 = md_event(rdev->sb); + ev2 = md_event(sb); + ++ev1; + if (ev1 < ev2) { + printk("md: kicking non-fresh %s from array!\n", + partition_name(rdev->dev)); + kick_rdev_from_array(rdev); + continue; + } + } + + /* + * Fix up changed device names ... but only if this disk has a + * recent update time. Use faulty checksum ones too. + */ + ITERATE_RDEV(mddev,rdev,tmp) { + __u64 ev1, ev2, ev3; + if (rdev->faulty) { /* REMOVEME */ + MD_BUG(); + goto abort; + } + ev1 = md_event(rdev->sb); + ev2 = md_event(sb); + ev3 = ev2; + --ev3; + if ((rdev->dev != rdev->old_dev) && + ((ev1 == ev2) || (ev1 == ev3))) { + mdp_disk_t *desc; + + printk("md: device name has changed from %s to %s since last import!\n", partition_name(rdev->old_dev), partition_name(rdev->dev)); + if (rdev->desc_nr == -1) { + MD_BUG(); + goto abort; + } + desc = &sb->disks[rdev->desc_nr]; + if (rdev->old_dev != MKDEV(desc->major, desc->minor)) { + MD_BUG(); + goto abort; + } + desc->major = MAJOR(rdev->dev); + desc->minor = MINOR(rdev->dev); + desc = &rdev->sb->this_disk; + desc->major = MAJOR(rdev->dev); + desc->minor = MINOR(rdev->dev); + } + } + + /* + * Remove unavailable and faulty devices ... + * + * note that if an array becomes completely unrunnable due to + * missing devices, we do not write the superblock back, so the + * administrator has a chance to fix things up. The removal thus + * only happens if it's nonfatal to the contents of the array. + */ + for (i = 0; i < MD_SB_DISKS; i++) { + int found; + mdp_disk_t *desc; + kdev_t dev; + + desc = sb->disks + i; + dev = MKDEV(desc->major, desc->minor); + + /* + * We kick faulty devices/descriptors immediately. + */ + if (disk_faulty(desc)) { + found = 0; + ITERATE_RDEV(mddev,rdev,tmp) { + if (rdev->desc_nr != desc->number) + continue; + printk("md%d: kicking faulty %s!\n", + mdidx(mddev),partition_name(rdev->dev)); + kick_rdev_from_array(rdev); + found = 1; + break; + } + if (!found) { + if (dev == MKDEV(0,0)) + continue; + printk("md%d: removing former faulty %s!\n", + mdidx(mddev), partition_name(dev)); + } + remove_descriptor(desc, sb); + continue; + } + + if (dev == MKDEV(0,0)) + continue; + /* + * Is this device present in the rdev ring? 
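analyze_sbs() above trusts the member with the highest event count and kicks any member that is more than one event behind it; a member exactly one event behind is tolerated, since it may simply have missed the final superblock update. A small user-space sketch of that selection rule, with made-up device names and counts:

/* picking the freshest member and flagging stale ones, standalone sketch */
#include <stdio.h>
#include <stdint.h>

struct member { const char *name; uint64_t events; int kicked; };

int main(void)
{
	struct member m[] = {
		{ "sda1", 42, 0 },
		{ "sdb1", 40, 0 },	/* two events behind: stale */
		{ "sdc1", 41, 0 },	/* one event behind: tolerated */
	};
	int n = sizeof(m) / sizeof(m[0]), i, freshest = 0;

	for (i = 1; i < n; i++)
		if (m[i].events > m[freshest].events)
			freshest = i;

	/* same rule as "++ev1; if (ev1 < ev2) kick" in the code above */
	for (i = 0; i < n; i++)
		if (m[i].events + 1 < m[freshest].events)
			m[i].kicked = 1;

	for (i = 0; i < n; i++)
		printf("%s events=%llu%s\n", m[i].name,
		       (unsigned long long)m[i].events,
		       m[i].kicked ? " (kicked)" : "");
	return 0;
}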
+ */ + found = 0; + ITERATE_RDEV(mddev,rdev,tmp) { + if (rdev->desc_nr == desc->number) { + found = 1; + break; + } + } + if (found) + continue; + + printk("md%d: former device %s is unavailable, removing from array!\n", mdidx(mddev), partition_name(dev)); + remove_descriptor(desc, sb); + } + + /* + * Double check wether all devices mentioned in the + * superblock are in the rdev ring. + */ + for (i = 0; i < MD_SB_DISKS; i++) { + mdp_disk_t *desc; + kdev_t dev; + + desc = sb->disks + i; + dev = MKDEV(desc->major, desc->minor); + + if (dev == MKDEV(0,0)) + continue; + + if (disk_faulty(desc)) { + MD_BUG(); + goto abort; + } + + rdev = find_rdev(mddev, dev); + if (!rdev) { + MD_BUG(); + goto abort; + } + } + + /* + * Do a final reality check. + */ + ITERATE_RDEV(mddev,rdev,tmp) { + if (rdev->desc_nr == -1) { + MD_BUG(); + goto abort; + } + /* + * is the desc_nr unique? + */ + ITERATE_RDEV(mddev,rdev2,tmp2) { + if ((rdev2 != rdev) && + (rdev2->desc_nr == rdev->desc_nr)) { + MD_BUG(); + goto abort; + } + } + /* + * is the device unique? + */ + ITERATE_RDEV(mddev,rdev2,tmp2) { + if ((rdev2 != rdev) && + (rdev2->dev == rdev->dev)) { + MD_BUG(); + goto abort; + } + } + } + + /* + * Check if we can support this RAID array + */ + if (sb->major_version != MD_MAJOR_VERSION || + sb->minor_version > MD_MINOR_VERSION) { + + printk (OLD_VERSION, mdidx(mddev), sb->major_version, + sb->minor_version, sb->patch_version); + goto abort; + } + + if ((sb->state != (1 << MD_SB_CLEAN)) && ((sb->level == 1) || + (sb->level == 4) || (sb->level == 5))) + printk (NOT_CLEAN_IGNORE, mdidx(mddev)); + + return 0; +abort: + return 1; +} + +#undef INCONSISTENT +#undef OUT_OF_DATE +#undef OLD_VERSION +#undef OLD_LEVEL + +static int device_size_calculation (mddev_t * mddev) +{ + int data_disks = 0, persistent; + unsigned int readahead; + mdp_super_t *sb = mddev->sb; + struct md_list_head *tmp; + mdk_rdev_t *rdev; + + /* + * Do device size calculation. Bail out if too small. 
+ * (we have to do this after having validated chunk_size, + * because device size has to be modulo chunk_size) + */ + persistent = !mddev->sb->not_persistent; + ITERATE_RDEV(mddev,rdev,tmp) { + if (rdev->faulty) + continue; + if (rdev->size) { + MD_BUG(); + continue; + } + rdev->size = calc_dev_size(rdev->dev, mddev, persistent); + if (rdev->size < sb->chunk_size / 1024) { + printk (KERN_WARNING + "Dev %s smaller than chunk_size: %ldk < %dk\n", + partition_name(rdev->dev), + rdev->size, sb->chunk_size / 1024); + return -EINVAL; + } + } + + switch (sb->level) { + case -3: + data_disks = 1; + break; + case -2: + data_disks = 1; + break; + case -1: + zoned_raid_size(mddev); + data_disks = 1; + break; + case 0: + zoned_raid_size(mddev); + data_disks = sb->raid_disks; + break; + case 1: + data_disks = 1; + break; + case 4: + case 5: + data_disks = sb->raid_disks-1; + break; + default: + printk (UNKNOWN_LEVEL, mdidx(mddev), sb->level); + goto abort; + } + if (!md_size[mdidx(mddev)]) + md_size[mdidx(mddev)] = sb->size * data_disks; + + readahead = MD_READAHEAD; + if ((sb->level == 0) || (sb->level == 4) || (sb->level == 5)) { + readahead = (mddev->sb->chunk_size>>PAGE_SHIFT) * 4 * data_disks; + if (readahead < data_disks * (MAX_SECTORS>>(PAGE_SHIFT-9))*2) + readahead = data_disks * (MAX_SECTORS>>(PAGE_SHIFT-9))*2; + } else { + if (sb->level == -3) + readahead = 0; + } + md_maxreadahead[mdidx(mddev)] = readahead; + + printk(KERN_INFO "md%d: max total readahead window set to %ldk\n", + mdidx(mddev), readahead*(PAGE_SIZE/1024)); + + printk(KERN_INFO + "md%d: %d data-disks, max readahead per data-disk: %ldk\n", + mdidx(mddev), data_disks, readahead/data_disks*(PAGE_SIZE/1024)); + return 0; +abort: + return 1; +} + + +#define TOO_BIG_CHUNKSIZE KERN_ERR \ +"too big chunk_size: %d > %d\n" + +#define TOO_SMALL_CHUNKSIZE KERN_ERR \ +"too small chunk_size: %d < %ld\n" + +#define BAD_CHUNKSIZE KERN_ERR \ +"no chunksize specified, see 'man raidtab'\n" + +static int do_md_run (mddev_t * mddev) +{ + int pnum, err; + int chunk_size; + struct md_list_head *tmp; + mdk_rdev_t *rdev; + + + if (!mddev->nb_dev) { + MD_BUG(); + return -EINVAL; + } + + if (mddev->pers) + return -EBUSY; + + /* + * Resize disks to align partitions size on a given + * chunk size. + */ + md_size[mdidx(mddev)] = 0; + + /* + * Analyze all RAID superblock(s) + */ + if (analyze_sbs(mddev)) { + MD_BUG(); + return -EINVAL; + } + + chunk_size = mddev->sb->chunk_size; + pnum = level_to_pers(mddev->sb->level); + + mddev->param.chunk_size = chunk_size; + mddev->param.personality = pnum; + + if (chunk_size > MAX_CHUNK_SIZE) { + printk(TOO_BIG_CHUNKSIZE, chunk_size, MAX_CHUNK_SIZE); + return -EINVAL; + } + /* + * chunk-size has to be a power of 2 and multiples of PAGE_SIZE + */ + if ( (1 << ffz(~chunk_size)) != chunk_size) { + MD_BUG(); + return -EINVAL; + } + if (chunk_size < PAGE_SIZE) { + printk(TOO_SMALL_CHUNKSIZE, chunk_size, PAGE_SIZE); + return -EINVAL; + } + + if (pnum >= MAX_PERSONALITY) { + MD_BUG(); + return -EINVAL; + } + + if ((pnum != RAID1) && (pnum != LINEAR) && !chunk_size) { + /* + * 'default chunksize' in the old md code used to + * be PAGE_SIZE, baaad. + * we abort here to be on the safe side. We dont + * want to continue the bad practice. 
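Two rules in the block above are easy to miss in passing: the ffz() expression is just a power-of-two test on the chunk size, and the usable array size is the per-device size multiplied by the number of data disks (one for RAID-1, raid_disks - 1 for RAID-4/5, while linear and RAID-0 take their size from the zone calculation instead). A standalone sketch, with illustrative sizes:

/* power-of-two chunk test and data-disk capacity rule, standalone sketch */
#include <stdio.h>

static int chunk_is_power_of_two(unsigned int chunk)
{
	/* equivalent to (1 << ffz(~chunk)) == chunk for chunk != 0:
	 * the lowest set bit must be the whole value */
	return chunk != 0 && (chunk & -chunk) == chunk;
}

static long data_disks(int level, int raid_disks)
{
	switch (level) {
	case 1:  return 1;		/* mirror: one copy of the data */
	case 4:
	case 5:  return raid_disks - 1;	/* one disk worth of parity */
	default: return -1;		/* linear/RAID-0: summed per zone */
	}
}

int main(void)
{
	long size_kb = 8L * 1024 * 1024;	/* 8 GB per member, in 1K blocks */

	printf("chunk 64K power of two? %d\n", chunk_is_power_of_two(64 * 1024));
	printf("chunk 96K power of two? %d\n", chunk_is_power_of_two(96 * 1024));
	printf("raid1 over 4 disks: %ld KB usable\n", size_kb * data_disks(1, 4));
	printf("raid5 over 4 disks: %ld KB usable\n", size_kb * data_disks(5, 4));
	return 0;
}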
+ */ + printk(BAD_CHUNKSIZE); + return -EINVAL; + } + + if (!pers[pnum]) + { +#ifdef CONFIG_KMOD + char module_name[80]; + sprintf (module_name, "md-personality-%d", pnum); + request_module (module_name); + if (!pers[pnum]) +#endif + return -EINVAL; + } + + if (device_size_calculation(mddev)) + return -EINVAL; + + /* + * Drop all container device buffers, from now on + * the only valid external interface is through the md + * device. + * Also find largest hardsector size + */ + md_hardsect_sizes[mdidx(mddev)] = 512; + ITERATE_RDEV(mddev,rdev,tmp) { + if (rdev->faulty) + continue; + fsync_dev(rdev->dev); + invalidate_buffers(rdev->dev); + if (get_hardsect_size(rdev->dev) + > md_hardsect_sizes[mdidx(mddev)]) + md_hardsect_sizes[mdidx(mddev)] = + get_hardsect_size(rdev->dev); + } + md_blocksizes[mdidx(mddev)] = 1024; + if (md_blocksizes[mdidx(mddev)] < md_hardsect_sizes[mdidx(mddev)]) + md_blocksizes[mdidx(mddev)] = md_hardsect_sizes[mdidx(mddev)]; + mddev->pers = pers[pnum]; + + err = mddev->pers->run(mddev); + if (err) { + printk("pers->run() failed ...\n"); + mddev->pers = NULL; + return -EINVAL; + } + + mddev->sb->state &= ~(1 << MD_SB_CLEAN); + md_update_sb(mddev); + + /* + * md_size has units of 1K blocks, which are + * twice as large as sectors. + */ + md_hd_struct[mdidx(mddev)].start_sect = 0; + md_hd_struct[mdidx(mddev)].nr_sects = md_size[mdidx(mddev)] << 1; + + read_ahead[MD_MAJOR] = 1024; + return (0); +} + +#undef TOO_BIG_CHUNKSIZE +#undef BAD_CHUNKSIZE + +#define OUT(x) do { err = (x); goto out; } while (0) + +static int restart_array (mddev_t *mddev) +{ + int err = 0; + + /* + * Complain if it has no devices + */ + if (!mddev->nb_dev) + OUT(-ENXIO); + + if (mddev->pers) { + if (!mddev->ro) + OUT(-EBUSY); + + mddev->ro = 0; + set_device_ro(mddev_to_kdev(mddev), 0); + + printk (KERN_INFO + "md%d switched to read-write mode.\n", mdidx(mddev)); + /* + * Kick recovery or resync if necessary + */ + md_recover_arrays(); + if (mddev->pers->restart_resync) + mddev->pers->restart_resync(mddev); + } else + err = -EINVAL; + +out: + return err; +} + +#define STILL_MOUNTED KERN_WARNING \ +"md: md%d still mounted.\n" + +static int do_md_stop (mddev_t * mddev, int ro) +{ + int err = 0, resync_interrupted = 0; + kdev_t dev = mddev_to_kdev(mddev); + + if (!ro && get_super(dev)) { + printk (STILL_MOUNTED, mdidx(mddev)); + OUT(-EBUSY); + } + + if (mddev->pers) { + /* + * It is safe to call stop here, it only frees private + * data. Also, it tells us if a device is unstoppable + * (eg. resyncing is in progress) + */ + if (mddev->pers->stop_resync) + if (mddev->pers->stop_resync(mddev)) + resync_interrupted = 1; + + if (mddev->recovery_running) + md_interrupt_thread(md_recovery_thread); + + /* + * This synchronizes with signal delivery to the + * resync or reconstruction thread. It also nicely + * hangs the process if some reconstruction has not + * finished. + */ + down(&mddev->recovery_sem); + up(&mddev->recovery_sem); + + /* + * sync and invalidate buffers because we cannot kill the + * main thread with valid IO transfers still around. + * the kernel lock protects us from new requests being + * added after invalidate_buffers(). 
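Worth noting from the block above: md sizes are carried in 1K blocks, so the partition entry gets md_size shifted left by one to express 512-byte sectors, and the software block size is raised to at least the largest hardware sector size found among the members. A trivial user-space illustration with made-up numbers:

/* 1K-block to sector conversion and block-size clamp, standalone sketch */
#include <stdio.h>

int main(void)
{
	unsigned long md_size_kb = 4193280;		/* array size in 1K blocks */
	unsigned long nr_sects   = md_size_kb << 1;	/* 512-byte sectors */
	unsigned int  blocksize  = 1024;
	unsigned int  hardsect   = 2048;		/* e.g. a 2K-sector device */

	if (blocksize < hardsect)
		blocksize = hardsect;

	printf("nr_sects  = %lu\n", nr_sects);
	printf("blocksize = %u\n", blocksize);
	return 0;
}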
+ */ + fsync_dev (mddev_to_kdev(mddev)); + fsync_dev (dev); + invalidate_buffers (dev); + + if (ro) { + if (mddev->ro) + OUT(-ENXIO); + mddev->ro = 1; + } else { + if (mddev->ro) + set_device_ro(dev, 0); + if (mddev->pers->stop(mddev)) { + if (mddev->ro) + set_device_ro(dev, 1); + OUT(-EBUSY); + } + if (mddev->ro) + mddev->ro = 0; + } + if (mddev->sb) { + /* + * mark it clean only if there was no resync + * interrupted. + */ + if (!mddev->recovery_running && !resync_interrupted) { + printk("marking sb clean...\n"); + mddev->sb->state |= 1 << MD_SB_CLEAN; + } + md_update_sb(mddev); + } + if (ro) + set_device_ro(dev, 1); + } + + /* + * Free resources if final stop + */ + if (!ro) { + printk (KERN_INFO "md%d stopped.\n", mdidx(mddev)); + free_mddev(mddev); + + } else + printk (KERN_INFO + "md%d switched to read-only mode.\n", mdidx(mddev)); +out: + return err; +} + +#undef OUT + +/* + * We have to safely support old arrays too. + */ +int detect_old_array (mdp_super_t *sb) +{ + if (sb->major_version > 0) + return 0; + if (sb->minor_version >= 90) + return 0; + + return -EINVAL; +} + + +static void autorun_array (mddev_t *mddev) +{ + mdk_rdev_t *rdev; + struct md_list_head *tmp; + int err; + + if (mddev->disks.prev == &mddev->disks) { + MD_BUG(); + return; + } + + printk("running: "); + + ITERATE_RDEV(mddev,rdev,tmp) { + printk("<%s>", partition_name(rdev->dev)); + } + printk("\nnow!\n"); + + err = do_md_run (mddev); + if (err) { + printk("do_md_run() returned %d\n", err); + /* + * prevent the writeback of an unrunnable array + */ + mddev->sb_dirty = 0; + do_md_stop (mddev, 0); + } +} + +/* + * lets try to run arrays based on all disks that have arrived + * until now. (those are in the ->pending list) + * + * the method: pick the first pending disk, collect all disks with + * the same UUID, remove all from the pending list and put them into + * the 'same_array' list. Then order this list based on superblock + * update time (freshest comes first), kick out 'old' disks and + * compare superblocks. If everything's fine then run it. + */ +static void autorun_devices (void) +{ + struct md_list_head candidates; + struct md_list_head *tmp; + mdk_rdev_t *rdev0, *rdev; + mddev_t *mddev; + kdev_t md_kdev; + + + printk("autorun ...\n"); + while (pending_raid_disks.next != &pending_raid_disks) { + rdev0 = md_list_entry(pending_raid_disks.next, + mdk_rdev_t, pending); + + printk("considering %s ...\n", partition_name(rdev0->dev)); + MD_INIT_LIST_HEAD(&candidates); + ITERATE_RDEV_PENDING(rdev,tmp) { + if (uuid_equal(rdev0, rdev)) { + if (!sb_equal(rdev0->sb, rdev->sb)) { + printk("%s has same UUID as %s, but superblocks differ ...\n", partition_name(rdev->dev), partition_name(rdev0->dev)); + continue; + } + printk(" adding %s ...\n", partition_name(rdev->dev)); + md_list_del(&rdev->pending); + md_list_add(&rdev->pending, &candidates); + } + } + /* + * now we have a set of devices, with all of them having + * mostly sane superblocks. It's time to allocate the + * mddev. 
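The grouping step described above -- take the first pending disk and collect every other pending disk carrying the same array UUID into one candidate set -- can be pictured with a few lines of standalone C (device names and UUIDs below are invented):

/* grouping pending devices by array UUID, standalone sketch */
#include <stdio.h>
#include <string.h>

struct pending { const char *name; const char *uuid; int claimed; };

int main(void)
{
	struct pending p[] = {
		{ "sda1", "uuid-A", 0 },
		{ "sdb1", "uuid-B", 0 },
		{ "sdc1", "uuid-A", 0 },
		{ "sdd1", "uuid-A", 0 },
	};
	int n = sizeof(p) / sizeof(p[0]), i;
	const char *want = p[0].uuid;	/* UUID of the first pending disk */

	printf("candidates for %s:", want);
	for (i = 0; i < n; i++)
		if (strcmp(p[i].uuid, want) == 0) {
			p[i].claimed = 1;
			printf(" %s", p[i].name);
		}
	printf("\n");
	return 0;
}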
+ */ + md_kdev = MKDEV(MD_MAJOR, rdev0->sb->md_minor); + mddev = kdev_to_mddev(md_kdev); + if (mddev) { + printk("md%d already running, cannot run %s\n", + mdidx(mddev), partition_name(rdev0->dev)); + ITERATE_RDEV_GENERIC(candidates,pending,rdev,tmp) + export_rdev(rdev); + continue; + } + mddev = alloc_mddev(md_kdev); + printk("created md%d\n", mdidx(mddev)); + ITERATE_RDEV_GENERIC(candidates,pending,rdev,tmp) { + bind_rdev_to_array(rdev, mddev); + md_list_del(&rdev->pending); + MD_INIT_LIST_HEAD(&rdev->pending); + } + autorun_array(mddev); + } + printk("... autorun DONE.\n"); +} + +/* + * import RAID devices based on one partition + * if possible, the array gets run as well. + */ + +#define BAD_VERSION KERN_ERR \ +"md: %s has RAID superblock version 0.%d, autodetect needs v0.90 or higher\n" + +#define OUT_OF_MEM KERN_ALERT \ +"md: out of memory.\n" + +#define NO_DEVICE KERN_ERR \ +"md: disabled device %s\n" + +#define AUTOADD_FAILED KERN_ERR \ +"md: auto-adding devices to md%d FAILED (error %d).\n" + +#define AUTOADD_FAILED_USED KERN_ERR \ +"md: cannot auto-add device %s to md%d, already used.\n" + +#define AUTORUN_FAILED KERN_ERR \ +"md: auto-running md%d FAILED (error %d).\n" + +#define MDDEV_BUSY KERN_ERR \ +"md: cannot auto-add to md%d, already running.\n" + +#define AUTOADDING KERN_INFO \ +"md: auto-adding devices to md%d, based on %s's superblock.\n" + +#define AUTORUNNING KERN_INFO \ +"md: auto-running md%d.\n" + +static int autostart_array (kdev_t startdev) +{ + int err = -EINVAL, i; + mdp_super_t *sb = NULL; + mdk_rdev_t *start_rdev = NULL, *rdev; + + if (md_import_device(startdev, 1)) { + printk("could not import %s!\n", partition_name(startdev)); + goto abort; + } + + start_rdev = find_rdev_all(startdev); + if (!start_rdev) { + MD_BUG(); + goto abort; + } + if (start_rdev->faulty) { + printk("can not autostart based on faulty %s!\n", + partition_name(startdev)); + goto abort; + } + md_list_add(&start_rdev->pending, &pending_raid_disks); + + sb = start_rdev->sb; + + err = detect_old_array(sb); + if (err) { + printk("array version is too old to be autostarted, use raidtools 0.90 mkraid --upgrade\nto upgrade the array without data loss!\n"); + goto abort; + } + + for (i = 0; i < MD_SB_DISKS; i++) { + mdp_disk_t *desc; + kdev_t dev; + + desc = sb->disks + i; + dev = MKDEV(desc->major, desc->minor); + + if (dev == MKDEV(0,0)) + continue; + if (dev == startdev) + continue; + if (md_import_device(dev, 1)) { + printk("could not import %s, trying to run array nevertheless.\n", partition_name(dev)); + continue; + } + rdev = find_rdev_all(dev); + if (!rdev) { + MD_BUG(); + goto abort; + } + md_list_add(&rdev->pending, &pending_raid_disks); + } + + /* + * possibly return codes + */ + autorun_devices(); + return 0; + +abort: + if (start_rdev) + export_rdev(start_rdev); + return err; +} + +#undef BAD_VERSION +#undef OUT_OF_MEM +#undef NO_DEVICE +#undef AUTOADD_FAILED_USED +#undef AUTOADD_FAILED +#undef AUTORUN_FAILED +#undef AUTOADDING +#undef AUTORUNNING + +struct { + int set; + int noautodetect; + +} raid_setup_args md__initdata = { 0, 0 }; + +void md_setup_drive(void) md__init; + +/* + * Searches all registered partitions for autorun RAID arrays + * at boot time. 
+ */ +#ifdef CONFIG_AUTODETECT_RAID +static int detected_devices[128] md__initdata; +static int dev_cnt=0; +void md_autodetect_dev(kdev_t dev) +{ + if (dev_cnt >= 0 && dev_cnt < 127) + detected_devices[dev_cnt++] = dev; +} +#endif + +int md__init md_run_setup(void) +{ +#ifdef CONFIG_AUTODETECT_RAID + mdk_rdev_t *rdev; + int i; + + if (raid_setup_args.noautodetect) + printk(KERN_INFO "skipping autodetection of RAID arrays\n"); + else { + + printk(KERN_INFO "autodetecting RAID arrays\n"); + + for (i=0; i<dev_cnt; i++) { + kdev_t dev = detected_devices[i]; + + if (md_import_device(dev,1)) { + printk(KERN_ALERT "could not import %s!\n", + partition_name(dev)); + continue; + } + /* + * Sanity checks: + */ + rdev = find_rdev_all(dev); + if (!rdev) { + MD_BUG(); + continue; + } + if (rdev->faulty) { + MD_BUG(); + continue; + } + md_list_add(&rdev->pending, &pending_raid_disks); + } + + autorun_devices(); + } + + dev_cnt = -1; /* make sure further calls to md_autodetect_dev are ignored */ +#endif +#ifdef CONFIG_MD_BOOT + md_setup_drive(); +#endif + return 0; +} + +static int get_version (void * arg) +{ + mdu_version_t ver; + + ver.major = MD_MAJOR_VERSION; + ver.minor = MD_MINOR_VERSION; + ver.patchlevel = MD_PATCHLEVEL_VERSION; + + if (md_copy_to_user(arg, &ver, sizeof(ver))) + return -EFAULT; + + return 0; +} + +#define SET_FROM_SB(x) info.x = mddev->sb->x +static int get_array_info (mddev_t * mddev, void * arg) +{ + mdu_array_info_t info; + + if (!mddev->sb) + return -EINVAL; + + SET_FROM_SB(major_version); + SET_FROM_SB(minor_version); + SET_FROM_SB(patch_version); + SET_FROM_SB(ctime); + SET_FROM_SB(level); + SET_FROM_SB(size); + SET_FROM_SB(nr_disks); + SET_FROM_SB(raid_disks); + SET_FROM_SB(md_minor); + SET_FROM_SB(not_persistent); + + SET_FROM_SB(utime); + SET_FROM_SB(state); + SET_FROM_SB(active_disks); + SET_FROM_SB(working_disks); + SET_FROM_SB(failed_disks); + SET_FROM_SB(spare_disks); + + SET_FROM_SB(layout); + SET_FROM_SB(chunk_size); + + if (md_copy_to_user(arg, &info, sizeof(info))) + return -EFAULT; + + return 0; +} +#undef SET_FROM_SB + +#define SET_FROM_SB(x) info.x = mddev->sb->disks[nr].x +static int get_disk_info (mddev_t * mddev, void * arg) +{ + mdu_disk_info_t info; + unsigned int nr; + + if (!mddev->sb) + return -EINVAL; + + if (md_copy_from_user(&info, arg, sizeof(info))) + return -EFAULT; + + nr = info.number; + if (nr >= mddev->sb->nr_disks) + return -EINVAL; + + SET_FROM_SB(major); + SET_FROM_SB(minor); + SET_FROM_SB(raid_disk); + SET_FROM_SB(state); + + if (md_copy_to_user(arg, &info, sizeof(info))) + return -EFAULT; + + return 0; +} +#undef SET_FROM_SB + +#define SET_SB(x) mddev->sb->disks[nr].x = info->x + +static int add_new_disk (mddev_t * mddev, mdu_disk_info_t *info) +{ + int err, size, persistent; + mdk_rdev_t *rdev; + unsigned int nr; + kdev_t dev; + dev = MKDEV(info->major,info->minor); + + if (find_rdev_all(dev)) { + printk("device %s already used in a RAID array!\n", + partition_name(dev)); + return -EBUSY; + } + if (!mddev->sb) { + /* expecting a device which has a superblock */ + err = md_import_device(dev, 1); + if (err) { + printk("md error, md_import_device returned %d\n", err); + return -EINVAL; + } + rdev = find_rdev_all(dev); + if (!rdev) { + MD_BUG(); + return -EINVAL; + } + if (mddev->nb_dev) { + mdk_rdev_t *rdev0 = md_list_entry(mddev->disks.next, + mdk_rdev_t, same_set); + if (!uuid_equal(rdev0, rdev)) { + printk("md: %s has different UUID to %s\n", partition_name(rdev->dev), partition_name(rdev0->dev)); + export_rdev(rdev); + return -EINVAL; + 
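get_array_info() above relies on a small macro trick: because the ioctl structure and the superblock use the same field names, a one-line SET_FROM_SB(x) macro does all the copying. A user-space sketch of the same pattern with reduced, illustrative structs:

/* field-name macro copy, standalone sketch of the SET_FROM_SB pattern */
#include <stdio.h>

struct sb   { int level, raid_disks, chunk_size; };
struct info { int level, raid_disks, chunk_size; };

#define SET_FROM_SB(x) info.x = sb.x

int main(void)
{
	struct sb sb = { 5, 4, 64 * 1024 };
	struct info info;

	SET_FROM_SB(level);
	SET_FROM_SB(raid_disks);
	SET_FROM_SB(chunk_size);

	printf("level=%d raid_disks=%d chunk=%d\n",
	       info.level, info.raid_disks, info.chunk_size);
	return 0;
}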
} + if (!sb_equal(rdev0->sb, rdev->sb)) { + printk("md: %s has same UUID but different superblock to %s\n", partition_name(rdev->dev), partition_name(rdev0->dev)); + export_rdev(rdev); + return -EINVAL; + } + } + bind_rdev_to_array(rdev, mddev); + return 0; + } + + nr = info->number; + if (nr >= mddev->sb->nr_disks) + return -EINVAL; + + SET_SB(number); + SET_SB(major); + SET_SB(minor); + SET_SB(raid_disk); + SET_SB(state); + + if ((info->state & (1<<MD_DISK_FAULTY))==0) { + err = md_import_device (dev, 0); + if (err) { + printk("md: error, md_import_device() returned %d\n", err); + return -EINVAL; + } + rdev = find_rdev_all(dev); + if (!rdev) { + MD_BUG(); + return -EINVAL; + } + + rdev->old_dev = dev; + rdev->desc_nr = info->number; + + bind_rdev_to_array(rdev, mddev); + + persistent = !mddev->sb->not_persistent; + if (!persistent) + printk("nonpersistent superblock ...\n"); + if (!mddev->sb->chunk_size) + printk("no chunksize?\n"); + + size = calc_dev_size(dev, mddev, persistent); + rdev->sb_offset = calc_dev_sboffset(dev, mddev, persistent); + + if (!mddev->sb->size || (mddev->sb->size > size)) + mddev->sb->size = size; + } + + /* + * sync all other superblocks with the main superblock + */ + sync_sbs(mddev); + + return 0; +} +#undef SET_SB + +static int hot_remove_disk (mddev_t * mddev, kdev_t dev) +{ + int err; + mdk_rdev_t *rdev; + mdp_disk_t *disk; + + if (!mddev->pers) + return -ENODEV; + + printk("trying to remove %s from md%d ... \n", + partition_name(dev), mdidx(mddev)); + + if (!mddev->pers->diskop) { + printk("md%d: personality does not support diskops!\n", + mdidx(mddev)); + return -EINVAL; + } + + rdev = find_rdev(mddev, dev); + if (!rdev) + return -ENXIO; + + if (rdev->desc_nr == -1) { + MD_BUG(); + return -EINVAL; + } + disk = &mddev->sb->disks[rdev->desc_nr]; + if (disk_active(disk)) + goto busy; + if (disk_removed(disk)) { + MD_BUG(); + return -EINVAL; + } + + err = mddev->pers->diskop(mddev, &disk, DISKOP_HOT_REMOVE_DISK); + if (err == -EBUSY) + goto busy; + if (err) { + MD_BUG(); + return -EINVAL; + } + + remove_descriptor(disk, mddev->sb); + kick_rdev_from_array(rdev); + mddev->sb_dirty = 1; + md_update_sb(mddev); + + return 0; +busy: + printk("cannot remove active disk %s from md%d ... \n", + partition_name(dev), mdidx(mddev)); + return -EBUSY; +} + +static int hot_add_disk (mddev_t * mddev, kdev_t dev) +{ + int i, err, persistent; + unsigned int size; + mdk_rdev_t *rdev; + mdp_disk_t *disk; + + if (!mddev->pers) + return -ENODEV; + + printk("trying to hot-add %s to md%d ... \n", + partition_name(dev), mdidx(mddev)); + + if (!mddev->pers->diskop) { + printk("md%d: personality does not support diskops!\n", + mdidx(mddev)); + return -EINVAL; + } + + persistent = !mddev->sb->not_persistent; + size = calc_dev_size(dev, mddev, persistent); + + if (size < mddev->sb->size) { + printk("md%d: disk size %d blocks < array size %d\n", + mdidx(mddev), size, mddev->sb->size); + return -ENOSPC; + } + + rdev = find_rdev(mddev, dev); + if (rdev) + return -EBUSY; + + err = md_import_device (dev, 0); + if (err) { + printk("md: error, md_import_device() returned %d\n", err); + return -EINVAL; + } + rdev = find_rdev_all(dev); + if (!rdev) { + MD_BUG(); + return -EINVAL; + } + if (rdev->faulty) { + printk("md: can not hot-add faulty %s disk to md%d!\n", + partition_name(dev), mdidx(mddev)); + err = -EINVAL; + goto abort_export; + } + bind_rdev_to_array(rdev, mddev); + + /* + * The rest should better be atomic, we can have disk failures + * noticed in interrupt contexts ... 
+ */ + rdev->old_dev = dev; + rdev->size = size; + rdev->sb_offset = calc_dev_sboffset(dev, mddev, persistent); + + disk = mddev->sb->disks + mddev->sb->raid_disks; + for (i = mddev->sb->raid_disks; i < MD_SB_DISKS; i++) { + disk = mddev->sb->disks + i; + + if (!disk->major && !disk->minor) + break; + if (disk_removed(disk)) + break; + } + if (i == MD_SB_DISKS) { + printk("md%d: can not hot-add to full array!\n", mdidx(mddev)); + err = -EBUSY; + goto abort_unbind_export; + } + + if (disk_removed(disk)) { + /* + * reuse slot + */ + if (disk->number != i) { + MD_BUG(); + err = -EINVAL; + goto abort_unbind_export; + } + } else { + disk->number = i; + } + + disk->raid_disk = disk->number; + disk->major = MAJOR(dev); + disk->minor = MINOR(dev); + + if (mddev->pers->diskop(mddev, &disk, DISKOP_HOT_ADD_DISK)) { + MD_BUG(); + err = -EINVAL; + goto abort_unbind_export; + } + + mark_disk_spare(disk); + mddev->sb->nr_disks++; + mddev->sb->spare_disks++; + mddev->sb->working_disks++; + + mddev->sb_dirty = 1; + + md_update_sb(mddev); + + /* + * Kick recovery, maybe this spare has to be added to the + * array immediately. + */ + md_recover_arrays(); + + return 0; + +abort_unbind_export: + unbind_rdev_from_array(rdev); + +abort_export: + export_rdev(rdev); + return err; +} + +#define SET_SB(x) mddev->sb->x = info->x +static int set_array_info (mddev_t * mddev, mdu_array_info_t *info) +{ + + if (alloc_array_sb(mddev)) + return -ENOMEM; + + mddev->sb->major_version = MD_MAJOR_VERSION; + mddev->sb->minor_version = MD_MINOR_VERSION; + mddev->sb->patch_version = MD_PATCHLEVEL_VERSION; + mddev->sb->ctime = CURRENT_TIME; + + SET_SB(level); + SET_SB(size); + SET_SB(nr_disks); + SET_SB(raid_disks); + SET_SB(md_minor); + SET_SB(not_persistent); + + SET_SB(state); + SET_SB(active_disks); + SET_SB(working_disks); + SET_SB(failed_disks); + SET_SB(spare_disks); + + SET_SB(layout); + SET_SB(chunk_size); + + mddev->sb->md_magic = MD_SB_MAGIC; + + /* + * Generate a 128 bit UUID + */ + get_random_bytes(&mddev->sb->set_uuid0, 4); + get_random_bytes(&mddev->sb->set_uuid1, 4); + get_random_bytes(&mddev->sb->set_uuid2, 4); + get_random_bytes(&mddev->sb->set_uuid3, 4); + + return 0; +} +#undef SET_SB + +static int set_disk_info (mddev_t * mddev, void * arg) +{ + printk("not yet"); + return -EINVAL; +} + +static int clear_array (mddev_t * mddev) +{ + printk("not yet"); + return -EINVAL; +} + +static int write_raid_info (mddev_t * mddev) +{ + printk("not yet"); + return -EINVAL; +} + +static int protect_array (mddev_t * mddev) +{ + printk("not yet"); + return -EINVAL; +} + +static int unprotect_array (mddev_t * mddev) +{ + printk("not yet"); + return -EINVAL; +} + +static int set_disk_faulty (mddev_t *mddev, kdev_t dev) +{ + int ret; + + fsync_dev(mddev_to_kdev(mddev)); + ret = md_error(mddev_to_kdev(mddev), dev); + return ret; +} + +static int md_ioctl (struct inode *inode, struct file *file, + unsigned int cmd, unsigned long arg) +{ + unsigned int minor; + int err = 0; + struct hd_geometry *loc = (struct hd_geometry *) arg; + mddev_t *mddev = NULL; + kdev_t dev; + + if (!md_capable_admin()) + return -EACCES; + + dev = inode->i_rdev; + minor = MINOR(dev); + if (minor >= MAX_MD_DEVS) + return -EINVAL; + + /* + * Commands dealing with the RAID driver but not any + * particular array: + */ + switch (cmd) + { + case RAID_VERSION: + err = get_version((void *)arg); + goto done; + + case PRINT_RAID_DEBUG: + err = 0; + md_print_devices(); + goto done_unlock; + + case BLKGETSIZE: /* Return device size */ + if (!arg) { + err = -EINVAL; + 
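set_array_info() above builds the 128-bit array UUID from four independently drawn 32-bit words (set_uuid0 through set_uuid3). A user-space sketch of the same idea, with /dev/urandom standing in for the kernel's get_random_bytes():

/* four random 32-bit words forming a 128-bit array UUID, standalone sketch */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint32_t uuid[4];
	FILE *f = fopen("/dev/urandom", "rb");

	if (!f) {
		perror("/dev/urandom");
		return 1;
	}
	if (fread(uuid, sizeof(uuid), 1, f) != 1) {
		perror("read");
		fclose(f);
		return 1;
	}
	fclose(f);

	printf("set_uuid: %08x:%08x:%08x:%08x\n",
	       uuid[0], uuid[1], uuid[2], uuid[3]);
	return 0;
}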
goto abort; + } + err = md_put_user(md_hd_struct[minor].nr_sects, + (long *) arg); + goto done; + + case BLKFLSBUF: + fsync_dev(dev); + invalidate_buffers(dev); + goto done; + + case BLKRASET: + if (arg > 0xff) { + err = -EINVAL; + goto abort; + } + read_ahead[MAJOR(dev)] = arg; + goto done; + + case BLKRAGET: + if (!arg) { + err = -EINVAL; + goto abort; + } + err = md_put_user (read_ahead[ + MAJOR(dev)], (long *) arg); + goto done; + default: + } + + /* + * Commands creating/starting a new array: + */ + + mddev = kdev_to_mddev(dev); + + switch (cmd) + { + case SET_ARRAY_INFO: + case START_ARRAY: + if (mddev) { + printk("array md%d already exists!\n", + mdidx(mddev)); + err = -EEXIST; + goto abort; + } + default: + } + switch (cmd) + { + case SET_ARRAY_INFO: + mddev = alloc_mddev(dev); + if (!mddev) { + err = -ENOMEM; + goto abort; + } + /* + * alloc_mddev() should possibly self-lock. + */ + err = lock_mddev(mddev); + if (err) { + printk("ioctl, reason %d, cmd %d\n", err, cmd); + goto abort; + } + + if (mddev->sb) { + printk("array md%d already has a superblock!\n", + mdidx(mddev)); + err = -EBUSY; + goto abort_unlock; + } + if (arg) { + mdu_array_info_t info; + if (md_copy_from_user(&info, (void*)arg, sizeof(info))) { + err = -EFAULT; + goto abort_unlock; + } + err = set_array_info(mddev, &info); + if (err) { + printk("couldnt set array info. %d\n", err); + goto abort_unlock; + } + } + goto done_unlock; + + case START_ARRAY: + /* + * possibly make it lock the array ... + */ + err = autostart_array((kdev_t)arg); + if (err) { + printk("autostart %s failed!\n", + partition_name((kdev_t)arg)); + goto abort; + } + goto done; + + default: + } + + /* + * Commands querying/configuring an existing array: + */ + + if (!mddev) { + err = -ENODEV; + goto abort; + } + err = lock_mddev(mddev); + if (err) { + printk("ioctl lock interrupted, reason %d, cmd %d\n",err, cmd); + goto abort; + } + /* if we don't have a superblock yet, only ADD_NEW_DISK or STOP_ARRAY is allowed */ + if (!mddev->sb && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY && cmd != RUN_ARRAY) { + err = -ENODEV; + goto abort_unlock; + } + + /* + * Commands even a read-only array can execute: + */ + switch (cmd) + { + case GET_ARRAY_INFO: + err = get_array_info(mddev, (void *)arg); + goto done_unlock; + + case GET_DISK_INFO: + err = get_disk_info(mddev, (void *)arg); + goto done_unlock; + + case RESTART_ARRAY_RW: + err = restart_array(mddev); + goto done_unlock; + + case STOP_ARRAY: + if (!(err = do_md_stop (mddev, 0))) + mddev = NULL; + goto done_unlock; + + case STOP_ARRAY_RO: + err = do_md_stop (mddev, 1); + goto done_unlock; + + /* + * We have a problem here : there is no easy way to give a CHS + * virtual geometry. We currently pretend that we have a 2 heads + * 4 sectors (with a BIG number of cylinders...). This drives + * dosfs just mad... 
;-) + */ + case HDIO_GETGEO: + if (!loc) { + err = -EINVAL; + goto abort_unlock; + } + err = md_put_user (2, (char *) &loc->heads); + if (err) + goto abort_unlock; + err = md_put_user (4, (char *) &loc->sectors); + if (err) + goto abort_unlock; + err = md_put_user (md_hd_struct[mdidx(mddev)].nr_sects/8, + (short *) &loc->cylinders); + if (err) + goto abort_unlock; + err = md_put_user (md_hd_struct[minor].start_sect, + (long *) &loc->start); + goto done_unlock; + } + + /* + * The remaining ioctls are changing the state of the + * superblock, so we do not allow read-only arrays + * here: + */ + if (mddev->ro) { + err = -EROFS; + goto abort_unlock; + } + + switch (cmd) + { + case CLEAR_ARRAY: + err = clear_array(mddev); + goto done_unlock; + + case ADD_NEW_DISK: + { + mdu_disk_info_t info; + if (md_copy_from_user(&info, (void*)arg, sizeof(info))) + err = -EFAULT; + else + err = add_new_disk(mddev, &info); + goto done_unlock; + } + case HOT_REMOVE_DISK: + err = hot_remove_disk(mddev, (kdev_t)arg); + goto done_unlock; + + case HOT_ADD_DISK: + err = hot_add_disk(mddev, (kdev_t)arg); + goto done_unlock; + + case SET_DISK_INFO: + err = set_disk_info(mddev, (void *)arg); + goto done_unlock; + + case WRITE_RAID_INFO: + err = write_raid_info(mddev); + goto done_unlock; + + case UNPROTECT_ARRAY: + err = unprotect_array(mddev); + goto done_unlock; + + case PROTECT_ARRAY: + err = protect_array(mddev); + goto done_unlock; + + case SET_DISK_FAULTY: + err = set_disk_faulty(mddev, (kdev_t)arg); + goto done_unlock; + + case RUN_ARRAY: + { +/* The data is never used.... + mdu_param_t param; + err = md_copy_from_user(¶m, (mdu_param_t *)arg, + sizeof(param)); + if (err) + goto abort_unlock; +*/ + err = do_md_run (mddev); + /* + * we have to clean up the mess if + * the array cannot be run for some + * reason ... + */ + if (err) { + mddev->sb_dirty = 0; + if (!do_md_stop (mddev, 0)) + mddev = NULL; + } + goto done_unlock; + } + + default: + printk(KERN_WARNING "%s(pid %d) used obsolete MD ioctl, upgrade your software to use new ictls.\n", current->comm, current->pid); + err = -EINVAL; + goto abort_unlock; + } + +done_unlock: +abort_unlock: + if (mddev) + unlock_mddev(mddev); + + return err; +done: + if (err) + printk("huh12?\n"); +abort: + return err; +} + +static int md_open (struct inode *inode, struct file *file) +{ + /* + * Always succeed + */ + return (0); +} + +static struct block_device_operations md_fops= +{ + open: md_open, + ioctl: md_ioctl, +}; + + +int md_thread(void * arg) +{ + mdk_thread_t *thread = arg; + + md_lock_kernel(); + exit_mm(current); + exit_files(current); + exit_fs(current); + + /* + * Detach thread + */ + daemonize(); + sprintf(current->comm, thread->name); + md_init_signals(); + md_flush_signals(); + thread->tsk = current; + + /* + * md_thread is a 'system-thread', it's priority should be very + * high. We avoid resource deadlocks individually in each + * raid personality. (RAID5 does preallocation) We also use RR and + * the very same RT priority as kswapd, thus we will never get + * into a priority inversion deadlock. + * + * we definitely have to have equal or higher priority than + * bdflush, otherwise bdflush will deadlock if there are too + * many dirty RAID5 blocks. 
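The HDIO_GETGEO handler above reports a synthetic geometry of 2 heads and 4 sectors per track, so the cylinder count is simply the total sector count divided by 8. In standalone form, with an invented array size:

/* the fake CHS geometry handed out for md devices, standalone sketch */
#include <stdio.h>

int main(void)
{
	unsigned long nr_sects  = 8388608;	/* 4 GB array in 512-byte sectors */
	unsigned int  heads     = 2;
	unsigned int  sectors   = 4;
	unsigned long cylinders = nr_sects / (heads * sectors);

	printf("CHS = %lu/%u/%u\n", cylinders, heads, sectors);
	return 0;
}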
+ */ + current->policy = SCHED_OTHER; + current->nice = -20; +// md_unlock_kernel(); + + up(thread->sem); + + for (;;) { + DECLARE_WAITQUEUE(wait, current); + + add_wait_queue(&thread->wqueue, &wait); + set_task_state(current, TASK_INTERRUPTIBLE); + if (!test_bit(THREAD_WAKEUP, &thread->flags)) { + dprintk("thread %p went to sleep.\n", thread); + schedule(); + dprintk("thread %p woke up.\n", thread); + } + current->state = TASK_RUNNING; + remove_wait_queue(&thread->wqueue, &wait); + clear_bit(THREAD_WAKEUP, &thread->flags); + + if (thread->run) { + thread->run(thread->data); + run_task_queue(&tq_disk); + } else + break; + if (md_signal_pending(current)) { + printk("%8s(%d) flushing signals.\n", current->comm, + current->pid); + md_flush_signals(); + } + } + up(thread->sem); + return 0; +} + +void md_wakeup_thread(mdk_thread_t *thread) +{ + dprintk("waking up MD thread %p.\n", thread); + set_bit(THREAD_WAKEUP, &thread->flags); + wake_up(&thread->wqueue); +} + +mdk_thread_t *md_register_thread (void (*run) (void *), + void *data, const char *name) +{ + mdk_thread_t *thread; + int ret; + DECLARE_MUTEX_LOCKED(sem); + + thread = (mdk_thread_t *) kmalloc + (sizeof(mdk_thread_t), GFP_KERNEL); + if (!thread) + return NULL; + + memset(thread, 0, sizeof(mdk_thread_t)); + md_init_waitqueue_head(&thread->wqueue); + + thread->sem = &sem; + thread->run = run; + thread->data = data; + thread->name = name; + ret = kernel_thread(md_thread, thread, 0); + if (ret < 0) { + kfree(thread); + return NULL; + } + down(&sem); + return thread; +} + +void md_interrupt_thread (mdk_thread_t *thread) +{ + if (!thread->tsk) { + MD_BUG(); + return; + } + printk("interrupting MD-thread pid %d\n", thread->tsk->pid); + send_sig(SIGKILL, thread->tsk, 1); +} + +void md_unregister_thread (mdk_thread_t *thread) +{ + DECLARE_MUTEX_LOCKED(sem); + + thread->sem = &sem; + thread->run = NULL; + thread->name = NULL; + if (!thread->tsk) { + MD_BUG(); + return; + } + md_interrupt_thread(thread); + down(&sem); +} + +void md_recover_arrays (void) +{ + if (!md_recovery_thread) { + MD_BUG(); + return; + } + md_wakeup_thread(md_recovery_thread); +} + + +int md_error (kdev_t dev, kdev_t rdev) +{ + mddev_t *mddev; + mdk_rdev_t * rrdev; + int rc; + + mddev = kdev_to_mddev(dev); +/* printk("md_error dev:(%d:%d), rdev:(%d:%d), (caller: %p,%p,%p,%p).\n",MAJOR(dev),MINOR(dev),MAJOR(rdev),MINOR(rdev), __builtin_return_address(0),__builtin_return_address(1),__builtin_return_address(2),__builtin_return_address(3)); + */ + if (!mddev) { + MD_BUG(); + return 0; + } + rrdev = find_rdev(mddev, rdev); + mark_rdev_faulty(rrdev); + /* + * if recovery was running, stop it now. + */ + if (mddev->pers->stop_resync) + mddev->pers->stop_resync(mddev); + if (mddev->recovery_running) + md_interrupt_thread(md_recovery_thread); + if (mddev->pers->error_handler) { + rc = mddev->pers->error_handler(mddev, rdev); + md_recover_arrays(); + return rc; + } + return 0; +} + +static int status_unused (char * page) +{ + int sz = 0, i = 0; + mdk_rdev_t *rdev; + struct md_list_head *tmp; + + sz += sprintf(page + sz, "unused devices: "); + + ITERATE_RDEV_ALL(rdev,tmp) { + if (!rdev->same_set.next && !rdev->same_set.prev) { + /* + * The device is not yet used by any array. 
+ */ + i++; + sz += sprintf(page + sz, "%s ", + partition_name(rdev->dev)); + } + } + if (!i) + sz += sprintf(page + sz, "<none>"); + + sz += sprintf(page + sz, "\n"); + return sz; +} + + +static int status_resync (char * page, mddev_t * mddev) +{ + int sz = 0; + unsigned long max_blocks, resync, res, dt, db, rt; + + resync = mddev->curr_resync - atomic_read(&mddev->recovery_active); + max_blocks = mddev->sb->size; + + /* + * Should not happen. + */ + if (!max_blocks) { + MD_BUG(); + return 0; + } + res = (resync/1024)*1000/(max_blocks/1024 + 1); + { + int i, x = res/50, y = 20-x; + sz += sprintf(page + sz, "["); + for (i = 0; i < x; i++) + sz += sprintf(page + sz, "="); + sz += sprintf(page + sz, ">"); + for (i = 0; i < y; i++) + sz += sprintf(page + sz, "."); + sz += sprintf(page + sz, "] "); + } + if (!mddev->recovery_running) + /* + * true resync + */ + sz += sprintf(page + sz, " resync =%3lu.%lu%% (%lu/%lu)", + res/10, res % 10, resync, max_blocks); + else + /* + * recovery ... + */ + sz += sprintf(page + sz, " recovery =%3lu.%lu%% (%lu/%lu)", + res/10, res % 10, resync, max_blocks); + + /* + * We do not want to overflow, so the order of operands and + * the * 100 / 100 trick are important. We do a +1 to be + * safe against division by zero. We only estimate anyway. + * + * dt: time from mark until now + * db: blocks written from mark until now + * rt: remaining time + */ + dt = ((jiffies - mddev->resync_mark) / HZ); + if (!dt) dt++; + db = resync - mddev->resync_mark_cnt; + rt = (dt * ((max_blocks-resync) / (db/100+1)))/100; + + sz += sprintf(page + sz, " finish=%lu.%lumin", rt / 60, (rt % 60)/6); + + sz += sprintf(page + sz, " speed=%ldK/sec", db/dt); + + return sz; +} + +static int md_status_read_proc(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + int sz = 0, j, size; + struct md_list_head *tmp, *tmp2; + mdk_rdev_t *rdev; + mddev_t *mddev; + + sz += sprintf(page + sz, "Personalities : "); + for (j = 0; j < MAX_PERSONALITY; j++) + if (pers[j]) + sz += sprintf(page+sz, "[%s] ", pers[j]->name); + + sz += sprintf(page+sz, "\n"); + + + sz += sprintf(page+sz, "read_ahead "); + if (read_ahead[MD_MAJOR] == INT_MAX) + sz += sprintf(page+sz, "not set\n"); + else + sz += sprintf(page+sz, "%d sectors\n", read_ahead[MD_MAJOR]); + + ITERATE_MDDEV(mddev,tmp) { + sz += sprintf(page + sz, "md%d : %sactive", mdidx(mddev), + mddev->pers ? 
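The /proc/mdstat arithmetic in status_resync() above -- per-mille progress, a 20-character bar, and an ETA computed from the blocks written since the last mark -- reduces to a few integer expressions. A user-space sketch with sample numbers (all values below are invented):

/* resync progress bar, percentage, ETA and speed, standalone sketch */
#include <stdio.h>

int main(void)
{
	unsigned long max_blocks = 4000000;	/* total 1K blocks to sync */
	unsigned long resync     = 1500000;	/* blocks done so far */
	unsigned long dt         = 600;		/* seconds since the resync mark */
	unsigned long db         = 900000;	/* blocks done since the mark */

	unsigned long res = (resync / 1024) * 1000 / (max_blocks / 1024 + 1);
	unsigned long rt  = (dt * ((max_blocks - resync) / (db / 100 + 1))) / 100;
	int x = res / 50, y = 20 - x, i;

	printf("[");
	for (i = 0; i < x; i++) printf("=");
	printf(">");
	for (i = 0; i < y; i++) printf(".");
	printf("] %lu.%lu%% finish=%lu.%lumin speed=%luK/sec\n",
	       res / 10, res % 10, rt / 60, (rt % 60) / 6, db / dt);
	return 0;
}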
"" : "in"); + if (mddev->pers) { + if (mddev->ro) + sz += sprintf(page + sz, " (read-only)"); + sz += sprintf(page + sz, " %s", mddev->pers->name); + } + + size = 0; + ITERATE_RDEV(mddev,rdev,tmp2) { + sz += sprintf(page + sz, " %s[%d]", + partition_name(rdev->dev), rdev->desc_nr); + if (rdev->faulty) { + sz += sprintf(page + sz, "(F)"); + continue; + } + size += rdev->size; + } + + if (mddev->nb_dev) { + if (mddev->pers) + sz += sprintf(page + sz, "\n %d blocks", + md_size[mdidx(mddev)]); + else + sz += sprintf(page + sz, "\n %d blocks", size); + } + + if (!mddev->pers) { + sz += sprintf(page+sz, "\n"); + continue; + } + + sz += mddev->pers->status (page+sz, mddev); + + sz += sprintf(page+sz, "\n "); + if (mddev->curr_resync) { + sz += status_resync (page+sz, mddev); + } else { + if (md_atomic_read(&mddev->resync_sem.count) != 1) + sz += sprintf(page + sz, " resync=DELAYED"); + } + sz += sprintf(page + sz, "\n"); + } + sz += status_unused (page + sz); + + return sz; +} + +int register_md_personality (int pnum, mdk_personality_t *p) +{ + if (pnum >= MAX_PERSONALITY) + return -EINVAL; + + if (pers[pnum]) + return -EBUSY; + + pers[pnum] = p; + printk(KERN_INFO "%s personality registered\n", p->name); + return 0; +} + +int unregister_md_personality (int pnum) +{ + if (pnum >= MAX_PERSONALITY) + return -EINVAL; + + printk(KERN_INFO "%s personality unregistered\n", pers[pnum]->name); + pers[pnum] = NULL; + return 0; +} + +static mdp_disk_t *get_spare(mddev_t *mddev) +{ + mdp_super_t *sb = mddev->sb; + mdp_disk_t *disk; + mdk_rdev_t *rdev; + struct md_list_head *tmp; + + ITERATE_RDEV(mddev,rdev,tmp) { + if (rdev->faulty) + continue; + if (!rdev->sb) { + MD_BUG(); + continue; + } + disk = &sb->disks[rdev->desc_nr]; + if (disk_faulty(disk)) { + MD_BUG(); + continue; + } + if (disk_active(disk)) + continue; + return disk; + } + return NULL; +} + +static unsigned int sync_io[DK_MAX_MAJOR][DK_MAX_DISK]; +void md_sync_acct(kdev_t dev, unsigned long nr_sectors) +{ + unsigned int major = MAJOR(dev); + unsigned int index; + + index = disk_index(dev); + if ((index >= DK_MAX_DISK) || (major >= DK_MAX_MAJOR)) + return; + + sync_io[major][index] += nr_sectors; +} + +static int is_mddev_idle (mddev_t *mddev) +{ + mdk_rdev_t * rdev; + struct md_list_head *tmp; + int idle; + unsigned long curr_events; + + idle = 1; + ITERATE_RDEV(mddev,rdev,tmp) { + int major = MAJOR(rdev->dev); + int idx = disk_index(rdev->dev); + + if ((idx >= DK_MAX_DISK) || (major >= DK_MAX_MAJOR)) + continue; + + curr_events = kstat.dk_drive_rblk[major][idx] + + kstat.dk_drive_wblk[major][idx] ; + curr_events -= sync_io[major][idx]; +// printk("events(major: %d, idx: %d): %ld\n", major, idx, curr_events); + if (curr_events != rdev->last_events) { +// printk("!I(%ld)", curr_events - rdev->last_events); + rdev->last_events = curr_events; + idle = 0; + } + } + return idle; +} + +MD_DECLARE_WAIT_QUEUE_HEAD(resync_wait); + +void md_done_sync(mddev_t *mddev, int blocks, int ok) +{ + /* another "blocks" (1K) blocks have been synced */ + atomic_sub(blocks, &mddev->recovery_active); + wake_up(&mddev->recovery_wait); + if (!ok) { + // stop recovery, signal do_sync .... 
+ } +} + +#define SYNC_MARKS 10 +#define SYNC_MARK_STEP (3*HZ) +int md_do_sync(mddev_t *mddev, mdp_disk_t *spare) +{ + mddev_t *mddev2; + unsigned int max_blocks, currspeed, + j, window, err, serialize; + kdev_t read_disk = mddev_to_kdev(mddev); + unsigned long mark[SYNC_MARKS]; + unsigned long mark_cnt[SYNC_MARKS]; + int last_mark,m; + struct md_list_head *tmp; + unsigned long last_check; + + + err = down_interruptible(&mddev->resync_sem); + if (err) + goto out_nolock; + +recheck: + serialize = 0; + ITERATE_MDDEV(mddev2,tmp) { + if (mddev2 == mddev) + continue; + if (mddev2->curr_resync && match_mddev_units(mddev,mddev2)) { + printk(KERN_INFO "md: serializing resync, md%d has overlapping physical units with md%d!\n", mdidx(mddev), mdidx(mddev2)); + serialize = 1; + break; + } + } + if (serialize) { + interruptible_sleep_on(&resync_wait); + if (md_signal_pending(current)) { + md_flush_signals(); + err = -EINTR; + goto out; + } + goto recheck; + } + + mddev->curr_resync = 1; + + max_blocks = mddev->sb->size; + + printk(KERN_INFO "md: syncing RAID array md%d\n", mdidx(mddev)); + printk(KERN_INFO "md: minimum _guaranteed_ reconstruction speed: %d KB/sec/disc.\n", + sysctl_speed_limit_min); + printk(KERN_INFO "md: using maximum available idle IO bandwith (but not more than %d KB/sec) for reconstruction.\n", sysctl_speed_limit_max); + + /* + * Resync has low priority. + */ + current->nice = 19; + + is_mddev_idle(mddev); /* this also initializes IO event counters */ + for (m = 0; m < SYNC_MARKS; m++) { + mark[m] = jiffies; + mark_cnt[m] = 0; + } + last_mark = 0; + mddev->resync_mark = mark[last_mark]; + mddev->resync_mark_cnt = mark_cnt[last_mark]; + + /* + * Tune reconstruction: + */ + window = MAX_READAHEAD*(PAGE_SIZE/1024); + printk(KERN_INFO "md: using %dk window, over a total of %d blocks.\n",window,max_blocks); + + atomic_set(&mddev->recovery_active, 0); + init_waitqueue_head(&mddev->recovery_wait); + last_check = 0; + for (j = 0; j < max_blocks;) { + int blocks; + + blocks = mddev->pers->sync_request(mddev, j); + + if (blocks < 0) { + err = blocks; + goto out; + } + atomic_add(blocks, &mddev->recovery_active); + j += blocks; + mddev->curr_resync = j; + + if (last_check + window > j) + continue; + + run_task_queue(&tq_disk); //?? + + if (jiffies >= mark[last_mark] + SYNC_MARK_STEP ) { + /* step marks */ + int next = (last_mark+1) % SYNC_MARKS; + + mddev->resync_mark = mark[next]; + mddev->resync_mark_cnt = mark_cnt[next]; + mark[next] = jiffies; + mark_cnt[next] = j - atomic_read(&mddev->recovery_active); + last_mark = next; + } + + + if (md_signal_pending(current)) { + /* + * got a signal, exit. + */ + mddev->curr_resync = 0; + printk("md_do_sync() got signal ... exiting\n"); + md_flush_signals(); + err = -EINTR; + goto out; + } + + /* + * this loop exits only if either when we are slower than + * the 'hard' speed limit, or the system was IO-idle for + * a jiffy. + * the system might be non-idle CPU-wise, but we only care + * about not overloading the IO subsystem. 
(things like an + * e2fsck being done on the RAID array should execute fast) + */ +repeat: + if (md_need_resched(current)) + schedule(); + + currspeed = (j-mddev->resync_mark_cnt)/((jiffies-mddev->resync_mark)/HZ +1) +1; + + if (currspeed > sysctl_speed_limit_min) { + current->nice = 19; + + if ((currspeed > sysctl_speed_limit_max) || + !is_mddev_idle(mddev)) { + current->state = TASK_INTERRUPTIBLE; + md_schedule_timeout(HZ/4); + if (!md_signal_pending(current)) + goto repeat; + } + } else + current->nice = -20; + } + fsync_dev(read_disk); + printk(KERN_INFO "md: md%d: sync done.\n",mdidx(mddev)); + err = 0; + /* + * this also signals 'finished resyncing' to md_stop + */ +out: + wait_event(mddev->recovery_wait, atomic_read(&mddev->recovery_active)==0); + up(&mddev->resync_sem); +out_nolock: + mddev->curr_resync = 0; + wake_up(&resync_wait); + return err; +} + + +/* + * This is a kernel thread which syncs a spare disk with the active array + * + * the amount of foolproofing might seem to be a tad excessive, but an + * early (not so error-safe) version of raid1syncd synced the first 0.5 gigs + * of my root partition with the first 0.5 gigs of my /home partition ... so + * i'm a bit nervous ;) + */ +void md_do_recovery (void *data) +{ + int err; + mddev_t *mddev; + mdp_super_t *sb; + mdp_disk_t *spare; + struct md_list_head *tmp; + + printk(KERN_INFO "md: recovery thread got woken up ...\n"); +restart: + ITERATE_MDDEV(mddev,tmp) { + sb = mddev->sb; + if (!sb) + continue; + if (mddev->recovery_running) + continue; + if (sb->active_disks == sb->raid_disks) + continue; + if (!sb->spare_disks) { + printk(KERN_ERR "md%d: no spare disk to reconstruct array! -- continuing in degraded mode\n", mdidx(mddev)); + continue; + } + /* + * now here we get the spare and resync it. + */ + if ((spare = get_spare(mddev)) == NULL) + continue; + printk(KERN_INFO "md%d: resyncing spare disk %s to replace failed disk\n", mdidx(mddev), partition_name(MKDEV(spare->major,spare->minor))); + if (!mddev->pers->diskop) + continue; + if (mddev->pers->diskop(mddev, &spare, DISKOP_SPARE_WRITE)) + continue; + down(&mddev->recovery_sem); + mddev->recovery_running = 1; + err = md_do_sync(mddev, spare); + if (err == -EIO) { + printk(KERN_INFO "md%d: spare disk %s failed, skipping to next spare.\n", mdidx(mddev), partition_name(MKDEV(spare->major,spare->minor))); + if (!disk_faulty(spare)) { + mddev->pers->diskop(mddev,&spare,DISKOP_SPARE_INACTIVE); + mark_disk_faulty(spare); + mark_disk_nonsync(spare); + mark_disk_inactive(spare); + sb->spare_disks--; + sb->working_disks--; + sb->failed_disks++; + } + } else + if (disk_faulty(spare)) + mddev->pers->diskop(mddev, &spare, + DISKOP_SPARE_INACTIVE); + if (err == -EINTR || err == -ENOMEM) { + /* + * Recovery got interrupted, or ran out of mem ... + * signal back that we have finished using the array. 
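The throttling loop above computes the current resync speed over the mark window and then keeps it between the guaranteed minimum and the idle-time maximum: below the floor it runs at high priority, above the ceiling (or whenever the member disks see other I/O) it backs off and sleeps briefly. A standalone sketch of that decision, with invented limits and counters:

/* resync speed throttling decision, standalone sketch */
#include <stdio.h>

int main(void)
{
	unsigned long j               = 1200000;	/* blocks synced so far */
	unsigned long resync_mark_cnt = 1100000;	/* blocks at the last mark */
	unsigned long secs_since_mark = 20;
	unsigned int  speed_limit_min = 100;		/* KB/sec, guaranteed floor */
	unsigned int  speed_limit_max = 100000;		/* KB/sec, idle-time ceiling */

	unsigned long currspeed =
		(j - resync_mark_cnt) / (secs_since_mark + 1) + 1;

	printf("current speed: %lu KB/sec\n", currspeed);
	if (currspeed <= speed_limit_min)
		printf("below the floor: run at full priority\n");
	else if (currspeed > speed_limit_max)
		printf("above the ceiling: back off and sleep a bit\n");
	else
		printf("in between: keep going, yield if the disks are busy\n");
	return 0;
}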
+ */ + mddev->pers->diskop(mddev, &spare, + DISKOP_SPARE_INACTIVE); + up(&mddev->recovery_sem); + mddev->recovery_running = 0; + continue; + } else { + mddev->recovery_running = 0; + up(&mddev->recovery_sem); + } + if (!disk_faulty(spare)) { + /* + * the SPARE_ACTIVE diskop possibly changes the + * pointer too + */ + mddev->pers->diskop(mddev, &spare, DISKOP_SPARE_ACTIVE); + mark_disk_sync(spare); + mark_disk_active(spare); + sb->active_disks++; + sb->spare_disks--; + } + mddev->sb_dirty = 1; + md_update_sb(mddev); + goto restart; + } + printk(KERN_INFO "md: recovery thread finished ...\n"); + +} + +int md_notify_reboot(struct notifier_block *this, + unsigned long code, void *x) +{ + struct md_list_head *tmp; + mddev_t *mddev; + + if ((code == MD_SYS_DOWN) || (code == MD_SYS_HALT) + || (code == MD_SYS_POWER_OFF)) { + + printk(KERN_INFO "stopping all md devices.\n"); + + ITERATE_MDDEV(mddev,tmp) + do_md_stop (mddev, 1); + /* + * certain more exotic SCSI devices are known to be + * volatile wrt too early system reboots. While the + * right place to handle this issue is the given + * driver, we do want to have a safe RAID driver ... + */ + md_mdelay(1000*1); + } + return NOTIFY_DONE; +} + +struct notifier_block md_notifier = { + md_notify_reboot, + NULL, + 0 +}; +#ifndef MODULE +static int md__init raid_setup(char *str) +{ + int len, pos; + + len = strlen(str) + 1; + pos = 0; + + while (pos < len) { + char *comma = strchr(str+pos, ','); + int wlen; + if (comma) + wlen = (comma-str)-pos; + else wlen = (len-1)-pos; + + if (strncmp(str, "noautodetect", wlen) == 0) + raid_setup_args.noautodetect = 1; + pos += wlen+1; + } + raid_setup_args.set = 1; + return 1; +} +__setup("raid=", raid_setup); +#endif +static void md_geninit (void) +{ + int i; + + for(i = 0; i < MAX_MD_DEVS; i++) { + md_blocksizes[i] = 1024; + md_size[i] = 0; + md_hardsect_sizes[i] = 512; + md_maxreadahead[i] = MD_READAHEAD; + register_disk(&md_gendisk, MKDEV(MAJOR_NR,i), 1, &md_fops, 0); + } + blksize_size[MAJOR_NR] = md_blocksizes; + blk_size[MAJOR_NR] = md_size; + max_readahead[MAJOR_NR] = md_maxreadahead; + hardsect_size[MAJOR_NR] = md_hardsect_sizes; + + printk("md.c: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t)); + +#ifdef CONFIG_PROC_FS + create_proc_read_entry("mdstat", 0, NULL, md_status_read_proc, NULL); +#endif +} +void hsm_init (void); +void translucent_init (void); +void linear_init (void); +void raid0_init (void); +void raid1_init (void); +void raid5_init (void); + +int md__init md_init (void) +{ + static char * name = "mdrecoveryd"; + + printk (KERN_INFO "md driver %d.%d.%d MAX_MD_DEVS=%d, MAX_REAL=%d\n", + MD_MAJOR_VERSION, MD_MINOR_VERSION, + MD_PATCHLEVEL_VERSION, MAX_MD_DEVS, MAX_REAL); + + if (devfs_register_blkdev (MAJOR_NR, "md", &md_fops)) + { + printk (KERN_ALERT "Unable to get major %d for md\n", MAJOR_NR); + return (-1); + } + devfs_handle = devfs_mk_dir (NULL, "md", NULL); + devfs_register_series (devfs_handle, "%u",MAX_MD_DEVS,DEVFS_FL_DEFAULT, + MAJOR_NR, 0, S_IFBLK | S_IRUSR | S_IWUSR, + &md_fops, NULL); + + /* forward all md request to md_make_request */ + blk_queue_make_request(BLK_DEFAULT_QUEUE(MAJOR_NR), md_make_request); + + + read_ahead[MAJOR_NR] = INT_MAX; + md_gendisk.next = gendisk_head; + + gendisk_head = &md_gendisk; + + md_recovery_thread = md_register_thread(md_do_recovery, NULL, name); + if (!md_recovery_thread) + printk(KERN_ALERT "bug: couldn't allocate md_recovery_thread\n"); + + md_register_reboot_notifier(&md_notifier); + raid_table_header = 
register_sysctl_table(raid_root_table, 1); + +#ifdef CONFIG_MD_LINEAR + linear_init (); +#endif +#ifdef CONFIG_MD_RAID0 + raid0_init (); +#endif +#ifdef CONFIG_MD_RAID1 + raid1_init (); +#endif +#ifdef CONFIG_MD_RAID5 + raid5_init (); +#endif + md_geninit(); + return (0); +} + +#ifdef CONFIG_MD_BOOT +#define MAX_MD_BOOT_DEVS 8 +struct { + unsigned long set; + int pers[MAX_MD_BOOT_DEVS]; + int chunk[MAX_MD_BOOT_DEVS]; + kdev_t devices[MAX_MD_BOOT_DEVS][MAX_REAL]; +} md_setup_args md__initdata; + +/* + * Parse the command-line parameters given our kernel, but do not + * actually try to invoke the MD device now; that is handled by + * md_setup_drive after the low-level disk drivers have initialised. + * + * 27/11/1999: Fixed to work correctly with the 2.3 kernel (which + * assigns the task of parsing integer arguments to the + * invoked program now). Added ability to initialise all + * the MD devices (by specifying multiple "md=" lines) + * instead of just one. -- KTK + * 18May2000: Added support for persistant-superblock arrays: + * md=n,0,factor,fault,device-list uses RAID0 for device n + * md=n,-1,factor,fault,device-list uses LINEAR for device n + * md=n,device-list reads a RAID superblock from the devices + * elements in device-list are read by name_to_kdev_t so can be + * a hex number or something like /dev/hda1 /dev/sdb + */ +extern kdev_t name_to_kdev_t(char *line) md__init; +static int md__init md_setup(char *str) +{ + int minor, level, factor, fault, i=0; + kdev_t device; + char *devnames, *pername = ""; + + if(get_option(&str, &minor) != 2) { /* MD Number */ + printk("md: Too few arguments supplied to md=.\n"); + return 0; + } + if (minor >= MAX_MD_BOOT_DEVS) { + printk ("md: Minor device number too high.\n"); + return 0; + } else if (md_setup_args.set & (1 << minor)) { + printk ("md: Warning - md=%d,... has been specified twice;\n" + " will discard the first definition.\n", minor); + } + switch(get_option(&str, &level)) { /* RAID Personality */ + case 2: /* could be 0 or -1.. 
*/ + if (level == 0 || level == -1) { + if (get_option(&str, &factor) != 2 || /* Chunk Size */ + get_option(&str, &fault) != 2) { + printk("md: Too few arguments supplied to md=.\n"); + return 0; + } + md_setup_args.pers[minor] = level; + md_setup_args.chunk[minor] = 1 << (factor+12); + switch(level) { + case -1: + level = LINEAR; + pername = "linear"; + break; + case 0: + level = RAID0; + pername = "raid0"; + break; + default: + printk ("md: The kernel has not been configured for raid%d" + " support!\n", level); + return 0; + } + md_setup_args.pers[minor] = level; + break; + } + /* FALL THROUGH */ + case 1: /* the first device is numeric */ + md_setup_args.devices[minor][i++] = level; + /* FALL THROUGH */ + case 0: + md_setup_args.pers[minor] = 0; + pername="super-block"; + } + devnames = str; + for (; i<MAX_REAL && str; i++) { + if ((device = name_to_kdev_t(str))) { + md_setup_args.devices[minor][i] = device; + } else { + printk ("md: Unknown device name, %s.\n", str); + return 0; + } + if ((str = strchr(str, ',')) != NULL) + str++; + } + if (!i) { + printk ("md: No devices specified for md%d?\n", minor); + return 0; + } + + printk ("md: Will configure md%d (%s) from %s, below.\n", + minor, pername, devnames); + md_setup_args.devices[minor][i] = (kdev_t) 0; + md_setup_args.set |= (1 << minor); + return 1; +} + +void md__init md_setup_drive(void) +{ + int minor, i; + kdev_t dev; + mddev_t*mddev; + + for (minor = 0; minor < MAX_MD_BOOT_DEVS; minor++) { + mdu_disk_info_t dinfo; + int err=0; + if (!(md_setup_args.set & (1 << minor))) + continue; + printk("md: Loading md%d.\n", minor); + if (mddev_map[minor].mddev) { + printk(".. md%d already autodetected - use raid=noautodetect\n", minor); + continue; + } + mddev = alloc_mddev(MKDEV(MD_MAJOR,minor)); + if (md_setup_args.pers[minor]) { + /* non-persistent */ + mdu_array_info_t ainfo; + ainfo.level = pers_to_level(md_setup_args.pers[minor]); + ainfo.size = 0; + ainfo.nr_disks =0; + ainfo.raid_disks =0; + ainfo.md_minor =minor; + ainfo.not_persistent = 1; + + ainfo.state = MD_SB_CLEAN; + ainfo.active_disks = 0; + ainfo.working_disks = 0; + ainfo.failed_disks = 0; + ainfo.spare_disks = 0; + ainfo.layout = 0; + ainfo.chunk_size = md_setup_args.chunk[minor]; + err = set_array_info(mddev, &ainfo); + for (i=0; !err && (dev = md_setup_args.devices[minor][i]); i++) { + dinfo.number = i; + dinfo.raid_disk = i; + dinfo.state = (1<<MD_DISK_ACTIVE)|(1<<MD_DISK_SYNC); + dinfo.major = MAJOR(dev); + dinfo.minor = MINOR(dev); + mddev->sb->nr_disks++; + mddev->sb->raid_disks++; + mddev->sb->active_disks++; + mddev->sb->working_disks++; + err = add_new_disk (mddev, &dinfo); + } + } else { + /* persistent */ + for (i = 0; (dev = md_setup_args.devices[minor][i]); i++) { + dinfo.major = MAJOR(dev); + dinfo.minor = MINOR(dev); + add_new_disk (mddev, &dinfo); + } + } + if (!err) + err = do_md_run(mddev); + if (err) { + mddev->sb_dirty = 0; + do_md_stop(mddev, 0); + printk("md: starting md%d failed\n", minor); + } + } +} + +__setup("md=", md_setup); +#endif + +#ifdef MODULE +int init_module (void) +{ + return md_init(); +} + +static void free_device_names(void) +{ + while (device_names.next != &device_names) { + struct list_head *tmp = device_names.next; + list_del(tmp); + kfree(tmp); + } +} + + +void cleanup_module (void) +{ + struct gendisk **gendisk_ptr; + + md_unregister_thread(md_recovery_thread); + devfs_unregister(devfs_handle); + + devfs_unregister_blkdev(MAJOR_NR,"md"); + unregister_reboot_notifier(&md_notifier); + unregister_sysctl_table(raid_table_header); 
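For the non-persistent md= boot formats above, the chunk size is derived from the "factor" field as 1 << (factor + 12), so factor 0 gives a 4 KB chunk and each step doubles it. A one-loop user-space illustration:

/* md= boot "factor" to chunk size mapping, standalone sketch */
#include <stdio.h>

int main(void)
{
	int factor;

	for (factor = 0; factor <= 6; factor++)
		printf("factor %d -> chunk %d KB\n",
		       factor, (1 << (factor + 12)) / 1024);
	return 0;
}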
+#ifdef CONFIG_PROC_FS + remove_proc_entry("mdstat", NULL); +#endif + + gendisk_ptr = &gendisk_head; + while (*gendisk_ptr) { + if (*gendisk_ptr == &md_gendisk) { + *gendisk_ptr = md_gendisk.next; + break; + } + gendisk_ptr = & (*gendisk_ptr)->next; + } + blk_dev[MAJOR_NR].queue = NULL; + blksize_size[MAJOR_NR] = NULL; + blk_size[MAJOR_NR] = NULL; + max_readahead[MAJOR_NR] = NULL; + hardsect_size[MAJOR_NR] = NULL; + + free_device_names(); + +} +#endif + +__initcall(md_init); +#ifdef CONFIG_AUTODETECT_RAID +__initcall(md_run_setup); +#endif + +MD_EXPORT_SYMBOL(md_size); +MD_EXPORT_SYMBOL(register_md_personality); +MD_EXPORT_SYMBOL(unregister_md_personality); +MD_EXPORT_SYMBOL(partition_name); +MD_EXPORT_SYMBOL(md_error); +MD_EXPORT_SYMBOL(md_do_sync); +MD_EXPORT_SYMBOL(md_sync_acct); +MD_EXPORT_SYMBOL(md_done_sync); +MD_EXPORT_SYMBOL(md_recover_arrays); +MD_EXPORT_SYMBOL(md_register_thread); +MD_EXPORT_SYMBOL(md_unregister_thread); +MD_EXPORT_SYMBOL(md_update_sb); +MD_EXPORT_SYMBOL(md_wakeup_thread); +MD_EXPORT_SYMBOL(md_print_devices); +MD_EXPORT_SYMBOL(find_rdev_nr); +MD_EXPORT_SYMBOL(md_interrupt_thread); +MD_EXPORT_SYMBOL(mddev_map); +MD_EXPORT_SYMBOL(md_check_ordering); + diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c new file mode 100644 index 000000000..09f3f8547 --- /dev/null +++ b/drivers/md/raid0.c @@ -0,0 +1,356 @@ +/* + raid0.c : Multiple Devices driver for Linux + Copyright (C) 1994-96 Marc ZYNGIER + <zyngier@ufr-info-p7.ibp.fr> or + <maz@gloups.fdn.fr> + Copyright (C) 1999, 2000 Ingo Molnar, Red Hat + + + RAID-0 management functions. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + You should have received a copy of the GNU General Public License + (for example /usr/src/linux/COPYING); if not, write to the Free + Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+*/ + +#include <linux/module.h> +#include <linux/raid/raid0.h> + +#define MAJOR_NR MD_MAJOR +#define MD_DRIVER +#define MD_PERSONALITY + +static int create_strip_zones (mddev_t *mddev) +{ + int i, c, j, j1, j2; + unsigned long current_offset, curr_zone_offset; + raid0_conf_t *conf = mddev_to_conf(mddev); + mdk_rdev_t *smallest, *rdev1, *rdev2, *rdev; + + /* + * The number of 'same size groups' + */ + conf->nr_strip_zones = 0; + + ITERATE_RDEV_ORDERED(mddev,rdev1,j1) { + printk("raid0: looking at %s\n", partition_name(rdev1->dev)); + c = 0; + ITERATE_RDEV_ORDERED(mddev,rdev2,j2) { + printk("raid0: comparing %s(%ld) with %s(%ld)\n", partition_name(rdev1->dev), rdev1->size, partition_name(rdev2->dev), rdev2->size); + if (rdev2 == rdev1) { + printk("raid0: END\n"); + break; + } + if (rdev2->size == rdev1->size) + { + /* + * Not unique, dont count it as a new + * group + */ + printk("raid0: EQUAL\n"); + c = 1; + break; + } + printk("raid0: NOT EQUAL\n"); + } + if (!c) { + printk("raid0: ==> UNIQUE\n"); + conf->nr_strip_zones++; + printk("raid0: %d zones\n", conf->nr_strip_zones); + } + } + printk("raid0: FINAL %d zones\n", conf->nr_strip_zones); + + conf->strip_zone = vmalloc(sizeof(struct strip_zone)* + conf->nr_strip_zones); + if (!conf->strip_zone) + return 1; + + + conf->smallest = NULL; + current_offset = 0; + curr_zone_offset = 0; + + for (i = 0; i < conf->nr_strip_zones; i++) + { + struct strip_zone *zone = conf->strip_zone + i; + + printk("zone %d\n", i); + zone->dev_offset = current_offset; + smallest = NULL; + c = 0; + + ITERATE_RDEV_ORDERED(mddev,rdev,j) { + + printk(" checking %s ...", partition_name(rdev->dev)); + if (rdev->size > current_offset) + { + printk(" contained as device %d\n", c); + zone->dev[c] = rdev; + c++; + if (!smallest || (rdev->size <smallest->size)) { + smallest = rdev; + printk(" (%ld) is smallest!.\n", rdev->size); + } + } else + printk(" nope.\n"); + } + + zone->nb_dev = c; + zone->size = (smallest->size - current_offset) * c; + printk(" zone->nb_dev: %d, size: %ld\n",zone->nb_dev,zone->size); + + if (!conf->smallest || (zone->size < conf->smallest->size)) + conf->smallest = zone; + + zone->zone_offset = curr_zone_offset; + curr_zone_offset += zone->size; + + current_offset = smallest->size; + printk("current zone offset: %ld\n", current_offset); + } + printk("done.\n"); + return 0; +} + +static int raid0_run (mddev_t *mddev) +{ + unsigned long cur=0, i=0, size, zone0_size, nb_zone; + raid0_conf_t *conf; + + MOD_INC_USE_COUNT; + + conf = vmalloc(sizeof (raid0_conf_t)); + if (!conf) + goto out; + mddev->private = (void *)conf; + + if (md_check_ordering(mddev)) { + printk("raid0: disks are not ordered, aborting!\n"); + goto out_free_conf; + } + + if (create_strip_zones (mddev)) + goto out_free_conf; + + printk("raid0 : md_size is %d blocks.\n", md_size[mdidx(mddev)]); + printk("raid0 : conf->smallest->size is %ld blocks.\n", conf->smallest->size); + nb_zone = md_size[mdidx(mddev)]/conf->smallest->size + + (md_size[mdidx(mddev)] % conf->smallest->size ? 
1 : 0); + printk("raid0 : nb_zone is %ld.\n", nb_zone); + conf->nr_zones = nb_zone; + + printk("raid0 : Allocating %ld bytes for hash.\n", + nb_zone*sizeof(struct raid0_hash)); + + conf->hash_table = vmalloc (sizeof (struct raid0_hash)*nb_zone); + if (!conf->hash_table) + goto out_free_zone_conf; + size = conf->strip_zone[cur].size; + + i = 0; + while (cur < conf->nr_strip_zones) { + conf->hash_table[i].zone0 = conf->strip_zone + cur; + + /* + * If we completely fill the slot + */ + if (size >= conf->smallest->size) { + conf->hash_table[i++].zone1 = NULL; + size -= conf->smallest->size; + + if (!size) { + if (++cur == conf->nr_strip_zones) + continue; + size = conf->strip_zone[cur].size; + } + continue; + } + if (++cur == conf->nr_strip_zones) { + /* + * Last dev, set unit1 as NULL + */ + conf->hash_table[i].zone1=NULL; + continue; + } + + /* + * Here we use a 2nd dev to fill the slot + */ + zone0_size = size; + size = conf->strip_zone[cur].size; + conf->hash_table[i++].zone1 = conf->strip_zone + cur; + size -= (conf->smallest->size - zone0_size); + } + return 0; + +out_free_zone_conf: + vfree(conf->strip_zone); + conf->strip_zone = NULL; + +out_free_conf: + vfree(conf); + mddev->private = NULL; +out: + MOD_DEC_USE_COUNT; + return 1; +} + +static int raid0_stop (mddev_t *mddev) +{ + raid0_conf_t *conf = mddev_to_conf(mddev); + + vfree (conf->hash_table); + conf->hash_table = NULL; + vfree (conf->strip_zone); + conf->strip_zone = NULL; + vfree (conf); + mddev->private = NULL; + + MOD_DEC_USE_COUNT; + return 0; +} + +/* + * FIXME - We assume some things here : + * - requested buffers NEVER bigger than chunk size, + * - requested buffers NEVER cross stripes limits. + * Of course, those facts may not be valid anymore (and surely won't...) + * Hey guys, there's some work out there ;-) + */ +static int raid0_make_request (mddev_t *mddev, + int rw, struct buffer_head * bh) +{ + unsigned int sect_in_chunk, chunksize_bits, chunk_size; + raid0_conf_t *conf = mddev_to_conf(mddev); + struct raid0_hash *hash; + struct strip_zone *zone; + mdk_rdev_t *tmp_dev; + unsigned long chunk, block, rsect; + + chunk_size = mddev->param.chunk_size >> 10; + chunksize_bits = ffz(~chunk_size); + block = bh->b_rsector >> 1; + hash = conf->hash_table + block / conf->smallest->size; + + /* Sanity check */ + if (chunk_size < (block % chunk_size) + (bh->b_size >> 10)) + goto bad_map; + + if (!hash) + goto bad_hash; + + if (!hash->zone0) + goto bad_zone0; + + if (block >= (hash->zone0->size + hash->zone0->zone_offset)) { + if (!hash->zone1) + goto bad_zone1; + zone = hash->zone1; + } else + zone = hash->zone0; + + sect_in_chunk = bh->b_rsector & ((chunk_size<<1) -1); + chunk = (block - zone->zone_offset) / (zone->nb_dev << chunksize_bits); + tmp_dev = zone->dev[(block >> chunksize_bits) % zone->nb_dev]; + rsect = (((chunk << chunksize_bits) + zone->dev_offset)<<1) + + sect_in_chunk; + + /* + * The new BH_Lock semantics in ll_rw_blk.c guarantee that this + * is the only IO operation happening on this bh. 
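+	 *
+	 * (Hypothetical example of the mapping above: with 64k chunks and a
+	 * two-disk zone at offset 0, chunk_size = 64 and chunksize_bits = 6,
+	 * so b_rsector 520 maps to block 260, sect_in_chunk = 8, chunk 2 on
+	 * device (260 >> 6) % 2 = 0, and rsect = ((2 << 6) << 1) + 8 = 264.)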
+ */ + bh->b_rdev = tmp_dev->dev; + bh->b_rsector = rsect; + + /* + * Let the main block layer submit the IO and resolve recursion: + */ + return 1; + +bad_map: + printk ("raid0_make_request bug: can't convert block across chunks or bigger than %dk %ld %d\n", chunk_size, bh->b_rsector, bh->b_size >> 10); + return -1; +bad_hash: + printk("raid0_make_request bug: hash==NULL for block %ld\n", block); + return -1; +bad_zone0: + printk ("raid0_make_request bug: hash->zone0==NULL for block %ld\n", block); + return -1; +bad_zone1: + printk ("raid0_make_request bug: hash->zone1==NULL for block %ld\n", block); + return -1; +} + +static int raid0_status (char *page, mddev_t *mddev) +{ + int sz = 0; +#undef MD_DEBUG +#ifdef MD_DEBUG + int j, k; + raid0_conf_t *conf = mddev_to_conf(mddev); + + sz += sprintf(page + sz, " "); + for (j = 0; j < conf->nr_zones; j++) { + sz += sprintf(page + sz, "[z%d", + conf->hash_table[j].zone0 - conf->strip_zone); + if (conf->hash_table[j].zone1) + sz += sprintf(page+sz, "/z%d] ", + conf->hash_table[j].zone1 - conf->strip_zone); + else + sz += sprintf(page+sz, "] "); + } + + sz += sprintf(page + sz, "\n"); + + for (j = 0; j < conf->nr_strip_zones; j++) { + sz += sprintf(page + sz, " z%d=[", j); + for (k = 0; k < conf->strip_zone[j].nb_dev; k++) + sz += sprintf (page+sz, "%s/", partition_name( + conf->strip_zone[j].dev[k]->dev)); + sz--; + sz += sprintf (page+sz, "] zo=%d do=%d s=%d\n", + conf->strip_zone[j].zone_offset, + conf->strip_zone[j].dev_offset, + conf->strip_zone[j].size); + } +#endif + sz += sprintf(page + sz, " %dk chunks", mddev->param.chunk_size/1024); + return sz; +} + +static mdk_personality_t raid0_personality= +{ + name: "raid0", + make_request: raid0_make_request, + run: raid0_run, + stop: raid0_stop, + status: raid0_status, +}; + +#ifndef MODULE + +void raid0_init (void) +{ + register_md_personality (RAID0, &raid0_personality); +} + +#else + +int init_module (void) +{ + return (register_md_personality (RAID0, &raid0_personality)); +} + +void cleanup_module (void) +{ + unregister_md_personality (RAID0); +} + +#endif + diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c new file mode 100644 index 000000000..b39c87e0e --- /dev/null +++ b/drivers/md/raid1.c @@ -0,0 +1,1897 @@ +/* + * raid1.c : Multiple Devices driver for Linux + * + * Copyright (C) 1999, 2000 Ingo Molnar, Red Hat + * + * Copyright (C) 1996, 1997, 1998 Ingo Molnar, Miguel de Icaza, Gadi Oxman + * + * RAID-1 management functions. + * + * Better read-balancing code written by Mika Kuoppala <miku@iki.fi>, 2000 + * + * Fixes to reconstruction by Jakob Østergaard" <jakob@ostenfeld.dk> + * Various fixes by Neil Brown <neilb@cse.unsw.edu.au> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * You should have received a copy of the GNU General Public License + * (for example /usr/src/linux/COPYING); if not, write to the Free + * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#include <linux/module.h> +#include <linux/malloc.h> +#include <linux/raid/raid1.h> +#include <asm/atomic.h> + +#define MAJOR_NR MD_MAJOR +#define MD_DRIVER +#define MD_PERSONALITY + +#define MAX_WORK_PER_DISK 128 + +/* + * The following can be used to debug the driver + */ +#define RAID1_DEBUG 0 + +#if RAID1_DEBUG +#define PRINTK(x...) 
printk(x) +#define inline +#define __inline__ +#else +#define PRINTK(x...) do { } while (0) +#endif + + +static mdk_personality_t raid1_personality; +static md_spinlock_t retry_list_lock = MD_SPIN_LOCK_UNLOCKED; +struct raid1_bh *raid1_retry_list = NULL, **raid1_retry_tail; + +static struct buffer_head *raid1_alloc_bh(raid1_conf_t *conf, int cnt) +{ + /* return a linked list of "cnt" struct buffer_heads. + * don't take any off the free list unless we know we can + * get all we need, otherwise we could deadlock + */ + struct buffer_head *bh=NULL; + + while(cnt) { + struct buffer_head *t; + md_spin_lock_irq(&conf->device_lock); + if (conf->freebh_cnt >= cnt) + while (cnt) { + t = conf->freebh; + conf->freebh = t->b_next; + t->b_next = bh; + bh = t; + t->b_state = 0; + conf->freebh_cnt--; + cnt--; + } + md_spin_unlock_irq(&conf->device_lock); + if (cnt == 0) + break; + t = (struct buffer_head *)kmalloc(sizeof(struct buffer_head), GFP_BUFFER); + if (t) { + memset(t, 0, sizeof(*t)); + t->b_next = bh; + bh = t; + cnt--; + } else { + PRINTK("waiting for %d bh\n", cnt); + wait_event(conf->wait_buffer, conf->freebh_cnt >= cnt); + } + } + return bh; +} + +static inline void raid1_free_bh(raid1_conf_t *conf, struct buffer_head *bh) +{ + md_spin_lock_irq(&conf->device_lock); + while (bh) { + struct buffer_head *t = bh; + bh=bh->b_next; + if (t->b_pprev == NULL) + kfree(t); + else { + t->b_next= conf->freebh; + conf->freebh = t; + conf->freebh_cnt++; + } + } + md_spin_unlock_irq(&conf->device_lock); + wake_up(&conf->wait_buffer); +} + +static int raid1_grow_bh(raid1_conf_t *conf, int cnt) +{ + /* allocate cnt buffer_heads, possibly less if kalloc fails */ + int i = 0; + + while (i < cnt) { + struct buffer_head *bh; + bh = kmalloc(sizeof(*bh), GFP_KERNEL); + if (!bh) break; + memset(bh, 0, sizeof(*bh)); + + md_spin_lock_irq(&conf->device_lock); + bh->b_pprev = &conf->freebh; + bh->b_next = conf->freebh; + conf->freebh = bh; + conf->freebh_cnt++; + md_spin_unlock_irq(&conf->device_lock); + + i++; + } + return i; +} + +static int raid1_shrink_bh(raid1_conf_t *conf, int cnt) +{ + /* discard cnt buffer_heads, if we can find them */ + int i = 0; + + md_spin_lock_irq(&conf->device_lock); + while ((i < cnt) && conf->freebh) { + struct buffer_head *bh = conf->freebh; + conf->freebh = bh->b_next; + kfree(bh); + i++; + conf->freebh_cnt--; + } + md_spin_unlock_irq(&conf->device_lock); + return i; +} + + +static struct raid1_bh *raid1_alloc_r1bh(raid1_conf_t *conf) +{ + struct raid1_bh *r1_bh = NULL; + + do { + md_spin_lock_irq(&conf->device_lock); + if (conf->freer1) { + r1_bh = conf->freer1; + conf->freer1 = r1_bh->next_r1; + r1_bh->next_r1 = NULL; + r1_bh->state = 0; + r1_bh->bh_req.b_state = 0; + } + md_spin_unlock_irq(&conf->device_lock); + if (r1_bh) + return r1_bh; + r1_bh = (struct raid1_bh *) kmalloc(sizeof(struct raid1_bh), + GFP_BUFFER); + if (r1_bh) { + memset(r1_bh, 0, sizeof(*r1_bh)); + return r1_bh; + } + wait_event(conf->wait_buffer, conf->freer1); + } while (1); +} + +static inline void raid1_free_r1bh(struct raid1_bh *r1_bh) +{ + struct buffer_head *bh = r1_bh->mirror_bh_list; + raid1_conf_t *conf = mddev_to_conf(r1_bh->mddev); + + r1_bh->mirror_bh_list = NULL; + + if (test_bit(R1BH_PreAlloc, &r1_bh->state)) { + md_spin_lock_irq(&conf->device_lock); + r1_bh->next_r1 = conf->freer1; + conf->freer1 = r1_bh; + md_spin_unlock_irq(&conf->device_lock); + } else { + kfree(r1_bh); + } + raid1_free_bh(conf, bh); +} + +static int raid1_grow_r1bh (raid1_conf_t *conf, int cnt) +{ + int i = 0; + + while (i < 
cnt) { + struct raid1_bh *r1_bh; + r1_bh = (struct raid1_bh*)kmalloc(sizeof(*r1_bh), GFP_KERNEL); + if (!r1_bh) + break; + memset(r1_bh, 0, sizeof(*r1_bh)); + + md_spin_lock_irq(&conf->device_lock); + set_bit(R1BH_PreAlloc, &r1_bh->state); + r1_bh->next_r1 = conf->freer1; + conf->freer1 = r1_bh; + md_spin_unlock_irq(&conf->device_lock); + + i++; + } + return i; +} + +static void raid1_shrink_r1bh(raid1_conf_t *conf) +{ + md_spin_lock_irq(&conf->device_lock); + while (conf->freer1) { + struct raid1_bh *r1_bh = conf->freer1; + conf->freer1 = r1_bh->next_r1; + kfree(r1_bh); + } + md_spin_unlock_irq(&conf->device_lock); +} + + + +static inline void raid1_free_buf(struct raid1_bh *r1_bh) +{ + struct buffer_head *bh = r1_bh->mirror_bh_list; + raid1_conf_t *conf = mddev_to_conf(r1_bh->mddev); + r1_bh->mirror_bh_list = NULL; + + md_spin_lock_irq(&conf->device_lock); + r1_bh->next_r1 = conf->freebuf; + conf->freebuf = r1_bh; + md_spin_unlock_irq(&conf->device_lock); + raid1_free_bh(conf, bh); +} + +static struct raid1_bh *raid1_alloc_buf(raid1_conf_t *conf) +{ + struct raid1_bh *r1_bh; + + md_spin_lock_irq(&conf->device_lock); + wait_event_lock_irq(conf->wait_buffer, conf->freebuf, conf->device_lock); + r1_bh = conf->freebuf; + conf->freebuf = r1_bh->next_r1; + r1_bh->next_r1= NULL; + md_spin_unlock_irq(&conf->device_lock); + + return r1_bh; +} + +static int raid1_grow_buffers (raid1_conf_t *conf, int cnt) +{ + int i = 0; + + md_spin_lock_irq(&conf->device_lock); + while (i < cnt) { + struct raid1_bh *r1_bh; + struct page *page; + + page = alloc_page(GFP_KERNEL); + if (!page) + break; + + r1_bh = (struct raid1_bh *) kmalloc(sizeof(*r1_bh), GFP_KERNEL); + if (!r1_bh) { + __free_page(page); + break; + } + memset(r1_bh, 0, sizeof(*r1_bh)); + r1_bh->bh_req.b_page = page; + r1_bh->bh_req.b_data = page_address(page); + r1_bh->next_r1 = conf->freebuf; + conf->freebuf = r1_bh; + i++; + } + md_spin_unlock_irq(&conf->device_lock); + return i; +} + +static void raid1_shrink_buffers (raid1_conf_t *conf) +{ + md_spin_lock_irq(&conf->device_lock); + while (conf->freebuf) { + struct raid1_bh *r1_bh = conf->freebuf; + conf->freebuf = r1_bh->next_r1; + __free_page(r1_bh->bh_req.b_page); + kfree(r1_bh); + } + md_spin_unlock_irq(&conf->device_lock); +} + +static int raid1_map (mddev_t *mddev, kdev_t *rdev, unsigned long size) +{ + raid1_conf_t *conf = mddev_to_conf(mddev); + int i, disks = MD_SB_DISKS; + + /* + * Later we do read balancing on the read side + * now we use the first available disk. 
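+	 *
+	 * (raid1_map() is only called from raid1d below, to pick a working
+	 * mirror when a failed read is retried or a sync read is redirected.)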
+ */ + + for (i = 0; i < disks; i++) { + if (conf->mirrors[i].operational) { + *rdev = conf->mirrors[i].dev; + return (0); + } + } + + printk (KERN_ERR "raid1_map(): huh, no more operational devices?\n"); + return (-1); +} + +static void raid1_reschedule_retry (struct raid1_bh *r1_bh) +{ + unsigned long flags; + mddev_t *mddev = r1_bh->mddev; + raid1_conf_t *conf = mddev_to_conf(mddev); + + md_spin_lock_irqsave(&retry_list_lock, flags); + if (raid1_retry_list == NULL) + raid1_retry_tail = &raid1_retry_list; + *raid1_retry_tail = r1_bh; + raid1_retry_tail = &r1_bh->next_r1; + r1_bh->next_r1 = NULL; + md_spin_unlock_irqrestore(&retry_list_lock, flags); + md_wakeup_thread(conf->thread); +} + + +static void inline io_request_done(unsigned long sector, raid1_conf_t *conf, int phase) +{ + unsigned long flags; + spin_lock_irqsave(&conf->segment_lock, flags); + if (sector < conf->start_active) + conf->cnt_done--; + else if (sector >= conf->start_future && conf->phase == phase) + conf->cnt_future--; + else if (!--conf->cnt_pending) + wake_up(&conf->wait_ready); + + spin_unlock_irqrestore(&conf->segment_lock, flags); +} + +static void inline sync_request_done (unsigned long sector, raid1_conf_t *conf) +{ + unsigned long flags; + spin_lock_irqsave(&conf->segment_lock, flags); + if (sector >= conf->start_ready) + --conf->cnt_ready; + else if (sector >= conf->start_active) { + if (!--conf->cnt_active) { + conf->start_active = conf->start_ready; + wake_up(&conf->wait_done); + } + } + spin_unlock_irqrestore(&conf->segment_lock, flags); +} + +/* + * raid1_end_bh_io() is called when we have finished servicing a mirrored + * operation and are ready to return a success/failure code to the buffer + * cache layer. + */ +static void raid1_end_bh_io (struct raid1_bh *r1_bh, int uptodate) +{ + struct buffer_head *bh = r1_bh->master_bh; + + io_request_done(bh->b_rsector, mddev_to_conf(r1_bh->mddev), + test_bit(R1BH_SyncPhase, &r1_bh->state)); + + bh->b_end_io(bh, uptodate); + raid1_free_r1bh(r1_bh); +} +void raid1_end_request (struct buffer_head *bh, int uptodate) +{ + struct raid1_bh * r1_bh = (struct raid1_bh *)(bh->b_private); + + /* + * this branch is our 'one mirror IO has finished' event handler: + */ + if (!uptodate) + md_error (mddev_to_kdev(r1_bh->mddev), bh->b_dev); + else + /* + * Set R1BH_Uptodate in our master buffer_head, so that + * we will return a good error code for to the higher + * levels even if IO on some other mirrored buffer fails. + * + * The 'master' represents the complex operation to + * user-side. So if something waits for IO, then it will + * wait for the 'master' buffer_head. + */ + set_bit (R1BH_Uptodate, &r1_bh->state); + + /* + * We split up the read and write side, imho they are + * conceptually different. + */ + + if ( (r1_bh->cmd == READ) || (r1_bh->cmd == READA) ) { + /* + * we have only one buffer_head on the read side + */ + + if (uptodate) { + raid1_end_bh_io(r1_bh, uptodate); + return; + } + /* + * oops, read error: + */ + printk(KERN_ERR "raid1: %s: rescheduling block %lu\n", + partition_name(bh->b_dev), bh->b_blocknr); + raid1_reschedule_retry(r1_bh); + return; + } + + /* + * WRITE: + * + * Let's see if all mirrored write operations have finished + * already. + */ + + if (atomic_dec_and_test(&r1_bh->remaining)) + raid1_end_bh_io(r1_bh, test_bit(R1BH_Uptodate, &r1_bh->state)); +} + +/* + * This routine returns the disk from which the requested read should + * be done. 
It bookkeeps the last read position for every disk + * in array and when new read requests come, the disk which last + * position is nearest to the request, is chosen. + * + * TODO: now if there are 2 mirrors in the same 2 devices, performance + * degrades dramatically because position is mirror, not device based. + * This should be changed to be device based. Also atomic sequential + * reads should be somehow balanced. + */ + +static int raid1_read_balance (raid1_conf_t *conf, struct buffer_head *bh) +{ + int new_disk = conf->last_used; + const int sectors = bh->b_size >> 9; + const unsigned long this_sector = bh->b_rsector; + int disk = new_disk; + unsigned long new_distance; + unsigned long current_distance; + + /* + * Check if it is sane at all to balance + */ + + if (conf->resync_mirrors) + goto rb_out; + + if (conf->working_disks < 2) { + int i = 0; + + while( !conf->mirrors[new_disk].operational && + (i < MD_SB_DISKS) ) { + new_disk = conf->mirrors[new_disk].next; + i++; + } + + if (i >= MD_SB_DISKS) { + /* + * This means no working disk was found + * Nothing much to do, lets not change anything + * and hope for the best... + */ + + new_disk = conf->last_used; + } + + goto rb_out; + } + + /* + * Don't touch anything for sequential reads. + */ + + if (this_sector == conf->mirrors[new_disk].head_position) + goto rb_out; + + /* + * If reads have been done only on a single disk + * for a time, lets give another disk a change. + * This is for kicking those idling disks so that + * they would find work near some hotspot. + */ + + if (conf->sect_count >= conf->mirrors[new_disk].sect_limit) { + conf->sect_count = 0; + + while( new_disk != conf->mirrors[new_disk].next ) { + if ((conf->mirrors[new_disk].write_only) || + (!conf->mirrors[new_disk].operational) ) + continue; + + new_disk = conf->mirrors[new_disk].next; + break; + } + + goto rb_out; + } + + current_distance = abs(this_sector - + conf->mirrors[disk].head_position); + + /* Find the disk which is closest */ + + while( conf->mirrors[disk].next != conf->last_used ) { + disk = conf->mirrors[disk].next; + + if ((conf->mirrors[disk].write_only) || + (!conf->mirrors[disk].operational)) + continue; + + new_distance = abs(this_sector - + conf->mirrors[disk].head_position); + + if (new_distance < current_distance) { + conf->sect_count = 0; + current_distance = new_distance; + new_disk = disk; + } + } + +rb_out: + conf->mirrors[new_disk].head_position = this_sector + sectors; + + conf->last_used = new_disk; + conf->sect_count += sectors; + + return new_disk; +} + +static int raid1_make_request (mddev_t *mddev, int rw, + struct buffer_head * bh) +{ + raid1_conf_t *conf = mddev_to_conf(mddev); + struct buffer_head *bh_req, *bhl; + struct raid1_bh * r1_bh; + int disks = MD_SB_DISKS; + int i, sum_bhs = 0, sectors; + struct mirror_info *mirror; + + if (!buffer_locked(bh)) + BUG(); + +/* + * make_request() can abort the operation when READA is being + * used and no empty request is available. + * + * Currently, just replace the command with READ/WRITE. 
+ */ + if (rw == READA) + rw = READ; + + r1_bh = raid1_alloc_r1bh (conf); + + spin_lock_irq(&conf->segment_lock); + wait_event_lock_irq(conf->wait_done, + bh->b_rsector < conf->start_active || + bh->b_rsector >= conf->start_future, + conf->segment_lock); + if (bh->b_rsector < conf->start_active) + conf->cnt_done++; + else { + conf->cnt_future++; + if (conf->phase) + set_bit(R1BH_SyncPhase, &r1_bh->state); + } + spin_unlock_irq(&conf->segment_lock); + + /* + * i think the read and write branch should be separated completely, + * since we want to do read balancing on the read side for example. + * Alternative implementations? :) --mingo + */ + + r1_bh->master_bh = bh; + r1_bh->mddev = mddev; + r1_bh->cmd = rw; + + sectors = bh->b_size >> 9; + if (rw == READ) { + /* + * read balancing logic: + */ + mirror = conf->mirrors + raid1_read_balance(conf, bh); + + bh_req = &r1_bh->bh_req; + memcpy(bh_req, bh, sizeof(*bh)); + bh_req->b_blocknr = bh->b_rsector * sectors; + bh_req->b_dev = mirror->dev; + bh_req->b_rdev = mirror->dev; + /* bh_req->b_rsector = bh->n_rsector; */ + bh_req->b_end_io = raid1_end_request; + bh_req->b_private = r1_bh; + generic_make_request (rw, bh_req); + return 0; + } + + /* + * WRITE: + */ + + bhl = raid1_alloc_bh(conf, conf->raid_disks); + for (i = 0; i < disks; i++) { + struct buffer_head *mbh; + if (!conf->mirrors[i].operational) + continue; + + /* + * We should use a private pool (size depending on NR_REQUEST), + * to avoid writes filling up the memory with bhs + * + * Such pools are much faster than kmalloc anyways (so we waste + * almost nothing by not using the master bh when writing and + * win alot of cleanness) but for now we are cool enough. --mingo + * + * It's safe to sleep here, buffer heads cannot be used in a shared + * manner in the write branch. Look how we lock the buffer at the + * beginning of this function to grok the difference ;) + */ + mbh = bhl; + if (mbh == NULL) { + MD_BUG(); + break; + } + bhl = mbh->b_next; + mbh->b_next = NULL; + mbh->b_this_page = (struct buffer_head *)1; + + /* + * prepare mirrored mbh (fields ordered for max mem throughput): + */ + mbh->b_blocknr = bh->b_rsector * sectors; + mbh->b_dev = conf->mirrors[i].dev; + mbh->b_rdev = conf->mirrors[i].dev; + mbh->b_rsector = bh->b_rsector; + mbh->b_state = (1<<BH_Req) | (1<<BH_Dirty) | + (1<<BH_Mapped) | (1<<BH_Lock); + + atomic_set(&mbh->b_count, 1); + mbh->b_size = bh->b_size; + mbh->b_page = bh->b_page; + mbh->b_data = bh->b_data; + mbh->b_list = BUF_LOCKED; + mbh->b_end_io = raid1_end_request; + mbh->b_private = r1_bh; + + mbh->b_next = r1_bh->mirror_bh_list; + r1_bh->mirror_bh_list = mbh; + sum_bhs++; + } + if (bhl) raid1_free_bh(conf,bhl); + md_atomic_set(&r1_bh->remaining, sum_bhs); + + /* + * We have to be a bit careful about the semaphore above, thats + * why we start the requests separately. Since kmalloc() could + * fail, sleep and make_request() can sleep too, this is the + * safer solution. Imagine, end_request decreasing the semaphore + * before we could have set it up ... We could play tricks with + * the semaphore (presetting it and correcting at the end if + * sum_bhs is not 'n' but we have to do end_request by hand if + * all requests finish until we had a chance to set up the + * semaphore correctly ... lots of races). 
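+	 *
+	 * That is why 'remaining' is set to the final sum_bhs above, before
+	 * the first generic_make_request() below can complete and drop it in
+	 * raid1_end_request().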
+ */ + bh = r1_bh->mirror_bh_list; + while(bh) { + struct buffer_head *bh2 = bh; + bh = bh->b_next; + generic_make_request(rw, bh2); + } + return (0); +} + +static int raid1_status (char *page, mddev_t *mddev) +{ + raid1_conf_t *conf = mddev_to_conf(mddev); + int sz = 0, i; + + sz += sprintf (page+sz, " [%d/%d] [", conf->raid_disks, + conf->working_disks); + for (i = 0; i < conf->raid_disks; i++) + sz += sprintf (page+sz, "%s", + conf->mirrors[i].operational ? "U" : "_"); + sz += sprintf (page+sz, "]"); + return sz; +} + +static void unlink_disk (raid1_conf_t *conf, int target) +{ + int disks = MD_SB_DISKS; + int i; + + for (i = 0; i < disks; i++) + if (conf->mirrors[i].next == target) + conf->mirrors[i].next = conf->mirrors[target].next; +} + +#define LAST_DISK KERN_ALERT \ +"raid1: only one disk left and IO error.\n" + +#define NO_SPARE_DISK KERN_ALERT \ +"raid1: no spare disk left, degrading mirror level by one.\n" + +#define DISK_FAILED KERN_ALERT \ +"raid1: Disk failure on %s, disabling device. \n" \ +" Operation continuing on %d devices\n" + +#define START_SYNCING KERN_ALERT \ +"raid1: start syncing spare disk.\n" + +#define ALREADY_SYNCING KERN_INFO \ +"raid1: syncing already in progress.\n" + +static void mark_disk_bad (mddev_t *mddev, int failed) +{ + raid1_conf_t *conf = mddev_to_conf(mddev); + struct mirror_info *mirror = conf->mirrors+failed; + mdp_super_t *sb = mddev->sb; + + mirror->operational = 0; + unlink_disk(conf, failed); + mark_disk_faulty(sb->disks+mirror->number); + mark_disk_nonsync(sb->disks+mirror->number); + mark_disk_inactive(sb->disks+mirror->number); + sb->active_disks--; + sb->working_disks--; + sb->failed_disks++; + mddev->sb_dirty = 1; + md_wakeup_thread(conf->thread); + conf->working_disks--; + printk (DISK_FAILED, partition_name (mirror->dev), + conf->working_disks); +} + +static int raid1_error (mddev_t *mddev, kdev_t dev) +{ + raid1_conf_t *conf = mddev_to_conf(mddev); + struct mirror_info * mirrors = conf->mirrors; + int disks = MD_SB_DISKS; + int i; + + if (conf->working_disks == 1) { + /* + * Uh oh, we can do nothing if this is our last disk, but + * first check if this is a queued request for a device + * which has just failed. 
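+	 * (in that case the failure was already handled and the error is
+	 * simply ignored.)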
+ */ + for (i = 0; i < disks; i++) { + if (mirrors[i].dev==dev && !mirrors[i].operational) + return 0; + } + printk (LAST_DISK); + } else { + /* + * Mark disk as unusable + */ + for (i = 0; i < disks; i++) { + if (mirrors[i].dev==dev && mirrors[i].operational) { + mark_disk_bad(mddev, i); + break; + } + } + } + return 0; +} + +#undef LAST_DISK +#undef NO_SPARE_DISK +#undef DISK_FAILED +#undef START_SYNCING + +/* + * Insert the spare disk into the drive-ring + */ +static void link_disk(raid1_conf_t *conf, struct mirror_info *mirror) +{ + int j, next; + int disks = MD_SB_DISKS; + struct mirror_info *p = conf->mirrors; + + for (j = 0; j < disks; j++, p++) + if (p->operational && !p->write_only) { + next = p->next; + p->next = mirror->raid_disk; + mirror->next = next; + return; + } + + printk("raid1: bug: no read-operational devices\n"); +} + +static void print_raid1_conf (raid1_conf_t *conf) +{ + int i; + struct mirror_info *tmp; + + printk("RAID1 conf printout:\n"); + if (!conf) { + printk("(conf==NULL)\n"); + return; + } + printk(" --- wd:%d rd:%d nd:%d\n", conf->working_disks, + conf->raid_disks, conf->nr_disks); + + for (i = 0; i < MD_SB_DISKS; i++) { + tmp = conf->mirrors + i; + printk(" disk %d, s:%d, o:%d, n:%d rd:%d us:%d dev:%s\n", + i, tmp->spare,tmp->operational, + tmp->number,tmp->raid_disk,tmp->used_slot, + partition_name(tmp->dev)); + } +} + +static int raid1_diskop(mddev_t *mddev, mdp_disk_t **d, int state) +{ + int err = 0; + int i, failed_disk=-1, spare_disk=-1, removed_disk=-1, added_disk=-1; + raid1_conf_t *conf = mddev->private; + struct mirror_info *tmp, *sdisk, *fdisk, *rdisk, *adisk; + mdp_super_t *sb = mddev->sb; + mdp_disk_t *failed_desc, *spare_desc, *added_desc; + + print_raid1_conf(conf); + md_spin_lock_irq(&conf->device_lock); + /* + * find the disk ... + */ + switch (state) { + + case DISKOP_SPARE_ACTIVE: + + /* + * Find the failed disk within the RAID1 configuration ... + * (this can only be in the first conf->working_disks part) + */ + for (i = 0; i < conf->raid_disks; i++) { + tmp = conf->mirrors + i; + if ((!tmp->operational && !tmp->spare) || + !tmp->used_slot) { + failed_disk = i; + break; + } + } + /* + * When we activate a spare disk we _must_ have a disk in + * the lower (active) part of the array to replace. + */ + if ((failed_disk == -1) || (failed_disk >= conf->raid_disks)) { + MD_BUG(); + err = 1; + goto abort; + } + /* fall through */ + + case DISKOP_SPARE_WRITE: + case DISKOP_SPARE_INACTIVE: + + /* + * Find the spare disk ... 
(can only be in the 'high' + * area of the array) + */ + for (i = conf->raid_disks; i < MD_SB_DISKS; i++) { + tmp = conf->mirrors + i; + if (tmp->spare && tmp->number == (*d)->number) { + spare_disk = i; + break; + } + } + if (spare_disk == -1) { + MD_BUG(); + err = 1; + goto abort; + } + break; + + case DISKOP_HOT_REMOVE_DISK: + + for (i = 0; i < MD_SB_DISKS; i++) { + tmp = conf->mirrors + i; + if (tmp->used_slot && (tmp->number == (*d)->number)) { + if (tmp->operational) { + err = -EBUSY; + goto abort; + } + removed_disk = i; + break; + } + } + if (removed_disk == -1) { + MD_BUG(); + err = 1; + goto abort; + } + break; + + case DISKOP_HOT_ADD_DISK: + + for (i = conf->raid_disks; i < MD_SB_DISKS; i++) { + tmp = conf->mirrors + i; + if (!tmp->used_slot) { + added_disk = i; + break; + } + } + if (added_disk == -1) { + MD_BUG(); + err = 1; + goto abort; + } + break; + } + + switch (state) { + /* + * Switch the spare disk to write-only mode: + */ + case DISKOP_SPARE_WRITE: + sdisk = conf->mirrors + spare_disk; + sdisk->operational = 1; + sdisk->write_only = 1; + break; + /* + * Deactivate a spare disk: + */ + case DISKOP_SPARE_INACTIVE: + sdisk = conf->mirrors + spare_disk; + sdisk->operational = 0; + sdisk->write_only = 0; + break; + /* + * Activate (mark read-write) the (now sync) spare disk, + * which means we switch it's 'raid position' (->raid_disk) + * with the failed disk. (only the first 'conf->nr_disks' + * slots are used for 'real' disks and we must preserve this + * property) + */ + case DISKOP_SPARE_ACTIVE: + + sdisk = conf->mirrors + spare_disk; + fdisk = conf->mirrors + failed_disk; + + spare_desc = &sb->disks[sdisk->number]; + failed_desc = &sb->disks[fdisk->number]; + + if (spare_desc != *d) { + MD_BUG(); + err = 1; + goto abort; + } + + if (spare_desc->raid_disk != sdisk->raid_disk) { + MD_BUG(); + err = 1; + goto abort; + } + + if (sdisk->raid_disk != spare_disk) { + MD_BUG(); + err = 1; + goto abort; + } + + if (failed_desc->raid_disk != fdisk->raid_disk) { + MD_BUG(); + err = 1; + goto abort; + } + + if (fdisk->raid_disk != failed_disk) { + MD_BUG(); + err = 1; + goto abort; + } + + /* + * do the switch finally + */ + xchg_values(*spare_desc, *failed_desc); + xchg_values(*fdisk, *sdisk); + + /* + * (careful, 'failed' and 'spare' are switched from now on) + * + * we want to preserve linear numbering and we want to + * give the proper raid_disk number to the now activated + * disk. (this means we switch back these values) + */ + + xchg_values(spare_desc->raid_disk, failed_desc->raid_disk); + xchg_values(sdisk->raid_disk, fdisk->raid_disk); + xchg_values(spare_desc->number, failed_desc->number); + xchg_values(sdisk->number, fdisk->number); + + *d = failed_desc; + + if (sdisk->dev == MKDEV(0,0)) + sdisk->used_slot = 0; + /* + * this really activates the spare. + */ + fdisk->spare = 0; + fdisk->write_only = 0; + link_disk(conf, fdisk); + + /* + * if we activate a spare, we definitely replace a + * non-operational disk slot in the 'low' area of + * the disk array. 
+ */ + + conf->working_disks++; + + break; + + case DISKOP_HOT_REMOVE_DISK: + rdisk = conf->mirrors + removed_disk; + + if (rdisk->spare && (removed_disk < conf->raid_disks)) { + MD_BUG(); + err = 1; + goto abort; + } + rdisk->dev = MKDEV(0,0); + rdisk->used_slot = 0; + conf->nr_disks--; + break; + + case DISKOP_HOT_ADD_DISK: + adisk = conf->mirrors + added_disk; + added_desc = *d; + + if (added_disk != added_desc->number) { + MD_BUG(); + err = 1; + goto abort; + } + + adisk->number = added_desc->number; + adisk->raid_disk = added_desc->raid_disk; + adisk->dev = MKDEV(added_desc->major,added_desc->minor); + + adisk->operational = 0; + adisk->write_only = 0; + adisk->spare = 1; + adisk->used_slot = 1; + adisk->head_position = 0; + conf->nr_disks++; + + break; + + default: + MD_BUG(); + err = 1; + goto abort; + } +abort: + md_spin_unlock_irq(&conf->device_lock); + if (state == DISKOP_SPARE_ACTIVE || state == DISKOP_SPARE_INACTIVE) + /* should move to "END_REBUILD" when such exists */ + raid1_shrink_buffers(conf); + + print_raid1_conf(conf); + return err; +} + + +#define IO_ERROR KERN_ALERT \ +"raid1: %s: unrecoverable I/O read error for block %lu\n" + +#define REDIRECT_SECTOR KERN_ERR \ +"raid1: %s: redirecting sector %lu to another mirror\n" + +/* + * This is a kernel thread which: + * + * 1. Retries failed read operations on working mirrors. + * 2. Updates the raid superblock when problems encounter. + * 3. Performs writes following reads for array syncronising. + */ +static void end_sync_write(struct buffer_head *bh, int uptodate); +static void end_sync_read(struct buffer_head *bh, int uptodate); + +static void raid1d (void *data) +{ + struct raid1_bh *r1_bh; + struct buffer_head *bh; + unsigned long flags; + mddev_t *mddev; + kdev_t dev; + + + for (;;) { + md_spin_lock_irqsave(&retry_list_lock, flags); + r1_bh = raid1_retry_list; + if (!r1_bh) + break; + raid1_retry_list = r1_bh->next_r1; + md_spin_unlock_irqrestore(&retry_list_lock, flags); + + mddev = r1_bh->mddev; + if (mddev->sb_dirty) { + printk(KERN_INFO "dirty sb detected, updating.\n"); + mddev->sb_dirty = 0; + md_update_sb(mddev); + } + bh = &r1_bh->bh_req; + switch(r1_bh->cmd) { + case SPECIAL: + /* have to allocate lots of bh structures and + * schedule writes + */ + if (test_bit(R1BH_Uptodate, &r1_bh->state)) { + int i, sum_bhs = 0; + int disks = MD_SB_DISKS; + struct buffer_head *bhl, *mbh; + raid1_conf_t *conf; + int sectors = bh->b_size >> 9; + + conf = mddev_to_conf(mddev); + bhl = raid1_alloc_bh(conf, conf->raid_disks); /* don't really need this many */ + for (i = 0; i < disks ; i++) { + if (!conf->mirrors[i].operational) + continue; + if (i==conf->last_used) + /* we read from here, no need to write */ + continue; + if (i < conf->raid_disks + && !conf->resync_mirrors) + /* don't need to write this, + * we are just rebuilding */ + continue; + mbh = bhl; + if (!mbh) { + MD_BUG(); + break; + } + bhl = mbh->b_next; + mbh->b_this_page = (struct buffer_head *)1; + + + /* + * prepare mirrored bh (fields ordered for max mem throughput): + */ + mbh->b_blocknr = bh->b_blocknr; + mbh->b_dev = conf->mirrors[i].dev; + mbh->b_rdev = conf->mirrors[i].dev; + mbh->b_rsector = bh->b_blocknr * sectors; + mbh->b_state = (1<<BH_Req) | (1<<BH_Dirty) | + (1<<BH_Mapped) | (1<<BH_Lock); + atomic_set(&mbh->b_count, 1); + mbh->b_size = bh->b_size; + mbh->b_page = bh->b_page; + mbh->b_data = bh->b_data; + mbh->b_list = BUF_LOCKED; + mbh->b_end_io = end_sync_write; + mbh->b_private = r1_bh; + + mbh->b_next = r1_bh->mirror_bh_list; + 
r1_bh->mirror_bh_list = mbh; + + sum_bhs++; + } + md_atomic_set(&r1_bh->remaining, sum_bhs); + if (bhl) raid1_free_bh(conf, bhl); + mbh = r1_bh->mirror_bh_list; + while (mbh) { + struct buffer_head *bh1 = mbh; + mbh = mbh->b_next; + generic_make_request(WRITE, bh1); + md_sync_acct(bh1->b_rdev, bh1->b_size/512); + } + } else { + dev = bh->b_dev; + raid1_map (mddev, &bh->b_dev, bh->b_size >> 9); + if (bh->b_dev == dev) { + printk (IO_ERROR, partition_name(bh->b_dev), bh->b_blocknr); + md_done_sync(mddev, bh->b_size>>10, 0); + } else { + printk (REDIRECT_SECTOR, + partition_name(bh->b_dev), bh->b_blocknr); + bh->b_rdev = bh->b_dev; + generic_make_request(READ, bh); + } + } + + break; + case READ: + case READA: + dev = bh->b_dev; + + raid1_map (mddev, &bh->b_dev, bh->b_size >> 9); + if (bh->b_dev == dev) { + printk (IO_ERROR, partition_name(bh->b_dev), bh->b_blocknr); + raid1_end_bh_io(r1_bh, 0); + } else { + printk (REDIRECT_SECTOR, + partition_name(bh->b_dev), bh->b_blocknr); + bh->b_rdev = bh->b_dev; + generic_make_request (r1_bh->cmd, bh); + } + break; + } + } + md_spin_unlock_irqrestore(&retry_list_lock, flags); +} +#undef IO_ERROR +#undef REDIRECT_SECTOR + +/* + * Private kernel thread to reconstruct mirrors after an unclean + * shutdown. + */ +static void raid1syncd (void *data) +{ + raid1_conf_t *conf = data; + mddev_t *mddev = conf->mddev; + + if (!conf->resync_mirrors) + return; + if (conf->resync_mirrors == 2) + return; + down(&mddev->recovery_sem); + if (!md_do_sync(mddev, NULL)) { + /* + * Only if everything went Ok. + */ + conf->resync_mirrors = 0; + } + + /* If reconstruction was interrupted, we need to close the "active" and "pending" + * holes. + * we know that there are no active rebuild requests, os cnt_active == cnt_ready ==0 + */ + /* this is really needed when recovery stops too... */ + spin_lock_irq(&conf->segment_lock); + conf->start_active = conf->start_pending; + conf->start_ready = conf->start_pending; + wait_event_lock_irq(conf->wait_ready, !conf->cnt_pending, conf->segment_lock); + conf->start_active =conf->start_ready = conf->start_pending = conf->start_future; + conf->start_future = mddev->sb->size+1; + conf->cnt_pending = conf->cnt_future; + conf->cnt_future = 0; + conf->phase = conf->phase ^1; + wait_event_lock_irq(conf->wait_ready, !conf->cnt_pending, conf->segment_lock); + conf->start_active = conf->start_ready = conf->start_pending = conf->start_future = 0; + conf->phase = 0; + conf->cnt_future = conf->cnt_done;; + conf->cnt_done = 0; + spin_unlock_irq(&conf->segment_lock); + wake_up(&conf->wait_done); + + up(&mddev->recovery_sem); + raid1_shrink_buffers(conf); +} + +/* + * perform a "sync" on one "block" + * + * We need to make sure that no normal I/O request - particularly write + * requests - conflict with active sync requests. + * This is achieved by conceptually dividing the device space into a + * number of sections: + * DONE: 0 .. a-1 These blocks are in-sync + * ACTIVE: a.. b-1 These blocks may have active sync requests, but + * no normal IO requests + * READY: b .. c-1 These blocks have no normal IO requests - sync + * request may be happening + * PENDING: c .. d-1 These blocks may have IO requests, but no new + * ones will be added + * FUTURE: d .. end These blocks are not to be considered yet. 
IO may + * be happening, but not sync + * + * We keep a + * phase which flips (0 or 1) each time d moves and + * a count of: + * z = active io requests in FUTURE since d moved - marked with + * current phase + * y = active io requests in FUTURE before d moved, or PENDING - + * marked with previous phase + * x = active sync requests in READY + * w = active sync requests in ACTIVE + * v = active io requests in DONE + * + * Normally, a=b=c=d=0 and z= active io requests + * or a=b=c=d=END and v= active io requests + * Allowed changes to a,b,c,d: + * A: c==d && y==0 -> d+=window, y=z, z=0, phase=!phase + * B: y==0 -> c=d + * C: b=c, w+=x, x=0 + * D: w==0 -> a=b + * E: a==b==c==d==end -> a=b=c=d=0, z=v, v=0 + * + * At start of sync we apply A. + * When y reaches 0, we apply B then A then being sync requests + * When sync point reaches c-1, we wait for y==0, and W==0, and + * then apply apply B then A then D then C. + * Finally, we apply E + * + * The sync request simply issues a "read" against a working drive + * This is marked so that on completion the raid1d thread is woken to + * issue suitable write requests + */ + +static int raid1_sync_request (mddev_t *mddev, unsigned long block_nr) +{ + raid1_conf_t *conf = mddev_to_conf(mddev); + struct mirror_info *mirror; + struct raid1_bh *r1_bh; + struct buffer_head *bh; + int bsize; + + spin_lock_irq(&conf->segment_lock); + if (!block_nr) { + /* initialize ...*/ + int buffs; + conf->start_active = 0; + conf->start_ready = 0; + conf->start_pending = 0; + conf->start_future = 0; + conf->phase = 0; + /* we want enough buffers to hold twice the window of 128*/ + buffs = 128 *2 / (PAGE_SIZE>>9); + buffs = raid1_grow_buffers(conf, buffs); + if (buffs < 2) + goto nomem; + + conf->window = buffs*(PAGE_SIZE>>9)/2; + conf->cnt_future += conf->cnt_done+conf->cnt_pending; + conf->cnt_done = conf->cnt_pending = 0; + if (conf->cnt_ready || conf->cnt_active) + MD_BUG(); + } + while ((block_nr<<1) >= conf->start_pending) { + PRINTK("wait .. sect=%lu start_active=%d ready=%d pending=%d future=%d, cnt_done=%d active=%d ready=%d pending=%d future=%d\n", + block_nr<<1, conf->start_active, conf->start_ready, conf->start_pending, conf->start_future, + conf->cnt_done, conf->cnt_active, conf->cnt_ready, conf->cnt_pending, conf->cnt_future); + wait_event_lock_irq(conf->wait_done, + !conf->cnt_active, + conf->segment_lock); + wait_event_lock_irq(conf->wait_ready, + !conf->cnt_pending, + conf->segment_lock); + conf->start_active = conf->start_ready; + conf->start_ready = conf->start_pending; + conf->start_pending = conf->start_future; + conf->start_future = conf->start_future+conf->window; + // Note: falling off the end is not a problem + conf->phase = conf->phase ^1; + conf->cnt_active = conf->cnt_ready; + conf->cnt_ready = 0; + conf->cnt_pending = conf->cnt_future; + conf->cnt_future = 0; + wake_up(&conf->wait_done); + } + conf->cnt_ready++; + spin_unlock_irq(&conf->segment_lock); + + + /* If reconstructing, and >1 working disc, + * could dedicate one to rebuild and others to + * service read requests .. 
+ */ + mirror = conf->mirrors+conf->last_used; + + r1_bh = raid1_alloc_buf (conf); + r1_bh->master_bh = NULL; + r1_bh->mddev = mddev; + r1_bh->cmd = SPECIAL; + bh = &r1_bh->bh_req; + + bh->b_blocknr = block_nr; + bsize = 1024; + while (!(bh->b_blocknr & 1) && bsize < PAGE_SIZE + && (bh->b_blocknr+2)*(bsize>>10) < mddev->sb->size) { + bh->b_blocknr >>= 1; + bsize <<= 1; + } + bh->b_size = bsize; + bh->b_list = BUF_LOCKED; + bh->b_dev = mirror->dev; + bh->b_rdev = mirror->dev; + bh->b_state = (1<<BH_Req) | (1<<BH_Mapped); + if (!bh->b_page) + BUG(); + if (!bh->b_data) + BUG(); + if (bh->b_data != page_address(bh->b_page)) + BUG(); + bh->b_end_io = end_sync_read; + bh->b_private = r1_bh; + bh->b_rsector = block_nr<<1; + init_waitqueue_head(&bh->b_wait); + + generic_make_request(READ, bh); + md_sync_acct(bh->b_rdev, bh->b_size/512); + + return (bsize >> 10); + +nomem: + raid1_shrink_buffers(conf); + spin_unlock_irq(&conf->segment_lock); + return -ENOMEM; +} + +static void end_sync_read(struct buffer_head *bh, int uptodate) +{ + struct raid1_bh * r1_bh = (struct raid1_bh *)(bh->b_private); + + /* we have read a block, now it needs to be re-written, + * or re-read if the read failed. + * We don't do much here, just schedule handling by raid1d + */ + if (!uptodate) + md_error (mddev_to_kdev(r1_bh->mddev), bh->b_dev); + else + set_bit(R1BH_Uptodate, &r1_bh->state); + raid1_reschedule_retry(r1_bh); +} + +static void end_sync_write(struct buffer_head *bh, int uptodate) +{ + struct raid1_bh * r1_bh = (struct raid1_bh *)(bh->b_private); + + if (!uptodate) + md_error (mddev_to_kdev(r1_bh->mddev), bh->b_dev); + if (atomic_dec_and_test(&r1_bh->remaining)) { + mddev_t *mddev = r1_bh->mddev; + unsigned long sect = bh->b_blocknr * (bh->b_size>>9); + int size = bh->b_size; + raid1_free_buf(r1_bh); + sync_request_done(sect, mddev_to_conf(mddev)); + md_done_sync(mddev,size>>10, uptodate); + } +} + +/* + * This will catch the scenario in which one of the mirrors was + * mounted as a normal device rather than as a part of a raid set. + * + * check_consistency is very personality-dependent, eg. RAID5 cannot + * do this check, it uses another method. + */ +static int __check_consistency (mddev_t *mddev, int row) +{ + raid1_conf_t *conf = mddev_to_conf(mddev); + int disks = MD_SB_DISKS; + kdev_t dev; + struct buffer_head *bh = NULL; + int i, rc = 0; + char *buffer = NULL; + + for (i = 0; i < disks; i++) { + printk("(checking disk %d)\n",i); + if (!conf->mirrors[i].operational) + continue; + printk("(really checking disk %d)\n",i); + dev = conf->mirrors[i].dev; + set_blocksize(dev, 4096); + if ((bh = bread(dev, row / 4, 4096)) == NULL) + break; + if (!buffer) { + buffer = (char *) __get_free_page(GFP_KERNEL); + if (!buffer) + break; + memcpy(buffer, bh->b_data, 4096); + } else if (memcmp(buffer, bh->b_data, 4096)) { + rc = 1; + break; + } + bforget(bh); + fsync_dev(dev); + invalidate_buffers(dev); + bh = NULL; + } + if (buffer) + free_page((unsigned long) buffer); + if (bh) { + dev = bh->b_dev; + bforget(bh); + fsync_dev(dev); + invalidate_buffers(dev); + } + return rc; +} + +static int check_consistency (mddev_t *mddev) +{ + if (__check_consistency(mddev, 0)) +/* + * we do not do this currently, as it's perfectly possible to + * have an inconsistent array when it's freshly created. Only + * newly written data has to be consistent. 
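+ *
+ * (net effect: check_consistency() currently always returns 0, so the
+ * RUNNING_CKRAID path in raid1_run() is never taken.)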
+ */ + return 0; + + return 0; +} + +#define INVALID_LEVEL KERN_WARNING \ +"raid1: md%d: raid level not set to mirroring (%d)\n" + +#define NO_SB KERN_ERR \ +"raid1: disabled mirror %s (couldn't access raid superblock)\n" + +#define ERRORS KERN_ERR \ +"raid1: disabled mirror %s (errors detected)\n" + +#define NOT_IN_SYNC KERN_ERR \ +"raid1: disabled mirror %s (not in sync)\n" + +#define INCONSISTENT KERN_ERR \ +"raid1: disabled mirror %s (inconsistent descriptor)\n" + +#define ALREADY_RUNNING KERN_ERR \ +"raid1: disabled mirror %s (mirror %d already operational)\n" + +#define OPERATIONAL KERN_INFO \ +"raid1: device %s operational as mirror %d\n" + +#define MEM_ERROR KERN_ERR \ +"raid1: couldn't allocate memory for md%d\n" + +#define SPARE KERN_INFO \ +"raid1: spare disk %s\n" + +#define NONE_OPERATIONAL KERN_ERR \ +"raid1: no operational mirrors for md%d\n" + +#define RUNNING_CKRAID KERN_ERR \ +"raid1: detected mirror differences -- running resync\n" + +#define ARRAY_IS_ACTIVE KERN_INFO \ +"raid1: raid set md%d active with %d out of %d mirrors\n" + +#define THREAD_ERROR KERN_ERR \ +"raid1: couldn't allocate thread for md%d\n" + +#define START_RESYNC KERN_WARNING \ +"raid1: raid set md%d not clean; reconstructing mirrors\n" + +static int raid1_run (mddev_t *mddev) +{ + raid1_conf_t *conf; + int i, j, disk_idx; + struct mirror_info *disk; + mdp_super_t *sb = mddev->sb; + mdp_disk_t *descriptor; + mdk_rdev_t *rdev; + struct md_list_head *tmp; + int start_recovery = 0; + + MOD_INC_USE_COUNT; + + if (sb->level != 1) { + printk(INVALID_LEVEL, mdidx(mddev), sb->level); + goto out; + } + /* + * copy the already verified devices into our private RAID1 + * bookkeeping area. [whatever we allocate in raid1_run(), + * should be freed in raid1_stop()] + */ + + conf = kmalloc(sizeof(raid1_conf_t), GFP_KERNEL); + mddev->private = conf; + if (!conf) { + printk(MEM_ERROR, mdidx(mddev)); + goto out; + } + memset(conf, 0, sizeof(*conf)); + + ITERATE_RDEV(mddev,rdev,tmp) { + if (rdev->faulty) { + printk(ERRORS, partition_name(rdev->dev)); + } else { + if (!rdev->sb) { + MD_BUG(); + continue; + } + } + if (rdev->desc_nr == -1) { + MD_BUG(); + continue; + } + descriptor = &sb->disks[rdev->desc_nr]; + disk_idx = descriptor->raid_disk; + disk = conf->mirrors + disk_idx; + + if (disk_faulty(descriptor)) { + disk->number = descriptor->number; + disk->raid_disk = disk_idx; + disk->dev = rdev->dev; + disk->sect_limit = MAX_WORK_PER_DISK; + disk->operational = 0; + disk->write_only = 0; + disk->spare = 0; + disk->used_slot = 1; + disk->head_position = 0; + continue; + } + if (disk_active(descriptor)) { + if (!disk_sync(descriptor)) { + printk(NOT_IN_SYNC, + partition_name(rdev->dev)); + continue; + } + if ((descriptor->number > MD_SB_DISKS) || + (disk_idx > sb->raid_disks)) { + + printk(INCONSISTENT, + partition_name(rdev->dev)); + continue; + } + if (disk->operational) { + printk(ALREADY_RUNNING, + partition_name(rdev->dev), + disk_idx); + continue; + } + printk(OPERATIONAL, partition_name(rdev->dev), + disk_idx); + disk->number = descriptor->number; + disk->raid_disk = disk_idx; + disk->dev = rdev->dev; + disk->sect_limit = MAX_WORK_PER_DISK; + disk->operational = 1; + disk->write_only = 0; + disk->spare = 0; + disk->used_slot = 1; + disk->head_position = 0; + conf->working_disks++; + } else { + /* + * Must be a spare disk .. 
+ */ + printk(SPARE, partition_name(rdev->dev)); + disk->number = descriptor->number; + disk->raid_disk = disk_idx; + disk->dev = rdev->dev; + disk->sect_limit = MAX_WORK_PER_DISK; + disk->operational = 0; + disk->write_only = 0; + disk->spare = 1; + disk->used_slot = 1; + disk->head_position = 0; + } + } + conf->raid_disks = sb->raid_disks; + conf->nr_disks = sb->nr_disks; + conf->mddev = mddev; + conf->device_lock = MD_SPIN_LOCK_UNLOCKED; + + conf->segment_lock = MD_SPIN_LOCK_UNLOCKED; + init_waitqueue_head(&conf->wait_buffer); + init_waitqueue_head(&conf->wait_done); + init_waitqueue_head(&conf->wait_ready); + + if (!conf->working_disks) { + printk(NONE_OPERATIONAL, mdidx(mddev)); + goto out_free_conf; + } + + + /* pre-allocate some buffer_head structures. + * As a minimum, 1 r1bh and raid_disks buffer_heads + * would probably get us by in tight memory situations, + * but a few more is probably a good idea. + * For now, try 16 r1bh and 16*raid_disks bufferheads + * This will allow at least 16 concurrent reads or writes + * even if kmalloc starts failing + */ + if (raid1_grow_r1bh(conf, 16) < 16 || + raid1_grow_bh(conf, 16*conf->raid_disks)< 16*conf->raid_disks) { + printk(MEM_ERROR, mdidx(mddev)); + goto out_free_conf; + } + + for (i = 0; i < MD_SB_DISKS; i++) { + + descriptor = sb->disks+i; + disk_idx = descriptor->raid_disk; + disk = conf->mirrors + disk_idx; + + if (disk_faulty(descriptor) && (disk_idx < conf->raid_disks) && + !disk->used_slot) { + + disk->number = descriptor->number; + disk->raid_disk = disk_idx; + disk->dev = MKDEV(0,0); + + disk->operational = 0; + disk->write_only = 0; + disk->spare = 0; + disk->used_slot = 1; + disk->head_position = 0; + } + } + + /* + * find the first working one and use it as a starting point + * to read balancing. + */ + for (j = 0; !conf->mirrors[j].operational; j++) + /* nothing */; + conf->last_used = j; + + /* + * initialize the 'working disks' list. + */ + for (i = conf->raid_disks - 1; i >= 0; i--) { + if (conf->mirrors[i].operational) { + conf->mirrors[i].next = j; + j = i; + } + } + + if (conf->working_disks != sb->raid_disks) { + printk(KERN_ALERT "raid1: md%d, not all disks are operational -- trying to recover array\n", mdidx(mddev)); + start_recovery = 1; + } + + if (!start_recovery && (sb->state & (1 << MD_SB_CLEAN))) { + /* + * we do sanity checks even if the device says + * it's clean ... + */ + if (check_consistency(mddev)) { + printk(RUNNING_CKRAID); + sb->state &= ~(1 << MD_SB_CLEAN); + } + } + + { + const char * name = "raid1d"; + + conf->thread = md_register_thread(raid1d, conf, name); + if (!conf->thread) { + printk(THREAD_ERROR, mdidx(mddev)); + goto out_free_conf; + } + } + + if (!start_recovery && !(sb->state & (1 << MD_SB_CLEAN))) { + const char * name = "raid1syncd"; + + conf->resync_thread = md_register_thread(raid1syncd, conf,name); + if (!conf->resync_thread) { + printk(THREAD_ERROR, mdidx(mddev)); + goto out_free_conf; + } + + printk(START_RESYNC, mdidx(mddev)); + conf->resync_mirrors = 1; + md_wakeup_thread(conf->resync_thread); + } + + /* + * Regenerate the "device is in sync with the raid set" bit for + * each device. 
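+	 * (a descriptor is marked in-sync exactly when its number matches
+	 * one of the currently operational mirrors.)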
+ */ + for (i = 0; i < MD_SB_DISKS; i++) { + mark_disk_nonsync(sb->disks+i); + for (j = 0; j < sb->raid_disks; j++) { + if (!conf->mirrors[j].operational) + continue; + if (sb->disks[i].number == conf->mirrors[j].number) + mark_disk_sync(sb->disks+i); + } + } + sb->active_disks = conf->working_disks; + + if (start_recovery) + md_recover_arrays(); + + + printk(ARRAY_IS_ACTIVE, mdidx(mddev), sb->active_disks, sb->raid_disks); + /* + * Ok, everything is just fine now + */ + return 0; + +out_free_conf: + raid1_shrink_r1bh(conf); + raid1_shrink_bh(conf, conf->freebh_cnt); + raid1_shrink_buffers(conf); + kfree(conf); + mddev->private = NULL; +out: + MOD_DEC_USE_COUNT; + return -EIO; +} + +#undef INVALID_LEVEL +#undef NO_SB +#undef ERRORS +#undef NOT_IN_SYNC +#undef INCONSISTENT +#undef ALREADY_RUNNING +#undef OPERATIONAL +#undef SPARE +#undef NONE_OPERATIONAL +#undef RUNNING_CKRAID +#undef ARRAY_IS_ACTIVE + +static int raid1_stop_resync (mddev_t *mddev) +{ + raid1_conf_t *conf = mddev_to_conf(mddev); + + if (conf->resync_thread) { + if (conf->resync_mirrors) { + conf->resync_mirrors = 2; + md_interrupt_thread(conf->resync_thread); + + printk(KERN_INFO "raid1: mirror resync was not fully finished, restarting next time.\n"); + return 1; + } + return 0; + } + return 0; +} + +static int raid1_restart_resync (mddev_t *mddev) +{ + raid1_conf_t *conf = mddev_to_conf(mddev); + + if (conf->resync_mirrors) { + if (!conf->resync_thread) { + MD_BUG(); + return 0; + } + conf->resync_mirrors = 1; + md_wakeup_thread(conf->resync_thread); + return 1; + } + return 0; +} + +static int raid1_stop (mddev_t *mddev) +{ + raid1_conf_t *conf = mddev_to_conf(mddev); + + md_unregister_thread(conf->thread); + if (conf->resync_thread) + md_unregister_thread(conf->resync_thread); + raid1_shrink_r1bh(conf); + raid1_shrink_bh(conf, conf->freebh_cnt); + raid1_shrink_buffers(conf); + kfree(conf); + mddev->private = NULL; + MOD_DEC_USE_COUNT; + return 0; +} + +static mdk_personality_t raid1_personality= +{ + name: "raid1", + make_request: raid1_make_request, + run: raid1_run, + stop: raid1_stop, + status: raid1_status, + error_handler: raid1_error, + diskop: raid1_diskop, + stop_resync: raid1_stop_resync, + restart_resync: raid1_restart_resync, + sync_request: raid1_sync_request +}; + +int raid1_init (void) +{ + return register_md_personality (RAID1, &raid1_personality); +} + +#ifdef MODULE +int init_module (void) +{ + return raid1_init(); +} + +void cleanup_module (void) +{ + unregister_md_personality (RAID1); +} +#endif diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c new file mode 100644 index 000000000..cff836dc4 --- /dev/null +++ b/drivers/md/raid5.c @@ -0,0 +1,2371 @@ +/* + * raid5.c : Multiple Devices driver for Linux + * Copyright (C) 1996, 1997 Ingo Molnar, Miguel de Icaza, Gadi Oxman + * Copyright (C) 1999, 2000 Ingo Molnar + * + * RAID-5 management functions. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * You should have received a copy of the GNU General Public License + * (for example /usr/src/linux/COPYING); if not, write to the Free + * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ */ + + +#include <linux/config.h> +#include <linux/module.h> +#include <linux/locks.h> +#include <linux/malloc.h> +#include <linux/raid/raid5.h> +#include <asm/bitops.h> +#include <asm/atomic.h> + +static mdk_personality_t raid5_personality; + +/* + * Stripe cache + */ + +#define NR_STRIPES 128 +#define HASH_PAGES 1 +#define HASH_PAGES_ORDER 0 +#define NR_HASH (HASH_PAGES * PAGE_SIZE / sizeof(struct stripe_head *)) +#define HASH_MASK (NR_HASH - 1) +#define stripe_hash(conf, sect, size) ((conf)->stripe_hashtbl[((sect) / (size >> 9)) & HASH_MASK]) + +/* + * The following can be used to debug the driver + */ +#define RAID5_DEBUG 0 +#define RAID5_PARANOIA 1 +#if RAID5_PARANOIA && CONFIG_SMP +# define CHECK_DEVLOCK() if (!spin_is_locked(&conf->device_lock)) BUG() +# define CHECK_SHLOCK(sh) if (!stripe_locked(sh)) BUG() +#else +# define CHECK_DEVLOCK() +# define CHECK_SHLOCK(unused) +#endif + +#if RAID5_DEBUG +#define PRINTK(x...) printk(x) +#define inline +#define __inline__ +#else +#define PRINTK(x...) do { } while (0) +#endif + +static void print_raid5_conf (raid5_conf_t *conf); + +static inline int stripe_locked(struct stripe_head *sh) +{ + return test_bit(STRIPE_LOCKED, &sh->state); +} + +static void __unlock_stripe(struct stripe_head *sh) +{ + if (!md_test_and_clear_bit(STRIPE_LOCKED, &sh->state)) + BUG(); + PRINTK("unlocking stripe %lu\n", sh->sector); + wake_up(&sh->wait); +} + +static void finish_unlock_stripe(struct stripe_head *sh) +{ + raid5_conf_t *conf = sh->raid_conf; + sh->cmd = STRIPE_NONE; + sh->phase = PHASE_COMPLETE; + atomic_dec(&conf->nr_pending_stripes); + atomic_inc(&conf->nr_cached_stripes); + __unlock_stripe(sh); + atomic_dec(&sh->count); + wake_up(&conf->wait_for_stripe); +} + +static void remove_hash(raid5_conf_t *conf, struct stripe_head *sh) +{ + PRINTK("remove_hash(), stripe %lu\n", sh->sector); + + CHECK_DEVLOCK(); + CHECK_SHLOCK(sh); + if (sh->hash_pprev) { + if (sh->hash_next) + sh->hash_next->hash_pprev = sh->hash_pprev; + *sh->hash_pprev = sh->hash_next; + sh->hash_pprev = NULL; + atomic_dec(&conf->nr_hashed_stripes); + } +} + +static void lock_get_bh (struct buffer_head *bh) +{ + while (md_test_and_set_bit(BH_Lock, &bh->b_state)) + __wait_on_buffer(bh); + atomic_inc(&bh->b_count); +} + +static __inline__ void insert_hash(raid5_conf_t *conf, struct stripe_head *sh) +{ + struct stripe_head **shp = &stripe_hash(conf, sh->sector, sh->size); + + PRINTK("insert_hash(), stripe %lu, nr_hashed_stripes %d\n", + sh->sector, atomic_read(&conf->nr_hashed_stripes)); + + CHECK_DEVLOCK(); + CHECK_SHLOCK(sh); + if ((sh->hash_next = *shp) != NULL) + (*shp)->hash_pprev = &sh->hash_next; + *shp = sh; + sh->hash_pprev = shp; + atomic_inc(&conf->nr_hashed_stripes); +} + +static struct buffer_head *get_free_buffer(struct stripe_head *sh, int b_size) +{ + struct buffer_head *bh; + unsigned long flags; + + CHECK_SHLOCK(sh); + md_spin_lock_irqsave(&sh->stripe_lock, flags); + bh = sh->buffer_pool; + if (!bh) + goto out_unlock; + sh->buffer_pool = bh->b_next; + bh->b_size = b_size; + if (atomic_read(&bh->b_count)) + BUG(); +out_unlock: + md_spin_unlock_irqrestore(&sh->stripe_lock, flags); + + return bh; +} + +static struct buffer_head *get_free_bh(struct stripe_head *sh) +{ + struct buffer_head *bh; + unsigned long flags; + + CHECK_SHLOCK(sh); + md_spin_lock_irqsave(&sh->stripe_lock, flags); + bh = sh->bh_pool; + if (!bh) + goto out_unlock; + sh->bh_pool = bh->b_next; + if (atomic_read(&bh->b_count)) + BUG(); +out_unlock: + md_spin_unlock_irqrestore(&sh->stripe_lock, flags); + + 
return bh; +} + +static void put_free_buffer(struct stripe_head *sh, struct buffer_head *bh) +{ + unsigned long flags; + + if (atomic_read(&bh->b_count)) + BUG(); + CHECK_SHLOCK(sh); + md_spin_lock_irqsave(&sh->stripe_lock, flags); + bh->b_next = sh->buffer_pool; + sh->buffer_pool = bh; + md_spin_unlock_irqrestore(&sh->stripe_lock, flags); +} + +static void put_free_bh(struct stripe_head *sh, struct buffer_head *bh) +{ + unsigned long flags; + + if (atomic_read(&bh->b_count)) + BUG(); + CHECK_SHLOCK(sh); + md_spin_lock_irqsave(&sh->stripe_lock, flags); + bh->b_next = sh->bh_pool; + sh->bh_pool = bh; + md_spin_unlock_irqrestore(&sh->stripe_lock, flags); +} + +static struct stripe_head *get_free_stripe(raid5_conf_t *conf) +{ + struct stripe_head *sh; + + md_spin_lock_irq(&conf->device_lock); + sh = conf->free_sh_list; + if (!sh) + goto out; + conf->free_sh_list = sh->free_next; + atomic_dec(&conf->nr_free_sh); + if (!atomic_read(&conf->nr_free_sh) && conf->free_sh_list) + BUG(); + if (sh->hash_pprev || md_atomic_read(&sh->nr_pending) || + atomic_read(&sh->count)) + BUG(); +out: + md_spin_unlock_irq(&conf->device_lock); + return sh; +} + +static void __put_free_stripe (raid5_conf_t *conf, struct stripe_head *sh) +{ + if (atomic_read(&sh->count) != 0) + BUG(); + CHECK_DEVLOCK(); + CHECK_SHLOCK(sh); + clear_bit(STRIPE_LOCKED, &sh->state); + sh->free_next = conf->free_sh_list; + conf->free_sh_list = sh; + atomic_inc(&conf->nr_free_sh); +} + +static void shrink_buffers(struct stripe_head *sh, int num) +{ + struct buffer_head *bh; + + while (num--) { + bh = get_free_buffer(sh, -1); + if (!bh) + return; + free_page((unsigned long) bh->b_data); + kfree(bh); + } +} + +static void shrink_bh(struct stripe_head *sh, int num) +{ + struct buffer_head *bh; + + while (num--) { + bh = get_free_bh(sh); + if (!bh) + return; + kfree(bh); + } +} + +static int grow_raid5_buffers(struct stripe_head *sh, int num, int b_size, int priority) +{ + struct buffer_head *bh; + + while (num--) { + struct page *page; + bh = kmalloc(sizeof(struct buffer_head), priority); + if (!bh) + return 1; + memset(bh, 0, sizeof (struct buffer_head)); + init_waitqueue_head(&bh->b_wait); + page = alloc_page(priority); + bh->b_data = page_address(page); + if (!bh->b_data) { + kfree(bh); + return 1; + } + bh->b_size = b_size; + atomic_set(&bh->b_count, 0); + bh->b_page = page; + put_free_buffer(sh, bh); + } + return 0; +} + +static int grow_bh(struct stripe_head *sh, int num, int priority) +{ + struct buffer_head *bh; + + while (num--) { + bh = kmalloc(sizeof(struct buffer_head), priority); + if (!bh) + return 1; + memset(bh, 0, sizeof (struct buffer_head)); + init_waitqueue_head(&bh->b_wait); + put_free_bh(sh, bh); + } + return 0; +} + +static void raid5_free_buffer(struct stripe_head *sh, struct buffer_head *bh) +{ + put_free_buffer(sh, bh); +} + +static void raid5_free_bh(struct stripe_head *sh, struct buffer_head *bh) +{ + put_free_bh(sh, bh); +} + +static void raid5_free_old_bh(struct stripe_head *sh, int i) +{ + CHECK_SHLOCK(sh); + if (!sh->bh_old[i]) + BUG(); + raid5_free_buffer(sh, sh->bh_old[i]); + sh->bh_old[i] = NULL; +} + +static void raid5_update_old_bh(struct stripe_head *sh, int i) +{ + CHECK_SHLOCK(sh); + PRINTK("stripe %lu, idx %d, updating cache copy\n", sh->sector, i); + if (!sh->bh_copy[i]) + BUG(); + if (sh->bh_old[i]) + raid5_free_old_bh(sh, i); + sh->bh_old[i] = sh->bh_copy[i]; + sh->bh_copy[i] = NULL; +} + +static void free_stripe(struct stripe_head *sh) +{ + raid5_conf_t *conf = sh->raid_conf; + int disks = 
conf->raid_disks, j; + + if (atomic_read(&sh->count) != 0) + BUG(); + CHECK_DEVLOCK(); + CHECK_SHLOCK(sh); + PRINTK("free_stripe called, stripe %lu\n", sh->sector); + if (sh->phase != PHASE_COMPLETE || atomic_read(&sh->count)) { + PRINTK("raid5: free_stripe(), sector %lu, phase %d, count %d\n", sh->sector, sh->phase, atomic_read(&sh->count)); + return; + } + for (j = 0; j < disks; j++) { + if (sh->bh_old[j]) + raid5_free_old_bh(sh, j); + if (sh->bh_new[j] || sh->bh_copy[j]) + BUG(); + } + remove_hash(conf, sh); + __put_free_stripe(conf, sh); +} + +static int shrink_stripe_cache(raid5_conf_t *conf, int nr) +{ + struct stripe_head *sh; + int i, count = 0; + + PRINTK("shrink_stripe_cache called, %d/%d, clock %d\n", nr, atomic_read(&conf->nr_hashed_stripes), conf->clock); + md_spin_lock_irq(&conf->device_lock); + for (i = 0; i < NR_HASH; i++) { + sh = conf->stripe_hashtbl[(i + conf->clock) & HASH_MASK]; + for (; sh; sh = sh->hash_next) { + if (sh->phase != PHASE_COMPLETE) + continue; + if (atomic_read(&sh->count)) + continue; + /* + * Try to lock this stripe: + */ + if (md_test_and_set_bit(STRIPE_LOCKED, &sh->state)) + continue; + free_stripe(sh); + if (++count == nr) { + conf->clock = (i + conf->clock) & HASH_MASK; + goto out; + } + } + } +out: + md_spin_unlock_irq(&conf->device_lock); + PRINTK("shrink completed, nr_hashed_stripes %d, nr_pending_strips %d\n", + atomic_read(&conf->nr_hashed_stripes), + atomic_read(&conf->nr_pending_stripes)); + return count; +} + +void __wait_lock_stripe(struct stripe_head *sh) +{ + MD_DECLARE_WAITQUEUE(wait, current); + + PRINTK("wait_lock_stripe %lu\n", sh->sector); + if (!atomic_read(&sh->count)) + BUG(); + add_wait_queue(&sh->wait, &wait); +repeat: + set_current_state(TASK_UNINTERRUPTIBLE); + if (md_test_and_set_bit(STRIPE_LOCKED, &sh->state)) { + schedule(); + goto repeat; + } + PRINTK("wait_lock_stripe %lu done\n", sh->sector); + remove_wait_queue(&sh->wait, &wait); + current->state = TASK_RUNNING; +} + +static struct stripe_head *__find_stripe(raid5_conf_t *conf, unsigned long sector, int size) +{ + struct stripe_head *sh; + + PRINTK("__find_stripe, sector %lu\n", sector); + for (sh = stripe_hash(conf, sector, size); sh; sh = sh->hash_next) { + if (sh->sector == sector && sh->raid_conf == conf) { + if (sh->size != size) + BUG(); + return sh; + } + } + PRINTK("__stripe %lu not in cache\n", sector); + return NULL; +} + +static inline struct stripe_head *alloc_stripe(raid5_conf_t *conf, unsigned long sector, int size) +{ + struct stripe_head *sh; + struct buffer_head *buffer_pool, *bh_pool; + MD_DECLARE_WAITQUEUE(wait, current); + + PRINTK("alloc_stripe called\n"); + + + while ((sh = get_free_stripe(conf)) == NULL) { + int cnt; + add_wait_queue(&conf->wait_for_stripe, &wait); + set_current_state(TASK_UNINTERRUPTIBLE); + cnt = shrink_stripe_cache(conf, conf->max_nr_stripes / 8); + sh = get_free_stripe(conf); + if (!sh && cnt < (conf->max_nr_stripes/8)) { + md_wakeup_thread(conf->thread); + PRINTK("waiting for some stripes to complete - %d %d\n", cnt, conf->max_nr_stripes/8); + schedule(); + } + remove_wait_queue(&conf->wait_for_stripe, &wait); + current->state = TASK_RUNNING; + if (sh) + break; + } + + buffer_pool = sh->buffer_pool; + bh_pool = sh->bh_pool; + memset(sh, 0, sizeof(*sh)); + sh->stripe_lock = MD_SPIN_LOCK_UNLOCKED; + md_init_waitqueue_head(&sh->wait); + sh->buffer_pool = buffer_pool; + sh->bh_pool = bh_pool; + sh->phase = PHASE_COMPLETE; + sh->cmd = STRIPE_NONE; + sh->raid_conf = conf; + sh->sector = sector; + sh->size = size; + 
atomic_inc(&conf->nr_cached_stripes); + + return sh; +} + +static struct stripe_head *get_lock_stripe(raid5_conf_t *conf, unsigned long sector, int size) +{ + struct stripe_head *sh, *new = NULL; + + PRINTK("get_stripe, sector %lu\n", sector); + + /* + * Do this in set_blocksize()! + */ + if (conf->buffer_size != size) { + PRINTK("switching size, %d --> %d\n", conf->buffer_size, size); + shrink_stripe_cache(conf, conf->max_nr_stripes); + conf->buffer_size = size; + } + +repeat: + md_spin_lock_irq(&conf->device_lock); + sh = __find_stripe(conf, sector, size); + if (!sh) { + if (!new) { + md_spin_unlock_irq(&conf->device_lock); + new = alloc_stripe(conf, sector, size); + goto repeat; + } + sh = new; + new = NULL; + if (md_test_and_set_bit(STRIPE_LOCKED, &sh->state)) + BUG(); + insert_hash(conf, sh); + atomic_inc(&sh->count); + md_spin_unlock_irq(&conf->device_lock); + } else { + atomic_inc(&sh->count); + if (new) { + if (md_test_and_set_bit(STRIPE_LOCKED, &new->state)) + BUG(); + __put_free_stripe(conf, new); + } + md_spin_unlock_irq(&conf->device_lock); + PRINTK("get_stripe, waiting, sector %lu\n", sector); + if (md_test_and_set_bit(STRIPE_LOCKED, &sh->state)) + __wait_lock_stripe(sh); + } + return sh; +} + +static int grow_stripes(raid5_conf_t *conf, int num, int priority) +{ + struct stripe_head *sh; + + while (num--) { + sh = kmalloc(sizeof(struct stripe_head), priority); + if (!sh) + return 1; + memset(sh, 0, sizeof(*sh)); + sh->raid_conf = conf; + sh->stripe_lock = MD_SPIN_LOCK_UNLOCKED; + md_init_waitqueue_head(&sh->wait); + + if (md_test_and_set_bit(STRIPE_LOCKED, &sh->state)) + BUG(); + if (grow_raid5_buffers(sh, 2 * conf->raid_disks, PAGE_SIZE, priority)) { + shrink_buffers(sh, 2 * conf->raid_disks); + kfree(sh); + return 1; + } + if (grow_bh(sh, conf->raid_disks, priority)) { + shrink_buffers(sh, 2 * conf->raid_disks); + shrink_bh(sh, conf->raid_disks); + kfree(sh); + return 1; + } + md_spin_lock_irq(&conf->device_lock); + __put_free_stripe(conf, sh); + atomic_inc(&conf->nr_stripes); + md_spin_unlock_irq(&conf->device_lock); + } + return 0; +} + +static void shrink_stripes(raid5_conf_t *conf, int num) +{ + struct stripe_head *sh; + + while (num--) { + sh = get_free_stripe(conf); + if (!sh) + break; + if (md_test_and_set_bit(STRIPE_LOCKED, &sh->state)) + BUG(); + shrink_buffers(sh, conf->raid_disks * 2); + shrink_bh(sh, conf->raid_disks); + kfree(sh); + atomic_dec(&conf->nr_stripes); + } +} + + +static struct buffer_head *raid5_alloc_buffer(struct stripe_head *sh, int b_size) +{ + struct buffer_head *bh; + + bh = get_free_buffer(sh, b_size); + if (!bh) + BUG(); + return bh; +} + +static struct buffer_head *raid5_alloc_bh(struct stripe_head *sh) +{ + struct buffer_head *bh; + + bh = get_free_bh(sh); + if (!bh) + BUG(); + return bh; +} + +static void raid5_end_buffer_io (struct stripe_head *sh, int i, int uptodate) +{ + struct buffer_head *bh = sh->bh_new[i]; + + PRINTK("raid5_end_buffer_io %lu, uptodate: %d.\n", bh->b_blocknr, uptodate); + sh->bh_new[i] = NULL; + raid5_free_bh(sh, sh->bh_req[i]); + sh->bh_req[i] = NULL; + PRINTK("calling %p->end_io: %p.\n", bh, bh->b_end_io); + bh->b_end_io(bh, uptodate); + if (!uptodate) + printk(KERN_ALERT "raid5: %s: unrecoverable I/O error for " + "block %lu\n", + partition_name(mddev_to_kdev(sh->raid_conf->mddev)), + bh->b_blocknr); +} + +static inline void raid5_mark_buffer_uptodate (struct buffer_head *bh, int uptodate) +{ + if (uptodate) + set_bit(BH_Uptodate, &bh->b_state); + else + clear_bit(BH_Uptodate, &bh->b_state); +} + +static void 
raid5_end_request (struct buffer_head * bh, int uptodate) +{ + struct stripe_head *sh = bh->b_private; + raid5_conf_t *conf = sh->raid_conf; + int disks = conf->raid_disks, i; + unsigned long flags; + + PRINTK("end_request %lu, nr_pending %d, uptodate: %d, (caller: %p,%p,%p,%p).\n", sh->sector, atomic_read(&sh->nr_pending), uptodate, __builtin_return_address(0),__builtin_return_address(1),__builtin_return_address(2), __builtin_return_address(3)); + md_spin_lock_irqsave(&sh->stripe_lock, flags); + raid5_mark_buffer_uptodate(bh, uptodate); + if (!uptodate) + md_error(mddev_to_kdev(conf->mddev), bh->b_dev); + if (conf->failed_disks) { + for (i = 0; i < disks; i++) { + if (conf->disks[i].operational) + continue; + if (bh != sh->bh_old[i] && bh != sh->bh_req[i] && bh != sh->bh_copy[i]) + continue; + if (bh->b_dev != conf->disks[i].dev) + continue; + set_bit(STRIPE_ERROR, &sh->state); + } + } + md_spin_unlock_irqrestore(&sh->stripe_lock, flags); + + if (atomic_dec_and_test(&sh->nr_pending)) { + atomic_inc(&conf->nr_handle); + md_wakeup_thread(conf->thread); + } +} + +static void raid5_build_block (struct stripe_head *sh, struct buffer_head *bh, int i) +{ + raid5_conf_t *conf = sh->raid_conf; + char *b_data; + struct page *b_page; + unsigned long block = sh->sector / (sh->size >> 9); + + b_data = bh->b_data; + b_page = bh->b_page; + memset (bh, 0, sizeof (struct buffer_head)); + init_waitqueue_head(&bh->b_wait); + init_buffer(bh, raid5_end_request, sh); + bh->b_dev = conf->disks[i].dev; + bh->b_blocknr = block; + + bh->b_data = b_data; + bh->b_page = b_page; + + bh->b_rdev = conf->disks[i].dev; + bh->b_rsector = sh->sector; + + bh->b_state = (1 << BH_Req) | (1 << BH_Mapped); + bh->b_size = sh->size; + bh->b_list = BUF_LOCKED; +} + +static int raid5_error (mddev_t *mddev, kdev_t dev) +{ + raid5_conf_t *conf = (raid5_conf_t *) mddev->private; + mdp_super_t *sb = mddev->sb; + struct disk_info *disk; + int i; + + PRINTK("raid5_error called\n"); + conf->resync_parity = 0; + for (i = 0, disk = conf->disks; i < conf->raid_disks; i++, disk++) { + if (disk->dev == dev && disk->operational) { + disk->operational = 0; + mark_disk_faulty(sb->disks+disk->number); + mark_disk_nonsync(sb->disks+disk->number); + mark_disk_inactive(sb->disks+disk->number); + sb->active_disks--; + sb->working_disks--; + sb->failed_disks++; + mddev->sb_dirty = 1; + conf->working_disks--; + conf->failed_disks++; + md_wakeup_thread(conf->thread); + printk (KERN_ALERT + "raid5: Disk failure on %s, disabling device." + " Operation continuing on %d devices\n", + partition_name (dev), conf->working_disks); + return 0; + } + } + /* + * handle errors in spares (during reconstruction) + */ + if (conf->spare) { + disk = conf->spare; + if (disk->dev == dev) { + printk (KERN_ALERT + "raid5: Disk failure on spare %s\n", + partition_name (dev)); + if (!conf->spare->operational) { + MD_BUG(); + return -EIO; + } + disk->operational = 0; + disk->write_only = 0; + conf->spare = NULL; + mark_disk_faulty(sb->disks+disk->number); + mark_disk_nonsync(sb->disks+disk->number); + mark_disk_inactive(sb->disks+disk->number); + sb->spare_disks--; + sb->working_disks--; + sb->failed_disks++; + + return 0; + } + } + MD_BUG(); + return -EIO; +} + +/* + * Input: a 'big' sector number, + * Output: index of the data and parity disk, and the sector # in them. 
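+ *
+ * An illustrative example with arbitrary values: raid_disks = 5 (so
+ * data_disks = 4), a 64K chunk (128 sectors), the left-symmetric
+ * layout and r_sector = 1000 give
+ *	chunk_number = 1000/128 = 7,  chunk_offset = 104
+ *	stripe = 7/4 = 1,  dd_idx = 7%4 = 3
+ *	pd_idx = 4 - 1%5 = 3,  dd_idx = (3+1+3)%5 = 2
+ *	new_sector = 1*128 + 104 = 232
+ * i.e. the data lands at sector 232 of member 2, with the parity for
+ * that stripe row on member 3.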
+ */ +static unsigned long raid5_compute_sector(unsigned long r_sector, unsigned int raid_disks, + unsigned int data_disks, unsigned int * dd_idx, + unsigned int * pd_idx, raid5_conf_t *conf) +{ + unsigned long stripe; + unsigned long chunk_number; + unsigned int chunk_offset; + unsigned long new_sector; + int sectors_per_chunk = conf->chunk_size >> 9; + + /* First compute the information on this sector */ + + /* + * Compute the chunk number and the sector offset inside the chunk + */ + chunk_number = r_sector / sectors_per_chunk; + chunk_offset = r_sector % sectors_per_chunk; + + /* + * Compute the stripe number + */ + stripe = chunk_number / data_disks; + + /* + * Compute the data disk and parity disk indexes inside the stripe + */ + *dd_idx = chunk_number % data_disks; + + /* + * Select the parity disk based on the user selected algorithm. + */ + if (conf->level == 4) + *pd_idx = data_disks; + else switch (conf->algorithm) { + case ALGORITHM_LEFT_ASYMMETRIC: + *pd_idx = data_disks - stripe % raid_disks; + if (*dd_idx >= *pd_idx) + (*dd_idx)++; + break; + case ALGORITHM_RIGHT_ASYMMETRIC: + *pd_idx = stripe % raid_disks; + if (*dd_idx >= *pd_idx) + (*dd_idx)++; + break; + case ALGORITHM_LEFT_SYMMETRIC: + *pd_idx = data_disks - stripe % raid_disks; + *dd_idx = (*pd_idx + 1 + *dd_idx) % raid_disks; + break; + case ALGORITHM_RIGHT_SYMMETRIC: + *pd_idx = stripe % raid_disks; + *dd_idx = (*pd_idx + 1 + *dd_idx) % raid_disks; + break; + default: + printk ("raid5: unsupported algorithm %d\n", conf->algorithm); + } + + /* + * Finally, compute the new sector number + */ + new_sector = stripe * sectors_per_chunk + chunk_offset; + return new_sector; +} + +static unsigned long compute_blocknr(struct stripe_head *sh, int i) +{ + raid5_conf_t *conf = sh->raid_conf; + int raid_disks = conf->raid_disks, data_disks = raid_disks - 1; + unsigned long new_sector = sh->sector, check; + int sectors_per_chunk = conf->chunk_size >> 9; + unsigned long stripe = new_sector / sectors_per_chunk; + int chunk_offset = new_sector % sectors_per_chunk; + int chunk_number, dummy1, dummy2, dd_idx = i; + unsigned long r_sector, blocknr; + + switch (conf->algorithm) { + case ALGORITHM_LEFT_ASYMMETRIC: + case ALGORITHM_RIGHT_ASYMMETRIC: + if (i > sh->pd_idx) + i--; + break; + case ALGORITHM_LEFT_SYMMETRIC: + case ALGORITHM_RIGHT_SYMMETRIC: + if (i < sh->pd_idx) + i += raid_disks; + i -= (sh->pd_idx + 1); + break; + default: + printk ("raid5: unsupported algorithm %d\n", conf->algorithm); + } + + chunk_number = stripe * data_disks + i; + r_sector = chunk_number * sectors_per_chunk + chunk_offset; + blocknr = r_sector / (sh->size >> 9); + + check = raid5_compute_sector (r_sector, raid_disks, data_disks, &dummy1, &dummy2, conf); + if (check != sh->sector || dummy1 != dd_idx || dummy2 != sh->pd_idx) { + printk("compute_blocknr: map not correct\n"); + return 0; + } + return blocknr; +} + +static void compute_block(struct stripe_head *sh, int dd_idx) +{ + raid5_conf_t *conf = sh->raid_conf; + int i, count, disks = conf->raid_disks; + struct buffer_head *bh_ptr[MAX_XOR_BLOCKS]; + + PRINTK("compute_block, stripe %lu, idx %d\n", sh->sector, dd_idx); + + if (sh->bh_old[dd_idx] == NULL) + sh->bh_old[dd_idx] = raid5_alloc_buffer(sh, sh->size); + raid5_build_block(sh, sh->bh_old[dd_idx], dd_idx); + + memset(sh->bh_old[dd_idx]->b_data, 0, sh->size); + bh_ptr[0] = sh->bh_old[dd_idx]; + count = 1; + for (i = 0; i < disks; i++) { + if (i == dd_idx) + continue; + if (sh->bh_old[i]) { + bh_ptr[count++] = sh->bh_old[i]; + } else { + 
printk("compute_block() %d, stripe %lu, %d not present\n", dd_idx, sh->sector, i); + } + if (count == MAX_XOR_BLOCKS) { + xor_block(count, &bh_ptr[0]); + count = 1; + } + } + if (count != 1) + xor_block(count, &bh_ptr[0]); + raid5_mark_buffer_uptodate(sh->bh_old[dd_idx], 1); +} + +static void compute_parity(struct stripe_head *sh, int method) +{ + raid5_conf_t *conf = sh->raid_conf; + int i, pd_idx = sh->pd_idx, disks = conf->raid_disks, count; + struct buffer_head *bh_ptr[MAX_XOR_BLOCKS]; + + PRINTK("compute_parity, stripe %lu, method %d\n", sh->sector, method); + for (i = 0; i < disks; i++) { + if (i == pd_idx || !sh->bh_new[i]) + continue; + if (!sh->bh_copy[i]) + sh->bh_copy[i] = raid5_alloc_buffer(sh, sh->size); + raid5_build_block(sh, sh->bh_copy[i], i); + atomic_set_buffer_dirty(sh->bh_copy[i]); + memcpy(sh->bh_copy[i]->b_data, sh->bh_new[i]->b_data, sh->size); + } + if (sh->bh_copy[pd_idx] == NULL) { + sh->bh_copy[pd_idx] = raid5_alloc_buffer(sh, sh->size); + atomic_set_buffer_dirty(sh->bh_copy[pd_idx]); + } + raid5_build_block(sh, sh->bh_copy[pd_idx], sh->pd_idx); + + if (method == RECONSTRUCT_WRITE) { + memset(sh->bh_copy[pd_idx]->b_data, 0, sh->size); + bh_ptr[0] = sh->bh_copy[pd_idx]; + count = 1; + for (i = 0; i < disks; i++) { + if (i == sh->pd_idx) + continue; + if (sh->bh_new[i]) { + bh_ptr[count++] = sh->bh_copy[i]; + } else if (sh->bh_old[i]) { + bh_ptr[count++] = sh->bh_old[i]; + } + if (count == MAX_XOR_BLOCKS) { + xor_block(count, &bh_ptr[0]); + count = 1; + } + } + if (count != 1) { + xor_block(count, &bh_ptr[0]); + } + } else if (method == READ_MODIFY_WRITE) { + memcpy(sh->bh_copy[pd_idx]->b_data, sh->bh_old[pd_idx]->b_data, sh->size); + bh_ptr[0] = sh->bh_copy[pd_idx]; + count = 1; + for (i = 0; i < disks; i++) { + if (i == sh->pd_idx) + continue; + if (sh->bh_new[i] && sh->bh_old[i]) { + bh_ptr[count++] = sh->bh_copy[i]; + bh_ptr[count++] = sh->bh_old[i]; + } + if (count >= (MAX_XOR_BLOCKS - 1)) { + xor_block(count, &bh_ptr[0]); + count = 1; + } + } + if (count != 1) { + xor_block(count, &bh_ptr[0]); + } + } + raid5_mark_buffer_uptodate(sh->bh_copy[pd_idx], 1); +} + +static void add_stripe_bh (struct stripe_head *sh, struct buffer_head *bh, int dd_idx, int rw) +{ + raid5_conf_t *conf = sh->raid_conf; + struct buffer_head *bh_req; + + PRINTK("adding bh b#%lu to stripe s#%lu\n", bh->b_blocknr, sh->sector); + CHECK_SHLOCK(sh); + if (sh->bh_new[dd_idx]) + BUG(); + + bh_req = raid5_alloc_bh(sh); + raid5_build_block(sh, bh_req, dd_idx); + bh_req->b_data = bh->b_data; + bh_req->b_page = bh->b_page; + + md_spin_lock_irq(&conf->device_lock); + if (sh->phase == PHASE_COMPLETE && sh->cmd == STRIPE_NONE) { + PRINTK("stripe s#%lu => PHASE_BEGIN (%s)\n", sh->sector, rw == READ ? "read" : "write"); + sh->phase = PHASE_BEGIN; + sh->cmd = (rw == READ) ? 
STRIPE_READ : STRIPE_WRITE; + atomic_inc(&conf->nr_pending_stripes); + atomic_inc(&conf->nr_handle); + PRINTK("# of pending stripes: %u, # of handle: %u\n", atomic_read(&conf->nr_pending_stripes), atomic_read(&conf->nr_handle)); + } + sh->bh_new[dd_idx] = bh; + sh->bh_req[dd_idx] = bh_req; + sh->cmd_new[dd_idx] = rw; + sh->new[dd_idx] = 1; + md_spin_unlock_irq(&conf->device_lock); + + PRINTK("added bh b#%lu to stripe s#%lu, disk %d.\n", bh->b_blocknr, sh->sector, dd_idx); +} + +static void complete_stripe(struct stripe_head *sh) +{ + raid5_conf_t *conf = sh->raid_conf; + int disks = conf->raid_disks; + int i, new = 0; + + PRINTK("complete_stripe %lu\n", sh->sector); + for (i = 0; i < disks; i++) { + if (sh->cmd == STRIPE_SYNC && sh->bh_copy[i]) + raid5_update_old_bh(sh, i); + if (sh->cmd == STRIPE_WRITE && i == sh->pd_idx) + raid5_update_old_bh(sh, i); + if (sh->bh_new[i]) { + PRINTK("stripe %lu finishes new bh, sh->new == %d\n", sh->sector, sh->new[i]); + if (!sh->new[i]) { +#if 0 + if (sh->cmd == STRIPE_WRITE) { + if (memcmp(sh->bh_new[i]->b_data, sh->bh_copy[i]->b_data, sh->size)) { + printk("copy differs, %s, sector %lu ", + test_bit(BH_Dirty, &sh->bh_new[i]->b_state) ? "dirty" : "clean", + sh->sector); + } else if (test_bit(BH_Dirty, &sh->bh_new[i]->b_state)) + printk("sector %lu dirty\n", sh->sector); + } +#endif + if (sh->cmd == STRIPE_WRITE) + raid5_update_old_bh(sh, i); + raid5_end_buffer_io(sh, i, 1); + continue; + } else + new++; + } + if (new && sh->cmd == STRIPE_WRITE) + printk("raid5: bug, completed STRIPE_WRITE with new == %d\n", new); + } + if (sh->cmd == STRIPE_SYNC) + md_done_sync(conf->mddev, (sh->size>>10) - sh->sync_redone,1); + if (!new) + finish_unlock_stripe(sh); + else { + PRINTK("stripe %lu, new == %d\n", sh->sector, new); + sh->phase = PHASE_BEGIN; + } +} + + +static void handle_stripe_write (mddev_t *mddev , raid5_conf_t *conf, + struct stripe_head *sh, int nr_write, int * operational, int disks, + int parity, int parity_failed, int nr_cache, int nr_cache_other, + int nr_failed_other, int nr_cache_overwrite, int nr_failed_overwrite) +{ + int i; + unsigned int block; + struct buffer_head *bh; + int method1 = INT_MAX, method2 = INT_MAX; + + /* + * Attempt to add entries :-) + */ + if (nr_write != disks - 1) { + for (i = 0; i < disks; i++) { + if (i == sh->pd_idx) + continue; + if (sh->bh_new[i]) + continue; + block = (int) compute_blocknr(sh, i); + bh = get_hash_table(mddev_to_kdev(mddev), block, sh->size); + if (!bh) + continue; + if (buffer_dirty(bh) && !md_test_and_set_bit(BH_Lock, &bh->b_state)) { + PRINTK("Whee.. sector %lu, index %d (%d) found in the buffer cache!\n", sh->sector, i, block); + add_stripe_bh(sh, bh, i, WRITE); + sh->new[i] = 0; + nr_write++; + if (sh->bh_old[i]) { + nr_cache_overwrite++; + nr_cache_other--; + } else + if (!operational[i]) { + nr_failed_overwrite++; + nr_failed_other--; + } + } + atomic_dec(&bh->b_count); + } + } + PRINTK("handle_stripe() -- begin writing, stripe %lu\n", sh->sector); + /* + * Writing, need to update parity buffer. + * + * Compute the number of I/O requests in the "reconstruct + * write" and "read modify write" methods. 
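+	 *
+	 * An illustrative example with arbitrary values: with 5 disks
+	 * (4 data + parity), one new block to write and nothing cached,
+	 * reconstruct-write would need (5-1) - 1 = 3 reads while
+	 * read-modify-write needs 1 + 1 = 2, so RMW is picked; with 3
+	 * of the 4 data blocks being written the counts are 1 and 4,
+	 * and reconstruct-write wins.  The smaller count decides below.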
+ */ + if (!nr_failed_other) + method1 = (disks - 1) - (nr_write + nr_cache_other); + if (!nr_failed_overwrite && !parity_failed) + method2 = nr_write - nr_cache_overwrite + (1 - parity); + + if (method1 == INT_MAX && method2 == INT_MAX) + BUG(); + PRINTK("handle_stripe(), sector %lu, nr_write %d, method1 %d, method2 %d\n", sh->sector, nr_write, method1, method2); + + if (!method1 || !method2) { + sh->phase = PHASE_WRITE; + compute_parity(sh, method1 <= method2 ? RECONSTRUCT_WRITE : READ_MODIFY_WRITE); + + for (i = 0; i < disks; i++) { + if (!operational[i] && !conf->spare && !conf->resync_parity) + continue; + bh = sh->bh_copy[i]; + if (i != sh->pd_idx && ((bh == NULL) ^ (sh->bh_new[i] == NULL))) + printk("raid5: bug: bh == %p, bh_new[%d] == %p\n", bh, i, sh->bh_new[i]); + if (i == sh->pd_idx && !bh) + printk("raid5: bug: bh == NULL, i == pd_idx == %d\n", i); + if (bh) { + PRINTK("making request for buffer %d\n", i); + lock_get_bh(bh); + if (!operational[i] && !conf->resync_parity) { + PRINTK("writing spare %d\n", i); + atomic_inc(&sh->nr_pending); + bh->b_dev = bh->b_rdev = conf->spare->dev; + generic_make_request(WRITE, bh); + } else { + atomic_inc(&sh->nr_pending); + bh->b_dev = bh->b_rdev = conf->disks[i].dev; + generic_make_request(WRITE, bh); + } + atomic_dec(&bh->b_count); + } + } + PRINTK("handle_stripe() %lu, writing back %d buffers\n", sh->sector, md_atomic_read(&sh->nr_pending)); + return; + } + + if (method1 < method2) { + sh->write_method = RECONSTRUCT_WRITE; + for (i = 0; i < disks; i++) { + if (i == sh->pd_idx) + continue; + if (sh->bh_new[i] || sh->bh_old[i]) + continue; + sh->bh_old[i] = raid5_alloc_buffer(sh, sh->size); + raid5_build_block(sh, sh->bh_old[i], i); + } + } else { + sh->write_method = READ_MODIFY_WRITE; + for (i = 0; i < disks; i++) { + if (sh->bh_old[i]) + continue; + if (!sh->bh_new[i] && i != sh->pd_idx) + continue; + sh->bh_old[i] = raid5_alloc_buffer(sh, sh->size); + raid5_build_block(sh, sh->bh_old[i], i); + } + } + sh->phase = PHASE_READ_OLD; + for (i = 0; i < disks; i++) { + if (!sh->bh_old[i]) + continue; + if (test_bit(BH_Uptodate, &sh->bh_old[i]->b_state)) + continue; + lock_get_bh(sh->bh_old[i]); + atomic_inc(&sh->nr_pending); + sh->bh_old[i]->b_dev = sh->bh_old[i]->b_rdev = conf->disks[i].dev; + generic_make_request(READ, sh->bh_old[i]); + atomic_dec(&sh->bh_old[i]->b_count); + } + PRINTK("handle_stripe() %lu, reading %d old buffers\n", sh->sector, md_atomic_read(&sh->nr_pending)); +} + +/* + * Reading + */ +static void handle_stripe_read (mddev_t *mddev , raid5_conf_t *conf, + struct stripe_head *sh, int nr_read, int * operational, int disks, + int parity, int parity_failed, int nr_cache, int nr_cache_other, + int nr_failed_other, int nr_cache_overwrite, int nr_failed_overwrite) +{ + int i; + int method1 = INT_MAX; + + method1 = nr_read - nr_cache_overwrite; + + PRINTK("handle_stripe(), sector %lu, nr_read %d, nr_cache %d, method1 %d\n", sh->sector, nr_read, nr_cache, method1); + + if (!method1 || (method1 == 1 && nr_cache == disks - 1)) { + PRINTK("read %lu completed from cache\n", sh->sector); + for (i = 0; i < disks; i++) { + if (!sh->bh_new[i]) + continue; + if (!sh->bh_old[i]) + compute_block(sh, i); + memcpy(sh->bh_new[i]->b_data, sh->bh_old[i]->b_data, sh->size); + } + complete_stripe(sh); + return; + } + if (nr_failed_overwrite) { + sh->phase = PHASE_READ_OLD; + for (i = 0; i < disks; i++) { + if (sh->bh_old[i]) + continue; + if (!operational[i]) + continue; + sh->bh_old[i] = raid5_alloc_buffer(sh, sh->size); + raid5_build_block(sh, 
sh->bh_old[i], i); + lock_get_bh(sh->bh_old[i]); + atomic_inc(&sh->nr_pending); + sh->bh_old[i]->b_dev = sh->bh_old[i]->b_rdev = conf->disks[i].dev; + generic_make_request(READ, sh->bh_old[i]); + atomic_dec(&sh->bh_old[i]->b_count); + } + PRINTK("handle_stripe() %lu, phase READ_OLD, pending %d buffers\n", sh->sector, md_atomic_read(&sh->nr_pending)); + return; + } + sh->phase = PHASE_READ; + for (i = 0; i < disks; i++) { + if (!sh->bh_new[i]) + continue; + if (sh->bh_old[i]) { + memcpy(sh->bh_new[i]->b_data, sh->bh_old[i]->b_data, sh->size); + continue; + } +#if RAID5_PARANOIA + if (sh->bh_req[i] == NULL || test_bit(BH_Lock, &sh->bh_req[i]->b_state)) { + int j; + printk("req %d is NULL! or locked \n", i); + for (j=0; j<disks; j++) { + printk("%d: new=%p old=%p req=%p new=%d cmd=%d\n", + j, sh->bh_new[j], sh->bh_old[j], sh->bh_req[j], + sh->new[j], sh->cmd_new[j]); + } + + } +#endif + lock_get_bh(sh->bh_req[i]); + atomic_inc(&sh->nr_pending); + sh->bh_req[i]->b_dev = sh->bh_req[i]->b_rdev = conf->disks[i].dev; + generic_make_request(READ, sh->bh_req[i]); + atomic_dec(&sh->bh_req[i]->b_count); + } + PRINTK("handle_stripe() %lu, phase READ, pending %d\n", sh->sector, md_atomic_read(&sh->nr_pending)); +} + +/* + * Syncing + */ +static void handle_stripe_sync (mddev_t *mddev , raid5_conf_t *conf, + struct stripe_head *sh, int * operational, int disks, + int parity, int parity_failed, int nr_cache, int nr_cache_other, + int nr_failed_other, int nr_cache_overwrite, int nr_failed_overwrite) +{ + struct buffer_head *bh; + int i, pd_idx; + + /* firstly, we want to have data from all non-failed drives + * in bh_old + */ + PRINTK("handle_stripe_sync: sec=%lu disks=%d nr_cache=%d\n", sh->sector, disks, nr_cache); + if ((nr_cache < disks-1) || ((nr_cache == disks-1) && !(parity_failed+nr_failed_other+nr_failed_overwrite)) + ) { + sh->phase = PHASE_READ_OLD; + for (i = 0; i < disks; i++) { + if (sh->bh_old[i]) + continue; + if (!conf->disks[i].operational) + continue; + + bh = raid5_alloc_buffer(sh, sh->size); + sh->bh_old[i] = bh; + raid5_build_block(sh, bh, i); + lock_get_bh(bh); + atomic_inc(&sh->nr_pending); + bh->b_dev = bh->b_rdev = conf->disks[i].dev; + generic_make_request(READ, bh); + md_sync_acct(bh->b_rdev, bh->b_size/512); + atomic_dec(&sh->bh_old[i]->b_count); + } + PRINTK("handle_stripe_sync() %lu, phase READ_OLD, pending %d buffers\n", sh->sector, md_atomic_read(&sh->nr_pending)); + + return; + } + /* now, if there is a failed drive, rebuild and write to spare */ + if (nr_cache == disks-1) { + sh->phase = PHASE_WRITE; + /* we can generate the missing block, which will be on the failed drive */ + for (i=0; i<disks; i++) { + if (operational[i]) + continue; + compute_block(sh, i); + if (conf->spare) { + bh = sh->bh_copy[i]; + if (bh) { + memcpy(bh->b_data, sh->bh_old[i]->b_data, sh->size); + set_bit(BH_Uptodate, &bh->b_state); + } else { + bh = sh->bh_old[i]; + sh->bh_old[i] = NULL; + sh->bh_copy[i] = bh; + } + atomic_inc(&sh->nr_pending); + lock_get_bh(bh); + bh->b_dev = bh->b_rdev = conf->spare->dev; + generic_make_request(WRITE, bh); + md_sync_acct(bh->b_rdev, bh->b_size/512); + atomic_dec(&bh->b_count); + PRINTK("handle_stripe_sync() %lu, phase WRITE, pending %d buffers\n", sh->sector, md_atomic_read(&sh->nr_pending)); + } + break; + } + return; + } + + /* nr_cache == disks: + * check parity and compute/write if needed + */ + + compute_parity(sh, RECONSTRUCT_WRITE); + pd_idx = sh->pd_idx; + if (!memcmp(sh->bh_copy[pd_idx]->b_data, sh->bh_old[pd_idx]->b_data, sh->size)) { + /* the parity 
is correct - Yay! */ + complete_stripe(sh); + } else { + sh->phase = PHASE_WRITE; + bh = sh->bh_copy[pd_idx]; + atomic_set_buffer_dirty(bh); + lock_get_bh(bh); + atomic_inc(&sh->nr_pending); + bh->b_dev = bh->b_rdev = conf->disks[pd_idx].dev; + generic_make_request(WRITE, bh); + md_sync_acct(bh->b_rdev, bh->b_size/512); + atomic_dec(&bh->b_count); + PRINTK("handle_stripe_sync() %lu phase WRITE, pending %d buffers\n", + sh->sector, md_atomic_read(&sh->nr_pending)); + } +} + +/* + * handle_stripe() is our main logic routine. Note that: + * + * 1. lock_stripe() should be used whenever we can't accept additonal + * buffers, either during short sleeping in handle_stripe() or + * during io operations. + * + * 2. We should be careful to set sh->nr_pending whenever we sleep, + * to prevent re-entry of handle_stripe() for the same sh. + * + * 3. conf->failed_disks and disk->operational can be changed + * from an interrupt. This complicates things a bit, but it allows + * us to stop issuing requests for a failed drive as soon as possible. + */ +static void handle_stripe(struct stripe_head *sh) +{ + raid5_conf_t *conf = sh->raid_conf; + mddev_t *mddev = conf->mddev; + int disks = conf->raid_disks; + int i, nr_read = 0, nr_write = 0, parity = 0; + int nr_cache = 0, nr_cache_other = 0, nr_cache_overwrite = 0; + int nr_failed_other = 0, nr_failed_overwrite = 0, parity_failed = 0; + int operational[MD_SB_DISKS], failed_disks = conf->failed_disks; + + PRINTK("handle_stripe(), stripe %lu\n", sh->sector); + if (!stripe_locked(sh)) + BUG(); + if (md_atomic_read(&sh->nr_pending)) + BUG(); + if (sh->phase == PHASE_COMPLETE) + BUG(); + + atomic_dec(&conf->nr_handle); + + if (md_test_and_clear_bit(STRIPE_ERROR, &sh->state)) { + printk("raid5: restarting stripe %lu\n", sh->sector); + sh->phase = PHASE_BEGIN; + } + + if ((sh->cmd == STRIPE_WRITE && sh->phase == PHASE_WRITE) || + (sh->cmd == STRIPE_READ && sh->phase == PHASE_READ) || + (sh->cmd == STRIPE_SYNC && sh->phase == PHASE_WRITE) + ) { + /* + * Completed + */ + complete_stripe(sh); + if (sh->phase == PHASE_COMPLETE) + return; + } + + md_spin_lock_irq(&conf->device_lock); + for (i = 0; i < disks; i++) { + operational[i] = conf->disks[i].operational; + if (i == sh->pd_idx && conf->resync_parity) + operational[i] = 0; + } + failed_disks = conf->failed_disks; + md_spin_unlock_irq(&conf->device_lock); + + /* + * Make this one more graceful? 
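+	 *
+	 * With more than one failed member the stripe cannot be
+	 * reconstructed, so every pending buffer is simply failed here.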
+ */ + if (failed_disks > 1) { + for (i = 0; i < disks; i++) { + if (sh->bh_new[i]) { + raid5_end_buffer_io(sh, i, 0); + continue; + } + } + if (sh->cmd == STRIPE_SYNC) + md_done_sync(conf->mddev, (sh->size>>10) - sh->sync_redone,1); + finish_unlock_stripe(sh); + return; + } + + PRINTK("=== stripe index START ===\n"); + for (i = 0; i < disks; i++) { + PRINTK("disk %d, ", i); + if (sh->bh_old[i]) { + nr_cache++; + PRINTK(" (old cached, %d)", nr_cache); + } + if (i == sh->pd_idx) { + PRINTK(" PARITY."); + if (sh->bh_old[i]) { + PRINTK(" CACHED."); + parity = 1; + } else { + PRINTK(" UNCACHED."); + if (!operational[i]) { + PRINTK(" FAILED."); + parity_failed = 1; + } + } + PRINTK("\n"); + continue; + } + if (!sh->bh_new[i]) { + PRINTK(" (no new data block) "); + if (sh->bh_old[i]) { + PRINTK(" (but old block cached) "); + nr_cache_other++; + } else { + if (!operational[i]) { + PRINTK(" (because failed disk) "); + nr_failed_other++; + } else + PRINTK(" (no old block either) "); + } + PRINTK("\n"); + continue; + } + sh->new[i] = 0; + if (sh->cmd_new[i] == READ) { + nr_read++; + PRINTK(" (new READ %d)", nr_read); + } + if (sh->cmd_new[i] == WRITE) { + nr_write++; + PRINTK(" (new WRITE %d)", nr_write); + } + if (sh->bh_old[i]) { + nr_cache_overwrite++; + PRINTK(" (overwriting old %d)", nr_cache_overwrite); + } else { + if (!operational[i]) { + nr_failed_overwrite++; + PRINTK(" (overwriting failed %d)", nr_failed_overwrite); + } + } + PRINTK("\n"); + } + PRINTK("=== stripe index END ===\n"); + + if (nr_write && nr_read) + BUG(); + + if (nr_write) + handle_stripe_write( + mddev, conf, sh, nr_write, operational, disks, + parity, parity_failed, nr_cache, nr_cache_other, + nr_failed_other, nr_cache_overwrite, + nr_failed_overwrite + ); + else if (nr_read) + handle_stripe_read( + mddev, conf, sh, nr_read, operational, disks, + parity, parity_failed, nr_cache, nr_cache_other, + nr_failed_other, nr_cache_overwrite, + nr_failed_overwrite + ); + else if (sh->cmd == STRIPE_SYNC) + handle_stripe_sync( + mddev, conf, sh, operational, disks, + parity, parity_failed, nr_cache, nr_cache_other, + nr_failed_other, nr_cache_overwrite, nr_failed_overwrite + ); +} + + +static int raid5_make_request (mddev_t *mddev, int rw, struct buffer_head * bh) +{ + raid5_conf_t *conf = (raid5_conf_t *) mddev->private; + const unsigned int raid_disks = conf->raid_disks; + const unsigned int data_disks = raid_disks - 1; + unsigned int dd_idx, pd_idx; + unsigned long new_sector; + + struct stripe_head *sh; + + if (rw == READA) + rw = READ; + + new_sector = raid5_compute_sector(bh->b_rsector, + raid_disks, data_disks, &dd_idx, &pd_idx, conf); + + PRINTK("raid5_make_request, sector %lu\n", new_sector); + sh = get_lock_stripe(conf, new_sector, bh->b_size); +#if 0 + if ((rw == READ && sh->cmd == STRIPE_WRITE) || (rw == WRITE && sh->cmd == STRIPE_READ)) { + PRINTK("raid5: lock contention, rw == %d, sh->cmd == %d\n", rw, sh->cmd); + lock_stripe(sh); + if (!md_atomic_read(&sh->nr_pending)) + handle_stripe(sh); + goto repeat; + } +#endif + sh->pd_idx = pd_idx; + if (sh->phase != PHASE_COMPLETE && sh->phase != PHASE_BEGIN) + PRINTK("stripe %lu catching the bus!\n", sh->sector); + if (sh->bh_new[dd_idx]) + BUG(); + add_stripe_bh(sh, bh, dd_idx, rw); + + md_wakeup_thread(conf->thread); + return 0; +} + +/* + * Determine correct block size for this device. 
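+ * (Falls back to BLOCK_SIZE when no per-device value has been
+ * registered in the global blksize_size[] table.)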
+ */ +unsigned int device_bsize (kdev_t dev) +{ + unsigned int i, correct_size; + + correct_size = BLOCK_SIZE; + if (blksize_size[MAJOR(dev)]) { + i = blksize_size[MAJOR(dev)][MINOR(dev)]; + if (i) + correct_size = i; + } + + return correct_size; +} + +static int raid5_sync_request (mddev_t *mddev, unsigned long block_nr) +{ + raid5_conf_t *conf = (raid5_conf_t *) mddev->private; + struct stripe_head *sh; + int sectors_per_chunk = conf->chunk_size >> 9; + unsigned long stripe = (block_nr<<2)/sectors_per_chunk; + int chunk_offset = (block_nr<<2) % sectors_per_chunk; + int dd_idx, pd_idx; + unsigned long first_sector; + int raid_disks = conf->raid_disks; + int data_disks = raid_disks-1; + int redone = 0; + int bufsize; + + if (!conf->buffer_size) + conf->buffer_size = /* device_bsize(mddev_to_kdev(mddev))*/ PAGE_SIZE; + bufsize = conf->buffer_size; + /* Hmm... race on buffer_size ?? */ + redone = block_nr% (bufsize>>10); + block_nr -= redone; + sh = get_lock_stripe(conf, block_nr<<1, bufsize); + first_sector = raid5_compute_sector(stripe*data_disks*sectors_per_chunk + + chunk_offset, raid_disks, data_disks, &dd_idx, &pd_idx, conf); + sh->pd_idx = pd_idx; + sh->cmd = STRIPE_SYNC; + sh->phase = PHASE_BEGIN; + sh->sync_redone = redone; + atomic_inc(&conf->nr_pending_stripes); + atomic_inc(&conf->nr_handle); + md_wakeup_thread(conf->thread); + return (bufsize>>10)-redone; +} + +/* + * This is our raid5 kernel thread. + * + * We scan the hash table for stripes which can be handled now. + * During the scan, completed stripes are saved for us by the interrupt + * handler, so that they will not have to wait for our next wakeup. + */ +static void raid5d (void *data) +{ + struct stripe_head *sh; + raid5_conf_t *conf = data; + mddev_t *mddev = conf->mddev; + int i, handled; + + PRINTK("+++ raid5d active\n"); + + handled = 0; + md_spin_lock_irq(&conf->device_lock); + clear_bit(THREAD_WAKEUP, &conf->thread->flags); +repeat_pass: + if (mddev->sb_dirty) { + md_spin_unlock_irq(&conf->device_lock); + mddev->sb_dirty = 0; + md_update_sb(mddev); + md_spin_lock_irq(&conf->device_lock); + } + for (i = 0; i < NR_HASH; i++) { +repeat: + sh = conf->stripe_hashtbl[i]; + for (; sh; sh = sh->hash_next) { + if (sh->raid_conf != conf) + continue; + if (sh->phase == PHASE_COMPLETE) + continue; + if (md_atomic_read(&sh->nr_pending)) + continue; + md_spin_unlock_irq(&conf->device_lock); + if (!atomic_read(&sh->count)) + BUG(); + + handled++; + handle_stripe(sh); + md_spin_lock_irq(&conf->device_lock); + goto repeat; + } + } + if (conf) { + PRINTK("%d stripes handled, nr_handle %d\n", handled, md_atomic_read(&conf->nr_handle)); + if (test_and_clear_bit(THREAD_WAKEUP, &conf->thread->flags) && + md_atomic_read(&conf->nr_handle)) + goto repeat_pass; + } + md_spin_unlock_irq(&conf->device_lock); + + PRINTK("--- raid5d inactive\n"); +} + +/* + * Private kernel thread for parity reconstruction after an unclean + * shutdown. Reconstruction on spare drives in case of a failed drive + * is done by the generic mdsyncd. 
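+ * conf->resync_parity is 1 while the parity resync is running and is
+ * set to 2 when raid5_stop_resync asks for it to be aborted.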
+ */ +static void raid5syncd (void *data) +{ + raid5_conf_t *conf = data; + mddev_t *mddev = conf->mddev; + + if (!conf->resync_parity) + return; + if (conf->resync_parity == 2) + return; + down(&mddev->recovery_sem); + if (md_do_sync(mddev,NULL)) { + up(&mddev->recovery_sem); + printk("raid5: resync aborted!\n"); + return; + } + conf->resync_parity = 0; + up(&mddev->recovery_sem); + printk("raid5: resync finished.\n"); +} + +static int __check_consistency (mddev_t *mddev, int row) +{ + raid5_conf_t *conf = mddev->private; + kdev_t dev; + struct buffer_head *bh[MD_SB_DISKS], *tmp = NULL; + int i, ret = 0, nr = 0, count; + struct buffer_head *bh_ptr[MAX_XOR_BLOCKS]; + + if (conf->working_disks != conf->raid_disks) + goto out; + tmp = kmalloc(sizeof(*tmp), GFP_KERNEL); + tmp->b_size = 4096; + tmp->b_page = alloc_page(GFP_KERNEL); + tmp->b_data = page_address(tmp->b_page); + if (!tmp->b_data) + goto out; + md_clear_page(tmp->b_data); + memset(bh, 0, MD_SB_DISKS * sizeof(struct buffer_head *)); + for (i = 0; i < conf->raid_disks; i++) { + dev = conf->disks[i].dev; + set_blocksize(dev, 4096); + bh[i] = bread(dev, row / 4, 4096); + if (!bh[i]) + break; + nr++; + } + if (nr == conf->raid_disks) { + bh_ptr[0] = tmp; + count = 1; + for (i = 1; i < nr; i++) { + bh_ptr[count++] = bh[i]; + if (count == MAX_XOR_BLOCKS) { + xor_block(count, &bh_ptr[0]); + count = 1; + } + } + if (count != 1) { + xor_block(count, &bh_ptr[0]); + } + if (memcmp(tmp->b_data, bh[0]->b_data, 4096)) + ret = 1; + } + for (i = 0; i < conf->raid_disks; i++) { + dev = conf->disks[i].dev; + if (bh[i]) { + bforget(bh[i]); + bh[i] = NULL; + } + fsync_dev(dev); + invalidate_buffers(dev); + } + free_page((unsigned long) tmp->b_data); +out: + if (tmp) + kfree(tmp); + return ret; +} + +static int check_consistency (mddev_t *mddev) +{ + if (__check_consistency(mddev, 0)) +/* + * We are not checking this currently, as it's legitimate to have + * an inconsistent array, at creation time. + */ + return 0; + + return 0; +} + +static int raid5_run (mddev_t *mddev) +{ + raid5_conf_t *conf; + int i, j, raid_disk, memory; + mdp_super_t *sb = mddev->sb; + mdp_disk_t *desc; + mdk_rdev_t *rdev; + struct disk_info *disk; + struct md_list_head *tmp; + int start_recovery = 0; + + MOD_INC_USE_COUNT; + + if (sb->level != 5 && sb->level != 4) { + printk("raid5: md%d: raid level not set to 4/5 (%d)\n", mdidx(mddev), sb->level); + MOD_DEC_USE_COUNT; + return -EIO; + } + + mddev->private = kmalloc (sizeof (raid5_conf_t), GFP_KERNEL); + if ((conf = mddev->private) == NULL) + goto abort; + memset (conf, 0, sizeof (*conf)); + conf->mddev = mddev; + + if ((conf->stripe_hashtbl = (struct stripe_head **) md__get_free_pages(GFP_ATOMIC, HASH_PAGES_ORDER)) == NULL) + goto abort; + memset(conf->stripe_hashtbl, 0, HASH_PAGES * PAGE_SIZE); + + conf->device_lock = MD_SPIN_LOCK_UNLOCKED; + md_init_waitqueue_head(&conf->wait_for_stripe); + PRINTK("raid5_run(md%d) called.\n", mdidx(mddev)); + + ITERATE_RDEV(mddev,rdev,tmp) { + /* + * This is important -- we are using the descriptor on + * the disk only to get a pointer to the descriptor on + * the main superblock, which might be more recent. 
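+		 * rdev->desc_nr is therefore only used as an index into
+		 * sb->disks[]; the descriptor fields (number, raid_disk,
+		 * state) are read from that superblock copy.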
+ */ + desc = sb->disks + rdev->desc_nr; + raid_disk = desc->raid_disk; + disk = conf->disks + raid_disk; + + if (disk_faulty(desc)) { + printk(KERN_ERR "raid5: disabled device %s (errors detected)\n", partition_name(rdev->dev)); + if (!rdev->faulty) { + MD_BUG(); + goto abort; + } + disk->number = desc->number; + disk->raid_disk = raid_disk; + disk->dev = rdev->dev; + + disk->operational = 0; + disk->write_only = 0; + disk->spare = 0; + disk->used_slot = 1; + continue; + } + if (disk_active(desc)) { + if (!disk_sync(desc)) { + printk(KERN_ERR "raid5: disabled device %s (not in sync)\n", partition_name(rdev->dev)); + MD_BUG(); + goto abort; + } + if (raid_disk > sb->raid_disks) { + printk(KERN_ERR "raid5: disabled device %s (inconsistent descriptor)\n", partition_name(rdev->dev)); + continue; + } + if (disk->operational) { + printk(KERN_ERR "raid5: disabled device %s (device %d already operational)\n", partition_name(rdev->dev), raid_disk); + continue; + } + printk(KERN_INFO "raid5: device %s operational as raid disk %d\n", partition_name(rdev->dev), raid_disk); + + disk->number = desc->number; + disk->raid_disk = raid_disk; + disk->dev = rdev->dev; + disk->operational = 1; + disk->used_slot = 1; + + conf->working_disks++; + } else { + /* + * Must be a spare disk .. + */ + printk(KERN_INFO "raid5: spare disk %s\n", partition_name(rdev->dev)); + disk->number = desc->number; + disk->raid_disk = raid_disk; + disk->dev = rdev->dev; + + disk->operational = 0; + disk->write_only = 0; + disk->spare = 1; + disk->used_slot = 1; + } + } + + for (i = 0; i < MD_SB_DISKS; i++) { + desc = sb->disks + i; + raid_disk = desc->raid_disk; + disk = conf->disks + raid_disk; + + if (disk_faulty(desc) && (raid_disk < sb->raid_disks) && + !conf->disks[raid_disk].used_slot) { + + disk->number = desc->number; + disk->raid_disk = raid_disk; + disk->dev = MKDEV(0,0); + + disk->operational = 0; + disk->write_only = 0; + disk->spare = 0; + disk->used_slot = 1; + } + } + + conf->raid_disks = sb->raid_disks; + /* + * 0 for a fully functional array, 1 for a degraded array. 
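+	 * Anything above 1 is rejected further down, since RAID-5 can
+	 * tolerate the loss of only a single member.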
+ */ + conf->failed_disks = conf->raid_disks - conf->working_disks; + conf->mddev = mddev; + conf->chunk_size = sb->chunk_size; + conf->level = sb->level; + conf->algorithm = sb->layout; + conf->max_nr_stripes = NR_STRIPES; + +#if 0 + for (i = 0; i < conf->raid_disks; i++) { + if (!conf->disks[i].used_slot) { + MD_BUG(); + goto abort; + } + } +#endif + if (!conf->chunk_size || conf->chunk_size % 4) { + printk(KERN_ERR "raid5: invalid chunk size %d for md%d\n", conf->chunk_size, mdidx(mddev)); + goto abort; + } + if (conf->algorithm > ALGORITHM_RIGHT_SYMMETRIC) { + printk(KERN_ERR "raid5: unsupported parity algorithm %d for md%d\n", conf->algorithm, mdidx(mddev)); + goto abort; + } + if (conf->failed_disks > 1) { + printk(KERN_ERR "raid5: not enough operational devices for md%d (%d/%d failed)\n", mdidx(mddev), conf->failed_disks, conf->raid_disks); + goto abort; + } + + if (conf->working_disks != sb->raid_disks) { + printk(KERN_ALERT "raid5: md%d, not all disks are operational -- trying to recover array\n", mdidx(mddev)); + start_recovery = 1; + } + + if (!start_recovery && (sb->state & (1 << MD_SB_CLEAN)) && + check_consistency(mddev)) { + printk(KERN_ERR "raid5: detected raid-5 superblock xor inconsistency -- running resync\n"); + sb->state &= ~(1 << MD_SB_CLEAN); + } + + { + const char * name = "raid5d"; + + conf->thread = md_register_thread(raid5d, conf, name); + if (!conf->thread) { + printk(KERN_ERR "raid5: couldn't allocate thread for md%d\n", mdidx(mddev)); + goto abort; + } + } + + memory = conf->max_nr_stripes * (sizeof(struct stripe_head) + + conf->raid_disks * (sizeof(struct buffer_head) + + 2 * (sizeof(struct buffer_head) + PAGE_SIZE))) / 1024; + if (grow_stripes(conf, conf->max_nr_stripes, GFP_KERNEL)) { + printk(KERN_ERR "raid5: couldn't allocate %dkB for buffers\n", memory); + shrink_stripes(conf, conf->max_nr_stripes); + goto abort; + } else + printk(KERN_INFO "raid5: allocated %dkB for md%d\n", memory, mdidx(mddev)); + + /* + * Regenerate the "device is in sync with the raid set" bit for + * each device. 
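+	 * sb->active_disks is then refreshed from the number of
+	 * working members.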
+ */ + for (i = 0; i < MD_SB_DISKS ; i++) { + mark_disk_nonsync(sb->disks + i); + for (j = 0; j < sb->raid_disks; j++) { + if (!conf->disks[j].operational) + continue; + if (sb->disks[i].number == conf->disks[j].number) + mark_disk_sync(sb->disks + i); + } + } + sb->active_disks = conf->working_disks; + + if (sb->active_disks == sb->raid_disks) + printk("raid5: raid level %d set md%d active with %d out of %d devices, algorithm %d\n", conf->level, mdidx(mddev), sb->active_disks, sb->raid_disks, conf->algorithm); + else + printk(KERN_ALERT "raid5: raid level %d set md%d active with %d out of %d devices, algorithm %d\n", conf->level, mdidx(mddev), sb->active_disks, sb->raid_disks, conf->algorithm); + + if (!start_recovery && !(sb->state & (1 << MD_SB_CLEAN))) { + const char * name = "raid5syncd"; + + conf->resync_thread = md_register_thread(raid5syncd, conf,name); + if (!conf->resync_thread) { + printk(KERN_ERR "raid5: couldn't allocate thread for md%d\n", mdidx(mddev)); + goto abort; + } + + printk("raid5: raid set md%d not clean; reconstructing parity\n", mdidx(mddev)); + conf->resync_parity = 1; + md_wakeup_thread(conf->resync_thread); + } + + print_raid5_conf(conf); + if (start_recovery) + md_recover_arrays(); + print_raid5_conf(conf); + + /* Ok, everything is just fine now */ + return (0); +abort: + if (conf) { + print_raid5_conf(conf); + if (conf->stripe_hashtbl) + free_pages((unsigned long) conf->stripe_hashtbl, + HASH_PAGES_ORDER); + kfree(conf); + } + mddev->private = NULL; + printk(KERN_ALERT "raid5: failed to run raid set md%d\n", mdidx(mddev)); + MOD_DEC_USE_COUNT; + return -EIO; +} + +static int raid5_stop_resync (mddev_t *mddev) +{ + raid5_conf_t *conf = mddev_to_conf(mddev); + mdk_thread_t *thread = conf->resync_thread; + + if (thread) { + if (conf->resync_parity) { + conf->resync_parity = 2; + md_interrupt_thread(thread); + printk(KERN_INFO "raid5: parity resync was not fully finished, restarting next time.\n"); + return 1; + } + return 0; + } + return 0; +} + +static int raid5_restart_resync (mddev_t *mddev) +{ + raid5_conf_t *conf = mddev_to_conf(mddev); + + if (conf->resync_parity) { + if (!conf->resync_thread) { + MD_BUG(); + return 0; + } + printk("raid5: waking up raid5resync.\n"); + conf->resync_parity = 1; + md_wakeup_thread(conf->resync_thread); + return 1; + } else + printk("raid5: no restart-resync needed.\n"); + return 0; +} + + +static int raid5_stop (mddev_t *mddev) +{ + raid5_conf_t *conf = (raid5_conf_t *) mddev->private; + + shrink_stripe_cache(conf, conf->max_nr_stripes); + shrink_stripes(conf, conf->max_nr_stripes); + md_unregister_thread(conf->thread); + if (conf->resync_thread) + md_unregister_thread(conf->resync_thread); + free_pages((unsigned long) conf->stripe_hashtbl, HASH_PAGES_ORDER); + kfree(conf); + mddev->private = NULL; + MOD_DEC_USE_COUNT; + return 0; +} + +#if RAID5_DEBUG +static void print_sh (struct stripe_head *sh) +{ + int i; + + printk("sh %lu, phase %d, size %d, pd_idx %d, state %ld, cmd %d.\n", sh->sector, sh->phase, sh->size, sh->pd_idx, sh->state, sh->cmd); + printk("sh %lu, write_method %d, nr_pending %d, count %d.\n", sh->sector, sh->write_method, atomic_read(&sh->nr_pending), atomic_read(&sh->count)); + printk("sh %lu, ", sh->sector); + for (i = 0; i < MD_SB_DISKS; i++) { + if (sh->bh_old[i]) + printk("(old%d: %p) ", i, sh->bh_old[i]); + if (sh->bh_new[i]) + printk("(new%d: %p) ", i, sh->bh_new[i]); + if (sh->bh_copy[i]) + printk("(copy%d: %p) ", i, sh->bh_copy[i]); + if (sh->bh_req[i]) + printk("(req%d: %p) ", i, sh->bh_req[i]); + 
} + printk("\n"); + for (i = 0; i < MD_SB_DISKS; i++) + printk("%d(%d/%d) ", i, sh->cmd_new[i], sh->new[i]); + printk("\n"); +} + +static void printall (raid5_conf_t *conf) +{ + struct stripe_head *sh; + int i; + + md_spin_lock_irq(&conf->device_lock); + for (i = 0; i < NR_HASH; i++) { + sh = conf->stripe_hashtbl[i]; + for (; sh; sh = sh->hash_next) { + if (sh->raid_conf != conf) + continue; + print_sh(sh); + } + } + md_spin_unlock_irq(&conf->device_lock); + + PRINTK("--- raid5d inactive\n"); +} +#endif + +static int raid5_status (char *page, mddev_t *mddev) +{ + raid5_conf_t *conf = (raid5_conf_t *) mddev->private; + mdp_super_t *sb = mddev->sb; + int sz = 0, i; + + sz += sprintf (page+sz, " level %d, %dk chunk, algorithm %d", sb->level, sb->chunk_size >> 10, sb->layout); + sz += sprintf (page+sz, " [%d/%d] [", conf->raid_disks, conf->working_disks); + for (i = 0; i < conf->raid_disks; i++) + sz += sprintf (page+sz, "%s", conf->disks[i].operational ? "U" : "_"); + sz += sprintf (page+sz, "]"); +#if RAID5_DEBUG +#define D(x) \ + sz += sprintf (page+sz, "<"#x":%d>", atomic_read(&conf->x)) + D(nr_handle); + D(nr_stripes); + D(nr_hashed_stripes); + D(nr_locked_stripes); + D(nr_pending_stripes); + D(nr_cached_stripes); + D(nr_free_sh); + printall(conf); +#endif + return sz; +} + +static void print_raid5_conf (raid5_conf_t *conf) +{ + int i; + struct disk_info *tmp; + + printk("RAID5 conf printout:\n"); + if (!conf) { + printk("(conf==NULL)\n"); + return; + } + printk(" --- rd:%d wd:%d fd:%d\n", conf->raid_disks, + conf->working_disks, conf->failed_disks); + + for (i = 0; i < MD_SB_DISKS; i++) { + tmp = conf->disks + i; + printk(" disk %d, s:%d, o:%d, n:%d rd:%d us:%d dev:%s\n", + i, tmp->spare,tmp->operational, + tmp->number,tmp->raid_disk,tmp->used_slot, + partition_name(tmp->dev)); + } +} + +static int raid5_diskop(mddev_t *mddev, mdp_disk_t **d, int state) +{ + int err = 0; + int i, failed_disk=-1, spare_disk=-1, removed_disk=-1, added_disk=-1; + raid5_conf_t *conf = mddev->private; + struct disk_info *tmp, *sdisk, *fdisk, *rdisk, *adisk; + mdp_super_t *sb = mddev->sb; + mdp_disk_t *failed_desc, *spare_desc, *added_desc; + + print_raid5_conf(conf); + md_spin_lock_irq(&conf->device_lock); + /* + * find the disk ... + */ + switch (state) { + + case DISKOP_SPARE_ACTIVE: + + /* + * Find the failed disk within the RAID5 configuration ... + * (this can only be in the first conf->raid_disks part) + */ + for (i = 0; i < conf->raid_disks; i++) { + tmp = conf->disks + i; + if ((!tmp->operational && !tmp->spare) || + !tmp->used_slot) { + failed_disk = i; + break; + } + } + /* + * When we activate a spare disk we _must_ have a disk in + * the lower (active) part of the array to replace. + */ + if ((failed_disk == -1) || (failed_disk >= conf->raid_disks)) { + MD_BUG(); + err = 1; + goto abort; + } + /* fall through */ + + case DISKOP_SPARE_WRITE: + case DISKOP_SPARE_INACTIVE: + + /* + * Find the spare disk ... 
(can only be in the 'high' + * area of the array) + */ + for (i = conf->raid_disks; i < MD_SB_DISKS; i++) { + tmp = conf->disks + i; + if (tmp->spare && tmp->number == (*d)->number) { + spare_disk = i; + break; + } + } + if (spare_disk == -1) { + MD_BUG(); + err = 1; + goto abort; + } + break; + + case DISKOP_HOT_REMOVE_DISK: + + for (i = 0; i < MD_SB_DISKS; i++) { + tmp = conf->disks + i; + if (tmp->used_slot && (tmp->number == (*d)->number)) { + if (tmp->operational) { + err = -EBUSY; + goto abort; + } + removed_disk = i; + break; + } + } + if (removed_disk == -1) { + MD_BUG(); + err = 1; + goto abort; + } + break; + + case DISKOP_HOT_ADD_DISK: + + for (i = conf->raid_disks; i < MD_SB_DISKS; i++) { + tmp = conf->disks + i; + if (!tmp->used_slot) { + added_disk = i; + break; + } + } + if (added_disk == -1) { + MD_BUG(); + err = 1; + goto abort; + } + break; + } + + switch (state) { + /* + * Switch the spare disk to write-only mode: + */ + case DISKOP_SPARE_WRITE: + if (conf->spare) { + MD_BUG(); + err = 1; + goto abort; + } + sdisk = conf->disks + spare_disk; + sdisk->operational = 1; + sdisk->write_only = 1; + conf->spare = sdisk; + break; + /* + * Deactivate a spare disk: + */ + case DISKOP_SPARE_INACTIVE: + sdisk = conf->disks + spare_disk; + sdisk->operational = 0; + sdisk->write_only = 0; + /* + * Was the spare being resynced? + */ + if (conf->spare == sdisk) + conf->spare = NULL; + break; + /* + * Activate (mark read-write) the (now sync) spare disk, + * which means we switch it's 'raid position' (->raid_disk) + * with the failed disk. (only the first 'conf->raid_disks' + * slots are used for 'real' disks and we must preserve this + * property) + */ + case DISKOP_SPARE_ACTIVE: + if (!conf->spare) { + MD_BUG(); + err = 1; + goto abort; + } + sdisk = conf->disks + spare_disk; + fdisk = conf->disks + failed_disk; + + spare_desc = &sb->disks[sdisk->number]; + failed_desc = &sb->disks[fdisk->number]; + + if (spare_desc != *d) { + MD_BUG(); + err = 1; + goto abort; + } + + if (spare_desc->raid_disk != sdisk->raid_disk) { + MD_BUG(); + err = 1; + goto abort; + } + + if (sdisk->raid_disk != spare_disk) { + MD_BUG(); + err = 1; + goto abort; + } + + if (failed_desc->raid_disk != fdisk->raid_disk) { + MD_BUG(); + err = 1; + goto abort; + } + + if (fdisk->raid_disk != failed_disk) { + MD_BUG(); + err = 1; + goto abort; + } + + /* + * do the switch finally + */ + xchg_values(*spare_desc, *failed_desc); + xchg_values(*fdisk, *sdisk); + + /* + * (careful, 'failed' and 'spare' are switched from now on) + * + * we want to preserve linear numbering and we want to + * give the proper raid_disk number to the now activated + * disk. (this means we switch back these values) + */ + + xchg_values(spare_desc->raid_disk, failed_desc->raid_disk); + xchg_values(sdisk->raid_disk, fdisk->raid_disk); + xchg_values(spare_desc->number, failed_desc->number); + xchg_values(sdisk->number, fdisk->number); + + *d = failed_desc; + + if (sdisk->dev == MKDEV(0,0)) + sdisk->used_slot = 0; + + /* + * this really activates the spare. + */ + fdisk->spare = 0; + fdisk->write_only = 0; + + /* + * if we activate a spare, we definitely replace a + * non-operational disk slot in the 'low' area of + * the disk array. 
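+		 * Hence the failed/working counters are adjusted and
+		 * conf->spare is cleared below.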
+ */ + conf->failed_disks--; + conf->working_disks++; + conf->spare = NULL; + + break; + + case DISKOP_HOT_REMOVE_DISK: + rdisk = conf->disks + removed_disk; + + if (rdisk->spare && (removed_disk < conf->raid_disks)) { + MD_BUG(); + err = 1; + goto abort; + } + rdisk->dev = MKDEV(0,0); + rdisk->used_slot = 0; + + break; + + case DISKOP_HOT_ADD_DISK: + adisk = conf->disks + added_disk; + added_desc = *d; + + if (added_disk != added_desc->number) { + MD_BUG(); + err = 1; + goto abort; + } + + adisk->number = added_desc->number; + adisk->raid_disk = added_desc->raid_disk; + adisk->dev = MKDEV(added_desc->major,added_desc->minor); + + adisk->operational = 0; + adisk->write_only = 0; + adisk->spare = 1; + adisk->used_slot = 1; + + + break; + + default: + MD_BUG(); + err = 1; + goto abort; + } +abort: + md_spin_unlock_irq(&conf->device_lock); + print_raid5_conf(conf); + return err; +} + +static mdk_personality_t raid5_personality= +{ + name: "raid5", + make_request: raid5_make_request, + run: raid5_run, + stop: raid5_stop, + status: raid5_status, + error_handler: raid5_error, + diskop: raid5_diskop, + stop_resync: raid5_stop_resync, + restart_resync: raid5_restart_resync, + sync_request: raid5_sync_request +}; + +int raid5_init (void) +{ + int err; + + err = register_md_personality (RAID5, &raid5_personality); + if (err) + return err; + + /* + * pick a XOR routine, runtime. + */ + calibrate_xor_block(); + + return 0; +} + +#ifdef MODULE +int init_module (void) +{ + return raid5_init(); +} + +void cleanup_module (void) +{ + unregister_md_personality (RAID5); +} +#endif diff --git a/drivers/md/xor.c b/drivers/md/xor.c new file mode 100644 index 000000000..4fe04fb89 --- /dev/null +++ b/drivers/md/xor.c @@ -0,0 +1,2728 @@ +/* + * xor.c : Multiple Devices driver for Linux + * + * Copyright (C) 1996, 1997, 1998, 1999 Ingo Molnar, Matti Aarnio, Jakub Jelinek + * + * + * optimized RAID-5 checksumming functions. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * You should have received a copy of the GNU General Public License + * (for example /usr/src/linux/COPYING); if not, write to the Free + * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ +#include <linux/config.h> +#define BH_TRACE 0 +#include <linux/module.h> +#include <linux/raid/md.h> +#ifdef __sparc_v9__ +#include <asm/head.h> +#include <asm/asi.h> +#include <asm/visasm.h> +#endif + +/* + * we use the 'XOR function template' to register multiple xor + * functions runtime. The kernel measures their speed upon bootup + * and decides which one to use. (compile-time registration is + * not enough as certain CPU features like MMX can only be detected + * runtime) + * + * this architecture makes it pretty easy to add new routines + * that are faster on certain CPUs, without killing other CPU's + * 'native' routine. 
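Editor's aside on the comment above: the pattern it describes, compile every routine in, benchmark at boot, dispatch through a function pointer, is what struct xor_block_template, xor_functions and pick_fastest_function() implement further down in this file. The following is a hedged user-space reduction of the same idea; all names here are illustrative, not the kernel's.

/* Illustrative sketch, not part of the original commit. */
#include <stdio.h>

typedef void (*xor_fn)(unsigned int count, void **bufs);

struct xor_candidate {
	const char *name;
	xor_fn fn;
	int speed;			/* filled in by a boot-time benchmark */
	struct xor_candidate *next;
};

struct xor_candidate *candidates;	/* every compiled-in routine links itself here */
xor_fn chosen;				/* what the rest of the driver calls           */

void register_candidate(struct xor_candidate *c)
{
	c->next = candidates;
	candidates = c;
}

void pick_fastest(void)
{
	/* assumes at least one candidate was registered, as the kernel version does */
	struct xor_candidate *c, *best = candidates;

	for (c = candidates; c; c = c->next)
		if (c->speed > best->speed)
			best = c;
	chosen = best->fn;
	printf("using fastest function: %s\n", best->name);
}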
Although the current routines are belived + * to be the physically fastest ones on all CPUs tested, but + * feel free to prove me wrong and add yet another routine =B-) + * --mingo + */ + +#define MAX_XOR_BLOCKS 5 + +#define XOR_ARGS (unsigned int count, struct buffer_head **bh_ptr) + +typedef void (*xor_block_t) XOR_ARGS; +xor_block_t xor_block = NULL; + +#ifndef __sparc_v9__ + +struct xor_block_template; + +struct xor_block_template { + char * name; + xor_block_t xor_block; + int speed; + struct xor_block_template * next; +}; + +struct xor_block_template * xor_functions = NULL; + +#define XORBLOCK_TEMPLATE(x) \ +static void xor_block_##x XOR_ARGS; \ +static struct xor_block_template t_xor_block_##x = \ + { #x, xor_block_##x, 0, NULL }; \ +static void xor_block_##x XOR_ARGS + +#ifdef __i386__ + +#ifdef CONFIG_X86_XMM +/* + * Cache avoiding checksumming functions utilizing KNI instructions + * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo) + */ + +XORBLOCK_TEMPLATE(pIII_kni) +{ + char xmm_save[16*4]; + int cr0; + int lines = (bh_ptr[0]->b_size>>8); + + __asm__ __volatile__ ( + "movl %%cr0,%0 ;\n\t" + "clts ;\n\t" + "movups %%xmm0,(%1) ;\n\t" + "movups %%xmm1,0x10(%1) ;\n\t" + "movups %%xmm2,0x20(%1) ;\n\t" + "movups %%xmm3,0x30(%1) ;\n\t" + : "=r" (cr0) + : "r" (xmm_save) + : "memory" ); + +#define OFFS(x) "8*("#x"*2)" +#define PF0(x) \ + " prefetcht0 "OFFS(x)"(%1) ;\n" +#define LD(x,y) \ + " movaps "OFFS(x)"(%1), %%xmm"#y" ;\n" +#define ST(x,y) \ + " movaps %%xmm"#y", "OFFS(x)"(%1) ;\n" +#define PF1(x) \ + " prefetchnta "OFFS(x)"(%2) ;\n" +#define PF2(x) \ + " prefetchnta "OFFS(x)"(%3) ;\n" +#define PF3(x) \ + " prefetchnta "OFFS(x)"(%4) ;\n" +#define PF4(x) \ + " prefetchnta "OFFS(x)"(%5) ;\n" +#define PF5(x) \ + " prefetchnta "OFFS(x)"(%6) ;\n" +#define XO1(x,y) \ + " xorps "OFFS(x)"(%2), %%xmm"#y" ;\n" +#define XO2(x,y) \ + " xorps "OFFS(x)"(%3), %%xmm"#y" ;\n" +#define XO3(x,y) \ + " xorps "OFFS(x)"(%4), %%xmm"#y" ;\n" +#define XO4(x,y) \ + " xorps "OFFS(x)"(%5), %%xmm"#y" ;\n" +#define XO5(x,y) \ + " xorps "OFFS(x)"(%6), %%xmm"#y" ;\n" + + switch(count) { + case 2: + __asm__ __volatile__ ( +#undef BLOCK +#define BLOCK(i) \ + LD(i,0) \ + LD(i+1,1) \ + PF1(i) \ + PF1(i+2) \ + LD(i+2,2) \ + LD(i+3,3) \ + PF0(i+4) \ + PF0(i+6) \ + XO1(i,0) \ + XO1(i+1,1) \ + XO1(i+2,2) \ + XO1(i+3,3) \ + ST(i,0) \ + ST(i+1,1) \ + ST(i+2,2) \ + ST(i+3,3) \ + + + PF0(0) + PF0(2) + + " .align 32,0x90 ;\n" + " 1: ;\n" + + BLOCK(0) + BLOCK(4) + BLOCK(8) + BLOCK(12) + + " addl $256, %1 ;\n" + " addl $256, %2 ;\n" + " decl %0 ;\n" + " jnz 1b ;\n" + + : + : "r" (lines), + "r" (bh_ptr[0]->b_data), + "r" (bh_ptr[1]->b_data) + : "memory" ); + break; + case 3: + __asm__ __volatile__ ( +#undef BLOCK +#define BLOCK(i) \ + PF1(i) \ + PF1(i+2) \ + LD(i,0) \ + LD(i+1,1) \ + LD(i+2,2) \ + LD(i+3,3) \ + PF2(i) \ + PF2(i+2) \ + PF0(i+4) \ + PF0(i+6) \ + XO1(i,0) \ + XO1(i+1,1) \ + XO1(i+2,2) \ + XO1(i+3,3) \ + XO2(i,0) \ + XO2(i+1,1) \ + XO2(i+2,2) \ + XO2(i+3,3) \ + ST(i,0) \ + ST(i+1,1) \ + ST(i+2,2) \ + ST(i+3,3) \ + + + PF0(0) + PF0(2) + + " .align 32,0x90 ;\n" + " 1: ;\n" + + BLOCK(0) + BLOCK(4) + BLOCK(8) + BLOCK(12) + + " addl $256, %1 ;\n" + " addl $256, %2 ;\n" + " addl $256, %3 ;\n" + " decl %0 ;\n" + " jnz 1b ;\n" + : + : "r" (lines), + "r" (bh_ptr[0]->b_data), + "r" (bh_ptr[1]->b_data), + "r" (bh_ptr[2]->b_data) + : "memory" ); + break; + case 4: + __asm__ __volatile__ ( +#undef BLOCK +#define BLOCK(i) \ + PF1(i) \ + PF1(i+2) \ + LD(i,0) \ + LD(i+1,1) \ + LD(i+2,2) \ + LD(i+3,3) \ + PF2(i) \ + 
PF2(i+2) \ + XO1(i,0) \ + XO1(i+1,1) \ + XO1(i+2,2) \ + XO1(i+3,3) \ + PF3(i) \ + PF3(i+2) \ + PF0(i+4) \ + PF0(i+6) \ + XO2(i,0) \ + XO2(i+1,1) \ + XO2(i+2,2) \ + XO2(i+3,3) \ + XO3(i,0) \ + XO3(i+1,1) \ + XO3(i+2,2) \ + XO3(i+3,3) \ + ST(i,0) \ + ST(i+1,1) \ + ST(i+2,2) \ + ST(i+3,3) \ + + + PF0(0) + PF0(2) + + " .align 32,0x90 ;\n" + " 1: ;\n" + + BLOCK(0) + BLOCK(4) + BLOCK(8) + BLOCK(12) + + " addl $256, %1 ;\n" + " addl $256, %2 ;\n" + " addl $256, %3 ;\n" + " addl $256, %4 ;\n" + " decl %0 ;\n" + " jnz 1b ;\n" + + : + : "r" (lines), + "r" (bh_ptr[0]->b_data), + "r" (bh_ptr[1]->b_data), + "r" (bh_ptr[2]->b_data), + "r" (bh_ptr[3]->b_data) + : "memory" ); + break; + case 5: + __asm__ __volatile__ ( +#undef BLOCK +#define BLOCK(i) \ + PF1(i) \ + PF1(i+2) \ + LD(i,0) \ + LD(i+1,1) \ + LD(i+2,2) \ + LD(i+3,3) \ + PF2(i) \ + PF2(i+2) \ + XO1(i,0) \ + XO1(i+1,1) \ + XO1(i+2,2) \ + XO1(i+3,3) \ + PF3(i) \ + PF3(i+2) \ + XO2(i,0) \ + XO2(i+1,1) \ + XO2(i+2,2) \ + XO2(i+3,3) \ + PF4(i) \ + PF4(i+2) \ + PF0(i+4) \ + PF0(i+6) \ + XO3(i,0) \ + XO3(i+1,1) \ + XO3(i+2,2) \ + XO3(i+3,3) \ + XO4(i,0) \ + XO4(i+1,1) \ + XO4(i+2,2) \ + XO4(i+3,3) \ + ST(i,0) \ + ST(i+1,1) \ + ST(i+2,2) \ + ST(i+3,3) \ + + + PF0(0) + PF0(2) + + " .align 32,0x90 ;\n" + " 1: ;\n" + + BLOCK(0) + BLOCK(4) + BLOCK(8) + BLOCK(12) + + " addl $256, %1 ;\n" + " addl $256, %2 ;\n" + " addl $256, %3 ;\n" + " addl $256, %4 ;\n" + " addl $256, %5 ;\n" + " decl %0 ;\n" + " jnz 1b ;\n" + + : + : "r" (lines), + "r" (bh_ptr[0]->b_data), + "r" (bh_ptr[1]->b_data), + "r" (bh_ptr[2]->b_data), + "r" (bh_ptr[3]->b_data), + "r" (bh_ptr[4]->b_data) + : "memory"); + break; + } + + __asm__ __volatile__ ( + "sfence ;\n\t" + "movups (%1),%%xmm0 ;\n\t" + "movups 0x10(%1),%%xmm1 ;\n\t" + "movups 0x20(%1),%%xmm2 ;\n\t" + "movups 0x30(%1),%%xmm3 ;\n\t" + "movl %0,%%cr0 ;\n\t" + : + : "r" (cr0), "r" (xmm_save) + : "memory" ); +} + +#undef OFFS +#undef LD +#undef ST +#undef PF0 +#undef PF1 +#undef PF2 +#undef PF3 +#undef PF4 +#undef PF5 +#undef XO1 +#undef XO2 +#undef XO3 +#undef XO4 +#undef XO5 +#undef BLOCK + +#endif /* CONFIG_X86_XMM */ + +/* + * high-speed RAID5 checksumming functions utilizing MMX instructions + * Copyright (C) 1998 Ingo Molnar + */ +XORBLOCK_TEMPLATE(pII_mmx) +{ + char fpu_save[108]; + int lines = (bh_ptr[0]->b_size>>7); + + if (!(current->flags & PF_USEDFPU)) + __asm__ __volatile__ ( " clts;\n"); + + __asm__ __volatile__ ( " fsave %0; fwait\n"::"m"(fpu_save[0]) ); + +#define LD(x,y) \ + " movq 8*("#x")(%1), %%mm"#y" ;\n" +#define ST(x,y) \ + " movq %%mm"#y", 8*("#x")(%1) ;\n" +#define XO1(x,y) \ + " pxor 8*("#x")(%2), %%mm"#y" ;\n" +#define XO2(x,y) \ + " pxor 8*("#x")(%3), %%mm"#y" ;\n" +#define XO3(x,y) \ + " pxor 8*("#x")(%4), %%mm"#y" ;\n" +#define XO4(x,y) \ + " pxor 8*("#x")(%5), %%mm"#y" ;\n" + + switch(count) { + case 2: + __asm__ __volatile__ ( +#undef BLOCK +#define BLOCK(i) \ + LD(i,0) \ + LD(i+1,1) \ + LD(i+2,2) \ + LD(i+3,3) \ + XO1(i,0) \ + ST(i,0) \ + XO1(i+1,1) \ + ST(i+1,1) \ + XO1(i+2,2) \ + ST(i+2,2) \ + XO1(i+3,3) \ + ST(i+3,3) + + " .align 32,0x90 ;\n" + " 1: ;\n" + + BLOCK(0) + BLOCK(4) + BLOCK(8) + BLOCK(12) + + " addl $128, %1 ;\n" + " addl $128, %2 ;\n" + " decl %0 ;\n" + " jnz 1b ;\n" + : + : "r" (lines), + "r" (bh_ptr[0]->b_data), + "r" (bh_ptr[1]->b_data) + : "memory"); + break; + case 3: + __asm__ __volatile__ ( +#undef BLOCK +#define BLOCK(i) \ + LD(i,0) \ + LD(i+1,1) \ + LD(i+2,2) \ + LD(i+3,3) \ + XO1(i,0) \ + XO1(i+1,1) \ + XO1(i+2,2) \ + XO1(i+3,3) \ + XO2(i,0) \ + ST(i,0) \ + XO2(i+1,1) \ + 
ST(i+1,1) \ + XO2(i+2,2) \ + ST(i+2,2) \ + XO2(i+3,3) \ + ST(i+3,3) + + " .align 32,0x90 ;\n" + " 1: ;\n" + + BLOCK(0) + BLOCK(4) + BLOCK(8) + BLOCK(12) + + " addl $128, %1 ;\n" + " addl $128, %2 ;\n" + " addl $128, %3 ;\n" + " decl %0 ;\n" + " jnz 1b ;\n" + : + : "r" (lines), + "r" (bh_ptr[0]->b_data), + "r" (bh_ptr[1]->b_data), + "r" (bh_ptr[2]->b_data) + : "memory"); + break; + case 4: + __asm__ __volatile__ ( +#undef BLOCK +#define BLOCK(i) \ + LD(i,0) \ + LD(i+1,1) \ + LD(i+2,2) \ + LD(i+3,3) \ + XO1(i,0) \ + XO1(i+1,1) \ + XO1(i+2,2) \ + XO1(i+3,3) \ + XO2(i,0) \ + XO2(i+1,1) \ + XO2(i+2,2) \ + XO2(i+3,3) \ + XO3(i,0) \ + ST(i,0) \ + XO3(i+1,1) \ + ST(i+1,1) \ + XO3(i+2,2) \ + ST(i+2,2) \ + XO3(i+3,3) \ + ST(i+3,3) + + " .align 32,0x90 ;\n" + " 1: ;\n" + + BLOCK(0) + BLOCK(4) + BLOCK(8) + BLOCK(12) + + " addl $128, %1 ;\n" + " addl $128, %2 ;\n" + " addl $128, %3 ;\n" + " addl $128, %4 ;\n" + " decl %0 ;\n" + " jnz 1b ;\n" + : + : "r" (lines), + "r" (bh_ptr[0]->b_data), + "r" (bh_ptr[1]->b_data), + "r" (bh_ptr[2]->b_data), + "r" (bh_ptr[3]->b_data) + : "memory"); + break; + case 5: + __asm__ __volatile__ ( +#undef BLOCK +#define BLOCK(i) \ + LD(i,0) \ + LD(i+1,1) \ + LD(i+2,2) \ + LD(i+3,3) \ + XO1(i,0) \ + XO1(i+1,1) \ + XO1(i+2,2) \ + XO1(i+3,3) \ + XO2(i,0) \ + XO2(i+1,1) \ + XO2(i+2,2) \ + XO2(i+3,3) \ + XO3(i,0) \ + XO3(i+1,1) \ + XO3(i+2,2) \ + XO3(i+3,3) \ + XO4(i,0) \ + ST(i,0) \ + XO4(i+1,1) \ + ST(i+1,1) \ + XO4(i+2,2) \ + ST(i+2,2) \ + XO4(i+3,3) \ + ST(i+3,3) + + " .align 32,0x90 ;\n" + " 1: ;\n" + + BLOCK(0) + BLOCK(4) + BLOCK(8) + BLOCK(12) + + " addl $128, %1 ;\n" + " addl $128, %2 ;\n" + " addl $128, %3 ;\n" + " addl $128, %4 ;\n" + " addl $128, %5 ;\n" + " decl %0 ;\n" + " jnz 1b ;\n" + : + : "g" (lines), + "r" (bh_ptr[0]->b_data), + "r" (bh_ptr[1]->b_data), + "r" (bh_ptr[2]->b_data), + "r" (bh_ptr[3]->b_data), + "r" (bh_ptr[4]->b_data) + : "memory"); + break; + } + + __asm__ __volatile__ ( " frstor %0;\n"::"m"(fpu_save[0]) ); + + if (!(current->flags & PF_USEDFPU)) + stts(); +} + +#undef LD +#undef XO1 +#undef XO2 +#undef XO3 +#undef XO4 +#undef ST +#undef BLOCK + +XORBLOCK_TEMPLATE(p5_mmx) +{ + char fpu_save[108]; + int lines = (bh_ptr[0]->b_size>>6); + + if (!(current->flags & PF_USEDFPU)) + __asm__ __volatile__ ( " clts;\n"); + + __asm__ __volatile__ ( " fsave %0; fwait\n"::"m"(fpu_save[0]) ); + + switch(count) { + case 2: + __asm__ __volatile__ ( + + " .align 32,0x90 ;\n" + " 1: ;\n" + " movq (%1), %%mm0 ;\n" + " movq 8(%1), %%mm1 ;\n" + " pxor (%2), %%mm0 ;\n" + " movq 16(%1), %%mm2 ;\n" + " movq %%mm0, (%1) ;\n" + " pxor 8(%2), %%mm1 ;\n" + " movq 24(%1), %%mm3 ;\n" + " movq %%mm1, 8(%1) ;\n" + " pxor 16(%2), %%mm2 ;\n" + " movq 32(%1), %%mm4 ;\n" + " movq %%mm2, 16(%1) ;\n" + " pxor 24(%2), %%mm3 ;\n" + " movq 40(%1), %%mm5 ;\n" + " movq %%mm3, 24(%1) ;\n" + " pxor 32(%2), %%mm4 ;\n" + " movq 48(%1), %%mm6 ;\n" + " movq %%mm4, 32(%1) ;\n" + " pxor 40(%2), %%mm5 ;\n" + " movq 56(%1), %%mm7 ;\n" + " movq %%mm5, 40(%1) ;\n" + " pxor 48(%2), %%mm6 ;\n" + " pxor 56(%2), %%mm7 ;\n" + " movq %%mm6, 48(%1) ;\n" + " movq %%mm7, 56(%1) ;\n" + + " addl $64, %1 ;\n" + " addl $64, %2 ;\n" + " decl %0 ;\n" + " jnz 1b ;\n" + + : + : "r" (lines), + "r" (bh_ptr[0]->b_data), + "r" (bh_ptr[1]->b_data) + : "memory" ); + break; + case 3: + __asm__ __volatile__ ( + + " .align 32,0x90 ;\n" + " 1: ;\n" + " movq (%1), %%mm0 ;\n" + " movq 8(%1), %%mm1 ;\n" + " pxor (%2), %%mm0 ;\n" + " movq 16(%1), %%mm2 ;\n" + " pxor 8(%2), %%mm1 ;\n" + " pxor (%3), %%mm0 ;\n" + " pxor 16(%2), %%mm2 
;\n" + " movq %%mm0, (%1) ;\n" + " pxor 8(%3), %%mm1 ;\n" + " pxor 16(%3), %%mm2 ;\n" + " movq 24(%1), %%mm3 ;\n" + " movq %%mm1, 8(%1) ;\n" + " movq 32(%1), %%mm4 ;\n" + " movq 40(%1), %%mm5 ;\n" + " pxor 24(%2), %%mm3 ;\n" + " movq %%mm2, 16(%1) ;\n" + " pxor 32(%2), %%mm4 ;\n" + " pxor 24(%3), %%mm3 ;\n" + " pxor 40(%2), %%mm5 ;\n" + " movq %%mm3, 24(%1) ;\n" + " pxor 32(%3), %%mm4 ;\n" + " pxor 40(%3), %%mm5 ;\n" + " movq 48(%1), %%mm6 ;\n" + " movq %%mm4, 32(%1) ;\n" + " movq 56(%1), %%mm7 ;\n" + " pxor 48(%2), %%mm6 ;\n" + " movq %%mm5, 40(%1) ;\n" + " pxor 56(%2), %%mm7 ;\n" + " pxor 48(%3), %%mm6 ;\n" + " pxor 56(%3), %%mm7 ;\n" + " movq %%mm6, 48(%1) ;\n" + " movq %%mm7, 56(%1) ;\n" + + " addl $64, %1 ;\n" + " addl $64, %2 ;\n" + " addl $64, %3 ;\n" + " decl %0 ;\n" + " jnz 1b ;\n" + + : + : "r" (lines), + "r" (bh_ptr[0]->b_data), + "r" (bh_ptr[1]->b_data), + "r" (bh_ptr[2]->b_data) + : "memory" ); + break; + case 4: + __asm__ __volatile__ ( + + " .align 32,0x90 ;\n" + " 1: ;\n" + " movq (%1), %%mm0 ;\n" + " movq 8(%1), %%mm1 ;\n" + " pxor (%2), %%mm0 ;\n" + " movq 16(%1), %%mm2 ;\n" + " pxor 8(%2), %%mm1 ;\n" + " pxor (%3), %%mm0 ;\n" + " pxor 16(%2), %%mm2 ;\n" + " pxor 8(%3), %%mm1 ;\n" + " pxor (%4), %%mm0 ;\n" + " movq 24(%1), %%mm3 ;\n" + " pxor 16(%3), %%mm2 ;\n" + " pxor 8(%4), %%mm1 ;\n" + " movq %%mm0, (%1) ;\n" + " movq 32(%1), %%mm4 ;\n" + " pxor 24(%2), %%mm3 ;\n" + " pxor 16(%4), %%mm2 ;\n" + " movq %%mm1, 8(%1) ;\n" + " movq 40(%1), %%mm5 ;\n" + " pxor 32(%2), %%mm4 ;\n" + " pxor 24(%3), %%mm3 ;\n" + " movq %%mm2, 16(%1) ;\n" + " pxor 40(%2), %%mm5 ;\n" + " pxor 32(%3), %%mm4 ;\n" + " pxor 24(%4), %%mm3 ;\n" + " movq %%mm3, 24(%1) ;\n" + " movq 56(%1), %%mm7 ;\n" + " movq 48(%1), %%mm6 ;\n" + " pxor 40(%3), %%mm5 ;\n" + " pxor 32(%4), %%mm4 ;\n" + " pxor 48(%2), %%mm6 ;\n" + " movq %%mm4, 32(%1) ;\n" + " pxor 56(%2), %%mm7 ;\n" + " pxor 40(%4), %%mm5 ;\n" + " pxor 48(%3), %%mm6 ;\n" + " pxor 56(%3), %%mm7 ;\n" + " movq %%mm5, 40(%1) ;\n" + " pxor 48(%4), %%mm6 ;\n" + " pxor 56(%4), %%mm7 ;\n" + " movq %%mm6, 48(%1) ;\n" + " movq %%mm7, 56(%1) ;\n" + + " addl $64, %1 ;\n" + " addl $64, %2 ;\n" + " addl $64, %3 ;\n" + " addl $64, %4 ;\n" + " decl %0 ;\n" + " jnz 1b ;\n" + + : + : "r" (lines), + "r" (bh_ptr[0]->b_data), + "r" (bh_ptr[1]->b_data), + "r" (bh_ptr[2]->b_data), + "r" (bh_ptr[3]->b_data) + : "memory" ); + break; + case 5: + __asm__ __volatile__ ( + + " .align 32,0x90 ;\n" + " 1: ;\n" + " movq (%1), %%mm0 ;\n" + " movq 8(%1), %%mm1 ;\n" + " pxor (%2), %%mm0 ;\n" + " pxor 8(%2), %%mm1 ;\n" + " movq 16(%1), %%mm2 ;\n" + " pxor (%3), %%mm0 ;\n" + " pxor 8(%3), %%mm1 ;\n" + " pxor 16(%2), %%mm2 ;\n" + " pxor (%4), %%mm0 ;\n" + " pxor 8(%4), %%mm1 ;\n" + " pxor 16(%3), %%mm2 ;\n" + " movq 24(%1), %%mm3 ;\n" + " pxor (%5), %%mm0 ;\n" + " pxor 8(%5), %%mm1 ;\n" + " movq %%mm0, (%1) ;\n" + " pxor 16(%4), %%mm2 ;\n" + " pxor 24(%2), %%mm3 ;\n" + " movq %%mm1, 8(%1) ;\n" + " pxor 16(%5), %%mm2 ;\n" + " pxor 24(%3), %%mm3 ;\n" + " movq 32(%1), %%mm4 ;\n" + " movq %%mm2, 16(%1) ;\n" + " pxor 24(%4), %%mm3 ;\n" + " pxor 32(%2), %%mm4 ;\n" + " movq 40(%1), %%mm5 ;\n" + " pxor 24(%5), %%mm3 ;\n" + " pxor 32(%3), %%mm4 ;\n" + " pxor 40(%2), %%mm5 ;\n" + " movq %%mm3, 24(%1) ;\n" + " pxor 32(%4), %%mm4 ;\n" + " pxor 40(%3), %%mm5 ;\n" + " movq 48(%1), %%mm6 ;\n" + " movq 56(%1), %%mm7 ;\n" + " pxor 32(%5), %%mm4 ;\n" + " pxor 40(%4), %%mm5 ;\n" + " pxor 48(%2), %%mm6 ;\n" + " pxor 56(%2), %%mm7 ;\n" + " movq %%mm4, 32(%1) ;\n" + " pxor 48(%3), %%mm6 ;\n" + " pxor 56(%3), 
%%mm7 ;\n" + " pxor 40(%5), %%mm5 ;\n" + " pxor 48(%4), %%mm6 ;\n" + " pxor 56(%4), %%mm7 ;\n" + " movq %%mm5, 40(%1) ;\n" + " pxor 48(%5), %%mm6 ;\n" + " pxor 56(%5), %%mm7 ;\n" + " movq %%mm6, 48(%1) ;\n" + " movq %%mm7, 56(%1) ;\n" + + " addl $64, %1 ;\n" + " addl $64, %2 ;\n" + " addl $64, %3 ;\n" + " addl $64, %4 ;\n" + " addl $64, %5 ;\n" + " decl %0 ;\n" + " jnz 1b ;\n" + + : + : "g" (lines), + "r" (bh_ptr[0]->b_data), + "r" (bh_ptr[1]->b_data), + "r" (bh_ptr[2]->b_data), + "r" (bh_ptr[3]->b_data), + "r" (bh_ptr[4]->b_data) + : "memory" ); + break; + } + + __asm__ __volatile__ ( " frstor %0;\n"::"m"(fpu_save[0]) ); + + if (!(current->flags & PF_USEDFPU)) + stts(); +} +#endif /* __i386__ */ +#endif /* !__sparc_v9__ */ + +#ifdef __sparc_v9__ +/* + * High speed xor_block operation for RAID4/5 utilizing the + * UltraSparc Visual Instruction Set. + * + * Copyright (C) 1997, 1999 Jakub Jelinek (jj@ultra.linux.cz) + * + * Requirements: + * !(((long)dest | (long)sourceN) & (64 - 1)) && + * !(len & 127) && len >= 256 + * + * It is done in pure assembly, as otherwise gcc makes it + * a non-leaf function, which is not what we want. + * Also, we don't measure the speeds as on other architectures, + * as the measuring routine does not take into account cold caches + * and the fact that xor_block_VIS bypasses the caches. + * xor_block_32regs might be 5% faster for count 2 if caches are hot + * and things just right (for count 3 VIS is about as fast as 32regs for + * hot caches and for count 4 and 5 VIS is faster by good margin always), + * but I think it is better not to pollute the caches. + * Actually, if I'd just fight for speed for hot caches, I could + * write a hybrid VIS/integer routine, which would do always two + * 64B blocks in VIS and two in IEUs, but I really care more about + * caches. 
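Editor's aside on the requirements stated above: they are a hard contract, not a hint. Every buffer must be 64-byte aligned, the length must be a multiple of 128 and at least 256 bytes, otherwise the VIS block-load/store path cannot be used. A small checker written out from that comment follows; it is a hypothetical helper, shown only to make the contract concrete.

/* Illustrative sketch, not part of the original commit. */
#include <stdint.h>

int vis_xor_ok(const void *dest, const void * const *srcs,
	       unsigned int nsrc, unsigned long len)
{
	uintptr_t bits = (uintptr_t) dest;
	unsigned int i;

	for (i = 0; i < nsrc; i++)
		bits |= (uintptr_t) srcs[i];

	return !(bits & (64 - 1)) &&	/* every buffer 64-byte aligned */
	       !(len & 127) &&		/* length a multiple of 128     */
	       len >= 256;		/* and at least 256 bytes       */
}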
+ */ +extern void *VISenter(void); +extern void xor_block_VIS XOR_ARGS; + +void __xor_block_VIS(void) +{ +__asm__ (" + .globl xor_block_VIS +xor_block_VIS: + ldx [%%o1 + 0], %%o4 + ldx [%%o1 + 8], %%o3 + ldx [%%o4 + %1], %%g5 + ldx [%%o4 + %0], %%o4 + ldx [%%o3 + %0], %%o3 + rd %%fprs, %%o5 + andcc %%o5, %2, %%g0 + be,pt %%icc, 297f + sethi %%hi(%5), %%g1 + jmpl %%g1 + %%lo(%5), %%g7 + add %%g7, 8, %%g7 +297: wr %%g0, %4, %%fprs + membar #LoadStore|#StoreLoad|#StoreStore + sub %%g5, 64, %%g5 + ldda [%%o4] %3, %%f0 + ldda [%%o3] %3, %%f16 + cmp %%o0, 4 + bgeu,pt %%xcc, 10f + cmp %%o0, 3 + be,pn %%xcc, 13f + mov -64, %%g1 + sub %%g5, 64, %%g5 + rd %%asi, %%g1 + wr %%g0, %3, %%asi + +2: ldda [%%o4 + 64] %%asi, %%f32 + fxor %%f0, %%f16, %%f16 + fxor %%f2, %%f18, %%f18 + fxor %%f4, %%f20, %%f20 + fxor %%f6, %%f22, %%f22 + fxor %%f8, %%f24, %%f24 + fxor %%f10, %%f26, %%f26 + fxor %%f12, %%f28, %%f28 + fxor %%f14, %%f30, %%f30 + stda %%f16, [%%o4] %3 + ldda [%%o3 + 64] %%asi, %%f48 + ldda [%%o4 + 128] %%asi, %%f0 + fxor %%f32, %%f48, %%f48 + fxor %%f34, %%f50, %%f50 + add %%o4, 128, %%o4 + fxor %%f36, %%f52, %%f52 + add %%o3, 128, %%o3 + fxor %%f38, %%f54, %%f54 + subcc %%g5, 128, %%g5 + fxor %%f40, %%f56, %%f56 + fxor %%f42, %%f58, %%f58 + fxor %%f44, %%f60, %%f60 + fxor %%f46, %%f62, %%f62 + stda %%f48, [%%o4 - 64] %%asi + bne,pt %%xcc, 2b + ldda [%%o3] %3, %%f16 + + ldda [%%o4 + 64] %%asi, %%f32 + fxor %%f0, %%f16, %%f16 + fxor %%f2, %%f18, %%f18 + fxor %%f4, %%f20, %%f20 + fxor %%f6, %%f22, %%f22 + fxor %%f8, %%f24, %%f24 + fxor %%f10, %%f26, %%f26 + fxor %%f12, %%f28, %%f28 + fxor %%f14, %%f30, %%f30 + stda %%f16, [%%o4] %3 + ldda [%%o3 + 64] %%asi, %%f48 + membar #Sync + fxor %%f32, %%f48, %%f48 + fxor %%f34, %%f50, %%f50 + fxor %%f36, %%f52, %%f52 + fxor %%f38, %%f54, %%f54 + fxor %%f40, %%f56, %%f56 + fxor %%f42, %%f58, %%f58 + fxor %%f44, %%f60, %%f60 + fxor %%f46, %%f62, %%f62 + stda %%f48, [%%o4 + 64] %%asi + membar #Sync|#StoreStore|#StoreLoad + wr %%g0, 0, %%fprs + retl + wr %%g1, %%g0, %%asi + +13: ldx [%%o1 + 16], %%o2 + ldx [%%o2 + %0], %%o2 + +3: ldda [%%o2] %3, %%f32 + fxor %%f0, %%f16, %%f48 + fxor %%f2, %%f18, %%f50 + add %%o4, 64, %%o4 + fxor %%f4, %%f20, %%f52 + fxor %%f6, %%f22, %%f54 + add %%o3, 64, %%o3 + fxor %%f8, %%f24, %%f56 + fxor %%f10, %%f26, %%f58 + fxor %%f12, %%f28, %%f60 + fxor %%f14, %%f30, %%f62 + ldda [%%o4] %3, %%f0 + fxor %%f48, %%f32, %%f48 + fxor %%f50, %%f34, %%f50 + fxor %%f52, %%f36, %%f52 + fxor %%f54, %%f38, %%f54 + add %%o2, 64, %%o2 + fxor %%f56, %%f40, %%f56 + fxor %%f58, %%f42, %%f58 + subcc %%g5, 64, %%g5 + fxor %%f60, %%f44, %%f60 + fxor %%f62, %%f46, %%f62 + stda %%f48, [%%o4 + %%g1] %3 + bne,pt %%xcc, 3b + ldda [%%o3] %3, %%f16 + + ldda [%%o2] %3, %%f32 + fxor %%f0, %%f16, %%f48 + fxor %%f2, %%f18, %%f50 + fxor %%f4, %%f20, %%f52 + fxor %%f6, %%f22, %%f54 + fxor %%f8, %%f24, %%f56 + fxor %%f10, %%f26, %%f58 + fxor %%f12, %%f28, %%f60 + fxor %%f14, %%f30, %%f62 + membar #Sync + fxor %%f48, %%f32, %%f48 + fxor %%f50, %%f34, %%f50 + fxor %%f52, %%f36, %%f52 + fxor %%f54, %%f38, %%f54 + fxor %%f56, %%f40, %%f56 + fxor %%f58, %%f42, %%f58 + fxor %%f60, %%f44, %%f60 + fxor %%f62, %%f46, %%f62 + stda %%f48, [%%o4] %3 + membar #Sync|#StoreStore|#StoreLoad + retl + wr %%g0, 0, %%fprs + +10: cmp %%o0, 5 + be,pt %%xcc, 15f + mov -64, %%g1 + +14: ldx [%%o1 + 16], %%o2 + ldx [%%o1 + 24], %%o0 + ldx [%%o2 + %0], %%o2 + ldx [%%o0 + %0], %%o0 + +4: ldda [%%o2] %3, %%f32 + fxor %%f0, %%f16, %%f16 + fxor %%f2, %%f18, %%f18 + add %%o4, 64, %%o4 + fxor %%f4, 
%%f20, %%f20 + fxor %%f6, %%f22, %%f22 + add %%o3, 64, %%o3 + fxor %%f8, %%f24, %%f24 + fxor %%f10, %%f26, %%f26 + fxor %%f12, %%f28, %%f28 + fxor %%f14, %%f30, %%f30 + ldda [%%o0] %3, %%f48 + fxor %%f16, %%f32, %%f32 + fxor %%f18, %%f34, %%f34 + fxor %%f20, %%f36, %%f36 + fxor %%f22, %%f38, %%f38 + add %%o2, 64, %%o2 + fxor %%f24, %%f40, %%f40 + fxor %%f26, %%f42, %%f42 + fxor %%f28, %%f44, %%f44 + fxor %%f30, %%f46, %%f46 + ldda [%%o4] %3, %%f0 + fxor %%f32, %%f48, %%f48 + fxor %%f34, %%f50, %%f50 + fxor %%f36, %%f52, %%f52 + add %%o0, 64, %%o0 + fxor %%f38, %%f54, %%f54 + fxor %%f40, %%f56, %%f56 + fxor %%f42, %%f58, %%f58 + subcc %%g5, 64, %%g5 + fxor %%f44, %%f60, %%f60 + fxor %%f46, %%f62, %%f62 + stda %%f48, [%%o4 + %%g1] %3 + bne,pt %%xcc, 4b + ldda [%%o3] %3, %%f16 + + ldda [%%o2] %3, %%f32 + fxor %%f0, %%f16, %%f16 + fxor %%f2, %%f18, %%f18 + fxor %%f4, %%f20, %%f20 + fxor %%f6, %%f22, %%f22 + fxor %%f8, %%f24, %%f24 + fxor %%f10, %%f26, %%f26 + fxor %%f12, %%f28, %%f28 + fxor %%f14, %%f30, %%f30 + ldda [%%o0] %3, %%f48 + fxor %%f16, %%f32, %%f32 + fxor %%f18, %%f34, %%f34 + fxor %%f20, %%f36, %%f36 + fxor %%f22, %%f38, %%f38 + fxor %%f24, %%f40, %%f40 + fxor %%f26, %%f42, %%f42 + fxor %%f28, %%f44, %%f44 + fxor %%f30, %%f46, %%f46 + membar #Sync + fxor %%f32, %%f48, %%f48 + fxor %%f34, %%f50, %%f50 + fxor %%f36, %%f52, %%f52 + fxor %%f38, %%f54, %%f54 + fxor %%f40, %%f56, %%f56 + fxor %%f42, %%f58, %%f58 + fxor %%f44, %%f60, %%f60 + fxor %%f46, %%f62, %%f62 + stda %%f48, [%%o4] %3 + membar #Sync|#StoreStore|#StoreLoad + retl + wr %%g0, 0, %%fprs + +15: ldx [%%o1 + 16], %%o2 + ldx [%%o1 + 24], %%o0 + ldx [%%o1 + 32], %%o1 + ldx [%%o2 + %0], %%o2 + ldx [%%o0 + %0], %%o0 + ldx [%%o1 + %0], %%o1 + +5: ldda [%%o2] %3, %%f32 + fxor %%f0, %%f16, %%f48 + fxor %%f2, %%f18, %%f50 + add %%o4, 64, %%o4 + fxor %%f4, %%f20, %%f52 + fxor %%f6, %%f22, %%f54 + add %%o3, 64, %%o3 + fxor %%f8, %%f24, %%f56 + fxor %%f10, %%f26, %%f58 + fxor %%f12, %%f28, %%f60 + fxor %%f14, %%f30, %%f62 + ldda [%%o0] %3, %%f16 + fxor %%f48, %%f32, %%f48 + fxor %%f50, %%f34, %%f50 + fxor %%f52, %%f36, %%f52 + fxor %%f54, %%f38, %%f54 + add %%o2, 64, %%o2 + fxor %%f56, %%f40, %%f56 + fxor %%f58, %%f42, %%f58 + fxor %%f60, %%f44, %%f60 + fxor %%f62, %%f46, %%f62 + ldda [%%o1] %3, %%f32 + fxor %%f48, %%f16, %%f48 + fxor %%f50, %%f18, %%f50 + add %%o0, 64, %%o0 + fxor %%f52, %%f20, %%f52 + fxor %%f54, %%f22, %%f54 + add %%o1, 64, %%o1 + fxor %%f56, %%f24, %%f56 + fxor %%f58, %%f26, %%f58 + fxor %%f60, %%f28, %%f60 + fxor %%f62, %%f30, %%f62 + ldda [%%o4] %3, %%f0 + fxor %%f48, %%f32, %%f48 + fxor %%f50, %%f34, %%f50 + fxor %%f52, %%f36, %%f52 + fxor %%f54, %%f38, %%f54 + fxor %%f56, %%f40, %%f56 + fxor %%f58, %%f42, %%f58 + subcc %%g5, 64, %%g5 + fxor %%f60, %%f44, %%f60 + fxor %%f62, %%f46, %%f62 + stda %%f48, [%%o4 + %%g1] %3 + bne,pt %%xcc, 5b + ldda [%%o3] %3, %%f16 + + ldda [%%o2] %3, %%f32 + fxor %%f0, %%f16, %%f48 + fxor %%f2, %%f18, %%f50 + fxor %%f4, %%f20, %%f52 + fxor %%f6, %%f22, %%f54 + fxor %%f8, %%f24, %%f56 + fxor %%f10, %%f26, %%f58 + fxor %%f12, %%f28, %%f60 + fxor %%f14, %%f30, %%f62 + ldda [%%o0] %3, %%f16 + fxor %%f48, %%f32, %%f48 + fxor %%f50, %%f34, %%f50 + fxor %%f52, %%f36, %%f52 + fxor %%f54, %%f38, %%f54 + fxor %%f56, %%f40, %%f56 + fxor %%f58, %%f42, %%f58 + fxor %%f60, %%f44, %%f60 + fxor %%f62, %%f46, %%f62 + ldda [%%o1] %3, %%f32 + fxor %%f48, %%f16, %%f48 + fxor %%f50, %%f18, %%f50 + fxor %%f52, %%f20, %%f52 + fxor %%f54, %%f22, %%f54 + fxor %%f56, %%f24, %%f56 + fxor %%f58, %%f26, %%f58 
+ fxor %%f60, %%f28, %%f60 + fxor %%f62, %%f30, %%f62 + membar #Sync + fxor %%f48, %%f32, %%f48 + fxor %%f50, %%f34, %%f50 + fxor %%f52, %%f36, %%f52 + fxor %%f54, %%f38, %%f54 + fxor %%f56, %%f40, %%f56 + fxor %%f58, %%f42, %%f58 + fxor %%f60, %%f44, %%f60 + fxor %%f62, %%f46, %%f62 + stda %%f48, [%%o4] %3 + membar #Sync|#StoreStore|#StoreLoad + retl + wr %%g0, 0, %%fprs + " : : + "i" (&((struct buffer_head *)0)->b_data), + "i" (&((struct buffer_head *)0)->b_size), + "i" (FPRS_FEF|FPRS_DU), "i" (ASI_BLK_P), + "i" (FPRS_FEF), "i" (VISenter)); +} +#endif /* __sparc_v9__ */ + +#if defined(__sparc__) && !defined(__sparc_v9__) +/* + * High speed xor_block operation for RAID4/5 utilizing the + * ldd/std SPARC instructions. + * + * Copyright (C) 1999 Jakub Jelinek (jj@ultra.linux.cz) + * + */ + +XORBLOCK_TEMPLATE(SPARC) +{ + int size = bh_ptr[0]->b_size; + int lines = size / (sizeof (long)) / 8, i; + long *destp = (long *) bh_ptr[0]->b_data; + long *source1 = (long *) bh_ptr[1]->b_data; + long *source2, *source3, *source4; + + switch (count) { + case 2: + for (i = lines; i > 0; i--) { + __asm__ __volatile__(" + ldd [%0 + 0x00], %%g2 + ldd [%0 + 0x08], %%g4 + ldd [%0 + 0x10], %%o0 + ldd [%0 + 0x18], %%o2 + ldd [%1 + 0x00], %%o4 + ldd [%1 + 0x08], %%l0 + ldd [%1 + 0x10], %%l2 + ldd [%1 + 0x18], %%l4 + xor %%g2, %%o4, %%g2 + xor %%g3, %%o5, %%g3 + xor %%g4, %%l0, %%g4 + xor %%g5, %%l1, %%g5 + xor %%o0, %%l2, %%o0 + xor %%o1, %%l3, %%o1 + xor %%o2, %%l4, %%o2 + xor %%o3, %%l5, %%o3 + std %%g2, [%0 + 0x00] + std %%g4, [%0 + 0x08] + std %%o0, [%0 + 0x10] + std %%o2, [%0 + 0x18] + " : : "r" (destp), "r" (source1) : "g2", "g3", "g4", "g5", "o0", + "o1", "o2", "o3", "o4", "o5", "l0", "l1", "l2", "l3", "l4", "l5"); + destp += 8; + source1 += 8; + } + break; + case 3: + source2 = (long *) bh_ptr[2]->b_data; + for (i = lines; i > 0; i--) { + __asm__ __volatile__(" + ldd [%0 + 0x00], %%g2 + ldd [%0 + 0x08], %%g4 + ldd [%0 + 0x10], %%o0 + ldd [%0 + 0x18], %%o2 + ldd [%1 + 0x00], %%o4 + ldd [%1 + 0x08], %%l0 + ldd [%1 + 0x10], %%l2 + ldd [%1 + 0x18], %%l4 + xor %%g2, %%o4, %%g2 + xor %%g3, %%o5, %%g3 + ldd [%2 + 0x00], %%o4 + xor %%g4, %%l0, %%g4 + xor %%g5, %%l1, %%g5 + ldd [%2 + 0x08], %%l0 + xor %%o0, %%l2, %%o0 + xor %%o1, %%l3, %%o1 + ldd [%2 + 0x10], %%l2 + xor %%o2, %%l4, %%o2 + xor %%o3, %%l5, %%o3 + ldd [%2 + 0x18], %%l4 + xor %%g2, %%o4, %%g2 + xor %%g3, %%o5, %%g3 + xor %%g4, %%l0, %%g4 + xor %%g5, %%l1, %%g5 + xor %%o0, %%l2, %%o0 + xor %%o1, %%l3, %%o1 + xor %%o2, %%l4, %%o2 + xor %%o3, %%l5, %%o3 + std %%g2, [%0 + 0x00] + std %%g4, [%0 + 0x08] + std %%o0, [%0 + 0x10] + std %%o2, [%0 + 0x18] + " : : "r" (destp), "r" (source1), "r" (source2) + : "g2", "g3", "g4", "g5", "o0", "o1", "o2", "o3", "o4", "o5", + "l0", "l1", "l2", "l3", "l4", "l5"); + destp += 8; + source1 += 8; + source2 += 8; + } + break; + case 4: + source2 = (long *) bh_ptr[2]->b_data; + source3 = (long *) bh_ptr[3]->b_data; + for (i = lines; i > 0; i--) { + __asm__ __volatile__(" + ldd [%0 + 0x00], %%g2 + ldd [%0 + 0x08], %%g4 + ldd [%0 + 0x10], %%o0 + ldd [%0 + 0x18], %%o2 + ldd [%1 + 0x00], %%o4 + ldd [%1 + 0x08], %%l0 + ldd [%1 + 0x10], %%l2 + ldd [%1 + 0x18], %%l4 + xor %%g2, %%o4, %%g2 + xor %%g3, %%o5, %%g3 + ldd [%2 + 0x00], %%o4 + xor %%g4, %%l0, %%g4 + xor %%g5, %%l1, %%g5 + ldd [%2 + 0x08], %%l0 + xor %%o0, %%l2, %%o0 + xor %%o1, %%l3, %%o1 + ldd [%2 + 0x10], %%l2 + xor %%o2, %%l4, %%o2 + xor %%o3, %%l5, %%o3 + ldd [%2 + 0x18], %%l4 + xor %%g2, %%o4, %%g2 + xor %%g3, %%o5, %%g3 + ldd [%3 + 0x00], %%o4 + xor %%g4, %%l0, %%g4 
+ xor %%g5, %%l1, %%g5 + ldd [%3 + 0x08], %%l0 + xor %%o0, %%l2, %%o0 + xor %%o1, %%l3, %%o1 + ldd [%3 + 0x10], %%l2 + xor %%o2, %%l4, %%o2 + xor %%o3, %%l5, %%o3 + ldd [%3 + 0x18], %%l4 + xor %%g2, %%o4, %%g2 + xor %%g3, %%o5, %%g3 + xor %%g4, %%l0, %%g4 + xor %%g5, %%l1, %%g5 + xor %%o0, %%l2, %%o0 + xor %%o1, %%l3, %%o1 + xor %%o2, %%l4, %%o2 + xor %%o3, %%l5, %%o3 + std %%g2, [%0 + 0x00] + std %%g4, [%0 + 0x08] + std %%o0, [%0 + 0x10] + std %%o2, [%0 + 0x18] + " : : "r" (destp), "r" (source1), "r" (source2), "r" (source3) + : "g2", "g3", "g4", "g5", "o0", "o1", "o2", "o3", "o4", "o5", + "l0", "l1", "l2", "l3", "l4", "l5"); + destp += 8; + source1 += 8; + source2 += 8; + source3 += 8; + } + break; + case 5: + source2 = (long *) bh_ptr[2]->b_data; + source3 = (long *) bh_ptr[3]->b_data; + source4 = (long *) bh_ptr[4]->b_data; + for (i = lines; i > 0; i--) { + __asm__ __volatile__(" + ldd [%0 + 0x00], %%g2 + ldd [%0 + 0x08], %%g4 + ldd [%0 + 0x10], %%o0 + ldd [%0 + 0x18], %%o2 + ldd [%1 + 0x00], %%o4 + ldd [%1 + 0x08], %%l0 + ldd [%1 + 0x10], %%l2 + ldd [%1 + 0x18], %%l4 + xor %%g2, %%o4, %%g2 + xor %%g3, %%o5, %%g3 + ldd [%2 + 0x00], %%o4 + xor %%g4, %%l0, %%g4 + xor %%g5, %%l1, %%g5 + ldd [%2 + 0x08], %%l0 + xor %%o0, %%l2, %%o0 + xor %%o1, %%l3, %%o1 + ldd [%2 + 0x10], %%l2 + xor %%o2, %%l4, %%o2 + xor %%o3, %%l5, %%o3 + ldd [%2 + 0x18], %%l4 + xor %%g2, %%o4, %%g2 + xor %%g3, %%o5, %%g3 + ldd [%3 + 0x00], %%o4 + xor %%g4, %%l0, %%g4 + xor %%g5, %%l1, %%g5 + ldd [%3 + 0x08], %%l0 + xor %%o0, %%l2, %%o0 + xor %%o1, %%l3, %%o1 + ldd [%3 + 0x10], %%l2 + xor %%o2, %%l4, %%o2 + xor %%o3, %%l5, %%o3 + ldd [%3 + 0x18], %%l4 + xor %%g2, %%o4, %%g2 + xor %%g3, %%o5, %%g3 + ldd [%4 + 0x00], %%o4 + xor %%g4, %%l0, %%g4 + xor %%g5, %%l1, %%g5 + ldd [%4 + 0x08], %%l0 + xor %%o0, %%l2, %%o0 + xor %%o1, %%l3, %%o1 + ldd [%4 + 0x10], %%l2 + xor %%o2, %%l4, %%o2 + xor %%o3, %%l5, %%o3 + ldd [%4 + 0x18], %%l4 + xor %%g2, %%o4, %%g2 + xor %%g3, %%o5, %%g3 + xor %%g4, %%l0, %%g4 + xor %%g5, %%l1, %%g5 + xor %%o0, %%l2, %%o0 + xor %%o1, %%l3, %%o1 + xor %%o2, %%l4, %%o2 + xor %%o3, %%l5, %%o3 + std %%g2, [%0 + 0x00] + std %%g4, [%0 + 0x08] + std %%o0, [%0 + 0x10] + std %%o2, [%0 + 0x18] + " : : "r" (destp), "r" (source1), "r" (source2), "r" (source3), "r" (source4) + : "g2", "g3", "g4", "g5", "o0", "o1", "o2", "o3", "o4", "o5", + "l0", "l1", "l2", "l3", "l4", "l5"); + destp += 8; + source1 += 8; + source2 += 8; + source3 += 8; + source4 += 8; + } + break; + } +} +#endif /* __sparc_v[78]__ */ + +#ifdef __alpha__ +/* + * High speed xor_block operation for RAID4/5 pipelined for Alpha EV5. + * There is a second version using EV6 prefetch instructions. 
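Editor's aside before the Alpha versions: every routine in this file computes the same thing, the destination becomes the XOR of itself and all source buffers, processed eight machine words per step (one 64-byte line on the 64-bit targets) so loads and stores cover whole cache lines. A plain-C statement of that per-step operation is sketched below; it is illustrative only, the generic 8regs routine further down is the real portable fallback.

/* Illustrative sketch, not part of the original commit. */
void xor_one_step(long *dest, long **src, unsigned int nsrc)
{
	unsigned int i, j;

	for (i = 0; i < nsrc; i++)
		for (j = 0; j < 8; j++)		/* eight words per unrolled step */
			dest[j] ^= src[i][j];
}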
+ * + * Copyright (C) 2000 Richard Henderson (rth@redhat.com) + */ + +XORBLOCK_TEMPLATE(alpha) +{ + long lines = bh_ptr[0]->b_size / sizeof (long) / 8; + long *d = (long *) bh_ptr[0]->b_data; + long *s1 = (long *) bh_ptr[1]->b_data; + long *s2, *s3, *s4; + + if (count == 2) goto two_blocks; + + s2 = (long *) bh_ptr[2]->b_data; + if (count == 3) goto three_blocks; + + s3 = (long *) bh_ptr[3]->b_data; + if (count == 4) goto four_blocks; + + s4 = (long *) bh_ptr[4]->b_data; + goto five_blocks; + +two_blocks: +asm volatile (" + .align 4 +2: + ldq $0,0(%0) + ldq $1,0(%1) + ldq $2,8(%0) + ldq $3,8(%1) + + ldq $4,16(%0) + ldq $5,16(%1) + ldq $6,24(%0) + ldq $7,24(%1) + + ldq $16,32(%0) + ldq $17,32(%1) + ldq $18,40(%0) + ldq $19,40(%1) + + ldq $20,48(%0) + ldq $21,48(%1) + ldq $22,56(%0) + xor $0,$1,$0 # 7 cycles from $1 load + + ldq $23,56(%1) + xor $2,$3,$2 + stq $0,0(%0) + xor $4,$5,$4 + + stq $2,8(%0) + xor $6,$7,$6 + stq $4,16(%0) + xor $16,$17,$16 + + stq $6,24(%0) + xor $18,$19,$18 + stq $16,32(%0) + xor $20,$21,$20 + + stq $18,40(%0) + xor $22,$23,$22 + stq $20,48(%0) + subq %2,1,%2 + + stq $22,56(%0) + addq %0,64,%0 + addq %1,64,%1 + bgt %2,2b" + : "=r"(d), "=r"(s1), "=r"(lines) + : "0"(d), "1"(s1), "2"(lines) + : "memory", "$0", "$1", "$2", "$3", "$4", "$5", "$6", "$7", + "$16", "$17", "$18", "$19", "$20", "$21", "$22", "$23"); + return; + +three_blocks: +asm volatile (" + .align 4 +3: + ldq $0,0(%0) + ldq $1,0(%1) + ldq $2,0(%2) + ldq $3,8(%0) + + ldq $4,8(%1) + ldq $6,16(%0) + ldq $7,16(%1) + ldq $17,24(%0) + + ldq $18,24(%1) + ldq $20,32(%0) + ldq $21,32(%1) + ldq $5,8(%2) + + ldq $16,16(%2) + ldq $19,24(%2) + ldq $22,32(%2) + nop + + xor $0,$1,$1 # 8 cycles from $0 load + xor $3,$4,$4 # 6 cycles from $4 load + xor $6,$7,$7 # 6 cycles from $7 load + xor $17,$18,$18 # 5 cycles from $18 load + + xor $1,$2,$2 # 9 cycles from $2 load + xor $20,$21,$21 # 5 cycles from $21 load + stq $2,0(%0) + xor $4,$5,$5 # 6 cycles from $5 load + + stq $5,8(%0) + xor $7,$16,$16 # 7 cycles from $16 load + stq $16,16(%0) + xor $18,$19,$19 # 7 cycles from $19 load + + stq $19,24(%0) + xor $21,$22,$22 # 7 cycles from $22 load + stq $22,32(%0) + nop + + ldq $0,40(%0) + ldq $1,40(%1) + ldq $3,48(%0) + ldq $4,48(%1) + + ldq $6,56(%0) + ldq $7,56(%1) + ldq $2,40(%2) + ldq $5,48(%2) + + ldq $16,56(%2) + xor $0,$1,$1 # 4 cycles from $1 load + xor $3,$4,$4 # 5 cycles from $4 load + xor $6,$7,$7 # 5 cycles from $7 load + + xor $1,$2,$2 # 4 cycles from $2 load + xor $4,$5,$5 # 5 cycles from $5 load + stq $2,40(%0) + xor $7,$16,$16 # 4 cycles from $16 load + + stq $5,48(%0) + subq %3,1,%3 + stq $16,56(%0) + addq %2,64,%2 + + addq %1,64,%1 + addq %0,64,%0 + bgt %3,3b" + : "=r"(d), "=r"(s1), "=r"(s2), "=r"(lines) + : "0"(d), "1"(s1), "2"(s2), "3"(lines) + : "memory", "$0", "$1", "$2", "$3", "$4", "$5", "$6", "$7", + "$16", "$17", "$18", "$19", "$20", "$21", "$22"); + return; + +four_blocks: +asm volatile (" + .align 4 +4: + ldq $0,0(%0) + ldq $1,0(%1) + ldq $2,0(%2) + ldq $3,0(%3) + + ldq $4,8(%0) + ldq $5,8(%1) + ldq $6,8(%2) + ldq $7,8(%3) + + ldq $16,16(%0) + ldq $17,16(%1) + ldq $18,16(%2) + ldq $19,16(%3) + + ldq $20,24(%0) + xor $0,$1,$1 # 6 cycles from $1 load + ldq $21,24(%1) + xor $2,$3,$3 # 6 cycles from $3 load + + ldq $0,24(%2) + xor $1,$3,$3 + ldq $1,24(%3) + xor $4,$5,$5 # 7 cycles from $5 load + + stq $3,0(%0) + xor $6,$7,$7 + xor $16,$17,$17 # 7 cycles from $17 load + xor $5,$7,$7 + + stq $7,8(%0) + xor $18,$19,$19 # 7 cycles from $19 load + ldq $2,32(%0) + xor $17,$19,$19 + + ldq $3,32(%1) + ldq 
$4,32(%2) + ldq $5,32(%3) + xor $20,$21,$21 # 8 cycles from $21 load + + ldq $6,40(%0) + ldq $7,40(%1) + ldq $16,40(%2) + ldq $17,40(%3) + + stq $19,16(%0) + xor $0,$1,$1 # 9 cycles from $1 load + xor $2,$3,$3 # 5 cycles from $3 load + xor $21,$1,$1 + + ldq $18,48(%0) + xor $4,$5,$5 # 5 cycles from $5 load + ldq $19,48(%1) + xor $3,$5,$5 + + ldq $20,48(%2) + ldq $21,48(%3) + ldq $0,56(%0) + ldq $1,56(%1) + + ldq $2,56(%2) + xor $6,$7,$7 # 8 cycles from $6 load + ldq $3,56(%3) + xor $16,$17,$17 # 8 cycles from $17 load + + xor $7,$17,$17 + xor $18,$19,$19 # 5 cycles from $19 load + xor $20,$21,$21 # 5 cycles from $21 load + xor $19,$21,$21 + + stq $1,24(%0) + xor $0,$1,$1 # 5 cycles from $1 load + stq $5,32(%0) + xor $2,$3,$3 # 4 cycles from $3 load + + stq $17,40(%0) + xor $1,$3,$3 + stq $21,48(%0) + subq %4,1,%4 + + stq $3,56(%0) + addq %3,64,%3 + addq %2,64,%2 + addq %1,64,%1 + + addq %0,64,%0 + bgt %4,4b" + : "=r"(d), "=r"(s1), "=r"(s2), "=r"(s3), "=r"(lines) + : "0"(d), "1"(s1), "2"(s2), "3"(s3), "4"(lines) + : "memory", "$0", "$1", "$2", "$3", "$4", "$5", "$6", "$7", + "$16", "$17", "$18", "$19", "$20", "$21"); + return; + +five_blocks: +asm volatile (" + ldq %0,0(%6) + ldq %1,8(%6) + ldq %2,16(%6) + ldq %3,24(%6) + ldq %4,32(%6) + ldq %0,%7(%0) + ldq %1,%7(%1) + ldq %2,%7(%2) + ldq %3,%7(%3) + ldq %4,%7(%4) + .align 4 +5: + ldq $0,0(%0) + ldq $1,0(%1) + ldq $2,0(%2) + ldq $3,0(%3) + + ldq $4,0(%4) + ldq $5,8(%0) + ldq $6,8(%1) + ldq $7,8(%2) + + ldq $16,8(%3) + ldq $17,8(%4) + ldq $18,16(%0) + ldq $19,16(%1) + + ldq $20,16(%2) + xor $0,$1,$1 # 6 cycles from $1 load + ldq $21,16(%3) + xor $2,$3,$3 # 6 cycles from $3 load + + ldq $0,16(%4) + xor $1,$3,$3 + ldq $1,24(%0) + xor $3,$4,$4 # 7 cycles from $4 load + + stq $4,0(%0) + xor $5,$6,$6 # 7 cycles from $6 load + xor $7,$16,$16 # 7 cycles from $16 load + xor $6,$17,$17 # 7 cycles from $17 load + + ldq $2,24(%1) + xor $16,$17,$17 + ldq $3,24(%2) + xor $18,$19,$19 # 8 cycles from $19 load + + stq $17,8(%0) + xor $19,$20,$20 # 8 cycles from $20 load + ldq $4,24(%3) + xor $21,$0,$0 # 7 cycles from $0 load + + ldq $5,24(%4) + xor $20,$0,$0 + ldq $6,32(%0) + ldq $7,32(%1) + + stq $0,16(%0) + xor $1,$2,$2 # 6 cycles from $2 load + ldq $16,32(%2) + xor $3,$4,$4 # 4 cycles from $4 load + + ldq $17,32(%3) + xor $2,$4,$4 + ldq $18,32(%4) + ldq $19,40(%0) + + ldq $20,40(%1) + ldq $21,40(%2) + ldq $0,40(%3) + xor $4,$5,$5 # 7 cycles from $5 load + + stq $5,24(%0) + xor $6,$7,$7 # 7 cycles from $7 load + ldq $1,40(%4) + ldq $2,48(%0) + + ldq $3,48(%1) + xor $7,$16,$16 # 7 cycles from $16 load + ldq $4,48(%2) + xor $17,$18,$18 # 6 cycles from $18 load + + ldq $5,48(%3) + xor $16,$18,$18 + ldq $6,48(%4) + xor $19,$20,$20 # 7 cycles from $20 load + + stq $18,32(%0) + xor $20,$21,$21 # 8 cycles from $21 load + ldq $7,56(%0) + xor $0,$1,$1 # 6 cycles from $1 load + + ldq $16,56(%1) + ldq $17,56(%2) + ldq $18,56(%3) + ldq $19,56(%4) + + xor $21,$1,$1 + xor $2,$3,$3 # 9 cycles from $3 load + xor $3,$4,$4 # 9 cycles from $4 load + xor $5,$6,$6 # 8 cycles from $6 load + + unop + xor $4,$6,$6 + xor $7,$16,$16 # 7 cycles from $16 load + xor $17,$18,$18 # 6 cycles from $18 load + + stq $6,48(%0) + xor $16,$18,$18 + subq %5,1,%5 + xor $18,$19,$19 # 8 cycles from $19 load + + stq $19,56(%0) + addq %4,64,%4 + addq %3,64,%3 + addq %2,64,%2 + + addq %1,64,%1 + addq %0,64,%0 + bgt %5,5b" + : "=&r"(d), "=&r"(s1), "=&r"(s2), "=&r"(s3), "=r"(s4), "=r"(lines) + /* ARG! We've run out of asm arguments! We've got to reload + all those pointers we just loaded. 
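Editor's aside on the "run out of asm arguments" comment just above: with the destination, four source pointers and the line counter the operand list is full, so the five-source case hands the asm bh_ptr itself plus the offset of b_data and re-derives each data pointer inside the asm with two dependent loads. In C terms the preamble is doing roughly the following; bh_stub is a stand-in for struct buffer_head, used only so the sketch is self-contained.

/* Illustrative sketch, not part of the original commit. */
struct bh_stub {
	char *b_data;			/* only the member the preamble touches */
};

long *nth_source(struct bh_stub **bh_ptr, unsigned int n)
{
	/* two dependent loads, matching the paired ldq instructions in the asm */
	return (long *) bh_ptr[n]->b_data;
}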
*/ + : "r"(bh_ptr), "i" (&((struct buffer_head *)0)->b_data), "5"(lines) + : "memory", "$0", "$1", "$2", "$3", "$4", "$5", "$6", "$7", + "$16", "$17", "$18", "$19", "$20", "$21"); + return; +} + +#define prefetch(base, ofs) \ + asm("ldq $31,%2(%0)" : "=r"(base) : "0"(base), "i"(ofs)) + +XORBLOCK_TEMPLATE(alpha_prefetch) +{ + long lines = bh_ptr[0]->b_size / sizeof (long) / 8; + long *d = (long *) bh_ptr[0]->b_data; + long *s1 = (long *) bh_ptr[1]->b_data; + long *s2, *s3, *s4; + long p; + + p = count == 2; + prefetch(d, 0); + prefetch(s1, 0); + prefetch(d, 64); + prefetch(s1, 64); + prefetch(d, 128); + prefetch(s1, 128); + prefetch(d, 192); + prefetch(s1, 192); + if (p) goto two_blocks; + + s2 = (long *) bh_ptr[2]->b_data; + p = count == 3; + prefetch(s2, 0); + prefetch(s2, 64); + prefetch(s2, 128); + prefetch(s2, 192); + if (p) goto three_blocks; + + s3 = (long *) bh_ptr[3]->b_data; + p = count == 4; + prefetch(s3, 0); + prefetch(s3, 64); + prefetch(s3, 128); + prefetch(s3, 192); + if (p) goto four_blocks; + + s4 = (long *) bh_ptr[4]->b_data; + prefetch(s4, 0); + prefetch(s4, 64); + prefetch(s4, 128); + prefetch(s4, 192); + goto five_blocks; + +two_blocks: +asm volatile (" + .align 4 +2: + ldq $0,0(%0) + ldq $1,0(%1) + ldq $2,8(%0) + ldq $3,8(%1) + + ldq $4,16(%0) + ldq $5,16(%1) + ldq $6,24(%0) + ldq $7,24(%1) + + ldq $16,32(%0) + ldq $17,32(%1) + ldq $18,40(%0) + ldq $19,40(%1) + + ldq $20,48(%0) + ldq $21,48(%1) + ldq $22,56(%0) + ldq $23,56(%1) + + ldq $31,256(%0) + xor $0,$1,$0 # 8 cycles from $1 load + ldq $31,256(%1) + xor $2,$3,$2 + + stq $0,0(%0) + xor $4,$5,$4 + stq $2,8(%0) + xor $6,$7,$6 + + stq $4,16(%0) + xor $16,$17,$16 + stq $6,24(%0) + xor $18,$19,$18 + + stq $16,32(%0) + xor $20,$21,$20 + stq $18,40(%0) + xor $22,$23,$22 + + stq $20,48(%0) + subq %2,1,%2 + stq $22,56(%0) + addq %0,64,%0 + + addq %1,64,%1 + bgt %2,2b" + : "=r"(d), "=r"(s1), "=r"(lines) + : "0"(d), "1"(s1), "2"(lines) + : "memory", "$0", "$1", "$2", "$3", "$4", "$5", "$6", "$7", + "$16", "$17", "$18", "$19", "$20", "$21", "$22", "$23"); + return; + +three_blocks: +asm volatile (" + .align 4 +3: + ldq $0,0(%0) + ldq $1,0(%1) + ldq $2,0(%2) + ldq $3,8(%0) + + ldq $4,8(%1) + ldq $6,16(%0) + ldq $7,16(%1) + ldq $17,24(%0) + + ldq $18,24(%1) + ldq $20,32(%0) + ldq $21,32(%1) + ldq $5,8(%2) + + ldq $16,16(%2) + ldq $19,24(%2) + ldq $22,32(%2) + nop + + xor $0,$1,$1 # 8 cycles from $0 load + xor $3,$4,$4 # 7 cycles from $4 load + xor $6,$7,$7 # 6 cycles from $7 load + xor $17,$18,$18 # 5 cycles from $18 load + + xor $1,$2,$2 # 9 cycles from $2 load + xor $20,$21,$21 # 5 cycles from $21 load + stq $2,0(%0) + xor $4,$5,$5 # 6 cycles from $5 load + + stq $5,8(%0) + xor $7,$16,$16 # 7 cycles from $16 load + stq $16,16(%0) + xor $18,$19,$19 # 7 cycles from $19 load + + stq $19,24(%0) + xor $21,$22,$22 # 7 cycles from $22 load + stq $22,32(%0) + nop + + ldq $0,40(%0) + ldq $1,40(%1) + ldq $3,48(%0) + ldq $4,48(%1) + + ldq $6,56(%0) + ldq $7,56(%1) + ldq $2,40(%2) + ldq $5,48(%2) + + ldq $16,56(%2) + ldq $31,256(%0) + ldq $31,256(%1) + ldq $31,256(%2) + + xor $0,$1,$1 # 6 cycles from $1 load + xor $3,$4,$4 # 5 cycles from $4 load + xor $6,$7,$7 # 5 cycles from $7 load + xor $1,$2,$2 # 4 cycles from $2 load + + xor $4,$5,$5 # 5 cycles from $5 load + xor $7,$16,$16 # 4 cycles from $16 load + stq $2,40(%0) + subq %3,1,%3 + + stq $5,48(%0) + addq %2,64,%2 + stq $16,56(%0) + addq %1,64,%1 + + addq %0,64,%0 + bgt %3,3b" + : "=r"(d), "=r"(s1), "=r"(s2), "=r"(lines) + : "0"(d), "1"(s1), "2"(s2), "3"(lines) + : "memory", "$0", 
"$1", "$2", "$3", "$4", "$5", "$6", "$7", + "$16", "$17", "$18", "$19", "$20", "$21", "$22"); + return; + +four_blocks: +asm volatile (" + .align 4 +4: + ldq $0,0(%0) + ldq $1,0(%1) + ldq $2,0(%2) + ldq $3,0(%3) + + ldq $4,8(%0) + ldq $5,8(%1) + ldq $6,8(%2) + ldq $7,8(%3) + + ldq $16,16(%0) + ldq $17,16(%1) + ldq $18,16(%2) + ldq $19,16(%3) + + ldq $20,24(%0) + xor $0,$1,$1 # 6 cycles from $1 load + ldq $21,24(%1) + xor $2,$3,$3 # 6 cycles from $3 load + + ldq $0,24(%2) + xor $1,$3,$3 + ldq $1,24(%3) + xor $4,$5,$5 # 7 cycles from $5 load + + stq $3,0(%0) + xor $6,$7,$7 + xor $16,$17,$17 # 7 cycles from $17 load + xor $5,$7,$7 + + stq $7,8(%0) + xor $18,$19,$19 # 7 cycles from $19 load + ldq $2,32(%0) + xor $17,$19,$19 + + ldq $3,32(%1) + ldq $4,32(%2) + ldq $5,32(%3) + xor $20,$21,$21 # 8 cycles from $21 load + + ldq $6,40(%0) + ldq $7,40(%1) + ldq $16,40(%2) + ldq $17,40(%3) + + stq $19,16(%0) + xor $0,$1,$1 # 9 cycles from $1 load + xor $2,$3,$3 # 5 cycles from $3 load + xor $21,$1,$1 + + ldq $18,48(%0) + xor $4,$5,$5 # 5 cycles from $5 load + ldq $19,48(%1) + xor $3,$5,$5 + + ldq $20,48(%2) + ldq $21,48(%3) + ldq $0,56(%0) + ldq $1,56(%1) + + ldq $2,56(%2) + xor $6,$7,$7 # 8 cycles from $6 load + ldq $3,56(%3) + xor $16,$17,$17 # 8 cycles from $17 load + + ldq $31,256(%0) + xor $7,$17,$17 + ldq $31,256(%1) + xor $18,$19,$19 # 6 cycles from $19 load + + ldq $31,256(%2) + xor $20,$21,$21 # 6 cycles from $21 load + ldq $31,256(%3) + xor $19,$21,$21 + + stq $1,24(%0) + xor $0,$1,$1 # 7 cycles from $1 load + stq $5,32(%0) + xor $2,$3,$3 # 6 cycles from $3 load + + stq $17,40(%0) + xor $1,$3,$3 + stq $21,48(%0) + subq %4,1,%4 + + stq $3,56(%0) + addq %3,64,%3 + addq %2,64,%2 + addq %1,64,%1 + + addq %0,64,%0 + bgt %4,4b" + : "=r"(d), "=r"(s1), "=r"(s2), "=r"(s3), "=r"(lines) + : "0"(d), "1"(s1), "2"(s2), "3"(s3), "4"(lines) + : "memory", "$0", "$1", "$2", "$3", "$4", "$5", "$6", "$7", + "$16", "$17", "$18", "$19", "$20", "$21"); + return; + +five_blocks: +asm volatile (" + ldq %0,0(%6) + ldq %1,8(%6) + ldq %2,16(%6) + ldq %3,24(%6) + ldq %4,32(%6) + ldq %0,%7(%0) + ldq %1,%7(%1) + ldq %2,%7(%2) + ldq %3,%7(%3) + ldq %4,%7(%4) + .align 4 +5: + ldq $0,0(%0) + ldq $1,0(%1) + ldq $2,0(%2) + ldq $3,0(%3) + + ldq $4,0(%4) + ldq $5,8(%0) + ldq $6,8(%1) + ldq $7,8(%2) + + ldq $16,8(%3) + ldq $17,8(%4) + ldq $18,16(%0) + ldq $19,16(%1) + + ldq $20,16(%2) + xor $0,$1,$1 # 6 cycles from $1 load + ldq $21,16(%3) + xor $2,$3,$3 # 6 cycles from $3 load + + ldq $0,16(%4) + xor $1,$3,$3 + ldq $1,24(%0) + xor $3,$4,$4 # 7 cycles from $4 load + + stq $4,0(%0) + xor $5,$6,$6 # 7 cycles from $6 load + xor $7,$16,$16 # 7 cycles from $16 load + xor $6,$17,$17 # 7 cycles from $17 load + + ldq $2,24(%1) + xor $16,$17,$17 + ldq $3,24(%2) + xor $18,$19,$19 # 8 cycles from $19 load + + stq $17,8(%0) + xor $19,$20,$20 # 8 cycles from $20 load + ldq $4,24(%3) + xor $21,$0,$0 # 7 cycles from $0 load + + ldq $5,24(%4) + xor $20,$0,$0 + ldq $6,32(%0) + ldq $7,32(%1) + + stq $0,16(%0) + xor $1,$2,$2 # 6 cycles from $2 load + ldq $16,32(%2) + xor $3,$4,$4 # 4 cycles from $4 load + + ldq $17,32(%3) + xor $2,$4,$4 + ldq $18,32(%4) + ldq $19,40(%0) + + ldq $20,40(%1) + ldq $21,40(%2) + ldq $0,40(%3) + xor $4,$5,$5 # 7 cycles from $5 load + + stq $5,24(%0) + xor $6,$7,$7 # 7 cycles from $7 load + ldq $1,40(%4) + ldq $2,48(%0) + + ldq $3,48(%1) + xor $7,$16,$16 # 7 cycles from $16 load + ldq $4,48(%2) + xor $17,$18,$18 # 6 cycles from $18 load + + ldq $5,48(%3) + xor $16,$18,$18 + ldq $6,48(%4) + xor $19,$20,$20 # 7 cycles from 
$20 load + + stq $18,32(%0) + xor $20,$21,$21 # 8 cycles from $21 load + ldq $7,56(%0) + xor $0,$1,$1 # 6 cycles from $1 load + + ldq $16,56(%1) + ldq $17,56(%2) + ldq $18,56(%3) + ldq $19,56(%4) + + ldq $31,256(%0) + xor $21,$1,$1 + ldq $31,256(%1) + xor $2,$3,$3 # 9 cycles from $3 load + + ldq $31,256(%2) + xor $3,$4,$4 # 9 cycles from $4 load + ldq $31,256(%3) + xor $5,$6,$6 # 8 cycles from $6 load + + ldq $31,256(%4) + xor $4,$6,$6 + xor $7,$16,$16 # 7 cycles from $16 load + xor $17,$18,$18 # 6 cycles from $18 load + + stq $6,48(%0) + xor $16,$18,$18 + subq %5,1,%5 + xor $18,$19,$19 # 8 cycles from $19 load + + stq $19,56(%0) + addq %4,64,%4 + addq %3,64,%3 + addq %2,64,%2 + + addq %1,64,%1 + addq %0,64,%0 + bgt %5,5b" + : "=&r"(d), "=&r"(s1), "=&r"(s2), "=&r"(s3), "=r"(s4), "=r"(lines) + /* ARG! We've run out of asm arguments! We've got to reload + all those pointers we just loaded. */ + : "r"(bh_ptr), "i" (&((struct buffer_head *)0)->b_data), "5"(lines) + : "memory", "$0", "$1", "$2", "$3", "$4", "$5", "$6", "$7", + "$16", "$17", "$18", "$19", "$20", "$21"); + return; +} + +#undef prefetch + +#endif /* __alpha__ */ + +#ifndef __sparc_v9__ + +/* + * this one works reasonably on any x86 CPU + * (send me an assembly version for inclusion if you can make it faster) + * + * this one is just as fast as written in pure assembly on x86. + * the reason for this separate version is that the + * fast open-coded xor routine "32reg" produces suboptimal code + * on x86, due to lack of registers. + */ +XORBLOCK_TEMPLATE(8regs) +{ + int len = bh_ptr[0]->b_size; + long *destp = (long *) bh_ptr[0]->b_data; + long *source1, *source2, *source3, *source4; + long lines = len / (sizeof (long)) / 8, i; + + switch(count) { + case 2: + source1 = (long *) bh_ptr[1]->b_data; + for (i = lines; i > 0; i--) { + *(destp + 0) ^= *(source1 + 0); + *(destp + 1) ^= *(source1 + 1); + *(destp + 2) ^= *(source1 + 2); + *(destp + 3) ^= *(source1 + 3); + *(destp + 4) ^= *(source1 + 4); + *(destp + 5) ^= *(source1 + 5); + *(destp + 6) ^= *(source1 + 6); + *(destp + 7) ^= *(source1 + 7); + source1 += 8; + destp += 8; + } + break; + case 3: + source2 = (long *) bh_ptr[2]->b_data; + source1 = (long *) bh_ptr[1]->b_data; + for (i = lines; i > 0; i--) { + *(destp + 0) ^= *(source1 + 0); + *(destp + 0) ^= *(source2 + 0); + *(destp + 1) ^= *(source1 + 1); + *(destp + 1) ^= *(source2 + 1); + *(destp + 2) ^= *(source1 + 2); + *(destp + 2) ^= *(source2 + 2); + *(destp + 3) ^= *(source1 + 3); + *(destp + 3) ^= *(source2 + 3); + *(destp + 4) ^= *(source1 + 4); + *(destp + 4) ^= *(source2 + 4); + *(destp + 5) ^= *(source1 + 5); + *(destp + 5) ^= *(source2 + 5); + *(destp + 6) ^= *(source1 + 6); + *(destp + 6) ^= *(source2 + 6); + *(destp + 7) ^= *(source1 + 7); + *(destp + 7) ^= *(source2 + 7); + source1 += 8; + source2 += 8; + destp += 8; + } + break; + case 4: + source3 = (long *) bh_ptr[3]->b_data; + source2 = (long *) bh_ptr[2]->b_data; + source1 = (long *) bh_ptr[1]->b_data; + for (i = lines; i > 0; i--) { + *(destp + 0) ^= *(source1 + 0); + *(destp + 0) ^= *(source2 + 0); + *(destp + 0) ^= *(source3 + 0); + *(destp + 1) ^= *(source1 + 1); + *(destp + 1) ^= *(source2 + 1); + *(destp + 1) ^= *(source3 + 1); + *(destp + 2) ^= *(source1 + 2); + *(destp + 2) ^= *(source2 + 2); + *(destp + 2) ^= *(source3 + 2); + *(destp + 3) ^= *(source1 + 3); + *(destp + 3) ^= *(source2 + 3); + *(destp + 3) ^= *(source3 + 3); + *(destp + 4) ^= *(source1 + 4); + *(destp + 4) ^= *(source2 + 4); + *(destp + 4) ^= *(source3 + 4); + *(destp + 5) ^= 
*(source1 + 5); + *(destp + 5) ^= *(source2 + 5); + *(destp + 5) ^= *(source3 + 5); + *(destp + 6) ^= *(source1 + 6); + *(destp + 6) ^= *(source2 + 6); + *(destp + 6) ^= *(source3 + 6); + *(destp + 7) ^= *(source1 + 7); + *(destp + 7) ^= *(source2 + 7); + *(destp + 7) ^= *(source3 + 7); + source1 += 8; + source2 += 8; + source3 += 8; + destp += 8; + } + break; + case 5: + source4 = (long *) bh_ptr[4]->b_data; + source3 = (long *) bh_ptr[3]->b_data; + source2 = (long *) bh_ptr[2]->b_data; + source1 = (long *) bh_ptr[1]->b_data; + for (i = lines; i > 0; i--) { + *(destp + 0) ^= *(source1 + 0); + *(destp + 0) ^= *(source2 + 0); + *(destp + 0) ^= *(source3 + 0); + *(destp + 0) ^= *(source4 + 0); + *(destp + 1) ^= *(source1 + 1); + *(destp + 1) ^= *(source2 + 1); + *(destp + 1) ^= *(source3 + 1); + *(destp + 1) ^= *(source4 + 1); + *(destp + 2) ^= *(source1 + 2); + *(destp + 2) ^= *(source2 + 2); + *(destp + 2) ^= *(source3 + 2); + *(destp + 2) ^= *(source4 + 2); + *(destp + 3) ^= *(source1 + 3); + *(destp + 3) ^= *(source2 + 3); + *(destp + 3) ^= *(source3 + 3); + *(destp + 3) ^= *(source4 + 3); + *(destp + 4) ^= *(source1 + 4); + *(destp + 4) ^= *(source2 + 4); + *(destp + 4) ^= *(source3 + 4); + *(destp + 4) ^= *(source4 + 4); + *(destp + 5) ^= *(source1 + 5); + *(destp + 5) ^= *(source2 + 5); + *(destp + 5) ^= *(source3 + 5); + *(destp + 5) ^= *(source4 + 5); + *(destp + 6) ^= *(source1 + 6); + *(destp + 6) ^= *(source2 + 6); + *(destp + 6) ^= *(source3 + 6); + *(destp + 6) ^= *(source4 + 6); + *(destp + 7) ^= *(source1 + 7); + *(destp + 7) ^= *(source2 + 7); + *(destp + 7) ^= *(source3 + 7); + *(destp + 7) ^= *(source4 + 7); + source1 += 8; + source2 += 8; + source3 += 8; + source4 += 8; + destp += 8; + } + break; + } +} + +/* + * platform independent RAID5 checksum calculation, this should + * be very fast on any platform that has a decent amount of + * registers. (32 or more) + */ +XORBLOCK_TEMPLATE(32regs) +{ + int size = bh_ptr[0]->b_size; + int lines = size / (sizeof (long)) / 8, i; + long *destp = (long *) bh_ptr[0]->b_data; + long *source1, *source2, *source3, *source4; + + /* LOTS of registers available... + We do explicite loop-unrolling here for code which + favours RISC machines. In fact this is almoast direct + RISC assembly on Alpha and SPARC :-) */ + + + switch(count) { + case 2: + source1 = (long *) bh_ptr[1]->b_data; + for (i = lines; i > 0; i--) { + register long d0, d1, d2, d3, d4, d5, d6, d7; + d0 = destp[0]; /* Pull the stuff into registers */ + d1 = destp[1]; /* ... in bursts, if possible. */ + d2 = destp[2]; + d3 = destp[3]; + d4 = destp[4]; + d5 = destp[5]; + d6 = destp[6]; + d7 = destp[7]; + d0 ^= source1[0]; + d1 ^= source1[1]; + d2 ^= source1[2]; + d3 ^= source1[3]; + d4 ^= source1[4]; + d5 ^= source1[5]; + d6 ^= source1[6]; + d7 ^= source1[7]; + destp[0] = d0; /* Store the result (in burts) */ + destp[1] = d1; + destp[2] = d2; + destp[3] = d3; + destp[4] = d4; /* Store the result (in burts) */ + destp[5] = d5; + destp[6] = d6; + destp[7] = d7; + source1 += 8; + destp += 8; + } + break; + case 3: + source2 = (long *) bh_ptr[2]->b_data; + source1 = (long *) bh_ptr[1]->b_data; + for (i = lines; i > 0; i--) { + register long d0, d1, d2, d3, d4, d5, d6, d7; + d0 = destp[0]; /* Pull the stuff into registers */ + d1 = destp[1]; /* ... in bursts, if possible. 
*/ + d2 = destp[2]; + d3 = destp[3]; + d4 = destp[4]; + d5 = destp[5]; + d6 = destp[6]; + d7 = destp[7]; + d0 ^= source1[0]; + d1 ^= source1[1]; + d2 ^= source1[2]; + d3 ^= source1[3]; + d4 ^= source1[4]; + d5 ^= source1[5]; + d6 ^= source1[6]; + d7 ^= source1[7]; + d0 ^= source2[0]; + d1 ^= source2[1]; + d2 ^= source2[2]; + d3 ^= source2[3]; + d4 ^= source2[4]; + d5 ^= source2[5]; + d6 ^= source2[6]; + d7 ^= source2[7]; + destp[0] = d0; /* Store the result (in burts) */ + destp[1] = d1; + destp[2] = d2; + destp[3] = d3; + destp[4] = d4; /* Store the result (in burts) */ + destp[5] = d5; + destp[6] = d6; + destp[7] = d7; + source1 += 8; + source2 += 8; + destp += 8; + } + break; + case 4: + source3 = (long *) bh_ptr[3]->b_data; + source2 = (long *) bh_ptr[2]->b_data; + source1 = (long *) bh_ptr[1]->b_data; + for (i = lines; i > 0; i--) { + register long d0, d1, d2, d3, d4, d5, d6, d7; + d0 = destp[0]; /* Pull the stuff into registers */ + d1 = destp[1]; /* ... in bursts, if possible. */ + d2 = destp[2]; + d3 = destp[3]; + d4 = destp[4]; + d5 = destp[5]; + d6 = destp[6]; + d7 = destp[7]; + d0 ^= source1[0]; + d1 ^= source1[1]; + d2 ^= source1[2]; + d3 ^= source1[3]; + d4 ^= source1[4]; + d5 ^= source1[5]; + d6 ^= source1[6]; + d7 ^= source1[7]; + d0 ^= source2[0]; + d1 ^= source2[1]; + d2 ^= source2[2]; + d3 ^= source2[3]; + d4 ^= source2[4]; + d5 ^= source2[5]; + d6 ^= source2[6]; + d7 ^= source2[7]; + d0 ^= source3[0]; + d1 ^= source3[1]; + d2 ^= source3[2]; + d3 ^= source3[3]; + d4 ^= source3[4]; + d5 ^= source3[5]; + d6 ^= source3[6]; + d7 ^= source3[7]; + destp[0] = d0; /* Store the result (in burts) */ + destp[1] = d1; + destp[2] = d2; + destp[3] = d3; + destp[4] = d4; /* Store the result (in burts) */ + destp[5] = d5; + destp[6] = d6; + destp[7] = d7; + source1 += 8; + source2 += 8; + source3 += 8; + destp += 8; + } + break; + case 5: + source4 = (long *) bh_ptr[4]->b_data; + source3 = (long *) bh_ptr[3]->b_data; + source2 = (long *) bh_ptr[2]->b_data; + source1 = (long *) bh_ptr[1]->b_data; + for (i = lines; i > 0; i--) { + register long d0, d1, d2, d3, d4, d5, d6, d7; + d0 = destp[0]; /* Pull the stuff into registers */ + d1 = destp[1]; /* ... in bursts, if possible. 
*/ + d2 = destp[2]; + d3 = destp[3]; + d4 = destp[4]; + d5 = destp[5]; + d6 = destp[6]; + d7 = destp[7]; + d0 ^= source1[0]; + d1 ^= source1[1]; + d2 ^= source1[2]; + d3 ^= source1[3]; + d4 ^= source1[4]; + d5 ^= source1[5]; + d6 ^= source1[6]; + d7 ^= source1[7]; + d0 ^= source2[0]; + d1 ^= source2[1]; + d2 ^= source2[2]; + d3 ^= source2[3]; + d4 ^= source2[4]; + d5 ^= source2[5]; + d6 ^= source2[6]; + d7 ^= source2[7]; + d0 ^= source3[0]; + d1 ^= source3[1]; + d2 ^= source3[2]; + d3 ^= source3[3]; + d4 ^= source3[4]; + d5 ^= source3[5]; + d6 ^= source3[6]; + d7 ^= source3[7]; + d0 ^= source4[0]; + d1 ^= source4[1]; + d2 ^= source4[2]; + d3 ^= source4[3]; + d4 ^= source4[4]; + d5 ^= source4[5]; + d6 ^= source4[6]; + d7 ^= source4[7]; + destp[0] = d0; /* Store the result (in burts) */ + destp[1] = d1; + destp[2] = d2; + destp[3] = d3; + destp[4] = d4; /* Store the result (in burts) */ + destp[5] = d5; + destp[6] = d6; + destp[7] = d7; + source1 += 8; + source2 += 8; + source3 += 8; + source4 += 8; + destp += 8; + } + break; + } +} + +/* + * (the -6*32 shift factor colors the cache) + */ +#define SIZE (PAGE_SIZE-6*32) + +static void xor_speed ( struct xor_block_template * func, + struct buffer_head *b1, struct buffer_head *b2) +{ + int speed; + unsigned long now; + int i, count, max; + struct buffer_head *bh_ptr[6]; + + func->next = xor_functions; + xor_functions = func; + bh_ptr[0] = b1; + bh_ptr[1] = b2; + + /* + * count the number of XORs done during a whole jiffy. + * calculate the speed of checksumming from this. + * (we use a 2-page allocation to have guaranteed + * color L1-cache layout) + */ + max = 0; + for (i = 0; i < 5; i++) { + now = jiffies; + count = 0; + while (jiffies == now) { + mb(); + func->xor_block(2,bh_ptr); + mb(); + count++; + mb(); + } + if (count > max) + max = count; + } + + speed = max * (HZ*SIZE/1024); + func->speed = speed; + + printk( " %-10s: %5d.%03d MB/sec\n", func->name, + speed / 1000, speed % 1000); +} + +static inline void pick_fastest_function(void) +{ + struct xor_block_template *f, *fastest; + + fastest = xor_functions; + for (f = fastest; f; f = f->next) { + if (f->speed > fastest->speed) + fastest = f; + } +#ifdef CONFIG_X86_XMM + if (cpu_has_xmm) { + /* we force the use of the KNI xor block because it + can write around l2. we may also be able + to load into the l1 only depending on how + the cpu deals with a load to a line that is + being prefetched. + */ + fastest = &t_xor_block_pIII_kni; + } +#endif +#ifdef __alpha__ + if (implver() == IMPLVER_EV6) { + /* Force the use of alpha_prefetch if EV6, as it + is significantly faster in the cold cache case. 
*/ + fastest = &t_xor_block_alpha_prefetch; + } +#endif + xor_block = fastest->xor_block; + printk( "using fastest function: %s (%d.%03d MB/sec)\n", fastest->name, + fastest->speed / 1000, fastest->speed % 1000); +} + +static struct buffer_head b1, b2; + +void calibrate_xor_block(void) +{ + if (xor_block) + return; + memset(&b1,0,sizeof(b1)); + b2 = b1; + + b1.b_data = (char *) md__get_free_pages(GFP_KERNEL,2); + if (!b1.b_data) { + pick_fastest_function(); + return; + } + b2.b_data = b1.b_data + 2*PAGE_SIZE + SIZE; + + b1.b_size = SIZE; + + printk(KERN_INFO "raid5: measuring checksumming speed\n"); + + sti(); /* should be safe */ + +#if defined(__sparc__) && !defined(__sparc_v9__) + printk(KERN_INFO "raid5: trying high-speed SPARC checksum routine\n"); + xor_speed(&t_xor_block_SPARC,&b1,&b2); +#endif + +#ifdef CONFIG_X86_XMM + if (cpu_has_xmm) { + printk(KERN_INFO + "raid5: KNI detected, trying cache-avoiding KNI checksum routine\n"); + xor_speed(&t_xor_block_pIII_kni,&b1,&b2); + } +#endif /* CONFIG_X86_XMM */ + +#ifdef __i386__ + if (md_cpu_has_mmx()) { + printk(KERN_INFO + "raid5: MMX detected, trying high-speed MMX checksum routines\n"); + xor_speed(&t_xor_block_pII_mmx,&b1,&b2); + xor_speed(&t_xor_block_p5_mmx,&b1,&b2); + } +#endif /* __i386__ */ + +#ifdef __alpha__ + xor_speed(&t_xor_block_alpha,&b1,&b2); + xor_speed(&t_xor_block_alpha_prefetch,&b1,&b2); +#endif + + xor_speed(&t_xor_block_8regs,&b1,&b2); + xor_speed(&t_xor_block_32regs,&b1,&b2); + + free_pages((unsigned long)b1.b_data,2); + pick_fastest_function(); +} + +#else /* __sparc_v9__ */ + +void calibrate_xor_block(void) +{ + if (xor_block) + return; + printk(KERN_INFO "raid5: using high-speed VIS checksum routine\n"); + xor_block = xor_block_VIS; +} + +#endif /* __sparc_v9__ */ + +MD_EXPORT_SYMBOL(xor_block); +MD_EXPORT_SYMBOL(calibrate_xor_block); + +#ifdef MODULE +int init_module(void) +{ + calibrate_xor_block(); + return 0; +} +#endif |
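Editor's note on reading the calibration output: xor_speed() counts how many times a routine can checksum the SIZE-byte test buffer during one jiffy, keeps the best of five runs, and turns that into the printed figure with max * (HZ*SIZE/1024). The small reproduction below uses the same integer arithmetic, assuming 4 KiB pages, HZ of 100 and a hypothetical best count of 500 passes per jiffy.

/* Illustrative sketch, not part of the original commit. */
#include <stdio.h>

int main(void)
{
	int page_size = 4096, hz = 100;		/* assumptions: 4 KiB pages, HZ == 100        */
	int size = page_size - 6 * 32;		/* 3904: the cache-coloured test length       */
	int max = 500;				/* hypothetical best pass count in one jiffy  */
	int speed = max * (hz * size / 1024);	/* same integer arithmetic as xor_speed()     */

	printf("%5d.%03d MB/sec\n", speed / 1000, speed % 1000);
	return 0;
}

With those numbers the routine would be reported as 190.500 MB/sec in the boot log, and pick_fastest_function() would compare that figure against the other candidates unless the KNI or EV6 overrides apply.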