author     Ralf Baechle <ralf@linux-mips.org>       2000-10-05 01:18:40 +0000
committer  Ralf Baechle <ralf@linux-mips.org>       2000-10-05 01:18:40 +0000
commit     012bb3e61e5eced6c610f9e036372bf0c8def2d1 (patch)
tree       87efc733f9b164e8c85c0336f92c8fb7eff6d183 /drivers/block
parent     625a1589d3d6464b5d90b8a0918789e3afffd220 (diff)
Merge with Linux 2.4.0-test9. Please check DECstation; I had a number
of rejects to fix up while integrating Linus' patches. I also found
that this kernel will only boot SMP on Origin; the UP kernel freezes
soon after bootup with SCSI timeout messages. I am committing this
anyway, since the last CVS versions had the same problem.
Diffstat (limited to 'drivers/block')
-rw-r--r--  drivers/block/Config.in          15
-rw-r--r--  drivers/block/Makefile           15
-rw-r--r--  drivers/block/amiflop.c          12
-rw-r--r--  drivers/block/cciss.c          1917
-rw-r--r--  drivers/block/cciss.h           201
-rw-r--r--  drivers/block/cciss_cmd.h       254
-rw-r--r--  drivers/block/cpqarray.c         91
-rw-r--r--  drivers/block/cpqarray.h          7
-rw-r--r--  drivers/block/floppy.c           83
-rw-r--r--  drivers/block/genhd.c             4
-rw-r--r--  drivers/block/ida_ioctl.h         9
-rw-r--r--  drivers/block/linear.c          213
-rw-r--r--  drivers/block/ll_rw_blk.c         5
-rw-r--r--  drivers/block/loop.c              5
-rw-r--r--  drivers/block/lvm-snap.c        434
-rw-r--r--  drivers/block/lvm.c            2532
-rw-r--r--  drivers/block/md.c             3867
-rw-r--r--  drivers/block/paride/paride.h     4
-rw-r--r--  drivers/block/raid0.c           356
-rw-r--r--  drivers/block/raid1.c          1897
-rw-r--r--  drivers/block/raid5.c          2371
-rw-r--r--  drivers/block/swim3.c             8
-rw-r--r--  drivers/block/xd.c               10
-rw-r--r--  drivers/block/xor.c            1907
24 files changed, 2517 insertions, 13700 deletions
diff --git a/drivers/block/Config.in b/drivers/block/Config.in index 4a92d53c2..5aa228104 100644 --- a/drivers/block/Config.in +++ b/drivers/block/Config.in @@ -34,25 +34,12 @@ if [ "$CONFIG_PARIDE" = "y" -o "$CONFIG_PARIDE" = "m" ]; then source drivers/block/paride/Config.in fi dep_tristate 'Compaq SMART2 support' CONFIG_BLK_CPQ_DA $CONFIG_PCI +dep_tristate 'Compaq CISS Array support' CONFIG_BLK_CPQ_CISS_DA $CONFIG_PCI dep_tristate 'Mylex DAC960/DAC1100 PCI RAID Controller support' CONFIG_BLK_DEV_DAC960 $CONFIG_PCI tristate 'Loopback device support' CONFIG_BLK_DEV_LOOP dep_tristate 'Network block device support' CONFIG_BLK_DEV_NBD $CONFIG_NET -tristate 'Logical volume manager (LVM) support' CONFIG_BLK_DEV_LVM N -if [ "$CONFIG_BLK_DEV_LVM" != "n" ]; then - bool ' LVM information in proc filesystem' CONFIG_LVM_PROC_FS Y -fi - -tristate 'Multiple devices driver support' CONFIG_BLK_DEV_MD -dep_tristate ' Linear (append) mode' CONFIG_MD_LINEAR $CONFIG_BLK_DEV_MD -dep_tristate ' RAID-0 (striping) mode' CONFIG_MD_RAID0 $CONFIG_BLK_DEV_MD -dep_tristate ' RAID-1 (mirroring) mode' CONFIG_MD_RAID1 $CONFIG_BLK_DEV_MD -dep_tristate ' RAID-4/RAID-5 mode' CONFIG_MD_RAID5 $CONFIG_BLK_DEV_MD -if [ "$CONFIG_MD_LINEAR" = "y" -o "$CONFIG_MD_RAID0" = "y" -o "$CONFIG_MD_RAID1" = "y" -o "$CONFIG_MD_RAID5" = "y" ]; then - bool ' Boot support' CONFIG_MD_BOOT - bool ' Auto Detect support' CONFIG_AUTODETECT_RAID -fi tristate 'RAM disk support' CONFIG_BLK_DEV_RAM if [ "$CONFIG_BLK_DEV_RAM" = "y" -o "$CONFIG_BLK_DEV_RAM" = "m" ]; then int ' Default RAM disk size' CONFIG_BLK_DEV_RAM_SIZE 4096 diff --git a/drivers/block/Makefile b/drivers/block/Makefile index 237d7ac64..8ce7af56d 100644 --- a/drivers/block/Makefile +++ b/drivers/block/Makefile @@ -14,9 +14,7 @@ ALL_SUB_DIRS := $(SUB_DIRS) paride O_TARGET := block.o -export-objs := ll_rw_blk.o blkpg.o loop.o DAC960.o md.o xor.o -list-multi := lvm-mod.o -lvm-mod-objs := lvm.o lvm-snap.o +export-objs := ll_rw_blk.o blkpg.o loop.o DAC960.o obj-y := ll_rw_blk.o blkpg.o genhd.o elevator.o @@ -33,14 +31,8 @@ obj-$(CONFIG_BLK_DEV_LOOP) += loop.o obj-$(CONFIG_BLK_DEV_PS2) += ps2esdi.o obj-$(CONFIG_BLK_DEV_XD) += xd.o obj-$(CONFIG_BLK_CPQ_DA) += cpqarray.o +obj-$(CONFIG_BLK_CPQ_CISS_DA) += cciss.o obj-$(CONFIG_BLK_DEV_DAC960) += DAC960.o -obj-$(CONFIG_BLK_DEV_LVM) += lvm-mod.o - -obj-$(CONFIG_BLK_DEV_MD) += md.o -obj-$(CONFIG_MD_LINEAR) += linear.o -obj-$(CONFIG_MD_RAID0) += raid0.o -obj-$(CONFIG_MD_RAID1) += raid1.o -obj-$(CONFIG_MD_RAID5) += raid5.o xor.o obj-$(CONFIG_BLK_DEV_NBD) += nbd.o @@ -71,6 +63,3 @@ M_OBJS := $(sort $(filter-out $(export-objs), $(obj-m))) MX_OBJS := $(sort $(filter $(export-objs), $(obj-m))) include $(TOPDIR)/Rules.make - -lvm-mod.o: $(lvm-mod-objs) - $(LD) -r -o $@ $(lvm-mod-objs) diff --git a/drivers/block/amiflop.c b/drivers/block/amiflop.c index d33bda641..912bafc48 100644 --- a/drivers/block/amiflop.c +++ b/drivers/block/amiflop.c @@ -140,7 +140,7 @@ static int num_dr_types = sizeof(drive_types) / sizeof(drive_types[0]); /* defaults for 3 1/2" HD-Disks */ static int floppy_sizes[256]={880,880,880,880,720,720,720,720,}; -static int floppy_blocksizes[256]={0,}; +static int floppy_blocksizes[256]; /* hardsector size assumed to be 512 */ static int amiga_read(int), dos_read(int); @@ -151,7 +151,7 @@ static struct fd_data_type data_types[] = { }; /* current info on each unit */ -static struct amiga_floppy_struct unit[FD_MAX_UNITS] = {{ 0,}}; +static struct amiga_floppy_struct unit[FD_MAX_UNITS]; static struct timer_list flush_track_timer[FD_MAX_UNITS]; 
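+/*
+ * Note on the initializer cleanups in this hunk: objects with static
+ * storage duration are zero-initialized by the C standard, so spellings
+ * like "static int floppy_blocksizes[256]={0,};" are redundant, and with
+ * some compilers the explicit zero moves the object from .bss into .data.
+ */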
static struct timer_list post_write_timer; @@ -162,15 +162,15 @@ static int on_attempts; /* Synchronization of FDC access */ /* request loop (trackbuffer) */ static volatile int fdc_busy = -1; -static volatile int fdc_nested = 0; +static volatile int fdc_nested; static DECLARE_WAIT_QUEUE_HEAD(fdc_wait); static DECLARE_WAIT_QUEUE_HEAD(motor_wait); static volatile int selected = -1; /* currently selected drive */ -static int writepending = 0; -static int writefromint = 0; +static int writepending; +static int writefromint; static char *raw_buf; #define RAW_BUF_SIZE 30000 /* size of raw disk data */ @@ -180,7 +180,7 @@ static char *raw_buf; * information to interrupts. They are the data used for the current * request. */ -static volatile char block_flag = 0; +static volatile char block_flag; static DECLARE_WAIT_QUEUE_HEAD(wait_fd_block); /* MS-Dos MFM Coding tables (should go quick and easy) */ diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c new file mode 100644 index 000000000..f88762bb1 --- /dev/null +++ b/drivers/block/cciss.c @@ -0,0 +1,1917 @@ +/* + * Disk Array driver for Compaq SMART2 Controllers + * Copyright 2000 Compaq Computer Corporation + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + * Questions/Comments/Bugfixes to arrays@compaq.com + * + */ + +#include <linux/config.h> /* CONFIG_PROC_FS */ +#include <linux/module.h> +#include <linux/version.h> +#include <linux/types.h> +#include <linux/pci.h> +#include <linux/kernel.h> +#include <linux/malloc.h> +#include <linux/delay.h> +#include <linux/major.h> +#include <linux/fs.h> +#include <linux/blkpg.h> +#include <linux/timer.h> +#include <linux/proc_fs.h> +#include <linux/init.h> +#include <linux/hdreg.h> +#include <linux/spinlock.h> +#include <asm/uaccess.h> +#include <asm/io.h> + +#include <linux/blk.h> +#include <linux/blkdev.h> +#include <linux/genhd.h> + +#define CCISS_DRIVER_VERSION(maj,min,submin) ((maj<<16)|(min<<8)|(submin)) +#define DRIVER_NAME "Compaq CISS Driver (v 2.4.0)" +#define DRIVER_VERSION CCISS_DRIVER_VERSION(2,4,0) + +/* Embedded module documentation macros - see modules.h */ +MODULE_AUTHOR("Charles M. 
White III - Compaq Computer Corporation"); +MODULE_DESCRIPTION("Driver for Compaq Smart Array Controller 5300"); + +#include "cciss_cmd.h" +#include "cciss.h" +#include <linux/cciss_ioctl.h> + +#define NR_PRODUCTS (sizeof(products)/sizeof(struct board_type)) + +/* board_id = Subsystem Device ID & Vendor ID + * product = Marketing Name for the board + * access = Address of the struct of function pointers + */ +static struct board_type products[] = { + { 0x40700E11, "Smart Array 5300", &SA5_access }, +}; + +/* How long to wait (in millesconds) for board to go into simple mode */ +#define MAX_CONFIG_WAIT 1000 + +#define READ_AHEAD 128 +#define NR_CMDS 128 /* #commands that can be outstanding */ +#define MAX_CTLR 8 +static int nr_ctlr =0; +static ctlr_info_t *hba[MAX_CTLR] = { 0, 0, 0, 0, 0, 0, 0, 0 }; + +static struct proc_dir_entry *proc_cciss = NULL; + +static void do_cciss_request(int i); +/* + * This is a hack. This driver eats a major number for each controller, and + * sets blkdev[xxx].request_fn to each one of these so the real request + * function knows what controller its working with. + */ +#define DO_CCISS_REQUEST(x) { do_cciss_request(x); } + +static void do_cciss_request0(request_queue_t * q) DO_CCISS_REQUEST(0); +static void do_cciss_request1(request_queue_t * q) DO_CCISS_REQUEST(1); +static void do_cciss_request2(request_queue_t * q) DO_CCISS_REQUEST(2); +static void do_cciss_request3(request_queue_t * q) DO_CCISS_REQUEST(3); +static void do_cciss_request4(request_queue_t * q) DO_CCISS_REQUEST(4); +static void do_cciss_request5(request_queue_t * q) DO_CCISS_REQUEST(5); +static void do_cciss_request6(request_queue_t * q) DO_CCISS_REQUEST(6); +static void do_cciss_request7(request_queue_t * q) DO_CCISS_REQUEST(7); + +static int cciss_open(struct inode *inode, struct file *filep); +static int cciss_release(struct inode *inode, struct file *filep); +static int cciss_ioctl(struct inode *inode, struct file *filep, + unsigned int cmd, unsigned long arg); + +static int revalidate_allvol(kdev_t dev); +static int revalidate_logvol(kdev_t dev, int maxusage); +static int frevalidate_logvol(kdev_t dev); + +static void cciss_getgeometry(int cntl_num); + +static inline void addQ(CommandList_struct **Qptr, CommandList_struct *c); +static void start_io( ctlr_info_t *h); + +#ifdef CONFIG_PROC_FS +static int cciss_proc_get_info(char *buffer, char **start, off_t offset, + int length, int *eof, void *data); +static void cciss_procinit(int i); +#else +static int cciss_proc_get_info(char *buffer, char **start, off_t offset, + int length, int *eof, void *data) {} +static void cciss_procinit(int i) {} +#endif /* CONFIG_PROC_FS */ + +static struct block_device_operations cciss_fops = { + open: cciss_open, + release: cciss_release, + ioctl: cciss_ioctl, + revalidate: frevalidate_logvol, +}; + +/* + * Report information about this controller. 
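+ * The read handler formats a one-page summary (firmware version, queue
+ * depths and a line per logical drive) into the caller's buffer and does
+ * the usual read_proc offset/length bookkeeping before returning.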
+ */ +#ifdef CONFIG_PROC_FS +static int cciss_proc_get_info(char *buffer, char **start, off_t offset, + int length, int *eof, void *data) +{ + off_t pos = 0; + off_t len = 0; + int size, i, ctlr; + ctlr_info_t *h = (ctlr_info_t*)data; + drive_info_struct *drv; + + ctlr = h->ctlr; + size = sprintf(buffer, "%s: Compaq %s Controller\n" + " Board ID: %08lx\n" + " Firmware Version: %c%c%c%c\n" + " Memory Address: %08lx\n" + " IRQ: 0x%x\n" + " Logical drives: %d\n" + " Current Q depth: %d\n" + " Current # commands on controller %d\n" + " Max Q depth since init: %d\n" + " Max # commands on controller since init: %d\n" + " Max SG entries since init: %d\n\n", + h->devname, + h->product_name, + (unsigned long)h->board_id, + h->firm_ver[0], h->firm_ver[1], h->firm_ver[2], h->firm_ver[3], + (unsigned long)h->vaddr, + (unsigned int)h->intr, + h->num_luns, + h->Qdepth, h->commands_outstanding, + h->maxQsinceinit, h->max_outstanding, h->maxSG); + + pos += size; len += size; + for(i=0; i<h->num_luns; i++) { + drv = &h->drv[i]; + size = sprintf(buffer+len, "cciss/c%dd%d: blksz=%d nr_blocks=%d\n", + ctlr, i, drv->block_size, drv->nr_blocks); + pos += size; len += size; + } + + size = sprintf(buffer+len, "nr_allocs = %d\nnr_frees = %d\n", + h->nr_allocs, h->nr_frees); + pos += size; len += size; + + *eof = 1; + *start = buffer+offset; + len -= offset; + if (len>length) + len = length; + return len; +} + +/* + * Get us a file in /proc/cciss that says something about each controller. + * Create /proc/cciss if it doesn't exist yet. + */ +static void __init cciss_procinit(int i) +{ + if (proc_cciss == NULL) { + proc_cciss = proc_mkdir("driver/cciss", NULL); + if (!proc_cciss) + return; + } + + create_proc_read_entry(hba[i]->devname, 0, proc_cciss, + cciss_proc_get_info, hba[i]); +} +#endif /* CONFIG_PROC_FS */ + +/* + * For operations that cannot sleep, a command block is allocated at init, + * and managed by cmd_alloc() and cmd_free() using a simple bitmap to track + * which ones are free or in use. For operations that can wait for kmalloc + * to possible sleep, this routine can be called with a NULL pointer. + * cmd_free() MUST be called with a NULL pointer if cmd_alloc was. + */ +static CommandList_struct * cmd_alloc(ctlr_info_t *h) +{ + CommandList_struct *c; + int i; + u64bit temp64; + + if (h == NULL) + { + c = (CommandList_struct *)kmalloc(sizeof(CommandList_struct), + GFP_KERNEL); + if(c==NULL) + return NULL; + memset(c, 0, sizeof(CommandList_struct)); + + c->err_info = (ErrorInfo_struct *)kmalloc( + sizeof(ErrorInfo_struct), GFP_KERNEL); + + if (c->err_info == NULL) + { + kfree(c); + return NULL; + } + memset(c->err_info, 0, sizeof(ErrorInfo_struct)); + } else /* get it out of the controllers pool */ + { + do { + i = find_first_zero_bit(h->cmd_pool_bits, NR_CMDS); + if (i == NR_CMDS) + return NULL; + } while(test_and_set_bit(i%32, h->cmd_pool_bits+(i/32)) != 0); +#ifdef CCISS_DEBUG + printk(KERN_DEBUG "cciss: using command buffer %d\n", i); +#endif + c = h->cmd_pool + i; + memset(c, 0, sizeof(CommandList_struct)); + c->err_info = h->errinfo_pool + i; + memset(c->err_info, 0, sizeof(ErrorInfo_struct)); + h->nr_allocs++; + } + + + temp64.val = (__u64) virt_to_bus(c->err_info); + c->ErrDesc.Addr.lower = temp64.val32.lower; + c->ErrDesc.Addr.upper = temp64.val32.upper; + c->ErrDesc.Len = sizeof(ErrorInfo_struct); + c->busaddr = virt_to_bus(c); + return c; + + +} + +/* + * Frees a command block that was previously allocated with cmd_alloc(). 
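+ * For pool commands the index is recovered by pointer arithmetic against
+ * cmd_pool and the matching bit in cmd_pool_bits is cleared; commands
+ * allocated with a NULL controller were kmalloc'ed and are kfree'd here.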
+ */ +static void cmd_free(ctlr_info_t *h, CommandList_struct *c) +{ + int i; + + if( h == NULL) + { + kfree(c->err_info); + kfree(c); + } else + { + i = c - h->cmd_pool; + clear_bit(i%32, h->cmd_pool_bits+(i/32)); + h->nr_frees++; + } +} + +/* + * fills in the disk information. + */ +static void cciss_geninit( int ctlr) +{ + drive_info_struct *drv; + int i,j; + + /* Loop through each real device */ + hba[ctlr]->gendisk.nr_real = 0; + for(i=0; i< NWD; i++) + { + drv = &(hba[ctlr]->drv[i]); + if( !(drv->nr_blocks)) + continue; + hba[ctlr]->hd[i << NWD_SHIFT].nr_sects = + hba[ctlr]->sizes[i << NWD_SHIFT] = drv->nr_blocks; + + /* for each partition */ + for(j=0; j<MAX_PART; j++) + { + hba[ctlr]->blocksizes[(i<<NWD_SHIFT) + j] = 1024; + + hba[ctlr]->hardsizes[ (i<<NWD_SHIFT) + j] = + drv->block_size; + } + hba[ctlr]->gendisk.nr_real++; + } +} +/* + * Open. Make sure the device is really there. + */ +static int cciss_open(struct inode *inode, struct file *filep) +{ + int ctlr = MAJOR(inode->i_rdev) - MAJOR_NR; + int dsk = MINOR(inode->i_rdev) >> NWD_SHIFT; + +#ifdef CCISS_DEBUG + printk(KERN_DEBUG "cciss_open %x (%x:%x)\n", inode->i_rdev, ctlr, dsk); +#endif /* CCISS_DEBUG */ + + if (ctlr > MAX_CTLR || hba[ctlr] == NULL) + return -ENXIO; + + if (!suser() && hba[ctlr]->sizes[ MINOR(inode->i_rdev)] == 0) + return -ENXIO; + + /* + * Root is allowed to open raw volume zero even if its not configured + * so array config can still work. I don't think I really like this, + * but I'm already using way to many device nodes to claim another one + * for "raw controller". + */ + if (suser() + && (hba[ctlr]->sizes[MINOR(inode->i_rdev)] == 0) + && (MINOR(inode->i_rdev)!= 0)) + return -ENXIO; + + hba[ctlr]->drv[dsk].usage_count++; + hba[ctlr]->usage_count++; + MOD_INC_USE_COUNT; + return 0; +} +/* + * Close. Sync first. 
+ */ +static int cciss_release(struct inode *inode, struct file *filep) +{ + int ctlr = MAJOR(inode->i_rdev) - MAJOR_NR; + int dsk = MINOR(inode->i_rdev) >> NWD_SHIFT; + +#ifdef CCISS_DEBUG + printk(KERN_DEBUG "cciss_release %x (%x:%x)\n", inode->i_rdev, ctlr, dsk); +#endif /* CCISS_DEBUG */ + + /* fsync_dev(inode->i_rdev); */ + + hba[ctlr]->drv[dsk].usage_count--; + hba[ctlr]->usage_count--; + MOD_DEC_USE_COUNT; + return 0; +} + +/* + * ioctl + */ +static int cciss_ioctl(struct inode *inode, struct file *filep, + unsigned int cmd, unsigned long arg) +{ + int ctlr = MAJOR(inode->i_rdev) - MAJOR_NR; + int dsk = MINOR(inode->i_rdev) >> NWD_SHIFT; + int diskinfo[4]; + struct hd_geometry *geo = (struct hd_geometry *)arg; + +#ifdef CCISS_DEBUG + printk(KERN_DEBUG "cciss_ioctl: Called with cmd=%x %lx\n", cmd, arg); +#endif /* CCISS_DEBUG */ + + switch(cmd) { + case HDIO_GETGEO: + if (hba[ctlr]->drv[dsk].cylinders) { + diskinfo[0] = hba[ctlr]->drv[dsk].heads; + diskinfo[1] = hba[ctlr]->drv[dsk].sectors; + diskinfo[2] = hba[ctlr]->drv[dsk].cylinders; + } else { + diskinfo[0] = 0xff; + diskinfo[1] = 0x3f; + diskinfo[2] = hba[ctlr]->drv[dsk].nr_blocks / (0xff*0x3f); } + put_user(diskinfo[0], &geo->heads); + put_user(diskinfo[1], &geo->sectors); + put_user(diskinfo[2], &geo->cylinders); + put_user(hba[ctlr]->hd[MINOR(inode->i_rdev)].start_sect, &geo->start); + return 0; + case BLKGETSIZE: + if (!arg) return -EINVAL; + put_user(hba[ctlr]->hd[MINOR(inode->i_rdev)].nr_sects, (long*)arg); + return 0; + case BLKRRPART: + return revalidate_logvol(inode->i_rdev, 1); + case BLKFLSBUF: + case BLKROSET: + case BLKROGET: + case BLKRASET: + case BLKRAGET: + case BLKPG: + return( blk_ioctl(inode->i_rdev, cmd, arg)); + case CCISS_GETPCIINFO: + { + cciss_pci_info_struct pciinfo; + + if (!arg) return -EINVAL; + pciinfo.bus = hba[ctlr]->pci_bus; + pciinfo.dev_fn = hba[ctlr]->pci_dev_fn; + pciinfo.board_id = hba[ctlr]->board_id; + if (copy_to_user((void *) arg, &pciinfo, sizeof( cciss_pci_info_struct ))) + return -EFAULT; + return(0); + } + case CCISS_GETINTINFO: + { + cciss_coalint_struct intinfo; + ctlr_info_t *c = hba[ctlr]; + + if (!arg) return -EINVAL; + intinfo.delay = readl(&c->cfgtable->HostWrite.CoalIntDelay); + intinfo.count = readl(&c->cfgtable->HostWrite.CoalIntCount); + if (copy_to_user((void *) arg, &intinfo, sizeof( cciss_coalint_struct ))) + return -EFAULT; + return(0); + } + case CCISS_SETINTINFO: + { + cciss_coalint_struct intinfo; + ctlr_info_t *c = hba[ctlr]; + unsigned long flags; + int i; + + if (!arg) return -EINVAL; + if (!capable(CAP_SYS_ADMIN)) return -EPERM; + if (copy_from_user(&intinfo, (void *) arg, sizeof( cciss_coalint_struct))) + return -EFAULT; + if ( (intinfo.delay == 0 ) && (intinfo.count == 0)) + + { +// printk("cciss_ioctl: delay and count cannot be 0\n"); + return( -EINVAL); + } + spin_lock_irqsave(&io_request_lock, flags); + /* Can only safely update if no commands outstanding */ + if (c->commands_outstanding > 0 ) + { +// printk("cciss_ioctl: cannot change coalasing " +// "%d commands outstanding on controller\n", +// c->commands_outstanding); + spin_unlock_irqrestore(&io_request_lock, flags); + return(-EINVAL); + } + /* Update the field, and then ring the doorbell */ + writel( intinfo.delay, + &(c->cfgtable->HostWrite.CoalIntDelay)); + writel( intinfo.count, + &(c->cfgtable->HostWrite.CoalIntCount)); + writel( CFGTBL_ChangeReq, c->vaddr + SA5_DOORBELL); + + for(i=0;i<MAX_CONFIG_WAIT;i++) + { + if (!(readl(c->vaddr + SA5_DOORBELL) + & CFGTBL_ChangeReq)) + break; + /* delay and 
try again */ + udelay(1000); + } + spin_unlock_irqrestore(&io_request_lock, flags); + if (i >= MAX_CONFIG_WAIT) + return( -EFAULT); + return(0); + } + case CCISS_GETNODENAME: + { + NodeName_type NodeName; + ctlr_info_t *c = hba[ctlr]; + int i; + + if (!arg) return -EINVAL; + for(i=0;i<16;i++) + NodeName[i] = readb(&c->cfgtable->ServerName[i]); + if (copy_to_user((void *) arg, NodeName, sizeof( NodeName_type))) + return -EFAULT; + return(0); + } + case CCISS_SETNODENAME: + { + NodeName_type NodeName; + ctlr_info_t *c = hba[ctlr]; + unsigned long flags; + int i; + + if (!arg) return -EINVAL; + if (!capable(CAP_SYS_ADMIN)) return -EPERM; + + if (copy_from_user(NodeName, (void *) arg, sizeof( NodeName_type))) + return -EFAULT; + + spin_lock_irqsave(&io_request_lock, flags); + + /* Update the field, and then ring the doorbell */ + for(i=0;i<16;i++) + writeb( NodeName[i], &c->cfgtable->ServerName[i]); + + writel( CFGTBL_ChangeReq, c->vaddr + SA5_DOORBELL); + + for(i=0;i<MAX_CONFIG_WAIT;i++) + { + if (!(readl(c->vaddr + SA5_DOORBELL) + & CFGTBL_ChangeReq)) + break; + /* delay and try again */ + udelay(1000); + } + spin_unlock_irqrestore(&io_request_lock, flags); + if (i >= MAX_CONFIG_WAIT) + return( -EFAULT); + return(0); + } + + case CCISS_GETHEARTBEAT: + { + Heartbeat_type heartbeat; + ctlr_info_t *c = hba[ctlr]; + + if (!arg) return -EINVAL; + heartbeat = readl(&c->cfgtable->HeartBeat); + if (copy_to_user((void *) arg, &heartbeat, sizeof( Heartbeat_type))) + return -EFAULT; + return(0); + } + case CCISS_GETBUSTYPES: + { + BusTypes_type BusTypes; + ctlr_info_t *c = hba[ctlr]; + + if (!arg) return -EINVAL; + BusTypes = readl(&c->cfgtable->BusTypes); + if (copy_to_user((void *) arg, &BusTypes, sizeof( BusTypes_type) )) + return -EFAULT; + return(0); + } + case CCISS_GETFIRMVER: + { + FirmwareVer_type firmware; + + if (!arg) return -EINVAL; + memcpy(firmware, hba[ctlr]->firm_ver, 4); + + if (copy_to_user((void *) arg, firmware, sizeof( FirmwareVer_type))) + return -EFAULT; + return(0); + } + case CCISS_GETDRIVVER: + { + DriverVer_type DriverVer = DRIVER_VERSION; + + if (!arg) return -EINVAL; + + if (copy_to_user((void *) arg, &DriverVer, sizeof( DriverVer_type) )) + return -EFAULT; + return(0); + } + + case CCISS_REVALIDVOLS: + return( revalidate_allvol(inode->i_rdev)); + + case CCISS_PASSTHRU: + { + IOCTL_Command_struct iocommand; + ctlr_info_t *h = hba[ctlr]; + CommandList_struct *c; + char *buff = NULL; + u64bit temp64; + unsigned long flags; + + if (!arg) return -EINVAL; + + if (!capable(CAP_SYS_RAWIO)) return -EPERM; + + if (copy_from_user(&iocommand, (void *) arg, sizeof( IOCTL_Command_struct) )) + return -EFAULT; + if((iocommand.buf_size < 1) && + (iocommand.Request.Type.Direction != XFER_NONE)) + { + return -EINVAL; + } + /* Check kmalloc limits */ + if(iocommand.buf_size > 128000) + return -EINVAL; + if(iocommand.buf_size > 0) + { + buff = kmalloc(iocommand.buf_size, GFP_KERNEL); + if( buff == NULL) + return -EFAULT; + } + if (iocommand.Request.Type.Direction == XFER_WRITE) + { + /* Copy the data into the buffer we created */ + if (copy_from_user(buff, iocommand.buf, iocommand.buf_size)) + return -EFAULT; + } + if ((c = cmd_alloc(NULL)) == NULL) + { + if(buff!=NULL) + kfree(buff); + return -ENOMEM; + } + // Fill in the command type + c->cmd_type = CMD_IOCTL_PEND; + // Fill in Command Header + c->Header.ReplyQueue = 0; // unused in simple mode + if( iocommand.buf_size > 0) // buffer to fill + { + c->Header.SGList = 1; + c->Header.SGTotal= 1; + } else // no buffers to fill + { + 
c->Header.SGList = 0; + c->Header.SGTotal= 0; + } + c->Header.LUN = iocommand.LUN_info; + c->Header.Tag.lower = c->busaddr; // use the kernel address the cmd block for tag + + // Fill in Request block + c->Request = iocommand.Request; + + // Fill in the scatter gather information + if (iocommand.buf_size > 0 ) + { + temp64.val = (__u64) virt_to_bus(buff); + c->SG[0].Addr.lower = temp64.val32.lower; + c->SG[0].Addr.upper = temp64.val32.upper; + c->SG[0].Len = iocommand.buf_size; + c->SG[0].Ext = 0; // we are not chaining + } + /* Put the request on the tail of the request queue */ + spin_lock_irqsave(&io_request_lock, flags); + addQ(&h->reqQ, c); + h->Qdepth++; + start_io(h); + spin_unlock_irqrestore(&io_request_lock, flags); + + /* Wait for completion */ + while(c->cmd_type != CMD_IOCTL_DONE) + schedule_timeout(1); + + /* Copy the error information out */ + iocommand.error_info = *(c->err_info); + if ( copy_to_user((void *) arg, &iocommand, sizeof( IOCTL_Command_struct) ) ) + { + cmd_free(NULL, c); + if (buff != NULL) + kfree(buff); + return( -EFAULT); + } + + if (iocommand.Request.Type.Direction == XFER_READ) + { + /* Copy the data out of the buffer we created */ + if (copy_to_user(iocommand.buf, buff, iocommand.buf_size)) + { + cmd_free(NULL, c); + kfree(buff); + } + } + cmd_free(NULL, c); + if (buff != NULL) + kfree(buff); + return(0); + } + + default: + return -EBADRQC; + } + +} + +/* Borrowed and adapted from sd.c */ +static int revalidate_logvol(kdev_t dev, int maxusage) +{ + int ctlr, target; + struct gendisk *gdev; + unsigned long flags; + int max_p; + int start; + int i; + + target = MINOR(dev) >> NWD_SHIFT; + ctlr = MAJOR(dev) - MAJOR_NR; + gdev = &(hba[ctlr]->gendisk); + + spin_lock_irqsave(&io_request_lock, flags); + if (hba[ctlr]->drv[target].usage_count > maxusage) { + spin_unlock_irqrestore(&io_request_lock, flags); + printk(KERN_WARNING "cpqarray: Device busy for " + "revalidation (usage=%d)\n", + hba[ctlr]->drv[target].usage_count); + return -EBUSY; + } + hba[ctlr]->drv[target].usage_count++; + spin_unlock_irqrestore(&io_request_lock, flags); + + max_p = gdev->max_p; + start = target << gdev->minor_shift; + + for(i=max_p; i>=0; i--) { + int minor = start+i; + kdev_t devi = MKDEV(MAJOR_NR + ctlr, minor); + struct super_block *sb = get_super(devi); + sync_dev(devi); + if (sb) invalidate_inodes(sb); + invalidate_buffers(devi); + gdev->part[minor].start_sect = 0; + gdev->part[minor].nr_sects = 0; + + /* reset the blocksize so we can read the partition table */ + blksize_size[MAJOR_NR+ctlr][minor] = 1024; + } + /* setup partitions per disk */ + grok_partitions(gdev, target, MAX_PART, + hba[ctlr]->drv[target].nr_blocks); + hba[ctlr]->drv[target].usage_count--; + return 0; +} + +static int frevalidate_logvol(kdev_t dev) +{ +#ifdef CCISS_DEBUG + printk(KERN_DEBUG "cciss: frevalidate has been called\n"); +#endif /* CCISS_DEBUG */ + return revalidate_logvol(dev, 0); +} + +/* + * revalidate_allvol is for online array config utilities. After a + * utility reconfigures the drives in the array, it can use this function + * (through an ioctl) to make the driver zap any previous disk structs for + * that controller and get new ones. + * + * Right now I'm using the getgeometry() function to do this, but this + * function should probably be finer grained and allow you to revalidate one + * particualar logical volume (instead of all of them on a particular + * controller). 
+ */ +static int revalidate_allvol(kdev_t dev) +{ + int ctlr, i; + unsigned long flags; + + ctlr = MAJOR(dev) - MAJOR_NR; + if (MINOR(dev) != 0) + return -ENXIO; + + spin_lock_irqsave(&io_request_lock, flags); + if (hba[ctlr]->usage_count > 1) { + spin_unlock_irqrestore(&io_request_lock, flags); + printk(KERN_WARNING "cciss: Device busy for volume" + " revalidation (usage=%d)\n", hba[ctlr]->usage_count); + return -EBUSY; + } + spin_unlock_irqrestore(&io_request_lock, flags); + hba[ctlr]->usage_count++; + + /* + * Set the partition and block size structures for all volumes + * on this controller to zero. We will reread all of this data + */ + memset(hba[ctlr]->hd, 0, sizeof(struct hd_struct) * 256); + memset(hba[ctlr]->sizes, 0, sizeof(int) * 256); + memset(hba[ctlr]->blocksizes, 0, sizeof(int) * 256); + memset(hba[ctlr]->hardsizes, 0, sizeof(int) * 256); + memset(hba[ctlr]->drv, 0, sizeof(drive_info_struct) + * CISS_MAX_LUN); + hba[ctlr]->gendisk.nr_real = 0; + + /* + * Tell the array controller not to give us any interupts while + * we check the new geometry. Then turn interrupts back on when + * we're done. + */ + hba[ctlr]->access.set_intr_mask(hba[ctlr], CCISS_INTR_OFF); + cciss_getgeometry(ctlr); + hba[ctlr]->access.set_intr_mask(hba[ctlr], CCISS_INTR_ON); + + cciss_geninit(ctlr); + for(i=0; i<NWD; i++) + if (hba[ctlr]->sizes[ i<<NWD_SHIFT ]) + revalidate_logvol(dev+(i<<NWD_SHIFT), 2); + + hba[ctlr]->usage_count--; + return 0; +} + + + +/* + * Wait polling for a command to complete. + * The memory mapped FIFO is polled for the completion. + * Used only at init time, interrupts disabled. + */ +static unsigned long pollcomplete(int ctlr) +{ + unsigned long done; + int i; + + /* Wait (up to 2 seconds) for a command to complete */ + + for (i = 200000; i > 0; i--) { + done = hba[ctlr]->access.command_completed(hba[ctlr]); + if (done == FIFO_EMPTY) { + udelay(10); /* a short fixed delay */ + } else + return (done); + } + /* Invalid address to tell caller we ran out of time */ + return 1; +} +/* + * Send a command to the controller, and wait for it to complete. + * Only used at init time. + */ +static int sendcmd( + __u8 cmd, + int ctlr, + void *buff, + size_t size, + unsigned int use_unit_num, + unsigned int log_unit, + __u8 page_code ) +{ + CommandList_struct *c; + int i; + unsigned long complete; + ctlr_info_t *info_p= hba[ctlr]; + u64bit temp64; + + c = cmd_alloc(info_p); + if (c == NULL) + { + printk(KERN_WARNING "cciss: unable to get memory"); + return(IO_ERROR); + } + // Fill in Command Header + c->Header.ReplyQueue = 0; // unused in simple mode + if( buff != NULL) // buffer to fill + { + c->Header.SGList = 1; + c->Header.SGTotal= 1; + } else // no buffers to fill + { + c->Header.SGList = 0; + c->Header.SGTotal= 0; + } + c->Header.Tag.lower = c->busaddr; // use the kernel address the cmd block for tag + // Fill in Request block + switch(cmd) + { + case CISS_INQUIRY: + /* If the logical unit number is 0 then, this is going + to controller so It's a physical command + mode = 0 target = 0. + So we have nothing to write. + Otherwise + mode = 1 target = LUNID + */ + if(use_unit_num != 0) + { + c->Header.LUN.LogDev.VolId= + hba[ctlr]->drv[log_unit].LunID; + c->Header.LUN.LogDev.Mode = 1; + } + /* are we trying to read a vital product page */ + if(page_code != 0) + { + c->Request.CDB[1] = 0x01; + c->Request.CDB[2] = page_code; + } + c->Request.CDBLen = 6; + c->Request.Type.Type = TYPE_CMD; // It is a command. 
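+ // A CISS inquiry uses a 6-byte CDB: bytes 1-2 (set above) select a
+ // vital product page when page_code is nonzero, and byte 4 (set
+ // below) carries the allocation length.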
+ c->Request.Type.Attribute = ATTR_SIMPLE; + c->Request.Type.Direction = XFER_READ; // Read + c->Request.Timeout = 0; // Don't time out + c->Request.CDB[0] = CISS_INQUIRY; + c->Request.CDB[4] = size & 0xFF; + break; + case CISS_REPORT_LOG: + /* Talking to controller so It's a physical command + mode = 00 target = 0. + So we have nothing to write. + */ + c->Request.CDBLen = 12; + c->Request.Type.Type = TYPE_CMD; // It is a command. + c->Request.Type.Attribute = ATTR_SIMPLE; + c->Request.Type.Direction = XFER_READ; // Read + c->Request.Timeout = 0; // Don't time out + c->Request.CDB[0] = CISS_REPORT_LOG; + c->Request.CDB[6] = (size >> 24) & 0xFF; //MSB + c->Request.CDB[7] = (size >> 16) & 0xFF; + c->Request.CDB[8] = (size >> 8) & 0xFF; + c->Request.CDB[9] = size & 0xFF; + break; + + case CCISS_READ_CAPACITY: + c->Header.LUN.LogDev.VolId= + hba[ctlr]->drv[log_unit].LunID; + c->Header.LUN.LogDev.Mode = 1; + c->Request.CDBLen = 10; + c->Request.Type.Type = TYPE_CMD; // It is a command. + c->Request.Type.Attribute = ATTR_SIMPLE; + c->Request.Type.Direction = XFER_READ; // Read + c->Request.Timeout = 0; // Don't time out + c->Request.CDB[0] = CCISS_READ_CAPACITY; + break; + default: + printk(KERN_WARNING + "cciss: Unknown Command 0x%c sent attempted\n", + cmd); + cmd_free(info_p, c); + return(IO_ERROR); + }; + // Fill in the scatter gather information + if (size > 0 ) + { + temp64.val = (__u64) virt_to_bus(buff); + c->SG[0].Addr.lower = temp64.val32.lower; + c->SG[0].Addr.upper = temp64.val32.upper; + c->SG[0].Len = size; + c->SG[0].Ext = 0; // we are not chaining + } + /* + * Disable interrupt + */ +#ifdef CCISS_DEBUG + printk(KERN_DEBUG "cciss: turning intr off\n"); +#endif /* CCISS_DEBUG */ + info_p->access.set_intr_mask(info_p, CCISS_INTR_OFF); + + /* Make sure there is room in the command FIFO */ + /* Actually it should be completely empty at this time. */ + for (i = 200000; i > 0; i--) + { + /* if fifo isn't full go */ + if (!(info_p->access.fifo_full(info_p))) + { + + break; + } + udelay(10); + printk(KERN_WARNING "cciss cciss%d: SendCmd FIFO full," + " waiting!\n", ctlr); + } + /* + * Send the cmd + */ + info_p->access.submit_command(info_p, c); + complete = pollcomplete(ctlr); + +#ifdef CCISS_DEBUG + printk(KERN_DEBUG "cciss: command completed\n"); +#endif /* CCISS_DEBUG */ + + if (complete != 1) { + if ( (complete & CISS_ERROR_BIT) + && (complete & ~CISS_ERROR_BIT) == c->busaddr) + { + /* if data overrun or underun on Report command + ignore it + */ + if (((c->Request.CDB[0] == CISS_REPORT_LOG) || + (c->Request.CDB[0] == CISS_INQUIRY)) && + ((c->err_info->CommandStatus == + CMD_DATA_OVERRUN) || + (c->err_info->CommandStatus == + CMD_DATA_UNDERRUN) + )) + { + complete = c->busaddr; + } else + { + printk(KERN_WARNING "ciss ciss%d: sendcmd" + " Error %x \n", ctlr, + c->err_info->CommandStatus); + printk(KERN_WARNING "ciss ciss%d: sendcmd" + " offensive info\n" + " size %x\n num %x value %x\n", ctlr, + c->err_info->MoreErrInfo.Invalid_Cmd.offense_size, + c->err_info->MoreErrInfo.Invalid_Cmd.offense_num, + c->err_info->MoreErrInfo.Invalid_Cmd.offense_value); + cmd_free(info_p,c); + return(IO_ERROR); + } + } + if (complete != c->busaddr) { + printk( KERN_WARNING "cciss cciss%d: SendCmd " + "Invalid command list address returned! 
(%lx)\n", + ctlr, complete); + cmd_free(info_p, c); + return (IO_ERROR); + } + } else { + printk( KERN_WARNING + "cciss cciss%d: SendCmd Timeout out, " + "No command list address returned!\n", + ctlr); + cmd_free(info_p, c); + return (IO_ERROR); + } + cmd_free(info_p, c); + return (IO_OK); +} +/* + * Map (physical) PCI mem into (virtual) kernel space + */ +static ulong remap_pci_mem(ulong base, ulong size) +{ + ulong page_base = ((ulong) base) & PAGE_MASK; + ulong page_offs = ((ulong) base) - page_base; + ulong page_remapped = (ulong) ioremap(page_base, page_offs+size); + + return (ulong) (page_remapped ? (page_remapped + page_offs) : 0UL); +} + +/* + * Enqueuing and dequeuing functions for cmdlists. + */ +static inline void addQ(CommandList_struct **Qptr, CommandList_struct *c) +{ + if (*Qptr == NULL) { + *Qptr = c; + c->next = c->prev = c; + } else { + c->prev = (*Qptr)->prev; + c->next = (*Qptr); + (*Qptr)->prev->next = c; + (*Qptr)->prev = c; + } +} + +static inline CommandList_struct *removeQ(CommandList_struct **Qptr, + CommandList_struct *c) +{ + if (c && c->next != c) { + if (*Qptr == c) *Qptr = c->next; + c->prev->next = c->next; + c->next->prev = c->prev; + } else { + *Qptr = NULL; + } + return c; +} + +/* + * Takes jobs of the Q and sends them to the hardware, then puts it on + * the Q to wait for completion. + */ +static void start_io( ctlr_info_t *h) +{ + CommandList_struct *c; + + while(( c = h->reqQ) != NULL ) + { + /* can't do anything if fifo is full */ + if ((h->access.fifo_full(h))) + { + printk(KERN_WARNING "cciss: fifo full \n"); + return; + } + /* Get the frist entry from the Request Q */ + removeQ(&(h->reqQ), c); + h->Qdepth--; + + /* Tell the controller execute command */ + h->access.submit_command(h, c); + + /* Put job onto the completed Q */ + addQ (&(h->cmpQ), c); + } +} + +static inline void complete_buffers( struct buffer_head *bh, int status) +{ + struct buffer_head *xbh; + + while(bh) + { + xbh = bh->b_reqnext; + bh->b_reqnext = NULL; + bh->b_end_io(bh, status); + bh = xbh; + } +} +/* checks the status of the job and calls complete buffers to mark all + * buffers for the completed job. 
+ */ +static inline void complete_command( CommandList_struct *cmd, int timeout) +{ + int status = 1; + + if (timeout) + status = 0; + if(cmd->err_info->CommandStatus != 0) + { /* an error has occured */ + switch(cmd->err_info->CommandStatus) + { + case CMD_TARGET_STATUS: + printk(KERN_WARNING "cciss: cmd %p has " + " completed with errors\n", cmd); + if( cmd->err_info->ScsiStatus) + { + printk(KERN_WARNING "cciss: cmd %p " + "has SCSI Status = %x\n", + cmd, + cmd->err_info->ScsiStatus); + } + + break; + case CMD_DATA_UNDERRUN: + printk(KERN_WARNING "cciss: cmd %p has" + " completed with data underrun " + "reported\n", cmd); + break; + case CMD_DATA_OVERRUN: + printk(KERN_WARNING "cciss: cmd %p has" + " completed with data overrun " + "reported\n", cmd); + break; + case CMD_INVALID: + printk(KERN_WARNING "cciss: cmd %p is " + "reported invalid\n", cmd); + status = 0; + break; + case CMD_PROTOCOL_ERR: + printk(KERN_WARNING "cciss: cmd %p has " + "protocol error \n", cmd); + status = 0; + break; + case CMD_HARDWARE_ERR: + printk(KERN_WARNING "cciss: cmd %p had " + " hardware error\n", cmd); + status = 0; + break; + case CMD_CONNECTION_LOST: + printk(KERN_WARNING "cciss: cmd %p had " + "connection lost\n", cmd); + status=0; + break; + case CMD_ABORTED: + printk(KERN_WARNING "cciss: cmd %p was " + "aborted\n", cmd); + status=0; + break; + case CMD_ABORT_FAILED: + printk(KERN_WARNING "cciss: cmd %p reports " + "abort failed\n", cmd); + status=0; + break; + case CMD_UNSOLICITED_ABORT: + printk(KERN_WARNING "cciss: cmd %p aborted " + "do to an unsolicited abort\n", cmd); + status=0; + break; + case CMD_TIMEOUT: + printk(KERN_WARNING "cciss: cmd %p timedout\n", + cmd); + status=0; + break; + default: + printk(KERN_WARNING "cciss: cmd %p returned " + "unknown status %x\n", cmd, + cmd->err_info->CommandStatus); + status=0; + } + } + complete_buffers(cmd->bh, status); +} +/* + * Get a request and submit it to the controller. + * Currently we do one request at a time. Ideally we would like to send + * everything to the controller on the first call, but there is a danger + * of holding the io_request_lock for to long. + */ +static void do_cciss_request(int ctlr) +{ + ctlr_info_t *h= hba[ctlr]; + CommandList_struct *c; + int log_unit, start_blk, seg, sect; + char *lastdataend; + struct buffer_head *bh; + struct list_head *queue_head; + struct request *creq; + u64bit temp64; + + queue_head = &blk_dev[MAJOR_NR+ctlr].request_queue.queue_head; + if (list_empty(queue_head)) + { + /* nothing to do... */ + start_io(h); + return; + } + creq = blkdev_entry_next_request(queue_head); + if ((creq == NULL) || (creq->rq_status == RQ_INACTIVE)) + { + /* nothing to do... 
restart processing and return */ + start_io(h); + return; + } + if ((ctlr != (MAJOR(creq->rq_dev)-MAJOR_NR)) || (ctlr > nr_ctlr) + || (h == NULL)) + { +#ifdef CCISS_DEBUG + printk(KERN_WARNING "cciss: doreq cmd of %d, %x at %p\n", + ctlr, creq->rq_dev, creq); +#endif /* CCISS_DEBUG */ + complete_buffers(creq->bh, 0); + start_io(h); + return; + } + if (( c = cmd_alloc(h)) == NULL) + { + start_io(h); + return; + } + c->cmd_type = CMD_RWREQ; + bh = c->bh = creq->bh; + + /* fill in the request */ + log_unit = MINOR(creq->rq_dev) >> NWD_SHIFT; + c->Header.ReplyQueue = 0; // unused in simple mode + c->Header.Tag.lower = c->busaddr; // use the physical address the cmd block for tag + c->Header.LUN.LogDev.VolId= hba[ctlr]->drv[log_unit].LunID; + c->Header.LUN.LogDev.Mode = 1; + c->Request.CDBLen = 10; // 12 byte commands not in FW yet; + c->Request.Type.Type = TYPE_CMD; // It is a command. + c->Request.Type.Attribute = ATTR_SIMPLE; + c->Request.Type.Direction = + (creq->cmd == READ) ? XFER_READ: XFER_WRITE; + c->Request.Timeout = 0; // Don't time out + c->Request.CDB[0] = (creq->cmd == READ) ? CCISS_READ : CCISS_WRITE; + start_blk = hba[ctlr]->hd[MINOR(creq->rq_dev)].start_sect + creq->sector; + if (bh == NULL) + panic("cciss: bh== NULL?"); +#ifdef CCISS_DEBUG + printk(KERN_DEBUG "ciss: sector =%d nr_sectors=%d\n",(int) creq->sector, + (int) creq->nr_sectors); +#endif /* CCISS_DEBUG */ + seg = 0; + lastdataend = NULL; + sect = 0; + while(bh) + { + sect += bh->b_size/512; + if (bh->b_size % 512) + { + printk(KERN_CRIT "cciss: Oh Man. %d+%d, size=%d\n", + (int) creq->sector, sect, (int) bh->b_size); + panic("b_size 512 != 0\n"); + } + if (bh->b_data == lastdataend) + { // tack it on to the last segment + c->SG[seg-1].Len +=bh->b_size; + lastdataend += bh->b_size; + } else + { + c->SG[seg].Len = bh->b_size; + temp64.val = (__u64) virt_to_bus(bh->b_data); + c->SG[seg].Addr.lower = temp64.val32.lower; + c->SG[seg].Addr.upper = temp64.val32.upper; + c->SG[0].Ext = 0; // we are not chaining + lastdataend = bh->b_data + bh->b_size; + if( ++seg == MAXSGENTRIES) + { + break; + } + } + bh = bh->b_reqnext; + } + /* track how many SG entries we are using */ + if( seg > h->maxSG) + h->maxSG = seg; + + /* adjusting the remaining request, if any */ + creq-> sector+= sect; + creq->nr_sectors -= sect; + +#ifdef CCISS_DEBUG + printk(KERN_DEBUG "cciss: Submitting %d sectors in %d segments\n", sect, seg); +#endif /* CCISS_DEBUG */ + + c->Header.SGList = c->Header.SGTotal = seg; + c->Request.CDB[1]= 0; + c->Request.CDB[2]= (start_blk >> 24) & 0xff; //MSB + c->Request.CDB[3]= (start_blk >> 16) & 0xff; + c->Request.CDB[4]= (start_blk >> 8) & 0xff; + c->Request.CDB[5]= start_blk & 0xff; + c->Request.CDB[6]= 0; // (sect >> 24) & 0xff; MSB + // c->Request.CDB[7]= (sect >> 16) & 0xff; + c->Request.CDB[7]= (sect >> 8) & 0xff; + c->Request.CDB[8]= sect & 0xff; + c->Request.CDB[9] = c->Request.CDB[11] = c->Request.CDB[12] = 0; + + /* check to see if we going to complete the entire request */ + /* if so, mark this request as Done and ready the next one */ + if (creq->nr_sectors) + { +#ifdef CCISS_DEBUG + printk(KERN_DEBUG "cciss: More to do on the same request %p %ld\n", + creq, creq->nr_sectors); +#endif /* CCISS_DEBUG */ + + creq->bh = bh->b_reqnext; + bh->b_reqnext = NULL; + } else + { +#ifdef CCISS_DEBUG + printk("cciss: Done with %p, queueing %p\n", creq); +#endif /* CCISS_DEBUG */ + + blkdev_dequeue_request(creq); + end_that_request_last(creq); + } + addQ(&(h->reqQ),c); + h->Qdepth++; + if(h->Qdepth > h->maxQsinceinit) + 
h->maxQsinceinit = h->Qdepth; + start_io(h); +} + +static void do_cciss_intr(int irq, void *dev_id, struct pt_regs *regs) +{ + ctlr_info_t *h = dev_id; + CommandList_struct *c; + unsigned long flags; + __u32 a, a1; + + + /* Is this interrupt for us? */ + if ( h->access.intr_pending(h) == 0) + return; + + /* + * If there are completed commands in the completion queue, + * we had better do something about it. + */ + spin_lock_irqsave(&io_request_lock, flags); + while( h->access.intr_pending(h)) + { + while((a = h->access.command_completed(h)) != FIFO_EMPTY) + { + a1 = a; + a &= ~3; + if ((c = h->cmpQ) == NULL) + { + printk(KERN_WARNING "cpqarray: Completion of %08lx ignored\n", (unsigned long)a1); + continue; + } + while(c->busaddr != a) { + c = c->next; + if (c == h->cmpQ) + break; + } + /* + * If we've found the command, take it off the + * completion Q and free it + */ + if (c->busaddr == a) { + removeQ(&h->cmpQ, c); + if (c->cmd_type == CMD_RWREQ) { + complete_command(c, 0); + cmd_free(h, c); + } else if (c->cmd_type == CMD_IOCTL_PEND) { + c->cmd_type = CMD_IOCTL_DONE; + } + continue; + } + } + } + /* + * See if we can queue up some more IO + */ + do_cciss_request(h->ctlr); + spin_unlock_irqrestore(&io_request_lock, flags); +} +/* + * We cannot read the structure directly, for portablity we must use + * the io functions. + * This is for debug only. + */ +#ifdef CCISS_DEBUG +static void print_cfg_table( CfgTable_struct *tb) +{ + int i; + char temp_name[17]; + + printk("Controller Configuration information\n"); + printk("------------------------------------\n"); + for(i=0;i<4;i++) + temp_name[i] = readb(&(tb->Signature[i])); + temp_name[4]='\0'; + printk(" Signature = %s\n", temp_name); + printk(" Spec Number = %d\n", readl(&(tb->SpecValence))); + printk(" Transport methods supported = 0x%x\n", + readl(&(tb-> TransportSupport))); + printk(" Transport methods active = 0x%x\n", + readl(&(tb->TransportActive))); + printk(" Requested transport Method = 0x%x\n", + readl(&(tb->HostWrite.TransportRequest))); + printk(" Coalese Interrupt Delay = 0x%x\n", + readl(&(tb->HostWrite.CoalIntDelay))); + printk(" Coalese Interrupt Count = 0x%x\n", + readl(&(tb->HostWrite.CoalIntCount))); + printk(" Max outstanding commands = 0x%d\n", + readl(&(tb->CmdsOutMax))); + printk(" Bus Types = 0x%x\n", readl(&(tb-> BusTypes))); + for(i=0;i<16;i++) + temp_name[i] = readb(&(tb->ServerName[i])); + temp_name[16] = '\0'; + printk(" Server Name = %s\n", temp_name); + printk(" Heartbeat Counter = 0x%x\n\n\n", + readl(&(tb->HeartBeat))); +} +#endif /* CCISS_DEBUG */ + +static int cciss_pci_init(ctlr_info_t *c, unchar bus, unchar device_fn) +{ + ushort vendor_id, device_id, command; + unchar cache_line_size, latency_timer; + unchar irq, revision; + uint addr[6]; + __u32 board_id; + struct pci_dev *pdev; + + int i; + + pdev = pci_find_slot(bus, device_fn); + vendor_id = pdev->vendor; + device_id = pdev->device; + irq = pdev->irq; + + for(i=0; i<6; i++) + addr[i] = pdev->resource[i].start; + + if (pci_enable_device(pdev)) + return( -1); + + (void) pci_read_config_word(pdev, PCI_COMMAND,&command); + (void) pci_read_config_byte(pdev, PCI_CLASS_REVISION, &revision); + (void) pci_read_config_byte(pdev, PCI_CACHE_LINE_SIZE, + &cache_line_size); + (void) pci_read_config_byte(pdev, PCI_LATENCY_TIMER, + &latency_timer); + + (void) pci_read_config_dword(pdev, PCI_SUBSYSTEM_VENDOR_ID, + &board_id); + +#ifdef CCISS_DEBUG + printk("vendor_id = %x\n", vendor_id); + printk("device_id = %x\n", device_id); + printk("command = %x\n", 
command); + for(i=0; i<6; i++) + printk("addr[%d] = %x\n", i, addr[i]); + printk("revision = %x\n", revision); + printk("irq = %x\n", irq); + printk("cache_line_size = %x\n", cache_line_size); + printk("latency_timer = %x\n", latency_timer); + printk("board_id = %x\n", board_id); +#endif /* CCISS_DEBUG */ + + c->intr = irq; + + /* + * Memory base addr is first addr , the second points to the config + * table + */ + c->paddr = pci_resource_start(pdev, 0); + c->vaddr = remap_pci_mem(c->paddr, 128); + c->cfgtable = (CfgTable_struct *) remap_pci_mem(addr[1], + sizeof(CfgTable_struct)); + c->board_id = board_id; + +#ifdef CCISS_DEBUG + print_cfg_table(c->cfgtable); +#endif /* CCISS_DEBUG */ + for(i=0; i<NR_PRODUCTS; i++) { + if (board_id == products[i].board_id) { + c->product_name = products[i].product_name; + c->access = *(products[i].access); + break; + } + } + if (i == NR_PRODUCTS) { + printk(KERN_WARNING "cciss: Sorry, I don't know how" + " to access the Smart Array controller %08lx\n", + (unsigned long)board_id); + return -1; + } +#ifdef CCISS_DEBUG + printk("Trying to put board into Simple mode\n"); +#endif /* CCISS_DEBUG */ + c->max_commands = readl(&(c->cfgtable->CmdsOutMax)); + /* Update the field, and then ring the doorbell */ + writel( CFGTBL_Trans_Simple, + &(c->cfgtable->HostWrite.TransportRequest)); + writel( CFGTBL_ChangeReq, c->vaddr + SA5_DOORBELL); + + for(i=0;i<MAX_CONFIG_WAIT;i++) + { + if (!(readl(c->vaddr + SA5_DOORBELL) & CFGTBL_ChangeReq)) + break; + /* delay and try again */ + udelay(1000); + } + +#ifdef CCISS_DEBUG + printk(KERN_DEBUG "I counter got to %d %x\n", i, readl(c->vaddr + SA5_DOORBELL)); +#endif /* CCISS_DEBUG */ +#ifdef CCISS_DEBUG + print_cfg_table(c->cfgtable); +#endif /* CCISS_DEBUG */ + + if (!(readl(&(c->cfgtable->TransportActive)) & CFGTBL_Trans_Simple)) + { + printk(KERN_WARNING "cciss: unable to get board into" + " simple mode\n"); + return -1; + } + return 0; + +} +/* + * Scans PCI space for any controllers that this driver can control. + */ +static int cciss_pci_detect(void) +{ + + int index; + unchar bus=0, dev_fn=0; + + for(index=0; ; index++) { + if (pcibios_find_device(PCI_VENDOR_ID_COMPAQ, + PCI_DEVICE_ID_COMPAQ_CISS, + index, &bus, &dev_fn)) + break; + printk(KERN_DEBUG "cciss: Device %x has been found at %x %x\n", + PCI_DEVICE_ID_COMPAQ_CISS, bus, dev_fn); + if (index == 1000000) break; + if (nr_ctlr == 8) { + printk(KERN_WARNING "cciss: This driver" + " supports a maximum of 8 controllers.\n"); + break; + } + hba[nr_ctlr] = kmalloc(sizeof(ctlr_info_t), GFP_KERNEL); + if(hba[nr_ctlr]==NULL) + { + printk(KERN_ERR "cciss: out of memory.\n"); + continue; + } + memset(hba[nr_ctlr], 0, sizeof(ctlr_info_t)); + if (cciss_pci_init(hba[nr_ctlr], bus, dev_fn) != 0) + { + kfree(hba[nr_ctlr]); + continue; + } + sprintf(hba[nr_ctlr]->devname, "cciss%d", nr_ctlr); + hba[nr_ctlr]->ctlr = nr_ctlr; + hba[nr_ctlr]->pci_bus = bus; + hba[nr_ctlr]->pci_dev_fn = dev_fn; + nr_ctlr++; + + } + return nr_ctlr; + +} + +/* + * Gets information about the local volumes attached to the controller. 
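+ * The probe issues three kinds of commands: a standard inquiry for the
+ * firmware version, a report-logical-LUNs command for the volume list,
+ * and then a read-capacity plus a geometry inquiry (page 0xC1) for each
+ * volume found.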
+ */ +static void cciss_getgeometry(int cntl_num) +{ + ReportLunData_struct *ld_buff; + ReadCapdata_struct *size_buff; + InquiryData_struct *inq_buff; + int return_code; + int i; + int listlength = 0; + int lunid = 0; + int block_size; + int total_size; + + ld_buff = kmalloc(sizeof(ReportLunData_struct), GFP_KERNEL); + if (ld_buff == NULL) + { + printk(KERN_ERR "cciss: out of memory\n"); + return; + } + memset(ld_buff, 0, sizeof(ReportLunData_struct)); + size_buff = kmalloc(sizeof( ReadCapdata_struct), GFP_KERNEL); + if (size_buff == NULL) + { + printk(KERN_ERR "cciss: out of memory\n"); + kfree(ld_buff); + return; + } + inq_buff = kmalloc(sizeof( InquiryData_struct), GFP_KERNEL); + if (inq_buff == NULL) + { + printk(KERN_ERR "cciss: out of memory\n"); + kfree(ld_buff); + kfree(size_buff); + return; + } + /* Get the firmware version */ + return_code = sendcmd(CISS_INQUIRY, cntl_num, inq_buff, + sizeof(InquiryData_struct), 0, 0 ,0 ); + if (return_code == IO_OK) + { + hba[cntl_num]->firm_ver[0] = inq_buff->data_byte[32]; + hba[cntl_num]->firm_ver[1] = inq_buff->data_byte[33]; + hba[cntl_num]->firm_ver[2] = inq_buff->data_byte[34]; + hba[cntl_num]->firm_ver[3] = inq_buff->data_byte[35]; + } else /* send command failed */ + { + printk(KERN_WARNING "cciss: unable to determine firmware" + " version of controller\n"); + } + /* Get the number of logical volumes */ + return_code = sendcmd(CISS_REPORT_LOG, cntl_num, ld_buff, + sizeof(ReportLunData_struct), 0, 0, 0 ); + + if( return_code == IO_OK) + { +#ifdef CCISS_DEBUG + printk("LUN Data\n--------------------------\n"); +#endif /* CCISS_DEBUG */ + + listlength |= (0xff & (unsigned int)(ld_buff->LUNListLength[0])) << 24; + listlength |= (0xff & (unsigned int)(ld_buff->LUNListLength[1])) << 16; + listlength |= (0xff & (unsigned int)(ld_buff->LUNListLength[2])) << 8; + listlength |= 0xff & (unsigned int)(ld_buff->LUNListLength[3]); + } else /* reading number of logical volumes failed */ + { + printk(KERN_WARNING "cciss: report logical volume" + " command failed\n"); + listlength = 0; + } + hba[cntl_num]->num_luns = listlength / 8; // 8 bytes pre entry + if (hba[cntl_num]->num_luns > CISS_MAX_LUN) + { + printk(KERN_ERR "ciss: only %d number of logical volumes supported\n", + CISS_MAX_LUN); + hba[cntl_num]->num_luns = CISS_MAX_LUN; + } +#ifdef CCISS_DEBUG + printk(KERN_DEBUG "Length = %x %x %x %x = %d\n", ld_buff->LUNListLength[0], + ld_buff->LUNListLength[1], ld_buff->LUNListLength[2], + ld_buff->LUNListLength[3], hba[cntl_num]->num_luns); +#endif /* CCISS_DEBUG */ + for(i=0; i< hba[cntl_num]->num_luns ; i++) + { + lunid = (0xff & (unsigned int)(ld_buff->LUN[i][3])) << 24; + lunid |= (0xff & (unsigned int)(ld_buff->LUN[i][2])) << 16; + lunid |= (0xff & (unsigned int)(ld_buff->LUN[i][1])) << 8; + lunid |= 0xff & (unsigned int)(ld_buff->LUN[i][0]); + hba[cntl_num]->drv[i].LunID = lunid; + +#ifdef CCISS_DEBUG + printk(KERN_DEBUG "LUN[%d]: %x %x %x %x = %x\n", i, + ld_buff->LUN[i][0], ld_buff->LUN[i][1],ld_buff->LUN[i][2], + ld_buff->LUN[i][3], hba[cntl_num]->drv[i].LunID); +#endif /* CCISS_DEBUG */ + + memset(size_buff, 0, sizeof(ReadCapdata_struct)); + return_code = sendcmd(CCISS_READ_CAPACITY, cntl_num, size_buff, + sizeof( ReadCapdata_struct), 1, i, 0 ); + if (return_code == IO_OK) + { + total_size = (0xff & + (unsigned int)(size_buff->total_size[0])) << 24; + total_size |= (0xff & + (unsigned int)(size_buff->total_size[1])) << 16; + total_size |= (0xff & + (unsigned int)(size_buff->total_size[2])) << 8; + total_size |= (0xff & (unsigned int) + 
(size_buff->total_size[3])); + total_size++; // command returns highest block address + + block_size = (0xff & + (unsigned int)(size_buff->block_size[0])) << 24; + block_size |= (0xff & + (unsigned int)(size_buff->block_size[1])) << 16; + block_size |= (0xff & + (unsigned int)(size_buff->block_size[2])) << 8; + block_size |= (0xff & + (unsigned int)(size_buff->block_size[3])); + } else /* read capacity command failed */ + { + printk(KERN_WARNING "cciss: read capacity failed\n"); + total_size = block_size = 0; + } + printk(" blocks= %d block_size= %d\n", total_size, + block_size); + + /* Execute the command to read the disk geometry */ + memset(inq_buff, 0, sizeof(InquiryData_struct)); + return_code = sendcmd(CISS_INQUIRY, cntl_num, inq_buff, + sizeof(InquiryData_struct), 1, i ,0xC1 ); + if (return_code == IO_OK) + { + if(inq_buff->data_byte[8] == 0xFF) + { + printk(KERN_WARNING "cciss: reading geometry failed, volume does not support reading geometry\n"); + + hba[cntl_num]->drv[i].block_size = block_size; + hba[cntl_num]->drv[i].nr_blocks = total_size; + hba[cntl_num]->drv[i].heads = 255; + hba[cntl_num]->drv[i].sectors = 32; // Sectors per track + hba[cntl_num]->drv[i].cylinders = total_size / 255 / 32; } else + { + + hba[cntl_num]->drv[i].block_size = block_size; + hba[cntl_num]->drv[i].nr_blocks = total_size; + hba[cntl_num]->drv[i].heads = + inq_buff->data_byte[6]; + hba[cntl_num]->drv[i].sectors = + inq_buff->data_byte[7]; + hba[cntl_num]->drv[i].cylinders = + (inq_buff->data_byte[4] & 0xff) << 8; + hba[cntl_num]->drv[i].cylinders += + inq_buff->data_byte[5]; + } + } + else /* Get geometry failed */ + { + printk(KERN_WARNING "cciss: reading geometry failed, continuing with default geometry\n"); + + hba[cntl_num]->drv[i].block_size = block_size; + hba[cntl_num]->drv[i].nr_blocks = total_size; + hba[cntl_num]->drv[i].heads = 255; + hba[cntl_num]->drv[i].sectors = 32; // Sectors per track + hba[cntl_num]->drv[i].cylinders = total_size / 255 / 32; + } + printk(KERN_INFO " heads= %d, sectors= %d, cylinders= %d\n\n", + hba[cntl_num]->drv[i].heads, + hba[cntl_num]->drv[i].sectors, + hba[cntl_num]->drv[i].cylinders); + + } + kfree(ld_buff); + kfree(size_buff); +} + +/* + * This is it. Find all the controllers and register them. I really hate + * stealing all these major device numbers. + * returns the number of block devices registered. 
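+ * For each controller this registers a block major, hooks its interrupt,
+ * allocates the command and error-info pools, reads the drive geometry
+ * and publishes a gendisk with its partitions.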
+ */ +int __init cciss_init(void) +{ + int num_cntlrs_reg = 0; + int i,j; + + void (*request_fns[MAX_CTLR])(request_queue_t *) = { + do_cciss_request0, do_cciss_request1, + do_cciss_request2, do_cciss_request3, + do_cciss_request4, do_cciss_request5, + do_cciss_request6, do_cciss_request7, + }; + + /* detect controllers */ + cciss_pci_detect(); + + if (nr_ctlr == 0) + return(num_cntlrs_reg); + + printk(KERN_INFO DRIVER_NAME "\n"); + printk(KERN_INFO "Found %d controller(s)\n", nr_ctlr); + for(i=0;i<nr_ctlr;i++) + { + if( register_blkdev(MAJOR_NR+i, hba[i]->devname, &cciss_fops)) + { + printk(KERN_ERR "cciss: Unable to get major number " + "%d for %s\n", MAJOR_NR+i, hba[i]->devname); + continue; + } + /* make sure the board interrupts are off */ + hba[i]->access.set_intr_mask(hba[i], CCISS_INTR_OFF); + if( request_irq(hba[i]->intr, do_cciss_intr, SA_INTERRUPT|SA_SHIRQ, hba[i]->devname, hba[i])) + { + printk(KERN_ERR "ciss: Unable to get irq %d for %s\n", + hba[i]->intr, hba[i]->devname); + unregister_blkdev( MAJOR_NR+i, hba[i]->devname); + continue; + } + num_cntlrs_reg++; + hba[i]->cmd_pool_bits = (__u32*)kmalloc( + ((NR_CMDS+31)/32)*sizeof(__u32), GFP_KERNEL); + hba[i]->cmd_pool = (CommandList_struct *)kmalloc( + NR_CMDS * sizeof(CommandList_struct), + GFP_KERNEL); + hba[i]->errinfo_pool = (ErrorInfo_struct *)kmalloc( + NR_CMDS * sizeof( ErrorInfo_struct), + GFP_KERNEL); + if((hba[i]->cmd_pool_bits == NULL) + || (hba[i]->cmd_pool == NULL) + || (hba[i]->errinfo_pool == NULL)) + { + nr_ctlr = i; + if(hba[i]->cmd_pool_bits) + kfree(hba[i]->cmd_pool_bits); + if(hba[i]->cmd_pool) + kfree(hba[i]->cmd_pool); + if(hba[i]->errinfo_pool) + kfree(hba[i]->errinfo_pool); + free_irq(hba[i]->intr, hba[i]); + unregister_blkdev(MAJOR_NR+i, hba[i]->devname); + num_cntlrs_reg--; + printk( KERN_ERR "cciss: out of memory"); + return(num_cntlrs_reg); + } + + /* command and error info recs zeroed out before + they are used */ + memset(hba[i]->cmd_pool_bits, 0, + ((NR_CMDS+31)/32)*sizeof(__u32)); + +#ifdef CCISS_DEBUG + printk(KERN_DEBUG "Scanning for drives on controller cciss%d\n",i); +#endif /* CCISS_DEBUG */ + + cciss_getgeometry(i); + + /* Turn the interrupts on so we can service requests */ + hba[i]->access.set_intr_mask(hba[i], CCISS_INTR_ON); + + cciss_procinit(i); + + blk_init_queue(BLK_DEFAULT_QUEUE(MAJOR_NR+i), + request_fns[i]); + blk_queue_headactive(BLK_DEFAULT_QUEUE(MAJOR_NR+i), 0); + + /* fill in the other Kernel structs */ + blksize_size[MAJOR_NR+i] = hba[i]->blocksizes; + hardsect_size[MAJOR_NR+i] = hba[i]->hardsizes; + read_ahead[MAJOR_NR+i] = READ_AHEAD; + + /* Fill in the gendisk data */ + hba[i]->gendisk.major = MAJOR_NR + i; + hba[i]->gendisk.major_name = "cciss"; + hba[i]->gendisk.minor_shift = NWD_SHIFT; + hba[i]->gendisk.max_p = MAX_PART; + hba[i]->gendisk.part = hba[i]->hd; + hba[i]->gendisk.sizes = hba[i]->sizes; + hba[i]->gendisk.nr_real = hba[i]->num_luns; + + /* Get on the disk list */ + hba[i]->gendisk.next = gendisk_head; + gendisk_head = &(hba[i]->gendisk); + + cciss_geninit(i); + for(j=0; j<NWD; j++) + register_disk(&(hba[i]->gendisk), + MKDEV(MAJOR_NR+i, j <<4), + MAX_PART, &cciss_fops, + hba[i]->drv[j].nr_blocks); + } + return(nr_ctlr); +} + +EXPORT_NO_SYMBOLS; + +/* This is a bit of a hack... 
+
+/* This is a bit of a hack...
+ */
+static int __init init_cciss_module(void)
+{
+
+    if (cciss_init() == 0) /* all the block dev numbers already used */
+        return -EIO;       /* or no controllers were found */
+    return 0;
+}
+
+static void __exit cleanup_cciss_module(void)
+{
+    int i;
+    struct gendisk *g;
+
+    for(i=0; i<nr_ctlr; i++)
+    {
+        /* Turn board interrupts off */
+        hba[i]->access.set_intr_mask(hba[i], CCISS_INTR_OFF);
+        free_irq(hba[i]->intr, hba[i]);
+        iounmap((void*)hba[i]->vaddr);
+        unregister_blkdev(MAJOR_NR+i, hba[i]->devname);
+        remove_proc_entry(hba[i]->devname, proc_cciss);
+
+        /* remove it from the disk list */
+        if (gendisk_head == &(hba[i]->gendisk))
+        {
+            gendisk_head = hba[i]->gendisk.next;
+        } else
+        {
+            for(g=gendisk_head; g ; g=g->next)
+            {
+                if(g->next == &(hba[i]->gendisk))
+                {
+                    g->next = hba[i]->gendisk.next;
+                }
+            }
+        }
+        remove_proc_entry("driver/cciss", &proc_root);
+        kfree(hba[i]->cmd_pool);
+        kfree(hba[i]->errinfo_pool);
+        kfree(hba[i]->cmd_pool_bits);
+        kfree(hba[i]);
+    }
+}
+
+module_init(init_cciss_module);
+module_exit(cleanup_cciss_module);
diff --git a/drivers/block/cciss.h b/drivers/block/cciss.h
new file mode 100644
index 000000000..bd03aaea1
--- /dev/null
+++ b/drivers/block/cciss.h
@@ -0,0 +1,201 @@
+#ifndef CCISS_H
+#define CCISS_H
+
+#include <linux/genhd.h>
+
+#include "cciss_cmd.h"
+
+
+#define NWD 16
+#define NWD_SHIFT 4
+#define MAX_PART 16
+
+#define IO_OK 0
+#define IO_ERROR 1
+
+#define MAJOR_NR COMPAQ_CISS_MAJOR
+
+struct ctlr_info;
+typedef struct ctlr_info ctlr_info_t;
+
+struct access_method {
+    void (*submit_command)(ctlr_info_t *h, CommandList_struct *c);
+    void (*set_intr_mask)(ctlr_info_t *h, unsigned long val);
+    unsigned long (*fifo_full)(ctlr_info_t *h);
+    unsigned long (*intr_pending)(ctlr_info_t *h);
+    unsigned long (*command_completed)(ctlr_info_t *h);
+};
+typedef struct _drive_info_struct
+{
+    __u32 LunID;
+    int usage_count;
+    int nr_blocks;
+    int block_size;
+    int heads;
+    int sectors;
+    int cylinders;
+} drive_info_struct;
+
+struct ctlr_info
+{
+    int ctlr;
+    char devname[8];
+    char *product_name;
+    char firm_ver[4]; // Firmware version
+    unchar pci_bus;
+    unchar pci_dev_fn;
+    __u32 board_id;
+    ulong vaddr;
+    __u32 paddr;
+    CfgTable_struct *cfgtable;
+    int intr;
+
+    int max_commands;
+    int commands_outstanding;
+    int max_outstanding; /* Debug */
+    int num_luns;
+    int usage_count; /* number of opens on all minor devices */
+
+    // information about each logical volume
+    drive_info_struct drv[CISS_MAX_LUN];
+
+    struct access_method access;
+
+    /* queue and queue Info */
+    CommandList_struct *reqQ;
+    CommandList_struct *cmpQ;
+    unsigned int Qdepth;
+    unsigned int maxQsinceinit;
+    unsigned int maxSG;
+
+    /* pointers to command and error info pool */
+    CommandList_struct *cmd_pool;
+    ErrorInfo_struct *errinfo_pool;
+    __u32 *cmd_pool_bits;
+    int nr_allocs;
+    int nr_frees;
+
+    // Disk structures we need to pass back
+    struct gendisk gendisk;
+    // indexed by minor numbers
+    struct hd_struct hd[256];
+    int sizes[256];
+    int blocksizes[256];
+    int hardsizes[256];
+};
+
+/* Defining the different access_methods */
+/*
+ * Memory mapped FIFO interface (SMART 53xx cards)
+ */
+#define SA5_DOORBELL 0x20
+#define SA5_REQUEST_PORT_OFFSET 0x40
+#define SA5_REPLY_INTR_MASK_OFFSET 0x34
+#define SA5_REPLY_PORT_OFFSET 0x44
+#define SA5_INTR_STATUS 0x30
+
+#define SA5_INTR_OFF 0x08
+#define SA5_INTR_PENDING 0x08
+#define FIFO_EMPTY 0xffffffff
+
+#define CISS_ERROR_BIT 0x02
+
+#define CCISS_INTR_ON 1
+#define CCISS_INTR_OFF 0
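The struct access_method declared above is the header's central design choice: the core driver never touches board registers directly, it calls through h->access, and a board_type entry matched on board_id selects the right method set. Stripped to its essentials, the dispatch looks like this (a simplified, hypothetical illustration, not the driver's own types):

    #include <stdio.h>

    struct ctlr;

    /* One function pointer per hardware operation, as in cciss.h. */
    struct methods {
        void (*set_intr_mask)(struct ctlr *h, unsigned long val);
    };

    struct ctlr {
        const struct methods *access;
    };

    static void sa5_set_intr_mask(struct ctlr *h, unsigned long val)
    {
        /* the real driver would writel() to the board here */
        printf("SA5 interrupt mask <- %lu\n", val);
    }

    static const struct methods sa5_methods = { sa5_set_intr_mask };

    int main(void)
    {
        struct ctlr h = { &sa5_methods };
        h.access->set_intr_mask(&h, 0); /* CCISS_INTR_OFF-style call */
        return 0;
    }

A second board family only needs another methods table; none of the call sites change.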
+/*
+   Send the command to the hardware
+*/
+static void SA5_submit_command( ctlr_info_t *h, CommandList_struct *c)
+{
+#ifdef CCISS_DEBUG
+    printk("Sending %x - down to controller\n", c->busaddr );
+#endif /* CCISS_DEBUG */
+    writel(c->busaddr, h->vaddr + SA5_REQUEST_PORT_OFFSET);
+    h->commands_outstanding++;
+    if ( h->commands_outstanding > h->max_outstanding)
+        h->max_outstanding = h->commands_outstanding;
+}
+
+/*
+ * This card is the opposite of the other cards.
+ * 0 turns interrupts on...
+ * 0x08 turns them off...
+ */
+static void SA5_intr_mask(ctlr_info_t *h, unsigned long val)
+{
+    if (val)
+    { /* Turn interrupts on */
+        writel(0, h->vaddr + SA5_REPLY_INTR_MASK_OFFSET);
+    } else /* Turn them off */
+    {
+        writel( SA5_INTR_OFF,
+            h->vaddr + SA5_REPLY_INTR_MASK_OFFSET);
+    }
+}
+/*
+ * Returns true if fifo is full.
+ *
+ */
+static unsigned long SA5_fifo_full(ctlr_info_t *h)
+{
+    if( h->commands_outstanding >= h->max_commands)
+        return(1);
+    else
+        return(0);
+
+}
+/*
+ * returns value read from hardware.
+ * returns FIFO_EMPTY if there is nothing to read
+ */
+static unsigned long SA5_completed(ctlr_info_t *h)
+{
+    unsigned long register_value
+        = readl(h->vaddr + SA5_REPLY_PORT_OFFSET);
+    if(register_value != FIFO_EMPTY)
+    {
+        h->commands_outstanding--;
+#ifdef CCISS_DEBUG
+        printk("cciss: Read %lx back from board\n", register_value);
+#endif /* CCISS_DEBUG */
+    }
+#ifdef CCISS_DEBUG
+    else
+    {
+        printk("cciss: FIFO Empty read\n");
+    }
+#endif
+    return ( register_value);
+
+}
+/*
+ * Returns true if an interrupt is pending..
+ */
+static unsigned long SA5_intr_pending(ctlr_info_t *h)
+{
+    unsigned long register_value =
+        readl(h->vaddr + SA5_INTR_STATUS);
+#ifdef CCISS_DEBUG
+    printk("cciss: intr_pending %lx\n", register_value);
+#endif /* CCISS_DEBUG */
+    if( register_value & SA5_INTR_PENDING)
+        return 1;
+    return 0 ;
+}
+
+
+static struct access_method SA5_access = {
+    SA5_submit_command,
+    SA5_intr_mask,
+    SA5_fifo_full,
+    SA5_intr_pending,
+    SA5_completed,
+};
+
+struct board_type {
+    __u32 board_id;
+    char *product_name;
+    struct access_method *access;
+};
+#endif /* CCISS_H */
+
diff --git a/drivers/block/cciss_cmd.h b/drivers/block/cciss_cmd.h
new file mode 100644
index 000000000..456a3a021
--- /dev/null
+++ b/drivers/block/cciss_cmd.h
@@ -0,0 +1,254 @@
+#ifndef CCISS_CMD_H
+#define CCISS_CMD_H
+//###########################################################################
+//DEFINES
+//###########################################################################
+#define CISS_VERSION "1.00"
+
+//general boundary definitions
+#define SENSEINFOBYTES 32 // note that this value may vary between host implementations
+#define MAXSGENTRIES 31
+#define MAXREPLYQS 256
+
+//Command Status value
+#define CMD_SUCCESS 0x0000
+#define CMD_TARGET_STATUS 0x0001
+#define CMD_DATA_UNDERRUN 0x0002
+#define CMD_DATA_OVERRUN 0x0003
+#define CMD_INVALID 0x0004
+#define CMD_PROTOCOL_ERR 0x0005
+#define CMD_HARDWARE_ERR 0x0006
+#define CMD_CONNECTION_LOST 0x0007
+#define CMD_ABORTED 0x0008
+#define CMD_ABORT_FAILED 0x0009
+#define CMD_UNSOLICITED_ABORT 0x000A
+#define CMD_TIMEOUT 0x000B
+#define CMD_UNABORTABLE 0x000C
+
+//transfer direction
+#define XFER_NONE 0x00
+#define XFER_WRITE 0x01
+#define XFER_READ 0x02
+#define XFER_RSVD 0x03
+
+//task attribute
+#define ATTR_UNTAGGED 0x00
+#define ATTR_SIMPLE 0x04
+#define ATTR_HEADOFQUEUE 0x05
+#define ATTR_ORDERED 0x06
+#define ATTR_ACA 0x07
+
+//cdb type
+#define TYPE_CMD 0x00
+#define TYPE_MSG 0x01
+
+//config space register offsets
+#define CFG_VENDORID 0x00
+#define CFG_DEVICEID 0x02
+#define CFG_I2OBAR 0x10
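Taken together, the SA5 routines above define the whole host protocol: a command is submitted by writing its bus address to SA5_REQUEST_PORT_OFFSET, and completions are drained by reading SA5_REPLY_PORT_OFFSET until it yields FIFO_EMPTY (0xffffffff). A user-space sketch of that drain loop, with the register read simulated by an array (illustrative only):

    #include <stdio.h>

    #define FIFO_EMPTY 0xffffffff

    /* Stand-in for readl(h->vaddr + SA5_REPLY_PORT_OFFSET): two completed
     * command tags followed by an empty FIFO. */
    static unsigned long reply_fifo[] = { 0x1000, 0x2040, FIFO_EMPTY };
    static int rd;

    static unsigned long read_reply_port(void)
    {
        return reply_fifo[rd++];
    }

    int main(void)
    {
        unsigned long tag;

        /* same shape as calling SA5_completed() until nothing is pending */
        while ((tag = read_reply_port()) != FIFO_EMPTY)
            printf("command at bus address 0x%lx completed\n", tag);
        return 0;
    }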
+#define CFG_MEM1BAR 0x14
+
+//i2o space register offsets
+#define I2O_IBDB_SET 0x20
+#define I2O_IBDB_CLEAR 0x70
+#define I2O_INT_STATUS 0x30
+#define I2O_INT_MASK 0x34
+#define I2O_IBPOST_Q 0x40
+#define I2O_OBPOST_Q 0x44
+
+//Configuration Table
+#define CFGTBL_ChangeReq 0x00000001l
+#define CFGTBL_AccCmds 0x00000001l
+
+#define CFGTBL_Trans_Simple 0x00000002l
+
+#define CFGTBL_BusType_Ultra2 0x00000001l
+#define CFGTBL_BusType_Ultra3 0x00000002l
+#define CFGTBL_BusType_Fibre1G 0x00000100l
+#define CFGTBL_BusType_Fibre2G 0x00000200l
+typedef struct _vals32
+{
+    __u32 lower;
+    __u32 upper;
+} vals32;
+
+typedef union _u64bit
+{
+    vals32 val32;
+    __u64 val;
+} u64bit;
+
+// Type defs used in the following structs
+#define BYTE __u8
+#define WORD __u16
+#define HWORD __u16
+#define DWORD __u32
+#define QWORD vals32
+
+//###########################################################################
+//STRUCTURES
+//###########################################################################
+#define CISS_MAX_LUN 16
+// SCSI-3 Commands
+
+#pragma pack(1)
+
+#define CISS_INQUIRY 0x12
+// Data returned
+typedef struct _InquiryData_struct
+{
+    BYTE data_byte[36];
+} InquiryData_struct;
+
+#define CISS_REPORT_LOG 0xc2 /* Report Logical LUNs */
+// Data returned
+typedef struct _ReportLUNdata_struct
+{
+    BYTE LUNListLength[4];
+    DWORD reserved;
+    BYTE LUN[CISS_MAX_LUN][8];
+} ReportLunData_struct;
+
+#define CCISS_READ_CAPACITY 0x25 /* Read Capacity */
+typedef struct _ReadCapdata_struct
+{
+    BYTE total_size[4]; // Total size in blocks
+    BYTE block_size[4]; // Size of blocks in bytes
+} ReadCapdata_struct;
+
+// 12 byte commands not implemented in firmware yet.
+// #define CCISS_READ 0xa8 // Read(12)
+// #define CCISS_WRITE 0xaa // Write(12)
+#define CCISS_READ 0x28 // Read(10)
+#define CCISS_WRITE 0x2a // Write(10)
+
+//Command List Structure
+typedef union _SCSI3Addr_struct {
+    struct {
+        BYTE Bus:6;
+        BYTE Mode:2; // b00
+        BYTE Dev;
+    } PeripDev;
+    struct {
+        BYTE DevMSB:6;
+        BYTE Mode:2; // b01
+        BYTE DevLSB;
+    } LogDev;
+    struct {
+        BYTE Targ:6;
+        BYTE Mode:2; // b10
+        BYTE Dev:5;
+        BYTE Bus:3;
+    } LogUnit;
+} SCSI3Addr_struct;
+
+typedef struct _PhysDevAddr_struct {
+    DWORD TargetId:24;
+    DWORD Bus:6;
+    DWORD Mode:2;
+    SCSI3Addr_struct Target[2]; // 2-level target device addr
+} PhysDevAddr_struct;
+
+typedef struct _LogDevAddr_struct {
+    DWORD VolId:30;
+    DWORD Mode:2;
+    BYTE reserved[4];
+} LogDevAddr_struct;
+
+typedef union _LUNAddr_struct {
+    BYTE LunAddrBytes[8];
+    SCSI3Addr_struct SCSI3Lun[4];
+    PhysDevAddr_struct PhysDev;
+    LogDevAddr_struct LogDev;
+} LUNAddr_struct;
+
+typedef struct _CommandListHeader_struct {
+    BYTE ReplyQueue;
+    BYTE SGList;
+    HWORD SGTotal;
+    QWORD Tag;
+    LUNAddr_struct LUN;
+} CommandListHeader_struct;
+typedef struct _RequestBlock_struct {
+    BYTE CDBLen;
+    struct {
+        BYTE Type:3;
+        BYTE Attribute:3;
+        BYTE Direction:2;
+    } Type;
+    HWORD Timeout;
+    BYTE CDB[16];
+} RequestBlock_struct;
+typedef struct _ErrDescriptor_struct {
+    QWORD Addr;
+    DWORD Len;
+} ErrDescriptor_struct;
+typedef struct _SGDescriptor_struct {
+    QWORD Addr;
+    DWORD Len;
+    DWORD Ext;
+} SGDescriptor_struct;
+
+typedef union _MoreErrInfo_struct{
+    struct {
+        BYTE Reserved[3];
+        BYTE Type;
+        DWORD ErrorInfo;
+    } Common_Info;
+    struct {
+        BYTE Reserved[2];
+        BYTE offense_size; // size of offending entry
+        BYTE offense_num;  // byte # of offense 0-base
+        DWORD offense_value;
+    } Invalid_Cmd;
+} MoreErrInfo_struct;
+typedef struct _ErrorInfo_struct {
+    BYTE ScsiStatus;
+    BYTE SenseLen;
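Since the 12-byte opcodes are commented out, CCISS_READ and CCISS_WRITE are the standard SCSI READ(10)/WRITE(10) opcodes, so the CDB that goes into RequestBlock_struct carries the logical block address big-endian in bytes 2-5 and the transfer length in blocks in bytes 7-8. A hypothetical sketch of filling such a CDB (layout per the SCSI-3 command set, not code from this patch):

    #include <stdio.h>

    /* Fill a READ(10) CDB as it would be placed in RequestBlock_struct.CDB. */
    static void fill_read10(unsigned char cdb[16], unsigned int lba,
                            unsigned short count)
    {
        cdb[0] = 0x28;                /* CCISS_READ, i.e. READ(10) */
        cdb[2] = (lba >> 24) & 0xff;  /* logical block address, big-endian */
        cdb[3] = (lba >> 16) & 0xff;
        cdb[4] = (lba >> 8) & 0xff;
        cdb[5] = lba & 0xff;
        cdb[7] = (count >> 8) & 0xff; /* transfer length in blocks, big-endian */
        cdb[8] = count & 0xff;
    }

    int main(void)
    {
        unsigned char cdb[16] = { 0 };
        int i;

        fill_read10(cdb, 0x12345678, 8);
        for (i = 0; i < 10; i++)
            printf("%02x ", cdb[i]);
        printf("\n");
        return 0;
    }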
+    HWORD CommandStatus;
+    DWORD ResidualCnt;
+    MoreErrInfo_struct MoreErrInfo;
+    BYTE SenseInfo[SENSEINFOBYTES];
+} ErrorInfo_struct;
+
+/* Command types */
+#define CMD_RWREQ 0x00
+#define CMD_IOCTL_PEND 0x01
+#define CMD_IOCTL_DONE 0x02
+
+typedef struct _CommandList_struct {
+    CommandListHeader_struct Header;
+    RequestBlock_struct Request;
+    ErrDescriptor_struct ErrDesc;
+    SGDescriptor_struct SG[MAXSGENTRIES];
+    /* information associated with the command */
+    __u32 busaddr; /* physical address of this record */
+    ErrorInfo_struct * err_info; /* pointer to the allocated mem */
+    int cmd_type;
+    struct _CommandList_struct *prev;
+    struct _CommandList_struct *next;
+    struct buffer_head * bh;
+} CommandList_struct;
+
+//Configuration Table Structure
+typedef struct _HostWrite_struct {
+    DWORD TransportRequest;
+    DWORD Reserved;
+    DWORD CoalIntDelay;
+    DWORD CoalIntCount;
+} HostWrite_struct;
+
+typedef struct _CfgTable_struct {
+    BYTE Signature[4];
+    DWORD SpecValence;
+    DWORD TransportSupport;
+    DWORD TransportActive;
+    HostWrite_struct HostWrite;
+    DWORD CmdsOutMax;
+    DWORD BusTypes;
+    DWORD Reserved;
+    BYTE ServerName[16];
+    DWORD HeartBeat;
+} CfgTable_struct;
+#pragma pack()
+#endif // CCISS_CMD_H
diff --git a/drivers/block/cpqarray.c b/drivers/block/cpqarray.c
index b03429fae..f26611bc3 100644
--- a/drivers/block/cpqarray.c
+++ b/drivers/block/cpqarray.c
@@ -46,6 +46,12 @@
 #define DRIVER_NAME "Compaq SMART2 Driver (v 2.4.0)"
 #define DRIVER_VERSION SMART2_DRIVER_VERSION(2,4,0)
+
+/* Embedded module documentation macros - see modules.h */
+/* Original author Chris Frantz - Compaq Computer Corporation */
+MODULE_AUTHOR("Compaq Computer Corporation");
+MODULE_DESCRIPTION("Driver for Compaq Smart2 Array Controllers");
+
 #define MAJOR_NR COMPAQ_SMART2_MAJOR
 #include <linux/blk.h>
 #include <linux/blkdev.h>
@@ -85,6 +91,7 @@ static struct board_type products[] = {
 	{ 0x40330E11, "Smart Array 3100ES", &smart2_access },
 	{ 0x40340E11, "Smart Array 221", &smart2_access },
 	{ 0x40400E11, "Integrated Array", &smart4_access },
+	{ 0x40480E11, "Compaq Raid LC2", &smart4_access },
 	{ 0x40500E11, "Smart Array 4200", &smart4_access },
 	{ 0x40510E11, "Smart Array 4250ES", &smart4_access },
 	{ 0x40580E11, "Smart Array 431", &smart4_access },
@@ -109,8 +116,8 @@ static struct proc_dir_entry *proc_array = NULL;
 int cpqarray_init(void);
 static int cpqarray_pci_detect(void);
-static int cpqarray_pci_init(ctlr_info_t *c, unchar bus, unchar device_fn);
-static ulong remap_pci_mem(ulong base, ulong size);
+static int cpqarray_pci_init(ctlr_info_t *c, struct pci_dev *pdev);
+static void *remap_pci_mem(ulong base, ulong size);
 static int cpqarray_eisa_detect(void);
 static int pollcomplete(int ctlr);
 static void getgeometry(int ctlr);
@@ -328,7 +335,7 @@ void cleanup_module(void)
 	for(i=0; i<nr_ctlr; i++) {
 		hba[i]->access.set_intr_mask(hba[i], 0);
 		free_irq(hba[i]->intr, hba[i]);
-		iounmap((void*)hba[i]->vaddr);
+		iounmap(hba[i]->vaddr);
 		unregister_blkdev(MAJOR_NR+i, hba[i]->devname);
 		del_timer(&hba[i]->timer);
 		blk_cleanup_queue(BLK_DEFAULT_QUEUE(MAJOR_NR + i));
@@ -535,8 +542,7 @@ int __init cpqarray_init(void)
  */
 static int cpqarray_pci_detect(void)
 {
-	int index;
-	unchar bus=0, dev_fn=0;
+	struct pci_dev *pdev;
 
 #define IDA_BOARD_TYPES 3
 	static int ida_vendor_id[IDA_BOARD_TYPES] = { PCI_VENDOR_ID_DEC,
@@ -547,29 +553,22 @@ static int cpqarray_pci_detect(void)
 
 	/* search for all PCI board types that could be for this driver */
 	for(brdtype=0; brdtype<IDA_BOARD_TYPES; brdtype++)
 	{
-		for(index=0; ; index++) {
-			if
(pcibios_find_device(ida_vendor_id[brdtype], - ida_device_id[brdtype], index, &bus, &dev_fn)) - break; + pdev = pci_find_device(ida_vendor_id[brdtype], + ida_device_id[brdtype], NULL); + while (pdev) { printk(KERN_DEBUG "cpqarray: Device %x has been found at %x %x\n", - ida_vendor_id[brdtype], bus, dev_fn); - if (index == 1000000) break; + ida_vendor_id[brdtype], + pdev->bus->number, pdev->devfn); if (nr_ctlr == 8) { printk(KERN_WARNING "cpqarray: This driver" " supports a maximum of 8 controllers.\n"); break; } - + /* if it is a PCI_DEVICE_ID_NCR_53C1510, make sure it's the Compaq version of the chip */ if (ida_device_id[brdtype] == PCI_DEVICE_ID_NCR_53C1510) { - unsigned short subvendor=0; - if(pcibios_read_config_word(bus, dev_fn, - PCI_SUBSYSTEM_VENDOR_ID, &subvendor)) - { - printk(KERN_DEBUG "cpqarray: failed to read subvendor\n"); - continue; - } + unsigned short subvendor=pdev->subsystem_vendor; if(subvendor != PCI_VENDOR_ID_COMPAQ) { printk(KERN_DEBUG @@ -584,7 +583,7 @@ static int cpqarray_pci_detect(void) continue; } memset(hba[nr_ctlr], 0, sizeof(ctlr_info_t)); - if (cpqarray_pci_init(hba[nr_ctlr], bus, dev_fn) != 0) + if (cpqarray_pci_init(hba[nr_ctlr], pdev) != 0) { kfree(hba[nr_ctlr]); continue; @@ -593,6 +592,8 @@ static int cpqarray_pci_detect(void) hba[nr_ctlr]->ctlr = nr_ctlr; nr_ctlr++; + pdev = pci_find_device(ida_vendor_id[brdtype], + ida_device_id[brdtype], pdev); } } @@ -603,24 +604,23 @@ static int cpqarray_pci_detect(void) * Find the IO address of the controller, its IRQ and so forth. Fill * in some basic stuff into the ctlr_info_t structure. */ -static int cpqarray_pci_init(ctlr_info_t *c, unchar bus, unchar device_fn) +static int cpqarray_pci_init(ctlr_info_t *c, struct pci_dev *pdev) { ushort vendor_id, device_id, command; unchar cache_line_size, latency_timer; unchar irq, revision; - uint addr[6]; + unsigned long addr[6]; __u32 board_id; - struct pci_dev *pdev; int i; - pdev = pci_find_slot(bus, device_fn); + c->pci_dev = pdev; vendor_id = pdev->vendor; device_id = pdev->device; irq = pdev->irq; for(i=0; i<6; i++) - addr[i] = pdev->resource[i].flags; + addr[i] = pci_resource_start(pdev, i); if (pci_enable_device(pdev)) return -1; @@ -637,7 +637,7 @@ DBGINFO( printk("device_id = %x\n", device_id); printk("command = %x\n", command); for(i=0; i<6; i++) - printk("addr[%d] = %x\n", i, addr[i]); + printk("addr[%d] = %lx\n", i, addr[i]); printk("revision = %x\n", revision); printk("irq = %x\n", irq); printk("cache_line_size = %x\n", cache_line_size); @@ -646,17 +646,19 @@ DBGINFO( ); c->intr = irq; - c->ioaddr = addr[0] & ~0x1; + c->ioaddr = addr[0]; - /* - * Memory base addr is first addr with the first bit _not_ set - */ + c->paddr = 0; for(i=0; i<6; i++) - if (!(addr[i] & 0x1)) { + if (pci_resource_flags(pdev, i) & IORESOURCE_MEM) { c->paddr = pci_resource_start (pdev, i); break; } + if (!c->paddr) + return -1; c->vaddr = remap_pci_mem(c->paddr, 128); + if (!c->vaddr) + return -1; c->board_id = board_id; for(i=0; i<NR_PRODUCTS; i++) { @@ -679,13 +681,13 @@ DBGINFO( /* * Map (physical) PCI mem into (virtual) kernel space */ -static ulong remap_pci_mem(ulong base, ulong size) +static void *remap_pci_mem(ulong base, ulong size) { ulong page_base = ((ulong) base) & PAGE_MASK; ulong page_offs = ((ulong) base) - page_base; - ulong page_remapped = (ulong) ioremap(page_base, page_offs+size); + void *page_remapped = ioremap(page_base, page_offs+size); - return (ulong) (page_remapped ? (page_remapped + page_offs) : 0UL); + return (page_remapped ? 
(page_remapped + page_offs) : NULL);
 }
 
 #ifndef MODULE
@@ -769,6 +771,7 @@ static int cpqarray_eisa_detect(void)
 		hba[nr_ctlr]->access = *(products[j].access);
 		hba[nr_ctlr]->ctlr = nr_ctlr;
 		hba[nr_ctlr]->board_id = board_id;
+		hba[nr_ctlr]->pci_dev = NULL; /* not PCI */
 
 DBGINFO(
 		printk("i = %d, j = %d\n", i, j);
@@ -898,7 +901,7 @@ static void do_ida_request(int ctlr)
 	if (ctlr != MAJOR(creq->rq_dev)-MAJOR_NR || ctlr > nr_ctlr ||
 	    h == NULL)
 	{
-		printk("doreq cmd for %d, %x at %p\n",
+		printk(KERN_WARNING "doreq cmd for %d, %x at %p\n",
 			ctlr, creq->rq_dev, creq);
 		complete_buffers(creq->bh, 0);
 		start_io(h);
@@ -1188,6 +1191,20 @@ static int ida_ioctl(struct inode *inode, struct file *filep, unsigned int cmd,
 		if (!arg) return -EINVAL;
 		put_user(DRIVER_VERSION, (unsigned long*)arg);
 		return 0;
+	case IDAGETPCIINFO:
+	{
+
+		ida_pci_info_struct pciinfo;
+
+		if (!arg) return -EINVAL;
+		pciinfo.bus = hba[ctlr]->pci_dev->bus->number;
+		pciinfo.dev_fn = hba[ctlr]->pci_dev->devfn;
+		pciinfo.board_id = hba[ctlr]->board_id;
+		if(copy_to_user((void *) arg, &pciinfo,
+			sizeof( ida_pci_info_struct)))
+			return -EFAULT;
+		return(0);
+	}
 
 	case BLKFLSBUF:
 	case BLKROSET:
@@ -1198,7 +1215,7 @@ static int ida_ioctl(struct inode *inode, struct file *filep, unsigned int cmd,
 		return blk_ioctl(inode->i_rdev, cmd, arg);
 
 	default:
-		return -EBADRQC;
+		return -EINVAL;
 	}
 
 }
@@ -1378,6 +1395,8 @@ static int sendcmd(
 	ctlr_info_t *info_p = hba[ctlr];
 
 	c = cmd_alloc(info_p);
+	if(!c)
+		return IO_ERROR;
 	c->ctlr = ctlr;
 	c->hdr.unit = log_unit;
 	c->hdr.prio = 0;
diff --git a/drivers/block/cpqarray.h b/drivers/block/cpqarray.h
index 31e9786f3..de569c7de 100644
--- a/drivers/block/cpqarray.h
+++ b/drivers/block/cpqarray.h
@@ -87,12 +87,13 @@ struct ctlr_info {
 	int	log_drives;
 	int	phys_drives;
 
+	struct pci_dev *pci_dev;	/* NULL if EISA */
 	__u32	board_id;
 	char	*product_name;
 
-	__u32	vaddr;
-	__u32	paddr;
-	__u32	ioaddr;
+	void	*vaddr;
+	unsigned long	paddr;
+	unsigned long	ioaddr;
 	int	intr;
 	int	usage_count;
 	drv_info_t	drv[NWD];
diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c
index dcd9d1036..2b26bb66b 100644
--- a/drivers/block/floppy.c
+++ b/drivers/block/floppy.c
@@ -118,6 +118,12 @@
  * being used to store jiffies, which are unsigned longs).
  */
 
+/*
+ * 2000/08/28 -- Arnaldo Carvalho de Melo <acme@conectiva.com.br>
+ * - get rid of check_region
+ * - s/suser/capable/
+ */
+
 #define FLOPPY_SANITY_CHECK
 #undef FLOPPY_SILENT_DCL_CLEAR
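The changelog comment just added points at the conversion made later in this patch in floppy_grab_irq_and_dma(): the old check_region()-then-request_region() pair was racy, since another driver could claim the range between the check and the reservation, so the patch tests the return value of request_region() itself. The pattern, as a 2.4-style sketch (claim_fdc_ports() is a hypothetical helper, not in the patch):

    #include <linux/ioport.h>
    #include <linux/errno.h>

    /* Sketch: claim the FDC port ranges and unwind on failure, instead of
     * check_region() followed by an unchecked request_region(). */
    static int claim_fdc_ports(unsigned long base)
    {
        if (!request_region(base, 6, "floppy"))
            return -EBUSY;                   /* ports already owned */
        if (!request_region(base + 7, 1, "floppy DIR")) {
            release_region(base, 6);         /* undo the first claim */
            return -EBUSY;
        }
        return 0;
    }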
@@ -162,7 +168,7 @@ static int print_unex=1;
  * It's been recommended that take about 1/4 of the default speed
  * in some more extreme cases.
  */
-static int slow_floppy = 0;
+static int slow_floppy;
 
 #include <asm/dma.h>
 #include <asm/irq.h>
@@ -197,7 +203,7 @@ static unsigned short virtual_dma_port=0x3f0;
 void floppy_interrupt(int irq, void *dev_id, struct pt_regs * regs);
 static int set_dor(int fdc, char mask, char data);
 static void register_devfs_entries (int drive) __init;
-static devfs_handle_t devfs_handle = NULL;
+static devfs_handle_t devfs_handle;
 
 #define K_64	0x10000		/* 64KB */
@@ -215,7 +221,7 @@ static int allowed_drive_mask = 0x33;
 
 #include <asm/floppy.h>
 
-static int irqdma_allocated = 0;
+static int irqdma_allocated;
 
 #define MAJOR_NR FLOPPY_MAJOR
@@ -253,7 +259,7 @@ static inline void fallback_on_nodma_alloc(char **addr, size_t l)
 
 /* End dma memory related stuff */
 
-static unsigned long fake_change = 0;
+static unsigned long fake_change;
 static int initialising=1;
 
 static inline int TYPE(kdev_t x) {
@@ -454,10 +460,7 @@ static struct floppy_struct floppy_type[32] = {
 #define SECTSIZE (_FD_SECTSIZE(*floppy))
 
 /* Auto-detection: Disk type used until the next media change occurs. */
-static struct floppy_struct *current_type[N_DRIVE] = {
-	NULL, NULL, NULL, NULL,
-	NULL, NULL, NULL, NULL
-};
+static struct floppy_struct *current_type[N_DRIVE];
 
 /*
  * User-provided type information. current_type points to
@@ -466,14 +469,14 @@ static struct floppy_struct *current_type[N_DRIVE] = {
 static struct floppy_struct user_params[N_DRIVE];
 
 static int floppy_sizes[256];
-static int floppy_blocksizes[256] = { 0, };
+static int floppy_blocksizes[256];
 
 /*
  * The driver is trying to determine the correct media format
 * while probing is set. rw_interrupt() clears it after a
 * successful access.
 */
-static int probing = 0;
+static int probing;
 
 /* Synchronization of FDC access. */
 #define FD_COMMAND_NONE -1
@@ -481,7 +484,7 @@ static int probing = 0;
 #define FD_COMMAND_OKAY 3
 
 static volatile int command_status = FD_COMMAND_NONE;
-static unsigned long fdc_busy = 0;
+static unsigned long fdc_busy;
 static DECLARE_WAIT_QUEUE_HEAD(fdc_wait);
 static DECLARE_WAIT_QUEUE_HEAD(command_done);
@@ -552,9 +555,7 @@ static void reset_fdc(void);
 #define NEED_1_RECAL	-2
 #define NEED_2_RECAL	-3
 
-/* */
-static int usage_count = 0;
-
+static int usage_count;
 
 /* buffer related variables */
 static int buffer_track = -1;
@@ -567,8 +568,8 @@ static struct floppy_fdc_state fdc_state[N_FDC];
 static int fdc; /* current fdc */
 
 static struct floppy_struct *_floppy = floppy_type;
-static unsigned char current_drive = 0;
-static long current_count_sectors = 0;
+static unsigned char current_drive;
+static long current_count_sectors;
 static unsigned char sector_t; /* sector in track */
 static unsigned char in_sector_offset;	/* offset within physical sector,
 					 * expressed in units of 512 bytes */
@@ -619,7 +620,7 @@ static void is_alive(const char *message)
 #define OLOGSIZE 20
 
-static void (*lasthandler)(void) = NULL;
+static void (*lasthandler)(void);
 static unsigned long interruptjiffies;
 static unsigned long resultjiffies;
 static int resultsize;
@@ -985,8 +986,7 @@ static void empty(void)
 {
 }
 
-static struct tq_struct floppy_tq =
-{ 0, 0, 0, 0 };
+static struct tq_struct floppy_tq;
 
 static void schedule_bh( void (*handler)(void*) )
 {
@@ -1263,7 +1263,7 @@ static inline void perpendicular_mode(void)
 } /* perpendicular_mode */
 
 static int fifo_depth = 0xa;
-static int no_fifo = 0;
+static int no_fifo;
 
 static int fdc_configure(void)
 {
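The next hunk threads io_request_lock through request_done(): in 2.4, end_request() dequeues and completes the current request, so it must run under the global io_request_lock, while request_done() itself can be entered without it. The added code follows the usual save/restore pattern; schematically (a sketch of the locking pattern only, not standalone code):

    unsigned long flags;

    /* save interrupt state, take the block-layer lock, complete the
     * request, then restore */
    spin_lock_irqsave(&io_request_lock, flags);
    end_request(1);  /* complete the current request */
    spin_unlock_irqrestore(&io_request_lock, flags);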
@@ -2282,6 +2282,7 @@ static int do_format(kdev_t device, struct format_descr *tmp_format_req)
 static void request_done(int uptodate)
 {
 	int block;
+	unsigned long flags;
 
 	probing = 0;
 	reschedule_timeout(MAXTIMEOUT, "request done %d", uptodate);
@@ -2300,6 +2301,7 @@ static void request_done(int uptodate)
 		DRS->maxtrack = 1;
 
 		/* unlock chained buffers */
+		spin_lock_irqsave(&io_request_lock, flags);
 		while (current_count_sectors && !QUEUE_EMPTY &&
 		       current_count_sectors >= CURRENT->current_nr_sectors){
 			current_count_sectors -= CURRENT->current_nr_sectors;
@@ -2307,6 +2309,8 @@ static void request_done(int uptodate)
 			CURRENT->sector += CURRENT->current_nr_sectors;
 			end_request(1);
 		}
+		spin_unlock_irqrestore(&io_request_lock, flags);
+
 		if (current_count_sectors && !QUEUE_EMPTY){
 			/* "unlock" last subsector */
 			CURRENT->buffer += current_count_sectors <<9;
@@ -2330,7 +2334,9 @@ static void request_done(int uptodate)
 			DRWE->last_error_sector = CURRENT->sector;
 			DRWE->last_error_generation = DRS->generation;
 		}
+		spin_lock_irqsave(&io_request_lock, flags);
 		end_request(0);
+		spin_unlock_irqrestore(&io_request_lock, flags);
 	}
 }
 
@@ -3497,7 +3503,7 @@ static int fd_ioctl(struct inode *inode, struct file *filp, unsigned int cmd,
 
 	/* permission checks */
 	if (((cmd & 0x40) && !(filp->f_mode & 2)) ||
-	    ((cmd & 0x80) && !suser()))
+	    ((cmd & 0x80) && !capable(CAP_SYS_ADMIN)))
 		return -EPERM;
 
 	/* copyin */
@@ -4299,23 +4305,14 @@ static int floppy_grab_irq_and_dma(void)
 
 	for (fdc=0; fdc< N_FDC; fdc++){
 		if (FDCS->address != -1){
-			if (check_region(FDCS->address, 6) < 0 ||
-			    check_region(FDCS->address+7, 1) < 0) {
+			if (!request_region(FDCS->address, 6, "floppy")) {
 				DPRINT("Floppy io-port 0x%04lx in use\n", FDCS->address);
-				fd_free_irq();
-				fd_free_dma();
-				while(--fdc >= 0) {
-					release_region(FDCS->address, 6);
-					release_region(FDCS->address+7, 1);
-				}
-				MOD_DEC_USE_COUNT;
-				spin_lock_irqsave(&floppy_usage_lock, flags);
-				usage_count--;
-				spin_unlock_irqrestore(&floppy_usage_lock, flags);
-				return -1;
+				goto cleanup1;
+			}
+			if (!request_region(FDCS->address + 7, 1, "floppy DIR")) {
+				DPRINT("Floppy io-port 0x%04lx in use\n", FDCS->address + 7);
+				goto cleanup2;
 			}
-			request_region(FDCS->address, 6, "floppy");
-			request_region(FDCS->address+7, 1, "floppy DIR");
 			/* address + 6 is reserved, and may be taken by IDE.
* Unfortunately, Adaptec doesn't know this :-(, */ } @@ -4339,6 +4336,20 @@ static int floppy_grab_irq_and_dma(void) fdc = 0; irqdma_allocated = 1; return 0; +cleanup2: + release_region(FDCS->address, 6); +cleanup1: + fd_free_irq(); + fd_free_dma(); + while(--fdc >= 0) { + release_region(FDCS->address, 6); + release_region(FDCS->address + 7, 1); + } + MOD_DEC_USE_COUNT; + spin_lock_irqsave(&floppy_usage_lock, flags); + usage_count--; + spin_unlock_irqrestore(&floppy_usage_lock, flags); + return -1; } static void floppy_release_irq_and_dma(void) diff --git a/drivers/block/genhd.c b/drivers/block/genhd.c index a9bf58157..2cc319e20 100644 --- a/drivers/block/genhd.c +++ b/drivers/block/genhd.c @@ -23,7 +23,6 @@ extern int blk_dev_init(void); #ifdef CONFIG_BLK_DEV_DAC960 extern void DAC960_Initialize(void); #endif -extern int scsi_dev_init(void); extern int net_dev_init(void); extern void console_map_init(void); extern int soc_probe(void); @@ -50,9 +49,6 @@ void __init device_init(void) /* This has to be done before scsi_dev_init */ soc_probe(); #endif -#ifdef CONFIG_SCSI - scsi_dev_init(); -#endif #ifdef CONFIG_IEEE1394 ieee1394_init(); #endif diff --git a/drivers/block/ida_ioctl.h b/drivers/block/ida_ioctl.h index 9c159df98..5cc212431 100644 --- a/drivers/block/ida_ioctl.h +++ b/drivers/block/ida_ioctl.h @@ -33,7 +33,14 @@ #define IDAGETCTLRSIG 0x29293030 #define IDAREVALIDATEVOLS 0x30303131 #define IDADRIVERVERSION 0x31313232 +#define IDAGETPCIINFO 0x32323333 +typedef struct _ida_pci_info_struct +{ + unsigned char bus; + unsigned char dev_fn; + __u32 board_id; +} ida_pci_info_struct; /* * Normally, the ioctl determines the logical unit for this command by * the major,minor number of the fd passed to ioctl. If you need to send @@ -60,7 +67,7 @@ typedef struct { union ctlr_cmds { drv_info_t drv; - unsigned char buf[512]; + unsigned char buf[1024]; id_ctlr_t id_ctlr; drv_param_t drv_param; diff --git a/drivers/block/linear.c b/drivers/block/linear.c deleted file mode 100644 index 855bc44dd..000000000 --- a/drivers/block/linear.c +++ /dev/null @@ -1,213 +0,0 @@ -/* - linear.c : Multiple Devices driver for Linux - Copyright (C) 1994-96 Marc ZYNGIER - <zyngier@ufr-info-p7.ibp.fr> or - <maz@gloups.fdn.fr> - - Linear mode management functions. - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2, or (at your option) - any later version. - - You should have received a copy of the GNU General Public License - (for example /usr/src/linux/COPYING); if not, write to the Free - Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. -*/ - -#include <linux/module.h> - -#include <linux/raid/md.h> -#include <linux/malloc.h> - -#include <linux/raid/linear.h> - -#define MAJOR_NR MD_MAJOR -#define MD_DRIVER -#define MD_PERSONALITY - -static int linear_run (mddev_t *mddev) -{ - linear_conf_t *conf; - struct linear_hash *table; - mdk_rdev_t *rdev; - int size, i, j, nb_zone; - unsigned int curr_offset; - - MOD_INC_USE_COUNT; - - conf = kmalloc (sizeof (*conf), GFP_KERNEL); - if (!conf) - goto out; - mddev->private = conf; - - if (md_check_ordering(mddev)) { - printk("linear: disks are not ordered, aborting!\n"); - goto out; - } - /* - * Find the smallest device. 
- */ - - conf->smallest = NULL; - curr_offset = 0; - ITERATE_RDEV_ORDERED(mddev,rdev,j) { - dev_info_t *disk = conf->disks + j; - - disk->dev = rdev->dev; - disk->size = rdev->size; - disk->offset = curr_offset; - - curr_offset += disk->size; - - if (!conf->smallest || (disk->size < conf->smallest->size)) - conf->smallest = disk; - } - - nb_zone = conf->nr_zones = - md_size[mdidx(mddev)] / conf->smallest->size + - ((md_size[mdidx(mddev)] % conf->smallest->size) ? 1 : 0); - - conf->hash_table = kmalloc (sizeof (struct linear_hash) * nb_zone, - GFP_KERNEL); - if (!conf->hash_table) - goto out; - - /* - * Here we generate the linear hash table - */ - table = conf->hash_table; - i = 0; - size = 0; - for (j = 0; j < mddev->nb_dev; j++) { - dev_info_t *disk = conf->disks + j; - - if (size < 0) { - table->dev1 = disk; - table++; - } - size += disk->size; - - while (size) { - table->dev0 = disk; - size -= conf->smallest->size; - if (size < 0) - break; - table->dev1 = NULL; - table++; - } - } - table->dev1 = NULL; - - return 0; - -out: - if (conf) - kfree(conf); - MOD_DEC_USE_COUNT; - return 1; -} - -static int linear_stop (mddev_t *mddev) -{ - linear_conf_t *conf = mddev_to_conf(mddev); - - kfree(conf->hash_table); - kfree(conf); - - MOD_DEC_USE_COUNT; - - return 0; -} - -static int linear_make_request (mddev_t *mddev, - int rw, struct buffer_head * bh) -{ - linear_conf_t *conf = mddev_to_conf(mddev); - struct linear_hash *hash; - dev_info_t *tmp_dev; - long block; - - block = bh->b_rsector >> 1; - hash = conf->hash_table + (block / conf->smallest->size); - - if (block >= (hash->dev0->size + hash->dev0->offset)) { - if (!hash->dev1) { - printk ("linear_make_request : hash->dev1==NULL for block %ld\n", - block); - return -1; - } - tmp_dev = hash->dev1; - } else - tmp_dev = hash->dev0; - - if (block >= (tmp_dev->size + tmp_dev->offset) - || block < tmp_dev->offset) { - printk ("linear_make_request: Block %ld out of bounds on dev %s size %ld offset %ld\n", block, kdevname(tmp_dev->dev), tmp_dev->size, tmp_dev->offset); - return -1; - } - bh->b_rdev = tmp_dev->dev; - bh->b_rsector = bh->b_rsector - (tmp_dev->offset << 1); - - return 1; -} - -static int linear_status (char *page, mddev_t *mddev) -{ - int sz = 0; - -#undef MD_DEBUG -#ifdef MD_DEBUG - int j; - linear_conf_t *conf = mddev_to_conf(mddev); - - sz += sprintf(page+sz, " "); - for (j = 0; j < conf->nr_zones; j++) - { - sz += sprintf(page+sz, "[%s", - partition_name(conf->hash_table[j].dev0->dev)); - - if (conf->hash_table[j].dev1) - sz += sprintf(page+sz, "/%s] ", - partition_name(conf->hash_table[j].dev1->dev)); - else - sz += sprintf(page+sz, "] "); - } - sz += sprintf(page+sz, "\n"); -#endif - sz += sprintf(page+sz, " %dk rounding", mddev->param.chunk_size/1024); - return sz; -} - - -static mdk_personality_t linear_personality= -{ - name: "linear", - make_request: linear_make_request, - run: linear_run, - stop: linear_stop, - status: linear_status, -}; - -#ifndef MODULE - -void md__init linear_init (void) -{ - register_md_personality (LINEAR, &linear_personality); -} - -#else - -int init_module (void) -{ - return (register_md_personality (LINEAR, &linear_personality)); -} - -void cleanup_module (void) -{ - unregister_md_personality (LINEAR); -} - -#endif - diff --git a/drivers/block/ll_rw_blk.c b/drivers/block/ll_rw_blk.c index 8a61d1e84..db562b8ac 100644 --- a/drivers/block/ll_rw_blk.c +++ b/drivers/block/ll_rw_blk.c @@ -600,6 +600,8 @@ static inline void add_request(request_queue_t * q, struct request * req, major = MAJOR(req->rq_dev); if 
(major >= COMPAQ_SMART2_MAJOR+0 && major <= COMPAQ_SMART2_MAJOR+7) (q->request_fn)(q); + if (major >= COMPAQ_CISS_MAJOR+0 && major <= COMPAQ_CISS_MAJOR+7) + (q->request_fn)(q); if (major >= DAC960_MAJOR+0 && major <= DAC960_MAJOR+7) (q->request_fn)(q); } @@ -1128,9 +1130,6 @@ int __init blk_dev_init(void) #ifdef CONFIG_SJCD sjcd_init(); #endif CONFIG_SJCD -#ifdef CONFIG_BLK_DEV_MD - md_init(); -#endif CONFIG_BLK_DEV_MD #ifdef CONFIG_APBLOCK ap_init(); #endif diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 47651b852..da1d6629d 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -357,7 +357,7 @@ repeat: if (current_request->cmd == WRITE) { mark_buffer_uptodate(bh, 1); - mark_buffer_dirty(bh, 1); + mark_buffer_dirty(bh); } brelse(bh); dest_addr += size; @@ -584,6 +584,8 @@ static int loop_set_status(struct loop_device *lo, struct loop_info *arg) type = info.lo_encrypt_type; if (type >= MAX_LO_CRYPT || xfer_funcs[type] == NULL) return -EINVAL; + if (type == LO_CRYPT_XOR && info.lo_encrypt_key_size == 0) + return -EINVAL; err = loop_release_xfer(lo); if (!err) err = loop_init_xfer(lo, type, &info); @@ -793,7 +795,6 @@ int __init loop_init(void) max_loop = 8; } - printk(KERN_INFO "loop: registered device at major %d\n", MAJOR_NR); printk(KERN_INFO "loop: enabling %d loop devices\n", max_loop); loop_dev = kmalloc (max_loop * sizeof(struct loop_device), GFP_KERNEL); diff --git a/drivers/block/lvm-snap.c b/drivers/block/lvm-snap.c deleted file mode 100644 index 938ffc26e..000000000 --- a/drivers/block/lvm-snap.c +++ /dev/null @@ -1,434 +0,0 @@ -/* - * kernel/lvm-snap.c - * - * Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE - * - * LVM snapshot driver is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2, or (at your option) - * any later version. - * - * LVM driver is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with GNU CC; see the file COPYING. If not, write to - * the Free Software Foundation, 59 Temple Place - Suite 330, - * Boston, MA 02111-1307, USA. 
- * - */ - -#include <linux/kernel.h> -#include <linux/vmalloc.h> -#include <linux/blkdev.h> -#include <linux/smp_lock.h> -#include <linux/types.h> -#include <linux/iobuf.h> -#include <linux/lvm.h> - - -static char *lvm_snap_version __attribute__ ((unused)) = "LVM 0.8final (15/02/2000)\n"; - -extern const char *const lvm_name; -extern int lvm_blocksizes[]; - -void lvm_snapshot_release(lv_t *); - -#define hashfn(dev,block,mask,chunk_size) \ - ((HASHDEV(dev)^((block)/(chunk_size))) & (mask)) - -static inline lv_block_exception_t * -lvm_find_exception_table(kdev_t org_dev, unsigned long org_start, lv_t * lv) -{ - struct list_head * hash_table = lv->lv_snapshot_hash_table, * next; - unsigned long mask = lv->lv_snapshot_hash_mask; - int chunk_size = lv->lv_chunk_size; - lv_block_exception_t * ret; - int i = 0; - - hash_table = &hash_table[hashfn(org_dev, org_start, mask, chunk_size)]; - ret = NULL; - for (next = hash_table->next; next != hash_table; next = next->next) - { - lv_block_exception_t * exception; - - exception = list_entry(next, lv_block_exception_t, hash); - if (exception->rsector_org == org_start && - exception->rdev_org == org_dev) - { - if (i) - { - /* fun, isn't it? :) */ - list_del(next); - list_add(next, hash_table); - } - ret = exception; - break; - } - i++; - } - return ret; -} - -static inline void lvm_hash_link(lv_block_exception_t * exception, - kdev_t org_dev, unsigned long org_start, - lv_t * lv) -{ - struct list_head * hash_table = lv->lv_snapshot_hash_table; - unsigned long mask = lv->lv_snapshot_hash_mask; - int chunk_size = lv->lv_chunk_size; - - hash_table = &hash_table[hashfn(org_dev, org_start, mask, chunk_size)]; - list_add(&exception->hash, hash_table); -} - -int lvm_snapshot_remap_block(kdev_t * org_dev, unsigned long * org_sector, - unsigned long pe_start, lv_t * lv) -{ - int ret; - unsigned long pe_off, pe_adjustment, __org_start; - kdev_t __org_dev; - int chunk_size = lv->lv_chunk_size; - lv_block_exception_t * exception; - - pe_off = pe_start % chunk_size; - pe_adjustment = (*org_sector-pe_off) % chunk_size; - __org_start = *org_sector - pe_adjustment; - __org_dev = *org_dev; - - ret = 0; - exception = lvm_find_exception_table(__org_dev, __org_start, lv); - if (exception) - { - *org_dev = exception->rdev_new; - *org_sector = exception->rsector_new + pe_adjustment; - ret = 1; - } - return ret; -} - -static void lvm_drop_snapshot(lv_t * lv_snap, const char * reason) -{ - kdev_t last_dev; - int i; - - /* no exception storage space available for this snapshot - or error on this snapshot --> release it */ - invalidate_buffers(lv_snap->lv_dev); - - for (i = last_dev = 0; i < lv_snap->lv_remap_ptr; i++) { - if ( lv_snap->lv_block_exception[i].rdev_new != last_dev) { - last_dev = lv_snap->lv_block_exception[i].rdev_new; - invalidate_buffers(last_dev); - } - } - - lvm_snapshot_release(lv_snap); - - printk(KERN_INFO - "%s -- giving up to snapshot %s on %s due %s\n", - lvm_name, lv_snap->lv_snapshot_org->lv_name, lv_snap->lv_name, - reason); -} - -static inline void lvm_snapshot_prepare_blocks(unsigned long * blocks, - unsigned long start, - int nr_sectors, - int blocksize) -{ - int i, sectors_per_block, nr_blocks; - - sectors_per_block = blocksize >> 9; - nr_blocks = nr_sectors / sectors_per_block; - start /= sectors_per_block; - - for (i = 0; i < nr_blocks; i++) - blocks[i] = start++; -} - -static inline int get_blksize(kdev_t dev) -{ - int correct_size = BLOCK_SIZE, i, major; - - major = MAJOR(dev); - if (blksize_size[major]) - { - i = 
blksize_size[major][MINOR(dev)]; - if (i) - correct_size = i; - } - return correct_size; -} - -#ifdef DEBUG_SNAPSHOT -static inline void invalidate_snap_cache(unsigned long start, unsigned long nr, - kdev_t dev) -{ - struct buffer_head * bh; - int sectors_per_block, i, blksize, minor; - - minor = MINOR(dev); - blksize = lvm_blocksizes[minor]; - sectors_per_block = blksize >> 9; - nr /= sectors_per_block; - start /= sectors_per_block; - - for (i = 0; i < nr; i++) - { - bh = get_hash_table(dev, start++, blksize); - if (bh) - bforget(bh); - } -} -#endif - -/* - * copy on write handler for one snapshot logical volume - * - * read the original blocks and store it/them on the new one(s). - * if there is no exception storage space free any longer --> release snapshot. - * - * this routine gets called for each _first_ write to a physical chunk. - */ -int lvm_snapshot_COW(kdev_t org_phys_dev, - unsigned long org_phys_sector, - unsigned long org_pe_start, - unsigned long org_virt_sector, - lv_t * lv_snap) -{ - const char * reason; - unsigned long org_start, snap_start, snap_phys_dev, virt_start, pe_off; - int idx = lv_snap->lv_remap_ptr, chunk_size = lv_snap->lv_chunk_size; - struct kiobuf * iobuf; - unsigned long blocks[KIO_MAX_SECTORS]; - int blksize_snap, blksize_org, min_blksize, max_blksize; - int max_sectors, nr_sectors; - - /* check if we are out of snapshot space */ - if (idx >= lv_snap->lv_remap_end) - goto fail_out_of_space; - - /* calculate physical boundaries of source chunk */ - pe_off = org_pe_start % chunk_size; - org_start = org_phys_sector - ((org_phys_sector-pe_off) % chunk_size); - virt_start = org_virt_sector - (org_phys_sector - org_start); - - /* calculate physical boundaries of destination chunk */ - snap_phys_dev = lv_snap->lv_block_exception[idx].rdev_new; - snap_start = lv_snap->lv_block_exception[idx].rsector_new; - -#ifdef DEBUG_SNAPSHOT - printk(KERN_INFO - "%s -- COW: " - "org %02d:%02d faulting %lu start %lu, " - "snap %02d:%02d start %lu, " - "size %d, pe_start %lu pe_off %lu, virt_sec %lu\n", - lvm_name, - MAJOR(org_phys_dev), MINOR(org_phys_dev), org_phys_sector, - org_start, - MAJOR(snap_phys_dev), MINOR(snap_phys_dev), snap_start, - chunk_size, - org_pe_start, pe_off, - org_virt_sector); -#endif - - iobuf = lv_snap->lv_iobuf; - - blksize_org = get_blksize(org_phys_dev); - blksize_snap = get_blksize(snap_phys_dev); - max_blksize = max(blksize_org, blksize_snap); - min_blksize = min(blksize_org, blksize_snap); - max_sectors = KIO_MAX_SECTORS * (min_blksize>>9); - - if (chunk_size % (max_blksize>>9)) - goto fail_blksize; - - while (chunk_size) - { - nr_sectors = min(chunk_size, max_sectors); - chunk_size -= nr_sectors; - - iobuf->length = nr_sectors << 9; - - lvm_snapshot_prepare_blocks(blocks, org_start, - nr_sectors, blksize_org); - if (brw_kiovec(READ, 1, &iobuf, org_phys_dev, - blocks, blksize_org) != (nr_sectors<<9)) - goto fail_raw_read; - - lvm_snapshot_prepare_blocks(blocks, snap_start, - nr_sectors, blksize_snap); - if (brw_kiovec(WRITE, 1, &iobuf, snap_phys_dev, - blocks, blksize_snap) != (nr_sectors<<9)) - goto fail_raw_write; - } - -#ifdef DEBUG_SNAPSHOT - /* invalidate the logcial snapshot buffer cache */ - invalidate_snap_cache(virt_start, lv_snap->lv_chunk_size, - lv_snap->lv_dev); -#endif - - /* the original chunk is now stored on the snapshot volume - so update the execption table */ - lv_snap->lv_block_exception[idx].rdev_org = org_phys_dev; - lv_snap->lv_block_exception[idx].rsector_org = org_start; - lvm_hash_link(lv_snap->lv_block_exception + 
idx, - org_phys_dev, org_start, lv_snap); - lv_snap->lv_remap_ptr = idx + 1; - return 1; - - /* slow path */ - out: - lvm_drop_snapshot(lv_snap, reason); - return -1; - - fail_out_of_space: - reason = "out of space"; - goto out; - fail_raw_read: - reason = "read error"; - goto out; - fail_raw_write: - reason = "write error"; - goto out; - fail_blksize: - reason = "blocksize error"; - goto out; -} - -static int lvm_snapshot_alloc_iobuf_pages(struct kiobuf * iobuf, int sectors) -{ - int bytes, nr_pages, err, i; - - bytes = sectors << 9; - nr_pages = (bytes + ~PAGE_MASK) >> PAGE_SHIFT; - err = expand_kiobuf(iobuf, nr_pages); - if (err) - goto out; - - err = -ENOMEM; - iobuf->locked = 1; - iobuf->nr_pages = 0; - for (i = 0; i < nr_pages; i++) - { - struct page * page; - -#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,3,27) - page = alloc_page(GFP_KERNEL); - if (!page) - goto out; -#else - { - unsigned long addr = __get_free_page(GFP_USER); - if (!addr) - goto out; - iobuf->pagelist[i] = addr; - page = virt_to_page(addr); - } -#endif - - iobuf->maplist[i] = page; - /* the only point to lock the page here is to be allowed - to share unmap_kiobuf() in the fail-path */ -#ifndef LockPage -#define LockPage(map) set_bit(PG_locked, &(map)->flags) -#endif - LockPage(page); - iobuf->nr_pages++; - } - iobuf->offset = 0; - - err = 0; - out: - return err; -} - -static int calc_max_buckets(void) -{ - unsigned long mem; - - mem = num_physpages << PAGE_SHIFT; - mem /= 100; - mem *= 2; - mem /= sizeof(struct list_head); - - return mem; -} - -static int lvm_snapshot_alloc_hash_table(lv_t * lv) -{ - int err; - unsigned long buckets, max_buckets, size; - struct list_head * hash; - - buckets = lv->lv_remap_end; - max_buckets = calc_max_buckets(); - buckets = min(buckets, max_buckets); - while (buckets & (buckets-1)) - buckets &= (buckets-1); - - size = buckets * sizeof(struct list_head); - - err = -ENOMEM; - hash = vmalloc(size); - lv->lv_snapshot_hash_table = hash; - - if (!hash) - goto out; - - lv->lv_snapshot_hash_mask = buckets-1; - while (buckets--) - INIT_LIST_HEAD(hash+buckets); - err = 0; - out: - return err; -} - -int lvm_snapshot_alloc(lv_t * lv_snap) -{ - int err, blocksize, max_sectors; - - err = alloc_kiovec(1, &lv_snap->lv_iobuf); - if (err) - goto out; - - blocksize = lvm_blocksizes[MINOR(lv_snap->lv_dev)]; - max_sectors = KIO_MAX_SECTORS << (PAGE_SHIFT-9); - - err = lvm_snapshot_alloc_iobuf_pages(lv_snap->lv_iobuf, max_sectors); - if (err) - goto out_free_kiovec; - - err = lvm_snapshot_alloc_hash_table(lv_snap); - if (err) - goto out_free_kiovec; - out: - return err; - - out_free_kiovec: - unmap_kiobuf(lv_snap->lv_iobuf); - free_kiovec(1, &lv_snap->lv_iobuf); - goto out; -} - -void lvm_snapshot_release(lv_t * lv) -{ - if (lv->lv_block_exception) - { - vfree(lv->lv_block_exception); - lv->lv_block_exception = NULL; - } - if (lv->lv_snapshot_hash_table) - { - vfree(lv->lv_snapshot_hash_table); - lv->lv_snapshot_hash_table = NULL; - } - if (lv->lv_iobuf) - { - free_kiovec(1, &lv->lv_iobuf); - lv->lv_iobuf = NULL; - } -} diff --git a/drivers/block/lvm.c b/drivers/block/lvm.c deleted file mode 100644 index d8cf20d79..000000000 --- a/drivers/block/lvm.c +++ /dev/null @@ -1,2532 +0,0 @@ -/* - * kernel/lvm.c - * - * Copyright (C) 1997 - 2000 Heinz Mauelshagen, Germany - * - * February-November 1997 - * April-May,July-August,November 1998 - * January-March,May,July,September,October 1999 - * January,February 2000 - * - * - * LVM driver is free software; you can redistribute it and/or modify - * it under the 
terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2, or (at your option) - * any later version. - * - * LVM driver is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with GNU CC; see the file COPYING. If not, write to - * the Free Software Foundation, 59 Temple Place - Suite 330, - * Boston, MA 02111-1307, USA. - * - */ - -/* - * Changelog - * - * 09/11/1997 - added chr ioctls VG_STATUS_GET_COUNT - * and VG_STATUS_GET_NAMELIST - * 18/01/1998 - change lvm_chr_open/close lock handling - * 30/04/1998 - changed LV_STATUS ioctl to LV_STATUS_BYNAME and - * - added LV_STATUS_BYINDEX ioctl - * - used lvm_status_byname_req_t and - * lvm_status_byindex_req_t vars - * 04/05/1998 - added multiple device support - * 08/05/1998 - added support to set/clear extendable flag in volume group - * 09/05/1998 - changed output of lvm_proc_get_info() because of - * support for free (eg. longer) logical volume names - * 12/05/1998 - added spin_locks (thanks to Pascal van Dam - * <pascal@ramoth.xs4all.nl>) - * 25/05/1998 - fixed handling of locked PEs in lvm_map() and lvm_chr_ioctl() - * 26/05/1998 - reactivated verify_area by access_ok - * 07/06/1998 - used vmalloc/vfree instead of kmalloc/kfree to go - * beyond 128/256 KB max allocation limit per call - * - #ifdef blocked spin_lock calls to avoid compile errors - * with 2.0.x - * 11/06/1998 - another enhancement to spinlock code in lvm_chr_open() - * and use of LVM_VERSION_CODE instead of my own macros - * (thanks to Michael Marxmeier <mike@msede.com>) - * 07/07/1998 - added statistics in lvm_map() - * 08/07/1998 - saved statistics in lvm_do_lv_extend_reduce() - * 25/07/1998 - used __initfunc macro - * 02/08/1998 - changes for official char/block major numbers - * 07/08/1998 - avoided init_module() and cleanup_module() to be static - * 30/08/1998 - changed VG lv_open counter from sum of LV lv_open counters - * to sum of LVs open (no matter how often each is) - * 01/09/1998 - fixed lvm_gendisk.part[] index error - * 07/09/1998 - added copying of lv_current_pe-array - * in LV_STATUS_BYINDEX ioctl - * 17/11/1998 - added KERN_* levels to printk - * 13/01/1999 - fixed LV index bug in lvm_do_lv_create() which hit lvrename - * 07/02/1999 - fixed spinlock handling bug in case of LVM_RESET - * by moving spinlock code from lvm_chr_open() - * to lvm_chr_ioctl() - * - added LVM_LOCK_LVM ioctl to lvm_chr_ioctl() - * - allowed LVM_RESET and retrieval commands to go ahead; - * only other update ioctls are blocked now - * - fixed pv->pe to NULL for pv_status - * - using lv_req structure in lvm_chr_ioctl() now - * - fixed NULL ptr reference bug in lvm_do_lv_extend_reduce() - * caused by uncontiguous PV array in lvm_chr_ioctl(VG_REDUCE) - * 09/02/1999 - changed BLKRASET and BLKRAGET in lvm_chr_ioctl() to - * handle lgoical volume private read ahead sector - * - implemented LV read_ahead handling with lvm_blk_read() - * and lvm_blk_write() - * 10/02/1999 - implemented 2.[12].* support function lvm_hd_name() - * to be used in drivers/block/genhd.c by disk_name() - * 12/02/1999 - fixed index bug in lvm_blk_ioctl(), HDIO_GETGEO - * - enhanced gendisk insert/remove handling - * 16/02/1999 - changed to dynamic block minor number allocation to - * have as much as 99 volume 
groups with 256 logical volumes - * as the grand total; this allows having 1 volume group with - * up to 256 logical volumes in it - * 21/02/1999 - added LV open count information to proc filesystem - * - substituted redundant LVM_RESET code by calls - * to lvm_do_vg_remove() - * 22/02/1999 - used schedule_timeout() to be more responsive - * in case of lvm_do_vg_remove() with lots of logical volumes - * 19/03/1999 - fixed NULL pointer bug in module_init/lvm_init - * 17/05/1999 - used DECLARE_WAIT_QUEUE_HEAD macro (>2.3.0) - * - enhanced lvm_hd_name support - * 03/07/1999 - avoided use of KERNEL_VERSION macro based ifdefs and - * memcpy_tofs/memcpy_fromfs macro redefinitions - * 06/07/1999 - corrected reads/writes statistic counter copy in case - * of striped logical volume - * 28/07/1999 - implemented snapshot logical volumes - * - lvm_chr_ioctl - * - LV_STATUS_BYINDEX - * - LV_STATUS_BYNAME - * - lvm_do_lv_create - * - lvm_do_lv_remove - * - lvm_map - * - new lvm_snapshot_remap_block - * - new lvm_snapshot_remap_new_block - * 08/10/1999 - implemented support for multiple snapshots per - * original logical volume - * 12/10/1999 - support for 2.3.19 - * 11/11/1999 - support for 2.3.28 - * 21/11/1999 - changed lvm_map() interface to buffer_head based - * 19/12/1999 - support for 2.3.33 - * 01/01/2000 - changed locking concept in lvm_map(), - * lvm_do_vg_create() and lvm_do_lv_remove() - * 15/01/2000 - fixed PV_FLUSH bug in lvm_chr_ioctl() - * 24/01/2000 - ported to 2.3.40 including Alan Cox's pointer changes etc. - * 29/01/2000 - used kmalloc/kfree again for all small structures - * 20/01/2000 - cleaned up lvm_chr_ioctl by moving code - * to seperated functions - * - avoided "/dev/" in proc filesystem output - * - avoided inline strings functions lvm_strlen etc. - * 14/02/2000 - support for 2.3.43 - * - integrated Andrea Arcangeli's snapshot code - * - */ - - -static char *lvm_version = "LVM version 0.8final by Heinz Mauelshagen (15/02/2000)\n"; -static char *lvm_short_version = "version 0.8final (15/02/2000)"; - -#define MAJOR_NR LVM_BLK_MAJOR -#define DEVICE_OFF(device) - -#include <linux/config.h> -#include <linux/version.h> - -#ifdef MODVERSIONS -#undef MODULE -#define MODULE -#include <linux/modversions.h> -#endif - -#include <linux/module.h> - -#include <linux/kernel.h> -#include <linux/vmalloc.h> -#include <linux/slab.h> -#include <linux/init.h> - -#include <linux/hdreg.h> -#include <linux/stat.h> -#include <linux/fs.h> -#include <linux/proc_fs.h> -#include <linux/blkdev.h> -#include <linux/genhd.h> -#include <linux/locks.h> -#include <linux/smp_lock.h> -#include <asm/ioctl.h> -#include <asm/segment.h> -#include <asm/uaccess.h> - -#ifdef CONFIG_KERNELD -#include <linux/kerneld.h> -#endif - -#define LOCAL_END_REQUEST - -#include <linux/blk.h> -#include <linux/blkpg.h> - -#include <linux/errno.h> -#include <linux/lvm.h> - -#define LVM_CORRECT_READ_AHEAD(a) \ - (((a) < LVM_MIN_READ_AHEAD || (a) > LVM_MAX_READ_AHEAD) \ - ? 
LVM_MAX_READ_AHEAD : (a)) - -#ifndef WRITEA -# define WRITEA WRITE -#endif - -/* - * External function prototypes - */ -#ifdef MODULE -int init_module(void); -void cleanup_module(void); -#else -extern int lvm_init(void); -#endif - -static void lvm_dummy_device_request(request_queue_t *); -#define DEVICE_REQUEST lvm_dummy_device_request - -static int lvm_make_request_fn(request_queue_t *, int, struct buffer_head*); -static void lvm_plug_device_noop(request_queue_t *, kdev_t); - -static int lvm_blk_ioctl(struct inode *, struct file *, uint, ulong); -static int lvm_blk_open(struct inode *, struct file *); - -static int lvm_chr_open(struct inode *, struct file *); - -static int lvm_chr_close(struct inode *, struct file *); -static int lvm_blk_close(struct inode *, struct file *); - -static int lvm_chr_ioctl(struct inode *, struct file *, uint, ulong); - -#if defined CONFIG_LVM_PROC_FS && defined CONFIG_PROC_FS -static int lvm_proc_get_info(char *, char **, off_t, int); -static int (*lvm_proc_get_info_ptr) (char *, char **, off_t, int) = -&lvm_proc_get_info; -#endif - -#ifdef LVM_HD_NAME -void lvm_hd_name(char *, int); -#endif -/* End external function prototypes */ - - -/* - * Internal function prototypes - */ -static void lvm_init_vars(void); - -/* external snapshot calls */ -int lvm_snapshot_remap_block(kdev_t *, ulong *, ulong, lv_t *); -int lvm_snapshot_COW(kdev_t, ulong, ulong, ulong, lv_t *); -int lvm_snapshot_alloc(lv_t *); -void lvm_snapshot_release(lv_t *); - -#ifdef LVM_HD_NAME -extern void (*lvm_hd_name_ptr) (char *, int); -#endif -static int lvm_map(struct buffer_head *, int); -static int lvm_do_lock_lvm(void); -static int lvm_do_le_remap(vg_t *, void *); -static int lvm_do_pe_lock_unlock(vg_t *r, void *); -static int lvm_do_vg_create(int, void *); -static int lvm_do_vg_extend(vg_t *, void *); -static int lvm_do_vg_reduce(vg_t *, void *); -static int lvm_do_vg_remove(int); -static int lvm_do_lv_create(int, char *, lv_t *); -static int lvm_do_lv_remove(int, char *, int); -static int lvm_do_lv_extend_reduce(int, char *, lv_t *); -static int lvm_do_lv_status_byname(vg_t *r, void *); -static int lvm_do_lv_status_byindex(vg_t *, void *arg); -static int lvm_do_pv_change(vg_t*, void*); -static int lvm_do_pv_status(vg_t *, void *); -static void lvm_geninit(struct gendisk *); -#ifdef LVM_GET_INODE -static struct inode *lvm_get_inode(int); -void lvm_clear_inode(struct inode *); -#endif -/* END Internal function prototypes */ - - -/* volume group descriptor area pointers */ -static vg_t *vg[ABS_MAX_VG]; -static pv_t *pvp = NULL; -static lv_t *lvp = NULL; -static pe_t *pep = NULL; -static pe_t *pep1 = NULL; - - -/* map from block minor number to VG and LV numbers */ -typedef struct { - int vg_number; - int lv_number; -} vg_lv_map_t; -static vg_lv_map_t vg_lv_map[ABS_MAX_LV]; - - -/* Request structures (lvm_chr_ioctl()) */ -static pv_change_req_t pv_change_req; -static pv_flush_req_t pv_flush_req; -static pv_status_req_t pv_status_req; -static pe_lock_req_t pe_lock_req; -static le_remap_req_t le_remap_req; -static lv_req_t lv_req; - -#ifdef LVM_TOTAL_RESET -static int lvm_reset_spindown = 0; -#endif - -static char pv_name[NAME_LEN]; -/* static char rootvg[NAME_LEN] = { 0, }; */ -static uint lv_open = 0; -const char *const lvm_name = LVM_NAME; -static int lock = 0; -static int loadtime = 0; -static uint vg_count = 0; -static long lvm_chr_open_count = 0; -static ushort lvm_iop_version = LVM_DRIVER_IOP_VERSION; -static DECLARE_WAIT_QUEUE_HEAD(lvm_snapshot_wait); -static 
DECLARE_WAIT_QUEUE_HEAD(lvm_wait); -static DECLARE_WAIT_QUEUE_HEAD(lvm_map_wait); - -static spinlock_t lvm_lock = SPIN_LOCK_UNLOCKED; - -static struct file_operations lvm_chr_fops = -{ - owner: THIS_MODULE, - open: lvm_chr_open, - release: lvm_chr_close, - ioctl: lvm_chr_ioctl, -}; - -static struct block_device_operations lvm_blk_dops = -{ - open: lvm_blk_open, - release: lvm_blk_close, - ioctl: lvm_blk_ioctl -}; - -/* gendisk structures */ -static struct hd_struct lvm_hd_struct[MAX_LV]; -static int lvm_blocksizes[MAX_LV] = -{0,}; -static int lvm_size[MAX_LV] = -{0,}; -static struct gendisk lvm_gendisk = -{ - MAJOR_NR, /* major # */ - LVM_NAME, /* name of major */ - 0, /* number of times minor is shifted - to get real minor */ - 1, /* maximum partitions per device */ - lvm_hd_struct, /* partition table */ - lvm_size, /* device size in blocks, copied - to block_size[] */ - MAX_LV, /* number or real devices */ - NULL, /* internal */ - NULL, /* pointer to next gendisk struct (internal) */ -}; - - -#ifdef MODULE -/* - * Module initialization... - */ -int init_module(void) -#else -/* - * Driver initialization... - */ -#ifdef __initfunc -__initfunc(int lvm_init(void)) -#else -int __init lvm_init(void) -#endif -#endif /* #ifdef MODULE */ -{ - struct gendisk *gendisk_ptr = NULL; - - if (register_chrdev(LVM_CHAR_MAJOR, lvm_name, &lvm_chr_fops) < 0) { - printk(KERN_ERR "%s -- register_chrdev failed\n", lvm_name); - return -EIO; - } - if (register_blkdev(MAJOR_NR, lvm_name, &lvm_blk_dops) < 0) - { - printk("%s -- register_blkdev failed\n", lvm_name); - if (unregister_chrdev(LVM_CHAR_MAJOR, lvm_name) < 0) - printk(KERN_ERR "%s -- unregister_chrdev failed\n", lvm_name); - return -EIO; - } -#if defined CONFIG_LVM_PROC_FS && defined CONFIG_PROC_FS - create_proc_info_entry(LVM_NAME, S_IFREG | S_IRUGO, - &proc_root, lvm_proc_get_info_ptr); -#endif - - lvm_init_vars(); - lvm_geninit(&lvm_gendisk); - - /* insert our gendisk at the corresponding major */ - if (gendisk_head != NULL) { - gendisk_ptr = gendisk_head; - while (gendisk_ptr->next != NULL && - gendisk_ptr->major > lvm_gendisk.major) { - gendisk_ptr = gendisk_ptr->next; - } - lvm_gendisk.next = gendisk_ptr->next; - gendisk_ptr->next = &lvm_gendisk; - } else { - gendisk_head = &lvm_gendisk; - lvm_gendisk.next = NULL; - } - -#ifdef LVM_HD_NAME - /* reference from drivers/block/genhd.c */ - lvm_hd_name_ptr = lvm_hd_name; -#endif - - blk_init_queue(BLK_DEFAULT_QUEUE(MAJOR_NR), DEVICE_REQUEST); - blk_queue_make_request(BLK_DEFAULT_QUEUE(MAJOR_NR), lvm_make_request_fn); - blk_queue_pluggable(BLK_DEFAULT_QUEUE(MAJOR_NR), lvm_plug_device_noop); - /* optional read root VGDA */ -/* - if ( *rootvg != 0) vg_read_with_pv_and_lv ( rootvg, &vg); -*/ - - printk(KERN_INFO - "%s%s -- " -#ifdef MODULE - "Module" -#else - "Driver" -#endif - " successfully initialized\n", - lvm_version, lvm_name); - - return 0; -} /* init_module() / lvm_init() */ - - -#ifdef MODULE -/* - * Module cleanup... 
- */ -void cleanup_module(void) -{ - struct gendisk *gendisk_ptr = NULL, *gendisk_ptr_prev = NULL; - - if (unregister_chrdev(LVM_CHAR_MAJOR, lvm_name) < 0) { - printk(KERN_ERR "%s -- unregister_chrdev failed\n", lvm_name); - } - if (unregister_blkdev(MAJOR_NR, lvm_name) < 0) { - printk(KERN_ERR "%s -- unregister_blkdev failed\n", lvm_name); - } - blk_cleanup_queue(BLK_DEFAULT_QUEUE(MAJOR_NR)); - - gendisk_ptr = gendisk_ptr_prev = gendisk_head; - while (gendisk_ptr != NULL) { - if (gendisk_ptr == &lvm_gendisk) - break; - gendisk_ptr_prev = gendisk_ptr; - gendisk_ptr = gendisk_ptr->next; - } - /* delete our gendisk from chain */ - if (gendisk_ptr == &lvm_gendisk) - gendisk_ptr_prev->next = gendisk_ptr->next; - - blk_size[MAJOR_NR] = NULL; - blksize_size[MAJOR_NR] = NULL; - -#if defined CONFIG_LVM_PROC_FS && defined CONFIG_PROC_FS - remove_proc_entry(LVM_NAME, &proc_root); -#endif - -#ifdef LVM_HD_NAME - /* reference from linux/drivers/block/genhd.c */ - lvm_hd_name_ptr = NULL; -#endif - - printk(KERN_INFO "%s -- Module successfully deactivated\n", lvm_name); - - return; -} /* void cleanup_module() */ -#endif /* #ifdef MODULE */ - - -/* - * support function to initialize lvm variables - */ -#ifdef __initfunc -__initfunc(void lvm_init_vars(void)) -#else -void __init lvm_init_vars(void) -#endif -{ - int v; - - loadtime = CURRENT_TIME; - - pe_lock_req.lock = UNLOCK_PE; - pe_lock_req.data.lv_dev = \ - pe_lock_req.data.pv_dev = \ - pe_lock_req.data.pv_offset = 0; - - /* Initialize VG pointers */ - for (v = 0; v < ABS_MAX_VG; v++) vg[v] = NULL; - - /* Initialize LV -> VG association */ - for (v = 0; v < ABS_MAX_LV; v++) { - /* index ABS_MAX_VG never used for real VG */ - vg_lv_map[v].vg_number = ABS_MAX_VG; - vg_lv_map[v].lv_number = -1; - } - - return; -} /* lvm_init_vars() */ - - -/******************************************************************** - * - * Character device functions - * - ********************************************************************/ - -/* - * character device open routine - */ -static int lvm_chr_open(struct inode *inode, - struct file *file) -{ - int minor = MINOR(inode->i_rdev); - -#ifdef DEBUG - printk(KERN_DEBUG - "%s -- lvm_chr_open MINOR: %d VG#: %d mode: 0x%X lock: %d\n", - lvm_name, minor, VG_CHR(minor), file->f_mode, lock); -#endif - - /* super user validation */ - if (!capable(CAP_SYS_ADMIN)) return -EACCES; - - /* Group special file open */ - if (VG_CHR(minor) > MAX_VG) return -ENXIO; - - lvm_chr_open_count++; - return 0; -} /* lvm_chr_open() */ - - -/* - * character device i/o-control routine - * - * Only one changing process can do changing ioctl at one time, - * others will block. 
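The single-writer rule described here is enforced by lvm_do_lock_lvm() further down: the lock variable holds the pid of the current owner, contenders sleep on lvm_wait, and lvm_chr_close() releases the lock on behalf of the owning pid. Boiled down to its acquire/release pair, the pattern looks roughly like the following sketch (illustrative restatement, not the driver's exact code):

    /* acquire: spin until the pid lock is free or already ours */
    spin_lock(&lvm_lock);
    while (lock != 0 && lock != current->pid) {
            spin_unlock(&lvm_lock);
            interruptible_sleep_on(&lvm_wait);  /* woken by wake_up_interruptible() */
            if (current->sigpending)
                    return -EINTR;              /* signal arrived, give up */
            spin_lock(&lvm_lock);
    }
    lock = current->pid;
    spin_unlock(&lvm_lock);

    /* release: done in lvm_chr_close() when the owner closes the device */
    lock = 0;
    wake_up_interruptible(&lvm_wait);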
- * - */ -static int lvm_chr_ioctl(struct inode *inode, struct file *file, - uint command, ulong a) -{ - int minor = MINOR(inode->i_rdev); - uint extendable, l, v; - void *arg = (void *) a; - lv_t lv; - vg_t* vg_ptr = vg[VG_CHR(minor)]; - - /* otherwise cc will complain about unused variables */ - (void) lvm_lock; - - -#ifdef DEBUG_IOCTL - printk(KERN_DEBUG - "%s -- lvm_chr_ioctl: command: 0x%X MINOR: %d " - "VG#: %d mode: 0x%X\n", - lvm_name, command, minor, VG_CHR(minor), file->f_mode); -#endif - -#ifdef LVM_TOTAL_RESET - if (lvm_reset_spindown > 0) return -EACCES; -#endif - - /* Main command switch */ - switch (command) { - case LVM_LOCK_LVM: - /* lock the LVM */ - return lvm_do_lock_lvm(); - - case LVM_GET_IOP_VERSION: - /* check lvm version to ensure driver/tools+lib - interoperability */ - if (copy_to_user(arg, &lvm_iop_version, sizeof(ushort)) != 0) - return -EFAULT; - return 0; - -#ifdef LVM_TOTAL_RESET - case LVM_RESET: - /* lock reset function */ - lvm_reset_spindown = 1; - for (v = 0; v < ABS_MAX_VG; v++) { - if (vg[v] != NULL) lvm_do_vg_remove(v); - } - -#ifdef MODULE - while (GET_USE_COUNT(&__this_module) < 1) - MOD_INC_USE_COUNT; - while (GET_USE_COUNT(&__this_module) > 1) - MOD_DEC_USE_COUNT; -#endif /* MODULE */ - lock = 0; /* release lock */ - wake_up_interruptible(&lvm_wait); - return 0; -#endif /* LVM_TOTAL_RESET */ - - - case LE_REMAP: - /* remap a logical extent (after moving the physical extent) */ - return lvm_do_le_remap(vg_ptr,arg); - - case PE_LOCK_UNLOCK: - /* lock/unlock i/o to a physical extent to move it to another - physical volume (move's done in user space's pvmove) */ - return lvm_do_pe_lock_unlock(vg_ptr,arg); - - case VG_CREATE: - /* create a VGDA */ - return lvm_do_vg_create(minor, arg); - - case VG_REMOVE: - /* remove an inactive VGDA */ - return lvm_do_vg_remove(minor); - - case VG_EXTEND: - /* extend a volume group */ - return lvm_do_vg_extend(vg_ptr,arg); - - case VG_REDUCE: - /* reduce a volume group */ - return lvm_do_vg_reduce(vg_ptr,arg); - - - case VG_SET_EXTENDABLE: - /* set/clear extendability flag of volume group */ - if (vg_ptr == NULL) return -ENXIO; - if (copy_from_user(&extendable, arg, sizeof(extendable)) != 0) - return -EFAULT; - - if (extendable == VG_EXTENDABLE || - extendable == ~VG_EXTENDABLE) { - if (extendable == VG_EXTENDABLE) - vg_ptr->vg_status |= VG_EXTENDABLE; - else - vg_ptr->vg_status &= ~VG_EXTENDABLE; - } else return -EINVAL; - return 0; - - - case VG_STATUS: - /* get volume group data (only the vg_t struct) */ - if (vg_ptr == NULL) return -ENXIO; - if (copy_to_user(arg, vg_ptr, sizeof(vg_t)) != 0) - return -EFAULT; - return 0; - - - case VG_STATUS_GET_COUNT: - /* get volume group count */ - if (copy_to_user(arg, &vg_count, sizeof(vg_count)) != 0) - return -EFAULT; - return 0; - - - case VG_STATUS_GET_NAMELIST: - /* get volume group count */ - for (l = v = 0; v < ABS_MAX_VG; v++) { - if (vg[v] != NULL) { - if (copy_to_user(arg + l++ * NAME_LEN, - vg[v]->vg_name, - NAME_LEN) != 0) - return -EFAULT; - } - } - return 0; - - - case LV_CREATE: - case LV_REMOVE: - case LV_EXTEND: - case LV_REDUCE: - /* create, remove, extend or reduce a logical volume */ - if (vg_ptr == NULL) return -ENXIO; - if (copy_from_user(&lv_req, arg, sizeof(lv_req)) != 0) - return -EFAULT; - - if (command != LV_REMOVE) { - if (copy_from_user(&lv, lv_req.lv, sizeof(lv_t)) != 0) - return -EFAULT; - } - switch (command) { - case LV_CREATE: - return lvm_do_lv_create(minor, lv_req.lv_name, &lv); - - case LV_REMOVE: - return lvm_do_lv_remove(minor, 
lv_req.lv_name, -1); - - case LV_EXTEND: - case LV_REDUCE: - return lvm_do_lv_extend_reduce(minor, lv_req.lv_name, &lv); - } - - - case LV_STATUS_BYNAME: - /* get status of a logical volume by name */ - return lvm_do_lv_status_byname(vg_ptr,arg); - - case LV_STATUS_BYINDEX: - /* get status of a logical volume by index */ - return lvm_do_lv_status_byindex(vg_ptr,arg); - - case PV_CHANGE: - /* change a physical volume */ - return lvm_do_pv_change(vg_ptr,arg); - - case PV_STATUS: - /* get physical volume data (pv_t structure only) */ - return lvm_do_pv_status(vg_ptr,arg); - - case PV_FLUSH: - /* physical volume buffer flush/invalidate */ - if (copy_from_user(&pv_flush_req, arg, - sizeof(pv_flush_req)) != 0) - return -EFAULT; - - for ( v = 0; v < ABS_MAX_VG; v++) { - unsigned int p; - if ( vg[v] == NULL) continue; - for ( p = 0; p < vg[v]->pv_max; p++) { - if ( vg[v]->pv[p] != NULL && - strcmp ( vg[v]->pv[p]->pv_name, - pv_flush_req.pv_name) == 0) { - fsync_dev ( vg[v]->pv[p]->pv_dev); - invalidate_buffers ( vg[v]->pv[p]->pv_dev); - return 0; - } - } - } - return 0; - - default: - printk(KERN_WARNING - "%s -- lvm_chr_ioctl: unknown command %x\n", - lvm_name, command); - return -EINVAL; - } - - return 0; -} /* lvm_chr_ioctl */ - - -/* - * character device close routine - */ -static int lvm_chr_close(struct inode *inode, struct file *file) -{ -#ifdef DEBUG - int minor = MINOR(inode->i_rdev); - printk(KERN_DEBUG - "%s -- lvm_chr_close VG#: %d\n", lvm_name, VG_CHR(minor)); -#endif - - lock_kernel(); -#ifdef LVM_TOTAL_RESET - if (lvm_reset_spindown > 0) { - lvm_reset_spindown = 0; - lvm_chr_open_count = 1; - } -#endif - - if (lvm_chr_open_count > 0) lvm_chr_open_count--; - if (lock == current->pid) { - lock = 0; /* release lock */ - wake_up_interruptible(&lvm_wait); - } - unlock_kernel(); - - return 0; -} /* lvm_chr_close() */ - - - -/******************************************************************** - * - * Block device functions - * - ********************************************************************/ - -/* - * block device open routine - */ -static int lvm_blk_open(struct inode *inode, struct file *file) -{ - int minor = MINOR(inode->i_rdev); - lv_t *lv_ptr; - vg_t *vg_ptr = vg[VG_BLK(minor)]; - -#ifdef DEBUG_LVM_BLK_OPEN - printk(KERN_DEBUG - "%s -- lvm_blk_open MINOR: %d VG#: %d LV#: %d mode: 0x%X\n", - lvm_name, minor, VG_BLK(minor), LV_BLK(minor), file->f_mode); -#endif - -#ifdef LVM_TOTAL_RESET - if (lvm_reset_spindown > 0) - return -EPERM; -#endif - - if (vg_ptr != NULL && - (vg_ptr->vg_status & VG_ACTIVE) && - (lv_ptr = vg_ptr->lv[LV_BLK(minor)]) != NULL && - LV_BLK(minor) >= 0 && - LV_BLK(minor) < vg_ptr->lv_max) { - - /* Check parallel LV spindown (LV remove) */ - if (lv_ptr->lv_status & LV_SPINDOWN) return -EPERM; - - /* Check inactive LV and open for read/write */ - if (file->f_mode & O_RDWR) { - if (!(lv_ptr->lv_status & LV_ACTIVE)) return -EPERM; - if (!(lv_ptr->lv_access & LV_WRITE)) return -EACCES; - } - - /* be sure to increment VG counter */ - if (lv_ptr->lv_open == 0) vg_ptr->lv_open++; - lv_ptr->lv_open++; - - MOD_INC_USE_COUNT; - -#ifdef DEBUG_LVM_BLK_OPEN - printk(KERN_DEBUG - "%s -- lvm_blk_open MINOR: %d VG#: %d LV#: %d size: %d\n", - lvm_name, minor, VG_BLK(minor), LV_BLK(minor), - lv_ptr->lv_size); -#endif - - return 0; - } - return -ENXIO; -} /* lvm_blk_open() */ - - -/* - * block device i/o-control routine - */ -static int lvm_blk_ioctl(struct inode *inode, struct file *file, - uint command, ulong a) -{ - int minor = MINOR(inode->i_rdev); - vg_t *vg_ptr = 
vg[VG_BLK(minor)]; - lv_t *lv_ptr = vg_ptr->lv[LV_BLK(minor)]; - void *arg = (void *) a; - struct hd_geometry *hd = (struct hd_geometry *) a; - -#ifdef DEBUG_IOCTL - printk(KERN_DEBUG - "%s -- lvm_blk_ioctl MINOR: %d command: 0x%X arg: %X " - "VG#: %d LV#: %d\n", - lvm_name, minor, command, (ulong) arg, - VG_BLK(minor), LV_BLK(minor)); -#endif - - switch (command) { - case BLKGETSIZE: - /* return device size */ -#ifdef DEBUG_IOCTL - printk(KERN_DEBUG - "%s -- lvm_blk_ioctl -- BLKGETSIZE: %u\n", - lvm_name, lv_ptr->lv_size); -#endif - if (put_user(lv_ptr->lv_size, (long *)arg)) - return -EFAULT; - break; - - - case BLKFLSBUF: - /* flush buffer cache */ - if (!capable(CAP_SYS_ADMIN)) return -EACCES; - -#ifdef DEBUG_IOCTL - printk(KERN_DEBUG - "%s -- lvm_blk_ioctl -- BLKFLSBUF\n", lvm_name); -#endif - fsync_dev(inode->i_rdev); - invalidate_buffers(inode->i_rdev); - break; - - - case BLKRASET: - /* set read ahead for block device */ - if (!capable(CAP_SYS_ADMIN)) return -EACCES; - -#ifdef DEBUG_IOCTL - printk(KERN_DEBUG - "%s -- lvm_blk_ioctl -- BLKRASET: %d sectors for %02X:%02X\n", - lvm_name, (long) arg, MAJOR(inode->i_rdev), minor); -#endif - if ((long) arg < LVM_MIN_READ_AHEAD || - (long) arg > LVM_MAX_READ_AHEAD) - return -EINVAL; - read_ahead[MAJOR_NR] = lv_ptr->lv_read_ahead = (long) arg; - break; - - - case BLKRAGET: - /* get current read ahead setting */ -#ifdef DEBUG_IOCTL - printk(KERN_DEBUG - "%s -- lvm_blk_ioctl -- BLKRAGET\n", lvm_name); -#endif - if (put_user(lv_ptr->lv_read_ahead, (long *)arg)) - return -EFAULT; - break; - - - case HDIO_GETGEO: - /* get disk geometry */ -#ifdef DEBUG_IOCTL - printk(KERN_DEBUG - "%s -- lvm_blk_ioctl -- HDIO_GETGEO\n", lvm_name); -#endif - if (hd == NULL) - return -EINVAL; - { - unsigned char heads = 64; - unsigned char sectors = 32; - long start = 0; - short cylinders = lv_ptr->lv_size / heads / sectors; - - if (copy_to_user((char *) &hd->heads, &heads, - sizeof(heads)) != 0 || - copy_to_user((char *) &hd->sectors, &sectors, - sizeof(sectors)) != 0 || - copy_to_user((short *) &hd->cylinders, - &cylinders, sizeof(cylinders)) != 0 || - copy_to_user((long *) &hd->start, &start, - sizeof(start)) != 0) - return -EFAULT; - } - -#ifdef DEBUG_IOCTL - printk(KERN_DEBUG - "%s -- lvm_blk_ioctl -- cylinders: %d\n", - lvm_name, lv_ptr->lv_size / heads / sectors); -#endif - break; - - - case LV_SET_ACCESS: - /* set access flags of a logical volume */ - if (!capable(CAP_SYS_ADMIN)) return -EACCES; - lv_ptr->lv_access = (ulong) arg; - break; - - - case LV_SET_STATUS: - /* set status flags of a logical volume */ - if (!capable(CAP_SYS_ADMIN)) return -EACCES; - if (!((ulong) arg & LV_ACTIVE) && lv_ptr->lv_open > 1) - return -EPERM; - lv_ptr->lv_status = (ulong) arg; - break; - - - case LV_SET_ALLOCATION: - /* set allocation flags of a logical volume */ - if (!capable(CAP_SYS_ADMIN)) return -EACCES; - lv_ptr->lv_allocation = (ulong) arg; - break; - - - default: - printk(KERN_WARNING - "%s -- lvm_blk_ioctl: unknown command %d\n", - lvm_name, command); - return -EINVAL; - } - - return 0; -} /* lvm_blk_ioctl() */ - - -/* - * block device close routine - */ -static int lvm_blk_close(struct inode *inode, struct file *file) -{ - int minor = MINOR(inode->i_rdev); - vg_t *vg_ptr = vg[VG_BLK(minor)]; - lv_t *lv_ptr = vg_ptr->lv[LV_BLK(minor)]; - -#ifdef DEBUG - printk(KERN_DEBUG - "%s -- lvm_blk_close MINOR: %d VG#: %d LV#: %d\n", - lvm_name, minor, VG_BLK(minor), LV_BLK(minor)); -#endif - - sync_dev(inode->i_rdev); - if (lv_ptr->lv_open == 1) vg_ptr->lv_open--; -
lv_ptr->lv_open--; - - MOD_DEC_USE_COUNT; - - return 0; -} /* lvm_blk_close() */ - - -#if defined CONFIG_LVM_PROC_FS && defined CONFIG_PROC_FS -/* - * Support function /proc-Filesystem - */ -#define LVM_PROC_BUF ( i == 0 ? dummy_buf : &buf[sz]) - -static int lvm_proc_get_info(char *page, char **start, off_t pos, int count) -{ - int c, i, l, p, v, vg_counter, pv_counter, lv_counter, lv_open_counter, - lv_open_total, pe_t_bytes, lv_block_exception_t_bytes, seconds; - static off_t sz; - off_t sz_last; - char allocation_flag, inactive_flag, rw_flag, stripes_flag; - char *lv_name, *pv_name; - static char *buf = NULL; - static char dummy_buf[160]; /* sized for 2 lines */ - vg_t *vg_ptr; - lv_t *lv_ptr; - pv_t *pv_ptr; - - -#ifdef DEBUG_LVM_PROC_GET_INFO - printk(KERN_DEBUG - "%s - lvm_proc_get_info CALLED pos: %lu count: %d whence: %d\n", - lvm_name, pos, count, whence); -#endif - - if (pos == 0 || buf == NULL) { - sz_last = vg_counter = pv_counter = lv_counter = lv_open_counter = \ - lv_open_total = pe_t_bytes = lv_block_exception_t_bytes = 0; - - /* search for activity */ - for (v = 0; v < ABS_MAX_VG; v++) { - if ((vg_ptr = vg[v]) != NULL) { - vg_counter++; - pv_counter += vg_ptr->pv_cur; - lv_counter += vg_ptr->lv_cur; - if (vg_ptr->lv_cur > 0) { - for (l = 0; l < vg[v]->lv_max; l++) { - if ((lv_ptr = vg_ptr->lv[l]) != NULL) { - pe_t_bytes += lv_ptr->lv_allocated_le; - if (lv_ptr->lv_block_exception != NULL) - lv_block_exception_t_bytes += lv_ptr->lv_remap_end; - if (lv_ptr->lv_open > 0) { - lv_open_counter++; - lv_open_total += lv_ptr->lv_open; - } - } - } - } - } - } - pe_t_bytes *= sizeof(pe_t); - lv_block_exception_t_bytes *= sizeof(lv_block_exception_t); - - if (buf != NULL) { -#ifdef DEBUG_KFREE - printk(KERN_DEBUG - "%s -- kfree %d\n", lvm_name, __LINE__); -#endif - kfree(buf); - buf = NULL; - } - /* 2 times: first to get size to allocate buffer, - 2nd to fill the malloced buffer */ - for (i = 0; i < 2; i++) { - sz = 0; - sz += sprintf(LVM_PROC_BUF, - "LVM " -#ifdef MODULE - "module" -#else - "driver" -#endif - " %s\n\n" - "Total: %d VG%s %d PV%s %d LV%s ", - lvm_short_version, - vg_counter, vg_counter == 1 ? "" : "s", - pv_counter, pv_counter == 1 ? "" : "s", - lv_counter, lv_counter == 1 ? "" : "s"); - sz += sprintf(LVM_PROC_BUF, - "(%d LV%s open", - lv_open_counter, - lv_open_counter == 1 ? "" : "s"); - if (lv_open_total > 0) - sz += sprintf(LVM_PROC_BUF, - " %d times)\n", - lv_open_total); - else - sz += sprintf(LVM_PROC_BUF, ")"); - sz += sprintf(LVM_PROC_BUF, - "\nGlobal: %lu bytes malloced IOP version: %d ", - vg_counter * sizeof(vg_t) + - pv_counter * sizeof(pv_t) + - lv_counter * sizeof(lv_t) + - pe_t_bytes + lv_block_exception_t_bytes + sz_last, - lvm_iop_version); - - seconds = CURRENT_TIME - loadtime; - if (seconds < 0) - loadtime = CURRENT_TIME + seconds; - if (seconds / 86400 > 0) { - sz += sprintf(LVM_PROC_BUF, "%d day%s ", - seconds / 86400, - seconds / 86400 == 0 || - seconds / 86400 > 1 ? 
"s" : ""); - } - sz += sprintf(LVM_PROC_BUF, "%d:%02d:%02d active\n", - (seconds % 86400) / 3600, - (seconds % 3600) / 60, - seconds % 60); - - if (vg_counter > 0) { - for (v = 0; v < ABS_MAX_VG; v++) { - /* volume group */ - if ((vg_ptr = vg[v]) != NULL) { - inactive_flag = ' '; - if (!(vg_ptr->vg_status & VG_ACTIVE)) inactive_flag = 'I'; - sz += sprintf(LVM_PROC_BUF, - "\nVG: %c%s [%d PV, %d LV/%d open] " - " PE Size: %d KB\n" - " Usage [KB/PE]: %d /%d total " - "%d /%d used %d /%d free", - inactive_flag, - vg_ptr->vg_name, - vg_ptr->pv_cur, - vg_ptr->lv_cur, - vg_ptr->lv_open, - vg_ptr->pe_size >> 1, - vg_ptr->pe_size * vg_ptr->pe_total >> 1, - vg_ptr->pe_total, - vg_ptr->pe_allocated * vg_ptr->pe_size >> 1, - vg_ptr->pe_allocated, - (vg_ptr->pe_total - vg_ptr->pe_allocated) * - vg_ptr->pe_size >> 1, - vg_ptr->pe_total - vg_ptr->pe_allocated); - - /* physical volumes */ - sz += sprintf(LVM_PROC_BUF, - "\n PV%s ", - vg_ptr->pv_cur == 1 ? ": " : "s:"); - c = 0; - for (p = 0; p < vg_ptr->pv_max; p++) { - if ((pv_ptr = vg_ptr->pv[p]) != NULL) { - inactive_flag = 'A'; - if (!(pv_ptr->pv_status & PV_ACTIVE)) - inactive_flag = 'I'; - allocation_flag = 'A'; - if (!(pv_ptr->pv_allocatable & PV_ALLOCATABLE)) - allocation_flag = 'N'; - pv_name = strchr(pv_ptr->pv_name+1,'/'); - if ( pv_name == 0) pv_name = pv_ptr->pv_name; - else pv_name++; - sz += sprintf(LVM_PROC_BUF, - "[%c%c] %-21s %8d /%-6d " - "%8d /%-6d %8d /%-6d", - inactive_flag, - allocation_flag, - pv_name, - pv_ptr->pe_total * - pv_ptr->pe_size >> 1, - pv_ptr->pe_total, - pv_ptr->pe_allocated * - pv_ptr->pe_size >> 1, - pv_ptr->pe_allocated, - (pv_ptr->pe_total - - pv_ptr->pe_allocated) * - pv_ptr->pe_size >> 1, - pv_ptr->pe_total - - pv_ptr->pe_allocated); - c++; - if (c < vg_ptr->pv_cur) - sz += sprintf(LVM_PROC_BUF, - "\n "); - } - } - - /* logical volumes */ - sz += sprintf(LVM_PROC_BUF, - "\n LV%s ", - vg_ptr->lv_cur == 1 ? 
": " : "s:"); - c = 0; - for (l = 0; l < vg[v]->lv_max; l++) { - if ((lv_ptr = vg_ptr->lv[l]) != NULL) { - inactive_flag = 'A'; - if (!(lv_ptr->lv_status & LV_ACTIVE)) - inactive_flag = 'I'; - rw_flag = 'R'; - if (lv_ptr->lv_access & LV_WRITE) - rw_flag = 'W'; - allocation_flag = 'D'; - if (lv_ptr->lv_allocation & LV_CONTIGUOUS) - allocation_flag = 'C'; - stripes_flag = 'L'; - if (lv_ptr->lv_stripes > 1) - stripes_flag = 'S'; - sz += sprintf(LVM_PROC_BUF, - "[%c%c%c%c", - inactive_flag, - rw_flag, - allocation_flag, - stripes_flag); - if (lv_ptr->lv_stripes > 1) - sz += sprintf(LVM_PROC_BUF, "%-2d", - lv_ptr->lv_stripes); - else - sz += sprintf(LVM_PROC_BUF, " "); - lv_name = strrchr(lv_ptr->lv_name, '/'); - if ( lv_name == 0) lv_name = lv_ptr->lv_name; - else lv_name++; - sz += sprintf(LVM_PROC_BUF, "] %-25s", lv_name); - if (strlen(lv_name) > 25) - sz += sprintf(LVM_PROC_BUF, - "\n "); - sz += sprintf(LVM_PROC_BUF, "%9d /%-6d ", - lv_ptr->lv_size >> 1, - lv_ptr->lv_size / vg[v]->pe_size); - - if (lv_ptr->lv_open == 0) - sz += sprintf(LVM_PROC_BUF, "close"); - else - sz += sprintf(LVM_PROC_BUF, "%dx open", - lv_ptr->lv_open); - c++; - if (c < vg_ptr->lv_cur) - sz += sprintf(LVM_PROC_BUF, - "\n "); - } - } - if (vg_ptr->lv_cur == 0) sz += sprintf(LVM_PROC_BUF, "none"); - sz += sprintf(LVM_PROC_BUF, "\n"); - } - } - } - if (buf == NULL) { - if ((buf = vmalloc(sz)) == NULL) { - sz = 0; - return sprintf(page, "%s - vmalloc error at line %d\n", - lvm_name, __LINE__); - } - } - sz_last = sz; - } - } - if (pos > sz - 1) { - vfree(buf); - buf = NULL; - return 0; - } - *start = &buf[pos]; - if (sz - pos < count) - return sz - pos; - else - return count; -} /* lvm_proc_get_info() */ -#endif /* #if defined CONFIG_LVM_PROC_FS && defined CONFIG_PROC_FS */ - - -/* - * block device support function for /usr/src/linux/drivers/block/ll_rw_blk.c - * (see init_module/lvm_init) - */ -static int lvm_map(struct buffer_head *bh, int rw) -{ - int minor = MINOR(bh->b_rdev); - ulong index; - ulong pe_start; - ulong size = bh->b_size >> 9; - ulong rsector_tmp = bh->b_rsector; - ulong rsector_sav; - kdev_t rdev_tmp = bh->b_rdev; - kdev_t rdev_sav; - lv_t *lv = vg[VG_BLK(minor)]->lv[LV_BLK(minor)]; - - - if (!(lv->lv_status & LV_ACTIVE)) { - printk(KERN_ALERT - "%s - lvm_map: ll_rw_blk for inactive LV %s\n", - lvm_name, lv->lv_name); - goto error; - } -/* - if ( lv->lv_access & LV_SNAPSHOT) - printk ( "%s -- %02d:%02d block: %lu rw: %d\n", lvm_name, MAJOR ( bh->b_dev), MINOR ( bh->b_dev), bh->b_blocknr, rw); - */ - - /* take care of snapshot chunk writes before - check for writable logical volume */ - if ((lv->lv_access & LV_SNAPSHOT) && - MAJOR(bh->b_rdev) != 0 && - MAJOR(bh->b_rdev) != MAJOR_NR && - (rw == WRITEA || rw == WRITE)) - { - printk ( "%s -- doing snapshot write for %02d:%02d[%02d:%02d] b_blocknr: %lu b_rsector: %lu\n", lvm_name, MAJOR ( bh->b_dev), MINOR ( bh->b_dev), MAJOR ( bh->b_rdev), MINOR ( bh->b_rdev), bh->b_blocknr, bh->b_rsector); - goto error; - } - - if ((rw == WRITE || rw == WRITEA) && - !(lv->lv_access & LV_WRITE)) { - printk(KERN_CRIT - "%s - lvm_map: ll_rw_blk write for readonly LV %s\n", - lvm_name, lv->lv_name); - goto error; - } -#ifdef DEBUG_MAP - printk(KERN_DEBUG - "%s - lvm_map minor:%d *rdev: %02d:%02d *rsector: %lu " - "size:%lu\n", - lvm_name, minor, - MAJOR(rdev_tmp), - MINOR(rdev_tmp), - rsector_tmp, size); -#endif - - if (rsector_tmp + size > lv->lv_size) { - printk(KERN_ALERT - "%s - lvm_map *rsector: %lu or size: %lu wrong for" - " minor: %2d\n", lvm_name, rsector_tmp, size, 
minor); - goto error; - } - rsector_sav = rsector_tmp; - rdev_sav = rdev_tmp; - -lvm_second_remap: - /* linear mapping */ - if (lv->lv_stripes < 2) { - /* get the index */ - index = rsector_tmp / vg[VG_BLK(minor)]->pe_size; - pe_start = lv->lv_current_pe[index].pe; - rsector_tmp = lv->lv_current_pe[index].pe + - (rsector_tmp % vg[VG_BLK(minor)]->pe_size); - rdev_tmp = lv->lv_current_pe[index].dev; - -#ifdef DEBUG_MAP - printk(KERN_DEBUG - "lv_current_pe[%ld].pe: %ld rdev: %02d:%02d rsector:%ld\n", - index, - lv->lv_current_pe[index].pe, - MAJOR(rdev_tmp), - MINOR(rdev_tmp), - rsector_tmp); -#endif - - /* striped mapping */ - } else { - ulong stripe_index; - ulong stripe_length; - - stripe_length = vg[VG_BLK(minor)]->pe_size * lv->lv_stripes; - stripe_index = (rsector_tmp % stripe_length) / lv->lv_stripesize; - index = rsector_tmp / stripe_length + - (stripe_index % lv->lv_stripes) * - (lv->lv_allocated_le / lv->lv_stripes); - pe_start = lv->lv_current_pe[index].pe; - rsector_tmp = lv->lv_current_pe[index].pe + - (rsector_tmp % stripe_length) - - (stripe_index % lv->lv_stripes) * lv->lv_stripesize - - stripe_index / lv->lv_stripes * - (lv->lv_stripes - 1) * lv->lv_stripesize; - rdev_tmp = lv->lv_current_pe[index].dev; - } - -#ifdef DEBUG_MAP - printk(KERN_DEBUG - "lv_current_pe[%ld].pe: %ld rdev: %02d:%02d rsector:%ld\n" - "stripe_length: %ld stripe_index: %ld\n", - index, - lv->lv_current_pe[index].pe, - MAJOR(rdev_tmp), - MINOR(rdev_tmp), - rsector_tmp, - stripe_length, - stripe_index); -#endif - - /* handle physical extents on the move */ - if (pe_lock_req.lock == LOCK_PE) { - if (rdev_tmp == pe_lock_req.data.pv_dev && - rsector_tmp >= pe_lock_req.data.pv_offset && - rsector_tmp < (pe_lock_req.data.pv_offset + - vg[VG_BLK(minor)]->pe_size)) { - sleep_on(&lvm_map_wait); - rsector_tmp = rsector_sav; - rdev_tmp = rdev_sav; - goto lvm_second_remap; - } - } - /* statistic */ - if (rw == WRITE || rw == WRITEA) - lv->lv_current_pe[index].writes++; - else - lv->lv_current_pe[index].reads++; - - /* snapshot volume exception handling on physical device address base */ - if (lv->lv_access & (LV_SNAPSHOT | LV_SNAPSHOT_ORG)) { - /* original logical volume */ - if (lv->lv_access & LV_SNAPSHOT_ORG) { - if (rw == WRITE || rw == WRITEA) - { - lv_t *lv_ptr; - - /* start with first snapshot and loop thrugh all of them */ - for (lv_ptr = lv->lv_snapshot_next; - lv_ptr != NULL; - lv_ptr = lv_ptr->lv_snapshot_next) { - down(&lv->lv_snapshot_org->lv_snapshot_sem); - /* do we still have exception storage for this snapshot free? 
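To make the linear remap in lvm_map() above concrete, assume a hypothetical PE size of 8192 sectors (4 MB) and a request for LV sector 20000; the worked arithmetic is:

    index   = 20000 / 8192;                 /* = 2, the third logical extent    */
    offset  = 20000 % 8192;                 /* = 3616 sectors into that extent  */
    rsector = lv_current_pe[2].pe + offset; /* sector on the physical volume    */
    rdev    = lv_current_pe[2].dev;         /* the PV holding that extent       */

The buffer head is then simply re-pointed at (rdev, rsector) and resubmitted, so the request ends up in the queue of the underlying physical device. The striped branch does the same thing after first spreading consecutive stripes round-robin across the lv_stripes sub-ranges of the extent table.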
*/ - if (lv_ptr->lv_block_exception != NULL) { - rdev_sav = rdev_tmp; - rsector_sav = rsector_tmp; - if (!lvm_snapshot_remap_block(&rdev_tmp, - &rsector_tmp, - pe_start, - lv_ptr)) { - /* create a new mapping */ - lvm_snapshot_COW(rdev_tmp, - rsector_tmp, - pe_start, - rsector_sav, - lv_ptr); - } - rdev_tmp = rdev_sav; - rsector_tmp = rsector_sav; - } - up(&lv->lv_snapshot_org->lv_snapshot_sem); - } - } - } else { - /* remap snapshot logical volume */ - down(&lv->lv_snapshot_sem); - if (lv->lv_block_exception != NULL) - lvm_snapshot_remap_block(&rdev_tmp, &rsector_tmp, pe_start, lv); - up(&lv->lv_snapshot_sem); - } - } - bh->b_rdev = rdev_tmp; - bh->b_rsector = rsector_tmp; - - return 1; - - error: - buffer_IO_error(bh); - return -1; -} /* lvm_map() */ - - -/* - * internal support functions - */ - -#ifdef LVM_HD_NAME -/* - * generate "hard disk" name - */ -void lvm_hd_name(char *buf, int minor) -{ - int len = 0; - lv_t *lv_ptr; - - if (vg[VG_BLK(minor)] == NULL || - (lv_ptr = vg[VG_BLK(minor)]->lv[LV_BLK(minor)]) == NULL) - return; - len = strlen(lv_ptr->lv_name) - 5; - memcpy(buf, &lv_ptr->lv_name[5], len); - buf[len] = 0; - return; -} -#endif - - -/* - * this one never should be called... - */ -static void lvm_dummy_device_request(request_queue_t * t) -{ - printk(KERN_EMERG - "%s -- oops, got lvm request for %02d:%02d [sector: %lu]\n", - lvm_name, - MAJOR(CURRENT->rq_dev), - MINOR(CURRENT->rq_dev), - CURRENT->sector); - return; -} - - -/* - * make request function - */ -static int lvm_make_request_fn(request_queue_t *q, int rw, struct buffer_head *bh) -{ - lvm_map(bh, rw); - return 1; -} - -/* - * plug device function is a noop because plugging has to happen - * in the queue of the physical blockdevice to allow the - * elevator to do a better job. 
- */ -static void lvm_plug_device_noop(request_queue_t *q, kdev_t dev) { } - -/******************************************************************** - * - * Character device support functions - * - ********************************************************************/ -/* - * character device support function logical volume manager lock - */ -static int lvm_do_lock_lvm(void) -{ -lock_try_again: - spin_lock(&lvm_lock); - if (lock != 0 && lock != current->pid) { -#ifdef DEBUG_IOCTL - printk(KERN_INFO "lvm_do_lock_lvm: %s is locked by pid %d ...\n", - lvm_name, lock); -#endif - spin_unlock(&lvm_lock); - interruptible_sleep_on(&lvm_wait); - if (current->sigpending != 0) - return -EINTR; -#ifdef LVM_TOTAL_RESET - if (lvm_reset_spindown > 0) - return -EACCES; -#endif - goto lock_try_again; - } - lock = current->pid; - spin_unlock(&lvm_lock); - return 0; -} /* lvm_do_lock_lvm */ - - -/* - * character device support function lock/unlock physical extent - */ -static int lvm_do_pe_lock_unlock(vg_t *vg_ptr, void *arg) -{ - uint p; - - if (vg_ptr == NULL) return -ENXIO; - if (copy_from_user(&pe_lock_req, arg, - sizeof(pe_lock_req_t)) != 0) return -EFAULT; - - switch (pe_lock_req.lock) { - case LOCK_PE: - for (p = 0; p < vg_ptr->pv_max; p++) { - if (vg_ptr->pv[p] != NULL && - pe_lock_req.data.pv_dev == - vg_ptr->pv[p]->pv_dev) - break; - } - if (p == vg_ptr->pv_max) return -ENXIO; - - pe_lock_req.lock = UNLOCK_PE; - fsync_dev(pe_lock_req.data.lv_dev); - pe_lock_req.lock = LOCK_PE; - break; - - case UNLOCK_PE: - pe_lock_req.lock = UNLOCK_PE; - pe_lock_req.data.lv_dev = \ - pe_lock_req.data.pv_dev = \ - pe_lock_req.data.pv_offset = 0; - wake_up(&lvm_map_wait); - break; - - default: - return -EINVAL; - } - return 0; -} - - -/* - * character device support function logical extent remap - */ -static int lvm_do_le_remap(vg_t *vg_ptr, void *arg) -{ - uint l, le; - lv_t *lv_ptr; - - if (vg_ptr == NULL) return -ENXIO; - if (copy_from_user(&le_remap_req, arg, - sizeof(le_remap_req_t)) != 0) - return -EFAULT; - - for (l = 0; l < vg_ptr->lv_max; l++) { - lv_ptr = vg_ptr->lv[l]; - if (lv_ptr != NULL && - strcmp(lv_ptr->lv_name, - le_remap_req.lv_name) == 0) { - for (le = 0; le < lv_ptr->lv_allocated_le; - le++) { - if (lv_ptr->lv_current_pe[le].dev == - le_remap_req.old_dev && - lv_ptr->lv_current_pe[le].pe == - le_remap_req.old_pe) { - lv_ptr->lv_current_pe[le].dev = - le_remap_req.new_dev; - lv_ptr->lv_current_pe[le].pe = - le_remap_req.new_pe; - return 0; - } - } - return -EINVAL; - } - } - return -ENXIO; -} /* lvm_do_le_remap() */ - - -/* - * character device support function VGDA create - */ -int lvm_do_vg_create(int minor, void *arg) -{ - int snaporg_minor = 0; - ulong l, p; - lv_t lv; - vg_t *vg_ptr; - pv_t *pv_ptr; - lv_t *lv_ptr; - - if (vg[VG_CHR(minor)] != NULL) return -EPERM; - - if ((vg_ptr = kmalloc(sizeof(vg_t),GFP_KERNEL)) == NULL) { - printk(KERN_CRIT - "%s -- VG_CREATE: kmalloc error VG at line %d\n", - lvm_name, __LINE__); - return -ENOMEM; - } - /* get the volume group structure */ - if (copy_from_user(vg_ptr, arg, sizeof(vg_t)) != 0) { - kfree(vg_ptr); - return -EFAULT; - } - /* we are not that active so far...
*/ - vg_ptr->vg_status &= ~VG_ACTIVE; - vg[VG_CHR(minor)] = vg_ptr; - - vg[VG_CHR(minor)]->pe_allocated = 0; - if (vg_ptr->pv_max > ABS_MAX_PV) { - printk(KERN_WARNING - "%s -- Can't activate VG: ABS_MAX_PV too small\n", - lvm_name); - kfree(vg_ptr); - vg[VG_CHR(minor)] = NULL; - return -EPERM; - } - if (vg_ptr->lv_max > ABS_MAX_LV) { - printk(KERN_WARNING - "%s -- Can't activate VG: ABS_MAX_LV too small for %u\n", - lvm_name, vg_ptr->lv_max); - kfree(vg_ptr); - vg_ptr = NULL; - return -EPERM; - } - /* get the physical volume structures */ - vg_ptr->pv_act = vg_ptr->pv_cur = 0; - for (p = 0; p < vg_ptr->pv_max; p++) { - /* user space address */ - if ((pvp = vg_ptr->pv[p]) != NULL) { - pv_ptr = vg_ptr->pv[p] = kmalloc(sizeof(pv_t),GFP_KERNEL); - if (pv_ptr == NULL) { - printk(KERN_CRIT - "%s -- VG_CREATE: kmalloc error PV at line %d\n", - lvm_name, __LINE__); - lvm_do_vg_remove(minor); - return -ENOMEM; - } - if (copy_from_user(pv_ptr, pvp, sizeof(pv_t)) != 0) { - lvm_do_vg_remove(minor); - return -EFAULT; - } - /* We don't need the PE list - in kernel space as with LVs pe_t list (see below) */ - pv_ptr->pe = NULL; - pv_ptr->pe_allocated = 0; - pv_ptr->pv_status = PV_ACTIVE; - vg_ptr->pv_act++; - vg_ptr->pv_cur++; - -#ifdef LVM_GET_INODE - /* insert a dummy inode for fs_may_mount */ - pv_ptr->inode = lvm_get_inode(pv_ptr->pv_dev); -#endif - } - } - - /* get the logical volume structures */ - vg_ptr->lv_cur = 0; - for (l = 0; l < vg_ptr->lv_max; l++) { - /* user space address */ - if ((lvp = vg_ptr->lv[l]) != NULL) { - if (copy_from_user(&lv, lvp, sizeof(lv_t)) != 0) { - lvm_do_vg_remove(minor); - return -EFAULT; - } - vg_ptr->lv[l] = NULL; - if (lvm_do_lv_create(minor, lv.lv_name, &lv) != 0) { - lvm_do_vg_remove(minor); - return -EFAULT; - } - } - } - - /* Second path to correct snapshot logical volumes which are not - in place during first path above */ - for (l = 0; l < vg_ptr->lv_max; l++) { - if ((lv_ptr = vg_ptr->lv[l]) != NULL && - vg_ptr->lv[l]->lv_access & LV_SNAPSHOT) { - snaporg_minor = lv_ptr->lv_snapshot_minor; - if (vg_ptr->lv[LV_BLK(snaporg_minor)] != NULL) { - /* get pointer to original logical volume */ - lv_ptr = vg_ptr->lv[l]->lv_snapshot_org = - vg_ptr->lv[LV_BLK(snaporg_minor)]; - - /* set necessary fields of original logical volume */ - lv_ptr->lv_access |= LV_SNAPSHOT_ORG; - lv_ptr->lv_snapshot_minor = 0; - lv_ptr->lv_snapshot_org = lv_ptr; - lv_ptr->lv_snapshot_prev = NULL; - - /* find last snapshot logical volume in the chain */ - while (lv_ptr->lv_snapshot_next != NULL) - lv_ptr = lv_ptr->lv_snapshot_next; - - /* set back pointer to this last one in our new logical volume */ - vg_ptr->lv[l]->lv_snapshot_prev = lv_ptr; - - /* last logical volume now points to our new snapshot volume */ - lv_ptr->lv_snapshot_next = vg_ptr->lv[l]; - - /* now point to the new one */ - lv_ptr = lv_ptr->lv_snapshot_next; - - /* set necessary fields of new snapshot logical volume */ - lv_ptr->lv_snapshot_next = NULL; - lv_ptr->lv_current_pe = - vg_ptr->lv[LV_BLK(snaporg_minor)]->lv_current_pe; - lv_ptr->lv_allocated_le = - vg_ptr->lv[LV_BLK(snaporg_minor)]->lv_allocated_le; - lv_ptr->lv_current_le = - vg_ptr->lv[LV_BLK(snaporg_minor)]->lv_current_le; - lv_ptr->lv_size = - vg_ptr->lv[LV_BLK(snaporg_minor)]->lv_size; - } - } - } - - vg_count++; - - /* let's go active */ - vg_ptr->vg_status |= VG_ACTIVE; - - MOD_INC_USE_COUNT; - - return 0; -} /* lvm_do_vg_create() */ - - -/* - * character device support function VGDA extend - */ -static int lvm_do_vg_extend(vg_t *vg_ptr, void *arg) -{ - uint 
p; - pv_t *pv_ptr; - - if (vg_ptr == NULL) return -ENXIO; - if (vg_ptr->pv_cur < vg_ptr->pv_max) { - for (p = 0; p < vg_ptr->pv_max; p++) { - if (vg_ptr->pv[p] == NULL) { - if ((pv_ptr = vg_ptr->pv[p] = kmalloc(sizeof(pv_t),GFP_KERNEL)) == NULL) { - printk(KERN_CRIT - "%s -- VG_EXTEND: kmalloc error PV at line %d\n", - lvm_name, __LINE__); - return -ENOMEM; - } - if (copy_from_user(pv_ptr, arg, sizeof(pv_t)) != 0) { - kfree(pv_ptr); - vg_ptr->pv[p] = NULL; - return -EFAULT; - } - - pv_ptr->pv_status = PV_ACTIVE; - /* We don't need the PE list - in kernel space like LVs pe_t list */ - pv_ptr->pe = NULL; - vg_ptr->pv_cur++; - vg_ptr->pv_act++; - vg_ptr->pe_total += - pv_ptr->pe_total; -#ifdef LVM_GET_INODE - /* insert a dummy inode for fs_may_mount */ - pv_ptr->inode = lvm_get_inode(pv_ptr->pv_dev); -#endif - return 0; - } - } - } -return -EPERM; -} /* lvm_do_vg_extend() */ - - -/* - * character device support function VGDA reduce - */ -static int lvm_do_vg_reduce(vg_t *vg_ptr, void *arg) -{ - uint p; - pv_t *pv_ptr; - - if (vg_ptr == NULL) return -ENXIO; - if (copy_from_user(pv_name, arg, sizeof(pv_name)) != 0) - return -EFAULT; - - for (p = 0; p < vg_ptr->pv_max; p++) { - pv_ptr = vg_ptr->pv[p]; - if (pv_ptr != NULL && - strcmp(pv_ptr->pv_name, - pv_name) == 0) { - if (pv_ptr->lv_cur > 0) return -EPERM; - vg_ptr->pe_total -= - pv_ptr->pe_total; - vg_ptr->pv_cur--; - vg_ptr->pv_act--; -#ifdef LVM_GET_INODE - lvm_clear_inode(pv_ptr->inode); -#endif - kfree(pv_ptr); - /* Make PV pointer array contiguous */ - for (; p < vg_ptr->pv_max - 1; p++) - vg_ptr->pv[p] = vg_ptr->pv[p + 1]; - vg_ptr->pv[p + 1] = NULL; - return 0; - } - } - return -ENXIO; -} /* lvm_do_vg_reduce */ - - -/* - * character device support function VGDA remove - */ -static int lvm_do_vg_remove(int minor) -{ - int i; - vg_t *vg_ptr = vg[VG_CHR(minor)]; - pv_t *pv_ptr; - - if (vg_ptr == NULL) return -ENXIO; - -#ifdef LVM_TOTAL_RESET - if (vg_ptr->lv_open > 0 && lvm_reset_spindown == 0) -#else - if (vg_ptr->lv_open > 0) -#endif - return -EPERM; - - /* let's go inactive */ - vg_ptr->vg_status &= ~VG_ACTIVE; - - /* free LVs */ - /* first free snapshot logical volumes */ - for (i = 0; i < vg_ptr->lv_max; i++) { - if (vg_ptr->lv[i] != NULL && - vg_ptr->lv[i]->lv_access & LV_SNAPSHOT) { - lvm_do_lv_remove(minor, NULL, i); - current->state = TASK_UNINTERRUPTIBLE; - schedule_timeout(1); - } - } - /* then free the rest of the LVs */ - for (i = 0; i < vg_ptr->lv_max; i++) { - if (vg_ptr->lv[i] != NULL) { - lvm_do_lv_remove(minor, NULL, i); - current->state = TASK_UNINTERRUPTIBLE; - schedule_timeout(1); - } - } - - /* free PVs */ - for (i = 0; i < vg_ptr->pv_max; i++) { - if ((pv_ptr = vg_ptr->pv[i]) != NULL) { -#ifdef DEBUG_KFREE - printk(KERN_DEBUG - "%s -- kfree %d\n", lvm_name, __LINE__); -#endif -#ifdef LVM_GET_INODE - lvm_clear_inode(pv_ptr->inode); -#endif - kfree(pv_ptr); - vg[VG_CHR(minor)]->pv[i] = NULL; - } - } - -#ifdef DEBUG_KFREE - printk(KERN_DEBUG "%s -- kfree %d\n", lvm_name, __LINE__); -#endif - kfree(vg_ptr); - vg[VG_CHR(minor)] = NULL; - - vg_count--; - - MOD_DEC_USE_COUNT; - - return 0; -} /* lvm_do_vg_remove() */ - - -/* - * character device support function logical volume create - */ -static int lvm_do_lv_create(int minor, char *lv_name, lv_t *lv) -{ - int l, le, l_new, p, size; - ulong lv_status_save; - lv_block_exception_t *lvbe = lv->lv_block_exception; - vg_t *vg_ptr = vg[VG_CHR(minor)]; - lv_t *lv_ptr = NULL; - - if ((pep = lv->lv_current_pe) == NULL) return -EINVAL; - if (lv->lv_chunk_size > 
LVM_SNAPSHOT_MAX_CHUNK) - return -EINVAL; - - for (l = 0; l < vg_ptr->lv_max; l++) { - if (vg_ptr->lv[l] != NULL && - strcmp(vg_ptr->lv[l]->lv_name, lv_name) == 0) - return -EEXIST; - } - - /* in case of lv_remove(), lv_create() pair; for eg. lvrename does this */ - l_new = -1; - if (vg_ptr->lv[lv->lv_number] == NULL) - l_new = lv->lv_number; - else { - for (l = 0; l < vg_ptr->lv_max; l++) { - if (vg_ptr->lv[l] == NULL) - if (l_new == -1) l_new = l; - } - } - if (l_new == -1) return -EPERM; - else l = l_new; - - if ((lv_ptr = kmalloc(sizeof(lv_t),GFP_KERNEL)) == NULL) {; - printk(KERN_CRIT "%s -- LV_CREATE: kmalloc error LV at line %d\n", - lvm_name, __LINE__); - return -ENOMEM; - } - /* copy preloaded LV */ - memcpy((char *) lv_ptr, (char *) lv, sizeof(lv_t)); - - lv_status_save = lv_ptr->lv_status; - lv_ptr->lv_status &= ~LV_ACTIVE; - lv_ptr->lv_snapshot_org = \ - lv_ptr->lv_snapshot_prev = \ - lv_ptr->lv_snapshot_next = NULL; - lv_ptr->lv_block_exception = NULL; - init_MUTEX(&lv_ptr->lv_snapshot_sem); - vg_ptr->lv[l] = lv_ptr; - - /* get the PE structures from user space if this - is no snapshot logical volume */ - if (!(lv_ptr->lv_access & LV_SNAPSHOT)) { - size = lv_ptr->lv_allocated_le * sizeof(pe_t); - if ((lv_ptr->lv_current_pe = vmalloc(size)) == NULL) { - printk(KERN_CRIT - "%s -- LV_CREATE: vmalloc error LV_CURRENT_PE of %d Byte " - "at line %d\n", - lvm_name, size, __LINE__); -#ifdef DEBUG_KFREE - printk(KERN_DEBUG "%s -- kfree %d\n", lvm_name, __LINE__); -#endif - kfree(lv_ptr); - vg[VG_CHR(minor)]->lv[l] = NULL; - return -ENOMEM; - } - if (copy_from_user(lv_ptr->lv_current_pe, pep, size)) { - vfree(lv_ptr->lv_current_pe); - kfree(lv_ptr); - vg_ptr->lv[l] = NULL; - return -EFAULT; - } - /* correct the PE count in PVs */ - for (le = 0; le < lv_ptr->lv_allocated_le; le++) { - vg_ptr->pe_allocated++; - for (p = 0; p < vg_ptr->pv_cur; p++) { - if (vg_ptr->pv[p]->pv_dev == - lv_ptr->lv_current_pe[le].dev) - vg_ptr->pv[p]->pe_allocated++; - } - } - } else { - /* Get snapshot exception data and block list */ - if (lvbe != NULL) { - lv_ptr->lv_snapshot_org = - vg_ptr->lv[LV_BLK(lv_ptr->lv_snapshot_minor)]; - if (lv_ptr->lv_snapshot_org != NULL) { - size = lv_ptr->lv_remap_end * sizeof(lv_block_exception_t); - if ((lv_ptr->lv_block_exception = vmalloc(size)) == NULL) { - printk(KERN_CRIT - "%s -- lvm_do_lv_create: vmalloc error LV_BLOCK_EXCEPTION " - "of %d byte at line %d\n", - lvm_name, size, __LINE__); -#ifdef DEBUG_KFREE - printk(KERN_DEBUG "%s -- kfree %d\n", lvm_name, __LINE__); -#endif - kfree(lv_ptr); - vg_ptr->lv[l] = NULL; - return -ENOMEM; - } - if (copy_from_user(lv_ptr->lv_block_exception, lvbe, size)) { - vfree(lv_ptr->lv_block_exception); - kfree(lv_ptr); - vg[VG_CHR(minor)]->lv[l] = NULL; - return -EFAULT; - } - /* get pointer to original logical volume */ - lv_ptr = lv_ptr->lv_snapshot_org; - - lv_ptr->lv_snapshot_minor = 0; - lv_ptr->lv_snapshot_org = lv_ptr; - lv_ptr->lv_snapshot_prev = NULL; - /* walk thrugh the snapshot list */ - while (lv_ptr->lv_snapshot_next != NULL) - lv_ptr = lv_ptr->lv_snapshot_next; - /* now lv_ptr points to the last existing snapshot in the chain */ - vg_ptr->lv[l]->lv_snapshot_prev = lv_ptr; - /* our new one now back points to the previous last in the chain */ - lv_ptr = vg_ptr->lv[l]; - /* now lv_ptr points to our new last snapshot logical volume */ - lv_ptr->lv_snapshot_org = lv_ptr->lv_snapshot_prev->lv_snapshot_org; - lv_ptr->lv_snapshot_next = NULL; - lv_ptr->lv_current_pe = lv_ptr->lv_snapshot_org->lv_current_pe; - 
lv_ptr->lv_allocated_le = lv_ptr->lv_snapshot_org->lv_allocated_le; - lv_ptr->lv_current_le = lv_ptr->lv_snapshot_org->lv_current_le; - lv_ptr->lv_size = lv_ptr->lv_snapshot_org->lv_size; - lv_ptr->lv_stripes = lv_ptr->lv_snapshot_org->lv_stripes; - lv_ptr->lv_stripesize = lv_ptr->lv_snapshot_org->lv_stripesize; - { - int err = lvm_snapshot_alloc(lv_ptr); - if (err) - { - vfree(lv_ptr->lv_block_exception); - kfree(lv_ptr); - vg[VG_CHR(minor)]->lv[l] = NULL; - return err; - } - } - } else { - vfree(lv_ptr->lv_block_exception); - kfree(lv_ptr); - vg_ptr->lv[l] = NULL; - return -EFAULT; - } - } else { - kfree(vg_ptr->lv[l]); - vg_ptr->lv[l] = NULL; - return -EINVAL; - } - } /* if ( vg[VG_CHR(minor)]->lv[l]->lv_access & LV_SNAPSHOT) */ - - lv_ptr = vg_ptr->lv[l]; - lvm_gendisk.part[MINOR(lv_ptr->lv_dev)].start_sect = 0; - lvm_gendisk.part[MINOR(lv_ptr->lv_dev)].nr_sects = lv_ptr->lv_size; - lvm_size[MINOR(lv_ptr->lv_dev)] = lv_ptr->lv_size >> 1; - vg_lv_map[MINOR(lv_ptr->lv_dev)].vg_number = vg_ptr->vg_number; - vg_lv_map[MINOR(lv_ptr->lv_dev)].lv_number = lv_ptr->lv_number; - read_ahead[MAJOR_NR] = lv_ptr->lv_read_ahead = LVM_CORRECT_READ_AHEAD(lv_ptr->lv_read_ahead); - vg_ptr->lv_cur++; - lv_ptr->lv_status = lv_status_save; - - /* optionally add our new snapshot LV */ - if (lv_ptr->lv_access & LV_SNAPSHOT) { - /* sync the original logical volume */ - fsync_dev(lv_ptr->lv_snapshot_org->lv_dev); - /* put ourselve into the chain */ - lv_ptr->lv_snapshot_prev->lv_snapshot_next = lv_ptr; - lv_ptr->lv_snapshot_org->lv_access |= LV_SNAPSHOT_ORG; - } - return 0; -} /* lvm_do_lv_create() */ - - -/* - * character device support function logical volume remove - */ -static int lvm_do_lv_remove(int minor, char *lv_name, int l) -{ - uint le, p; - vg_t *vg_ptr = vg[VG_CHR(minor)]; - lv_t *lv_ptr; - - if (l == -1) { - for (l = 0; l < vg_ptr->lv_max; l++) { - if (vg_ptr->lv[l] != NULL && - strcmp(vg_ptr->lv[l]->lv_name, lv_name) == 0) { - break; - } - } - } - if (l == vg_ptr->lv_max) return -ENXIO; - - lv_ptr = vg_ptr->lv[l]; -#ifdef LVM_TOTAL_RESET - if (lv_ptr->lv_open > 0 && lvm_reset_spindown == 0) -#else - if (lv_ptr->lv_open > 0) -#endif - return -EBUSY; - - /* check for deletion of snapshot source while - snapshot volume still exists */ - if ((lv_ptr->lv_access & LV_SNAPSHOT_ORG) && - lv_ptr->lv_snapshot_next != NULL) - return -EPERM; - - lv_ptr->lv_status |= LV_SPINDOWN; - - /* sync the buffers */ - fsync_dev(lv_ptr->lv_dev); - - lv_ptr->lv_status &= ~LV_ACTIVE; - - /* invalidate the buffers */ - invalidate_buffers(lv_ptr->lv_dev); - - /* reset generic hd */ - lvm_gendisk.part[MINOR(lv_ptr->lv_dev)].start_sect = -1; - lvm_gendisk.part[MINOR(lv_ptr->lv_dev)].nr_sects = 0; - lvm_size[MINOR(lv_ptr->lv_dev)] = 0; - - /* reset VG/LV mapping */ - vg_lv_map[MINOR(lv_ptr->lv_dev)].vg_number = ABS_MAX_VG; - vg_lv_map[MINOR(lv_ptr->lv_dev)].lv_number = -1; - - /* correct the PE count in PVs if this is no snapshot logical volume */ - if (!(lv_ptr->lv_access & LV_SNAPSHOT)) { - /* only if this is no snapshot logical volume because - we share the lv_current_pe[] structs with the - original logical volume */ - for (le = 0; le < lv_ptr->lv_allocated_le; le++) { - vg_ptr->pe_allocated--; - for (p = 0; p < vg_ptr->pv_cur; p++) { - if (vg_ptr->pv[p]->pv_dev == - lv_ptr->lv_current_pe[le].dev) - vg_ptr->pv[p]->pe_allocated--; - } - } - vfree(lv_ptr->lv_current_pe); - /* LV_SNAPSHOT */ - } else { - /* remove this snapshot logical volume from the chain */ - lv_ptr->lv_snapshot_prev->lv_snapshot_next = 
lv_ptr->lv_snapshot_next; - if (lv_ptr->lv_snapshot_next != NULL) { - lv_ptr->lv_snapshot_next->lv_snapshot_prev = - lv_ptr->lv_snapshot_prev; - } - /* no more snapshots? */ - if (lv_ptr->lv_snapshot_org->lv_snapshot_next == NULL) - lv_ptr->lv_snapshot_org->lv_access &= ~LV_SNAPSHOT_ORG; - lvm_snapshot_release(lv_ptr); - } - -#ifdef DEBUG_KFREE - printk(KERN_DEBUG "%s -- kfree %d\n", lvm_name, __LINE__); -#endif - kfree(lv_ptr); - vg_ptr->lv[l] = NULL; - vg_ptr->lv_cur--; - return 0; -} /* lvm_do_lv_remove() */ - - -/* - * character device support function logical volume extend / reduce - */ -static int lvm_do_lv_extend_reduce(int minor, char *lv_name, lv_t *lv) -{ - int l, le, p, size, old_allocated_le; - uint32_t end, lv_status_save; - vg_t *vg_ptr = vg[VG_CHR(minor)]; - lv_t *lv_ptr; - pe_t *pe; - - if ((pep = lv->lv_current_pe) == NULL) return -EINVAL; - - for (l = 0; l < vg_ptr->lv_max; l++) { - if (vg_ptr->lv[l] != NULL && - strcmp(vg_ptr->lv[l]->lv_name, lv_name) == 0) - break; - } - if (l == vg_ptr->lv_max) return -ENXIO; - lv_ptr = vg_ptr->lv[l]; - - /* check for active snapshot */ - if (lv->lv_access & (LV_SNAPSHOT | LV_SNAPSHOT_ORG)) return -EPERM; - - if ((pe = vmalloc(size = lv->lv_current_le * sizeof(pe_t))) == NULL) { - printk(KERN_CRIT - "%s -- lvm_do_lv_extend_reduce: vmalloc error LV_CURRENT_PE " - "of %d Byte at line %d\n", - lvm_name, size, __LINE__); - return -ENOMEM; - } - /* get the PE structures from user space */ - if (copy_from_user(pe, pep, size)) { - vfree(pe); - return -EFAULT; - } - -#ifdef DEBUG - printk(KERN_DEBUG - "%s -- fsync_dev and " - "invalidate_buffers for %s [%s] in %s\n", - lvm_name, lv_ptr->lv_name, - kdevname(lv_ptr->lv_dev), - vg_ptr->vg_name); -#endif - - lv_ptr->lv_status |= LV_SPINDOWN; - fsync_dev(lv_ptr->lv_dev); - lv_ptr->lv_status &= ~LV_ACTIVE; - invalidate_buffers(lv_ptr->lv_dev); - - /* reduce allocation counters on PV(s) */ - for (le = 0; le < lv_ptr->lv_allocated_le; le++) { - vg_ptr->pe_allocated--; - for (p = 0; p < vg_ptr->pv_cur; p++) { - if (vg_ptr->pv[p]->pv_dev == - lv_ptr->lv_current_pe[le].dev) { - vg_ptr->pv[p]->pe_allocated--; - break; - } - } - } - - - /* save pointer to "old" lv/pe pointer array */ - pep1 = lv_ptr->lv_current_pe; - end = lv_ptr->lv_current_le; - - /* save open counter */ - lv_open = lv_ptr->lv_open; - - /* save # of old allocated logical extents */ - old_allocated_le = lv_ptr->lv_allocated_le; - - /* copy preloaded LV */ - lv_status_save = lv->lv_status; - lv->lv_status |= LV_SPINDOWN; - lv->lv_status &= ~LV_ACTIVE; - memcpy((char *) lv_ptr, (char *) lv, sizeof(lv_t)); - lv_ptr->lv_current_pe = pe; - lv_ptr->lv_open = lv_open; - - /* save availiable i/o statistic data */ - /* linear logical volume */ - if (lv_ptr->lv_stripes < 2) { - /* Check what last LE shall be used */ - if (end > lv_ptr->lv_current_le) end = lv_ptr->lv_current_le; - for (le = 0; le < end; le++) { - lv_ptr->lv_current_pe[le].reads = pep1[le].reads; - lv_ptr->lv_current_pe[le].writes = pep1[le].writes; - } - /* striped logical volume */ - } else { - uint i, j, source, dest, end, old_stripe_size, new_stripe_size; - - old_stripe_size = old_allocated_le / lv_ptr->lv_stripes; - new_stripe_size = lv_ptr->lv_allocated_le / lv_ptr->lv_stripes; - end = old_stripe_size; - if (end > new_stripe_size) end = new_stripe_size; - for (i = source = dest = 0; - i < lv_ptr->lv_stripes; i++) { - for (j = 0; j < end; j++) { - lv_ptr->lv_current_pe[dest + j].reads = - pep1[source + j].reads; - lv_ptr->lv_current_pe[dest + j].writes = - pep1[source + 
j].writes; - } - source += old_stripe_size; - dest += new_stripe_size; - } - } - vfree(pep1); - pep1 = NULL; - - - /* extend the PE count in PVs */ - for (le = 0; le < lv_ptr->lv_allocated_le; le++) { - vg_ptr->pe_allocated++; - for (p = 0; p < vg_ptr->pv_cur; p++) { - if (vg_ptr->pv[p]->pv_dev == - vg_ptr->lv[l]->lv_current_pe[le].dev) { - vg_ptr->pv[p]->pe_allocated++; - break; - } - } - } - - lvm_gendisk.part[MINOR(lv_ptr->lv_dev)].start_sect = 0; - lvm_gendisk.part[MINOR(lv_ptr->lv_dev)].nr_sects = lv_ptr->lv_size; - lvm_size[MINOR(lv_ptr->lv_dev)] = lv_ptr->lv_size >> 1; - /* vg_lv_map array doesn't have to be changed here */ - - read_ahead[MAJOR_NR] = lv_ptr->lv_read_ahead = LVM_CORRECT_READ_AHEAD(lv_ptr->lv_read_ahead); - lv_ptr->lv_status = lv_status_save; - - return 0; -} /* lvm_do_lv_extend_reduce() */ - - -/* - * character device support function logical volume status by name - */ -static int lvm_do_lv_status_byname(vg_t *vg_ptr, void *arg) -{ - uint l; - ulong size; - lv_t lv; - lv_t *lv_ptr; - lv_status_byname_req_t lv_status_byname_req; - - if (vg_ptr == NULL) return -ENXIO; - if (copy_from_user(&lv_status_byname_req, arg, - sizeof(lv_status_byname_req_t)) != 0) - return -EFAULT; - - if (lv_status_byname_req.lv == NULL) return -EINVAL; - if (copy_from_user(&lv, lv_status_byname_req.lv, - sizeof(lv_t)) != 0) - return -EFAULT; - - for (l = 0; l < vg_ptr->lv_max; l++) { - lv_ptr = vg_ptr->lv[l]; - if (lv_ptr != NULL && - strcmp(lv_ptr->lv_name, - lv_status_byname_req.lv_name) == 0) { - if (copy_to_user(lv_status_byname_req.lv, - lv_ptr, - sizeof(lv_t)) != 0) - return -EFAULT; - - if (lv.lv_current_pe != NULL) { - size = lv_ptr->lv_allocated_le * - sizeof(pe_t); - if (copy_to_user(lv.lv_current_pe, - lv_ptr->lv_current_pe, - size) != 0) - return -EFAULT; - } - return 0; - } - } - return -ENXIO; -} /* lvm_do_lv_status_byname() */ - - -/* - * character device support function logical volume status by index - */ -static int lvm_do_lv_status_byindex(vg_t *vg_ptr,void *arg) -{ - ulong size; - lv_t lv; - lv_t *lv_ptr; - lv_status_byindex_req_t lv_status_byindex_req; - - if (vg_ptr == NULL) return -ENXIO; - if (copy_from_user(&lv_status_byindex_req, arg, - sizeof(lv_status_byindex_req)) != 0) - return -EFAULT; - - if ((lvp = lv_status_byindex_req.lv) == NULL) - return -EINVAL; - if ( ( lv_ptr = vg_ptr->lv[lv_status_byindex_req.lv_index]) == NULL) - return -ENXIO; - - if (copy_from_user(&lv, lvp, sizeof(lv_t)) != 0) - return -EFAULT; - - if (copy_to_user(lvp, lv_ptr, sizeof(lv_t)) != 0) - return -EFAULT; - - if (lv.lv_current_pe != NULL) { - size = lv_ptr->lv_allocated_le * sizeof(pe_t); - if (copy_to_user(lv.lv_current_pe, - lv_ptr->lv_current_pe, - size) != 0) - return -EFAULT; - } - return 0; -} /* lvm_do_lv_status_byindex() */ - - -/* - * character device support function physical volume change - */ -static int lvm_do_pv_change(vg_t *vg_ptr, void *arg) -{ - uint p; - pv_t *pv_ptr; -#ifdef LVM_GET_INODE - struct inode *inode_sav; -#endif - - if (vg_ptr == NULL) return -ENXIO; - if (copy_from_user(&pv_change_req, arg, - sizeof(pv_change_req)) != 0) - return -EFAULT; - - for (p = 0; p < vg_ptr->pv_max; p++) { - pv_ptr = vg_ptr->pv[p]; - if (pv_ptr != NULL && - strcmp(pv_ptr->pv_name, - pv_change_req.pv_name) == 0) { -#ifdef LVM_GET_INODE - inode_sav = pv_ptr->inode; -#endif - if (copy_from_user(pv_ptr, - pv_change_req.pv, - sizeof(pv_t)) != 0) - return -EFAULT; - - /* We don't need the PE list - in kernel space as with LVs pe_t list */ - pv_ptr->pe = NULL; -#ifdef LVM_GET_INODE - 
pv_ptr->inode = inode_sav; -#endif - return 0; - } - } - return -ENXIO; -} /* lvm_do_pv_change() */ - -/* - * character device support function get physical volume status - */ -static int lvm_do_pv_status(vg_t *vg_ptr, void *arg) -{ - uint p; - pv_t *pv_ptr; - - if (vg_ptr == NULL) return -ENXIO; - if (copy_from_user(&pv_status_req, arg, - sizeof(pv_status_req)) != 0) - return -EFAULT; - - for (p = 0; p < vg_ptr->pv_max; p++) { - pv_ptr = vg_ptr->pv[p]; - if (pv_ptr != NULL && - strcmp(pv_ptr->pv_name, - pv_status_req.pv_name) == 0) { - if (copy_to_user(pv_status_req.pv, - pv_ptr, - sizeof(pv_t)) != 0) - return -EFAULT; - return 0; - } - } - return -ENXIO; -} /* lvm_do_pv_status() */ - - -/* - * support function initialize gendisk variables - */ -#ifdef __initfunc -__initfunc(void lvm_geninit(struct gendisk *lvm_gdisk)) -#else -void __init - lvm_geninit(struct gendisk *lvm_gdisk) -#endif -{ - int i = 0; - -#ifdef DEBUG_GENDISK - printk(KERN_DEBUG "%s -- lvm_gendisk\n", lvm_name); -#endif - - for (i = 0; i < MAX_LV; i++) { - lvm_gendisk.part[i].start_sect = -1; /* avoid partition check */ - lvm_size[i] = lvm_gendisk.part[i].nr_sects = 0; - lvm_blocksizes[i] = BLOCK_SIZE; - } - - blksize_size[MAJOR_NR] = lvm_blocksizes; - blk_size[MAJOR_NR] = lvm_size; - - return; -} /* lvm_gen_init() */ - - -#ifdef LVM_GET_INODE -/* - * support function to get an empty inode - * - * Gets an empty inode to be inserted into the inode hash, - * so that a physical volume can't be mounted. - * This is analog to drivers/block/md.c - * - * Is this the real thing? - * - * No, it's bollocks. md.c tries to do a bit different thing that might - * _somewhat_ work eons ago. Neither does any good these days. mount() couldn't - * care less for icache (it cares only for ->s_root->d_count and if we want - * loopback mounts even that will stop). BTW, with the form used here mount() - * would have to scan the _whole_ icache to detect the attempt - how on the - * Earth could it guess the i_ino of your dummy inode? Official line on the - * exclusion between mount()/swapon()/open()/etc. is Just Don't Do It(tm). - * If you can convince Linus that it's worth changing - fine, then you'll need - * to do blkdev_get()/blkdev_put(). Until then... - */ -struct inode *lvm_get_inode(int dev) -{ - struct inode *inode_this = NULL; - - /* Lock the device by inserting a dummy inode. 
*/ - inode_this = get_empty_inode(); - inode_this->i_dev = dev; - insert_inode_hash(inode_this); - return inode_this; -} - - -/* - * support function to clear an inode - * - */ -void lvm_clear_inode(struct inode *inode) -{ -#ifdef I_FREEING - inode->i_state |= I_FREEING; -#endif - clear_inode(inode); - return; -} -#endif /* #ifdef LVM_GET_INODE */ diff --git a/drivers/block/md.c b/drivers/block/md.c deleted file mode 100644 index 19b97d161..000000000 --- a/drivers/block/md.c +++ /dev/null @@ -1,3867 +0,0 @@ -/* - md.c : Multiple Devices driver for Linux - Copyright (C) 1998, 1999, 2000 Ingo Molnar - - completely rewritten, based on the MD driver code from Marc Zyngier - - Changes: - - - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar - - boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net> - - kerneld support by Boris Tobotras <boris@xtalk.msk.su> - - kmod support by: Cyrus Durgin - - RAID0 bugfixes: Mark Anthony Lisher <markal@iname.com> - - Devfs support by Richard Gooch <rgooch@atnf.csiro.au> - - - lots of fixes and improvements to the RAID1/RAID5 and generic - RAID code (such as request based resynchronization): - - Neil Brown <neilb@cse.unsw.edu.au>. - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2, or (at your option) - any later version. - - You should have received a copy of the GNU General Public License - (for example /usr/src/linux/COPYING); if not, write to the Free - Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. -*/ - -#include <linux/module.h> -#include <linux/config.h> -#include <linux/raid/md.h> -#include <linux/raid/xor.h> -#include <linux/devfs_fs_kernel.h> - -#ifdef CONFIG_KMOD -#include <linux/kmod.h> -#endif - -#define __KERNEL_SYSCALLS__ -#include <linux/unistd.h> - -#include <asm/unaligned.h> - -extern asmlinkage int sys_sched_yield(void); -extern asmlinkage long sys_setsid(void); - -#define MAJOR_NR MD_MAJOR -#define MD_DRIVER - -#include <linux/blk.h> - -#define DEBUG 0 -#if DEBUG -# define dprintk(x...) printk(x) -#else -# define dprintk(x...) do { } while(0) -#endif - -static mdk_personality_t *pers[MAX_PERSONALITY] = {NULL, }; - -/* - * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit' - * is 100 KB/sec, so the extra system load does not show up that much. - * Increase it if you want to have more _guaranteed_ speed. Note that - * the RAID driver will use the maximum available bandwith if the IO - * subsystem is idle. There is also an 'absolute maximum' reconstruction - * speed limit - in case reconstruction slows down your system despite - * idle IO detection. - * - * you can change it via /proc/sys/dev/raid/speed_limit_min and _max. 
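Since these two knobs are exposed as plain sysctls, they can be driven from user space with no md-specific tooling at all; a minimal, hypothetical helper in C:

    #include <stdio.h>

    int main(void)
    {
            /* raise the guaranteed resync floor to 1000 KB/sec; needs root */
            FILE *f = fopen("/proc/sys/dev/raid/speed_limit_min", "w");

            if (f == NULL)
                    return 1;
            fprintf(f, "%d\n", 1000);
            return fclose(f) != 0;
    }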
- */ - -static int sysctl_speed_limit_min = 100; -static int sysctl_speed_limit_max = 100000; - -static struct ctl_table_header *raid_table_header; - -static ctl_table raid_table[] = { - {DEV_RAID_SPEED_LIMIT_MIN, "speed_limit_min", - &sysctl_speed_limit_min, sizeof(int), 0644, NULL, &proc_dointvec}, - {DEV_RAID_SPEED_LIMIT_MAX, "speed_limit_max", - &sysctl_speed_limit_max, sizeof(int), 0644, NULL, &proc_dointvec}, - {0} -}; - -static ctl_table raid_dir_table[] = { - {DEV_RAID, "raid", NULL, 0, 0555, raid_table}, - {0} -}; - -static ctl_table raid_root_table[] = { - {CTL_DEV, "dev", NULL, 0, 0555, raid_dir_table}, - {0} -}; - -/* - * these have to be allocated separately because external - * subsystems want to have a pre-defined structure - */ -struct hd_struct md_hd_struct[MAX_MD_DEVS]; -static int md_blocksizes[MAX_MD_DEVS]; -static int md_hardsect_sizes[MAX_MD_DEVS]; -static int md_maxreadahead[MAX_MD_DEVS]; -static mdk_thread_t *md_recovery_thread = NULL; - -int md_size[MAX_MD_DEVS] = {0, }; - -extern struct block_device_operations md_fops; -static devfs_handle_t devfs_handle = NULL; - -static struct gendisk md_gendisk= -{ - major: MD_MAJOR, - major_name: "md", - minor_shift: 0, - max_p: 1, - part: md_hd_struct, - sizes: md_size, - nr_real: MAX_MD_DEVS, - real_devices: NULL, - next: NULL, - fops: &md_fops, -}; - -/* - * Enables to iterate over all existing md arrays - */ -static MD_LIST_HEAD(all_mddevs); - -/* - * The mapping between kdev and mddev is not necessary a simple - * one! Eg. HSM uses several sub-devices to implement Logical - * Volumes. All these sub-devices map to the same mddev. - */ -dev_mapping_t mddev_map [MAX_MD_DEVS] = { {NULL, 0}, }; - -void add_mddev_mapping (mddev_t * mddev, kdev_t dev, void *data) -{ - unsigned int minor = MINOR(dev); - - if (MAJOR(dev) != MD_MAJOR) { - MD_BUG(); - return; - } - if (mddev_map[minor].mddev != NULL) { - MD_BUG(); - return; - } - mddev_map[minor].mddev = mddev; - mddev_map[minor].data = data; -} - -void del_mddev_mapping (mddev_t * mddev, kdev_t dev) -{ - unsigned int minor = MINOR(dev); - - if (MAJOR(dev) != MD_MAJOR) { - MD_BUG(); - return; - } - if (mddev_map[minor].mddev != mddev) { - MD_BUG(); - return; - } - mddev_map[minor].mddev = NULL; - mddev_map[minor].data = NULL; -} - -static int md_make_request (request_queue_t *q, int rw, struct buffer_head * bh) -{ - mddev_t *mddev = kdev_to_mddev(bh->b_rdev); - - if (mddev && mddev->pers) - return mddev->pers->make_request(mddev, rw, bh); - else { - buffer_IO_error(bh); - return -1; - } -} - -static mddev_t * alloc_mddev (kdev_t dev) -{ - mddev_t *mddev; - - if (MAJOR(dev) != MD_MAJOR) { - MD_BUG(); - return 0; - } - mddev = (mddev_t *) kmalloc(sizeof(*mddev), GFP_KERNEL); - if (!mddev) - return NULL; - - memset(mddev, 0, sizeof(*mddev)); - - mddev->__minor = MINOR(dev); - init_MUTEX(&mddev->reconfig_sem); - init_MUTEX(&mddev->recovery_sem); - init_MUTEX(&mddev->resync_sem); - MD_INIT_LIST_HEAD(&mddev->disks); - MD_INIT_LIST_HEAD(&mddev->all_mddevs); - - /* - * The 'base' mddev is the one with data NULL. - * personalities can create additional mddevs - * if necessary. 
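The mddev_map[] table above is what makes the kdev_to_mddev() lookup in md_make_request() a constant-time array index. The real kdev_to_mddev() lives in the raid headers; in essence it amounts to the following sketch (illustrative name):

    /* sketch of the minor -> mddev lookup behind kdev_to_mddev() */
    static inline mddev_t *my_kdev_to_mddev(kdev_t dev)
    {
            if (MAJOR(dev) != MD_MAJOR)
                    return NULL;                  /* not an md device       */
            return mddev_map[MINOR(dev)].mddev;   /* NULL if no array here  */
    }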
- */ - add_mddev_mapping(mddev, dev, 0); - md_list_add(&mddev->all_mddevs, &all_mddevs); - - MOD_INC_USE_COUNT; - - return mddev; -} - -struct gendisk * find_gendisk (kdev_t dev) -{ - struct gendisk *tmp = gendisk_head; - - while (tmp != NULL) { - if (tmp->major == MAJOR(dev)) - return (tmp); - tmp = tmp->next; - } - return (NULL); -} - -mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr) -{ - mdk_rdev_t * rdev; - struct md_list_head *tmp; - - ITERATE_RDEV(mddev,rdev,tmp) { - if (rdev->desc_nr == nr) - return rdev; - } - return NULL; -} - -mdk_rdev_t * find_rdev(mddev_t * mddev, kdev_t dev) -{ - struct md_list_head *tmp; - mdk_rdev_t *rdev; - - ITERATE_RDEV(mddev,rdev,tmp) { - if (rdev->dev == dev) - return rdev; - } - return NULL; -} - -static MD_LIST_HEAD(device_names); - -char * partition_name (kdev_t dev) -{ - struct gendisk *hd; - static char nomem [] = "<nomem>"; - dev_name_t *dname; - struct md_list_head *tmp = device_names.next; - - while (tmp != &device_names) { - dname = md_list_entry(tmp, dev_name_t, list); - if (dname->dev == dev) - return dname->name; - tmp = tmp->next; - } - - dname = (dev_name_t *) kmalloc(sizeof(*dname), GFP_KERNEL); - - if (!dname) - return nomem; - /* - * ok, add this new device name to the list - */ - hd = find_gendisk (dev); - dname->name = NULL; - if (hd) - dname->name = disk_name (hd, MINOR(dev), dname->namebuf); - if (!dname->name) { - sprintf (dname->namebuf, "[dev %s]", kdevname(dev)); - dname->name = dname->namebuf; - } - - dname->dev = dev; - MD_INIT_LIST_HEAD(&dname->list); - md_list_add(&dname->list, &device_names); - - return dname->name; -} - -static unsigned int calc_dev_sboffset (kdev_t dev, mddev_t *mddev, - int persistent) -{ - unsigned int size = 0; - - if (blk_size[MAJOR(dev)]) - size = blk_size[MAJOR(dev)][MINOR(dev)]; - if (persistent) - size = MD_NEW_SIZE_BLOCKS(size); - return size; -} - -static unsigned int calc_dev_size (kdev_t dev, mddev_t *mddev, int persistent) -{ - unsigned int size; - - size = calc_dev_sboffset(dev, mddev, persistent); - if (!mddev->sb) { - MD_BUG(); - return size; - } - if (mddev->sb->chunk_size) - size &= ~(mddev->sb->chunk_size/1024 - 1); - return size; -} - -static unsigned int zoned_raid_size (mddev_t *mddev) -{ - unsigned int mask; - mdk_rdev_t * rdev; - struct md_list_head *tmp; - - if (!mddev->sb) { - MD_BUG(); - return -EINVAL; - } - /* - * do size and offset calculations. - */ - mask = ~(mddev->sb->chunk_size/1024 - 1); - - ITERATE_RDEV(mddev,rdev,tmp) { - rdev->size &= mask; - md_size[mdidx(mddev)] += rdev->size; - } - return 0; -} - -/* - * We check wether all devices are numbered from 0 to nb_dev-1. The - * order is guaranteed even after device name changes. - * - * Some personalities (raid0, linear) use this. Personalities that - * provide data have to be able to deal with loss of individual - * disks, so they do their checking themselves. 
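calc_dev_size() and zoned_raid_size() above both round a device's size down to a chunk multiple with the same mask trick, which only works because chunk_size is a power of two (enforced later in do_md_run()). A standalone demo of the arithmetic, with invented numbers:

#include <stdio.h>

int main(void)
{
    unsigned int chunk_size = 64 * 1024;   /* 64K chunks, in bytes */
    unsigned int size = 8811954;           /* device size in 1K blocks */
    unsigned int mask = ~(chunk_size / 1024 - 1);

    /* drops the trailing partial chunk: 8811954 -> 8811904 */
    printf("before: %u blocks, after: %u blocks\n", size, size & mask);
    return 0;
}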
- */ -int md_check_ordering (mddev_t *mddev) -{ - int i, c; - mdk_rdev_t *rdev; - struct md_list_head *tmp; - - /* - * First, all devices must be fully functional - */ - ITERATE_RDEV(mddev,rdev,tmp) { - if (rdev->faulty) { - printk("md: md%d's device %s faulty, aborting.\n", - mdidx(mddev), partition_name(rdev->dev)); - goto abort; - } - } - - c = 0; - ITERATE_RDEV(mddev,rdev,tmp) { - c++; - } - if (c != mddev->nb_dev) { - MD_BUG(); - goto abort; - } - if (mddev->nb_dev != mddev->sb->raid_disks) { - printk("md: md%d, array needs %d disks, has %d, aborting.\n", - mdidx(mddev), mddev->sb->raid_disks, mddev->nb_dev); - goto abort; - } - /* - * Now the numbering check - */ - for (i = 0; i < mddev->nb_dev; i++) { - c = 0; - ITERATE_RDEV(mddev,rdev,tmp) { - if (rdev->desc_nr == i) - c++; - } - if (!c) { - printk("md: md%d, missing disk #%d, aborting.\n", - mdidx(mddev), i); - goto abort; - } - if (c > 1) { - printk("md: md%d, too many disks #%d, aborting.\n", - mdidx(mddev), i); - goto abort; - } - } - return 0; -abort: - return 1; -} - -static void remove_descriptor (mdp_disk_t *disk, mdp_super_t *sb) -{ - if (disk_active(disk)) { - sb->working_disks--; - } else { - if (disk_spare(disk)) { - sb->spare_disks--; - sb->working_disks--; - } else { - sb->failed_disks--; - } - } - sb->nr_disks--; - disk->major = 0; - disk->minor = 0; - mark_disk_removed(disk); -} - -#define BAD_MAGIC KERN_ERR \ -"md: invalid raid superblock magic on %s\n" - -#define BAD_MINOR KERN_ERR \ -"md: %s: invalid raid minor (%x)\n" - -#define OUT_OF_MEM KERN_ALERT \ -"md: out of memory.\n" - -#define NO_SB KERN_ERR \ -"md: disabled device %s, could not read superblock.\n" - -#define BAD_CSUM KERN_WARNING \ -"md: invalid superblock checksum on %s\n" - -static int alloc_array_sb (mddev_t * mddev) -{ - if (mddev->sb) { - MD_BUG(); - return 0; - } - - mddev->sb = (mdp_super_t *) __get_free_page (GFP_KERNEL); - if (!mddev->sb) - return -ENOMEM; - md_clear_page(mddev->sb); - return 0; -} - -static int alloc_disk_sb (mdk_rdev_t * rdev) -{ - if (rdev->sb) - MD_BUG(); - - rdev->sb = (mdp_super_t *) __get_free_page(GFP_KERNEL); - if (!rdev->sb) { - printk (OUT_OF_MEM); - return -EINVAL; - } - md_clear_page(rdev->sb); - - return 0; -} - -static void free_disk_sb (mdk_rdev_t * rdev) -{ - if (rdev->sb) { - free_page((unsigned long) rdev->sb); - rdev->sb = NULL; - rdev->sb_offset = 0; - rdev->size = 0; - } else { - if (!rdev->faulty) - MD_BUG(); - } -} - -static void mark_rdev_faulty (mdk_rdev_t * rdev) -{ - if (!rdev) { - MD_BUG(); - return; - } - free_disk_sb(rdev); - rdev->faulty = 1; -} - -static int read_disk_sb (mdk_rdev_t * rdev) -{ - int ret = -EINVAL; - struct buffer_head *bh = NULL; - kdev_t dev = rdev->dev; - mdp_super_t *sb; - unsigned long sb_offset; - - if (!rdev->sb) { - MD_BUG(); - goto abort; - } - - /* - * Calculate the position of the superblock, - * it's at the end of the disk - */ - sb_offset = calc_dev_sboffset(rdev->dev, rdev->mddev, 1); - rdev->sb_offset = sb_offset; - printk("(read) %s's sb offset: %ld", partition_name(dev), sb_offset); - fsync_dev(dev); - set_blocksize (dev, MD_SB_BYTES); - bh = bread (dev, sb_offset / MD_SB_BLOCKS, MD_SB_BYTES); - - if (bh) { - sb = (mdp_super_t *) bh->b_data; - memcpy (rdev->sb, sb, MD_SB_BYTES); - } else { - printk (NO_SB,partition_name(rdev->dev)); - goto abort; - } - printk(" [events: %08lx]\n", (unsigned long)rdev->sb->events_lo); - ret = 0; -abort: - if (bh) - brelse (bh); - return ret; -} - -static unsigned int calc_sb_csum (mdp_super_t * sb) -{ - unsigned int disk_csum, 
csum; - - disk_csum = sb->sb_csum; - sb->sb_csum = 0; - csum = csum_partial((void *)sb, MD_SB_BYTES, 0); - sb->sb_csum = disk_csum; - return csum; -} - -/* - * Check one RAID superblock for generic plausibility - */ - -static int check_disk_sb (mdk_rdev_t * rdev) -{ - mdp_super_t *sb; - int ret = -EINVAL; - - sb = rdev->sb; - if (!sb) { - MD_BUG(); - goto abort; - } - - if (sb->md_magic != MD_SB_MAGIC) { - printk (BAD_MAGIC, partition_name(rdev->dev)); - goto abort; - } - - if (sb->md_minor >= MAX_MD_DEVS) { - printk (BAD_MINOR, partition_name(rdev->dev), - sb->md_minor); - goto abort; - } - - if (calc_sb_csum(sb) != sb->sb_csum) - printk(BAD_CSUM, partition_name(rdev->dev)); - ret = 0; -abort: - return ret; -} - -static kdev_t dev_unit(kdev_t dev) -{ - unsigned int mask; - struct gendisk *hd = find_gendisk(dev); - - if (!hd) - return 0; - mask = ~((1 << hd->minor_shift) - 1); - - return MKDEV(MAJOR(dev), MINOR(dev) & mask); -} - -static mdk_rdev_t * match_dev_unit(mddev_t *mddev, kdev_t dev) -{ - struct md_list_head *tmp; - mdk_rdev_t *rdev; - - ITERATE_RDEV(mddev,rdev,tmp) - if (dev_unit(rdev->dev) == dev_unit(dev)) - return rdev; - - return NULL; -} - -static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2) -{ - struct md_list_head *tmp; - mdk_rdev_t *rdev; - - ITERATE_RDEV(mddev1,rdev,tmp) - if (match_dev_unit(mddev2, rdev->dev)) - return 1; - - return 0; -} - -static MD_LIST_HEAD(all_raid_disks); -static MD_LIST_HEAD(pending_raid_disks); - -static void bind_rdev_to_array (mdk_rdev_t * rdev, mddev_t * mddev) -{ - mdk_rdev_t *same_pdev; - - if (rdev->mddev) { - MD_BUG(); - return; - } - same_pdev = match_dev_unit(mddev, rdev->dev); - if (same_pdev) - printk( KERN_WARNING -"md%d: WARNING: %s appears to be on the same physical disk as %s. True\n" -" protection against single-disk failure might be compromised.\n", - mdidx(mddev), partition_name(rdev->dev), - partition_name(same_pdev->dev)); - - md_list_add(&rdev->same_set, &mddev->disks); - rdev->mddev = mddev; - mddev->nb_dev++; - printk("bind<%s,%d>\n", partition_name(rdev->dev), mddev->nb_dev); -} - -static void unbind_rdev_from_array (mdk_rdev_t * rdev) -{ - if (!rdev->mddev) { - MD_BUG(); - return; - } - md_list_del(&rdev->same_set); - MD_INIT_LIST_HEAD(&rdev->same_set); - rdev->mddev->nb_dev--; - printk("unbind<%s,%d>\n", partition_name(rdev->dev), - rdev->mddev->nb_dev); - rdev->mddev = NULL; -} - -/* - * prevent the device from being mounted, repartitioned or - * otherwise reused by a RAID array (or any other kernel - * subsystem), by opening the device. [simply getting an - * inode is not enough, the SCSI module usage code needs - * an explicit open() on the device] - */ -static int lock_rdev (mdk_rdev_t *rdev) -{ - int err = 0; - - /* - * First insert a dummy inode. 
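calc_sb_csum() above uses the usual embedded-checksum trick: the csum field lives inside the checksummed area, so it is zeroed for the computation and restored afterwards. A self-contained sketch of the same idea, with a cut-down stand-in struct and a plain byte sum in place of the kernel's csum_partial():

#include <stdio.h>

struct sb {
    unsigned int magic;
    unsigned int csum;
    char payload[32];
};

static unsigned int byte_sum(const unsigned char *p, unsigned int len)
{
    unsigned int sum = 0;

    while (len--)
        sum += *p++;
    return sum;
}

static unsigned int calc_csum(struct sb *sb)
{
    unsigned int saved = sb->csum, sum;

    sb->csum = 0;                 /* the field must not checksum itself */
    sum = byte_sum((unsigned char *)sb, sizeof(*sb));
    sb->csum = saved;             /* restore what was read from disk */
    return sum;
}

int main(void)
{
    struct sb sb = { 0xa92b4efc, 0, "superblock demo" };

    sb.csum = calc_csum(&sb);
    printf("stored: %u recomputed: %u\n", sb.csum, calc_csum(&sb));
    return 0;
}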
- */ - if (rdev->inode) - MD_BUG(); - rdev->inode = get_empty_inode(); - if (!rdev->inode) - return -ENOMEM; - /* - * we dont care about any other fields - */ - rdev->inode->i_dev = rdev->inode->i_rdev = rdev->dev; - insert_inode_hash(rdev->inode); - - memset(&rdev->filp, 0, sizeof(rdev->filp)); - rdev->filp.f_mode = 3; /* read write */ - return err; -} - -static void unlock_rdev (mdk_rdev_t *rdev) -{ - if (!rdev->inode) - MD_BUG(); - iput(rdev->inode); - rdev->inode = NULL; -} - -static void export_rdev (mdk_rdev_t * rdev) -{ - printk("export_rdev(%s)\n",partition_name(rdev->dev)); - if (rdev->mddev) - MD_BUG(); - unlock_rdev(rdev); - free_disk_sb(rdev); - md_list_del(&rdev->all); - MD_INIT_LIST_HEAD(&rdev->all); - if (rdev->pending.next != &rdev->pending) { - printk("(%s was pending)\n",partition_name(rdev->dev)); - md_list_del(&rdev->pending); - MD_INIT_LIST_HEAD(&rdev->pending); - } - rdev->dev = 0; - rdev->faulty = 0; - kfree(rdev); -} - -static void kick_rdev_from_array (mdk_rdev_t * rdev) -{ - unbind_rdev_from_array(rdev); - export_rdev(rdev); -} - -static void export_array (mddev_t *mddev) -{ - struct md_list_head *tmp; - mdk_rdev_t *rdev; - mdp_super_t *sb = mddev->sb; - - if (mddev->sb) { - mddev->sb = NULL; - free_page((unsigned long) sb); - } - - ITERATE_RDEV(mddev,rdev,tmp) { - if (!rdev->mddev) { - MD_BUG(); - continue; - } - kick_rdev_from_array(rdev); - } - if (mddev->nb_dev) - MD_BUG(); -} - -static void free_mddev (mddev_t *mddev) -{ - if (!mddev) { - MD_BUG(); - return; - } - - export_array(mddev); - md_size[mdidx(mddev)] = 0; - md_hd_struct[mdidx(mddev)].nr_sects = 0; - - /* - * Make sure nobody else is using this mddev - * (careful, we rely on the global kernel lock here) - */ - while (md_atomic_read(&mddev->resync_sem.count) != 1) - schedule(); - while (md_atomic_read(&mddev->recovery_sem.count) != 1) - schedule(); - - del_mddev_mapping(mddev, MKDEV(MD_MAJOR, mdidx(mddev))); - md_list_del(&mddev->all_mddevs); - MD_INIT_LIST_HEAD(&mddev->all_mddevs); - kfree(mddev); - MOD_DEC_USE_COUNT; -} - -#undef BAD_CSUM -#undef BAD_MAGIC -#undef OUT_OF_MEM -#undef NO_SB - -static void print_desc(mdp_disk_t *desc) -{ - printk(" DISK<N:%d,%s(%d,%d),R:%d,S:%d>\n", desc->number, - partition_name(MKDEV(desc->major,desc->minor)), - desc->major,desc->minor,desc->raid_disk,desc->state); -} - -static void print_sb(mdp_super_t *sb) -{ - int i; - - printk(" SB: (V:%d.%d.%d) ID:<%08x.%08x.%08x.%08x> CT:%08x\n", - sb->major_version, sb->minor_version, sb->patch_version, - sb->set_uuid0, sb->set_uuid1, sb->set_uuid2, sb->set_uuid3, - sb->ctime); - printk(" L%d S%08d ND:%d RD:%d md%d LO:%d CS:%d\n", sb->level, - sb->size, sb->nr_disks, sb->raid_disks, sb->md_minor, - sb->layout, sb->chunk_size); - printk(" UT:%08x ST:%d AD:%d WD:%d FD:%d SD:%d CSUM:%08x E:%08lx\n", - sb->utime, sb->state, sb->active_disks, sb->working_disks, - sb->failed_disks, sb->spare_disks, - sb->sb_csum, (unsigned long)sb->events_lo); - - for (i = 0; i < MD_SB_DISKS; i++) { - mdp_disk_t *desc; - - desc = sb->disks + i; - printk(" D %2d: ", i); - print_desc(desc); - } - printk(" THIS: "); - print_desc(&sb->this_disk); - -} - -static void print_rdev(mdk_rdev_t *rdev) -{ - printk(" rdev %s: O:%s, SZ:%08ld F:%d DN:%d ", - partition_name(rdev->dev), partition_name(rdev->old_dev), - rdev->size, rdev->faulty, rdev->desc_nr); - if (rdev->sb) { - printk("rdev superblock:\n"); - print_sb(rdev->sb); - } else - printk("no rdev superblock!\n"); -} - -void md_print_devices (void) -{ - struct md_list_head *tmp, *tmp2; - mdk_rdev_t *rdev; 
- mddev_t *mddev; - - printk("\n"); - printk(" **********************************\n"); - printk(" * <COMPLETE RAID STATE PRINTOUT> *\n"); - printk(" **********************************\n"); - ITERATE_MDDEV(mddev,tmp) { - printk("md%d: ", mdidx(mddev)); - - ITERATE_RDEV(mddev,rdev,tmp2) - printk("<%s>", partition_name(rdev->dev)); - - if (mddev->sb) { - printk(" array superblock:\n"); - print_sb(mddev->sb); - } else - printk(" no array superblock.\n"); - - ITERATE_RDEV(mddev,rdev,tmp2) - print_rdev(rdev); - } - printk(" **********************************\n"); - printk("\n"); -} - -static int sb_equal ( mdp_super_t *sb1, mdp_super_t *sb2) -{ - int ret; - mdp_super_t *tmp1, *tmp2; - - tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL); - tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL); - - if (!tmp1 || !tmp2) { - ret = 0; - goto abort; - } - - *tmp1 = *sb1; - *tmp2 = *sb2; - - /* - * nr_disks is not constant - */ - tmp1->nr_disks = 0; - tmp2->nr_disks = 0; - - if (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4)) - ret = 0; - else - ret = 1; - -abort: - if (tmp1) - kfree(tmp1); - if (tmp2) - kfree(tmp2); - - return ret; -} - -static int uuid_equal(mdk_rdev_t *rdev1, mdk_rdev_t *rdev2) -{ - if ( (rdev1->sb->set_uuid0 == rdev2->sb->set_uuid0) && - (rdev1->sb->set_uuid1 == rdev2->sb->set_uuid1) && - (rdev1->sb->set_uuid2 == rdev2->sb->set_uuid2) && - (rdev1->sb->set_uuid3 == rdev2->sb->set_uuid3)) - - return 1; - - return 0; -} - -static mdk_rdev_t * find_rdev_all (kdev_t dev) -{ - struct md_list_head *tmp; - mdk_rdev_t *rdev; - - tmp = all_raid_disks.next; - while (tmp != &all_raid_disks) { - rdev = md_list_entry(tmp, mdk_rdev_t, all); - if (rdev->dev == dev) - return rdev; - tmp = tmp->next; - } - return NULL; -} - -#define GETBLK_FAILED KERN_ERR \ -"md: getblk failed for device %s\n" - -static int write_disk_sb(mdk_rdev_t * rdev) -{ - struct buffer_head *bh; - kdev_t dev; - unsigned long sb_offset, size; - mdp_super_t *sb; - - if (!rdev->sb) { - MD_BUG(); - return -1; - } - if (rdev->faulty) { - MD_BUG(); - return -1; - } - if (rdev->sb->md_magic != MD_SB_MAGIC) { - MD_BUG(); - return -1; - } - - dev = rdev->dev; - sb_offset = calc_dev_sboffset(dev, rdev->mddev, 1); - if (rdev->sb_offset != sb_offset) { - printk("%s's sb offset has changed from %ld to %ld, skipping\n", partition_name(dev), rdev->sb_offset, sb_offset); - goto skip; - } - /* - * If the disk went offline meanwhile and it's just a spare, then - * it's size has changed to zero silently, and the MD code does - * not yet know that it's faulty. 
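sb_equal() above compares only the constant part of two superblocks by first zeroing nr_disks, the one field that legitimately differs while an array is still being assembled. The same idea as a standalone sketch, with an invented stand-in struct and a bytewise compare:

#include <stdio.h>
#include <string.h>

struct sb { unsigned int uuid[4]; unsigned int nr_disks; };

static int sb_equal(const struct sb *a, const struct sb *b)
{
    struct sb t1 = *a, t2 = *b;

    t1.nr_disks = 0;              /* not constant across member disks */
    t2.nr_disks = 0;
    return memcmp(&t1, &t2, sizeof(t1)) == 0;
}

int main(void)
{
    struct sb a = { { 1, 2, 3, 4 }, 5 };
    struct sb b = { { 1, 2, 3, 4 }, 6 };

    printf("equal: %d\n", sb_equal(&a, &b));   /* 1: only nr_disks differs */
    return 0;
}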
- */ - size = calc_dev_size(dev, rdev->mddev, 1); - if (size != rdev->size) { - printk("%s's size has changed from %ld to %ld since import, skipping\n", partition_name(dev), rdev->size, size); - goto skip; - } - - printk("(write) %s's sb offset: %ld\n", partition_name(dev), sb_offset); - fsync_dev(dev); - set_blocksize(dev, MD_SB_BYTES); - bh = getblk(dev, sb_offset / MD_SB_BLOCKS, MD_SB_BYTES); - if (!bh) { - printk(GETBLK_FAILED, partition_name(dev)); - return 1; - } - memset(bh->b_data,0,bh->b_size); - sb = (mdp_super_t *) bh->b_data; - memcpy(sb, rdev->sb, MD_SB_BYTES); - - mark_buffer_uptodate(bh, 1); - mark_buffer_dirty(bh, 1); - ll_rw_block(WRITE, 1, &bh); - wait_on_buffer(bh); - brelse(bh); - fsync_dev(dev); -skip: - return 0; -} -#undef GETBLK_FAILED - -static void set_this_disk(mddev_t *mddev, mdk_rdev_t *rdev) -{ - int i, ok = 0; - mdp_disk_t *desc; - - for (i = 0; i < MD_SB_DISKS; i++) { - desc = mddev->sb->disks + i; -#if 0 - if (disk_faulty(desc)) { - if (MKDEV(desc->major,desc->minor) == rdev->dev) - ok = 1; - continue; - } -#endif - if (MKDEV(desc->major,desc->minor) == rdev->dev) { - rdev->sb->this_disk = *desc; - rdev->desc_nr = desc->number; - ok = 1; - break; - } - } - - if (!ok) { - MD_BUG(); - } -} - -static int sync_sbs(mddev_t * mddev) -{ - mdk_rdev_t *rdev; - mdp_super_t *sb; - struct md_list_head *tmp; - - ITERATE_RDEV(mddev,rdev,tmp) { - if (rdev->faulty) - continue; - sb = rdev->sb; - *sb = *mddev->sb; - set_this_disk(mddev, rdev); - sb->sb_csum = calc_sb_csum(sb); - } - return 0; -} - -int md_update_sb(mddev_t * mddev) -{ - int first, err, count = 100; - struct md_list_head *tmp; - mdk_rdev_t *rdev; - -repeat: - mddev->sb->utime = CURRENT_TIME; - if ((++mddev->sb->events_lo)==0) - ++mddev->sb->events_hi; - - if ((mddev->sb->events_lo|mddev->sb->events_hi)==0) { - /* - * oops, this 64-bit counter should never wrap. - * Either we are in around ~1 trillion A.C., assuming - * 1 reboot per second, or we have a bug: - */ - MD_BUG(); - mddev->sb->events_lo = mddev->sb->events_hi = 0xffffffff; - } - sync_sbs(mddev); - - /* - * do not write anything to disk if using - * nonpersistent superblocks - */ - if (mddev->sb->not_persistent) - return 0; - - printk(KERN_INFO "md: updating md%d RAID superblock on device\n", - mdidx(mddev)); - - first = 1; - err = 0; - ITERATE_RDEV(mddev,rdev,tmp) { - if (!first) { - first = 0; - printk(", "); - } - if (rdev->faulty) - printk("(skipping faulty "); - printk("%s ", partition_name(rdev->dev)); - if (!rdev->faulty) { - printk("[events: %08lx]", - (unsigned long)rdev->sb->events_lo); - err += write_disk_sb(rdev); - } else - printk(")\n"); - } - printk(".\n"); - if (err) { - printk("errors occured during superblock update, repeating\n"); - if (--count) - goto repeat; - printk("excessive errors occured during superblock update, exiting\n"); - } - return 0; -} - -/* - * Import a device. If 'on_disk', then sanity check the superblock - * - * mark the device faulty if: - * - * - the device is nonexistent (zero size) - * - the device has no valid superblock - * - * a faulty rdev _never_ has rdev->sb set. 
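md_update_sb() above keeps the 64-bit event count as two 32-bit words, events_lo and events_hi, carrying into the high word exactly when the low word wraps to zero. The carry logic in isolation:

#include <stdio.h>

struct events { unsigned int lo, hi; };

static void bump(struct events *e)
{
    if (++e->lo == 0)             /* low word wrapped */
        ++e->hi;
}

int main(void)
{
    struct events e = { 0xffffffff, 0 };

    bump(&e);
    printf("lo=%08x hi=%08x\n", e.lo, e.hi);   /* lo=00000000 hi=00000001 */
    return 0;
}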
- */ -static int md_import_device (kdev_t newdev, int on_disk) -{ - int err; - mdk_rdev_t *rdev; - unsigned int size; - - if (find_rdev_all(newdev)) - return -EEXIST; - - rdev = (mdk_rdev_t *) kmalloc(sizeof(*rdev), GFP_KERNEL); - if (!rdev) { - printk("could not alloc mem for %s!\n", partition_name(newdev)); - return -ENOMEM; - } - memset(rdev, 0, sizeof(*rdev)); - - if (get_super(newdev)) { - printk("md: can not import %s, has active inodes!\n", - partition_name(newdev)); - err = -EBUSY; - goto abort_free; - } - - if ((err = alloc_disk_sb(rdev))) - goto abort_free; - - rdev->dev = newdev; - if (lock_rdev(rdev)) { - printk("md: could not lock %s, zero-size? Marking faulty.\n", - partition_name(newdev)); - err = -EINVAL; - goto abort_free; - } - rdev->desc_nr = -1; - rdev->faulty = 0; - - size = 0; - if (blk_size[MAJOR(newdev)]) - size = blk_size[MAJOR(newdev)][MINOR(newdev)]; - if (!size) { - printk("md: %s has zero size, marking faulty!\n", - partition_name(newdev)); - err = -EINVAL; - goto abort_free; - } - - if (on_disk) { - if ((err = read_disk_sb(rdev))) { - printk("md: could not read %s's sb, not importing!\n", - partition_name(newdev)); - goto abort_free; - } - if ((err = check_disk_sb(rdev))) { - printk("md: %s has invalid sb, not importing!\n", - partition_name(newdev)); - goto abort_free; - } - - rdev->old_dev = MKDEV(rdev->sb->this_disk.major, - rdev->sb->this_disk.minor); - rdev->desc_nr = rdev->sb->this_disk.number; - } - md_list_add(&rdev->all, &all_raid_disks); - MD_INIT_LIST_HEAD(&rdev->pending); - - if (rdev->faulty && rdev->sb) - free_disk_sb(rdev); - return 0; - -abort_free: - if (rdev->sb) { - if (rdev->inode) - unlock_rdev(rdev); - free_disk_sb(rdev); - } - kfree(rdev); - return err; -} - -/* - * Check a full RAID array for plausibility - */ - -#define INCONSISTENT KERN_ERR \ -"md: fatal superblock inconsistency in %s -- removing from array\n" - -#define OUT_OF_DATE KERN_ERR \ -"md: superblock update time inconsistency -- using the most recent one\n" - -#define OLD_VERSION KERN_ALERT \ -"md: md%d: unsupported raid array version %d.%d.%d\n" - -#define NOT_CLEAN_IGNORE KERN_ERR \ -"md: md%d: raid array is not clean -- starting background reconstruction\n" - -#define UNKNOWN_LEVEL KERN_ERR \ -"md: md%d: unsupported raid level %d\n" - -static int analyze_sbs (mddev_t * mddev) -{ - int out_of_date = 0, i; - struct md_list_head *tmp, *tmp2; - mdk_rdev_t *rdev, *rdev2, *freshest; - mdp_super_t *sb; - - /* - * Verify the RAID superblock on each real device - */ - ITERATE_RDEV(mddev,rdev,tmp) { - if (rdev->faulty) { - MD_BUG(); - goto abort; - } - if (!rdev->sb) { - MD_BUG(); - goto abort; - } - if (check_disk_sb(rdev)) - goto abort; - } - - /* - * The superblock constant part has to be the same - * for all disks in the array. - */ - sb = NULL; - - ITERATE_RDEV(mddev,rdev,tmp) { - if (!sb) { - sb = rdev->sb; - continue; - } - if (!sb_equal(sb, rdev->sb)) { - printk (INCONSISTENT, partition_name(rdev->dev)); - kick_rdev_from_array(rdev); - continue; - } - } - - /* - * OK, we have all disks and the array is ready to run. Let's - * find the freshest superblock, that one will be the superblock - * that represents the whole array. - */ - if (!mddev->sb) - if (alloc_array_sb(mddev)) - goto abort; - sb = mddev->sb; - freshest = NULL; - - ITERATE_RDEV(mddev,rdev,tmp) { - __u64 ev1, ev2; - /* - * if the checksum is invalid, use the superblock - * only as a last resort. 
(decrease it's age by - * one event) - */ - if (calc_sb_csum(rdev->sb) != rdev->sb->sb_csum) { - if (rdev->sb->events_lo || rdev->sb->events_hi) - if ((rdev->sb->events_lo--)==0) - rdev->sb->events_hi--; - } - - printk("%s's event counter: %08lx\n", partition_name(rdev->dev), - (unsigned long)rdev->sb->events_lo); - if (!freshest) { - freshest = rdev; - continue; - } - /* - * Find the newest superblock version - */ - ev1 = md_event(rdev->sb); - ev2 = md_event(freshest->sb); - if (ev1 != ev2) { - out_of_date = 1; - if (ev1 > ev2) - freshest = rdev; - } - } - if (out_of_date) { - printk(OUT_OF_DATE); - printk("freshest: %s\n", partition_name(freshest->dev)); - } - memcpy (sb, freshest->sb, sizeof(*sb)); - - /* - * at this point we have picked the 'best' superblock - * from all available superblocks. - * now we validate this superblock and kick out possibly - * failed disks. - */ - ITERATE_RDEV(mddev,rdev,tmp) { - /* - * Kick all non-fresh devices faulty - */ - __u64 ev1, ev2; - ev1 = md_event(rdev->sb); - ev2 = md_event(sb); - ++ev1; - if (ev1 < ev2) { - printk("md: kicking non-fresh %s from array!\n", - partition_name(rdev->dev)); - kick_rdev_from_array(rdev); - continue; - } - } - - /* - * Fix up changed device names ... but only if this disk has a - * recent update time. Use faulty checksum ones too. - */ - ITERATE_RDEV(mddev,rdev,tmp) { - __u64 ev1, ev2, ev3; - if (rdev->faulty) { /* REMOVEME */ - MD_BUG(); - goto abort; - } - ev1 = md_event(rdev->sb); - ev2 = md_event(sb); - ev3 = ev2; - --ev3; - if ((rdev->dev != rdev->old_dev) && - ((ev1 == ev2) || (ev1 == ev3))) { - mdp_disk_t *desc; - - printk("md: device name has changed from %s to %s since last import!\n", partition_name(rdev->old_dev), partition_name(rdev->dev)); - if (rdev->desc_nr == -1) { - MD_BUG(); - goto abort; - } - desc = &sb->disks[rdev->desc_nr]; - if (rdev->old_dev != MKDEV(desc->major, desc->minor)) { - MD_BUG(); - goto abort; - } - desc->major = MAJOR(rdev->dev); - desc->minor = MINOR(rdev->dev); - desc = &rdev->sb->this_disk; - desc->major = MAJOR(rdev->dev); - desc->minor = MINOR(rdev->dev); - } - } - - /* - * Remove unavailable and faulty devices ... - * - * note that if an array becomes completely unrunnable due to - * missing devices, we do not write the superblock back, so the - * administrator has a chance to fix things up. The removal thus - * only happens if it's nonfatal to the contents of the array. - */ - for (i = 0; i < MD_SB_DISKS; i++) { - int found; - mdp_disk_t *desc; - kdev_t dev; - - desc = sb->disks + i; - dev = MKDEV(desc->major, desc->minor); - - /* - * We kick faulty devices/descriptors immediately. - */ - if (disk_faulty(desc)) { - found = 0; - ITERATE_RDEV(mddev,rdev,tmp) { - if (rdev->desc_nr != desc->number) - continue; - printk("md%d: kicking faulty %s!\n", - mdidx(mddev),partition_name(rdev->dev)); - kick_rdev_from_array(rdev); - found = 1; - break; - } - if (!found) { - if (dev == MKDEV(0,0)) - continue; - printk("md%d: removing former faulty %s!\n", - mdidx(mddev), partition_name(dev)); - } - remove_descriptor(desc, sb); - continue; - } - - if (dev == MKDEV(0,0)) - continue; - /* - * Is this device present in the rdev ring? 
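The kick rule above tolerates a lag of exactly one event (ev1 is incremented before the comparison), presumably so that a crash landing between the per-device superblock writes does not disqualify an otherwise healthy disk. Worked standalone with invented counts:

#include <stdio.h>

int main(void)
{
    unsigned long long array_ev = 100;    /* freshest superblock */
    unsigned long long disk_ev[] = { 100, 99, 97 };
    int i;

    for (i = 0; i < 3; i++) {
        /* mirrors: ++ev1; if (ev1 < ev2) kick */
        if (disk_ev[i] + 1 < array_ev)
            printf("disk %d (events %llu): kicked as non-fresh\n",
                   i, disk_ev[i]);
        else
            printf("disk %d (events %llu): kept\n", i, disk_ev[i]);
    }
    return 0;
}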
- */ - found = 0; - ITERATE_RDEV(mddev,rdev,tmp) { - if (rdev->desc_nr == desc->number) { - found = 1; - break; - } - } - if (found) - continue; - - printk("md%d: former device %s is unavailable, removing from array!\n", mdidx(mddev), partition_name(dev)); - remove_descriptor(desc, sb); - } - - /* - * Double check wether all devices mentioned in the - * superblock are in the rdev ring. - */ - for (i = 0; i < MD_SB_DISKS; i++) { - mdp_disk_t *desc; - kdev_t dev; - - desc = sb->disks + i; - dev = MKDEV(desc->major, desc->minor); - - if (dev == MKDEV(0,0)) - continue; - - if (disk_faulty(desc)) { - MD_BUG(); - goto abort; - } - - rdev = find_rdev(mddev, dev); - if (!rdev) { - MD_BUG(); - goto abort; - } - } - - /* - * Do a final reality check. - */ - ITERATE_RDEV(mddev,rdev,tmp) { - if (rdev->desc_nr == -1) { - MD_BUG(); - goto abort; - } - /* - * is the desc_nr unique? - */ - ITERATE_RDEV(mddev,rdev2,tmp2) { - if ((rdev2 != rdev) && - (rdev2->desc_nr == rdev->desc_nr)) { - MD_BUG(); - goto abort; - } - } - /* - * is the device unique? - */ - ITERATE_RDEV(mddev,rdev2,tmp2) { - if ((rdev2 != rdev) && - (rdev2->dev == rdev->dev)) { - MD_BUG(); - goto abort; - } - } - } - - /* - * Check if we can support this RAID array - */ - if (sb->major_version != MD_MAJOR_VERSION || - sb->minor_version > MD_MINOR_VERSION) { - - printk (OLD_VERSION, mdidx(mddev), sb->major_version, - sb->minor_version, sb->patch_version); - goto abort; - } - - if ((sb->state != (1 << MD_SB_CLEAN)) && ((sb->level == 1) || - (sb->level == 4) || (sb->level == 5))) - printk (NOT_CLEAN_IGNORE, mdidx(mddev)); - - return 0; -abort: - return 1; -} - -#undef INCONSISTENT -#undef OUT_OF_DATE -#undef OLD_VERSION -#undef OLD_LEVEL - -static int device_size_calculation (mddev_t * mddev) -{ - int data_disks = 0, persistent; - unsigned int readahead; - mdp_super_t *sb = mddev->sb; - struct md_list_head *tmp; - mdk_rdev_t *rdev; - - /* - * Do device size calculation. Bail out if too small. 
- * (we have to do this after having validated chunk_size, - * because device size has to be modulo chunk_size) - */ - persistent = !mddev->sb->not_persistent; - ITERATE_RDEV(mddev,rdev,tmp) { - if (rdev->faulty) - continue; - if (rdev->size) { - MD_BUG(); - continue; - } - rdev->size = calc_dev_size(rdev->dev, mddev, persistent); - if (rdev->size < sb->chunk_size / 1024) { - printk (KERN_WARNING - "Dev %s smaller than chunk_size: %ldk < %dk\n", - partition_name(rdev->dev), - rdev->size, sb->chunk_size / 1024); - return -EINVAL; - } - } - - switch (sb->level) { - case -3: - data_disks = 1; - break; - case -2: - data_disks = 1; - break; - case -1: - zoned_raid_size(mddev); - data_disks = 1; - break; - case 0: - zoned_raid_size(mddev); - data_disks = sb->raid_disks; - break; - case 1: - data_disks = 1; - break; - case 4: - case 5: - data_disks = sb->raid_disks-1; - break; - default: - printk (UNKNOWN_LEVEL, mdidx(mddev), sb->level); - goto abort; - } - if (!md_size[mdidx(mddev)]) - md_size[mdidx(mddev)] = sb->size * data_disks; - - readahead = MD_READAHEAD; - if ((sb->level == 0) || (sb->level == 4) || (sb->level == 5)) { - readahead = (mddev->sb->chunk_size>>PAGE_SHIFT) * 4 * data_disks; - if (readahead < data_disks * (MAX_SECTORS>>(PAGE_SHIFT-9))*2) - readahead = data_disks * (MAX_SECTORS>>(PAGE_SHIFT-9))*2; - } else { - if (sb->level == -3) - readahead = 0; - } - md_maxreadahead[mdidx(mddev)] = readahead; - - printk(KERN_INFO "md%d: max total readahead window set to %ldk\n", - mdidx(mddev), readahead*(PAGE_SIZE/1024)); - - printk(KERN_INFO - "md%d: %d data-disks, max readahead per data-disk: %ldk\n", - mdidx(mddev), data_disks, readahead/data_disks*(PAGE_SIZE/1024)); - return 0; -abort: - return 1; -} - - -#define TOO_BIG_CHUNKSIZE KERN_ERR \ -"too big chunk_size: %d > %d\n" - -#define TOO_SMALL_CHUNKSIZE KERN_ERR \ -"too small chunk_size: %d < %ld\n" - -#define BAD_CHUNKSIZE KERN_ERR \ -"no chunksize specified, see 'man raidtab'\n" - -static int do_md_run (mddev_t * mddev) -{ - int pnum, err; - int chunk_size; - struct md_list_head *tmp; - mdk_rdev_t *rdev; - - - if (!mddev->nb_dev) { - MD_BUG(); - return -EINVAL; - } - - if (mddev->pers) - return -EBUSY; - - /* - * Resize disks to align partitions size on a given - * chunk size. - */ - md_size[mdidx(mddev)] = 0; - - /* - * Analyze all RAID superblock(s) - */ - if (analyze_sbs(mddev)) { - MD_BUG(); - return -EINVAL; - } - - chunk_size = mddev->sb->chunk_size; - pnum = level_to_pers(mddev->sb->level); - - mddev->param.chunk_size = chunk_size; - mddev->param.personality = pnum; - - if (chunk_size > MAX_CHUNK_SIZE) { - printk(TOO_BIG_CHUNKSIZE, chunk_size, MAX_CHUNK_SIZE); - return -EINVAL; - } - /* - * chunk-size has to be a power of 2 and multiples of PAGE_SIZE - */ - if ( (1 << ffz(~chunk_size)) != chunk_size) { - MD_BUG(); - return -EINVAL; - } - if (chunk_size < PAGE_SIZE) { - printk(TOO_SMALL_CHUNKSIZE, chunk_size, PAGE_SIZE); - return -EINVAL; - } - - if (pnum >= MAX_PERSONALITY) { - MD_BUG(); - return -EINVAL; - } - - if ((pnum != RAID1) && (pnum != LINEAR) && !chunk_size) { - /* - * 'default chunksize' in the old md code used to - * be PAGE_SIZE, baaad. - * we abort here to be on the safe side. We dont - * want to continue the bad practice. 
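The (1 << ffz(~chunk_size)) != chunk_size test in do_md_run() above is a power-of-two check: ffz(~x) is the index of the lowest set bit of x, so the shift reconstructs x exactly when x has a single bit set. A portable sketch with ffz() reimplemented (assumes chunk_size != 0, which the driver rejects separately):

#include <stdio.h>

static int ffz(unsigned int x)            /* index of lowest zero bit */
{
    int i = 0;

    while (x & 1) {
        x >>= 1;
        i++;
    }
    return i;
}

static int is_pow2_chunk(unsigned int chunk_size)
{
    return (1U << ffz(~chunk_size)) == chunk_size;
}

int main(void)
{
    printf("65536: %s\n", is_pow2_chunk(65536) ? "ok" : "rejected");
    printf("65537: %s\n", is_pow2_chunk(65537) ? "ok" : "rejected");
    return 0;
}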
- */ - printk(BAD_CHUNKSIZE); - return -EINVAL; - } - - if (!pers[pnum]) - { -#ifdef CONFIG_KMOD - char module_name[80]; - sprintf (module_name, "md-personality-%d", pnum); - request_module (module_name); - if (!pers[pnum]) -#endif - return -EINVAL; - } - - if (device_size_calculation(mddev)) - return -EINVAL; - - /* - * Drop all container device buffers, from now on - * the only valid external interface is through the md - * device. - * Also find largest hardsector size - */ - md_hardsect_sizes[mdidx(mddev)] = 512; - ITERATE_RDEV(mddev,rdev,tmp) { - if (rdev->faulty) - continue; - fsync_dev(rdev->dev); - invalidate_buffers(rdev->dev); - if (get_hardsect_size(rdev->dev) - > md_hardsect_sizes[mdidx(mddev)]) - md_hardsect_sizes[mdidx(mddev)] = - get_hardsect_size(rdev->dev); - } - md_blocksizes[mdidx(mddev)] = 1024; - if (md_blocksizes[mdidx(mddev)] < md_hardsect_sizes[mdidx(mddev)]) - md_blocksizes[mdidx(mddev)] = md_hardsect_sizes[mdidx(mddev)]; - mddev->pers = pers[pnum]; - - err = mddev->pers->run(mddev); - if (err) { - printk("pers->run() failed ...\n"); - mddev->pers = NULL; - return -EINVAL; - } - - mddev->sb->state &= ~(1 << MD_SB_CLEAN); - md_update_sb(mddev); - - /* - * md_size has units of 1K blocks, which are - * twice as large as sectors. - */ - md_hd_struct[mdidx(mddev)].start_sect = 0; - md_hd_struct[mdidx(mddev)].nr_sects = md_size[mdidx(mddev)] << 1; - - read_ahead[MD_MAJOR] = 1024; - return (0); -} - -#undef TOO_BIG_CHUNKSIZE -#undef BAD_CHUNKSIZE - -#define OUT(x) do { err = (x); goto out; } while (0) - -static int restart_array (mddev_t *mddev) -{ - int err = 0; - - /* - * Complain if it has no devices - */ - if (!mddev->nb_dev) - OUT(-ENXIO); - - if (mddev->pers) { - if (!mddev->ro) - OUT(-EBUSY); - - mddev->ro = 0; - set_device_ro(mddev_to_kdev(mddev), 0); - - printk (KERN_INFO - "md%d switched to read-write mode.\n", mdidx(mddev)); - /* - * Kick recovery or resync if necessary - */ - md_recover_arrays(); - if (mddev->pers->restart_resync) - mddev->pers->restart_resync(mddev); - } else - err = -EINVAL; - -out: - return err; -} - -#define STILL_MOUNTED KERN_WARNING \ -"md: md%d still mounted.\n" - -static int do_md_stop (mddev_t * mddev, int ro) -{ - int err = 0, resync_interrupted = 0; - kdev_t dev = mddev_to_kdev(mddev); - - if (!ro && get_super(dev)) { - printk (STILL_MOUNTED, mdidx(mddev)); - OUT(-EBUSY); - } - - if (mddev->pers) { - /* - * It is safe to call stop here, it only frees private - * data. Also, it tells us if a device is unstoppable - * (eg. resyncing is in progress) - */ - if (mddev->pers->stop_resync) - if (mddev->pers->stop_resync(mddev)) - resync_interrupted = 1; - - if (mddev->recovery_running) - md_interrupt_thread(md_recovery_thread); - - /* - * This synchronizes with signal delivery to the - * resync or reconstruction thread. It also nicely - * hangs the process if some reconstruction has not - * finished. - */ - down(&mddev->recovery_sem); - up(&mddev->recovery_sem); - - /* - * sync and invalidate buffers because we cannot kill the - * main thread with valid IO transfers still around. - * the kernel lock protects us from new requests being - * added after invalidate_buffers(). 
- */ - fsync_dev (mddev_to_kdev(mddev)); - fsync_dev (dev); - invalidate_buffers (dev); - - if (ro) { - if (mddev->ro) - OUT(-ENXIO); - mddev->ro = 1; - } else { - if (mddev->ro) - set_device_ro(dev, 0); - if (mddev->pers->stop(mddev)) { - if (mddev->ro) - set_device_ro(dev, 1); - OUT(-EBUSY); - } - if (mddev->ro) - mddev->ro = 0; - } - if (mddev->sb) { - /* - * mark it clean only if there was no resync - * interrupted. - */ - if (!mddev->recovery_running && !resync_interrupted) { - printk("marking sb clean...\n"); - mddev->sb->state |= 1 << MD_SB_CLEAN; - } - md_update_sb(mddev); - } - if (ro) - set_device_ro(dev, 1); - } - - /* - * Free resources if final stop - */ - if (!ro) { - printk (KERN_INFO "md%d stopped.\n", mdidx(mddev)); - free_mddev(mddev); - - } else - printk (KERN_INFO - "md%d switched to read-only mode.\n", mdidx(mddev)); -out: - return err; -} - -#undef OUT - -/* - * We have to safely support old arrays too. - */ -int detect_old_array (mdp_super_t *sb) -{ - if (sb->major_version > 0) - return 0; - if (sb->minor_version >= 90) - return 0; - - return -EINVAL; -} - - -static void autorun_array (mddev_t *mddev) -{ - mdk_rdev_t *rdev; - struct md_list_head *tmp; - int err; - - if (mddev->disks.prev == &mddev->disks) { - MD_BUG(); - return; - } - - printk("running: "); - - ITERATE_RDEV(mddev,rdev,tmp) { - printk("<%s>", partition_name(rdev->dev)); - } - printk("\nnow!\n"); - - err = do_md_run (mddev); - if (err) { - printk("do_md_run() returned %d\n", err); - /* - * prevent the writeback of an unrunnable array - */ - mddev->sb_dirty = 0; - do_md_stop (mddev, 0); - } -} - -/* - * lets try to run arrays based on all disks that have arrived - * until now. (those are in the ->pending list) - * - * the method: pick the first pending disk, collect all disks with - * the same UUID, remove all from the pending list and put them into - * the 'same_array' list. Then order this list based on superblock - * update time (freshest comes first), kick out 'old' disks and - * compare superblocks. If everything's fine then run it. - */ -static void autorun_devices (void) -{ - struct md_list_head candidates; - struct md_list_head *tmp; - mdk_rdev_t *rdev0, *rdev; - mddev_t *mddev; - kdev_t md_kdev; - - - printk("autorun ...\n"); - while (pending_raid_disks.next != &pending_raid_disks) { - rdev0 = md_list_entry(pending_raid_disks.next, - mdk_rdev_t, pending); - - printk("considering %s ...\n", partition_name(rdev0->dev)); - MD_INIT_LIST_HEAD(&candidates); - ITERATE_RDEV_PENDING(rdev,tmp) { - if (uuid_equal(rdev0, rdev)) { - if (!sb_equal(rdev0->sb, rdev->sb)) { - printk("%s has same UUID as %s, but superblocks differ ...\n", partition_name(rdev->dev), partition_name(rdev0->dev)); - continue; - } - printk(" adding %s ...\n", partition_name(rdev->dev)); - md_list_del(&rdev->pending); - md_list_add(&rdev->pending, &candidates); - } - } - /* - * now we have a set of devices, with all of them having - * mostly sane superblocks. It's time to allocate the - * mddev. 
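The grouping step the comment above describes, reduced to a standalone sketch: take the first pending disk, pull every disk with the same UUID into one candidate set, repeat until nothing is pending. Names and UUIDs are invented stand-ins; the kernel version works on linked lists and additionally compares whole superblocks.

#include <stdio.h>

struct disk { const char *name; int uuid; int used; };

int main(void)
{
    struct disk pending[] = {
        { "sda1", 11, 0 }, { "sdb1", 22, 0 },
        { "sdc1", 11, 0 }, { "sdd1", 22, 0 },
    };
    int n = 4, i, j;

    for (i = 0; i < n; i++) {
        if (pending[i].used)
            continue;
        printf("considering %s: set {", pending[i].name);
        for (j = i; j < n; j++) {
            if (!pending[j].used && pending[j].uuid == pending[i].uuid) {
                pending[j].used = 1;      /* move to candidate list */
                printf(" %s", pending[j].name);
            }
        }
        printf(" } -> one md array\n");
    }
    return 0;
}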
- */ - md_kdev = MKDEV(MD_MAJOR, rdev0->sb->md_minor); - mddev = kdev_to_mddev(md_kdev); - if (mddev) { - printk("md%d already running, cannot run %s\n", - mdidx(mddev), partition_name(rdev0->dev)); - ITERATE_RDEV_GENERIC(candidates,pending,rdev,tmp) - export_rdev(rdev); - continue; - } - mddev = alloc_mddev(md_kdev); - printk("created md%d\n", mdidx(mddev)); - ITERATE_RDEV_GENERIC(candidates,pending,rdev,tmp) { - bind_rdev_to_array(rdev, mddev); - md_list_del(&rdev->pending); - MD_INIT_LIST_HEAD(&rdev->pending); - } - autorun_array(mddev); - } - printk("... autorun DONE.\n"); -} - -/* - * import RAID devices based on one partition - * if possible, the array gets run as well. - */ - -#define BAD_VERSION KERN_ERR \ -"md: %s has RAID superblock version 0.%d, autodetect needs v0.90 or higher\n" - -#define OUT_OF_MEM KERN_ALERT \ -"md: out of memory.\n" - -#define NO_DEVICE KERN_ERR \ -"md: disabled device %s\n" - -#define AUTOADD_FAILED KERN_ERR \ -"md: auto-adding devices to md%d FAILED (error %d).\n" - -#define AUTOADD_FAILED_USED KERN_ERR \ -"md: cannot auto-add device %s to md%d, already used.\n" - -#define AUTORUN_FAILED KERN_ERR \ -"md: auto-running md%d FAILED (error %d).\n" - -#define MDDEV_BUSY KERN_ERR \ -"md: cannot auto-add to md%d, already running.\n" - -#define AUTOADDING KERN_INFO \ -"md: auto-adding devices to md%d, based on %s's superblock.\n" - -#define AUTORUNNING KERN_INFO \ -"md: auto-running md%d.\n" - -static int autostart_array (kdev_t startdev) -{ - int err = -EINVAL, i; - mdp_super_t *sb = NULL; - mdk_rdev_t *start_rdev = NULL, *rdev; - - if (md_import_device(startdev, 1)) { - printk("could not import %s!\n", partition_name(startdev)); - goto abort; - } - - start_rdev = find_rdev_all(startdev); - if (!start_rdev) { - MD_BUG(); - goto abort; - } - if (start_rdev->faulty) { - printk("can not autostart based on faulty %s!\n", - partition_name(startdev)); - goto abort; - } - md_list_add(&start_rdev->pending, &pending_raid_disks); - - sb = start_rdev->sb; - - err = detect_old_array(sb); - if (err) { - printk("array version is too old to be autostarted, use raidtools 0.90 mkraid --upgrade\nto upgrade the array without data loss!\n"); - goto abort; - } - - for (i = 0; i < MD_SB_DISKS; i++) { - mdp_disk_t *desc; - kdev_t dev; - - desc = sb->disks + i; - dev = MKDEV(desc->major, desc->minor); - - if (dev == MKDEV(0,0)) - continue; - if (dev == startdev) - continue; - if (md_import_device(dev, 1)) { - printk("could not import %s, trying to run array nevertheless.\n", partition_name(dev)); - continue; - } - rdev = find_rdev_all(dev); - if (!rdev) { - MD_BUG(); - goto abort; - } - md_list_add(&rdev->pending, &pending_raid_disks); - } - - /* - * possibly return codes - */ - autorun_devices(); - return 0; - -abort: - if (start_rdev) - export_rdev(start_rdev); - return err; -} - -#undef BAD_VERSION -#undef OUT_OF_MEM -#undef NO_DEVICE -#undef AUTOADD_FAILED_USED -#undef AUTOADD_FAILED -#undef AUTORUN_FAILED -#undef AUTOADDING -#undef AUTORUNNING - -struct { - int set; - int noautodetect; - -} raid_setup_args md__initdata = { 0, 0 }; - -void md_setup_drive(void) md__init; - -/* - * Searches all registered partitions for autorun RAID arrays - * at boot time. 
- */ -#ifdef CONFIG_AUTODETECT_RAID -static int detected_devices[128] md__initdata; -static int dev_cnt=0; -void md_autodetect_dev(kdev_t dev) -{ - if (dev_cnt >= 0 && dev_cnt < 127) - detected_devices[dev_cnt++] = dev; -} -#endif - -void md__init md_run_setup(void) -{ -#ifdef CONFIG_AUTODETECT_RAID - mdk_rdev_t *rdev; - int i; - - if (raid_setup_args.noautodetect) - printk(KERN_INFO "skipping autodetection of RAID arrays\n"); - else { - - printk(KERN_INFO "autodetecting RAID arrays\n"); - - for (i=0; i<dev_cnt; i++) { - kdev_t dev = detected_devices[i]; - - if (md_import_device(dev,1)) { - printk(KERN_ALERT "could not import %s!\n", - partition_name(dev)); - continue; - } - /* - * Sanity checks: - */ - rdev = find_rdev_all(dev); - if (!rdev) { - MD_BUG(); - continue; - } - if (rdev->faulty) { - MD_BUG(); - continue; - } - md_list_add(&rdev->pending, &pending_raid_disks); - } - - autorun_devices(); - } - - dev_cnt = -1; /* make sure further calls to md_autodetect_dev are ignored */ -#endif -#ifdef CONFIG_MD_BOOT - md_setup_drive(); -#endif - -} - -static int get_version (void * arg) -{ - mdu_version_t ver; - - ver.major = MD_MAJOR_VERSION; - ver.minor = MD_MINOR_VERSION; - ver.patchlevel = MD_PATCHLEVEL_VERSION; - - if (md_copy_to_user(arg, &ver, sizeof(ver))) - return -EFAULT; - - return 0; -} - -#define SET_FROM_SB(x) info.x = mddev->sb->x -static int get_array_info (mddev_t * mddev, void * arg) -{ - mdu_array_info_t info; - - if (!mddev->sb) - return -EINVAL; - - SET_FROM_SB(major_version); - SET_FROM_SB(minor_version); - SET_FROM_SB(patch_version); - SET_FROM_SB(ctime); - SET_FROM_SB(level); - SET_FROM_SB(size); - SET_FROM_SB(nr_disks); - SET_FROM_SB(raid_disks); - SET_FROM_SB(md_minor); - SET_FROM_SB(not_persistent); - - SET_FROM_SB(utime); - SET_FROM_SB(state); - SET_FROM_SB(active_disks); - SET_FROM_SB(working_disks); - SET_FROM_SB(failed_disks); - SET_FROM_SB(spare_disks); - - SET_FROM_SB(layout); - SET_FROM_SB(chunk_size); - - if (md_copy_to_user(arg, &info, sizeof(info))) - return -EFAULT; - - return 0; -} -#undef SET_FROM_SB - -#define SET_FROM_SB(x) info.x = mddev->sb->disks[nr].x -static int get_disk_info (mddev_t * mddev, void * arg) -{ - mdu_disk_info_t info; - unsigned int nr; - - if (!mddev->sb) - return -EINVAL; - - if (md_copy_from_user(&info, arg, sizeof(info))) - return -EFAULT; - - nr = info.number; - if (nr >= mddev->sb->nr_disks) - return -EINVAL; - - SET_FROM_SB(major); - SET_FROM_SB(minor); - SET_FROM_SB(raid_disk); - SET_FROM_SB(state); - - if (md_copy_to_user(arg, &info, sizeof(info))) - return -EFAULT; - - return 0; -} -#undef SET_FROM_SB - -#define SET_SB(x) mddev->sb->disks[nr].x = info->x - -static int add_new_disk (mddev_t * mddev, mdu_disk_info_t *info) -{ - int err, size, persistent; - mdk_rdev_t *rdev; - unsigned int nr; - kdev_t dev; - dev = MKDEV(info->major,info->minor); - - if (find_rdev_all(dev)) { - printk("device %s already used in a RAID array!\n", - partition_name(dev)); - return -EBUSY; - } - if (!mddev->sb) { - /* expecting a device which has a superblock */ - err = md_import_device(dev, 1); - if (err) { - printk("md error, md_import_device returned %d\n", err); - return -EINVAL; - } - rdev = find_rdev_all(dev); - if (!rdev) { - MD_BUG(); - return -EINVAL; - } - if (mddev->nb_dev) { - mdk_rdev_t *rdev0 = md_list_entry(mddev->disks.next, - mdk_rdev_t, same_set); - if (!uuid_equal(rdev0, rdev)) { - printk("md: %s has different UUID to %s\n", partition_name(rdev->dev), partition_name(rdev0->dev)); - export_rdev(rdev); - return -EINVAL; - } - if 
(!sb_equal(rdev0->sb, rdev->sb)) { - printk("md: %s has same UUID but different superblock to %s\n", partition_name(rdev->dev), partition_name(rdev0->dev)); - export_rdev(rdev); - return -EINVAL; - } - } - bind_rdev_to_array(rdev, mddev); - return 0; - } - - nr = info->number; - if (nr >= mddev->sb->nr_disks) - return -EINVAL; - - SET_SB(number); - SET_SB(major); - SET_SB(minor); - SET_SB(raid_disk); - SET_SB(state); - - if ((info->state & (1<<MD_DISK_FAULTY))==0) { - err = md_import_device (dev, 0); - if (err) { - printk("md: error, md_import_device() returned %d\n", err); - return -EINVAL; - } - rdev = find_rdev_all(dev); - if (!rdev) { - MD_BUG(); - return -EINVAL; - } - - rdev->old_dev = dev; - rdev->desc_nr = info->number; - - bind_rdev_to_array(rdev, mddev); - - persistent = !mddev->sb->not_persistent; - if (!persistent) - printk("nonpersistent superblock ...\n"); - if (!mddev->sb->chunk_size) - printk("no chunksize?\n"); - - size = calc_dev_size(dev, mddev, persistent); - rdev->sb_offset = calc_dev_sboffset(dev, mddev, persistent); - - if (!mddev->sb->size || (mddev->sb->size > size)) - mddev->sb->size = size; - } - - /* - * sync all other superblocks with the main superblock - */ - sync_sbs(mddev); - - return 0; -} -#undef SET_SB - -static int hot_remove_disk (mddev_t * mddev, kdev_t dev) -{ - int err; - mdk_rdev_t *rdev; - mdp_disk_t *disk; - - if (!mddev->pers) - return -ENODEV; - - printk("trying to remove %s from md%d ... \n", - partition_name(dev), mdidx(mddev)); - - if (!mddev->pers->diskop) { - printk("md%d: personality does not support diskops!\n", - mdidx(mddev)); - return -EINVAL; - } - - rdev = find_rdev(mddev, dev); - if (!rdev) - return -ENXIO; - - if (rdev->desc_nr == -1) { - MD_BUG(); - return -EINVAL; - } - disk = &mddev->sb->disks[rdev->desc_nr]; - if (disk_active(disk)) - goto busy; - if (disk_removed(disk)) { - MD_BUG(); - return -EINVAL; - } - - err = mddev->pers->diskop(mddev, &disk, DISKOP_HOT_REMOVE_DISK); - if (err == -EBUSY) - goto busy; - if (err) { - MD_BUG(); - return -EINVAL; - } - - remove_descriptor(disk, mddev->sb); - kick_rdev_from_array(rdev); - mddev->sb_dirty = 1; - md_update_sb(mddev); - - return 0; -busy: - printk("cannot remove active disk %s from md%d ... \n", - partition_name(dev), mdidx(mddev)); - return -EBUSY; -} - -static int hot_add_disk (mddev_t * mddev, kdev_t dev) -{ - int i, err, persistent; - unsigned int size; - mdk_rdev_t *rdev; - mdp_disk_t *disk; - - if (!mddev->pers) - return -ENODEV; - - printk("trying to hot-add %s to md%d ... \n", - partition_name(dev), mdidx(mddev)); - - if (!mddev->pers->diskop) { - printk("md%d: personality does not support diskops!\n", - mdidx(mddev)); - return -EINVAL; - } - - persistent = !mddev->sb->not_persistent; - size = calc_dev_size(dev, mddev, persistent); - - if (size < mddev->sb->size) { - printk("md%d: disk size %d blocks < array size %d\n", - mdidx(mddev), size, mddev->sb->size); - return -ENOSPC; - } - - rdev = find_rdev(mddev, dev); - if (rdev) - return -EBUSY; - - err = md_import_device (dev, 0); - if (err) { - printk("md: error, md_import_device() returned %d\n", err); - return -EINVAL; - } - rdev = find_rdev_all(dev); - if (!rdev) { - MD_BUG(); - return -EINVAL; - } - if (rdev->faulty) { - printk("md: can not hot-add faulty %s disk to md%d!\n", - partition_name(dev), mdidx(mddev)); - err = -EINVAL; - goto abort_export; - } - bind_rdev_to_array(rdev, mddev); - - /* - * The rest should better be atomic, we can have disk failures - * noticed in interrupt contexts ... 
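The SET_SB()/SET_FROM_SB() idiom used throughout these ioctl helpers defines a short-lived macro that copies a batch of identically named fields, then #undefs it so the next function can redefine it with a different target. A minimal standalone version with invented struct types:

#include <stdio.h>

struct sb   { int level, size, raid_disks; };
struct info { int level, size, raid_disks; };

int main(void)
{
    struct sb sb = { 5, 1024, 4 };
    struct info info;

#define SET_FROM_SB(x) info.x = sb.x
    SET_FROM_SB(level);
    SET_FROM_SB(size);
    SET_FROM_SB(raid_disks);
#undef SET_FROM_SB

    printf("level=%d size=%d raid_disks=%d\n",
           info.level, info.size, info.raid_disks);
    return 0;
}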
- */ - rdev->old_dev = dev; - rdev->size = size; - rdev->sb_offset = calc_dev_sboffset(dev, mddev, persistent); - - disk = mddev->sb->disks + mddev->sb->raid_disks; - for (i = mddev->sb->raid_disks; i < MD_SB_DISKS; i++) { - disk = mddev->sb->disks + i; - - if (!disk->major && !disk->minor) - break; - if (disk_removed(disk)) - break; - } - if (i == MD_SB_DISKS) { - printk("md%d: can not hot-add to full array!\n", mdidx(mddev)); - err = -EBUSY; - goto abort_unbind_export; - } - - if (disk_removed(disk)) { - /* - * reuse slot - */ - if (disk->number != i) { - MD_BUG(); - err = -EINVAL; - goto abort_unbind_export; - } - } else { - disk->number = i; - } - - disk->raid_disk = disk->number; - disk->major = MAJOR(dev); - disk->minor = MINOR(dev); - - if (mddev->pers->diskop(mddev, &disk, DISKOP_HOT_ADD_DISK)) { - MD_BUG(); - err = -EINVAL; - goto abort_unbind_export; - } - - mark_disk_spare(disk); - mddev->sb->nr_disks++; - mddev->sb->spare_disks++; - mddev->sb->working_disks++; - - mddev->sb_dirty = 1; - - md_update_sb(mddev); - - /* - * Kick recovery, maybe this spare has to be added to the - * array immediately. - */ - md_recover_arrays(); - - return 0; - -abort_unbind_export: - unbind_rdev_from_array(rdev); - -abort_export: - export_rdev(rdev); - return err; -} - -#define SET_SB(x) mddev->sb->x = info->x -static int set_array_info (mddev_t * mddev, mdu_array_info_t *info) -{ - - if (alloc_array_sb(mddev)) - return -ENOMEM; - - mddev->sb->major_version = MD_MAJOR_VERSION; - mddev->sb->minor_version = MD_MINOR_VERSION; - mddev->sb->patch_version = MD_PATCHLEVEL_VERSION; - mddev->sb->ctime = CURRENT_TIME; - - SET_SB(level); - SET_SB(size); - SET_SB(nr_disks); - SET_SB(raid_disks); - SET_SB(md_minor); - SET_SB(not_persistent); - - SET_SB(state); - SET_SB(active_disks); - SET_SB(working_disks); - SET_SB(failed_disks); - SET_SB(spare_disks); - - SET_SB(layout); - SET_SB(chunk_size); - - mddev->sb->md_magic = MD_SB_MAGIC; - - /* - * Generate a 128 bit UUID - */ - get_random_bytes(&mddev->sb->set_uuid0, 4); - get_random_bytes(&mddev->sb->set_uuid1, 4); - get_random_bytes(&mddev->sb->set_uuid2, 4); - get_random_bytes(&mddev->sb->set_uuid3, 4); - - return 0; -} -#undef SET_SB - -static int set_disk_info (mddev_t * mddev, void * arg) -{ - printk("not yet"); - return -EINVAL; -} - -static int clear_array (mddev_t * mddev) -{ - printk("not yet"); - return -EINVAL; -} - -static int write_raid_info (mddev_t * mddev) -{ - printk("not yet"); - return -EINVAL; -} - -static int protect_array (mddev_t * mddev) -{ - printk("not yet"); - return -EINVAL; -} - -static int unprotect_array (mddev_t * mddev) -{ - printk("not yet"); - return -EINVAL; -} - -static int set_disk_faulty (mddev_t *mddev, kdev_t dev) -{ - int ret; - - fsync_dev(mddev_to_kdev(mddev)); - ret = md_error(mddev_to_kdev(mddev), dev); - return ret; -} - -static int md_ioctl (struct inode *inode, struct file *file, - unsigned int cmd, unsigned long arg) -{ - unsigned int minor; - int err = 0; - struct hd_geometry *loc = (struct hd_geometry *) arg; - mddev_t *mddev = NULL; - kdev_t dev; - - if (!md_capable_admin()) - return -EACCES; - - dev = inode->i_rdev; - minor = MINOR(dev); - if (minor >= MAX_MD_DEVS) - return -EINVAL; - - /* - * Commands dealing with the RAID driver but not any - * particular array: - */ - switch (cmd) - { - case RAID_VERSION: - err = get_version((void *)arg); - goto done; - - case PRINT_RAID_DEBUG: - err = 0; - md_print_devices(); - goto done_unlock; - - case BLKGETSIZE: /* Return device size */ - if (!arg) { - err = -EINVAL; - 
goto abort; - } - err = md_put_user(md_hd_struct[minor].nr_sects, - (long *) arg); - goto done; - - case BLKFLSBUF: - fsync_dev(dev); - invalidate_buffers(dev); - goto done; - - case BLKRASET: - if (arg > 0xff) { - err = -EINVAL; - goto abort; - } - read_ahead[MAJOR(dev)] = arg; - goto done; - - case BLKRAGET: - if (!arg) { - err = -EINVAL; - goto abort; - } - err = md_put_user (read_ahead[ - MAJOR(dev)], (long *) arg); - goto done; - default: - } - - /* - * Commands creating/starting a new array: - */ - - mddev = kdev_to_mddev(dev); - - switch (cmd) - { - case SET_ARRAY_INFO: - case START_ARRAY: - if (mddev) { - printk("array md%d already exists!\n", - mdidx(mddev)); - err = -EEXIST; - goto abort; - } - default: - } - switch (cmd) - { - case SET_ARRAY_INFO: - mddev = alloc_mddev(dev); - if (!mddev) { - err = -ENOMEM; - goto abort; - } - /* - * alloc_mddev() should possibly self-lock. - */ - err = lock_mddev(mddev); - if (err) { - printk("ioctl, reason %d, cmd %d\n", err, cmd); - goto abort; - } - - if (mddev->sb) { - printk("array md%d already has a superblock!\n", - mdidx(mddev)); - err = -EBUSY; - goto abort_unlock; - } - if (arg) { - mdu_array_info_t info; - if (md_copy_from_user(&info, (void*)arg, sizeof(info))) { - err = -EFAULT; - goto abort_unlock; - } - err = set_array_info(mddev, &info); - if (err) { - printk("couldnt set array info. %d\n", err); - goto abort_unlock; - } - } - goto done_unlock; - - case START_ARRAY: - /* - * possibly make it lock the array ... - */ - err = autostart_array((kdev_t)arg); - if (err) { - printk("autostart %s failed!\n", - partition_name((kdev_t)arg)); - goto abort; - } - goto done; - - default: - } - - /* - * Commands querying/configuring an existing array: - */ - - if (!mddev) { - err = -ENODEV; - goto abort; - } - err = lock_mddev(mddev); - if (err) { - printk("ioctl lock interrupted, reason %d, cmd %d\n",err, cmd); - goto abort; - } - /* if we don't have a superblock yet, only ADD_NEW_DISK or STOP_ARRAY is allowed */ - if (!mddev->sb && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY && cmd != RUN_ARRAY) { - err = -ENODEV; - goto abort_unlock; - } - - /* - * Commands even a read-only array can execute: - */ - switch (cmd) - { - case GET_ARRAY_INFO: - err = get_array_info(mddev, (void *)arg); - goto done_unlock; - - case GET_DISK_INFO: - err = get_disk_info(mddev, (void *)arg); - goto done_unlock; - - case RESTART_ARRAY_RW: - err = restart_array(mddev); - goto done_unlock; - - case STOP_ARRAY: - if (!(err = do_md_stop (mddev, 0))) - mddev = NULL; - goto done_unlock; - - case STOP_ARRAY_RO: - err = do_md_stop (mddev, 1); - goto done_unlock; - - /* - * We have a problem here : there is no easy way to give a CHS - * virtual geometry. We currently pretend that we have a 2 heads - * 4 sectors (with a BIG number of cylinders...). This drives - * dosfs just mad... 
;-) - */ - case HDIO_GETGEO: - if (!loc) { - err = -EINVAL; - goto abort_unlock; - } - err = md_put_user (2, (char *) &loc->heads); - if (err) - goto abort_unlock; - err = md_put_user (4, (char *) &loc->sectors); - if (err) - goto abort_unlock; - err = md_put_user (md_hd_struct[mdidx(mddev)].nr_sects/8, - (short *) &loc->cylinders); - if (err) - goto abort_unlock; - err = md_put_user (md_hd_struct[minor].start_sect, - (long *) &loc->start); - goto done_unlock; - } - - /* - * The remaining ioctls are changing the state of the - * superblock, so we do not allow read-only arrays - * here: - */ - if (mddev->ro) { - err = -EROFS; - goto abort_unlock; - } - - switch (cmd) - { - case CLEAR_ARRAY: - err = clear_array(mddev); - goto done_unlock; - - case ADD_NEW_DISK: - { - mdu_disk_info_t info; - if (md_copy_from_user(&info, (void*)arg, sizeof(info))) - err = -EFAULT; - else - err = add_new_disk(mddev, &info); - goto done_unlock; - } - case HOT_REMOVE_DISK: - err = hot_remove_disk(mddev, (kdev_t)arg); - goto done_unlock; - - case HOT_ADD_DISK: - err = hot_add_disk(mddev, (kdev_t)arg); - goto done_unlock; - - case SET_DISK_INFO: - err = set_disk_info(mddev, (void *)arg); - goto done_unlock; - - case WRITE_RAID_INFO: - err = write_raid_info(mddev); - goto done_unlock; - - case UNPROTECT_ARRAY: - err = unprotect_array(mddev); - goto done_unlock; - - case PROTECT_ARRAY: - err = protect_array(mddev); - goto done_unlock; - - case SET_DISK_FAULTY: - err = set_disk_faulty(mddev, (kdev_t)arg); - goto done_unlock; - - case RUN_ARRAY: - { -/* The data is never used.... - mdu_param_t param; - err = md_copy_from_user(¶m, (mdu_param_t *)arg, - sizeof(param)); - if (err) - goto abort_unlock; -*/ - err = do_md_run (mddev); - /* - * we have to clean up the mess if - * the array cannot be run for some - * reason ... - */ - if (err) { - mddev->sb_dirty = 0; - if (!do_md_stop (mddev, 0)) - mddev = NULL; - } - goto done_unlock; - } - - default: - printk(KERN_WARNING "%s(pid %d) used obsolete MD ioctl, upgrade your software to use new ictls.\n", current->comm, current->pid); - err = -EINVAL; - goto abort_unlock; - } - -done_unlock: -abort_unlock: - if (mddev) - unlock_mddev(mddev); - - return err; -done: - if (err) - printk("huh12?\n"); -abort: - return err; -} - -static int md_open (struct inode *inode, struct file *file) -{ - /* - * Always succeed - */ - return (0); -} - -static struct block_device_operations md_fops= -{ - open: md_open, - ioctl: md_ioctl, -}; - - -int md_thread(void * arg) -{ - mdk_thread_t *thread = arg; - - md_lock_kernel(); - exit_mm(current); - exit_files(current); - exit_fs(current); - - /* - * Detach thread - */ - daemonize(); - sprintf(current->comm, thread->name); - md_init_signals(); - md_flush_signals(); - thread->tsk = current; - - /* - * md_thread is a 'system-thread', it's priority should be very - * high. We avoid resource deadlocks individually in each - * raid personality. (RAID5 does preallocation) We also use RR and - * the very same RT priority as kswapd, thus we will never get - * into a priority inversion deadlock. - * - * we definitely have to have equal or higher priority than - * bdflush, otherwise bdflush will deadlock if there are too - * many dirty RAID5 blocks. 
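The HDIO_GETGEO case above invents a 2-head, 4-sector geometry, so cylinders = nr_sects / 8 and the C*H*S product reproduces the device size. The arithmetic checked standalone, with an arbitrary example size:

#include <stdio.h>

int main(void)
{
    unsigned long nr_sects = 17623296;    /* example md device, in sectors */
    unsigned long heads = 2, sectors = 4;
    unsigned long cylinders = nr_sects / (heads * sectors);

    printf("C/H/S = %lu/%lu/%lu (%lu sectors)\n",
           cylinders, heads, sectors, cylinders * heads * sectors);
    return 0;
}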
- */ - current->policy = SCHED_OTHER; - current->nice = -20; -// md_unlock_kernel(); - - up(thread->sem); - - for (;;) { - DECLARE_WAITQUEUE(wait, current); - - add_wait_queue(&thread->wqueue, &wait); - set_task_state(current, TASK_INTERRUPTIBLE); - if (!test_bit(THREAD_WAKEUP, &thread->flags)) { - dprintk("thread %p went to sleep.\n", thread); - schedule(); - dprintk("thread %p woke up.\n", thread); - } - current->state = TASK_RUNNING; - remove_wait_queue(&thread->wqueue, &wait); - clear_bit(THREAD_WAKEUP, &thread->flags); - - if (thread->run) { - thread->run(thread->data); - run_task_queue(&tq_disk); - } else - break; - if (md_signal_pending(current)) { - printk("%8s(%d) flushing signals.\n", current->comm, - current->pid); - md_flush_signals(); - } - } - up(thread->sem); - return 0; -} - -void md_wakeup_thread(mdk_thread_t *thread) -{ - dprintk("waking up MD thread %p.\n", thread); - set_bit(THREAD_WAKEUP, &thread->flags); - wake_up(&thread->wqueue); -} - -mdk_thread_t *md_register_thread (void (*run) (void *), - void *data, const char *name) -{ - mdk_thread_t *thread; - int ret; - DECLARE_MUTEX_LOCKED(sem); - - thread = (mdk_thread_t *) kmalloc - (sizeof(mdk_thread_t), GFP_KERNEL); - if (!thread) - return NULL; - - memset(thread, 0, sizeof(mdk_thread_t)); - md_init_waitqueue_head(&thread->wqueue); - - thread->sem = &sem; - thread->run = run; - thread->data = data; - thread->name = name; - ret = kernel_thread(md_thread, thread, 0); - if (ret < 0) { - kfree(thread); - return NULL; - } - down(&sem); - return thread; -} - -void md_interrupt_thread (mdk_thread_t *thread) -{ - if (!thread->tsk) { - MD_BUG(); - return; - } - printk("interrupting MD-thread pid %d\n", thread->tsk->pid); - send_sig(SIGKILL, thread->tsk, 1); -} - -void md_unregister_thread (mdk_thread_t *thread) -{ - DECLARE_MUTEX_LOCKED(sem); - - thread->sem = &sem; - thread->run = NULL; - thread->name = NULL; - if (!thread->tsk) { - MD_BUG(); - return; - } - md_interrupt_thread(thread); - down(&sem); -} - -void md_recover_arrays (void) -{ - if (!md_recovery_thread) { - MD_BUG(); - return; - } - md_wakeup_thread(md_recovery_thread); -} - - -int md_error (kdev_t dev, kdev_t rdev) -{ - mddev_t *mddev; - mdk_rdev_t * rrdev; - int rc; - - mddev = kdev_to_mddev(dev); -/* printk("md_error dev:(%d:%d), rdev:(%d:%d), (caller: %p,%p,%p,%p).\n",MAJOR(dev),MINOR(dev),MAJOR(rdev),MINOR(rdev), __builtin_return_address(0),__builtin_return_address(1),__builtin_return_address(2),__builtin_return_address(3)); - */ - if (!mddev) { - MD_BUG(); - return 0; - } - rrdev = find_rdev(mddev, rdev); - mark_rdev_faulty(rrdev); - /* - * if recovery was running, stop it now. - */ - if (mddev->pers->stop_resync) - mddev->pers->stop_resync(mddev); - if (mddev->recovery_running) - md_interrupt_thread(md_recovery_thread); - if (mddev->pers->error_handler) { - rc = mddev->pers->error_handler(mddev, rdev); - md_recover_arrays(); - return rc; - } - return 0; -} - -static int status_unused (char * page) -{ - int sz = 0, i = 0; - mdk_rdev_t *rdev; - struct md_list_head *tmp; - - sz += sprintf(page + sz, "unused devices: "); - - ITERATE_RDEV_ALL(rdev,tmp) { - if (!rdev->same_set.next && !rdev->same_set.prev) { - /* - * The device is not yet used by any array. 
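md_thread() above is a flag-and-waitqueue worker: sleep until THREAD_WAKEUP is set, run the work function, clear the flag; md_wakeup_thread() sets the bit and wakes the queue. A userspace analog of the same pattern using pthreads (a sketch of the idiom, not the kernel code; build with -lpthread):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  cond = PTHREAD_COND_INITIALIZER;
static int wakeup, stop;

static void *worker(void *arg)
{
    (void)arg;
    pthread_mutex_lock(&lock);
    for (;;) {
        while (!wakeup && !stop)          /* waitqueue analog */
            pthread_cond_wait(&cond, &lock);
        if (wakeup) {
            wakeup = 0;                   /* clear_bit(THREAD_WAKEUP, ...) */
            printf("worker: running one pass\n");
        }
        if (stop)
            break;
    }
    pthread_mutex_unlock(&lock);
    return NULL;
}

static void wake(int quit)
{
    pthread_mutex_lock(&lock);
    if (quit)
        stop = 1;
    else
        wakeup = 1;                       /* md_wakeup_thread() analog */
    pthread_cond_signal(&cond);
    pthread_mutex_unlock(&lock);
}

int main(void)
{
    pthread_t t;

    pthread_create(&t, NULL, worker, NULL);
    wake(0);                              /* trigger one pass */
    wake(1);                              /* shut down, like unregister */
    pthread_join(t, NULL);
    return 0;
}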
- */ - i++; - sz += sprintf(page + sz, "%s ", - partition_name(rdev->dev)); - } - } - if (!i) - sz += sprintf(page + sz, "<none>"); - - sz += sprintf(page + sz, "\n"); - return sz; -} - - -static int status_resync (char * page, mddev_t * mddev) -{ - int sz = 0; - unsigned long max_blocks, resync, res, dt, db, rt; - - resync = mddev->curr_resync - atomic_read(&mddev->recovery_active); - max_blocks = mddev->sb->size; - - /* - * Should not happen. - */ - if (!max_blocks) { - MD_BUG(); - return 0; - } - res = (resync/1024)*1000/(max_blocks/1024 + 1); - { - int i, x = res/50, y = 20-x; - sz += sprintf(page + sz, "["); - for (i = 0; i < x; i++) - sz += sprintf(page + sz, "="); - sz += sprintf(page + sz, ">"); - for (i = 0; i < y; i++) - sz += sprintf(page + sz, "."); - sz += sprintf(page + sz, "] "); - } - if (!mddev->recovery_running) - /* - * true resync - */ - sz += sprintf(page + sz, " resync =%3lu.%lu%% (%lu/%lu)", - res/10, res % 10, resync, max_blocks); - else - /* - * recovery ... - */ - sz += sprintf(page + sz, " recovery =%3lu.%lu%% (%lu/%lu)", - res/10, res % 10, resync, max_blocks); - - /* - * We do not want to overflow, so the order of operands and - * the * 100 / 100 trick are important. We do a +1 to be - * safe against division by zero. We only estimate anyway. - * - * dt: time from mark until now - * db: blocks written from mark until now - * rt: remaining time - */ - dt = ((jiffies - mddev->resync_mark) / HZ); - if (!dt) dt++; - db = resync - mddev->resync_mark_cnt; - rt = (dt * ((max_blocks-resync) / (db/100+1)))/100; - - sz += sprintf(page + sz, " finish=%lu.%lumin", rt / 60, (rt % 60)/6); - - sz += sprintf(page + sz, " speed=%ldK/sec", db/dt); - - return sz; -} - -static int md_status_read_proc(char *page, char **start, off_t off, - int count, int *eof, void *data) -{ - int sz = 0, j, size; - struct md_list_head *tmp, *tmp2; - mdk_rdev_t *rdev; - mddev_t *mddev; - - sz += sprintf(page + sz, "Personalities : "); - for (j = 0; j < MAX_PERSONALITY; j++) - if (pers[j]) - sz += sprintf(page+sz, "[%s] ", pers[j]->name); - - sz += sprintf(page+sz, "\n"); - - - sz += sprintf(page+sz, "read_ahead "); - if (read_ahead[MD_MAJOR] == INT_MAX) - sz += sprintf(page+sz, "not set\n"); - else - sz += sprintf(page+sz, "%d sectors\n", read_ahead[MD_MAJOR]); - - ITERATE_MDDEV(mddev,tmp) { - sz += sprintf(page + sz, "md%d : %sactive", mdidx(mddev), - mddev->pers ? 
"" : "in"); - if (mddev->pers) { - if (mddev->ro) - sz += sprintf(page + sz, " (read-only)"); - sz += sprintf(page + sz, " %s", mddev->pers->name); - } - - size = 0; - ITERATE_RDEV(mddev,rdev,tmp2) { - sz += sprintf(page + sz, " %s[%d]", - partition_name(rdev->dev), rdev->desc_nr); - if (rdev->faulty) { - sz += sprintf(page + sz, "(F)"); - continue; - } - size += rdev->size; - } - - if (mddev->nb_dev) { - if (mddev->pers) - sz += sprintf(page + sz, "\n %d blocks", - md_size[mdidx(mddev)]); - else - sz += sprintf(page + sz, "\n %d blocks", size); - } - - if (!mddev->pers) { - sz += sprintf(page+sz, "\n"); - continue; - } - - sz += mddev->pers->status (page+sz, mddev); - - sz += sprintf(page+sz, "\n "); - if (mddev->curr_resync) { - sz += status_resync (page+sz, mddev); - } else { - if (md_atomic_read(&mddev->resync_sem.count) != 1) - sz += sprintf(page + sz, " resync=DELAYED"); - } - sz += sprintf(page + sz, "\n"); - } - sz += status_unused (page + sz); - - return sz; -} - -int register_md_personality (int pnum, mdk_personality_t *p) -{ - if (pnum >= MAX_PERSONALITY) - return -EINVAL; - - if (pers[pnum]) - return -EBUSY; - - pers[pnum] = p; - printk(KERN_INFO "%s personality registered\n", p->name); - return 0; -} - -int unregister_md_personality (int pnum) -{ - if (pnum >= MAX_PERSONALITY) - return -EINVAL; - - printk(KERN_INFO "%s personality unregistered\n", pers[pnum]->name); - pers[pnum] = NULL; - return 0; -} - -static mdp_disk_t *get_spare(mddev_t *mddev) -{ - mdp_super_t *sb = mddev->sb; - mdp_disk_t *disk; - mdk_rdev_t *rdev; - struct md_list_head *tmp; - - ITERATE_RDEV(mddev,rdev,tmp) { - if (rdev->faulty) - continue; - if (!rdev->sb) { - MD_BUG(); - continue; - } - disk = &sb->disks[rdev->desc_nr]; - if (disk_faulty(disk)) { - MD_BUG(); - continue; - } - if (disk_active(disk)) - continue; - return disk; - } - return NULL; -} - -static unsigned int sync_io[DK_MAX_MAJOR][DK_MAX_DISK]; -void md_sync_acct(kdev_t dev, unsigned long nr_sectors) -{ - unsigned int major = MAJOR(dev); - unsigned int index; - - index = disk_index(dev); - if ((index >= DK_MAX_DISK) || (major >= DK_MAX_MAJOR)) - return; - - sync_io[major][index] += nr_sectors; -} - -static int is_mddev_idle (mddev_t *mddev) -{ - mdk_rdev_t * rdev; - struct md_list_head *tmp; - int idle; - unsigned long curr_events; - - idle = 1; - ITERATE_RDEV(mddev,rdev,tmp) { - int major = MAJOR(rdev->dev); - int idx = disk_index(rdev->dev); - - if ((idx >= DK_MAX_DISK) || (major >= DK_MAX_MAJOR)) - continue; - - curr_events = kstat.dk_drive_rblk[major][idx] + - kstat.dk_drive_wblk[major][idx] ; - curr_events -= sync_io[major][idx]; -// printk("events(major: %d, idx: %d): %ld\n", major, idx, curr_events); - if (curr_events != rdev->last_events) { -// printk("!I(%ld)", curr_events - rdev->last_events); - rdev->last_events = curr_events; - idle = 0; - } - } - return idle; -} - -MD_DECLARE_WAIT_QUEUE_HEAD(resync_wait); - -void md_done_sync(mddev_t *mddev, int blocks, int ok) -{ - /* another "blocks" (1K) blocks have been synced */ - atomic_sub(blocks, &mddev->recovery_active); - wake_up(&mddev->recovery_wait); - if (!ok) { - // stop recovery, signal do_sync .... 
- } -} - -#define SYNC_MARKS 10 -#define SYNC_MARK_STEP (3*HZ) -int md_do_sync(mddev_t *mddev, mdp_disk_t *spare) -{ - mddev_t *mddev2; - unsigned int max_blocks, currspeed, - j, window, err, serialize; - kdev_t read_disk = mddev_to_kdev(mddev); - unsigned long mark[SYNC_MARKS]; - unsigned long mark_cnt[SYNC_MARKS]; - int last_mark,m; - struct md_list_head *tmp; - unsigned long last_check; - - - err = down_interruptible(&mddev->resync_sem); - if (err) - goto out_nolock; - -recheck: - serialize = 0; - ITERATE_MDDEV(mddev2,tmp) { - if (mddev2 == mddev) - continue; - if (mddev2->curr_resync && match_mddev_units(mddev,mddev2)) { - printk(KERN_INFO "md: serializing resync, md%d has overlapping physical units with md%d!\n", mdidx(mddev), mdidx(mddev2)); - serialize = 1; - break; - } - } - if (serialize) { - interruptible_sleep_on(&resync_wait); - if (md_signal_pending(current)) { - md_flush_signals(); - err = -EINTR; - goto out; - } - goto recheck; - } - - mddev->curr_resync = 1; - - max_blocks = mddev->sb->size; - - printk(KERN_INFO "md: syncing RAID array md%d\n", mdidx(mddev)); - printk(KERN_INFO "md: minimum _guaranteed_ reconstruction speed: %d KB/sec/disc.\n", - sysctl_speed_limit_min); - printk(KERN_INFO "md: using maximum available idle IO bandwith (but not more than %d KB/sec) for reconstruction.\n", sysctl_speed_limit_max); - - /* - * Resync has low priority. - */ - current->nice = 19; - - is_mddev_idle(mddev); /* this also initializes IO event counters */ - for (m = 0; m < SYNC_MARKS; m++) { - mark[m] = jiffies; - mark_cnt[m] = 0; - } - last_mark = 0; - mddev->resync_mark = mark[last_mark]; - mddev->resync_mark_cnt = mark_cnt[last_mark]; - - /* - * Tune reconstruction: - */ - window = MAX_READAHEAD*(PAGE_SIZE/1024); - printk(KERN_INFO "md: using %dk window, over a total of %d blocks.\n",window,max_blocks); - - atomic_set(&mddev->recovery_active, 0); - init_waitqueue_head(&mddev->recovery_wait); - last_check = 0; - for (j = 0; j < max_blocks;) { - int blocks; - - blocks = mddev->pers->sync_request(mddev, j); - - if (blocks < 0) { - err = blocks; - goto out; - } - atomic_add(blocks, &mddev->recovery_active); - j += blocks; - mddev->curr_resync = j; - - if (last_check + window > j) - continue; - - run_task_queue(&tq_disk); //?? - - if (jiffies >= mark[last_mark] + SYNC_MARK_STEP ) { - /* step marks */ - int next = (last_mark+1) % SYNC_MARKS; - - mddev->resync_mark = mark[next]; - mddev->resync_mark_cnt = mark_cnt[next]; - mark[next] = jiffies; - mark_cnt[next] = j - atomic_read(&mddev->recovery_active); - last_mark = next; - } - - - if (md_signal_pending(current)) { - /* - * got a signal, exit. - */ - mddev->curr_resync = 0; - printk("md_do_sync() got signal ... exiting\n"); - md_flush_signals(); - err = -EINTR; - goto out; - } - - /* - * this loop exits only if either when we are slower than - * the 'hard' speed limit, or the system was IO-idle for - * a jiffy. - * the system might be non-idle CPU-wise, but we only care - * about not overloading the IO subsystem. 
(things like an - * e2fsck being done on the RAID array should execute fast) - */ -repeat: - if (md_need_resched(current)) - schedule(); - - currspeed = (j-mddev->resync_mark_cnt)/((jiffies-mddev->resync_mark)/HZ +1) +1; - - if (currspeed > sysctl_speed_limit_min) { - current->nice = 19; - - if ((currspeed > sysctl_speed_limit_max) || - !is_mddev_idle(mddev)) { - current->state = TASK_INTERRUPTIBLE; - md_schedule_timeout(HZ/4); - if (!md_signal_pending(current)) - goto repeat; - } - } else - current->nice = -20; - } - fsync_dev(read_disk); - printk(KERN_INFO "md: md%d: sync done.\n",mdidx(mddev)); - err = 0; - /* - * this also signals 'finished resyncing' to md_stop - */ -out: - wait_event(mddev->recovery_wait, atomic_read(&mddev->recovery_active)==0); - up(&mddev->resync_sem); -out_nolock: - mddev->curr_resync = 0; - wake_up(&resync_wait); - return err; -} - - -/* - * This is a kernel thread which syncs a spare disk with the active array - * - * the amount of foolproofing might seem to be a tad excessive, but an - * early (not so error-safe) version of raid1syncd synced the first 0.5 gigs - * of my root partition with the first 0.5 gigs of my /home partition ... so - * i'm a bit nervous ;) - */ -void md_do_recovery (void *data) -{ - int err; - mddev_t *mddev; - mdp_super_t *sb; - mdp_disk_t *spare; - struct md_list_head *tmp; - - printk(KERN_INFO "md: recovery thread got woken up ...\n"); -restart: - ITERATE_MDDEV(mddev,tmp) { - sb = mddev->sb; - if (!sb) - continue; - if (mddev->recovery_running) - continue; - if (sb->active_disks == sb->raid_disks) - continue; - if (!sb->spare_disks) { - printk(KERN_ERR "md%d: no spare disk to reconstruct array! -- continuing in degraded mode\n", mdidx(mddev)); - continue; - } - /* - * now here we get the spare and resync it. - */ - if ((spare = get_spare(mddev)) == NULL) - continue; - printk(KERN_INFO "md%d: resyncing spare disk %s to replace failed disk\n", mdidx(mddev), partition_name(MKDEV(spare->major,spare->minor))); - if (!mddev->pers->diskop) - continue; - if (mddev->pers->diskop(mddev, &spare, DISKOP_SPARE_WRITE)) - continue; - down(&mddev->recovery_sem); - mddev->recovery_running = 1; - err = md_do_sync(mddev, spare); - if (err == -EIO) { - printk(KERN_INFO "md%d: spare disk %s failed, skipping to next spare.\n", mdidx(mddev), partition_name(MKDEV(spare->major,spare->minor))); - if (!disk_faulty(spare)) { - mddev->pers->diskop(mddev,&spare,DISKOP_SPARE_INACTIVE); - mark_disk_faulty(spare); - mark_disk_nonsync(spare); - mark_disk_inactive(spare); - sb->spare_disks--; - sb->working_disks--; - sb->failed_disks++; - } - } else - if (disk_faulty(spare)) - mddev->pers->diskop(mddev, &spare, - DISKOP_SPARE_INACTIVE); - if (err == -EINTR || err == -ENOMEM) { - /* - * Recovery got interrupted, or ran out of mem ... - * signal back that we have finished using the array. 
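The SYNC_MARKS ring above keeps ten (time, blocks-done) samples stepped every three seconds, so currspeed is always measured over roughly the last thirty seconds rather than since the resync began; that is what lets the throttle react to changing load. A standalone sketch of the ring stepping, with invented progress numbers and seconds standing in for jiffies:

#include <stdio.h>

#define SYNC_MARKS     10
#define SYNC_MARK_STEP 3        /* seconds, standing in for 3*HZ */

int main(void)
{
        unsigned long mark[SYNC_MARKS] = { 0 }, mark_cnt[SYNC_MARKS] = { 0 };
        unsigned long now = 0, done = 0;
        int last_mark = 0;

        for (int tick = 1; tick <= 20; tick++) {
                now += SYNC_MARK_STEP;
                done += 6000 + 100 * tick;      /* pretend progress */

                /* step marks: recycle the oldest slot, measure against it */
                int next = (last_mark + 1) % SYNC_MARKS;
                unsigned long resync_mark = mark[next];
                unsigned long resync_mark_cnt = mark_cnt[next];
                mark[next] = now;
                mark_cnt[next] = done;
                last_mark = next;

                unsigned long speed = (done - resync_mark_cnt) /
                                      (now - resync_mark + 1) + 1;
                printf("t=%3lus  speed=%luK/sec\n", now, speed);
        }
        return 0;
}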
- */ - mddev->pers->diskop(mddev, &spare, - DISKOP_SPARE_INACTIVE); - up(&mddev->recovery_sem); - mddev->recovery_running = 0; - continue; - } else { - mddev->recovery_running = 0; - up(&mddev->recovery_sem); - } - if (!disk_faulty(spare)) { - /* - * the SPARE_ACTIVE diskop possibly changes the - * pointer too - */ - mddev->pers->diskop(mddev, &spare, DISKOP_SPARE_ACTIVE); - mark_disk_sync(spare); - mark_disk_active(spare); - sb->active_disks++; - sb->spare_disks--; - } - mddev->sb_dirty = 1; - md_update_sb(mddev); - goto restart; - } - printk(KERN_INFO "md: recovery thread finished ...\n"); - -} - -int md_notify_reboot(struct notifier_block *this, - unsigned long code, void *x) -{ - struct md_list_head *tmp; - mddev_t *mddev; - - if ((code == MD_SYS_DOWN) || (code == MD_SYS_HALT) - || (code == MD_SYS_POWER_OFF)) { - - printk(KERN_INFO "stopping all md devices.\n"); - - ITERATE_MDDEV(mddev,tmp) - do_md_stop (mddev, 1); - /* - * certain more exotic SCSI devices are known to be - * volatile wrt too early system reboots. While the - * right place to handle this issue is the given - * driver, we do want to have a safe RAID driver ... - */ - md_mdelay(1000*1); - } - return NOTIFY_DONE; -} - -struct notifier_block md_notifier = { - md_notify_reboot, - NULL, - 0 -}; -#ifndef MODULE -static int md__init raid_setup(char *str) -{ - int len, pos; - - len = strlen(str) + 1; - pos = 0; - - while (pos < len) { - char *comma = strchr(str+pos, ','); - int wlen; - if (comma) - wlen = (comma-str)-pos; - else wlen = (len-1)-pos; - - if (strncmp(str, "noautodetect", wlen) == 0) - raid_setup_args.noautodetect = 1; - pos += wlen+1; - } - raid_setup_args.set = 1; - return 1; -} -__setup("raid=", raid_setup); -#endif -static void md_geninit (void) -{ - int i; - - for(i = 0; i < MAX_MD_DEVS; i++) { - md_blocksizes[i] = 1024; - md_size[i] = 0; - md_hardsect_sizes[i] = 512; - md_maxreadahead[i] = MD_READAHEAD; - register_disk(&md_gendisk, MKDEV(MAJOR_NR,i), 1, &md_fops, 0); - } - blksize_size[MAJOR_NR] = md_blocksizes; - blk_size[MAJOR_NR] = md_size; - max_readahead[MAJOR_NR] = md_maxreadahead; - hardsect_size[MAJOR_NR] = md_hardsect_sizes; - - printk("md.c: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t)); - -#ifdef CONFIG_PROC_FS - create_proc_read_entry("mdstat", 0, NULL, md_status_read_proc, NULL); -#endif -} -void hsm_init (void); -void translucent_init (void); -void linear_init (void); -void raid0_init (void); -void raid1_init (void); -void raid5_init (void); - -int md__init md_init (void) -{ - static char * name = "mdrecoveryd"; - - printk (KERN_INFO "md driver %d.%d.%d MAX_MD_DEVS=%d, MAX_REAL=%d\n", - MD_MAJOR_VERSION, MD_MINOR_VERSION, - MD_PATCHLEVEL_VERSION, MAX_MD_DEVS, MAX_REAL); - - if (devfs_register_blkdev (MAJOR_NR, "md", &md_fops)) - { - printk (KERN_ALERT "Unable to get major %d for md\n", MAJOR_NR); - return (-1); - } - devfs_handle = devfs_mk_dir (NULL, "md", NULL); - devfs_register_series (devfs_handle, "%u",MAX_MD_DEVS,DEVFS_FL_DEFAULT, - MAJOR_NR, 0, S_IFBLK | S_IRUSR | S_IWUSR, - &md_fops, NULL); - - /* forward all md request to md_make_request */ - blk_queue_make_request(BLK_DEFAULT_QUEUE(MAJOR_NR), md_make_request); - - - read_ahead[MAJOR_NR] = INT_MAX; - md_gendisk.next = gendisk_head; - - gendisk_head = &md_gendisk; - - md_recovery_thread = md_register_thread(md_do_recovery, NULL, name); - if (!md_recovery_thread) - printk(KERN_ALERT "bug: couldn't allocate md_recovery_thread\n"); - - md_register_reboot_notifier(&md_notifier); - raid_table_header = 
register_sysctl_table(raid_root_table, 1); - -#ifdef CONFIG_MD_LINEAR - linear_init (); -#endif -#ifdef CONFIG_MD_RAID0 - raid0_init (); -#endif -#ifdef CONFIG_MD_RAID1 - raid1_init (); -#endif -#ifdef CONFIG_MD_RAID5 - raid5_init (); -#endif - md_geninit(); - return (0); -} - -#ifdef CONFIG_MD_BOOT -#define MAX_MD_BOOT_DEVS 8 -struct { - unsigned long set; - int pers[MAX_MD_BOOT_DEVS]; - int chunk[MAX_MD_BOOT_DEVS]; - kdev_t devices[MAX_MD_BOOT_DEVS][MAX_REAL]; -} md_setup_args md__initdata; - -/* - * Parse the command-line parameters given our kernel, but do not - * actually try to invoke the MD device now; that is handled by - * md_setup_drive after the low-level disk drivers have initialised. - * - * 27/11/1999: Fixed to work correctly with the 2.3 kernel (which - * assigns the task of parsing integer arguments to the - * invoked program now). Added ability to initialise all - * the MD devices (by specifying multiple "md=" lines) - * instead of just one. -- KTK - * 18May2000: Added support for persistant-superblock arrays: - * md=n,0,factor,fault,device-list uses RAID0 for device n - * md=n,-1,factor,fault,device-list uses LINEAR for device n - * md=n,device-list reads a RAID superblock from the devices - * elements in device-list are read by name_to_kdev_t so can be - * a hex number or something like /dev/hda1 /dev/sdb - */ -extern kdev_t name_to_kdev_t(char *line) md__init; -static int md__init md_setup(char *str) -{ - int minor, level, factor, fault, i=0; - kdev_t device; - char *devnames, *pername = ""; - - if(get_option(&str, &minor) != 2) { /* MD Number */ - printk("md: Too few arguments supplied to md=.\n"); - return 0; - } - if (minor >= MAX_MD_BOOT_DEVS) { - printk ("md: Minor device number too high.\n"); - return 0; - } else if (md_setup_args.set & (1 << minor)) { - printk ("md: Warning - md=%d,... has been specified twice;\n" - " will discard the first definition.\n", minor); - } - switch(get_option(&str, &level)) { /* RAID Personality */ - case 2: /* could be 0 or -1.. 
*/ - if (level == 0 || level == -1) { - if (get_option(&str, &factor) != 2 || /* Chunk Size */ - get_option(&str, &fault) != 2) { - printk("md: Too few arguments supplied to md=.\n"); - return 0; - } - md_setup_args.pers[minor] = level; - md_setup_args.chunk[minor] = 1 << (factor+12); - switch(level) { - case -1: - level = LINEAR; - pername = "linear"; - break; - case 0: - level = RAID0; - pername = "raid0"; - break; - default: - printk ("md: The kernel has not been configured for raid%d" - " support!\n", level); - return 0; - } - md_setup_args.pers[minor] = level; - break; - } - /* FALL THROUGH */ - case 1: /* the first device is numeric */ - md_setup_args.devices[minor][i++] = level; - /* FALL THROUGH */ - case 0: - md_setup_args.pers[minor] = 0; - pername="super-block"; - } - devnames = str; - for (; i<MAX_REAL && str; i++) { - if ((device = name_to_kdev_t(str))) { - md_setup_args.devices[minor][i] = device; - } else { - printk ("md: Unknown device name, %s.\n", str); - return 0; - } - if ((str = strchr(str, ',')) != NULL) - str++; - } - if (!i) { - printk ("md: No devices specified for md%d?\n", minor); - return 0; - } - - printk ("md: Will configure md%d (%s) from %s, below.\n", - minor, pername, devnames); - md_setup_args.devices[minor][i] = (kdev_t) 0; - md_setup_args.set |= (1 << minor); - return 1; -} - -void md__init md_setup_drive(void) -{ - int minor, i; - kdev_t dev; - mddev_t*mddev; - - for (minor = 0; minor < MAX_MD_BOOT_DEVS; minor++) { - mdu_disk_info_t dinfo; - int err=0; - if (!(md_setup_args.set & (1 << minor))) - continue; - printk("md: Loading md%d.\n", minor); - mddev = alloc_mddev(MKDEV(MD_MAJOR,minor)); - if (md_setup_args.pers[minor]) { - /* non-persistent */ - mdu_array_info_t ainfo; - ainfo.level = pers_to_level(md_setup_args.pers[minor]); - ainfo.size = 0; - ainfo.nr_disks =0; - ainfo.raid_disks =0; - ainfo.md_minor =minor; - ainfo.not_persistent = 1; - - ainfo.state = MD_SB_CLEAN; - ainfo.active_disks = 0; - ainfo.working_disks = 0; - ainfo.failed_disks = 0; - ainfo.spare_disks = 0; - ainfo.layout = 0; - ainfo.chunk_size = md_setup_args.chunk[minor]; - err = set_array_info(mddev, &ainfo); - for (i=0; !err && (dev = md_setup_args.devices[minor][i]); i++) { - dinfo.number = i; - dinfo.raid_disk = i; - dinfo.state = (1<<MD_DISK_ACTIVE)|(1<<MD_DISK_SYNC); - dinfo.major = MAJOR(dev); - dinfo.minor = MINOR(dev); - mddev->sb->nr_disks++; - mddev->sb->raid_disks++; - mddev->sb->active_disks++; - mddev->sb->working_disks++; - err = add_new_disk (mddev, &dinfo); - } - } else { - /* persistent */ - for (i = 0; (dev = md_setup_args.devices[minor][i]); i++) { - dinfo.major = MAJOR(dev); - dinfo.minor = MINOR(dev); - add_new_disk (mddev, &dinfo); - } - } - if (!err) - err = do_md_run(mddev); - if (err) { - mddev->sb_dirty = 0; - do_md_stop(mddev, 0); - printk("md: starting md%d failed\n", minor); - } - } -} - -__setup("md=", md_setup); -#endif - -#ifdef MODULE -int init_module (void) -{ - return md_init(); -} - -static void free_device_names(void) -{ - while (device_names.next != &device_names) { - struct list_head *tmp = device_names.next; - list_del(tmp); - kfree(tmp); - } -} - - -void cleanup_module (void) -{ - struct gendisk **gendisk_ptr; - - md_unregister_thread(md_recovery_thread); - devfs_unregister(devfs_handle); - - devfs_unregister_blkdev(MAJOR_NR,"md"); - unregister_reboot_notifier(&md_notifier); - unregister_sysctl_table(raid_table_header); -#ifdef CONFIG_PROC_FS - remove_proc_entry("mdstat", NULL); -#endif - - gendisk_ptr = &gendisk_head; - while 
(*gendisk_ptr) { - if (*gendisk_ptr == &md_gendisk) { - *gendisk_ptr = md_gendisk.next; - break; - } - gendisk_ptr = & (*gendisk_ptr)->next; - } - blk_dev[MAJOR_NR].queue = NULL; - blksize_size[MAJOR_NR] = NULL; - blk_size[MAJOR_NR] = NULL; - max_readahead[MAJOR_NR] = NULL; - hardsect_size[MAJOR_NR] = NULL; - - free_device_names(); - -} -#endif - -MD_EXPORT_SYMBOL(md_size); -MD_EXPORT_SYMBOL(register_md_personality); -MD_EXPORT_SYMBOL(unregister_md_personality); -MD_EXPORT_SYMBOL(partition_name); -MD_EXPORT_SYMBOL(md_error); -MD_EXPORT_SYMBOL(md_do_sync); -MD_EXPORT_SYMBOL(md_sync_acct); -MD_EXPORT_SYMBOL(md_done_sync); -MD_EXPORT_SYMBOL(md_recover_arrays); -MD_EXPORT_SYMBOL(md_register_thread); -MD_EXPORT_SYMBOL(md_unregister_thread); -MD_EXPORT_SYMBOL(md_update_sb); -MD_EXPORT_SYMBOL(md_wakeup_thread); -MD_EXPORT_SYMBOL(md_print_devices); -MD_EXPORT_SYMBOL(find_rdev_nr); -MD_EXPORT_SYMBOL(md_interrupt_thread); -MD_EXPORT_SYMBOL(mddev_map); -MD_EXPORT_SYMBOL(md_check_ordering); - diff --git a/drivers/block/paride/paride.h b/drivers/block/paride/paride.h index e08e07e65..c656858a7 100644 --- a/drivers/block/paride/paride.h +++ b/drivers/block/paride/paride.h @@ -1,3 +1,6 @@ +#ifndef __DRIVERS_PARIDE_H__ +#define __DRIVERS_PARIDE_H__ + /* paride.h (c) 1997-8 Grant R. Guenther <grant@torque.net> Under the terms of the GPL. @@ -161,4 +164,5 @@ typedef struct pi_protocol PIP; extern int pi_register( PIP * ); extern void pi_unregister ( PIP * ); +#endif /* __DRIVERS_PARIDE_H__ */ /* end of paride.h */ diff --git a/drivers/block/raid0.c b/drivers/block/raid0.c deleted file mode 100644 index 09f3f8547..000000000 --- a/drivers/block/raid0.c +++ /dev/null @@ -1,356 +0,0 @@ -/* - raid0.c : Multiple Devices driver for Linux - Copyright (C) 1994-96 Marc ZYNGIER - <zyngier@ufr-info-p7.ibp.fr> or - <maz@gloups.fdn.fr> - Copyright (C) 1999, 2000 Ingo Molnar, Red Hat - - - RAID-0 management functions. - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2, or (at your option) - any later version. - - You should have received a copy of the GNU General Public License - (for example /usr/src/linux/COPYING); if not, write to the Free - Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
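One detail of the md= boot syntax parsed a few hunks back is that the chunk field is a shift factor, not a size: the parser stores 1 << (factor+12) bytes, so a line such as md=0,0,4,0,/dev/hda1,/dev/hdb1 (my example) requests 64 KiB chunks. A small table generated from that formula:

#include <stdio.h>

int main(void)
{
        for (int factor = 0; factor <= 6; factor++)
                printf("factor %d -> chunk %d KiB\n",
                       factor, (1 << (factor + 12)) / 1024);
        return 0;
}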
-*/ - -#include <linux/module.h> -#include <linux/raid/raid0.h> - -#define MAJOR_NR MD_MAJOR -#define MD_DRIVER -#define MD_PERSONALITY - -static int create_strip_zones (mddev_t *mddev) -{ - int i, c, j, j1, j2; - unsigned long current_offset, curr_zone_offset; - raid0_conf_t *conf = mddev_to_conf(mddev); - mdk_rdev_t *smallest, *rdev1, *rdev2, *rdev; - - /* - * The number of 'same size groups' - */ - conf->nr_strip_zones = 0; - - ITERATE_RDEV_ORDERED(mddev,rdev1,j1) { - printk("raid0: looking at %s\n", partition_name(rdev1->dev)); - c = 0; - ITERATE_RDEV_ORDERED(mddev,rdev2,j2) { - printk("raid0: comparing %s(%ld) with %s(%ld)\n", partition_name(rdev1->dev), rdev1->size, partition_name(rdev2->dev), rdev2->size); - if (rdev2 == rdev1) { - printk("raid0: END\n"); - break; - } - if (rdev2->size == rdev1->size) - { - /* - * Not unique, dont count it as a new - * group - */ - printk("raid0: EQUAL\n"); - c = 1; - break; - } - printk("raid0: NOT EQUAL\n"); - } - if (!c) { - printk("raid0: ==> UNIQUE\n"); - conf->nr_strip_zones++; - printk("raid0: %d zones\n", conf->nr_strip_zones); - } - } - printk("raid0: FINAL %d zones\n", conf->nr_strip_zones); - - conf->strip_zone = vmalloc(sizeof(struct strip_zone)* - conf->nr_strip_zones); - if (!conf->strip_zone) - return 1; - - - conf->smallest = NULL; - current_offset = 0; - curr_zone_offset = 0; - - for (i = 0; i < conf->nr_strip_zones; i++) - { - struct strip_zone *zone = conf->strip_zone + i; - - printk("zone %d\n", i); - zone->dev_offset = current_offset; - smallest = NULL; - c = 0; - - ITERATE_RDEV_ORDERED(mddev,rdev,j) { - - printk(" checking %s ...", partition_name(rdev->dev)); - if (rdev->size > current_offset) - { - printk(" contained as device %d\n", c); - zone->dev[c] = rdev; - c++; - if (!smallest || (rdev->size <smallest->size)) { - smallest = rdev; - printk(" (%ld) is smallest!.\n", rdev->size); - } - } else - printk(" nope.\n"); - } - - zone->nb_dev = c; - zone->size = (smallest->size - current_offset) * c; - printk(" zone->nb_dev: %d, size: %ld\n",zone->nb_dev,zone->size); - - if (!conf->smallest || (zone->size < conf->smallest->size)) - conf->smallest = zone; - - zone->zone_offset = curr_zone_offset; - curr_zone_offset += zone->size; - - current_offset = smallest->size; - printk("current zone offset: %ld\n", current_offset); - } - printk("done.\n"); - return 0; -} - -static int raid0_run (mddev_t *mddev) -{ - unsigned long cur=0, i=0, size, zone0_size, nb_zone; - raid0_conf_t *conf; - - MOD_INC_USE_COUNT; - - conf = vmalloc(sizeof (raid0_conf_t)); - if (!conf) - goto out; - mddev->private = (void *)conf; - - if (md_check_ordering(mddev)) { - printk("raid0: disks are not ordered, aborting!\n"); - goto out_free_conf; - } - - if (create_strip_zones (mddev)) - goto out_free_conf; - - printk("raid0 : md_size is %d blocks.\n", md_size[mdidx(mddev)]); - printk("raid0 : conf->smallest->size is %ld blocks.\n", conf->smallest->size); - nb_zone = md_size[mdidx(mddev)]/conf->smallest->size + - (md_size[mdidx(mddev)] % conf->smallest->size ? 
1 : 0); - printk("raid0 : nb_zone is %ld.\n", nb_zone); - conf->nr_zones = nb_zone; - - printk("raid0 : Allocating %ld bytes for hash.\n", - nb_zone*sizeof(struct raid0_hash)); - - conf->hash_table = vmalloc (sizeof (struct raid0_hash)*nb_zone); - if (!conf->hash_table) - goto out_free_zone_conf; - size = conf->strip_zone[cur].size; - - i = 0; - while (cur < conf->nr_strip_zones) { - conf->hash_table[i].zone0 = conf->strip_zone + cur; - - /* - * If we completely fill the slot - */ - if (size >= conf->smallest->size) { - conf->hash_table[i++].zone1 = NULL; - size -= conf->smallest->size; - - if (!size) { - if (++cur == conf->nr_strip_zones) - continue; - size = conf->strip_zone[cur].size; - } - continue; - } - if (++cur == conf->nr_strip_zones) { - /* - * Last dev, set unit1 as NULL - */ - conf->hash_table[i].zone1=NULL; - continue; - } - - /* - * Here we use a 2nd dev to fill the slot - */ - zone0_size = size; - size = conf->strip_zone[cur].size; - conf->hash_table[i++].zone1 = conf->strip_zone + cur; - size -= (conf->smallest->size - zone0_size); - } - return 0; - -out_free_zone_conf: - vfree(conf->strip_zone); - conf->strip_zone = NULL; - -out_free_conf: - vfree(conf); - mddev->private = NULL; -out: - MOD_DEC_USE_COUNT; - return 1; -} - -static int raid0_stop (mddev_t *mddev) -{ - raid0_conf_t *conf = mddev_to_conf(mddev); - - vfree (conf->hash_table); - conf->hash_table = NULL; - vfree (conf->strip_zone); - conf->strip_zone = NULL; - vfree (conf); - mddev->private = NULL; - - MOD_DEC_USE_COUNT; - return 0; -} - -/* - * FIXME - We assume some things here : - * - requested buffers NEVER bigger than chunk size, - * - requested buffers NEVER cross stripes limits. - * Of course, those facts may not be valid anymore (and surely won't...) - * Hey guys, there's some work out there ;-) - */ -static int raid0_make_request (mddev_t *mddev, - int rw, struct buffer_head * bh) -{ - unsigned int sect_in_chunk, chunksize_bits, chunk_size; - raid0_conf_t *conf = mddev_to_conf(mddev); - struct raid0_hash *hash; - struct strip_zone *zone; - mdk_rdev_t *tmp_dev; - unsigned long chunk, block, rsect; - - chunk_size = mddev->param.chunk_size >> 10; - chunksize_bits = ffz(~chunk_size); - block = bh->b_rsector >> 1; - hash = conf->hash_table + block / conf->smallest->size; - - /* Sanity check */ - if (chunk_size < (block % chunk_size) + (bh->b_size >> 10)) - goto bad_map; - - if (!hash) - goto bad_hash; - - if (!hash->zone0) - goto bad_zone0; - - if (block >= (hash->zone0->size + hash->zone0->zone_offset)) { - if (!hash->zone1) - goto bad_zone1; - zone = hash->zone1; - } else - zone = hash->zone0; - - sect_in_chunk = bh->b_rsector & ((chunk_size<<1) -1); - chunk = (block - zone->zone_offset) / (zone->nb_dev << chunksize_bits); - tmp_dev = zone->dev[(block >> chunksize_bits) % zone->nb_dev]; - rsect = (((chunk << chunksize_bits) + zone->dev_offset)<<1) - + sect_in_chunk; - - /* - * The new BH_Lock semantics in ll_rw_blk.c guarantee that this - * is the only IO operation happening on this bh. 
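The remapping arithmetic above is worth restating on its own: with power-of-two chunks, ffz(~chunk_size) is simply log2(chunk_size), and the request sector decomposes into a stripe row, a disk index, and an offset within the chunk. A userspace re-run of the same formulas for the simple case of one zone at device offset 0 (chunk size and disk count are assumed values):

#include <stdio.h>

int main(void)
{
        unsigned int chunk_size = 64;           /* assumed: 64K chunks      */
        unsigned int nb_dev = 3;                /* assumed: disks in zone   */
        unsigned int bits = 0;
        while ((1u << bits) < chunk_size)
                bits++;                         /* ffz(~chunk_size) = log2  */

        for (unsigned long rsector = 0; rsector < 1000000; rsector += 131072) {
                unsigned long block = rsector >> 1;             /* 1K blocks */
                unsigned long sect_in_chunk = rsector & ((chunk_size << 1) - 1);
                unsigned long chunk = block / (nb_dev << bits); /* stripe row */
                unsigned int  dev = (block >> bits) % nb_dev;   /* disk index */
                unsigned long rsect = ((chunk << bits) << 1) + sect_in_chunk;

                printf("sector %7lu -> disk %u, sector %lu\n",
                       rsector, dev, rsect);
        }
        return 0;
}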
- */ - bh->b_rdev = tmp_dev->dev; - bh->b_rsector = rsect; - - /* - * Let the main block layer submit the IO and resolve recursion: - */ - return 1; - -bad_map: - printk ("raid0_make_request bug: can't convert block across chunks or bigger than %dk %ld %d\n", chunk_size, bh->b_rsector, bh->b_size >> 10); - return -1; -bad_hash: - printk("raid0_make_request bug: hash==NULL for block %ld\n", block); - return -1; -bad_zone0: - printk ("raid0_make_request bug: hash->zone0==NULL for block %ld\n", block); - return -1; -bad_zone1: - printk ("raid0_make_request bug: hash->zone1==NULL for block %ld\n", block); - return -1; -} - -static int raid0_status (char *page, mddev_t *mddev) -{ - int sz = 0; -#undef MD_DEBUG -#ifdef MD_DEBUG - int j, k; - raid0_conf_t *conf = mddev_to_conf(mddev); - - sz += sprintf(page + sz, " "); - for (j = 0; j < conf->nr_zones; j++) { - sz += sprintf(page + sz, "[z%d", - conf->hash_table[j].zone0 - conf->strip_zone); - if (conf->hash_table[j].zone1) - sz += sprintf(page+sz, "/z%d] ", - conf->hash_table[j].zone1 - conf->strip_zone); - else - sz += sprintf(page+sz, "] "); - } - - sz += sprintf(page + sz, "\n"); - - for (j = 0; j < conf->nr_strip_zones; j++) { - sz += sprintf(page + sz, " z%d=[", j); - for (k = 0; k < conf->strip_zone[j].nb_dev; k++) - sz += sprintf (page+sz, "%s/", partition_name( - conf->strip_zone[j].dev[k]->dev)); - sz--; - sz += sprintf (page+sz, "] zo=%d do=%d s=%d\n", - conf->strip_zone[j].zone_offset, - conf->strip_zone[j].dev_offset, - conf->strip_zone[j].size); - } -#endif - sz += sprintf(page + sz, " %dk chunks", mddev->param.chunk_size/1024); - return sz; -} - -static mdk_personality_t raid0_personality= -{ - name: "raid0", - make_request: raid0_make_request, - run: raid0_run, - stop: raid0_stop, - status: raid0_status, -}; - -#ifndef MODULE - -void raid0_init (void) -{ - register_md_personality (RAID0, &raid0_personality); -} - -#else - -int init_module (void) -{ - return (register_md_personality (RAID0, &raid0_personality)); -} - -void cleanup_module (void) -{ - unregister_md_personality (RAID0); -} - -#endif - diff --git a/drivers/block/raid1.c b/drivers/block/raid1.c deleted file mode 100644 index b39c87e0e..000000000 --- a/drivers/block/raid1.c +++ /dev/null @@ -1,1897 +0,0 @@ -/* - * raid1.c : Multiple Devices driver for Linux - * - * Copyright (C) 1999, 2000 Ingo Molnar, Red Hat - * - * Copyright (C) 1996, 1997, 1998 Ingo Molnar, Miguel de Icaza, Gadi Oxman - * - * RAID-1 management functions. - * - * Better read-balancing code written by Mika Kuoppala <miku@iki.fi>, 2000 - * - * Fixes to reconstruction by Jakob Østergaard" <jakob@ostenfeld.dk> - * Various fixes by Neil Brown <neilb@cse.unsw.edu.au> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2, or (at your option) - * any later version. - * - * You should have received a copy of the GNU General Public License - * (for example /usr/src/linux/COPYING); if not, write to the Free - * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - */ - -#include <linux/module.h> -#include <linux/malloc.h> -#include <linux/raid/raid1.h> -#include <asm/atomic.h> - -#define MAJOR_NR MD_MAJOR -#define MD_DRIVER -#define MD_PERSONALITY - -#define MAX_WORK_PER_DISK 128 - -/* - * The following can be used to debug the driver - */ -#define RAID1_DEBUG 0 - -#if RAID1_DEBUG -#define PRINTK(x...) 
printk(x) -#define inline -#define __inline__ -#else -#define PRINTK(x...) do { } while (0) -#endif - - -static mdk_personality_t raid1_personality; -static md_spinlock_t retry_list_lock = MD_SPIN_LOCK_UNLOCKED; -struct raid1_bh *raid1_retry_list = NULL, **raid1_retry_tail; - -static struct buffer_head *raid1_alloc_bh(raid1_conf_t *conf, int cnt) -{ - /* return a linked list of "cnt" struct buffer_heads. - * don't take any off the free list unless we know we can - * get all we need, otherwise we could deadlock - */ - struct buffer_head *bh=NULL; - - while(cnt) { - struct buffer_head *t; - md_spin_lock_irq(&conf->device_lock); - if (conf->freebh_cnt >= cnt) - while (cnt) { - t = conf->freebh; - conf->freebh = t->b_next; - t->b_next = bh; - bh = t; - t->b_state = 0; - conf->freebh_cnt--; - cnt--; - } - md_spin_unlock_irq(&conf->device_lock); - if (cnt == 0) - break; - t = (struct buffer_head *)kmalloc(sizeof(struct buffer_head), GFP_BUFFER); - if (t) { - memset(t, 0, sizeof(*t)); - t->b_next = bh; - bh = t; - cnt--; - } else { - PRINTK("waiting for %d bh\n", cnt); - wait_event(conf->wait_buffer, conf->freebh_cnt >= cnt); - } - } - return bh; -} - -static inline void raid1_free_bh(raid1_conf_t *conf, struct buffer_head *bh) -{ - md_spin_lock_irq(&conf->device_lock); - while (bh) { - struct buffer_head *t = bh; - bh=bh->b_next; - if (t->b_pprev == NULL) - kfree(t); - else { - t->b_next= conf->freebh; - conf->freebh = t; - conf->freebh_cnt++; - } - } - md_spin_unlock_irq(&conf->device_lock); - wake_up(&conf->wait_buffer); -} - -static int raid1_grow_bh(raid1_conf_t *conf, int cnt) -{ - /* allocate cnt buffer_heads, possibly less if kalloc fails */ - int i = 0; - - while (i < cnt) { - struct buffer_head *bh; - bh = kmalloc(sizeof(*bh), GFP_KERNEL); - if (!bh) break; - memset(bh, 0, sizeof(*bh)); - - md_spin_lock_irq(&conf->device_lock); - bh->b_pprev = &conf->freebh; - bh->b_next = conf->freebh; - conf->freebh = bh; - conf->freebh_cnt++; - md_spin_unlock_irq(&conf->device_lock); - - i++; - } - return i; -} - -static int raid1_shrink_bh(raid1_conf_t *conf, int cnt) -{ - /* discard cnt buffer_heads, if we can find them */ - int i = 0; - - md_spin_lock_irq(&conf->device_lock); - while ((i < cnt) && conf->freebh) { - struct buffer_head *bh = conf->freebh; - conf->freebh = bh->b_next; - kfree(bh); - i++; - conf->freebh_cnt--; - } - md_spin_unlock_irq(&conf->device_lock); - return i; -} - - -static struct raid1_bh *raid1_alloc_r1bh(raid1_conf_t *conf) -{ - struct raid1_bh *r1_bh = NULL; - - do { - md_spin_lock_irq(&conf->device_lock); - if (conf->freer1) { - r1_bh = conf->freer1; - conf->freer1 = r1_bh->next_r1; - r1_bh->next_r1 = NULL; - r1_bh->state = 0; - r1_bh->bh_req.b_state = 0; - } - md_spin_unlock_irq(&conf->device_lock); - if (r1_bh) - return r1_bh; - r1_bh = (struct raid1_bh *) kmalloc(sizeof(struct raid1_bh), - GFP_BUFFER); - if (r1_bh) { - memset(r1_bh, 0, sizeof(*r1_bh)); - return r1_bh; - } - wait_event(conf->wait_buffer, conf->freer1); - } while (1); -} - -static inline void raid1_free_r1bh(struct raid1_bh *r1_bh) -{ - struct buffer_head *bh = r1_bh->mirror_bh_list; - raid1_conf_t *conf = mddev_to_conf(r1_bh->mddev); - - r1_bh->mirror_bh_list = NULL; - - if (test_bit(R1BH_PreAlloc, &r1_bh->state)) { - md_spin_lock_irq(&conf->device_lock); - r1_bh->next_r1 = conf->freer1; - conf->freer1 = r1_bh; - md_spin_unlock_irq(&conf->device_lock); - } else { - kfree(r1_bh); - } - raid1_free_bh(conf, bh); -} - -static int raid1_grow_r1bh (raid1_conf_t *conf, int cnt) -{ - int i = 0; - - while (i < 
cnt) { - struct raid1_bh *r1_bh; - r1_bh = (struct raid1_bh*)kmalloc(sizeof(*r1_bh), GFP_KERNEL); - if (!r1_bh) - break; - memset(r1_bh, 0, sizeof(*r1_bh)); - - md_spin_lock_irq(&conf->device_lock); - set_bit(R1BH_PreAlloc, &r1_bh->state); - r1_bh->next_r1 = conf->freer1; - conf->freer1 = r1_bh; - md_spin_unlock_irq(&conf->device_lock); - - i++; - } - return i; -} - -static void raid1_shrink_r1bh(raid1_conf_t *conf) -{ - md_spin_lock_irq(&conf->device_lock); - while (conf->freer1) { - struct raid1_bh *r1_bh = conf->freer1; - conf->freer1 = r1_bh->next_r1; - kfree(r1_bh); - } - md_spin_unlock_irq(&conf->device_lock); -} - - - -static inline void raid1_free_buf(struct raid1_bh *r1_bh) -{ - struct buffer_head *bh = r1_bh->mirror_bh_list; - raid1_conf_t *conf = mddev_to_conf(r1_bh->mddev); - r1_bh->mirror_bh_list = NULL; - - md_spin_lock_irq(&conf->device_lock); - r1_bh->next_r1 = conf->freebuf; - conf->freebuf = r1_bh; - md_spin_unlock_irq(&conf->device_lock); - raid1_free_bh(conf, bh); -} - -static struct raid1_bh *raid1_alloc_buf(raid1_conf_t *conf) -{ - struct raid1_bh *r1_bh; - - md_spin_lock_irq(&conf->device_lock); - wait_event_lock_irq(conf->wait_buffer, conf->freebuf, conf->device_lock); - r1_bh = conf->freebuf; - conf->freebuf = r1_bh->next_r1; - r1_bh->next_r1= NULL; - md_spin_unlock_irq(&conf->device_lock); - - return r1_bh; -} - -static int raid1_grow_buffers (raid1_conf_t *conf, int cnt) -{ - int i = 0; - - md_spin_lock_irq(&conf->device_lock); - while (i < cnt) { - struct raid1_bh *r1_bh; - struct page *page; - - page = alloc_page(GFP_KERNEL); - if (!page) - break; - - r1_bh = (struct raid1_bh *) kmalloc(sizeof(*r1_bh), GFP_KERNEL); - if (!r1_bh) { - __free_page(page); - break; - } - memset(r1_bh, 0, sizeof(*r1_bh)); - r1_bh->bh_req.b_page = page; - r1_bh->bh_req.b_data = page_address(page); - r1_bh->next_r1 = conf->freebuf; - conf->freebuf = r1_bh; - i++; - } - md_spin_unlock_irq(&conf->device_lock); - return i; -} - -static void raid1_shrink_buffers (raid1_conf_t *conf) -{ - md_spin_lock_irq(&conf->device_lock); - while (conf->freebuf) { - struct raid1_bh *r1_bh = conf->freebuf; - conf->freebuf = r1_bh->next_r1; - __free_page(r1_bh->bh_req.b_page); - kfree(r1_bh); - } - md_spin_unlock_irq(&conf->device_lock); -} - -static int raid1_map (mddev_t *mddev, kdev_t *rdev, unsigned long size) -{ - raid1_conf_t *conf = mddev_to_conf(mddev); - int i, disks = MD_SB_DISKS; - - /* - * Later we do read balancing on the read side - * now we use the first available disk. 
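raid1_alloc_bh() in the previous hunk takes buffers off the free list only when the whole request can be satisfied: two writers each holding half of the pool while waiting for the other half would deadlock. The policy in isolation, with the locking and kmalloc fallback left out (structure names are mine):

#include <stdio.h>
#include <stdlib.h>

struct node { struct node *next; };

struct pool {
        struct node *free;
        int          free_cnt;
};

/* take cnt nodes, or none at all */
static struct node *pool_get_all_or_none(struct pool *p, int cnt)
{
        if (p->free_cnt < cnt)
                return NULL;    /* caller must wait or allocate fresh */

        struct node *list = NULL;
        while (cnt--) {
                struct node *t = p->free;
                p->free = t->next;
                p->free_cnt--;
                t->next = list;
                list = t;
        }
        return list;
}

int main(void)
{
        struct pool p = { NULL, 0 };

        for (int i = 0; i < 4; i++) {   /* seed a 4-element pool */
                struct node *n = malloc(sizeof(*n));
                n->next = p.free;
                p.free = n;
                p.free_cnt++;
        }
        printf("ask 3: %s\n", pool_get_all_or_none(&p, 3) ? "granted" : "refused");
        printf("ask 3: %s\n", pool_get_all_or_none(&p, 3) ? "granted" : "refused");
        return 0;
}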
- */ - - for (i = 0; i < disks; i++) { - if (conf->mirrors[i].operational) { - *rdev = conf->mirrors[i].dev; - return (0); - } - } - - printk (KERN_ERR "raid1_map(): huh, no more operational devices?\n"); - return (-1); -} - -static void raid1_reschedule_retry (struct raid1_bh *r1_bh) -{ - unsigned long flags; - mddev_t *mddev = r1_bh->mddev; - raid1_conf_t *conf = mddev_to_conf(mddev); - - md_spin_lock_irqsave(&retry_list_lock, flags); - if (raid1_retry_list == NULL) - raid1_retry_tail = &raid1_retry_list; - *raid1_retry_tail = r1_bh; - raid1_retry_tail = &r1_bh->next_r1; - r1_bh->next_r1 = NULL; - md_spin_unlock_irqrestore(&retry_list_lock, flags); - md_wakeup_thread(conf->thread); -} - - -static void inline io_request_done(unsigned long sector, raid1_conf_t *conf, int phase) -{ - unsigned long flags; - spin_lock_irqsave(&conf->segment_lock, flags); - if (sector < conf->start_active) - conf->cnt_done--; - else if (sector >= conf->start_future && conf->phase == phase) - conf->cnt_future--; - else if (!--conf->cnt_pending) - wake_up(&conf->wait_ready); - - spin_unlock_irqrestore(&conf->segment_lock, flags); -} - -static void inline sync_request_done (unsigned long sector, raid1_conf_t *conf) -{ - unsigned long flags; - spin_lock_irqsave(&conf->segment_lock, flags); - if (sector >= conf->start_ready) - --conf->cnt_ready; - else if (sector >= conf->start_active) { - if (!--conf->cnt_active) { - conf->start_active = conf->start_ready; - wake_up(&conf->wait_done); - } - } - spin_unlock_irqrestore(&conf->segment_lock, flags); -} - -/* - * raid1_end_bh_io() is called when we have finished servicing a mirrored - * operation and are ready to return a success/failure code to the buffer - * cache layer. - */ -static void raid1_end_bh_io (struct raid1_bh *r1_bh, int uptodate) -{ - struct buffer_head *bh = r1_bh->master_bh; - - io_request_done(bh->b_rsector, mddev_to_conf(r1_bh->mddev), - test_bit(R1BH_SyncPhase, &r1_bh->state)); - - bh->b_end_io(bh, uptodate); - raid1_free_r1bh(r1_bh); -} -void raid1_end_request (struct buffer_head *bh, int uptodate) -{ - struct raid1_bh * r1_bh = (struct raid1_bh *)(bh->b_private); - - /* - * this branch is our 'one mirror IO has finished' event handler: - */ - if (!uptodate) - md_error (mddev_to_kdev(r1_bh->mddev), bh->b_dev); - else - /* - * Set R1BH_Uptodate in our master buffer_head, so that - * we will return a good error code for to the higher - * levels even if IO on some other mirrored buffer fails. - * - * The 'master' represents the complex operation to - * user-side. So if something waits for IO, then it will - * wait for the 'master' buffer_head. - */ - set_bit (R1BH_Uptodate, &r1_bh->state); - - /* - * We split up the read and write side, imho they are - * conceptually different. - */ - - if ( (r1_bh->cmd == READ) || (r1_bh->cmd == READA) ) { - /* - * we have only one buffer_head on the read side - */ - - if (uptodate) { - raid1_end_bh_io(r1_bh, uptodate); - return; - } - /* - * oops, read error: - */ - printk(KERN_ERR "raid1: %s: rescheduling block %lu\n", - partition_name(bh->b_dev), bh->b_blocknr); - raid1_reschedule_retry(r1_bh); - return; - } - - /* - * WRITE: - * - * Let's see if all mirrored write operations have finished - * already. - */ - - if (atomic_dec_and_test(&r1_bh->remaining)) - raid1_end_bh_io(r1_bh, test_bit(R1BH_Uptodate, &r1_bh->state)); -} - -/* - * This routine returns the disk from which the requested read should - * be done. 
It bookkeeps the last read position for every disk - * in array and when new read requests come, the disk which last - * position is nearest to the request, is chosen. - * - * TODO: now if there are 2 mirrors in the same 2 devices, performance - * degrades dramatically because position is mirror, not device based. - * This should be changed to be device based. Also atomic sequential - * reads should be somehow balanced. - */ - -static int raid1_read_balance (raid1_conf_t *conf, struct buffer_head *bh) -{ - int new_disk = conf->last_used; - const int sectors = bh->b_size >> 9; - const unsigned long this_sector = bh->b_rsector; - int disk = new_disk; - unsigned long new_distance; - unsigned long current_distance; - - /* - * Check if it is sane at all to balance - */ - - if (conf->resync_mirrors) - goto rb_out; - - if (conf->working_disks < 2) { - int i = 0; - - while( !conf->mirrors[new_disk].operational && - (i < MD_SB_DISKS) ) { - new_disk = conf->mirrors[new_disk].next; - i++; - } - - if (i >= MD_SB_DISKS) { - /* - * This means no working disk was found - * Nothing much to do, lets not change anything - * and hope for the best... - */ - - new_disk = conf->last_used; - } - - goto rb_out; - } - - /* - * Don't touch anything for sequential reads. - */ - - if (this_sector == conf->mirrors[new_disk].head_position) - goto rb_out; - - /* - * If reads have been done only on a single disk - * for a time, lets give another disk a change. - * This is for kicking those idling disks so that - * they would find work near some hotspot. - */ - - if (conf->sect_count >= conf->mirrors[new_disk].sect_limit) { - conf->sect_count = 0; - - while( new_disk != conf->mirrors[new_disk].next ) { - if ((conf->mirrors[new_disk].write_only) || - (!conf->mirrors[new_disk].operational) ) - continue; - - new_disk = conf->mirrors[new_disk].next; - break; - } - - goto rb_out; - } - - current_distance = abs(this_sector - - conf->mirrors[disk].head_position); - - /* Find the disk which is closest */ - - while( conf->mirrors[disk].next != conf->last_used ) { - disk = conf->mirrors[disk].next; - - if ((conf->mirrors[disk].write_only) || - (!conf->mirrors[disk].operational)) - continue; - - new_distance = abs(this_sector - - conf->mirrors[disk].head_position); - - if (new_distance < current_distance) { - conf->sect_count = 0; - current_distance = new_distance; - new_disk = disk; - } - } - -rb_out: - conf->mirrors[new_disk].head_position = this_sector + sectors; - - conf->last_used = new_disk; - conf->sect_count += sectors; - - return new_disk; -} - -static int raid1_make_request (mddev_t *mddev, int rw, - struct buffer_head * bh) -{ - raid1_conf_t *conf = mddev_to_conf(mddev); - struct buffer_head *bh_req, *bhl; - struct raid1_bh * r1_bh; - int disks = MD_SB_DISKS; - int i, sum_bhs = 0, sectors; - struct mirror_info *mirror; - - if (!buffer_locked(bh)) - BUG(); - -/* - * make_request() can abort the operation when READA is being - * used and no empty request is available. - * - * Currently, just replace the command with READ/WRITE. 
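Stripped of the sequential-read and sect_limit special cases, the balancing above reduces to: among operational mirrors, pick the one whose recorded head position is nearest the requested sector, then advance that mirror's position past the read. A distilled sketch (fields trimmed to what the example needs, sample positions invented):

#include <stdio.h>
#include <stdlib.h>

struct mirror {
        long head_position;
        int  operational;
};

static int read_balance(struct mirror *m, int n, long sector, int sectors)
{
        int best = -1;
        long best_dist = 0;

        for (int i = 0; i < n; i++) {
                if (!m[i].operational)
                        continue;
                long dist = labs(sector - m[i].head_position);
                if (best < 0 || dist < best_dist) {
                        best = i;
                        best_dist = dist;
                }
        }
        m[best].head_position = sector + sectors; /* assumes >=1 working disk */
        return best;
}

int main(void)
{
        struct mirror m[3] = { { 0, 1 }, { 50000, 1 }, { 90000, 0 } };

        printf("read @48000 -> mirror %d\n", read_balance(m, 3, 48000, 8));
        printf("read @48008 -> mirror %d\n", read_balance(m, 3, 48008, 8));
        printf("read @100   -> mirror %d\n", read_balance(m, 3, 100, 8));
        return 0;
}

The second read lands on the same mirror because its recorded head position now matches the request exactly, which is the sequential case the real code short-circuits before doing any distance comparison.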
- */ - if (rw == READA) - rw = READ; - - r1_bh = raid1_alloc_r1bh (conf); - - spin_lock_irq(&conf->segment_lock); - wait_event_lock_irq(conf->wait_done, - bh->b_rsector < conf->start_active || - bh->b_rsector >= conf->start_future, - conf->segment_lock); - if (bh->b_rsector < conf->start_active) - conf->cnt_done++; - else { - conf->cnt_future++; - if (conf->phase) - set_bit(R1BH_SyncPhase, &r1_bh->state); - } - spin_unlock_irq(&conf->segment_lock); - - /* - * i think the read and write branch should be separated completely, - * since we want to do read balancing on the read side for example. - * Alternative implementations? :) --mingo - */ - - r1_bh->master_bh = bh; - r1_bh->mddev = mddev; - r1_bh->cmd = rw; - - sectors = bh->b_size >> 9; - if (rw == READ) { - /* - * read balancing logic: - */ - mirror = conf->mirrors + raid1_read_balance(conf, bh); - - bh_req = &r1_bh->bh_req; - memcpy(bh_req, bh, sizeof(*bh)); - bh_req->b_blocknr = bh->b_rsector * sectors; - bh_req->b_dev = mirror->dev; - bh_req->b_rdev = mirror->dev; - /* bh_req->b_rsector = bh->n_rsector; */ - bh_req->b_end_io = raid1_end_request; - bh_req->b_private = r1_bh; - generic_make_request (rw, bh_req); - return 0; - } - - /* - * WRITE: - */ - - bhl = raid1_alloc_bh(conf, conf->raid_disks); - for (i = 0; i < disks; i++) { - struct buffer_head *mbh; - if (!conf->mirrors[i].operational) - continue; - - /* - * We should use a private pool (size depending on NR_REQUEST), - * to avoid writes filling up the memory with bhs - * - * Such pools are much faster than kmalloc anyways (so we waste - * almost nothing by not using the master bh when writing and - * win alot of cleanness) but for now we are cool enough. --mingo - * - * It's safe to sleep here, buffer heads cannot be used in a shared - * manner in the write branch. Look how we lock the buffer at the - * beginning of this function to grok the difference ;) - */ - mbh = bhl; - if (mbh == NULL) { - MD_BUG(); - break; - } - bhl = mbh->b_next; - mbh->b_next = NULL; - mbh->b_this_page = (struct buffer_head *)1; - - /* - * prepare mirrored mbh (fields ordered for max mem throughput): - */ - mbh->b_blocknr = bh->b_rsector * sectors; - mbh->b_dev = conf->mirrors[i].dev; - mbh->b_rdev = conf->mirrors[i].dev; - mbh->b_rsector = bh->b_rsector; - mbh->b_state = (1<<BH_Req) | (1<<BH_Dirty) | - (1<<BH_Mapped) | (1<<BH_Lock); - - atomic_set(&mbh->b_count, 1); - mbh->b_size = bh->b_size; - mbh->b_page = bh->b_page; - mbh->b_data = bh->b_data; - mbh->b_list = BUF_LOCKED; - mbh->b_end_io = raid1_end_request; - mbh->b_private = r1_bh; - - mbh->b_next = r1_bh->mirror_bh_list; - r1_bh->mirror_bh_list = mbh; - sum_bhs++; - } - if (bhl) raid1_free_bh(conf,bhl); - md_atomic_set(&r1_bh->remaining, sum_bhs); - - /* - * We have to be a bit careful about the semaphore above, thats - * why we start the requests separately. Since kmalloc() could - * fail, sleep and make_request() can sleep too, this is the - * safer solution. Imagine, end_request decreasing the semaphore - * before we could have set it up ... We could play tricks with - * the semaphore (presetting it and correcting at the end if - * sum_bhs is not 'n' but we have to do end_request by hand if - * all requests finish until we had a chance to set up the - * semaphore correctly ... lots of races). 
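The "semaphore" discussed above is really the atomic remaining count: it is preset to the number of mirror writes before any of them is submitted, precisely so that an early completion cannot finish the master request while later submissions are still being set up. A minimal model of that completion pattern (C11 atomics instead of md_atomic_set, direct calls instead of interrupt-context end_io):

#include <stdatomic.h>
#include <stdio.h>

struct master {
        atomic_int remaining;
        atomic_int uptodate;    /* set if at least one mirror succeeded */
};

static void end_request(struct master *m, int ok)
{
        if (ok)
                atomic_store(&m->uptodate, 1);  /* R1BH_Uptodate analogue */
        /* only the final decrement reports back to the caller */
        if (atomic_fetch_sub(&m->remaining, 1) == 1)
                printf("master completed, %s\n",
                       atomic_load(&m->uptodate) ? "uptodate" : "failed");
}

int main(void)
{
        struct master m;
        int mirrors = 3;

        atomic_init(&m.remaining, mirrors);     /* set before any submit */
        atomic_init(&m.uptodate, 0);

        end_request(&m, 1);     /* mirror 0 ok                      */
        end_request(&m, 0);     /* mirror 1 fails                   */
        end_request(&m, 1);     /* mirror 2 ok: finishes the master */
        return 0;
}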
- */ - bh = r1_bh->mirror_bh_list; - while(bh) { - struct buffer_head *bh2 = bh; - bh = bh->b_next; - generic_make_request(rw, bh2); - } - return (0); -} - -static int raid1_status (char *page, mddev_t *mddev) -{ - raid1_conf_t *conf = mddev_to_conf(mddev); - int sz = 0, i; - - sz += sprintf (page+sz, " [%d/%d] [", conf->raid_disks, - conf->working_disks); - for (i = 0; i < conf->raid_disks; i++) - sz += sprintf (page+sz, "%s", - conf->mirrors[i].operational ? "U" : "_"); - sz += sprintf (page+sz, "]"); - return sz; -} - -static void unlink_disk (raid1_conf_t *conf, int target) -{ - int disks = MD_SB_DISKS; - int i; - - for (i = 0; i < disks; i++) - if (conf->mirrors[i].next == target) - conf->mirrors[i].next = conf->mirrors[target].next; -} - -#define LAST_DISK KERN_ALERT \ -"raid1: only one disk left and IO error.\n" - -#define NO_SPARE_DISK KERN_ALERT \ -"raid1: no spare disk left, degrading mirror level by one.\n" - -#define DISK_FAILED KERN_ALERT \ -"raid1: Disk failure on %s, disabling device. \n" \ -" Operation continuing on %d devices\n" - -#define START_SYNCING KERN_ALERT \ -"raid1: start syncing spare disk.\n" - -#define ALREADY_SYNCING KERN_INFO \ -"raid1: syncing already in progress.\n" - -static void mark_disk_bad (mddev_t *mddev, int failed) -{ - raid1_conf_t *conf = mddev_to_conf(mddev); - struct mirror_info *mirror = conf->mirrors+failed; - mdp_super_t *sb = mddev->sb; - - mirror->operational = 0; - unlink_disk(conf, failed); - mark_disk_faulty(sb->disks+mirror->number); - mark_disk_nonsync(sb->disks+mirror->number); - mark_disk_inactive(sb->disks+mirror->number); - sb->active_disks--; - sb->working_disks--; - sb->failed_disks++; - mddev->sb_dirty = 1; - md_wakeup_thread(conf->thread); - conf->working_disks--; - printk (DISK_FAILED, partition_name (mirror->dev), - conf->working_disks); -} - -static int raid1_error (mddev_t *mddev, kdev_t dev) -{ - raid1_conf_t *conf = mddev_to_conf(mddev); - struct mirror_info * mirrors = conf->mirrors; - int disks = MD_SB_DISKS; - int i; - - if (conf->working_disks == 1) { - /* - * Uh oh, we can do nothing if this is our last disk, but - * first check if this is a queued request for a device - * which has just failed. 
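mark_disk_bad() relies on unlink_disk() above, which drops a failed mirror out of the singly linked read ring by repointing any predecessor at the failed entry's successor; the dead entry's own next index is left stale, exactly as in the driver. A three-mirror toy version:

#include <stdio.h>

#define NDISKS 3

static int ring_next[NDISKS] = { 1, 2, 0 };     /* ring: 0 -> 1 -> 2 -> 0 */

static void unlink_disk(int target)
{
        for (int i = 0; i < NDISKS; i++)
                if (ring_next[i] == target)
                        ring_next[i] = ring_next[target];
}

int main(void)
{
        unlink_disk(1);                 /* fail mirror 1 */
        for (int i = 0; i < NDISKS; i++)
                printf("mirror %d -> next %d\n", i, ring_next[i]);
        /* mirror 1 still points at 2, but nothing points at 1 any more */
        return 0;
}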
- */ - for (i = 0; i < disks; i++) { - if (mirrors[i].dev==dev && !mirrors[i].operational) - return 0; - } - printk (LAST_DISK); - } else { - /* - * Mark disk as unusable - */ - for (i = 0; i < disks; i++) { - if (mirrors[i].dev==dev && mirrors[i].operational) { - mark_disk_bad(mddev, i); - break; - } - } - } - return 0; -} - -#undef LAST_DISK -#undef NO_SPARE_DISK -#undef DISK_FAILED -#undef START_SYNCING - -/* - * Insert the spare disk into the drive-ring - */ -static void link_disk(raid1_conf_t *conf, struct mirror_info *mirror) -{ - int j, next; - int disks = MD_SB_DISKS; - struct mirror_info *p = conf->mirrors; - - for (j = 0; j < disks; j++, p++) - if (p->operational && !p->write_only) { - next = p->next; - p->next = mirror->raid_disk; - mirror->next = next; - return; - } - - printk("raid1: bug: no read-operational devices\n"); -} - -static void print_raid1_conf (raid1_conf_t *conf) -{ - int i; - struct mirror_info *tmp; - - printk("RAID1 conf printout:\n"); - if (!conf) { - printk("(conf==NULL)\n"); - return; - } - printk(" --- wd:%d rd:%d nd:%d\n", conf->working_disks, - conf->raid_disks, conf->nr_disks); - - for (i = 0; i < MD_SB_DISKS; i++) { - tmp = conf->mirrors + i; - printk(" disk %d, s:%d, o:%d, n:%d rd:%d us:%d dev:%s\n", - i, tmp->spare,tmp->operational, - tmp->number,tmp->raid_disk,tmp->used_slot, - partition_name(tmp->dev)); - } -} - -static int raid1_diskop(mddev_t *mddev, mdp_disk_t **d, int state) -{ - int err = 0; - int i, failed_disk=-1, spare_disk=-1, removed_disk=-1, added_disk=-1; - raid1_conf_t *conf = mddev->private; - struct mirror_info *tmp, *sdisk, *fdisk, *rdisk, *adisk; - mdp_super_t *sb = mddev->sb; - mdp_disk_t *failed_desc, *spare_desc, *added_desc; - - print_raid1_conf(conf); - md_spin_lock_irq(&conf->device_lock); - /* - * find the disk ... - */ - switch (state) { - - case DISKOP_SPARE_ACTIVE: - - /* - * Find the failed disk within the RAID1 configuration ... - * (this can only be in the first conf->working_disks part) - */ - for (i = 0; i < conf->raid_disks; i++) { - tmp = conf->mirrors + i; - if ((!tmp->operational && !tmp->spare) || - !tmp->used_slot) { - failed_disk = i; - break; - } - } - /* - * When we activate a spare disk we _must_ have a disk in - * the lower (active) part of the array to replace. - */ - if ((failed_disk == -1) || (failed_disk >= conf->raid_disks)) { - MD_BUG(); - err = 1; - goto abort; - } - /* fall through */ - - case DISKOP_SPARE_WRITE: - case DISKOP_SPARE_INACTIVE: - - /* - * Find the spare disk ... 
(can only be in the 'high' - * area of the array) - */ - for (i = conf->raid_disks; i < MD_SB_DISKS; i++) { - tmp = conf->mirrors + i; - if (tmp->spare && tmp->number == (*d)->number) { - spare_disk = i; - break; - } - } - if (spare_disk == -1) { - MD_BUG(); - err = 1; - goto abort; - } - break; - - case DISKOP_HOT_REMOVE_DISK: - - for (i = 0; i < MD_SB_DISKS; i++) { - tmp = conf->mirrors + i; - if (tmp->used_slot && (tmp->number == (*d)->number)) { - if (tmp->operational) { - err = -EBUSY; - goto abort; - } - removed_disk = i; - break; - } - } - if (removed_disk == -1) { - MD_BUG(); - err = 1; - goto abort; - } - break; - - case DISKOP_HOT_ADD_DISK: - - for (i = conf->raid_disks; i < MD_SB_DISKS; i++) { - tmp = conf->mirrors + i; - if (!tmp->used_slot) { - added_disk = i; - break; - } - } - if (added_disk == -1) { - MD_BUG(); - err = 1; - goto abort; - } - break; - } - - switch (state) { - /* - * Switch the spare disk to write-only mode: - */ - case DISKOP_SPARE_WRITE: - sdisk = conf->mirrors + spare_disk; - sdisk->operational = 1; - sdisk->write_only = 1; - break; - /* - * Deactivate a spare disk: - */ - case DISKOP_SPARE_INACTIVE: - sdisk = conf->mirrors + spare_disk; - sdisk->operational = 0; - sdisk->write_only = 0; - break; - /* - * Activate (mark read-write) the (now sync) spare disk, - * which means we switch it's 'raid position' (->raid_disk) - * with the failed disk. (only the first 'conf->nr_disks' - * slots are used for 'real' disks and we must preserve this - * property) - */ - case DISKOP_SPARE_ACTIVE: - - sdisk = conf->mirrors + spare_disk; - fdisk = conf->mirrors + failed_disk; - - spare_desc = &sb->disks[sdisk->number]; - failed_desc = &sb->disks[fdisk->number]; - - if (spare_desc != *d) { - MD_BUG(); - err = 1; - goto abort; - } - - if (spare_desc->raid_disk != sdisk->raid_disk) { - MD_BUG(); - err = 1; - goto abort; - } - - if (sdisk->raid_disk != spare_disk) { - MD_BUG(); - err = 1; - goto abort; - } - - if (failed_desc->raid_disk != fdisk->raid_disk) { - MD_BUG(); - err = 1; - goto abort; - } - - if (fdisk->raid_disk != failed_disk) { - MD_BUG(); - err = 1; - goto abort; - } - - /* - * do the switch finally - */ - xchg_values(*spare_desc, *failed_desc); - xchg_values(*fdisk, *sdisk); - - /* - * (careful, 'failed' and 'spare' are switched from now on) - * - * we want to preserve linear numbering and we want to - * give the proper raid_disk number to the now activated - * disk. (this means we switch back these values) - */ - - xchg_values(spare_desc->raid_disk, failed_desc->raid_disk); - xchg_values(sdisk->raid_disk, fdisk->raid_disk); - xchg_values(spare_desc->number, failed_desc->number); - xchg_values(sdisk->number, fdisk->number); - - *d = failed_desc; - - if (sdisk->dev == MKDEV(0,0)) - sdisk->used_slot = 0; - /* - * this really activates the spare. - */ - fdisk->spare = 0; - fdisk->write_only = 0; - link_disk(conf, fdisk); - - /* - * if we activate a spare, we definitely replace a - * non-operational disk slot in the 'low' area of - * the disk array. 
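The double swap performed for DISKOP_SPARE_ACTIVE above is easy to misread: the whole descriptors trade places so the spare takes over the failed disk's active slot, and then raid_disk and number are swapped straight back so the numbering stays with the slot rather than following the disk state. Reduced to plain C with two mocked-up descriptors:

#include <stdio.h>

struct desc {
        int number;             /* positional: must stay with the slot */
        int raid_disk;          /* positional: must stay with the slot */
        int operational;        /* state: moves with the disk          */
};

int main(void)
{
        struct desc failed = { 0, 0, 0 };   /* active slot 0, dead disk   */
        struct desc spare  = { 3, 3, 1 };   /* spare slot 3, healthy disk */
        struct desc tmp;
        int t;

        /* xchg_values(*fdisk, *sdisk): swap the slots wholesale */
        tmp = spare; spare = failed; failed = tmp;

        /* ...then give each slot its numbering back */
        t = spare.raid_disk; spare.raid_disk = failed.raid_disk; failed.raid_disk = t;
        t = spare.number;    spare.number    = failed.number;    failed.number    = t;

        printf("slot 0: number=%d raid_disk=%d operational=%d\n",
               failed.number, failed.raid_disk, failed.operational);
        printf("slot 3: number=%d raid_disk=%d operational=%d\n",
               spare.number, spare.raid_disk, spare.operational);
        return 0;
}

The net effect: slot 0 ends up operational with number 0, exactly as if the healthy disk had always lived there, which is what keeps the first conf->raid_disks slots reserved for active members.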
- */ - - conf->working_disks++; - - break; - - case DISKOP_HOT_REMOVE_DISK: - rdisk = conf->mirrors + removed_disk; - - if (rdisk->spare && (removed_disk < conf->raid_disks)) { - MD_BUG(); - err = 1; - goto abort; - } - rdisk->dev = MKDEV(0,0); - rdisk->used_slot = 0; - conf->nr_disks--; - break; - - case DISKOP_HOT_ADD_DISK: - adisk = conf->mirrors + added_disk; - added_desc = *d; - - if (added_disk != added_desc->number) { - MD_BUG(); - err = 1; - goto abort; - } - - adisk->number = added_desc->number; - adisk->raid_disk = added_desc->raid_disk; - adisk->dev = MKDEV(added_desc->major,added_desc->minor); - - adisk->operational = 0; - adisk->write_only = 0; - adisk->spare = 1; - adisk->used_slot = 1; - adisk->head_position = 0; - conf->nr_disks++; - - break; - - default: - MD_BUG(); - err = 1; - goto abort; - } -abort: - md_spin_unlock_irq(&conf->device_lock); - if (state == DISKOP_SPARE_ACTIVE || state == DISKOP_SPARE_INACTIVE) - /* should move to "END_REBUILD" when such exists */ - raid1_shrink_buffers(conf); - - print_raid1_conf(conf); - return err; -} - - -#define IO_ERROR KERN_ALERT \ -"raid1: %s: unrecoverable I/O read error for block %lu\n" - -#define REDIRECT_SECTOR KERN_ERR \ -"raid1: %s: redirecting sector %lu to another mirror\n" - -/* - * This is a kernel thread which: - * - * 1. Retries failed read operations on working mirrors. - * 2. Updates the raid superblock when problems encounter. - * 3. Performs writes following reads for array syncronising. - */ -static void end_sync_write(struct buffer_head *bh, int uptodate); -static void end_sync_read(struct buffer_head *bh, int uptodate); - -static void raid1d (void *data) -{ - struct raid1_bh *r1_bh; - struct buffer_head *bh; - unsigned long flags; - mddev_t *mddev; - kdev_t dev; - - - for (;;) { - md_spin_lock_irqsave(&retry_list_lock, flags); - r1_bh = raid1_retry_list; - if (!r1_bh) - break; - raid1_retry_list = r1_bh->next_r1; - md_spin_unlock_irqrestore(&retry_list_lock, flags); - - mddev = r1_bh->mddev; - if (mddev->sb_dirty) { - printk(KERN_INFO "dirty sb detected, updating.\n"); - mddev->sb_dirty = 0; - md_update_sb(mddev); - } - bh = &r1_bh->bh_req; - switch(r1_bh->cmd) { - case SPECIAL: - /* have to allocate lots of bh structures and - * schedule writes - */ - if (test_bit(R1BH_Uptodate, &r1_bh->state)) { - int i, sum_bhs = 0; - int disks = MD_SB_DISKS; - struct buffer_head *bhl, *mbh; - raid1_conf_t *conf; - int sectors = bh->b_size >> 9; - - conf = mddev_to_conf(mddev); - bhl = raid1_alloc_bh(conf, conf->raid_disks); /* don't really need this many */ - for (i = 0; i < disks ; i++) { - if (!conf->mirrors[i].operational) - continue; - if (i==conf->last_used) - /* we read from here, no need to write */ - continue; - if (i < conf->raid_disks - && !conf->resync_mirrors) - /* don't need to write this, - * we are just rebuilding */ - continue; - mbh = bhl; - if (!mbh) { - MD_BUG(); - break; - } - bhl = mbh->b_next; - mbh->b_this_page = (struct buffer_head *)1; - - - /* - * prepare mirrored bh (fields ordered for max mem throughput): - */ - mbh->b_blocknr = bh->b_blocknr; - mbh->b_dev = conf->mirrors[i].dev; - mbh->b_rdev = conf->mirrors[i].dev; - mbh->b_rsector = bh->b_blocknr * sectors; - mbh->b_state = (1<<BH_Req) | (1<<BH_Dirty) | - (1<<BH_Mapped) | (1<<BH_Lock); - atomic_set(&mbh->b_count, 1); - mbh->b_size = bh->b_size; - mbh->b_page = bh->b_page; - mbh->b_data = bh->b_data; - mbh->b_list = BUF_LOCKED; - mbh->b_end_io = end_sync_write; - mbh->b_private = r1_bh; - - mbh->b_next = r1_bh->mirror_bh_list; - 
r1_bh->mirror_bh_list = mbh; - - sum_bhs++; - } - md_atomic_set(&r1_bh->remaining, sum_bhs); - if (bhl) raid1_free_bh(conf, bhl); - mbh = r1_bh->mirror_bh_list; - while (mbh) { - struct buffer_head *bh1 = mbh; - mbh = mbh->b_next; - generic_make_request(WRITE, bh1); - md_sync_acct(bh1->b_rdev, bh1->b_size/512); - } - } else { - dev = bh->b_dev; - raid1_map (mddev, &bh->b_dev, bh->b_size >> 9); - if (bh->b_dev == dev) { - printk (IO_ERROR, partition_name(bh->b_dev), bh->b_blocknr); - md_done_sync(mddev, bh->b_size>>10, 0); - } else { - printk (REDIRECT_SECTOR, - partition_name(bh->b_dev), bh->b_blocknr); - bh->b_rdev = bh->b_dev; - generic_make_request(READ, bh); - } - } - - break; - case READ: - case READA: - dev = bh->b_dev; - - raid1_map (mddev, &bh->b_dev, bh->b_size >> 9); - if (bh->b_dev == dev) { - printk (IO_ERROR, partition_name(bh->b_dev), bh->b_blocknr); - raid1_end_bh_io(r1_bh, 0); - } else { - printk (REDIRECT_SECTOR, - partition_name(bh->b_dev), bh->b_blocknr); - bh->b_rdev = bh->b_dev; - generic_make_request (r1_bh->cmd, bh); - } - break; - } - } - md_spin_unlock_irqrestore(&retry_list_lock, flags); -} -#undef IO_ERROR -#undef REDIRECT_SECTOR - -/* - * Private kernel thread to reconstruct mirrors after an unclean - * shutdown. - */ -static void raid1syncd (void *data) -{ - raid1_conf_t *conf = data; - mddev_t *mddev = conf->mddev; - - if (!conf->resync_mirrors) - return; - if (conf->resync_mirrors == 2) - return; - down(&mddev->recovery_sem); - if (!md_do_sync(mddev, NULL)) { - /* - * Only if everything went Ok. - */ - conf->resync_mirrors = 0; - } - - /* If reconstruction was interrupted, we need to close the "active" and "pending" - * holes. - * we know that there are no active rebuild requests, so cnt_active == cnt_ready == 0 - */ - /* this is really needed when recovery stops too... */ - spin_lock_irq(&conf->segment_lock); - conf->start_active = conf->start_pending; - conf->start_ready = conf->start_pending; - wait_event_lock_irq(conf->wait_ready, !conf->cnt_pending, conf->segment_lock); - conf->start_active = conf->start_ready = conf->start_pending = conf->start_future; - conf->start_future = mddev->sb->size+1; - conf->cnt_pending = conf->cnt_future; - conf->cnt_future = 0; - conf->phase = conf->phase ^1; - wait_event_lock_irq(conf->wait_ready, !conf->cnt_pending, conf->segment_lock); - conf->start_active = conf->start_ready = conf->start_pending = conf->start_future = 0; - conf->phase = 0; - conf->cnt_future = conf->cnt_done; - conf->cnt_done = 0; - spin_unlock_irq(&conf->segment_lock); - wake_up(&conf->wait_done); - - up(&mddev->recovery_sem); - raid1_shrink_buffers(conf); -} - -/* - * perform a "sync" on one "block" - * - * We need to make sure that no normal I/O request - particularly write - * requests - conflict with active sync requests. - * This is achieved by conceptually dividing the device space into a - * number of sections: - * DONE: 0 .. a-1 These blocks are in-sync - * ACTIVE: a .. b-1 These blocks may have active sync requests, but - * no normal IO requests - * READY: b .. c-1 These blocks have no normal IO requests - sync - * request may be happening - * PENDING: c .. d-1 These blocks may have IO requests, but no new - * ones will be added - * FUTURE: d .. end These blocks are not to be considered yet.
IO may - * be happening, but not sync - * - * We keep a - * phase which flips (0 or 1) each time d moves and - * a count of: - * z = active io requests in FUTURE since d moved - marked with - * current phase - * y = active io requests in FUTURE before d moved, or PENDING - - * marked with previous phase - * x = active sync requests in READY - * w = active sync requests in ACTIVE - * v = active io requests in DONE - * - * Normally, a=b=c=d=0 and z= active io requests - * or a=b=c=d=END and v= active io requests - * Allowed changes to a,b,c,d: - * A: c==d && y==0 -> d+=window, y=z, z=0, phase=!phase - * B: y==0 -> c=d - * C: b=c, w+=x, x=0 - * D: w==0 -> a=b - * E: a==b==c==d==end -> a=b=c=d=0, z=v, v=0 - * - * At start of sync we apply A. - * When y reaches 0, we apply B then A then begin sync requests - * When sync point reaches c-1, we wait for y==0, and w==0, and - * then apply B then A then D then C. - * Finally, we apply E. - * - * The sync request simply issues a "read" against a working drive. - * This is marked so that on completion the raid1d thread is woken to - * issue suitable write requests - */ - -static int raid1_sync_request (mddev_t *mddev, unsigned long block_nr) -{ - raid1_conf_t *conf = mddev_to_conf(mddev); - struct mirror_info *mirror; - struct raid1_bh *r1_bh; - struct buffer_head *bh; - int bsize; - - spin_lock_irq(&conf->segment_lock); - if (!block_nr) { - /* initialize ...*/ - int buffs; - conf->start_active = 0; - conf->start_ready = 0; - conf->start_pending = 0; - conf->start_future = 0; - conf->phase = 0; - /* we want enough buffers to hold twice the window of 128*/ - buffs = 128 *2 / (PAGE_SIZE>>9); - buffs = raid1_grow_buffers(conf, buffs); - if (buffs < 2) - goto nomem; - - conf->window = buffs*(PAGE_SIZE>>9)/2; - conf->cnt_future += conf->cnt_done+conf->cnt_pending; - conf->cnt_done = conf->cnt_pending = 0; - if (conf->cnt_ready || conf->cnt_active) - MD_BUG(); - } - while ((block_nr<<1) >= conf->start_pending) { - PRINTK("wait .. sect=%lu start_active=%d ready=%d pending=%d future=%d, cnt_done=%d active=%d ready=%d pending=%d future=%d\n", - block_nr<<1, conf->start_active, conf->start_ready, conf->start_pending, conf->start_future, - conf->cnt_done, conf->cnt_active, conf->cnt_ready, conf->cnt_pending, conf->cnt_future); - wait_event_lock_irq(conf->wait_done, - !conf->cnt_active, - conf->segment_lock); - wait_event_lock_irq(conf->wait_ready, - !conf->cnt_pending, - conf->segment_lock); - conf->start_active = conf->start_ready; - conf->start_ready = conf->start_pending; - conf->start_pending = conf->start_future; - conf->start_future = conf->start_future+conf->window; - // Note: falling off the end is not a problem - conf->phase = conf->phase ^1; - conf->cnt_active = conf->cnt_ready; - conf->cnt_ready = 0; - conf->cnt_pending = conf->cnt_future; - conf->cnt_future = 0; - wake_up(&conf->wait_done); - } - conf->cnt_ready++; - spin_unlock_irq(&conf->segment_lock); - - - /* If reconstructing, and >1 working disc, - * could dedicate one to rebuild and others to - * service read requests ..
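The wait-and-advance loop above is transition 'A' from the comment: once ACTIVE and PENDING have drained, every boundary shifts up one class, one window is appended to FUTURE's start, the phase bit flips, and the counters cascade. A standalone sketch of one advance step, with plain ints standing in for the conf fields:

#include <stdio.h>

struct win {
	unsigned long start_active, start_ready, start_pending, start_future;
	int cnt_active, cnt_ready, cnt_pending, cnt_future;
	int phase;
	unsigned long window;
};

/* one advance; the original first waits for !cnt_active and !cnt_pending */
static void advance(struct win *w)
{
	w->start_active  = w->start_ready;
	w->start_ready   = w->start_pending;
	w->start_pending = w->start_future;
	w->start_future += w->window;
	w->phase ^= 1;
	w->cnt_active  = w->cnt_ready;   /* READY sync requests become ACTIVE */
	w->cnt_ready   = 0;
	w->cnt_pending = w->cnt_future;  /* FUTURE io requests become PENDING */
	w->cnt_future  = 0;
}

int main(void)
{
	struct win w = { 0, 0, 0, 0, 0, 0, 0, 3, 0, 256 };
	advance(&w);  /* sync may now proceed below start_pending */
	printf("a=%lu b=%lu c=%lu d=%lu phase=%d cnt_pending=%d\n",
	       w.start_active, w.start_ready, w.start_pending,
	       w.start_future, w.phase, w.cnt_pending);
	return 0;
}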
- */ - mirror = conf->mirrors+conf->last_used; - - r1_bh = raid1_alloc_buf (conf); - r1_bh->master_bh = NULL; - r1_bh->mddev = mddev; - r1_bh->cmd = SPECIAL; - bh = &r1_bh->bh_req; - - bh->b_blocknr = block_nr; - bsize = 1024; - while (!(bh->b_blocknr & 1) && bsize < PAGE_SIZE - && (bh->b_blocknr+2)*(bsize>>10) < mddev->sb->size) { - bh->b_blocknr >>= 1; - bsize <<= 1; - } - bh->b_size = bsize; - bh->b_list = BUF_LOCKED; - bh->b_dev = mirror->dev; - bh->b_rdev = mirror->dev; - bh->b_state = (1<<BH_Req) | (1<<BH_Mapped); - if (!bh->b_page) - BUG(); - if (!bh->b_data) - BUG(); - if (bh->b_data != page_address(bh->b_page)) - BUG(); - bh->b_end_io = end_sync_read; - bh->b_private = r1_bh; - bh->b_rsector = block_nr<<1; - init_waitqueue_head(&bh->b_wait); - - generic_make_request(READ, bh); - md_sync_acct(bh->b_rdev, bh->b_size/512); - - return (bsize >> 10); - -nomem: - raid1_shrink_buffers(conf); - spin_unlock_irq(&conf->segment_lock); - return -ENOMEM; -} - -static void end_sync_read(struct buffer_head *bh, int uptodate) -{ - struct raid1_bh * r1_bh = (struct raid1_bh *)(bh->b_private); - - /* we have read a block, now it needs to be re-written, - * or re-read if the read failed. - * We don't do much here, just schedule handling by raid1d - */ - if (!uptodate) - md_error (mddev_to_kdev(r1_bh->mddev), bh->b_dev); - else - set_bit(R1BH_Uptodate, &r1_bh->state); - raid1_reschedule_retry(r1_bh); -} - -static void end_sync_write(struct buffer_head *bh, int uptodate) -{ - struct raid1_bh * r1_bh = (struct raid1_bh *)(bh->b_private); - - if (!uptodate) - md_error (mddev_to_kdev(r1_bh->mddev), bh->b_dev); - if (atomic_dec_and_test(&r1_bh->remaining)) { - mddev_t *mddev = r1_bh->mddev; - unsigned long sect = bh->b_blocknr * (bh->b_size>>9); - int size = bh->b_size; - raid1_free_buf(r1_bh); - sync_request_done(sect, mddev_to_conf(mddev)); - md_done_sync(mddev,size>>10, uptodate); - } -} - -/* - * This will catch the scenario in which one of the mirrors was - * mounted as a normal device rather than as a part of a raid set. - * - * check_consistency is very personality-dependent, eg. RAID5 cannot - * do this check, it uses another method. - */ -static int __check_consistency (mddev_t *mddev, int row) -{ - raid1_conf_t *conf = mddev_to_conf(mddev); - int disks = MD_SB_DISKS; - kdev_t dev; - struct buffer_head *bh = NULL; - int i, rc = 0; - char *buffer = NULL; - - for (i = 0; i < disks; i++) { - printk("(checking disk %d)\n",i); - if (!conf->mirrors[i].operational) - continue; - printk("(really checking disk %d)\n",i); - dev = conf->mirrors[i].dev; - set_blocksize(dev, 4096); - if ((bh = bread(dev, row / 4, 4096)) == NULL) - break; - if (!buffer) { - buffer = (char *) __get_free_page(GFP_KERNEL); - if (!buffer) - break; - memcpy(buffer, bh->b_data, 4096); - } else if (memcmp(buffer, bh->b_data, 4096)) { - rc = 1; - break; - } - bforget(bh); - fsync_dev(dev); - invalidate_buffers(dev); - bh = NULL; - } - if (buffer) - free_page((unsigned long) buffer); - if (bh) { - dev = bh->b_dev; - bforget(bh); - fsync_dev(dev); - invalidate_buffers(dev); - } - return rc; -} - -static int check_consistency (mddev_t *mddev) -{ - if (__check_consistency(mddev, 0)) -/* - * we do not do this currently, as it's perfectly possible to - * have an inconsistent array when it's freshly created. Only - * newly written data has to be consistent. 
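The b_blocknr/bsize loop above promotes the 1K sync unit to the largest naturally aligned block not exceeding PAGE_SIZE (and not running past the device): each time the block number is even it is halved while the block size doubles, and renumbering stops as soon as alignment is lost. A worked standalone version, assuming 4K pages and a device size given in 1K blocks:

#include <stdio.h>

#define PAGE_SIZE 4096UL

int main(void)
{
	unsigned long dev_size = 100000;  /* stand-in for mddev->sb->size, in 1K blocks */
	unsigned long block_nr;

	for (block_nr = 0; block_nr < 8; block_nr++) {
		unsigned long blk = block_nr, bsize = 1024;

		/* same condition as in raid1_sync_request() above */
		while (!(blk & 1) && bsize < PAGE_SIZE &&
		       (blk + 2) * (bsize >> 10) < dev_size) {
			blk >>= 1;   /* renumber in the doubled unit */
			bsize <<= 1;
		}
		printf("1K block %lu -> block %lu of %lu bytes\n",
		       block_nr, blk, bsize);
	}
	return 0;
}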
- */ - return 0; - - return 0; -} - -#define INVALID_LEVEL KERN_WARNING \ -"raid1: md%d: raid level not set to mirroring (%d)\n" - -#define NO_SB KERN_ERR \ -"raid1: disabled mirror %s (couldn't access raid superblock)\n" - -#define ERRORS KERN_ERR \ -"raid1: disabled mirror %s (errors detected)\n" - -#define NOT_IN_SYNC KERN_ERR \ -"raid1: disabled mirror %s (not in sync)\n" - -#define INCONSISTENT KERN_ERR \ -"raid1: disabled mirror %s (inconsistent descriptor)\n" - -#define ALREADY_RUNNING KERN_ERR \ -"raid1: disabled mirror %s (mirror %d already operational)\n" - -#define OPERATIONAL KERN_INFO \ -"raid1: device %s operational as mirror %d\n" - -#define MEM_ERROR KERN_ERR \ -"raid1: couldn't allocate memory for md%d\n" - -#define SPARE KERN_INFO \ -"raid1: spare disk %s\n" - -#define NONE_OPERATIONAL KERN_ERR \ -"raid1: no operational mirrors for md%d\n" - -#define RUNNING_CKRAID KERN_ERR \ -"raid1: detected mirror differences -- running resync\n" - -#define ARRAY_IS_ACTIVE KERN_INFO \ -"raid1: raid set md%d active with %d out of %d mirrors\n" - -#define THREAD_ERROR KERN_ERR \ -"raid1: couldn't allocate thread for md%d\n" - -#define START_RESYNC KERN_WARNING \ -"raid1: raid set md%d not clean; reconstructing mirrors\n" - -static int raid1_run (mddev_t *mddev) -{ - raid1_conf_t *conf; - int i, j, disk_idx; - struct mirror_info *disk; - mdp_super_t *sb = mddev->sb; - mdp_disk_t *descriptor; - mdk_rdev_t *rdev; - struct md_list_head *tmp; - int start_recovery = 0; - - MOD_INC_USE_COUNT; - - if (sb->level != 1) { - printk(INVALID_LEVEL, mdidx(mddev), sb->level); - goto out; - } - /* - * copy the already verified devices into our private RAID1 - * bookkeeping area. [whatever we allocate in raid1_run(), - * should be freed in raid1_stop()] - */ - - conf = kmalloc(sizeof(raid1_conf_t), GFP_KERNEL); - mddev->private = conf; - if (!conf) { - printk(MEM_ERROR, mdidx(mddev)); - goto out; - } - memset(conf, 0, sizeof(*conf)); - - ITERATE_RDEV(mddev,rdev,tmp) { - if (rdev->faulty) { - printk(ERRORS, partition_name(rdev->dev)); - } else { - if (!rdev->sb) { - MD_BUG(); - continue; - } - } - if (rdev->desc_nr == -1) { - MD_BUG(); - continue; - } - descriptor = &sb->disks[rdev->desc_nr]; - disk_idx = descriptor->raid_disk; - disk = conf->mirrors + disk_idx; - - if (disk_faulty(descriptor)) { - disk->number = descriptor->number; - disk->raid_disk = disk_idx; - disk->dev = rdev->dev; - disk->sect_limit = MAX_WORK_PER_DISK; - disk->operational = 0; - disk->write_only = 0; - disk->spare = 0; - disk->used_slot = 1; - disk->head_position = 0; - continue; - } - if (disk_active(descriptor)) { - if (!disk_sync(descriptor)) { - printk(NOT_IN_SYNC, - partition_name(rdev->dev)); - continue; - } - if ((descriptor->number > MD_SB_DISKS) || - (disk_idx > sb->raid_disks)) { - - printk(INCONSISTENT, - partition_name(rdev->dev)); - continue; - } - if (disk->operational) { - printk(ALREADY_RUNNING, - partition_name(rdev->dev), - disk_idx); - continue; - } - printk(OPERATIONAL, partition_name(rdev->dev), - disk_idx); - disk->number = descriptor->number; - disk->raid_disk = disk_idx; - disk->dev = rdev->dev; - disk->sect_limit = MAX_WORK_PER_DISK; - disk->operational = 1; - disk->write_only = 0; - disk->spare = 0; - disk->used_slot = 1; - disk->head_position = 0; - conf->working_disks++; - } else { - /* - * Must be a spare disk .. 
- */ - printk(SPARE, partition_name(rdev->dev)); - disk->number = descriptor->number; - disk->raid_disk = disk_idx; - disk->dev = rdev->dev; - disk->sect_limit = MAX_WORK_PER_DISK; - disk->operational = 0; - disk->write_only = 0; - disk->spare = 1; - disk->used_slot = 1; - disk->head_position = 0; - } - } - conf->raid_disks = sb->raid_disks; - conf->nr_disks = sb->nr_disks; - conf->mddev = mddev; - conf->device_lock = MD_SPIN_LOCK_UNLOCKED; - - conf->segment_lock = MD_SPIN_LOCK_UNLOCKED; - init_waitqueue_head(&conf->wait_buffer); - init_waitqueue_head(&conf->wait_done); - init_waitqueue_head(&conf->wait_ready); - - if (!conf->working_disks) { - printk(NONE_OPERATIONAL, mdidx(mddev)); - goto out_free_conf; - } - - - /* pre-allocate some buffer_head structures. - * As a minimum, 1 r1bh and raid_disks buffer_heads - * would probably get us by in tight memory situations, - * but a few more is probably a good idea. - * For now, try 16 r1bh and 16*raid_disks bufferheads - * This will allow at least 16 concurrent reads or writes - * even if kmalloc starts failing - */ - if (raid1_grow_r1bh(conf, 16) < 16 || - raid1_grow_bh(conf, 16*conf->raid_disks)< 16*conf->raid_disks) { - printk(MEM_ERROR, mdidx(mddev)); - goto out_free_conf; - } - - for (i = 0; i < MD_SB_DISKS; i++) { - - descriptor = sb->disks+i; - disk_idx = descriptor->raid_disk; - disk = conf->mirrors + disk_idx; - - if (disk_faulty(descriptor) && (disk_idx < conf->raid_disks) && - !disk->used_slot) { - - disk->number = descriptor->number; - disk->raid_disk = disk_idx; - disk->dev = MKDEV(0,0); - - disk->operational = 0; - disk->write_only = 0; - disk->spare = 0; - disk->used_slot = 1; - disk->head_position = 0; - } - } - - /* - * find the first working one and use it as a starting point - * to read balancing. - */ - for (j = 0; !conf->mirrors[j].operational; j++) - /* nothing */; - conf->last_used = j; - - /* - * initialize the 'working disks' list. - */ - for (i = conf->raid_disks - 1; i >= 0; i--) { - if (conf->mirrors[i].operational) { - conf->mirrors[i].next = j; - j = i; - } - } - - if (conf->working_disks != sb->raid_disks) { - printk(KERN_ALERT "raid1: md%d, not all disks are operational -- trying to recover array\n", mdidx(mddev)); - start_recovery = 1; - } - - if (!start_recovery && (sb->state & (1 << MD_SB_CLEAN))) { - /* - * we do sanity checks even if the device says - * it's clean ... - */ - if (check_consistency(mddev)) { - printk(RUNNING_CKRAID); - sb->state &= ~(1 << MD_SB_CLEAN); - } - } - - { - const char * name = "raid1d"; - - conf->thread = md_register_thread(raid1d, conf, name); - if (!conf->thread) { - printk(THREAD_ERROR, mdidx(mddev)); - goto out_free_conf; - } - } - - if (!start_recovery && !(sb->state & (1 << MD_SB_CLEAN))) { - const char * name = "raid1syncd"; - - conf->resync_thread = md_register_thread(raid1syncd, conf,name); - if (!conf->resync_thread) { - printk(THREAD_ERROR, mdidx(mddev)); - goto out_free_conf; - } - - printk(START_RESYNC, mdidx(mddev)); - conf->resync_mirrors = 1; - md_wakeup_thread(conf->resync_thread); - } - - /* - * Regenerate the "device is in sync with the raid set" bit for - * each device. 
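The two loops above thread the operational mirrors into a circular 'next' list used for read balancing: the first working disk becomes last_used, and the downward pass makes each working disk point at the next working one, with the wrap-around falling out naturally. A small standalone rendition using plain arrays instead of mirror_info:

#include <stdio.h>

#define RAID_DISKS 5

int main(void)
{
	int operational[RAID_DISKS] = { 0, 1, 1, 0, 1 };  /* disks 0 and 3 out */
	int next[RAID_DISKS];
	int i, j;

	/* first working disk: the initial read source (conf->last_used) */
	for (j = 0; !operational[j]; j++)
		;

	/* walking downwards, each working disk links to the lowest working
	 * disk found so far; the last link closes the ring */
	for (i = RAID_DISKS - 1; i >= 0; i--) {
		if (operational[i]) {
			next[i] = j;
			j = i;
		}
	}

	for (i = 0; i < RAID_DISKS; i++)
		if (operational[i])
			printf("disk %d -> disk %d\n", i, next[i]);  /* 1->2->4->1 */
	return 0;
}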
- */ - for (i = 0; i < MD_SB_DISKS; i++) { - mark_disk_nonsync(sb->disks+i); - for (j = 0; j < sb->raid_disks; j++) { - if (!conf->mirrors[j].operational) - continue; - if (sb->disks[i].number == conf->mirrors[j].number) - mark_disk_sync(sb->disks+i); - } - } - sb->active_disks = conf->working_disks; - - if (start_recovery) - md_recover_arrays(); - - - printk(ARRAY_IS_ACTIVE, mdidx(mddev), sb->active_disks, sb->raid_disks); - /* - * Ok, everything is just fine now - */ - return 0; - -out_free_conf: - raid1_shrink_r1bh(conf); - raid1_shrink_bh(conf, conf->freebh_cnt); - raid1_shrink_buffers(conf); - kfree(conf); - mddev->private = NULL; -out: - MOD_DEC_USE_COUNT; - return -EIO; -} - -#undef INVALID_LEVEL -#undef NO_SB -#undef ERRORS -#undef NOT_IN_SYNC -#undef INCONSISTENT -#undef ALREADY_RUNNING -#undef OPERATIONAL -#undef SPARE -#undef NONE_OPERATIONAL -#undef RUNNING_CKRAID -#undef ARRAY_IS_ACTIVE - -static int raid1_stop_resync (mddev_t *mddev) -{ - raid1_conf_t *conf = mddev_to_conf(mddev); - - if (conf->resync_thread) { - if (conf->resync_mirrors) { - conf->resync_mirrors = 2; - md_interrupt_thread(conf->resync_thread); - - printk(KERN_INFO "raid1: mirror resync was not fully finished, restarting next time.\n"); - return 1; - } - return 0; - } - return 0; -} - -static int raid1_restart_resync (mddev_t *mddev) -{ - raid1_conf_t *conf = mddev_to_conf(mddev); - - if (conf->resync_mirrors) { - if (!conf->resync_thread) { - MD_BUG(); - return 0; - } - conf->resync_mirrors = 1; - md_wakeup_thread(conf->resync_thread); - return 1; - } - return 0; -} - -static int raid1_stop (mddev_t *mddev) -{ - raid1_conf_t *conf = mddev_to_conf(mddev); - - md_unregister_thread(conf->thread); - if (conf->resync_thread) - md_unregister_thread(conf->resync_thread); - raid1_shrink_r1bh(conf); - raid1_shrink_bh(conf, conf->freebh_cnt); - raid1_shrink_buffers(conf); - kfree(conf); - mddev->private = NULL; - MOD_DEC_USE_COUNT; - return 0; -} - -static mdk_personality_t raid1_personality= -{ - name: "raid1", - make_request: raid1_make_request, - run: raid1_run, - stop: raid1_stop, - status: raid1_status, - error_handler: raid1_error, - diskop: raid1_diskop, - stop_resync: raid1_stop_resync, - restart_resync: raid1_restart_resync, - sync_request: raid1_sync_request -}; - -int raid1_init (void) -{ - return register_md_personality (RAID1, &raid1_personality); -} - -#ifdef MODULE -int init_module (void) -{ - return raid1_init(); -} - -void cleanup_module (void) -{ - unregister_md_personality (RAID1); -} -#endif diff --git a/drivers/block/raid5.c b/drivers/block/raid5.c deleted file mode 100644 index cff836dc4..000000000 --- a/drivers/block/raid5.c +++ /dev/null @@ -1,2371 +0,0 @@ -/* - * raid5.c : Multiple Devices driver for Linux - * Copyright (C) 1996, 1997 Ingo Molnar, Miguel de Icaza, Gadi Oxman - * Copyright (C) 1999, 2000 Ingo Molnar - * - * RAID-5 management functions. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2, or (at your option) - * any later version. - * - * You should have received a copy of the GNU General Public License - * (for example /usr/src/linux/COPYING); if not, write to the Free - * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
- */ - - -#include <linux/config.h> -#include <linux/module.h> -#include <linux/locks.h> -#include <linux/malloc.h> -#include <linux/raid/raid5.h> -#include <asm/bitops.h> -#include <asm/atomic.h> - -static mdk_personality_t raid5_personality; - -/* - * Stripe cache - */ - -#define NR_STRIPES 128 -#define HASH_PAGES 1 -#define HASH_PAGES_ORDER 0 -#define NR_HASH (HASH_PAGES * PAGE_SIZE / sizeof(struct stripe_head *)) -#define HASH_MASK (NR_HASH - 1) -#define stripe_hash(conf, sect, size) ((conf)->stripe_hashtbl[((sect) / (size >> 9)) & HASH_MASK]) - -/* - * The following can be used to debug the driver - */ -#define RAID5_DEBUG 0 -#define RAID5_PARANOIA 1 -#if RAID5_PARANOIA && CONFIG_SMP -# define CHECK_DEVLOCK() if (!spin_is_locked(&conf->device_lock)) BUG() -# define CHECK_SHLOCK(sh) if (!stripe_locked(sh)) BUG() -#else -# define CHECK_DEVLOCK() -# define CHECK_SHLOCK(unused) -#endif - -#if RAID5_DEBUG -#define PRINTK(x...) printk(x) -#define inline -#define __inline__ -#else -#define PRINTK(x...) do { } while (0) -#endif - -static void print_raid5_conf (raid5_conf_t *conf); - -static inline int stripe_locked(struct stripe_head *sh) -{ - return test_bit(STRIPE_LOCKED, &sh->state); -} - -static void __unlock_stripe(struct stripe_head *sh) -{ - if (!md_test_and_clear_bit(STRIPE_LOCKED, &sh->state)) - BUG(); - PRINTK("unlocking stripe %lu\n", sh->sector); - wake_up(&sh->wait); -} - -static void finish_unlock_stripe(struct stripe_head *sh) -{ - raid5_conf_t *conf = sh->raid_conf; - sh->cmd = STRIPE_NONE; - sh->phase = PHASE_COMPLETE; - atomic_dec(&conf->nr_pending_stripes); - atomic_inc(&conf->nr_cached_stripes); - __unlock_stripe(sh); - atomic_dec(&sh->count); - wake_up(&conf->wait_for_stripe); -} - -static void remove_hash(raid5_conf_t *conf, struct stripe_head *sh) -{ - PRINTK("remove_hash(), stripe %lu\n", sh->sector); - - CHECK_DEVLOCK(); - CHECK_SHLOCK(sh); - if (sh->hash_pprev) { - if (sh->hash_next) - sh->hash_next->hash_pprev = sh->hash_pprev; - *sh->hash_pprev = sh->hash_next; - sh->hash_pprev = NULL; - atomic_dec(&conf->nr_hashed_stripes); - } -} - -static void lock_get_bh (struct buffer_head *bh) -{ - while (md_test_and_set_bit(BH_Lock, &bh->b_state)) - __wait_on_buffer(bh); - atomic_inc(&bh->b_count); -} - -static __inline__ void insert_hash(raid5_conf_t *conf, struct stripe_head *sh) -{ - struct stripe_head **shp = &stripe_hash(conf, sh->sector, sh->size); - - PRINTK("insert_hash(), stripe %lu, nr_hashed_stripes %d\n", - sh->sector, atomic_read(&conf->nr_hashed_stripes)); - - CHECK_DEVLOCK(); - CHECK_SHLOCK(sh); - if ((sh->hash_next = *shp) != NULL) - (*shp)->hash_pprev = &sh->hash_next; - *shp = sh; - sh->hash_pprev = shp; - atomic_inc(&conf->nr_hashed_stripes); -} - -static struct buffer_head *get_free_buffer(struct stripe_head *sh, int b_size) -{ - struct buffer_head *bh; - unsigned long flags; - - CHECK_SHLOCK(sh); - md_spin_lock_irqsave(&sh->stripe_lock, flags); - bh = sh->buffer_pool; - if (!bh) - goto out_unlock; - sh->buffer_pool = bh->b_next; - bh->b_size = b_size; - if (atomic_read(&bh->b_count)) - BUG(); -out_unlock: - md_spin_unlock_irqrestore(&sh->stripe_lock, flags); - - return bh; -} - -static struct buffer_head *get_free_bh(struct stripe_head *sh) -{ - struct buffer_head *bh; - unsigned long flags; - - CHECK_SHLOCK(sh); - md_spin_lock_irqsave(&sh->stripe_lock, flags); - bh = sh->bh_pool; - if (!bh) - goto out_unlock; - sh->bh_pool = bh->b_next; - if (atomic_read(&bh->b_count)) - BUG(); -out_unlock: - md_spin_unlock_irqrestore(&sh->stripe_lock, flags); - - 
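The stripe cache above is keyed by the stripe_hash() macro: the sector is divided by the stripe size in sectors, so adjacent stripes land in adjacent buckets, then masked into one of NR_HASH chains. A standalone sketch of the bucket arithmetic, assuming one 4K hash page and 8-byte pointers:

#include <stdio.h>

#define PAGE_SIZE  4096
#define HASH_PAGES 1
#define NR_HASH    (HASH_PAGES * PAGE_SIZE / sizeof(void *))
#define HASH_MASK  (NR_HASH - 1)

/* mirrors stripe_hash(): sector -> stripe index -> bucket */
static unsigned long stripe_bucket(unsigned long sect, int size)
{
	return (sect / (size >> 9)) & HASH_MASK;
}

int main(void)
{
	unsigned long s;

	for (s = 0; s < 64; s += 8)  /* 4K stripes are 8 sectors apart */
		printf("sector %2lu -> bucket %lu (of %lu)\n",
		       s, stripe_bucket(s, 4096), (unsigned long)NR_HASH);
	return 0;
}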
return bh; -} - -static void put_free_buffer(struct stripe_head *sh, struct buffer_head *bh) -{ - unsigned long flags; - - if (atomic_read(&bh->b_count)) - BUG(); - CHECK_SHLOCK(sh); - md_spin_lock_irqsave(&sh->stripe_lock, flags); - bh->b_next = sh->buffer_pool; - sh->buffer_pool = bh; - md_spin_unlock_irqrestore(&sh->stripe_lock, flags); -} - -static void put_free_bh(struct stripe_head *sh, struct buffer_head *bh) -{ - unsigned long flags; - - if (atomic_read(&bh->b_count)) - BUG(); - CHECK_SHLOCK(sh); - md_spin_lock_irqsave(&sh->stripe_lock, flags); - bh->b_next = sh->bh_pool; - sh->bh_pool = bh; - md_spin_unlock_irqrestore(&sh->stripe_lock, flags); -} - -static struct stripe_head *get_free_stripe(raid5_conf_t *conf) -{ - struct stripe_head *sh; - - md_spin_lock_irq(&conf->device_lock); - sh = conf->free_sh_list; - if (!sh) - goto out; - conf->free_sh_list = sh->free_next; - atomic_dec(&conf->nr_free_sh); - if (!atomic_read(&conf->nr_free_sh) && conf->free_sh_list) - BUG(); - if (sh->hash_pprev || md_atomic_read(&sh->nr_pending) || - atomic_read(&sh->count)) - BUG(); -out: - md_spin_unlock_irq(&conf->device_lock); - return sh; -} - -static void __put_free_stripe (raid5_conf_t *conf, struct stripe_head *sh) -{ - if (atomic_read(&sh->count) != 0) - BUG(); - CHECK_DEVLOCK(); - CHECK_SHLOCK(sh); - clear_bit(STRIPE_LOCKED, &sh->state); - sh->free_next = conf->free_sh_list; - conf->free_sh_list = sh; - atomic_inc(&conf->nr_free_sh); -} - -static void shrink_buffers(struct stripe_head *sh, int num) -{ - struct buffer_head *bh; - - while (num--) { - bh = get_free_buffer(sh, -1); - if (!bh) - return; - free_page((unsigned long) bh->b_data); - kfree(bh); - } -} - -static void shrink_bh(struct stripe_head *sh, int num) -{ - struct buffer_head *bh; - - while (num--) { - bh = get_free_bh(sh); - if (!bh) - return; - kfree(bh); - } -} - -static int grow_raid5_buffers(struct stripe_head *sh, int num, int b_size, int priority) -{ - struct buffer_head *bh; - - while (num--) { - struct page *page; - bh = kmalloc(sizeof(struct buffer_head), priority); - if (!bh) - return 1; - memset(bh, 0, sizeof (struct buffer_head)); - init_waitqueue_head(&bh->b_wait); - page = alloc_page(priority); - bh->b_data = page_address(page); - if (!bh->b_data) { - kfree(bh); - return 1; - } - bh->b_size = b_size; - atomic_set(&bh->b_count, 0); - bh->b_page = page; - put_free_buffer(sh, bh); - } - return 0; -} - -static int grow_bh(struct stripe_head *sh, int num, int priority) -{ - struct buffer_head *bh; - - while (num--) { - bh = kmalloc(sizeof(struct buffer_head), priority); - if (!bh) - return 1; - memset(bh, 0, sizeof (struct buffer_head)); - init_waitqueue_head(&bh->b_wait); - put_free_bh(sh, bh); - } - return 0; -} - -static void raid5_free_buffer(struct stripe_head *sh, struct buffer_head *bh) -{ - put_free_buffer(sh, bh); -} - -static void raid5_free_bh(struct stripe_head *sh, struct buffer_head *bh) -{ - put_free_bh(sh, bh); -} - -static void raid5_free_old_bh(struct stripe_head *sh, int i) -{ - CHECK_SHLOCK(sh); - if (!sh->bh_old[i]) - BUG(); - raid5_free_buffer(sh, sh->bh_old[i]); - sh->bh_old[i] = NULL; -} - -static void raid5_update_old_bh(struct stripe_head *sh, int i) -{ - CHECK_SHLOCK(sh); - PRINTK("stripe %lu, idx %d, updating cache copy\n", sh->sector, i); - if (!sh->bh_copy[i]) - BUG(); - if (sh->bh_old[i]) - raid5_free_old_bh(sh, i); - sh->bh_old[i] = sh->bh_copy[i]; - sh->bh_copy[i] = NULL; -} - -static void free_stripe(struct stripe_head *sh) -{ - raid5_conf_t *conf = sh->raid_conf; - int disks = 
conf->raid_disks, j; - - if (atomic_read(&sh->count) != 0) - BUG(); - CHECK_DEVLOCK(); - CHECK_SHLOCK(sh); - PRINTK("free_stripe called, stripe %lu\n", sh->sector); - if (sh->phase != PHASE_COMPLETE || atomic_read(&sh->count)) { - PRINTK("raid5: free_stripe(), sector %lu, phase %d, count %d\n", sh->sector, sh->phase, atomic_read(&sh->count)); - return; - } - for (j = 0; j < disks; j++) { - if (sh->bh_old[j]) - raid5_free_old_bh(sh, j); - if (sh->bh_new[j] || sh->bh_copy[j]) - BUG(); - } - remove_hash(conf, sh); - __put_free_stripe(conf, sh); -} - -static int shrink_stripe_cache(raid5_conf_t *conf, int nr) -{ - struct stripe_head *sh; - int i, count = 0; - - PRINTK("shrink_stripe_cache called, %d/%d, clock %d\n", nr, atomic_read(&conf->nr_hashed_stripes), conf->clock); - md_spin_lock_irq(&conf->device_lock); - for (i = 0; i < NR_HASH; i++) { - sh = conf->stripe_hashtbl[(i + conf->clock) & HASH_MASK]; - for (; sh; sh = sh->hash_next) { - if (sh->phase != PHASE_COMPLETE) - continue; - if (atomic_read(&sh->count)) - continue; - /* - * Try to lock this stripe: - */ - if (md_test_and_set_bit(STRIPE_LOCKED, &sh->state)) - continue; - free_stripe(sh); - if (++count == nr) { - conf->clock = (i + conf->clock) & HASH_MASK; - goto out; - } - } - } -out: - md_spin_unlock_irq(&conf->device_lock); - PRINTK("shrink completed, nr_hashed_stripes %d, nr_pending_strips %d\n", - atomic_read(&conf->nr_hashed_stripes), - atomic_read(&conf->nr_pending_stripes)); - return count; -} - -void __wait_lock_stripe(struct stripe_head *sh) -{ - MD_DECLARE_WAITQUEUE(wait, current); - - PRINTK("wait_lock_stripe %lu\n", sh->sector); - if (!atomic_read(&sh->count)) - BUG(); - add_wait_queue(&sh->wait, &wait); -repeat: - set_current_state(TASK_UNINTERRUPTIBLE); - if (md_test_and_set_bit(STRIPE_LOCKED, &sh->state)) { - schedule(); - goto repeat; - } - PRINTK("wait_lock_stripe %lu done\n", sh->sector); - remove_wait_queue(&sh->wait, &wait); - current->state = TASK_RUNNING; -} - -static struct stripe_head *__find_stripe(raid5_conf_t *conf, unsigned long sector, int size) -{ - struct stripe_head *sh; - - PRINTK("__find_stripe, sector %lu\n", sector); - for (sh = stripe_hash(conf, sector, size); sh; sh = sh->hash_next) { - if (sh->sector == sector && sh->raid_conf == conf) { - if (sh->size != size) - BUG(); - return sh; - } - } - PRINTK("__stripe %lu not in cache\n", sector); - return NULL; -} - -static inline struct stripe_head *alloc_stripe(raid5_conf_t *conf, unsigned long sector, int size) -{ - struct stripe_head *sh; - struct buffer_head *buffer_pool, *bh_pool; - MD_DECLARE_WAITQUEUE(wait, current); - - PRINTK("alloc_stripe called\n"); - - - while ((sh = get_free_stripe(conf)) == NULL) { - int cnt; - add_wait_queue(&conf->wait_for_stripe, &wait); - set_current_state(TASK_UNINTERRUPTIBLE); - cnt = shrink_stripe_cache(conf, conf->max_nr_stripes / 8); - sh = get_free_stripe(conf); - if (!sh && cnt < (conf->max_nr_stripes/8)) { - md_wakeup_thread(conf->thread); - PRINTK("waiting for some stripes to complete - %d %d\n", cnt, conf->max_nr_stripes/8); - schedule(); - } - remove_wait_queue(&conf->wait_for_stripe, &wait); - current->state = TASK_RUNNING; - if (sh) - break; - } - - buffer_pool = sh->buffer_pool; - bh_pool = sh->bh_pool; - memset(sh, 0, sizeof(*sh)); - sh->stripe_lock = MD_SPIN_LOCK_UNLOCKED; - md_init_waitqueue_head(&sh->wait); - sh->buffer_pool = buffer_pool; - sh->bh_pool = bh_pool; - sh->phase = PHASE_COMPLETE; - sh->cmd = STRIPE_NONE; - sh->raid_conf = conf; - sh->sector = sector; - sh->size = size; - 
atomic_inc(&conf->nr_cached_stripes); - - return sh; -} - -static struct stripe_head *get_lock_stripe(raid5_conf_t *conf, unsigned long sector, int size) -{ - struct stripe_head *sh, *new = NULL; - - PRINTK("get_stripe, sector %lu\n", sector); - - /* - * Do this in set_blocksize()! - */ - if (conf->buffer_size != size) { - PRINTK("switching size, %d --> %d\n", conf->buffer_size, size); - shrink_stripe_cache(conf, conf->max_nr_stripes); - conf->buffer_size = size; - } - -repeat: - md_spin_lock_irq(&conf->device_lock); - sh = __find_stripe(conf, sector, size); - if (!sh) { - if (!new) { - md_spin_unlock_irq(&conf->device_lock); - new = alloc_stripe(conf, sector, size); - goto repeat; - } - sh = new; - new = NULL; - if (md_test_and_set_bit(STRIPE_LOCKED, &sh->state)) - BUG(); - insert_hash(conf, sh); - atomic_inc(&sh->count); - md_spin_unlock_irq(&conf->device_lock); - } else { - atomic_inc(&sh->count); - if (new) { - if (md_test_and_set_bit(STRIPE_LOCKED, &new->state)) - BUG(); - __put_free_stripe(conf, new); - } - md_spin_unlock_irq(&conf->device_lock); - PRINTK("get_stripe, waiting, sector %lu\n", sector); - if (md_test_and_set_bit(STRIPE_LOCKED, &sh->state)) - __wait_lock_stripe(sh); - } - return sh; -} - -static int grow_stripes(raid5_conf_t *conf, int num, int priority) -{ - struct stripe_head *sh; - - while (num--) { - sh = kmalloc(sizeof(struct stripe_head), priority); - if (!sh) - return 1; - memset(sh, 0, sizeof(*sh)); - sh->raid_conf = conf; - sh->stripe_lock = MD_SPIN_LOCK_UNLOCKED; - md_init_waitqueue_head(&sh->wait); - - if (md_test_and_set_bit(STRIPE_LOCKED, &sh->state)) - BUG(); - if (grow_raid5_buffers(sh, 2 * conf->raid_disks, PAGE_SIZE, priority)) { - shrink_buffers(sh, 2 * conf->raid_disks); - kfree(sh); - return 1; - } - if (grow_bh(sh, conf->raid_disks, priority)) { - shrink_buffers(sh, 2 * conf->raid_disks); - shrink_bh(sh, conf->raid_disks); - kfree(sh); - return 1; - } - md_spin_lock_irq(&conf->device_lock); - __put_free_stripe(conf, sh); - atomic_inc(&conf->nr_stripes); - md_spin_unlock_irq(&conf->device_lock); - } - return 0; -} - -static void shrink_stripes(raid5_conf_t *conf, int num) -{ - struct stripe_head *sh; - - while (num--) { - sh = get_free_stripe(conf); - if (!sh) - break; - if (md_test_and_set_bit(STRIPE_LOCKED, &sh->state)) - BUG(); - shrink_buffers(sh, conf->raid_disks * 2); - shrink_bh(sh, conf->raid_disks); - kfree(sh); - atomic_dec(&conf->nr_stripes); - } -} - - -static struct buffer_head *raid5_alloc_buffer(struct stripe_head *sh, int b_size) -{ - struct buffer_head *bh; - - bh = get_free_buffer(sh, b_size); - if (!bh) - BUG(); - return bh; -} - -static struct buffer_head *raid5_alloc_bh(struct stripe_head *sh) -{ - struct buffer_head *bh; - - bh = get_free_bh(sh); - if (!bh) - BUG(); - return bh; -} - -static void raid5_end_buffer_io (struct stripe_head *sh, int i, int uptodate) -{ - struct buffer_head *bh = sh->bh_new[i]; - - PRINTK("raid5_end_buffer_io %lu, uptodate: %d.\n", bh->b_blocknr, uptodate); - sh->bh_new[i] = NULL; - raid5_free_bh(sh, sh->bh_req[i]); - sh->bh_req[i] = NULL; - PRINTK("calling %p->end_io: %p.\n", bh, bh->b_end_io); - bh->b_end_io(bh, uptodate); - if (!uptodate) - printk(KERN_ALERT "raid5: %s: unrecoverable I/O error for " - "block %lu\n", - partition_name(mddev_to_kdev(sh->raid_conf->mddev)), - bh->b_blocknr); -} - -static inline void raid5_mark_buffer_uptodate (struct buffer_head *bh, int uptodate) -{ - if (uptodate) - set_bit(BH_Uptodate, &bh->b_state); - else - clear_bit(BH_Uptodate, &bh->b_state); -} - -static void 
raid5_end_request (struct buffer_head * bh, int uptodate) -{ - struct stripe_head *sh = bh->b_private; - raid5_conf_t *conf = sh->raid_conf; - int disks = conf->raid_disks, i; - unsigned long flags; - - PRINTK("end_request %lu, nr_pending %d, uptodate: %d, (caller: %p,%p,%p,%p).\n", sh->sector, atomic_read(&sh->nr_pending), uptodate, __builtin_return_address(0),__builtin_return_address(1),__builtin_return_address(2), __builtin_return_address(3)); - md_spin_lock_irqsave(&sh->stripe_lock, flags); - raid5_mark_buffer_uptodate(bh, uptodate); - if (!uptodate) - md_error(mddev_to_kdev(conf->mddev), bh->b_dev); - if (conf->failed_disks) { - for (i = 0; i < disks; i++) { - if (conf->disks[i].operational) - continue; - if (bh != sh->bh_old[i] && bh != sh->bh_req[i] && bh != sh->bh_copy[i]) - continue; - if (bh->b_dev != conf->disks[i].dev) - continue; - set_bit(STRIPE_ERROR, &sh->state); - } - } - md_spin_unlock_irqrestore(&sh->stripe_lock, flags); - - if (atomic_dec_and_test(&sh->nr_pending)) { - atomic_inc(&conf->nr_handle); - md_wakeup_thread(conf->thread); - } -} - -static void raid5_build_block (struct stripe_head *sh, struct buffer_head *bh, int i) -{ - raid5_conf_t *conf = sh->raid_conf; - char *b_data; - struct page *b_page; - unsigned long block = sh->sector / (sh->size >> 9); - - b_data = bh->b_data; - b_page = bh->b_page; - memset (bh, 0, sizeof (struct buffer_head)); - init_waitqueue_head(&bh->b_wait); - init_buffer(bh, raid5_end_request, sh); - bh->b_dev = conf->disks[i].dev; - bh->b_blocknr = block; - - bh->b_data = b_data; - bh->b_page = b_page; - - bh->b_rdev = conf->disks[i].dev; - bh->b_rsector = sh->sector; - - bh->b_state = (1 << BH_Req) | (1 << BH_Mapped); - bh->b_size = sh->size; - bh->b_list = BUF_LOCKED; -} - -static int raid5_error (mddev_t *mddev, kdev_t dev) -{ - raid5_conf_t *conf = (raid5_conf_t *) mddev->private; - mdp_super_t *sb = mddev->sb; - struct disk_info *disk; - int i; - - PRINTK("raid5_error called\n"); - conf->resync_parity = 0; - for (i = 0, disk = conf->disks; i < conf->raid_disks; i++, disk++) { - if (disk->dev == dev && disk->operational) { - disk->operational = 0; - mark_disk_faulty(sb->disks+disk->number); - mark_disk_nonsync(sb->disks+disk->number); - mark_disk_inactive(sb->disks+disk->number); - sb->active_disks--; - sb->working_disks--; - sb->failed_disks++; - mddev->sb_dirty = 1; - conf->working_disks--; - conf->failed_disks++; - md_wakeup_thread(conf->thread); - printk (KERN_ALERT - "raid5: Disk failure on %s, disabling device." - " Operation continuing on %d devices\n", - partition_name (dev), conf->working_disks); - return 0; - } - } - /* - * handle errors in spares (during reconstruction) - */ - if (conf->spare) { - disk = conf->spare; - if (disk->dev == dev) { - printk (KERN_ALERT - "raid5: Disk failure on spare %s\n", - partition_name (dev)); - if (!conf->spare->operational) { - MD_BUG(); - return -EIO; - } - disk->operational = 0; - disk->write_only = 0; - conf->spare = NULL; - mark_disk_faulty(sb->disks+disk->number); - mark_disk_nonsync(sb->disks+disk->number); - mark_disk_inactive(sb->disks+disk->number); - sb->spare_disks--; - sb->working_disks--; - sb->failed_disks++; - - return 0; - } - } - MD_BUG(); - return -EIO; -} - -/* - * Input: a 'big' sector number, - * Output: index of the data and parity disk, and the sector # in them. 
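raid5_compute_sector() below is the core of the geometry: the logical sector is split into chunk number and offset, the chunk number into a stripe and a provisional data-disk index, and the parity disk then rotates per stripe according to the chosen algorithm (the *_SYMMETRIC variants also renumber the data disks so they follow the parity around the ring). The mapping is self-contained enough to run in userspace; here is a standalone extraction with example geometry (3 disks, 8-sector chunks):

#include <stdio.h>

enum { LEFT_ASYM, RIGHT_ASYM, LEFT_SYM, RIGHT_SYM };

/* same arithmetic as raid5_compute_sector() for the RAID-5 case */
static unsigned long map_sector(unsigned long r_sector, int raid_disks,
				int sectors_per_chunk, int algo,
				int *dd_idx, int *pd_idx)
{
	int data_disks = raid_disks - 1;
	unsigned long chunk_number = r_sector / sectors_per_chunk;
	unsigned int chunk_offset = r_sector % sectors_per_chunk;
	unsigned long stripe = chunk_number / data_disks;

	*dd_idx = chunk_number % data_disks;

	switch (algo) {
	case LEFT_ASYM:
		*pd_idx = data_disks - stripe % raid_disks;
		if (*dd_idx >= *pd_idx)
			(*dd_idx)++;
		break;
	case RIGHT_ASYM:
		*pd_idx = stripe % raid_disks;
		if (*dd_idx >= *pd_idx)
			(*dd_idx)++;
		break;
	case LEFT_SYM:
		*pd_idx = data_disks - stripe % raid_disks;
		*dd_idx = (*pd_idx + 1 + *dd_idx) % raid_disks;
		break;
	case RIGHT_SYM:
		*pd_idx = stripe % raid_disks;
		*dd_idx = (*pd_idx + 1 + *dd_idx) % raid_disks;
		break;
	}
	return stripe * sectors_per_chunk + chunk_offset;
}

int main(void)
{
	unsigned long s;
	int dd, pd;

	for (s = 0; s < 48; s += 8) {  /* walk six consecutive chunks */
		unsigned long ns = map_sector(s, 3, 8, LEFT_SYM, &dd, &pd);
		printf("sector %2lu -> dev sector %2lu, data disk %d, parity disk %d\n",
		       s, ns, dd, pd);
	}
	return 0;
}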
- */ -static unsigned long raid5_compute_sector(unsigned long r_sector, unsigned int raid_disks, - unsigned int data_disks, unsigned int * dd_idx, - unsigned int * pd_idx, raid5_conf_t *conf) -{ - unsigned long stripe; - unsigned long chunk_number; - unsigned int chunk_offset; - unsigned long new_sector; - int sectors_per_chunk = conf->chunk_size >> 9; - - /* First compute the information on this sector */ - - /* - * Compute the chunk number and the sector offset inside the chunk - */ - chunk_number = r_sector / sectors_per_chunk; - chunk_offset = r_sector % sectors_per_chunk; - - /* - * Compute the stripe number - */ - stripe = chunk_number / data_disks; - - /* - * Compute the data disk and parity disk indexes inside the stripe - */ - *dd_idx = chunk_number % data_disks; - - /* - * Select the parity disk based on the user selected algorithm. - */ - if (conf->level == 4) - *pd_idx = data_disks; - else switch (conf->algorithm) { - case ALGORITHM_LEFT_ASYMMETRIC: - *pd_idx = data_disks - stripe % raid_disks; - if (*dd_idx >= *pd_idx) - (*dd_idx)++; - break; - case ALGORITHM_RIGHT_ASYMMETRIC: - *pd_idx = stripe % raid_disks; - if (*dd_idx >= *pd_idx) - (*dd_idx)++; - break; - case ALGORITHM_LEFT_SYMMETRIC: - *pd_idx = data_disks - stripe % raid_disks; - *dd_idx = (*pd_idx + 1 + *dd_idx) % raid_disks; - break; - case ALGORITHM_RIGHT_SYMMETRIC: - *pd_idx = stripe % raid_disks; - *dd_idx = (*pd_idx + 1 + *dd_idx) % raid_disks; - break; - default: - printk ("raid5: unsupported algorithm %d\n", conf->algorithm); - } - - /* - * Finally, compute the new sector number - */ - new_sector = stripe * sectors_per_chunk + chunk_offset; - return new_sector; -} - -static unsigned long compute_blocknr(struct stripe_head *sh, int i) -{ - raid5_conf_t *conf = sh->raid_conf; - int raid_disks = conf->raid_disks, data_disks = raid_disks - 1; - unsigned long new_sector = sh->sector, check; - int sectors_per_chunk = conf->chunk_size >> 9; - unsigned long stripe = new_sector / sectors_per_chunk; - int chunk_offset = new_sector % sectors_per_chunk; - int chunk_number, dummy1, dummy2, dd_idx = i; - unsigned long r_sector, blocknr; - - switch (conf->algorithm) { - case ALGORITHM_LEFT_ASYMMETRIC: - case ALGORITHM_RIGHT_ASYMMETRIC: - if (i > sh->pd_idx) - i--; - break; - case ALGORITHM_LEFT_SYMMETRIC: - case ALGORITHM_RIGHT_SYMMETRIC: - if (i < sh->pd_idx) - i += raid_disks; - i -= (sh->pd_idx + 1); - break; - default: - printk ("raid5: unsupported algorithm %d\n", conf->algorithm); - } - - chunk_number = stripe * data_disks + i; - r_sector = chunk_number * sectors_per_chunk + chunk_offset; - blocknr = r_sector / (sh->size >> 9); - - check = raid5_compute_sector (r_sector, raid_disks, data_disks, &dummy1, &dummy2, conf); - if (check != sh->sector || dummy1 != dd_idx || dummy2 != sh->pd_idx) { - printk("compute_blocknr: map not correct\n"); - return 0; - } - return blocknr; -} - -static void compute_block(struct stripe_head *sh, int dd_idx) -{ - raid5_conf_t *conf = sh->raid_conf; - int i, count, disks = conf->raid_disks; - struct buffer_head *bh_ptr[MAX_XOR_BLOCKS]; - - PRINTK("compute_block, stripe %lu, idx %d\n", sh->sector, dd_idx); - - if (sh->bh_old[dd_idx] == NULL) - sh->bh_old[dd_idx] = raid5_alloc_buffer(sh, sh->size); - raid5_build_block(sh, sh->bh_old[dd_idx], dd_idx); - - memset(sh->bh_old[dd_idx]->b_data, 0, sh->size); - bh_ptr[0] = sh->bh_old[dd_idx]; - count = 1; - for (i = 0; i < disks; i++) { - if (i == dd_idx) - continue; - if (sh->bh_old[i]) { - bh_ptr[count++] = sh->bh_old[i]; - } else { - 
printk("compute_block() %d, stripe %lu, %d not present\n", dd_idx, sh->sector, i); - } - if (count == MAX_XOR_BLOCKS) { - xor_block(count, &bh_ptr[0]); - count = 1; - } - } - if (count != 1) - xor_block(count, &bh_ptr[0]); - raid5_mark_buffer_uptodate(sh->bh_old[dd_idx], 1); -} - -static void compute_parity(struct stripe_head *sh, int method) -{ - raid5_conf_t *conf = sh->raid_conf; - int i, pd_idx = sh->pd_idx, disks = conf->raid_disks, count; - struct buffer_head *bh_ptr[MAX_XOR_BLOCKS]; - - PRINTK("compute_parity, stripe %lu, method %d\n", sh->sector, method); - for (i = 0; i < disks; i++) { - if (i == pd_idx || !sh->bh_new[i]) - continue; - if (!sh->bh_copy[i]) - sh->bh_copy[i] = raid5_alloc_buffer(sh, sh->size); - raid5_build_block(sh, sh->bh_copy[i], i); - atomic_set_buffer_dirty(sh->bh_copy[i]); - memcpy(sh->bh_copy[i]->b_data, sh->bh_new[i]->b_data, sh->size); - } - if (sh->bh_copy[pd_idx] == NULL) { - sh->bh_copy[pd_idx] = raid5_alloc_buffer(sh, sh->size); - atomic_set_buffer_dirty(sh->bh_copy[pd_idx]); - } - raid5_build_block(sh, sh->bh_copy[pd_idx], sh->pd_idx); - - if (method == RECONSTRUCT_WRITE) { - memset(sh->bh_copy[pd_idx]->b_data, 0, sh->size); - bh_ptr[0] = sh->bh_copy[pd_idx]; - count = 1; - for (i = 0; i < disks; i++) { - if (i == sh->pd_idx) - continue; - if (sh->bh_new[i]) { - bh_ptr[count++] = sh->bh_copy[i]; - } else if (sh->bh_old[i]) { - bh_ptr[count++] = sh->bh_old[i]; - } - if (count == MAX_XOR_BLOCKS) { - xor_block(count, &bh_ptr[0]); - count = 1; - } - } - if (count != 1) { - xor_block(count, &bh_ptr[0]); - } - } else if (method == READ_MODIFY_WRITE) { - memcpy(sh->bh_copy[pd_idx]->b_data, sh->bh_old[pd_idx]->b_data, sh->size); - bh_ptr[0] = sh->bh_copy[pd_idx]; - count = 1; - for (i = 0; i < disks; i++) { - if (i == sh->pd_idx) - continue; - if (sh->bh_new[i] && sh->bh_old[i]) { - bh_ptr[count++] = sh->bh_copy[i]; - bh_ptr[count++] = sh->bh_old[i]; - } - if (count >= (MAX_XOR_BLOCKS - 1)) { - xor_block(count, &bh_ptr[0]); - count = 1; - } - } - if (count != 1) { - xor_block(count, &bh_ptr[0]); - } - } - raid5_mark_buffer_uptodate(sh->bh_copy[pd_idx], 1); -} - -static void add_stripe_bh (struct stripe_head *sh, struct buffer_head *bh, int dd_idx, int rw) -{ - raid5_conf_t *conf = sh->raid_conf; - struct buffer_head *bh_req; - - PRINTK("adding bh b#%lu to stripe s#%lu\n", bh->b_blocknr, sh->sector); - CHECK_SHLOCK(sh); - if (sh->bh_new[dd_idx]) - BUG(); - - bh_req = raid5_alloc_bh(sh); - raid5_build_block(sh, bh_req, dd_idx); - bh_req->b_data = bh->b_data; - bh_req->b_page = bh->b_page; - - md_spin_lock_irq(&conf->device_lock); - if (sh->phase == PHASE_COMPLETE && sh->cmd == STRIPE_NONE) { - PRINTK("stripe s#%lu => PHASE_BEGIN (%s)\n", sh->sector, rw == READ ? "read" : "write"); - sh->phase = PHASE_BEGIN; - sh->cmd = (rw == READ) ? 
STRIPE_READ : STRIPE_WRITE; - atomic_inc(&conf->nr_pending_stripes); - atomic_inc(&conf->nr_handle); - PRINTK("# of pending stripes: %u, # of handle: %u\n", atomic_read(&conf->nr_pending_stripes), atomic_read(&conf->nr_handle)); - } - sh->bh_new[dd_idx] = bh; - sh->bh_req[dd_idx] = bh_req; - sh->cmd_new[dd_idx] = rw; - sh->new[dd_idx] = 1; - md_spin_unlock_irq(&conf->device_lock); - - PRINTK("added bh b#%lu to stripe s#%lu, disk %d.\n", bh->b_blocknr, sh->sector, dd_idx); -} - -static void complete_stripe(struct stripe_head *sh) -{ - raid5_conf_t *conf = sh->raid_conf; - int disks = conf->raid_disks; - int i, new = 0; - - PRINTK("complete_stripe %lu\n", sh->sector); - for (i = 0; i < disks; i++) { - if (sh->cmd == STRIPE_SYNC && sh->bh_copy[i]) - raid5_update_old_bh(sh, i); - if (sh->cmd == STRIPE_WRITE && i == sh->pd_idx) - raid5_update_old_bh(sh, i); - if (sh->bh_new[i]) { - PRINTK("stripe %lu finishes new bh, sh->new == %d\n", sh->sector, sh->new[i]); - if (!sh->new[i]) { -#if 0 - if (sh->cmd == STRIPE_WRITE) { - if (memcmp(sh->bh_new[i]->b_data, sh->bh_copy[i]->b_data, sh->size)) { - printk("copy differs, %s, sector %lu ", - test_bit(BH_Dirty, &sh->bh_new[i]->b_state) ? "dirty" : "clean", - sh->sector); - } else if (test_bit(BH_Dirty, &sh->bh_new[i]->b_state)) - printk("sector %lu dirty\n", sh->sector); - } -#endif - if (sh->cmd == STRIPE_WRITE) - raid5_update_old_bh(sh, i); - raid5_end_buffer_io(sh, i, 1); - continue; - } else - new++; - } - if (new && sh->cmd == STRIPE_WRITE) - printk("raid5: bug, completed STRIPE_WRITE with new == %d\n", new); - } - if (sh->cmd == STRIPE_SYNC) - md_done_sync(conf->mddev, (sh->size>>10) - sh->sync_redone,1); - if (!new) - finish_unlock_stripe(sh); - else { - PRINTK("stripe %lu, new == %d\n", sh->sector, new); - sh->phase = PHASE_BEGIN; - } -} - - -static void handle_stripe_write (mddev_t *mddev , raid5_conf_t *conf, - struct stripe_head *sh, int nr_write, int * operational, int disks, - int parity, int parity_failed, int nr_cache, int nr_cache_other, - int nr_failed_other, int nr_cache_overwrite, int nr_failed_overwrite) -{ - int i; - unsigned int block; - struct buffer_head *bh; - int method1 = INT_MAX, method2 = INT_MAX; - - /* - * Attempt to add entries :-) - */ - if (nr_write != disks - 1) { - for (i = 0; i < disks; i++) { - if (i == sh->pd_idx) - continue; - if (sh->bh_new[i]) - continue; - block = (int) compute_blocknr(sh, i); - bh = get_hash_table(mddev_to_kdev(mddev), block, sh->size); - if (!bh) - continue; - if (buffer_dirty(bh) && !md_test_and_set_bit(BH_Lock, &bh->b_state)) { - PRINTK("Whee.. sector %lu, index %d (%d) found in the buffer cache!\n", sh->sector, i, block); - add_stripe_bh(sh, bh, i, WRITE); - sh->new[i] = 0; - nr_write++; - if (sh->bh_old[i]) { - nr_cache_overwrite++; - nr_cache_other--; - } else - if (!operational[i]) { - nr_failed_overwrite++; - nr_failed_other--; - } - } - atomic_dec(&bh->b_count); - } - } - PRINTK("handle_stripe() -- begin writing, stripe %lu\n", sh->sector); - /* - * Writing, need to update parity buffer. - * - * Compute the number of I/O requests in the "reconstruct - * write" and "read modify write" methods. 
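method1 and method2 below count the extra reads each write strategy would cost: reconstruct-write must read every data block it neither writes nor already has cached, while read-modify-write must read the old copy of each overwritten block plus the old parity. A worked standalone comparison with illustrative counts (6 disks, 2 new blocks, 1 other block cached):

#include <stdio.h>
#include <limits.h>

int main(void)
{
	int disks = 6;              /* 5 data + 1 parity                    */
	int nr_write = 2;           /* new blocks being written             */
	int nr_cache_other = 1;     /* untouched old blocks already cached  */
	int nr_cache_overwrite = 0; /* old copies of written blocks cached  */
	int parity = 0;             /* old parity block cached?             */
	int nr_failed_other = 0, nr_failed_overwrite = 0, parity_failed = 0;
	int method1 = INT_MAX, method2 = INT_MAX;

	if (!nr_failed_other)       /* reconstruct write: read the rest     */
		method1 = (disks - 1) - (nr_write + nr_cache_other);
	if (!nr_failed_overwrite && !parity_failed)  /* read-modify-write   */
		method2 = nr_write - nr_cache_overwrite + (1 - parity);

	/* the immediate path above picks RECONSTRUCT_WRITE on a tie */
	printf("reconstruct-write: %d reads, read-modify-write: %d reads -> %s\n",
	       method1, method2,
	       method1 <= method2 ? "RECONSTRUCT_WRITE" : "READ_MODIFY_WRITE");
	return 0;
}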
- */ - if (!nr_failed_other) - method1 = (disks - 1) - (nr_write + nr_cache_other); - if (!nr_failed_overwrite && !parity_failed) - method2 = nr_write - nr_cache_overwrite + (1 - parity); - - if (method1 == INT_MAX && method2 == INT_MAX) - BUG(); - PRINTK("handle_stripe(), sector %lu, nr_write %d, method1 %d, method2 %d\n", sh->sector, nr_write, method1, method2); - - if (!method1 || !method2) { - sh->phase = PHASE_WRITE; - compute_parity(sh, method1 <= method2 ? RECONSTRUCT_WRITE : READ_MODIFY_WRITE); - - for (i = 0; i < disks; i++) { - if (!operational[i] && !conf->spare && !conf->resync_parity) - continue; - bh = sh->bh_copy[i]; - if (i != sh->pd_idx && ((bh == NULL) ^ (sh->bh_new[i] == NULL))) - printk("raid5: bug: bh == %p, bh_new[%d] == %p\n", bh, i, sh->bh_new[i]); - if (i == sh->pd_idx && !bh) - printk("raid5: bug: bh == NULL, i == pd_idx == %d\n", i); - if (bh) { - PRINTK("making request for buffer %d\n", i); - lock_get_bh(bh); - if (!operational[i] && !conf->resync_parity) { - PRINTK("writing spare %d\n", i); - atomic_inc(&sh->nr_pending); - bh->b_dev = bh->b_rdev = conf->spare->dev; - generic_make_request(WRITE, bh); - } else { - atomic_inc(&sh->nr_pending); - bh->b_dev = bh->b_rdev = conf->disks[i].dev; - generic_make_request(WRITE, bh); - } - atomic_dec(&bh->b_count); - } - } - PRINTK("handle_stripe() %lu, writing back %d buffers\n", sh->sector, md_atomic_read(&sh->nr_pending)); - return; - } - - if (method1 < method2) { - sh->write_method = RECONSTRUCT_WRITE; - for (i = 0; i < disks; i++) { - if (i == sh->pd_idx) - continue; - if (sh->bh_new[i] || sh->bh_old[i]) - continue; - sh->bh_old[i] = raid5_alloc_buffer(sh, sh->size); - raid5_build_block(sh, sh->bh_old[i], i); - } - } else { - sh->write_method = READ_MODIFY_WRITE; - for (i = 0; i < disks; i++) { - if (sh->bh_old[i]) - continue; - if (!sh->bh_new[i] && i != sh->pd_idx) - continue; - sh->bh_old[i] = raid5_alloc_buffer(sh, sh->size); - raid5_build_block(sh, sh->bh_old[i], i); - } - } - sh->phase = PHASE_READ_OLD; - for (i = 0; i < disks; i++) { - if (!sh->bh_old[i]) - continue; - if (test_bit(BH_Uptodate, &sh->bh_old[i]->b_state)) - continue; - lock_get_bh(sh->bh_old[i]); - atomic_inc(&sh->nr_pending); - sh->bh_old[i]->b_dev = sh->bh_old[i]->b_rdev = conf->disks[i].dev; - generic_make_request(READ, sh->bh_old[i]); - atomic_dec(&sh->bh_old[i]->b_count); - } - PRINTK("handle_stripe() %lu, reading %d old buffers\n", sh->sector, md_atomic_read(&sh->nr_pending)); -} - -/* - * Reading - */ -static void handle_stripe_read (mddev_t *mddev , raid5_conf_t *conf, - struct stripe_head *sh, int nr_read, int * operational, int disks, - int parity, int parity_failed, int nr_cache, int nr_cache_other, - int nr_failed_other, int nr_cache_overwrite, int nr_failed_overwrite) -{ - int i; - int method1 = INT_MAX; - - method1 = nr_read - nr_cache_overwrite; - - PRINTK("handle_stripe(), sector %lu, nr_read %d, nr_cache %d, method1 %d\n", sh->sector, nr_read, nr_cache, method1); - - if (!method1 || (method1 == 1 && nr_cache == disks - 1)) { - PRINTK("read %lu completed from cache\n", sh->sector); - for (i = 0; i < disks; i++) { - if (!sh->bh_new[i]) - continue; - if (!sh->bh_old[i]) - compute_block(sh, i); - memcpy(sh->bh_new[i]->b_data, sh->bh_old[i]->b_data, sh->size); - } - complete_stripe(sh); - return; - } - if (nr_failed_overwrite) { - sh->phase = PHASE_READ_OLD; - for (i = 0; i < disks; i++) { - if (sh->bh_old[i]) - continue; - if (!operational[i]) - continue; - sh->bh_old[i] = raid5_alloc_buffer(sh, sh->size); - raid5_build_block(sh, 
sh->bh_old[i], i); - lock_get_bh(sh->bh_old[i]); - atomic_inc(&sh->nr_pending); - sh->bh_old[i]->b_dev = sh->bh_old[i]->b_rdev = conf->disks[i].dev; - generic_make_request(READ, sh->bh_old[i]); - atomic_dec(&sh->bh_old[i]->b_count); - } - PRINTK("handle_stripe() %lu, phase READ_OLD, pending %d buffers\n", sh->sector, md_atomic_read(&sh->nr_pending)); - return; - } - sh->phase = PHASE_READ; - for (i = 0; i < disks; i++) { - if (!sh->bh_new[i]) - continue; - if (sh->bh_old[i]) { - memcpy(sh->bh_new[i]->b_data, sh->bh_old[i]->b_data, sh->size); - continue; - } -#if RAID5_PARANOIA - if (sh->bh_req[i] == NULL || test_bit(BH_Lock, &sh->bh_req[i]->b_state)) { - int j; - printk("req %d is NULL! or locked \n", i); - for (j=0; j<disks; j++) { - printk("%d: new=%p old=%p req=%p new=%d cmd=%d\n", - j, sh->bh_new[j], sh->bh_old[j], sh->bh_req[j], - sh->new[j], sh->cmd_new[j]); - } - - } -#endif - lock_get_bh(sh->bh_req[i]); - atomic_inc(&sh->nr_pending); - sh->bh_req[i]->b_dev = sh->bh_req[i]->b_rdev = conf->disks[i].dev; - generic_make_request(READ, sh->bh_req[i]); - atomic_dec(&sh->bh_req[i]->b_count); - } - PRINTK("handle_stripe() %lu, phase READ, pending %d\n", sh->sector, md_atomic_read(&sh->nr_pending)); -} - -/* - * Syncing - */ -static void handle_stripe_sync (mddev_t *mddev , raid5_conf_t *conf, - struct stripe_head *sh, int * operational, int disks, - int parity, int parity_failed, int nr_cache, int nr_cache_other, - int nr_failed_other, int nr_cache_overwrite, int nr_failed_overwrite) -{ - struct buffer_head *bh; - int i, pd_idx; - - /* firstly, we want to have data from all non-failed drives - * in bh_old - */ - PRINTK("handle_stripe_sync: sec=%lu disks=%d nr_cache=%d\n", sh->sector, disks, nr_cache); - if ((nr_cache < disks-1) || ((nr_cache == disks-1) && !(parity_failed+nr_failed_other+nr_failed_overwrite)) - ) { - sh->phase = PHASE_READ_OLD; - for (i = 0; i < disks; i++) { - if (sh->bh_old[i]) - continue; - if (!conf->disks[i].operational) - continue; - - bh = raid5_alloc_buffer(sh, sh->size); - sh->bh_old[i] = bh; - raid5_build_block(sh, bh, i); - lock_get_bh(bh); - atomic_inc(&sh->nr_pending); - bh->b_dev = bh->b_rdev = conf->disks[i].dev; - generic_make_request(READ, bh); - md_sync_acct(bh->b_rdev, bh->b_size/512); - atomic_dec(&sh->bh_old[i]->b_count); - } - PRINTK("handle_stripe_sync() %lu, phase READ_OLD, pending %d buffers\n", sh->sector, md_atomic_read(&sh->nr_pending)); - - return; - } - /* now, if there is a failed drive, rebuild and write to spare */ - if (nr_cache == disks-1) { - sh->phase = PHASE_WRITE; - /* we can generate the missing block, which will be on the failed drive */ - for (i=0; i<disks; i++) { - if (operational[i]) - continue; - compute_block(sh, i); - if (conf->spare) { - bh = sh->bh_copy[i]; - if (bh) { - memcpy(bh->b_data, sh->bh_old[i]->b_data, sh->size); - set_bit(BH_Uptodate, &bh->b_state); - } else { - bh = sh->bh_old[i]; - sh->bh_old[i] = NULL; - sh->bh_copy[i] = bh; - } - atomic_inc(&sh->nr_pending); - lock_get_bh(bh); - bh->b_dev = bh->b_rdev = conf->spare->dev; - generic_make_request(WRITE, bh); - md_sync_acct(bh->b_rdev, bh->b_size/512); - atomic_dec(&bh->b_count); - PRINTK("handle_stripe_sync() %lu, phase WRITE, pending %d buffers\n", sh->sector, md_atomic_read(&sh->nr_pending)); - } - break; - } - return; - } - - /* nr_cache == disks: - * check parity and compute/write if needed - */ - - compute_parity(sh, RECONSTRUCT_WRITE); - pd_idx = sh->pd_idx; - if (!memcmp(sh->bh_copy[pd_idx]->b_data, sh->bh_old[pd_idx]->b_data, sh->size)) { - /* the parity 
is correct - Yay! */ - complete_stripe(sh); - } else { - sh->phase = PHASE_WRITE; - bh = sh->bh_copy[pd_idx]; - atomic_set_buffer_dirty(bh); - lock_get_bh(bh); - atomic_inc(&sh->nr_pending); - bh->b_dev = bh->b_rdev = conf->disks[pd_idx].dev; - generic_make_request(WRITE, bh); - md_sync_acct(bh->b_rdev, bh->b_size/512); - atomic_dec(&bh->b_count); - PRINTK("handle_stripe_sync() %lu phase WRITE, pending %d buffers\n", - sh->sector, md_atomic_read(&sh->nr_pending)); - } -} - -/* - * handle_stripe() is our main logic routine. Note that: - * - * 1. lock_stripe() should be used whenever we can't accept additional - * buffers, either during short sleeping in handle_stripe() or - * during io operations. - * - * 2. We should be careful to set sh->nr_pending whenever we sleep, - * to prevent re-entry of handle_stripe() for the same sh. - * - * 3. conf->failed_disks and disk->operational can be changed - * from an interrupt. This complicates things a bit, but it allows - * us to stop issuing requests for a failed drive as soon as possible. - */ -static void handle_stripe(struct stripe_head *sh) -{ - raid5_conf_t *conf = sh->raid_conf; - mddev_t *mddev = conf->mddev; - int disks = conf->raid_disks; - int i, nr_read = 0, nr_write = 0, parity = 0; - int nr_cache = 0, nr_cache_other = 0, nr_cache_overwrite = 0; - int nr_failed_other = 0, nr_failed_overwrite = 0, parity_failed = 0; - int operational[MD_SB_DISKS], failed_disks = conf->failed_disks; - - PRINTK("handle_stripe(), stripe %lu\n", sh->sector); - if (!stripe_locked(sh)) - BUG(); - if (md_atomic_read(&sh->nr_pending)) - BUG(); - if (sh->phase == PHASE_COMPLETE) - BUG(); - - atomic_dec(&conf->nr_handle); - - if (md_test_and_clear_bit(STRIPE_ERROR, &sh->state)) { - printk("raid5: restarting stripe %lu\n", sh->sector); - sh->phase = PHASE_BEGIN; - } - - if ((sh->cmd == STRIPE_WRITE && sh->phase == PHASE_WRITE) || - (sh->cmd == STRIPE_READ && sh->phase == PHASE_READ) || - (sh->cmd == STRIPE_SYNC && sh->phase == PHASE_WRITE) - ) { - /* - * Completed - */ - complete_stripe(sh); - if (sh->phase == PHASE_COMPLETE) - return; - } - - md_spin_lock_irq(&conf->device_lock); - for (i = 0; i < disks; i++) { - operational[i] = conf->disks[i].operational; - if (i == sh->pd_idx && conf->resync_parity) - operational[i] = 0; - } - failed_disks = conf->failed_disks; - md_spin_unlock_irq(&conf->device_lock); - - /* - * Make this one more graceful? 
- */ - if (failed_disks > 1) { - for (i = 0; i < disks; i++) { - if (sh->bh_new[i]) { - raid5_end_buffer_io(sh, i, 0); - continue; - } - } - if (sh->cmd == STRIPE_SYNC) - md_done_sync(conf->mddev, (sh->size>>10) - sh->sync_redone,1); - finish_unlock_stripe(sh); - return; - } - - PRINTK("=== stripe index START ===\n"); - for (i = 0; i < disks; i++) { - PRINTK("disk %d, ", i); - if (sh->bh_old[i]) { - nr_cache++; - PRINTK(" (old cached, %d)", nr_cache); - } - if (i == sh->pd_idx) { - PRINTK(" PARITY."); - if (sh->bh_old[i]) { - PRINTK(" CACHED."); - parity = 1; - } else { - PRINTK(" UNCACHED."); - if (!operational[i]) { - PRINTK(" FAILED."); - parity_failed = 1; - } - } - PRINTK("\n"); - continue; - } - if (!sh->bh_new[i]) { - PRINTK(" (no new data block) "); - if (sh->bh_old[i]) { - PRINTK(" (but old block cached) "); - nr_cache_other++; - } else { - if (!operational[i]) { - PRINTK(" (because failed disk) "); - nr_failed_other++; - } else - PRINTK(" (no old block either) "); - } - PRINTK("\n"); - continue; - } - sh->new[i] = 0; - if (sh->cmd_new[i] == READ) { - nr_read++; - PRINTK(" (new READ %d)", nr_read); - } - if (sh->cmd_new[i] == WRITE) { - nr_write++; - PRINTK(" (new WRITE %d)", nr_write); - } - if (sh->bh_old[i]) { - nr_cache_overwrite++; - PRINTK(" (overwriting old %d)", nr_cache_overwrite); - } else { - if (!operational[i]) { - nr_failed_overwrite++; - PRINTK(" (overwriting failed %d)", nr_failed_overwrite); - } - } - PRINTK("\n"); - } - PRINTK("=== stripe index END ===\n"); - - if (nr_write && nr_read) - BUG(); - - if (nr_write) - handle_stripe_write( - mddev, conf, sh, nr_write, operational, disks, - parity, parity_failed, nr_cache, nr_cache_other, - nr_failed_other, nr_cache_overwrite, - nr_failed_overwrite - ); - else if (nr_read) - handle_stripe_read( - mddev, conf, sh, nr_read, operational, disks, - parity, parity_failed, nr_cache, nr_cache_other, - nr_failed_other, nr_cache_overwrite, - nr_failed_overwrite - ); - else if (sh->cmd == STRIPE_SYNC) - handle_stripe_sync( - mddev, conf, sh, operational, disks, - parity, parity_failed, nr_cache, nr_cache_other, - nr_failed_other, nr_cache_overwrite, nr_failed_overwrite - ); -} - - -static int raid5_make_request (mddev_t *mddev, int rw, struct buffer_head * bh) -{ - raid5_conf_t *conf = (raid5_conf_t *) mddev->private; - const unsigned int raid_disks = conf->raid_disks; - const unsigned int data_disks = raid_disks - 1; - unsigned int dd_idx, pd_idx; - unsigned long new_sector; - - struct stripe_head *sh; - - if (rw == READA) - rw = READ; - - new_sector = raid5_compute_sector(bh->b_rsector, - raid_disks, data_disks, &dd_idx, &pd_idx, conf); - - PRINTK("raid5_make_request, sector %lu\n", new_sector); - sh = get_lock_stripe(conf, new_sector, bh->b_size); -#if 0 - if ((rw == READ && sh->cmd == STRIPE_WRITE) || (rw == WRITE && sh->cmd == STRIPE_READ)) { - PRINTK("raid5: lock contention, rw == %d, sh->cmd == %d\n", rw, sh->cmd); - lock_stripe(sh); - if (!md_atomic_read(&sh->nr_pending)) - handle_stripe(sh); - goto repeat; - } -#endif - sh->pd_idx = pd_idx; - if (sh->phase != PHASE_COMPLETE && sh->phase != PHASE_BEGIN) - PRINTK("stripe %lu catching the bus!\n", sh->sector); - if (sh->bh_new[dd_idx]) - BUG(); - add_stripe_bh(sh, bh, dd_idx, rw); - - md_wakeup_thread(conf->thread); - return 0; -} - -/* - * Determine correct block size for this device. 
- */ -unsigned int device_bsize (kdev_t dev) -{ - unsigned int i, correct_size; - - correct_size = BLOCK_SIZE; - if (blksize_size[MAJOR(dev)]) { - i = blksize_size[MAJOR(dev)][MINOR(dev)]; - if (i) - correct_size = i; - } - - return correct_size; -} - -static int raid5_sync_request (mddev_t *mddev, unsigned long block_nr) -{ - raid5_conf_t *conf = (raid5_conf_t *) mddev->private; - struct stripe_head *sh; - int sectors_per_chunk = conf->chunk_size >> 9; - unsigned long stripe = (block_nr<<2)/sectors_per_chunk; - int chunk_offset = (block_nr<<2) % sectors_per_chunk; - int dd_idx, pd_idx; - unsigned long first_sector; - int raid_disks = conf->raid_disks; - int data_disks = raid_disks-1; - int redone = 0; - int bufsize; - - if (!conf->buffer_size) - conf->buffer_size = /* device_bsize(mddev_to_kdev(mddev))*/ PAGE_SIZE; - bufsize = conf->buffer_size; - /* Hmm... race on buffer_size ?? */ - redone = block_nr% (bufsize>>10); - block_nr -= redone; - sh = get_lock_stripe(conf, block_nr<<1, bufsize); - first_sector = raid5_compute_sector(stripe*data_disks*sectors_per_chunk - + chunk_offset, raid_disks, data_disks, &dd_idx, &pd_idx, conf); - sh->pd_idx = pd_idx; - sh->cmd = STRIPE_SYNC; - sh->phase = PHASE_BEGIN; - sh->sync_redone = redone; - atomic_inc(&conf->nr_pending_stripes); - atomic_inc(&conf->nr_handle); - md_wakeup_thread(conf->thread); - return (bufsize>>10)-redone; -} - -/* - * This is our raid5 kernel thread. - * - * We scan the hash table for stripes which can be handled now. - * During the scan, completed stripes are saved for us by the interrupt - * handler, so that they will not have to wait for our next wakeup. - */ -static void raid5d (void *data) -{ - struct stripe_head *sh; - raid5_conf_t *conf = data; - mddev_t *mddev = conf->mddev; - int i, handled; - - PRINTK("+++ raid5d active\n"); - - handled = 0; - md_spin_lock_irq(&conf->device_lock); - clear_bit(THREAD_WAKEUP, &conf->thread->flags); -repeat_pass: - if (mddev->sb_dirty) { - md_spin_unlock_irq(&conf->device_lock); - mddev->sb_dirty = 0; - md_update_sb(mddev); - md_spin_lock_irq(&conf->device_lock); - } - for (i = 0; i < NR_HASH; i++) { -repeat: - sh = conf->stripe_hashtbl[i]; - for (; sh; sh = sh->hash_next) { - if (sh->raid_conf != conf) - continue; - if (sh->phase == PHASE_COMPLETE) - continue; - if (md_atomic_read(&sh->nr_pending)) - continue; - md_spin_unlock_irq(&conf->device_lock); - if (!atomic_read(&sh->count)) - BUG(); - - handled++; - handle_stripe(sh); - md_spin_lock_irq(&conf->device_lock); - goto repeat; - } - } - if (conf) { - PRINTK("%d stripes handled, nr_handle %d\n", handled, md_atomic_read(&conf->nr_handle)); - if (test_and_clear_bit(THREAD_WAKEUP, &conf->thread->flags) && - md_atomic_read(&conf->nr_handle)) - goto repeat_pass; - } - md_spin_unlock_irq(&conf->device_lock); - - PRINTK("--- raid5d inactive\n"); -} - -/* - * Private kernel thread for parity reconstruction after an unclean - * shutdown. Reconstruction on spare drives in case of a failed drive - * is done by the generic mdsyncd. 
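The chunk arithmetic in raid5_sync_request() above is compact enough to check standalone. A user-space sketch; the chunk size and block numbers are example values, not driver defaults:

#include <stdio.h>

int main(void)
{
	/* Example figures only; chunk_size really comes from the superblock. */
	int chunk_size = 64 * 1024;
	int sectors_per_chunk = chunk_size >> 9;	/* 512-byte sectors */
	unsigned long block_nr;

	for (block_nr = 0; block_nr < 1024; block_nr += 300) {
		unsigned long stripe = (block_nr << 2) / sectors_per_chunk;
		int chunk_offset = (block_nr << 2) % sectors_per_chunk;
		printf("block %5lu -> stripe %3lu, offset %3d sectors\n",
		       block_nr, stripe, chunk_offset);
	}
	return 0;
}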
- */ -static void raid5syncd (void *data) -{ - raid5_conf_t *conf = data; - mddev_t *mddev = conf->mddev; - - if (!conf->resync_parity) - return; - if (conf->resync_parity == 2) - return; - down(&mddev->recovery_sem); - if (md_do_sync(mddev,NULL)) { - up(&mddev->recovery_sem); - printk("raid5: resync aborted!\n"); - return; - } - conf->resync_parity = 0; - up(&mddev->recovery_sem); - printk("raid5: resync finished.\n"); -} - -static int __check_consistency (mddev_t *mddev, int row) -{ - raid5_conf_t *conf = mddev->private; - kdev_t dev; - struct buffer_head *bh[MD_SB_DISKS], *tmp = NULL; - int i, ret = 0, nr = 0, count; - struct buffer_head *bh_ptr[MAX_XOR_BLOCKS]; - - if (conf->working_disks != conf->raid_disks) - goto out; - tmp = kmalloc(sizeof(*tmp), GFP_KERNEL); - tmp->b_size = 4096; - tmp->b_page = alloc_page(GFP_KERNEL); - tmp->b_data = page_address(tmp->b_page); - if (!tmp->b_data) - goto out; - md_clear_page(tmp->b_data); - memset(bh, 0, MD_SB_DISKS * sizeof(struct buffer_head *)); - for (i = 0; i < conf->raid_disks; i++) { - dev = conf->disks[i].dev; - set_blocksize(dev, 4096); - bh[i] = bread(dev, row / 4, 4096); - if (!bh[i]) - break; - nr++; - } - if (nr == conf->raid_disks) { - bh_ptr[0] = tmp; - count = 1; - for (i = 1; i < nr; i++) { - bh_ptr[count++] = bh[i]; - if (count == MAX_XOR_BLOCKS) { - xor_block(count, &bh_ptr[0]); - count = 1; - } - } - if (count != 1) { - xor_block(count, &bh_ptr[0]); - } - if (memcmp(tmp->b_data, bh[0]->b_data, 4096)) - ret = 1; - } - for (i = 0; i < conf->raid_disks; i++) { - dev = conf->disks[i].dev; - if (bh[i]) { - bforget(bh[i]); - bh[i] = NULL; - } - fsync_dev(dev); - invalidate_buffers(dev); - } - free_page((unsigned long) tmp->b_data); -out: - if (tmp) - kfree(tmp); - return ret; -} - -static int check_consistency (mddev_t *mddev) -{ - if (__check_consistency(mddev, 0)) -/* - * We are not checking this currently, as it's legitimate to have - * an inconsistent array, at creation time. - */ - return 0; - - return 0; -} - -static int raid5_run (mddev_t *mddev) -{ - raid5_conf_t *conf; - int i, j, raid_disk, memory; - mdp_super_t *sb = mddev->sb; - mdp_disk_t *desc; - mdk_rdev_t *rdev; - struct disk_info *disk; - struct md_list_head *tmp; - int start_recovery = 0; - - MOD_INC_USE_COUNT; - - if (sb->level != 5 && sb->level != 4) { - printk("raid5: md%d: raid level not set to 4/5 (%d)\n", mdidx(mddev), sb->level); - MOD_DEC_USE_COUNT; - return -EIO; - } - - mddev->private = kmalloc (sizeof (raid5_conf_t), GFP_KERNEL); - if ((conf = mddev->private) == NULL) - goto abort; - memset (conf, 0, sizeof (*conf)); - conf->mddev = mddev; - - if ((conf->stripe_hashtbl = (struct stripe_head **) md__get_free_pages(GFP_ATOMIC, HASH_PAGES_ORDER)) == NULL) - goto abort; - memset(conf->stripe_hashtbl, 0, HASH_PAGES * PAGE_SIZE); - - conf->device_lock = MD_SPIN_LOCK_UNLOCKED; - md_init_waitqueue_head(&conf->wait_for_stripe); - PRINTK("raid5_run(md%d) called.\n", mdidx(mddev)); - - ITERATE_RDEV(mddev,rdev,tmp) { - /* - * This is important -- we are using the descriptor on - * the disk only to get a pointer to the descriptor on - * the main superblock, which might be more recent. 
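The invariant that __check_consistency() above tests reduces to: XOR the corresponding block from every member except one, and the result must equal the remaining block. As a standalone sketch (hypothetical helper, plain buffers instead of buffer_heads):

#include <string.h>

/* Returns nonzero if the blocks violate the RAID-5 parity rule,
 * mirroring __check_consistency(): XOR blocks 1..nr-1 into a scratch
 * buffer and compare against block 0. */
static int stripe_inconsistent(unsigned char **blk, int nr, int size)
{
	unsigned char tmp[4096];	/* assumes size <= 4096, as above */
	int i, j;

	memset(tmp, 0, size);
	for (i = 1; i < nr; i++)
		for (j = 0; j < size; j++)
			tmp[j] ^= blk[i][j];
	return memcmp(tmp, blk[0], size) != 0;
}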
- */ - desc = sb->disks + rdev->desc_nr; - raid_disk = desc->raid_disk; - disk = conf->disks + raid_disk; - - if (disk_faulty(desc)) { - printk(KERN_ERR "raid5: disabled device %s (errors detected)\n", partition_name(rdev->dev)); - if (!rdev->faulty) { - MD_BUG(); - goto abort; - } - disk->number = desc->number; - disk->raid_disk = raid_disk; - disk->dev = rdev->dev; - - disk->operational = 0; - disk->write_only = 0; - disk->spare = 0; - disk->used_slot = 1; - continue; - } - if (disk_active(desc)) { - if (!disk_sync(desc)) { - printk(KERN_ERR "raid5: disabled device %s (not in sync)\n", partition_name(rdev->dev)); - MD_BUG(); - goto abort; - } - if (raid_disk > sb->raid_disks) { - printk(KERN_ERR "raid5: disabled device %s (inconsistent descriptor)\n", partition_name(rdev->dev)); - continue; - } - if (disk->operational) { - printk(KERN_ERR "raid5: disabled device %s (device %d already operational)\n", partition_name(rdev->dev), raid_disk); - continue; - } - printk(KERN_INFO "raid5: device %s operational as raid disk %d\n", partition_name(rdev->dev), raid_disk); - - disk->number = desc->number; - disk->raid_disk = raid_disk; - disk->dev = rdev->dev; - disk->operational = 1; - disk->used_slot = 1; - - conf->working_disks++; - } else { - /* - * Must be a spare disk .. - */ - printk(KERN_INFO "raid5: spare disk %s\n", partition_name(rdev->dev)); - disk->number = desc->number; - disk->raid_disk = raid_disk; - disk->dev = rdev->dev; - - disk->operational = 0; - disk->write_only = 0; - disk->spare = 1; - disk->used_slot = 1; - } - } - - for (i = 0; i < MD_SB_DISKS; i++) { - desc = sb->disks + i; - raid_disk = desc->raid_disk; - disk = conf->disks + raid_disk; - - if (disk_faulty(desc) && (raid_disk < sb->raid_disks) && - !conf->disks[raid_disk].used_slot) { - - disk->number = desc->number; - disk->raid_disk = raid_disk; - disk->dev = MKDEV(0,0); - - disk->operational = 0; - disk->write_only = 0; - disk->spare = 0; - disk->used_slot = 1; - } - } - - conf->raid_disks = sb->raid_disks; - /* - * 0 for a fully functional array, 1 for a degraded array. 
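The bookkeeping that follows feeds one simple rule: RAID-5 tolerates the loss of exactly one member. Stated as a hypothetical helper (not in the driver, which open-codes the check below):

/* 0 failed members: fully operational; 1: degraded but startable;
 * 2 or more: the array cannot be run at all. */
static int raid5_startable(int raid_disks, int working_disks)
{
	return (raid_disks - working_disks) <= 1;
}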
- */ - conf->failed_disks = conf->raid_disks - conf->working_disks; - conf->mddev = mddev; - conf->chunk_size = sb->chunk_size; - conf->level = sb->level; - conf->algorithm = sb->layout; - conf->max_nr_stripes = NR_STRIPES; - -#if 0 - for (i = 0; i < conf->raid_disks; i++) { - if (!conf->disks[i].used_slot) { - MD_BUG(); - goto abort; - } - } -#endif - if (!conf->chunk_size || conf->chunk_size % 4) { - printk(KERN_ERR "raid5: invalid chunk size %d for md%d\n", conf->chunk_size, mdidx(mddev)); - goto abort; - } - if (conf->algorithm > ALGORITHM_RIGHT_SYMMETRIC) { - printk(KERN_ERR "raid5: unsupported parity algorithm %d for md%d\n", conf->algorithm, mdidx(mddev)); - goto abort; - } - if (conf->failed_disks > 1) { - printk(KERN_ERR "raid5: not enough operational devices for md%d (%d/%d failed)\n", mdidx(mddev), conf->failed_disks, conf->raid_disks); - goto abort; - } - - if (conf->working_disks != sb->raid_disks) { - printk(KERN_ALERT "raid5: md%d, not all disks are operational -- trying to recover array\n", mdidx(mddev)); - start_recovery = 1; - } - - if (!start_recovery && (sb->state & (1 << MD_SB_CLEAN)) && - check_consistency(mddev)) { - printk(KERN_ERR "raid5: detected raid-5 superblock xor inconsistency -- running resync\n"); - sb->state &= ~(1 << MD_SB_CLEAN); - } - - { - const char * name = "raid5d"; - - conf->thread = md_register_thread(raid5d, conf, name); - if (!conf->thread) { - printk(KERN_ERR "raid5: couldn't allocate thread for md%d\n", mdidx(mddev)); - goto abort; - } - } - - memory = conf->max_nr_stripes * (sizeof(struct stripe_head) + - conf->raid_disks * (sizeof(struct buffer_head) + - 2 * (sizeof(struct buffer_head) + PAGE_SIZE))) / 1024; - if (grow_stripes(conf, conf->max_nr_stripes, GFP_KERNEL)) { - printk(KERN_ERR "raid5: couldn't allocate %dkB for buffers\n", memory); - shrink_stripes(conf, conf->max_nr_stripes); - goto abort; - } else - printk(KERN_INFO "raid5: allocated %dkB for md%d\n", memory, mdidx(mddev)); - - /* - * Regenerate the "device is in sync with the raid set" bit for - * each device. 
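The kilobyte figure printed above follows directly from the allocation shape: per stripe, one stripe_head plus, per member disk, one buffer_head for the request and two buffer_head+page pairs. As a standalone calculation; the struct sizes below are rough assumptions, not the real 2.4 sizes:

#include <stdio.h>

int main(void)
{
	/* Assumed example sizes -- the real values depend on the build. */
	int sizeof_stripe_head = 256;
	int sizeof_buffer_head = 96;
	int page_size = 4096;
	int nr_stripes = 256;		/* stands in for NR_STRIPES */
	int raid_disks = 4;

	int memory = nr_stripes * (sizeof_stripe_head +
		raid_disks * (sizeof_buffer_head +
		2 * (sizeof_buffer_head + page_size))) / 1024;

	printf("stripe cache: about %d kB\n", memory);
	return 0;
}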
- */ - for (i = 0; i < MD_SB_DISKS ; i++) { - mark_disk_nonsync(sb->disks + i); - for (j = 0; j < sb->raid_disks; j++) { - if (!conf->disks[j].operational) - continue; - if (sb->disks[i].number == conf->disks[j].number) - mark_disk_sync(sb->disks + i); - } - } - sb->active_disks = conf->working_disks; - - if (sb->active_disks == sb->raid_disks) - printk("raid5: raid level %d set md%d active with %d out of %d devices, algorithm %d\n", conf->level, mdidx(mddev), sb->active_disks, sb->raid_disks, conf->algorithm); - else - printk(KERN_ALERT "raid5: raid level %d set md%d active with %d out of %d devices, algorithm %d\n", conf->level, mdidx(mddev), sb->active_disks, sb->raid_disks, conf->algorithm); - - if (!start_recovery && !(sb->state & (1 << MD_SB_CLEAN))) { - const char * name = "raid5syncd"; - - conf->resync_thread = md_register_thread(raid5syncd, conf,name); - if (!conf->resync_thread) { - printk(KERN_ERR "raid5: couldn't allocate thread for md%d\n", mdidx(mddev)); - goto abort; - } - - printk("raid5: raid set md%d not clean; reconstructing parity\n", mdidx(mddev)); - conf->resync_parity = 1; - md_wakeup_thread(conf->resync_thread); - } - - print_raid5_conf(conf); - if (start_recovery) - md_recover_arrays(); - print_raid5_conf(conf); - - /* Ok, everything is just fine now */ - return (0); -abort: - if (conf) { - print_raid5_conf(conf); - if (conf->stripe_hashtbl) - free_pages((unsigned long) conf->stripe_hashtbl, - HASH_PAGES_ORDER); - kfree(conf); - } - mddev->private = NULL; - printk(KERN_ALERT "raid5: failed to run raid set md%d\n", mdidx(mddev)); - MOD_DEC_USE_COUNT; - return -EIO; -} - -static int raid5_stop_resync (mddev_t *mddev) -{ - raid5_conf_t *conf = mddev_to_conf(mddev); - mdk_thread_t *thread = conf->resync_thread; - - if (thread) { - if (conf->resync_parity) { - conf->resync_parity = 2; - md_interrupt_thread(thread); - printk(KERN_INFO "raid5: parity resync was not fully finished, restarting next time.\n"); - return 1; - } - return 0; - } - return 0; -} - -static int raid5_restart_resync (mddev_t *mddev) -{ - raid5_conf_t *conf = mddev_to_conf(mddev); - - if (conf->resync_parity) { - if (!conf->resync_thread) { - MD_BUG(); - return 0; - } - printk("raid5: waking up raid5resync.\n"); - conf->resync_parity = 1; - md_wakeup_thread(conf->resync_thread); - return 1; - } else - printk("raid5: no restart-resync needed.\n"); - return 0; -} - - -static int raid5_stop (mddev_t *mddev) -{ - raid5_conf_t *conf = (raid5_conf_t *) mddev->private; - - shrink_stripe_cache(conf, conf->max_nr_stripes); - shrink_stripes(conf, conf->max_nr_stripes); - md_unregister_thread(conf->thread); - if (conf->resync_thread) - md_unregister_thread(conf->resync_thread); - free_pages((unsigned long) conf->stripe_hashtbl, HASH_PAGES_ORDER); - kfree(conf); - mddev->private = NULL; - MOD_DEC_USE_COUNT; - return 0; -} - -#if RAID5_DEBUG -static void print_sh (struct stripe_head *sh) -{ - int i; - - printk("sh %lu, phase %d, size %d, pd_idx %d, state %ld, cmd %d.\n", sh->sector, sh->phase, sh->size, sh->pd_idx, sh->state, sh->cmd); - printk("sh %lu, write_method %d, nr_pending %d, count %d.\n", sh->sector, sh->write_method, atomic_read(&sh->nr_pending), atomic_read(&sh->count)); - printk("sh %lu, ", sh->sector); - for (i = 0; i < MD_SB_DISKS; i++) { - if (sh->bh_old[i]) - printk("(old%d: %p) ", i, sh->bh_old[i]); - if (sh->bh_new[i]) - printk("(new%d: %p) ", i, sh->bh_new[i]); - if (sh->bh_copy[i]) - printk("(copy%d: %p) ", i, sh->bh_copy[i]); - if (sh->bh_req[i]) - printk("(req%d: %p) ", i, sh->bh_req[i]); - 
} - printk("\n"); - for (i = 0; i < MD_SB_DISKS; i++) - printk("%d(%d/%d) ", i, sh->cmd_new[i], sh->new[i]); - printk("\n"); -} - -static void printall (raid5_conf_t *conf) -{ - struct stripe_head *sh; - int i; - - md_spin_lock_irq(&conf->device_lock); - for (i = 0; i < NR_HASH; i++) { - sh = conf->stripe_hashtbl[i]; - for (; sh; sh = sh->hash_next) { - if (sh->raid_conf != conf) - continue; - print_sh(sh); - } - } - md_spin_unlock_irq(&conf->device_lock); - - PRINTK("--- raid5d inactive\n"); -} -#endif - -static int raid5_status (char *page, mddev_t *mddev) -{ - raid5_conf_t *conf = (raid5_conf_t *) mddev->private; - mdp_super_t *sb = mddev->sb; - int sz = 0, i; - - sz += sprintf (page+sz, " level %d, %dk chunk, algorithm %d", sb->level, sb->chunk_size >> 10, sb->layout); - sz += sprintf (page+sz, " [%d/%d] [", conf->raid_disks, conf->working_disks); - for (i = 0; i < conf->raid_disks; i++) - sz += sprintf (page+sz, "%s", conf->disks[i].operational ? "U" : "_"); - sz += sprintf (page+sz, "]"); -#if RAID5_DEBUG -#define D(x) \ - sz += sprintf (page+sz, "<"#x":%d>", atomic_read(&conf->x)) - D(nr_handle); - D(nr_stripes); - D(nr_hashed_stripes); - D(nr_locked_stripes); - D(nr_pending_stripes); - D(nr_cached_stripes); - D(nr_free_sh); - printall(conf); -#endif - return sz; -} - -static void print_raid5_conf (raid5_conf_t *conf) -{ - int i; - struct disk_info *tmp; - - printk("RAID5 conf printout:\n"); - if (!conf) { - printk("(conf==NULL)\n"); - return; - } - printk(" --- rd:%d wd:%d fd:%d\n", conf->raid_disks, - conf->working_disks, conf->failed_disks); - - for (i = 0; i < MD_SB_DISKS; i++) { - tmp = conf->disks + i; - printk(" disk %d, s:%d, o:%d, n:%d rd:%d us:%d dev:%s\n", - i, tmp->spare,tmp->operational, - tmp->number,tmp->raid_disk,tmp->used_slot, - partition_name(tmp->dev)); - } -} - -static int raid5_diskop(mddev_t *mddev, mdp_disk_t **d, int state) -{ - int err = 0; - int i, failed_disk=-1, spare_disk=-1, removed_disk=-1, added_disk=-1; - raid5_conf_t *conf = mddev->private; - struct disk_info *tmp, *sdisk, *fdisk, *rdisk, *adisk; - mdp_super_t *sb = mddev->sb; - mdp_disk_t *failed_desc, *spare_desc, *added_desc; - - print_raid5_conf(conf); - md_spin_lock_irq(&conf->device_lock); - /* - * find the disk ... - */ - switch (state) { - - case DISKOP_SPARE_ACTIVE: - - /* - * Find the failed disk within the RAID5 configuration ... - * (this can only be in the first conf->raid_disks part) - */ - for (i = 0; i < conf->raid_disks; i++) { - tmp = conf->disks + i; - if ((!tmp->operational && !tmp->spare) || - !tmp->used_slot) { - failed_disk = i; - break; - } - } - /* - * When we activate a spare disk we _must_ have a disk in - * the lower (active) part of the array to replace. - */ - if ((failed_disk == -1) || (failed_disk >= conf->raid_disks)) { - MD_BUG(); - err = 1; - goto abort; - } - /* fall through */ - - case DISKOP_SPARE_WRITE: - case DISKOP_SPARE_INACTIVE: - - /* - * Find the spare disk ... 
(can only be in the 'high' - * area of the array) - */ - for (i = conf->raid_disks; i < MD_SB_DISKS; i++) { - tmp = conf->disks + i; - if (tmp->spare && tmp->number == (*d)->number) { - spare_disk = i; - break; - } - } - if (spare_disk == -1) { - MD_BUG(); - err = 1; - goto abort; - } - break; - - case DISKOP_HOT_REMOVE_DISK: - - for (i = 0; i < MD_SB_DISKS; i++) { - tmp = conf->disks + i; - if (tmp->used_slot && (tmp->number == (*d)->number)) { - if (tmp->operational) { - err = -EBUSY; - goto abort; - } - removed_disk = i; - break; - } - } - if (removed_disk == -1) { - MD_BUG(); - err = 1; - goto abort; - } - break; - - case DISKOP_HOT_ADD_DISK: - - for (i = conf->raid_disks; i < MD_SB_DISKS; i++) { - tmp = conf->disks + i; - if (!tmp->used_slot) { - added_disk = i; - break; - } - } - if (added_disk == -1) { - MD_BUG(); - err = 1; - goto abort; - } - break; - } - - switch (state) { - /* - * Switch the spare disk to write-only mode: - */ - case DISKOP_SPARE_WRITE: - if (conf->spare) { - MD_BUG(); - err = 1; - goto abort; - } - sdisk = conf->disks + spare_disk; - sdisk->operational = 1; - sdisk->write_only = 1; - conf->spare = sdisk; - break; - /* - * Deactivate a spare disk: - */ - case DISKOP_SPARE_INACTIVE: - sdisk = conf->disks + spare_disk; - sdisk->operational = 0; - sdisk->write_only = 0; - /* - * Was the spare being resynced? - */ - if (conf->spare == sdisk) - conf->spare = NULL; - break; - /* - * Activate (mark read-write) the (now sync) spare disk, - * which means we switch it's 'raid position' (->raid_disk) - * with the failed disk. (only the first 'conf->raid_disks' - * slots are used for 'real' disks and we must preserve this - * property) - */ - case DISKOP_SPARE_ACTIVE: - if (!conf->spare) { - MD_BUG(); - err = 1; - goto abort; - } - sdisk = conf->disks + spare_disk; - fdisk = conf->disks + failed_disk; - - spare_desc = &sb->disks[sdisk->number]; - failed_desc = &sb->disks[fdisk->number]; - - if (spare_desc != *d) { - MD_BUG(); - err = 1; - goto abort; - } - - if (spare_desc->raid_disk != sdisk->raid_disk) { - MD_BUG(); - err = 1; - goto abort; - } - - if (sdisk->raid_disk != spare_disk) { - MD_BUG(); - err = 1; - goto abort; - } - - if (failed_desc->raid_disk != fdisk->raid_disk) { - MD_BUG(); - err = 1; - goto abort; - } - - if (fdisk->raid_disk != failed_disk) { - MD_BUG(); - err = 1; - goto abort; - } - - /* - * do the switch finally - */ - xchg_values(*spare_desc, *failed_desc); - xchg_values(*fdisk, *sdisk); - - /* - * (careful, 'failed' and 'spare' are switched from now on) - * - * we want to preserve linear numbering and we want to - * give the proper raid_disk number to the now activated - * disk. (this means we switch back these values) - */ - - xchg_values(spare_desc->raid_disk, failed_desc->raid_disk); - xchg_values(sdisk->raid_disk, fdisk->raid_disk); - xchg_values(spare_desc->number, failed_desc->number); - xchg_values(sdisk->number, fdisk->number); - - *d = failed_desc; - - if (sdisk->dev == MKDEV(0,0)) - sdisk->used_slot = 0; - - /* - * this really activates the spare. - */ - fdisk->spare = 0; - fdisk->write_only = 0; - - /* - * if we activate a spare, we definitely replace a - * non-operational disk slot in the 'low' area of - * the disk array. 
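The three xchg_values() calls above are the subtle part of spare activation: the whole descriptors trade places first, then raid_disk and number are swapped back, so slot numbering stays linear while everything else changes owner. In miniature (hypothetical reduced types; the swap macro mirrors xchg_values):

#define SWAP(a, b) do { typeof(a) _t = (a); (a) = (b); (b) = _t; } while (0)

struct mini_desc { int number; int raid_disk; int dev; int state; };

/* Reduced model of the DISKOP_SPARE_ACTIVE swap: exchange the two
 * records wholesale, then hand the identity fields back. */
static void activate_spare(struct mini_desc *spare, struct mini_desc *failed)
{
	SWAP(*spare, *failed);				/* swap everything...  */
	SWAP(spare->raid_disk, failed->raid_disk);	/* ...then restore the */
	SWAP(spare->number, failed->number);		/* linear numbering    */
}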
- */ - conf->failed_disks--; - conf->working_disks++; - conf->spare = NULL; - - break; - - case DISKOP_HOT_REMOVE_DISK: - rdisk = conf->disks + removed_disk; - - if (rdisk->spare && (removed_disk < conf->raid_disks)) { - MD_BUG(); - err = 1; - goto abort; - } - rdisk->dev = MKDEV(0,0); - rdisk->used_slot = 0; - - break; - - case DISKOP_HOT_ADD_DISK: - adisk = conf->disks + added_disk; - added_desc = *d; - - if (added_disk != added_desc->number) { - MD_BUG(); - err = 1; - goto abort; - } - - adisk->number = added_desc->number; - adisk->raid_disk = added_desc->raid_disk; - adisk->dev = MKDEV(added_desc->major,added_desc->minor); - - adisk->operational = 0; - adisk->write_only = 0; - adisk->spare = 1; - adisk->used_slot = 1; - - - break; - - default: - MD_BUG(); - err = 1; - goto abort; - } -abort: - md_spin_unlock_irq(&conf->device_lock); - print_raid5_conf(conf); - return err; -} - -static mdk_personality_t raid5_personality= -{ - name: "raid5", - make_request: raid5_make_request, - run: raid5_run, - stop: raid5_stop, - status: raid5_status, - error_handler: raid5_error, - diskop: raid5_diskop, - stop_resync: raid5_stop_resync, - restart_resync: raid5_restart_resync, - sync_request: raid5_sync_request -}; - -int raid5_init (void) -{ - int err; - - err = register_md_personality (RAID5, &raid5_personality); - if (err) - return err; - - /* - * pick a XOR routine, runtime. - */ - calibrate_xor_block(); - - return 0; -} - -#ifdef MODULE -int init_module (void) -{ - return raid5_init(); -} - -void cleanup_module (void) -{ - unregister_md_personality (RAID5); -} -#endif diff --git a/drivers/block/swim3.c b/drivers/block/swim3.c index 802b04e89..6d07f3069 100644 --- a/drivers/block/swim3.c +++ b/drivers/block/swim3.c @@ -16,6 +16,7 @@ * handle GCR disks */ +#include <linux/config.h> #include <linux/stddef.h> #include <linux/kernel.h> #include <linux/sched.h> @@ -246,6 +247,13 @@ static int floppy_revalidate(kdev_t dev); static int swim3_add_device(struct device_node *swims); int swim3_init(void); +#ifndef CONFIG_PMAC_PBOOK +static inline int check_media_bay(struct device_node *which_bay, int what) +{ + return 1; +} +#endif + static void swim3_select(struct floppy_state *fs, int sel) { volatile struct swim3 *sw = fs->swim3; diff --git a/drivers/block/xd.c b/drivers/block/xd.c index 8bd2ec3bc..b5df63bd9 100644 --- a/drivers/block/xd.c +++ b/drivers/block/xd.c @@ -116,7 +116,7 @@ static unsigned int xd_bases[] __initdata = }; static struct hd_struct xd_struct[XD_MAXDRIVES << 6]; -static int xd_sizes[XD_MAXDRIVES << 6], xd_access[XD_MAXDRIVES] = { 0, 0 }; +static int xd_sizes[XD_MAXDRIVES << 6], xd_access[XD_MAXDRIVES]; static int xd_blocksizes[XD_MAXDRIVES << 6]; extern struct block_device_operations xd_fops; @@ -141,12 +141,12 @@ static struct block_device_operations xd_fops = { static DECLARE_WAIT_QUEUE_HEAD(xd_wait_int); static DECLARE_WAIT_QUEUE_HEAD(xd_wait_open); static u_char xd_valid[XD_MAXDRIVES] = { 0,0 }; -static u_char xd_drives = 0, xd_irq = 5, xd_dma = 3, xd_maxsectors; -static u_char xd_override __initdata = 0, xd_type = 0; +static u_char xd_drives, xd_irq = 5, xd_dma = 3, xd_maxsectors; +static u_char xd_override __initdata, xd_type __initdata; static u_short xd_iobase = 0x320; -static int xd_geo[XD_MAXDRIVES*3] __initdata = { 0,0,0,0,0,0 }; +static int xd_geo[XD_MAXDRIVES*3] __initdata; -static volatile int xdc_busy = 0; +static volatile int xdc_busy; static DECLARE_WAIT_QUEUE_HEAD(xdc_wait); static struct timer_list xd_timer, xd_watchdog_int; diff --git a/drivers/block/xor.c 
b/drivers/block/xor.c deleted file mode 100644 index eb99582de..000000000 --- a/drivers/block/xor.c +++ /dev/null @@ -1,1907 +0,0 @@ -/* - * xor.c : Multiple Devices driver for Linux - * - * Copyright (C) 1996, 1997, 1998, 1999 Ingo Molnar, Matti Aarnio, Jakub Jelinek - * - * - * optimized RAID-5 checksumming functions. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2, or (at your option) - * any later version. - * - * You should have received a copy of the GNU General Public License - * (for example /usr/src/linux/COPYING); if not, write to the Free - * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - */ -#include <linux/config.h> -#define BH_TRACE 0 -#include <linux/module.h> -#include <linux/raid/md.h> -#ifdef __sparc_v9__ -#include <asm/head.h> -#include <asm/asi.h> -#include <asm/visasm.h> -#endif - -/* - * we use the 'XOR function template' to register multiple xor - * functions runtime. The kernel measures their speed upon bootup - * and decides which one to use. (compile-time registration is - * not enough as certain CPU features like MMX can only be detected - * runtime) - * - * this architecture makes it pretty easy to add new routines - * that are faster on certain CPUs, without killing other CPU's - * 'native' routine. Although the current routines are belived - * to be the physically fastest ones on all CPUs tested, but - * feel free to prove me wrong and add yet another routine =B-) - * --mingo - */ - -#define MAX_XOR_BLOCKS 5 - -#define XOR_ARGS (unsigned int count, struct buffer_head **bh_ptr) - -typedef void (*xor_block_t) XOR_ARGS; -xor_block_t xor_block = NULL; - -#ifndef __sparc_v9__ - -struct xor_block_template; - -struct xor_block_template { - char * name; - xor_block_t xor_block; - int speed; - struct xor_block_template * next; -}; - -struct xor_block_template * xor_functions = NULL; - -#define XORBLOCK_TEMPLATE(x) \ -static void xor_block_##x XOR_ARGS; \ -static struct xor_block_template t_xor_block_##x = \ - { #x, xor_block_##x, 0, NULL }; \ -static void xor_block_##x XOR_ARGS - -#ifdef __i386__ - -#ifdef CONFIG_X86_XMM -/* - * Cache avoiding checksumming functions utilizing KNI instructions - * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo) - */ - -XORBLOCK_TEMPLATE(pIII_kni) -{ - char xmm_save[16*4]; - int cr0; - int lines = (bh_ptr[0]->b_size>>8); - - __asm__ __volatile__ ( - "movl %%cr0,%0 ;\n\t" - "clts ;\n\t" - "movups %%xmm0,(%1) ;\n\t" - "movups %%xmm1,0x10(%1) ;\n\t" - "movups %%xmm2,0x20(%1) ;\n\t" - "movups %%xmm3,0x30(%1) ;\n\t" - : "=r" (cr0) - : "r" (xmm_save) - : "memory" ); - -#define OFFS(x) "8*("#x"*2)" -#define PF0(x) \ - " prefetcht0 "OFFS(x)"(%1) ;\n" -#define LD(x,y) \ - " movaps "OFFS(x)"(%1), %%xmm"#y" ;\n" -#define ST(x,y) \ - " movaps %%xmm"#y", "OFFS(x)"(%1) ;\n" -#define PF1(x) \ - " prefetchnta "OFFS(x)"(%2) ;\n" -#define PF2(x) \ - " prefetchnta "OFFS(x)"(%3) ;\n" -#define PF3(x) \ - " prefetchnta "OFFS(x)"(%4) ;\n" -#define PF4(x) \ - " prefetchnta "OFFS(x)"(%5) ;\n" -#define PF5(x) \ - " prefetchnta "OFFS(x)"(%6) ;\n" -#define XO1(x,y) \ - " xorps "OFFS(x)"(%2), %%xmm"#y" ;\n" -#define XO2(x,y) \ - " xorps "OFFS(x)"(%3), %%xmm"#y" ;\n" -#define XO3(x,y) \ - " xorps "OFFS(x)"(%4), %%xmm"#y" ;\n" -#define XO4(x,y) \ - " xorps "OFFS(x)"(%5), %%xmm"#y" ;\n" -#define XO5(x,y) \ - " xorps "OFFS(x)"(%6), %%xmm"#y" ;\n" - - switch(count) { - case 2: - __asm__ 
__volatile__ ( -#undef BLOCK -#define BLOCK(i) \ - LD(i,0) \ - LD(i+1,1) \ - PF1(i) \ - PF1(i+2) \ - LD(i+2,2) \ - LD(i+3,3) \ - PF0(i+4) \ - PF0(i+6) \ - XO1(i,0) \ - XO1(i+1,1) \ - XO1(i+2,2) \ - XO1(i+3,3) \ - ST(i,0) \ - ST(i+1,1) \ - ST(i+2,2) \ - ST(i+3,3) \ - - - PF0(0) - PF0(2) - - " .align 32,0x90 ;\n" - " 1: ;\n" - - BLOCK(0) - BLOCK(4) - BLOCK(8) - BLOCK(12) - - " addl $256, %1 ;\n" - " addl $256, %2 ;\n" - " decl %0 ;\n" - " jnz 1b ;\n" - - : - : "r" (lines), - "r" (bh_ptr[0]->b_data), - "r" (bh_ptr[1]->b_data) - : "memory" ); - break; - case 3: - __asm__ __volatile__ ( -#undef BLOCK -#define BLOCK(i) \ - PF1(i) \ - PF1(i+2) \ - LD(i,0) \ - LD(i+1,1) \ - LD(i+2,2) \ - LD(i+3,3) \ - PF2(i) \ - PF2(i+2) \ - PF0(i+4) \ - PF0(i+6) \ - XO1(i,0) \ - XO1(i+1,1) \ - XO1(i+2,2) \ - XO1(i+3,3) \ - XO2(i,0) \ - XO2(i+1,1) \ - XO2(i+2,2) \ - XO2(i+3,3) \ - ST(i,0) \ - ST(i+1,1) \ - ST(i+2,2) \ - ST(i+3,3) \ - - - PF0(0) - PF0(2) - - " .align 32,0x90 ;\n" - " 1: ;\n" - - BLOCK(0) - BLOCK(4) - BLOCK(8) - BLOCK(12) - - " addl $256, %1 ;\n" - " addl $256, %2 ;\n" - " addl $256, %3 ;\n" - " decl %0 ;\n" - " jnz 1b ;\n" - : - : "r" (lines), - "r" (bh_ptr[0]->b_data), - "r" (bh_ptr[1]->b_data), - "r" (bh_ptr[2]->b_data) - : "memory" ); - break; - case 4: - __asm__ __volatile__ ( -#undef BLOCK -#define BLOCK(i) \ - PF1(i) \ - PF1(i+2) \ - LD(i,0) \ - LD(i+1,1) \ - LD(i+2,2) \ - LD(i+3,3) \ - PF2(i) \ - PF2(i+2) \ - XO1(i,0) \ - XO1(i+1,1) \ - XO1(i+2,2) \ - XO1(i+3,3) \ - PF3(i) \ - PF3(i+2) \ - PF0(i+4) \ - PF0(i+6) \ - XO2(i,0) \ - XO2(i+1,1) \ - XO2(i+2,2) \ - XO2(i+3,3) \ - XO3(i,0) \ - XO3(i+1,1) \ - XO3(i+2,2) \ - XO3(i+3,3) \ - ST(i,0) \ - ST(i+1,1) \ - ST(i+2,2) \ - ST(i+3,3) \ - - - PF0(0) - PF0(2) - - " .align 32,0x90 ;\n" - " 1: ;\n" - - BLOCK(0) - BLOCK(4) - BLOCK(8) - BLOCK(12) - - " addl $256, %1 ;\n" - " addl $256, %2 ;\n" - " addl $256, %3 ;\n" - " addl $256, %4 ;\n" - " decl %0 ;\n" - " jnz 1b ;\n" - - : - : "r" (lines), - "r" (bh_ptr[0]->b_data), - "r" (bh_ptr[1]->b_data), - "r" (bh_ptr[2]->b_data), - "r" (bh_ptr[3]->b_data) - : "memory" ); - break; - case 5: - __asm__ __volatile__ ( -#undef BLOCK -#define BLOCK(i) \ - PF1(i) \ - PF1(i+2) \ - LD(i,0) \ - LD(i+1,1) \ - LD(i+2,2) \ - LD(i+3,3) \ - PF2(i) \ - PF2(i+2) \ - XO1(i,0) \ - XO1(i+1,1) \ - XO1(i+2,2) \ - XO1(i+3,3) \ - PF3(i) \ - PF3(i+2) \ - XO2(i,0) \ - XO2(i+1,1) \ - XO2(i+2,2) \ - XO2(i+3,3) \ - PF4(i) \ - PF4(i+2) \ - PF0(i+4) \ - PF0(i+6) \ - XO3(i,0) \ - XO3(i+1,1) \ - XO3(i+2,2) \ - XO3(i+3,3) \ - XO4(i,0) \ - XO4(i+1,1) \ - XO4(i+2,2) \ - XO4(i+3,3) \ - ST(i,0) \ - ST(i+1,1) \ - ST(i+2,2) \ - ST(i+3,3) \ - - - PF0(0) - PF0(2) - - " .align 32,0x90 ;\n" - " 1: ;\n" - - BLOCK(0) - BLOCK(4) - BLOCK(8) - BLOCK(12) - - " addl $256, %1 ;\n" - " addl $256, %2 ;\n" - " addl $256, %3 ;\n" - " addl $256, %4 ;\n" - " addl $256, %5 ;\n" - " decl %0 ;\n" - " jnz 1b ;\n" - - : - : "r" (lines), - "r" (bh_ptr[0]->b_data), - "r" (bh_ptr[1]->b_data), - "r" (bh_ptr[2]->b_data), - "r" (bh_ptr[3]->b_data), - "r" (bh_ptr[4]->b_data) - : "memory"); - break; - } - - __asm__ __volatile__ ( - "sfence ;\n\t" - "movups (%1),%%xmm0 ;\n\t" - "movups 0x10(%1),%%xmm1 ;\n\t" - "movups 0x20(%1),%%xmm2 ;\n\t" - "movups 0x30(%1),%%xmm3 ;\n\t" - "movl %0,%%cr0 ;\n\t" - : - : "r" (cr0), "r" (xmm_save) - : "memory" ); -} - -#undef OFFS -#undef LD -#undef ST -#undef PF0 -#undef PF1 -#undef PF2 -#undef PF3 -#undef PF4 -#undef PF5 -#undef XO1 -#undef XO2 -#undef XO3 -#undef XO4 -#undef XO5 -#undef BLOCK - -#endif /* CONFIG_X86_XMM */ - -/* - * high-speed 
RAID5 checksumming functions utilizing MMX instructions - * Copyright (C) 1998 Ingo Molnar - */ -XORBLOCK_TEMPLATE(pII_mmx) -{ - char fpu_save[108]; - int lines = (bh_ptr[0]->b_size>>7); - - if (!(current->flags & PF_USEDFPU)) - __asm__ __volatile__ ( " clts;\n"); - - __asm__ __volatile__ ( " fsave %0; fwait\n"::"m"(fpu_save[0]) ); - -#define LD(x,y) \ - " movq 8*("#x")(%1), %%mm"#y" ;\n" -#define ST(x,y) \ - " movq %%mm"#y", 8*("#x")(%1) ;\n" -#define XO1(x,y) \ - " pxor 8*("#x")(%2), %%mm"#y" ;\n" -#define XO2(x,y) \ - " pxor 8*("#x")(%3), %%mm"#y" ;\n" -#define XO3(x,y) \ - " pxor 8*("#x")(%4), %%mm"#y" ;\n" -#define XO4(x,y) \ - " pxor 8*("#x")(%5), %%mm"#y" ;\n" - - switch(count) { - case 2: - __asm__ __volatile__ ( -#undef BLOCK -#define BLOCK(i) \ - LD(i,0) \ - LD(i+1,1) \ - LD(i+2,2) \ - LD(i+3,3) \ - XO1(i,0) \ - ST(i,0) \ - XO1(i+1,1) \ - ST(i+1,1) \ - XO1(i+2,2) \ - ST(i+2,2) \ - XO1(i+3,3) \ - ST(i+3,3) - - " .align 32,0x90 ;\n" - " 1: ;\n" - - BLOCK(0) - BLOCK(4) - BLOCK(8) - BLOCK(12) - - " addl $128, %1 ;\n" - " addl $128, %2 ;\n" - " decl %0 ;\n" - " jnz 1b ;\n" - : - : "r" (lines), - "r" (bh_ptr[0]->b_data), - "r" (bh_ptr[1]->b_data) - : "memory"); - break; - case 3: - __asm__ __volatile__ ( -#undef BLOCK -#define BLOCK(i) \ - LD(i,0) \ - LD(i+1,1) \ - LD(i+2,2) \ - LD(i+3,3) \ - XO1(i,0) \ - XO1(i+1,1) \ - XO1(i+2,2) \ - XO1(i+3,3) \ - XO2(i,0) \ - ST(i,0) \ - XO2(i+1,1) \ - ST(i+1,1) \ - XO2(i+2,2) \ - ST(i+2,2) \ - XO2(i+3,3) \ - ST(i+3,3) - - " .align 32,0x90 ;\n" - " 1: ;\n" - - BLOCK(0) - BLOCK(4) - BLOCK(8) - BLOCK(12) - - " addl $128, %1 ;\n" - " addl $128, %2 ;\n" - " addl $128, %3 ;\n" - " decl %0 ;\n" - " jnz 1b ;\n" - : - : "r" (lines), - "r" (bh_ptr[0]->b_data), - "r" (bh_ptr[1]->b_data), - "r" (bh_ptr[2]->b_data) - : "memory"); - break; - case 4: - __asm__ __volatile__ ( -#undef BLOCK -#define BLOCK(i) \ - LD(i,0) \ - LD(i+1,1) \ - LD(i+2,2) \ - LD(i+3,3) \ - XO1(i,0) \ - XO1(i+1,1) \ - XO1(i+2,2) \ - XO1(i+3,3) \ - XO2(i,0) \ - XO2(i+1,1) \ - XO2(i+2,2) \ - XO2(i+3,3) \ - XO3(i,0) \ - ST(i,0) \ - XO3(i+1,1) \ - ST(i+1,1) \ - XO3(i+2,2) \ - ST(i+2,2) \ - XO3(i+3,3) \ - ST(i+3,3) - - " .align 32,0x90 ;\n" - " 1: ;\n" - - BLOCK(0) - BLOCK(4) - BLOCK(8) - BLOCK(12) - - " addl $128, %1 ;\n" - " addl $128, %2 ;\n" - " addl $128, %3 ;\n" - " addl $128, %4 ;\n" - " decl %0 ;\n" - " jnz 1b ;\n" - : - : "r" (lines), - "r" (bh_ptr[0]->b_data), - "r" (bh_ptr[1]->b_data), - "r" (bh_ptr[2]->b_data), - "r" (bh_ptr[3]->b_data) - : "memory"); - break; - case 5: - __asm__ __volatile__ ( -#undef BLOCK -#define BLOCK(i) \ - LD(i,0) \ - LD(i+1,1) \ - LD(i+2,2) \ - LD(i+3,3) \ - XO1(i,0) \ - XO1(i+1,1) \ - XO1(i+2,2) \ - XO1(i+3,3) \ - XO2(i,0) \ - XO2(i+1,1) \ - XO2(i+2,2) \ - XO2(i+3,3) \ - XO3(i,0) \ - XO3(i+1,1) \ - XO3(i+2,2) \ - XO3(i+3,3) \ - XO4(i,0) \ - ST(i,0) \ - XO4(i+1,1) \ - ST(i+1,1) \ - XO4(i+2,2) \ - ST(i+2,2) \ - XO4(i+3,3) \ - ST(i+3,3) - - " .align 32,0x90 ;\n" - " 1: ;\n" - - BLOCK(0) - BLOCK(4) - BLOCK(8) - BLOCK(12) - - " addl $128, %1 ;\n" - " addl $128, %2 ;\n" - " addl $128, %3 ;\n" - " addl $128, %4 ;\n" - " addl $128, %5 ;\n" - " decl %0 ;\n" - " jnz 1b ;\n" - : - : "g" (lines), - "r" (bh_ptr[0]->b_data), - "r" (bh_ptr[1]->b_data), - "r" (bh_ptr[2]->b_data), - "r" (bh_ptr[3]->b_data), - "r" (bh_ptr[4]->b_data) - : "memory"); - break; - } - - __asm__ __volatile__ ( " frstor %0;\n"::"m"(fpu_save[0]) ); - - if (!(current->flags & PF_USEDFPU)) - stts(); -} - -#undef LD -#undef XO1 -#undef XO2 -#undef XO3 -#undef XO4 -#undef ST -#undef BLOCK - 
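The MMX routines here (pII_mmx above, p5_mmx next) share the same framing, which is the easy part to get wrong: MMX registers alias the x87 state, so a kernel-mode user has to checkpoint and restore it by hand. Stripped of the pxor payload, the protocol is (a skeleton, not a complete function):

/* Skeleton of the MMX entry/exit protocol: claim the FPU (clts),
 * save x87/MMX state, run the movq/pxor loop, restore state, and
 * re-arm the TS trap if the FPU was idle. */
char fpu_save[108];

if (!(current->flags & PF_USEDFPU))
	__asm__ __volatile__ ("clts");

__asm__ __volatile__ ("fsave %0; fwait" : : "m" (fpu_save[0]));

/* ... unrolled movq/pxor/movq work over bh_ptr[] goes here ... */

__asm__ __volatile__ ("frstor %0" : : "m" (fpu_save[0]));

if (!(current->flags & PF_USEDFPU))
	stts();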
-XORBLOCK_TEMPLATE(p5_mmx) -{ - char fpu_save[108]; - int lines = (bh_ptr[0]->b_size>>6); - - if (!(current->flags & PF_USEDFPU)) - __asm__ __volatile__ ( " clts;\n"); - - __asm__ __volatile__ ( " fsave %0; fwait\n"::"m"(fpu_save[0]) ); - - switch(count) { - case 2: - __asm__ __volatile__ ( - - " .align 32,0x90 ;\n" - " 1: ;\n" - " movq (%1), %%mm0 ;\n" - " movq 8(%1), %%mm1 ;\n" - " pxor (%2), %%mm0 ;\n" - " movq 16(%1), %%mm2 ;\n" - " movq %%mm0, (%1) ;\n" - " pxor 8(%2), %%mm1 ;\n" - " movq 24(%1), %%mm3 ;\n" - " movq %%mm1, 8(%1) ;\n" - " pxor 16(%2), %%mm2 ;\n" - " movq 32(%1), %%mm4 ;\n" - " movq %%mm2, 16(%1) ;\n" - " pxor 24(%2), %%mm3 ;\n" - " movq 40(%1), %%mm5 ;\n" - " movq %%mm3, 24(%1) ;\n" - " pxor 32(%2), %%mm4 ;\n" - " movq 48(%1), %%mm6 ;\n" - " movq %%mm4, 32(%1) ;\n" - " pxor 40(%2), %%mm5 ;\n" - " movq 56(%1), %%mm7 ;\n" - " movq %%mm5, 40(%1) ;\n" - " pxor 48(%2), %%mm6 ;\n" - " pxor 56(%2), %%mm7 ;\n" - " movq %%mm6, 48(%1) ;\n" - " movq %%mm7, 56(%1) ;\n" - - " addl $64, %1 ;\n" - " addl $64, %2 ;\n" - " decl %0 ;\n" - " jnz 1b ;\n" - - : - : "r" (lines), - "r" (bh_ptr[0]->b_data), - "r" (bh_ptr[1]->b_data) - : "memory" ); - break; - case 3: - __asm__ __volatile__ ( - - " .align 32,0x90 ;\n" - " 1: ;\n" - " movq (%1), %%mm0 ;\n" - " movq 8(%1), %%mm1 ;\n" - " pxor (%2), %%mm0 ;\n" - " movq 16(%1), %%mm2 ;\n" - " pxor 8(%2), %%mm1 ;\n" - " pxor (%3), %%mm0 ;\n" - " pxor 16(%2), %%mm2 ;\n" - " movq %%mm0, (%1) ;\n" - " pxor 8(%3), %%mm1 ;\n" - " pxor 16(%3), %%mm2 ;\n" - " movq 24(%1), %%mm3 ;\n" - " movq %%mm1, 8(%1) ;\n" - " movq 32(%1), %%mm4 ;\n" - " movq 40(%1), %%mm5 ;\n" - " pxor 24(%2), %%mm3 ;\n" - " movq %%mm2, 16(%1) ;\n" - " pxor 32(%2), %%mm4 ;\n" - " pxor 24(%3), %%mm3 ;\n" - " pxor 40(%2), %%mm5 ;\n" - " movq %%mm3, 24(%1) ;\n" - " pxor 32(%3), %%mm4 ;\n" - " pxor 40(%3), %%mm5 ;\n" - " movq 48(%1), %%mm6 ;\n" - " movq %%mm4, 32(%1) ;\n" - " movq 56(%1), %%mm7 ;\n" - " pxor 48(%2), %%mm6 ;\n" - " movq %%mm5, 40(%1) ;\n" - " pxor 56(%2), %%mm7 ;\n" - " pxor 48(%3), %%mm6 ;\n" - " pxor 56(%3), %%mm7 ;\n" - " movq %%mm6, 48(%1) ;\n" - " movq %%mm7, 56(%1) ;\n" - - " addl $64, %1 ;\n" - " addl $64, %2 ;\n" - " addl $64, %3 ;\n" - " decl %0 ;\n" - " jnz 1b ;\n" - - : - : "r" (lines), - "r" (bh_ptr[0]->b_data), - "r" (bh_ptr[1]->b_data), - "r" (bh_ptr[2]->b_data) - : "memory" ); - break; - case 4: - __asm__ __volatile__ ( - - " .align 32,0x90 ;\n" - " 1: ;\n" - " movq (%1), %%mm0 ;\n" - " movq 8(%1), %%mm1 ;\n" - " pxor (%2), %%mm0 ;\n" - " movq 16(%1), %%mm2 ;\n" - " pxor 8(%2), %%mm1 ;\n" - " pxor (%3), %%mm0 ;\n" - " pxor 16(%2), %%mm2 ;\n" - " pxor 8(%3), %%mm1 ;\n" - " pxor (%4), %%mm0 ;\n" - " movq 24(%1), %%mm3 ;\n" - " pxor 16(%3), %%mm2 ;\n" - " pxor 8(%4), %%mm1 ;\n" - " movq %%mm0, (%1) ;\n" - " movq 32(%1), %%mm4 ;\n" - " pxor 24(%2), %%mm3 ;\n" - " pxor 16(%4), %%mm2 ;\n" - " movq %%mm1, 8(%1) ;\n" - " movq 40(%1), %%mm5 ;\n" - " pxor 32(%2), %%mm4 ;\n" - " pxor 24(%3), %%mm3 ;\n" - " movq %%mm2, 16(%1) ;\n" - " pxor 40(%2), %%mm5 ;\n" - " pxor 32(%3), %%mm4 ;\n" - " pxor 24(%4), %%mm3 ;\n" - " movq %%mm3, 24(%1) ;\n" - " movq 56(%1), %%mm7 ;\n" - " movq 48(%1), %%mm6 ;\n" - " pxor 40(%3), %%mm5 ;\n" - " pxor 32(%4), %%mm4 ;\n" - " pxor 48(%2), %%mm6 ;\n" - " movq %%mm4, 32(%1) ;\n" - " pxor 56(%2), %%mm7 ;\n" - " pxor 40(%4), %%mm5 ;\n" - " pxor 48(%3), %%mm6 ;\n" - " pxor 56(%3), %%mm7 ;\n" - " movq %%mm5, 40(%1) ;\n" - " pxor 48(%4), %%mm6 ;\n" - " pxor 56(%4), %%mm7 ;\n" - " movq %%mm6, 48(%1) ;\n" - " movq %%mm7, 56(%1) ;\n" - - " addl $64, 
%1 ;\n" - " addl $64, %2 ;\n" - " addl $64, %3 ;\n" - " addl $64, %4 ;\n" - " decl %0 ;\n" - " jnz 1b ;\n" - - : - : "r" (lines), - "r" (bh_ptr[0]->b_data), - "r" (bh_ptr[1]->b_data), - "r" (bh_ptr[2]->b_data), - "r" (bh_ptr[3]->b_data) - : "memory" ); - break; - case 5: - __asm__ __volatile__ ( - - " .align 32,0x90 ;\n" - " 1: ;\n" - " movq (%1), %%mm0 ;\n" - " movq 8(%1), %%mm1 ;\n" - " pxor (%2), %%mm0 ;\n" - " pxor 8(%2), %%mm1 ;\n" - " movq 16(%1), %%mm2 ;\n" - " pxor (%3), %%mm0 ;\n" - " pxor 8(%3), %%mm1 ;\n" - " pxor 16(%2), %%mm2 ;\n" - " pxor (%4), %%mm0 ;\n" - " pxor 8(%4), %%mm1 ;\n" - " pxor 16(%3), %%mm2 ;\n" - " movq 24(%1), %%mm3 ;\n" - " pxor (%5), %%mm0 ;\n" - " pxor 8(%5), %%mm1 ;\n" - " movq %%mm0, (%1) ;\n" - " pxor 16(%4), %%mm2 ;\n" - " pxor 24(%2), %%mm3 ;\n" - " movq %%mm1, 8(%1) ;\n" - " pxor 16(%5), %%mm2 ;\n" - " pxor 24(%3), %%mm3 ;\n" - " movq 32(%1), %%mm4 ;\n" - " movq %%mm2, 16(%1) ;\n" - " pxor 24(%4), %%mm3 ;\n" - " pxor 32(%2), %%mm4 ;\n" - " movq 40(%1), %%mm5 ;\n" - " pxor 24(%5), %%mm3 ;\n" - " pxor 32(%3), %%mm4 ;\n" - " pxor 40(%2), %%mm5 ;\n" - " movq %%mm3, 24(%1) ;\n" - " pxor 32(%4), %%mm4 ;\n" - " pxor 40(%3), %%mm5 ;\n" - " movq 48(%1), %%mm6 ;\n" - " movq 56(%1), %%mm7 ;\n" - " pxor 32(%5), %%mm4 ;\n" - " pxor 40(%4), %%mm5 ;\n" - " pxor 48(%2), %%mm6 ;\n" - " pxor 56(%2), %%mm7 ;\n" - " movq %%mm4, 32(%1) ;\n" - " pxor 48(%3), %%mm6 ;\n" - " pxor 56(%3), %%mm7 ;\n" - " pxor 40(%5), %%mm5 ;\n" - " pxor 48(%4), %%mm6 ;\n" - " pxor 56(%4), %%mm7 ;\n" - " movq %%mm5, 40(%1) ;\n" - " pxor 48(%5), %%mm6 ;\n" - " pxor 56(%5), %%mm7 ;\n" - " movq %%mm6, 48(%1) ;\n" - " movq %%mm7, 56(%1) ;\n" - - " addl $64, %1 ;\n" - " addl $64, %2 ;\n" - " addl $64, %3 ;\n" - " addl $64, %4 ;\n" - " addl $64, %5 ;\n" - " decl %0 ;\n" - " jnz 1b ;\n" - - : - : "g" (lines), - "r" (bh_ptr[0]->b_data), - "r" (bh_ptr[1]->b_data), - "r" (bh_ptr[2]->b_data), - "r" (bh_ptr[3]->b_data), - "r" (bh_ptr[4]->b_data) - : "memory" ); - break; - } - - __asm__ __volatile__ ( " frstor %0;\n"::"m"(fpu_save[0]) ); - - if (!(current->flags & PF_USEDFPU)) - stts(); -} -#endif /* __i386__ */ -#endif /* !__sparc_v9__ */ - -#ifdef __sparc_v9__ -/* - * High speed xor_block operation for RAID4/5 utilizing the - * UltraSparc Visual Instruction Set. - * - * Copyright (C) 1997, 1999 Jakub Jelinek (jj@ultra.linux.cz) - * - * Requirements: - * !(((long)dest | (long)sourceN) & (64 - 1)) && - * !(len & 127) && len >= 256 - * - * It is done in pure assembly, as otherwise gcc makes it - * a non-leaf function, which is not what we want. - * Also, we don't measure the speeds as on other architectures, - * as the measuring routine does not take into account cold caches - * and the fact that xor_block_VIS bypasses the caches. - * xor_block_32regs might be 5% faster for count 2 if caches are hot - * and things just right (for count 3 VIS is about as fast as 32regs for - * hot caches and for count 4 and 5 VIS is faster by good margin always), - * but I think it is better not to pollute the caches. - * Actually, if I'd just fight for speed for hot caches, I could - * write a hybrid VIS/integer routine, which would do always two - * 64B blocks in VIS and two in IEUs, but I really care more about - * caches. 
- */ -extern void *VISenter(void); -extern void xor_block_VIS XOR_ARGS; - -void __xor_block_VIS(void) -{ -__asm__ (" - .globl xor_block_VIS -xor_block_VIS: - ldx [%%o1 + 0], %%o4 - ldx [%%o1 + 8], %%o3 - ldx [%%o4 + %1], %%g5 - ldx [%%o4 + %0], %%o4 - ldx [%%o3 + %0], %%o3 - rd %%fprs, %%o5 - andcc %%o5, %2, %%g0 - be,pt %%icc, 297f - sethi %%hi(%5), %%g1 - jmpl %%g1 + %%lo(%5), %%g7 - add %%g7, 8, %%g7 -297: wr %%g0, %4, %%fprs - membar #LoadStore|#StoreLoad|#StoreStore - sub %%g5, 64, %%g5 - ldda [%%o4] %3, %%f0 - ldda [%%o3] %3, %%f16 - cmp %%o0, 4 - bgeu,pt %%xcc, 10f - cmp %%o0, 3 - be,pn %%xcc, 13f - mov -64, %%g1 - sub %%g5, 64, %%g5 - rd %%asi, %%g1 - wr %%g0, %3, %%asi - -2: ldda [%%o4 + 64] %%asi, %%f32 - fxor %%f0, %%f16, %%f16 - fxor %%f2, %%f18, %%f18 - fxor %%f4, %%f20, %%f20 - fxor %%f6, %%f22, %%f22 - fxor %%f8, %%f24, %%f24 - fxor %%f10, %%f26, %%f26 - fxor %%f12, %%f28, %%f28 - fxor %%f14, %%f30, %%f30 - stda %%f16, [%%o4] %3 - ldda [%%o3 + 64] %%asi, %%f48 - ldda [%%o4 + 128] %%asi, %%f0 - fxor %%f32, %%f48, %%f48 - fxor %%f34, %%f50, %%f50 - add %%o4, 128, %%o4 - fxor %%f36, %%f52, %%f52 - add %%o3, 128, %%o3 - fxor %%f38, %%f54, %%f54 - subcc %%g5, 128, %%g5 - fxor %%f40, %%f56, %%f56 - fxor %%f42, %%f58, %%f58 - fxor %%f44, %%f60, %%f60 - fxor %%f46, %%f62, %%f62 - stda %%f48, [%%o4 - 64] %%asi - bne,pt %%xcc, 2b - ldda [%%o3] %3, %%f16 - - ldda [%%o4 + 64] %%asi, %%f32 - fxor %%f0, %%f16, %%f16 - fxor %%f2, %%f18, %%f18 - fxor %%f4, %%f20, %%f20 - fxor %%f6, %%f22, %%f22 - fxor %%f8, %%f24, %%f24 - fxor %%f10, %%f26, %%f26 - fxor %%f12, %%f28, %%f28 - fxor %%f14, %%f30, %%f30 - stda %%f16, [%%o4] %3 - ldda [%%o3 + 64] %%asi, %%f48 - membar #Sync - fxor %%f32, %%f48, %%f48 - fxor %%f34, %%f50, %%f50 - fxor %%f36, %%f52, %%f52 - fxor %%f38, %%f54, %%f54 - fxor %%f40, %%f56, %%f56 - fxor %%f42, %%f58, %%f58 - fxor %%f44, %%f60, %%f60 - fxor %%f46, %%f62, %%f62 - stda %%f48, [%%o4 + 64] %%asi - membar #Sync|#StoreStore|#StoreLoad - wr %%g0, 0, %%fprs - retl - wr %%g1, %%g0, %%asi - -13: ldx [%%o1 + 16], %%o2 - ldx [%%o2 + %0], %%o2 - -3: ldda [%%o2] %3, %%f32 - fxor %%f0, %%f16, %%f48 - fxor %%f2, %%f18, %%f50 - add %%o4, 64, %%o4 - fxor %%f4, %%f20, %%f52 - fxor %%f6, %%f22, %%f54 - add %%o3, 64, %%o3 - fxor %%f8, %%f24, %%f56 - fxor %%f10, %%f26, %%f58 - fxor %%f12, %%f28, %%f60 - fxor %%f14, %%f30, %%f62 - ldda [%%o4] %3, %%f0 - fxor %%f48, %%f32, %%f48 - fxor %%f50, %%f34, %%f50 - fxor %%f52, %%f36, %%f52 - fxor %%f54, %%f38, %%f54 - add %%o2, 64, %%o2 - fxor %%f56, %%f40, %%f56 - fxor %%f58, %%f42, %%f58 - subcc %%g5, 64, %%g5 - fxor %%f60, %%f44, %%f60 - fxor %%f62, %%f46, %%f62 - stda %%f48, [%%o4 + %%g1] %3 - bne,pt %%xcc, 3b - ldda [%%o3] %3, %%f16 - - ldda [%%o2] %3, %%f32 - fxor %%f0, %%f16, %%f48 - fxor %%f2, %%f18, %%f50 - fxor %%f4, %%f20, %%f52 - fxor %%f6, %%f22, %%f54 - fxor %%f8, %%f24, %%f56 - fxor %%f10, %%f26, %%f58 - fxor %%f12, %%f28, %%f60 - fxor %%f14, %%f30, %%f62 - membar #Sync - fxor %%f48, %%f32, %%f48 - fxor %%f50, %%f34, %%f50 - fxor %%f52, %%f36, %%f52 - fxor %%f54, %%f38, %%f54 - fxor %%f56, %%f40, %%f56 - fxor %%f58, %%f42, %%f58 - fxor %%f60, %%f44, %%f60 - fxor %%f62, %%f46, %%f62 - stda %%f48, [%%o4] %3 - membar #Sync|#StoreStore|#StoreLoad - retl - wr %%g0, 0, %%fprs - -10: cmp %%o0, 5 - be,pt %%xcc, 15f - mov -64, %%g1 - -14: ldx [%%o1 + 16], %%o2 - ldx [%%o1 + 24], %%o0 - ldx [%%o2 + %0], %%o2 - ldx [%%o0 + %0], %%o0 - -4: ldda [%%o2] %3, %%f32 - fxor %%f0, %%f16, %%f16 - fxor %%f2, %%f18, %%f18 - add %%o4, 64, %%o4 - fxor %%f4, 
%%f20, %%f20 - fxor %%f6, %%f22, %%f22 - add %%o3, 64, %%o3 - fxor %%f8, %%f24, %%f24 - fxor %%f10, %%f26, %%f26 - fxor %%f12, %%f28, %%f28 - fxor %%f14, %%f30, %%f30 - ldda [%%o0] %3, %%f48 - fxor %%f16, %%f32, %%f32 - fxor %%f18, %%f34, %%f34 - fxor %%f20, %%f36, %%f36 - fxor %%f22, %%f38, %%f38 - add %%o2, 64, %%o2 - fxor %%f24, %%f40, %%f40 - fxor %%f26, %%f42, %%f42 - fxor %%f28, %%f44, %%f44 - fxor %%f30, %%f46, %%f46 - ldda [%%o4] %3, %%f0 - fxor %%f32, %%f48, %%f48 - fxor %%f34, %%f50, %%f50 - fxor %%f36, %%f52, %%f52 - add %%o0, 64, %%o0 - fxor %%f38, %%f54, %%f54 - fxor %%f40, %%f56, %%f56 - fxor %%f42, %%f58, %%f58 - subcc %%g5, 64, %%g5 - fxor %%f44, %%f60, %%f60 - fxor %%f46, %%f62, %%f62 - stda %%f48, [%%o4 + %%g1] %3 - bne,pt %%xcc, 4b - ldda [%%o3] %3, %%f16 - - ldda [%%o2] %3, %%f32 - fxor %%f0, %%f16, %%f16 - fxor %%f2, %%f18, %%f18 - fxor %%f4, %%f20, %%f20 - fxor %%f6, %%f22, %%f22 - fxor %%f8, %%f24, %%f24 - fxor %%f10, %%f26, %%f26 - fxor %%f12, %%f28, %%f28 - fxor %%f14, %%f30, %%f30 - ldda [%%o0] %3, %%f48 - fxor %%f16, %%f32, %%f32 - fxor %%f18, %%f34, %%f34 - fxor %%f20, %%f36, %%f36 - fxor %%f22, %%f38, %%f38 - fxor %%f24, %%f40, %%f40 - fxor %%f26, %%f42, %%f42 - fxor %%f28, %%f44, %%f44 - fxor %%f30, %%f46, %%f46 - membar #Sync - fxor %%f32, %%f48, %%f48 - fxor %%f34, %%f50, %%f50 - fxor %%f36, %%f52, %%f52 - fxor %%f38, %%f54, %%f54 - fxor %%f40, %%f56, %%f56 - fxor %%f42, %%f58, %%f58 - fxor %%f44, %%f60, %%f60 - fxor %%f46, %%f62, %%f62 - stda %%f48, [%%o4] %3 - membar #Sync|#StoreStore|#StoreLoad - retl - wr %%g0, 0, %%fprs - -15: ldx [%%o1 + 16], %%o2 - ldx [%%o1 + 24], %%o0 - ldx [%%o1 + 32], %%o1 - ldx [%%o2 + %0], %%o2 - ldx [%%o0 + %0], %%o0 - ldx [%%o1 + %0], %%o1 - -5: ldda [%%o2] %3, %%f32 - fxor %%f0, %%f16, %%f48 - fxor %%f2, %%f18, %%f50 - add %%o4, 64, %%o4 - fxor %%f4, %%f20, %%f52 - fxor %%f6, %%f22, %%f54 - add %%o3, 64, %%o3 - fxor %%f8, %%f24, %%f56 - fxor %%f10, %%f26, %%f58 - fxor %%f12, %%f28, %%f60 - fxor %%f14, %%f30, %%f62 - ldda [%%o0] %3, %%f16 - fxor %%f48, %%f32, %%f48 - fxor %%f50, %%f34, %%f50 - fxor %%f52, %%f36, %%f52 - fxor %%f54, %%f38, %%f54 - add %%o2, 64, %%o2 - fxor %%f56, %%f40, %%f56 - fxor %%f58, %%f42, %%f58 - fxor %%f60, %%f44, %%f60 - fxor %%f62, %%f46, %%f62 - ldda [%%o1] %3, %%f32 - fxor %%f48, %%f16, %%f48 - fxor %%f50, %%f18, %%f50 - add %%o0, 64, %%o0 - fxor %%f52, %%f20, %%f52 - fxor %%f54, %%f22, %%f54 - add %%o1, 64, %%o1 - fxor %%f56, %%f24, %%f56 - fxor %%f58, %%f26, %%f58 - fxor %%f60, %%f28, %%f60 - fxor %%f62, %%f30, %%f62 - ldda [%%o4] %3, %%f0 - fxor %%f48, %%f32, %%f48 - fxor %%f50, %%f34, %%f50 - fxor %%f52, %%f36, %%f52 - fxor %%f54, %%f38, %%f54 - fxor %%f56, %%f40, %%f56 - fxor %%f58, %%f42, %%f58 - subcc %%g5, 64, %%g5 - fxor %%f60, %%f44, %%f60 - fxor %%f62, %%f46, %%f62 - stda %%f48, [%%o4 + %%g1] %3 - bne,pt %%xcc, 5b - ldda [%%o3] %3, %%f16 - - ldda [%%o2] %3, %%f32 - fxor %%f0, %%f16, %%f48 - fxor %%f2, %%f18, %%f50 - fxor %%f4, %%f20, %%f52 - fxor %%f6, %%f22, %%f54 - fxor %%f8, %%f24, %%f56 - fxor %%f10, %%f26, %%f58 - fxor %%f12, %%f28, %%f60 - fxor %%f14, %%f30, %%f62 - ldda [%%o0] %3, %%f16 - fxor %%f48, %%f32, %%f48 - fxor %%f50, %%f34, %%f50 - fxor %%f52, %%f36, %%f52 - fxor %%f54, %%f38, %%f54 - fxor %%f56, %%f40, %%f56 - fxor %%f58, %%f42, %%f58 - fxor %%f60, %%f44, %%f60 - fxor %%f62, %%f46, %%f62 - ldda [%%o1] %3, %%f32 - fxor %%f48, %%f16, %%f48 - fxor %%f50, %%f18, %%f50 - fxor %%f52, %%f20, %%f52 - fxor %%f54, %%f22, %%f54 - fxor %%f56, %%f24, %%f56 - fxor %%f58, %%f26, %%f58 
- fxor %%f60, %%f28, %%f60 - fxor %%f62, %%f30, %%f62 - membar #Sync - fxor %%f48, %%f32, %%f48 - fxor %%f50, %%f34, %%f50 - fxor %%f52, %%f36, %%f52 - fxor %%f54, %%f38, %%f54 - fxor %%f56, %%f40, %%f56 - fxor %%f58, %%f42, %%f58 - fxor %%f60, %%f44, %%f60 - fxor %%f62, %%f46, %%f62 - stda %%f48, [%%o4] %3 - membar #Sync|#StoreStore|#StoreLoad - retl - wr %%g0, 0, %%fprs - " : : - "i" (&((struct buffer_head *)0)->b_data), - "i" (&((struct buffer_head *)0)->b_size), - "i" (FPRS_FEF|FPRS_DU), "i" (ASI_BLK_P), - "i" (FPRS_FEF), "i" (VISenter)); -} -#endif /* __sparc_v9__ */ - -#if defined(__sparc__) && !defined(__sparc_v9__) -/* - * High speed xor_block operation for RAID4/5 utilizing the - * ldd/std SPARC instructions. - * - * Copyright (C) 1999 Jakub Jelinek (jj@ultra.linux.cz) - * - */ - -XORBLOCK_TEMPLATE(SPARC) -{ - int size = bh_ptr[0]->b_size; - int lines = size / (sizeof (long)) / 8, i; - long *destp = (long *) bh_ptr[0]->b_data; - long *source1 = (long *) bh_ptr[1]->b_data; - long *source2, *source3, *source4; - - switch (count) { - case 2: - for (i = lines; i > 0; i--) { - __asm__ __volatile__(" - ldd [%0 + 0x00], %%g2 - ldd [%0 + 0x08], %%g4 - ldd [%0 + 0x10], %%o0 - ldd [%0 + 0x18], %%o2 - ldd [%1 + 0x00], %%o4 - ldd [%1 + 0x08], %%l0 - ldd [%1 + 0x10], %%l2 - ldd [%1 + 0x18], %%l4 - xor %%g2, %%o4, %%g2 - xor %%g3, %%o5, %%g3 - xor %%g4, %%l0, %%g4 - xor %%g5, %%l1, %%g5 - xor %%o0, %%l2, %%o0 - xor %%o1, %%l3, %%o1 - xor %%o2, %%l4, %%o2 - xor %%o3, %%l5, %%o3 - std %%g2, [%0 + 0x00] - std %%g4, [%0 + 0x08] - std %%o0, [%0 + 0x10] - std %%o2, [%0 + 0x18] - " : : "r" (destp), "r" (source1) : "g2", "g3", "g4", "g5", "o0", - "o1", "o2", "o3", "o4", "o5", "l0", "l1", "l2", "l3", "l4", "l5"); - destp += 8; - source1 += 8; - } - break; - case 3: - source2 = (long *) bh_ptr[2]->b_data; - for (i = lines; i > 0; i--) { - __asm__ __volatile__(" - ldd [%0 + 0x00], %%g2 - ldd [%0 + 0x08], %%g4 - ldd [%0 + 0x10], %%o0 - ldd [%0 + 0x18], %%o2 - ldd [%1 + 0x00], %%o4 - ldd [%1 + 0x08], %%l0 - ldd [%1 + 0x10], %%l2 - ldd [%1 + 0x18], %%l4 - xor %%g2, %%o4, %%g2 - xor %%g3, %%o5, %%g3 - ldd [%2 + 0x00], %%o4 - xor %%g4, %%l0, %%g4 - xor %%g5, %%l1, %%g5 - ldd [%2 + 0x08], %%l0 - xor %%o0, %%l2, %%o0 - xor %%o1, %%l3, %%o1 - ldd [%2 + 0x10], %%l2 - xor %%o2, %%l4, %%o2 - xor %%o3, %%l5, %%o3 - ldd [%2 + 0x18], %%l4 - xor %%g2, %%o4, %%g2 - xor %%g3, %%o5, %%g3 - xor %%g4, %%l0, %%g4 - xor %%g5, %%l1, %%g5 - xor %%o0, %%l2, %%o0 - xor %%o1, %%l3, %%o1 - xor %%o2, %%l4, %%o2 - xor %%o3, %%l5, %%o3 - std %%g2, [%0 + 0x00] - std %%g4, [%0 + 0x08] - std %%o0, [%0 + 0x10] - std %%o2, [%0 + 0x18] - " : : "r" (destp), "r" (source1), "r" (source2) - : "g2", "g3", "g4", "g5", "o0", "o1", "o2", "o3", "o4", "o5", - "l0", "l1", "l2", "l3", "l4", "l5"); - destp += 8; - source1 += 8; - source2 += 8; - } - break; - case 4: - source2 = (long *) bh_ptr[2]->b_data; - source3 = (long *) bh_ptr[3]->b_data; - for (i = lines; i > 0; i--) { - __asm__ __volatile__(" - ldd [%0 + 0x00], %%g2 - ldd [%0 + 0x08], %%g4 - ldd [%0 + 0x10], %%o0 - ldd [%0 + 0x18], %%o2 - ldd [%1 + 0x00], %%o4 - ldd [%1 + 0x08], %%l0 - ldd [%1 + 0x10], %%l2 - ldd [%1 + 0x18], %%l4 - xor %%g2, %%o4, %%g2 - xor %%g3, %%o5, %%g3 - ldd [%2 + 0x00], %%o4 - xor %%g4, %%l0, %%g4 - xor %%g5, %%l1, %%g5 - ldd [%2 + 0x08], %%l0 - xor %%o0, %%l2, %%o0 - xor %%o1, %%l3, %%o1 - ldd [%2 + 0x10], %%l2 - xor %%o2, %%l4, %%o2 - xor %%o3, %%l5, %%o3 - ldd [%2 + 0x18], %%l4 - xor %%g2, %%o4, %%g2 - xor %%g3, %%o5, %%g3 - ldd [%3 + 0x00], %%o4 - xor %%g4, %%l0, %%g4 
- xor %%g5, %%l1, %%g5 - ldd [%3 + 0x08], %%l0 - xor %%o0, %%l2, %%o0 - xor %%o1, %%l3, %%o1 - ldd [%3 + 0x10], %%l2 - xor %%o2, %%l4, %%o2 - xor %%o3, %%l5, %%o3 - ldd [%3 + 0x18], %%l4 - xor %%g2, %%o4, %%g2 - xor %%g3, %%o5, %%g3 - xor %%g4, %%l0, %%g4 - xor %%g5, %%l1, %%g5 - xor %%o0, %%l2, %%o0 - xor %%o1, %%l3, %%o1 - xor %%o2, %%l4, %%o2 - xor %%o3, %%l5, %%o3 - std %%g2, [%0 + 0x00] - std %%g4, [%0 + 0x08] - std %%o0, [%0 + 0x10] - std %%o2, [%0 + 0x18] - " : : "r" (destp), "r" (source1), "r" (source2), "r" (source3) - : "g2", "g3", "g4", "g5", "o0", "o1", "o2", "o3", "o4", "o5", - "l0", "l1", "l2", "l3", "l4", "l5"); - destp += 8; - source1 += 8; - source2 += 8; - source3 += 8; - } - break; - case 5: - source2 = (long *) bh_ptr[2]->b_data; - source3 = (long *) bh_ptr[3]->b_data; - source4 = (long *) bh_ptr[4]->b_data; - for (i = lines; i > 0; i--) { - __asm__ __volatile__(" - ldd [%0 + 0x00], %%g2 - ldd [%0 + 0x08], %%g4 - ldd [%0 + 0x10], %%o0 - ldd [%0 + 0x18], %%o2 - ldd [%1 + 0x00], %%o4 - ldd [%1 + 0x08], %%l0 - ldd [%1 + 0x10], %%l2 - ldd [%1 + 0x18], %%l4 - xor %%g2, %%o4, %%g2 - xor %%g3, %%o5, %%g3 - ldd [%2 + 0x00], %%o4 - xor %%g4, %%l0, %%g4 - xor %%g5, %%l1, %%g5 - ldd [%2 + 0x08], %%l0 - xor %%o0, %%l2, %%o0 - xor %%o1, %%l3, %%o1 - ldd [%2 + 0x10], %%l2 - xor %%o2, %%l4, %%o2 - xor %%o3, %%l5, %%o3 - ldd [%2 + 0x18], %%l4 - xor %%g2, %%o4, %%g2 - xor %%g3, %%o5, %%g3 - ldd [%3 + 0x00], %%o4 - xor %%g4, %%l0, %%g4 - xor %%g5, %%l1, %%g5 - ldd [%3 + 0x08], %%l0 - xor %%o0, %%l2, %%o0 - xor %%o1, %%l3, %%o1 - ldd [%3 + 0x10], %%l2 - xor %%o2, %%l4, %%o2 - xor %%o3, %%l5, %%o3 - ldd [%3 + 0x18], %%l4 - xor %%g2, %%o4, %%g2 - xor %%g3, %%o5, %%g3 - ldd [%4 + 0x00], %%o4 - xor %%g4, %%l0, %%g4 - xor %%g5, %%l1, %%g5 - ldd [%4 + 0x08], %%l0 - xor %%o0, %%l2, %%o0 - xor %%o1, %%l3, %%o1 - ldd [%4 + 0x10], %%l2 - xor %%o2, %%l4, %%o2 - xor %%o3, %%l5, %%o3 - ldd [%4 + 0x18], %%l4 - xor %%g2, %%o4, %%g2 - xor %%g3, %%o5, %%g3 - xor %%g4, %%l0, %%g4 - xor %%g5, %%l1, %%g5 - xor %%o0, %%l2, %%o0 - xor %%o1, %%l3, %%o1 - xor %%o2, %%l4, %%o2 - xor %%o3, %%l5, %%o3 - std %%g2, [%0 + 0x00] - std %%g4, [%0 + 0x08] - std %%o0, [%0 + 0x10] - std %%o2, [%0 + 0x18] - " : : "r" (destp), "r" (source1), "r" (source2), "r" (source3), "r" (source4) - : "g2", "g3", "g4", "g5", "o0", "o1", "o2", "o3", "o4", "o5", - "l0", "l1", "l2", "l3", "l4", "l5"); - destp += 8; - source1 += 8; - source2 += 8; - source3 += 8; - source4 += 8; - } - break; - } -} -#endif /* __sparc_v[78]__ */ - -#ifndef __sparc_v9__ - -/* - * this one works reasonably on any x86 CPU - * (send me an assembly version for inclusion if you can make it faster) - * - * this one is just as fast as written in pure assembly on x86. - * the reason for this separate version is that the - * fast open-coded xor routine "32reg" produces suboptimal code - * on x86, due to lack of registers. 
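Before the plain-C routines, it helps to expand the XORBLOCK_TEMPLATE() machinery once by hand. Following the macro defined at the top of this file, XORBLOCK_TEMPLATE(8regs) produces a forward declaration, a template record for the runtime benchmark list, and the function header:

/* Hand expansion of XORBLOCK_TEMPLATE(8regs), per the macro above: */
static void xor_block_8regs (unsigned int count, struct buffer_head **bh_ptr);
static struct xor_block_template t_xor_block_8regs =
	{ "8regs", xor_block_8regs, 0, NULL };
static void xor_block_8regs (unsigned int count, struct buffer_head **bh_ptr)
/* ...followed immediately by the function body in braces. */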
- */ -XORBLOCK_TEMPLATE(8regs) -{ - int len = bh_ptr[0]->b_size; - long *destp = (long *) bh_ptr[0]->b_data; - long *source1, *source2, *source3, *source4; - long lines = len / (sizeof (long)) / 8, i; - - switch(count) { - case 2: - source1 = (long *) bh_ptr[1]->b_data; - for (i = lines; i > 0; i--) { - *(destp + 0) ^= *(source1 + 0); - *(destp + 1) ^= *(source1 + 1); - *(destp + 2) ^= *(source1 + 2); - *(destp + 3) ^= *(source1 + 3); - *(destp + 4) ^= *(source1 + 4); - *(destp + 5) ^= *(source1 + 5); - *(destp + 6) ^= *(source1 + 6); - *(destp + 7) ^= *(source1 + 7); - source1 += 8; - destp += 8; - } - break; - case 3: - source2 = (long *) bh_ptr[2]->b_data; - source1 = (long *) bh_ptr[1]->b_data; - for (i = lines; i > 0; i--) { - *(destp + 0) ^= *(source1 + 0); - *(destp + 0) ^= *(source2 + 0); - *(destp + 1) ^= *(source1 + 1); - *(destp + 1) ^= *(source2 + 1); - *(destp + 2) ^= *(source1 + 2); - *(destp + 2) ^= *(source2 + 2); - *(destp + 3) ^= *(source1 + 3); - *(destp + 3) ^= *(source2 + 3); - *(destp + 4) ^= *(source1 + 4); - *(destp + 4) ^= *(source2 + 4); - *(destp + 5) ^= *(source1 + 5); - *(destp + 5) ^= *(source2 + 5); - *(destp + 6) ^= *(source1 + 6); - *(destp + 6) ^= *(source2 + 6); - *(destp + 7) ^= *(source1 + 7); - *(destp + 7) ^= *(source2 + 7); - source1 += 8; - source2 += 8; - destp += 8; - } - break; - case 4: - source3 = (long *) bh_ptr[3]->b_data; - source2 = (long *) bh_ptr[2]->b_data; - source1 = (long *) bh_ptr[1]->b_data; - for (i = lines; i > 0; i--) { - *(destp + 0) ^= *(source1 + 0); - *(destp + 0) ^= *(source2 + 0); - *(destp + 0) ^= *(source3 + 0); - *(destp + 1) ^= *(source1 + 1); - *(destp + 1) ^= *(source2 + 1); - *(destp + 1) ^= *(source3 + 1); - *(destp + 2) ^= *(source1 + 2); - *(destp + 2) ^= *(source2 + 2); - *(destp + 2) ^= *(source3 + 2); - *(destp + 3) ^= *(source1 + 3); - *(destp + 3) ^= *(source2 + 3); - *(destp + 3) ^= *(source3 + 3); - *(destp + 4) ^= *(source1 + 4); - *(destp + 4) ^= *(source2 + 4); - *(destp + 4) ^= *(source3 + 4); - *(destp + 5) ^= *(source1 + 5); - *(destp + 5) ^= *(source2 + 5); - *(destp + 5) ^= *(source3 + 5); - *(destp + 6) ^= *(source1 + 6); - *(destp + 6) ^= *(source2 + 6); - *(destp + 6) ^= *(source3 + 6); - *(destp + 7) ^= *(source1 + 7); - *(destp + 7) ^= *(source2 + 7); - *(destp + 7) ^= *(source3 + 7); - source1 += 8; - source2 += 8; - source3 += 8; - destp += 8; - } - break; - case 5: - source4 = (long *) bh_ptr[4]->b_data; - source3 = (long *) bh_ptr[3]->b_data; - source2 = (long *) bh_ptr[2]->b_data; - source1 = (long *) bh_ptr[1]->b_data; - for (i = lines; i > 0; i--) { - *(destp + 0) ^= *(source1 + 0); - *(destp + 0) ^= *(source2 + 0); - *(destp + 0) ^= *(source3 + 0); - *(destp + 0) ^= *(source4 + 0); - *(destp + 1) ^= *(source1 + 1); - *(destp + 1) ^= *(source2 + 1); - *(destp + 1) ^= *(source3 + 1); - *(destp + 1) ^= *(source4 + 1); - *(destp + 2) ^= *(source1 + 2); - *(destp + 2) ^= *(source2 + 2); - *(destp + 2) ^= *(source3 + 2); - *(destp + 2) ^= *(source4 + 2); - *(destp + 3) ^= *(source1 + 3); - *(destp + 3) ^= *(source2 + 3); - *(destp + 3) ^= *(source3 + 3); - *(destp + 3) ^= *(source4 + 3); - *(destp + 4) ^= *(source1 + 4); - *(destp + 4) ^= *(source2 + 4); - *(destp + 4) ^= *(source3 + 4); - *(destp + 4) ^= *(source4 + 4); - *(destp + 5) ^= *(source1 + 5); - *(destp + 5) ^= *(source2 + 5); - *(destp + 5) ^= *(source3 + 5); - *(destp + 5) ^= *(source4 + 5); - *(destp + 6) ^= *(source1 + 6); - *(destp + 6) ^= *(source2 + 6); - *(destp + 6) ^= *(source3 + 6); - *(destp + 6) ^= *(source4 + 6); - 
-
-/*
- * platform-independent RAID5 checksum calculation; this should
- * be very fast on any platform that has a decent number of
- * registers (32 or more).
- */
-XORBLOCK_TEMPLATE(32regs)
-{
-	int size = bh_ptr[0]->b_size;
-	int lines = size / (sizeof (long)) / 8, i;
-	long *destp = (long *) bh_ptr[0]->b_data;
-	long *source1, *source2, *source3, *source4;
-
-	/* LOTS of registers available...
-	   We do explicit loop-unrolling here for code which
-	   favours RISC machines. In fact this is almost direct
-	   RISC assembly on Alpha and SPARC :-) */
-
-	switch(count) {
-	case 2:
-		source1 = (long *) bh_ptr[1]->b_data;
-		for (i = lines; i > 0; i--) {
-			register long d0, d1, d2, d3, d4, d5, d6, d7;
-			d0 = destp[0];	/* Pull the stuff into registers */
-			d1 = destp[1];	/* ... in bursts, if possible.   */
-			d2 = destp[2];
-			d3 = destp[3];
-			d4 = destp[4];
-			d5 = destp[5];
-			d6 = destp[6];
-			d7 = destp[7];
-			d0 ^= source1[0];
-			d1 ^= source1[1];
-			d2 ^= source1[2];
-			d3 ^= source1[3];
-			d4 ^= source1[4];
-			d5 ^= source1[5];
-			d6 ^= source1[6];
-			d7 ^= source1[7];
-			destp[0] = d0;	/* Store the result (in bursts) */
-			destp[1] = d1;
-			destp[2] = d2;
-			destp[3] = d3;
-			destp[4] = d4;	/* Store the result (in bursts) */
-			destp[5] = d5;
-			destp[6] = d6;
-			destp[7] = d7;
-			source1 += 8;
-			destp += 8;
-		}
-		break;
-	case 3:
-		source2 = (long *) bh_ptr[2]->b_data;
-		source1 = (long *) bh_ptr[1]->b_data;
-		for (i = lines; i > 0; i--) {
-			register long d0, d1, d2, d3, d4, d5, d6, d7;
-			d0 = destp[0];	/* Pull the stuff into registers */
-			d1 = destp[1];	/* ... in bursts, if possible.   */
-			d2 = destp[2];
-			d3 = destp[3];
-			d4 = destp[4];
-			d5 = destp[5];
-			d6 = destp[6];
-			d7 = destp[7];
-			d0 ^= source1[0];
-			d1 ^= source1[1];
-			d2 ^= source1[2];
-			d3 ^= source1[3];
-			d4 ^= source1[4];
-			d5 ^= source1[5];
-			d6 ^= source1[6];
-			d7 ^= source1[7];
-			d0 ^= source2[0];
-			d1 ^= source2[1];
-			d2 ^= source2[2];
-			d3 ^= source2[3];
-			d4 ^= source2[4];
-			d5 ^= source2[5];
-			d6 ^= source2[6];
-			d7 ^= source2[7];
-			destp[0] = d0;	/* Store the result (in bursts) */
-			destp[1] = d1;
-			destp[2] = d2;
-			destp[3] = d3;
-			destp[4] = d4;	/* Store the result (in bursts) */
-			destp[5] = d5;
-			destp[6] = d6;
-			destp[7] = d7;
-			source1 += 8;
-			source2 += 8;
-			destp += 8;
-		}
-		break;
-	case 4:
-		source3 = (long *) bh_ptr[3]->b_data;
-		source2 = (long *) bh_ptr[2]->b_data;
-		source1 = (long *) bh_ptr[1]->b_data;
-		for (i = lines; i > 0; i--) {
-			register long d0, d1, d2, d3, d4, d5, d6, d7;
-			d0 = destp[0];	/* Pull the stuff into registers */
-			d1 = destp[1];	/* ... in bursts, if possible.   */
-			d2 = destp[2];
-			d3 = destp[3];
-			d4 = destp[4];
-			d5 = destp[5];
-			d6 = destp[6];
-			d7 = destp[7];
-			d0 ^= source1[0];
-			d1 ^= source1[1];
-			d2 ^= source1[2];
-			d3 ^= source1[3];
-			d4 ^= source1[4];
-			d5 ^= source1[5];
-			d6 ^= source1[6];
-			d7 ^= source1[7];
-			d0 ^= source2[0];
-			d1 ^= source2[1];
-			d2 ^= source2[2];
-			d3 ^= source2[3];
-			d4 ^= source2[4];
-			d5 ^= source2[5];
-			d6 ^= source2[6];
-			d7 ^= source2[7];
-			d0 ^= source3[0];
-			d1 ^= source3[1];
-			d2 ^= source3[2];
-			d3 ^= source3[3];
-			d4 ^= source3[4];
-			d5 ^= source3[5];
-			d6 ^= source3[6];
-			d7 ^= source3[7];
-			destp[0] = d0;	/* Store the result (in bursts) */
-			destp[1] = d1;
-			destp[2] = d2;
-			destp[3] = d3;
-			destp[4] = d4;	/* Store the result (in bursts) */
-			destp[5] = d5;
-			destp[6] = d6;
-			destp[7] = d7;
-			source1 += 8;
-			source2 += 8;
-			source3 += 8;
-			destp += 8;
-		}
-		break;
-	case 5:
-		source4 = (long *) bh_ptr[4]->b_data;
-		source3 = (long *) bh_ptr[3]->b_data;
-		source2 = (long *) bh_ptr[2]->b_data;
-		source1 = (long *) bh_ptr[1]->b_data;
-		for (i = lines; i > 0; i--) {
-			register long d0, d1, d2, d3, d4, d5, d6, d7;
-			d0 = destp[0];	/* Pull the stuff into registers */
-			d1 = destp[1];	/* ... in bursts, if possible.   */
-			d2 = destp[2];
-			d3 = destp[3];
-			d4 = destp[4];
-			d5 = destp[5];
-			d6 = destp[6];
-			d7 = destp[7];
-			d0 ^= source1[0];
-			d1 ^= source1[1];
-			d2 ^= source1[2];
-			d3 ^= source1[3];
-			d4 ^= source1[4];
-			d5 ^= source1[5];
-			d6 ^= source1[6];
-			d7 ^= source1[7];
-			d0 ^= source2[0];
-			d1 ^= source2[1];
-			d2 ^= source2[2];
-			d3 ^= source2[3];
-			d4 ^= source2[4];
-			d5 ^= source2[5];
-			d6 ^= source2[6];
-			d7 ^= source2[7];
-			d0 ^= source3[0];
-			d1 ^= source3[1];
-			d2 ^= source3[2];
-			d3 ^= source3[3];
-			d4 ^= source3[4];
-			d5 ^= source3[5];
-			d6 ^= source3[6];
-			d7 ^= source3[7];
-			d0 ^= source4[0];
-			d1 ^= source4[1];
-			d2 ^= source4[2];
-			d3 ^= source4[3];
-			d4 ^= source4[4];
-			d5 ^= source4[5];
-			d6 ^= source4[6];
-			d7 ^= source4[7];
-			destp[0] = d0;	/* Store the result (in bursts) */
-			destp[1] = d1;
-			destp[2] = d2;
-			destp[3] = d3;
-			destp[4] = d4;	/* Store the result (in bursts) */
-			destp[5] = d5;
-			destp[6] = d6;
-			destp[7] = d7;
-			source1 += 8;
-			source2 += 8;
-			source3 += 8;
-			source4 += 8;
-			destp += 8;
-		}
-		break;
-	}
-}
-
-/*
- * (the -6*32 shift factor colors the cache)
- */
-#define SIZE (PAGE_SIZE-6*32)
-
-static void xor_speed ( struct xor_block_template * func,
-			struct buffer_head *b1, struct buffer_head *b2)
-{
-	int speed;
-	unsigned long now;
-	int i, count, max;
-	struct buffer_head *bh_ptr[6];
-
-	func->next = xor_functions;
-	xor_functions = func;
-	bh_ptr[0] = b1;
-	bh_ptr[1] = b2;
-
-	/*
-	 * Count the number of XORs done during a whole jiffy and
-	 * calculate the checksumming speed from that.  (We use an
-	 * order-2 page allocation so the two buffers land with a
-	 * fixed, known L1-cache color layout.)
-	 */
-	max = 0;
-	for (i = 0; i < 5; i++) {
-		now = jiffies;
-		count = 0;
-		while (jiffies == now) {
-			mb();
-			func->xor_block(2,bh_ptr);
-			mb();
-			count++;
-			mb();
-		}
-		if (count > max)
-			max = count;
-	}
-
-	speed = max * (HZ*SIZE/1024);
-	func->speed = speed;
-
-	printk( "   %-10s: %5d.%03d MB/sec\n", func->name,
-		speed / 1000, speed % 1000);
-}
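Editorial note: the mixed constants in the speed formula above are easy to misread, so here is the unit check. max counts xor_block() calls completed in one jiffy, each call checksums SIZE bytes, so max*HZ*SIZE is bytes per second; dividing by 1024 yields KB/s, which the printk then splits on 1000 to display decimal MB/s. A worked instance (editorial; assumes PAGE_SIZE 4096 and HZ 100, and the value of max is made up):

/* Worked example of the speed computation in xor_speed(), as a
 * standalone user-space program.  All values are hypothetical. */
#include <stdio.h>

int main(void)
{
	int SIZE = 4096 - 6*32;		/* 3904 bytes per call          */
	int HZ = 100, max = 2625;	/* 2625 calls in one jiffy      */
	int speed = max * (HZ*SIZE/1024); /* 2625 * 381 = 1000125 KB/s  */
	/* same format string as the driver's printk: */
	printf("%5d.%03d MB/sec\n", speed / 1000, speed % 1000);
	return 0;			/* prints " 1000.125 MB/sec"    */
}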
-
-static inline void pick_fastest_function(void)
-{
-	struct xor_block_template *f, *fastest;
-
-	fastest = xor_functions;
-	for (f = fastest; f; f = f->next) {
-		if (f->speed > fastest->speed)
-			fastest = f;
-	}
-#ifdef CONFIG_X86_XMM
-	if (cpu_has_xmm) {
-		fastest = &t_xor_block_pIII_kni;
-	}
-#endif
-	xor_block = fastest->xor_block;
-	printk( "using fastest function: %s (%d.%03d MB/sec)\n", fastest->name,
-		fastest->speed / 1000, fastest->speed % 1000);
-}
-
-static struct buffer_head b1, b2;
-
-void calibrate_xor_block(void)
-{
-	if (xor_block)
-		return;
-	memset(&b1,0,sizeof(b1));
-	b2 = b1;
-
-	b1.b_data = (char *) md__get_free_pages(GFP_KERNEL,2);
-	if (!b1.b_data) {
-		pick_fastest_function();
-		return;
-	}
-	b2.b_data = b1.b_data + 2*PAGE_SIZE + SIZE;
-
-	b1.b_size = SIZE;
-
-	printk(KERN_INFO "raid5: measuring checksumming speed\n");
-
-	sti(); /* should be safe */
-
-#if defined(__sparc__) && !defined(__sparc_v9__)
-	printk(KERN_INFO "raid5: trying high-speed SPARC checksum routine\n");
-	xor_speed(&t_xor_block_SPARC,&b1,&b2);
-#endif
-
-#ifdef CONFIG_X86_XMM
-	if (cpu_has_xmm) {
-		printk(KERN_INFO
-			"raid5: KNI detected, trying cache-avoiding KNI checksum routine\n");
-		/* we force the use of the KNI xor block because it
-		   can write around l2.  we may also be able
-		   to load into the l1 only depending on how
-		   the cpu deals with a load to a line that is
-		   being prefetched.
-		*/
-		xor_speed(&t_xor_block_pIII_kni,&b1,&b2);
-	}
-#endif /* CONFIG_X86_XMM */
-
-#ifdef __i386__
-
-	if (md_cpu_has_mmx()) {
-		printk(KERN_INFO
-			"raid5: MMX detected, trying high-speed MMX checksum routines\n");
-		xor_speed(&t_xor_block_pII_mmx,&b1,&b2);
-		xor_speed(&t_xor_block_p5_mmx,&b1,&b2);
-	}
-
-#endif /* __i386__ */
-
-	xor_speed(&t_xor_block_8regs,&b1,&b2);
-	xor_speed(&t_xor_block_32regs,&b1,&b2);
-
-	free_pages((unsigned long)b1.b_data,2);
-	pick_fastest_function();
-}
-
-#else /* __sparc_v9__ */
-
-void calibrate_xor_block(void)
-{
-	if (xor_block)
-		return;
-	printk(KERN_INFO "raid5: using high-speed VIS checksum routine\n");
-	xor_block = xor_block_VIS;
-}
-
-#endif /* __sparc_v9__ */
-
-MD_EXPORT_SYMBOL(xor_block);
-MD_EXPORT_SYMBOL(calibrate_xor_block);
-
-#ifdef MODULE
-int init_module(void)
-{
-	calibrate_xor_block();
-	return 0;
-}
-#endif
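Editorial note: the overall calibrate-then-dispatch pattern this file implements — time each candidate routine, remember the fastest, and route all later calls through a function pointer — lifts cleanly out of the kernel context. A minimal user-space sketch of the same idea (editorial; names and timings are illustrative, and clock_gettime() stands in for the jiffy counting in xor_speed()):

#include <stdio.h>
#include <time.h>

typedef void (*xor_fn)(long *dst, const long *src, long words);

static void xor_simple(long *dst, const long *src, long words)
{
	for (long i = 0; i < words; i++)
		dst[i] ^= src[i];
}

static void xor_unrolled(long *dst, const long *src, long words)
{
	/* assumes words is a multiple of 4 */
	for (long i = 0; i < words; i += 4) {
		dst[i]     ^= src[i];
		dst[i + 1] ^= src[i + 1];
		dst[i + 2] ^= src[i + 2];
		dst[i + 3] ^= src[i + 3];
	}
}

/* Run fn repeatedly for ~10 ms and return iterations completed,
 * mirroring the count-per-jiffy loop in xor_speed(). */
static long bench(xor_fn fn, long *dst, const long *src, long words)
{
	struct timespec t0, t1;
	long iters = 0;
	clock_gettime(CLOCK_MONOTONIC, &t0);
	do {
		fn(dst, src, words);
		iters++;
		clock_gettime(CLOCK_MONOTONIC, &t1);
	} while ((t1.tv_sec - t0.tv_sec) * 1000000000L
		 + (t1.tv_nsec - t0.tv_nsec) < 10000000L);
	return iters;
}

int main(void)
{
	static long a[4096], b[4096];
	xor_fn candidates[] = { xor_simple, xor_unrolled };
	const char *names[] = { "simple", "unrolled" };
	xor_fn best = candidates[0];
	long best_iters = 0;

	for (int i = 0; i < 2; i++) {
		long n = bench(candidates[i], a, b, 4096);
		printf("%-8s: %ld iters/10ms\n", names[i], n);
		if (n > best_iters) {
			best_iters = n;
			best = candidates[i];
		}
	}
	best(a, b, 4096);	/* all later calls go through the winner */
	return 0;
}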