Diffstat (limited to 'drivers/block')
-rw-r--r--  drivers/block/Config.in    |    6
-rw-r--r--  drivers/block/Makefile     |   24
-rw-r--r--  drivers/block/acsi_slm.c   |   16
-rw-r--r--  drivers/block/blkpg.c      |   14
-rw-r--r--  drivers/block/cs5530.c     |  420
-rw-r--r--  drivers/block/floppy.c     |  117
-rw-r--r--  drivers/block/hpt366.c     |    7
-rw-r--r--  drivers/block/ide-floppy.c |    7
-rw-r--r--  drivers/block/ide-pci.c    |   21
-rw-r--r--  drivers/block/ide-pmac.c   |  622
-rw-r--r--  drivers/block/ide-probe.c  |   10
-rw-r--r--  drivers/block/ide-tape.c   |   25
-rw-r--r--  drivers/block/ide.c        |    2
-rw-r--r--  drivers/block/linear.c     |  283
-rw-r--r--  drivers/block/linear.h     |   16
-rw-r--r--  drivers/block/ll_rw_blk.c  |  291
-rw-r--r--  drivers/block/loop.c       |   10
-rw-r--r--  drivers/block/md.c         | 3793
-rw-r--r--  drivers/block/paride/pg.c  |   16
-rw-r--r--  drivers/block/paride/pt.c  |   17
-rw-r--r--  drivers/block/raid0.c      |  515
-rw-r--r--  drivers/block/rd.c         |   22
22 files changed, 4678 insertions, 1576 deletions
diff --git a/drivers/block/Config.in b/drivers/block/Config.in index 5634cc488..79bd8078e 100644 --- a/drivers/block/Config.in +++ b/drivers/block/Config.in @@ -79,8 +79,8 @@ else fi bool ' HPT366 chipset support' CONFIG_BLK_DEV_HPT366 if [ "$CONFIG_IDEDMA_PCI_EXPERIMENTAL" = "y" -a "$CONFIG_BLK_DEV_HPT366" = "y" ]; then - bool ' HPT366 Fast Interrupt support (EXPERIMENTAL) (WIP)' HPT366_FAST_IRQ_PREDICTION - bool ' HPT366 mode three unsupported (EXPERIMENTAL) (WIP)' HPT366_MODE3 + bool ' HPT366 Fast Interrupt support (EXPERIMENTAL) (WIP)' CONFIG_HPT366_FAST_IRQ_PREDICTION + bool ' HPT366 mode three unsupported (EXPERIMENTAL) (WIP)' CONFIG_HPT366_MODE3 fi if [ "$CONFIG_X86" = "y" ]; then bool ' Intel PIIXn chipsets support' CONFIG_BLK_DEV_PIIX @@ -106,6 +106,7 @@ else if [ "$CONFIG_X86" = "y" ]; then bool ' SiS5513 chipset support' CONFIG_BLK_DEV_SIS5513 fi + bool ' Cyrix CS5530 MediaGX chipset support' CONFIG_BLK_DEV_CS5530 fi if [ "$CONFIG_IDEDMA_PCI_EXPERIMENTAL" = "y" ]; then bool ' Tekram TRM290 chipset support (EXPERIMENTAL)' CONFIG_BLK_DEV_TRM290 @@ -239,6 +240,7 @@ if [ "$CONFIG_IDE_CHIPSETS" = "y" -o \ "$CONFIG_BLK_DEV_PDC202XX" = "y" -o \ "$CONFIG_BLK_DEV_PIIX" = "y" -o \ "$CONFIG_BLK_DEV_SIS5513" = "y" -o \ + "$CONFIG_BLK_DEV_CS5530" = "y" -o \ "$CONFIG_BLK_DEV_SL82C105" = "y" ]; then define_bool CONFIG_BLK_DEV_IDE_MODES y else diff --git a/drivers/block/Makefile b/drivers/block/Makefile index dca5831d7..3f9c5f85b 100644 --- a/drivers/block/Makefile +++ b/drivers/block/Makefile @@ -198,6 +198,10 @@ ifeq ($(CONFIG_BLK_DEV_PDC202XX),y) IDE_OBJS += pdc202xx.o endif +ifeq ($(CONFIG_BLK_DEV_CS5530),y) +IDE_OBJS += cs5530.o +endif + ifeq ($(CONFIG_BLK_DEV_PDC4030),y) IDE_OBJS += pdc4030.o endif @@ -350,13 +354,33 @@ else endif ifeq ($(CONFIG_MD_RAID5),y) +LX_OBJS += xor.o +CFLAGS_xor.o := $(PROFILING) -fomit-frame-pointer L_OBJS += raid5.o else ifeq ($(CONFIG_MD_RAID5),m) + LX_OBJS += xor.o + CFLAGS_xor.o := $(PROFILING) -fomit-frame-pointer M_OBJS += raid5.o endif endif +ifeq ($(CONFIG_MD_TRANSLUCENT),y) +L_OBJS += translucent.o +else + ifeq ($(CONFIG_MD_TRANSLUCENT),m) + M_OBJS += translucent.o + endif +endif + +ifeq ($(CONFIG_MD_HSM),y) +L_OBJS += hsm.o +else + ifeq ($(CONFIG_MD_HSM),m) + M_OBJS += hsm.o + endif +endif + endif ifeq ($(CONFIG_BLK_DEV_NBD),y) diff --git a/drivers/block/acsi_slm.c b/drivers/block/acsi_slm.c index d718075e2..e4d343be3 100644 --- a/drivers/block/acsi_slm.c +++ b/drivers/block/acsi_slm.c @@ -271,17 +271,11 @@ static int slm_get_pagesize( int device, int *w, int *h ); static struct timer_list slm_timer = { NULL, NULL, 0, 0, slm_test_ready }; static struct file_operations slm_fops = { - NULL, /* lseek - default */ - slm_read, /* read - status reading */ - slm_write, /* write - printing data write */ - NULL, /* readdir - bad */ - NULL, /* poll */ - slm_ioctl, /* ioctl */ - NULL, /* mmap */ - slm_open, /* open */ - NULL, /* flush */ - slm_release, /* release */ - NULL /* fsync */ + read: slm_read, + write: slm_write, + ioctl: slm_ioctl, + open: slm_open, + release: slm_release, }; diff --git a/drivers/block/blkpg.c b/drivers/block/blkpg.c index 6f5674072..591ff9310 100644 --- a/drivers/block/blkpg.c +++ b/drivers/block/blkpg.c @@ -65,20 +65,6 @@ static struct gendisk *get_gendisk(kdev_t dev) { return g; } -/* moved here from md.c - will be discarded later */ -char *partition_name (kdev_t dev) { - static char name[40]; /* kdevname returns 32 bytes */ - /* disk_name requires 32 bytes */ - struct gendisk *hd = get_gendisk (dev); - - if (!hd) { - sprintf (name, 
"[dev %s]", kdevname(dev)); - return (name); - } - - return disk_name (hd, MINOR(dev), name); /* routine in genhd.c */ -} - /* * Add a partition. * diff --git a/drivers/block/cs5530.c b/drivers/block/cs5530.c new file mode 100644 index 000000000..cf8b5fdda --- /dev/null +++ b/drivers/block/cs5530.c @@ -0,0 +1,420 @@ +/* + * linux/drivers/block/cs5530.c Version 0.2 Jan 30, 2000 + * + * Copyright (C) 2000 Mark Lord <mlord@pobox.com> + * May be copied or modified under the terms of the GNU General Public License + * + * Development of this chipset driver was funded + * by the nice folks at National Semiconductor. + */ + +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/delay.h> +#include <linux/timer.h> +#include <linux/mm.h> +#include <linux/ioport.h> +#include <linux/blkdev.h> +#include <linux/hdreg.h> +#include <linux/interrupt.h> +#include <linux/pci.h> +#include <linux/init.h> +#include <linux/ide.h> +#include <asm/io.h> +#include <asm/irq.h> +#include "ide_modes.h" + +/* + * Return the mode name for a drive transfer mode value: + */ +static const char *strmode (byte mode) +{ + switch (mode) { + case XFER_UDMA_4: return("UDMA4"); + case XFER_UDMA_3: return("UDMA3"); + case XFER_UDMA_2: return("UDMA2"); + case XFER_UDMA_1: return("UDMA1"); + case XFER_UDMA_0: return("UDMA0"); + case XFER_MW_DMA_2: return("MDMA2"); + case XFER_MW_DMA_1: return("MDMA1"); + case XFER_MW_DMA_0: return("MDMA0"); + case XFER_SW_DMA_2: return("SDMA2"); + case XFER_SW_DMA_1: return("SDMA1"); + case XFER_SW_DMA_0: return("SDMA0"); + case XFER_PIO_4: return("PIO4"); + case XFER_PIO_3: return("PIO3"); + case XFER_PIO_2: return("PIO2"); + case XFER_PIO_1: return("PIO1"); + case XFER_PIO_0: return("PIO0"); + default: return("???"); + } +} + +/* + * Set a new transfer mode at the drive + */ +int cs5530_set_xfer_mode (ide_drive_t *drive, byte mode) +{ + int i, error = 1; + byte stat; + ide_hwif_t *hwif = HWIF(drive); + + printk("%s: cs5530_set_xfer_mode(%s)\n", drive->name, strmode(mode)); + /* + * If this is a DMA mode setting, then turn off all DMA bits. + * We will set one of them back on afterwards, if all goes well. + * + * Not sure why this is needed (it looks very silly), + * but other IDE chipset drivers also do this fiddling. ???? -ml + */ + switch (mode) { + case XFER_UDMA_4: + case XFER_UDMA_3: + case XFER_UDMA_2: + case XFER_UDMA_1: + case XFER_UDMA_0: + case XFER_MW_DMA_2: + case XFER_MW_DMA_1: + case XFER_MW_DMA_0: + case XFER_SW_DMA_2: + case XFER_SW_DMA_1: + case XFER_SW_DMA_0: + drive->id->dma_ultra &= ~0xFF00; + drive->id->dma_mword &= ~0x0F00; + drive->id->dma_1word &= ~0x0F00; + } + + /* + * Select the drive, and issue the SETFEATURES command + */ + disable_irq(hwif->irq); + udelay(1); + SELECT_DRIVE(HWIF(drive), drive); + udelay(1); + if (IDE_CONTROL_REG) + OUT_BYTE(drive->ctl | 2, IDE_CONTROL_REG); + OUT_BYTE(mode, IDE_NSECTOR_REG); + OUT_BYTE(SETFEATURES_XFER, IDE_FEATURE_REG); + OUT_BYTE(WIN_SETFEATURES, IDE_COMMAND_REG); + udelay(1); /* spec allows drive 400ns to assert "BUSY" */ + + /* + * Wait for drive to become non-BUSY + */ + if ((stat = GET_STAT()) & BUSY_STAT) { + unsigned long flags, timeout; + __save_flags(flags); /* local CPU only */ + ide__sti(); /* local CPU only -- for jiffies */ + timeout = jiffies + WAIT_CMD; + while ((stat = GET_STAT()) & BUSY_STAT) { + if (0 < (signed long)(jiffies - timeout)) + break; + } + __restore_flags(flags); /* local CPU only */ + } + + /* + * Allow status to settle, then read it again. 
+ * A few rare drives vastly violate the 400ns spec here, + * so we'll wait up to 10usec for a "good" status + * rather than expensively fail things immediately. + */ + for (i = 0; i < 10; i++) { + udelay(1); + if (OK_STAT((stat = GET_STAT()), DRIVE_READY, BUSY_STAT|DRQ_STAT|ERR_STAT)) { + error = 0; + break; + } + } + enable_irq(hwif->irq); + + /* + * Turn dma bit on if all is okay + */ + if (error) { + (void) ide_dump_status(drive, "cs5530_set_xfer_mode", stat); + } else { + switch (mode) { + case XFER_UDMA_4: drive->id->dma_ultra |= 0x1010; break; + case XFER_UDMA_3: drive->id->dma_ultra |= 0x0808; break; + case XFER_UDMA_2: drive->id->dma_ultra |= 0x0404; break; + case XFER_UDMA_1: drive->id->dma_ultra |= 0x0202; break; + case XFER_UDMA_0: drive->id->dma_ultra |= 0x0101; break; + case XFER_MW_DMA_2: drive->id->dma_mword |= 0x0404; break; + case XFER_MW_DMA_1: drive->id->dma_mword |= 0x0202; break; + case XFER_MW_DMA_0: drive->id->dma_mword |= 0x0101; break; + case XFER_SW_DMA_2: drive->id->dma_1word |= 0x0404; break; + case XFER_SW_DMA_1: drive->id->dma_1word |= 0x0202; break; + case XFER_SW_DMA_0: drive->id->dma_1word |= 0x0101; break; + } + } + return error; +} + +/* + * Here are the standard PIO mode 0-4 timings for each "format". + * Format-0 uses fast data reg timings, with slower command reg timings. + * Format-1 uses fast timings for all registers, but won't work with all drives. + */ +static unsigned int cs5530_pio_timings[2][5] = + {{0x00009172, 0x00012171, 0x00020080, 0x00032010, 0x00040010}, + {0xd1329172, 0x71212171, 0x30200080, 0x20102010, 0x00100010}}; + +/* + * After chip reset, the PIO timings are set to 0x0000e132, which is not valid. + */ +#define CS5530_BAD_PIO(timings) (((timings)&~0x80000000)==0x0000e132) +#define CS5530_BASEREG(hwif) (((hwif)->dma_base & ~0xf) + ((hwif)->channel ? 0x30 : 0x20)) + +/* + * cs5530_tuneproc() handles selection/setting of PIO modes + * for both the chipset and drive. + * + * The ide_init_cs5530() routine guarantees that all drives + * will have valid default PIO timings set up before we get here. + */ +static void cs5530_tuneproc (ide_drive_t *drive, byte pio) /* pio=255 means "autotune" */ +{ + ide_hwif_t *hwif = HWIF(drive); + unsigned int format, basereg = CS5530_BASEREG(hwif); + static byte modes[5] = {XFER_PIO_0, XFER_PIO_1, XFER_PIO_2, XFER_PIO_3, XFER_PIO_4}; + + pio = ide_get_best_pio_mode(drive, pio, 4, NULL); + if (!cs5530_set_xfer_mode(drive, modes[pio])) { + format = (inl(basereg+4) >> 31) & 1; + outl(cs5530_pio_timings[format][pio], basereg+(drive->select.b.unit<<3)); + } +} + +/* + * cs5530_config_dma() handles selection/setting of DMA/UDMA modes + * for both the chipset and drive. + */ +static int cs5530_config_dma (ide_drive_t *drive) +{ + int udma_ok = 1, mode = 0; + ide_hwif_t *hwif = HWIF(drive); + int unit = drive->select.b.unit; + ide_drive_t *mate = &hwif->drives[unit^1]; + struct hd_driveid *id = drive->id; + unsigned int basereg, reg, timings; + + + /* + * Default to DMA-off in case we run into trouble here. + */ + (void)hwif->dmaproc(ide_dma_off_quietly, drive); /* turn off DMA while we fiddle */ + outb(inb(hwif->dma_base+2)&~(unit?0x40:0x20), hwif->dma_base+2); /* clear DMA_capable bit */ + + /* + * The CS5530 specifies that two drives sharing a cable cannot + * mix UDMA/MDMA. It has to be one or the other, for the pair, + * though different timings can still be chosen for each drive. + * We could set the appropriate timing bits on the fly, + * but that might be a bit confusing. 
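(Aside, not part of the patch: the PIO half of this driver programs two 32-bit timing words per channel. A hypothetical helper, sketching the register addressing that cs5530_tuneproc() above relies on:)

	/* PIO timing word: basereg+0 for drive 0, basereg+8 for drive 1;
	 * bit 31 of the word at basereg+4 selects the timing "format". */
	static unsigned int cs5530_pio_reg(ide_hwif_t *hwif, int unit)
	{
		unsigned int basereg = CS5530_BASEREG(hwif);
		return basereg + (unit << 3);	/* +0 or +8 */
	}
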
So, for now we statically + * handle this requirement by looking at our mate drive to see + * what it is capable of, before choosing a mode for our own drive. + */ + if (mate->present) { + struct hd_driveid *mateid = mate->id; + if (mateid && (mateid->capability & 1) && !hwif->dmaproc(ide_dma_bad_drive, mate)) { + if ((mateid->field_valid & 4) && (mateid->dma_ultra & 7)) + udma_ok = 1; + else if ((mateid->field_valid & 2) && (mateid->dma_mword & 7)) + udma_ok = 0; + else + udma_ok = 1; + } + } + + /* + * Now see what the current drive is capable of, + * selecting UDMA only if the mate said it was ok. + */ + if (id && (id->capability & 1) && hwif->autodma && !hwif->dmaproc(ide_dma_bad_drive, drive)) { + if (udma_ok && (id->field_valid & 4) && (id->dma_ultra & 7)) { + if (id->dma_ultra & 4) + mode = XFER_UDMA_2; + else if (id->dma_ultra & 2) + mode = XFER_UDMA_1; + else if (id->dma_ultra & 1) + mode = XFER_UDMA_0; + } + if (!mode && (id->field_valid & 2) && (id->dma_mword & 7)) { + if (id->dma_mword & 4) + mode = XFER_MW_DMA_2; + else if (id->dma_mword & 2) + mode = XFER_MW_DMA_1; + else if (id->dma_mword & 1) + mode = XFER_MW_DMA_0; + } + } + + /* + * Tell the drive to switch to the new mode; abort on failure. + */ + if (!mode || cs5530_set_xfer_mode(drive, mode)) + return 1; /* failure */ + + /* + * Now tune the chipset to match the drive: + */ + switch (mode) { + case XFER_UDMA_0: timings = 0x00921250; break; + case XFER_UDMA_1: timings = 0x00911140; break; + case XFER_UDMA_2: timings = 0x00911030; break; + case XFER_MW_DMA_0: timings = 0x00077771; break; + case XFER_MW_DMA_1: timings = 0x00012121; break; + case XFER_MW_DMA_2: timings = 0x00002020; break; + default: + printk("%s: cs5530_config_dma: huh? mode=%02x\n", drive->name, mode); + return 1; /* failure */ + } + basereg = CS5530_BASEREG(hwif); + reg = inl(basereg+4); /* get drive0 config register */ + timings |= reg & 0x80000000; /* preserve PIO format bit */ + if (unit == 0) { /* are we configuring drive0? */ + outl(timings, basereg+4); /* write drive0 config register */ + } else { + if (timings & 0x00100000) + reg |= 0x00100000; /* enable UDMA timings for both drives */ + else + reg &= ~0x00100000; /* disable UDMA timings for both drives */ + outl(reg, basereg+4); /* write drive0 config register */ + outl(timings, basereg+12); /* write drive1 config register */ + } + outb(inb(hwif->dma_base+2)|(unit?0x40:0x20), hwif->dma_base+2); /* set DMA_capable bit */ + + if (!strcmp(drive->name, "hdc")) /* FIXME */ + return 0; + /* + * Finally, turn DMA on in software, and exit. + */ + return hwif->dmaproc(ide_dma_on, drive); /* success */ +} + +/* + * This is a CS5530-specific wrapper for the standard ide_dmaproc(). + * We need it for our custom "ide_dma_check" function. + * All other requests are forwarded to the standard ide_dmaproc(). + */ +int cs5530_dmaproc (ide_dma_action_t func, ide_drive_t *drive) +{ + switch (func) { + case ide_dma_check: + return cs5530_config_dma(drive); + default: + return ide_dmaproc(func, drive); + } +} + +/* + * Initialize the cs5530 bridge for reliable IDE DMA operation. 
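 *
 * (Editorial sketch, not in the patch: the function below is a chain of
 * PCI config-space read-modify-writes. The pattern it repeats, shown
 * here for PCI_COMMAND:
 *
 *	u16 cmd;
 *	pci_read_config_word(cs5530_0, PCI_COMMAND, &cmd);
 *	pci_write_config_word(cs5530_0, PCI_COMMAND,
 *		cmd | PCI_COMMAND_MASTER | PCI_COMMAND_INVALIDATE);
 *
 * i.e. read the register, OR in the new bits, write it back.)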
+ */ +unsigned int __init pci_init_cs5530 (struct pci_dev *dev, const char *name) +{ + struct pci_dev *master_0 = NULL, *cs5530_0 = NULL; + unsigned short pcicmd = 0; + unsigned long flags; + + pci_for_each_dev (dev) { + if (dev->vendor == PCI_VENDOR_ID_CYRIX) { + switch (dev->device) { + case PCI_DEVICE_ID_CYRIX_PCI_MASTER: + master_0 = dev; + break; + case PCI_DEVICE_ID_CYRIX_5530_LEGACY: + cs5530_0 = dev; + break; + } + } + } + if (!master_0) { + printk("%s: unable to locate PCI MASTER function\n", name); + return 0; + } + if (!cs5530_0) { + printk("%s: unable to locate CS5530 LEGACY function\n", name); + return 0; + } + + save_flags(flags); + cli(); /* all CPUs (there should only be one CPU with this chipset) */ + + /* + * Enable BusMaster and MemoryWriteAndInvalidate for the cs5530: + * --> OR 0x14 into 16-bit PCI COMMAND reg of function 0 of the cs5530 + */ + pci_read_config_word (cs5530_0, PCI_COMMAND, &pcicmd); + pci_write_config_word(cs5530_0, PCI_COMMAND, pcicmd | PCI_COMMAND_MASTER | PCI_COMMAND_INVALIDATE); + + /* + * Set PCI CacheLineSize to 16-bytes: + * --> Write 0x04 into 8-bit PCI CACHELINESIZE reg of function 0 of the cs5530 + */ + pci_write_config_byte(cs5530_0, PCI_CACHE_LINE_SIZE, 0x04); + + /* + * Disable trapping of UDMA register accesses (Win98 hack): + * --> Write 0x5006 into 16-bit reg at offset 0xd0 of function 0 of the cs5530 + */ + pci_write_config_word(cs5530_0, 0xd0, 0x5006); + + /* + * Bit-1 at 0x40 enables MemoryWriteAndInvalidate on internal X-bus: + * The other settings are what is necessary to get the register + * into a sane state for IDE DMA operation. + */ + pci_write_config_byte(master_0, 0x40, 0x1e); + + /* + * Set max PCI burst size (16-bytes seems to work best): + * 16bytes: set bit-1 at 0x41 (reg value of 0x16) + * all others: clear bit-1 at 0x41, and do: + * 128bytes: OR 0x00 at 0x41 + * 256bytes: OR 0x04 at 0x41 + * 512bytes: OR 0x08 at 0x41 + * 1024bytes: OR 0x0c at 0x41 + */ + pci_write_config_byte(master_0, 0x41, 0x14); + + /* + * These settings are necessary to get the chip + * into a sane state for IDE DMA operation. + */ + pci_write_config_byte(master_0, 0x42, 0x00); + pci_write_config_byte(master_0, 0x43, 0xc1); + + restore_flags(flags); + return 0; +} + +/* + * This gets invoked by the IDE driver once for each channel, + * and performs channel-specific pre-initialization before drive probing. + */ +void __init ide_init_cs5530 (ide_hwif_t *hwif) +{ + if (hwif->mate) + hwif->serialized = hwif->mate->serialized = 1; + if (!hwif->dma_base) { + hwif->autodma = 0; + } else { + unsigned int basereg, d0_timings; + + hwif->dmaproc = &cs5530_dmaproc; + hwif->tuneproc = &cs5530_tuneproc; + basereg = CS5530_BASEREG(hwif); + d0_timings = inl(basereg+0); + if (CS5530_BAD_PIO(d0_timings)) { /* PIO timings not initialized? */ + outl(cs5530_pio_timings[(d0_timings>>31)&1][0], basereg+0); + if (!hwif->drives[0].autotune) + hwif->drives[0].autotune = 1; /* needs autotuning later */ + } + if (CS5530_BAD_PIO(inl(basereg+8))) { /* PIO timings not initialized? 
*/ + outl(cs5530_pio_timings[(d0_timings>>31)&1][0], basereg+8); + if (!hwif->drives[1].autotune) + hwif->drives[1].autotune = 1; /* needs autotuning later */ + } + } +} diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c index 34f8a3c98..7b956dfae 100644 --- a/drivers/block/floppy.c +++ b/drivers/block/floppy.c @@ -289,9 +289,6 @@ static inline int DRIVE(kdev_t x) { #define CLEARSTRUCT(x) memset((x), 0, sizeof(*(x))) -#define INT_OFF save_flags(flags); cli() -#define INT_ON restore_flags(flags) - /* read/write */ #define COMMAND raw_cmd->cmd[0] #define DR_SELECT raw_cmd->cmd[1] @@ -471,7 +468,8 @@ static int probing = 0; #define FD_COMMAND_ERROR 2 #define FD_COMMAND_OKAY 3 -static volatile int command_status = FD_COMMAND_NONE, fdc_busy = 0; +static volatile int command_status = FD_COMMAND_NONE; +static unsigned long fdc_busy = 0; static DECLARE_WAIT_QUEUE_HEAD(fdc_wait); static DECLARE_WAIT_QUEUE_HEAD(command_done); @@ -846,24 +844,36 @@ static void set_fdc(int drive) /* locks the driver */ static int lock_fdc(int drive, int interruptible) { - unsigned long flags; - if (!usage_count){ printk(KERN_ERR "Trying to lock fdc while usage count=0\n"); return -1; } if(floppy_grab_irq_and_dma()==-1) return -EBUSY; - INT_OFF; - while (fdc_busy && NO_SIGNAL) - interruptible_sleep_on(&fdc_wait); - if (fdc_busy){ - INT_ON; - return -EINTR; + + if (test_and_set_bit(0, &fdc_busy)) { + DECLARE_WAITQUEUE(wait, current); + add_wait_queue(&fdc_wait, &wait); + + for (;;) { + set_current_state(TASK_INTERRUPTIBLE); + + if (!test_and_set_bit(0, &fdc_busy)) + break; + + schedule(); + + if (!NO_SIGNAL) { + remove_wait_queue(&fdc_wait, &wait); + return -EINTR; + } + } + + set_current_state(TASK_RUNNING); + remove_wait_queue(&fdc_wait, &wait); } - fdc_busy = 1; - INT_ON; command_status = FD_COMMAND_NONE; + reschedule_timeout(drive, "lock fdc", 0); set_fdc(drive); return 0; @@ -886,7 +896,7 @@ static inline void unlock_fdc(void) command_status = FD_COMMAND_NONE; del_timer(&fd_timeout); cont = NULL; - fdc_busy = 0; + clear_bit(0, &fdc_busy); floppy_release_irq_and_dma(); wake_up(&fdc_wait); } @@ -1031,39 +1041,39 @@ static int wait_for_completion(unsigned long delay, timeout_fn function) return 0; } +static spinlock_t floppy_hlt_lock = SPIN_LOCK_UNLOCKED; static int hlt_disabled=0; static void floppy_disable_hlt(void) { unsigned long flags; - INT_OFF; - if (!hlt_disabled){ + spin_lock_irqsave(&floppy_hlt_lock, flags); + if (!hlt_disabled) { hlt_disabled=1; #ifdef HAVE_DISABLE_HLT disable_hlt(); #endif } - INT_ON; + spin_unlock_irqrestore(&floppy_hlt_lock, flags); } static void floppy_enable_hlt(void) { unsigned long flags; - INT_OFF; + spin_lock_irqsave(&floppy_hlt_lock, flags); if (hlt_disabled){ hlt_disabled=0; #ifdef HAVE_DISABLE_HLT enable_hlt(); #endif } - INT_ON; + spin_unlock_irqrestore(&floppy_hlt_lock, flags); } static void setup_DMA(void) { - unsigned long flags; unsigned long f; #ifdef FLOPPY_SANITY_CHECK @@ -1085,7 +1095,6 @@ static void setup_DMA(void) return; } #endif - INT_OFF; f=claim_dma_lock(); fd_disable_dma(FLOPPY_DMA); #ifdef fd_dma_setup @@ -1094,7 +1103,6 @@ static void setup_DMA(void) DMA_MODE_READ : DMA_MODE_WRITE, FDCS->address) < 0) { release_dma_lock(f); - INT_ON; cont->done(0); FDCS->reset=1; return; @@ -1113,7 +1121,6 @@ static void setup_DMA(void) fd_enable_dma(FLOPPY_DMA); release_dma_lock(f); #endif - INT_ON; floppy_disable_hlt(); } @@ -1761,14 +1768,7 @@ void floppy_interrupt(int irq, void *dev_id, struct pt_regs * regs) } while ((ST0 & 0x83) != UNIT(current_drive) && inr == 
2 && max_sensei); } if (handler) { - int cpu = smp_processor_id(); - if(softirq_trylock(cpu)) { - /* got the lock, call the handler immediately */ - handler(); - softirq_endlock(cpu); - } else - /* we interrupted a bottom half. Defer handler */ - schedule_bh( (void *)(void *) handler); + schedule_bh( (void *)(void *) handler); } else FDCS->reset = 1; is_alive("normal interrupt end"); @@ -1856,7 +1856,7 @@ static void show_floppy(void) #endif printk("status=%x\n", fd_inb(FD_STATUS)); - printk("fdc_busy=%d\n", fdc_busy); + printk("fdc_busy=%lu\n", fdc_busy); if (DEVICE_INTR) printk("DEVICE_INTR=%p\n", DEVICE_INTR); if (floppy_tq.sync) @@ -2027,25 +2027,36 @@ static struct cont_t intr_cont={ static int wait_til_done(void (*handler)(void), int interruptible) { int ret; - unsigned long flags; schedule_bh((void *)(void *)handler); - INT_OFF; - while(command_status < 2 && NO_SIGNAL){ - is_alive("wait_til_done"); - if (interruptible) - interruptible_sleep_on(&command_done); - else - sleep_on(&command_done); + + if (command_status < 2 && NO_SIGNAL) { + DECLARE_WAITQUEUE(wait, current); + + add_wait_queue(&command_done, &wait); + for (;;) { + set_current_state(interruptible? + TASK_INTERRUPTIBLE: + TASK_UNINTERRUPTIBLE); + + if (command_status >= 2 || !NO_SIGNAL) + break; + + is_alive("wait_til_done"); + + schedule(); + } + + set_current_state(TASK_RUNNING); + remove_wait_queue(&command_done, &wait); } + if (command_status < 2){ cancel_activity(); cont = &intr_cont; reset_fdc(); - INT_ON; return -EINTR; } - INT_ON; if (FDCS->reset) command_status = FD_COMMAND_ERROR; @@ -4179,22 +4190,26 @@ int __init floppy_init(void) return have_no_fdc; } +static spinlock_t floppy_usage_lock = SPIN_LOCK_UNLOCKED; + static int floppy_grab_irq_and_dma(void) { unsigned long flags; - INT_OFF; + spin_lock_irqsave(&floppy_usage_lock, flags); if (usage_count++){ - INT_ON; + spin_unlock_irqrestore(&floppy_usage_lock, flags); return 0; } - INT_ON; + spin_unlock_irqrestore(&floppy_usage_lock, flags); MOD_INC_USE_COUNT; if (fd_request_irq(FLOPPY_IRQ)) { DPRINT("Unable to grab IRQ%d for the floppy driver\n", FLOPPY_IRQ); MOD_DEC_USE_COUNT; + spin_lock_irqsave(&floppy_usage_lock, flags); usage_count--; + spin_unlock_irqrestore(&floppy_usage_lock, flags); return -1; } if (fd_request_dma(FLOPPY_DMA)) { @@ -4202,7 +4217,9 @@ static int floppy_grab_irq_and_dma(void) FLOPPY_DMA); fd_free_irq(FLOPPY_IRQ); MOD_DEC_USE_COUNT; + spin_lock_irqsave(&floppy_usage_lock, flags); usage_count--; + spin_unlock_irqrestore(&floppy_usage_lock, flags); return -1; } @@ -4219,7 +4236,9 @@ static int floppy_grab_irq_and_dma(void) release_region(FDCS->address+7, 1); } MOD_DEC_USE_COUNT; + spin_lock_irqsave(&floppy_usage_lock, flags); usage_count--; + spin_unlock_irqrestore(&floppy_usage_lock, flags); return -1; } request_region(FDCS->address, 6, "floppy"); @@ -4261,12 +4280,12 @@ static void floppy_release_irq_and_dma(void) unsigned long tmpaddr; unsigned long flags; - INT_OFF; + spin_lock_irqsave(&floppy_usage_lock, flags); if (--usage_count){ - INT_ON; + spin_unlock_irqrestore(&floppy_usage_lock, flags); return; } - INT_ON; + spin_unlock_irqrestore(&floppy_usage_lock, flags); if(irqdma_allocated) { fd_disable_dma(FLOPPY_DMA); diff --git a/drivers/block/hpt366.c b/drivers/block/hpt366.c index 65c695183..7328ea7b8 100644 --- a/drivers/block/hpt366.c +++ b/drivers/block/hpt366.c @@ -10,6 +10,7 @@ * development and support. 
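(Aside on the floppy.c rework above, not part of the patch: lock_fdc() and wait_til_done() replace interruptible_sleep_on() with the race-free open-coded wait. The idiom, with hypothetical names:)

	DECLARE_WAITQUEUE(wait, current);

	add_wait_queue(&waitq, &wait);
	for (;;) {
		/* mark ourselves sleeping *before* re-testing the condition,
		 * so a wake-up cannot be lost in between */
		set_current_state(TASK_INTERRUPTIBLE);
		if (condition_is_true())	/* e.g. !test_and_set_bit(0, &fdc_busy) */
			break;
		schedule();
		if (signal_pending(current))	/* the driver's !NO_SIGNAL case */
			break;			/* caller then returns -EINTR */
	}
	set_current_state(TASK_RUNNING);
	remove_wait_queue(&waitq, &wait);
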
*/ +#include <linux/config.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/delay.h> @@ -262,20 +263,20 @@ static int config_chipset_for_dma (ide_drive_t *drive) pci_read_config_byte(HWIF(drive)->pci_dev, 0x51, ®51h); -#ifdef HPT366_FAST_IRQ_PREDICTION +#ifdef CONFIG_HPT366_FAST_IRQ_PREDICTION /* * Some drives prefer/allow for the method of handling interrupts. */ if (!(reg51h & 0x80)) pci_write_config_byte(HWIF(drive)->pci_dev, 0x51, reg51h|0x80); -#else /* ! HPT366_FAST_IRQ_PREDICTION */ +#else /* ! CONFIG_HPT366_FAST_IRQ_PREDICTION */ /* * Disable the "fast interrupt" prediction. * Instead, always wait for the real interrupt from the drive! */ if (reg51h & 0x80) pci_write_config_byte(HWIF(drive)->pci_dev, 0x51, reg51h & ~0x80); -#endif /* HPT366_FAST_IRQ_PREDICTION */ +#endif /* CONFIG_HPT366_FAST_IRQ_PREDICTION */ /* * Preserve existing PIO settings: diff --git a/drivers/block/ide-floppy.c b/drivers/block/ide-floppy.c index 4d6ec68b6..b24933637 100644 --- a/drivers/block/ide-floppy.c +++ b/drivers/block/ide-floppy.c @@ -1397,6 +1397,13 @@ static int idefloppy_identify_device (ide_drive_t *drive,struct hd_driveid *id) *((unsigned short *) &gcw) = id->config; +#ifdef CONFIG_PPC + /* kludge for Apple PowerBook internal zip */ + if ((gcw.device_type == 5) && !strstr(id->model, "CD-ROM") + && strstr(id->model, "ZIP")) + gcw.device_type = 0; +#endif + #if IDEFLOPPY_DEBUG_INFO printk (KERN_INFO "Dumping ATAPI Identify Device floppy parameters\n"); switch (gcw.protocol) { diff --git a/drivers/block/ide-pci.c b/drivers/block/ide-pci.c index 7f429eff2..a5593dffb 100644 --- a/drivers/block/ide-pci.c +++ b/drivers/block/ide-pci.c @@ -60,7 +60,7 @@ #define DEVID_ALI15X3 ((ide_pci_devid_t){PCI_VENDOR_ID_AL, PCI_DEVICE_ID_AL_M5229}) #define DEVID_CY82C693 ((ide_pci_devid_t){PCI_VENDOR_ID_CONTAQ, PCI_DEVICE_ID_CONTAQ_82C693}) #define DEVID_HINT ((ide_pci_devid_t){0x3388, 0x8013}) -#define DEVID_CX5530 ((ide_pci_devid_t){PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5530_IDE}) +#define DEVID_CS5530 ((ide_pci_devid_t){PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5530_IDE}) #define DEVID_AMD7409 ((ide_pci_devid_t){PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_VIPER_7409}) #define IDE_IGNORE ((void *)-1) @@ -132,7 +132,15 @@ extern void ide_init_cy82c693(ide_hwif_t *); #define INIT_CY82C693 NULL #endif -#define INIT_CX5530 NULL +#ifdef CONFIG_BLK_DEV_CS5530 +extern unsigned int pci_init_cs5530(struct pci_dev *, const char *); +extern void ide_init_cs5530(ide_hwif_t *); +#define INIT_CS5530 &ide_init_cs5530 +#define PCI_CS5530 &pci_init_cs5530 +#else +#define INIT_CS5530 NULL +#define PCI_CS5530 NULL +#endif #ifdef CONFIG_BLK_DEV_HPT34X extern unsigned int pci_init_hpt34x(struct pci_dev *, const char *); @@ -309,7 +317,7 @@ static ide_pci_device_t ide_pci_chipsets[] __initdata = { {DEVID_ALI15X3, "ALI15X3", PCI_ALI15X3, ATA66_ALI15X3, INIT_ALI15X3, DMA_ALI15X3, {{0x00,0x00,0x00}, {0x00,0x00,0x00}}, ON_BOARD, 0 }, {DEVID_CY82C693,"CY82C693", PCI_CY82C693, NULL, INIT_CY82C693, NULL, {{0x00,0x00,0x00}, {0x00,0x00,0x00}}, ON_BOARD, 0 }, {DEVID_HINT, "HINT_IDE", NULL, NULL, NULL, NULL, {{0x00,0x00,0x00}, {0x00,0x00,0x00}}, ON_BOARD, 0 }, - {DEVID_CX5530, "CX5530", NULL, NULL, INIT_CX5530, NULL, {{0x00,0x00,0x00}, {0x00,0x00,0x00}}, ON_BOARD, 0 }, + {DEVID_CS5530, "CS5530", PCI_CS5530, NULL, INIT_CS5530, NULL, {{0x00,0x00,0x00}, {0x00,0x00,0x00}}, ON_BOARD, 0 }, {DEVID_AMD7409, "AMD7409", NULL, ATA66_AMD7409, INIT_AMD7409, NULL, {{0x40,0x01,0x01}, {0x40,0x02,0x02}}, ON_BOARD, 0 }, {IDE_PCI_DEVID_NULL, 
"PCI_IDE", NULL, NULL, NULL, NULL, {{0x00,0x00,0x00}, {0x00,0x00,0x00}}, ON_BOARD, 0 }}; @@ -614,6 +622,7 @@ check_if_enabled: IDE_PCI_DEVID_EQ(d->devid, DEVID_HPT34X) || #endif /* CONFIG_BLK_DEV_HPT34X */ IDE_PCI_DEVID_EQ(d->devid, DEVID_HPT366) || + IDE_PCI_DEVID_EQ(d->devid, DEVID_CS5530) || IDE_PCI_DEVID_EQ(d->devid, DEVID_CY82C693) || ((dev->class >> 8) == PCI_CLASS_STORAGE_IDE && (dev->class & 0x80))) { unsigned long dma_base = ide_get_or_set_dma_base(hwif, (!mate && d->extra) ? d->extra : 0, d->name); @@ -682,12 +691,12 @@ static void __init hpt366_device_order_fixup (struct pci_dev *dev, ide_pci_devic printk("%s: IDE controller on PCI bus %02x dev %02x\n", d2->name, dev2->bus->number, dev2->devfn); if (hpt363_shared_pin && !hpt363_shared_irq) { printk("%s: IDE controller run unsupported mode three!!!\n", d2->name); -#ifndef HPT366_MODE3 +#ifndef CONFIG_HPT366_MODE3 printk("%s: IDE controller report to <andre@suse.com>\n", d->name); return; -#else /* HPT366_MODE3 */ +#else /* CONFIG_HPT366_MODE3 */ printk("%s: OVERRIDE IDE controller not advisable this mode!!!\n", d2->name); -#endif /* HPT366_MODE3 */ +#endif /* CONFIG_HPT366_MODE3 */ } ide_setup_pci_device(dev2, d2); } diff --git a/drivers/block/ide-pmac.c b/drivers/block/ide-pmac.c index e6947e560..28eb789c2 100644 --- a/drivers/block/ide-pmac.c +++ b/drivers/block/ide-pmac.c @@ -14,6 +14,12 @@ * * Copyright (c) 1995-1998 Mark Lord * + * BenH: I began adding more complete timing setup code, mostly because DMA + * won't work on new machines unless timings are setup correctly. This + * code was mainly stolen from Cmd646 driver and should be completed to + * include real timing calc. instead of hard coded values. The format of + * the timing register can be found in Darwin's source code, except for + * Keylargo ATA-4 controller. 
*/ #include <linux/config.h> #include <linux/types.h> @@ -36,29 +42,92 @@ #endif #include "ide_modes.h" -int pmac_ide_ports_known; -ide_ioreg_t pmac_ide_regbase[MAX_HWIFS]; -int pmac_ide_irq[MAX_HWIFS]; -int pmac_ide_count; -struct device_node *pmac_ide_node[MAX_HWIFS]; +#undef IDE_PMAC_DEBUG +#define IDE_SYSCLK_NS 30 + +struct pmac_ide_hwif { + ide_ioreg_t regbase; + int irq; + int kind; + struct device_node* node; + u32 timings[2]; #ifdef CONFIG_BLK_DEV_IDEDMA_PMAC + volatile struct dbdma_regs* dma_regs; + struct dbdma_cmd* dma_table; +#endif + +} pmac_ide[MAX_HWIFS]; + +static int pmac_ide_count; + +enum { + controller_ohare, /* OHare based */ + controller_heathrow, /* Heathrow/Paddington */ + controller_kl_ata3, /* KeyLargo ATA-3 */ + controller_kl_ata4 /* KeyLargo ATA-4 */ +}; + + +#ifdef CONFIG_BLK_DEV_IDEDMA_PMAC + +typedef struct { + int accessTime; + int cycleTime; +} pmac_ide_timing; + +/* Multiword DMA timings */ +static pmac_ide_timing mdma_timings[] = +{ + { 215, 480 }, /* Mode 0 */ + { 80, 150 }, /* 1 */ + { 70, 120 } /* 2 */ +}; + +/* Ultra DMA timings (for use when I know how to calculate them */ +static pmac_ide_timing udma_timings[] = +{ + { 0, 114 }, /* Mode 0 */ + { 0, 73 }, /* 1 */ + { 0, 54 }, /* 2 */ + { 0, 39 }, /* 3 */ + { 0, 25 } /* 4 */ +}; + #define MAX_DCMDS 256 /* allow up to 256 DBDMA commands per xfer */ -static void pmac_ide_setup_dma(struct device_node *np, ide_hwif_t *hwif); +static void pmac_ide_setup_dma(struct device_node *np, int ix); static int pmac_ide_dmaproc(ide_dma_action_t func, ide_drive_t *drive); -static int pmac_ide_build_dmatable(ide_drive_t *drive, int wr); +static int pmac_ide_build_dmatable(ide_drive_t *drive, int ix, int wr); +static void pmac_ide_tuneproc(ide_drive_t *drive, byte pio); +static void pmac_ide_selectproc(ide_drive_t *drive); + #endif /* CONFIG_BLK_DEV_IDEDMA_PMAC */ #ifdef CONFIG_PMAC_PBOOK -static int idepmac_notify(struct pmu_sleep_notifier *self, int when); +static int idepmac_notify_sleep(struct pmu_sleep_notifier *self, int when); struct pmu_sleep_notifier idepmac_sleep_notifier = { - idepmac_notify, SLEEP_LEVEL_BLOCK, + idepmac_notify_sleep, SLEEP_LEVEL_BLOCK, }; #endif /* CONFIG_PMAC_PBOOK */ +static int +pmac_ide_find(ide_drive_t *drive) +{ + ide_hwif_t *hwif = HWIF(drive); + ide_ioreg_t base; + int i; + + for (i=0; i<pmac_ide_count; i++) { + base = pmac_ide[i].regbase; + if (base && base == hwif->io_ports[0]) + return i; + } + return -1; +} + /* - * N.B. this can't be an __init, because the media-bay task can + * N.B. this can't be an initfunc, because the media-bay task can * call ide_[un]register at any time. */ void pmac_ide_init_hwif_ports(hw_regs_t *hw, @@ -71,7 +140,7 @@ void pmac_ide_init_hwif_ports(hw_regs_t *hw, return; for (ix = 0; ix < MAX_HWIFS; ++ix) - if (data_port == pmac_ide_regbase[ix]) + if (data_port == pmac_ide[ix].regbase) break; if (ix >= MAX_HWIFS) { @@ -98,27 +167,125 @@ void pmac_ide_init_hwif_ports(hw_regs_t *hw, hw->io_ports[8] = data_port + 0x160; if (irq != NULL) - *irq = pmac_ide_irq[ix]; + *irq = pmac_ide[ix].irq; + + ide_hwifs[ix].tuneproc = pmac_ide_tuneproc; + ide_hwifs[ix].selectproc = pmac_ide_selectproc; + if (pmac_ide[ix].dma_regs && pmac_ide[ix].dma_table) { + ide_hwifs[ix].dmaproc = &pmac_ide_dmaproc; +#ifdef CONFIG_PMAC_IDEDMA_AUTO + ide_hwifs[ix].autodma = 1; +#endif + } } -void pmac_ide_tuneproc(ide_drive_t *drive, byte pio) +#if 0 +/* This one could be later extended to handle CMD IDE and be used by some kind + * of /proc interface. 
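(Aside, not in the patch: a worked example of the PIO timing math in pmac_ide_tuneproc() below, assuming the ide_modes.h figures for PIO mode 4, 70ns active time and a 120ns cycle:)

	accessTicks = SYSCLK_TICKS(70)           =  3, clamped up to 4
	recTicks    = SYSCLK_TICKS(120) - 4 - 4  = -4, clamped up to 1
	timing low bits = accessTicks | (recTicks << 5) = 0x24

(Access ticks occupy bits 0-4 and recovery ticks bits 5-9, both in 30ns IDE_SYSCLK_NS units.)
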
I want to be able to get the devicetree path of a block + * device for yaboot configuration + */ +struct device_node* +pmac_ide_get_devnode(ide_drive_t *drive) { - ide_pio_data_t d; + int i = pmac_ide_find(drive); + if (i < 0) + return NULL; + return pmac_ide[i].node; +} +#endif - if (_machine != _MACH_Pmac) +/* Setup timings for the selected drive (master/slave). I still need to verify if this + * is enough, I beleive selectproc will be called whenever an IDE command is started, + * but... */ +static void +pmac_ide_selectproc(ide_drive_t *drive) +{ + int i = pmac_ide_find(drive); + if (i < 0) return; + + if (drive->select.all & 0x10) + out_le32((unsigned *)(IDE_DATA_REG + 0x200 + _IO_BASE), pmac_ide[i].timings[1]); + else + out_le32((unsigned *)(IDE_DATA_REG + 0x200 + _IO_BASE), pmac_ide[i].timings[0]); +} + +/* Number of IDE_SYSCLK_NS ticks, argument is in nanoseconds */ +#define SYSCLK_TICKS(t) (((t) + IDE_SYSCLK_NS - 1) / IDE_SYSCLK_NS) + +static void +pmac_ide_tuneproc(ide_drive_t *drive, byte pio) +{ + ide_pio_data_t d; + int i; + u32 *timings; + int accessTicks, recTicks; + + i = pmac_ide_find(drive); + if (i < 0) + return; + + /* The "ata-4" IDE controller of UMA machines is a bit different. + * We don't do anything for PIO modes until we know how to do the + * calculation. + */ + if (pmac_ide[i].kind == controller_kl_ata4) + return; + pio = ide_get_best_pio_mode(drive, pio, 4, &d); - switch (pio) { - case 4: - out_le32((unsigned *)(IDE_DATA_REG + 0x200 + _IO_BASE), 0x211025); - break; - default: - out_le32((unsigned *)(IDE_DATA_REG + 0x200 + _IO_BASE), 0x2f8526); - break; + accessTicks = SYSCLK_TICKS(ide_pio_timings[pio].active_time); + if (accessTicks < 4) + accessTicks = 4; + recTicks = SYSCLK_TICKS(d.cycle_time) - accessTicks - 4; + if (recTicks < 1) + recTicks = 1; + if (drive->select.all & 0x10) + timings = &pmac_ide[i].timings[1]; + else + timings = &pmac_ide[i].timings[0]; + + *timings = ((*timings) & 0xFFFFFF800) | accessTicks | (recTicks << 5); +#ifdef IDE_PMAC_DEBUG + printk("ide_pmac: Set PIO timing for mode %d, reg: 0x%08x\n", + pio, *timings); +#endif + + if (drive->select.all == IN_BYTE(IDE_SELECT_REG)) + pmac_ide_selectproc(drive); +} + +ide_ioreg_t +pmac_ide_get_base(int index) +{ + return pmac_ide[index].regbase; +} + +static int ide_majors[] = { 3, 22, 33, 34, 56, 57 }; + +kdev_t __init +pmac_find_ide_boot(char *bootdevice, int n) +{ + int i; + + /* + * Look through the list of IDE interfaces for this one. + */ + for (i = 0; i < pmac_ide_count; ++i) { + char *name; + if (!pmac_ide[i].node || !pmac_ide[i].node->full_name) + continue; + name = pmac_ide[i].node->full_name; + if (memcmp(name, bootdevice, n) == 0 && name[n] == 0) { + /* XXX should cope with the 2nd drive as well... 
*/ + return MKDEV(ide_majors[i], 0); + } } + + return 0; } -void __init pmac_ide_probe(void) +void __init +pmac_ide_probe(void) { struct device_node *np; int i; @@ -196,27 +363,70 @@ void __init pmac_ide_probe(void) } else { irq = np->intrs[0].line; } - pmac_ide_regbase[i] = base; - pmac_ide_irq[i] = irq; - pmac_ide_node[i] = np; + pmac_ide[i].regbase = base; + pmac_ide[i].irq = irq; + pmac_ide[i].node = np; + if (device_is_compatible(np, "keylargo-ata")) { + if (strcmp(np->name, "ata-4") == 0) + pmac_ide[i].kind = controller_kl_ata4; + else + pmac_ide[i].kind = controller_kl_ata3; + } else if (device_is_compatible(np, "heathrow-ata")) + pmac_ide[i].kind = controller_heathrow; + else + pmac_ide[i].kind = controller_ohare; if (np->parent && np->parent->name && strcasecmp(np->parent->name, "media-bay") == 0) { media_bay_set_ide_infos(np->parent,base,irq,i); - } else - feature_set(np, FEATURE_IDE_enable); + } else if (pmac_ide[i].kind == controller_ohare) { + /* The code below is having trouble on some ohare machines + * (timing related ?). Until I can put my hand on one of these + * units, I keep the old way + */ + feature_set(np, FEATURE_IDE0_enable); + } else { + /* This is necessary to enable IDE when net-booting */ + int *bidp = (int *)get_property(np, "AAPL,bus-id", NULL); + int bid = bidp ? *bidp : 0; + printk("pmac_ide: enabling IDE bus ID %d\n", bid); + switch(bid) { + case 0: + feature_set(np, FEATURE_IDE0_reset); + feature_set(np, FEATURE_IOBUS_enable); + mdelay(10); + feature_set(np, FEATURE_IDE0_enable); + mdelay(10); + feature_clear(np, FEATURE_IDE0_reset); + break; + case 1: + feature_set(np, FEATURE_Mediabay_IDE_reset); + mdelay(10); + feature_set(np, FEATURE_Mediabay_IDE_enable); + mdelay(10); + feature_clear(np, FEATURE_Mediabay_IDE_reset); + break; + case 2: + /* This one exists only for KL, I don't know about any + enable bit */ + feature_set(np, FEATURE_IDE2_reset); + mdelay(10); + feature_clear(np, FEATURE_IDE2_reset); + break; + } + mdelay(1000); + } hwif = &ide_hwifs[i]; pmac_ide_init_hwif_ports(&hwif->hw, base, 0, &hwif->irq); memcpy(hwif->io_ports, hwif->hw.io_ports, sizeof(hwif->io_ports)); hwif->chipset = ide_generic; hwif->noprobe = !hwif->io_ports[IDE_DATA_OFFSET]; - hwif->tuneproc = pmac_ide_tuneproc; #ifdef CONFIG_BLK_DEV_IDEDMA_PMAC if (np->n_addrs >= 2) { /* has a DBDMA controller channel */ - pmac_ide_setup_dma(np, hwif); + pmac_ide_setup_dma(np, i); } #endif /* CONFIG_BLK_DEV_IDEDMA_PMAC */ @@ -232,27 +442,28 @@ void __init pmac_ide_probe(void) #ifdef CONFIG_BLK_DEV_IDEDMA_PMAC static void __init -pmac_ide_setup_dma(struct device_node *np, ide_hwif_t *hwif) +pmac_ide_setup_dma(struct device_node *np, int ix) { - hwif->dma_base = (unsigned long) ioremap(np->addrs[1].address, 0x200); + pmac_ide[ix].dma_regs = + (volatile struct dbdma_regs*)ioremap(np->addrs[1].address, 0x200); /* * Allocate space for the DBDMA commands. * The +2 is +1 for the stop command and +1 to allow for * aligning the start address to a multiple of 16 bytes. 
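 *
 * (Sketch of that alignment, not in the patch:
 *	table = (struct dbdma_cmd *)(((unsigned long)p + 15) & ~15UL);
 * rounding up wastes at most 15 bytes, i.e. less than one 16-byte
 * dbdma_cmd slot, and the DBDMA stop command takes the other spare
 * slot, hence MAX_DCMDS + 2.)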
*/ - hwif->dmatable_cpu = (unsigned long *) + pmac_ide[ix].dma_table = (struct dbdma_cmd*) kmalloc((MAX_DCMDS + 2) * sizeof(struct dbdma_cmd), GFP_KERNEL); - if (hwif->dmatable_cpu == 0) { + if (pmac_ide[ix].dma_table == 0) { printk(KERN_ERR "%s: unable to allocate DMA command list\n", - hwif->name); + ide_hwifs[ix].name); return; } - hwif->dmaproc = &pmac_ide_dmaproc; -#ifdef CONFIG_IDEDMA_PMAC_AUTO - hwif->autodma = 1; -#endif /* CONFIG_IDEDMA_PMAC_AUTO */ + ide_hwifs[ix].dmaproc = &pmac_ide_dmaproc; +#ifdef CONFIG_PMAC_IDEDMA_AUTO + ide_hwifs[ix].autodma = 1; +#endif } /* @@ -260,19 +471,19 @@ pmac_ide_setup_dma(struct device_node *np, ide_hwif_t *hwif) * for a transfer and sets the DBDMA channel to point to it. */ static int -pmac_ide_build_dmatable(ide_drive_t *drive, int wr) +pmac_ide_build_dmatable(ide_drive_t *drive, int ix, int wr) { - ide_hwif_t *hwif = HWIF(drive); struct dbdma_cmd *table, *tstart; int count = 0; struct request *rq = HWGROUP(drive)->rq; struct buffer_head *bh = rq->bh; unsigned int size, addr; - volatile struct dbdma_regs *dma - = (volatile struct dbdma_regs *) hwif->dma_base; + volatile struct dbdma_regs *dma = pmac_ide[ix].dma_regs; - table = tstart = (struct dbdma_cmd *) DBDMA_ALIGN(hwif->dmatable_cpu); + table = tstart = (struct dbdma_cmd *) DBDMA_ALIGN(pmac_ide[ix].dma_table); out_le32(&dma->control, (RUN|PAUSE|FLUSH|WAKE|DEAD) << 16); + while (in_le32(&dma->status) & RUN) + udelay(1); do { /* @@ -335,30 +546,277 @@ pmac_ide_build_dmatable(ide_drive_t *drive, int wr) return 1; } + +/* This is fun. -DaveM */ +#define IDE_SETXFER 0x03 +#define IDE_SETFEATURE 0xef +#define IDE_DMA2_ENABLE 0x22 +#define IDE_DMA1_ENABLE 0x21 +#define IDE_DMA0_ENABLE 0x20 +#define IDE_UDMA4_ENABLE 0x44 +#define IDE_UDMA3_ENABLE 0x43 +#define IDE_UDMA2_ENABLE 0x42 +#define IDE_UDMA1_ENABLE 0x41 +#define IDE_UDMA0_ENABLE 0x40 + +static __inline__ unsigned char +dma_bits_to_command(unsigned char bits) +{ + if(bits & 0x04) + return IDE_DMA2_ENABLE; + if(bits & 0x02) + return IDE_DMA1_ENABLE; + return IDE_DMA0_ENABLE; +} + +static __inline__ unsigned char +udma_bits_to_command(unsigned char bits) +{ + if(bits & 0x10) + return IDE_UDMA4_ENABLE; + if(bits & 0x08) + return IDE_UDMA3_ENABLE; + if(bits & 0x04) + return IDE_UDMA2_ENABLE; + if(bits & 0x02) + return IDE_UDMA1_ENABLE; + if(bits & 0x01) + return IDE_UDMA0_ENABLE; + return 0; +} + +static __inline__ int +wait_for_ready(ide_drive_t *drive) +{ + /* Timeout bumped for some powerbooks */ + int timeout = 2000; + byte stat; + + while(--timeout) { + stat = GET_STAT(); + if(!(stat & BUSY_STAT)) { + if (drive->ready_stat == 0) + break; + else if((stat & drive->ready_stat) || (stat & ERR_STAT)) + break; + } + mdelay(1); + } + if((stat & ERR_STAT) || timeout <= 0) { + if (stat & ERR_STAT) { + printk("ide_pmace: wait_for_ready, error status: %x\n", stat); + } + return 1; + } + return 0; +} + +static int +pmac_ide_do_setfeature(ide_drive_t *drive, byte command) +{ + unsigned long flags; + byte old_select; + int result = 1; + + save_flags(flags); + cli(); + old_select = IN_BYTE(IDE_SELECT_REG); + OUT_BYTE(drive->select.all, IDE_SELECT_REG); + udelay(10); + OUT_BYTE(IDE_SETXFER, IDE_FEATURE_REG); + OUT_BYTE(command, IDE_NSECTOR_REG); + if(wait_for_ready(drive)) { + printk("pmac_ide_do_setfeature disk not ready before SET_FEATURE!\n"); + goto out; + } + OUT_BYTE(IDE_SETFEATURE, IDE_COMMAND_REG); + result = wait_for_ready(drive); + if (result) + printk("pmac_ide_do_setfeature disk not ready after SET_FEATURE !\n"); +out: + OUT_BYTE(old_select, 
IDE_SELECT_REG); + restore_flags(flags); + + return result; +} + +static int +pmac_ide_mdma_enable(ide_drive_t *drive, int idx) +{ + byte bits = drive->id->dma_mword & 0x07; + byte feature = dma_bits_to_command(bits); + u32 *timings; + int cycleTime, accessTime; + int accessTicks, recTicks; + struct hd_driveid *id = drive->id; + + /* For now, we don't know these values */ + if (pmac_ide[idx].kind == controller_kl_ata4 && feature != IDE_DMA2_ENABLE) + return 0; + if (pmac_ide[idx].kind != controller_kl_ata4 && feature == IDE_DMA0_ENABLE) + return 0; + + /* Set feature on drive */ + printk("%s: Enabling MultiWord DMA %d\n", drive->name, feature & 0xf); + if (pmac_ide_do_setfeature(drive, feature)) { + printk("%s: Failed !\n", drive->name); + return 0; + } + + /* which drive is it ? */ + if (drive->select.all & 0x10) + timings = &pmac_ide[idx].timings[1]; + else + timings = &pmac_ide[idx].timings[0]; + + /* Calculate accesstime and cycle time */ + cycleTime = mdma_timings[feature & 0xf].cycleTime; + accessTime = mdma_timings[feature & 0xf].accessTime; + if ((id->field_valid & 2) && (id->eide_dma_time)) + cycleTime = id->eide_dma_time; + if ((pmac_ide[idx].kind == controller_ohare) && (cycleTime < 150)) + cycleTime = 150; + + /* For ata-4 controller, we don't know the calculation */ + if (pmac_ide[idx].kind == controller_kl_ata4) { + *timings = 0x00019465; /* MDMA2 */ + } else { + int halfTick = 0; + int origAccessTime = accessTime; + int origCycleTime = cycleTime; + + accessTicks = SYSCLK_TICKS(accessTime); + if (accessTicks < 1) + accessTicks = 1; + accessTime = accessTicks * IDE_SYSCLK_NS; + recTicks = SYSCLK_TICKS(cycleTime - accessTime) - 1; + if (recTicks < 1) + recTicks = 1; + cycleTime = (recTicks + 1 + accessTicks) * IDE_SYSCLK_NS; + + if ((accessTicks > 1) && + ((accessTime - IDE_SYSCLK_NS/2) >= origAccessTime) && + ((cycleTime - IDE_SYSCLK_NS) >= origCycleTime)) { + halfTick = 1; + accessTicks--; + } + *timings = ((*timings) & 0x7FF) | + (accessTicks | (recTicks << 5) | (halfTick << 10)) << 11; + } +#ifdef IDE_PMAC_DEBUG + printk("ide_pmac: Set MDMA timing for mode %d, reg: 0x%08x\n", + feature & 0xf, *timings); +#endif + return 1; +} + +static int +pmac_ide_udma_enable(ide_drive_t *drive, int idx) +{ + byte bits = drive->id->dma_ultra & 0x1f; + byte feature = udma_bits_to_command(bits); + u32 timings; + + /* We support only those values */ + if (feature != IDE_UDMA4_ENABLE && feature != IDE_UDMA2_ENABLE) + return 0; + + /* Set feature on drive */ + printk("%s: Enabling Ultra DMA %d\n", drive->name, feature & 0xf); + if (pmac_ide_do_setfeature(drive, feature)) { + printk("%s: Failed !\n", drive->name); + return 0; + } + + /* Put this channel into UDMA mode. 
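(For reference, not part of the patch: the IDE_*_ENABLE values fed to SET FEATURES here follow the ATA "set transfer mode" encoding, class in the high nibble, mode number in the low bits. Hypothetical helpers matching the defines above:)

	static inline unsigned char mdma_mode_word(int mode)	/* 0..2 */
	{
		return 0x20 | mode;	/* IDE_DMA0..2_ENABLE */
	}

	static inline unsigned char udma_mode_word(int mode)	/* 0..4 */
	{
		return 0x40 | mode;	/* IDE_UDMA0..4_ENABLE */
	}
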
+ * This value is set by MacOS on the iBook for U/DMA2 + */ + switch(feature) { + case IDE_UDMA4_ENABLE: + timings = 0x0cd00065; + break; + case IDE_UDMA2_ENABLE: + timings = 0x11100065; + break; + } + + if (drive->select.all & 0x10) + pmac_ide[idx].timings[1] = timings; + else + pmac_ide[idx].timings[0] = timings; + + return 1; +} + +static int +pmac_ide_dma_onoff(ide_drive_t *drive, int enable) +{ + int ata4, udma, idx; + struct hd_driveid *id = drive->id; + + drive->using_dma = 0; + + idx = pmac_ide_find(drive); + if (idx < 0) + return 0; + + if (drive->media == ide_floppy) + enable = 0; + if (((id->capability & 1) == 0) && !check_drive_lists(drive, GOOD_DMA_DRIVE)) + enable = 0; + if (check_drive_lists(drive, BAD_DMA_DRIVE)) + enable = 0; + + udma = 0; + ata4 = (pmac_ide[idx].kind == controller_kl_ata4); + + if(enable) { + if (ata4 && (drive->media == ide_disk) && + (id->field_valid & 0x0004) && (id->dma_ultra & 0x17)) { + /* UltraDMA modes. */ + drive->using_dma = pmac_ide_udma_enable(drive, idx); + } + if (!drive->using_dma && (id->dma_mword & 0x0007)) { + /* Normal MultiWord DMA modes. */ + drive->using_dma = pmac_ide_mdma_enable(drive, idx); + } + /* Without this, strange things will happen on Keylargo-based + * machines + */ + OUT_BYTE(0, IDE_CONTROL_REG); + if (drive->select.all == IN_BYTE(IDE_SELECT_REG)) + pmac_ide_selectproc(drive); + } + return 0; +} + int pmac_ide_dmaproc(ide_dma_action_t func, ide_drive_t *drive) { ide_hwif_t *hwif = HWIF(drive); - volatile struct dbdma_regs *dma - = (volatile struct dbdma_regs *) hwif->dma_base; - int dstat; + int ix, dstat; + volatile struct dbdma_regs *dma; + + /* Can we stuff a pointer to our intf structure in config_data + * or select_data in hwif ? + */ + ix = pmac_ide_find(drive); + if (ix < 0) + return 0; + dma = pmac_ide[ix].dma_regs; switch (func) { case ide_dma_on: - /* ide-floppy DMA doesn't work yet... */ - drive->using_dma = drive->media != ide_floppy; - break; case ide_dma_off: - printk(KERN_INFO "%s: DMA disabled\n", drive->name); case ide_dma_off_quietly: - drive->using_dma = 0; + pmac_ide_dma_onoff(drive, (func == ide_dma_on)); break; case ide_dma_check: - /* ide-floppy DMA doesn't work yet... 
*/ - drive->using_dma = hwif->autodma && drive->media != ide_floppy; + if (hwif->autodma) + pmac_ide_dma_onoff(drive, 1); break; case ide_dma_read: case ide_dma_write: - if (!pmac_ide_build_dmatable(drive, func==ide_dma_write)) + if (!pmac_ide_build_dmatable(drive, ix, func==ide_dma_write)) return 1; drive->waiting_for_dma = 1; if (drive->media != ide_disk) @@ -387,11 +845,9 @@ int pmac_ide_dmaproc(ide_dma_action_t func, ide_drive_t *drive) #ifdef CONFIG_PMAC_PBOOK static void idepmac_sleep_disk(int i, unsigned long base) { + struct device_node* np = pmac_ide[i].node; int j; - /* Reset to PIO 0 */ - out_le32((unsigned *)(base + 0x200 + _IO_BASE), 0x2f8526); - /* FIXME: We only handle the master IDE */ if (ide_hwifs[i].drives[0].media == ide_disk) { /* Spin down the drive */ @@ -410,23 +866,30 @@ static void idepmac_sleep_disk(int i, unsigned long base) break; } } + feature_set(np, FEATURE_IDE0_reset); + feature_clear(np, FEATURE_IOBUS_enable); + feature_clear(np, FEATURE_IDE0_enable); + pmac_ide[i].timings[0] = 0; + pmac_ide[i].timings[1] = 0; } static void idepmac_wake_disk(int i, unsigned long base) { + struct device_node* np = pmac_ide[i].node; int j; /* Revive IDE disk and controller */ - feature_set(pmac_ide_node[i], FEATURE_IDE_enable); - mdelay(1); - feature_set(pmac_ide_node[i], FEATURE_IDE_DiskPower); - mdelay(100); - feature_set(pmac_ide_node[i], FEATURE_IDE_Reset); - mdelay(1); - /* Make sure we are still PIO0 */ - out_le32((unsigned *)(base + 0x200 + _IO_BASE), 0x2f8526); + feature_set(np, FEATURE_IOBUS_enable); + mdelay(10); + feature_set(np, FEATURE_IDE0_enable); + mdelay(10); + feature_clear(np, FEATURE_IDE0_reset); mdelay(100); + /* Reset timings */ + pmac_ide_selectproc(&ide_hwifs[i].drives[0]); + mdelay(10); + /* Wait up to 10 seconds (enough for recent drives) */ for (j = 0; j < 100; j++) { int status; @@ -443,14 +906,22 @@ idepmac_wake_bay(int i, unsigned long base) { int timeout; - timeout = 5000; + /* Reset timings */ + pmac_ide_selectproc(&ide_hwifs[i].drives[0]); + mdelay(10); + + timeout = 10000; while ((inb(base + 0x70) & BUSY_STAT) && timeout) { mdelay(1); --timeout; } } -static int idepmac_notify(struct pmu_sleep_notifier *self, int when) +/* Note: We support only master drives for now. 
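(Aside, not part of the patch: these sleep/wake helpers are driven through the pmu_sleep_notifier declared near the top of this file. Its shape, as a sketch with a hypothetical body:)

	static int my_notify_sleep(struct pmu_sleep_notifier *self, int when)
	{
		switch (when) {
		case PBOOK_SLEEP_NOW:
			/* quiesce: disable the irq, spin down or reset the disk */
			break;
		case PBOOK_WAKE:
			/* reprogram timings, wake the drive, re-enable irq/DMA */
			break;
		}
		return 0;
	}

	static struct pmu_sleep_notifier my_sleep_notifier = {
		my_notify_sleep, SLEEP_LEVEL_BLOCK,
	};
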
This will have to be + * improved if we want to handle sleep on the iMacDV where the CD-ROM + * is a slave + */ +static int idepmac_notify_sleep(struct pmu_sleep_notifier *self, int when) { int i, ret; unsigned long base; @@ -462,10 +933,10 @@ static int idepmac_notify(struct pmu_sleep_notifier *self, int when) break; case PBOOK_SLEEP_NOW: for (i = 0; i < pmac_ide_count; ++i) { - if ((base = pmac_ide_regbase[i]) == 0) + if ((base = pmac_ide[i].regbase) == 0) continue; /* Disable irq during sleep */ - disable_irq(pmac_ide_irq[i]); + disable_irq(pmac_ide[i].irq); ret = check_media_bay_by_base(base, MB_CD); if (ret == -ENODEV) /* not media bay - put the disk to sleep */ @@ -474,15 +945,22 @@ static int idepmac_notify(struct pmu_sleep_notifier *self, int when) break; case PBOOK_WAKE: for (i = 0; i < pmac_ide_count; ++i) { - if ((base = pmac_ide_regbase[i]) == 0) + ide_hwif_t *hwif; + if ((base = pmac_ide[i].regbase) == 0) continue; + hwif = &ide_hwifs[i]; /* We don't handle media bay devices this way */ ret = check_media_bay_by_base(base, MB_CD); if (ret == -ENODEV) idepmac_wake_disk(i, base); else if (ret == 0) idepmac_wake_bay(i, base); - enable_irq(pmac_ide_irq[i]); + enable_irq(pmac_ide[i].irq); + +#ifdef CONFIG_BLK_DEV_IDEDMA_PMAC + if (hwif->drives[0].present && hwif->drives[0].using_dma) + pmac_ide_dma_onoff(&hwif->drives[0], 1); +#endif } break; } diff --git a/drivers/block/ide-probe.c b/drivers/block/ide-probe.c index 1c6f19eba..33ca2900b 100644 --- a/drivers/block/ide-probe.c +++ b/drivers/block/ide-probe.c @@ -117,8 +117,16 @@ static inline void do_identify (ide_drive_t *drive, byte cmd) } type = ide_cdrom; /* Early cdrom models used zero */ case ide_cdrom: - printk ("CDROM"); drive->removable = 1; +#ifdef CONFIG_PPC + /* kludge for Apple PowerBook internal zip */ + if (!strstr(id->model, "CD-ROM") && strstr(id->model, "ZIP")) { + printk ("FLOPPY"); + type = ide_floppy; + break; + } +#endif + printk ("CDROM"); break; case ide_tape: printk ("TAPE"); diff --git a/drivers/block/ide-tape.c b/drivers/block/ide-tape.c index cba18bced..9d2bc216f 100644 --- a/drivers/block/ide-tape.c +++ b/drivers/block/ide-tape.c @@ -4372,12 +4372,12 @@ static ssize_t idetape_chrdev_read (struct file *file, char *buf, return -ENXIO; } if (tape->onstream && (count != tape->tape_block_size)) { - printk(KERN_ERR "ide-tape: %s: use %d bytes as block size (%d used)\n", tape->name, tape->tape_block_size, count); + printk(KERN_ERR "ide-tape: %s: use %d bytes as block size (%Zd used)\n", tape->name, tape->tape_block_size, count); return -EINVAL; } #if IDETAPE_DEBUG_LOG if (tape->debug_level >= 3) - printk (KERN_INFO "ide-tape: Reached idetape_chrdev_read, count %d\n", count); + printk (KERN_INFO "ide-tape: Reached idetape_chrdev_read, count %Zd\n", count); #endif /* IDETAPE_DEBUG_LOG */ if (tape->chrdev_direction != idetape_direction_read) { @@ -4552,12 +4552,12 @@ static ssize_t idetape_chrdev_write (struct file *file, const char *buf, return -ENXIO; } if (tape->onstream && (count != tape->tape_block_size)) { - printk(KERN_ERR "ide-tape: %s: use %d bytes as block size (%d used)\n", tape->name, tape->tape_block_size, count); + printk(KERN_ERR "ide-tape: %s: use %d bytes as block size (%Zd used)\n", tape->name, tape->tape_block_size, count); return -EINVAL; } #if IDETAPE_DEBUG_LOG if (tape->debug_level >= 3) - printk (KERN_INFO "ide-tape: Reached idetape_chrdev_write, count %d\n", count); + printk (KERN_INFO "ide-tape: Reached idetape_chrdev_write, count %Zd\n", count); #endif /* IDETAPE_DEBUG_LOG */ if 
(tape->chrdev_direction != idetape_direction_write) { /* Initialize write operation */ @@ -5839,18 +5839,11 @@ static ide_module_t idetape_module = { * Our character device supporting functions, passed to register_chrdev. */ static struct file_operations idetape_fops = { - NULL, /* lseek - default */ - idetape_chrdev_read, /* read */ - idetape_chrdev_write, /* write */ - NULL, /* readdir - bad */ - NULL, /* poll */ - idetape_chrdev_ioctl, /* ioctl */ - NULL, /* mmap */ - idetape_chrdev_open, /* open */ - NULL, /* flush */ - idetape_chrdev_release, /* release */ - NULL, /* fsync */ - NULL, /* fasync */ + read: idetape_chrdev_read, + write: idetape_chrdev_write, + ioctl: idetape_chrdev_ioctl, + open: idetape_chrdev_open, + release: idetape_chrdev_release, }; /* diff --git a/drivers/block/ide.c b/drivers/block/ide.c index 396369651..06e1bbcc6 100644 --- a/drivers/block/ide.c +++ b/drivers/block/ide.c @@ -861,7 +861,7 @@ static void try_to_flush_leftover_data (ide_drive_t *drive) if (drive->media != ide_disk) return; while (i > 0) { - unsigned long buffer[16]; + u32 buffer[16]; unsigned int wcount = (i > 16) ? 16 : i; i -= wcount; ide_input_data (drive, buffer, wcount); diff --git a/drivers/block/linear.c b/drivers/block/linear.c index 3993417d0..1c3305bae 100644 --- a/drivers/block/linear.c +++ b/drivers/block/linear.c @@ -1,4 +1,3 @@ - /* linear.c : Multiple Devices driver for Linux Copyright (C) 1994-96 Marc ZYNGIER @@ -19,186 +18,204 @@ #include <linux/module.h> -#include <linux/md.h> +#include <linux/raid/md.h> #include <linux/malloc.h> -#include <linux/init.h> -#include "linear.h" +#include <linux/raid/linear.h> #define MAJOR_NR MD_MAJOR #define MD_DRIVER #define MD_PERSONALITY -static int linear_run (int minor, struct md_dev *mddev) +static int linear_run (mddev_t *mddev) { - int cur=0, i, size, dev0_size, nb_zone; - struct linear_data *data; - - MOD_INC_USE_COUNT; - - mddev->private=kmalloc (sizeof (struct linear_data), GFP_KERNEL); - data=(struct linear_data *) mddev->private; - - /* - Find out the smallest device. This was previously done - at registry time, but since it violates modularity, - I moved it here... Any comment ? ;-) - */ - - data->smallest=mddev->devices; - for (i=1; i<mddev->nb_dev; i++) - if (data->smallest->size > mddev->devices[i].size) - data->smallest=mddev->devices+i; - - nb_zone=data->nr_zones= - md_size[minor]/data->smallest->size + - (md_size[minor]%data->smallest->size ? 1 : 0); + linear_conf_t *conf; + struct linear_hash *table; + mdk_rdev_t *rdev; + int size, i, j, nb_zone; + unsigned int curr_offset; + + MOD_INC_USE_COUNT; + + conf = kmalloc (sizeof (*conf), GFP_KERNEL); + if (!conf) + goto out; + mddev->private = conf; + + if (md_check_ordering(mddev)) { + printk("linear: disks are not ordered, aborting!\n"); + goto out; + } + /* + * Find the smallest device. + */ + + conf->smallest = NULL; + curr_offset = 0; + ITERATE_RDEV_ORDERED(mddev,rdev,j) { + dev_info_t *disk = conf->disks + j; + + disk->dev = rdev->dev; + disk->size = rdev->size; + disk->offset = curr_offset; + + curr_offset += disk->size; + + if (!conf->smallest || (disk->size < conf->smallest->size)) + conf->smallest = disk; + } + + nb_zone = conf->nr_zones = + md_size[mdidx(mddev)] / conf->smallest->size + + ((md_size[mdidx(mddev)] % conf->smallest->size) ? 
1 : 0); - data->hash_table=kmalloc (sizeof (struct linear_hash)*nb_zone, GFP_KERNEL); - - size=mddev->devices[cur].size; - - i=0; - while (cur<mddev->nb_dev) - { - data->hash_table[i].dev0=mddev->devices+cur; - - if (size>=data->smallest->size) /* If we completely fill the slot */ - { - data->hash_table[i++].dev1=NULL; - size-=data->smallest->size; - - if (!size) - { - if (++cur==mddev->nb_dev) continue; - size=mddev->devices[cur].size; - } - - continue; - } - - if (++cur==mddev->nb_dev) /* Last dev, set dev1 as NULL */ - { - data->hash_table[i].dev1=NULL; - continue; - } - - dev0_size=size; /* Here, we use a 2nd dev to fill the slot */ - size=mddev->devices[cur].size; - data->hash_table[i++].dev1=mddev->devices+cur; - size-=(data->smallest->size - dev0_size); - } - - return 0; + conf->hash_table = kmalloc (sizeof (struct linear_hash) * nb_zone, + GFP_KERNEL); + if (!conf->hash_table) + goto out; + + /* + * Here we generate the linear hash table + */ + table = conf->hash_table; + i = 0; + size = 0; + for (j = 0; j < mddev->nb_dev; j++) { + dev_info_t *disk = conf->disks + j; + + if (size < 0) { + table->dev1 = disk; + table++; + } + size += disk->size; + + while (size) { + table->dev0 = disk; + size -= conf->smallest->size; + if (size < 0) + break; + table->dev1 = NULL; + table++; + } + } + table->dev1 = NULL; + + return 0; + +out: + if (conf) + kfree(conf); + MOD_DEC_USE_COUNT; + return 1; } -static int linear_stop (int minor, struct md_dev *mddev) +static int linear_stop (mddev_t *mddev) { - struct linear_data *data=(struct linear_data *) mddev->private; + linear_conf_t *conf = mddev_to_conf(mddev); - kfree (data->hash_table); - kfree (data); + kfree(conf->hash_table); + kfree(conf); - MOD_DEC_USE_COUNT; + MOD_DEC_USE_COUNT; - return 0; + return 0; } - -static int linear_map (struct md_dev *mddev, kdev_t *rdev, - unsigned long *rsector, unsigned long size) +static int linear_make_request (mddev_t *mddev, int rw, struct buffer_head * bh) { - struct linear_data *data=(struct linear_data *) mddev->private; - struct linear_hash *hash; - struct real_dev *tmp_dev; - long block; + linear_conf_t *conf = mddev_to_conf(mddev); + struct linear_hash *hash; + dev_info_t *tmp_dev; + long block; - block=*rsector >> 1; - hash=data->hash_table+(block/data->smallest->size); + block = bh->b_blocknr * (bh->b_size >> 10); + hash = conf->hash_table + (block / conf->smallest->size); - if (block >= (hash->dev0->size + hash->dev0->offset)) - { - if (!hash->dev1) - { - printk ("linear_map : hash->dev1==NULL for block %ld\n", block); - return (-1); - } - - tmp_dev=hash->dev1; - } - else - tmp_dev=hash->dev0; + if (block >= (hash->dev0->size + hash->dev0->offset)) { + if (!hash->dev1) { + printk ("linear_make_request : hash->dev1==NULL for block %ld\n", + block); + return -1; + } + tmp_dev = hash->dev1; + } else + tmp_dev = hash->dev0; - if (block >= (tmp_dev->size + tmp_dev->offset) || block < tmp_dev->offset) - printk ("Block %ld out of bounds on dev %s size %d offset %d\n", - block, kdevname(tmp_dev->dev), tmp_dev->size, tmp_dev->offset); - - *rdev=tmp_dev->dev; - *rsector=(block-(tmp_dev->offset)) << 1; - - return (0); + if (block >= (tmp_dev->size + tmp_dev->offset) + || block < tmp_dev->offset) { + printk ("linear_make_request: Block %ld out of bounds on dev %s size %d offset %d\n", block, kdevname(tmp_dev->dev), tmp_dev->size, tmp_dev->offset); + return -1; + } + bh->b_rdev = tmp_dev->dev; + bh->b_rsector = (block - tmp_dev->offset) << 1; + + generic_make_request(rw, bh); + return 0; } -static int 
linear_status (char *page, int minor, struct md_dev *mddev) +static int linear_status (char *page, mddev_t *mddev) { - int sz=0; + int sz = 0; #undef MD_DEBUG #ifdef MD_DEBUG - int j; - struct linear_data *data=(struct linear_data *) mddev->private; + int j; + linear_conf_t *conf = mddev_to_conf(mddev); - sz+=sprintf (page+sz, " "); - for (j=0; j<data->nr_zones; j++) - { - sz+=sprintf (page+sz, "[%s", - partition_name (data->hash_table[j].dev0->dev)); - - if (data->hash_table[j].dev1) - sz+=sprintf (page+sz, "/%s] ", - partition_name(data->hash_table[j].dev1->dev)); - else - sz+=sprintf (page+sz, "] "); - } - - sz+=sprintf (page+sz, "\n"); + sz += sprintf(page+sz, " "); + for (j = 0; j < conf->nr_zones; j++) + { + sz += sprintf(page+sz, "[%s", + partition_name(conf->hash_table[j].dev0->dev)); + + if (conf->hash_table[j].dev1) + sz += sprintf(page+sz, "/%s] ", + partition_name(conf->hash_table[j].dev1->dev)); + else + sz += sprintf(page+sz, "] "); + } + sz += sprintf(page+sz, "\n"); #endif - sz+=sprintf (page+sz, " %dk rounding", 1<<FACTOR_SHIFT(FACTOR(mddev))); - return sz; + sz += sprintf(page+sz, " %dk rounding", mddev->param.chunk_size/1024); + return sz; } -static struct md_personality linear_personality= +static mdk_personality_t linear_personality= { - "linear", - linear_map, - NULL, - NULL, - linear_run, - linear_stop, - linear_status, - NULL, /* no ioctls */ - 0 + "linear", + NULL, + linear_make_request, + NULL, + linear_run, + linear_stop, + linear_status, + NULL, + 0, + NULL, + NULL, + NULL, + NULL }; - #ifndef MODULE -void __init linear_init (void) +void md__init linear_init (void) { - register_md_personality (LINEAR, &linear_personality); + register_md_personality (LINEAR, &linear_personality); } #else int init_module (void) { - return (register_md_personality (LINEAR, &linear_personality)); + return (register_md_personality (LINEAR, &linear_personality)); } void cleanup_module (void) { - unregister_md_personality (LINEAR); + unregister_md_personality (LINEAR); } #endif + diff --git a/drivers/block/linear.h b/drivers/block/linear.h deleted file mode 100644 index 1146d8329..000000000 --- a/drivers/block/linear.h +++ /dev/null @@ -1,16 +0,0 @@ -#ifndef _LINEAR_H -#define _LINEAR_H - -struct linear_hash -{ - struct real_dev *dev0, *dev1; -}; - -struct linear_data -{ - struct linear_hash *hash_table; /* Dynamically allocated */ - struct real_dev *smallest; - int nr_zones; -}; - -#endif diff --git a/drivers/block/ll_rw_blk.c b/drivers/block/ll_rw_blk.c index 731a2aece..3ed507694 100644 --- a/drivers/block/ll_rw_blk.c +++ b/drivers/block/ll_rw_blk.c @@ -23,6 +23,7 @@ #include <asm/io.h> #include <linux/blk.h> #include <linux/highmem.h> +#include <linux/raid/md.h> #include <linux/module.h> @@ -65,9 +66,9 @@ spinlock_t io_request_lock = SPIN_LOCK_UNLOCKED; */ DECLARE_WAIT_QUEUE_HEAD(wait_for_request); -/* This specifies how many sectors to read ahead on the disk. */ +/* This specifies how many sectors to read ahead on the disk. */ -int read_ahead[MAX_BLKDEV] = {0, }; +int read_ahead[MAX_BLKDEV]; /* blk_dev_struct is: * *request_fn @@ -83,7 +84,7 @@ struct blk_dev_struct blk_dev[MAX_BLKDEV]; /* initialized by blk_dev_init() */ * * if (!blk_size[MAJOR]) then no minor size checking is done. */ -int * blk_size[MAX_BLKDEV] = { NULL, NULL, }; +int * blk_size[MAX_BLKDEV]; /* * blksize_size contains the size of all block-devices: @@ -92,7 +93,7 @@ int * blk_size[MAX_BLKDEV] = { NULL, NULL, }; * * if (!blksize_size[MAJOR]) then 1024 bytes is assumed. 
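* * A rough illustration of that lookup (illustrative variable names, not part of this patch): * * int bs = BLOCK_SIZE; * if (blksize_size[MAJOR(dev)] && blksize_size[MAJOR(dev)][MINOR(dev)]) * bs = blksize_size[MAJOR(dev)][MINOR(dev)]; * * which is essentially how __ll_rw_block() later in this file derives correct_size.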
*/ -int * blksize_size[MAX_BLKDEV] = { NULL, NULL, }; +int * blksize_size[MAX_BLKDEV]; /* * hardsect_size contains the size of the hardware sector of a device. @@ -106,17 +107,17 @@ int * blksize_size[MAX_BLKDEV] = { NULL, NULL, }; * This is currently set by some scsi devices and read by the msdos fs driver. * Other uses may appear later. */ -int * hardsect_size[MAX_BLKDEV] = { NULL, NULL, }; +int * hardsect_size[MAX_BLKDEV]; /* * The following tunes the read-ahead algorithm in mm/filemap.c */ -int * max_readahead[MAX_BLKDEV] = { NULL, NULL, }; +int * max_readahead[MAX_BLKDEV]; /* * Max number of sectors per request */ -int * max_sectors[MAX_BLKDEV] = { NULL, NULL, }; +int * max_sectors[MAX_BLKDEV]; static inline int get_max_sectors(kdev_t dev) { @@ -126,18 +127,24 @@ static inline int get_max_sectors(kdev_t dev) } /* - * Is called with the request spinlock aquired. * NOTE: the device-specific queue() functions * have to be atomic! */ -static inline request_queue_t *get_queue(kdev_t dev) +request_queue_t * blk_get_queue (kdev_t dev) { int major = MAJOR(dev); struct blk_dev_struct *bdev = blk_dev + major; + unsigned long flags; + request_queue_t *ret; + spin_lock_irqsave(&io_request_lock,flags); if (bdev->queue) - return bdev->queue(dev); - return &blk_dev[major].request_queue; + ret = bdev->queue(dev); + else + ret = &blk_dev[major].request_queue; + spin_unlock_irqrestore(&io_request_lock,flags); + + return ret; } void blk_cleanup_queue(request_queue_t * q) @@ -147,12 +154,17 @@ void blk_cleanup_queue(request_queue_t * q) void blk_queue_headactive(request_queue_t * q, int active) { - q->head_active = active; + q->head_active = active; +} + +void blk_queue_pluggable (request_queue_t * q, plug_device_fn *plug) +{ + q->plug_device_fn = plug; } -void blk_queue_pluggable(request_queue_t * q, int use_plug) +void blk_queue_make_request(request_queue_t * q, make_request_fn * mfn) { - q->use_plug = use_plug; + q->make_request_fn = mfn; } static int ll_merge_fn(request_queue_t *q, struct request *req, @@ -185,42 +197,23 @@ static int ll_merge_requests_fn(request_queue_t *q, struct request *req, void blk_init_queue(request_queue_t * q, request_fn_proc * rfn) { - q->request_fn = rfn; + q->request_fn = rfn; q->current_request = NULL; - q->merge_fn = ll_merge_fn; + q->merge_fn = ll_merge_fn; q->merge_requests_fn = ll_merge_requests_fn; - q->plug_tq.sync = 0; - q->plug_tq.routine = unplug_device; - q->plug_tq.data = q; - q->plugged = 0; + q->make_request_fn = NULL; + q->plug_tq.sync = 0; + q->plug_tq.routine = &generic_unplug_device; + q->plug_tq.data = q; + q->plugged = 0; /* * These booleans describe the queue properties. We set the * default (and most common) values here. Other drivers can * use the appropriate functions to alter the queue properties. * as appropriate. */ - q->use_plug = 1; - q->head_active = 1; -} - -/* - * remove the plug and let it rip.. - */ -void unplug_device(void * data) -{ - request_queue_t * q = (request_queue_t *) data; - unsigned long flags; - - spin_lock_irqsave(&io_request_lock,flags); - if( q->plugged ) - { - q->plugged = 0; - if( q->current_request != NULL ) - { - (q->request_fn)(q); - } - } - spin_unlock_irqrestore(&io_request_lock,flags); + q->plug_device_fn = NULL; + q->head_active = 1; } /* @@ -231,8 +224,12 @@ void unplug_device(void * data) * This is called with interrupts off and no requests on the queue. 
* (and with the request spinlock aquired) */ -static inline void plug_device(request_queue_t * q) +inline void generic_plug_device (request_queue_t *q, kdev_t dev) { + if (MAJOR(dev) == MD_MAJOR) { + spin_unlock_irq(&io_request_lock); + BUG(); + } if (q->current_request) return; @@ -241,6 +238,23 @@ static inline void plug_device(request_queue_t * q) } /* + * remove the plug and let it rip.. + */ +void generic_unplug_device(void * data) +{ + request_queue_t * q = (request_queue_t *) data; + unsigned long flags; + + spin_lock_irqsave(&io_request_lock,flags); + if (q->plugged) { + q->plugged = 0; + if (q->current_request) + (q->request_fn)(q); + } + spin_unlock_irqrestore(&io_request_lock,flags); +} + +/* * look for a free request in the first N entries. * NOTE: interrupts must be disabled on the way in (on SMP the request queue * spinlock has to be aquired), and will still be disabled on the way out. @@ -337,7 +351,7 @@ void set_device_ro(kdev_t dev,int flag) } static inline void drive_stat_acct(struct request *req, - unsigned long nr_sectors, int new_io) + unsigned long nr_sectors, int new_io) { int major = MAJOR(req->rq_dev); int minor = MINOR(req->rq_dev); @@ -384,23 +398,17 @@ static inline void drive_stat_acct(struct request *req, * which is important for drive_stat_acct() above. */ -static void add_request(request_queue_t * q, struct request * req) +static inline void __add_request(request_queue_t * q, struct request * req) { int major = MAJOR(req->rq_dev); struct request * tmp; - unsigned long flags; drive_stat_acct(req, req->nr_sectors, 1); req->next = NULL; - /* - * We use the goto to reduce locking complexity - */ - spin_lock_irqsave(&io_request_lock,flags); - if (!(tmp = q->current_request)) { q->current_request = req; - goto out; + return; } for ( ; tmp->next ; tmp = tmp->next) { const int after_current = IN_ORDER(tmp,req); @@ -420,7 +428,7 @@ static void add_request(request_queue_t * q, struct request * req) /* * FIXME(eric) I don't understand why there is a need for this * special case code. It clearly doesn't fit any more with - * the new queueing architecture, and it got added in 2.3.10. + * the new queueing architecture, and it got added in 2.3.10. * I am leaving this in here until I hear back from the COMPAQ * people. */ @@ -433,16 +441,13 @@ static void add_request(request_queue_t * q, struct request * req) { (q->request_fn)(q); } - -out: - spin_unlock_irqrestore(&io_request_lock,flags); } /* * Has to be called with the request spinlock aquired */ static inline void attempt_merge (request_queue_t * q, - struct request *req, + struct request *req, int max_sectors) { struct request *next = req->next; @@ -453,7 +458,6 @@ static inline void attempt_merge (request_queue_t * q, return; if (next->sem || req->cmd != next->cmd || req->rq_dev != next->rq_dev || req->nr_sectors + next->nr_sectors > max_sectors) return; - /* * If we are not allowed to merge these requests, then * return. 
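* (That decision is delegated to the queue's merge_requests_fn hook, ll_merge_requests_fn by default, so a driver can veto the merge.)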
If we are allowed to merge, then the count @@ -471,11 +475,10 @@ static inline void attempt_merge (request_queue_t * q, wake_up (&wait_for_request); } -static void __make_request(request_queue_t * q, - int major, - int rw, +static inline void __make_request(request_queue_t * q, int rw, struct buffer_head * bh) { + int major = MAJOR(bh->b_rdev); unsigned int sector, count; struct request * req; int rw_ahead, max_req, max_sectors; @@ -484,28 +487,22 @@ static void __make_request(request_queue_t * q, count = bh->b_size >> 9; sector = bh->b_rsector; - /* It had better not be a new buffer by the time we see it */ - if (buffer_new(bh)) - BUG(); - - /* Only one thread can actually submit the I/O. */ - if (test_and_set_bit(BH_Lock, &bh->b_state)) - return; - if (blk_size[major]) { unsigned long maxsector = (blk_size[major][MINOR(bh->b_rdev)] << 1) + 1; if (maxsector < count || maxsector - count < sector) { bh->b_state &= (1 << BH_Lock) | (1 << BH_Mapped); - /* This may well happen - the kernel calls bread() - without checking the size of the device, e.g., - when mounting a device. */ + if (!blk_size[major][MINOR(bh->b_rdev)]) + goto end_io; + /* This may well happen - the kernel calls bread() + without checking the size of the device, e.g., + when mounting a device. */ printk(KERN_INFO - "attempt to access beyond end of device\n"); + "attempt to access beyond end of device\n"); printk(KERN_INFO "%s: rw=%d, want=%d, limit=%d\n", - kdevname(bh->b_rdev), rw, - (sector + count)>>1, - blk_size[major][MINOR(bh->b_rdev)]); + kdevname(bh->b_rdev), rw, + (sector + count)>>1, + blk_size[major][MINOR(bh->b_rdev)]); goto end_io; } } @@ -539,8 +536,7 @@ static void __make_request(request_queue_t * q, max_req = (NR_REQUEST * 2) / 3; break; default: - printk(KERN_ERR "make_request: bad block dev cmd," - " must be R/W/RA/WA\n"); + BUG(); goto end_io; } @@ -561,10 +557,12 @@ static void __make_request(request_queue_t * q, #endif /* look for a free request. */ - /* Loop uses two requests, 1 for loop and 1 for the real device. - * Cut max_req in half to avoid running out and deadlocking. */ + /* + * Loop uses two requests, 1 for loop and 1 for the real device. + * Cut max_req in half to avoid running out and deadlocking. + */ if ((major == LOOP_MAJOR) || (major == NBD_MAJOR)) - max_req >>= 1; + max_req >>= 1; /* * Try to coalesce the new request with old requests @@ -579,10 +577,10 @@ static void __make_request(request_queue_t * q, req = q->current_request; if (!req) { /* MD and loop can't handle plugging without deadlocking */ - if (major != MD_MAJOR && major != LOOP_MAJOR && - major != DDV_MAJOR && major != NBD_MAJOR - && q->use_plug) - plug_device(q); /* is atomic */ + if (q->plug_device_fn) + q->plug_device_fn(q, bh->b_rdev); /* is atomic */ + else + generic_plug_device(q, bh->b_rdev); /* is atomic */ goto get_rq; } @@ -667,13 +665,34 @@ static void __make_request(request_queue_t * q, get_rq: req = get_request(max_req, bh->b_rdev); - spin_unlock_irqrestore(&io_request_lock,flags); - -/* if no request available: if rw_ahead, forget it; otherwise try again blocking.. */ + /* + * if no request available: if rw_ahead, forget it, + * otherwise try again blocking.. + */ if (!req) { + spin_unlock_irqrestore(&io_request_lock,flags); if (rw_ahead) goto end_io; req = __get_request_wait(max_req, bh->b_rdev); + spin_lock_irqsave(&io_request_lock,flags); + } + /* + * Dont start the IO if the buffer has been + * invalidated meanwhile. 
(we have to do this + * within the io request lock and atomically + * before adding the request, see buffer.c's + * insert_into_queues_exclusive() function. + */ + if (!test_bit(BH_Req, &bh->b_state)) { + req->rq_status = RQ_INACTIVE; + spin_unlock_irqrestore(&io_request_lock,flags); + /* + * A fake 'everything went ok' completion event. + * The bh doesnt matter anymore, but we should not + * signal errors to RAID levels. + */ + bh->b_end_io(bh, 1); + return; } /* fill up the request-info, and add it to the queue */ @@ -689,52 +708,51 @@ get_rq: req->bh = bh; req->bhtail = bh; req->next = NULL; - add_request(q, req); + __add_request(q, req); + spin_unlock_irqrestore(&io_request_lock, flags); return; end_io: bh->b_end_io(bh, test_bit(BH_Uptodate, &bh->b_state)); } -void make_request(int major,int rw, struct buffer_head * bh) +void generic_make_request(int rw, struct buffer_head * bh) { request_queue_t * q; unsigned long flags; - - q = get_queue(bh->b_dev); - __make_request(q, major, rw, bh); + q = blk_get_queue(bh->b_rdev); + + __make_request(q, rw, bh); spin_lock_irqsave(&io_request_lock,flags); - if( !q->plugged ) + if (q && !q->plugged) (q->request_fn)(q); spin_unlock_irqrestore(&io_request_lock,flags); } - /* This function can be used to request a number of buffers from a block device. Currently the only restriction is that all buffers must belong to the same device */ -void ll_rw_block(int rw, int nr, struct buffer_head * bh[]) +static void __ll_rw_block(int rw, int nr, struct buffer_head * bh[],int haslock) { unsigned int major; int correct_size; - request_queue_t * q; - unsigned long flags; + request_queue_t *q; int i; - major = MAJOR(bh[0]->b_dev); - if (!(q = get_queue(bh[0]->b_dev))) { + q = blk_get_queue(bh[0]->b_dev); + if (!q) { printk(KERN_ERR "ll_rw_block: Trying to read nonexistent block-device %s (%ld)\n", kdevname(bh[0]->b_dev), bh[0]->b_blocknr); goto sorry; } - /* Determine correct block size for this device. */ + /* Determine correct block size for this device. */ correct_size = BLOCK_SIZE; if (blksize_size[major]) { i = blksize_size[major][MINOR(bh[0]->b_dev)]; @@ -742,7 +760,7 @@ void ll_rw_block(int rw, int nr, struct buffer_head * bh[]) correct_size = i; } - /* Verify requested block sizes. */ + /* Verify requested block sizes. */ for (i = 0; i < nr; i++) { if (bh[i]->b_size != correct_size) { printk(KERN_NOTICE "ll_rw_block: device %s: " @@ -751,19 +769,6 @@ void ll_rw_block(int rw, int nr, struct buffer_head * bh[]) correct_size, bh[i]->b_size); goto sorry; } - - /* Md remaps blocks now */ - bh[i]->b_rdev = bh[i]->b_dev; - bh[i]->b_rsector=bh[i]->b_blocknr*(bh[i]->b_size >> 9); -#ifdef CONFIG_BLK_DEV_MD - if (major==MD_MAJOR && - md_map (MINOR(bh[i]->b_dev), &bh[i]->b_rdev, - &bh[i]->b_rsector, bh[i]->b_size >> 9)) { - printk (KERN_ERR - "Bad md_map in ll_rw_block\n"); - goto sorry; - } -#endif } if ((rw & WRITE) && is_read_only(bh[0]->b_dev)) { @@ -773,25 +778,29 @@ void ll_rw_block(int rw, int nr, struct buffer_head * bh[]) } for (i = 0; i < nr; i++) { + /* Only one thread can actually submit the I/O. 
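+		   (Roughly: test_and_set_bit() returns the previous value of BH_Lock, so only the caller that flips it from 0 to 1 goes on to submit the buffer; any concurrent caller simply skips it, as the haslock/else branches below show.)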
*/ + if (haslock) { + if (!buffer_locked(bh[i])) + BUG(); + } else { + if (test_and_set_bit(BH_Lock, &bh[i]->b_state)) + continue; + } set_bit(BH_Req, &bh[i]->b_state); -#ifdef CONFIG_BLK_DEV_MD - if (MAJOR(bh[i]->b_dev) == MD_MAJOR) { - md_make_request(MINOR (bh[i]->b_dev), rw, bh[i]); - continue; + + if (q->make_request_fn) + q->make_request_fn(rw, bh[i]); + else { + bh[i]->b_rdev = bh[i]->b_dev; + bh[i]->b_rsector = bh[i]->b_blocknr*(bh[i]->b_size>>9); + + generic_make_request(rw, bh[i]); } -#endif - __make_request(q, MAJOR(bh[i]->b_rdev), rw, bh[i]); } - spin_lock_irqsave(&io_request_lock,flags); - if( !q->plugged ) - { - (q->request_fn)(q); - } - spin_unlock_irqrestore(&io_request_lock,flags); return; - sorry: +sorry: for (i = 0; i < nr; i++) { mark_buffer_clean(bh[i]); /* remeber to refile it */ clear_bit(BH_Uptodate, &bh[i]->b_state); @@ -800,8 +809,18 @@ void ll_rw_block(int rw, int nr, struct buffer_head * bh[]) return; } +void ll_rw_block(int rw, int nr, struct buffer_head * bh[]) +{ + __ll_rw_block(rw, nr, bh, 0); +} + +void ll_rw_block_locked(int rw, int nr, struct buffer_head * bh[]) +{ + __ll_rw_block(rw, nr, bh, 1); +} + #ifdef CONFIG_STRAM_SWAP -extern int stram_device_init( void ); +extern int stram_device_init (void); #endif /* @@ -811,8 +830,7 @@ extern int stram_device_init( void ); * 1 means we are done */ -int -end_that_request_first( struct request *req, int uptodate, char *name ) +int end_that_request_first (struct request *req, int uptodate, char *name) { struct buffer_head * bh; int nsect; @@ -847,8 +865,7 @@ end_that_request_first( struct request *req, int uptodate, char *name ) return 0; } -void -end_that_request_last( struct request *req ) +void end_that_request_last(struct request *req) { if (req->sem != NULL) up(req->sem); @@ -862,7 +879,7 @@ int __init blk_dev_init(void) struct blk_dev_struct *dev; for (dev = blk_dev + MAX_BLKDEV; dev-- != blk_dev;) { - dev->queue = NULL; + dev->queue = NULL; blk_init_queue(&dev->request_queue, NULL); } @@ -925,7 +942,7 @@ int __init blk_dev_init(void) #if !defined(CONFIG_SGI_IP22) && !defined(CONFIG_SGI_IP27) && \ !defined (__mc68000__) && !defined(CONFIG_PPC) && !defined(__sparc__) && \ !defined(CONFIG_APUS) && !defined(CONFIG_DECSTATION) && \ - !defined(CONFIG_BAGET_MIPS) && !defined(__sh__) + !defined(CONFIG_BAGET_MIPS) && !defined(__sh__) && !defined(__ia64__) outb_p(0xc, 0x3f2); #endif #endif @@ -945,7 +962,7 @@ int __init blk_dev_init(void) sbpcd_init(); #endif CONFIG_SBPCD #ifdef CONFIG_AZTCD - aztcd_init(); + aztcd_init(); #endif CONFIG_AZTCD #ifdef CONFIG_CDU535 sony535_init(); @@ -983,3 +1000,5 @@ EXPORT_SYMBOL(end_that_request_last); EXPORT_SYMBOL(blk_init_queue); EXPORT_SYMBOL(blk_cleanup_queue); EXPORT_SYMBOL(blk_queue_headactive); +EXPORT_SYMBOL(blk_queue_pluggable); +EXPORT_SYMBOL(generic_make_request); diff --git a/drivers/block/loop.c b/drivers/block/loop.c index fc6024fdc..587156935 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -342,8 +342,8 @@ static int create_missing_block(struct loop_device *lo, int block, int blksize) set_fs(old_fs); if (retval < 0) { - printk(KERN_WARNING "loop: cannot create block - FS write failed: code %d\n", - retval); + printk(KERN_WARNING "loop: cannot create block - FS write failed: code %Zi\n", + retval); return FALSE; } else { return TRUE; @@ -386,7 +386,11 @@ static int loop_set_fd(struct loop_device *lo, kdev_t dev, unsigned int arg) a file structure */ lo->lo_backing_file = NULL; } else if (S_ISREG(inode->i_mode)) { - if (!inode->i_op->get_block) { + /* 
+ * Total crap. We should just use pagecache instead of trying + * to redirect on block level. + */ + if (!inode->i_mapping->a_ops->bmap) { printk(KERN_ERR "loop: device has no block access/not implemented\n"); goto out_putf; } diff --git a/drivers/block/md.c b/drivers/block/md.c index b5b170069..752c7b0ab 100644 --- a/drivers/block/md.c +++ b/drivers/block/md.c @@ -1,21 +1,17 @@ - /* md.c : Multiple Devices driver for Linux - Copyright (C) 1994-96 Marc ZYNGIER - <zyngier@ufr-info-p7.ibp.fr> or - <maz@gloups.fdn.fr> + Copyright (C) 1998, 1999, 2000 Ingo Molnar - A lot of inspiration came from hd.c ... + completely rewritten, based on the MD driver code from Marc Zyngier - kerneld support by Boris Tobotras <boris@xtalk.msk.su> - boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net> + Changes: - RAID-1/RAID-5 extensions by: - Ingo Molnar, Miguel de Icaza, Gadi Oxman + - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar + - boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net> + - kerneld support by Boris Tobotras <boris@xtalk.msk.su> + - kmod support by: Cyrus Durgin + - RAID0 bugfixes: Mark Anthony Lisher <markal@iname.com> - Changes for kmod by: - Cyrus Durgin - This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2, or (at your option) @@ -26,233 +22,1193 @@ Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ -/* - * Current RAID-1,4,5 parallel reconstruction speed limit is 1024 KB/sec, so - * the extra system load does not show up that much. Increase it if your - * system can take more. - */ -#define SPEED_LIMIT 1024 - #include <linux/config.h> -#include <linux/module.h> -#include <linux/version.h> -#include <linux/malloc.h> -#include <linux/mm.h> -#include <linux/md.h> -#include <linux/hdreg.h> -#include <linux/stat.h> -#include <linux/fs.h> -#include <linux/proc_fs.h> -#include <linux/blkdev.h> -#include <linux/genhd.h> -#include <linux/smp_lock.h> +#include <linux/raid/md.h> + #ifdef CONFIG_KMOD #include <linux/kmod.h> #endif -#include <linux/errno.h> -#include <linux/init.h> #define __KERNEL_SYSCALLS__ #include <linux/unistd.h> +#include <asm/unaligned.h> + +extern asmlinkage int sys_sched_yield(void); +extern asmlinkage int sys_setsid(void); + +extern unsigned long io_events[MAX_BLKDEV]; + #define MAJOR_NR MD_MAJOR #define MD_DRIVER #include <linux/blk.h> -#include <linux/blkpg.h> -#include <asm/uaccess.h> -#include <asm/bitops.h> -#include <asm/atomic.h> #ifdef CONFIG_MD_BOOT -extern kdev_t name_to_kdev_t(char *line) __init; +extern kdev_t name_to_kdev_t(char *line) md__init; #endif -static struct hd_struct md_hd_struct[MAX_MD_DEV]; -static int md_blocksizes[MAX_MD_DEV]; -int md_maxreadahead[MAX_MD_DEV]; -#if SUPPORT_RECONSTRUCTION -static struct md_thread *md_sync_thread = NULL; -#endif /* SUPPORT_RECONSTRUCTION */ +#define DEBUG 0 +#if DEBUG +# define dprintk(x...) printk(x) +#else +# define dprintk(x...) 
do { } while(0) +#endif + +static mdk_personality_t *pers[MAX_PERSONALITY] = {NULL, }; + +/* + * these have to be allocated separately because external + * subsystems want to have a pre-defined structure + */ +struct hd_struct md_hd_struct[MAX_MD_DEVS]; +static int md_blocksizes[MAX_MD_DEVS]; +static int md_maxreadahead[MAX_MD_DEVS]; +static mdk_thread_t *md_recovery_thread = NULL; -int md_size[MAX_MD_DEV]={0, }; +int md_size[MAX_MD_DEVS] = {0, }; static struct gendisk md_gendisk= { - MD_MAJOR, - "md", - 0, - 1, - md_hd_struct, - md_size, - MAX_MD_DEV, - NULL, - NULL + MD_MAJOR, + "md", + 0, + 1, + md_hd_struct, + md_size, + MAX_MD_DEVS, + NULL, + NULL }; -static struct md_personality *pers[MAX_PERSONALITY]={NULL, }; -struct md_dev md_dev[MAX_MD_DEV]; +void md_plug_device (request_queue_t *mdqueue, kdev_t dev) +{ + mdk_rdev_t * rdev; + struct md_list_head *tmp; + request_queue_t *q; + mddev_t *mddev; + + if (!md_test_and_set_bit(0, (atomic_t *)&mdqueue->plugged)) { + mddev = kdev_to_mddev(dev); + ITERATE_RDEV(mddev,rdev,tmp) { + q = blk_get_queue(rdev->dev); + generic_unplug_device(q); + } + queue_task(&mdqueue->plug_tq, &tq_disk); + } +} + +static void md_unplug_device (void * data) +{ + mdk_rdev_t * rdev; + struct md_list_head *tmp; + mddev_t *mddev = (mddev_t *)data; + request_queue_t *mdqueue = &mddev->queue, *q; + + clear_bit(0, (atomic_t *)&mdqueue->plugged); + ITERATE_RDEV(mddev,rdev,tmp) { + q = blk_get_queue(rdev->dev); + generic_unplug_device(q); + } +} + +/* + * Allows iterating over all existing md arrays + */ +static MD_LIST_HEAD(all_mddevs); + +/* + * The mapping between kdev and mddev is not necessarily a simple + * one! E.g. HSM uses several sub-devices to implement Logical + * Volumes. All these sub-devices map to the same mddev. + */ +dev_mapping_t mddev_map [MAX_MD_DEVS] = { {NULL, 0}, }; + +void add_mddev_mapping (mddev_t * mddev, kdev_t dev, void *data) +{ + unsigned int minor = MINOR(dev); + + if (MAJOR(dev) != MD_MAJOR) { + MD_BUG(); + return; + } + if (mddev_map[minor].mddev != NULL) { + MD_BUG(); + return; + } + mddev_map[minor].mddev = mddev; + mddev_map[minor].data = data; +} + +void del_mddev_mapping (mddev_t * mddev, kdev_t dev) +{ + unsigned int minor = MINOR(dev); + + if (MAJOR(dev) != MD_MAJOR) { + MD_BUG(); + return; + } + if (mddev_map[minor].mddev != mddev) { + MD_BUG(); + return; + } + mddev_map[minor].mddev = NULL; + mddev_map[minor].data = NULL; +} + +static request_queue_t *md_get_queue (kdev_t dev) +{ + mddev_t *mddev = kdev_to_mddev(dev); -int md_thread(void * arg); + if (!mddev) + return NULL; + return &mddev->queue; +} -static int legacy_raid_sb (int minor, int pnum) +static void do_md_request (request_queue_t * q) { - int i, factor; + printk(KERN_ALERT "Got md request, not good..."); + BUG(); + return; +} - factor = 1 << FACTOR_SHIFT(FACTOR((md_dev+minor))); +void md_make_request (int rw, struct buffer_head * bh) +{ + mddev_t *mddev = kdev_to_mddev(bh->b_dev); + + if (!mddev || !mddev->pers) + bh->b_end_io(bh, 0); + else { + if ((rw == READ || rw == READA) && buffer_uptodate(bh)) + bh->b_end_io(bh, 1); + else + mddev->pers->make_request(mddev, rw, bh); + } +} - /***** +static mddev_t * alloc_mddev (kdev_t dev) +{ + request_queue_t *q; + mddev_t *mddev; + + if (MAJOR(dev) != MD_MAJOR) { + MD_BUG(); + return 0; + } + mddev = (mddev_t *) kmalloc(sizeof(*mddev), GFP_KERNEL); + if (!mddev) + return NULL; + + memset(mddev, 0, sizeof(*mddev)); + + mddev->__minor = MINOR(dev); + init_MUTEX(&mddev->reconfig_sem); + init_MUTEX(&mddev->recovery_sem); + 
init_MUTEX(&mddev->resync_sem); + MD_INIT_LIST_HEAD(&mddev->disks); + MD_INIT_LIST_HEAD(&mddev->all_mddevs); + + q = &mddev->queue; + blk_init_queue(q, DEVICE_REQUEST); + blk_queue_pluggable(q, md_plug_device); + blk_queue_make_request(q, md_make_request); + + q->plug_tq.sync = 0; + q->plug_tq.routine = &md_unplug_device; + q->plug_tq.data = mddev; + + /* + * The 'base' mddev is the one with data NULL. + * personalities can create additional mddevs + * if necessary. + */ + add_mddev_mapping(mddev, dev, 0); + md_list_add(&mddev->all_mddevs, &all_mddevs); + + return mddev; +} + +static void free_mddev (mddev_t *mddev) +{ + if (!mddev) { + MD_BUG(); + return; + } + + /* + * Make sure nobody else is using this mddev + * (careful, we rely on the global kernel lock here) + */ + while (md_atomic_read(&mddev->resync_sem.count) != 1) + schedule(); + while (md_atomic_read(&mddev->recovery_sem.count) != 1) + schedule(); + + del_mddev_mapping(mddev, MKDEV(MD_MAJOR, mdidx(mddev))); + md_list_del(&mddev->all_mddevs); + MD_INIT_LIST_HEAD(&mddev->all_mddevs); + kfree(mddev); +} + +struct gendisk * find_gendisk (kdev_t dev) +{ + struct gendisk *tmp = gendisk_head; + + while (tmp != NULL) { + if (tmp->major == MAJOR(dev)) + return (tmp); + tmp = tmp->next; + } + return (NULL); +} + +mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr) +{ + mdk_rdev_t * rdev; + struct md_list_head *tmp; + + ITERATE_RDEV(mddev,rdev,tmp) { + if (rdev->desc_nr == nr) + return rdev; + } + return NULL; +} + +mdk_rdev_t * find_rdev(mddev_t * mddev, kdev_t dev) +{ + struct md_list_head *tmp; + mdk_rdev_t *rdev; + + ITERATE_RDEV(mddev,rdev,tmp) { + if (rdev->dev == dev) + return rdev; + } + return NULL; +} + +static MD_LIST_HEAD(device_names); + +char * partition_name (kdev_t dev) +{ + struct gendisk *hd; + static char nomem [] = "<nomem>"; + dev_name_t *dname; + struct md_list_head *tmp = device_names.next; + + while (tmp != &device_names) { + dname = md_list_entry(tmp, dev_name_t, list); + if (dname->dev == dev) + return dname->name; + tmp = tmp->next; + } + + dname = (dev_name_t *) kmalloc(sizeof(*dname), GFP_KERNEL); + + if (!dname) + return nomem; + /* + * ok, add this new device name to the list + */ + hd = find_gendisk (dev); + + if (!hd) + sprintf (dname->name, "[dev %s]", kdevname(dev)); + else + disk_name (hd, MINOR(dev), dname->name); + + dname->dev = dev; + MD_INIT_LIST_HEAD(&dname->list); + md_list_add(&dname->list, &device_names); + + return dname->name; +} + +static unsigned int calc_dev_sboffset (kdev_t dev, mddev_t *mddev, + int persistent) +{ + unsigned int size = 0; + + if (blk_size[MAJOR(dev)]) + size = blk_size[MAJOR(dev)][MINOR(dev)]; + if (persistent) + size = MD_NEW_SIZE_BLOCKS(size); + return size; +} + +static unsigned int calc_dev_size (kdev_t dev, mddev_t *mddev, int persistent) +{ + unsigned int size; + + size = calc_dev_sboffset(dev, mddev, persistent); + if (!mddev->sb) { + MD_BUG(); + return size; + } + if (mddev->sb->chunk_size) + size &= ~(mddev->sb->chunk_size/1024 - 1); + return size; +} + +static unsigned int zoned_raid_size (mddev_t *mddev) +{ + unsigned int mask; + mdk_rdev_t * rdev; + struct md_list_head *tmp; + + if (!mddev->sb) { + MD_BUG(); + return -EINVAL; + } + /* * do size and offset calculations. */ - for (i=0; i<md_dev[minor].nb_dev; i++) { - md_dev[minor].devices[i].size &= ~(factor - 1); - md_size[minor] += md_dev[minor].devices[i].size; - md_dev[minor].devices[i].offset=i ? 
(md_dev[minor].devices[i-1].offset + - md_dev[minor].devices[i-1].size) : 0; - } - if (pnum == RAID0 >> PERSONALITY_SHIFT) - md_maxreadahead[minor] = MD_DEFAULT_DISK_READAHEAD * md_dev[minor].nb_dev; + mask = ~(mddev->sb->chunk_size/1024 - 1); + + ITERATE_RDEV(mddev,rdev,tmp) { + rdev->size &= mask; + md_size[mdidx(mddev)] += rdev->size; + } return 0; } -static void free_sb (struct md_dev *mddev) +/* + * We check whether all devices are numbered from 0 to nb_dev-1. The + * order is guaranteed even after device name changes. + * + * Some personalities (raid0, linear) use this. Personalities that + * provide data have to be able to deal with loss of individual + * disks, so they do their checking themselves. + */ +int md_check_ordering (mddev_t *mddev) { - int i; - struct real_dev *realdev; + int i, c; + mdk_rdev_t *rdev; + struct md_list_head *tmp; - if (mddev->sb) { - free_page((unsigned long) mddev->sb); - mddev->sb = NULL; + /* + * First, all devices must be fully functional + */ + ITERATE_RDEV(mddev,rdev,tmp) { + if (rdev->faulty) { + printk("md: md%d's device %s faulty, aborting.\n", + mdidx(mddev), partition_name(rdev->dev)); + goto abort; + } + } + + c = 0; + ITERATE_RDEV(mddev,rdev,tmp) { + c++; } - for (i = 0; i <mddev->nb_dev; i++) { - realdev = mddev->devices + i; - if (realdev->sb) { - free_page((unsigned long) realdev->sb); - realdev->sb = NULL; + if (c != mddev->nb_dev) { + MD_BUG(); + goto abort; + } + if (mddev->nb_dev != mddev->sb->raid_disks) { + printk("md: md%d, array needs %d disks, has %d, aborting.\n", + mdidx(mddev), mddev->sb->raid_disks, mddev->nb_dev); + goto abort; + } + /* + * Now the numbering check + */ + for (i = 0; i < mddev->nb_dev; i++) { + c = 0; + ITERATE_RDEV(mddev,rdev,tmp) { + if (rdev->desc_nr == i) + c++; + } + if (c == 0) { + printk("md: md%d, missing disk #%d, aborting.\n", + mdidx(mddev), i); + goto abort; + } + if (c > 1) { + printk("md: md%d, too many disks #%d, aborting.\n", + mdidx(mddev), i); + goto abort; } } return 0; +abort: + return 1; } -/* - * Check one RAID superblock for generic plausibility - */ +static void remove_descriptor (mdp_disk_t *disk, mdp_super_t *sb) +{ + if (disk_active(disk)) { + sb->working_disks--; + } else { + if (disk_spare(disk)) { + sb->spare_disks--; + sb->working_disks--; + } else { + sb->failed_disks--; + } + } + sb->nr_disks--; + disk->major = 0; + disk->minor = 0; + mark_disk_removed(disk); +} #define BAD_MAGIC KERN_ERR \ -"md: %s: invalid raid superblock magic (%x) on block %u\n" +"md: invalid raid superblock magic on %s\n" + +#define BAD_MINOR KERN_ERR \ +"md: %s: invalid raid minor (%x)\n" #define OUT_OF_MEM KERN_ALERT \ "md: out of memory.\n" -#define NO_DEVICE KERN_ERR \ -"md: disabled device %s\n" +#define NO_SB KERN_ERR \ +"md: disabled device %s, could not read superblock.\n" -#define SUCCESS 0 -#define FAILURE -1 +#define BAD_CSUM KERN_WARNING \ +"md: invalid superblock checksum on %s\n" -static int analyze_one_sb (struct real_dev * rdev) +static int alloc_array_sb (mddev_t * mddev) { - int ret = FAILURE; - struct buffer_head *bh; + if (mddev->sb) { + MD_BUG(); + return 0; + } + + mddev->sb = (mdp_super_t *) __get_free_page (GFP_KERNEL); + if (!mddev->sb) + return -ENOMEM; + md_clear_page((unsigned long)mddev->sb); + return 0; +} + +static int alloc_disk_sb (mdk_rdev_t * rdev) +{ + if (rdev->sb) + MD_BUG(); + + rdev->sb = (mdp_super_t *) __get_free_page(GFP_KERNEL); + if (!rdev->sb) { + printk (OUT_OF_MEM); + return -EINVAL; + } + md_clear_page((unsigned long)rdev->sb); + + return 0; +} + +static 
void free_disk_sb (mdk_rdev_t * rdev) +{ + if (rdev->sb) { + free_page((unsigned long) rdev->sb); + rdev->sb = NULL; + rdev->sb_offset = 0; + rdev->size = 0; + } else { + if (!rdev->faulty) + MD_BUG(); + } +} + +static void mark_rdev_faulty (mdk_rdev_t * rdev) +{ + if (!rdev) { + MD_BUG(); + return; + } + free_disk_sb(rdev); + rdev->faulty = 1; +} + +static int read_disk_sb (mdk_rdev_t * rdev) +{ + int ret = -EINVAL; + struct buffer_head *bh = NULL; kdev_t dev = rdev->dev; - md_superblock_t *sb; + mdp_super_t *sb; + u32 sb_offset; + if (!rdev->sb) { + MD_BUG(); + goto abort; + } + /* - * Read the superblock, it's at the end of the disk + * Calculate the position of the superblock, + * it's at the end of the disk */ - rdev->sb_offset = MD_NEW_SIZE_BLOCKS (blk_size[MAJOR(dev)][MINOR(dev)]); + sb_offset = calc_dev_sboffset(rdev->dev, rdev->mddev, 1); + rdev->sb_offset = sb_offset; + printk("(read) %s's sb offset: %d", partition_name(dev), + sb_offset); + fsync_dev(dev); set_blocksize (dev, MD_SB_BYTES); - bh = bread (dev, rdev->sb_offset / MD_SB_BLOCKS, MD_SB_BYTES); + bh = bread (dev, sb_offset / MD_SB_BLOCKS, MD_SB_BYTES); if (bh) { - sb = (md_superblock_t *) bh->b_data; - if (sb->md_magic != MD_SB_MAGIC) { - printk (BAD_MAGIC, kdevname(dev), - sb->md_magic, rdev->sb_offset); - goto abort; - } - rdev->sb = (md_superblock_t *) __get_free_page(GFP_KERNEL); - if (!rdev->sb) { - printk (OUT_OF_MEM); - goto abort; - } - memcpy (rdev->sb, bh->b_data, MD_SB_BYTES); - - rdev->size = sb->size; - } else - printk (NO_DEVICE,kdevname(rdev->dev)); - ret = SUCCESS; + sb = (mdp_super_t *) bh->b_data; + memcpy (rdev->sb, sb, MD_SB_BYTES); + } else { + printk (NO_SB,partition_name(rdev->dev)); + goto abort; + } + printk(" [events: %08lx]\n", (unsigned long)get_unaligned(&rdev->sb->events)); + ret = 0; abort: if (bh) brelse (bh); return ret; } -#undef SUCCESS -#undef FAILURE +static unsigned int calc_sb_csum (mdp_super_t * sb) +{ + unsigned int disk_csum, csum; + + disk_csum = sb->sb_csum; + sb->sb_csum = 0; + csum = csum_partial((void *)sb, MD_SB_BYTES, 0); + sb->sb_csum = disk_csum; + return csum; +} + +/* + * Check one RAID superblock for generic plausibility + */ + +static int check_disk_sb (mdk_rdev_t * rdev) +{ + mdp_super_t *sb; + int ret = -EINVAL; + sb = rdev->sb; + if (!sb) { + MD_BUG(); + goto abort; + } + + if (sb->md_magic != MD_SB_MAGIC) { + printk (BAD_MAGIC, partition_name(rdev->dev)); + goto abort; + } + + if (sb->md_minor >= MAX_MD_DEVS) { + printk (BAD_MINOR, partition_name(rdev->dev), + sb->md_minor); + goto abort; + } + + if (calc_sb_csum(sb) != sb->sb_csum) + printk(BAD_CSUM, partition_name(rdev->dev)); + ret = 0; +abort: + return ret; +} + +static kdev_t dev_unit(kdev_t dev) +{ + unsigned int mask; + struct gendisk *hd = find_gendisk(dev); + + if (!hd) + return 0; + mask = ~((1 << hd->minor_shift) - 1); + + return MKDEV(MAJOR(dev), MINOR(dev) & mask); +} + +static mdk_rdev_t * match_dev_unit(mddev_t *mddev, kdev_t dev) +{ + struct md_list_head *tmp; + mdk_rdev_t *rdev; + + ITERATE_RDEV(mddev,rdev,tmp) + if (dev_unit(rdev->dev) == dev_unit(dev)) + return rdev; + + return NULL; +} + +static MD_LIST_HEAD(all_raid_disks); +static MD_LIST_HEAD(pending_raid_disks); + +static void bind_rdev_to_array (mdk_rdev_t * rdev, mddev_t * mddev) +{ + mdk_rdev_t *same_pdev; + + if (rdev->mddev) { + MD_BUG(); + return; + } + same_pdev = match_dev_unit(mddev, rdev->dev); + if (same_pdev) + printk( KERN_WARNING +"md%d: WARNING: %s appears to be on the same physical disk as %s. 
True\n" +" protection against single-disk failure might be compromised.\n", + mdidx(mddev), partition_name(rdev->dev), + partition_name(same_pdev->dev)); + + md_list_add(&rdev->same_set, &mddev->disks); + rdev->mddev = mddev; + mddev->nb_dev++; + printk("bind<%s,%d>\n", partition_name(rdev->dev), mddev->nb_dev); +} + +static void unbind_rdev_from_array (mdk_rdev_t * rdev) +{ + if (!rdev->mddev) { + MD_BUG(); + return; + } + md_list_del(&rdev->same_set); + MD_INIT_LIST_HEAD(&rdev->same_set); + rdev->mddev->nb_dev--; + printk("unbind<%s,%d>\n", partition_name(rdev->dev), + rdev->mddev->nb_dev); + rdev->mddev = NULL; +} + +/* + * prevent the device from being mounted, repartitioned or + * otherwise reused by a RAID array (or any other kernel + * subsystem), by opening the device. [simply getting an + * inode is not enough, the SCSI module usage code needs + * an explicit open() on the device] + */ +static int lock_rdev (mdk_rdev_t *rdev) +{ + int err = 0; + + /* + * First insert a dummy inode. + */ + if (rdev->inode) + MD_BUG(); + rdev->inode = get_empty_inode(); + if (!rdev->inode) + return -ENOMEM; + /* + * we dont care about any other fields + */ + rdev->inode->i_dev = rdev->inode->i_rdev = rdev->dev; + insert_inode_hash(rdev->inode); + + memset(&rdev->filp, 0, sizeof(rdev->filp)); + rdev->filp.f_mode = 3; /* read write */ + return err; +} + +static void unlock_rdev (mdk_rdev_t *rdev) +{ + if (!rdev->inode) + MD_BUG(); + iput(rdev->inode); + rdev->inode = NULL; +} + +static void export_rdev (mdk_rdev_t * rdev) +{ + printk("export_rdev(%s)\n",partition_name(rdev->dev)); + if (rdev->mddev) + MD_BUG(); + unlock_rdev(rdev); + free_disk_sb(rdev); + md_list_del(&rdev->all); + MD_INIT_LIST_HEAD(&rdev->all); + if (rdev->pending.next != &rdev->pending) { + printk("(%s was pending)\n",partition_name(rdev->dev)); + md_list_del(&rdev->pending); + MD_INIT_LIST_HEAD(&rdev->pending); + } + rdev->dev = 0; + rdev->faulty = 0; + kfree(rdev); +} + +static void kick_rdev_from_array (mdk_rdev_t * rdev) +{ + unbind_rdev_from_array(rdev); + export_rdev(rdev); +} + +static void export_array (mddev_t *mddev) +{ + struct md_list_head *tmp; + mdk_rdev_t *rdev; + mdp_super_t *sb = mddev->sb; + + if (mddev->sb) { + mddev->sb = NULL; + free_page((unsigned long) sb); + } + + ITERATE_RDEV(mddev,rdev,tmp) { + if (!rdev->mddev) { + MD_BUG(); + continue; + } + kick_rdev_from_array(rdev); + } + if (mddev->nb_dev) + MD_BUG(); +} + +#undef BAD_CSUM #undef BAD_MAGIC #undef OUT_OF_MEM -#undef NO_DEVICE +#undef NO_SB + +static void print_desc(mdp_disk_t *desc) +{ + printk(" DISK<N:%d,%s(%d,%d),R:%d,S:%d>\n", desc->number, + partition_name(MKDEV(desc->major,desc->minor)), + desc->major,desc->minor,desc->raid_disk,desc->state); +} + +static void print_sb(mdp_super_t *sb) +{ + int i; + + printk(" SB: (V:%d.%d.%d) ID:<%08x.%08x.%08x.%08x> CT:%08x\n", + sb->major_version, sb->minor_version, sb->patch_version, + sb->set_uuid0, sb->set_uuid1, sb->set_uuid2, sb->set_uuid3, + sb->ctime); + printk(" L%d S%08d ND:%d RD:%d md%d LO:%d CS:%d\n", sb->level, + sb->size, sb->nr_disks, sb->raid_disks, sb->md_minor, + sb->layout, sb->chunk_size); + printk(" UT:%08x ST:%d AD:%d WD:%d FD:%d SD:%d CSUM:%08x E:%08lx\n", + sb->utime, sb->state, sb->active_disks, sb->working_disks, + sb->failed_disks, sb->spare_disks, + sb->sb_csum, (unsigned long)get_unaligned(&sb->events)); + + for (i = 0; i < MD_SB_DISKS; i++) { + mdp_disk_t *desc; + + desc = sb->disks + i; + printk(" D %2d: ", i); + print_desc(desc); + } + printk(" THIS: "); + 
print_desc(&sb->this_disk); + +} + +static void print_rdev(mdk_rdev_t *rdev) +{ + printk(" rdev %s: O:%s, SZ:%08d F:%d DN:%d ", + partition_name(rdev->dev), partition_name(rdev->old_dev), + rdev->size, rdev->faulty, rdev->desc_nr); + if (rdev->sb) { + printk("rdev superblock:\n"); + print_sb(rdev->sb); + } else + printk("no rdev superblock!\n"); +} + +void md_print_devices (void) +{ + struct md_list_head *tmp, *tmp2; + mdk_rdev_t *rdev; + mddev_t *mddev; + + printk("\n"); + printk(" **********************************\n"); + printk(" * <COMPLETE RAID STATE PRINTOUT> *\n"); + printk(" **********************************\n"); + ITERATE_MDDEV(mddev,tmp) { + printk("md%d: ", mdidx(mddev)); + + ITERATE_RDEV(mddev,rdev,tmp2) + printk("<%s>", partition_name(rdev->dev)); + + if (mddev->sb) { + printk(" array superblock:\n"); + print_sb(mddev->sb); + } else + printk(" no array superblock.\n"); + + ITERATE_RDEV(mddev,rdev,tmp2) + print_rdev(rdev); + } + printk(" **********************************\n"); + printk("\n"); +} + +static int sb_equal ( mdp_super_t *sb1, mdp_super_t *sb2) +{ + int ret; + mdp_super_t *tmp1, *tmp2; + + tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL); + tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL); + + if (!tmp1 || !tmp2) { + ret = 0; + goto abort; + } + + *tmp1 = *sb1; + *tmp2 = *sb2; + + /* + * nr_disks is not constant + */ + tmp1->nr_disks = 0; + tmp2->nr_disks = 0; + + if (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4)) + ret = 0; + else + ret = 1; + +abort: + if (tmp1) + kfree(tmp1); + if (tmp2) + kfree(tmp2); + + return ret; +} + +static int uuid_equal(mdk_rdev_t *rdev1, mdk_rdev_t *rdev2) +{ + if ( (rdev1->sb->set_uuid0 == rdev2->sb->set_uuid0) && + (rdev1->sb->set_uuid1 == rdev2->sb->set_uuid1) && + (rdev1->sb->set_uuid2 == rdev2->sb->set_uuid2) && + (rdev1->sb->set_uuid3 == rdev2->sb->set_uuid3)) + + return 1; + + return 0; +} + +static mdk_rdev_t * find_rdev_all (kdev_t dev) +{ + struct md_list_head *tmp; + mdk_rdev_t *rdev; + + tmp = all_raid_disks.next; + while (tmp != &all_raid_disks) { + rdev = md_list_entry(tmp, mdk_rdev_t, all); + if (rdev->dev == dev) + return rdev; + tmp = tmp->next; + } + return NULL; +} + +#define GETBLK_FAILED KERN_ERR \ +"md: getblk failed for device %s\n" + +static int write_disk_sb(mdk_rdev_t * rdev) +{ + struct buffer_head *bh; + kdev_t dev; + u32 sb_offset, size; + mdp_super_t *sb; + + if (!rdev->sb) { + MD_BUG(); + return -1; + } + if (rdev->faulty) { + MD_BUG(); + return -1; + } + if (rdev->sb->md_magic != MD_SB_MAGIC) { + MD_BUG(); + return -1; + } + + dev = rdev->dev; + sb_offset = calc_dev_sboffset(dev, rdev->mddev, 1); + if (rdev->sb_offset != sb_offset) { + printk("%s's sb offset has changed from %d to %d, skipping\n", partition_name(dev), rdev->sb_offset, sb_offset); + goto skip; + } + /* + * If the disk went offline meanwhile and it's just a spare, then + * its size has changed to zero silently, and the MD code does + * not yet know that it's faulty. 
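+ * + * Hence the guard just below, roughly: + * + * size = calc_dev_size(dev, rdev->mddev, 1); + * if (size != rdev->size) + * goto skip; + * + * so we never write a superblock through a device that has quietly gone away.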
+ */ + size = calc_dev_size(dev, rdev->mddev, 1); + if (size != rdev->size) { + printk("%s's size has changed from %d to %d since import, skipping\n", partition_name(dev), rdev->size, size); + goto skip; + } + + printk("(write) %s's sb offset: %d\n", partition_name(dev), sb_offset); + fsync_dev(dev); + set_blocksize(dev, MD_SB_BYTES); + bh = getblk(dev, sb_offset / MD_SB_BLOCKS, MD_SB_BYTES); + if (!bh) { + printk(GETBLK_FAILED, partition_name(dev)); + return 1; + } + memset(bh->b_data,0,bh->b_size); + sb = (mdp_super_t *) bh->b_data; + memcpy(sb, rdev->sb, MD_SB_BYTES); + + mark_buffer_uptodate(bh, 1); + mark_buffer_dirty(bh, 1); + ll_rw_block(WRITE, 1, &bh); + wait_on_buffer(bh); + brelse(bh); + fsync_dev(dev); +skip: + return 0; +} +#undef GETBLK_FAILED + +static void set_this_disk(mddev_t *mddev, mdk_rdev_t *rdev) +{ + int i, ok = 0; + mdp_disk_t *desc; + + for (i = 0; i < MD_SB_DISKS; i++) { + desc = mddev->sb->disks + i; +#if 0 + if (disk_faulty(desc)) { + if (MKDEV(desc->major,desc->minor) == rdev->dev) + ok = 1; + continue; + } +#endif + if (MKDEV(desc->major,desc->minor) == rdev->dev) { + rdev->sb->this_disk = *desc; + rdev->desc_nr = desc->number; + ok = 1; + break; + } + } + + if (!ok) { + MD_BUG(); + } +} + +static int sync_sbs(mddev_t * mddev) +{ + mdk_rdev_t *rdev; + mdp_super_t *sb; + struct md_list_head *tmp; + + ITERATE_RDEV(mddev,rdev,tmp) { + if (rdev->faulty) + continue; + sb = rdev->sb; + *sb = *mddev->sb; + set_this_disk(mddev, rdev); + sb->sb_csum = calc_sb_csum(sb); + } + return 0; +} + +int md_update_sb(mddev_t * mddev) +{ + int first, err, count = 100; + struct md_list_head *tmp; + mdk_rdev_t *rdev; + __u64 ev; + +repeat: + mddev->sb->utime = CURRENT_TIME; + ev = get_unaligned(&mddev->sb->events); + ++ev; + put_unaligned(ev,&mddev->sb->events); + if (ev == (__u64)0) { + /* + * oops, this 64-bit counter should never wrap. + * Either we are in around ~1 trillion A.C., assuming + * 1 reboot per second, or we have a bug: + */ + MD_BUG(); + --ev; + put_unaligned(ev,&mddev->sb->events); + } + sync_sbs(mddev); + + /* + * do not write anything to disk if using + * nonpersistent superblocks + */ + if (mddev->sb->not_persistent) + return 0; + + printk(KERN_INFO "md: updating md%d RAID superblock on device\n", + mdidx(mddev)); + + first = 1; + err = 0; + ITERATE_RDEV(mddev,rdev,tmp) { + if (!first) + printk(", "); + first = 0; + if (rdev->faulty) + printk("(skipping faulty "); + printk("%s ", partition_name(rdev->dev)); + if (!rdev->faulty) { + printk("[events: %08lx]", + (unsigned long)get_unaligned(&rdev->sb->events)); + err += write_disk_sb(rdev); + } else + printk(")\n"); + } + printk(".\n"); + if (err) { + printk("errors occurred during superblock update, repeating\n"); + if (--count) + goto repeat; + printk("excessive errors occurred during superblock update, exiting\n"); + } + return 0; +} + +/* + * Import a device. If 'on_disk', then sanity check the superblock + * + * mark the device faulty if: + * + * - the device is nonexistent (zero size) + * - the device has no valid superblock + * + * a faulty rdev _never_ has rdev->sb set. 
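+ * + * Expected usage is roughly: + * + * err = md_import_device(dev, 1); + * if (!err) + * rdev = find_rdev_all(dev); + * + * since a freshly imported rdev is reachable only through the global all_raid_disks list.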
+ */ +static int md_import_device (kdev_t newdev, int on_disk) +{ + int err; + mdk_rdev_t *rdev; + unsigned int size; + + if (find_rdev_all(newdev)) + return -EEXIST; + + rdev = (mdk_rdev_t *) kmalloc(sizeof(*rdev), GFP_KERNEL); + if (!rdev) { + printk("could not alloc mem for %s!\n", partition_name(newdev)); + return -ENOMEM; + } + memset(rdev, 0, sizeof(*rdev)); + + if (!fs_may_mount(newdev)) { + printk("md: can not import %s, has active inodes!\n", + partition_name(newdev)); + err = -EBUSY; + goto abort_free; + } + + if ((err = alloc_disk_sb(rdev))) + goto abort_free; + + rdev->dev = newdev; + if (lock_rdev(rdev)) { + printk("md: could not lock %s, zero-size? Marking faulty.\n", + partition_name(newdev)); + err = -EINVAL; + goto abort_free; + } + rdev->desc_nr = -1; + rdev->faulty = 0; + + size = 0; + if (blk_size[MAJOR(newdev)]) + size = blk_size[MAJOR(newdev)][MINOR(newdev)]; + if (!size) { + printk("md: %s has zero size, marking faulty!\n", + partition_name(newdev)); + err = -EINVAL; + goto abort_free; + } + + if (on_disk) { + if ((err = read_disk_sb(rdev))) { + printk("md: could not read %s's sb, not importing!\n", + partition_name(newdev)); + goto abort_free; + } + if ((err = check_disk_sb(rdev))) { + printk("md: %s has invalid sb, not importing!\n", + partition_name(newdev)); + goto abort_free; + } + + rdev->old_dev = MKDEV(rdev->sb->this_disk.major, + rdev->sb->this_disk.minor); + rdev->desc_nr = rdev->sb->this_disk.number; + } + md_list_add(&rdev->all, &all_raid_disks); + MD_INIT_LIST_HEAD(&rdev->pending); + + if (rdev->faulty && rdev->sb) + free_disk_sb(rdev); + return 0; + +abort_free: + if (rdev->sb) { + if (rdev->inode) + unlock_rdev(rdev); + free_disk_sb(rdev); + } + kfree(rdev); + return err; +} /* * Check a full RAID array for plausibility */ #define INCONSISTENT KERN_ERR \ -"md: superblock inconsistency -- run ckraid\n" +"md: fatal superblock inconsistency in %s -- removing from array\n" #define OUT_OF_DATE KERN_ERR \ -"md: superblock update time inconsistenty -- using the most recent one\n" +"md: superblock update time inconsistency -- using the most recent one\n" #define OLD_VERSION KERN_ALERT \ -"md: %s: unsupported raid array version %d.%d.%d\n" - -#define NOT_CLEAN KERN_ERR \ -"md: %s: raid array is not clean -- run ckraid\n" +"md: md%d: unsupported raid array version %d.%d.%d\n" #define NOT_CLEAN_IGNORE KERN_ERR \ -"md: %s: raid array is not clean -- reconstructing parity\n" +"md: md%d: raid array is not clean -- starting background reconstruction\n" #define UNKNOWN_LEVEL KERN_ERR \ -"md: %s: unsupported raid level %d\n" +"md: md%d: unsupported raid level %d\n" -static int analyze_sbs (int minor, int pnum) +static int analyze_sbs (mddev_t * mddev) { - struct md_dev *mddev = md_dev + minor; - int i, N = mddev->nb_dev, out_of_date = 0; - struct real_dev * disks = mddev->devices; - md_superblock_t *sb, *freshest = NULL; - - /* - * RAID-0 and linear don't use a RAID superblock - */ - if (pnum == RAID0 >> PERSONALITY_SHIFT || - pnum == LINEAR >> PERSONALITY_SHIFT) - return legacy_raid_sb (minor, pnum); + int out_of_date = 0, i; + struct md_list_head *tmp, *tmp2; + mdk_rdev_t *rdev, *rdev2, *freshest; + mdp_super_t *sb; /* * Verify the RAID superblock on each real device */ - for (i = 0; i < N; i++) - if (analyze_one_sb(disks+i)) + ITERATE_RDEV(mddev,rdev,tmp) { + if (rdev->faulty) { + MD_BUG(); goto abort; + } + if (!rdev->sb) { + MD_BUG(); + goto abort; + } + if (check_disk_sb(rdev)) + goto abort; + } /* * The superblock constant part has to be the same * for all disks 
in the array. */ sb = NULL; - for (i = 0; i < N; i++) { - if (!disks[i].sb) - continue; + + ITERATE_RDEV(mddev,rdev,tmp) { if (!sb) { - sb = disks[i].sb; + sb = rdev->sb; continue; } - if (memcmp(sb, - disks[i].sb, MD_SB_GENERIC_CONSTANT_WORDS * 4)) { - printk (INCONSISTENT); - goto abort; + if (!sb_equal(sb, rdev->sb)) { + printk (INCONSISTENT, partition_name(rdev->dev)); + kick_rdev_from_array(rdev); + continue; } } @@ -261,496 +1217,1662 @@ static int analyze_sbs (int minor, int pnum) * find the freshest superblock, that one will be the superblock * that represents the whole array. */ - if ((sb = mddev->sb = (md_superblock_t *) __get_free_page (GFP_KERNEL)) == NULL) - goto abort; + if (!mddev->sb) + if (alloc_array_sb(mddev)) + goto abort; + sb = mddev->sb; freshest = NULL; - for (i = 0; i < N; i++) { - if (!disks[i].sb) - continue; + + ITERATE_RDEV(mddev,rdev,tmp) { + __u64 ev1, ev2; + /* + * if the checksum is invalid, use the superblock + * only as a last resort. (decrease its age by + * one event) + */ + if (calc_sb_csum(rdev->sb) != rdev->sb->sb_csum) { + __u64 ev = get_unaligned(&rdev->sb->events); + if (ev != (__u64)0) { + --ev; + put_unaligned(ev,&rdev->sb->events); + } + } + + printk("%s's event counter: %08lx\n", partition_name(rdev->dev), + (unsigned long)get_unaligned(&rdev->sb->events)); if (!freshest) { - freshest = disks[i].sb; + freshest = rdev; continue; } /* * Find the newest superblock version */ - if (disks[i].sb->utime != freshest->utime) { + ev1 = get_unaligned(&rdev->sb->events); + ev2 = get_unaligned(&freshest->sb->events); + if (ev1 != ev2) { out_of_date = 1; - if (disks[i].sb->utime > freshest->utime) - freshest = disks[i].sb; + if (ev1 > ev2) + freshest = rdev; } } - if (out_of_date) + if (out_of_date) { printk(OUT_OF_DATE); - memcpy (sb, freshest, sizeof(*freshest)); + printk("freshest: %s\n", partition_name(freshest->dev)); + } + memcpy (sb, freshest->sb, sizeof(*sb)); /* - * Check if we can support this RAID array + * at this point we have picked the 'best' superblock + * from all available superblocks. + * now we validate this superblock and kick out possibly + * failed disks. */ - if (sb->major_version != MD_MAJOR_VERSION || - sb->minor_version > MD_MINOR_VERSION) { - - printk (OLD_VERSION, kdevname(MKDEV(MD_MAJOR, minor)), - sb->major_version, sb->minor_version, - sb->patch_version); - goto abort; + ITERATE_RDEV(mddev,rdev,tmp) { + /* + * Kick all non-fresh devices faulty + */ + __u64 ev1, ev2; + ev1 = get_unaligned(&rdev->sb->events); + ev2 = get_unaligned(&sb->events); + ++ev1; + if (ev1 < ev2) { + printk("md: kicking non-fresh %s from array!\n", + partition_name(rdev->dev)); + kick_rdev_from_array(rdev); + continue; + } } /* - * We need to add this as a superblock option. + * Fix up changed device names ... but only if this disk has a + * recent update time. Use faulty checksum ones too. 
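+ * ('Recent' means an event count equal to the freshest superblock's, or lagging it by exactly one; anything older was already kicked as non-fresh above.)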
*/ -#if SUPPORT_RECONSTRUCTION - if (sb->state != (1 << MD_SB_CLEAN)) { - if (sb->level == 1) { - printk (NOT_CLEAN, kdevname(MKDEV(MD_MAJOR, minor))); + ITERATE_RDEV(mddev,rdev,tmp) { + __u64 ev1, ev2, ev3; + if (rdev->faulty) { /* REMOVEME */ + MD_BUG(); goto abort; - } else - printk (NOT_CLEAN_IGNORE, kdevname(MKDEV(MD_MAJOR, minor))); + } + ev1 = get_unaligned(&rdev->sb->events); + ev2 = get_unaligned(&sb->events); + ev3 = ev2; + --ev3; + if ((rdev->dev != rdev->old_dev) && + ((ev1 == ev2) || (ev1 == ev3))) { + mdp_disk_t *desc; + + printk("md: device name has changed from %s to %s since last import!\n", partition_name(rdev->old_dev), partition_name(rdev->dev)); + if (rdev->desc_nr == -1) { + MD_BUG(); + goto abort; + } + desc = &sb->disks[rdev->desc_nr]; + if (rdev->old_dev != MKDEV(desc->major, desc->minor)) { + MD_BUG(); + goto abort; + } + desc->major = MAJOR(rdev->dev); + desc->minor = MINOR(rdev->dev); + desc = &rdev->sb->this_disk; + desc->major = MAJOR(rdev->dev); + desc->minor = MINOR(rdev->dev); + } } -#else - if (sb->state != (1 << MD_SB_CLEAN)) { - printk (NOT_CLEAN, kdevname(MKDEV(MD_MAJOR, minor))); - goto abort; + + /* + * Remove unavailable and faulty devices ... + * + * note that if an array becomes completely unrunnable due to + * missing devices, we do not write the superblock back, so the + * administrator has a chance to fix things up. The removal thus + * only happens if it's nonfatal to the contents of the array. + */ + for (i = 0; i < MD_SB_DISKS; i++) { + int found; + mdp_disk_t *desc; + kdev_t dev; + + desc = sb->disks + i; + dev = MKDEV(desc->major, desc->minor); + + /* + * We kick faulty devices/descriptors immediately. + */ + if (disk_faulty(desc)) { + found = 0; + ITERATE_RDEV(mddev,rdev,tmp) { + if (rdev->desc_nr != desc->number) + continue; + printk("md%d: kicking faulty %s!\n", + mdidx(mddev),partition_name(rdev->dev)); + kick_rdev_from_array(rdev); + found = 1; + break; + } + if (!found) { + if (dev == MKDEV(0,0)) + continue; + printk("md%d: removing former faulty %s!\n", + mdidx(mddev), partition_name(dev)); + } + remove_descriptor(desc, sb); + continue; + } + + if (dev == MKDEV(0,0)) + continue; + /* + * Is this device present in the rdev ring? + */ + found = 0; + ITERATE_RDEV(mddev,rdev,tmp) { + if (rdev->desc_nr == desc->number) { + found = 1; + break; + } + } + if (found) + continue; + + printk("md%d: former device %s is unavailable, removing from array!\n", mdidx(mddev), partition_name(dev)); + remove_descriptor(desc, sb); } -#endif /* SUPPORT_RECONSTRUCTION */ - switch (sb->level) { - case 1: - md_size[minor] = sb->size; - md_maxreadahead[minor] = MD_DEFAULT_DISK_READAHEAD; - break; - case 4: - case 5: - md_size[minor] = sb->size * (sb->raid_disks - 1); - md_maxreadahead[minor] = MD_DEFAULT_DISK_READAHEAD * (sb->raid_disks - 1); - break; - default: - printk (UNKNOWN_LEVEL, kdevname(MKDEV(MD_MAJOR, minor)), - sb->level); + /* + * Double check whether all devices mentioned in the + * superblock are in the rdev ring. + */ + for (i = 0; i < MD_SB_DISKS; i++) { + mdp_disk_t *desc; + kdev_t dev; + + desc = sb->disks + i; + dev = MKDEV(desc->major, desc->minor); + + if (dev == MKDEV(0,0)) + continue; + + if (disk_faulty(desc)) { + MD_BUG(); goto abort; + } + + rdev = find_rdev(mddev, dev); + if (!rdev) { + MD_BUG(); + goto abort; + } + } + + /* + * Do a final reality check. + */ + ITERATE_RDEV(mddev,rdev,tmp) { + if (rdev->desc_nr == -1) { + MD_BUG(); + goto abort; + } + /* + * is the desc_nr unique? 
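+ * (This pass and the device-uniqueness pass below are O(n^2) over the rdev ring, which is cheap at MD_SB_DISKS scale; a duplicate here can only mean corrupted superblocks, hence the MD_BUG().)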
+ */ + ITERATE_RDEV(mddev,rdev2,tmp2) { + if ((rdev2 != rdev) && + (rdev2->desc_nr == rdev->desc_nr)) { + MD_BUG(); + goto abort; + } + } + /* + * is the device unique? + */ + ITERATE_RDEV(mddev,rdev2,tmp2) { + if ((rdev2 != rdev) && + (rdev2->dev == rdev->dev)) { + MD_BUG(); + goto abort; + } + } + } + + /* + * Check if we can support this RAID array + */ + if (sb->major_version != MD_MAJOR_VERSION || + sb->minor_version > MD_MINOR_VERSION) { + + printk (OLD_VERSION, mdidx(mddev), sb->major_version, + sb->minor_version, sb->patch_version); + goto abort; + } + + if ((sb->state != (1 << MD_SB_CLEAN)) && ((sb->level == 1) || + (sb->level == 4) || (sb->level == 5))) + printk (NOT_CLEAN_IGNORE, mdidx(mddev)); + return 0; abort: - free_sb(mddev); return 1; } #undef INCONSISTENT #undef OUT_OF_DATE #undef OLD_VERSION -#undef NOT_CLEAN #undef OLD_LEVEL -int md_update_sb(int minor) +static int device_size_calculation (mddev_t * mddev) { - struct md_dev *mddev = md_dev + minor; - struct buffer_head *bh; - md_superblock_t *sb = mddev->sb; - struct real_dev *realdev; - kdev_t dev; - int i; - u32 sb_offset; + int data_disks = 0, persistent; + unsigned int readahead; + mdp_super_t *sb = mddev->sb; + struct md_list_head *tmp; + mdk_rdev_t *rdev; - sb->utime = CURRENT_TIME; - for (i = 0; i < mddev->nb_dev; i++) { - realdev = mddev->devices + i; - if (!realdev->sb) + /* + * Do device size calculation. Bail out if too small. + * (we have to do this after having validated chunk_size, + * because device size has to be modulo chunk_size) + */ + persistent = !mddev->sb->not_persistent; + ITERATE_RDEV(mddev,rdev,tmp) { + if (rdev->faulty) continue; - dev = realdev->dev; - sb_offset = realdev->sb_offset; - set_blocksize(dev, MD_SB_BYTES); - printk("md: updating raid superblock on device %s, sb_offset == %u\n", kdevname(dev), sb_offset); - bh = getblk(dev, sb_offset / MD_SB_BLOCKS, MD_SB_BYTES); - if (bh) { - sb = (md_superblock_t *) bh->b_data; - memcpy(sb, mddev->sb, MD_SB_BYTES); - memcpy(&sb->descriptor, sb->disks + realdev->sb->descriptor.number, MD_SB_DESCRIPTOR_WORDS * 4); - mark_buffer_uptodate(bh, 1); - mark_buffer_dirty(bh, 1); - ll_rw_block(WRITE, 1, &bh); - wait_on_buffer(bh); - bforget(bh); - fsync_dev(dev); - invalidate_buffers(dev); - } else - printk(KERN_ERR "md: getblk failed for device %s\n", kdevname(dev)); + if (rdev->size) { + MD_BUG(); + continue; + } + rdev->size = calc_dev_size(rdev->dev, mddev, persistent); + if (rdev->size < sb->chunk_size / 1024) { + printk (KERN_WARNING + "Dev %s smaller than chunk_size: %dk < %dk\n", + partition_name(rdev->dev), + rdev->size, sb->chunk_size / 1024); + return -EINVAL; + } } + + switch (sb->level) { + case -3: + data_disks = 1; + break; + case -2: + data_disks = 1; + break; + case -1: + zoned_raid_size(mddev); + data_disks = 1; + break; + case 0: + zoned_raid_size(mddev); + data_disks = sb->raid_disks; + break; + case 1: + data_disks = 1; + break; + case 4: + case 5: + data_disks = sb->raid_disks-1; + break; + default: + printk (UNKNOWN_LEVEL, mdidx(mddev), sb->level); + goto abort; + } + if (!md_size[mdidx(mddev)]) + md_size[mdidx(mddev)] = sb->size * data_disks; + + readahead = MD_READAHEAD; + if ((sb->level == 0) || (sb->level == 4) || (sb->level == 5)) + readahead = mddev->sb->chunk_size * 4 * data_disks; + if (readahead < data_disks * MAX_SECTORS*512*2) + readahead = data_disks * MAX_SECTORS*512*2; + else { + if (sb->level == -3) + readahead = 0; + } + md_maxreadahead[mdidx(mddev)] = readahead; + + printk(KERN_INFO "md%d: max total readahead window 
set to %dk\n", + mdidx(mddev), readahead/1024); + + printk(KERN_INFO + "md%d: %d data-disks, max readahead per data-disk: %dk\n", + mdidx(mddev), data_disks, readahead/data_disks/1024); return 0; +abort: + return 1; } -static int do_md_run (int minor, int repart) + +#define TOO_BIG_CHUNKSIZE KERN_ERR \ +"too big chunk_size: %d > %d\n" + +#define TOO_SMALL_CHUNKSIZE KERN_ERR \ +"too small chunk_size: %d < %ld\n" + +#define BAD_CHUNKSIZE KERN_ERR \ +"no chunksize specified, see 'man raidtab'\n" + +static int do_md_run (mddev_t * mddev) { - int pnum, i, min, factor, err; + int pnum, err; + int chunk_size; + struct md_list_head *tmp; + mdk_rdev_t *rdev; - if (!md_dev[minor].nb_dev) - return -EINVAL; - - if (md_dev[minor].pers) - return -EBUSY; - md_dev[minor].repartition=repart; + if (!mddev->nb_dev) { + MD_BUG(); + return -EINVAL; + } - if ((pnum=PERSONALITY(&md_dev[minor]) >> (PERSONALITY_SHIFT)) - >= MAX_PERSONALITY) - return -EINVAL; - - /* Only RAID-1 and RAID-5 can have MD devices as underlying devices */ - if (pnum != (RAID1 >> PERSONALITY_SHIFT) && pnum != (RAID5 >> PERSONALITY_SHIFT)){ - for (i = 0; i < md_dev [minor].nb_dev; i++) - if (MAJOR (md_dev [minor].devices [i].dev) == MD_MAJOR) - return -EINVAL; - } - if (!pers[pnum]) - { + if (mddev->pers) + return -EBUSY; + + /* + * Resize disks to align partitions size on a given + * chunk size. + */ + md_size[mdidx(mddev)] = 0; + + /* + * Analyze all RAID superblock(s) + */ + if (analyze_sbs(mddev)) { + MD_BUG(); + return -EINVAL; + } + + chunk_size = mddev->sb->chunk_size; + pnum = level_to_pers(mddev->sb->level); + + mddev->param.chunk_size = chunk_size; + mddev->param.personality = pnum; + + if (chunk_size > MAX_CHUNK_SIZE) { + printk(TOO_BIG_CHUNKSIZE, chunk_size, MAX_CHUNK_SIZE); + return -EINVAL; + } + /* + * chunk-size has to be a power of 2 and multiples of PAGE_SIZE + */ + if ( (1 << ffz(~chunk_size)) != chunk_size) { + MD_BUG(); + return -EINVAL; + } + if (chunk_size < PAGE_SIZE) { + printk(TOO_SMALL_CHUNKSIZE, chunk_size, PAGE_SIZE); + return -EINVAL; + } + + if (pnum >= MAX_PERSONALITY) { + MD_BUG(); + return -EINVAL; + } + + if ((pnum != RAID1) && (pnum != LINEAR) && !chunk_size) { + /* + * 'default chunksize' in the old md code used to + * be PAGE_SIZE, baaad. + * we abort here to be on the safe side. We dont + * want to continue the bad practice. + */ + printk(BAD_CHUNKSIZE); + return -EINVAL; + } + + if (!pers[pnum]) + { #ifdef CONFIG_KMOD - char module_name[80]; - sprintf (module_name, "md-personality-%d", pnum); - request_module (module_name); - if (!pers[pnum]) + char module_name[80]; + sprintf (module_name, "md-personality-%d", pnum); + request_module (module_name); + if (!pers[pnum]) #endif - return -EINVAL; - } + return -EINVAL; + } - factor = min = 1 << FACTOR_SHIFT(FACTOR((md_dev+minor))); + if (device_size_calculation(mddev)) + return -EINVAL; + + /* + * Drop all container device buffers, from now on + * the only valid external interface is through the md + * device. 
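The power-of-two check on chunk_size above leans on ffz(): ffz(~x) is the index of the lowest set bit of x, so 1 << ffz(~x) reconstructs x exactly when a single bit is set. A portable equivalent, for illustration:

#include <assert.h>

/* Nonzero x is a power of two iff clearing its lowest set bit
 * leaves nothing - equivalent to (1 << ffz(~x)) == x. */
static int is_power_of_two(unsigned int x)
{
	return x != 0 && (x & (x - 1)) == 0;
}

int main(void)
{
	assert(is_power_of_two(4096));		/* valid chunk size */
	assert(!is_power_of_two(12288));	/* 3 * 4096: rejected */
	return 0;
}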
+ */ + ITERATE_RDEV(mddev,rdev,tmp) { + if (rdev->faulty) + continue; + fsync_dev(rdev->dev); + invalidate_buffers(rdev->dev); + } - for (i=0; i<md_dev[minor].nb_dev; i++) - if (md_dev[minor].devices[i].size<min) - { - printk ("Dev %s smaller than %dk, cannot shrink\n", - partition_name (md_dev[minor].devices[i].dev), min); - return -EINVAL; - } - - for (i=0; i<md_dev[minor].nb_dev; i++) { - fsync_dev(md_dev[minor].devices[i].dev); - invalidate_buffers(md_dev[minor].devices[i].dev); - } + mddev->pers = pers[pnum]; - /* Resize devices according to the factor. It is used to align - partitions size on a given chunk size. */ - md_size[minor]=0; + err = mddev->pers->run(mddev); + if (err) { + printk("pers->run() failed ...\n"); + mddev->pers = NULL; + return -EINVAL; + } - /* - * Analyze the raid superblock - */ - if (analyze_sbs(minor, pnum)) - return -EINVAL; + mddev->sb->state &= ~(1 << MD_SB_CLEAN); + md_update_sb(mddev); - md_dev[minor].pers=pers[pnum]; - - if ((err=md_dev[minor].pers->run (minor, md_dev+minor))) - { - md_dev[minor].pers=NULL; - free_sb(md_dev + minor); - return (err); - } - - if (pnum != RAID0 >> PERSONALITY_SHIFT && pnum != LINEAR >> PERSONALITY_SHIFT) - { - md_dev[minor].sb->state &= ~(1 << MD_SB_CLEAN); - md_update_sb(minor); - } - - /* FIXME : We assume here we have blocks - that are twice as large as sectors. - THIS MAY NOT BE TRUE !!! */ - md_hd_struct[minor].start_sect=0; - md_hd_struct[minor].nr_sects=md_size[minor]<<1; + /* + * md_size has units of 1K blocks, which are + * twice as large as sectors. + */ + md_hd_struct[mdidx(mddev)].start_sect = 0; + md_hd_struct[mdidx(mddev)].nr_sects = md_size[mdidx(mddev)] << 1; - read_ahead[MD_MAJOR] = 128; - return (0); + read_ahead[MD_MAJOR] = 1024; + return (0); } -static int do_md_stop (int minor, struct inode *inode) +#undef TOO_BIG_CHUNKSIZE +#undef BAD_CHUNKSIZE + +#define OUT(x) do { err = (x); goto out; } while (0) + +static int restart_array (mddev_t *mddev) { - int i; - - if (inode->i_count>1 || md_dev[minor].busy>1) { + int err = 0; + + /* + * Complain if it has no devices + */ + if (!mddev->nb_dev) + OUT(-ENXIO); + + if (mddev->pers) { + if (!mddev->ro) + OUT(-EBUSY); + + mddev->ro = 0; + set_device_ro(mddev_to_kdev(mddev), 0); + + printk (KERN_INFO + "md%d switched to read-write mode.\n", mdidx(mddev)); /* - * ioctl : one open channel + * Kick recovery or resync if necessary */ - printk ("STOP_MD md%x failed : i_count=%d, busy=%d\n", - minor, inode->i_count, md_dev[minor].busy); - return -EBUSY; + md_recover_arrays(); + if (mddev->pers->restart_resync) + mddev->pers->restart_resync(mddev); + } else + err = -EINVAL; + +out: + return err; +} + +#define STILL_MOUNTED KERN_WARNING \ +"md: md%d still mounted.\n" + +static int do_md_stop (mddev_t * mddev, int ro) +{ + int err = 0, resync_interrupted = 0; + kdev_t dev = mddev_to_kdev(mddev); + + if (!ro && !fs_may_mount (dev)) { + printk (STILL_MOUNTED, mdidx(mddev)); + OUT(-EBUSY); } - if (md_dev[minor].pers) { + /* + * complain if it's already stopped + */ + if (!mddev->nb_dev) + OUT(-ENXIO); + + if (mddev->pers) { /* * It is safe to call stop here, it only frees private * data. Also, it tells us if a device is unstoppable * (eg. 
resyncing is in progress) */ - if (md_dev[minor].pers->stop (minor, md_dev+minor)) - return -EBUSY; + if (mddev->pers->stop_resync) + if (mddev->pers->stop_resync(mddev)) + resync_interrupted = 1; + + if (mddev->recovery_running) + md_interrupt_thread(md_recovery_thread); + /* - * The device won't exist anymore -> flush it now + * This synchronizes with signal delivery to the + * resync or reconstruction thread. It also nicely + * hangs the process if some reconstruction has not + * finished. */ - fsync_dev (inode->i_rdev); - invalidate_buffers (inode->i_rdev); - if (md_dev[minor].sb) { - md_dev[minor].sb->state |= 1 << MD_SB_CLEAN; - md_update_sb(minor); + down(&mddev->recovery_sem); + up(&mddev->recovery_sem); + + /* + * sync and invalidate buffers because we cannot kill the + * main thread with valid IO transfers still around. + * the kernel lock protects us from new requests being + * added after invalidate_buffers(). + */ + fsync_dev (mddev_to_kdev(mddev)); + fsync_dev (dev); + invalidate_buffers (dev); + + if (ro) { + if (mddev->ro) + OUT(-ENXIO); + mddev->ro = 1; + } else { + if (mddev->ro) + set_device_ro(dev, 0); + if (mddev->pers->stop(mddev)) { + if (mddev->ro) + set_device_ro(dev, 1); + OUT(-EBUSY); + } + if (mddev->ro) + mddev->ro = 0; + } + if (mddev->sb) { + /* + * mark it clean only if there was no resync + * interrupted. + */ + if (!mddev->recovery_running && !resync_interrupted) { + printk("marking sb clean...\n"); + mddev->sb->state |= 1 << MD_SB_CLEAN; + } + md_update_sb(mddev); } + if (ro) + set_device_ro(dev, 1); } - - /* Remove locks. */ - if (md_dev[minor].sb) - free_sb(md_dev + minor); - for (i=0; i<md_dev[minor].nb_dev; i++) - clear_inode (md_dev[minor].devices[i].inode); - - md_dev[minor].nb_dev=md_size[minor]=0; - md_hd_struct[minor].nr_sects=0; - md_dev[minor].pers=NULL; - - read_ahead[MD_MAJOR] = 128; - - return (0); + + /* + * Free resources if final stop + */ + if (!ro) { + export_array(mddev); + md_size[mdidx(mddev)] = 0; + md_hd_struct[mdidx(mddev)].nr_sects = 0; + free_mddev(mddev); + + printk (KERN_INFO "md%d stopped.\n", mdidx(mddev)); + } else + printk (KERN_INFO + "md%d switched to read-only mode.\n", mdidx(mddev)); +out: + return err; +} + +#undef OUT + +/* + * We have to safely support old arrays too. + */ +int detect_old_array (mdp_super_t *sb) +{ + if (sb->major_version > 0) + return 0; + if (sb->minor_version >= 90) + return 0; + + return -EINVAL; } -static int do_md_add (int minor, kdev_t dev) + +static void autorun_array (mddev_t *mddev) { + mdk_rdev_t *rdev; + struct md_list_head *tmp; + int err; + + if (mddev->disks.prev == &mddev->disks) { + MD_BUG(); + return; + } + + printk("running: "); + + ITERATE_RDEV(mddev,rdev,tmp) { + printk("<%s>", partition_name(rdev->dev)); + } + printk("\nnow!\n"); + + err = do_md_run (mddev); + if (err) { + printk("do_md_run() returned %d\n", err); + /* + * prevent the writeback of an unrunnable array + */ + mddev->sb_dirty = 0; + do_md_stop (mddev, 0); + } +} + +/* + * lets try to run arrays based on all disks that have arrived + * until now. (those are in the ->pending list) + * + * the method: pick the first pending disk, collect all disks with + * the same UUID, remove all from the pending list and put them into + * the 'same_array' list. Then order this list based on superblock + * update time (freshest comes first), kick out 'old' disks and + * compare superblocks. If everything's fine then run it. 
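A compressed view of that UUID-grouping step, with flat arrays standing in for the kernel's rdev lists (illustrative only):

#include <string.h>

struct pending_disk { unsigned char uuid[16]; int claimed; };

/*
 * Claim every pending disk that shares rdev0's UUID - the core of
 * autorun_devices() above. Returns the candidate count.
 */
static int collect_candidates(struct pending_disk *p, int n,
			      const struct pending_disk *rdev0,
			      struct pending_disk **out)
{
	int i, c = 0;

	for (i = 0; i < n; i++) {
		if (p[i].claimed)
			continue;
		if (!memcmp(p[i].uuid, rdev0->uuid, 16)) {
			p[i].claimed = 1;
			out[c++] = &p[i];
		}
	}
	return c;
}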
+ */ +static void autorun_devices (void) +{ + struct md_list_head candidates; + struct md_list_head *tmp; + mdk_rdev_t *rdev0, *rdev; + mddev_t *mddev; + kdev_t md_kdev; + + + printk("autorun ...\n"); + while (pending_raid_disks.next != &pending_raid_disks) { + rdev0 = md_list_entry(pending_raid_disks.next, + mdk_rdev_t, pending); + + printk("considering %s ...\n", partition_name(rdev0->dev)); + MD_INIT_LIST_HEAD(&candidates); + ITERATE_RDEV_PENDING(rdev,tmp) { + if (uuid_equal(rdev0, rdev)) { + if (!sb_equal(rdev0->sb, rdev->sb)) { + printk("%s has same UUID as %s, but superblocks differ ...\n", partition_name(rdev->dev), partition_name(rdev0->dev)); + continue; + } + printk(" adding %s ...\n", partition_name(rdev->dev)); + md_list_del(&rdev->pending); + md_list_add(&rdev->pending, &candidates); + } + } + /* + * now we have a set of devices, with all of them having + * mostly sane superblocks. It's time to allocate the + * mddev. + */ + md_kdev = MKDEV(MD_MAJOR, rdev0->sb->md_minor); + mddev = kdev_to_mddev(md_kdev); + if (mddev) { + printk("md%d already running, cannot run %s\n", + mdidx(mddev), partition_name(rdev0->dev)); + ITERATE_RDEV_GENERIC(candidates,pending,rdev,tmp) + export_rdev(rdev); + continue; + } + mddev = alloc_mddev(md_kdev); + printk("created md%d\n", mdidx(mddev)); + ITERATE_RDEV_GENERIC(candidates,pending,rdev,tmp) { + bind_rdev_to_array(rdev, mddev); + md_list_del(&rdev->pending); + MD_INIT_LIST_HEAD(&rdev->pending); + } + autorun_array(mddev); + } + printk("... autorun DONE.\n"); +} + +/* + * import RAID devices based on one partition + * if possible, the array gets run as well. + */ + +#define BAD_VERSION KERN_ERR \ +"md: %s has RAID superblock version 0.%d, autodetect needs v0.90 or higher\n" + +#define OUT_OF_MEM KERN_ALERT \ +"md: out of memory.\n" + +#define NO_DEVICE KERN_ERR \ +"md: disabled device %s\n" + +#define AUTOADD_FAILED KERN_ERR \ +"md: auto-adding devices to md%d FAILED (error %d).\n" + +#define AUTOADD_FAILED_USED KERN_ERR \ +"md: cannot auto-add device %s to md%d, already used.\n" + +#define AUTORUN_FAILED KERN_ERR \ +"md: auto-running md%d FAILED (error %d).\n" + +#define MDDEV_BUSY KERN_ERR \ +"md: cannot auto-add to md%d, already running.\n" + +#define AUTOADDING KERN_INFO \ +"md: auto-adding devices to md%d, based on %s's superblock.\n" + +#define AUTORUNNING KERN_INFO \ +"md: auto-running md%d.\n" + +static int autostart_array (kdev_t startdev) +{ + int err = -EINVAL, i; + mdp_super_t *sb = NULL; + mdk_rdev_t *start_rdev = NULL, *rdev; + + if (md_import_device(startdev, 1)) { + printk("could not import %s!\n", partition_name(startdev)); + goto abort; + } + + start_rdev = find_rdev_all(startdev); + if (!start_rdev) { + MD_BUG(); + goto abort; + } + if (start_rdev->faulty) { + printk("can not autostart based on faulty %s!\n", + partition_name(startdev)); + goto abort; + } + md_list_add(&start_rdev->pending, &pending_raid_disks); + + sb = start_rdev->sb; + + err = detect_old_array(sb); + if (err) { + printk("array version is too old to be autostarted, use raidtools 0.90 mkraid --upgrade\nto upgrade the array without data loss!\n"); + goto abort; + } + + for (i = 0; i < MD_SB_DISKS; i++) { + mdp_disk_t *desc; + kdev_t dev; + + desc = sb->disks + i; + dev = MKDEV(desc->major, desc->minor); + + if (dev == MKDEV(0,0)) + continue; + if (dev == startdev) + continue; + if (md_import_device(dev, 1)) { + printk("could not import %s, trying to run array nevertheless.\n", partition_name(dev)); + continue; + } + rdev = find_rdev_all(dev); + if (!rdev) { + 
MD_BUG(); + goto abort; + } + md_list_add(&rdev->pending, &pending_raid_disks); + } + + /* + * possibly return codes + */ + autorun_devices(); + return 0; + +abort: + if (start_rdev) + export_rdev(start_rdev); + return err; +} + +#undef BAD_VERSION +#undef OUT_OF_MEM +#undef NO_DEVICE +#undef AUTOADD_FAILED_USED +#undef AUTOADD_FAILED +#undef AUTORUN_FAILED +#undef AUTOADDING +#undef AUTORUNNING + +struct { + int set; + int noautodetect; + +} raid_setup_args md__initdata = { 0, 0 }; + +/* + * Searches all registered partitions for autorun RAID arrays + * at boot time. + */ +void md__init autodetect_raid(void) +{ +#ifdef CONFIG_AUTODETECT_RAID + struct gendisk *disk; + mdk_rdev_t *rdev; int i; - int hot_add=0; - struct real_dev *realdev; - if (md_dev[minor].nb_dev==MAX_REAL) + if (raid_setup_args.noautodetect) { + printk(KERN_INFO "skipping autodetection of RAID arrays\n"); + return; + } + printk(KERN_INFO "autodetecting RAID arrays\n"); + + for (disk = gendisk_head ; disk ; disk = disk->next) { + for (i = 0; i < disk->max_p*disk->nr_real; i++) { + kdev_t dev = MKDEV(disk->major,i); + + if (disk->part[i].type != LINUX_RAID_PARTITION) + continue; + + if (md_import_device(dev,1)) { + printk(KERN_ALERT "could not import %s!\n", + partition_name(dev)); + continue; + } + /* + * Sanity checks: + */ + rdev = find_rdev_all(dev); + if (!rdev) { + MD_BUG(); + continue; + } + if (rdev->faulty) { + MD_BUG(); + continue; + } + md_list_add(&rdev->pending, &pending_raid_disks); + } + } + + autorun_devices(); +#endif +} + +static int get_version (void * arg) +{ + mdu_version_t ver; + + ver.major = MD_MAJOR_VERSION; + ver.minor = MD_MINOR_VERSION; + ver.patchlevel = MD_PATCHLEVEL_VERSION; + + if (md_copy_to_user(arg, &ver, sizeof(ver))) + return -EFAULT; + + return 0; +} + +#define SET_FROM_SB(x) info.x = mddev->sb->x +static int get_array_info (mddev_t * mddev, void * arg) +{ + mdu_array_info_t info; + + if (!mddev->sb) + return -EINVAL; + + SET_FROM_SB(major_version); + SET_FROM_SB(minor_version); + SET_FROM_SB(patch_version); + SET_FROM_SB(ctime); + SET_FROM_SB(level); + SET_FROM_SB(size); + SET_FROM_SB(nr_disks); + SET_FROM_SB(raid_disks); + SET_FROM_SB(md_minor); + SET_FROM_SB(not_persistent); + + SET_FROM_SB(utime); + SET_FROM_SB(state); + SET_FROM_SB(active_disks); + SET_FROM_SB(working_disks); + SET_FROM_SB(failed_disks); + SET_FROM_SB(spare_disks); + + SET_FROM_SB(layout); + SET_FROM_SB(chunk_size); + + if (md_copy_to_user(arg, &info, sizeof(info))) + return -EFAULT; + + return 0; +} +#undef SET_FROM_SB + +#define SET_FROM_SB(x) info.x = mddev->sb->disks[nr].x +static int get_disk_info (mddev_t * mddev, void * arg) +{ + mdu_disk_info_t info; + unsigned int nr; + + if (!mddev->sb) return -EINVAL; - if (!fs_may_mount (dev)) + if (md_copy_from_user(&info, arg, sizeof(info))) + return -EFAULT; + + nr = info.number; + if (nr >= mddev->sb->nr_disks) + return -EINVAL; + + SET_FROM_SB(major); + SET_FROM_SB(minor); + SET_FROM_SB(raid_disk); + SET_FROM_SB(state); + + if (md_copy_to_user(arg, &info, sizeof(info))) + return -EFAULT; + + return 0; +} +#undef SET_FROM_SB + +#define SET_SB(x) mddev->sb->disks[nr].x = info.x + +static int add_new_disk (mddev_t * mddev, void * arg) +{ + int err, size, persistent; + mdu_disk_info_t info; + mdk_rdev_t *rdev; + unsigned int nr; + kdev_t dev; + + if (!mddev->sb) + return -EINVAL; + + if (md_copy_from_user(&info, arg, sizeof(info))) + return -EFAULT; + + nr = info.number; + if (nr >= mddev->sb->nr_disks) + return -EINVAL; + + dev = MKDEV(info.major,info.minor); + + if 
(find_rdev_all(dev)) { + printk("device %s already used in a RAID array!\n", + partition_name(dev)); return -EBUSY; + } - if (blk_size[MAJOR(dev)] == NULL || blk_size[MAJOR(dev)][MINOR(dev)] == 0) { - printk("md_add(): zero device size, huh, bailing out.\n"); + SET_SB(number); + SET_SB(major); + SET_SB(minor); + SET_SB(raid_disk); + SET_SB(state); + + if ((info.state & (1<<MD_DISK_FAULTY))==0) { + err = md_import_device (dev, 0); + if (err) { + printk("md: error, md_import_device() returned %d\n", err); + return -EINVAL; + } + rdev = find_rdev_all(dev); + if (!rdev) { + MD_BUG(); + return -EINVAL; + } + + rdev->old_dev = dev; + rdev->desc_nr = info.number; + + bind_rdev_to_array(rdev, mddev); + + persistent = !mddev->sb->not_persistent; + if (!persistent) + printk("nonpersistent superblock ...\n"); + if (!mddev->sb->chunk_size) + printk("no chunksize?\n"); + + size = calc_dev_size(dev, mddev, persistent); + rdev->sb_offset = calc_dev_sboffset(dev, mddev, persistent); + + if (!mddev->sb->size || (mddev->sb->size > size)) + mddev->sb->size = size; + } + + /* + * sync all other superblocks with the main superblock + */ + sync_sbs(mddev); + + return 0; +} +#undef SET_SB + +static int hot_remove_disk (mddev_t * mddev, kdev_t dev) +{ + int err; + mdk_rdev_t *rdev; + mdp_disk_t *disk; + + if (!mddev->pers) + return -ENODEV; + + printk("trying to remove %s from md%d ... \n", + partition_name(dev), mdidx(mddev)); + + if (!mddev->pers->diskop) { + printk("md%d: personality does not support diskops!\n", + mdidx(mddev)); return -EINVAL; } - if (md_dev[minor].pers) { - /* - * The array is already running, hot-add the drive, or - * bail out: - */ - if (!md_dev[minor].pers->hot_add_disk) - return -EBUSY; - else - hot_add=1; + rdev = find_rdev(mddev, dev); + if (!rdev) + return -ENXIO; + + if (rdev->desc_nr == -1) { + MD_BUG(); + return -EINVAL; + } + disk = &mddev->sb->disks[rdev->desc_nr]; + if (disk_active(disk)) + goto busy; + if (disk_removed(disk)) { + MD_BUG(); + return -EINVAL; + } + + err = mddev->pers->diskop(mddev, &disk, DISKOP_HOT_REMOVE_DISK); + if (err == -EBUSY) + goto busy; + if (err) { + MD_BUG(); + return -EINVAL; + } + + remove_descriptor(disk, mddev->sb); + kick_rdev_from_array(rdev); + mddev->sb_dirty = 1; + md_update_sb(mddev); + + return 0; +busy: + printk("cannot remove active disk %s from md%d ... \n", + partition_name(dev), mdidx(mddev)); + return -EBUSY; +} + +static int hot_add_disk (mddev_t * mddev, kdev_t dev) +{ + int i, err, persistent; + unsigned int size; + mdk_rdev_t *rdev; + mdp_disk_t *disk; + + if (!mddev->pers) + return -ENODEV; + + printk("trying to hot-add %s to md%d ... \n", + partition_name(dev), mdidx(mddev)); + + if (!mddev->pers->diskop) { + printk("md%d: personality does not support diskops!\n", + mdidx(mddev)); + return -EINVAL; + } + + persistent = !mddev->sb->not_persistent; + size = calc_dev_size(dev, mddev, persistent); + + if (size < mddev->sb->size) { + printk("md%d: disk size %d blocks < array size %d\n", + mdidx(mddev), size, mddev->sb->size); + return -ENOSPC; + } + + rdev = find_rdev(mddev, dev); + if (rdev) + return -EBUSY; + + err = md_import_device (dev, 0); + if (err) { + printk("md: error, md_import_device() returned %d\n", err); + return -EINVAL; + } + rdev = find_rdev_all(dev); + if (!rdev) { + MD_BUG(); + return -EINVAL; + } + if (rdev->faulty) { + printk("md: can not hot-add faulty %s disk to md%d!\n", + partition_name(dev), mdidx(mddev)); + err = -EINVAL; + goto abort_export; } + bind_rdev_to_array(rdev, mddev); /* - * Careful. 
We cannot increase nb_dev for a running array. + * The rest should better be atomic, we can have disk failures + * noticed in interrupt contexts ... */ - i=md_dev[minor].nb_dev; - realdev = &md_dev[minor].devices[i]; - realdev->dev=dev; - - /* Lock the device by inserting a dummy inode. This doesn't - smell very good, but I need to be consistent with the - mount stuff, specially with fs_may_mount. If someone have - a better idea, please help ! */ - - realdev->inode=get_empty_inode (); - if (!realdev->inode) - return -ENOMEM; - realdev->inode->i_dev=dev; /* don't care about other fields */ - insert_inode_hash (realdev->inode); - - /* Sizes are now rounded at run time */ - -/* md_dev[minor].devices[i].size=gen_real->sizes[MINOR(dev)]; HACKHACK*/ + rdev->old_dev = dev; + rdev->size = size; + rdev->sb_offset = calc_dev_sboffset(dev, mddev, persistent); - realdev->size=blk_size[MAJOR(dev)][MINOR(dev)]; + disk = mddev->sb->disks + mddev->sb->raid_disks; + for (i = mddev->sb->raid_disks; i < MD_SB_DISKS; i++) { + disk = mddev->sb->disks + i; - if (hot_add) { - /* - * Check the superblock for consistency. - * The personality itself has to check whether it's getting - * added with the proper flags. The personality has to be - * checked too. ;) - */ - if (analyze_one_sb (realdev)) - return -EINVAL; + if (!disk->major && !disk->minor) + break; + if (disk_removed(disk)) + break; + } + if (i == MD_SB_DISKS) { + printk("md%d: can not hot-add to full array!\n", mdidx(mddev)); + err = -EBUSY; + goto abort_unbind_export; + } + + if (disk_removed(disk)) { /* - * hot_add has to bump up nb_dev itself + * reuse slot */ - if (md_dev[minor].pers->hot_add_disk (&md_dev[minor], dev)) { - /* - * FIXME: here we should free up the inode and stuff - */ - printk ("FIXME\n"); - return -EINVAL; + if (disk->number != i) { + MD_BUG(); + err = -EINVAL; + goto abort_unbind_export; } - } else - md_dev[minor].nb_dev++; + } else { + disk->number = i; + } - printk ("REGISTER_DEV %s to md%x done\n", partition_name(dev), minor); - return (0); + disk->raid_disk = disk->number; + disk->major = MAJOR(dev); + disk->minor = MINOR(dev); + + if (mddev->pers->diskop(mddev, &disk, DISKOP_HOT_ADD_DISK)) { + MD_BUG(); + err = -EINVAL; + goto abort_unbind_export; + } + + mark_disk_spare(disk); + mddev->sb->nr_disks++; + mddev->sb->spare_disks++; + mddev->sb->working_disks++; + + mddev->sb_dirty = 1; + + md_update_sb(mddev); + + /* + * Kick recovery, maybe this spare has to be added to the + * array immediately. 
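The descriptor search in hot_add_disk() above looks past the active members for either a never-used slot or a 'removed' slot that can be recycled. The same scan in isolation (simplified flags and an illustrative bound standing in for MD_SB_DISKS):

#define NR_SB_DISKS 27	/* illustrative stand-in for MD_SB_DISKS */

struct disk_desc { int major, minor, removed; };

/* First reusable descriptor index at or after raid_disks,
 * or -1 if the array is full. */
static int find_free_slot(const struct disk_desc *d, int raid_disks)
{
	int i;

	for (i = raid_disks; i < NR_SB_DISKS; i++) {
		if (!d[i].major && !d[i].minor)
			return i;	/* never used */
		if (d[i].removed)
			return i;	/* recycle removed slot */
	}
	return -1;
}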
+ */ + md_recover_arrays(); + + return 0; + +abort_unbind_export: + unbind_rdev_from_array(rdev); + +abort_export: + export_rdev(rdev); + return err; } -static int md_ioctl (struct inode *inode, struct file *file, - unsigned int cmd, unsigned long arg) +#define SET_SB(x) mddev->sb->x = info.x +static int set_array_info (mddev_t * mddev, void * arg) { - int minor, err; - struct hd_geometry *loc = (struct hd_geometry *) arg; + mdu_array_info_t info; - if (!capable(CAP_SYS_ADMIN)) - return -EACCES; + if (mddev->sb) { + printk("array md%d already has a superblock!\n", + mdidx(mddev)); + return -EBUSY; + } - if (((minor=MINOR(inode->i_rdev)) & 0x80) && - (minor & 0x7f) < MAX_PERSONALITY && - pers[minor & 0x7f] && - pers[minor & 0x7f]->ioctl) - return (pers[minor & 0x7f]->ioctl (inode, file, cmd, arg)); - - if (minor >= MAX_MD_DEV) - return -EINVAL; - - switch (cmd) - { - case REGISTER_DEV: - return do_md_add (minor, to_kdev_t ((dev_t) arg)); - - case START_MD: - return do_md_run (minor, (int) arg); - - case STOP_MD: - return do_md_stop (minor, inode); - - case BLKGETSIZE: /* Return device size */ - if (!arg) return -EINVAL; - err = put_user (md_hd_struct[MINOR(inode->i_rdev)].nr_sects, (long *) arg); - if (err) - return err; - break; - - - /* We have a problem here : there is no easy way to give a CHS - virtual geometry. We currently pretend that we have a 2 heads - 4 sectors (with a BIG number of cylinders...). This drives dosfs - just mad... ;-) */ - - case HDIO_GETGEO: - if (!loc) return -EINVAL; - err = put_user (2, (char *) &loc->heads); - if (err) - return err; - err = put_user (4, (char *) &loc->sectors); - if (err) - return err; - err = put_user (md_hd_struct[minor].nr_sects/8, (short *) &loc->cylinders); - if (err) - return err; - err = put_user (md_hd_struct[MINOR(inode->i_rdev)].start_sect, - (long *) &loc->start); - if (err) - return err; - break; - - case BLKROSET: - case BLKROGET: - case BLKRAGET: - case BLKRASET: - case BLKFLSBUF: - return blk_ioctl(inode->i_rdev, cmd, arg); - - default: - return -EINVAL; - } - - return (0); + if (md_copy_from_user(&info, arg, sizeof(info))) + return -EFAULT; + + if (alloc_array_sb(mddev)) + return -ENOMEM; + + mddev->sb->major_version = MD_MAJOR_VERSION; + mddev->sb->minor_version = MD_MINOR_VERSION; + mddev->sb->patch_version = MD_PATCHLEVEL_VERSION; + mddev->sb->ctime = CURRENT_TIME; + + SET_SB(level); + SET_SB(size); + SET_SB(nr_disks); + SET_SB(raid_disks); + SET_SB(md_minor); + SET_SB(not_persistent); + + SET_SB(state); + SET_SB(active_disks); + SET_SB(working_disks); + SET_SB(failed_disks); + SET_SB(spare_disks); + + SET_SB(layout); + SET_SB(chunk_size); + + mddev->sb->md_magic = MD_SB_MAGIC; + + /* + * Generate a 128 bit UUID + */ + get_random_bytes(&mddev->sb->set_uuid0, 4); + get_random_bytes(&mddev->sb->set_uuid1, 4); + get_random_bytes(&mddev->sb->set_uuid2, 4); + get_random_bytes(&mddev->sb->set_uuid3, 4); + + return 0; } +#undef SET_SB -static int md_open (struct inode *inode, struct file *file) +static int set_disk_info (mddev_t * mddev, void * arg) { - int minor=MINOR(inode->i_rdev); + printk("not yet"); + return -EINVAL; +} - md_dev[minor].busy++; - return (0); /* Always succeed */ +static int clear_array (mddev_t * mddev) +{ + printk("not yet"); + return -EINVAL; } +static int write_raid_info (mddev_t * mddev) +{ + printk("not yet"); + return -EINVAL; +} -static int md_release (struct inode *inode, struct file *file) +static int protect_array (mddev_t * mddev) { - int minor=MINOR(inode->i_rdev); - md_dev[minor].busy--; - return 
0; + printk("not yet"); + return -EINVAL; } -static struct block_device_operations md_fops= +static int unprotect_array (mddev_t * mddev) { - open: md_open, - release: md_release, - ioctl: md_ioctl, -}; + printk("not yet"); + return -EINVAL; +} -int md_map (int minor, kdev_t *rdev, unsigned long *rsector, unsigned long size) +static int set_disk_faulty (mddev_t *mddev, kdev_t dev) { - if ((unsigned int) minor >= MAX_MD_DEV) - { - printk ("Bad md device %d\n", minor); - return (-1); - } - - if (!md_dev[minor].pers) - { - printk ("Oops ! md%d not running, giving up !\n", minor); - return (-1); - } + int ret; - return (md_dev[minor].pers->map(md_dev+minor, rdev, rsector, size)); + fsync_dev(mddev_to_kdev(mddev)); + ret = md_error(mddev_to_kdev(mddev), dev); + return ret; } - -int md_make_request (int minor, int rw, struct buffer_head * bh) + +static int md_ioctl (struct inode *inode, struct file *file, + unsigned int cmd, unsigned long arg) { - if (md_dev [minor].pers->make_request) { - if (buffer_locked(bh)) - return 0; - set_bit(BH_Lock, &bh->b_state); - if (rw == WRITE) { - if (!buffer_dirty(bh)) { - bh->b_end_io(bh, test_bit(BH_Uptodate, &bh->b_state)); - return 0; + unsigned int minor; + int err = 0; + struct hd_geometry *loc = (struct hd_geometry *) arg; + mddev_t *mddev = NULL; + kdev_t dev; + + if (!md_capable_admin()) + return -EACCES; + + dev = inode->i_rdev; + minor = MINOR(dev); + if (minor >= MAX_MD_DEVS) + return -EINVAL; + + /* + * Commands dealing with the RAID driver but not any + * particular array: + */ + switch (cmd) + { + case RAID_VERSION: + err = get_version((void *)arg); + goto done; + + case PRINT_RAID_DEBUG: + err = 0; + md_print_devices(); + goto done_unlock; + + case BLKGETSIZE: /* Return device size */ + if (!arg) { + err = -EINVAL; + goto abort; } - } - if (rw == READ || rw == READA) { - if (buffer_uptodate(bh)) { - bh->b_end_io(bh, test_bit(BH_Uptodate, &bh->b_state)); - return 0; + err = md_put_user(md_hd_struct[minor].nr_sects, + (long *) arg); + goto done; + + case BLKFLSBUF: + fsync_dev(dev); + invalidate_buffers(dev); + goto done; + + case BLKRASET: + if (arg > 0xff) { + err = -EINVAL; + goto abort; + } + read_ahead[MAJOR(dev)] = arg; + goto done; + + case BLKRAGET: + if (!arg) { + err = -EINVAL; + goto abort; + } + err = md_put_user (read_ahead[ + MAJOR(dev)], (long *) arg); + goto done; + default: + } + + /* + * Commands creating/starting a new array: + */ + + mddev = kdev_to_mddev(dev); + + switch (cmd) + { + case SET_ARRAY_INFO: + case START_ARRAY: + if (mddev) { + printk("array md%d already exists!\n", + mdidx(mddev)); + err = -EEXIST; + goto abort; + } + default: + } + + switch (cmd) + { + case SET_ARRAY_INFO: + mddev = alloc_mddev(dev); + if (!mddev) { + err = -ENOMEM; + goto abort; + } + /* + * alloc_mddev() should possibly self-lock. + */ + err = lock_mddev(mddev); + if (err) { + printk("ioctl, reason %d, cmd %d\n", err, cmd); + goto abort; + } + err = set_array_info(mddev, (void *)arg); + if (err) { + printk("couldnt set array info. %d\n", err); + goto abort; + } + goto done_unlock; + + case START_ARRAY: + /* + * possibly make it lock the array ... 
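The new md_ioctl() in this hunk funnels every exit through paired done/abort-style labels so the per-array lock is taken and dropped exactly once. The idiom in miniature (stub helpers, illustrative only):

/* Stubs standing in for lock_mddev()/unlock_mddev() and the work. */
static int lock_dev(void)	{ return 0; }
static void unlock_dev(void)	{ }
static int do_cmd(void)		{ return 0; }

static int dispatch(void)
{
	int err;

	err = lock_dev();
	if (err)
		goto abort;		/* never locked: plain exit */
	err = do_cmd();
	goto done_unlock;		/* success and failure both unlock */

done_unlock:
	unlock_dev();
abort:
	return err;
}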
+ */ + err = autostart_array((kdev_t)arg); + if (err) { + printk("autostart %s failed!\n", + partition_name((kdev_t)arg)); + goto abort; + } + goto done; + + default: + } + + /* + * Commands querying/configuring an existing array: + */ + + if (!mddev) { + err = -ENODEV; + goto abort; + } + err = lock_mddev(mddev); + if (err) { + printk("ioctl lock interrupted, reason %d, cmd %d\n",err, cmd); + goto abort; + } + + /* + * Commands even a read-only array can execute: + */ + switch (cmd) + { + case GET_ARRAY_INFO: + err = get_array_info(mddev, (void *)arg); + goto done_unlock; + + case GET_DISK_INFO: + err = get_disk_info(mddev, (void *)arg); + goto done_unlock; + + case RESTART_ARRAY_RW: + err = restart_array(mddev); + goto done_unlock; + + case STOP_ARRAY: + err = do_md_stop (mddev, 0); + goto done_unlock; + + case STOP_ARRAY_RO: + err = do_md_stop (mddev, 1); + goto done_unlock; + + /* + * We have a problem here : there is no easy way to give a CHS + * virtual geometry. We currently pretend that we have a 2 heads + * 4 sectors (with a BIG number of cylinders...). This drives + * dosfs just mad... ;-) + */ + case HDIO_GETGEO: + if (!loc) { + err = -EINVAL; + goto abort_unlock; } + err = md_put_user (2, (char *) &loc->heads); + if (err) + goto abort_unlock; + err = md_put_user (4, (char *) &loc->sectors); + if (err) + goto abort_unlock; + err = md_put_user (md_hd_struct[mdidx(mddev)].nr_sects/8, + (short *) &loc->cylinders); + if (err) + goto abort_unlock; + err = md_put_user (md_hd_struct[minor].start_sect, + (long *) &loc->start); + goto done_unlock; + } + + /* + * The remaining ioctls are changing the state of the + * superblock, so we do not allow read-only arrays + * here: + */ + if (mddev->ro) { + err = -EROFS; + goto abort_unlock; + } + + switch (cmd) + { + case CLEAR_ARRAY: + err = clear_array(mddev); + goto done_unlock; + + case ADD_NEW_DISK: + err = add_new_disk(mddev, (void *)arg); + goto done_unlock; + + case HOT_REMOVE_DISK: + err = hot_remove_disk(mddev, (kdev_t)arg); + goto done_unlock; + + case HOT_ADD_DISK: + err = hot_add_disk(mddev, (kdev_t)arg); + goto done_unlock; + + case SET_DISK_INFO: + err = set_disk_info(mddev, (void *)arg); + goto done_unlock; + + case WRITE_RAID_INFO: + err = write_raid_info(mddev); + goto done_unlock; + + case UNPROTECT_ARRAY: + err = unprotect_array(mddev); + goto done_unlock; + + case PROTECT_ARRAY: + err = protect_array(mddev); + goto done_unlock; + + case SET_DISK_FAULTY: + err = set_disk_faulty(mddev, (kdev_t)arg); + goto done_unlock; + + case RUN_ARRAY: + { + mdu_param_t param; + + err = md_copy_from_user(¶m, (mdu_param_t *)arg, + sizeof(param)); + if (err) + goto abort_unlock; + + err = do_md_run (mddev); + /* + * we have to clean up the mess if + * the array cannot be run for some + * reason ... 
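The made-up HDIO_GETGEO geometry above follows from heads * sectors = 2 * 4 = 8 sectors per cylinder, so the reported cylinder count is simply nr_sects / 8. Worked through for a 1 GB array:

#include <stdio.h>

int main(void)
{
	unsigned long nr_sects = 2097152;	/* 1 GB in 512-byte sectors */
	unsigned int heads = 2, sectors = 4;

	/* cylinders = nr_sects / (heads * sectors) = 262144 */
	printf("C/H/S = %lu/%u/%u\n",
	       nr_sects / (heads * sectors), heads, sectors);
	return 0;
}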
+ */ + if (err) { + mddev->sb_dirty = 0; + do_md_stop (mddev, 0); + } + goto done_unlock; } - return (md_dev[minor].pers->make_request(md_dev+minor, rw, bh)); - } else { - make_request (MAJOR(bh->b_rdev), rw, bh); - return 0; + + default: + printk(KERN_WARNING "%s(pid %d) used obsolete MD ioctl, upgrade your software to use new ictls.\n", current->comm, current->pid); + err = -EINVAL; + goto abort_unlock; } + +done_unlock: +abort_unlock: + if (mddev) + unlock_mddev(mddev); + else + printk("huh11?\n"); + + return err; +done: + if (err) + printk("huh12?\n"); +abort: + return err; } -static void do_md_request (request_queue_t * q) +static int md_open (struct inode *inode, struct file *file) { - printk ("Got md request, not good..."); - return; + /* + * Always succeed + */ + return (0); } -void md_wakeup_thread(struct md_thread *thread) +static struct block_device_operations md_fops= { + open: md_open, + ioctl: md_ioctl, +}; + + +int md_thread(void * arg) +{ + mdk_thread_t *thread = arg; + + md_lock_kernel(); + exit_mm(current); + exit_files(current); + exit_fs(current); + + /* + * Detach thread + */ + sys_setsid(); + sprintf(current->comm, thread->name); + md_init_signals(); + md_flush_signals(); + thread->tsk = current; + + /* + * md_thread is a 'system-thread', it's priority should be very + * high. We avoid resource deadlocks individually in each + * raid personality. (RAID5 does preallocation) We also use RR and + * the very same RT priority as kswapd, thus we will never get + * into a priority inversion deadlock. + * + * we definitely have to have equal or higher priority than + * bdflush, otherwise bdflush will deadlock if there are too + * many dirty RAID5 blocks. + */ + current->policy = SCHED_OTHER; + current->priority = 40; +// md_unlock_kernel(); + + up(thread->sem); + + for (;;) { + DECLARE_WAITQUEUE(wait, current); + + add_wait_queue(&thread->wqueue, &wait); + if (!test_bit(THREAD_WAKEUP, &thread->flags)) { + set_task_state(current, TASK_INTERRUPTIBLE); + dprintk("thread %p went to sleep.\n", thread); + schedule(); + dprintk("thread %p woke up.\n", thread); + current->state = TASK_RUNNING; + } + remove_wait_queue(&thread->wqueue, &wait); + clear_bit(THREAD_WAKEUP, &thread->flags); + + if (thread->run) { + thread->run(thread->data); + run_task_queue(&tq_disk); + } else + break; + if (md_signal_pending(current)) { + printk("%8s(%d) flushing signals.\n", current->comm, + current->pid); + md_flush_signals(); + } + } + up(thread->sem); + return 0; +} + +void md_wakeup_thread(mdk_thread_t *thread) +{ + dprintk("waking up MD thread %p.\n", thread); set_bit(THREAD_WAKEUP, &thread->flags); wake_up(&thread->wqueue); } -struct md_thread *md_register_thread (void (*run) (void *), void *data) +mdk_thread_t *md_register_thread (void (*run) (void *), + void *data, const char *name) { - struct md_thread *thread = (struct md_thread *) - kmalloc(sizeof(struct md_thread), GFP_KERNEL); + mdk_thread_t *thread; int ret; DECLARE_MUTEX_LOCKED(sem); - if (!thread) return NULL; + thread = (mdk_thread_t *) kmalloc + (sizeof(mdk_thread_t), GFP_KERNEL); + if (!thread) + return NULL; - memset(thread, 0, sizeof(struct md_thread)); - init_waitqueue_head(&thread->wqueue); + memset(thread, 0, sizeof(mdk_thread_t)); + md_init_waitqueue_head(&thread->wqueue); thread->sem = &sem; thread->run = run; thread->data = data; + thread->name = name; ret = kernel_thread(md_thread, thread, 0); if (ret < 0) { kfree(thread); @@ -760,387 +2882,317 @@ struct md_thread *md_register_thread (void (*run) (void *), void *data) return 
thread; } -void md_unregister_thread (struct md_thread *thread) +void md_interrupt_thread (mdk_thread_t *thread) +{ + if (!thread->tsk) { + MD_BUG(); + return; + } + printk("interrupting MD-thread pid %d\n", thread->tsk->pid); + send_sig(SIGKILL, thread->tsk, 1); +} + +void md_unregister_thread (mdk_thread_t *thread) { DECLARE_MUTEX_LOCKED(sem); thread->sem = &sem; thread->run = NULL; - if (thread->tsk) - printk("Killing md_thread %d %p %s\n", - thread->tsk->pid, thread->tsk, thread->tsk->comm); - else - printk("Aiee. md_thread has 0 tsk\n"); - send_sig(SIGKILL, thread->tsk, 1); - printk("downing on %p\n", &sem); + thread->name = NULL; + if (!thread->tsk) { + MD_BUG(); + return; + } + md_interrupt_thread(thread); down(&sem); } -#define SHUTDOWN_SIGS (sigmask(SIGKILL)|sigmask(SIGINT)|sigmask(SIGTERM)) +void md_recover_arrays (void) +{ + if (!md_recovery_thread) { + MD_BUG(); + return; + } + md_wakeup_thread(md_recovery_thread); +} -int md_thread(void * arg) + +int md_error (kdev_t dev, kdev_t rdev) { - struct md_thread *thread = arg; + mddev_t *mddev = kdev_to_mddev(dev); + mdk_rdev_t * rrdev; + int rc; - lock_kernel(); - exit_mm(current); - exit_files(current); - exit_fs(current); - - current->session = 1; - current->pgrp = 1; - sprintf(current->comm, "md_thread"); - siginitsetinv(¤t->blocked, SHUTDOWN_SIGS); - thread->tsk = current; - up(thread->sem); + printk("md_error dev:(%d:%d), rdev:(%d:%d), (caller: %p,%p,%p,%p).\n",MAJOR(dev),MINOR(dev),MAJOR(rdev),MINOR(rdev), __builtin_return_address(0),__builtin_return_address(1),__builtin_return_address(2),__builtin_return_address(3)); - for (;;) { - cli(); - if (!test_bit(THREAD_WAKEUP, &thread->flags)) { - do { - spin_lock(¤t->sigmask_lock); - flush_signals(current); - spin_unlock(¤t->sigmask_lock); - interruptible_sleep_on(&thread->wqueue); - cli(); - if (test_bit(THREAD_WAKEUP, &thread->flags)) - break; - if (!thread->run) { - sti(); - up(thread->sem); - return 0; - } - } while (signal_pending(current)); - } - sti(); - clear_bit(THREAD_WAKEUP, &thread->flags); - if (thread->run) { - thread->run(thread->data); - run_task_queue(&tq_disk); + if (!mddev) { + MD_BUG(); + return 0; + } + rrdev = find_rdev(mddev, rdev); + mark_rdev_faulty(rrdev); + /* + * if recovery was running, stop it now. + */ + if (mddev->pers->stop_resync) + mddev->pers->stop_resync(mddev); + if (mddev->recovery_running) + md_interrupt_thread(md_recovery_thread); + if (mddev->pers->error_handler) { + rc = mddev->pers->error_handler(mddev, rdev); + md_recover_arrays(); + return rc; + } + return 0; +} + +static int status_unused (char * page) +{ + int sz = 0, i = 0; + mdk_rdev_t *rdev; + struct md_list_head *tmp; + + sz += sprintf(page + sz, "unused devices: "); + + ITERATE_RDEV_ALL(rdev,tmp) { + if (!rdev->same_set.next && !rdev->same_set.prev) { + /* + * The device is not yet used by any array. 
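The md_thread()/md_wakeup_thread() handshake above — set the THREAD_WAKEUP bit, wake the wait queue, and have the thread re-check the bit before sleeping — corresponds to a condition variable in userspace. A rough pthreads analogue (illustrative only, not kernel API):

#include <pthread.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  cond = PTHREAD_COND_INITIALIZER;
static int wakeup;			/* stands in for THREAD_WAKEUP */

void wake_worker(void)			/* cf. md_wakeup_thread() */
{
	pthread_mutex_lock(&lock);
	wakeup = 1;
	pthread_cond_signal(&cond);
	pthread_mutex_unlock(&lock);
}

void worker_wait(void)			/* cf. the sleep loop in md_thread() */
{
	pthread_mutex_lock(&lock);
	while (!wakeup)			/* re-check before sleeping */
		pthread_cond_wait(&cond, &lock);
	wakeup = 0;			/* cf. clear_bit(THREAD_WAKEUP, ...) */
	pthread_mutex_unlock(&lock);
}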
+ */ + i++; + sz += sprintf(page + sz, "%s ", + partition_name(rdev->dev)); } } + if (!i) + sz += sprintf(page + sz, "<none>"); + + sz += sprintf(page + sz, "\n"); + return sz; } -EXPORT_SYMBOL(md_size); -EXPORT_SYMBOL(md_maxreadahead); -EXPORT_SYMBOL(register_md_personality); -EXPORT_SYMBOL(unregister_md_personality); -EXPORT_SYMBOL(md_dev); -EXPORT_SYMBOL(md_error); -EXPORT_SYMBOL(md_register_thread); -EXPORT_SYMBOL(md_unregister_thread); -EXPORT_SYMBOL(md_update_sb); -EXPORT_SYMBOL(md_map); -EXPORT_SYMBOL(md_wakeup_thread); -EXPORT_SYMBOL(md_do_sync); -#ifdef CONFIG_PROC_FS +static int status_resync (char * page, mddev_t * mddev) +{ + int sz = 0; + unsigned int blocksize, max_blocks, resync, res, dt, tt, et; + + resync = mddev->curr_resync; + blocksize = blksize_size[MD_MAJOR][mdidx(mddev)]; + max_blocks = blk_size[MD_MAJOR][mdidx(mddev)] / (blocksize >> 10); + + /* + * Should not happen. + */ + if (!max_blocks) { + MD_BUG(); + return 0; + } + res = (resync/1024)*1000/(max_blocks/1024 + 1); + { + int i, x = res/50, y = 20-x; + sz += sprintf(page + sz, "["); + for (i = 0; i < x; i++) + sz += sprintf(page + sz, "="); + sz += sprintf(page + sz, ">"); + for (i = 0; i < y; i++) + sz += sprintf(page + sz, "."); + sz += sprintf(page + sz, "] "); + } + if (!mddev->recovery_running) + /* + * true resync + */ + sz += sprintf(page + sz, " resync =%3u.%u%% (%u/%u)", + res/10, res % 10, resync, max_blocks); + else + /* + * recovery ... + */ + sz += sprintf(page + sz, " recovery =%3u.%u%% (%u/%u)", + res/10, res % 10, resync, max_blocks); + + /* + * We do not want to overflow, so the order of operands and + * the * 100 / 100 trick are important. We do a +1 to be + * safe against division by zero. We only estimate anyway. + * + * dt: time until now + * tt: total time + * et: estimated finish time + */ + dt = ((jiffies - mddev->resync_start) / HZ); + tt = (dt * (max_blocks / (resync/100+1)))/100; + if (tt > dt) + et = tt - dt; + else + /* + * ignore rounding effects near finish time + */ + et = 0; + + sz += sprintf(page + sz, " finish=%u.%umin", et / 60, (et % 60)/6); + + return sz; +} + static int md_status_read_proc(char *page, char **start, off_t off, int count, int *eof, void *data) { - int sz = 0, i, j, size; - int begin = 0; + int sz = 0, j, size; + struct md_list_head *tmp, *tmp2; + mdk_rdev_t *rdev; + mddev_t *mddev; - sz=sprintf( page, "Personalities : "); - for (i=0; i<MAX_PERSONALITY; i++) - if (pers[i]) - sz+=sprintf (page+sz, "[%d %s] ", i, pers[i]->name); - page[sz-1]='\n'; + sz += sprintf(page + sz, "Personalities : "); + for (j = 0; j < MAX_PERSONALITY; j++) + if (pers[j]) + sz += sprintf(page+sz, "[%s] ", pers[j]->name); - sz+=sprintf (page+sz, "read_ahead "); - if (read_ahead[MD_MAJOR]==INT_MAX) - sz+=sprintf (page+sz, "not set\n"); - else - sz+=sprintf (page+sz, "%d sectors\n", read_ahead[MD_MAJOR]); + sz += sprintf(page+sz, "\n"); - for (i=0; i<MAX_MD_DEV; i++) { - if (sz < off) { - begin += sz; - off -= sz; - sz = 0; - } - if (sz >= off+count) { - *eof = 1; - break; - } - sz+=sprintf (page+sz, "md%d : %sactive", - i, md_dev[i].pers ? "" : "in"); - if (md_dev[i].pers) - sz+=sprintf (page+sz, " %s", md_dev[i].pers->name); + sz += sprintf(page+sz, "read_ahead "); + if (read_ahead[MD_MAJOR] == INT_MAX) + sz += sprintf(page+sz, "not set\n"); + else + sz += sprintf(page+sz, "%d sectors\n", read_ahead[MD_MAJOR]); + + ITERATE_MDDEV(mddev,tmp) { + sz += sprintf(page + sz, "md%d : %sactive", mdidx(mddev), + mddev->pers ? 
"" : "in"); + if (mddev->pers) { + if (mddev->ro) + sz += sprintf(page + sz, " (read-only)"); + sz += sprintf(page + sz, " %s", mddev->pers->name); + } - for (j=0, size=0; j<md_dev[i].nb_dev; j++) { - sz+=sprintf (page+sz, " %s", - partition_name(md_dev[i].devices[j].dev)); - size+=md_dev[i].devices[j].size; + size = 0; + ITERATE_RDEV(mddev,rdev,tmp2) { + sz += sprintf(page + sz, " %s[%d]", + partition_name(rdev->dev), rdev->desc_nr); + if (rdev->faulty) { + sz += sprintf(page + sz, "(F)"); + continue; + } + size += rdev->size; } - if (md_dev[i].nb_dev) { - if (md_dev[i].pers) - sz+=sprintf (page+sz, " %d blocks", md_size[i]); + if (mddev->nb_dev) { + if (mddev->pers) + sz += sprintf(page + sz, "\n %d blocks", + md_size[mdidx(mddev)]); else - sz+=sprintf (page+sz, " %d blocks", size); + sz += sprintf(page + sz, "\n %d blocks", size); } - if (!md_dev[i].pers) { - sz+=sprintf (page+sz, "\n"); + if (!mddev->pers) { + sz += sprintf(page+sz, "\n"); continue; } - if (md_dev[i].pers->max_invalid_dev) - sz+=sprintf (page+sz, " maxfault=%ld", - MAX_FAULT(md_dev+i)); + sz += mddev->pers->status (page+sz, mddev); - sz+=md_dev[i].pers->status (page+sz, i, md_dev+i); - sz+=sprintf (page+sz, "\n"); + sz += sprintf(page+sz, "\n "); + if (mddev->curr_resync) { + sz += status_resync (page+sz, mddev); + } else { + if (md_atomic_read(&mddev->resync_sem.count) != 1) + sz += sprintf(page + sz, " resync=DELAYED"); + } + sz += sprintf(page + sz, "\n"); } + sz += status_unused (page + sz); - sz -= off; - *start = page + off; - if (sz>count) - sz = count; - if (sz<0) - sz = 0; return sz; } -#endif - -static void md_geninit (void) -{ - int i; - - blksize_size[MD_MAJOR] = md_blocksizes; - max_readahead[MD_MAJOR] = md_maxreadahead; - for(i=0;i<MAX_MD_DEV;i++) - { - md_blocksizes[i] = 1024; - md_maxreadahead[i] = MD_DEFAULT_DISK_READAHEAD; - md_dev[i].pers=NULL; - register_disk(&md_gendisk, MKDEV(MAJOR_NR,i), 1, &md_fops, 0); - } - -#ifdef CONFIG_PROC_FS - create_proc_read_entry("mdstat", 0, NULL, md_status_read_proc, NULL); -#endif -} - -int md_error (kdev_t mddev, kdev_t rdev) -{ - unsigned int minor = MINOR (mddev); - int rc; - - if (MAJOR(mddev) != MD_MAJOR || minor > MAX_MD_DEV) - panic ("md_error gets unknown device\n"); - if (!md_dev [minor].pers) - panic ("md_error gets an error for an unknown device\n"); - if (md_dev [minor].pers->error_handler) { - rc = md_dev [minor].pers->error_handler (md_dev+minor, rdev); -#if SUPPORT_RECONSTRUCTION - md_wakeup_thread(md_sync_thread); -#endif /* SUPPORT_RECONSTRUCTION */ - return rc; - } - return 0; -} -int register_md_personality (int p_num, struct md_personality *p) +int register_md_personality (int pnum, mdk_personality_t *p) { - int i=(p_num >> PERSONALITY_SHIFT); - - if (i >= MAX_PERSONALITY) - return -EINVAL; + if (pnum >= MAX_PERSONALITY) + return -EINVAL; - if (pers[i]) - return -EBUSY; + if (pers[pnum]) + return -EBUSY; - pers[i]=p; - printk ("%s personality registered\n", p->name); - return 0; + pers[pnum] = p; + printk(KERN_INFO "%s personality registered\n", p->name); + return 0; } -int unregister_md_personality (int p_num) +int unregister_md_personality (int pnum) { - int i=(p_num >> PERSONALITY_SHIFT); - - if (i >= MAX_PERSONALITY) - return -EINVAL; + if (pnum >= MAX_PERSONALITY) + return -EINVAL; - printk ("%s personality unregistered\n", pers[i]->name); - pers[i]=NULL; - return 0; + printk(KERN_INFO "%s personality unregistered\n", pers[pnum]->name); + pers[pnum] = NULL; + return 0; } -static md_descriptor_t *get_spare(struct md_dev *mddev) +int 
md_notify_reboot(struct notifier_block *this, + unsigned long code, void *x) { - int i; - md_superblock_t *sb = mddev->sb; - md_descriptor_t *descriptor; - struct real_dev *realdev; - - for (i = 0; i < mddev->nb_dev; i++) { - realdev = &mddev->devices[i]; - if (!realdev->sb) - continue; - descriptor = &sb->disks[realdev->sb->descriptor.number]; - if (descriptor->state & (1 << MD_FAULTY_DEVICE)) - continue; - if (descriptor->state & (1 << MD_ACTIVE_DEVICE)) - continue; - return descriptor; - } - return NULL; -} + struct md_list_head *tmp; + mddev_t *mddev; -/* - * parallel resyncing thread. - * - * FIXME: - make it abort with a dirty array on mdstop, now it just blocks - * - fix read error handing - */ - -int md_do_sync(struct md_dev *mddev) -{ - struct buffer_head *bh; - int max_blocks, blocksize, curr_bsize, percent=1, j; - kdev_t read_disk = MKDEV(MD_MAJOR, mddev - md_dev); - int major = MAJOR(read_disk), minor = MINOR(read_disk); - unsigned long starttime; - - blocksize = blksize_size[major][minor]; - max_blocks = blk_size[major][minor] / (blocksize >> 10); + if ((code == MD_SYS_DOWN) || (code == MD_SYS_HALT) + || (code == MD_SYS_POWER_OFF)) { - printk("... resync log\n"); - printk(" .... mddev->nb_dev: %d\n", mddev->nb_dev); - printk(" .... raid array: %s\n", kdevname(read_disk)); - printk(" .... max_blocks: %d blocksize: %d\n", max_blocks, blocksize); - printk("md: syncing RAID array %s\n", kdevname(read_disk)); - - mddev->busy++; - - starttime=jiffies; - for (j = 0; j < max_blocks; j++) { + printk(KERN_INFO "stopping all md devices.\n"); + ITERATE_MDDEV(mddev,tmp) + do_md_stop (mddev, 1); /* - * B careful. When some1 mounts a non-'blocksize' filesystem - * then we get the blocksize changed right under us. Go deal - * with it transparently, recalculate 'blocksize', 'j' and - * 'max_blocks': + * certain more exotic SCSI devices are known to be + * volatile wrt too early system reboots. While the + * right place to handle this issue is the given + * driver, we do want to have a safe RAID driver ... */ - curr_bsize = blksize_size[major][minor]; - if (curr_bsize != blocksize) { - diff_blocksize: - if (curr_bsize > blocksize) - /* - * this is safe, rounds downwards. - */ - j /= curr_bsize/blocksize; - else - j *= blocksize/curr_bsize; - - blocksize = curr_bsize; - max_blocks = blk_size[major][minor] / (blocksize >> 10); - } - if ((bh = breada (read_disk, j, blocksize, j * blocksize, - max_blocks * blocksize)) != NULL) { - mark_buffer_dirty(bh, 1); - brelse(bh); - } else { - /* - * FIXME: Ugly, but set_blocksize() isnt safe ... - */ - curr_bsize = blksize_size[major][minor]; - if (curr_bsize != blocksize) - goto diff_blocksize; - - /* - * It's a real read problem. FIXME, handle this - * a better way. 
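raid_setup() above tokenizes the boot option string on commas in place; 'noautodetect' is the only option recognized so far. The same loop as a stand-alone program (a bounds check on the token buffer is added here, illustrative only):

#include <stdio.h>
#include <string.h>

static int noautodetect;

/* Split the option string on commas and act on known tokens,
 * the way raid_setup() above does. */
static void parse_raid_opts(const char *str)
{
	char tok[100];
	int i, pos = 0, len = strlen(str) + 1;

	for (i = 0; i < len; i++) {
		char c = str[i];

		if (c == ',' || !c) {
			tok[pos] = 0;
			if (!strcmp(tok, "noautodetect"))
				noautodetect = 1;
			pos = 0;
			continue;
		}
		if (pos < (int)sizeof(tok) - 1)
			tok[pos++] = c;
	}
}

int main(void)
{
	parse_raid_opts("noautodetect");
	printf("noautodetect=%d\n", noautodetect);
	return 0;
}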
- */ - printk ( KERN_ALERT - "read error, stopping reconstruction.\n"); - mddev->busy--; - return 1; - } - - /* - * Let's sleep some if we are faster than our speed limit: - */ - while (blocksize*j/(jiffies-starttime+1)*HZ/1024 > SPEED_LIMIT) - { - current->state = TASK_INTERRUPTIBLE; - schedule_timeout(1); - } - - /* - * FIXME: put this status bar thing into /proc - */ - if (!(j%(max_blocks/100))) { - if (!(percent%10)) - printk (" %03d%% done.\n",percent); - else - printk ("."); - percent++; - } + md_mdelay(1000*1); } - fsync_dev(read_disk); - printk("md: %s: sync done.\n", kdevname(read_disk)); - mddev->busy--; - return 0; + return NOTIFY_DONE; } -/* - * This is a kernel thread which: syncs a spare disk with the active array - * - * the amount of foolproofing might seem to be a tad excessive, but an - * early (not so error-safe) version of raid1syncd synced the first 0.5 gigs - * of my root partition with the first 0.5 gigs of my /home partition ... so - * i'm a bit nervous ;) - */ -void mdsyncd (void *data) -{ - int i; - struct md_dev *mddev; - md_superblock_t *sb; - md_descriptor_t *spare; - unsigned long flags; +struct notifier_block md_notifier = { + md_notify_reboot, + NULL, + 0 +}; - for (i = 0, mddev = md_dev; i < MAX_MD_DEV; i++, mddev++) { - if ((sb = mddev->sb) == NULL) - continue; - if (sb->active_disks == sb->raid_disks) - continue; - if (!sb->spare_disks) - continue; - if ((spare = get_spare(mddev)) == NULL) - continue; - if (!mddev->pers->mark_spare) - continue; - if (mddev->pers->mark_spare(mddev, spare, SPARE_WRITE)) - continue; - if (md_do_sync(mddev) || (spare->state & (1 << MD_FAULTY_DEVICE))) { - mddev->pers->mark_spare(mddev, spare, SPARE_INACTIVE); +void md__init raid_setup(char *str, int *ints) +{ + char tmpline[100]; + int len, pos, nr, i; + + len = strlen(str) + 1; + nr = 0; + pos = 0; + + for (i = 0; i < len; i++) { + char c = str[i]; + + if (c == ',' || !c) { + tmpline[pos] = 0; + if (!strcmp(tmpline,"noautodetect")) + raid_setup_args.noautodetect = 1; + nr++; + pos = 0; continue; } - save_flags(flags); - cli(); - mddev->pers->mark_spare(mddev, spare, SPARE_ACTIVE); - spare->state |= (1 << MD_SYNC_DEVICE); - spare->state |= (1 << MD_ACTIVE_DEVICE); - sb->spare_disks--; - sb->active_disks++; - mddev->sb_dirty = 1; - md_update_sb(mddev - md_dev); - restore_flags(flags); + tmpline[pos] = c; + pos++; } - + raid_setup_args.set = 1; + return; } #ifdef CONFIG_MD_BOOT struct { unsigned long set; - int pers[MAX_MD_DEV]; - kdev_t devices[MAX_MD_DEV][MAX_REAL]; -} md_setup_args __initdata = { + int pers[MAX_MD_DEVS]; + kdev_t devices[MAX_MD_DEVS][MAX_REAL]; +} md_setup_args md__initdata = { 0,{0},{{0}} }; @@ -1155,7 +3207,7 @@ struct { * the MD devices (by specifying multiple "md=" lines) * instead of just one. -- KTK */ -int __init md_setup(char *str) +static int __init md_setup(char *str) { int minor, level, factor, fault, i; kdev_t device; @@ -1167,31 +3219,31 @@ int __init md_setup(char *str) get_option(&str, &fault) != 2) { printk("md: Too few arguments supplied to md=.\n"); return 0; - } else if (minor >= MAX_MD_DEV) { - printk ("md: Minor device number too high.\n"); + } else if (minor >= MAX_MD_DEVS) { + printk ("md: Minor device number too high.\n"); return 0; } else if (md_setup_args.set & (1 << minor)) { printk ("md: Warning - md=%d,... 
has been specified twice;\n" " will discard the first definition.\n", minor); - } + } switch(level) { #ifdef CONFIG_MD_LINEAR case -1: level = LINEAR; pername = "linear"; - break; + break; #endif #ifdef CONFIG_MD_STRIPED case 0: level = STRIPED; pername = "striped"; - break; + break; #endif default: printk ("md: The kernel has not been configured for raid%d" " support!\n", level); return 0; - } + } devnames = str; for (i = 0; str; i++) { if ((device = name_to_kdev_t(str))) { @@ -1215,61 +3267,87 @@ int __init md_setup(char *str) md_setup_args.set |= (1 << minor); return 0; } - #endif +static void md_geninit (void) +{ + int i; + + blksize_size[MD_MAJOR] = md_blocksizes; + max_readahead[MD_MAJOR] = md_maxreadahead; + + for(i = 0; i < MAX_MD_DEVS; i++) { + md_blocksizes[i] = 1024; + md_maxreadahead[i] = MD_READAHEAD; + register_disk(&md_gendisk, MKDEV(MAJOR_NR,i), 1, &md_fops, 0); + + } + + printk("md.c: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t)); + +#ifdef CONFIG_PROC_FS + create_proc_read_entry("mdstat", 0, NULL, md_status_read_proc, NULL); +#endif +} +void hsm_init (void); +void translucent_init (void); void linear_init (void); void raid0_init (void); void raid1_init (void); void raid5_init (void); -int __init md_init (void) +int md__init md_init (void) { - printk ("md driver %d.%d.%d MAX_MD_DEV=%d, MAX_REAL=%d\n", - MD_MAJOR_VERSION, MD_MINOR_VERSION, MD_PATCHLEVEL_VERSION, - MAX_MD_DEV, MAX_REAL); + printk (KERN_INFO "md driver %d.%d.%d MAX_MD_DEVS=%d, MAX_REAL=%d\n", + MD_MAJOR_VERSION, MD_MINOR_VERSION, + MD_PATCHLEVEL_VERSION, MAX_MD_DEVS, MAX_REAL); + + if (register_blkdev (MD_MAJOR, "md", &md_fops)) + { + printk (KERN_ALERT "Unable to get major %d for md\n", MD_MAJOR); + return (-1); + } - if (register_blkdev (MD_MAJOR, "md", &md_fops)) - { - printk ("Unable to get major %d for md\n", MD_MAJOR); - return (-1); - } + blk_dev[MD_MAJOR].queue = md_get_queue; - blk_init_queue(BLK_DEFAULT_QUEUE(MAJOR_NR), DEVICE_REQUEST); - read_ahead[MD_MAJOR]=INT_MAX; - memset(md_dev, 0, MAX_MD_DEV * sizeof (struct md_dev)); - md_gendisk.next=gendisk_head; + read_ahead[MD_MAJOR] = INT_MAX; + md_gendisk.next = gendisk_head; - gendisk_head=&md_gendisk; + gendisk_head = &md_gendisk; -#if SUPPORT_RECONSTRUCTION - if ((md_sync_thread = md_register_thread(mdsyncd, NULL)) == NULL) - printk("md: bug: md_sync_thread == NULL\n"); -#endif /* SUPPORT_RECONSTRUCTION */ + md_register_reboot_notifier(&md_notifier); #ifdef CONFIG_MD_LINEAR - linear_init (); + linear_init (); #endif #ifdef CONFIG_MD_STRIPED - raid0_init (); + raid0_init (); #endif #ifdef CONFIG_MD_MIRRORING - raid1_init (); + raid1_init (); #endif #ifdef CONFIG_MD_RAID5 - raid5_init (); + raid5_init (); #endif - md_geninit(); - return (0); +#if defined(CONFIG_MD_RAID5) || defined(CONFIG_MD_RAID5_MODULE) + /* + * pick a XOR routine, runtime. 
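calibrate_xor_block(), called just below and implemented in the new xor.o (not shown in this hunk), benchmarks the available XOR routines and keeps the fastest for parity work. The idea reduced to userspace, with two hypothetical candidates and clock()-based timing (illustrative only):

#include <stdio.h>
#include <time.h>

#define WORDS (4096 / sizeof(unsigned long))

static void xor_basic(unsigned long *a, const unsigned long *b)
{
	unsigned int i;

	for (i = 0; i < WORDS; i++)
		a[i] ^= b[i];
}

static void xor_unrolled(unsigned long *a, const unsigned long *b)
{
	unsigned int i;

	for (i = 0; i < WORDS; i += 4) {
		a[i]     ^= b[i];
		a[i + 1] ^= b[i + 1];
		a[i + 2] ^= b[i + 2];
		a[i + 3] ^= b[i + 3];
	}
}

int main(void)
{
	static unsigned long a[WORDS], b[WORDS];
	void (*cand[2])(unsigned long *, const unsigned long *) =
		{ xor_basic, xor_unrolled };
	clock_t best_t = 0;
	int best = 0, i, j;

	for (i = 0; i < 2; i++) {
		clock_t t = clock();

		for (j = 0; j < 100000; j++)
			cand[i](a, b);
		t = clock() - t;
		if (i == 0 || t < best_t) {
			best_t = t;
			best = i;
		}
	}
	printf("picked routine %d\n", best);
	return 0;
}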
+ */ + calibrate_xor_block(); +#endif + md_geninit(); + return (0); } #ifdef CONFIG_MD_BOOT -void __init md_setup_drive(void) +static void __init md_setup_drive(void) { + if(md_setup_args.set) + do_md_setup(md_setup_args.str, md_setup_args.ints); int minor, i; kdev_t dev; - for (minor = 0; minor < MAX_MD_DEV; minor++) { + for (minor = 0; minor < MAX_MD_DEVS; minor++) { if ((md_setup_args.set & (1 << minor)) == 0) continue; printk("md: Loading md%d.\n", minor); @@ -1281,3 +3359,20 @@ void __init md_setup_drive(void) __setup("md=", md_setup); #endif + +MD_EXPORT_SYMBOL(md_size); +MD_EXPORT_SYMBOL(register_md_personality); +MD_EXPORT_SYMBOL(unregister_md_personality); +MD_EXPORT_SYMBOL(partition_name); +MD_EXPORT_SYMBOL(md_error); +MD_EXPORT_SYMBOL(md_recover_arrays); +MD_EXPORT_SYMBOL(md_register_thread); +MD_EXPORT_SYMBOL(md_unregister_thread); +MD_EXPORT_SYMBOL(md_update_sb); +MD_EXPORT_SYMBOL(md_wakeup_thread); +MD_EXPORT_SYMBOL(md_print_devices); +MD_EXPORT_SYMBOL(find_rdev_nr); +MD_EXPORT_SYMBOL(md_interrupt_thread); +MD_EXPORT_SYMBOL(mddev_map); +MD_EXPORT_SYMBOL(md_check_ordering); + diff --git a/drivers/block/paride/pg.c b/drivers/block/paride/pg.c index 4a09f71c4..434fac029 100644 --- a/drivers/block/paride/pg.c +++ b/drivers/block/paride/pg.c @@ -260,18 +260,10 @@ static char pg_scratch[512]; /* scratch block buffer */ /* kernel glue structures */ static struct file_operations pg_fops = { - NULL, /* lseek - default */ - pg_read, /* read */ - pg_write, /* write */ - NULL, /* readdir - bad */ - NULL, /* select */ - NULL, /* ioctl */ - NULL, /* mmap */ - pg_open, /* open */ - NULL, /* flush */ - pg_release, /* release */ - NULL, /* fsync */ - NULL, /* fasync */ + read: pg_read, + write: pg_write, + open: pg_open, + release: pg_release, }; void pg_init_units( void ) diff --git a/drivers/block/paride/pt.c b/drivers/block/paride/pt.c index b1851e999..ba24c9956 100644 --- a/drivers/block/paride/pt.c +++ b/drivers/block/paride/pt.c @@ -262,18 +262,11 @@ static char pt_scratch[512]; /* scratch block buffer */ /* kernel glue structures */ static struct file_operations pt_fops = { - NULL, /* lseek - default */ - pt_read, /* read */ - pt_write, /* write */ - NULL, /* readdir - bad */ - NULL, /* select */ - pt_ioctl, /* ioctl */ - NULL, /* mmap */ - pt_open, /* open */ - NULL, /* flush */ - pt_release, /* release */ - NULL, /* fsync */ - NULL, /* fasync */ + read: pt_read, + write: pt_write, + ioctl: pt_ioctl, + open: pt_open, + release: pt_release, }; void pt_init_units( void ) diff --git a/drivers/block/raid0.c b/drivers/block/raid0.c index 37b2035cd..661855a18 100644 --- a/drivers/block/raid0.c +++ b/drivers/block/raid0.c @@ -1,9 +1,10 @@ - /* raid0.c : Multiple Devices driver for Linux Copyright (C) 1994-96 Marc ZYNGIER <zyngier@ufr-info-p7.ibp.fr> or <maz@gloups.fdn.fr> + Copyright (C) 1999, 2000 Ingo Molnar, Red Hat + RAID-0 management functions. 
@@ -18,146 +19,201 @@ */ #include <linux/module.h> -#include <linux/md.h> -#include <linux/raid0.h> -#include <linux/vmalloc.h> +#include <linux/raid/raid0.h> #define MAJOR_NR MD_MAJOR #define MD_DRIVER #define MD_PERSONALITY -static int create_strip_zones (int minor, struct md_dev *mddev) +static int create_strip_zones (mddev_t *mddev) { - int i, j, c=0; - int current_offset=0; - struct real_dev *smallest_by_zone; - struct raid0_data *data=(struct raid0_data *) mddev->private; - - data->nr_strip_zones=1; - - for (i=1; i<mddev->nb_dev; i++) - { - for (j=0; j<i; j++) - if (mddev->devices[i].size==mddev->devices[j].size) - { - c=1; - break; - } - - if (!c) - data->nr_strip_zones++; - - c=0; - } - - if ((data->strip_zone=vmalloc(sizeof(struct strip_zone)*data->nr_strip_zones)) == NULL) - return 1; - - data->smallest=NULL; - - for (i=0; i<data->nr_strip_zones; i++) - { - data->strip_zone[i].dev_offset=current_offset; - smallest_by_zone=NULL; - c=0; - - for (j=0; j<mddev->nb_dev; j++) - if (mddev->devices[j].size>current_offset) - { - data->strip_zone[i].dev[c++]=mddev->devices+j; - if (!smallest_by_zone || - smallest_by_zone->size > mddev->devices[j].size) - smallest_by_zone=mddev->devices+j; - } - - data->strip_zone[i].nb_dev=c; - data->strip_zone[i].size=(smallest_by_zone->size-current_offset)*c; - - if (!data->smallest || - data->smallest->size > data->strip_zone[i].size) - data->smallest=data->strip_zone+i; - - data->strip_zone[i].zone_offset=i ? (data->strip_zone[i-1].zone_offset+ - data->strip_zone[i-1].size) : 0; - current_offset=smallest_by_zone->size; - } - return 0; + int i, c, j, j1, j2; + int current_offset, curr_zone_offset; + raid0_conf_t *conf = mddev_to_conf(mddev); + mdk_rdev_t *smallest, *rdev1, *rdev2, *rdev; + + /* + * The number of 'same size groups' + */ + conf->nr_strip_zones = 0; + + ITERATE_RDEV_ORDERED(mddev,rdev1,j1) { + printk("raid0: looking at %s\n", partition_name(rdev1->dev)); + c = 0; + ITERATE_RDEV_ORDERED(mddev,rdev2,j2) { + printk("raid0: comparing %s(%d) with %s(%d)\n", partition_name(rdev1->dev), rdev1->size, partition_name(rdev2->dev), rdev2->size); + if (rdev2 == rdev1) { + printk("raid0: END\n"); + break; + } + if (rdev2->size == rdev1->size) + { + /* + * Not unique, dont count it as a new + * group + */ + printk("raid0: EQUAL\n"); + c = 1; + break; + } + printk("raid0: NOT EQUAL\n"); + } + if (!c) { + printk("raid0: ==> UNIQUE\n"); + conf->nr_strip_zones++; + printk("raid0: %d zones\n", conf->nr_strip_zones); + } + } + printk("raid0: FINAL %d zones\n", conf->nr_strip_zones); + + conf->strip_zone = vmalloc(sizeof(struct strip_zone)* + conf->nr_strip_zones); + if (!conf->strip_zone) + return 1; + + + conf->smallest = NULL; + current_offset = 0; + curr_zone_offset = 0; + + for (i = 0; i < conf->nr_strip_zones; i++) + { + struct strip_zone *zone = conf->strip_zone + i; + + printk("zone %d\n", i); + zone->dev_offset = current_offset; + smallest = NULL; + c = 0; + + ITERATE_RDEV_ORDERED(mddev,rdev,j) { + + printk(" checking %s ...", partition_name(rdev->dev)); + if (rdev->size > current_offset) + { + printk(" contained as device %d\n", c); + zone->dev[c] = rdev; + c++; + if (!smallest || (rdev->size <smallest->size)) { + smallest = rdev; + printk(" (%d) is smallest!.\n", rdev->size); + } + } else + printk(" nope.\n"); + } + + zone->nb_dev = c; + zone->size = (smallest->size - current_offset) * c; + printk(" zone->nb_dev: %d, size: %d\n",zone->nb_dev,zone->size); + + if (!conf->smallest || (zone->size < conf->smallest->size)) + conf->smallest = zone; + + 
zone->zone_offset = curr_zone_offset; + curr_zone_offset += zone->size; + + current_offset = smallest->size; + printk("current zone offset: %d\n", current_offset); + } + printk("done.\n"); + return 0; } -static int raid0_run (int minor, struct md_dev *mddev) +static int raid0_run (mddev_t *mddev) { - int cur=0, i=0, size, zone0_size, nb_zone; - struct raid0_data *data; - - MOD_INC_USE_COUNT; - - if ((mddev->private=vmalloc (sizeof (struct raid0_data))) == NULL) return 1; - data=(struct raid0_data *) mddev->private; - - if (create_strip_zones (minor, mddev)) - { - vfree(data); - return 1; - } - - nb_zone=data->nr_zones= - md_size[minor]/data->smallest->size + - (md_size[minor]%data->smallest->size ? 1 : 0); - - printk ("raid0 : Allocating %ld bytes for hash.\n",(long)sizeof(struct raid0_hash)*nb_zone); - if ((data->hash_table=vmalloc (sizeof (struct raid0_hash)*nb_zone)) == NULL) - { - vfree(data->strip_zone); - vfree(data); - return 1; - } - size=data->strip_zone[cur].size; - - i=0; - while (cur<data->nr_strip_zones) - { - data->hash_table[i].zone0=data->strip_zone+cur; - - if (size>=data->smallest->size)/* If we completely fill the slot */ - { - data->hash_table[i++].zone1=NULL; - size-=data->smallest->size; - - if (!size) - { - if (++cur==data->nr_strip_zones) continue; - size=data->strip_zone[cur].size; - } - - continue; - } - - if (++cur==data->nr_strip_zones) /* Last dev, set unit1 as NULL */ - { - data->hash_table[i].zone1=NULL; - continue; - } - - zone0_size=size; /* Here, we use a 2nd dev to fill the slot */ - size=data->strip_zone[cur].size; - data->hash_table[i++].zone1=data->strip_zone+cur; - size-=(data->smallest->size - zone0_size); - } - - return (0); + int cur=0, i=0, size, zone0_size, nb_zone; + raid0_conf_t *conf; + + MOD_INC_USE_COUNT; + + conf = vmalloc(sizeof (raid0_conf_t)); + if (!conf) + goto out; + mddev->private = (void *)conf; + + if (md_check_ordering(mddev)) { + printk("raid0: disks are not ordered, aborting!\n"); + goto out_free_conf; + } + + if (create_strip_zones (mddev)) + goto out_free_conf; + + printk("raid0 : md_size is %d blocks.\n", md_size[mdidx(mddev)]); + printk("raid0 : conf->smallest->size is %d blocks.\n", conf->smallest->size); + nb_zone = md_size[mdidx(mddev)]/conf->smallest->size + + (md_size[mdidx(mddev)] % conf->smallest->size ? 
1 : 0); + printk("raid0 : nb_zone is %d.\n", nb_zone); + conf->nr_zones = nb_zone; + + printk("raid0 : Allocating %d bytes for hash.\n", + sizeof(struct raid0_hash)*nb_zone); + + conf->hash_table = vmalloc (sizeof (struct raid0_hash)*nb_zone); + if (!conf->hash_table) + goto out_free_zone_conf; + size = conf->strip_zone[cur].size; + + i = 0; + while (cur < conf->nr_strip_zones) { + conf->hash_table[i].zone0 = conf->strip_zone + cur; + + /* + * If we completely fill the slot + */ + if (size >= conf->smallest->size) { + conf->hash_table[i++].zone1 = NULL; + size -= conf->smallest->size; + + if (!size) { + if (++cur == conf->nr_strip_zones) + continue; + size = conf->strip_zone[cur].size; + } + continue; + } + if (++cur == conf->nr_strip_zones) { + /* + * Last dev, set unit1 as NULL + */ + conf->hash_table[i].zone1=NULL; + continue; + } + + /* + * Here we use a 2nd dev to fill the slot + */ + zone0_size = size; + size = conf->strip_zone[cur].size; + conf->hash_table[i++].zone1 = conf->strip_zone + cur; + size -= (conf->smallest->size - zone0_size); + } + return 0; + +out_free_zone_conf: + vfree(conf->strip_zone); + conf->strip_zone = NULL; + +out_free_conf: + vfree(conf); + mddev->private = NULL; +out: + MOD_DEC_USE_COUNT; + return 1; } - -static int raid0_stop (int minor, struct md_dev *mddev) +static int raid0_stop (mddev_t *mddev) { - struct raid0_data *data=(struct raid0_data *) mddev->private; + raid0_conf_t *conf = mddev_to_conf(mddev); - vfree (data->hash_table); - vfree (data->strip_zone); - vfree (data); + vfree (conf->hash_table); + conf->hash_table = NULL; + vfree (conf->strip_zone); + conf->strip_zone = NULL; + vfree (conf); + mddev->private = NULL; - MOD_DEC_USE_COUNT; - return 0; + MOD_DEC_USE_COUNT; + return 0; } /* @@ -167,135 +223,142 @@ static int raid0_stop (int minor, struct md_dev *mddev) * Of course, those facts may not be valid anymore (and surely won't...) 
* Hey guys, there's some work out there ;-) */ -static int raid0_map (struct md_dev *mddev, kdev_t *rdev, - unsigned long *rsector, unsigned long size) +static int raid0_make_request (mddev_t *mddev, int rw, struct buffer_head * bh) { - struct raid0_data *data=(struct raid0_data *) mddev->private; - static struct raid0_hash *hash; - struct strip_zone *zone; - struct real_dev *tmp_dev; - int blk_in_chunk, factor, chunk, chunk_size; - long block, rblock; - - factor=FACTOR(mddev); - chunk_size=(1UL << FACTOR_SHIFT(factor)); - block=*rsector >> 1; - hash=data->hash_table+(block/data->smallest->size); - - if (hash - data->hash_table > data->nr_zones) - { - printk(KERN_DEBUG "raid0_map: invalid block %ul\n", block); - return -1; - } - - /* Sanity check */ - if ((chunk_size*2)<(*rsector % (chunk_size*2))+size) - { - printk ("raid0_convert : can't convert block across chunks or bigger than %dk %ld %ld\n", chunk_size, *rsector, size); - return (-1); - } - - if (block >= (hash->zone0->size + - hash->zone0->zone_offset)) - { - if (!hash->zone1) - { - printk ("raid0_convert : hash->zone1==NULL for block %ld\n", block); - return (-1); - } + unsigned long size = bh->b_size >> 10; + raid0_conf_t *conf = mddev_to_conf(mddev); + struct raid0_hash *hash; + struct strip_zone *zone; + mdk_rdev_t *tmp_dev; + int blk_in_chunk, chunksize_bits, chunk, chunk_size; + long block, rblock; + + chunk_size = mddev->param.chunk_size >> 10; + chunksize_bits = ffz(~chunk_size); + block = bh->b_blocknr * size; + hash = conf->hash_table + block / conf->smallest->size; + + /* Sanity check */ + if (chunk_size < (block % chunk_size) + size) + goto bad_map; + + if (!hash) + goto bad_hash; + + if (!hash->zone0) + goto bad_zone0; + + if (block >= (hash->zone0->size + hash->zone0->zone_offset)) { + if (!hash->zone1) + goto bad_zone1; + zone = hash->zone1; + } else + zone = hash->zone0; - zone=hash->zone1; - } - else - zone=hash->zone0; - - blk_in_chunk=block & (chunk_size -1); - chunk=(block - zone->zone_offset) / (zone->nb_dev<<FACTOR_SHIFT(factor)); - tmp_dev=zone->dev[(block >> FACTOR_SHIFT(factor)) % zone->nb_dev]; - rblock=(chunk << FACTOR_SHIFT(factor)) + blk_in_chunk + zone->dev_offset; - - *rdev=tmp_dev->dev; - *rsector=rblock<<1; - - return (0); + blk_in_chunk = block & (chunk_size -1); + chunk = (block - zone->zone_offset) / (zone->nb_dev << chunksize_bits); + tmp_dev = zone->dev[(block >> chunksize_bits) % zone->nb_dev]; + rblock = (chunk << chunksize_bits) + blk_in_chunk + zone->dev_offset; + + /* + * Important, at this point we are not guaranteed to be the only + * CPU modifying b_rdev and b_rsector! Only __make_request() later + * on serializes the IO. So in 2.4 we must never write temporary + * values to bh->b_rdev, like 2.2 and 2.0 did. 
+ */ + bh->b_rdev = tmp_dev->dev; + bh->b_rsector = rblock << 1; + + generic_make_request(rw, bh); + + return 0; + +bad_map: + printk ("raid0_make_request bug: can't convert block across chunks or bigger than %dk %ld %ld\n", chunk_size, bh->b_rsector, size); + return -1; +bad_hash: + printk("raid0_make_request bug: hash==NULL for block %ld\n", block); + return -1; +bad_zone0: + printk ("raid0_make_request bug: hash->zone0==NULL for block %ld\n", block); + return -1; +bad_zone1: + printk ("raid0_make_request bug: hash->zone1==NULL for block %ld\n", block); + return -1; } - -static int raid0_status (char *page, int minor, struct md_dev *mddev) +static int raid0_status (char *page, mddev_t *mddev) { - int sz=0; + int sz = 0; #undef MD_DEBUG #ifdef MD_DEBUG - int j, k; - struct raid0_data *data=(struct raid0_data *) mddev->private; + int j, k; + raid0_conf_t *conf = mddev_to_conf(mddev); - sz+=sprintf (page+sz, " "); - for (j=0; j<data->nr_zones; j++) - { - sz+=sprintf (page+sz, "[z%d", - data->hash_table[j].zone0-data->strip_zone); - if (data->hash_table[j].zone1) - sz+=sprintf (page+sz, "/z%d] ", - data->hash_table[j].zone1-data->strip_zone); - else - sz+=sprintf (page+sz, "] "); - } + sz += sprintf(page + sz, " "); + for (j = 0; j < conf->nr_zones; j++) { + sz += sprintf(page + sz, "[z%d", + conf->hash_table[j].zone0 - conf->strip_zone); + if (conf->hash_table[j].zone1) + sz += sprintf(page+sz, "/z%d] ", + conf->hash_table[j].zone1 - conf->strip_zone); + else + sz += sprintf(page+sz, "] "); + } - sz+=sprintf (page+sz, "\n"); + sz += sprintf(page + sz, "\n"); - for (j=0; j<data->nr_strip_zones; j++) - { - sz+=sprintf (page+sz, " z%d=[", j); - for (k=0; k<data->strip_zone[j].nb_dev; k++) - sz+=sprintf (page+sz, "%s/", - partition_name(data->strip_zone[j].dev[k]->dev)); - sz--; - sz+=sprintf (page+sz, "] zo=%d do=%d s=%d\n", - data->strip_zone[j].zone_offset, - data->strip_zone[j].dev_offset, - data->strip_zone[j].size); - } + for (j = 0; j < conf->nr_strip_zones; j++) { + sz += sprintf(page + sz, " z%d=[", j); + for (k = 0; k < conf->strip_zone[j].nb_dev; k++) + sz += sprintf (page+sz, "%s/", partition_name( + conf->strip_zone[j].dev[k]->dev)); + sz--; + sz += sprintf (page+sz, "] zo=%d do=%d s=%d\n", + conf->strip_zone[j].zone_offset, + conf->strip_zone[j].dev_offset, + conf->strip_zone[j].size); + } #endif - sz+=sprintf (page+sz, " %dk chunks", 1<<FACTOR_SHIFT(FACTOR(mddev))); - return sz; + sz += sprintf(page + sz, " %dk chunks", mddev->param.chunk_size/1024); + return sz; } - -static struct md_personality raid0_personality= +static mdk_personality_t raid0_personality= { - "raid0", - raid0_map, - NULL, /* no special make_request */ - NULL, /* no special end_request */ - raid0_run, - raid0_stop, - raid0_status, - NULL, /* no ioctls */ - 0, - NULL, /* no error_handler */ - NULL, /* hot_add_disk */ - NULL, /* hot_remove_disk */ - NULL /* mark_spare */ + "raid0", + NULL, /* no special map */ + raid0_make_request, + NULL, /* no special end_request */ + raid0_run, + raid0_stop, + raid0_status, + NULL, /* no ioctls */ + 0, + NULL, /* no error_handler */ + NULL, /* no diskop */ + NULL, /* no stop resync */ + NULL /* no restart resync */ }; - #ifndef MODULE void raid0_init (void) { - register_md_personality (RAID0, &raid0_personality); + register_md_personality (RAID0, &raid0_personality); } #else int init_module (void) { - return (register_md_personality (RAID0, &raid0_personality)); + return (register_md_personality (RAID0, &raid0_personality)); } void cleanup_module (void) { - 
unregister_md_personality (RAID0); + unregister_md_personality (RAID0); } #endif + diff --git a/drivers/block/rd.c b/drivers/block/rd.c index f3803b4b3..17a745d5b 100644 --- a/drivers/block/rd.c +++ b/drivers/block/rd.c @@ -270,7 +270,7 @@ repeat: } } if (rbh) { - set_bit(BH_Protected, &rbh->b_state); + mark_buffer_protected(rbh); brelse(rbh); } @@ -290,7 +290,10 @@ static int rd_ioctl(struct inode *inode, struct file *file, unsigned int cmd, un switch (cmd) { case BLKFLSBUF: if (!capable(CAP_SYS_ADMIN)) return -EACCES; - invalidate_buffers(inode->i_rdev); + /* special: we want to release the ramdisk memory, + it's not like with the other blockdevices where + this ioctl only flushes away the buffer cache. */ + destroy_buffers(inode->i_rdev); break; case BLKGETSIZE: /* Return device size */ @@ -338,17 +341,8 @@ static int initrd_release(struct inode *inode,struct file *file) static struct file_operations initrd_fops = { - NULL, /* lseek */ - initrd_read, /* read */ - NULL, /* write */ - NULL, /* readdir */ - NULL, /* poll */ - NULL, /* ioctl */ - NULL, /* mmap */ - NULL, /* open */ - NULL, /* flush */ - initrd_release, /* release */ - NULL /* fsync */ + read: initrd_read, + release: initrd_release, }; #endif @@ -391,7 +385,7 @@ static void __exit rd_cleanup (void) int i; for (i = 0 ; i < NUM_RAMDISKS; i++) - invalidate_buffers(MKDEV(MAJOR_NR, i)); + destroy_buffers(MKDEV(MAJOR_NR, i)); unregister_blkdev( MAJOR_NR, "ramdisk" ); blk_cleanup_queue(BLK_DEFAULT_QUEUE(MAJOR_NR)); |
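
Notes on a few of the techniques in the hunks above follow, each with a small standalone C sketch. The sketches compile in userspace, and every name they introduce (anything prefixed demo_ or fake_) is illustrative, not kernel API. First, md.c: the old mdsyncd spare-sync thread is retired and md_init() now calls md_register_reboot_notifier(&md_notifier); the handler's tail is visible at the start of this section, pausing with md_mdelay(1000*1) and returning NOTIFY_DONE so arrays can be quiesced before a restart. A notifier chain is a singly linked list of callbacks walked in order; the real register_reboot_notifier() inserts by priority, while this sketch simply pushes on the head:

	#include <stdio.h>

	#define NOTIFY_DONE 0x0000	/* same value the kernel uses */

	/* Simplified analog of the kernel's struct notifier_block. */
	struct notifier_block {
		int (*notifier_call)(struct notifier_block *self,
				     unsigned long event, void *data);
		struct notifier_block *next;
		int priority;
	};

	/* Walk the chain head to tail, as the kernel's chain caller does. */
	static void call_chain(struct notifier_block *head,
			       unsigned long event, void *data)
	{
		struct notifier_block *nb;

		for (nb = head; nb; nb = nb->next)
			nb->notifier_call(nb, event, data);
	}

	static int fake_md_notify_reboot(struct notifier_block *self,
					 unsigned long event, void *data)
	{
		printf("flushing arrays before reboot (event %lu)\n", event);
		return NOTIFY_DONE;	/* let later notifiers run too */
	}

	/* Same positional initializer style as md_notifier in the patch. */
	static struct notifier_block fake_md_notifier = {
		fake_md_notify_reboot,
		NULL,
		0
	};

	int main(void)
	{
		struct notifier_block *reboot_chain = NULL;

		/* Analog of registration: link into the chain. */
		fake_md_notifier.next = reboot_chain;
		reboot_chain = &fake_md_notifier;

		call_chain(reboot_chain, 1 /* e.g. a restart event */, NULL);
		return 0;
	}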
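The new raid_setup() copies its option string into tmpline[100] one character at a time, treating ',' and the terminating NUL as token boundaries, and latches raid_setup_args.noautodetect when a token matches "noautodetect". The same shape as a standalone sketch; note the original stores into tmpline[pos] with no bounds check, which the sketch adds:

	#include <stdio.h>
	#include <string.h>

	static int noautodetect;

	/* Split "str" on commas, as raid_setup() does, and examine each
	 * token; unknown tokens are silently ignored. */
	static void raid_setup_demo(const char *str)
	{
		char tmpline[100];
		int len = strlen(str) + 1;	/* include the trailing NUL */
		int pos = 0, i;

		for (i = 0; i < len; i++) {
			char c = str[i];

			if (c == ',' || !c) {
				tmpline[pos] = 0;
				if (!strcmp(tmpline, "noautodetect"))
					noautodetect = 1;
				pos = 0;
				continue;
			}
			if (pos < (int)sizeof(tmpline) - 1)
				tmpline[pos++] = c;
		}
	}

	int main(void)
	{
		raid_setup_demo("noautodetect");
		printf("noautodetect = %d\n", noautodetect);
		return 0;
	}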
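Several hunks (pg.c and pt.c above, rd.c at the end, acsi_slm.c earlier in the patch) convert positional file_operations tables to labeled-field initializers such as read: pg_read. Members that are not named default to zero/NULL, so the columns of NULL placeholders disappear and the table no longer depends on member order. The name: spelling is the old GNU extension; C99 standardized the feature as .name =, which this compilable sketch uses against a cut-down stand-in struct:

	#include <stdio.h>

	/* A cut-down stand-in for struct file_operations. */
	struct demo_fops {
		long (*lseek)(void);
		long (*read)(void);
		long (*write)(void);
		int  (*ioctl)(void);
		int  (*open)(void);
		int  (*release)(void);
	};

	static long demo_read(void) { return 0; }
	static int  demo_open(void) { return 0; }

	/* C99 designated initializers: every member not named here
	 * (lseek, write, ioctl, release) is implicitly NULL. */
	static struct demo_fops fops = {
		.read = demo_read,
		.open = demo_open,
	};

	int main(void)
	{
		printf("read  set: %s\n", fops.read  ? "yes" : "no");
		printf("ioctl set: %s\n", fops.ioctl ? "yes" : "no");
		return 0;
	}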
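In raid0.c, create_strip_zones() first counts "same size groups": a member whose size was already seen earlier in the list does not open a new group, so nr_strip_zones ends up equal to the number of distinct member sizes. Each zone is then built from the devices that still extend past the running current_offset, holds (smallest_remaining - current_offset) * nb_dev blocks, and is stacked onto the array through curr_zone_offset. A standalone sketch of that construction pass, with a plain array in place of the rdev iterators and made-up sizes listed smallest-first:

	#include <stdio.h>

	#define NB_DEV 3

	int main(void)
	{
		/* Member sizes in blocks; values are made up. */
		long size[NB_DEV] = { 100, 100, 250 };
		long current_offset = 0, curr_zone_offset = 0;
		int i;

		while (current_offset < size[NB_DEV - 1]) {
			long smallest = 0;
			int nb_dev = 0;

			/* Devices that still extend past current_offset
			 * participate in this zone. */
			for (i = 0; i < NB_DEV; i++) {
				if (size[i] > current_offset) {
					nb_dev++;
					if (!smallest || size[i] < smallest)
						smallest = size[i];
				}
			}
			printf("zone: dev_offset=%ld zone_offset=%ld nb_dev=%d size=%ld\n",
			       current_offset, curr_zone_offset,
			       nb_dev, (smallest - current_offset) * nb_dev);

			curr_zone_offset += (smallest - current_offset) * nb_dev;
			current_offset = smallest;
		}
		return 0;
	}

With the sizes above this prints two zones, matching the two distinct member sizes. raid0_run() then sizes its hash table with the integer ceiling written out longhand, nb_zone = md_size/smallest + (md_size % smallest ? 1 : 0), one slot per smallest-zone-sized stretch of the array.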
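raid0_run() is also reshaped around the kernel's goto-unwind idiom: each acquisition gets a label, a failure jumps to the label matching the last successful step, and control falls through the labels in reverse order of acquisition (out_free_zone_conf, then out_free_conf, then out with its MOD_DEC_USE_COUNT). The shape as a compilable sketch, with malloc/free standing in for vmalloc/vfree and the module use count left out:

	#include <stdio.h>
	#include <stdlib.h>

	struct conf {
		void *strip_zone;
		void *hash_table;
	};

	static int demo_run(struct conf **out)
	{
		struct conf *conf;

		conf = malloc(sizeof(*conf));
		if (!conf)
			goto out;

		conf->strip_zone = malloc(64);
		if (!conf->strip_zone)
			goto out_free_conf;

		conf->hash_table = malloc(64);
		if (!conf->hash_table)
			goto out_free_zone_conf;

		*out = conf;
		return 0;	/* success: caller owns everything */

		/* Unwind in reverse order of acquisition. */
	out_free_zone_conf:
		free(conf->strip_zone);
	out_free_conf:
		free(conf);
	out:
		return 1;
	}

	int main(void)
	{
		struct conf *conf = NULL;

		if (demo_run(&conf))
			fprintf(stderr, "setup failed\n");
		else
			printf("setup ok\n");
		return 0;
	}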
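Finally, raid0_make_request(): chunksize_bits = ffz(~chunk_size) turns the power-of-two chunk size into a shift count, because the first zero bit of ~x is the first set bit of x, i.e. log2 for a power of two. The logical block then splits into an offset inside its chunk and a chunk number dealt round-robin across the zone's members, after which b_rdev and b_rsector are written exactly once (per the comment in the hunk about never storing temporaries there) and the buffer is resubmitted through generic_make_request(). The arithmetic for a single zone as a standalone sketch (zone and device offsets dropped, names illustrative):

	#include <stdio.h>

	/* log2 of a power-of-two chunk size; the kernel spells this
	 * ffz(~chunk_size), the first zero bit of the complement. */
	static int chunksize_bits(long chunk_size)
	{
		int bits = 0;

		while (~chunk_size & 1) {	/* low bit of ~x still set */
			chunk_size >>= 1;
			bits++;
		}
		return bits;
	}

	/* Map a logical 1K block to (device index, block on that device)
	 * for one zone of nb_dev equally sized members. */
	static void raid0_map_demo(long block, long chunk_size, int nb_dev,
				   int *dev, long *rblock)
	{
		int bits = chunksize_bits(chunk_size);
		long blk_in_chunk = block & (chunk_size - 1);
		long chunk = block / (nb_dev << bits);

		*dev = (block >> bits) % nb_dev;
		*rblock = (chunk << bits) + blk_in_chunk;
	}

	int main(void)
	{
		int dev;
		long rblock;

		/* 64K chunks (64 blocks of 1K), 3 members, block 200. */
		raid0_map_demo(200, 64, 3, &dev, &rblock);
		printf("block 200 -> dev %d, block %ld\n", dev, rblock);
		return 0;
	}

This prints dev 0, block 72: block 200 is offset 8 into logical chunk 3, which is the second chunk (chunk 1) on the first member. The patch's closing bh->b_rsector = rblock << 1 converts 1K blocks back into 512-byte sectors; the sketch stays in block units.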