Diffstat (limited to 'drivers/block/raid1.c')
-rw-r--r--  drivers/block/raid1.c | 1716
1 file changed, 1217 insertions(+), 499 deletions(-)
diff --git a/drivers/block/raid1.c b/drivers/block/raid1.c index 057be0d64..89b3e7514 100644 --- a/drivers/block/raid1.c +++ b/drivers/block/raid1.c @@ -1,6 +1,9 @@ -/************************************************************************ +/* * raid1.c : Multiple Devices driver for Linux - * Copyright (C) 1996 Ingo Molnar, Miguel de Icaza, Gadi Oxman + * + * Copyright (C) 1999, 2000 Ingo Molnar, Red Hat + * + * Copyright (C) 1996, 1997, 1998 Ingo Molnar, Miguel de Icaza, Gadi Oxman * * RAID-1 management functions. * @@ -15,50 +18,86 @@ */ #include <linux/module.h> -#include <linux/locks.h> #include <linux/malloc.h> -#include <linux/md.h> -#include <linux/raid1.h> -#include <asm/bitops.h> +#include <linux/raid/raid1.h> #include <asm/atomic.h> #define MAJOR_NR MD_MAJOR #define MD_DRIVER #define MD_PERSONALITY +#define MAX_LINEAR_SECTORS 128 + +#define MAX(a,b) ((a) > (b) ? (a) : (b)) +#define MIN(a,b) ((a) < (b) ? (a) : (b)) + /* * The following can be used to debug the driver */ -/*#define RAID1_DEBUG*/ -#ifdef RAID1_DEBUG -#define PRINTK(x) do { printk x; } while (0); +#define RAID1_DEBUG 0 + +#if RAID1_DEBUG +#define PRINTK(x...) printk(x) +#define inline +#define __inline__ #else -#define PRINTK(x) do { ; } while (0); +#define inline +#define __inline__ +#define PRINTK(x...) do { } while (0) #endif -#define MAX(a,b) ((a) > (b) ? (a) : (b)) -#define MIN(a,b) ((a) < (b) ? (a) : (b)) -static struct md_personality raid1_personality; -static struct md_thread *raid1_thread = NULL; -struct buffer_head *raid1_retry_list = NULL; +static mdk_personality_t raid1_personality; +static md_spinlock_t retry_list_lock = MD_SPIN_LOCK_UNLOCKED; +struct buffer_head *raid1_retry_list = NULL, **raid1_retry_tail; + +static void * raid1_kmalloc (int size) +{ + void * ptr; + /* + * now we are rather fault tolerant than nice, but + * there are a couple of places in the RAID code where we + * simply can not afford to fail an allocation because + * there is no failure return path (eg. make_request()) + */ + while (!(ptr = kmalloc (size, GFP_KERNEL))) + printk ("raid1: out of memory, retrying...\n"); + + memset(ptr, 0, size); + return ptr; +} + +static struct page * raid1_gfp (void) +{ + struct page *page; + /* + * now we are rather fault tolerant than nice, but + * there are a couple of places in the RAID code where we + * simply can not afford to fail an allocation because + * there is no failure return path (eg. make_request()) + * FIXME: be nicer here. + */ + while (!(page = (void*)alloc_page(GFP_KERNEL))) { + printk ("raid1: GFP out of memory, retrying...\n"); + schedule_timeout(2); + } + + return page; +} -static int __raid1_map (struct md_dev *mddev, kdev_t *rdev, - unsigned long *rsector, unsigned long size) +static int raid1_map (mddev_t *mddev, kdev_t *rdev, unsigned long size) { - struct raid1_data *raid_conf = (struct raid1_data *) mddev->private; - int i, n = raid_conf->raid_disks; + raid1_conf_t *conf = mddev_to_conf(mddev); + int i, disks = MD_SB_DISKS; /* * Later we do read balancing on the read side * now we use the first available disk. 
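raid1_map() deliberately stays dumb here: the real read balancing sits in raid1_make_request() further down. Reduced to a single helper, the decision looks roughly like the sketch below, built from the conf fields this patch introduces (last_used, next_sect, sect_count, per-mirror sect_limit, next, write_only). The helper name is hypothetical, and note the patch additionally pins the current mirror while a full resync is running.

	/*
	 * Sketch of the read-balancing decision: stay on the current
	 * mirror while the stream is sequential and under sect_limit,
	 * otherwise rotate around the ring of read-capable mirrors.
	 */
	static int raid1_pick_mirror(raid1_conf_t *conf,
				     unsigned long sector, int sectors)
	{
		int disk = conf->last_used;

		if (sector == conf->next_sect &&
		    (conf->sect_count += sectors) < conf->mirrors[disk].sect_limit)
			goto out;	/* sequential and under the limit */

		conf->sect_count = 0;
		do {	/* skip write-only (still resyncing) members */
			disk = conf->mirrors[disk].next;
		} while (conf->mirrors[disk].write_only);
		conf->last_used = disk;
	out:
		conf->next_sect = sector + sectors;
		return disk;
	}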
*/ - PRINTK(("raid1_map().\n")); - - for (i=0; i<n; i++) { - if (raid_conf->mirrors[i].operational) { - *rdev = raid_conf->mirrors[i].dev; + for (i = 0; i < disks; i++) { + if (conf->mirrors[i].operational) { + *rdev = conf->mirrors[i].dev; return (0); } } @@ -67,59 +106,80 @@ static int __raid1_map (struct md_dev *mddev, kdev_t *rdev, return (-1); } -static int raid1_map (struct md_dev *mddev, kdev_t *rdev, - unsigned long *rsector, unsigned long size) +static void raid1_reschedule_retry (struct buffer_head *bh) { - return 0; + unsigned long flags; + struct raid1_bh * r1_bh = (struct raid1_bh *)(bh->b_dev_id); + mddev_t *mddev = r1_bh->mddev; + raid1_conf_t *conf = mddev_to_conf(mddev); + + md_spin_lock_irqsave(&retry_list_lock, flags); + if (raid1_retry_list == NULL) + raid1_retry_tail = &raid1_retry_list; + *raid1_retry_tail = bh; + raid1_retry_tail = &r1_bh->next_retry; + r1_bh->next_retry = NULL; + md_spin_unlock_irqrestore(&retry_list_lock, flags); + md_wakeup_thread(conf->thread); } -void raid1_reschedule_retry (struct buffer_head *bh) -{ - struct raid1_bh * r1_bh = (struct raid1_bh *)(bh->b_dev_id); - PRINTK(("raid1_reschedule_retry().\n")); +static void inline io_request_done(unsigned long sector, raid1_conf_t *conf, int phase) +{ + unsigned long flags; + spin_lock_irqsave(&conf->segment_lock, flags); + if (sector < conf->start_active) + conf->cnt_done--; + else if (sector >= conf->start_future && conf->phase == phase) + conf->cnt_future--; + else if (!--conf->cnt_pending) + wake_up(&conf->wait_ready); + + spin_unlock_irqrestore(&conf->segment_lock, flags); +} - r1_bh->next_retry = raid1_retry_list; - raid1_retry_list = bh; - md_wakeup_thread(raid1_thread); +static void inline sync_request_done (unsigned long sector, raid1_conf_t *conf) +{ + unsigned long flags; + spin_lock_irqsave(&conf->segment_lock, flags); + if (sector >= conf->start_ready) + --conf->cnt_ready; + else if (sector >= conf->start_active) { + if (!--conf->cnt_active) { + conf->start_active = conf->start_ready; + wake_up(&conf->wait_done); + } + } + spin_unlock_irqrestore(&conf->segment_lock, flags); } /* - * raid1_end_buffer_io() is called when we have finished servicing a mirrored + * raid1_end_bh_io() is called when we have finished servicing a mirrored * operation and are ready to return a success/failure code to the buffer * cache layer. */ -static inline void raid1_end_buffer_io(struct raid1_bh *r1_bh, int uptodate) +static void raid1_end_bh_io (struct raid1_bh *r1_bh, int uptodate) { struct buffer_head *bh = r1_bh->master_bh; + io_request_done(bh->b_rsector, mddev_to_conf(r1_bh->mddev), + test_bit(R1BH_SyncPhase, &r1_bh->state)); + bh->b_end_io(bh, uptodate); kfree(r1_bh); } - -int raid1_one_error=0; - void raid1_end_request (struct buffer_head *bh, int uptodate) { struct raid1_bh * r1_bh = (struct raid1_bh *)(bh->b_dev_id); - unsigned long flags; - save_flags(flags); - cli(); - PRINTK(("raid1_end_request().\n")); - - if (raid1_one_error) { - raid1_one_error=0; - uptodate=0; - } /* * this branch is our 'one mirror IO has finished' event handler: */ if (!uptodate) md_error (bh->b_dev, bh->b_rdev); - else { + else /* - * Set BH_Uptodate in our master buffer_head, so that + * Set R1BH_Uptodate in our master buffer_head, so that * we will return a good error code for to the higher * levels even if IO on some other mirrored buffer fails. * @@ -127,8 +187,7 @@ void raid1_end_request (struct buffer_head *bh, int uptodate) * user-side. So if something waits for IO, then it will * wait for the 'master' buffer_head. 
*/ - set_bit (BH_Uptodate, &r1_bh->state); - } + set_bit (R1BH_Uptodate, &r1_bh->state); /* * We split up the read and write side, imho they are @@ -136,239 +195,253 @@ void raid1_end_request (struct buffer_head *bh, int uptodate) */ if ( (r1_bh->cmd == READ) || (r1_bh->cmd == READA) ) { - - PRINTK(("raid1_end_request(), read branch.\n")); - /* * we have only one buffer_head on the read side */ + if (uptodate) { - PRINTK(("raid1_end_request(), read branch, uptodate.\n")); - raid1_end_buffer_io(r1_bh, uptodate); - restore_flags(flags); + raid1_end_bh_io(r1_bh, uptodate); return; } /* * oops, read error: */ printk(KERN_ERR "raid1: %s: rescheduling block %lu\n", - kdevname(bh->b_dev), bh->b_blocknr); - raid1_reschedule_retry (bh); - restore_flags(flags); + partition_name(bh->b_dev), bh->b_blocknr); + raid1_reschedule_retry(bh); return; } /* - * WRITE. - */ - PRINTK(("raid1_end_request(), write branch.\n")); - - /* + * WRITE: + * * Let's see if all mirrored write operations have finished - * already [we have irqs off, so we can decrease]: + * already. */ - if (!--r1_bh->remaining) { - struct md_dev *mddev = r1_bh->mddev; - struct raid1_data *raid_conf = (struct raid1_data *) mddev->private; - int i, n = raid_conf->raid_disks; - - PRINTK(("raid1_end_request(), remaining == 0.\n")); + if (atomic_dec_and_test(&r1_bh->remaining)) { + int i, disks = MD_SB_DISKS; - for ( i=0; i<n; i++) - if (r1_bh->mirror_bh[i]) kfree(r1_bh->mirror_bh[i]); + for ( i = 0; i < disks; i++) { + struct buffer_head *bh = r1_bh->mirror_bh[i]; + if (bh) { + // FIXME: make us a regular bcache member + kfree(bh); + } + } - raid1_end_buffer_io(r1_bh, test_bit(BH_Uptodate, &r1_bh->state)); + raid1_end_bh_io(r1_bh, test_bit(R1BH_Uptodate, &r1_bh->state)); } - else PRINTK(("raid1_end_request(), remaining == %u.\n", r1_bh->remaining)); - restore_flags(flags); } -/* This routine checks if the undelying device is an md device and in that - * case it maps the blocks before putting the request on the queue - */ -static inline void -map_and_make_request (int rw, struct buffer_head *bh) -{ - if (MAJOR (bh->b_rdev) == MD_MAJOR) - md_map (MINOR (bh->b_rdev), &bh->b_rdev, &bh->b_rsector, bh->b_size >> 9); - clear_bit(BH_Lock, &bh->b_state); - make_request (MAJOR (bh->b_rdev), rw, bh); -} - -static int -raid1_make_request (struct md_dev *mddev, int rw, struct buffer_head * bh) +static int raid1_make_request (request_queue_t *q, mddev_t *mddev, int rw, + struct buffer_head * bh) { - - struct raid1_data *raid_conf = (struct raid1_data *) mddev->private; + raid1_conf_t *conf = mddev_to_conf(mddev); struct buffer_head *mirror_bh[MD_SB_DISKS], *bh_req; struct raid1_bh * r1_bh; - int n = raid_conf->raid_disks, i, sum_bhs = 0, switch_disks = 0, sectors; + int disks = MD_SB_DISKS; + int i, sum_bhs = 0, switch_disks = 0, sectors; struct mirror_info *mirror; + DECLARE_WAITQUEUE(wait, current); - PRINTK(("raid1_make_request().\n")); - - while (!( /* FIXME: now we are rather fault tolerant than nice */ - r1_bh = kmalloc (sizeof (struct raid1_bh), GFP_KERNEL) - ) ) - { - printk ("raid1_make_request(#1): out of memory\n"); - current->policy |= SCHED_YIELD; - schedule(); - } - memset (r1_bh, 0, sizeof (struct raid1_bh)); - + if (!buffer_locked(bh)) + BUG(); + /* * make_request() can abort the operation when READA is being * used and no empty request is available. * * Currently, just replace the command with READ/WRITE. */ - if (rw == READA) rw = READ; + if (rw == READA) + rw = READ; - if (rw == WRITE) - mark_buffer_clean(bh); /* Too early ? 
*/ + if (rw == WRITE) { + rw = WRITERAW; + /* + * we first clean the bh, then we start the IO, then + * when the IO has finished, we end_io the bh and + * mark it uptodate. This way we do not miss the + * case when the bh got dirty again during the IO. + * + * We do an important optimization here - if the + * buffer was not dirty and we are during resync or + * reconstruction, then we can skip writing it back + * to the master disk! (we still have to write it + * back to the other disks, because we are not sync + * yet.) + */ + if (atomic_set_buffer_clean(bh)) + __mark_buffer_clean(bh); + else { + bh->b_end_io(bh, test_bit(BH_Uptodate, &bh->b_state)); + return 0; + } + } + r1_bh = raid1_kmalloc (sizeof (struct raid1_bh)); -/* - * i think the read and write branch should be separated completely, since we want - * to do read balancing on the read side for example. Comments? :) --mingo - */ - r1_bh->master_bh=bh; - r1_bh->mddev=mddev; + spin_lock_irq(&conf->segment_lock); + wait_event_lock_irq(conf->wait_done, + bh->b_rsector < conf->start_active || + bh->b_rsector >= conf->start_future, + conf->segment_lock); + if (bh->b_rsector < conf->start_active) + conf->cnt_done++; + else { + conf->cnt_future++; + if (conf->phase) + set_bit(R1BH_SyncPhase, &r1_bh->state); + } + spin_unlock_irq(&conf->segment_lock); + + /* + * i think the read and write branch should be separated completely, + * since we want to do read balancing on the read side for example. + * Alternative implementations? :) --mingo + */ + + r1_bh->master_bh = bh; + r1_bh->mddev = mddev; r1_bh->cmd = rw; + bh->b_rsector = bh->b_blocknr * (bh->b_size>>9); + + if (rw == READ) { + int last_used = conf->last_used; - if (rw==READ || rw==READA) { - int last_used = raid_conf->last_used; - PRINTK(("raid1_make_request(), read branch.\n")); - mirror = raid_conf->mirrors + last_used; + /* + * read balancing logic: + */ + mirror = conf->mirrors + last_used; bh->b_rdev = mirror->dev; sectors = bh->b_size >> 9; - if (bh->b_blocknr * sectors == raid_conf->next_sect) { - raid_conf->sect_count += sectors; - if (raid_conf->sect_count >= mirror->sect_limit) + + switch_disks = 0; + if (bh->b_blocknr * sectors == conf->next_sect) { + conf->sect_count += sectors; + if (conf->sect_count >= mirror->sect_limit) switch_disks = 1; } else switch_disks = 1; - raid_conf->next_sect = (bh->b_blocknr + 1) * sectors; - if (switch_disks) { - PRINTK(("read-balancing: switching %d -> %d (%d sectors)\n", last_used, mirror->next, raid_conf->sect_count)); - raid_conf->sect_count = 0; - last_used = raid_conf->last_used = mirror->next; + conf->next_sect = (bh->b_blocknr + 1) * sectors; + /* + * Do not switch disks if full resync is in progress ... + */ + if (switch_disks && !conf->resync_mirrors) { + conf->sect_count = 0; + last_used = conf->last_used = mirror->next; /* - * Do not switch to write-only disks ... resyncing - * is in progress + * Do not switch to write-only disks ... + * reconstruction is in progress */ - while (raid_conf->mirrors[last_used].write_only) - raid_conf->last_used = raid_conf->mirrors[last_used].next; + while (conf->mirrors[last_used].write_only) + conf->last_used = conf->mirrors[last_used].next; } - PRINTK (("raid1 read queue: %d %d\n", MAJOR (bh->b_rdev), MINOR (bh->b_rdev))); bh_req = &r1_bh->bh_req; memcpy(bh_req, bh, sizeof(*bh)); bh_req->b_end_io = raid1_end_request; bh_req->b_dev_id = r1_bh; - map_and_make_request (rw, bh_req); + q = blk_get_queue(bh_req->b_rdev); + generic_make_request (q, rw, bh_req); return 0; } /* - * WRITE. 
+ * WRITE: */ - PRINTK(("raid1_make_request(n=%d), write branch.\n",n)); - for (i = 0; i < n; i++) { + for (i = 0; i < disks; i++) { - if (!raid_conf->mirrors [i].operational) { + if (!conf->mirrors[i].operational) { /* * the r1_bh->mirror_bh[i] pointer remains NULL */ mirror_bh[i] = NULL; continue; } - + /* * We should use a private pool (size depending on NR_REQUEST), * to avoid writes filling up the memory with bhs * - * Such pools are much faster than kmalloc anyways (so we waste almost - * nothing by not using the master bh when writing and win alot of cleanness) - * - * but for now we are cool enough. --mingo - * + * Such pools are much faster than kmalloc anyways (so we waste + * almost nothing by not using the master bh when writing and + * win alot of cleanness) but for now we are cool enough. --mingo + * * It's safe to sleep here, buffer heads cannot be used in a shared - * manner in the write branch. Look how we lock the buffer at the beginning - * of this function to grok the difference ;) + * manner in the write branch. Look how we lock the buffer at the + * beginning of this function to grok the difference ;) */ - while (!( /* FIXME: now we are rather fault tolerant than nice */ - mirror_bh[i] = kmalloc (sizeof (struct buffer_head), GFP_KERNEL) - ) ) - { - printk ("raid1_make_request(#2): out of memory\n"); - current->policy |= SCHED_YIELD; - schedule(); - } - memset (mirror_bh[i], 0, sizeof (struct buffer_head)); - - /* - * prepare mirrored bh (fields ordered for max mem throughput): - */ - mirror_bh [i]->b_blocknr = bh->b_blocknr; - mirror_bh [i]->b_dev = bh->b_dev; - mirror_bh [i]->b_rdev = raid_conf->mirrors [i].dev; - mirror_bh [i]->b_rsector = bh->b_rsector; - mirror_bh [i]->b_state = (1<<BH_Req) | (1<<BH_Dirty); - atomic_set(&mirror_bh [i]->b_count, 1); - mirror_bh [i]->b_size = bh->b_size; - mirror_bh [i]->b_data = bh->b_data; - mirror_bh [i]->b_list = BUF_LOCKED; - mirror_bh [i]->b_end_io = raid1_end_request; - mirror_bh [i]->b_dev_id = r1_bh; - - r1_bh->mirror_bh[i] = mirror_bh[i]; + mirror_bh[i] = raid1_kmalloc(sizeof(struct buffer_head)); + mirror_bh[i]->b_this_page = (struct buffer_head *)1; + + /* + * prepare mirrored bh (fields ordered for max mem throughput): + */ + mirror_bh[i]->b_blocknr = bh->b_blocknr; + mirror_bh[i]->b_dev = bh->b_dev; + mirror_bh[i]->b_rdev = conf->mirrors[i].dev; + mirror_bh[i]->b_rsector = bh->b_rsector; + mirror_bh[i]->b_state = (1<<BH_Req) | (1<<BH_Dirty) | + (1<<BH_Mapped) | (1<<BH_Lock); + + atomic_set(&mirror_bh[i]->b_count, 1); + mirror_bh[i]->b_size = bh->b_size; + mirror_bh[i]->b_data = bh->b_data; + mirror_bh[i]->b_list = BUF_LOCKED; + mirror_bh[i]->b_end_io = raid1_end_request; + mirror_bh[i]->b_dev_id = r1_bh; + + r1_bh->mirror_bh[i] = mirror_bh[i]; sum_bhs++; } - r1_bh->remaining = sum_bhs; - - PRINTK(("raid1_make_request(), write branch, sum_bhs=%d.\n",sum_bhs)); + md_atomic_set(&r1_bh->remaining, sum_bhs); /* - * We have to be a bit careful about the semaphore above, thats why we - * start the requests separately. Since kmalloc() could fail, sleep and - * make_request() can sleep too, this is the safer solution. Imagine, - * end_request decreasing the semaphore before we could have set it up ... - * We could play tricks with the semaphore (presetting it and correcting - * at the end if sum_bhs is not 'n' but we have to do end_request by hand - * if all requests finish until we had a chance to set up the semaphore - * correctly ... lots of races). 
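The removed comment above (tightened just below) is about completion fan-in: remaining must hold its final value before any mirror bh is submitted, or a fast completion could act on a half-built count. With the atomic_t this patch adopts, the finishing side condenses to a sketch like this (a restatement of the logic in raid1_end_request earlier, not a separate function in the patch):

	/*
	 * Last-one-out completion: exactly one of the N mirror
	 * completions sees atomic_dec_and_test() return true, and only
	 * that caller may free per-request state and end the master bh.
	 */
	static void mirror_write_done(struct raid1_bh *r1_bh)
	{
		if (atomic_dec_and_test(&r1_bh->remaining))
			raid1_end_bh_io(r1_bh,
				test_bit(R1BH_Uptodate, &r1_bh->state));
	}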
+ * We have to be a bit careful about the semaphore above, thats + * why we start the requests separately. Since kmalloc() could + * fail, sleep and make_request() can sleep too, this is the + * safer solution. Imagine, end_request decreasing the semaphore + * before we could have set it up ... We could play tricks with + * the semaphore (presetting it and correcting at the end if + * sum_bhs is not 'n' but we have to do end_request by hand if + * all requests finish until we had a chance to set up the + * semaphore correctly ... lots of races). */ - for (i = 0; i < n; i++) - if (mirror_bh [i] != NULL) - map_and_make_request (rw, mirror_bh [i]); - + for (i = 0; i < disks; i++) { + struct buffer_head *mbh = mirror_bh[i]; + if (mbh) { + q = blk_get_queue(mbh->b_rdev); + generic_make_request(q, rw, mbh); + } + } return (0); } - -static int raid1_status (char *page, int minor, struct md_dev *mddev) + +static int raid1_status (char *page, mddev_t *mddev) { - struct raid1_data *raid_conf = (struct raid1_data *) mddev->private; + raid1_conf_t *conf = mddev_to_conf(mddev); int sz = 0, i; - sz += sprintf (page+sz, " [%d/%d] [", raid_conf->raid_disks, raid_conf->working_disks); - for (i = 0; i < raid_conf->raid_disks; i++) - sz += sprintf (page+sz, "%s", raid_conf->mirrors [i].operational ? "U" : "_"); + sz += sprintf (page+sz, " [%d/%d] [", conf->raid_disks, + conf->working_disks); + for (i = 0; i < conf->raid_disks; i++) + sz += sprintf (page+sz, "%s", + conf->mirrors[i].operational ? "U" : "_"); sz += sprintf (page+sz, "]"); return sz; } -static void raid1_fix_links (struct raid1_data *raid_conf, int failed_index) +static void unlink_disk (raid1_conf_t *conf, int target) { - int disks = raid_conf->raid_disks; - int j; + int disks = MD_SB_DISKS; + int i; - for (j = 0; j < disks; j++) - if (raid_conf->mirrors [j].next == failed_index) - raid_conf->mirrors [j].next = raid_conf->mirrors [failed_index].next; + for (i = 0; i < disks; i++) + if (conf->mirrors[i].next == target) + conf->mirrors[i].next = conf->mirrors[target].next; } #define LAST_DISK KERN_ALERT \ @@ -379,7 +452,7 @@ static void raid1_fix_links (struct raid1_data *raid_conf, int failed_index) #define DISK_FAILED KERN_ALERT \ "raid1: Disk failure on %s, disabling device. 
\n" \ -" Operation continuing on %d devices\n" +" Operation continuing on %d devices\n" #define START_SYNCING KERN_ALERT \ "raid1: start syncing spare disk.\n" @@ -387,48 +460,53 @@ static void raid1_fix_links (struct raid1_data *raid_conf, int failed_index) #define ALREADY_SYNCING KERN_INFO \ "raid1: syncing already in progress.\n" -static int raid1_error (struct md_dev *mddev, kdev_t dev) +static void mark_disk_bad (mddev_t *mddev, int failed) { - struct raid1_data *raid_conf = (struct raid1_data *) mddev->private; - struct mirror_info *mirror; - md_superblock_t *sb = mddev->sb; - int disks = raid_conf->raid_disks; - int i; + raid1_conf_t *conf = mddev_to_conf(mddev); + struct mirror_info *mirror = conf->mirrors+failed; + mdp_super_t *sb = mddev->sb; + + mirror->operational = 0; + unlink_disk(conf, failed); + mark_disk_faulty(sb->disks+mirror->number); + mark_disk_nonsync(sb->disks+mirror->number); + mark_disk_inactive(sb->disks+mirror->number); + sb->active_disks--; + sb->working_disks--; + sb->failed_disks++; + mddev->sb_dirty = 1; + md_wakeup_thread(conf->thread); + conf->working_disks--; + printk (DISK_FAILED, partition_name (mirror->dev), + conf->working_disks); +} - PRINTK(("raid1_error called\n")); +static int raid1_error (mddev_t *mddev, kdev_t dev) +{ + raid1_conf_t *conf = mddev_to_conf(mddev); + struct mirror_info * mirrors = conf->mirrors; + int disks = MD_SB_DISKS; + int i; - if (raid_conf->working_disks == 1) { + if (conf->working_disks == 1) { /* * Uh oh, we can do nothing if this is our last disk, but * first check if this is a queued request for a device * which has just failed. */ - for (i = 0, mirror = raid_conf->mirrors; i < disks; - i++, mirror++) - if (mirror->dev == dev && !mirror->operational) + for (i = 0; i < disks; i++) { + if (mirrors[i].dev==dev && !mirrors[i].operational) return 0; + } printk (LAST_DISK); } else { - /* Mark disk as unusable */ - for (i = 0, mirror = raid_conf->mirrors; i < disks; - i++, mirror++) { - if (mirror->dev == dev && mirror->operational){ - mirror->operational = 0; - raid1_fix_links (raid_conf, i); - sb->disks[mirror->number].state |= - (1 << MD_FAULTY_DEVICE); - sb->disks[mirror->number].state &= - ~(1 << MD_SYNC_DEVICE); - sb->disks[mirror->number].state &= - ~(1 << MD_ACTIVE_DEVICE); - sb->active_disks--; - sb->working_disks--; - sb->failed_disks++; - mddev->sb_dirty = 1; - md_wakeup_thread(raid1_thread); - raid_conf->working_disks--; - printk (DISK_FAILED, kdevname (dev), - raid_conf->working_disks); + /* + * Mark disk as unusable + */ + for (i = 0; i < disks; i++) { + if (mirrors[i].dev==dev && mirrors[i].operational) { + mark_disk_bad(mddev, i); + break; } } } @@ -441,219 +519,658 @@ static int raid1_error (struct md_dev *mddev, kdev_t dev) #undef START_SYNCING /* - * This is the personality-specific hot-addition routine + * Insert the spare disk into the drive-ring */ +static void link_disk(raid1_conf_t *conf, struct mirror_info *mirror) +{ + int j, next; + int disks = MD_SB_DISKS; + struct mirror_info *p = conf->mirrors; -#define NO_SUPERBLOCK KERN_ERR \ -"raid1: cannot hot-add disk to the array with no RAID superblock\n" - -#define WRONG_LEVEL KERN_ERR \ -"raid1: hot-add: level of disk is not RAID-1\n" + for (j = 0; j < disks; j++, p++) + if (p->operational && !p->write_only) { + next = p->next; + p->next = mirror->raid_disk; + mirror->next = next; + return; + } -#define HOT_ADD_SUCCEEDED KERN_INFO \ -"raid1: device %s hot-added\n" + printk("raid1: bug: no read-operational devices\n"); +} -static int raid1_hot_add_disk 
(struct md_dev *mddev, kdev_t dev) +static void print_raid1_conf (raid1_conf_t *conf) { - unsigned long flags; - struct raid1_data *raid_conf = (struct raid1_data *) mddev->private; - struct mirror_info *mirror; - md_superblock_t *sb = mddev->sb; - struct real_dev * realdev; - int n; + int i; + struct mirror_info *tmp; + + printk("RAID1 conf printout:\n"); + if (!conf) { + printk("(conf==NULL)\n"); + return; + } + printk(" --- wd:%d rd:%d nd:%d\n", conf->working_disks, + conf->raid_disks, conf->nr_disks); + + for (i = 0; i < MD_SB_DISKS; i++) { + tmp = conf->mirrors + i; + printk(" disk %d, s:%d, o:%d, n:%d rd:%d us:%d dev:%s\n", + i, tmp->spare,tmp->operational, + tmp->number,tmp->raid_disk,tmp->used_slot, + partition_name(tmp->dev)); + } +} +static int raid1_diskop(mddev_t *mddev, mdp_disk_t **d, int state) +{ + int err = 0; + int i, failed_disk=-1, spare_disk=-1, removed_disk=-1, added_disk=-1; + raid1_conf_t *conf = mddev->private; + struct mirror_info *tmp, *sdisk, *fdisk, *rdisk, *adisk; + mdp_super_t *sb = mddev->sb; + mdp_disk_t *failed_desc, *spare_desc, *added_desc; + + print_raid1_conf(conf); + md_spin_lock_irq(&conf->device_lock); /* - * The device has its superblock already read and it was found - * to be consistent for generic RAID usage. Now we check whether - * it's usable for RAID-1 hot addition. + * find the disk ... */ + switch (state) { - n = mddev->nb_dev++; - realdev = &mddev->devices[n]; - if (!realdev->sb) { - printk (NO_SUPERBLOCK); - return -EINVAL; - } - if (realdev->sb->level != 1) { - printk (WRONG_LEVEL); - return -EINVAL; + case DISKOP_SPARE_ACTIVE: + + /* + * Find the failed disk within the RAID1 configuration ... + * (this can only be in the first conf->working_disks part) + */ + for (i = 0; i < conf->raid_disks; i++) { + tmp = conf->mirrors + i; + if ((!tmp->operational && !tmp->spare) || + !tmp->used_slot) { + failed_disk = i; + break; + } + } + /* + * When we activate a spare disk we _must_ have a disk in + * the lower (active) part of the array to replace. + */ + if ((failed_disk == -1) || (failed_disk >= conf->raid_disks)) { + MD_BUG(); + err = 1; + goto abort; + } + /* fall through */ + + case DISKOP_SPARE_WRITE: + case DISKOP_SPARE_INACTIVE: + + /* + * Find the spare disk ... (can only be in the 'high' + * area of the array) + */ + for (i = conf->raid_disks; i < MD_SB_DISKS; i++) { + tmp = conf->mirrors + i; + if (tmp->spare && tmp->number == (*d)->number) { + spare_disk = i; + break; + } + } + if (spare_disk == -1) { + MD_BUG(); + err = 1; + goto abort; + } + break; + + case DISKOP_HOT_REMOVE_DISK: + + for (i = 0; i < MD_SB_DISKS; i++) { + tmp = conf->mirrors + i; + if (tmp->used_slot && (tmp->number == (*d)->number)) { + if (tmp->operational) { + err = -EBUSY; + goto abort; + } + removed_disk = i; + break; + } + } + if (removed_disk == -1) { + MD_BUG(); + err = 1; + goto abort; + } + break; + + case DISKOP_HOT_ADD_DISK: + + for (i = conf->raid_disks; i < MD_SB_DISKS; i++) { + tmp = conf->mirrors + i; + if (!tmp->used_slot) { + added_disk = i; + break; + } + } + if (added_disk == -1) { + MD_BUG(); + err = 1; + goto abort; + } + break; } - /* FIXME: are there other things left we could sanity-check? */ + switch (state) { + /* + * Switch the spare disk to write-only mode: + */ + case DISKOP_SPARE_WRITE: + sdisk = conf->mirrors + spare_disk; + sdisk->operational = 1; + sdisk->write_only = 1; + break; /* - * We have to disable interrupts, as our RAID-1 state is used - * from irq handlers as well. 
+ * Deactivate a spare disk: */ - save_flags(flags); - cli(); + case DISKOP_SPARE_INACTIVE: + sdisk = conf->mirrors + spare_disk; + sdisk->operational = 0; + sdisk->write_only = 0; + break; + /* + * Activate (mark read-write) the (now sync) spare disk, + * which means we switch it's 'raid position' (->raid_disk) + * with the failed disk. (only the first 'conf->nr_disks' + * slots are used for 'real' disks and we must preserve this + * property) + */ + case DISKOP_SPARE_ACTIVE: - raid_conf->raid_disks++; - mirror = raid_conf->mirrors+n; + sdisk = conf->mirrors + spare_disk; + fdisk = conf->mirrors + failed_disk; - mirror->number=n; - mirror->raid_disk=n; - mirror->dev=dev; - mirror->next=0; /* FIXME */ - mirror->sect_limit=128; + spare_desc = &sb->disks[sdisk->number]; + failed_desc = &sb->disks[fdisk->number]; - mirror->operational=0; - mirror->spare=1; - mirror->write_only=0; + if (spare_desc != *d) { + MD_BUG(); + err = 1; + goto abort; + } - sb->disks[n].state |= (1 << MD_FAULTY_DEVICE); - sb->disks[n].state &= ~(1 << MD_SYNC_DEVICE); - sb->disks[n].state &= ~(1 << MD_ACTIVE_DEVICE); - sb->nr_disks++; - sb->spare_disks++; + if (spare_desc->raid_disk != sdisk->raid_disk) { + MD_BUG(); + err = 1; + goto abort; + } + + if (sdisk->raid_disk != spare_disk) { + MD_BUG(); + err = 1; + goto abort; + } - restore_flags(flags); + if (failed_desc->raid_disk != fdisk->raid_disk) { + MD_BUG(); + err = 1; + goto abort; + } - md_update_sb(MINOR(dev)); + if (fdisk->raid_disk != failed_disk) { + MD_BUG(); + err = 1; + goto abort; + } - printk (HOT_ADD_SUCCEEDED, kdevname(realdev->dev)); + /* + * do the switch finally + */ + xchg_values(*spare_desc, *failed_desc); + xchg_values(*fdisk, *sdisk); - return 0; -} + /* + * (careful, 'failed' and 'spare' are switched from now on) + * + * we want to preserve linear numbering and we want to + * give the proper raid_disk number to the now activated + * disk. (this means we switch back these values) + */ + + xchg_values(spare_desc->raid_disk, failed_desc->raid_disk); + xchg_values(sdisk->raid_disk, fdisk->raid_disk); + xchg_values(spare_desc->number, failed_desc->number); + xchg_values(sdisk->number, fdisk->number); -#undef NO_SUPERBLOCK -#undef WRONG_LEVEL -#undef HOT_ADD_SUCCEEDED + *d = failed_desc; -/* - * Insert the spare disk into the drive-ring - */ -static void add_ring(struct raid1_data *raid_conf, struct mirror_info *mirror) -{ - int j, next; - struct mirror_info *p = raid_conf->mirrors; + if (sdisk->dev == MKDEV(0,0)) + sdisk->used_slot = 0; + /* + * this really activates the spare. + */ + fdisk->spare = 0; + fdisk->write_only = 0; + link_disk(conf, fdisk); - for (j = 0; j < raid_conf->raid_disks; j++, p++) - if (p->operational && !p->write_only) { - next = p->next; - p->next = mirror->raid_disk; - mirror->next = next; - return; + /* + * if we activate a spare, we definitely replace a + * non-operational disk slot in the 'low' area of + * the disk array. 
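The four xchg_values() calls above implement "swap everything except identity": the failed and spare records trade contents wholesale, then number and raid_disk are exchanged back so each identity stays with its slot. A standalone demonstration of the net effect (plain C, illustrative two-field type, xchg_values re-declared locally):

	#include <stdio.h>

	#define xchg_values(a, b) \
		do { typeof(a) t = (a); (a) = (b); (b) = t; } while (0)

	struct slot { int raid_disk; int dev; };

	int main(void)
	{
		struct slot failed = { 0, 11 }, spare = { 4, 22 };

		xchg_values(failed, spare);			/* payload + identity */
		xchg_values(failed.raid_disk, spare.raid_disk);	/* identity back */
		/* prints raid_disk=0 dev=22: the spare's device now
		 * occupies the failed disk's array position */
		printf("low slot: raid_disk=%d dev=%d\n",
		       failed.raid_disk, failed.dev);
		return 0;
	}

(typeof is a GCC extension, as in the kernel's own macros.)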
+ */ + + conf->working_disks++; + + break; + + case DISKOP_HOT_REMOVE_DISK: + rdisk = conf->mirrors + removed_disk; + + if (rdisk->spare && (removed_disk < conf->raid_disks)) { + MD_BUG(); + err = 1; + goto abort; + } + rdisk->dev = MKDEV(0,0); + rdisk->used_slot = 0; + conf->nr_disks--; + break; + + case DISKOP_HOT_ADD_DISK: + adisk = conf->mirrors + added_disk; + added_desc = *d; + + if (added_disk != added_desc->number) { + MD_BUG(); + err = 1; + goto abort; } - printk("raid1: bug: no read-operational devices\n"); -} -static int raid1_mark_spare(struct md_dev *mddev, md_descriptor_t *spare, - int state) -{ - int i = 0, failed_disk = -1; - struct raid1_data *raid_conf = mddev->private; - struct mirror_info *mirror = raid_conf->mirrors; - md_descriptor_t *descriptor; - unsigned long flags; + adisk->number = added_desc->number; + adisk->raid_disk = added_desc->raid_disk; + adisk->dev = MKDEV(added_desc->major,added_desc->minor); - for (i = 0; i < MD_SB_DISKS; i++, mirror++) { - if (mirror->spare && mirror->number == spare->number) - goto found; - } - return 1; -found: - for (i = 0, mirror = raid_conf->mirrors; i < raid_conf->raid_disks; - i++, mirror++) - if (!mirror->operational) - failed_disk = i; + adisk->operational = 0; + adisk->write_only = 0; + adisk->spare = 1; + adisk->used_slot = 1; + conf->nr_disks++; - save_flags(flags); - cli(); - switch (state) { - case SPARE_WRITE: - mirror->operational = 1; - mirror->write_only = 1; - raid_conf->raid_disks = MAX(raid_conf->raid_disks, - mirror->raid_disk + 1); - break; - case SPARE_INACTIVE: - mirror->operational = 0; - mirror->write_only = 0; - break; - case SPARE_ACTIVE: - mirror->spare = 0; - mirror->write_only = 0; - raid_conf->working_disks++; - add_ring(raid_conf, mirror); - - if (failed_disk != -1) { - descriptor = &mddev->sb->disks[raid_conf->mirrors[failed_disk].number]; - i = spare->raid_disk; - spare->raid_disk = descriptor->raid_disk; - descriptor->raid_disk = i; - } - break; - default: - printk("raid1_mark_spare: bug: state == %d\n", state); - restore_flags(flags); - return 1; + break; + + default: + MD_BUG(); + err = 1; + goto abort; } - restore_flags(flags); - return 0; +abort: + md_spin_unlock_irq(&conf->device_lock); + print_raid1_conf(conf); + return err; } + +#define IO_ERROR KERN_ALERT \ +"raid1: %s: unrecoverable I/O read error for block %lu\n" + +#define REDIRECT_SECTOR KERN_ERR \ +"raid1: %s: redirecting sector %lu to another mirror\n" + /* * This is a kernel thread which: * * 1. Retries failed read operations on working mirrors. * 2. Updates the raid superblock when problems are encountered. + * 3. Performs writes following reads for array synchronising.
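raid1d() below has one easy-to-miss property: entries are popped under retry_list_lock, handled with the lock dropped, and the final iteration breaks out still holding the lock, which the unlock after the loop releases. The skeleton, using the same names as the code that follows:

	for (;;) {
		md_spin_lock_irqsave(&retry_list_lock, flags);
		bh = raid1_retry_list;
		if (!bh)
			break;		/* leaves the loop with the lock held */
		r1_bh = (struct raid1_bh *)(bh->b_dev_id);
		raid1_retry_list = r1_bh->next_retry;
		md_spin_unlock_irqrestore(&retry_list_lock, flags);
		/* dispatch on r1_bh->cmd: SPECIAL (resync write-back)
		 * or READ/READA (redirect to another mirror) */
	}
	md_spin_unlock_irqrestore(&retry_list_lock, flags); /* pairs with the break */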
*/ -void raid1d (void *data) +static void end_sync_write(struct buffer_head *bh, int uptodate); +static void end_sync_read(struct buffer_head *bh, int uptodate); + +static void raid1d (void *data) { + struct raid1_bh *r1_bh; struct buffer_head *bh; - kdev_t dev; unsigned long flags; - struct raid1_bh * r1_bh; - struct md_dev *mddev; + request_queue_t *q; + mddev_t *mddev; + kdev_t dev; - PRINTK(("raid1d() active\n")); - save_flags(flags); - cli(); - while (raid1_retry_list) { + + for (;;) { + md_spin_lock_irqsave(&retry_list_lock, flags); bh = raid1_retry_list; + if (!bh) + break; r1_bh = (struct raid1_bh *)(bh->b_dev_id); raid1_retry_list = r1_bh->next_retry; - restore_flags(flags); + md_spin_unlock_irqrestore(&retry_list_lock, flags); - mddev = md_dev + MINOR(bh->b_dev); + mddev = kdev_to_mddev(bh->b_dev); if (mddev->sb_dirty) { - printk("dirty sb detected, updating.\n"); + printk(KERN_INFO "dirty sb detected, updating.\n"); mddev->sb_dirty = 0; - md_update_sb(MINOR(bh->b_dev)); - } - dev = bh->b_rdev; - __raid1_map (md_dev + MINOR(bh->b_dev), &bh->b_rdev, &bh->b_rsector, bh->b_size >> 9); - if (bh->b_rdev == dev) { - printk (KERN_ALERT - "raid1: %s: unrecoverable I/O read error for block %lu\n", - kdevname(bh->b_dev), bh->b_blocknr); - raid1_end_buffer_io(r1_bh, 0); - } else { - printk (KERN_ERR "raid1: %s: redirecting sector %lu to another mirror\n", - kdevname(bh->b_dev), bh->b_blocknr); - map_and_make_request (r1_bh->cmd, bh); + md_update_sb(mddev); } - cli(); + switch(r1_bh->cmd) { + case SPECIAL: + /* have to allocate lots of bh structures and + * schedule writes + */ + if (test_bit(R1BH_Uptodate, &r1_bh->state)) { + int i, sum_bhs = 0; + int disks = MD_SB_DISKS; + struct buffer_head *mirror_bh[MD_SB_DISKS]; + raid1_conf_t *conf; + + conf = mddev_to_conf(mddev); + for (i = 0; i < disks ; i++) { + if (!conf->mirrors[i].operational) { + mirror_bh[i] = NULL; + continue; + } + if (i==conf->last_used) { + /* we read from here, no need to write */ + mirror_bh[i] = NULL; + continue; + } + if (i < conf->raid_disks + && !conf->resync_mirrors) { + /* don't need to write this, + * we are just rebuilding */ + mirror_bh[i] = NULL; + continue; + } + + mirror_bh[i] = raid1_kmalloc(sizeof(struct buffer_head)); + mirror_bh[i]->b_this_page = (struct buffer_head *)1; + + /* + * prepare mirrored bh (fields ordered for max mem throughput): + */ + mirror_bh[i]->b_blocknr = bh->b_blocknr; + mirror_bh[i]->b_dev = bh->b_dev; + mirror_bh[i]->b_rdev = conf->mirrors[i].dev; + mirror_bh[i]->b_rsector = bh->b_rsector; + mirror_bh[i]->b_state = (1<<BH_Req) | (1<<BH_Dirty) | + (1<<BH_Mapped) | (1<<BH_Lock); + + atomic_set(&mirror_bh[i]->b_count, 1); + mirror_bh[i]->b_size = bh->b_size; + mirror_bh[i]->b_data = bh->b_data; + mirror_bh[i]->b_list = BUF_LOCKED; + mirror_bh[i]->b_end_io = end_sync_write; + mirror_bh[i]->b_dev_id = r1_bh; + + r1_bh->mirror_bh[i] = mirror_bh[i]; + sum_bhs++; + } + md_atomic_set(&r1_bh->remaining, sum_bhs); + for ( i = 0; i < disks ; i++) { + struct buffer_head *mbh = mirror_bh[i]; + if (mbh) { + q = blk_get_queue(mbh->b_rdev); + generic_make_request(q, WRITE, mbh); + } + } + } else { + dev = bh->b_rdev; + raid1_map (mddev, &bh->b_rdev, bh->b_size >> 9); + if (bh->b_rdev == dev) { + printk (IO_ERROR, partition_name(bh->b_dev), bh->b_blocknr); + md_done_sync(mddev, bh->b_size>>10, 0); + } else { + printk (REDIRECT_SECTOR, + partition_name(bh->b_dev), bh->b_blocknr); + q = blk_get_queue(bh->b_rdev); + generic_make_request (q, READ, bh); + } + } + + break; + case READ: + case READA: + dev 
= bh->b_rdev; + + raid1_map (mddev, &bh->b_rdev, bh->b_size >> 9); + if (bh->b_rdev == dev) { + printk (IO_ERROR, partition_name(bh->b_dev), bh->b_blocknr); + raid1_end_bh_io(r1_bh, 0); + } else { + printk (REDIRECT_SECTOR, + partition_name(bh->b_dev), bh->b_blocknr); + q = blk_get_queue(bh->b_rdev); + generic_make_request (q, r1_bh->cmd, bh); + } + break; + } + } + md_spin_unlock_irqrestore(&retry_list_lock, flags); +} +#undef IO_ERROR +#undef REDIRECT_SECTOR + +/* + * Private kernel thread to reconstruct mirrors after an unclean + * shutdown. + */ +static void raid1syncd (void *data) +{ + raid1_conf_t *conf = data; + mddev_t *mddev = conf->mddev; + + if (!conf->resync_mirrors) + return; + if (conf->resync_mirrors == 2) + return; + down(&mddev->recovery_sem); + if (md_do_sync(mddev, NULL)) { + up(&mddev->recovery_sem); + return; + } + /* + * Only if everything went Ok. + */ + conf->resync_mirrors = 0; + up(&mddev->recovery_sem); +} + +/* + * perform a "sync" on one "block" + * + * We need to make sure that no normal I/O request - particularly write + * requests - conflict with active sync requests. + * This is achieved by conceptually dividing the device space into a + * number of sections: + * DONE: 0 .. a-1 These blocks are in-sync + * ACTIVE: a.. b-1 These blocks may have active sync requests, but + * no normal IO requests + * READY: b .. c-1 These blocks have no normal IO requests - sync + * request may be happening + * PENDING: c .. d-1 These blocks may have IO requests, but no new + * ones will be added + * FUTURE: d .. end These blocks are not to be considered yet. IO may + * be happening, but not sync + * + * We keep a + * phase which flips (0 or 1) each time d moves and + * a count of: + * z = active io requests in FUTURE since d moved - marked with + * current phase + * y = active io requests in FUTURE before d moved, or PENDING - + * marked with previous phase + * x = active sync requests in READY + * w = active sync requests in ACTIVE + * v = active io requests in DONE + * + * Normally, a=b=c=d=0 and z= active io requests + * or a=b=c=d=END and v= active io requests + * Allowed changes to a,b,c,d: + * A: c==d && y==0 -> d+=window, y=z, z=0, phase=!phase + * B: y==0 -> c=d + * C: b=c, w+=x, x=0 + * D: w==0 -> a=b + * E: a==b==c==d==end -> a=b=c=d=0, z=v, v=0 + * + * At start of sync we apply A. + * When y reaches 0, we apply B then A then begin sync requests + * When sync point reaches c-1, we wait for y==0, and w==0, and + * then apply B then A then D then C. + * Finally, we apply E + * + * The sync request simply issues a "read" against a working drive + * This is marked so that on completion the raid1d thread is woken to + * issue suitable write requests + */ + +static int raid1_sync_request (mddev_t *mddev, unsigned long block_nr) +{ + raid1_conf_t *conf = mddev_to_conf(mddev); + struct mirror_info *mirror; + request_queue_t *q; + struct raid1_bh *r1_bh; + struct buffer_head *bh; + int bsize; + + spin_lock_irq(&conf->segment_lock); + if (!block_nr) { + /* initialize ...*/ + conf->start_active = 0; + conf->start_ready = 0; + conf->start_pending = 0; + conf->start_future = 0; + conf->phase = 0; + conf->window = 128; + conf->cnt_future += conf->cnt_done+conf->cnt_pending; + conf->cnt_done = conf->cnt_pending = 0; + if (conf->cnt_ready || conf->cnt_active) + MD_BUG(); + } + while ((block_nr<<1) >= conf->start_pending) { + PRINTK("wait ..
sect=%lu start_active=%d ready=%d pending=%d future=%d, cnt_done=%d active=%d ready=%d pending=%d future=%d\n", + block_nr<<1, conf->start_active, conf->start_ready, conf->start_pending, conf->start_future, + conf->cnt_done, conf->cnt_active, conf->cnt_ready, conf->cnt_pending, conf->cnt_future); + wait_event_lock_irq(conf->wait_done, + !conf->cnt_active, + conf->segment_lock); + wait_event_lock_irq(conf->wait_ready, + !conf->cnt_pending, + conf->segment_lock); + conf->start_active = conf->start_ready; + conf->start_ready = conf->start_pending; + conf->start_pending = conf->start_future; + conf->start_future = conf->start_future+conf->window; + // Note: falling off the end is not a problem + conf->phase = conf->phase ^1; + conf->cnt_active = conf->cnt_ready; + conf->cnt_ready = 0; + conf->cnt_pending = conf->cnt_future; + conf->cnt_future = 0; + wake_up(&conf->wait_done); + } + conf->cnt_ready++; + spin_unlock_irq(&conf->segment_lock); + + + /* If reconstructing, and >1 working disc, + * could dedicate one to rebuild and others to + * service read requests .. + */ + mirror = conf->mirrors+conf->last_used; + + r1_bh = raid1_kmalloc (sizeof (struct raid1_bh)); + r1_bh->master_bh = NULL; + r1_bh->mddev = mddev; + r1_bh->cmd = SPECIAL; + bh = &r1_bh->bh_req; + memset(bh, 0, sizeof(*bh)); + + bh->b_blocknr = block_nr; + bsize = 1024; + while (!(bh->b_blocknr & 1) && bsize < PAGE_SIZE + && (bh->b_blocknr+2)*(bsize>>10) < mddev->sb->size) { + bh->b_blocknr >>= 1; + bsize <<= 1; + } + bh->b_size = bsize; + bh->b_list = BUF_LOCKED; + bh->b_dev = mddev_to_kdev(mddev); + bh->b_rdev = mirror->dev; + bh->b_state = (1<<BH_Req) | (1<<BH_Mapped); + bh->b_page = raid1_gfp(); + bh->b_data = (char *) page_address(bh->b_page); + bh->b_end_io = end_sync_read; + bh->b_dev_id = (void *) r1_bh; + bh->b_rsector = block_nr<<1; + init_waitqueue_head(&bh->b_wait); + + q = blk_get_queue(bh->b_rdev); + generic_make_request(q, READ, bh); + drive_stat_acct(bh->b_rdev, READ, -bh->b_size/512, 0); + + return (bsize >> 10); +} + +static void end_sync_read(struct buffer_head *bh, int uptodate) +{ + struct raid1_bh * r1_bh = (struct raid1_bh *)(bh->b_dev_id); + + /* we have read a block, now it needs to be re-written, + * or re-read if the read failed. + * We don't do much here, just schedule handling by raid1d + */ + if (!uptodate) + md_error (bh->b_dev, bh->b_rdev); + else + set_bit(R1BH_Uptodate, &r1_bh->state); + raid1_reschedule_retry(bh); +} + +static void end_sync_write(struct buffer_head *bh, int uptodate) +{ + struct raid1_bh * r1_bh = (struct raid1_bh *)(bh->b_dev_id); + + if (!uptodate) + md_error (bh->b_dev, bh->b_rdev); + if (atomic_dec_and_test(&r1_bh->remaining)) { + int i, disks = MD_SB_DISKS; + mddev_t *mddev = r1_bh->mddev; + unsigned long sect = bh->b_rsector; + int size = bh->b_size; + + free_page((unsigned long)bh->b_data); + for ( i = 0; i < disks; i++) { + struct buffer_head *bh = r1_bh->mirror_bh[i]; + if (bh) { + // FIXME: make us a regular bcache member + kfree(bh); + } + } + kfree(r1_bh); + sync_request_done(sect, mddev_to_conf(mddev)); + md_done_sync(mddev,size>>10, uptodate); } } /* * This will catch the scenario in which one of the mirrors was * mounted as a normal device rather than as a part of a raid set. + * + * check_consistency is very personality-dependent, eg. RAID5 cannot + * do this check, it uses another method.
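Tying this back to the segment diagram above raid1_sync_request(): each pass of the while loop there performs one window advance, which in isolation reads as below (a condensed restatement over the same conf fields, not a new function in the patch):

	static void advance_window(raid1_conf_t *conf)
	{
		/* drain ACTIVE (w==0) and PENDING (y==0): transitions D and B */
		wait_event_lock_irq(conf->wait_done, !conf->cnt_active,
				    conf->segment_lock);
		wait_event_lock_irq(conf->wait_ready, !conf->cnt_pending,
				    conf->segment_lock);
		/* slide every boundary one step, open a new FUTURE window (A, C) */
		conf->start_active  = conf->start_ready;
		conf->start_ready   = conf->start_pending;
		conf->start_pending = conf->start_future;
		conf->start_future += conf->window;
		conf->phase ^= 1;	/* fresh FUTURE I/O gets the new tag */
		conf->cnt_active  = conf->cnt_ready;   conf->cnt_ready  = 0;
		conf->cnt_pending = conf->cnt_future;  conf->cnt_future = 0;
		wake_up(&conf->wait_done);	/* writers block in make_request() */
	}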
*/ -static int __check_consistency (struct md_dev *mddev, int row) +static int __check_consistency (mddev_t *mddev, int row) { - struct raid1_data *raid_conf = mddev->private; + raid1_conf_t *conf = mddev_to_conf(mddev); + int disks = MD_SB_DISKS; kdev_t dev; struct buffer_head *bh = NULL; int i, rc = 0; char *buffer = NULL; - for (i = 0; i < raid_conf->raid_disks; i++) { - if (!raid_conf->mirrors[i].operational) + for (i = 0; i < disks; i++) { + printk("(checking disk %d)\n",i); + if (!conf->mirrors[i].operational) continue; - dev = raid_conf->mirrors[i].dev; + printk("(really checking disk %d)\n",i); + dev = conf->mirrors[i].dev; set_blocksize(dev, 4096); if ((bh = bread(dev, row / 4, 4096)) == NULL) break; @@ -682,187 +1199,389 @@ static int __check_consistency (struct md_dev *mddev, int row) return rc; } -static int check_consistency (struct md_dev *mddev) +static int check_consistency (mddev_t *mddev) { - int size = mddev->sb->size; - int row; + if (__check_consistency(mddev, 0)) +/* + * we do not do this currently, as it's perfectly possible to + * have an inconsistent array when it's freshly created. Only + * newly written data has to be consistent. + */ + return 0; - for (row = 0; row < size; row += size / 8) - if (__check_consistency(mddev, row)) - return 1; return 0; } -static int raid1_run (int minor, struct md_dev *mddev) +#define INVALID_LEVEL KERN_WARNING \ +"raid1: md%d: raid level not set to mirroring (%d)\n" + +#define NO_SB KERN_ERR \ +"raid1: disabled mirror %s (couldn't access raid superblock)\n" + +#define ERRORS KERN_ERR \ +"raid1: disabled mirror %s (errors detected)\n" + +#define NOT_IN_SYNC KERN_ERR \ +"raid1: disabled mirror %s (not in sync)\n" + +#define INCONSISTENT KERN_ERR \ +"raid1: disabled mirror %s (inconsistent descriptor)\n" + +#define ALREADY_RUNNING KERN_ERR \ +"raid1: disabled mirror %s (mirror %d already operational)\n" + +#define OPERATIONAL KERN_INFO \ +"raid1: device %s operational as mirror %d\n" + +#define MEM_ERROR KERN_ERR \ +"raid1: couldn't allocate memory for md%d\n" + +#define SPARE KERN_INFO \ +"raid1: spare disk %s\n" + +#define NONE_OPERATIONAL KERN_ERR \ +"raid1: no operational mirrors for md%d\n" + +#define RUNNING_CKRAID KERN_ERR \ +"raid1: detected mirror differences -- running resync\n" + +#define ARRAY_IS_ACTIVE KERN_INFO \ +"raid1: raid set md%d active with %d out of %d mirrors\n" + +#define THREAD_ERROR KERN_ERR \ +"raid1: couldn't allocate thread for md%d\n" + +#define START_RESYNC KERN_WARNING \ +"raid1: raid set md%d not clean; reconstructing mirrors\n" + +static int raid1_run (mddev_t *mddev) { - struct raid1_data *raid_conf; - int i, j, raid_disk; - md_superblock_t *sb = mddev->sb; - md_descriptor_t *descriptor; - struct real_dev *realdev; + raid1_conf_t *conf; + int i, j, disk_idx; + struct mirror_info *disk; + mdp_super_t *sb = mddev->sb; + mdp_disk_t *descriptor; + mdk_rdev_t *rdev; + struct md_list_head *tmp; + int start_recovery = 0; MOD_INC_USE_COUNT; if (sb->level != 1) { - printk("raid1: %s: raid level not set to mirroring (%d)\n", - kdevname(MKDEV(MD_MAJOR, minor)), sb->level); - MOD_DEC_USE_COUNT; - return -EIO; - } - /**** - * copy the now verified devices into our private RAID1 bookkeeping - * area. [whatever we allocate in raid1_run(), should be freed in - * raid1_stop()] + printk(INVALID_LEVEL, mdidx(mddev), sb->level); + goto out; + } + /* + * copy the already verified devices into our private RAID1 + * bookkeeping area. 
[whatever we allocate in raid1_run(), + * should be freed in raid1_stop()] */ - while (!( /* FIXME: now we are rather fault tolerant than nice */ - mddev->private = kmalloc (sizeof (struct raid1_data), GFP_KERNEL) - ) ) - { - printk ("raid1_run(): out of memory\n"); - current->policy |= SCHED_YIELD; - schedule(); + conf = raid1_kmalloc(sizeof(raid1_conf_t)); + mddev->private = conf; + if (!conf) { + printk(MEM_ERROR, mdidx(mddev)); + goto out; } - raid_conf = mddev->private; - memset(raid_conf, 0, sizeof(*raid_conf)); - PRINTK(("raid1_run(%d) called.\n", minor)); - - for (i = 0; i < mddev->nb_dev; i++) { - realdev = &mddev->devices[i]; - if (!realdev->sb) { - printk(KERN_ERR "raid1: disabled mirror %s (couldn't access raid superblock)\n", kdevname(realdev->dev)); + ITERATE_RDEV(mddev,rdev,tmp) { + if (rdev->faulty) { + printk(ERRORS, partition_name(rdev->dev)); + } else { + if (!rdev->sb) { + MD_BUG(); + continue; + } + } + if (rdev->desc_nr == -1) { + MD_BUG(); continue; } - - /* - * This is important -- we are using the descriptor on - * the disk only to get a pointer to the descriptor on - * the main superblock, which might be more recent. - */ - descriptor = &sb->disks[realdev->sb->descriptor.number]; - if (descriptor->state & (1 << MD_FAULTY_DEVICE)) { - printk(KERN_ERR "raid1: disabled mirror %s (errors detected)\n", kdevname(realdev->dev)); + descriptor = &sb->disks[rdev->desc_nr]; + disk_idx = descriptor->raid_disk; + disk = conf->mirrors + disk_idx; + + if (disk_faulty(descriptor)) { + disk->number = descriptor->number; + disk->raid_disk = disk_idx; + disk->dev = rdev->dev; + disk->sect_limit = MAX_LINEAR_SECTORS; + disk->operational = 0; + disk->write_only = 0; + disk->spare = 0; + disk->used_slot = 1; continue; } - if (descriptor->state & (1 << MD_ACTIVE_DEVICE)) { - if (!(descriptor->state & (1 << MD_SYNC_DEVICE))) { - printk(KERN_ERR "raid1: disabled mirror %s (not in sync)\n", kdevname(realdev->dev)); + if (disk_active(descriptor)) { + if (!disk_sync(descriptor)) { + printk(NOT_IN_SYNC, + partition_name(rdev->dev)); continue; } - raid_disk = descriptor->raid_disk; - if (descriptor->number > sb->nr_disks || raid_disk > sb->raid_disks) { - printk(KERN_ERR "raid1: disabled mirror %s (inconsistent descriptor)\n", kdevname(realdev->dev)); + if ((descriptor->number > MD_SB_DISKS) || + (disk_idx > sb->raid_disks)) { + + printk(INCONSISTENT, + partition_name(rdev->dev)); continue; } - if (raid_conf->mirrors[raid_disk].operational) { - printk(KERN_ERR "raid1: disabled mirror %s (mirror %d already operational)\n", kdevname(realdev->dev), raid_disk); + if (disk->operational) { + printk(ALREADY_RUNNING, + partition_name(rdev->dev), + disk_idx); continue; } - printk(KERN_INFO "raid1: device %s operational as mirror %d\n", kdevname(realdev->dev), raid_disk); - raid_conf->mirrors[raid_disk].number = descriptor->number; - raid_conf->mirrors[raid_disk].raid_disk = raid_disk; - raid_conf->mirrors[raid_disk].dev = mddev->devices [i].dev; - raid_conf->mirrors[raid_disk].operational = 1; - raid_conf->mirrors[raid_disk].sect_limit = 128; - raid_conf->working_disks++; + printk(OPERATIONAL, partition_name(rdev->dev), + disk_idx); + disk->number = descriptor->number; + disk->raid_disk = disk_idx; + disk->dev = rdev->dev; + disk->sect_limit = MAX_LINEAR_SECTORS; + disk->operational = 1; + disk->write_only = 0; + disk->spare = 0; + disk->used_slot = 1; + conf->working_disks++; } else { /* * Must be a spare disk .. 
*/ - printk(KERN_INFO "raid1: spare disk %s\n", kdevname(realdev->dev)); - raid_disk = descriptor->raid_disk; - raid_conf->mirrors[raid_disk].number = descriptor->number; - raid_conf->mirrors[raid_disk].raid_disk = raid_disk; - raid_conf->mirrors[raid_disk].dev = mddev->devices [i].dev; - raid_conf->mirrors[raid_disk].sect_limit = 128; - - raid_conf->mirrors[raid_disk].operational = 0; - raid_conf->mirrors[raid_disk].write_only = 0; - raid_conf->mirrors[raid_disk].spare = 1; - } - } - if (!raid_conf->working_disks) { - printk(KERN_ERR "raid1: no operational mirrors for %s\n", kdevname(MKDEV(MD_MAJOR, minor))); - kfree(raid_conf); - mddev->private = NULL; - MOD_DEC_USE_COUNT; - return -EIO; - } - - raid_conf->raid_disks = sb->raid_disks; - raid_conf->mddev = mddev; - - for (j = 0; !raid_conf->mirrors[j].operational; j++); - raid_conf->last_used = j; - for (i = raid_conf->raid_disks - 1; i >= 0; i--) { - if (raid_conf->mirrors[i].operational) { - PRINTK(("raid_conf->mirrors[%d].next == %d\n", i, j)); - raid_conf->mirrors[i].next = j; + printk(SPARE, partition_name(rdev->dev)); + disk->number = descriptor->number; + disk->raid_disk = disk_idx; + disk->dev = rdev->dev; + disk->sect_limit = MAX_LINEAR_SECTORS; + disk->operational = 0; + disk->write_only = 0; + disk->spare = 1; + disk->used_slot = 1; + } + } + if (!conf->working_disks) { + printk(NONE_OPERATIONAL, mdidx(mddev)); + goto out_free_conf; + } + + conf->raid_disks = sb->raid_disks; + conf->nr_disks = sb->nr_disks; + conf->mddev = mddev; + conf->device_lock = MD_SPIN_LOCK_UNLOCKED; + + conf->segment_lock = MD_SPIN_LOCK_UNLOCKED; + init_waitqueue_head(&conf->wait_done); + init_waitqueue_head(&conf->wait_ready); + + + for (i = 0; i < MD_SB_DISKS; i++) { + + descriptor = sb->disks+i; + disk_idx = descriptor->raid_disk; + disk = conf->mirrors + disk_idx; + + if (disk_faulty(descriptor) && (disk_idx < conf->raid_disks) && + !disk->used_slot) { + + disk->number = descriptor->number; + disk->raid_disk = disk_idx; + disk->dev = MKDEV(0,0); + + disk->operational = 0; + disk->write_only = 0; + disk->spare = 0; + disk->used_slot = 1; + } + } + + /* + * find the first working one and use it as a starting point + * to read balancing. + */ + for (j = 0; !conf->mirrors[j].operational; j++) + /* nothing */; + conf->last_used = j; + + /* + * initialize the 'working disks' list. + */ + for (i = conf->raid_disks - 1; i >= 0; i--) { + if (conf->mirrors[i].operational) { + conf->mirrors[i].next = j; j = i; } } - if (check_consistency(mddev)) { - printk(KERN_ERR "raid1: detected mirror differences -- run ckraid\n"); - sb->state |= 1 << MD_SB_ERRORS; - kfree(raid_conf); - mddev->private = NULL; - MOD_DEC_USE_COUNT; - return -EIO; + if (conf->working_disks != sb->raid_disks) { + printk(KERN_ALERT "raid1: md%d, not all disks are operational -- trying to recover array\n", mdidx(mddev)); + start_recovery = 1; + } + + if (!start_recovery && (sb->state & (1 << MD_SB_CLEAN))) { + /* + * we do sanity checks even if the device says + * it's clean ... 
+ */ + if (check_consistency(mddev)) { + printk(RUNNING_CKRAID); + sb->state &= ~(1 << MD_SB_CLEAN); + } + } + + { + const char * name = "raid1d"; + + conf->thread = md_register_thread(raid1d, conf, name); + if (!conf->thread) { + printk(THREAD_ERROR, mdidx(mddev)); + goto out_free_conf; + } + } + + if (!start_recovery && !(sb->state & (1 << MD_SB_CLEAN))) { + const char * name = "raid1syncd"; + + conf->resync_thread = md_register_thread(raid1syncd, conf,name); + if (!conf->resync_thread) { + printk(THREAD_ERROR, mdidx(mddev)); + goto out_free_conf; + } + + printk(START_RESYNC, mdidx(mddev)); + conf->resync_mirrors = 1; + md_wakeup_thread(conf->resync_thread); } /* * Regenerate the "device is in sync with the raid set" bit for * each device. */ - for (i = 0; i < sb->nr_disks ; i++) { - sb->disks[i].state &= ~(1 << MD_SYNC_DEVICE); + for (i = 0; i < MD_SB_DISKS; i++) { + mark_disk_nonsync(sb->disks+i); for (j = 0; j < sb->raid_disks; j++) { - if (!raid_conf->mirrors[j].operational) + if (!conf->mirrors[j].operational) continue; - if (sb->disks[i].number == raid_conf->mirrors[j].number) - sb->disks[i].state |= 1 << MD_SYNC_DEVICE; + if (sb->disks[i].number == conf->mirrors[j].number) + mark_disk_sync(sb->disks+i); } } - sb->active_disks = raid_conf->working_disks; + sb->active_disks = conf->working_disks; - printk("raid1: raid set %s active with %d out of %d mirrors\n", kdevname(MKDEV(MD_MAJOR, minor)), sb->active_disks, sb->raid_disks); - /* Ok, everything is just fine now */ - return (0); + if (start_recovery) + md_recover_arrays(); + + + printk(ARRAY_IS_ACTIVE, mdidx(mddev), sb->active_disks, sb->raid_disks); + /* + * Ok, everything is just fine now + */ + return 0; + +out_free_conf: + kfree(conf); + mddev->private = NULL; +out: + MOD_DEC_USE_COUNT; + return -EIO; +} + +#undef INVALID_LEVEL +#undef NO_SB +#undef ERRORS +#undef NOT_IN_SYNC +#undef INCONSISTENT +#undef ALREADY_RUNNING +#undef OPERATIONAL +#undef SPARE +#undef NONE_OPERATIONAL +#undef RUNNING_CKRAID +#undef ARRAY_IS_ACTIVE + +static int raid1_stop_resync (mddev_t *mddev) +{ + raid1_conf_t *conf = mddev_to_conf(mddev); + + if (conf->resync_thread) { + if (conf->resync_mirrors) { + conf->resync_mirrors = 2; + md_interrupt_thread(conf->resync_thread); + + /* this is really needed when recovery stops too... 
*/ + spin_lock_irq(&conf->segment_lock); + wait_event_lock_irq(conf->wait_done, !conf->cnt_active, conf->segment_lock); + conf->start_active = conf->start_ready; + conf->start_ready = conf->start_pending; + conf->cnt_active = conf->cnt_ready; + conf->cnt_ready = 0; + wait_event_lock_irq(conf->wait_done, !conf->cnt_active, conf->segment_lock); + conf->start_active = conf->start_ready; + conf->cnt_ready = 0; + wait_event_lock_irq(conf->wait_ready, !conf->cnt_pending, conf->segment_lock); + conf->start_active =conf->start_ready = conf->start_pending = conf->start_future; + conf->start_future = mddev->sb->size+1; + conf->cnt_pending = conf->cnt_future; + conf->cnt_future = 0; + conf->phase = conf->phase ^1; + wait_event_lock_irq(conf->wait_ready, !conf->cnt_pending, conf->segment_lock); + conf->start_active = conf->start_ready = conf->start_pending = conf->start_future = 0; + conf->phase = 0; + conf->cnt_done = conf->cnt_future; + conf->cnt_future = 0; + wake_up(&conf->wait_done); + + printk(KERN_INFO "raid1: mirror resync was not fully finished, restarting next time.\n"); + return 1; + } + return 0; + } + return 0; +} + +static int raid1_restart_resync (mddev_t *mddev) +{ + raid1_conf_t *conf = mddev_to_conf(mddev); + + if (conf->resync_mirrors) { + if (!conf->resync_thread) { + MD_BUG(); + return 0; + } + conf->resync_mirrors = 1; + md_wakeup_thread(conf->resync_thread); + return 1; + } + return 0; } -static int raid1_stop (int minor, struct md_dev *mddev) +static int raid1_stop (mddev_t *mddev) { - struct raid1_data *raid_conf = (struct raid1_data *) mddev->private; + raid1_conf_t *conf = mddev_to_conf(mddev); - kfree (raid_conf); + md_unregister_thread(conf->thread); + if (conf->resync_thread) + md_unregister_thread(conf->resync_thread); + kfree(conf); mddev->private = NULL; MOD_DEC_USE_COUNT; return 0; } -static struct md_personality raid1_personality= +static mdk_personality_t raid1_personality= { "raid1", - raid1_map, raid1_make_request, raid1_end_request, raid1_run, raid1_stop, raid1_status, - NULL, /* no ioctls */ 0, raid1_error, - raid1_hot_add_disk, - /* raid1_hot_remove_drive */ NULL, - raid1_mark_spare + raid1_diskop, + raid1_stop_resync, + raid1_restart_resync, + raid1_sync_request }; int raid1_init (void) { - if ((raid1_thread = md_register_thread(raid1d, NULL)) == NULL) - return -EBUSY; return register_md_personality (RAID1, &raid1_personality); } @@ -874,7 +1593,6 @@ int init_module (void) void cleanup_module (void) { - md_unregister_thread (raid1_thread); unregister_md_personality (RAID1); } #endif
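For orientation, a field-by-field reading of the positional initializer above; the member names are inferred from the md core of this kernel generation (and from the removed "no ioctls" comment), so treat them as annotation rather than a quotation of the header:

	static mdk_personality_t raid1_personality =
	{
		"raid1",		/* name */
		raid1_make_request,	/* queue one mirrored read/write */
		raid1_end_request,	/* per-bh completion */
		raid1_run,		/* assemble and activate the array */
		raid1_stop,		/* tear down, stop threads, free conf */
		raid1_status,		/* one-line /proc/mdstat contribution */
		0,			/* ioctl hook: none */
		raid1_error,		/* a member device failed */
		raid1_diskop,		/* spare activation, hot add/remove */
		raid1_stop_resync,	/* pause resync across array stop */
		raid1_restart_resync,	/* resume an interrupted resync */
		raid1_sync_request	/* issue one resync read */
	};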