raid1.c
- /*
- * raid1.c : Multiple Devices driver for Linux
- *
- * Copyright (C) 1999, 2000 Ingo Molnar, Red Hat
- *
- * Copyright (C) 1996, 1997, 1998 Ingo Molnar, Miguel de Icaza, Gadi Oxman
- *
- * RAID-1 management functions.
- *
- * Better read-balancing code written by Mika Kuoppala <miku@iki.fi>, 2000
- *
- * Fixes to reconstruction by Jakob Østergaard <jakob@ostenfeld.dk>
- * Various fixes by Neil Brown <neilb@cse.unsw.edu.au>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2, or (at your option)
- * any later version.
- *
- * You should have received a copy of the GNU General Public License
- * (for example /usr/src/linux/COPYING); if not, write to the Free
- * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- */
- #include <linux/module.h>
- #include <linux/config.h>
- #include <linux/slab.h>
- #include <linux/raid/raid1.h>
- #include <asm/atomic.h>
- #define MAJOR_NR MD_MAJOR
- #define MD_DRIVER
- #define MD_PERSONALITY
- #define MAX_WORK_PER_DISK 128
- #define NR_RESERVED_BUFS 32
- /*
- * The following can be used to debug the driver
- */
- #define RAID1_DEBUG 0
- #if RAID1_DEBUG
- #define PRINTK(x...) printk(x)
- #define inline
- #define __inline__
- #else
- #define PRINTK(x...) do { } while (0)
- #endif
- static mdk_personality_t raid1_personality;
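- /* Failed reads are queued on this list (under retry_list_lock) and
-  * retried against another mirror by the raid1d thread. */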
- static md_spinlock_t retry_list_lock = MD_SPIN_LOCK_UNLOCKED;
- struct raid1_bh *raid1_retry_list = NULL, **raid1_retry_tail;
- static struct buffer_head *raid1_alloc_bh(raid1_conf_t *conf, int cnt)
- {
- /* return a linked list of "cnt" struct buffer_heads.
- * don't take any off the free list unless we know we can
- * get all we need, otherwise we could deadlock
- */
- struct buffer_head *bh=NULL;
- while(cnt) {
- struct buffer_head *t;
- md_spin_lock_irq(&conf->device_lock);
- if (!conf->freebh_blocked && conf->freebh_cnt >= cnt)
- while (cnt) {
- t = conf->freebh;
- conf->freebh = t->b_next;
- t->b_next = bh;
- bh = t;
- t->b_state = 0;
- conf->freebh_cnt--;
- cnt--;
- }
- md_spin_unlock_irq(&conf->device_lock);
- if (cnt == 0)
- break;
- t = kmem_cache_alloc(bh_cachep, SLAB_NOIO);
- if (t) {
- t->b_next = bh;
- bh = t;
- cnt--;
- } else {
- PRINTK("raid1: waiting for %d bhn", cnt);
- conf->freebh_blocked = 1;
- wait_disk_event(conf->wait_buffer,
- !conf->freebh_blocked ||
- conf->freebh_cnt > conf->raid_disks * NR_RESERVED_BUFS/2);
- conf->freebh_blocked = 0;
- }
- }
- return bh;
- }
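- /*
-  * Put a chain of buffer_heads back: heads pre-allocated by
-  * raid1_grow_bh() (recognised by b_pprev, which raid1_grow_bh sets)
-  * go back on the free list; heads allocated on demand are freed.
-  */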
- static inline void raid1_free_bh(raid1_conf_t *conf, struct buffer_head *bh)
- {
- unsigned long flags;
- spin_lock_irqsave(&conf->device_lock, flags);
- while (bh) {
- struct buffer_head *t = bh;
- bh=bh->b_next;
- if (t->b_pprev == NULL)
- kmem_cache_free(bh_cachep, t);
- else {
- t->b_next= conf->freebh;
- conf->freebh = t;
- conf->freebh_cnt++;
- }
- }
- spin_unlock_irqrestore(&conf->device_lock, flags);
- wake_up(&conf->wait_buffer);
- }
- static int raid1_grow_bh(raid1_conf_t *conf, int cnt)
- {
- /* allocate cnt buffer_heads, possibly less if kmalloc fails */
- int i = 0;
- while (i < cnt) {
- struct buffer_head *bh;
- bh = kmem_cache_alloc(bh_cachep, SLAB_KERNEL);
- if (!bh) break;
- md_spin_lock_irq(&conf->device_lock);
- bh->b_pprev = &conf->freebh;
- bh->b_next = conf->freebh;
- conf->freebh = bh;
- conf->freebh_cnt++;
- md_spin_unlock_irq(&conf->device_lock);
- i++;
- }
- return i;
- }
- static void raid1_shrink_bh(raid1_conf_t *conf)
- {
- /* discard all buffer_heads */
- md_spin_lock_irq(&conf->device_lock);
- while (conf->freebh) {
- struct buffer_head *bh = conf->freebh;
- conf->freebh = bh->b_next;
- kmem_cache_free(bh_cachep, bh);
- conf->freebh_cnt--;
- }
- md_spin_unlock_irq(&conf->device_lock);
- }
-
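- /*
-  * Get a raid1_bh: try the pre-allocated pool first, then kmalloc();
-  * as a last resort block until another request returns one to the pool.
-  */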
- static struct raid1_bh *raid1_alloc_r1bh(raid1_conf_t *conf)
- {
- struct raid1_bh *r1_bh = NULL;
- do {
- md_spin_lock_irq(&conf->device_lock);
- if (!conf->freer1_blocked && conf->freer1) {
- r1_bh = conf->freer1;
- conf->freer1 = r1_bh->next_r1;
- conf->freer1_cnt--;
- r1_bh->next_r1 = NULL;
- r1_bh->state = (1 << R1BH_PreAlloc);
- r1_bh->bh_req.b_state = 0;
- }
- md_spin_unlock_irq(&conf->device_lock);
- if (r1_bh)
- return r1_bh;
- r1_bh = (struct raid1_bh *) kmalloc(sizeof(struct raid1_bh), GFP_NOIO);
- if (r1_bh) {
- memset(r1_bh, 0, sizeof(*r1_bh));
- return r1_bh;
- }
- conf->freer1_blocked = 1;
- wait_disk_event(conf->wait_buffer,
- !conf->freer1_blocked ||
- conf->freer1_cnt > NR_RESERVED_BUFS/2
- );
- conf->freer1_blocked = 0;
- } while (1);
- }
- static inline void raid1_free_r1bh(struct raid1_bh *r1_bh)
- {
- struct buffer_head *bh = r1_bh->mirror_bh_list;
- raid1_conf_t *conf = mddev_to_conf(r1_bh->mddev);
- r1_bh->mirror_bh_list = NULL;
- if (test_bit(R1BH_PreAlloc, &r1_bh->state)) {
- unsigned long flags;
- spin_lock_irqsave(&conf->device_lock, flags);
- r1_bh->next_r1 = conf->freer1;
- conf->freer1 = r1_bh;
- conf->freer1_cnt++;
- spin_unlock_irqrestore(&conf->device_lock, flags);
- /* don't need to wakeup wait_buffer because
- * raid1_free_bh below will do that
- */
- } else {
- kfree(r1_bh);
- }
- raid1_free_bh(conf, bh);
- }
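- /*
-  * Pre-allocate cnt raid1_bh structures; each is marked R1BH_PreAlloc
-  * so that raid1_free_r1bh() parks it on the free list instead of
-  * kfree()ing it. Returns the number actually allocated.
-  */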
- static int raid1_grow_r1bh (raid1_conf_t *conf, int cnt)
- {
- int i = 0;
- while (i < cnt) {
- struct raid1_bh *r1_bh;
- r1_bh = (struct raid1_bh*)kmalloc(sizeof(*r1_bh), GFP_KERNEL);
- if (!r1_bh)
- break;
- memset(r1_bh, 0, sizeof(*r1_bh));
- set_bit(R1BH_PreAlloc, &r1_bh->state);
- r1_bh->mddev = conf->mddev;
- raid1_free_r1bh(r1_bh);
- i++;
- }
- return i;
- }
- static void raid1_shrink_r1bh(raid1_conf_t *conf)
- {
- md_spin_lock_irq(&conf->device_lock);
- while (conf->freer1) {
- struct raid1_bh *r1_bh = conf->freer1;
- conf->freer1 = r1_bh->next_r1;
- conf->freer1_cnt--;
- kfree(r1_bh);
- }
- md_spin_unlock_irq(&conf->device_lock);
- }
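- /*
-  * Return a resync raid1_bh (these keep their page attached) to the
-  * freebuf list; the mirror buffer_heads are released separately.
-  */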
- static inline void raid1_free_buf(struct raid1_bh *r1_bh)
- {
- unsigned long flags;
- struct buffer_head *bh = r1_bh->mirror_bh_list;
- raid1_conf_t *conf = mddev_to_conf(r1_bh->mddev);
- r1_bh->mirror_bh_list = NULL;
-
- spin_lock_irqsave(&conf->device_lock, flags);
- r1_bh->next_r1 = conf->freebuf;
- conf->freebuf = r1_bh;
- spin_unlock_irqrestore(&conf->device_lock, flags);
- raid1_free_bh(conf, bh);
- }
- static struct raid1_bh *raid1_alloc_buf(raid1_conf_t *conf)
- {
- struct raid1_bh *r1_bh;
- md_spin_lock_irq(&conf->device_lock);
- wait_event_lock_irq(conf->wait_buffer, conf->freebuf, conf->device_lock);
- r1_bh = conf->freebuf;
- conf->freebuf = r1_bh->next_r1;
- r1_bh->next_r1= NULL;
- md_spin_unlock_irq(&conf->device_lock);
- return r1_bh;
- }
- static int raid1_grow_buffers (raid1_conf_t *conf, int cnt)
- {
- int i = 0;
- struct raid1_bh *head = NULL, **tail;
- tail = &head;
- while (i < cnt) {
- struct raid1_bh *r1_bh;
- struct page *page;
- page = alloc_page(GFP_KERNEL);
- if (!page)
- break;
- r1_bh = (struct raid1_bh *) kmalloc(sizeof(*r1_bh), GFP_KERNEL);
- if (!r1_bh) {
- __free_page(page);
- break;
- }
- memset(r1_bh, 0, sizeof(*r1_bh));
- r1_bh->bh_req.b_page = page;
- r1_bh->bh_req.b_data = page_address(page);
- *tail = r1_bh;
- r1_bh->next_r1 = NULL;
- tail = & r1_bh->next_r1;
- i++;
- }
- /* this lock probably isn't needed, as at the time when
- * we are allocating buffers, nobody else will be touching the
- * freebuf list. But it doesn't hurt....
- */
- md_spin_lock_irq(&conf->device_lock);
- *tail = conf->freebuf;
- conf->freebuf = head;
- md_spin_unlock_irq(&conf->device_lock);
- return i;
- }
- static void raid1_shrink_buffers (raid1_conf_t *conf)
- {
- struct raid1_bh *head;
- md_spin_lock_irq(&conf->device_lock);
- head = conf->freebuf;
- conf->freebuf = NULL;
- md_spin_unlock_irq(&conf->device_lock);
- while (head) {
- struct raid1_bh *r1_bh = head;
- head = r1_bh->next_r1;
- __free_page(r1_bh->bh_req.b_page);
- kfree(r1_bh);
- }
- }
- static int raid1_map (mddev_t *mddev, kdev_t *rdev)
- {
- raid1_conf_t *conf = mddev_to_conf(mddev);
- int i, disks = MD_SB_DISKS;
- /*
- * Later we do read balancing on the read side
- * now we use the first available disk.
- */
- for (i = 0; i < disks; i++) {
- if (conf->mirrors[i].operational) {
- *rdev = conf->mirrors[i].dev;
- return (0);
- }
- }
- printk (KERN_ERR "raid1_map(): huh, no more operational devices?\n");
- return (-1);
- }
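- /*
-  * Queue a failed request on the global retry list and wake the
-  * raid1d thread to deal with it.
-  */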
- static void raid1_reschedule_retry (struct raid1_bh *r1_bh)
- {
- unsigned long flags;
- mddev_t *mddev = r1_bh->mddev;
- raid1_conf_t *conf = mddev_to_conf(mddev);
- md_spin_lock_irqsave(&retry_list_lock, flags);
- if (raid1_retry_list == NULL)
- raid1_retry_tail = &raid1_retry_list;
- *raid1_retry_tail = r1_bh;
- raid1_retry_tail = &r1_bh->next_r1;
- r1_bh->next_r1 = NULL;
- md_spin_unlock_irqrestore(&retry_list_lock, flags);
- md_wakeup_thread(conf->thread);
- }
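- /*
-  * A normal IO request has completed: debit whichever segment counter
-  * it was charged against (DONE, FUTURE in the current phase, or
-  * PENDING otherwise), waking wait_ready when the last pending one drains.
-  */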
- static void inline io_request_done(unsigned long sector, raid1_conf_t *conf, int phase)
- {
- unsigned long flags;
- spin_lock_irqsave(&conf->segment_lock, flags);
- if (sector < conf->start_active)
- conf->cnt_done--;
- else if (sector >= conf->start_future && conf->phase == phase)
- conf->cnt_future--;
- else if (!--conf->cnt_pending)
- wake_up(&conf->wait_ready);
- spin_unlock_irqrestore(&conf->segment_lock, flags);
- }
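- /*
-  * A resync request has completed: debit cnt_ready or cnt_active; when
-  * the ACTIVE segment drains, advance start_active and wake wait_done.
-  */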
- static void inline sync_request_done (unsigned long sector, raid1_conf_t *conf)
- {
- unsigned long flags;
- spin_lock_irqsave(&conf->segment_lock, flags);
- if (sector >= conf->start_ready)
- --conf->cnt_ready;
- else if (sector >= conf->start_active) {
- if (!--conf->cnt_active) {
- conf->start_active = conf->start_ready;
- wake_up(&conf->wait_done);
- }
- }
- spin_unlock_irqrestore(&conf->segment_lock, flags);
- }
- /*
- * raid1_end_bh_io() is called when we have finished servicing a mirrored
- * operation and are ready to return a success/failure code to the buffer
- * cache layer.
- */
- static void raid1_end_bh_io (struct raid1_bh *r1_bh, int uptodate)
- {
- struct buffer_head *bh = r1_bh->master_bh;
- io_request_done(bh->b_rsector, mddev_to_conf(r1_bh->mddev),
- test_bit(R1BH_SyncPhase, &r1_bh->state));
- bh->b_end_io(bh, uptodate);
- raid1_free_r1bh(r1_bh);
- }
- void raid1_end_request (struct buffer_head *bh, int uptodate)
- {
- struct raid1_bh * r1_bh = (struct raid1_bh *)(bh->b_private);
- /*
- * this branch is our 'one mirror IO has finished' event handler:
- */
- if (!uptodate)
- md_error (r1_bh->mddev, bh->b_dev);
- else
- /*
- * Set R1BH_Uptodate in our master buffer_head, so that
- * we will return a good error code to the higher
- * levels even if IO on some other mirrored buffer fails.
- *
- * The 'master' represents the complex operation to
- * user-side. So if something waits for IO, then it will
- * wait for the 'master' buffer_head.
- */
- set_bit (R1BH_Uptodate, &r1_bh->state);
- /*
- * We split up the read and write side, imho they are
- * conceptually different.
- */
- if ( (r1_bh->cmd == READ) || (r1_bh->cmd == READA) ) {
- /*
- * we have only one buffer_head on the read side
- */
-
- if (uptodate) {
- raid1_end_bh_io(r1_bh, uptodate);
- return;
- }
- /*
- * oops, read error:
- */
- printk(KERN_ERR "raid1: %s: rescheduling block %lun",
- partition_name(bh->b_dev), bh->b_blocknr);
- raid1_reschedule_retry(r1_bh);
- return;
- }
- /*
- * WRITE:
- *
- * Let's see if all mirrored write operations have finished
- * already.
- */
- if (atomic_dec_and_test(&r1_bh->remaining))
- raid1_end_bh_io(r1_bh, test_bit(R1BH_Uptodate, &r1_bh->state));
- }
- /*
- * This routine returns the disk from which the requested read should
- * be done. It keeps track of the last read position for every disk
- * in the array; when a new read request arrives, the disk whose last
- * position is nearest to the request is chosen.
- *
- * TODO: if there are 2 mirrors on the same 2 devices, performance
- * degrades dramatically because position is tracked per mirror, not
- * per device.
- * This should be changed to be device based. Also atomic sequential
- * reads should be somehow balanced.
- */
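- /*
-  * Illustrative example (numbers invented): a read at sector 1000,
-  * with mirror head positions at 900 and 4000, goes to the first
-  * mirror (distance 100 vs 3000); unless that mirror has already
-  * served sect_limit sectors in a row, in which case the next
-  * operational, non-write-only mirror is picked to spread the load.
-  */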
- static int raid1_read_balance (raid1_conf_t *conf, struct buffer_head *bh)
- {
- int new_disk = conf->last_used;
- const int sectors = bh->b_size >> 9;
- const unsigned long this_sector = bh->b_rsector;
- int disk = new_disk;
- unsigned long new_distance;
- unsigned long current_distance;
-
- /*
- * Check if it is sane at all to balance
- */
-
- if (conf->resync_mirrors)
- goto rb_out;
-
- /* make sure that disk is operational */
- while( !conf->mirrors[new_disk].operational) {
- if (new_disk <= 0) new_disk = conf->raid_disks;
- new_disk--;
- if (new_disk == disk) {
- /*
- * This means no working disk was found
- * Nothing much to do, let's not change anything
- * and hope for the best...
- */
-
- new_disk = conf->last_used;
- goto rb_out;
- }
- }
- disk = new_disk;
- /* now disk == new_disk == starting point for search */
-
- /*
- * Don't touch anything for sequential reads.
- */
- if (this_sector == conf->mirrors[new_disk].head_position)
- goto rb_out;
-
- /*
- * If reads have been done only on a single disk
- * for a while, let's give another disk a chance.
- * This is for kicking those idling disks so that
- * they can find work near some hotspot.
- */
-
- if (conf->sect_count >= conf->mirrors[new_disk].sect_limit) {
- conf->sect_count = 0;
- #if defined(CONFIG_SPARC64) && (__GNUC__ == 2) && (__GNUC_MINOR__ == 92)
- /* Work around a compiler bug in egcs-2.92.11 19980921 */
- new_disk = *(volatile int *)&new_disk;
- #endif
- do {
- if (new_disk<=0)
- new_disk = conf->raid_disks;
- new_disk--;
- if (new_disk == disk)
- break;
- } while ((conf->mirrors[new_disk].write_only) ||
- (!conf->mirrors[new_disk].operational));
- goto rb_out;
- }
-
- current_distance = abs(this_sector -
- conf->mirrors[disk].head_position);
-
- /* Find the disk which is closest */
-
- do {
- if (disk <= 0)
- disk = conf->raid_disks;
- disk--;
-
- if ((conf->mirrors[disk].write_only) ||
- (!conf->mirrors[disk].operational))
- continue;
-
- new_distance = abs(this_sector -
- conf->mirrors[disk].head_position);
-
- if (new_distance < current_distance) {
- conf->sect_count = 0;
- current_distance = new_distance;
- new_disk = disk;
- }
- } while (disk != conf->last_used);
- rb_out:
- conf->mirrors[new_disk].head_position = this_sector + sectors;
- conf->last_used = new_disk;
- conf->sect_count += sectors;
- return new_disk;
- }
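- /*
-  * Main request entry point. Reads go to a single mirror chosen by
-  * raid1_read_balance(); writes are cloned to every operational mirror
-  * and only complete once all the clones have finished.
-  */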
- static int raid1_make_request (mddev_t *mddev, int rw,
- struct buffer_head * bh)
- {
- raid1_conf_t *conf = mddev_to_conf(mddev);
- struct buffer_head *bh_req, *bhl;
- struct raid1_bh * r1_bh;
- int disks = MD_SB_DISKS;
- int i, sum_bhs = 0;
- struct mirror_info *mirror;
- if (!buffer_locked(bh))
- BUG();
-
- /*
- * make_request() can abort the operation when READA is being
- * used and no empty request is available.
- *
- * Currently, just replace the command with READ/WRITE.
- */
- if (rw == READA)
- rw = READ;
- r1_bh = raid1_alloc_r1bh (conf);
- spin_lock_irq(&conf->segment_lock);
- wait_event_lock_irq(conf->wait_done,
- bh->b_rsector < conf->start_active ||
- bh->b_rsector >= conf->start_future,
- conf->segment_lock);
- if (bh->b_rsector < conf->start_active)
- conf->cnt_done++;
- else {
- conf->cnt_future++;
- if (conf->phase)
- set_bit(R1BH_SyncPhase, &r1_bh->state);
- }
- spin_unlock_irq(&conf->segment_lock);
-
- /*
- * I think the read and write branches should be separated completely,
- * since we want to do read balancing on the read side for example.
- * Alternative implementations? :) --mingo
- */
- r1_bh->master_bh = bh;
- r1_bh->mddev = mddev;
- r1_bh->cmd = rw;
- if (rw == READ) {
- /*
- * read balancing logic:
- */
- mirror = conf->mirrors + raid1_read_balance(conf, bh);
- bh_req = &r1_bh->bh_req;
- memcpy(bh_req, bh, sizeof(*bh));
- bh_req->b_blocknr = bh->b_rsector;
- bh_req->b_dev = mirror->dev;
- bh_req->b_rdev = mirror->dev;
- /* bh_req->b_rsector = bh->n_rsector; */
- bh_req->b_end_io = raid1_end_request;
- bh_req->b_private = r1_bh;
- generic_make_request (rw, bh_req);
- return 0;
- }
- /*
- * WRITE:
- */
- bhl = raid1_alloc_bh(conf, conf->raid_disks);
- for (i = 0; i < disks; i++) {
- struct buffer_head *mbh;
- if (!conf->mirrors[i].operational)
- continue;
-
- /*
- * We should use a private pool (size depending on NR_REQUEST),
- * to avoid writes filling up the memory with bhs
- *
- * Such pools are much faster than kmalloc anyways (so we waste
- * almost nothing by not using the master bh when writing and
- * win a lot of cleanness) but for now we are cool enough. --mingo
- *
- * It's safe to sleep here, buffer heads cannot be used in a shared
- * manner in the write branch. Look how we lock the buffer at the
- * beginning of this function to grok the difference ;)
- */
- mbh = bhl;
- if (mbh == NULL) {
- MD_BUG();
- break;
- }
- bhl = mbh->b_next;
- mbh->b_next = NULL;
- mbh->b_this_page = (struct buffer_head *)1;
-
- /*
- * prepare mirrored mbh (fields ordered for max mem throughput):
- */
- mbh->b_blocknr = bh->b_rsector;
- mbh->b_dev = conf->mirrors[i].dev;
- mbh->b_rdev = conf->mirrors[i].dev;
- mbh->b_rsector = bh->b_rsector;
- mbh->b_state = (1<<BH_Req) | (1<<BH_Dirty) |
- (1<<BH_Mapped) | (1<<BH_Lock);
- atomic_set(&mbh->b_count, 1);
- mbh->b_size = bh->b_size;
- mbh->b_page = bh->b_page;
- mbh->b_data = bh->b_data;
- mbh->b_list = BUF_LOCKED;
- mbh->b_end_io = raid1_end_request;
- mbh->b_private = r1_bh;
- mbh->b_next = r1_bh->mirror_bh_list;
- r1_bh->mirror_bh_list = mbh;
- sum_bhs++;
- }
- if (bhl) raid1_free_bh(conf,bhl);
- if (!sum_bhs) {
- /* Gag - all mirrors non-operational.. */
- raid1_end_bh_io(r1_bh, 0);
- return 0;
- }
- md_atomic_set(&r1_bh->remaining, sum_bhs);
- /*
- * We have to be a bit careful about the semaphore above, that's
- * why we start the requests separately. Since kmalloc() could
- * fail or sleep, and make_request() can sleep too, this is the
- * safer solution. Imagine end_request decreasing the semaphore
- * before we could have set it up ... We could play tricks with
- * the semaphore (presetting it and correcting at the end if
- * sum_bhs is not 'n'), but we would have to do end_request by hand
- * if all requests finished before we had a chance to set up the
- * semaphore correctly ... lots of races.
- */
- bh = r1_bh->mirror_bh_list;
- while(bh) {
- struct buffer_head *bh2 = bh;
- bh = bh->b_next;
- generic_make_request(rw, bh2);
- }
- return (0);
- }
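- /*
-  * Report array health for /proc/mdstat, e.g. " [2/2] [UU]".
-  */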
- static int raid1_status (char *page, mddev_t *mddev)
- {
- raid1_conf_t *conf = mddev_to_conf(mddev);
- int sz = 0, i;
-
- sz += sprintf (page+sz, " [%d/%d] [", conf->raid_disks,
- conf->working_disks);
- for (i = 0; i < conf->raid_disks; i++)
- sz += sprintf (page+sz, "%s",
- conf->mirrors[i].operational ? "U" : "_");
- sz += sprintf (page+sz, "]");
- return sz;
- }
- #define LAST_DISK KERN_ALERT \
- "raid1: only one disk left and IO error.\n"
- #define NO_SPARE_DISK KERN_ALERT \
- "raid1: no spare disk left, degrading mirror level by one.\n"
- #define DISK_FAILED KERN_ALERT \
- "raid1: Disk failure on %s, disabling device.\n" \
- "	Operation continuing on %d devices\n"
- #define START_SYNCING KERN_ALERT \
- "raid1: start syncing spare disk.\n"
- #define ALREADY_SYNCING KERN_INFO \
- "raid1: syncing already in progress.\n"
- static void mark_disk_bad (mddev_t *mddev, int failed)
- {
- raid1_conf_t *conf = mddev_to_conf(mddev);
- struct mirror_info *mirror = conf->mirrors+failed;
- mdp_super_t *sb = mddev->sb;
- mirror->operational = 0;
- mark_disk_faulty(sb->disks+mirror->number);
- mark_disk_nonsync(sb->disks+mirror->number);
- mark_disk_inactive(sb->disks+mirror->number);
- if (!mirror->write_only)
- sb->active_disks--;
- sb->working_disks--;
- sb->failed_disks++;
- mddev->sb_dirty = 1;
- md_wakeup_thread(conf->thread);
- if (!mirror->write_only)
- conf->working_disks--;
- printk (DISK_FAILED, partition_name (mirror->dev),
- conf->working_disks);
- }
- static int raid1_error (mddev_t *mddev, kdev_t dev)
- {
- raid1_conf_t *conf = mddev_to_conf(mddev);
- struct mirror_info * mirrors = conf->mirrors;
- int disks = MD_SB_DISKS;
- int i;
- /* Find the drive.
- * If it is not operational, then we have already marked it as dead;
- * else if it is the last working disk, ignore the error and let the
- * next level up know;
- * else mark the drive as failed.
- */
- for (i = 0; i < disks; i++)
- if (mirrors[i].dev==dev && mirrors[i].operational)
- break;
- if (i == disks)
- return 0;
- if (i < conf->raid_disks && conf->working_disks == 1) {
- /* Don't fail the drive, act as though we were just a
- * normal single drive
- */
- return 1;
- }
- mark_disk_bad(mddev, i);
- return 0;
- }
- #undef LAST_DISK
- #undef NO_SPARE_DISK
- #undef DISK_FAILED
- #undef START_SYNCING
- static void print_raid1_conf (raid1_conf_t *conf)
- {
- int i;
- struct mirror_info *tmp;
- printk("RAID1 conf printout:n");
- if (!conf) {
- printk("(conf==NULL)n");
- return;
- }
- printk(" --- wd:%d rd:%d nd:%dn", conf->working_disks,
- conf->raid_disks, conf->nr_disks);
- for (i = 0; i < MD_SB_DISKS; i++) {
- tmp = conf->mirrors + i;
- printk(" disk %d, s:%d, o:%d, n:%d rd:%d us:%d dev:%sn",
- i, tmp->spare,tmp->operational,
- tmp->number,tmp->raid_disk,tmp->used_slot,
- partition_name(tmp->dev));
- }
- }
- static void close_sync(raid1_conf_t *conf)
- {
- mddev_t *mddev = conf->mddev;
- /* If reconstruction was interrupted, we need to close the "active" and "pending"
- * holes.
- * we know that there are no active rebuild requests, so cnt_active == cnt_ready == 0
- */
- /* this is really needed when recovery stops too... */
- spin_lock_irq(&conf->segment_lock);
- conf->start_active = conf->start_pending;
- conf->start_ready = conf->start_pending;
- wait_event_lock_irq(conf->wait_ready, !conf->cnt_pending, conf->segment_lock);
- conf->start_active =conf->start_ready = conf->start_pending = conf->start_future;
- conf->start_future = (mddev->sb->size<<1)+1;
- conf->cnt_pending = conf->cnt_future;
- conf->cnt_future = 0;
- conf->phase = conf->phase ^1;
- wait_event_lock_irq(conf->wait_ready, !conf->cnt_pending, conf->segment_lock);
- conf->start_active = conf->start_ready = conf->start_pending = conf->start_future = 0;
- conf->phase = 0;
- conf->cnt_future = conf->cnt_done;
- conf->cnt_done = 0;
- spin_unlock_irq(&conf->segment_lock);
- wake_up(&conf->wait_done);
- }
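- /*
-  * Carry out a disk state transition requested by the md core: spare
-  * activation/deactivation/write-enable, or hot add/remove. All the
-  * bookkeeping is done under device_lock.
-  */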
- static int raid1_diskop(mddev_t *mddev, mdp_disk_t **d, int state)
- {
- int err = 0;
- int i, failed_disk=-1, spare_disk=-1, removed_disk=-1, added_disk=-1;
- raid1_conf_t *conf = mddev->private;
- struct mirror_info *tmp, *sdisk, *fdisk, *rdisk, *adisk;
- mdp_super_t *sb = mddev->sb;
- mdp_disk_t *failed_desc, *spare_desc, *added_desc;
- mdk_rdev_t *spare_rdev, *failed_rdev;
- print_raid1_conf(conf);
- switch (state) {
- case DISKOP_SPARE_ACTIVE:
- case DISKOP_SPARE_INACTIVE:
- /* need to wait for pending sync io before locking device */
- close_sync(conf);
- }
- md_spin_lock_irq(&conf->device_lock);
- /*
- * find the disk ...
- */
- switch (state) {
- case DISKOP_SPARE_ACTIVE:
- /*
- * Find the failed disk within the RAID1 configuration ...
- * (this can only be in the first conf->working_disks part)
- */
- for (i = 0; i < conf->raid_disks; i++) {
- tmp = conf->mirrors + i;
- if ((!tmp->operational && !tmp->spare) ||
- !tmp->used_slot) {
- failed_disk = i;
- break;
- }
- }
- /*
- * When we activate a spare disk we _must_ have a disk in
- * the lower (active) part of the array to replace.
- */
- if ((failed_disk == -1) || (failed_disk >= conf->raid_disks)) {
- MD_BUG();
- err = 1;
- goto abort;
- }
- /* fall through */
- case DISKOP_SPARE_WRITE:
- case DISKOP_SPARE_INACTIVE:
- /*
- * Find the spare disk ... (can only be in the 'high'
- * area of the array)
- */
- for (i = conf->raid_disks; i < MD_SB_DISKS; i++) {
- tmp = conf->mirrors + i;
- if (tmp->spare && tmp->number == (*d)->number) {
- spare_disk = i;
- break;
- }
- }
- if (spare_disk == -1) {
- MD_BUG();
- err = 1;
- goto abort;
- }
- break;
- case DISKOP_HOT_REMOVE_DISK:
- for (i = 0; i < MD_SB_DISKS; i++) {
- tmp = conf->mirrors + i;
- if (tmp->used_slot && (tmp->number == (*d)->number)) {
- if (tmp->operational) {
- err = -EBUSY;
- goto abort;
- }
- removed_disk = i;
- break;
- }
- }
- if (removed_disk == -1) {
- MD_BUG();
- err = 1;
- goto abort;
- }
- break;
- case DISKOP_HOT_ADD_DISK:
- for (i = conf->raid_disks; i < MD_SB_DISKS; i++) {
- tmp = conf->mirrors + i;
- if (!tmp->used_slot) {
- added_disk = i;
- break;
- }
- }
- if (added_disk == -1) {
- MD_BUG();
- err = 1;
- goto abort;
- }
- break;
- }
- switch (state) {
- /*
- * Switch the spare disk to write-only mode:
- */
- case DISKOP_SPARE_WRITE:
- sdisk = conf->mirrors + spare_disk;
- sdisk->operational = 1;
- sdisk->write_only = 1;
- break;
- /*
- * Deactivate a spare disk:
- */
- case DISKOP_SPARE_INACTIVE:
- if (conf->start_future > 0) {
- MD_BUG();
- err = -EBUSY;
- break;
- }
- sdisk = conf->mirrors + spare_disk;
- sdisk->operational = 0;
- sdisk->write_only = 0;
- break;
- /*
- * Activate (mark read-write) the (now sync) spare disk,
- * which means we switch its 'raid position' (->raid_disk)
- * with the failed disk. (only the first 'conf->nr_disks'
- * slots are used for 'real' disks and we must preserve this
- * property)
- */
- case DISKOP_SPARE_ACTIVE:
- if (conf->start_future > 0) {
- MD_BUG();
- err = -EBUSY;
- break;
- }
- sdisk = conf->mirrors + spare_disk;
- fdisk = conf->mirrors + failed_disk;
- spare_desc = &sb->disks[sdisk->number];
- failed_desc = &sb->disks[fdisk->number];
- if (spare_desc != *d) {
- MD_BUG();
- err = 1;
- goto abort;
- }
- if (spare_desc->raid_disk != sdisk->raid_disk) {
- MD_BUG();
- err = 1;
- goto abort;
- }
-
- if (sdisk->raid_disk != spare_disk) {
- MD_BUG();
- err = 1;
- goto abort;
- }
- if (failed_desc->raid_disk != fdisk->raid_disk) {
- MD_BUG();
- err = 1;
- goto abort;
- }
- if (fdisk->raid_disk != failed_disk) {
- MD_BUG();
- err = 1;
- goto abort;
- }
- /*
- * do the switch finally
- */
- spare_rdev = find_rdev_nr(mddev, spare_desc->number);
- failed_rdev = find_rdev_nr(mddev, failed_desc->number);
- /* There must be a spare_rdev, but there may not be a
- * failed_rdev. That slot might be empty...
- */
- spare_rdev->desc_nr = failed_desc->number;
- if (failed_rdev)
- failed_rdev->desc_nr = spare_desc->number;
-
- xchg_values(*spare_desc, *failed_desc);
- xchg_values(*fdisk, *sdisk);
- /*
- * (careful, 'failed' and 'spare' are switched from now on)
- *
- * we want to preserve linear numbering and we want to
- * give the proper raid_disk number to the now activated
- * disk. (this means we switch back these values)
- */
-
- xchg_values(spare_desc->raid_disk, failed_desc->raid_disk);
- xchg_values(sdisk->raid_disk, fdisk->raid_disk);
- xchg_values(spare_desc->number, failed_desc->number);
- xchg_values(sdisk->number, fdisk->number);
- *d = failed_desc;
- if (sdisk->dev == MKDEV(0,0))
- sdisk->used_slot = 0;
- /*
- * this really activates the spare.
- */
- fdisk->spare = 0;
- fdisk->write_only = 0;
- /*
- * if we activate a spare, we definitely replace a
- * non-operational disk slot in the 'low' area of
- * the disk array.
- */
- conf->working_disks++;
- break;
- case DISKOP_HOT_REMOVE_DISK:
- rdisk = conf->mirrors + removed_disk;
- if (rdisk->spare && (removed_disk < conf->raid_disks)) {
- MD_BUG();
- err = 1;
- goto abort;
- }
- rdisk->dev = MKDEV(0,0);
- rdisk->used_slot = 0;
- conf->nr_disks--;
- break;
- case DISKOP_HOT_ADD_DISK:
- adisk = conf->mirrors + added_disk;
- added_desc = *d;
- if (added_disk != added_desc->number) {
- MD_BUG();
- err = 1;
- goto abort;
- }
- adisk->number = added_desc->number;
- adisk->raid_disk = added_desc->raid_disk;
- adisk->dev = MKDEV(added_desc->major,added_desc->minor);
- adisk->operational = 0;
- adisk->write_only = 0;
- adisk->spare = 1;
- adisk->used_slot = 1;
- adisk->head_position = 0;
- conf->nr_disks++;
- break;
- default:
- MD_BUG();
- err = 1;
- goto abort;
- }
- abort:
- md_spin_unlock_irq(&conf->device_lock);
- if (state == DISKOP_SPARE_ACTIVE || state == DISKOP_SPARE_INACTIVE)
- /* should move to "END_REBUILD" when such exists */
- raid1_shrink_buffers(conf);
- print_raid1_conf(conf);
- return err;
- }
- #define IO_ERROR KERN_ALERT \
- "raid1: %s: unrecoverable I/O read error for block %lu\n"
- #define REDIRECT_SECTOR KERN_ERR \
- "raid1: %s: redirecting sector %lu to another mirror\n"
- /*
- * This is a kernel thread which:
- *
- * 1. Retries failed read operations on working mirrors.
- * 2. Updates the raid superblock when problems are encountered.
- * 3. Performs writes following reads for array synchronising.
- */
- static void end_sync_write(struct buffer_head *bh, int uptodate);
- static void end_sync_read(struct buffer_head *bh, int uptodate);
- static void raid1d (void *data)
- {
- struct raid1_bh *r1_bh;
- struct buffer_head *bh;
- unsigned long flags;
- raid1_conf_t *conf = data;
- mddev_t *mddev = conf->mddev;
- kdev_t dev;
- if (mddev->sb_dirty)
- md_update_sb(mddev);
- for (;;) {
- md_spin_lock_irqsave(&retry_list_lock, flags);
- r1_bh = raid1_retry_list;
- if (!r1_bh)
- break;
- raid1_retry_list = r1_bh->next_r1;
- md_spin_unlock_irqrestore(&retry_list_lock, flags);
- mddev = r1_bh->mddev;
- bh = &r1_bh->bh_req;
- switch(r1_bh->cmd) {
- case SPECIAL:
- /* have to allocate lots of bh structures and
- * schedule writes
- */
- if (test_bit(R1BH_Uptodate, &r1_bh->state)) {
- int i, sum_bhs = 0;
- int disks = MD_SB_DISKS;
- struct buffer_head *bhl, *mbh;
-
- conf = mddev_to_conf(mddev);
- bhl = raid1_alloc_bh(conf, conf->raid_disks); /* don't really need this many */
- for (i = 0; i < disks ; i++) {
- if (!conf->mirrors[i].operational)
- continue;
- if (i==conf->last_used)
- /* we read from here, no need to write */
- continue;
- if (i < conf->raid_disks
- && !conf->resync_mirrors)
- /* don't need to write this,
- * we are just rebuilding */
- continue;
- mbh = bhl;
- if (!mbh) {
- MD_BUG();
- break;
- }
- bhl = mbh->b_next;
- mbh->b_this_page = (struct buffer_head *)1;
-
- /*
- * prepare mirrored bh (fields ordered for max mem throughput):
- */
- mbh->b_blocknr = bh->b_blocknr;
- mbh->b_dev = conf->mirrors[i].dev;
- mbh->b_rdev = conf->mirrors[i].dev;
- mbh->b_rsector = bh->b_blocknr;
- mbh->b_state = (1<<BH_Req) | (1<<BH_Dirty) |
- (1<<BH_Mapped) | (1<<BH_Lock);
- atomic_set(&mbh->b_count, 1);
- mbh->b_size = bh->b_size;
- mbh->b_page = bh->b_page;
- mbh->b_data = bh->b_data;
- mbh->b_list = BUF_LOCKED;
- mbh->b_end_io = end_sync_write;
- mbh->b_private = r1_bh;
- mbh->b_next = r1_bh->mirror_bh_list;
- r1_bh->mirror_bh_list = mbh;
- sum_bhs++;
- }
- md_atomic_set(&r1_bh->remaining, sum_bhs);
- if (bhl) raid1_free_bh(conf, bhl);
- mbh = r1_bh->mirror_bh_list;
- if (!sum_bhs) {
- /* nowhere to write this to... I guess we
- * must be done
- */
- sync_request_done(bh->b_blocknr, conf);
- md_done_sync(mddev, bh->b_size>>9, 0);
- raid1_free_buf(r1_bh);
- } else
- while (mbh) {
- struct buffer_head *bh1 = mbh;
- mbh = mbh->b_next;
- generic_make_request(WRITE, bh1);
- md_sync_acct(bh1->b_dev, bh1->b_size/512);
- }
- } else {
- /* There is no point trying a read-for-reconstruct
- * as reconstruct is about to be aborted
- */
- printk (IO_ERROR, partition_name(bh->b_dev), bh->b_blocknr);
- md_done_sync(mddev, bh->b_size>>9, 0);
- }
- break;
- case READ:
- case READA:
- dev = bh->b_dev;
- raid1_map (mddev, &bh->b_dev);
- if (bh->b_dev == dev) {
- printk (IO_ERROR, partition_name(bh->b_dev), bh->b_blocknr);
- raid1_end_bh_io(r1_bh, 0);
- } else {
- printk (REDIRECT_SECTOR,
- partition_name(bh->b_dev), bh->b_blocknr);
- bh->b_rdev = bh->b_dev;
- bh->b_rsector = bh->b_blocknr;
- generic_make_request (r1_bh->cmd, bh);
- }
- break;
- }
- }
- md_spin_unlock_irqrestore(&retry_list_lock, flags);
- }
- #undef IO_ERROR
- #undef REDIRECT_SECTOR
- /*
- * Private kernel thread to reconstruct mirrors after an unclean
- * shutdown.
- */
- static void raid1syncd (void *data)
- {
- raid1_conf_t *conf = data;
- mddev_t *mddev = conf->mddev;
- if (!conf->resync_mirrors)
- return;
- if (conf->resync_mirrors == 2)
- return;
- down(&mddev->recovery_sem);
- if (!md_do_sync(mddev, NULL)) {
- /*
- * Only if everything went Ok.
- */
- conf->resync_mirrors = 0;
- }
- close_sync(conf);
- up(&mddev->recovery_sem);
- raid1_shrink_buffers(conf);
- }
- /*
- * perform a "sync" on one "block"
- *
- * We need to make sure that no normal I/O request - particularly write
- * requests - conflict with active sync requests.
- * This is achieved by conceptually dividing the device space into a
- * number of sections:
- * DONE: 0 .. a-1 These blocks are in-sync
- * ACTIVE: a.. b-1 These blocks may have active sync requests, but
- * no normal IO requests
- * READY: b .. c-1 These blocks have no normal IO requests - sync
- * request may be happening
- * PENDING: c .. d-1 These blocks may have IO requests, but no new
- * ones will be added
- * FUTURE: d .. end These blocks are not to be considered yet. IO may
- * be happening, but not sync
- *
- * We keep a
- * phase which flips (0 or 1) each time d moves and
- * a count of:
- * z = active io requests in FUTURE since d moved - marked with
- * current phase
- * y = active io requests in FUTURE before d moved, or PENDING -
- * marked with previous phase
- * x = active sync requests in READY
- * w = active sync requests in ACTIVE
- * v = active io requests in DONE
- *
- * Normally, a=b=c=d=0 and z= active io requests
- * or a=b=c=d=END and v= active io requests
- * Allowed changes to a,b,c,d:
- * A: c==d && y==0 -> d+=window, y=z, z=0, phase=!phase
- * B: y==0 -> c=d
- * C: b=c, w+=x, x=0
- * D: w==0 -> a=b
- * E: a==b==c==d==end -> a=b=c=d=0, z=v, v=0
- *
- * At start of sync we apply A.
- * When y reaches 0, we apply B then A, then begin sync requests.
- * When the sync point reaches c-1, we wait for y==0 and w==0, and
- * then apply B then A then D then C.
- * Finally, we apply E
- *
- * The sync request simply issues a "read" against a working drive
- * This is marked so that on completion the raid1d thread is woken to
- * issue suitable write requests
- */
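- /*
-  * A sketch of the normal progression (restating the rules above): at
-  * the start of a sync, A slides d forward by one window, and normal IO
-  * beyond d is counted in y. Once y drains, B (c=d) and A open a READY
-  * window and sync reads are issued into it. As the sync point catches
-  * up, D (a=b, once ACTIVE drains) and C (b=c) advance the in-sync
-  * boundary, and E resets everything when the whole device is done.
-  */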
- static int raid1_sync_request (mddev_t *mddev, unsigned long sector_nr)
- {
- raid1_conf_t *conf = mddev_to_conf(mddev);
- struct mirror_info *mirror;
- struct raid1_bh *r1_bh;
- struct buffer_head *bh;
- int bsize;
- int disk;
- int block_nr;
- int buffs;
- if (!sector_nr) {
- /* we want enough buffers to hold twice the window of 128 */
- buffs = 128 *2 / (PAGE_SIZE>>9);
- buffs = raid1_grow_buffers(conf, buffs);
- if (buffs < 2)
- goto nomem;
- conf->window = buffs*(PAGE_SIZE>>9)/2;
- }
- spin_lock_irq(&conf->segment_lock);
- if (!sector_nr) {
- /* initialize ...*/
- conf->start_active = 0;
- conf->start_ready = 0;
- conf->start_pending = 0;
- conf->start_future = 0;
- conf->phase = 0;
-
- conf->cnt_future += conf->cnt_done+conf->cnt_pending;
- conf->cnt_done = conf->cnt_pending = 0;
- if (conf->cnt_ready || conf->cnt_active)
- MD_BUG();
- }
- while (sector_nr >= conf->start_pending) {
- PRINTK("wait .. sect=%lu start_active=%d ready=%d pending=%d future=%d, cnt_done=%d active=%d ready=%d pending=%d future=%dn",
- sector_nr, conf->start_active, conf->start_ready, conf->start_pending, conf->start_future,
- conf->cnt_done, conf->cnt_active, conf->cnt_ready, conf->cnt_pending, conf->cnt_future);
- wait_event_lock_irq(conf->wait_done,
- !conf->cnt_active,
- conf->segment_lock);
- wait_event_lock_irq(conf->wait_ready,
- !conf->cnt_pending,
- conf->segment_lock);
- conf->start_active = conf->start_ready;
- conf->start_ready = conf->start_pending;
- conf->start_pending = conf->start_future;
- conf->start_future = conf->start_future+conf->window;
- // Note: falling off the end is not a problem
- conf->phase = conf->phase ^1;
- conf->cnt_active = conf->cnt_ready;
- conf->cnt_ready = 0;
- conf->cnt_pending = conf->cnt_future;
- conf->cnt_future = 0;
- wake_up(&conf->wait_done);
- }
- conf->cnt_ready++;
- spin_unlock_irq(&conf->segment_lock);
-
- /* If reconstructing, and >1 working disk,
- * could dedicate one to rebuild and others to
- * service read requests ..
- */
- disk = conf->last_used;
- /* make sure disk is operational */
- while (!conf->mirrors[disk].operational) {
- if (disk <= 0) disk = conf->raid_disks;
- disk--;
- if (disk == conf->last_used)
- break;
- }
- conf->last_used = disk;
-
- mirror = conf->mirrors+conf->last_used;
-
- r1_bh = raid1_alloc_buf (conf);
- r1_bh->master_bh = NULL;
- r1_bh->mddev = mddev;
- r1_bh->cmd = SPECIAL;
- bh = &r1_bh->bh_req;
- block_nr = sector_nr;
- bsize = 512;
- while (!(block_nr & 1) && bsize < PAGE_SIZE
- && (block_nr+2)*(bsize>>9) < (mddev->sb->size *2)) {
- block_nr >>= 1;
- bsize <<= 1;
- }
- bh->b_size = bsize;
- bh->b_list = BUF_LOCKED;
- bh->b_dev = mirror->dev;
- bh->b_rdev = mirror->dev;
- bh->b_state = (1<<BH_Req) | (1<<BH_Mapped) | (1<<BH_Lock);
- if (!bh->b_page)
- BUG();
- if (!bh->b_data)
- BUG();
- if (bh->b_data != page_address(bh->b_page))
- BUG();
- bh->b_end_io = end_sync_read;
- bh->b_private = r1_bh;
- bh->b_blocknr = sector_nr;
- bh->b_rsector = sector_nr;
- init_waitqueue_head(&bh->b_wait);
- generic_make_request(READ, bh);
- md_sync_acct(bh->b_dev, bh->b_size/512);
- return (bsize >> 9);
- nomem:
- raid1_shrink_buffers(conf);
- return -ENOMEM;
- }
- static void end_sync_read(struct buffer_head *bh, int uptodate)
- {
- struct raid1_bh * r1_bh = (struct raid1_bh *)(bh->b_private);
- /* we have read a block, now it needs to be re-written,
- * or re-read if the read failed.
- * We don't do much here, just schedule handling by raid1d
- */
- if (!uptodate)
- md_error (r1_bh->mddev, bh->b_dev);
- else
- set_bit(R1BH_Uptodate, &r1_bh->state);
- raid1_reschedule_retry(r1_bh);
- }
- static void end_sync_write(struct buffer_head *bh, int uptodate)
- {
- struct raid1_bh * r1_bh = (struct raid1_bh *)(bh->b_private);
-
- if (!uptodate)
- md_error (r1_bh->mddev, bh->b_dev);
- if (atomic_dec_and_test(&r1_bh->remaining)) {
- mddev_t *mddev = r1_bh->mddev;
- unsigned long sect = bh->b_blocknr;
- int size = bh->b_size;
- raid1_free_buf(r1_bh);
- sync_request_done(sect, mddev_to_conf(mddev));
- md_done_sync(mddev,size>>9, uptodate);
- }
- }
- #define INVALID_LEVEL KERN_WARNING \
- "raid1: md%d: raid level not set to mirroring (%d)\n"
- #define NO_SB KERN_ERR \
- "raid1: disabled mirror %s (couldn't access raid superblock)\n"
- #define ERRORS KERN_ERR \
- "raid1: disabled mirror %s (errors detected)\n"
- #define NOT_IN_SYNC KERN_ERR \
- "raid1: disabled mirror %s (not in sync)\n"
- #define INCONSISTENT KERN_ERR \
- "raid1: disabled mirror %s (inconsistent descriptor)\n"
- #define ALREADY_RUNNING KERN_ERR \
- "raid1: disabled mirror %s (mirror %d already operational)\n"
- #define OPERATIONAL KERN_INFO \
- "raid1: device %s operational as mirror %d\n"
- #define MEM_ERROR KERN_ERR \
- "raid1: couldn't allocate memory for md%d\n"
- #define SPARE KERN_INFO \
- "raid1: spare disk %s\n"
- #define NONE_OPERATIONAL KERN_ERR \
- "raid1: no operational mirrors for md%d\n"
- #define ARRAY_IS_ACTIVE KERN_INFO \
- "raid1: raid set md%d active with %d out of %d mirrors\n"
- #define THREAD_ERROR KERN_ERR \
- "raid1: couldn't allocate thread for md%d\n"
- #define START_RESYNC KERN_WARNING \
- "raid1: raid set md%d not clean; reconstructing mirrors\n"
- static int raid1_run (mddev_t *mddev)
- {
- raid1_conf_t *conf;
- int i, j, disk_idx;
- struct mirror_info *disk;
- mdp_super_t *sb = mddev->sb;
- mdp_disk_t *descriptor;
- mdk_rdev_t *rdev;
- struct md_list_head *tmp;
- int start_recovery = 0;
- MOD_INC_USE_COUNT;
- if (sb->level != 1) {
- printk(INVALID_LEVEL, mdidx(mddev), sb->level);
- goto out;
- }
- /*
- * copy the already verified devices into our private RAID1
- * bookkeeping area. [whatever we allocate in raid1_run(),
- * should be freed in raid1_stop()]
- */
- conf = kmalloc(sizeof(raid1_conf_t), GFP_KERNEL);
- mddev->private = conf;
- if (!conf) {
- printk(MEM_ERROR, mdidx(mddev));
- goto out;
- }
- memset(conf, 0, sizeof(*conf));
- ITERATE_RDEV(mddev,rdev,tmp) {
- if (rdev->faulty) {
- printk(ERRORS, partition_name(rdev->dev));
- } else {
- if (!rdev->sb) {
- MD_BUG();
- continue;
- }
- }
- if (rdev->desc_nr == -1) {
- MD_BUG();
- continue;
- }
- descriptor = &sb->disks[rdev->desc_nr];
- disk_idx = descriptor->raid_disk;
- disk = conf->mirrors + disk_idx;
- if (disk_faulty(descriptor)) {
- disk->number = descriptor->number;
- disk->raid_disk = disk_idx;
- disk->dev = rdev->dev;
- disk->sect_limit = MAX_WORK_PER_DISK;
- disk->operational = 0;
- disk->write_only = 0;
- disk->spare = 0;
- disk->used_slot = 1;
- disk->head_position = 0;
- continue;
- }
- if (disk_active(descriptor)) {
- if (!disk_sync(descriptor)) {
- printk(NOT_IN_SYNC,
- partition_name(rdev->dev));
- continue;
- }
- if ((descriptor->number > MD_SB_DISKS) ||
- (disk_idx > sb->raid_disks)) {
- printk(INCONSISTENT,
- partition_name(rdev->dev));
- continue;
- }
- if (disk->operational) {
- printk(ALREADY_RUNNING,
- partition_name(rdev->dev),
- disk_idx);
- continue;
- }
- printk(OPERATIONAL, partition_name(rdev->dev),
- disk_idx);
- disk->number = descriptor->number;
- disk->raid_disk = disk_idx;
- disk->dev = rdev->dev;
- disk->sect_limit = MAX_WORK_PER_DISK;
- disk->operational = 1;
- disk->write_only = 0;
- disk->spare = 0;
- disk->used_slot = 1;
- disk->head_position = 0;
- conf->working_disks++;
- } else {
- /*
- * Must be a spare disk ..
- */
- printk(SPARE, partition_name(rdev->dev));
- disk->number = descriptor->number;
- disk->raid_disk = disk_idx;
- disk->dev = rdev->dev;
- disk->sect_limit = MAX_WORK_PER_DISK;
- disk->operational = 0;
- disk->write_only = 0;
- disk->spare = 1;
- disk->used_slot = 1;
- disk->head_position = 0;
- }
- }
- conf->raid_disks = sb->raid_disks;
- conf->nr_disks = sb->nr_disks;
- conf->mddev = mddev;
- conf->device_lock = MD_SPIN_LOCK_UNLOCKED;
- conf->segment_lock = MD_SPIN_LOCK_UNLOCKED;
- init_waitqueue_head(&conf->wait_buffer);
- init_waitqueue_head(&conf->wait_done);
- init_waitqueue_head(&conf->wait_ready);
- if (!conf->working_disks) {
- printk(NONE_OPERATIONAL, mdidx(mddev));
- goto out_free_conf;
- }
- /* pre-allocate some buffer_head structures.
- * As a minimum, 1 r1bh and raid_disks buffer_heads
- * would probably get us by in tight memory situations,
- * but a few more is probably a good idea.
- * For now, try NR_RESERVED_BUFS r1bh and
- * NR_RESERVED_BUFS*raid_disks buffer_heads.
- * This will allow at least NR_RESERVED_BUFS concurrent
- * reads or writes even if kmalloc starts failing
- */
- if (raid1_grow_r1bh(conf, NR_RESERVED_BUFS) < NR_RESERVED_BUFS ||
- raid1_grow_bh(conf, NR_RESERVED_BUFS*conf->raid_disks)
- < NR_RESERVED_BUFS*conf->raid_disks) {
- printk(MEM_ERROR, mdidx(mddev));
- goto out_free_conf;
- }
- for (i = 0; i < MD_SB_DISKS; i++) {
-
- descriptor = sb->disks+i;
- disk_idx = descriptor->raid_disk;
- disk = conf->mirrors + disk_idx;
- if (disk_faulty(descriptor) && (disk_idx < conf->raid_disks) &&
- !disk->used_slot) {
- disk->number = descriptor->number;
- disk->raid_disk = disk_idx;
- disk->dev = MKDEV(0,0);
- disk->operational = 0;
- disk->write_only = 0;
- disk->spare = 0;
- disk->used_slot = 1;
- disk->head_position = 0;
- }
- }
- /*
- * find the first working one and use it as a starting point
- * to read balancing.
- */
- for (j = 0; j < MD_SB_DISKS && !conf->mirrors[j].operational; j++)
- /* nothing */;
- conf->last_used = j;
- if (conf->working_disks != sb->raid_disks) {
- printk(KERN_ALERT "raid1: md%d, not all disks are operational -- trying to recover arrayn", mdidx(mddev));
- start_recovery = 1;
- }
- {
- const char * name = "raid1d";
- conf->thread = md_register_thread(raid1d, conf, name);
- if (!conf->thread) {
- printk(THREAD_ERROR, mdidx(mddev));
- goto out_free_conf;
- }
- }
- if (!start_recovery && !(sb->state & (1 << MD_SB_CLEAN)) &&
- (conf->working_disks > 1)) {
- const char * name = "raid1syncd";
- conf->resync_thread = md_register_thread(raid1syncd, conf,name);
- if (!conf->resync_thread) {
- printk(THREAD_ERROR, mdidx(mddev));
- goto out_free_conf;
- }
- printk(START_RESYNC, mdidx(mddev));
- conf->resync_mirrors = 1;
- md_wakeup_thread(conf->resync_thread);
- }
- /*
- * Regenerate the "device is in sync with the raid set" bit for
- * each device.
- */
- for (i = 0; i < MD_SB_DISKS; i++) {
- mark_disk_nonsync(sb->disks+i);
- for (j = 0; j < sb->raid_disks; j++) {
- if (!conf->mirrors[j].operational)
- continue;
- if (sb->disks[i].number == conf->mirrors[j].number)
- mark_disk_sync(sb->disks+i);
- }
- }
- sb->active_disks = conf->working_disks;
- if (start_recovery)
- md_recover_arrays();
- printk(ARRAY_IS_ACTIVE, mdidx(mddev), sb->active_disks, sb->raid_disks);
- /*
- * Ok, everything is just fine now
- */
- return 0;
- out_free_conf:
- raid1_shrink_r1bh(conf);
- raid1_shrink_bh(conf);
- raid1_shrink_buffers(conf);
- kfree(conf);
- mddev->private = NULL;
- out:
- MOD_DEC_USE_COUNT;
- return -EIO;
- }
- #undef INVALID_LEVEL
- #undef NO_SB
- #undef ERRORS
- #undef NOT_IN_SYNC
- #undef INCONSISTENT
- #undef ALREADY_RUNNING
- #undef OPERATIONAL
- #undef SPARE
- #undef NONE_OPERATIONAL
- #undef ARRAY_IS_ACTIVE
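- /*
-  * Ask an in-flight resync to stop. Returns 1 if a resync was running
-  * and will need restarting, 0 otherwise.
-  */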
- static int raid1_stop_resync (mddev_t *mddev)
- {
- raid1_conf_t *conf = mddev_to_conf(mddev);
- if (conf->resync_thread) {
- if (conf->resync_mirrors) {
- conf->resync_mirrors = 2;
- md_interrupt_thread(conf->resync_thread);
- printk(KERN_INFO "raid1: mirror resync was not fully finished, restarting next time.n");
- return 1;
- }
- return 0;
- }
- return 0;
- }
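- /*
-  * Restart a previously interrupted resync by waking the resync thread;
-  * returns 1 if a restart was initiated.
-  */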
- static int raid1_restart_resync (mddev_t *mddev)
- {
- raid1_conf_t *conf = mddev_to_conf(mddev);
- if (conf->resync_mirrors) {
- if (!conf->resync_thread) {
- MD_BUG();
- return 0;
- }
- conf->resync_mirrors = 1;
- md_wakeup_thread(conf->resync_thread);
- return 1;
- }
- return 0;
- }
- static int raid1_stop (mddev_t *mddev)
- {
- raid1_conf_t *conf = mddev_to_conf(mddev);
- md_unregister_thread(conf->thread);
- if (conf->resync_thread)
- md_unregister_thread(conf->resync_thread);
- raid1_shrink_r1bh(conf);
- raid1_shrink_bh(conf);
- raid1_shrink_buffers(conf);
- kfree(conf);
- mddev->private = NULL;
- MOD_DEC_USE_COUNT;
- return 0;
- }
- static mdk_personality_t raid1_personality=
- {
- name: "raid1",
- make_request: raid1_make_request,
- run: raid1_run,
- stop: raid1_stop,
- status: raid1_status,
- error_handler: raid1_error,
- diskop: raid1_diskop,
- stop_resync: raid1_stop_resync,
- restart_resync: raid1_restart_resync,
- sync_request: raid1_sync_request
- };
- static int md__init raid1_init (void)
- {
- return register_md_personality (RAID1, &raid1_personality);
- }
- static void raid1_exit (void)
- {
- unregister_md_personality (RAID1);
- }
- module_init(raid1_init);
- module_exit(raid1_exit);
- MODULE_LICENSE("GPL");