multipath.c
上传用户:jlfgdled
上传日期:2013-04-10
资源大小:33168k
文件大小:25k
- /*
- * multipath.c : Multiple Devices driver for Linux
- *
- * Copyright (C) 1999, 2000, 2001 Ingo Molnar, Red Hat
- *
- * Copyright (C) 1996, 1997, 1998 Ingo Molnar, Miguel de Icaza, Gadi Oxman
- *
- * MULTIPATH management functions.
- *
- * derived from raid1.c.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2, or (at your option)
- * any later version.
- *
- * You should have received a copy of the GNU General Public License
- * (for example /usr/src/linux/COPYING); if not, write to the Free
- * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- */
- #include <linux/module.h>
- #include <linux/slab.h>
- #include <linux/raid/multipath.h>
- #include <asm/atomic.h>
- #define MAJOR_NR MD_MAJOR
- #define MD_DRIVER
- #define MD_PERSONALITY
- #define MAX_WORK_PER_DISK 128
- #define NR_RESERVED_BUFS 32
- /*
- * The following can be used to debug the driver
- */
- #define MULTIPATH_DEBUG 0
- #if MULTIPATH_DEBUG
- #define PRINTK(x...) printk(x)
- #define inline
- #define __inline__
- #else
- #define PRINTK(x...) do { } while (0)
- #endif
- static mdk_personality_t multipath_personality;
- static md_spinlock_t retry_list_lock = MD_SPIN_LOCK_UNLOCKED;
- struct multipath_bh *multipath_retry_list = NULL, **multipath_retry_tail;
- static int multipath_diskop(mddev_t *mddev, mdp_disk_t **d, int state);
- static struct multipath_bh *multipath_alloc_mpbh(multipath_conf_t *conf)
- {
- struct multipath_bh *mp_bh = NULL;
- do {
- md_spin_lock_irq(&conf->device_lock);
- if (!conf->freer1_blocked && conf->freer1) {
- mp_bh = conf->freer1;
- conf->freer1 = mp_bh->next_mp;
- conf->freer1_cnt--;
- mp_bh->next_mp = NULL;
- mp_bh->state = (1 << MPBH_PreAlloc);
- mp_bh->bh_req.b_state = 0;
- }
- md_spin_unlock_irq(&conf->device_lock);
- if (mp_bh)
- return mp_bh;
- mp_bh = (struct multipath_bh *) kmalloc(sizeof(struct multipath_bh),
- GFP_NOIO);
- if (mp_bh) {
- memset(mp_bh, 0, sizeof(*mp_bh));
- return mp_bh;
- }
- conf->freer1_blocked = 1;
- wait_disk_event(conf->wait_buffer,
- !conf->freer1_blocked ||
- conf->freer1_cnt > NR_RESERVED_BUFS/2
- );
- conf->freer1_blocked = 0;
- } while (1);
- }
- static inline void multipath_free_mpbh(struct multipath_bh *mp_bh)
- {
- multipath_conf_t *conf = mddev_to_conf(mp_bh->mddev);
- if (test_bit(MPBH_PreAlloc, &mp_bh->state)) {
- unsigned long flags;
- spin_lock_irqsave(&conf->device_lock, flags);
- mp_bh->next_mp = conf->freer1;
- conf->freer1 = mp_bh;
- conf->freer1_cnt++;
- spin_unlock_irqrestore(&conf->device_lock, flags);
- wake_up(&conf->wait_buffer);
- } else {
- kfree(mp_bh);
- }
- }
- static int multipath_grow_mpbh (multipath_conf_t *conf, int cnt)
- {
- int i = 0;
- while (i < cnt) {
- struct multipath_bh *mp_bh;
- mp_bh = (struct multipath_bh*)kmalloc(sizeof(*mp_bh), GFP_KERNEL);
- if (!mp_bh)
- break;
- memset(mp_bh, 0, sizeof(*mp_bh));
- set_bit(MPBH_PreAlloc, &mp_bh->state);
- mp_bh->mddev = conf->mddev;
- multipath_free_mpbh(mp_bh);
- i++;
- }
- return i;
- }
- static void multipath_shrink_mpbh(multipath_conf_t *conf)
- {
- md_spin_lock_irq(&conf->device_lock);
- while (conf->freer1) {
- struct multipath_bh *mp_bh = conf->freer1;
- conf->freer1 = mp_bh->next_mp;
- conf->freer1_cnt--;
- kfree(mp_bh);
- }
- md_spin_unlock_irq(&conf->device_lock);
- }
- static int multipath_map (mddev_t *mddev, kdev_t *rdev)
- {
- multipath_conf_t *conf = mddev_to_conf(mddev);
- int i, disks = MD_SB_DISKS;
- /*
- * Later we do read balancing on the read side
- * now we use the first available disk.
- */
- for (i = 0; i < disks; i++) {
- if (conf->multipaths[i].operational) {
- *rdev = conf->multipaths[i].dev;
- return (0);
- }
- }
- printk (KERN_ERR "multipath_map(): no more operational IO paths?n");
- return (-1);
- }
- static void multipath_reschedule_retry (struct multipath_bh *mp_bh)
- {
- unsigned long flags;
- mddev_t *mddev = mp_bh->mddev;
- multipath_conf_t *conf = mddev_to_conf(mddev);
- md_spin_lock_irqsave(&retry_list_lock, flags);
- if (multipath_retry_list == NULL)
- multipath_retry_tail = &multipath_retry_list;
- *multipath_retry_tail = mp_bh;
- multipath_retry_tail = &mp_bh->next_mp;
- mp_bh->next_mp = NULL;
- md_spin_unlock_irqrestore(&retry_list_lock, flags);
- md_wakeup_thread(conf->thread);
- }
- /*
- * multipath_end_bh_io() is called when we have finished servicing a multipathed
- * operation and are ready to return a success/failure code to the buffer
- * cache layer.
- */
- static void multipath_end_bh_io (struct multipath_bh *mp_bh, int uptodate)
- {
- struct buffer_head *bh = mp_bh->master_bh;
- bh->b_end_io(bh, uptodate);
- multipath_free_mpbh(mp_bh);
- }
- void multipath_end_request (struct buffer_head *bh, int uptodate)
- {
- struct multipath_bh * mp_bh = (struct multipath_bh *)(bh->b_private);
- /*
- * this branch is our 'one multipath IO has finished' event handler:
- */
- if (!uptodate)
- md_error (mp_bh->mddev, bh->b_dev);
- else
- /*
- * Set MPBH_Uptodate in our master buffer_head, so that
- * we will return a good error code for to the higher
- * levels even if IO on some other multipathed buffer fails.
- *
- * The 'master' represents the complex operation to
- * user-side. So if something waits for IO, then it will
- * wait for the 'master' buffer_head.
- */
- set_bit (MPBH_Uptodate, &mp_bh->state);
-
- if (uptodate) {
- multipath_end_bh_io(mp_bh, uptodate);
- return;
- }
- /*
- * oops, IO error:
- */
- printk(KERN_ERR "multipath: %s: rescheduling block %lun",
- partition_name(bh->b_dev), bh->b_blocknr);
- multipath_reschedule_retry(mp_bh);
- return;
- }
- /*
- * This routine returns the disk from which the requested read should
- * be done.
- */
- static int multipath_read_balance (multipath_conf_t *conf)
- {
- int disk;
- for (disk = 0; disk < conf->raid_disks; disk++)
- if (conf->multipaths[disk].operational)
- return disk;
- BUG();
- return 0;
- }
- static int multipath_make_request (mddev_t *mddev, int rw,
- struct buffer_head * bh)
- {
- multipath_conf_t *conf = mddev_to_conf(mddev);
- struct buffer_head *bh_req;
- struct multipath_bh * mp_bh;
- struct multipath_info *multipath;
- if (!buffer_locked(bh))
- BUG();
-
- /*
- * make_request() can abort the operation when READA is being
- * used and no empty request is available.
- *
- * Currently, just replace the command with READ/WRITE.
- */
- if (rw == READA)
- rw = READ;
- mp_bh = multipath_alloc_mpbh (conf);
- mp_bh->master_bh = bh;
- mp_bh->mddev = mddev;
- mp_bh->cmd = rw;
- /*
- * read balancing logic:
- */
- multipath = conf->multipaths + multipath_read_balance(conf);
- bh_req = &mp_bh->bh_req;
- memcpy(bh_req, bh, sizeof(*bh));
- bh_req->b_blocknr = bh->b_rsector;
- bh_req->b_dev = multipath->dev;
- bh_req->b_rdev = multipath->dev;
- /* bh_req->b_rsector = bh->n_rsector; */
- bh_req->b_end_io = multipath_end_request;
- bh_req->b_private = mp_bh;
- generic_make_request (rw, bh_req);
- return 0;
- }
- static int multipath_status (char *page, mddev_t *mddev)
- {
- multipath_conf_t *conf = mddev_to_conf(mddev);
- int sz = 0, i;
-
- sz += sprintf (page+sz, " [%d/%d] [", conf->raid_disks,
- conf->working_disks);
- for (i = 0; i < conf->raid_disks; i++)
- sz += sprintf (page+sz, "%s",
- conf->multipaths[i].operational ? "U" : "_");
- sz += sprintf (page+sz, "]");
- return sz;
- }
/*
 * Log messages used by mark_disk_bad() and multipath_error().
 *
 * Each definition continues onto the following line(s) with a trailing
 * backslash so that the log level and the format string form one macro
 * body, and each message ends with a newline escape.
 */

/* Logged when the only remaining path reports an error. */
#define LAST_DISK KERN_ALERT \
"multipath: only one IO path left and IO error.\n"

/* Not referenced by the code visible in this file. */
#define NO_SPARE_DISK KERN_ALERT \
"multipath: no spare IO path left!\n"

/* Arguments: failed device name (%s), number of paths still working (%d). */
#define DISK_FAILED KERN_ALERT \
"multipath: IO failure on %s, disabling IO path. \n" \
" Operation continuing on %d IO paths.\n"
- static void mark_disk_bad (mddev_t *mddev, int failed)
- {
- multipath_conf_t *conf = mddev_to_conf(mddev);
- struct multipath_info *multipath = conf->multipaths+failed;
- mdp_super_t *sb = mddev->sb;
- multipath->operational = 0;
- mark_disk_faulty(sb->disks+multipath->number);
- mark_disk_nonsync(sb->disks+multipath->number);
- mark_disk_inactive(sb->disks+multipath->number);
- sb->active_disks--;
- sb->working_disks--;
- sb->failed_disks++;
- mddev->sb_dirty = 1;
- md_wakeup_thread(conf->thread);
- conf->working_disks--;
- printk (DISK_FAILED, partition_name (multipath->dev),
- conf->working_disks);
- }
- /*
- * Careful, this can execute in IRQ contexts as well!
- */
- static int multipath_error (mddev_t *mddev, kdev_t dev)
- {
- multipath_conf_t *conf = mddev_to_conf(mddev);
- struct multipath_info * multipaths = conf->multipaths;
- int disks = MD_SB_DISKS;
- int other_paths = 1;
- int i;
- if (conf->working_disks == 1) {
- other_paths = 0;
- for (i = 0; i < disks; i++) {
- if (multipaths[i].spare) {
- other_paths = 1;
- break;
- }
- }
- }
- if (!other_paths) {
- /*
- * Uh oh, we can do nothing if this is our last path, but
- * first check if this is a queued request for a device
- * which has just failed.
- */
- for (i = 0; i < disks; i++) {
- if (multipaths[i].dev==dev && !multipaths[i].operational)
- return 0;
- }
- printk (LAST_DISK);
- } else {
- /*
- * Mark disk as unusable
- */
- for (i = 0; i < disks; i++) {
- if (multipaths[i].dev==dev && multipaths[i].operational) {
- mark_disk_bad(mddev, i);
- break;
- }
- }
- if (!conf->working_disks) {
- int err = 1;
- mdp_disk_t *spare;
- mdp_super_t *sb = mddev->sb;
- spare = get_spare(mddev);
- if (spare) {
- err = multipath_diskop(mddev, &spare, DISKOP_SPARE_WRITE);
- printk("got DISKOP_SPARE_WRITE err: %d. (spare_faulty(): %d)n", err, disk_faulty(spare));
- }
- if (!err && !disk_faulty(spare)) {
- multipath_diskop(mddev, &spare, DISKOP_SPARE_ACTIVE);
- mark_disk_sync(spare);
- mark_disk_active(spare);
- sb->active_disks++;
- sb->spare_disks--;
- }
- }
- }
- return 0;
- }
- #undef LAST_DISK
- #undef NO_SPARE_DISK
- #undef DISK_FAILED
- static void print_multipath_conf (multipath_conf_t *conf)
- {
- int i;
- struct multipath_info *tmp;
- printk("MULTIPATH conf printout:n");
- if (!conf) {
- printk("(conf==NULL)n");
- return;
- }
- printk(" --- wd:%d rd:%d nd:%dn", conf->working_disks,
- conf->raid_disks, conf->nr_disks);
- for (i = 0; i < MD_SB_DISKS; i++) {
- tmp = conf->multipaths + i;
- if (tmp->spare || tmp->operational || tmp->number ||
- tmp->raid_disk || tmp->used_slot)
- printk(" disk%d, s:%d, o:%d, n:%d rd:%d us:%d dev:%sn",
- i, tmp->spare,tmp->operational,
- tmp->number,tmp->raid_disk,tmp->used_slot,
- partition_name(tmp->dev));
- }
- }
- static int multipath_diskop(mddev_t *mddev, mdp_disk_t **d, int state)
- {
- int err = 0;
- int i, failed_disk=-1, spare_disk=-1, removed_disk=-1, added_disk=-1;
- multipath_conf_t *conf = mddev->private;
- struct multipath_info *tmp, *sdisk, *fdisk, *rdisk, *adisk;
- mdp_super_t *sb = mddev->sb;
- mdp_disk_t *failed_desc, *spare_desc, *added_desc;
- mdk_rdev_t *spare_rdev, *failed_rdev;
- print_multipath_conf(conf);
- md_spin_lock_irq(&conf->device_lock);
- /*
- * find the disk ...
- */
- switch (state) {
- case DISKOP_SPARE_ACTIVE:
- /*
- * Find the failed disk within the MULTIPATH configuration ...
- * (this can only be in the first conf->working_disks part)
- */
- for (i = 0; i < conf->raid_disks; i++) {
- tmp = conf->multipaths + i;
- if ((!tmp->operational && !tmp->spare) ||
- !tmp->used_slot) {
- failed_disk = i;
- break;
- }
- }
- /*
- * When we activate a spare disk we _must_ have a disk in
- * the lower (active) part of the array to replace.
- */
- if ((failed_disk == -1) || (failed_disk >= conf->raid_disks)) {
- MD_BUG();
- err = 1;
- goto abort;
- }
- /* fall through */
- case DISKOP_SPARE_WRITE:
- case DISKOP_SPARE_INACTIVE:
- /*
- * Find the spare disk ... (can only be in the 'high'
- * area of the array)
- */
- for (i = conf->raid_disks; i < MD_SB_DISKS; i++) {
- tmp = conf->multipaths + i;
- if (tmp->spare && tmp->number == (*d)->number) {
- spare_disk = i;
- break;
- }
- }
- if (spare_disk == -1) {
- MD_BUG();
- err = 1;
- goto abort;
- }
- break;
- case DISKOP_HOT_REMOVE_DISK:
- for (i = 0; i < MD_SB_DISKS; i++) {
- tmp = conf->multipaths + i;
- if (tmp->used_slot && (tmp->number == (*d)->number)) {
- if (tmp->operational) {
- printk(KERN_ERR "hot-remove-disk, slot %d is identified to be the requested disk (number %d), but is still operational!n", i, (*d)->number);
- err = -EBUSY;
- goto abort;
- }
- removed_disk = i;
- break;
- }
- }
- if (removed_disk == -1) {
- MD_BUG();
- err = 1;
- goto abort;
- }
- break;
- case DISKOP_HOT_ADD_DISK:
- for (i = conf->raid_disks; i < MD_SB_DISKS; i++) {
- tmp = conf->multipaths + i;
- if (!tmp->used_slot) {
- added_disk = i;
- break;
- }
- }
- if (added_disk == -1) {
- MD_BUG();
- err = 1;
- goto abort;
- }
- break;
- }
- switch (state) {
- /*
- * Switch the spare disk to write-only mode:
- */
- case DISKOP_SPARE_WRITE:
- sdisk = conf->multipaths + spare_disk;
- sdisk->operational = 1;
- break;
- /*
- * Deactivate a spare disk:
- */
- case DISKOP_SPARE_INACTIVE:
- sdisk = conf->multipaths + spare_disk;
- sdisk->operational = 0;
- break;
- /*
- * Activate (mark read-write) the (now sync) spare disk,
- * which means we switch it's 'raid position' (->raid_disk)
- * with the failed disk. (only the first 'conf->nr_disks'
- * slots are used for 'real' disks and we must preserve this
- * property)
- */
- case DISKOP_SPARE_ACTIVE:
- sdisk = conf->multipaths + spare_disk;
- fdisk = conf->multipaths + failed_disk;
- spare_desc = &sb->disks[sdisk->number];
- failed_desc = &sb->disks[fdisk->number];
- if (spare_desc != *d) {
- MD_BUG();
- err = 1;
- goto abort;
- }
- if (spare_desc->raid_disk != sdisk->raid_disk) {
- MD_BUG();
- err = 1;
- goto abort;
- }
-
- if (sdisk->raid_disk != spare_disk) {
- MD_BUG();
- err = 1;
- goto abort;
- }
- if (failed_desc->raid_disk != fdisk->raid_disk) {
- MD_BUG();
- err = 1;
- goto abort;
- }
- if (fdisk->raid_disk != failed_disk) {
- MD_BUG();
- err = 1;
- goto abort;
- }
- /*
- * do the switch finally
- */
- spare_rdev = find_rdev_nr(mddev, spare_desc->number);
- failed_rdev = find_rdev_nr(mddev, failed_desc->number);
- xchg_values(spare_rdev->desc_nr, failed_rdev->desc_nr);
- spare_rdev->alias_device = 0;
- failed_rdev->alias_device = 1;
- xchg_values(*spare_desc, *failed_desc);
- xchg_values(*fdisk, *sdisk);
- /*
- * (careful, 'failed' and 'spare' are switched from now on)
- *
- * we want to preserve linear numbering and we want to
- * give the proper raid_disk number to the now activated
- * disk. (this means we switch back these values)
- */
-
- xchg_values(spare_desc->raid_disk, failed_desc->raid_disk);
- xchg_values(sdisk->raid_disk, fdisk->raid_disk);
- xchg_values(spare_desc->number, failed_desc->number);
- xchg_values(sdisk->number, fdisk->number);
- *d = failed_desc;
- if (sdisk->dev == MKDEV(0,0))
- sdisk->used_slot = 0;
- /*
- * this really activates the spare.
- */
- fdisk->spare = 0;
- /*
- * if we activate a spare, we definitely replace a
- * non-operational disk slot in the 'low' area of
- * the disk array.
- */
- conf->working_disks++;
- break;
- case DISKOP_HOT_REMOVE_DISK:
- rdisk = conf->multipaths + removed_disk;
- if (rdisk->spare && (removed_disk < conf->raid_disks)) {
- MD_BUG();
- err = 1;
- goto abort;
- }
- rdisk->dev = MKDEV(0,0);
- rdisk->used_slot = 0;
- conf->nr_disks--;
- break;
- case DISKOP_HOT_ADD_DISK:
- adisk = conf->multipaths + added_disk;
- added_desc = *d;
- if (added_disk != added_desc->number) {
- MD_BUG();
- err = 1;
- goto abort;
- }
- adisk->number = added_desc->number;
- adisk->raid_disk = added_desc->raid_disk;
- adisk->dev = MKDEV(added_desc->major,added_desc->minor);
- adisk->operational = 0;
- adisk->spare = 1;
- adisk->used_slot = 1;
- conf->nr_disks++;
- break;
- default:
- MD_BUG();
- err = 1;
- goto abort;
- }
- abort:
- md_spin_unlock_irq(&conf->device_lock);
- print_multipath_conf(conf);
- return err;
- }
/*
 * Log messages used by multipathd().  Both take a device name (%s) and a
 * block number (%lu).  The trailing backslash joins log level and format
 * string into one macro body; each message ends with a newline escape.
 */
#define IO_ERROR KERN_ALERT \
"multipath: %s: unrecoverable IO read error for block %lu\n"

#define REDIRECT_SECTOR KERN_ERR \
"multipath: %s: redirecting sector %lu to another IO path\n"
- /*
- * This is a kernel thread which:
- *
- * 1. Retries failed read operations on working multipaths.
- * 2. Updates the raid superblock when problems encounter.
- * 3. Performs writes following reads for array syncronising.
- */
- static void multipathd (void *data)
- {
- struct multipath_bh *mp_bh;
- struct buffer_head *bh;
- unsigned long flags;
- mddev_t *mddev;
- kdev_t dev;
- for (;;) {
- md_spin_lock_irqsave(&retry_list_lock, flags);
- mp_bh = multipath_retry_list;
- if (!mp_bh)
- break;
- multipath_retry_list = mp_bh->next_mp;
- md_spin_unlock_irqrestore(&retry_list_lock, flags);
- mddev = mp_bh->mddev;
- if (mddev->sb_dirty)
- md_update_sb(mddev);
- bh = &mp_bh->bh_req;
- dev = bh->b_dev;
-
- multipath_map (mddev, &bh->b_dev);
- if (bh->b_dev == dev) {
- printk (IO_ERROR, partition_name(bh->b_dev), bh->b_blocknr);
- multipath_end_bh_io(mp_bh, 0);
- } else {
- printk (REDIRECT_SECTOR,
- partition_name(bh->b_dev), bh->b_blocknr);
- bh->b_rdev = bh->b_dev;
- bh->b_rsector = bh->b_blocknr;
- generic_make_request (mp_bh->cmd, bh);
- }
- }
- md_spin_unlock_irqrestore(&retry_list_lock, flags);
- }
- #undef IO_ERROR
- #undef REDIRECT_SECTOR
- /*
- * This will catch the scenario in which one of the multipaths was
- * mounted as a normal device rather than as a part of a raid set.
- *
- * check_consistency is very personality-dependent, eg. RAID5 cannot
- * do this check, it uses another method.
- */
- static int __check_consistency (mddev_t *mddev, int row)
- {
- multipath_conf_t *conf = mddev_to_conf(mddev);
- int disks = MD_SB_DISKS;
- kdev_t dev;
- struct buffer_head *bh = NULL;
- int i, rc = 0;
- char *buffer = NULL;
- for (i = 0; i < disks; i++) {
- if (!conf->multipaths[i].operational)
- continue;
- printk("(checking disk %d)n",i);
- dev = conf->multipaths[i].dev;
- set_blocksize(dev, 4096);
- if ((bh = bread(dev, row / 4, 4096)) == NULL)
- break;
- if (!buffer) {
- buffer = (char *) __get_free_page(GFP_KERNEL);
- if (!buffer)
- break;
- memcpy(buffer, bh->b_data, 4096);
- } else if (memcmp(buffer, bh->b_data, 4096)) {
- rc = 1;
- break;
- }
- bforget(bh);
- fsync_dev(dev);
- invalidate_buffers(dev);
- bh = NULL;
- }
- if (buffer)
- free_page((unsigned long) buffer);
- if (bh) {
- dev = bh->b_dev;
- bforget(bh);
- fsync_dev(dev);
- invalidate_buffers(dev);
- }
- return rc;
- }
- static int check_consistency (mddev_t *mddev)
- {
- if (__check_consistency(mddev, 0))
- /*
- * we do not do this currently, as it's perfectly possible to
- * have an inconsistent array when it's freshly created. Only
- * newly written data has to be consistent.
- */
- return 0;
- return 0;
- }
/*
 * Log messages for multipath_run().
 *
 * Every definition continues onto the next line with a trailing
 * backslash so that log level and format string form a single macro
 * body, and every message ends with a newline escape.
 */
#define INVALID_LEVEL KERN_WARNING \
"multipath: md%d: raid level not set to multipath IO (%d)\n"

#define NO_SB KERN_ERR \
"multipath: disabled IO path %s (couldn't access raid superblock)\n"

#define ERRORS KERN_ERR \
"multipath: disabled IO path %s (errors detected)\n"

#define NOT_IN_SYNC KERN_ERR \
"multipath: making IO path %s a spare path (not in sync)\n"

#define INCONSISTENT KERN_ERR \
"multipath: disabled IO path %s (inconsistent descriptor)\n"

#define ALREADY_RUNNING KERN_ERR \
"multipath: disabled IO path %s (multipath %d already operational)\n"

#define OPERATIONAL KERN_INFO \
"multipath: device %s operational as IO path %d\n"

#define MEM_ERROR KERN_ERR \
"multipath: couldn't allocate memory for md%d\n"

#define SPARE KERN_INFO \
"multipath: spare IO path %s\n"

#define NONE_OPERATIONAL KERN_ERR \
"multipath: no operational IO paths for md%d\n"

#define SB_DIFFERENCES KERN_ERR \
"multipath: detected IO path differences!\n"

#define ARRAY_IS_ACTIVE KERN_INFO \
"multipath: array md%d active with %d out of %d IO paths (%d spare IO paths)\n"

#define THREAD_ERROR KERN_ERR \
"multipath: couldn't allocate thread for md%d\n"
- static int multipath_run (mddev_t *mddev)
- {
- multipath_conf_t *conf;
- int i, j, disk_idx;
- struct multipath_info *disk, *disk2;
- mdp_super_t *sb = mddev->sb;
- mdp_disk_t *desc, *desc2;
- mdk_rdev_t *rdev, *def_rdev = NULL;
- struct md_list_head *tmp;
- int num_rdevs = 0;
- MOD_INC_USE_COUNT;
- if (sb->level != -4) {
- printk(INVALID_LEVEL, mdidx(mddev), sb->level);
- goto out;
- }
- /*
- * copy the already verified devices into our private MULTIPATH
- * bookkeeping area. [whatever we allocate in multipath_run(),
- * should be freed in multipath_stop()]
- */
- conf = kmalloc(sizeof(multipath_conf_t), GFP_KERNEL);
- mddev->private = conf;
- if (!conf) {
- printk(MEM_ERROR, mdidx(mddev));
- goto out;
- }
- memset(conf, 0, sizeof(*conf));
- ITERATE_RDEV(mddev,rdev,tmp) {
- if (rdev->faulty) {
- /* this is a "should never happen" case and if it */
- /* ever does happen, a continue; won't help */
- printk(ERRORS, partition_name(rdev->dev));
- continue;
- } else {
- /* this is a "should never happen" case and if it */
- /* ever does happen, a continue; won't help */
- if (!rdev->sb) {
- MD_BUG();
- continue;
- }
- }
- if (rdev->desc_nr == -1) {
- MD_BUG();
- continue;
- }
- desc = &sb->disks[rdev->desc_nr];
- disk_idx = desc->raid_disk;
- disk = conf->multipaths + disk_idx;
- if (!disk_sync(desc))
- printk(NOT_IN_SYNC, partition_name(rdev->dev));
- /*
- * Mark all disks as spare to start with, then pick our
- * active disk. If we have a disk that is marked active
- * in the sb, then use it, else use the first rdev.
- */
- disk->number = desc->number;
- disk->raid_disk = desc->raid_disk;
- disk->dev = rdev->dev;
- disk->operational = 0;
- disk->spare = 1;
- disk->used_slot = 1;
- mark_disk_sync(desc);
- if (disk_active(desc)) {
- if(!conf->working_disks) {
- printk(OPERATIONAL, partition_name(rdev->dev),
- desc->raid_disk);
- disk->operational = 1;
- disk->spare = 0;
- conf->working_disks++;
- def_rdev = rdev;
- } else {
- mark_disk_spare(desc);
- }
- } else
- mark_disk_spare(desc);
- if(!num_rdevs++) def_rdev = rdev;
- }
- if(!conf->working_disks && num_rdevs) {
- desc = &sb->disks[def_rdev->desc_nr];
- disk = conf->multipaths + desc->raid_disk;
- printk(OPERATIONAL, partition_name(def_rdev->dev),
- disk->raid_disk);
- disk->operational = 1;
- disk->spare = 0;
- conf->working_disks++;
- mark_disk_active(desc);
- }
- /*
- * Make sure our active path is in desc spot 0
- */
- if(def_rdev->desc_nr != 0) {
- rdev = find_rdev_nr(mddev, 0);
- desc = &sb->disks[def_rdev->desc_nr];
- desc2 = sb->disks;
- disk = conf->multipaths + desc->raid_disk;
- disk2 = conf->multipaths + desc2->raid_disk;
- xchg_values(*desc2,*desc);
- xchg_values(*disk2,*disk);
- xchg_values(desc2->number, desc->number);
- xchg_values(disk2->number, disk->number);
- xchg_values(desc2->raid_disk, desc->raid_disk);
- xchg_values(disk2->raid_disk, disk->raid_disk);
- if(rdev) {
- xchg_values(def_rdev->desc_nr,rdev->desc_nr);
- } else {
- def_rdev->desc_nr = 0;
- }
- }
- conf->raid_disks = sb->raid_disks = sb->active_disks = 1;
- conf->nr_disks = sb->nr_disks = sb->working_disks = num_rdevs;
- sb->failed_disks = 0;
- sb->spare_disks = num_rdevs - 1;
- mddev->sb_dirty = 1;
- conf->mddev = mddev;
- conf->device_lock = MD_SPIN_LOCK_UNLOCKED;
- init_waitqueue_head(&conf->wait_buffer);
- if (!conf->working_disks) {
- printk(NONE_OPERATIONAL, mdidx(mddev));
- goto out_free_conf;
- }
- /* pre-allocate some buffer_head structures.
- * As a minimum, 1 mpbh and raid_disks buffer_heads
- * would probably get us by in tight memory situations,
- * but a few more is probably a good idea.
- * For now, try NR_RESERVED_BUFS mpbh and
- * NR_RESERVED_BUFS*raid_disks bufferheads
- * This will allow at least NR_RESERVED_BUFS concurrent
- * reads or writes even if kmalloc starts failing
- */
- if (multipath_grow_mpbh(conf, NR_RESERVED_BUFS) < NR_RESERVED_BUFS) {
- printk(MEM_ERROR, mdidx(mddev));
- goto out_free_conf;
- }
- if ((sb->state & (1 << MD_SB_CLEAN))) {
- /*
- * we do sanity checks even if the device says
- * it's clean ...
- */
- if (check_consistency(mddev)) {
- printk(SB_DIFFERENCES);
- sb->state &= ~(1 << MD_SB_CLEAN);
- }
- }
- {
- const char * name = "multipathd";
- conf->thread = md_register_thread(multipathd, conf, name);
- if (!conf->thread) {
- printk(THREAD_ERROR, mdidx(mddev));
- goto out_free_conf;
- }
- }
- /*
- * Regenerate the "device is in sync with the raid set" bit for
- * each device.
- */
- for (i = 0; i < MD_SB_DISKS; i++) {
- mark_disk_nonsync(sb->disks+i);
- for (j = 0; j < sb->raid_disks; j++) {
- if (sb->disks[i].number == conf->multipaths[j].number)
- mark_disk_sync(sb->disks+i);
- }
- }
- printk(ARRAY_IS_ACTIVE, mdidx(mddev), sb->active_disks,
- sb->raid_disks, sb->spare_disks);
- /*
- * Ok, everything is just fine now
- */
- return 0;
- out_free_conf:
- multipath_shrink_mpbh(conf);
- kfree(conf);
- mddev->private = NULL;
- out:
- MOD_DEC_USE_COUNT;
- return -EIO;
- }
- #undef INVALID_LEVEL
- #undef NO_SB
- #undef ERRORS
- #undef NOT_IN_SYNC
- #undef INCONSISTENT
- #undef ALREADY_RUNNING
- #undef OPERATIONAL
- #undef SPARE
- #undef NONE_OPERATIONAL
- #undef SB_DIFFERENCES
- #undef ARRAY_IS_ACTIVE
- static int multipath_stop (mddev_t *mddev)
- {
- multipath_conf_t *conf = mddev_to_conf(mddev);
- md_unregister_thread(conf->thread);
- multipath_shrink_mpbh(conf);
- kfree(conf);
- mddev->private = NULL;
- MOD_DEC_USE_COUNT;
- return 0;
- }
- static mdk_personality_t multipath_personality=
- {
- name: "multipath",
- make_request: multipath_make_request,
- run: multipath_run,
- stop: multipath_stop,
- status: multipath_status,
- error_handler: multipath_error,
- diskop: multipath_diskop,
- };
- static int md__init multipath_init (void)
- {
- return register_md_personality (MULTIPATH, &multipath_personality);
- }
- static void multipath_exit (void)
- {
- unregister_md_personality (MULTIPATH);
- }
- module_init(multipath_init);
- module_exit(multipath_exit);
- MODULE_LICENSE("GPL");