scsi_error.c
上传用户:lgb322
上传日期:2013-02-24
资源大小:30529k
文件大小:59k
- /*
- * scsi_error.c Copyright (C) 1997 Eric Youngdale
- *
- * SCSI error/timeout handling
- * Initial versions: Eric Youngdale. Based upon conversations with
- * Leonard Zubkoff and David Miller at Linux Expo,
- * ideas originating from all over the place.
- *
- */
- #define __NO_VERSION__
- #include <linux/module.h>
- #include <linux/sched.h>
- #include <linux/timer.h>
- #include <linux/string.h>
- #include <linux/slab.h>
- #include <linux/ioport.h>
- #include <linux/kernel.h>
- #include <linux/stat.h>
- #include <linux/blk.h>
- #include <linux/interrupt.h>
- #include <linux/delay.h>
- #include <linux/smp_lock.h>
- #define __KERNEL_SYSCALLS__
- #include <linux/unistd.h>
- #include <asm/system.h>
- #include <asm/irq.h>
- #include <asm/dma.h>
- #include "scsi.h"
- #include "hosts.h"
- #include "constants.h"
- /*
- * We must always allow SHUTDOWN_SIGS. Even if we are not a module,
- * the host drivers that we are using may be loaded as modules, and
- * when we unload these, we need to ensure that the error handler thread
- * can be shut down.
- *
- * Note - when we unload a module, we send a SIGHUP. We mustn't
- * enable SIGTERM, as this is how the init shuts things down when you
- * go to single-user mode. For that matter, init also sends SIGKILL,
- * so we mustn't enable that one either. We use SIGHUP instead. Other
- * options would be SIGPWR, I suppose.
- */
/* Signal mask used to shut the error handler thread down: SIGHUP only. */
#define SHUTDOWN_SIGS	(sigmask(SIGHUP))

/*
 * Timeouts, in jiffies, for the commands issued during error recovery.
 * When DEBUG is defined all three collapse to SCSI_TIMEOUT.
 */
#if !defined(DEBUG)
#define ABORT_TIMEOUT	(15*HZ)
#define RESET_TIMEOUT	(2*HZ)
#define SENSE_TIMEOUT	(10*HZ)
#else
#define ABORT_TIMEOUT	SCSI_TIMEOUT
#define RESET_TIMEOUT	SCSI_TIMEOUT
#define SENSE_TIMEOUT	SCSI_TIMEOUT
#endif

/*
 * STATIC expands to nothing, so functions declared STATIC in this file
 * keep external linkage.
 */
#define STATIC
/*
 * Time, in jiffies, to wait after a bus or host reset before talking to
 * the devices again.
 *
 * These should *probably* be handled by the host itself.
 * Since it is allowed to sleep, it probably should.
 *
 * The expansions are parenthesized so that the macros behave as a single
 * value inside larger expressions (e.g. "x / BUS_RESET_SETTLE_TIME").
 */
#define BUS_RESET_SETTLE_TIME	(5*HZ)
#define HOST_RESET_SETTLE_TIME	(10*HZ)
- static const char RCSid[] = "$Header: /mnt/ide/home/eric/CVSROOT/linux/drivers/scsi/scsi_error.c,v 1.10 1997/12/08 04:50:35 eric Exp $";
- STATIC int scsi_check_sense(Scsi_Cmnd * SCpnt);
- STATIC int scsi_request_sense(Scsi_Cmnd *);
- STATIC void scsi_send_eh_cmnd(Scsi_Cmnd * SCpnt, int timeout);
- STATIC int scsi_try_to_abort_command(Scsi_Cmnd *, int);
- STATIC int scsi_test_unit_ready(Scsi_Cmnd *);
- STATIC int scsi_try_bus_device_reset(Scsi_Cmnd *, int timeout);
- STATIC int scsi_try_bus_reset(Scsi_Cmnd *);
- STATIC int scsi_try_host_reset(Scsi_Cmnd *);
- STATIC int scsi_unit_is_ready(Scsi_Cmnd *);
- STATIC void scsi_eh_action_done(Scsi_Cmnd *, int);
- STATIC int scsi_eh_retry_command(Scsi_Cmnd *);
- STATIC int scsi_eh_completed_normally(Scsi_Cmnd * SCpnt);
- STATIC void scsi_restart_operations(struct Scsi_Host *);
- STATIC void scsi_eh_finish_command(Scsi_Cmnd ** SClist, Scsi_Cmnd * SCpnt);
- /*
- * Function: scsi_add_timer()
- *
- * Purpose: Start timeout timer for a single scsi command.
- *
- * Arguments: SCset - command that is about to start running.
- * timeout - amount of time to allow this command to run.
- * complete - timeout function to call if timer isn't
- * canceled.
- *
- * Returns: Nothing
- *
- * Notes: This should be turned into an inline function.
- *
- * More Notes: Each scsi command has it's own timer, and as it is added to
- * the queue, we set up the timer. When the command completes,
- * we cancel the timer. Pretty simple, really, especially
- * compared to the old way of handling this crap.
- */
- void scsi_add_timer(Scsi_Cmnd * SCset,
- int timeout,
- void (*complete) (Scsi_Cmnd *))
- {
- /*
- * If the clock was already running for this command, then
- * first delete the timer. The timer handling code gets rather
- * confused if we don't do this.
- */
- if (SCset->eh_timeout.function != NULL) {
- del_timer(&SCset->eh_timeout);
- }
- SCset->eh_timeout.data = (unsigned long) SCset;
- SCset->eh_timeout.expires = jiffies + timeout;
- SCset->eh_timeout.function = (void (*)(unsigned long)) complete;
- SCset->done_late = 0;
- SCSI_LOG_ERROR_RECOVERY(5, printk("Adding timer for command %p at %d (%p)n", SCset, timeout, complete));
- add_timer(&SCset->eh_timeout);
- }
- /*
- * Function: scsi_delete_timer()
- *
- * Purpose: Delete/cancel timer for a given function.
- *
- * Arguments: SCset - command that we are canceling timer for.
- *
- * Returns: 1 if we were able to detach the timer. 0 if we
- * blew it, and the timer function has already started
- * to run.
- *
- * Notes: This should be turned into an inline function.
- */
- int scsi_delete_timer(Scsi_Cmnd * SCset)
- {
- int rtn;
- rtn = del_timer(&SCset->eh_timeout);
- SCSI_LOG_ERROR_RECOVERY(5, printk("Clearing timer for command %p %dn", SCset, rtn));
- SCset->eh_timeout.data = (unsigned long) NULL;
- SCset->eh_timeout.function = NULL;
- return rtn;
- }
- /*
- * Function: scsi_times_out()
- *
- * Purpose: Timeout function for normal scsi commands..
- *
- * Arguments: SCpnt - command that is timing out.
- *
- * Returns: Nothing.
- *
- * Notes: We do not need to lock this. There is the potential for
- * a race only in that the normal completion handling might
- * run, but if the normal completion function determines
- * that the timer has already fired, then it mustn't do
- * anything.
- */
- void scsi_times_out(Scsi_Cmnd * SCpnt)
- {
- /*
- * Notify the low-level code that this operation failed and we are
- * reposessing the command.
- */
- #ifdef ERIC_neverdef
- /*
- * FIXME(eric)
- * Allow the host adapter to push a queue ordering tag
- * out to the bus to force the command in question to complete.
- * If the host wants to do this, then we just restart the timer
- * for the command. Before we really do this, some real thought
- * as to the optimum way to handle this should be done. We *do*
- * need to force ordering every so often to ensure that all requests
- * do eventually complete, but I am not sure if this is the best way
- * to actually go about it.
- *
- * Better yet, force a sync here, but don't block since we are in an
- * interrupt.
- */
- if (SCpnt->host->hostt->eh_ordered_queue_tag) {
- if ((*SCpnt->host->hostt->eh_ordered_queue_tag) (SCpnt)) {
- scsi_add_timer(SCpnt, SCpnt->internal_timeout,
- scsi_times_out);
- return;
- }
- }
- /*
- * FIXME(eric) - add a second special interface to handle this
- * case. Ideally that interface can also be used to request
- * a queu
- */
- if (SCpnt->host->can_queue) {
- SCpnt->host->hostt->queuecommand(SCpnt, NULL);
- }
- #endif
- /* Set the serial_number_at_timeout to the current serial_number */
- SCpnt->serial_number_at_timeout = SCpnt->serial_number;
- SCpnt->eh_state = FAILED;
- SCpnt->state = SCSI_STATE_TIMEOUT;
- SCpnt->owner = SCSI_OWNER_ERROR_HANDLER;
- SCpnt->host->in_recovery = 1;
- SCpnt->host->host_failed++;
- SCSI_LOG_TIMEOUT(3, printk("Command timed out active=%d busy=%d failed=%dn",
- atomic_read(&SCpnt->host->host_active),
- SCpnt->host->host_busy,
- SCpnt->host->host_failed));
- /*
- * If the host is having troubles, then look to see if this was the last
- * command that might have failed. If so, wake up the error handler.
- */
- if( SCpnt->host->eh_wait == NULL ) {
- panic("Error handler thread not present at %p %p %s %d",
- SCpnt, SCpnt->host, __FILE__, __LINE__);
- }
- if (SCpnt->host->host_busy == SCpnt->host->host_failed) {
- up(SCpnt->host->eh_wait);
- }
- }
- /*
- * Function scsi_block_when_processing_errors
- *
- * Purpose: Prevent more commands from being queued while error recovery
- * is taking place.
- *
- * Arguments: SDpnt - device on which we are performing recovery.
- *
- * Returns: FALSE The device was taken offline by error recovery.
- * TRUE OK to proceed.
- *
- * Notes: We block until the host is out of error recovery, and then
- * check to see whether the host or the device is offline.
- */
- int scsi_block_when_processing_errors(Scsi_Device * SDpnt)
- {
- SCSI_SLEEP(&SDpnt->host->host_wait, SDpnt->host->in_recovery);
- SCSI_LOG_ERROR_RECOVERY(5, printk("Open returning %dn", SDpnt->online));
- return SDpnt->online;
- }
- /*
- * Function: scsi_eh_times_out()
- *
- * Purpose: Timeout function for error handling.
- *
- * Arguments: SCpnt - command that is timing out.
- *
- * Returns: Nothing.
- *
- * Notes: During error handling, the kernel thread will be sleeping
- * waiting for some action to complete on the device. Our only
- * job is to record that it timed out, and to wake up the
- * thread.
- */
- STATIC
- void scsi_eh_times_out(Scsi_Cmnd * SCpnt)
- {
- SCpnt->eh_state = SCSI_STATE_TIMEOUT;
- SCSI_LOG_ERROR_RECOVERY(5, printk("In scsi_eh_times_out %pn", SCpnt));
- if (SCpnt->host->eh_action != NULL)
- up(SCpnt->host->eh_action);
- else
- printk("Missing scsi error handler threadn");
- }
- /*
- * Function: scsi_eh_done()
- *
- * Purpose: Completion function for error handling.
- *
- * Arguments: SCpnt - command that is timing out.
- *
- * Returns: Nothing.
- *
- * Notes: During error handling, the kernel thread will be sleeping
- * waiting for some action to complete on the device. Our only
- * job is to record that the action completed, and to wake up the
- * thread.
- */
- STATIC
- void scsi_eh_done(Scsi_Cmnd * SCpnt)
- {
- int rtn;
- /*
- * If the timeout handler is already running, then just set the
- * flag which says we finished late, and return. We have no
- * way of stopping the timeout handler from running, so we must
- * always defer to it.
- */
- rtn = del_timer(&SCpnt->eh_timeout);
- if (!rtn) {
- SCpnt->done_late = 1;
- return;
- }
- SCpnt->request.rq_status = RQ_SCSI_DONE;
- SCpnt->owner = SCSI_OWNER_ERROR_HANDLER;
- SCpnt->eh_state = SUCCESS;
- SCSI_LOG_ERROR_RECOVERY(5, printk("In eh_done %p result:%xn", SCpnt,
- SCpnt->result));
- if (SCpnt->host->eh_action != NULL)
- up(SCpnt->host->eh_action);
- }
- /*
- * Function: scsi_eh_action_done()
- *
- * Purpose: Completion function for error handling.
- *
- * Arguments: SCpnt - command that is timing out.
- * answer - boolean that indicates whether operation succeeded.
- *
- * Returns: Nothing.
- *
- * Notes: This callback is only used for abort and reset operations.
- */
- STATIC
- void scsi_eh_action_done(Scsi_Cmnd * SCpnt, int answer)
- {
- SCpnt->request.rq_status = RQ_SCSI_DONE;
- SCpnt->owner = SCSI_OWNER_ERROR_HANDLER;
- SCpnt->eh_state = (answer ? SUCCESS : FAILED);
- if (SCpnt->host->eh_action != NULL)
- up(SCpnt->host->eh_action);
- }
- /*
- * Function: scsi_sense_valid()
- *
- * Purpose: Determine whether a host has automatically obtained sense
- * information or not. If we have it, then give a recommendation
- * as to what we should do next.
- */
- int scsi_sense_valid(Scsi_Cmnd * SCpnt)
- {
- if (((SCpnt->sense_buffer[0] & 0x70) >> 4) != 7) {
- return FALSE;
- }
- return TRUE;
- }
- /*
- * Function: scsi_eh_retry_command()
- *
- * Purpose: Retry the original command
- *
- * Returns: SUCCESS - we were able to get the sense data.
- * FAILED - we were not able to get the sense data.
- *
- * Notes: This function will *NOT* return until the command either
- * times out, or it completes.
- */
- STATIC int scsi_eh_retry_command(Scsi_Cmnd * SCpnt)
- {
- memcpy((void *) SCpnt->cmnd, (void *) SCpnt->data_cmnd,
- sizeof(SCpnt->data_cmnd));
- SCpnt->request_buffer = SCpnt->buffer;
- SCpnt->request_bufflen = SCpnt->bufflen;
- SCpnt->use_sg = SCpnt->old_use_sg;
- SCpnt->cmd_len = SCpnt->old_cmd_len;
- SCpnt->sc_data_direction = SCpnt->sc_old_data_direction;
- SCpnt->underflow = SCpnt->old_underflow;
- scsi_send_eh_cmnd(SCpnt, SCpnt->timeout_per_command);
- /*
- * Hey, we are done. Let's look to see what happened.
- */
- return SCpnt->eh_state;
- }
- /*
- * Function: scsi_request_sense()
- *
- * Purpose: Request sense data from a particular target.
- *
- * Returns: SUCCESS - we were able to get the sense data.
- * FAILED - we were not able to get the sense data.
- *
- * Notes: Some hosts automatically obtain this information, others
- * require that we obtain it on our own.
- *
- * This function will *NOT* return until the command either
- * times out, or it completes.
- */
- STATIC int scsi_request_sense(Scsi_Cmnd * SCpnt)
- {
- static unsigned char generic_sense[6] =
- {REQUEST_SENSE, 0, 0, 0, 255, 0};
- unsigned char scsi_result0[256], *scsi_result = NULL;
- int saved_result;
- ASSERT_LOCK(&io_request_lock, 0);
- memcpy((void *) SCpnt->cmnd, (void *) generic_sense,
- sizeof(generic_sense));
- if (SCpnt->device->scsi_level <= SCSI_2)
- SCpnt->cmnd[1] = SCpnt->lun << 5;
- scsi_result = (!SCpnt->host->hostt->unchecked_isa_dma)
- ? &scsi_result0[0] : kmalloc(512, GFP_ATOMIC | GFP_DMA);
- if (scsi_result == NULL) {
- printk("cannot allocate scsi_result in scsi_request_sense.n");
- return FAILED;
- }
- /*
- * Zero the sense buffer. Some host adapters automatically always request
- * sense, so it is not a good idea that SCpnt->request_buffer and
- * SCpnt->sense_buffer point to the same address (DB).
- * 0 is not a valid sense code.
- */
- memset((void *) SCpnt->sense_buffer, 0, sizeof(SCpnt->sense_buffer));
- memset((void *) scsi_result, 0, 256);
- saved_result = SCpnt->result;
- SCpnt->request_buffer = scsi_result;
- SCpnt->request_bufflen = 256;
- SCpnt->use_sg = 0;
- SCpnt->cmd_len = COMMAND_SIZE(SCpnt->cmnd[0]);
- SCpnt->sc_data_direction = SCSI_DATA_READ;
- SCpnt->underflow = 0;
- scsi_send_eh_cmnd(SCpnt, SENSE_TIMEOUT);
- /* Last chance to have valid sense data */
- if (!scsi_sense_valid(SCpnt))
- memcpy((void *) SCpnt->sense_buffer,
- SCpnt->request_buffer,
- sizeof(SCpnt->sense_buffer));
- if (scsi_result != &scsi_result0[0] && scsi_result != NULL)
- kfree(scsi_result);
- /*
- * When we eventually call scsi_finish, we really wish to complete
- * the original request, so let's restore the original data. (DB)
- */
- memcpy((void *) SCpnt->cmnd, (void *) SCpnt->data_cmnd,
- sizeof(SCpnt->data_cmnd));
- SCpnt->result = saved_result;
- SCpnt->request_buffer = SCpnt->buffer;
- SCpnt->request_bufflen = SCpnt->bufflen;
- SCpnt->use_sg = SCpnt->old_use_sg;
- SCpnt->cmd_len = SCpnt->old_cmd_len;
- SCpnt->sc_data_direction = SCpnt->sc_old_data_direction;
- SCpnt->underflow = SCpnt->old_underflow;
- /*
- * Hey, we are done. Let's look to see what happened.
- */
- return SCpnt->eh_state;
- }
- /*
- * Function: scsi_test_unit_ready()
- *
- * Purpose: Run test unit ready command to see if the device is talking to us or not.
- *
- */
- STATIC int scsi_test_unit_ready(Scsi_Cmnd * SCpnt)
- {
- static unsigned char tur_command[6] =
- {TEST_UNIT_READY, 0, 0, 0, 0, 0};
- memcpy((void *) SCpnt->cmnd, (void *) tur_command,
- sizeof(tur_command));
- if (SCpnt->device->scsi_level <= SCSI_2)
- SCpnt->cmnd[1] = SCpnt->lun << 5;
- /*
- * Zero the sense buffer. The SCSI spec mandates that any
- * untransferred sense data should be interpreted as being zero.
- */
- memset((void *) SCpnt->sense_buffer, 0, sizeof(SCpnt->sense_buffer));
- SCpnt->request_buffer = NULL;
- SCpnt->request_bufflen = 0;
- SCpnt->use_sg = 0;
- SCpnt->cmd_len = COMMAND_SIZE(SCpnt->cmnd[0]);
- SCpnt->underflow = 0;
- SCpnt->sc_data_direction = SCSI_DATA_NONE;
- scsi_send_eh_cmnd(SCpnt, SENSE_TIMEOUT);
- /*
- * When we eventually call scsi_finish, we really wish to complete
- * the original request, so let's restore the original data. (DB)
- */
- memcpy((void *) SCpnt->cmnd, (void *) SCpnt->data_cmnd,
- sizeof(SCpnt->data_cmnd));
- SCpnt->request_buffer = SCpnt->buffer;
- SCpnt->request_bufflen = SCpnt->bufflen;
- SCpnt->use_sg = SCpnt->old_use_sg;
- SCpnt->cmd_len = SCpnt->old_cmd_len;
- SCpnt->sc_data_direction = SCpnt->sc_old_data_direction;
- SCpnt->underflow = SCpnt->old_underflow;
- /*
- * Hey, we are done. Let's look to see what happened.
- */
- SCSI_LOG_ERROR_RECOVERY(3,
- printk("scsi_test_unit_ready: SCpnt %p eh_state %xn",
- SCpnt, SCpnt->eh_state));
- return SCpnt->eh_state;
- }
- /*
- * This would normally need to get the IO request lock,
- * but as it doesn't actually touch anything that needs
- * to be locked we can avoid the lock here..
- */
- STATIC
- void scsi_sleep_done(struct semaphore *sem)
- {
- if (sem != NULL) {
- up(sem);
- }
- }
- void scsi_sleep(int timeout)
- {
- DECLARE_MUTEX_LOCKED(sem);
- struct timer_list timer;
- init_timer(&timer);
- timer.data = (unsigned long) &sem;
- timer.expires = jiffies + timeout;
- timer.function = (void (*)(unsigned long)) scsi_sleep_done;
- SCSI_LOG_ERROR_RECOVERY(5, printk("Sleeping for timer tics %dn", timeout));
- add_timer(&timer);
- down(&sem);
- del_timer(&timer);
- }
- /*
- * Function: scsi_send_eh_cmnd
- *
- * Purpose: Send a command out to a device as part of error recovery.
- *
- * Notes: The initialization of the structures is quite a bit different
- * in this case, and furthermore, there is a different completion
- * handler.
- */
- STATIC void scsi_send_eh_cmnd(Scsi_Cmnd * SCpnt, int timeout)
- {
- unsigned long flags;
- struct Scsi_Host *host;
- ASSERT_LOCK(&io_request_lock, 0);
- host = SCpnt->host;
- retry:
- /*
- * We will use a queued command if possible, otherwise we will emulate the
- * queuing and calling of completion function ourselves.
- */
- SCpnt->owner = SCSI_OWNER_LOWLEVEL;
- if (host->can_queue) {
- DECLARE_MUTEX_LOCKED(sem);
- SCpnt->eh_state = SCSI_STATE_QUEUED;
- scsi_add_timer(SCpnt, timeout, scsi_eh_times_out);
- /*
- * Set up the semaphore so we wait for the command to complete.
- */
- SCpnt->host->eh_action = &sem;
- SCpnt->request.rq_status = RQ_SCSI_BUSY;
- spin_lock_irqsave(&io_request_lock, flags);
- host->hostt->queuecommand(SCpnt, scsi_eh_done);
- spin_unlock_irqrestore(&io_request_lock, flags);
- down(&sem);
- SCpnt->host->eh_action = NULL;
- /*
- * See if timeout. If so, tell the host to forget about it.
- * In other words, we don't want a callback any more.
- */
- if (SCpnt->eh_state == SCSI_STATE_TIMEOUT) {
- SCpnt->owner = SCSI_OWNER_LOWLEVEL;
- /*
- * As far as the low level driver is
- * concerned, this command is still active, so
- * we must give the low level driver a chance
- * to abort it. (DB)
- *
- * FIXME(eric) - we are not tracking whether we could
- * abort a timed out command or not. Not sure how
- * we should treat them differently anyways.
- */
- spin_lock_irqsave(&io_request_lock, flags);
- if (SCpnt->host->hostt->eh_abort_handler)
- SCpnt->host->hostt->eh_abort_handler(SCpnt);
- spin_unlock_irqrestore(&io_request_lock, flags);
-
- SCpnt->request.rq_status = RQ_SCSI_DONE;
- SCpnt->owner = SCSI_OWNER_ERROR_HANDLER;
-
- SCpnt->eh_state = FAILED;
- }
- SCSI_LOG_ERROR_RECOVERY(5, printk("send_eh_cmnd: %p eh_state:%xn",
- SCpnt, SCpnt->eh_state));
- } else {
- int temp;
- /*
- * We damn well had better never use this code. There is no timeout
- * protection here, since we would end up waiting in the actual low
- * level driver, we don't know how to wake it up.
- */
- spin_lock_irqsave(&io_request_lock, flags);
- temp = host->hostt->command(SCpnt);
- spin_unlock_irqrestore(&io_request_lock, flags);
- SCpnt->result = temp;
- /* Fall through to code below to examine status. */
- SCpnt->eh_state = SUCCESS;
- }
- /*
- * Now examine the actual status codes to see whether the command actually
- * did complete normally.
- */
- if (SCpnt->eh_state == SUCCESS) {
- int ret = scsi_eh_completed_normally(SCpnt);
- SCSI_LOG_ERROR_RECOVERY(3,
- printk("scsi_send_eh_cmnd: scsi_eh_completed_normally %xn", ret));
- switch (ret) {
- case SUCCESS:
- SCpnt->eh_state = SUCCESS;
- break;
- case NEEDS_RETRY:
- goto retry;
- case FAILED:
- default:
- SCpnt->eh_state = FAILED;
- break;
- }
- } else {
- SCpnt->eh_state = FAILED;
- }
- }
- /*
- * Function: scsi_unit_is_ready()
- *
- * Purpose: Called after TEST_UNIT_READY is run, to test to see if
- * the unit responded in a way that indicates it is ready.
- */
- STATIC int scsi_unit_is_ready(Scsi_Cmnd * SCpnt)
- {
- if (SCpnt->result) {
- if (((driver_byte(SCpnt->result) & DRIVER_SENSE) ||
- (status_byte(SCpnt->result) & CHECK_CONDITION)) &&
- ((SCpnt->sense_buffer[0] & 0x70) >> 4) == 7) {
- if (((SCpnt->sense_buffer[2] & 0xf) != NOT_READY) &&
- ((SCpnt->sense_buffer[2] & 0xf) != UNIT_ATTENTION) &&
- ((SCpnt->sense_buffer[2] & 0xf) != ILLEGAL_REQUEST)) {
- return 0;
- }
- }
- }
- return 1;
- }
- /*
- * Function: scsi_eh_finish_command
- *
- * Purpose: Handle a command that we are finished with WRT error handling.
- *
- * Arguments: SClist - pointer to list into which we are putting completed commands.
- * SCpnt - command that is completing
- *
- * Notes: We don't want to use the normal command completion while we are
- * are still handling errors - it may cause other commands to be queued,
- * and that would disturb what we are doing. Thus we really want to keep
- * a list of pending commands for final completion, and once we
- * are ready to leave error handling we handle completion for real.
- */
- STATIC void scsi_eh_finish_command(Scsi_Cmnd ** SClist, Scsi_Cmnd * SCpnt)
- {
- SCpnt->state = SCSI_STATE_BHQUEUE;
- SCpnt->bh_next = *SClist;
- /*
- * Set this back so that the upper level can correctly free up
- * things.
- */
- SCpnt->use_sg = SCpnt->old_use_sg;
- SCpnt->sc_data_direction = SCpnt->sc_old_data_direction;
- SCpnt->underflow = SCpnt->old_underflow;
- *SClist = SCpnt;
- }
- /*
- * Function: scsi_try_to_abort_command
- *
- * Purpose: Ask host adapter to abort a running command.
- *
- * Returns: FAILED Operation failed or not supported.
- * SUCCESS Succeeded.
- *
- * Notes: This function will not return until the user's completion
- * function has been called. There is no timeout on this
- * operation. If the author of the low-level driver wishes
- * this operation to be timed, they can provide this facility
- * themselves. Helper functions in scsi_error.c can be supplied
- * to make this easier to do.
- *
- * Notes: It may be possible to combine this with all of the reset
- * handling to eliminate a lot of code duplication. I don't
- * know what makes more sense at the moment - this is just a
- * prototype.
- */
- STATIC int scsi_try_to_abort_command(Scsi_Cmnd * SCpnt, int timeout)
- {
- int rtn;
- unsigned long flags;
- SCpnt->eh_state = FAILED; /* Until we come up with something better */
- if (SCpnt->host->hostt->eh_abort_handler == NULL) {
- return FAILED;
- }
- /*
- * scsi_done was called just after the command timed out and before
- * we had a chance to process it. (DB)
- */
- if (SCpnt->serial_number == 0)
- return SUCCESS;
- SCpnt->owner = SCSI_OWNER_LOWLEVEL;
- spin_lock_irqsave(&io_request_lock, flags);
- rtn = SCpnt->host->hostt->eh_abort_handler(SCpnt);
- spin_unlock_irqrestore(&io_request_lock, flags);
- return rtn;
- }
- /*
- * Function: scsi_try_bus_device_reset
- *
- * Purpose: Ask host adapter to perform a bus device reset for a given
- * device.
- *
- * Returns: FAILED Operation failed or not supported.
- * SUCCESS Succeeded.
- *
- * Notes: There is no timeout for this operation. If this operation is
- * unreliable for a given host, then the host itself needs to put a
- * timer on it, and set the host back to a consistent state prior
- * to returning.
- */
- STATIC int scsi_try_bus_device_reset(Scsi_Cmnd * SCpnt, int timeout)
- {
- unsigned long flags;
- int rtn;
- SCpnt->eh_state = FAILED; /* Until we come up with something better */
- if (SCpnt->host->hostt->eh_device_reset_handler == NULL) {
- return FAILED;
- }
- SCpnt->owner = SCSI_OWNER_LOWLEVEL;
- spin_lock_irqsave(&io_request_lock, flags);
- rtn = SCpnt->host->hostt->eh_device_reset_handler(SCpnt);
- spin_unlock_irqrestore(&io_request_lock, flags);
- if (rtn == SUCCESS)
- SCpnt->eh_state = SUCCESS;
- return SCpnt->eh_state;
- }
- /*
- * Function: scsi_try_bus_reset
- *
- * Purpose: Ask host adapter to perform a bus reset for a host.
- *
- * Returns: FAILED Operation failed or not supported.
- * SUCCESS Succeeded.
- *
- * Notes:
- */
- STATIC int scsi_try_bus_reset(Scsi_Cmnd * SCpnt)
- {
- unsigned long flags;
- int rtn;
- SCpnt->eh_state = FAILED; /* Until we come up with something better */
- SCpnt->owner = SCSI_OWNER_LOWLEVEL;
- SCpnt->serial_number_at_timeout = SCpnt->serial_number;
- if (SCpnt->host->hostt->eh_bus_reset_handler == NULL) {
- return FAILED;
- }
- spin_lock_irqsave(&io_request_lock, flags);
- rtn = SCpnt->host->hostt->eh_bus_reset_handler(SCpnt);
- spin_unlock_irqrestore(&io_request_lock, flags);
- if (rtn == SUCCESS)
- SCpnt->eh_state = SUCCESS;
- /*
- * If we had a successful bus reset, mark the command blocks to expect
- * a condition code of unit attention.
- */
- scsi_sleep(BUS_RESET_SETTLE_TIME);
- if (SCpnt->eh_state == SUCCESS) {
- Scsi_Device *SDloop;
- for (SDloop = SCpnt->host->host_queue; SDloop; SDloop = SDloop->next) {
- if (SCpnt->channel == SDloop->channel) {
- SDloop->was_reset = 1;
- SDloop->expecting_cc_ua = 1;
- }
- }
- }
- return SCpnt->eh_state;
- }
- /*
- * Function: scsi_try_host_reset
- *
- * Purpose: Ask host adapter to reset itself, and the bus.
- *
- * Returns: FAILED Operation failed or not supported.
- * SUCCESS Succeeded.
- *
- * Notes:
- */
- STATIC int scsi_try_host_reset(Scsi_Cmnd * SCpnt)
- {
- unsigned long flags;
- int rtn;
- SCpnt->eh_state = FAILED; /* Until we come up with something better */
- SCpnt->owner = SCSI_OWNER_LOWLEVEL;
- SCpnt->serial_number_at_timeout = SCpnt->serial_number;
- if (SCpnt->host->hostt->eh_host_reset_handler == NULL) {
- return FAILED;
- }
- spin_lock_irqsave(&io_request_lock, flags);
- rtn = SCpnt->host->hostt->eh_host_reset_handler(SCpnt);
- spin_unlock_irqrestore(&io_request_lock, flags);
- if (rtn == SUCCESS)
- SCpnt->eh_state = SUCCESS;
- /*
- * If we had a successful host reset, mark the command blocks to expect
- * a condition code of unit attention.
- */
- scsi_sleep(HOST_RESET_SETTLE_TIME);
- if (SCpnt->eh_state == SUCCESS) {
- Scsi_Device *SDloop;
- for (SDloop = SCpnt->host->host_queue; SDloop; SDloop = SDloop->next) {
- SDloop->was_reset = 1;
- SDloop->expecting_cc_ua = 1;
- }
- }
- return SCpnt->eh_state;
- }
- /*
- * Function: scsi_decide_disposition
- *
- * Purpose: Examine a command block that has come back from the low-level
- * and figure out what to do next.
- *
- * Returns: SUCCESS - pass on to upper level.
- * FAILED - pass on to error handler thread.
- * RETRY - command should be retried.
- * SOFTERR - command succeeded, but we need to log
- * a soft error.
- *
- * Notes: This is *ONLY* called when we are examining the status
- * after sending out the actual data command. Any commands
- * that are queued for error recovery (i.e. TEST_UNIT_READY)
- * do *NOT* come through here.
- *
- * NOTE - When this routine returns FAILED, it means the error
- * handler thread is woken. In cases where the error code
- * indicates an error that doesn't require the error handler
- * thread (i.e. we don't need to abort/reset), then this function
- * should return SUCCESS.
- */
- int scsi_decide_disposition(Scsi_Cmnd * SCpnt)
- {
- int rtn;
- /*
- * If the device is offline, then we clearly just pass the result back
- * up to the top level.
- */
- if (SCpnt->device->online == FALSE) {
- SCSI_LOG_ERROR_RECOVERY(5, printk("scsi_error.c: device offline - report as SUCCESSn"));
- return SUCCESS;
- }
- /*
- * First check the host byte, to see if there is anything in there
- * that would indicate what we need to do.
- */
- switch (host_byte(SCpnt->result)) {
- case DID_PASSTHROUGH:
- /*
- * No matter what, pass this through to the upper layer.
- * Nuke this special code so that it looks like we are saying
- * DID_OK.
- */
- SCpnt->result &= 0xff00ffff;
- return SUCCESS;
- case DID_OK:
- /*
- * Looks good. Drop through, and check the next byte.
- */
- break;
- case DID_NO_CONNECT:
- case DID_BAD_TARGET:
- case DID_ABORT:
- /*
- * Note - this means that we just report the status back to the
- * top level driver, not that we actually think that it indicates
- * success.
- */
- return SUCCESS;
- /*
- * When the low level driver returns DID_SOFT_ERROR,
- * it is responsible for keeping an internal retry counter
- * in order to avoid endless loops (DB)
- *
- * Actually this is a bug in this function here. We should
- * be mindful of the maximum number of retries specified
- * and not get stuck in a loop.
- */
- case DID_SOFT_ERROR:
- goto maybe_retry;
- case DID_BUS_BUSY:
- case DID_PARITY:
- case DID_ERROR:
- goto maybe_retry;
- case DID_TIME_OUT:
- /*
- * When we scan the bus, we get timeout messages for
- * these commands if there is no device available.
- * Other hosts report DID_NO_CONNECT for the same thing.
- */
- if ((SCpnt->cmnd[0] == TEST_UNIT_READY ||
- SCpnt->cmnd[0] == INQUIRY)) {
- return SUCCESS;
- } else {
- return FAILED;
- }
- case DID_RESET:
- /*
- * In the normal case where we haven't initiated a reset, this is
- * a failure.
- */
- if (SCpnt->flags & IS_RESETTING) {
- SCpnt->flags &= ~IS_RESETTING;
- goto maybe_retry;
- }
- /*
- * Examine the sense data to figure out how to proceed from here.
- * If there is no sense data, we will be forced into the error
- * handler thread, where we get to examine the thing in a lot more
- * detail.
- */
- return scsi_check_sense(SCpnt);
- default:
- return FAILED;
- }
- /*
- * Next, check the message byte.
- */
- if (msg_byte(SCpnt->result) != COMMAND_COMPLETE) {
- return FAILED;
- }
- /*
- * Now, check the status byte to see if this indicates anything special.
- */
- switch (status_byte(SCpnt->result)) {
- case QUEUE_FULL:
- /*
- * The case of trying to send too many commands to a tagged queueing
- * device.
- */
- return ADD_TO_MLQUEUE;
- case GOOD:
- case COMMAND_TERMINATED:
- return SUCCESS;
- case CHECK_CONDITION:
- rtn = scsi_check_sense(SCpnt);
- if (rtn == NEEDS_RETRY) {
- goto maybe_retry;
- }
- return rtn;
- case CONDITION_GOOD:
- case INTERMEDIATE_GOOD:
- case INTERMEDIATE_C_GOOD:
- /*
- * Who knows? FIXME(eric)
- */
- return SUCCESS;
- case BUSY:
- case RESERVATION_CONFLICT:
- goto maybe_retry;
- default:
- return FAILED;
- }
- return FAILED;
- maybe_retry:
- if ((++SCpnt->retries) < SCpnt->allowed) {
- return NEEDS_RETRY;
- } else {
- /*
- * No more retries - report this one back to upper level.
- */
- return SUCCESS;
- }
- }
- /*
- * Function: scsi_eh_completed_normally
- *
- * Purpose: Examine a command block that has come back from the low-level
- * and figure out what to do next.
- *
- * Returns: SUCCESS - pass on to upper level.
- * FAILED - pass on to error handler thread.
- * RETRY - command should be retried.
- * SOFTERR - command succeeded, but we need to log
- * a soft error.
- *
- * Notes: This is *ONLY* called when we are examining the status
- * of commands queued during error recovery. The main
- * difference here is that we don't allow for the possibility
- * of retries here, and we are a lot more restrictive about what
- * we consider acceptable.
- */
- STATIC int scsi_eh_completed_normally(Scsi_Cmnd * SCpnt)
- {
- /*
- * First check the host byte, to see if there is anything in there
- * that would indicate what we need to do.
- */
- if (host_byte(SCpnt->result) == DID_RESET) {
- if (SCpnt->flags & IS_RESETTING) {
- /*
- * OK, this is normal. We don't know whether in fact the
- * command in question really needs to be rerun or not -
- * if this was the original data command then the answer is yes,
- * otherwise we just flag it as success.
- */
- SCpnt->flags &= ~IS_RESETTING;
- return NEEDS_RETRY;
- }
- /*
- * Rats. We are already in the error handler, so we now get to try
- * and figure out what to do next. If the sense is valid, we have
- * a pretty good idea of what to do. If not, we mark it as failed.
- */
- return scsi_check_sense(SCpnt);
- }
- if (host_byte(SCpnt->result) != DID_OK) {
- return FAILED;
- }
- /*
- * Next, check the message byte.
- */
- if (msg_byte(SCpnt->result) != COMMAND_COMPLETE) {
- return FAILED;
- }
- /*
- * Now, check the status byte to see if this indicates anything special.
- */
- switch (status_byte(SCpnt->result)) {
- case GOOD:
- case COMMAND_TERMINATED:
- return SUCCESS;
- case CHECK_CONDITION:
- return scsi_check_sense(SCpnt);
- case CONDITION_GOOD:
- case INTERMEDIATE_GOOD:
- case INTERMEDIATE_C_GOOD:
- /*
- * Who knows? FIXME(eric)
- */
- return SUCCESS;
- case BUSY:
- case QUEUE_FULL:
- case RESERVATION_CONFLICT:
- default:
- return FAILED;
- }
- return FAILED;
- }
- /*
- * Function: scsi_check_sense
- *
- * Purpose: Examine sense information - give suggestion as to what
- * we should do with it.
- */
- STATIC int scsi_check_sense(Scsi_Cmnd * SCpnt)
- {
- if (!scsi_sense_valid(SCpnt)) {
- return FAILED;
- }
- if (SCpnt->sense_buffer[2] & 0xe0)
- return SUCCESS;
- switch (SCpnt->sense_buffer[2] & 0xf) {
- case NO_SENSE:
- return SUCCESS;
- case RECOVERED_ERROR:
- return /* SOFT_ERROR */ SUCCESS;
- case ABORTED_COMMAND:
- return NEEDS_RETRY;
- case NOT_READY:
- case UNIT_ATTENTION:
- /*
- * If we are expecting a CC/UA because of a bus reset that we
- * performed, treat this just as a retry. Otherwise this is
- * information that we should pass up to the upper-level driver
- * so that we can deal with it there.
- */
- if (SCpnt->device->expecting_cc_ua) {
- SCpnt->device->expecting_cc_ua = 0;
- return NEEDS_RETRY;
- }
- /*
- * If the device is in the process of becoming ready, we
- * should retry.
- */
- if ((SCpnt->sense_buffer[12] == 0x04) &&
- (SCpnt->sense_buffer[13] == 0x01)) {
- return NEEDS_RETRY;
- }
- return SUCCESS;
- /* these three are not supported */
- case COPY_ABORTED:
- case VOLUME_OVERFLOW:
- case MISCOMPARE:
- return SUCCESS;
- case MEDIUM_ERROR:
- return NEEDS_RETRY;
- case ILLEGAL_REQUEST:
- case BLANK_CHECK:
- case DATA_PROTECT:
- case HARDWARE_ERROR:
- default:
- return SUCCESS;
- }
- }
- /*
- * Function: scsi_restart_operations
- *
- * Purpose: Restart IO operations to the specified host.
- *
- * Arguments: host - host that we are restarting
- *
- * Lock status: Assumed that locks are not held upon entry.
- *
- * Returns: Nothing
- *
- * Notes: When we entered the error handler, we blocked all further
- * I/O to this device. We need to 'reverse' this process.
- */
- STATIC void scsi_restart_operations(struct Scsi_Host *host)
- {
- Scsi_Device *SDpnt;
- unsigned long flags;
- ASSERT_LOCK(&io_request_lock, 0);
- /*
- * Next free up anything directly waiting upon the host. This will be
- * requests for character device operations, and also for ioctls to queued
- * block devices.
- */
- SCSI_LOG_ERROR_RECOVERY(5, printk("scsi_error.c: Waking up host to restartn"));
- wake_up(&host->host_wait);
- /*
- * Finally we need to re-initiate requests that may be pending. We will
- * have had everything blocked while error handling is taking place, and
- * now that error recovery is done, we will need to ensure that these
- * requests are started.
- */
- spin_lock_irqsave(&io_request_lock, flags);
- for (SDpnt = host->host_queue; SDpnt; SDpnt = SDpnt->next) {
- request_queue_t *q;
- if ((host->can_queue > 0 && (host->host_busy >= host->can_queue))
- || (host->host_blocked)
- || (host->host_self_blocked)
- || (SDpnt->device_blocked)) {
- break;
- }
- q = &SDpnt->request_queue;
- q->request_fn(q);
- }
- spin_unlock_irqrestore(&io_request_lock, flags);
- }
- /*
- * Function: scsi_unjam_host
- *
- * Purpose: Attempt to fix a host which has a command that failed for
- * some reason.
- *
- * Arguments: host - host that needs unjamming.
- *
- * Returns: Nothing
- *
- * Notes: When we come in here, we *know* that all commands on the
- * bus have either completed, failed or timed out. We also
- * know that no further commands are being sent to the host,
- * so things are relatively quiet and we have freedom to
- * fiddle with things as we wish.
- *
- * Additional note: This is only the *default* implementation. It is possible
- * for individual drivers to supply their own version of this
- * function, and if the maintainer wishes to do this, it is
- * strongly suggested that this function be taken as a template
- * and modified. This function was designed to correctly handle
- * problems for about 95% of the different cases out there, and
- * it should always provide at least a reasonable amount of error
- * recovery.
- *
- * Note3: Any command marked 'FAILED' or 'TIMEOUT' must eventually
- * have scsi_finish_command() called for it. We do all of
- * the retry stuff here, so when we restart the host after we
- * return it should have an empty queue.
- */
- STATIC int scsi_unjam_host(struct Scsi_Host *host)
- {
- int devices_failed;
- int numfailed;
- int ourrtn;
- int rtn = FALSE;
- int result;
- Scsi_Cmnd *SCloop;
- Scsi_Cmnd *SCpnt;
- Scsi_Device *SDpnt;
- Scsi_Device *SDloop;
- Scsi_Cmnd *SCdone;
- int timed_out;
- ASSERT_LOCK(&io_request_lock, 0);
- SCdone = NULL;
- /*
- * First, protect against any sort of race condition. If any of the outstanding
- * commands are in states that indicate that we are not yet blocked (i.e. we are
- * not in a quiet state) then we got woken up in error. If we ever end up here,
- * we need to re-examine some of the assumptions.
- */
- for (SDpnt = host->host_queue; SDpnt; SDpnt = SDpnt->next) {
- for (SCpnt = SDpnt->device_queue; SCpnt; SCpnt = SCpnt->next) {
- if (SCpnt->state == SCSI_STATE_FAILED
- || SCpnt->state == SCSI_STATE_TIMEOUT
- || SCpnt->state == SCSI_STATE_INITIALIZING
- || SCpnt->state == SCSI_STATE_UNUSED) {
- continue;
- }
- /*
- * Rats. Something is still floating around out there. This could
- * be the result of the fact that the upper level drivers are still frobbing
- * commands that might have succeeded. There are two outcomes. One is that
- * the command block will eventually be freed, and the other one is that
- * the command will be queued and will be finished along the way.
- */
- SCSI_LOG_ERROR_RECOVERY(1, printk("Error handler prematurely woken - commands still active (%p %x %d)n", SCpnt, SCpnt->state, SCpnt->target));
- /*
- * panic("SCSI Error handler woken too earlyn");
- *
- * This is no longer a problem, since now the code cares only about
- * SCSI_STATE_TIMEOUT and SCSI_STATE_FAILED.
- * Other states are useful only to release active commands when devices are
- * set offline. If (host->host_active == host->host_busy) we can safely assume
- * that there are no commands in state other then TIMEOUT od FAILED. (DB)
- *
- * FIXME:
- * It is not easy to release correctly commands according to their state when
- * devices are set offline, when the state is neither TIMEOUT nor FAILED.
- * When a device is set offline, we can have some command with
- * rq_status=RQ_SCSY_BUSY, owner=SCSI_STATE_HIGHLEVEL,
- * state=SCSI_STATE_INITIALIZING and the driver module cannot be released.
- * (DB, 17 May 1998)
- */
- }
- }
- /*
- * Next, see if we need to request sense information. if so,
- * then get it now, so we have a better idea of what to do.
- * FIXME(eric) this has the unfortunate side effect that if a host
- * adapter does not automatically request sense information, that we end
- * up shutting it down before we request it. All hosts should be doing this
- * anyways, so for now all I have to say is tough noogies if you end up in here.
- * On second thought, this is probably a good idea. We *really* want to give
- * authors an incentive to automatically request this.
- */
- SCSI_LOG_ERROR_RECOVERY(3, printk("scsi_unjam_host: Checking to see if we need to request sensen"));
- for (SDpnt = host->host_queue; SDpnt; SDpnt = SDpnt->next) {
- for (SCpnt = SDpnt->device_queue; SCpnt; SCpnt = SCpnt->next) {
- if (SCpnt->state != SCSI_STATE_FAILED || scsi_sense_valid(SCpnt)) {
- continue;
- }
- SCSI_LOG_ERROR_RECOVERY(2, printk("scsi_unjam_host: Requesting sense for %dn",
- SCpnt->target));
- rtn = scsi_request_sense(SCpnt);
- if (rtn != SUCCESS) {
- continue;
- }
- SCSI_LOG_ERROR_RECOVERY(3, printk("Sense requested for %p - result %xn",
- SCpnt, SCpnt->result));
- SCSI_LOG_ERROR_RECOVERY(3, print_sense("bh", SCpnt));
- result = scsi_decide_disposition(SCpnt);
- /*
- * If the result was normal, then just pass it along to the
- * upper level.
- */
- if (result == SUCCESS) {
- SCpnt->host->host_failed--;
- scsi_eh_finish_command(&SCdone, SCpnt);
- }
- if (result != NEEDS_RETRY) {
- continue;
- }
- /*
- * We only come in here if we want to retry a
- * command. The test to see whether the command
- * should be retried should be keeping track of the
- * number of tries, so we don't end up looping, of
- * course.
- */
- SCpnt->state = NEEDS_RETRY;
- rtn = scsi_eh_retry_command(SCpnt);
- if (rtn != SUCCESS) {
- continue;
- }
- /*
- * We eventually hand this one back to the top level.
- */
- SCpnt->host->host_failed--;
- scsi_eh_finish_command(&SCdone, SCpnt);
- }
- }
- /*
- * Go through the list of commands and figure out where we stand and how bad things
- * really are.
- */
- numfailed = 0;
- timed_out = 0;
- devices_failed = 0;
- for (SDpnt = host->host_queue; SDpnt; SDpnt = SDpnt->next) {
- unsigned int device_error = 0;
- for (SCpnt = SDpnt->device_queue; SCpnt; SCpnt = SCpnt->next) {
- if (SCpnt->state == SCSI_STATE_FAILED) {
- SCSI_LOG_ERROR_RECOVERY(5, printk("Command to ID %d failedn",
- SCpnt->target));
- numfailed++;
- device_error++;
- }
- if (SCpnt->state == SCSI_STATE_TIMEOUT) {
- SCSI_LOG_ERROR_RECOVERY(5, printk("Command to ID %d timedoutn",
- SCpnt->target));
- timed_out++;
- device_error++;
- }
- }
- if (device_error > 0) {
- devices_failed++;
- }
- }
- SCSI_LOG_ERROR_RECOVERY(2, printk("Total of %d+%d commands on %d devices require eh workn",
- numfailed, timed_out, devices_failed));
- if (host->host_failed == 0) {
- ourrtn = TRUE;
- goto leave;
- }
- /*
- * Next, try and see whether or not it makes sense to try and abort
- * the running command. This only works out to be the case if we have
- * one command that has timed out. If the command simply failed, it
- * makes no sense to try and abort the command, since as far as the
- * host adapter is concerned, it isn't running.
- */
- SCSI_LOG_ERROR_RECOVERY(3, printk("scsi_unjam_host: Checking to see if we want to try abortn"));
- for (SDpnt = host->host_queue; SDpnt; SDpnt = SDpnt->next) {
- for (SCloop = SDpnt->device_queue; SCloop; SCloop = SCloop->next) {
- if (SCloop->state != SCSI_STATE_TIMEOUT) {
- continue;
- }
- rtn = scsi_try_to_abort_command(SCloop, ABORT_TIMEOUT);
- if (rtn == SUCCESS) {
- rtn = scsi_test_unit_ready(SCloop);
- if (rtn == SUCCESS && scsi_unit_is_ready(SCloop)) {
- rtn = scsi_eh_retry_command(SCloop);
- if (rtn == SUCCESS) {
- SCloop->host->host_failed--;
- scsi_eh_finish_command(&SCdone, SCloop);
- }
- }
- }
- }
- }
- /*
- * If we have corrected all of the problems, then we are done.
- */
- if (host->host_failed == 0) {
- ourrtn = TRUE;
- goto leave;
- }
- /*
- * Either the abort wasn't appropriate, or it didn't succeed.
- * Now try a bus device reset. Still, look to see whether we have
- * multiple devices that are jammed or not - if we have multiple devices,
- * it makes no sense to try BUS_DEVICE_RESET - we really would need
- * to try a BUS_RESET instead.
- *
- * Does this make sense - should we try BDR on each device individually?
- * Yes, definitely.
- */
- SCSI_LOG_ERROR_RECOVERY(3, printk("scsi_unjam_host: Checking to see if we want to try BDRn"));
- for (SDpnt = host->host_queue; SDpnt; SDpnt = SDpnt->next) {
- for (SCloop = SDpnt->device_queue; SCloop; SCloop = SCloop->next) {
- if (SCloop->state == SCSI_STATE_FAILED
- || SCloop->state == SCSI_STATE_TIMEOUT) {
- break;
- }
- }
- if (SCloop == NULL) {
- continue;
- }
- /*
- * OK, we have a device that is having problems. Try and send
- * a bus device reset to it.
- *
- * FIXME(eric) - make sure we handle the case where multiple
- * commands to the same device have failed. They all must
- * get properly restarted.
- */
- rtn = scsi_try_bus_device_reset(SCloop, RESET_TIMEOUT);
- if (rtn == SUCCESS) {
- rtn = scsi_test_unit_ready(SCloop);
- if (rtn == SUCCESS && scsi_unit_is_ready(SCloop)) {
- rtn = scsi_eh_retry_command(SCloop);
- if (rtn == SUCCESS) {
- SCloop->host->host_failed--;
- scsi_eh_finish_command(&SCdone, SCloop);
- }
- }
- }
- }
- if (host->host_failed == 0) {
- ourrtn = TRUE;
- goto leave;
- }
- /*
- * If we ended up here, we have serious problems. The only thing left
- * to try is a full bus reset. If someone has grabbed the bus and isn't
- * letting go, then perhaps this will help.
- */
- SCSI_LOG_ERROR_RECOVERY(3, printk("scsi_unjam_host: Try hard bus resetn"));
- /*
- * We really want to loop over the various channels, and do this on
- * a channel by channel basis. We should also check to see if any
- * of the failed commands are on soft_reset devices, and if so, skip
- * the reset.
- */
- for (SDpnt = host->host_queue; SDpnt; SDpnt = SDpnt->next) {
- next_device:
- for (SCpnt = SDpnt->device_queue; SCpnt; SCpnt = SCpnt->next) {
- if (SCpnt->state != SCSI_STATE_FAILED
- && SCpnt->state != SCSI_STATE_TIMEOUT) {
- continue;
- }
- /*
- * We have a failed command. Make sure there are no other failed
- * commands on the same channel that are timed out and implement a
- * soft reset.
- */
- for (SDloop = host->host_queue; SDloop; SDloop = SDloop->next) {
- for (SCloop = SDloop->device_queue; SCloop; SCloop = SCloop->next) {
- if (SCloop->channel != SCpnt->channel) {
- continue;
- }
- if (SCloop->state != SCSI_STATE_FAILED
- && SCloop->state != SCSI_STATE_TIMEOUT) {
- continue;
- }
- if (SDloop->soft_reset && SCloop->state == SCSI_STATE_TIMEOUT) {
- /*
- * If this device uses the soft reset option, and this
- * is one of the devices acting up, then our only
- * option is to wait a bit, since the command is
- * supposedly still running.
- *
- * FIXME(eric) - right now we will just end up falling
- * through to the 'take device offline' case.
- *
- * FIXME(eric) - It is possible that the command completed
- * *after* the error recovery procedure started, and if this
- * is the case, we are worrying about nothing here.
- */
- scsi_sleep(1 * HZ);
- goto next_device;
- }
- }
- }
- /*
- * We now know that we are able to perform a reset for the
- * bus that SCpnt points to. There are no soft-reset devices
- * with outstanding timed out commands.
- */
- rtn = scsi_try_bus_reset(SCpnt);
- if (rtn == SUCCESS) {
- for (SDloop = host->host_queue; SDloop; SDloop = SDloop->next) {
- for (SCloop = SDloop->device_queue; SCloop; SCloop = SCloop->next) {
- if (SCloop->channel != SCpnt->channel) {
- continue;
- }
- if (SCloop->state != SCSI_STATE_FAILED
- && SCloop->state != SCSI_STATE_TIMEOUT) {
- continue;
- }
- rtn = scsi_test_unit_ready(SCloop);
- if (rtn == SUCCESS && scsi_unit_is_ready(SCloop)) {
- rtn = scsi_eh_retry_command(SCloop);
- if (rtn == SUCCESS) {
- SCpnt->host->host_failed--;
- scsi_eh_finish_command(&SCdone, SCloop);
- }
- }
- /*
- * If the bus reset worked, but we are still unable to
- * talk to the device, take it offline.
- * FIXME(eric) - is this really the correct thing to do?
- */
- if (rtn != SUCCESS) {
- printk(KERN_INFO "scsi: device set offline - not ready or command retry failed after bus reset: host %d channel %d id %d lun %dn", SDloop->host->host_no, SDloop->channel, SDloop->id, SDloop->lun);
- SDloop->online = FALSE;
- SDloop->host->host_failed--;
- scsi_eh_finish_command(&SCdone, SCloop);
- }
- }
- }
- }
- }
- }
- if (host->host_failed == 0) {
- ourrtn = TRUE;
- goto leave;
- }
- /*
- * If we ended up here, we have serious problems. The only thing left
- * to try is a full host reset - perhaps the firmware on the device
- * crashed, or something like that.
- *
- * It is assumed that a succesful host reset will cause *all* information
- * about the command to be flushed from both the host adapter *and* the
- * device.
- *
- * FIXME(eric) - it isn't clear that devices that implement the soft reset
- * option can ever be cleared except via cycling the power. The problem is
- * that sending the host reset command will cause the host to forget
- * about the pending command, but the device won't forget. For now, we
- * skip the host reset option if any of the failed devices are configured
- * to use the soft reset option.
- */
- for (SDpnt = host->host_queue; SDpnt; SDpnt = SDpnt->next) {
- next_device2:
- for (SCpnt = SDpnt->device_queue; SCpnt; SCpnt = SCpnt->next) {
- if (SCpnt->state != SCSI_STATE_FAILED
- && SCpnt->state != SCSI_STATE_TIMEOUT) {
- continue;
- }
- if (SDpnt->soft_reset && SCpnt->state == SCSI_STATE_TIMEOUT) {
- /*
- * If this device uses the soft reset option, and this
- * is one of the devices acting up, then our only
- * option is to wait a bit, since the command is
- * supposedly still running.
- *
- * FIXME(eric) - right now we will just end up falling
- * through to the 'take device offline' case.
- */
- SCSI_LOG_ERROR_RECOVERY(3,
- printk("scsi_unjam_host: Unable to try hard host resetn"));
- /*
- * Due to the spinlock, we will never get out of this
- * loop without a proper wait. (DB)
- */
- scsi_sleep(1 * HZ);
- goto next_device2;
- }
- SCSI_LOG_ERROR_RECOVERY(3, printk("scsi_unjam_host: Try hard host resetn"));
- /*
- * FIXME(eric) - we need to obtain a valid SCpnt to perform this call.
- */
- rtn = scsi_try_host_reset(SCpnt);
- if (rtn == SUCCESS) {
- /*
- * FIXME(eric) we assume that all commands are flushed from the
- * controller. We should get a DID_RESET for all of the commands
- * that were pending. We should ignore these so that we can
- * guarantee that we are in a consistent state.
- *
- * I believe this to be the case right now, but this needs to be
- * tested.
- */
- for (SDloop = host->host_queue; SDloop; SDloop = SDloop->next) {
- for (SCloop = SDloop->device_queue; SCloop; SCloop = SCloop->next) {
- if (SCloop->state != SCSI_STATE_FAILED
- && SCloop->state != SCSI_STATE_TIMEOUT) {
- continue;
- }
- rtn = scsi_test_unit_ready(SCloop);
- if (rtn == SUCCESS && scsi_unit_is_ready(SCloop)) {
- rtn = scsi_eh_retry_command(SCloop);
- if (rtn == SUCCESS) {
- SCpnt->host->host_failed--;
- scsi_eh_finish_command(&SCdone, SCloop);
- }
- }
- if (rtn != SUCCESS) {
- printk(KERN_INFO "scsi: device set offline - not ready or command retry failed after host reset: host %d channel %d id %d lun %dn", SDloop->host->host_no, SDloop->channel, SDloop->id, SDloop->lun);
- SDloop->online = FALSE;
- SDloop->host->host_failed--;
- scsi_eh_finish_command(&SCdone, SCloop);
- }
- }
- }
- }
- }
- }
- /*
- * If we solved all of the problems, then let's rev up the engines again.
- */
- if (host->host_failed == 0) {
- ourrtn = TRUE;
- goto leave;
- }
- /*
- * If the HOST RESET failed, then for now we assume that the entire host
- * adapter is too hosed to be of any use. For our purposes, however, it is
- * easier to simply take the devices offline that correspond to commands
- * that failed.
- */
- SCSI_LOG_ERROR_RECOVERY(1, printk("scsi_unjam_host: Take device offlinen"));
- for (SDpnt = host->host_queue; SDpnt; SDpnt = SDpnt->next) {
- for (SCloop = SDpnt->device_queue; SCloop; SCloop = SCloop->next) {
- if (SCloop->state == SCSI_STATE_FAILED || SCloop->state == SCSI_STATE_TIMEOUT) {
- SDloop = SCloop->device;
- if (SDloop->online == TRUE) {
- printk(KERN_INFO "scsi: device set offline - command error recover failed: host %d channel %d id %d lun %dn", SDloop->host->host_no, SDloop->channel, SDloop->id, SDloop->lun);
- SDloop->online = FALSE;
- }
- /*
- * This should pass the failure up to the top level driver, and
- * it will have to try and do something intelligent with it.
- */
- SCloop->host->host_failed--;
- if (SCloop->state == SCSI_STATE_TIMEOUT) {
- SCloop->result |= (DRIVER_TIMEOUT << 24);
- }
- SCSI_LOG_ERROR_RECOVERY(3, printk("Finishing command for device %d %xn",
- SDloop->id, SCloop->result));
- scsi_eh_finish_command(&SCdone, SCloop);
- }
- }
- }
- if (host->host_failed != 0) {
- panic("scsi_unjam_host: Miscount of number of failed commands.n");
- }
- SCSI_LOG_ERROR_RECOVERY(3, printk("scsi_unjam_host: Returningn"));
- ourrtn = FALSE;
- leave:
- /*
- * We should have a list of commands that we 'finished' during the course of
- * error recovery. This should be the same as the list of commands that timed out
- * or failed. We are currently holding these things in a linked list - we didn't
- * put them in the bottom half queue because we wanted to keep things quiet while
- * we were working on recovery, and passing them up to the top level could easily
- * cause the top level to try and queue something else again.
- *
- * Start by marking that the host is no longer in error recovery.
- */
- host->in_recovery = 0;
- /*
- * Take the list of commands, and stick them in the bottom half queue.
- * The current implementation of scsi_done will do this for us - if need
- * be we can create a special version of this function to do the
- * same job for us.
- */
- for (SCpnt = SCdone; SCpnt != NULL; SCpnt = SCdone) {
- SCdone = SCpnt->bh_next;
- SCpnt->bh_next = NULL;
- /*
- * Oh, this is a vile hack. scsi_done() expects a timer
- * to be running on the command. If there isn't, it assumes
- * that the command has actually timed out, and a timer
- * handler is running. That may well be how we got into
- * this fix, but right now things are stable. We add
- * a timer back again so that we can report completion.
- * scsi_done() will immediately remove said timer from
- * the command, and then process it.
- */
- scsi_add_timer(SCpnt, 100, scsi_eh_times_out);
- scsi_done(SCpnt);
- }
- return (ourrtn);
- }
- /*
- * Function: scsi_error_handler
- *
- * Purpose: Handle errors/timeouts of scsi commands, try and clean up
- * and unjam the bus, and restart things.
- *
- * Arguments: host - host for which we are running.
- *
- * Returns: Never returns.
- *
- * Notes: This is always run in the context of a kernel thread. The
- * idea is that we start this thing up when the kernel starts
- * up (one per host that we detect), and it immediately goes to
- * sleep and waits for some event (i.e. failure). When this
- * takes place, we have the job of trying to unjam the bus
- * and restarting things.
- *
- */
- void scsi_error_handler(void *data)
- {
- struct Scsi_Host *host = (struct Scsi_Host *) data;
- int rtn;
- DECLARE_MUTEX_LOCKED(sem);
- /*
- * We only listen to signals if the HA was loaded as a module.
- * If the HA was compiled into the kernel, then we don't listen
- * to any signals.
- */
- if( host->loaded_as_module ) {
- siginitsetinv(¤t->blocked, SHUTDOWN_SIGS);
- } else {
- siginitsetinv(¤t->blocked, 0);
- }
- lock_kernel();
- /*
- * Flush resources
- */
- daemonize();
- /*
- * Set the name of this process.
- */
- sprintf(current->comm, "scsi_eh_%d", host->host_no);
- host->eh_wait = &sem;
- host->ehandler = current;
- unlock_kernel();
- /*
- * Wake up the thread that created us.
- */
- SCSI_LOG_ERROR_RECOVERY(3, printk("Wake up parent %dn", host->eh_notify->count.counter));
- up(host->eh_notify);
- while (1) {
- /*
- * If we get a signal, it means we are supposed to go
- * away and die. This typically happens if the user is
- * trying to unload a module.
- */
- SCSI_LOG_ERROR_RECOVERY(1, printk("Error handler sleepingn"));
- /*
- * Note - we always use down_interruptible with the semaphore
- * even if the module was loaded as part of the kernel. The
- * reason is that down() will cause this thread to be counted
- * in the load average as a running process, and down
- * interruptible doesn't. Given that we need to allow this
- * thread to die if the driver was loaded as a module, using
- * semaphores isn't unreasonable.
- */
- down_interruptible(&sem);
- if( host->loaded_as_module ) {
- if (signal_pending(current))
- break;
- }
- SCSI_LOG_ERROR_RECOVERY(1, printk("Error handler waking upn"));
- host->eh_active = 1;
- /*
- * We have a host that is failing for some reason. Figure out
- * what we need to do to get it up and online again (if we can).
- * If we fail, we end up taking the thing offline.
- */
- if (host->hostt->eh_strategy_handler != NULL) {
- rtn = host->hostt->eh_strategy_handler(host);
- } else {
- rtn = scsi_unjam_host(host);
- }
- host->eh_active = 0;
- /*
- * Note - if the above fails completely, the action is to take
- * individual devices offline and flush the queue of any
- * outstanding requests that may have been pending. When we
- * restart, we restart any I/O to any other devices on the bus
- * which are still online.
- */
- scsi_restart_operations(host);
- }
- SCSI_LOG_ERROR_RECOVERY(1, printk("Error handler exitingn"));
- /*
- * Make sure that nobody tries to wake us up again.
- */
- host->eh_wait = NULL;
- /*
- * Knock this down too. From this point on, the host is flying
- * without a pilot. If this is because the module is being unloaded,
- * that's fine. If the user sent a signal to this thing, we are
- * potentially in real danger.
- */
- host->in_recovery = 0;
- host->eh_active = 0;
- host->ehandler = NULL;
- /*
- * If anyone is waiting for us to exit (i.e. someone trying to unload
- * a driver), then wake up that process to let them know we are on
- * the way out the door. This may be overkill - I *think* that we
- * could probably just unload the driver and send the signal, and when
- * the error handling thread wakes up that it would just exit without
- * needing to touch any memory associated with the driver itself.
- */
- if (host->eh_notify != NULL)
- up(host->eh_notify);
- }
- /*
- * Overrides for Emacs so that we follow Linus's tabbing style.
- * Emacs will notice this stuff at the end of the file and automatically
- * adjust the settings for this buffer only. This must remain at the end
- * of the file.
- * ---------------------------------------------------------------------------
- * Local variables:
- * c-indent-level: 4
- * c-brace-imaginary-offset: 0
- * c-brace-offset: -4
- * c-argdecl-indent: 4
- * c-label-offset: -4
- * c-continued-statement-offset: 4
- * c-continued-brace-offset: 0
- * indent-tabs-mode: nil
- * tab-width: 8
- * End:
- */