scsi: fix race between simultaneous decrements of ->host_failed

author Wei Fang <fangwei1@huawei.com>

Tue, 7 Jun 2016 06:53:56 +0000 (14:53 +0800)

committer Martin K. Petersen <martin.petersen@oracle.com>

Thu, 9 Jun 2016 03:08:04 +0000 (23:08 -0400)
author Wei Fang <fangwei1@huawei.com>
Tue, 7 Jun 2016 06:53:56 +0000 (14:53 +0800)
committer Martin K. Petersen <martin.petersen@oracle.com>
Thu, 9 Jun 2016 03:08:04 +0000 (23:08 -0400)
diff --git a/Documentation/scsi/scsi_eh.txt b/Documentation/scsi/scsi_eh.txt

index 8638f61..37eca00 100644 (file)
--- a/Documentation/scsi/scsi_eh.txt
+++ b/Documentation/scsi/scsi_eh.txt
@@ -263,19 +263,23 @@ scmd->allowed.
  
   3. scmd recovered
      ACTION: scsi_eh_finish_cmd() is invoked to EH-finish scmd
-       - shost->host_failed--
         - clear scmd->eh_eflags
         - scsi_setup_cmd_retry()
         - move from local eh_work_q to local eh_done_q
      LOCKING: none
+    CONCURRENCY: at most one thread per separate eh_work_q to
+                keep queue manipulation lockless
  
   4. EH completes
      ACTION: scsi_eh_flush_done_q() retries scmds or notifies upper
-           layer of failure.
+           layer of failure. May be called concurrently but must have
+           no more than one thread per separate eh_work_q to
+           manipulate the queue locklessly
         - scmd is removed from eh_done_q and scmd->eh_entry is cleared
         - if retry is necessary, scmd is requeued using
            scsi_queue_insert()
         - otherwise, scsi_finish_command() is invoked for scmd
+       - zero shost->host_failed
      LOCKING: queue or finish function performs appropriate locking
  
  
diff --git a/drivers/ata/libata-eh.c b/drivers/ata/libata-eh.c

index 961acc7..91a9e6a 100644 (file)
--- a/drivers/ata/libata-eh.c
+++ b/drivers/ata/libata-eh.c
@@ -606,7 +606,7 @@ void ata_scsi_error(struct Scsi_Host *host)
         ata_scsi_port_error_handler(host, ap);
  
         /* finish or retry handled scmd's and clean up */
-       WARN_ON(host->host_failed || !list_empty(&eh_work_q));
+       WARN_ON(!list_empty(&eh_work_q));
  
         DPRINTK("EXIT\n");
  }
diff --git a/drivers/scsi/scsi_error.c b/drivers/scsi/scsi_error.c

index 984ddcb..1b9c049 100644 (file)
--- a/drivers/scsi/scsi_error.c
+++ b/drivers/scsi/scsi_error.c
@@ -1127,7 +1127,6 @@ static int scsi_eh_action(struct scsi_cmnd *scmd, int rtn)
   */
  void scsi_eh_finish_cmd(struct scsi_cmnd *scmd, struct list_head *done_q)
  {
-       scmd->device->host->host_failed--;
         scmd->eh_eflags = 0;
         list_move_tail(&scmd->eh_entry, done_q);
  }
@@ -2226,6 +2225,9 @@ int scsi_error_handler(void *data)
                 else
                         scsi_unjam_host(shost);
  
+               /* All scmds have been handled */
+               shost->host_failed = 0;
+
                 /*
                  * Note - if the above fails completely, the action is to take
                  * individual devices offline and flush the queue of any
author	Wei Fang <fangwei1@huawei.com>
	Tue, 7 Jun 2016 06:53:56 +0000 (14:53 +0800)
committer	Martin K. Petersen <martin.petersen@oracle.com>
	Thu, 9 Jun 2016 03:08:04 +0000 (23:08 -0400)
Documentation/scsi/scsi_eh.txt		patch \| blob \| history
drivers/ata/libata-eh.c		patch \| blob \| history
drivers/scsi/scsi_error.c		patch \| blob \| history