xfs: kill xfs_bioerror_relse
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 7a34a1a..108eba7 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -130,7 +130,7 @@ xfs_buf_get_maps(
        bp->b_maps = kmem_zalloc(map_count * sizeof(struct xfs_buf_map),
                                KM_NOFS);
        if (!bp->b_maps)
-               return ENOMEM;
+               return -ENOMEM;
        return 0;
 }
 
@@ -344,7 +344,7 @@ retry:
                if (unlikely(page == NULL)) {
                        if (flags & XBF_READ_AHEAD) {
                                bp->b_page_count = i;
-                               error = ENOMEM;
+                               error = -ENOMEM;
                                goto out_free_pages;
                        }
 
@@ -465,7 +465,7 @@ _xfs_buf_find(
        eofs = XFS_FSB_TO_BB(btp->bt_mount, btp->bt_mount->m_sb.sb_dblocks);
        if (blkno >= eofs) {
                /*
-                * XXX (dgc): we should really be returning EFSCORRUPTED here,
+                * XXX (dgc): we should really be returning -EFSCORRUPTED here,
                 * but none of the higher level infrastructure supports
                 * returning a specific error on buffer lookup failures.
                 */
@@ -998,53 +998,60 @@ xfs_buf_wait_unpin(
  *     Buffer Utility Routines
  */
 
-STATIC void
-xfs_buf_iodone_work(
-       struct work_struct      *work)
+void
+xfs_buf_ioend(
+       struct xfs_buf  *bp)
 {
-       struct xfs_buf          *bp =
-               container_of(work, xfs_buf_t, b_iodone_work);
-       bool                    read = !!(bp->b_flags & XBF_READ);
+       bool            read = bp->b_flags & XBF_READ;
+
+       trace_xfs_buf_iodone(bp, _RET_IP_);
 
        bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD);
 
-       /* only validate buffers that were read without errors */
-       if (read && bp->b_ops && !bp->b_error && (bp->b_flags & XBF_DONE))
+       /*
+        * Pull in IO completion errors now. We are guaranteed to be running
+        * single threaded, so we don't need the lock to read b_io_error.
+        */
+       if (!bp->b_error && bp->b_io_error)
+               xfs_buf_ioerror(bp, bp->b_io_error);
+
+       /* Only validate buffers that were read without errors */
+       if (read && !bp->b_error && bp->b_ops) {
+               ASSERT(!bp->b_iodone);
                bp->b_ops->verify_read(bp);
+       }
+
+       if (!bp->b_error)
+               bp->b_flags |= XBF_DONE;
 
        if (bp->b_iodone)
                (*(bp->b_iodone))(bp);
        else if (bp->b_flags & XBF_ASYNC)
                xfs_buf_relse(bp);
        else {
-               ASSERT(read && bp->b_ops);
                complete(&bp->b_iowait);
+
+               /* release the !XBF_ASYNC ref now we are done. */
+               xfs_buf_rele(bp);
        }
 }
 
-void
-xfs_buf_ioend(
-       struct xfs_buf  *bp,
-       int             schedule)
+static void
+xfs_buf_ioend_work(
+       struct work_struct      *work)
 {
-       bool            read = !!(bp->b_flags & XBF_READ);
-
-       trace_xfs_buf_iodone(bp, _RET_IP_);
+       struct xfs_buf          *bp =
+               container_of(work, xfs_buf_t, b_iodone_work);
 
-       if (bp->b_error == 0)
-               bp->b_flags |= XBF_DONE;
+       xfs_buf_ioend(bp);
+}
 
-       if (bp->b_iodone || (read && bp->b_ops) || (bp->b_flags & XBF_ASYNC)) {
-               if (schedule) {
-                       INIT_WORK(&bp->b_iodone_work, xfs_buf_iodone_work);
-                       queue_work(xfslogd_workqueue, &bp->b_iodone_work);
-               } else {
-                       xfs_buf_iodone_work(&bp->b_iodone_work);
-               }
-       } else {
-               bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD);
-               complete(&bp->b_iowait);
-       }
+void
+xfs_buf_ioend_async(
+       struct xfs_buf  *bp)
+{
+       INIT_WORK(&bp->b_iodone_work, xfs_buf_ioend_work);
+       queue_work(xfslogd_workqueue, &bp->b_iodone_work);
 }
 
 void
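
The hunk above splits the old two-argument xfs_buf_ioend(bp, schedule) into a
direct xfs_buf_ioend(bp) and a deferring xfs_buf_ioend_async(bp). A minimal
sketch of the call-site conversion this implies (illustrative only, not a hunk
from this patch):

        /* old interface: the second argument chose workqueue deferral */
        xfs_buf_ioend(bp, 1);           /* run callbacks via xfslogd_workqueue */
        xfs_buf_ioend(bp, 0);           /* run callbacks in the current context */

        /* new interface: the choice is explicit in the function name */
        xfs_buf_ioend_async(bp);        /* queues xfs_buf_ioend_work() */
        xfs_buf_ioend(bp);              /* runs completion here and now */
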
@@ -1052,8 +1059,8 @@ xfs_buf_ioerror(
        xfs_buf_t               *bp,
        int                     error)
 {
-       ASSERT(error >= 0 && error <= 0xffff);
-       bp->b_error = (unsigned short)error;
+       ASSERT(error <= 0 && error >= -1000);
+       bp->b_error = error;
        trace_xfs_buf_ioerror(bp, error, _RET_IP_);
 }
 
@@ -1064,97 +1071,7 @@ xfs_buf_ioerror_alert(
 {
        xfs_alert(bp->b_target->bt_mount,
 "metadata I/O error: block 0x%llx (\"%s\") error %d numblks %d",
-               (__uint64_t)XFS_BUF_ADDR(bp), func, bp->b_error, bp->b_length);
-}
-
-/*
- * Called when we want to stop a buffer from getting written or read.
- * We attach the EIO error, muck with its flags, and call xfs_buf_ioend
- * so that the proper iodone callbacks get called.
- */
-STATIC int
-xfs_bioerror(
-       xfs_buf_t *bp)
-{
-#ifdef XFSERRORDEBUG
-       ASSERT(XFS_BUF_ISREAD(bp) || bp->b_iodone);
-#endif
-
-       /*
-        * No need to wait until the buffer is unpinned, we aren't flushing it.
-        */
-       xfs_buf_ioerror(bp, EIO);
-
-       /*
-        * We're calling xfs_buf_ioend, so delete XBF_DONE flag.
-        */
-       XFS_BUF_UNREAD(bp);
-       XFS_BUF_UNDONE(bp);
-       xfs_buf_stale(bp);
-
-       xfs_buf_ioend(bp, 0);
-
-       return EIO;
-}
-
-/*
- * Same as xfs_bioerror, except that we are releasing the buffer
- * here ourselves, and avoiding the xfs_buf_ioend call.
- * This is meant for userdata errors; metadata bufs come with
- * iodone functions attached, so that we can track down errors.
- */
-int
-xfs_bioerror_relse(
-       struct xfs_buf  *bp)
-{
-       int64_t         fl = bp->b_flags;
-       /*
-        * No need to wait until the buffer is unpinned.
-        * We aren't flushing it.
-        *
-        * chunkhold expects B_DONE to be set, whether
-        * we actually finish the I/O or not. We don't want to
-        * change that interface.
-        */
-       XFS_BUF_UNREAD(bp);
-       XFS_BUF_DONE(bp);
-       xfs_buf_stale(bp);
-       bp->b_iodone = NULL;
-       if (!(fl & XBF_ASYNC)) {
-               /*
-                * Mark b_error and B_ERROR _both_.
-                * Lot's of chunkcache code assumes that.
-                * There's no reason to mark error for
-                * ASYNC buffers.
-                */
-               xfs_buf_ioerror(bp, EIO);
-               complete(&bp->b_iowait);
-       } else {
-               xfs_buf_relse(bp);
-       }
-
-       return EIO;
-}
-
-STATIC int
-xfs_bdstrat_cb(
-       struct xfs_buf  *bp)
-{
-       if (XFS_FORCED_SHUTDOWN(bp->b_target->bt_mount)) {
-               trace_xfs_bdstrat_shut(bp, _RET_IP_);
-               /*
-                * Metadata write that didn't get logged but
-                * written delayed anyway. These aren't associated
-                * with a transaction, and can be ignored.
-                */
-               if (!bp->b_iodone && !XFS_BUF_ISREAD(bp))
-                       return xfs_bioerror_relse(bp);
-               else
-                       return xfs_bioerror(bp);
-       }
-
-       xfs_buf_iorequest(bp);
-       return 0;
+               (__uint64_t)XFS_BUF_ADDR(bp), func, -bp->b_error, bp->b_length);
 }
 
 int
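
With xfs_bioerror(), xfs_bioerror_relse() and xfs_bdstrat_cb() gone, buffer
errors follow the negative-errno convention end to end: xfs_buf_ioerror() now
asserts a value in the range [-1000, 0] and xfs_buf_ioerror_alert() negates
b_error for display. A short sketch of the convention at a call site
(illustrative, not part of the patch):

        xfs_buf_ioerror(bp, -EIO);              /* store a negative errno in bp->b_error */
        ASSERT(bp->b_error <= 0);               /* matches the new assert range */
        xfs_buf_ioerror_alert(bp, __func__);    /* reports "error 5", i.e. -bp->b_error */
        return bp->b_error;                     /* callers propagate the negative value */
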
@@ -1166,9 +1083,22 @@ xfs_bwrite(
        ASSERT(xfs_buf_islocked(bp));
 
        bp->b_flags |= XBF_WRITE;
-       bp->b_flags &= ~(XBF_ASYNC | XBF_READ | _XBF_DELWRI_Q | XBF_WRITE_FAIL);
+       bp->b_flags &= ~(XBF_ASYNC | XBF_READ | _XBF_DELWRI_Q |
+                        XBF_WRITE_FAIL | XBF_DONE);
+
+       if (XFS_FORCED_SHUTDOWN(bp->b_target->bt_mount)) {
+               trace_xfs_bdstrat_shut(bp, _RET_IP_);
 
-       xfs_bdstrat_cb(bp);
+               xfs_buf_ioerror(bp, -EIO);
+               xfs_buf_stale(bp);
+
+               /* sync IO, xfs_buf_ioend is going to remove a ref here */
+               xfs_buf_hold(bp);
+               xfs_buf_ioend(bp);
+               return -EIO;
+       }
+
+       xfs_buf_iorequest(bp);
 
        error = xfs_buf_iowait(bp);
        if (error) {
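
xfs_bwrite() now handles a forced shutdown inline: it marks the buffer stale
with -EIO and takes an extra hold before xfs_buf_ioend(), because completing a
synchronous (!XBF_ASYNC) buffer releases the sync IO reference. Callers are
unaffected; a minimal usage sketch (illustrative; the caller keeps its own
buffer lock and reference, which xfs_bwrite() never drops):

        error = xfs_bwrite(bp);         /* -EIO immediately on a shut-down fs, else waits */
        xfs_buf_relse(bp);              /* caller drops its own lock and reference */
        if (error)
                return error;
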
@@ -1178,15 +1108,6 @@ xfs_bwrite(
        return error;
 }
 
-STATIC void
-_xfs_buf_ioend(
-       xfs_buf_t               *bp,
-       int                     schedule)
-{
-       if (atomic_dec_and_test(&bp->b_io_remaining) == 1)
-               xfs_buf_ioend(bp, schedule);
-}
-
 STATIC void
 xfs_buf_bio_end_io(
        struct bio              *bio,
@@ -1198,13 +1119,18 @@ xfs_buf_bio_end_io(
         * don't overwrite existing errors - otherwise we can lose errors on
         * buffers that require multiple bios to complete.
         */
-       if (!bp->b_error)
-               xfs_buf_ioerror(bp, -error);
+       if (error) {
+               spin_lock(&bp->b_lock);
+               if (!bp->b_io_error)
+                       bp->b_io_error = error;
+               spin_unlock(&bp->b_lock);
+       }
 
        if (!bp->b_error && xfs_buf_is_vmapped(bp) && (bp->b_flags & XBF_READ))
                invalidate_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp));
 
-       _xfs_buf_ioend(bp, 1);
+       if (atomic_dec_and_test(&bp->b_io_remaining) == 1)
+               xfs_buf_ioend_async(bp);
        bio_put(bio);
 }
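
A buffer can span several bios, and each bio completion may run concurrently,
so the handler above only records the first failure into b_io_error under
b_lock; xfs_buf_ioend() later folds it into b_error once b_io_remaining drops
to zero. A condensed sketch of both halves of that handshake (illustrative):

        /* producer: per-bio completion, possibly concurrent */
        spin_lock(&bp->b_lock);
        if (!bp->b_io_error)
                bp->b_io_error = error;         /* first error wins */
        spin_unlock(&bp->b_lock);

        /* consumer: xfs_buf_ioend(), runs once, single threaded, no lock needed */
        if (!bp->b_error && bp->b_io_error)
                xfs_buf_ioerror(bp, bp->b_io_error);
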
 
@@ -1286,7 +1212,7 @@ next_chunk:
                 * because the caller (xfs_buf_iorequest) holds a count itself.
                 */
                atomic_dec(&bp->b_io_remaining);
-               xfs_buf_ioerror(bp, EIO);
+               xfs_buf_ioerror(bp, -EIO);
                bio_put(bio);
        }
 
@@ -1330,6 +1256,20 @@ _xfs_buf_ioapply(
                                                   SHUTDOWN_CORRUPT_INCORE);
                                return;
                        }
+               } else if (bp->b_bn != XFS_BUF_DADDR_NULL) {
+                       struct xfs_mount *mp = bp->b_target->bt_mount;
+
+                       /*
+                        * non-crc filesystems don't attach verifiers during
+                        * log recovery, so don't warn for such filesystems.
+                        */
+                       if (xfs_sb_version_hascrc(&mp->m_sb)) {
+                               xfs_warn(mp,
+                                       "%s: no ops on block 0x%llx/0x%x",
+                                       __func__, bp->b_bn, bp->b_length);
+                               xfs_hex_dump(bp->b_addr, 64);
+                               dump_stack();
+                       }
                }
        } else if (bp->b_flags & XBF_READ_AHEAD) {
                rw = READA;
@@ -1369,22 +1309,53 @@ xfs_buf_iorequest(
 
        if (bp->b_flags & XBF_WRITE)
                xfs_buf_wait_unpin(bp);
+
+       /* clear the internal error state to avoid spurious errors */
+       bp->b_io_error = 0;
+
+       /*
+        * Take references to the buffer. For XBF_ASYNC buffers, holding a
+        * reference for as long as submission takes is all that is necessary
+        * here. The IO inherits the lock and hold count from the submitter,
+        * and these are released during IO completion processing. Taking a hold
+        * over submission ensures that the buffer is not freed until we have
+        * completed all processing, regardless of when IO errors occur or are
+        * reported.
+        *
+        * However, for synchronous IO, the IO does not inherit the submitter's
+        * reference count, nor the buffer lock. Hence we need to take an extra
+        * reference to the buffer for the IO context so that we can
+        * guarantee the buffer is not freed until all IO completion processing
+        * is done. Otherwise the caller can drop their reference while the IO
+        * is still in progress and hence trigger a use-after-free situation.
+        */
        xfs_buf_hold(bp);
+       if (!(bp->b_flags & XBF_ASYNC))
+               xfs_buf_hold(bp);
+
 
        /*
-        * Set the count to 1 initially, this will stop an I/O
-        * completion callout which happens before we have started
-        * all the I/O from calling xfs_buf_ioend too early.
+        * Set the count to 1 initially; this will stop an I/O completion
+        * callout which happens before we have started all the I/O from calling
+        * xfs_buf_ioend too early.
         */
        atomic_set(&bp->b_io_remaining, 1);
        _xfs_buf_ioapply(bp);
+
        /*
-        * If _xfs_buf_ioapply failed, we'll get back here with
-        * only the reference we took above.  _xfs_buf_ioend will
-        * drop it to zero, so we'd better not queue it for later,
-        * or we'll free it before it's done.
+        * If _xfs_buf_ioapply failed or we are doing synchronous IO that
+        * completes extremely quickly, we can get back here with only the IO
+        * reference we took above. If we drop it to zero, run completion
+        * processing synchronously so that we don't return to the caller with
+        * completion still pending. This avoids unnecessary context switches
+        * associated with the end_io workqueue.
         */
-       _xfs_buf_ioend(bp, bp->b_error ? 0 : 1);
+       if (atomic_dec_and_test(&bp->b_io_remaining) == 1) {
+               if (bp->b_error || !(bp->b_flags & XBF_ASYNC))
+                       xfs_buf_ioend(bp);
+               else
+                       xfs_buf_ioend_async(bp);
+       }
 
        xfs_buf_rele(bp);
 }
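
The comment block above spells out ownership during IO: async buffers hand
their lock and hold count to the IO, synchronous buffers do not, so
xfs_buf_iorequest() takes a second hold for the IO context. A minimal sketch
of a synchronous submission under these rules (illustrative; bp is assumed to
be locked and referenced by the caller, as xfs_buf_iorequest() expects):

        bp->b_flags &= ~XBF_ASYNC;      /* synchronous IO: caller keeps lock and ref */
        xfs_buf_iorequest(bp);          /* takes its own submission and IO-context holds */
        error = xfs_buf_iowait(bp);     /* woken by complete(&bp->b_iowait) in xfs_buf_ioend() */
        xfs_buf_relse(bp);              /* drop the caller's own lock and reference */
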
@@ -1628,7 +1599,7 @@ xfs_setsize_buftarg(
                xfs_warn(btp->bt_mount,
                        "Cannot set_blocksize to %u on device %s",
                        sectorsize, name);
-               return EINVAL;
+               return -EINVAL;
        }
 
        /* Set up device logical sector size mask */
@@ -1799,13 +1770,27 @@ __xfs_buf_delwri_submit(
        blk_start_plug(&plug);
        list_for_each_entry_safe(bp, n, io_list, b_list) {
                bp->b_flags &= ~(_XBF_DELWRI_Q | XBF_ASYNC | XBF_WRITE_FAIL);
-               bp->b_flags |= XBF_WRITE;
+               bp->b_flags |= XBF_WRITE | XBF_ASYNC;
 
-               if (!wait) {
-                       bp->b_flags |= XBF_ASYNC;
+               /*
+                * We do all IO submission async. This means if we need to wait
+                * for IO completion we need to take an extra reference so the
+                * buffer is still valid on the other side.
+                */
+               if (wait)
+                       xfs_buf_hold(bp);
+               else
                        list_del_init(&bp->b_list);
+
+               if (XFS_FORCED_SHUTDOWN(bp->b_target->bt_mount)) {
+                       trace_xfs_bdstrat_shut(bp, _RET_IP_);
+
+                       xfs_buf_ioerror(bp, -EIO);
+                       xfs_buf_stale(bp);
+                       xfs_buf_ioend(bp);
+                       continue;
                }
-               xfs_bdstrat_cb(bp);
+               xfs_buf_iorequest(bp);
        }
        blk_finish_plug(&plug);
 
@@ -1852,7 +1837,10 @@ xfs_buf_delwri_submit(
                bp = list_first_entry(&io_list, struct xfs_buf, b_list);
 
                list_del_init(&bp->b_list);
-               error2 = xfs_buf_iowait(bp);
+
+               /* locking the buffer will wait for async IO completion. */
+               xfs_buf_lock(bp);
+               error2 = bp->b_error;
                xfs_buf_relse(bp);
                if (!error)
                        error = error2;
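
Since all delwri IO is now submitted async, completion ends with
xfs_buf_relse() in xfs_buf_ioend(), which drops the buffer lock; the extra
hold taken before submission keeps the buffer alive, so the waiter can use
xfs_buf_lock() as an implicit wait for completion. A condensed end-to-end
sketch of that scheme (illustrative, pieced together from the two hunks above):

        /* submission side: keep an extra reference across the async write */
        xfs_buf_hold(bp);
        bp->b_flags |= XBF_WRITE | XBF_ASYNC;
        xfs_buf_iorequest(bp);

        /* wait side: the IO path unlocks bp once completion has run */
        xfs_buf_lock(bp);               /* blocks until xfs_buf_ioend() -> xfs_buf_relse() */
        error = bp->b_error;
        xfs_buf_relse(bp);              /* drop our lock and the extra reference */
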