bp->b_maps = kmem_zalloc(map_count * sizeof(struct xfs_buf_map),
KM_NOFS);
if (!bp->b_maps)
- return ENOMEM;
+ return -ENOMEM;
return 0;
}
if (unlikely(page == NULL)) {
if (flags & XBF_READ_AHEAD) {
bp->b_page_count = i;
- error = ENOMEM;
+ error = -ENOMEM;
goto out_free_pages;
}
eofs = XFS_FSB_TO_BB(btp->bt_mount, btp->bt_mount->m_sb.sb_dblocks);
if (blkno >= eofs) {
/*
- * XXX (dgc): we should really be returning EFSCORRUPTED here,
+ * XXX (dgc): we should really be returning -EFSCORRUPTED here,
* but none of the higher level infrastructure supports
* returning a specific error on buffer lookup failures.
*/
* Buffer Utility Routines
*/
-STATIC void
-xfs_buf_iodone_work(
- struct work_struct *work)
+void
+xfs_buf_ioend(
+ struct xfs_buf *bp)
{
- struct xfs_buf *bp =
- container_of(work, xfs_buf_t, b_iodone_work);
- bool read = !!(bp->b_flags & XBF_READ);
+ bool read = bp->b_flags & XBF_READ;
+
+ trace_xfs_buf_iodone(bp, _RET_IP_);
bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD);
- /* only validate buffers that were read without errors */
- if (read && bp->b_ops && !bp->b_error && (bp->b_flags & XBF_DONE))
+ /*
+ * Pull in IO completion errors now. We are guaranteed to be running
+ * single threaded, so we don't need the lock to read b_io_error.
+ */
+ if (!bp->b_error && bp->b_io_error)
+ xfs_buf_ioerror(bp, bp->b_io_error);
+
+ /* Only validate buffers that were read without errors */
+ if (read && !bp->b_error && bp->b_ops) {
+ ASSERT(!bp->b_iodone);
bp->b_ops->verify_read(bp);
+ }
+
+ if (!bp->b_error)
+ bp->b_flags |= XBF_DONE;
if (bp->b_iodone)
(*(bp->b_iodone))(bp);
else if (bp->b_flags & XBF_ASYNC)
xfs_buf_relse(bp);
else {
- ASSERT(read && bp->b_ops);
complete(&bp->b_iowait);
+
+ /* release the !XBF_ASYNC ref now we are done. */
+ xfs_buf_rele(bp);
}
}
-void
-xfs_buf_ioend(
- struct xfs_buf *bp,
- int schedule)
+static void
+xfs_buf_ioend_work(
+ struct work_struct *work)
{
- bool read = !!(bp->b_flags & XBF_READ);
-
- trace_xfs_buf_iodone(bp, _RET_IP_);
+ struct xfs_buf *bp =
+ container_of(work, xfs_buf_t, b_iodone_work);
- if (bp->b_error == 0)
- bp->b_flags |= XBF_DONE;
+ xfs_buf_ioend(bp);
+}
- if (bp->b_iodone || (read && bp->b_ops) || (bp->b_flags & XBF_ASYNC)) {
- if (schedule) {
- INIT_WORK(&bp->b_iodone_work, xfs_buf_iodone_work);
- queue_work(xfslogd_workqueue, &bp->b_iodone_work);
- } else {
- xfs_buf_iodone_work(&bp->b_iodone_work);
- }
- } else {
- bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD);
- complete(&bp->b_iowait);
- }
+void
+xfs_buf_ioend_async(
+ struct xfs_buf *bp)
+{
+ INIT_WORK(&bp->b_iodone_work, xfs_buf_ioend_work);
+ queue_work(xfslogd_workqueue, &bp->b_iodone_work);
}
void
xfs_buf_t *bp,
int error)
{
- ASSERT(error >= 0 && error <= 0xffff);
- bp->b_error = (unsigned short)error;
+ ASSERT(error <= 0 && error >= -1000);
+ bp->b_error = error;
trace_xfs_buf_ioerror(bp, error, _RET_IP_);
}
{
xfs_alert(bp->b_target->bt_mount,
"metadata I/O error: block 0x%llx (\"%s\") error %d numblks %d",
- (__uint64_t)XFS_BUF_ADDR(bp), func, bp->b_error, bp->b_length);
-}
-
-/*
- * Called when we want to stop a buffer from getting written or read.
- * We attach the EIO error, muck with its flags, and call xfs_buf_ioend
- * so that the proper iodone callbacks get called.
- */
-STATIC int
-xfs_bioerror(
- xfs_buf_t *bp)
-{
-#ifdef XFSERRORDEBUG
- ASSERT(XFS_BUF_ISREAD(bp) || bp->b_iodone);
-#endif
-
- /*
- * No need to wait until the buffer is unpinned, we aren't flushing it.
- */
- xfs_buf_ioerror(bp, EIO);
-
- /*
- * We're calling xfs_buf_ioend, so delete XBF_DONE flag.
- */
- XFS_BUF_UNREAD(bp);
- XFS_BUF_UNDONE(bp);
- xfs_buf_stale(bp);
-
- xfs_buf_ioend(bp, 0);
-
- return EIO;
-}
-
-/*
- * Same as xfs_bioerror, except that we are releasing the buffer
- * here ourselves, and avoiding the xfs_buf_ioend call.
- * This is meant for userdata errors; metadata bufs come with
- * iodone functions attached, so that we can track down errors.
- */
-int
-xfs_bioerror_relse(
- struct xfs_buf *bp)
-{
- int64_t fl = bp->b_flags;
- /*
- * No need to wait until the buffer is unpinned.
- * We aren't flushing it.
- *
- * chunkhold expects B_DONE to be set, whether
- * we actually finish the I/O or not. We don't want to
- * change that interface.
- */
- XFS_BUF_UNREAD(bp);
- XFS_BUF_DONE(bp);
- xfs_buf_stale(bp);
- bp->b_iodone = NULL;
- if (!(fl & XBF_ASYNC)) {
- /*
- * Mark b_error and B_ERROR _both_.
- * Lot's of chunkcache code assumes that.
- * There's no reason to mark error for
- * ASYNC buffers.
- */
- xfs_buf_ioerror(bp, EIO);
- complete(&bp->b_iowait);
- } else {
- xfs_buf_relse(bp);
- }
-
- return EIO;
-}
-
-STATIC int
-xfs_bdstrat_cb(
- struct xfs_buf *bp)
-{
- if (XFS_FORCED_SHUTDOWN(bp->b_target->bt_mount)) {
- trace_xfs_bdstrat_shut(bp, _RET_IP_);
- /*
- * Metadata write that didn't get logged but
- * written delayed anyway. These aren't associated
- * with a transaction, and can be ignored.
- */
- if (!bp->b_iodone && !XFS_BUF_ISREAD(bp))
- return xfs_bioerror_relse(bp);
- else
- return xfs_bioerror(bp);
- }
-
- xfs_buf_iorequest(bp);
- return 0;
+ (__uint64_t)XFS_BUF_ADDR(bp), func, -bp->b_error, bp->b_length);
}
int
ASSERT(xfs_buf_islocked(bp));
bp->b_flags |= XBF_WRITE;
- bp->b_flags &= ~(XBF_ASYNC | XBF_READ | _XBF_DELWRI_Q | XBF_WRITE_FAIL);
+ bp->b_flags &= ~(XBF_ASYNC | XBF_READ | _XBF_DELWRI_Q |
+ XBF_WRITE_FAIL | XBF_DONE);
+
+ if (XFS_FORCED_SHUTDOWN(bp->b_target->bt_mount)) {
+ trace_xfs_bdstrat_shut(bp, _RET_IP_);
- xfs_bdstrat_cb(bp);
+ xfs_buf_ioerror(bp, -EIO);
+ xfs_buf_stale(bp);
+
+ /* sync IO, xfs_buf_ioend is going to remove a ref here */
+ xfs_buf_hold(bp);
+ xfs_buf_ioend(bp);
+ return -EIO;
+ }
+
+ xfs_buf_iorequest(bp);
error = xfs_buf_iowait(bp);
if (error) {
return error;
}
-STATIC void
-_xfs_buf_ioend(
- xfs_buf_t *bp,
- int schedule)
-{
- if (atomic_dec_and_test(&bp->b_io_remaining) == 1)
- xfs_buf_ioend(bp, schedule);
-}
-
STATIC void
xfs_buf_bio_end_io(
struct bio *bio,
* don't overwrite existing errors - otherwise we can lose errors on
* buffers that require multiple bios to complete.
*/
- if (!bp->b_error)
- xfs_buf_ioerror(bp, -error);
+ if (error) {
+ spin_lock(&bp->b_lock);
+ if (!bp->b_io_error)
+ bp->b_io_error = error;
+ spin_unlock(&bp->b_lock);
+ }
if (!bp->b_error && xfs_buf_is_vmapped(bp) && (bp->b_flags & XBF_READ))
invalidate_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp));
- _xfs_buf_ioend(bp, 1);
+ if (atomic_dec_and_test(&bp->b_io_remaining) == 1)
+ xfs_buf_ioend_async(bp);
bio_put(bio);
}
* because the caller (xfs_buf_iorequest) holds a count itself.
*/
atomic_dec(&bp->b_io_remaining);
- xfs_buf_ioerror(bp, EIO);
+ xfs_buf_ioerror(bp, -EIO);
bio_put(bio);
}
SHUTDOWN_CORRUPT_INCORE);
return;
}
+ } else if (bp->b_bn != XFS_BUF_DADDR_NULL) {
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+
+ /*
+ * non-crc filesystems don't attach verifiers during
+ * log recovery, so don't warn for such filesystems.
+ */
+ if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ xfs_warn(mp,
+ "%s: no ops on block 0x%llx/0x%x",
+ __func__, bp->b_bn, bp->b_length);
+ xfs_hex_dump(bp->b_addr, 64);
+ dump_stack();
+ }
}
} else if (bp->b_flags & XBF_READ_AHEAD) {
rw = READA;
if (bp->b_flags & XBF_WRITE)
xfs_buf_wait_unpin(bp);
+
+ /* clear the internal error state to avoid spurious errors */
+ bp->b_io_error = 0;
+
+ /*
+ * Take references to the buffer. For XBF_ASYNC buffers, holding a
+ * reference for as long as submission takes is all that is necessary
+ * here. The IO inherits the lock and hold count from the submitter,
+ * and these are release during IO completion processing. Taking a hold
+ * over submission ensures that the buffer is not freed until we have
+ * completed all processing, regardless of when IO errors occur or are
+ * reported.
+ *
+ * However, for synchronous IO, the IO does not inherit the submitters
+ * reference count, nor the buffer lock. Hence we need to take an extra
+ * reference to the buffer for the for the IO context so that we can
+ * guarantee the buffer is not freed until all IO completion processing
+ * is done. Otherwise the caller can drop their reference while the IO
+ * is still in progress and hence trigger a use-after-free situation.
+ */
xfs_buf_hold(bp);
+ if (!(bp->b_flags & XBF_ASYNC))
+ xfs_buf_hold(bp);
+
/*
- * Set the count to 1 initially, this will stop an I/O
- * completion callout which happens before we have started
- * all the I/O from calling xfs_buf_ioend too early.
+ * Set the count to 1 initially, this will stop an I/O completion
+ * callout which happens before we have started all the I/O from calling
+ * xfs_buf_ioend too early.
*/
atomic_set(&bp->b_io_remaining, 1);
_xfs_buf_ioapply(bp);
+
/*
- * If _xfs_buf_ioapply failed, we'll get back here with
- * only the reference we took above. _xfs_buf_ioend will
- * drop it to zero, so we'd better not queue it for later,
- * or we'll free it before it's done.
+ * If _xfs_buf_ioapply failed or we are doing synchronous IO that
+ * completes extremely quickly, we can get back here with only the IO
+ * reference we took above. If we drop it to zero, run completion
+ * processing synchronously so that we don't return to the caller with
+ * completion still pending. This avoids unnecessary context switches
+ * associated with the end_io workqueue.
*/
- _xfs_buf_ioend(bp, bp->b_error ? 0 : 1);
+ if (atomic_dec_and_test(&bp->b_io_remaining) == 1) {
+ if (bp->b_error || !(bp->b_flags & XBF_ASYNC))
+ xfs_buf_ioend(bp);
+ else
+ xfs_buf_ioend_async(bp);
+ }
xfs_buf_rele(bp);
}
xfs_warn(btp->bt_mount,
"Cannot set_blocksize to %u on device %s",
sectorsize, name);
- return EINVAL;
+ return -EINVAL;
}
/* Set up device logical sector size mask */
blk_start_plug(&plug);
list_for_each_entry_safe(bp, n, io_list, b_list) {
bp->b_flags &= ~(_XBF_DELWRI_Q | XBF_ASYNC | XBF_WRITE_FAIL);
- bp->b_flags |= XBF_WRITE;
+ bp->b_flags |= XBF_WRITE | XBF_ASYNC;
- if (!wait) {
- bp->b_flags |= XBF_ASYNC;
+ /*
+ * we do all Io submission async. This means if we need to wait
+ * for IO completion we need to take an extra reference so the
+ * buffer is still valid on the other side.
+ */
+ if (wait)
+ xfs_buf_hold(bp);
+ else
list_del_init(&bp->b_list);
+
+ if (XFS_FORCED_SHUTDOWN(bp->b_target->bt_mount)) {
+ trace_xfs_bdstrat_shut(bp, _RET_IP_);
+
+ xfs_buf_ioerror(bp, -EIO);
+ xfs_buf_stale(bp);
+ xfs_buf_ioend(bp);
+ continue;
}
- xfs_bdstrat_cb(bp);
+ xfs_buf_iorequest(bp);
}
blk_finish_plug(&plug);
bp = list_first_entry(&io_list, struct xfs_buf, b_list);
list_del_init(&bp->b_list);
- error2 = xfs_buf_iowait(bp);
+
+ /* locking the buffer will wait for async IO completion. */
+ xfs_buf_lock(bp);
+ error2 = bp->b_error;
xfs_buf_relse(bp);
if (!error)
error = error2;