Merge branch 'for-linus' of git://oss.sgi.com/xfs/xfs
authorLinus Torvalds <torvalds@linux-foundation.org>
Fri, 14 Jan 2011 23:24:17 +0000 (15:24 -0800)
committerLinus Torvalds <torvalds@linux-foundation.org>
Fri, 14 Jan 2011 23:24:17 +0000 (15:24 -0800)
* 'for-linus' of git://oss.sgi.com/xfs/xfs:
  xfs: prevent NMI timeouts in cmn_err
  xfs: Add log level to assertion printk
  xfs: fix an assignment within an ASSERT()
  xfs: fix error handling for synchronous writes
  xfs: add FITRIM support
  xfs: ensure log covering transactions are synchronous
  xfs: serialise unaligned direct IOs
  xfs: factor common write setup code
  xfs: split buffered IO write path from xfs_file_aio_write
  xfs: split direct IO write path from xfs_file_aio_write
  xfs: introduce xfs_rw_lock() helpers for locking the inode
  xfs: factor post-write newsize updates
  xfs: factor common post-write isize handling code
  xfs: ensure sync write errors are returned

23 files changed:
fs/xfs/Makefile
fs/xfs/linux-2.6/xfs_buf.c
fs/xfs/linux-2.6/xfs_buf.h
fs/xfs/linux-2.6/xfs_discard.c [new file with mode: 0644]
fs/xfs/linux-2.6/xfs_discard.h [new file with mode: 0644]
fs/xfs/linux-2.6/xfs_file.c
fs/xfs/linux-2.6/xfs_ioctl.c
fs/xfs/linux-2.6/xfs_super.c
fs/xfs/linux-2.6/xfs_sync.c
fs/xfs/linux-2.6/xfs_sysctl.c
fs/xfs/linux-2.6/xfs_trace.h
fs/xfs/support/debug.c
fs/xfs/support/debug.h
fs/xfs/xfs_alloc.c
fs/xfs/xfs_alloc.h
fs/xfs/xfs_buf_item.c
fs/xfs/xfs_error.c
fs/xfs/xfs_error.h
fs/xfs/xfs_fsops.c
fs/xfs/xfs_fsops.h
fs/xfs/xfs_log.c
fs/xfs/xfs_log_recover.c
fs/xfs/xfs_trans.c

index 0dce969..faca449 100644 (file)
@@ -98,6 +98,7 @@ xfs-y                         += $(addprefix $(XFS_LINUX)/, \
                                   kmem.o \
                                   xfs_aops.o \
                                   xfs_buf.o \
+                                  xfs_discard.o \
                                   xfs_export.o \
                                   xfs_file.o \
                                   xfs_fs_subr.o \
index 92f1f2a..ac1c7e8 100644 (file)
@@ -896,7 +896,6 @@ xfs_buf_rele(
        trace_xfs_buf_rele(bp, _RET_IP_);
 
        if (!pag) {
-               ASSERT(!bp->b_relse);
                ASSERT(list_empty(&bp->b_lru));
                ASSERT(RB_EMPTY_NODE(&bp->b_rbnode));
                if (atomic_dec_and_test(&bp->b_hold))
@@ -908,11 +907,7 @@ xfs_buf_rele(
 
        ASSERT(atomic_read(&bp->b_hold) > 0);
        if (atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock)) {
-               if (bp->b_relse) {
-                       atomic_inc(&bp->b_hold);
-                       spin_unlock(&pag->pag_buf_lock);
-                       bp->b_relse(bp);
-               } else if (!(bp->b_flags & XBF_STALE) &&
+               if (!(bp->b_flags & XBF_STALE) &&
                           atomic_read(&bp->b_lru_ref)) {
                        xfs_buf_lru_add(bp);
                        spin_unlock(&pag->pag_buf_lock);
index a76c242..cbe6595 100644 (file)
@@ -152,8 +152,6 @@ typedef struct xfs_buftarg {
 
 struct xfs_buf;
 typedef void (*xfs_buf_iodone_t)(struct xfs_buf *);
-typedef void (*xfs_buf_relse_t)(struct xfs_buf *);
-typedef int (*xfs_buf_bdstrat_t)(struct xfs_buf *);
 
 #define XB_PAGES       2
 
@@ -183,7 +181,6 @@ typedef struct xfs_buf {
        void                    *b_addr;        /* virtual address of buffer */
        struct work_struct      b_iodone_work;
        xfs_buf_iodone_t        b_iodone;       /* I/O completion function */
-       xfs_buf_relse_t         b_relse;        /* releasing function */
        struct completion       b_iowait;       /* queue for I/O waiters */
        void                    *b_fspriv;
        void                    *b_fspriv2;
@@ -323,7 +320,6 @@ void xfs_buf_stale(struct xfs_buf *bp);
 #define XFS_BUF_FSPRIVATE2(bp, type)           ((type)(bp)->b_fspriv2)
 #define XFS_BUF_SET_FSPRIVATE2(bp, val)                ((bp)->b_fspriv2 = (void*)(val))
 #define XFS_BUF_SET_START(bp)                  do { } while (0)
-#define XFS_BUF_SET_BRELSE_FUNC(bp, func)      ((bp)->b_relse = (func))
 
 #define XFS_BUF_PTR(bp)                        (xfs_caddr_t)((bp)->b_addr)
 #define XFS_BUF_SET_PTR(bp, val, cnt)  xfs_buf_associate_memory(bp, val, cnt)
@@ -360,8 +356,7 @@ xfs_buf_set_ref(
 
 static inline void xfs_buf_relse(xfs_buf_t *bp)
 {
-       if (!bp->b_relse)
-               xfs_buf_unlock(bp);
+       xfs_buf_unlock(bp);
        xfs_buf_rele(bp);
 }
 
diff --git a/fs/xfs/linux-2.6/xfs_discard.c b/fs/xfs/linux-2.6/xfs_discard.c
new file mode 100644 (file)
index 0000000..05201ae
--- /dev/null
@@ -0,0 +1,191 @@
+/*
+ * Copyright (C) 2010 Red Hat, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_sb.h"
+#include "xfs_inum.h"
+#include "xfs_log.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_quota.h"
+#include "xfs_trans.h"
+#include "xfs_alloc_btree.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_ialloc_btree.h"
+#include "xfs_btree.h"
+#include "xfs_inode.h"
+#include "xfs_alloc.h"
+#include "xfs_error.h"
+#include "xfs_discard.h"
+#include "xfs_trace.h"
+
+STATIC int
+xfs_trim_extents(
+       struct xfs_mount        *mp,
+       xfs_agnumber_t          agno,
+       xfs_fsblock_t           start,
+       xfs_fsblock_t           len,
+       xfs_fsblock_t           minlen,
+       __uint64_t              *blocks_trimmed)
+{
+       struct block_device     *bdev = mp->m_ddev_targp->bt_bdev;
+       struct xfs_btree_cur    *cur;
+       struct xfs_buf          *agbp;
+       struct xfs_perag        *pag;
+       int                     error;
+       int                     i;
+
+       pag = xfs_perag_get(mp, agno);
+
+       error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agbp);
+       if (error || !agbp)
+               goto out_put_perag;
+
+       cur = xfs_allocbt_init_cursor(mp, NULL, agbp, agno, XFS_BTNUM_CNT);
+
+       /*
+        * Force out the log.  This means any transactions that might have freed
+        * space before we took the AGF buffer lock are now on disk, and the
+        * volatile disk cache is flushed.
+        */
+       xfs_log_force(mp, XFS_LOG_SYNC);
+
+       /*
+        * Look up the longest extent in the AGF and start with it.
+        */
+       error = xfs_alloc_lookup_le(cur, 0,
+                                   XFS_BUF_TO_AGF(agbp)->agf_longest, &i);
+       if (error)
+               goto out_del_cursor;
+
+       /*
+        * Loop until we are done with all extents that are large
+        * enough to be worth discarding.
+        */
+       while (i) {
+               xfs_agblock_t fbno;
+               xfs_extlen_t flen;
+
+               error = xfs_alloc_get_rec(cur, &fbno, &flen, &i);
+               if (error)
+                       goto out_del_cursor;
+               XFS_WANT_CORRUPTED_GOTO(i == 1, out_del_cursor);
+               ASSERT(flen <= XFS_BUF_TO_AGF(agbp)->agf_longest);
+
+               /*
+                * Too small?  Give up.
+                */
+               if (flen < minlen) {
+                       trace_xfs_discard_toosmall(mp, agno, fbno, flen);
+                       goto out_del_cursor;
+               }
+
+               /*
+                * If the extent is entirely outside of the range we are
+                * supposed to discard, skip it.  Do not bother to trim
+                * down partially overlapping ranges for now.
+                */
+               if (XFS_AGB_TO_FSB(mp, agno, fbno) + flen < start ||
+                   XFS_AGB_TO_FSB(mp, agno, fbno) >= start + len) {
+                       trace_xfs_discard_exclude(mp, agno, fbno, flen);
+                       goto next_extent;
+               }
+
+               /*
+                * If any blocks in the range are still busy, skip the
+                * discard and try again the next time.
+                */
+               if (xfs_alloc_busy_search(mp, agno, fbno, flen)) {
+                       trace_xfs_discard_busy(mp, agno, fbno, flen);
+                       goto next_extent;
+               }
+
+               trace_xfs_discard_extent(mp, agno, fbno, flen);
+               error = -blkdev_issue_discard(bdev,
+                               XFS_AGB_TO_DADDR(mp, agno, fbno),
+                               XFS_FSB_TO_BB(mp, flen),
+                               GFP_NOFS, 0);
+               if (error)
+                       goto out_del_cursor;
+               *blocks_trimmed += flen;
+
+next_extent:
+               error = xfs_btree_decrement(cur, 0, &i);
+               if (error)
+                       goto out_del_cursor;
+       }
+
+out_del_cursor:
+       xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
+       xfs_buf_relse(agbp);
+out_put_perag:
+       xfs_perag_put(pag);
+       return error;
+}
+
+int
+xfs_ioc_trim(
+       struct xfs_mount                *mp,
+       struct fstrim_range __user      *urange)
+{
+       struct request_queue    *q = mp->m_ddev_targp->bt_bdev->bd_disk->queue;
+       unsigned int            granularity = q->limits.discard_granularity;
+       struct fstrim_range     range;
+       xfs_fsblock_t           start, len, minlen;
+       xfs_agnumber_t          start_agno, end_agno, agno;
+       __uint64_t              blocks_trimmed = 0;
+       int                     error, last_error = 0;
+
+       if (!capable(CAP_SYS_ADMIN))
+               return -XFS_ERROR(EPERM);
+       if (copy_from_user(&range, urange, sizeof(range)))
+               return -XFS_ERROR(EFAULT);
+
+       /*
+        * Truncating down the len isn't actually quite correct, but using
+        * XFS_B_TO_FSB would mean we trivially get overflows for values
+        * of ULLONG_MAX or slightly lower.  And ULLONG_MAX is the default
+        * used by the fstrim application.  In the end it really doesn't
+        * matter as trimming blocks is an advisory interface.
+        */
+       start = XFS_B_TO_FSBT(mp, range.start);
+       len = XFS_B_TO_FSBT(mp, range.len);
+       minlen = XFS_B_TO_FSB(mp, max_t(u64, granularity, range.minlen));
+
+       start_agno = XFS_FSB_TO_AGNO(mp, start);
+       if (start_agno >= mp->m_sb.sb_agcount)
+               return -XFS_ERROR(EINVAL);
+
+       end_agno = XFS_FSB_TO_AGNO(mp, start + len);
+       if (end_agno >= mp->m_sb.sb_agcount)
+               end_agno = mp->m_sb.sb_agcount - 1;
+
+       for (agno = start_agno; agno <= end_agno; agno++) {
+               error = -xfs_trim_extents(mp, agno, start, len, minlen,
+                                         &blocks_trimmed);
+               if (error)
+                       last_error = error;
+       }
+
+       if (last_error)
+               return last_error;
+
+       range.len = XFS_FSB_TO_B(mp, blocks_trimmed);
+       if (copy_to_user(urange, &range, sizeof(range)))
+               return -XFS_ERROR(EFAULT);
+       return 0;
+}
diff --git a/fs/xfs/linux-2.6/xfs_discard.h b/fs/xfs/linux-2.6/xfs_discard.h
new file mode 100644 (file)
index 0000000..e82b6dd
--- /dev/null
@@ -0,0 +1,8 @@
+#ifndef XFS_DISCARD_H
+#define XFS_DISCARD_H 1
+
+struct fstrim_range;
+
+extern int     xfs_ioc_trim(struct xfs_mount *, struct fstrim_range __user *);
+
+#endif /* XFS_DISCARD_H */
index ba8ad42..ef51eb4 100644 (file)
 
 static const struct vm_operations_struct xfs_file_vm_ops;
 
+/*
+ * Locking primitives for read and write IO paths to ensure we consistently use
+ * and order the inode->i_mutex, ip->i_lock and ip->i_iolock.
+ */
+static inline void
+xfs_rw_ilock(
+       struct xfs_inode        *ip,
+       int                     type)
+{
+       if (type & XFS_IOLOCK_EXCL)
+               mutex_lock(&VFS_I(ip)->i_mutex);
+       xfs_ilock(ip, type);
+}
+
+static inline void
+xfs_rw_iunlock(
+       struct xfs_inode        *ip,
+       int                     type)
+{
+       xfs_iunlock(ip, type);
+       if (type & XFS_IOLOCK_EXCL)
+               mutex_unlock(&VFS_I(ip)->i_mutex);
+}
+
+static inline void
+xfs_rw_ilock_demote(
+       struct xfs_inode        *ip,
+       int                     type)
+{
+       xfs_ilock_demote(ip, type);
+       if (type & XFS_IOLOCK_EXCL)
+               mutex_unlock(&VFS_I(ip)->i_mutex);
+}
+
 /*
  *     xfs_iozero
  *
@@ -262,22 +296,21 @@ xfs_file_aio_read(
        if (XFS_FORCED_SHUTDOWN(mp))
                return -EIO;
 
-       if (unlikely(ioflags & IO_ISDIRECT))
-               mutex_lock(&inode->i_mutex);
-       xfs_ilock(ip, XFS_IOLOCK_SHARED);
-
        if (unlikely(ioflags & IO_ISDIRECT)) {
+               xfs_rw_ilock(ip, XFS_IOLOCK_EXCL);
+
                if (inode->i_mapping->nrpages) {
                        ret = -xfs_flushinval_pages(ip,
                                        (iocb->ki_pos & PAGE_CACHE_MASK),
                                        -1, FI_REMAPF_LOCKED);
+                       if (ret) {
+                               xfs_rw_iunlock(ip, XFS_IOLOCK_EXCL);
+                               return ret;
+                       }
                }
-               mutex_unlock(&inode->i_mutex);
-               if (ret) {
-                       xfs_iunlock(ip, XFS_IOLOCK_SHARED);
-                       return ret;
-               }
-       }
+               xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
+       } else
+               xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
 
        trace_xfs_file_read(ip, size, iocb->ki_pos, ioflags);
 
@@ -285,7 +318,7 @@ xfs_file_aio_read(
        if (ret > 0)
                XFS_STATS_ADD(xs_read_bytes, ret);
 
-       xfs_iunlock(ip, XFS_IOLOCK_SHARED);
+       xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
        return ret;
 }
 
@@ -309,7 +342,7 @@ xfs_file_splice_read(
        if (XFS_FORCED_SHUTDOWN(ip->i_mount))
                return -EIO;
 
-       xfs_ilock(ip, XFS_IOLOCK_SHARED);
+       xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
 
        trace_xfs_file_splice_read(ip, count, *ppos, ioflags);
 
@@ -317,10 +350,61 @@ xfs_file_splice_read(
        if (ret > 0)
                XFS_STATS_ADD(xs_read_bytes, ret);
 
-       xfs_iunlock(ip, XFS_IOLOCK_SHARED);
+       xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
        return ret;
 }
 
+STATIC void
+xfs_aio_write_isize_update(
+       struct inode    *inode,
+       loff_t          *ppos,
+       ssize_t         bytes_written)
+{
+       struct xfs_inode        *ip = XFS_I(inode);
+       xfs_fsize_t             isize = i_size_read(inode);
+
+       if (bytes_written > 0)
+               XFS_STATS_ADD(xs_write_bytes, bytes_written);
+
+       if (unlikely(bytes_written < 0 && bytes_written != -EFAULT &&
+                                       *ppos > isize))
+               *ppos = isize;
+
+       if (*ppos > ip->i_size) {
+               xfs_rw_ilock(ip, XFS_ILOCK_EXCL);
+               if (*ppos > ip->i_size)
+                       ip->i_size = *ppos;
+               xfs_rw_iunlock(ip, XFS_ILOCK_EXCL);
+       }
+}
+
+/*
+ * If this was a direct or synchronous I/O that failed (such as ENOSPC) then
+ * part of the I/O may have been written to disk before the error occurred.  In
+ * this case the on-disk file size may have been adjusted beyond the in-memory
+ * file size and now needs to be truncated back.
+ */
+STATIC void
+xfs_aio_write_newsize_update(
+       struct xfs_inode        *ip)
+{
+       if (ip->i_new_size) {
+               xfs_rw_ilock(ip, XFS_ILOCK_EXCL);
+               ip->i_new_size = 0;
+               if (ip->i_d.di_size > ip->i_size)
+                       ip->i_d.di_size = ip->i_size;
+               xfs_rw_iunlock(ip, XFS_ILOCK_EXCL);
+       }
+}
+
+/*
+ * xfs_file_splice_write() does not use xfs_rw_ilock() because
+ * generic_file_splice_write() takes the i_mutex itself. This, in theory,
+ * could cause lock inversions between the aio_write path and the splice path
+ * if someone is doing concurrent splice(2) based writes and write(2) based
+ * writes to the same inode. The only real way to fix this is to re-implement
+ * the generic code here with correct locking orders.
+ */
 STATIC ssize_t
 xfs_file_splice_write(
        struct pipe_inode_info  *pipe,
@@ -331,7 +415,7 @@ xfs_file_splice_write(
 {
        struct inode            *inode = outfilp->f_mapping->host;
        struct xfs_inode        *ip = XFS_I(inode);
-       xfs_fsize_t             isize, new_size;
+       xfs_fsize_t             new_size;
        int                     ioflags = 0;
        ssize_t                 ret;
 
@@ -355,27 +439,9 @@ xfs_file_splice_write(
        trace_xfs_file_splice_write(ip, count, *ppos, ioflags);
 
        ret = generic_file_splice_write(pipe, outfilp, ppos, count, flags);
-       if (ret > 0)
-               XFS_STATS_ADD(xs_write_bytes, ret);
-
-       isize = i_size_read(inode);
-       if (unlikely(ret < 0 && ret != -EFAULT && *ppos > isize))
-               *ppos = isize;
 
-       if (*ppos > ip->i_size) {
-               xfs_ilock(ip, XFS_ILOCK_EXCL);
-               if (*ppos > ip->i_size)
-                       ip->i_size = *ppos;
-               xfs_iunlock(ip, XFS_ILOCK_EXCL);
-       }
-
-       if (ip->i_new_size) {
-               xfs_ilock(ip, XFS_ILOCK_EXCL);
-               ip->i_new_size = 0;
-               if (ip->i_d.di_size > ip->i_size)
-                       ip->i_d.di_size = ip->i_size;
-               xfs_iunlock(ip, XFS_ILOCK_EXCL);
-       }
+       xfs_aio_write_isize_update(inode, ppos, ret);
+       xfs_aio_write_newsize_update(ip);
        xfs_iunlock(ip, XFS_IOLOCK_EXCL);
        return ret;
 }
@@ -562,245 +628,258 @@ out_lock:
        return error;
 }
 
+/*
+ * Common pre-write limit and setup checks.
+ *
+ * Returns with iolock held according to @iolock.
+ */
 STATIC ssize_t
-xfs_file_aio_write(
-       struct kiocb            *iocb,
-       const struct iovec      *iovp,
-       unsigned long           nr_segs,
-       loff_t                  pos)
+xfs_file_aio_write_checks(
+       struct file             *file,
+       loff_t                  *pos,
+       size_t                  *count,
+       int                     *iolock)
 {
-       struct file             *file = iocb->ki_filp;
-       struct address_space    *mapping = file->f_mapping;
-       struct inode            *inode = mapping->host;
+       struct inode            *inode = file->f_mapping->host;
        struct xfs_inode        *ip = XFS_I(inode);
-       struct xfs_mount        *mp = ip->i_mount;
-       ssize_t                 ret = 0, error = 0;
-       int                     ioflags = 0;
-       xfs_fsize_t             isize, new_size;
-       int                     iolock;
-       size_t                  ocount = 0, count;
-       int                     need_i_mutex;
+       xfs_fsize_t             new_size;
+       int                     error = 0;
 
-       XFS_STATS_INC(xs_write_calls);
+       error = generic_write_checks(file, pos, count, S_ISBLK(inode->i_mode));
+       if (error) {
+               xfs_rw_iunlock(ip, XFS_ILOCK_EXCL | *iolock);
+               *iolock = 0;
+               return error;
+       }
 
-       BUG_ON(iocb->ki_pos != pos);
+       new_size = *pos + *count;
+       if (new_size > ip->i_size)
+               ip->i_new_size = new_size;
 
-       if (unlikely(file->f_flags & O_DIRECT))
-               ioflags |= IO_ISDIRECT;
-       if (file->f_mode & FMODE_NOCMTIME)
-               ioflags |= IO_INVIS;
+       if (likely(!(file->f_mode & FMODE_NOCMTIME)))
+               file_update_time(file);
 
-       error = generic_segment_checks(iovp, &nr_segs, &ocount, VERIFY_READ);
+       /*
+        * If the offset is beyond the size of the file, we need to zero any
+        * blocks that fall between the existing EOF and the start of this
+        * write.
+        */
+       if (*pos > ip->i_size)
+               error = -xfs_zero_eof(ip, *pos, ip->i_size);
+
+       xfs_rw_iunlock(ip, XFS_ILOCK_EXCL);
        if (error)
                return error;
 
-       count = ocount;
-       if (count == 0)
-               return 0;
-
-       xfs_wait_for_freeze(mp, SB_FREEZE_WRITE);
-
-       if (XFS_FORCED_SHUTDOWN(mp))
-               return -EIO;
-
-relock:
-       if (ioflags & IO_ISDIRECT) {
-               iolock = XFS_IOLOCK_SHARED;
-               need_i_mutex = 0;
-       } else {
-               iolock = XFS_IOLOCK_EXCL;
-               need_i_mutex = 1;
-               mutex_lock(&inode->i_mutex);
-       }
+       /*
+        * If we're writing the file then make sure to clear the setuid and
+        * setgid bits if the process is not being run by root.  This keeps
+        * people from modifying setuid and setgid binaries.
+        */
+       return file_remove_suid(file);
 
-       xfs_ilock(ip, XFS_ILOCK_EXCL|iolock);
+}
 
-start:
-       error = -generic_write_checks(file, &pos, &count,
-                                       S_ISBLK(inode->i_mode));
-       if (error) {
-               xfs_iunlock(ip, XFS_ILOCK_EXCL|iolock);
-               goto out_unlock_mutex;
+/*
+ * xfs_file_dio_aio_write - handle direct IO writes
+ *
+ * Lock the inode appropriately to prepare for and issue a direct IO write.
+ * By separating it from the buffered write path we remove all the tricky to
+ * follow locking changes and looping.
+ *
+ * If there are cached pages or we're extending the file, we need IOLOCK_EXCL
+ * until we're sure the bytes at the new EOF have been zeroed and/or the cached
+ * pages are flushed out.
+ *
+ * In most cases the direct IO writes will be done holding IOLOCK_SHARED
+ * allowing them to be done in parallel with reads and other direct IO writes.
+ * However, if the IO is not aligned to filesystem blocks, the direct IO layer
+ * needs to do sub-block zeroing and that requires serialisation against other
+ * direct IOs to the same block. In this case we need to serialise the
+ * submission of the unaligned IOs so that we don't get racing block zeroing in
+ * the dio layer.  To avoid the problem with aio, we also need to wait for
+ * outstanding IOs to complete so that unwritten extent conversion is completed
+ * before we try to map the overlapping block. This is currently implemented by
+ * hitting it with a big hammer (i.e. xfs_ioend_wait()).
+ *
+ * Returns with locks held indicated by @iolock and errors indicated by
+ * negative return values.
+ */
+STATIC ssize_t
+xfs_file_dio_aio_write(
+       struct kiocb            *iocb,
+       const struct iovec      *iovp,
+       unsigned long           nr_segs,
+       loff_t                  pos,
+       size_t                  ocount,
+       int                     *iolock)
+{
+       struct file             *file = iocb->ki_filp;
+       struct address_space    *mapping = file->f_mapping;
+       struct inode            *inode = mapping->host;
+       struct xfs_inode        *ip = XFS_I(inode);
+       struct xfs_mount        *mp = ip->i_mount;
+       ssize_t                 ret = 0;
+       size_t                  count = ocount;
+       int                     unaligned_io = 0;
+       struct xfs_buftarg      *target = XFS_IS_REALTIME_INODE(ip) ?
+                                       mp->m_rtdev_targp : mp->m_ddev_targp;
+
+       *iolock = 0;
+       if ((pos & target->bt_smask) || (count & target->bt_smask))
+               return -XFS_ERROR(EINVAL);
+
+       if ((pos & mp->m_blockmask) || ((pos + count) & mp->m_blockmask))
+               unaligned_io = 1;
+
+       if (unaligned_io || mapping->nrpages || pos > ip->i_size)
+               *iolock = XFS_IOLOCK_EXCL;
+       else
+               *iolock = XFS_IOLOCK_SHARED;
+       xfs_rw_ilock(ip, XFS_ILOCK_EXCL | *iolock);
+
+       ret = xfs_file_aio_write_checks(file, &pos, &count, iolock);
+       if (ret)
+               return ret;
+
+       if (mapping->nrpages) {
+               WARN_ON(*iolock != XFS_IOLOCK_EXCL);
+               ret = -xfs_flushinval_pages(ip, (pos & PAGE_CACHE_MASK), -1,
+                                                       FI_REMAPF_LOCKED);
+               if (ret)
+                       return ret;
        }
 
-       if (ioflags & IO_ISDIRECT) {
-               xfs_buftarg_t   *target =
-                       XFS_IS_REALTIME_INODE(ip) ?
-                               mp->m_rtdev_targp : mp->m_ddev_targp;
-
-               if ((pos & target->bt_smask) || (count & target->bt_smask)) {
-                       xfs_iunlock(ip, XFS_ILOCK_EXCL|iolock);
-                       return XFS_ERROR(-EINVAL);
-               }
-
-               if (!need_i_mutex && (mapping->nrpages || pos > ip->i_size)) {
-                       xfs_iunlock(ip, XFS_ILOCK_EXCL|iolock);
-                       iolock = XFS_IOLOCK_EXCL;
-                       need_i_mutex = 1;
-                       mutex_lock(&inode->i_mutex);
-                       xfs_ilock(ip, XFS_ILOCK_EXCL|iolock);
-                       goto start;
-               }
+       /*
+        * If we are doing unaligned IO, wait for all other IO to drain,
+        * otherwise demote the lock if we had to flush cached pages
+        */
+       if (unaligned_io)
+               xfs_ioend_wait(ip);
+       else if (*iolock == XFS_IOLOCK_EXCL) {
+               xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
+               *iolock = XFS_IOLOCK_SHARED;
        }
 
-       new_size = pos + count;
-       if (new_size > ip->i_size)
-               ip->i_new_size = new_size;
+       trace_xfs_file_direct_write(ip, count, iocb->ki_pos, 0);
+       ret = generic_file_direct_write(iocb, iovp,
+                       &nr_segs, pos, &iocb->ki_pos, count, ocount);
 
-       if (likely(!(ioflags & IO_INVIS)))
-               file_update_time(file);
+       /* No fallback to buffered IO on errors for XFS. */
+       ASSERT(ret < 0 || ret == count);
+       return ret;
+}
 
-       /*
-        * If the offset is beyond the size of the file, we have a couple
-        * of things to do. First, if there is already space allocated
-        * we need to either create holes or zero the disk or ...
-        *
-        * If there is a page where the previous size lands, we need
-        * to zero it out up to the new size.
-        */
+STATIC ssize_t
+xfs_file_buffered_aio_write(
+       struct kiocb            *iocb,
+       const struct iovec      *iovp,
+       unsigned long           nr_segs,
+       loff_t                  pos,
+       size_t                  ocount,
+       int                     *iolock)
+{
+       struct file             *file = iocb->ki_filp;
+       struct address_space    *mapping = file->f_mapping;
+       struct inode            *inode = mapping->host;
+       struct xfs_inode        *ip = XFS_I(inode);
+       ssize_t                 ret;
+       int                     enospc = 0;
+       size_t                  count = ocount;
 
-       if (pos > ip->i_size) {
-               error = xfs_zero_eof(ip, pos, ip->i_size);
-               if (error) {
-                       xfs_iunlock(ip, XFS_ILOCK_EXCL);
-                       goto out_unlock_internal;
-               }
-       }
-       xfs_iunlock(ip, XFS_ILOCK_EXCL);
+       *iolock = XFS_IOLOCK_EXCL;
+       xfs_rw_ilock(ip, XFS_ILOCK_EXCL | *iolock);
 
-       /*
-        * If we're writing the file then make sure to clear the
-        * setuid and setgid bits if the process is not being run
-        * by root.  This keeps people from modifying setuid and
-        * setgid binaries.
-        */
-       error = -file_remove_suid(file);
-       if (unlikely(error))
-               goto out_unlock_internal;
+       ret = xfs_file_aio_write_checks(file, &pos, &count, iolock);
+       if (ret)
+               return ret;
 
        /* We can write back this queue in page reclaim */
        current->backing_dev_info = mapping->backing_dev_info;
 
-       if ((ioflags & IO_ISDIRECT)) {
-               if (mapping->nrpages) {
-                       WARN_ON(need_i_mutex == 0);
-                       error = xfs_flushinval_pages(ip,
-                                       (pos & PAGE_CACHE_MASK),
-                                       -1, FI_REMAPF_LOCKED);
-                       if (error)
-                               goto out_unlock_internal;
-               }
-
-               if (need_i_mutex) {
-                       /* demote the lock now the cached pages are gone */
-                       xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
-                       mutex_unlock(&inode->i_mutex);
+write_retry:
+       trace_xfs_file_buffered_write(ip, count, iocb->ki_pos, 0);
+       ret = generic_file_buffered_write(iocb, iovp, nr_segs,
+                       pos, &iocb->ki_pos, count, ret);
+       /*
+        * if we just got an ENOSPC, flush the inode now we aren't holding any
+        * page locks and retry *once*
+        */
+       if (ret == -ENOSPC && !enospc) {
+               ret = -xfs_flush_pages(ip, 0, -1, 0, FI_NONE);
+               if (ret)
+                       return ret;
+               enospc = 1;
+               goto write_retry;
+       }
+       current->backing_dev_info = NULL;
+       return ret;
+}
 
-                       iolock = XFS_IOLOCK_SHARED;
-                       need_i_mutex = 0;
-               }
+STATIC ssize_t
+xfs_file_aio_write(
+       struct kiocb            *iocb,
+       const struct iovec      *iovp,
+       unsigned long           nr_segs,
+       loff_t                  pos)
+{
+       struct file             *file = iocb->ki_filp;
+       struct address_space    *mapping = file->f_mapping;
+       struct inode            *inode = mapping->host;
+       struct xfs_inode        *ip = XFS_I(inode);
+       ssize_t                 ret;
+       int                     iolock;
+       size_t                  ocount = 0;
 
-               trace_xfs_file_direct_write(ip, count, iocb->ki_pos, ioflags);
-               ret = generic_file_direct_write(iocb, iovp,
-                               &nr_segs, pos, &iocb->ki_pos, count, ocount);
+       XFS_STATS_INC(xs_write_calls);
 
-               /*
-                * direct-io write to a hole: fall through to buffered I/O
-                * for completing the rest of the request.
-                */
-               if (ret >= 0 && ret != count) {
-                       XFS_STATS_ADD(xs_write_bytes, ret);
+       BUG_ON(iocb->ki_pos != pos);
 
-                       pos += ret;
-                       count -= ret;
+       ret = generic_segment_checks(iovp, &nr_segs, &ocount, VERIFY_READ);
+       if (ret)
+               return ret;
 
-                       ioflags &= ~IO_ISDIRECT;
-                       xfs_iunlock(ip, iolock);
-                       goto relock;
-               }
-       } else {
-               int enospc = 0;
-               ssize_t ret2 = 0;
+       if (ocount == 0)
+               return 0;
 
-write_retry:
-               trace_xfs_file_buffered_write(ip, count, iocb->ki_pos, ioflags);
-               ret2 = generic_file_buffered_write(iocb, iovp, nr_segs,
-                               pos, &iocb->ki_pos, count, ret);
-               /*
-                * if we just got an ENOSPC, flush the inode now we
-                * aren't holding any page locks and retry *once*
-                */
-               if (ret2 == -ENOSPC && !enospc) {
-                       error = xfs_flush_pages(ip, 0, -1, 0, FI_NONE);
-                       if (error)
-                               goto out_unlock_internal;
-                       enospc = 1;
-                       goto write_retry;
-               }
-               ret = ret2;
-       }
+       xfs_wait_for_freeze(ip->i_mount, SB_FREEZE_WRITE);
 
-       current->backing_dev_info = NULL;
+       if (XFS_FORCED_SHUTDOWN(ip->i_mount))
+               return -EIO;
 
-       isize = i_size_read(inode);
-       if (unlikely(ret < 0 && ret != -EFAULT && iocb->ki_pos > isize))
-               iocb->ki_pos = isize;
+       if (unlikely(file->f_flags & O_DIRECT))
+               ret = xfs_file_dio_aio_write(iocb, iovp, nr_segs, pos,
+                                               ocount, &iolock);
+       else
+               ret = xfs_file_buffered_aio_write(iocb, iovp, nr_segs, pos,
+                                               ocount, &iolock);
 
-       if (iocb->ki_pos > ip->i_size) {
-               xfs_ilock(ip, XFS_ILOCK_EXCL);
-               if (iocb->ki_pos > ip->i_size)
-                       ip->i_size = iocb->ki_pos;
-               xfs_iunlock(ip, XFS_ILOCK_EXCL);
-       }
+       xfs_aio_write_isize_update(inode, &iocb->ki_pos, ret);
 
-       error = -ret;
        if (ret <= 0)
-               goto out_unlock_internal;
-
-       XFS_STATS_ADD(xs_write_bytes, ret);
+               goto out_unlock;
 
        /* Handle various SYNC-type writes */
        if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) {
                loff_t end = pos + ret - 1;
-               int error2;
-
-               xfs_iunlock(ip, iolock);
-               if (need_i_mutex)
-                       mutex_unlock(&inode->i_mutex);
+               int error, error2;
 
-               error2 = filemap_write_and_wait_range(mapping, pos, end);
-               if (!error)
-                       error = error2;
-               if (need_i_mutex)
-                       mutex_lock(&inode->i_mutex);
-               xfs_ilock(ip, iolock);
+               xfs_rw_iunlock(ip, iolock);
+               error = filemap_write_and_wait_range(mapping, pos, end);
+               xfs_rw_ilock(ip, iolock);
 
                error2 = -xfs_file_fsync(file,
                                         (file->f_flags & __O_SYNC) ? 0 : 1);
-               if (!error)
-                       error = error2;
+               if (error)
+                       ret = error;
+               else if (error2)
+                       ret = error2;
        }
 
- out_unlock_internal:
-       if (ip->i_new_size) {
-               xfs_ilock(ip, XFS_ILOCK_EXCL);
-               ip->i_new_size = 0;
-               /*
-                * If this was a direct or synchronous I/O that failed (such
-                * as ENOSPC) then part of the I/O may have been written to
-                * disk before the error occured.  In this case the on-disk
-                * file size may have been adjusted beyond the in-memory file
-                * size and now needs to be truncated back.
-                */
-               if (ip->i_d.di_size > ip->i_size)
-                       ip->i_d.di_size = ip->i_size;
-               xfs_iunlock(ip, XFS_ILOCK_EXCL);
-       }
-       xfs_iunlock(ip, iolock);
- out_unlock_mutex:
-       if (need_i_mutex)
-               mutex_unlock(&inode->i_mutex);
-       return -error;
+out_unlock:
+       xfs_aio_write_newsize_update(ip);
+       xfs_rw_iunlock(ip, iolock);
+       return ret;
 }
 
 STATIC int
index ad442d9..b06ede1 100644 (file)
@@ -39,6 +39,7 @@
 #include "xfs_dfrag.h"
 #include "xfs_fsops.h"
 #include "xfs_vnodeops.h"
+#include "xfs_discard.h"
 #include "xfs_quota.h"
 #include "xfs_inode_item.h"
 #include "xfs_export.h"
@@ -1294,6 +1295,8 @@ xfs_file_ioctl(
        trace_xfs_file_ioctl(ip);
 
        switch (cmd) {
+       case FITRIM:
+               return xfs_ioc_trim(mp, arg);
        case XFS_IOC_ALLOCSP:
        case XFS_IOC_FREESP:
        case XFS_IOC_RESVSP:
index bd07f73..9731898 100644 (file)
@@ -1414,7 +1414,7 @@ xfs_fs_freeze(
 
        xfs_save_resvblks(mp);
        xfs_quiesce_attr(mp);
-       return -xfs_fs_log_dummy(mp, SYNC_WAIT);
+       return -xfs_fs_log_dummy(mp);
 }
 
 STATIC int
index a02480d..e22f005 100644 (file)
@@ -362,7 +362,7 @@ xfs_quiesce_data(
 
        /* mark the log as covered if needed */
        if (xfs_log_need_covered(mp))
-               error2 = xfs_fs_log_dummy(mp, SYNC_WAIT);
+               error2 = xfs_fs_log_dummy(mp);
 
        /* flush data-only devices */
        if (mp->m_rtdev_targp)
@@ -503,13 +503,14 @@ xfs_sync_worker(
        int             error;
 
        if (!(mp->m_flags & XFS_MOUNT_RDONLY)) {
-               xfs_log_force(mp, 0);
-               xfs_reclaim_inodes(mp, 0);
                /* dgc: errors ignored here */
-               error = xfs_qm_sync(mp, SYNC_TRYLOCK);
                if (mp->m_super->s_frozen == SB_UNFROZEN &&
                    xfs_log_need_covered(mp))
-                       error = xfs_fs_log_dummy(mp, 0);
+                       error = xfs_fs_log_dummy(mp);
+               else
+                       xfs_log_force(mp, 0);
+               xfs_reclaim_inodes(mp, 0);
+               error = xfs_qm_sync(mp, SYNC_TRYLOCK);
        }
        mp->m_sync_seq++;
        wake_up(&mp->m_wait_single_sync_task);
index 7bb5092..ee3cee0 100644 (file)
@@ -18,6 +18,7 @@
 #include "xfs.h"
 #include <linux/sysctl.h>
 #include <linux/proc_fs.h>
+#include "xfs_error.h"
 
 static struct ctl_table_header *xfs_table_header;
 
@@ -51,6 +52,26 @@ xfs_stats_clear_proc_handler(
 
        return ret;
 }
+
+STATIC int
+xfs_panic_mask_proc_handler(
+       ctl_table       *ctl,
+       int             write,
+       void            __user *buffer,
+       size_t          *lenp,
+       loff_t          *ppos)
+{
+       int             ret, *valp = ctl->data;
+
+       ret = proc_dointvec_minmax(ctl, write, buffer, lenp, ppos);
+       if (!ret && write) {
+               xfs_panic_mask = *valp;
+#ifdef DEBUG
+               xfs_panic_mask |= (XFS_PTAG_SHUTDOWN_CORRUPT | XFS_PTAG_LOGRES);
+#endif
+       }
+       return ret;
+}
 #endif /* CONFIG_PROC_FS */
 
 static ctl_table xfs_table[] = {
@@ -77,7 +98,7 @@ static ctl_table xfs_table[] = {
                .data           = &xfs_params.panic_mask.val,
                .maxlen         = sizeof(int),
                .mode           = 0644,
-               .proc_handler   = proc_dointvec_minmax,
+               .proc_handler   = xfs_panic_mask_proc_handler,
                .extra1         = &xfs_params.panic_mask.min,
                .extra2         = &xfs_params.panic_mask.max
        },
index 647af2a..2d0bcb4 100644 (file)
@@ -1759,6 +1759,39 @@ DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_recover);
 DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_cancel);
 DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_skip);
 
+DECLARE_EVENT_CLASS(xfs_discard_class,
+       TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
+                xfs_agblock_t agbno, xfs_extlen_t len),
+       TP_ARGS(mp, agno, agbno, len),
+       TP_STRUCT__entry(
+               __field(dev_t, dev)
+               __field(xfs_agnumber_t, agno)
+               __field(xfs_agblock_t, agbno)
+               __field(xfs_extlen_t, len)
+       ),
+       TP_fast_assign(
+               __entry->dev = mp->m_super->s_dev;
+               __entry->agno = agno;
+               __entry->agbno = agbno;
+               __entry->len = len;
+       ),
+       TP_printk("dev %d:%d agno %u agbno %u len %u\n",
+                 MAJOR(__entry->dev), MINOR(__entry->dev),
+                 __entry->agno,
+                 __entry->agbno,
+                 __entry->len)
+)
+
+#define DEFINE_DISCARD_EVENT(name) \
+DEFINE_EVENT(xfs_discard_class, name, \
+       TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \
+                xfs_agblock_t agbno, xfs_extlen_t len), \
+       TP_ARGS(mp, agno, agbno, len))
+DEFINE_DISCARD_EVENT(xfs_discard_extent);
+DEFINE_DISCARD_EVENT(xfs_discard_toosmall);
+DEFINE_DISCARD_EVENT(xfs_discard_exclude);
+DEFINE_DISCARD_EVENT(xfs_discard_busy);
+
 #endif /* _TRACE_XFS_H */
 
 #undef TRACE_INCLUDE_PATH
index 975aa10..e6cf955 100644 (file)
 #include "xfs_mount.h"
 #include "xfs_error.h"
 
-static char            message[1024];  /* keep it off the stack */
-static DEFINE_SPINLOCK(xfs_err_lock);
-
-/* Translate from CE_FOO to KERN_FOO, err_level(CE_FOO) == KERN_FOO */
-#define XFS_MAX_ERR_LEVEL      7
-#define XFS_ERR_MASK           ((1 << 3) - 1)
-static const char * const      err_level[XFS_MAX_ERR_LEVEL+1] =
-                                       {KERN_EMERG, KERN_ALERT, KERN_CRIT,
-                                        KERN_ERR, KERN_WARNING, KERN_NOTICE,
-                                        KERN_INFO, KERN_DEBUG};
-
 void
-cmn_err(register int level, char *fmt, ...)
+cmn_err(
+       const char      *lvl,
+       const char      *fmt,
+       ...)
 {
-       char    *fp = fmt;
-       int     len;
-       ulong   flags;
-       va_list ap;
-
-       level &= XFS_ERR_MASK;
-       if (level > XFS_MAX_ERR_LEVEL)
-               level = XFS_MAX_ERR_LEVEL;
-       spin_lock_irqsave(&xfs_err_lock,flags);
-       va_start(ap, fmt);
-       if (*fmt == '!') fp++;
-       len = vsnprintf(message, sizeof(message), fp, ap);
-       if (len >= sizeof(message))
-               len = sizeof(message) - 1;
-       if (message[len-1] == '\n')
-               message[len-1] = 0;
-       printk("%s%s\n", err_level[level], message);
-       va_end(ap);
-       spin_unlock_irqrestore(&xfs_err_lock,flags);
-       BUG_ON(level == CE_PANIC);
+       struct va_format vaf;
+       va_list         args;
+
+       va_start(args, fmt);
+       vaf.fmt = fmt;
+       vaf.va = &args;
+
+       printk("%s%pV", lvl, &vaf);
+       va_end(args);
+
+       BUG_ON(strncmp(lvl, KERN_EMERG, strlen(KERN_EMERG)) == 0);
 }
 
 void
-xfs_fs_vcmn_err(
-       int                     level,
+xfs_fs_cmn_err(
+       const char              *lvl,
        struct xfs_mount        *mp,
-       char                    *fmt,
-       va_list                 ap)
+       const char              *fmt,
+       ...)
 {
-       unsigned long           flags;
-       int                     len = 0;
+       struct va_format        vaf;
+       va_list                 args;
 
-       level &= XFS_ERR_MASK;
-       if (level > XFS_MAX_ERR_LEVEL)
-               level = XFS_MAX_ERR_LEVEL;
+       va_start(args, fmt);
+       vaf.fmt = fmt;
+       vaf.va = &args;
 
-       spin_lock_irqsave(&xfs_err_lock,flags);
+       printk("%sFilesystem %s: %pV", lvl, mp->m_fsname, &vaf);
+       va_end(args);
 
-       if (mp) {
-               len = sprintf(message, "Filesystem \"%s\": ", mp->m_fsname);
+       BUG_ON(strncmp(lvl, KERN_EMERG, strlen(KERN_EMERG)) == 0);
+}
+
+/* All callers of xfs_cmn_err use CE_ALERT, so don't bother testing lvl */
+void
+xfs_cmn_err(
+       int                     panic_tag,
+       const char              *lvl,
+       struct xfs_mount        *mp,
+       const char              *fmt,
+       ...)
+{
+       struct va_format        vaf;
+       va_list                 args;
+       int                     panic = 0;
 
-               /*
-                * Skip the printk if we can't print anything useful
-                * due to an over-long device name.
-                */
-               if (len >= sizeof(message))
-                       goto out;
+       if (xfs_panic_mask && (xfs_panic_mask & panic_tag)) {
+               printk(KERN_ALERT "XFS: Transforming an alert into a BUG.");
+               panic = 1;
        }
 
-       len = vsnprintf(message + len, sizeof(message) - len, fmt, ap);
-       if (len >= sizeof(message))
-               len = sizeof(message) - 1;
-       if (message[len-1] == '\n')
-               message[len-1] = 0;
+       va_start(args, fmt);
+       vaf.fmt = fmt;
+       vaf.va = &args;
 
-       printk("%s%s\n", err_level[level], message);
- out:
-       spin_unlock_irqrestore(&xfs_err_lock,flags);
+       printk(KERN_ALERT "Filesystem %s: %pV", mp->m_fsname, &vaf);
+       va_end(args);
 
-       BUG_ON(level == CE_PANIC);
+       BUG_ON(panic);
 }
 
 void
 assfail(char *expr, char *file, int line)
 {
-       printk("Assertion failed: %s, file: %s, line: %d\n", expr, file, line);
+       printk(KERN_CRIT "Assertion failed: %s, file: %s, line: %d\n", expr,
+              file, line);
        BUG();
 }
 
index d2d2046..05699f6 100644 (file)
 
 #include <stdarg.h>
 
-#define CE_DEBUG        7               /* debug        */
-#define CE_CONT         6               /* continuation */
-#define CE_NOTE         5               /* notice       */
-#define CE_WARN         4               /* warning      */
-#define CE_ALERT        1               /* alert        */
-#define CE_PANIC        0               /* panic        */
-
-extern void cmn_err(int, char *, ...)
-       __attribute__ ((format (printf, 2, 3)));
+struct xfs_mount;
+
+#define CE_DEBUG        KERN_DEBUG
+#define CE_CONT         KERN_INFO
+#define CE_NOTE         KERN_NOTICE
+#define CE_WARN         KERN_WARNING
+#define CE_ALERT        KERN_ALERT
+#define CE_PANIC        KERN_EMERG
+
+void cmn_err(const char *lvl, const char *fmt, ...)
+               __attribute__ ((format (printf, 2, 3)));
+void xfs_fs_cmn_err( const char *lvl, struct xfs_mount *mp,
+               const char *fmt, ...) __attribute__ ((format (printf, 3, 4)));
+void xfs_cmn_err( int panic_tag, const char *lvl, struct xfs_mount *mp,
+               const char *fmt, ...) __attribute__ ((format (printf, 4, 5)));
+
 extern void assfail(char *expr, char *f, int l);
 
 #define ASSERT_ALWAYS(expr)    \
index fa8723f..f322798 100644 (file)
 #define        XFSA_FIXUP_BNO_OK       1
 #define        XFSA_FIXUP_CNT_OK       2
 
-static int
-xfs_alloc_busy_search(struct xfs_mount *mp, xfs_agnumber_t agno,
-                   xfs_agblock_t bno, xfs_extlen_t len);
-
 /*
  * Prototypes for per-ag allocation routines
  */
@@ -94,7 +90,7 @@ xfs_alloc_lookup_ge(
  * Lookup the first record less than or equal to [bno, len]
  * in the btree given by cur.
  */
-STATIC int                             /* error */
+int                                    /* error */
 xfs_alloc_lookup_le(
        struct xfs_btree_cur    *cur,   /* btree cursor */
        xfs_agblock_t           bno,    /* starting block of extent */
@@ -127,7 +123,7 @@ xfs_alloc_update(
 /*
  * Get the data from the pointed-to record.
  */
-STATIC int                             /* error */
+int                                    /* error */
 xfs_alloc_get_rec(
        struct xfs_btree_cur    *cur,   /* btree cursor */
        xfs_agblock_t           *bno,   /* output: starting block of extent */
@@ -2615,7 +2611,7 @@ restart:
  * will require a synchronous transaction, but it can still be
  * used to distinguish between a partial or exact match.
  */
-static int
+int
 xfs_alloc_busy_search(
        struct xfs_mount        *mp,
        xfs_agnumber_t          agno,
index 895009a..0ab56b3 100644 (file)
@@ -19,6 +19,7 @@
 #define        __XFS_ALLOC_H__
 
 struct xfs_buf;
+struct xfs_btree_cur;
 struct xfs_mount;
 struct xfs_perag;
 struct xfs_trans;
@@ -118,16 +119,16 @@ xfs_alloc_longest_free_extent(struct xfs_mount *mp,
                struct xfs_perag *pag);
 
 #ifdef __KERNEL__
-
 void
-xfs_alloc_busy_insert(xfs_trans_t *tp,
-               xfs_agnumber_t agno,
-               xfs_agblock_t bno,
-               xfs_extlen_t len);
+xfs_alloc_busy_insert(struct xfs_trans *tp, xfs_agnumber_t agno,
+       xfs_agblock_t bno, xfs_extlen_t len);
 
 void
 xfs_alloc_busy_clear(struct xfs_mount *mp, struct xfs_busy_extent *busyp);
 
+int
+xfs_alloc_busy_search(struct xfs_mount *mp, xfs_agnumber_t agno,
+       xfs_agblock_t bno, xfs_extlen_t len);
 #endif /* __KERNEL__ */
 
 /*
@@ -205,4 +206,18 @@ xfs_free_extent(
        xfs_fsblock_t   bno,    /* starting block number of extent */
        xfs_extlen_t    len);   /* length of extent */
 
+int                                    /* error */
+xfs_alloc_lookup_le(
+       struct xfs_btree_cur    *cur,   /* btree cursor */
+       xfs_agblock_t           bno,    /* starting block of extent */
+       xfs_extlen_t            len,    /* length of extent */
+       int                     *stat); /* success/failure */
+
+int                                    /* error */
+xfs_alloc_get_rec(
+       struct xfs_btree_cur    *cur,   /* btree cursor */
+       xfs_agblock_t           *bno,   /* output: starting block of extent */
+       xfs_extlen_t            *len,   /* output: length of extent */
+       int                     *stat); /* output: success/failure */
+
 #endif /* __XFS_ALLOC_H__ */
index ed2b65f..98c6f73 100644 (file)
@@ -141,7 +141,6 @@ xfs_buf_item_log_check(
 #define                xfs_buf_item_log_check(x)
 #endif
 
-STATIC void    xfs_buf_error_relse(xfs_buf_t *bp);
 STATIC void    xfs_buf_do_callbacks(struct xfs_buf *bp);
 
 /*
@@ -959,128 +958,76 @@ xfs_buf_do_callbacks(
  */
 void
 xfs_buf_iodone_callbacks(
-       xfs_buf_t       *bp)
+       struct xfs_buf          *bp)
 {
-       xfs_log_item_t  *lip;
-       static ulong    lasttime;
-       static xfs_buftarg_t *lasttarg;
-       xfs_mount_t     *mp;
+       struct xfs_log_item     *lip = bp->b_fspriv;
+       struct xfs_mount        *mp = lip->li_mountp;
+       static ulong            lasttime;
+       static xfs_buftarg_t    *lasttarg;
 
-       ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL);
-       lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *);
+       if (likely(!XFS_BUF_GETERROR(bp)))
+               goto do_callbacks;
 
-       if (XFS_BUF_GETERROR(bp) != 0) {
-               /*
-                * If we've already decided to shutdown the filesystem
-                * because of IO errors, there's no point in giving this
-                * a retry.
-                */
-               mp = lip->li_mountp;
-               if (XFS_FORCED_SHUTDOWN(mp)) {
-                       ASSERT(XFS_BUF_TARGET(bp) == mp->m_ddev_targp);
-                       XFS_BUF_SUPER_STALE(bp);
-                       trace_xfs_buf_item_iodone(bp, _RET_IP_);
-                       xfs_buf_do_callbacks(bp);
-                       XFS_BUF_SET_FSPRIVATE(bp, NULL);
-                       XFS_BUF_CLR_IODONE_FUNC(bp);
-                       xfs_buf_ioend(bp, 0);
-                       return;
-               }
+       /*
+        * If we've already decided to shut down the filesystem because of
+        * I/O errors, there's no point in giving this a retry.
+        */
+       if (XFS_FORCED_SHUTDOWN(mp)) {
+               XFS_BUF_SUPER_STALE(bp);
+               trace_xfs_buf_item_iodone(bp, _RET_IP_);
+               goto do_callbacks;
+       }
 
-               if ((XFS_BUF_TARGET(bp) != lasttarg) ||
-                   (time_after(jiffies, (lasttime + 5*HZ)))) {
-                       lasttime = jiffies;
-                       cmn_err(CE_ALERT, "Device %s, XFS metadata write error"
-                                       " block 0x%llx in %s",
-                               XFS_BUFTARG_NAME(XFS_BUF_TARGET(bp)),
-                             (__uint64_t)XFS_BUF_ADDR(bp), mp->m_fsname);
-               }
-               lasttarg = XFS_BUF_TARGET(bp);
+       if (XFS_BUF_TARGET(bp) != lasttarg ||
+           time_after(jiffies, (lasttime + 5*HZ))) {
+               lasttime = jiffies;
+               cmn_err(CE_ALERT, "Device %s, XFS metadata write error"
+                               " block 0x%llx in %s",
+                       XFS_BUFTARG_NAME(XFS_BUF_TARGET(bp)),
+                     (__uint64_t)XFS_BUF_ADDR(bp), mp->m_fsname);
+       }
+       lasttarg = XFS_BUF_TARGET(bp);
 
-               if (XFS_BUF_ISASYNC(bp)) {
-                       /*
-                        * If the write was asynchronous then noone will be
-                        * looking for the error.  Clear the error state
-                        * and write the buffer out again delayed write.
-                        *
-                        * XXXsup This is OK, so long as we catch these
-                        * before we start the umount; we don't want these
-                        * DELWRI metadata bufs to be hanging around.
-                        */
-                       XFS_BUF_ERROR(bp,0); /* errno of 0 unsets the flag */
-
-                       if (!(XFS_BUF_ISSTALE(bp))) {
-                               XFS_BUF_DELAYWRITE(bp);
-                               XFS_BUF_DONE(bp);
-                               XFS_BUF_SET_START(bp);
-                       }
-                       ASSERT(XFS_BUF_IODONE_FUNC(bp));
-                       trace_xfs_buf_item_iodone_async(bp, _RET_IP_);
-                       xfs_buf_relse(bp);
-               } else {
-                       /*
-                        * If the write of the buffer was not asynchronous,
-                        * then we want to make sure to return the error
-                        * to the caller of bwrite().  Because of this we
-                        * cannot clear the B_ERROR state at this point.
-                        * Instead we install a callback function that
-                        * will be called when the buffer is released, and
-                        * that routine will clear the error state and
-                        * set the buffer to be written out again after
-                        * some delay.
-                        */
-                       /* We actually overwrite the existing b-relse
-                          function at times, but we're gonna be shutting down
-                          anyway. */
-                       XFS_BUF_SET_BRELSE_FUNC(bp,xfs_buf_error_relse);
+       /*
+        * If the write was asynchronous then no one will be looking for the
+        * error.  Clear the error state and write the buffer out again.
+        *
+        * During sync or umount we'll write all pending buffers again
+        * synchronously, which will catch these errors if they keep hanging
+        * around.
+        */
+       if (XFS_BUF_ISASYNC(bp)) {
+               XFS_BUF_ERROR(bp, 0); /* errno of 0 unsets the flag */
+
+               if (!XFS_BUF_ISSTALE(bp)) {
+                       XFS_BUF_DELAYWRITE(bp);
                        XFS_BUF_DONE(bp);
-                       XFS_BUF_FINISH_IOWAIT(bp);
+                       XFS_BUF_SET_START(bp);
                }
+               ASSERT(XFS_BUF_IODONE_FUNC(bp));
+               trace_xfs_buf_item_iodone_async(bp, _RET_IP_);
+               xfs_buf_relse(bp);
                return;
        }
 
-       xfs_buf_do_callbacks(bp);
-       XFS_BUF_SET_FSPRIVATE(bp, NULL);
-       XFS_BUF_CLR_IODONE_FUNC(bp);
-       xfs_buf_ioend(bp, 0);
-}
-
-/*
- * This is a callback routine attached to a buffer which gets an error
- * when being written out synchronously.
- */
-STATIC void
-xfs_buf_error_relse(
-       xfs_buf_t       *bp)
-{
-       xfs_log_item_t  *lip;
-       xfs_mount_t     *mp;
-
-       lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *);
-       mp = (xfs_mount_t *)lip->li_mountp;
-       ASSERT(XFS_BUF_TARGET(bp) == mp->m_ddev_targp);
-
+       /*
+        * If the write of the buffer was synchronous, we want to make
+        * sure to return the error to the caller of xfs_bwrite().
+        */
        XFS_BUF_STALE(bp);
        XFS_BUF_DONE(bp);
        XFS_BUF_UNDELAYWRITE(bp);
-       XFS_BUF_ERROR(bp,0);
 
        trace_xfs_buf_error_relse(bp, _RET_IP_);
+       xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
 
-       if (! XFS_FORCED_SHUTDOWN(mp))
-               xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
-       /*
-        * We have to unpin the pinned buffers so do the
-        * callbacks.
-        */
+do_callbacks:
        xfs_buf_do_callbacks(bp);
        XFS_BUF_SET_FSPRIVATE(bp, NULL);
        XFS_BUF_CLR_IODONE_FUNC(bp);
-       XFS_BUF_SET_BRELSE_FUNC(bp,NULL);
-       xfs_buf_relse(bp);
+       xfs_buf_ioend(bp, 0);
 }
 
-
 /*
  * This is the iodone() function for buffers which have been
  * logged.  It is called when they are eventually flushed out.
index c78cc6a..4c7db74 100644 (file)
@@ -152,37 +152,6 @@ xfs_errortag_clearall(xfs_mount_t *mp, int loud)
 }
 #endif /* DEBUG */
 
-
-void
-xfs_fs_cmn_err(int level, xfs_mount_t *mp, char *fmt, ...)
-{
-       va_list ap;
-
-       va_start(ap, fmt);
-       xfs_fs_vcmn_err(level, mp, fmt, ap);
-       va_end(ap);
-}
-
-void
-xfs_cmn_err(int panic_tag, int level, xfs_mount_t *mp, char *fmt, ...)
-{
-       va_list ap;
-
-#ifdef DEBUG
-       xfs_panic_mask |= (XFS_PTAG_SHUTDOWN_CORRUPT | XFS_PTAG_LOGRES);
-#endif
-
-       if (xfs_panic_mask && (xfs_panic_mask & panic_tag)
-           && (level & CE_ALERT)) {
-               level &= ~CE_ALERT;
-               level |= CE_PANIC;
-               cmn_err(CE_ALERT, "XFS: Transforming an alert into a BUG.");
-       }
-       va_start(ap, fmt);
-       xfs_fs_vcmn_err(level, mp, fmt, ap);
-       va_end(ap);
-}
-
 void
 xfs_error_report(
        const char              *tag,
index f338847..10dce54 100644 (file)
@@ -136,8 +136,8 @@ extern int xfs_error_test(int, int *, char *, int, char *, unsigned long);
         xfs_error_test((tag), (mp)->m_fixedfsid, "expr", __LINE__, __FILE__, \
                        (rf))))
 
-extern int xfs_errortag_add(int error_tag, xfs_mount_t *mp);
-extern int xfs_errortag_clearall(xfs_mount_t *mp, int loud);
+extern int xfs_errortag_add(int error_tag, struct xfs_mount *mp);
+extern int xfs_errortag_clearall(struct xfs_mount *mp, int loud);
 #else
 #define XFS_TEST_ERROR(expr, mp, tag, rf)      (expr)
 #define xfs_errortag_add(tag, mp)              (ENOSYS)
@@ -162,21 +162,15 @@ extern int xfs_errortag_clearall(xfs_mount_t *mp, int loud);
 
 struct xfs_mount;
 
-extern void xfs_fs_vcmn_err(int level, struct xfs_mount *mp,
-               char *fmt, va_list ap)
-       __attribute__ ((format (printf, 3, 0)));
-extern void xfs_cmn_err(int panic_tag, int level, struct xfs_mount *mp,
-                       char *fmt, ...)
-       __attribute__ ((format (printf, 4, 5)));
-extern void xfs_fs_cmn_err(int level, struct xfs_mount *mp, char *fmt, ...)
-       __attribute__ ((format (printf, 3, 4)));
-
 extern void xfs_hex_dump(void *p, int length);
 
 #define xfs_fs_repair_cmn_err(level, mp, fmt, args...) \
        xfs_fs_cmn_err(level, mp, fmt "  Unmount and run xfs_repair.", ## args)
 
 #define xfs_fs_mount_cmn_err(f, fmt, args...) \
-       ((f & XFS_MFSI_QUIET)? (void)0 : cmn_err(CE_WARN, "XFS: " fmt, ## args))
+       do { \
+               if (!(f & XFS_MFSI_QUIET))      \
+                       cmn_err(CE_WARN, "XFS: " fmt, ## args); \
+       } while (0)
 
 #endif /* __XFS_ERROR_H__ */
index f56d30e..cec89dd 100644 (file)
@@ -612,12 +612,13 @@ out:
  *
  * We cannot use an inode here for this - that will push dirty state back up
  * into the VFS and then periodic inode flushing will prevent log covering from
- * making progress. Hence we log a field in the superblock instead.
+ * making progress. Hence we log a field in the superblock instead and use a
+ * synchronous transaction to ensure the superblock is immediately unpinned
+ * and can be written back.
  */
 int
 xfs_fs_log_dummy(
-       xfs_mount_t     *mp,
-       int             flags)
+       xfs_mount_t     *mp)
 {
        xfs_trans_t     *tp;
        int             error;
@@ -632,8 +633,7 @@ xfs_fs_log_dummy(
 
        /* log the UUID because it is an unchanging field */
        xfs_mod_sb(tp, XFS_SB_UUID);
-       if (flags & SYNC_WAIT)
-               xfs_trans_set_sync(tp);
+       xfs_trans_set_sync(tp);
        return xfs_trans_commit(tp, 0);
 }
 
index a786c52..1b6a98b 100644 (file)
@@ -25,6 +25,6 @@ extern int xfs_fs_counts(xfs_mount_t *mp, xfs_fsop_counts_t *cnt);
 extern int xfs_reserve_blocks(xfs_mount_t *mp, __uint64_t *inval,
                                xfs_fsop_resblks_t *outval);
 extern int xfs_fs_goingdown(xfs_mount_t *mp, __uint32_t inflags);
-extern int xfs_fs_log_dummy(xfs_mount_t *mp, int flags);
+extern int xfs_fs_log_dummy(struct xfs_mount *mp);
 
 #endif /* __XFS_FSOPS_H__ */
index 0bf24b1..ae6fef1 100644 (file)
@@ -377,7 +377,7 @@ xfs_log_mount(
                cmn_err(CE_NOTE, "XFS mounting filesystem %s", mp->m_fsname);
        else {
                cmn_err(CE_NOTE,
-                       "!Mounting filesystem \"%s\" in no-recovery mode.  Filesystem will be inconsistent.",
+                       "Mounting filesystem \"%s\" in no-recovery mode.  Filesystem will be inconsistent.",
                        mp->m_fsname);
                ASSERT(mp->m_flags & XFS_MOUNT_RDONLY);
        }
index 204d8e5..aa0ebb7 100644 (file)
@@ -3800,7 +3800,7 @@ xlog_recover_finish(
                log->l_flags &= ~XLOG_RECOVERY_NEEDED;
        } else {
                cmn_err(CE_DEBUG,
-                       "!Ending clean XFS mount for filesystem: %s\n",
+                       "Ending clean XFS mount for filesystem: %s\n",
                        log->l_mp->m_fsname);
        }
        return 0;
index f80a067..33dbc4e 100644 (file)
@@ -1137,7 +1137,7 @@ out_undo_fdblocks:
        if (blkdelta)
                xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, -blkdelta, rsvd);
 out:
-       ASSERT(error = 0);
+       ASSERT(error == 0);
        return;
 }