Merge tag 'xfs-for-linus-4.8-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git...
author	Linus Torvalds <torvalds@linux-foundation.org>
Wed, 27 Jul 2016 16:53:35 +0000 (09:53 -0700)
committer	Linus Torvalds <torvalds@linux-foundation.org>
Wed, 27 Jul 2016 16:53:35 +0000 (09:53 -0700)
Pull xfs updates from Dave Chinner:
 "The major addition is the new iomap based block mapping
  infrastructure.  We've been kicking this about locally for years, but
  there are other filesystems want to use it too (e.g. gfs2).  Now it
  is fully working, reviewed and ready for merge and be used by other
  filesystems.

  There are a lot of other fixes and cleanups in the tree, but those are
  XFS internal things and none are of the scale or visibility of the
  iomap changes.  See below for details.

  I am likely to send another pull request next week - we're just about
  ready to merge some new functionality (on-disk block->owner reverse
  mapping infrastructure), but that's a huge chunk of code (74 files
  changed, 7283 insertions(+), 1114 deletions(-)) so I'm keeping it
  separate from all the "normal" pull request changes so they don't get
  lost in the noise.

  Summary of changes in this update:
   - generic iomap based IO path infrastructure
   - generic iomap based fiemap implementation
   - xfs iomap based IO path implementation
   - buffer error handling fixes
   - tracking of in-flight buffer IO for unmount serialisation
   - direct IO and DAX IO path separation and simplification
   - shortform directory format definition changes for wider platform
     compatibility
   - various buffer cache fixes
   - cleanups in preparation for rmap merge
   - error injection cleanups and fixes
   - log item format buffer memory allocation restructuring to prevent
     rare OOM reclaim deadlocks
   - sparse inode chunks are now fully supported"
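
To make the shape of the new interface concrete, here is a minimal sketch of
what a filesystem supplies.  The "myfs_" names and the mapping helper are
hypothetical; only struct iomap_ops, the callback signatures and the iomap_*
entry points come from this series.

/* Hypothetical glue for a filesystem adopting the iomap infrastructure. */
static int myfs_iomap_begin(struct inode *inode, loff_t pos, loff_t length,
		unsigned flags, struct iomap *iomap)
{
	/*
	 * Take the filesystem's own locks, reserve or allocate space for
	 * [pos, pos + length), and describe the resulting extent by filling
	 * in iomap->type, ->offset, ->length, ->blkno and ->bdev.
	 */
	return myfs_map_blocks(inode, pos, length, flags, iomap);
}

static int myfs_iomap_end(struct inode *inode, loff_t pos, loff_t length,
		ssize_t written, unsigned flags, struct iomap *iomap)
{
	/* Trim unused reservations and drop locks taken in ->iomap_begin. */
	return 0;
}

static struct iomap_ops myfs_iomap_ops = {
	.iomap_begin	= myfs_iomap_begin,
	.iomap_end	= myfs_iomap_end,
};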

* tag 'xfs-for-linus-4.8-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/dgc/linux-xfs: (53 commits)
  xfs: remove EXPERIMENTAL tag from sparse inode feature
  xfs: bufferhead chains are invalid after end_page_writeback
  xfs: allocate log vector buffers outside CIL context lock
  libxfs: directory node splitting does not have an extra block
  xfs: remove dax code from object file when disabled
  xfs: skip dirty pages in ->releasepage()
  xfs: remove __arch_pack
  xfs: kill xfs_dir2_inou_t
  xfs: kill xfs_dir2_sf_off_t
  xfs: split direct I/O and DAX path
  xfs: direct calls in the direct I/O path
  xfs: stop using generic_file_read_iter for direct I/O
  xfs: split xfs_file_read_iter into buffered and direct I/O helpers
  xfs: remove s_maxbytes enforcement in xfs_file_read_iter
  xfs: kill ioflags
  xfs: don't pass ioflags around in the ioctl path
  xfs: track and serialize in-flight async buffers against unmount
  xfs: exclude never-released buffers from buftarg I/O accounting
  xfs: don't reset b_retries to 0 on every failure
  xfs: remove extraneous buffer flag changes
  ...

66 files changed:
fs/Kconfig
fs/Makefile
fs/buffer.c
fs/internal.h
fs/iomap.c [new file with mode: 0644]
fs/nfsd/blocklayout.c
fs/nfsd/blocklayoutxdr.c
fs/xfs/Kconfig
fs/xfs/libxfs/xfs_alloc.c
fs/xfs/libxfs/xfs_alloc.h
fs/xfs/libxfs/xfs_attr_leaf.h
fs/xfs/libxfs/xfs_bmap.c
fs/xfs/libxfs/xfs_bmap.h
fs/xfs/libxfs/xfs_bmap_btree.c
fs/xfs/libxfs/xfs_btree.c
fs/xfs/libxfs/xfs_btree.h
fs/xfs/libxfs/xfs_da_btree.c
fs/xfs/libxfs/xfs_da_format.c
fs/xfs/libxfs/xfs_da_format.h
fs/xfs/libxfs/xfs_dir2_sf.c
fs/xfs/libxfs/xfs_format.h
fs/xfs/libxfs/xfs_fs.h
fs/xfs/libxfs/xfs_ialloc.c
fs/xfs/libxfs/xfs_rtbitmap.c
fs/xfs/xfs_aops.c
fs/xfs/xfs_aops.h
fs/xfs/xfs_attr_inactive.c
fs/xfs/xfs_attr_list.c
fs/xfs/xfs_bmap_util.c
fs/xfs/xfs_bmap_util.h
fs/xfs/xfs_buf.c
fs/xfs/xfs_buf.h
fs/xfs/xfs_buf_item.c
fs/xfs/xfs_dquot.c
fs/xfs/xfs_dquot_item.c
fs/xfs/xfs_error.c
fs/xfs/xfs_error.h
fs/xfs/xfs_extfree_item.c
fs/xfs/xfs_file.c
fs/xfs/xfs_fsops.c
fs/xfs/xfs_icache.c
fs/xfs/xfs_icache.h
fs/xfs/xfs_inode.c
fs/xfs/xfs_inode.h
fs/xfs/xfs_inode_item.c
fs/xfs/xfs_ioctl.c
fs/xfs/xfs_ioctl.h
fs/xfs/xfs_ioctl32.c
fs/xfs/xfs_iomap.c
fs/xfs/xfs_iomap.h
fs/xfs/xfs_iops.c
fs/xfs/xfs_linux.h
fs/xfs/xfs_log.c
fs/xfs/xfs_log.h
fs/xfs/xfs_log_cil.c
fs/xfs/xfs_mount.c
fs/xfs/xfs_ondisk.h
fs/xfs/xfs_pnfs.c
fs/xfs/xfs_rtalloc.h
fs/xfs/xfs_super.c
fs/xfs/xfs_super.h
fs/xfs/xfs_sysfs.c
fs/xfs/xfs_trace.h
fs/xfs/xfs_trans.h
include/linux/exportfs.h
include/linux/iomap.h [new file with mode: 0644]

index b8fcb41..4524916 100644 (file)
@@ -10,6 +10,9 @@ config DCACHE_WORD_ACCESS
 
 if BLOCK
 
+config FS_IOMAP
+       bool
+
 source "fs/ext2/Kconfig"
 source "fs/ext4/Kconfig"
 source "fs/jbd2/Kconfig"
index 85b6e13..ed2b632 100644 (file)
@@ -49,6 +49,7 @@ obj-$(CONFIG_COREDUMP)                += coredump.o
 obj-$(CONFIG_SYSCTL)           += drop_caches.o
 
 obj-$(CONFIG_FHANDLE)          += fhandle.o
+obj-$(CONFIG_FS_IOMAP)         += iomap.o
 
 obj-y                          += quota/
 
index b9fa1be..9c8eb9b 100644 (file)
@@ -21,6 +21,7 @@
 #include <linux/kernel.h>
 #include <linux/syscalls.h>
 #include <linux/fs.h>
+#include <linux/iomap.h>
 #include <linux/mm.h>
 #include <linux/percpu.h>
 #include <linux/slab.h>
@@ -1892,8 +1893,62 @@ void page_zero_new_buffers(struct page *page, unsigned from, unsigned to)
 }
 EXPORT_SYMBOL(page_zero_new_buffers);
 
-int __block_write_begin(struct page *page, loff_t pos, unsigned len,
-               get_block_t *get_block)
+static void
+iomap_to_bh(struct inode *inode, sector_t block, struct buffer_head *bh,
+               struct iomap *iomap)
+{
+       loff_t offset = block << inode->i_blkbits;
+
+       bh->b_bdev = iomap->bdev;
+
+       /*
+        * Block points to offset in file we need to map, iomap contains
+        * the offset at which the map starts. If the map ends before the
+        * current block, then do not map the buffer and let the caller
+        * handle it.
+        */
+       BUG_ON(offset >= iomap->offset + iomap->length);
+
+       switch (iomap->type) {
+       case IOMAP_HOLE:
+               /*
+                * If the buffer is not up to date or beyond the current EOF,
+                * we need to mark it as new to ensure sub-block zeroing is
+                * executed if necessary.
+                */
+               if (!buffer_uptodate(bh) ||
+                   (offset >= i_size_read(inode)))
+                       set_buffer_new(bh);
+               break;
+       case IOMAP_DELALLOC:
+               if (!buffer_uptodate(bh) ||
+                   (offset >= i_size_read(inode)))
+                       set_buffer_new(bh);
+               set_buffer_uptodate(bh);
+               set_buffer_mapped(bh);
+               set_buffer_delay(bh);
+               break;
+       case IOMAP_UNWRITTEN:
+               /*
+                * For unwritten regions, we always need to ensure that
+                * sub-block writes zero the regions of the block we are not
+                * writing to. Set the buffer as new to ensure this.
+                */
+               set_buffer_new(bh);
+               set_buffer_unwritten(bh);
+               /* FALLTHRU */
+       case IOMAP_MAPPED:
+               if (offset >= i_size_read(inode))
+                       set_buffer_new(bh);
+               bh->b_blocknr = (iomap->blkno >> (inode->i_blkbits - 9)) +
+                               ((offset - iomap->offset) >> inode->i_blkbits);
+               set_buffer_mapped(bh);
+               break;
+       }
+}
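
The block number conversion above mixes units: iomap->blkno is expressed in
512-byte sectors while buffer heads are addressed in filesystem blocks.  A
worked example with assumed values, not taken from the patch:

	/*
	 * Assume 4k filesystem blocks (i_blkbits = 12), iomap->blkno = 8192
	 * sectors, iomap->offset = 0x4000 and this buffer's file offset 0x5000:
	 *
	 *   8192 >> (12 - 9)          = 1024   fs block where the mapping starts
	 *   (0x5000 - 0x4000) >> 12   =    1   fs blocks into the mapping
	 *   bh->b_blocknr             = 1025
	 */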
+
+int __block_write_begin_int(struct page *page, loff_t pos, unsigned len,
+               get_block_t *get_block, struct iomap *iomap)
 {
        unsigned from = pos & (PAGE_SIZE - 1);
        unsigned to = from + len;
@@ -1929,9 +1984,14 @@ int __block_write_begin(struct page *page, loff_t pos, unsigned len,
                        clear_buffer_new(bh);
                if (!buffer_mapped(bh)) {
                        WARN_ON(bh->b_size != blocksize);
-                       err = get_block(inode, block, bh, 1);
-                       if (err)
-                               break;
+                       if (get_block) {
+                               err = get_block(inode, block, bh, 1);
+                               if (err)
+                                       break;
+                       } else {
+                               iomap_to_bh(inode, block, bh, iomap);
+                       }
+
                        if (buffer_new(bh)) {
                                unmap_underlying_metadata(bh->b_bdev,
                                                        bh->b_blocknr);
@@ -1972,6 +2032,12 @@ int __block_write_begin(struct page *page, loff_t pos, unsigned len,
                page_zero_new_buffers(page, from, to);
        return err;
 }
+
+int __block_write_begin(struct page *page, loff_t pos, unsigned len,
+               get_block_t *get_block)
+{
+       return __block_write_begin_int(page, pos, len, get_block, NULL);
+}
 EXPORT_SYMBOL(__block_write_begin);
 
 static int __block_commit_write(struct inode *inode, struct page *page,
index f57ced5..cef0913 100644 (file)
@@ -11,6 +11,7 @@
 
 struct super_block;
 struct file_system_type;
+struct iomap;
 struct linux_binprm;
 struct path;
 struct mount;
@@ -39,6 +40,8 @@ static inline int __sync_blockdev(struct block_device *bdev, int wait)
  * buffer.c
  */
 extern void guard_bio_eod(int rw, struct bio *bio);
+extern int __block_write_begin_int(struct page *page, loff_t pos, unsigned len,
+               get_block_t *get_block, struct iomap *iomap);
 
 /*
  * char_dev.c
diff --git a/fs/iomap.c b/fs/iomap.c
new file mode 100644 (file)
index 0000000..48141b8
--- /dev/null
@@ -0,0 +1,497 @@
+/*
+ * Copyright (C) 2010 Red Hat, Inc.
+ * Copyright (c) 2016 Christoph Hellwig.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ */
+#include <linux/module.h>
+#include <linux/compiler.h>
+#include <linux/fs.h>
+#include <linux/iomap.h>
+#include <linux/uaccess.h>
+#include <linux/gfp.h>
+#include <linux/mm.h>
+#include <linux/swap.h>
+#include <linux/pagemap.h>
+#include <linux/file.h>
+#include <linux/uio.h>
+#include <linux/backing-dev.h>
+#include <linux/buffer_head.h>
+#include <linux/dax.h>
+#include "internal.h"
+
+typedef loff_t (*iomap_actor_t)(struct inode *inode, loff_t pos, loff_t len,
+               void *data, struct iomap *iomap);
+
+/*
+ * Execute an iomap write on a segment of the mapping that spans a
+ * contiguous range of pages that have identical block mapping state.
+ *
+ * This avoids the need to map pages individually, do individual allocations
+ * for each page and most importantly avoid the need for filesystem specific
+ * locking per page. Instead, all the operations are amortised over the entire
+ * range of pages. It is assumed that the filesystems will lock whatever
+ * resources they require in the iomap_begin call, and release them in the
+ * iomap_end call.
+ */
+static loff_t
+iomap_apply(struct inode *inode, loff_t pos, loff_t length, unsigned flags,
+               struct iomap_ops *ops, void *data, iomap_actor_t actor)
+{
+       struct iomap iomap = { 0 };
+       loff_t written = 0, ret;
+
+       /*
+        * Need to map a range from start position for length bytes. This can
+        * span multiple pages - it is only guaranteed to return a range of a
+        * single type of pages (e.g. all into a hole, all mapped or all
+        * unwritten). Failure at this point has nothing to undo.
+        *
+        * If allocation is required for this range, reserve the space now so
+        * that the allocation is guaranteed to succeed later on. Once we copy
+        * the data into the page cache pages, then we cannot fail otherwise we
+        * expose transient stale data. If the reserve fails, we can safely
+        * back out at this point as there is nothing to undo.
+        */
+       ret = ops->iomap_begin(inode, pos, length, flags, &iomap);
+       if (ret)
+               return ret;
+       if (WARN_ON(iomap.offset > pos))
+               return -EIO;
+
+       /*
+        * Cut down the length to the one actually provided by the filesystem,
+        * as it might not be able to give us the whole size that we requested.
+        */
+       if (iomap.offset + iomap.length < pos + length)
+               length = iomap.offset + iomap.length - pos;
+
+       /*
+        * Now that we have guaranteed that the space allocation will succeed,
+        * we can do the copy-in page by page without having to worry about
+        * failures exposing transient data.
+        */
+       written = actor(inode, pos, length, data, &iomap);
+
+       /*
+        * Now the data has been copied, commit the range we've copied.  This
+        * should not fail unless the filesystem has had a fatal error.
+        */
+       ret = ops->iomap_end(inode, pos, length, written > 0 ? written : 0,
+                       flags, &iomap);
+
+       return written ? written : ret;
+}
+
+static void
+iomap_write_failed(struct inode *inode, loff_t pos, unsigned len)
+{
+       loff_t i_size = i_size_read(inode);
+
+       /*
+        * Only truncate newly allocated pages beyond EOF, even if the
+        * write started inside the existing inode size.
+        */
+       if (pos + len > i_size)
+               truncate_pagecache_range(inode, max(pos, i_size), pos + len);
+}
+
+static int
+iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, unsigned flags,
+               struct page **pagep, struct iomap *iomap)
+{
+       pgoff_t index = pos >> PAGE_SHIFT;
+       struct page *page;
+       int status = 0;
+
+       BUG_ON(pos + len > iomap->offset + iomap->length);
+
+       page = grab_cache_page_write_begin(inode->i_mapping, index, flags);
+       if (!page)
+               return -ENOMEM;
+
+       status = __block_write_begin_int(page, pos, len, NULL, iomap);
+       if (unlikely(status)) {
+               unlock_page(page);
+               put_page(page);
+               page = NULL;
+
+               iomap_write_failed(inode, pos, len);
+       }
+
+       *pagep = page;
+       return status;
+}
+
+static int
+iomap_write_end(struct inode *inode, loff_t pos, unsigned len,
+               unsigned copied, struct page *page)
+{
+       int ret;
+
+       ret = generic_write_end(NULL, inode->i_mapping, pos, len,
+                       copied, page, NULL);
+       if (ret < len)
+               iomap_write_failed(inode, pos, len);
+       return ret;
+}
+
+static loff_t
+iomap_write_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
+               struct iomap *iomap)
+{
+       struct iov_iter *i = data;
+       long status = 0;
+       ssize_t written = 0;
+       unsigned int flags = AOP_FLAG_NOFS;
+
+       /*
+        * Copies from kernel address space cannot fail (NFSD is a big user).
+        */
+       if (!iter_is_iovec(i))
+               flags |= AOP_FLAG_UNINTERRUPTIBLE;
+
+       do {
+               struct page *page;
+               unsigned long offset;   /* Offset into pagecache page */
+               unsigned long bytes;    /* Bytes to write to page */
+               size_t copied;          /* Bytes copied from user */
+
+               offset = (pos & (PAGE_SIZE - 1));
+               bytes = min_t(unsigned long, PAGE_SIZE - offset,
+                                               iov_iter_count(i));
+again:
+               if (bytes > length)
+                       bytes = length;
+
+               /*
+                * Bring in the user page that we will copy from _first_.
+                * Otherwise there's a nasty deadlock on copying from the
+                * same page as we're writing to, without it being marked
+                * up-to-date.
+                *
+                * Not only is this an optimisation, but it is also required
+                * to check that the address is actually valid, when atomic
+                * usercopies are used, below.
+                */
+               if (unlikely(iov_iter_fault_in_readable(i, bytes))) {
+                       status = -EFAULT;
+                       break;
+               }
+
+               status = iomap_write_begin(inode, pos, bytes, flags, &page,
+                               iomap);
+               if (unlikely(status))
+                       break;
+
+               if (mapping_writably_mapped(inode->i_mapping))
+                       flush_dcache_page(page);
+
+               pagefault_disable();
+               copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes);
+               pagefault_enable();
+
+               flush_dcache_page(page);
+               mark_page_accessed(page);
+
+               status = iomap_write_end(inode, pos, bytes, copied, page);
+               if (unlikely(status < 0))
+                       break;
+               copied = status;
+
+               cond_resched();
+
+               iov_iter_advance(i, copied);
+               if (unlikely(copied == 0)) {
+                       /*
+                        * If we were unable to copy any data at all, we must
+                        * fall back to a single segment length write.
+                        *
+                        * If we didn't fall back here, we could livelock
+                        * because not all segments in the iov can be copied at
+                        * once without a pagefault.
+                        */
+                       bytes = min_t(unsigned long, PAGE_SIZE - offset,
+                                               iov_iter_single_seg_count(i));
+                       goto again;
+               }
+               pos += copied;
+               written += copied;
+               length -= copied;
+
+               balance_dirty_pages_ratelimited(inode->i_mapping);
+       } while (iov_iter_count(i) && length);
+
+       return written ? written : status;
+}
+
+ssize_t
+iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *iter,
+               struct iomap_ops *ops)
+{
+       struct inode *inode = iocb->ki_filp->f_mapping->host;
+       loff_t pos = iocb->ki_pos, ret = 0, written = 0;
+
+       while (iov_iter_count(iter)) {
+               ret = iomap_apply(inode, pos, iov_iter_count(iter),
+                               IOMAP_WRITE, ops, iter, iomap_write_actor);
+               if (ret <= 0)
+                       break;
+               pos += ret;
+               written += ret;
+       }
+
+       return written ? written : ret;
+}
+EXPORT_SYMBOL_GPL(iomap_file_buffered_write);
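
A buffered write caller then looks roughly like the following, reusing the
hypothetical myfs_iomap_ops sketched after the pull request summary above.
Note that the helper does not advance iocb->ki_pos, so the caller does.

/* Sketch only: myfs_* is hypothetical and error handling is trimmed. */
static ssize_t myfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
	struct inode *inode = iocb->ki_filp->f_mapping->host;
	ssize_t ret;

	inode_lock(inode);
	ret = generic_write_checks(iocb, from);
	if (ret > 0)
		ret = iomap_file_buffered_write(iocb, from, &myfs_iomap_ops);
	inode_unlock(inode);

	if (ret > 0) {
		iocb->ki_pos += ret;		/* helper leaves ki_pos alone */
		ret = generic_write_sync(iocb, ret);
	}
	return ret;
}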
+
+static int iomap_zero(struct inode *inode, loff_t pos, unsigned offset,
+               unsigned bytes, struct iomap *iomap)
+{
+       struct page *page;
+       int status;
+
+       status = iomap_write_begin(inode, pos, bytes,
+                       AOP_FLAG_UNINTERRUPTIBLE | AOP_FLAG_NOFS, &page, iomap);
+       if (status)
+               return status;
+
+       zero_user(page, offset, bytes);
+       mark_page_accessed(page);
+
+       return iomap_write_end(inode, pos, bytes, bytes, page);
+}
+
+static int iomap_dax_zero(loff_t pos, unsigned offset, unsigned bytes,
+               struct iomap *iomap)
+{
+       sector_t sector = iomap->blkno +
+               (((pos & ~(PAGE_SIZE - 1)) - iomap->offset) >> 9);
+
+       return __dax_zero_page_range(iomap->bdev, sector, offset, bytes);
+}
+
+static loff_t
+iomap_zero_range_actor(struct inode *inode, loff_t pos, loff_t count,
+               void *data, struct iomap *iomap)
+{
+       bool *did_zero = data;
+       loff_t written = 0;
+       int status;
+
+       /* already zeroed?  we're done. */
+       if (iomap->type == IOMAP_HOLE || iomap->type == IOMAP_UNWRITTEN)
+               return count;
+
+       do {
+               unsigned offset, bytes;
+
+               offset = pos & (PAGE_SIZE - 1); /* Within page */
+               bytes = min_t(unsigned, PAGE_SIZE - offset, count);
+
+               if (IS_DAX(inode))
+                       status = iomap_dax_zero(pos, offset, bytes, iomap);
+               else
+                       status = iomap_zero(inode, pos, offset, bytes, iomap);
+               if (status < 0)
+                       return status;
+
+               pos += bytes;
+               count -= bytes;
+               written += bytes;
+               if (did_zero)
+                       *did_zero = true;
+       } while (count > 0);
+
+       return written;
+}
+
+int
+iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero,
+               struct iomap_ops *ops)
+{
+       loff_t ret;
+
+       while (len > 0) {
+               ret = iomap_apply(inode, pos, len, IOMAP_ZERO,
+                               ops, did_zero, iomap_zero_range_actor);
+               if (ret <= 0)
+                       return ret;
+
+               pos += ret;
+               len -= ret;
+       }
+
+       return 0;
+}
+EXPORT_SYMBOL_GPL(iomap_zero_range);
+
+int
+iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
+               struct iomap_ops *ops)
+{
+       unsigned blocksize = (1 << inode->i_blkbits);
+       unsigned off = pos & (blocksize - 1);
+
+       /* Block boundary? Nothing to do */
+       if (!off)
+               return 0;
+       return iomap_zero_range(inode, pos, blocksize - off, did_zero, ops);
+}
+EXPORT_SYMBOL_GPL(iomap_truncate_page);
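
A hypothetical truncate path would use this to zero the bytes beyond the new
EOF inside the last block before i_size is reduced, so that a later extension
of the file does not expose stale data:

/* Sketch only: myfs_* and the ops structure are hypothetical. */
static int myfs_truncate_tail(struct inode *inode, loff_t newsize)
{
	bool did_zero = false;

	/* No-op if newsize is already block aligned. */
	return iomap_truncate_page(inode, newsize, &did_zero, &myfs_iomap_ops);
}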
+
+static loff_t
+iomap_page_mkwrite_actor(struct inode *inode, loff_t pos, loff_t length,
+               void *data, struct iomap *iomap)
+{
+       struct page *page = data;
+       int ret;
+
+       ret = __block_write_begin_int(page, pos & ~PAGE_MASK, length,
+                       NULL, iomap);
+       if (ret)
+               return ret;
+
+       block_commit_write(page, 0, length);
+       return length;
+}
+
+int iomap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
+               struct iomap_ops *ops)
+{
+       struct page *page = vmf->page;
+       struct inode *inode = file_inode(vma->vm_file);
+       unsigned long length;
+       loff_t offset, size;
+       ssize_t ret;
+
+       lock_page(page);
+       size = i_size_read(inode);
+       if ((page->mapping != inode->i_mapping) ||
+           (page_offset(page) > size)) {
+               /* We overload EFAULT to mean page got truncated */
+               ret = -EFAULT;
+               goto out_unlock;
+       }
+
+       /* page is wholly or partially inside EOF */
+       if (((page->index + 1) << PAGE_SHIFT) > size)
+               length = size & ~PAGE_MASK;
+       else
+               length = PAGE_SIZE;
+
+       offset = page_offset(page);
+       while (length > 0) {
+               ret = iomap_apply(inode, offset, length, IOMAP_WRITE,
+                               ops, page, iomap_page_mkwrite_actor);
+               if (unlikely(ret <= 0))
+                       goto out_unlock;
+               offset += ret;
+               length -= ret;
+       }
+
+       set_page_dirty(page);
+       wait_for_stable_page(page);
+       return 0;
+out_unlock:
+       unlock_page(page);
+       return ret;
+}
+EXPORT_SYMBOL_GPL(iomap_page_mkwrite);
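
Wiring this into a fault handler is then a thin wrapper plus the usual
pagefault bookkeeping; a rough sketch with the hypothetical ops again (a real
filesystem would also take its mmap/extent lock around the call):

/* Sketch only: myfs_* is hypothetical; fs locking deliberately omitted. */
static int myfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	struct inode *inode = file_inode(vma->vm_file);
	int ret;

	sb_start_pagefault(inode->i_sb);
	file_update_time(vma->vm_file);
	ret = iomap_page_mkwrite(vma, vmf, &myfs_iomap_ops);
	sb_end_pagefault(inode->i_sb);

	return block_page_mkwrite_return(ret);
}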
+
+struct fiemap_ctx {
+       struct fiemap_extent_info *fi;
+       struct iomap prev;
+};
+
+static int iomap_to_fiemap(struct fiemap_extent_info *fi,
+               struct iomap *iomap, u32 flags)
+{
+       switch (iomap->type) {
+       case IOMAP_HOLE:
+               /* skip holes */
+               return 0;
+       case IOMAP_DELALLOC:
+               flags |= FIEMAP_EXTENT_DELALLOC | FIEMAP_EXTENT_UNKNOWN;
+               break;
+       case IOMAP_UNWRITTEN:
+               flags |= FIEMAP_EXTENT_UNWRITTEN;
+               break;
+       case IOMAP_MAPPED:
+               break;
+       }
+
+       return fiemap_fill_next_extent(fi, iomap->offset,
+                       iomap->blkno != IOMAP_NULL_BLOCK ? iomap->blkno << 9: 0,
+                       iomap->length, flags | FIEMAP_EXTENT_MERGED);
+
+}
+
+static loff_t
+iomap_fiemap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
+               struct iomap *iomap)
+{
+       struct fiemap_ctx *ctx = data;
+       loff_t ret = length;
+
+       if (iomap->type == IOMAP_HOLE)
+               return length;
+
+       ret = iomap_to_fiemap(ctx->fi, &ctx->prev, 0);
+       ctx->prev = *iomap;
+       switch (ret) {
+       case 0:         /* success */
+               return length;
+       case 1:         /* extent array full */
+               return 0;
+       default:
+               return ret;
+       }
+}
+
+int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fi,
+               loff_t start, loff_t len, struct iomap_ops *ops)
+{
+       struct fiemap_ctx ctx;
+       loff_t ret;
+
+       memset(&ctx, 0, sizeof(ctx));
+       ctx.fi = fi;
+       ctx.prev.type = IOMAP_HOLE;
+
+       ret = fiemap_check_flags(fi, FIEMAP_FLAG_SYNC);
+       if (ret)
+               return ret;
+
+       ret = filemap_write_and_wait(inode->i_mapping);
+       if (ret)
+               return ret;
+
+       while (len > 0) {
+               ret = iomap_apply(inode, start, len, 0, ops, &ctx,
+                               iomap_fiemap_actor);
+               if (ret < 0)
+                       return ret;
+               if (ret == 0)
+                       break;
+
+               start += ret;
+               len -= ret;
+       }
+
+       if (ctx.prev.type != IOMAP_HOLE) {
+               ret = iomap_to_fiemap(fi, &ctx.prev, FIEMAP_EXTENT_LAST);
+               if (ret < 0)
+                       return ret;
+       }
+
+       return 0;
+}
+EXPORT_SYMBOL_GPL(iomap_fiemap);
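
The corresponding ->fiemap method also reduces to a thin wrapper; a sketch,
again with the hypothetical ops:

/* Sketch only: myfs_* is hypothetical; any serialisation is up to the fs. */
static int myfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
		u64 start, u64 length)
{
	return iomap_fiemap(inode, fieinfo, start, length, &myfs_iomap_ops);
}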
index 31f3df1..ad2c05e 100644 (file)
@@ -2,6 +2,7 @@
  * Copyright (c) 2014-2016 Christoph Hellwig.
  */
 #include <linux/exportfs.h>
+#include <linux/iomap.h>
 #include <linux/genhd.h>
 #include <linux/slab.h>
 #include <linux/pr.h>
index 6c3b316..4ebaaf4 100644 (file)
@@ -3,6 +3,7 @@
  */
 #include <linux/sunrpc/svc.h>
 #include <linux/exportfs.h>
+#include <linux/iomap.h>
 #include <linux/nfs4.h>
 
 #include "nfsd.h"
index 5d47b4d..35faf12 100644 (file)
@@ -4,6 +4,7 @@ config XFS_FS
        depends on (64BIT || LBDAF)
        select EXPORTFS
        select LIBCRC32C
+       select FS_IOMAP
        help
          XFS is a high performance journaling filesystem which originated
          on the SGI IRIX platform.  It is completely multi-threaded, can
index a708e38..88c26b8 100644 (file)
@@ -84,7 +84,7 @@ xfs_alloc_lookup_ge(
  * Lookup the first record less than or equal to [bno, len]
  * in the btree given by cur.
  */
-int                                    /* error */
+static int                             /* error */
 xfs_alloc_lookup_le(
        struct xfs_btree_cur    *cur,   /* btree cursor */
        xfs_agblock_t           bno,    /* starting block of extent */
@@ -1839,19 +1839,8 @@ void
 xfs_alloc_compute_maxlevels(
        xfs_mount_t     *mp)    /* file system mount structure */
 {
-       int             level;
-       uint            maxblocks;
-       uint            maxleafents;
-       int             minleafrecs;
-       int             minnoderecs;
-
-       maxleafents = (mp->m_sb.sb_agblocks + 1) / 2;
-       minleafrecs = mp->m_alloc_mnr[0];
-       minnoderecs = mp->m_alloc_mnr[1];
-       maxblocks = (maxleafents + minleafrecs - 1) / minleafrecs;
-       for (level = 1; maxblocks > 1; level++)
-               maxblocks = (maxblocks + minnoderecs - 1) / minnoderecs;
-       mp->m_ag_maxlevels = level;
+       mp->m_ag_maxlevels = xfs_btree_compute_maxlevels(mp, mp->m_alloc_mnr,
+                       (mp->m_sb.sb_agblocks + 1) / 2);
 }
 
 /*
@@ -2658,55 +2647,79 @@ error0:
        return error;
 }
 
-/*
- * Free an extent.
- * Just break up the extent address and hand off to xfs_free_ag_extent
- * after fixing up the freelist.
- */
-int                            /* error */
-xfs_free_extent(
-       xfs_trans_t     *tp,    /* transaction pointer */
-       xfs_fsblock_t   bno,    /* starting block number of extent */
-       xfs_extlen_t    len)    /* length of extent */
+/* Ensure that the freelist is at full capacity. */
+int
+xfs_free_extent_fix_freelist(
+       struct xfs_trans        *tp,
+       xfs_agnumber_t          agno,
+       struct xfs_buf          **agbp)
 {
-       xfs_alloc_arg_t args;
-       int             error;
+       struct xfs_alloc_arg    args;
+       int                     error;
 
-       ASSERT(len != 0);
-       memset(&args, 0, sizeof(xfs_alloc_arg_t));
+       memset(&args, 0, sizeof(struct xfs_alloc_arg));
        args.tp = tp;
        args.mp = tp->t_mountp;
+       args.agno = agno;
 
        /*
         * validate that the block number is legal - this enables us to detect
         * and handle a silent filesystem corruption rather than crashing.
         */
-       args.agno = XFS_FSB_TO_AGNO(args.mp, bno);
        if (args.agno >= args.mp->m_sb.sb_agcount)
                return -EFSCORRUPTED;
 
-       args.agbno = XFS_FSB_TO_AGBNO(args.mp, bno);
-       if (args.agbno >= args.mp->m_sb.sb_agblocks)
-               return -EFSCORRUPTED;
-
        args.pag = xfs_perag_get(args.mp, args.agno);
        ASSERT(args.pag);
 
        error = xfs_alloc_fix_freelist(&args, XFS_ALLOC_FLAG_FREEING);
        if (error)
-               goto error0;
+               goto out;
+
+       *agbp = args.agbp;
+out:
+       xfs_perag_put(args.pag);
+       return error;
+}
+
+/*
+ * Free an extent.
+ * Just break up the extent address and hand off to xfs_free_ag_extent
+ * after fixing up the freelist.
+ */
+int                            /* error */
+xfs_free_extent(
+       struct xfs_trans        *tp,    /* transaction pointer */
+       xfs_fsblock_t           bno,    /* starting block number of extent */
+       xfs_extlen_t            len)    /* length of extent */
+{
+       struct xfs_mount        *mp = tp->t_mountp;
+       struct xfs_buf          *agbp;
+       xfs_agnumber_t          agno = XFS_FSB_TO_AGNO(mp, bno);
+       xfs_agblock_t           agbno = XFS_FSB_TO_AGBNO(mp, bno);
+       int                     error;
+
+       ASSERT(len != 0);
+
+       error = xfs_free_extent_fix_freelist(tp, agno, &agbp);
+       if (error)
+               return error;
+
+       XFS_WANT_CORRUPTED_GOTO(mp, agbno < mp->m_sb.sb_agblocks, err);
 
        /* validate the extent size is legal now we have the agf locked */
-       if (args.agbno + len >
-                       be32_to_cpu(XFS_BUF_TO_AGF(args.agbp)->agf_length)) {
-               error = -EFSCORRUPTED;
-               goto error0;
-       }
+       XFS_WANT_CORRUPTED_GOTO(mp,
+               agbno + len <= be32_to_cpu(XFS_BUF_TO_AGF(agbp)->agf_length),
+                               err);
 
-       error = xfs_free_ag_extent(tp, args.agbp, args.agno, args.agbno, len, 0);
-       if (!error)
-               xfs_extent_busy_insert(tp, args.agno, args.agbno, len, 0);
-error0:
-       xfs_perag_put(args.pag);
+       error = xfs_free_ag_extent(tp, agbp, agno, agbno, len, 0);
+       if (error)
+               goto err;
+
+       xfs_extent_busy_insert(tp, agno, agbno, len, 0);
+       return 0;
+
+err:
+       xfs_trans_brelse(tp, agbp);
        return error;
 }
index 135eb3d..cf268b2 100644 (file)
@@ -212,13 +212,6 @@ xfs_free_extent(
        xfs_fsblock_t   bno,    /* starting block number of extent */
        xfs_extlen_t    len);   /* length of extent */
 
-int                                    /* error */
-xfs_alloc_lookup_le(
-       struct xfs_btree_cur    *cur,   /* btree cursor */
-       xfs_agblock_t           bno,    /* starting block of extent */
-       xfs_extlen_t            len,    /* length of extent */
-       int                     *stat); /* success/failure */
-
 int                            /* error */
 xfs_alloc_lookup_ge(
        struct xfs_btree_cur    *cur,   /* btree cursor */
@@ -236,5 +229,7 @@ xfs_alloc_get_rec(
 int xfs_read_agf(struct xfs_mount *mp, struct xfs_trans *tp,
                        xfs_agnumber_t agno, int flags, struct xfs_buf **bpp);
 int xfs_alloc_fix_freelist(struct xfs_alloc_arg *args, int flags);
+int xfs_free_extent_fix_freelist(struct xfs_trans *tp, xfs_agnumber_t agno,
+               struct xfs_buf **agbp);
 
 #endif /* __XFS_ALLOC_H__ */
index 882c8d3..4f2aed0 100644 (file)
@@ -50,7 +50,6 @@ int   xfs_attr_shortform_lookup(struct xfs_da_args *args);
 int    xfs_attr_shortform_getvalue(struct xfs_da_args *args);
 int    xfs_attr_shortform_to_leaf(struct xfs_da_args *args);
 int    xfs_attr_shortform_remove(struct xfs_da_args *args);
-int    xfs_attr_shortform_list(struct xfs_attr_list_context *context);
 int    xfs_attr_shortform_allfit(struct xfs_buf *bp, struct xfs_inode *dp);
 int    xfs_attr_shortform_bytesfit(xfs_inode_t *dp, int bytes);
 void   xfs_attr_fork_remove(struct xfs_inode *ip, struct xfs_trans *tp);
@@ -88,8 +87,6 @@ int   xfs_attr3_leaf_toosmall(struct xfs_da_state *state, int *retval);
 void   xfs_attr3_leaf_unbalance(struct xfs_da_state *state,
                                       struct xfs_da_state_blk *drop_blk,
                                       struct xfs_da_state_blk *save_blk);
-int    xfs_attr3_root_inactive(struct xfs_trans **trans, struct xfs_inode *dp);
-
 /*
  * Utility routines.
  */
index 932381c..2f2c85c 100644 (file)
@@ -570,14 +570,12 @@ xfs_bmap_validate_ret(
  */
 void
 xfs_bmap_add_free(
+       struct xfs_mount        *mp,            /* mount point structure */
+       struct xfs_bmap_free    *flist,         /* list of extents */
        xfs_fsblock_t           bno,            /* fs block number of extent */
-       xfs_filblks_t           len,            /* length of extent */
-       xfs_bmap_free_t         *flist,         /* list of extents */
-       xfs_mount_t             *mp)            /* mount point structure */
+       xfs_filblks_t           len)            /* length of extent */
 {
-       xfs_bmap_free_item_t    *cur;           /* current (next) element */
-       xfs_bmap_free_item_t    *new;           /* new element */
-       xfs_bmap_free_item_t    *prev;          /* previous element */
+       struct xfs_bmap_free_item       *new;           /* new element */
 #ifdef DEBUG
        xfs_agnumber_t          agno;
        xfs_agblock_t           agbno;
@@ -597,17 +595,7 @@ xfs_bmap_add_free(
        new = kmem_zone_alloc(xfs_bmap_free_item_zone, KM_SLEEP);
        new->xbfi_startblock = bno;
        new->xbfi_blockcount = (xfs_extlen_t)len;
-       for (prev = NULL, cur = flist->xbf_first;
-            cur != NULL;
-            prev = cur, cur = cur->xbfi_next) {
-               if (cur->xbfi_startblock >= bno)
-                       break;
-       }
-       if (prev)
-               prev->xbfi_next = new;
-       else
-               flist->xbf_first = new;
-       new->xbfi_next = cur;
+       list_add(&new->xbfi_list, &flist->xbf_flist);
        flist->xbf_count++;
 }
 
@@ -617,14 +605,10 @@ xfs_bmap_add_free(
  */
 void
 xfs_bmap_del_free(
-       xfs_bmap_free_t         *flist, /* free item list header */
-       xfs_bmap_free_item_t    *prev,  /* previous item on list, if any */
-       xfs_bmap_free_item_t    *free)  /* list item to be freed */
+       struct xfs_bmap_free            *flist, /* free item list header */
+       struct xfs_bmap_free_item       *free)  /* list item to be freed */
 {
-       if (prev)
-               prev->xbfi_next = free->xbfi_next;
-       else
-               flist->xbf_first = free->xbfi_next;
+       list_del(&free->xbfi_list);
        flist->xbf_count--;
        kmem_zone_free(xfs_bmap_free_item_zone, free);
 }
@@ -634,17 +618,16 @@ xfs_bmap_del_free(
  */
 void
 xfs_bmap_cancel(
-       xfs_bmap_free_t         *flist) /* list of bmap_free_items */
+       struct xfs_bmap_free            *flist) /* list of bmap_free_items */
 {
-       xfs_bmap_free_item_t    *free;  /* free list item */
-       xfs_bmap_free_item_t    *next;
+       struct xfs_bmap_free_item       *free;  /* free list item */
 
        if (flist->xbf_count == 0)
                return;
-       ASSERT(flist->xbf_first != NULL);
-       for (free = flist->xbf_first; free; free = next) {
-               next = free->xbfi_next;
-               xfs_bmap_del_free(flist, NULL, free);
+       while (!list_empty(&flist->xbf_flist)) {
+               free = list_first_entry(&flist->xbf_flist,
+                               struct xfs_bmap_free_item, xbfi_list);
+               xfs_bmap_del_free(flist, free);
        }
        ASSERT(flist->xbf_count == 0);
 }
@@ -699,7 +682,7 @@ xfs_bmap_btree_to_extents(
        cblock = XFS_BUF_TO_BLOCK(cbp);
        if ((error = xfs_btree_check_block(cur, cblock, 0, cbp)))
                return error;
-       xfs_bmap_add_free(cbno, 1, cur->bc_private.b.flist, mp);
+       xfs_bmap_add_free(mp, cur->bc_private.b.flist, cbno, 1);
        ip->i_d.di_nblocks--;
        xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, -1L);
        xfs_trans_binval(tp, cbp);
@@ -5073,8 +5056,8 @@ xfs_bmap_del_extent(
         * If we need to, add to list of extents to delete.
         */
        if (do_fx)
-               xfs_bmap_add_free(del->br_startblock, del->br_blockcount, flist,
-                       mp);
+               xfs_bmap_add_free(mp, flist, del->br_startblock,
+                       del->br_blockcount);
        /*
         * Adjust inode # blocks in the file.
         */
index 423a34e..f1f3ae6 100644 (file)
@@ -62,12 +62,12 @@ struct xfs_bmalloca {
  * List of extents to be free "later".
  * The list is kept sorted on xbf_startblock.
  */
-typedef struct xfs_bmap_free_item
+struct xfs_bmap_free_item
 {
        xfs_fsblock_t           xbfi_startblock;/* starting fs block number */
        xfs_extlen_t            xbfi_blockcount;/* number of blocks in extent */
-       struct xfs_bmap_free_item *xbfi_next;   /* link to next entry */
-} xfs_bmap_free_item_t;
+       struct list_head        xbfi_list;
+};
 
 /*
  * Header for free extent list.
@@ -85,7 +85,7 @@ typedef struct xfs_bmap_free_item
  */
 typedef        struct xfs_bmap_free
 {
-       xfs_bmap_free_item_t    *xbf_first;     /* list of to-be-free extents */
+       struct list_head        xbf_flist;      /* list of to-be-free extents */
        int                     xbf_count;      /* count of items on list */
        int                     xbf_low;        /* alloc in low mode */
 } xfs_bmap_free_t;
@@ -141,8 +141,10 @@ static inline int xfs_bmapi_aflag(int w)
 
 static inline void xfs_bmap_init(xfs_bmap_free_t *flp, xfs_fsblock_t *fbp)
 {
-       ((flp)->xbf_first = NULL, (flp)->xbf_count = 0, \
-               (flp)->xbf_low = 0, *(fbp) = NULLFSBLOCK);
+       INIT_LIST_HEAD(&flp->xbf_flist);
+       flp->xbf_count = 0;
+       flp->xbf_low = 0;
+       *fbp = NULLFSBLOCK;
 }
 
 /*
@@ -191,8 +193,8 @@ void        xfs_bmap_trace_exlist(struct xfs_inode *ip, xfs_extnum_t cnt,
 
 int    xfs_bmap_add_attrfork(struct xfs_inode *ip, int size, int rsvd);
 void   xfs_bmap_local_to_extents_empty(struct xfs_inode *ip, int whichfork);
-void   xfs_bmap_add_free(xfs_fsblock_t bno, xfs_filblks_t len,
-               struct xfs_bmap_free *flist, struct xfs_mount *mp);
+void   xfs_bmap_add_free(struct xfs_mount *mp, struct xfs_bmap_free *flist,
+                         xfs_fsblock_t bno, xfs_filblks_t len);
 void   xfs_bmap_cancel(struct xfs_bmap_free *flist);
 int    xfs_bmap_finish(struct xfs_trans **tp, struct xfs_bmap_free *flist,
                        struct xfs_inode *ip);
index 6282f6e..db0c71e 100644 (file)
@@ -526,7 +526,7 @@ xfs_bmbt_free_block(
        struct xfs_trans        *tp = cur->bc_tp;
        xfs_fsblock_t           fsbno = XFS_DADDR_TO_FSB(mp, XFS_BUF_ADDR(bp));
 
-       xfs_bmap_add_free(fsbno, 1, cur->bc_private.b.flist, mp);
+       xfs_bmap_add_free(mp, cur->bc_private.b.flist, fsbno, 1);
        ip->i_d.di_nblocks--;
 
        xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
index 1f88e1c..07eeb0b 100644 (file)
@@ -543,12 +543,12 @@ xfs_btree_ptr_addr(
  */
 STATIC struct xfs_btree_block *
 xfs_btree_get_iroot(
-       struct xfs_btree_cur    *cur)
+       struct xfs_btree_cur    *cur)
 {
-       struct xfs_ifork        *ifp;
+       struct xfs_ifork        *ifp;
 
-       ifp = XFS_IFORK_PTR(cur->bc_private.b.ip, cur->bc_private.b.whichfork);
-       return (struct xfs_btree_block *)ifp->if_broot;
+       ifp = XFS_IFORK_PTR(cur->bc_private.b.ip, cur->bc_private.b.whichfork);
+       return (struct xfs_btree_block *)ifp->if_broot;
 }
 
 /*
@@ -4152,3 +4152,22 @@ xfs_btree_sblock_verify(
 
        return true;
 }
+
+/*
+ * Calculate the number of btree levels needed to store a given number of
+ * records in a short-format btree.
+ */
+uint
+xfs_btree_compute_maxlevels(
+       struct xfs_mount        *mp,
+       uint                    *limits,
+       unsigned long           len)
+{
+       uint                    level;
+       unsigned long           maxblocks;
+
+       maxblocks = (len + limits[0] - 1) / limits[0];
+       for (level = 1; maxblocks > 1; level++)
+               maxblocks = (maxblocks + limits[1] - 1) / limits[1];
+       return level;
+}
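
A worked example of the computation, with assumed per-block record limits
rather than real on-disk values:

	/*
	 * Assume limits[0] = 500 minimum records per leaf, limits[1] = 250
	 * minimum records per node, and len = 1,000,000 records to index:
	 *
	 *   leaves needed:  (1000000 + 499) / 500 = 2000   (level 1)
	 *   nodes above:    (2000 + 249) / 250    =    8   (level 2)
	 *   nodes above:    (8 + 249) / 250       =    1   -> root, 3 levels
	 *
	 * xfs_alloc_compute_maxlevels() above feeds in mp->m_alloc_mnr and
	 * half the AG block count as the worst-case record count.
	 */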
index 2e874be..785a996 100644 (file)
@@ -474,5 +474,7 @@ static inline int xfs_btree_get_level(struct xfs_btree_block *block)
 
 bool xfs_btree_sblock_v5hdr_verify(struct xfs_buf *bp);
 bool xfs_btree_sblock_verify(struct xfs_buf *bp, unsigned int max_recs);
+uint xfs_btree_compute_maxlevels(struct xfs_mount *mp, uint *limits,
+                                unsigned long len);
 
 #endif /* __XFS_BTREE_H__ */
index 097bf77..0f1f165 100644 (file)
@@ -356,7 +356,6 @@ xfs_da3_split(
        struct xfs_da_state_blk *newblk;
        struct xfs_da_state_blk *addblk;
        struct xfs_da_intnode   *node;
-       struct xfs_buf          *bp;
        int                     max;
        int                     action = 0;
        int                     error;
@@ -397,7 +396,9 @@ xfs_da3_split(
                                break;
                        }
                        /*
-                        * Entry wouldn't fit, split the leaf again.
+                        * Entry wouldn't fit, split the leaf again. The new
+                        * extrablk will be consumed by xfs_da3_node_split if
+                        * the node is split.
                         */
                        state->extravalid = 1;
                        if (state->inleaf) {
@@ -445,6 +446,14 @@ xfs_da3_split(
        if (!addblk)
                return 0;
 
+       /*
+        * xfs_da3_node_split() should have consumed any extra blocks we added
+        * during a double leaf split in the attr fork. This is guaranteed as
+        * we can't be here if the attr fork only has a single leaf block.
+        */
+       ASSERT(state->extravalid == 0 ||
+              state->path.blk[max].magic == XFS_DIR2_LEAFN_MAGIC);
+
        /*
         * Split the root node.
         */
@@ -457,43 +466,33 @@ xfs_da3_split(
        }
 
        /*
-        * Update pointers to the node which used to be block 0 and
-        * just got bumped because of the addition of a new root node.
-        * There might be three blocks involved if a double split occurred,
-        * and the original block 0 could be at any position in the list.
+        * Update pointers to the node which used to be block 0 and just got
+        * bumped because of the addition of a new root node.  Note that the
+        * original block 0 could be at any position in the list of blocks in
+        * the tree.
         *
-        * Note: the magic numbers and sibling pointers are in the same
-        * physical place for both v2 and v3 headers (by design). Hence it
-        * doesn't matter which version of the xfs_da_intnode structure we use
-        * here as the result will be the same using either structure.
+        * Note: the magic numbers and sibling pointers are in the same physical
+        * place for both v2 and v3 headers (by design). Hence it doesn't matter
+        * which version of the xfs_da_intnode structure we use here as the
+        * result will be the same using either structure.
         */
        node = oldblk->bp->b_addr;
        if (node->hdr.info.forw) {
-               if (be32_to_cpu(node->hdr.info.forw) == addblk->blkno) {
-                       bp = addblk->bp;
-               } else {
-                       ASSERT(state->extravalid);
-                       bp = state->extrablk.bp;
-               }
-               node = bp->b_addr;
+               ASSERT(be32_to_cpu(node->hdr.info.forw) == addblk->blkno);
+               node = addblk->bp->b_addr;
                node->hdr.info.back = cpu_to_be32(oldblk->blkno);
-               xfs_trans_log_buf(state->args->trans, bp,
-                   XFS_DA_LOGRANGE(node, &node->hdr.info,
-                   sizeof(node->hdr.info)));
+               xfs_trans_log_buf(state->args->trans, addblk->bp,
+                                 XFS_DA_LOGRANGE(node, &node->hdr.info,
+                                 sizeof(node->hdr.info)));
        }
        node = oldblk->bp->b_addr;
        if (node->hdr.info.back) {
-               if (be32_to_cpu(node->hdr.info.back) == addblk->blkno) {
-                       bp = addblk->bp;
-               } else {
-                       ASSERT(state->extravalid);
-                       bp = state->extrablk.bp;
-               }
-               node = bp->b_addr;
+               ASSERT(be32_to_cpu(node->hdr.info.back) == addblk->blkno);
+               node = addblk->bp->b_addr;
                node->hdr.info.forw = cpu_to_be32(oldblk->blkno);
-               xfs_trans_log_buf(state->args->trans, bp,
-                   XFS_DA_LOGRANGE(node, &node->hdr.info,
-                   sizeof(node->hdr.info)));
+               xfs_trans_log_buf(state->args->trans, addblk->bp,
+                                 XFS_DA_LOGRANGE(node, &node->hdr.info,
+                                 sizeof(node->hdr.info)));
        }
        addblk->bp = NULL;
        return 0;
index 9d624a6..f1e8d4d 100644 (file)
@@ -40,8 +40,7 @@ xfs_dir2_sf_entsize(
        int count = sizeof(struct xfs_dir2_sf_entry);   /* namelen + offset */
 
        count += len;                                   /* name */
-       count += hdr->i8count ? sizeof(xfs_dir2_ino8_t) :
-                               sizeof(xfs_dir2_ino4_t); /* ino # */
+       count += hdr->i8count ? XFS_INO64_SIZE : XFS_INO32_SIZE; /* ino # */
        return count;
 }
 
@@ -125,33 +124,33 @@ xfs_dir3_sfe_put_ftype(
 static xfs_ino_t
 xfs_dir2_sf_get_ino(
        struct xfs_dir2_sf_hdr  *hdr,
-       xfs_dir2_inou_t         *from)
+       __uint8_t               *from)
 {
        if (hdr->i8count)
-               return get_unaligned_be64(&from->i8.i) & 0x00ffffffffffffffULL;
+               return get_unaligned_be64(from) & 0x00ffffffffffffffULL;
        else
-               return get_unaligned_be32(&from->i4.i);
+               return get_unaligned_be32(from);
 }
 
 static void
 xfs_dir2_sf_put_ino(
        struct xfs_dir2_sf_hdr  *hdr,
-       xfs_dir2_inou_t         *to,
+       __uint8_t               *to,
        xfs_ino_t               ino)
 {
        ASSERT((ino & 0xff00000000000000ULL) == 0);
 
        if (hdr->i8count)
-               put_unaligned_be64(ino, &to->i8.i);
+               put_unaligned_be64(ino, to);
        else
-               put_unaligned_be32(ino, &to->i4.i);
+               put_unaligned_be32(ino, to);
 }
 
 static xfs_ino_t
 xfs_dir2_sf_get_parent_ino(
        struct xfs_dir2_sf_hdr  *hdr)
 {
-       return xfs_dir2_sf_get_ino(hdr, &hdr->parent);
+       return xfs_dir2_sf_get_ino(hdr, hdr->parent);
 }
 
 static void
@@ -159,7 +158,7 @@ xfs_dir2_sf_put_parent_ino(
        struct xfs_dir2_sf_hdr  *hdr,
        xfs_ino_t               ino)
 {
-       xfs_dir2_sf_put_ino(hdr, &hdr->parent, ino);
+       xfs_dir2_sf_put_ino(hdr, hdr->parent, ino);
 }
 
 /*
@@ -173,8 +172,7 @@ xfs_dir2_sfe_get_ino(
        struct xfs_dir2_sf_hdr  *hdr,
        struct xfs_dir2_sf_entry *sfep)
 {
-       return xfs_dir2_sf_get_ino(hdr,
-                               (xfs_dir2_inou_t *)&sfep->name[sfep->namelen]);
+       return xfs_dir2_sf_get_ino(hdr, &sfep->name[sfep->namelen]);
 }
 
 static void
@@ -183,8 +181,7 @@ xfs_dir2_sfe_put_ino(
        struct xfs_dir2_sf_entry *sfep,
        xfs_ino_t               ino)
 {
-       xfs_dir2_sf_put_ino(hdr,
-                           (xfs_dir2_inou_t *)&sfep->name[sfep->namelen], ino);
+       xfs_dir2_sf_put_ino(hdr, &sfep->name[sfep->namelen], ino);
 }
 
 static xfs_ino_t
@@ -192,8 +189,7 @@ xfs_dir3_sfe_get_ino(
        struct xfs_dir2_sf_hdr  *hdr,
        struct xfs_dir2_sf_entry *sfep)
 {
-       return xfs_dir2_sf_get_ino(hdr,
-                       (xfs_dir2_inou_t *)&sfep->name[sfep->namelen + 1]);
+       return xfs_dir2_sf_get_ino(hdr, &sfep->name[sfep->namelen + 1]);
 }
 
 static void
@@ -202,8 +198,7 @@ xfs_dir3_sfe_put_ino(
        struct xfs_dir2_sf_entry *sfep,
        xfs_ino_t               ino)
 {
-       xfs_dir2_sf_put_ino(hdr,
-                       (xfs_dir2_inou_t *)&sfep->name[sfep->namelen + 1], ino);
+       xfs_dir2_sf_put_ino(hdr, &sfep->name[sfep->namelen + 1], ino);
 }
 
 
index 8d4d8bc..685f23b 100644 (file)
@@ -191,12 +191,6 @@ typedef    __uint16_t      xfs_dir2_data_off_t;
 #define        NULLDATAOFF     0xffffU
 typedef uint           xfs_dir2_data_aoff_t;   /* argument form */
 
-/*
- * Normalized offset (in a data block) of the entry, really xfs_dir2_data_off_t.
- * Only need 16 bits, this is the byte offset into the single block form.
- */
-typedef struct { __uint8_t i[2]; } __arch_pack xfs_dir2_sf_off_t;
-
 /*
  * Offset in data space of a data entry.
  */
@@ -214,22 +208,10 @@ typedef   xfs_off_t       xfs_dir2_off_t;
  */
 typedef        __uint32_t      xfs_dir2_db_t;
 
-/*
- * Inode number stored as 8 8-bit values.
- */
-typedef        struct { __uint8_t i[8]; } xfs_dir2_ino8_t;
-
-/*
- * Inode number stored as 4 8-bit values.
- * Works a lot of the time, when all the inode numbers in a directory
- * fit in 32 bits.
- */
-typedef struct { __uint8_t i[4]; } xfs_dir2_ino4_t;
+#define XFS_INO32_SIZE 4
+#define XFS_INO64_SIZE 8
+#define XFS_INO64_DIFF (XFS_INO64_SIZE - XFS_INO32_SIZE)
 
-typedef union {
-       xfs_dir2_ino8_t i8;
-       xfs_dir2_ino4_t i4;
-} xfs_dir2_inou_t;
 #define        XFS_DIR2_MAX_SHORT_INUM ((xfs_ino_t)0xffffffffULL)
 
 /*
@@ -246,39 +228,38 @@ typedef union {
 typedef struct xfs_dir2_sf_hdr {
        __uint8_t               count;          /* count of entries */
        __uint8_t               i8count;        /* count of 8-byte inode #s */
-       xfs_dir2_inou_t         parent;         /* parent dir inode number */
-} __arch_pack xfs_dir2_sf_hdr_t;
+       __uint8_t               parent[8];      /* parent dir inode number */
+} __packed xfs_dir2_sf_hdr_t;
 
 typedef struct xfs_dir2_sf_entry {
        __u8                    namelen;        /* actual name length */
-       xfs_dir2_sf_off_t       offset;         /* saved offset */
+       __u8                    offset[2];      /* saved offset */
        __u8                    name[];         /* name, variable size */
        /*
         * A single byte containing the file type field follows the inode
         * number for version 3 directory entries.
         *
-        * A xfs_dir2_ino8_t or xfs_dir2_ino4_t follows here, at a
-        * variable offset after the name.
+        * A 64-bit or 32-bit inode number follows here, at a variable offset
+        * after the name.
         */
-} __arch_pack xfs_dir2_sf_entry_t;
+} xfs_dir2_sf_entry_t;
 
 static inline int xfs_dir2_sf_hdr_size(int i8count)
 {
        return sizeof(struct xfs_dir2_sf_hdr) -
-               (i8count == 0) *
-               (sizeof(xfs_dir2_ino8_t) - sizeof(xfs_dir2_ino4_t));
+               (i8count == 0) * XFS_INO64_DIFF;
 }
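
With the packed header the size arithmetic works out as follows (a worked
example, not part of the patch):

	/*
	 * The packed header is 1 (count) + 1 (i8count) + 8 (parent) = 10
	 * bytes.  With i8count == 0 the parent inode number only needs 4
	 * bytes, so XFS_INO64_DIFF (4) is subtracted:
	 *
	 *   xfs_dir2_sf_hdr_size(0) = 10 - 4 =  6 bytes
	 *   xfs_dir2_sf_hdr_size(1) = 10 - 0 = 10 bytes
	 *
	 * matching the layout the old __arch_pack annotation produced.
	 */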
 
 static inline xfs_dir2_data_aoff_t
 xfs_dir2_sf_get_offset(xfs_dir2_sf_entry_t *sfep)
 {
-       return get_unaligned_be16(&sfep->offset.i);
+       return get_unaligned_be16(sfep->offset);
 }
 
 static inline void
 xfs_dir2_sf_put_offset(xfs_dir2_sf_entry_t *sfep, xfs_dir2_data_aoff_t off)
 {
-       put_unaligned_be16(off, &sfep->offset.i);
+       put_unaligned_be16(off, sfep->offset);
 }
 
 static inline struct xfs_dir2_sf_entry *
index e5bb9cc..c6809ff 100644 (file)
@@ -126,13 +126,12 @@ xfs_dir2_block_sfsize(
                /*
                 * Calculate the new size, see if we should give up yet.
                 */
-               size = xfs_dir2_sf_hdr_size(i8count) +          /* header */
-                      count +                                  /* namelen */
-                      count * (uint)sizeof(xfs_dir2_sf_off_t) + /* offset */
-                      namelen +                                /* name */
-                      (i8count ?                               /* inumber */
-                               (uint)sizeof(xfs_dir2_ino8_t) * count :
-                               (uint)sizeof(xfs_dir2_ino4_t) * count);
+               size = xfs_dir2_sf_hdr_size(i8count) +  /* header */
+                      count * 3 * sizeof(u8) +         /* namelen + offset */
+                      namelen +                        /* name */
+                      (i8count ?                       /* inumber */
+                               count * XFS_INO64_SIZE :
+                               count * XFS_INO32_SIZE);
                if (size > XFS_IFORK_DSIZE(dp))
                        return size;            /* size value is a failure */
        }
@@ -319,10 +318,7 @@ xfs_dir2_sf_addname(
                /*
                 * Yes, adjust the inode size.  old count + (parent + new)
                 */
-               incr_isize +=
-                       (sfp->count + 2) *
-                       ((uint)sizeof(xfs_dir2_ino8_t) -
-                        (uint)sizeof(xfs_dir2_ino4_t));
+               incr_isize += (sfp->count + 2) * XFS_INO64_DIFF;
                objchange = 1;
        }
 
@@ -897,11 +893,7 @@ xfs_dir2_sf_replace(
                int     error;                  /* error return value */
                int     newsize;                /* new inode size */
 
-               newsize =
-                       dp->i_df.if_bytes +
-                       (sfp->count + 1) *
-                       ((uint)sizeof(xfs_dir2_ino8_t) -
-                        (uint)sizeof(xfs_dir2_ino4_t));
+               newsize = dp->i_df.if_bytes + (sfp->count + 1) * XFS_INO64_DIFF;
                /*
                 * Won't fit as shortform, convert to block then do replace.
                 */
@@ -1022,10 +1014,7 @@ xfs_dir2_sf_toino4(
        /*
         * Compute the new inode size.
         */
-       newsize =
-               oldsize -
-               (oldsfp->count + 1) *
-               ((uint)sizeof(xfs_dir2_ino8_t) - (uint)sizeof(xfs_dir2_ino4_t));
+       newsize = oldsize - (oldsfp->count + 1) * XFS_INO64_DIFF;
        xfs_idata_realloc(dp, -oldsize, XFS_DATA_FORK);
        xfs_idata_realloc(dp, newsize, XFS_DATA_FORK);
        /*
@@ -1048,7 +1037,7 @@ xfs_dir2_sf_toino4(
             i++, sfep = dp->d_ops->sf_nextentry(sfp, sfep),
                  oldsfep = dp->d_ops->sf_nextentry(oldsfp, oldsfep)) {
                sfep->namelen = oldsfep->namelen;
-               sfep->offset = oldsfep->offset;
+               memcpy(sfep->offset, oldsfep->offset, sizeof(sfep->offset));
                memcpy(sfep->name, oldsfep->name, sfep->namelen);
                dp->d_ops->sf_put_ino(sfp, sfep,
                                      dp->d_ops->sf_get_ino(oldsfp, oldsfep));
@@ -1098,10 +1087,7 @@ xfs_dir2_sf_toino8(
        /*
         * Compute the new inode size (nb: entry count + 1 for parent)
         */
-       newsize =
-               oldsize +
-               (oldsfp->count + 1) *
-               ((uint)sizeof(xfs_dir2_ino8_t) - (uint)sizeof(xfs_dir2_ino4_t));
+       newsize = oldsize + (oldsfp->count + 1) * XFS_INO64_DIFF;
        xfs_idata_realloc(dp, -oldsize, XFS_DATA_FORK);
        xfs_idata_realloc(dp, newsize, XFS_DATA_FORK);
        /*
@@ -1124,7 +1110,7 @@ xfs_dir2_sf_toino8(
             i++, sfep = dp->d_ops->sf_nextentry(sfp, sfep),
                  oldsfep = dp->d_ops->sf_nextentry(oldsfp, oldsfep)) {
                sfep->namelen = oldsfep->namelen;
-               sfep->offset = oldsfep->offset;
+               memcpy(sfep->offset, oldsfep->offset, sizeof(sfep->offset));
                memcpy(sfep->name, oldsfep->name, sfep->namelen);
                dp->d_ops->sf_put_ino(sfp, sfep,
                                      dp->d_ops->sf_get_ino(oldsfp, oldsfep));
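As a sanity check on the new entry-size arithmetic in this file, the formula can be reproduced in user space. The sketch below is only an illustration: XFS_INO32_SIZE and XFS_INO64_SIZE are assumed to be the packed on-disk widths of a 32-bit and a 64-bit inode number (4 and 8 bytes), and sf_hdr_size() is a hypothetical stand-in for xfs_dir2_sf_hdr_size(), not the real helper.

#include <stdio.h>

/* assumed packed widths of a 32-bit and a 64-bit inode number */
#define XFS_INO32_SIZE  4
#define XFS_INO64_SIZE  8
#define XFS_INO64_DIFF  (XFS_INO64_SIZE - XFS_INO32_SIZE)

/*
 * Hypothetical stand-in for xfs_dir2_sf_hdr_size(): count byte, i8count
 * byte, plus a parent inode number whose width depends on i8count.
 */
static int sf_hdr_size(int i8count)
{
        return 2 + (i8count ? XFS_INO64_SIZE : XFS_INO32_SIZE);
}

/*
 * Mirror of the size formula above: header, one namelen byte and a 2-byte
 * offset per entry, the name bytes themselves, and one inode number per
 * entry whose width depends on i8count.
 */
static int sf_dir_size(int count, int i8count, int total_namelen)
{
        return sf_hdr_size(i8count) +
               count * 3 +
               total_namelen +
               count * (i8count ? XFS_INO64_SIZE : XFS_INO32_SIZE);
}

int main(void)
{
        printf("3 entries, 24 name bytes, 32-bit inos: %d bytes\n",
               sf_dir_size(3, 0, 24));
        printf("3 entries, 24 name bytes, 64-bit inos: %d bytes\n",
               sf_dir_size(3, 1, 24));
        return 0;
}

The 16-byte difference between the two results is (count + 1) * XFS_INO64_DIFF: one widened inode number per entry plus the widened parent pointer in the header, which matches the (count + 1) and (count + 2) adjustments made in the hunks above.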
index dc97eb2..adb204d 100644 (file)
@@ -1435,41 +1435,57 @@ typedef __be64 xfs_bmbt_ptr_t, xfs_bmdr_ptr_t;
  * with the crc feature bit, and all accesses to them must be conditional on
  * that flag.
  */
+/* short form block header */
+struct xfs_btree_block_shdr {
+       __be32          bb_leftsib;
+       __be32          bb_rightsib;
+
+       __be64          bb_blkno;
+       __be64          bb_lsn;
+       uuid_t          bb_uuid;
+       __be32          bb_owner;
+       __le32          bb_crc;
+};
+
+/* long form block header */
+struct xfs_btree_block_lhdr {
+       __be64          bb_leftsib;
+       __be64          bb_rightsib;
+
+       __be64          bb_blkno;
+       __be64          bb_lsn;
+       uuid_t          bb_uuid;
+       __be64          bb_owner;
+       __le32          bb_crc;
+       __be32          bb_pad; /* padding for alignment */
+};
+
 struct xfs_btree_block {
        __be32          bb_magic;       /* magic number for block type */
        __be16          bb_level;       /* 0 is a leaf */
        __be16          bb_numrecs;     /* current # of data records */
        union {
-               struct {
-                       __be32          bb_leftsib;
-                       __be32          bb_rightsib;
-
-                       __be64          bb_blkno;
-                       __be64          bb_lsn;
-                       uuid_t          bb_uuid;
-                       __be32          bb_owner;
-                       __le32          bb_crc;
-               } s;                    /* short form pointers */
-               struct  {
-                       __be64          bb_leftsib;
-                       __be64          bb_rightsib;
-
-                       __be64          bb_blkno;
-                       __be64          bb_lsn;
-                       uuid_t          bb_uuid;
-                       __be64          bb_owner;
-                       __le32          bb_crc;
-                       __be32          bb_pad; /* padding for alignment */
-               } l;                    /* long form pointers */
+               struct xfs_btree_block_shdr s;
+               struct xfs_btree_block_lhdr l;
        } bb_u;                         /* rest */
 };
 
-#define XFS_BTREE_SBLOCK_LEN   16      /* size of a short form block */
-#define XFS_BTREE_LBLOCK_LEN   24      /* size of a long form block */
+/* size of a short form block */
+#define XFS_BTREE_SBLOCK_LEN \
+       (offsetof(struct xfs_btree_block, bb_u) + \
+        offsetof(struct xfs_btree_block_shdr, bb_blkno))
+/* size of a long form block */
+#define XFS_BTREE_LBLOCK_LEN \
+       (offsetof(struct xfs_btree_block, bb_u) + \
+        offsetof(struct xfs_btree_block_lhdr, bb_blkno))
 
 /* sizes of CRC enabled btree blocks */
-#define XFS_BTREE_SBLOCK_CRC_LEN       (XFS_BTREE_SBLOCK_LEN + 40)
-#define XFS_BTREE_LBLOCK_CRC_LEN       (XFS_BTREE_LBLOCK_LEN + 48)
+#define XFS_BTREE_SBLOCK_CRC_LEN \
+       (offsetof(struct xfs_btree_block, bb_u) + \
+        sizeof(struct xfs_btree_block_shdr))
+#define XFS_BTREE_LBLOCK_CRC_LEN \
+       (offsetof(struct xfs_btree_block, bb_u) + \
+        sizeof(struct xfs_btree_block_lhdr))
 
 #define XFS_BTREE_SBLOCK_CRC_OFF \
        offsetof(struct xfs_btree_block, bb_u.s.bb_crc)
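The switch from hard-coded lengths to offsetof() arithmetic is easy to verify with a stand-alone mock-up of the same layout. The structs below drop the endianness annotations and replace uuid_t with a byte array, so they are an assumption about how the on-disk fields pack rather than the real definitions; on common ABIs the program prints 16, 24, 56 and 72, matching the constants being replaced (16 + 40 and 24 + 48 for the CRC variants).

#include <stdio.h>
#include <stddef.h>
#include <stdint.h>

/* mock-up of the block headers above, endian annotations dropped */
struct btree_block_shdr {
        uint32_t        bb_leftsib;
        uint32_t        bb_rightsib;
        uint64_t        bb_blkno;
        uint64_t        bb_lsn;
        uint8_t         bb_uuid[16];
        uint32_t        bb_owner;
        uint32_t        bb_crc;
};

struct btree_block_lhdr {
        uint64_t        bb_leftsib;
        uint64_t        bb_rightsib;
        uint64_t        bb_blkno;
        uint64_t        bb_lsn;
        uint8_t         bb_uuid[16];
        uint64_t        bb_owner;
        uint32_t        bb_crc;
        uint32_t        bb_pad;
};

struct btree_block {
        uint32_t        bb_magic;
        uint16_t        bb_level;
        uint16_t        bb_numrecs;
        union {
                struct btree_block_shdr s;
                struct btree_block_lhdr l;
        } bb_u;
};

int main(void)
{
        /* non-CRC lengths stop at bb_blkno, CRC lengths cover the whole header */
        printf("SBLOCK_LEN=%zu LBLOCK_LEN=%zu\n",
               offsetof(struct btree_block, bb_u) +
                       offsetof(struct btree_block_shdr, bb_blkno),
               offsetof(struct btree_block, bb_u) +
                       offsetof(struct btree_block_lhdr, bb_blkno));
        printf("SBLOCK_CRC_LEN=%zu LBLOCK_CRC_LEN=%zu\n",
               offsetof(struct btree_block, bb_u) +
                       sizeof(struct btree_block_shdr),
               offsetof(struct btree_block, bb_u) +
                       sizeof(struct btree_block_lhdr));
        return 0;
}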
index fffe3d0..f5ec9c5 100644 (file)
@@ -521,12 +521,8 @@ typedef struct xfs_swapext
 #define XFS_IOC_ERROR_CLEARALL      _IOW ('X', 117, struct xfs_error_injection)
 /*     XFS_IOC_ATTRCTL_BY_HANDLE -- deprecated 118      */
 
-/*     XFS_IOC_FREEZE            -- FIFREEZE   119      */
-/*     XFS_IOC_THAW              -- FITHAW     120      */
-#ifndef FIFREEZE
-#define XFS_IOC_FREEZE              _IOWR('X', 119, int)
-#define XFS_IOC_THAW                _IOWR('X', 120, int)
-#endif
+#define XFS_IOC_FREEZE              _IOWR('X', 119, int)       /* aka FIFREEZE */
+#define XFS_IOC_THAW                _IOWR('X', 120, int)       /* aka FITHAW */
 
 #define XFS_IOC_FSSETDM_BY_HANDLE    _IOW ('X', 121, struct xfs_fsop_setdm_handlereq)
 #define XFS_IOC_ATTRLIST_BY_HANDLE   _IOW ('X', 122, struct xfs_fsop_attrlist_handlereq)
index 22297f9..4b1e408 100644 (file)
@@ -1828,9 +1828,8 @@ xfs_difree_inode_chunk(
 
        if (!xfs_inobt_issparse(rec->ir_holemask)) {
                /* not sparse, calculate extent info directly */
-               xfs_bmap_add_free(XFS_AGB_TO_FSB(mp, agno,
-                                 XFS_AGINO_TO_AGBNO(mp, rec->ir_startino)),
-                                 mp->m_ialloc_blks, flist, mp);
+               xfs_bmap_add_free(mp, flist, XFS_AGB_TO_FSB(mp, agno, sagbno),
+                                 mp->m_ialloc_blks);
                return;
        }
 
@@ -1873,8 +1872,8 @@ xfs_difree_inode_chunk(
 
                ASSERT(agbno % mp->m_sb.sb_spino_align == 0);
                ASSERT(contigblk % mp->m_sb.sb_spino_align == 0);
-               xfs_bmap_add_free(XFS_AGB_TO_FSB(mp, agno, agbno), contigblk,
-                                 flist, mp);
+               xfs_bmap_add_free(mp, flist, XFS_AGB_TO_FSB(mp, agno, agbno),
+                                 contigblk);
 
                /* reset range to current bit and carry on... */
                startidx = endidx = nextbit;
@@ -2395,20 +2394,11 @@ void
 xfs_ialloc_compute_maxlevels(
        xfs_mount_t     *mp)            /* file system mount structure */
 {
-       int             level;
-       uint            maxblocks;
-       uint            maxleafents;
-       int             minleafrecs;
-       int             minnoderecs;
-
-       maxleafents = (1LL << XFS_INO_AGINO_BITS(mp)) >>
-               XFS_INODES_PER_CHUNK_LOG;
-       minleafrecs = mp->m_inobt_mnr[0];
-       minnoderecs = mp->m_inobt_mnr[1];
-       maxblocks = (maxleafents + minleafrecs - 1) / minleafrecs;
-       for (level = 1; maxblocks > 1; level++)
-               maxblocks = (maxblocks + minnoderecs - 1) / minnoderecs;
-       mp->m_in_maxlevels = level;
+       uint            inodes;
+
+       inodes = (1LL << XFS_INO_AGINO_BITS(mp)) >> XFS_INODES_PER_CHUNK_LOG;
+       mp->m_in_maxlevels = xfs_btree_compute_maxlevels(mp, mp->m_inobt_mnr,
+                                                        inodes);
 }
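xfs_btree_compute_maxlevels() itself is not part of this hunk, so the sketch below simply re-derives the open-coded loop being removed; its signature and the sample fanout numbers are assumptions. It shows how the worst-case tree height follows from the minimum records per leaf and per node block.

#include <stdio.h>

/*
 * Sketch of the generic helper this hunk switches to; the body reproduces
 * the open-coded loop removed above, so the signature of the real
 * xfs_btree_compute_maxlevels() is an assumption here.
 */
static unsigned int compute_maxlevels(const unsigned int *minrecs,
                                      unsigned long long leaf_entries)
{
        unsigned long long      maxblocks;
        unsigned int            level;

        /* leaf level: blocks needed if every leaf holds the minimum */
        maxblocks = (leaf_entries + minrecs[0] - 1) / minrecs[0];
        /* node levels: keep dividing by the minimum node fanout */
        for (level = 1; maxblocks > 1; level++)
                maxblocks = (maxblocks + minrecs[1] - 1) / minrecs[1];
        return level;
}

int main(void)
{
        /* hypothetical minimum records per inobt leaf and node block */
        unsigned int minrecs[2] = { 30, 60 };

        printf("max inobt levels for 2^22 inode records: %u\n",
               compute_maxlevels(minrecs, 1ULL << 22));
        return 0;
}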
 
 /*
index 951c044..e2e1106 100644 (file)
@@ -70,7 +70,7 @@ const struct xfs_buf_ops xfs_rtbuf_ops = {
  * Get a buffer for the bitmap or summary file block specified.
  * The buffer is returned read and locked.
  */
-int
+static int
 xfs_rtbuf_get(
        xfs_mount_t     *mp,            /* file system mount structure */
        xfs_trans_t     *tp,            /* transaction pointer */
index 87d2b21..7575cfc 100644 (file)
@@ -87,6 +87,12 @@ xfs_find_bdev_for_inode(
  * We're now finished for good with this page.  Update the page state via the
  * associated buffer_heads, paying attention to the start and end offsets that
  * we need to process on the page.
+ *
+ * Landmine Warning: bh->b_end_io() will call end_page_writeback() on the last
+ * buffer in the IO. Once it does this, it is unsafe to access the bufferhead or
+ * the page at all, as we may be racing with memory reclaim and it can free both
+ * the bufferhead chain and the page as it will see the page as clean and
+ * unused.
  */
 static void
 xfs_finish_page_writeback(
@@ -95,8 +101,9 @@ xfs_finish_page_writeback(
        int                     error)
 {
        unsigned int            end = bvec->bv_offset + bvec->bv_len - 1;
-       struct buffer_head      *head, *bh;
+       struct buffer_head      *head, *bh, *next;
        unsigned int            off = 0;
+       unsigned int            bsize;
 
        ASSERT(bvec->bv_offset < PAGE_SIZE);
        ASSERT((bvec->bv_offset & ((1 << inode->i_blkbits) - 1)) == 0);
@@ -105,15 +112,17 @@ xfs_finish_page_writeback(
 
        bh = head = page_buffers(bvec->bv_page);
 
+       bsize = bh->b_size;
        do {
+               next = bh->b_this_page;
                if (off < bvec->bv_offset)
                        goto next_bh;
                if (off > end)
                        break;
                bh->b_end_io(bh, !error);
 next_bh:
-               off += bh->b_size;
-       } while ((bh = bh->b_this_page) != head);
+               off += bsize;
+       } while ((bh = next) != head);
 }
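The iteration pattern the landmine comment requires, caching the next pointer and the buffer size before invoking the completion handler, can be shown with a small user-space ring. The struct and helper below are stand-ins for buffer_heads and b_end_io(), not kernel code.

#include <stdio.h>
#include <stdlib.h>

/* minimal stand-in for a circular buffer_head ring */
struct bh {
        struct bh       *b_this_page;   /* circular next pointer */
        unsigned int    b_size;
        int             id;
};

/* stand-in for bh->b_end_io(): completing the buffer may free it */
static void end_io(struct bh *bh)
{
        printf("completed bh %d\n", bh->id);
        free(bh);
}

static void finish_ring(struct bh *head, unsigned int start, unsigned int end)
{
        unsigned int    bsize = head->b_size;   /* cache before any callback */
        unsigned int    off = 0;
        struct bh       *bh = head;
        struct bh       *next;

        do {
                next = bh->b_this_page;         /* cache before bh can vanish */
                if (off >= start && off <= end)
                        end_io(bh);
                off += bsize;
                /* the stale head pointer is only compared, never dereferenced */
        } while ((bh = next) != head);
}

int main(void)
{
        struct bh       *nodes[4];
        int             i;

        for (i = 0; i < 4; i++) {
                nodes[i] = malloc(sizeof(struct bh));
                nodes[i]->b_size = 512;
                nodes[i]->id = i;
        }
        for (i = 0; i < 4; i++)
                nodes[i]->b_this_page = nodes[(i + 1) % 4];

        /* complete the whole ring; without the cached next pointer the loop
         * would dereference freed memory after the first callback */
        finish_ring(nodes[0], 0, 4 * 512);
        return 0;
}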
 
 /*
@@ -1041,6 +1050,20 @@ xfs_vm_releasepage(
 
        trace_xfs_releasepage(page->mapping->host, page, 0, 0);
 
+       /*
+        * mm accommodates an old ext3 case where clean pages might not have had
+        * the dirty bit cleared. Thus, it can send actual dirty pages to
+        * ->releasepage() via shrink_active_list(). Conversely,
+        * block_invalidatepage() can send pages that are still marked dirty
+        * but otherwise have invalidated buffers.
+        *
+        * We've historically freed buffers on the latter. Instead, quietly
+        * filter out all dirty pages to avoid spurious buffer state warnings.
+        * This can likely be removed once shrink_active_list() is fixed.
+        */
+       if (PageDirty(page))
+               return 0;
+
        xfs_count_page_state(page, &delalloc, &unwritten);
 
        if (WARN_ON_ONCE(delalloc))
@@ -1144,6 +1167,8 @@ __xfs_get_blocks(
        ssize_t                 size;
        int                     new = 0;
 
+       BUG_ON(create && !direct);
+
        if (XFS_FORCED_SHUTDOWN(mp))
                return -EIO;
 
@@ -1151,22 +1176,14 @@ __xfs_get_blocks(
        ASSERT(bh_result->b_size >= (1 << inode->i_blkbits));
        size = bh_result->b_size;
 
-       if (!create && direct && offset >= i_size_read(inode))
+       if (!create && offset >= i_size_read(inode))
                return 0;
 
        /*
         * Direct I/O is usually done on preallocated files, so try getting
-        * a block mapping without an exclusive lock first.  For buffered
-        * writes we already have the exclusive iolock anyway, so avoiding
-        * a lock roundtrip here by taking the ilock exclusive from the
-        * beginning is a useful micro optimization.
+        * a block mapping without an exclusive lock first.
         */
-       if (create && !direct) {
-               lockmode = XFS_ILOCK_EXCL;
-               xfs_ilock(ip, lockmode);
-       } else {
-               lockmode = xfs_ilock_data_map_shared(ip);
-       }
+       lockmode = xfs_ilock_data_map_shared(ip);
 
        ASSERT(offset <= mp->m_super->s_maxbytes);
        if (offset + size > mp->m_super->s_maxbytes)
@@ -1185,37 +1202,19 @@ __xfs_get_blocks(
             (imap.br_startblock == HOLESTARTBLOCK ||
              imap.br_startblock == DELAYSTARTBLOCK) ||
             (IS_DAX(inode) && ISUNWRITTEN(&imap)))) {
-               if (direct || xfs_get_extsz_hint(ip)) {
-                       /*
-                        * xfs_iomap_write_direct() expects the shared lock. It
-                        * is unlocked on return.
-                        */
-                       if (lockmode == XFS_ILOCK_EXCL)
-                               xfs_ilock_demote(ip, lockmode);
-
-                       error = xfs_iomap_write_direct(ip, offset, size,
-                                                      &imap, nimaps);
-                       if (error)
-                               return error;
-                       new = 1;
+               /*
+                * xfs_iomap_write_direct() expects the shared lock. It
+                * is unlocked on return.
+                */
+               if (lockmode == XFS_ILOCK_EXCL)
+                       xfs_ilock_demote(ip, lockmode);
 
-               } else {
-                       /*
-                        * Delalloc reservations do not require a transaction,
-                        * we can go on without dropping the lock here. If we
-                        * are allocating a new delalloc block, make sure that
-                        * we set the new flag so that we mark the buffer new so
-                        * that we know that it is newly allocated if the write
-                        * fails.
-                        */
-                       if (nimaps && imap.br_startblock == HOLESTARTBLOCK)
-                               new = 1;
-                       error = xfs_iomap_write_delay(ip, offset, size, &imap);
-                       if (error)
-                               goto out_unlock;
+               error = xfs_iomap_write_direct(ip, offset, size,
+                                              &imap, nimaps);
+               if (error)
+                       return error;
+               new = 1;
 
-                       xfs_iunlock(ip, lockmode);
-               }
                trace_xfs_get_blocks_alloc(ip, offset, size,
                                ISUNWRITTEN(&imap) ? XFS_IO_UNWRITTEN
                                                   : XFS_IO_DELALLOC, &imap);
@@ -1236,9 +1235,7 @@ __xfs_get_blocks(
        }
 
        /* trim mapping down to size requested */
-       if (direct || size > (1 << inode->i_blkbits))
-               xfs_map_trim_size(inode, iblock, bh_result,
-                                 &imap, offset, size);
+       xfs_map_trim_size(inode, iblock, bh_result, &imap, offset, size);
 
        /*
         * For unwritten extents do not report a disk address in the buffered
@@ -1251,7 +1248,7 @@ __xfs_get_blocks(
                if (ISUNWRITTEN(&imap))
                        set_buffer_unwritten(bh_result);
                /* direct IO needs special help */
-               if (create && direct) {
+               if (create) {
                        if (dax_fault)
                                ASSERT(!ISUNWRITTEN(&imap));
                        else
@@ -1280,14 +1277,7 @@ __xfs_get_blocks(
             (new || ISUNWRITTEN(&imap))))
                set_buffer_new(bh_result);
 
-       if (imap.br_startblock == DELAYSTARTBLOCK) {
-               BUG_ON(direct);
-               if (create) {
-                       set_buffer_uptodate(bh_result);
-                       set_buffer_mapped(bh_result);
-                       set_buffer_delay(bh_result);
-               }
-       }
+       BUG_ON(direct && imap.br_startblock == DELAYSTARTBLOCK);
 
        return 0;
 
@@ -1337,7 +1327,7 @@ xfs_get_blocks_dax_fault(
  * whereas if we have flags set we will always be called in task context
  * (i.e. from a workqueue).
  */
-STATIC int
+int
 xfs_end_io_direct_write(
        struct kiocb            *iocb,
        loff_t                  offset,
@@ -1408,234 +1398,10 @@ xfs_vm_direct_IO(
        struct kiocb            *iocb,
        struct iov_iter         *iter)
 {
-       struct inode            *inode = iocb->ki_filp->f_mapping->host;
-       dio_iodone_t            *endio = NULL;
-       int                     flags = 0;
-       struct block_device     *bdev;
-
-       if (iov_iter_rw(iter) == WRITE) {
-               endio = xfs_end_io_direct_write;
-               flags = DIO_ASYNC_EXTEND;
-       }
-
-       if (IS_DAX(inode)) {
-               return dax_do_io(iocb, inode, iter,
-                                xfs_get_blocks_direct, endio, 0);
-       }
-
-       bdev = xfs_find_bdev_for_inode(inode);
-       return  __blockdev_direct_IO(iocb, inode, bdev, iter,
-                       xfs_get_blocks_direct, endio, NULL, flags);
-}
-
-/*
- * Punch out the delalloc blocks we have already allocated.
- *
- * Don't bother with xfs_setattr given that nothing can have made it to disk yet
- * as the page is still locked at this point.
- */
-STATIC void
-xfs_vm_kill_delalloc_range(
-       struct inode            *inode,
-       loff_t                  start,
-       loff_t                  end)
-{
-       struct xfs_inode        *ip = XFS_I(inode);
-       xfs_fileoff_t           start_fsb;
-       xfs_fileoff_t           end_fsb;
-       int                     error;
-
-       start_fsb = XFS_B_TO_FSB(ip->i_mount, start);
-       end_fsb = XFS_B_TO_FSB(ip->i_mount, end);
-       if (end_fsb <= start_fsb)
-               return;
-
-       xfs_ilock(ip, XFS_ILOCK_EXCL);
-       error = xfs_bmap_punch_delalloc_range(ip, start_fsb,
-                                               end_fsb - start_fsb);
-       if (error) {
-               /* something screwed, just bail */
-               if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
-                       xfs_alert(ip->i_mount,
-               "xfs_vm_write_failed: unable to clean up ino %lld",
-                                       ip->i_ino);
-               }
-       }
-       xfs_iunlock(ip, XFS_ILOCK_EXCL);
-}
-
-STATIC void
-xfs_vm_write_failed(
-       struct inode            *inode,
-       struct page             *page,
-       loff_t                  pos,
-       unsigned                len)
-{
-       loff_t                  block_offset;
-       loff_t                  block_start;
-       loff_t                  block_end;
-       loff_t                  from = pos & (PAGE_SIZE - 1);
-       loff_t                  to = from + len;
-       struct buffer_head      *bh, *head;
-       struct xfs_mount        *mp = XFS_I(inode)->i_mount;
-
        /*
-        * The request pos offset might be 32 or 64 bit, this is all fine
-        * on 64-bit platform.  However, for 64-bit pos request on 32-bit
-        * platform, the high 32-bit will be masked off if we evaluate the
-        * block_offset via (pos & PAGE_MASK) because the PAGE_MASK is
-        * 0xfffff000 as an unsigned long, hence the result is incorrect
-        * which could cause the following ASSERT failed in most cases.
-        * In order to avoid this, we can evaluate the block_offset of the
-        * start of the page by using shifts rather than masks the mismatch
-        * problem.
+        * We just need the method present so that open/fcntl allow direct I/O.
         */
-       block_offset = (pos >> PAGE_SHIFT) << PAGE_SHIFT;
-
-       ASSERT(block_offset + from == pos);
-
-       head = page_buffers(page);
-       block_start = 0;
-       for (bh = head; bh != head || !block_start;
-            bh = bh->b_this_page, block_start = block_end,
-                                  block_offset += bh->b_size) {
-               block_end = block_start + bh->b_size;
-
-               /* skip buffers before the write */
-               if (block_end <= from)
-                       continue;
-
-               /* if the buffer is after the write, we're done */
-               if (block_start >= to)
-                       break;
-
-               /*
-                * Process delalloc and unwritten buffers beyond EOF. We can
-                * encounter unwritten buffers in the event that a file has
-                * post-EOF unwritten extents and an extending write happens to
-                * fail (e.g., an unaligned write that also involves a delalloc
-                * to the same page).
-                */
-               if (!buffer_delay(bh) && !buffer_unwritten(bh))
-                       continue;
-
-               if (!xfs_mp_fail_writes(mp) && !buffer_new(bh) &&
-                   block_offset < i_size_read(inode))
-                       continue;
-
-               if (buffer_delay(bh))
-                       xfs_vm_kill_delalloc_range(inode, block_offset,
-                                                  block_offset + bh->b_size);
-
-               /*
-                * This buffer does not contain data anymore. make sure anyone
-                * who finds it knows that for certain.
-                */
-               clear_buffer_delay(bh);
-               clear_buffer_uptodate(bh);
-               clear_buffer_mapped(bh);
-               clear_buffer_new(bh);
-               clear_buffer_dirty(bh);
-               clear_buffer_unwritten(bh);
-       }
-
-}
-
-/*
- * This used to call block_write_begin(), but it unlocks and releases the page
- * on error, and we need that page to be able to punch stale delalloc blocks out
- * on failure. hence we copy-n-waste it here and call xfs_vm_write_failed() at
- * the appropriate point.
- */
-STATIC int
-xfs_vm_write_begin(
-       struct file             *file,
-       struct address_space    *mapping,
-       loff_t                  pos,
-       unsigned                len,
-       unsigned                flags,
-       struct page             **pagep,
-       void                    **fsdata)
-{
-       pgoff_t                 index = pos >> PAGE_SHIFT;
-       struct page             *page;
-       int                     status;
-       struct xfs_mount        *mp = XFS_I(mapping->host)->i_mount;
-
-       ASSERT(len <= PAGE_SIZE);
-
-       page = grab_cache_page_write_begin(mapping, index, flags);
-       if (!page)
-               return -ENOMEM;
-
-       status = __block_write_begin(page, pos, len, xfs_get_blocks);
-       if (xfs_mp_fail_writes(mp))
-               status = -EIO;
-       if (unlikely(status)) {
-               struct inode    *inode = mapping->host;
-               size_t          isize = i_size_read(inode);
-
-               xfs_vm_write_failed(inode, page, pos, len);
-               unlock_page(page);
-
-               /*
-                * If the write is beyond EOF, we only want to kill blocks
-                * allocated in this write, not blocks that were previously
-                * written successfully.
-                */
-               if (xfs_mp_fail_writes(mp))
-                       isize = 0;
-               if (pos + len > isize) {
-                       ssize_t start = max_t(ssize_t, pos, isize);
-
-                       truncate_pagecache_range(inode, start, pos + len);
-               }
-
-               put_page(page);
-               page = NULL;
-       }
-
-       *pagep = page;
-       return status;
-}
-
-/*
- * On failure, we only need to kill delalloc blocks beyond EOF in the range of
- * this specific write because they will never be written. Previous writes
- * beyond EOF where block allocation succeeded do not need to be trashed, so
- * only new blocks from this write should be trashed. For blocks within
- * EOF, generic_write_end() zeros them so they are safe to leave alone and be
- * written with all the other valid data.
- */
-STATIC int
-xfs_vm_write_end(
-       struct file             *file,
-       struct address_space    *mapping,
-       loff_t                  pos,
-       unsigned                len,
-       unsigned                copied,
-       struct page             *page,
-       void                    *fsdata)
-{
-       int                     ret;
-
-       ASSERT(len <= PAGE_SIZE);
-
-       ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata);
-       if (unlikely(ret < len)) {
-               struct inode    *inode = mapping->host;
-               size_t          isize = i_size_read(inode);
-               loff_t          to = pos + len;
-
-               if (to > isize) {
-                       /* only kill blocks in this write beyond EOF */
-                       if (pos > isize)
-                               isize = pos;
-                       xfs_vm_kill_delalloc_range(inode, isize, to);
-                       truncate_pagecache_range(inode, isize, to);
-               }
-       }
-       return ret;
+       return -EINVAL;
 }
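From user space, the practical effect of keeping a stub ->direct_IO method is whether an O_DIRECT open succeeds at all: the generic open path rejects O_DIRECT with EINVAL when the underlying mapping offers no direct_IO method. A minimal check is below; the path is an arbitrary example and the program simply reports whichever outcome it sees.

#define _GNU_SOURCE
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
        /* example path; point it at a file on the filesystem of interest */
        int fd = open("/tmp/dio-test", O_CREAT | O_WRONLY | O_DIRECT, 0644);

        if (fd < 0) {
                printf("O_DIRECT open failed: %s\n", strerror(errno));
                return 1;
        }
        printf("O_DIRECT open succeeded\n");
        close(fd);
        return 0;
}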
 
 STATIC sector_t
@@ -1748,8 +1514,6 @@ const struct address_space_operations xfs_address_space_operations = {
        .set_page_dirty         = xfs_vm_set_page_dirty,
        .releasepage            = xfs_vm_releasepage,
        .invalidatepage         = xfs_vm_invalidatepage,
-       .write_begin            = xfs_vm_write_begin,
-       .write_end              = xfs_vm_write_end,
        .bmap                   = xfs_vm_bmap,
        .direct_IO              = xfs_vm_direct_IO,
        .migratepage            = buffer_migrate_page,
index 814aab7..bf2d9a1 100644 (file)
@@ -60,6 +60,9 @@ int   xfs_get_blocks_direct(struct inode *inode, sector_t offset,
 int    xfs_get_blocks_dax_fault(struct inode *inode, sector_t offset,
                                 struct buffer_head *map_bh, int create);
 
+int    xfs_end_io_direct_write(struct kiocb *iocb, loff_t offset,
+               ssize_t size, void *private);
+
 extern void xfs_count_page_state(struct page *, int *, int *);
 extern struct block_device *xfs_find_bdev_for_inode(struct inode *);
 
index 55d2149..be0b79d 100644 (file)
@@ -322,7 +322,7 @@ xfs_attr3_node_inactive(
  * Recurse (gasp!) through the attribute nodes until we find leaves.
  * We're doing a depth-first traversal in order to invalidate everything.
  */
-int
+static int
 xfs_attr3_root_inactive(
        struct xfs_trans        **trans,
        struct xfs_inode        *dp)
index d25f26b..25e76cd 100644 (file)
@@ -65,7 +65,7 @@ xfs_attr_shortform_compare(const void *a, const void *b)
  * we have to calculate each entries' hashvalue and sort them before
  * we can begin returning them to the user.
  */
-int
+static int
 xfs_attr_shortform_list(xfs_attr_list_context_t *context)
 {
        attrlist_cursor_kern_t *cursor;
index 586bb64..cd4a850 100644 (file)
@@ -79,6 +79,23 @@ xfs_zero_extent(
                GFP_NOFS, true);
 }
 
+/* Sort bmap items by AG. */
+static int
+xfs_bmap_free_list_cmp(
+       void                    *priv,
+       struct list_head        *a,
+       struct list_head        *b)
+{
+       struct xfs_mount        *mp = priv;
+       struct xfs_bmap_free_item       *ra;
+       struct xfs_bmap_free_item       *rb;
+
+       ra = container_of(a, struct xfs_bmap_free_item, xbfi_list);
+       rb = container_of(b, struct xfs_bmap_free_item, xbfi_list);
+       return  XFS_FSB_TO_AGNO(mp, ra->xbfi_startblock) -
+               XFS_FSB_TO_AGNO(mp, rb->xbfi_startblock);
+}
+
 /*
  * Routine to be called at transaction's end by xfs_bmapi, xfs_bunmapi
  * caller.  Frees all the extents that need freeing, which must be done
@@ -99,14 +116,15 @@ xfs_bmap_finish(
        int                             error;  /* error return value */
        int                             committed;/* xact committed or not */
        struct xfs_bmap_free_item       *free;  /* free extent item */
-       struct xfs_bmap_free_item       *next;  /* next item on free list */
 
        ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES);
        if (flist->xbf_count == 0)
                return 0;
 
+       list_sort((*tp)->t_mountp, &flist->xbf_flist, xfs_bmap_free_list_cmp);
+
        efi = xfs_trans_get_efi(*tp, flist->xbf_count);
-       for (free = flist->xbf_first; free; free = free->xbfi_next)
+       list_for_each_entry(free, &flist->xbf_flist, xbfi_list)
                xfs_trans_log_efi_extent(*tp, efi, free->xbfi_startblock,
                        free->xbfi_blockcount);
 
@@ -125,9 +143,7 @@ xfs_bmap_finish(
                if (committed) {
                        xfs_efi_release(efi);
                        xfs_force_shutdown((*tp)->t_mountp,
-                               (error == -EFSCORRUPTED) ?
-                                       SHUTDOWN_CORRUPT_INCORE :
-                                       SHUTDOWN_META_IO_ERROR);
+                                          SHUTDOWN_META_IO_ERROR);
                }
                return error;
        }
@@ -138,15 +154,15 @@ xfs_bmap_finish(
         * on error.
         */
        efd = xfs_trans_get_efd(*tp, efi, flist->xbf_count);
-       for (free = flist->xbf_first; free != NULL; free = next) {
-               next = free->xbfi_next;
-
+       while (!list_empty(&flist->xbf_flist)) {
+               free = list_first_entry(&flist->xbf_flist,
+                               struct xfs_bmap_free_item, xbfi_list);
                error = xfs_trans_free_extent(*tp, efd, free->xbfi_startblock,
                                              free->xbfi_blockcount);
                if (error)
                        return error;
 
-               xfs_bmap_del_free(flist, NULL, free);
+               xfs_bmap_del_free(flist, free);
        }
 
        return 0;
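The list_sort() call added above orders the pending frees by allocation group before they are logged and processed, presumably so AG buffers are always taken in ascending order. A user-space analogue using qsort() is sketched below; the agblklog shift used to derive the AG number is an assumption standing in for XFS_FSB_TO_AGNO().

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

struct free_item {
        uint64_t        startblock;     /* filesystem block number */
        uint32_t        blockcount;
};

static const unsigned int agblklog = 20;        /* hypothetical 2^20 blocks/AG */

/* same ordering as the comparison callback above: ascending AG number */
static int free_item_cmp(const void *a, const void *b)
{
        uint64_t agno_a = ((const struct free_item *)a)->startblock >> agblklog;
        uint64_t agno_b = ((const struct free_item *)b)->startblock >> agblklog;

        return (agno_a > agno_b) - (agno_a < agno_b);
}

int main(void)
{
        struct free_item items[] = {
                { 3ULL << 20, 8 },      /* AG 3 */
                { 0ULL << 20, 4 },      /* AG 0 */
                { 2ULL << 20, 16 },     /* AG 2 */
        };
        size_t i, n = sizeof(items) / sizeof(items[0]);

        qsort(items, n, sizeof(items[0]), free_item_cmp);
        for (i = 0; i < n; i++)
                printf("free %u blocks in AG %llu\n",
                       (unsigned)items[i].blockcount,
                       (unsigned long long)(items[i].startblock >> agblklog));
        return 0;
}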
@@ -409,7 +425,7 @@ xfs_bmap_count_tree(
 /*
  * Count fsblocks of the given fork.
  */
-int                                            /* error */
+static int                                     /* error */
 xfs_bmap_count_blocks(
        xfs_trans_t             *tp,            /* transaction pointer */
        xfs_inode_t             *ip,            /* incore inode */
@@ -799,7 +815,7 @@ xfs_bmap_punch_delalloc_range(
                if (error)
                        break;
 
-               ASSERT(!flist.xbf_count && !flist.xbf_first);
+               ASSERT(!flist.xbf_count && list_empty(&flist.xbf_flist));
 next_block:
                start_fsb++;
                remaining--;
@@ -1089,99 +1105,120 @@ error1:        /* Just cancel transaction */
        return error;
 }
 
-/*
- * Zero file bytes between startoff and endoff inclusive.
- * The iolock is held exclusive and no blocks are buffered.
- *
- * This function is used by xfs_free_file_space() to zero
- * partial blocks when the range to free is not block aligned.
- * When unreserving space with boundaries that are not block
- * aligned we round up the start and round down the end
- * boundaries and then use this function to zero the parts of
- * the blocks that got dropped during the rounding.
- */
-STATIC int
-xfs_zero_remaining_bytes(
-       xfs_inode_t             *ip,
-       xfs_off_t               startoff,
-       xfs_off_t               endoff)
+static int
+xfs_unmap_extent(
+       struct xfs_inode        *ip,
+       xfs_fileoff_t           startoffset_fsb,
+       xfs_filblks_t           len_fsb,
+       int                     *done)
 {
-       xfs_bmbt_irec_t         imap;
-       xfs_fileoff_t           offset_fsb;
-       xfs_off_t               lastoffset;
-       xfs_off_t               offset;
-       xfs_buf_t               *bp;
-       xfs_mount_t             *mp = ip->i_mount;
-       int                     nimap;
-       int                     error = 0;
+       struct xfs_mount        *mp = ip->i_mount;
+       struct xfs_trans        *tp;
+       struct xfs_bmap_free    free_list;
+       xfs_fsblock_t           firstfsb;
+       uint                    resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
+       int                     error;
 
-       /*
-        * Avoid doing I/O beyond eof - it's not necessary
-        * since nothing can read beyond eof.  The space will
-        * be zeroed when the file is extended anyway.
-        */
-       if (startoff >= XFS_ISIZE(ip))
-               return 0;
+       error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, &tp);
+       if (error) {
+               ASSERT(error == -ENOSPC || XFS_FORCED_SHUTDOWN(mp));
+               return error;
+       }
 
-       if (endoff > XFS_ISIZE(ip))
-               endoff = XFS_ISIZE(ip);
+       xfs_ilock(ip, XFS_ILOCK_EXCL);
+       error = xfs_trans_reserve_quota(tp, mp, ip->i_udquot, ip->i_gdquot,
+                       ip->i_pdquot, resblks, 0, XFS_QMOPT_RES_REGBLKS);
+       if (error)
+               goto out_trans_cancel;
 
-       for (offset = startoff; offset <= endoff; offset = lastoffset + 1) {
-               uint lock_mode;
+       xfs_trans_ijoin(tp, ip, 0);
 
-               offset_fsb = XFS_B_TO_FSBT(mp, offset);
-               nimap = 1;
+       xfs_bmap_init(&free_list, &firstfsb);
+       error = xfs_bunmapi(tp, ip, startoffset_fsb, len_fsb, 0, 2, &firstfsb,
+                       &free_list, done);
+       if (error)
+               goto out_bmap_cancel;
 
-               lock_mode = xfs_ilock_data_map_shared(ip);
-               error = xfs_bmapi_read(ip, offset_fsb, 1, &imap, &nimap, 0);
-               xfs_iunlock(ip, lock_mode);
+       error = xfs_bmap_finish(&tp, &free_list, NULL);
+       if (error)
+               goto out_bmap_cancel;
 
-               if (error || nimap < 1)
-                       break;
-               ASSERT(imap.br_blockcount >= 1);
-               ASSERT(imap.br_startoff == offset_fsb);
-               ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
+       error = xfs_trans_commit(tp);
+out_unlock:
+       xfs_iunlock(ip, XFS_ILOCK_EXCL);
+       return error;
 
-               if (imap.br_startblock == HOLESTARTBLOCK ||
-                   imap.br_state == XFS_EXT_UNWRITTEN) {
-                       /* skip the entire extent */
-                       lastoffset = XFS_FSB_TO_B(mp, imap.br_startoff +
-                                                     imap.br_blockcount) - 1;
-                       continue;
-               }
+out_bmap_cancel:
+       xfs_bmap_cancel(&free_list);
+out_trans_cancel:
+       xfs_trans_cancel(tp);
+       goto out_unlock;
+}
 
-               lastoffset = XFS_FSB_TO_B(mp, imap.br_startoff + 1) - 1;
-               if (lastoffset > endoff)
-                       lastoffset = endoff;
+static int
+xfs_adjust_extent_unmap_boundaries(
+       struct xfs_inode        *ip,
+       xfs_fileoff_t           *startoffset_fsb,
+       xfs_fileoff_t           *endoffset_fsb)
+{
+       struct xfs_mount        *mp = ip->i_mount;
+       struct xfs_bmbt_irec    imap;
+       int                     nimap, error;
+       xfs_extlen_t            mod = 0;
 
-               /* DAX can just zero the backing device directly */
-               if (IS_DAX(VFS_I(ip))) {
-                       error = dax_zero_page_range(VFS_I(ip), offset,
-                                                   lastoffset - offset + 1,
-                                                   xfs_get_blocks_direct);
-                       if (error)
-                               return error;
-                       continue;
-               }
+       nimap = 1;
+       error = xfs_bmapi_read(ip, *startoffset_fsb, 1, &imap, &nimap, 0);
+       if (error)
+               return error;
 
-               error = xfs_buf_read_uncached(XFS_IS_REALTIME_INODE(ip) ?
-                               mp->m_rtdev_targp : mp->m_ddev_targp,
-                               xfs_fsb_to_db(ip, imap.br_startblock),
-                               BTOBB(mp->m_sb.sb_blocksize),
-                               0, &bp, NULL);
-               if (error)
-                       return error;
+       if (nimap && imap.br_startblock != HOLESTARTBLOCK) {
+               xfs_daddr_t     block;
 
-               memset(bp->b_addr +
-                               (offset - XFS_FSB_TO_B(mp, imap.br_startoff)),
-                      0, lastoffset - offset + 1);
+               ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
+               block = imap.br_startblock;
+               mod = do_div(block, mp->m_sb.sb_rextsize);
+               if (mod)
+                       *startoffset_fsb += mp->m_sb.sb_rextsize - mod;
+       }
 
-               error = xfs_bwrite(bp);
-               xfs_buf_relse(bp);
-               if (error)
-                       return error;
+       nimap = 1;
+       error = xfs_bmapi_read(ip, *endoffset_fsb - 1, 1, &imap, &nimap, 0);
+       if (error)
+               return error;
+
+       if (nimap && imap.br_startblock != HOLESTARTBLOCK) {
+               ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
+               mod++;
+               if (mod && mod != mp->m_sb.sb_rextsize)
+                       *endoffset_fsb -= mod;
        }
-       return error;
+
+       return 0;
+}
+
+static int
+xfs_flush_unmap_range(
+       struct xfs_inode        *ip,
+       xfs_off_t               offset,
+       xfs_off_t               len)
+{
+       struct xfs_mount        *mp = ip->i_mount;
+       struct inode            *inode = VFS_I(ip);
+       xfs_off_t               rounding, start, end;
+       int                     error;
+
+       /* wait for the completion of any pending DIOs */
+       inode_dio_wait(inode);
+
+       rounding = max_t(xfs_off_t, 1 << mp->m_sb.sb_blocklog, PAGE_SIZE);
+       start = round_down(offset, rounding);
+       end = round_up(offset + len, rounding) - 1;
+
+       error = filemap_write_and_wait_range(inode->i_mapping, start, end);
+       if (error)
+               return error;
+       truncate_pagecache_range(inode, start, end);
+       return 0;
 }
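The rounding in xfs_flush_unmap_range() widens the writeback/invalidate window to the larger of the filesystem block size and the page size, so whole pages and whole blocks are always covered. The arithmetic can be checked in isolation; the block size and offsets below are arbitrary examples.

#include <stdio.h>
#include <stdint.h>

#define DEMO_PAGE_SIZE  4096ULL

/* equivalents of the kernel's round_down()/round_up() for power-of-two sizes */
static uint64_t round_down_pow2(uint64_t x, uint64_t r)
{
        return x & ~(r - 1);
}

static uint64_t round_up_pow2(uint64_t x, uint64_t r)
{
        return (x + r - 1) & ~(r - 1);
}

int main(void)
{
        uint64_t        blocksize = 1024;       /* hypothetical 1k blocks */
        uint64_t        offset = 5000, len = 3000;
        uint64_t        rounding, start, end;

        /* flush and invalidate whole pages and whole blocks */
        rounding = blocksize > DEMO_PAGE_SIZE ? blocksize : DEMO_PAGE_SIZE;
        start = round_down_pow2(offset, rounding);
        end = round_up_pow2(offset + len, rounding) - 1;
        printf("write back and truncate page cache over [%llu, %llu]\n",
               (unsigned long long)start, (unsigned long long)end);
        return 0;
}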
 
 int
@@ -1190,24 +1227,10 @@ xfs_free_file_space(
        xfs_off_t               offset,
        xfs_off_t               len)
 {
-       int                     done;
-       xfs_fileoff_t           endoffset_fsb;
-       int                     error;
-       xfs_fsblock_t           firstfsb;
-       xfs_bmap_free_t         free_list;
-       xfs_bmbt_irec_t         imap;
-       xfs_off_t               ioffset;
-       xfs_off_t               iendoffset;
-       xfs_extlen_t            mod=0;
-       xfs_mount_t             *mp;
-       int                     nimap;
-       uint                    resblks;
-       xfs_off_t               rounding;
-       int                     rt;
+       struct xfs_mount        *mp = ip->i_mount;
        xfs_fileoff_t           startoffset_fsb;
-       xfs_trans_t             *tp;
-
-       mp = ip->i_mount;
+       xfs_fileoff_t           endoffset_fsb;
+       int                     done = 0, error;
 
        trace_xfs_free_file_space(ip);
 
@@ -1215,135 +1238,45 @@ xfs_free_file_space(
        if (error)
                return error;
 
-       error = 0;
        if (len <= 0)   /* if nothing being freed */
-               return error;
-       rt = XFS_IS_REALTIME_INODE(ip);
-       startoffset_fsb = XFS_B_TO_FSB(mp, offset);
-       endoffset_fsb = XFS_B_TO_FSBT(mp, offset + len);
-
-       /* wait for the completion of any pending DIOs */
-       inode_dio_wait(VFS_I(ip));
+               return 0;
 
-       rounding = max_t(xfs_off_t, 1 << mp->m_sb.sb_blocklog, PAGE_SIZE);
-       ioffset = round_down(offset, rounding);
-       iendoffset = round_up(offset + len, rounding) - 1;
-       error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping, ioffset,
-                                            iendoffset);
+       error = xfs_flush_unmap_range(ip, offset, len);
        if (error)
-               goto out;
-       truncate_pagecache_range(VFS_I(ip), ioffset, iendoffset);
+               return error;
+
+       startoffset_fsb = XFS_B_TO_FSB(mp, offset);
+       endoffset_fsb = XFS_B_TO_FSBT(mp, offset + len);
 
        /*
-        * Need to zero the stuff we're not freeing, on disk.
-        * If it's a realtime file & can't use unwritten extents then we
-        * actually need to zero the extent edges.  Otherwise xfs_bunmapi
-        * will take care of it for us.
+        * Need to zero the stuff we're not freeing, on disk.  If it's a
+        * realtime file and we can't use unwritten extents then we actually
+        * need to zero the whole extent, otherwise we just need to take care
+        * of the block boundaries, and xfs_bunmapi will handle the rest.
         */
-       if (rt && !xfs_sb_version_hasextflgbit(&mp->m_sb)) {
-               nimap = 1;
-               error = xfs_bmapi_read(ip, startoffset_fsb, 1,
-                                       &imap, &nimap, 0);
-               if (error)
-                       goto out;
-               ASSERT(nimap == 0 || nimap == 1);
-               if (nimap && imap.br_startblock != HOLESTARTBLOCK) {
-                       xfs_daddr_t     block;
-
-                       ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
-                       block = imap.br_startblock;
-                       mod = do_div(block, mp->m_sb.sb_rextsize);
-                       if (mod)
-                               startoffset_fsb += mp->m_sb.sb_rextsize - mod;
-               }
-               nimap = 1;
-               error = xfs_bmapi_read(ip, endoffset_fsb - 1, 1,
-                                       &imap, &nimap, 0);
+       if (XFS_IS_REALTIME_INODE(ip) &&
+           !xfs_sb_version_hasextflgbit(&mp->m_sb)) {
+               error = xfs_adjust_extent_unmap_boundaries(ip, &startoffset_fsb,
+                               &endoffset_fsb);
                if (error)
-                       goto out;
-               ASSERT(nimap == 0 || nimap == 1);
-               if (nimap && imap.br_startblock != HOLESTARTBLOCK) {
-                       ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
-                       mod++;
-                       if (mod && (mod != mp->m_sb.sb_rextsize))
-                               endoffset_fsb -= mod;
-               }
-       }
-       if ((done = (endoffset_fsb <= startoffset_fsb)))
-               /*
-                * One contiguous piece to clear
-                */
-               error = xfs_zero_remaining_bytes(ip, offset, offset + len - 1);
-       else {
-               /*
-                * Some full blocks, possibly two pieces to clear
-                */
-               if (offset < XFS_FSB_TO_B(mp, startoffset_fsb))
-                       error = xfs_zero_remaining_bytes(ip, offset,
-                               XFS_FSB_TO_B(mp, startoffset_fsb) - 1);
-               if (!error &&
-                   XFS_FSB_TO_B(mp, endoffset_fsb) < offset + len)
-                       error = xfs_zero_remaining_bytes(ip,
-                               XFS_FSB_TO_B(mp, endoffset_fsb),
-                               offset + len - 1);
+                       return error;
        }
 
-       /*
-        * free file space until done or until there is an error
-        */
-       resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
-       while (!error && !done) {
-
-               /*
-                * allocate and setup the transaction. Allow this
-                * transaction to dip into the reserve blocks to ensure
-                * the freeing of the space succeeds at ENOSPC.
-                */
-               error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0,
-                               &tp);
-               if (error) {
-                       ASSERT(error == -ENOSPC || XFS_FORCED_SHUTDOWN(mp));
-                       break;
+       if (endoffset_fsb > startoffset_fsb) {
+               while (!done) {
+                       error = xfs_unmap_extent(ip, startoffset_fsb,
+                                       endoffset_fsb - startoffset_fsb, &done);
+                       if (error)
+                               return error;
                }
-               xfs_ilock(ip, XFS_ILOCK_EXCL);
-               error = xfs_trans_reserve_quota(tp, mp,
-                               ip->i_udquot, ip->i_gdquot, ip->i_pdquot,
-                               resblks, 0, XFS_QMOPT_RES_REGBLKS);
-               if (error)
-                       goto error1;
-
-               xfs_trans_ijoin(tp, ip, 0);
-
-               /*
-                * issue the bunmapi() call to free the blocks
-                */
-               xfs_bmap_init(&free_list, &firstfsb);
-               error = xfs_bunmapi(tp, ip, startoffset_fsb,
-                                 endoffset_fsb - startoffset_fsb,
-                                 0, 2, &firstfsb, &free_list, &done);
-               if (error)
-                       goto error0;
-
-               /*
-                * complete the transaction
-                */
-               error = xfs_bmap_finish(&tp, &free_list, NULL);
-               if (error)
-                       goto error0;
-
-               error = xfs_trans_commit(tp);
-               xfs_iunlock(ip, XFS_ILOCK_EXCL);
        }
 
- out:
-       return error;
-
- error0:
-       xfs_bmap_cancel(&free_list);
- error1:
-       xfs_trans_cancel(tp);
-       xfs_iunlock(ip, XFS_ILOCK_EXCL);
-       goto out;
+       /*
+        * Now that we've unmapped all the full blocks we'll have to zero out any
+        * partial block at the beginning and/or end.  xfs_zero_range is
+        * smart enough to skip any holes, including those we just created.
+        */
+       return xfs_zero_range(ip, offset, len, NULL);
 }
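After the unmap loop, only the sub-block edges of the requested range can still hold stale data, which is why a single xfs_zero_range() call over the original byte range suffices. Re-doing the XFS_B_TO_FSB (round up) and XFS_B_TO_FSBT (round down) conversions in user space makes the split visible; the block size and range below are arbitrary.

#include <stdio.h>
#include <stdint.h>

int main(void)
{
        uint64_t        bs = 4096;              /* hypothetical block size */
        uint64_t        offset = 5000, len = 20000;
        /* XFS_B_TO_FSB rounds up, XFS_B_TO_FSBT truncates */
        uint64_t        start_fsb = (offset + bs - 1) / bs;
        uint64_t        end_fsb = (offset + len) / bs;

        if (end_fsb > start_fsb)
                printf("unmap whole blocks [%llu, %llu)\n",
                       (unsigned long long)start_fsb,
                       (unsigned long long)end_fsb);
        /* the partial edges outside those blocks are handled by one
         * xfs_zero_range() call over the whole byte range, which skips
         * the holes just punched */
        printf("zero remaining bytes inside [%llu, %llu)\n",
               (unsigned long long)offset,
               (unsigned long long)(offset + len));
        return 0;
}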
 
 /*
index af97d9a..f200714 100644 (file)
@@ -31,8 +31,6 @@ struct xfs_bmalloca;
 int    xfs_bmap_rtalloc(struct xfs_bmalloca *ap);
 int    xfs_bmap_eof(struct xfs_inode *ip, xfs_fileoff_t endoff,
                     int whichfork, int *eof);
-int    xfs_bmap_count_blocks(struct xfs_trans *tp, struct xfs_inode *ip,
-                             int whichfork, int *count);
 int    xfs_bmap_punch_delalloc_range(struct xfs_inode *ip,
                xfs_fileoff_t start_fsb, xfs_fileoff_t length);
 
@@ -43,7 +41,6 @@ int   xfs_getbmap(struct xfs_inode *ip, struct getbmapx *bmv,
 
 /* functions in xfs_bmap.c that are only needed by xfs_bmap_util.c */
 void   xfs_bmap_del_free(struct xfs_bmap_free *flist,
-                         struct xfs_bmap_free_item *prev,
                          struct xfs_bmap_free_item *free);
 int    xfs_bmap_extsize_align(struct xfs_mount *mp, struct xfs_bmbt_irec *gotp,
                               struct xfs_bmbt_irec *prevp, xfs_extlen_t extsz,
index a87a0d5..47a318c 100644 (file)
@@ -79,6 +79,47 @@ xfs_buf_vmap_len(
        return (bp->b_page_count * PAGE_SIZE) - bp->b_offset;
 }
 
+/*
+ * Bump the I/O in flight count on the buftarg if we haven't yet done so for
+ * this buffer. The count is incremented once per buffer (per hold cycle)
+ * because the corresponding decrement is deferred to buffer release. Buffers
+ * can undergo I/O multiple times in a hold-release cycle and per buffer I/O
+ * tracking adds unnecessary overhead. This is used for synchronization purposes
+ * with unmount (see xfs_wait_buftarg()), so all we really need is a count of
+ * in-flight buffers.
+ *
+ * Buffers that are never released (e.g., superblock, iclog buffers) must set
+ * the XBF_NO_IOACCT flag before I/O submission. Otherwise, the buftarg count
+ * never reaches zero and unmount hangs indefinitely.
+ */
+static inline void
+xfs_buf_ioacct_inc(
+       struct xfs_buf  *bp)
+{
+       if (bp->b_flags & (XBF_NO_IOACCT|_XBF_IN_FLIGHT))
+               return;
+
+       ASSERT(bp->b_flags & XBF_ASYNC);
+       bp->b_flags |= _XBF_IN_FLIGHT;
+       percpu_counter_inc(&bp->b_target->bt_io_count);
+}
+
+/*
+ * Clear the in-flight state on a buffer about to be released to the LRU or
+ * freed and unaccount from the buftarg.
+ */
+static inline void
+xfs_buf_ioacct_dec(
+       struct xfs_buf  *bp)
+{
+       if (!(bp->b_flags & _XBF_IN_FLIGHT))
+               return;
+
+       ASSERT(bp->b_flags & XBF_ASYNC);
+       bp->b_flags &= ~_XBF_IN_FLIGHT;
+       percpu_counter_dec(&bp->b_target->bt_io_count);
+}
+
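The accounting scheme described above can be modelled without kernel infrastructure: one shared counter standing in for the per-cpu bt_io_count, an IN_FLIGHT bit so the counter moves at most once per hold/release cycle, and an opt-out flag for buffers that are never released. The sketch below uses a plain C11 atomic and is only an analogue, not the XFS implementation.

#include <stdatomic.h>
#include <stdio.h>

#define BUF_NO_IOACCT   (1u << 0)       /* buffer never released, don't count */
#define BUF_IN_FLIGHT   (1u << 1)       /* already counted for this hold cycle */

struct buf {
        unsigned int    flags;
};

static atomic_uint      io_count;       /* stands in for bt_io_count */

static void buf_ioacct_inc(struct buf *bp)
{
        if (bp->flags & (BUF_NO_IOACCT | BUF_IN_FLIGHT))
                return;                 /* exempt, or already counted */
        bp->flags |= BUF_IN_FLIGHT;
        atomic_fetch_add(&io_count, 1);
}

static void buf_ioacct_dec(struct buf *bp)
{
        if (!(bp->flags & BUF_IN_FLIGHT))
                return;
        bp->flags &= ~BUF_IN_FLIGHT;
        atomic_fetch_sub(&io_count, 1);
}

int main(void)
{
        struct buf      a = { 0 };
        struct buf      sb = { BUF_NO_IOACCT };

        buf_ioacct_inc(&a);     /* first submission in this hold cycle counts */
        buf_ioacct_inc(&a);     /* resubmission does not count again */
        buf_ioacct_inc(&sb);    /* never-released buffers stay exempt */
        printf("in flight after submit: %u\n", atomic_load(&io_count));

        buf_ioacct_dec(&a);     /* release drops the accounting */
        printf("in flight after release: %u\n", atomic_load(&io_count));

        /* unmount-style wait: poll until the count drains to zero */
        while (atomic_load(&io_count) != 0)
                ;
        return 0;
}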
 /*
  * When we mark a buffer stale, we remove the buffer from the LRU and clear the
  * b_lru_ref count so that the buffer is freed immediately when the buffer
@@ -102,6 +143,14 @@ xfs_buf_stale(
         */
        bp->b_flags &= ~_XBF_DELWRI_Q;
 
+       /*
+        * Once the buffer is marked stale and unlocked, a subsequent lookup
+        * could reset b_flags. There is no guarantee that the buffer is
+        * unaccounted (released to LRU) before that occurs. Drop in-flight
+        * status now to preserve accounting consistency.
+        */
+       xfs_buf_ioacct_dec(bp);
+
        spin_lock(&bp->b_lock);
        atomic_set(&bp->b_lru_ref, 0);
        if (!(bp->b_state & XFS_BSTATE_DISPOSE) &&
@@ -815,7 +864,8 @@ xfs_buf_get_uncached(
        struct xfs_buf          *bp;
        DEFINE_SINGLE_BUF_MAP(map, XFS_BUF_DADDR_NULL, numblks);
 
-       bp = _xfs_buf_alloc(target, &map, 1, 0);
+       /* flags might contain irrelevant bits, pass only what we care about */
+       bp = _xfs_buf_alloc(target, &map, 1, flags & XBF_NO_IOACCT);
        if (unlikely(bp == NULL))
                goto fail;
 
@@ -866,63 +916,85 @@ xfs_buf_hold(
 }
 
 /*
- *     Releases a hold on the specified buffer.  If the
- *     the hold count is 1, calls xfs_buf_free.
+ * Release a hold on the specified buffer. If the hold count is 1, the buffer is
+ * placed on the LRU or freed (depending on b_lru_ref).
  */
 void
 xfs_buf_rele(
        xfs_buf_t               *bp)
 {
        struct xfs_perag        *pag = bp->b_pag;
+       bool                    release;
+       bool                    freebuf = false;
 
        trace_xfs_buf_rele(bp, _RET_IP_);
 
        if (!pag) {
                ASSERT(list_empty(&bp->b_lru));
                ASSERT(RB_EMPTY_NODE(&bp->b_rbnode));
-               if (atomic_dec_and_test(&bp->b_hold))
+               if (atomic_dec_and_test(&bp->b_hold)) {
+                       xfs_buf_ioacct_dec(bp);
                        xfs_buf_free(bp);
+               }
                return;
        }
 
        ASSERT(!RB_EMPTY_NODE(&bp->b_rbnode));
 
        ASSERT(atomic_read(&bp->b_hold) > 0);
-       if (atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock)) {
-               spin_lock(&bp->b_lock);
-               if (!(bp->b_flags & XBF_STALE) && atomic_read(&bp->b_lru_ref)) {
-                       /*
-                        * If the buffer is added to the LRU take a new
-                        * reference to the buffer for the LRU and clear the
-                        * (now stale) dispose list state flag
-                        */
-                       if (list_lru_add(&bp->b_target->bt_lru, &bp->b_lru)) {
-                               bp->b_state &= ~XFS_BSTATE_DISPOSE;
-                               atomic_inc(&bp->b_hold);
-                       }
-                       spin_unlock(&bp->b_lock);
-                       spin_unlock(&pag->pag_buf_lock);
-               } else {
-                       /*
-                        * most of the time buffers will already be removed from
-                        * the LRU, so optimise that case by checking for the
-                        * XFS_BSTATE_DISPOSE flag indicating the last list the
-                        * buffer was on was the disposal list
-                        */
-                       if (!(bp->b_state & XFS_BSTATE_DISPOSE)) {
-                               list_lru_del(&bp->b_target->bt_lru, &bp->b_lru);
-                       } else {
-                               ASSERT(list_empty(&bp->b_lru));
-                       }
-                       spin_unlock(&bp->b_lock);
 
-                       ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
-                       rb_erase(&bp->b_rbnode, &pag->pag_buf_tree);
-                       spin_unlock(&pag->pag_buf_lock);
-                       xfs_perag_put(pag);
-                       xfs_buf_free(bp);
+       release = atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock);
+       spin_lock(&bp->b_lock);
+       if (!release) {
+               /*
+                * Drop the in-flight state if the buffer is already on the LRU
+                * and it holds the only reference. This is racy because we
+                * haven't acquired the pag lock, but the use of _XBF_IN_FLIGHT
+                * ensures the decrement occurs only once per-buf.
+                */
+               if ((atomic_read(&bp->b_hold) == 1) && !list_empty(&bp->b_lru))
+                       xfs_buf_ioacct_dec(bp);
+               goto out_unlock;
+       }
+
+       /* the last reference has been dropped ... */
+       xfs_buf_ioacct_dec(bp);
+       if (!(bp->b_flags & XBF_STALE) && atomic_read(&bp->b_lru_ref)) {
+               /*
+                * If the buffer is added to the LRU take a new reference to the
+                * buffer for the LRU and clear the (now stale) dispose list
+                * state flag
+                */
+               if (list_lru_add(&bp->b_target->bt_lru, &bp->b_lru)) {
+                       bp->b_state &= ~XFS_BSTATE_DISPOSE;
+                       atomic_inc(&bp->b_hold);
+               }
+               spin_unlock(&pag->pag_buf_lock);
+       } else {
+               /*
+                * most of the time buffers will already be removed from the
+                * LRU, so optimise that case by checking for the
+                * XFS_BSTATE_DISPOSE flag indicating the last list the buffer
+                * was on was the disposal list
+                */
+               if (!(bp->b_state & XFS_BSTATE_DISPOSE)) {
+                       list_lru_del(&bp->b_target->bt_lru, &bp->b_lru);
+               } else {
+                       ASSERT(list_empty(&bp->b_lru));
                }
+
+               ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
+               rb_erase(&bp->b_rbnode, &pag->pag_buf_tree);
+               spin_unlock(&pag->pag_buf_lock);
+               xfs_perag_put(pag);
+               freebuf = true;
        }
+
+out_unlock:
+       spin_unlock(&bp->b_lock);
+
+       if (freebuf)
+               xfs_buf_free(bp);
 }
 
 
@@ -944,10 +1016,12 @@ xfs_buf_trylock(
        int                     locked;
 
        locked = down_trylock(&bp->b_sema) == 0;
-       if (locked)
+       if (locked) {
                XB_SET_OWNER(bp);
-
-       trace_xfs_buf_trylock(bp, _RET_IP_);
+               trace_xfs_buf_trylock(bp, _RET_IP_);
+       } else {
+               trace_xfs_buf_trylock_fail(bp, _RET_IP_);
+       }
        return locked;
 }
 
@@ -1339,6 +1413,7 @@ xfs_buf_submit(
         * xfs_buf_ioend too early.
         */
        atomic_set(&bp->b_io_remaining, 1);
+       xfs_buf_ioacct_inc(bp);
        _xfs_buf_ioapply(bp);
 
        /*
@@ -1524,13 +1599,19 @@ xfs_wait_buftarg(
        int loop = 0;
 
        /*
-        * We need to flush the buffer workqueue to ensure that all IO
-        * completion processing is 100% done. Just waiting on buffer locks is
-        * not sufficient for async IO as the reference count held over IO is
-        * not released until after the buffer lock is dropped. Hence we need to
-        * ensure here that all reference counts have been dropped before we
-        * start walking the LRU list.
+        * First wait on the buftarg I/O count for all in-flight buffers to be
+        * released. This is critical as new buffers do not make the LRU until
+        * they are released.
+        *
+        * Next, flush the buffer workqueue to ensure all completion processing
+        * has finished. Just waiting on buffer locks is not sufficient for
+        * async IO as the reference count held over IO is not released until
+        * after the buffer lock is dropped. Hence we need to ensure here that
+        * all reference counts have been dropped before we start walking the
+        * LRU list.
         */
+       while (percpu_counter_sum(&btp->bt_io_count))
+               delay(100);
        drain_workqueue(btp->bt_mount->m_buf_workqueue);
 
        /* loop until there is nothing left on the lru list. */
@@ -1627,6 +1708,8 @@ xfs_free_buftarg(
        struct xfs_buftarg      *btp)
 {
        unregister_shrinker(&btp->bt_shrinker);
+       ASSERT(percpu_counter_sum(&btp->bt_io_count) == 0);
+       percpu_counter_destroy(&btp->bt_io_count);
        list_lru_destroy(&btp->bt_lru);
 
        if (mp->m_flags & XFS_MOUNT_BARRIER)
@@ -1691,6 +1774,9 @@ xfs_alloc_buftarg(
        if (list_lru_init(&btp->bt_lru))
                goto error;
 
+       if (percpu_counter_init(&btp->bt_io_count, 0, GFP_KERNEL))
+               goto error;
+
        btp->bt_shrinker.count_objects = xfs_buftarg_shrink_count;
        btp->bt_shrinker.scan_objects = xfs_buftarg_shrink_scan;
        btp->bt_shrinker.seeks = DEFAULT_SEEKS;
@@ -1774,18 +1860,33 @@ xfs_buf_cmp(
        return 0;
 }
 
+/*
+ * submit buffers for write.
+ *
+ * When we have a large buffer list, we do not want to hold all the buffers
+ * locked while we block on the request queue waiting for IO dispatch. To avoid
+ * this problem, we lock and submit buffers in groups of 50, thereby minimising
+ * the lock hold times for lists which may contain thousands of objects.
+ *
+ * To do this, we sort the buffer list before we walk the list to lock and
+ * submit buffers, and we plug and unplug around each group of buffers we
+ * submit.
+ */
 static int
-__xfs_buf_delwri_submit(
+xfs_buf_delwri_submit_buffers(
        struct list_head        *buffer_list,
-       struct list_head        *io_list,
-       bool                    wait)
+       struct list_head        *wait_list)
 {
-       struct blk_plug         plug;
        struct xfs_buf          *bp, *n;
+       LIST_HEAD               (submit_list);
        int                     pinned = 0;
+       struct blk_plug         plug;
 
+       list_sort(NULL, buffer_list, xfs_buf_cmp);
+
+       blk_start_plug(&plug);
        list_for_each_entry_safe(bp, n, buffer_list, b_list) {
-               if (!wait) {
+               if (!wait_list) {
                        if (xfs_buf_ispinned(bp)) {
                                pinned++;
                                continue;
@@ -1808,25 +1909,21 @@ __xfs_buf_delwri_submit(
                        continue;
                }
 
-               list_move_tail(&bp->b_list, io_list);
                trace_xfs_buf_delwri_split(bp, _RET_IP_);
-       }
-
-       list_sort(NULL, io_list, xfs_buf_cmp);
-
-       blk_start_plug(&plug);
-       list_for_each_entry_safe(bp, n, io_list, b_list) {
-               bp->b_flags &= ~(_XBF_DELWRI_Q | XBF_ASYNC | XBF_WRITE_FAIL);
-               bp->b_flags |= XBF_WRITE | XBF_ASYNC;
 
                /*
-                * we do all Io submission async. This means if we need to wait
-                * for IO completion we need to take an extra reference so the
-                * buffer is still valid on the other side.
+                * We do all IO submission async. This means if we need
+                * to wait for IO completion we need to take an extra
+                * reference so the buffer is still valid on the other
+                * side. We need to move the buffer onto the wait_list

+                * at this point so the caller can still access it.
                 */
-               if (wait)
+               bp->b_flags &= ~(_XBF_DELWRI_Q | XBF_WRITE_FAIL);
+               bp->b_flags |= XBF_WRITE | XBF_ASYNC;
+               if (wait_list) {
                        xfs_buf_hold(bp);
-               else
+                       list_move_tail(&bp->b_list, wait_list);
+               } else
                        list_del_init(&bp->b_list);
 
                xfs_buf_submit(bp);
@@ -1849,8 +1946,7 @@ int
 xfs_buf_delwri_submit_nowait(
        struct list_head        *buffer_list)
 {
-       LIST_HEAD               (io_list);
-       return __xfs_buf_delwri_submit(buffer_list, &io_list, false);
+       return xfs_buf_delwri_submit_buffers(buffer_list, NULL);
 }
 
 /*
@@ -1865,15 +1961,15 @@ int
 xfs_buf_delwri_submit(
        struct list_head        *buffer_list)
 {
-       LIST_HEAD               (io_list);
+       LIST_HEAD               (wait_list);
        int                     error = 0, error2;
        struct xfs_buf          *bp;
 
-       __xfs_buf_delwri_submit(buffer_list, &io_list, true);
+       xfs_buf_delwri_submit_buffers(buffer_list, &wait_list);
 
        /* Wait for IO to complete. */
-       while (!list_empty(&io_list)) {
-               bp = list_first_entry(&io_list, struct xfs_buf, b_list);
+       while (!list_empty(&wait_list)) {
+               bp = list_first_entry(&wait_list, struct xfs_buf, b_list);
 
                list_del_init(&bp->b_list);
 
index 8bfb974..1c2e52b 100644 (file)
@@ -43,6 +43,7 @@ typedef enum {
 #define XBF_READ        (1 << 0) /* buffer intended for reading from device */
 #define XBF_WRITE       (1 << 1) /* buffer intended for writing to device */
 #define XBF_READ_AHEAD  (1 << 2) /* asynchronous read-ahead */
+#define XBF_NO_IOACCT   (1 << 3) /* bypass I/O accounting (non-LRU bufs) */
 #define XBF_ASYNC       (1 << 4) /* initiator will not wait for completion */
 #define XBF_DONE        (1 << 5) /* all pages in the buffer uptodate */
 #define XBF_STALE       (1 << 6) /* buffer has been staled, do not find it */
@@ -62,6 +63,7 @@ typedef enum {
 #define _XBF_KMEM       (1 << 21)/* backed by heap memory */
 #define _XBF_DELWRI_Q   (1 << 22)/* buffer on a delwri queue */
 #define _XBF_COMPOUND   (1 << 23)/* compound buffer */
+#define _XBF_IN_FLIGHT  (1 << 25) /* I/O in flight, for accounting purposes */
 
 typedef unsigned int xfs_buf_flags_t;
 
@@ -81,7 +83,8 @@ typedef unsigned int xfs_buf_flags_t;
        { _XBF_PAGES,           "PAGES" }, \
        { _XBF_KMEM,            "KMEM" }, \
        { _XBF_DELWRI_Q,        "DELWRI_Q" }, \
-       { _XBF_COMPOUND,        "COMPOUND" }
+       { _XBF_COMPOUND,        "COMPOUND" }, \
+       { _XBF_IN_FLIGHT,       "IN_FLIGHT" }
 
 
 /*
@@ -115,6 +118,8 @@ typedef struct xfs_buftarg {
        /* LRU control structures */
        struct shrinker         bt_shrinker;
        struct list_lru         bt_lru;
+
+       struct percpu_counter   bt_io_count;
 } xfs_buftarg_t;
 
 struct xfs_buf;
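The new bt_io_count counter pairs with the _XBF_IN_FLIGHT state bit so a buffer is only counted once however many times the submission or completion paths run, and XBF_NO_IOACCT exempts buffers that never go on the LRU. A small stand-alone sketch of that pattern, with hypothetical names in place of the kernel's accounting helpers (which are not visible in this hunk):

#include <stdbool.h>
#include <stdio.h>

#define BUF_NO_IOACCT   (1u << 0)       /* never counted (non-LRU buffer) */
#define BUF_IN_FLIGHT   (1u << 1)       /* currently counted in io_count */

static long io_count;

struct buf {
        unsigned int    flags;
};

/* Count the buffer as in flight exactly once. */
static void ioacct_inc(struct buf *bp)
{
        if (bp->flags & (BUF_NO_IOACCT | BUF_IN_FLIGHT))
                return;
        bp->flags |= BUF_IN_FLIGHT;
        io_count++;
}

/* Undo the accounting exactly once, only while the in-flight bit is set. */
static void ioacct_dec(struct buf *bp)
{
        if (!(bp->flags & BUF_IN_FLIGHT))
                return;
        bp->flags &= ~BUF_IN_FLIGHT;
        io_count--;
}

int main(void)
{
        struct buf bp = { 0 };

        ioacct_inc(&bp);
        ioacct_inc(&bp);        /* a second attempt does not double count */
        ioacct_dec(&bp);
        printf("io_count = %ld\n", io_count);   /* back to 0 */
        return 0;
}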
index 3425799..e455f90 100644 (file)
@@ -359,7 +359,7 @@ xfs_buf_item_format(
        for (i = 0; i < bip->bli_format_count; i++) {
                xfs_buf_item_format_segment(bip, lv, &vecp, offset,
                                            &bip->bli_formats[i]);
-               offset += bp->b_maps[i].bm_len;
+               offset += BBTOB(bp->b_maps[i].bm_len);
        }
 
        /*
@@ -915,20 +915,28 @@ xfs_buf_item_log(
        for (i = 0; i < bip->bli_format_count; i++) {
                if (start > last)
                        break;
-               end = start + BBTOB(bp->b_maps[i].bm_len);
+               end = start + BBTOB(bp->b_maps[i].bm_len) - 1;
+
+               /* skip to the map that includes the first byte to log */
                if (first > end) {
                        start += BBTOB(bp->b_maps[i].bm_len);
                        continue;
                }
+
+               /*
+                * Trim the range to this segment and mark it in the bitmap.
+                * Note that we must convert buffer offsets to segment relative
+                * offsets (e.g., the first byte of each segment is byte 0 of
+                * that segment).
+                */
                if (first < start)
                        first = start;
                if (end > last)
                        end = last;
-
-               xfs_buf_item_log_segment(first, end,
+               xfs_buf_item_log_segment(first - start, end - start,
                                         &bip->bli_formats[i].blf_data_map[0]);
 
-               start += bp->b_maps[i].bm_len;
+               start += BBTOB(bp->b_maps[i].bm_len);
        }
 }
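The xfs_buf_item_log() fix above converts buffer-relative byte offsets into segment-relative ones before marking each format segment's dirty bitmap. Below is a stand-alone sketch of just that arithmetic, using 512-byte basic blocks as in the kernel; the segment sizes in the example are arbitrary.

#include <stdint.h>
#include <stdio.h>

#define BBSHIFT 9                       /* 512-byte basic blocks */
#define BBTOB(bbs) ((uint32_t)(bbs) << BBSHIFT)

/*
 * For a buffer made of several discontiguous segments, print the part of
 * [first, last] (buffer-relative byte offsets) that each segment should
 * log, converted to that segment's own (segment-relative) offsets.
 */
static void log_range(const uint32_t *seg_len_bbs, int nsegs,
                      uint32_t first, uint32_t last)
{
        uint32_t start = 0;

        for (int i = 0; i < nsegs; i++) {
                uint32_t end = start + BBTOB(seg_len_bbs[i]) - 1;

                if (first > end) {              /* range starts past this segment */
                        start += BBTOB(seg_len_bbs[i]);
                        continue;
                }
                if (start > last)               /* no more overlap */
                        break;

                uint32_t lo = first < start ? start : first;
                uint32_t hi = end > last ? last : end;

                /* offsets are relative to this segment's own bitmap */
                printf("segment %d: log bytes %u..%u\n", i,
                       (unsigned)(lo - start), (unsigned)(hi - start));
                start += BBTOB(seg_len_bbs[i]);
        }
}

int main(void)
{
        uint32_t segs[] = { 8, 8 };     /* two 4k segments, in basic blocks */

        log_range(segs, 2, 1000, 5000); /* spans the segment boundary at 4096 */
        return 0;
}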
 
@@ -949,6 +957,7 @@ xfs_buf_item_free(
        xfs_buf_log_item_t      *bip)
 {
        xfs_buf_item_free_format(bip);
+       kmem_free(bip->bli_item.li_lv_shadow);
        kmem_zone_free(xfs_buf_item_zone, bip);
 }
 
@@ -1073,6 +1082,8 @@ xfs_buf_iodone_callback_error(
        trace_xfs_buf_item_iodone_async(bp, _RET_IP_);
        ASSERT(bp->b_iodone != NULL);
 
+       cfg = xfs_error_get_cfg(mp, XFS_ERR_METADATA, bp->b_error);
+
        /*
         * If the write was asynchronous then no one will be looking for the
         * error.  If this is the first failure of this type, clear the error
@@ -1080,13 +1091,12 @@ xfs_buf_iodone_callback_error(
         * async write failure at least once, but we also need to set the buffer
         * up to behave correctly now for repeated failures.
         */
-       if (!(bp->b_flags & (XBF_STALE|XBF_WRITE_FAIL)) ||
+       if (!(bp->b_flags & (XBF_STALE | XBF_WRITE_FAIL)) ||
             bp->b_last_error != bp->b_error) {
-               bp->b_flags |= (XBF_WRITE | XBF_ASYNC |
-                               XBF_DONE | XBF_WRITE_FAIL);
+               bp->b_flags |= (XBF_WRITE | XBF_DONE | XBF_WRITE_FAIL);
                bp->b_last_error = bp->b_error;
-               bp->b_retries = 0;
-               bp->b_first_retry_time = jiffies;
+               if (cfg->retry_timeout && !bp->b_first_retry_time)
+                       bp->b_first_retry_time = jiffies;
 
                xfs_buf_ioerror(bp, 0);
                xfs_buf_submit(bp);
@@ -1097,7 +1107,6 @@ xfs_buf_iodone_callback_error(
         * Repeated failure on an async write. Take action according to the
         * error configuration we have been set up to use.
         */
-       cfg = xfs_error_get_cfg(mp, XFS_ERR_METADATA, bp->b_error);
 
        if (cfg->max_retries != XFS_ERR_RETRY_FOREVER &&
            ++bp->b_retries > cfg->max_retries)
index e064665..ccb0811 100644 (file)
@@ -74,6 +74,7 @@ xfs_qm_dqdestroy(
 {
        ASSERT(list_empty(&dqp->q_lru));
 
+       kmem_free(dqp->q_logitem.qli_item.li_lv_shadow);
        mutex_destroy(&dqp->q_qlock);
 
        XFS_STATS_DEC(dqp->q_mount, xs_qm_dquot);
index 814cff9..2c7a162 100644 (file)
@@ -370,6 +370,8 @@ xfs_qm_qoffend_logitem_committed(
        spin_lock(&ailp->xa_lock);
        xfs_trans_ail_delete(ailp, &qfs->qql_item, SHUTDOWN_LOG_IO_ERROR);
 
+       kmem_free(qfs->qql_item.li_lv_shadow);
+       kmem_free(lip->li_lv_shadow);
        kmem_free(qfs);
        kmem_free(qfe);
        return (xfs_lsn_t)-1;
index 88693a9..ed7ee4e 100644 (file)
@@ -55,12 +55,15 @@ xfs_error_test(int error_tag, int *fsidp, char *expression,
 }
 
 int
-xfs_errortag_add(int error_tag, xfs_mount_t *mp)
+xfs_errortag_add(unsigned int error_tag, xfs_mount_t *mp)
 {
        int i;
        int len;
        int64_t fsid;
 
+       if (error_tag >= XFS_ERRTAG_MAX)
+               return -EINVAL;
+
        memcpy(&fsid, mp->m_fixedfsid, sizeof(xfs_fsid_t));
 
        for (i = 0; i < XFS_NUM_INJECT_ERROR; i++)  {
index 4ed3042..2e4f67f 100644 (file)
@@ -128,7 +128,7 @@ extern int xfs_error_test(int, int *, char *, int, char *, unsigned long);
         xfs_error_test((tag), (mp)->m_fixedfsid, "expr", __LINE__, __FILE__, \
                        (rf))))
 
-extern int xfs_errortag_add(int error_tag, struct xfs_mount *mp);
+extern int xfs_errortag_add(unsigned int error_tag, struct xfs_mount *mp);
 extern int xfs_errortag_clearall(struct xfs_mount *mp, int loud);
 #else
 #define XFS_TEST_ERROR(expr, mp, tag, rf)      (expr)
index 4aa0153..ab77946 100644 (file)
@@ -40,6 +40,7 @@ void
 xfs_efi_item_free(
        struct xfs_efi_log_item *efip)
 {
+       kmem_free(efip->efi_item.li_lv_shadow);
        if (efip->efi_format.efi_nextents > XFS_EFI_MAX_FAST_EXTENTS)
                kmem_free(efip);
        else
@@ -300,6 +301,7 @@ static inline struct xfs_efd_log_item *EFD_ITEM(struct xfs_log_item *lip)
 STATIC void
 xfs_efd_item_free(struct xfs_efd_log_item *efdp)
 {
+       kmem_free(efdp->efd_item.li_lv_shadow);
        if (efdp->efd_format.efd_nextents > XFS_EFD_MAX_FAST_EXTENTS)
                kmem_free(efdp);
        else
index 1b3dc9d..ed95e5b 100644 (file)
@@ -37,6 +37,7 @@
 #include "xfs_log.h"
 #include "xfs_icache.h"
 #include "xfs_pnfs.h"
+#include "xfs_iomap.h"
 
 #include <linux/dcache.h>
 #include <linux/falloc.h>
@@ -80,61 +81,17 @@ xfs_rw_ilock_demote(
 }
 
 /*
- * xfs_iozero clears the specified range supplied via the page cache (except in
- * the DAX case). Writes through the page cache will allocate blocks over holes,
- * though the callers usually map the holes first and avoid them. If a block is
- * not completely zeroed, then it will be read from disk before being partially
- * zeroed.
- *
- * In the DAX case, we can just directly write to the underlying pages. This
- * will not allocate blocks, but will avoid holes and unwritten extents and so
- * not do unnecessary work.
+ * Clear the specified ranges to zero through either the pagecache or DAX.
+ * Holes and unwritten extents will be left as-is, as they are already zeroed.
  */
 int
-xfs_iozero(
-       struct xfs_inode        *ip,    /* inode                        */
-       loff_t                  pos,    /* offset in file               */
-       size_t                  count)  /* size of data to zero         */
+xfs_zero_range(
+       struct xfs_inode        *ip,
+       xfs_off_t               pos,
+       xfs_off_t               count,
+       bool                    *did_zero)
 {
-       struct page             *page;
-       struct address_space    *mapping;
-       int                     status = 0;
-
-
-       mapping = VFS_I(ip)->i_mapping;
-       do {
-               unsigned offset, bytes;
-               void *fsdata;
-
-               offset = (pos & (PAGE_SIZE -1)); /* Within page */
-               bytes = PAGE_SIZE - offset;
-               if (bytes > count)
-                       bytes = count;
-
-               if (IS_DAX(VFS_I(ip))) {
-                       status = dax_zero_page_range(VFS_I(ip), pos, bytes,
-                                                    xfs_get_blocks_direct);
-                       if (status)
-                               break;
-               } else {
-                       status = pagecache_write_begin(NULL, mapping, pos, bytes,
-                                               AOP_FLAG_UNINTERRUPTIBLE,
-                                               &page, &fsdata);
-                       if (status)
-                               break;
-
-                       zero_user(page, offset, bytes);
-
-                       status = pagecache_write_end(NULL, mapping, pos, bytes,
-                                               bytes, page, fsdata);
-                       WARN_ON(status <= 0); /* can't return less than zero! */
-                       status = 0;
-               }
-               pos += bytes;
-               count -= bytes;
-       } while (count);
-
-       return status;
+       return iomap_zero_range(VFS_I(ip), pos, count, NULL, &xfs_iomap_ops);
 }
 
 int
@@ -282,48 +239,35 @@ xfs_file_fsync(
 }
 
 STATIC ssize_t
-xfs_file_read_iter(
+xfs_file_dio_aio_read(
        struct kiocb            *iocb,
        struct iov_iter         *to)
 {
-       struct file             *file = iocb->ki_filp;
-       struct inode            *inode = file->f_mapping->host;
+       struct address_space    *mapping = iocb->ki_filp->f_mapping;
+       struct inode            *inode = mapping->host;
        struct xfs_inode        *ip = XFS_I(inode);
-       struct xfs_mount        *mp = ip->i_mount;
-       size_t                  size = iov_iter_count(to);
+       loff_t                  isize = i_size_read(inode);
+       size_t                  count = iov_iter_count(to);
+       struct iov_iter         data;
+       struct xfs_buftarg      *target;
        ssize_t                 ret = 0;
-       int                     ioflags = 0;
-       xfs_fsize_t             n;
-       loff_t                  pos = iocb->ki_pos;
 
-       XFS_STATS_INC(mp, xs_read_calls);
-
-       if (unlikely(iocb->ki_flags & IOCB_DIRECT))
-               ioflags |= XFS_IO_ISDIRECT;
-       if (file->f_mode & FMODE_NOCMTIME)
-               ioflags |= XFS_IO_INVIS;
-
-       if ((ioflags & XFS_IO_ISDIRECT) && !IS_DAX(inode)) {
-               xfs_buftarg_t   *target =
-                       XFS_IS_REALTIME_INODE(ip) ?
-                               mp->m_rtdev_targp : mp->m_ddev_targp;
-               /* DIO must be aligned to device logical sector size */
-               if ((pos | size) & target->bt_logical_sectormask) {
-                       if (pos == i_size_read(inode))
-                               return 0;
-                       return -EINVAL;
-               }
-       }
+       trace_xfs_file_direct_read(ip, count, iocb->ki_pos);
 
-       n = mp->m_super->s_maxbytes - pos;
-       if (n <= 0 || size == 0)
-               return 0;
+       if (!count)
+               return 0; /* skip atime */
 
-       if (n < size)
-               size = n;
+       if (XFS_IS_REALTIME_INODE(ip))
+               target = ip->i_mount->m_rtdev_targp;
+       else
+               target = ip->i_mount->m_ddev_targp;
 
-       if (XFS_FORCED_SHUTDOWN(mp))
-               return -EIO;
+       /* DIO must be aligned to device logical sector size */
+       if ((iocb->ki_pos | count) & target->bt_logical_sectormask) {
+               if (iocb->ki_pos == isize)
+                       return 0;
+               return -EINVAL;
+       }
 
        /*
         * Locking is a bit tricky here. If we take an exclusive lock for direct
@@ -336,7 +280,7 @@ xfs_file_read_iter(
         * serialisation.
         */
        xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
-       if ((ioflags & XFS_IO_ISDIRECT) && inode->i_mapping->nrpages) {
+       if (mapping->nrpages) {
                xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
                xfs_rw_ilock(ip, XFS_IOLOCK_EXCL);
 
@@ -351,8 +295,8 @@ xfs_file_read_iter(
                 * flush and reduce the chances of repeated iolock cycles going
                 * forward.
                 */
-               if (inode->i_mapping->nrpages) {
-                       ret = filemap_write_and_wait(VFS_I(ip)->i_mapping);
+               if (mapping->nrpages) {
+                       ret = filemap_write_and_wait(mapping);
                        if (ret) {
                                xfs_rw_iunlock(ip, XFS_IOLOCK_EXCL);
                                return ret;
@@ -363,20 +307,95 @@ xfs_file_read_iter(
                         * we fail to invalidate a page, but this should never
                         * happen on XFS. Warn if it does fail.
                         */
-                       ret = invalidate_inode_pages2(VFS_I(ip)->i_mapping);
+                       ret = invalidate_inode_pages2(mapping);
                        WARN_ON_ONCE(ret);
                        ret = 0;
                }
                xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
        }
 
-       trace_xfs_file_read(ip, size, pos, ioflags);
+       data = *to;
+       ret = __blockdev_direct_IO(iocb, inode, target->bt_bdev, &data,
+                       xfs_get_blocks_direct, NULL, NULL, 0);
+       if (ret > 0) {
+               iocb->ki_pos += ret;
+               iov_iter_advance(to, ret);
+       }
+       xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
 
+       file_accessed(iocb->ki_filp);
+       return ret;
+}
+
+static noinline ssize_t
+xfs_file_dax_read(
+       struct kiocb            *iocb,
+       struct iov_iter         *to)
+{
+       struct address_space    *mapping = iocb->ki_filp->f_mapping;
+       struct inode            *inode = mapping->host;
+       struct xfs_inode        *ip = XFS_I(inode);
+       struct iov_iter         data = *to;
+       size_t                  count = iov_iter_count(to);
+       ssize_t                 ret = 0;
+
+       trace_xfs_file_dax_read(ip, count, iocb->ki_pos);
+
+       if (!count)
+               return 0; /* skip atime */
+
+       xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
+       ret = dax_do_io(iocb, inode, &data, xfs_get_blocks_direct, NULL, 0);
+       if (ret > 0) {
+               iocb->ki_pos += ret;
+               iov_iter_advance(to, ret);
+       }
+       xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
+
+       file_accessed(iocb->ki_filp);
+       return ret;
+}
+
+STATIC ssize_t
+xfs_file_buffered_aio_read(
+       struct kiocb            *iocb,
+       struct iov_iter         *to)
+{
+       struct xfs_inode        *ip = XFS_I(file_inode(iocb->ki_filp));
+       ssize_t                 ret;
+
+       trace_xfs_file_buffered_read(ip, iov_iter_count(to), iocb->ki_pos);
+
+       xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
        ret = generic_file_read_iter(iocb, to);
+       xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
+
+       return ret;
+}
+
+STATIC ssize_t
+xfs_file_read_iter(
+       struct kiocb            *iocb,
+       struct iov_iter         *to)
+{
+       struct inode            *inode = file_inode(iocb->ki_filp);
+       struct xfs_mount        *mp = XFS_I(inode)->i_mount;
+       ssize_t                 ret = 0;
+
+       XFS_STATS_INC(mp, xs_read_calls);
+
+       if (XFS_FORCED_SHUTDOWN(mp))
+               return -EIO;
+
+       if (IS_DAX(inode))
+               ret = xfs_file_dax_read(iocb, to);
+       else if (iocb->ki_flags & IOCB_DIRECT)
+               ret = xfs_file_dio_aio_read(iocb, to);
+       else
+               ret = xfs_file_buffered_aio_read(iocb, to);
+
        if (ret > 0)
                XFS_STATS_ADD(mp, xs_read_bytes, ret);
-
-       xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
        return ret;
 }
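The new direct I/O read helper above rejects requests with "(iocb->ki_pos | count) & target->bt_logical_sectormask". Because the sector size is a power of two, OR-ing the position and the length together and masking with (sector size - 1) tests both values for alignment in one step. A tiny stand-alone illustration:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/*
 * A position/length pair is sector aligned iff both values are multiples
 * of the sector size. Any low bit set in either value survives the OR and
 * is caught by the mask.
 */
static bool dio_aligned(uint64_t pos, uint64_t count, uint32_t sector_size)
{
        uint64_t mask = sector_size - 1;        /* sector_size is a power of two */

        return ((pos | count) & mask) == 0;
}

int main(void)
{
        printf("%d\n", dio_aligned(4096, 512, 512));    /* 1: both aligned */
        printf("%d\n", dio_aligned(4096, 100, 512));    /* 0: length unaligned */
        printf("%d\n", dio_aligned(4100, 512, 512));    /* 0: offset unaligned */
        return 0;
}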
 
@@ -389,18 +408,14 @@ xfs_file_splice_read(
        unsigned int            flags)
 {
        struct xfs_inode        *ip = XFS_I(infilp->f_mapping->host);
-       int                     ioflags = 0;
        ssize_t                 ret;
 
        XFS_STATS_INC(ip->i_mount, xs_read_calls);
 
-       if (infilp->f_mode & FMODE_NOCMTIME)
-               ioflags |= XFS_IO_INVIS;
-
        if (XFS_FORCED_SHUTDOWN(ip->i_mount))
                return -EIO;
 
-       trace_xfs_file_splice_read(ip, count, *ppos, ioflags);
+       trace_xfs_file_splice_read(ip, count, *ppos);
 
        /*
         * DAX inodes cannot use the page cache for splice, so we have to push
@@ -423,49 +438,6 @@ out:
        return ret;
 }
 
-/*
- * This routine is called to handle zeroing any space in the last block of the
- * file that is beyond the EOF.  We do this since the size is being increased
- * without writing anything to that block and we don't want to read the
- * garbage on the disk.
- */
-STATIC int                             /* error (positive) */
-xfs_zero_last_block(
-       struct xfs_inode        *ip,
-       xfs_fsize_t             offset,
-       xfs_fsize_t             isize,
-       bool                    *did_zeroing)
-{
-       struct xfs_mount        *mp = ip->i_mount;
-       xfs_fileoff_t           last_fsb = XFS_B_TO_FSBT(mp, isize);
-       int                     zero_offset = XFS_B_FSB_OFFSET(mp, isize);
-       int                     zero_len;
-       int                     nimaps = 1;
-       int                     error = 0;
-       struct xfs_bmbt_irec    imap;
-
-       xfs_ilock(ip, XFS_ILOCK_EXCL);
-       error = xfs_bmapi_read(ip, last_fsb, 1, &imap, &nimaps, 0);
-       xfs_iunlock(ip, XFS_ILOCK_EXCL);
-       if (error)
-               return error;
-
-       ASSERT(nimaps > 0);
-
-       /*
-        * If the block underlying isize is just a hole, then there
-        * is nothing to zero.
-        */
-       if (imap.br_startblock == HOLESTARTBLOCK)
-               return 0;
-
-       zero_len = mp->m_sb.sb_blocksize - zero_offset;
-       if (isize + zero_len > offset)
-               zero_len = offset - isize;
-       *did_zeroing = true;
-       return xfs_iozero(ip, isize, zero_len);
-}
-
 /*
  * Zero any on disk space between the current EOF and the new, larger EOF.
  *
@@ -484,94 +456,11 @@ xfs_zero_eof(
        xfs_fsize_t             isize,          /* current inode size */
        bool                    *did_zeroing)
 {
-       struct xfs_mount        *mp = ip->i_mount;
-       xfs_fileoff_t           start_zero_fsb;
-       xfs_fileoff_t           end_zero_fsb;
-       xfs_fileoff_t           zero_count_fsb;
-       xfs_fileoff_t           last_fsb;
-       xfs_fileoff_t           zero_off;
-       xfs_fsize_t             zero_len;
-       int                     nimaps;
-       int                     error = 0;
-       struct xfs_bmbt_irec    imap;
-
        ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
        ASSERT(offset > isize);
 
        trace_xfs_zero_eof(ip, isize, offset - isize);
-
-       /*
-        * First handle zeroing the block on which isize resides.
-        *
-        * We only zero a part of that block so it is handled specially.
-        */
-       if (XFS_B_FSB_OFFSET(mp, isize) != 0) {
-               error = xfs_zero_last_block(ip, offset, isize, did_zeroing);
-               if (error)
-                       return error;
-       }
-
-       /*
-        * Calculate the range between the new size and the old where blocks
-        * needing to be zeroed may exist.
-        *
-        * To get the block where the last byte in the file currently resides,
-        * we need to subtract one from the size and truncate back to a block
-        * boundary.  We subtract 1 in case the size is exactly on a block
-        * boundary.
-        */
-       last_fsb = isize ? XFS_B_TO_FSBT(mp, isize - 1) : (xfs_fileoff_t)-1;
-       start_zero_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)isize);
-       end_zero_fsb = XFS_B_TO_FSBT(mp, offset - 1);
-       ASSERT((xfs_sfiloff_t)last_fsb < (xfs_sfiloff_t)start_zero_fsb);
-       if (last_fsb == end_zero_fsb) {
-               /*
-                * The size was only incremented on its last block.
-                * We took care of that above, so just return.
-                */
-               return 0;
-       }
-
-       ASSERT(start_zero_fsb <= end_zero_fsb);
-       while (start_zero_fsb <= end_zero_fsb) {
-               nimaps = 1;
-               zero_count_fsb = end_zero_fsb - start_zero_fsb + 1;
-
-               xfs_ilock(ip, XFS_ILOCK_EXCL);
-               error = xfs_bmapi_read(ip, start_zero_fsb, zero_count_fsb,
-                                         &imap, &nimaps, 0);
-               xfs_iunlock(ip, XFS_ILOCK_EXCL);
-               if (error)
-                       return error;
-
-               ASSERT(nimaps > 0);
-
-               if (imap.br_state == XFS_EXT_UNWRITTEN ||
-                   imap.br_startblock == HOLESTARTBLOCK) {
-                       start_zero_fsb = imap.br_startoff + imap.br_blockcount;
-                       ASSERT(start_zero_fsb <= (end_zero_fsb + 1));
-                       continue;
-               }
-
-               /*
-                * There are blocks we need to zero.
-                */
-               zero_off = XFS_FSB_TO_B(mp, start_zero_fsb);
-               zero_len = XFS_FSB_TO_B(mp, imap.br_blockcount);
-
-               if ((zero_off + zero_len) > offset)
-                       zero_len = offset - zero_off;
-
-               error = xfs_iozero(ip, zero_off, zero_len);
-               if (error)
-                       return error;
-
-               *did_zeroing = true;
-               start_zero_fsb = imap.br_startoff + imap.br_blockcount;
-               ASSERT(start_zero_fsb <= (end_zero_fsb + 1));
-       }
-
-       return 0;
+       return xfs_zero_range(ip, isize, offset - isize, did_zeroing);
 }
 
 /*
@@ -722,8 +611,7 @@ xfs_file_dio_aio_write(
                                        mp->m_rtdev_targp : mp->m_ddev_targp;
 
        /* DIO must be aligned to device logical sector size */
-       if (!IS_DAX(inode) &&
-           ((iocb->ki_pos | count) & target->bt_logical_sectormask))
+       if ((iocb->ki_pos | count) & target->bt_logical_sectormask)
                return -EINVAL;
 
        /* "unaligned" here means not aligned to a filesystem block */
@@ -762,7 +650,7 @@ xfs_file_dio_aio_write(
        end = iocb->ki_pos + count - 1;
 
        /*
-        * See xfs_file_read_iter() for why we do a full-file flush here.
+        * See xfs_file_dio_aio_read() for why we do a full-file flush here.
         */
        if (mapping->nrpages) {
                ret = filemap_write_and_wait(VFS_I(ip)->i_mapping);
@@ -789,10 +677,12 @@ xfs_file_dio_aio_write(
                iolock = XFS_IOLOCK_SHARED;
        }
 
-       trace_xfs_file_direct_write(ip, count, iocb->ki_pos, 0);
+       trace_xfs_file_direct_write(ip, count, iocb->ki_pos);
 
        data = *from;
-       ret = mapping->a_ops->direct_IO(iocb, &data);
+       ret = __blockdev_direct_IO(iocb, inode, target->bt_bdev, &data,
+                       xfs_get_blocks_direct, xfs_end_io_direct_write,
+                       NULL, DIO_ASYNC_EXTEND);
 
        /* see generic_file_direct_write() for why this is necessary */
        if (mapping->nrpages) {
@@ -809,10 +699,70 @@ out:
        xfs_rw_iunlock(ip, iolock);
 
        /*
-        * No fallback to buffered IO on errors for XFS. DAX can result in
-        * partial writes, but direct IO will either complete fully or fail.
+        * No fallback to buffered IO on errors for XFS; direct IO will either
+        * complete fully or fail.
         */
-       ASSERT(ret < 0 || ret == count || IS_DAX(VFS_I(ip)));
+       ASSERT(ret < 0 || ret == count);
+       return ret;
+}
+
+static noinline ssize_t
+xfs_file_dax_write(
+       struct kiocb            *iocb,
+       struct iov_iter         *from)
+{
+       struct address_space    *mapping = iocb->ki_filp->f_mapping;
+       struct inode            *inode = mapping->host;
+       struct xfs_inode        *ip = XFS_I(inode);
+       struct xfs_mount        *mp = ip->i_mount;
+       ssize_t                 ret = 0;
+       int                     unaligned_io = 0;
+       int                     iolock;
+       struct iov_iter         data;
+
+       /* "unaligned" here means not aligned to a filesystem block */
+       if ((iocb->ki_pos & mp->m_blockmask) ||
+           ((iocb->ki_pos + iov_iter_count(from)) & mp->m_blockmask)) {
+               unaligned_io = 1;
+               iolock = XFS_IOLOCK_EXCL;
+       } else if (mapping->nrpages) {
+               iolock = XFS_IOLOCK_EXCL;
+       } else {
+               iolock = XFS_IOLOCK_SHARED;
+       }
+       xfs_rw_ilock(ip, iolock);
+
+       ret = xfs_file_aio_write_checks(iocb, from, &iolock);
+       if (ret)
+               goto out;
+
+       /*
+        * Yes, even DAX files can have page cache attached to them:  A zeroed
+        * page is inserted into the pagecache when we have to serve a write
+        * fault on a hole.  It should never be dirtied and can simply be
+        * dropped from the pagecache once we get real data for the page.
+        */
+       if (mapping->nrpages) {
+               ret = invalidate_inode_pages2(mapping);
+               WARN_ON_ONCE(ret);
+       }
+
+       if (iolock == XFS_IOLOCK_EXCL && !unaligned_io) {
+               xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
+               iolock = XFS_IOLOCK_SHARED;
+       }
+
+       trace_xfs_file_dax_write(ip, iov_iter_count(from), iocb->ki_pos);
+
+       data = *from;
+       ret = dax_do_io(iocb, inode, &data, xfs_get_blocks_direct,
+                       xfs_end_io_direct_write, 0);
+       if (ret > 0) {
+               iocb->ki_pos += ret;
+               iov_iter_advance(from, ret);
+       }
+out:
+       xfs_rw_iunlock(ip, iolock);
        return ret;
 }
 
@@ -839,9 +789,8 @@ xfs_file_buffered_aio_write(
        current->backing_dev_info = inode_to_bdi(inode);
 
 write_retry:
-       trace_xfs_file_buffered_write(ip, iov_iter_count(from),
-                                     iocb->ki_pos, 0);
-       ret = generic_perform_write(file, from, iocb->ki_pos);
+       trace_xfs_file_buffered_write(ip, iov_iter_count(from), iocb->ki_pos);
+       ret = iomap_file_buffered_write(iocb, from, &xfs_iomap_ops);
        if (likely(ret >= 0))
                iocb->ki_pos += ret;
 
@@ -895,7 +844,9 @@ xfs_file_write_iter(
        if (XFS_FORCED_SHUTDOWN(ip->i_mount))
                return -EIO;
 
-       if ((iocb->ki_flags & IOCB_DIRECT) || IS_DAX(inode))
+       if (IS_DAX(inode))
+               ret = xfs_file_dax_write(iocb, from);
+       else if (iocb->ki_flags & IOCB_DIRECT)
                ret = xfs_file_dio_aio_write(iocb, from);
        else
                ret = xfs_file_buffered_aio_write(iocb, from);
@@ -1553,7 +1504,7 @@ xfs_filemap_page_mkwrite(
        if (IS_DAX(inode)) {
                ret = dax_mkwrite(vma, vmf, xfs_get_blocks_dax_fault);
        } else {
-               ret = block_page_mkwrite(vma, vmf, xfs_get_blocks);
+               ret = iomap_page_mkwrite(vma, vmf, &xfs_iomap_ops);
                ret = block_page_mkwrite_return(ret);
        }
 
index b4d7582..7191c38 100644 (file)
@@ -667,8 +667,11 @@ xfs_reserve_blocks(
        __uint64_t              *inval,
        xfs_fsop_resblks_t      *outval)
 {
-       __int64_t               lcounter, delta, fdblks_delta;
+       __int64_t               lcounter, delta;
+       __int64_t               fdblks_delta = 0;
        __uint64_t              request;
+       __int64_t               free;
+       int                     error = 0;
 
        /* If inval is null, report current values and return */
        if (inval == (__uint64_t *)NULL) {
@@ -682,24 +685,23 @@ xfs_reserve_blocks(
        request = *inval;
 
        /*
-        * With per-cpu counters, this becomes an interesting
-        * problem. we needto work out if we are freeing or allocation
-        * blocks first, then we can do the modification as necessary.
+        * With per-cpu counters, this becomes an interesting problem. We need
+        * to work out if we are freeing or allocating blocks first, then we can
+        * do the modification as necessary.
         *
-        * We do this under the m_sb_lock so that if we are near
-        * ENOSPC, we will hold out any changes while we work out
-        * what to do. This means that the amount of free space can
-        * change while we do this, so we need to retry if we end up
-        * trying to reserve more space than is available.
+        * We do this under the m_sb_lock so that if we are near ENOSPC, we will
+        * hold out any changes while we work out what to do. This means that
+        * the amount of free space can change while we do this, so we need to
+        * retry if we end up trying to reserve more space than is available.
         */
-retry:
        spin_lock(&mp->m_sb_lock);
 
        /*
         * If our previous reservation was larger than the current value,
-        * then move any unused blocks back to the free pool.
+        * then move any unused blocks back to the free pool. Modify the resblks
+        * counters directly since we shouldn't have any problems unreserving
+        * space.
         */
-       fdblks_delta = 0;
        if (mp->m_resblks > request) {
                lcounter = mp->m_resblks_avail - request;
                if (lcounter  > 0) {            /* release unused blocks */
@@ -707,54 +709,67 @@ retry:
                        mp->m_resblks_avail -= lcounter;
                }
                mp->m_resblks = request;
-       } else {
-               __int64_t       free;
+               if (fdblks_delta) {
+                       spin_unlock(&mp->m_sb_lock);
+                       error = xfs_mod_fdblocks(mp, fdblks_delta, 0);
+                       spin_lock(&mp->m_sb_lock);
+               }
+
+               goto out;
+       }
 
+       /*
+        * If the request is larger than the current reservation, reserve the
+        * blocks before we update the reserve counters. Sample m_fdblocks and
+        * perform a partial reservation if the request exceeds free space.
+        */
+       error = -ENOSPC;
+       do {
                free = percpu_counter_sum(&mp->m_fdblocks) -
                                                        XFS_ALLOC_SET_ASIDE(mp);
                if (!free)
-                       goto out; /* ENOSPC and fdblks_delta = 0 */
+                       break;
 
                delta = request - mp->m_resblks;
                lcounter = free - delta;
-               if (lcounter < 0) {
+               if (lcounter < 0)
                        /* We can't satisfy the request, just get what we can */
-                       mp->m_resblks += free;
-                       mp->m_resblks_avail += free;
-                       fdblks_delta = -free;
-               } else {
-                       fdblks_delta = -delta;
-                       mp->m_resblks = request;
-                       mp->m_resblks_avail += delta;
-               }
-       }
-out:
-       if (outval) {
-               outval->resblks = mp->m_resblks;
-               outval->resblks_avail = mp->m_resblks_avail;
-       }
-       spin_unlock(&mp->m_sb_lock);
+                       fdblks_delta = free;
+               else
+                       fdblks_delta = delta;
 
-       if (fdblks_delta) {
                /*
-                * If we are putting blocks back here, m_resblks_avail is
-                * already at its max so this will put it in the free pool.
-                *
-                * If we need space, we'll either succeed in getting it
-                * from the free block count or we'll get an enospc. If
-                * we get a ENOSPC, it means things changed while we were
-                * calculating fdblks_delta and so we should try again to
-                * see if there is anything left to reserve.
+                * We'll either succeed in getting space from the free block
+                * count or we'll get an ENOSPC. If we get a ENOSPC, it means
+                * things changed while we were calculating fdblks_delta and so
+                * we should try again to see if there is anything left to
+                * reserve.
                 *
                 * Don't set the reserved flag here - we don't want to reserve
                 * the extra reserve blocks from the reserve.....
                 */
-               int error;
-               error = xfs_mod_fdblocks(mp, fdblks_delta, 0);
-               if (error == -ENOSPC)
-                       goto retry;
+               spin_unlock(&mp->m_sb_lock);
+               error = xfs_mod_fdblocks(mp, -fdblks_delta, 0);
+               spin_lock(&mp->m_sb_lock);
+       } while (error == -ENOSPC);
+
+       /*
+        * Update the reserve counters if blocks have been successfully
+        * allocated.
+        */
+       if (!error && fdblks_delta) {
+               mp->m_resblks += fdblks_delta;
+               mp->m_resblks_avail += fdblks_delta;
        }
-       return 0;
+
+out:
+       if (outval) {
+               outval->resblks = mp->m_resblks;
+               outval->resblks_avail = mp->m_resblks_avail;
+       }
+
+       spin_unlock(&mp->m_sb_lock);
+       return error;
 }
 
 int
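The reworked xfs_reserve_blocks() above samples the free-space counter, drops m_sb_lock, tries to take the blocks with xfs_mod_fdblocks(), and loops if that races to ENOSPC, instead of jumping back to a retry label. The sketch below models that sample-modify-retry shape single-threaded with plain longs; it is only an illustration of the control flow, not of the locking, and the names are hypothetical.

#include <stdbool.h>
#include <stdio.h>

static long fdblocks = 1000;    /* free block count (a percpu counter in XFS) */
static long resblks;            /* reserve pool size */
static long resblks_avail;      /* reserved blocks still available */

/* Hypothetical stand-in for xfs_mod_fdblocks(): fail if it would go negative. */
static bool mod_fdblocks(long delta)
{
        if (fdblocks + delta < 0)
                return false;   /* -ENOSPC in the kernel */
        fdblocks += delta;
        return true;
}

/*
 * Grow the reserve pool towards 'request', taking only what is free. The
 * free-space sample and the modification are separate steps, so retry the
 * whole thing when the modification fails, as the patched function does.
 */
static void reserve_blocks(long request)
{
        while (resblks < request) {
                long free = fdblocks;
                long delta = request - resblks;

                if (free <= 0)
                        break;                  /* nothing left to take */
                if (delta > free)
                        delta = free;           /* partial reservation */
                if (!mod_fdblocks(-delta))
                        continue;               /* raced; resample and retry */

                resblks += delta;
                resblks_avail += delta;
                break;
        }
        printf("resblks=%ld avail=%ld free=%ld\n", resblks, resblks_avail, fdblocks);
}

int main(void)
{
        reserve_blocks(200);    /* fully satisfied */
        reserve_blocks(2000);   /* partially satisfied from what is left */
        return 0;
}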
index 99ee6ee..fb39a66 100644 (file)
@@ -765,7 +765,7 @@ restart:
  * Background scanning to trim post-EOF preallocated space. This is queued
  * based on the 'speculative_prealloc_lifetime' tunable (5m by default).
  */
-STATIC void
+void
 xfs_queue_eofblocks(
        struct xfs_mount *mp)
 {
index 62f1f91..05bac99 100644 (file)
@@ -68,6 +68,7 @@ void xfs_inode_clear_eofblocks_tag(struct xfs_inode *ip);
 int xfs_icache_free_eofblocks(struct xfs_mount *, struct xfs_eofblocks *);
 int xfs_inode_free_quota_eofblocks(struct xfs_inode *ip);
 void xfs_eofblocks_worker(struct work_struct *);
+void xfs_queue_eofblocks(struct xfs_mount *);
 
 int xfs_inode_ag_iterator(struct xfs_mount *mp,
        int (*execute)(struct xfs_inode *ip, int flags, void *args),
index ee6799e..8825bcf 100644 (file)
@@ -431,7 +431,7 @@ xfs_lock_inumorder(int lock_mode, int subclass)
  * lock more than one at a time, lockdep will report false positives saying we
  * have violated locking orders.
  */
-void
+static void
 xfs_lock_inodes(
        xfs_inode_t     **ips,
        int             inodes,
@@ -667,14 +667,6 @@ xfs_ip2xflags(
        return _xfs_dic2xflags(dic->di_flags, dic->di_flags2, XFS_IFORK_Q(ip));
 }
 
-uint
-xfs_dic2xflags(
-       struct xfs_dinode       *dip)
-{
-       return _xfs_dic2xflags(be16_to_cpu(dip->di_flags),
-                               be64_to_cpu(dip->di_flags2), XFS_DFORK_Q(dip));
-}
-
 /*
  * Lookups up an inode from "name". If ci_name is not NULL, then a CI match
  * is allowed, otherwise it has to be an exact match. If a CI match is found,
@@ -748,7 +740,7 @@ out_unlock:
  * are not linked into the directory structure - they are attached
  * directly to the superblock - and so have no parent.
  */
-int
+static int
 xfs_ialloc(
        xfs_trans_t     *tp,
        xfs_inode_t     *pip,
@@ -1085,7 +1077,7 @@ xfs_dir_ialloc(
  * link count to go to zero, move the inode to AGI unlinked list so that it can
  * be freed when the last active reference goes away via xfs_inactive().
  */
-int                            /* error */
+static int                     /* error */
 xfs_droplink(
        xfs_trans_t *tp,
        xfs_inode_t *ip)
@@ -1104,7 +1096,7 @@ xfs_droplink(
 /*
  * Increment the link count on an inode & log the change.
  */
-int
+static int
 xfs_bumplink(
        xfs_trans_t *tp,
        xfs_inode_t *ip)
index e52d7c7..8eb78ec 100644 (file)
@@ -395,12 +395,8 @@ void               xfs_ilock_demote(xfs_inode_t *, uint);
 int            xfs_isilocked(xfs_inode_t *, uint);
 uint           xfs_ilock_data_map_shared(struct xfs_inode *);
 uint           xfs_ilock_attr_map_shared(struct xfs_inode *);
-int            xfs_ialloc(struct xfs_trans *, xfs_inode_t *, umode_t,
-                          xfs_nlink_t, xfs_dev_t, prid_t, int,
-                          struct xfs_buf **, xfs_inode_t **);
 
 uint           xfs_ip2xflags(struct xfs_inode *);
-uint           xfs_dic2xflags(struct xfs_dinode *);
 int            xfs_ifree(struct xfs_trans *, xfs_inode_t *,
                           struct xfs_bmap_free *);
 int            xfs_itruncate_extents(struct xfs_trans **, struct xfs_inode *,
@@ -411,7 +407,6 @@ void                xfs_iunpin_wait(xfs_inode_t *);
 #define xfs_ipincount(ip)      ((unsigned int) atomic_read(&ip->i_pincount))
 
 int            xfs_iflush(struct xfs_inode *, struct xfs_buf **);
-void           xfs_lock_inodes(xfs_inode_t **, int, uint);
 void           xfs_lock_two_inodes(xfs_inode_t *, xfs_inode_t *, uint);
 
 xfs_extlen_t   xfs_get_extsz_hint(struct xfs_inode *ip);
@@ -419,8 +414,6 @@ xfs_extlen_t        xfs_get_extsz_hint(struct xfs_inode *ip);
 int            xfs_dir_ialloc(struct xfs_trans **, struct xfs_inode *, umode_t,
                               xfs_nlink_t, xfs_dev_t, prid_t, int,
                               struct xfs_inode **, int *);
-int            xfs_droplink(struct xfs_trans *, struct xfs_inode *);
-int            xfs_bumplink(struct xfs_trans *, struct xfs_inode *);
 
 /* from xfs_file.c */
 enum xfs_prealloc_flags {
@@ -434,7 +427,8 @@ int xfs_update_prealloc_flags(struct xfs_inode *ip,
                                  enum xfs_prealloc_flags flags);
 int    xfs_zero_eof(struct xfs_inode *ip, xfs_off_t offset,
                     xfs_fsize_t isize, bool *did_zeroing);
-int    xfs_iozero(struct xfs_inode *ip, loff_t pos, size_t count);
+int    xfs_zero_range(struct xfs_inode *ip, xfs_off_t pos, xfs_off_t count,
+               bool *did_zero);
 loff_t __xfs_seek_hole_data(struct inode *inode, loff_t start,
                             loff_t eof, int whence);
 
@@ -479,14 +473,4 @@ do { \
 
 extern struct kmem_zone        *xfs_inode_zone;
 
-/*
- * Flags for read/write calls
- */
-#define XFS_IO_ISDIRECT        0x00001         /* bypass page cache */
-#define XFS_IO_INVIS   0x00002         /* don't update inode timestamps */
-
-#define XFS_IO_FLAGS \
-       { XFS_IO_ISDIRECT,      "DIRECT" }, \
-       { XFS_IO_INVIS,         "INVIS"}
-
 #endif /* __XFS_INODE_H__ */
index a1b0761..892c2ac 100644 (file)
@@ -651,6 +651,7 @@ void
 xfs_inode_item_destroy(
        xfs_inode_t     *ip)
 {
+       kmem_free(ip->i_itemp->ili_item.li_lv_shadow);
        kmem_zone_free(xfs_ili_zone, ip->i_itemp);
 }
 
index 63a6ff2..9a7c878 100644 (file)
@@ -595,13 +595,12 @@ xfs_attrmulti_by_handle(
 
 int
 xfs_ioc_space(
-       struct xfs_inode        *ip,
-       struct inode            *inode,
        struct file             *filp,
-       int                     ioflags,
        unsigned int            cmd,
        xfs_flock64_t           *bf)
 {
+       struct inode            *inode = file_inode(filp);
+       struct xfs_inode        *ip = XFS_I(inode);
        struct iattr            iattr;
        enum xfs_prealloc_flags flags = 0;
        uint                    iolock = XFS_IOLOCK_EXCL;
@@ -626,7 +625,7 @@ xfs_ioc_space(
 
        if (filp->f_flags & O_DSYNC)
                flags |= XFS_PREALLOC_SYNC;
-       if (ioflags & XFS_IO_INVIS)
+       if (filp->f_mode & FMODE_NOCMTIME)
                flags |= XFS_PREALLOC_INVISIBLE;
 
        error = mnt_want_write_file(filp);
@@ -1464,8 +1463,7 @@ xfs_getbmap_format(void **ap, struct getbmapx *bmv, int *full)
 
 STATIC int
 xfs_ioc_getbmap(
-       struct xfs_inode        *ip,
-       int                     ioflags,
+       struct file             *file,
        unsigned int            cmd,
        void                    __user *arg)
 {
@@ -1479,10 +1477,10 @@ xfs_ioc_getbmap(
                return -EINVAL;
 
        bmx.bmv_iflags = (cmd == XFS_IOC_GETBMAPA ? BMV_IF_ATTRFORK : 0);
-       if (ioflags & XFS_IO_INVIS)
+       if (file->f_mode & FMODE_NOCMTIME)
                bmx.bmv_iflags |= BMV_IF_NO_DMAPI_READ;
 
-       error = xfs_getbmap(ip, &bmx, xfs_getbmap_format,
+       error = xfs_getbmap(XFS_I(file_inode(file)), &bmx, xfs_getbmap_format,
                            (__force struct getbmap *)arg+1);
        if (error)
                return error;
@@ -1575,6 +1573,11 @@ xfs_ioc_swapext(
                goto out_put_tmp_file;
        }
 
+       /*
+        * We need to ensure that the fds passed in point to XFS inodes
+        * before we cast and access them as XFS structures, as we have no
+        * control over what the user passes us here.
+        */
        if (f.file->f_op != &xfs_file_operations ||
            tmp.file->f_op != &xfs_file_operations) {
                error = -EINVAL;
@@ -1625,12 +1628,8 @@ xfs_file_ioctl(
        struct xfs_inode        *ip = XFS_I(inode);
        struct xfs_mount        *mp = ip->i_mount;
        void                    __user *arg = (void __user *)p;
-       int                     ioflags = 0;
        int                     error;
 
-       if (filp->f_mode & FMODE_NOCMTIME)
-               ioflags |= XFS_IO_INVIS;
-
        trace_xfs_file_ioctl(ip);
 
        switch (cmd) {
@@ -1649,7 +1648,7 @@ xfs_file_ioctl(
 
                if (copy_from_user(&bf, arg, sizeof(bf)))
                        return -EFAULT;
-               return xfs_ioc_space(ip, inode, filp, ioflags, cmd, &bf);
+               return xfs_ioc_space(filp, cmd, &bf);
        }
        case XFS_IOC_DIOINFO: {
                struct dioattr  da;
@@ -1708,7 +1707,7 @@ xfs_file_ioctl(
 
        case XFS_IOC_GETBMAP:
        case XFS_IOC_GETBMAPA:
-               return xfs_ioc_getbmap(ip, ioflags, cmd, arg);
+               return xfs_ioc_getbmap(filp, cmd, arg);
 
        case XFS_IOC_GETBMAPX:
                return xfs_ioc_getbmapx(ip, arg);
index 77c02c7..8b52881 100644 (file)
 
 extern int
 xfs_ioc_space(
-       struct xfs_inode        *ip,
-       struct inode            *inode,
        struct file             *filp,
-       int                     ioflags,
        unsigned int            cmd,
        xfs_flock64_t           *bf);
 
index 1a05d8a..321f577 100644 (file)
@@ -532,12 +532,8 @@ xfs_file_compat_ioctl(
        struct xfs_inode        *ip = XFS_I(inode);
        struct xfs_mount        *mp = ip->i_mount;
        void                    __user *arg = (void __user *)p;
-       int                     ioflags = 0;
        int                     error;
 
-       if (filp->f_mode & FMODE_NOCMTIME)
-               ioflags |= XFS_IO_INVIS;
-
        trace_xfs_file_compat_ioctl(ip);
 
        switch (cmd) {
@@ -589,7 +585,7 @@ xfs_file_compat_ioctl(
                if (xfs_compat_flock64_copyin(&bf, arg))
                        return -EFAULT;
                cmd = _NATIVE_IOC(cmd, struct xfs_flock64);
-               return xfs_ioc_space(ip, inode, filp, ioflags, cmd, &bf);
+               return xfs_ioc_space(filp, cmd, &bf);
        }
        case XFS_IOC_FSGEOMETRY_V1_32:
                return xfs_compat_ioc_fsgeometry_v1(mp, arg);
index 5839135..620fc91 100644 (file)
@@ -15,6 +15,7 @@
  * along with this program; if not, write the Free Software Foundation,
  * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  */
+#include <linux/iomap.h>
 #include "xfs.h"
 #include "xfs_fs.h"
 #include "xfs_shared.h"
@@ -940,3 +941,173 @@ error_on_bmapi_transaction:
        xfs_iunlock(ip, XFS_ILOCK_EXCL);
        return error;
 }
+
+void
+xfs_bmbt_to_iomap(
+       struct xfs_inode        *ip,
+       struct iomap            *iomap,
+       struct xfs_bmbt_irec    *imap)
+{
+       struct xfs_mount        *mp = ip->i_mount;
+
+       if (imap->br_startblock == HOLESTARTBLOCK) {
+               iomap->blkno = IOMAP_NULL_BLOCK;
+               iomap->type = IOMAP_HOLE;
+       } else if (imap->br_startblock == DELAYSTARTBLOCK) {
+               iomap->blkno = IOMAP_NULL_BLOCK;
+               iomap->type = IOMAP_DELALLOC;
+       } else {
+               iomap->blkno = xfs_fsb_to_db(ip, imap->br_startblock);
+               if (imap->br_state == XFS_EXT_UNWRITTEN)
+                       iomap->type = IOMAP_UNWRITTEN;
+               else
+                       iomap->type = IOMAP_MAPPED;
+       }
+       iomap->offset = XFS_FSB_TO_B(mp, imap->br_startoff);
+       iomap->length = XFS_FSB_TO_B(mp, imap->br_blockcount);
+       iomap->bdev = xfs_find_bdev_for_inode(VFS_I(ip));
+}
+
+static inline bool imap_needs_alloc(struct xfs_bmbt_irec *imap, int nimaps)
+{
+       return !nimaps ||
+               imap->br_startblock == HOLESTARTBLOCK ||
+               imap->br_startblock == DELAYSTARTBLOCK;
+}
+
+static int
+xfs_file_iomap_begin(
+       struct inode            *inode,
+       loff_t                  offset,
+       loff_t                  length,
+       unsigned                flags,
+       struct iomap            *iomap)
+{
+       struct xfs_inode        *ip = XFS_I(inode);
+       struct xfs_mount        *mp = ip->i_mount;
+       struct xfs_bmbt_irec    imap;
+       xfs_fileoff_t           offset_fsb, end_fsb;
+       int                     nimaps = 1, error = 0;
+
+       if (XFS_FORCED_SHUTDOWN(mp))
+               return -EIO;
+
+       xfs_ilock(ip, XFS_ILOCK_EXCL);
+
+       ASSERT(offset <= mp->m_super->s_maxbytes);
+       if ((xfs_fsize_t)offset + length > mp->m_super->s_maxbytes)
+               length = mp->m_super->s_maxbytes - offset;
+       offset_fsb = XFS_B_TO_FSBT(mp, offset);
+       end_fsb = XFS_B_TO_FSB(mp, offset + length);
+
+       error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap,
+                              &nimaps, XFS_BMAPI_ENTIRE);
+       if (error) {
+               xfs_iunlock(ip, XFS_ILOCK_EXCL);
+               return error;
+       }
+
+       if ((flags & IOMAP_WRITE) && imap_needs_alloc(&imap, nimaps)) {
+               /*
+                * We cap the maximum length we map here to MAX_WRITEBACK_PAGES
+                * pages to keep the chunks of work done where somewhat symmetric
+                * pages to keep the chunks of work done here somewhat symmetric
+                * number pulled out of thin air as a best guess for initial
+                * testing.
+                *
+                * Note that the value needs to be less than 32 bits wide until
+                * the lower level functions are updated.
+                */
+               length = min_t(loff_t, length, 1024 * PAGE_SIZE);
+               if (xfs_get_extsz_hint(ip)) {
+                       /*
+                        * xfs_iomap_write_direct() expects the shared lock. It
+                        * is unlocked on return.
+                        */
+                       xfs_ilock_demote(ip, XFS_ILOCK_EXCL);
+                       error = xfs_iomap_write_direct(ip, offset, length, &imap,
+                                       nimaps);
+               } else {
+                       error = xfs_iomap_write_delay(ip, offset, length, &imap);
+                       xfs_iunlock(ip, XFS_ILOCK_EXCL);
+               }
+
+               if (error)
+                       return error;
+
+               trace_xfs_iomap_alloc(ip, offset, length, 0, &imap);
+               xfs_bmbt_to_iomap(ip, iomap, &imap);
+       } else if (nimaps) {
+               xfs_iunlock(ip, XFS_ILOCK_EXCL);
+               trace_xfs_iomap_found(ip, offset, length, 0, &imap);
+               xfs_bmbt_to_iomap(ip, iomap, &imap);
+       } else {
+               xfs_iunlock(ip, XFS_ILOCK_EXCL);
+               trace_xfs_iomap_not_found(ip, offset, length, 0, &imap);
+               iomap->blkno = IOMAP_NULL_BLOCK;
+               iomap->type = IOMAP_HOLE;
+               iomap->offset = offset;
+               iomap->length = length;
+       }
+
+       return 0;
+}
+
+static int
+xfs_file_iomap_end_delalloc(
+       struct xfs_inode        *ip,
+       loff_t                  offset,
+       loff_t                  length,
+       ssize_t                 written)
+{
+       struct xfs_mount        *mp = ip->i_mount;
+       xfs_fileoff_t           start_fsb;
+       xfs_fileoff_t           end_fsb;
+       int                     error = 0;
+
+       start_fsb = XFS_B_TO_FSB(mp, offset + written);
+       end_fsb = XFS_B_TO_FSB(mp, offset + length);
+
+       /*
+        * Trim back delalloc blocks if we didn't manage to write the whole
+        * range reserved.
+        *
+        * We don't need to care about racing delalloc as we hold i_mutex
+        * across the reserve/allocate/unreserve calls. If there are delalloc
+        * blocks in the range, they are ours.
+        */
+       if (start_fsb < end_fsb) {
+               xfs_ilock(ip, XFS_ILOCK_EXCL);
+               error = xfs_bmap_punch_delalloc_range(ip, start_fsb,
+                                              end_fsb - start_fsb);
+               xfs_iunlock(ip, XFS_ILOCK_EXCL);
+
+               if (error && !XFS_FORCED_SHUTDOWN(mp)) {
+                       xfs_alert(mp, "%s: unable to clean up ino %lld",
+                               __func__, ip->i_ino);
+                       return error;
+               }
+       }
+
+       return 0;
+}
+
+static int
+xfs_file_iomap_end(
+       struct inode            *inode,
+       loff_t                  offset,
+       loff_t                  length,
+       ssize_t                 written,
+       unsigned                flags,
+       struct iomap            *iomap)
+{
+       if ((flags & IOMAP_WRITE) && iomap->type == IOMAP_DELALLOC)
+               return xfs_file_iomap_end_delalloc(XFS_I(inode), offset,
+                               length, written);
+       return 0;
+}
+
+struct iomap_ops xfs_iomap_ops = {
+       .iomap_begin            = xfs_file_iomap_begin,
+       .iomap_end              = xfs_file_iomap_end,
+};
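xfs_iomap_ops wires XFS into the new generic iomap infrastructure: a caller such as iomap_zero_range() or iomap_fiemap() repeatedly asks ->iomap_begin() for one mapping, does its work against that mapping, then calls ->iomap_end() so the filesystem can clean up (for XFS, punching out unused delalloc blocks as above). The user-space sketch below models that begin/actor/end loop with a toy filesystem that maps fixed 4096-byte blocks; it is a simplified picture of the contract, not the fs/iomap.c code, and every toy_* name is invented.

#include <stdint.h>
#include <stdio.h>

struct iomap {
        uint64_t        offset;         /* file offset the mapping starts at */
        uint64_t        length;         /* bytes covered by this mapping */
};

struct iomap_ops {
        int (*iomap_begin)(uint64_t pos, uint64_t len, struct iomap *iomap);
        int (*iomap_end)(uint64_t pos, uint64_t len, uint64_t written,
                         struct iomap *iomap);
};

/* Toy filesystem: pretend every mapping is one 4096-byte block. */
static int toy_iomap_begin(uint64_t pos, uint64_t len, struct iomap *iomap)
{
        (void)len;
        iomap->offset = pos & ~4095ULL;
        iomap->length = 4096;
        return 0;
}

static int toy_iomap_end(uint64_t pos, uint64_t len, uint64_t written,
                         struct iomap *iomap)
{
        (void)iomap;
        printf("end:   pos=%llu len=%llu written=%llu\n",
               (unsigned long long)pos, (unsigned long long)len,
               (unsigned long long)written);
        return 0;
}

static const struct iomap_ops toy_ops = {
        .iomap_begin    = toy_iomap_begin,
        .iomap_end      = toy_iomap_end,
};

/* The generic loop: one begin/actor/end round trip per mapping. */
static int iomap_apply(uint64_t pos, uint64_t length, const struct iomap_ops *ops)
{
        while (length > 0) {
                struct iomap iomap;
                int ret;

                ret = ops->iomap_begin(pos, length, &iomap);
                if (ret)
                        return ret;

                /* clamp to what the mapping actually covers from pos onwards */
                uint64_t avail = iomap.offset + iomap.length - pos;
                uint64_t len = avail < length ? avail : length;

                /* "actor": a real caller zeroes, writes or reports extents here */
                printf("actor: pos=%llu len=%llu\n",
                       (unsigned long long)pos, (unsigned long long)len);

                ret = ops->iomap_end(pos, len, len, &iomap);
                if (ret)
                        return ret;

                pos += len;
                length -= len;
        }
        return 0;
}

int main(void)
{
        return iomap_apply(1000, 10000, &toy_ops);
}

The point of the split is that the filesystem only has to translate ranges into mappings, while the page-cache, zeroing and fiemap logic lives once in generic code and can be shared by other filesystems.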
index 8688e66..e066d04 100644 (file)
@@ -18,6 +18,8 @@
 #ifndef __XFS_IOMAP_H__
 #define __XFS_IOMAP_H__
 
+#include <linux/iomap.h>
+
 struct xfs_inode;
 struct xfs_bmbt_irec;
 
@@ -29,4 +31,9 @@ int xfs_iomap_write_allocate(struct xfs_inode *, xfs_off_t,
                        struct xfs_bmbt_irec *);
 int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, xfs_off_t);
 
+void xfs_bmbt_to_iomap(struct xfs_inode *, struct iomap *,
+               struct xfs_bmbt_irec *);
+
+extern struct iomap_ops xfs_iomap_ops;
+
 #endif /* __XFS_IOMAP_H__*/
index c5d4eba..ab820f8 100644 (file)
 #include "xfs_dir2.h"
 #include "xfs_trans_space.h"
 #include "xfs_pnfs.h"
+#include "xfs_iomap.h"
 
 #include <linux/capability.h>
 #include <linux/xattr.h>
 #include <linux/posix_acl.h>
 #include <linux/security.h>
-#include <linux/fiemap.h>
+#include <linux/iomap.h>
 #include <linux/slab.h>
 
 /*
@@ -800,21 +801,31 @@ xfs_setattr_size(
        if (error)
                return error;
 
+       /*
+        * Wait for all direct I/O to complete.
+        */
+       inode_dio_wait(inode);
+
        /*
         * File data changes must be complete before we start the transaction to
         * modify the inode.  This needs to be done before joining the inode to
         * the transaction because the inode cannot be unlocked once it is a
         * part of the transaction.
         *
-        * Start with zeroing any data block beyond EOF that we may expose on
-        * file extension.
+        * Start with zeroing any data beyond EOF that we may expose on file
+        * extension, or zeroing out the rest of the block on a downward
+        * truncate.
         */
        if (newsize > oldsize) {
                error = xfs_zero_eof(ip, newsize, oldsize, &did_zeroing);
-               if (error)
-                       return error;
+       } else {
+               error = iomap_truncate_page(inode, newsize, &did_zeroing,
+                               &xfs_iomap_ops);
        }
 
+       if (error)
+               return error;
+
        /*
         * We are going to log the inode size change in this transaction so
         * any previous writes that are beyond the on disk EOF and the new
@@ -823,17 +834,14 @@ xfs_setattr_size(
         * problem. Note that this includes any block zeroing we did above;
         * otherwise those blocks may not be zeroed after a crash.
         */
-       if (newsize > ip->i_d.di_size &&
-           (oldsize != ip->i_d.di_size || did_zeroing)) {
+       if (did_zeroing ||
+           (newsize > ip->i_d.di_size && oldsize != ip->i_d.di_size)) {
                error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
                                                      ip->i_d.di_size, newsize);
                if (error)
                        return error;
        }
 
-       /* Now wait for all direct I/O to complete. */
-       inode_dio_wait(inode);
-
        /*
         * We've already locked out new page faults, so now we can safely remove
         * pages from the page cache knowing they won't get refaulted until we
@@ -851,13 +859,6 @@ xfs_setattr_size(
         * to hope that the caller sees ENOMEM and retries the truncate
         * operation.
         */
-       if (IS_DAX(inode))
-               error = dax_truncate_page(inode, newsize, xfs_get_blocks_direct);
-       else
-               error = block_truncate_page(inode->i_mapping, newsize,
-                                           xfs_get_blocks);
-       if (error)
-               return error;
        truncate_setsize(inode, newsize);
 
        error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp);
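xfs_setattr_size() now zeroes in two directions: growing the file zeroes from the old EOF up to the new size via xfs_zero_eof(), while shrinking it uses iomap_truncate_page() to zero the tail of the block that will contain the new EOF, so a later extension cannot expose stale data. A small stand-alone sketch of which byte range each case covers, assuming a fixed 4096-byte block size; it only illustrates the arithmetic, not the kernel helpers.

#include <stdint.h>
#include <stdio.h>

#define BLOCK_SIZE 4096ULL

/*
 * On a size change, work out which byte range must be zeroed so no stale
 * data is exposed: growing zeroes from the old EOF to the new one, while
 * shrinking zeroes the remainder of the block that now contains EOF.
 */
static void zero_range_for_truncate(uint64_t oldsize, uint64_t newsize,
                                    uint64_t *start, uint64_t *len)
{
        if (newsize > oldsize) {
                *start = oldsize;
                *len = newsize - oldsize;
        } else {
                *start = newsize;
                *len = (BLOCK_SIZE - (newsize % BLOCK_SIZE)) % BLOCK_SIZE;
        }
}

int main(void)
{
        uint64_t start, len;

        zero_range_for_truncate(5000, 20000, &start, &len);
        printf("grow:   zero %llu bytes at %llu\n",
               (unsigned long long)len, (unsigned long long)start);

        zero_range_for_truncate(20000, 5000, &start, &len);
        printf("shrink: zero %llu bytes at %llu\n",
               (unsigned long long)len, (unsigned long long)start);
        return 0;
}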
@@ -998,51 +999,6 @@ xfs_vn_update_time(
        return xfs_trans_commit(tp);
 }
 
-#define XFS_FIEMAP_FLAGS       (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR)
-
-/*
- * Call fiemap helper to fill in user data.
- * Returns positive errors to xfs_getbmap.
- */
-STATIC int
-xfs_fiemap_format(
-       void                    **arg,
-       struct getbmapx         *bmv,
-       int                     *full)
-{
-       int                     error;
-       struct fiemap_extent_info *fieinfo = *arg;
-       u32                     fiemap_flags = 0;
-       u64                     logical, physical, length;
-
-       /* Do nothing for a hole */
-       if (bmv->bmv_block == -1LL)
-               return 0;
-
-       logical = BBTOB(bmv->bmv_offset);
-       physical = BBTOB(bmv->bmv_block);
-       length = BBTOB(bmv->bmv_length);
-
-       if (bmv->bmv_oflags & BMV_OF_PREALLOC)
-               fiemap_flags |= FIEMAP_EXTENT_UNWRITTEN;
-       else if (bmv->bmv_oflags & BMV_OF_DELALLOC) {
-               fiemap_flags |= (FIEMAP_EXTENT_DELALLOC |
-                                FIEMAP_EXTENT_UNKNOWN);
-               physical = 0;   /* no block yet */
-       }
-       if (bmv->bmv_oflags & BMV_OF_LAST)
-               fiemap_flags |= FIEMAP_EXTENT_LAST;
-
-       error = fiemap_fill_next_extent(fieinfo, logical, physical,
-                                       length, fiemap_flags);
-       if (error > 0) {
-               error = 0;
-               *full = 1;      /* user array now full */
-       }
-
-       return error;
-}
-
 STATIC int
 xfs_vn_fiemap(
        struct inode            *inode,
@@ -1050,38 +1006,13 @@ xfs_vn_fiemap(
        u64                     start,
        u64                     length)
 {
-       xfs_inode_t             *ip = XFS_I(inode);
-       struct getbmapx         bm;
        int                     error;
 
-       error = fiemap_check_flags(fieinfo, XFS_FIEMAP_FLAGS);
-       if (error)
-               return error;
-
-       /* Set up bmap header for xfs internal routine */
-       bm.bmv_offset = BTOBBT(start);
-       /* Special case for whole file */
-       if (length == FIEMAP_MAX_OFFSET)
-               bm.bmv_length = -1LL;
-       else
-               bm.bmv_length = BTOBB(start + length) - bm.bmv_offset;
-
-       /* We add one because in getbmap world count includes the header */
-       bm.bmv_count = !fieinfo->fi_extents_max ? MAXEXTNUM :
-                                       fieinfo->fi_extents_max + 1;
-       bm.bmv_count = min_t(__s32, bm.bmv_count,
-                            (PAGE_SIZE * 16 / sizeof(struct getbmapx)));
-       bm.bmv_iflags = BMV_IF_PREALLOC | BMV_IF_NO_HOLES;
-       if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR)
-               bm.bmv_iflags |= BMV_IF_ATTRFORK;
-       if (!(fieinfo->fi_flags & FIEMAP_FLAG_SYNC))
-               bm.bmv_iflags |= BMV_IF_DELALLOC;
-
-       error = xfs_getbmap(ip, &bm, xfs_fiemap_format, fieinfo);
-       if (error)
-               return error;
+       xfs_ilock(XFS_I(inode), XFS_IOLOCK_SHARED);
+       error = iomap_fiemap(inode, fieinfo, start, length, &xfs_iomap_ops);
+       xfs_iunlock(XFS_I(inode), XFS_IOLOCK_SHARED);
 
-       return 0;
+       return error;
 }
 
 STATIC int
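
For comparison, a filesystem adopting the new helper only needs its own iomap_ops and whatever lock protects its extent map; the ->fiemap method collapses to a thin wrapper around iomap_fiemap(). A sketch using hypothetical example_ names (the locking helpers and ops instance are assumptions, not any real filesystem's API):

	static int
	example_fiemap(
		struct inode		*inode,
		struct fiemap_extent_info *fieinfo,
		u64			start,
		u64			length)
	{
		int			error;

		example_lock_shared(inode);	/* assumed extent-map locking helper */
		error = iomap_fiemap(inode, fieinfo, start, length,
				&example_iomap_ops);
		example_unlock_shared(inode);

		return error;
	}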
index a8192dc..b8d64d5 100644 (file)
@@ -328,13 +328,6 @@ static inline __uint64_t howmany_64(__uint64_t x, __uint32_t y)
        return x;
 }
 
-/* ARM old ABI has some weird alignment/padding */
-#if defined(__arm__) && !defined(__ARM_EABI__)
-#define __arch_pack __attribute__((packed))
-#else
-#define __arch_pack
-#endif
-
 #define ASSERT_ALWAYS(expr)    \
        (unlikely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__))
 
index bde02f1..3b74fa0 100644 (file)
@@ -788,7 +788,7 @@ xfs_log_mount_cancel(
  * As far as I know, there weren't any dependencies on the old behaviour.
  */
 
-int
+static int
 xfs_log_unmount_write(xfs_mount_t *mp)
 {
        struct xlog      *log = mp->m_log;
@@ -1036,7 +1036,7 @@ xfs_log_space_wake(
  * there's no point in running a dummy transaction at this point because we
  * can't start trying to idle the log until both the CIL and AIL are empty.
  */
-int
+static int
 xfs_log_need_covered(xfs_mount_t *mp)
 {
        struct xlog     *log = mp->m_log;
@@ -1177,7 +1177,7 @@ xlog_space_left(
  * The log manager needs its own routine, in order to control what
  * happens with the buffer after the write completes.
  */
-void
+static void
 xlog_iodone(xfs_buf_t *bp)
 {
        struct xlog_in_core     *iclog = bp->b_fspriv;
@@ -1302,7 +1302,7 @@ xfs_log_work_queue(
  * disk. If there is nothing dirty, then we might need to cover the log to
  * indicate that the filesystem is idle.
  */
-void
+static void
 xfs_log_worker(
        struct work_struct      *work)
 {
@@ -1415,7 +1415,7 @@ xlog_alloc_log(
         */
        error = -ENOMEM;
        bp = xfs_buf_alloc(mp->m_logdev_targp, XFS_BUF_DADDR_NULL,
-                          BTOBB(log->l_iclog_size), 0);
+                          BTOBB(log->l_iclog_size), XBF_NO_IOACCT);
        if (!bp)
                goto out_free_log;
 
@@ -1454,7 +1454,8 @@ xlog_alloc_log(
                prev_iclog = iclog;
 
                bp = xfs_buf_get_uncached(mp->m_logdev_targp,
-                                               BTOBB(log->l_iclog_size), 0);
+                                         BTOBB(log->l_iclog_size),
+                                         XBF_NO_IOACCT);
                if (!bp)
                        goto out_free_iclog;
 
index 80ba0c0..b5e7107 100644 (file)
@@ -163,12 +163,8 @@ int          xfs_log_reserve(struct xfs_mount *mp,
                          __uint8_t        clientid,
                          bool             permanent);
 int      xfs_log_regrant(struct xfs_mount *mp, struct xlog_ticket *tic);
-int      xfs_log_unmount_write(struct xfs_mount *mp);
 void      xfs_log_unmount(struct xfs_mount *mp);
 int      xfs_log_force_umount(struct xfs_mount *mp, int logerror);
-int      xfs_log_need_covered(struct xfs_mount *mp);
-
-void     xlog_iodone(struct xfs_buf *);
 
 struct xlog_ticket *xfs_log_ticket_get(struct xlog_ticket *ticket);
 void     xfs_log_ticket_put(struct xlog_ticket *ticket);
@@ -178,7 +174,6 @@ void        xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp,
 bool   xfs_log_item_in_current_chkpt(struct xfs_log_item *lip);
 
 void   xfs_log_work_queue(struct xfs_mount *mp);
-void   xfs_log_worker(struct work_struct *work);
 void   xfs_log_quiesce(struct xfs_mount *mp);
 bool   xfs_log_check_lsn(struct xfs_mount *, xfs_lsn_t);
 
index 5e54e79..a4ab192 100644 (file)
@@ -78,6 +78,157 @@ xlog_cil_init_post_recovery(
        log->l_cilp->xc_ctx->sequence = 1;
 }
 
+static inline int
+xlog_cil_iovec_space(
+       uint    niovecs)
+{
+       return round_up((sizeof(struct xfs_log_vec) +
+                                       niovecs * sizeof(struct xfs_log_iovec)),
+                       sizeof(uint64_t));
+}
+
+/*
+ * Allocate or pin log vector buffers for CIL insertion.
+ *
+ * The CIL currently uses disposable buffers for copying a snapshot of the
+ * modified items into the log during a push. The biggest problem with this is
+ * the requirement to allocate the disposable buffer during the commit if:
+ *     a) it does not exist; or
+ *     b) it is too small
+ *
+ * If we do this allocation within xlog_cil_insert_format_items(), it is done
+ * under the xc_ctx_lock, which means that a CIL push cannot occur during
+ * the memory allocation. This means that we have a potential deadlock situation
+ * under low memory conditions when we have lots of dirty metadata pinned in
+ * the CIL and we need a CIL commit to occur to free memory.
+ *
+ * To avoid this, we need to move the memory allocation outside the
+ * xc_ctx_lock, but because the log vector buffers are disposable, that opens
+ * up a TOCTOU race condition w.r.t. the CIL committing and removing the log
+ * vector buffers between the check and the formatting of the item into the
+ * log vector buffer within the xc_ctx_lock.
+ *
+ * Because the log vector buffer needs to be unchanged during the CIL push
+ * process, we cannot share the buffer between the transaction commit (which
+ * modifies the buffer) and the CIL push context that is writing the changes
+ * into the log. This means skipping preallocation of buffer space is
+ * unreliable, but we most definitely do not want to be allocating and freeing
+ * buffers unnecessarily during commits when overwrites can be done safely.
+ *
+ * The simplest solution to this problem is to allocate a shadow buffer when a
+ * log item is committed for the second time, and then to only use this buffer
+ * if necessary. The buffer can remain attached to the log item until such time
+ * as it is needed, and this is the buffer that is reallocated to match the size
+ * of the incoming modification. Then during the formatting of the item we can swap
+ * the active buffer with the new one if we can't reuse the existing buffer. We
+ * don't free the old buffer as it may be reused on the next modification if
+ * its size is right; otherwise we'll free and reallocate it at that point.
+ *
+ * This function builds a vector for the changes in each log item in the
+ * transaction. It then works out the length of the buffer needed for each log
+ * item, allocates them and attaches the vector to the log item in preparation
+ * for the formatting step which occurs under the xc_ctx_lock.
+ *
+ * While this means the memory footprint goes up, it avoids the repeated
+ * alloc/free pattern that repeated modifications of an item would otherwise
+ * cause, and hence minimises the CPU overhead of such behaviour.
+ */
+static void
+xlog_cil_alloc_shadow_bufs(
+       struct xlog             *log,
+       struct xfs_trans        *tp)
+{
+       struct xfs_log_item_desc *lidp;
+
+       list_for_each_entry(lidp, &tp->t_items, lid_trans) {
+               struct xfs_log_item *lip = lidp->lid_item;
+               struct xfs_log_vec *lv;
+               int     niovecs = 0;
+               int     nbytes = 0;
+               int     buf_size;
+               bool    ordered = false;
+
+               /* Skip items which aren't dirty in this transaction. */
+               if (!(lidp->lid_flags & XFS_LID_DIRTY))
+                       continue;
+
+               /* get number of vecs and size of data to be stored */
+               lip->li_ops->iop_size(lip, &niovecs, &nbytes);
+
+               /*
+                * Ordered items need to be tracked but we do not wish to write
+                * them. We need a logvec to track the object, but we do not
+                * need an iovec or buffer to be allocated for copying data.
+                */
+               if (niovecs == XFS_LOG_VEC_ORDERED) {
+                       ordered = true;
+                       niovecs = 0;
+                       nbytes = 0;
+               }
+
+               /*
+                * We 64-bit align the length of each iovec so that the start
+                * of the next one is naturally aligned.  We'll need to
+                * account for that slack space here. Then round nbytes up
+                * to 64-bit alignment so that the initial buffer alignment is
+                * easy to calculate and verify.
+                */
+               nbytes += niovecs * sizeof(uint64_t);
+               nbytes = round_up(nbytes, sizeof(uint64_t));
+
+               /*
+                * The data buffer needs to start 64-bit aligned, so round up
+                * that space to ensure we can align it appropriately and not
+                * overrun the buffer.
+                */
+               buf_size = nbytes + xlog_cil_iovec_space(niovecs);
+
+               /*
+                * if we have no shadow buffer, or it is too small, we need to
+                * reallocate it.
+                */
+               if (!lip->li_lv_shadow ||
+                   buf_size > lip->li_lv_shadow->lv_size) {
+
+                       /*
+                        * We free and allocate here as a realloc would copy
+                        * unecessary data. We don't use kmem_zalloc() for the
+                        * unnecessary data. We don't use kmem_zalloc() for the
+                        * same reason - we don't need to zero the data area in
+                        * the buffer, only the log vector header and the iovec
+                        * storage.
+                        */
+                       kmem_free(lip->li_lv_shadow);
+
+                       lv = kmem_alloc(buf_size, KM_SLEEP|KM_NOFS);
+                       memset(lv, 0, xlog_cil_iovec_space(niovecs));
+
+                       lv->lv_item = lip;
+                       lv->lv_size = buf_size;
+                       if (ordered)
+                               lv->lv_buf_len = XFS_LOG_VEC_ORDERED;
+                       else
+                               lv->lv_iovecp = (struct xfs_log_iovec *)&lv[1];
+                       lip->li_lv_shadow = lv;
+               } else {
+                       /* same or smaller, optimise common overwrite case */
+                       lv = lip->li_lv_shadow;
+                       if (ordered)
+                               lv->lv_buf_len = XFS_LOG_VEC_ORDERED;
+                       else
+                               lv->lv_buf_len = 0;
+                       lv->lv_bytes = 0;
+                       lv->lv_next = NULL;
+               }
+
+               /* Ensure the lv is set up according to ->iop_size */
+               lv->lv_niovecs = niovecs;
+
+               /* The allocated data region lies beyond the iovec region */
+               lv->lv_buf = (char *)lv + xlog_cil_iovec_space(niovecs);
+       }
+
+}
+
 /*
  * Prepare the log item for insertion into the CIL. Calculate the difference in
  * log space and vectors it will consume, and if it is a new item pin it as
@@ -100,16 +251,19 @@ xfs_cil_prepare_item(
        /*
         * If there is no old LV, this is the first time we've seen the item in
         * this CIL context and so we need to pin it. If we are replacing the
-        * old_lv, then remove the space it accounts for and free it.
+        * old_lv, then remove the space it accounts for and make it the shadow
+        * buffer for later freeing. In both cases we are now switching to the
+        * shadow buffer, so update the pointer to it appropriately.
         */
-       if (!old_lv)
+       if (!old_lv) {
                lv->lv_item->li_ops->iop_pin(lv->lv_item);
-       else if (old_lv != lv) {
+               lv->lv_item->li_lv_shadow = NULL;
+       } else if (old_lv != lv) {
                ASSERT(lv->lv_buf_len != XFS_LOG_VEC_ORDERED);
 
                *diff_len -= old_lv->lv_bytes;
                *diff_iovecs -= old_lv->lv_niovecs;
-               kmem_free(old_lv);
+               lv->lv_item->li_lv_shadow = old_lv;
        }
 
        /* attach new log vector to log item */
@@ -133,11 +287,13 @@ xfs_cil_prepare_item(
  * write it out asynchronously without needing to relock the object that was
  * modified at the time it gets written into the iclog.
  *
- * This function builds a vector for the changes in each log item in the
- * transaction. It then works out the length of the buffer needed for each log
- * item, allocates them and formats the vector for the item into the buffer.
- * The buffer is then attached to the log item are then inserted into the
- * Committed Item List for tracking until the next checkpoint is written out.
+ * This function takes the prepared log vectors attached to each log item, and
+ * formats the changes into the log vector buffer. The buffer it uses is
+ * dependent on the current state of the vector in the CIL - the shadow lv is
+ * guaranteed to be large enough for the current modification, but we will only
+ * use that if we can't reuse the existing lv. If we can't reuse the existing
+ * lv, then simply swap it out for the shadow lv. We don't free it - that is
+ * done lazily either by the next modification or the freeing of the log item.
  *
  * We don't set up region headers during this process; we simply copy the
  * regions into the flat buffer. We can do this because we still have to do a
@@ -170,59 +326,29 @@ xlog_cil_insert_format_items(
        list_for_each_entry(lidp, &tp->t_items, lid_trans) {
                struct xfs_log_item *lip = lidp->lid_item;
                struct xfs_log_vec *lv;
-               struct xfs_log_vec *old_lv;
-               int     niovecs = 0;
-               int     nbytes = 0;
-               int     buf_size;
+               struct xfs_log_vec *old_lv = NULL;
+               struct xfs_log_vec *shadow;
                bool    ordered = false;
 
                /* Skip items which aren't dirty in this transaction. */
                if (!(lidp->lid_flags & XFS_LID_DIRTY))
                        continue;
 
-               /* get number of vecs and size of data to be stored */
-               lip->li_ops->iop_size(lip, &niovecs, &nbytes);
-
-               /* Skip items that do not have any vectors for writing */
-               if (!niovecs)
-                       continue;
-
                /*
-                * Ordered items need to be tracked but we do not wish to write
-                * them. We need a logvec to track the object, but we do not
-                * need an iovec or buffer to be allocated for copying data.
+                * The formatting size information is already attached to
+                * the shadow lv on the log item.
                 */
-               if (niovecs == XFS_LOG_VEC_ORDERED) {
+               shadow = lip->li_lv_shadow;
+               if (shadow->lv_buf_len == XFS_LOG_VEC_ORDERED)
                        ordered = true;
-                       niovecs = 0;
-                       nbytes = 0;
-               }
 
-               /*
-                * We 64-bit align the length of each iovec so that the start
-                * of the next one is naturally aligned.  We'll need to
-                * account for that slack space here. Then round nbytes up
-                * to 64-bit alignment so that the initial buffer alignment is
-                * easy to calculate and verify.
-                */
-               nbytes += niovecs * sizeof(uint64_t);
-               nbytes = round_up(nbytes, sizeof(uint64_t));
-
-               /* grab the old item if it exists for reservation accounting */
-               old_lv = lip->li_lv;
-
-               /*
-                * The data buffer needs to start 64-bit aligned, so round up
-                * that space to ensure we can align it appropriately and not
-                * overrun the buffer.
-                */
-               buf_size = nbytes +
-                          round_up((sizeof(struct xfs_log_vec) +
-                                    niovecs * sizeof(struct xfs_log_iovec)),
-                                   sizeof(uint64_t));
+               /* Skip items that do not have any vectors for writing */
+               if (!shadow->lv_niovecs && !ordered)
+                       continue;
 
                /* compare to existing item size */
-               if (lip->li_lv && buf_size <= lip->li_lv->lv_size) {
+               old_lv = lip->li_lv;
+               if (lip->li_lv && shadow->lv_size <= lip->li_lv->lv_size) {
                        /* same or smaller, optimise common overwrite case */
                        lv = lip->li_lv;
                        lv->lv_next = NULL;
@@ -236,32 +362,29 @@ xlog_cil_insert_format_items(
                         */
                        *diff_iovecs -= lv->lv_niovecs;
                        *diff_len -= lv->lv_bytes;
+
+                       /* Ensure the lv is set up according to ->iop_size */
+                       lv->lv_niovecs = shadow->lv_niovecs;
+
+                       /* reset the lv buffer information for new formatting */
+                       lv->lv_buf_len = 0;
+                       lv->lv_bytes = 0;
+                       lv->lv_buf = (char *)lv +
+                                       xlog_cil_iovec_space(lv->lv_niovecs);
                } else {
-                       /* allocate new data chunk */
-                       lv = kmem_zalloc(buf_size, KM_SLEEP|KM_NOFS);
+                       /* switch to shadow buffer! */
+                       lv = shadow;
                        lv->lv_item = lip;
-                       lv->lv_size = buf_size;
                        if (ordered) {
                                /* track as an ordered logvec */
                                ASSERT(lip->li_lv == NULL);
-                               lv->lv_buf_len = XFS_LOG_VEC_ORDERED;
                                goto insert;
                        }
-                       lv->lv_iovecp = (struct xfs_log_iovec *)&lv[1];
                }
 
-               /* Ensure the lv is set up according to ->iop_size */
-               lv->lv_niovecs = niovecs;
-
-               /* The allocated data region lies beyond the iovec region */
-               lv->lv_buf_len = 0;
-               lv->lv_bytes = 0;
-               lv->lv_buf = (char *)lv + buf_size - nbytes;
                ASSERT(IS_ALIGNED((unsigned long)lv->lv_buf, sizeof(uint64_t)));
-
                lip->li_ops->iop_format(lip, lv);
 insert:
-               ASSERT(lv->lv_buf_len <= nbytes);
                xfs_cil_prepare_item(log, lv, old_lv, diff_len, diff_iovecs);
        }
 }
@@ -783,6 +906,13 @@ xfs_log_commit_cil(
        struct xlog             *log = mp->m_log;
        struct xfs_cil          *cil = log->l_cilp;
 
+       /*
+        * Do all necessary memory allocation before we lock the CIL.
+        * This ensures the allocation does not deadlock with a CIL
+        * push in memory reclaim (e.g. from kswapd).
+        */
+       xlog_cil_alloc_shadow_bufs(log, tp);
+
        /* lock out background commit */
        down_read(&cil->xc_ctx_lock);
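
The ordering xfs_log_commit_cil() now follows - size and allocate every buffer first, then take the lock that a memory-reclaim-driven CIL push also needs, and only swap pointers while it is held - is a general pattern for avoiding allocation deadlocks. A stripped-down sketch of the same idea with hypothetical names (error handling and space accounting omitted):

	struct example_item {
		char	*active;	/* buffer holding the currently formatted copy */
		size_t	 active_size;
		char	*shadow;	/* standby buffer, allocated before locking */
		size_t	 shadow_size;
	};

	/* Called before taking the shared context lock; may sleep in the allocator. */
	static void example_prepare(struct example_item *it, size_t need)
	{
		if (!it->shadow || it->shadow_size < need) {
			kfree(it->shadow);
			it->shadow = kmalloc(need, GFP_NOFS);
			it->shadow_size = need;
		}
	}

	/* Called with the shared context lock held: pointer swaps only, no allocation. */
	static void example_format(struct example_item *it, size_t need)
	{
		if (!it->active || it->active_size < need) {
			swap(it->active, it->shadow);
			swap(it->active_size, it->shadow_size);
		}
		/* format into it->active; the other buffer remains as next time's shadow */
	}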
 
index e39b023..970c19b 100644 (file)
@@ -272,13 +272,15 @@ xfs_readsb(
        buf_ops = NULL;
 
        /*
-        * Allocate a (locked) buffer to hold the superblock.
-        * This will be kept around at all times to optimize
-        * access to the superblock.
+        * Allocate a (locked) buffer to hold the superblock. This will be kept
+        * around at all times to optimize access to the superblock. Therefore,
+        * set XBF_NO_IOACCT to make sure it doesn't hold the buftarg count
+        * elevated.
         */
 reread:
        error = xfs_buf_read_uncached(mp->m_ddev_targp, XFS_SB_DADDR,
-                                  BTOBB(sector_size), 0, &bp, buf_ops);
+                                     BTOBB(sector_size), XBF_NO_IOACCT, &bp,
+                                     buf_ops);
        if (error) {
                if (loud)
                        xfs_warn(mp, "SB validate failed with error %d.", error);
index 184c44e..0cc8d8f 100644 (file)
        BUILD_BUG_ON_MSG(sizeof(structname) != (size), "XFS: sizeof(" \
                #structname ") is wrong, expected " #size)
 
+#define XFS_CHECK_OFFSET(structname, member, off) \
+       BUILD_BUG_ON_MSG(offsetof(structname, member) != (off), \
+               "XFS: offsetof(" #structname ", " #member ") is wrong, " \
+               "expected " #off)
+
 static inline void __init
 xfs_check_ondisk_structs(void)
 {
@@ -34,6 +39,8 @@ xfs_check_ondisk_structs(void)
        XFS_CHECK_STRUCT_SIZE(struct xfs_bmbt_key,              8);
        XFS_CHECK_STRUCT_SIZE(struct xfs_bmbt_rec,              16);
        XFS_CHECK_STRUCT_SIZE(struct xfs_bmdr_block,            4);
+       XFS_CHECK_STRUCT_SIZE(struct xfs_btree_block_shdr,      48);
+       XFS_CHECK_STRUCT_SIZE(struct xfs_btree_block_lhdr,      64);
        XFS_CHECK_STRUCT_SIZE(struct xfs_btree_block,           72);
        XFS_CHECK_STRUCT_SIZE(struct xfs_dinode,                176);
        XFS_CHECK_STRUCT_SIZE(struct xfs_disk_dquot,            104);
@@ -75,27 +82,39 @@ xfs_check_ondisk_structs(void)
        XFS_CHECK_STRUCT_SIZE(xfs_attr_leaf_name_remote_t,      12);
         */
 
+       XFS_CHECK_OFFSET(xfs_attr_leaf_name_local_t, valuelen,  0);
+       XFS_CHECK_OFFSET(xfs_attr_leaf_name_local_t, namelen,   2);
+       XFS_CHECK_OFFSET(xfs_attr_leaf_name_local_t, nameval,   3);
+       XFS_CHECK_OFFSET(xfs_attr_leaf_name_remote_t, valueblk, 0);
+       XFS_CHECK_OFFSET(xfs_attr_leaf_name_remote_t, valuelen, 4);
+       XFS_CHECK_OFFSET(xfs_attr_leaf_name_remote_t, namelen,  8);
+       XFS_CHECK_OFFSET(xfs_attr_leaf_name_remote_t, name,     9);
        XFS_CHECK_STRUCT_SIZE(xfs_attr_leafblock_t,             40);
-       XFS_CHECK_STRUCT_SIZE(xfs_attr_shortform_t,             8);
+       XFS_CHECK_OFFSET(xfs_attr_shortform_t, hdr.totsize,     0);
+       XFS_CHECK_OFFSET(xfs_attr_shortform_t, hdr.count,       2);
+       XFS_CHECK_OFFSET(xfs_attr_shortform_t, list[0].namelen, 4);
+       XFS_CHECK_OFFSET(xfs_attr_shortform_t, list[0].valuelen, 5);
+       XFS_CHECK_OFFSET(xfs_attr_shortform_t, list[0].flags,   6);
+       XFS_CHECK_OFFSET(xfs_attr_shortform_t, list[0].nameval, 7);
        XFS_CHECK_STRUCT_SIZE(xfs_da_blkinfo_t,                 12);
        XFS_CHECK_STRUCT_SIZE(xfs_da_intnode_t,                 16);
        XFS_CHECK_STRUCT_SIZE(xfs_da_node_entry_t,              8);
        XFS_CHECK_STRUCT_SIZE(xfs_da_node_hdr_t,                16);
        XFS_CHECK_STRUCT_SIZE(xfs_dir2_data_free_t,             4);
        XFS_CHECK_STRUCT_SIZE(xfs_dir2_data_hdr_t,              16);
-       XFS_CHECK_STRUCT_SIZE(xfs_dir2_data_unused_t,           6);
+       XFS_CHECK_OFFSET(xfs_dir2_data_unused_t, freetag,       0);
+       XFS_CHECK_OFFSET(xfs_dir2_data_unused_t, length,        2);
        XFS_CHECK_STRUCT_SIZE(xfs_dir2_free_hdr_t,              16);
        XFS_CHECK_STRUCT_SIZE(xfs_dir2_free_t,                  16);
-       XFS_CHECK_STRUCT_SIZE(xfs_dir2_ino4_t,                  4);
-       XFS_CHECK_STRUCT_SIZE(xfs_dir2_ino8_t,                  8);
-       XFS_CHECK_STRUCT_SIZE(xfs_dir2_inou_t,                  8);
        XFS_CHECK_STRUCT_SIZE(xfs_dir2_leaf_entry_t,            8);
        XFS_CHECK_STRUCT_SIZE(xfs_dir2_leaf_hdr_t,              16);
        XFS_CHECK_STRUCT_SIZE(xfs_dir2_leaf_t,                  16);
        XFS_CHECK_STRUCT_SIZE(xfs_dir2_leaf_tail_t,             4);
        XFS_CHECK_STRUCT_SIZE(xfs_dir2_sf_entry_t,              3);
+       XFS_CHECK_OFFSET(xfs_dir2_sf_entry_t, namelen,          0);
+       XFS_CHECK_OFFSET(xfs_dir2_sf_entry_t, offset,           1);
+       XFS_CHECK_OFFSET(xfs_dir2_sf_entry_t, name,             3);
        XFS_CHECK_STRUCT_SIZE(xfs_dir2_sf_hdr_t,                10);
-       XFS_CHECK_STRUCT_SIZE(xfs_dir2_sf_off_t,                2);
 
        /* log structures */
        XFS_CHECK_STRUCT_SIZE(struct xfs_dq_logformat,          24);
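
The new XFS_CHECK_OFFSET macro used above is a build-time assertion: it wraps BUILD_BUG_ON_MSG(offsetof(...) != off, ...), so a compiler or ABI that pads these on-disk structures differently breaks the build rather than the filesystem. A rough standalone equivalent of one of the checks, using an illustrative copy of the structure layout and C11 _Static_assert:

	#include <stddef.h>

	/* Illustrative copy of the shortform dir entry layout, not the real header. */
	struct example_dir2_sf_entry {
		unsigned char	namelen;	/* actual name length */
		unsigned char	offset[2];	/* saved offset, now a plain byte array */
		unsigned char	name[];		/* name, variable size */
	};

	/* Equivalent of XFS_CHECK_OFFSET(xfs_dir2_sf_entry_t, offset, 1) */
	_Static_assert(offsetof(struct example_dir2_sf_entry, offset) == 1,
		       "shortform dir entry offset field is misplaced");
	_Static_assert(offsetof(struct example_dir2_sf_entry, name) == 3,
		       "shortform dir entry name field is misplaced");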
index d5b7566..0f14b2e 100644 (file)
@@ -1,6 +1,7 @@
 /*
  * Copyright (c) 2014 Christoph Hellwig.
  */
+#include <linux/iomap.h>
 #include "xfs.h"
 #include "xfs_format.h"
 #include "xfs_log_format.h"
@@ -79,32 +80,6 @@ xfs_fs_get_uuid(
        return 0;
 }
 
-static void
-xfs_bmbt_to_iomap(
-       struct xfs_inode        *ip,
-       struct iomap            *iomap,
-       struct xfs_bmbt_irec    *imap)
-{
-       struct xfs_mount        *mp = ip->i_mount;
-
-       if (imap->br_startblock == HOLESTARTBLOCK) {
-               iomap->blkno = IOMAP_NULL_BLOCK;
-               iomap->type = IOMAP_HOLE;
-       } else if (imap->br_startblock == DELAYSTARTBLOCK) {
-               iomap->blkno = IOMAP_NULL_BLOCK;
-               iomap->type = IOMAP_DELALLOC;
-       } else {
-               iomap->blkno =
-                       XFS_FSB_TO_DADDR(ip->i_mount, imap->br_startblock);
-               if (imap->br_state == XFS_EXT_UNWRITTEN)
-                       iomap->type = IOMAP_UNWRITTEN;
-               else
-                       iomap->type = IOMAP_MAPPED;
-       }
-       iomap->offset = XFS_FSB_TO_B(mp, imap->br_startoff);
-       iomap->length = XFS_FSB_TO_B(mp, imap->br_blockcount);
-}
-
 /*
  * Get a layout for the pNFS client.
  */
index 76c0a4a..355dd9e 100644 (file)
@@ -98,8 +98,6 @@ xfs_growfs_rt(
 /*
  * From xfs_rtbitmap.c
  */
-int xfs_rtbuf_get(struct xfs_mount *mp, struct xfs_trans *tp,
-                 xfs_rtblock_t block, int issum, struct xfs_buf **bpp);
 int xfs_rtcheck_range(struct xfs_mount *mp, struct xfs_trans *tp,
                      xfs_rtblock_t start, xfs_extlen_t len, int val,
                      xfs_rtblock_t *new, int *stat);
index 11ea5d5..0303f10 100644 (file)
@@ -546,7 +546,7 @@ xfs_showargs(
 
        return 0;
 }
-__uint64_t
+static __uint64_t
 xfs_max_file_offset(
        unsigned int            blockshift)
 {
@@ -1294,6 +1294,7 @@ xfs_fs_remount(
                 */
                xfs_restore_resvblks(mp);
                xfs_log_work_queue(mp);
+               xfs_queue_eofblocks(mp);
        }
 
        /* rw -> ro */
@@ -1306,6 +1307,13 @@ xfs_fs_remount(
                 * return it to the same size.
                 */
                xfs_save_resvblks(mp);
+
+               /*
+                * Cancel background eofb scanning so it cannot race with the
+                * final log force+buftarg wait and deadlock the remount.
+                */
+               cancel_delayed_work_sync(&mp->m_eofblocks_work);
+
                xfs_quiesce_attr(mp);
                mp->m_flags |= XFS_MOUNT_RDONLY;
        }
@@ -1565,10 +1573,6 @@ xfs_fs_fill_super(
                }
        }
 
-       if (xfs_sb_version_hassparseinodes(&mp->m_sb))
-               xfs_alert(mp,
-       "EXPERIMENTAL sparse inode feature enabled. Use at your own risk!");
-
        error = xfs_mountfs(mp);
        if (error)
                goto out_filestream_unmount;
@@ -1692,8 +1696,9 @@ xfs_init_zones(void)
        if (!xfs_log_ticket_zone)
                goto out_free_ioend_bioset;
 
-       xfs_bmap_free_item_zone = kmem_zone_init(sizeof(xfs_bmap_free_item_t),
-                                               "xfs_bmap_free_item");
+       xfs_bmap_free_item_zone = kmem_zone_init(
+                       sizeof(struct xfs_bmap_free_item),
+                       "xfs_bmap_free_item");
        if (!xfs_bmap_free_item_zone)
                goto out_destroy_log_ticket_zone;
 
index 2dfb1ce..529bce9 100644 (file)
@@ -61,8 +61,6 @@ struct xfs_mount;
 struct xfs_buftarg;
 struct block_device;
 
-extern __uint64_t xfs_max_file_offset(unsigned int);
-
 extern void xfs_flush_inodes(struct xfs_mount *mp);
 extern void xfs_blkdev_issue_flush(struct xfs_buftarg *);
 extern xfs_agnumber_t xfs_set_inode_alloc(struct xfs_mount *,
index 4c2c550..79cfd3f 100644 (file)
@@ -634,6 +634,9 @@ xfs_error_get_cfg(
 {
        struct xfs_error_cfg    *cfg;
 
+       if (error < 0)
+               error = -error;
+
        switch (error) {
        case EIO:
                cfg = &mp->m_error_cfg[error_class][XFS_ERR_EIO];
index ea94ee0..1451690 100644 (file)
@@ -354,6 +354,7 @@ DEFINE_BUF_EVENT(xfs_buf_submit_wait);
 DEFINE_BUF_EVENT(xfs_buf_bawrite);
 DEFINE_BUF_EVENT(xfs_buf_lock);
 DEFINE_BUF_EVENT(xfs_buf_lock_done);
+DEFINE_BUF_EVENT(xfs_buf_trylock_fail);
 DEFINE_BUF_EVENT(xfs_buf_trylock);
 DEFINE_BUF_EVENT(xfs_buf_unlock);
 DEFINE_BUF_EVENT(xfs_buf_iowait);
@@ -1134,15 +1135,14 @@ TRACE_EVENT(xfs_log_assign_tail_lsn,
 )
 
 DECLARE_EVENT_CLASS(xfs_file_class,
-       TP_PROTO(struct xfs_inode *ip, size_t count, loff_t offset, int flags),
-       TP_ARGS(ip, count, offset, flags),
+       TP_PROTO(struct xfs_inode *ip, size_t count, loff_t offset),
+       TP_ARGS(ip, count, offset),
        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(xfs_ino_t, ino)
                __field(xfs_fsize_t, size)
                __field(loff_t, offset)
                __field(size_t, count)
-               __field(int, flags)
        ),
        TP_fast_assign(
                __entry->dev = VFS_I(ip)->i_sb->s_dev;
@@ -1150,25 +1150,25 @@ DECLARE_EVENT_CLASS(xfs_file_class,
                __entry->size = ip->i_d.di_size;
                __entry->offset = offset;
                __entry->count = count;
-               __entry->flags = flags;
        ),
-       TP_printk("dev %d:%d ino 0x%llx size 0x%llx "
-                 "offset 0x%llx count 0x%zx ioflags %s",
+       TP_printk("dev %d:%d ino 0x%llx size 0x%llx offset 0x%llx count 0x%zx",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  __entry->ino,
                  __entry->size,
                  __entry->offset,
-                 __entry->count,
-                 __print_flags(__entry->flags, "|", XFS_IO_FLAGS))
+                 __entry->count)
 )
 
 #define DEFINE_RW_EVENT(name)          \
 DEFINE_EVENT(xfs_file_class, name,     \
-       TP_PROTO(struct xfs_inode *ip, size_t count, loff_t offset, int flags), \
-       TP_ARGS(ip, count, offset, flags))
-DEFINE_RW_EVENT(xfs_file_read);
+       TP_PROTO(struct xfs_inode *ip, size_t count, loff_t offset),    \
+       TP_ARGS(ip, count, offset))
+DEFINE_RW_EVENT(xfs_file_buffered_read);
+DEFINE_RW_EVENT(xfs_file_direct_read);
+DEFINE_RW_EVENT(xfs_file_dax_read);
 DEFINE_RW_EVENT(xfs_file_buffered_write);
 DEFINE_RW_EVENT(xfs_file_direct_write);
+DEFINE_RW_EVENT(xfs_file_dax_write);
 DEFINE_RW_EVENT(xfs_file_splice_read);
 
 DECLARE_EVENT_CLASS(xfs_page_class,
@@ -1295,6 +1295,9 @@ DEFINE_IOMAP_EVENT(xfs_map_blocks_alloc);
 DEFINE_IOMAP_EVENT(xfs_get_blocks_found);
 DEFINE_IOMAP_EVENT(xfs_get_blocks_alloc);
 DEFINE_IOMAP_EVENT(xfs_get_blocks_map_direct);
+DEFINE_IOMAP_EVENT(xfs_iomap_alloc);
+DEFINE_IOMAP_EVENT(xfs_iomap_found);
+DEFINE_IOMAP_EVENT(xfs_iomap_not_found);
 
 DECLARE_EVENT_CLASS(xfs_simple_io_class,
        TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count),
index 9a462e8..9b2b9fa 100644 (file)
@@ -52,6 +52,7 @@ typedef struct xfs_log_item {
        /* delayed logging */
        struct list_head                li_cil;         /* CIL pointers */
        struct xfs_log_vec              *li_lv;         /* active log vector */
+       struct xfs_log_vec              *li_lv_shadow;  /* standby vector */
        xfs_lsn_t                       li_seq;         /* CIL commit seq */
 } xfs_log_item_t;
 
index d841450..b03c062 100644 (file)
@@ -6,6 +6,7 @@
 struct dentry;
 struct iattr;
 struct inode;
+struct iomap;
 struct super_block;
 struct vfsmount;
 
@@ -187,21 +188,6 @@ struct fid {
  *    get_name is not (which is possibly inconsistent)
  */
 
-/* types of block ranges for multipage write mappings. */
-#define IOMAP_HOLE     0x01    /* no blocks allocated, need allocation */
-#define IOMAP_DELALLOC 0x02    /* delayed allocation blocks */
-#define IOMAP_MAPPED   0x03    /* blocks allocated @blkno */
-#define IOMAP_UNWRITTEN        0x04    /* blocks allocated @blkno in unwritten state */
-
-#define IOMAP_NULL_BLOCK -1LL  /* blkno is not valid */
-
-struct iomap {
-       sector_t        blkno;  /* first sector of mapping */
-       loff_t          offset; /* file offset of mapping, bytes */
-       u64             length; /* length of mapping, bytes */
-       int             type;   /* type of mapping */
-};
-
 struct export_operations {
        int (*encode_fh)(struct inode *inode, __u32 *fh, int *max_len,
                        struct inode *parent);
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
new file mode 100644 (file)
index 0000000..3267df4
--- /dev/null
@@ -0,0 +1,70 @@
+#ifndef LINUX_IOMAP_H
+#define LINUX_IOMAP_H 1
+
+#include <linux/types.h>
+
+struct fiemap_extent_info;
+struct inode;
+struct iov_iter;
+struct kiocb;
+struct vm_area_struct;
+struct vm_fault;
+
+/*
+ * Types of block ranges for iomap mappings:
+ */
+#define IOMAP_HOLE     0x01    /* no blocks allocated, need allocation */
+#define IOMAP_DELALLOC 0x02    /* delayed allocation blocks */
+#define IOMAP_MAPPED   0x03    /* blocks allocated @blkno */
+#define IOMAP_UNWRITTEN        0x04    /* blocks allocated @blkno in unwritten state */
+
+/*
+ * Magic value for blkno:
+ */
+#define IOMAP_NULL_BLOCK -1LL  /* blkno is not valid */
+
+struct iomap {
+       sector_t                blkno;  /* 1st sector of mapping, 512b units */
+       loff_t                  offset; /* file offset of mapping, bytes */
+       u64                     length; /* length of mapping, bytes */
+       int                     type;   /* type of mapping */
+       struct block_device     *bdev;  /* block device for I/O */
+};
+
+/*
+ * Flags for iomap_begin / iomap_end.  No flag implies a read.
+ */
+#define IOMAP_WRITE            (1 << 0)
+#define IOMAP_ZERO             (1 << 1)
+
+struct iomap_ops {
+       /*
+        * Return the existing mapping at pos, or reserve space starting at
+        * pos for up to length, as long as we can do it as a single mapping.
+        * The actual length is returned in iomap->length.
+        */
+       int (*iomap_begin)(struct inode *inode, loff_t pos, loff_t length,
+                       unsigned flags, struct iomap *iomap);
+
+       /*
+        * Commit and/or unreserve space previously allocated using iomap_begin.
+        * Written indicates the length of the successful write operation which
+        * needs to be committed, while the rest needs to be unreserved.
+        * Written might be zero if no data was written.
+        */
+       int (*iomap_end)(struct inode *inode, loff_t pos, loff_t length,
+                       ssize_t written, unsigned flags, struct iomap *iomap);
+};
+
+ssize_t iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *from,
+               struct iomap_ops *ops);
+int iomap_zero_range(struct inode *inode, loff_t pos, loff_t len,
+               bool *did_zero, struct iomap_ops *ops);
+int iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
+               struct iomap_ops *ops);
+int iomap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
+               struct iomap_ops *ops);
+int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
+               loff_t start, loff_t len, struct iomap_ops *ops);
+
+#endif /* LINUX_IOMAP_H */
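
To make the new interface concrete, here is a minimal, hypothetical iomap_ops provider for a filesystem whose files live in one contiguous on-disk extent. Every example_ name is an assumption; a real implementation (such as the xfs_iomap_ops referenced above) also handles allocation, delalloc and unwritten extents:

	static int
	example_iomap_begin(struct inode *inode, loff_t pos, loff_t length,
			unsigned flags, struct iomap *iomap)
	{
		loff_t isize = i_size_read(inode);

		if (pos >= isize) {
			/* nothing allocated past EOF: report a hole */
			iomap->blkno = IOMAP_NULL_BLOCK;
			iomap->type = IOMAP_HOLE;
			iomap->offset = pos;
			iomap->length = length;
			return 0;
		}

		/* one contiguous extent covering the whole file */
		iomap->blkno = example_file_start_sector(inode);	/* assumed helper */
		iomap->type = IOMAP_MAPPED;
		iomap->offset = 0;
		iomap->length = isize;
		iomap->bdev = inode->i_sb->s_bdev;
		return 0;
	}

	static int
	example_iomap_end(struct inode *inode, loff_t pos, loff_t length,
			ssize_t written, unsigned flags, struct iomap *iomap)
	{
		return 0;	/* iomap_begin reserved nothing, so nothing to undo */
	}

	struct iomap_ops example_iomap_ops = {
		.iomap_begin	= example_iomap_begin,
		.iomap_end	= example_iomap_end,
	};

Passing &example_iomap_ops to iomap_fiemap(), iomap_zero_range() or iomap_truncate_page() is then all such a filesystem needs to pick up the generic implementations.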