Merge branch 'work.splice_read' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs
Author:     Linus Torvalds <torvalds@linux-foundation.org>
AuthorDate: Mon, 10 Oct 2016 20:38:49 +0000 (13:38 -0700)
Commit:     Linus Torvalds <torvalds@linux-foundation.org>
CommitDate: Mon, 10 Oct 2016 20:38:49 +0000 (13:38 -0700)
Pull splice fixups from Al Viro:
 "A couple of fixups for interaction of pipe-backed iov_iter with
  O_DIRECT reads + constification of a couple of primitives in uio.h
  missed by previous rounds.

  Kudos to davej - his fuzzing has caught those bugs"

* 'work.splice_read' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs:
  [btrfs] fix check_direct_IO() for non-iovec iterators
  constify iov_iter_count() and iter_is_iovec()
  fix ITER_PIPE interaction with direct_IO

fs/btrfs/inode.c
fs/xfs/xfs_file.c
include/linux/uio.h
mm/filemap.c
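
For illustration, a minimal sketch of the pattern the btrfs check_direct_IO()
change below adopts: skip the iovec-specific validation whenever the iterator
is not a plain ITER_IOVEC (for example the pipe-backed ITER_PIPE that splice
reads can now feed to ->direct_IO). The function name is hypothetical; only
the iov_iter helpers are real kernel API:

    #include <linux/uio.h>

    /* Hypothetical checker, not part of this merge. */
    static ssize_t example_check_direct_IO(struct iov_iter *iter)
    {
            /*
             * Writes and non-iovec iterators (ITER_PIPE, ITER_BVEC,
             * ITER_KVEC) carry no userspace iovec array, so there is
             * nothing iovec-specific to validate.
             */
            if (iov_iter_rw(iter) != READ || !iter_is_iovec(iter))
                    return 0;

            /* iovec-only checks (e.g. duplicate iov_base detection) go here. */
            return 0;
    }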

diff --combined fs/btrfs/inode.c
@@@ -5072,7 -5072,7 +5072,7 @@@ static int btrfs_setattr(struct dentry 
        if (btrfs_root_readonly(root))
                return -EROFS;
  
 -      err = inode_change_ok(inode, attr);
 +      err = setattr_prepare(dentry, attr);
        if (err)
                return err;
  
@@@ -8412,7 -8412,7 +8412,7 @@@ static int btrfs_submit_direct_hook(str
        if (!bio)
                return -ENOMEM;
  
 -      bio_set_op_attrs(bio, bio_op(orig_bio), orig_bio->bi_opf);
 +      bio_set_op_attrs(bio, bio_op(orig_bio), bio_flags(orig_bio));
        bio->bi_private = dip;
        bio->bi_end_io = btrfs_end_dio_bio;
        btrfs_io_bio(bio)->logical = file_offset;
@@@ -8450,8 -8450,7 +8450,8 @@@ next_block
                                                  start_sector, GFP_NOFS);
                        if (!bio)
                                goto out_err;
 -                      bio_set_op_attrs(bio, bio_op(orig_bio), orig_bio->bi_opf);
 +                      bio_set_op_attrs(bio, bio_op(orig_bio),
 +                                       bio_flags(orig_bio));
                        bio->bi_private = dip;
                        bio->bi_end_io = btrfs_end_dio_bio;
                        btrfs_io_bio(bio)->logical = file_offset;
@@@ -8619,7 -8618,7 +8619,7 @@@ static ssize_t check_direct_IO(struct b
                goto out;
  
        /* If this is a write we don't need to check anymore */
-       if (iov_iter_rw(iter) == WRITE)
+       if (iov_iter_rw(iter) != READ || !iter_is_iovec(iter))
                return 0;
        /*
         * Check to make sure we don't have duplicate iov_base's in this
@@@ -10544,6 -10543,21 +10544,6 @@@ out_inode
  
  }
  
 -/* Inspired by filemap_check_errors() */
 -int btrfs_inode_check_errors(struct inode *inode)
 -{
 -      int ret = 0;
 -
 -      if (test_bit(AS_ENOSPC, &inode->i_mapping->flags) &&
 -          test_and_clear_bit(AS_ENOSPC, &inode->i_mapping->flags))
 -              ret = -ENOSPC;
 -      if (test_bit(AS_EIO, &inode->i_mapping->flags) &&
 -          test_and_clear_bit(AS_EIO, &inode->i_mapping->flags))
 -              ret = -EIO;
 -
 -      return ret;
 -}
 -
  static const struct inode_operations btrfs_dir_inode_operations = {
        .getattr        = btrfs_getattr,
        .lookup         = btrfs_lookup,
diff --combined fs/xfs/xfs_file.c
@@@ -269,8 -269,6 +269,8 @@@ xfs_file_dio_aio_read
                return -EINVAL;
        }
  
 +      file_accessed(iocb->ki_filp);
 +
        /*
         * Locking is a bit tricky here. If we take an exclusive lock for direct
         * IO, we effectively serialise all new concurrent read IO to this file
        data = *to;
        ret = __blockdev_direct_IO(iocb, inode, target->bt_bdev, &data,
                        xfs_get_blocks_direct, NULL, NULL, 0);
-       if (ret > 0) {
+       if (ret >= 0) {
                iocb->ki_pos += ret;
                iov_iter_advance(to, ret);
        }
        xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
  
 -      file_accessed(iocb->ki_filp);
        return ret;
  }
  
@@@ -333,7 -332,10 +333,7 @@@ xfs_file_dax_read
        struct kiocb            *iocb,
        struct iov_iter         *to)
  {
 -      struct address_space    *mapping = iocb->ki_filp->f_mapping;
 -      struct inode            *inode = mapping->host;
 -      struct xfs_inode        *ip = XFS_I(inode);
 -      struct iov_iter         data = *to;
 +      struct xfs_inode        *ip = XFS_I(iocb->ki_filp->f_mapping->host);
        size_t                  count = iov_iter_count(to);
        ssize_t                 ret = 0;
  
                return 0; /* skip atime */
  
        xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
 -      ret = dax_do_io(iocb, inode, &data, xfs_get_blocks_direct, NULL, 0);
 -      if (ret > 0) {
 -              iocb->ki_pos += ret;
 -              iov_iter_advance(to, ret);
 -      }
 +      ret = iomap_dax_rw(iocb, to, &xfs_iomap_ops);
        xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
  
        file_accessed(iocb->ki_filp);
@@@ -666,32 -672,70 +666,32 @@@ xfs_file_dax_write
        struct kiocb            *iocb,
        struct iov_iter         *from)
  {
 -      struct address_space    *mapping = iocb->ki_filp->f_mapping;
 -      struct inode            *inode = mapping->host;
 +      struct inode            *inode = iocb->ki_filp->f_mapping->host;
        struct xfs_inode        *ip = XFS_I(inode);
 -      struct xfs_mount        *mp = ip->i_mount;
 -      ssize_t                 ret = 0;
 -      int                     unaligned_io = 0;
 -      int                     iolock;
 -      struct iov_iter         data;
 +      int                     iolock = XFS_IOLOCK_EXCL;
 +      ssize_t                 ret, error = 0;
 +      size_t                  count;
 +      loff_t                  pos;
  
 -      /* "unaligned" here means not aligned to a filesystem block */
 -      if ((iocb->ki_pos & mp->m_blockmask) ||
 -          ((iocb->ki_pos + iov_iter_count(from)) & mp->m_blockmask)) {
 -              unaligned_io = 1;
 -              iolock = XFS_IOLOCK_EXCL;
 -      } else if (mapping->nrpages) {
 -              iolock = XFS_IOLOCK_EXCL;
 -      } else {
 -              iolock = XFS_IOLOCK_SHARED;
 -      }
        xfs_rw_ilock(ip, iolock);
 -
        ret = xfs_file_aio_write_checks(iocb, from, &iolock);
        if (ret)
                goto out;
  
 -      /*
 -       * Yes, even DAX files can have page cache attached to them:  A zeroed
 -       * page is inserted into the pagecache when we have to serve a write
 -       * fault on a hole.  It should never be dirtied and can simply be
 -       * dropped from the pagecache once we get real data for the page.
 -       *
 -       * XXX: This is racy against mmap, and there's nothing we can do about
 -       * it. dax_do_io() should really do this invalidation internally as
 -       * it will know if we've allocated over a hole for this specific IO and
 -       * if so it needs to update the mapping tree and invalidate existing
 -       * PTEs over the newly allocated range. Remove this invalidation when
 -       * dax_do_io() is fixed up.
 -       */
 -      if (mapping->nrpages) {
 -              loff_t end = iocb->ki_pos + iov_iter_count(from) - 1;
 +      pos = iocb->ki_pos;
 +      count = iov_iter_count(from);
  
 -              ret = invalidate_inode_pages2_range(mapping,
 -                                                  iocb->ki_pos >> PAGE_SHIFT,
 -                                                  end >> PAGE_SHIFT);
 -              WARN_ON_ONCE(ret);
 -      }
 +      trace_xfs_file_dax_write(ip, count, pos);
  
 -      if (iolock == XFS_IOLOCK_EXCL && !unaligned_io) {
 -              xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
 -              iolock = XFS_IOLOCK_SHARED;
 +      ret = iomap_dax_rw(iocb, from, &xfs_iomap_ops);
 +      if (ret > 0 && iocb->ki_pos > i_size_read(inode)) {
 +              i_size_write(inode, iocb->ki_pos);
 +              error = xfs_setfilesize(ip, pos, ret);
        }
  
 -      trace_xfs_file_dax_write(ip, iov_iter_count(from), iocb->ki_pos);
 -
 -      data = *from;
 -      ret = dax_do_io(iocb, inode, &data, xfs_get_blocks_direct,
 -                      xfs_end_io_direct_write, 0);
 -      if (ret > 0) {
 -              iocb->ki_pos += ret;
 -              iov_iter_advance(from, ret);
 -      }
  out:
        xfs_rw_iunlock(ip, iolock);
 -      return ret;
 +      return error ? error : ret;
  }
  
  STATIC ssize_t
@@@ -901,7 -945,7 +901,7 @@@ xfs_file_fallocate
  
                iattr.ia_valid = ATTR_SIZE;
                iattr.ia_size = new_size;
 -              error = xfs_setattr_size(ip, &iattr);
 +              error = xfs_vn_setattr_size(file_dentry(file), &iattr);
                if (error)
                        goto out_unlock;
        }
@@@ -1430,7 -1474,7 +1430,7 @@@ xfs_filemap_page_mkwrite
        xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
  
        if (IS_DAX(inode)) {
 -              ret = dax_mkwrite(vma, vmf, xfs_get_blocks_dax_fault);
 +              ret = iomap_dax_fault(vma, vmf, &xfs_iomap_ops);
        } else {
                ret = iomap_page_mkwrite(vma, vmf, &xfs_iomap_ops);
                ret = block_page_mkwrite_return(ret);
@@@ -1464,7 -1508,7 +1464,7 @@@ xfs_filemap_fault
                 * changes to xfs_get_blocks_direct() to map unwritten extent
                 * ioend for conversion on read-only mappings.
                 */
 -              ret = dax_fault(vma, vmf, xfs_get_blocks_dax_fault);
 +              ret = iomap_dax_fault(vma, vmf, &xfs_iomap_ops);
        } else
                ret = filemap_fault(vma, vmf);
        xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
@@@ -1579,7 -1623,6 +1579,7 @@@ const struct file_operations xfs_file_o
        .open           = xfs_file_open,
        .release        = xfs_file_release,
        .fsync          = xfs_file_fsync,
 +      .get_unmapped_area = thp_get_unmapped_area,
        .fallocate      = xfs_file_fallocate,
  };
  
diff --combined include/linux/uio.h
@@@ -82,6 -82,7 +82,6 @@@ size_t iov_iter_copy_from_user_atomic(s
                struct iov_iter *i, unsigned long offset, size_t bytes);
  void iov_iter_advance(struct iov_iter *i, size_t bytes);
  int iov_iter_fault_in_readable(struct iov_iter *i, size_t bytes);
 -#define iov_iter_fault_in_multipages_readable iov_iter_fault_in_readable
  size_t iov_iter_single_seg_count(const struct iov_iter *i);
  size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes,
                         struct iov_iter *i);
@@@ -109,12 -110,12 +109,12 @@@ int iov_iter_npages(const struct iov_it
  
  const void *dup_iter(struct iov_iter *new, struct iov_iter *old, gfp_t flags);
  
- static inline size_t iov_iter_count(struct iov_iter *i)
+ static inline size_t iov_iter_count(const struct iov_iter *i)
  {
        return i->count;
  }
  
- static inline bool iter_is_iovec(struct iov_iter *i)
+ static inline bool iter_is_iovec(const struct iov_iter *i)
  {
        return !(i->type & (ITER_BVEC | ITER_KVEC | ITER_PIPE));
  }
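
A small usage sketch of what the constification above enables: code that only
inspects an iterator can take it by const pointer and still call these
primitives. The helper name here is made up:

    #include <linux/types.h>
    #include <linux/uio.h>

    /* Hypothetical read-only predicate; valid once iov_iter_count() and
     * iter_is_iovec() accept a const struct iov_iter *. */
    static bool example_needs_iovec_checks(const struct iov_iter *iter)
    {
            return iter_is_iovec(iter) && iov_iter_count(iter) != 0;
    }
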
diff --combined mm/filemap.c
   *   ->tasklist_lock            (memory_failure, collect_procs_ao)
   */
  
 +static int page_cache_tree_insert(struct address_space *mapping,
 +                                struct page *page, void **shadowp)
 +{
 +      struct radix_tree_node *node;
 +      void **slot;
 +      int error;
 +
 +      error = __radix_tree_create(&mapping->page_tree, page->index, 0,
 +                                  &node, &slot);
 +      if (error)
 +              return error;
 +      if (*slot) {
 +              void *p;
 +
 +              p = radix_tree_deref_slot_protected(slot, &mapping->tree_lock);
 +              if (!radix_tree_exceptional_entry(p))
 +                      return -EEXIST;
 +
 +              mapping->nrexceptional--;
 +              if (!dax_mapping(mapping)) {
 +                      if (shadowp)
 +                              *shadowp = p;
 +                      if (node)
 +                              workingset_node_shadows_dec(node);
 +              } else {
 +                      /* DAX can replace empty locked entry with a hole */
 +                      WARN_ON_ONCE(p !=
 +                              (void *)(RADIX_TREE_EXCEPTIONAL_ENTRY |
 +                                       RADIX_DAX_ENTRY_LOCK));
 +                      /* DAX accounts exceptional entries as normal pages */
 +                      if (node)
 +                              workingset_node_pages_dec(node);
 +                      /* Wakeup waiters for exceptional entry lock */
 +                      dax_wake_mapping_entry_waiter(mapping, page->index,
 +                                                    false);
 +              }
 +      }
 +      radix_tree_replace_slot(slot, page);
 +      mapping->nrpages++;
 +      if (node) {
 +              workingset_node_pages_inc(node);
 +              /*
 +               * Don't track node that contains actual pages.
 +               *
 +               * Avoid acquiring the list_lru lock if already
 +               * untracked.  The list_empty() test is safe as
 +               * node->private_list is protected by
 +               * mapping->tree_lock.
 +               */
 +              if (!list_empty(&node->private_list))
 +                      list_lru_del(&workingset_shadow_nodes,
 +                                   &node->private_list);
 +      }
 +      return 0;
 +}
 +
  static void page_cache_tree_delete(struct address_space *mapping,
                                   struct page *page, void *shadow)
  {
 -      struct radix_tree_node *node;
        int i, nr = PageHuge(page) ? 1 : hpage_nr_pages(page);
  
        VM_BUG_ON_PAGE(!PageLocked(page), page);
        VM_BUG_ON_PAGE(PageTail(page), page);
        VM_BUG_ON_PAGE(nr != 1 && shadow, page);
  
 -      if (shadow) {
 -              mapping->nrexceptional += nr;
 -              /*
 -               * Make sure the nrexceptional update is committed before
 -               * the nrpages update so that final truncate racing
 -               * with reclaim does not see both counters 0 at the
 -               * same time and miss a shadow entry.
 -               */
 -              smp_wmb();
 -      }
 -      mapping->nrpages -= nr;
 -
        for (i = 0; i < nr; i++) {
 -              node = radix_tree_replace_clear_tags(&mapping->page_tree,
 -                              page->index + i, shadow);
 +              struct radix_tree_node *node;
 +              void **slot;
 +
 +              __radix_tree_lookup(&mapping->page_tree, page->index + i,
 +                                  &node, &slot);
 +
 +              radix_tree_clear_tags(&mapping->page_tree, node, slot);
 +
                if (!node) {
                        VM_BUG_ON_PAGE(nr != 1, page);
 -                      return;
 +                      /*
 +                       * We need a node to properly account shadow
 +                       * entries. Don't plant any without. XXX
 +                       */
 +                      shadow = NULL;
                }
  
 +              radix_tree_replace_slot(slot, shadow);
 +
 +              if (!node)
 +                      break;
 +
                workingset_node_pages_dec(node);
                if (shadow)
                        workingset_node_shadows_inc(node);
                                        &node->private_list);
                }
        }
 +
 +      if (shadow) {
 +              mapping->nrexceptional += nr;
 +              /*
 +               * Make sure the nrexceptional update is committed before
 +               * the nrpages update so that final truncate racing
 +               * with reclaim does not see both counters 0 at the
 +               * same time and miss a shadow entry.
 +               */
 +              smp_wmb();
 +      }
 +      mapping->nrpages -= nr;
  }
  
  /*
@@@ -631,8 -561,9 +631,8 @@@ int replace_page_cache_page(struct pag
  
                spin_lock_irqsave(&mapping->tree_lock, flags);
                __delete_from_page_cache(old, NULL);
 -              error = radix_tree_insert(&mapping->page_tree, offset, new);
 +              error = page_cache_tree_insert(mapping, new, NULL);
                BUG_ON(error);
 -              mapping->nrpages++;
  
                /*
                 * hugetlb pages do not participate in page cache accounting.
  }
  EXPORT_SYMBOL_GPL(replace_page_cache_page);
  
 -static int page_cache_tree_insert(struct address_space *mapping,
 -                                struct page *page, void **shadowp)
 -{
 -      struct radix_tree_node *node;
 -      void **slot;
 -      int error;
 -
 -      error = __radix_tree_create(&mapping->page_tree, page->index, 0,
 -                                  &node, &slot);
 -      if (error)
 -              return error;
 -      if (*slot) {
 -              void *p;
 -
 -              p = radix_tree_deref_slot_protected(slot, &mapping->tree_lock);
 -              if (!radix_tree_exceptional_entry(p))
 -                      return -EEXIST;
 -
 -              mapping->nrexceptional--;
 -              if (!dax_mapping(mapping)) {
 -                      if (shadowp)
 -                              *shadowp = p;
 -                      if (node)
 -                              workingset_node_shadows_dec(node);
 -              } else {
 -                      /* DAX can replace empty locked entry with a hole */
 -                      WARN_ON_ONCE(p !=
 -                              (void *)(RADIX_TREE_EXCEPTIONAL_ENTRY |
 -                                       RADIX_DAX_ENTRY_LOCK));
 -                      /* DAX accounts exceptional entries as normal pages */
 -                      if (node)
 -                              workingset_node_pages_dec(node);
 -                      /* Wakeup waiters for exceptional entry lock */
 -                      dax_wake_mapping_entry_waiter(mapping, page->index,
 -                                                    false);
 -              }
 -      }
 -      radix_tree_replace_slot(slot, page);
 -      mapping->nrpages++;
 -      if (node) {
 -              workingset_node_pages_inc(node);
 -              /*
 -               * Don't track node that contains actual pages.
 -               *
 -               * Avoid acquiring the list_lru lock if already
 -               * untracked.  The list_empty() test is safe as
 -               * node->private_list is protected by
 -               * mapping->tree_lock.
 -               */
 -              if (!list_empty(&node->private_list))
 -                      list_lru_del(&workingset_shadow_nodes,
 -                                   &node->private_list);
 -      }
 -      return 0;
 -}
 -
  static int __add_to_page_cache_locked(struct page *page,
                                      struct address_space *mapping,
                                      pgoff_t offset, gfp_t gfp_mask,
@@@ -1687,10 -1674,6 +1687,10 @@@ static ssize_t do_generic_file_read(str
        unsigned int prev_offset;
        int error = 0;
  
 +      if (unlikely(*ppos >= inode->i_sb->s_maxbytes))
 +              return -EINVAL;
 +      iov_iter_truncate(iter, inode->i_sb->s_maxbytes);
 +
        index = *ppos >> PAGE_SHIFT;
        prev_index = ra->prev_pos >> PAGE_SHIFT;
        prev_offset = ra->prev_pos & (PAGE_SIZE-1);
@@@ -1725,9 -1708,7 +1725,9 @@@ find_page
                         * wait_on_page_locked is used to avoid unnecessarily
                         * serialisations and why it's safe.
                         */
 -                      wait_on_page_locked_killable(page);
 +                      error = wait_on_page_locked_killable(page);
 +                      if (unlikely(error))
 +                              goto readpage_error;
                        if (PageUptodate(page))
                                goto page_ok;
  
@@@ -1929,19 -1910,17 +1929,19 @@@ generic_file_read_iter(struct kiocb *io
        if (iocb->ki_flags & IOCB_DIRECT) {
                struct address_space *mapping = file->f_mapping;
                struct inode *inode = mapping->host;
 +              struct iov_iter data = *iter;
                loff_t size;
  
                size = i_size_read(inode);
                retval = filemap_write_and_wait_range(mapping, iocb->ki_pos,
                                        iocb->ki_pos + count - 1);
 -              if (!retval) {
 -                      struct iov_iter data = *iter;
 -                      retval = mapping->a_ops->direct_IO(iocb, &data);
 -              }
 +              if (retval < 0)
 +                      goto out;
  
-               if (retval > 0) {
 +              file_accessed(file);
 +
 +              retval = mapping->a_ops->direct_IO(iocb, &data);
+               if (retval >= 0) {
                        iocb->ki_pos += retval;
                        iov_iter_advance(iter, retval);
                }
                 * DAX files, so don't bother trying.
                 */
                if (retval < 0 || !iov_iter_count(iter) || iocb->ki_pos >= size ||
 -                  IS_DAX(inode)) {
 -                      file_accessed(file);
 +                  IS_DAX(inode))
                        goto out;
 -              }
        }
  
        retval = do_generic_file_read(file, &iocb->ki_pos, iter, retval);