Merge tag 'iwlwifi-for-john-2014-10-23' of git://git.kernel.org/pub/scm/linux/kernel...
[cascardo/linux.git] / fs/btrfs/file.c
index ff1cc03..a18ceab 100644
@@ -299,7 +299,7 @@ static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,
 
        /* get the inode */
        key.objectid = defrag->root;
-       btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
+       key.type = BTRFS_ROOT_ITEM_KEY;
        key.offset = (u64)-1;
 
        index = srcu_read_lock(&fs_info->subvol_srcu);
@@ -311,7 +311,7 @@ static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,
        }
 
        key.objectid = defrag->ino;
-       btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
+       key.type = BTRFS_INODE_ITEM_KEY;
        key.offset = 0;
        inode = btrfs_iget(fs_info->sb, &key, inode_root, NULL);
        if (IS_ERR(inode)) {
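The two hunks above drop the btrfs_set_key_type() wrapper in favour of assigning key.type directly. A minimal user-space sketch of why the two spellings are equivalent; the btrfs_key layout is simplified (no packed attribute, kernel typedefs replaced) and the wrapper is reconstructed here as the trivial setter it was:

#include <assert.h>
#include <stdint.h>
#include <string.h>

/*
 * Simplified copy of the in-memory (CPU byte order) search key; the real
 * definition is packed and uses kernel typedefs.
 */
struct btrfs_key {
        uint64_t objectid;
        uint8_t  type;
        uint64_t offset;
};

#define BTRFS_INODE_ITEM_KEY 1          /* value from the on-disk format headers */

/* Reconstruction of the removed wrapper: it was only a trivial setter. */
static void btrfs_set_key_type(struct btrfs_key *key, uint8_t type)
{
        key->type = type;
}

int main(void)
{
        struct btrfs_key a, b;

        memset(&a, 0, sizeof(a));
        memset(&b, 0, sizeof(b));

        btrfs_set_key_type(&a, BTRFS_INODE_ITEM_KEY);   /* old spelling */
        b.type = BTRFS_INODE_ITEM_KEY;                  /* new spelling */

        assert(memcmp(&a, &b, sizeof(a)) == 0);
        return 0;
}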
@@ -452,7 +452,7 @@ static noinline int btrfs_copy_from_user(loff_t pos, int num_pages,
                if (unlikely(copied == 0))
                        break;
 
-               if (unlikely(copied < PAGE_CACHE_SIZE - offset)) {
+               if (copied < PAGE_CACHE_SIZE - offset) {
                        offset += copied;
                } else {
                        pg++;
@@ -1481,9 +1481,8 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
        bool force_page_uptodate = false;
        bool need_unlock;
 
-       nrptrs = min((iov_iter_count(i) + PAGE_CACHE_SIZE - 1) /
-                    PAGE_CACHE_SIZE, PAGE_CACHE_SIZE /
-                    (sizeof(struct page *)));
+       nrptrs = min(DIV_ROUND_UP(iov_iter_count(i), PAGE_CACHE_SIZE),
+                       PAGE_CACHE_SIZE / (sizeof(struct page *)));
        nrptrs = min(nrptrs, current->nr_dirtied_pause - current->nr_dirtied);
        nrptrs = max(nrptrs, 8);
        pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL);
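This hunk, and the similar ones below, replace open-coded "add divisor minus one, then shift" arithmetic with DIV_ROUND_UP(). A small stand-alone check, assuming 4 KiB pages (PAGE_CACHE_SHIFT of 12) and the usual kernel definition of DIV_ROUND_UP, that the two forms agree:

#include <assert.h>
#include <stddef.h>

/* Same definition as include/linux/kernel.h. */
#define DIV_ROUND_UP(n, d)      (((n) + (d) - 1) / (d))

#define PAGE_CACHE_SHIFT        12              /* assuming 4 KiB pages */
#define PAGE_CACHE_SIZE         (1UL << PAGE_CACHE_SHIFT)

int main(void)
{
        size_t bytes;

        for (bytes = 0; bytes < 4 * PAGE_CACHE_SIZE; bytes += 511) {
                size_t old_pages = (bytes + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
                size_t new_pages = DIV_ROUND_UP(bytes, PAGE_CACHE_SIZE);

                assert(old_pages == new_pages);
        }
        return 0;
}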
@@ -1497,8 +1496,8 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
                size_t write_bytes = min(iov_iter_count(i),
                                         nrptrs * (size_t)PAGE_CACHE_SIZE -
                                         offset);
-               size_t num_pages = (write_bytes + offset +
-                                   PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+               size_t num_pages = DIV_ROUND_UP(write_bytes + offset,
+                                               PAGE_CACHE_SIZE);
                size_t reserve_bytes;
                size_t dirty_pages;
                size_t copied;
@@ -1526,9 +1525,8 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
                                 * our prealloc extent may be smaller than
                                 * write_bytes, so scale down.
                                 */
-                               num_pages = (write_bytes + offset +
-                                            PAGE_CACHE_SIZE - 1) >>
-                                       PAGE_CACHE_SHIFT;
+                               num_pages = DIV_ROUND_UP(write_bytes + offset,
+                                                        PAGE_CACHE_SIZE);
                                reserve_bytes = num_pages << PAGE_CACHE_SHIFT;
                                ret = 0;
                        } else {
@@ -1590,9 +1588,8 @@ again:
                        dirty_pages = 0;
                } else {
                        force_page_uptodate = false;
-                       dirty_pages = (copied + offset +
-                                      PAGE_CACHE_SIZE - 1) >>
-                                      PAGE_CACHE_SHIFT;
+                       dirty_pages = DIV_ROUND_UP(copied + offset,
+                                                  PAGE_CACHE_SIZE);
                }
 
                /*
@@ -1653,7 +1650,7 @@ again:
                cond_resched();
 
                balance_dirty_pages_ratelimited(inode->i_mapping);
-               if (dirty_pages < (root->leafsize >> PAGE_CACHE_SHIFT) + 1)
+               if (dirty_pages < (root->nodesize >> PAGE_CACHE_SHIFT) + 1)
                        btrfs_btree_balance_dirty(root);
 
                pos += copied;
@@ -1795,7 +1792,7 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
        if (sync)
                atomic_inc(&BTRFS_I(inode)->sync_writers);
 
-       if (unlikely(file->f_flags & O_DIRECT)) {
+       if (file->f_flags & O_DIRECT) {
                num_written = __btrfs_direct_write(iocb, from, pos);
        } else {
                num_written = __btrfs_buffered_write(file, from, pos);
@@ -1852,6 +1849,20 @@ int btrfs_release_file(struct inode *inode, struct file *filp)
        return 0;
 }
 
+static int start_ordered_ops(struct inode *inode, loff_t start, loff_t end)
+{
+       int ret;
+
+       atomic_inc(&BTRFS_I(inode)->sync_writers);
+       ret = filemap_fdatawrite_range(inode->i_mapping, start, end);
+       if (!ret && test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
+                            &BTRFS_I(inode)->runtime_flags))
+               ret = filemap_fdatawrite_range(inode->i_mapping, start, end);
+       atomic_dec(&BTRFS_I(inode)->sync_writers);
+
+       return ret;
+}
+
 /*
  * fsync call for both files and directories.  This logs the inode into
  * the tree log instead of forcing full commits whenever possible.
@@ -1881,30 +1892,64 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
         * multi-task, and improve performance.  See
         * btrfs_wait_ordered_range for an explanation of the ASYNC check.
         */
-       atomic_inc(&BTRFS_I(inode)->sync_writers);
-       ret = filemap_fdatawrite_range(inode->i_mapping, start, end);
-       if (!ret && test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
-                            &BTRFS_I(inode)->runtime_flags))
-               ret = filemap_fdatawrite_range(inode->i_mapping, start, end);
-       atomic_dec(&BTRFS_I(inode)->sync_writers);
+       ret = start_ordered_ops(inode, start, end);
        if (ret)
                return ret;
 
        mutex_lock(&inode->i_mutex);
-
-       /*
-        * We flush the dirty pages again to avoid some dirty pages in the
-        * range being left.
-        */
        atomic_inc(&root->log_batch);
        full_sync = test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
                             &BTRFS_I(inode)->runtime_flags);
+       /*
+        * We might have had more pages made dirty after calling
+        * start_ordered_ops and before acquiring the inode's i_mutex.
+        */
        if (full_sync) {
+               /*
+                * For a full sync, we need to make sure any ordered operations
+                * start and finish before we start logging the inode, so that
+                * all extents are persisted and the respective file extent
+                * items are in the fs/subvol btree.
+                */
                ret = btrfs_wait_ordered_range(inode, start, end - start + 1);
-               if (ret) {
-                       mutex_unlock(&inode->i_mutex);
-                       goto out;
-               }
+       } else {
+               /*
+                * Start any new ordered operations before starting to log the
+                * inode. We will wait for them to finish in btrfs_sync_log().
+                *
+                * Right before acquiring the inode's mutex, we might have new
+                * writes dirtying pages, which won't immediately start the
+                * respective ordered operations - that is done through the
+                * fill_delalloc callbacks invoked from the writepage and
+                * writepages address space operations. So make sure we start
+                * all ordered operations before starting to log our inode. Not
+                * doing this means that while logging the inode, writeback
+                * could start and invoke writepage/writepages, which would call
+                * the fill_delalloc callbacks (cow_file_range,
+                * submit_compressed_extents). These callbacks add first an
+                * extent map to the modified list of extents and then create
+                * the respective ordered operation, which means in
+                * tree-log.c:btrfs_log_inode() we might capture all existing
+                * ordered operations (with btrfs_get_logged_extents()) before
+                * the fill_delalloc callback adds its ordered operation, and by
+                * the time we visit the modified list of extent maps (with
+                * btrfs_log_changed_extents()), we see and process the extent
+                * map they created. We then use the extent map to construct a
+                * file extent item for logging without waiting for the
+                * respective ordered operation to finish - this file extent
+                * item points to a disk location that might not have yet been
+                * written to, containing random data - so after a crash a log
+                * replay will make our inode have file extent items that point
+                * to disk locations containing invalid data, as we returned
+                * success to userspace without waiting for the respective
+                * ordered operation to finish, because it wasn't captured by
+                * btrfs_get_logged_extents().
+                */
+               ret = start_ordered_ops(inode, start, end);
+       }
+       if (ret) {
+               mutex_unlock(&inode->i_mutex);
+               goto out;
        }
        atomic_inc(&root->log_batch);
 
@@ -1984,6 +2029,25 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
         */
        mutex_unlock(&inode->i_mutex);
 
+       /*
+        * If any of the ordered extents had an error, just return it to user
+        * space, so that the application knows some writes didn't succeed and
+        * can take proper action (e.g. retry). Blindly committing the
+        * transaction in this case would fool userspace that everything was
+        * successful. And we also want to make sure our log doesn't contain
+        * file extent items pointing to extents that weren't fully written to -
+        * just like in the non fast fsync path, where we check for the ordered
+        * operation's error flag before writing to the log tree and return -EIO
+        * if any of them had this flag set (btrfs_wait_ordered_range) -
+        * therefore we need to check for errors in the ordered operations,
+        * which are indicated by ctx.io_err.
+        */
+       if (ctx.io_err) {
+               btrfs_end_transaction(trans, root);
+               ret = ctx.io_err;
+               goto out;
+       }
+
        if (ret != BTRFS_NO_LOG_SYNC) {
                if (!ret) {
                        ret = btrfs_sync_log(trans, root, &ctx);
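The ctx.io_err check added above makes the fast fsync path return ordered-extent I/O errors to user space rather than committing as if everything succeeded. A hypothetical user-space caller (file name and data are made up) that depends on this behaviour by checking fsync()'s return value, as the comment suggests:

#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
        const char buf[] = "some data";
        int fd = open("testfile", O_WRONLY | O_CREAT | O_TRUNC, 0644);

        if (fd < 0 || write(fd, buf, sizeof(buf)) != (ssize_t)sizeof(buf)) {
                perror("open/write");
                return 1;
        }

        /*
         * With the change above, an I/O error hit by one of the ordered
         * extents is propagated here instead of being hidden by a
         * transaction commit, so the caller can retry or report it.
         */
        if (fsync(fd) != 0)
                fprintf(stderr, "fsync failed: %s\n", strerror(errno));

        close(fd);
        return 0;
}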
@@ -2621,23 +2685,28 @@ static int find_desired_extent(struct inode *inode, loff_t *offset, int whence)
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct extent_map *em = NULL;
        struct extent_state *cached_state = NULL;
-       u64 lockstart = *offset;
-       u64 lockend = i_size_read(inode);
-       u64 start = *offset;
-       u64 len = i_size_read(inode);
+       u64 lockstart;
+       u64 lockend;
+       u64 start;
+       u64 len;
        int ret = 0;
 
-       lockend = max_t(u64, root->sectorsize, lockend);
+       if (inode->i_size == 0)
+               return -ENXIO;
+
+       /*
+        * *offset can be negative, in which case we start finding DATA/HOLE from
+        * the very start of the file.
+        */
+       start = max_t(loff_t, 0, *offset);
+
+       lockstart = round_down(start, root->sectorsize);
+       lockend = round_up(i_size_read(inode), root->sectorsize);
        if (lockend <= lockstart)
                lockend = lockstart + root->sectorsize;
-
        lockend--;
        len = lockend - lockstart + 1;
 
-       len = max_t(u64, len, root->sectorsize);
-       if (inode->i_size == 0)
-               return -ENXIO;
-
        lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend, 0,
                         &cached_state);
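The rewritten prologue clamps a possibly negative *offset to zero and derives a sector-aligned, inclusive lock range with round_down()/round_up(). A rough sketch of that arithmetic, assuming a 4096-byte root->sectorsize and hypothetical offset/i_size values (the rounding macros are simplified, power-of-two-only versions of the kernel helpers):

#include <assert.h>
#include <stdint.h>

/* Power-of-two rounding, equivalent to the kernel helpers for this use. */
#define round_down(x, y)        ((x) & ~((uint64_t)(y) - 1))
#define round_up(x, y)          round_down((x) + (y) - 1, y)

int main(void)
{
        const uint64_t sectorsize = 4096;       /* assumed root->sectorsize */
        int64_t  offset = -100;                 /* *offset may be negative  */
        uint64_t isize  = 10000;                /* hypothetical i_size      */
        uint64_t start, lockstart, lockend, len;

        /* negative offsets are clamped so the search starts at byte 0 */
        start = offset > 0 ? (uint64_t)offset : 0;

        lockstart = round_down(start, sectorsize);      /* 0     */
        lockend   = round_up(isize, sectorsize);        /* 12288 */
        if (lockend <= lockstart)
                lockend = lockstart + sectorsize;
        lockend--;                                      /* inclusive end: 12287 */
        len = lockend - lockstart + 1;                  /* 12288, sector aligned */

        assert(lockstart % sectorsize == 0);
        assert(len % sectorsize == 0);
        return 0;
}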