Merge branch 'for-linus-4.7' of git://git.kernel.org/pub/scm/linux/kernel/git/mason...

[cascardo/linux.git] / fs / btrfs / inode.c
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c

index 2aaba58..8b1212e 100644 (file)
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -455,7 +455,7 @@ again:
  
         /*
          * skip compression for a small file range(<=blocksize) that
-        * isn't an inline extent, since it dosen't save disk space at all.
+        * isn't an inline extent, since it doesn't save disk space at all.
          */
         if (total_compressed <= blocksize &&
            (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
@@ -824,6 +824,7 @@ retry:
                                                 async_extent->ram_size - 1, 0);
                         goto out_free_reserve;
                 }
+               btrfs_dec_block_group_reservations(root->fs_info, ins.objectid);
  
                 /*
                  * clear dirty, set writeback and unlock the pages.
@@ -861,6 +862,7 @@ retry:
         }
         return;
  out_free_reserve:
+       btrfs_dec_block_group_reservations(root->fs_info, ins.objectid);
         btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
  out_free:
         extent_clear_unlock_delalloc(inode, async_extent->start,
@@ -1038,6 +1040,8 @@ static noinline int cow_file_range(struct inode *inode,
                                 goto out_drop_extent_cache;
                 }
  
+               btrfs_dec_block_group_reservations(root->fs_info, ins.objectid);
+
                 if (disk_num_bytes < cur_alloc_size)
                         break;
  
@@ -1066,6 +1070,7 @@ out:
  out_drop_extent_cache:
         btrfs_drop_extent_cache(inode, start, start + ram_size - 1, 0);
  out_reserve:
+       btrfs_dec_block_group_reservations(root->fs_info, ins.objectid);
         btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
  out_unlock:
         extent_clear_unlock_delalloc(inode, start, end, locked_page,
@@ -1377,6 +1382,9 @@ next_slot:
                          */
                         if (csum_exist_in_range(root, disk_bytenr, num_bytes))
                                 goto out_check;
+                       if (!btrfs_inc_nocow_writers(root->fs_info,
+                                                    disk_bytenr))
+                               goto out_check;
                         nocow = 1;
                 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
                         extent_end = found_key.offset +
@@ -1391,6 +1399,9 @@ out_check:
                         path->slots[0]++;
                         if (!nolock && nocow)
                                 btrfs_end_write_no_snapshoting(root);
+                       if (nocow)
+                               btrfs_dec_nocow_writers(root->fs_info,
+                                                       disk_bytenr);
                         goto next_slot;
                 }
                 if (!nocow) {
@@ -1411,6 +1422,9 @@ out_check:
                         if (ret) {
                                 if (!nolock && nocow)
                                         btrfs_end_write_no_snapshoting(root);
+                               if (nocow)
+                                       btrfs_dec_nocow_writers(root->fs_info,
+                                                               disk_bytenr);
                                 goto error;
                         }
                         cow_start = (u64)-1;
@@ -1453,6 +1467,8 @@ out_check:
  
                 ret = btrfs_add_ordered_extent(inode, cur_offset, disk_bytenr,
                                                num_bytes, num_bytes, type);
+               if (nocow)
+                       btrfs_dec_nocow_writers(root->fs_info, disk_bytenr);
                 BUG_ON(ret); /* -ENOMEM */
  
                 if (root->root_key.objectid ==
@@ -1962,7 +1978,7 @@ int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
  {
         WARN_ON((end & (PAGE_SIZE - 1)) == 0);
         return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end,
-                                  cached_state, GFP_NOFS);
+                                  cached_state);
  }
  
  /* see btrfs_writepage_start_hook for details on why this is required */
@@ -3103,8 +3119,7 @@ static int btrfs_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
  
         if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID &&
             test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1, NULL)) {
-               clear_extent_bits(io_tree, start, end, EXTENT_NODATASUM,
-                                 GFP_NOFS);
+               clear_extent_bits(io_tree, start, end, EXTENT_NODATASUM);
                 return 0;
         }
  
@@ -3706,7 +3721,7 @@ cache_index:
          * and doesn't have an inode ref with the name "bar" anymore.
          *
          * Setting last_unlink_trans to last_trans is a pessimistic approach,
-        * but it guarantees correctness at the expense of ocassional full
+        * but it guarantees correctness at the expense of occasional full
          * transaction commits on fsync if our inode is a directory, or if our
          * inode is not a directory, logging its parent unnecessarily.
          */
@@ -4962,7 +4977,7 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr)
                  * be instantly completed which will give us extents that need
                  * to be truncated.  If we fail to get an orphan inode down we
                  * could have left over extents that were never meant to live,
-                * so we need to garuntee from this point on that everything
+                * so we need to guarantee from this point on that everything
                  * will be consistent.
                  */
                 ret = btrfs_orphan_add(trans, inode);
@@ -5232,7 +5247,7 @@ void btrfs_evict_inode(struct inode *inode)
                 }
  
                 /*
-                * We can't just steal from the global reserve, we need tomake
+                * We can't just steal from the global reserve, we need to make
                  * sure there is room to do it, if not we need to commit and try
                  * again.
                  */
@@ -6964,7 +6979,18 @@ insert:
                  * existing will always be non-NULL, since there must be
                  * extent causing the -EEXIST.
                  */
-               if (start >= extent_map_end(existing) ||
+               if (existing->start == em->start &&
+                   extent_map_end(existing) == extent_map_end(em) &&
+                   em->block_start == existing->block_start) {
+                       /*
+                        * these two extents are the same, it happens
+                        * with inlines especially
+                        */
+                       free_extent_map(em);
+                       em = existing;
+                       err = 0;
+
+               } else if (start >= extent_map_end(existing) ||
                     start <= existing->start) {
                         /*
                          * The existing extent map is the one nearest to
@@ -7129,6 +7155,43 @@ out:
         return em;
  }
  
+/*
+ * Set up the extent map and the ordered extent for a direct IO write to the
+ * file range [start, start + len).
+ *
+ * Both steps run with the inode's dio_sem held for reading.  Unless @type is
+ * BTRFS_ORDERED_NOCOW, an extent map is first created with
+ * create_pinned_em(); then the ordered extent is added with
+ * btrfs_add_ordered_extent_dio().
+ *
+ * Returns the extent map created here, NULL when @type is
+ * BTRFS_ORDERED_NOCOW and the ordered extent was added successfully (no map
+ * is created in that case), or an ERR_PTR() if either step failed.
+ */
+static struct extent_map *btrfs_create_dio_extent(struct inode *inode,
+                                                 const u64 start,
+                                                 const u64 len,
+                                                 const u64 orig_start,
+                                                 const u64 block_start,
+                                                 const u64 block_len,
+                                                 const u64 orig_block_len,
+                                                 const u64 ram_bytes,
+                                                 const int type)
+{
+       struct extent_map *em = NULL;
+       int ret;
+
+       down_read(&BTRFS_I(inode)->dio_sem);
+       /* NOCOW requests skip the extent map; em stays NULL for them. */
+       if (type != BTRFS_ORDERED_NOCOW) {
+               em = create_pinned_em(inode, start, len, orig_start,
+                                     block_start, block_len, orig_block_len,
+                                     ram_bytes, type);
+               if (IS_ERR(em))
+                       goto out;
+       }
+       ret = btrfs_add_ordered_extent_dio(inode, start, block_start,
+                                          len, block_len, type);
+       if (ret) {
+               /*
+                * Undo the extent map created above: drop our reference and
+                * remove the range from the extent cache, then report the
+                * ordered extent error to the caller.
+                */
+               if (em) {
+                       free_extent_map(em);
+                       btrfs_drop_extent_cache(inode, start,
+                                               start + len - 1, 0);
+               }
+               em = ERR_PTR(ret);
+       }
+ out:
+       up_read(&BTRFS_I(inode)->dio_sem);
+
+       return em;
+}
+
  static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
                                                   u64 start, u64 len)
  {
@@ -7144,41 +7207,13 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
         if (ret)
                 return ERR_PTR(ret);
  
-       /*
-        * Create the ordered extent before the extent map. This is to avoid
-        * races with the fast fsync path that would lead to it logging file
-        * extent items that point to disk extents that were not yet written to.
-        * The fast fsync path collects ordered extents into a local list and
-        * then collects all the new extent maps, so we must create the ordered
-        * extent first and make sure the fast fsync path collects any new
-        * ordered extents after collecting new extent maps as well.
-        * The fsync path simply can not rely on inode_dio_wait() because it
-        * causes deadlock with AIO.
-        */
-       ret = btrfs_add_ordered_extent_dio(inode, start, ins.objectid,
-                                          ins.offset, ins.offset, 0);
-       if (ret) {
+       em = btrfs_create_dio_extent(inode, start, ins.offset, start,
+                                    ins.objectid, ins.offset, ins.offset,
+                                    ins.offset, 0);
+       btrfs_dec_block_group_reservations(root->fs_info, ins.objectid);
+       if (IS_ERR(em))
                 btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
-               return ERR_PTR(ret);
-       }
  
-       em = create_pinned_em(inode, start, ins.offset, start, ins.objectid,
-                             ins.offset, ins.offset, ins.offset, 0);
-       if (IS_ERR(em)) {
-               struct btrfs_ordered_extent *oe;
-
-               btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
-               oe = btrfs_lookup_ordered_extent(inode, start);
-               ASSERT(oe);
-               if (WARN_ON(!oe))
-                       return em;
-               set_bit(BTRFS_ORDERED_IOERR, &oe->flags);
-               set_bit(BTRFS_ORDERED_IO_DONE, &oe->flags);
-               btrfs_remove_ordered_extent(inode, oe);
-               /* Once for our lookup and once for the ordered extents tree. */
-               btrfs_put_ordered_extent(oe);
-               btrfs_put_ordered_extent(oe);
-       }
         return em;
  }
  
@@ -7408,7 +7443,7 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
                                  cached_state);
                 /*
                  * We're concerned with the entire range that we're going to be
-                * doing DIO to, so we need to make sure theres no ordered
+                * doing DIO to, so we need to make sure there's no ordered
                  * extents in this range.
                  */
                 ordered = btrfs_lookup_ordered_range(inode, lockstart,
@@ -7570,7 +7605,7 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
         if (current->journal_info) {
                 /*
                  * Need to pull our outstanding extents and set journal_info to NULL so
-                * that anything that needs to check if there's a transction doesn't get
+                * that anything that needs to check if there's a transaction doesn't get
                  * confused.
                  */
                 dio_data = current->journal_info;
@@ -7603,7 +7638,7 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
          * decompress it, so there will be buffering required no matter what we
          * do, so go ahead and fallback to buffered.
          *
-        * We return -ENOTBLK because thats what makes DIO go ahead and go back
+        * We return -ENOTBLK because that's what makes DIO go ahead and go back
          * to buffered IO.  Don't blame me, this is the price we pay for using
          * the generic code.
          */
@@ -7650,24 +7685,21 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
                 block_start = em->block_start + (start - em->start);
  
                 if (can_nocow_extent(inode, start, &len, &orig_start,
-                                    &orig_block_len, &ram_bytes) == 1) {
+                                    &orig_block_len, &ram_bytes) == 1 &&
+                   btrfs_inc_nocow_writers(root->fs_info, block_start)) {
+                       struct extent_map *em2;
+
+                       em2 = btrfs_create_dio_extent(inode, start, len,
+                                                     orig_start, block_start,
+                                                     len, orig_block_len,
+                                                     ram_bytes, type);
+                       btrfs_dec_nocow_writers(root->fs_info, block_start);
                         if (type == BTRFS_ORDERED_PREALLOC) {
                                 free_extent_map(em);
-                               em = create_pinned_em(inode, start, len,
-                                                      orig_start,
-                                                      block_start, len,
-                                                      orig_block_len,
-                                                      ram_bytes, type);
-                               if (IS_ERR(em)) {
-                                       ret = PTR_ERR(em);
-                                       goto unlock_err;
-                               }
+                               em = em2;
                         }
-
-                       ret = btrfs_add_ordered_extent_dio(inode, start,
-                                          block_start, len, len, type);
-                       if (ret) {
-                               free_extent_map(em);
+                       if (em2 && IS_ERR(em2)) {
+                               ret = PTR_ERR(em2);
                                 goto unlock_err;
                         }
                         goto unlock;
@@ -8541,13 +8573,13 @@ out:
         return retval;
  }
  
-static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
-                              loff_t offset)
+static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
  {
         struct file *file = iocb->ki_filp;
         struct inode *inode = file->f_mapping->host;
         struct btrfs_root *root = BTRFS_I(inode)->root;
         struct btrfs_dio_data dio_data = { 0 };
+       loff_t offset = iocb->ki_pos;
         size_t count = 0;
         int flags = 0;
         bool wakeup = true;
@@ -8607,7 +8639,7 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
  
         ret = __blockdev_direct_IO(iocb, inode,
                                    BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev,
-                                  iter, offset, btrfs_get_blocks_direct, NULL,
+                                  iter, btrfs_get_blocks_direct, NULL,
                                    btrfs_submit_direct, flags);
         if (iov_iter_rw(iter) == WRITE) {
                 current->journal_info = NULL;
@@ -9019,7 +9051,7 @@ static int btrfs_truncate(struct inode *inode)
                 return ret;
  
         /*
-        * Yes ladies and gentelment, this is indeed ugly.  The fact is we have
+        * Yes ladies and gentlemen, this is indeed ugly.  The fact is we have
          * 3 things going on here
          *
          * 1) We need to reserve space for our orphan item and the space to
@@ -9033,15 +9065,15 @@ static int btrfs_truncate(struct inode *inode)
          * space reserved in case it uses space during the truncate (thank you
          * very much snapshotting).
          *
-        * And we need these to all be seperate.  The fact is we can use alot of
+        * And we need these to all be separate.  The fact is we can use a lot of
          * space doing the truncate, and we have no earthly idea how much space
-        * we will use, so we need the truncate reservation to be seperate so it
+        * we will use, so we need the truncate reservation to be separate so it
          * doesn't end up using space reserved for updating the inode or
          * removing the orphan item.  We also need to be able to stop the
          * transaction and start a new one, which means we need to be able to
          * update the inode several times, and we have no idea of knowing how
          * many times that will be, so we can't just reserve 1 item for the
-        * entirety of the opration, so that has to be done seperately as well.
+        * entirety of the operation, so that has to be done separately as well.
          * Then there is the orphan item, which does indeed need to be held on
          * to for the whole operation, and we need nobody to touch this reserved
          * space except the orphan code.
@@ -9230,6 +9262,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
         INIT_LIST_HEAD(&ei->delalloc_inodes);
         INIT_LIST_HEAD(&ei->delayed_iput);
         RB_CLEAR_NODE(&ei->rb_node);
+       init_rwsem(&ei->dio_sem);
  
         return inode;
  }
@@ -9387,10 +9420,281 @@ static int btrfs_getattr(struct vfsmount *mnt,
         return 0;
  }
  
+/*
+ * Exchange two directory entries within a single transaction: afterwards the
+ * inode behind @old_dentry is linked as @new_dentry in @new_dir, and the inode
+ * behind @new_dentry is linked as @old_dentry in @old_dir.
+ *
+ * Either inode may be a subvolume root (inode number
+ * BTRFS_FIRST_FREE_OBJECTID).  Those are unlinked with btrfs_unlink_subvol()
+ * and force a full log commit; regular inodes pin the log of their root while
+ * the names are being switched.
+ *
+ * Returns 0 on success, -EXDEV when the two directories belong to different
+ * roots and at least one of the inodes is not a subvolume root, or the first
+ * error hit while modifying the trees or ending the transaction.
+ */
+static int btrfs_rename_exchange(struct inode *old_dir,
+                             struct dentry *old_dentry,
+                             struct inode *new_dir,
+                             struct dentry *new_dentry)
+{
+       struct btrfs_trans_handle *trans;
+       struct btrfs_root *root = BTRFS_I(old_dir)->root;
+       struct btrfs_root *dest = BTRFS_I(new_dir)->root;
+       struct inode *new_inode = d_inode(new_dentry);
+       struct inode *old_inode = d_inode(old_dentry);
+       struct timespec ctime = CURRENT_TIME;
+       struct dentry *parent;
+       u64 old_ino = btrfs_ino(old_inode);
+       u64 new_ino = btrfs_ino(new_inode);
+       u64 old_idx = 0;
+       u64 new_idx = 0;
+       u64 root_objectid;
+       int ret;
+       int ret2;
+       bool root_log_pinned = false;
+       bool dest_log_pinned = false;
+
+       /*
+        * Regular inodes can only be exchanged inside one root, because their
+        * items would otherwise end up referenced from a root they do not
+        * live in.  Only two subvolume links may be swapped across roots, so
+        * both inodes have to be checked, not just the source.
+        */
+       if (root != dest &&
+           (old_ino != BTRFS_FIRST_FREE_OBJECTID ||
+            new_ino != BTRFS_FIRST_FREE_OBJECTID))
+               return -EXDEV;
+
+       /*
+        * close the race window with snapshot create/destroy ioctl.  Both
+        * roots share one fs_info, so take the semaphore only once: a nested
+        * down_read() can deadlock if a writer queues up in between.
+        */
+       if (old_ino == BTRFS_FIRST_FREE_OBJECTID ||
+           new_ino == BTRFS_FIRST_FREE_OBJECTID)
+               down_read(&root->fs_info->subvol_sem);
+
+       /*
+        * We want to reserve the absolute worst case amount of items.  So if
+        * both inodes are subvols and we need to unlink them then that would
+        * require 4 item modifications, but if they are both normal inodes it
+        * would require 5 item modifications, so we'll assume they are normal
+        * inodes.  So 5 * 2 is 10, plus 2 for the new links, so 12 total items
+        * should cover the worst case number of items we'll modify.
+        */
+       trans = btrfs_start_transaction(root, 12);
+       if (IS_ERR(trans)) {
+               ret = PTR_ERR(trans);
+               goto out_notrans;
+       }
+
+       /* dest is modified as well, same as btrfs_rename() does. */
+       if (dest != root)
+               btrfs_record_root_in_trans(trans, dest);
+
+       /*
+        * We need to find a free sequence number both in the source and
+        * in the destination directory for the exchange.
+        */
+       ret = btrfs_set_inode_index(new_dir, &old_idx);
+       if (ret)
+               goto out_fail;
+       ret = btrfs_set_inode_index(old_dir, &new_idx);
+       if (ret)
+               goto out_fail;
+
+       BTRFS_I(old_inode)->dir_index = 0ULL;
+       BTRFS_I(new_inode)->dir_index = 0ULL;
+
+       /* Reference for the source. */
+       if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
+               /* force full log commit if subvolume involved. */
+               btrfs_set_log_full_commit(root->fs_info, trans);
+       } else {
+               btrfs_pin_log_trans(root);
+               root_log_pinned = true;
+               ret = btrfs_insert_inode_ref(trans, dest,
+                                            new_dentry->d_name.name,
+                                            new_dentry->d_name.len,
+                                            old_ino,
+                                            btrfs_ino(new_dir), old_idx);
+               if (ret)
+                       goto out_fail;
+       }
+
+       /* And now for the dest. */
+       if (new_ino == BTRFS_FIRST_FREE_OBJECTID) {
+               /* force full log commit if subvolume involved. */
+               btrfs_set_log_full_commit(dest->fs_info, trans);
+       } else {
+               btrfs_pin_log_trans(dest);
+               dest_log_pinned = true;
+               ret = btrfs_insert_inode_ref(trans, root,
+                                            old_dentry->d_name.name,
+                                            old_dentry->d_name.len,
+                                            new_ino,
+                                            btrfs_ino(old_dir), new_idx);
+               if (ret)
+                       goto out_fail;
+       }
+
+       /* Update inode version and ctime/mtime. */
+       inode_inc_iversion(old_dir);
+       inode_inc_iversion(new_dir);
+       inode_inc_iversion(old_inode);
+       inode_inc_iversion(new_inode);
+       old_dir->i_ctime = old_dir->i_mtime = ctime;
+       new_dir->i_ctime = new_dir->i_mtime = ctime;
+       old_inode->i_ctime = ctime;
+       new_inode->i_ctime = ctime;
+
+       if (old_dentry->d_parent != new_dentry->d_parent) {
+               btrfs_record_unlink_dir(trans, old_dir, old_inode, 1);
+               btrfs_record_unlink_dir(trans, new_dir, new_inode, 1);
+       }
+
+       /* src is a subvolume */
+       if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
+               root_objectid = BTRFS_I(old_inode)->root->root_key.objectid;
+               ret = btrfs_unlink_subvol(trans, root, old_dir,
+                                         root_objectid,
+                                         old_dentry->d_name.name,
+                                         old_dentry->d_name.len);
+       } else { /* src is an inode */
+               ret = __btrfs_unlink_inode(trans, root, old_dir,
+                                          old_inode,
+                                          old_dentry->d_name.name,
+                                          old_dentry->d_name.len);
+               if (!ret)
+                       ret = btrfs_update_inode(trans, root, old_inode);
+       }
+       if (ret) {
+               btrfs_abort_transaction(trans, root, ret);
+               goto out_fail;
+       }
+
+       /* dest is a subvolume */
+       if (new_ino == BTRFS_FIRST_FREE_OBJECTID) {
+               root_objectid = BTRFS_I(new_inode)->root->root_key.objectid;
+               ret = btrfs_unlink_subvol(trans, dest, new_dir,
+                                         root_objectid,
+                                         new_dentry->d_name.name,
+                                         new_dentry->d_name.len);
+       } else { /* dest is an inode */
+               ret = __btrfs_unlink_inode(trans, dest, new_dir,
+                                          new_inode,
+                                          new_dentry->d_name.name,
+                                          new_dentry->d_name.len);
+               if (!ret)
+                       ret = btrfs_update_inode(trans, dest, new_inode);
+       }
+       if (ret) {
+               btrfs_abort_transaction(trans, root, ret);
+               goto out_fail;
+       }
+
+       /* Both old names are gone; now link each inode under the other name. */
+       ret = btrfs_add_link(trans, new_dir, old_inode,
+                            new_dentry->d_name.name,
+                            new_dentry->d_name.len, 0, old_idx);
+       if (ret) {
+               btrfs_abort_transaction(trans, root, ret);
+               goto out_fail;
+       }
+
+       ret = btrfs_add_link(trans, old_dir, new_inode,
+                            old_dentry->d_name.name,
+                            old_dentry->d_name.len, 0, new_idx);
+       if (ret) {
+               btrfs_abort_transaction(trans, root, ret);
+               goto out_fail;
+       }
+
+       if (old_inode->i_nlink == 1)
+               BTRFS_I(old_inode)->dir_index = old_idx;
+       if (new_inode->i_nlink == 1)
+               BTRFS_I(new_inode)->dir_index = new_idx;
+
+       if (root_log_pinned) {
+               parent = new_dentry->d_parent;
+               btrfs_log_new_name(trans, old_inode, old_dir, parent);
+               btrfs_end_log_trans(root);
+               root_log_pinned = false;
+       }
+       if (dest_log_pinned) {
+               parent = old_dentry->d_parent;
+               btrfs_log_new_name(trans, new_inode, new_dir, parent);
+               btrfs_end_log_trans(dest);
+               dest_log_pinned = false;
+       }
+out_fail:
+       /*
+        * If we have pinned a log and an error happened, we unpin tasks
+        * trying to sync the log and force them to fallback to a transaction
+        * commit if the log currently contains any of the inodes involved in
+        * this rename operation (to ensure we do not persist a log with an
+        * inconsistent state for any of these inodes or leading to any
+        * inconsistencies when replayed). If the transaction was aborted, the
+        * abortion reason is propagated to userspace when attempting to commit
+        * the transaction. If the log does not contain any of these inodes, we
+        * allow the tasks to sync it.
+        */
+       if (ret && (root_log_pinned || dest_log_pinned)) {
+               if (btrfs_inode_in_log(old_dir, root->fs_info->generation) ||
+                   btrfs_inode_in_log(new_dir, root->fs_info->generation) ||
+                   btrfs_inode_in_log(old_inode, root->fs_info->generation) ||
+                   (new_inode &&
+                    btrfs_inode_in_log(new_inode, root->fs_info->generation)))
+                       btrfs_set_log_full_commit(root->fs_info, trans);
+
+               if (root_log_pinned) {
+                       btrfs_end_log_trans(root);
+                       root_log_pinned = false;
+               }
+               if (dest_log_pinned) {
+                       btrfs_end_log_trans(dest);
+                       dest_log_pinned = false;
+               }
+       }
+       /*
+        * Do not let a successful end of the transaction hide an earlier
+        * failure: keep the first error and only report the end_transaction
+        * result when everything before it succeeded.
+        */
+       ret2 = btrfs_end_transaction(trans, root);
+       ret = ret ? ret : ret2;
+out_notrans:
+       if (old_ino == BTRFS_FIRST_FREE_OBJECTID ||
+           new_ino == BTRFS_FIRST_FREE_OBJECTID)
+               up_read(&root->fs_info->subvol_sem);
+
+       return ret;
+}
+
+/*
+ * Create a whiteout inode named after @dentry in @dir, as part of a rename
+ * carried out in transaction @trans.
+ *
+ * A free inode number is taken from @root and a new inode with mode
+ * S_IFCHR | WHITEOUT_MODE and device WHITEOUT_DEV is created for it.  The
+ * inode then goes through btrfs_init_inode_security(), is linked into @dir
+ * with btrfs_add_nondir() and is finally written with btrfs_update_inode().
+ *
+ * Returns 0 on success or the error of the first step that failed.
+ */
+static int btrfs_whiteout_for_rename(struct btrfs_trans_handle *trans,
+                                    struct btrfs_root *root,
+                                    struct inode *dir,
+                                    struct dentry *dentry)
+{
+       int ret;
+       struct inode *inode;
+       u64 objectid;
+       u64 index;
+
+       ret = btrfs_find_free_ino(root, &objectid);
+       if (ret)
+               return ret;
+
+       inode = btrfs_new_inode(trans, root, dir,
+                               dentry->d_name.name,
+                               dentry->d_name.len,
+                               btrfs_ino(dir),
+                               objectid,
+                               S_IFCHR | WHITEOUT_MODE,
+                               &index);
+
+       /* No inode was created, so there is nothing to unlock or put. */
+       if (IS_ERR(inode)) {
+               ret = PTR_ERR(inode);
+               return ret;
+       }
+
+       inode->i_op = &btrfs_special_inode_operations;
+       init_special_inode(inode, inode->i_mode,
+               WHITEOUT_DEV);
+
+       ret = btrfs_init_inode_security(trans, inode, dir,
+                               &dentry->d_name);
+       if (ret)
+               goto out;
+
+       ret = btrfs_add_nondir(trans, dir, dentry,
+                               inode, 0, index);
+       if (ret)
+               goto out;
+
+       ret = btrfs_update_inode(trans, root, inode);
+out:
+       /*
+        * Reached on success and on every failure after the inode exists:
+        * unlock the new inode and drop our reference.  On failure the link
+        * count is decremented first.
+        */
+       unlock_new_inode(inode);
+       if (ret)
+               inode_dec_link_count(inode);
+       iput(inode);
+
+       return ret;
+}
+
  static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
-                          struct inode *new_dir, struct dentry *new_dentry)
+                          struct inode *new_dir, struct dentry *new_dentry,
+                          unsigned int flags)
  {
         struct btrfs_trans_handle *trans;
+       unsigned int trans_num_items;
         struct btrfs_root *root = BTRFS_I(old_dir)->root;
         struct btrfs_root *dest = BTRFS_I(new_dir)->root;
         struct inode *new_inode = d_inode(new_dentry);
@@ -9399,6 +9703,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
         u64 root_objectid;
         int ret;
         u64 old_ino = btrfs_ino(old_inode);
+       bool log_pinned = false;
  
         if (btrfs_ino(new_dir) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
                 return -EPERM;
@@ -9449,15 +9754,21 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
          * We want to reserve the absolute worst case amount of items.  So if
          * both inodes are subvols and we need to unlink them then that would
          * require 4 item modifications, but if they are both normal inodes it
-        * would require 5 item modifications, so we'll assume their normal
+        * would require 5 item modifications, so we'll assume they are normal
          * inodes.  So 5 * 2 is 10, plus 1 for the new link, so 11 total items
          * should cover the worst case number of items we'll modify.
+        * If our rename has the whiteout flag, we need 5 more units for the
+        * new inode (1 inode item, 1 inode ref, 2 dir items and 1 xattr item
+        * when selinux is enabled).
          */
-       trans = btrfs_start_transaction(root, 11);
+       trans_num_items = 11;
+       if (flags & RENAME_WHITEOUT)
+               trans_num_items += 5;
+       trans = btrfs_start_transaction(root, trans_num_items);
         if (IS_ERR(trans)) {
-                ret = PTR_ERR(trans);
-                goto out_notrans;
-        }
+               ret = PTR_ERR(trans);
+               goto out_notrans;
+       }
  
         if (dest != root)
                 btrfs_record_root_in_trans(trans, dest);
@@ -9471,6 +9782,8 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
                 /* force full log commit if subvolume involved. */
                 btrfs_set_log_full_commit(root->fs_info, trans);
         } else {
+               btrfs_pin_log_trans(root);
+               log_pinned = true;
                 ret = btrfs_insert_inode_ref(trans, dest,
                                              new_dentry->d_name.name,
                                              new_dentry->d_name.len,
@@ -9478,14 +9791,6 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
                                              btrfs_ino(new_dir), index);
                 if (ret)
                         goto out_fail;
-               /*
-                * this is an ugly little race, but the rename is required
-                * to make sure that if we crash, the inode is either at the
-                * old name or the new one.  pinning the log transaction lets
-                * us make sure we don't allow a log commit to come in after
-                * we unlink the name but before we add the new name back in.
-                */
-               btrfs_pin_log_trans(root);
         }
  
         inode_inc_iversion(old_dir);
@@ -9552,12 +9857,46 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
         if (old_inode->i_nlink == 1)
                 BTRFS_I(old_inode)->dir_index = index;
  
-       if (old_ino != BTRFS_FIRST_FREE_OBJECTID) {
+       if (log_pinned) {
                 struct dentry *parent = new_dentry->d_parent;
+
                 btrfs_log_new_name(trans, old_inode, old_dir, parent);
                 btrfs_end_log_trans(root);
+               log_pinned = false;
+       }
+
+       if (flags & RENAME_WHITEOUT) {
+               ret = btrfs_whiteout_for_rename(trans, root, old_dir,
+                                               old_dentry);
+
+               if (ret) {
+                       btrfs_abort_transaction(trans, root, ret);
+                       goto out_fail;
+               }
         }
  out_fail:
+       /*
+        * If we have pinned the log and an error happened, we unpin tasks
+        * trying to sync the log and force them to fallback to a transaction
+        * commit if the log currently contains any of the inodes involved in
+        * this rename operation (to ensure we do not persist a log with an
+        * inconsistent state for any of these inodes or leading to any
+        * inconsistencies when replayed). If the transaction was aborted, the
+        * abortion reason is propagated to userspace when attempting to commit
+        * the transaction. If the log does not contain any of these inodes, we
+        * allow the tasks to sync it.
+        */
+       if (ret && log_pinned) {
+               if (btrfs_inode_in_log(old_dir, root->fs_info->generation) ||
+                   btrfs_inode_in_log(new_dir, root->fs_info->generation) ||
+                   btrfs_inode_in_log(old_inode, root->fs_info->generation) ||
+                   (new_inode &&
+                    btrfs_inode_in_log(new_inode, root->fs_info->generation)))
+                   btrfs_set_log_full_commit(root->fs_info, trans);
+
+               btrfs_end_log_trans(root);
+               log_pinned = false;
+       }
         btrfs_end_transaction(trans, root);
  out_notrans:
         if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
@@ -9570,10 +9909,14 @@ static int btrfs_rename2(struct inode *old_dir, struct dentry *old_dentry,
                          struct inode *new_dir, struct dentry *new_dentry,
                          unsigned int flags)
  {
-       if (flags & ~RENAME_NOREPLACE)
+       if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
                 return -EINVAL;
  
-       return btrfs_rename(old_dir, old_dentry, new_dir, new_dentry);
+       if (flags & RENAME_EXCHANGE)
+               return btrfs_rename_exchange(old_dir, old_dentry, new_dir,
+                                         new_dentry);
+
+       return btrfs_rename(old_dir, old_dentry, new_dir, new_dentry, flags);
  }
  
  static void btrfs_run_delalloc_work(struct btrfs_work *work)
@@ -9942,6 +10285,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
                                 btrfs_end_transaction(trans, root);
                         break;
                 }
+               btrfs_dec_block_group_reservations(root->fs_info, ins.objectid);
  
                 last_alloc = ins.offset;
                 ret = insert_reserved_file_extent(trans, inode,
@@ -10160,10 +10504,10 @@ static const struct inode_operations btrfs_dir_inode_operations = {
         .symlink        = btrfs_symlink,
         .setattr        = btrfs_setattr,
         .mknod          = btrfs_mknod,
-       .setxattr       = btrfs_setxattr,
+       .setxattr       = generic_setxattr,
         .getxattr       = generic_getxattr,
         .listxattr      = btrfs_listxattr,
-       .removexattr    = btrfs_removexattr,
+       .removexattr    = generic_removexattr,
         .permission     = btrfs_permission,
         .get_acl        = btrfs_get_acl,
         .set_acl        = btrfs_set_acl,
@@ -10184,7 +10528,7 @@ static const struct file_operations btrfs_dir_file_operations = {
         .iterate        = btrfs_real_readdir,
         .unlocked_ioctl = btrfs_ioctl,
  #ifdef CONFIG_COMPAT
-       .compat_ioctl   = btrfs_ioctl,
+       .compat_ioctl   = btrfs_compat_ioctl,
  #endif
         .release        = btrfs_release_file,
         .fsync          = btrfs_sync_file,
@@ -10237,10 +10581,10 @@ static const struct address_space_operations btrfs_symlink_aops = {
  static const struct inode_operations btrfs_file_inode_operations = {
         .getattr        = btrfs_getattr,
         .setattr        = btrfs_setattr,
-       .setxattr       = btrfs_setxattr,
+       .setxattr       = generic_setxattr,
         .getxattr       = generic_getxattr,
         .listxattr      = btrfs_listxattr,
-       .removexattr    = btrfs_removexattr,
+       .removexattr    = generic_removexattr,
         .permission     = btrfs_permission,
         .fiemap         = btrfs_fiemap,
         .get_acl        = btrfs_get_acl,
@@ -10251,10 +10595,10 @@ static const struct inode_operations btrfs_special_inode_operations = {
         .getattr        = btrfs_getattr,
         .setattr        = btrfs_setattr,
         .permission     = btrfs_permission,
-       .setxattr       = btrfs_setxattr,
+       .setxattr       = generic_setxattr,
         .getxattr       = generic_getxattr,
         .listxattr      = btrfs_listxattr,
-       .removexattr    = btrfs_removexattr,
+       .removexattr    = generic_removexattr,
         .get_acl        = btrfs_get_acl,
         .set_acl        = btrfs_set_acl,
         .update_time    = btrfs_update_time,
@@ -10265,10 +10609,10 @@ static const struct inode_operations btrfs_symlink_inode_operations = {
         .getattr        = btrfs_getattr,
         .setattr        = btrfs_setattr,
         .permission     = btrfs_permission,
-       .setxattr       = btrfs_setxattr,
+       .setxattr       = generic_setxattr,
         .getxattr       = generic_getxattr,
         .listxattr      = btrfs_listxattr,
-       .removexattr    = btrfs_removexattr,
+       .removexattr    = generic_removexattr,
         .update_time    = btrfs_update_time,
  };