Merge branch 'for-linus-4.7' of git://git.kernel.org/pub/scm/linux/kernel/git/mason...
[cascardo/linux.git] / fs / btrfs / inode.c
index 2aaba58..8b1212e 100644 (file)
@@ -455,7 +455,7 @@ again:
 
        /*
         * skip compression for a small file range(<=blocksize) that
-        * isn't an inline extent, since it dosen't save disk space at all.
+        * isn't an inline extent, since it doesn't save disk space at all.
         */
        if (total_compressed <= blocksize &&
           (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
@@ -824,6 +824,7 @@ retry:
                                                async_extent->ram_size - 1, 0);
                        goto out_free_reserve;
                }
+               btrfs_dec_block_group_reservations(root->fs_info, ins.objectid);
 
                /*
                 * clear dirty, set writeback and unlock the pages.
@@ -861,6 +862,7 @@ retry:
        }
        return;
 out_free_reserve:
+       btrfs_dec_block_group_reservations(root->fs_info, ins.objectid);
        btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
 out_free:
        extent_clear_unlock_delalloc(inode, async_extent->start,
@@ -1038,6 +1040,8 @@ static noinline int cow_file_range(struct inode *inode,
                                goto out_drop_extent_cache;
                }
 
+               btrfs_dec_block_group_reservations(root->fs_info, ins.objectid);
+
                if (disk_num_bytes < cur_alloc_size)
                        break;
 
@@ -1066,6 +1070,7 @@ out:
 out_drop_extent_cache:
        btrfs_drop_extent_cache(inode, start, start + ram_size - 1, 0);
 out_reserve:
+       btrfs_dec_block_group_reservations(root->fs_info, ins.objectid);
        btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
 out_unlock:
        extent_clear_unlock_delalloc(inode, start, end, locked_page,
@@ -1377,6 +1382,9 @@ next_slot:
                         */
                        if (csum_exist_in_range(root, disk_bytenr, num_bytes))
                                goto out_check;
+                       if (!btrfs_inc_nocow_writers(root->fs_info,
+                                                    disk_bytenr))
+                               goto out_check;
                        nocow = 1;
                } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
                        extent_end = found_key.offset +
@@ -1391,6 +1399,9 @@ out_check:
                        path->slots[0]++;
                        if (!nolock && nocow)
                                btrfs_end_write_no_snapshoting(root);
+                       if (nocow)
+                               btrfs_dec_nocow_writers(root->fs_info,
+                                                       disk_bytenr);
                        goto next_slot;
                }
                if (!nocow) {
@@ -1411,6 +1422,9 @@ out_check:
                        if (ret) {
                                if (!nolock && nocow)
                                        btrfs_end_write_no_snapshoting(root);
+                               if (nocow)
+                                       btrfs_dec_nocow_writers(root->fs_info,
+                                                               disk_bytenr);
                                goto error;
                        }
                        cow_start = (u64)-1;
@@ -1453,6 +1467,8 @@ out_check:
 
                ret = btrfs_add_ordered_extent(inode, cur_offset, disk_bytenr,
                                               num_bytes, num_bytes, type);
+               if (nocow)
+                       btrfs_dec_nocow_writers(root->fs_info, disk_bytenr);
                BUG_ON(ret); /* -ENOMEM */
 
                if (root->root_key.objectid ==
@@ -1962,7 +1978,7 @@ int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
 {
        WARN_ON((end & (PAGE_SIZE - 1)) == 0);
        return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end,
-                                  cached_state, GFP_NOFS);
+                                  cached_state);
 }
 
 /* see btrfs_writepage_start_hook for details on why this is required */
@@ -3103,8 +3119,7 @@ static int btrfs_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
 
        if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID &&
            test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1, NULL)) {
-               clear_extent_bits(io_tree, start, end, EXTENT_NODATASUM,
-                                 GFP_NOFS);
+               clear_extent_bits(io_tree, start, end, EXTENT_NODATASUM);
                return 0;
        }
 
@@ -3706,7 +3721,7 @@ cache_index:
         * and doesn't have an inode ref with the name "bar" anymore.
         *
         * Setting last_unlink_trans to last_trans is a pessimistic approach,
-        * but it guarantees correctness at the expense of ocassional full
+        * but it guarantees correctness at the expense of occasional full
         * transaction commits on fsync if our inode is a directory, or if our
         * inode is not a directory, logging its parent unnecessarily.
         */
@@ -4962,7 +4977,7 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr)
                 * be instantly completed which will give us extents that need
                 * to be truncated.  If we fail to get an orphan inode down we
                 * could have left over extents that were never meant to live,
-                * so we need to garuntee from this point on that everything
+                * so we need to guarantee from this point on that everything
                 * will be consistent.
                 */
                ret = btrfs_orphan_add(trans, inode);
@@ -5232,7 +5247,7 @@ void btrfs_evict_inode(struct inode *inode)
                }
 
                /*
-                * We can't just steal from the global reserve, we need tomake
+                * We can't just steal from the global reserve, we need to make
                 * sure there is room to do it, if not we need to commit and try
                 * again.
                 */
@@ -6964,7 +6979,18 @@ insert:
                 * existing will always be non-NULL, since there must be
                 * extent causing the -EEXIST.
                 */
-               if (start >= extent_map_end(existing) ||
+               if (existing->start == em->start &&
+                   extent_map_end(existing) == extent_map_end(em) &&
+                   em->block_start == existing->block_start) {
+                       /*
+                        * these two extents are the same, it happens
+                        * with inlines especially
+                        */
+                       free_extent_map(em);
+                       em = existing;
+                       err = 0;
+
+               } else if (start >= extent_map_end(existing) ||
                    start <= existing->start) {
                        /*
                         * The existing extent map is the one nearest to
@@ -7129,6 +7155,43 @@ out:
        return em;
 }
 
+static struct extent_map *btrfs_create_dio_extent(struct inode *inode,
+                                                 const u64 start,
+                                                 const u64 len,
+                                                 const u64 orig_start,
+                                                 const u64 block_start,
+                                                 const u64 block_len,
+                                                 const u64 orig_block_len,
+                                                 const u64 ram_bytes,
+                                                 const int type)
+{
+       struct extent_map *em = NULL;
+       int ret;
+
+       down_read(&BTRFS_I(inode)->dio_sem);
+       if (type != BTRFS_ORDERED_NOCOW) {
+               em = create_pinned_em(inode, start, len, orig_start,
+                                     block_start, block_len, orig_block_len,
+                                     ram_bytes, type);
+               if (IS_ERR(em))
+                       goto out;
+       }
+       ret = btrfs_add_ordered_extent_dio(inode, start, block_start,
+                                          len, block_len, type);
+       if (ret) {
+               if (em) {
+                       free_extent_map(em);
+                       btrfs_drop_extent_cache(inode, start,
+                                               start + len - 1, 0);
+               }
+               em = ERR_PTR(ret);
+       }
+ out:
+       up_read(&BTRFS_I(inode)->dio_sem);
+
+       return em;
+}
+
 static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
                                                  u64 start, u64 len)
 {
@@ -7144,41 +7207,13 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
        if (ret)
                return ERR_PTR(ret);
 
-       /*
-        * Create the ordered extent before the extent map. This is to avoid
-        * races with the fast fsync path that would lead to it logging file
-        * extent items that point to disk extents that were not yet written to.
-        * The fast fsync path collects ordered extents into a local list and
-        * then collects all the new extent maps, so we must create the ordered
-        * extent first and make sure the fast fsync path collects any new
-        * ordered extents after collecting new extent maps as well.
-        * The fsync path simply can not rely on inode_dio_wait() because it
-        * causes deadlock with AIO.
-        */
-       ret = btrfs_add_ordered_extent_dio(inode, start, ins.objectid,
-                                          ins.offset, ins.offset, 0);
-       if (ret) {
+       em = btrfs_create_dio_extent(inode, start, ins.offset, start,
+                                    ins.objectid, ins.offset, ins.offset,
+                                    ins.offset, 0);
+       btrfs_dec_block_group_reservations(root->fs_info, ins.objectid);
+       if (IS_ERR(em))
                btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
-               return ERR_PTR(ret);
-       }
 
-       em = create_pinned_em(inode, start, ins.offset, start, ins.objectid,
-                             ins.offset, ins.offset, ins.offset, 0);
-       if (IS_ERR(em)) {
-               struct btrfs_ordered_extent *oe;
-
-               btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
-               oe = btrfs_lookup_ordered_extent(inode, start);
-               ASSERT(oe);
-               if (WARN_ON(!oe))
-                       return em;
-               set_bit(BTRFS_ORDERED_IOERR, &oe->flags);
-               set_bit(BTRFS_ORDERED_IO_DONE, &oe->flags);
-               btrfs_remove_ordered_extent(inode, oe);
-               /* Once for our lookup and once for the ordered extents tree. */
-               btrfs_put_ordered_extent(oe);
-               btrfs_put_ordered_extent(oe);
-       }
        return em;
 }
 
@@ -7408,7 +7443,7 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
                                 cached_state);
                /*
                 * We're concerned with the entire range that we're going to be
-                * doing DIO to, so we need to make sure theres no ordered
+                * doing DIO to, so we need to make sure there's no ordered
                 * extents in this range.
                 */
                ordered = btrfs_lookup_ordered_range(inode, lockstart,
@@ -7570,7 +7605,7 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
        if (current->journal_info) {
                /*
                 * Need to pull our outstanding extents and set journal_info to NULL so
-                * that anything that needs to check if there's a transction doesn't get
+                * that anything that needs to check if there's a transaction doesn't get
                 * confused.
                 */
                dio_data = current->journal_info;
@@ -7603,7 +7638,7 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
         * decompress it, so there will be buffering required no matter what we
         * do, so go ahead and fallback to buffered.
         *
-        * We return -ENOTBLK because thats what makes DIO go ahead and go back
+        * We return -ENOTBLK because that's what makes DIO go ahead and go back
         * to buffered IO.  Don't blame me, this is the price we pay for using
         * the generic code.
         */
@@ -7650,24 +7685,21 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
                block_start = em->block_start + (start - em->start);
 
                if (can_nocow_extent(inode, start, &len, &orig_start,
-                                    &orig_block_len, &ram_bytes) == 1) {
+                                    &orig_block_len, &ram_bytes) == 1 &&
+                   btrfs_inc_nocow_writers(root->fs_info, block_start)) {
+                       struct extent_map *em2;
+
+                       em2 = btrfs_create_dio_extent(inode, start, len,
+                                                     orig_start, block_start,
+                                                     len, orig_block_len,
+                                                     ram_bytes, type);
+                       btrfs_dec_nocow_writers(root->fs_info, block_start);
                        if (type == BTRFS_ORDERED_PREALLOC) {
                                free_extent_map(em);
-                               em = create_pinned_em(inode, start, len,
-                                                      orig_start,
-                                                      block_start, len,
-                                                      orig_block_len,
-                                                      ram_bytes, type);
-                               if (IS_ERR(em)) {
-                                       ret = PTR_ERR(em);
-                                       goto unlock_err;
-                               }
+                               em = em2;
                        }
-
-                       ret = btrfs_add_ordered_extent_dio(inode, start,
-                                          block_start, len, len, type);
-                       if (ret) {
-                               free_extent_map(em);
+                       if (em2 && IS_ERR(em2)) {
+                               ret = PTR_ERR(em2);
                                goto unlock_err;
                        }
                        goto unlock;
@@ -8541,13 +8573,13 @@ out:
        return retval;
 }
 
-static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
-                              loff_t offset)
+static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
 {
        struct file *file = iocb->ki_filp;
        struct inode *inode = file->f_mapping->host;
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct btrfs_dio_data dio_data = { 0 };
+       loff_t offset = iocb->ki_pos;
        size_t count = 0;
        int flags = 0;
        bool wakeup = true;
@@ -8607,7 +8639,7 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
 
        ret = __blockdev_direct_IO(iocb, inode,
                                   BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev,
-                                  iter, offset, btrfs_get_blocks_direct, NULL,
+                                  iter, btrfs_get_blocks_direct, NULL,
                                   btrfs_submit_direct, flags);
        if (iov_iter_rw(iter) == WRITE) {
                current->journal_info = NULL;
@@ -9019,7 +9051,7 @@ static int btrfs_truncate(struct inode *inode)
                return ret;
 
        /*
-        * Yes ladies and gentelment, this is indeed ugly.  The fact is we have
+        * Yes ladies and gentlemen, this is indeed ugly.  The fact is we have
         * 3 things going on here
         *
         * 1) We need to reserve space for our orphan item and the space to
@@ -9033,15 +9065,15 @@ static int btrfs_truncate(struct inode *inode)
         * space reserved in case it uses space during the truncate (thank you
         * very much snapshotting).
         *
-        * And we need these to all be seperate.  The fact is we can use alot of
+        * And we need these to all be separate.  The fact is we can use a lot of
         * space doing the truncate, and we have no earthly idea how much space
-        * we will use, so we need the truncate reservation to be seperate so it
+        * we will use, so we need the truncate reservation to be separate so it
         * doesn't end up using space reserved for updating the inode or
         * removing the orphan item.  We also need to be able to stop the
         * transaction and start a new one, which means we need to be able to
         * update the inode several times, and we have no idea of knowing how
         * many times that will be, so we can't just reserve 1 item for the
-        * entirety of the opration, so that has to be done seperately as well.
+        * entirety of the operation, so that has to be done separately as well.
         * Then there is the orphan item, which does indeed need to be held on
         * to for the whole operation, and we need nobody to touch this reserved
         * space except the orphan code.
@@ -9230,6 +9262,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
        INIT_LIST_HEAD(&ei->delalloc_inodes);
        INIT_LIST_HEAD(&ei->delayed_iput);
        RB_CLEAR_NODE(&ei->rb_node);
+       init_rwsem(&ei->dio_sem);
 
        return inode;
 }
@@ -9387,10 +9420,281 @@ static int btrfs_getattr(struct vfsmount *mnt,
        return 0;
 }
 
+static int btrfs_rename_exchange(struct inode *old_dir,
+                             struct dentry *old_dentry,
+                             struct inode *new_dir,
+                             struct dentry *new_dentry)
+{
+       struct btrfs_trans_handle *trans;
+       struct btrfs_root *root = BTRFS_I(old_dir)->root;
+       struct btrfs_root *dest = BTRFS_I(new_dir)->root;
+       struct inode *new_inode = new_dentry->d_inode;
+       struct inode *old_inode = old_dentry->d_inode;
+       struct timespec ctime = CURRENT_TIME;
+       struct dentry *parent;
+       u64 old_ino = btrfs_ino(old_inode);
+       u64 new_ino = btrfs_ino(new_inode);
+       u64 old_idx = 0;
+       u64 new_idx = 0;
+       u64 root_objectid;
+       int ret;
+       bool root_log_pinned = false;
+       bool dest_log_pinned = false;
+
+       /* we only allow renaming a subvolume link between subvolumes */
+       if (old_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest)
+               return -EXDEV;
+
+       /* close the race window with snapshot create/destroy ioctl */
+       if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
+               down_read(&root->fs_info->subvol_sem);
+       if (new_ino == BTRFS_FIRST_FREE_OBJECTID)
+               down_read(&dest->fs_info->subvol_sem);
+
+       /*
+        * We want to reserve the absolute worst case amount of items.  So if
+        * both inodes are subvols and we need to unlink them then that would
+        * require 4 item modifications, but if they are both normal inodes it
+        * would require 5 item modifications, so we'll assume they are normal
+        * inodes.  So 5 * 2 is 10, plus 2 for the new links, so 12 total items
+        * should cover the worst case number of items we'll modify.
+        */
+       trans = btrfs_start_transaction(root, 12);
+       if (IS_ERR(trans)) {
+               ret = PTR_ERR(trans);
+               goto out_notrans;
+       }
+
+       /*
+        * We need to find a free sequence number both in the source and
+        * in the destination directory for the exchange.
+        */
+       ret = btrfs_set_inode_index(new_dir, &old_idx);
+       if (ret)
+               goto out_fail;
+       ret = btrfs_set_inode_index(old_dir, &new_idx);
+       if (ret)
+               goto out_fail;
+
+       BTRFS_I(old_inode)->dir_index = 0ULL;
+       BTRFS_I(new_inode)->dir_index = 0ULL;
+
+       /* Reference for the source. */
+       if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
+               /* force full log commit if subvolume involved. */
+               btrfs_set_log_full_commit(root->fs_info, trans);
+       } else {
+               btrfs_pin_log_trans(root);
+               root_log_pinned = true;
+               ret = btrfs_insert_inode_ref(trans, dest,
+                                            new_dentry->d_name.name,
+                                            new_dentry->d_name.len,
+                                            old_ino,
+                                            btrfs_ino(new_dir), old_idx);
+               if (ret)
+                       goto out_fail;
+       }
+
+       /* And now for the dest. */
+       if (new_ino == BTRFS_FIRST_FREE_OBJECTID) {
+               /* force full log commit if subvolume involved. */
+               btrfs_set_log_full_commit(dest->fs_info, trans);
+       } else {
+               btrfs_pin_log_trans(dest);
+               dest_log_pinned = true;
+               ret = btrfs_insert_inode_ref(trans, root,
+                                            old_dentry->d_name.name,
+                                            old_dentry->d_name.len,
+                                            new_ino,
+                                            btrfs_ino(old_dir), new_idx);
+               if (ret)
+                       goto out_fail;
+       }
+
+       /* Update inode version and ctime/mtime. */
+       inode_inc_iversion(old_dir);
+       inode_inc_iversion(new_dir);
+       inode_inc_iversion(old_inode);
+       inode_inc_iversion(new_inode);
+       old_dir->i_ctime = old_dir->i_mtime = ctime;
+       new_dir->i_ctime = new_dir->i_mtime = ctime;
+       old_inode->i_ctime = ctime;
+       new_inode->i_ctime = ctime;
+
+       if (old_dentry->d_parent != new_dentry->d_parent) {
+               btrfs_record_unlink_dir(trans, old_dir, old_inode, 1);
+               btrfs_record_unlink_dir(trans, new_dir, new_inode, 1);
+       }
+
+       /* src is a subvolume */
+       if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
+               root_objectid = BTRFS_I(old_inode)->root->root_key.objectid;
+               ret = btrfs_unlink_subvol(trans, root, old_dir,
+                                         root_objectid,
+                                         old_dentry->d_name.name,
+                                         old_dentry->d_name.len);
+       } else { /* src is an inode */
+               ret = __btrfs_unlink_inode(trans, root, old_dir,
+                                          old_dentry->d_inode,
+                                          old_dentry->d_name.name,
+                                          old_dentry->d_name.len);
+               if (!ret)
+                       ret = btrfs_update_inode(trans, root, old_inode);
+       }
+       if (ret) {
+               btrfs_abort_transaction(trans, root, ret);
+               goto out_fail;
+       }
+
+       /* dest is a subvolume */
+       if (new_ino == BTRFS_FIRST_FREE_OBJECTID) {
+               root_objectid = BTRFS_I(new_inode)->root->root_key.objectid;
+               ret = btrfs_unlink_subvol(trans, dest, new_dir,
+                                         root_objectid,
+                                         new_dentry->d_name.name,
+                                         new_dentry->d_name.len);
+       } else { /* dest is an inode */
+               ret = __btrfs_unlink_inode(trans, dest, new_dir,
+                                          new_dentry->d_inode,
+                                          new_dentry->d_name.name,
+                                          new_dentry->d_name.len);
+               if (!ret)
+                       ret = btrfs_update_inode(trans, dest, new_inode);
+       }
+       if (ret) {
+               btrfs_abort_transaction(trans, root, ret);
+               goto out_fail;
+       }
+
+       ret = btrfs_add_link(trans, new_dir, old_inode,
+                            new_dentry->d_name.name,
+                            new_dentry->d_name.len, 0, old_idx);
+       if (ret) {
+               btrfs_abort_transaction(trans, root, ret);
+               goto out_fail;
+       }
+
+       ret = btrfs_add_link(trans, old_dir, new_inode,
+                            old_dentry->d_name.name,
+                            old_dentry->d_name.len, 0, new_idx);
+       if (ret) {
+               btrfs_abort_transaction(trans, root, ret);
+               goto out_fail;
+       }
+
+       if (old_inode->i_nlink == 1)
+               BTRFS_I(old_inode)->dir_index = old_idx;
+       if (new_inode->i_nlink == 1)
+               BTRFS_I(new_inode)->dir_index = new_idx;
+
+       if (root_log_pinned) {
+               parent = new_dentry->d_parent;
+               btrfs_log_new_name(trans, old_inode, old_dir, parent);
+               btrfs_end_log_trans(root);
+               root_log_pinned = false;
+       }
+       if (dest_log_pinned) {
+               parent = old_dentry->d_parent;
+               btrfs_log_new_name(trans, new_inode, new_dir, parent);
+               btrfs_end_log_trans(dest);
+               dest_log_pinned = false;
+       }
+out_fail:
+       /*
+        * If we have pinned a log and an error happened, we unpin tasks
+        * trying to sync the log and force them to fallback to a transaction
+        * commit if the log currently contains any of the inodes involved in
+        * this rename operation (to ensure we do not persist a log with an
+        * inconsistent state for any of these inodes or leading to any
+        * inconsistencies when replayed). If the transaction was aborted, the
+        * abortion reason is propagated to userspace when attempting to commit
+        * the transaction. If the log does not contain any of these inodes, we
+        * allow the tasks to sync it.
+        */
+       if (ret && (root_log_pinned || dest_log_pinned)) {
+               if (btrfs_inode_in_log(old_dir, root->fs_info->generation) ||
+                   btrfs_inode_in_log(new_dir, root->fs_info->generation) ||
+                   btrfs_inode_in_log(old_inode, root->fs_info->generation) ||
+                   (new_inode &&
+                    btrfs_inode_in_log(new_inode, root->fs_info->generation)))
+                   btrfs_set_log_full_commit(root->fs_info, trans);
+
+               if (root_log_pinned) {
+                       btrfs_end_log_trans(root);
+                       root_log_pinned = false;
+               }
+               if (dest_log_pinned) {
+                       btrfs_end_log_trans(dest);
+                       dest_log_pinned = false;
+               }
+       }
+       ret = btrfs_end_transaction(trans, root);
+out_notrans:
+       if (new_ino == BTRFS_FIRST_FREE_OBJECTID)
+               up_read(&dest->fs_info->subvol_sem);
+       if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
+               up_read(&root->fs_info->subvol_sem);
+
+       return ret;
+}
+
+static int btrfs_whiteout_for_rename(struct btrfs_trans_handle *trans,
+                                    struct btrfs_root *root,
+                                    struct inode *dir,
+                                    struct dentry *dentry)
+{
+       int ret;
+       struct inode *inode;
+       u64 objectid;
+       u64 index;
+
+       ret = btrfs_find_free_ino(root, &objectid);
+       if (ret)
+               return ret;
+
+       inode = btrfs_new_inode(trans, root, dir,
+                               dentry->d_name.name,
+                               dentry->d_name.len,
+                               btrfs_ino(dir),
+                               objectid,
+                               S_IFCHR | WHITEOUT_MODE,
+                               &index);
+
+       if (IS_ERR(inode)) {
+               ret = PTR_ERR(inode);
+               return ret;
+       }
+
+       inode->i_op = &btrfs_special_inode_operations;
+       init_special_inode(inode, inode->i_mode,
+               WHITEOUT_DEV);
+
+       ret = btrfs_init_inode_security(trans, inode, dir,
+                               &dentry->d_name);
+       if (ret)
+               goto out;
+
+       ret = btrfs_add_nondir(trans, dir, dentry,
+                               inode, 0, index);
+       if (ret)
+               goto out;
+
+       ret = btrfs_update_inode(trans, root, inode);
+out:
+       unlock_new_inode(inode);
+       if (ret)
+               inode_dec_link_count(inode);
+       iput(inode);
+
+       return ret;
+}
+
 static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
-                          struct inode *new_dir, struct dentry *new_dentry)
+                          struct inode *new_dir, struct dentry *new_dentry,
+                          unsigned int flags)
 {
        struct btrfs_trans_handle *trans;
+       unsigned int trans_num_items;
        struct btrfs_root *root = BTRFS_I(old_dir)->root;
        struct btrfs_root *dest = BTRFS_I(new_dir)->root;
        struct inode *new_inode = d_inode(new_dentry);
@@ -9399,6 +9703,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
        u64 root_objectid;
        int ret;
        u64 old_ino = btrfs_ino(old_inode);
+       bool log_pinned = false;
 
        if (btrfs_ino(new_dir) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
                return -EPERM;
@@ -9449,15 +9754,21 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
         * We want to reserve the absolute worst case amount of items.  So if
         * both inodes are subvols and we need to unlink them then that would
         * require 4 item modifications, but if they are both normal inodes it
-        * would require 5 item modifications, so we'll assume their normal
+        * would require 5 item modifications, so we'll assume they are normal
         * inodes.  So 5 * 2 is 10, plus 1 for the new link, so 11 total items
         * should cover the worst case number of items we'll modify.
+        * If our rename has the whiteout flag, we need 5 more units for the
+        * new inode (1 inode item, 1 inode ref, 2 dir items and 1 xattr item
+        * when selinux is enabled).
         */
-       trans = btrfs_start_transaction(root, 11);
+       trans_num_items = 11;
+       if (flags & RENAME_WHITEOUT)
+               trans_num_items += 5;
+       trans = btrfs_start_transaction(root, trans_num_items);
        if (IS_ERR(trans)) {
-                ret = PTR_ERR(trans);
-                goto out_notrans;
-        }
+               ret = PTR_ERR(trans);
+               goto out_notrans;
+       }
 
        if (dest != root)
                btrfs_record_root_in_trans(trans, dest);
@@ -9471,6 +9782,8 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
                /* force full log commit if subvolume involved. */
                btrfs_set_log_full_commit(root->fs_info, trans);
        } else {
+               btrfs_pin_log_trans(root);
+               log_pinned = true;
                ret = btrfs_insert_inode_ref(trans, dest,
                                             new_dentry->d_name.name,
                                             new_dentry->d_name.len,
@@ -9478,14 +9791,6 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
                                             btrfs_ino(new_dir), index);
                if (ret)
                        goto out_fail;
-               /*
-                * this is an ugly little race, but the rename is required
-                * to make sure that if we crash, the inode is either at the
-                * old name or the new one.  pinning the log transaction lets
-                * us make sure we don't allow a log commit to come in after
-                * we unlink the name but before we add the new name back in.
-                */
-               btrfs_pin_log_trans(root);
        }
 
        inode_inc_iversion(old_dir);
@@ -9552,12 +9857,46 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
        if (old_inode->i_nlink == 1)
                BTRFS_I(old_inode)->dir_index = index;
 
-       if (old_ino != BTRFS_FIRST_FREE_OBJECTID) {
+       if (log_pinned) {
                struct dentry *parent = new_dentry->d_parent;
+
                btrfs_log_new_name(trans, old_inode, old_dir, parent);
                btrfs_end_log_trans(root);
+               log_pinned = false;
+       }
+
+       if (flags & RENAME_WHITEOUT) {
+               ret = btrfs_whiteout_for_rename(trans, root, old_dir,
+                                               old_dentry);
+
+               if (ret) {
+                       btrfs_abort_transaction(trans, root, ret);
+                       goto out_fail;
+               }
        }
 out_fail:
+       /*
+        * If we have pinned the log and an error happened, we unpin the log so
+        * that tasks trying to sync it can proceed, and force them to fall
+        * back to a transaction commit if the log currently contains any of
+        * the inodes involved in this rename operation (to ensure we do not
+        * persist a log with an inconsistent state for any of these inodes, or
+        * one that leads to inconsistencies when replayed). If the transaction
+        * was aborted, the abort reason is propagated to userspace when
+        * attempting to commit the transaction. If the log does not contain
+        * any of these inodes, we allow the tasks to sync it.
+        */
+       if (ret && log_pinned) {
+               if (btrfs_inode_in_log(old_dir, root->fs_info->generation) ||
+                   btrfs_inode_in_log(new_dir, root->fs_info->generation) ||
+                   btrfs_inode_in_log(old_inode, root->fs_info->generation) ||
+                   (new_inode &&
+                    btrfs_inode_in_log(new_inode, root->fs_info->generation)))
+                   btrfs_set_log_full_commit(root->fs_info, trans);
+
+               btrfs_end_log_trans(root);
+               log_pinned = false;
+       }
        btrfs_end_transaction(trans, root);
 out_notrans:
        if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
@@ -9570,10 +9909,14 @@ static int btrfs_rename2(struct inode *old_dir, struct dentry *old_dentry,
                         struct inode *new_dir, struct dentry *new_dentry,
                         unsigned int flags)
 {
-       if (flags & ~RENAME_NOREPLACE)
+       if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
                return -EINVAL;
 
-       return btrfs_rename(old_dir, old_dentry, new_dir, new_dentry);
+       if (flags & RENAME_EXCHANGE)
+               return btrfs_rename_exchange(old_dir, old_dentry, new_dir,
+                                         new_dentry);
+
+       return btrfs_rename(old_dir, old_dentry, new_dir, new_dentry, flags);
 }
 
 static void btrfs_run_delalloc_work(struct btrfs_work *work)
@@ -9942,6 +10285,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
                                btrfs_end_transaction(trans, root);
                        break;
                }
+               btrfs_dec_block_group_reservations(root->fs_info, ins.objectid);
 
                last_alloc = ins.offset;
                ret = insert_reserved_file_extent(trans, inode,
@@ -10160,10 +10504,10 @@ static const struct inode_operations btrfs_dir_inode_operations = {
        .symlink        = btrfs_symlink,
        .setattr        = btrfs_setattr,
        .mknod          = btrfs_mknod,
-       .setxattr       = btrfs_setxattr,
+       .setxattr       = generic_setxattr,
        .getxattr       = generic_getxattr,
        .listxattr      = btrfs_listxattr,
-       .removexattr    = btrfs_removexattr,
+       .removexattr    = generic_removexattr,
        .permission     = btrfs_permission,
        .get_acl        = btrfs_get_acl,
        .set_acl        = btrfs_set_acl,
@@ -10184,7 +10528,7 @@ static const struct file_operations btrfs_dir_file_operations = {
        .iterate        = btrfs_real_readdir,
        .unlocked_ioctl = btrfs_ioctl,
 #ifdef CONFIG_COMPAT
-       .compat_ioctl   = btrfs_ioctl,
+       .compat_ioctl   = btrfs_compat_ioctl,
 #endif
        .release        = btrfs_release_file,
        .fsync          = btrfs_sync_file,
@@ -10237,10 +10581,10 @@ static const struct address_space_operations btrfs_symlink_aops = {
 static const struct inode_operations btrfs_file_inode_operations = {
        .getattr        = btrfs_getattr,
        .setattr        = btrfs_setattr,
-       .setxattr       = btrfs_setxattr,
+       .setxattr       = generic_setxattr,
        .getxattr       = generic_getxattr,
        .listxattr      = btrfs_listxattr,
-       .removexattr    = btrfs_removexattr,
+       .removexattr    = generic_removexattr,
        .permission     = btrfs_permission,
        .fiemap         = btrfs_fiemap,
        .get_acl        = btrfs_get_acl,
@@ -10251,10 +10595,10 @@ static const struct inode_operations btrfs_special_inode_operations = {
        .getattr        = btrfs_getattr,
        .setattr        = btrfs_setattr,
        .permission     = btrfs_permission,
-       .setxattr       = btrfs_setxattr,
+       .setxattr       = generic_setxattr,
        .getxattr       = generic_getxattr,
        .listxattr      = btrfs_listxattr,
-       .removexattr    = btrfs_removexattr,
+       .removexattr    = generic_removexattr,
        .get_acl        = btrfs_get_acl,
        .set_acl        = btrfs_set_acl,
        .update_time    = btrfs_update_time,
@@ -10265,10 +10609,10 @@ static const struct inode_operations btrfs_symlink_inode_operations = {
        .getattr        = btrfs_getattr,
        .setattr        = btrfs_setattr,
        .permission     = btrfs_permission,
-       .setxattr       = btrfs_setxattr,
+       .setxattr       = generic_setxattr,
        .getxattr       = generic_getxattr,
        .listxattr      = btrfs_listxattr,
-       .removexattr    = btrfs_removexattr,
+       .removexattr    = generic_removexattr,
        .update_time    = btrfs_update_time,
 };