Btrfs: fix lockdep warning on deadlock against an inode's log mutex
[cascardo/linux.git] / fs / btrfs / tree-log.c
index 517d0cc..e935035 100644 (file)
@@ -27,6 +27,7 @@
 #include "backref.h"
 #include "hash.h"
 #include "compression.h"
+#include "qgroup.h"
 
 /* magic values for the inode_only field in btrfs_log_inode:
  *
@@ -680,6 +681,21 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
                ins.type = BTRFS_EXTENT_ITEM_KEY;
                offset = key->offset - btrfs_file_extent_offset(eb, item);
 
+               /*
+                * Manually record the dirty extent: here we do a shallow
+                * file extent item copy and skip the normal backref update,
+                * modifying the extent tree all by ourselves.
+                * So we need to manually record the dirty extent for qgroup,
+                * as the owner of the file extent changes from the log tree
+                * (doesn't affect qgroup) to fs/file tree (affects qgroup).
+                */
+               ret = btrfs_qgroup_insert_dirty_extent(trans, root->fs_info,
+                               btrfs_file_extent_disk_bytenr(eb, item),
+                               btrfs_file_extent_disk_num_bytes(eb, item),
+                               GFP_NOFS);
+               if (ret < 0)
+                       goto out;
+
                if (ins.objectid > 0) {
                        u64 csum_start;
                        u64 csum_end;
@@ -2330,7 +2346,7 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
                                break;
 
                        /* for regular files, make sure corresponding
-                        * orhpan item exist. extents past the new EOF
+                        * orphan item exists. extents past the new EOF
                         * will be truncated later by orphan cleanup.
                         */
                        if (S_ISREG(mode)) {
@@ -2422,8 +2438,8 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
                root_owner = btrfs_header_owner(parent);
 
                next = btrfs_find_create_tree_block(root, bytenr);
-               if (!next)
-                       return -ENOMEM;
+               if (IS_ERR(next))
+                       return PTR_ERR(next);
 
                if (*level == 1) {
                        ret = wc->process_func(root, next, wc, ptr_gen);
@@ -2757,7 +2773,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
        while (1) {
                int batch = atomic_read(&root->log_batch);
                /* when we're on an ssd, just kick the log commit out */
-               if (!btrfs_test_opt(root, SSD) &&
+               if (!btrfs_test_opt(root->fs_info, SSD) &&
                    test_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state)) {
                        mutex_unlock(&root->log_mutex);
                        schedule_timeout_uninterruptible(1);
@@ -2788,7 +2804,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
        ret = btrfs_write_marked_extents(log, &log->dirty_log_pages, mark);
        if (ret) {
                blk_finish_plug(&plug);
-               btrfs_abort_transaction(trans, root, ret);
+               btrfs_abort_transaction(trans, ret);
                btrfs_free_logged_extents(log, log_transid);
                btrfs_set_log_full_commit(root->fs_info, trans);
                mutex_unlock(&root->log_mutex);
@@ -2807,7 +2823,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
         */
        mutex_unlock(&root->log_mutex);
 
-       btrfs_init_log_ctx(&root_log_ctx);
+       btrfs_init_log_ctx(&root_log_ctx, NULL);
 
        mutex_lock(&log_root_tree->log_mutex);
        atomic_inc(&log_root_tree->log_batch);
@@ -2838,7 +2854,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
                btrfs_set_log_full_commit(root->fs_info, trans);
 
                if (ret != -ENOSPC) {
-                       btrfs_abort_transaction(trans, root, ret);
+                       btrfs_abort_transaction(trans, ret);
                        mutex_unlock(&log_root_tree->log_mutex);
                        goto out;
                }
@@ -2898,7 +2914,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
        blk_finish_plug(&plug);
        if (ret) {
                btrfs_set_log_full_commit(root->fs_info, trans);
-               btrfs_abort_transaction(trans, root, ret);
+               btrfs_abort_transaction(trans, ret);
                btrfs_free_logged_extents(log, log_transid);
                mutex_unlock(&log_root_tree->log_mutex);
                goto out_wake_log_root;
@@ -2934,7 +2950,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
        ret = write_ctree_super(trans, root->fs_info->tree_root, 1);
        if (ret) {
                btrfs_set_log_full_commit(root->fs_info, trans);
-               btrfs_abort_transaction(trans, root, ret);
+               btrfs_abort_transaction(trans, ret);
                goto out_wake_log_root;
        }
 
@@ -2991,7 +3007,7 @@ static void free_log_tree(struct btrfs_trans_handle *trans,
        ret = walk_log_tree(trans, log, &wc);
        /* I don't think this can happen but just in case */
        if (ret)
-               btrfs_abort_transaction(trans, log, ret);
+               btrfs_abort_transaction(trans, ret);
 
        while (1) {
                ret = find_first_extent_bit(&log->dirty_log_pages,
@@ -3001,7 +3017,7 @@ static void free_log_tree(struct btrfs_trans_handle *trans,
                        break;
 
                clear_extent_bits(&log->dirty_log_pages, start, end,
-                                 EXTENT_DIRTY | EXTENT_NEW, GFP_NOFS);
+                                 EXTENT_DIRTY | EXTENT_NEW);
        }
 
        /*
@@ -3160,7 +3176,7 @@ out_unlock:
                btrfs_set_log_full_commit(root->fs_info, trans);
                ret = 0;
        } else if (ret < 0)
-               btrfs_abort_transaction(trans, root, ret);
+               btrfs_abort_transaction(trans, ret);
 
        btrfs_end_log_trans(root);
 
@@ -3193,7 +3209,7 @@ int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
                btrfs_set_log_full_commit(root->fs_info, trans);
                ret = 0;
        } else if (ret < 0 && ret != -ENOENT)
-               btrfs_abort_transaction(trans, root, ret);
+               btrfs_abort_transaction(trans, ret);
        btrfs_end_log_trans(root);
 
        return ret;
@@ -4141,6 +4157,7 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
 
        INIT_LIST_HEAD(&extents);
 
+       down_write(&BTRFS_I(inode)->dio_sem);
        write_lock(&tree->lock);
        test_gen = root->fs_info->last_trans_committed;
 
@@ -4169,13 +4186,20 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
        }
 
        list_sort(NULL, &extents, extent_cmp);
+       btrfs_get_logged_extents(inode, logged_list, start, end);
        /*
-        * Collect any new ordered extents within the range. This is to
-        * prevent logging file extent items without waiting for the disk
-        * location they point to being written. We do this only to deal
-        * with races against concurrent lockless direct IO writes.
+        * Some ordered extents started by fsync might have completed
+        * before we could collect them into the list logged_list, which
+        * means they're gone, not in our logged_list nor in the inode's
+        * ordered tree. We want the application/user space to know an
+        * error happened while attempting to persist file data so that
+        * it can take proper action. If such an error happened, we leave
+        * without writing to the log tree and the fsync must report the
+        * file data write error and not commit the current transaction.
         */
-       btrfs_get_logged_extents(inode, logged_list, start, end);
+       ret = btrfs_inode_check_errors(inode);
+       if (ret)
+               ctx->io_err = ret;
 process:
        while (!list_empty(&extents)) {
                em = list_entry(extents.next, struct extent_map, list);
@@ -4202,6 +4226,7 @@ process:
        }
        WARN_ON(!list_empty(&extents));
        write_unlock(&tree->lock);
+       up_write(&BTRFS_I(inode)->dio_sem);
 
        btrfs_release_path(path);
        return ret;
@@ -4460,7 +4485,8 @@ static int btrfs_log_trailing_hole(struct btrfs_trans_handle *trans,
 static int btrfs_check_ref_name_override(struct extent_buffer *eb,
                                         const int slot,
                                         const struct btrfs_key *key,
-                                        struct inode *inode)
+                                        struct inode *inode,
+                                        u64 *other_ino)
 {
        int ret;
        struct btrfs_path *search_path;
@@ -4519,7 +4545,16 @@ static int btrfs_check_ref_name_override(struct extent_buffer *eb,
                                           search_path, parent,
                                           name, this_name_len, 0);
                if (di && !IS_ERR(di)) {
-                       ret = 1;
+                       struct btrfs_key di_key;
+
+                       btrfs_dir_item_key_to_cpu(search_path->nodes[0],
+                                                 di, &di_key);
+                       if (di_key.type == BTRFS_INODE_ITEM_KEY) {
+                               ret = 1;
+                               *other_ino = di_key.objectid;
+                       } else {
+                               ret = -EAGAIN;
+                       }
                        goto out;
                } else if (IS_ERR(di)) {
                        ret = PTR_ERR(di);
@@ -4622,23 +4657,6 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
 
        mutex_lock(&BTRFS_I(inode)->log_mutex);
 
-       /*
-        * Collect ordered extents only if we are logging data. This is to
-        * ensure a subsequent request to log this inode in LOG_INODE_ALL mode
-        * will process the ordered extents if they still exists at the time,
-        * because when we collect them we test and set for the flag
-        * BTRFS_ORDERED_LOGGED to prevent multiple log requests to process the
-        * same ordered extents. The consequence for the LOG_INODE_ALL log mode
-        * not processing the ordered extents is that we end up logging the
-        * corresponding file extent items, based on the extent maps in the
-        * inode's extent_map_tree's modified_list, without logging the
-        * respective checksums (since the may still be only attached to the
-        * ordered extents and have not been inserted in the csum tree by
-        * btrfs_finish_ordered_io() yet).
-        */
-       if (inode_only == LOG_INODE_ALL)
-               btrfs_get_logged_extents(inode, &logged_list, start, end);
-
        /*
         * a brute force approach to making sure we get the most uptodate
         * copies of everything.
@@ -4711,6 +4729,10 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
                ins_nr = 0;
                ret = btrfs_search_forward(root, &min_key,
                                           path, trans->transid);
+               if (ret < 0) {
+                       err = ret;
+                       goto out_unlock;
+               }
                if (ret != 0)
                        break;
 again:
@@ -4726,16 +4748,72 @@ again:
                if ((min_key.type == BTRFS_INODE_REF_KEY ||
                     min_key.type == BTRFS_INODE_EXTREF_KEY) &&
                    BTRFS_I(inode)->generation == trans->transid) {
+                       u64 other_ino = 0;
+
                        ret = btrfs_check_ref_name_override(path->nodes[0],
                                                            path->slots[0],
-                                                           &min_key, inode);
+                                                           &min_key, inode,
+                                                           &other_ino);
                        if (ret < 0) {
                                err = ret;
                                goto out_unlock;
-                       } else if (ret > 0) {
-                               err = 1;
-                               btrfs_set_log_full_commit(root->fs_info, trans);
-                               goto out_unlock;
+                       } else if (ret > 0 && ctx &&
+                                  other_ino != btrfs_ino(ctx->inode)) {
+                               struct btrfs_key inode_key;
+                               struct inode *other_inode;
+
+                               if (ins_nr > 0) {
+                                       ins_nr++;
+                               } else {
+                                       ins_nr = 1;
+                                       ins_start_slot = path->slots[0];
+                               }
+                               ret = copy_items(trans, inode, dst_path, path,
+                                                &last_extent, ins_start_slot,
+                                                ins_nr, inode_only,
+                                                logged_isize);
+                               if (ret < 0) {
+                                       err = ret;
+                                       goto out_unlock;
+                               }
+                               ins_nr = 0;
+                               btrfs_release_path(path);
+                               inode_key.objectid = other_ino;
+                               inode_key.type = BTRFS_INODE_ITEM_KEY;
+                               inode_key.offset = 0;
+                               other_inode = btrfs_iget(root->fs_info->sb,
+                                                        &inode_key, root,
+                                                        NULL);
+                               /*
+                                * If the other inode that had a conflicting dir
+                                * entry was deleted in the current transaction,
+                                * we don't need to do more work nor fallback to
+                                * a transaction commit.
+                                */
+                               if (IS_ERR(other_inode) &&
+                                   PTR_ERR(other_inode) == -ENOENT) {
+                                       goto next_key;
+                               } else if (IS_ERR(other_inode)) {
+                                       err = PTR_ERR(other_inode);
+                                       goto out_unlock;
+                               }
+                               /*
+                                * We are safe logging the other inode without
+                                * acquiring its i_mutex as long as we log with
+                                * the LOG_INODE_EXISTS mode. We're safe against
+                                * concurrent renames of the other inode as well
+                                * because during a rename we pin the log and
+                                * update the log with the new name before we
+                                * unpin it.
+                                */
+                               err = btrfs_log_inode(trans, root, other_inode,
+                                                     LOG_INODE_EXISTS,
+                                                     0, LLONG_MAX, ctx);
+                               iput(other_inode);
+                               if (err)
+                                       goto out_unlock;
+                               else
+                                       goto next_key;
                        }
                }
 
@@ -4803,7 +4881,7 @@ next_slot:
                        ins_nr = 0;
                }
                btrfs_release_path(path);
-
+next_key:
                if (min_key.offset < (u64)-1) {
                        min_key.offset++;
                } else if (min_key.type < max_key.type) {
@@ -4846,21 +4924,6 @@ log_extents:
                        goto out_unlock;
        }
        if (fast_search) {
-               /*
-                * Some ordered extents started by fsync might have completed
-                * before we collected the ordered extents in logged_list, which
-                * means they're gone, not in our logged_list nor in the inode's
-                * ordered tree. We want the application/user space to know an
-                * error happened while attempting to persist file data so that
-                * it can take proper action. If such error happened, we leave
-                * without writing to the log tree and the fsync must report the
-                * file data write error and not commit the current transaction.
-                */
-               err = btrfs_inode_check_errors(inode);
-               if (err) {
-                       ctx->io_err = err;
-                       goto out_unlock;
-               }
                ret = btrfs_log_changed_extents(trans, root, inode, dst_path,
                                                &logged_list, ctx, start, end);
                if (ret) {
@@ -4937,7 +5000,7 @@ out_unlock:
  * the actual unlink operation, so if we do this check before a concurrent task
  * sets last_unlink_trans it means we've logged a consistent version/state of
  * all the inode items, otherwise we are not sure and must do a transaction
- * commit (the concurrent task migth have only updated last_unlink_trans before
+ * commit (the concurrent task might have only updated last_unlink_trans before
  * we logged the inode or it might have also done the unlink).
  */
 static bool btrfs_must_commit_transaction(struct btrfs_trans_handle *trans,
@@ -4988,7 +5051,7 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans,
                        goto out;
 
        if (!S_ISDIR(inode->i_mode)) {
-               if (!parent || d_really_is_negative(parent) || sb != d_inode(parent)->i_sb)
+               if (!parent || d_really_is_negative(parent) || sb != parent->d_sb)
                        goto out;
                inode = d_inode(parent);
        }
@@ -4996,7 +5059,7 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans,
        while (1) {
                /*
                 * If we are logging a directory then we start with our inode,
-                * not our parents inode, so we need to skipp setting the
+                * not our parent's inode, so we need to skip setting the
                 * logged_trans so that further down in the log code we don't
                 * think this inode has already been logged.
                 */
@@ -5009,11 +5072,15 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans,
                        break;
                }
 
-               if (!parent || d_really_is_negative(parent) || sb != d_inode(parent)->i_sb)
+               if (!parent || d_really_is_negative(parent) || sb != parent->d_sb)
                        break;
 
-               if (IS_ROOT(parent))
+               if (IS_ROOT(parent)) {
+                       inode = d_inode(parent);
+                       if (btrfs_must_commit_transaction(trans, inode))
+                               ret = 1;
                        break;
+               }
 
                parent = dget_parent(parent);
                dput(old_parent);
@@ -5158,7 +5225,7 @@ process_leaf:
                        }
 
                        ctx->log_new_dentries = false;
-                       if (type == BTRFS_FT_DIR)
+                       if (type == BTRFS_FT_DIR || type == BTRFS_FT_SYMLINK)
                                log_mode = LOG_INODE_ALL;
                        btrfs_release_path(path);
                        ret = btrfs_log_inode(trans, root, di_inode,
@@ -5278,11 +5345,16 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans,
                        if (IS_ERR(dir_inode))
                                continue;
 
+                       if (ctx)
+                               ctx->log_new_dentries = false;
                        ret = btrfs_log_inode(trans, root, dir_inode,
                                              LOG_INODE_ALL, 0, LLONG_MAX, ctx);
                        if (!ret &&
                            btrfs_must_commit_transaction(trans, dir_inode))
                                ret = 1;
+                       if (!ret && ctx && ctx->log_new_dentries)
+                               ret = log_new_dir_dentries(trans, root,
+                                                          dir_inode, ctx);
                        iput(dir_inode);
                        if (ret)
                                goto out;
@@ -5319,7 +5391,7 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
 
        sb = inode->i_sb;
 
-       if (btrfs_test_opt(root, NOTREELOG)) {
+       if (btrfs_test_opt(root->fs_info, NOTREELOG)) {
                ret = 1;
                goto end_no_trans;
        }
@@ -5375,7 +5447,7 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
                log_dentries = true;
 
        /*
-        * On unlink we must make sure all our current and old parent directores
+        * On unlink we must make sure all our current and old parent directory
         * inodes are fully logged. This is to prevent leaving dangling
         * directory index entries in directories that were our parents but are
         * not anymore. Not doing this results in old parent directory being
@@ -5422,7 +5494,7 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
        }
 
        while (1) {
-               if (!parent || d_really_is_negative(parent) || sb != d_inode(parent)->i_sb)
+               if (!parent || d_really_is_negative(parent) || sb != parent->d_sb)
                        break;
 
                inode = d_inode(parent);
@@ -5519,7 +5591,7 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
 
        ret = walk_log_tree(trans, log_root_tree, &wc);
        if (ret) {
-               btrfs_std_error(fs_info, ret, "Failed to pin buffers while "
+               btrfs_handle_fs_error(fs_info, ret, "Failed to pin buffers while "
                            "recovering log root tree.");
                goto error;
        }
@@ -5533,7 +5605,7 @@ again:
                ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0);
 
                if (ret < 0) {
-                       btrfs_std_error(fs_info, ret,
+                       btrfs_handle_fs_error(fs_info, ret,
                                    "Couldn't find tree log root.");
                        goto error;
                }
@@ -5551,7 +5623,7 @@ again:
                log = btrfs_read_fs_root(log_root_tree, &found_key);
                if (IS_ERR(log)) {
                        ret = PTR_ERR(log);
-                       btrfs_std_error(fs_info, ret,
+                       btrfs_handle_fs_error(fs_info, ret,
                                    "Couldn't read tree log root.");
                        goto error;
                }
@@ -5566,7 +5638,7 @@ again:
                        free_extent_buffer(log->node);
                        free_extent_buffer(log->commit_root);
                        kfree(log);
-                       btrfs_std_error(fs_info, ret, "Couldn't read target root "
+                       btrfs_handle_fs_error(fs_info, ret, "Couldn't read target root "
                                    "for tree log recovery.");
                        goto error;
                }
@@ -5652,11 +5724,9 @@ void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
         * into the file.  When the file is logged we check it and
         * don't log the parents if the file is fully on disk.
         */
-       if (S_ISREG(inode->i_mode)) {
-               mutex_lock(&BTRFS_I(inode)->log_mutex);
-               BTRFS_I(inode)->last_unlink_trans = trans->transid;
-               mutex_unlock(&BTRFS_I(inode)->log_mutex);
-       }
+       mutex_lock(&BTRFS_I(inode)->log_mutex);
+       BTRFS_I(inode)->last_unlink_trans = trans->transid;
+       mutex_unlock(&BTRFS_I(inode)->log_mutex);
 
        /*
         * if this directory was already logged any new