Btrfs: fix lockdep warning on deadlock against an inode's log mutex

[cascardo/linux.git] / fs / btrfs / tree-log.c
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c

index 517d0cc..e935035 100644 (file)
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -27,6 +27,7 @@
  #include "backref.h"
  #include "hash.h"
  #include "compression.h"
+#include "qgroup.h"
  
  /* magic values for the inode_only field in btrfs_log_inode:
   *
@@ -680,6 +681,21 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
                 ins.type = BTRFS_EXTENT_ITEM_KEY;
                 offset = key->offset - btrfs_file_extent_offset(eb, item);
  
+               /*
+                * Manually record the dirty extent: here we did a shallow file
+                * extent item copy and skipped the normal backref update,
+                * modifying the extent tree all by ourselves. So we need to
+                * manually record the dirty extent for qgroup, as the owner of
+                * the file extent changed from the log tree (doesn't affect
+                * qgroup) to the fs/file tree (affects qgroup).
+                */
+               ret = btrfs_qgroup_insert_dirty_extent(trans, root->fs_info,
+                               btrfs_file_extent_disk_bytenr(eb, item),
+                               btrfs_file_extent_disk_num_bytes(eb, item),
+                               GFP_NOFS);
+               if (ret < 0)
+                       goto out;
+
                 if (ins.objectid > 0) {
                         u64 csum_start;
                         u64 csum_end;
@@ -2330,7 +2346,7 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
                                 break;
  
                         /* for regular files, make sure corresponding
-                        * orhpan item exist. extents past the new EOF
+                        * orphan item exists. extents past the new EOF
                          * will be truncated later by orphan cleanup.
                          */
                         if (S_ISREG(mode)) {
@@ -2422,8 +2438,8 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
                 root_owner = btrfs_header_owner(parent);
  
                 next = btrfs_find_create_tree_block(root, bytenr);
-               if (!next)
-                       return -ENOMEM;
+               if (IS_ERR(next))
+                       return PTR_ERR(next);
  
                 if (*level == 1) {
                         ret = wc->process_func(root, next, wc, ptr_gen);
@@ -2757,7 +2773,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
         while (1) {
                 int batch = atomic_read(&root->log_batch);
                 /* when we're on an ssd, just kick the log commit out */
-               if (!btrfs_test_opt(root, SSD) &&
+               if (!btrfs_test_opt(root->fs_info, SSD) &&
                     test_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state)) {
                         mutex_unlock(&root->log_mutex);
                         schedule_timeout_uninterruptible(1);
@@ -2788,7 +2804,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
         ret = btrfs_write_marked_extents(log, &log->dirty_log_pages, mark);
         if (ret) {
                 blk_finish_plug(&plug);
-               btrfs_abort_transaction(trans, root, ret);
+               btrfs_abort_transaction(trans, ret);
                 btrfs_free_logged_extents(log, log_transid);
                 btrfs_set_log_full_commit(root->fs_info, trans);
                 mutex_unlock(&root->log_mutex);
@@ -2807,7 +2823,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
          */
         mutex_unlock(&root->log_mutex);
  
-       btrfs_init_log_ctx(&root_log_ctx);
+       btrfs_init_log_ctx(&root_log_ctx, NULL);
  
         mutex_lock(&log_root_tree->log_mutex);
         atomic_inc(&log_root_tree->log_batch);
@@ -2838,7 +2854,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
                 btrfs_set_log_full_commit(root->fs_info, trans);
  
                 if (ret != -ENOSPC) {
-                       btrfs_abort_transaction(trans, root, ret);
+                       btrfs_abort_transaction(trans, ret);
                         mutex_unlock(&log_root_tree->log_mutex);
                         goto out;
                 }
@@ -2898,7 +2914,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
         blk_finish_plug(&plug);
         if (ret) {
                 btrfs_set_log_full_commit(root->fs_info, trans);
-               btrfs_abort_transaction(trans, root, ret);
+               btrfs_abort_transaction(trans, ret);
                 btrfs_free_logged_extents(log, log_transid);
                 mutex_unlock(&log_root_tree->log_mutex);
                 goto out_wake_log_root;
@@ -2934,7 +2950,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
         ret = write_ctree_super(trans, root->fs_info->tree_root, 1);
         if (ret) {
                 btrfs_set_log_full_commit(root->fs_info, trans);
-               btrfs_abort_transaction(trans, root, ret);
+               btrfs_abort_transaction(trans, ret);
                 goto out_wake_log_root;
         }
  
@@ -2991,7 +3007,7 @@ static void free_log_tree(struct btrfs_trans_handle *trans,
         ret = walk_log_tree(trans, log, &wc);
         /* I don't think this can happen but just in case */
         if (ret)
-               btrfs_abort_transaction(trans, log, ret);
+               btrfs_abort_transaction(trans, ret);
  
         while (1) {
                 ret = find_first_extent_bit(&log->dirty_log_pages,
@@ -3001,7 +3017,7 @@ static void free_log_tree(struct btrfs_trans_handle *trans,
                         break;
  
                 clear_extent_bits(&log->dirty_log_pages, start, end,
-                                 EXTENT_DIRTY | EXTENT_NEW, GFP_NOFS);
+                                 EXTENT_DIRTY | EXTENT_NEW);
         }
  
         /*
@@ -3160,7 +3176,7 @@ out_unlock:
                 btrfs_set_log_full_commit(root->fs_info, trans);
                 ret = 0;
         } else if (ret < 0)
-               btrfs_abort_transaction(trans, root, ret);
+               btrfs_abort_transaction(trans, ret);
  
         btrfs_end_log_trans(root);
  
@@ -3193,7 +3209,7 @@ int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
                 btrfs_set_log_full_commit(root->fs_info, trans);
                 ret = 0;
         } else if (ret < 0 && ret != -ENOENT)
-               btrfs_abort_transaction(trans, root, ret);
+               btrfs_abort_transaction(trans, ret);
         btrfs_end_log_trans(root);
  
         return ret;
@@ -4141,6 +4157,7 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
  
         INIT_LIST_HEAD(&extents);
  
+       down_write(&BTRFS_I(inode)->dio_sem);
         write_lock(&tree->lock);
         test_gen = root->fs_info->last_trans_committed;
  
@@ -4169,13 +4186,20 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
         }
  
         list_sort(NULL, &extents, extent_cmp);
+       btrfs_get_logged_extents(inode, logged_list, start, end);
         /*
-        * Collect any new ordered extents within the range. This is to
-        * prevent logging file extent items without waiting for the disk
-        * location they point to being written. We do this only to deal
-        * with races against concurrent lockless direct IO writes.
+        * Some ordered extents started by fsync might have completed
+        * before we could collect them into the list logged_list, which
+        * means they're gone, not in our logged_list nor in the inode's
+        * ordered tree. We want the application/user space to know an
+        * error happened while attempting to persist file data so that
+        * it can take proper action. If such error happened, we leave
+        * without writing to the log tree and the fsync must report the
+        * file data write error and not commit the current transaction.
          */
-       btrfs_get_logged_extents(inode, logged_list, start, end);
+       ret = btrfs_inode_check_errors(inode);
+       if (ret)
+               ctx->io_err = ret;
  process:
         while (!list_empty(&extents)) {
                 em = list_entry(extents.next, struct extent_map, list);
@@ -4202,6 +4226,7 @@ process:
         }
         WARN_ON(!list_empty(&extents));
         write_unlock(&tree->lock);
+       up_write(&BTRFS_I(inode)->dio_sem);
  
         btrfs_release_path(path);
         return ret;
@@ -4460,7 +4485,8 @@ static int btrfs_log_trailing_hole(struct btrfs_trans_handle *trans,
  static int btrfs_check_ref_name_override(struct extent_buffer *eb,
                                          const int slot,
                                          const struct btrfs_key *key,
-                                        struct inode *inode)
+                                        struct inode *inode,
+                                        u64 *other_ino)
  {
         int ret;
         struct btrfs_path *search_path;
@@ -4519,7 +4545,16 @@ static int btrfs_check_ref_name_override(struct extent_buffer *eb,
                                            search_path, parent,
                                            name, this_name_len, 0);
                 if (di && !IS_ERR(di)) {
-                       ret = 1;
+                       struct btrfs_key di_key;
+
+                       btrfs_dir_item_key_to_cpu(search_path->nodes[0],
+                                                 di, &di_key);
+                       if (di_key.type == BTRFS_INODE_ITEM_KEY) {
+                               ret = 1;
+                               *other_ino = di_key.objectid;
+                       } else {
+                               ret = -EAGAIN;
+                       }
                         goto out;
                 } else if (IS_ERR(di)) {
                         ret = PTR_ERR(di);
@@ -4622,23 +4657,6 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
  
         mutex_lock(&BTRFS_I(inode)->log_mutex);
  
-       /*
-        * Collect ordered extents only if we are logging data. This is to
-        * ensure a subsequent request to log this inode in LOG_INODE_ALL mode
-        * will process the ordered extents if they still exists at the time,
-        * because when we collect them we test and set for the flag
-        * BTRFS_ORDERED_LOGGED to prevent multiple log requests to process the
-        * same ordered extents. The consequence for the LOG_INODE_ALL log mode
-        * not processing the ordered extents is that we end up logging the
-        * corresponding file extent items, based on the extent maps in the
-        * inode's extent_map_tree's modified_list, without logging the
-        * respective checksums (since the may still be only attached to the
-        * ordered extents and have not been inserted in the csum tree by
-        * btrfs_finish_ordered_io() yet).
-        */
-       if (inode_only == LOG_INODE_ALL)
-               btrfs_get_logged_extents(inode, &logged_list, start, end);
-
         /*
          * a brute force approach to making sure we get the most uptodate
          * copies of everything.
@@ -4711,6 +4729,10 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
                 ins_nr = 0;
                 ret = btrfs_search_forward(root, &min_key,
                                            path, trans->transid);
+               if (ret < 0) {
+                       err = ret;
+                       goto out_unlock;
+               }
                 if (ret != 0)
                         break;
  again:
@@ -4726,16 +4748,72 @@ again:
                 if ((min_key.type == BTRFS_INODE_REF_KEY ||
                      min_key.type == BTRFS_INODE_EXTREF_KEY) &&
                     BTRFS_I(inode)->generation == trans->transid) {
+                       u64 other_ino = 0;
+
                         ret = btrfs_check_ref_name_override(path->nodes[0],
                                                             path->slots[0],
-                                                           &min_key, inode);
+                                                           &min_key, inode,
+                                                           &other_ino);
                         if (ret < 0) {
                                 err = ret;
                                 goto out_unlock;
-                       } else if (ret > 0) {
-                               err = 1;
-                               btrfs_set_log_full_commit(root->fs_info, trans);
-                               goto out_unlock;
+                       } else if (ret > 0 && ctx &&
+                                  other_ino != btrfs_ino(ctx->inode)) {
+                               struct btrfs_key inode_key;
+                               struct inode *other_inode;
+
+                               if (ins_nr > 0) {
+                                       ins_nr++;
+                               } else {
+                                       ins_nr = 1;
+                                       ins_start_slot = path->slots[0];
+                               }
+                               ret = copy_items(trans, inode, dst_path, path,
+                                                &last_extent, ins_start_slot,
+                                                ins_nr, inode_only,
+                                                logged_isize);
+                               if (ret < 0) {
+                                       err = ret;
+                                       goto out_unlock;
+                               }
+                               ins_nr = 0;
+                               btrfs_release_path(path);
+                               inode_key.objectid = other_ino;
+                               inode_key.type = BTRFS_INODE_ITEM_KEY;
+                               inode_key.offset = 0;
+                               other_inode = btrfs_iget(root->fs_info->sb,
+                                                        &inode_key, root,
+                                                        NULL);
+                               /*
+                                * If the other inode that had a conflicting dir
+                                * entry was deleted in the current transaction,
+                                * we don't need to do more work nor fall back
+                                * to a transaction commit.
+                                */
+                               if (IS_ERR(other_inode) &&
+                                   PTR_ERR(other_inode) == -ENOENT) {
+                                       goto next_key;
+                               } else if (IS_ERR(other_inode)) {
+                                       err = PTR_ERR(other_inode);
+                                       goto out_unlock;
+                               }
+                               /*
+                                * We are safe logging the other inode without
+                                * acquiring its i_mutex as long as we log with
+                                * the LOG_INODE_EXISTS mode. We're safe against
+                                * concurrent renames of the other inode as well
+                                * because during a rename we pin the log and
+                                * update the log with the new name before we
+                                * unpin it.
+                                */
+                               err = btrfs_log_inode(trans, root, other_inode,
+                                                     LOG_INODE_EXISTS,
+                                                     0, LLONG_MAX, ctx);
+                               iput(other_inode);
+                               if (err)
+                                       goto out_unlock;
+                               else
+                                       goto next_key;
                         }
                 }
  
@@ -4803,7 +4881,7 @@ next_slot:
                         ins_nr = 0;
                 }
                 btrfs_release_path(path);
-
+next_key:
                 if (min_key.offset < (u64)-1) {
                         min_key.offset++;
                 } else if (min_key.type < max_key.type) {
@@ -4846,21 +4924,6 @@ log_extents:
                         goto out_unlock;
         }
         if (fast_search) {
-               /*
-                * Some ordered extents started by fsync might have completed
-                * before we collected the ordered extents in logged_list, which
-                * means they're gone, not in our logged_list nor in the inode's
-                * ordered tree. We want the application/user space to know an
-                * error happened while attempting to persist file data so that
-                * it can take proper action. If such error happened, we leave
-                * without writing to the log tree and the fsync must report the
-                * file data write error and not commit the current transaction.
-                */
-               err = btrfs_inode_check_errors(inode);
-               if (err) {
-                       ctx->io_err = err;
-                       goto out_unlock;
-               }
                 ret = btrfs_log_changed_extents(trans, root, inode, dst_path,
                                                 &logged_list, ctx, start, end);
                 if (ret) {
@@ -4937,7 +5000,7 @@ out_unlock:
   * the actual unlink operation, so if we do this check before a concurrent task
   * sets last_unlink_trans it means we've logged a consistent version/state of
   * all the inode items, otherwise we are not sure and must do a transaction
- * commit (the concurrent task migth have only updated last_unlink_trans before
+ * commit (the concurrent task might have only updated last_unlink_trans before
   * we logged the inode or it might have also done the unlink).
   */
  static bool btrfs_must_commit_transaction(struct btrfs_trans_handle *trans,
@@ -4988,7 +5051,7 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans,
                         goto out;
  
         if (!S_ISDIR(inode->i_mode)) {
-               if (!parent || d_really_is_negative(parent) || sb != d_inode(parent)->i_sb)
+               if (!parent || d_really_is_negative(parent) || sb != parent->d_sb)
                         goto out;
                 inode = d_inode(parent);
         }
@@ -4996,7 +5059,7 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans,
         while (1) {
                 /*
                  * If we are logging a directory then we start with our inode,
-                * not our parents inode, so we need to skipp setting the
+                * not our parent's inode, so we need to skip setting the
                  * logged_trans so that further down in the log code we don't
                  * think this inode has already been logged.
                  */
@@ -5009,11 +5072,15 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans,
                         break;
                 }
  
-               if (!parent || d_really_is_negative(parent) || sb != d_inode(parent)->i_sb)
+               if (!parent || d_really_is_negative(parent) || sb != parent->d_sb)
                         break;
  
-               if (IS_ROOT(parent))
+               if (IS_ROOT(parent)) {
+                       inode = d_inode(parent);
+                       if (btrfs_must_commit_transaction(trans, inode))
+                               ret = 1;
                         break;
+               }
  
                 parent = dget_parent(parent);
                 dput(old_parent);
@@ -5158,7 +5225,7 @@ process_leaf:
                         }
  
                         ctx->log_new_dentries = false;
-                       if (type == BTRFS_FT_DIR)
+                       if (type == BTRFS_FT_DIR || type == BTRFS_FT_SYMLINK)
                                 log_mode = LOG_INODE_ALL;
                         btrfs_release_path(path);
                         ret = btrfs_log_inode(trans, root, di_inode,
@@ -5278,11 +5345,16 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans,
                         if (IS_ERR(dir_inode))
                                 continue;
  
+                       if (ctx)
+                               ctx->log_new_dentries = false;
                         ret = btrfs_log_inode(trans, root, dir_inode,
                                               LOG_INODE_ALL, 0, LLONG_MAX, ctx);
                         if (!ret &&
                             btrfs_must_commit_transaction(trans, dir_inode))
                                 ret = 1;
+                       if (!ret && ctx && ctx->log_new_dentries)
+                               ret = log_new_dir_dentries(trans, root,
+                                                          dir_inode, ctx);
                         iput(dir_inode);
                         if (ret)
                                 goto out;
@@ -5319,7 +5391,7 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
  
         sb = inode->i_sb;
  
-       if (btrfs_test_opt(root, NOTREELOG)) {
+       if (btrfs_test_opt(root->fs_info, NOTREELOG)) {
                 ret = 1;
                 goto end_no_trans;
         }
@@ -5375,7 +5447,7 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
                 log_dentries = true;
  
         /*
-        * On unlink we must make sure all our current and old parent directores
+        * On unlink we must make sure all our current and old parent directory
          * inodes are fully logged. This is to prevent leaving dangling
          * directory index entries in directories that were our parents but are
          * not anymore. Not doing this results in old parent directory being
@@ -5422,7 +5494,7 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
         }
  
         while (1) {
-               if (!parent || d_really_is_negative(parent) || sb != d_inode(parent)->i_sb)
+               if (!parent || d_really_is_negative(parent) || sb != parent->d_sb)
                         break;
  
                 inode = d_inode(parent);
@@ -5519,7 +5591,7 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
  
         ret = walk_log_tree(trans, log_root_tree, &wc);
         if (ret) {
-               btrfs_std_error(fs_info, ret, "Failed to pin buffers while "
+               btrfs_handle_fs_error(fs_info, ret, "Failed to pin buffers while "
                             "recovering log root tree.");
                 goto error;
         }
@@ -5533,7 +5605,7 @@ again:
                 ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0);
  
                 if (ret < 0) {
-                       btrfs_std_error(fs_info, ret,
+                       btrfs_handle_fs_error(fs_info, ret,
                                     "Couldn't find tree log root.");
                         goto error;
                 }
@@ -5551,7 +5623,7 @@ again:
                 log = btrfs_read_fs_root(log_root_tree, &found_key);
                 if (IS_ERR(log)) {
                         ret = PTR_ERR(log);
-                       btrfs_std_error(fs_info, ret,
+                       btrfs_handle_fs_error(fs_info, ret,
                                     "Couldn't read tree log root.");
                         goto error;
                 }
@@ -5566,7 +5638,7 @@ again:
                         free_extent_buffer(log->node);
                         free_extent_buffer(log->commit_root);
                         kfree(log);
-                       btrfs_std_error(fs_info, ret, "Couldn't read target root "
+                       btrfs_handle_fs_error(fs_info, ret, "Couldn't read target root "
                                     "for tree log recovery.");
                         goto error;
                 }
@@ -5652,11 +5724,9 @@ void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
          * into the file.  When the file is logged we check it and
          * don't log the parents if the file is fully on disk.
          */
-       if (S_ISREG(inode->i_mode)) {
-               mutex_lock(&BTRFS_I(inode)->log_mutex);
-               BTRFS_I(inode)->last_unlink_trans = trans->transid;
-               mutex_unlock(&BTRFS_I(inode)->log_mutex);
-       }
+       mutex_lock(&BTRFS_I(inode)->log_mutex);
+       BTRFS_I(inode)->last_unlink_trans = trans->transid;
+       mutex_unlock(&BTRFS_I(inode)->log_mutex);
  
         /*
          * if this directory was already logged any new