Merge branch 'next' of git://git.kernel.org/pub/scm/linux/kernel/git/rzhang/linux
[cascardo/linux.git] / fs / xfs / xfs_log_recover.c
index e8638fd..846483d 100644 (file)
@@ -44,6 +44,7 @@
 #include "xfs_error.h"
 #include "xfs_dir2.h"
 #include "xfs_rmap_item.h"
+#include "xfs_buf_item.h"
 
 #define BLK_AVG(blk1, blk2)    ((blk1+blk2) >> 1)
 
@@ -381,6 +382,15 @@ xlog_recover_iodone(
                                                SHUTDOWN_META_IO_ERROR);
                }
        }
+
+       /*
+        * On v5 supers, a bli could be attached to update the metadata LSN.
+        * Clean it up.
+        */
+       if (bp->b_fspriv)
+               xfs_buf_item_relse(bp);
+       ASSERT(bp->b_fspriv == NULL);
+
        bp->b_iodone = NULL;
        xfs_buf_ioend(bp);
 }
@@ -2360,12 +2370,14 @@ static void
 xlog_recover_validate_buf_type(
        struct xfs_mount        *mp,
        struct xfs_buf          *bp,
-       xfs_buf_log_format_t    *buf_f)
+       xfs_buf_log_format_t    *buf_f,
+       xfs_lsn_t               current_lsn)
 {
        struct xfs_da_blkinfo   *info = bp->b_addr;
        __uint32_t              magic32;
        __uint16_t              magic16;
        __uint16_t              magicda;
+       char                    *warnmsg = NULL;
 
        /*
         * We can only do post recovery validation on items on CRC enabled
@@ -2404,31 +2416,27 @@ xlog_recover_validate_buf_type(
                        bp->b_ops = &xfs_rmapbt_buf_ops;
                        break;
                default:
-                       xfs_warn(mp, "Bad btree block magic!");
-                       ASSERT(0);
+                       warnmsg = "Bad btree block magic!";
                        break;
                }
                break;
        case XFS_BLFT_AGF_BUF:
                if (magic32 != XFS_AGF_MAGIC) {
-                       xfs_warn(mp, "Bad AGF block magic!");
-                       ASSERT(0);
+                       warnmsg = "Bad AGF block magic!";
                        break;
                }
                bp->b_ops = &xfs_agf_buf_ops;
                break;
        case XFS_BLFT_AGFL_BUF:
                if (magic32 != XFS_AGFL_MAGIC) {
-                       xfs_warn(mp, "Bad AGFL block magic!");
-                       ASSERT(0);
+                       warnmsg = "Bad AGFL block magic!";
                        break;
                }
                bp->b_ops = &xfs_agfl_buf_ops;
                break;
        case XFS_BLFT_AGI_BUF:
                if (magic32 != XFS_AGI_MAGIC) {
-                       xfs_warn(mp, "Bad AGI block magic!");
-                       ASSERT(0);
+                       warnmsg = "Bad AGI block magic!";
                        break;
                }
                bp->b_ops = &xfs_agi_buf_ops;
@@ -2438,8 +2446,7 @@ xlog_recover_validate_buf_type(
        case XFS_BLFT_GDQUOT_BUF:
 #ifdef CONFIG_XFS_QUOTA
                if (magic16 != XFS_DQUOT_MAGIC) {
-                       xfs_warn(mp, "Bad DQUOT block magic!");
-                       ASSERT(0);
+                       warnmsg = "Bad DQUOT block magic!";
                        break;
                }
                bp->b_ops = &xfs_dquot_buf_ops;
@@ -2451,16 +2458,14 @@ xlog_recover_validate_buf_type(
                break;
        case XFS_BLFT_DINO_BUF:
                if (magic16 != XFS_DINODE_MAGIC) {
-                       xfs_warn(mp, "Bad INODE block magic!");
-                       ASSERT(0);
+                       warnmsg = "Bad INODE block magic!";
                        break;
                }
                bp->b_ops = &xfs_inode_buf_ops;
                break;
        case XFS_BLFT_SYMLINK_BUF:
                if (magic32 != XFS_SYMLINK_MAGIC) {
-                       xfs_warn(mp, "Bad symlink block magic!");
-                       ASSERT(0);
+                       warnmsg = "Bad symlink block magic!";
                        break;
                }
                bp->b_ops = &xfs_symlink_buf_ops;
@@ -2468,8 +2473,7 @@ xlog_recover_validate_buf_type(
        case XFS_BLFT_DIR_BLOCK_BUF:
                if (magic32 != XFS_DIR2_BLOCK_MAGIC &&
                    magic32 != XFS_DIR3_BLOCK_MAGIC) {
-                       xfs_warn(mp, "Bad dir block magic!");
-                       ASSERT(0);
+                       warnmsg = "Bad dir block magic!";
                        break;
                }
                bp->b_ops = &xfs_dir3_block_buf_ops;
@@ -2477,8 +2481,7 @@ xlog_recover_validate_buf_type(
        case XFS_BLFT_DIR_DATA_BUF:
                if (magic32 != XFS_DIR2_DATA_MAGIC &&
                    magic32 != XFS_DIR3_DATA_MAGIC) {
-                       xfs_warn(mp, "Bad dir data magic!");
-                       ASSERT(0);
+                       warnmsg = "Bad dir data magic!";
                        break;
                }
                bp->b_ops = &xfs_dir3_data_buf_ops;
@@ -2486,8 +2489,7 @@ xlog_recover_validate_buf_type(
        case XFS_BLFT_DIR_FREE_BUF:
                if (magic32 != XFS_DIR2_FREE_MAGIC &&
                    magic32 != XFS_DIR3_FREE_MAGIC) {
-                       xfs_warn(mp, "Bad dir3 free magic!");
-                       ASSERT(0);
+                       warnmsg = "Bad dir3 free magic!";
                        break;
                }
                bp->b_ops = &xfs_dir3_free_buf_ops;
@@ -2495,8 +2497,7 @@ xlog_recover_validate_buf_type(
        case XFS_BLFT_DIR_LEAF1_BUF:
                if (magicda != XFS_DIR2_LEAF1_MAGIC &&
                    magicda != XFS_DIR3_LEAF1_MAGIC) {
-                       xfs_warn(mp, "Bad dir leaf1 magic!");
-                       ASSERT(0);
+                       warnmsg = "Bad dir leaf1 magic!";
                        break;
                }
                bp->b_ops = &xfs_dir3_leaf1_buf_ops;
@@ -2504,8 +2505,7 @@ xlog_recover_validate_buf_type(
        case XFS_BLFT_DIR_LEAFN_BUF:
                if (magicda != XFS_DIR2_LEAFN_MAGIC &&
                    magicda != XFS_DIR3_LEAFN_MAGIC) {
-                       xfs_warn(mp, "Bad dir leafn magic!");
-                       ASSERT(0);
+                       warnmsg = "Bad dir leafn magic!";
                        break;
                }
                bp->b_ops = &xfs_dir3_leafn_buf_ops;
@@ -2513,8 +2513,7 @@ xlog_recover_validate_buf_type(
        case XFS_BLFT_DA_NODE_BUF:
                if (magicda != XFS_DA_NODE_MAGIC &&
                    magicda != XFS_DA3_NODE_MAGIC) {
-                       xfs_warn(mp, "Bad da node magic!");
-                       ASSERT(0);
+                       warnmsg = "Bad da node magic!";
                        break;
                }
                bp->b_ops = &xfs_da3_node_buf_ops;
@@ -2522,24 +2521,21 @@ xlog_recover_validate_buf_type(
        case XFS_BLFT_ATTR_LEAF_BUF:
                if (magicda != XFS_ATTR_LEAF_MAGIC &&
                    magicda != XFS_ATTR3_LEAF_MAGIC) {
-                       xfs_warn(mp, "Bad attr leaf magic!");
-                       ASSERT(0);
+                       warnmsg = "Bad attr leaf magic!";
                        break;
                }
                bp->b_ops = &xfs_attr3_leaf_buf_ops;
                break;
        case XFS_BLFT_ATTR_RMT_BUF:
                if (magic32 != XFS_ATTR3_RMT_MAGIC) {
-                       xfs_warn(mp, "Bad attr remote magic!");
-                       ASSERT(0);
+                       warnmsg = "Bad attr remote magic!";
                        break;
                }
                bp->b_ops = &xfs_attr3_rmt_buf_ops;
                break;
        case XFS_BLFT_SB_BUF:
                if (magic32 != XFS_SB_MAGIC) {
-                       xfs_warn(mp, "Bad SB block magic!");
-                       ASSERT(0);
+                       warnmsg = "Bad SB block magic!";
                        break;
                }
                bp->b_ops = &xfs_sb_buf_ops;
@@ -2556,6 +2552,40 @@ xlog_recover_validate_buf_type(
                         xfs_blft_from_flags(buf_f));
                break;
        }
+
+       /*
+        * Nothing else to do in the case of a NULL current LSN as this means
+        * the buffer is more recent than the change in the log and will be
+        * skipped.
+        */
+       if (current_lsn == NULLCOMMITLSN)
+               return;
+
+       if (warnmsg) {
+               xfs_warn(mp, warnmsg);
+               ASSERT(0);
+       }
+
+       /*
+        * We must update the metadata LSN of the buffer as it is written out to
+        * ensure that older transactions never replay over this one and corrupt
+        * the buffer. This can occur if log recovery is interrupted at some
+        * point after the current transaction completes, at which point a
+        * subsequent mount starts recovery from the beginning.
+        *
+        * Write verifiers update the metadata LSN from log items attached to
+        * the buffer. Therefore, initialize a bli purely to carry the LSN to
+        * the verifier. We'll clean it up in our ->iodone() callback.
+        */
+       if (bp->b_ops) {
+               struct xfs_buf_log_item *bip;
+
+               ASSERT(!bp->b_iodone || bp->b_iodone == xlog_recover_iodone);
+               bp->b_iodone = xlog_recover_iodone;
+               xfs_buf_item_init(bp, mp);
+               bip = bp->b_fspriv;
+               bip->bli_item.li_lsn = current_lsn;
+       }
 }
 
 /*
@@ -2569,7 +2599,8 @@ xlog_recover_do_reg_buffer(
        struct xfs_mount        *mp,
        xlog_recover_item_t     *item,
        struct xfs_buf          *bp,
-       xfs_buf_log_format_t    *buf_f)
+       xfs_buf_log_format_t    *buf_f,
+       xfs_lsn_t               current_lsn)
 {
        int                     i;
        int                     bit;
@@ -2642,7 +2673,7 @@ xlog_recover_do_reg_buffer(
        /* Shouldn't be any more regions */
        ASSERT(i == item->ri_total);
 
-       xlog_recover_validate_buf_type(mp, bp, buf_f);
+       xlog_recover_validate_buf_type(mp, bp, buf_f, current_lsn);
 }
 
 /*
@@ -2685,7 +2716,7 @@ xlog_recover_do_dquot_buffer(
        if (log->l_quotaoffs_flag & type)
                return false;
 
-       xlog_recover_do_reg_buffer(mp, item, bp, buf_f);
+       xlog_recover_do_reg_buffer(mp, item, bp, buf_f, NULLCOMMITLSN);
        return true;
 }
 
@@ -2773,7 +2804,8 @@ xlog_recover_buffer_pass2(
         */
        lsn = xlog_recover_get_buf_lsn(mp, bp);
        if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) {
-               xlog_recover_validate_buf_type(mp, bp, buf_f);
+               trace_xfs_log_recover_buf_skip(log, buf_f);
+               xlog_recover_validate_buf_type(mp, bp, buf_f, NULLCOMMITLSN);
                goto out_release;
        }
 
@@ -2789,7 +2821,7 @@ xlog_recover_buffer_pass2(
                if (!dirty)
                        goto out_release;
        } else {
-               xlog_recover_do_reg_buffer(mp, item, bp, buf_f);
+               xlog_recover_do_reg_buffer(mp, item, bp, buf_f, current_lsn);
        }
 
        /*
@@ -3846,14 +3878,13 @@ STATIC int
 xlog_recover_commit_trans(
        struct xlog             *log,
        struct xlog_recover     *trans,
-       int                     pass)
+       int                     pass,
+       struct list_head        *buffer_list)
 {
        int                             error = 0;
-       int                             error2;
        int                             items_queued = 0;
        struct xlog_recover_item        *item;
        struct xlog_recover_item        *next;
-       LIST_HEAD                       (buffer_list);
        LIST_HEAD                       (ra_list);
        LIST_HEAD                       (done_list);
 
@@ -3876,7 +3907,7 @@ xlog_recover_commit_trans(
                        items_queued++;
                        if (items_queued >= XLOG_RECOVER_COMMIT_QUEUE_MAX) {
                                error = xlog_recover_items_pass2(log, trans,
-                                               &buffer_list, &ra_list);
+                                               buffer_list, &ra_list);
                                list_splice_tail_init(&ra_list, &done_list);
                                items_queued = 0;
                        }
@@ -3894,15 +3925,14 @@ out:
        if (!list_empty(&ra_list)) {
                if (!error)
                        error = xlog_recover_items_pass2(log, trans,
-                                       &buffer_list, &ra_list);
+                                       buffer_list, &ra_list);
                list_splice_tail_init(&ra_list, &done_list);
        }
 
        if (!list_empty(&done_list))
                list_splice_init(&done_list, &trans->r_itemq);
 
-       error2 = xfs_buf_delwri_submit(&buffer_list);
-       return error ? error : error2;
+       return error;
 }
 
 STATIC void
@@ -4085,7 +4115,8 @@ xlog_recovery_process_trans(
        char                    *dp,
        unsigned int            len,
        unsigned int            flags,
-       int                     pass)
+       int                     pass,
+       struct list_head        *buffer_list)
 {
        int                     error = 0;
        bool                    freeit = false;
@@ -4109,7 +4140,8 @@ xlog_recovery_process_trans(
                error = xlog_recover_add_to_cont_trans(log, trans, dp, len);
                break;
        case XLOG_COMMIT_TRANS:
-               error = xlog_recover_commit_trans(log, trans, pass);
+               error = xlog_recover_commit_trans(log, trans, pass,
+                                                 buffer_list);
                /* success or fail, we are now done with this transaction. */
                freeit = true;
                break;
@@ -4191,10 +4223,12 @@ xlog_recover_process_ophdr(
        struct xlog_op_header   *ohead,
        char                    *dp,
        char                    *end,
-       int                     pass)
+       int                     pass,
+       struct list_head        *buffer_list)
 {
        struct xlog_recover     *trans;
        unsigned int            len;
+       int                     error;
 
        /* Do we understand who wrote this op? */
        if (ohead->oh_clientid != XFS_TRANSACTION &&
@@ -4221,8 +4255,39 @@ xlog_recover_process_ophdr(
                return 0;
        }
 
+       /*
+        * The recovered buffer queue is drained only once we know that all
+        * recovery items for the current LSN have been processed. This is
+        * required because:
+        *
+        * - Buffer write submission updates the metadata LSN of the buffer.
+        * - Log recovery skips items with a metadata LSN >= the current LSN of
+        *   the recovery item.
+        * - Separate recovery items against the same metadata buffer can share
+        *   a current LSN. I.e., consider that the LSN of a recovery item is
+        *   defined as the starting LSN of the first record in which its
+        *   transaction appears, that a record can hold multiple transactions,
+        *   and/or that a transaction can span multiple records.
+        *
+        * In other words, we are allowed to submit a buffer from log recovery
+        * once per current LSN. Otherwise, we may incorrectly skip recovery
+        * items and cause corruption.
+        *
+        * We don't know up front whether buffers are updated multiple times per
+        * LSN. Therefore, track the current LSN of each commit log record as it
+        * is processed and drain the queue when it changes. Use commit records
+        * because they are ordered correctly by the logging code.
+        */
+       if (log->l_recovery_lsn != trans->r_lsn &&
+           ohead->oh_flags & XLOG_COMMIT_TRANS) {
+               error = xfs_buf_delwri_submit(buffer_list);
+               if (error)
+                       return error;
+               log->l_recovery_lsn = trans->r_lsn;
+       }
+
        return xlog_recovery_process_trans(log, trans, dp, len,
-                                          ohead->oh_flags, pass);
+                                          ohead->oh_flags, pass, buffer_list);
 }
 
 /*
@@ -4240,7 +4305,8 @@ xlog_recover_process_data(
        struct hlist_head       rhash[],
        struct xlog_rec_header  *rhead,
        char                    *dp,
-       int                     pass)
+       int                     pass,
+       struct list_head        *buffer_list)
 {
        struct xlog_op_header   *ohead;
        char                    *end;
@@ -4254,6 +4320,7 @@ xlog_recover_process_data(
        if (xlog_header_check_recover(log->l_mp, rhead))
                return -EIO;
 
+       trace_xfs_log_recover_record(log, rhead, pass);
        while ((dp < end) && num_logops) {
 
                ohead = (struct xlog_op_header *)dp;
@@ -4262,7 +4329,7 @@ xlog_recover_process_data(
 
                /* errors will abort recovery */
                error = xlog_recover_process_ophdr(log, rhash, rhead, ohead,
-                                                   dp, end, pass);
+                                                  dp, end, pass, buffer_list);
                if (error)
                        return error;
 
@@ -4685,7 +4752,8 @@ xlog_recover_process(
        struct hlist_head       rhash[],
        struct xlog_rec_header  *rhead,
        char                    *dp,
-       int                     pass)
+       int                     pass,
+       struct list_head        *buffer_list)
 {
        int                     error;
        __le32                  crc;
@@ -4732,7 +4800,8 @@ xlog_recover_process(
        if (error)
                return error;
 
-       return xlog_recover_process_data(log, rhash, rhead, dp, pass);
+       return xlog_recover_process_data(log, rhash, rhead, dp, pass,
+                                        buffer_list);
 }
 
 STATIC int
@@ -4793,9 +4862,11 @@ xlog_do_recovery_pass(
        char                    *offset;
        xfs_buf_t               *hbp, *dbp;
        int                     error = 0, h_size, h_len;
+       int                     error2 = 0;
        int                     bblks, split_bblks;
        int                     hblks, split_hblks, wrapped_hblks;
        struct hlist_head       rhash[XLOG_RHASH_SIZE];
+       LIST_HEAD               (buffer_list);
 
        ASSERT(head_blk != tail_blk);
        rhead_blk = 0;
@@ -4981,7 +5052,7 @@ xlog_do_recovery_pass(
                        }
 
                        error = xlog_recover_process(log, rhash, rhead, offset,
-                                                    pass);
+                                                    pass, &buffer_list);
                        if (error)
                                goto bread_err2;
 
@@ -5012,7 +5083,8 @@ xlog_do_recovery_pass(
                if (error)
                        goto bread_err2;
 
-               error = xlog_recover_process(log, rhash, rhead, offset, pass);
+               error = xlog_recover_process(log, rhash, rhead, offset, pass,
+                                            &buffer_list);
                if (error)
                        goto bread_err2;
 
@@ -5025,10 +5097,17 @@ xlog_do_recovery_pass(
  bread_err1:
        xlog_put_bp(hbp);
 
+       /*
+        * Submit buffers that have been added from the last record processed,
+        * regardless of error status.
+        */
+       if (!list_empty(&buffer_list))
+               error2 = xfs_buf_delwri_submit(&buffer_list);
+
        if (error && first_bad)
                *first_bad = rhead_blk;
 
-       return error;
+       return error ? error : error2;
 }
 
 /*