fs/xfs/libxfs/xfs_defer.c

   1 /*
   2  * Copyright (C) 2016 Oracle.  All Rights Reserved.
   3  *
   4  * Author: Darrick J. Wong <darrick.wong@oracle.com>
   5  *
   6  * This program is free software; you can redistribute it and/or
   7  * modify it under the terms of the GNU General Public License
   8  * as published by the Free Software Foundation; either version 2
   9  * of the License, or (at your option) any later version.
  10  *
  11  * This program is distributed in the hope that it would be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14  * GNU General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU General Public License
  17  * along with this program; if not, write the Free Software Foundation,
  18  * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301, USA.
  19  */
  20 #include "xfs.h"
  21 #include "xfs_fs.h"
  22 #include "xfs_shared.h"
  23 #include "xfs_format.h"
  24 #include "xfs_log_format.h"
  25 #include "xfs_trans_resv.h"
  26 #include "xfs_bit.h"
  27 #include "xfs_sb.h"
  28 #include "xfs_mount.h"
  29 #include "xfs_defer.h"
  30 #include "xfs_trans.h"
  31 #include "xfs_trace.h"
  32
  33 /*
  34  * Deferred Operations in XFS
  35  *
  36  * Due to the way locking rules work in XFS, certain transactions (block
  37  * mapping and unmapping, typically) have permanent reservations so that
  38  * we can roll the transaction to adhere to AG locking order rules and
  39  * to unlock buffers between metadata updates.  Prior to rmap/reflink,
  40  * the mapping code had a mechanism to perform these deferrals for
  41  * extents that were going to be freed; this code makes that facility
  42  * more generic.
  43  *
  44  * When adding the reverse mapping and reflink features, it became
  45  * necessary to perform complex remapping multi-transactions to comply
  46  * with AG locking order rules, and to be able to spread a single
  47  * refcount update operation (an operation on an n-block extent can
  48  * update as many as n records!) among multiple transactions.  XFS can
  49  * roll a transaction to facilitate this, but using this facility
  50  * requires us to log "intent" items in case log recovery needs to
  51  * redo the operation, and to log "done" items to indicate that redo
  52  * is not necessary.
  53  *
  54  * Deferred work is tracked in xfs_defer_pending items.  Each pending
  55  * item tracks one type of deferred work.  Incoming work items (which
  56  * have not yet had an intent logged) are attached to a pending item
  57  * on the dop_intake list, where they wait for the caller to finish
  58  * the deferred operations.
  59  *
  60  * Finishing a set of deferred operations is an involved process.  To
  61  * start, we define "rolling a deferred-op transaction" as follows:
  62  *
  63  * > For each xfs_defer_pending item on the dop_intake list,
  64  *   - Sort the work items in AG order.  XFS locking
  65  *     order rules require us to lock buffers in AG order.
  66  *   - Create a log intent item for that type.
  67  *   - Attach it to the pending item.
  68  *   - Move the pending item from the dop_intake list to the
  69  *     dop_pending list.
  70  * > Roll the transaction.
  71  *
  72  * NOTE: To avoid exceeding the transaction reservation, we limit the
  73  * number of items that we attach to a given xfs_defer_pending.
  74  *
  75  * The actual finishing process looks like this:
  76  *
  77  * > For each xfs_defer_pending in the dop_pending list,
  78  *   - Roll the deferred-op transaction as above.
  79  *   - Create a log done item for that type, and attach it to the
  80  *     log intent item.
  81  *   - For each work item attached to the log intent item,
  82  *     * Perform the described action.
  83  *     * Attach the work item to the log done item.
  84  *
  85  * The key here is that we must log an intent item for all pending
  86  * work items every time we roll the transaction, and that we must log
  87  * a done item as soon as the work is completed.  With this mechanism
  88  * we can perform complex remapping operations, chaining intent items
  89  * as needed.
  90  *
  91  * This is an example of remapping the extent (E, E+B) into file X at
  92  * offset A and dealing with the extent (C, C+B) already being mapped
  93  * there:
  94  * +-------------------------------------------------+
  95  * | Unmap file X startblock C offset A length B     | t0
  96  * | Intent to reduce refcount for extent (C, B)     |
  97  * | Intent to remove rmap (X, C, A, B)              |
  98  * | Intent to free extent (D, 1) (bmbt block)       |
  99  * | Intent to map (X, A, B) at startblock E         |
 100  * +-------------------------------------------------+
 101  * | Map file X startblock E offset A length B       | t1
 102  * | Done mapping (X, E, A, B)                       |
 103  * | Intent to increase refcount for extent (E, B)   |
 104  * | Intent to add rmap (X, E, A, B)                 |
 105  * +-------------------------------------------------+
 106  * | Reduce refcount for extent (C, B)               | t2
 107  * | Done reducing refcount for extent (C, B)        |
 108  * | Increase refcount for extent (E, B)             |
 109  * | Done increasing refcount for extent (E, B)      |
 110  * | Intent to free extent (C, B)                    |
 111  * | Intent to free extent (F, 1) (refcountbt block) |
 112  * | Intent to remove rmap (F, 1, REFC)              |
 113  * +-------------------------------------------------+
 114  * | Remove rmap (X, C, A, B)                        | t3
 115  * | Done removing rmap (X, C, A, B)                 |
 116  * | Add rmap (X, E, A, B)                           |
 117  * | Done adding rmap (X, E, A, B)                   |
 118  * | Remove rmap (F, 1, REFC)                        |
 119  * | Done removing rmap (F, 1, REFC)                 |
 120  * +-------------------------------------------------+
 121  * | Free extent (C, B)                              | t4
 122  * | Done freeing extent (C, B)                      |
 123  * | Free extent (D, 1)                              |
 124  * | Done freeing extent (D, 1)                      |
 125  * | Free extent (F, 1)                              |
 126  * | Done freeing extent (F, 1)                      |
 127  * +-------------------------------------------------+
 128  *
 129  * If we should crash before t2 commits, log recovery replays
 130  * the following intent items:
 131  *
 132  * - Intent to reduce refcount for extent (C, B)
 133  * - Intent to remove rmap (X, C, A, B)
 134  * - Intent to free extent (D, 1) (bmbt block)
 135  * - Intent to increase refcount for extent (E, B)
 136  * - Intent to add rmap (X, E, A, B)
 137  *
 138  * In the process of recovering, it should also generate and take care
 139  * of these intent items:
 140  *
 141  * - Intent to free extent (C, B)
 142  * - Intent to free extent (F, 1) (refcountbt block)
 143  * - Intent to remove rmap (F, 1, REFC)
 144  */
 145
/*
 * Table of registered deferred-operation types.  Entries are installed by
 * xfs_defer_init_op_type(), which stores each descriptor at the index given
 * by its own ->type field, and are looked up by xfs_defer_add() using the
 * caller-supplied type value.
 */
static const struct xfs_defer_op_type *defer_op_types[XFS_DEFER_OPS_TYPE_MAX];
 147
 148 /*
 149  * For each pending item in the intake list, log its intent item and the
 150  * associated extents, then add the entire intake list to the end of
 151  * the pending list.
 152  */
 153 STATIC void
 154 xfs_defer_intake_work(
 155         struct xfs_trans                *tp,
 156         struct xfs_defer_ops            *dop)
 157 {
 158         struct list_head                *li;
 159         struct xfs_defer_pending        *dfp;
 160
 161         list_for_each_entry(dfp, &dop->dop_intake, dfp_list) {
 162                 trace_xfs_defer_intake_work(tp->t_mountp, dfp);
 163                 dfp->dfp_intent = dfp->dfp_type->create_intent(tp,
 164                                 dfp->dfp_count);
 165                 list_sort(tp->t_mountp, &dfp->dfp_work,
 166                                 dfp->dfp_type->diff_items);
 167                 list_for_each(li, &dfp->dfp_work)
 168                         dfp->dfp_type->log_item(tp, dfp->dfp_intent, li);
 169         }
 170
 171         list_splice_tail_init(&dop->dop_intake, &dop->dop_pending);
 172 }
 173
 174 /* Abort all the intents that were committed. */
 175 STATIC void
 176 xfs_defer_trans_abort(
 177         struct xfs_trans                *tp,
 178         struct xfs_defer_ops            *dop,
 179         int                             error)
 180 {
 181         struct xfs_defer_pending        *dfp;
 182
 183         trace_xfs_defer_trans_abort(tp->t_mountp, dop);
 184         /*
 185          * If the transaction was committed, drop the intent reference
 186          * since we're bailing out of here. The other reference is
 187          * dropped when the intent hits the AIL.  If the transaction
 188          * was not committed, the intent is freed by the intent item
 189          * unlock handler on abort.
 190          */
 191         if (!dop->dop_committed)
 192                 return;
 193
 194         /* Abort intent items. */
 195         list_for_each_entry(dfp, &dop->dop_pending, dfp_list) {
 196                 trace_xfs_defer_pending_abort(tp->t_mountp, dfp);
 197                 if (dfp->dfp_committed)
 198                         dfp->dfp_type->abort_intent(dfp->dfp_intent);
 199         }
 200
 201         /* Shut down FS. */
 202         xfs_force_shutdown(tp->t_mountp, (error == -EFSCORRUPTED) ?
 203                         SHUTDOWN_CORRUPT_INCORE : SHUTDOWN_META_IO_ERROR);
 204 }
 205
 206 /* Roll a transaction so we can do some deferred op processing. */
 207 STATIC int
 208 xfs_defer_trans_roll(
 209         struct xfs_trans                **tp,
 210         struct xfs_defer_ops            *dop,
 211         struct xfs_inode                *ip)
 212 {
 213         int                             i;
 214         int                             error;
 215
 216         /* Log all the joined inodes except the one we passed in. */
 217         for (i = 0; i < XFS_DEFER_OPS_NR_INODES && dop->dop_inodes[i]; i++) {
 218                 if (dop->dop_inodes[i] == ip)
 219                         continue;
 220                 xfs_trans_log_inode(*tp, dop->dop_inodes[i], XFS_ILOG_CORE);
 221         }
 222
 223         trace_xfs_defer_trans_roll((*tp)->t_mountp, dop);
 224
 225         /* Roll the transaction. */
 226         error = xfs_trans_roll(tp, ip);
 227         if (error) {
 228                 trace_xfs_defer_trans_roll_error((*tp)->t_mountp, dop, error);
 229                 xfs_defer_trans_abort(*tp, dop, error);
 230                 return error;
 231         }
 232         dop->dop_committed = true;
 233
 234         /* Rejoin the joined inodes except the one we passed in. */
 235         for (i = 0; i < XFS_DEFER_OPS_NR_INODES && dop->dop_inodes[i]; i++) {
 236                 if (dop->dop_inodes[i] == ip)
 237                         continue;
 238                 xfs_trans_ijoin(*tp, dop->dop_inodes[i], 0);
 239         }
 240
 241         return error;
 242 }
 243
 244 /* Do we have any work items to finish? */
 245 bool
 246 xfs_defer_has_unfinished_work(
 247         struct xfs_defer_ops            *dop)
 248 {
 249         return !list_empty(&dop->dop_pending) || !list_empty(&dop->dop_intake);
 250 }
 251
 252 /*
 253  * Add this inode to the deferred op.  Each joined inode is relogged
 254  * each time we roll the transaction, in addition to any inode passed
 255  * to xfs_defer_finish().
 256  */
 257 int
 258 xfs_defer_join(
 259         struct xfs_defer_ops            *dop,
 260         struct xfs_inode                *ip)
 261 {
 262         int                             i;
 263
 264         for (i = 0; i < XFS_DEFER_OPS_NR_INODES; i++) {
 265                 if (dop->dop_inodes[i] == ip)
 266                         return 0;
 267                 else if (dop->dop_inodes[i] == NULL) {
 268                         dop->dop_inodes[i] = ip;
 269                         return 0;
 270                 }
 271         }
 272
 273         return -EFSCORRUPTED;
 274 }
 275
 276 /*
 277  * Finish all the pending work.  This involves logging intent items for
 278  * any work items that wandered in since the last transaction roll (if
 279  * one has even happened), rolling the transaction, and finishing the
 280  * work items in the first item on the logged-and-pending list.
 281  *
 282  * If an inode is provided, relog it to the new transaction.
 283  */
 284 int
 285 xfs_defer_finish(
 286         struct xfs_trans                **tp,
 287         struct xfs_defer_ops            *dop,
 288         struct xfs_inode                *ip)
 289 {
 290         struct xfs_defer_pending        *dfp;
 291         struct list_head                *li;
 292         struct list_head                *n;
 293         void                            *done_item = NULL;
 294         void                            *state;
 295         int                             error = 0;
 296         void                            (*cleanup_fn)(struct xfs_trans *, void *, int);
 297
 298         ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES);
 299
 300         trace_xfs_defer_finish((*tp)->t_mountp, dop);
 301
 302         /* Until we run out of pending work to finish... */
 303         while (xfs_defer_has_unfinished_work(dop)) {
 304                 /* Log intents for work items sitting in the intake. */
 305                 xfs_defer_intake_work(*tp, dop);
 306
 307                 /* Roll the transaction. */
 308                 error = xfs_defer_trans_roll(tp, dop, ip);
 309                 if (error)
 310                         goto out;
 311
 312                 /* Mark all pending intents as committed. */
 313                 list_for_each_entry_reverse(dfp, &dop->dop_pending, dfp_list) {
 314                         if (dfp->dfp_committed)
 315                                 break;
 316                         trace_xfs_defer_pending_commit((*tp)->t_mountp, dfp);
 317                         dfp->dfp_committed = true;
 318                 }
 319
 320                 /* Log an intent-done item for the first pending item. */
 321                 dfp = list_first_entry(&dop->dop_pending,
 322                                 struct xfs_defer_pending, dfp_list);
 323                 trace_xfs_defer_pending_finish((*tp)->t_mountp, dfp);
 324                 done_item = dfp->dfp_type->create_done(*tp, dfp->dfp_intent,
 325                                 dfp->dfp_count);
 326                 cleanup_fn = dfp->dfp_type->finish_cleanup;
 327
 328                 /* Finish the work items. */
 329                 state = NULL;
 330                 list_for_each_safe(li, n, &dfp->dfp_work) {
 331                         list_del(li);
 332                         dfp->dfp_count--;
 333                         error = dfp->dfp_type->finish_item(*tp, dop, li,
 334                                         done_item, &state);
 335                         if (error) {
 336                                 /*
 337                                  * Clean up after ourselves and jump out.
 338                                  * xfs_defer_cancel will take care of freeing
 339                                  * all these lists and stuff.
 340                                  */
 341                                 if (cleanup_fn)
 342                                         cleanup_fn(*tp, state, error);
 343                                 xfs_defer_trans_abort(*tp, dop, error);
 344                                 goto out;
 345                         }
 346                 }
 347                 /* Done with the dfp, free it. */
 348                 list_del(&dfp->dfp_list);
 349                 kmem_free(dfp);
 350
 351                 if (cleanup_fn)
 352                         cleanup_fn(*tp, state, error);
 353         }
 354
 355 out:
 356         if (error)
 357                 trace_xfs_defer_finish_error((*tp)->t_mountp, dop, error);
 358         else
 359                 trace_xfs_defer_finish_done((*tp)->t_mountp, dop);
 360         return error;
 361 }
 362
 363 /*
 364  * Free up any items left in the list.
 365  */
 366 void
 367 xfs_defer_cancel(
 368         struct xfs_defer_ops            *dop)
 369 {
 370         struct xfs_defer_pending        *dfp;
 371         struct xfs_defer_pending        *pli;
 372         struct list_head                *pwi;
 373         struct list_head                *n;
 374
 375         trace_xfs_defer_cancel(NULL, dop);
 376
 377         /*
 378          * Free the pending items.  Caller should already have arranged
 379          * for the intent items to be released.
 380          */
 381         list_for_each_entry_safe(dfp, pli, &dop->dop_intake, dfp_list) {
 382                 trace_xfs_defer_intake_cancel(NULL, dfp);
 383                 list_del(&dfp->dfp_list);
 384                 list_for_each_safe(pwi, n, &dfp->dfp_work) {
 385                         list_del(pwi);
 386                         dfp->dfp_count--;
 387                         dfp->dfp_type->cancel_item(pwi);
 388                 }
 389                 ASSERT(dfp->dfp_count == 0);
 390                 kmem_free(dfp);
 391         }
 392         list_for_each_entry_safe(dfp, pli, &dop->dop_pending, dfp_list) {
 393                 trace_xfs_defer_pending_cancel(NULL, dfp);
 394                 list_del(&dfp->dfp_list);
 395                 list_for_each_safe(pwi, n, &dfp->dfp_work) {
 396                         list_del(pwi);
 397                         dfp->dfp_count--;
 398                         dfp->dfp_type->cancel_item(pwi);
 399                 }
 400                 ASSERT(dfp->dfp_count == 0);
 401                 kmem_free(dfp);
 402         }
 403 }
 404
 405 /* Add an item for later deferred processing. */
 406 void
 407 xfs_defer_add(
 408         struct xfs_defer_ops            *dop,
 409         enum xfs_defer_ops_type         type,
 410         struct list_head                *li)
 411 {
 412         struct xfs_defer_pending        *dfp = NULL;
 413
 414         /*
 415          * Add the item to a pending item at the end of the intake list.
 416          * If the last pending item has the same type, reuse it.  Else,
 417          * create a new pending item at the end of the intake list.
 418          */
 419         if (!list_empty(&dop->dop_intake)) {
 420                 dfp = list_last_entry(&dop->dop_intake,
 421                                 struct xfs_defer_pending, dfp_list);
 422                 if (dfp->dfp_type->type != type ||
 423                     (dfp->dfp_type->max_items &&
 424                      dfp->dfp_count >= dfp->dfp_type->max_items))
 425                         dfp = NULL;
 426         }
 427         if (!dfp) {
 428                 dfp = kmem_alloc(sizeof(struct xfs_defer_pending),
 429                                 KM_SLEEP | KM_NOFS);
 430                 dfp->dfp_type = defer_op_types[type];
 431                 dfp->dfp_committed = false;
 432                 dfp->dfp_intent = NULL;
 433                 dfp->dfp_count = 0;
 434                 INIT_LIST_HEAD(&dfp->dfp_work);
 435                 list_add_tail(&dfp->dfp_list, &dop->dop_intake);
 436         }
 437
 438         list_add_tail(li, &dfp->dfp_work);
 439         dfp->dfp_count++;
 440 }
 441
 442 /* Initialize a deferred operation list. */
 443 void
 444 xfs_defer_init_op_type(
 445         const struct xfs_defer_op_type  *type)
 446 {
 447         defer_op_types[type->type] = type;
 448 }
 449
 450 /* Initialize a deferred operation. */
 451 void
 452 xfs_defer_init(
 453         struct xfs_defer_ops            *dop,
 454         xfs_fsblock_t                   *fbp)
 455 {
 456         dop->dop_committed = false;
 457         dop->dop_low = false;
 458         memset(&dop->dop_inodes, 0, sizeof(dop->dop_inodes));
 459         *fbp = NULLFSBLOCK;
 460         INIT_LIST_HEAD(&dop->dop_intake);
 461         INIT_LIST_HEAD(&dop->dop_pending);
 462         trace_xfs_defer_init(NULL, dop);
 463 }