Merge branch 'xfs-4.7-inode-reclaim' into for-next

[cascardo/linux.git] / fs / xfs / xfs_icache.c
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c

index d7a490f..99ee6ee 100644 (file)
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -37,9 +37,6 @@
  #include <linux/kthread.h>
  #include <linux/freezer.h>
  
-STATIC void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp,
-                               struct xfs_perag *pag, struct xfs_inode *ip);
-
  /*
   * Allocate and initialise an xfs_inode.
   */
@@ -63,6 +60,9 @@ xfs_inode_alloc(
                 return NULL;
         }
  
+       /* VFS doesn't initialise i_mode! */
+       VFS_I(ip)->i_mode = 0;
+
         XFS_STATS_INC(mp, vn_active);
         ASSERT(atomic_read(&ip->i_pincount) == 0);
         ASSERT(!spin_is_locked(&ip->i_flags_lock));
@@ -79,7 +79,7 @@ xfs_inode_alloc(
         memset(&ip->i_df, 0, sizeof(xfs_ifork_t));
         ip->i_flags = 0;
         ip->i_delayed_blks = 0;
-       memset(&ip->i_d, 0, sizeof(xfs_icdinode_t));
+       memset(&ip->i_d, 0, sizeof(ip->i_d));
  
         return ip;
  }
@@ -91,14 +91,7 @@ xfs_inode_free_callback(
         struct inode            *inode = container_of(head, struct inode, i_rcu);
         struct xfs_inode        *ip = XFS_I(inode);
  
-       kmem_zone_free(xfs_inode_zone, ip);
-}
-
-void
-xfs_inode_free(
-       struct xfs_inode        *ip)
-{
-       switch (ip->i_d.di_mode & S_IFMT) {
+       switch (VFS_I(ip)->i_mode & S_IFMT) {
         case S_IFREG:
         case S_IFDIR:
         case S_IFLNK:
@@ -115,6 +108,25 @@ xfs_inode_free(
                 ip->i_itemp = NULL;
         }
  
+       kmem_zone_free(xfs_inode_zone, ip);
+}
+
+static void
+__xfs_inode_free(
+       struct xfs_inode        *ip)
+{
+       /* asserts to verify all state is correct here */
+       ASSERT(atomic_read(&ip->i_pincount) == 0);
+       ASSERT(!xfs_isiflocked(ip));
+       XFS_STATS_DEC(ip->i_mount, vn_active);
+
+       call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback);
+}
+
+void
+xfs_inode_free(
+       struct xfs_inode        *ip)
+{
         /*
          * Because we use RCU freeing we need to ensure the inode always
          * appears to be reclaimed with an invalid inode number when in the
@@ -126,12 +138,151 @@ xfs_inode_free(
         ip->i_ino = 0;
         spin_unlock(&ip->i_flags_lock);
  
-       /* asserts to verify all state is correct here */
-       ASSERT(atomic_read(&ip->i_pincount) == 0);
-       ASSERT(!xfs_isiflocked(ip));
-       XFS_STATS_DEC(ip->i_mount, vn_active);
+       __xfs_inode_free(ip);
+}
  
-       call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback);
+/*
+ * Queue a new inode reclaim pass if there are reclaimable inodes and there
+ * isn't a reclaim pass already in progress. By default it runs every 5s based
+ * on the xfs periodic sync default of 30s. Perhaps this should have its own
+ * tunable, but that can be done if this method proves to be ineffective or too
+ * aggressive.
+ */
+static void
+xfs_reclaim_work_queue(
+       struct xfs_mount        *mp)
+{
+
+       rcu_read_lock();
+       if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) {
+               queue_delayed_work(mp->m_reclaim_workqueue, &mp->m_reclaim_work,
+                       msecs_to_jiffies(xfs_syncd_centisecs / 6 * 10));
+       }
+       rcu_read_unlock();
+}
+
+/*
+ * This is a fast pass over the inode cache to try to get reclaim moving on as
+ * many inodes as possible in a short period of time. It kicks itself every few
+ * seconds, as well as being kicked by the inode cache shrinker when memory
+ * goes low. It scans as quickly as possible avoiding locked inodes or those
+ * already being flushed, and once done schedules a future pass.
+ */
+void
+xfs_reclaim_worker(
+       struct work_struct *work)
+{
+       struct xfs_mount *mp = container_of(to_delayed_work(work),
+                                       struct xfs_mount, m_reclaim_work);
+
+       xfs_reclaim_inodes(mp, SYNC_TRYLOCK);
+       xfs_reclaim_work_queue(mp);
+}
+
+static void
+xfs_perag_set_reclaim_tag(
+       struct xfs_perag        *pag)
+{
+       struct xfs_mount        *mp = pag->pag_mount;
+
+       ASSERT(spin_is_locked(&pag->pag_ici_lock));
+       if (pag->pag_ici_reclaimable++)
+               return;
+
+       /* propagate the reclaim tag up into the perag radix tree */
+       spin_lock(&mp->m_perag_lock);
+       radix_tree_tag_set(&mp->m_perag_tree, pag->pag_agno,
+                          XFS_ICI_RECLAIM_TAG);
+       spin_unlock(&mp->m_perag_lock);
+
+       /* schedule periodic background inode reclaim */
+       xfs_reclaim_work_queue(mp);
+
+       trace_xfs_perag_set_reclaim(mp, pag->pag_agno, -1, _RET_IP_);
+}
+
+static void
+xfs_perag_clear_reclaim_tag(
+       struct xfs_perag        *pag)
+{
+       struct xfs_mount        *mp = pag->pag_mount;
+
+       ASSERT(spin_is_locked(&pag->pag_ici_lock));
+       if (--pag->pag_ici_reclaimable)
+               return;
+
+       /* clear the reclaim tag from the perag radix tree */
+       spin_lock(&mp->m_perag_lock);
+       radix_tree_tag_clear(&mp->m_perag_tree, pag->pag_agno,
+                            XFS_ICI_RECLAIM_TAG);
+       spin_unlock(&mp->m_perag_lock);
+       trace_xfs_perag_clear_reclaim(mp, pag->pag_agno, -1, _RET_IP_);
+}
+
+
+/*
+ * We set the inode flag atomically with the radix tree tag.
+ * Once we get tag lookups on the radix tree, this inode flag
+ * can go away.
+ */
+void
+xfs_inode_set_reclaim_tag(
+       struct xfs_inode        *ip)
+{
+       struct xfs_mount        *mp = ip->i_mount;
+       struct xfs_perag        *pag;
+
+       pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
+       spin_lock(&pag->pag_ici_lock);
+       spin_lock(&ip->i_flags_lock);
+
+       radix_tree_tag_set(&pag->pag_ici_root, XFS_INO_TO_AGINO(mp, ip->i_ino),
+                          XFS_ICI_RECLAIM_TAG);
+       xfs_perag_set_reclaim_tag(pag);
+       __xfs_iflags_set(ip, XFS_IRECLAIMABLE);
+
+       spin_unlock(&ip->i_flags_lock);
+       spin_unlock(&pag->pag_ici_lock);
+       xfs_perag_put(pag);
+}
+
+STATIC void
+xfs_inode_clear_reclaim_tag(
+       struct xfs_perag        *pag,
+       xfs_ino_t               ino)
+{
+       radix_tree_tag_clear(&pag->pag_ici_root,
+                            XFS_INO_TO_AGINO(pag->pag_mount, ino),
+                            XFS_ICI_RECLAIM_TAG);
+       xfs_perag_clear_reclaim_tag(pag);
+}
+
+/*
+ * When we recycle a reclaimable inode, we need to re-initialise the VFS inode
+ * part of the structure. This is made more complex by the fact we store
+ * information about the on-disk values in the VFS inode and so we can't just
+ * overwrite the values unconditionally. Hence we save the parameters we
+ * need to retain across reinitialisation, and rewrite them into the VFS inode
+ * after reinitialisation even if it fails.
+ */
+static int
+xfs_reinit_inode(
+       struct xfs_mount        *mp,
+       struct inode            *inode)
+{
+       int             error;
+       uint32_t        nlink = inode->i_nlink;
+       uint32_t        generation = inode->i_generation;
+       uint64_t        version = inode->i_version;
+       umode_t         mode = inode->i_mode;
+
+       error = inode_init_always(mp->m_super, inode);
+
+       set_nlink(inode, nlink);
+       inode->i_generation = generation;
+       inode->i_version = version;
+       inode->i_mode = mode;
+       return error;
  }
  
  /*
@@ -185,7 +336,7 @@ xfs_iget_cache_hit(
         /*
          * If lookup is racing with unlink return an error immediately.
          */
-       if (ip->i_d.di_mode == 0 && !(flags & XFS_IGET_CREATE)) {
+       if (VFS_I(ip)->i_mode == 0 && !(flags & XFS_IGET_CREATE)) {
                 error = -ENOENT;
                 goto out_error;
         }
@@ -208,7 +359,7 @@ xfs_iget_cache_hit(
                 spin_unlock(&ip->i_flags_lock);
                 rcu_read_unlock();
  
-               error = inode_init_always(mp->m_super, inode);
+               error = xfs_reinit_inode(mp, inode);
                 if (error) {
                         /*
                          * Re-initializing the inode failed, and we are in deep
@@ -233,7 +384,7 @@ xfs_iget_cache_hit(
                  */
                 ip->i_flags &= ~XFS_IRECLAIM_RESET_FLAGS;
                 ip->i_flags |= XFS_INEW;
-               __xfs_inode_clear_reclaim_tag(mp, pag, ip);
+               xfs_inode_clear_reclaim_tag(pag, ip->i_ino);
                 inode->i_state = I_NEW;
  
                 ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock));
@@ -295,7 +446,7 @@ xfs_iget_cache_miss(
  
         trace_xfs_iget_miss(ip);
  
-       if ((ip->i_d.di_mode == 0) && !(flags & XFS_IGET_CREATE)) {
+       if ((VFS_I(ip)->i_mode == 0) && !(flags & XFS_IGET_CREATE)) {
                 error = -ENOENT;
                 goto out_destroy;
         }
@@ -444,7 +595,7 @@ again:
          * If we have a real type for an on-disk inode, we can setup the inode
          * now.  If it's a new inode being created, xfs_ialloc will handle it.
          */
-       if (xfs_iflags_test(ip, XFS_INEW) && ip->i_d.di_mode != 0)
+       if (xfs_iflags_test(ip, XFS_INEW) && VFS_I(ip)->i_mode != 0)
                 xfs_setup_existing_inode(ip);
         return 0;
  
@@ -691,121 +842,6 @@ xfs_inode_ag_iterator_tag(
         return last_error;
  }
  
-/*
- * Queue a new inode reclaim pass if there are reclaimable inodes and there
- * isn't a reclaim pass already in progress. By default it runs every 5s based
- * on the xfs periodic sync default of 30s. Perhaps this should have it's own
- * tunable, but that can be done if this method proves to be ineffective or too
- * aggressive.
- */
-static void
-xfs_reclaim_work_queue(
-       struct xfs_mount        *mp)
-{
-
-       rcu_read_lock();
-       if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) {
-               queue_delayed_work(mp->m_reclaim_workqueue, &mp->m_reclaim_work,
-                       msecs_to_jiffies(xfs_syncd_centisecs / 6 * 10));
-       }
-       rcu_read_unlock();
-}
-
-/*
- * This is a fast pass over the inode cache to try to get reclaim moving on as
- * many inodes as possible in a short period of time. It kicks itself every few
- * seconds, as well as being kicked by the inode cache shrinker when memory
- * goes low. It scans as quickly as possible avoiding locked inodes or those
- * already being flushed, and once done schedules a future pass.
- */
-void
-xfs_reclaim_worker(
-       struct work_struct *work)
-{
-       struct xfs_mount *mp = container_of(to_delayed_work(work),
-                                       struct xfs_mount, m_reclaim_work);
-
-       xfs_reclaim_inodes(mp, SYNC_TRYLOCK);
-       xfs_reclaim_work_queue(mp);
-}
-
-static void
-__xfs_inode_set_reclaim_tag(
-       struct xfs_perag        *pag,
-       struct xfs_inode        *ip)
-{
-       radix_tree_tag_set(&pag->pag_ici_root,
-                          XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino),
-                          XFS_ICI_RECLAIM_TAG);
-
-       if (!pag->pag_ici_reclaimable) {
-               /* propagate the reclaim tag up into the perag radix tree */
-               spin_lock(&ip->i_mount->m_perag_lock);
-               radix_tree_tag_set(&ip->i_mount->m_perag_tree,
-                               XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
-                               XFS_ICI_RECLAIM_TAG);
-               spin_unlock(&ip->i_mount->m_perag_lock);
-
-               /* schedule periodic background inode reclaim */
-               xfs_reclaim_work_queue(ip->i_mount);
-
-               trace_xfs_perag_set_reclaim(ip->i_mount, pag->pag_agno,
-                                                       -1, _RET_IP_);
-       }
-       pag->pag_ici_reclaimable++;
-}
-
-/*
- * We set the inode flag atomically with the radix tree tag.
- * Once we get tag lookups on the radix tree, this inode flag
- * can go away.
- */
-void
-xfs_inode_set_reclaim_tag(
-       xfs_inode_t     *ip)
-{
-       struct xfs_mount *mp = ip->i_mount;
-       struct xfs_perag *pag;
-
-       pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
-       spin_lock(&pag->pag_ici_lock);
-       spin_lock(&ip->i_flags_lock);
-       __xfs_inode_set_reclaim_tag(pag, ip);
-       __xfs_iflags_set(ip, XFS_IRECLAIMABLE);
-       spin_unlock(&ip->i_flags_lock);
-       spin_unlock(&pag->pag_ici_lock);
-       xfs_perag_put(pag);
-}
-
-STATIC void
-__xfs_inode_clear_reclaim(
-       xfs_perag_t     *pag,
-       xfs_inode_t     *ip)
-{
-       pag->pag_ici_reclaimable--;
-       if (!pag->pag_ici_reclaimable) {
-               /* clear the reclaim tag from the perag radix tree */
-               spin_lock(&ip->i_mount->m_perag_lock);
-               radix_tree_tag_clear(&ip->i_mount->m_perag_tree,
-                               XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
-                               XFS_ICI_RECLAIM_TAG);
-               spin_unlock(&ip->i_mount->m_perag_lock);
-               trace_xfs_perag_clear_reclaim(ip->i_mount, pag->pag_agno,
-                                                       -1, _RET_IP_);
-       }
-}
-
-STATIC void
-__xfs_inode_clear_reclaim_tag(
-       xfs_mount_t     *mp,
-       xfs_perag_t     *pag,
-       xfs_inode_t     *ip)
-{
-       radix_tree_tag_clear(&pag->pag_ici_root,
-                       XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG);
-       __xfs_inode_clear_reclaim(pag, ip);
-}
-
  /*
   * Grab the inode for reclaim exclusively.
   * Return 0 if we grabbed it, non-zero otherwise.
@@ -898,6 +934,7 @@ xfs_reclaim_inode(
         int                     sync_mode)
  {
         struct xfs_buf          *bp = NULL;
+       xfs_ino_t               ino = ip->i_ino; /* for radix_tree_delete */
         int                     error;
  
  restart:
@@ -962,6 +999,22 @@ restart:
  
         xfs_iflock(ip);
  reclaim:
+       /*
+        * Because we use RCU freeing we need to ensure the inode always appears
+        * to be reclaimed with an invalid inode number when in the free state.
+        * We do this as early as possible under the ILOCK and flush lock so
+        * that xfs_iflush_cluster() can be guaranteed to detect races with us
+        * here. By doing this, we guarantee that once xfs_iflush_cluster has
+        * locked both the XFS_ILOCK and the flush lock that it will see either
+        * a valid, flushable inode that will serialise correctly against the
+        * locks below, or it will see a clean (and invalid) inode that it can
+        * skip.
+        */
+       spin_lock(&ip->i_flags_lock);
+       ip->i_flags = XFS_IRECLAIM;
+       ip->i_ino = 0;
+       spin_unlock(&ip->i_flags_lock);
+
         xfs_ifunlock(ip);
         xfs_iunlock(ip, XFS_ILOCK_EXCL);
  
@@ -975,9 +1028,9 @@ reclaim:
          */
         spin_lock(&pag->pag_ici_lock);
         if (!radix_tree_delete(&pag->pag_ici_root,
-                               XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino)))
+                               XFS_INO_TO_AGINO(ip->i_mount, ino)))
                 ASSERT(0);
-       __xfs_inode_clear_reclaim(pag, ip);
+       xfs_perag_clear_reclaim_tag(pag);
         spin_unlock(&pag->pag_ici_lock);
  
         /*
@@ -992,7 +1045,7 @@ reclaim:
         xfs_qm_dqdetach(ip);
         xfs_iunlock(ip, XFS_ILOCK_EXCL);
  
-       xfs_inode_free(ip);
+       __xfs_inode_free(ip);
         return error;
  
  out_ifunlock: