mm: keep page cache radix tree nodes in check
diff --git a/mm/truncate.c b/mm/truncate.c
index 353b683..e5cc39a 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
 #include <linux/cleancache.h>
 #include "internal.h"
 
+static void clear_exceptional_entry(struct address_space *mapping,
+                                   pgoff_t index, void *entry)
+{
+       struct radix_tree_node *node;
+       void **slot;
+
+       /* Handled by shmem itself */
+       if (shmem_mapping(mapping))
+               return;
+
+       spin_lock_irq(&mapping->tree_lock);
+       /*
+        * Regular page slots are stabilized by the page lock even
+        * without the tree itself locked.  These unlocked entries
+        * need verification under the tree lock.
+        */
+       if (!__radix_tree_lookup(&mapping->page_tree, index, &node, &slot))
+               goto unlock;
+       if (*slot != entry)
+               goto unlock;
+       radix_tree_replace_slot(slot, NULL);
+       mapping->nrshadows--;
+       if (!node)
+               goto unlock;
+       workingset_node_shadows_dec(node);
+       /*
+        * Don't track node without shadow entries.
+        *
+        * Avoid acquiring the list_lru lock if already untracked.
+        * The list_empty() test is safe as node->private_list is
+        * protected by mapping->tree_lock.
+        */
+       if (!workingset_node_shadows(node) &&
+           !list_empty(&node->private_list))
+               list_lru_del(&workingset_shadow_nodes, &node->private_list);
+       __radix_tree_delete_node(&mapping->page_tree, node);
+unlock:
+       spin_unlock_irq(&mapping->tree_lock);
+}
+
 /**
  * do_invalidatepage - invalidate part or all of a page
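
For context: radix tree slots normally hold page pointers, which are at least
word-aligned, so a low bit is free to tag non-page (shadow) entries. That tag
is what radix_tree_exceptional_entry() tests. A minimal sketch of the
encoding, along the lines of include/linux/radix-tree.h in this era:

	/* bit 1 marks a slot value that is not a page pointer */
	#define RADIX_TREE_EXCEPTIONAL_ENTRY	2

	static inline int radix_tree_exceptional_entry(void *arg)
	{
		/* non-zero iff the slot holds a shadow entry, not a page */
		return (unsigned long)arg & RADIX_TREE_EXCEPTIONAL_ENTRY;
	}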
@@ -208,11 +247,12 @@ void truncate_inode_pages_range(struct address_space *mapping,
        unsigned int    partial_start;  /* inclusive */
        unsigned int    partial_end;    /* exclusive */
        struct pagevec  pvec;
+       pgoff_t         indices[PAGEVEC_SIZE];
        pgoff_t         index;
        int             i;
 
        cleancache_invalidate_inode(mapping);
-       if (mapping->nrpages == 0)
+       if (mapping->nrpages == 0 && mapping->nrshadows == 0)
                return;
 
        /* Offsets within partial pages */
@@ -238,17 +278,23 @@ void truncate_inode_pages_range(struct address_space *mapping,
 
        pagevec_init(&pvec, 0);
        index = start;
-       while (index < end && pagevec_lookup(&pvec, mapping, index,
-                       min(end - index, (pgoff_t)PAGEVEC_SIZE))) {
+       while (index < end && pagevec_lookup_entries(&pvec, mapping, index,
+                       min(end - index, (pgoff_t)PAGEVEC_SIZE),
+                       indices)) {
                mem_cgroup_uncharge_start();
                for (i = 0; i < pagevec_count(&pvec); i++) {
                        struct page *page = pvec.pages[i];
 
                        /* We rely upon deletion not changing page->index */
-                       index = page->index;
+                       index = indices[i];
                        if (index >= end)
                                break;
 
+                       if (radix_tree_exceptional_entry(page)) {
+                               clear_exceptional_entry(mapping, index, page);
+                               continue;
+                       }
+
                        if (!trylock_page(page))
                                continue;
                        WARN_ON(page->index != index);
@@ -259,6 +305,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
                        truncate_inode_page(mapping, page);
                        unlock_page(page);
                }
+               pagevec_remove_exceptionals(&pvec);
                pagevec_release(&pvec);
                mem_cgroup_uncharge_end();
                cond_resched();
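
The pagevec_remove_exceptionals() call above is needed because
pagevec_release() drops a reference on every element, and shadow entries are
not refcounted pages. A sketch of the helper (the real one lives in
mm/swap.c): it compacts the pagevec in place so that only real pages remain:

	void pagevec_remove_exceptionals(struct pagevec *pvec)
	{
		int i, j;

		for (i = 0, j = 0; i < pagevec_count(pvec); i++) {
			struct page *page = pvec->pages[i];

			/* keep real pages, drop shadow entries */
			if (!radix_tree_exceptional_entry(page))
				pvec->pages[j++] = page;
		}
		pvec->nr = j;
	}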
@@ -307,14 +354,16 @@ void truncate_inode_pages_range(struct address_space *mapping,
        index = start;
        for ( ; ; ) {
                cond_resched();
-               if (!pagevec_lookup(&pvec, mapping, index,
-                       min(end - index, (pgoff_t)PAGEVEC_SIZE))) {
+               if (!pagevec_lookup_entries(&pvec, mapping, index,
+                       min(end - index, (pgoff_t)PAGEVEC_SIZE),
+                       indices)) {
                        if (index == start)
                                break;
                        index = start;
                        continue;
                }
-               if (index == start && pvec.pages[0]->index >= end) {
+               if (index == start && indices[0] >= end) {
+                       pagevec_remove_exceptionals(&pvec);
                        pagevec_release(&pvec);
                        break;
                }
@@ -323,16 +372,22 @@ void truncate_inode_pages_range(struct address_space *mapping,
                        struct page *page = pvec.pages[i];
 
                        /* We rely upon deletion not changing page->index */
-                       index = page->index;
+                       index = indices[i];
                        if (index >= end)
                                break;
 
+                       if (radix_tree_exceptional_entry(page)) {
+                               clear_exceptional_entry(mapping, index, page);
+                               continue;
+                       }
+
                        lock_page(page);
                        WARN_ON(page->index != index);
                        wait_on_page_writeback(page);
                        truncate_inode_page(mapping, page);
                        unlock_page(page);
                }
+               pagevec_remove_exceptionals(&pvec);
                pagevec_release(&pvec);
                mem_cgroup_uncharge_end();
                index++;
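
For reference, pagevec_lookup_entries() differs from plain pagevec_lookup()
in that it also returns exceptional entries and reports each result's tree
offset through the indices array; shadow entries carry no page->index of
their own, which is why every loop here reads indices[i] instead. Roughly:

	unsigned pagevec_lookup_entries(struct pagevec *pvec,
					struct address_space *mapping,
					pgoff_t start, unsigned nr_pages,
					pgoff_t *indices)
	{
		/* find_get_entries() fills pages and offsets in lockstep */
		pvec->nr = find_get_entries(mapping, start, nr_pages,
					    pvec->pages, indices);
		return pagevec_count(pvec);
	}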
@@ -359,6 +414,53 @@ void truncate_inode_pages(struct address_space *mapping, loff_t lstart)
 }
 EXPORT_SYMBOL(truncate_inode_pages);
 
+/**
+ * truncate_inode_pages_final - truncate *all* pages before inode dies
+ * @mapping: mapping to truncate
+ *
+ * Called under (and serialized by) inode->i_mutex.
+ *
+ * Filesystems have to use this in the .evict_inode path to inform the
+ * VM that this is the final truncate and the inode is going away.
+ */
+void truncate_inode_pages_final(struct address_space *mapping)
+{
+       unsigned long nrshadows;
+       unsigned long nrpages;
+
+       /*
+        * Page reclaim can not participate in regular inode lifetime
+        * management (can't call iput()) and thus can race with the
+        * inode teardown.  Tell it when the address space is exiting,
+        * so that it does not install eviction information after the
+        * final truncate has begun.
+        */
+       mapping_set_exiting(mapping);
+
+       /*
+        * When reclaim installs eviction entries, it increases
+        * nrshadows first, then decreases nrpages.  Make sure we see
+        * this in the right order or we might miss an entry.
+        */
+       nrpages = mapping->nrpages;
+       smp_rmb();
+       nrshadows = mapping->nrshadows;
+
+       if (nrpages || nrshadows) {
+               /*
+                * As truncation uses a lockless tree lookup, cycle
+                * the tree lock to make sure any ongoing tree
+                * modification that does not see AS_EXITING is
+                * completed before starting the final truncate.
+                */
+               spin_lock_irq(&mapping->tree_lock);
+               spin_unlock_irq(&mapping->tree_lock);
+
+               truncate_inode_pages(mapping, 0);
+       }
+}
+EXPORT_SYMBOL(truncate_inode_pages_final);
+
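
To illustrate the intended call site (myfs is hypothetical), a filesystem
converts its .evict_inode from truncate_inode_pages(&inode->i_data, 0) to the
new helper so that AS_EXITING is set before the mapping is torn down:

	static void myfs_evict_inode(struct inode *inode)
	{
		/* final truncate: flags the mapping as exiting, then
		 * drops the remaining pages and shadow entries */
		truncate_inode_pages_final(&inode->i_data);
		clear_inode(inode);
		/* filesystem-specific metadata teardown would follow */
	}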
 /**
  * invalidate_mapping_pages - Invalidate all the unlocked pages of one inode
  * @mapping: the address_space which holds the pages to invalidate
@@ -375,6 +477,7 @@ EXPORT_SYMBOL(truncate_inode_pages);
 unsigned long invalidate_mapping_pages(struct address_space *mapping,
                pgoff_t start, pgoff_t end)
 {
+       pgoff_t indices[PAGEVEC_SIZE];
        struct pagevec pvec;
        pgoff_t index = start;
        unsigned long ret;
@@ -390,17 +493,23 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping,
         */
 
        pagevec_init(&pvec, 0);
-       while (index <= end && pagevec_lookup(&pvec, mapping, index,
-                       min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) {
+       while (index <= end && pagevec_lookup_entries(&pvec, mapping, index,
+                       min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1,
+                       indices)) {
                mem_cgroup_uncharge_start();
                for (i = 0; i < pagevec_count(&pvec); i++) {
                        struct page *page = pvec.pages[i];
 
                        /* We rely upon deletion not changing page->index */
-                       index = page->index;
+                       index = indices[i];
                        if (index > end)
                                break;
 
+                       if (radix_tree_exceptional_entry(page)) {
+                               clear_exceptional_entry(mapping, index, page);
+                               continue;
+                       }
+
                        if (!trylock_page(page))
                                continue;
                        WARN_ON(page->index != index);
@@ -414,6 +523,7 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping,
                                deactivate_page(page);
                        count += ret;
                }
+               pagevec_remove_exceptionals(&pvec);
                pagevec_release(&pvec);
                mem_cgroup_uncharge_end();
                cond_resched();
@@ -444,7 +554,7 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page)
                goto failed;
 
        BUG_ON(page_has_private(page));
-       __delete_from_page_cache(page);
+       __delete_from_page_cache(page, NULL);
        spin_unlock_irq(&mapping->tree_lock);
        mem_cgroup_uncharge_cache_page(page);
 
@@ -481,6 +591,7 @@ static int do_launder_page(struct address_space *mapping, struct page *page)
 int invalidate_inode_pages2_range(struct address_space *mapping,
                                  pgoff_t start, pgoff_t end)
 {
+       pgoff_t indices[PAGEVEC_SIZE];
        struct pagevec pvec;
        pgoff_t index;
        int i;
@@ -491,17 +602,23 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
        cleancache_invalidate_inode(mapping);
        pagevec_init(&pvec, 0);
        index = start;
-       while (index <= end && pagevec_lookup(&pvec, mapping, index,
-                       min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) {
+       while (index <= end && pagevec_lookup_entries(&pvec, mapping, index,
+                       min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1,
+                       indices)) {
                mem_cgroup_uncharge_start();
                for (i = 0; i < pagevec_count(&pvec); i++) {
                        struct page *page = pvec.pages[i];
 
                        /* We rely upon deletion not changing page->index */
-                       index = page->index;
+                       index = indices[i];
                        if (index > end)
                                break;
 
+                       if (radix_tree_exceptional_entry(page)) {
+                               clear_exceptional_entry(mapping, index, page);
+                               continue;
+                       }
+
                        lock_page(page);
                        WARN_ON(page->index != index);
                        if (page->mapping != mapping) {
@@ -539,6 +656,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
                                ret = ret2;
                        unlock_page(page);
                }
+               pagevec_remove_exceptionals(&pvec);
                pagevec_release(&pvec);
                mem_cgroup_uncharge_end();
                cond_resched();