fs/dcache.c: avoid soft-lockup in dput()
[cascardo/linux.git] / fs / dcache.c
index ad4a542..f650a4f 100644 (file)
@@ -226,10 +226,9 @@ static inline int dentry_string_cmp(const unsigned char *cs, const unsigned char
 
 static inline int dentry_cmp(const struct dentry *dentry, const unsigned char *ct, unsigned tcount)
 {
-       const unsigned char *cs;
        /*
         * Be careful about RCU walk racing with rename:
-        * use ACCESS_ONCE to fetch the name pointer.
+        * use 'lockless_dereference' to fetch the name pointer.
         *
         * NOTE! Even if a rename will mean that the length
         * was not loaded atomically, we don't care. The
@@ -243,8 +242,8 @@ static inline int dentry_cmp(const struct dentry *dentry, const unsigned char *c
         * early because the data cannot match (there can
         * be no NUL in the ct/tcount data)
         */
-       cs = ACCESS_ONCE(dentry->d_name.name);
-       smp_read_barrier_depends();
+       const unsigned char *cs = lockless_dereference(dentry->d_name.name);
+
        return dentry_string_cmp(cs, ct, tcount);
 }
 
@@ -335,44 +334,21 @@ static inline void dentry_rcuwalk_invalidate(struct dentry *dentry)
 
 /*
  * Release the dentry's inode, using the filesystem
- * d_iput() operation if defined. Dentry has no refcount
- * and is unhashed.
- */
-static void dentry_iput(struct dentry * dentry)
-       __releases(dentry->d_lock)
-       __releases(dentry->d_inode->i_lock)
-{
-       struct inode *inode = dentry->d_inode;
-       if (inode) {
-               __d_clear_type_and_inode(dentry);
-               hlist_del_init(&dentry->d_u.d_alias);
-               spin_unlock(&dentry->d_lock);
-               spin_unlock(&inode->i_lock);
-               if (!inode->i_nlink)
-                       fsnotify_inoderemove(inode);
-               if (dentry->d_op && dentry->d_op->d_iput)
-                       dentry->d_op->d_iput(dentry, inode);
-               else
-                       iput(inode);
-       } else {
-               spin_unlock(&dentry->d_lock);
-       }
-}
-
-/*
- * Release the dentry's inode, using the filesystem
- * d_iput() operation if defined. dentry remains in-use.
+ * d_iput() operation if defined.
  */
 static void dentry_unlink_inode(struct dentry * dentry)
        __releases(dentry->d_lock)
        __releases(dentry->d_inode->i_lock)
 {
        struct inode *inode = dentry->d_inode;
+       bool hashed = !d_unhashed(dentry);
 
-       raw_write_seqcount_begin(&dentry->d_seq);
+       if (hashed)
+               raw_write_seqcount_begin(&dentry->d_seq);
        __d_clear_type_and_inode(dentry);
        hlist_del_init(&dentry->d_u.d_alias);
-       raw_write_seqcount_end(&dentry->d_seq);
+       if (hashed)
+               raw_write_seqcount_end(&dentry->d_seq);
        spin_unlock(&dentry->d_lock);
        spin_unlock(&inode->i_lock);
        if (!inode->i_nlink)
@@ -507,6 +483,44 @@ void d_drop(struct dentry *dentry)
 }
 EXPORT_SYMBOL(d_drop);
 
+static inline void dentry_unlist(struct dentry *dentry, struct dentry *parent)
+{
+       struct dentry *next;
+       /*
+        * Inform d_walk() and shrink_dentry_list() that we are no longer
+        * attached to the dentry tree
+        */
+       dentry->d_flags |= DCACHE_DENTRY_KILLED;
+       if (unlikely(list_empty(&dentry->d_child)))
+               return;
+       __list_del_entry(&dentry->d_child);
+       /*
+        * Cursors can move around the list of children.  While we'd been
+        * a normal list member, it didn't matter - ->d_child.next would've
+        * been updated.  However, from now on it won't be and for the
+        * things like d_walk() it might end up with a nasty surprise.
+        * Normally d_walk() doesn't care about cursors moving around -
+        * ->d_lock on parent prevents that and since a cursor has no children
+        * of its own, we get through it without ever unlocking the parent.
+        * There is one exception, though - if we ascend from a child that
+        * gets killed as soon as we unlock it, the next sibling is found
+        * using the value left in its ->d_child.next.  And if _that_
+        * pointed to a cursor, and cursor got moved (e.g. by lseek())
+        * before d_walk() regains parent->d_lock, we'll end up skipping
+        * everything the cursor had been moved past.
+        *
+        * Solution: make sure that the pointer left behind in ->d_child.next
+        * points to something that won't be moving around.  I.e. skip the
+        * cursors.
+        */
+       while (dentry->d_child.next != &parent->d_subdirs) {
+               next = list_entry(dentry->d_child.next, struct dentry, d_child);
+               if (likely(!(next->d_flags & DCACHE_DENTRY_CURSOR)))
+                       break;
+               dentry->d_child.next = next->d_child.next;
+       }
+}
+
 static void __dentry_kill(struct dentry *dentry)
 {
        struct dentry *parent = NULL;
@@ -532,20 +546,13 @@ static void __dentry_kill(struct dentry *dentry)
        }
        /* if it was on the hash then remove it */
        __d_drop(dentry);
-       __list_del_entry(&dentry->d_child);
-       /*
-        * Inform d_walk() that we are no longer attached to the
-        * dentry tree
-        */
-       dentry->d_flags |= DCACHE_DENTRY_KILLED;
+       dentry_unlist(dentry, parent);
        if (parent)
                spin_unlock(&parent->d_lock);
-       dentry_iput(dentry);
-       /*
-        * dentry_iput drops the locks, at which point nobody (except
-        * transient RCU lookups) can reach this dentry.
-        */
-       BUG_ON(dentry->d_lockref.count > 0);
+       if (dentry->d_inode)
+               dentry_unlink_inode(dentry);
+       else
+               spin_unlock(&dentry->d_lock);
        this_cpu_dec(nr_dentry);
        if (dentry->d_op && dentry->d_op->d_release)
                dentry->d_op->d_release(dentry);
@@ -589,7 +596,6 @@ static struct dentry *dentry_kill(struct dentry *dentry)
 
 failed:
        spin_unlock(&dentry->d_lock);
-       cpu_relax();
        return dentry; /* try again with same dentry */
 }
 
@@ -763,6 +769,8 @@ void dput(struct dentry *dentry)
                return;
 
 repeat:
+       might_sleep();
+
        rcu_read_lock();
        if (likely(fast_dput(dentry))) {
                rcu_read_unlock();
@@ -796,8 +804,10 @@ repeat:
 
 kill_it:
        dentry = dentry_kill(dentry);
-       if (dentry)
+       if (dentry) {
+               cond_resched();
                goto repeat;
+       }
 }
 EXPORT_SYMBOL(dput);
 
@@ -1203,6 +1213,9 @@ resume:
                struct dentry *dentry = list_entry(tmp, struct dentry, d_child);
                next = tmp->next;
 
+               if (unlikely(dentry->d_flags & DCACHE_DENTRY_CURSOR))
+                       continue;
+
                spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
 
                ret = enter(data, dentry);
@@ -1559,6 +1572,7 @@ struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name)
 {
        struct dentry *dentry;
        char *dname;
+       int err;
 
        dentry = kmem_cache_alloc(dentry_cache, GFP_KERNEL);
        if (!dentry)
@@ -1617,6 +1631,16 @@ struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name)
        INIT_LIST_HEAD(&dentry->d_child);
        d_set_d_op(dentry, dentry->d_sb->s_d_op);
 
+       if (dentry->d_op && dentry->d_op->d_init) {
+               err = dentry->d_op->d_init(dentry);
+               if (err) {
+                       if (dname_external(dentry))
+                               kfree(external_name(dentry));
+                       kmem_cache_free(dentry_cache, dentry);
+                       return NULL;
+               }
+       }
+
        this_cpu_inc(nr_dentry);
 
        return dentry;
@@ -1636,7 +1660,7 @@ struct dentry *d_alloc(struct dentry * parent, const struct qstr *name)
        struct dentry *dentry = __d_alloc(parent->d_sb, name);
        if (!dentry)
                return NULL;
-
+       dentry->d_flags |= DCACHE_RCUACCESS;
        spin_lock(&parent->d_lock);
        /*
         * don't need child lock because it is not subject
@@ -1651,6 +1675,16 @@ struct dentry *d_alloc(struct dentry * parent, const struct qstr *name)
 }
 EXPORT_SYMBOL(d_alloc);
 
+struct dentry *d_alloc_cursor(struct dentry * parent)
+{
+       struct dentry *dentry = __d_alloc(parent->d_sb, NULL);
+       if (dentry) {
+               dentry->d_flags |= DCACHE_RCUACCESS | DCACHE_DENTRY_CURSOR;
+               dentry->d_parent = dget(parent);
+       }
+       return dentry;
+}
+
 /**
  * d_alloc_pseudo - allocate a dentry (for lookup-less filesystems)
  * @sb: the superblock
@@ -1683,7 +1717,6 @@ void d_set_d_op(struct dentry *dentry, const struct dentry_operations *op)
                                DCACHE_OP_REVALIDATE    |
                                DCACHE_OP_WEAK_REVALIDATE       |
                                DCACHE_OP_DELETE        |
-                               DCACHE_OP_SELECT_INODE  |
                                DCACHE_OP_REAL));
        dentry->d_op = op;
        if (!op)
@@ -1700,8 +1733,6 @@ void d_set_d_op(struct dentry *dentry, const struct dentry_operations *op)
                dentry->d_flags |= DCACHE_OP_DELETE;
        if (op->d_prune)
                dentry->d_flags |= DCACHE_OP_PRUNE;
-       if (op->d_select_inode)
-               dentry->d_flags |= DCACHE_OP_SELECT_INODE;
        if (op->d_real)
                dentry->d_flags |= DCACHE_OP_REAL;
 
@@ -1769,7 +1800,7 @@ static void __d_instantiate(struct dentry *dentry, struct inode *inode)
        raw_write_seqcount_begin(&dentry->d_seq);
        __d_set_inode_and_type(dentry, inode, add_flags);
        raw_write_seqcount_end(&dentry->d_seq);
-       __fsnotify_d_instantiate(dentry);
+       fsnotify_update_flags(dentry);
        spin_unlock(&dentry->d_lock);
 }
 
@@ -2021,42 +2052,19 @@ struct dentry *d_add_ci(struct dentry *dentry, struct inode *inode,
 }
 EXPORT_SYMBOL(d_add_ci);
 
-/*
- * Do the slow-case of the dentry name compare.
- *
- * Unlike the dentry_cmp() function, we need to atomically
- * load the name and length information, so that the
- * filesystem can rely on them, and can use the 'name' and
- * 'len' information without worrying about walking off the
- * end of memory etc.
- *
- * Thus the read_seqcount_retry() and the "duplicate" info
- * in arguments (the low-level filesystem should not look
- * at the dentry inode or name contents directly, since
- * rename can change them while we're in RCU mode).
- */
-enum slow_d_compare {
-       D_COMP_OK,
-       D_COMP_NOMATCH,
-       D_COMP_SEQRETRY,
-};
 
-static noinline enum slow_d_compare slow_dentry_cmp(
-               const struct dentry *parent,
-               struct dentry *dentry,
-               unsigned int seq,
-               const struct qstr *name)
+static inline bool d_same_name(const struct dentry *dentry,
+                               const struct dentry *parent,
+                               const struct qstr *name)
 {
-       int tlen = dentry->d_name.len;
-       const char *tname = dentry->d_name.name;
-
-       if (read_seqcount_retry(&dentry->d_seq, seq)) {
-               cpu_relax();
-               return D_COMP_SEQRETRY;
+       if (likely(!(parent->d_flags & DCACHE_OP_COMPARE))) {
+               if (dentry->d_name.len != name->len)
+                       return false;
+               return dentry_cmp(dentry, name->name, name->len) == 0;
        }
-       if (parent->d_op->d_compare(parent, dentry, tlen, tname, name))
-               return D_COMP_NOMATCH;
-       return D_COMP_OK;
+       return parent->d_op->d_compare(parent, dentry,
+                                      dentry->d_name.len, dentry->d_name.name,
+                                      name) == 0;
 }
 
 /**
@@ -2135,6 +2143,9 @@ seqretry:
                 * dentry compare, we will do seqretries until it is stable,
                 * and if we end up with a successful lookup, we actually
                 * want to exit RCU lookup anyway.
+                *
+                * Note that raw_seqcount_begin still *does* smp_rmb(), so
+                * we are still guaranteed NUL-termination of ->d_name.name.
                 */
                seq = raw_seqcount_begin(&dentry->d_seq);
                if (dentry->d_parent != parent)
@@ -2143,24 +2154,28 @@ seqretry:
                        continue;
 
                if (unlikely(parent->d_flags & DCACHE_OP_COMPARE)) {
+                       int tlen;
+                       const char *tname;
                        if (dentry->d_name.hash != hashlen_hash(hashlen))
                                continue;
-                       *seqp = seq;
-                       switch (slow_dentry_cmp(parent, dentry, seq, name)) {
-                       case D_COMP_OK:
-                               return dentry;
-                       case D_COMP_NOMATCH:
-                               continue;
-                       default:
+                       tlen = dentry->d_name.len;
+                       tname = dentry->d_name.name;
+                       /* we want a consistent (name,len) pair */
+                       if (read_seqcount_retry(&dentry->d_seq, seq)) {
+                               cpu_relax();
                                goto seqretry;
                        }
+                       if (parent->d_op->d_compare(parent, dentry,
+                                                   tlen, tname, name) != 0)
+                               continue;
+               } else {
+                       if (dentry->d_name.hash_len != hashlen)
+                               continue;
+                       if (dentry_cmp(dentry, str, hashlen_len(hashlen)) != 0)
+                               continue;
                }
-
-               if (dentry->d_name.hash_len != hashlen)
-                       continue;
                *seqp = seq;
-               if (!dentry_cmp(dentry, str, hashlen_len(hashlen)))
-                       return dentry;
+               return dentry;
        }
        return NULL;
 }
@@ -2208,9 +2223,7 @@ EXPORT_SYMBOL(d_lookup);
  */
 struct dentry *__d_lookup(const struct dentry *parent, const struct qstr *name)
 {
-       unsigned int len = name->len;
        unsigned int hash = name->hash;
-       const unsigned char *str = name->name;
        struct hlist_bl_head *b = d_hash(parent, hash);
        struct hlist_bl_node *node;
        struct dentry *found = NULL;
@@ -2249,21 +2262,8 @@ struct dentry *__d_lookup(const struct dentry *parent, const struct qstr *name)
                if (d_unhashed(dentry))
                        goto next;
 
-               /*
-                * It is safe to compare names since d_move() cannot
-                * change the qstr (protected by d_lock).
-                */
-               if (parent->d_flags & DCACHE_OP_COMPARE) {
-                       int tlen = dentry->d_name.len;
-                       const char *tname = dentry->d_name.name;
-                       if (parent->d_op->d_compare(parent, dentry, tlen, tname, name))
-                               goto next;
-               } else {
-                       if (dentry->d_name.len != len)
-                               goto next;
-                       if (dentry_cmp(dentry, str, len))
-                               goto next;
-               }
+               if (!d_same_name(dentry, parent, name))
+                       goto next;
 
                dentry->d_lockref.count++;
                found = dentry;
@@ -2358,7 +2358,6 @@ static void __d_rehash(struct dentry * entry, struct hlist_bl_head *b)
 {
        BUG_ON(!d_unhashed(entry));
        hlist_bl_lock(b);
-       entry->d_flags |= DCACHE_RCUACCESS;
        hlist_bl_add_head_rcu(&entry->d_hash, b);
        hlist_bl_unlock(b);
 }
@@ -2417,9 +2416,7 @@ struct dentry *d_alloc_parallel(struct dentry *parent,
                                const struct qstr *name,
                                wait_queue_head_t *wq)
 {
-       unsigned int len = name->len;
        unsigned int hash = name->hash;
-       const unsigned char *str = name->name;
        struct hlist_bl_head *b = in_lookup_hash(parent, hash);
        struct hlist_bl_node *node;
        struct dentry *new = d_alloc(parent, name);
@@ -2458,7 +2455,6 @@ retry:
                rcu_read_unlock();
                goto retry;
        }
-       rcu_read_unlock();
        /*
         * No changes for the parent since the beginning of d_lookup().
         * Since all removals from the chain happen with hlist_bl_lock(),
@@ -2471,22 +2467,20 @@ retry:
                        continue;
                if (dentry->d_parent != parent)
                        continue;
-               if (d_unhashed(dentry))
+               if (!d_same_name(dentry, parent, name))
                        continue;
-               if (parent->d_flags & DCACHE_OP_COMPARE) {
-                       int tlen = dentry->d_name.len;
-                       const char *tname = dentry->d_name.name;
-                       if (parent->d_op->d_compare(parent, dentry, tlen, tname, name))
-                               continue;
-               } else {
-                       if (dentry->d_name.len != len)
-                               continue;
-                       if (dentry_cmp(dentry, str, len))
-                               continue;
-               }
-               dget(dentry);
                hlist_bl_unlock(b);
-               /* somebody is doing lookup for it right now; wait for it */
+               /* now we can try to grab a reference */
+               if (!lockref_get_not_dead(&dentry->d_lockref)) {
+                       rcu_read_unlock();
+                       goto retry;
+               }
+
+               rcu_read_unlock();
+               /*
+                * somebody is likely to be still doing lookup for it;
+                * wait for them to finish
+                */
                spin_lock(&dentry->d_lock);
                d_wait_lookup(dentry);
                /*
@@ -2501,22 +2495,14 @@ retry:
                        goto mismatch;
                if (unlikely(d_unhashed(dentry)))
                        goto mismatch;
-               if (parent->d_flags & DCACHE_OP_COMPARE) {
-                       int tlen = dentry->d_name.len;
-                       const char *tname = dentry->d_name.name;
-                       if (parent->d_op->d_compare(parent, dentry, tlen, tname, name))
-                               goto mismatch;
-               } else {
-                       if (unlikely(dentry->d_name.len != len))
-                               goto mismatch;
-                       if (unlikely(dentry_cmp(dentry, str, len)))
-                               goto mismatch;
-               }
+               if (unlikely(!d_same_name(dentry, parent, name)))
+                       goto mismatch;
                /* OK, it *is* a hashed match; return it */
                spin_unlock(&dentry->d_lock);
                dput(new);
                return dentry;
        }
+       rcu_read_unlock();
        /* we can't take ->d_lock here; it's OK, though. */
        new->d_flags |= DCACHE_PAR_LOOKUP;
        new->d_wait = wq;
@@ -2563,7 +2549,7 @@ static inline void __d_add(struct dentry *dentry, struct inode *inode)
                raw_write_seqcount_begin(&dentry->d_seq);
                __d_set_inode_and_type(dentry, inode, add_flags);
                raw_write_seqcount_end(&dentry->d_seq);
-               __fsnotify_d_instantiate(dentry);
+               fsnotify_update_flags(dentry);
        }
        _d_rehash(dentry);
        if (dir)
@@ -2606,8 +2592,6 @@ EXPORT_SYMBOL(d_add);
 struct dentry *d_exact_alias(struct dentry *entry, struct inode *inode)
 {
        struct dentry *alias;
-       int len = entry->d_name.len;
-       const char *name = entry->d_name.name;
        unsigned int hash = entry->d_name.hash;
 
        spin_lock(&inode->i_lock);
@@ -2621,9 +2605,7 @@ struct dentry *d_exact_alias(struct dentry *entry, struct inode *inode)
                        continue;
                if (alias->d_parent != entry->d_parent)
                        continue;
-               if (alias->d_name.len != len)
-                       continue;
-               if (dentry_cmp(alias, name, len))
+               if (!d_same_name(alias, entry->d_parent, &entry->d_name))
                        continue;
                spin_lock(&alias->d_lock);
                if (!d_unhashed(alias)) {
@@ -2843,6 +2825,7 @@ static void __d_move(struct dentry *dentry, struct dentry *target,
        /* ... and switch them in the tree */
        if (IS_ROOT(dentry)) {
                /* splicing a tree */
+               dentry->d_flags |= DCACHE_RCUACCESS;
                dentry->d_parent = target->d_parent;
                target->d_parent = target;
                list_del_init(&target->d_child);
@@ -2853,8 +2836,8 @@ static void __d_move(struct dentry *dentry, struct dentry *target,
                list_move(&target->d_child, &target->d_parent->d_subdirs);
                list_move(&dentry->d_child, &dentry->d_parent->d_subdirs);
                if (exchange)
-                       fsnotify_d_move(target);
-               fsnotify_d_move(dentry);
+                       fsnotify_update_flags(target);
+               fsnotify_update_flags(dentry);
        }
 
        write_seqcount_end(&target->d_seq);