Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 9934b0c..6570982 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -72,12 +72,24 @@ EXPORT_SYMBOL_GPL(nf_conntrack_expect_lock);
 struct hlist_nulls_head *nf_conntrack_hash __read_mostly;
 EXPORT_SYMBOL_GPL(nf_conntrack_hash);
 
+struct conntrack_gc_work {
+       struct delayed_work     dwork;
+       u32                     last_bucket;
+       bool                    exiting;
+};
+
 static __read_mostly struct kmem_cache *nf_conntrack_cachep;
 static __read_mostly spinlock_t nf_conntrack_locks_all_lock;
-static __read_mostly seqcount_t nf_conntrack_generation;
 static __read_mostly DEFINE_SPINLOCK(nf_conntrack_locks_all_lock);
 static __read_mostly bool nf_conntrack_locks_all;
 
+#define GC_MAX_BUCKETS_DIV     64u
+#define GC_MAX_BUCKETS         8192u
+#define GC_INTERVAL            (5 * HZ)
+#define GC_MAX_EVICTS          256u
+
+static struct conntrack_gc_work conntrack_gc_work;
+
 void nf_conntrack_lock(spinlock_t *lock) __acquires(lock)
 {
        spin_lock(lock);
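
The expiry tests used throughout the rest of this patch (nf_ct_is_expired(), nf_ct_should_gc(), nfct_time_stamp) live in the companion header change, not in this file. A minimal sketch of what they are assumed to look like, with ct->timeout now a plain u32 jiffies stamp instead of a struct timer_list:

/* Assumed companion helpers (include/net/netfilter/nf_conntrack.h);
 * sketch only, not part of this hunk.
 */
#define nfct_time_stamp ((u32)(jiffies))

/* wrap-safe: true once ct->timeout lies in the past */
static inline bool nf_ct_is_expired(const struct nf_conn *ct)
{
	return (__s32)(ct->timeout - nfct_time_stamp) <= 0;
}

/* only worth killing if the entry is hashed and not already dying */
static inline bool nf_ct_should_gc(const struct nf_conn *ct)
{
	return nf_ct_is_expired(ct) && nf_ct_is_confirmed(ct) &&
	       !nf_ct_is_dying(ct);
}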
@@ -164,7 +176,7 @@ unsigned int nf_conntrack_htable_size __read_mostly;
 EXPORT_SYMBOL_GPL(nf_conntrack_htable_size);
 
 unsigned int nf_conntrack_max __read_mostly;
-EXPORT_SYMBOL_GPL(nf_conntrack_max);
+seqcount_t nf_conntrack_generation __read_mostly;
 
 DEFINE_PER_CPU(struct nf_conn, nf_conntrack_untracked);
 EXPORT_PER_CPU_SYMBOL(nf_conntrack_untracked);
@@ -372,7 +384,6 @@ destroy_conntrack(struct nf_conntrack *nfct)
 
        pr_debug("destroy_conntrack(%p)\n", ct);
        NF_CT_ASSERT(atomic_read(&nfct->use) == 0);
-       NF_CT_ASSERT(!timer_pending(&ct->timeout));
 
        if (unlikely(nf_ct_is_template(ct))) {
                nf_ct_tmpl_free(ct);
@@ -435,35 +446,30 @@ bool nf_ct_delete(struct nf_conn *ct, u32 portid, int report)
 {
        struct nf_conn_tstamp *tstamp;
 
+       if (test_and_set_bit(IPS_DYING_BIT, &ct->status))
+               return false;
+
        tstamp = nf_conn_tstamp_find(ct);
        if (tstamp && tstamp->stop == 0)
                tstamp->stop = ktime_get_real_ns();
 
-       if (nf_ct_is_dying(ct))
-               goto delete;
-
        if (nf_conntrack_event_report(IPCT_DESTROY, ct,
                                    portid, report) < 0) {
-               /* destroy event was not delivered */
+               /* destroy event was not delivered. nf_ct_put will
+                * be done by event cache worker on redelivery.
+                */
                nf_ct_delete_from_lists(ct);
                nf_conntrack_ecache_delayed_work(nf_ct_net(ct));
                return false;
        }
 
        nf_conntrack_ecache_work(nf_ct_net(ct));
-       set_bit(IPS_DYING_BIT, &ct->status);
- delete:
        nf_ct_delete_from_lists(ct);
        nf_ct_put(ct);
        return true;
 }
 EXPORT_SYMBOL_GPL(nf_ct_delete);
 
-static void death_by_timeout(unsigned long ul_conntrack)
-{
-       nf_ct_delete((struct nf_conn *)ul_conntrack, 0, 0);
-}
-
 static inline bool
 nf_ct_key_equal(struct nf_conntrack_tuple_hash *h,
                const struct nf_conntrack_tuple *tuple,
@@ -481,22 +487,17 @@ nf_ct_key_equal(struct nf_conntrack_tuple_hash *h,
               net_eq(net, nf_ct_net(ct));
 }
 
-/* must be called with rcu read lock held */
-void nf_conntrack_get_ht(struct hlist_nulls_head **hash, unsigned int *hsize)
+/* caller must hold rcu readlock and none of the nf_conntrack_locks */
+static void nf_ct_gc_expired(struct nf_conn *ct)
 {
-       struct hlist_nulls_head *hptr;
-       unsigned int sequence, hsz;
+       if (!atomic_inc_not_zero(&ct->ct_general.use))
+               return;
 
-       do {
-               sequence = read_seqcount_begin(&nf_conntrack_generation);
-               hsz = nf_conntrack_htable_size;
-               hptr = nf_conntrack_hash;
-       } while (read_seqcount_retry(&nf_conntrack_generation, sequence));
+       if (nf_ct_should_gc(ct))
+               nf_ct_kill(ct);
 
-       *hash = hptr;
-       *hsize = hsz;
+       nf_ct_put(ct);
 }
-EXPORT_SYMBOL_GPL(nf_conntrack_get_ht);
 
 /*
  * Warning :
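
The out-of-line nf_conntrack_get_ht() and its EXPORT_SYMBOL_GPL are dropped here, and nf_conntrack_generation loses its static qualifier in the hunk above, so the helper is presumably re-added as a static inline in the shared header. Reconstructed from the removed lines, it would read roughly:

/* Presumed header-side replacement for the function removed above;
 * caller must hold rcu_read_lock().
 */
static inline void nf_conntrack_get_ht(struct hlist_nulls_head **hash,
				       unsigned int *hsize)
{
	struct hlist_nulls_head *hptr;
	unsigned int sequence, hsz;

	do {
		sequence = read_seqcount_begin(&nf_conntrack_generation);
		hsz = nf_conntrack_htable_size;
		hptr = nf_conntrack_hash;
	} while (read_seqcount_retry(&nf_conntrack_generation, sequence));

	*hash = hptr;
	*hsize = hsz;
}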
@@ -510,16 +511,24 @@ ____nf_conntrack_find(struct net *net, const struct nf_conntrack_zone *zone,
        struct nf_conntrack_tuple_hash *h;
        struct hlist_nulls_head *ct_hash;
        struct hlist_nulls_node *n;
-       unsigned int bucket, sequence;
+       unsigned int bucket, hsize;
 
 begin:
-       do {
-               sequence = read_seqcount_begin(&nf_conntrack_generation);
-               bucket = scale_hash(hash);
-               ct_hash = nf_conntrack_hash;
-       } while (read_seqcount_retry(&nf_conntrack_generation, sequence));
+       nf_conntrack_get_ht(&ct_hash, &hsize);
+       bucket = reciprocal_scale(hash, hsize);
 
        hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[bucket], hnnode) {
+               struct nf_conn *ct;
+
+               ct = nf_ct_tuplehash_to_ctrack(h);
+               if (nf_ct_is_expired(ct)) {
+                       nf_ct_gc_expired(ct);
+                       continue;
+               }
+
+               if (nf_ct_is_dying(ct))
+                       continue;
+
                if (nf_ct_key_equal(h, tuple, zone, net)) {
                        NF_CT_STAT_INC_ATOMIC(net, found);
                        return h;
@@ -618,7 +627,6 @@ nf_conntrack_hash_check_insert(struct nf_conn *ct)
                                    zone, net))
                        goto out;
 
-       add_timer(&ct->timeout);
        smp_wmb();
        /* The caller holds a reference to this object */
        atomic_set(&ct->ct_general.use, 2);
@@ -771,8 +779,7 @@ __nf_conntrack_confirm(struct sk_buff *skb)
        /* Timer relative to confirmation time, not original
           setting time, otherwise we'd get timer wrap in
           weird delay cases. */
-       ct->timeout.expires += jiffies;
-       add_timer(&ct->timeout);
+       ct->timeout += nfct_time_stamp;
        atomic_inc(&ct->ct_general.use);
        ct->status |= IPS_CONFIRMED;
 
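Without a per-ct timer, ct->timeout simply holds a relative duration until the entry is confirmed and an absolute jiffies stamp afterwards. A small illustration of that lifecycle (sample numbers, not patch code):

/* protocol tracker sets a relative timeout on the unconfirmed entry */
ct->timeout = 30 * HZ;

/* __nf_conntrack_confirm(): rebase it to an absolute jiffies stamp */
ct->timeout += nfct_time_stamp;

/* from now on, expiry is the wrap-safe comparison in nf_ct_is_expired() */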
@@ -823,29 +830,41 @@ nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple,
        const struct nf_conntrack_zone *zone;
        struct nf_conntrack_tuple_hash *h;
        struct hlist_nulls_head *ct_hash;
-       unsigned int hash, sequence;
+       unsigned int hash, hsize;
        struct hlist_nulls_node *n;
        struct nf_conn *ct;
 
        zone = nf_ct_zone(ignored_conntrack);
 
        rcu_read_lock();
-       do {
-               sequence = read_seqcount_begin(&nf_conntrack_generation);
-               hash = hash_conntrack(net, tuple);
-               ct_hash = nf_conntrack_hash;
-       } while (read_seqcount_retry(&nf_conntrack_generation, sequence));
+ begin:
+       nf_conntrack_get_ht(&ct_hash, &hsize);
+       hash = __hash_conntrack(net, tuple, hsize);
 
        hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[hash], hnnode) {
                ct = nf_ct_tuplehash_to_ctrack(h);
-               if (ct != ignored_conntrack &&
-                   nf_ct_key_equal(h, tuple, zone, net)) {
+
+               if (ct == ignored_conntrack)
+                       continue;
+
+               if (nf_ct_is_expired(ct)) {
+                       nf_ct_gc_expired(ct);
+                       continue;
+               }
+
+               if (nf_ct_key_equal(h, tuple, zone, net)) {
                        NF_CT_STAT_INC_ATOMIC(net, found);
                        rcu_read_unlock();
                        return 1;
                }
                NF_CT_STAT_INC_ATOMIC(net, searched);
        }
+
+       if (get_nulls_value(n) != hash) {
+               NF_CT_STAT_INC_ATOMIC(net, search_restart);
+               goto begin;
+       }
+
        rcu_read_unlock();
 
        return 0;
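
The added get_nulls_value() test is the standard guard for lockless lookups on SLAB_DESTROY_BY_RCU hash chains: an entry can be freed and reused on another chain mid-walk, so if the traversal does not end on the nulls marker of the bucket it started from, it must be restarted. A stripped-down sketch of the idiom (placeholder obj_table/obj_match names, not conntrack code):

static struct obj *obj_lookup(u32 bucket, const void *key)
{
	struct hlist_nulls_node *n;
	struct obj *o;

begin:
	hlist_nulls_for_each_entry_rcu(o, n, &obj_table[bucket], node) {
		if (obj_match(o, key))
			return o;	/* caller still needs a reference */
	}
	/* walked off onto another chain? restart from our bucket */
	if (get_nulls_value(n) != bucket)
		goto begin;

	return NULL;
}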
@@ -867,6 +886,11 @@ static unsigned int early_drop_list(struct net *net,
        hlist_nulls_for_each_entry_rcu(h, n, head, hnnode) {
                tmp = nf_ct_tuplehash_to_ctrack(h);
 
+               if (nf_ct_is_expired(tmp)) {
+                       nf_ct_gc_expired(tmp);
+                       continue;
+               }
+
                if (test_bit(IPS_ASSURED_BIT, &tmp->status) ||
                    !net_eq(nf_ct_net(tmp), net) ||
                    nf_ct_is_dying(tmp))
@@ -884,7 +908,6 @@ static unsigned int early_drop_list(struct net *net,
                 */
                if (net_eq(nf_ct_net(tmp), net) &&
                    nf_ct_is_confirmed(tmp) &&
-                   del_timer(&tmp->timeout) &&
                    nf_ct_delete(tmp, 0, 0))
                        drops++;
 
@@ -900,14 +923,11 @@ static noinline int early_drop(struct net *net, unsigned int _hash)
 
        for (i = 0; i < NF_CT_EVICTION_RANGE; i++) {
                struct hlist_nulls_head *ct_hash;
-               unsigned hash, sequence, drops;
+               unsigned int hash, hsize, drops;
 
                rcu_read_lock();
-               do {
-                       sequence = read_seqcount_begin(&nf_conntrack_generation);
-                       hash = scale_hash(_hash++);
-                       ct_hash = nf_conntrack_hash;
-               } while (read_seqcount_retry(&nf_conntrack_generation, sequence));
+               nf_conntrack_get_ht(&ct_hash, &hsize);
+               hash = reciprocal_scale(_hash++, hsize);
 
                drops = early_drop_list(net, &ct_hash[hash]);
                rcu_read_unlock();
@@ -921,6 +941,69 @@ static noinline int early_drop(struct net *net, unsigned int _hash)
        return false;
 }
 
+static void gc_worker(struct work_struct *work)
+{
+       unsigned int i, goal, buckets = 0, expired_count = 0;
+       unsigned long next_run = GC_INTERVAL;
+       unsigned int ratio, scanned = 0;
+       struct conntrack_gc_work *gc_work;
+
+       gc_work = container_of(work, struct conntrack_gc_work, dwork.work);
+
+       goal = min(nf_conntrack_htable_size / GC_MAX_BUCKETS_DIV, GC_MAX_BUCKETS);
+       i = gc_work->last_bucket;
+
+       do {
+               struct nf_conntrack_tuple_hash *h;
+               struct hlist_nulls_head *ct_hash;
+               struct hlist_nulls_node *n;
+               unsigned int hashsz;
+               struct nf_conn *tmp;
+
+               i++;
+               rcu_read_lock();
+
+               nf_conntrack_get_ht(&ct_hash, &hashsz);
+               if (i >= hashsz)
+                       i = 0;
+
+               hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[i], hnnode) {
+                       tmp = nf_ct_tuplehash_to_ctrack(h);
+
+                       scanned++;
+                       if (nf_ct_is_expired(tmp)) {
+                               nf_ct_gc_expired(tmp);
+                               expired_count++;
+                               continue;
+                       }
+               }
+
+               /* could check get_nulls_value() here and restart if ct
+                * was moved to another chain.  But given gc is best-effort
+                * we will just continue with next hash slot.
+                */
+               rcu_read_unlock();
+               cond_resched_rcu_qs();
+       } while (++buckets < goal &&
+                expired_count < GC_MAX_EVICTS);
+
+       if (gc_work->exiting)
+               return;
+
+       ratio = scanned ? expired_count * 100 / scanned : 0;
+       if (ratio >= 90)
+               next_run = 0;
+
+       gc_work->last_bucket = i;
+       schedule_delayed_work(&gc_work->dwork, next_run);
+}
+
+static void conntrack_gc_work_init(struct conntrack_gc_work *gc_work)
+{
+       INIT_DELAYED_WORK(&gc_work->dwork, gc_worker);
+       gc_work->exiting = false;
+}
+
 static struct nf_conn *
 __nf_conntrack_alloc(struct net *net,
                     const struct nf_conntrack_zone *zone,
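
The worker is deliberately bounded: each run visits at most min(nf_conntrack_htable_size / GC_MAX_BUCKETS_DIV, GC_MAX_BUCKETS) buckets (1024 for a common 65536-bucket table) and bails out after GC_MAX_EVICTS expired entries, then picks its next run time from the fraction of scanned entries that turned out to be expired. A toy model of that rescheduling decision (illustration only, same constants as above):

/* not part of the patch: isolates the next_run choice from gc_worker() */
static unsigned long gc_next_run(unsigned int scanned, unsigned int expired)
{
	unsigned int ratio = scanned ? expired * 100 / scanned : 0;

	/* mostly-expired table: rerun immediately, otherwise wait 5 s */
	return ratio >= 90 ? 0 : GC_INTERVAL;
}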
@@ -957,8 +1040,6 @@ __nf_conntrack_alloc(struct net *net,
        /* save hash for reusing when confirming */
        *(unsigned long *)(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev) = hash;
        ct->status = 0;
-       /* Don't set timer yet: wait for confirmation */
-       setup_timer(&ct->timeout, death_by_timeout, (unsigned long)ct);
        write_pnet(&ct->ct_net, net);
        memset(&ct->__nfct_init_offset[0], 0,
               offsetof(struct nf_conn, proto) -
@@ -1332,7 +1413,6 @@ void __nf_ct_refresh_acct(struct nf_conn *ct,
                          unsigned long extra_jiffies,
                          int do_acct)
 {
-       NF_CT_ASSERT(ct->timeout.data == (unsigned long)ct);
        NF_CT_ASSERT(skb);
 
        /* Only update if this is not a fixed timeout */
@@ -1340,39 +1420,25 @@ void __nf_ct_refresh_acct(struct nf_conn *ct,
                goto acct;
 
        /* If not in hash table, timer will not be active yet */
-       if (!nf_ct_is_confirmed(ct)) {
-               ct->timeout.expires = extra_jiffies;
-       } else {
-               unsigned long newtime = jiffies + extra_jiffies;
-
-               /* Only update the timeout if the new timeout is at least
-                  HZ jiffies from the old timeout. Need del_timer for race
-                  avoidance (may already be dying). */
-               if (newtime - ct->timeout.expires >= HZ)
-                       mod_timer_pending(&ct->timeout, newtime);
-       }
+       if (nf_ct_is_confirmed(ct))
+               extra_jiffies += nfct_time_stamp;
 
+       ct->timeout = extra_jiffies;
 acct:
        if (do_acct)
                nf_ct_acct_update(ct, ctinfo, skb->len);
 }
 EXPORT_SYMBOL_GPL(__nf_ct_refresh_acct);
 
-bool __nf_ct_kill_acct(struct nf_conn *ct,
-                      enum ip_conntrack_info ctinfo,
-                      const struct sk_buff *skb,
-                      int do_acct)
+bool nf_ct_kill_acct(struct nf_conn *ct,
+                    enum ip_conntrack_info ctinfo,
+                    const struct sk_buff *skb)
 {
-       if (do_acct)
-               nf_ct_acct_update(ct, ctinfo, skb->len);
+       nf_ct_acct_update(ct, ctinfo, skb->len);
 
-       if (del_timer(&ct->timeout)) {
-               ct->timeout.function((unsigned long)ct);
-               return true;
-       }
-       return false;
+       return nf_ct_delete(ct, 0, 0);
 }
-EXPORT_SYMBOL_GPL(__nf_ct_kill_acct);
+EXPORT_SYMBOL_GPL(nf_ct_kill_acct);
 
 #if IS_ENABLED(CONFIG_NF_CT_NETLINK)
 
@@ -1505,11 +1571,8 @@ void nf_ct_iterate_cleanup(struct net *net,
 
        while ((ct = get_next_corpse(net, iter, data, &bucket)) != NULL) {
                /* Time to push up daises... */
-               if (del_timer(&ct->timeout))
-                       nf_ct_delete(ct, portid, report);
-
-               /* ... else the timer will get him soon. */
 
+               nf_ct_delete(ct, portid, report);
                nf_ct_put(ct);
                cond_resched();
        }
@@ -1545,6 +1608,7 @@ static int untrack_refs(void)
 
 void nf_conntrack_cleanup_start(void)
 {
+       conntrack_gc_work.exiting = true;
        RCU_INIT_POINTER(ip_ct_attach, NULL);
 }
 
@@ -1554,6 +1618,7 @@ void nf_conntrack_cleanup_end(void)
        while (untrack_refs() > 0)
                schedule();
 
+       cancel_delayed_work_sync(&conntrack_gc_work.dwork);
        nf_ct_free_hashtable(nf_conntrack_hash, nf_conntrack_htable_size);
 
        nf_conntrack_proto_fini();
@@ -1828,6 +1893,10 @@ int nf_conntrack_init_start(void)
        }
        /*  - and look it like as a confirmed connection */
        nf_ct_untracked_status_or(IPS_CONFIRMED | IPS_UNTRACKED);
+
+       conntrack_gc_work_init(&conntrack_gc_work);
+       schedule_delayed_work(&conntrack_gc_work.dwork, GC_INTERVAL);
+
        return 0;
 
 err_proto: