Merge git://git.kernel.org/pub/scm/linux/kernel/git/pablo/nf-next
Author:    David S. Miller <davem@davemloft.net>
Date:      Tue, 6 Sep 2016 19:45:26 +0000 (12:45 -0700)
Committer: David S. Miller <davem@davemloft.net>
Date:      Tue, 6 Sep 2016 19:45:26 +0000 (12:45 -0700)
Pablo Neira Ayuso says:

====================
Netfilter updates for net-next

The following patchset contains Netfilter updates for your net-next
tree.  Most relevant updates are the removal of per-conntrack timers to
use a workqueue/garbage collection approach instead from Florian
Westphal, the hash and numgen expression for nf_tables from Laura
Garcia, updates on nf_tables hash set to honor the NLM_F_EXCL flag,
removal of ip_conntrack sysctl and many other incremental updates on our
Netfilter codebase.

More specifically, they are:

1) Retrieve only 4 bytes to fetch ports in case of non-linear skb
   transport area in dccp, sctp, tcp, udp and udplite protocol
   conntrackers, from Gao Feng.

2) Missing whitespace on error message in physdev match, from Hangbin Liu.

3) Skip redundant IPv4 checksum calculation in nf_dup_ipv4, from Liping Zhang.

4) Add nf_ct_expires() helper function and use it, from Florian Westphal.

5) Replace opencoded nf_ct_kill() call in IPVS conntrack support, also
   from Florian.

6) Rename nf_tables set implementation to nft_set_{name}.c

7) Introduce the hash expression to allow arbitrary hashing of selector
   concatenations, from Laura Garcia Liebana.

8) Remove ip_conntrack sysctl backward compatibility code, this code has
   been around for long time already, and we have two interfaces to do
   this already: nf_conntrack sysctl and ctnetlink.

9) Use nf_conntrack_get_ht() helper function whenever possible, instead
   of opencoding fetch of hashtable pointer and size, patch from Liping Zhang.

10) Add quota expression for nf_tables.

11) Add number generator expression for nf_tables, this supports
    incremental and random generators that can be combined with maps,
    very useful for load balancing purpose, again from Laura Garcia Liebana.

12) Fix a typo in a debug message in FTP conntrack helper, from Colin Ian King.

13) Introduce a nft_chain_parse_hook() helper function to parse chain hook
    configuration, this is used by a follow up patch to perform better chain
    update validation.

14) Add rhashtable_lookup_get_insert_key() to rhashtable and use it from the
    nft_set_hash implementation to honor the NLM_F_EXCL flag.

15) Missing nulls check in nf_conntrack from nf_conntrack_tuple_taken(),
    patch from Florian Westphal.

16) Don't use the DYING bit to know if the conntrack event has already been
    delivered; instead, use a state variable to track event re-delivery
    states, also from Florian.

17) Remove the per-conntrack timer, use the workqueue approach that was
    discussed during the NFWS, from Florian Westphal.

18) Use the netlink conntrack table dump path to kill stale entries,
    again from Florian.

19) Add a garbage collector to get rid of stale conntracks, from
    Florian.

20) Reschedule garbage collector if eviction rate is high.

21) Get rid of the __nf_ct_kill_acct() helper.

22) Use ARPHRD_ETHER instead of hardcoded 1 from ARP logger.

23) Make nf_log_set() interface assertive on unsupported families.
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
47 files changed:
include/linux/rhashtable.h
include/net/netfilter/nf_conntrack.h
include/net/netfilter/nf_conntrack_core.h
include/net/netfilter/nf_conntrack_ecache.h
include/net/netfilter/nf_conntrack_l4proto.h
include/net/netfilter/nf_log.h
include/net/netfilter/nf_tables.h
include/net/netns/conntrack.h
include/uapi/linux/netfilter/nf_tables.h
lib/rhashtable.c
net/bridge/netfilter/nf_log_bridge.c
net/ipv4/netfilter/Kconfig
net/ipv4/netfilter/Makefile
net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c [deleted file]
net/ipv4/netfilter/nf_conntrack_proto_icmp.c
net/ipv4/netfilter/nf_dup_ipv4.c
net/ipv4/netfilter/nf_log_arp.c
net/ipv4/netfilter/nf_log_ipv4.c
net/ipv6/netfilter/nf_log_ipv6.c
net/netfilter/Kconfig
net/netfilter/Makefile
net/netfilter/ipvs/ip_vs_nfct.c
net/netfilter/nf_conntrack_core.c
net/netfilter/nf_conntrack_ecache.c
net/netfilter/nf_conntrack_ftp.c
net/netfilter/nf_conntrack_netlink.c
net/netfilter/nf_conntrack_pptp.c
net/netfilter/nf_conntrack_proto.c
net/netfilter/nf_conntrack_proto_dccp.c
net/netfilter/nf_conntrack_proto_generic.c
net/netfilter/nf_conntrack_proto_sctp.c
net/netfilter/nf_conntrack_proto_tcp.c
net/netfilter/nf_conntrack_proto_udp.c
net/netfilter/nf_conntrack_proto_udplite.c
net/netfilter/nf_conntrack_standalone.c
net/netfilter/nf_log.c
net/netfilter/nf_nat_core.c
net/netfilter/nf_tables_api.c
net/netfilter/nft_hash.c
net/netfilter/nft_numgen.c [new file with mode: 0644]
net/netfilter/nft_quota.c [new file with mode: 0644]
net/netfilter/nft_rbtree.c [deleted file]
net/netfilter/nft_set_hash.c [new file with mode: 0644]
net/netfilter/nft_set_rbtree.c [new file with mode: 0644]
net/netfilter/xt_conntrack.c
net/netfilter/xt_physdev.c

index 8b72ee7..fd82584 100644 (file)
@@ -343,7 +343,8 @@ int rhashtable_init(struct rhashtable *ht,
 struct bucket_table *rhashtable_insert_slow(struct rhashtable *ht,
                                            const void *key,
                                            struct rhash_head *obj,
-                                           struct bucket_table *old_tbl);
+                                           struct bucket_table *old_tbl,
+                                           void **data);
 int rhashtable_insert_rehash(struct rhashtable *ht, struct bucket_table *tbl);
 
 void rhashtable_walk_enter(struct rhashtable *ht,
@@ -563,8 +564,11 @@ restart:
        return NULL;
 }
 
-/* Internal function, please use rhashtable_insert_fast() instead */
-static inline int __rhashtable_insert_fast(
-/* Internal function, please use rhashtable_insert_fast() instead */
+/* Internal function, please use rhashtable_insert_fast() instead. This
+ * function returns the existing element already in the hash table if there
+ * is a clash, otherwise it returns an error via ERR_PTR().
+ */
+static inline void *__rhashtable_insert_fast(
        struct rhashtable *ht, const void *key, struct rhash_head *obj,
        const struct rhashtable_params params)
 {
@@ -577,6 +581,7 @@ static inline int __rhashtable_insert_fast(
        spinlock_t *lock;
        unsigned int elasticity;
        unsigned int hash;
+       void *data = NULL;
        int err;
 
 restart:
@@ -601,11 +606,14 @@ restart:
 
        new_tbl = rht_dereference_rcu(tbl->future_tbl, ht);
        if (unlikely(new_tbl)) {
-               tbl = rhashtable_insert_slow(ht, key, obj, new_tbl);
+               tbl = rhashtable_insert_slow(ht, key, obj, new_tbl, &data);
                if (!IS_ERR_OR_NULL(tbl))
                        goto slow_path;
 
                err = PTR_ERR(tbl);
+               if (err == -EEXIST)
+                       err = 0;
+
                goto out;
        }
 
@@ -619,25 +627,25 @@ slow_path:
                err = rhashtable_insert_rehash(ht, tbl);
                rcu_read_unlock();
                if (err)
-                       return err;
+                       return ERR_PTR(err);
 
                goto restart;
        }
 
-       err = -EEXIST;
+       err = 0;
        elasticity = ht->elasticity;
        rht_for_each(head, tbl, hash) {
                if (key &&
                    unlikely(!(params.obj_cmpfn ?
                               params.obj_cmpfn(&arg, rht_obj(ht, head)) :
-                              rhashtable_compare(&arg, rht_obj(ht, head)))))
+                              rhashtable_compare(&arg, rht_obj(ht, head))))) {
+                       data = rht_obj(ht, head);
                        goto out;
+               }
                if (!--elasticity)
                        goto slow_path;
        }
 
-       err = 0;
-
        head = rht_dereference_bucket(tbl->buckets[hash], tbl, hash);
 
        RCU_INIT_POINTER(obj->next, head);
@@ -652,7 +660,7 @@ out:
        spin_unlock_bh(lock);
        rcu_read_unlock();
 
-       return err;
+       return err ? ERR_PTR(err) : data;
 }
 
 /**
@@ -675,7 +683,13 @@ static inline int rhashtable_insert_fast(
        struct rhashtable *ht, struct rhash_head *obj,
        const struct rhashtable_params params)
 {
-       return __rhashtable_insert_fast(ht, NULL, obj, params);
+       void *ret;
+
+       ret = __rhashtable_insert_fast(ht, NULL, obj, params);
+       if (IS_ERR(ret))
+               return PTR_ERR(ret);
+
+       return ret == NULL ? 0 : -EEXIST;
 }
 
 /**
@@ -704,11 +718,15 @@ static inline int rhashtable_lookup_insert_fast(
        const struct rhashtable_params params)
 {
        const char *key = rht_obj(ht, obj);
+       void *ret;
 
        BUG_ON(ht->p.obj_hashfn);
 
-       return __rhashtable_insert_fast(ht, key + ht->p.key_offset, obj,
-                                       params);
+       ret = __rhashtable_insert_fast(ht, key + ht->p.key_offset, obj, params);
+       if (IS_ERR(ret))
+               return PTR_ERR(ret);
+
+       return ret == NULL ? 0 : -EEXIST;
 }
 
 /**
@@ -736,6 +754,32 @@ static inline int rhashtable_lookup_insert_fast(
 static inline int rhashtable_lookup_insert_key(
        struct rhashtable *ht, const void *key, struct rhash_head *obj,
        const struct rhashtable_params params)
+{
+       void *ret;
+
+       BUG_ON(!ht->p.obj_hashfn || !key);
+
+       ret = __rhashtable_insert_fast(ht, key, obj, params);
+       if (IS_ERR(ret))
+               return PTR_ERR(ret);
+
+       return ret == NULL ? 0 : -EEXIST;
+}
+
+/**
+ * rhashtable_lookup_get_insert_key - lookup and insert object into hash table
+ * @ht:                hash table
+ * @obj:       pointer to hash head inside object
+ * @params:    hash table parameters
+ * @data:      pointer to element data already in hashes
+ *
+ * Just like rhashtable_lookup_insert_key(), but this function returns the
+ * object if it exists, NULL if it does not and the insertion was successful,
+ * and an ERR_PTR otherwise.
+ */
+static inline void *rhashtable_lookup_get_insert_key(
+       struct rhashtable *ht, const void *key, struct rhash_head *obj,
+       const struct rhashtable_params params)
 {
        BUG_ON(!ht->p.obj_hashfn || !key);
 
index 445b019..5041805 100644 (file)
@@ -42,7 +42,6 @@ union nf_conntrack_expect_proto {
 
 #include <linux/types.h>
 #include <linux/skbuff.h>
-#include <linux/timer.h>
 
 #ifdef CONFIG_NETFILTER_DEBUG
 #define NF_CT_ASSERT(x)                WARN_ON(!(x))
@@ -73,7 +72,7 @@ struct nf_conn_help {
 #include <net/netfilter/ipv6/nf_conntrack_ipv6.h>
 
 struct nf_conn {
-       /* Usage count in here is 1 for hash table/destruct timer, 1 per skb,
+       /* Usage count in here is 1 for hash table, 1 per skb,
         * plus 1 for any connection(s) we are `master' for
         *
         * Hint, SKB address this struct and refcnt via skb->nfct and
@@ -96,8 +95,8 @@ struct nf_conn {
        /* Have we seen traffic both ways yet? (bitset) */
        unsigned long status;
 
-       /* Timer function; drops refcnt when it goes off. */
-       struct timer_list timeout;
+       /* jiffies32 when this ct is considered dead */
+       u32 timeout;
 
        possible_net_t ct_net;
 
@@ -220,21 +219,14 @@ static inline void nf_ct_refresh(struct nf_conn *ct,
        __nf_ct_refresh_acct(ct, 0, skb, extra_jiffies, 0);
 }
 
-bool __nf_ct_kill_acct(struct nf_conn *ct, enum ip_conntrack_info ctinfo,
-                      const struct sk_buff *skb, int do_acct);
-
 /* kill conntrack and do accounting */
-static inline bool nf_ct_kill_acct(struct nf_conn *ct,
-                                  enum ip_conntrack_info ctinfo,
-                                  const struct sk_buff *skb)
-{
-       return __nf_ct_kill_acct(ct, ctinfo, skb, 1);
-}
+bool nf_ct_kill_acct(struct nf_conn *ct, enum ip_conntrack_info ctinfo,
+                    const struct sk_buff *skb);
 
 /* kill conntrack without accounting */
 static inline bool nf_ct_kill(struct nf_conn *ct)
 {
-       return __nf_ct_kill_acct(ct, 0, NULL, 0);
+       return nf_ct_delete(ct, 0, 0);
 }
 
 /* These are for NAT.  Icky. */
@@ -291,21 +283,55 @@ static inline bool nf_is_loopback_packet(const struct sk_buff *skb)
        return skb->dev && skb->skb_iif && skb->dev->flags & IFF_LOOPBACK;
 }
 
+#define nfct_time_stamp ((u32)(jiffies))
+
 /* jiffies until ct expires, 0 if already expired */
 static inline unsigned long nf_ct_expires(const struct nf_conn *ct)
 {
-       long timeout = (long)ct->timeout.expires - (long)jiffies;
+       s32 timeout = ct->timeout - nfct_time_stamp;
 
        return timeout > 0 ? timeout : 0;
 }
 
+static inline bool nf_ct_is_expired(const struct nf_conn *ct)
+{
+       return (__s32)(ct->timeout - nfct_time_stamp) <= 0;
+}
+
+/* use after obtaining a reference count */
+static inline bool nf_ct_should_gc(const struct nf_conn *ct)
+{
+       return nf_ct_is_expired(ct) && nf_ct_is_confirmed(ct) &&
+              !nf_ct_is_dying(ct);
+}
+
 struct kernel_param;
 
 int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp);
 int nf_conntrack_hash_resize(unsigned int hashsize);
+
+extern struct hlist_nulls_head *nf_conntrack_hash;
 extern unsigned int nf_conntrack_htable_size;
+extern seqcount_t nf_conntrack_generation;
 extern unsigned int nf_conntrack_max;
 
+/* must be called with rcu read lock held */
+static inline void
+nf_conntrack_get_ht(struct hlist_nulls_head **hash, unsigned int *hsize)
+{
+       struct hlist_nulls_head *hptr;
+       unsigned int sequence, hsz;
+
+       do {
+               sequence = read_seqcount_begin(&nf_conntrack_generation);
+               hsz = nf_conntrack_htable_size;
+               hptr = nf_conntrack_hash;
+       } while (read_seqcount_retry(&nf_conntrack_generation, sequence));
+
+       *hash = hptr;
+       *hsize = hsz;
+}
+
 struct nf_conn *nf_ct_tmpl_alloc(struct net *net,
                                 const struct nf_conntrack_zone *zone,
                                 gfp_t flags);
index 79d7ac5..62e17d1 100644 (file)
@@ -51,8 +51,6 @@ bool nf_ct_invert_tuple(struct nf_conntrack_tuple *inverse,
                        const struct nf_conntrack_l3proto *l3proto,
                        const struct nf_conntrack_l4proto *l4proto);
 
-void nf_conntrack_get_ht(struct hlist_nulls_head **hash, unsigned int *hsize);
-
 /* Find a connection corresponding to a tuple. */
 struct nf_conntrack_tuple_hash *
 nf_conntrack_find_get(struct net *net,
@@ -83,7 +81,6 @@ print_tuple(struct seq_file *s, const struct nf_conntrack_tuple *tuple,
 
 #define CONNTRACK_LOCKS 1024
 
-extern struct hlist_nulls_head *nf_conntrack_hash;
 extern spinlock_t nf_conntrack_locks[CONNTRACK_LOCKS];
 void nf_conntrack_lock(spinlock_t *lock);
 
index fa36447..12d967b 100644 (file)
 #include <linux/netfilter/nf_conntrack_tuple_common.h>
 #include <net/netfilter/nf_conntrack_extend.h>
 
+enum nf_ct_ecache_state {
+       NFCT_ECACHE_UNKNOWN,            /* destroy event not sent */
+       NFCT_ECACHE_DESTROY_FAIL,       /* tried but failed to send destroy event */
+       NFCT_ECACHE_DESTROY_SENT,       /* sent destroy event after failure */
+};
+
 struct nf_conntrack_ecache {
-       unsigned long cache;    /* bitops want long */
-       unsigned long missed;   /* missed events */
-       u16 ctmask;             /* bitmask of ct events to be delivered */
-       u16 expmask;            /* bitmask of expect events to be delivered */
-       u32 portid;             /* netlink portid of destroyer */
+       unsigned long cache;            /* bitops want long */
+       unsigned long missed;           /* missed events */
+       u16 ctmask;                     /* bitmask of ct events to be delivered */
+       u16 expmask;                    /* bitmask of expect events to be delivered */
+       u32 portid;                     /* netlink portid of destroyer */
+       enum nf_ct_ecache_state state;  /* ecache state */
 };
 
 static inline struct nf_conntrack_ecache *
index 1a5fb36..de629f1 100644 (file)
@@ -134,14 +134,6 @@ void nf_ct_l4proto_pernet_unregister(struct net *net,
 int nf_ct_l4proto_register(struct nf_conntrack_l4proto *proto);
 void nf_ct_l4proto_unregister(struct nf_conntrack_l4proto *proto);
 
-static inline void nf_ct_kfree_compat_sysctl_table(struct nf_proto_net *pn)
-{
-#if defined(CONFIG_SYSCTL) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT)
-       kfree(pn->ctl_compat_table);
-       pn->ctl_compat_table = NULL;
-#endif
-}
-
 /* Generic netlink helpers */
 int nf_ct_port_tuple_to_nlattr(struct sk_buff *skb,
                               const struct nf_conntrack_tuple *tuple);
index 83d855b..ee07dc8 100644 (file)
@@ -60,8 +60,7 @@ struct nf_logger {
 int nf_log_register(u_int8_t pf, struct nf_logger *logger);
 void nf_log_unregister(struct nf_logger *logger);
 
-void nf_log_set(struct net *net, u_int8_t pf,
-               const struct nf_logger *logger);
+int nf_log_set(struct net *net, u_int8_t pf, const struct nf_logger *logger);
 void nf_log_unset(struct net *net, const struct nf_logger *logger);
 
 int nf_log_bind_pf(struct net *net, u_int8_t pf,
index f2f1339..8972468 100644 (file)
@@ -251,7 +251,8 @@ struct nft_set_ops {
 
        int                             (*insert)(const struct net *net,
                                                  const struct nft_set *set,
-                                                 const struct nft_set_elem *elem);
+                                                 const struct nft_set_elem *elem,
+                                                 struct nft_set_ext **ext);
        void                            (*activate)(const struct net *net,
                                                    const struct nft_set *set,
                                                    const struct nft_set_elem *elem);
index 38b1a80..e469e85 100644 (file)
@@ -15,10 +15,6 @@ struct nf_proto_net {
 #ifdef CONFIG_SYSCTL
        struct ctl_table_header *ctl_table_header;
        struct ctl_table        *ctl_table;
-#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT
-       struct ctl_table_header *ctl_compat_header;
-       struct ctl_table        *ctl_compat_table;
-#endif
 #endif
        unsigned int            users;
 };
@@ -58,10 +54,6 @@ struct nf_ip_net {
        struct nf_udp_net       udp;
        struct nf_icmp_net      icmp;
        struct nf_icmp_net      icmpv6;
-#if defined(CONFIG_SYSCTL) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT)
-       struct ctl_table_header *ctl_table_header;
-       struct ctl_table        *ctl_table;
-#endif
 };
 
 struct ct_pcpu {
index c674ba2..28ce01d 100644 (file)
@@ -723,6 +723,26 @@ enum nft_meta_keys {
        NFT_META_PRANDOM,
 };
 
+/**
+ * enum nft_hash_attributes - nf_tables hash expression netlink attributes
+ *
+ * @NFTA_HASH_SREG: source register (NLA_U32)
+ * @NFTA_HASH_DREG: destination register (NLA_U32)
+ * @NFTA_HASH_LEN: source data length (NLA_U32)
+ * @NFTA_HASH_MODULUS: modulus value (NLA_U32)
+ * @NFTA_HASH_SEED: seed value (NLA_U32)
+ */
+enum nft_hash_attributes {
+       NFTA_HASH_UNSPEC,
+       NFTA_HASH_SREG,
+       NFTA_HASH_DREG,
+       NFTA_HASH_LEN,
+       NFTA_HASH_MODULUS,
+       NFTA_HASH_SEED,
+       __NFTA_HASH_MAX,
+};
+#define NFTA_HASH_MAX  (__NFTA_HASH_MAX - 1)
+
 /**
  * enum nft_meta_attributes - nf_tables meta expression netlink attributes
  *
@@ -880,6 +900,25 @@ enum nft_queue_attributes {
 #define NFT_QUEUE_FLAG_CPU_FANOUT      0x02 /* use current CPU (no hashing) */
 #define NFT_QUEUE_FLAG_MASK            0x03
 
+enum nft_quota_flags {
+       NFT_QUOTA_F_INV         = (1 << 0),
+};
+
+/**
+ * enum nft_quota_attributes - nf_tables quota expression netlink attributes
+ *
+ * @NFTA_QUOTA_BYTES: quota in bytes (NLA_U64)
+ * @NFTA_QUOTA_FLAGS: flags (NLA_U32)
+ */
+enum nft_quota_attributes {
+       NFTA_QUOTA_UNSPEC,
+       NFTA_QUOTA_BYTES,
+       NFTA_QUOTA_FLAGS,
+       NFTA_QUOTA_PAD,
+       __NFTA_QUOTA_MAX
+};
+#define NFTA_QUOTA_MAX         (__NFTA_QUOTA_MAX - 1)
+
 /**
  * enum nft_reject_types - nf_tables reject expression reject types
  *
@@ -1051,7 +1090,7 @@ enum nft_gen_attributes {
  * @NFTA_TRACE_NFPROTO: nf protocol processed (NLA_U32)
  * @NFTA_TRACE_POLICY: policy that decided fate of packet (NLA_U32)
  */
-enum nft_trace_attibutes {
+enum nft_trace_attributes {
        NFTA_TRACE_UNSPEC,
        NFTA_TRACE_TABLE,
        NFTA_TRACE_CHAIN,
@@ -1082,4 +1121,28 @@ enum nft_trace_types {
        __NFT_TRACETYPE_MAX
 };
 #define NFT_TRACETYPE_MAX (__NFT_TRACETYPE_MAX - 1)
+
+/**
+ * enum nft_ng_attributes - nf_tables number generator expression netlink attributes
+ *
+ * @NFTA_NG_DREG: destination register (NLA_U32)
+ * @NFTA_NG_UNTIL: source value to increment the counter until reset (NLA_U32)
+ * @NFTA_NG_TYPE: operation type (NLA_U32)
+ */
+enum nft_ng_attributes {
+       NFTA_NG_UNSPEC,
+       NFTA_NG_DREG,
+       NFTA_NG_UNTIL,
+       NFTA_NG_TYPE,
+       __NFTA_NG_MAX
+};
+#define NFTA_NG_MAX    (__NFTA_NG_MAX - 1)
+
+enum nft_ng_types {
+       NFT_NG_INCREMENTAL,
+       NFT_NG_RANDOM,
+       __NFT_NG_MAX
+};
+#define NFT_NG_MAX     (__NFT_NG_MAX - 1)
+
 #endif /* _LINUX_NF_TABLES_H */
index 4320b92..06c2872 100644 (file)
@@ -444,7 +444,8 @@ EXPORT_SYMBOL_GPL(rhashtable_insert_rehash);
 struct bucket_table *rhashtable_insert_slow(struct rhashtable *ht,
                                            const void *key,
                                            struct rhash_head *obj,
-                                           struct bucket_table *tbl)
+                                           struct bucket_table *tbl,
+                                           void **data)
 {
        struct rhash_head *head;
        unsigned int hash;
@@ -455,8 +456,11 @@ struct bucket_table *rhashtable_insert_slow(struct rhashtable *ht,
        spin_lock_nested(rht_bucket_lock(tbl, hash), SINGLE_DEPTH_NESTING);
 
        err = -EEXIST;
-       if (key && rhashtable_lookup_fast(ht, key, ht->p))
-               goto exit;
+       if (key) {
+               *data = rhashtable_lookup_fast(ht, key, ht->p);
+               if (*data)
+                       goto exit;
+       }
 
        err = -E2BIG;
        if (unlikely(rht_grow_above_max(ht, tbl)))
index 5d9953a..1663df5 100644 (file)
@@ -50,8 +50,7 @@ static struct nf_logger nf_bridge_logger __read_mostly = {
 
 static int __net_init nf_log_bridge_net_init(struct net *net)
 {
-       nf_log_set(net, NFPROTO_BRIDGE, &nf_bridge_logger);
-       return 0;
+       return nf_log_set(net, NFPROTO_BRIDGE, &nf_bridge_logger);
 }
 
 static void __net_exit nf_log_bridge_net_exit(struct net *net)
index c187c60..d613309 100644 (file)
@@ -25,17 +25,6 @@ config NF_CONNTRACK_IPV4
 
          To compile it as a module, choose M here.  If unsure, say N.
 
-config NF_CONNTRACK_PROC_COMPAT
-       bool "proc/sysctl compatibility with old connection tracking"
-       depends on NF_CONNTRACK_PROCFS && NF_CONNTRACK_IPV4
-       default y
-       help
-         This option enables /proc and sysctl compatibility with the old
-         layer 3 dependent connection tracking. This is needed to keep
-         old programs that have not been adapted to the new names working.
-
-         If unsure, say Y.
-
 if NF_TABLES
 
 config NF_TABLES_IPV4
index 87b073d..853328f 100644 (file)
@@ -4,11 +4,6 @@
 
 # objects for l3 independent conntrack
 nf_conntrack_ipv4-y    :=  nf_conntrack_l3proto_ipv4.o nf_conntrack_proto_icmp.o
-ifeq ($(CONFIG_NF_CONNTRACK_PROC_COMPAT),y)
-ifeq ($(CONFIG_PROC_FS),y)
-nf_conntrack_ipv4-objs += nf_conntrack_l3proto_ipv4_compat.o
-endif
-endif
 
 # connection tracking
 obj-$(CONFIG_NF_CONNTRACK_IPV4) += nf_conntrack_ipv4.o
index ae1a71a..870aebd 100644 (file)
@@ -202,47 +202,6 @@ static struct nf_hook_ops ipv4_conntrack_ops[] __read_mostly = {
        },
 };
 
-#if defined(CONFIG_SYSCTL) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT)
-static int log_invalid_proto_min = 0;
-static int log_invalid_proto_max = 255;
-
-static struct ctl_table ip_ct_sysctl_table[] = {
-       {
-               .procname       = "ip_conntrack_max",
-               .maxlen         = sizeof(int),
-               .mode           = 0644,
-               .proc_handler   = proc_dointvec,
-       },
-       {
-               .procname       = "ip_conntrack_count",
-               .maxlen         = sizeof(int),
-               .mode           = 0444,
-               .proc_handler   = proc_dointvec,
-       },
-       {
-               .procname       = "ip_conntrack_buckets",
-               .maxlen         = sizeof(unsigned int),
-               .mode           = 0444,
-               .proc_handler   = proc_dointvec,
-       },
-       {
-               .procname       = "ip_conntrack_checksum",
-               .maxlen         = sizeof(int),
-               .mode           = 0644,
-               .proc_handler   = proc_dointvec,
-       },
-       {
-               .procname       = "ip_conntrack_log_invalid",
-               .maxlen         = sizeof(unsigned int),
-               .mode           = 0644,
-               .proc_handler   = proc_dointvec_minmax,
-               .extra1         = &log_invalid_proto_min,
-               .extra2         = &log_invalid_proto_max,
-       },
-       { }
-};
-#endif /* CONFIG_SYSCTL && CONFIG_NF_CONNTRACK_PROC_COMPAT */
-
 /* Fast function for those who don't want to parse /proc (and I don't
    blame them). */
 /* Reversing the socket's dst/src point of view gives us the reply
@@ -350,20 +309,6 @@ static struct nf_sockopt_ops so_getorigdst = {
 
 static int ipv4_init_net(struct net *net)
 {
-#if defined(CONFIG_SYSCTL) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT)
-       struct nf_ip_net *in = &net->ct.nf_ct_proto;
-       in->ctl_table = kmemdup(ip_ct_sysctl_table,
-                               sizeof(ip_ct_sysctl_table),
-                               GFP_KERNEL);
-       if (!in->ctl_table)
-               return -ENOMEM;
-
-       in->ctl_table[0].data = &nf_conntrack_max;
-       in->ctl_table[1].data = &net->ct.count;
-       in->ctl_table[2].data = &nf_conntrack_htable_size;
-       in->ctl_table[3].data = &net->ct.sysctl_checksum;
-       in->ctl_table[4].data = &net->ct.sysctl_log_invalid;
-#endif
        return 0;
 }
 
@@ -379,9 +324,6 @@ struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4 __read_mostly = {
        .nlattr_tuple_size = ipv4_nlattr_tuple_size,
        .nlattr_to_tuple = ipv4_nlattr_to_tuple,
        .nla_policy      = ipv4_nla_policy,
-#endif
-#if defined(CONFIG_SYSCTL) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT)
-       .ctl_table_path  = "net/ipv4/netfilter",
 #endif
        .init_net        = ipv4_init_net,
        .me              = THIS_MODULE,
@@ -492,16 +434,7 @@ static int __init nf_conntrack_l3proto_ipv4_init(void)
                goto cleanup_icmpv4;
        }
 
-#if defined(CONFIG_PROC_FS) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT)
-       ret = nf_conntrack_ipv4_compat_init();
-       if (ret < 0)
-               goto cleanup_proto;
-#endif
        return ret;
-#if defined(CONFIG_PROC_FS) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT)
- cleanup_proto:
-       nf_ct_l3proto_unregister(&nf_conntrack_l3proto_ipv4);
-#endif
  cleanup_icmpv4:
        nf_ct_l4proto_unregister(&nf_conntrack_l4proto_icmp);
  cleanup_udp4:
@@ -520,9 +453,6 @@ static int __init nf_conntrack_l3proto_ipv4_init(void)
 static void __exit nf_conntrack_l3proto_ipv4_fini(void)
 {
        synchronize_net();
-#if defined(CONFIG_PROC_FS) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT)
-       nf_conntrack_ipv4_compat_fini();
-#endif
        nf_ct_l3proto_unregister(&nf_conntrack_l3proto_ipv4);
        nf_ct_l4proto_unregister(&nf_conntrack_l4proto_icmp);
        nf_ct_l4proto_unregister(&nf_conntrack_l4proto_udp4);
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
deleted file mode 100644 (file)
index 6392371..0000000
+++ /dev/null
@@ -1,492 +0,0 @@
-/* ip_conntrack proc compat - based on ip_conntrack_standalone.c
- *
- * (C) 1999-2001 Paul `Rusty' Russell
- * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
- * (C) 2006-2010 Patrick McHardy <kaber@trash.net>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-#include <linux/types.h>
-#include <linux/proc_fs.h>
-#include <linux/seq_file.h>
-#include <linux/percpu.h>
-#include <linux/security.h>
-#include <net/net_namespace.h>
-
-#include <linux/netfilter.h>
-#include <net/netfilter/nf_conntrack_core.h>
-#include <net/netfilter/nf_conntrack_l3proto.h>
-#include <net/netfilter/nf_conntrack_l4proto.h>
-#include <net/netfilter/nf_conntrack_expect.h>
-#include <net/netfilter/nf_conntrack_acct.h>
-#include <linux/rculist_nulls.h>
-#include <linux/export.h>
-
-struct ct_iter_state {
-       struct seq_net_private p;
-       struct hlist_nulls_head *hash;
-       unsigned int htable_size;
-       unsigned int bucket;
-};
-
-static struct hlist_nulls_node *ct_get_first(struct seq_file *seq)
-{
-       struct ct_iter_state *st = seq->private;
-       struct hlist_nulls_node *n;
-
-       for (st->bucket = 0;
-            st->bucket < st->htable_size;
-            st->bucket++) {
-               n = rcu_dereference(
-                       hlist_nulls_first_rcu(&st->hash[st->bucket]));
-               if (!is_a_nulls(n))
-                       return n;
-       }
-       return NULL;
-}
-
-static struct hlist_nulls_node *ct_get_next(struct seq_file *seq,
-                                     struct hlist_nulls_node *head)
-{
-       struct ct_iter_state *st = seq->private;
-
-       head = rcu_dereference(hlist_nulls_next_rcu(head));
-       while (is_a_nulls(head)) {
-               if (likely(get_nulls_value(head) == st->bucket)) {
-                       if (++st->bucket >= st->htable_size)
-                               return NULL;
-               }
-               head = rcu_dereference(
-                       hlist_nulls_first_rcu(&st->hash[st->bucket]));
-       }
-       return head;
-}
-
-static struct hlist_nulls_node *ct_get_idx(struct seq_file *seq, loff_t pos)
-{
-       struct hlist_nulls_node *head = ct_get_first(seq);
-
-       if (head)
-               while (pos && (head = ct_get_next(seq, head)))
-                       pos--;
-       return pos ? NULL : head;
-}
-
-static void *ct_seq_start(struct seq_file *seq, loff_t *pos)
-       __acquires(RCU)
-{
-       struct ct_iter_state *st = seq->private;
-
-       rcu_read_lock();
-
-       nf_conntrack_get_ht(&st->hash, &st->htable_size);
-       return ct_get_idx(seq, *pos);
-}
-
-static void *ct_seq_next(struct seq_file *s, void *v, loff_t *pos)
-{
-       (*pos)++;
-       return ct_get_next(s, v);
-}
-
-static void ct_seq_stop(struct seq_file *s, void *v)
-       __releases(RCU)
-{
-       rcu_read_unlock();
-}
-
-#ifdef CONFIG_NF_CONNTRACK_SECMARK
-static void ct_show_secctx(struct seq_file *s, const struct nf_conn *ct)
-{
-       int ret;
-       u32 len;
-       char *secctx;
-
-       ret = security_secid_to_secctx(ct->secmark, &secctx, &len);
-       if (ret)
-               return;
-
-       seq_printf(s, "secctx=%s ", secctx);
-
-       security_release_secctx(secctx, len);
-}
-#else
-static inline void ct_show_secctx(struct seq_file *s, const struct nf_conn *ct)
-{
-}
-#endif
-
-static bool ct_seq_should_skip(const struct nf_conn *ct,
-                              const struct net *net,
-                              const struct nf_conntrack_tuple_hash *hash)
-{
-       /* we only want to print DIR_ORIGINAL */
-       if (NF_CT_DIRECTION(hash))
-               return true;
-
-       if (nf_ct_l3num(ct) != AF_INET)
-               return true;
-
-       if (!net_eq(nf_ct_net(ct), net))
-               return true;
-
-       return false;
-}
-
-static int ct_seq_show(struct seq_file *s, void *v)
-{
-       struct nf_conntrack_tuple_hash *hash = v;
-       struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(hash);
-       const struct nf_conntrack_l3proto *l3proto;
-       const struct nf_conntrack_l4proto *l4proto;
-       int ret = 0;
-
-       NF_CT_ASSERT(ct);
-       if (ct_seq_should_skip(ct, seq_file_net(s), hash))
-               return 0;
-
-       if (unlikely(!atomic_inc_not_zero(&ct->ct_general.use)))
-               return 0;
-
-       /* check if we raced w. object reuse */
-       if (!nf_ct_is_confirmed(ct) ||
-           ct_seq_should_skip(ct, seq_file_net(s), hash))
-               goto release;
-
-       l3proto = __nf_ct_l3proto_find(nf_ct_l3num(ct));
-       NF_CT_ASSERT(l3proto);
-       l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct));
-       NF_CT_ASSERT(l4proto);
-
-       ret = -ENOSPC;
-       seq_printf(s, "%-8s %u %ld ",
-                  l4proto->name, nf_ct_protonum(ct),
-                  timer_pending(&ct->timeout)
-                  ? (long)(ct->timeout.expires - jiffies)/HZ : 0);
-
-       if (l4proto->print_conntrack)
-               l4proto->print_conntrack(s, ct);
-
-       if (seq_has_overflowed(s))
-               goto release;
-
-       print_tuple(s, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
-                   l3proto, l4proto);
-
-       if (seq_has_overflowed(s))
-               goto release;
-
-       if (seq_print_acct(s, ct, IP_CT_DIR_ORIGINAL))
-               goto release;
-
-       if (!(test_bit(IPS_SEEN_REPLY_BIT, &ct->status)))
-               seq_printf(s, "[UNREPLIED] ");
-
-       print_tuple(s, &ct->tuplehash[IP_CT_DIR_REPLY].tuple,
-                   l3proto, l4proto);
-
-       if (seq_has_overflowed(s))
-               goto release;
-
-       if (seq_print_acct(s, ct, IP_CT_DIR_REPLY))
-               goto release;
-
-       if (test_bit(IPS_ASSURED_BIT, &ct->status))
-               seq_printf(s, "[ASSURED] ");
-
-#ifdef CONFIG_NF_CONNTRACK_MARK
-       seq_printf(s, "mark=%u ", ct->mark);
-#endif
-
-       ct_show_secctx(s, ct);
-
-       seq_printf(s, "use=%u\n", atomic_read(&ct->ct_general.use));
-
-       if (seq_has_overflowed(s))
-               goto release;
-
-       ret = 0;
-release:
-       nf_ct_put(ct);
-       return ret;
-}
-
-static const struct seq_operations ct_seq_ops = {
-       .start = ct_seq_start,
-       .next  = ct_seq_next,
-       .stop  = ct_seq_stop,
-       .show  = ct_seq_show
-};
-
-static int ct_open(struct inode *inode, struct file *file)
-{
-       return seq_open_net(inode, file, &ct_seq_ops,
-                           sizeof(struct ct_iter_state));
-}
-
-static const struct file_operations ct_file_ops = {
-       .owner   = THIS_MODULE,
-       .open    = ct_open,
-       .read    = seq_read,
-       .llseek  = seq_lseek,
-       .release = seq_release_net,
-};
-
-/* expects */
-struct ct_expect_iter_state {
-       struct seq_net_private p;
-       unsigned int bucket;
-};
-
-static struct hlist_node *ct_expect_get_first(struct seq_file *seq)
-{
-       struct ct_expect_iter_state *st = seq->private;
-       struct hlist_node *n;
-
-       for (st->bucket = 0; st->bucket < nf_ct_expect_hsize; st->bucket++) {
-               n = rcu_dereference(
-                       hlist_first_rcu(&nf_ct_expect_hash[st->bucket]));
-               if (n)
-                       return n;
-       }
-       return NULL;
-}
-
-static struct hlist_node *ct_expect_get_next(struct seq_file *seq,
-                                            struct hlist_node *head)
-{
-       struct ct_expect_iter_state *st = seq->private;
-
-       head = rcu_dereference(hlist_next_rcu(head));
-       while (head == NULL) {
-               if (++st->bucket >= nf_ct_expect_hsize)
-                       return NULL;
-               head = rcu_dereference(
-                       hlist_first_rcu(&nf_ct_expect_hash[st->bucket]));
-       }
-       return head;
-}
-
-static struct hlist_node *ct_expect_get_idx(struct seq_file *seq, loff_t pos)
-{
-       struct hlist_node *head = ct_expect_get_first(seq);
-
-       if (head)
-               while (pos && (head = ct_expect_get_next(seq, head)))
-                       pos--;
-       return pos ? NULL : head;
-}
-
-static void *exp_seq_start(struct seq_file *seq, loff_t *pos)
-       __acquires(RCU)
-{
-       rcu_read_lock();
-       return ct_expect_get_idx(seq, *pos);
-}
-
-static void *exp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
-{
-       (*pos)++;
-       return ct_expect_get_next(seq, v);
-}
-
-static void exp_seq_stop(struct seq_file *seq, void *v)
-       __releases(RCU)
-{
-       rcu_read_unlock();
-}
-
-static int exp_seq_show(struct seq_file *s, void *v)
-{
-       struct nf_conntrack_expect *exp;
-       const struct hlist_node *n = v;
-
-       exp = hlist_entry(n, struct nf_conntrack_expect, hnode);
-
-       if (!net_eq(nf_ct_net(exp->master), seq_file_net(s)))
-               return 0;
-
-       if (exp->tuple.src.l3num != AF_INET)
-               return 0;
-
-       if (exp->timeout.function)
-               seq_printf(s, "%ld ", timer_pending(&exp->timeout)
-                          ? (long)(exp->timeout.expires - jiffies)/HZ : 0);
-       else
-               seq_printf(s, "- ");
-
-       seq_printf(s, "proto=%u ", exp->tuple.dst.protonum);
-
-       print_tuple(s, &exp->tuple,
-                   __nf_ct_l3proto_find(exp->tuple.src.l3num),
-                   __nf_ct_l4proto_find(exp->tuple.src.l3num,
-                                        exp->tuple.dst.protonum));
-       seq_putc(s, '\n');
-
-       return 0;
-}
-
-static const struct seq_operations exp_seq_ops = {
-       .start = exp_seq_start,
-       .next = exp_seq_next,
-       .stop = exp_seq_stop,
-       .show = exp_seq_show
-};
-
-static int exp_open(struct inode *inode, struct file *file)
-{
-       return seq_open_net(inode, file, &exp_seq_ops,
-                           sizeof(struct ct_expect_iter_state));
-}
-
-static const struct file_operations ip_exp_file_ops = {
-       .owner   = THIS_MODULE,
-       .open    = exp_open,
-       .read    = seq_read,
-       .llseek  = seq_lseek,
-       .release = seq_release_net,
-};
-
-static void *ct_cpu_seq_start(struct seq_file *seq, loff_t *pos)
-{
-       struct net *net = seq_file_net(seq);
-       int cpu;
-
-       if (*pos == 0)
-               return SEQ_START_TOKEN;
-
-       for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
-               if (!cpu_possible(cpu))
-                       continue;
-               *pos = cpu+1;
-               return per_cpu_ptr(net->ct.stat, cpu);
-       }
-
-       return NULL;
-}
-
-static void *ct_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
-{
-       struct net *net = seq_file_net(seq);
-       int cpu;
-
-       for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
-               if (!cpu_possible(cpu))
-                       continue;
-               *pos = cpu+1;
-               return per_cpu_ptr(net->ct.stat, cpu);
-       }
-
-       return NULL;
-}
-
-static void ct_cpu_seq_stop(struct seq_file *seq, void *v)
-{
-}
-
-static int ct_cpu_seq_show(struct seq_file *seq, void *v)
-{
-       struct net *net = seq_file_net(seq);
-       unsigned int nr_conntracks = atomic_read(&net->ct.count);
-       const struct ip_conntrack_stat *st = v;
-
-       if (v == SEQ_START_TOKEN) {
-               seq_printf(seq, "entries  searched found new invalid ignore delete delete_list insert insert_failed drop early_drop icmp_error  expect_new expect_create expect_delete search_restart\n");
-               return 0;
-       }
-
-       seq_printf(seq, "%08x  %08x %08x %08x %08x %08x %08x %08x "
-                       "%08x %08x %08x %08x %08x  %08x %08x %08x %08x\n",
-                  nr_conntracks,
-                  st->searched,
-                  st->found,
-                  st->new,
-                  st->invalid,
-                  st->ignore,
-                  st->delete,
-                  st->delete_list,
-                  st->insert,
-                  st->insert_failed,
-                  st->drop,
-                  st->early_drop,
-                  st->error,
-
-                  st->expect_new,
-                  st->expect_create,
-                  st->expect_delete,
-                  st->search_restart
-               );
-       return 0;
-}
-
-static const struct seq_operations ct_cpu_seq_ops = {
-       .start  = ct_cpu_seq_start,
-       .next   = ct_cpu_seq_next,
-       .stop   = ct_cpu_seq_stop,
-       .show   = ct_cpu_seq_show,
-};
-
-static int ct_cpu_seq_open(struct inode *inode, struct file *file)
-{
-       return seq_open_net(inode, file, &ct_cpu_seq_ops,
-                           sizeof(struct seq_net_private));
-}
-
-static const struct file_operations ct_cpu_seq_fops = {
-       .owner   = THIS_MODULE,
-       .open    = ct_cpu_seq_open,
-       .read    = seq_read,
-       .llseek  = seq_lseek,
-       .release = seq_release_net,
-};
-
-static int __net_init ip_conntrack_net_init(struct net *net)
-{
-       struct proc_dir_entry *proc, *proc_exp, *proc_stat;
-
-       proc = proc_create("ip_conntrack", 0440, net->proc_net, &ct_file_ops);
-       if (!proc)
-               goto err1;
-
-       proc_exp = proc_create("ip_conntrack_expect", 0440, net->proc_net,
-                              &ip_exp_file_ops);
-       if (!proc_exp)
-               goto err2;
-
-       proc_stat = proc_create("ip_conntrack", S_IRUGO,
-                               net->proc_net_stat, &ct_cpu_seq_fops);
-       if (!proc_stat)
-               goto err3;
-       return 0;
-
-err3:
-       remove_proc_entry("ip_conntrack_expect", net->proc_net);
-err2:
-       remove_proc_entry("ip_conntrack", net->proc_net);
-err1:
-       return -ENOMEM;
-}
-
-static void __net_exit ip_conntrack_net_exit(struct net *net)
-{
-       remove_proc_entry("ip_conntrack", net->proc_net_stat);
-       remove_proc_entry("ip_conntrack_expect", net->proc_net);
-       remove_proc_entry("ip_conntrack", net->proc_net);
-}
-
-static struct pernet_operations ip_conntrack_net_ops = {
-       .init = ip_conntrack_net_init,
-       .exit = ip_conntrack_net_exit,
-};
-
-int __init nf_conntrack_ipv4_compat_init(void)
-{
-       return register_pernet_subsys(&ip_conntrack_net_ops);
-}
-
-void __exit nf_conntrack_ipv4_compat_fini(void)
-{
-       unregister_pernet_subsys(&ip_conntrack_net_ops);
-}
index c567e1b..4b5904b 100644 (file)
@@ -327,17 +327,6 @@ static struct ctl_table icmp_sysctl_table[] = {
        },
        { }
 };
-#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT
-static struct ctl_table icmp_compat_sysctl_table[] = {
-       {
-               .procname       = "ip_conntrack_icmp_timeout",
-               .maxlen         = sizeof(unsigned int),
-               .mode           = 0644,
-               .proc_handler   = proc_dointvec_jiffies,
-       },
-       { }
-};
-#endif /* CONFIG_NF_CONNTRACK_PROC_COMPAT */
 #endif /* CONFIG_SYSCTL */
 
 static int icmp_kmemdup_sysctl_table(struct nf_proto_net *pn,
@@ -355,40 +344,14 @@ static int icmp_kmemdup_sysctl_table(struct nf_proto_net *pn,
        return 0;
 }
 
-static int icmp_kmemdup_compat_sysctl_table(struct nf_proto_net *pn,
-                                           struct nf_icmp_net *in)
-{
-#ifdef CONFIG_SYSCTL
-#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT
-       pn->ctl_compat_table = kmemdup(icmp_compat_sysctl_table,
-                                      sizeof(icmp_compat_sysctl_table),
-                                      GFP_KERNEL);
-       if (!pn->ctl_compat_table)
-               return -ENOMEM;
-
-       pn->ctl_compat_table[0].data = &in->timeout;
-#endif
-#endif
-       return 0;
-}
-
 static int icmp_init_net(struct net *net, u_int16_t proto)
 {
-       int ret;
        struct nf_icmp_net *in = icmp_pernet(net);
        struct nf_proto_net *pn = &in->pn;
 
        in->timeout = nf_ct_icmp_timeout;
 
-       ret = icmp_kmemdup_compat_sysctl_table(pn, in);
-       if (ret < 0)
-               return ret;
-
-       ret = icmp_kmemdup_sysctl_table(pn, in);
-       if (ret < 0)
-               nf_ct_kfree_compat_sysctl_table(pn);
-
-       return ret;
+       return icmp_kmemdup_sysctl_table(pn, in);
 }
 
 static struct nf_proto_net *icmp_get_net_proto(struct net *net)
index ceb1873..cf986e1 100644 (file)
@@ -74,21 +74,19 @@ void nf_dup_ipv4(struct net *net, struct sk_buff *skb, unsigned int hooknum,
        nf_conntrack_get(skb->nfct);
 #endif
        /*
-        * If we are in PREROUTING/INPUT, the checksum must be recalculated
-        * since the length could have changed as a result of defragmentation.
-        *
-        * We also decrease the TTL to mitigate potential loops between two
-        * hosts.
+        * If we are in PREROUTING/INPUT, decrease the TTL to mitigate potential
+        * loops between two hosts.
         *
         * Set %IP_DF so that the original source is notified of a potentially
         * decreased MTU on the clone route. IPv6 does this too.
+        *
+        * IP header checksum will be recalculated at ip_local_out.
         */
        iph = ip_hdr(skb);
        iph->frag_off |= htons(IP_DF);
        if (hooknum == NF_INET_PRE_ROUTING ||
            hooknum == NF_INET_LOCAL_IN)
                --iph->ttl;
-       ip_send_check(iph);
 
        if (nf_dup_ipv4_route(net, skb, gw, oif)) {
                __this_cpu_write(nf_skb_duplicated, true);
index e7ad950..8945c26 100644 (file)
@@ -62,7 +62,7 @@ static void dump_arp_packet(struct nf_log_buf *m,
        /* If it's for Ethernet and the lengths are OK, then log the ARP
         * payload.
         */
-       if (ah->ar_hrd != htons(1) ||
+       if (ah->ar_hrd != htons(ARPHRD_ETHER) ||
            ah->ar_hln != ETH_ALEN ||
            ah->ar_pln != sizeof(__be32))
                return;
@@ -111,8 +111,7 @@ static struct nf_logger nf_arp_logger __read_mostly = {
 
 static int __net_init nf_log_arp_net_init(struct net *net)
 {
-       nf_log_set(net, NFPROTO_ARP, &nf_arp_logger);
-       return 0;
+       return nf_log_set(net, NFPROTO_ARP, &nf_arp_logger);
 }
 
 static void __net_exit nf_log_arp_net_exit(struct net *net)
index 076aadd..20f2255 100644 (file)
@@ -347,8 +347,7 @@ static struct nf_logger nf_ip_logger __read_mostly = {
 
 static int __net_init nf_log_ipv4_net_init(struct net *net)
 {
-       nf_log_set(net, NFPROTO_IPV4, &nf_ip_logger);
-       return 0;
+       return nf_log_set(net, NFPROTO_IPV4, &nf_ip_logger);
 }
 
 static void __net_exit nf_log_ipv4_net_exit(struct net *net)
index 8dd8696..c1bcf69 100644 (file)
@@ -379,8 +379,7 @@ static struct nf_logger nf_ip6_logger __read_mostly = {
 
 static int __net_init nf_log_ipv6_net_init(struct net *net)
 {
-       nf_log_set(net, NFPROTO_IPV6, &nf_ip6_logger);
-       return 0;
+       return nf_log_set(net, NFPROTO_IPV6, &nf_ip6_logger);
 }
 
 static void __net_exit nf_log_ipv6_net_exit(struct net *net)
index 9266cee..e8d56d9 100644 (file)
@@ -474,6 +474,12 @@ config NFT_META
          This option adds the "meta" expression that you can use to match and
          to set packet metainformation such as the packet mark.
 
+config NFT_NUMGEN
+       tristate "Netfilter nf_tables number generator module"
+       help
+         This option adds the number generator expression used to perform
+         incremental counting and random numbers bounded by an upper limit.
+
 config NFT_CT
        depends on NF_CONNTRACK
        tristate "Netfilter nf_tables conntrack module"
@@ -481,13 +487,13 @@ config NFT_CT
          This option adds the "meta" expression that you can use to match
          connection tracking information such as the flow state.
 
-config NFT_RBTREE
+config NFT_SET_RBTREE
        tristate "Netfilter nf_tables rbtree set module"
        help
          This option adds the "rbtree" set type (Red Black tree) that is used
          to build interval-based sets.
 
-config NFT_HASH
+config NFT_SET_HASH
        tristate "Netfilter nf_tables hash set module"
        help
          This option adds the "hash" set type that is used to build one-way
@@ -542,6 +548,12 @@ config NFT_QUEUE
          This is required if you intend to use the userspace queueing
          infrastructure (also known as NFQUEUE) from nftables.
 
+config NFT_QUOTA
+       tristate "Netfilter nf_tables quota module"
+       help
+         This option adds the "quota" expression that you can use to
+         enforce byte quotas.
+
 config NFT_REJECT
        default m if NETFILTER_ADVANCED=n
        tristate "Netfilter nf_tables reject support"
@@ -563,6 +575,12 @@ config NFT_COMPAT
          x_tables match/target extensions over the nf_tables
          framework.
 
+config NFT_HASH
+       tristate "Netfilter nf_tables hash module"
+       help
+         This option adds the "hash" expression that you can use to perform
+         a hash operation on registers.
+
 if NF_TABLES_NETDEV
 
 config NF_DUP_NETDEV
index 6913454..0c85811 100644 (file)
@@ -80,18 +80,21 @@ obj-$(CONFIG_NF_TABLES_NETDEV)      += nf_tables_netdev.o
 obj-$(CONFIG_NFT_COMPAT)       += nft_compat.o
 obj-$(CONFIG_NFT_EXTHDR)       += nft_exthdr.o
 obj-$(CONFIG_NFT_META)         += nft_meta.o
+obj-$(CONFIG_NFT_NUMGEN)       += nft_numgen.o
 obj-$(CONFIG_NFT_CT)           += nft_ct.o
 obj-$(CONFIG_NFT_LIMIT)                += nft_limit.o
 obj-$(CONFIG_NFT_NAT)          += nft_nat.o
 obj-$(CONFIG_NFT_QUEUE)                += nft_queue.o
+obj-$(CONFIG_NFT_QUOTA)                += nft_quota.o
 obj-$(CONFIG_NFT_REJECT)       += nft_reject.o
 obj-$(CONFIG_NFT_REJECT_INET)  += nft_reject_inet.o
-obj-$(CONFIG_NFT_RBTREE)       += nft_rbtree.o
-obj-$(CONFIG_NFT_HASH)         += nft_hash.o
+obj-$(CONFIG_NFT_SET_RBTREE)   += nft_set_rbtree.o
+obj-$(CONFIG_NFT_SET_HASH)     += nft_set_hash.o
 obj-$(CONFIG_NFT_COUNTER)      += nft_counter.o
 obj-$(CONFIG_NFT_LOG)          += nft_log.o
 obj-$(CONFIG_NFT_MASQ)         += nft_masq.o
 obj-$(CONFIG_NFT_REDIR)                += nft_redir.o
+obj-$(CONFIG_NFT_HASH)         += nft_hash.o
 
 # nf_tables netdev
 obj-$(CONFIG_NFT_DUP_NETDEV)   += nft_dup_netdev.o
index f04fd8d..fc230d9 100644 (file)
@@ -281,13 +281,10 @@ void ip_vs_conn_drop_conntrack(struct ip_vs_conn *cp)
        h = nf_conntrack_find_get(cp->ipvs->net, &nf_ct_zone_dflt, &tuple);
        if (h) {
                ct = nf_ct_tuplehash_to_ctrack(h);
-               /* Show what happens instead of calling nf_ct_kill() */
-               if (del_timer(&ct->timeout)) {
-                       IP_VS_DBG(7, "%s: ct=%p, deleted conntrack timer for tuple="
+               if (nf_ct_kill(ct)) {
+                       IP_VS_DBG(7, "%s: ct=%p, deleted conntrack for tuple="
                                FMT_TUPLE "\n",
                                __func__, ct, ARG_TUPLE(&tuple));
-                       if (ct->timeout.function)
-                               ct->timeout.function(ct->timeout.data);
                } else {
                        IP_VS_DBG(7, "%s: ct=%p, no conntrack timer for tuple="
                                FMT_TUPLE "\n",
index dd2c43a..ac1db40 100644 (file)
@@ -72,12 +72,24 @@ EXPORT_SYMBOL_GPL(nf_conntrack_expect_lock);
 struct hlist_nulls_head *nf_conntrack_hash __read_mostly;
 EXPORT_SYMBOL_GPL(nf_conntrack_hash);
 
+struct conntrack_gc_work {
+       struct delayed_work     dwork;
+       u32                     last_bucket;
+       bool                    exiting;
+};
+
 static __read_mostly struct kmem_cache *nf_conntrack_cachep;
 static __read_mostly spinlock_t nf_conntrack_locks_all_lock;
-static __read_mostly seqcount_t nf_conntrack_generation;
 static __read_mostly DEFINE_SPINLOCK(nf_conntrack_locks_all_lock);
 static __read_mostly bool nf_conntrack_locks_all;
 
+#define GC_MAX_BUCKETS_DIV     64u
+#define GC_MAX_BUCKETS         8192u
+#define GC_INTERVAL            (5 * HZ)
+#define GC_MAX_EVICTS          256u
+
+static struct conntrack_gc_work conntrack_gc_work;
+
 void nf_conntrack_lock(spinlock_t *lock) __acquires(lock)
 {
        spin_lock(lock);
@@ -164,7 +176,7 @@ unsigned int nf_conntrack_htable_size __read_mostly;
 EXPORT_SYMBOL_GPL(nf_conntrack_htable_size);
 
 unsigned int nf_conntrack_max __read_mostly;
-EXPORT_SYMBOL_GPL(nf_conntrack_max);
+seqcount_t nf_conntrack_generation __read_mostly;
 
 DEFINE_PER_CPU(struct nf_conn, nf_conntrack_untracked);
 EXPORT_PER_CPU_SYMBOL(nf_conntrack_untracked);
@@ -372,7 +384,6 @@ destroy_conntrack(struct nf_conntrack *nfct)
 
        pr_debug("destroy_conntrack(%p)\n", ct);
        NF_CT_ASSERT(atomic_read(&nfct->use) == 0);
-       NF_CT_ASSERT(!timer_pending(&ct->timeout));
 
        if (unlikely(nf_ct_is_template(ct))) {
                nf_ct_tmpl_free(ct);
@@ -435,35 +446,30 @@ bool nf_ct_delete(struct nf_conn *ct, u32 portid, int report)
 {
        struct nf_conn_tstamp *tstamp;
 
+       if (test_and_set_bit(IPS_DYING_BIT, &ct->status))
+               return false;
+
        tstamp = nf_conn_tstamp_find(ct);
        if (tstamp && tstamp->stop == 0)
                tstamp->stop = ktime_get_real_ns();
 
-       if (nf_ct_is_dying(ct))
-               goto delete;
-
        if (nf_conntrack_event_report(IPCT_DESTROY, ct,
                                    portid, report) < 0) {
-               /* destroy event was not delivered */
+               /* destroy event was not delivered. nf_ct_put will
+                * be done by event cache worker on redelivery.
+                */
                nf_ct_delete_from_lists(ct);
                nf_conntrack_ecache_delayed_work(nf_ct_net(ct));
                return false;
        }
 
        nf_conntrack_ecache_work(nf_ct_net(ct));
-       set_bit(IPS_DYING_BIT, &ct->status);
- delete:
        nf_ct_delete_from_lists(ct);
        nf_ct_put(ct);
        return true;
 }
 EXPORT_SYMBOL_GPL(nf_ct_delete);
 
-static void death_by_timeout(unsigned long ul_conntrack)
-{
-       nf_ct_delete((struct nf_conn *)ul_conntrack, 0, 0);
-}
-
 static inline bool
 nf_ct_key_equal(struct nf_conntrack_tuple_hash *h,
                const struct nf_conntrack_tuple *tuple,
@@ -481,22 +487,17 @@ nf_ct_key_equal(struct nf_conntrack_tuple_hash *h,
               net_eq(net, nf_ct_net(ct));
 }
 
-/* must be called with rcu read lock held */
-void nf_conntrack_get_ht(struct hlist_nulls_head **hash, unsigned int *hsize)
+/* caller must hold rcu readlock and none of the nf_conntrack_locks */
+static void nf_ct_gc_expired(struct nf_conn *ct)
 {
-       struct hlist_nulls_head *hptr;
-       unsigned int sequence, hsz;
+       if (!atomic_inc_not_zero(&ct->ct_general.use))
+               return;
 
-       do {
-               sequence = read_seqcount_begin(&nf_conntrack_generation);
-               hsz = nf_conntrack_htable_size;
-               hptr = nf_conntrack_hash;
-       } while (read_seqcount_retry(&nf_conntrack_generation, sequence));
+       if (nf_ct_should_gc(ct))
+               nf_ct_kill(ct);
 
-       *hash = hptr;
-       *hsize = hsz;
+       nf_ct_put(ct);
 }
-EXPORT_SYMBOL_GPL(nf_conntrack_get_ht);
 
 /*
  * Warning :
@@ -510,16 +511,24 @@ ____nf_conntrack_find(struct net *net, const struct nf_conntrack_zone *zone,
        struct nf_conntrack_tuple_hash *h;
        struct hlist_nulls_head *ct_hash;
        struct hlist_nulls_node *n;
-       unsigned int bucket, sequence;
+       unsigned int bucket, hsize;
 
 begin:
-       do {
-               sequence = read_seqcount_begin(&nf_conntrack_generation);
-               bucket = scale_hash(hash);
-               ct_hash = nf_conntrack_hash;
-       } while (read_seqcount_retry(&nf_conntrack_generation, sequence));
+       nf_conntrack_get_ht(&ct_hash, &hsize);
+       bucket = reciprocal_scale(hash, hsize);
 
        hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[bucket], hnnode) {
+               struct nf_conn *ct;
+
+               ct = nf_ct_tuplehash_to_ctrack(h);
+               if (nf_ct_is_expired(ct)) {
+                       nf_ct_gc_expired(ct);
+                       continue;
+               }
+
+               if (nf_ct_is_dying(ct))
+                       continue;
+
                if (nf_ct_key_equal(h, tuple, zone, net)) {
                        NF_CT_STAT_INC_ATOMIC(net, found);
                        return h;
@@ -618,7 +627,6 @@ nf_conntrack_hash_check_insert(struct nf_conn *ct)
                                    zone, net))
                        goto out;
 
-       add_timer(&ct->timeout);
        smp_wmb();
        /* The caller holds a reference to this object */
        atomic_set(&ct->ct_general.use, 2);
@@ -771,8 +779,7 @@ __nf_conntrack_confirm(struct sk_buff *skb)
        /* Timer relative to confirmation time, not original
           setting time, otherwise we'd get timer wrap in
           weird delay cases. */
-       ct->timeout.expires += jiffies;
-       add_timer(&ct->timeout);
+       ct->timeout += nfct_time_stamp;
        atomic_inc(&ct->ct_general.use);
        ct->status |= IPS_CONFIRMED;
 
@@ -823,29 +830,41 @@ nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple,
        const struct nf_conntrack_zone *zone;
        struct nf_conntrack_tuple_hash *h;
        struct hlist_nulls_head *ct_hash;
-       unsigned int hash, sequence;
+       unsigned int hash, hsize;
        struct hlist_nulls_node *n;
        struct nf_conn *ct;
 
        zone = nf_ct_zone(ignored_conntrack);
 
        rcu_read_lock();
-       do {
-               sequence = read_seqcount_begin(&nf_conntrack_generation);
-               hash = hash_conntrack(net, tuple);
-               ct_hash = nf_conntrack_hash;
-       } while (read_seqcount_retry(&nf_conntrack_generation, sequence));
+ begin:
+       nf_conntrack_get_ht(&ct_hash, &hsize);
+       hash = __hash_conntrack(net, tuple, hsize);
 
        hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[hash], hnnode) {
                ct = nf_ct_tuplehash_to_ctrack(h);
-               if (ct != ignored_conntrack &&
-                   nf_ct_key_equal(h, tuple, zone, net)) {
+
+               if (ct == ignored_conntrack)
+                       continue;
+
+               if (nf_ct_is_expired(ct)) {
+                       nf_ct_gc_expired(ct);
+                       continue;
+               }
+
+               if (nf_ct_key_equal(h, tuple, zone, net)) {
                        NF_CT_STAT_INC_ATOMIC(net, found);
                        rcu_read_unlock();
                        return 1;
                }
                NF_CT_STAT_INC_ATOMIC(net, searched);
        }
+
+       if (get_nulls_value(n) != hash) {
+               NF_CT_STAT_INC_ATOMIC(net, search_restart);
+               goto begin;
+       }
+
        rcu_read_unlock();
 
        return 0;
@@ -867,6 +886,11 @@ static unsigned int early_drop_list(struct net *net,
        hlist_nulls_for_each_entry_rcu(h, n, head, hnnode) {
                tmp = nf_ct_tuplehash_to_ctrack(h);
 
+               if (nf_ct_is_expired(tmp)) {
+                       nf_ct_gc_expired(tmp);
+                       continue;
+               }
+
                if (test_bit(IPS_ASSURED_BIT, &tmp->status) ||
                    !net_eq(nf_ct_net(tmp), net) ||
                    nf_ct_is_dying(tmp))
@@ -884,7 +908,6 @@ static unsigned int early_drop_list(struct net *net,
                 */
                if (net_eq(nf_ct_net(tmp), net) &&
                    nf_ct_is_confirmed(tmp) &&
-                   del_timer(&tmp->timeout) &&
                    nf_ct_delete(tmp, 0, 0))
                        drops++;
 
@@ -900,14 +923,11 @@ static noinline int early_drop(struct net *net, unsigned int _hash)
 
        for (i = 0; i < NF_CT_EVICTION_RANGE; i++) {
                struct hlist_nulls_head *ct_hash;
-               unsigned hash, sequence, drops;
+               unsigned int hash, hsize, drops;
 
                rcu_read_lock();
-               do {
-                       sequence = read_seqcount_begin(&nf_conntrack_generation);
-                       hash = scale_hash(_hash++);
-                       ct_hash = nf_conntrack_hash;
-               } while (read_seqcount_retry(&nf_conntrack_generation, sequence));
+               nf_conntrack_get_ht(&ct_hash, &hsize);
+               hash = reciprocal_scale(_hash++, hsize);
 
                drops = early_drop_list(net, &ct_hash[hash]);
                rcu_read_unlock();
@@ -921,6 +941,69 @@ static noinline int early_drop(struct net *net, unsigned int _hash)
        return false;
 }
 
+static void gc_worker(struct work_struct *work)
+{
+       unsigned int i, goal, buckets = 0, expired_count = 0;
+       unsigned long next_run = GC_INTERVAL;
+       unsigned int ratio, scanned = 0;
+       struct conntrack_gc_work *gc_work;
+
+       gc_work = container_of(work, struct conntrack_gc_work, dwork.work);
+
+       goal = min(nf_conntrack_htable_size / GC_MAX_BUCKETS_DIV, GC_MAX_BUCKETS);
+       i = gc_work->last_bucket;
+
+       do {
+               struct nf_conntrack_tuple_hash *h;
+               struct hlist_nulls_head *ct_hash;
+               struct hlist_nulls_node *n;
+               unsigned int hashsz;
+               struct nf_conn *tmp;
+
+               i++;
+               rcu_read_lock();
+
+               nf_conntrack_get_ht(&ct_hash, &hashsz);
+               if (i >= hashsz)
+                       i = 0;
+
+               hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[i], hnnode) {
+                       tmp = nf_ct_tuplehash_to_ctrack(h);
+
+                       scanned++;
+                       if (nf_ct_is_expired(tmp)) {
+                               nf_ct_gc_expired(tmp);
+                               expired_count++;
+                               continue;
+                       }
+               }
+
+               /* could check get_nulls_value() here and restart if ct
+                * was moved to another chain.  But given gc is best-effort
+                * we will just continue with next hash slot.
+                */
+               rcu_read_unlock();
+               cond_resched_rcu_qs();
+       } while (++buckets < goal &&
+                expired_count < GC_MAX_EVICTS);
+
+       if (gc_work->exiting)
+               return;
+
+       ratio = scanned ? expired_count * 100 / scanned : 0;
+       if (ratio >= 90)
+               next_run = 0;
+
+       gc_work->last_bucket = i;
+       schedule_delayed_work(&gc_work->dwork, next_run);
+}
+
+static void conntrack_gc_work_init(struct conntrack_gc_work *gc_work)
+{
+       INIT_DELAYED_WORK(&gc_work->dwork, gc_worker);
+       gc_work->exiting = false;
+}
+
 static struct nf_conn *
 __nf_conntrack_alloc(struct net *net,
                     const struct nf_conntrack_zone *zone,
@@ -957,8 +1040,6 @@ __nf_conntrack_alloc(struct net *net,
        /* save hash for reusing when confirming */
        *(unsigned long *)(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev) = hash;
        ct->status = 0;
-       /* Don't set timer yet: wait for confirmation */
-       setup_timer(&ct->timeout, death_by_timeout, (unsigned long)ct);
        write_pnet(&ct->ct_net, net);
        memset(&ct->__nfct_init_offset[0], 0,
               offsetof(struct nf_conn, proto) -
@@ -1332,7 +1413,6 @@ void __nf_ct_refresh_acct(struct nf_conn *ct,
                          unsigned long extra_jiffies,
                          int do_acct)
 {
-       NF_CT_ASSERT(ct->timeout.data == (unsigned long)ct);
        NF_CT_ASSERT(skb);
 
        /* Only update if this is not a fixed timeout */
@@ -1340,39 +1420,25 @@ void __nf_ct_refresh_acct(struct nf_conn *ct,
                goto acct;
 
        /* If not in hash table, timer will not be active yet */
-       if (!nf_ct_is_confirmed(ct)) {
-               ct->timeout.expires = extra_jiffies;
-       } else {
-               unsigned long newtime = jiffies + extra_jiffies;
-
-               /* Only update the timeout if the new timeout is at least
-                  HZ jiffies from the old timeout. Need del_timer for race
-                  avoidance (may already be dying). */
-               if (newtime - ct->timeout.expires >= HZ)
-                       mod_timer_pending(&ct->timeout, newtime);
-       }
+       if (nf_ct_is_confirmed(ct))
+               extra_jiffies += nfct_time_stamp;
 
+       ct->timeout = extra_jiffies;
 acct:
        if (do_acct)
                nf_ct_acct_update(ct, ctinfo, skb->len);
 }
 EXPORT_SYMBOL_GPL(__nf_ct_refresh_acct);
 
-bool __nf_ct_kill_acct(struct nf_conn *ct,
-                      enum ip_conntrack_info ctinfo,
-                      const struct sk_buff *skb,
-                      int do_acct)
+bool nf_ct_kill_acct(struct nf_conn *ct,
+                    enum ip_conntrack_info ctinfo,
+                    const struct sk_buff *skb)
 {
-       if (do_acct)
-               nf_ct_acct_update(ct, ctinfo, skb->len);
+       nf_ct_acct_update(ct, ctinfo, skb->len);
 
-       if (del_timer(&ct->timeout)) {
-               ct->timeout.function((unsigned long)ct);
-               return true;
-       }
-       return false;
+       return nf_ct_delete(ct, 0, 0);
 }
-EXPORT_SYMBOL_GPL(__nf_ct_kill_acct);
+EXPORT_SYMBOL_GPL(nf_ct_kill_acct);
 
 #if IS_ENABLED(CONFIG_NF_CT_NETLINK)
 
@@ -1505,11 +1571,8 @@ void nf_ct_iterate_cleanup(struct net *net,
 
        while ((ct = get_next_corpse(net, iter, data, &bucket)) != NULL) {
                /* Time to push up daises... */
-               if (del_timer(&ct->timeout))
-                       nf_ct_delete(ct, portid, report);
-
-               /* ... else the timer will get him soon. */
 
+               nf_ct_delete(ct, portid, report);
                nf_ct_put(ct);
                cond_resched();
        }
@@ -1545,6 +1608,7 @@ static int untrack_refs(void)
 
 void nf_conntrack_cleanup_start(void)
 {
+       conntrack_gc_work.exiting = true;
        RCU_INIT_POINTER(ip_ct_attach, NULL);
 }
 
@@ -1554,6 +1618,7 @@ void nf_conntrack_cleanup_end(void)
        while (untrack_refs() > 0)
                schedule();
 
+       cancel_delayed_work_sync(&conntrack_gc_work.dwork);
        nf_ct_free_hashtable(nf_conntrack_hash, nf_conntrack_htable_size);
 
        nf_conntrack_proto_fini();
@@ -1828,6 +1893,10 @@ int nf_conntrack_init_start(void)
        }
        /*  - and look it like as a confirmed connection */
        nf_ct_untracked_status_or(IPS_CONFIRMED | IPS_UNTRACKED);
+
+       conntrack_gc_work_init(&conntrack_gc_work);
+       schedule_delayed_work(&conntrack_gc_work.dwork, GC_INTERVAL);
+
        return 0;
 
 err_proto:
index d28011b..da9df2d 100644 (file)
@@ -49,8 +49,13 @@ static enum retry_state ecache_work_evict_list(struct ct_pcpu *pcpu)
 
        hlist_nulls_for_each_entry(h, n, &pcpu->dying, hnnode) {
                struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
+               struct nf_conntrack_ecache *e;
 
-               if (nf_ct_is_dying(ct))
+               if (!nf_ct_is_confirmed(ct))
+                       continue;
+
+               e = nf_ct_ecache_find(ct);
+               if (!e || e->state != NFCT_ECACHE_DESTROY_FAIL)
                        continue;
 
                if (nf_conntrack_event(IPCT_DESTROY, ct)) {
@@ -58,8 +63,7 @@ static enum retry_state ecache_work_evict_list(struct ct_pcpu *pcpu)
                        break;
                }
 
-               /* we've got the event delivered, now it's dying */
-               set_bit(IPS_DYING_BIT, &ct->status);
+               e->state = NFCT_ECACHE_DESTROY_SENT;
                refs[evicted] = ct;
 
                if (++evicted >= ARRAY_SIZE(refs)) {
@@ -130,7 +134,7 @@ int nf_conntrack_eventmask_report(unsigned int eventmask, struct nf_conn *ct,
        if (!e)
                goto out_unlock;
 
-       if (nf_ct_is_confirmed(ct) && !nf_ct_is_dying(ct)) {
+       if (nf_ct_is_confirmed(ct)) {
                struct nf_ct_event item = {
                        .ct     = ct,
                        .portid = e->portid ? e->portid : portid,
@@ -150,11 +154,13 @@ int nf_conntrack_eventmask_report(unsigned int eventmask, struct nf_conn *ct,
                                 * triggered by a process, we store the PORTID
                                 * to include it in the retransmission.
                                 */
-                               if (eventmask & (1 << IPCT_DESTROY) &&
-                                   e->portid == 0 && portid != 0)
-                                       e->portid = portid;
-                               else
+                               if (eventmask & (1 << IPCT_DESTROY)) {
+                                       if (e->portid == 0 && portid != 0)
+                                               e->portid = portid;
+                                       e->state = NFCT_ECACHE_DESTROY_FAIL;
+                               } else {
                                        e->missed |= eventmask;
+                               }
                        } else {
                                e->missed &= ~missed;
                        }
index 4314700..b6934b5 100644 (file)
@@ -237,7 +237,7 @@ static int try_eprt(const char *data, size_t dlen, struct nf_conntrack_man *cmd,
        }
        delim = data[0];
        if (isdigit(delim) || delim < 33 || delim > 126 || data[2] != delim) {
-               pr_debug("try_eprt: invalid delimitter.\n");
+               pr_debug("try_eprt: invalid delimiter.\n");
                return 0;
        }
 
index fdfc71f..c052b71 100644 (file)
@@ -149,10 +149,7 @@ nla_put_failure:
 
 static int ctnetlink_dump_timeout(struct sk_buff *skb, const struct nf_conn *ct)
 {
-       long timeout = ((long)ct->timeout.expires - (long)jiffies) / HZ;
-
-       if (timeout < 0)
-               timeout = 0;
+       long timeout = nf_ct_expires(ct) / HZ;
 
        if (nla_put_be32(skb, CTA_TIMEOUT, htonl(timeout)))
                goto nla_put_failure;
@@ -818,14 +815,23 @@ ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
        struct hlist_nulls_node *n;
        struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh);
        u_int8_t l3proto = nfmsg->nfgen_family;
-       int res;
+       struct nf_conn *nf_ct_evict[8];
+       int res, i;
        spinlock_t *lockp;
 
        last = (struct nf_conn *)cb->args[1];
+       i = 0;
 
        local_bh_disable();
        for (; cb->args[0] < nf_conntrack_htable_size; cb->args[0]++) {
 restart:
+               while (i) {
+                       i--;
+                       if (nf_ct_should_gc(nf_ct_evict[i]))
+                               nf_ct_kill(nf_ct_evict[i]);
+                       nf_ct_put(nf_ct_evict[i]);
+               }
+
                lockp = &nf_conntrack_locks[cb->args[0] % CONNTRACK_LOCKS];
                nf_conntrack_lock(lockp);
                if (cb->args[0] >= nf_conntrack_htable_size) {
@@ -837,6 +843,13 @@ restart:
                        if (NF_CT_DIRECTION(h) != IP_CT_DIR_ORIGINAL)
                                continue;
                        ct = nf_ct_tuplehash_to_ctrack(h);
+                       if (nf_ct_is_expired(ct)) {
+                               if (i < ARRAY_SIZE(nf_ct_evict) &&
+                                   atomic_inc_not_zero(&ct->ct_general.use))
+                                       nf_ct_evict[i++] = ct;
+                               continue;
+                       }
+
                        if (!net_eq(net, nf_ct_net(ct)))
                                continue;
 
@@ -878,6 +891,13 @@ out:
        if (last)
                nf_ct_put(last);
 
+       while (i) {
+               i--;
+               if (nf_ct_should_gc(nf_ct_evict[i]))
+                       nf_ct_kill(nf_ct_evict[i]);
+               nf_ct_put(nf_ct_evict[i]);
+       }
+
        return skb->len;
 }
 
@@ -1147,9 +1167,7 @@ static int ctnetlink_del_conntrack(struct net *net, struct sock *ctnl,
                }
        }
 
-       if (del_timer(&ct->timeout))
-               nf_ct_delete(ct, NETLINK_CB(skb).portid, nlmsg_report(nlh));
-
+       nf_ct_delete(ct, NETLINK_CB(skb).portid, nlmsg_report(nlh));
        nf_ct_put(ct);
 
        return 0;
@@ -1517,11 +1535,10 @@ static int ctnetlink_change_timeout(struct nf_conn *ct,
 {
        u_int32_t timeout = ntohl(nla_get_be32(cda[CTA_TIMEOUT]));
 
-       if (!del_timer(&ct->timeout))
-               return -ETIME;
+       ct->timeout = nfct_time_stamp + timeout * HZ;
 
-       ct->timeout.expires = jiffies + timeout * HZ;
-       add_timer(&ct->timeout);
+       if (test_bit(IPS_DYING_BIT, &ct->status))
+               return -ETIME;
 
        return 0;
 }
@@ -1719,9 +1736,8 @@ ctnetlink_create_conntrack(struct net *net,
 
        if (!cda[CTA_TIMEOUT])
                goto err1;
-       ct->timeout.expires = ntohl(nla_get_be32(cda[CTA_TIMEOUT]));
 
-       ct->timeout.expires = jiffies + ct->timeout.expires * HZ;
+       ct->timeout = nfct_time_stamp + ntohl(nla_get_be32(cda[CTA_TIMEOUT])) * HZ;
 
        rcu_read_lock();
        if (cda[CTA_HELP]) {
index 5588c7a..f60a475 100644 (file)
@@ -157,8 +157,7 @@ static int destroy_sibling_or_exp(struct net *net, struct nf_conn *ct,
                pr_debug("setting timeout of conntrack %p to 0\n", sibling);
                sibling->proto.gre.timeout        = 0;
                sibling->proto.gre.stream_timeout = 0;
-               if (del_timer(&sibling->timeout))
-                       sibling->timeout.function((unsigned long)sibling);
+               nf_ct_kill(sibling);
                nf_ct_put(sibling);
                return 1;
        } else {
index b65d586..8d2c7d8 100644 (file)
@@ -159,54 +159,6 @@ static int kill_l4proto(struct nf_conn *i, void *data)
               nf_ct_l3num(i) == l4proto->l3proto;
 }
 
-static struct nf_ip_net *nf_ct_l3proto_net(struct net *net,
-                                          struct nf_conntrack_l3proto *l3proto)
-{
-       if (l3proto->l3proto == PF_INET)
-               return &net->ct.nf_ct_proto;
-       else
-               return NULL;
-}
-
-static int nf_ct_l3proto_register_sysctl(struct net *net,
-                                        struct nf_conntrack_l3proto *l3proto)
-{
-       int err = 0;
-       struct nf_ip_net *in = nf_ct_l3proto_net(net, l3proto);
-       /* nf_conntrack_l3proto_ipv6 doesn't support sysctl */
-       if (in == NULL)
-               return 0;
-
-#if defined(CONFIG_SYSCTL) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT)
-       if (in->ctl_table != NULL) {
-               err = nf_ct_register_sysctl(net,
-                                           &in->ctl_table_header,
-                                           l3proto->ctl_table_path,
-                                           in->ctl_table);
-               if (err < 0) {
-                       kfree(in->ctl_table);
-                       in->ctl_table = NULL;
-               }
-       }
-#endif
-       return err;
-}
-
-static void nf_ct_l3proto_unregister_sysctl(struct net *net,
-                                           struct nf_conntrack_l3proto *l3proto)
-{
-       struct nf_ip_net *in = nf_ct_l3proto_net(net, l3proto);
-
-       if (in == NULL)
-               return;
-#if defined(CONFIG_SYSCTL) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT)
-       if (in->ctl_table_header != NULL)
-               nf_ct_unregister_sysctl(&in->ctl_table_header,
-                                       &in->ctl_table,
-                                       0);
-#endif
-}
-
 int nf_ct_l3proto_register(struct nf_conntrack_l3proto *proto)
 {
        int ret = 0;
@@ -241,7 +193,7 @@ EXPORT_SYMBOL_GPL(nf_ct_l3proto_register);
 int nf_ct_l3proto_pernet_register(struct net *net,
                                  struct nf_conntrack_l3proto *proto)
 {
-       int ret = 0;
+       int ret;
 
        if (proto->init_net) {
                ret = proto->init_net(net);
@@ -249,7 +201,7 @@ int nf_ct_l3proto_pernet_register(struct net *net,
                        return ret;
        }
 
-       return nf_ct_l3proto_register_sysctl(net, proto);
+       return 0;
 }
 EXPORT_SYMBOL_GPL(nf_ct_l3proto_pernet_register);
 
@@ -272,8 +224,6 @@ EXPORT_SYMBOL_GPL(nf_ct_l3proto_unregister);
 void nf_ct_l3proto_pernet_unregister(struct net *net,
                                     struct nf_conntrack_l3proto *proto)
 {
-       nf_ct_l3proto_unregister_sysctl(net, proto);
-
        /* Remove all contrack entries for this protocol */
        nf_ct_iterate_cleanup(net, kill_l3proto, proto, 0, 0);
 }
@@ -312,26 +262,6 @@ int nf_ct_l4proto_register_sysctl(struct net *net,
                        }
                }
        }
-#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT
-       if (l4proto->l3proto != AF_INET6 && pn->ctl_compat_table != NULL) {
-               if (err < 0) {
-                       nf_ct_kfree_compat_sysctl_table(pn);
-                       goto out;
-               }
-               err = nf_ct_register_sysctl(net,
-                                           &pn->ctl_compat_header,
-                                           "net/ipv4/netfilter",
-                                           pn->ctl_compat_table);
-               if (err == 0)
-                       goto out;
-
-               nf_ct_kfree_compat_sysctl_table(pn);
-               nf_ct_unregister_sysctl(&pn->ctl_table_header,
-                                       &pn->ctl_table,
-                                       pn->users);
-       }
-out:
-#endif /* CONFIG_NF_CONNTRACK_PROC_COMPAT */
 #endif /* CONFIG_SYSCTL */
        return err;
 }
@@ -346,13 +276,6 @@ void nf_ct_l4proto_unregister_sysctl(struct net *net,
                nf_ct_unregister_sysctl(&pn->ctl_table_header,
                                        &pn->ctl_table,
                                        pn->users);
-
-#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT
-       if (l4proto->l3proto != AF_INET6 && pn->ctl_compat_header != NULL)
-               nf_ct_unregister_sysctl(&pn->ctl_compat_header,
-                                       &pn->ctl_compat_table,
-                                       0);
-#endif /* CONFIG_NF_CONNTRACK_PROC_COMPAT */
 #endif /* CONFIG_SYSCTL */
 }
 
index 399a38f..a45bee5 100644 (file)
@@ -402,7 +402,8 @@ static bool dccp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff,
 {
        struct dccp_hdr _hdr, *dh;
 
-       dh = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr);
+       /* Actually only need first 4 bytes to get ports. */
+       dh = skb_header_pointer(skb, dataoff, 4, &_hdr);
        if (dh == NULL)
                return false;
 
index 86dc752..d5868ba 100644 (file)
@@ -151,17 +151,6 @@ static struct ctl_table generic_sysctl_table[] = {
        },
        { }
 };
-#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT
-static struct ctl_table generic_compat_sysctl_table[] = {
-       {
-               .procname       = "ip_conntrack_generic_timeout",
-               .maxlen         = sizeof(unsigned int),
-               .mode           = 0644,
-               .proc_handler   = proc_dointvec_jiffies,
-       },
-       { }
-};
-#endif /* CONFIG_NF_CONNTRACK_PROC_COMPAT */
 #endif /* CONFIG_SYSCTL */
 
 static int generic_kmemdup_sysctl_table(struct nf_proto_net *pn,
@@ -179,40 +168,14 @@ static int generic_kmemdup_sysctl_table(struct nf_proto_net *pn,
        return 0;
 }
 
-static int generic_kmemdup_compat_sysctl_table(struct nf_proto_net *pn,
-                                              struct nf_generic_net *gn)
-{
-#ifdef CONFIG_SYSCTL
-#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT
-       pn->ctl_compat_table = kmemdup(generic_compat_sysctl_table,
-                                      sizeof(generic_compat_sysctl_table),
-                                      GFP_KERNEL);
-       if (!pn->ctl_compat_table)
-               return -ENOMEM;
-
-       pn->ctl_compat_table[0].data = &gn->timeout;
-#endif
-#endif
-       return 0;
-}
-
 static int generic_init_net(struct net *net, u_int16_t proto)
 {
-       int ret;
        struct nf_generic_net *gn = generic_pernet(net);
        struct nf_proto_net *pn = &gn->pn;
 
        gn->timeout = nf_ct_generic_timeout;
 
-       ret = generic_kmemdup_compat_sysctl_table(pn, gn);
-       if (ret < 0)
-               return ret;
-
-       ret = generic_kmemdup_sysctl_table(pn, gn);
-       if (ret < 0)
-               nf_ct_kfree_compat_sysctl_table(pn);
-
-       return ret;
+       return generic_kmemdup_sysctl_table(pn, gn);
 }
 
 static struct nf_proto_net *generic_get_net_proto(struct net *net)
index 1d7ab96..982ea62 100644 (file)
@@ -161,8 +161,8 @@ static bool sctp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff,
        const struct sctphdr *hp;
        struct sctphdr _hdr;
 
-       /* Actually only need first 8 bytes. */
-       hp = skb_header_pointer(skb, dataoff, 8, &_hdr);
+       /* Actually only need first 4 bytes to get ports. */
+       hp = skb_header_pointer(skb, dataoff, 4, &_hdr);
        if (hp == NULL)
                return false;
 
@@ -705,54 +705,6 @@ static struct ctl_table sctp_sysctl_table[] = {
        },
        { }
 };
-
-#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT
-static struct ctl_table sctp_compat_sysctl_table[] = {
-       {
-               .procname       = "ip_conntrack_sctp_timeout_closed",
-               .maxlen         = sizeof(unsigned int),
-               .mode           = 0644,
-               .proc_handler   = proc_dointvec_jiffies,
-       },
-       {
-               .procname       = "ip_conntrack_sctp_timeout_cookie_wait",
-               .maxlen         = sizeof(unsigned int),
-               .mode           = 0644,
-               .proc_handler   = proc_dointvec_jiffies,
-       },
-       {
-               .procname       = "ip_conntrack_sctp_timeout_cookie_echoed",
-               .maxlen         = sizeof(unsigned int),
-               .mode           = 0644,
-               .proc_handler   = proc_dointvec_jiffies,
-       },
-       {
-               .procname       = "ip_conntrack_sctp_timeout_established",
-               .maxlen         = sizeof(unsigned int),
-               .mode           = 0644,
-               .proc_handler   = proc_dointvec_jiffies,
-       },
-       {
-               .procname       = "ip_conntrack_sctp_timeout_shutdown_sent",
-               .maxlen         = sizeof(unsigned int),
-               .mode           = 0644,
-               .proc_handler   = proc_dointvec_jiffies,
-       },
-       {
-               .procname       = "ip_conntrack_sctp_timeout_shutdown_recd",
-               .maxlen         = sizeof(unsigned int),
-               .mode           = 0644,
-               .proc_handler   = proc_dointvec_jiffies,
-       },
-       {
-               .procname       = "ip_conntrack_sctp_timeout_shutdown_ack_sent",
-               .maxlen         = sizeof(unsigned int),
-               .mode           = 0644,
-               .proc_handler   = proc_dointvec_jiffies,
-       },
-       { }
-};
-#endif /* CONFIG_NF_CONNTRACK_PROC_COMPAT */
 #endif
 
 static int sctp_kmemdup_sysctl_table(struct nf_proto_net *pn,
@@ -781,32 +733,8 @@ static int sctp_kmemdup_sysctl_table(struct nf_proto_net *pn,
        return 0;
 }
 
-static int sctp_kmemdup_compat_sysctl_table(struct nf_proto_net *pn,
-                                           struct sctp_net *sn)
-{
-#ifdef CONFIG_SYSCTL
-#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT
-       pn->ctl_compat_table = kmemdup(sctp_compat_sysctl_table,
-                                      sizeof(sctp_compat_sysctl_table),
-                                      GFP_KERNEL);
-       if (!pn->ctl_compat_table)
-               return -ENOMEM;
-
-       pn->ctl_compat_table[0].data = &sn->timeouts[SCTP_CONNTRACK_CLOSED];
-       pn->ctl_compat_table[1].data = &sn->timeouts[SCTP_CONNTRACK_COOKIE_WAIT];
-       pn->ctl_compat_table[2].data = &sn->timeouts[SCTP_CONNTRACK_COOKIE_ECHOED];
-       pn->ctl_compat_table[3].data = &sn->timeouts[SCTP_CONNTRACK_ESTABLISHED];
-       pn->ctl_compat_table[4].data = &sn->timeouts[SCTP_CONNTRACK_SHUTDOWN_SENT];
-       pn->ctl_compat_table[5].data = &sn->timeouts[SCTP_CONNTRACK_SHUTDOWN_RECD];
-       pn->ctl_compat_table[6].data = &sn->timeouts[SCTP_CONNTRACK_SHUTDOWN_ACK_SENT];
-#endif
-#endif
-       return 0;
-}
-
 static int sctp_init_net(struct net *net, u_int16_t proto)
 {
-       int ret;
        struct sctp_net *sn = sctp_pernet(net);
        struct nf_proto_net *pn = &sn->pn;
 
@@ -817,18 +745,7 @@ static int sctp_init_net(struct net *net, u_int16_t proto)
                        sn->timeouts[i] = sctp_timeouts[i];
        }
 
-       if (proto == AF_INET) {
-               ret = sctp_kmemdup_compat_sysctl_table(pn, sn);
-               if (ret < 0)
-                       return ret;
-
-               ret = sctp_kmemdup_sysctl_table(pn, sn);
-               if (ret < 0)
-                       nf_ct_kfree_compat_sysctl_table(pn);
-       } else
-               ret = sctp_kmemdup_sysctl_table(pn, sn);
-
-       return ret;
+       return sctp_kmemdup_sysctl_table(pn, sn);
 }
 
 static struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp4 __read_mostly = {
index 70c8381..69f6877 100644 (file)
@@ -282,8 +282,8 @@ static bool tcp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff,
        const struct tcphdr *hp;
        struct tcphdr _hdr;
 
-       /* Actually only need first 8 bytes. */
-       hp = skb_header_pointer(skb, dataoff, 8, &_hdr);
+       /* Actually only need first 4 bytes to get ports. */
+       hp = skb_header_pointer(skb, dataoff, 4, &_hdr);
        if (hp == NULL)
                return false;
 
@@ -1481,90 +1481,6 @@ static struct ctl_table tcp_sysctl_table[] = {
        },
        { }
 };
-
-#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT
-static struct ctl_table tcp_compat_sysctl_table[] = {
-       {
-               .procname       = "ip_conntrack_tcp_timeout_syn_sent",
-               .maxlen         = sizeof(unsigned int),
-               .mode           = 0644,
-               .proc_handler   = proc_dointvec_jiffies,
-       },
-       {
-               .procname       = "ip_conntrack_tcp_timeout_syn_sent2",
-               .maxlen         = sizeof(unsigned int),
-               .mode           = 0644,
-               .proc_handler   = proc_dointvec_jiffies,
-       },
-       {
-               .procname       = "ip_conntrack_tcp_timeout_syn_recv",
-               .maxlen         = sizeof(unsigned int),
-               .mode           = 0644,
-               .proc_handler   = proc_dointvec_jiffies,
-       },
-       {
-               .procname       = "ip_conntrack_tcp_timeout_established",
-               .maxlen         = sizeof(unsigned int),
-               .mode           = 0644,
-               .proc_handler   = proc_dointvec_jiffies,
-       },
-       {
-               .procname       = "ip_conntrack_tcp_timeout_fin_wait",
-               .maxlen         = sizeof(unsigned int),
-               .mode           = 0644,
-               .proc_handler   = proc_dointvec_jiffies,
-       },
-       {
-               .procname       = "ip_conntrack_tcp_timeout_close_wait",
-               .maxlen         = sizeof(unsigned int),
-               .mode           = 0644,
-               .proc_handler   = proc_dointvec_jiffies,
-       },
-       {
-               .procname       = "ip_conntrack_tcp_timeout_last_ack",
-               .maxlen         = sizeof(unsigned int),
-               .mode           = 0644,
-               .proc_handler   = proc_dointvec_jiffies,
-       },
-       {
-               .procname       = "ip_conntrack_tcp_timeout_time_wait",
-               .maxlen         = sizeof(unsigned int),
-               .mode           = 0644,
-               .proc_handler   = proc_dointvec_jiffies,
-       },
-       {
-               .procname       = "ip_conntrack_tcp_timeout_close",
-               .maxlen         = sizeof(unsigned int),
-               .mode           = 0644,
-               .proc_handler   = proc_dointvec_jiffies,
-       },
-       {
-               .procname       = "ip_conntrack_tcp_timeout_max_retrans",
-               .maxlen         = sizeof(unsigned int),
-               .mode           = 0644,
-               .proc_handler   = proc_dointvec_jiffies,
-       },
-       {
-               .procname       = "ip_conntrack_tcp_loose",
-               .maxlen         = sizeof(unsigned int),
-               .mode           = 0644,
-               .proc_handler   = proc_dointvec,
-       },
-       {
-               .procname       = "ip_conntrack_tcp_be_liberal",
-               .maxlen         = sizeof(unsigned int),
-               .mode           = 0644,
-               .proc_handler   = proc_dointvec,
-       },
-       {
-               .procname       = "ip_conntrack_tcp_max_retrans",
-               .maxlen         = sizeof(unsigned int),
-               .mode           = 0644,
-               .proc_handler   = proc_dointvec,
-       },
-       { }
-};
-#endif /* CONFIG_NF_CONNTRACK_PROC_COMPAT */
 #endif /* CONFIG_SYSCTL */
 
 static int tcp_kmemdup_sysctl_table(struct nf_proto_net *pn,
@@ -1597,38 +1513,8 @@ static int tcp_kmemdup_sysctl_table(struct nf_proto_net *pn,
        return 0;
 }
 
-static int tcp_kmemdup_compat_sysctl_table(struct nf_proto_net *pn,
-                                          struct nf_tcp_net *tn)
-{
-#ifdef CONFIG_SYSCTL
-#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT
-       pn->ctl_compat_table = kmemdup(tcp_compat_sysctl_table,
-                                      sizeof(tcp_compat_sysctl_table),
-                                      GFP_KERNEL);
-       if (!pn->ctl_compat_table)
-               return -ENOMEM;
-
-       pn->ctl_compat_table[0].data = &tn->timeouts[TCP_CONNTRACK_SYN_SENT];
-       pn->ctl_compat_table[1].data = &tn->timeouts[TCP_CONNTRACK_SYN_SENT2];
-       pn->ctl_compat_table[2].data = &tn->timeouts[TCP_CONNTRACK_SYN_RECV];
-       pn->ctl_compat_table[3].data = &tn->timeouts[TCP_CONNTRACK_ESTABLISHED];
-       pn->ctl_compat_table[4].data = &tn->timeouts[TCP_CONNTRACK_FIN_WAIT];
-       pn->ctl_compat_table[5].data = &tn->timeouts[TCP_CONNTRACK_CLOSE_WAIT];
-       pn->ctl_compat_table[6].data = &tn->timeouts[TCP_CONNTRACK_LAST_ACK];
-       pn->ctl_compat_table[7].data = &tn->timeouts[TCP_CONNTRACK_TIME_WAIT];
-       pn->ctl_compat_table[8].data = &tn->timeouts[TCP_CONNTRACK_CLOSE];
-       pn->ctl_compat_table[9].data = &tn->timeouts[TCP_CONNTRACK_RETRANS];
-       pn->ctl_compat_table[10].data = &tn->tcp_loose;
-       pn->ctl_compat_table[11].data = &tn->tcp_be_liberal;
-       pn->ctl_compat_table[12].data = &tn->tcp_max_retrans;
-#endif
-#endif
-       return 0;
-}
-
 static int tcp_init_net(struct net *net, u_int16_t proto)
 {
-       int ret;
        struct nf_tcp_net *tn = tcp_pernet(net);
        struct nf_proto_net *pn = &tn->pn;
 
@@ -1643,18 +1529,7 @@ static int tcp_init_net(struct net *net, u_int16_t proto)
                tn->tcp_max_retrans = nf_ct_tcp_max_retrans;
        }
 
-       if (proto == AF_INET) {
-               ret = tcp_kmemdup_compat_sysctl_table(pn, tn);
-               if (ret < 0)
-                       return ret;
-
-               ret = tcp_kmemdup_sysctl_table(pn, tn);
-               if (ret < 0)
-                       nf_ct_kfree_compat_sysctl_table(pn);
-       } else
-               ret = tcp_kmemdup_sysctl_table(pn, tn);
-
-       return ret;
+       return tcp_kmemdup_sysctl_table(pn, tn);
 }
 
 static struct nf_proto_net *tcp_get_net_proto(struct net *net)
index 4fd0405..20f35ed 100644 (file)
@@ -44,8 +44,8 @@ static bool udp_pkt_to_tuple(const struct sk_buff *skb,
        const struct udphdr *hp;
        struct udphdr _hdr;
 
-       /* Actually only need first 8 bytes. */
-       hp = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr);
+       /* Actually only need first 4 bytes to get ports. */
+       hp = skb_header_pointer(skb, dataoff, 4, &_hdr);
        if (hp == NULL)
                return false;
 
@@ -218,23 +218,6 @@ static struct ctl_table udp_sysctl_table[] = {
        },
        { }
 };
-#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT
-static struct ctl_table udp_compat_sysctl_table[] = {
-       {
-               .procname       = "ip_conntrack_udp_timeout",
-               .maxlen         = sizeof(unsigned int),
-               .mode           = 0644,
-               .proc_handler   = proc_dointvec_jiffies,
-       },
-       {
-               .procname       = "ip_conntrack_udp_timeout_stream",
-               .maxlen         = sizeof(unsigned int),
-               .mode           = 0644,
-               .proc_handler   = proc_dointvec_jiffies,
-       },
-       { }
-};
-#endif /* CONFIG_NF_CONNTRACK_PROC_COMPAT */
 #endif /* CONFIG_SYSCTL */
 
 static int udp_kmemdup_sysctl_table(struct nf_proto_net *pn,
@@ -254,27 +237,8 @@ static int udp_kmemdup_sysctl_table(struct nf_proto_net *pn,
        return 0;
 }
 
-static int udp_kmemdup_compat_sysctl_table(struct nf_proto_net *pn,
-                                          struct nf_udp_net *un)
-{
-#ifdef CONFIG_SYSCTL
-#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT
-       pn->ctl_compat_table = kmemdup(udp_compat_sysctl_table,
-                                      sizeof(udp_compat_sysctl_table),
-                                      GFP_KERNEL);
-       if (!pn->ctl_compat_table)
-               return -ENOMEM;
-
-       pn->ctl_compat_table[0].data = &un->timeouts[UDP_CT_UNREPLIED];
-       pn->ctl_compat_table[1].data = &un->timeouts[UDP_CT_REPLIED];
-#endif
-#endif
-       return 0;
-}
-
 static int udp_init_net(struct net *net, u_int16_t proto)
 {
-       int ret;
        struct nf_udp_net *un = udp_pernet(net);
        struct nf_proto_net *pn = &un->pn;
 
@@ -285,18 +249,7 @@ static int udp_init_net(struct net *net, u_int16_t proto)
                        un->timeouts[i] = udp_timeouts[i];
        }
 
-       if (proto == AF_INET) {
-               ret = udp_kmemdup_compat_sysctl_table(pn, un);
-               if (ret < 0)
-                       return ret;
-
-               ret = udp_kmemdup_sysctl_table(pn, un);
-               if (ret < 0)
-                       nf_ct_kfree_compat_sysctl_table(pn);
-       } else
-               ret = udp_kmemdup_sysctl_table(pn, un);
-
-       return ret;
+       return udp_kmemdup_sysctl_table(pn, un);
 }
 
 static struct nf_proto_net *udp_get_net_proto(struct net *net)
index 9d692f5..029206e 100644 (file)
@@ -54,7 +54,8 @@ static bool udplite_pkt_to_tuple(const struct sk_buff *skb,
        const struct udphdr *hp;
        struct udphdr _hdr;
 
-       hp = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr);
+       /* Actually only need first 4 bytes to get ports. */
+       hp = skb_header_pointer(skb, dataoff, 4, &_hdr);
        if (hp == NULL)
                return false;
 
index 9f267c3..3d9a316 100644 (file)
@@ -228,8 +228,7 @@ static int ct_seq_show(struct seq_file *s, void *v)
        seq_printf(s, "%-8s %u %-8s %u %ld ",
                   l3proto->name, nf_ct_l3num(ct),
                   l4proto->name, nf_ct_protonum(ct),
-                  timer_pending(&ct->timeout)
-                  ? (long)(ct->timeout.expires - jiffies)/HZ : 0);
+                  nf_ct_expires(ct)  / HZ);
 
        if (l4proto->print_conntrack)
                l4proto->print_conntrack(s, ct);
index aa5847a..30a17d6 100644 (file)
@@ -39,12 +39,12 @@ static struct nf_logger *__find_logger(int pf, const char *str_logger)
        return NULL;
 }
 
-void nf_log_set(struct net *net, u_int8_t pf, const struct nf_logger *logger)
+int nf_log_set(struct net *net, u_int8_t pf, const struct nf_logger *logger)
 {
        const struct nf_logger *log;
 
-       if (pf == NFPROTO_UNSPEC)
-               return;
+       if (pf == NFPROTO_UNSPEC || pf >= ARRAY_SIZE(net->nf.nf_loggers))
+               return -EOPNOTSUPP;
 
        mutex_lock(&nf_log_mutex);
        log = nft_log_dereference(net->nf.nf_loggers[pf]);
@@ -52,6 +52,8 @@ void nf_log_set(struct net *net, u_int8_t pf, const struct nf_logger *logger)
                rcu_assign_pointer(net->nf.nf_loggers[pf], logger);
 
        mutex_unlock(&nf_log_mutex);
+
+       return 0;
 }
 EXPORT_SYMBOL(nf_log_set);
 
index de31818..81ae41f 100644 (file)
@@ -565,16 +565,10 @@ static int nf_nat_proto_clean(struct nf_conn *ct, void *data)
         * Else, when the conntrack is destoyed, nf_nat_cleanup_conntrack()
         * will delete entry from already-freed table.
         */
-       if (!del_timer(&ct->timeout))
-               return 1;
-
        ct->status &= ~IPS_NAT_DONE_MASK;
-
        rhashtable_remove_fast(&nf_nat_bysource_table, &ct->nat_bysource,
                               nf_nat_bysource_params);
 
-       add_timer(&ct->timeout);
-
        /* don't delete conntrack.  Although that would make things a lot
         * simpler, we'd end up flushing all conntracks on nat rmmod.
         */
index 7e1c876..bd9715e 100644 (file)
@@ -1196,6 +1196,83 @@ static void nf_tables_chain_destroy(struct nft_chain *chain)
        }
 }
 
+struct nft_chain_hook {
+       u32                             num;
+       u32                             priority;
+       const struct nf_chain_type      *type;
+       struct net_device               *dev;
+};
+
+static int nft_chain_parse_hook(struct net *net,
+                               const struct nlattr * const nla[],
+                               struct nft_af_info *afi,
+                               struct nft_chain_hook *hook, bool create)
+{
+       struct nlattr *ha[NFTA_HOOK_MAX + 1];
+       const struct nf_chain_type *type;
+       struct net_device *dev;
+       int err;
+
+       err = nla_parse_nested(ha, NFTA_HOOK_MAX, nla[NFTA_CHAIN_HOOK],
+                              nft_hook_policy);
+       if (err < 0)
+               return err;
+
+       if (ha[NFTA_HOOK_HOOKNUM] == NULL ||
+           ha[NFTA_HOOK_PRIORITY] == NULL)
+               return -EINVAL;
+
+       hook->num = ntohl(nla_get_be32(ha[NFTA_HOOK_HOOKNUM]));
+       if (hook->num >= afi->nhooks)
+               return -EINVAL;
+
+       hook->priority = ntohl(nla_get_be32(ha[NFTA_HOOK_PRIORITY]));
+
+       type = chain_type[afi->family][NFT_CHAIN_T_DEFAULT];
+       if (nla[NFTA_CHAIN_TYPE]) {
+               type = nf_tables_chain_type_lookup(afi, nla[NFTA_CHAIN_TYPE],
+                                                  create);
+               if (IS_ERR(type))
+                       return PTR_ERR(type);
+       }
+       if (!(type->hook_mask & (1 << hook->num)))
+               return -EOPNOTSUPP;
+       if (!try_module_get(type->owner))
+               return -ENOENT;
+
+       hook->type = type;
+
+       hook->dev = NULL;
+       if (afi->flags & NFT_AF_NEEDS_DEV) {
+               char ifname[IFNAMSIZ];
+
+               if (!ha[NFTA_HOOK_DEV]) {
+                       module_put(type->owner);
+                       return -EOPNOTSUPP;
+               }
+
+               nla_strlcpy(ifname, ha[NFTA_HOOK_DEV], IFNAMSIZ);
+               dev = dev_get_by_name(net, ifname);
+               if (!dev) {
+                       module_put(type->owner);
+                       return -ENOENT;
+               }
+               hook->dev = dev;
+       } else if (ha[NFTA_HOOK_DEV]) {
+               module_put(type->owner);
+               return -EOPNOTSUPP;
+       }
+
+       return 0;
+}
+
+static void nft_chain_release_hook(struct nft_chain_hook *hook)
+{
+       module_put(hook->type->owner);
+       if (hook->dev != NULL)
+               dev_put(hook->dev);
+}
+
 static int nf_tables_newchain(struct net *net, struct sock *nlsk,
                              struct sk_buff *skb, const struct nlmsghdr *nlh,
                              const struct nlattr * const nla[])
@@ -1206,10 +1283,8 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk,
        struct nft_table *table;
        struct nft_chain *chain;
        struct nft_base_chain *basechain = NULL;
-       struct nlattr *ha[NFTA_HOOK_MAX + 1];
        u8 genmask = nft_genmask_next(net);
        int family = nfmsg->nfgen_family;
-       struct net_device *dev = NULL;
        u8 policy = NF_ACCEPT;
        u64 handle = 0;
        unsigned int i;
@@ -1273,6 +1348,37 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk,
                if (nlh->nlmsg_flags & NLM_F_REPLACE)
                        return -EOPNOTSUPP;
 
+               if (nla[NFTA_CHAIN_HOOK]) {
+                       struct nft_base_chain *basechain;
+                       struct nft_chain_hook hook;
+                       struct nf_hook_ops *ops;
+
+                       if (!(chain->flags & NFT_BASE_CHAIN))
+                               return -EBUSY;
+
+                       err = nft_chain_parse_hook(net, nla, afi, &hook,
+                                                  create);
+                       if (err < 0)
+                               return err;
+
+                       basechain = nft_base_chain(chain);
+                       if (basechain->type != hook.type) {
+                               nft_chain_release_hook(&hook);
+                               return -EBUSY;
+                       }
+
+                       for (i = 0; i < afi->nops; i++) {
+                               ops = &basechain->ops[i];
+                               if (ops->hooknum != hook.num ||
+                                   ops->priority != hook.priority ||
+                                   ops->dev != hook.dev) {
+                                       nft_chain_release_hook(&hook);
+                                       return -EBUSY;
+                               }
+                       }
+                       nft_chain_release_hook(&hook);
+               }
+
                if (nla[NFTA_CHAIN_HANDLE] && name) {
                        struct nft_chain *chain2;
 
@@ -1320,102 +1426,53 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk,
                return -EOVERFLOW;
 
        if (nla[NFTA_CHAIN_HOOK]) {
-               const struct nf_chain_type *type;
+               struct nft_chain_hook hook;
                struct nf_hook_ops *ops;
                nf_hookfn *hookfn;
-               u32 hooknum, priority;
-
-               type = chain_type[family][NFT_CHAIN_T_DEFAULT];
-               if (nla[NFTA_CHAIN_TYPE]) {
-                       type = nf_tables_chain_type_lookup(afi,
-                                                          nla[NFTA_CHAIN_TYPE],
-                                                          create);
-                       if (IS_ERR(type))
-                               return PTR_ERR(type);
-               }
 
-               err = nla_parse_nested(ha, NFTA_HOOK_MAX, nla[NFTA_CHAIN_HOOK],
-                                      nft_hook_policy);
+               err = nft_chain_parse_hook(net, nla, afi, &hook, create);
                if (err < 0)
                        return err;
-               if (ha[NFTA_HOOK_HOOKNUM] == NULL ||
-                   ha[NFTA_HOOK_PRIORITY] == NULL)
-                       return -EINVAL;
-
-               hooknum = ntohl(nla_get_be32(ha[NFTA_HOOK_HOOKNUM]));
-               if (hooknum >= afi->nhooks)
-                       return -EINVAL;
-               priority = ntohl(nla_get_be32(ha[NFTA_HOOK_PRIORITY]));
-
-               if (!(type->hook_mask & (1 << hooknum)))
-                       return -EOPNOTSUPP;
-               if (!try_module_get(type->owner))
-                       return -ENOENT;
-               hookfn = type->hooks[hooknum];
-
-               if (afi->flags & NFT_AF_NEEDS_DEV) {
-                       char ifname[IFNAMSIZ];
-
-                       if (!ha[NFTA_HOOK_DEV]) {
-                               module_put(type->owner);
-                               return -EOPNOTSUPP;
-                       }
-
-                       nla_strlcpy(ifname, ha[NFTA_HOOK_DEV], IFNAMSIZ);
-                       dev = dev_get_by_name(net, ifname);
-                       if (!dev) {
-                               module_put(type->owner);
-                               return -ENOENT;
-                       }
-               } else if (ha[NFTA_HOOK_DEV]) {
-                       module_put(type->owner);
-                       return -EOPNOTSUPP;
-               }
 
                basechain = kzalloc(sizeof(*basechain), GFP_KERNEL);
                if (basechain == NULL) {
-                       module_put(type->owner);
-                       if (dev != NULL)
-                               dev_put(dev);
+                       nft_chain_release_hook(&hook);
                        return -ENOMEM;
                }
 
-               if (dev != NULL)
-                       strncpy(basechain->dev_name, dev->name, IFNAMSIZ);
+               if (hook.dev != NULL)
+                       strncpy(basechain->dev_name, hook.dev->name, IFNAMSIZ);
 
                if (nla[NFTA_CHAIN_COUNTERS]) {
                        stats = nft_stats_alloc(nla[NFTA_CHAIN_COUNTERS]);
                        if (IS_ERR(stats)) {
-                               module_put(type->owner);
+                               nft_chain_release_hook(&hook);
                                kfree(basechain);
-                               if (dev != NULL)
-                                       dev_put(dev);
                                return PTR_ERR(stats);
                        }
                        basechain->stats = stats;
                } else {
                        stats = netdev_alloc_pcpu_stats(struct nft_stats);
                        if (stats == NULL) {
-                               module_put(type->owner);
+                               nft_chain_release_hook(&hook);
                                kfree(basechain);
-                               if (dev != NULL)
-                                       dev_put(dev);
                                return -ENOMEM;
                        }
                        rcu_assign_pointer(basechain->stats, stats);
                }
 
-               basechain->type = type;
+               hookfn = hook.type->hooks[hook.num];
+               basechain->type = hook.type;
                chain = &basechain->chain;
 
                for (i = 0; i < afi->nops; i++) {
                        ops = &basechain->ops[i];
                        ops->pf         = family;
-                       ops->hooknum    = hooknum;
-                       ops->priority   = priority;
+                       ops->hooknum    = hook.num;
+                       ops->priority   = hook.priority;
                        ops->priv       = chain;
                        ops->hook       = afi->hooks[ops->hooknum];
-                       ops->dev        = dev;
+                       ops->dev        = hook.dev;
                        if (hookfn)
                                ops->hook = hookfn;
                        if (afi->hook_ops_init)
@@ -3426,12 +3483,12 @@ static int nft_setelem_parse_flags(const struct nft_set *set,
 }
 
 static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
-                           const struct nlattr *attr)
+                           const struct nlattr *attr, u32 nlmsg_flags)
 {
        struct nlattr *nla[NFTA_SET_ELEM_MAX + 1];
        struct nft_data_desc d1, d2;
        struct nft_set_ext_tmpl tmpl;
-       struct nft_set_ext *ext;
+       struct nft_set_ext *ext, *ext2;
        struct nft_set_elem elem;
        struct nft_set_binding *binding;
        struct nft_userdata *udata;
@@ -3558,9 +3615,19 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
                goto err4;
 
        ext->genmask = nft_genmask_cur(ctx->net) | NFT_SET_ELEM_BUSY_MASK;
-       err = set->ops->insert(ctx->net, set, &elem);
-       if (err < 0)
+       err = set->ops->insert(ctx->net, set, &elem, &ext2);
+       if (err) {
+               if (err == -EEXIST) {
+                       if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA) &&
+                           nft_set_ext_exists(ext2, NFT_SET_EXT_DATA) &&
+                           memcmp(nft_set_ext_data(ext),
+                                  nft_set_ext_data(ext2), set->dlen) != 0)
+                               err = -EBUSY;
+                       else if (!(nlmsg_flags & NLM_F_EXCL))
+                               err = 0;
+               }
                goto err5;
+       }
 
        nft_trans_elem(trans) = elem;
        list_add_tail(&trans->list, &ctx->net->nft.commit_list);
@@ -3616,7 +3683,7 @@ static int nf_tables_newsetelem(struct net *net, struct sock *nlsk,
                    !atomic_add_unless(&set->nelems, 1, set->size + set->ndeact))
                        return -ENFILE;
 
-               err = nft_add_set_elem(&ctx, set, attr);
+               err = nft_add_set_elem(&ctx, set, attr, nlh->nlmsg_flags);
                if (err < 0) {
                        atomic_dec(&set->nelems);
                        break;
index 564fa79..764251d 100644 (file)
 /*
- * Copyright (c) 2008-2014 Patrick McHardy <kaber@trash.net>
+ * Copyright (c) 2016 Laura Garcia <nevola@gmail.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
  * published by the Free Software Foundation.
  *
- * Development of this code funded by Astaro AG (http://www.astaro.com/)
  */
 
 #include <linux/kernel.h>
 #include <linux/init.h>
 #include <linux/module.h>
-#include <linux/list.h>
-#include <linux/log2.h>
-#include <linux/jhash.h>
 #include <linux/netlink.h>
-#include <linux/workqueue.h>
-#include <linux/rhashtable.h>
 #include <linux/netfilter.h>
 #include <linux/netfilter/nf_tables.h>
 #include <net/netfilter/nf_tables.h>
-
-/* We target a hash table size of 4, element hint is 75% of final size */
-#define NFT_HASH_ELEMENT_HINT 3
+#include <net/netfilter/nf_tables_core.h>
+#include <linux/jhash.h>
 
 struct nft_hash {
-       struct rhashtable               ht;
-       struct delayed_work             gc_work;
-};
-
-struct nft_hash_elem {
-       struct rhash_head               node;
-       struct nft_set_ext              ext;
+       enum nft_registers      sreg:8;
+       enum nft_registers      dreg:8;
+       u8                      len;
+       u32                     modulus;
+       u32                     seed;
 };
 
-struct nft_hash_cmp_arg {
-       const struct nft_set            *set;
-       const u32                       *key;
-       u8                              genmask;
-};
-
-static const struct rhashtable_params nft_hash_params;
-
-static inline u32 nft_hash_key(const void *data, u32 len, u32 seed)
-{
-       const struct nft_hash_cmp_arg *arg = data;
-
-       return jhash(arg->key, len, seed);
-}
-
-static inline u32 nft_hash_obj(const void *data, u32 len, u32 seed)
-{
-       const struct nft_hash_elem *he = data;
-
-       return jhash(nft_set_ext_key(&he->ext), len, seed);
-}
-
-static inline int nft_hash_cmp(struct rhashtable_compare_arg *arg,
-                              const void *ptr)
-{
-       const struct nft_hash_cmp_arg *x = arg->key;
-       const struct nft_hash_elem *he = ptr;
-
-       if (memcmp(nft_set_ext_key(&he->ext), x->key, x->set->klen))
-               return 1;
-       if (nft_set_elem_expired(&he->ext))
-               return 1;
-       if (!nft_set_elem_active(&he->ext, x->genmask))
-               return 1;
-       return 0;
-}
-
-static bool nft_hash_lookup(const struct net *net, const struct nft_set *set,
-                           const u32 *key, const struct nft_set_ext **ext)
-{
-       struct nft_hash *priv = nft_set_priv(set);
-       const struct nft_hash_elem *he;
-       struct nft_hash_cmp_arg arg = {
-               .genmask = nft_genmask_cur(net),
-               .set     = set,
-               .key     = key,
-       };
-
-       he = rhashtable_lookup_fast(&priv->ht, &arg, nft_hash_params);
-       if (he != NULL)
-               *ext = &he->ext;
-
-       return !!he;
-}
-
-static bool nft_hash_update(struct nft_set *set, const u32 *key,
-                           void *(*new)(struct nft_set *,
-                                        const struct nft_expr *,
-                                        struct nft_regs *regs),
-                           const struct nft_expr *expr,
-                           struct nft_regs *regs,
-                           const struct nft_set_ext **ext)
-{
-       struct nft_hash *priv = nft_set_priv(set);
-       struct nft_hash_elem *he;
-       struct nft_hash_cmp_arg arg = {
-               .genmask = NFT_GENMASK_ANY,
-               .set     = set,
-               .key     = key,
-       };
-
-       he = rhashtable_lookup_fast(&priv->ht, &arg, nft_hash_params);
-       if (he != NULL)
-               goto out;
-
-       he = new(set, expr, regs);
-       if (he == NULL)
-               goto err1;
-       if (rhashtable_lookup_insert_key(&priv->ht, &arg, &he->node,
-                                        nft_hash_params))
-               goto err2;
-out:
-       *ext = &he->ext;
-       return true;
-
-err2:
-       nft_set_elem_destroy(set, he);
-err1:
-       return false;
-}
-
-static int nft_hash_insert(const struct net *net, const struct nft_set *set,
-                          const struct nft_set_elem *elem)
-{
-       struct nft_hash *priv = nft_set_priv(set);
-       struct nft_hash_elem *he = elem->priv;
-       struct nft_hash_cmp_arg arg = {
-               .genmask = nft_genmask_next(net),
-               .set     = set,
-               .key     = elem->key.val.data,
-       };
-
-       return rhashtable_lookup_insert_key(&priv->ht, &arg, &he->node,
-                                           nft_hash_params);
-}
-
-static void nft_hash_activate(const struct net *net, const struct nft_set *set,
-                             const struct nft_set_elem *elem)
+static void nft_hash_eval(const struct nft_expr *expr,
+                         struct nft_regs *regs,
+                         const struct nft_pktinfo *pkt)
 {
-       struct nft_hash_elem *he = elem->priv;
+       struct nft_hash *priv = nft_expr_priv(expr);
+       const void *data = &regs->data[priv->sreg];
 
-       nft_set_elem_change_active(net, set, &he->ext);
-       nft_set_elem_clear_busy(&he->ext);
+       regs->data[priv->dreg] =
+               reciprocal_scale(jhash(data, priv->len, priv->seed),
+                                priv->modulus);
 }
 
-static void *nft_hash_deactivate(const struct net *net,
-                                const struct nft_set *set,
-                                const struct nft_set_elem *elem)
-{
-       struct nft_hash *priv = nft_set_priv(set);
-       struct nft_hash_elem *he;
-       struct nft_hash_cmp_arg arg = {
-               .genmask = nft_genmask_next(net),
-               .set     = set,
-               .key     = elem->key.val.data,
-       };
-
-       rcu_read_lock();
-       he = rhashtable_lookup_fast(&priv->ht, &arg, nft_hash_params);
-       if (he != NULL) {
-               if (!nft_set_elem_mark_busy(&he->ext) ||
-                   !nft_is_active(net, &he->ext))
-                       nft_set_elem_change_active(net, set, &he->ext);
-               else
-                       he = NULL;
-       }
-       rcu_read_unlock();
-
-       return he;
-}
-
-static void nft_hash_remove(const struct nft_set *set,
-                           const struct nft_set_elem *elem)
-{
-       struct nft_hash *priv = nft_set_priv(set);
-       struct nft_hash_elem *he = elem->priv;
-
-       rhashtable_remove_fast(&priv->ht, &he->node, nft_hash_params);
-}
-
-static void nft_hash_walk(const struct nft_ctx *ctx, const struct nft_set *set,
-                         struct nft_set_iter *iter)
-{
-       struct nft_hash *priv = nft_set_priv(set);
-       struct nft_hash_elem *he;
-       struct rhashtable_iter hti;
-       struct nft_set_elem elem;
-       int err;
-
-       err = rhashtable_walk_init(&priv->ht, &hti, GFP_KERNEL);
-       iter->err = err;
-       if (err)
-               return;
-
-       err = rhashtable_walk_start(&hti);
-       if (err && err != -EAGAIN) {
-               iter->err = err;
-               goto out;
-       }
-
-       while ((he = rhashtable_walk_next(&hti))) {
-               if (IS_ERR(he)) {
-                       err = PTR_ERR(he);
-                       if (err != -EAGAIN) {
-                               iter->err = err;
-                               goto out;
-                       }
-
-                       continue;
-               }
-
-               if (iter->count < iter->skip)
-                       goto cont;
-               if (nft_set_elem_expired(&he->ext))
-                       goto cont;
-               if (!nft_set_elem_active(&he->ext, iter->genmask))
-                       goto cont;
-
-               elem.priv = he;
-
-               iter->err = iter->fn(ctx, set, iter, &elem);
-               if (iter->err < 0)
-                       goto out;
-
-cont:
-               iter->count++;
-       }
-
-out:
-       rhashtable_walk_stop(&hti);
-       rhashtable_walk_exit(&hti);
-}
-
-static void nft_hash_gc(struct work_struct *work)
-{
-       struct nft_set *set;
-       struct nft_hash_elem *he;
-       struct nft_hash *priv;
-       struct nft_set_gc_batch *gcb = NULL;
-       struct rhashtable_iter hti;
-       int err;
-
-       priv = container_of(work, struct nft_hash, gc_work.work);
-       set  = nft_set_container_of(priv);
-
-       err = rhashtable_walk_init(&priv->ht, &hti, GFP_KERNEL);
-       if (err)
-               goto schedule;
-
-       err = rhashtable_walk_start(&hti);
-       if (err && err != -EAGAIN)
-               goto out;
-
-       while ((he = rhashtable_walk_next(&hti))) {
-               if (IS_ERR(he)) {
-                       if (PTR_ERR(he) != -EAGAIN)
-                               goto out;
-                       continue;
-               }
-
-               if (!nft_set_elem_expired(&he->ext))
-                       continue;
-               if (nft_set_elem_mark_busy(&he->ext))
-                       continue;
-
-               gcb = nft_set_gc_batch_check(set, gcb, GFP_ATOMIC);
-               if (gcb == NULL)
-                       goto out;
-               rhashtable_remove_fast(&priv->ht, &he->node, nft_hash_params);
-               atomic_dec(&set->nelems);
-               nft_set_gc_batch_add(gcb, he);
-       }
-out:
-       rhashtable_walk_stop(&hti);
-       rhashtable_walk_exit(&hti);
-
-       nft_set_gc_batch_complete(gcb);
-schedule:
-       queue_delayed_work(system_power_efficient_wq, &priv->gc_work,
-                          nft_set_gc_interval(set));
-}
-
-static unsigned int nft_hash_privsize(const struct nlattr * const nla[])
-{
-       return sizeof(struct nft_hash);
-}
-
-static const struct rhashtable_params nft_hash_params = {
-       .head_offset            = offsetof(struct nft_hash_elem, node),
-       .hashfn                 = nft_hash_key,
-       .obj_hashfn             = nft_hash_obj,
-       .obj_cmpfn              = nft_hash_cmp,
-       .automatic_shrinking    = true,
+static const struct nla_policy nft_hash_policy[NFTA_HASH_MAX + 1] = {
+       [NFTA_HASH_SREG]        = { .type = NLA_U32 },
+       [NFTA_HASH_DREG]        = { .type = NLA_U32 },
+       [NFTA_HASH_LEN]         = { .type = NLA_U32 },
+       [NFTA_HASH_MODULUS]     = { .type = NLA_U32 },
+       [NFTA_HASH_SEED]        = { .type = NLA_U32 },
 };
 
-static int nft_hash_init(const struct nft_set *set,
-                        const struct nft_set_desc *desc,
+static int nft_hash_init(const struct nft_ctx *ctx,
+                        const struct nft_expr *expr,
                         const struct nlattr * const tb[])
 {
-       struct nft_hash *priv = nft_set_priv(set);
-       struct rhashtable_params params = nft_hash_params;
-       int err;
+       struct nft_hash *priv = nft_expr_priv(expr);
+       u32 len;
 
-       params.nelem_hint = desc->size ?: NFT_HASH_ELEMENT_HINT;
-       params.key_len    = set->klen;
+       if (!tb[NFTA_HASH_SREG] ||
+           !tb[NFTA_HASH_DREG] ||
+           !tb[NFTA_HASH_LEN]  ||
+           !tb[NFTA_HASH_SEED] ||
+           !tb[NFTA_HASH_MODULUS])
+               return -EINVAL;
 
-       err = rhashtable_init(&priv->ht, &params);
-       if (err < 0)
-               return err;
+       priv->sreg = nft_parse_register(tb[NFTA_HASH_SREG]);
+       priv->dreg = nft_parse_register(tb[NFTA_HASH_DREG]);
 
-       INIT_DEFERRABLE_WORK(&priv->gc_work, nft_hash_gc);
-       if (set->flags & NFT_SET_TIMEOUT)
-               queue_delayed_work(system_power_efficient_wq, &priv->gc_work,
-                                  nft_set_gc_interval(set));
-       return 0;
-}
+       len = ntohl(nla_get_be32(tb[NFTA_HASH_LEN]));
+       if (len == 0 || len > U8_MAX)
+               return -ERANGE;
 
-static void nft_hash_elem_destroy(void *ptr, void *arg)
-{
-       nft_set_elem_destroy((const struct nft_set *)arg, ptr);
-}
+       priv->len = len;
 
-static void nft_hash_destroy(const struct nft_set *set)
-{
-       struct nft_hash *priv = nft_set_priv(set);
+       priv->modulus = ntohl(nla_get_be32(tb[NFTA_HASH_MODULUS]));
+       if (priv->modulus <= 1)
+               return -ERANGE;
 
-       cancel_delayed_work_sync(&priv->gc_work);
-       rhashtable_free_and_destroy(&priv->ht, nft_hash_elem_destroy,
-                                   (void *)set);
+       priv->seed = ntohl(nla_get_be32(tb[NFTA_HASH_SEED]));
+
+       return nft_validate_register_load(priv->sreg, len) &&
+              nft_validate_register_store(ctx, priv->dreg, NULL,
+                                          NFT_DATA_VALUE, sizeof(u32));
 }
 
-static bool nft_hash_estimate(const struct nft_set_desc *desc, u32 features,
-                             struct nft_set_estimate *est)
+static int nft_hash_dump(struct sk_buff *skb,
+                        const struct nft_expr *expr)
 {
-       unsigned int esize;
+       const struct nft_hash *priv = nft_expr_priv(expr);
 
-       esize = sizeof(struct nft_hash_elem);
-       if (desc->size) {
-               est->size = sizeof(struct nft_hash) +
-                           roundup_pow_of_two(desc->size * 4 / 3) *
-                           sizeof(struct nft_hash_elem *) +
-                           desc->size * esize;
-       } else {
-               /* Resizing happens when the load drops below 30% or goes
-                * above 75%. The average of 52.5% load (approximated by 50%)
-                * is used for the size estimation of the hash buckets,
-                * meaning we calculate two buckets per element.
-                */
-               est->size = esize + 2 * sizeof(struct nft_hash_elem *);
-       }
+       if (nft_dump_register(skb, NFTA_HASH_SREG, priv->sreg))
+               goto nla_put_failure;
+       if (nft_dump_register(skb, NFTA_HASH_DREG, priv->dreg))
+               goto nla_put_failure;
+       if (nla_put_be32(skb, NFTA_HASH_LEN, htonl(priv->len)))
+               goto nla_put_failure;
+       if (nla_put_be32(skb, NFTA_HASH_MODULUS, htonl(priv->modulus)))
+               goto nla_put_failure;
+       if (nla_put_be32(skb, NFTA_HASH_SEED, htonl(priv->seed)))
+               goto nla_put_failure;
 
-       est->class = NFT_SET_CLASS_O_1;
+       return 0;
 
-       return true;
+nla_put_failure:
+       return -1;
 }
 
-static struct nft_set_ops nft_hash_ops __read_mostly = {
-       .privsize       = nft_hash_privsize,
-       .elemsize       = offsetof(struct nft_hash_elem, ext),
-       .estimate       = nft_hash_estimate,
+static struct nft_expr_type nft_hash_type;
+static const struct nft_expr_ops nft_hash_ops = {
+       .type           = &nft_hash_type,
+       .size           = NFT_EXPR_SIZE(sizeof(struct nft_hash)),
+       .eval           = nft_hash_eval,
        .init           = nft_hash_init,
-       .destroy        = nft_hash_destroy,
-       .insert         = nft_hash_insert,
-       .activate       = nft_hash_activate,
-       .deactivate     = nft_hash_deactivate,
-       .remove         = nft_hash_remove,
-       .lookup         = nft_hash_lookup,
-       .update         = nft_hash_update,
-       .walk           = nft_hash_walk,
-       .features       = NFT_SET_MAP | NFT_SET_TIMEOUT,
+       .dump           = nft_hash_dump,
+};
+
+static struct nft_expr_type nft_hash_type __read_mostly = {
+       .name           = "hash",
+       .ops            = &nft_hash_ops,
+       .policy         = nft_hash_policy,
+       .maxattr        = NFTA_HASH_MAX,
        .owner          = THIS_MODULE,
 };
 
 static int __init nft_hash_module_init(void)
 {
-       return nft_register_set(&nft_hash_ops);
+       return nft_register_expr(&nft_hash_type);
 }
 
 static void __exit nft_hash_module_exit(void)
 {
-       nft_unregister_set(&nft_hash_ops);
+       nft_unregister_expr(&nft_hash_type);
 }
 
 module_init(nft_hash_module_init);
 module_exit(nft_hash_module_exit);
 
 MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
-MODULE_ALIAS_NFT_SET();
+MODULE_AUTHOR("Laura Garcia <nevola@gmail.com>");
+MODULE_ALIAS_NFT_EXPR("hash");
diff --git a/net/netfilter/nft_numgen.c b/net/netfilter/nft_numgen.c
new file mode 100644 (file)
index 0000000..294745e
--- /dev/null
@@ -0,0 +1,192 @@
+/*
+ * Copyright (c) 2016 Laura Garcia <nevola@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <linux/static_key.h>
+#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_core.h>
+
+static DEFINE_PER_CPU(struct rnd_state, nft_numgen_prandom_state);
+
+/* Private data of the "incremental" number generator expression.
+ * @dreg:    destination register the generated value is written to
+ * @until:   exclusive upper bound; values wrap within [0, until)
+ * @counter: shared counter, advanced with atomic cmpxchg from the
+ *           packet path
+ */
+struct nft_ng_inc {
+       enum nft_registers      dreg:8;
+       u32                     until;
+       atomic_t                counter;
+};
+
+/* Generate the next number in [0, priv->until) and store it into the
+ * destination register.
+ */
+static void nft_ng_inc_eval(const struct nft_expr *expr,
+                           struct nft_regs *regs,
+                           const struct nft_pktinfo *pkt)
+{
+       struct nft_ng_inc *priv = nft_expr_priv(expr);
+       u32 nval, oval;
+
+       /* Lockless increment modulo priv->until: retry until our
+        * compare-and-swap wins against concurrent packet paths.
+        */
+       do {
+               oval = atomic_read(&priv->counter);
+               nval = (oval + 1 < priv->until) ? oval + 1 : 0;
+       } while (atomic_cmpxchg(&priv->counter, oval, nval) != oval);
+
+       /* Store the value this CPU generated.  memcpy()ing from
+        * &priv->counter here would re-read the shared counter, which
+        * another CPU may already have advanced after our cmpxchg
+        * succeeded, handing two packets the same number.
+        */
+       regs->data[priv->dreg] = nval;
+}
+
+/* Netlink attribute policy shared by both numgen variants. */
+static const struct nla_policy nft_ng_policy[NFTA_NG_MAX + 1] = {
+       [NFTA_NG_DREG]          = { .type = NLA_U32 },
+       [NFTA_NG_UNTIL]         = { .type = NLA_U32 },
+       [NFTA_NG_TYPE]          = { .type = NLA_U32 },
+};
+
+/* Configure an incremental generator.  NFTA_NG_UNTIL is guaranteed to
+ * be present because nft_ng_select_ops() rejects rules lacking it; a
+ * zero bound is refused since the eval path wraps values modulo
+ * @until.  The counter starts at zero, and the destination register
+ * is validated for a 32-bit store.
+ */
+static int nft_ng_inc_init(const struct nft_ctx *ctx,
+                          const struct nft_expr *expr,
+                          const struct nlattr * const tb[])
+{
+       struct nft_ng_inc *priv = nft_expr_priv(expr);
+
+       priv->until = ntohl(nla_get_be32(tb[NFTA_NG_UNTIL]));
+       if (priv->until == 0)
+               return -ERANGE;
+
+       priv->dreg = nft_parse_register(tb[NFTA_NG_DREG]);
+       atomic_set(&priv->counter, 0);
+
+       return nft_validate_register_store(ctx, priv->dreg, NULL,
+                                          NFT_DATA_VALUE, sizeof(u32));
+}
+
+/* Emit the common numgen attributes (register, bound, variant) into a
+ * netlink dump message; -1 on exhausted skb space.
+ */
+static int nft_ng_dump(struct sk_buff *skb, enum nft_registers dreg,
+                      u32 until, enum nft_ng_types type)
+{
+       if (nft_dump_register(skb, NFTA_NG_DREG, dreg) ||
+           nla_put_be32(skb, NFTA_NG_UNTIL, htonl(until)) ||
+           nla_put_be32(skb, NFTA_NG_TYPE, htonl(type)))
+               return -1;
+
+       return 0;
+}
+
+/* Dump the configuration of an incremental generator to userspace. */
+static int nft_ng_inc_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+       const struct nft_ng_inc *priv = nft_expr_priv(expr);
+
+       return nft_ng_dump(skb, priv->dreg, priv->until, NFT_NG_INCREMENTAL);
+}
+
+/* Private data of the "random" number generator expression.
+ * @dreg:  destination register
+ * @until: exclusive upper bound of the generated values
+ */
+struct nft_ng_random {
+       enum nft_registers      dreg:8;
+       u32                     until;
+};
+
+/* Draw from the per-cpu PRNG state and scale the result into
+ * [0, until) via reciprocal_scale().
+ */
+static void nft_ng_random_eval(const struct nft_expr *expr,
+                              struct nft_regs *regs,
+                              const struct nft_pktinfo *pkt)
+{
+       struct nft_ng_random *priv = nft_expr_priv(expr);
+       struct rnd_state *state = this_cpu_ptr(&nft_numgen_prandom_state);
+
+       regs->data[priv->dreg] = reciprocal_scale(prandom_u32_state(state),
+                                                 priv->until);
+}
+
+/* Configure a random generator.  As with the incremental variant,
+ * NFTA_NG_UNTIL presence is enforced by nft_ng_select_ops() and a zero
+ * bound is rejected.  prandom_init_once() lazily seeds the per-cpu
+ * PRNG state on first use.
+ */
+static int nft_ng_random_init(const struct nft_ctx *ctx,
+                             const struct nft_expr *expr,
+                             const struct nlattr * const tb[])
+{
+       struct nft_ng_random *priv = nft_expr_priv(expr);
+
+       priv->until = ntohl(nla_get_be32(tb[NFTA_NG_UNTIL]));
+       if (priv->until == 0)
+               return -ERANGE;
+
+       prandom_init_once(&nft_numgen_prandom_state);
+
+       priv->dreg = nft_parse_register(tb[NFTA_NG_DREG]);
+
+       return nft_validate_register_store(ctx, priv->dreg, NULL,
+                                          NFT_DATA_VALUE, sizeof(u32));
+}
+
+/* Dump the configuration of a random generator to userspace. */
+static int nft_ng_random_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+       const struct nft_ng_random *priv = nft_expr_priv(expr);
+
+       return nft_ng_dump(skb, priv->dreg, priv->until, NFT_NG_RANDOM);
+}
+
+static struct nft_expr_type nft_ng_type;
+/* ops for NFT_NG_INCREMENTAL; chosen by nft_ng_select_ops(). */
+static const struct nft_expr_ops nft_ng_inc_ops = {
+       .type           = &nft_ng_type,
+       .size           = NFT_EXPR_SIZE(sizeof(struct nft_ng_inc)),
+       .eval           = nft_ng_inc_eval,
+       .init           = nft_ng_inc_init,
+       .dump           = nft_ng_inc_dump,
+};
+
+/* ops for NFT_NG_RANDOM; chosen by nft_ng_select_ops(). */
+static const struct nft_expr_ops nft_ng_random_ops = {
+       .type           = &nft_ng_type,
+       .size           = NFT_EXPR_SIZE(sizeof(struct nft_ng_random)),
+       .eval           = nft_ng_random_eval,
+       .init           = nft_ng_random_init,
+       .dump           = nft_ng_random_dump,
+};
+
+/* Pick the incremental or random ops variant from NFTA_NG_TYPE.  All
+ * three attributes are mandatory here, which lets the per-variant init
+ * callbacks rely on their presence without re-checking.
+ */
+static const struct nft_expr_ops *
+nft_ng_select_ops(const struct nft_ctx *ctx, const struct nlattr * const tb[])
+{
+       u32 type;
+
+       if (!tb[NFTA_NG_DREG]   ||
+           !tb[NFTA_NG_UNTIL]  ||
+           !tb[NFTA_NG_TYPE])
+               return ERR_PTR(-EINVAL);
+
+       type = ntohl(nla_get_be32(tb[NFTA_NG_TYPE]));
+
+       switch (type) {
+       case NFT_NG_INCREMENTAL:
+               return &nft_ng_inc_ops;
+       case NFT_NG_RANDOM:
+               return &nft_ng_random_ops;
+       }
+
+       return ERR_PTR(-EINVAL);
+}
+
+static struct nft_expr_type nft_ng_type __read_mostly = {
+       .name           = "numgen",
+       .select_ops     = &nft_ng_select_ops,
+       .policy         = nft_ng_policy,
+       .maxattr        = NFTA_NG_MAX,
+       .owner          = THIS_MODULE,
+};
+
+static int __init nft_ng_module_init(void)
+{
+       return nft_register_expr(&nft_ng_type);
+}
+
+static void __exit nft_ng_module_exit(void)
+{
+       nft_unregister_expr(&nft_ng_type);
+}
+
+module_init(nft_ng_module_init);
+module_exit(nft_ng_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Laura Garcia <nevola@gmail.com>");
+MODULE_ALIAS_NFT_EXPR("numgen");
diff --git a/net/netfilter/nft_quota.c b/net/netfilter/nft_quota.c
new file mode 100644 (file)
index 0000000..6eafbf9
--- /dev/null
@@ -0,0 +1,121 @@
+/*
+ * Copyright (c) 2016 Pablo Neira Ayuso <pablo@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/atomic.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables.h>
+
+/* Private data of the quota expression.
+ * @quota:  configured byte limit, immutable after init
+ * @invert: NFT_QUOTA_F_INV was requested
+ * @remain: bytes left of the budget; goes negative once exceeded
+ */
+struct nft_quota {
+       u64             quota;
+       bool            invert;
+       atomic64_t      remain;
+};
+
+/* Charge the packet length against the remaining budget and return the
+ * new remainder; a negative value means the quota has been consumed.
+ */
+static inline long nft_quota(struct nft_quota *priv,
+                            const struct nft_pktinfo *pkt)
+{
+       return atomic64_sub_return(pkt->skb->len, &priv->remain);
+}
+
+/* Account the packet and decide whether rule evaluation continues. */
+static void nft_quota_eval(const struct nft_expr *expr,
+                          struct nft_regs *regs,
+                          const struct nft_pktinfo *pkt)
+{
+       struct nft_quota *priv = nft_expr_priv(expr);
+       bool overquota = nft_quota(priv, pkt) < 0;
+
+       /* Plain quota: match while the byte budget lasts, break once it
+        * is exceeded.  With NFT_QUOTA_F_INV the logic is mirrored:
+        * break while under quota, match after it has been consumed.
+        * The previous "overquota && !invert" test never broke for
+        * inverted quotas, making them match every packet.
+        */
+       if (overquota ^ priv->invert)
+               regs->verdict.code = NFT_BREAK;
+}
+
+static const struct nla_policy nft_quota_policy[NFTA_QUOTA_MAX + 1] = {
+       [NFTA_QUOTA_BYTES]      = { .type = NLA_U64 },
+       [NFTA_QUOTA_FLAGS]      = { .type = NLA_U32 },
+};
+
+/* Parse NFTA_QUOTA_BYTES/FLAGS.  The limit is capped at S64_MAX so the
+ * signed remainder maintained by nft_quota() can go negative without
+ * wrapping; only the invert flag is accepted in NFTA_QUOTA_FLAGS.
+ */
+static int nft_quota_init(const struct nft_ctx *ctx,
+                         const struct nft_expr *expr,
+                         const struct nlattr * const tb[])
+{
+       struct nft_quota *priv = nft_expr_priv(expr);
+       u32 flags = 0;
+       u64 quota;
+
+       if (!tb[NFTA_QUOTA_BYTES])
+               return -EINVAL;
+
+       quota = be64_to_cpu(nla_get_be64(tb[NFTA_QUOTA_BYTES]));
+       if (quota > S64_MAX)
+               return -EOVERFLOW;
+
+       if (tb[NFTA_QUOTA_FLAGS]) {
+               flags = ntohl(nla_get_be32(tb[NFTA_QUOTA_FLAGS]));
+               if (flags & ~NFT_QUOTA_F_INV)
+                       return -EINVAL;
+       }
+
+       priv->quota = quota;
+       priv->invert = (flags & NFT_QUOTA_F_INV) ? true : false;
+       atomic64_set(&priv->remain, quota);
+
+       return 0;
+}
+
+/* Dump the configured byte limit and flags.  Note this reports the
+ * original limit (priv->quota), not the remaining budget.
+ */
+static int nft_quota_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+       const struct nft_quota *priv = nft_expr_priv(expr);
+       u32 flags = priv->invert ? NFT_QUOTA_F_INV : 0;
+
+       if (nla_put_be64(skb, NFTA_QUOTA_BYTES, cpu_to_be64(priv->quota),
+                        NFTA_QUOTA_PAD) ||
+           nla_put_be32(skb, NFTA_QUOTA_FLAGS, htonl(flags)))
+               goto nla_put_failure;
+       return 0;
+
+nla_put_failure:
+       return -1;
+}
+
+static struct nft_expr_type nft_quota_type;
+static const struct nft_expr_ops nft_quota_ops = {
+       .type           = &nft_quota_type,
+       .size           = NFT_EXPR_SIZE(sizeof(struct nft_quota)),
+       .eval           = nft_quota_eval,
+       .init           = nft_quota_init,
+       .dump           = nft_quota_dump,
+};
+
+static struct nft_expr_type nft_quota_type __read_mostly = {
+       .name           = "quota",
+       .ops            = &nft_quota_ops,
+       .policy         = nft_quota_policy,
+       .maxattr        = NFTA_QUOTA_MAX,
+       .flags          = NFT_EXPR_STATEFUL,
+       .owner          = THIS_MODULE,
+};
+
+/* Register the quota expression type on module load. */
+static int __init nft_quota_module_init(void)
+{
+       return nft_register_expr(&nft_quota_type);
+}
+
+/* Unregister the quota expression type on module unload. */
+static void __exit nft_quota_module_exit(void)
+{
+       nft_unregister_expr(&nft_quota_type);
+}
+
+module_init(nft_quota_module_init);
+module_exit(nft_quota_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
+MODULE_ALIAS_NFT_EXPR("quota");
diff --git a/net/netfilter/nft_rbtree.c b/net/netfilter/nft_rbtree.c
deleted file mode 100644 (file)
index ffe9ae0..0000000
+++ /dev/null
@@ -1,316 +0,0 @@
-/*
- * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * Development of this code funded by Astaro AG (http://www.astaro.com/)
- */
-
-#include <linux/kernel.h>
-#include <linux/init.h>
-#include <linux/module.h>
-#include <linux/list.h>
-#include <linux/rbtree.h>
-#include <linux/netlink.h>
-#include <linux/netfilter.h>
-#include <linux/netfilter/nf_tables.h>
-#include <net/netfilter/nf_tables.h>
-
-static DEFINE_SPINLOCK(nft_rbtree_lock);
-
-struct nft_rbtree {
-       struct rb_root          root;
-};
-
-struct nft_rbtree_elem {
-       struct rb_node          node;
-       struct nft_set_ext      ext;
-};
-
-static bool nft_rbtree_interval_end(const struct nft_rbtree_elem *rbe)
-{
-       return nft_set_ext_exists(&rbe->ext, NFT_SET_EXT_FLAGS) &&
-              (*nft_set_ext_flags(&rbe->ext) & NFT_SET_ELEM_INTERVAL_END);
-}
-
-static bool nft_rbtree_equal(const struct nft_set *set, const void *this,
-                            const struct nft_rbtree_elem *interval)
-{
-       return memcmp(this, nft_set_ext_key(&interval->ext), set->klen) == 0;
-}
-
-static bool nft_rbtree_lookup(const struct net *net, const struct nft_set *set,
-                             const u32 *key, const struct nft_set_ext **ext)
-{
-       const struct nft_rbtree *priv = nft_set_priv(set);
-       const struct nft_rbtree_elem *rbe, *interval = NULL;
-       u8 genmask = nft_genmask_cur(net);
-       const struct rb_node *parent;
-       const void *this;
-       int d;
-
-       spin_lock_bh(&nft_rbtree_lock);
-       parent = priv->root.rb_node;
-       while (parent != NULL) {
-               rbe = rb_entry(parent, struct nft_rbtree_elem, node);
-
-               this = nft_set_ext_key(&rbe->ext);
-               d = memcmp(this, key, set->klen);
-               if (d < 0) {
-                       parent = parent->rb_left;
-                       /* In case of adjacent ranges, we always see the high
-                        * part of the range in first place, before the low one.
-                        * So don't update interval if the keys are equal.
-                        */
-                       if (interval && nft_rbtree_equal(set, this, interval))
-                               continue;
-                       interval = rbe;
-               } else if (d > 0)
-                       parent = parent->rb_right;
-               else {
-                       if (!nft_set_elem_active(&rbe->ext, genmask)) {
-                               parent = parent->rb_left;
-                               continue;
-                       }
-                       if (nft_rbtree_interval_end(rbe))
-                               goto out;
-                       spin_unlock_bh(&nft_rbtree_lock);
-
-                       *ext = &rbe->ext;
-                       return true;
-               }
-       }
-
-       if (set->flags & NFT_SET_INTERVAL && interval != NULL &&
-           nft_set_elem_active(&interval->ext, genmask) &&
-           !nft_rbtree_interval_end(interval)) {
-               spin_unlock_bh(&nft_rbtree_lock);
-               *ext = &interval->ext;
-               return true;
-       }
-out:
-       spin_unlock_bh(&nft_rbtree_lock);
-       return false;
-}
-
-static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set,
-                              struct nft_rbtree_elem *new)
-{
-       struct nft_rbtree *priv = nft_set_priv(set);
-       u8 genmask = nft_genmask_next(net);
-       struct nft_rbtree_elem *rbe;
-       struct rb_node *parent, **p;
-       int d;
-
-       parent = NULL;
-       p = &priv->root.rb_node;
-       while (*p != NULL) {
-               parent = *p;
-               rbe = rb_entry(parent, struct nft_rbtree_elem, node);
-               d = memcmp(nft_set_ext_key(&rbe->ext),
-                          nft_set_ext_key(&new->ext),
-                          set->klen);
-               if (d < 0)
-                       p = &parent->rb_left;
-               else if (d > 0)
-                       p = &parent->rb_right;
-               else {
-                       if (nft_set_elem_active(&rbe->ext, genmask)) {
-                               if (nft_rbtree_interval_end(rbe) &&
-                                   !nft_rbtree_interval_end(new))
-                                       p = &parent->rb_left;
-                               else if (!nft_rbtree_interval_end(rbe) &&
-                                        nft_rbtree_interval_end(new))
-                                       p = &parent->rb_right;
-                               else
-                                       return -EEXIST;
-                       }
-               }
-       }
-       rb_link_node(&new->node, parent, p);
-       rb_insert_color(&new->node, &priv->root);
-       return 0;
-}
-
-static int nft_rbtree_insert(const struct net *net, const struct nft_set *set,
-                            const struct nft_set_elem *elem)
-{
-       struct nft_rbtree_elem *rbe = elem->priv;
-       int err;
-
-       spin_lock_bh(&nft_rbtree_lock);
-       err = __nft_rbtree_insert(net, set, rbe);
-       spin_unlock_bh(&nft_rbtree_lock);
-
-       return err;
-}
-
-static void nft_rbtree_remove(const struct nft_set *set,
-                             const struct nft_set_elem *elem)
-{
-       struct nft_rbtree *priv = nft_set_priv(set);
-       struct nft_rbtree_elem *rbe = elem->priv;
-
-       spin_lock_bh(&nft_rbtree_lock);
-       rb_erase(&rbe->node, &priv->root);
-       spin_unlock_bh(&nft_rbtree_lock);
-}
-
-static void nft_rbtree_activate(const struct net *net,
-                               const struct nft_set *set,
-                               const struct nft_set_elem *elem)
-{
-       struct nft_rbtree_elem *rbe = elem->priv;
-
-       nft_set_elem_change_active(net, set, &rbe->ext);
-}
-
-static void *nft_rbtree_deactivate(const struct net *net,
-                                  const struct nft_set *set,
-                                  const struct nft_set_elem *elem)
-{
-       const struct nft_rbtree *priv = nft_set_priv(set);
-       const struct rb_node *parent = priv->root.rb_node;
-       struct nft_rbtree_elem *rbe, *this = elem->priv;
-       u8 genmask = nft_genmask_next(net);
-       int d;
-
-       while (parent != NULL) {
-               rbe = rb_entry(parent, struct nft_rbtree_elem, node);
-
-               d = memcmp(nft_set_ext_key(&rbe->ext), &elem->key.val,
-                                          set->klen);
-               if (d < 0)
-                       parent = parent->rb_left;
-               else if (d > 0)
-                       parent = parent->rb_right;
-               else {
-                       if (!nft_set_elem_active(&rbe->ext, genmask)) {
-                               parent = parent->rb_left;
-                               continue;
-                       }
-                       if (nft_rbtree_interval_end(rbe) &&
-                           !nft_rbtree_interval_end(this)) {
-                               parent = parent->rb_left;
-                               continue;
-                       } else if (!nft_rbtree_interval_end(rbe) &&
-                                  nft_rbtree_interval_end(this)) {
-                               parent = parent->rb_right;
-                               continue;
-                       }
-                       nft_set_elem_change_active(net, set, &rbe->ext);
-                       return rbe;
-               }
-       }
-       return NULL;
-}
-
-static void nft_rbtree_walk(const struct nft_ctx *ctx,
-                           const struct nft_set *set,
-                           struct nft_set_iter *iter)
-{
-       const struct nft_rbtree *priv = nft_set_priv(set);
-       struct nft_rbtree_elem *rbe;
-       struct nft_set_elem elem;
-       struct rb_node *node;
-
-       spin_lock_bh(&nft_rbtree_lock);
-       for (node = rb_first(&priv->root); node != NULL; node = rb_next(node)) {
-               rbe = rb_entry(node, struct nft_rbtree_elem, node);
-
-               if (iter->count < iter->skip)
-                       goto cont;
-               if (!nft_set_elem_active(&rbe->ext, iter->genmask))
-                       goto cont;
-
-               elem.priv = rbe;
-
-               iter->err = iter->fn(ctx, set, iter, &elem);
-               if (iter->err < 0) {
-                       spin_unlock_bh(&nft_rbtree_lock);
-                       return;
-               }
-cont:
-               iter->count++;
-       }
-       spin_unlock_bh(&nft_rbtree_lock);
-}
-
-static unsigned int nft_rbtree_privsize(const struct nlattr * const nla[])
-{
-       return sizeof(struct nft_rbtree);
-}
-
-static int nft_rbtree_init(const struct nft_set *set,
-                          const struct nft_set_desc *desc,
-                          const struct nlattr * const nla[])
-{
-       struct nft_rbtree *priv = nft_set_priv(set);
-
-       priv->root = RB_ROOT;
-       return 0;
-}
-
-static void nft_rbtree_destroy(const struct nft_set *set)
-{
-       struct nft_rbtree *priv = nft_set_priv(set);
-       struct nft_rbtree_elem *rbe;
-       struct rb_node *node;
-
-       while ((node = priv->root.rb_node) != NULL) {
-               rb_erase(node, &priv->root);
-               rbe = rb_entry(node, struct nft_rbtree_elem, node);
-               nft_set_elem_destroy(set, rbe);
-       }
-}
-
-static bool nft_rbtree_estimate(const struct nft_set_desc *desc, u32 features,
-                               struct nft_set_estimate *est)
-{
-       unsigned int nsize;
-
-       nsize = sizeof(struct nft_rbtree_elem);
-       if (desc->size)
-               est->size = sizeof(struct nft_rbtree) + desc->size * nsize;
-       else
-               est->size = nsize;
-
-       est->class = NFT_SET_CLASS_O_LOG_N;
-
-       return true;
-}
-
-static struct nft_set_ops nft_rbtree_ops __read_mostly = {
-       .privsize       = nft_rbtree_privsize,
-       .elemsize       = offsetof(struct nft_rbtree_elem, ext),
-       .estimate       = nft_rbtree_estimate,
-       .init           = nft_rbtree_init,
-       .destroy        = nft_rbtree_destroy,
-       .insert         = nft_rbtree_insert,
-       .remove         = nft_rbtree_remove,
-       .deactivate     = nft_rbtree_deactivate,
-       .activate       = nft_rbtree_activate,
-       .lookup         = nft_rbtree_lookup,
-       .walk           = nft_rbtree_walk,
-       .features       = NFT_SET_INTERVAL | NFT_SET_MAP,
-       .owner          = THIS_MODULE,
-};
-
-static int __init nft_rbtree_module_init(void)
-{
-       return nft_register_set(&nft_rbtree_ops);
-}
-
-static void __exit nft_rbtree_module_exit(void)
-{
-       nft_unregister_set(&nft_rbtree_ops);
-}
-
-module_init(nft_rbtree_module_init);
-module_exit(nft_rbtree_module_exit);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
-MODULE_ALIAS_NFT_SET();
diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c
new file mode 100644 (file)
index 0000000..3794cb2
--- /dev/null
@@ -0,0 +1,404 @@
+/*
+ * Copyright (c) 2008-2014 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Development of this code funded by Astaro AG (http://www.astaro.com/)
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/list.h>
+#include <linux/log2.h>
+#include <linux/jhash.h>
+#include <linux/netlink.h>
+#include <linux/workqueue.h>
+#include <linux/rhashtable.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables.h>
+
+/* We target a hash table size of 4, element hint is 75% of final size */
+#define NFT_HASH_ELEMENT_HINT 3
+
+/* Per-set private data: the resizable hash table plus the deferred
+ * garbage-collection work that reaps timed-out elements.
+ */
+struct nft_hash {
+       struct rhashtable               ht;
+       struct delayed_work             gc_work;
+};
+
+/* One set element: rhashtable linkage followed by the nft set
+ * extension area (key, data, timeout, ...).
+ */
+struct nft_hash_elem {
+       struct rhash_head               node;
+       struct nft_set_ext              ext;
+};
+
+/* Lookup descriptor handed to the rhashtable hash/compare callbacks:
+ * the raw key plus the generation mask to honour.
+ */
+struct nft_hash_cmp_arg {
+       const struct nft_set            *set;
+       const u32                       *key;
+       u8                              genmask;
+};
+
+static const struct rhashtable_params nft_hash_params;
+
+/* Hash a lookup key (wrapped in an nft_hash_cmp_arg). */
+static inline u32 nft_hash_key(const void *data, u32 len, u32 seed)
+{
+       const struct nft_hash_cmp_arg *arg = data;
+
+       return jhash(arg->key, len, seed);
+}
+
+/* Hash a stored element by the key held in its extension area; must
+ * produce the same value as nft_hash_key() for identical key bytes.
+ */
+static inline u32 nft_hash_obj(const void *data, u32 len, u32 seed)
+{
+       const struct nft_hash_elem *he = data;
+
+       return jhash(nft_set_ext_key(&he->ext), len, seed);
+}
+
+/* rhashtable compare callback: treat key mismatches, expired elements
+ * and elements inactive in the requested generation all as "no match"
+ * (non-zero return).
+ */
+static inline int nft_hash_cmp(struct rhashtable_compare_arg *arg,
+                              const void *ptr)
+{
+       const struct nft_hash_cmp_arg *x = arg->key;
+       const struct nft_hash_elem *he = ptr;
+
+       return memcmp(nft_set_ext_key(&he->ext), x->key, x->set->klen) ||
+              nft_set_elem_expired(&he->ext) ||
+              !nft_set_elem_active(&he->ext, x->genmask);
+}
+
+/* Packet-path lookup against the current generation.  Expired and
+ * inactive elements are filtered out by nft_hash_cmp(), so any match
+ * returned here is live.
+ */
+static bool nft_hash_lookup(const struct net *net, const struct nft_set *set,
+                           const u32 *key, const struct nft_set_ext **ext)
+{
+       struct nft_hash *priv = nft_set_priv(set);
+       struct nft_hash_cmp_arg arg = {
+               .genmask = nft_genmask_cur(net),
+               .set     = set,
+               .key     = key,
+       };
+       const struct nft_hash_elem *he;
+
+       he = rhashtable_lookup_fast(&priv->ht, &arg, nft_hash_params);
+       if (he == NULL)
+               return false;
+
+       *ext = &he->ext;
+       return true;
+}
+
+/* Packet-path lookup-or-create for dynamic sets: find the element for
+ * @key in any generation, or build a new one via @new and insert it.
+ * If a concurrent CPU inserted the same key between our lookup and
+ * insert, rhashtable_lookup_insert_key() fails and the freshly built
+ * element is destroyed; false is returned in that case.
+ */
+static bool nft_hash_update(struct nft_set *set, const u32 *key,
+                           void *(*new)(struct nft_set *,
+                                        const struct nft_expr *,
+                                        struct nft_regs *regs),
+                           const struct nft_expr *expr,
+                           struct nft_regs *regs,
+                           const struct nft_set_ext **ext)
+{
+       struct nft_hash *priv = nft_set_priv(set);
+       struct nft_hash_elem *he;
+       struct nft_hash_cmp_arg arg = {
+               .genmask = NFT_GENMASK_ANY,
+               .set     = set,
+               .key     = key,
+       };
+
+       he = rhashtable_lookup_fast(&priv->ht, &arg, nft_hash_params);
+       if (he != NULL)
+               goto out;
+
+       he = new(set, expr, regs);
+       if (he == NULL)
+               goto err1;
+       if (rhashtable_lookup_insert_key(&priv->ht, &arg, &he->node,
+                                        nft_hash_params))
+               goto err2;
+out:
+       *ext = &he->ext;
+       return true;
+
+err2:
+       nft_set_elem_destroy(set, he);
+err1:
+       return false;
+}
+
+/* Control-plane insertion, keyed with the next-generation mask.  If an
+ * element with the same key already exists it is handed back through
+ * @ext and -EEXIST is returned, which lets the core honour NLM_F_EXCL.
+ */
+static int nft_hash_insert(const struct net *net, const struct nft_set *set,
+                          const struct nft_set_elem *elem,
+                          struct nft_set_ext **ext)
+{
+       struct nft_hash *priv = nft_set_priv(set);
+       struct nft_hash_elem *he = elem->priv;
+       struct nft_hash_cmp_arg arg = {
+               .genmask = nft_genmask_next(net),
+               .set     = set,
+               .key     = elem->key.val.data,
+       };
+       struct nft_hash_elem *prev;
+
+       prev = rhashtable_lookup_get_insert_key(&priv->ht, &arg, &he->node,
+                                              nft_hash_params);
+       if (IS_ERR(prev))
+               return PTR_ERR(prev);
+       if (prev) {
+               *ext = &prev->ext;
+               return -EEXIST;
+       }
+       return 0;
+}
+
+/* Commit path: flip the element into the active generation and clear
+ * the busy mark taken during deactivation/GC contention.
+ */
+static void nft_hash_activate(const struct net *net, const struct nft_set *set,
+                             const struct nft_set_elem *elem)
+{
+       struct nft_hash_elem *he = elem->priv;
+
+       nft_set_elem_change_active(net, set, &he->ext);
+       nft_set_elem_clear_busy(&he->ext);
+}
+
+/* Mark the element matching @elem inactive in the next generation and
+ * return it, or NULL if it was not found or is owned by a concurrent
+ * operation.  NOTE(review): the busy mark appears to arbitrate against
+ * the GC — presumably nft_set_elem_mark_busy() returns the previous
+ * busy state; confirm against the set-extension helpers.
+ */
+static void *nft_hash_deactivate(const struct net *net,
+                                const struct nft_set *set,
+                                const struct nft_set_elem *elem)
+{
+       struct nft_hash *priv = nft_set_priv(set);
+       struct nft_hash_elem *he;
+       struct nft_hash_cmp_arg arg = {
+               .genmask = nft_genmask_next(net),
+               .set     = set,
+               .key     = elem->key.val.data,
+       };
+
+       rcu_read_lock();
+       he = rhashtable_lookup_fast(&priv->ht, &arg, nft_hash_params);
+       if (he != NULL) {
+               if (!nft_set_elem_mark_busy(&he->ext) ||
+                   !nft_is_active(net, &he->ext))
+                       nft_set_elem_change_active(net, set, &he->ext);
+               else
+                       he = NULL;
+       }
+       rcu_read_unlock();
+
+       return he;
+}
+
+/* Unlink a (previously deactivated) element from the hash table. */
+static void nft_hash_remove(const struct nft_set *set,
+                           const struct nft_set_elem *elem)
+{
+       struct nft_hash *priv = nft_set_priv(set);
+       struct nft_hash_elem *he = elem->priv;
+
+       rhashtable_remove_fast(&priv->ht, &he->node, nft_hash_params);
+}
+
+/* Iterate all live elements, honouring iter->skip, and invoke
+ * iter->fn on each; the first negative return from the callback or a
+ * walker error aborts the walk with iter->err set.
+ */
+static void nft_hash_walk(const struct nft_ctx *ctx, const struct nft_set *set,
+                         struct nft_set_iter *iter)
+{
+       struct nft_hash *priv = nft_set_priv(set);
+       struct nft_hash_elem *he;
+       struct rhashtable_iter hti;
+       struct nft_set_elem elem;
+       int err;
+
+       err = rhashtable_walk_init(&priv->ht, &hti, GFP_KERNEL);
+       iter->err = err;
+       if (err)
+               return;
+
+       err = rhashtable_walk_start(&hti);
+       if (err && err != -EAGAIN) {
+               iter->err = err;
+               goto out;
+       }
+
+       while ((he = rhashtable_walk_next(&hti))) {
+               if (IS_ERR(he)) {
+                       err = PTR_ERR(he);
+                       /* -EAGAIN signals a concurrent table resize;
+                        * keep iterating, anything else is fatal.
+                        */
+                       if (err != -EAGAIN) {
+                               iter->err = err;
+                               goto out;
+                       }
+
+                       continue;
+               }
+
+               if (iter->count < iter->skip)
+                       goto cont;
+               if (nft_set_elem_expired(&he->ext))
+                       goto cont;
+               if (!nft_set_elem_active(&he->ext, iter->genmask))
+                       goto cont;
+
+               elem.priv = he;
+
+               iter->err = iter->fn(ctx, set, iter, &elem);
+               if (iter->err < 0)
+                       goto out;
+
+cont:
+               iter->count++;
+       }
+
+out:
+       rhashtable_walk_stop(&hti);
+       rhashtable_walk_exit(&hti);
+}
+
+/* Periodic garbage collection: walk the table, batch up expired
+ * elements that are not busy (i.e. not owned by a concurrent
+ * transaction), unlink them and drop the set's element count.  The
+ * work always re-arms itself at the set's GC interval.
+ */
+static void nft_hash_gc(struct work_struct *work)
+{
+       struct nft_set *set;
+       struct nft_hash_elem *he;
+       struct nft_hash *priv;
+       struct nft_set_gc_batch *gcb = NULL;
+       struct rhashtable_iter hti;
+       int err;
+
+       priv = container_of(work, struct nft_hash, gc_work.work);
+       set  = nft_set_container_of(priv);
+
+       err = rhashtable_walk_init(&priv->ht, &hti, GFP_KERNEL);
+       if (err)
+               goto schedule;
+
+       err = rhashtable_walk_start(&hti);
+       if (err && err != -EAGAIN)
+               goto out;
+
+       while ((he = rhashtable_walk_next(&hti))) {
+               if (IS_ERR(he)) {
+                       /* -EAGAIN: table resize in progress, continue. */
+                       if (PTR_ERR(he) != -EAGAIN)
+                               goto out;
+                       continue;
+               }
+
+               if (!nft_set_elem_expired(&he->ext))
+                       continue;
+               if (nft_set_elem_mark_busy(&he->ext))
+                       continue;
+
+               gcb = nft_set_gc_batch_check(set, gcb, GFP_ATOMIC);
+               if (gcb == NULL)
+                       goto out;
+               rhashtable_remove_fast(&priv->ht, &he->node, nft_hash_params);
+               atomic_dec(&set->nelems);
+               nft_set_gc_batch_add(gcb, he);
+       }
+out:
+       rhashtable_walk_stop(&hti);
+       rhashtable_walk_exit(&hti);
+
+       nft_set_gc_batch_complete(gcb);
+schedule:
+       queue_delayed_work(system_power_efficient_wq, &priv->gc_work,
+                          nft_set_gc_interval(set));
+}
+
+/* Size of the per-set private area allocated by the core. */
+static unsigned int nft_hash_privsize(const struct nlattr * const nla[])
+{
+       return sizeof(struct nft_hash);
+}
+
+/* rhashtable configuration shared by all hash sets; key_len is filled
+ * in per-set by nft_hash_init().
+ */
+static const struct rhashtable_params nft_hash_params = {
+       .head_offset            = offsetof(struct nft_hash_elem, node),
+       .hashfn                 = nft_hash_key,
+       .obj_hashfn             = nft_hash_obj,
+       .obj_cmpfn              = nft_hash_cmp,
+       .automatic_shrinking    = true,
+};
+
+/* Instantiate the hash table for a new set.  The element hint falls
+ * back to NFT_HASH_ELEMENT_HINT when userspace gave no size.  The GC
+ * work is only armed for sets with element timeouts.
+ */
+static int nft_hash_init(const struct nft_set *set,
+                        const struct nft_set_desc *desc,
+                        const struct nlattr * const tb[])
+{
+       struct nft_hash *priv = nft_set_priv(set);
+       struct rhashtable_params params = nft_hash_params;
+       int err;
+
+       params.nelem_hint = desc->size ?: NFT_HASH_ELEMENT_HINT;
+       params.key_len    = set->klen;
+
+       err = rhashtable_init(&priv->ht, &params);
+       if (err < 0)
+               return err;
+
+       INIT_DEFERRABLE_WORK(&priv->gc_work, nft_hash_gc);
+       if (set->flags & NFT_SET_TIMEOUT)
+               queue_delayed_work(system_power_efficient_wq, &priv->gc_work,
+                                  nft_set_gc_interval(set));
+       return 0;
+}
+
+/* rhashtable free callback: @arg carries the owning set. */
+static void nft_hash_elem_destroy(void *ptr, void *arg)
+{
+       nft_set_elem_destroy((const struct nft_set *)arg, ptr);
+}
+
+/* Tear down a set: stop the GC work first so it cannot race with the
+ * table destruction, then free the table and all remaining elements.
+ */
+static void nft_hash_destroy(const struct nft_set *set)
+{
+       struct nft_hash *priv = nft_set_priv(set);
+
+       cancel_delayed_work_sync(&priv->gc_work);
+       rhashtable_free_and_destroy(&priv->ht, nft_hash_elem_destroy,
+                                   (void *)set);
+}
+
+/* Memory/complexity estimate used by the core to pick a set backend:
+ * O(1) lookups; size derived from the declared set size when given,
+ * otherwise a per-element approximation.
+ */
+static bool nft_hash_estimate(const struct nft_set_desc *desc, u32 features,
+                             struct nft_set_estimate *est)
+{
+       unsigned int esize;
+
+       esize = sizeof(struct nft_hash_elem);
+       if (desc->size) {
+               est->size = sizeof(struct nft_hash) +
+                           roundup_pow_of_two(desc->size * 4 / 3) *
+                           sizeof(struct nft_hash_elem *) +
+                           desc->size * esize;
+       } else {
+               /* Resizing happens when the load drops below 30% or goes
+                * above 75%. The average of 52.5% load (approximated by 50%)
+                * is used for the size estimation of the hash buckets,
+                * meaning we calculate two buckets per element.
+                */
+               est->size = esize + 2 * sizeof(struct nft_hash_elem *);
+       }
+
+       est->class = NFT_SET_CLASS_O_1;
+
+       return true;
+}
+
+/* Set backend registration: the hash backend supports maps and element
+ * timeouts; interval sets are served by the rbtree backend instead.
+ */
+static struct nft_set_ops nft_hash_ops __read_mostly = {
+       .privsize       = nft_hash_privsize,
+       .elemsize       = offsetof(struct nft_hash_elem, ext),
+       .estimate       = nft_hash_estimate,
+       .init           = nft_hash_init,
+       .destroy        = nft_hash_destroy,
+       .insert         = nft_hash_insert,
+       .activate       = nft_hash_activate,
+       .deactivate     = nft_hash_deactivate,
+       .remove         = nft_hash_remove,
+       .lookup         = nft_hash_lookup,
+       .update         = nft_hash_update,
+       .walk           = nft_hash_walk,
+       .features       = NFT_SET_MAP | NFT_SET_TIMEOUT,
+       .owner          = THIS_MODULE,
+};
+
+static int __init nft_hash_module_init(void)
+{
+       return nft_register_set(&nft_hash_ops);
+}
+
+static void __exit nft_hash_module_exit(void)
+{
+       nft_unregister_set(&nft_hash_ops);
+}
+
+module_init(nft_hash_module_init);
+module_exit(nft_hash_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_ALIAS_NFT_SET();
diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c
new file mode 100644 (file)
index 0000000..38b5bda
--- /dev/null
@@ -0,0 +1,320 @@
+/*
+ * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Development of this code funded by Astaro AG (http://www.astaro.com/)
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/list.h>
+#include <linux/rbtree.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables.h>
+
+/* Single global lock serializing all rbtree set lookups and updates. */
+static DEFINE_SPINLOCK(nft_rbtree_lock);
+
+/* Per-set private data: root of the element tree. */
+struct nft_rbtree {
+       struct rb_root          root;
+};
+
+/* One set element: rbtree linkage plus the nft extension area, which
+ * holds at least the key and optionally flags (see the accessors below).
+ */
+struct nft_rbtree_elem {
+       struct rb_node          node;
+       struct nft_set_ext      ext;
+};
+
+/* Tell whether @rbe marks the end of an interval, i.e. its extension
+ * area carries flags and NFT_SET_ELEM_INTERVAL_END is set in them.
+ */
+static bool nft_rbtree_interval_end(const struct nft_rbtree_elem *rbe)
+{
+       if (!nft_set_ext_exists(&rbe->ext, NFT_SET_EXT_FLAGS))
+               return false;
+
+       return (*nft_set_ext_flags(&rbe->ext) & NFT_SET_ELEM_INTERVAL_END) != 0;
+}
+
+/* True when @this matches @interval's key over the set's key length. */
+static bool nft_rbtree_equal(const struct nft_set *set, const void *this,
+                            const struct nft_rbtree_elem *interval)
+{
+       return !memcmp(this, nft_set_ext_key(&interval->ext), set->klen);
+}
+
+/* Find the element matching @key in the current generation.
+ *
+ * An exact match on an active element wins immediately, unless that
+ * element is an interval end (the key sits on an excluded boundary).
+ * With no exact match, interval sets fall back to the nearest smaller
+ * element ("interval") provided it is an active interval start.
+ *
+ * Returns true and sets *ext on a match, false otherwise.  The whole
+ * walk runs under nft_rbtree_lock.
+ */
+static bool nft_rbtree_lookup(const struct net *net, const struct nft_set *set,
+                             const u32 *key, const struct nft_set_ext **ext)
+{
+       const struct nft_rbtree *priv = nft_set_priv(set);
+       const struct nft_rbtree_elem *rbe, *interval = NULL;
+       u8 genmask = nft_genmask_cur(net);
+       const struct rb_node *parent;
+       const void *this;
+       int d;
+
+       spin_lock_bh(&nft_rbtree_lock);
+       parent = priv->root.rb_node;
+       while (parent != NULL) {
+               rbe = rb_entry(parent, struct nft_rbtree_elem, node);
+
+               this = nft_set_ext_key(&rbe->ext);
+               d = memcmp(this, key, set->klen);
+               if (d < 0) {
+                       parent = parent->rb_left;
+                       /* In case of adjacent ranges, we always see the high
+                        * part of the range in first place, before the low one.
+                        * So don't update interval if the keys are equal.
+                        */
+                       if (interval && nft_rbtree_equal(set, this, interval))
+                               continue;
+                       interval = rbe;
+               } else if (d > 0)
+                       parent = parent->rb_right;
+               else {
+                       /* Equal key in a stale generation: keep descending. */
+                       if (!nft_set_elem_active(&rbe->ext, genmask)) {
+                               parent = parent->rb_left;
+                               continue;
+                       }
+                       if (nft_rbtree_interval_end(rbe))
+                               goto out;
+                       spin_unlock_bh(&nft_rbtree_lock);
+
+                       *ext = &rbe->ext;
+                       return true;
+               }
+       }
+
+       /* No exact match: try the closest preceding interval start. */
+       if (set->flags & NFT_SET_INTERVAL && interval != NULL &&
+           nft_set_elem_active(&interval->ext, genmask) &&
+           !nft_rbtree_interval_end(interval)) {
+               spin_unlock_bh(&nft_rbtree_lock);
+               *ext = &interval->ext;
+               return true;
+       }
+out:
+       spin_unlock_bh(&nft_rbtree_lock);
+       return false;
+}
+
+/* Insert @new keyed by its extension key.  For interval sets, an
+ * end-of-interval element sorts before the start element carrying the
+ * same key.  If an element with the same key and interval side is
+ * already active in the next generation, *ext is set to it and -EEXIST
+ * is returned.
+ *
+ * Caller must hold nft_rbtree_lock.
+ *
+ * Fix: the previous version only updated @p on an equal key when the
+ * clashing element was active; hitting an *inactive* duplicate left @p
+ * untouched and spun the while loop forever.  The interval-side
+ * ordering is now applied unconditionally, and an inactive exact
+ * duplicate descends left so the walk always terminates.
+ */
+static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set,
+                              struct nft_rbtree_elem *new,
+                              struct nft_set_ext **ext)
+{
+       struct nft_rbtree *priv = nft_set_priv(set);
+       u8 genmask = nft_genmask_next(net);
+       struct nft_rbtree_elem *rbe;
+       struct rb_node *parent, **p;
+       int d;
+
+       parent = NULL;
+       p = &priv->root.rb_node;
+       while (*p != NULL) {
+               parent = *p;
+               rbe = rb_entry(parent, struct nft_rbtree_elem, node);
+               d = memcmp(nft_set_ext_key(&rbe->ext),
+                          nft_set_ext_key(&new->ext),
+                          set->klen);
+               if (d < 0) {
+                       p = &parent->rb_left;
+               } else if (d > 0) {
+                       p = &parent->rb_right;
+               } else {
+                       if (nft_rbtree_interval_end(rbe) &&
+                           !nft_rbtree_interval_end(new)) {
+                               p = &parent->rb_left;
+                       } else if (!nft_rbtree_interval_end(rbe) &&
+                                  nft_rbtree_interval_end(new)) {
+                               p = &parent->rb_right;
+                       } else if (nft_set_elem_active(&rbe->ext, genmask)) {
+                               *ext = &rbe->ext;
+                               return -EEXIST;
+                       } else {
+                               /* Inactive duplicate: keep descending so the
+                                * loop makes progress instead of spinning.
+                                */
+                               p = &parent->rb_left;
+                       }
+               }
+       }
+       rb_link_node(&new->node, parent, p);
+       rb_insert_color(&new->node, &priv->root);
+       return 0;
+}
+
+/* Serialized entry point for element insertion: take the set lock and
+ * delegate to __nft_rbtree_insert().
+ */
+static int nft_rbtree_insert(const struct net *net, const struct nft_set *set,
+                            const struct nft_set_elem *elem,
+                            struct nft_set_ext **ext)
+{
+       struct nft_rbtree_elem *rbe = elem->priv;
+       int ret;
+
+       spin_lock_bh(&nft_rbtree_lock);
+       ret = __nft_rbtree_insert(net, set, rbe, ext);
+       spin_unlock_bh(&nft_rbtree_lock);
+
+       return ret;
+}
+
+/* Unlink an element's node from the tree under the set lock.  The
+ * element's memory is not freed here.
+ */
+static void nft_rbtree_remove(const struct nft_set *set,
+                             const struct nft_set_elem *elem)
+{
+       struct nft_rbtree_elem *rbe = elem->priv;
+       struct nft_rbtree *priv = nft_set_priv(set);
+
+       spin_lock_bh(&nft_rbtree_lock);
+       rb_erase(&rbe->node, &priv->root);
+       spin_unlock_bh(&nft_rbtree_lock);
+}
+
+/* Activate @elem by handing its extension area to the core helper
+ * nft_set_elem_change_active(), which updates its generation state.
+ */
+static void nft_rbtree_activate(const struct net *net,
+                               const struct nft_set *set,
+                               const struct nft_set_elem *elem)
+{
+       struct nft_rbtree_elem *rbe = elem->priv;
+
+       nft_set_elem_change_active(net, set, &rbe->ext);
+}
+
+/* Find the tree element matching @elem - same key, same interval side,
+ * active in the next generation - change its active state and return
+ * it; NULL if no such element exists.
+ *
+ * NOTE(review): runs without taking nft_rbtree_lock - presumably the
+ * transaction path provides exclusion; confirm against callers.
+ */
+static void *nft_rbtree_deactivate(const struct net *net,
+                                  const struct nft_set *set,
+                                  const struct nft_set_elem *elem)
+{
+       const struct nft_rbtree *priv = nft_set_priv(set);
+       const struct rb_node *parent = priv->root.rb_node;
+       struct nft_rbtree_elem *rbe, *this = elem->priv;
+       u8 genmask = nft_genmask_next(net);
+       int d;
+
+       while (parent != NULL) {
+               rbe = rb_entry(parent, struct nft_rbtree_elem, node);
+
+               d = memcmp(nft_set_ext_key(&rbe->ext), &elem->key.val,
+                                          set->klen);
+               if (d < 0)
+                       parent = parent->rb_left;
+               else if (d > 0)
+                       parent = parent->rb_right;
+               else {
+                       /* Stale generation: keep looking for the live copy. */
+                       if (!nft_set_elem_active(&rbe->ext, genmask)) {
+                               parent = parent->rb_left;
+                               continue;
+                       }
+                       /* Same key but opposite interval side: descend toward
+                        * the side that can hold the matching element.
+                        */
+                       if (nft_rbtree_interval_end(rbe) &&
+                           !nft_rbtree_interval_end(this)) {
+                               parent = parent->rb_left;
+                               continue;
+                       } else if (!nft_rbtree_interval_end(rbe) &&
+                                  nft_rbtree_interval_end(this)) {
+                               parent = parent->rb_right;
+                               continue;
+                       }
+                       nft_set_elem_change_active(net, set, &rbe->ext);
+                       return rbe;
+               }
+       }
+       return NULL;
+}
+
+/* Invoke iter->fn on every element active in iter->genmask, resuming
+ * after iter->skip previously counted entries.  Iteration stops early
+ * when the callback reports an error, which is left in iter->err.
+ */
+static void nft_rbtree_walk(const struct nft_ctx *ctx,
+                           const struct nft_set *set,
+                           struct nft_set_iter *iter)
+{
+       const struct nft_rbtree *priv = nft_set_priv(set);
+       struct nft_set_elem elem;
+       struct rb_node *node;
+
+       spin_lock_bh(&nft_rbtree_lock);
+       for (node = rb_first(&priv->root); node; node = rb_next(node)) {
+               struct nft_rbtree_elem *rbe;
+
+               rbe = rb_entry(node, struct nft_rbtree_elem, node);
+
+               if (iter->count >= iter->skip &&
+                   nft_set_elem_active(&rbe->ext, iter->genmask)) {
+                       elem.priv = rbe;
+
+                       iter->err = iter->fn(ctx, set, iter, &elem);
+                       if (iter->err < 0)
+                               goto out_unlock;
+               }
+               iter->count++;
+       }
+out_unlock:
+       spin_unlock_bh(&nft_rbtree_lock);
+}
+
+/* Tell the core how much private storage one set instance requires. */
+static unsigned int nft_rbtree_privsize(const struct nlattr * const nla[])
+{
+       return sizeof(struct nft_rbtree);
+}
+
+/* Initialize a fresh set instance with an empty tree.  Cannot fail. */
+static int nft_rbtree_init(const struct nft_set *set,
+                          const struct nft_set_desc *desc,
+                          const struct nlattr * const nla[])
+{
+       struct nft_rbtree *tree = nft_set_priv(set);
+
+       tree->root = RB_ROOT;
+
+       return 0;
+}
+
+/* Tear down the whole set: erase and destroy every remaining element
+ * until the tree is empty.
+ */
+static void nft_rbtree_destroy(const struct nft_set *set)
+{
+       struct nft_rbtree *priv = nft_set_priv(set);
+       struct rb_node *node;
+
+       while ((node = priv->root.rb_node) != NULL) {
+               struct nft_rbtree_elem *rbe;
+
+               rb_erase(node, &priv->root);
+               rbe = rb_entry(node, struct nft_rbtree_elem, node);
+               nft_set_elem_destroy(set, rbe);
+       }
+}
+
+/* Report the memory footprint (total when the element count is known,
+ * per-element otherwise) and the O(log n) lookup class of this backend.
+ */
+static bool nft_rbtree_estimate(const struct nft_set_desc *desc, u32 features,
+                               struct nft_set_estimate *est)
+{
+       const unsigned int elem_size = sizeof(struct nft_rbtree_elem);
+
+       if (desc->size)
+               est->size = sizeof(struct nft_rbtree) + desc->size * elem_size;
+       else
+               est->size = elem_size;
+
+       est->class = NFT_SET_CLASS_O_LOG_N;
+
+       return true;
+}
+
+/* Backend registration for the rbtree set type: supports map and
+ * interval features with O(log n) lookups (see nft_rbtree_estimate).
+ */
+static struct nft_set_ops nft_rbtree_ops __read_mostly = {
+       .privsize       = nft_rbtree_privsize,
+       .elemsize       = offsetof(struct nft_rbtree_elem, ext),
+       .estimate       = nft_rbtree_estimate,
+       .init           = nft_rbtree_init,
+       .destroy        = nft_rbtree_destroy,
+       .insert         = nft_rbtree_insert,
+       .remove         = nft_rbtree_remove,
+       .deactivate     = nft_rbtree_deactivate,
+       .activate       = nft_rbtree_activate,
+       .lookup         = nft_rbtree_lookup,
+       .walk           = nft_rbtree_walk,
+       .features       = NFT_SET_INTERVAL | NFT_SET_MAP,
+       .owner          = THIS_MODULE,
+};
+
+/* Register the rbtree set backend with the nf_tables core on load,
+ * unregister it on unload.
+ */
+static int __init nft_rbtree_module_init(void)
+{
+       return nft_register_set(&nft_rbtree_ops);
+}
+
+static void __exit nft_rbtree_module_exit(void)
+{
+       nft_unregister_set(&nft_rbtree_ops);
+}
+
+module_init(nft_rbtree_module_init);
+module_exit(nft_rbtree_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_ALIAS_NFT_SET();
index 188404b..a3b8f69 100644 (file)
@@ -233,10 +233,8 @@ conntrack_mt(const struct sk_buff *skb, struct xt_action_param *par,
                return false;
 
        if (info->match_flags & XT_CONNTRACK_EXPIRES) {
-               unsigned long expires = 0;
+               unsigned long expires = nf_ct_expires(ct) / HZ;
 
-               if (timer_pending(&ct->timeout))
-                       expires = (ct->timeout.expires - jiffies) / HZ;
                if ((expires >= info->expires_min &&
                    expires <= info->expires_max) ^
                    !(info->invert_flags & XT_CONNTRACK_EXPIRES))
index e5f1898..bb33598 100644 (file)
@@ -107,8 +107,8 @@ static int physdev_mt_check(const struct xt_mtchk_param *par)
             info->invert & XT_PHYSDEV_OP_BRIDGED) &&
            par->hook_mask & ((1 << NF_INET_LOCAL_OUT) |
            (1 << NF_INET_FORWARD) | (1 << NF_INET_POST_ROUTING))) {
-               pr_info("using --physdev-out and --physdev-is-out are only"
-                       "supported in the FORWARD and POSTROUTING chains with"
+               pr_info("using --physdev-out and --physdev-is-out are only "
+                       "supported in the FORWARD and POSTROUTING chains with "
                        "bridged traffic.\n");
                if (par->hook_mask & (1 << NF_INET_LOCAL_OUT))
                        return -EINVAL;