net: Allow accepted sockets to be bound to l3mdev domain
author David Ahern <dsa@cumulusnetworks.com>
Wed, 16 Dec 2015 21:20:44 +0000 (13:20 -0800)
committer David S. Miller <davem@davemloft.net>
Fri, 18 Dec 2015 19:43:38 +0000 (14:43 -0500)
Allow accepted sockets to derive their sk_bound_dev_if setting from the
l3mdev domain in which the packets originated. A sysctl setting is added
to control the behavior, which is similar to sk_mark and
sysctl_tcp_fwmark_accept.

This effectively allows a process to have a "VRF-global" listen socket,
with child sockets bound to the VRF device in which the packet originated.
Similar behavior can be achieved using sk_mark, but a solution using marks
is incomplete as it does not handle duplicate addresses in different L3
domains/VRFs. Allowing sockets to inherit the sk_bound_dev_if from l3mdev
domain provides a complete solution.

Signed-off-by: David Ahern <dsa@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Documentation/networking/ip-sysctl.txt
include/net/inet_sock.h
include/net/netns/ipv4.h
net/ipv4/syncookies.c
net/ipv4/sysctl_net_ipv4.c
net/ipv4/tcp_input.c
net/ipv4/tcp_ipv4.c
net/ipv6/syncookies.c

index 5de632e..ceb44a0 100644 (file)
@@ -335,6 +335,14 @@ tcp_keepalive_intvl - INTEGER
        after probes started. Default value: 75sec i.e. connection
        will be aborted after ~11 minutes of retries.
 
+tcp_l3mdev_accept - BOOLEAN
+       Enables child sockets to inherit the L3 master device index.
+       Enabling this option allows a "global" listen socket to work
+       across L3 master domains (e.g., VRFs) with connected sockets
+       derived from the listen socket to be bound to the L3 domain in
+       which the packets originated. Only valid when the kernel was
+       compiled with CONFIG_NET_L3_MASTER_DEV.
+
 tcp_low_latency - BOOLEAN
        If set, the TCP stack makes decisions that prefer lower
        latency as opposed to higher throughput.  By default, this
index 625bdf9..012b1f9 100644 (file)
@@ -28,6 +28,7 @@
 #include <net/request_sock.h>
 #include <net/netns/hash.h>
 #include <net/tcp_states.h>
+#include <net/l3mdev.h>
 
 /** struct ip_options - IP Options
  *
@@ -113,6 +114,19 @@ static inline u32 inet_request_mark(const struct sock *sk, struct sk_buff *skb)
        return sk->sk_mark;
 }
 
+static inline int inet_request_bound_dev_if(const struct sock *sk,
+                                           struct sk_buff *skb)
+{
+#ifdef CONFIG_NET_L3_MASTER_DEV
+       struct net *net = sock_net(sk);
+
+       if (!sk->sk_bound_dev_if && net->ipv4.sysctl_tcp_l3mdev_accept)
+               return l3mdev_master_ifindex_by_index(net, skb->skb_iif);
+#endif
+
+       return sk->sk_bound_dev_if;
+}
+
 struct inet_cork {
        unsigned int            flags;
        __be32                  addr;
index c68926b..d75be32 100644 (file)
@@ -86,6 +86,9 @@ struct netns_ipv4 {
 
        int sysctl_fwmark_reflect;
        int sysctl_tcp_fwmark_accept;
+#ifdef CONFIG_NET_L3_MASTER_DEV
+       int sysctl_tcp_l3mdev_accept;
+#endif
        int sysctl_tcp_mtu_probing;
        int sysctl_tcp_base_mss;
        int sysctl_tcp_probe_threshold;
index 4cbe9f0..643a86c 100644 (file)
@@ -351,7 +351,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
        treq->snt_synack.v64    = 0;
        treq->tfo_listener      = false;
 
-       ireq->ir_iif = sk->sk_bound_dev_if;
+       ireq->ir_iif = inet_request_bound_dev_if(sk, skb);
 
        /* We throwed the options of the initial SYN away, so we hope
         * the ACK carries the same options again (see RFC1122 4.2.3.8)
@@ -371,7 +371,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
         * hasn't changed since we received the original syn, but I see
         * no easy way to do this.
         */
-       flowi4_init_output(&fl4, sk->sk_bound_dev_if, ireq->ir_mark,
+       flowi4_init_output(&fl4, ireq->ir_iif, ireq->ir_mark,
                           RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE, IPPROTO_TCP,
                           inet_sk_flowi_flags(sk),
                           opt->srr ? opt->faddr : ireq->ir_rmt_addr,
index a0bd7a5..41ff1f8 100644 (file)
@@ -915,6 +915,17 @@ static struct ctl_table ipv4_net_table[] = {
                .mode           = 0644,
                .proc_handler   = proc_dointvec,
        },
+#ifdef CONFIG_NET_L3_MASTER_DEV
+       {
+               .procname       = "tcp_l3mdev_accept",
+               .data           = &init_net.ipv4.sysctl_tcp_l3mdev_accept,
+               .maxlen         = sizeof(int),
+               .mode           = 0644,
+               .proc_handler   = proc_dointvec_minmax,
+               .extra1         = &zero,
+               .extra2         = &one,
+       },
+#endif
        {
                .procname       = "tcp_mtu_probing",
                .data           = &init_net.ipv4.sysctl_tcp_mtu_probing,
index 2d656ee..7b1fddc 100644 (file)
@@ -6204,7 +6204,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
        tcp_openreq_init(req, &tmp_opt, skb, sk);
 
        /* Note: tcp_v6_init_req() might override ir_iif for link locals */
-       inet_rsk(req)->ir_iif = sk->sk_bound_dev_if;
+       inet_rsk(req)->ir_iif = inet_request_bound_dev_if(sk, skb);
 
        af_ops->init_req(req, sk, skb);
 
index 205e674..46e92fb 100644 (file)
@@ -1276,6 +1276,7 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
        ireq                  = inet_rsk(req);
        sk_daddr_set(newsk, ireq->ir_rmt_addr);
        sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
+       newsk->sk_bound_dev_if = ireq->ir_iif;
        newinet->inet_saddr           = ireq->ir_loc_addr;
        inet_opt              = ireq->opt;
        rcu_assign_pointer(newinet->inet_opt, inet_opt);
index eaf7ac4..2906ef2 100644 (file)
@@ -193,7 +193,7 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb)
                ireq->pktopts = skb;
        }
 
-       ireq->ir_iif = sk->sk_bound_dev_if;
+       ireq->ir_iif = inet_request_bound_dev_if(sk, skb);
        /* So that link locals have meaning */
        if (!sk->sk_bound_dev_if &&
            ipv6_addr_type(&ireq->ir_v6_rmt_addr) & IPV6_ADDR_LINKLOCAL)
@@ -224,7 +224,7 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb)
                fl6.daddr = ireq->ir_v6_rmt_addr;
                final_p = fl6_update_dst(&fl6, rcu_dereference(np->opt), &final);
                fl6.saddr = ireq->ir_v6_loc_addr;
-               fl6.flowi6_oif = sk->sk_bound_dev_if;
+               fl6.flowi6_oif = ireq->ir_iif;
                fl6.flowi6_mark = ireq->ir_mark;
                fl6.fl6_dport = ireq->ir_rmt_port;
                fl6.fl6_sport = inet_sk(sk)->inet_sport;