Merge branch 'for-4.9/dax' into libnvdimm-for-next
[cascardo/linux.git] / net / ipv4 / tcp_ipv4.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              Implementation of the Transmission Control Protocol(TCP).
7  *
8  *              IPv4 specific functions
9  *
10  *
11  *              code split from:
12  *              linux/ipv4/tcp.c
13  *              linux/ipv4/tcp_input.c
14  *              linux/ipv4/tcp_output.c
15  *
16  *              See tcp.c for author information
17  *
18  *      This program is free software; you can redistribute it and/or
19  *      modify it under the terms of the GNU General Public License
20  *      as published by the Free Software Foundation; either version
21  *      2 of the License, or (at your option) any later version.
22  */
23
24 /*
25  * Changes:
26  *              David S. Miller :       New socket lookup architecture.
27  *                                      This code is dedicated to John Dyson.
28  *              David S. Miller :       Change semantics of established hash,
29  *                                      half is devoted to TIME_WAIT sockets
30  *                                      and the rest go in the other half.
31  *              Andi Kleen :            Add support for syncookies and fixed
32  *                                      some bugs: ip options weren't passed to
33  *                                      the TCP layer, missed a check for an
34  *                                      ACK bit.
35  *              Andi Kleen :            Implemented fast path mtu discovery.
36  *                                      Fixed many serious bugs in the
37  *                                      request_sock handling and moved
38  *                                      most of it into the af independent code.
39  *                                      Added tail drop and some other bugfixes.
40  *                                      Added new listen semantics.
41  *              Mike McLagan    :       Routing by source
42  *      Juan Jose Ciarlante:            ip_dynaddr bits
43  *              Andi Kleen:             various fixes.
44  *      Vitaly E. Lavrov        :       Transparent proxy revived after year
45  *                                      coma.
46  *      Andi Kleen              :       Fix new listen.
47  *      Andi Kleen              :       Fix accept error reporting.
48  *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
49  *      Alexey Kuznetsov                allow both IPv4 and IPv6 sockets to bind
50  *                                      a single port at the same time.
51  */
52
53 #define pr_fmt(fmt) "TCP: " fmt
54
55 #include <linux/bottom_half.h>
56 #include <linux/types.h>
57 #include <linux/fcntl.h>
58 #include <linux/module.h>
59 #include <linux/random.h>
60 #include <linux/cache.h>
61 #include <linux/jhash.h>
62 #include <linux/init.h>
63 #include <linux/times.h>
64 #include <linux/slab.h>
65
66 #include <net/net_namespace.h>
67 #include <net/icmp.h>
68 #include <net/inet_hashtables.h>
69 #include <net/tcp.h>
70 #include <net/transp_v6.h>
71 #include <net/ipv6.h>
72 #include <net/inet_common.h>
73 #include <net/timewait_sock.h>
74 #include <net/xfrm.h>
75 #include <net/secure_seq.h>
76 #include <net/busy_poll.h>
77
78 #include <linux/inet.h>
79 #include <linux/ipv6.h>
80 #include <linux/stddef.h>
81 #include <linux/proc_fs.h>
82 #include <linux/seq_file.h>
83
84 #include <crypto/hash.h>
85 #include <linux/scatterlist.h>
86
/* Read by tcp_twsk_unique(): when non-zero, a TIME-WAIT bucket whose last
 * timestamp is more than one second old may be reused by a new connection.
 */
87 int sysctl_tcp_tw_reuse __read_mostly;
/* NOTE(review): not referenced in the visible part of this file; it is
 * exported, so presumably consumed elsewhere -- confirm.
 */
88 int sysctl_tcp_low_latency __read_mostly;
89 EXPORT_SYMBOL(sysctl_tcp_low_latency);
90
/* Forward declaration: tcp_v4_send_reset() and tcp_v4_send_ack() call this
 * to sign the replies they build; the definition is not in this part of
 * the file.
 */
91 #ifdef CONFIG_TCP_MD5SIG
92 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
93                                __be32 daddr, __be32 saddr, const struct tcphdr *th);
94 #endif
95
/* Socket hash tables handed to __inet_lookup_established() and
 * __inet_lookup_listener() by the ICMP error and RST paths below.
 */
96 struct inet_hashinfo tcp_hashinfo;
97 EXPORT_SYMBOL(tcp_hashinfo);
98
99 static  __u32 tcp_v4_init_sequence(const struct sk_buff *skb)
100 {
101         return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
102                                           ip_hdr(skb)->saddr,
103                                           tcp_hdr(skb)->dest,
104                                           tcp_hdr(skb)->source);
105 }
106
107 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
108 {
109         const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
110         struct tcp_sock *tp = tcp_sk(sk);
111
112         /* With PAWS, it is safe from the viewpoint
113            of data integrity. Even without PAWS it is safe provided sequence
114            spaces do not overlap i.e. at data rates <= 80Mbit/sec.
115
116            Actually, the idea is close to VJ's one, only timestamp cache is
117            held not per host, but per port pair and TW bucket is used as state
118            holder.
119
120            If TW bucket has been already destroyed we fall back to VJ's scheme
121            and use initial timestamp retrieved from peer table.
122          */
123         if (tcptw->tw_ts_recent_stamp &&
124             (!twp || (sysctl_tcp_tw_reuse &&
125                              get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
126                 tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
127                 if (tp->write_seq == 0)
128                         tp->write_seq = 1;
129                 tp->rx_opt.ts_recent       = tcptw->tw_ts_recent;
130                 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
131                 sock_hold(sktw);
132                 return 1;
133         }
134
135         return 0;
136 }
137 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
138
139 /* This will initiate an outgoing connection. */
140 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
141 {
142         struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
143         struct inet_sock *inet = inet_sk(sk);
144         struct tcp_sock *tp = tcp_sk(sk);
145         __be16 orig_sport, orig_dport;
146         __be32 daddr, nexthop;
147         struct flowi4 *fl4;
148         struct rtable *rt;
149         int err;
150         struct ip_options_rcu *inet_opt;
151
152         if (addr_len < sizeof(struct sockaddr_in))
153                 return -EINVAL;
154
155         if (usin->sin_family != AF_INET)
156                 return -EAFNOSUPPORT;
157
158         nexthop = daddr = usin->sin_addr.s_addr;
159         inet_opt = rcu_dereference_protected(inet->inet_opt,
160                                              lockdep_sock_is_held(sk));
161         if (inet_opt && inet_opt->opt.srr) {
162                 if (!daddr)
163                         return -EINVAL;
164                 nexthop = inet_opt->opt.faddr;
165         }
166
167         orig_sport = inet->inet_sport;
168         orig_dport = usin->sin_port;
169         fl4 = &inet->cork.fl.u.ip4;
170         rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
171                               RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
172                               IPPROTO_TCP,
173                               orig_sport, orig_dport, sk);
174         if (IS_ERR(rt)) {
175                 err = PTR_ERR(rt);
176                 if (err == -ENETUNREACH)
177                         IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
178                 return err;
179         }
180
181         if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
182                 ip_rt_put(rt);
183                 return -ENETUNREACH;
184         }
185
186         if (!inet_opt || !inet_opt->opt.srr)
187                 daddr = fl4->daddr;
188
189         if (!inet->inet_saddr)
190                 inet->inet_saddr = fl4->saddr;
191         sk_rcv_saddr_set(sk, inet->inet_saddr);
192
193         if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
194                 /* Reset inherited state */
195                 tp->rx_opt.ts_recent       = 0;
196                 tp->rx_opt.ts_recent_stamp = 0;
197                 if (likely(!tp->repair))
198                         tp->write_seq      = 0;
199         }
200
201         if (tcp_death_row.sysctl_tw_recycle &&
202             !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr)
203                 tcp_fetch_timewait_stamp(sk, &rt->dst);
204
205         inet->inet_dport = usin->sin_port;
206         sk_daddr_set(sk, daddr);
207
208         inet_csk(sk)->icsk_ext_hdr_len = 0;
209         if (inet_opt)
210                 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
211
212         tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
213
214         /* Socket identity is still unknown (sport may be zero).
215          * However we set state to SYN-SENT and not releasing socket
216          * lock select source port, enter ourselves into the hash tables and
217          * complete initialization after this.
218          */
219         tcp_set_state(sk, TCP_SYN_SENT);
220         err = inet_hash_connect(&tcp_death_row, sk);
221         if (err)
222                 goto failure;
223
224         sk_set_txhash(sk);
225
226         rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
227                                inet->inet_sport, inet->inet_dport, sk);
228         if (IS_ERR(rt)) {
229                 err = PTR_ERR(rt);
230                 rt = NULL;
231                 goto failure;
232         }
233         /* OK, now commit destination to socket.  */
234         sk->sk_gso_type = SKB_GSO_TCPV4;
235         sk_setup_caps(sk, &rt->dst);
236
237         if (!tp->write_seq && likely(!tp->repair))
238                 tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
239                                                            inet->inet_daddr,
240                                                            inet->inet_sport,
241                                                            usin->sin_port);
242
243         inet->inet_id = tp->write_seq ^ jiffies;
244
245         err = tcp_connect(sk);
246
247         rt = NULL;
248         if (err)
249                 goto failure;
250
251         return 0;
252
253 failure:
254         /*
255          * This unhashes the socket and releases the local port,
256          * if necessary.
257          */
258         tcp_set_state(sk, TCP_CLOSE);
259         ip_rt_put(rt);
260         sk->sk_route_caps = 0;
261         inet->inet_dport = 0;
262         return err;
263 }
264 EXPORT_SYMBOL(tcp_v4_connect);
265
266 /*
267  * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
268  * It can be called through tcp_release_cb() if socket was owned by user
269  * at the time tcp_v4_err() was called to handle ICMP message.
270  */
271 void tcp_v4_mtu_reduced(struct sock *sk)
272 {
273         struct dst_entry *dst;
274         struct inet_sock *inet = inet_sk(sk);
275         u32 mtu = tcp_sk(sk)->mtu_info;
276
277         dst = inet_csk_update_pmtu(sk, mtu);
278         if (!dst)
279                 return;
280
281         /* Something is about to be wrong... Remember soft error
282          * for the case, if this connection will not able to recover.
283          */
284         if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
285                 sk->sk_err_soft = EMSGSIZE;
286
287         mtu = dst_mtu(dst);
288
289         if (inet->pmtudisc != IP_PMTUDISC_DONT &&
290             ip_sk_accept_pmtu(sk) &&
291             inet_csk(sk)->icsk_pmtu_cookie > mtu) {
292                 tcp_sync_mss(sk, mtu);
293
294                 /* Resend the TCP packet because it's
295                  * clear that the old packet has been
296                  * dropped. This is the new "fast" path mtu
297                  * discovery.
298                  */
299                 tcp_simple_retransmit(sk);
300         } /* else let the usual retransmit timer handle it */
301 }
302 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
303
304 static void do_redirect(struct sk_buff *skb, struct sock *sk)
305 {
306         struct dst_entry *dst = __sk_dst_check(sk, 0);
307
308         if (dst)
309                 dst->ops->redirect(dst, sk, skb);
310 }
311
312
313 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
314 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
315 {
316         struct request_sock *req = inet_reqsk(sk);
317         struct net *net = sock_net(sk);
318
319         /* ICMPs are not backlogged, hence we cannot get
320          * an established socket here.
321          */
322         if (seq != tcp_rsk(req)->snt_isn) {
323                 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
324         } else if (abort) {
325                 /*
326                  * Still in SYN_RECV, just remove it silently.
327                  * There is no good way to pass the error to the newly
328                  * created socket, and POSIX does not want network
329                  * errors returned from accept().
330                  */
331                 inet_csk_reqsk_queue_drop(req->rsk_listener, req);
332                 tcp_listendrop(req->rsk_listener);
333         }
334         reqsk_put(req);
335 }
336 EXPORT_SYMBOL(tcp_req_err);
337
338 /*
339  * This routine is called by the ICMP module when it gets some
340  * sort of error condition.  If err < 0 then the socket should
341  * be closed and the error returned to the user.  If err > 0
342  * it's just the icmp type << 8 | icmp code.  After adjustment
343  * header points to the first 8 bytes of the tcp header.  We need
344  * to find the appropriate port.
345  *
346  * The locking strategy used here is very "optimistic". When
347  * someone else accesses the socket the ICMP is just dropped
348  * and for some paths there is no check at all.
349  * A more general error queue to queue errors for later handling
350  * is probably better.
351  *
352  */
353
354 void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
355 {
356         const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
357         struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
358         struct inet_connection_sock *icsk;
359         struct tcp_sock *tp;
360         struct inet_sock *inet;
361         const int type = icmp_hdr(icmp_skb)->type;
362         const int code = icmp_hdr(icmp_skb)->code;
363         struct sock *sk;
364         struct sk_buff *skb;
365         struct request_sock *fastopen;
366         __u32 seq, snd_una;
367         __u32 remaining;
368         int err;
369         struct net *net = dev_net(icmp_skb->dev);
370
371         sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
372                                        th->dest, iph->saddr, ntohs(th->source),
373                                        inet_iif(icmp_skb));
374         if (!sk) {
375                 __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
376                 return;
377         }
378         if (sk->sk_state == TCP_TIME_WAIT) {
379                 inet_twsk_put(inet_twsk(sk));
380                 return;
381         }
382         seq = ntohl(th->seq);
383         if (sk->sk_state == TCP_NEW_SYN_RECV)
384                 return tcp_req_err(sk, seq,
385                                   type == ICMP_PARAMETERPROB ||
386                                   type == ICMP_TIME_EXCEEDED ||
387                                   (type == ICMP_DEST_UNREACH &&
388                                    (code == ICMP_NET_UNREACH ||
389                                     code == ICMP_HOST_UNREACH)));
390
391         bh_lock_sock(sk);
392         /* If too many ICMPs get dropped on busy
393          * servers this needs to be solved differently.
394          * We do take care of PMTU discovery (RFC1191) special case :
395          * we can receive locally generated ICMP messages while socket is held.
396          */
397         if (sock_owned_by_user(sk)) {
398                 if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
399                         __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
400         }
401         if (sk->sk_state == TCP_CLOSE)
402                 goto out;
403
404         if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
405                 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
406                 goto out;
407         }
408
409         icsk = inet_csk(sk);
410         tp = tcp_sk(sk);
411         /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
412         fastopen = tp->fastopen_rsk;
413         snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
414         if (sk->sk_state != TCP_LISTEN &&
415             !between(seq, snd_una, tp->snd_nxt)) {
416                 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
417                 goto out;
418         }
419
420         switch (type) {
421         case ICMP_REDIRECT:
422                 do_redirect(icmp_skb, sk);
423                 goto out;
424         case ICMP_SOURCE_QUENCH:
425                 /* Just silently ignore these. */
426                 goto out;
427         case ICMP_PARAMETERPROB:
428                 err = EPROTO;
429                 break;
430         case ICMP_DEST_UNREACH:
431                 if (code > NR_ICMP_UNREACH)
432                         goto out;
433
434                 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
435                         /* We are not interested in TCP_LISTEN and open_requests
436                          * (SYN-ACKs send out by Linux are always <576bytes so
437                          * they should go through unfragmented).
438                          */
439                         if (sk->sk_state == TCP_LISTEN)
440                                 goto out;
441
442                         tp->mtu_info = info;
443                         if (!sock_owned_by_user(sk)) {
444                                 tcp_v4_mtu_reduced(sk);
445                         } else {
446                                 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &tp->tsq_flags))
447                                         sock_hold(sk);
448                         }
449                         goto out;
450                 }
451
452                 err = icmp_err_convert[code].errno;
453                 /* check if icmp_skb allows revert of backoff
454                  * (see draft-zimmermann-tcp-lcd) */
455                 if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
456                         break;
457                 if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
458                     !icsk->icsk_backoff || fastopen)
459                         break;
460
461                 if (sock_owned_by_user(sk))
462                         break;
463
464                 icsk->icsk_backoff--;
465                 icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) :
466                                                TCP_TIMEOUT_INIT;
467                 icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
468
469                 skb = tcp_write_queue_head(sk);
470                 BUG_ON(!skb);
471
472                 remaining = icsk->icsk_rto -
473                             min(icsk->icsk_rto,
474                                 tcp_time_stamp - tcp_skb_timestamp(skb));
475
476                 if (remaining) {
477                         inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
478                                                   remaining, TCP_RTO_MAX);
479                 } else {
480                         /* RTO revert clocked out retransmission.
481                          * Will retransmit now */
482                         tcp_retransmit_timer(sk);
483                 }
484
485                 break;
486         case ICMP_TIME_EXCEEDED:
487                 err = EHOSTUNREACH;
488                 break;
489         default:
490                 goto out;
491         }
492
493         switch (sk->sk_state) {
494         case TCP_SYN_SENT:
495         case TCP_SYN_RECV:
496                 /* Only in fast or simultaneous open. If a fast open socket is
497                  * is already accepted it is treated as a connected one below.
498                  */
499                 if (fastopen && !fastopen->sk)
500                         break;
501
502                 if (!sock_owned_by_user(sk)) {
503                         sk->sk_err = err;
504
505                         sk->sk_error_report(sk);
506
507                         tcp_done(sk);
508                 } else {
509                         sk->sk_err_soft = err;
510                 }
511                 goto out;
512         }
513
514         /* If we've already connected we will keep trying
515          * until we time out, or the user gives up.
516          *
517          * rfc1122 4.2.3.9 allows to consider as hard errors
518          * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
519          * but it is obsoleted by pmtu discovery).
520          *
521          * Note, that in modern internet, where routing is unreliable
522          * and in each dark corner broken firewalls sit, sending random
523          * errors ordered by their masters even this two messages finally lose
524          * their original sense (even Linux sends invalid PORT_UNREACHs)
525          *
526          * Now we are in compliance with RFCs.
527          *                                                      --ANK (980905)
528          */
529
530         inet = inet_sk(sk);
531         if (!sock_owned_by_user(sk) && inet->recverr) {
532                 sk->sk_err = err;
533                 sk->sk_error_report(sk);
534         } else  { /* Only an error on timeout */
535                 sk->sk_err_soft = err;
536         }
537
538 out:
539         bh_unlock_sock(sk);
540         sock_put(sk);
541 }
542
543 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
544 {
545         struct tcphdr *th = tcp_hdr(skb);
546
547         if (skb->ip_summed == CHECKSUM_PARTIAL) {
548                 th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
549                 skb->csum_start = skb_transport_header(skb) - skb->head;
550                 skb->csum_offset = offsetof(struct tcphdr, check);
551         } else {
552                 th->check = tcp_v4_check(skb->len, saddr, daddr,
553                                          csum_partial(th,
554                                                       th->doff << 2,
555                                                       skb->csum));
556         }
557 }
558
559 /* This routine computes an IPv4 TCP checksum. */
560 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
561 {
562         const struct inet_sock *inet = inet_sk(sk);
563
564         __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
565 }
566 EXPORT_SYMBOL(tcp_v4_send_check);
567
568 /*
569  *      This routine will send an RST to the other tcp.
570  *
571  *      Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
572  *                    for reset.
573  *      Answer: if a packet caused RST, it is not for a socket
574  *              existing in our system, if it is matched to a socket,
575  *              it is just duplicate segment or bug in other side's TCP.
576  *              So that we build reply only basing on parameters
577  *              arrived with segment.
578  *      Exception: precedence violation. We do not implement it in any case.
579  */
580
581 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
582 {
583         const struct tcphdr *th = tcp_hdr(skb);
584         struct {
585                 struct tcphdr th;
586 #ifdef CONFIG_TCP_MD5SIG
587                 __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
588 #endif
589         } rep;
590         struct ip_reply_arg arg;
591 #ifdef CONFIG_TCP_MD5SIG
592         struct tcp_md5sig_key *key = NULL;
593         const __u8 *hash_location = NULL;
594         unsigned char newhash[16];
595         int genhash;
596         struct sock *sk1 = NULL;
597 #endif
598         struct net *net;
599
600         /* Never send a reset in response to a reset. */
601         if (th->rst)
602                 return;
603
604         /* If sk not NULL, it means we did a successful lookup and incoming
605          * route had to be correct. prequeue might have dropped our dst.
606          */
607         if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
608                 return;
609
610         /* Swap the send and the receive. */
611         memset(&rep, 0, sizeof(rep));
612         rep.th.dest   = th->source;
613         rep.th.source = th->dest;
614         rep.th.doff   = sizeof(struct tcphdr) / 4;
615         rep.th.rst    = 1;
616
617         if (th->ack) {
618                 rep.th.seq = th->ack_seq;
619         } else {
620                 rep.th.ack = 1;
621                 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
622                                        skb->len - (th->doff << 2));
623         }
624
625         memset(&arg, 0, sizeof(arg));
626         arg.iov[0].iov_base = (unsigned char *)&rep;
627         arg.iov[0].iov_len  = sizeof(rep.th);
628
629         net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
630 #ifdef CONFIG_TCP_MD5SIG
631         rcu_read_lock();
632         hash_location = tcp_parse_md5sig_option(th);
633         if (sk && sk_fullsock(sk)) {
634                 key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
635                                         &ip_hdr(skb)->saddr, AF_INET);
636         } else if (hash_location) {
637                 /*
638                  * active side is lost. Try to find listening socket through
639                  * source port, and then find md5 key through listening socket.
640                  * we are not loose security here:
641                  * Incoming packet is checked with md5 hash with finding key,
642                  * no RST generated if md5 hash doesn't match.
643                  */
644                 sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
645                                              ip_hdr(skb)->saddr,
646                                              th->source, ip_hdr(skb)->daddr,
647                                              ntohs(th->source), inet_iif(skb));
648                 /* don't send rst if it can't find key */
649                 if (!sk1)
650                         goto out;
651
652                 key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
653                                         &ip_hdr(skb)->saddr, AF_INET);
654                 if (!key)
655                         goto out;
656
657
658                 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
659                 if (genhash || memcmp(hash_location, newhash, 16) != 0)
660                         goto out;
661
662         }
663
664         if (key) {
665                 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
666                                    (TCPOPT_NOP << 16) |
667                                    (TCPOPT_MD5SIG << 8) |
668                                    TCPOLEN_MD5SIG);
669                 /* Update length and the length the header thinks exists */
670                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
671                 rep.th.doff = arg.iov[0].iov_len / 4;
672
673                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
674                                      key, ip_hdr(skb)->saddr,
675                                      ip_hdr(skb)->daddr, &rep.th);
676         }
677 #endif
678         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
679                                       ip_hdr(skb)->saddr, /* XXX */
680                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
681         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
682         arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
683
684         /* When socket is gone, all binding information is lost.
685          * routing might fail in this case. No choice here, if we choose to force
686          * input interface, we will misroute in case of asymmetric route.
687          */
688         if (sk)
689                 arg.bound_dev_if = sk->sk_bound_dev_if;
690
691         BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
692                      offsetof(struct inet_timewait_sock, tw_bound_dev_if));
693
694         arg.tos = ip_hdr(skb)->tos;
695         local_bh_disable();
696         ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
697                               skb, &TCP_SKB_CB(skb)->header.h4.opt,
698                               ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
699                               &arg, arg.iov[0].iov_len);
700
701         __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
702         __TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
703         local_bh_enable();
704
705 #ifdef CONFIG_TCP_MD5SIG
706 out:
707         rcu_read_unlock();
708 #endif
709 }
710
711 /* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
712    outside socket context is ugly, certainly. What can I do?
713  */
714
715 static void tcp_v4_send_ack(struct net *net,
716                             struct sk_buff *skb, u32 seq, u32 ack,
717                             u32 win, u32 tsval, u32 tsecr, int oif,
718                             struct tcp_md5sig_key *key,
719                             int reply_flags, u8 tos)
720 {
721         const struct tcphdr *th = tcp_hdr(skb);
722         struct {
723                 struct tcphdr th;
724                 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
725 #ifdef CONFIG_TCP_MD5SIG
726                            + (TCPOLEN_MD5SIG_ALIGNED >> 2)
727 #endif
728                         ];
729         } rep;
730         struct ip_reply_arg arg;
731
732         memset(&rep.th, 0, sizeof(struct tcphdr));
733         memset(&arg, 0, sizeof(arg));
734
735         arg.iov[0].iov_base = (unsigned char *)&rep;
736         arg.iov[0].iov_len  = sizeof(rep.th);
737         if (tsecr) {
738                 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
739                                    (TCPOPT_TIMESTAMP << 8) |
740                                    TCPOLEN_TIMESTAMP);
741                 rep.opt[1] = htonl(tsval);
742                 rep.opt[2] = htonl(tsecr);
743                 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
744         }
745
746         /* Swap the send and the receive. */
747         rep.th.dest    = th->source;
748         rep.th.source  = th->dest;
749         rep.th.doff    = arg.iov[0].iov_len / 4;
750         rep.th.seq     = htonl(seq);
751         rep.th.ack_seq = htonl(ack);
752         rep.th.ack     = 1;
753         rep.th.window  = htons(win);
754
755 #ifdef CONFIG_TCP_MD5SIG
756         if (key) {
757                 int offset = (tsecr) ? 3 : 0;
758
759                 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
760                                           (TCPOPT_NOP << 16) |
761                                           (TCPOPT_MD5SIG << 8) |
762                                           TCPOLEN_MD5SIG);
763                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
764                 rep.th.doff = arg.iov[0].iov_len/4;
765
766                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
767                                     key, ip_hdr(skb)->saddr,
768                                     ip_hdr(skb)->daddr, &rep.th);
769         }
770 #endif
771         arg.flags = reply_flags;
772         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
773                                       ip_hdr(skb)->saddr, /* XXX */
774                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
775         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
776         if (oif)
777                 arg.bound_dev_if = oif;
778         arg.tos = tos;
779         local_bh_disable();
780         ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
781                               skb, &TCP_SKB_CB(skb)->header.h4.opt,
782                               ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
783                               &arg, arg.iov[0].iov_len);
784
785         __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
786         local_bh_enable();
787 }
788
789 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
790 {
791         struct inet_timewait_sock *tw = inet_twsk(sk);
792         struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
793
794         tcp_v4_send_ack(sock_net(sk), skb,
795                         tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
796                         tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
797                         tcp_time_stamp + tcptw->tw_ts_offset,
798                         tcptw->tw_ts_recent,
799                         tw->tw_bound_dev_if,
800                         tcp_twsk_md5_key(tcptw),
801                         tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
802                         tw->tw_tos
803                         );
804
805         inet_twsk_put(tw);
806 }
807
808 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
809                                   struct request_sock *req)
810 {
811         /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
812          * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
813          */
814         u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
815                                              tcp_sk(sk)->snd_nxt;
816
817         /* RFC 7323 2.3
818          * The window field (SEG.WND) of every outgoing segment, with the
819          * exception of <SYN> segments, MUST be right-shifted by
820          * Rcv.Wind.Shift bits:
821          */
822         tcp_v4_send_ack(sock_net(sk), skb, seq,
823                         tcp_rsk(req)->rcv_nxt,
824                         req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
825                         tcp_time_stamp,
826                         req->ts_recent,
827                         0,
828                         tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr,
829                                           AF_INET),
830                         inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
831                         ip_hdr(skb)->tos);
832 }
833
834 /*
835  *      Send a SYN-ACK after having received a SYN.
836  *      This still operates on a request_sock only, not on a big
837  *      socket.
838  */
839 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
840                               struct flowi *fl,
841                               struct request_sock *req,
842                               struct tcp_fastopen_cookie *foc,
843                               enum tcp_synack_type synack_type)
844 {
845         const struct inet_request_sock *ireq = inet_rsk(req);
846         struct flowi4 fl4;
847         int err = -1;
848         struct sk_buff *skb;
849
850         /* First, grab a route. */
851         if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
852                 return -1;
853
854         skb = tcp_make_synack(sk, dst, req, foc, synack_type);
855
856         if (skb) {
857                 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
858
859                 err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
860                                             ireq->ir_rmt_addr,
861                                             ireq->opt);
862                 err = net_xmit_eval(err);
863         }
864
865         return err;
866 }
867
868 /*
869  *      IPv4 request_sock destructor.
870  */
871 static void tcp_v4_reqsk_destructor(struct request_sock *req)
872 {
873         kfree(inet_rsk(req)->opt);
874 }
875
876 #ifdef CONFIG_TCP_MD5SIG
877 /*
878  * RFC2385 MD5 checksumming requires a mapping of
879  * IP address->MD5 Key.
880  * We need to maintain these in the sk structure.
881  */
882
883 /* Find the Key structure for an address.  */
884 struct tcp_md5sig_key *tcp_md5_do_lookup(const struct sock *sk,
885                                          const union tcp_md5_addr *addr,
886                                          int family)
887 {
888         const struct tcp_sock *tp = tcp_sk(sk);
889         struct tcp_md5sig_key *key;
890         unsigned int size = sizeof(struct in_addr);
891         const struct tcp_md5sig_info *md5sig;
892
893         /* caller either holds rcu_read_lock() or socket lock */
894         md5sig = rcu_dereference_check(tp->md5sig_info,
895                                        lockdep_sock_is_held(sk));
896         if (!md5sig)
897                 return NULL;
898 #if IS_ENABLED(CONFIG_IPV6)
899         if (family == AF_INET6)
900                 size = sizeof(struct in6_addr);
901 #endif
902         hlist_for_each_entry_rcu(key, &md5sig->head, node) {
903                 if (key->family != family)
904                         continue;
905                 if (!memcmp(&key->addr, addr, size))
906                         return key;
907         }
908         return NULL;
909 }
910 EXPORT_SYMBOL(tcp_md5_do_lookup);
911
912 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
913                                          const struct sock *addr_sk)
914 {
915         const union tcp_md5_addr *addr;
916
917         addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
918         return tcp_md5_do_lookup(sk, addr, AF_INET);
919 }
920 EXPORT_SYMBOL(tcp_v4_md5_lookup);
921
922 /* This can be called on a newly created socket, from other files */
923 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
924                    int family, const u8 *newkey, u8 newkeylen, gfp_t gfp)
925 {
926         /* Add Key to the list */
927         struct tcp_md5sig_key *key;
928         struct tcp_sock *tp = tcp_sk(sk);
929         struct tcp_md5sig_info *md5sig;
930
931         key = tcp_md5_do_lookup(sk, addr, family);
932         if (key) {
933                 /* Pre-existing entry - just update that one. */
934                 memcpy(key->key, newkey, newkeylen);
935                 key->keylen = newkeylen;
936                 return 0;
937         }
938
939         md5sig = rcu_dereference_protected(tp->md5sig_info,
940                                            lockdep_sock_is_held(sk));
941         if (!md5sig) {
942                 md5sig = kmalloc(sizeof(*md5sig), gfp);
943                 if (!md5sig)
944                         return -ENOMEM;
945
946                 sk_nocaps_add(sk, NETIF_F_GSO_MASK);
947                 INIT_HLIST_HEAD(&md5sig->head);
948                 rcu_assign_pointer(tp->md5sig_info, md5sig);
949         }
950
951         key = sock_kmalloc(sk, sizeof(*key), gfp);
952         if (!key)
953                 return -ENOMEM;
954         if (!tcp_alloc_md5sig_pool()) {
955                 sock_kfree_s(sk, key, sizeof(*key));
956                 return -ENOMEM;
957         }
958
959         memcpy(key->key, newkey, newkeylen);
960         key->keylen = newkeylen;
961         key->family = family;
962         memcpy(&key->addr, addr,
963                (family == AF_INET6) ? sizeof(struct in6_addr) :
964                                       sizeof(struct in_addr));
965         hlist_add_head_rcu(&key->node, &md5sig->head);
966         return 0;
967 }
968 EXPORT_SYMBOL(tcp_md5_do_add);
969
970 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family)
971 {
972         struct tcp_md5sig_key *key;
973
974         key = tcp_md5_do_lookup(sk, addr, family);
975         if (!key)
976                 return -ENOENT;
977         hlist_del_rcu(&key->node);
978         atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
979         kfree_rcu(key, rcu);
980         return 0;
981 }
982 EXPORT_SYMBOL(tcp_md5_do_del);
983
984 static void tcp_clear_md5_list(struct sock *sk)
985 {
986         struct tcp_sock *tp = tcp_sk(sk);
987         struct tcp_md5sig_key *key;
988         struct hlist_node *n;
989         struct tcp_md5sig_info *md5sig;
990
991         md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
992
993         hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
994                 hlist_del_rcu(&key->node);
995                 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
996                 kfree_rcu(key, rcu);
997         }
998 }
999
1000 static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
1001                                  int optlen)
1002 {
1003         struct tcp_md5sig cmd;
1004         struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1005
1006         if (optlen < sizeof(cmd))
1007                 return -EINVAL;
1008
1009         if (copy_from_user(&cmd, optval, sizeof(cmd)))
1010                 return -EFAULT;
1011
1012         if (sin->sin_family != AF_INET)
1013                 return -EINVAL;
1014
1015         if (!cmd.tcpm_keylen)
1016                 return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1017                                       AF_INET);
1018
1019         if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1020                 return -EINVAL;
1021
1022         return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1023                               AF_INET, cmd.tcpm_key, cmd.tcpm_keylen,
1024                               GFP_KERNEL);
1025 }
1026
1027 static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
1028                                    __be32 daddr, __be32 saddr,
1029                                    const struct tcphdr *th, int nbytes)
1030 {
1031         struct tcp4_pseudohdr *bp;
1032         struct scatterlist sg;
1033         struct tcphdr *_th;
1034
1035         bp = hp->scratch;
1036         bp->saddr = saddr;
1037         bp->daddr = daddr;
1038         bp->pad = 0;
1039         bp->protocol = IPPROTO_TCP;
1040         bp->len = cpu_to_be16(nbytes);
1041
1042         _th = (struct tcphdr *)(bp + 1);
1043         memcpy(_th, th, sizeof(*th));
1044         _th->check = 0;
1045
1046         sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1047         ahash_request_set_crypt(hp->md5_req, &sg, NULL,
1048                                 sizeof(*bp) + sizeof(*th));
1049         return crypto_ahash_update(hp->md5_req);
1050 }
1051
1052 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1053                                __be32 daddr, __be32 saddr, const struct tcphdr *th)
1054 {
1055         struct tcp_md5sig_pool *hp;
1056         struct ahash_request *req;
1057
1058         hp = tcp_get_md5sig_pool();
1059         if (!hp)
1060                 goto clear_hash_noput;
1061         req = hp->md5_req;
1062
1063         if (crypto_ahash_init(req))
1064                 goto clear_hash;
1065         if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
1066                 goto clear_hash;
1067         if (tcp_md5_hash_key(hp, key))
1068                 goto clear_hash;
1069         ahash_request_set_crypt(req, NULL, md5_hash, 0);
1070         if (crypto_ahash_final(req))
1071                 goto clear_hash;
1072
1073         tcp_put_md5sig_pool();
1074         return 0;
1075
1076 clear_hash:
1077         tcp_put_md5sig_pool();
1078 clear_hash_noput:
1079         memset(md5_hash, 0, 16);
1080         return 1;
1081 }
1082
1083 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1084                         const struct sock *sk,
1085                         const struct sk_buff *skb)
1086 {
1087         struct tcp_md5sig_pool *hp;
1088         struct ahash_request *req;
1089         const struct tcphdr *th = tcp_hdr(skb);
1090         __be32 saddr, daddr;
1091
1092         if (sk) { /* valid for establish/request sockets */
1093                 saddr = sk->sk_rcv_saddr;
1094                 daddr = sk->sk_daddr;
1095         } else {
1096                 const struct iphdr *iph = ip_hdr(skb);
1097                 saddr = iph->saddr;
1098                 daddr = iph->daddr;
1099         }
1100
1101         hp = tcp_get_md5sig_pool();
1102         if (!hp)
1103                 goto clear_hash_noput;
1104         req = hp->md5_req;
1105
1106         if (crypto_ahash_init(req))
1107                 goto clear_hash;
1108
1109         if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
1110                 goto clear_hash;
1111         if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1112                 goto clear_hash;
1113         if (tcp_md5_hash_key(hp, key))
1114                 goto clear_hash;
1115         ahash_request_set_crypt(req, NULL, md5_hash, 0);
1116         if (crypto_ahash_final(req))
1117                 goto clear_hash;
1118
1119         tcp_put_md5sig_pool();
1120         return 0;
1121
1122 clear_hash:
1123         tcp_put_md5sig_pool();
1124 clear_hash_noput:
1125         memset(md5_hash, 0, 16);
1126         return 1;
1127 }
1128 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1129
1130 #endif
1131
1132 /* Called with rcu_read_lock() */
1133 static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
1134                                     const struct sk_buff *skb)
1135 {
1136 #ifdef CONFIG_TCP_MD5SIG
1137         /*
1138          * This gets called for each TCP segment that arrives
1139          * so we want to be efficient.
1140          * We have 3 drop cases:
1141          * o No MD5 hash and one expected.
1142          * o MD5 hash and we're not expecting one.
1143          * o MD5 hash and its wrong.
1144          */
1145         const __u8 *hash_location = NULL;
1146         struct tcp_md5sig_key *hash_expected;
1147         const struct iphdr *iph = ip_hdr(skb);
1148         const struct tcphdr *th = tcp_hdr(skb);
1149         int genhash;
1150         unsigned char newhash[16];
1151
1152         hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
1153                                           AF_INET);
1154         hash_location = tcp_parse_md5sig_option(th);
1155
1156         /* We've parsed the options - do we have a hash? */
1157         if (!hash_expected && !hash_location)
1158                 return false;
1159
1160         if (hash_expected && !hash_location) {
1161                 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1162                 return true;
1163         }
1164
1165         if (!hash_expected && hash_location) {
1166                 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1167                 return true;
1168         }
1169
1170         /* Okay, so this is hash_expected and hash_location -
1171          * so we need to calculate the checksum.
1172          */
1173         genhash = tcp_v4_md5_hash_skb(newhash,
1174                                       hash_expected,
1175                                       NULL, skb);
1176
1177         if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1178                 net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1179                                      &iph->saddr, ntohs(th->source),
1180                                      &iph->daddr, ntohs(th->dest),
1181                                      genhash ? " tcp_v4_calc_md5_hash failed"
1182                                      : "");
1183                 return true;
1184         }
1185         return false;
1186 #endif
1187         return false;
1188 }
1189
1190 static void tcp_v4_init_req(struct request_sock *req,
1191                             const struct sock *sk_listener,
1192                             struct sk_buff *skb)
1193 {
1194         struct inet_request_sock *ireq = inet_rsk(req);
1195
1196         sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1197         sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1198         ireq->no_srccheck = inet_sk(sk_listener)->transparent;
1199         ireq->opt = tcp_v4_save_options(skb);
1200 }
1201
1202 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1203                                           struct flowi *fl,
1204                                           const struct request_sock *req,
1205                                           bool *strict)
1206 {
1207         struct dst_entry *dst = inet_csk_route_req(sk, &fl->u.ip4, req);
1208
1209         if (strict) {
1210                 if (fl->u.ip4.daddr == inet_rsk(req)->ir_rmt_addr)
1211                         *strict = true;
1212                 else
1213                         *strict = false;
1214         }
1215
1216         return dst;
1217 }
1218
1219 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1220         .family         =       PF_INET,
1221         .obj_size       =       sizeof(struct tcp_request_sock),
1222         .rtx_syn_ack    =       tcp_rtx_synack,
1223         .send_ack       =       tcp_v4_reqsk_send_ack,
1224         .destructor     =       tcp_v4_reqsk_destructor,
1225         .send_reset     =       tcp_v4_send_reset,
1226         .syn_ack_timeout =      tcp_syn_ack_timeout,
1227 };
1228
1229 static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1230         .mss_clamp      =       TCP_MSS_DEFAULT,
1231 #ifdef CONFIG_TCP_MD5SIG
1232         .req_md5_lookup =       tcp_v4_md5_lookup,
1233         .calc_md5_hash  =       tcp_v4_md5_hash_skb,
1234 #endif
1235         .init_req       =       tcp_v4_init_req,
1236 #ifdef CONFIG_SYN_COOKIES
1237         .cookie_init_seq =      cookie_v4_init_sequence,
1238 #endif
1239         .route_req      =       tcp_v4_route_req,
1240         .init_seq       =       tcp_v4_init_sequence,
1241         .send_synack    =       tcp_v4_send_synack,
1242 };
1243
1244 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1245 {
1246         /* Never answer to SYNs send to broadcast or multicast */
1247         if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1248                 goto drop;
1249
1250         return tcp_conn_request(&tcp_request_sock_ops,
1251                                 &tcp_request_sock_ipv4_ops, sk, skb);
1252
1253 drop:
1254         tcp_listendrop(sk);
1255         return 0;
1256 }
1257 EXPORT_SYMBOL(tcp_v4_conn_request);
1258
1259
1260 /*
1261  * The three way handshake has completed - we got a valid synack -
1262  * now create the new socket.
1263  */
/*
 * Build the child socket for the request @req on behalf of @sk.
 *
 * @sk:         parent socket; its accept queue is checked, its MD5 key
 *              and user_mss are consulted and drops are counted on it
 * @skb:        segment being processed; supplies ttl, tos and the
 *              incoming interface stored in the child
 * @req:        request whose addresses, interface and saved IP options
 *              are transferred to the child
 * @dst:        route to use, or NULL to have one looked up here
 * @req_unhash: request handed to inet_ehash_nolisten() with the child
 * @own_req:    out: result of inet_ehash_nolisten()
 *
 * Returns the new socket.  Returns NULL, after tcp_listendrop(), when
 * the accept queue is full, the child cannot be created, no route is
 * found, or the port cannot be inherited.
 */
1264 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1265                                   struct request_sock *req,
1266                                   struct dst_entry *dst,
1267                                   struct request_sock *req_unhash,
1268                                   bool *own_req)
1269 {
1270         struct inet_request_sock *ireq;
1271         struct inet_sock *newinet;
1272         struct tcp_sock *newtp;
1273         struct sock *newsk;
1274 #ifdef CONFIG_TCP_MD5SIG
1275         struct tcp_md5sig_key *key;
1276 #endif
1277         struct ip_options_rcu *inet_opt;
1278
1279         if (sk_acceptq_is_full(sk))
1280                 goto exit_overflow;
1281
1282         newsk = tcp_create_openreq_child(sk, req, skb);
1283         if (!newsk)
1284                 goto exit_nonewsk;
1285
1286         newsk->sk_gso_type = SKB_GSO_TCPV4;
1287         inet_sk_rx_dst_set(newsk, skb);
1288
1289         newtp                 = tcp_sk(newsk);
1290         newinet               = inet_sk(newsk);
1291         ireq                  = inet_rsk(req);
1292         sk_daddr_set(newsk, ireq->ir_rmt_addr);
1293         sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1294         newsk->sk_bound_dev_if = ireq->ir_iif;
1295         newinet->inet_saddr           = ireq->ir_loc_addr;
             /* Move the saved IP options from the request to the child.
              * The request's pointer is cleared so that
              * tcp_v4_reqsk_destructor() does not free them as well.
              */
1296         inet_opt              = ireq->opt;
1297         rcu_assign_pointer(newinet->inet_opt, inet_opt);
1298         ireq->opt             = NULL;
1299         newinet->mc_index     = inet_iif(skb);
1300         newinet->mc_ttl       = ip_hdr(skb)->ttl;
1301         newinet->rcv_tos      = ip_hdr(skb)->tos;
             /* Extension header length is the IP option length, if any. */
1302         inet_csk(newsk)->icsk_ext_hdr_len = 0;
1303         if (inet_opt)
1304                 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1305         newinet->inet_id = newtp->write_seq ^ jiffies;
1306
             /* No route supplied by the caller: look one up for the child. */
1307         if (!dst) {
1308                 dst = inet_csk_route_child_sock(sk, newsk, req);
1309                 if (!dst)
1310                         goto put_and_exit;
1311         } else {
1312                 /* syncookie case : see end of cookie_v4_check() */
1313         }
1314         sk_setup_caps(newsk, dst);
1315
1316         tcp_ca_openreq_child(newsk, dst);
1317
1318         tcp_sync_mss(newsk, dst_mtu(dst));
             /* Advertised MSS comes from the route, capped by the
              * parent's user_mss when that is set and smaller.
              */
1319         newtp->advmss = dst_metric_advmss(dst);
1320         if (tcp_sk(sk)->rx_opt.user_mss &&
1321             tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
1322                 newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
1323
1324         tcp_initialize_rcv_mss(newsk);
1325
1326 #ifdef CONFIG_TCP_MD5SIG
1327         /* Copy over the MD5 key from the original socket */
1328         key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
1329                                 AF_INET);
1330         if (key) {
1331                 /*
1332                  * We're using one, so create a matching key
1333                  * on the newsk structure. If we fail to get
1334                  * memory, then we end up not copying the key
1335                  * across. Shucks.
1336                  */
1337                 tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
1338                                AF_INET, key->key, key->keylen, GFP_ATOMIC);
1339                 sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1340         }
1341 #endif
1342
1343         if (__inet_inherit_port(sk, newsk) < 0)
1344                 goto put_and_exit;
             /* Report through *own_req what inet_ehash_nolisten() returned;
              * tcp_move_syn() runs only when that result is true.
              */
1345         *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
1346         if (*own_req)
1347                 tcp_move_syn(newtp, req);
1348
1349         return newsk;
1350
             /* Error ladder: each label falls through to the next one. */
1351 exit_overflow:
1352         NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1353 exit_nonewsk:
             /* No child was created, so release the caller's route here. */
1354         dst_release(dst);
1355 exit:
1356         tcp_listendrop(sk);
1357         return NULL;
1358 put_and_exit:
             /* A child exists: force-close it, then count the drop.
              * dst_release() is deliberately skipped on this path.
              */
1359         inet_csk_prepare_forced_close(newsk);
1360         tcp_done(newsk);
1361         goto exit;
1362 }
1363 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1364
/* With CONFIG_SYN_COOKIES, pass any segment that is not a SYN through
 * cookie_v4_check() and return its result.  In every other case @sk is
 * returned unchanged.
 */
static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
{
#ifdef CONFIG_SYN_COOKIES
        if (!tcp_hdr(skb)->syn)
                return cookie_v4_check(sk, skb);
#endif
        return sk;
}
1375
1376 /* The socket must have its spinlock held when we get
1377  * here, unless it is a TCP_LISTEN socket.
1378  *
1379  * We have a potential double-lock case here, so even when
1380  * doing backlog processing we use the BH locking scheme.
1381  * This is because we cannot sleep with the original spinlock
1382  * held.
1383  */
/*
 * Process one received segment @skb on socket @sk.
 *
 * TCP_ESTABLISHED takes the fast path: the cached input route is
 * dropped if it no longer applies, then the segment goes to
 * tcp_rcv_established().  For every other state the checksum is
 * completed first.  A TCP_LISTEN socket runs the segment through
 * tcp_v4_cookie_check() and, when that yields a different socket,
 * through tcp_child_process().  Everything else ends up in
 * tcp_rcv_state_process().
 *
 * A non-zero result from the child/state processing sends a reset and
 * frees the skb.  Every path returns 0.
 */
1384 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1385 {
1386         struct sock *rsk;
1387
1388         if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1389                 struct dst_entry *dst = sk->sk_rx_dst;
1390
1391                 sock_rps_save_rxhash(sk, skb);
1392                 sk_mark_napi_id(sk, skb);
                     /* Forget the cached route when the segment came in on
                      * another interface or the dst check fails.
                      */
1393                 if (dst) {
1394                         if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1395                             !dst->ops->check(dst, 0)) {
1396                                 dst_release(dst);
1397                                 sk->sk_rx_dst = NULL;
1398                         }
1399                 }
1400                 tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len);
1401                 return 0;
1402         }
1403
1404         if (tcp_checksum_complete(skb))
1405                 goto csum_err;
1406
1407         if (sk->sk_state == TCP_LISTEN) {
1408                 struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1409
1410                 if (!nsk)
1411                         goto discard;
                     /* A different socket came back: let it process the
                      * segment; a failure resets on behalf of that socket.
                      */
1412                 if (nsk != sk) {
1413                         sock_rps_save_rxhash(nsk, skb);
1414                         sk_mark_napi_id(nsk, skb);
1415                         if (tcp_child_process(sk, nsk, skb)) {
1416                                 rsk = nsk;
1417                                 goto reset;
1418                         }
1419                         return 0;
1420                 }
1421         } else
1422                 sock_rps_save_rxhash(sk, skb);
1423
1424         if (tcp_rcv_state_process(sk, skb)) {
1425                 rsk = sk;
1426                 goto reset;
1427         }
1428         return 0;
1429
             /* reset falls through to discard, so the skb is freed on both. */
1430 reset:
1431         tcp_v4_send_reset(rsk, skb);
1432 discard:
1433         kfree_skb(skb);
1434         /* Be careful here. If this function gets more complicated and
1435          * gcc suffers from register pressure on the x86, sk (in %ebx)
1436          * might be destroyed here. This current version compiles correctly,
1437          * but you have been warned.
1438          */
1439         return 0;
1440
             /* Bad checksum: count it as a checksum error and an input
              * error, then free the skb.
              */
1441 csum_err:
1442         TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1443         TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1444         goto discard;
1445 }
1446 EXPORT_SYMBOL(tcp_v4_do_rcv);
1447
1448 void tcp_v4_early_demux(struct sk_buff *skb)
1449 {
1450         const struct iphdr *iph;
1451         const struct tcphdr *th;
1452         struct sock *sk;
1453
1454         if (skb->pkt_type != PACKET_HOST)
1455                 return;
1456
1457         if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1458                 return;
1459
1460         iph = ip_hdr(skb);
1461         th = tcp_hdr(skb);
1462
1463         if (th->doff < sizeof(struct tcphdr) / 4)
1464                 return;
1465
1466         sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1467                                        iph->saddr, th->source,
1468                                        iph->daddr, ntohs(th->dest),
1469                                        skb->skb_iif);
1470         if (sk) {
1471                 skb->sk = sk;
1472                 skb->destructor = sock_edemux;
1473                 if (sk_fullsock(sk)) {
1474                         struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);
1475
1476                         if (dst)
1477                                 dst = dst_check(dst, 0);
1478                         if (dst &&
1479                             inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
1480                                 skb_dst_set_noref(skb, dst);
1481                 }
1482         }
1483 }
1484
1485 /* Packet is added to VJ-style prequeue for processing in process
1486  * context, if a reader task is waiting. Apparently, this exciting
1487  * idea (VJ's mail "Re: query about TCP header on tcp-ip" of 07 Sep 93)
1488  * failed somewhere. Latency? Burstiness? Well, at least now we will
1489  * see, why it failed. 8)8)                               --ANK
1490  *
1491  */
/*
 * Try to put @skb on the prequeue of socket @sk.
 *
 * Returns false, leaving @skb untouched, when sysctl_tcp_low_latency is
 * set, when tp->ucopy.task is not set, or when the segment is no longer
 * than its TCP header while the prequeue is empty.  Otherwise @skb is
 * appended to the prequeue and true is returned.
 *
 * Once the prequeue holds 32 skbs, or its memory plus sk_rmem_alloc
 * exceeds sk_rcvbuf, all queued skbs are fed to sk_backlog_rcv() at
 * once.  When @skb is the first entry, the sleeper on the socket is
 * woken and a delayed-ACK timer is armed unless an ACK is already
 * scheduled.
 */
1492 bool tcp_prequeue(struct sock *sk, struct sk_buff *skb)
1493 {
1494         struct tcp_sock *tp = tcp_sk(sk);
1495
1496         if (sysctl_tcp_low_latency || !tp->ucopy.task)
1497                 return false;
1498
1499         if (skb->len <= tcp_hdrlen(skb) &&
1500             skb_queue_len(&tp->ucopy.prequeue) == 0)
1501                 return false;
1502
1503         /* Before escaping RCU protected region, we need to take care of skb
1504          * dst. Prequeue is only enabled for established sockets.
1505          * For such sockets, we might need the skb dst only to set sk->sk_rx_dst
1506          * Instead of doing full sk_rx_dst validity here, let's perform
1507          * an optimistic check.
1508          */
1509         if (likely(sk->sk_rx_dst))
1510                 skb_dst_drop(skb);
1511         else
1512                 skb_dst_force_safe(skb);
1513
1514         __skb_queue_tail(&tp->ucopy.prequeue, skb);
1515         tp->ucopy.memory += skb->truesize;
             /* Too many skbs or too much memory queued: count them all
              * as TCPPREQUEUEDROPPED and flush them through
              * sk_backlog_rcv(), then reset the memory counter.
              */
1516         if (skb_queue_len(&tp->ucopy.prequeue) >= 32 ||
1517             tp->ucopy.memory + atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf) {
1518                 struct sk_buff *skb1;
1519
1520                 BUG_ON(sock_owned_by_user(sk));
1521                 __NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPPREQUEUEDROPPED,
1522                                 skb_queue_len(&tp->ucopy.prequeue));
1523
1524                 while ((skb1 = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
1525                         sk_backlog_rcv(sk, skb1);
1526
1527                 tp->ucopy.memory = 0;
1528         } else if (skb_queue_len(&tp->ucopy.prequeue) == 1) {
                     /* First skb on the prequeue: wake the sleeper and make
                      * sure an ACK goes out within 3/4 of the minimum RTO.
                      */
1529                 wake_up_interruptible_sync_poll(sk_sleep(sk),
1530                                            POLLIN | POLLRDNORM | POLLRDBAND);
1531                 if (!inet_csk_ack_scheduled(sk))
1532                         inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
1533                                                   (3 * tcp_rto_min(sk)) / 4,
1534                                                   TCP_RTO_MAX);
1535         }
1536         return true;
1537 }
1538 EXPORT_SYMBOL(tcp_prequeue);
1539
1540 /*
1541  *      From tcp_input.c
1542  */
1543
1544 int tcp_v4_rcv(struct sk_buff *skb)
1545 {
1546         struct net *net = dev_net(skb->dev);
1547         const struct iphdr *iph;
1548         const struct tcphdr *th;
1549         bool refcounted;
1550         struct sock *sk;
1551         int ret;
1552
1553         if (skb->pkt_type != PACKET_HOST)
1554                 goto discard_it;
1555
1556         /* Count it even if it's bad */
1557         __TCP_INC_STATS(net, TCP_MIB_INSEGS);
1558
1559         if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1560                 goto discard_it;
1561
1562         th = (const struct tcphdr *)skb->data;
1563
1564         if (unlikely(th->doff < sizeof(struct tcphdr) / 4))
1565                 goto bad_packet;
1566         if (!pskb_may_pull(skb, th->doff * 4))
1567                 goto discard_it;
1568
1569         /* An explanation is required here, I think.
1570          * Packet length and doff are validated by header prediction,
1571          * provided case of th->doff==0 is eliminated.
1572          * So, we defer the checks. */
1573
1574         if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1575                 goto csum_error;
1576
1577         th = (const struct tcphdr *)skb->data;
1578         iph = ip_hdr(skb);
1579         /* This is tricky : We move IPCB at its correct location into TCP_SKB_CB()
1580          * barrier() makes sure compiler wont play fool^Waliasing games.
1581          */
1582         memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1583                 sizeof(struct inet_skb_parm));
1584         barrier();
1585
1586         TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1587         TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1588                                     skb->len - th->doff * 4);
1589         TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1590         TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1591         TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1592         TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1593         TCP_SKB_CB(skb)->sacked  = 0;
1594
1595 lookup:
1596         sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
1597                                th->dest, &refcounted);
1598         if (!sk)
1599                 goto no_tcp_socket;
1600
1601 process:
1602         if (sk->sk_state == TCP_TIME_WAIT)
1603                 goto do_time_wait;
1604
1605         if (sk->sk_state == TCP_NEW_SYN_RECV) {
1606                 struct request_sock *req = inet_reqsk(sk);
1607                 struct sock *nsk;
1608
1609                 sk = req->rsk_listener;
1610                 if (unlikely(tcp_v4_inbound_md5_hash(sk, skb))) {
1611                         reqsk_put(req);
1612                         goto discard_it;
1613                 }
1614                 if (unlikely(sk->sk_state != TCP_LISTEN)) {
1615                         inet_csk_reqsk_queue_drop_and_put(sk, req);
1616                         goto lookup;
1617                 }
1618                 /* We own a reference on the listener, increase it again
1619                  * as we might lose it too soon.
1620                  */
1621                 sock_hold(sk);
1622                 refcounted = true;
1623                 nsk = tcp_check_req(sk, skb, req, false);
1624                 if (!nsk) {
1625                         reqsk_put(req);
1626                         goto discard_and_relse;
1627                 }
1628                 if (nsk == sk) {
1629                         reqsk_put(req);
1630                 } else if (tcp_child_process(sk, nsk, skb)) {
1631                         tcp_v4_send_reset(nsk, skb);
1632                         goto discard_and_relse;
1633                 } else {
1634                         sock_put(sk);
1635                         return 0;
1636                 }
1637         }
1638         if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1639                 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
1640                 goto discard_and_relse;
1641         }
1642
1643         if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1644                 goto discard_and_relse;
1645
1646         if (tcp_v4_inbound_md5_hash(sk, skb))
1647                 goto discard_and_relse;
1648
1649         nf_reset(skb);
1650
1651         if (sk_filter(sk, skb))
1652                 goto discard_and_relse;
1653
1654         skb->dev = NULL;
1655
1656         if (sk->sk_state == TCP_LISTEN) {
1657                 ret = tcp_v4_do_rcv(sk, skb);
1658                 goto put_and_return;
1659         }
1660
1661         sk_incoming_cpu_update(sk);
1662
1663         bh_lock_sock_nested(sk);
1664         tcp_segs_in(tcp_sk(sk), skb);
1665         ret = 0;
1666         if (!sock_owned_by_user(sk)) {
1667                 if (!tcp_prequeue(sk, skb))
1668                         ret = tcp_v4_do_rcv(sk, skb);
1669         } else if (unlikely(sk_add_backlog(sk, skb,
1670                                            sk->sk_rcvbuf + sk->sk_sndbuf))) {
1671                 bh_unlock_sock(sk);
1672                 __NET_INC_STATS(net, LINUX_MIB_TCPBACKLOGDROP);
1673                 goto discard_and_relse;
1674         }
1675         bh_unlock_sock(sk);
1676
1677 put_and_return:
1678         if (refcounted)
1679                 sock_put(sk);
1680
1681         return ret;
1682
1683 no_tcp_socket:
1684         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1685                 goto discard_it;
1686
1687         if (tcp_checksum_complete(skb)) {
1688 csum_error:
1689                 __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
1690 bad_packet:
1691                 __TCP_INC_STATS(net, TCP_MIB_INERRS);
1692         } else {
1693                 tcp_v4_send_reset(NULL, skb);
1694         }
1695
1696 discard_it:
1697         /* Discard frame. */
1698         kfree_skb(skb);
1699         return 0;
1700
1701 discard_and_relse:
1702         sk_drops_add(sk, skb);
1703         if (refcounted)
1704                 sock_put(sk);
1705         goto discard_it;
1706
1707 do_time_wait:
1708         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1709                 inet_twsk_put(inet_twsk(sk));
1710                 goto discard_it;
1711         }
1712
1713         if (tcp_checksum_complete(skb)) {
1714                 inet_twsk_put(inet_twsk(sk));
1715                 goto csum_error;
1716         }
1717         switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1718         case TCP_TW_SYN: {
1719                 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
1720                                                         &tcp_hashinfo, skb,
1721                                                         __tcp_hdrlen(th),
1722                                                         iph->saddr, th->source,
1723                                                         iph->daddr, th->dest,
1724                                                         inet_iif(skb));
1725                 if (sk2) {
1726                         inet_twsk_deschedule_put(inet_twsk(sk));
1727                         sk = sk2;
1728                         refcounted = false;
1729                         goto process;
1730                 }
1731                 /* Fall through to ACK */
1732         }
1733         case TCP_TW_ACK:
1734                 tcp_v4_timewait_ack(sk, skb);
1735                 break;
1736         case TCP_TW_RST:
1737                 tcp_v4_send_reset(sk, skb);
1738                 inet_twsk_deschedule_put(inet_twsk(sk));
1739                 goto discard_it;
1740         case TCP_TW_SUCCESS:;
1741         }
1742         goto discard_it;
1743 }
1744
1745 static struct timewait_sock_ops tcp_timewait_sock_ops = {
1746         .twsk_obj_size  = sizeof(struct tcp_timewait_sock),
1747         .twsk_unique    = tcp_twsk_unique,
1748         .twsk_destructor= tcp_twsk_destructor,
1749 };
1750
1751 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
1752 {
1753         struct dst_entry *dst = skb_dst(skb);
1754
1755         if (dst && dst_hold_safe(dst)) {
1756                 sk->sk_rx_dst = dst;
1757                 inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
1758         }
1759 }
1760 EXPORT_SYMBOL(inet_sk_rx_dst_set);
1761
1762 const struct inet_connection_sock_af_ops ipv4_specific = {
1763         .queue_xmit        = ip_queue_xmit,
1764         .send_check        = tcp_v4_send_check,
1765         .rebuild_header    = inet_sk_rebuild_header,
1766         .sk_rx_dst_set     = inet_sk_rx_dst_set,
1767         .conn_request      = tcp_v4_conn_request,
1768         .syn_recv_sock     = tcp_v4_syn_recv_sock,
1769         .net_header_len    = sizeof(struct iphdr),
1770         .setsockopt        = ip_setsockopt,
1771         .getsockopt        = ip_getsockopt,
1772         .addr2sockaddr     = inet_csk_addr2sockaddr,
1773         .sockaddr_len      = sizeof(struct sockaddr_in),
1774         .bind_conflict     = inet_csk_bind_conflict,
1775 #ifdef CONFIG_COMPAT
1776         .compat_setsockopt = compat_ip_setsockopt,
1777         .compat_getsockopt = compat_ip_getsockopt,
1778 #endif
1779         .mtu_reduced       = tcp_v4_mtu_reduced,
1780 };
1781 EXPORT_SYMBOL(ipv4_specific);
1782
#ifdef CONFIG_TCP_MD5SIG
/* IPv4 TCP-MD5 callbacks: key lookup, hash computation and key parsing. */
static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
	.md5_parse	= tcp_v4_parse_md5_keys,
	.md5_lookup	= tcp_v4_md5_lookup,
	.calc_md5_hash	= tcp_v4_md5_hash_skb,
};
#endif
1790
1791 /* NOTE: A lot of things set to zero explicitly by call to
1792  *       sk_alloc() so need not be done here.
1793  */
1794 static int tcp_v4_init_sock(struct sock *sk)
1795 {
1796         struct inet_connection_sock *icsk = inet_csk(sk);
1797
1798         tcp_init_sock(sk);
1799
1800         icsk->icsk_af_ops = &ipv4_specific;
1801
1802 #ifdef CONFIG_TCP_MD5SIG
1803         tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
1804 #endif
1805
1806         return 0;
1807 }
1808
1809 void tcp_v4_destroy_sock(struct sock *sk)
1810 {
1811         struct tcp_sock *tp = tcp_sk(sk);
1812
1813         tcp_clear_xmit_timers(sk);
1814
1815         tcp_cleanup_congestion_control(sk);
1816
1817         /* Cleanup up the write buffer. */
1818         tcp_write_queue_purge(sk);
1819
1820         /* Cleans up our, hopefully empty, out_of_order_queue. */
1821         __skb_queue_purge(&tp->out_of_order_queue);
1822
1823 #ifdef CONFIG_TCP_MD5SIG
1824         /* Clean up the MD5 key list, if any */
1825         if (tp->md5sig_info) {
1826                 tcp_clear_md5_list(sk);
1827                 kfree_rcu(tp->md5sig_info, rcu);
1828                 tp->md5sig_info = NULL;
1829         }
1830 #endif
1831
1832         /* Clean prequeue, it must be empty really */
1833         __skb_queue_purge(&tp->ucopy.prequeue);
1834
1835         /* Clean up a referenced TCP bind bucket. */
1836         if (inet_csk(sk)->icsk_bind_hash)
1837                 inet_put_port(sk);
1838
1839         BUG_ON(tp->fastopen_rsk);
1840
1841         /* If socket is aborted during connect operation */
1842         tcp_free_fastopen_req(tp);
1843         tcp_saved_syn_free(tp);
1844
1845         local_bh_disable();
1846         sk_sockets_allocated_dec(sk);
1847         local_bh_enable();
1848
1849         if (mem_cgroup_sockets_enabled && sk->sk_memcg)
1850                 sock_release_memcg(sk);
1851 }
1852 EXPORT_SYMBOL(tcp_v4_destroy_sock);
1853
1854 #ifdef CONFIG_PROC_FS
1855 /* Proc filesystem TCP sock list dumping. */
1856
1857 /*
1858  * Get next listener socket follow cur.  If cur is NULL, get first socket
1859  * starting from bucket given in st->bucket; when st->bucket is zero the
1860  * very first socket in the hash table is returned.
1861  */
1862 static void *listening_get_next(struct seq_file *seq, void *cur)
1863 {
1864         struct tcp_iter_state *st = seq->private;
1865         struct net *net = seq_file_net(seq);
1866         struct inet_listen_hashbucket *ilb;
1867         struct inet_connection_sock *icsk;
1868         struct sock *sk = cur;
1869
1870         if (!sk) {
1871 get_head:
1872                 ilb = &tcp_hashinfo.listening_hash[st->bucket];
1873                 spin_lock_bh(&ilb->lock);
1874                 sk = sk_head(&ilb->head);
1875                 st->offset = 0;
1876                 goto get_sk;
1877         }
1878         ilb = &tcp_hashinfo.listening_hash[st->bucket];
1879         ++st->num;
1880         ++st->offset;
1881
1882         sk = sk_next(sk);
1883 get_sk:
1884         sk_for_each_from(sk) {
1885                 if (!net_eq(sock_net(sk), net))
1886                         continue;
1887                 if (sk->sk_family == st->family)
1888                         return sk;
1889                 icsk = inet_csk(sk);
1890         }
1891         spin_unlock_bh(&ilb->lock);
1892         st->offset = 0;
1893         if (++st->bucket < INET_LHTABLE_SIZE)
1894                 goto get_head;
1895         return NULL;
1896 }
1897
1898 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
1899 {
1900         struct tcp_iter_state *st = seq->private;
1901         void *rc;
1902
1903         st->bucket = 0;
1904         st->offset = 0;
1905         rc = listening_get_next(seq, NULL);
1906
1907         while (rc && *pos) {
1908                 rc = listening_get_next(seq, rc);
1909                 --*pos;
1910         }
1911         return rc;
1912 }
1913
1914 static inline bool empty_bucket(const struct tcp_iter_state *st)
1915 {
1916         return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
1917 }
1918
1919 /*
1920  * Get first established socket starting from bucket given in st->bucket.
1921  * If st->bucket is zero, the very first socket in the hash is returned.
1922  */
1923 static void *established_get_first(struct seq_file *seq)
1924 {
1925         struct tcp_iter_state *st = seq->private;
1926         struct net *net = seq_file_net(seq);
1927         void *rc = NULL;
1928
1929         st->offset = 0;
1930         for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
1931                 struct sock *sk;
1932                 struct hlist_nulls_node *node;
1933                 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
1934
1935                 /* Lockless fast path for the common case of empty buckets */
1936                 if (empty_bucket(st))
1937                         continue;
1938
1939                 spin_lock_bh(lock);
1940                 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
1941                         if (sk->sk_family != st->family ||
1942                             !net_eq(sock_net(sk), net)) {
1943                                 continue;
1944                         }
1945                         rc = sk;
1946                         goto out;
1947                 }
1948                 spin_unlock_bh(lock);
1949         }
1950 out:
1951         return rc;
1952 }
1953
1954 static void *established_get_next(struct seq_file *seq, void *cur)
1955 {
1956         struct sock *sk = cur;
1957         struct hlist_nulls_node *node;
1958         struct tcp_iter_state *st = seq->private;
1959         struct net *net = seq_file_net(seq);
1960
1961         ++st->num;
1962         ++st->offset;
1963
1964         sk = sk_nulls_next(sk);
1965
1966         sk_nulls_for_each_from(sk, node) {
1967                 if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
1968                         return sk;
1969         }
1970
1971         spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
1972         ++st->bucket;
1973         return established_get_first(seq);
1974 }
1975
1976 static void *established_get_idx(struct seq_file *seq, loff_t pos)
1977 {
1978         struct tcp_iter_state *st = seq->private;
1979         void *rc;
1980
1981         st->bucket = 0;
1982         rc = established_get_first(seq);
1983
1984         while (rc && pos) {
1985                 rc = established_get_next(seq, rc);
1986                 --pos;
1987         }
1988         return rc;
1989 }
1990
1991 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
1992 {
1993         void *rc;
1994         struct tcp_iter_state *st = seq->private;
1995
1996         st->state = TCP_SEQ_STATE_LISTENING;
1997         rc        = listening_get_idx(seq, &pos);
1998
1999         if (!rc) {
2000                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2001                 rc        = established_get_idx(seq, pos);
2002         }
2003
2004         return rc;
2005 }
2006
2007 static void *tcp_seek_last_pos(struct seq_file *seq)
2008 {
2009         struct tcp_iter_state *st = seq->private;
2010         int offset = st->offset;
2011         int orig_num = st->num;
2012         void *rc = NULL;
2013
2014         switch (st->state) {
2015         case TCP_SEQ_STATE_LISTENING:
2016                 if (st->bucket >= INET_LHTABLE_SIZE)
2017                         break;
2018                 st->state = TCP_SEQ_STATE_LISTENING;
2019                 rc = listening_get_next(seq, NULL);
2020                 while (offset-- && rc)
2021                         rc = listening_get_next(seq, rc);
2022                 if (rc)
2023                         break;
2024                 st->bucket = 0;
2025                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2026                 /* Fallthrough */
2027         case TCP_SEQ_STATE_ESTABLISHED:
2028                 if (st->bucket > tcp_hashinfo.ehash_mask)
2029                         break;
2030                 rc = established_get_first(seq);
2031                 while (offset-- && rc)
2032                         rc = established_get_next(seq, rc);
2033         }
2034
2035         st->num = orig_num;
2036
2037         return rc;
2038 }
2039
2040 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2041 {
2042         struct tcp_iter_state *st = seq->private;
2043         void *rc;
2044
2045         if (*pos && *pos == st->last_pos) {
2046                 rc = tcp_seek_last_pos(seq);
2047                 if (rc)
2048                         goto out;
2049         }
2050
2051         st->state = TCP_SEQ_STATE_LISTENING;
2052         st->num = 0;
2053         st->bucket = 0;
2054         st->offset = 0;
2055         rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2056
2057 out:
2058         st->last_pos = *pos;
2059         return rc;
2060 }
2061
2062 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2063 {
2064         struct tcp_iter_state *st = seq->private;
2065         void *rc = NULL;
2066
2067         if (v == SEQ_START_TOKEN) {
2068                 rc = tcp_get_idx(seq, 0);
2069                 goto out;
2070         }
2071
2072         switch (st->state) {
2073         case TCP_SEQ_STATE_LISTENING:
2074                 rc = listening_get_next(seq, v);
2075                 if (!rc) {
2076                         st->state = TCP_SEQ_STATE_ESTABLISHED;
2077                         st->bucket = 0;
2078                         st->offset = 0;
2079                         rc        = established_get_first(seq);
2080                 }
2081                 break;
2082         case TCP_SEQ_STATE_ESTABLISHED:
2083                 rc = established_get_next(seq, v);
2084                 break;
2085         }
2086 out:
2087         ++*pos;
2088         st->last_pos = *pos;
2089         return rc;
2090 }
2091
2092 static void tcp_seq_stop(struct seq_file *seq, void *v)
2093 {
2094         struct tcp_iter_state *st = seq->private;
2095
2096         switch (st->state) {
2097         case TCP_SEQ_STATE_LISTENING:
2098                 if (v != SEQ_START_TOKEN)
2099                         spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
2100                 break;
2101         case TCP_SEQ_STATE_ESTABLISHED:
2102                 if (v)
2103                         spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2104                 break;
2105         }
2106 }
2107
2108 int tcp_seq_open(struct inode *inode, struct file *file)
2109 {
2110         struct tcp_seq_afinfo *afinfo = PDE_DATA(inode);
2111         struct tcp_iter_state *s;
2112         int err;
2113
2114         err = seq_open_net(inode, file, &afinfo->seq_ops,
2115                           sizeof(struct tcp_iter_state));
2116         if (err < 0)
2117                 return err;
2118
2119         s = ((struct seq_file *)file->private_data)->private;
2120         s->family               = afinfo->family;
2121         s->last_pos             = 0;
2122         return 0;
2123 }
2124 EXPORT_SYMBOL(tcp_seq_open);
2125
2126 int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
2127 {
2128         int rc = 0;
2129         struct proc_dir_entry *p;
2130
2131         afinfo->seq_ops.start           = tcp_seq_start;
2132         afinfo->seq_ops.next            = tcp_seq_next;
2133         afinfo->seq_ops.stop            = tcp_seq_stop;
2134
2135         p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
2136                              afinfo->seq_fops, afinfo);
2137         if (!p)
2138                 rc = -ENOMEM;
2139         return rc;
2140 }
2141 EXPORT_SYMBOL(tcp_proc_register);
2142
2143 void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
2144 {
2145         remove_proc_entry(afinfo->name, net->proc_net);
2146 }
2147 EXPORT_SYMBOL(tcp_proc_unregister);
2148
2149 static void get_openreq4(const struct request_sock *req,
2150                          struct seq_file *f, int i)
2151 {
2152         const struct inet_request_sock *ireq = inet_rsk(req);
2153         long delta = req->rsk_timer.expires - jiffies;
2154
2155         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2156                 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2157                 i,
2158                 ireq->ir_loc_addr,
2159                 ireq->ir_num,
2160                 ireq->ir_rmt_addr,
2161                 ntohs(ireq->ir_rmt_port),
2162                 TCP_SYN_RECV,
2163                 0, 0, /* could print option size, but that is af dependent. */
2164                 1,    /* timers active (only the expire timer) */
2165                 jiffies_delta_to_clock_t(delta),
2166                 req->num_timeout,
2167                 from_kuid_munged(seq_user_ns(f),
2168                                  sock_i_uid(req->rsk_listener)),
2169                 0,  /* non standard timer */
2170                 0, /* open_requests have no inode */
2171                 0,
2172                 req);
2173 }
2174
2175 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2176 {
2177         int timer_active;
2178         unsigned long timer_expires;
2179         const struct tcp_sock *tp = tcp_sk(sk);
2180         const struct inet_connection_sock *icsk = inet_csk(sk);
2181         const struct inet_sock *inet = inet_sk(sk);
2182         const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2183         __be32 dest = inet->inet_daddr;
2184         __be32 src = inet->inet_rcv_saddr;
2185         __u16 destp = ntohs(inet->inet_dport);
2186         __u16 srcp = ntohs(inet->inet_sport);
2187         int rx_queue;
2188         int state;
2189
2190         if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2191             icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
2192             icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2193                 timer_active    = 1;
2194                 timer_expires   = icsk->icsk_timeout;
2195         } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2196                 timer_active    = 4;
2197                 timer_expires   = icsk->icsk_timeout;
2198         } else if (timer_pending(&sk->sk_timer)) {
2199                 timer_active    = 2;
2200                 timer_expires   = sk->sk_timer.expires;
2201         } else {
2202                 timer_active    = 0;
2203                 timer_expires = jiffies;
2204         }
2205
2206         state = sk_state_load(sk);
2207         if (state == TCP_LISTEN)
2208                 rx_queue = sk->sk_ack_backlog;
2209         else
2210                 /* Because we don't lock the socket,
2211                  * we might find a transient negative value.
2212                  */
2213                 rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2214
2215         seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2216                         "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2217                 i, src, srcp, dest, destp, state,
2218                 tp->write_seq - tp->snd_una,
2219                 rx_queue,
2220                 timer_active,
2221                 jiffies_delta_to_clock_t(timer_expires - jiffies),
2222                 icsk->icsk_retransmits,
2223                 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2224                 icsk->icsk_probes_out,
2225                 sock_i_ino(sk),
2226                 atomic_read(&sk->sk_refcnt), sk,
2227                 jiffies_to_clock_t(icsk->icsk_rto),
2228                 jiffies_to_clock_t(icsk->icsk_ack.ato),
2229                 (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2230                 tp->snd_cwnd,
2231                 state == TCP_LISTEN ?
2232                     fastopenq->max_qlen :
2233                     (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2234 }
2235
2236 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2237                                struct seq_file *f, int i)
2238 {
2239         long delta = tw->tw_timer.expires - jiffies;
2240         __be32 dest, src;
2241         __u16 destp, srcp;
2242
2243         dest  = tw->tw_daddr;
2244         src   = tw->tw_rcv_saddr;
2245         destp = ntohs(tw->tw_dport);
2246         srcp  = ntohs(tw->tw_sport);
2247
2248         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2249                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2250                 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2251                 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2252                 atomic_read(&tw->tw_refcnt), tw);
2253 }
2254
2255 #define TMPSZ 150
2256
2257 static int tcp4_seq_show(struct seq_file *seq, void *v)
2258 {
2259         struct tcp_iter_state *st;
2260         struct sock *sk = v;
2261
2262         seq_setwidth(seq, TMPSZ - 1);
2263         if (v == SEQ_START_TOKEN) {
2264                 seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2265                            "rx_queue tr tm->when retrnsmt   uid  timeout "
2266                            "inode");
2267                 goto out;
2268         }
2269         st = seq->private;
2270
2271         if (sk->sk_state == TCP_TIME_WAIT)
2272                 get_timewait4_sock(v, seq, st->num);
2273         else if (sk->sk_state == TCP_NEW_SYN_RECV)
2274                 get_openreq4(v, seq, st->num);
2275         else
2276                 get_tcp4_sock(v, seq, st->num);
2277 out:
2278         seq_pad(seq, '\n');
2279         return 0;
2280 }
2281
2282 static const struct file_operations tcp_afinfo_seq_fops = {
2283         .owner   = THIS_MODULE,
2284         .open    = tcp_seq_open,
2285         .read    = seq_read,
2286         .llseek  = seq_lseek,
2287         .release = seq_release_net
2288 };
2289
2290 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2291         .name           = "tcp",
2292         .family         = AF_INET,
2293         .seq_fops       = &tcp_afinfo_seq_fops,
2294         .seq_ops        = {
2295                 .show           = tcp4_seq_show,
2296         },
2297 };
2298
2299 static int __net_init tcp4_proc_init_net(struct net *net)
2300 {
2301         return tcp_proc_register(net, &tcp4_seq_afinfo);
2302 }
2303
2304 static void __net_exit tcp4_proc_exit_net(struct net *net)
2305 {
2306         tcp_proc_unregister(net, &tcp4_seq_afinfo);
2307 }
2308
2309 static struct pernet_operations tcp4_net_ops = {
2310         .init = tcp4_proc_init_net,
2311         .exit = tcp4_proc_exit_net,
2312 };
2313
2314 int __init tcp4_proc_init(void)
2315 {
2316         return register_pernet_subsys(&tcp4_net_ops);
2317 }
2318
2319 void tcp4_proc_exit(void)
2320 {
2321         unregister_pernet_subsys(&tcp4_net_ops);
2322 }
2323 #endif /* CONFIG_PROC_FS */
2324
2325 struct proto tcp_prot = {
2326         .name                   = "TCP",
2327         .owner                  = THIS_MODULE,
2328         .close                  = tcp_close,
2329         .connect                = tcp_v4_connect,
2330         .disconnect             = tcp_disconnect,
2331         .accept                 = inet_csk_accept,
2332         .ioctl                  = tcp_ioctl,
2333         .init                   = tcp_v4_init_sock,
2334         .destroy                = tcp_v4_destroy_sock,
2335         .shutdown               = tcp_shutdown,
2336         .setsockopt             = tcp_setsockopt,
2337         .getsockopt             = tcp_getsockopt,
2338         .recvmsg                = tcp_recvmsg,
2339         .sendmsg                = tcp_sendmsg,
2340         .sendpage               = tcp_sendpage,
2341         .backlog_rcv            = tcp_v4_do_rcv,
2342         .release_cb             = tcp_release_cb,
2343         .hash                   = inet_hash,
2344         .unhash                 = inet_unhash,
2345         .get_port               = inet_csk_get_port,
2346         .enter_memory_pressure  = tcp_enter_memory_pressure,
2347         .stream_memory_free     = tcp_stream_memory_free,
2348         .sockets_allocated      = &tcp_sockets_allocated,
2349         .orphan_count           = &tcp_orphan_count,
2350         .memory_allocated       = &tcp_memory_allocated,
2351         .memory_pressure        = &tcp_memory_pressure,
2352         .sysctl_mem             = sysctl_tcp_mem,
2353         .sysctl_wmem            = sysctl_tcp_wmem,
2354         .sysctl_rmem            = sysctl_tcp_rmem,
2355         .max_header             = MAX_TCP_HEADER,
2356         .obj_size               = sizeof(struct tcp_sock),
2357         .slab_flags             = SLAB_DESTROY_BY_RCU,
2358         .twsk_prot              = &tcp_timewait_sock_ops,
2359         .rsk_prot               = &tcp_request_sock_ops,
2360         .h.hashinfo             = &tcp_hashinfo,
2361         .no_autobind            = true,
2362 #ifdef CONFIG_COMPAT
2363         .compat_setsockopt      = compat_tcp_setsockopt,
2364         .compat_getsockopt      = compat_tcp_getsockopt,
2365 #endif
2366         .diag_destroy           = tcp_abort,
2367 };
2368 EXPORT_SYMBOL(tcp_prot);
2369
2370 static void __net_exit tcp_sk_exit(struct net *net)
2371 {
2372         int cpu;
2373
2374         for_each_possible_cpu(cpu)
2375                 inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
2376         free_percpu(net->ipv4.tcp_sk);
2377 }
2378
2379 static int __net_init tcp_sk_init(struct net *net)
2380 {
2381         int res, cpu;
2382
2383         net->ipv4.tcp_sk = alloc_percpu(struct sock *);
2384         if (!net->ipv4.tcp_sk)
2385                 return -ENOMEM;
2386
2387         for_each_possible_cpu(cpu) {
2388                 struct sock *sk;
2389
2390                 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
2391                                            IPPROTO_TCP, net);
2392                 if (res)
2393                         goto fail;
2394                 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
2395                 *per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
2396         }
2397
2398         net->ipv4.sysctl_tcp_ecn = 2;
2399         net->ipv4.sysctl_tcp_ecn_fallback = 1;
2400
2401         net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
2402         net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
2403         net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
2404
2405         net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
2406         net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
2407         net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
2408
2409         net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
2410         net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
2411         net->ipv4.sysctl_tcp_syncookies = 1;
2412         net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
2413         net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
2414         net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
2415         net->ipv4.sysctl_tcp_orphan_retries = 0;
2416         net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
2417         net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
2418
2419         return 0;
2420 fail:
2421         tcp_sk_exit(net);
2422
2423         return res;
2424 }
2425
2426 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2427 {
2428         inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET);
2429 }
2430
2431 static struct pernet_operations __net_initdata tcp_sk_ops = {
2432        .init       = tcp_sk_init,
2433        .exit       = tcp_sk_exit,
2434        .exit_batch = tcp_sk_exit_batch,
2435 };
2436
2437 void __init tcp_v4_init(void)
2438 {
2439         inet_hashinfo_init(&tcp_hashinfo);
2440         if (register_pernet_subsys(&tcp_sk_ops))
2441                 panic("Failed to create the TCP control socket.\n");
2442 }