[cascardo/linux.git] / net / ipv4 / tcp_ipv4.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              Implementation of the Transmission Control Protocol(TCP).
7  *
8  *              IPv4 specific functions
9  *
10  *
11  *              code split from:
12  *              linux/ipv4/tcp.c
13  *              linux/ipv4/tcp_input.c
14  *              linux/ipv4/tcp_output.c
15  *
16  *              See tcp.c for author information
17  *
18  *      This program is free software; you can redistribute it and/or
19  *      modify it under the terms of the GNU General Public License
20  *      as published by the Free Software Foundation; either version
21  *      2 of the License, or (at your option) any later version.
22  */
23
24 /*
25  * Changes:
26  *              David S. Miller :       New socket lookup architecture.
27  *                                      This code is dedicated to John Dyson.
28  *              David S. Miller :       Change semantics of established hash,
29  *                                      half is devoted to TIME_WAIT sockets
30  *                                      and the rest go in the other half.
31  *              Andi Kleen :            Add support for syncookies and fixed
32  *                                      some bugs: ip options weren't passed to
33  *                                      the TCP layer, missed a check for an
34  *                                      ACK bit.
35  *              Andi Kleen :            Implemented fast path mtu discovery.
36  *                                      Fixed many serious bugs in the
37  *                                      request_sock handling and moved
38  *                                      most of it into the af independent code.
39  *                                      Added tail drop and some other bugfixes.
40  *                                      Added new listen semantics.
41  *              Mike McLagan    :       Routing by source
42  *      Juan Jose Ciarlante:            ip_dynaddr bits
43  *              Andi Kleen:             various fixes.
44  *      Vitaly E. Lavrov        :       Transparent proxy revived after a
45  *                                      year-long coma.
46  *      Andi Kleen              :       Fix new listen.
47  *      Andi Kleen              :       Fix accept error reporting.
48  *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
49  *      Alexey Kuznetsov                allows both IPv4 and IPv6 sockets to bind
50  *                                      a single port at the same time.
51  */
52
53 #define pr_fmt(fmt) "TCP: " fmt
54
55 #include <linux/bottom_half.h>
56 #include <linux/types.h>
57 #include <linux/fcntl.h>
58 #include <linux/module.h>
59 #include <linux/random.h>
60 #include <linux/cache.h>
61 #include <linux/jhash.h>
62 #include <linux/init.h>
63 #include <linux/times.h>
64 #include <linux/slab.h>
65
66 #include <net/net_namespace.h>
67 #include <net/icmp.h>
68 #include <net/inet_hashtables.h>
69 #include <net/tcp.h>
70 #include <net/transp_v6.h>
71 #include <net/ipv6.h>
72 #include <net/inet_common.h>
73 #include <net/timewait_sock.h>
74 #include <net/xfrm.h>
75 #include <net/secure_seq.h>
76 #include <net/busy_poll.h>
77
78 #include <linux/inet.h>
79 #include <linux/ipv6.h>
80 #include <linux/stddef.h>
81 #include <linux/proc_fs.h>
82 #include <linux/seq_file.h>
83
84 #include <crypto/hash.h>
85 #include <linux/scatterlist.h>
86
87 int sysctl_tcp_tw_reuse __read_mostly;
88 int sysctl_tcp_low_latency __read_mostly;
89 EXPORT_SYMBOL(sysctl_tcp_low_latency);
90
91 #ifdef CONFIG_TCP_MD5SIG
92 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
93                                __be32 daddr, __be32 saddr, const struct tcphdr *th);
94 #endif
95
96 struct inet_hashinfo tcp_hashinfo;
97 EXPORT_SYMBOL(tcp_hashinfo);
98
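/* Pick the initial sequence number for an outgoing connection, derived from
 * the address/port 4-tuple of this SYN via secure_tcp_sequence_number().
 */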
99 static  __u32 tcp_v4_init_sequence(const struct sk_buff *skb)
100 {
101         return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
102                                           ip_hdr(skb)->saddr,
103                                           tcp_hdr(skb)->dest,
104                                           tcp_hdr(skb)->source);
105 }
106
107 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
108 {
109         const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
110         struct tcp_sock *tp = tcp_sk(sk);
111
112         /* With PAWS, it is safe from the viewpoint
113            of data integrity. Even without PAWS it is safe provided sequence
114            spaces do not overlap i.e. at data rates <= 80Mbit/sec.
115
116            Actually, the idea is close to VJ's, only the timestamp cache is
117            held not per host but per port pair, and the TW bucket is used as
118            the state holder.
119
120            If the TW bucket has already been destroyed we fall back to VJ's
121            scheme and use the initial timestamp retrieved from the peer table.
122          */
123         if (tcptw->tw_ts_recent_stamp &&
124             (!twp || (sysctl_tcp_tw_reuse &&
125                              get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
126                 tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
127                 if (tp->write_seq == 0)
128                         tp->write_seq = 1;
129                 tp->rx_opt.ts_recent       = tcptw->tw_ts_recent;
130                 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
131                 sock_hold(sktw);
132                 return 1;
133         }
134
135         return 0;
136 }
137 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
138
139 /* This will initiate an outgoing connection. */
140 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
141 {
142         struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
143         struct inet_sock *inet = inet_sk(sk);
144         struct tcp_sock *tp = tcp_sk(sk);
145         __be16 orig_sport, orig_dport;
146         __be32 daddr, nexthop;
147         struct flowi4 *fl4;
148         struct rtable *rt;
149         int err;
150         struct ip_options_rcu *inet_opt;
151
152         if (addr_len < sizeof(struct sockaddr_in))
153                 return -EINVAL;
154
155         if (usin->sin_family != AF_INET)
156                 return -EAFNOSUPPORT;
157
158         nexthop = daddr = usin->sin_addr.s_addr;
159         inet_opt = rcu_dereference_protected(inet->inet_opt,
160                                              lockdep_sock_is_held(sk));
161         if (inet_opt && inet_opt->opt.srr) {
162                 if (!daddr)
163                         return -EINVAL;
164                 nexthop = inet_opt->opt.faddr;
165         }
166
167         orig_sport = inet->inet_sport;
168         orig_dport = usin->sin_port;
169         fl4 = &inet->cork.fl.u.ip4;
170         rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
171                               RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
172                               IPPROTO_TCP,
173                               orig_sport, orig_dport, sk);
174         if (IS_ERR(rt)) {
175                 err = PTR_ERR(rt);
176                 if (err == -ENETUNREACH)
177                         IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
178                 return err;
179         }
180
181         if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
182                 ip_rt_put(rt);
183                 return -ENETUNREACH;
184         }
185
186         if (!inet_opt || !inet_opt->opt.srr)
187                 daddr = fl4->daddr;
188
189         if (!inet->inet_saddr)
190                 inet->inet_saddr = fl4->saddr;
191         sk_rcv_saddr_set(sk, inet->inet_saddr);
192
193         if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
194                 /* Reset inherited state */
195                 tp->rx_opt.ts_recent       = 0;
196                 tp->rx_opt.ts_recent_stamp = 0;
197                 if (likely(!tp->repair))
198                         tp->write_seq      = 0;
199         }
200
201         if (tcp_death_row.sysctl_tw_recycle &&
202             !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr)
203                 tcp_fetch_timewait_stamp(sk, &rt->dst);
204
205         inet->inet_dport = usin->sin_port;
206         sk_daddr_set(sk, daddr);
207
208         inet_csk(sk)->icsk_ext_hdr_len = 0;
209         if (inet_opt)
210                 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
211
212         tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
213
214         /* Socket identity is still unknown (sport may be zero).
215          * However we set the state to SYN-SENT and, without releasing the
216          * socket lock, select a source port, enter ourselves into the hash tables and
217          * complete initialization after this.
218          */
219         tcp_set_state(sk, TCP_SYN_SENT);
220         err = inet_hash_connect(&tcp_death_row, sk);
221         if (err)
222                 goto failure;
223
224         sk_set_txhash(sk);
225
226         rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
227                                inet->inet_sport, inet->inet_dport, sk);
228         if (IS_ERR(rt)) {
229                 err = PTR_ERR(rt);
230                 rt = NULL;
231                 goto failure;
232         }
233         /* OK, now commit destination to socket.  */
234         sk->sk_gso_type = SKB_GSO_TCPV4;
235         sk_setup_caps(sk, &rt->dst);
236
237         if (!tp->write_seq && likely(!tp->repair))
238                 tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
239                                                            inet->inet_daddr,
240                                                            inet->inet_sport,
241                                                            usin->sin_port);
242
243         inet->inet_id = tp->write_seq ^ jiffies;
244
245         err = tcp_connect(sk);
246
247         rt = NULL;
248         if (err)
249                 goto failure;
250
251         return 0;
252
253 failure:
254         /*
255          * This unhashes the socket and releases the local port,
256          * if necessary.
257          */
258         tcp_set_state(sk, TCP_CLOSE);
259         ip_rt_put(rt);
260         sk->sk_route_caps = 0;
261         inet->inet_dport = 0;
262         return err;
263 }
264 EXPORT_SYMBOL(tcp_v4_connect);
265
266 /*
267  * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
268  * It can be called through tcp_release_cb() if socket was owned by user
269  * at the time tcp_v4_err() was called to handle ICMP message.
270  */
271 void tcp_v4_mtu_reduced(struct sock *sk)
272 {
273         struct dst_entry *dst;
274         struct inet_sock *inet = inet_sk(sk);
275         u32 mtu = tcp_sk(sk)->mtu_info;
276
277         dst = inet_csk_update_pmtu(sk, mtu);
278         if (!dst)
279                 return;
280
281         /* Something is about to go wrong... Remember the soft error
282          * in case this connection is not able to recover.
283          */
284         if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
285                 sk->sk_err_soft = EMSGSIZE;
286
287         mtu = dst_mtu(dst);
288
289         if (inet->pmtudisc != IP_PMTUDISC_DONT &&
290             ip_sk_accept_pmtu(sk) &&
291             inet_csk(sk)->icsk_pmtu_cookie > mtu) {
292                 tcp_sync_mss(sk, mtu);
293
294                 /* Resend the TCP packet because it's
295                  * clear that the old packet has been
296                  * dropped. This is the new "fast" path mtu
297                  * discovery.
298                  */
299                 tcp_simple_retransmit(sk);
300         } /* else let the usual retransmit timer handle it */
301 }
302 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
303
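/* Handle an ICMP redirect: revalidate the cached route and let its
 * redirect handler update the next hop.
 */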
304 static void do_redirect(struct sk_buff *skb, struct sock *sk)
305 {
306         struct dst_entry *dst = __sk_dst_check(sk, 0);
307
308         if (dst)
309                 dst->ops->redirect(dst, sk, skb);
310 }
311
312
313 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
314 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
315 {
316         struct request_sock *req = inet_reqsk(sk);
317         struct net *net = sock_net(sk);
318
319         /* ICMPs are not backlogged, hence we cannot get
320          * an established socket here.
321          */
322         if (seq != tcp_rsk(req)->snt_isn) {
323                 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
324         } else if (abort) {
325                 /*
326                  * Still in SYN_RECV, just remove it silently.
327                  * There is no good way to pass the error to the newly
328                  * created socket, and POSIX does not want network
329                  * errors returned from accept().
330                  */
331                 inet_csk_reqsk_queue_drop(req->rsk_listener, req);
332                 tcp_listendrop(req->rsk_listener);
333         }
334         reqsk_put(req);
335 }
336 EXPORT_SYMBOL(tcp_req_err);
337
338 /*
339  * This routine is called by the ICMP module when it gets some
340  * sort of error condition.  If err < 0 then the socket should
341  * be closed and the error returned to the user.  If err > 0
342  * it's just the icmp type << 8 | icmp code.  After adjustment
343  * header points to the first 8 bytes of the tcp header.  We need
344  * to find the appropriate port.
345  *
346  * The locking strategy used here is very "optimistic". When
347  * someone else accesses the socket the ICMP is just dropped
348  * and for some paths there is no check at all.
349  * A more general error queue to queue errors for later handling
350  * is probably better.
351  *
352  */
353
354 void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
355 {
356         const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
357         struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
358         struct inet_connection_sock *icsk;
359         struct tcp_sock *tp;
360         struct inet_sock *inet;
361         const int type = icmp_hdr(icmp_skb)->type;
362         const int code = icmp_hdr(icmp_skb)->code;
363         struct sock *sk;
364         struct sk_buff *skb;
365         struct request_sock *fastopen;
366         __u32 seq, snd_una;
367         __u32 remaining;
368         int err;
369         struct net *net = dev_net(icmp_skb->dev);
370
371         sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
372                                        th->dest, iph->saddr, ntohs(th->source),
373                                        inet_iif(icmp_skb));
374         if (!sk) {
375                 __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
376                 return;
377         }
378         if (sk->sk_state == TCP_TIME_WAIT) {
379                 inet_twsk_put(inet_twsk(sk));
380                 return;
381         }
382         seq = ntohl(th->seq);
383         if (sk->sk_state == TCP_NEW_SYN_RECV)
384                 return tcp_req_err(sk, seq,
385                                   type == ICMP_PARAMETERPROB ||
386                                   type == ICMP_TIME_EXCEEDED ||
387                                   (type == ICMP_DEST_UNREACH &&
388                                    (code == ICMP_NET_UNREACH ||
389                                     code == ICMP_HOST_UNREACH)));
390
391         bh_lock_sock(sk);
392         /* If too many ICMPs get dropped on busy
393          * servers this needs to be solved differently.
394          * We do take care of PMTU discovery (RFC1191) special case :
395          * we can receive locally generated ICMP messages while socket is held.
396          */
397         if (sock_owned_by_user(sk)) {
398                 if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
399                         __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
400         }
401         if (sk->sk_state == TCP_CLOSE)
402                 goto out;
403
404         if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
405                 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
406                 goto out;
407         }
408
409         icsk = inet_csk(sk);
410         tp = tcp_sk(sk);
411         /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child()) */
412         fastopen = tp->fastopen_rsk;
413         snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
414         if (sk->sk_state != TCP_LISTEN &&
415             !between(seq, snd_una, tp->snd_nxt)) {
416                 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
417                 goto out;
418         }
419
420         switch (type) {
421         case ICMP_REDIRECT:
422                 do_redirect(icmp_skb, sk);
423                 goto out;
424         case ICMP_SOURCE_QUENCH:
425                 /* Just silently ignore these. */
426                 goto out;
427         case ICMP_PARAMETERPROB:
428                 err = EPROTO;
429                 break;
430         case ICMP_DEST_UNREACH:
431                 if (code > NR_ICMP_UNREACH)
432                         goto out;
433
434                 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
435                         /* We are not interested in TCP_LISTEN and open_requests
436                          * (SYN-ACKs send out by Linux are always <576bytes so
437                          * they should go through unfragmented).
438                          */
439                         if (sk->sk_state == TCP_LISTEN)
440                                 goto out;
441
442                         tp->mtu_info = info;
443                         if (!sock_owned_by_user(sk)) {
444                                 tcp_v4_mtu_reduced(sk);
445                         } else {
446                                 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &tp->tsq_flags))
447                                         sock_hold(sk);
448                         }
449                         goto out;
450                 }
451
452                 err = icmp_err_convert[code].errno;
453                 /* check if icmp_skb allows revert of backoff
454                  * (see draft-zimmermann-tcp-lcd) */
455                 if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
456                         break;
457                 if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
458                     !icsk->icsk_backoff || fastopen)
459                         break;
460
461                 if (sock_owned_by_user(sk))
462                         break;
463
464                 icsk->icsk_backoff--;
465                 icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) :
466                                                TCP_TIMEOUT_INIT;
467                 icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
468
469                 skb = tcp_write_queue_head(sk);
470                 BUG_ON(!skb);
471
472                 remaining = icsk->icsk_rto -
473                             min(icsk->icsk_rto,
474                                 tcp_time_stamp - tcp_skb_timestamp(skb));
475
476                 if (remaining) {
477                         inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
478                                                   remaining, TCP_RTO_MAX);
479                 } else {
480                         /* RTO revert clocked out retransmission.
481                          * Will retransmit now */
482                         tcp_retransmit_timer(sk);
483                 }
484
485                 break;
486         case ICMP_TIME_EXCEEDED:
487                 err = EHOSTUNREACH;
488                 break;
489         default:
490                 goto out;
491         }
492
493         switch (sk->sk_state) {
494         case TCP_SYN_SENT:
495         case TCP_SYN_RECV:
496                 /* Only in fast or simultaneous open. If a fast open socket
497                  * is already accepted it is treated as a connected one below.
498                  */
499                 if (fastopen && !fastopen->sk)
500                         break;
501
502                 if (!sock_owned_by_user(sk)) {
503                         sk->sk_err = err;
504
505                         sk->sk_error_report(sk);
506
507                         tcp_done(sk);
508                 } else {
509                         sk->sk_err_soft = err;
510                 }
511                 goto out;
512         }
513
514         /* If we've already connected we will keep trying
515          * until we time out, or the user gives up.
516          *
517          * rfc1122 4.2.3.9 allows us to treat as hard errors
518          * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
519          * but it is obsoleted by pmtu discovery).
520          *
521          * Note that in the modern internet, where routing is unreliable
522          * and broken firewalls sit in every dark corner, sending random
523          * errors ordered by their masters, even these two messages finally lose
524          * their original sense (even Linux sends invalid PORT_UNREACHs).
525          *
526          * Now we are in compliance with RFCs.
527          *                                                      --ANK (980905)
528          */
529
530         inet = inet_sk(sk);
531         if (!sock_owned_by_user(sk) && inet->recverr) {
532                 sk->sk_err = err;
533                 sk->sk_error_report(sk);
534         } else  { /* Only an error on timeout */
535                 sk->sk_err_soft = err;
536         }
537
538 out:
539         bh_unlock_sock(sk);
540         sock_put(sk);
541 }
542
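/* Fill in the TCP checksum field: for CHECKSUM_PARTIAL, set up the
 * pseudo-header checksum and offsets for hardware offload; otherwise
 * compute the full checksum in software.
 */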
543 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
544 {
545         struct tcphdr *th = tcp_hdr(skb);
546
547         if (skb->ip_summed == CHECKSUM_PARTIAL) {
548                 th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
549                 skb->csum_start = skb_transport_header(skb) - skb->head;
550                 skb->csum_offset = offsetof(struct tcphdr, check);
551         } else {
552                 th->check = tcp_v4_check(skb->len, saddr, daddr,
553                                          csum_partial(th,
554                                                       th->doff << 2,
555                                                       skb->csum));
556         }
557 }
558
559 /* This routine computes an IPv4 TCP checksum. */
560 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
561 {
562         const struct inet_sock *inet = inet_sk(sk);
563
564         __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
565 }
566 EXPORT_SYMBOL(tcp_v4_send_check);
567
568 /*
569  *      This routine will send an RST to the other tcp.
570  *
571  *      Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
572  *                    for reset.
573  *      Answer: if a packet caused an RST, it is not for a socket
574  *              existing in our system; if it is matched to a socket,
575  *              it is just a duplicate segment or a bug in the other side's TCP.
576  *              So we build the reply based only on the parameters
577  *              that arrived with the segment.
578  *      Exception: precedence violation. We do not implement it in any case.
579  */
580
581 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
582 {
583         const struct tcphdr *th = tcp_hdr(skb);
584         struct {
585                 struct tcphdr th;
586 #ifdef CONFIG_TCP_MD5SIG
587                 __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
588 #endif
589         } rep;
590         struct ip_reply_arg arg;
591 #ifdef CONFIG_TCP_MD5SIG
592         struct tcp_md5sig_key *key = NULL;
593         const __u8 *hash_location = NULL;
594         unsigned char newhash[16];
595         int genhash;
596         struct sock *sk1 = NULL;
597 #endif
598         struct net *net;
599
600         /* Never send a reset in response to a reset. */
601         if (th->rst)
602                 return;
603
604         /* If sk is not NULL, it means we did a successful lookup and the
605          * incoming route had to be correct. prequeue might have dropped our dst.
606          */
607         if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
608                 return;
609
610         /* Swap the send and the receive. */
611         memset(&rep, 0, sizeof(rep));
612         rep.th.dest   = th->source;
613         rep.th.source = th->dest;
614         rep.th.doff   = sizeof(struct tcphdr) / 4;
615         rep.th.rst    = 1;
616
617         if (th->ack) {
618                 rep.th.seq = th->ack_seq;
619         } else {
620                 rep.th.ack = 1;
621                 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
622                                        skb->len - (th->doff << 2));
623         }
624
625         memset(&arg, 0, sizeof(arg));
626         arg.iov[0].iov_base = (unsigned char *)&rep;
627         arg.iov[0].iov_len  = sizeof(rep.th);
628
629         net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
630 #ifdef CONFIG_TCP_MD5SIG
631         rcu_read_lock();
632         hash_location = tcp_parse_md5sig_option(th);
633         if (sk && sk_fullsock(sk)) {
634                 key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
635                                         &ip_hdr(skb)->saddr, AF_INET);
636         } else if (hash_location) {
637                 /*
638                  * The active side is lost. Try to find the listening socket through
639                  * the source port, and then find the md5 key through that socket.
640                  * We are not losing security here:
641                  * the incoming packet is checked against the md5 hash of the key we
642                  * find, and no RST is generated if the md5 hash doesn't match.
643                  */
644                 sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
645                                              ip_hdr(skb)->saddr,
646                                              th->source, ip_hdr(skb)->daddr,
647                                              ntohs(th->source), inet_iif(skb));
648                 /* don't send rst if it can't find key */
649                 if (!sk1)
650                         goto out;
651
652                 key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
653                                         &ip_hdr(skb)->saddr, AF_INET);
654                 if (!key)
655                         goto out;
656
657
658                 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
659                 if (genhash || memcmp(hash_location, newhash, 16) != 0)
660                         goto out;
661
662         }
663
664         if (key) {
665                 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
666                                    (TCPOPT_NOP << 16) |
667                                    (TCPOPT_MD5SIG << 8) |
668                                    TCPOLEN_MD5SIG);
669                 /* Update length and the length the header thinks exists */
670                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
671                 rep.th.doff = arg.iov[0].iov_len / 4;
672
673                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
674                                      key, ip_hdr(skb)->saddr,
675                                      ip_hdr(skb)->daddr, &rep.th);
676         }
677 #endif
678         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
679                                       ip_hdr(skb)->saddr, /* XXX */
680                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
681         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
682         arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
683
684         /* When the socket is gone, all binding information is lost.
685          * Routing might fail in this case. No choice here: if we force the
686          * input interface, we will misroute in the case of an asymmetric route.
687          */
688         if (sk)
689                 arg.bound_dev_if = sk->sk_bound_dev_if;
690
691         BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
692                      offsetof(struct inet_timewait_sock, tw_bound_dev_if));
693
694         arg.tos = ip_hdr(skb)->tos;
695         local_bh_disable();
696         ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
697                               skb, &TCP_SKB_CB(skb)->header.h4.opt,
698                               ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
699                               &arg, arg.iov[0].iov_len);
700
701         __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
702         __TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
703         local_bh_enable();
704
705 #ifdef CONFIG_TCP_MD5SIG
706 out:
707         rcu_read_unlock();
708 #endif
709 }
710
711 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
712    outside socket context, is certainly ugly. What can I do?
713  */
714
715 static void tcp_v4_send_ack(struct net *net,
716                             struct sk_buff *skb, u32 seq, u32 ack,
717                             u32 win, u32 tsval, u32 tsecr, int oif,
718                             struct tcp_md5sig_key *key,
719                             int reply_flags, u8 tos)
720 {
721         const struct tcphdr *th = tcp_hdr(skb);
722         struct {
723                 struct tcphdr th;
724                 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
725 #ifdef CONFIG_TCP_MD5SIG
726                            + (TCPOLEN_MD5SIG_ALIGNED >> 2)
727 #endif
728                         ];
729         } rep;
730         struct ip_reply_arg arg;
731
732         memset(&rep.th, 0, sizeof(struct tcphdr));
733         memset(&arg, 0, sizeof(arg));
734
735         arg.iov[0].iov_base = (unsigned char *)&rep;
736         arg.iov[0].iov_len  = sizeof(rep.th);
737         if (tsecr) {
738                 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
739                                    (TCPOPT_TIMESTAMP << 8) |
740                                    TCPOLEN_TIMESTAMP);
741                 rep.opt[1] = htonl(tsval);
742                 rep.opt[2] = htonl(tsecr);
743                 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
744         }
745
746         /* Swap the send and the receive. */
747         rep.th.dest    = th->source;
748         rep.th.source  = th->dest;
749         rep.th.doff    = arg.iov[0].iov_len / 4;
750         rep.th.seq     = htonl(seq);
751         rep.th.ack_seq = htonl(ack);
752         rep.th.ack     = 1;
753         rep.th.window  = htons(win);
754
755 #ifdef CONFIG_TCP_MD5SIG
756         if (key) {
757                 int offset = (tsecr) ? 3 : 0;
758
759                 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
760                                           (TCPOPT_NOP << 16) |
761                                           (TCPOPT_MD5SIG << 8) |
762                                           TCPOLEN_MD5SIG);
763                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
764                 rep.th.doff = arg.iov[0].iov_len/4;
765
766                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
767                                     key, ip_hdr(skb)->saddr,
768                                     ip_hdr(skb)->daddr, &rep.th);
769         }
770 #endif
771         arg.flags = reply_flags;
772         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
773                                       ip_hdr(skb)->saddr, /* XXX */
774                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
775         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
776         if (oif)
777                 arg.bound_dev_if = oif;
778         arg.tos = tos;
779         local_bh_disable();
780         ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
781                               skb, &TCP_SKB_CB(skb)->header.h4.opt,
782                               ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
783                               &arg, arg.iov[0].iov_len);
784
785         __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
786         local_bh_enable();
787 }
788
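/* Send an ACK on behalf of a TIME-WAIT socket, using the sequence and
 * timestamp state saved in the timewait bucket.
 */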
789 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
790 {
791         struct inet_timewait_sock *tw = inet_twsk(sk);
792         struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
793
794         tcp_v4_send_ack(sock_net(sk), skb,
795                         tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
796                         tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
797                         tcp_time_stamp + tcptw->tw_ts_offset,
798                         tcptw->tw_ts_recent,
799                         tw->tw_bound_dev_if,
800                         tcp_twsk_md5_key(tcptw),
801                         tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
802                         tw->tw_tos
803                         );
804
805         inet_twsk_put(tw);
806 }
807
808 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
809                                   struct request_sock *req)
810 {
811         /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
812          * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
813          */
814         u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
815                                              tcp_sk(sk)->snd_nxt;
816
817         tcp_v4_send_ack(sock_net(sk), skb, seq,
818                         tcp_rsk(req)->rcv_nxt, req->rsk_rcv_wnd,
819                         tcp_time_stamp,
820                         req->ts_recent,
821                         0,
822                         tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr,
823                                           AF_INET),
824                         inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
825                         ip_hdr(skb)->tos);
826 }
827
828 /*
829  *      Send a SYN-ACK after having received a SYN.
830  *      This still operates on a request_sock only, not on a big
831  *      socket.
832  */
833 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
834                               struct flowi *fl,
835                               struct request_sock *req,
836                               struct tcp_fastopen_cookie *foc,
837                               enum tcp_synack_type synack_type)
838 {
839         const struct inet_request_sock *ireq = inet_rsk(req);
840         struct flowi4 fl4;
841         int err = -1;
842         struct sk_buff *skb;
843
844         /* First, grab a route. */
845         if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
846                 return -1;
847
848         skb = tcp_make_synack(sk, dst, req, foc, synack_type);
849
850         if (skb) {
851                 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
852
853                 err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
854                                             ireq->ir_rmt_addr,
855                                             ireq->opt);
856                 err = net_xmit_eval(err);
857         }
858
859         return err;
860 }
861
862 /*
863  *      IPv4 request_sock destructor.
864  */
865 static void tcp_v4_reqsk_destructor(struct request_sock *req)
866 {
867         kfree(inet_rsk(req)->opt);
868 }
869
870 #ifdef CONFIG_TCP_MD5SIG
871 /*
872  * RFC2385 MD5 checksumming requires a mapping of
873  * IP address->MD5 Key.
874  * We need to maintain these in the sk structure.
875  */
876
877 /* Find the Key structure for an address.  */
878 struct tcp_md5sig_key *tcp_md5_do_lookup(const struct sock *sk,
879                                          const union tcp_md5_addr *addr,
880                                          int family)
881 {
882         const struct tcp_sock *tp = tcp_sk(sk);
883         struct tcp_md5sig_key *key;
884         unsigned int size = sizeof(struct in_addr);
885         const struct tcp_md5sig_info *md5sig;
886
887         /* caller either holds rcu_read_lock() or socket lock */
888         md5sig = rcu_dereference_check(tp->md5sig_info,
889                                        lockdep_sock_is_held(sk));
890         if (!md5sig)
891                 return NULL;
892 #if IS_ENABLED(CONFIG_IPV6)
893         if (family == AF_INET6)
894                 size = sizeof(struct in6_addr);
895 #endif
896         hlist_for_each_entry_rcu(key, &md5sig->head, node) {
897                 if (key->family != family)
898                         continue;
899                 if (!memcmp(&key->addr, addr, size))
900                         return key;
901         }
902         return NULL;
903 }
904 EXPORT_SYMBOL(tcp_md5_do_lookup);
905
906 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
907                                          const struct sock *addr_sk)
908 {
909         const union tcp_md5_addr *addr;
910
911         addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
912         return tcp_md5_do_lookup(sk, addr, AF_INET);
913 }
914 EXPORT_SYMBOL(tcp_v4_md5_lookup);
915
916 /* This can be called on a newly created socket, from other files */
917 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
918                    int family, const u8 *newkey, u8 newkeylen, gfp_t gfp)
919 {
920         /* Add Key to the list */
921         struct tcp_md5sig_key *key;
922         struct tcp_sock *tp = tcp_sk(sk);
923         struct tcp_md5sig_info *md5sig;
924
925         key = tcp_md5_do_lookup(sk, addr, family);
926         if (key) {
927                 /* Pre-existing entry - just update that one. */
928                 memcpy(key->key, newkey, newkeylen);
929                 key->keylen = newkeylen;
930                 return 0;
931         }
932
933         md5sig = rcu_dereference_protected(tp->md5sig_info,
934                                            lockdep_sock_is_held(sk));
935         if (!md5sig) {
936                 md5sig = kmalloc(sizeof(*md5sig), gfp);
937                 if (!md5sig)
938                         return -ENOMEM;
939
940                 sk_nocaps_add(sk, NETIF_F_GSO_MASK);
941                 INIT_HLIST_HEAD(&md5sig->head);
942                 rcu_assign_pointer(tp->md5sig_info, md5sig);
943         }
944
945         key = sock_kmalloc(sk, sizeof(*key), gfp);
946         if (!key)
947                 return -ENOMEM;
948         if (!tcp_alloc_md5sig_pool()) {
949                 sock_kfree_s(sk, key, sizeof(*key));
950                 return -ENOMEM;
951         }
952
953         memcpy(key->key, newkey, newkeylen);
954         key->keylen = newkeylen;
955         key->family = family;
956         memcpy(&key->addr, addr,
957                (family == AF_INET6) ? sizeof(struct in6_addr) :
958                                       sizeof(struct in_addr));
959         hlist_add_head_rcu(&key->node, &md5sig->head);
960         return 0;
961 }
962 EXPORT_SYMBOL(tcp_md5_do_add);
963
964 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family)
965 {
966         struct tcp_md5sig_key *key;
967
968         key = tcp_md5_do_lookup(sk, addr, family);
969         if (!key)
970                 return -ENOENT;
971         hlist_del_rcu(&key->node);
972         atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
973         kfree_rcu(key, rcu);
974         return 0;
975 }
976 EXPORT_SYMBOL(tcp_md5_do_del);
977
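/* Free every MD5 key attached to this socket. */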
978 static void tcp_clear_md5_list(struct sock *sk)
979 {
980         struct tcp_sock *tp = tcp_sk(sk);
981         struct tcp_md5sig_key *key;
982         struct hlist_node *n;
983         struct tcp_md5sig_info *md5sig;
984
985         md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
986
987         hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
988                 hlist_del_rcu(&key->node);
989                 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
990                 kfree_rcu(key, rcu);
991         }
992 }
993
994 static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
995                                  int optlen)
996 {
997         struct tcp_md5sig cmd;
998         struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
999
1000         if (optlen < sizeof(cmd))
1001                 return -EINVAL;
1002
1003         if (copy_from_user(&cmd, optval, sizeof(cmd)))
1004                 return -EFAULT;
1005
1006         if (sin->sin_family != AF_INET)
1007                 return -EINVAL;
1008
1009         if (!cmd.tcpm_keylen)
1010                 return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1011                                       AF_INET);
1012
1013         if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1014                 return -EINVAL;
1015
1016         return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1017                               AF_INET, cmd.tcpm_key, cmd.tcpm_keylen,
1018                               GFP_KERNEL);
1019 }
1020
1021 static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
1022                                         __be32 daddr, __be32 saddr, int nbytes)
1023 {
1024         struct tcp4_pseudohdr *bp;
1025         struct scatterlist sg;
1026
1027         bp = &hp->md5_blk.ip4;
1028
1029         /*
1030          * 1. the TCP pseudo-header (in the order: source IP address,
1031          * destination IP address, zero-padded protocol number, and
1032          * segment length)
1033          */
1034         bp->saddr = saddr;
1035         bp->daddr = daddr;
1036         bp->pad = 0;
1037         bp->protocol = IPPROTO_TCP;
1038         bp->len = cpu_to_be16(nbytes);
1039
1040         sg_init_one(&sg, bp, sizeof(*bp));
1041         ahash_request_set_crypt(hp->md5_req, &sg, NULL, sizeof(*bp));
1042         return crypto_ahash_update(hp->md5_req);
1043 }
1044
1045 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1046                                __be32 daddr, __be32 saddr, const struct tcphdr *th)
1047 {
1048         struct tcp_md5sig_pool *hp;
1049         struct ahash_request *req;
1050
1051         hp = tcp_get_md5sig_pool();
1052         if (!hp)
1053                 goto clear_hash_noput;
1054         req = hp->md5_req;
1055
1056         if (crypto_ahash_init(req))
1057                 goto clear_hash;
1058         if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2))
1059                 goto clear_hash;
1060         if (tcp_md5_hash_header(hp, th))
1061                 goto clear_hash;
1062         if (tcp_md5_hash_key(hp, key))
1063                 goto clear_hash;
1064         ahash_request_set_crypt(req, NULL, md5_hash, 0);
1065         if (crypto_ahash_final(req))
1066                 goto clear_hash;
1067
1068         tcp_put_md5sig_pool();
1069         return 0;
1070
1071 clear_hash:
1072         tcp_put_md5sig_pool();
1073 clear_hash_noput:
1074         memset(md5_hash, 0, 16);
1075         return 1;
1076 }
1077
1078 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1079                         const struct sock *sk,
1080                         const struct sk_buff *skb)
1081 {
1082         struct tcp_md5sig_pool *hp;
1083         struct ahash_request *req;
1084         const struct tcphdr *th = tcp_hdr(skb);
1085         __be32 saddr, daddr;
1086
1087         if (sk) { /* valid for established/request sockets */
1088                 saddr = sk->sk_rcv_saddr;
1089                 daddr = sk->sk_daddr;
1090         } else {
1091                 const struct iphdr *iph = ip_hdr(skb);
1092                 saddr = iph->saddr;
1093                 daddr = iph->daddr;
1094         }
1095
1096         hp = tcp_get_md5sig_pool();
1097         if (!hp)
1098                 goto clear_hash_noput;
1099         req = hp->md5_req;
1100
1101         if (crypto_ahash_init(req))
1102                 goto clear_hash;
1103
1104         if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, skb->len))
1105                 goto clear_hash;
1106         if (tcp_md5_hash_header(hp, th))
1107                 goto clear_hash;
1108         if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1109                 goto clear_hash;
1110         if (tcp_md5_hash_key(hp, key))
1111                 goto clear_hash;
1112         ahash_request_set_crypt(req, NULL, md5_hash, 0);
1113         if (crypto_ahash_final(req))
1114                 goto clear_hash;
1115
1116         tcp_put_md5sig_pool();
1117         return 0;
1118
1119 clear_hash:
1120         tcp_put_md5sig_pool();
1121 clear_hash_noput:
1122         memset(md5_hash, 0, 16);
1123         return 1;
1124 }
1125 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1126
1127 #endif
1128
1129 /* Called with rcu_read_lock() */
1130 static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
1131                                     const struct sk_buff *skb)
1132 {
1133 #ifdef CONFIG_TCP_MD5SIG
1134         /*
1135          * This gets called for each TCP segment that arrives
1136          * so we want to be efficient.
1137          * We have 3 drop cases:
1138          * o No MD5 hash and one expected.
1139          * o MD5 hash and we're not expecting one.
1140          * o MD5 hash and it's wrong.
1141          */
1142         const __u8 *hash_location = NULL;
1143         struct tcp_md5sig_key *hash_expected;
1144         const struct iphdr *iph = ip_hdr(skb);
1145         const struct tcphdr *th = tcp_hdr(skb);
1146         int genhash;
1147         unsigned char newhash[16];
1148
1149         hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
1150                                           AF_INET);
1151         hash_location = tcp_parse_md5sig_option(th);
1152
1153         /* We've parsed the options - do we have a hash? */
1154         if (!hash_expected && !hash_location)
1155                 return false;
1156
1157         if (hash_expected && !hash_location) {
1158                 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1159                 return true;
1160         }
1161
1162         if (!hash_expected && hash_location) {
1163                 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1164                 return true;
1165         }
1166
1167         /* Okay, so this is hash_expected and hash_location -
1168          * so we need to calculate the checksum.
1169          */
1170         genhash = tcp_v4_md5_hash_skb(newhash,
1171                                       hash_expected,
1172                                       NULL, skb);
1173
1174         if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1175                 net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1176                                      &iph->saddr, ntohs(th->source),
1177                                      &iph->daddr, ntohs(th->dest),
1178                                      genhash ? " tcp_v4_calc_md5_hash failed"
1179                                      : "");
1180                 return true;
1181         }
1182         return false;
1183 #endif
1184         return false;
1185 }
1186
1187 static void tcp_v4_init_req(struct request_sock *req,
1188                             const struct sock *sk_listener,
1189                             struct sk_buff *skb)
1190 {
1191         struct inet_request_sock *ireq = inet_rsk(req);
1192
1193         sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1194         sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1195         ireq->no_srccheck = inet_sk(sk_listener)->transparent;
1196         ireq->opt = tcp_v4_save_options(skb);
1197 }
1198
1199 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1200                                           struct flowi *fl,
1201                                           const struct request_sock *req,
1202                                           bool *strict)
1203 {
1204         struct dst_entry *dst = inet_csk_route_req(sk, &fl->u.ip4, req);
1205
1206         if (strict) {
1207                 if (fl->u.ip4.daddr == inet_rsk(req)->ir_rmt_addr)
1208                         *strict = true;
1209                 else
1210                         *strict = false;
1211         }
1212
1213         return dst;
1214 }
1215
1216 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1217         .family         =       PF_INET,
1218         .obj_size       =       sizeof(struct tcp_request_sock),
1219         .rtx_syn_ack    =       tcp_rtx_synack,
1220         .send_ack       =       tcp_v4_reqsk_send_ack,
1221         .destructor     =       tcp_v4_reqsk_destructor,
1222         .send_reset     =       tcp_v4_send_reset,
1223         .syn_ack_timeout =      tcp_syn_ack_timeout,
1224 };
1225
1226 static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1227         .mss_clamp      =       TCP_MSS_DEFAULT,
1228 #ifdef CONFIG_TCP_MD5SIG
1229         .req_md5_lookup =       tcp_v4_md5_lookup,
1230         .calc_md5_hash  =       tcp_v4_md5_hash_skb,
1231 #endif
1232         .init_req       =       tcp_v4_init_req,
1233 #ifdef CONFIG_SYN_COOKIES
1234         .cookie_init_seq =      cookie_v4_init_sequence,
1235 #endif
1236         .route_req      =       tcp_v4_route_req,
1237         .init_seq       =       tcp_v4_init_sequence,
1238         .send_synack    =       tcp_v4_send_synack,
1239 };
1240
1241 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1242 {
1243         /* Never answer SYNs sent to broadcast or multicast addresses */
1244         if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1245                 goto drop;
1246
1247         return tcp_conn_request(&tcp_request_sock_ops,
1248                                 &tcp_request_sock_ipv4_ops, sk, skb);
1249
1250 drop:
1251         tcp_listendrop(sk);
1252         return 0;
1253 }
1254 EXPORT_SYMBOL(tcp_v4_conn_request);
1255
1256
1257 /*
1258  * The three way handshake has completed - we got a valid synack -
1259  * now create the new socket.
1260  */
1261 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1262                                   struct request_sock *req,
1263                                   struct dst_entry *dst,
1264                                   struct request_sock *req_unhash,
1265                                   bool *own_req)
1266 {
1267         struct inet_request_sock *ireq;
1268         struct inet_sock *newinet;
1269         struct tcp_sock *newtp;
1270         struct sock *newsk;
1271 #ifdef CONFIG_TCP_MD5SIG
1272         struct tcp_md5sig_key *key;
1273 #endif
1274         struct ip_options_rcu *inet_opt;
1275
1276         if (sk_acceptq_is_full(sk))
1277                 goto exit_overflow;
1278
1279         newsk = tcp_create_openreq_child(sk, req, skb);
1280         if (!newsk)
1281                 goto exit_nonewsk;
1282
1283         newsk->sk_gso_type = SKB_GSO_TCPV4;
1284         inet_sk_rx_dst_set(newsk, skb);
1285
1286         newtp                 = tcp_sk(newsk);
1287         newinet               = inet_sk(newsk);
1288         ireq                  = inet_rsk(req);
1289         sk_daddr_set(newsk, ireq->ir_rmt_addr);
1290         sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1291         newsk->sk_bound_dev_if = ireq->ir_iif;
1292         newinet->inet_saddr           = ireq->ir_loc_addr;
1293         inet_opt              = ireq->opt;
1294         rcu_assign_pointer(newinet->inet_opt, inet_opt);
1295         ireq->opt             = NULL;
1296         newinet->mc_index     = inet_iif(skb);
1297         newinet->mc_ttl       = ip_hdr(skb)->ttl;
1298         newinet->rcv_tos      = ip_hdr(skb)->tos;
1299         inet_csk(newsk)->icsk_ext_hdr_len = 0;
1300         if (inet_opt)
1301                 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1302         newinet->inet_id = newtp->write_seq ^ jiffies;
1303
1304         if (!dst) {
1305                 dst = inet_csk_route_child_sock(sk, newsk, req);
1306                 if (!dst)
1307                         goto put_and_exit;
1308         } else {
1309                 /* syncookie case : see end of cookie_v4_check() */
1310         }
1311         sk_setup_caps(newsk, dst);
1312
1313         tcp_ca_openreq_child(newsk, dst);
1314
1315         tcp_sync_mss(newsk, dst_mtu(dst));
1316         newtp->advmss = dst_metric_advmss(dst);
1317         if (tcp_sk(sk)->rx_opt.user_mss &&
1318             tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
1319                 newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
1320
1321         tcp_initialize_rcv_mss(newsk);
1322
1323 #ifdef CONFIG_TCP_MD5SIG
1324         /* Copy over the MD5 key from the original socket */
1325         key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
1326                                 AF_INET);
1327         if (key) {
1328                 /*
1329                  * We're using one, so create a matching key
1330                  * on the newsk structure. If we fail to get
1331                  * memory, then we end up not copying the key
1332                  * across. Shucks.
1333                  */
1334                 tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
1335                                AF_INET, key->key, key->keylen, GFP_ATOMIC);
1336                 sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1337         }
1338 #endif
1339
1340         if (__inet_inherit_port(sk, newsk) < 0)
1341                 goto put_and_exit;
1342         *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
1343         if (*own_req)
1344                 tcp_move_syn(newtp, req);
1345
1346         return newsk;
1347
1348 exit_overflow:
1349         NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1350 exit_nonewsk:
1351         dst_release(dst);
1352 exit:
1353         tcp_listendrop(sk);
1354         return NULL;
1355 put_and_exit:
1356         inet_csk_prepare_forced_close(newsk);
1357         tcp_done(newsk);
1358         goto exit;
1359 }
1360 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1361
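/* When syncookies are enabled, a non-SYN segment arriving on a listener may
 * complete a cookie-validated handshake; cookie_v4_check() handles that case.
 */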
1362 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1363 {
1364 #ifdef CONFIG_SYN_COOKIES
1365         const struct tcphdr *th = tcp_hdr(skb);
1366
1367         if (!th->syn)
1368                 sk = cookie_v4_check(sk, skb);
1369 #endif
1370         return sk;
1371 }
1372
1373 /* The socket must have its spinlock held when we get
1374  * here, unless it is a TCP_LISTEN socket.
1375  *
1376  * We have a potential double-lock case here, so even when
1377  * doing backlog processing we use the BH locking scheme.
1378  * This is because we cannot sleep with the original spinlock
1379  * held.
1380  */
1381 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1382 {
1383         struct sock *rsk;
1384
1385         if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1386                 struct dst_entry *dst = sk->sk_rx_dst;
1387
1388                 sock_rps_save_rxhash(sk, skb);
1389                 sk_mark_napi_id(sk, skb);
1390                 if (dst) {
1391                         if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1392                             !dst->ops->check(dst, 0)) {
1393                                 dst_release(dst);
1394                                 sk->sk_rx_dst = NULL;
1395                         }
1396                 }
1397                 tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len);
1398                 return 0;
1399         }
1400
1401         if (tcp_checksum_complete(skb))
1402                 goto csum_err;
1403
1404         if (sk->sk_state == TCP_LISTEN) {
1405                 struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1406
1407                 if (!nsk)
1408                         goto discard;
1409                 if (nsk != sk) {
1410                         sock_rps_save_rxhash(nsk, skb);
1411                         sk_mark_napi_id(nsk, skb);
1412                         if (tcp_child_process(sk, nsk, skb)) {
1413                                 rsk = nsk;
1414                                 goto reset;
1415                         }
1416                         return 0;
1417                 }
1418         } else
1419                 sock_rps_save_rxhash(sk, skb);
1420
1421         if (tcp_rcv_state_process(sk, skb)) {
1422                 rsk = sk;
1423                 goto reset;
1424         }
1425         return 0;
1426
1427 reset:
1428         tcp_v4_send_reset(rsk, skb);
1429 discard:
1430         kfree_skb(skb);
1431         /* Be careful here. If this function gets more complicated and
1432          * gcc suffers from register pressure on the x86, sk (in %ebx)
1433          * might be destroyed here. This current version compiles correctly,
1434          * but you have been warned.
1435          */
1436         return 0;
1437
1438 csum_err:
1439         TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1440         TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1441         goto discard;
1442 }
1443 EXPORT_SYMBOL(tcp_v4_do_rcv);
1444
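/* Early demux: called from the IP receive path before any routing decision.
 * If the 4-tuple matches an established socket we attach that socket (and,
 * when still valid, its cached input route) to the skb, so the later
 * tcp_v4_rcv() lookup and the route lookup can be skipped.
 */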
1445 void tcp_v4_early_demux(struct sk_buff *skb)
1446 {
1447         const struct iphdr *iph;
1448         const struct tcphdr *th;
1449         struct sock *sk;
1450
1451         if (skb->pkt_type != PACKET_HOST)
1452                 return;
1453
1454         if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1455                 return;
1456
1457         iph = ip_hdr(skb);
1458         th = tcp_hdr(skb);
1459
1460         if (th->doff < sizeof(struct tcphdr) / 4)
1461                 return;
1462
1463         sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1464                                        iph->saddr, th->source,
1465                                        iph->daddr, ntohs(th->dest),
1466                                        skb->skb_iif);
1467         if (sk) {
1468                 skb->sk = sk;
1469                 skb->destructor = sock_edemux;
1470                 if (sk_fullsock(sk)) {
1471                         struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);
1472
1473                         if (dst)
1474                                 dst = dst_check(dst, 0);
1475                         if (dst &&
1476                             inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
1477                                 skb_dst_set_noref(skb, dst);
1478                 }
1479         }
1480 }
1481
1482 /* Packet is added to VJ-style prequeue for processing in process
1483  * context, if a reader task is waiting. Apparently, this exciting
1484  * idea (VJ's mail "Re: query about TCP header on tcp-ip" of 07 Sep 93)
1485  * failed somewhere. Latency? Burstiness? Well, at least now we will
1486  * see why it failed. 8)8)                               --ANK
1487  *
1488  */
1489 bool tcp_prequeue(struct sock *sk, struct sk_buff *skb)
1490 {
1491         struct tcp_sock *tp = tcp_sk(sk);
1492
1493         if (sysctl_tcp_low_latency || !tp->ucopy.task)
1494                 return false;
1495
1496         if (skb->len <= tcp_hdrlen(skb) &&
1497             skb_queue_len(&tp->ucopy.prequeue) == 0)
1498                 return false;
1499
1500         /* Before escaping RCU protected region, we need to take care of skb
1501          * dst. Prequeue is only enabled for established sockets.
1502          * For such sockets, we might need the skb dst only to set sk->sk_rx_dst.
1503          * Instead of doing a full sk_rx_dst validity check here, let's perform
1504          * an optimistic check.
1505          */
1506         if (likely(sk->sk_rx_dst))
1507                 skb_dst_drop(skb);
1508         else
1509                 skb_dst_force_safe(skb);
1510
1511         __skb_queue_tail(&tp->ucopy.prequeue, skb);
1512         tp->ucopy.memory += skb->truesize;
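        /* If the prequeue has grown long or would overflow the receive buffer,
         * flush it right now from BH context through sk_backlog_rcv(); otherwise
         * wake the waiting reader on the first queued skb and make sure a
         * delayed ACK timer is running.
         */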
1513         if (skb_queue_len(&tp->ucopy.prequeue) >= 32 ||
1514             tp->ucopy.memory + atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf) {
1515                 struct sk_buff *skb1;
1516
1517                 BUG_ON(sock_owned_by_user(sk));
1518                 __NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPPREQUEUEDROPPED,
1519                                 skb_queue_len(&tp->ucopy.prequeue));
1520
1521                 while ((skb1 = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
1522                         sk_backlog_rcv(sk, skb1);
1523
1524                 tp->ucopy.memory = 0;
1525         } else if (skb_queue_len(&tp->ucopy.prequeue) == 1) {
1526                 wake_up_interruptible_sync_poll(sk_sleep(sk),
1527                                            POLLIN | POLLRDNORM | POLLRDBAND);
1528                 if (!inet_csk_ack_scheduled(sk))
1529                         inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
1530                                                   (3 * tcp_rto_min(sk)) / 4,
1531                                                   TCP_RTO_MAX);
1532         }
1533         return true;
1534 }
1535 EXPORT_SYMBOL(tcp_prequeue);
1536
1537 /*
1538  *      From tcp_input.c
1539  */
1540
1541 int tcp_v4_rcv(struct sk_buff *skb)
1542 {
1543         struct net *net = dev_net(skb->dev);
1544         const struct iphdr *iph;
1545         const struct tcphdr *th;
1546         bool refcounted;
1547         struct sock *sk;
1548         int ret;
1549
1550         if (skb->pkt_type != PACKET_HOST)
1551                 goto discard_it;
1552
1553         /* Count it even if it's bad */
1554         __TCP_INC_STATS(net, TCP_MIB_INSEGS);
1555
1556         if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1557                 goto discard_it;
1558
1559         th = (const struct tcphdr *)skb->data;
1560
1561         if (unlikely(th->doff < sizeof(struct tcphdr) / 4))
1562                 goto bad_packet;
1563         if (!pskb_may_pull(skb, th->doff * 4))
1564                 goto discard_it;
1565
1566         /* An explanation is required here, I think.
1567          * Packet length and doff are validated by header prediction,
1568          * provided the case of th->doff == 0 is eliminated.
1569          * So, we defer the checks. */
1570
1571         if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1572                 goto csum_error;
1573
1574         th = (const struct tcphdr *)skb->data;
1575         iph = ip_hdr(skb);
1576         /* This is tricky: we move IPCB to its correct location inside TCP_SKB_CB().
1577          * barrier() makes sure the compiler won't play fool^Waliasing games.
1578          */
1579         memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1580                 sizeof(struct inet_skb_parm));
1581         barrier();
1582
1583         TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1584         TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1585                                     skb->len - th->doff * 4);
1586         TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1587         TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1588         TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1589         TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1590         TCP_SKB_CB(skb)->sacked  = 0;
1591
1592 lookup:
1593         sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
1594                                th->dest, &refcounted);
1595         if (!sk)
1596                 goto no_tcp_socket;
1597
1598 process:
1599         if (sk->sk_state == TCP_TIME_WAIT)
1600                 goto do_time_wait;
1601
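        /* TCP_NEW_SYN_RECV means the lookup returned a request socket created
         * by an earlier SYN.  The packet is then handled on behalf of the
         * listener (req->rsk_listener); tcp_check_req() either completes the
         * three-way handshake and gives us a child socket, or tells us to drop.
         */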
1602         if (sk->sk_state == TCP_NEW_SYN_RECV) {
1603                 struct request_sock *req = inet_reqsk(sk);
1604                 struct sock *nsk;
1605
1606                 sk = req->rsk_listener;
1607                 if (unlikely(tcp_v4_inbound_md5_hash(sk, skb))) {
1608                         reqsk_put(req);
1609                         goto discard_it;
1610                 }
1611                 if (unlikely(sk->sk_state != TCP_LISTEN)) {
1612                         inet_csk_reqsk_queue_drop_and_put(sk, req);
1613                         goto lookup;
1614                 }
1615                 /* We own a reference on the listener; increase it again
1616                  * as we might lose it too soon.
1617                  */
1618                 sock_hold(sk);
1619                 refcounted = true;
1620                 nsk = tcp_check_req(sk, skb, req, false);
1621                 if (!nsk) {
1622                         reqsk_put(req);
1623                         goto discard_and_relse;
1624                 }
1625                 if (nsk == sk) {
1626                         reqsk_put(req);
1627                 } else if (tcp_child_process(sk, nsk, skb)) {
1628                         tcp_v4_send_reset(nsk, skb);
1629                         goto discard_and_relse;
1630                 } else {
1631                         sock_put(sk);
1632                         return 0;
1633                 }
1634         }
1635         if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1636                 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
1637                 goto discard_and_relse;
1638         }
1639
1640         if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1641                 goto discard_and_relse;
1642
1643         if (tcp_v4_inbound_md5_hash(sk, skb))
1644                 goto discard_and_relse;
1645
1646         nf_reset(skb);
1647
1648         if (sk_filter(sk, skb))
1649                 goto discard_and_relse;
1650
1651         skb->dev = NULL;
1652
1653         if (sk->sk_state == TCP_LISTEN) {
1654                 ret = tcp_v4_do_rcv(sk, skb);
1655                 goto put_and_return;
1656         }
1657
1658         sk_incoming_cpu_update(sk);
1659
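        /* Non-listening socket: take the BH lock.  If no user context owns the
         * socket we either prequeue the skb for the reader or process it here;
         * if the socket is owned, the skb goes to the backlog and is dropped
         * when the backlog would exceed rcvbuf + sndbuf.
         */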
1660         bh_lock_sock_nested(sk);
1661         tcp_segs_in(tcp_sk(sk), skb);
1662         ret = 0;
1663         if (!sock_owned_by_user(sk)) {
1664                 if (!tcp_prequeue(sk, skb))
1665                         ret = tcp_v4_do_rcv(sk, skb);
1666         } else if (unlikely(sk_add_backlog(sk, skb,
1667                                            sk->sk_rcvbuf + sk->sk_sndbuf))) {
1668                 bh_unlock_sock(sk);
1669                 __NET_INC_STATS(net, LINUX_MIB_TCPBACKLOGDROP);
1670                 goto discard_and_relse;
1671         }
1672         bh_unlock_sock(sk);
1673
1674 put_and_return:
1675         if (refcounted)
1676                 sock_put(sk);
1677
1678         return ret;
1679
1680 no_tcp_socket:
1681         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1682                 goto discard_it;
1683
1684         if (tcp_checksum_complete(skb)) {
1685 csum_error:
1686                 __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
1687 bad_packet:
1688                 __TCP_INC_STATS(net, TCP_MIB_INERRS);
1689         } else {
1690                 tcp_v4_send_reset(NULL, skb);
1691         }
1692
1693 discard_it:
1694         /* Discard frame. */
1695         kfree_skb(skb);
1696         return 0;
1697
1698 discard_and_relse:
1699         sk_drops_add(sk, skb);
1700         if (refcounted)
1701                 sock_put(sk);
1702         goto discard_it;
1703
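/* Segments that hit a TIME_WAIT socket: tcp_timewait_state_process() decides
 * whether this is a valid new SYN reusing the old 4-tuple (hand it to a
 * listener and restart processing), whether the peer just needs another ACK,
 * whether a RST is in order, or whether the segment is silently dropped.
 */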
1704 do_time_wait:
1705         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1706                 inet_twsk_put(inet_twsk(sk));
1707                 goto discard_it;
1708         }
1709
1710         if (tcp_checksum_complete(skb)) {
1711                 inet_twsk_put(inet_twsk(sk));
1712                 goto csum_error;
1713         }
1714         switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1715         case TCP_TW_SYN: {
1716                 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
1717                                                         &tcp_hashinfo, skb,
1718                                                         __tcp_hdrlen(th),
1719                                                         iph->saddr, th->source,
1720                                                         iph->daddr, th->dest,
1721                                                         inet_iif(skb));
1722                 if (sk2) {
1723                         inet_twsk_deschedule_put(inet_twsk(sk));
1724                         sk = sk2;
1725                         refcounted = false;
1726                         goto process;
1727                 }
1728                 /* Fall through to ACK */
1729         }
1730         case TCP_TW_ACK:
1731                 tcp_v4_timewait_ack(sk, skb);
1732                 break;
1733         case TCP_TW_RST:
1734                 tcp_v4_send_reset(sk, skb);
1735                 inet_twsk_deschedule_put(inet_twsk(sk));
1736                 goto discard_it;
1737         case TCP_TW_SUCCESS:;
1738         }
1739         goto discard_it;
1740 }
1741
1742 static struct timewait_sock_ops tcp_timewait_sock_ops = {
1743         .twsk_obj_size  = sizeof(struct tcp_timewait_sock),
1744         .twsk_unique    = tcp_twsk_unique,
1745         .twsk_destructor= tcp_twsk_destructor,
1746 };
1747
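/* Cache the input route of this skb on the socket for the early demux fast
 * path, but only if a reference on the dst can still be taken safely.
 */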
1748 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
1749 {
1750         struct dst_entry *dst = skb_dst(skb);
1751
1752         if (dst && dst_hold_safe(dst)) {
1753                 sk->sk_rx_dst = dst;
1754                 inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
1755         }
1756 }
1757 EXPORT_SYMBOL(inet_sk_rx_dst_set);
1758
1759 const struct inet_connection_sock_af_ops ipv4_specific = {
1760         .queue_xmit        = ip_queue_xmit,
1761         .send_check        = tcp_v4_send_check,
1762         .rebuild_header    = inet_sk_rebuild_header,
1763         .sk_rx_dst_set     = inet_sk_rx_dst_set,
1764         .conn_request      = tcp_v4_conn_request,
1765         .syn_recv_sock     = tcp_v4_syn_recv_sock,
1766         .net_header_len    = sizeof(struct iphdr),
1767         .setsockopt        = ip_setsockopt,
1768         .getsockopt        = ip_getsockopt,
1769         .addr2sockaddr     = inet_csk_addr2sockaddr,
1770         .sockaddr_len      = sizeof(struct sockaddr_in),
1771         .bind_conflict     = inet_csk_bind_conflict,
1772 #ifdef CONFIG_COMPAT
1773         .compat_setsockopt = compat_ip_setsockopt,
1774         .compat_getsockopt = compat_ip_getsockopt,
1775 #endif
1776         .mtu_reduced       = tcp_v4_mtu_reduced,
1777 };
1778 EXPORT_SYMBOL(ipv4_specific);
1779
1780 #ifdef CONFIG_TCP_MD5SIG
1781 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
1782         .md5_lookup             = tcp_v4_md5_lookup,
1783         .calc_md5_hash          = tcp_v4_md5_hash_skb,
1784         .md5_parse              = tcp_v4_parse_md5_keys,
1785 };
1786 #endif
1787
1788 /* NOTE: A lot of things are set to zero explicitly by the call to
1789  *       sk_alloc(), so they need not be done here.
1790  */
1791 static int tcp_v4_init_sock(struct sock *sk)
1792 {
1793         struct inet_connection_sock *icsk = inet_csk(sk);
1794
1795         tcp_init_sock(sk);
1796
1797         icsk->icsk_af_ops = &ipv4_specific;
1798
1799 #ifdef CONFIG_TCP_MD5SIG
1800         tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
1801 #endif
1802
1803         return 0;
1804 }
1805
1806 void tcp_v4_destroy_sock(struct sock *sk)
1807 {
1808         struct tcp_sock *tp = tcp_sk(sk);
1809
1810         tcp_clear_xmit_timers(sk);
1811
1812         tcp_cleanup_congestion_control(sk);
1813
1814         /* Clean up the write buffer. */
1815         tcp_write_queue_purge(sk);
1816
1817         /* Cleans up our, hopefully empty, out_of_order_queue. */
1818         __skb_queue_purge(&tp->out_of_order_queue);
1819
1820 #ifdef CONFIG_TCP_MD5SIG
1821         /* Clean up the MD5 key list, if any */
1822         if (tp->md5sig_info) {
1823                 tcp_clear_md5_list(sk);
1824                 kfree_rcu(tp->md5sig_info, rcu);
1825                 tp->md5sig_info = NULL;
1826         }
1827 #endif
1828
1829         /* Clean up the prequeue; it should really be empty by now. */
1830         __skb_queue_purge(&tp->ucopy.prequeue);
1831
1832         /* Clean up a referenced TCP bind bucket. */
1833         if (inet_csk(sk)->icsk_bind_hash)
1834                 inet_put_port(sk);
1835
1836         BUG_ON(tp->fastopen_rsk);
1837
1838         /* If socket is aborted during connect operation */
1839         tcp_free_fastopen_req(tp);
1840         tcp_saved_syn_free(tp);
1841
1842         local_bh_disable();
1843         sk_sockets_allocated_dec(sk);
1844         local_bh_enable();
1845
1846         if (mem_cgroup_sockets_enabled && sk->sk_memcg)
1847                 sock_release_memcg(sk);
1848 }
1849 EXPORT_SYMBOL(tcp_v4_destroy_sock);
1850
1851 #ifdef CONFIG_PROC_FS
1852 /* Proc filesystem TCP sock list dumping. */
1853
1854 /*
1855  * Get the next listener socket following cur.  If cur is NULL, get the first
1856  * socket starting from the bucket given in st->bucket; when st->bucket is zero the
1857  * very first socket in the hash table is returned.
1858  */
1859 static void *listening_get_next(struct seq_file *seq, void *cur)
1860 {
1861         struct tcp_iter_state *st = seq->private;
1862         struct net *net = seq_file_net(seq);
1863         struct inet_listen_hashbucket *ilb;
1864         struct inet_connection_sock *icsk;
1865         struct sock *sk = cur;
1866
1867         if (!sk) {
1868 get_head:
1869                 ilb = &tcp_hashinfo.listening_hash[st->bucket];
1870                 spin_lock_bh(&ilb->lock);
1871                 sk = sk_head(&ilb->head);
1872                 st->offset = 0;
1873                 goto get_sk;
1874         }
1875         ilb = &tcp_hashinfo.listening_hash[st->bucket];
1876         ++st->num;
1877         ++st->offset;
1878
1879         sk = sk_next(sk);
1880 get_sk:
1881         sk_for_each_from(sk) {
1882                 if (!net_eq(sock_net(sk), net))
1883                         continue;
1884                 if (sk->sk_family == st->family)
1885                         return sk;
1886                 icsk = inet_csk(sk);
1887         }
1888         spin_unlock_bh(&ilb->lock);
1889         st->offset = 0;
1890         if (++st->bucket < INET_LHTABLE_SIZE)
1891                 goto get_head;
1892         return NULL;
1893 }
1894
1895 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
1896 {
1897         struct tcp_iter_state *st = seq->private;
1898         void *rc;
1899
1900         st->bucket = 0;
1901         st->offset = 0;
1902         rc = listening_get_next(seq, NULL);
1903
1904         while (rc && *pos) {
1905                 rc = listening_get_next(seq, rc);
1906                 --*pos;
1907         }
1908         return rc;
1909 }
1910
1911 static inline bool empty_bucket(const struct tcp_iter_state *st)
1912 {
1913         return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
1914 }
1915
1916 /*
1917  * Get the first established socket, starting from the bucket given in st->bucket.
1918  * If st->bucket is zero, the very first socket in the hash is returned.
1919  */
1920 static void *established_get_first(struct seq_file *seq)
1921 {
1922         struct tcp_iter_state *st = seq->private;
1923         struct net *net = seq_file_net(seq);
1924         void *rc = NULL;
1925
1926         st->offset = 0;
1927         for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
1928                 struct sock *sk;
1929                 struct hlist_nulls_node *node;
1930                 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
1931
1932                 /* Lockless fast path for the common case of empty buckets */
1933                 if (empty_bucket(st))
1934                         continue;
1935
1936                 spin_lock_bh(lock);
1937                 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
1938                         if (sk->sk_family != st->family ||
1939                             !net_eq(sock_net(sk), net)) {
1940                                 continue;
1941                         }
1942                         rc = sk;
1943                         goto out;
1944                 }
1945                 spin_unlock_bh(lock);
1946         }
1947 out:
1948         return rc;
1949 }
1950
1951 static void *established_get_next(struct seq_file *seq, void *cur)
1952 {
1953         struct sock *sk = cur;
1954         struct hlist_nulls_node *node;
1955         struct tcp_iter_state *st = seq->private;
1956         struct net *net = seq_file_net(seq);
1957
1958         ++st->num;
1959         ++st->offset;
1960
1961         sk = sk_nulls_next(sk);
1962
1963         sk_nulls_for_each_from(sk, node) {
1964                 if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
1965                         return sk;
1966         }
1967
1968         spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
1969         ++st->bucket;
1970         return established_get_first(seq);
1971 }
1972
1973 static void *established_get_idx(struct seq_file *seq, loff_t pos)
1974 {
1975         struct tcp_iter_state *st = seq->private;
1976         void *rc;
1977
1978         st->bucket = 0;
1979         rc = established_get_first(seq);
1980
1981         while (rc && pos) {
1982                 rc = established_get_next(seq, rc);
1983                 --pos;
1984         }
1985         return rc;
1986 }
1987
1988 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
1989 {
1990         void *rc;
1991         struct tcp_iter_state *st = seq->private;
1992
1993         st->state = TCP_SEQ_STATE_LISTENING;
1994         rc        = listening_get_idx(seq, &pos);
1995
1996         if (!rc) {
1997                 st->state = TCP_SEQ_STATE_ESTABLISHED;
1998                 rc        = established_get_idx(seq, pos);
1999         }
2000
2001         return rc;
2002 }
2003
2004 static void *tcp_seek_last_pos(struct seq_file *seq)
2005 {
2006         struct tcp_iter_state *st = seq->private;
2007         int offset = st->offset;
2008         int orig_num = st->num;
2009         void *rc = NULL;
2010
2011         switch (st->state) {
2012         case TCP_SEQ_STATE_LISTENING:
2013                 if (st->bucket >= INET_LHTABLE_SIZE)
2014                         break;
2015                 st->state = TCP_SEQ_STATE_LISTENING;
2016                 rc = listening_get_next(seq, NULL);
2017                 while (offset-- && rc)
2018                         rc = listening_get_next(seq, rc);
2019                 if (rc)
2020                         break;
2021                 st->bucket = 0;
2022                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2023                 /* Fallthrough */
2024         case TCP_SEQ_STATE_ESTABLISHED:
2025                 if (st->bucket > tcp_hashinfo.ehash_mask)
2026                         break;
2027                 rc = established_get_first(seq);
2028                 while (offset-- && rc)
2029                         rc = established_get_next(seq, rc);
2030         }
2031
2032         st->num = orig_num;
2033
2034         return rc;
2035 }
2036
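/* seq_file iterator: tcp_seq_start()/next()/stop() walk first the listening
 * hash and then the established hash.  When a reader continues exactly where
 * the previous read ended (*pos == st->last_pos), tcp_seek_last_pos() resumes
 * from the remembered bucket and offset instead of rescanning from the start.
 */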
2037 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2038 {
2039         struct tcp_iter_state *st = seq->private;
2040         void *rc;
2041
2042         if (*pos && *pos == st->last_pos) {
2043                 rc = tcp_seek_last_pos(seq);
2044                 if (rc)
2045                         goto out;
2046         }
2047
2048         st->state = TCP_SEQ_STATE_LISTENING;
2049         st->num = 0;
2050         st->bucket = 0;
2051         st->offset = 0;
2052         rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2053
2054 out:
2055         st->last_pos = *pos;
2056         return rc;
2057 }
2058
2059 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2060 {
2061         struct tcp_iter_state *st = seq->private;
2062         void *rc = NULL;
2063
2064         if (v == SEQ_START_TOKEN) {
2065                 rc = tcp_get_idx(seq, 0);
2066                 goto out;
2067         }
2068
2069         switch (st->state) {
2070         case TCP_SEQ_STATE_LISTENING:
2071                 rc = listening_get_next(seq, v);
2072                 if (!rc) {
2073                         st->state = TCP_SEQ_STATE_ESTABLISHED;
2074                         st->bucket = 0;
2075                         st->offset = 0;
2076                         rc        = established_get_first(seq);
2077                 }
2078                 break;
2079         case TCP_SEQ_STATE_ESTABLISHED:
2080                 rc = established_get_next(seq, v);
2081                 break;
2082         }
2083 out:
2084         ++*pos;
2085         st->last_pos = *pos;
2086         return rc;
2087 }
2088
2089 static void tcp_seq_stop(struct seq_file *seq, void *v)
2090 {
2091         struct tcp_iter_state *st = seq->private;
2092
2093         switch (st->state) {
2094         case TCP_SEQ_STATE_LISTENING:
2095                 if (v != SEQ_START_TOKEN)
2096                         spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
2097                 break;
2098         case TCP_SEQ_STATE_ESTABLISHED:
2099                 if (v)
2100                         spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2101                 break;
2102         }
2103 }
2104
2105 int tcp_seq_open(struct inode *inode, struct file *file)
2106 {
2107         struct tcp_seq_afinfo *afinfo = PDE_DATA(inode);
2108         struct tcp_iter_state *s;
2109         int err;
2110
2111         err = seq_open_net(inode, file, &afinfo->seq_ops,
2112                           sizeof(struct tcp_iter_state));
2113         if (err < 0)
2114                 return err;
2115
2116         s = ((struct seq_file *)file->private_data)->private;
2117         s->family               = afinfo->family;
2118         s->last_pos             = 0;
2119         return 0;
2120 }
2121 EXPORT_SYMBOL(tcp_seq_open);
2122
2123 int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
2124 {
2125         int rc = 0;
2126         struct proc_dir_entry *p;
2127
2128         afinfo->seq_ops.start           = tcp_seq_start;
2129         afinfo->seq_ops.next            = tcp_seq_next;
2130         afinfo->seq_ops.stop            = tcp_seq_stop;
2131
2132         p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
2133                              afinfo->seq_fops, afinfo);
2134         if (!p)
2135                 rc = -ENOMEM;
2136         return rc;
2137 }
2138 EXPORT_SYMBOL(tcp_proc_register);
2139
2140 void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
2141 {
2142         remove_proc_entry(afinfo->name, net->proc_net);
2143 }
2144 EXPORT_SYMBOL(tcp_proc_unregister);
2145
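/* The three helpers below each format one line of /proc/net/tcp: one for a
 * request socket still in SYN_RECV, one for a full socket, and one for a
 * TIME_WAIT socket.  Addresses are printed in hex as the raw __be32 value,
 * ports after ntohs().
 */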
2146 static void get_openreq4(const struct request_sock *req,
2147                          struct seq_file *f, int i)
2148 {
2149         const struct inet_request_sock *ireq = inet_rsk(req);
2150         long delta = req->rsk_timer.expires - jiffies;
2151
2152         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2153                 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2154                 i,
2155                 ireq->ir_loc_addr,
2156                 ireq->ir_num,
2157                 ireq->ir_rmt_addr,
2158                 ntohs(ireq->ir_rmt_port),
2159                 TCP_SYN_RECV,
2160                 0, 0, /* could print option size, but that is af dependent. */
2161                 1,    /* timers active (only the expire timer) */
2162                 jiffies_delta_to_clock_t(delta),
2163                 req->num_timeout,
2164                 from_kuid_munged(seq_user_ns(f),
2165                                  sock_i_uid(req->rsk_listener)),
2166                 0,  /* non standard timer */
2167                 0, /* open_requests have no inode */
2168                 0,
2169                 req);
2170 }
2171
2172 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2173 {
2174         int timer_active;
2175         unsigned long timer_expires;
2176         const struct tcp_sock *tp = tcp_sk(sk);
2177         const struct inet_connection_sock *icsk = inet_csk(sk);
2178         const struct inet_sock *inet = inet_sk(sk);
2179         const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2180         __be32 dest = inet->inet_daddr;
2181         __be32 src = inet->inet_rcv_saddr;
2182         __u16 destp = ntohs(inet->inet_dport);
2183         __u16 srcp = ntohs(inet->inet_sport);
2184         int rx_queue;
2185         int state;
2186
2187         if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2188             icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
2189             icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2190                 timer_active    = 1;
2191                 timer_expires   = icsk->icsk_timeout;
2192         } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2193                 timer_active    = 4;
2194                 timer_expires   = icsk->icsk_timeout;
2195         } else if (timer_pending(&sk->sk_timer)) {
2196                 timer_active    = 2;
2197                 timer_expires   = sk->sk_timer.expires;
2198         } else {
2199                 timer_active    = 0;
2200                 timer_expires = jiffies;
2201         }
2202
2203         state = sk_state_load(sk);
2204         if (state == TCP_LISTEN)
2205                 rx_queue = sk->sk_ack_backlog;
2206         else
2207                 /* Because we don't lock the socket,
2208                  * we might find a transient negative value.
2209                  */
2210                 rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2211
2212         seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2213                         "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2214                 i, src, srcp, dest, destp, state,
2215                 tp->write_seq - tp->snd_una,
2216                 rx_queue,
2217                 timer_active,
2218                 jiffies_delta_to_clock_t(timer_expires - jiffies),
2219                 icsk->icsk_retransmits,
2220                 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2221                 icsk->icsk_probes_out,
2222                 sock_i_ino(sk),
2223                 atomic_read(&sk->sk_refcnt), sk,
2224                 jiffies_to_clock_t(icsk->icsk_rto),
2225                 jiffies_to_clock_t(icsk->icsk_ack.ato),
2226                 (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2227                 tp->snd_cwnd,
2228                 state == TCP_LISTEN ?
2229                     fastopenq->max_qlen :
2230                     (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2231 }
2232
2233 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2234                                struct seq_file *f, int i)
2235 {
2236         long delta = tw->tw_timer.expires - jiffies;
2237         __be32 dest, src;
2238         __u16 destp, srcp;
2239
2240         dest  = tw->tw_daddr;
2241         src   = tw->tw_rcv_saddr;
2242         destp = ntohs(tw->tw_dport);
2243         srcp  = ntohs(tw->tw_sport);
2244
2245         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2246                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2247                 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2248                 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2249                 atomic_read(&tw->tw_refcnt), tw);
2250 }
2251
2252 #define TMPSZ 150
2253
2254 static int tcp4_seq_show(struct seq_file *seq, void *v)
2255 {
2256         struct tcp_iter_state *st;
2257         struct sock *sk = v;
2258
2259         seq_setwidth(seq, TMPSZ - 1);
2260         if (v == SEQ_START_TOKEN) {
2261                 seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2262                            "rx_queue tr tm->when retrnsmt   uid  timeout "
2263                            "inode");
2264                 goto out;
2265         }
2266         st = seq->private;
2267
2268         if (sk->sk_state == TCP_TIME_WAIT)
2269                 get_timewait4_sock(v, seq, st->num);
2270         else if (sk->sk_state == TCP_NEW_SYN_RECV)
2271                 get_openreq4(v, seq, st->num);
2272         else
2273                 get_tcp4_sock(v, seq, st->num);
2274 out:
2275         seq_pad(seq, '\n');
2276         return 0;
2277 }
2278
2279 static const struct file_operations tcp_afinfo_seq_fops = {
2280         .owner   = THIS_MODULE,
2281         .open    = tcp_seq_open,
2282         .read    = seq_read,
2283         .llseek  = seq_lseek,
2284         .release = seq_release_net
2285 };
2286
2287 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2288         .name           = "tcp",
2289         .family         = AF_INET,
2290         .seq_fops       = &tcp_afinfo_seq_fops,
2291         .seq_ops        = {
2292                 .show           = tcp4_seq_show,
2293         },
2294 };
2295
2296 static int __net_init tcp4_proc_init_net(struct net *net)
2297 {
2298         return tcp_proc_register(net, &tcp4_seq_afinfo);
2299 }
2300
2301 static void __net_exit tcp4_proc_exit_net(struct net *net)
2302 {
2303         tcp_proc_unregister(net, &tcp4_seq_afinfo);
2304 }
2305
2306 static struct pernet_operations tcp4_net_ops = {
2307         .init = tcp4_proc_init_net,
2308         .exit = tcp4_proc_exit_net,
2309 };
2310
2311 int __init tcp4_proc_init(void)
2312 {
2313         return register_pernet_subsys(&tcp4_net_ops);
2314 }
2315
2316 void tcp4_proc_exit(void)
2317 {
2318         unregister_pernet_subsys(&tcp4_net_ops);
2319 }
2320 #endif /* CONFIG_PROC_FS */
2321
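/* Protocol descriptor glue: tcp_prot wires the generic socket layer entry
 * points (connect, sendmsg, setsockopt, ...) to their TCP/IPv4 implementations
 * and is registered for SOCK_STREAM/IPPROTO_TCP sockets.
 */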
2322 struct proto tcp_prot = {
2323         .name                   = "TCP",
2324         .owner                  = THIS_MODULE,
2325         .close                  = tcp_close,
2326         .connect                = tcp_v4_connect,
2327         .disconnect             = tcp_disconnect,
2328         .accept                 = inet_csk_accept,
2329         .ioctl                  = tcp_ioctl,
2330         .init                   = tcp_v4_init_sock,
2331         .destroy                = tcp_v4_destroy_sock,
2332         .shutdown               = tcp_shutdown,
2333         .setsockopt             = tcp_setsockopt,
2334         .getsockopt             = tcp_getsockopt,
2335         .recvmsg                = tcp_recvmsg,
2336         .sendmsg                = tcp_sendmsg,
2337         .sendpage               = tcp_sendpage,
2338         .backlog_rcv            = tcp_v4_do_rcv,
2339         .release_cb             = tcp_release_cb,
2340         .hash                   = inet_hash,
2341         .unhash                 = inet_unhash,
2342         .get_port               = inet_csk_get_port,
2343         .enter_memory_pressure  = tcp_enter_memory_pressure,
2344         .stream_memory_free     = tcp_stream_memory_free,
2345         .sockets_allocated      = &tcp_sockets_allocated,
2346         .orphan_count           = &tcp_orphan_count,
2347         .memory_allocated       = &tcp_memory_allocated,
2348         .memory_pressure        = &tcp_memory_pressure,
2349         .sysctl_mem             = sysctl_tcp_mem,
2350         .sysctl_wmem            = sysctl_tcp_wmem,
2351         .sysctl_rmem            = sysctl_tcp_rmem,
2352         .max_header             = MAX_TCP_HEADER,
2353         .obj_size               = sizeof(struct tcp_sock),
2354         .slab_flags             = SLAB_DESTROY_BY_RCU,
2355         .twsk_prot              = &tcp_timewait_sock_ops,
2356         .rsk_prot               = &tcp_request_sock_ops,
2357         .h.hashinfo             = &tcp_hashinfo,
2358         .no_autobind            = true,
2359 #ifdef CONFIG_COMPAT
2360         .compat_setsockopt      = compat_tcp_setsockopt,
2361         .compat_getsockopt      = compat_tcp_getsockopt,
2362 #endif
2363         .diag_destroy           = tcp_abort,
2364 };
2365 EXPORT_SYMBOL(tcp_prot);
2366
2367 static void __net_exit tcp_sk_exit(struct net *net)
2368 {
2369         int cpu;
2370
2371         for_each_possible_cpu(cpu)
2372                 inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
2373         free_percpu(net->ipv4.tcp_sk);
2374 }
2375
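/* Per-netns initialisation: create one control socket per possible CPU (used
 * to send RSTs and ACKs that are not associated with any user socket) and set
 * the namespace-local sysctl defaults.
 */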
2376 static int __net_init tcp_sk_init(struct net *net)
2377 {
2378         int res, cpu;
2379
2380         net->ipv4.tcp_sk = alloc_percpu(struct sock *);
2381         if (!net->ipv4.tcp_sk)
2382                 return -ENOMEM;
2383
2384         for_each_possible_cpu(cpu) {
2385                 struct sock *sk;
2386
2387                 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
2388                                            IPPROTO_TCP, net);
2389                 if (res)
2390                         goto fail;
2391                 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
2392                 *per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
2393         }
2394
2395         net->ipv4.sysctl_tcp_ecn = 2;
2396         net->ipv4.sysctl_tcp_ecn_fallback = 1;
2397
2398         net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
2399         net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
2400         net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
2401
2402         net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
2403         net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
2404         net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
2405
2406         net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
2407         net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
2408         net->ipv4.sysctl_tcp_syncookies = 1;
2409         net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
2410         net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
2411         net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
2412         net->ipv4.sysctl_tcp_orphan_retries = 0;
2413         net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
2414         net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
2415
2416         return 0;
2417 fail:
2418         tcp_sk_exit(net);
2419
2420         return res;
2421 }
2422
2423 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2424 {
2425         inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET);
2426 }
2427
2428 static struct pernet_operations __net_initdata tcp_sk_ops = {
2429        .init       = tcp_sk_init,
2430        .exit       = tcp_sk_exit,
2431        .exit_batch = tcp_sk_exit_batch,
2432 };
2433
2434 void __init tcp_v4_init(void)
2435 {
2436         inet_hashinfo_init(&tcp_hashinfo);
2437         if (register_pernet_subsys(&tcp_sk_ops))
2438                 panic("Failed to create the TCP control socket.\n");
2439 }