/*
 * INET         An implementation of the TCP/IP protocol suite for the LINUX
 *              operating system.  INET is implemented using the  BSD Socket
 *              interface as the means of communication with the user level.
 *
 *              Implementation of the Transmission Control Protocol(TCP).
 *
 *              IPv4 specific functions
 *
 *
 *              code split from:
 *              linux/ipv4/tcp.c
 *              linux/ipv4/tcp_input.c
 *              linux/ipv4/tcp_output.c
 *
 *              See tcp.c for author information
 *
 *      This program is free software; you can redistribute it and/or
 *      modify it under the terms of the GNU General Public License
 *      as published by the Free Software Foundation; either version
 *      2 of the License, or (at your option) any later version.
 */

/*
 * Changes:
 *              David S. Miller :       New socket lookup architecture.
 *                                      This code is dedicated to John Dyson.
 *              David S. Miller :       Change semantics of established hash,
 *                                      half is devoted to TIME_WAIT sockets
 *                                      and the rest go in the other half.
 *              Andi Kleen :            Add support for syncookies and fixed
 *                                      some bugs: ip options weren't passed to
 *                                      the TCP layer, missed a check for an
 *                                      ACK bit.
 *              Andi Kleen :            Implemented fast path mtu discovery.
 *                                      Fixed many serious bugs in the
 *                                      request_sock handling and moved
 *                                      most of it into the af independent code.
 *                                      Added tail drop and some other bugfixes.
 *                                      Added new listen semantics.
 *              Mike McLagan    :       Routing by source
 *      Juan Jose Ciarlante:            ip_dynaddr bits
 *              Andi Kleen:             various fixes.
 *      Vitaly E. Lavrov        :       Transparent proxy revived after year
 *                                      coma.
 *      Andi Kleen              :       Fix new listen.
 *      Andi Kleen              :       Fix accept error reporting.
 *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
 *      Alexey Kuznetsov                allows both IPv4 and IPv6 sockets to bind
 *                                      a single port at the same time.
 */

#define pr_fmt(fmt) "TCP: " fmt

#include <linux/bottom_half.h>
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/cache.h>
#include <linux/jhash.h>
#include <linux/init.h>
#include <linux/times.h>
#include <linux/slab.h>

#include <net/net_namespace.h>
#include <net/icmp.h>
#include <net/inet_hashtables.h>
#include <net/tcp.h>
#include <net/transp_v6.h>
#include <net/ipv6.h>
#include <net/inet_common.h>
#include <net/timewait_sock.h>
#include <net/xfrm.h>
#include <net/secure_seq.h>
#include <net/busy_poll.h>

#include <linux/inet.h>
#include <linux/ipv6.h>
#include <linux/stddef.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>

#include <crypto/hash.h>
#include <linux/scatterlist.h>

int sysctl_tcp_tw_reuse __read_mostly;
int sysctl_tcp_low_latency __read_mostly;
EXPORT_SYMBOL(sysctl_tcp_low_latency);

#ifdef CONFIG_TCP_MD5SIG
static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
                               __be32 daddr, __be32 saddr, const struct tcphdr *th);
#endif

struct inet_hashinfo tcp_hashinfo;
EXPORT_SYMBOL(tcp_hashinfo);

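/* The initial sequence number is a keyed hash of the connection
 * four-tuple plus a clock component (see secure_tcp_sequence_number()),
 * in the spirit of RFC 6528: hard to predict off-path, yet still
 * increasing over time for a given four-tuple.
 */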
static  __u32 tcp_v4_init_sequence(const struct sk_buff *skb)
{
        return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
                                          ip_hdr(skb)->saddr,
                                          tcp_hdr(skb)->dest,
                                          tcp_hdr(skb)->source);
}

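/* Decide whether a new connect() may take over a four-tuple that is
 * still held by a TIME-WAIT bucket; returning 1 lets the caller destroy
 * the old bucket and reuse the port pair for the new connection.
 */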
int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
{
        const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
        struct tcp_sock *tp = tcp_sk(sk);

        /* With PAWS, it is safe from the viewpoint
           of data integrity. Even without PAWS it is safe provided sequence
           spaces do not overlap i.e. at data rates <= 80Mbit/sec.

           Actually, the idea is close to VJ's one, only timestamp cache is
           held not per host, but per port pair and TW bucket is used as state
           holder.

           If TW bucket has been already destroyed we fall back to VJ's scheme
           and use initial timestamp retrieved from peer table.
         */
        if (tcptw->tw_ts_recent_stamp &&
            (!twp || (sysctl_tcp_tw_reuse &&
                             get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
                tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
                if (tp->write_seq == 0)
                        tp->write_seq = 1;
                tp->rx_opt.ts_recent       = tcptw->tw_ts_recent;
                tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
                sock_hold(sktw);
                return 1;
        }

        return 0;
}
EXPORT_SYMBOL_GPL(tcp_twsk_unique);

/* This will initiate an outgoing connection. */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
        struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
        struct inet_sock *inet = inet_sk(sk);
        struct tcp_sock *tp = tcp_sk(sk);
        __be16 orig_sport, orig_dport;
        __be32 daddr, nexthop;
        struct flowi4 *fl4;
        struct rtable *rt;
        int err;
        struct ip_options_rcu *inet_opt;

        if (addr_len < sizeof(struct sockaddr_in))
                return -EINVAL;

        if (usin->sin_family != AF_INET)
                return -EAFNOSUPPORT;

        nexthop = daddr = usin->sin_addr.s_addr;
        inet_opt = rcu_dereference_protected(inet->inet_opt,
                                             lockdep_sock_is_held(sk));
        if (inet_opt && inet_opt->opt.srr) {
                if (!daddr)
                        return -EINVAL;
                nexthop = inet_opt->opt.faddr;
        }

        orig_sport = inet->inet_sport;
        orig_dport = usin->sin_port;
        fl4 = &inet->cork.fl.u.ip4;
        rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
                              RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
                              IPPROTO_TCP,
                              orig_sport, orig_dport, sk);
        if (IS_ERR(rt)) {
                err = PTR_ERR(rt);
                if (err == -ENETUNREACH)
                        IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
                return err;
        }

        if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
                ip_rt_put(rt);
                return -ENETUNREACH;
        }

        if (!inet_opt || !inet_opt->opt.srr)
                daddr = fl4->daddr;

        if (!inet->inet_saddr)
                inet->inet_saddr = fl4->saddr;
        sk_rcv_saddr_set(sk, inet->inet_saddr);

        if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
                /* Reset inherited state */
                tp->rx_opt.ts_recent       = 0;
                tp->rx_opt.ts_recent_stamp = 0;
                if (likely(!tp->repair))
                        tp->write_seq      = 0;
        }

        if (tcp_death_row.sysctl_tw_recycle &&
            !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr)
                tcp_fetch_timewait_stamp(sk, &rt->dst);

        inet->inet_dport = usin->sin_port;
        sk_daddr_set(sk, daddr);

        inet_csk(sk)->icsk_ext_hdr_len = 0;
        if (inet_opt)
                inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;

        tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;

        /* Socket identity is still unknown (sport may be zero).
         * However we set state to SYN-SENT and, without releasing the socket
         * lock, select a source port, enter ourselves into the hash tables and
         * complete initialization after this.
         */
        tcp_set_state(sk, TCP_SYN_SENT);
        err = inet_hash_connect(&tcp_death_row, sk);
        if (err)
                goto failure;

        sk_set_txhash(sk);

        rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
                               inet->inet_sport, inet->inet_dport, sk);
        if (IS_ERR(rt)) {
                err = PTR_ERR(rt);
                rt = NULL;
                goto failure;
        }
        /* OK, now commit destination to socket.  */
        sk->sk_gso_type = SKB_GSO_TCPV4;
        sk_setup_caps(sk, &rt->dst);

        if (!tp->write_seq && likely(!tp->repair))
                tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
                                                           inet->inet_daddr,
                                                           inet->inet_sport,
                                                           usin->sin_port);

        inet->inet_id = tp->write_seq ^ jiffies;

        err = tcp_connect(sk);

        rt = NULL;
        if (err)
                goto failure;

        return 0;

failure:
        /*
         * This unhashes the socket and releases the local port,
         * if necessary.
         */
        tcp_set_state(sk, TCP_CLOSE);
        ip_rt_put(rt);
        sk->sk_route_caps = 0;
        inet->inet_dport = 0;
        return err;
}
EXPORT_SYMBOL(tcp_v4_connect);
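
/* For orientation, a sketch of how a blocking connect() from userspace
 * reaches this function (callers outside this file named from memory):
 *
 *      connect(fd, ...)
 *        -> inet_stream_connect()
 *             -> sk->sk_prot->connect == tcp_v4_connect()
 *                  -> tcp_connect()        (builds and sends the SYN)
 */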

/*
 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
 * It can be called through tcp_release_cb() if socket was owned by user
 * at the time tcp_v4_err() was called to handle ICMP message.
 */
void tcp_v4_mtu_reduced(struct sock *sk)
{
        struct dst_entry *dst;
        struct inet_sock *inet = inet_sk(sk);
        u32 mtu = tcp_sk(sk)->mtu_info;

        dst = inet_csk_update_pmtu(sk, mtu);
        if (!dst)
                return;

        /* Something is about to go wrong... Remember the soft error
         * in case this connection is not able to recover.
         */
        if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
                sk->sk_err_soft = EMSGSIZE;

        mtu = dst_mtu(dst);

        if (inet->pmtudisc != IP_PMTUDISC_DONT &&
            ip_sk_accept_pmtu(sk) &&
            inet_csk(sk)->icsk_pmtu_cookie > mtu) {
                tcp_sync_mss(sk, mtu);

                /* Resend the TCP packet because it's
                 * clear that the old packet has been
                 * dropped. This is the new "fast" path mtu
                 * discovery.
                 */
                tcp_simple_retransmit(sk);
        } /* else let the usual retransmit timer handle it */
}
EXPORT_SYMBOL(tcp_v4_mtu_reduced);

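/* An ICMP redirect is handed to the route's ->redirect handler, which
 * rewrites the cached next hop for this socket's destination.
 */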
static void do_redirect(struct sk_buff *skb, struct sock *sk)
{
        struct dst_entry *dst = __sk_dst_check(sk, 0);

        if (dst)
                dst->ops->redirect(dst, sk, skb);
}


/* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
void tcp_req_err(struct sock *sk, u32 seq, bool abort)
{
        struct request_sock *req = inet_reqsk(sk);
        struct net *net = sock_net(sk);

        /* ICMPs are not backlogged, hence we cannot get
         * an established socket here.
         */
        if (seq != tcp_rsk(req)->snt_isn) {
                __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
        } else if (abort) {
                /*
                 * Still in SYN_RECV, just remove it silently.
                 * There is no good way to pass the error to the newly
                 * created socket, and POSIX does not want network
                 * errors returned from accept().
                 */
                inet_csk_reqsk_queue_drop(req->rsk_listener, req);
                tcp_listendrop(req->rsk_listener);
        }
        reqsk_put(req);
}
EXPORT_SYMBOL(tcp_req_err);

/*
 * This routine is called by the ICMP module when it gets some
 * sort of error condition.  If err < 0 then the socket should
 * be closed and the error returned to the user.  If err > 0
 * it's just the icmp type << 8 | icmp code.  After adjustment
 * header points to the first 8 bytes of the tcp header.  We need
 * to find the appropriate port.
 *
 * The locking strategy used here is very "optimistic". When
 * someone else accesses the socket the ICMP is just dropped
 * and for some paths there is no check at all.
 * A more general error queue to queue errors for later handling
 * is probably better.
 *
 */

void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
{
        const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
        struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
        struct inet_connection_sock *icsk;
        struct tcp_sock *tp;
        struct inet_sock *inet;
        const int type = icmp_hdr(icmp_skb)->type;
        const int code = icmp_hdr(icmp_skb)->code;
        struct sock *sk;
        struct sk_buff *skb;
        struct request_sock *fastopen;
        __u32 seq, snd_una;
        __u32 remaining;
        int err;
        struct net *net = dev_net(icmp_skb->dev);

        sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
                                       th->dest, iph->saddr, ntohs(th->source),
                                       inet_iif(icmp_skb));
        if (!sk) {
                __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
                return;
        }
        if (sk->sk_state == TCP_TIME_WAIT) {
                inet_twsk_put(inet_twsk(sk));
                return;
        }
        seq = ntohl(th->seq);
        if (sk->sk_state == TCP_NEW_SYN_RECV)
                return tcp_req_err(sk, seq,
                                  type == ICMP_PARAMETERPROB ||
                                  type == ICMP_TIME_EXCEEDED ||
                                  (type == ICMP_DEST_UNREACH &&
                                   (code == ICMP_NET_UNREACH ||
                                    code == ICMP_HOST_UNREACH)));

        bh_lock_sock(sk);
        /* If too many ICMPs get dropped on busy
         * servers this needs to be solved differently.
         * We do take care of PMTU discovery (RFC1191) special case :
         * we can receive locally generated ICMP messages while socket is held.
         */
        if (sock_owned_by_user(sk)) {
                if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
                        __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
        }
        if (sk->sk_state == TCP_CLOSE)
                goto out;

        if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
                __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
                goto out;
        }

        icsk = inet_csk(sk);
        tp = tcp_sk(sk);
        /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
        fastopen = tp->fastopen_rsk;
        snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
        if (sk->sk_state != TCP_LISTEN &&
            !between(seq, snd_una, tp->snd_nxt)) {
                __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
                goto out;
        }

        switch (type) {
        case ICMP_REDIRECT:
                do_redirect(icmp_skb, sk);
                goto out;
        case ICMP_SOURCE_QUENCH:
                /* Just silently ignore these. */
                goto out;
        case ICMP_PARAMETERPROB:
                err = EPROTO;
                break;
        case ICMP_DEST_UNREACH:
                if (code > NR_ICMP_UNREACH)
                        goto out;

                if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
                        /* We are not interested in TCP_LISTEN and open_requests
                         * (SYN-ACKs sent out by Linux are always <576bytes so
                         * they should go through unfragmented).
                         */
                        if (sk->sk_state == TCP_LISTEN)
                                goto out;

                        tp->mtu_info = info;
                        if (!sock_owned_by_user(sk)) {
                                tcp_v4_mtu_reduced(sk);
                        } else {
                                if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &tp->tsq_flags))
                                        sock_hold(sk);
                        }
                        goto out;
                }

                err = icmp_err_convert[code].errno;
                /* check if icmp_skb allows revert of backoff
                 * (see draft-zimmermann-tcp-lcd) */
                if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
                        break;
                if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
                    !icsk->icsk_backoff || fastopen)
                        break;

                if (sock_owned_by_user(sk))
                        break;

                icsk->icsk_backoff--;
                icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) :
                                               TCP_TIMEOUT_INIT;
                icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);

                skb = tcp_write_queue_head(sk);
                BUG_ON(!skb);

                remaining = icsk->icsk_rto -
                            min(icsk->icsk_rto,
                                tcp_time_stamp - tcp_skb_timestamp(skb));

                if (remaining) {
                        inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
                                                  remaining, TCP_RTO_MAX);
                } else {
                        /* RTO revert clocked out retransmission.
                         * Will retransmit now */
                        tcp_retransmit_timer(sk);
                }

                break;
        case ICMP_TIME_EXCEEDED:
                err = EHOSTUNREACH;
                break;
        default:
                goto out;
        }

        switch (sk->sk_state) {
        case TCP_SYN_SENT:
        case TCP_SYN_RECV:
                /* Only in fast or simultaneous open. If a fast open socket
                 * is already accepted it is treated as a connected one below.
                 */
                if (fastopen && !fastopen->sk)
                        break;

                if (!sock_owned_by_user(sk)) {
                        sk->sk_err = err;

                        sk->sk_error_report(sk);

                        tcp_done(sk);
                } else {
                        sk->sk_err_soft = err;
                }
                goto out;
        }

        /* If we've already connected we will keep trying
         * until we time out, or the user gives up.
         *
         * rfc1122 4.2.3.9 allows us to consider as hard errors
         * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
         * but it is obsoleted by pmtu discovery).
         *
         * Note that in the modern internet, where routing is unreliable
         * and in each dark corner broken firewalls sit, sending random
         * errors ordered by their masters, even these two messages finally lose
         * their original sense (even Linux sends invalid PORT_UNREACHs)
         *
         * Now we are in compliance with RFCs.
         *                                                      --ANK (980905)
         */

        inet = inet_sk(sk);
        if (!sock_owned_by_user(sk) && inet->recverr) {
                sk->sk_err = err;
                sk->sk_error_report(sk);
        } else  { /* Only an error on timeout */
                sk->sk_err_soft = err;
        }

out:
        bh_unlock_sock(sk);
        sock_put(sk);
}

void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
{
        struct tcphdr *th = tcp_hdr(skb);

        if (skb->ip_summed == CHECKSUM_PARTIAL) {
                th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
                skb->csum_start = skb_transport_header(skb) - skb->head;
                skb->csum_offset = offsetof(struct tcphdr, check);
        } else {
                th->check = tcp_v4_check(skb->len, saddr, daddr,
                                         csum_partial(th,
                                                      th->doff << 2,
                                                      skb->csum));
        }
}
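
/* With CHECKSUM_PARTIAL only the pseudo-header sum is stored above;
 * csum_start/csum_offset tell the NIC (or the software fallback) where
 * to fold in the one's-complement sum over TCP header and payload.
 */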

/* This routine computes an IPv4 TCP checksum. */
void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
{
        const struct inet_sock *inet = inet_sk(sk);

        __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
}
EXPORT_SYMBOL(tcp_v4_send_check);

/*
 *      This routine will send an RST to the other tcp.
 *
 *      Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
 *                    for reset.
 *      Answer: if a packet caused RST, it is not for a socket
 *              existing in our system, if it is matched to a socket,
 *              it is just duplicate segment or bug in other side's TCP.
 *              So we build the reply based only on parameters
 *              that arrived with the segment.
 *      Exception: precedence violation. We do not implement it in any case.
 */

static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
{
        const struct tcphdr *th = tcp_hdr(skb);
        struct {
                struct tcphdr th;
#ifdef CONFIG_TCP_MD5SIG
                __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
#endif
        } rep;
        struct ip_reply_arg arg;
#ifdef CONFIG_TCP_MD5SIG
        struct tcp_md5sig_key *key = NULL;
        const __u8 *hash_location = NULL;
        unsigned char newhash[16];
        int genhash;
        struct sock *sk1 = NULL;
#endif
        struct net *net;

        /* Never send a reset in response to a reset. */
        if (th->rst)
                return;

        /* If sk not NULL, it means we did a successful lookup and incoming
         * route had to be correct. prequeue might have dropped our dst.
         */
        if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
                return;

        /* Swap the send and the receive. */
        memset(&rep, 0, sizeof(rep));
        rep.th.dest   = th->source;
        rep.th.source = th->dest;
        rep.th.doff   = sizeof(struct tcphdr) / 4;
        rep.th.rst    = 1;

        if (th->ack) {
                rep.th.seq = th->ack_seq;
        } else {
                rep.th.ack = 1;
                rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
                                       skb->len - (th->doff << 2));
        }

        memset(&arg, 0, sizeof(arg));
        arg.iov[0].iov_base = (unsigned char *)&rep;
        arg.iov[0].iov_len  = sizeof(rep.th);

        net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
#ifdef CONFIG_TCP_MD5SIG
        rcu_read_lock();
        hash_location = tcp_parse_md5sig_option(th);
        if (sk && sk_fullsock(sk)) {
                key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
                                        &ip_hdr(skb)->saddr, AF_INET);
        } else if (hash_location) {
                /*
                 * The active side is lost. Try to find the listening socket
                 * through the source port, and then find the md5 key through
                 * the listening socket. We do not lose security here:
                 * the incoming packet is checked with the md5 hash of the
                 * found key; no RST is generated if the md5 hash doesn't match.
                 */
                sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
                                             ip_hdr(skb)->saddr,
                                             th->source, ip_hdr(skb)->daddr,
                                             ntohs(th->source), inet_iif(skb));
                /* don't send rst if we can't find a key */
                if (!sk1)
                        goto out;

                key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
                                        &ip_hdr(skb)->saddr, AF_INET);
                if (!key)
                        goto out;

                genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
                if (genhash || memcmp(hash_location, newhash, 16) != 0)
                        goto out;

        }

        if (key) {
                rep.opt[0] = htonl((TCPOPT_NOP << 24) |
                                   (TCPOPT_NOP << 16) |
                                   (TCPOPT_MD5SIG << 8) |
                                   TCPOLEN_MD5SIG);
                /* Update length and the length the header thinks exists */
                arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
                rep.th.doff = arg.iov[0].iov_len / 4;

                tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
                                     key, ip_hdr(skb)->saddr,
                                     ip_hdr(skb)->daddr, &rep.th);
        }
#endif
        arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
                                      ip_hdr(skb)->saddr, /* XXX */
                                      arg.iov[0].iov_len, IPPROTO_TCP, 0);
        arg.csumoffset = offsetof(struct tcphdr, check) / 2;
        arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;

        /* When the socket is gone, all binding information is lost;
         * routing might fail in this case. No choice here: if we choose
         * to force the input interface, we will misroute in case of an
         * asymmetric route.
         */
        if (sk)
                arg.bound_dev_if = sk->sk_bound_dev_if;

        BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
                     offsetof(struct inet_timewait_sock, tw_bound_dev_if));

        arg.tos = ip_hdr(skb)->tos;
        local_bh_disable();
        ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
                              skb, &TCP_SKB_CB(skb)->header.h4.opt,
                              ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
                              &arg, arg.iov[0].iov_len);

        __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
        __TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
        local_bh_enable();

#ifdef CONFIG_TCP_MD5SIG
out:
        rcu_read_unlock();
#endif
}
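
/* Note that the reply is built and sent through a per-cpu kernel socket
 * (net->ipv4.tcp_sk), not through the socket the segment matched, so an
 * RST can be emitted even when no local socket exists at all.
 */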

/* The code below, sending ACKs in SYN-RECV and TIME-WAIT states
   outside socket context, is ugly, certainly. What can I do?
 */

static void tcp_v4_send_ack(struct net *net,
                            struct sk_buff *skb, u32 seq, u32 ack,
                            u32 win, u32 tsval, u32 tsecr, int oif,
                            struct tcp_md5sig_key *key,
                            int reply_flags, u8 tos)
{
        const struct tcphdr *th = tcp_hdr(skb);
        struct {
                struct tcphdr th;
                __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
#ifdef CONFIG_TCP_MD5SIG
                           + (TCPOLEN_MD5SIG_ALIGNED >> 2)
#endif
                        ];
        } rep;
        struct ip_reply_arg arg;

        memset(&rep.th, 0, sizeof(struct tcphdr));
        memset(&arg, 0, sizeof(arg));

        arg.iov[0].iov_base = (unsigned char *)&rep;
        arg.iov[0].iov_len  = sizeof(rep.th);
        if (tsecr) {
                rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
                                   (TCPOPT_TIMESTAMP << 8) |
                                   TCPOLEN_TIMESTAMP);
                rep.opt[1] = htonl(tsval);
                rep.opt[2] = htonl(tsecr);
                arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
        }

        /* Swap the send and the receive. */
        rep.th.dest    = th->source;
        rep.th.source  = th->dest;
        rep.th.doff    = arg.iov[0].iov_len / 4;
        rep.th.seq     = htonl(seq);
        rep.th.ack_seq = htonl(ack);
        rep.th.ack     = 1;
        rep.th.window  = htons(win);

#ifdef CONFIG_TCP_MD5SIG
        if (key) {
                int offset = (tsecr) ? 3 : 0;

                rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
                                          (TCPOPT_NOP << 16) |
                                          (TCPOPT_MD5SIG << 8) |
                                          TCPOLEN_MD5SIG);
                arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
                rep.th.doff = arg.iov[0].iov_len/4;

                tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
                                    key, ip_hdr(skb)->saddr,
                                    ip_hdr(skb)->daddr, &rep.th);
        }
#endif
        arg.flags = reply_flags;
        arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
                                      ip_hdr(skb)->saddr, /* XXX */
                                      arg.iov[0].iov_len, IPPROTO_TCP, 0);
        arg.csumoffset = offsetof(struct tcphdr, check) / 2;
        if (oif)
                arg.bound_dev_if = oif;
        arg.tos = tos;
        local_bh_disable();
        ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
                              skb, &TCP_SKB_CB(skb)->header.h4.opt,
                              ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
                              &arg, arg.iov[0].iov_len);

        __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
        local_bh_enable();
}

static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
{
        struct inet_timewait_sock *tw = inet_twsk(sk);
        struct tcp_timewait_sock *tcptw = tcp_twsk(sk);

        tcp_v4_send_ack(sock_net(sk), skb,
                        tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
                        tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
                        tcp_time_stamp + tcptw->tw_ts_offset,
                        tcptw->tw_ts_recent,
                        tw->tw_bound_dev_if,
                        tcp_twsk_md5_key(tcptw),
                        tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
                        tw->tw_tos
                        );

        inet_twsk_put(tw);
}

static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
                                  struct request_sock *req)
{
        /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
         * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
         */
        u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
                                             tcp_sk(sk)->snd_nxt;

        tcp_v4_send_ack(sock_net(sk), skb, seq,
                        tcp_rsk(req)->rcv_nxt, req->rsk_rcv_wnd,
                        tcp_time_stamp,
                        req->ts_recent,
                        0,
                        tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr,
                                          AF_INET),
                        inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
                        ip_hdr(skb)->tos);
}

/*
 *      Send a SYN-ACK after having received a SYN.
 *      This still operates on a request_sock only, not on a big
 *      socket.
 */
static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
                              struct flowi *fl,
                              struct request_sock *req,
                              struct tcp_fastopen_cookie *foc,
                              enum tcp_synack_type synack_type)
{
        const struct inet_request_sock *ireq = inet_rsk(req);
        struct flowi4 fl4;
        int err = -1;
        struct sk_buff *skb;

        /* First, grab a route. */
        if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
                return -1;

        skb = tcp_make_synack(sk, dst, req, foc, synack_type);

        if (skb) {
                __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);

                err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
                                            ireq->ir_rmt_addr,
                                            ireq->opt);
                err = net_xmit_eval(err);
        }

        return err;
}

/*
 *      IPv4 request_sock destructor.
 */
static void tcp_v4_reqsk_destructor(struct request_sock *req)
{
        kfree(inet_rsk(req)->opt);
}

#ifdef CONFIG_TCP_MD5SIG
/*
 * RFC2385 MD5 checksumming requires a mapping of
 * IP address->MD5 Key.
 * We need to maintain these in the sk structure.
 */

/* Find the Key structure for an address.  */
struct tcp_md5sig_key *tcp_md5_do_lookup(const struct sock *sk,
                                         const union tcp_md5_addr *addr,
                                         int family)
{
        const struct tcp_sock *tp = tcp_sk(sk);
        struct tcp_md5sig_key *key;
        unsigned int size = sizeof(struct in_addr);
        const struct tcp_md5sig_info *md5sig;

        /* caller either holds rcu_read_lock() or socket lock */
        md5sig = rcu_dereference_check(tp->md5sig_info,
                                       lockdep_sock_is_held(sk));
        if (!md5sig)
                return NULL;
#if IS_ENABLED(CONFIG_IPV6)
        if (family == AF_INET6)
                size = sizeof(struct in6_addr);
#endif
        hlist_for_each_entry_rcu(key, &md5sig->head, node) {
                if (key->family != family)
                        continue;
                if (!memcmp(&key->addr, addr, size))
                        return key;
        }
        return NULL;
}
EXPORT_SYMBOL(tcp_md5_do_lookup);

struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
                                         const struct sock *addr_sk)
{
        const union tcp_md5_addr *addr;

        addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
        return tcp_md5_do_lookup(sk, addr, AF_INET);
}
EXPORT_SYMBOL(tcp_v4_md5_lookup);

/* This can be called on a newly created socket, from other files */
int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
                   int family, const u8 *newkey, u8 newkeylen, gfp_t gfp)
{
        /* Add Key to the list */
        struct tcp_md5sig_key *key;
        struct tcp_sock *tp = tcp_sk(sk);
        struct tcp_md5sig_info *md5sig;

        key = tcp_md5_do_lookup(sk, addr, family);
        if (key) {
                /* Pre-existing entry - just update that one. */
                memcpy(key->key, newkey, newkeylen);
                key->keylen = newkeylen;
                return 0;
        }

        md5sig = rcu_dereference_protected(tp->md5sig_info,
                                           lockdep_sock_is_held(sk));
        if (!md5sig) {
                md5sig = kmalloc(sizeof(*md5sig), gfp);
                if (!md5sig)
                        return -ENOMEM;

                sk_nocaps_add(sk, NETIF_F_GSO_MASK);
                INIT_HLIST_HEAD(&md5sig->head);
                rcu_assign_pointer(tp->md5sig_info, md5sig);
        }

        key = sock_kmalloc(sk, sizeof(*key), gfp);
        if (!key)
                return -ENOMEM;
        if (!tcp_alloc_md5sig_pool()) {
                sock_kfree_s(sk, key, sizeof(*key));
                return -ENOMEM;
        }

        memcpy(key->key, newkey, newkeylen);
        key->keylen = newkeylen;
        key->family = family;
        memcpy(&key->addr, addr,
               (family == AF_INET6) ? sizeof(struct in6_addr) :
                                      sizeof(struct in_addr));
        hlist_add_head_rcu(&key->node, &md5sig->head);
        return 0;
}
EXPORT_SYMBOL(tcp_md5_do_add);

int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family)
{
        struct tcp_md5sig_key *key;

        key = tcp_md5_do_lookup(sk, addr, family);
        if (!key)
                return -ENOENT;
        hlist_del_rcu(&key->node);
        atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
        kfree_rcu(key, rcu);
        return 0;
}
EXPORT_SYMBOL(tcp_md5_do_del);

static void tcp_clear_md5_list(struct sock *sk)
{
        struct tcp_sock *tp = tcp_sk(sk);
        struct tcp_md5sig_key *key;
        struct hlist_node *n;
        struct tcp_md5sig_info *md5sig;

        md5sig = rcu_dereference_protected(tp->md5sig_info, 1);

        hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
                hlist_del_rcu(&key->node);
                atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
                kfree_rcu(key, rcu);
        }
}

static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
                                 int optlen)
{
        struct tcp_md5sig cmd;
        struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;

        if (optlen < sizeof(cmd))
                return -EINVAL;

        if (copy_from_user(&cmd, optval, sizeof(cmd)))
                return -EFAULT;

        if (sin->sin_family != AF_INET)
                return -EINVAL;

        if (!cmd.tcpm_keylen)
                return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
                                      AF_INET);

        if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
                return -EINVAL;

        return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
                              AF_INET, cmd.tcpm_key, cmd.tcpm_keylen,
                              GFP_KERNEL);
}

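/* For orientation, the userspace side - a minimal sketch, assuming the
 * uapi struct tcp_md5sig layout from <linux/tcp.h> (error handling
 * omitted; the peer address and key below are examples only):
 *
 *      struct tcp_md5sig md5 = { };
 *      struct sockaddr_in *sin = (struct sockaddr_in *)&md5.tcpm_addr;
 *
 *      sin->sin_family = AF_INET;
 *      sin->sin_addr.s_addr = inet_addr("192.0.2.1");
 *      md5.tcpm_keylen = 6;
 *      memcpy(md5.tcpm_key, "secret", 6);
 *      setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
 *
 * The digest computed by the helpers below covers, per RFC 2385: a
 * pseudo-header (saddr, daddr, zero pad, protocol, TCP length), the TCP
 * header with its checksum zeroed, the payload, and finally the key.
 */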
static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
                                   __be32 daddr, __be32 saddr,
                                   const struct tcphdr *th, int nbytes)
{
        struct tcp4_pseudohdr *bp;
        struct scatterlist sg;
        struct tcphdr *_th;

        bp = hp->scratch;
        bp->saddr = saddr;
        bp->daddr = daddr;
        bp->pad = 0;
        bp->protocol = IPPROTO_TCP;
        bp->len = cpu_to_be16(nbytes);

        _th = (struct tcphdr *)(bp + 1);
        memcpy(_th, th, sizeof(*th));
        _th->check = 0;

        sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
        ahash_request_set_crypt(hp->md5_req, &sg, NULL,
                                sizeof(*bp) + sizeof(*th));
        return crypto_ahash_update(hp->md5_req);
}

static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
                               __be32 daddr, __be32 saddr, const struct tcphdr *th)
{
        struct tcp_md5sig_pool *hp;
        struct ahash_request *req;

        hp = tcp_get_md5sig_pool();
        if (!hp)
                goto clear_hash_noput;
        req = hp->md5_req;

        if (crypto_ahash_init(req))
                goto clear_hash;
        if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
                goto clear_hash;
        if (tcp_md5_hash_key(hp, key))
                goto clear_hash;
        ahash_request_set_crypt(req, NULL, md5_hash, 0);
        if (crypto_ahash_final(req))
                goto clear_hash;

        tcp_put_md5sig_pool();
        return 0;

clear_hash:
        tcp_put_md5sig_pool();
clear_hash_noput:
        memset(md5_hash, 0, 16);
        return 1;
}

int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
                        const struct sock *sk,
                        const struct sk_buff *skb)
{
        struct tcp_md5sig_pool *hp;
        struct ahash_request *req;
        const struct tcphdr *th = tcp_hdr(skb);
        __be32 saddr, daddr;

        if (sk) { /* valid for establish/request sockets */
                saddr = sk->sk_rcv_saddr;
                daddr = sk->sk_daddr;
        } else {
                const struct iphdr *iph = ip_hdr(skb);
                saddr = iph->saddr;
                daddr = iph->daddr;
        }

        hp = tcp_get_md5sig_pool();
        if (!hp)
                goto clear_hash_noput;
        req = hp->md5_req;

        if (crypto_ahash_init(req))
                goto clear_hash;

        if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
                goto clear_hash;
        if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
                goto clear_hash;
        if (tcp_md5_hash_key(hp, key))
                goto clear_hash;
        ahash_request_set_crypt(req, NULL, md5_hash, 0);
        if (crypto_ahash_final(req))
                goto clear_hash;

        tcp_put_md5sig_pool();
        return 0;

clear_hash:
        tcp_put_md5sig_pool();
clear_hash_noput:
        memset(md5_hash, 0, 16);
        return 1;
}
EXPORT_SYMBOL(tcp_v4_md5_hash_skb);

#endif

/* Called with rcu_read_lock() */
static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
                                    const struct sk_buff *skb)
{
#ifdef CONFIG_TCP_MD5SIG
        /*
         * This gets called for each TCP segment that arrives
         * so we want to be efficient.
         * We have 3 drop cases:
         * o No MD5 hash and one expected.
         * o MD5 hash and we're not expecting one.
         * o MD5 hash and it's wrong.
         */
        const __u8 *hash_location = NULL;
        struct tcp_md5sig_key *hash_expected;
        const struct iphdr *iph = ip_hdr(skb);
        const struct tcphdr *th = tcp_hdr(skb);
        int genhash;
        unsigned char newhash[16];

        hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
                                          AF_INET);
        hash_location = tcp_parse_md5sig_option(th);

        /* We've parsed the options - do we have a hash? */
        if (!hash_expected && !hash_location)
                return false;

        if (hash_expected && !hash_location) {
                NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
                return true;
        }

        if (!hash_expected && hash_location) {
                NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
                return true;
        }

        /* Okay, so this is hash_expected and hash_location -
         * so we need to calculate the checksum.
         */
        genhash = tcp_v4_md5_hash_skb(newhash,
                                      hash_expected,
                                      NULL, skb);

        if (genhash || memcmp(hash_location, newhash, 16) != 0) {
                net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
                                     &iph->saddr, ntohs(th->source),
                                     &iph->daddr, ntohs(th->dest),
                                     genhash ? " tcp_v4_calc_md5_hash failed"
                                     : "");
                return true;
        }
        return false;
#endif
        return false;
}

static void tcp_v4_init_req(struct request_sock *req,
                            const struct sock *sk_listener,
                            struct sk_buff *skb)
{
        struct inet_request_sock *ireq = inet_rsk(req);

        sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
        sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
        ireq->no_srccheck = inet_sk(sk_listener)->transparent;
        ireq->opt = tcp_v4_save_options(skb);
}

static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
                                          struct flowi *fl,
                                          const struct request_sock *req,
                                          bool *strict)
{
        struct dst_entry *dst = inet_csk_route_req(sk, &fl->u.ip4, req);

        if (strict) {
                if (fl->u.ip4.daddr == inet_rsk(req)->ir_rmt_addr)
                        *strict = true;
                else
                        *strict = false;
        }

        return dst;
}

struct request_sock_ops tcp_request_sock_ops __read_mostly = {
        .family         =       PF_INET,
        .obj_size       =       sizeof(struct tcp_request_sock),
        .rtx_syn_ack    =       tcp_rtx_synack,
        .send_ack       =       tcp_v4_reqsk_send_ack,
        .destructor     =       tcp_v4_reqsk_destructor,
        .send_reset     =       tcp_v4_send_reset,
        .syn_ack_timeout =      tcp_syn_ack_timeout,
};

static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
        .mss_clamp      =       TCP_MSS_DEFAULT,
#ifdef CONFIG_TCP_MD5SIG
        .req_md5_lookup =       tcp_v4_md5_lookup,
        .calc_md5_hash  =       tcp_v4_md5_hash_skb,
#endif
        .init_req       =       tcp_v4_init_req,
#ifdef CONFIG_SYN_COOKIES
        .cookie_init_seq =      cookie_v4_init_sequence,
#endif
        .route_req      =       tcp_v4_route_req,
        .init_seq       =       tcp_v4_init_sequence,
        .send_synack    =       tcp_v4_send_synack,
};

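/* Incoming SYNs on a listener enter here; the protocol-independent
 * request handling lives in tcp_conn_request(), which drives the two
 * ops tables above for the IPv4 specifics (route lookup, ISN choice,
 * SYN-ACK emission).
 */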
int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
        /* Never answer SYNs sent to broadcast or multicast */
        if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
                goto drop;

        return tcp_conn_request(&tcp_request_sock_ops,
                                &tcp_request_sock_ipv4_ops, sk, skb);

drop:
        tcp_listendrop(sk);
        return 0;
}
EXPORT_SYMBOL(tcp_v4_conn_request);


/*
 * The three way handshake has completed - we got a valid ACK -
 * now create the new socket.
 */
struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
                                  struct request_sock *req,
                                  struct dst_entry *dst,
                                  struct request_sock *req_unhash,
                                  bool *own_req)
{
        struct inet_request_sock *ireq;
        struct inet_sock *newinet;
        struct tcp_sock *newtp;
        struct sock *newsk;
#ifdef CONFIG_TCP_MD5SIG
        struct tcp_md5sig_key *key;
#endif
        struct ip_options_rcu *inet_opt;

        if (sk_acceptq_is_full(sk))
                goto exit_overflow;

        newsk = tcp_create_openreq_child(sk, req, skb);
        if (!newsk)
                goto exit_nonewsk;

        newsk->sk_gso_type = SKB_GSO_TCPV4;
        inet_sk_rx_dst_set(newsk, skb);

        newtp                 = tcp_sk(newsk);
        newinet               = inet_sk(newsk);
        ireq                  = inet_rsk(req);
        sk_daddr_set(newsk, ireq->ir_rmt_addr);
        sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
        newsk->sk_bound_dev_if = ireq->ir_iif;
        newinet->inet_saddr           = ireq->ir_loc_addr;
        inet_opt              = ireq->opt;
        rcu_assign_pointer(newinet->inet_opt, inet_opt);
        ireq->opt             = NULL;
        newinet->mc_index     = inet_iif(skb);
        newinet->mc_ttl       = ip_hdr(skb)->ttl;
        newinet->rcv_tos      = ip_hdr(skb)->tos;
        inet_csk(newsk)->icsk_ext_hdr_len = 0;
        if (inet_opt)
                inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
        newinet->inet_id = newtp->write_seq ^ jiffies;

        if (!dst) {
                dst = inet_csk_route_child_sock(sk, newsk, req);
                if (!dst)
                        goto put_and_exit;
        } else {
                /* syncookie case : see end of cookie_v4_check() */
        }
        sk_setup_caps(newsk, dst);

        tcp_ca_openreq_child(newsk, dst);

        tcp_sync_mss(newsk, dst_mtu(dst));
        newtp->advmss = dst_metric_advmss(dst);
        if (tcp_sk(sk)->rx_opt.user_mss &&
            tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
                newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;

        tcp_initialize_rcv_mss(newsk);

#ifdef CONFIG_TCP_MD5SIG
        /* Copy over the MD5 key from the original socket */
        key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
                                AF_INET);
        if (key) {
                /*
                 * We're using one, so create a matching key
                 * on the newsk structure. If we fail to get
                 * memory, then we end up not copying the key
                 * across. Shucks.
                 */
                tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
                               AF_INET, key->key, key->keylen, GFP_ATOMIC);
                sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
        }
#endif

        if (__inet_inherit_port(sk, newsk) < 0)
                goto put_and_exit;
        *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
        if (*own_req)
                tcp_move_syn(newtp, req);

        return newsk;

exit_overflow:
        NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
exit_nonewsk:
        dst_release(dst);
exit:
        tcp_listendrop(sk);
        return NULL;
put_and_exit:
        inet_csk_prepare_forced_close(newsk);
        tcp_done(newsk);
        goto exit;
}
EXPORT_SYMBOL(tcp_v4_syn_recv_sock);

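/* With SYN cookies the listener keeps no per-connection state, so the
 * handshake-completing ACK arrives with no matching request sock; any
 * non-SYN segment aimed at a listener is therefore a candidate for
 * cookie validation below.
 */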
1359 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1360 {
1361 #ifdef CONFIG_SYN_COOKIES
1362         const struct tcphdr *th = tcp_hdr(skb);
1363
1364         if (!th->syn)
1365                 sk = cookie_v4_check(sk, skb);
1366 #endif
1367         return sk;
1368 }
1369
1370 /* The socket must have it's spinlock held when we get
1371  * here, unless it is a TCP_LISTEN socket.
1372  *
1373  * We have a potential double-lock case here, so even when
1374  * doing backlog processing we use the BH locking scheme.
1375  * This is because we cannot sleep with the original spinlock
1376  * held.
1377  */
1378 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1379 {
1380         struct sock *rsk;
1381
1382         if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1383                 struct dst_entry *dst = sk->sk_rx_dst;
1384
1385                 sock_rps_save_rxhash(sk, skb);
1386                 sk_mark_napi_id(sk, skb);
1387                 if (dst) {
1388                         if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1389                             !dst->ops->check(dst, 0)) {
1390                                 dst_release(dst);
1391                                 sk->sk_rx_dst = NULL;
1392                         }
1393                 }
1394                 tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len);
1395                 return 0;
1396         }
1397
1398         if (tcp_checksum_complete(skb))
1399                 goto csum_err;
1400
1401         if (sk->sk_state == TCP_LISTEN) {
1402                 struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1403
1404                 if (!nsk)
1405                         goto discard;
1406                 if (nsk != sk) {
1407                         sock_rps_save_rxhash(nsk, skb);
1408                         sk_mark_napi_id(nsk, skb);
1409                         if (tcp_child_process(sk, nsk, skb)) {
1410                                 rsk = nsk;
1411                                 goto reset;
1412                         }
1413                         return 0;
1414                 }
1415         } else
1416                 sock_rps_save_rxhash(sk, skb);
1417
1418         if (tcp_rcv_state_process(sk, skb)) {
1419                 rsk = sk;
1420                 goto reset;
1421         }
1422         return 0;
1423
1424 reset:
1425         tcp_v4_send_reset(rsk, skb);
1426 discard:
1427         kfree_skb(skb);
1428         /* Be careful here. If this function gets more complicated and
1429          * gcc suffers from register pressure on the x86, sk (in %ebx)
1430          * might be destroyed here. This current version compiles correctly,
1431          * but you have been warned.
1432          */
1433         return 0;
1434
1435 csum_err:
1436         TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1437         TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1438         goto discard;
1439 }
1440 EXPORT_SYMBOL(tcp_v4_do_rcv);
1441
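/* Early demux: look the flow up in the established hash while the packet
 * is still at the IP layer, so skb->sk and the socket's cached, validated
 * rx dst can be attached before routing.  Only established sockets are
 * matched here; listeners still go through the normal lookup.
 */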
1442 void tcp_v4_early_demux(struct sk_buff *skb)
1443 {
1444         const struct iphdr *iph;
1445         const struct tcphdr *th;
1446         struct sock *sk;
1447
1448         if (skb->pkt_type != PACKET_HOST)
1449                 return;
1450
1451         if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1452                 return;
1453
1454         iph = ip_hdr(skb);
1455         th = tcp_hdr(skb);
1456
1457         if (th->doff < sizeof(struct tcphdr) / 4)
1458                 return;
1459
1460         sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1461                                        iph->saddr, th->source,
1462                                        iph->daddr, ntohs(th->dest),
1463                                        skb->skb_iif);
1464         if (sk) {
1465                 skb->sk = sk;
1466                 skb->destructor = sock_edemux;
1467                 if (sk_fullsock(sk)) {
1468                         struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);
1469
1470                         if (dst)
1471                                 dst = dst_check(dst, 0);
1472                         if (dst &&
1473                             inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
1474                                 skb_dst_set_noref(skb, dst);
1475                 }
1476         }
1477 }
1478
1479 /* Packet is added to VJ-style prequeue for processing in process
1480  * context, if a reader task is waiting. Apparently, this exciting
1481  * idea (VJ's mail "Re: query about TCP header on tcp-ip" of 07 Sep 93)
1482  * failed somewhere. Latency? Burstiness? Well, at least now we will
1483  * see why it failed. 8)8)                               --ANK
1484  *
1485  */
1486 bool tcp_prequeue(struct sock *sk, struct sk_buff *skb)
1487 {
1488         struct tcp_sock *tp = tcp_sk(sk);
1489
1490         if (sysctl_tcp_low_latency || !tp->ucopy.task)
1491                 return false;
1492
1493         if (skb->len <= tcp_hdrlen(skb) &&
1494             skb_queue_len(&tp->ucopy.prequeue) == 0)
1495                 return false;
1496
1497         /* Before escaping the RCU-protected region, we need to take care of
1498          * the skb dst. Prequeue is only enabled for established sockets.
1499          * For such sockets, we might need the skb dst only to set sk->sk_rx_dst.
1500          * Instead of doing a full sk_rx_dst validation here, let's perform
1501          * an optimistic check.
1502          */
1503         if (likely(sk->sk_rx_dst))
1504                 skb_dst_drop(skb);
1505         else
1506                 skb_dst_force_safe(skb);
1507
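        /* Queue the segment for the reader task.  If the prequeue grows too
         * long or would overflow the receive buffer, drain it right here in
         * softirq context via sk_backlog_rcv(); otherwise wake the reader on
         * the first queued segment and arm a delayed-ACK timer as a safety
         * net.
         */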
1508         __skb_queue_tail(&tp->ucopy.prequeue, skb);
1509         tp->ucopy.memory += skb->truesize;
1510         if (skb_queue_len(&tp->ucopy.prequeue) >= 32 ||
1511             tp->ucopy.memory + atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf) {
1512                 struct sk_buff *skb1;
1513
1514                 BUG_ON(sock_owned_by_user(sk));
1515                 __NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPPREQUEUEDROPPED,
1516                                 skb_queue_len(&tp->ucopy.prequeue));
1517
1518                 while ((skb1 = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
1519                         sk_backlog_rcv(sk, skb1);
1520
1521                 tp->ucopy.memory = 0;
1522         } else if (skb_queue_len(&tp->ucopy.prequeue) == 1) {
1523                 wake_up_interruptible_sync_poll(sk_sleep(sk),
1524                                            POLLIN | POLLRDNORM | POLLRDBAND);
1525                 if (!inet_csk_ack_scheduled(sk))
1526                         inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
1527                                                   (3 * tcp_rto_min(sk)) / 4,
1528                                                   TCP_RTO_MAX);
1529         }
1530         return true;
1531 }
1532 EXPORT_SYMBOL(tcp_prequeue);
1533
1534 /*
1535  *      From tcp_input.c
1536  */
1537
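/* Main receive entry point, called from the IP layer for every TCP
 * segment.  In outline: validate header and checksum, stash the parsing
 * results in TCP_SKB_CB(skb), look up the owning socket, then hand the
 * segment to tcp_v4_do_rcv() directly, to the prequeue, or to the socket
 * backlog, depending on who currently owns the socket lock.
 */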
1538 int tcp_v4_rcv(struct sk_buff *skb)
1539 {
1540         struct net *net = dev_net(skb->dev);
1541         const struct iphdr *iph;
1542         const struct tcphdr *th;
1543         bool refcounted;
1544         struct sock *sk;
1545         int ret;
1546
1547         if (skb->pkt_type != PACKET_HOST)
1548                 goto discard_it;
1549
1550         /* Count it even if it's bad */
1551         __TCP_INC_STATS(net, TCP_MIB_INSEGS);
1552
1553         if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1554                 goto discard_it;
1555
1556         th = (const struct tcphdr *)skb->data;
1557
1558         if (unlikely(th->doff < sizeof(struct tcphdr) / 4))
1559                 goto bad_packet;
1560         if (!pskb_may_pull(skb, th->doff * 4))
1561                 goto discard_it;
1562
1563         /* An explanation is required here, I think.
1564          * Packet length and doff are validated by header prediction,
1565          * provided the case of th->doff == 0 is eliminated.
1566          * So, we defer the checks. */
1567
1568         if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1569                 goto csum_error;
1570
1571         th = (const struct tcphdr *)skb->data;
1572         iph = ip_hdr(skb);
1573         /* This is tricky: we move IPCB to its correct location inside TCP_SKB_CB();
1574          * barrier() makes sure the compiler won't play fool^Waliasing games.
1575          */
1576         memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1577                 sizeof(struct inet_skb_parm));
1578         barrier();
1579
1580         TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1581         TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1582                                     skb->len - th->doff * 4);
1583         TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1584         TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1585         TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1586         TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1587         TCP_SKB_CB(skb)->sacked  = 0;
1588
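        /* Find the owning socket: established (and request) sockets are
         * checked first, then listeners.  refcounted tells us whether the
         * lookup took a reference that must be dropped before returning.
         */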
1589 lookup:
1590         sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
1591                                th->dest, &refcounted);
1592         if (!sk)
1593                 goto no_tcp_socket;
1594
1595 process:
1596         if (sk->sk_state == TCP_TIME_WAIT)
1597                 goto do_time_wait;
1598
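        /* A TCP_NEW_SYN_RECV "socket" is really a request_sock minisocket.
         * Process the segment on behalf of the parent listener: revalidate
         * MD5 and listener state, then let tcp_check_req() decide whether
         * it completes the handshake (yielding a full child socket),
         * belongs to the listener itself, or must be dropped.
         */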
1599         if (sk->sk_state == TCP_NEW_SYN_RECV) {
1600                 struct request_sock *req = inet_reqsk(sk);
1601                 struct sock *nsk;
1602
1603                 sk = req->rsk_listener;
1604                 if (unlikely(tcp_v4_inbound_md5_hash(sk, skb))) {
1605                         reqsk_put(req);
1606                         goto discard_it;
1607                 }
1608                 if (unlikely(sk->sk_state != TCP_LISTEN)) {
1609                         inet_csk_reqsk_queue_drop_and_put(sk, req);
1610                         goto lookup;
1611                 }
1612                 /* We own a reference on the listener, increase it again
1613                  * as we might lose it too soon.
1614                  */
1615                 sock_hold(sk);
1616                 refcounted = true;
1617                 nsk = tcp_check_req(sk, skb, req, false);
1618                 if (!nsk) {
1619                         reqsk_put(req);
1620                         goto discard_and_relse;
1621                 }
1622                 if (nsk == sk) {
1623                         reqsk_put(req);
1624                 } else if (tcp_child_process(sk, nsk, skb)) {
1625                         tcp_v4_send_reset(nsk, skb);
1626                         goto discard_and_relse;
1627                 } else {
1628                         sock_put(sk);
1629                         return 0;
1630                 }
1631         }
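        /* Enforce the IP_MINTTL socket option (RFC 5082 GTSM-style
         * protection): drop segments arriving with a TTL below the
         * configured minimum.
         */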
1632         if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1633                 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
1634                 goto discard_and_relse;
1635         }
1636
1637         if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1638                 goto discard_and_relse;
1639
1640         if (tcp_v4_inbound_md5_hash(sk, skb))
1641                 goto discard_and_relse;
1642
1643         nf_reset(skb);
1644
1645         if (sk_filter(sk, skb))
1646                 goto discard_and_relse;
1647
1648         skb->dev = NULL;
1649
1650         if (sk->sk_state == TCP_LISTEN) {
1651                 ret = tcp_v4_do_rcv(sk, skb);
1652                 goto put_and_return;
1653         }
1654
1655         sk_incoming_cpu_update(sk);
1656
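        /* Delivery now depends on who owns the socket: if no process holds
         * the lock, try the prequeue and fall back to direct processing in
         * tcp_v4_do_rcv(); if a process owns it, queue the segment on the
         * socket backlog to be replayed when the lock is released.
         */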
1657         bh_lock_sock_nested(sk);
1658         tcp_segs_in(tcp_sk(sk), skb);
1659         ret = 0;
1660         if (!sock_owned_by_user(sk)) {
1661                 if (!tcp_prequeue(sk, skb))
1662                         ret = tcp_v4_do_rcv(sk, skb);
1663         } else if (unlikely(sk_add_backlog(sk, skb,
1664                                            sk->sk_rcvbuf + sk->sk_sndbuf))) {
1665                 bh_unlock_sock(sk);
1666                 __NET_INC_STATS(net, LINUX_MIB_TCPBACKLOGDROP);
1667                 goto discard_and_relse;
1668         }
1669         bh_unlock_sock(sk);
1670
1671 put_and_return:
1672         if (refcounted)
1673                 sock_put(sk);
1674
1675         return ret;
1676
1677 no_tcp_socket:
1678         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1679                 goto discard_it;
1680
1681         if (tcp_checksum_complete(skb)) {
1682 csum_error:
1683                 __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
1684 bad_packet:
1685                 __TCP_INC_STATS(net, TCP_MIB_INERRS);
1686         } else {
1687                 tcp_v4_send_reset(NULL, skb);
1688         }
1689
1690 discard_it:
1691         /* Discard frame. */
1692         kfree_skb(skb);
1693         return 0;
1694
1695 discard_and_relse:
1696         sk_drops_add(sk, skb);
1697         if (refcounted)
1698                 sock_put(sk);
1699         goto discard_it;
1700
1701 do_time_wait:
1702         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1703                 inet_twsk_put(inet_twsk(sk));
1704                 goto discard_it;
1705         }
1706
1707         if (tcp_checksum_complete(skb)) {
1708                 inet_twsk_put(inet_twsk(sk));
1709                 goto csum_error;
1710         }
1711         switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1712         case TCP_TW_SYN: {
1713                 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
1714                                                         &tcp_hashinfo, skb,
1715                                                         __tcp_hdrlen(th),
1716                                                         iph->saddr, th->source,
1717                                                         iph->daddr, th->dest,
1718                                                         inet_iif(skb));
1719                 if (sk2) {
1720                         inet_twsk_deschedule_put(inet_twsk(sk));
1721                         sk = sk2;
1722                         refcounted = false;
1723                         goto process;
1724                 }
1725                 /* Fall through to ACK */
1726         }
1727         case TCP_TW_ACK:
1728                 tcp_v4_timewait_ack(sk, skb);
1729                 break;
1730         case TCP_TW_RST:
1731                 tcp_v4_send_reset(sk, skb);
1732                 inet_twsk_deschedule_put(inet_twsk(sk));
1733                 goto discard_it;
1734         case TCP_TW_SUCCESS:;
1735         }
1736         goto discard_it;
1737 }
1738
1739 static struct timewait_sock_ops tcp_timewait_sock_ops = {
1740         .twsk_obj_size  = sizeof(struct tcp_timewait_sock),
1741         .twsk_unique    = tcp_twsk_unique,
1742         .twsk_destructor = tcp_twsk_destructor,
1743 };
1744
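/* Cache the input route on the socket, so later packets of the same flow
 * can be validated cheaply in tcp_v4_do_rcv() and tcp_v4_early_demux()
 * instead of repeating a full route lookup.
 */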
1745 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
1746 {
1747         struct dst_entry *dst = skb_dst(skb);
1748
1749         if (dst && dst_hold_safe(dst)) {
1750                 sk->sk_rx_dst = dst;
1751                 inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
1752         }
1753 }
1754 EXPORT_SYMBOL(inet_sk_rx_dst_set);
1755
1756 const struct inet_connection_sock_af_ops ipv4_specific = {
1757         .queue_xmit        = ip_queue_xmit,
1758         .send_check        = tcp_v4_send_check,
1759         .rebuild_header    = inet_sk_rebuild_header,
1760         .sk_rx_dst_set     = inet_sk_rx_dst_set,
1761         .conn_request      = tcp_v4_conn_request,
1762         .syn_recv_sock     = tcp_v4_syn_recv_sock,
1763         .net_header_len    = sizeof(struct iphdr),
1764         .setsockopt        = ip_setsockopt,
1765         .getsockopt        = ip_getsockopt,
1766         .addr2sockaddr     = inet_csk_addr2sockaddr,
1767         .sockaddr_len      = sizeof(struct sockaddr_in),
1768         .bind_conflict     = inet_csk_bind_conflict,
1769 #ifdef CONFIG_COMPAT
1770         .compat_setsockopt = compat_ip_setsockopt,
1771         .compat_getsockopt = compat_ip_getsockopt,
1772 #endif
1773         .mtu_reduced       = tcp_v4_mtu_reduced,
1774 };
1775 EXPORT_SYMBOL(ipv4_specific);
1776
1777 #ifdef CONFIG_TCP_MD5SIG
1778 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
1779         .md5_lookup             = tcp_v4_md5_lookup,
1780         .calc_md5_hash          = tcp_v4_md5_hash_skb,
1781         .md5_parse              = tcp_v4_parse_md5_keys,
1782 };
1783 #endif
1784
1785 /* NOTE: A lot of things are set to zero explicitly by the call to
1786  *       sk_alloc(), so they need not be done here.
1787  */
1788 static int tcp_v4_init_sock(struct sock *sk)
1789 {
1790         struct inet_connection_sock *icsk = inet_csk(sk);
1791
1792         tcp_init_sock(sk);
1793
1794         icsk->icsk_af_ops = &ipv4_specific;
1795
1796 #ifdef CONFIG_TCP_MD5SIG
1797         tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
1798 #endif
1799
1800         return 0;
1801 }
1802
1803 void tcp_v4_destroy_sock(struct sock *sk)
1804 {
1805         struct tcp_sock *tp = tcp_sk(sk);
1806
1807         tcp_clear_xmit_timers(sk);
1808
1809         tcp_cleanup_congestion_control(sk);
1810
1811         /* Clean up the write buffer. */
1812         tcp_write_queue_purge(sk);
1813
1814         /* Cleans up our, hopefully empty, out_of_order_queue. */
1815         __skb_queue_purge(&tp->out_of_order_queue);
1816
1817 #ifdef CONFIG_TCP_MD5SIG
1818         /* Clean up the MD5 key list, if any */
1819         if (tp->md5sig_info) {
1820                 tcp_clear_md5_list(sk);
1821                 kfree_rcu(tp->md5sig_info, rcu);
1822                 tp->md5sig_info = NULL;
1823         }
1824 #endif
1825
1826         /* Clean up the prequeue; it should really be empty by now. */
1827         __skb_queue_purge(&tp->ucopy.prequeue);
1828
1829         /* Clean up a referenced TCP bind bucket. */
1830         if (inet_csk(sk)->icsk_bind_hash)
1831                 inet_put_port(sk);
1832
1833         BUG_ON(tp->fastopen_rsk);
1834
1835         /* If the socket was aborted during a connect operation */
1836         tcp_free_fastopen_req(tp);
1837         tcp_saved_syn_free(tp);
1838
1839         local_bh_disable();
1840         sk_sockets_allocated_dec(sk);
1841         local_bh_enable();
1842
1843         if (mem_cgroup_sockets_enabled && sk->sk_memcg)
1844                 sock_release_memcg(sk);
1845 }
1846 EXPORT_SYMBOL(tcp_v4_destroy_sock);
1847
1848 #ifdef CONFIG_PROC_FS
1849 /* Proc filesystem TCP sock list dumping. */
1850
1851 /*
1852  * Get the next listener socket following cur.  If cur is NULL, get the first socket
1853  * starting from bucket given in st->bucket; when st->bucket is zero the
1854  * very first socket in the hash table is returned.
1855  */
1856 static void *listening_get_next(struct seq_file *seq, void *cur)
1857 {
1858         struct tcp_iter_state *st = seq->private;
1859         struct net *net = seq_file_net(seq);
1860         struct inet_listen_hashbucket *ilb;
1862         struct sock *sk = cur;
1863
1864         if (!sk) {
1865 get_head:
1866                 ilb = &tcp_hashinfo.listening_hash[st->bucket];
1867                 spin_lock_bh(&ilb->lock);
1868                 sk = sk_head(&ilb->head);
1869                 st->offset = 0;
1870                 goto get_sk;
1871         }
1872         ilb = &tcp_hashinfo.listening_hash[st->bucket];
1873         ++st->num;
1874         ++st->offset;
1875
1876         sk = sk_next(sk);
1877 get_sk:
1878         sk_for_each_from(sk) {
1879                 if (!net_eq(sock_net(sk), net))
1880                         continue;
1881                 if (sk->sk_family == st->family)
1882                         return sk;
1884         }
1885         spin_unlock_bh(&ilb->lock);
1886         st->offset = 0;
1887         if (++st->bucket < INET_LHTABLE_SIZE)
1888                 goto get_head;
1889         return NULL;
1890 }
1891
1892 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
1893 {
1894         struct tcp_iter_state *st = seq->private;
1895         void *rc;
1896
1897         st->bucket = 0;
1898         st->offset = 0;
1899         rc = listening_get_next(seq, NULL);
1900
1901         while (rc && *pos) {
1902                 rc = listening_get_next(seq, rc);
1903                 --*pos;
1904         }
1905         return rc;
1906 }
1907
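/* The established hash chains are nulls-terminated, so emptiness can be
 * tested here without taking the bucket lock.
 */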
1908 static inline bool empty_bucket(const struct tcp_iter_state *st)
1909 {
1910         return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
1911 }
1912
1913 /*
1914  * Get first established socket starting from bucket given in st->bucket.
1915  * If st->bucket is zero, the very first socket in the hash is returned.
1916  */
1917 static void *established_get_first(struct seq_file *seq)
1918 {
1919         struct tcp_iter_state *st = seq->private;
1920         struct net *net = seq_file_net(seq);
1921         void *rc = NULL;
1922
1923         st->offset = 0;
1924         for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
1925                 struct sock *sk;
1926                 struct hlist_nulls_node *node;
1927                 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
1928
1929                 /* Lockless fast path for the common case of empty buckets */
1930                 if (empty_bucket(st))
1931                         continue;
1932
1933                 spin_lock_bh(lock);
1934                 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
1935                         if (sk->sk_family != st->family ||
1936                             !net_eq(sock_net(sk), net)) {
1937                                 continue;
1938                         }
1939                         rc = sk;
1940                         goto out;
1941                 }
1942                 spin_unlock_bh(lock);
1943         }
1944 out:
1945         return rc;
1946 }
1947
1948 static void *established_get_next(struct seq_file *seq, void *cur)
1949 {
1950         struct sock *sk = cur;
1951         struct hlist_nulls_node *node;
1952         struct tcp_iter_state *st = seq->private;
1953         struct net *net = seq_file_net(seq);
1954
1955         ++st->num;
1956         ++st->offset;
1957
1958         sk = sk_nulls_next(sk);
1959
1960         sk_nulls_for_each_from(sk, node) {
1961                 if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
1962                         return sk;
1963         }
1964
1965         spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
1966         ++st->bucket;
1967         return established_get_first(seq);
1968 }
1969
1970 static void *established_get_idx(struct seq_file *seq, loff_t pos)
1971 {
1972         struct tcp_iter_state *st = seq->private;
1973         void *rc;
1974
1975         st->bucket = 0;
1976         rc = established_get_first(seq);
1977
1978         while (rc && pos) {
1979                 rc = established_get_next(seq, rc);
1980                 --pos;
1981         }
1982         return rc;
1983 }
1984
1985 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
1986 {
1987         void *rc;
1988         struct tcp_iter_state *st = seq->private;
1989
1990         st->state = TCP_SEQ_STATE_LISTENING;
1991         rc        = listening_get_idx(seq, &pos);
1992
1993         if (!rc) {
1994                 st->state = TCP_SEQ_STATE_ESTABLISHED;
1995                 rc        = established_get_idx(seq, pos);
1996         }
1997
1998         return rc;
1999 }
2000
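/* Resume iteration from the bucket/offset cached in the iterator state
 * rather than rescanning from the first bucket, so sequential reads of a
 * large /proc/net/tcp stay roughly linear instead of quadratic.
 */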
2001 static void *tcp_seek_last_pos(struct seq_file *seq)
2002 {
2003         struct tcp_iter_state *st = seq->private;
2004         int offset = st->offset;
2005         int orig_num = st->num;
2006         void *rc = NULL;
2007
2008         switch (st->state) {
2009         case TCP_SEQ_STATE_LISTENING:
2010                 if (st->bucket >= INET_LHTABLE_SIZE)
2011                         break;
2012                 st->state = TCP_SEQ_STATE_LISTENING;
2013                 rc = listening_get_next(seq, NULL);
2014                 while (offset-- && rc)
2015                         rc = listening_get_next(seq, rc);
2016                 if (rc)
2017                         break;
2018                 st->bucket = 0;
2019                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2020                 /* Fallthrough */
2021         case TCP_SEQ_STATE_ESTABLISHED:
2022                 if (st->bucket > tcp_hashinfo.ehash_mask)
2023                         break;
2024                 rc = established_get_first(seq);
2025                 while (offset-- && rc)
2026                         rc = established_get_next(seq, rc);
2027         }
2028
2029         st->num = orig_num;
2030
2031         return rc;
2032 }
2033
2034 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2035 {
2036         struct tcp_iter_state *st = seq->private;
2037         void *rc;
2038
2039         if (*pos && *pos == st->last_pos) {
2040                 rc = tcp_seek_last_pos(seq);
2041                 if (rc)
2042                         goto out;
2043         }
2044
2045         st->state = TCP_SEQ_STATE_LISTENING;
2046         st->num = 0;
2047         st->bucket = 0;
2048         st->offset = 0;
2049         rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2050
2051 out:
2052         st->last_pos = *pos;
2053         return rc;
2054 }
2055
2056 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2057 {
2058         struct tcp_iter_state *st = seq->private;
2059         void *rc = NULL;
2060
2061         if (v == SEQ_START_TOKEN) {
2062                 rc = tcp_get_idx(seq, 0);
2063                 goto out;
2064         }
2065
2066         switch (st->state) {
2067         case TCP_SEQ_STATE_LISTENING:
2068                 rc = listening_get_next(seq, v);
2069                 if (!rc) {
2070                         st->state = TCP_SEQ_STATE_ESTABLISHED;
2071                         st->bucket = 0;
2072                         st->offset = 0;
2073                         rc        = established_get_first(seq);
2074                 }
2075                 break;
2076         case TCP_SEQ_STATE_ESTABLISHED:
2077                 rc = established_get_next(seq, v);
2078                 break;
2079         }
2080 out:
2081         ++*pos;
2082         st->last_pos = *pos;
2083         return rc;
2084 }
2085
2086 static void tcp_seq_stop(struct seq_file *seq, void *v)
2087 {
2088         struct tcp_iter_state *st = seq->private;
2089
2090         switch (st->state) {
2091         case TCP_SEQ_STATE_LISTENING:
2092                 if (v != SEQ_START_TOKEN)
2093                         spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
2094                 break;
2095         case TCP_SEQ_STATE_ESTABLISHED:
2096                 if (v)
2097                         spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2098                 break;
2099         }
2100 }
2101
2102 int tcp_seq_open(struct inode *inode, struct file *file)
2103 {
2104         struct tcp_seq_afinfo *afinfo = PDE_DATA(inode);
2105         struct tcp_iter_state *s;
2106         int err;
2107
2108         err = seq_open_net(inode, file, &afinfo->seq_ops,
2109                           sizeof(struct tcp_iter_state));
2110         if (err < 0)
2111                 return err;
2112
2113         s = ((struct seq_file *)file->private_data)->private;
2114         s->family               = afinfo->family;
2115         s->last_pos             = 0;
2116         return 0;
2117 }
2118 EXPORT_SYMBOL(tcp_seq_open);
2119
2120 int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
2121 {
2122         int rc = 0;
2123         struct proc_dir_entry *p;
2124
2125         afinfo->seq_ops.start           = tcp_seq_start;
2126         afinfo->seq_ops.next            = tcp_seq_next;
2127         afinfo->seq_ops.stop            = tcp_seq_stop;
2128
2129         p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
2130                              afinfo->seq_fops, afinfo);
2131         if (!p)
2132                 rc = -ENOMEM;
2133         return rc;
2134 }
2135 EXPORT_SYMBOL(tcp_proc_register);
2136
2137 void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
2138 {
2139         remove_proc_entry(afinfo->name, net->proc_net);
2140 }
2141 EXPORT_SYMBOL(tcp_proc_unregister);
2142
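/* Format one half-open (SYN_RECV) connection as a /proc/net/tcp row; the
 * column layout must line up with the header printed by tcp4_seq_show().
 */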
2143 static void get_openreq4(const struct request_sock *req,
2144                          struct seq_file *f, int i)
2145 {
2146         const struct inet_request_sock *ireq = inet_rsk(req);
2147         long delta = req->rsk_timer.expires - jiffies;
2148
2149         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2150                 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2151                 i,
2152                 ireq->ir_loc_addr,
2153                 ireq->ir_num,
2154                 ireq->ir_rmt_addr,
2155                 ntohs(ireq->ir_rmt_port),
2156                 TCP_SYN_RECV,
2157                 0, 0, /* could print option size, but that is af dependent. */
2158                 1,    /* timers active (only the expire timer) */
2159                 jiffies_delta_to_clock_t(delta),
2160                 req->num_timeout,
2161                 from_kuid_munged(seq_user_ns(f),
2162                                  sock_i_uid(req->rsk_listener)),
2163                 0,  /* non-standard timer */
2164                 0, /* open_requests have no inode */
2165                 0,
2166                 req);
2167 }
2168
2169 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2170 {
2171         int timer_active;
2172         unsigned long timer_expires;
2173         const struct tcp_sock *tp = tcp_sk(sk);
2174         const struct inet_connection_sock *icsk = inet_csk(sk);
2175         const struct inet_sock *inet = inet_sk(sk);
2176         const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2177         __be32 dest = inet->inet_daddr;
2178         __be32 src = inet->inet_rcv_saddr;
2179         __u16 destp = ntohs(inet->inet_dport);
2180         __u16 srcp = ntohs(inet->inet_sport);
2181         int rx_queue;
2182         int state;
2183
2184         if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2185             icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
2186             icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2187                 timer_active    = 1;
2188                 timer_expires   = icsk->icsk_timeout;
2189         } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2190                 timer_active    = 4;
2191                 timer_expires   = icsk->icsk_timeout;
2192         } else if (timer_pending(&sk->sk_timer)) {
2193                 timer_active    = 2;
2194                 timer_expires   = sk->sk_timer.expires;
2195         } else {
2196                 timer_active    = 0;
2197                 timer_expires = jiffies;
2198         }
2199
2200         state = sk_state_load(sk);
2201         if (state == TCP_LISTEN)
2202                 rx_queue = sk->sk_ack_backlog;
2203         else
2204                 /* Because we don't lock the socket,
2205                  * we might find a transient negative value.
2206                  */
2207                 rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2208
2209         seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2210                         "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2211                 i, src, srcp, dest, destp, state,
2212                 tp->write_seq - tp->snd_una,
2213                 rx_queue,
2214                 timer_active,
2215                 jiffies_delta_to_clock_t(timer_expires - jiffies),
2216                 icsk->icsk_retransmits,
2217                 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2218                 icsk->icsk_probes_out,
2219                 sock_i_ino(sk),
2220                 atomic_read(&sk->sk_refcnt), sk,
2221                 jiffies_to_clock_t(icsk->icsk_rto),
2222                 jiffies_to_clock_t(icsk->icsk_ack.ato),
2223                 (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2224                 tp->snd_cwnd,
2225                 state == TCP_LISTEN ?
2226                     fastopenq->max_qlen :
2227                     (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2228 }
2229
2230 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2231                                struct seq_file *f, int i)
2232 {
2233         long delta = tw->tw_timer.expires - jiffies;
2234         __be32 dest, src;
2235         __u16 destp, srcp;
2236
2237         dest  = tw->tw_daddr;
2238         src   = tw->tw_rcv_saddr;
2239         destp = ntohs(tw->tw_dport);
2240         srcp  = ntohs(tw->tw_sport);
2241
2242         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2243                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2244                 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2245                 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2246                 atomic_read(&tw->tw_refcnt), tw);
2247 }
2248
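/* Every /proc/net/tcp record has a fixed width: seq_setwidth() reserves
 * TMPSZ - 1 characters and seq_pad() space-fills the remainder before
 * appending the newline.
 */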
2249 #define TMPSZ 150
2250
2251 static int tcp4_seq_show(struct seq_file *seq, void *v)
2252 {
2253         struct tcp_iter_state *st;
2254         struct sock *sk = v;
2255
2256         seq_setwidth(seq, TMPSZ - 1);
2257         if (v == SEQ_START_TOKEN) {
2258                 seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2259                            "rx_queue tr tm->when retrnsmt   uid  timeout "
2260                            "inode");
2261                 goto out;
2262         }
2263         st = seq->private;
2264
2265         if (sk->sk_state == TCP_TIME_WAIT)
2266                 get_timewait4_sock(v, seq, st->num);
2267         else if (sk->sk_state == TCP_NEW_SYN_RECV)
2268                 get_openreq4(v, seq, st->num);
2269         else
2270                 get_tcp4_sock(v, seq, st->num);
2271 out:
2272         seq_pad(seq, '\n');
2273         return 0;
2274 }
2275
2276 static const struct file_operations tcp_afinfo_seq_fops = {
2277         .owner   = THIS_MODULE,
2278         .open    = tcp_seq_open,
2279         .read    = seq_read,
2280         .llseek  = seq_lseek,
2281         .release = seq_release_net
2282 };
2283
2284 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2285         .name           = "tcp",
2286         .family         = AF_INET,
2287         .seq_fops       = &tcp_afinfo_seq_fops,
2288         .seq_ops        = {
2289                 .show           = tcp4_seq_show,
2290         },
2291 };
2292
2293 static int __net_init tcp4_proc_init_net(struct net *net)
2294 {
2295         return tcp_proc_register(net, &tcp4_seq_afinfo);
2296 }
2297
2298 static void __net_exit tcp4_proc_exit_net(struct net *net)
2299 {
2300         tcp_proc_unregister(net, &tcp4_seq_afinfo);
2301 }
2302
2303 static struct pernet_operations tcp4_net_ops = {
2304         .init = tcp4_proc_init_net,
2305         .exit = tcp4_proc_exit_net,
2306 };
2307
2308 int __init tcp4_proc_init(void)
2309 {
2310         return register_pernet_subsys(&tcp4_net_ops);
2311 }
2312
2313 void tcp4_proc_exit(void)
2314 {
2315         unregister_pernet_subsys(&tcp4_net_ops);
2316 }
2317 #endif /* CONFIG_PROC_FS */
2318
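/* The AF_INET TCP protocol descriptor: this table glues the generic
 * socket layer to the IPv4 TCP functions above (connect, sendmsg,
 * backlog_rcv and friends all dispatch through it).
 */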
2319 struct proto tcp_prot = {
2320         .name                   = "TCP",
2321         .owner                  = THIS_MODULE,
2322         .close                  = tcp_close,
2323         .connect                = tcp_v4_connect,
2324         .disconnect             = tcp_disconnect,
2325         .accept                 = inet_csk_accept,
2326         .ioctl                  = tcp_ioctl,
2327         .init                   = tcp_v4_init_sock,
2328         .destroy                = tcp_v4_destroy_sock,
2329         .shutdown               = tcp_shutdown,
2330         .setsockopt             = tcp_setsockopt,
2331         .getsockopt             = tcp_getsockopt,
2332         .recvmsg                = tcp_recvmsg,
2333         .sendmsg                = tcp_sendmsg,
2334         .sendpage               = tcp_sendpage,
2335         .backlog_rcv            = tcp_v4_do_rcv,
2336         .release_cb             = tcp_release_cb,
2337         .hash                   = inet_hash,
2338         .unhash                 = inet_unhash,
2339         .get_port               = inet_csk_get_port,
2340         .enter_memory_pressure  = tcp_enter_memory_pressure,
2341         .stream_memory_free     = tcp_stream_memory_free,
2342         .sockets_allocated      = &tcp_sockets_allocated,
2343         .orphan_count           = &tcp_orphan_count,
2344         .memory_allocated       = &tcp_memory_allocated,
2345         .memory_pressure        = &tcp_memory_pressure,
2346         .sysctl_mem             = sysctl_tcp_mem,
2347         .sysctl_wmem            = sysctl_tcp_wmem,
2348         .sysctl_rmem            = sysctl_tcp_rmem,
2349         .max_header             = MAX_TCP_HEADER,
2350         .obj_size               = sizeof(struct tcp_sock),
2351         .slab_flags             = SLAB_DESTROY_BY_RCU,
2352         .twsk_prot              = &tcp_timewait_sock_ops,
2353         .rsk_prot               = &tcp_request_sock_ops,
2354         .h.hashinfo             = &tcp_hashinfo,
2355         .no_autobind            = true,
2356 #ifdef CONFIG_COMPAT
2357         .compat_setsockopt      = compat_tcp_setsockopt,
2358         .compat_getsockopt      = compat_tcp_getsockopt,
2359 #endif
2360         .diag_destroy           = tcp_abort,
2361 };
2362 EXPORT_SYMBOL(tcp_prot);
2363
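/* Each namespace keeps one TCP control socket per possible CPU; replies
 * that have no full socket of their own (tcp_v4_send_reset() and
 * tcp_v4_send_ack()) are transmitted through these.
 */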
2364 static void __net_exit tcp_sk_exit(struct net *net)
2365 {
2366         int cpu;
2367
2368         for_each_possible_cpu(cpu)
2369                 inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
2370         free_percpu(net->ipv4.tcp_sk);
2371 }
2372
2373 static int __net_init tcp_sk_init(struct net *net)
2374 {
2375         int res, cpu;
2376
2377         net->ipv4.tcp_sk = alloc_percpu(struct sock *);
2378         if (!net->ipv4.tcp_sk)
2379                 return -ENOMEM;
2380
2381         for_each_possible_cpu(cpu) {
2382                 struct sock *sk;
2383
2384                 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
2385                                            IPPROTO_TCP, net);
2386                 if (res)
2387                         goto fail;
2388                 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
2389                 *per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
2390         }
2391
2392         net->ipv4.sysctl_tcp_ecn = 2;
2393         net->ipv4.sysctl_tcp_ecn_fallback = 1;
2394
2395         net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
2396         net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
2397         net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
2398
2399         net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
2400         net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
2401         net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
2402
2403         net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
2404         net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
2405         net->ipv4.sysctl_tcp_syncookies = 1;
2406         net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
2407         net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
2408         net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
2409         net->ipv4.sysctl_tcp_orphan_retries = 0;
2410         net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
2411         net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
2412
2413         return 0;
2414 fail:
2415         tcp_sk_exit(net);
2416
2417         return res;
2418 }
2419
2420 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2421 {
2422         inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET);
2423 }
2424
2425 static struct pernet_operations __net_initdata tcp_sk_ops = {
2426         .init       = tcp_sk_init,
2427         .exit       = tcp_sk_exit,
2428         .exit_batch = tcp_sk_exit_batch,
2429 };
2430
2431 void __init tcp_v4_init(void)
2432 {
2433         inet_hashinfo_init(&tcp_hashinfo);
2434         if (register_pernet_subsys(&tcp_sk_ops))
2435                 panic("Failed to create the TCP control socket.\n");
2436 }