/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 *
 *		IPv4 specific functions
 *
 *		code split from:
 *		linux/ipv4/tcp.c
 *		linux/ipv4/tcp_input.c
 *		linux/ipv4/tcp_output.c
 *
 *		See tcp.c for author information
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 */

/*
 * Changes:
 *		David S. Miller	:	New socket lookup architecture.
 *					This code is dedicated to John Dyson.
 *		David S. Miller :	Change semantics of established hash,
 *					half is devoted to TIME_WAIT sockets
 *					and the rest go in the other half.
 *		Andi Kleen :		Add support for syncookies and fixed
 *					some bugs: ip options weren't passed to
 *					the TCP layer, missed a check for an
 *					ACK bit.
 *		Andi Kleen :		Implemented fast path mtu discovery.
 *					Fixed many serious bugs in the
 *					request_sock handling and moved
 *					most of it into the af independent code.
 *					Added tail drop and some other bugfixes.
 *					Added new listen semantics.
 *		Mike McLagan	:	Routing by source
 *	Juan Jose Ciarlante:		ip_dynaddr bits
 *		Andi Kleen:		various fixes.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year
 *					coma.
 *	Andi Kleen		:	Fix new listen.
 *	Andi Kleen		:	Fix accept error reporting.
 *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
 *	Alexey Kuznetsov		allow both IPv4 and IPv6 sockets to bind
 *					a single port at the same time.
 */

#define pr_fmt(fmt) "TCP: " fmt

#include <linux/bottom_half.h>
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/cache.h>
#include <linux/jhash.h>
#include <linux/init.h>
#include <linux/times.h>
#include <linux/slab.h>

#include <net/net_namespace.h>
#include <net/icmp.h>
#include <net/inet_hashtables.h>
#include <net/tcp.h>
#include <net/transp_v6.h>
#include <net/ipv6.h>
#include <net/inet_common.h>
#include <net/timewait_sock.h>
#include <net/xfrm.h>
#include <net/netdma.h>
#include <net/secure_seq.h>
#include <net/tcp_memcontrol.h>
#include <net/busy_poll.h>

#include <linux/inet.h>
#include <linux/ipv6.h>
#include <linux/stddef.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>

#include <linux/crypto.h>
#include <linux/scatterlist.h>

int sysctl_tcp_tw_reuse __read_mostly;
int sysctl_tcp_low_latency __read_mostly;
EXPORT_SYMBOL(sysctl_tcp_low_latency);

#ifdef CONFIG_TCP_MD5SIG
static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
#endif

struct inet_hashinfo tcp_hashinfo;
EXPORT_SYMBOL(tcp_hashinfo);

static __u32 tcp_v4_init_sequence(const struct sk_buff *skb)
{
	return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
					  ip_hdr(skb)->saddr,
					  tcp_hdr(skb)->dest,
					  tcp_hdr(skb)->source);
}

int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
{
	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
	struct tcp_sock *tp = tcp_sk(sk);

	/* With PAWS, it is safe from the viewpoint
	   of data integrity. Even without PAWS it is safe provided sequence
	   spaces do not overlap i.e. at data rates <= 80Mbit/sec.

	   Actually, the idea is close to VJ's one, only timestamp cache is
	   held not per host, but per port pair and TW bucket is used as state
	   holder.

	   If TW bucket has been already destroyed we fall back to VJ's scheme
	   and use initial timestamp retrieved from peer table.
	 */
	if (tcptw->tw_ts_recent_stamp &&
	    (twp == NULL || (sysctl_tcp_tw_reuse &&
			     get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
		tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
		if (tp->write_seq == 0)
			tp->write_seq = 1;
		tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
		tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
		sock_hold(sktw);
		return 1;
	}

	return 0;
}
EXPORT_SYMBOL_GPL(tcp_twsk_unique);

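/*
 * A note on the write_seq bump above (explanatory sketch, not normative):
 * the new connection's ISN is placed at tw_snd_nxt + 65535 + 2, i.e. just
 * past the largest window (64KB without window scaling) the previous
 * incarnation could still have had in flight, so stray old segments
 * cannot land inside the new sequence space.  E.g. an old tw_snd_nxt of
 * 1000 yields a new write_seq of 66537.
 */
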
/* This will initiate an outgoing connection. */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
	struct inet_sock *inet = inet_sk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	__be16 orig_sport, orig_dport;
	__be32 daddr, nexthop;
	struct flowi4 *fl4;
	struct rtable *rt;
	int err;
	struct ip_options_rcu *inet_opt;

	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	if (usin->sin_family != AF_INET)
		return -EAFNOSUPPORT;

	nexthop = daddr = usin->sin_addr.s_addr;
	inet_opt = rcu_dereference_protected(inet->inet_opt,
					     sock_owned_by_user(sk));
	if (inet_opt && inet_opt->opt.srr) {
		if (!daddr)
			return -EINVAL;
		nexthop = inet_opt->opt.faddr;
	}

	orig_sport = inet->inet_sport;
	orig_dport = usin->sin_port;
	fl4 = &inet->cork.fl.u.ip4;
	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
			      RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
			      IPPROTO_TCP,
			      orig_sport, orig_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		if (err == -ENETUNREACH)
			IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
		return err;
	}

	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
		ip_rt_put(rt);
		return -ENETUNREACH;
	}

	if (!inet_opt || !inet_opt->opt.srr)
		daddr = fl4->daddr;

	if (!inet->inet_saddr)
		inet->inet_saddr = fl4->saddr;
	inet->inet_rcv_saddr = inet->inet_saddr;

	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
		/* Reset inherited state */
		tp->rx_opt.ts_recent	   = 0;
		tp->rx_opt.ts_recent_stamp = 0;
		if (likely(!tp->repair))
			tp->write_seq	   = 0;
	}

	if (tcp_death_row.sysctl_tw_recycle &&
	    !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr)
		tcp_fetch_timewait_stamp(sk, &rt->dst);

	inet->inet_dport = usin->sin_port;
	inet->inet_daddr = daddr;

	inet_set_txhash(sk);

	inet_csk(sk)->icsk_ext_hdr_len = 0;
	if (inet_opt)
		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;

	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;

	/* Socket identity is still unknown (sport may be zero).
	 * However we set state to SYN-SENT and, without releasing the socket
	 * lock, select a source port, enter ourselves into the hash tables
	 * and complete initialization after this.
	 */
	tcp_set_state(sk, TCP_SYN_SENT);
	err = inet_hash_connect(&tcp_death_row, sk);
	if (err)
		goto failure;

	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
			       inet->inet_sport, inet->inet_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		rt = NULL;
		goto failure;
	}
	/* OK, now commit destination to socket.  */
	sk->sk_gso_type = SKB_GSO_TCPV4;
	sk_setup_caps(sk, &rt->dst);

	if (!tp->write_seq && likely(!tp->repair))
		tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
							   inet->inet_daddr,
							   inet->inet_sport,
							   usin->sin_port);

	inet->inet_id = tp->write_seq ^ jiffies;

	err = tcp_connect(sk);
	rt = NULL;
	if (err)
		goto failure;

	return 0;

failure:
	/*
	 * This unhashes the socket and releases the local port,
	 * if necessary.
	 */
	tcp_set_state(sk, TCP_CLOSE);
	ip_rt_put(rt);
	sk->sk_route_caps = 0;
	inet->inet_dport = 0;
	return err;
}
EXPORT_SYMBOL(tcp_v4_connect);

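/*
 * For orientation: a userspace connect(2) on an AF_INET stream socket
 * reaches tcp_v4_connect() through inet_stream_connect() and the
 * sk->sk_prot->connect hook, with the socket lock held; source port
 * selection may still happen inside inet_hash_connect() above.
 */
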
/*
 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
 * It can be called through tcp_release_cb() if socket was owned by user
 * at the time tcp_v4_err() was called to handle ICMP message.
 */
static void tcp_v4_mtu_reduced(struct sock *sk)
{
	struct dst_entry *dst;
	struct inet_sock *inet = inet_sk(sk);
	u32 mtu = tcp_sk(sk)->mtu_info;

	dst = inet_csk_update_pmtu(sk, mtu);
	if (!dst)
		return;

	/* Something is about to be wrong... Remember soft error
	 * for the case, if this connection will not be able to recover.
	 */
	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
		sk->sk_err_soft = EMSGSIZE;

	mtu = dst_mtu(dst);

	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
	    ip_sk_accept_pmtu(sk) &&
	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
		tcp_sync_mss(sk, mtu);

		/* Resend the TCP packet because it's
		 * clear that the old packet has been
		 * dropped. This is the new "fast" path mtu
		 * discovery.
		 */
		tcp_simple_retransmit(sk);
	} /* else let the usual retransmit timer handle it */
}

static void do_redirect(struct sk_buff *skb, struct sock *sk)
{
	struct dst_entry *dst = __sk_dst_check(sk, 0);

	if (dst)
		dst->ops->redirect(dst, sk, skb);
}

/*
 * This routine is called by the ICMP module when it gets some
 * sort of error condition.  If err < 0 then the socket should
 * be closed and the error returned to the user.  If err > 0
 * it's just the icmp type << 8 | icmp code.  After adjustment
 * header points to the first 8 bytes of the tcp header.  We need
 * to find the appropriate port.
 *
 * The locking strategy used here is very "optimistic". When
 * someone else accesses the socket the ICMP is just dropped
 * and for some paths there is no check at all.
 * A more general error queue to queue errors for later handling
 * is probably better.
 */
void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
{
	const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
	struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
	struct inet_connection_sock *icsk;
	struct tcp_sock *tp;
	struct inet_sock *inet;
	const int type = icmp_hdr(icmp_skb)->type;
	const int code = icmp_hdr(icmp_skb)->code;
	struct sock *sk;
	struct sk_buff *skb;
	struct request_sock *fastopen;
	__u32 seq, snd_una;
	__u32 remaining;
	int err;
	struct net *net = dev_net(icmp_skb->dev);

	if (icmp_skb->len < (iph->ihl << 2) + 8) {
		ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
		return;
	}

	sk = inet_lookup(net, &tcp_hashinfo, iph->daddr, th->dest,
			 iph->saddr, th->source, inet_iif(icmp_skb));
	if (!sk) {
		ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
		return;
	}
	if (sk->sk_state == TCP_TIME_WAIT) {
		inet_twsk_put(inet_twsk(sk));
		return;
	}

	bh_lock_sock(sk);
	/* If too many ICMPs get dropped on busy
	 * servers this needs to be solved differently.
	 * We do take care of PMTU discovery (RFC1191) special case :
	 * we can receive locally generated ICMP messages while socket is held.
	 */
	if (sock_owned_by_user(sk)) {
		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
			NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
	}
	if (sk->sk_state == TCP_CLOSE)
		goto out;

	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
		NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
		goto out;
	}

	icsk = inet_csk(sk);
	tp = tcp_sk(sk);
	seq = ntohl(th->seq);
	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
	fastopen = tp->fastopen_rsk;
	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
	if (sk->sk_state != TCP_LISTEN &&
	    !between(seq, snd_una, tp->snd_nxt)) {
		NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
		goto out;
	}

	switch (type) {
	case ICMP_REDIRECT:
		do_redirect(icmp_skb, sk);
		goto out;
	case ICMP_SOURCE_QUENCH:
		/* Just silently ignore these. */
		goto out;
	case ICMP_PARAMETERPROB:
		err = EPROTO;
		break;
	case ICMP_DEST_UNREACH:
		if (code > NR_ICMP_UNREACH)
			goto out;

		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
			/* We are not interested in TCP_LISTEN and open_requests
			 * (SYN-ACKs sent out by Linux are always <576bytes so
			 * they should go through unfragmented).
			 */
			if (sk->sk_state == TCP_LISTEN)
				goto out;

			tp->mtu_info = info;
			if (!sock_owned_by_user(sk)) {
				tcp_v4_mtu_reduced(sk);
			} else {
				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &tp->tsq_flags))
					sock_hold(sk);
			}
			goto out;
		}

		err = icmp_err_convert[code].errno;
		/* check if icmp_skb allows revert of backoff
		 * (see draft-zimmermann-tcp-lcd) */
		if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
			break;
		if (seq != tp->snd_una || !icsk->icsk_retransmits ||
		    !icsk->icsk_backoff || fastopen)
			break;

		if (sock_owned_by_user(sk))
			break;

		icsk->icsk_backoff--;
		inet_csk(sk)->icsk_rto = (tp->srtt_us ? __tcp_set_rto(tp) :
			TCP_TIMEOUT_INIT) << icsk->icsk_backoff;
		tcp_bound_rto(sk);

		skb = tcp_write_queue_head(sk);
		BUG_ON(!skb);

		remaining = icsk->icsk_rto - min(icsk->icsk_rto,
				tcp_time_stamp - TCP_SKB_CB(skb)->when);

		if (remaining) {
			inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
						  remaining, TCP_RTO_MAX);
		} else {
			/* RTO revert clocked out retransmission.
			 * Will retransmit now */
			tcp_retransmit_timer(sk);
		}

		break;
	case ICMP_TIME_EXCEEDED:
		err = EHOSTUNREACH;
		break;
	default:
		goto out;
	}

	switch (sk->sk_state) {
		struct request_sock *req, **prev;
	case TCP_LISTEN:
		if (sock_owned_by_user(sk))
			goto out;

		req = inet_csk_search_req(sk, &prev, th->dest,
					  iph->daddr, iph->saddr);
		if (!req)
			goto out;

		/* ICMPs are not backlogged, hence we cannot get
		   an established socket here.
		 */
		WARN_ON(req->sk);

		if (seq != tcp_rsk(req)->snt_isn) {
			NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
			goto out;
		}

		/*
		 * Still in SYN_RECV, just remove it silently.
		 * There is no good way to pass the error to the newly
		 * created socket, and POSIX does not want network
		 * errors returned from accept().
		 */
		inet_csk_reqsk_queue_drop(sk, req, prev);
		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
		goto out;

	case TCP_SYN_SENT:
	case TCP_SYN_RECV:
		/* Only in fast or simultaneous open. If a fast open socket
		 * is already accepted it is treated as a connected one below.
		 */
		if (fastopen && fastopen->sk == NULL)
			break;

		if (!sock_owned_by_user(sk)) {
			sk->sk_err = err;
			sk->sk_error_report(sk);
			tcp_done(sk);
		} else {
			sk->sk_err_soft = err;
		}
		goto out;
	}

	/* If we've already connected we will keep trying
	 * until we time out, or the user gives up.
	 *
	 * rfc1122 4.2.3.9 allows to consider as hard errors
	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
	 * but it is obsoleted by pmtu discovery).
	 *
	 * Note, that in modern internet, where routing is unreliable
	 * and in each dark corner broken firewalls sit, sending random
	 * errors ordered by their masters even these two messages finally lose
	 * their original sense (even Linux sends invalid PORT_UNREACHs)
	 *
	 * Now we are in compliance with RFCs.
	 *							--ANK (980905)
	 */
	inet = inet_sk(sk);
	if (!sock_owned_by_user(sk) && inet->recverr) {
		sk->sk_err = err;
		sk->sk_error_report(sk);
	} else	{ /* Only an error on timeout */
		sk->sk_err_soft = err;
	}

out:
	bh_unlock_sock(sk);
	sock_put(sk);
}

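/*
 * A worked example for the backoff revert in tcp_v4_err() (illustrative):
 * since icsk_rto = base_rto << icsk_backoff, decrementing icsk_backoff
 * halves the pending timeout; the timer is then re-armed with whatever
 * part of the halved RTO has not already elapsed for the oldest unacked
 * skb, or fires immediately if it has been used up.
 */
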
void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
{
	struct tcphdr *th = tcp_hdr(skb);

	if (skb->ip_summed == CHECKSUM_PARTIAL) {
		th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
		skb->csum_start = skb_transport_header(skb) - skb->head;
		skb->csum_offset = offsetof(struct tcphdr, check);
	} else {
		th->check = tcp_v4_check(skb->len, saddr, daddr,
					 csum_partial(th,
						      th->doff << 2,
						      skb->csum));
	}
}

/* This routine computes an IPv4 TCP checksum. */
void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
{
	const struct inet_sock *inet = inet_sk(sk);

	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
}
EXPORT_SYMBOL(tcp_v4_send_check);

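/*
 * Note on the CHECKSUM_PARTIAL branch above: the device is expected to
 * finish the checksum, so only the (inverted) pseudo-header sum is stored
 * in th->check, and csum_start/csum_offset tell the hardware where to
 * write the final result; the else branch computes the full checksum in
 * software.
 */
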
/*
 *	This routine will send an RST to the other tcp.
 *
 *	Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
 *		      for reset.
 *	Answer: if a packet caused RST, it is not for a socket
 *		existing in our system, if it is matched to a socket,
 *		it is just duplicate segment or bug in other side's TCP.
 *		So that we build reply only basing on parameters
 *		arrived with segment.
 *	Exception: precedence violation. We do not implement it in any case.
 */
static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
{
	const struct tcphdr *th = tcp_hdr(skb);
	struct {
		struct tcphdr th;
#ifdef CONFIG_TCP_MD5SIG
		__be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
#endif
	} rep;
	struct ip_reply_arg arg;
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_md5sig_key *key;
	const __u8 *hash_location = NULL;
	unsigned char newhash[16];
	int genhash;
	struct sock *sk1 = NULL;
#endif
	struct net *net;

	/* Never send a reset in response to a reset. */
	if (th->rst)
		return;

	if (skb_rtable(skb)->rt_type != RTN_LOCAL)
		return;

	/* Swap the send and the receive. */
	memset(&rep, 0, sizeof(rep));
	rep.th.dest   = th->source;
	rep.th.source = th->dest;
	rep.th.doff   = sizeof(struct tcphdr) / 4;
	rep.th.rst    = 1;

	if (th->ack) {
		rep.th.seq = th->ack_seq;
	} else {
		rep.th.ack = 1;
		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
				       skb->len - (th->doff << 2));
	}

	memset(&arg, 0, sizeof(arg));
	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);

#ifdef CONFIG_TCP_MD5SIG
	hash_location = tcp_parse_md5sig_option(th);
	if (!sk && hash_location) {
		/*
		 * active side is lost. Try to find listening socket through
		 * source port, and then find md5 key through listening socket.
		 * we do not lose security here:
		 * Incoming packet is checked with md5 hash with finding key,
		 * no RST generated if md5 hash doesn't match.
		 */
		sk1 = __inet_lookup_listener(dev_net(skb_dst(skb)->dev),
					     &tcp_hashinfo, ip_hdr(skb)->saddr,
					     th->source, ip_hdr(skb)->daddr,
					     ntohs(th->source), inet_iif(skb));
		/* don't send rst if it can't find key */
		if (!sk1)
			return;
		rcu_read_lock();
		key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
					&ip_hdr(skb)->saddr, AF_INET);
		if (!key)
			goto release_sk1;

		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, NULL, skb);
		if (genhash || memcmp(hash_location, newhash, 16) != 0)
			goto release_sk1;
	} else {
		key = sk ? tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
					     &ip_hdr(skb)->saddr,
					     AF_INET) : NULL;
	}

	if (key) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
				   (TCPOPT_NOP << 16) |
				   (TCPOPT_MD5SIG << 8) |
				   TCPOLEN_MD5SIG);
		/* Update length and the length the header thinks exists */
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len / 4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
				    key, ip_hdr(skb)->saddr,
				    ip_hdr(skb)->daddr, &rep.th);
	}
#endif
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0;
	/* When socket is gone, all binding information is lost.
	 * routing might fail in this case. No choice here, if we choose to force
	 * input interface, we will misroute in case of asymmetric route.
	 */
	if (sk)
		arg.bound_dev_if = sk->sk_bound_dev_if;

	net = dev_net(skb_dst(skb)->dev);
	arg.tos = ip_hdr(skb)->tos;
	ip_send_unicast_reply(net, skb, ip_hdr(skb)->saddr,
			      ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len);

	TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
	TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);

#ifdef CONFIG_TCP_MD5SIG
release_sk1:
	if (sk1) {
		rcu_read_unlock();
		sock_put(sk1);
	}
#endif
}

/* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
   outside socket context is ugly, certainly. What can I do?
 */
static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
			    u32 win, u32 tsval, u32 tsecr, int oif,
			    struct tcp_md5sig_key *key,
			    int reply_flags, u8 tos)
{
	const struct tcphdr *th = tcp_hdr(skb);
	struct {
		struct tcphdr th;
		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
#ifdef CONFIG_TCP_MD5SIG
			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
#endif
			];
	} rep;
	struct ip_reply_arg arg;
	struct net *net = dev_net(skb_dst(skb)->dev);

	memset(&rep.th, 0, sizeof(struct tcphdr));
	memset(&arg, 0, sizeof(arg));

	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);
	if (tsecr) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
				   (TCPOPT_TIMESTAMP << 8) |
				   TCPOLEN_TIMESTAMP);
		rep.opt[1] = htonl(tsval);
		rep.opt[2] = htonl(tsecr);
		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
	}

	/* Swap the send and the receive. */
	rep.th.dest    = th->source;
	rep.th.source  = th->dest;
	rep.th.doff    = arg.iov[0].iov_len / 4;
	rep.th.seq     = htonl(seq);
	rep.th.ack_seq = htonl(ack);
	rep.th.ack     = 1;
	rep.th.window  = htons(win);

#ifdef CONFIG_TCP_MD5SIG
	if (key) {
		int offset = (tsecr) ? 3 : 0;

		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
					  (TCPOPT_NOP << 16) |
					  (TCPOPT_MD5SIG << 8) |
					  TCPOLEN_MD5SIG);
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len/4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
				    key, ip_hdr(skb)->saddr,
				    ip_hdr(skb)->daddr, &rep.th);
	}
#endif
	arg.flags = reply_flags;
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	if (oif)
		arg.bound_dev_if = oif;
	arg.tos = tos;
	ip_send_unicast_reply(net, skb, ip_hdr(skb)->saddr,
			      ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len);

	TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
}

static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
{
	struct inet_timewait_sock *tw = inet_twsk(sk);
	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);

	tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
			tcp_time_stamp + tcptw->tw_ts_offset,
			tcptw->tw_ts_recent,
			tw->tw_bound_dev_if,
			tcp_twsk_md5_key(tcptw),
			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
			tw->tw_tos);

	inet_twsk_put(tw);
}

static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req)
{
	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
	 */
	tcp_v4_send_ack(skb, (sk->sk_state == TCP_LISTEN) ?
			tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt,
			tcp_rsk(req)->rcv_nxt, req->rcv_wnd,
			tcp_time_stamp,
			req->ts_recent,
			0,
			tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr,
					  AF_INET),
			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
			ip_hdr(skb)->tos);
}

/*
 *	Send a SYN-ACK after having received a SYN.
 *	This still operates on a request_sock only, not on a big
 *	socket.
 */
static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
			      struct flowi *fl,
			      struct request_sock *req,
			      u16 queue_mapping,
			      struct tcp_fastopen_cookie *foc)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	struct flowi4 fl4;
	int err = -1;
	struct sk_buff *skb;

	/* First, grab a route. */
	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
		return -1;

	skb = tcp_make_synack(sk, dst, req, foc);

	if (skb) {
		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);

		skb_set_queue_mapping(skb, queue_mapping);
		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
					    ireq->ir_rmt_addr,
					    ireq->opt);
		err = net_xmit_eval(err);
	}

	return err;
}

/*
 *	IPv4 request_sock destructor.
 */
static void tcp_v4_reqsk_destructor(struct request_sock *req)
{
	kfree(inet_rsk(req)->opt);
}

/*
 * Return true if a syncookie should be sent
 */
bool tcp_syn_flood_action(struct sock *sk,
			  const struct sk_buff *skb,
			  const char *proto)
{
	const char *msg = "Dropping request";
	bool want_cookie = false;
	struct listen_sock *lopt;

#ifdef CONFIG_SYN_COOKIES
	if (sysctl_tcp_syncookies) {
		msg = "Sending cookies";
		want_cookie = true;
		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDOCOOKIES);
	} else
#endif
		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP);

	lopt = inet_csk(sk)->icsk_accept_queue.listen_opt;
	if (!lopt->synflood_warned && sysctl_tcp_syncookies != 2) {
		lopt->synflood_warned = 1;
		pr_info("%s: Possible SYN flooding on port %d. %s.  Check SNMP counters.\n",
			proto, ntohs(tcp_hdr(skb)->dest), msg);
	}
	return want_cookie;
}
EXPORT_SYMBOL(tcp_syn_flood_action);

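/*
 * For reference: sysctl_tcp_syncookies is a tristate - 0 never sends
 * cookies, 1 sends them only when the listen queue overflows (the case
 * handled here), and 2 sends them unconditionally, which is why the
 * warning above is suppressed for that setting.
 */
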
/* Save and compile IPv4 options into the request_sock if needed. */
static struct ip_options_rcu *tcp_v4_save_options(struct sk_buff *skb)
{
	const struct ip_options *opt = &(IPCB(skb)->opt);
	struct ip_options_rcu *dopt = NULL;

	if (opt && opt->optlen) {
		int opt_size = sizeof(*dopt) + opt->optlen;

		dopt = kmalloc(opt_size, GFP_ATOMIC);
		if (dopt && ip_options_echo(&dopt->opt, skb)) {
			kfree(dopt);
			dopt = NULL;
		}
	}
	return dopt;
}

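/*
 * Why the echo matters: ip_options_echo() builds the reversed option
 * block, so for source-routed SYNs the SYN-ACK and the child socket can
 * travel back along the route the SYN arrived on.
 */
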
#ifdef CONFIG_TCP_MD5SIG
/*
 * RFC2385 MD5 checksumming requires a mapping of
 * IP address->MD5 Key.
 * We need to maintain these in the sk structure.
 */

/* Find the Key structure for an address.  */
struct tcp_md5sig_key *tcp_md5_do_lookup(struct sock *sk,
					 const union tcp_md5_addr *addr,
					 int family)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	unsigned int size = sizeof(struct in_addr);
	struct tcp_md5sig_info *md5sig;

	/* caller either holds rcu_read_lock() or socket lock */
	md5sig = rcu_dereference_check(tp->md5sig_info,
				       sock_owned_by_user(sk) ||
				       lockdep_is_held(&sk->sk_lock.slock));
	if (!md5sig)
		return NULL;
#if IS_ENABLED(CONFIG_IPV6)
	if (family == AF_INET6)
		size = sizeof(struct in6_addr);
#endif
	hlist_for_each_entry_rcu(key, &md5sig->head, node) {
		if (key->family != family)
			continue;
		if (!memcmp(&key->addr, addr, size))
			return key;
	}
	return NULL;
}
EXPORT_SYMBOL(tcp_md5_do_lookup);

struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
					 struct sock *addr_sk)
{
	union tcp_md5_addr *addr;

	addr = (union tcp_md5_addr *)&inet_sk(addr_sk)->inet_daddr;
	return tcp_md5_do_lookup(sk, addr, AF_INET);
}
EXPORT_SYMBOL(tcp_v4_md5_lookup);

static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk,
						      struct request_sock *req)
{
	union tcp_md5_addr *addr;

	addr = (union tcp_md5_addr *)&inet_rsk(req)->ir_rmt_addr;
	return tcp_md5_do_lookup(sk, addr, AF_INET);
}

/* This can be called on a newly created socket, from other files */
int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
		   int family, const u8 *newkey, u8 newkeylen, gfp_t gfp)
{
	/* Add Key to the list */
	struct tcp_md5sig_key *key;
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_info *md5sig;

	key = tcp_md5_do_lookup(sk, addr, family);
	if (key) {
		/* Pre-existing entry - just update that one. */
		memcpy(key->key, newkey, newkeylen);
		key->keylen = newkeylen;
		return 0;
	}

	md5sig = rcu_dereference_protected(tp->md5sig_info,
					   sock_owned_by_user(sk));
	if (!md5sig) {
		md5sig = kmalloc(sizeof(*md5sig), gfp);
		if (!md5sig)
			return -ENOMEM;

		sk_nocaps_add(sk, NETIF_F_GSO_MASK);
		INIT_HLIST_HEAD(&md5sig->head);
		rcu_assign_pointer(tp->md5sig_info, md5sig);
	}

	key = sock_kmalloc(sk, sizeof(*key), gfp);
	if (!key)
		return -ENOMEM;
	if (!tcp_alloc_md5sig_pool()) {
		sock_kfree_s(sk, key, sizeof(*key));
		return -ENOMEM;
	}

	memcpy(key->key, newkey, newkeylen);
	key->keylen = newkeylen;
	key->family = family;
	memcpy(&key->addr, addr,
	       (family == AF_INET6) ? sizeof(struct in6_addr) :
				      sizeof(struct in_addr));
	hlist_add_head_rcu(&key->node, &md5sig->head);
	return 0;
}
EXPORT_SYMBOL(tcp_md5_do_add);

int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family)
{
	struct tcp_md5sig_key *key;

	key = tcp_md5_do_lookup(sk, addr, family);
	if (!key)
		return -ENOENT;
	hlist_del_rcu(&key->node);
	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
	kfree_rcu(key, rcu);
	return 0;
}
EXPORT_SYMBOL(tcp_md5_do_del);

static void tcp_clear_md5_list(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	struct hlist_node *n;
	struct tcp_md5sig_info *md5sig;

	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);

	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
		hlist_del_rcu(&key->node);
		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
		kfree_rcu(key, rcu);
	}
}

static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
				 int optlen)
{
	struct tcp_md5sig cmd;
	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;

	if (optlen < sizeof(cmd))
		return -EINVAL;
	if (copy_from_user(&cmd, optval, sizeof(cmd)))
		return -EFAULT;
	if (sin->sin_family != AF_INET)
		return -EINVAL;

	if (!cmd.tcpm_key || !cmd.tcpm_keylen)
		return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
				      AF_INET);

	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
		return -EINVAL;

	return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
			      AF_INET, cmd.tcpm_key, cmd.tcpm_keylen,
			      GFP_KERNEL);
}

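/*
 * Userspace view (illustrative sketch; peer_sin/secret/fd are example
 * names, not kernel symbols): a key is installed per peer address with
 * the TCP_MD5SIG socket option before traffic flows, e.g.
 *
 *	struct tcp_md5sig md5 = { };
 *	memcpy(&md5.tcpm_addr, &peer_sin, sizeof(peer_sin));
 *	md5.tcpm_keylen = strlen(secret);
 *	memcpy(md5.tcpm_key, secret, md5.tcpm_keylen);
 *	setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
 *
 * A zero tcpm_keylen deletes the key for that address, matching the
 * tcp_md5_do_del() path above.
 */
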
static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
					__be32 daddr, __be32 saddr, int nbytes)
{
	struct tcp4_pseudohdr *bp;
	struct scatterlist sg;

	bp = &hp->md5_blk.ip4;

	/*
	 * 1. the TCP pseudo-header (in the order: source IP address,
	 * destination IP address, zero-padded protocol number, and
	 * segment length)
	 */
	bp->saddr = saddr;
	bp->daddr = daddr;
	bp->pad = 0;
	bp->protocol = IPPROTO_TCP;
	bp->len = cpu_to_be16(nbytes);

	sg_init_one(&sg, bp, sizeof(*bp));
	return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp));
}

static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
{
	struct tcp_md5sig_pool *hp;
	struct hash_desc *desc;

	hp = tcp_get_md5sig_pool();
	if (!hp)
		goto clear_hash_noput;
	desc = &hp->md5_desc;

	if (crypto_hash_init(desc))
		goto clear_hash;
	if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2))
		goto clear_hash;
	if (tcp_md5_hash_header(hp, th))
		goto clear_hash;
	if (tcp_md5_hash_key(hp, key))
		goto clear_hash;
	if (crypto_hash_final(desc, md5_hash))
		goto clear_hash;

	tcp_put_md5sig_pool();
	return 0;

clear_hash:
	tcp_put_md5sig_pool();
clear_hash_noput:
	memset(md5_hash, 0, 16);
	return 1;
}

int tcp_v4_md5_hash_skb(char *md5_hash, struct tcp_md5sig_key *key,
			const struct sock *sk, const struct request_sock *req,
			const struct sk_buff *skb)
{
	struct tcp_md5sig_pool *hp;
	struct hash_desc *desc;
	const struct tcphdr *th = tcp_hdr(skb);
	__be32 saddr, daddr;

	if (sk) {
		saddr = inet_sk(sk)->inet_saddr;
		daddr = inet_sk(sk)->inet_daddr;
	} else if (req) {
		saddr = inet_rsk(req)->ir_loc_addr;
		daddr = inet_rsk(req)->ir_rmt_addr;
	} else {
		const struct iphdr *iph = ip_hdr(skb);
		saddr = iph->saddr;
		daddr = iph->daddr;
	}

	hp = tcp_get_md5sig_pool();
	if (!hp)
		goto clear_hash_noput;
	desc = &hp->md5_desc;

	if (crypto_hash_init(desc))
		goto clear_hash;
	if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, skb->len))
		goto clear_hash;
	if (tcp_md5_hash_header(hp, th))
		goto clear_hash;
	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
		goto clear_hash;
	if (tcp_md5_hash_key(hp, key))
		goto clear_hash;
	if (crypto_hash_final(desc, md5_hash))
		goto clear_hash;

	tcp_put_md5sig_pool();
	return 0;

clear_hash:
	tcp_put_md5sig_pool();
clear_hash_noput:
	memset(md5_hash, 0, 16);
	return 1;
}
EXPORT_SYMBOL(tcp_v4_md5_hash_skb);

static bool tcp_v4_inbound_md5_hash(struct sock *sk, const struct sk_buff *skb)
{
	/*
	 * This gets called for each TCP segment that arrives
	 * so we want to be efficient.
	 * We have 3 drop cases:
	 * o No MD5 hash and one expected.
	 * o MD5 hash and we're not expecting one.
	 * o MD5 hash and it's wrong.
	 */
	const __u8 *hash_location = NULL;
	struct tcp_md5sig_key *hash_expected;
	const struct iphdr *iph = ip_hdr(skb);
	const struct tcphdr *th = tcp_hdr(skb);
	int genhash;
	unsigned char newhash[16];

	hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
					  AF_INET);
	hash_location = tcp_parse_md5sig_option(th);

	/* We've parsed the options - do we have a hash? */
	if (!hash_expected && !hash_location)
		return false;

	if (hash_expected && !hash_location) {
		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
		return true;
	}

	if (!hash_expected && hash_location) {
		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
		return true;
	}

	/* Okay, so this is hash_expected and hash_location -
	 * so we need to calculate the checksum.
	 */
	genhash = tcp_v4_md5_hash_skb(newhash,
				      hash_expected,
				      NULL, NULL, skb);

	if (genhash || memcmp(hash_location, newhash, 16) != 0) {
		net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
				     &iph->saddr, ntohs(th->source),
				     &iph->daddr, ntohs(th->dest),
				     genhash ? " tcp_v4_calc_md5_hash failed"
				     : "");
		return true;
	}
	return false;
}
#endif

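/*
 * Note: per RFC 2385 a segment that fails the digest check is dropped
 * silently (the "return true" paths above); no RST or ICMP is generated,
 * so a misconfigured key looks like a blackholed connection.
 */
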
static void tcp_v4_init_req(struct request_sock *req, struct sock *sk,
			    struct sk_buff *skb)
{
	struct inet_request_sock *ireq = inet_rsk(req);

	ireq->ir_loc_addr = ip_hdr(skb)->daddr;
	ireq->ir_rmt_addr = ip_hdr(skb)->saddr;
	ireq->no_srccheck = inet_sk(sk)->transparent;
	ireq->opt = tcp_v4_save_options(skb);
}

static struct dst_entry *tcp_v4_route_req(struct sock *sk, struct flowi *fl,
					  const struct request_sock *req,
					  bool *strict)
{
	struct dst_entry *dst = inet_csk_route_req(sk, &fl->u.ip4, req);

	if (strict)
		*strict = (fl->u.ip4.daddr == inet_rsk(req)->ir_rmt_addr);

	return dst;
}

struct request_sock_ops tcp_request_sock_ops __read_mostly = {
	.family		=	PF_INET,
	.obj_size	=	sizeof(struct tcp_request_sock),
	.rtx_syn_ack	=	tcp_rtx_synack,
	.send_ack	=	tcp_v4_reqsk_send_ack,
	.destructor	=	tcp_v4_reqsk_destructor,
	.send_reset	=	tcp_v4_send_reset,
	.syn_ack_timeout =	tcp_syn_ack_timeout,
};

static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
	.mss_clamp	=	TCP_MSS_DEFAULT,
#ifdef CONFIG_TCP_MD5SIG
	.md5_lookup	=	tcp_v4_reqsk_md5_lookup,
	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
#endif
	.init_req	=	tcp_v4_init_req,
#ifdef CONFIG_SYN_COOKIES
	.cookie_init_seq =	cookie_v4_init_sequence,
#endif
	.route_req	=	tcp_v4_route_req,
	.init_seq	=	tcp_v4_init_sequence,
	.send_synack	=	tcp_v4_send_synack,
	.queue_hash_add =	inet_csk_reqsk_queue_hash_add,
};

int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
	/* Never answer to SYNs sent to broadcast or multicast */
	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
		goto drop;

	return tcp_conn_request(&tcp_request_sock_ops,
				&tcp_request_sock_ipv4_ops, sk, skb);

drop:
	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
	return 0;
}
EXPORT_SYMBOL(tcp_v4_conn_request);

/*
 * The three way handshake has completed - we got a valid synack -
 * now create the new socket.
 */
struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req,
				  struct dst_entry *dst)
{
	struct inet_request_sock *ireq;
	struct inet_sock *newinet;
	struct tcp_sock *newtp;
	struct sock *newsk;
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_md5sig_key *key;
#endif
	struct ip_options_rcu *inet_opt;

	if (sk_acceptq_is_full(sk))
		goto exit_overflow;

	newsk = tcp_create_openreq_child(sk, req, skb);
	if (!newsk)
		goto exit_nonewsk;

	newsk->sk_gso_type = SKB_GSO_TCPV4;
	inet_sk_rx_dst_set(newsk, skb);

	newtp		      = tcp_sk(newsk);
	newinet		      = inet_sk(newsk);
	ireq		      = inet_rsk(req);
	newinet->inet_daddr   = ireq->ir_rmt_addr;
	newinet->inet_rcv_saddr = ireq->ir_loc_addr;
	newinet->inet_saddr   = ireq->ir_loc_addr;
	inet_opt	      = ireq->opt;
	rcu_assign_pointer(newinet->inet_opt, inet_opt);
	ireq->opt	      = NULL;
	newinet->mc_index     = inet_iif(skb);
	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
	newinet->rcv_tos      = ip_hdr(skb)->tos;
	inet_csk(newsk)->icsk_ext_hdr_len = 0;
	inet_set_txhash(newsk);
	if (inet_opt)
		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
	newinet->inet_id = newtp->write_seq ^ jiffies;

	if (!dst) {
		dst = inet_csk_route_child_sock(sk, newsk, req);
		if (!dst)
			goto put_and_exit;
	} else {
		/* syncookie case : see end of cookie_v4_check() */
	}
	sk_setup_caps(newsk, dst);

	tcp_sync_mss(newsk, dst_mtu(dst));
	newtp->advmss = dst_metric_advmss(dst);
	if (tcp_sk(sk)->rx_opt.user_mss &&
	    tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
		newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;

	tcp_initialize_rcv_mss(newsk);

#ifdef CONFIG_TCP_MD5SIG
	/* Copy over the MD5 key from the original socket */
	key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
				AF_INET);
	if (key != NULL) {
		/*
		 * We're using one, so create a matching key
		 * on the newsk structure. If we fail to get
		 * memory, then we end up not copying the key
		 * across. Shucks.
		 */
		tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
			       AF_INET, key->key, key->keylen, GFP_ATOMIC);
		sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
	}
#endif

	if (__inet_inherit_port(sk, newsk) < 0)
		goto put_and_exit;
	__inet_hash_nolisten(newsk, NULL);

	return newsk;

exit_overflow:
	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
exit_nonewsk:
	dst_release(dst);
exit:
	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
	return NULL;
put_and_exit:
	inet_csk_prepare_forced_close(newsk);
	tcp_done(newsk);
	goto exit;
}
EXPORT_SYMBOL(tcp_v4_syn_recv_sock);

static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
{
	struct tcphdr *th = tcp_hdr(skb);
	const struct iphdr *iph = ip_hdr(skb);
	struct sock *nsk;
	struct request_sock **prev;
	/* Find possible connection requests. */
	struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
						       iph->saddr, iph->daddr);
	if (req)
		return tcp_check_req(sk, skb, req, prev, false);

	nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr,
			th->source, iph->daddr, th->dest, inet_iif(skb));
	if (nsk) {
		if (nsk->sk_state != TCP_TIME_WAIT) {
			bh_lock_sock(nsk);
			return nsk;
		}
		inet_twsk_put(inet_twsk(nsk));
		return NULL;
	}

#ifdef CONFIG_SYN_COOKIES
	if (!th->syn)
		sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
#endif
	return sk;
}

/* The socket must have its spinlock held when we get
 * here, unless it is a TCP_LISTEN socket.
 * We have a potential double-lock case here, so even when
 * doing backlog processing we use the BH locking scheme.
 * This is because we cannot sleep with the original spinlock
 * held.
 */
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
	struct sock *rsk;
#ifdef CONFIG_TCP_MD5SIG
	/* We really want to reject the packet as early as possible if:
	 *  o We're expecting an MD5'd packet and this is no MD5 tcp option
	 *  o There is an MD5 option and we're not expecting one
	 */
	if (tcp_v4_inbound_md5_hash(sk, skb))
		goto discard;
#endif
	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
		struct dst_entry *dst = sk->sk_rx_dst;

		sock_rps_save_rxhash(sk, skb);
		if (dst && (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
			    dst->ops->check(dst, 0) == NULL)) {
			dst_release(dst);
			sk->sk_rx_dst = NULL;
		}
		tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len);
		return 0;
	}

	if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
		goto csum_err;

	if (sk->sk_state == TCP_LISTEN) {
		struct sock *nsk = tcp_v4_hnd_req(sk, skb);
		if (!nsk)
			goto discard;
		if (nsk != sk) {
			sock_rps_save_rxhash(nsk, skb);
			if (tcp_child_process(sk, nsk, skb)) {
				rsk = nsk;
				goto reset;
			}
			return 0;
		}
	} else
		sock_rps_save_rxhash(sk, skb);

	if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
		rsk = sk;
		goto reset;
	}
	return 0;

reset:
	tcp_v4_send_reset(rsk, skb);
discard:
	kfree_skb(skb);
	/* Be careful here. If this function gets more complicated and
	 * gcc suffers from register pressure on the x86, sk (in %ebx)
	 * might be destroyed here. This current version compiles correctly,
	 * but you have been warned.
	 */
	return 0;

csum_err:
	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_CSUMERRORS);
	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
	goto discard;
}
EXPORT_SYMBOL(tcp_v4_do_rcv);

void tcp_v4_early_demux(struct sk_buff *skb)
{
	const struct iphdr *iph;
	const struct tcphdr *th;
	struct sock *sk;

	if (skb->pkt_type != PACKET_HOST)
		return;
	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
		return;

	iph = ip_hdr(skb);
	th = tcp_hdr(skb);
	if (th->doff < sizeof(struct tcphdr) / 4)
		return;

	sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
				       iph->saddr, th->source,
				       iph->daddr, ntohs(th->dest),
				       skb->skb_iif);
	if (sk) {
		skb->sk = sk;
		skb->destructor = sock_edemux;
		if (sk->sk_state != TCP_TIME_WAIT) {
			struct dst_entry *dst = sk->sk_rx_dst;

			if (dst)
				dst = dst_check(dst, 0);
			if (dst &&
			    inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
				skb_dst_set_noref(skb, dst);
		}
	}
}

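/*
 * Early demux runs from the IP receive path before any routing decision:
 * if the established socket is found here, its cached sk_rx_dst can be
 * attached to the skb and the per-packet route lookup is skipped.
 */
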
/* Packet is added to VJ-style prequeue for processing in process
 * context, if a reader task is waiting. Apparently, this exciting
 * idea (VJ's mail "Re: query about TCP header on tcp-ip" of 07 Sep 93)
 * failed somewhere. Latency? Burstiness? Well, at least now we will
 * see, why it failed. 8)8)				  --ANK
 */
bool tcp_prequeue(struct sock *sk, struct sk_buff *skb)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (sysctl_tcp_low_latency || !tp->ucopy.task)
		return false;

	if (skb->len <= tcp_hdrlen(skb) &&
	    skb_queue_len(&tp->ucopy.prequeue) == 0)
		return false;

	skb_dst_force(skb);
	__skb_queue_tail(&tp->ucopy.prequeue, skb);
	tp->ucopy.memory += skb->truesize;
	if (tp->ucopy.memory > sk->sk_rcvbuf) {
		struct sk_buff *skb1;

		BUG_ON(sock_owned_by_user(sk));

		while ((skb1 = __skb_dequeue(&tp->ucopy.prequeue)) != NULL) {
			sk_backlog_rcv(sk, skb1);
			NET_INC_STATS_BH(sock_net(sk),
					 LINUX_MIB_TCPPREQUEUEDROPPED);
		}

		tp->ucopy.memory = 0;
	} else if (skb_queue_len(&tp->ucopy.prequeue) == 1) {
		wake_up_interruptible_sync_poll(sk_sleep(sk),
					   POLLIN | POLLRDNORM | POLLRDBAND);
		if (!inet_csk_ack_scheduled(sk))
			inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
						  (3 * tcp_rto_min(sk)) / 4,
						  TCP_RTO_MAX);
	}
	return true;
}
EXPORT_SYMBOL(tcp_prequeue);

/*
 *	From tcp_input.c
 */
int tcp_v4_rcv(struct sk_buff *skb)
{
	const struct iphdr *iph;
	const struct tcphdr *th;
	struct sock *sk;
	int ret;
	struct net *net = dev_net(skb->dev);

	if (skb->pkt_type != PACKET_HOST)
		goto discard_it;

	/* Count it even if it's bad */
	TCP_INC_STATS_BH(net, TCP_MIB_INSEGS);

	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
		goto discard_it;

	th = tcp_hdr(skb);
	if (th->doff < sizeof(struct tcphdr) / 4)
		goto bad_packet;
	if (!pskb_may_pull(skb, th->doff * 4))
		goto discard_it;

	/* An explanation is required here, I think.
	 * Packet length and doff are validated by header prediction,
	 * provided case of th->doff==0 is eliminated.
	 * So, we defer the checks. */

	if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
		goto csum_error;

	th = tcp_hdr(skb);
	iph = ip_hdr(skb);
	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
				    skb->len - th->doff * 4);
	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
	TCP_SKB_CB(skb)->when	 = 0;
	TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
	TCP_SKB_CB(skb)->sacked	 = 0;

	sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
	if (!sk)
		goto no_tcp_socket;

process:
	if (sk->sk_state == TCP_TIME_WAIT)
		goto do_time_wait;

	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
		NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
		goto discard_and_relse;
	}

	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
		goto discard_and_relse;
	nf_reset(skb);

	if (sk_filter(sk, skb))
		goto discard_and_relse;

	sk_mark_napi_id(sk, skb);
	skb->dev = NULL;

	bh_lock_sock_nested(sk);
	ret = 0;
	if (!sock_owned_by_user(sk)) {
#ifdef CONFIG_NET_DMA
		struct tcp_sock *tp = tcp_sk(sk);
		if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
			tp->ucopy.dma_chan = net_dma_find_channel();
		if (tp->ucopy.dma_chan)
			ret = tcp_v4_do_rcv(sk, skb);
		else
#endif
		{
			if (!tcp_prequeue(sk, skb))
				ret = tcp_v4_do_rcv(sk, skb);
		}
	} else if (unlikely(sk_add_backlog(sk, skb,
					   sk->sk_rcvbuf + sk->sk_sndbuf))) {
		bh_unlock_sock(sk);
		NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
		goto discard_and_relse;
	}
	bh_unlock_sock(sk);

	sock_put(sk);
	return ret;

no_tcp_socket:
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
		goto discard_it;

	if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
csum_error:
		TCP_INC_STATS_BH(net, TCP_MIB_CSUMERRORS);
bad_packet:
		TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
	} else {
		tcp_v4_send_reset(NULL, skb);
	}

discard_it:
	/* Discard frame. */
	kfree_skb(skb);
	return 0;

discard_and_relse:
	sock_put(sk);
	goto discard_it;

do_time_wait:
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
		inet_twsk_put(inet_twsk(sk));
		goto discard_it;
	}

	if (skb->len < (th->doff << 2)) {
		inet_twsk_put(inet_twsk(sk));
		goto bad_packet;
	}
	if (tcp_checksum_complete(skb)) {
		inet_twsk_put(inet_twsk(sk));
		goto csum_error;
	}
	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
	case TCP_TW_SYN: {
		struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
							&tcp_hashinfo,
							iph->saddr, th->source,
							iph->daddr, th->dest,
							inet_iif(skb));
		if (sk2) {
			inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
			inet_twsk_put(inet_twsk(sk));
			sk = sk2;
			goto process;
		}
		/* Fall through to ACK */
	}
	case TCP_TW_ACK:
		tcp_v4_timewait_ack(sk, skb);
		break;
	case TCP_TW_RST:
		goto no_tcp_socket;
	case TCP_TW_SUCCESS:;
	}
	goto discard_it;
}

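/*
 * Note on TCP_TW_SYN above: a new SYN that is acceptable for a tuple in
 * TIME-WAIT (its sequence number lies beyond the old connection's window)
 * kills the timewait socket and is handed to the listener, allowing rapid
 * connection reuse without waiting out the full 2*MSL.
 */
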
static struct timewait_sock_ops tcp_timewait_sock_ops = {
	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
	.twsk_unique	= tcp_twsk_unique,
	.twsk_destructor= tcp_twsk_destructor,
};

void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);

	dst_hold(dst);
	sk->sk_rx_dst = dst;
	inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
}
EXPORT_SYMBOL(inet_sk_rx_dst_set);

const struct inet_connection_sock_af_ops ipv4_specific = {
	.queue_xmit	   = ip_queue_xmit,
	.send_check	   = tcp_v4_send_check,
	.rebuild_header	   = inet_sk_rebuild_header,
	.sk_rx_dst_set	   = inet_sk_rx_dst_set,
	.conn_request	   = tcp_v4_conn_request,
	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
	.net_header_len	   = sizeof(struct iphdr),
	.setsockopt	   = ip_setsockopt,
	.getsockopt	   = ip_getsockopt,
	.addr2sockaddr	   = inet_csk_addr2sockaddr,
	.sockaddr_len	   = sizeof(struct sockaddr_in),
	.bind_conflict	   = inet_csk_bind_conflict,
#ifdef CONFIG_COMPAT
	.compat_setsockopt = compat_ip_setsockopt,
	.compat_getsockopt = compat_ip_getsockopt,
#endif
};
EXPORT_SYMBOL(ipv4_specific);

#ifdef CONFIG_TCP_MD5SIG
static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
	.md5_lookup	= tcp_v4_md5_lookup,
	.calc_md5_hash	= tcp_v4_md5_hash_skb,
	.md5_parse	= tcp_v4_parse_md5_keys,
};
#endif

/* NOTE: A lot of things set to zero explicitly by call to
 *	 sk_alloc() so need not be done here.
 */
static int tcp_v4_init_sock(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);

	tcp_init_sock(sk);

	icsk->icsk_af_ops = &ipv4_specific;

#ifdef CONFIG_TCP_MD5SIG
	tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
#endif

	return 0;
}

void tcp_v4_destroy_sock(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);

	tcp_clear_xmit_timers(sk);

	tcp_cleanup_congestion_control(sk);

	/* Clean up the write buffer. */
	tcp_write_queue_purge(sk);

	/* Cleans up our, hopefully empty, out_of_order_queue. */
	__skb_queue_purge(&tp->out_of_order_queue);

#ifdef CONFIG_TCP_MD5SIG
	/* Clean up the MD5 key list, if any */
	if (tp->md5sig_info) {
		tcp_clear_md5_list(sk);
		kfree_rcu(tp->md5sig_info, rcu);
		tp->md5sig_info = NULL;
	}
#endif

#ifdef CONFIG_NET_DMA
	/* Cleans up our sk_async_wait_queue */
	__skb_queue_purge(&sk->sk_async_wait_queue);
#endif

	/* Clean prequeue, it must be empty really */
	__skb_queue_purge(&tp->ucopy.prequeue);

	/* Clean up a referenced TCP bind bucket. */
	if (inet_csk(sk)->icsk_bind_hash)
		inet_put_port(sk);

	BUG_ON(tp->fastopen_rsk != NULL);

	/* If socket is aborted during connect operation */
	tcp_free_fastopen_req(tp);

	sk_sockets_allocated_dec(sk);
	sock_release_memcg(sk);
}
EXPORT_SYMBOL(tcp_v4_destroy_sock);

#ifdef CONFIG_PROC_FS
/* Proc filesystem TCP sock list dumping. */

/*
 * Get next listener socket follow cur.  If cur is NULL, get first socket
 * starting from bucket given in st->bucket; when st->bucket is zero the
 * very first socket in the hash table is returned.
 */
static void *listening_get_next(struct seq_file *seq, void *cur)
{
	struct inet_connection_sock *icsk;
	struct hlist_nulls_node *node;
	struct sock *sk = cur;
	struct inet_listen_hashbucket *ilb;
	struct tcp_iter_state *st = seq->private;
	struct net *net = seq_file_net(seq);

	if (!sk) {
		ilb = &tcp_hashinfo.listening_hash[st->bucket];
		spin_lock_bh(&ilb->lock);
		sk = sk_nulls_head(&ilb->head);
		st->offset = 0;
		goto get_sk;
	}
	ilb = &tcp_hashinfo.listening_hash[st->bucket];
	++st->num;
	++st->offset;

	if (st->state == TCP_SEQ_STATE_OPENREQ) {
		struct request_sock *req = cur;

		icsk = inet_csk(st->syn_wait_sk);
		req = req->dl_next;
		while (1) {
			while (req) {
				if (req->rsk_ops->family == st->family) {
					cur = req;
					goto out;
				}
				req = req->dl_next;
			}
			if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
				break;
get_req:
			req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
		}
		sk	  = sk_nulls_next(st->syn_wait_sk);
		st->state = TCP_SEQ_STATE_LISTENING;
		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
	} else {
		icsk = inet_csk(sk);
		read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
		if (reqsk_queue_len(&icsk->icsk_accept_queue))
			goto start_req;
		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
		sk = sk_nulls_next(sk);
	}
get_sk:
	sk_nulls_for_each_from(sk, node) {
		if (!net_eq(sock_net(sk), net))
			continue;
		if (sk->sk_family == st->family) {
			cur = sk;
			goto out;
		}
		icsk = inet_csk(sk);
		read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
		if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
start_req:
			st->uid		= sock_i_uid(sk);
			st->syn_wait_sk = sk;
			st->state	= TCP_SEQ_STATE_OPENREQ;
			st->sbucket	= 0;
			goto get_req;
		}
		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
	}
	spin_unlock_bh(&ilb->lock);
	st->offset = 0;
	if (++st->bucket < INET_LHTABLE_SIZE) {
		ilb = &tcp_hashinfo.listening_hash[st->bucket];
		spin_lock_bh(&ilb->lock);
		sk = sk_nulls_head(&ilb->head);
		goto get_sk;
	}
	cur = NULL;
out:
	return cur;
}

static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc;

	st->bucket = 0;
	st->offset = 0;
	rc = listening_get_next(seq, NULL);
	while (rc && *pos) {
		rc = listening_get_next(seq, rc);
		--*pos;
	}
	return rc;
}

static inline bool empty_bucket(const struct tcp_iter_state *st)
{
	return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
}

/*
 * Get first established socket starting from bucket given in st->bucket.
 * If st->bucket is zero, the very first socket in the hash is returned.
 */
static void *established_get_first(struct seq_file *seq)
{
	struct tcp_iter_state *st = seq->private;
	struct net *net = seq_file_net(seq);
	void *rc = NULL;

	st->offset = 0;
	for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
		struct sock *sk;
		struct hlist_nulls_node *node;
		spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);

		/* Lockless fast path for the common case of empty buckets */
		if (empty_bucket(st))
			continue;

		spin_lock_bh(lock);
		sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
			if (sk->sk_family != st->family ||
			    !net_eq(sock_net(sk), net)) {
				continue;
			}
			rc = sk;
			goto out;
		}
		spin_unlock_bh(lock);
	}
out:
	return rc;
}

static void *established_get_next(struct seq_file *seq, void *cur)
{
	struct sock *sk = cur;
	struct hlist_nulls_node *node;
	struct tcp_iter_state *st = seq->private;
	struct net *net = seq_file_net(seq);

	++st->num;
	++st->offset;

	sk = sk_nulls_next(sk);
	sk_nulls_for_each_from(sk, node) {
		if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
			return sk;
	}

	spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
	++st->bucket;
	return established_get_first(seq);
}

static void *established_get_idx(struct seq_file *seq, loff_t pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc;

	st->bucket = 0;
	rc = established_get_first(seq);
	while (rc && pos) {
		rc = established_get_next(seq, rc);
		--pos;
	}
	return rc;
}

static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
{
	void *rc;
	struct tcp_iter_state *st = seq->private;

	st->state = TCP_SEQ_STATE_LISTENING;
	rc	  = listening_get_idx(seq, &pos);
	if (!rc) {
		st->state = TCP_SEQ_STATE_ESTABLISHED;
		rc	  = established_get_idx(seq, pos);
	}
	return rc;
}

static void *tcp_seek_last_pos(struct seq_file *seq)
{
	struct tcp_iter_state *st = seq->private;
	int offset = st->offset;
	int orig_num = st->num;
	void *rc = NULL;

	switch (st->state) {
	case TCP_SEQ_STATE_OPENREQ:
	case TCP_SEQ_STATE_LISTENING:
		if (st->bucket >= INET_LHTABLE_SIZE)
			break;
		st->state = TCP_SEQ_STATE_LISTENING;
		rc = listening_get_next(seq, NULL);
		while (offset-- && rc)
			rc = listening_get_next(seq, rc);
		if (rc)
			break;
		st->bucket = 0;
		st->state = TCP_SEQ_STATE_ESTABLISHED;
		/* Fallthrough */
	case TCP_SEQ_STATE_ESTABLISHED:
		if (st->bucket > tcp_hashinfo.ehash_mask)
			break;
		rc = established_get_first(seq);
		while (offset-- && rc)
			rc = established_get_next(seq, rc);
	}

	st->num = orig_num;
	return rc;
}

static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc;

	if (*pos && *pos == st->last_pos) {
		rc = tcp_seek_last_pos(seq);
		if (rc)
			goto out;
	}

	st->state = TCP_SEQ_STATE_LISTENING;
	st->num = 0;
	st->bucket = 0;
	st->offset = 0;
	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;

out:
	st->last_pos = *pos;
	return rc;
}

static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc = NULL;

	if (v == SEQ_START_TOKEN) {
		rc = tcp_get_idx(seq, 0);
		goto out;
	}

	switch (st->state) {
	case TCP_SEQ_STATE_OPENREQ:
	case TCP_SEQ_STATE_LISTENING:
		rc = listening_get_next(seq, v);
		if (!rc) {
			st->state = TCP_SEQ_STATE_ESTABLISHED;
			st->bucket = 0;
			st->offset = 0;
			rc	  = established_get_first(seq);
		}
		break;
	case TCP_SEQ_STATE_ESTABLISHED:
		rc = established_get_next(seq, v);
		break;
	}
out:
	++*pos;
	st->last_pos = *pos;
	return rc;
}

static void tcp_seq_stop(struct seq_file *seq, void *v)
{
	struct tcp_iter_state *st = seq->private;

	switch (st->state) {
	case TCP_SEQ_STATE_OPENREQ:
		if (v) {
			struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
			read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
		}
	case TCP_SEQ_STATE_LISTENING:
		if (v != SEQ_START_TOKEN)
			spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
		break;
	case TCP_SEQ_STATE_ESTABLISHED:
		if (v)
			spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
		break;
	}
}

int tcp_seq_open(struct inode *inode, struct file *file)
{
	struct tcp_seq_afinfo *afinfo = PDE_DATA(inode);
	struct tcp_iter_state *s;
	int err;

	err = seq_open_net(inode, file, &afinfo->seq_ops,
			   sizeof(struct tcp_iter_state));
	if (err < 0)
		return err;

	s = ((struct seq_file *)file->private_data)->private;
	s->family		= afinfo->family;
	s->last_pos		= 0;
	return 0;
}
EXPORT_SYMBOL(tcp_seq_open);

int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
{
	int rc = 0;
	struct proc_dir_entry *p;

	afinfo->seq_ops.start		= tcp_seq_start;
	afinfo->seq_ops.next		= tcp_seq_next;
	afinfo->seq_ops.stop		= tcp_seq_stop;

	p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
			     afinfo->seq_fops, afinfo);
	if (!p)
		rc = -ENOMEM;
	return rc;
}
EXPORT_SYMBOL(tcp_proc_register);

void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
{
	remove_proc_entry(afinfo->name, net->proc_net);
}
EXPORT_SYMBOL(tcp_proc_unregister);

static void get_openreq4(const struct sock *sk, const struct request_sock *req,
			 struct seq_file *f, int i, kuid_t uid)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	long delta = req->expires - jiffies;

	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
		i,
		ireq->ir_loc_addr,
		ntohs(inet_sk(sk)->inet_sport),
		ireq->ir_rmt_addr,
		ntohs(ireq->ir_rmt_port),
		TCP_SYN_RECV,
		0, 0, /* could print option size, but that is af dependent. */
		1,    /* timers active (only the expire timer) */
		jiffies_delta_to_clock_t(delta),
		req->num_timeout,
		from_kuid_munged(seq_user_ns(f), uid),
		0,  /* non standard timer */
		0, /* open_requests have no inode */
		atomic_read(&sk->sk_refcnt),
		req);
}

static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
{
	int timer_active;
	unsigned long timer_expires;
	const struct tcp_sock *tp = tcp_sk(sk);
	const struct inet_connection_sock *icsk = inet_csk(sk);
	const struct inet_sock *inet = inet_sk(sk);
	struct fastopen_queue *fastopenq = icsk->icsk_accept_queue.fastopenq;
	__be32 dest = inet->inet_daddr;
	__be32 src = inet->inet_rcv_saddr;
	__u16 destp = ntohs(inet->inet_dport);
	__u16 srcp = ntohs(inet->inet_sport);
	int rx_queue;

	if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
	    icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
		timer_active	= 1;
		timer_expires	= icsk->icsk_timeout;
	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
		timer_active	= 4;
		timer_expires	= icsk->icsk_timeout;
	} else if (timer_pending(&sk->sk_timer)) {
		timer_active	= 2;
		timer_expires	= sk->sk_timer.expires;
	} else {
		timer_active	= 0;
		timer_expires = jiffies;
	}

	if (sk->sk_state == TCP_LISTEN)
		rx_queue = sk->sk_ack_backlog;
	else
		/*
		 * because we dont lock socket, we might find a transient negative value
		 */
		rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);

	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
			"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
		i, src, srcp, dest, destp, sk->sk_state,
		tp->write_seq - tp->snd_una,
		rx_queue,
		timer_active,
		jiffies_delta_to_clock_t(timer_expires - jiffies),
		icsk->icsk_retransmits,
		from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
		icsk->icsk_probes_out,
		sock_i_ino(sk),
		atomic_read(&sk->sk_refcnt), sk,
		jiffies_to_clock_t(icsk->icsk_rto),
		jiffies_to_clock_t(icsk->icsk_ack.ato),
		(icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
		tp->snd_cwnd,
		sk->sk_state == TCP_LISTEN ?
		    (fastopenq ? fastopenq->max_qlen : 0) :
		    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
}

static void get_timewait4_sock(const struct inet_timewait_sock *tw,
			       struct seq_file *f, int i)
{
	__be32 dest, src;
	__u16 destp, srcp;
	s32 delta = tw->tw_ttd - inet_tw_time_stamp();

	dest  = tw->tw_daddr;
	src   = tw->tw_rcv_saddr;
	destp = ntohs(tw->tw_dport);
	srcp  = ntohs(tw->tw_sport);

	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
		3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
		atomic_read(&tw->tw_refcnt), tw);
}

#define TMPSZ 150

static int tcp4_seq_show(struct seq_file *seq, void *v)
{
	struct tcp_iter_state *st;
	struct sock *sk = v;

	seq_setwidth(seq, TMPSZ - 1);
	if (v == SEQ_START_TOKEN) {
		seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
			   "rx_queue tr tm->when retrnsmt   uid  timeout "
			   "inode");
		goto out;
	}
	st = seq->private;

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
	case TCP_SEQ_STATE_ESTABLISHED:
		if (sk->sk_state == TCP_TIME_WAIT)
			get_timewait4_sock(v, seq, st->num);
		else
			get_tcp4_sock(v, seq, st->num);
		break;
	case TCP_SEQ_STATE_OPENREQ:
		get_openreq4(st->syn_wait_sk, v, seq, st->num, st->uid);
		break;
	}
out:
	seq_pad(seq, '\n');
	return 0;
}

static const struct file_operations tcp_afinfo_seq_fops = {
	.owner   = THIS_MODULE,
	.open    = tcp_seq_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
	.release = seq_release_net
};

static struct tcp_seq_afinfo tcp4_seq_afinfo = {
	.name		= "tcp",
	.family		= AF_INET,
	.seq_fops	= &tcp_afinfo_seq_fops,
	.seq_ops	= {
		.show		= tcp4_seq_show,
	},
};

static int __net_init tcp4_proc_init_net(struct net *net)
{
	return tcp_proc_register(net, &tcp4_seq_afinfo);
}

static void __net_exit tcp4_proc_exit_net(struct net *net)
{
	tcp_proc_unregister(net, &tcp4_seq_afinfo);
}

static struct pernet_operations tcp4_net_ops = {
	.init = tcp4_proc_init_net,
	.exit = tcp4_proc_exit_net,
};

int __init tcp4_proc_init(void)
{
	return register_pernet_subsys(&tcp4_net_ops);
}

void tcp4_proc_exit(void)
{
	unregister_pernet_subsys(&tcp4_net_ops);
}
#endif /* CONFIG_PROC_FS */

struct proto tcp_prot = {
	.name			= "TCP",
	.owner			= THIS_MODULE,
	.close			= tcp_close,
	.connect		= tcp_v4_connect,
	.disconnect		= tcp_disconnect,
	.accept			= inet_csk_accept,
	.ioctl			= tcp_ioctl,
	.init			= tcp_v4_init_sock,
	.destroy		= tcp_v4_destroy_sock,
	.shutdown		= tcp_shutdown,
	.setsockopt		= tcp_setsockopt,
	.getsockopt		= tcp_getsockopt,
	.recvmsg		= tcp_recvmsg,
	.sendmsg		= tcp_sendmsg,
	.sendpage		= tcp_sendpage,
	.backlog_rcv		= tcp_v4_do_rcv,
	.release_cb		= tcp_release_cb,
	.mtu_reduced		= tcp_v4_mtu_reduced,
	.hash			= inet_hash,
	.unhash			= inet_unhash,
	.get_port		= inet_csk_get_port,
	.enter_memory_pressure	= tcp_enter_memory_pressure,
	.stream_memory_free	= tcp_stream_memory_free,
	.sockets_allocated	= &tcp_sockets_allocated,
	.orphan_count		= &tcp_orphan_count,
	.memory_allocated	= &tcp_memory_allocated,
	.memory_pressure	= &tcp_memory_pressure,
	.sysctl_mem		= sysctl_tcp_mem,
	.sysctl_wmem		= sysctl_tcp_wmem,
	.sysctl_rmem		= sysctl_tcp_rmem,
	.max_header		= MAX_TCP_HEADER,
	.obj_size		= sizeof(struct tcp_sock),
	.slab_flags		= SLAB_DESTROY_BY_RCU,
	.twsk_prot		= &tcp_timewait_sock_ops,
	.rsk_prot		= &tcp_request_sock_ops,
	.h.hashinfo		= &tcp_hashinfo,
	.no_autobind		= true,
#ifdef CONFIG_COMPAT
	.compat_setsockopt	= compat_tcp_setsockopt,
	.compat_getsockopt	= compat_tcp_getsockopt,
#endif
#ifdef CONFIG_MEMCG_KMEM
	.init_cgroup		= tcp_init_cgroup,
	.destroy_cgroup		= tcp_destroy_cgroup,
	.proto_cgroup		= tcp_proto_cgroup,
#endif
};
EXPORT_SYMBOL(tcp_prot);

static int __net_init tcp_sk_init(struct net *net)
{
	net->ipv4.sysctl_tcp_ecn = 2;
	return 0;
}

static void __net_exit tcp_sk_exit(struct net *net)
{
}

static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
{
	inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET);
}

static struct pernet_operations __net_initdata tcp_sk_ops = {
	.init	    = tcp_sk_init,
	.exit	    = tcp_sk_exit,
	.exit_batch = tcp_sk_exit_batch,
};

void __init tcp_v4_init(void)
{
	inet_hashinfo_init(&tcp_hashinfo);
	if (register_pernet_subsys(&tcp_sk_ops))
		panic("Failed to create the TCP control socket.\n");
}