/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 *
 *		IPv4 specific functions
 *
 *		code split from:
 *		linux/ipv4/tcp.c
 *		linux/ipv4/tcp_input.c
 *		linux/ipv4/tcp_output.c
 *
 *		See tcp.c for author information
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 */

/*
 * Changes:
 *		David S. Miller	:	New socket lookup architecture.
 *					This code is dedicated to John Dyson.
 *		David S. Miller :	Change semantics of established hash,
 *					half is devoted to TIME_WAIT sockets
 *					and the rest go in the other half.
 *		Andi Kleen :		Add support for syncookies and fixed
 *					some bugs: ip options weren't passed to
 *					the TCP layer, missed a check for an
 *					ACK bit.
 *		Andi Kleen :		Implemented fast path mtu discovery.
 *					Fixed many serious bugs in the
 *					request_sock handling and moved
 *					most of it into the af independent code.
 *					Added tail drop and some other bugfixes.
 *					Added new listen semantics.
 *		Mike McLagan	:	Routing by source
 *	Juan Jose Ciarlante:		ip_dynaddr bits
 *		Andi Kleen:		various fixes.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year
 *					coma.
 *	Andi Kleen		:	Fix new listen.
 *	Andi Kleen		:	Fix accept error reporting.
 *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
 *	Alexey Kuznetsov		allow both IPv4 and IPv6 sockets to bind
 *					a single port at the same time.
 */

#define pr_fmt(fmt) "TCP: " fmt

#include <linux/bottom_half.h>
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/cache.h>
#include <linux/jhash.h>
#include <linux/init.h>
#include <linux/times.h>
#include <linux/slab.h>

#include <net/net_namespace.h>
#include <net/icmp.h>
#include <net/inet_hashtables.h>
#include <net/tcp.h>
#include <net/transp_v6.h>
#include <net/ipv6.h>
#include <net/inet_common.h>
#include <net/timewait_sock.h>
#include <net/xfrm.h>
#include <net/netdma.h>
#include <net/secure_seq.h>
#include <net/tcp_memcontrol.h>
#include <net/busy_poll.h>

#include <linux/inet.h>
#include <linux/ipv6.h>
#include <linux/stddef.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>

#include <linux/crypto.h>
#include <linux/scatterlist.h>

int sysctl_tcp_tw_reuse __read_mostly;
int sysctl_tcp_low_latency __read_mostly;
EXPORT_SYMBOL(sysctl_tcp_low_latency);

#ifdef CONFIG_TCP_MD5SIG
static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
#endif

struct inet_hashinfo tcp_hashinfo;
EXPORT_SYMBOL(tcp_hashinfo);

static __u32 tcp_v4_init_sequence(const struct sk_buff *skb)
{
	return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
					  ip_hdr(skb)->saddr,
					  tcp_hdr(skb)->dest,
					  tcp_hdr(skb)->source);
}

int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
{
	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
	struct tcp_sock *tp = tcp_sk(sk);

	/* With PAWS, it is safe from the viewpoint
	   of data integrity. Even without PAWS it is safe provided sequence
	   spaces do not overlap i.e. at data rates <= 80Mbit/sec.

	   Actually, the idea is close to VJ's one, only timestamp cache is
	   held not per host, but per port pair and TW bucket is used as state
	   holder.

	   If TW bucket has been already destroyed we fall back to VJ's scheme
	   and use initial timestamp retrieved from peer table.
	 */
	if (tcptw->tw_ts_recent_stamp &&
	    (twp == NULL || (sysctl_tcp_tw_reuse &&
			     get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
		tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
		if (tp->write_seq == 0)
			tp->write_seq = 1;
		tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
		tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
		sock_hold(sktw);
		return 1;
	}

	return 0;
}
EXPORT_SYMBOL_GPL(tcp_twsk_unique);

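/*
 * A note on the write_seq bump above (explanatory sketch, not normative):
 * the new connection's ISN is placed at tw_snd_nxt + 65535 + 2, i.e. just
 * past the largest window (64KB without window scaling) the previous
 * incarnation could still have had in flight, so stray old segments
 * cannot land inside the new sequence space.  E.g. an old tw_snd_nxt of
 * 1000 yields a new write_seq of 66537.
 */
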
/* This will initiate an outgoing connection. */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
	struct inet_sock *inet = inet_sk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	__be16 orig_sport, orig_dport;
	__be32 daddr, nexthop;
	struct flowi4 *fl4;
	struct rtable *rt;
	int err;
	struct ip_options_rcu *inet_opt;

	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	if (usin->sin_family != AF_INET)
		return -EAFNOSUPPORT;

	nexthop = daddr = usin->sin_addr.s_addr;
	inet_opt = rcu_dereference_protected(inet->inet_opt,
					     sock_owned_by_user(sk));
	if (inet_opt && inet_opt->opt.srr) {
		if (!daddr)
			return -EINVAL;
		nexthop = inet_opt->opt.faddr;
	}

	orig_sport = inet->inet_sport;
	orig_dport = usin->sin_port;
	fl4 = &inet->cork.fl.u.ip4;
	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
			      RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
			      IPPROTO_TCP,
			      orig_sport, orig_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		if (err == -ENETUNREACH)
			IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
		return err;
	}

	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
		ip_rt_put(rt);
		return -ENETUNREACH;
	}

	if (!inet_opt || !inet_opt->opt.srr)
		daddr = fl4->daddr;

	if (!inet->inet_saddr)
		inet->inet_saddr = fl4->saddr;
	inet->inet_rcv_saddr = inet->inet_saddr;

	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
		/* Reset inherited state */
		tp->rx_opt.ts_recent	   = 0;
		tp->rx_opt.ts_recent_stamp = 0;
		if (likely(!tp->repair))
			tp->write_seq	   = 0;
	}

	if (tcp_death_row.sysctl_tw_recycle &&
	    !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr)
		tcp_fetch_timewait_stamp(sk, &rt->dst);

	inet->inet_dport = usin->sin_port;
	inet->inet_daddr = daddr;

	inet_set_txhash(sk);

	inet_csk(sk)->icsk_ext_hdr_len = 0;
	if (inet_opt)
		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;

	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;

	/* Socket identity is still unknown (sport may be zero).
	 * However we set state to SYN-SENT and, without releasing the socket
	 * lock, select a source port, enter ourselves into the hash tables
	 * and complete initialization after this.
	 */
	tcp_set_state(sk, TCP_SYN_SENT);
	err = inet_hash_connect(&tcp_death_row, sk);
	if (err)
		goto failure;

	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
			       inet->inet_sport, inet->inet_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		rt = NULL;
		goto failure;
	}
	/* OK, now commit destination to socket.  */
	sk->sk_gso_type = SKB_GSO_TCPV4;
	sk_setup_caps(sk, &rt->dst);

	if (!tp->write_seq && likely(!tp->repair))
		tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
							   inet->inet_daddr,
							   inet->inet_sport,
							   usin->sin_port);

	inet->inet_id = tp->write_seq ^ jiffies;

	err = tcp_connect(sk);
	rt = NULL;
	if (err)
		goto failure;

	return 0;

failure:
	/*
	 * This unhashes the socket and releases the local port,
	 * if necessary.
	 */
	tcp_set_state(sk, TCP_CLOSE);
	ip_rt_put(rt);
	sk->sk_route_caps = 0;
	inet->inet_dport = 0;
	return err;
}
EXPORT_SYMBOL(tcp_v4_connect);

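/*
 * For orientation: a userspace connect(2) on an AF_INET stream socket
 * reaches tcp_v4_connect() through inet_stream_connect() and the
 * sk->sk_prot->connect hook, with the socket lock held; source port
 * selection may still happen inside inet_hash_connect() above.
 */
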
/*
 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
 * It can be called through tcp_release_cb() if socket was owned by user
 * at the time tcp_v4_err() was called to handle ICMP message.
 */
static void tcp_v4_mtu_reduced(struct sock *sk)
{
	struct dst_entry *dst;
	struct inet_sock *inet = inet_sk(sk);
	u32 mtu = tcp_sk(sk)->mtu_info;

	dst = inet_csk_update_pmtu(sk, mtu);
	if (!dst)
		return;

	/* Something is about to be wrong... Remember soft error
	 * for the case, if this connection will not be able to recover.
	 */
	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
		sk->sk_err_soft = EMSGSIZE;

	mtu = dst_mtu(dst);

	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
	    ip_sk_accept_pmtu(sk) &&
	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
		tcp_sync_mss(sk, mtu);

		/* Resend the TCP packet because it's
		 * clear that the old packet has been
		 * dropped. This is the new "fast" path mtu
		 * discovery.
		 */
		tcp_simple_retransmit(sk);
	} /* else let the usual retransmit timer handle it */
}

static void do_redirect(struct sk_buff *skb, struct sock *sk)
{
	struct dst_entry *dst = __sk_dst_check(sk, 0);

	if (dst)
		dst->ops->redirect(dst, sk, skb);
}

/*
 * This routine is called by the ICMP module when it gets some
 * sort of error condition.  If err < 0 then the socket should
 * be closed and the error returned to the user.  If err > 0
 * it's just the icmp type << 8 | icmp code.  After adjustment
 * header points to the first 8 bytes of the tcp header.  We need
 * to find the appropriate port.
 *
 * The locking strategy used here is very "optimistic". When
 * someone else accesses the socket the ICMP is just dropped
 * and for some paths there is no check at all.
 * A more general error queue to queue errors for later handling
 * is probably better.
 */
void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
{
	const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
	struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
	struct inet_connection_sock *icsk;
	struct tcp_sock *tp;
	struct inet_sock *inet;
	const int type = icmp_hdr(icmp_skb)->type;
	const int code = icmp_hdr(icmp_skb)->code;
	struct sock *sk;
	struct sk_buff *skb;
	struct request_sock *fastopen;
	__u32 seq, snd_una;
	__u32 remaining;
	int err;
	struct net *net = dev_net(icmp_skb->dev);

	if (icmp_skb->len < (iph->ihl << 2) + 8) {
		ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
		return;
	}

	sk = inet_lookup(net, &tcp_hashinfo, iph->daddr, th->dest,
			 iph->saddr, th->source, inet_iif(icmp_skb));
	if (!sk) {
		ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
		return;
	}
	if (sk->sk_state == TCP_TIME_WAIT) {
		inet_twsk_put(inet_twsk(sk));
		return;
	}

	bh_lock_sock(sk);
	/* If too many ICMPs get dropped on busy
	 * servers this needs to be solved differently.
	 * We do take care of PMTU discovery (RFC1191) special case :
	 * we can receive locally generated ICMP messages while socket is held.
	 */
	if (sock_owned_by_user(sk)) {
		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
			NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
	}
	if (sk->sk_state == TCP_CLOSE)
		goto out;

	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
		NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
		goto out;
	}

	icsk = inet_csk(sk);
	tp = tcp_sk(sk);
	seq = ntohl(th->seq);
	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
	fastopen = tp->fastopen_rsk;
	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
	if (sk->sk_state != TCP_LISTEN &&
	    !between(seq, snd_una, tp->snd_nxt)) {
		NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
		goto out;
	}

	switch (type) {
	case ICMP_REDIRECT:
		do_redirect(icmp_skb, sk);
		goto out;
	case ICMP_SOURCE_QUENCH:
		/* Just silently ignore these. */
		goto out;
	case ICMP_PARAMETERPROB:
		err = EPROTO;
		break;
	case ICMP_DEST_UNREACH:
		if (code > NR_ICMP_UNREACH)
			goto out;

		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
			/* We are not interested in TCP_LISTEN and open_requests
			 * (SYN-ACKs sent out by Linux are always <576bytes so
			 * they should go through unfragmented).
			 */
			if (sk->sk_state == TCP_LISTEN)
				goto out;

			tp->mtu_info = info;
			if (!sock_owned_by_user(sk)) {
				tcp_v4_mtu_reduced(sk);
			} else {
				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &tp->tsq_flags))
					sock_hold(sk);
			}
			goto out;
		}

		err = icmp_err_convert[code].errno;
		/* check if icmp_skb allows revert of backoff
		 * (see draft-zimmermann-tcp-lcd) */
		if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
			break;
		if (seq != tp->snd_una || !icsk->icsk_retransmits ||
		    !icsk->icsk_backoff || fastopen)
			break;

		if (sock_owned_by_user(sk))
			break;

		icsk->icsk_backoff--;
		inet_csk(sk)->icsk_rto = (tp->srtt_us ? __tcp_set_rto(tp) :
			TCP_TIMEOUT_INIT) << icsk->icsk_backoff;
		tcp_bound_rto(sk);

		skb = tcp_write_queue_head(sk);
		BUG_ON(!skb);

		remaining = icsk->icsk_rto - min(icsk->icsk_rto,
				tcp_time_stamp - TCP_SKB_CB(skb)->when);

		if (remaining) {
			inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
						  remaining, TCP_RTO_MAX);
		} else {
			/* RTO revert clocked out retransmission.
			 * Will retransmit now */
			tcp_retransmit_timer(sk);
		}

		break;
	case ICMP_TIME_EXCEEDED:
		err = EHOSTUNREACH;
		break;
	default:
		goto out;
	}

	switch (sk->sk_state) {
		struct request_sock *req, **prev;
	case TCP_LISTEN:
		if (sock_owned_by_user(sk))
			goto out;

		req = inet_csk_search_req(sk, &prev, th->dest,
					  iph->daddr, iph->saddr);
		if (!req)
			goto out;

		/* ICMPs are not backlogged, hence we cannot get
		   an established socket here.
		 */
		WARN_ON(req->sk);

		if (seq != tcp_rsk(req)->snt_isn) {
			NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
			goto out;
		}

		/*
		 * Still in SYN_RECV, just remove it silently.
		 * There is no good way to pass the error to the newly
		 * created socket, and POSIX does not want network
		 * errors returned from accept().
		 */
		inet_csk_reqsk_queue_drop(sk, req, prev);
		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
		goto out;

	case TCP_SYN_SENT:
	case TCP_SYN_RECV:
		/* Only in fast or simultaneous open. If a fast open socket
		 * is already accepted it is treated as a connected one below.
		 */
		if (fastopen && fastopen->sk == NULL)
			break;

		if (!sock_owned_by_user(sk)) {
			sk->sk_err = err;
			sk->sk_error_report(sk);
			tcp_done(sk);
		} else {
			sk->sk_err_soft = err;
		}
		goto out;
	}

	/* If we've already connected we will keep trying
	 * until we time out, or the user gives up.
	 *
	 * rfc1122 4.2.3.9 allows to consider as hard errors
	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
	 * but it is obsoleted by pmtu discovery).
	 *
	 * Note, that in modern internet, where routing is unreliable
	 * and in each dark corner broken firewalls sit, sending random
	 * errors ordered by their masters even these two messages finally lose
	 * their original sense (even Linux sends invalid PORT_UNREACHs)
	 *
	 * Now we are in compliance with RFCs.
	 *							--ANK (980905)
	 */
	inet = inet_sk(sk);
	if (!sock_owned_by_user(sk) && inet->recverr) {
		sk->sk_err = err;
		sk->sk_error_report(sk);
	} else	{ /* Only an error on timeout */
		sk->sk_err_soft = err;
	}

out:
	bh_unlock_sock(sk);
	sock_put(sk);
}

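/*
 * A worked example for the backoff revert in tcp_v4_err() (illustrative):
 * since icsk_rto = base_rto << icsk_backoff, decrementing icsk_backoff
 * halves the pending timeout; the timer is then re-armed with whatever
 * part of the halved RTO has not already elapsed for the oldest unacked
 * skb, or fires immediately if it has been used up.
 */
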
void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
{
	struct tcphdr *th = tcp_hdr(skb);

	if (skb->ip_summed == CHECKSUM_PARTIAL) {
		th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
		skb->csum_start = skb_transport_header(skb) - skb->head;
		skb->csum_offset = offsetof(struct tcphdr, check);
	} else {
		th->check = tcp_v4_check(skb->len, saddr, daddr,
					 csum_partial(th,
						      th->doff << 2,
						      skb->csum));
	}
}

/* This routine computes an IPv4 TCP checksum. */
void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
{
	const struct inet_sock *inet = inet_sk(sk);

	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
}
EXPORT_SYMBOL(tcp_v4_send_check);

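/*
 * Note on the CHECKSUM_PARTIAL branch above: the device is expected to
 * finish the checksum, so only the (inverted) pseudo-header sum is stored
 * in th->check, and csum_start/csum_offset tell the hardware where to
 * write the final result; the else branch computes the full checksum in
 * software.
 */
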
/*
 *	This routine will send an RST to the other tcp.
 *
 *	Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
 *		      for reset.
 *	Answer: if a packet caused RST, it is not for a socket
 *		existing in our system, if it is matched to a socket,
 *		it is just duplicate segment or bug in other side's TCP.
 *		So that we build reply only basing on parameters
 *		arrived with segment.
 *	Exception: precedence violation. We do not implement it in any case.
 */
static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
{
	const struct tcphdr *th = tcp_hdr(skb);
	struct {
		struct tcphdr th;
#ifdef CONFIG_TCP_MD5SIG
		__be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
#endif
	} rep;
	struct ip_reply_arg arg;
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_md5sig_key *key;
	const __u8 *hash_location = NULL;
	unsigned char newhash[16];
	int genhash;
	struct sock *sk1 = NULL;
#endif
	struct net *net;

	/* Never send a reset in response to a reset. */
	if (th->rst)
		return;

	if (skb_rtable(skb)->rt_type != RTN_LOCAL)
		return;

	/* Swap the send and the receive. */
	memset(&rep, 0, sizeof(rep));
	rep.th.dest   = th->source;
	rep.th.source = th->dest;
	rep.th.doff   = sizeof(struct tcphdr) / 4;
	rep.th.rst    = 1;

	if (th->ack) {
		rep.th.seq = th->ack_seq;
	} else {
		rep.th.ack = 1;
		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
				       skb->len - (th->doff << 2));
	}

	memset(&arg, 0, sizeof(arg));
	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);

#ifdef CONFIG_TCP_MD5SIG
	hash_location = tcp_parse_md5sig_option(th);
	if (!sk && hash_location) {
		/*
		 * active side is lost. Try to find listening socket through
		 * source port, and then find md5 key through listening socket.
		 * we do not lose security here:
		 * Incoming packet is checked with md5 hash with finding key,
		 * no RST generated if md5 hash doesn't match.
		 */
		sk1 = __inet_lookup_listener(dev_net(skb_dst(skb)->dev),
					     &tcp_hashinfo, ip_hdr(skb)->saddr,
					     th->source, ip_hdr(skb)->daddr,
					     ntohs(th->source), inet_iif(skb));
		/* don't send rst if it can't find key */
		if (!sk1)
			return;
		rcu_read_lock();
		key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
					&ip_hdr(skb)->saddr, AF_INET);
		if (!key)
			goto release_sk1;

		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, NULL, skb);
		if (genhash || memcmp(hash_location, newhash, 16) != 0)
			goto release_sk1;
	} else {
		key = sk ? tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
					     &ip_hdr(skb)->saddr,
					     AF_INET) : NULL;
	}

	if (key) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
				   (TCPOPT_NOP << 16) |
				   (TCPOPT_MD5SIG << 8) |
				   TCPOLEN_MD5SIG);
		/* Update length and the length the header thinks exists */
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len / 4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
				    key, ip_hdr(skb)->saddr,
				    ip_hdr(skb)->daddr, &rep.th);
	}
#endif
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0;
	/* When socket is gone, all binding information is lost.
	 * routing might fail in this case. No choice here, if we choose to force
	 * input interface, we will misroute in case of asymmetric route.
	 */
	if (sk)
		arg.bound_dev_if = sk->sk_bound_dev_if;

	net = dev_net(skb_dst(skb)->dev);
	arg.tos = ip_hdr(skb)->tos;
	ip_send_unicast_reply(net, skb, ip_hdr(skb)->saddr,
			      ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len);

	TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
	TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);

#ifdef CONFIG_TCP_MD5SIG
release_sk1:
	if (sk1) {
		rcu_read_unlock();
		sock_put(sk1);
	}
#endif
}

/* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
   outside socket context is ugly, certainly. What can I do?
 */
static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
			    u32 win, u32 tsval, u32 tsecr, int oif,
			    struct tcp_md5sig_key *key,
			    int reply_flags, u8 tos)
{
	const struct tcphdr *th = tcp_hdr(skb);
	struct {
		struct tcphdr th;
		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
#ifdef CONFIG_TCP_MD5SIG
			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
#endif
			];
	} rep;
	struct ip_reply_arg arg;
	struct net *net = dev_net(skb_dst(skb)->dev);

	memset(&rep.th, 0, sizeof(struct tcphdr));
	memset(&arg, 0, sizeof(arg));

	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);
	if (tsecr) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
				   (TCPOPT_TIMESTAMP << 8) |
				   TCPOLEN_TIMESTAMP);
		rep.opt[1] = htonl(tsval);
		rep.opt[2] = htonl(tsecr);
		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
	}

	/* Swap the send and the receive. */
	rep.th.dest    = th->source;
	rep.th.source  = th->dest;
	rep.th.doff    = arg.iov[0].iov_len / 4;
	rep.th.seq     = htonl(seq);
	rep.th.ack_seq = htonl(ack);
	rep.th.ack     = 1;
	rep.th.window  = htons(win);

#ifdef CONFIG_TCP_MD5SIG
	if (key) {
		int offset = (tsecr) ? 3 : 0;

		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
					  (TCPOPT_NOP << 16) |
					  (TCPOPT_MD5SIG << 8) |
					  TCPOLEN_MD5SIG);
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len/4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
				    key, ip_hdr(skb)->saddr,
				    ip_hdr(skb)->daddr, &rep.th);
	}
#endif
	arg.flags = reply_flags;
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	if (oif)
		arg.bound_dev_if = oif;
	arg.tos = tos;
	ip_send_unicast_reply(net, skb, ip_hdr(skb)->saddr,
			      ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len);

	TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
}

static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
{
	struct inet_timewait_sock *tw = inet_twsk(sk);
	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);

	tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
			tcp_time_stamp + tcptw->tw_ts_offset,
			tcptw->tw_ts_recent,
			tw->tw_bound_dev_if,
			tcp_twsk_md5_key(tcptw),
			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
			tw->tw_tos);

	inet_twsk_put(tw);
}

static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req)
{
	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
	 */
	tcp_v4_send_ack(skb, (sk->sk_state == TCP_LISTEN) ?
			tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt,
			tcp_rsk(req)->rcv_nxt, req->rcv_wnd,
			tcp_time_stamp,
			req->ts_recent,
			0,
			tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr,
					  AF_INET),
			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
			ip_hdr(skb)->tos);
}

/*
 *	Send a SYN-ACK after having received a SYN.
 *	This still operates on a request_sock only, not on a big
 *	socket.
 */
static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
			      struct flowi *fl,
			      struct request_sock *req,
			      u16 queue_mapping,
			      struct tcp_fastopen_cookie *foc)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	struct flowi4 fl4;
	int err = -1;
	struct sk_buff *skb;

	/* First, grab a route. */
	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
		return -1;

	skb = tcp_make_synack(sk, dst, req, foc);

	if (skb) {
		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);

		skb_set_queue_mapping(skb, queue_mapping);
		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
					    ireq->ir_rmt_addr,
					    ireq->opt);
		err = net_xmit_eval(err);
	}

	return err;
}

/*
 *	IPv4 request_sock destructor.
 */
static void tcp_v4_reqsk_destructor(struct request_sock *req)
{
	kfree(inet_rsk(req)->opt);
}

/*
 * Return true if a syncookie should be sent
 */
bool tcp_syn_flood_action(struct sock *sk,
			  const struct sk_buff *skb,
			  const char *proto)
{
	const char *msg = "Dropping request";
	bool want_cookie = false;
	struct listen_sock *lopt;

#ifdef CONFIG_SYN_COOKIES
	if (sysctl_tcp_syncookies) {
		msg = "Sending cookies";
		want_cookie = true;
		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDOCOOKIES);
	} else
#endif
		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP);

	lopt = inet_csk(sk)->icsk_accept_queue.listen_opt;
	if (!lopt->synflood_warned && sysctl_tcp_syncookies != 2) {
		lopt->synflood_warned = 1;
		pr_info("%s: Possible SYN flooding on port %d. %s.  Check SNMP counters.\n",
			proto, ntohs(tcp_hdr(skb)->dest), msg);
	}
	return want_cookie;
}
EXPORT_SYMBOL(tcp_syn_flood_action);

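/*
 * For reference: sysctl_tcp_syncookies is a tristate - 0 never sends
 * cookies, 1 sends them only when the listen queue overflows (the case
 * handled here), and 2 sends them unconditionally, which is why the
 * warning above is suppressed for that setting.
 */
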
/* Save and compile IPv4 options into the request_sock if needed. */
static struct ip_options_rcu *tcp_v4_save_options(struct sk_buff *skb)
{
	const struct ip_options *opt = &(IPCB(skb)->opt);
	struct ip_options_rcu *dopt = NULL;

	if (opt && opt->optlen) {
		int opt_size = sizeof(*dopt) + opt->optlen;

		dopt = kmalloc(opt_size, GFP_ATOMIC);
		if (dopt && ip_options_echo(&dopt->opt, skb)) {
			kfree(dopt);
			dopt = NULL;
		}
	}
	return dopt;
}

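/*
 * Why the echo matters: ip_options_echo() builds the reversed option
 * block, so for source-routed SYNs the SYN-ACK and the child socket can
 * travel back along the route the SYN arrived on.
 */
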
#ifdef CONFIG_TCP_MD5SIG
/*
 * RFC2385 MD5 checksumming requires a mapping of
 * IP address->MD5 Key.
 * We need to maintain these in the sk structure.
 */

/* Find the Key structure for an address.  */
struct tcp_md5sig_key *tcp_md5_do_lookup(struct sock *sk,
					 const union tcp_md5_addr *addr,
					 int family)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	unsigned int size = sizeof(struct in_addr);
	struct tcp_md5sig_info *md5sig;

	/* caller either holds rcu_read_lock() or socket lock */
	md5sig = rcu_dereference_check(tp->md5sig_info,
				       sock_owned_by_user(sk) ||
				       lockdep_is_held(&sk->sk_lock.slock));
	if (!md5sig)
		return NULL;
#if IS_ENABLED(CONFIG_IPV6)
	if (family == AF_INET6)
		size = sizeof(struct in6_addr);
#endif
	hlist_for_each_entry_rcu(key, &md5sig->head, node) {
		if (key->family != family)
			continue;
		if (!memcmp(&key->addr, addr, size))
			return key;
	}
	return NULL;
}
EXPORT_SYMBOL(tcp_md5_do_lookup);

struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
					 struct sock *addr_sk)
{
	union tcp_md5_addr *addr;

	addr = (union tcp_md5_addr *)&inet_sk(addr_sk)->inet_daddr;
	return tcp_md5_do_lookup(sk, addr, AF_INET);
}
EXPORT_SYMBOL(tcp_v4_md5_lookup);

static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk,
						      struct request_sock *req)
{
	union tcp_md5_addr *addr;

	addr = (union tcp_md5_addr *)&inet_rsk(req)->ir_rmt_addr;
	return tcp_md5_do_lookup(sk, addr, AF_INET);
}

/* This can be called on a newly created socket, from other files */
int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
		   int family, const u8 *newkey, u8 newkeylen, gfp_t gfp)
{
	/* Add Key to the list */
	struct tcp_md5sig_key *key;
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_info *md5sig;

	key = tcp_md5_do_lookup(sk, addr, family);
	if (key) {
		/* Pre-existing entry - just update that one. */
		memcpy(key->key, newkey, newkeylen);
		key->keylen = newkeylen;
		return 0;
	}

	md5sig = rcu_dereference_protected(tp->md5sig_info,
					   sock_owned_by_user(sk));
	if (!md5sig) {
		md5sig = kmalloc(sizeof(*md5sig), gfp);
		if (!md5sig)
			return -ENOMEM;

		sk_nocaps_add(sk, NETIF_F_GSO_MASK);
		INIT_HLIST_HEAD(&md5sig->head);
		rcu_assign_pointer(tp->md5sig_info, md5sig);
	}

	key = sock_kmalloc(sk, sizeof(*key), gfp);
	if (!key)
		return -ENOMEM;
	if (!tcp_alloc_md5sig_pool()) {
		sock_kfree_s(sk, key, sizeof(*key));
		return -ENOMEM;
	}

	memcpy(key->key, newkey, newkeylen);
	key->keylen = newkeylen;
	key->family = family;
	memcpy(&key->addr, addr,
	       (family == AF_INET6) ? sizeof(struct in6_addr) :
				      sizeof(struct in_addr));
	hlist_add_head_rcu(&key->node, &md5sig->head);
	return 0;
}
EXPORT_SYMBOL(tcp_md5_do_add);

int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family)
{
	struct tcp_md5sig_key *key;

	key = tcp_md5_do_lookup(sk, addr, family);
	if (!key)
		return -ENOENT;
	hlist_del_rcu(&key->node);
	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
	kfree_rcu(key, rcu);
	return 0;
}
EXPORT_SYMBOL(tcp_md5_do_del);

static void tcp_clear_md5_list(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	struct hlist_node *n;
	struct tcp_md5sig_info *md5sig;

	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);

	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
		hlist_del_rcu(&key->node);
		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
		kfree_rcu(key, rcu);
	}
}

static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
				 int optlen)
{
	struct tcp_md5sig cmd;
	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;

	if (optlen < sizeof(cmd))
		return -EINVAL;
	if (copy_from_user(&cmd, optval, sizeof(cmd)))
		return -EFAULT;
	if (sin->sin_family != AF_INET)
		return -EINVAL;

	if (!cmd.tcpm_key || !cmd.tcpm_keylen)
		return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
				      AF_INET);

	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
		return -EINVAL;

	return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
			      AF_INET, cmd.tcpm_key, cmd.tcpm_keylen,
			      GFP_KERNEL);
}

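/*
 * Userspace view (illustrative sketch; peer_sin/secret/fd are example
 * names, not kernel symbols): a key is installed per peer address with
 * the TCP_MD5SIG socket option before traffic flows, e.g.
 *
 *	struct tcp_md5sig md5 = { };
 *	memcpy(&md5.tcpm_addr, &peer_sin, sizeof(peer_sin));
 *	md5.tcpm_keylen = strlen(secret);
 *	memcpy(md5.tcpm_key, secret, md5.tcpm_keylen);
 *	setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
 *
 * A zero tcpm_keylen deletes the key for that address, matching the
 * tcp_md5_do_del() path above.
 */
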
static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
					__be32 daddr, __be32 saddr, int nbytes)
{
	struct tcp4_pseudohdr *bp;
	struct scatterlist sg;

	bp = &hp->md5_blk.ip4;

	/*
	 * 1. the TCP pseudo-header (in the order: source IP address,
	 * destination IP address, zero-padded protocol number, and
	 * segment length)
	 */
	bp->saddr = saddr;
	bp->daddr = daddr;
	bp->pad = 0;
	bp->protocol = IPPROTO_TCP;
	bp->len = cpu_to_be16(nbytes);

	sg_init_one(&sg, bp, sizeof(*bp));
	return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp));
}

static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
{
	struct tcp_md5sig_pool *hp;
	struct hash_desc *desc;

	hp = tcp_get_md5sig_pool();
	if (!hp)
		goto clear_hash_noput;
	desc = &hp->md5_desc;

	if (crypto_hash_init(desc))
		goto clear_hash;
	if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2))
		goto clear_hash;
	if (tcp_md5_hash_header(hp, th))
		goto clear_hash;
	if (tcp_md5_hash_key(hp, key))
		goto clear_hash;
	if (crypto_hash_final(desc, md5_hash))
		goto clear_hash;

	tcp_put_md5sig_pool();
	return 0;

clear_hash:
	tcp_put_md5sig_pool();
clear_hash_noput:
	memset(md5_hash, 0, 16);
	return 1;
}

int tcp_v4_md5_hash_skb(char *md5_hash, struct tcp_md5sig_key *key,
			const struct sock *sk, const struct request_sock *req,
			const struct sk_buff *skb)
{
	struct tcp_md5sig_pool *hp;
	struct hash_desc *desc;
	const struct tcphdr *th = tcp_hdr(skb);
	__be32 saddr, daddr;

	if (sk) {
		saddr = inet_sk(sk)->inet_saddr;
		daddr = inet_sk(sk)->inet_daddr;
	} else if (req) {
		saddr = inet_rsk(req)->ir_loc_addr;
		daddr = inet_rsk(req)->ir_rmt_addr;
	} else {
		const struct iphdr *iph = ip_hdr(skb);
		saddr = iph->saddr;
		daddr = iph->daddr;
	}

	hp = tcp_get_md5sig_pool();
	if (!hp)
		goto clear_hash_noput;
	desc = &hp->md5_desc;

	if (crypto_hash_init(desc))
		goto clear_hash;
	if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, skb->len))
		goto clear_hash;
	if (tcp_md5_hash_header(hp, th))
		goto clear_hash;
	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
		goto clear_hash;
	if (tcp_md5_hash_key(hp, key))
		goto clear_hash;
	if (crypto_hash_final(desc, md5_hash))
		goto clear_hash;

	tcp_put_md5sig_pool();
	return 0;

clear_hash:
	tcp_put_md5sig_pool();
clear_hash_noput:
	memset(md5_hash, 0, 16);
	return 1;
}
EXPORT_SYMBOL(tcp_v4_md5_hash_skb);

static bool tcp_v4_inbound_md5_hash(struct sock *sk, const struct sk_buff *skb)
{
	/*
	 * This gets called for each TCP segment that arrives
	 * so we want to be efficient.
	 * We have 3 drop cases:
	 * o No MD5 hash and one expected.
	 * o MD5 hash and we're not expecting one.
	 * o MD5 hash and it's wrong.
	 */
	const __u8 *hash_location = NULL;
	struct tcp_md5sig_key *hash_expected;
	const struct iphdr *iph = ip_hdr(skb);
	const struct tcphdr *th = tcp_hdr(skb);
	int genhash;
	unsigned char newhash[16];

	hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
					  AF_INET);
	hash_location = tcp_parse_md5sig_option(th);

	/* We've parsed the options - do we have a hash? */
	if (!hash_expected && !hash_location)
		return false;

	if (hash_expected && !hash_location) {
		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
		return true;
	}

	if (!hash_expected && hash_location) {
		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
		return true;
	}

	/* Okay, so this is hash_expected and hash_location -
	 * so we need to calculate the checksum.
	 */
	genhash = tcp_v4_md5_hash_skb(newhash,
				      hash_expected,
				      NULL, NULL, skb);

	if (genhash || memcmp(hash_location, newhash, 16) != 0) {
		net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
				     &iph->saddr, ntohs(th->source),
				     &iph->daddr, ntohs(th->dest),
				     genhash ? " tcp_v4_calc_md5_hash failed"
				     : "");
		return true;
	}
	return false;
}
#endif

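/*
 * Note: per RFC 2385 a segment that fails the digest check is dropped
 * silently (the "return true" paths above); no RST or ICMP is generated,
 * so a misconfigured key looks like a blackholed connection.
 */
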
static void tcp_v4_init_req(struct request_sock *req, struct sock *sk,
			    struct sk_buff *skb)
{
	struct inet_request_sock *ireq = inet_rsk(req);

	ireq->ir_loc_addr = ip_hdr(skb)->daddr;
	ireq->ir_rmt_addr = ip_hdr(skb)->saddr;
	ireq->no_srccheck = inet_sk(sk)->transparent;
	ireq->opt = tcp_v4_save_options(skb);
}

static struct dst_entry *tcp_v4_route_req(struct sock *sk, struct flowi *fl,
					  const struct request_sock *req,
					  bool *strict)
{
	struct dst_entry *dst = inet_csk_route_req(sk, &fl->u.ip4, req);

	if (strict)
		*strict = (fl->u.ip4.daddr == inet_rsk(req)->ir_rmt_addr);

	return dst;
}

struct request_sock_ops tcp_request_sock_ops __read_mostly = {
	.family		=	PF_INET,
	.obj_size	=	sizeof(struct tcp_request_sock),
	.rtx_syn_ack	=	tcp_rtx_synack,
	.send_ack	=	tcp_v4_reqsk_send_ack,
	.destructor	=	tcp_v4_reqsk_destructor,
	.send_reset	=	tcp_v4_send_reset,
	.syn_ack_timeout =	tcp_syn_ack_timeout,
};

static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
	.mss_clamp	=	TCP_MSS_DEFAULT,
#ifdef CONFIG_TCP_MD5SIG
	.md5_lookup	=	tcp_v4_reqsk_md5_lookup,
	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
#endif
	.init_req	=	tcp_v4_init_req,
#ifdef CONFIG_SYN_COOKIES
	.cookie_init_seq =	cookie_v4_init_sequence,
#endif
	.route_req	=	tcp_v4_route_req,
	.init_seq	=	tcp_v4_init_sequence,
	.send_synack	=	tcp_v4_send_synack,
	.queue_hash_add =	inet_csk_reqsk_queue_hash_add,
};

int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
	/* Never answer to SYNs sent to broadcast or multicast */
	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
		goto drop;

	return tcp_conn_request(&tcp_request_sock_ops,
				&tcp_request_sock_ipv4_ops, sk, skb);

drop:
	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
	return 0;
}
EXPORT_SYMBOL(tcp_v4_conn_request);

/*
 * The three way handshake has completed - we got a valid synack -
 * now create the new socket.
 */
struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req,
				  struct dst_entry *dst)
{
	struct inet_request_sock *ireq;
	struct inet_sock *newinet;
	struct tcp_sock *newtp;
	struct sock *newsk;
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_md5sig_key *key;
#endif
	struct ip_options_rcu *inet_opt;

	if (sk_acceptq_is_full(sk))
		goto exit_overflow;

	newsk = tcp_create_openreq_child(sk, req, skb);
	if (!newsk)
		goto exit_nonewsk;

	newsk->sk_gso_type = SKB_GSO_TCPV4;
	inet_sk_rx_dst_set(newsk, skb);

	newtp		      = tcp_sk(newsk);
	newinet		      = inet_sk(newsk);
	ireq		      = inet_rsk(req);
	newinet->inet_daddr   = ireq->ir_rmt_addr;
	newinet->inet_rcv_saddr = ireq->ir_loc_addr;
	newinet->inet_saddr   = ireq->ir_loc_addr;
	inet_opt	      = ireq->opt;
	rcu_assign_pointer(newinet->inet_opt, inet_opt);
	ireq->opt	      = NULL;
	newinet->mc_index     = inet_iif(skb);
	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
	newinet->rcv_tos      = ip_hdr(skb)->tos;
	inet_csk(newsk)->icsk_ext_hdr_len = 0;
	inet_set_txhash(newsk);
	if (inet_opt)
		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
	newinet->inet_id = newtp->write_seq ^ jiffies;

	if (!dst) {
		dst = inet_csk_route_child_sock(sk, newsk, req);
		if (!dst)
			goto put_and_exit;
	} else {
		/* syncookie case : see end of cookie_v4_check() */
	}
	sk_setup_caps(newsk, dst);

	tcp_sync_mss(newsk, dst_mtu(dst));
	newtp->advmss = dst_metric_advmss(dst);
	if (tcp_sk(sk)->rx_opt.user_mss &&
	    tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
		newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;

	tcp_initialize_rcv_mss(newsk);

#ifdef CONFIG_TCP_MD5SIG
	/* Copy over the MD5 key from the original socket */
	key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
				AF_INET);
	if (key != NULL) {
		/*
		 * We're using one, so create a matching key
		 * on the newsk structure. If we fail to get
		 * memory, then we end up not copying the key
		 * across. Shucks.
		 */
		tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
			       AF_INET, key->key, key->keylen, GFP_ATOMIC);
		sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
	}
#endif

	if (__inet_inherit_port(sk, newsk) < 0)
		goto put_and_exit;
	__inet_hash_nolisten(newsk, NULL);

	return newsk;

exit_overflow:
	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
exit_nonewsk:
	dst_release(dst);
exit:
	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
	return NULL;
put_and_exit:
	inet_csk_prepare_forced_close(newsk);
	tcp_done(newsk);
	goto exit;
}
EXPORT_SYMBOL(tcp_v4_syn_recv_sock);

static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
{
	struct tcphdr *th = tcp_hdr(skb);
	const struct iphdr *iph = ip_hdr(skb);
	struct sock *nsk;
	struct request_sock **prev;
	/* Find possible connection requests. */
	struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
						       iph->saddr, iph->daddr);
	if (req)
		return tcp_check_req(sk, skb, req, prev, false);

	nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr,
			th->source, iph->daddr, th->dest, inet_iif(skb));
	if (nsk) {
		if (nsk->sk_state != TCP_TIME_WAIT) {
			bh_lock_sock(nsk);
			return nsk;
		}
		inet_twsk_put(inet_twsk(nsk));
		return NULL;
	}

#ifdef CONFIG_SYN_COOKIES
	if (!th->syn)
		sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
#endif
	return sk;
}

/* The socket must have its spinlock held when we get
 * here, unless it is a TCP_LISTEN socket.
 * We have a potential double-lock case here, so even when
 * doing backlog processing we use the BH locking scheme.
 * This is because we cannot sleep with the original spinlock
 * held.
 */
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
	struct sock *rsk;
#ifdef CONFIG_TCP_MD5SIG
	/* We really want to reject the packet as early as possible if:
	 *  o We're expecting an MD5'd packet and this is no MD5 tcp option
	 *  o There is an MD5 option and we're not expecting one
	 */
	if (tcp_v4_inbound_md5_hash(sk, skb))
		goto discard;
#endif
	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
		struct dst_entry *dst = sk->sk_rx_dst;

		sock_rps_save_rxhash(sk, skb);
		if (dst && (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
			    dst->ops->check(dst, 0) == NULL)) {
			dst_release(dst);
			sk->sk_rx_dst = NULL;
		}
		tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len);
		return 0;
	}

	if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
		goto csum_err;

	if (sk->sk_state == TCP_LISTEN) {
		struct sock *nsk = tcp_v4_hnd_req(sk, skb);
		if (!nsk)
			goto discard;
		if (nsk != sk) {
			sock_rps_save_rxhash(nsk, skb);
			if (tcp_child_process(sk, nsk, skb)) {
				rsk = nsk;
				goto reset;
			}
			return 0;
		}
	} else
		sock_rps_save_rxhash(sk, skb);

	if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
		rsk = sk;
		goto reset;
	}
	return 0;

reset:
	tcp_v4_send_reset(rsk, skb);
discard:
	kfree_skb(skb);
	/* Be careful here. If this function gets more complicated and
	 * gcc suffers from register pressure on the x86, sk (in %ebx)
	 * might be destroyed here. This current version compiles correctly,
	 * but you have been warned.
	 */
	return 0;

csum_err:
	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_CSUMERRORS);
	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
	goto discard;
}
EXPORT_SYMBOL(tcp_v4_do_rcv);

void tcp_v4_early_demux(struct sk_buff *skb)
{
	const struct iphdr *iph;
	const struct tcphdr *th;
	struct sock *sk;

	if (skb->pkt_type != PACKET_HOST)
		return;
	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
		return;

	iph = ip_hdr(skb);
	th = tcp_hdr(skb);
	if (th->doff < sizeof(struct tcphdr) / 4)
		return;

	sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
				       iph->saddr, th->source,
				       iph->daddr, ntohs(th->dest),
				       skb->skb_iif);
	if (sk) {
		skb->sk = sk;
		skb->destructor = sock_edemux;
		if (sk->sk_state != TCP_TIME_WAIT) {
			struct dst_entry *dst = sk->sk_rx_dst;

			if (dst)
				dst = dst_check(dst, 0);
			if (dst &&
			    inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
				skb_dst_set_noref(skb, dst);
		}
	}
}

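/*
 * Early demux runs from the IP receive path before any routing decision:
 * if the established socket is found here, its cached sk_rx_dst can be
 * attached to the skb and the per-packet route lookup is skipped.
 */
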
/* Packet is added to VJ-style prequeue for processing in process
 * context, if a reader task is waiting. Apparently, this exciting
 * idea (VJ's mail "Re: query about TCP header on tcp-ip" of 07 Sep 93)
 * failed somewhere. Latency? Burstiness? Well, at least now we will
 * see, why it failed. 8)8)				  --ANK
 */
bool tcp_prequeue(struct sock *sk, struct sk_buff *skb)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (sysctl_tcp_low_latency || !tp->ucopy.task)
		return false;

	if (skb->len <= tcp_hdrlen(skb) &&
	    skb_queue_len(&tp->ucopy.prequeue) == 0)
		return false;

	skb_dst_force(skb);
	__skb_queue_tail(&tp->ucopy.prequeue, skb);
	tp->ucopy.memory += skb->truesize;
	if (tp->ucopy.memory > sk->sk_rcvbuf) {
		struct sk_buff *skb1;

		BUG_ON(sock_owned_by_user(sk));

		while ((skb1 = __skb_dequeue(&tp->ucopy.prequeue)) != NULL) {
			sk_backlog_rcv(sk, skb1);
			NET_INC_STATS_BH(sock_net(sk),
					 LINUX_MIB_TCPPREQUEUEDROPPED);
		}

		tp->ucopy.memory = 0;
	} else if (skb_queue_len(&tp->ucopy.prequeue) == 1) {
		wake_up_interruptible_sync_poll(sk_sleep(sk),
					   POLLIN | POLLRDNORM | POLLRDBAND);
		if (!inet_csk_ack_scheduled(sk))
			inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
						  (3 * tcp_rto_min(sk)) / 4,
						  TCP_RTO_MAX);
	}
	return true;
}
EXPORT_SYMBOL(tcp_prequeue);

/*
 *	From tcp_input.c
 */
int tcp_v4_rcv(struct sk_buff *skb)
{
	const struct iphdr *iph;
	const struct tcphdr *th;
	struct sock *sk;
	int ret;
	struct net *net = dev_net(skb->dev);

	if (skb->pkt_type != PACKET_HOST)
		goto discard_it;

	/* Count it even if it's bad */
	TCP_INC_STATS_BH(net, TCP_MIB_INSEGS);

	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
		goto discard_it;

	th = tcp_hdr(skb);
	if (th->doff < sizeof(struct tcphdr) / 4)
		goto bad_packet;
	if (!pskb_may_pull(skb, th->doff * 4))
		goto discard_it;

	/* An explanation is required here, I think.
	 * Packet length and doff are validated by header prediction,
	 * provided case of th->doff==0 is eliminated.
	 * So, we defer the checks. */

	if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
		goto csum_error;

	th = tcp_hdr(skb);
	iph = ip_hdr(skb);
	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
				    skb->len - th->doff * 4);
	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
	TCP_SKB_CB(skb)->when	 = 0;
	TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
	TCP_SKB_CB(skb)->sacked	 = 0;

	sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
	if (!sk)
		goto no_tcp_socket;

process:
	if (sk->sk_state == TCP_TIME_WAIT)
		goto do_time_wait;

	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
		NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
		goto discard_and_relse;
	}

	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
		goto discard_and_relse;
	nf_reset(skb);

	if (sk_filter(sk, skb))
		goto discard_and_relse;

	sk_mark_napi_id(sk, skb);
	skb->dev = NULL;

	bh_lock_sock_nested(sk);
	ret = 0;
	if (!sock_owned_by_user(sk)) {
#ifdef CONFIG_NET_DMA
		struct tcp_sock *tp = tcp_sk(sk);
		if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
			tp->ucopy.dma_chan = net_dma_find_channel();
		if (tp->ucopy.dma_chan)
			ret = tcp_v4_do_rcv(sk, skb);
		else
#endif
		{
			if (!tcp_prequeue(sk, skb))
				ret = tcp_v4_do_rcv(sk, skb);
		}
	} else if (unlikely(sk_add_backlog(sk, skb,
					   sk->sk_rcvbuf + sk->sk_sndbuf))) {
		bh_unlock_sock(sk);
		NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
		goto discard_and_relse;
	}
	bh_unlock_sock(sk);

	sock_put(sk);
	return ret;

no_tcp_socket:
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
		goto discard_it;

	if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
csum_error:
		TCP_INC_STATS_BH(net, TCP_MIB_CSUMERRORS);
bad_packet:
		TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
	} else {
		tcp_v4_send_reset(NULL, skb);
	}

discard_it:
	/* Discard frame. */
	kfree_skb(skb);
	return 0;

discard_and_relse:
	sock_put(sk);
	goto discard_it;

do_time_wait:
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
		inet_twsk_put(inet_twsk(sk));
		goto discard_it;
	}

	if (skb->len < (th->doff << 2)) {
		inet_twsk_put(inet_twsk(sk));
		goto bad_packet;
	}
	if (tcp_checksum_complete(skb)) {
		inet_twsk_put(inet_twsk(sk));
		goto csum_error;
	}
	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
	case TCP_TW_SYN: {
		struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
							&tcp_hashinfo,
							iph->saddr, th->source,
							iph->daddr, th->dest,
							inet_iif(skb));
		if (sk2) {
			inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
			inet_twsk_put(inet_twsk(sk));
			sk = sk2;
			goto process;
		}
		/* Fall through to ACK */
	}
	case TCP_TW_ACK:
		tcp_v4_timewait_ack(sk, skb);
		break;
	case TCP_TW_RST:
		goto no_tcp_socket;
	case TCP_TW_SUCCESS:;
	}
	goto discard_it;
}

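/*
 * Note on TCP_TW_SYN above: a new SYN that is acceptable for a tuple in
 * TIME-WAIT (its sequence number lies beyond the old connection's window)
 * kills the timewait socket and is handed to the listener, allowing rapid
 * connection reuse without waiting out the full 2*MSL.
 */
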
static struct timewait_sock_ops tcp_timewait_sock_ops = {
	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
	.twsk_unique	= tcp_twsk_unique,
	.twsk_destructor= tcp_twsk_destructor,
};

void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);

	dst_hold(dst);
	sk->sk_rx_dst = dst;
	inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
}
EXPORT_SYMBOL(inet_sk_rx_dst_set);

const struct inet_connection_sock_af_ops ipv4_specific = {
	.queue_xmit	   = ip_queue_xmit,
	.send_check	   = tcp_v4_send_check,
	.rebuild_header	   = inet_sk_rebuild_header,
	.sk_rx_dst_set	   = inet_sk_rx_dst_set,
	.conn_request	   = tcp_v4_conn_request,
	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
	.net_header_len	   = sizeof(struct iphdr),
	.setsockopt	   = ip_setsockopt,
	.getsockopt	   = ip_getsockopt,
	.addr2sockaddr	   = inet_csk_addr2sockaddr,
	.sockaddr_len	   = sizeof(struct sockaddr_in),
	.bind_conflict	   = inet_csk_bind_conflict,
#ifdef CONFIG_COMPAT
	.compat_setsockopt = compat_ip_setsockopt,
	.compat_getsockopt = compat_ip_getsockopt,
#endif
};
EXPORT_SYMBOL(ipv4_specific);

#ifdef CONFIG_TCP_MD5SIG
static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
	.md5_lookup	= tcp_v4_md5_lookup,
	.calc_md5_hash	= tcp_v4_md5_hash_skb,
	.md5_parse	= tcp_v4_parse_md5_keys,
};
#endif

/* NOTE: A lot of things set to zero explicitly by call to
 *	 sk_alloc() so need not be done here.
 */
static int tcp_v4_init_sock(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);

	tcp_init_sock(sk);

	icsk->icsk_af_ops = &ipv4_specific;

#ifdef CONFIG_TCP_MD5SIG
	tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
#endif

	return 0;
}

void tcp_v4_destroy_sock(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);

	tcp_clear_xmit_timers(sk);

	tcp_cleanup_congestion_control(sk);

	/* Clean up the write buffer. */
	tcp_write_queue_purge(sk);

	/* Cleans up our, hopefully empty, out_of_order_queue. */
	__skb_queue_purge(&tp->out_of_order_queue);

#ifdef CONFIG_TCP_MD5SIG
	/* Clean up the MD5 key list, if any */
	if (tp->md5sig_info) {
		tcp_clear_md5_list(sk);
		kfree_rcu(tp->md5sig_info, rcu);
		tp->md5sig_info = NULL;
	}
#endif

#ifdef CONFIG_NET_DMA
	/* Cleans up our sk_async_wait_queue */
	__skb_queue_purge(&sk->sk_async_wait_queue);
#endif

	/* Clean prequeue, it must be empty really */
	__skb_queue_purge(&tp->ucopy.prequeue);

	/* Clean up a referenced TCP bind bucket. */
	if (inet_csk(sk)->icsk_bind_hash)
		inet_put_port(sk);

	BUG_ON(tp->fastopen_rsk != NULL);

	/* If socket is aborted during connect operation */
	tcp_free_fastopen_req(tp);

	sk_sockets_allocated_dec(sk);
	sock_release_memcg(sk);
}
EXPORT_SYMBOL(tcp_v4_destroy_sock);

#ifdef CONFIG_PROC_FS
/* Proc filesystem TCP sock list dumping. */

/*
 * Get next listener socket follow cur.  If cur is NULL, get first socket
 * starting from bucket given in st->bucket; when st->bucket is zero the
 * very first socket in the hash table is returned.
 */
static void *listening_get_next(struct seq_file *seq, void *cur)
{
	struct inet_connection_sock *icsk;
	struct hlist_nulls_node *node;
	struct sock *sk = cur;
	struct inet_listen_hashbucket *ilb;
	struct tcp_iter_state *st = seq->private;
	struct net *net = seq_file_net(seq);

	if (!sk) {
		ilb = &tcp_hashinfo.listening_hash[st->bucket];
		spin_lock_bh(&ilb->lock);
		sk = sk_nulls_head(&ilb->head);
		st->offset = 0;
		goto get_sk;
	}
	ilb = &tcp_hashinfo.listening_hash[st->bucket];
	++st->num;
	++st->offset;

	if (st->state == TCP_SEQ_STATE_OPENREQ) {
		struct request_sock *req = cur;

		icsk = inet_csk(st->syn_wait_sk);
		req = req->dl_next;
		while (1) {
			while (req) {
				if (req->rsk_ops->family == st->family) {
					cur = req;
					goto out;
				}
				req = req->dl_next;
			}
			if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
				break;
get_req:
			req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
		}
		sk	  = sk_nulls_next(st->syn_wait_sk);
		st->state = TCP_SEQ_STATE_LISTENING;
		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
	} else {
		icsk = inet_csk(sk);
		read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
		if (reqsk_queue_len(&icsk->icsk_accept_queue))
			goto start_req;
		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
		sk = sk_nulls_next(sk);
	}
get_sk:
	sk_nulls_for_each_from(sk, node) {
		if (!net_eq(sock_net(sk), net))
			continue;
		if (sk->sk_family == st->family) {
			cur = sk;
			goto out;
		}
		icsk = inet_csk(sk);
		read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
		if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
start_req:
			st->uid		= sock_i_uid(sk);
			st->syn_wait_sk = sk;
			st->state	= TCP_SEQ_STATE_OPENREQ;
			st->sbucket	= 0;
			goto get_req;
		}
		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
	}
	spin_unlock_bh(&ilb->lock);
	st->offset = 0;
	if (++st->bucket < INET_LHTABLE_SIZE) {
		ilb = &tcp_hashinfo.listening_hash[st->bucket];
		spin_lock_bh(&ilb->lock);
		sk = sk_nulls_head(&ilb->head);
		goto get_sk;
	}
	cur = NULL;
out:
	return cur;
}

static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc;

	st->bucket = 0;
	st->offset = 0;
	rc = listening_get_next(seq, NULL);
	while (rc && *pos) {
		rc = listening_get_next(seq, rc);
		--*pos;
	}
	return rc;
}

static inline bool empty_bucket(const struct tcp_iter_state *st)
{
	return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
}

/*
 * Get first established socket starting from bucket given in st->bucket.
 * If st->bucket is zero, the very first socket in the hash is returned.
 */
static void *established_get_first(struct seq_file *seq)
{
	struct tcp_iter_state *st = seq->private;
	struct net *net = seq_file_net(seq);
	void *rc = NULL;

	st->offset = 0;
	for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
		struct sock *sk;
		struct hlist_nulls_node *node;
		spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);

		/* Lockless fast path for the common case of empty buckets */
		if (empty_bucket(st))
			continue;

		spin_lock_bh(lock);
		sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
			if (sk->sk_family != st->family ||
			    !net_eq(sock_net(sk), net)) {
				continue;
			}
			rc = sk;
			goto out;
		}
		spin_unlock_bh(lock);
	}
out:
	return rc;
}

static void *established_get_next(struct seq_file *seq, void *cur)
{
	struct sock *sk = cur;
	struct hlist_nulls_node *node;
	struct tcp_iter_state *st = seq->private;
	struct net *net = seq_file_net(seq);

	++st->num;
	++st->offset;

	sk = sk_nulls_next(sk);
	sk_nulls_for_each_from(sk, node) {
		if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
			return sk;
	}

	spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
	++st->bucket;
	return established_get_first(seq);
}

static void *established_get_idx(struct seq_file *seq, loff_t pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc;

	st->bucket = 0;
	rc = established_get_first(seq);
	while (rc && pos) {
		rc = established_get_next(seq, rc);
		--pos;
	}
	return rc;
}

static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
{
	void *rc;
	struct tcp_iter_state *st = seq->private;

	st->state = TCP_SEQ_STATE_LISTENING;
	rc	  = listening_get_idx(seq, &pos);
	if (!rc) {
		st->state = TCP_SEQ_STATE_ESTABLISHED;
		rc	  = established_get_idx(seq, pos);
	}
	return rc;
}

static void *tcp_seek_last_pos(struct seq_file *seq)
{
	struct tcp_iter_state *st = seq->private;
	int offset = st->offset;
	int orig_num = st->num;
	void *rc = NULL;

	switch (st->state) {
	case TCP_SEQ_STATE_OPENREQ:
	case TCP_SEQ_STATE_LISTENING:
		if (st->bucket >= INET_LHTABLE_SIZE)
			break;
		st->state = TCP_SEQ_STATE_LISTENING;
		rc = listening_get_next(seq, NULL);
		while (offset-- && rc)
			rc = listening_get_next(seq, rc);
		if (rc)
			break;
		st->bucket = 0;
		st->state = TCP_SEQ_STATE_ESTABLISHED;
		/* Fallthrough */
	case TCP_SEQ_STATE_ESTABLISHED:
		if (st->bucket > tcp_hashinfo.ehash_mask)
			break;
		rc = established_get_first(seq);
		while (offset-- && rc)
			rc = established_get_next(seq, rc);
	}

	st->num = orig_num;
	return rc;
}

static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc;

	if (*pos && *pos == st->last_pos) {
		rc = tcp_seek_last_pos(seq);
		if (rc)
			goto out;
	}

	st->state = TCP_SEQ_STATE_LISTENING;
	st->num = 0;
	st->bucket = 0;
	st->offset = 0;
	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;

out:
	st->last_pos = *pos;
	return rc;
}

static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc = NULL;

	if (v == SEQ_START_TOKEN) {
		rc = tcp_get_idx(seq, 0);
		goto out;
	}

	switch (st->state) {
	case TCP_SEQ_STATE_OPENREQ:
	case TCP_SEQ_STATE_LISTENING:
		rc = listening_get_next(seq, v);
		if (!rc) {
			st->state = TCP_SEQ_STATE_ESTABLISHED;
			st->bucket = 0;
			st->offset = 0;
			rc	  = established_get_first(seq);
		}
		break;
	case TCP_SEQ_STATE_ESTABLISHED:
		rc = established_get_next(seq, v);
		break;
	}
out:
	++*pos;
	st->last_pos = *pos;
	return rc;
}

static void tcp_seq_stop(struct seq_file *seq, void *v)
{
	struct tcp_iter_state *st = seq->private;

	switch (st->state) {
	case TCP_SEQ_STATE_OPENREQ:
		if (v) {
			struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
			read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
		}
	case TCP_SEQ_STATE_LISTENING:
		if (v != SEQ_START_TOKEN)
			spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
		break;
	case TCP_SEQ_STATE_ESTABLISHED:
		if (v)
			spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
		break;
	}
}

int tcp_seq_open(struct inode *inode, struct file *file)
{
	struct tcp_seq_afinfo *afinfo = PDE_DATA(inode);
	struct tcp_iter_state *s;
	int err;

	err = seq_open_net(inode, file, &afinfo->seq_ops,
			   sizeof(struct tcp_iter_state));
	if (err < 0)
		return err;

	s = ((struct seq_file *)file->private_data)->private;
	s->family		= afinfo->family;
	s->last_pos		= 0;
	return 0;
}
EXPORT_SYMBOL(tcp_seq_open);

int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
{
	int rc = 0;
	struct proc_dir_entry *p;

	afinfo->seq_ops.start		= tcp_seq_start;
	afinfo->seq_ops.next		= tcp_seq_next;
	afinfo->seq_ops.stop		= tcp_seq_stop;

	p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
			     afinfo->seq_fops, afinfo);
	if (!p)
		rc = -ENOMEM;
	return rc;
}
EXPORT_SYMBOL(tcp_proc_register);

void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
{
	remove_proc_entry(afinfo->name, net->proc_net);
}
EXPORT_SYMBOL(tcp_proc_unregister);

static void get_openreq4(const struct sock *sk, const struct request_sock *req,
			 struct seq_file *f, int i, kuid_t uid)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	long delta = req->expires - jiffies;

	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
		i,
		ireq->ir_loc_addr,
		ntohs(inet_sk(sk)->inet_sport),
		ireq->ir_rmt_addr,
		ntohs(ireq->ir_rmt_port),
		TCP_SYN_RECV,
		0, 0, /* could print option size, but that is af dependent. */
		1,    /* timers active (only the expire timer) */
		jiffies_delta_to_clock_t(delta),
		req->num_timeout,
		from_kuid_munged(seq_user_ns(f), uid),
		0,  /* non standard timer */
		0, /* open_requests have no inode */
		atomic_read(&sk->sk_refcnt),
		req);
}

static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
{
	int timer_active;
	unsigned long timer_expires;
	const struct tcp_sock *tp = tcp_sk(sk);
	const struct inet_connection_sock *icsk = inet_csk(sk);
	const struct inet_sock *inet = inet_sk(sk);
	struct fastopen_queue *fastopenq = icsk->icsk_accept_queue.fastopenq;
	__be32 dest = inet->inet_daddr;
	__be32 src = inet->inet_rcv_saddr;
	__u16 destp = ntohs(inet->inet_dport);
	__u16 srcp = ntohs(inet->inet_sport);
	int rx_queue;

	if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
	    icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
		timer_active	= 1;
		timer_expires	= icsk->icsk_timeout;
	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
		timer_active	= 4;
		timer_expires	= icsk->icsk_timeout;
	} else if (timer_pending(&sk->sk_timer)) {
		timer_active	= 2;
		timer_expires	= sk->sk_timer.expires;
	} else {
		timer_active	= 0;
		timer_expires = jiffies;
	}

	if (sk->sk_state == TCP_LISTEN)
		rx_queue = sk->sk_ack_backlog;
	else
		/*
		 * because we dont lock socket, we might find a transient negative value
		 */
		rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);

	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
			"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
		i, src, srcp, dest, destp, sk->sk_state,
		tp->write_seq - tp->snd_una,
		rx_queue,
		timer_active,
		jiffies_delta_to_clock_t(timer_expires - jiffies),
		icsk->icsk_retransmits,
		from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
		icsk->icsk_probes_out,
		sock_i_ino(sk),
		atomic_read(&sk->sk_refcnt), sk,
		jiffies_to_clock_t(icsk->icsk_rto),
		jiffies_to_clock_t(icsk->icsk_ack.ato),
		(icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
		tp->snd_cwnd,
		sk->sk_state == TCP_LISTEN ?
		    (fastopenq ? fastopenq->max_qlen : 0) :
		    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
}

static void get_timewait4_sock(const struct inet_timewait_sock *tw,
			       struct seq_file *f, int i)
{
	__be32 dest, src;
	__u16 destp, srcp;
	s32 delta = tw->tw_ttd - inet_tw_time_stamp();

	dest  = tw->tw_daddr;
	src   = tw->tw_rcv_saddr;
	destp = ntohs(tw->tw_dport);
	srcp  = ntohs(tw->tw_sport);

	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
		3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
		atomic_read(&tw->tw_refcnt), tw);
}

#define TMPSZ 150

static int tcp4_seq_show(struct seq_file *seq, void *v)
{
	struct tcp_iter_state *st;
	struct sock *sk = v;

	seq_setwidth(seq, TMPSZ - 1);
	if (v == SEQ_START_TOKEN) {
		seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
			   "rx_queue tr tm->when retrnsmt   uid  timeout "
			   "inode");
		goto out;
	}
	st = seq->private;

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
	case TCP_SEQ_STATE_ESTABLISHED:
		if (sk->sk_state == TCP_TIME_WAIT)
			get_timewait4_sock(v, seq, st->num);
		else
			get_tcp4_sock(v, seq, st->num);
		break;
	case TCP_SEQ_STATE_OPENREQ:
		get_openreq4(st->syn_wait_sk, v, seq, st->num, st->uid);
		break;
	}
out:
	seq_pad(seq, '\n');
	return 0;
}

static const struct file_operations tcp_afinfo_seq_fops = {
	.owner   = THIS_MODULE,
	.open    = tcp_seq_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
	.release = seq_release_net
};

static struct tcp_seq_afinfo tcp4_seq_afinfo = {
	.name		= "tcp",
	.family		= AF_INET,
	.seq_fops	= &tcp_afinfo_seq_fops,
	.seq_ops	= {
		.show		= tcp4_seq_show,
	},
};

static int __net_init tcp4_proc_init_net(struct net *net)
{
	return tcp_proc_register(net, &tcp4_seq_afinfo);
}

static void __net_exit tcp4_proc_exit_net(struct net *net)
{
	tcp_proc_unregister(net, &tcp4_seq_afinfo);
}

static struct pernet_operations tcp4_net_ops = {
	.init = tcp4_proc_init_net,
	.exit = tcp4_proc_exit_net,
};

int __init tcp4_proc_init(void)
{
	return register_pernet_subsys(&tcp4_net_ops);
}

void tcp4_proc_exit(void)
{
	unregister_pernet_subsys(&tcp4_net_ops);
}
#endif /* CONFIG_PROC_FS */

struct proto tcp_prot = {
	.name			= "TCP",
	.owner			= THIS_MODULE,
	.close			= tcp_close,
	.connect		= tcp_v4_connect,
	.disconnect		= tcp_disconnect,
	.accept			= inet_csk_accept,
	.ioctl			= tcp_ioctl,
	.init			= tcp_v4_init_sock,
	.destroy		= tcp_v4_destroy_sock,
	.shutdown		= tcp_shutdown,
	.setsockopt		= tcp_setsockopt,
	.getsockopt		= tcp_getsockopt,
	.recvmsg		= tcp_recvmsg,
	.sendmsg		= tcp_sendmsg,
	.sendpage		= tcp_sendpage,
	.backlog_rcv		= tcp_v4_do_rcv,
	.release_cb		= tcp_release_cb,
	.mtu_reduced		= tcp_v4_mtu_reduced,
	.hash			= inet_hash,
	.unhash			= inet_unhash,
	.get_port		= inet_csk_get_port,
	.enter_memory_pressure	= tcp_enter_memory_pressure,
	.stream_memory_free	= tcp_stream_memory_free,
	.sockets_allocated	= &tcp_sockets_allocated,
	.orphan_count		= &tcp_orphan_count,
	.memory_allocated	= &tcp_memory_allocated,
	.memory_pressure	= &tcp_memory_pressure,
	.sysctl_mem		= sysctl_tcp_mem,
	.sysctl_wmem		= sysctl_tcp_wmem,
	.sysctl_rmem		= sysctl_tcp_rmem,
	.max_header		= MAX_TCP_HEADER,
	.obj_size		= sizeof(struct tcp_sock),
	.slab_flags		= SLAB_DESTROY_BY_RCU,
	.twsk_prot		= &tcp_timewait_sock_ops,
	.rsk_prot		= &tcp_request_sock_ops,
	.h.hashinfo		= &tcp_hashinfo,
	.no_autobind		= true,
#ifdef CONFIG_COMPAT
	.compat_setsockopt	= compat_tcp_setsockopt,
	.compat_getsockopt	= compat_tcp_getsockopt,
#endif
#ifdef CONFIG_MEMCG_KMEM
	.init_cgroup		= tcp_init_cgroup,
	.destroy_cgroup		= tcp_destroy_cgroup,
	.proto_cgroup		= tcp_proto_cgroup,
#endif
};
EXPORT_SYMBOL(tcp_prot);

static int __net_init tcp_sk_init(struct net *net)
{
	net->ipv4.sysctl_tcp_ecn = 2;
	return 0;
}

static void __net_exit tcp_sk_exit(struct net *net)
{
}

static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
{
	inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET);
}

static struct pernet_operations __net_initdata tcp_sk_ops = {
	.init	    = tcp_sk_init,
	.exit	    = tcp_sk_exit,
	.exit_batch = tcp_sk_exit_batch,
};

void __init tcp_v4_init(void)
{
	inet_hashinfo_init(&tcp_hashinfo);
	if (register_pernet_subsys(&tcp_sk_ops))
		panic("Failed to create the TCP control socket.\n");
}