[NET]: Cleanup INET_REFCNT_DEBUG code
[cascardo/linux.git] / net / ipv4 / tcp.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              Implementation of the Transmission Control Protocol(TCP).
7  *
8  * Version:     $Id: tcp.c,v 1.216 2002/02/01 22:01:04 davem Exp $
9  *
10  * Authors:     Ross Biro
11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *              Mark Evans, <evansmp@uhura.aston.ac.uk>
13  *              Corey Minyard <wf-rch!minyard@relay.EU.net>
14  *              Florian La Roche, <flla@stud.uni-sb.de>
15  *              Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
16  *              Linus Torvalds, <torvalds@cs.helsinki.fi>
17  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
18  *              Matthew Dillon, <dillon@apollo.west.oic.com>
19  *              Arnt Gulbrandsen, <agulbra@nvg.unit.no>
20  *              Jorge Cwik, <jorge@laser.satlink.net>
21  *
22  * Fixes:
23  *              Alan Cox        :       Numerous verify_area() calls
24  *              Alan Cox        :       Set the ACK bit on a reset
25  *              Alan Cox        :       Stopped it crashing if it closed while
26  *                                      sk->inuse=1 and was trying to connect
27  *                                      (tcp_err()).
28  *              Alan Cox        :       All icmp error handling was broken
29  *                                      pointers passed were wrong and the
30  *                                      socket was looked up backwards. Nobody
31  *                                      tested any icmp error code obviously.
32  *              Alan Cox        :       tcp_err() now handled properly. It
33  *                                      wakes people on errors. poll
34  *                                      behaves and the icmp error race
35  *                                      has gone by moving it into sock.c
36  *              Alan Cox        :       tcp_send_reset() fixed to work for
37  *                                      everything not just packets for
38  *                                      unknown sockets.
39  *              Alan Cox        :       tcp option processing.
40  *              Alan Cox        :       Reset tweaked (still not 100%) [Had
41  *                                      syn rule wrong]
42  *              Herp Rosmanith  :       More reset fixes
43  *              Alan Cox        :       No longer acks invalid rst frames.
44  *                                      Acking any kind of RST is right out.
45  *              Alan Cox        :       Sets an ignore me flag on an rst
46  *                                      receive otherwise odd bits of prattle
47  *                                      escape still
48  *              Alan Cox        :       Fixed another acking RST frame bug.
49  *                                      Should stop LAN workplace lockups.
50  *              Alan Cox        :       Some tidyups using the new skb list
51  *                                      facilities
52  *              Alan Cox        :       sk->keepopen now seems to work
53  *              Alan Cox        :       Pulls options out correctly on accepts
54  *              Alan Cox        :       Fixed assorted sk->rqueue->next errors
55  *              Alan Cox        :       PSH doesn't end a TCP read. Switched a
56  *                                      bit to skb ops.
57  *              Alan Cox        :       Tidied tcp_data to avoid a potential
58  *                                      nasty.
59  *              Alan Cox        :       Added some better commenting, as the
60  *                                      tcp is hard to follow
61  *              Alan Cox        :       Removed incorrect check for 20 * psh
62  *      Michael O'Reilly        :       ack < copied bug fix.
63  *      Johannes Stille         :       Misc tcp fixes (not all in yet).
64  *              Alan Cox        :       FIN with no memory -> CRASH
65  *              Alan Cox        :       Added socket option proto entries.
66  *                                      Also added awareness of them to accept.
67  *              Alan Cox        :       Added TCP options (SOL_TCP)
68  *              Alan Cox        :       Switched wakeup calls to callbacks,
69  *                                      so the kernel can layer network
70  *                                      sockets.
71  *              Alan Cox        :       Use ip_tos/ip_ttl settings.
72  *              Alan Cox        :       Handle FIN (more) properly (we hope).
73  *              Alan Cox        :       RST frames sent on unsynchronised
74  *                                      state ack error.
75  *              Alan Cox        :       Put in missing check for SYN bit.
76  *              Alan Cox        :       Added tcp_select_window() aka NET2E
77  *                                      window non shrink trick.
78  *              Alan Cox        :       Added a couple of small NET2E timer
79  *                                      fixes
80  *              Charles Hedrick :       TCP fixes
81  *              Toomas Tamm     :       TCP window fixes
82  *              Alan Cox        :       Small URG fix to rlogin ^C ack fight
83  *              Charles Hedrick :       Rewrote most of it to actually work
84  *              Linus           :       Rewrote tcp_read() and URG handling
85  *                                      completely
86  *              Gerhard Koerting:       Fixed some missing timer handling
87  *              Matthew Dillon  :       Reworked TCP machine states as per RFC
88  *              Gerhard Koerting:       PC/TCP workarounds
89  *              Adam Caldwell   :       Assorted timer/timing errors
90  *              Matthew Dillon  :       Fixed another RST bug
91  *              Alan Cox        :       Move to kernel side addressing changes.
92  *              Alan Cox        :       Beginning work on TCP fastpathing
93  *                                      (not yet usable)
94  *              Arnt Gulbrandsen:       Turbocharged tcp_check() routine.
95  *              Alan Cox        :       TCP fast path debugging
96  *              Alan Cox        :       Window clamping
97  *              Michael Riepe   :       Bug in tcp_check()
98  *              Matt Dillon     :       More TCP improvements and RST bug fixes
99  *              Matt Dillon     :       Yet more small nasties removed from the
100  *                                      TCP code (Be very nice to this man if
101  *                                      tcp finally works 100%) 8)
102  *              Alan Cox        :       BSD accept semantics.
103  *              Alan Cox        :       Reset on closedown bug.
104  *      Peter De Schrijver      :       ENOTCONN check missing in tcp_sendto().
105  *              Michael Pall    :       Handle poll() after URG properly in
106  *                                      all cases.
107  *              Michael Pall    :       Undo the last fix in tcp_read_urg()
108  *                                      (multi URG PUSH broke rlogin).
109  *              Michael Pall    :       Fix the multi URG PUSH problem in
110  *                                      tcp_readable(), poll() after URG
111  *                                      works now.
112  *              Michael Pall    :       recv(...,MSG_OOB) never blocks in the
113  *                                      BSD api.
114  *              Alan Cox        :       Changed the semantics of sk->socket to
115  *                                      fix a race and a signal problem with
116  *                                      accept() and async I/O.
117  *              Alan Cox        :       Relaxed the rules on tcp_sendto().
118  *              Yury Shevchuk   :       Really fixed accept() blocking problem.
119  *              Craig I. Hagan  :       Allow for BSD compatible TIME_WAIT for
120  *                                      clients/servers which listen in on
121  *                                      fixed ports.
122  *              Alan Cox        :       Cleaned the above up and shrank it to
123  *                                      a sensible code size.
124  *              Alan Cox        :       Self connect lockup fix.
125  *              Alan Cox        :       No connect to multicast.
126  *              Ross Biro       :       Close unaccepted children on master
127  *                                      socket close.
128  *              Alan Cox        :       Reset tracing code.
129  *              Alan Cox        :       Spurious resets on shutdown.
130  *              Alan Cox        :       Giant 15 minute/60 second timer error
131  *              Alan Cox        :       Small whoops in polling before an
132  *                                      accept.
133  *              Alan Cox        :       Kept the state trace facility since
134  *                                      it's handy for debugging.
135  *              Alan Cox        :       More reset handler fixes.
136  *              Alan Cox        :       Started rewriting the code based on
137  *                                      the RFC's for other useful protocol
138  *                                      references see: Comer, KA9Q NOS, and
139  *                                      for a reference on the difference
140  *                                      between specifications and how BSD
141  *                                      works see the 4.4lite source.
142  *              A.N.Kuznetsov   :       Don't time wait on completion of tidy
143  *                                      close.
144  *              Linus Torvalds  :       Fin/Shutdown & copied_seq changes.
145  *              Linus Torvalds  :       Fixed BSD port reuse to work first syn
146  *              Alan Cox        :       Reimplemented timers as per the RFC
147  *                                      and using multiple timers for sanity.
148  *              Alan Cox        :       Small bug fixes, and a lot of new
149  *                                      comments.
150  *              Alan Cox        :       Fixed dual reader crash by locking
151  *                                      the buffers (much like datagram.c)
152  *              Alan Cox        :       Fixed stuck sockets in probe. A probe
153  *                                      now gets fed up of retrying without
154  *                                      (even a no space) answer.
155  *              Alan Cox        :       Extracted closing code better
156  *              Alan Cox        :       Fixed the closing state machine to
157  *                                      resemble the RFC.
158  *              Alan Cox        :       More 'per spec' fixes.
159  *              Jorge Cwik      :       Even faster checksumming.
160  *              Alan Cox        :       tcp_data() doesn't ack illegal PSH
161  *                                      only frames. At least one pc tcp stack
162  *                                      generates them.
163  *              Alan Cox        :       Cache last socket.
164  *              Alan Cox        :       Per route irtt.
165  *              Matt Day        :       poll()->select() match BSD precisely on error
166  *              Alan Cox        :       New buffers
167  *              Marc Tamsky     :       Various sk->prot->retransmits and
168  *                                      sk->retransmits misupdating fixed.
169  *                                      Fixed tcp_write_timeout: stuck close,
170  *                                      and TCP syn retries gets used now.
171  *              Mark Yarvis     :       In tcp_read_wakeup(), don't send an
172  *                                      ack if state is TCP_CLOSED.
173  *              Alan Cox        :       Look up device on a retransmit - routes may
174  *                                      change. Doesn't yet cope with MSS shrink right
175  *                                      but it's a start!
176  *              Marc Tamsky     :       Closing in closing fixes.
177  *              Mike Shaver     :       RFC1122 verifications.
178  *              Alan Cox        :       rcv_saddr errors.
179  *              Alan Cox        :       Block double connect().
180  *              Alan Cox        :       Small hooks for enSKIP.
181  *              Alexey Kuznetsov:       Path MTU discovery.
182  *              Alan Cox        :       Support soft errors.
183  *              Alan Cox        :       Fix MTU discovery pathological case
184  *                                      when the remote claims no mtu!
185  *              Marc Tamsky     :       TCP_CLOSE fix.
186  *              Colin (G3TNE)   :       Send a reset on syn ack replies in
187  *                                      window but wrong (fixes NT lpd problems)
188  *              Pedro Roque     :       Better TCP window handling, delayed ack.
189  *              Joerg Reuter    :       No modification of locked buffers in
190  *                                      tcp_do_retransmit()
191  *              Eric Schenk     :       Changed receiver side silly window
192  *                                      avoidance algorithm to BSD style
193  *                                      algorithm. This doubles throughput
194  *                                      against machines running Solaris,
195  *                                      and seems to result in general
196  *                                      improvement.
197  *      Stefan Magdalinski      :       adjusted tcp_readable() to fix FIONREAD
198  *      Willy Konynenberg       :       Transparent proxying support.
199  *      Mike McLagan            :       Routing by source
200  *              Keith Owens     :       Do proper merging with partial SKB's in
201  *                                      tcp_do_sendmsg to avoid burstiness.
202  *              Eric Schenk     :       Fix fast close down bug with
203  *                                      shutdown() followed by close().
204  *              Andi Kleen      :       Make poll agree with SIGIO
205  *      Salvatore Sanfilippo    :       Support SO_LINGER with linger == 1 and
206  *                                      lingertime == 0 (RFC 793 ABORT Call)
207  *      Hirokazu Takahashi      :       Use copy_from_user() instead of
208  *                                      csum_and_copy_from_user() if possible.
209  *
210  *              This program is free software; you can redistribute it and/or
211  *              modify it under the terms of the GNU General Public License
212  *              as published by the Free Software Foundation; either version
213  *              2 of the License, or(at your option) any later version.
214  *
215  * Description of States:
216  *
217  *      TCP_SYN_SENT            sent a connection request, waiting for ack
218  *
219  *      TCP_SYN_RECV            received a connection request, sent ack,
220  *                              waiting for final ack in three-way handshake.
221  *
222  *      TCP_ESTABLISHED         connection established
223  *
224  *      TCP_FIN_WAIT1           our side has shutdown, waiting to complete
225  *                              transmission of remaining buffered data
226  *
227  *      TCP_FIN_WAIT2           all buffered data sent, waiting for remote
228  *                              to shutdown
229  *
230  *      TCP_CLOSING             both sides have shutdown but we still have
231  *                              data we have to finish sending
232  *
233  *      TCP_TIME_WAIT           timeout to catch resent junk before entering
234  *                              closed, can only be entered from FIN_WAIT2
235  *                              or CLOSING.  Required because the other end
236  *                              may not have gotten our last ACK causing it
237  *                              to retransmit the data packet (which we ignore)
238  *
239  *      TCP_CLOSE_WAIT          remote side has shutdown and is waiting for
240  *                              us to finish writing our data and to shutdown
241  *                              (we have to close() to move on to LAST_ACK)
242  *
243  *      TCP_LAST_ACK            our side has shutdown after remote has
244  *                              shutdown.  There may still be data in our
245  *                              buffer that we have to finish sending
246  *
247  *      TCP_CLOSE               socket is finished
248  */
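/*
 * Illustrative sketch (not part of this file): the states listed above can
 * be observed from userspace with getsockopt(TCP_INFO), assuming a libc that
 * exposes struct tcp_info and the TCP_* state constants in <netinet/tcp.h>.
 * example_tcp_state() and its fd parameter are hypothetical names; the value
 * returned in tcpi_state maps onto the TCP_* states described above.
 *
 *	#include <netinet/in.h>
 *	#include <netinet/tcp.h>
 *	#include <sys/socket.h>
 *
 *	static int example_tcp_state(int fd)
 *	{
 *		struct tcp_info info;
 *		socklen_t len = sizeof(info);
 *
 *		if (getsockopt(fd, IPPROTO_TCP, TCP_INFO, &info, &len) < 0)
 *			return -1;
 *		return info.tcpi_state;
 *	}
 */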
249
250 #include <linux/config.h>
251 #include <linux/module.h>
252 #include <linux/types.h>
253 #include <linux/fcntl.h>
254 #include <linux/poll.h>
255 #include <linux/init.h>
256 #include <linux/smp_lock.h>
257 #include <linux/fs.h>
258 #include <linux/random.h>
259 #include <linux/bootmem.h>
260
261 #include <net/icmp.h>
262 #include <net/tcp.h>
263 #include <net/xfrm.h>
264 #include <net/ip.h>
265
266
267 #include <asm/uaccess.h>
268 #include <asm/ioctls.h>
269
270 int sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
271
272 DEFINE_SNMP_STAT(struct tcp_mib, tcp_statistics);
273
274 kmem_cache_t *tcp_bucket_cachep;
275 kmem_cache_t *tcp_timewait_cachep;
276
277 atomic_t tcp_orphan_count = ATOMIC_INIT(0);
278
279 int sysctl_tcp_mem[3];
280 int sysctl_tcp_wmem[3] = { 4 * 1024, 16 * 1024, 128 * 1024 };
281 int sysctl_tcp_rmem[3] = { 4 * 1024, 87380, 87380 * 2 };
282
283 EXPORT_SYMBOL(sysctl_tcp_mem);
284 EXPORT_SYMBOL(sysctl_tcp_rmem);
285 EXPORT_SYMBOL(sysctl_tcp_wmem);
286
287 atomic_t tcp_memory_allocated;  /* Current allocated memory. */
288 atomic_t tcp_sockets_allocated; /* Current number of TCP sockets. */
289
290 EXPORT_SYMBOL(tcp_memory_allocated);
291 EXPORT_SYMBOL(tcp_sockets_allocated);
292
293 /*
294  * Pressure flag: try to collapse.
295  * Technical note: it is used by multiple contexts non-atomically.
296  * All of sk_stream_mem_schedule() is of this nature: accounting
297  * is strict, actions are advisory and have some latency.
298  */
299 int tcp_memory_pressure;
300
301 EXPORT_SYMBOL(tcp_memory_pressure);
302
303 void tcp_enter_memory_pressure(void)
304 {
305         if (!tcp_memory_pressure) {
306                 NET_INC_STATS(LINUX_MIB_TCPMEMORYPRESSURES);
307                 tcp_memory_pressure = 1;
308         }
309 }
310
311 EXPORT_SYMBOL(tcp_enter_memory_pressure);
312
313 /*
314  * LISTEN is a special case for poll..
315  */
316 static __inline__ unsigned int tcp_listen_poll(struct sock *sk,
317                                                poll_table *wait)
318 {
319         return !reqsk_queue_empty(&tcp_sk(sk)->accept_queue) ? (POLLIN | POLLRDNORM) : 0;
320 }
321
322 /*
323  *      Wait for a TCP event.
324  *
325  *      Note that we don't need to lock the socket, as the upper poll layers
326  *      take care of normal races (between the test and the event) and we don't
327  *      go look at any of the socket buffers directly.
328  */
329 unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
330 {
331         unsigned int mask;
332         struct sock *sk = sock->sk;
333         struct tcp_sock *tp = tcp_sk(sk);
334
335         poll_wait(file, sk->sk_sleep, wait);
336         if (sk->sk_state == TCP_LISTEN)
337                 return tcp_listen_poll(sk, wait);
338
339         /* Socket is not locked. We are protected from async events
340            by the poll logic, and correct handling of state changes
341            made by other threads is impossible in any case.
342          */
343
344         mask = 0;
345         if (sk->sk_err)
346                 mask = POLLERR;
347
348         /*
349          * POLLHUP is certainly not done right. But poll() doesn't
350          * have a notion of HUP in just one direction, and for a
351          * socket the read side is more interesting.
352          *
353          * Some poll() documentation says that POLLHUP is incompatible
354          * with the POLLOUT/POLLWR flags, so somebody should check all
355          * of this. But be careful: it tends to be safer to return too many
356          * bits than too few, and you can easily break real applications
357          * if you don't tell them that something has hung up!
358          *
359          * Check-me.
360          *
361          * Check number 1. POLLHUP is an _UNMASKABLE_ event (see UNIX98 and
362          * our fs/select.c). It means that after we have received EOF,
363          * poll always returns immediately, making it impossible to poll() for
364          * write() in state CLOSE_WAIT. One solution is evident --- set POLLHUP
365          * if and only if shutdown has been made in both directions.
366          * Actually, it is interesting to look at how Solaris and DUX
367          * solve this dilemma. I would prefer it if POLLHUP were maskable;
368          * then we could set it on SND_SHUTDOWN. BTW the examples given
369          * in Stevens' books assume exactly this behaviour, which explains
370          * why POLLHUP is incompatible with POLLOUT.    --ANK
371          *
372          * NOTE. Check for TCP_CLOSE is added. The goal is to prevent
373          * blocking on fresh not-connected or disconnected socket. --ANK
374          */
375         if (sk->sk_shutdown == SHUTDOWN_MASK || sk->sk_state == TCP_CLOSE)
376                 mask |= POLLHUP;
377         if (sk->sk_shutdown & RCV_SHUTDOWN)
378                 mask |= POLLIN | POLLRDNORM;
379
380         /* Connected? */
381         if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) {
382                 /* Potential race condition. If the read of tp below
383                  * escapes above the read of sk->sk_state, we can be
384                  * illegally awakened in SYN_* states. */
385                 if ((tp->rcv_nxt != tp->copied_seq) &&
386                     (tp->urg_seq != tp->copied_seq ||
387                      tp->rcv_nxt != tp->copied_seq + 1 ||
388                      sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data))
389                         mask |= POLLIN | POLLRDNORM;
390
391                 if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
392                         if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) {
393                                 mask |= POLLOUT | POLLWRNORM;
394                         } else {  /* send SIGIO later */
395                                 set_bit(SOCK_ASYNC_NOSPACE,
396                                         &sk->sk_socket->flags);
397                                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
398
399                                 /* Race breaker. If space is freed after
400                                  * wspace test but before the flags are set,
401                                  * IO signal will be lost.
402                                  */
403                                 if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk))
404                                         mask |= POLLOUT | POLLWRNORM;
405                         }
406                 }
407
408                 if (tp->urg_data & TCP_URG_VALID)
409                         mask |= POLLPRI;
410         }
411         return mask;
412 }
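/*
 * Illustrative sketch (not part of this file): the semantics implemented
 * above as seen by a userspace caller. POLLIN | POLLRDNORM is reported once
 * the peer's FIN has been received (RCV_SHUTDOWN), while POLLHUP is only
 * reported once both directions are shut down or the socket is in TCP_CLOSE.
 * example_wait() and its fd parameter are hypothetical names.
 *
 *	#include <poll.h>
 *
 *	static int example_wait(int fd)
 *	{
 *		struct pollfd pfd = { .fd = fd, .events = POLLIN | POLLOUT };
 *
 *		if (poll(&pfd, 1, -1) < 0)
 *			return -1;
 *		if (pfd.revents & POLLHUP)
 *			return 0;
 *		return pfd.revents;
 *	}
 */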
413
414 int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
415 {
416         struct tcp_sock *tp = tcp_sk(sk);
417         int answ;
418
419         switch (cmd) {
420         case SIOCINQ:
421                 if (sk->sk_state == TCP_LISTEN)
422                         return -EINVAL;
423
424                 lock_sock(sk);
425                 if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
426                         answ = 0;
427                 else if (sock_flag(sk, SOCK_URGINLINE) ||
428                          !tp->urg_data ||
429                          before(tp->urg_seq, tp->copied_seq) ||
430                          !before(tp->urg_seq, tp->rcv_nxt)) {
431                         answ = tp->rcv_nxt - tp->copied_seq;
432
433                         /* Subtract 1, if FIN is in queue. */
434                         if (answ && !skb_queue_empty(&sk->sk_receive_queue))
435                                 answ -=
436                        ((struct sk_buff *)sk->sk_receive_queue.prev)->h.th->fin;
437                 } else
438                         answ = tp->urg_seq - tp->copied_seq;
439                 release_sock(sk);
440                 break;
441         case SIOCATMARK:
442                 answ = tp->urg_data && tp->urg_seq == tp->copied_seq;
443                 break;
444         case SIOCOUTQ:
445                 if (sk->sk_state == TCP_LISTEN)
446                         return -EINVAL;
447
448                 if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
449                         answ = 0;
450                 else
451                         answ = tp->write_seq - tp->snd_una;
452                 break;
453         default:
454                 return -ENOIOCTLCMD;
455         };
456
457         return put_user(answ, (int __user *)arg);
458 }
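/*
 * Illustrative sketch (not part of this file): the SIOCINQ/SIOCOUTQ ioctls
 * handled above, as issued from userspace. SIOCINQ returns the number of
 * unread bytes queued for receive, SIOCOUTQ the number of bytes written but
 * not yet acknowledged by the peer; the request codes are assumed to come
 * from <linux/sockios.h>, and fd/unread/unsent are placeholder names.
 *
 *	#include <stdio.h>
 *	#include <sys/ioctl.h>
 *	#include <linux/sockios.h>
 *
 *	int unread, unsent;
 *
 *	if (ioctl(fd, SIOCINQ, &unread) == 0 &&
 *	    ioctl(fd, SIOCOUTQ, &unsent) == 0)
 *		printf("%d bytes to read, %d bytes unacked\n", unread, unsent);
 */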
459
460
461 int tcp_listen_start(struct sock *sk)
462 {
463         struct inet_sock *inet = inet_sk(sk);
464         struct tcp_sock *tp = tcp_sk(sk);
465         int rc = reqsk_queue_alloc(&tp->accept_queue, TCP_SYNQ_HSIZE);
466
467         if (rc != 0)
468                 return rc;
469
470         sk->sk_max_ack_backlog = 0;
471         sk->sk_ack_backlog = 0;
472         tcp_delack_init(tp);
473
474         /* There is a race window here: we announce ourselves listening,
475          * but this transition is still not validated by get_port().
476          * This is OK, because this socket enters the hash table only
477          * after validation is complete.
478          */
479         sk->sk_state = TCP_LISTEN;
480         if (!sk->sk_prot->get_port(sk, inet->num)) {
481                 inet->sport = htons(inet->num);
482
483                 sk_dst_reset(sk);
484                 sk->sk_prot->hash(sk);
485
486                 return 0;
487         }
488
489         sk->sk_state = TCP_CLOSE;
490         __reqsk_queue_destroy(&tp->accept_queue);
491         return -EADDRINUSE;
492 }
493
494 /*
495  *      This routine closes sockets which have been at least partially
496  *      opened, but not yet accepted.
497  */
498
499 static void tcp_listen_stop (struct sock *sk)
500 {
501         struct tcp_sock *tp = tcp_sk(sk);
502         struct request_sock *acc_req;
503         struct request_sock *req;
504
505         tcp_delete_keepalive_timer(sk);
506
507         /* make all the listen_opt local to us */
508         acc_req = reqsk_queue_yank_acceptq(&tp->accept_queue);
509
510         /* Following the specs, it would be better either to send a FIN
511          * (and enter FIN-WAIT-1, which is a normal close)
512          * or to send an active reset (abort).
513          * Certainly, it is pretty dangerous during a synflood, but that is
514          * a bad justification for our negligence 8)
515          * To be honest, we are not able to implement either
516          * of the variants now.                 --ANK
517          */
518         reqsk_queue_destroy(&tp->accept_queue);
519
520         while ((req = acc_req) != NULL) {
521                 struct sock *child = req->sk;
522
523                 acc_req = req->dl_next;
524
525                 local_bh_disable();
526                 bh_lock_sock(child);
527                 BUG_TRAP(!sock_owned_by_user(child));
528                 sock_hold(child);
529
530                 tcp_disconnect(child, O_NONBLOCK);
531
532                 sock_orphan(child);
533
534                 atomic_inc(&tcp_orphan_count);
535
536                 tcp_destroy_sock(child);
537
538                 bh_unlock_sock(child);
539                 local_bh_enable();
540                 sock_put(child);
541
542                 sk_acceptq_removed(sk);
543                 __reqsk_free(req);
544         }
545         BUG_TRAP(!sk->sk_ack_backlog);
546 }
547
548 static inline void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb)
549 {
550         TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
551         tp->pushed_seq = tp->write_seq;
552 }
553
554 static inline int forced_push(struct tcp_sock *tp)
555 {
556         return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> 1));
557 }
558
559 static inline void skb_entail(struct sock *sk, struct tcp_sock *tp,
560                               struct sk_buff *skb)
561 {
562         skb->csum = 0;
563         TCP_SKB_CB(skb)->seq = tp->write_seq;
564         TCP_SKB_CB(skb)->end_seq = tp->write_seq;
565         TCP_SKB_CB(skb)->flags = TCPCB_FLAG_ACK;
566         TCP_SKB_CB(skb)->sacked = 0;
567         skb_header_release(skb);
568         __skb_queue_tail(&sk->sk_write_queue, skb);
569         sk_charge_skb(sk, skb);
570         if (!sk->sk_send_head)
571                 sk->sk_send_head = skb;
572         if (tp->nonagle & TCP_NAGLE_PUSH)
573                 tp->nonagle &= ~TCP_NAGLE_PUSH; 
574 }
575
576 static inline void tcp_mark_urg(struct tcp_sock *tp, int flags,
577                                 struct sk_buff *skb)
578 {
579         if (flags & MSG_OOB) {
580                 tp->urg_mode = 1;
581                 tp->snd_up = tp->write_seq;
582                 TCP_SKB_CB(skb)->sacked |= TCPCB_URG;
583         }
584 }
585
586 static inline void tcp_push(struct sock *sk, struct tcp_sock *tp, int flags,
587                             int mss_now, int nonagle)
588 {
589         if (sk->sk_send_head) {
590                 struct sk_buff *skb = sk->sk_write_queue.prev;
591                 if (!(flags & MSG_MORE) || forced_push(tp))
592                         tcp_mark_push(tp, skb);
593                 tcp_mark_urg(tp, flags, skb);
594                 __tcp_push_pending_frames(sk, tp, mss_now,
595                                           (flags & MSG_MORE) ? TCP_NAGLE_CORK : nonagle);
596         }
597 }
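/*
 * Illustrative sketch (not part of this file): the MSG_MORE handling in
 * tcp_push() above is the per-call analogue of the TCP_CORK socket option;
 * either form holds back partial frames until the cork is released. The
 * fd, hdr and body names are placeholders.
 *
 *	#include <netinet/in.h>
 *	#include <netinet/tcp.h>
 *	#include <sys/socket.h>
 *
 *	int on = 1, off = 0;
 *
 *	setsockopt(fd, IPPROTO_TCP, TCP_CORK, &on, sizeof(on));
 *	send(fd, hdr, hdr_len, 0);
 *	send(fd, body, body_len, 0);
 *	setsockopt(fd, IPPROTO_TCP, TCP_CORK, &off, sizeof(off));
 *
 *	send(fd, hdr, hdr_len, MSG_MORE);	(same effect, per call)
 *	send(fd, body, body_len, 0);
 */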
598
599 static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset,
600                          size_t psize, int flags)
601 {
602         struct tcp_sock *tp = tcp_sk(sk);
603         int mss_now, size_goal;
604         int err;
605         ssize_t copied;
606         long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
607
608         /* Wait for a connection to finish. */
609         if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
610                 if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
611                         goto out_err;
612
613         clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
614
615         mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
616         size_goal = tp->xmit_size_goal;
617         copied = 0;
618
619         err = -EPIPE;
620         if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
621                 goto do_error;
622
623         while (psize > 0) {
624                 struct sk_buff *skb = sk->sk_write_queue.prev;
625                 struct page *page = pages[poffset / PAGE_SIZE];
626                 int copy, i, can_coalesce;
627                 int offset = poffset % PAGE_SIZE;
628                 int size = min_t(size_t, psize, PAGE_SIZE - offset);
629
630                 if (!sk->sk_send_head || (copy = size_goal - skb->len) <= 0) {
631 new_segment:
632                         if (!sk_stream_memory_free(sk))
633                                 goto wait_for_sndbuf;
634
635                         skb = sk_stream_alloc_pskb(sk, 0, 0,
636                                                    sk->sk_allocation);
637                         if (!skb)
638                                 goto wait_for_memory;
639
640                         skb_entail(sk, tp, skb);
641                         copy = size_goal;
642                 }
643
644                 if (copy > size)
645                         copy = size;
646
647                 i = skb_shinfo(skb)->nr_frags;
648                 can_coalesce = skb_can_coalesce(skb, i, page, offset);
649                 if (!can_coalesce && i >= MAX_SKB_FRAGS) {
650                         tcp_mark_push(tp, skb);
651                         goto new_segment;
652                 }
653                 if (sk->sk_forward_alloc < copy &&
654                     !sk_stream_mem_schedule(sk, copy, 0))
655                         goto wait_for_memory;
656                 
657                 if (can_coalesce) {
658                         skb_shinfo(skb)->frags[i - 1].size += copy;
659                 } else {
660                         get_page(page);
661                         skb_fill_page_desc(skb, i, page, offset, copy);
662                 }
663
664                 skb->len += copy;
665                 skb->data_len += copy;
666                 skb->truesize += copy;
667                 sk->sk_wmem_queued += copy;
668                 sk->sk_forward_alloc -= copy;
669                 skb->ip_summed = CHECKSUM_HW;
670                 tp->write_seq += copy;
671                 TCP_SKB_CB(skb)->end_seq += copy;
672                 skb_shinfo(skb)->tso_segs = 0;
673
674                 if (!copied)
675                         TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;
676
677                 copied += copy;
678                 poffset += copy;
679                 if (!(psize -= copy))
680                         goto out;
681
682                 if (skb->len < mss_now || (flags & MSG_OOB))
683                         continue;
684
685                 if (forced_push(tp)) {
686                         tcp_mark_push(tp, skb);
687                         __tcp_push_pending_frames(sk, tp, mss_now, TCP_NAGLE_PUSH);
688                 } else if (skb == sk->sk_send_head)
689                         tcp_push_one(sk, mss_now);
690                 continue;
691
692 wait_for_sndbuf:
693                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
694 wait_for_memory:
695                 if (copied)
696                         tcp_push(sk, tp, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
697
698                 if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
699                         goto do_error;
700
701                 mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
702                 size_goal = tp->xmit_size_goal;
703         }
704
705 out:
706         if (copied)
707                 tcp_push(sk, tp, flags, mss_now, tp->nonagle);
708         return copied;
709
710 do_error:
711         if (copied)
712                 goto out;
713 out_err:
714         return sk_stream_error(sk, flags, err);
715 }
716
717 ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset,
718                      size_t size, int flags)
719 {
720         ssize_t res;
721         struct sock *sk = sock->sk;
722
723 #define TCP_ZC_CSUM_FLAGS (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM | NETIF_F_HW_CSUM)
724
725         if (!(sk->sk_route_caps & NETIF_F_SG) ||
726             !(sk->sk_route_caps & TCP_ZC_CSUM_FLAGS))
727                 return sock_no_sendpage(sock, page, offset, size, flags);
728
729 #undef TCP_ZC_CSUM_FLAGS
730
731         lock_sock(sk);
732         TCP_CHECK_TIMER(sk);
733         res = do_tcp_sendpages(sk, &page, offset, size, flags);
734         TCP_CHECK_TIMER(sk);
735         release_sock(sk);
736         return res;
737 }
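/*
 * Illustrative sketch (not part of this file): do_tcp_sendpages() is what
 * ultimately services a userspace sendfile(2) onto a TCP socket, provided
 * the route supports scatter-gather plus hardware checksumming; otherwise
 * tcp_sendpage() above falls back to the copying sock_no_sendpage() path.
 * sock_fd, file_fd and count are placeholder names.
 *
 *	#include <sys/sendfile.h>
 *
 *	off_t off = 0;
 *	ssize_t sent = sendfile(sock_fd, file_fd, &off, count);
 */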
738
739 #define TCP_PAGE(sk)    (sk->sk_sndmsg_page)
740 #define TCP_OFF(sk)     (sk->sk_sndmsg_off)
741
742 static inline int select_size(struct sock *sk, struct tcp_sock *tp)
743 {
744         int tmp = tp->mss_cache;
745
746         if (sk->sk_route_caps & NETIF_F_SG) {
747                 if (sk->sk_route_caps & NETIF_F_TSO)
748                         tmp = 0;
749                 else {
750                         int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER);
751
752                         if (tmp >= pgbreak &&
753                             tmp <= pgbreak + (MAX_SKB_FRAGS - 1) * PAGE_SIZE)
754                                 tmp = pgbreak;
755                 }
756         }
757
758         return tmp;
759 }
760
761 int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
762                 size_t size)
763 {
764         struct iovec *iov;
765         struct tcp_sock *tp = tcp_sk(sk);
766         struct sk_buff *skb;
767         int iovlen, flags;
768         int mss_now, size_goal;
769         int err, copied;
770         long timeo;
771
772         lock_sock(sk);
773         TCP_CHECK_TIMER(sk);
774
775         flags = msg->msg_flags;
776         timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
777
778         /* Wait for a connection to finish. */
779         if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
780                 if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
781                         goto out_err;
782
783         /* This should be in poll */
784         clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
785
786         mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
787         size_goal = tp->xmit_size_goal;
788
789         /* Ok commence sending. */
790         iovlen = msg->msg_iovlen;
791         iov = msg->msg_iov;
792         copied = 0;
793
794         err = -EPIPE;
795         if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
796                 goto do_error;
797
798         while (--iovlen >= 0) {
799                 int seglen = iov->iov_len;
800                 unsigned char __user *from = iov->iov_base;
801
802                 iov++;
803
804                 while (seglen > 0) {
805                         int copy;
806
807                         skb = sk->sk_write_queue.prev;
808
809                         if (!sk->sk_send_head ||
810                             (copy = size_goal - skb->len) <= 0) {
811
812 new_segment:
813                                 /* Allocate a new segment. If the interface is SG,
814                                  * allocate an skb that fits in a single page.
815                                  */
816                                 if (!sk_stream_memory_free(sk))
817                                         goto wait_for_sndbuf;
818
819                                 skb = sk_stream_alloc_pskb(sk, select_size(sk, tp),
820                                                            0, sk->sk_allocation);
821                                 if (!skb)
822                                         goto wait_for_memory;
823
824                                 /*
825                                  * Check whether we can use HW checksum.
826                                  */
827                                 if (sk->sk_route_caps &
828                                     (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM |
829                                      NETIF_F_HW_CSUM))
830                                         skb->ip_summed = CHECKSUM_HW;
831
832                                 skb_entail(sk, tp, skb);
833                                 copy = size_goal;
834                         }
835
836                         /* Try to append data to the end of skb. */
837                         if (copy > seglen)
838                                 copy = seglen;
839
840                         /* Where to copy to? */
841                         if (skb_tailroom(skb) > 0) {
842                                 /* We have some space in skb head. Superb! */
843                                 if (copy > skb_tailroom(skb))
844                                         copy = skb_tailroom(skb);
845                                 if ((err = skb_add_data(skb, from, copy)) != 0)
846                                         goto do_fault;
847                         } else {
848                                 int merge = 0;
849                                 int i = skb_shinfo(skb)->nr_frags;
850                                 struct page *page = TCP_PAGE(sk);
851                                 int off = TCP_OFF(sk);
852
853                                 if (skb_can_coalesce(skb, i, page, off) &&
854                                     off != PAGE_SIZE) {
855                                         /* We can extend the last page
856                                          * fragment. */
857                                         merge = 1;
858                                 } else if (i == MAX_SKB_FRAGS ||
859                                            (!i &&
860                                            !(sk->sk_route_caps & NETIF_F_SG))) {
861                                         /* Need to add new fragment and cannot
862                                          * do this because interface is non-SG,
863                                          * or because all the page slots are
864                                          * busy. */
865                                         tcp_mark_push(tp, skb);
866                                         goto new_segment;
867                                 } else if (page) {
868                                         if (off == PAGE_SIZE) {
869                                                 put_page(page);
870                                                 TCP_PAGE(sk) = page = NULL;
871                                         }
872                                 }
873
874                                 if (!page) {
875                                         /* Allocate new cache page. */
876                                         if (!(page = sk_stream_alloc_page(sk)))
877                                                 goto wait_for_memory;
878                                         off = 0;
879                                 }
880
881                                 if (copy > PAGE_SIZE - off)
882                                         copy = PAGE_SIZE - off;
883
884                                 /* Time to copy data. We are close to
885                                  * the end! */
886                                 err = skb_copy_to_page(sk, from, skb, page,
887                                                        off, copy);
888                                 if (err) {
889                                         /* If this page was new, give it to the
890                                          * socket so it does not get leaked.
891                                          */
892                                         if (!TCP_PAGE(sk)) {
893                                                 TCP_PAGE(sk) = page;
894                                                 TCP_OFF(sk) = 0;
895                                         }
896                                         goto do_error;
897                                 }
898
899                                 /* Update the skb. */
900                                 if (merge) {
901                                         skb_shinfo(skb)->frags[i - 1].size +=
902                                                                         copy;
903                                 } else {
904                                         skb_fill_page_desc(skb, i, page, off, copy);
905                                         if (TCP_PAGE(sk)) {
906                                                 get_page(page);
907                                         } else if (off + copy < PAGE_SIZE) {
908                                                 get_page(page);
909                                                 TCP_PAGE(sk) = page;
910                                         }
911                                 }
912
913                                 TCP_OFF(sk) = off + copy;
914                         }
915
916                         if (!copied)
917                                 TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;
918
919                         tp->write_seq += copy;
920                         TCP_SKB_CB(skb)->end_seq += copy;
921                         skb_shinfo(skb)->tso_segs = 0;
922
923                         from += copy;
924                         copied += copy;
925                         if ((seglen -= copy) == 0 && iovlen == 0)
926                                 goto out;
927
928                         if (skb->len < mss_now || (flags & MSG_OOB))
929                                 continue;
930
931                         if (forced_push(tp)) {
932                                 tcp_mark_push(tp, skb);
933                                 __tcp_push_pending_frames(sk, tp, mss_now, TCP_NAGLE_PUSH);
934                         } else if (skb == sk->sk_send_head)
935                                 tcp_push_one(sk, mss_now);
936                         continue;
937
938 wait_for_sndbuf:
939                         set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
940 wait_for_memory:
941                         if (copied)
942                                 tcp_push(sk, tp, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
943
944                         if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
945                                 goto do_error;
946
947                         mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
948                         size_goal = tp->xmit_size_goal;
949                 }
950         }
951
952 out:
953         if (copied)
954                 tcp_push(sk, tp, flags, mss_now, tp->nonagle);
955         TCP_CHECK_TIMER(sk);
956         release_sock(sk);
957         return copied;
958
959 do_fault:
960         if (!skb->len) {
961                 if (sk->sk_send_head == skb)
962                         sk->sk_send_head = NULL;
963                 __skb_unlink(skb, &sk->sk_write_queue);
964                 sk_stream_free_skb(sk, skb);
965         }
966
967 do_error:
968         if (copied)
969                 goto out;
970 out_err:
971         err = sk_stream_error(sk, flags, err);
972         TCP_CHECK_TIMER(sk);
973         release_sock(sk);
974         return err;
975 }
976
977 /*
978  *      Handle reading urgent data. BSD has very simple semantics for
979  *      this, no blocking and very strange errors 8)
980  */
981
982 static int tcp_recv_urg(struct sock *sk, long timeo,
983                         struct msghdr *msg, int len, int flags,
984                         int *addr_len)
985 {
986         struct tcp_sock *tp = tcp_sk(sk);
987
988         /* No URG data to read. */
989         if (sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data ||
990             tp->urg_data == TCP_URG_READ)
991                 return -EINVAL; /* Yes this is right ! */
992
993         if (sk->sk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DONE))
994                 return -ENOTCONN;
995
996         if (tp->urg_data & TCP_URG_VALID) {
997                 int err = 0;
998                 char c = tp->urg_data;
999
1000                 if (!(flags & MSG_PEEK))
1001                         tp->urg_data = TCP_URG_READ;
1002
1003                 /* Read urgent data. */
1004                 msg->msg_flags |= MSG_OOB;
1005
1006                 if (len > 0) {
1007                         if (!(flags & MSG_TRUNC))
1008                                 err = memcpy_toiovec(msg->msg_iov, &c, 1);
1009                         len = 1;
1010                 } else
1011                         msg->msg_flags |= MSG_TRUNC;
1012
1013                 return err ? -EFAULT : len;
1014         }
1015
1016         if (sk->sk_state == TCP_CLOSE || (sk->sk_shutdown & RCV_SHUTDOWN))
1017                 return 0;
1018
1019         /* Fixed the recv(..., MSG_OOB) behaviour.  BSD docs and
1020          * the available implementations agree in this case:
1021          * this call should never block, independent of the
1022          * blocking state of the socket.
1023          * Mike <pall@rz.uni-karlsruhe.de>
1024          */
1025         return -EAGAIN;
1026 }
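/*
 * Illustrative sketch (not part of this file): the urgent ("out of band")
 * data path above as exercised from userspace. Unless SO_OOBINLINE is set,
 * the single urgent byte is fetched with MSG_OOB, and the call never blocks:
 * it fails with EINVAL or EAGAIN exactly as described above. fd and c are
 * placeholder names.
 *
 *	#include <sys/socket.h>
 *
 *	char c = '!';
 *
 *	send(fd, &c, 1, MSG_OOB);		(sender: mark one byte urgent)
 *
 *	if (recv(fd, &c, 1, MSG_OOB) == 1)	(receiver: never blocks)
 *		...				(c now holds the urgent byte)
 */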
1027
1028 /* Clean up the receive buffer for full frames taken by the user,
1029  * then send an ACK if necessary.  COPIED is the number of bytes
1030  * tcp_recvmsg has given to the user so far; it speeds up the
1031  * calculation of whether or not we must ACK for the sake of
1032  * a window update.
1033  */
1034 static void cleanup_rbuf(struct sock *sk, int copied)
1035 {
1036         struct tcp_sock *tp = tcp_sk(sk);
1037         int time_to_ack = 0;
1038
1039 #if TCP_DEBUG
1040         struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
1041
1042         BUG_TRAP(!skb || before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq));
1043 #endif
1044
1045         if (tcp_ack_scheduled(tp)) {
1046                    /* Delayed ACKs frequently hit locked sockets during bulk
1047                     * receive. */
1048                 if (tp->ack.blocked ||
1049                     /* Once-per-two-segments ACK was not sent by tcp_input.c */
1050                     tp->rcv_nxt - tp->rcv_wup > tp->ack.rcv_mss ||
1051                     /*
1052                      * If this read emptied the read buffer, we send an ACK
1053                      * when the connection is not bidirectional, the user has
1054                      * drained the receive buffer and there was a small segment
1055                      * in the queue.
1056                      */
1057                     (copied > 0 && (tp->ack.pending & TCP_ACK_PUSHED) &&
1058                      !tp->ack.pingpong && !atomic_read(&sk->sk_rmem_alloc)))
1059                         time_to_ack = 1;
1060         }
1061
1062         /* We send an ACK if we can now advertise a non-zero window
1063          * which has been raised "significantly".
1064          *
1065          * Even if the window was raised up to infinity, do not send a window-open
1066          * ACK in states where we will not receive any more data. It is useless.
1067          */
1068         if (copied > 0 && !time_to_ack && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
1069                 __u32 rcv_window_now = tcp_receive_window(tp);
1070
1071                 /* Optimize, __tcp_select_window() is not cheap. */
1072                 if (2*rcv_window_now <= tp->window_clamp) {
1073                         __u32 new_window = __tcp_select_window(sk);
1074
1075                         /* Send an ACK now if this read freed lots of space
1076                          * in our buffer. Of course, new_window is the new window;
1077                          * we can advertise it now if it is not less than the current one.
1078                          * "Lots" means "at least twice" here.
1079                          */
1080                         if (new_window && new_window >= 2 * rcv_window_now)
1081                                 time_to_ack = 1;
1082                 }
1083         }
1084         if (time_to_ack)
1085                 tcp_send_ack(sk);
1086 }
1087
1088 static void tcp_prequeue_process(struct sock *sk)
1089 {
1090         struct sk_buff *skb;
1091         struct tcp_sock *tp = tcp_sk(sk);
1092
1093         NET_INC_STATS_USER(LINUX_MIB_TCPPREQUEUED);
1094
1095         /* The RX process wants to run with BHs disabled, though it is
1096          * not strictly necessary here. */
1097         local_bh_disable();
1098         while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
1099                 sk->sk_backlog_rcv(sk, skb);
1100         local_bh_enable();
1101
1102         /* Clear memory counter. */
1103         tp->ucopy.memory = 0;
1104 }
1105
1106 static inline struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
1107 {
1108         struct sk_buff *skb;
1109         u32 offset;
1110
1111         skb_queue_walk(&sk->sk_receive_queue, skb) {
1112                 offset = seq - TCP_SKB_CB(skb)->seq;
1113                 if (skb->h.th->syn)
1114                         offset--;
1115                 if (offset < skb->len || skb->h.th->fin) {
1116                         *off = offset;
1117                         return skb;
1118                 }
1119         }
1120         return NULL;
1121 }
1122
1123 /*
1124  * This routine provides an alternative to tcp_recvmsg() for routines
1125  * that would like to handle copying from skbuffs directly in 'sendfile'
1126  * fashion.
1127  * Note:
1128  *      - It is assumed that the socket was locked by the caller.
1129  *      - The routine does not block.
1130  *      - At present, there is no support for reading OOB data
1131  *        or for 'peeking' the socket using this routine
1132  *        (although both would be easy to implement).
1133  */
1134 int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
1135                   sk_read_actor_t recv_actor)
1136 {
1137         struct sk_buff *skb;
1138         struct tcp_sock *tp = tcp_sk(sk);
1139         u32 seq = tp->copied_seq;
1140         u32 offset;
1141         int copied = 0;
1142
1143         if (sk->sk_state == TCP_LISTEN)
1144                 return -ENOTCONN;
1145         while ((skb = tcp_recv_skb(sk, seq, &offset)) != NULL) {
1146                 if (offset < skb->len) {
1147                         size_t used, len;
1148
1149                         len = skb->len - offset;
1150                         /* Stop reading if we hit a patch of urgent data */
1151                         if (tp->urg_data) {
1152                                 u32 urg_offset = tp->urg_seq - seq;
1153                                 if (urg_offset < len)
1154                                         len = urg_offset;
1155                                 if (!len)
1156                                         break;
1157                         }
1158                         used = recv_actor(desc, skb, offset, len);
1159                         if (used <= len) {
1160                                 seq += used;
1161                                 copied += used;
1162                                 offset += used;
1163                         }
1164                         if (offset != skb->len)
1165                                 break;
1166                 }
1167                 if (skb->h.th->fin) {
1168                         sk_eat_skb(sk, skb);
1169                         ++seq;
1170                         break;
1171                 }
1172                 sk_eat_skb(sk, skb);
1173                 if (!desc->count)
1174                         break;
1175         }
1176         tp->copied_seq = seq;
1177
1178         tcp_rcv_space_adjust(sk);
1179
1180         /* Clean up data we have read: This will do ACK frames. */
1181         if (copied)
1182                 cleanup_rbuf(sk, copied);
1183         return copied;
1184 }
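/*
 * Illustrative sketch (not part of this file): a minimal recv_actor such as
 * an in-kernel caller of tcp_read_sock() might supply. The callback receives
 * an skb plus an offset/length window and returns how many bytes it
 * consumed; example_recv_actor, its byte-counting body and bytes_wanted are
 * hypothetical.
 *
 *	static int example_recv_actor(read_descriptor_t *desc,
 *				      struct sk_buff *skb,
 *				      unsigned int offset, size_t len)
 *	{
 *		size_t used = min_t(size_t, len, desc->count);
 *
 *		(a real actor would copy data out here, e.g. skb_copy_bits())
 *		desc->count -= used;
 *		desc->written += used;
 *		return used;
 *	}
 *
 * Called with the socket locked:
 *
 *	read_descriptor_t desc = { .count = bytes_wanted };
 *
 *	tcp_read_sock(sk, &desc, example_recv_actor);
 */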
1185
1186 /*
1187  *      This routine copies from a sock struct into the user buffer.
1188  *
1189  *      Technical note: in 2.3 we work on a _locked_ socket, so that
1190  *      tricks with *seq access order and skb->users are not required.
1191  *      The code can probably be improved even further.
1192  */
1193
1194 int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1195                 size_t len, int nonblock, int flags, int *addr_len)
1196 {
1197         struct tcp_sock *tp = tcp_sk(sk);
1198         int copied = 0;
1199         u32 peek_seq;
1200         u32 *seq;
1201         unsigned long used;
1202         int err;
1203         int target;             /* Read at least this many bytes */
1204         long timeo;
1205         struct task_struct *user_recv = NULL;
1206
1207         lock_sock(sk);
1208
1209         TCP_CHECK_TIMER(sk);
1210
1211         err = -ENOTCONN;
1212         if (sk->sk_state == TCP_LISTEN)
1213                 goto out;
1214
1215         timeo = sock_rcvtimeo(sk, nonblock);
1216
1217         /* Urgent data needs to be handled specially. */
1218         if (flags & MSG_OOB)
1219                 goto recv_urg;
1220
1221         seq = &tp->copied_seq;
1222         if (flags & MSG_PEEK) {
1223                 peek_seq = tp->copied_seq;
1224                 seq = &peek_seq;
1225         }
1226
1227         target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
1228
1229         do {
1230                 struct sk_buff *skb;
1231                 u32 offset;
1232
1233                 /* Are we at urgent data? Stop if we have read anything or have SIGURG pending. */
1234                 if (tp->urg_data && tp->urg_seq == *seq) {
1235                         if (copied)
1236                                 break;
1237                         if (signal_pending(current)) {
1238                                 copied = timeo ? sock_intr_errno(timeo) : -EAGAIN;
1239                                 break;
1240                         }
1241                 }
1242
1243                 /* Next get a buffer. */
1244
1245                 skb = skb_peek(&sk->sk_receive_queue);
1246                 do {
1247                         if (!skb)
1248                                 break;
1249
1250                         /* Now that we have two receive queues this
1251                          * shouldn't happen.
1252                          */
1253                         if (before(*seq, TCP_SKB_CB(skb)->seq)) {
1254                                 printk(KERN_INFO "recvmsg bug: copied %X "
1255                                        "seq %X\n", *seq, TCP_SKB_CB(skb)->seq);
1256                                 break;
1257                         }
1258                         offset = *seq - TCP_SKB_CB(skb)->seq;
1259                         if (skb->h.th->syn)
1260                                 offset--;
1261                         if (offset < skb->len)
1262                                 goto found_ok_skb;
1263                         if (skb->h.th->fin)
1264                                 goto found_fin_ok;
1265                         BUG_TRAP(flags & MSG_PEEK);
1266                         skb = skb->next;
1267                 } while (skb != (struct sk_buff *)&sk->sk_receive_queue);
1268
1269                 /* Well, if we have backlog, try to process it now. */
1270
1271                 if (copied >= target && !sk->sk_backlog.tail)
1272                         break;
1273
1274                 if (copied) {
1275                         if (sk->sk_err ||
1276                             sk->sk_state == TCP_CLOSE ||
1277                             (sk->sk_shutdown & RCV_SHUTDOWN) ||
1278                             !timeo ||
1279                             signal_pending(current) ||
1280                             (flags & MSG_PEEK))
1281                                 break;
1282                 } else {
1283                         if (sock_flag(sk, SOCK_DONE))
1284                                 break;
1285
1286                         if (sk->sk_err) {
1287                                 copied = sock_error(sk);
1288                                 break;
1289                         }
1290
1291                         if (sk->sk_shutdown & RCV_SHUTDOWN)
1292                                 break;
1293
1294                         if (sk->sk_state == TCP_CLOSE) {
1295                                 if (!sock_flag(sk, SOCK_DONE)) {
1296                                         /* This occurs when the user tries to read
1297                                          * from a never-connected socket.
1298                                          */
1299                                         copied = -ENOTCONN;
1300                                         break;
1301                                 }
1302                                 break;
1303                         }
1304
1305                         if (!timeo) {
1306                                 copied = -EAGAIN;
1307                                 break;
1308                         }
1309
1310                         if (signal_pending(current)) {
1311                                 copied = sock_intr_errno(timeo);
1312                                 break;
1313                         }
1314                 }
1315
1316                 cleanup_rbuf(sk, copied);
1317
1318                 if (!sysctl_tcp_low_latency && tp->ucopy.task == user_recv) {
1319                         /* Install new reader */
1320                         if (!user_recv && !(flags & (MSG_TRUNC | MSG_PEEK))) {
1321                                 user_recv = current;
1322                                 tp->ucopy.task = user_recv;
1323                                 tp->ucopy.iov = msg->msg_iov;
1324                         }
1325
1326                         tp->ucopy.len = len;
1327
1328                         BUG_TRAP(tp->copied_seq == tp->rcv_nxt ||
1329                                  (flags & (MSG_PEEK | MSG_TRUNC)));
1330
1331                         /* Ugly... If the prequeue is not empty, we have to
1332                          * process it before releasing the socket; otherwise
1333                          * ordering would be broken on the second iteration.
1334                          * A more elegant solution is required!!!
1335                          *
1336                          * Look: we have the following (pseudo)queues:
1337                          *
1338                          * 1. packets in flight
1339                          * 2. backlog
1340                          * 3. prequeue
1341                          * 4. receive_queue
1342                          *
1343                          * Each queue can be processed only if the next ones
1344                          * are empty. At this point the receive_queue is empty,
1345                          * but the prequeue _can_ be non-empty after the 2nd
1346                          * iteration, when we jumped to the start of the loop
1347                          * because backlog processing added something to the
1348                          * receive_queue. We cannot release_sock(): the backlog
1349                          * contains packets that arrived _after_ the prequeued ones.
1350                          *
1351                          * In short, the algorithm is clear --- process all the
1352                          * queues in order. We could do this more directly by
1353                          * requeueing packets from the backlog to the prequeue
1354                          * when the latter is not empty; that is more elegant,
1355                          * but unfortunately eats cycles.
1356                          */
1357                         if (!skb_queue_empty(&tp->ucopy.prequeue))
1358                                 goto do_prequeue;
1359
1360                         /* __ Set realtime policy in scheduler __ */
1361                 }
1362
1363                 if (copied >= target) {
1364                         /* Do not sleep, just process backlog. */
1365                         release_sock(sk);
1366                         lock_sock(sk);
1367                 } else
1368                         sk_wait_data(sk, &timeo);
1369
1370                 if (user_recv) {
1371                         int chunk;
1372
1373                         /* __ Restore normal policy in scheduler __ */
1374
1375                         if ((chunk = len - tp->ucopy.len) != 0) {
1376                                 NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMBACKLOG, chunk);
1377                                 len -= chunk;
1378                                 copied += chunk;
1379                         }
1380
1381                         if (tp->rcv_nxt == tp->copied_seq &&
1382                             !skb_queue_empty(&tp->ucopy.prequeue)) {
1383 do_prequeue:
1384                                 tcp_prequeue_process(sk);
1385
1386                                 if ((chunk = len - tp->ucopy.len) != 0) {
1387                                         NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
1388                                         len -= chunk;
1389                                         copied += chunk;
1390                                 }
1391                         }
1392                 }
1393                 if ((flags & MSG_PEEK) && peek_seq != tp->copied_seq) {
1394                         if (net_ratelimit())
1395                                 printk(KERN_DEBUG "TCP(%s:%d): Application bug, race in MSG_PEEK.\n",
1396                                        current->comm, current->pid);
1397                         peek_seq = tp->copied_seq;
1398                 }
1399                 continue;
1400
1401         found_ok_skb:
1402                 /* Ok so how much can we use? */
1403                 used = skb->len - offset;
1404                 if (len < used)
1405                         used = len;
1406
1407                 /* Do we have urgent data here? */
1408                 if (tp->urg_data) {
1409                         u32 urg_offset = tp->urg_seq - *seq;
1410                         if (urg_offset < used) {
1411                                 if (!urg_offset) {
1412                                         if (!sock_flag(sk, SOCK_URGINLINE)) {
1413                                                 ++*seq;
1414                                                 offset++;
1415                                                 used--;
1416                                                 if (!used)
1417                                                         goto skip_copy;
1418                                         }
1419                                 } else
1420                                         used = urg_offset;
1421                         }
1422                 }
1423
1424                 if (!(flags & MSG_TRUNC)) {
1425                         err = skb_copy_datagram_iovec(skb, offset,
1426                                                       msg->msg_iov, used);
1427                         if (err) {
1428                                 /* Exception. Bailout! */
1429                                 if (!copied)
1430                                         copied = -EFAULT;
1431                                 break;
1432                         }
1433                 }
1434
1435                 *seq += used;
1436                 copied += used;
1437                 len -= used;
1438
1439                 tcp_rcv_space_adjust(sk);
1440
1441 skip_copy:
1442                 if (tp->urg_data && after(tp->copied_seq, tp->urg_seq)) {
1443                         tp->urg_data = 0;
1444                         tcp_fast_path_check(sk, tp);
1445                 }
1446                 if (used + offset < skb->len)
1447                         continue;
1448
1449                 if (skb->h.th->fin)
1450                         goto found_fin_ok;
1451                 if (!(flags & MSG_PEEK))
1452                         sk_eat_skb(sk, skb);
1453                 continue;
1454
1455         found_fin_ok:
1456                 /* Process the FIN. */
1457                 ++*seq;
1458                 if (!(flags & MSG_PEEK))
1459                         sk_eat_skb(sk, skb);
1460                 break;
1461         } while (len > 0);
1462
1463         if (user_recv) {
1464                 if (!skb_queue_empty(&tp->ucopy.prequeue)) {
1465                         int chunk;
1466
1467                         tp->ucopy.len = copied > 0 ? len : 0;
1468
1469                         tcp_prequeue_process(sk);
1470
1471                         if (copied > 0 && (chunk = len - tp->ucopy.len) != 0) {
1472                                 NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
1473                                 len -= chunk;
1474                                 copied += chunk;
1475                         }
1476                 }
1477
1478                 tp->ucopy.task = NULL;
1479                 tp->ucopy.len = 0;
1480         }
1481
1482         /* According to UNIX98, msg_name/msg_namelen are ignored
1483          * on a connected socket. I was just happy when I found this 8) --ANK
1484          */
1485
1486         /* Clean up data we have read: This will do ACK frames. */
1487         cleanup_rbuf(sk, copied);
1488
1489         TCP_CHECK_TIMER(sk);
1490         release_sock(sk);
1491         return copied;
1492
1493 out:
1494         TCP_CHECK_TIMER(sk);
1495         release_sock(sk);
1496         return err;
1497
1498 recv_urg:
1499         err = tcp_recv_urg(sk, timeo, msg, len, flags, addr_len);
1500         goto out;
1501 }
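/*
 * Illustrative userspace sketch (an assumption, not taken from this file):
 * the 'target' computed from sock_rcvlowat()/MSG_WAITALL near the top of
 * tcp_recvmsg() is what makes a call like
 *
 *	ssize_t n = recv(fd, buf, sizeof(buf), MSG_WAITALL);
 *
 * block until the whole buffer is filled, or an error, signal or EOF cuts
 * the read short.
 */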
1502
1503 /*
1504  *      State processing on a close. This implements the state shift for
1505  *      sending our FIN frame. Note that we only send a FIN for some
1506  *      states. A shutdown() may have already sent the FIN, or we may be
1507  *      closed.
1508  */
1509
1510 static unsigned char new_state[16] = {
1511   /* current state:        new state:      action:      */
1512   /* (Invalid)          */ TCP_CLOSE,
1513   /* TCP_ESTABLISHED    */ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
1514   /* TCP_SYN_SENT       */ TCP_CLOSE,
1515   /* TCP_SYN_RECV       */ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
1516   /* TCP_FIN_WAIT1      */ TCP_FIN_WAIT1,
1517   /* TCP_FIN_WAIT2      */ TCP_FIN_WAIT2,
1518   /* TCP_TIME_WAIT      */ TCP_CLOSE,
1519   /* TCP_CLOSE          */ TCP_CLOSE,
1520   /* TCP_CLOSE_WAIT     */ TCP_LAST_ACK  | TCP_ACTION_FIN,
1521   /* TCP_LAST_ACK       */ TCP_LAST_ACK,
1522   /* TCP_LISTEN         */ TCP_CLOSE,
1523   /* TCP_CLOSING        */ TCP_CLOSING,
1524 };
1525
1526 static int tcp_close_state(struct sock *sk)
1527 {
1528         int next = (int)new_state[sk->sk_state];
1529         int ns = next & TCP_STATE_MASK;
1530
1531         tcp_set_state(sk, ns);
1532
1533         return next & TCP_ACTION_FIN;
1534 }
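/*
 * For example, closing an ESTABLISHED socket looks up
 * new_state[TCP_ESTABLISHED] == TCP_FIN_WAIT1 | TCP_ACTION_FIN, so
 * tcp_close_state() moves the socket to FIN_WAIT1 and returns non-zero,
 * telling the caller (tcp_shutdown() or tcp_close()) that a FIN must be sent.
 */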
1535
1536 /*
1537  *      Shut down the sending side of a connection. Much like close except
1538  *      that we don't shut down the receive side or call sock_set_flag(sk, SOCK_DEAD).
1539  */
1540
1541 void tcp_shutdown(struct sock *sk, int how)
1542 {
1543         /*      We need to grab some memory, and put together a FIN,
1544          *      and then put it into the queue to be sent.
1545          *              Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.
1546          */
1547         if (!(how & SEND_SHUTDOWN))
1548                 return;
1549
1550         /* If we've already sent a FIN, or it's a closed state, skip this. */
1551         if ((1 << sk->sk_state) &
1552             (TCPF_ESTABLISHED | TCPF_SYN_SENT |
1553              TCPF_SYN_RECV | TCPF_CLOSE_WAIT)) {
1554                 /* Clear out any half-completed packets.  Send a FIN if needed. */
1555                 if (tcp_close_state(sk))
1556                         tcp_send_fin(sk);
1557         }
1558 }
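/*
 * Illustrative userspace sketch, hypothetical usage only: the socket layer
 * turns shutdown(fd, SHUT_WR) into a SEND_SHUTDOWN request that normally
 * lands here, so only our sending side is closed and the peer may keep
 * delivering data:
 *
 *	shutdown(fd, SHUT_WR);			// our FIN goes out
 *	while ((n = read(fd, buf, sizeof(buf))) > 0)
 *		;				// keep draining peer data
 */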
1559
1560 /*
1561  * At this point, there should be no process reference to this
1562  * socket, and thus no user references at all.  Therefore we
1563  * can assume the socket waitqueue is inactive and nobody will
1564  * try to jump onto it.
1565  */
1566 void tcp_destroy_sock(struct sock *sk)
1567 {
1568         BUG_TRAP(sk->sk_state == TCP_CLOSE);
1569         BUG_TRAP(sock_flag(sk, SOCK_DEAD));
1570
1571         /* It cannot be in hash table! */
1572         BUG_TRAP(sk_unhashed(sk));
1573
1574         /* If inet_sk(sk)->num is non-zero, the socket must be bound. */
1575         BUG_TRAP(!inet_sk(sk)->num || tcp_sk(sk)->bind_hash);
1576
1577         sk->sk_prot->destroy(sk);
1578
1579         sk_stream_kill_queues(sk);
1580
1581         xfrm_sk_free_policy(sk);
1582
1583         sk_refcnt_debug_release(sk);
1584
1585         atomic_dec(&tcp_orphan_count);
1586         sock_put(sk);
1587 }
1588
1589 void tcp_close(struct sock *sk, long timeout)
1590 {
1591         struct sk_buff *skb;
1592         int data_was_unread = 0;
1593
1594         lock_sock(sk);
1595         sk->sk_shutdown = SHUTDOWN_MASK;
1596
1597         if (sk->sk_state == TCP_LISTEN) {
1598                 tcp_set_state(sk, TCP_CLOSE);
1599
1600                 /* Special case. */
1601                 tcp_listen_stop(sk);
1602
1603                 goto adjudge_to_death;
1604         }
1605
1606         /*  We need to flush the receive buffers.  We do this only on the
1607          *  descriptor close, not protocol-sourced closes, because the
1608          *  reader process may not have drained the data yet!
1609          */
1610         while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
1611                 u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq -
1612                           skb->h.th->fin;
1613                 data_was_unread += len;
1614                 __kfree_skb(skb);
1615         }
1616
1617         sk_stream_mem_reclaim(sk);
1618
1619         /* As outlined in draft-ietf-tcpimpl-prob-03.txt, section
1620          * 3.10, we send a RST here because data was lost.  To
1621          * witness the awful effects of the old behavior of always
1622          * doing a FIN, run an older 2.1.x kernel or 2.0.x, start
1623          * a bulk GET in an FTP client, suspend the process, wait
1624          * for the client to advertise a zero window, then kill -9
1625          * the FTP client, wheee...  Note: timeout is always zero
1626          * in such a case.
1627          */
1628         if (data_was_unread) {
1629                 /* Unread data was tossed, zap the connection. */
1630                 NET_INC_STATS_USER(LINUX_MIB_TCPABORTONCLOSE);
1631                 tcp_set_state(sk, TCP_CLOSE);
1632                 tcp_send_active_reset(sk, GFP_KERNEL);
1633         } else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
1634                 /* Check zero linger _after_ checking for unread data. */
1635                 sk->sk_prot->disconnect(sk, 0);
1636                 NET_INC_STATS_USER(LINUX_MIB_TCPABORTONDATA);
1637         } else if (tcp_close_state(sk)) {
1638                 /* We FIN if the application ate all the data before
1639                  * zapping the connection.
1640                  */
1641
1642                 /* RED-PEN. Formally speaking, we have broken the TCP state
1643                  * machine. The state transitions:
1644                  *
1645                  * TCP_ESTABLISHED -> TCP_FIN_WAIT1
1646                  * TCP_SYN_RECV -> TCP_FIN_WAIT1 (forget it, it's impossible)
1647                  * TCP_CLOSE_WAIT -> TCP_LAST_ACK
1648                  *
1649                  * are legal only when the FIN has been sent (i.e. is in the
1650                  * window), rather than queued out of window. Purists may blame us.
1651                  *
1652                  * E.g. the "RFC state" is ESTABLISHED
1653                  * if the Linux state is FIN-WAIT-1 but the FIN is still unsent.
1654                  *
1655                  * The visible deviations are that we sometimes enter the
1656                  * time-wait state when it is not really required (harmless),
1657                  * and do not send active resets when they are required by the
1658                  * specs (TCP_ESTABLISHED and TCP_CLOSE_WAIT, when they look
1659                  * like CLOSING or LAST_ACK to Linux).
1660                  * I have probably missed some more small holes.
1661                  *                                              --ANK
1662                  */
1663                 tcp_send_fin(sk);
1664         }
1665
1666         sk_stream_wait_close(sk, timeout);
1667
1668 adjudge_to_death:
1669         /* It is the last release_sock in its life. It will remove backlog. */
1670         release_sock(sk);
1671
1672
1673         /* Now socket is owned by kernel and we acquire BH lock
1674            to finish close. No need to check for user refs.
1675          */
1676         local_bh_disable();
1677         bh_lock_sock(sk);
1678         BUG_TRAP(!sock_owned_by_user(sk));
1679
1680         sock_hold(sk);
1681         sock_orphan(sk);
1682
1683         /*      This is a (useful) BSD-style violation of the RFC. There is a
1684          *      problem with TCP as specified, in that the other end could
1685          *      keep a socket open forever with no application left at this end.
1686          *      We use a 3 minute timeout (about the same as BSD) and then kill
1687          *      our end. If they send after that then tough - BUT: it is long
1688          *      enough that we won't repeat the old "4*rto = almost no time -
1689          *      whoops, reset" mistake.
1690          *
1691          *      Nope, it was not a mistake. It is really the desired behaviour,
1692          *      e.g. on HTTP servers, where such sockets are useless but
1693          *      consume significant resources. Let's do it with the special
1694          *      linger2 option.                                 --ANK
1695          */
1696
1697         if (sk->sk_state == TCP_FIN_WAIT2) {
1698                 struct tcp_sock *tp = tcp_sk(sk);
1699                 if (tp->linger2 < 0) {
1700                         tcp_set_state(sk, TCP_CLOSE);
1701                         tcp_send_active_reset(sk, GFP_ATOMIC);
1702                         NET_INC_STATS_BH(LINUX_MIB_TCPABORTONLINGER);
1703                 } else {
1704                         int tmo = tcp_fin_time(tp);
1705
1706                         if (tmo > TCP_TIMEWAIT_LEN) {
1707                                 tcp_reset_keepalive_timer(sk, tcp_fin_time(tp));
1708                         } else {
1709                                 atomic_inc(&tcp_orphan_count);
1710                                 tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
1711                                 goto out;
1712                         }
1713                 }
1714         }
1715         if (sk->sk_state != TCP_CLOSE) {
1716                 sk_stream_mem_reclaim(sk);
1717                 if (atomic_read(&tcp_orphan_count) > sysctl_tcp_max_orphans ||
1718                     (sk->sk_wmem_queued > SOCK_MIN_SNDBUF &&
1719                      atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2])) {
1720                         if (net_ratelimit())
1721                                 printk(KERN_INFO "TCP: too many orphaned "
1722                                        "sockets\n");
1723                         tcp_set_state(sk, TCP_CLOSE);
1724                         tcp_send_active_reset(sk, GFP_ATOMIC);
1725                         NET_INC_STATS_BH(LINUX_MIB_TCPABORTONMEMORY);
1726                 }
1727         }
1728         atomic_inc(&tcp_orphan_count);
1729
1730         if (sk->sk_state == TCP_CLOSE)
1731                 tcp_destroy_sock(sk);
1732         /* Otherwise, socket is reprieved until protocol close. */
1733
1734 out:
1735         bh_unlock_sock(sk);
1736         local_bh_enable();
1737         sock_put(sk);
1738 }
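/*
 * Illustrative userspace sketch (assumed usage, not from this file): the
 * zero-linger branch above ("Check zero linger _after_ checking for unread
 * data") is what a hard abort from userspace looks like:
 *
 *	struct linger lg = { .l_onoff = 1, .l_linger = 0 };
 *
 *	setsockopt(fd, SOL_SOCKET, SO_LINGER, &lg, sizeof(lg));
 *	close(fd);		// connection is reset rather than FIN-closed
 */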
1739
1740 /* These states need RST on ABORT according to RFC793 */
1741
1742 static inline int tcp_need_reset(int state)
1743 {
1744         return (1 << state) &
1745                (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_FIN_WAIT1 |
1746                 TCPF_FIN_WAIT2 | TCPF_SYN_RECV);
1747 }
1748
1749 int tcp_disconnect(struct sock *sk, int flags)
1750 {
1751         struct inet_sock *inet = inet_sk(sk);
1752         struct tcp_sock *tp = tcp_sk(sk);
1753         int err = 0;
1754         int old_state = sk->sk_state;
1755
1756         if (old_state != TCP_CLOSE)
1757                 tcp_set_state(sk, TCP_CLOSE);
1758
1759         /* ABORT function of RFC793 */
1760         if (old_state == TCP_LISTEN) {
1761                 tcp_listen_stop(sk);
1762         } else if (tcp_need_reset(old_state) ||
1763                    (tp->snd_nxt != tp->write_seq &&
1764                     (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) {
1765                 /* The last check adjusts for the discrepancy between Linux
1766                  * and the RFC states.
1767                  */
1768                 tcp_send_active_reset(sk, gfp_any());
1769                 sk->sk_err = ECONNRESET;
1770         } else if (old_state == TCP_SYN_SENT)
1771                 sk->sk_err = ECONNRESET;
1772
1773         tcp_clear_xmit_timers(sk);
1774         __skb_queue_purge(&sk->sk_receive_queue);
1775         sk_stream_writequeue_purge(sk);
1776         __skb_queue_purge(&tp->out_of_order_queue);
1777
1778         inet->dport = 0;
1779
1780         if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
1781                 inet_reset_saddr(sk);
1782
1783         sk->sk_shutdown = 0;
1784         sock_reset_flag(sk, SOCK_DONE);
1785         tp->srtt = 0;
1786         if ((tp->write_seq += tp->max_window + 2) == 0)
1787                 tp->write_seq = 1;
1788         tp->backoff = 0;
1789         tp->snd_cwnd = 2;
1790         tp->probes_out = 0;
1791         tp->packets_out = 0;
1792         tp->snd_ssthresh = 0x7fffffff;
1793         tp->snd_cwnd_cnt = 0;
1794         tcp_set_ca_state(tp, TCP_CA_Open);
1795         tcp_clear_retrans(tp);
1796         tcp_delack_init(tp);
1797         sk->sk_send_head = NULL;
1798         tp->rx_opt.saw_tstamp = 0;
1799         tcp_sack_reset(&tp->rx_opt);
1800         __sk_dst_reset(sk);
1801
1802         BUG_TRAP(!inet->num || tp->bind_hash);
1803
1804         sk->sk_error_report(sk);
1805         return err;
1806 }
1807
1808 /*
1809  *      Wait for an incoming connection, avoid race
1810  *      conditions. This must be called with the socket locked.
1811  */
1812 static int wait_for_connect(struct sock *sk, long timeo)
1813 {
1814         struct tcp_sock *tp = tcp_sk(sk);
1815         DEFINE_WAIT(wait);
1816         int err;
1817
1818         /*
1819          * True wake-one mechanism for incoming connections: only
1820          * one process gets woken up, not the 'whole herd'.
1821          * Since we do not 'race & poll' for established sockets
1822          * anymore, the common case will execute the loop only once.
1823          *
1824          * Subtle issue: a waiter added with "add_wait_queue_exclusive()"
1825          * is queued after any current non-exclusive waiters, and we know that
1826          * it will always _stay_ after any new non-exclusive waiters
1827          * because all non-exclusive waiters are added at the
1828          * beginning of the wait-queue. As such, it's ok to "drop"
1829          * our exclusiveness temporarily when we get woken up without
1830          * having to remove and re-insert us on the wait queue.
1831          */
1832         for (;;) {
1833                 prepare_to_wait_exclusive(sk->sk_sleep, &wait,
1834                                           TASK_INTERRUPTIBLE);
1835                 release_sock(sk);
1836                 if (reqsk_queue_empty(&tp->accept_queue))
1837                         timeo = schedule_timeout(timeo);
1838                 lock_sock(sk);
1839                 err = 0;
1840                 if (!reqsk_queue_empty(&tp->accept_queue))
1841                         break;
1842                 err = -EINVAL;
1843                 if (sk->sk_state != TCP_LISTEN)
1844                         break;
1845                 err = sock_intr_errno(timeo);
1846                 if (signal_pending(current))
1847                         break;
1848                 err = -EAGAIN;
1849                 if (!timeo)
1850                         break;
1851         }
1852         finish_wait(sk->sk_sleep, &wait);
1853         return err;
1854 }
1855
1856 /*
1857  *      This will accept the next outstanding connection.
1858  */
1859
1860 struct sock *tcp_accept(struct sock *sk, int flags, int *err)
1861 {
1862         struct tcp_sock *tp = tcp_sk(sk);
1863         struct sock *newsk;
1864         int error;
1865
1866         lock_sock(sk);
1867
1868         /* We need to make sure that this socket is listening,
1869          * and that it has something pending.
1870          */
1871         error = -EINVAL;
1872         if (sk->sk_state != TCP_LISTEN)
1873                 goto out_err;
1874
1875         /* Find already established connection */
1876         if (reqsk_queue_empty(&tp->accept_queue)) {
1877                 long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
1878
1879                 /* If this is a non-blocking socket, don't sleep. */
1880                 error = -EAGAIN;
1881                 if (!timeo)
1882                         goto out_err;
1883
1884                 error = wait_for_connect(sk, timeo);
1885                 if (error)
1886                         goto out_err;
1887         }
1888
1889         newsk = reqsk_queue_get_child(&tp->accept_queue, sk);
1890         BUG_TRAP(newsk->sk_state != TCP_SYN_RECV);
1891 out:
1892         release_sock(sk);
1893         return newsk;
1894 out_err:
1895         newsk = NULL;
1896         *err = error;
1897         goto out;
1898 }
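/*
 * Illustrative userspace sketch (an assumption, not taken from this file):
 * with O_NONBLOCK the timeout above is zero, so an empty accept queue is
 * reported as EAGAIN instead of sleeping in wait_for_connect():
 *
 *	int cfd = accept(lfd, NULL, NULL);
 *
 *	if (cfd < 0 && (errno == EAGAIN || errno == EWOULDBLOCK))
 *		poll_for_readability(lfd);	// hypothetical poll()/select() helper
 */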
1899
1900 /*
1901  *      Socket option code for TCP.
1902  */
1903 int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
1904                    int optlen)
1905 {
1906         struct tcp_sock *tp = tcp_sk(sk);
1907         int val;
1908         int err = 0;
1909
1910         if (level != SOL_TCP)
1911                 return tp->af_specific->setsockopt(sk, level, optname,
1912                                                    optval, optlen);
1913
1914         /* This is a string value; all the others are ints. */
1915         if (optname == TCP_CONGESTION) {
1916                 char name[TCP_CA_NAME_MAX];
1917
1918                 if (optlen < 1)
1919                         return -EINVAL;
1920
1921                 val = strncpy_from_user(name, optval,
1922                                         min(TCP_CA_NAME_MAX-1, optlen));
1923                 if (val < 0)
1924                         return -EFAULT;
1925                 name[val] = 0;
1926
1927                 lock_sock(sk);
1928                 err = tcp_set_congestion_control(tp, name);
1929                 release_sock(sk);
1930                 return err;
1931         }
1932
1933         if (optlen < sizeof(int))
1934                 return -EINVAL;
1935
1936         if (get_user(val, (int __user *)optval))
1937                 return -EFAULT;
1938
1939         lock_sock(sk);
1940
1941         switch (optname) {
1942         case TCP_MAXSEG:
1943                 /* Values greater than the interface MTU won't take effect.
1944                  * However, at the point when this call is made we typically
1945                  * don't yet know which interface is going to be used. */
1946                 if (val < 8 || val > MAX_TCP_WINDOW) {
1947                         err = -EINVAL;
1948                         break;
1949                 }
1950                 tp->rx_opt.user_mss = val;
1951                 break;
1952
1953         case TCP_NODELAY:
1954                 if (val) {
1955                         /* TCP_NODELAY is weaker than TCP_CORK, so setting
1956                          * this option on a corked socket is remembered, but
1957                          * it is not activated until the cork is cleared.
1958                          *
1959                          * However, when TCP_NODELAY is set we make
1960                          * an explicit push, which overrides even TCP_CORK
1961                          * for the currently queued segments.
1962                          */
1963                         tp->nonagle |= TCP_NAGLE_OFF|TCP_NAGLE_PUSH;
1964                         tcp_push_pending_frames(sk, tp);
1965                 } else {
1966                         tp->nonagle &= ~TCP_NAGLE_OFF;
1967                 }
1968                 break;
1969
1970         case TCP_CORK:
1971                 /* When set, this indicates that non-full frames should always
1972                  * be queued.  Later the user clears the option and we transmit
1973                  * any pending partial frames in the queue.  This is meant to
1974                  * be used alongside sendfile() to get properly filled frames
1975                  * when the user (for example) must write out headers with a
1976                  * write() call first and then use sendfile() to send out the
1977                  * data parts (see the illustrative sketch after this function).
1978                  *
1979                  * TCP_CORK can be set together with TCP_NODELAY and it is
1980                  * stronger than TCP_NODELAY.
1981                  */
1982                 if (val) {
1983                         tp->nonagle |= TCP_NAGLE_CORK;
1984                 } else {
1985                         tp->nonagle &= ~TCP_NAGLE_CORK;
1986                         if (tp->nonagle&TCP_NAGLE_OFF)
1987                                 tp->nonagle |= TCP_NAGLE_PUSH;
1988                         tcp_push_pending_frames(sk, tp);
1989                 }
1990                 break;
1991
1992         case TCP_KEEPIDLE:
1993                 if (val < 1 || val > MAX_TCP_KEEPIDLE)
1994                         err = -EINVAL;
1995                 else {
1996                         tp->keepalive_time = val * HZ;
1997                         if (sock_flag(sk, SOCK_KEEPOPEN) &&
1998                             !((1 << sk->sk_state) &
1999                               (TCPF_CLOSE | TCPF_LISTEN))) {
2000                                 __u32 elapsed = tcp_time_stamp - tp->rcv_tstamp;
2001                                 if (tp->keepalive_time > elapsed)
2002                                         elapsed = tp->keepalive_time - elapsed;
2003                                 else
2004                                         elapsed = 0;
2005                                 tcp_reset_keepalive_timer(sk, elapsed);
2006                         }
2007                 }
2008                 break;
2009         case TCP_KEEPINTVL:
2010                 if (val < 1 || val > MAX_TCP_KEEPINTVL)
2011                         err = -EINVAL;
2012                 else
2013                         tp->keepalive_intvl = val * HZ;
2014                 break;
2015         case TCP_KEEPCNT:
2016                 if (val < 1 || val > MAX_TCP_KEEPCNT)
2017                         err = -EINVAL;
2018                 else
2019                         tp->keepalive_probes = val;
2020                 break;
2021         case TCP_SYNCNT:
2022                 if (val < 1 || val > MAX_TCP_SYNCNT)
2023                         err = -EINVAL;
2024                 else
2025                         tp->syn_retries = val;
2026                 break;
2027
2028         case TCP_LINGER2:
2029                 if (val < 0)
2030                         tp->linger2 = -1;
2031                 else if (val > sysctl_tcp_fin_timeout / HZ)
2032                         tp->linger2 = 0;
2033                 else
2034                         tp->linger2 = val * HZ;
2035                 break;
2036
2037         case TCP_DEFER_ACCEPT:
2038                 tp->defer_accept = 0;
2039                 if (val > 0) {
2040                         /* Translate value in seconds to number of
2041                          * retransmits */
2042                         while (tp->defer_accept < 32 &&
2043                                val > ((TCP_TIMEOUT_INIT / HZ) <<
2044                                        tp->defer_accept))
2045                                 tp->defer_accept++;
2046                         tp->defer_accept++;
2047                 }
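                        /*
                         * Worked example (assuming the usual TCP_TIMEOUT_INIT
                         * of 3*HZ): val == 10 seconds steps past the 3s and 6s
                         * retransmit points but not 12s, so tp->defer_accept
                         * ends up as 3 and getsockopt() reports 12 seconds.
                         */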
2048                 break;
2049
2050         case TCP_WINDOW_CLAMP:
2051                 if (!val) {
2052                         if (sk->sk_state != TCP_CLOSE) {
2053                                 err = -EINVAL;
2054                                 break;
2055                         }
2056                         tp->window_clamp = 0;
2057                 } else
2058                         tp->window_clamp = val < SOCK_MIN_RCVBUF / 2 ?
2059                                                 SOCK_MIN_RCVBUF / 2 : val;
2060                 break;
2061
2062         case TCP_QUICKACK:
2063                 if (!val) {
2064                         tp->ack.pingpong = 1;
2065                 } else {
2066                         tp->ack.pingpong = 0;
2067                         if ((1 << sk->sk_state) &
2068                             (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) &&
2069                             tcp_ack_scheduled(tp)) {
2070                                 tp->ack.pending |= TCP_ACK_PUSHED;
2071                                 cleanup_rbuf(sk, 1);
2072                                 if (!(val & 1))
2073                                         tp->ack.pingpong = 1;
2074                         }
2075                 }
2076                 break;
2077
2078         default:
2079                 err = -ENOPROTOOPT;
2080                 break;
2081         };
2082         release_sock(sk);
2083         return err;
2084 }
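/*
 * Illustrative userspace sketch of the write()-headers-then-sendfile()
 * pattern described in the TCP_CORK comment above.  hdr, hdr_len, file_fd
 * and file_len are hypothetical:
 *
 *	int on = 1, off = 0;
 *
 *	setsockopt(fd, IPPROTO_TCP, TCP_CORK, &on, sizeof(on));
 *	write(fd, hdr, hdr_len);			// queued, not sent as a runt
 *	sendfile(fd, file_fd, NULL, file_len);		// fills full-sized frames
 *	setsockopt(fd, IPPROTO_TCP, TCP_CORK, &off, sizeof(off));	// uncork: flush
 */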
2085
2086 /* Return information about state of tcp endpoint in API format. */
2087 void tcp_get_info(struct sock *sk, struct tcp_info *info)
2088 {
2089         struct tcp_sock *tp = tcp_sk(sk);
2090         u32 now = tcp_time_stamp;
2091
2092         memset(info, 0, sizeof(*info));
2093
2094         info->tcpi_state = sk->sk_state;
2095         info->tcpi_ca_state = tp->ca_state;
2096         info->tcpi_retransmits = tp->retransmits;
2097         info->tcpi_probes = tp->probes_out;
2098         info->tcpi_backoff = tp->backoff;
2099
2100         if (tp->rx_opt.tstamp_ok)
2101                 info->tcpi_options |= TCPI_OPT_TIMESTAMPS;
2102         if (tp->rx_opt.sack_ok)
2103                 info->tcpi_options |= TCPI_OPT_SACK;
2104         if (tp->rx_opt.wscale_ok) {
2105                 info->tcpi_options |= TCPI_OPT_WSCALE;
2106                 info->tcpi_snd_wscale = tp->rx_opt.snd_wscale;
2107                 info->tcpi_rcv_wscale = tp->rx_opt.rcv_wscale;
2108         }
2109
2110         if (tp->ecn_flags&TCP_ECN_OK)
2111                 info->tcpi_options |= TCPI_OPT_ECN;
2112
2113         info->tcpi_rto = jiffies_to_usecs(tp->rto);
2114         info->tcpi_ato = jiffies_to_usecs(tp->ack.ato);
2115         info->tcpi_snd_mss = tp->mss_cache;
2116         info->tcpi_rcv_mss = tp->ack.rcv_mss;
2117
2118         info->tcpi_unacked = tp->packets_out;
2119         info->tcpi_sacked = tp->sacked_out;
2120         info->tcpi_lost = tp->lost_out;
2121         info->tcpi_retrans = tp->retrans_out;
2122         info->tcpi_fackets = tp->fackets_out;
2123
2124         info->tcpi_last_data_sent = jiffies_to_msecs(now - tp->lsndtime);
2125         info->tcpi_last_data_recv = jiffies_to_msecs(now - tp->ack.lrcvtime);
2126         info->tcpi_last_ack_recv = jiffies_to_msecs(now - tp->rcv_tstamp);
2127
2128         info->tcpi_pmtu = tp->pmtu_cookie;
2129         info->tcpi_rcv_ssthresh = tp->rcv_ssthresh;
2130         info->tcpi_rtt = jiffies_to_usecs(tp->srtt)>>3;
2131         info->tcpi_rttvar = jiffies_to_usecs(tp->mdev)>>2;
2132         info->tcpi_snd_ssthresh = tp->snd_ssthresh;
2133         info->tcpi_snd_cwnd = tp->snd_cwnd;
2134         info->tcpi_advmss = tp->advmss;
2135         info->tcpi_reordering = tp->reordering;
2136
2137         info->tcpi_rcv_rtt = jiffies_to_usecs(tp->rcv_rtt_est.rtt)>>3;
2138         info->tcpi_rcv_space = tp->rcvq_space.space;
2139
2140         info->tcpi_total_retrans = tp->total_retrans;
2141 }
2142
2143 EXPORT_SYMBOL_GPL(tcp_get_info);
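/*
 * Illustrative userspace sketch (assumed usage): tcp_get_info() is what
 * backs getsockopt(TCP_INFO), handled in tcp_getsockopt() below:
 *
 *	struct tcp_info info;
 *	socklen_t len = sizeof(info);
 *
 *	if (getsockopt(fd, IPPROTO_TCP, TCP_INFO, &info, &len) == 0)
 *		report_rtt(info.tcpi_rtt);	// hypothetical helper; value is in usec
 */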
2144
2145 int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
2146                    int __user *optlen)
2147 {
2148         struct tcp_sock *tp = tcp_sk(sk);
2149         int val, len;
2150
2151         if (level != SOL_TCP)
2152                 return tp->af_specific->getsockopt(sk, level, optname,
2153                                                    optval, optlen);
2154
2155         if (get_user(len, optlen))
2156                 return -EFAULT;
2157
2158         len = min_t(unsigned int, len, sizeof(int));
2159
2160         if (len < 0)
2161                 return -EINVAL;
2162
2163         switch (optname) {
2164         case TCP_MAXSEG:
2165                 val = tp->mss_cache;
2166                 if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
2167                         val = tp->rx_opt.user_mss;
2168                 break;
2169         case TCP_NODELAY:
2170                 val = !!(tp->nonagle&TCP_NAGLE_OFF);
2171                 break;
2172         case TCP_CORK:
2173                 val = !!(tp->nonagle&TCP_NAGLE_CORK);
2174                 break;
2175         case TCP_KEEPIDLE:
2176                 val = (tp->keepalive_time ? : sysctl_tcp_keepalive_time) / HZ;
2177                 break;
2178         case TCP_KEEPINTVL:
2179                 val = (tp->keepalive_intvl ? : sysctl_tcp_keepalive_intvl) / HZ;
2180                 break;
2181         case TCP_KEEPCNT:
2182                 val = tp->keepalive_probes ? : sysctl_tcp_keepalive_probes;
2183                 break;
2184         case TCP_SYNCNT:
2185                 val = tp->syn_retries ? : sysctl_tcp_syn_retries;
2186                 break;
2187         case TCP_LINGER2:
2188                 val = tp->linger2;
2189                 if (val >= 0)
2190                         val = (val ? : sysctl_tcp_fin_timeout) / HZ;
2191                 break;
2192         case TCP_DEFER_ACCEPT:
2193                 val = !tp->defer_accept ? 0 : ((TCP_TIMEOUT_INIT / HZ) <<
2194                                                (tp->defer_accept - 1));
2195                 break;
2196         case TCP_WINDOW_CLAMP:
2197                 val = tp->window_clamp;
2198                 break;
2199         case TCP_INFO: {
2200                 struct tcp_info info;
2201
2202                 if (get_user(len, optlen))
2203                         return -EFAULT;
2204
2205                 tcp_get_info(sk, &info);
2206
2207                 len = min_t(unsigned int, len, sizeof(info));
2208                 if (put_user(len, optlen))
2209                         return -EFAULT;
2210                 if (copy_to_user(optval, &info, len))
2211                         return -EFAULT;
2212                 return 0;
2213         }
2214         case TCP_QUICKACK:
2215                 val = !tp->ack.pingpong;
2216                 break;
2217
2218         case TCP_CONGESTION:
2219                 if (get_user(len, optlen))
2220                         return -EFAULT;
2221                 len = min_t(unsigned int, len, TCP_CA_NAME_MAX);
2222                 if (put_user(len, optlen))
2223                         return -EFAULT;
2224                 if (copy_to_user(optval, tp->ca_ops->name, len))
2225                         return -EFAULT;
2226                 return 0;
2227         default:
2228                 return -ENOPROTOOPT;
2229         };
2230
2231         if (put_user(len, optlen))
2232                 return -EFAULT;
2233         if (copy_to_user(optval, &val, len))
2234                 return -EFAULT;
2235         return 0;
2236 }
2237
2238
2239 extern void __skb_cb_too_small_for_tcp(int, int);
2240 extern struct tcp_congestion_ops tcp_reno;
2241
2242 static __initdata unsigned long thash_entries;
2243 static int __init set_thash_entries(char *str)
2244 {
2245         if (!str)
2246                 return 0;
2247         thash_entries = simple_strtoul(str, &str, 0);
2248         return 1;
2249 }
2250 __setup("thash_entries=", set_thash_entries);
2251
2252 void __init tcp_init(void)
2253 {
2254         struct sk_buff *skb = NULL;
2255         int order, i;
2256
2257         if (sizeof(struct tcp_skb_cb) > sizeof(skb->cb))
2258                 __skb_cb_too_small_for_tcp(sizeof(struct tcp_skb_cb),
2259                                            sizeof(skb->cb));
2260
2261         tcp_bucket_cachep = kmem_cache_create("tcp_bind_bucket",
2262                                               sizeof(struct tcp_bind_bucket),
2263                                               0, SLAB_HWCACHE_ALIGN,
2264                                               NULL, NULL);
2265         if (!tcp_bucket_cachep)
2266                 panic("tcp_init: Cannot alloc tcp_bind_bucket cache.");
2267
2268         tcp_timewait_cachep = kmem_cache_create("tcp_tw_bucket",
2269                                                 sizeof(struct tcp_tw_bucket),
2270                                                 0, SLAB_HWCACHE_ALIGN,
2271                                                 NULL, NULL);
2272         if (!tcp_timewait_cachep)
2273                 panic("tcp_init: Cannot alloc tcp_tw_bucket cache.");
2274
2275         /* Size and allocate the main established and bind bucket
2276          * hash tables.
2277          *
2278          * The methodology is similar to that of the buffer cache.
2279          */
2280         tcp_ehash = (struct tcp_ehash_bucket *)
2281                 alloc_large_system_hash("TCP established",
2282                                         sizeof(struct tcp_ehash_bucket),
2283                                         thash_entries,
2284                                         (num_physpages >= 128 * 1024) ?
2285                                                 (25 - PAGE_SHIFT) :
2286                                                 (27 - PAGE_SHIFT),
2287                                         HASH_HIGHMEM,
2288                                         &tcp_ehash_size,
2289                                         NULL,
2290                                         0);
2291         tcp_ehash_size = (1 << tcp_ehash_size) >> 1;
2292         for (i = 0; i < (tcp_ehash_size << 1); i++) {
2293                 rwlock_init(&tcp_ehash[i].lock);
2294                 INIT_HLIST_HEAD(&tcp_ehash[i].chain);
2295         }
2296
2297         tcp_bhash = (struct tcp_bind_hashbucket *)
2298                 alloc_large_system_hash("TCP bind",
2299                                         sizeof(struct tcp_bind_hashbucket),
2300                                         tcp_ehash_size,
2301                                         (num_physpages >= 128 * 1024) ?
2302                                                 (25 - PAGE_SHIFT) :
2303                                                 (27 - PAGE_SHIFT),
2304                                         HASH_HIGHMEM,
2305                                         &tcp_bhash_size,
2306                                         NULL,
2307                                         64 * 1024);
2308         tcp_bhash_size = 1 << tcp_bhash_size;
2309         for (i = 0; i < tcp_bhash_size; i++) {
2310                 spin_lock_init(&tcp_bhash[i].lock);
2311                 INIT_HLIST_HEAD(&tcp_bhash[i].chain);
2312         }
2313
2314         /* Try to be a bit smarter and adjust defaults depending
2315          * on available memory.
2316          */
2317         for (order = 0; ((1 << order) << PAGE_SHIFT) <
2318                         (tcp_bhash_size * sizeof(struct tcp_bind_hashbucket));
2319                         order++)
2320                 ;
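        /*
         * 'order' is now the page order spanned by the bind-bucket hash just
         * sized above; below it doubles as a rough measure of how much memory
         * the machine has when scaling the local port range, the orphan and
         * time-wait limits, and the tcp_mem[] pressure thresholds.
         */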
2321         if (order >= 4) {
2322                 sysctl_local_port_range[0] = 32768;
2323                 sysctl_local_port_range[1] = 61000;
2324                 sysctl_tcp_max_tw_buckets = 180000;
2325                 sysctl_tcp_max_orphans = 4096 << (order - 4);
2326                 sysctl_max_syn_backlog = 1024;
2327         } else if (order < 3) {
2328                 sysctl_local_port_range[0] = 1024 * (3 - order);
2329                 sysctl_tcp_max_tw_buckets >>= (3 - order);
2330                 sysctl_tcp_max_orphans >>= (3 - order);
2331                 sysctl_max_syn_backlog = 128;
2332         }
2333         tcp_port_rover = sysctl_local_port_range[0] - 1;
2334
2335         sysctl_tcp_mem[0] =  768 << order;
2336         sysctl_tcp_mem[1] = 1024 << order;
2337         sysctl_tcp_mem[2] = 1536 << order;
2338
2339         if (order < 3) {
2340                 sysctl_tcp_wmem[2] = 64 * 1024;
2341                 sysctl_tcp_rmem[0] = PAGE_SIZE;
2342                 sysctl_tcp_rmem[1] = 43689;
2343                 sysctl_tcp_rmem[2] = 2 * 43689;
2344         }
2345
2346         printk(KERN_INFO "TCP: Hash tables configured "
2347                "(established %d bind %d)\n",
2348                tcp_ehash_size << 1, tcp_bhash_size);
2349
2350         tcp_register_congestion_control(&tcp_reno);
2351 }
2352
2353 EXPORT_SYMBOL(tcp_accept);
2354 EXPORT_SYMBOL(tcp_close);
2355 EXPORT_SYMBOL(tcp_destroy_sock);
2356 EXPORT_SYMBOL(tcp_disconnect);
2357 EXPORT_SYMBOL(tcp_getsockopt);
2358 EXPORT_SYMBOL(tcp_ioctl);
2359 EXPORT_SYMBOL(tcp_poll);
2360 EXPORT_SYMBOL(tcp_read_sock);
2361 EXPORT_SYMBOL(tcp_recvmsg);
2362 EXPORT_SYMBOL(tcp_sendmsg);
2363 EXPORT_SYMBOL(tcp_sendpage);
2364 EXPORT_SYMBOL(tcp_setsockopt);
2365 EXPORT_SYMBOL(tcp_shutdown);
2366 EXPORT_SYMBOL(tcp_statistics);
2367 EXPORT_SYMBOL(tcp_timewait_cachep);