Merge tag 'gfs2-merge-window' of git://git.kernel.org/pub/scm/linux/kernel/git/steve...

[cascardo/linux.git] / include / net / tcp.h
diff --git a/include/net/tcp.h b/include/net/tcp.h

index 2c2f24f..74efeda 100644 (file)
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -30,6 +30,7 @@
  #include <linux/crypto.h>
  #include <linux/cryptohash.h>
  #include <linux/kref.h>
+#include <linux/ktime.h>
  
  #include <net/inet_connection_sock.h>
  #include <net/inet_timewait_sock.h>
@@ -218,8 +219,6 @@ void tcp_time_wait(struct sock *sk, int state, int timeo);
  #define        TFO_SERVER_ENABLE       2
  #define        TFO_CLIENT_NO_COOKIE    4       /* Data in SYN w/o cookie option */
  
-/* Process SYN data but skip cookie validation */
-#define        TFO_SERVER_COOKIE_NOT_CHKED     0x100
  /* Accept SYN data w/o any cookie option */
  #define        TFO_SERVER_COOKIE_NOT_REQD      0x200
  
@@ -228,10 +227,6 @@ void tcp_time_wait(struct sock *sk, int state, int timeo);
   */
  #define        TFO_SERVER_WO_SOCKOPT1  0x400
  #define        TFO_SERVER_WO_SOCKOPT2  0x800
-/* Always create TFO child sockets on a TFO listener even when
- * cookie/data not present. (For testing purpose!)
- */
-#define        TFO_SERVER_ALWAYS       0x1000
  
  extern struct inet_timewait_death_row tcp_death_row;
  
@@ -371,7 +366,6 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
  void tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
                          const struct tcphdr *th, unsigned int len);
  void tcp_rcv_space_adjust(struct sock *sk);
-void tcp_cleanup_rbuf(struct sock *sk, int copied);
  int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp);
  void tcp_twsk_destructor(struct sock *sk);
  ssize_t tcp_splice_read(struct socket *sk, loff_t *ppos,
@@ -414,13 +408,13 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
                            bool fastopen);
  int tcp_child_process(struct sock *parent, struct sock *child,
                       struct sk_buff *skb);
-void tcp_enter_loss(struct sock *sk, int how);
+void tcp_enter_loss(struct sock *sk);
  void tcp_clear_retrans(struct tcp_sock *tp);
  void tcp_update_metrics(struct sock *sk);
  void tcp_init_metrics(struct sock *sk);
  void tcp_metrics_init(void);
  bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst,
-                       bool paws_check);
+                       bool paws_check, bool timestamps);
  bool tcp_remember_stamp(struct sock *sk);
  bool tcp_tw_remember_stamp(struct inet_timewait_sock *tw);
  void tcp_fetch_timewait_stamp(struct sock *sk, struct dst_entry *dst);
@@ -451,6 +445,7 @@ const u8 *tcp_parse_md5sig_option(const struct tcphdr *th);
   */
  
  void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb);
+void tcp_v4_mtu_reduced(struct sock *sk);
  int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb);
  struct sock *tcp_create_openreq_child(struct sock *sk,
                                       struct request_sock *req,
@@ -476,34 +471,28 @@ int __cookie_v4_check(const struct iphdr *iph, const struct tcphdr *th,
  struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
                              struct ip_options *opt);
  #ifdef CONFIG_SYN_COOKIES
-#include <linux/ktime.h>
  
-/* Syncookies use a monotonic timer which increments every 64 seconds.
+/* Syncookies use a monotonic timer which increments every 60 seconds.
   * This counter is used both as a hash input and partially encoded into
   * the cookie value.  A cookie is only validated further if the delta
   * between the current counter value and the encoded one is less than this,
- * i.e. a sent cookie is valid only at most for 128 seconds (or less if
+ * i.e. a sent cookie is valid for at most 2*60 seconds (or less if
   * the counter advances immediately after a cookie is generated).
   */
  #define MAX_SYNCOOKIE_AGE 2
  
  static inline u32 tcp_cookie_time(void)
  {
-       struct timespec now;
-       getnstimeofday(&now);
-       return now.tv_sec >> 6; /* 64 seconds granularity */
+       u64 val = get_jiffies_64();
+
+       do_div(val, 60 * HZ);
+       return val;
  }
  
  u32 __cookie_v4_init_sequence(const struct iphdr *iph, const struct tcphdr *th,
                               u16 *mssp);
-__u32 cookie_v4_init_sequence(struct sock *sk, struct sk_buff *skb, __u16 *mss);
-#else
-static inline __u32 cookie_v4_init_sequence(struct sock *sk,
-                                           struct sk_buff *skb,
-                                           __u16 *mss)
-{
-       return 0;
-}
+__u32 cookie_v4_init_sequence(struct sock *sk, const struct sk_buff *skb,
+                             __u16 *mss);
  #endif
  
  __u32 cookie_init_timestamp(struct request_sock *req);
@@ -519,13 +508,6 @@ u32 __cookie_v6_init_sequence(const struct ipv6hdr *iph,
                               const struct tcphdr *th, u16 *mssp);
  __u32 cookie_v6_init_sequence(struct sock *sk, const struct sk_buff *skb,
                               __u16 *mss);
-#else
-static inline __u32 cookie_v6_init_sequence(struct sock *sk,
-                                           struct sk_buff *skb,
-                                           __u16 *mss)
-{
-       return 0;
-}
  #endif
  /* tcp_output.c */
  
@@ -538,7 +520,7 @@ void tcp_retransmit_timer(struct sock *sk);
  void tcp_xmit_retransmit_queue(struct sock *);
  void tcp_simple_retransmit(struct sock *);
  int tcp_trim_head(struct sock *, struct sk_buff *, u32);
-int tcp_fragment(struct sock *, struct sk_buff *, u32, unsigned int);
+int tcp_fragment(struct sock *, struct sk_buff *, u32, unsigned int, gfp_t);
  
  void tcp_send_probe0(struct sock *);
  void tcp_send_partial(struct sock *);
@@ -555,7 +537,6 @@ void tcp_send_loss_probe(struct sock *sk);
  bool tcp_schedule_loss_probe(struct sock *sk);
  
  /* tcp_input.c */
-void tcp_cwnd_application_limited(struct sock *sk);
  void tcp_resume_early_retransmit(struct sock *sk);
  void tcp_rearm_rto(struct sock *sk);
  void tcp_reset(struct sock *sk);
@@ -617,7 +598,7 @@ static inline void tcp_bound_rto(const struct sock *sk)
  
  static inline u32 __tcp_set_rto(const struct tcp_sock *tp)
  {
-       return (tp->srtt >> 3) + tp->rttvar;
+       return usecs_to_jiffies((tp->srtt_us >> 3) + tp->rttvar_us);
  }
  
  static inline void __tcp_fast_path_on(struct tcp_sock *tp, u32 snd_wnd)
@@ -654,6 +635,11 @@ static inline u32 tcp_rto_min(struct sock *sk)
         return rto_min;
  }
  
+static inline u32 tcp_rto_min_us(struct sock *sk)
+{
+       return jiffies_to_usecs(tcp_rto_min(sk));
+}
+
  /* Compute the actual receive window we are currently advertising.
   * Rcv_nxt can be after the window if our peer push more data
   * than the offered window.
@@ -683,6 +669,12 @@ void tcp_send_window_probe(struct sock *sk);
   */
  #define tcp_time_stamp         ((__u32)(jiffies))
  
+static inline u32 tcp_skb_timestamp(const struct sk_buff *skb)
+{
+       return skb->skb_mstamp.stamp_jiffies;
+}
+
+
  #define tcp_flag_byte(th) (((u_int8_t *)th)[13])
  
  #define TCPHDR_FIN 0x01
@@ -701,15 +693,18 @@ void tcp_send_window_probe(struct sock *sk);
   * If this grows please adjust skbuff.h:skbuff->cb[xxx] size appropriately.
   */
  struct tcp_skb_cb {
-       union {
-               struct inet_skb_parm    h4;
-#if IS_ENABLED(CONFIG_IPV6)
-               struct inet6_skb_parm   h6;
-#endif
-       } header;       /* For incoming frames          */
         __u32           seq;            /* Starting sequence number     */
         __u32           end_seq;        /* SEQ + FIN + SYN + datalen    */
-       __u32           when;           /* used to compute rtt's        */
+       union {
+               /* Note: tcp_tw_isn is used in the input path only
+                *       (isn chosen by tcp_timewait_state_process())
+                *
+                *       tcp_gso_segs is used in the write queue only,
+                *       cf. tcp_skb_pcount()
+                */
+               __u32           tcp_tw_isn;
+               __u32           tcp_gso_segs;
+       };
         __u8            tcp_flags;      /* TCP header flags. (tcp[13])  */
  
         __u8            sacked;         /* State flags for SACK/FACK.   */
@@ -717,39 +712,40 @@ struct tcp_skb_cb {
  #define TCPCB_SACKED_RETRANS   0x02    /* SKB retransmitted            */
  #define TCPCB_LOST             0x04    /* SKB is lost                  */
  #define TCPCB_TAGBITS          0x07    /* All tag bits                 */
+#define TCPCB_REPAIRED         0x10    /* SKB repaired (no skb_mstamp) */
  #define TCPCB_EVER_RETRANS     0x80    /* Ever retransmitted frame     */
-#define TCPCB_RETRANS          (TCPCB_SACKED_RETRANS|TCPCB_EVER_RETRANS)
+#define TCPCB_RETRANS          (TCPCB_SACKED_RETRANS|TCPCB_EVER_RETRANS| \
+                               TCPCB_REPAIRED)
  
         __u8            ip_dsfield;     /* IPv4 tos or IPv6 dsfield     */
         /* 1 byte hole */
         __u32           ack_seq;        /* Sequence number ACK'd        */
+       union {
+               struct inet_skb_parm    h4;
+#if IS_ENABLED(CONFIG_IPV6)
+               struct inet6_skb_parm   h6;
+#endif
+       } header;       /* For incoming frames          */
  };
  
  #define TCP_SKB_CB(__skb)      ((struct tcp_skb_cb *)&((__skb)->cb[0]))
  
-/* RFC3168 : 6.1.1 SYN packets must not have ECT/ECN bits set
- *
- * If we receive a SYN packet with these bits set, it means a network is
- * playing bad games with TOS bits. In order to avoid possible false congestion
- * notifications, we disable TCP ECN negociation.
+/* Due to TSO, an SKB can be composed of multiple actual
+ * packets.  To keep these tracked properly, we use this.
   */
-static inline void
-TCP_ECN_create_request(struct request_sock *req, const struct sk_buff *skb,
-               struct net *net)
+static inline int tcp_skb_pcount(const struct sk_buff *skb)
  {
-       const struct tcphdr *th = tcp_hdr(skb);
+       return TCP_SKB_CB(skb)->tcp_gso_segs;
+}
  
-       if (net->ipv4.sysctl_tcp_ecn && th->ece && th->cwr &&
-           INET_ECN_is_not_ect(TCP_SKB_CB(skb)->ip_dsfield))
-               inet_rsk(req)->ecn_ok = 1;
+static inline void tcp_skb_pcount_set(struct sk_buff *skb, int segs)
+{
+       TCP_SKB_CB(skb)->tcp_gso_segs = segs;
  }
  
-/* Due to TSO, an SKB can be composed of multiple actual
- * packets.  To keep these tracked properly, we use this.
- */
-static inline int tcp_skb_pcount(const struct sk_buff *skb)
+static inline void tcp_skb_pcount_add(struct sk_buff *skb, int segs)
  {
-       return skb_shinfo(skb)->gso_segs;
+       TCP_SKB_CB(skb)->tcp_gso_segs += segs;
  }
  
  /* This is valid iff tcp_skb_pcount() > 1. */
@@ -764,8 +760,17 @@ enum tcp_ca_event {
         CA_EVENT_CWND_RESTART,  /* congestion window restart */
         CA_EVENT_COMPLETE_CWR,  /* end of congestion recovery */
         CA_EVENT_LOSS,          /* loss timeout */
-       CA_EVENT_FAST_ACK,      /* in sequence ack */
-       CA_EVENT_SLOW_ACK,      /* other ack */
+       CA_EVENT_ECN_NO_CE,     /* ECT set, but not CE marked */
+       CA_EVENT_ECN_IS_CE,     /* received CE marked IP packet */
+       CA_EVENT_DELAYED_ACK,   /* Delayed ack is sent */
+       CA_EVENT_NON_DELAYED_ACK,
+};
+
+/* Information about inbound ACK, passed to cong_ops->in_ack_event() */
+enum tcp_ca_ack_event_flags {
+       CA_ACK_SLOWPATH         = (1 << 0),     /* In slow path processing */
+       CA_ACK_WIN_UPDATE       = (1 << 1),     /* ACK updated window */
+       CA_ACK_ECE              = (1 << 2),     /* ECE bit is set on ack */
  };
  
  /*
@@ -775,8 +780,10 @@ enum tcp_ca_event {
  #define TCP_CA_MAX     128
  #define TCP_CA_BUF_MAX (TCP_CA_NAME_MAX*TCP_CA_MAX)
  
+/* Algorithm can be set on socket without CAP_NET_ADMIN privileges */
  #define TCP_CONG_NON_RESTRICTED 0x1
-#define TCP_CONG_RTT_STAMP     0x2
+/* Requires ECN/ECT set on all packets */
+#define TCP_CONG_NEEDS_ECN     0x2
  
  struct tcp_congestion_ops {
         struct list_head        list;
@@ -789,14 +796,14 @@ struct tcp_congestion_ops {
  
         /* return slow start threshold (required) */
         u32 (*ssthresh)(struct sock *sk);
-       /* lower bound for congestion window (optional) */
-       u32 (*min_cwnd)(const struct sock *sk);
         /* do new cwnd calculation (required) */
-       void (*cong_avoid)(struct sock *sk, u32 ack, u32 acked, u32 in_flight);
+       void (*cong_avoid)(struct sock *sk, u32 ack, u32 acked);
         /* call before changing ca_state (optional) */
         void (*set_state)(struct sock *sk, u8 new_state);
         /* call when cwnd event occurs (optional) */
         void (*cwnd_event)(struct sock *sk, enum tcp_ca_event ev);
+       /* call when ack arrives (optional) */
+       void (*in_ack_event)(struct sock *sk, u32 flags);
         /* new value of cwnd after loss (optional) */
         u32  (*undo_cwnd)(struct sock *sk);
         /* hook for packet ack accounting (optional) */
@@ -811,6 +818,7 @@ struct tcp_congestion_ops {
  int tcp_register_congestion_control(struct tcp_congestion_ops *type);
  void tcp_unregister_congestion_control(struct tcp_congestion_ops *type);
  
+void tcp_assign_congestion_control(struct sock *sk);
  void tcp_init_congestion_control(struct sock *sk);
  void tcp_cleanup_congestion_control(struct sock *sk);
  int tcp_set_default_congestion_control(const char *name);
@@ -819,15 +827,20 @@ void tcp_get_available_congestion_control(char *buf, size_t len);
  void tcp_get_allowed_congestion_control(char *buf, size_t len);
  int tcp_set_allowed_congestion_control(char *allowed);
  int tcp_set_congestion_control(struct sock *sk, const char *name);
-int tcp_slow_start(struct tcp_sock *tp, u32 acked);
+void tcp_slow_start(struct tcp_sock *tp, u32 acked);
  void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w);
  
-extern struct tcp_congestion_ops tcp_init_congestion_ops;
  u32 tcp_reno_ssthresh(struct sock *sk);
-void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked, u32 in_flight);
-u32 tcp_reno_min_cwnd(const struct sock *sk);
+void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked);
  extern struct tcp_congestion_ops tcp_reno;
  
+static inline bool tcp_ca_needs_ecn(const struct sock *sk)
+{
+       const struct inet_connection_sock *icsk = inet_csk(sk);
+
+       return icsk->icsk_ca_ops->flags & TCP_CONG_NEEDS_ECN;
+}
+
  static inline void tcp_set_ca_state(struct sock *sk, const u8 ca_state)
  {
         struct inet_connection_sock *icsk = inet_csk(sk);
@@ -944,7 +957,7 @@ static inline __u32 tcp_current_ssthresh(const struct sock *sk)
  /* Use define here intentionally to get WARN_ON location shown at the caller */
  #define tcp_verify_left_out(tp)        WARN_ON(tcp_left_out(tp) > tp->packets_out)
  
-void tcp_enter_cwr(struct sock *sk, const int set_ssthresh);
+void tcp_enter_cwr(struct sock *sk);
  __u32 tcp_init_cwnd(const struct tcp_sock *tp, const struct dst_entry *dst);
  
  /* The maximum number of MSS of available cwnd for which TSO defers
@@ -971,7 +984,30 @@ static inline u32 tcp_wnd_end(const struct tcp_sock *tp)
  {
         return tp->snd_una + tp->snd_wnd;
  }
-bool tcp_is_cwnd_limited(const struct sock *sk, u32 in_flight);
+
+/* We follow the spirit of RFC2861 to validate cwnd but implement a more
+ * flexible approach. The RFC suggests cwnd should not be raised unless
+ * it was fully used previously. And that's exactly what we do in
+ * congestion avoidance mode. But in slow start we allow cwnd to grow
+ * as long as the application has used half the cwnd.
+ * Example :
+ *    cwnd is 10 (IW10), but application sends 9 frames.
+ *    We allow cwnd to reach 18 when all frames are ACKed.
+ * This check is safe because it's as aggressive as slow start, which already
+ * risks 100% overshoot. The advantage is that we discourage applications from
+ * sending filler packets or extra data just to inflate cwnd usage, and allow
+ * application-limited processes to probe bw more aggressively.
+ */
+static inline bool tcp_is_cwnd_limited(const struct sock *sk)
+{
+       const struct tcp_sock *tp = tcp_sk(sk);
+
+       /* If in slow start, ensure cwnd grows to twice what was ACKed. */
+       if (tp->snd_cwnd <= tp->snd_ssthresh)
+               return tp->snd_cwnd < 2 * tp->max_packets_out;
+
+       return tp->is_cwnd_limited;
+}
  
  static inline void tcp_check_probe_timer(struct sock *sk)
  {
@@ -1072,7 +1108,7 @@ static inline int tcp_full_space(const struct sock *sk)
  
  static inline void tcp_openreq_init(struct request_sock *req,
                                     struct tcp_options_received *rx_opt,
-                                   struct sk_buff *skb)
+                                   struct sk_buff *skb, struct sock *sk)
  {
         struct inet_request_sock *ireq = inet_rsk(req);
  
@@ -1080,7 +1116,7 @@ static inline void tcp_openreq_init(struct request_sock *req,
         req->cookie_ts = 0;
         tcp_rsk(req)->rcv_isn = TCP_SKB_CB(skb)->seq;
         tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
-       tcp_rsk(req)->snt_synack = 0;
+       tcp_rsk(req)->snt_synack = tcp_time_stamp;
         req->mss = rx_opt->mss_clamp;
         req->ts_recent = rx_opt->saw_tstamp ? rx_opt->rcv_tsval : 0;
         ireq->tstamp_ok = rx_opt->tstamp_ok;
@@ -1091,8 +1127,12 @@ static inline void tcp_openreq_init(struct request_sock *req,
         ireq->ecn_ok = 0;
         ireq->ir_rmt_port = tcp_hdr(skb)->source;
         ireq->ir_num = ntohs(tcp_hdr(skb)->dest);
+       ireq->ir_mark = inet_request_mark(sk, skb);
  }
  
+extern void tcp_openreq_init_rwin(struct request_sock *req,
+                                 struct sock *sk, struct dst_entry *dst);
+
  void tcp_enter_memory_pressure(struct sock *sk);
  
  static inline int keepalive_intvl_when(const struct tcp_sock *tp)
@@ -1302,8 +1342,10 @@ void tcp_free_fastopen_req(struct tcp_sock *tp);
  
  extern struct tcp_fastopen_context __rcu *tcp_fastopen_ctx;
  int tcp_fastopen_reset_cipher(void *key, unsigned int len);
-void tcp_fastopen_cookie_gen(__be32 src, __be32 dst,
-                            struct tcp_fastopen_cookie *foc);
+bool tcp_try_fastopen(struct sock *sk, struct sk_buff *skb,
+                     struct request_sock *req,
+                     struct tcp_fastopen_cookie *foc,
+                     struct dst_entry *dst);
  void tcp_fastopen_init_key_once(bool publish);
  #define TCP_FASTOPEN_KEY_LENGTH 16
  
@@ -1554,6 +1596,11 @@ int tcp4_proc_init(void);
  void tcp4_proc_exit(void);
  #endif
  
+int tcp_rtx_synack(struct sock *sk, struct request_sock *req);
+int tcp_conn_request(struct request_sock_ops *rsk_ops,
+                    const struct tcp_request_sock_ops *af_ops,
+                    struct sock *sk, struct sk_buff *skb);
+
  /* TCP af-specific functions */
  struct tcp_sock_af_ops {
  #ifdef CONFIG_TCP_MD5SIG
@@ -1571,6 +1618,7 @@ struct tcp_sock_af_ops {
  };
  
  struct tcp_request_sock_ops {
+       u16 mss_clamp;
  #ifdef CONFIG_TCP_MD5SIG
         struct tcp_md5sig_key   *(*md5_lookup) (struct sock *sk,
                                                 struct request_sock *req);
@@ -1580,8 +1628,39 @@ struct tcp_request_sock_ops {
                                                   const struct request_sock *req,
                                                   const struct sk_buff *skb);
  #endif
+       void (*init_req)(struct request_sock *req, struct sock *sk,
+                        struct sk_buff *skb);
+#ifdef CONFIG_SYN_COOKIES
+       __u32 (*cookie_init_seq)(struct sock *sk, const struct sk_buff *skb,
+                                __u16 *mss);
+#endif
+       struct dst_entry *(*route_req)(struct sock *sk, struct flowi *fl,
+                                      const struct request_sock *req,
+                                      bool *strict);
+       __u32 (*init_seq)(const struct sk_buff *skb);
+       int (*send_synack)(struct sock *sk, struct dst_entry *dst,
+                          struct flowi *fl, struct request_sock *req,
+                          u16 queue_mapping, struct tcp_fastopen_cookie *foc);
+       void (*queue_hash_add)(struct sock *sk, struct request_sock *req,
+                              const unsigned long timeout);
  };
  
+#ifdef CONFIG_SYN_COOKIES
+static inline __u32 cookie_init_sequence(const struct tcp_request_sock_ops *ops,
+                                        struct sock *sk, struct sk_buff *skb,
+                                        __u16 *mss)
+{
+       return ops->cookie_init_seq(sk, skb, mss);
+}
+#else
+static inline __u32 cookie_init_sequence(const struct tcp_request_sock_ops *ops,
+                                        struct sock *sk, struct sk_buff *skb,
+                                        __u16 *mss)
+{
+       return 0;
+}
+#endif
+
  int tcpv4_offload_init(void);
  
  void tcp_v4_init(void);