#include <linux/crypto.h>
#include <linux/cryptohash.h>
#include <linux/kref.h>
+#include <linux/ktime.h>
#include <net/inet_connection_sock.h>
#include <net/inet_timewait_sock.h>
#define TFO_SERVER_ENABLE 2
#define TFO_CLIENT_NO_COOKIE 4 /* Data in SYN w/o cookie option */
-/* Process SYN data but skip cookie validation */
-#define TFO_SERVER_COOKIE_NOT_CHKED 0x100
/* Accept SYN data w/o any cookie option */
#define TFO_SERVER_COOKIE_NOT_REQD 0x200
*/
#define TFO_SERVER_WO_SOCKOPT1 0x400
#define TFO_SERVER_WO_SOCKOPT2 0x800
-/* Always create TFO child sockets on a TFO listener even when
- * cookie/data not present. (For testing purpose!)
- */
-#define TFO_SERVER_ALWAYS 0x1000
extern struct inet_timewait_death_row tcp_death_row;
void tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
const struct tcphdr *th, unsigned int len);
void tcp_rcv_space_adjust(struct sock *sk);
-void tcp_cleanup_rbuf(struct sock *sk, int copied);
int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp);
void tcp_twsk_destructor(struct sock *sk);
ssize_t tcp_splice_read(struct socket *sk, loff_t *ppos,
bool fastopen);
int tcp_child_process(struct sock *parent, struct sock *child,
struct sk_buff *skb);
-void tcp_enter_loss(struct sock *sk, int how);
+void tcp_enter_loss(struct sock *sk);
void tcp_clear_retrans(struct tcp_sock *tp);
void tcp_update_metrics(struct sock *sk);
void tcp_init_metrics(struct sock *sk);
void tcp_metrics_init(void);
bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst,
- bool paws_check);
+ bool paws_check, bool timestamps);
bool tcp_remember_stamp(struct sock *sk);
bool tcp_tw_remember_stamp(struct inet_timewait_sock *tw);
void tcp_fetch_timewait_stamp(struct sock *sk, struct dst_entry *dst);
*/
void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb);
+void tcp_v4_mtu_reduced(struct sock *sk);
int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb);
struct sock *tcp_create_openreq_child(struct sock *sk,
struct request_sock *req,
struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
struct ip_options *opt);
#ifdef CONFIG_SYN_COOKIES
-#include <linux/ktime.h>
-/* Syncookies use a monotonic timer which increments every 64 seconds.
+/* Syncookies use a monotonic timer which increments every 60 seconds.
* This counter is used both as a hash input and partially encoded into
* the cookie value. A cookie is only validated further if the delta
* between the current counter value and the encoded one is less than this,
- * i.e. a sent cookie is valid only at most for 128 seconds (or less if
+ * i.e. a sent cookie is valid only at most for 2*60 seconds (or less if
* the counter advances immediately after a cookie is generated).
*/
#define MAX_SYNCOOKIE_AGE 2
static inline u32 tcp_cookie_time(void)
{
- struct timespec now;
- getnstimeofday(&now);
- return now.tv_sec >> 6; /* 64 seconds granularity */
+ u64 val = get_jiffies_64();
+
+ do_div(val, 60 * HZ);
+ return val;
}
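
/* A minimal sketch, for illustration only: checking the counter value
 * decoded from a received cookie against tcp_cookie_time() above.
 * 'cookie_count' is a hypothetical decoded value; the real validation
 * lives in net/ipv4/syncookies.c.
 */
static inline bool tcp_cookie_counter_fresh(u32 cookie_count)
{
	/* Acceptable while fewer than MAX_SYNCOOKIE_AGE 60-second
	 * periods have elapsed since the cookie was generated.
	 */
	return (tcp_cookie_time() - cookie_count) < MAX_SYNCOOKIE_AGE;
}
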
u32 __cookie_v4_init_sequence(const struct iphdr *iph, const struct tcphdr *th,
u16 *mssp);
-__u32 cookie_v4_init_sequence(struct sock *sk, struct sk_buff *skb, __u16 *mss);
-#else
-static inline __u32 cookie_v4_init_sequence(struct sock *sk,
- struct sk_buff *skb,
- __u16 *mss)
-{
- return 0;
-}
+__u32 cookie_v4_init_sequence(struct sock *sk, const struct sk_buff *skb,
+ __u16 *mss);
#endif
__u32 cookie_init_timestamp(struct request_sock *req);
const struct tcphdr *th, u16 *mssp);
__u32 cookie_v6_init_sequence(struct sock *sk, const struct sk_buff *skb,
__u16 *mss);
-#else
-static inline __u32 cookie_v6_init_sequence(struct sock *sk,
- struct sk_buff *skb,
- __u16 *mss)
-{
- return 0;
-}
#endif
/* tcp_output.c */
void tcp_xmit_retransmit_queue(struct sock *);
void tcp_simple_retransmit(struct sock *);
int tcp_trim_head(struct sock *, struct sk_buff *, u32);
-int tcp_fragment(struct sock *, struct sk_buff *, u32, unsigned int);
+int tcp_fragment(struct sock *, struct sk_buff *, u32, unsigned int, gfp_t);
void tcp_send_probe0(struct sock *);
void tcp_send_partial(struct sock *);
bool tcp_schedule_loss_probe(struct sock *sk);
/* tcp_input.c */
-void tcp_cwnd_application_limited(struct sock *sk);
void tcp_resume_early_retransmit(struct sock *sk);
void tcp_rearm_rto(struct sock *sk);
void tcp_reset(struct sock *sk);
static inline u32 __tcp_set_rto(const struct tcp_sock *tp)
{
- return (tp->srtt >> 3) + tp->rttvar;
+ return usecs_to_jiffies((tp->srtt_us >> 3) + tp->rttvar_us);
}
static inline void __tcp_fast_path_on(struct tcp_sock *tp, u32 snd_wnd)
return rto_min;
}
+static inline u32 tcp_rto_min_us(struct sock *sk)
+{
+ return jiffies_to_usecs(tcp_rto_min(sk));
+}
+
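/* A minimal sketch (hypothetical helper, not declared elsewhere): combining
 * the microsecond-resolution estimator fields with the per-destination floor
 * from tcp_rto_min_us(), mirroring what __tcp_set_rto() computes in jiffies.
 */
static inline u32 tcp_rto_estimate_us(struct sock *sk)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	u32 rto_us = (tp->srtt_us >> 3) + tp->rttvar_us;	/* srtt_us is stored 8x scaled */

	return max(rto_us, tcp_rto_min_us(sk));
}
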
/* Compute the actual receive window we are currently advertising.
 * Rcv_nxt can be after the window if our peer pushes more data
* than the offered window.
*/
#define tcp_time_stamp ((__u32)(jiffies))
+static inline u32 tcp_skb_timestamp(const struct sk_buff *skb)
+{
+ return skb->skb_mstamp.stamp_jiffies;
+}
+
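/* A minimal sketch (hypothetical helper): age of a queued skb in jiffies,
 * derived from the per-skb timestamp above instead of the removed
 * TCP_SKB_CB(skb)->when field.
 */
static inline u32 tcp_skb_age_jiffies(const struct sk_buff *skb)
{
	return tcp_time_stamp - tcp_skb_timestamp(skb);
}
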
#define tcp_flag_byte(th) (((u_int8_t *)th)[13])
#define TCPHDR_FIN 0x01
* If this grows please adjust skbuff.h:skbuff->cb[xxx] size appropriately.
*/
struct tcp_skb_cb {
- union {
- struct inet_skb_parm h4;
-#if IS_ENABLED(CONFIG_IPV6)
- struct inet6_skb_parm h6;
-#endif
- } header; /* For incoming frames */
__u32 seq; /* Starting sequence number */
__u32 end_seq; /* SEQ + FIN + SYN + datalen */
- __u32 when; /* used to compute rtt's */
+ union {
+ /* Note : tcp_tw_isn is used in input path only
+ * (isn chosen by tcp_timewait_state_process())
+ *
+ * tcp_gso_segs is used in write queue only,
+ * cf tcp_skb_pcount()
+ */
+ __u32 tcp_tw_isn;
+ __u32 tcp_gso_segs;
+ };
__u8 tcp_flags; /* TCP header flags. (tcp[13]) */
__u8 sacked; /* State flags for SACK/FACK. */
#define TCPCB_SACKED_RETRANS 0x02 /* SKB retransmitted */
#define TCPCB_LOST 0x04 /* SKB is lost */
#define TCPCB_TAGBITS 0x07 /* All tag bits */
+#define TCPCB_REPAIRED 0x10 /* SKB repaired (no skb_mstamp) */
#define TCPCB_EVER_RETRANS 0x80 /* Ever retransmitted frame */
-#define TCPCB_RETRANS (TCPCB_SACKED_RETRANS|TCPCB_EVER_RETRANS)
+#define TCPCB_RETRANS (TCPCB_SACKED_RETRANS|TCPCB_EVER_RETRANS| \
+ TCPCB_REPAIRED)
__u8 ip_dsfield; /* IPv4 tos or IPv6 dsfield */
/* 1 byte hole */
__u32 ack_seq; /* Sequence number ACK'd */
+ union {
+ struct inet_skb_parm h4;
+#if IS_ENABLED(CONFIG_IPV6)
+ struct inet6_skb_parm h6;
+#endif
+ } header; /* For incoming frames */
};
#define TCP_SKB_CB(__skb) ((struct tcp_skb_cb *)&((__skb)->cb[0]))
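
/* A minimal sketch (hypothetical helper): the sequence space an skb covers,
 * read through the TCP_SKB_CB() accessor; end_seq already accounts for SYN
 * and FIN occupying one sequence number each.
 */
static inline u32 tcp_skb_seq_space(const struct sk_buff *skb)
{
	return TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq;
}
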
-/* RFC3168 : 6.1.1 SYN packets must not have ECT/ECN bits set
- *
- * If we receive a SYN packet with these bits set, it means a network is
- * playing bad games with TOS bits. In order to avoid possible false congestion
- * notifications, we disable TCP ECN negociation.
+/* Due to TSO, an SKB can be composed of multiple actual
+ * packets. To keep these tracked properly, we use this.
*/
-static inline void
-TCP_ECN_create_request(struct request_sock *req, const struct sk_buff *skb,
- struct net *net)
+static inline int tcp_skb_pcount(const struct sk_buff *skb)
{
- const struct tcphdr *th = tcp_hdr(skb);
+ return TCP_SKB_CB(skb)->tcp_gso_segs;
+}
- if (net->ipv4.sysctl_tcp_ecn && th->ece && th->cwr &&
- INET_ECN_is_not_ect(TCP_SKB_CB(skb)->ip_dsfield))
- inet_rsk(req)->ecn_ok = 1;
+static inline void tcp_skb_pcount_set(struct sk_buff *skb, int segs)
+{
+ TCP_SKB_CB(skb)->tcp_gso_segs = segs;
}
-/* Due to TSO, an SKB can be composed of multiple actual
- * packets. To keep these tracked properly, we use this.
- */
-static inline int tcp_skb_pcount(const struct sk_buff *skb)
+static inline void tcp_skb_pcount_add(struct sk_buff *skb, int segs)
{
- return skb_shinfo(skb)->gso_segs;
+ TCP_SKB_CB(skb)->tcp_gso_segs += segs;
}
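
/* A minimal sketch (hypothetical helper, names made up): when a TSO skb is
 * split in the write queue, the two halves must keep the segment accounting
 * above consistent, i.e. their pcounts sum to the original value.
 */
static inline void tcp_skb_pcount_move(struct sk_buff *from,
					struct sk_buff *to, int segs)
{
	tcp_skb_pcount_add(from, -segs);
	tcp_skb_pcount_set(to, segs);
}
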
/* This is valid iff tcp_skb_pcount() > 1. */
CA_EVENT_CWND_RESTART, /* congestion window restart */
CA_EVENT_COMPLETE_CWR, /* end of congestion recovery */
CA_EVENT_LOSS, /* loss timeout */
- CA_EVENT_FAST_ACK, /* in sequence ack */
- CA_EVENT_SLOW_ACK, /* other ack */
+ CA_EVENT_ECN_NO_CE, /* ECT set, but not CE marked */
+ CA_EVENT_ECN_IS_CE, /* received CE marked IP packet */
+ CA_EVENT_DELAYED_ACK, /* Delayed ack is sent */
+ CA_EVENT_NON_DELAYED_ACK,
+};
+
+/* Information about inbound ACK, passed to cong_ops->in_ack_event() */
+enum tcp_ca_ack_event_flags {
+ CA_ACK_SLOWPATH = (1 << 0), /* In slow path processing */
+ CA_ACK_WIN_UPDATE = (1 << 1), /* ACK updated window */
+ CA_ACK_ECE = (1 << 2), /* ECE bit is set on ack */
};
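
/* A minimal sketch (hypothetical module code): a congestion-control module's
 * in_ack_event() hook can inspect these flags, e.g. an ECN-aware module
 * reacting to CA_ACK_ECE while window updates are handled separately.
 */
static inline void example_in_ack_event(struct sock *sk, u32 flags)
{
	if (flags & CA_ACK_ECE)
		pr_debug("ECE seen on ACK for sk %p\n", sk);
	if (flags & (CA_ACK_SLOWPATH | CA_ACK_WIN_UPDATE))
		pr_debug("slow path ACK and/or window update\n");
}
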
/*
#define TCP_CA_MAX 128
#define TCP_CA_BUF_MAX (TCP_CA_NAME_MAX*TCP_CA_MAX)
+/* Algorithm can be set on socket without CAP_NET_ADMIN privileges */
#define TCP_CONG_NON_RESTRICTED 0x1
-#define TCP_CONG_RTT_STAMP 0x2
+/* Requires ECN/ECT set on all packets */
+#define TCP_CONG_NEEDS_ECN 0x2
struct tcp_congestion_ops {
struct list_head list;
/* return slow start threshold (required) */
u32 (*ssthresh)(struct sock *sk);
- /* lower bound for congestion window (optional) */
- u32 (*min_cwnd)(const struct sock *sk);
/* do new cwnd calculation (required) */
- void (*cong_avoid)(struct sock *sk, u32 ack, u32 acked, u32 in_flight);
+ void (*cong_avoid)(struct sock *sk, u32 ack, u32 acked);
/* call before changing ca_state (optional) */
void (*set_state)(struct sock *sk, u8 new_state);
/* call when cwnd event occurs (optional) */
void (*cwnd_event)(struct sock *sk, enum tcp_ca_event ev);
+ /* call when ack arrives (optional) */
+ void (*in_ack_event)(struct sock *sk, u32 flags);
/* new value of cwnd after loss (optional) */
u32 (*undo_cwnd)(struct sock *sk);
/* hook for packet ack accounting (optional) */
int tcp_register_congestion_control(struct tcp_congestion_ops *type);
void tcp_unregister_congestion_control(struct tcp_congestion_ops *type);
+void tcp_assign_congestion_control(struct sock *sk);
void tcp_init_congestion_control(struct sock *sk);
void tcp_cleanup_congestion_control(struct sock *sk);
int tcp_set_default_congestion_control(const char *name);
void tcp_get_allowed_congestion_control(char *buf, size_t len);
int tcp_set_allowed_congestion_control(char *allowed);
int tcp_set_congestion_control(struct sock *sk, const char *name);
-int tcp_slow_start(struct tcp_sock *tp, u32 acked);
+void tcp_slow_start(struct tcp_sock *tp, u32 acked);
void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w);
-extern struct tcp_congestion_ops tcp_init_congestion_ops;
u32 tcp_reno_ssthresh(struct sock *sk);
-void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked, u32 in_flight);
-u32 tcp_reno_min_cwnd(const struct sock *sk);
+void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked);
extern struct tcp_congestion_ops tcp_reno;
+static inline bool tcp_ca_needs_ecn(const struct sock *sk)
+{
+ const struct inet_connection_sock *icsk = inet_csk(sk);
+
+ return icsk->icsk_ca_ops->flags & TCP_CONG_NEEDS_ECN;
+}
+
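/* A minimal sketch (hypothetical module, name made up): registering a
 * Reno-like module against the reworked ops. cong_avoid() now takes
 * (sk, ack, acked) with no in_flight argument, and the optional
 * in_ack_event() hook replaces the old FAST_ACK/SLOW_ACK cwnd events.
 */
static struct tcp_congestion_ops example_cong_ops __read_mostly = {
	.ssthresh	= tcp_reno_ssthresh,
	.cong_avoid	= tcp_reno_cong_avoid,
	.in_ack_event	= example_in_ack_event,	/* sketched above */
	.flags		= TCP_CONG_NON_RESTRICTED,
	.owner		= THIS_MODULE,
	.name		= "example",
};
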
static inline void tcp_set_ca_state(struct sock *sk, const u8 ca_state)
{
struct inet_connection_sock *icsk = inet_csk(sk);
/* Use define here intentionally to get WARN_ON location shown at the caller */
#define tcp_verify_left_out(tp) WARN_ON(tcp_left_out(tp) > tp->packets_out)
-void tcp_enter_cwr(struct sock *sk, const int set_ssthresh);
+void tcp_enter_cwr(struct sock *sk);
__u32 tcp_init_cwnd(const struct tcp_sock *tp, const struct dst_entry *dst);
/* The maximum number of MSS of available cwnd for which TSO defers
{
return tp->snd_una + tp->snd_wnd;
}
-bool tcp_is_cwnd_limited(const struct sock *sk, u32 in_flight);
+
+/* We follow the spirit of RFC2861 to validate cwnd but implement a more
+ * flexible approach. The RFC suggests cwnd should not be raised unless
+ * it was fully used previously. And that's exactly what we do in
+ * congestion avoidance mode. But in slow start we allow cwnd to grow
+ * as long as the application has used half the cwnd.
+ * Example :
+ * cwnd is 10 (IW10), but application sends 9 frames.
+ * We allow cwnd to reach 18 when all frames are ACKed.
+ * This check is safe because it's as aggressive as slow start which already
+ * risks 100% overshoot. The advantage is that we discourage applications from
+ * sending filler packets or extra data just to artificially blow up the cwnd
+ * usage, and we allow an application-limited process to probe bandwidth more
+ * aggressively.
+ */
+static inline bool tcp_is_cwnd_limited(const struct sock *sk)
+{
+ const struct tcp_sock *tp = tcp_sk(sk);
+
+ /* If in slow start, ensure cwnd grows to twice what was ACKed. */
+ if (tp->snd_cwnd <= tp->snd_ssthresh)
+ return tp->snd_cwnd < 2 * tp->max_packets_out;
+
+ return tp->is_cwnd_limited;
+}
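
/* A minimal sketch (hypothetical, mirrors what Reno-style modules do): a
 * cong_avoid() implementation gates all growth on the check above, so a
 * sender that is not cwnd-limited never inflates its window.
 */
static inline void example_cong_avoid(struct sock *sk, u32 ack, u32 acked)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (!tcp_is_cwnd_limited(sk))
		return;				/* application limited: leave cwnd alone */

	if (tp->snd_cwnd <= tp->snd_ssthresh)
		tcp_slow_start(tp, acked);		/* exponential growth phase */
	else
		tcp_cong_avoid_ai(tp, tp->snd_cwnd);	/* additive increase */
}
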
static inline void tcp_check_probe_timer(struct sock *sk)
{
static inline void tcp_openreq_init(struct request_sock *req,
struct tcp_options_received *rx_opt,
- struct sk_buff *skb)
+ struct sk_buff *skb, struct sock *sk)
{
struct inet_request_sock *ireq = inet_rsk(req);
req->cookie_ts = 0;
tcp_rsk(req)->rcv_isn = TCP_SKB_CB(skb)->seq;
tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
- tcp_rsk(req)->snt_synack = 0;
+ tcp_rsk(req)->snt_synack = tcp_time_stamp;
req->mss = rx_opt->mss_clamp;
req->ts_recent = rx_opt->saw_tstamp ? rx_opt->rcv_tsval : 0;
ireq->tstamp_ok = rx_opt->tstamp_ok;
ireq->ecn_ok = 0;
ireq->ir_rmt_port = tcp_hdr(skb)->source;
ireq->ir_num = ntohs(tcp_hdr(skb)->dest);
+ ireq->ir_mark = inet_request_mark(sk, skb);
}
+extern void tcp_openreq_init_rwin(struct request_sock *req,
+ struct sock *sk, struct dst_entry *dst);
+
void tcp_enter_memory_pressure(struct sock *sk);
static inline int keepalive_intvl_when(const struct tcp_sock *tp)
extern struct tcp_fastopen_context __rcu *tcp_fastopen_ctx;
int tcp_fastopen_reset_cipher(void *key, unsigned int len);
-void tcp_fastopen_cookie_gen(__be32 src, __be32 dst,
- struct tcp_fastopen_cookie *foc);
+bool tcp_try_fastopen(struct sock *sk, struct sk_buff *skb,
+ struct request_sock *req,
+ struct tcp_fastopen_cookie *foc,
+ struct dst_entry *dst);
void tcp_fastopen_init_key_once(bool publish);
#define TCP_FASTOPEN_KEY_LENGTH 16
void tcp4_proc_exit(void);
#endif
+int tcp_rtx_synack(struct sock *sk, struct request_sock *req);
+int tcp_conn_request(struct request_sock_ops *rsk_ops,
+ const struct tcp_request_sock_ops *af_ops,
+ struct sock *sk, struct sk_buff *skb);
+
/* TCP af-specific functions */
struct tcp_sock_af_ops {
#ifdef CONFIG_TCP_MD5SIG
};
struct tcp_request_sock_ops {
+ u16 mss_clamp;
#ifdef CONFIG_TCP_MD5SIG
struct tcp_md5sig_key *(*md5_lookup) (struct sock *sk,
struct request_sock *req);
const struct request_sock *req,
const struct sk_buff *skb);
#endif
+ void (*init_req)(struct request_sock *req, struct sock *sk,
+ struct sk_buff *skb);
+#ifdef CONFIG_SYN_COOKIES
+ __u32 (*cookie_init_seq)(struct sock *sk, const struct sk_buff *skb,
+ __u16 *mss);
+#endif
+ struct dst_entry *(*route_req)(struct sock *sk, struct flowi *fl,
+ const struct request_sock *req,
+ bool *strict);
+ __u32 (*init_seq)(const struct sk_buff *skb);
+ int (*send_synack)(struct sock *sk, struct dst_entry *dst,
+ struct flowi *fl, struct request_sock *req,
+ u16 queue_mapping, struct tcp_fastopen_cookie *foc);
+ void (*queue_hash_add)(struct sock *sk, struct request_sock *req,
+ const unsigned long timeout);
};
+#ifdef CONFIG_SYN_COOKIES
+static inline __u32 cookie_init_sequence(const struct tcp_request_sock_ops *ops,
+ struct sock *sk, struct sk_buff *skb,
+ __u16 *mss)
+{
+ return ops->cookie_init_seq(sk, skb, mss);
+}
+#else
+static inline __u32 cookie_init_sequence(const struct tcp_request_sock_ops *ops,
+ struct sock *sk, struct sk_buff *skb,
+ __u16 *mss)
+{
+ return 0;
+}
+#endif
+
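/* A minimal sketch (hypothetical and heavily simplified): the rough order in
 * which tcp_conn_request() is expected to drive the af_ops hooks declared
 * above. The real implementation in net/ipv4/tcp_input.c also handles
 * syncookies, Fast Open and the error/drop paths omitted here.
 */
static inline int example_conn_request_flow(const struct tcp_request_sock_ops *af_ops,
					    struct sock *sk, struct sk_buff *skb,
					    struct request_sock *req)
{
	struct flowi fl;
	struct dst_entry *dst;
	bool strict;

	af_ops->init_req(req, sk, skb);			/* af-specific request init */
	dst = af_ops->route_req(sk, &fl, req, &strict);	/* route the SYN-ACK */
	if (!dst)
		return -1;
	tcp_rsk(req)->snt_isn = af_ops->init_seq(skb);	/* pick the ISN */
	return af_ops->send_synack(sk, dst, &fl, req, 0, NULL);
}
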
int tcpv4_offload_init(void);
void tcp_v4_init(void);