net/ipv4/tcp_metrics.c
1 #include <linux/rcupdate.h>
2 #include <linux/spinlock.h>
3 #include <linux/jiffies.h>
4 #include <linux/module.h>
5 #include <linux/cache.h>
6 #include <linux/slab.h>
7 #include <linux/init.h>
8 #include <linux/tcp.h>
9 #include <linux/hash.h>
10 #include <linux/tcp_metrics.h>
11 #include <linux/vmalloc.h>
12
13 #include <net/inet_connection_sock.h>
14 #include <net/net_namespace.h>
15 #include <net/request_sock.h>
16 #include <net/inetpeer.h>
17 #include <net/sock.h>
18 #include <net/ipv6.h>
19 #include <net/dst.h>
20 #include <net/tcp.h>
21 #include <net/genetlink.h>
22
23 int sysctl_tcp_nometrics_save __read_mostly;
24
25 static struct tcp_metrics_block *__tcp_get_metrics(const struct inetpeer_addr *addr,
26                                                    struct net *net, unsigned int hash);
27
28 struct tcp_fastopen_metrics {
29         u16     mss;
30         u16     syn_loss:10;            /* Recurring Fast Open SYN losses */
31         unsigned long   last_syn_loss;  /* Last Fast Open SYN loss */
32         struct  tcp_fastopen_cookie     cookie;
33 };
34
35 struct tcp_metrics_block {
36         struct tcp_metrics_block __rcu  *tcpm_next;
37         struct inetpeer_addr            tcpm_addr;
38         unsigned long                   tcpm_stamp;
39         u32                             tcpm_ts;
40         u32                             tcpm_ts_stamp;
41         u32                             tcpm_lock;
42         u32                             tcpm_vals[TCP_METRIC_MAX + 1];
43         struct tcp_fastopen_metrics     tcpm_fastopen;
44
45         struct rcu_head                 rcu_head;
46 };
47
48 static bool tcp_metric_locked(struct tcp_metrics_block *tm,
49                               enum tcp_metric_index idx)
50 {
51         return tm->tcpm_lock & (1 << idx);
52 }
53
54 static u32 tcp_metric_get(struct tcp_metrics_block *tm,
55                           enum tcp_metric_index idx)
56 {
57         return tm->tcpm_vals[idx];
58 }
59
60 static u32 tcp_metric_get_jiffies(struct tcp_metrics_block *tm,
61                                   enum tcp_metric_index idx)
62 {
63         return msecs_to_jiffies(tm->tcpm_vals[idx]);
64 }
65
66 static void tcp_metric_set(struct tcp_metrics_block *tm,
67                            enum tcp_metric_index idx,
68                            u32 val)
69 {
70         tm->tcpm_vals[idx] = val;
71 }
72
73 static void tcp_metric_set_msecs(struct tcp_metrics_block *tm,
74                                  enum tcp_metric_index idx,
75                                  u32 val)
76 {
77         tm->tcpm_vals[idx] = jiffies_to_msecs(val);
78 }
79
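/* The RTT and RTTVAR slots in tcpm_vals[] hold milliseconds, so cached
 * values are independent of HZ: tcp_metric_set_msecs() converts a
 * jiffies value to msecs on store and tcp_metric_get_jiffies() converts
 * back on load.  For example, with HZ=250 a stored value of 200 (ms)
 * reads back as 50 jiffies.
 */
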
80 static bool addr_same(const struct inetpeer_addr *a,
81                       const struct inetpeer_addr *b)
82 {
83         const struct in6_addr *a6, *b6;
84
85         if (a->family != b->family)
86                 return false;
87         if (a->family == AF_INET)
88                 return a->addr.a4 == b->addr.a4;
89
90         a6 = (const struct in6_addr *) &a->addr.a6[0];
91         b6 = (const struct in6_addr *) &b->addr.a6[0];
92
93         return ipv6_addr_equal(a6, b6);
94 }
95
96 struct tcpm_hash_bucket {
97         struct tcp_metrics_block __rcu  *chain;
98 };
99
100 static DEFINE_SPINLOCK(tcp_metrics_lock);
101
102 static void tcpm_suck_dst(struct tcp_metrics_block *tm, struct dst_entry *dst,
103                           bool fastopen_clear)
104 {
105         u32 val;
106
107         tm->tcpm_stamp = jiffies;
108
109         val = 0;
110         if (dst_metric_locked(dst, RTAX_RTT))
111                 val |= 1 << TCP_METRIC_RTT;
112         if (dst_metric_locked(dst, RTAX_RTTVAR))
113                 val |= 1 << TCP_METRIC_RTTVAR;
114         if (dst_metric_locked(dst, RTAX_SSTHRESH))
115                 val |= 1 << TCP_METRIC_SSTHRESH;
116         if (dst_metric_locked(dst, RTAX_CWND))
117                 val |= 1 << TCP_METRIC_CWND;
118         if (dst_metric_locked(dst, RTAX_REORDERING))
119                 val |= 1 << TCP_METRIC_REORDERING;
120         tm->tcpm_lock = val;
121
122         tm->tcpm_vals[TCP_METRIC_RTT] = dst_metric_raw(dst, RTAX_RTT);
123         tm->tcpm_vals[TCP_METRIC_RTTVAR] = dst_metric_raw(dst, RTAX_RTTVAR);
124         tm->tcpm_vals[TCP_METRIC_SSTHRESH] = dst_metric_raw(dst, RTAX_SSTHRESH);
125         tm->tcpm_vals[TCP_METRIC_CWND] = dst_metric_raw(dst, RTAX_CWND);
126         tm->tcpm_vals[TCP_METRIC_REORDERING] = dst_metric_raw(dst, RTAX_REORDERING);
127         tm->tcpm_ts = 0;
128         tm->tcpm_ts_stamp = 0;
129         if (fastopen_clear) {
130                 tm->tcpm_fastopen.mss = 0;
131                 tm->tcpm_fastopen.syn_loss = 0;
132                 tm->tcpm_fastopen.cookie.len = 0;
133         }
134 }
135
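/* Locked metrics come from routes configured with the "lock" modifier,
 * e.g. (assuming iproute2 syntax) "ip route add 10.0.0.0/24 via
 * 192.0.2.1 rtt lock 300": dst_metric_locked(dst, RTAX_RTT) is then
 * true, the TCP_METRIC_RTT bit is set in tcpm_lock above, and
 * tcp_update_metrics() below leaves that metric alone.
 */
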
136 #define TCP_METRICS_TIMEOUT             (60 * 60 * HZ)
137
138 static void tcpm_check_stamp(struct tcp_metrics_block *tm, struct dst_entry *dst)
139 {
140         if (tm && unlikely(time_after(jiffies, tm->tcpm_stamp + TCP_METRICS_TIMEOUT)))
141                 tcpm_suck_dst(tm, dst, false);
142 }
143
144 #define TCP_METRICS_RECLAIM_DEPTH       5
145 #define TCP_METRICS_RECLAIM_PTR         (struct tcp_metrics_block *) 0x1UL
146
147 static struct tcp_metrics_block *tcpm_new(struct dst_entry *dst,
148                                           struct inetpeer_addr *addr,
149                                           unsigned int hash)
150 {
151         struct tcp_metrics_block *tm;
152         struct net *net;
153         bool reclaim = false;
154
155         spin_lock_bh(&tcp_metrics_lock);
156         net = dev_net(dst->dev);
157
158         /* While waiting for the spin-lock, the cache might have been
159          * populated with this entry, so we have to check again.
160          */
161         tm = __tcp_get_metrics(addr, net, hash);
162         if (tm == TCP_METRICS_RECLAIM_PTR) {
163                 reclaim = true;
164                 tm = NULL;
165         }
166         if (tm) {
167                 tcpm_check_stamp(tm, dst);
168                 goto out_unlock;
169         }
170
171         if (unlikely(reclaim)) {
172                 struct tcp_metrics_block *oldest;
173
174                 oldest = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain);
175                 for (tm = rcu_dereference(oldest->tcpm_next); tm;
176                      tm = rcu_dereference(tm->tcpm_next)) {
177                         if (time_before(tm->tcpm_stamp, oldest->tcpm_stamp))
178                                 oldest = tm;
179                 }
180                 tm = oldest;
181         } else {
182                 tm = kmalloc(sizeof(*tm), GFP_ATOMIC);
183                 if (!tm)
184                         goto out_unlock;
185         }
186         tm->tcpm_addr = *addr;
187
188         tcpm_suck_dst(tm, dst, true);
189
190         if (likely(!reclaim)) {
191                 tm->tcpm_next = net->ipv4.tcp_metrics_hash[hash].chain;
192                 rcu_assign_pointer(net->ipv4.tcp_metrics_hash[hash].chain, tm);
193         }
194
195 out_unlock:
196         spin_unlock_bh(&tcp_metrics_lock);
197         return tm;
198 }
199
200 static struct tcp_metrics_block *tcp_get_encode(struct tcp_metrics_block *tm, int depth)
201 {
202         if (tm)
203                 return tm;
204         if (depth > TCP_METRICS_RECLAIM_DEPTH)
205                 return TCP_METRICS_RECLAIM_PTR;
206         return NULL;
207 }
208
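/* Illustrative sketch (hypothetical helper, kept under #if 0 like the
 * reference-only material further down): how callers are expected to
 * consume the reclaim sentinel.  A lookup that walks more than
 * TCP_METRICS_RECLAIM_DEPTH entries without a match returns
 * TCP_METRICS_RECLAIM_PTR, which tcpm_new() treats as "recycle the
 * oldest block in this chain instead of allocating a new one".
 */
#if 0
/* must be called under rcu_read_lock() */
static struct tcp_metrics_block *example_lookup(const struct inetpeer_addr *addr,
                                                struct net *net,
                                                unsigned int hash)
{
        struct tcp_metrics_block *tm = __tcp_get_metrics(addr, net, hash);

        if (tm == TCP_METRICS_RECLAIM_PTR)
                tm = NULL;      /* caller may now ask tcpm_new() to reclaim */
        return tm;
}
#endif
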
209 static struct tcp_metrics_block *__tcp_get_metrics(const struct inetpeer_addr *addr,
210                                                    struct net *net, unsigned int hash)
211 {
212         struct tcp_metrics_block *tm;
213         int depth = 0;
214
215         for (tm = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); tm;
216              tm = rcu_dereference(tm->tcpm_next)) {
217                 if (addr_same(&tm->tcpm_addr, addr))
218                         break;
219                 depth++;
220         }
221         return tcp_get_encode(tm, depth);
222 }
223
224 static struct tcp_metrics_block *__tcp_get_metrics_req(struct request_sock *req,
225                                                        struct dst_entry *dst)
226 {
227         struct tcp_metrics_block *tm;
228         struct inetpeer_addr addr;
229         unsigned int hash;
230         struct net *net;
231
232         addr.family = req->rsk_ops->family;
233         switch (addr.family) {
234         case AF_INET:
235                 addr.addr.a4 = inet_rsk(req)->ir_rmt_addr;
236                 hash = (__force unsigned int) addr.addr.a4;
237                 break;
238 #if IS_ENABLED(CONFIG_IPV6)
239         case AF_INET6:
240                 *(struct in6_addr *)addr.addr.a6 = inet_rsk(req)->ir_v6_rmt_addr;
241                 hash = ipv6_addr_hash(&inet_rsk(req)->ir_v6_rmt_addr);
242                 break;
243 #endif
244         default:
245                 return NULL;
246         }
247
248         net = dev_net(dst->dev);
249         hash = hash_32(hash, net->ipv4.tcp_metrics_hash_log);
250
251         for (tm = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); tm;
252              tm = rcu_dereference(tm->tcpm_next)) {
253                 if (addr_same(&tm->tcpm_addr, &addr))
254                         break;
255         }
256         tcpm_check_stamp(tm, dst);
257         return tm;
258 }
259
260 static struct tcp_metrics_block *__tcp_get_metrics_tw(struct inet_timewait_sock *tw)
261 {
262         struct tcp_metrics_block *tm;
263         struct inetpeer_addr addr;
264         unsigned int hash;
265         struct net *net;
266
267         addr.family = tw->tw_family;
268         switch (addr.family) {
269         case AF_INET:
270                 addr.addr.a4 = tw->tw_daddr;
271                 hash = (__force unsigned int) addr.addr.a4;
272                 break;
273 #if IS_ENABLED(CONFIG_IPV6)
274         case AF_INET6:
275                 *(struct in6_addr *)addr.addr.a6 = tw->tw_v6_daddr;
276                 hash = ipv6_addr_hash(&tw->tw_v6_daddr);
277                 break;
278 #endif
279         default:
280                 return NULL;
281         }
282
283         net = twsk_net(tw);
284         hash = hash_32(hash, net->ipv4.tcp_metrics_hash_log);
285
286         for (tm = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); tm;
287              tm = rcu_dereference(tm->tcpm_next)) {
288                 if (addr_same(&tm->tcpm_addr, &addr))
289                         break;
290         }
291         return tm;
292 }
293
294 static struct tcp_metrics_block *tcp_get_metrics(struct sock *sk,
295                                                  struct dst_entry *dst,
296                                                  bool create)
297 {
298         struct tcp_metrics_block *tm;
299         struct inetpeer_addr addr;
300         unsigned int hash;
301         struct net *net;
302
303         addr.family = sk->sk_family;
304         switch (addr.family) {
305         case AF_INET:
306                 addr.addr.a4 = inet_sk(sk)->inet_daddr;
307                 hash = (__force unsigned int) addr.addr.a4;
308                 break;
309 #if IS_ENABLED(CONFIG_IPV6)
310         case AF_INET6:
311                 *(struct in6_addr *)addr.addr.a6 = sk->sk_v6_daddr;
312                 hash = ipv6_addr_hash(&sk->sk_v6_daddr);
313                 break;
314 #endif
315         default:
316                 return NULL;
317         }
318
319         net = dev_net(dst->dev);
320         hash = hash_32(hash, net->ipv4.tcp_metrics_hash_log);
321
322         tm = __tcp_get_metrics(&addr, net, hash);
323         if (tm == TCP_METRICS_RECLAIM_PTR)
324                 tm = NULL;
325         if (!tm && create)
326                 tm = tcpm_new(dst, &addr, hash);
327         else
328                 tcpm_check_stamp(tm, dst);
329
330         return tm;
331 }
332
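/* Minimal usage sketch (hypothetical helper, under #if 0): every lookup
 * must run inside an RCU read-side critical section, and the returned
 * block must not be dereferenced after rcu_read_unlock().
 */
#if 0
static u32 example_cached_rtt_msecs(struct sock *sk, struct dst_entry *dst)
{
        struct tcp_metrics_block *tm;
        u32 rtt = 0;

        rcu_read_lock();
        tm = tcp_get_metrics(sk, dst, false);   /* do not create */
        if (tm)
                rtt = tcp_metric_get(tm, TCP_METRIC_RTT);
        rcu_read_unlock();

        return rtt;
}
#endif
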
333 /* Save metrics learned by this TCP session.  This function is called
334  * only when TCP finishes successfully, i.e. when it enters TIME-WAIT
335  * or goes from LAST-ACK to CLOSE.
336  */
337 void tcp_update_metrics(struct sock *sk)
338 {
339         const struct inet_connection_sock *icsk = inet_csk(sk);
340         struct dst_entry *dst = __sk_dst_get(sk);
341         struct tcp_sock *tp = tcp_sk(sk);
342         struct tcp_metrics_block *tm;
343         unsigned long rtt;
344         u32 val;
345         int m;
346
347         if (sysctl_tcp_nometrics_save || !dst)
348                 return;
349
350         if (dst->flags & DST_HOST)
351                 dst_confirm(dst);
352
353         rcu_read_lock();
354         if (icsk->icsk_backoff || !tp->srtt) {
355                 /* This session failed to estimate rtt. Why?
356                  * Probably, no packets returned in time.  Reset our
357                  * results.
358                  */
359                 tm = tcp_get_metrics(sk, dst, false);
360                 if (tm && !tcp_metric_locked(tm, TCP_METRIC_RTT))
361                         tcp_metric_set(tm, TCP_METRIC_RTT, 0);
362                 goto out_unlock;
363         } else
364                 tm = tcp_get_metrics(sk, dst, true);
365
366         if (!tm)
367                 goto out_unlock;
368
369         rtt = tcp_metric_get_jiffies(tm, TCP_METRIC_RTT);
370         m = rtt - tp->srtt;
371
372         /* If the newly calculated rtt is larger than the stored one, store
373          * the new one; otherwise use an EWMA. Overestimating rtt is always
374          * better than underestimating it (a worked example follows below).
375          */
376         if (!tcp_metric_locked(tm, TCP_METRIC_RTT)) {
377                 if (m <= 0)
378                         rtt = tp->srtt;
379                 else
380                         rtt -= (m >> 3);
381                 tcp_metric_set_msecs(tm, TCP_METRIC_RTT, rtt);
382         }
383
384         if (!tcp_metric_locked(tm, TCP_METRIC_RTTVAR)) {
385                 unsigned long var;
386
387                 if (m < 0)
388                         m = -m;
389
390                 /* Scale deviation to rttvar fixed point */
391                 m >>= 1;
392                 if (m < tp->mdev)
393                         m = tp->mdev;
394
395                 var = tcp_metric_get_jiffies(tm, TCP_METRIC_RTTVAR);
396                 if (m >= var)
397                         var = m;
398                 else
399                         var -= (var - m) >> 2;
400
401                 tcp_metric_set_msecs(tm, TCP_METRIC_RTTVAR, var);
402         }
403
404         if (tcp_in_initial_slowstart(tp)) {
405                 /* Slow start has still not finished. */
406                 if (!tcp_metric_locked(tm, TCP_METRIC_SSTHRESH)) {
407                         val = tcp_metric_get(tm, TCP_METRIC_SSTHRESH);
408                         if (val && (tp->snd_cwnd >> 1) > val)
409                                 tcp_metric_set(tm, TCP_METRIC_SSTHRESH,
410                                                tp->snd_cwnd >> 1);
411                 }
412                 if (!tcp_metric_locked(tm, TCP_METRIC_CWND)) {
413                         val = tcp_metric_get(tm, TCP_METRIC_CWND);
414                         if (tp->snd_cwnd > val)
415                                 tcp_metric_set(tm, TCP_METRIC_CWND,
416                                                tp->snd_cwnd);
417                 }
418         } else if (tp->snd_cwnd > tp->snd_ssthresh &&
419                    icsk->icsk_ca_state == TCP_CA_Open) {
420                 /* Cong. avoidance phase, cwnd is reliable. */
421                 if (!tcp_metric_locked(tm, TCP_METRIC_SSTHRESH))
422                         tcp_metric_set(tm, TCP_METRIC_SSTHRESH,
423                                        max(tp->snd_cwnd >> 1, tp->snd_ssthresh));
424                 if (!tcp_metric_locked(tm, TCP_METRIC_CWND)) {
425                         val = tcp_metric_get(tm, TCP_METRIC_CWND);
426                         tcp_metric_set(tm, TCP_METRIC_CWND, (val + tp->snd_cwnd) >> 1);
427                 }
428         } else {
429                 /* Else slow start did not finish: cwnd is meaningless
430                  * and ssthresh may also be invalid.
431                  */
432                 if (!tcp_metric_locked(tm, TCP_METRIC_CWND)) {
433                         val = tcp_metric_get(tm, TCP_METRIC_CWND);
434                         tcp_metric_set(tm, TCP_METRIC_CWND,
435                                        (val + tp->snd_ssthresh) >> 1);
436                 }
437                 if (!tcp_metric_locked(tm, TCP_METRIC_SSTHRESH)) {
438                         val = tcp_metric_get(tm, TCP_METRIC_SSTHRESH);
439                         if (val && tp->snd_ssthresh > val)
440                                 tcp_metric_set(tm, TCP_METRIC_SSTHRESH,
441                                                tp->snd_ssthresh);
442                 }
443                 if (!tcp_metric_locked(tm, TCP_METRIC_REORDERING)) {
444                         val = tcp_metric_get(tm, TCP_METRIC_REORDERING);
445                         if (val < tp->reordering &&
446                             tp->reordering != sysctl_tcp_reordering)
447                                 tcp_metric_set(tm, TCP_METRIC_REORDERING,
448                                                tp->reordering);
449                 }
450         }
451         tm->tcpm_stamp = jiffies;
452 out_unlock:
453         rcu_read_unlock();
454 }
455
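/* Worked example of the RTT smoothing above (values in srtt units, i.e.
 * scaled by 8): with a cached RTT of 160 and a new tp->srtt of 120,
 * m = 40 > 0, so the cache moves 1/8 of the way down: 160 - (40 >> 3) = 155.
 * If the new srtt had been 200, m <= 0 and the larger sample 200 would be
 * stored directly, since overestimating RTT is the safer error.  RTTVAR
 * behaves similarly: larger deviations are adopted at once, smaller ones
 * pull the cache down by only 1/4 of the difference.
 */
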
456 /* Initialize metrics on socket. */
457
458 void tcp_init_metrics(struct sock *sk)
459 {
460         struct dst_entry *dst = __sk_dst_get(sk);
461         struct tcp_sock *tp = tcp_sk(sk);
462         struct tcp_metrics_block *tm;
463         u32 val, crtt = 0; /* cached RTT scaled by 8 */
464
465         if (dst == NULL)
466                 goto reset;
467
468         dst_confirm(dst);
469
470         rcu_read_lock();
471         tm = tcp_get_metrics(sk, dst, true);
472         if (!tm) {
473                 rcu_read_unlock();
474                 goto reset;
475         }
476
477         if (tcp_metric_locked(tm, TCP_METRIC_CWND))
478                 tp->snd_cwnd_clamp = tcp_metric_get(tm, TCP_METRIC_CWND);
479
480         val = tcp_metric_get(tm, TCP_METRIC_SSTHRESH);
481         if (val) {
482                 tp->snd_ssthresh = val;
483                 if (tp->snd_ssthresh > tp->snd_cwnd_clamp)
484                         tp->snd_ssthresh = tp->snd_cwnd_clamp;
485         } else {
486                 /* ssthresh may have been reduced unnecessarily during the
487                  * 3WHS. Restore it to its initial default.
488                  */
489                 tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
490         }
491         val = tcp_metric_get(tm, TCP_METRIC_REORDERING);
492         if (val && tp->reordering != val) {
493                 tcp_disable_fack(tp);
494                 tcp_disable_early_retrans(tp);
495                 tp->reordering = val;
496         }
497
498         crtt = tcp_metric_get_jiffies(tm, TCP_METRIC_RTT);
499         rcu_read_unlock();
500 reset:
501         /* The initial RTT measurement from the SYN/SYN-ACK is not ideal
502          * to seed the RTO for later data packets because SYN packets are
503          * small. Use the per-dst cached values to seed the RTO but keep
504          * the RTT estimator variables intact (e.g., srtt, mdev, rttvar).
505          * Later the RTO will be updated immediately upon obtaining the first
506          * data RTT sample (tcp_rtt_estimator()). Hence the cached RTT only
507          * influences the first RTO but not later RTT estimation.
508          *
509          * But if RTT is not available from the SYN (due to retransmits or
510          * syn cookies) or the cache, force a conservative 3secs timeout.
511          *
512          * A bit of theory. RTT is the time from when a "normal"-sized
513          * packet is sent until it is ACKed. In normal circumstances,
514          * sending small packets forces the peer to delay ACKs, so the
515          * calculation is still correct. The algorithm is adaptive and,
516          * provided we follow the specs, it NEVER underestimates RTT.
517          * BUT! If the peer plays clever tricks, sending "quick acks" just
518          * long enough to drive the measured RTT down and then abruptly
519          * switching back to delayed ACKs, expect trouble.
520          */
521         if (crtt > tp->srtt) {
522                 /* Set RTO like tcp_rtt_estimator(), but from cached RTT. */
523                 crtt >>= 3;
524                 inet_csk(sk)->icsk_rto = crtt + max(2 * crtt, tcp_rto_min(sk));
525         } else if (tp->srtt == 0) {
526                 /* RFC 6298, section 5.7: we have failed to get a valid RTT
527                  * sample from the 3WHS. This is most likely due to
528                  * retransmission, including a spurious one. Reset the RTO
529                  * back to 3 secs from the more aggressive 1 sec to avoid
530                  * further spurious retransmissions.
531                  */
532                 tp->mdev = tp->mdev_max = tp->rttvar = TCP_TIMEOUT_FALLBACK;
533                 inet_csk(sk)->icsk_rto = TCP_TIMEOUT_FALLBACK;
534         }
535         /* Cut cwnd down to 1 per RFC 5681 if the SYN or SYN-ACK has been
536          * retransmitted. In light of RFC 6298's more aggressive 1 sec
537          * initRTO, we only reset cwnd when more than one SYN/SYN-ACK
538          * retransmission has occurred.
539          */
540         if (tp->total_retrans > 1)
541                 tp->snd_cwnd = 1;
542         else
543                 tp->snd_cwnd = tcp_init_cwnd(tp, dst);
544         tp->snd_cwnd_stamp = tcp_time_stamp;
545 }
546
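/* Worked example of the RTO seeding above, assuming HZ=1000 and the
 * default tcp_rto_min() of 200 ms: a cached RTT metric of 800 ms (srtt
 * scale, i.e. a smoothed RTT of 100 ms) gives crtt >> 3 = 100 jiffies,
 * so icsk_rto = 100 + max(200, 200) = 300 ms.  Without a cached RTT and
 * without a SYN sample, the RTO falls back to TCP_TIMEOUT_FALLBACK
 * (3 seconds) instead of the more aggressive 1 second initial RTO.
 */
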
547 bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst, bool paws_check)
548 {
549         struct tcp_metrics_block *tm;
550         bool ret;
551
552         if (!dst)
553                 return false;
554
555         rcu_read_lock();
556         tm = __tcp_get_metrics_req(req, dst);
557         if (paws_check) {
558                 if (tm &&
559                     (u32)get_seconds() - tm->tcpm_ts_stamp < TCP_PAWS_MSL &&
560                     (s32)(tm->tcpm_ts - req->ts_recent) > TCP_PAWS_WINDOW)
561                         ret = false;
562                 else
563                         ret = true;
564         } else {
565                 if (tm && tcp_metric_get(tm, TCP_METRIC_RTT) && tm->tcpm_ts_stamp)
566                         ret = true;
567                 else
568                         ret = false;
569         }
570         rcu_read_unlock();
571
572         return ret;
573 }
574 EXPORT_SYMBOL_GPL(tcp_peer_is_proven);
575
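/* In the paws_check case above, a peer is rejected only when we hold a
 * cached timestamp that is younger than TCP_PAWS_MSL (60 s) and the
 * timestamp offered in the new request is more than TCP_PAWS_WINDOW
 * ticks behind it, i.e. the request looks like an old duplicate.  In the
 * non-paws case the peer counts as "proven" only if we have both an RTT
 * sample and a timestamp for it.
 */
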
576 void tcp_fetch_timewait_stamp(struct sock *sk, struct dst_entry *dst)
577 {
578         struct tcp_metrics_block *tm;
579
580         rcu_read_lock();
581         tm = tcp_get_metrics(sk, dst, true);
582         if (tm) {
583                 struct tcp_sock *tp = tcp_sk(sk);
584
585                 if ((u32)get_seconds() - tm->tcpm_ts_stamp <= TCP_PAWS_MSL) {
586                         tp->rx_opt.ts_recent_stamp = tm->tcpm_ts_stamp;
587                         tp->rx_opt.ts_recent = tm->tcpm_ts;
588                 }
589         }
590         rcu_read_unlock();
591 }
592 EXPORT_SYMBOL_GPL(tcp_fetch_timewait_stamp);
593
594 /* VJ's idea. Save the last timestamp seen from this destination and
595  * hold it at least for the normal timewait interval, to use for
596  * duplicate segment detection in subsequent connections before they
597  * enter the synchronized state.
598  */
599 bool tcp_remember_stamp(struct sock *sk)
600 {
601         struct dst_entry *dst = __sk_dst_get(sk);
602         bool ret = false;
603
604         if (dst) {
605                 struct tcp_metrics_block *tm;
606
607                 rcu_read_lock();
608                 tm = tcp_get_metrics(sk, dst, true);
609                 if (tm) {
610                         struct tcp_sock *tp = tcp_sk(sk);
611
612                         if ((s32)(tm->tcpm_ts - tp->rx_opt.ts_recent) <= 0 ||
613                             ((u32)get_seconds() - tm->tcpm_ts_stamp > TCP_PAWS_MSL &&
614                              tm->tcpm_ts_stamp <= (u32)tp->rx_opt.ts_recent_stamp)) {
615                                 tm->tcpm_ts_stamp = (u32)tp->rx_opt.ts_recent_stamp;
616                                 tm->tcpm_ts = tp->rx_opt.ts_recent;
617                         }
618                         ret = true;
619                 }
620                 rcu_read_unlock();
621         }
622         return ret;
623 }
624
625 bool tcp_tw_remember_stamp(struct inet_timewait_sock *tw)
626 {
627         struct tcp_metrics_block *tm;
628         bool ret = false;
629
630         rcu_read_lock();
631         tm = __tcp_get_metrics_tw(tw);
632         if (tm) {
633                 const struct tcp_timewait_sock *tcptw;
634                 struct sock *sk = (struct sock *) tw;
635
636                 tcptw = tcp_twsk(sk);
637                 if ((s32)(tm->tcpm_ts - tcptw->tw_ts_recent) <= 0 ||
638                     ((u32)get_seconds() - tm->tcpm_ts_stamp > TCP_PAWS_MSL &&
639                      tm->tcpm_ts_stamp <= (u32)tcptw->tw_ts_recent_stamp)) {
640                         tm->tcpm_ts_stamp = (u32)tcptw->tw_ts_recent_stamp;
641                         tm->tcpm_ts        = tcptw->tw_ts_recent;
642                 }
643                 ret = true;
644         }
645         rcu_read_unlock();
646
647         return ret;
648 }
649
650 static DEFINE_SEQLOCK(fastopen_seqlock);
651
652 void tcp_fastopen_cache_get(struct sock *sk, u16 *mss,
653                             struct tcp_fastopen_cookie *cookie,
654                             int *syn_loss, unsigned long *last_syn_loss)
655 {
656         struct tcp_metrics_block *tm;
657
658         rcu_read_lock();
659         tm = tcp_get_metrics(sk, __sk_dst_get(sk), false);
660         if (tm) {
661                 struct tcp_fastopen_metrics *tfom = &tm->tcpm_fastopen;
662                 unsigned int seq;
663
664                 do {
665                         seq = read_seqbegin(&fastopen_seqlock);
666                         if (tfom->mss)
667                                 *mss = tfom->mss;
668                         *cookie = tfom->cookie;
669                         *syn_loss = tfom->syn_loss;
670                         *last_syn_loss = *syn_loss ? tfom->last_syn_loss : 0;
671                 } while (read_seqretry(&fastopen_seqlock, seq));
672         }
673         rcu_read_unlock();
674 }
675
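/* Usage sketch (hypothetical helper, loosely modelled on the Fast Open
 * connect path; kept under #if 0): read the cached cookie and MSS, and
 * back off to a regular SYN if recent Fast Open SYNs were lost.
 */
#if 0
static void example_fastopen_connect(struct sock *sk,
                                     struct tcp_fastopen_cookie *cookie)
{
        unsigned long last_syn_loss = 0;
        int syn_loss = 0;
        u16 mss = 0;

        tcp_fastopen_cache_get(sk, &mss, cookie, &syn_loss, &last_syn_loss);

        /* Recent repeated SYN losses: disable the cookie for a while. */
        if (syn_loss > 1 &&
            time_before(jiffies, last_syn_loss + (60 * HZ << syn_loss)))
                cookie->len = -1;
}
#endif
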
676 void tcp_fastopen_cache_set(struct sock *sk, u16 mss,
677                             struct tcp_fastopen_cookie *cookie, bool syn_lost)
678 {
679         struct dst_entry *dst = __sk_dst_get(sk);
680         struct tcp_metrics_block *tm;
681
682         if (!dst)
683                 return;
684         rcu_read_lock();
685         tm = tcp_get_metrics(sk, dst, true);
686         if (tm) {
687                 struct tcp_fastopen_metrics *tfom = &tm->tcpm_fastopen;
688
689                 write_seqlock_bh(&fastopen_seqlock);
690                 if (mss)
691                         tfom->mss = mss;
692                 if (cookie && cookie->len > 0)
693                         tfom->cookie = *cookie;
694                 if (syn_lost) {
695                         ++tfom->syn_loss;
696                         tfom->last_syn_loss = jiffies;
697                 } else
698                         tfom->syn_loss = 0;
699                 write_sequnlock_bh(&fastopen_seqlock);
700         }
701         rcu_read_unlock();
702 }
703
704 static struct genl_family tcp_metrics_nl_family = {
705         .id             = GENL_ID_GENERATE,
706         .hdrsize        = 0,
707         .name           = TCP_METRICS_GENL_NAME,
708         .version        = TCP_METRICS_GENL_VERSION,
709         .maxattr        = TCP_METRICS_ATTR_MAX,
710         .netnsok        = true,
711 };
712
713 static struct nla_policy tcp_metrics_nl_policy[TCP_METRICS_ATTR_MAX + 1] = {
714         [TCP_METRICS_ATTR_ADDR_IPV4]    = { .type = NLA_U32, },
715         [TCP_METRICS_ATTR_ADDR_IPV6]    = { .type = NLA_BINARY,
716                                             .len = sizeof(struct in6_addr), },
717         /* The following attributes are not received for GET/DEL;
718          * we keep them for reference.
719          */
720 #if 0
721         [TCP_METRICS_ATTR_AGE]          = { .type = NLA_MSECS, },
722         [TCP_METRICS_ATTR_TW_TSVAL]     = { .type = NLA_U32, },
723         [TCP_METRICS_ATTR_TW_TS_STAMP]  = { .type = NLA_S32, },
724         [TCP_METRICS_ATTR_VALS]         = { .type = NLA_NESTED, },
725         [TCP_METRICS_ATTR_FOPEN_MSS]    = { .type = NLA_U16, },
726         [TCP_METRICS_ATTR_FOPEN_SYN_DROPS]      = { .type = NLA_U16, },
727         [TCP_METRICS_ATTR_FOPEN_SYN_DROP_TS]    = { .type = NLA_MSECS, },
728         [TCP_METRICS_ATTR_FOPEN_COOKIE] = { .type = NLA_BINARY,
729                                             .len = TCP_FASTOPEN_COOKIE_MAX, },
730 #endif
731 };
732
733 /* Add attributes, caller cancels its header on failure */
734 static int tcp_metrics_fill_info(struct sk_buff *msg,
735                                  struct tcp_metrics_block *tm)
736 {
737         struct nlattr *nest;
738         int i;
739
740         switch (tm->tcpm_addr.family) {
741         case AF_INET:
742                 if (nla_put_be32(msg, TCP_METRICS_ATTR_ADDR_IPV4,
743                                 tm->tcpm_addr.addr.a4) < 0)
744                         goto nla_put_failure;
745                 break;
746         case AF_INET6:
747                 if (nla_put(msg, TCP_METRICS_ATTR_ADDR_IPV6, 16,
748                             tm->tcpm_addr.addr.a6) < 0)
749                         goto nla_put_failure;
750                 break;
751         default:
752                 return -EAFNOSUPPORT;
753         }
754
755         if (nla_put_msecs(msg, TCP_METRICS_ATTR_AGE,
756                           jiffies - tm->tcpm_stamp) < 0)
757                 goto nla_put_failure;
758         if (tm->tcpm_ts_stamp) {
759                 if (nla_put_s32(msg, TCP_METRICS_ATTR_TW_TS_STAMP,
760                                 (s32) (get_seconds() - tm->tcpm_ts_stamp)) < 0)
761                         goto nla_put_failure;
762                 if (nla_put_u32(msg, TCP_METRICS_ATTR_TW_TSVAL,
763                                 tm->tcpm_ts) < 0)
764                         goto nla_put_failure;
765         }
766
767         {
768                 int n = 0;
769
770                 nest = nla_nest_start(msg, TCP_METRICS_ATTR_VALS);
771                 if (!nest)
772                         goto nla_put_failure;
773                 for (i = 0; i < TCP_METRIC_MAX + 1; i++) {
774                         if (!tm->tcpm_vals[i])
775                                 continue;
776                         if (nla_put_u32(msg, i + 1, tm->tcpm_vals[i]) < 0)
777                                 goto nla_put_failure;
778                         n++;
779                 }
780                 if (n)
781                         nla_nest_end(msg, nest);
782                 else
783                         nla_nest_cancel(msg, nest);
784         }
785
786         {
787                 struct tcp_fastopen_metrics tfom_copy[1], *tfom;
788                 unsigned int seq;
789
790                 do {
791                         seq = read_seqbegin(&fastopen_seqlock);
792                         tfom_copy[0] = tm->tcpm_fastopen;
793                 } while (read_seqretry(&fastopen_seqlock, seq));
794
795                 tfom = tfom_copy;
796                 if (tfom->mss &&
797                     nla_put_u16(msg, TCP_METRICS_ATTR_FOPEN_MSS,
798                                 tfom->mss) < 0)
799                         goto nla_put_failure;
800                 if (tfom->syn_loss &&
801                     (nla_put_u16(msg, TCP_METRICS_ATTR_FOPEN_SYN_DROPS,
802                                 tfom->syn_loss) < 0 ||
803                      nla_put_msecs(msg, TCP_METRICS_ATTR_FOPEN_SYN_DROP_TS,
804                                 jiffies - tfom->last_syn_loss) < 0))
805                         goto nla_put_failure;
806                 if (tfom->cookie.len > 0 &&
807                     nla_put(msg, TCP_METRICS_ATTR_FOPEN_COOKIE,
808                             tfom->cookie.len, tfom->cookie.val) < 0)
809                         goto nla_put_failure;
810         }
811
812         return 0;
813
814 nla_put_failure:
815         return -EMSGSIZE;
816 }
817
818 static int tcp_metrics_dump_info(struct sk_buff *skb,
819                                  struct netlink_callback *cb,
820                                  struct tcp_metrics_block *tm)
821 {
822         void *hdr;
823
824         hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
825                           &tcp_metrics_nl_family, NLM_F_MULTI,
826                           TCP_METRICS_CMD_GET);
827         if (!hdr)
828                 return -EMSGSIZE;
829
830         if (tcp_metrics_fill_info(skb, tm) < 0)
831                 goto nla_put_failure;
832
833         return genlmsg_end(skb, hdr);
834
835 nla_put_failure:
836         genlmsg_cancel(skb, hdr);
837         return -EMSGSIZE;
838 }
839
840 static int tcp_metrics_nl_dump(struct sk_buff *skb,
841                                struct netlink_callback *cb)
842 {
843         struct net *net = sock_net(skb->sk);
844         unsigned int max_rows = 1U << net->ipv4.tcp_metrics_hash_log;
845         unsigned int row, s_row = cb->args[0];
846         int s_col = cb->args[1], col = s_col;
847
848         for (row = s_row; row < max_rows; row++, s_col = 0) {
849                 struct tcp_metrics_block *tm;
850                 struct tcpm_hash_bucket *hb = net->ipv4.tcp_metrics_hash + row;
851
852                 rcu_read_lock();
853                 for (col = 0, tm = rcu_dereference(hb->chain); tm;
854                      tm = rcu_dereference(tm->tcpm_next), col++) {
855                         if (col < s_col)
856                                 continue;
857                         if (tcp_metrics_dump_info(skb, cb, tm) < 0) {
858                                 rcu_read_unlock();
859                                 goto done;
860                         }
861                 }
862                 rcu_read_unlock();
863         }
864
865 done:
866         cb->args[0] = row;
867         cb->args[1] = col;
868         return skb->len;
869 }
870
871 static int parse_nl_addr(struct genl_info *info, struct inetpeer_addr *addr,
872                          unsigned int *hash, int optional)
873 {
874         struct nlattr *a;
875
876         a = info->attrs[TCP_METRICS_ATTR_ADDR_IPV4];
877         if (a) {
878                 addr->family = AF_INET;
879                 addr->addr.a4 = nla_get_be32(a);
880                 *hash = (__force unsigned int) addr->addr.a4;
881                 return 0;
882         }
883         a = info->attrs[TCP_METRICS_ATTR_ADDR_IPV6];
884         if (a) {
885                 if (nla_len(a) != sizeof(struct in6_addr))
886                         return -EINVAL;
887                 addr->family = AF_INET6;
888                 memcpy(addr->addr.a6, nla_data(a), sizeof(addr->addr.a6));
889                 *hash = ipv6_addr_hash((struct in6_addr *) addr->addr.a6);
890                 return 0;
891         }
892         return optional ? 1 : -EAFNOSUPPORT;
893 }
894
895 static int tcp_metrics_nl_cmd_get(struct sk_buff *skb, struct genl_info *info)
896 {
897         struct tcp_metrics_block *tm;
898         struct inetpeer_addr addr;
899         unsigned int hash;
900         struct sk_buff *msg;
901         struct net *net = genl_info_net(info);
902         void *reply;
903         int ret;
904
905         ret = parse_nl_addr(info, &addr, &hash, 0);
906         if (ret < 0)
907                 return ret;
908
909         msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
910         if (!msg)
911                 return -ENOMEM;
912
913         reply = genlmsg_put_reply(msg, info, &tcp_metrics_nl_family, 0,
914                                   info->genlhdr->cmd);
915         if (!reply)
916                 goto nla_put_failure;
917
918         hash = hash_32(hash, net->ipv4.tcp_metrics_hash_log);
919         ret = -ESRCH;
920         rcu_read_lock();
921         for (tm = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); tm;
922              tm = rcu_dereference(tm->tcpm_next)) {
923                 if (addr_same(&tm->tcpm_addr, &addr)) {
924                         ret = tcp_metrics_fill_info(msg, tm);
925                         break;
926                 }
927         }
928         rcu_read_unlock();
929         if (ret < 0)
930                 goto out_free;
931
932         genlmsg_end(msg, reply);
933         return genlmsg_reply(msg, info);
934
935 nla_put_failure:
936         ret = -EMSGSIZE;
937
938 out_free:
939         nlmsg_free(msg);
940         return ret;
941 }
942
943 #define deref_locked_genl(p)    \
944         rcu_dereference_protected(p, lockdep_genl_is_held() && \
945                                      lockdep_is_held(&tcp_metrics_lock))
946
947 #define deref_genl(p)   rcu_dereference_protected(p, lockdep_genl_is_held())
948
949 static int tcp_metrics_flush_all(struct net *net)
950 {
951         unsigned int max_rows = 1U << net->ipv4.tcp_metrics_hash_log;
952         struct tcpm_hash_bucket *hb = net->ipv4.tcp_metrics_hash;
953         struct tcp_metrics_block *tm;
954         unsigned int row;
955
956         for (row = 0; row < max_rows; row++, hb++) {
957                 spin_lock_bh(&tcp_metrics_lock);
958                 tm = deref_locked_genl(hb->chain);
959                 if (tm)
960                         hb->chain = NULL;
961                 spin_unlock_bh(&tcp_metrics_lock);
962                 while (tm) {
963                         struct tcp_metrics_block *next;
964
965                         next = deref_genl(tm->tcpm_next);
966                         kfree_rcu(tm, rcu_head);
967                         tm = next;
968                 }
969         }
970         return 0;
971 }
972
973 static int tcp_metrics_nl_cmd_del(struct sk_buff *skb, struct genl_info *info)
974 {
975         struct tcpm_hash_bucket *hb;
976         struct tcp_metrics_block *tm;
977         struct tcp_metrics_block __rcu **pp;
978         struct inetpeer_addr addr;
979         unsigned int hash;
980         struct net *net = genl_info_net(info);
981         int ret;
982
983         ret = parse_nl_addr(info, &addr, &hash, 1);
984         if (ret < 0)
985                 return ret;
986         if (ret > 0)
987                 return tcp_metrics_flush_all(net);
988
989         hash = hash_32(hash, net->ipv4.tcp_metrics_hash_log);
990         hb = net->ipv4.tcp_metrics_hash + hash;
991         pp = &hb->chain;
992         spin_lock_bh(&tcp_metrics_lock);
993         for (tm = deref_locked_genl(*pp); tm;
994              pp = &tm->tcpm_next, tm = deref_locked_genl(*pp)) {
995                 if (addr_same(&tm->tcpm_addr, &addr)) {
996                         *pp = tm->tcpm_next;
997                         break;
998                 }
999         }
1000         spin_unlock_bh(&tcp_metrics_lock);
1001         if (!tm)
1002                 return -ESRCH;
1003         kfree_rcu(tm, rcu_head);
1004         return 0;
1005 }
1006
1007 static const struct genl_ops tcp_metrics_nl_ops[] = {
1008         {
1009                 .cmd = TCP_METRICS_CMD_GET,
1010                 .doit = tcp_metrics_nl_cmd_get,
1011                 .dumpit = tcp_metrics_nl_dump,
1012                 .policy = tcp_metrics_nl_policy,
1013                 .flags = GENL_ADMIN_PERM,
1014         },
1015         {
1016                 .cmd = TCP_METRICS_CMD_DEL,
1017                 .doit = tcp_metrics_nl_cmd_del,
1018                 .policy = tcp_metrics_nl_policy,
1019                 .flags = GENL_ADMIN_PERM,
1020         },
1021 };
1022
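/* These two operations back the "tcp_metrics" generic netlink family;
 * userspace tooling such as iproute2's "ip tcp_metrics show" and
 * "ip tcp_metrics delete" (where available) issues TCP_METRICS_CMD_GET
 * and TCP_METRICS_CMD_DEL against it.
 */
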
1023 static unsigned int tcpmhash_entries;
1024 static int __init set_tcpmhash_entries(char *str)
1025 {
1026         ssize_t ret;
1027
1028         if (!str)
1029                 return 0;
1030
1031         ret = kstrtouint(str, 0, &tcpmhash_entries);
1032         if (ret)
1033                 return 0;
1034
1035         return 1;
1036 }
1037 __setup("tcpmhash_entries=", set_tcpmhash_entries);
1038
1039 static int __net_init tcp_net_metrics_init(struct net *net)
1040 {
1041         size_t size;
1042         unsigned int slots;
1043
1044         slots = tcpmhash_entries;
1045         if (!slots) {
1046                 if (totalram_pages >= 128 * 1024)
1047                         slots = 16 * 1024;
1048                 else
1049                         slots = 8 * 1024;
1050         }
1051
1052         net->ipv4.tcp_metrics_hash_log = order_base_2(slots);
1053         size = sizeof(struct tcpm_hash_bucket) << net->ipv4.tcp_metrics_hash_log;
1054
1055         net->ipv4.tcp_metrics_hash = kzalloc(size, GFP_KERNEL | __GFP_NOWARN);
1056         if (!net->ipv4.tcp_metrics_hash)
1057                 net->ipv4.tcp_metrics_hash = vzalloc(size);
1058
1059         if (!net->ipv4.tcp_metrics_hash)
1060                 return -ENOMEM;
1061
1062         return 0;
1063 }
1064
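/* Sizing example: on a machine with at least 128 * 1024 pages of RAM
 * (512 MiB with 4 KiB pages) and no tcpmhash_entries= override, slots =
 * 16384, tcp_metrics_hash_log = order_base_2(16384) = 14, and the bucket
 * array is sizeof(struct tcpm_hash_bucket) << 14 bytes -- 128 KiB with
 * 8-byte pointers -- allocated with kzalloc() or, failing that, vzalloc().
 */
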
1065 static void __net_exit tcp_net_metrics_exit(struct net *net)
1066 {
1067         unsigned int i;
1068
1069         for (i = 0; i < (1U << net->ipv4.tcp_metrics_hash_log) ; i++) {
1070                 struct tcp_metrics_block *tm, *next;
1071
1072                 tm = rcu_dereference_protected(net->ipv4.tcp_metrics_hash[i].chain, 1);
1073                 while (tm) {
1074                         next = rcu_dereference_protected(tm->tcpm_next, 1);
1075                         kfree(tm);
1076                         tm = next;
1077                 }
1078         }
1079         if (is_vmalloc_addr(net->ipv4.tcp_metrics_hash))
1080                 vfree(net->ipv4.tcp_metrics_hash);
1081         else
1082                 kfree(net->ipv4.tcp_metrics_hash);
1083 }
1084
1085 static __net_initdata struct pernet_operations tcp_net_metrics_ops = {
1086         .init   =       tcp_net_metrics_init,
1087         .exit   =       tcp_net_metrics_exit,
1088 };
1089
1090 void __init tcp_metrics_init(void)
1091 {
1092         int ret;
1093
1094         ret = register_pernet_subsys(&tcp_net_metrics_ops);
1095         if (ret < 0)
1096                 goto cleanup;
1097         ret = genl_register_family_with_ops(&tcp_metrics_nl_family,
1098                                             tcp_metrics_nl_ops);
1099         if (ret < 0)
1100                 goto cleanup_subsys;
1101         return;
1102
1103 cleanup_subsys:
1104         unregister_pernet_subsys(&tcp_net_metrics_ops);
1105
1106 cleanup:
1107         return;
1108 }