1 | /* $NetBSD: tcp_congctl.c,v 1.21 2016/04/26 08:44:44 ozaki-r Exp $ */ |
2 | |
3 | /*- |
4 | * Copyright (c) 1997, 1998, 1999, 2001, 2005, 2006 The NetBSD Foundation, Inc. |
5 | * All rights reserved. |
6 | * |
7 | * This code is derived from software contributed to The NetBSD Foundation |
8 | * by Jason R. Thorpe and Kevin M. Lahey of the Numerical Aerospace Simulation |
9 | * Facility, NASA Ames Research Center. |
10 | * This code is derived from software contributed to The NetBSD Foundation |
11 | * by Charles M. Hannum. |
12 | * This code is derived from software contributed to The NetBSD Foundation |
13 | * by Rui Paulo. |
14 | * |
15 | * Redistribution and use in source and binary forms, with or without |
16 | * modification, are permitted provided that the following conditions |
17 | * are met: |
18 | * 1. Redistributions of source code must retain the above copyright |
19 | * notice, this list of conditions and the following disclaimer. |
20 | * 2. Redistributions in binary form must reproduce the above copyright |
21 | * notice, this list of conditions and the following disclaimer in the |
22 | * documentation and/or other materials provided with the distribution. |
23 | * |
24 | * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS |
25 | * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED |
26 | * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR |
27 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS |
28 | * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
29 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
30 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
31 | * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
32 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
33 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
34 | * POSSIBILITY OF SUCH DAMAGE. |
35 | */ |
36 | |
37 | /* |
38 | * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. |
39 | * All rights reserved. |
40 | * |
41 | * Redistribution and use in source and binary forms, with or without |
42 | * modification, are permitted provided that the following conditions |
43 | * are met: |
44 | * 1. Redistributions of source code must retain the above copyright |
45 | * notice, this list of conditions and the following disclaimer. |
46 | * 2. Redistributions in binary form must reproduce the above copyright |
47 | * notice, this list of conditions and the following disclaimer in the |
48 | * documentation and/or other materials provided with the distribution. |
49 | * 3. Neither the name of the project nor the names of its contributors |
50 | * may be used to endorse or promote products derived from this software |
51 | * without specific prior written permission. |
52 | * |
53 | * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND |
54 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
55 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
56 | * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE |
57 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL |
58 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS |
59 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) |
60 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT |
61 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY |
62 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF |
63 | * SUCH DAMAGE. |
64 | */ |
65 | |
66 | /* |
67 | * @(#)COPYRIGHT 1.1 (NRL) 17 January 1995 |
68 | * |
69 | * NRL grants permission for redistribution and use in source and binary |
70 | * forms, with or without modification, of the software and documentation |
71 | * created at NRL provided that the following conditions are met: |
72 | * |
73 | * 1. Redistributions of source code must retain the above copyright |
74 | * notice, this list of conditions and the following disclaimer. |
75 | * 2. Redistributions in binary form must reproduce the above copyright |
76 | * notice, this list of conditions and the following disclaimer in the |
77 | * documentation and/or other materials provided with the distribution. |
78 | * 3. All advertising materials mentioning features or use of this software |
79 | * must display the following acknowledgements: |
80 | * This product includes software developed by the University of |
81 | * California, Berkeley and its contributors. |
82 | * This product includes software developed at the Information |
83 | * Technology Division, US Naval Research Laboratory. |
84 | * 4. Neither the name of the NRL nor the names of its contributors |
85 | * may be used to endorse or promote products derived from this software |
86 | * without specific prior written permission. |
87 | * |
88 | * THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS |
89 | * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED |
90 | * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A |
91 | * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NRL OR |
92 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, |
93 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, |
94 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR |
95 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF |
96 | * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING |
97 | * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS |
98 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
99 | * |
100 | * The views and conclusions contained in the software and documentation |
101 | * are those of the authors and should not be interpreted as representing |
102 | * official policies, either expressed or implied, of the US Naval |
103 | * Research Laboratory (NRL). |
104 | */ |
105 | |
106 | /* |
107 | * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995 |
108 | * The Regents of the University of California. All rights reserved. |
109 | * |
110 | * Redistribution and use in source and binary forms, with or without |
111 | * modification, are permitted provided that the following conditions |
112 | * are met: |
113 | * 1. Redistributions of source code must retain the above copyright |
114 | * notice, this list of conditions and the following disclaimer. |
115 | * 2. Redistributions in binary form must reproduce the above copyright |
116 | * notice, this list of conditions and the following disclaimer in the |
117 | * documentation and/or other materials provided with the distribution. |
118 | * 3. Neither the name of the University nor the names of its contributors |
119 | * may be used to endorse or promote products derived from this software |
120 | * without specific prior written permission. |
121 | * |
122 | * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND |
123 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
124 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
125 | * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE |
126 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL |
127 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS |
128 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) |
129 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT |
130 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY |
131 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF |
132 | * SUCH DAMAGE. |
133 | * |
134 | * @(#)tcp_input.c 8.12 (Berkeley) 5/24/95 |
135 | */ |
136 | |
137 | #include <sys/cdefs.h> |
138 | __KERNEL_RCSID(0, "$NetBSD: tcp_congctl.c,v 1.21 2016/04/26 08:44:44 ozaki-r Exp $" ); |
139 | |
140 | #ifdef _KERNEL_OPT |
141 | #include "opt_inet.h" |
142 | #include "opt_tcp_debug.h" |
143 | #include "opt_tcp_congctl.h" |
144 | #endif |
145 | |
146 | #include <sys/param.h> |
147 | #include <sys/systm.h> |
148 | #include <sys/malloc.h> |
149 | #include <sys/mbuf.h> |
150 | #include <sys/protosw.h> |
151 | #include <sys/socket.h> |
152 | #include <sys/socketvar.h> |
153 | #include <sys/errno.h> |
154 | #include <sys/syslog.h> |
155 | #include <sys/pool.h> |
156 | #include <sys/domain.h> |
157 | #include <sys/kernel.h> |
158 | #include <sys/mutex.h> |
159 | |
160 | #include <net/if.h> |
161 | |
162 | #include <netinet/in.h> |
163 | #include <netinet/in_systm.h> |
164 | #include <netinet/ip.h> |
165 | #include <netinet/in_pcb.h> |
166 | #include <netinet/in_var.h> |
167 | #include <netinet/ip_var.h> |
168 | |
169 | #ifdef INET6 |
170 | #ifndef INET |
171 | #include <netinet/in.h> |
172 | #endif |
173 | #include <netinet/ip6.h> |
174 | #include <netinet6/ip6_var.h> |
175 | #include <netinet6/in6_pcb.h> |
176 | #include <netinet6/ip6_var.h> |
177 | #include <netinet6/in6_var.h> |
178 | #include <netinet/icmp6.h> |
179 | #include <netinet6/nd6.h> |
180 | #endif |
181 | |
182 | #include <netinet/tcp.h> |
183 | #include <netinet/tcp_fsm.h> |
184 | #include <netinet/tcp_seq.h> |
185 | #include <netinet/tcp_timer.h> |
186 | #include <netinet/tcp_var.h> |
187 | #include <netinet/tcpip.h> |
188 | #include <netinet/tcp_congctl.h> |
189 | #ifdef TCP_DEBUG |
190 | #include <netinet/tcp_debug.h> |
191 | #endif |
192 | |
193 | /* |
194 | * TODO: |
195 | * consider separating the actual implementations in another file. |
196 | */ |
197 | |
/*
 * Forward declarations for the per-algorithm hooks wired into the
 * tcp_congctl vectors defined later in this file.
 */

/* Shared multiplicative-decrease step used by all algorithms. */
static void tcp_common_congestion_exp(struct tcpcb *, int, int);

/* Reno hooks. */
static int tcp_reno_do_fast_retransmit(struct tcpcb *, const struct tcphdr *);
static int tcp_reno_fast_retransmit(struct tcpcb *, const struct tcphdr *);
static void tcp_reno_slow_retransmit(struct tcpcb *);
static void tcp_reno_fast_retransmit_newack(struct tcpcb *,
    const struct tcphdr *);
static void tcp_reno_newack(struct tcpcb *, const struct tcphdr *);
static void tcp_reno_congestion_exp(struct tcpcb *tp);

/* NewReno hooks (the rest are shared with Reno). */
static int tcp_newreno_fast_retransmit(struct tcpcb *, const struct tcphdr *);
static void tcp_newreno_fast_retransmit_newack(struct tcpcb *,
    const struct tcphdr *);
static void tcp_newreno_newack(struct tcpcb *, const struct tcphdr *);

/* CUBIC hooks (partial-ack handling is shared with NewReno). */
static int tcp_cubic_fast_retransmit(struct tcpcb *, const struct tcphdr *);
static void tcp_cubic_slow_retransmit(struct tcpcb *tp);
static void tcp_cubic_newack(struct tcpcb *, const struct tcphdr *);
static void tcp_cubic_congestion_exp(struct tcpcb *);

/* Rebuild the space-separated algorithm list in tcp_congctl_avail. */
static void tcp_congctl_fillnames(void);

/* Duplicate-ack threshold for fast retransmit (defined in tcp_input.c). */
extern int tcprexmtthresh;
MALLOC_DEFINE(M_TCPCONGCTL, "tcpcongctl", "TCP congestion control structures");

/* currently selected global congestion control */
char tcp_congctl_global_name[TCPCC_MAXLEN];

/* available global congestion control algorithms (space separated) */
char tcp_congctl_avail[10 * TCPCC_MAXLEN];

/*
 * Used to list the available congestion control algorithms.
 */
TAILQ_HEAD(, tcp_congctlent) tcp_congctlhd =
    TAILQ_HEAD_INITIALIZER(tcp_congctlhd);

/* List entry currently selected as the global default. */
static struct tcp_congctlent * tcp_congctl_global;

/* Serializes refcount hand-over in tcp_congctl_select(). */
static kmutex_t tcp_congctl_mtx;
239 | |
/*
 * Initialize the congestion control framework: set up the lock, register
 * the three built-in algorithms, and select the compile-time default.
 * Called once at TCP attach time; each registration is expected to
 * succeed (asserted under DIAGNOSTIC).
 */
void
tcp_congctl_init(void)
{
	int r __diagused;

	mutex_init(&tcp_congctl_mtx, MUTEX_DEFAULT, IPL_NONE);

	/* Base algorithms. */
	r = tcp_congctl_register("reno", &tcp_reno_ctl);
	KASSERT(r == 0);
	r = tcp_congctl_register("newreno", &tcp_newreno_ctl);
	KASSERT(r == 0);
	r = tcp_congctl_register("cubic", &tcp_cubic_ctl);
	KASSERT(r == 0);

	/* NewReno is the default. */
#ifndef TCP_CONGCTL_DEFAULT
#define TCP_CONGCTL_DEFAULT "newreno"
#endif

	r = tcp_congctl_select(NULL, TCP_CONGCTL_DEFAULT);
	KASSERT(r == 0);
}
263 | |
264 | /* |
265 | * Register a congestion algorithm and select it if we have none. |
266 | */ |
267 | int |
268 | tcp_congctl_register(const char *name, const struct tcp_congctl *tcc) |
269 | { |
270 | struct tcp_congctlent *ntcc, *tccp; |
271 | |
272 | TAILQ_FOREACH(tccp, &tcp_congctlhd, congctl_ent) |
273 | if (!strcmp(name, tccp->congctl_name)) { |
274 | /* name already registered */ |
275 | return EEXIST; |
276 | } |
277 | |
278 | ntcc = malloc(sizeof(*ntcc), M_TCPCONGCTL, M_WAITOK|M_ZERO); |
279 | |
280 | strlcpy(ntcc->congctl_name, name, sizeof(ntcc->congctl_name) - 1); |
281 | ntcc->congctl_ctl = tcc; |
282 | |
283 | TAILQ_INSERT_TAIL(&tcp_congctlhd, ntcc, congctl_ent); |
284 | tcp_congctl_fillnames(); |
285 | |
286 | if (TAILQ_FIRST(&tcp_congctlhd) == ntcc) |
287 | tcp_congctl_select(NULL, name); |
288 | |
289 | return 0; |
290 | } |
291 | |
292 | int |
293 | tcp_congctl_unregister(const char *name) |
294 | { |
295 | struct tcp_congctlent *tccp, *rtccp; |
296 | unsigned int size; |
297 | |
298 | rtccp = NULL; |
299 | size = 0; |
300 | TAILQ_FOREACH(tccp, &tcp_congctlhd, congctl_ent) { |
301 | if (!strcmp(name, tccp->congctl_name)) |
302 | rtccp = tccp; |
303 | size++; |
304 | } |
305 | |
306 | if (!rtccp) |
307 | return ENOENT; |
308 | |
309 | if (size <= 1 || tcp_congctl_global == rtccp || rtccp->congctl_refcnt) |
310 | return EBUSY; |
311 | |
312 | TAILQ_REMOVE(&tcp_congctlhd, rtccp, congctl_ent); |
313 | free(rtccp, M_TCPCONGCTL); |
314 | tcp_congctl_fillnames(); |
315 | |
316 | return 0; |
317 | } |
318 | |
319 | /* |
320 | * Select a congestion algorithm by name. |
321 | */ |
322 | int |
323 | tcp_congctl_select(struct tcpcb *tp, const char *name) |
324 | { |
325 | struct tcp_congctlent *tccp, *old_tccp, *new_tccp; |
326 | bool old_found, new_found; |
327 | |
328 | KASSERT(name); |
329 | |
330 | old_found = (tp == NULL || tp->t_congctl == NULL); |
331 | old_tccp = NULL; |
332 | new_found = false; |
333 | new_tccp = NULL; |
334 | |
335 | TAILQ_FOREACH(tccp, &tcp_congctlhd, congctl_ent) { |
336 | if (!old_found && tccp->congctl_ctl == tp->t_congctl) { |
337 | old_tccp = tccp; |
338 | old_found = true; |
339 | } |
340 | |
341 | if (!new_found && !strcmp(name, tccp->congctl_name)) { |
342 | new_tccp = tccp; |
343 | new_found = true; |
344 | } |
345 | |
346 | if (new_found && old_found) { |
347 | if (tp) { |
348 | mutex_enter(&tcp_congctl_mtx); |
349 | if (old_tccp) |
350 | old_tccp->congctl_refcnt--; |
351 | tp->t_congctl = new_tccp->congctl_ctl; |
352 | new_tccp->congctl_refcnt++; |
353 | mutex_exit(&tcp_congctl_mtx); |
354 | } else { |
355 | tcp_congctl_global = new_tccp; |
356 | strlcpy(tcp_congctl_global_name, |
357 | new_tccp->congctl_name, |
358 | sizeof(tcp_congctl_global_name) - 1); |
359 | } |
360 | return 0; |
361 | } |
362 | } |
363 | |
364 | return EINVAL; |
365 | } |
366 | |
367 | void |
368 | tcp_congctl_release(struct tcpcb *tp) |
369 | { |
370 | struct tcp_congctlent *tccp; |
371 | |
372 | KASSERT(tp->t_congctl); |
373 | |
374 | TAILQ_FOREACH(tccp, &tcp_congctlhd, congctl_ent) { |
375 | if (tccp->congctl_ctl == tp->t_congctl) { |
376 | tccp->congctl_refcnt--; |
377 | return; |
378 | } |
379 | } |
380 | } |
381 | |
382 | /* |
383 | * Returns the name of a congestion algorithm. |
384 | */ |
385 | const char * |
386 | tcp_congctl_bystruct(const struct tcp_congctl *tcc) |
387 | { |
388 | struct tcp_congctlent *tccp; |
389 | |
390 | KASSERT(tcc); |
391 | |
392 | TAILQ_FOREACH(tccp, &tcp_congctlhd, congctl_ent) |
393 | if (tccp->congctl_ctl == tcc) |
394 | return tccp->congctl_name; |
395 | |
396 | return NULL; |
397 | } |
398 | |
399 | static void |
400 | tcp_congctl_fillnames(void) |
401 | { |
402 | struct tcp_congctlent *tccp; |
403 | const char *delim = " " ; |
404 | |
405 | tcp_congctl_avail[0] = '\0'; |
406 | TAILQ_FOREACH(tccp, &tcp_congctlhd, congctl_ent) { |
407 | strlcat(tcp_congctl_avail, tccp->congctl_name, |
408 | sizeof(tcp_congctl_avail) - 1); |
409 | if (TAILQ_NEXT(tccp, congctl_ent)) |
410 | strlcat(tcp_congctl_avail, delim, |
411 | sizeof(tcp_congctl_avail) - 1); |
412 | } |
413 | |
414 | } |
415 | |
416 | /* ------------------------------------------------------------------------ */ |
417 | |
418 | /* |
419 | * Common stuff |
420 | */ |
421 | |
/* Window reduction (1-beta) for [New]Reno: 0.5 */
#define RENO_BETAA 1
#define RENO_BETAB 2
/* Window reduction (1-beta) for Cubic: 0.8 */
#define CUBIC_BETAA 4
#define CUBIC_BETAB 5
/* Draft Rhee Section 4.1 */
#define CUBIC_CA 4
#define CUBIC_CB 10

/*
 * Multiplicative-decrease step shared by all algorithms: scale the
 * effective window by betaa/betab, rounded down to whole segments and
 * never below two segments, then make the result both the new ssthresh
 * and the new cwnd, remembering snd_max as the recovery point.
 */
static void
tcp_common_congestion_exp(struct tcpcb *tp, int betaa, int betab)
{
	u_int win;

	/*
	 * Reduce the congestion window and the slow start threshold.
	 */
	win = min(tp->snd_wnd, tp->snd_cwnd) * betaa / betab / tp->t_segsz;
	if (win < 2)
		win = 2;

	tp->snd_ssthresh = win * tp->t_segsz;
	tp->snd_recover = tp->snd_max;
	tp->snd_cwnd = tp->snd_ssthresh;

	/*
	 * When using TCP ECN, notify the peer that
	 * we reduced the cwnd.
	 */
	if (TCP_ECN_ALLOWED(tp))
		tp->t_flags |= TF_ECN_SND_CWR;
}
455 | |
456 | |
457 | /* ------------------------------------------------------------------------ */ |
458 | |
459 | /* |
460 | * TCP/Reno congestion control. |
461 | */ |
/* Reno congestion event: multiplicative decrease with beta = 1/2. */
static void
tcp_reno_congestion_exp(struct tcpcb *tp)
{

	tcp_common_congestion_exp(tp, RENO_BETAA, RENO_BETAB);
}
468 | |
/*
 * Common fast-retransmit machinery (used by Reno, NewReno and CUBIC):
 * retransmit the presumed-lost segment and set up the recovery state.
 * The caller has already done the window reduction.
 */
static int
tcp_reno_do_fast_retransmit(struct tcpcb *tp, const struct tcphdr *th)
{
	/*
	 * Dup acks mean that packets have left the
	 * network (they're now cached at the receiver)
	 * so bump cwnd by the amount in the receiver
	 * to keep a constant cwnd packets in the
	 * network.
	 *
	 * If we are using TCP/SACK, then enter
	 * Fast Recovery if the receiver SACKs
	 * data that is tcprexmtthresh * MSS
	 * bytes past the last ACKed segment,
	 * irrespective of the number of DupAcks.
	 */

	tcp_seq onxt = tp->snd_nxt;

	/* Begin a recovery episode; stop RTT sampling and the rexmt timer. */
	tp->t_partialacks = 0;
	TCP_TIMER_DISARM(tp, TCPT_REXMT);
	tp->t_rtttime = 0;
	if (TCP_SACK_ENABLED(tp)) {
		/*
		 * SACK recovery: remember where new data starts and let
		 * tcp_output() retransmit one segment's worth.
		 */
		tp->t_dupacks = tcprexmtthresh;
		tp->sack_newdata = tp->snd_nxt;
		tp->snd_cwnd = tp->t_segsz;
		(void) tcp_output(tp);
		return 0;
	}
	/*
	 * Non-SACK: rewind snd_nxt to the missing segment and resend just
	 * that one segment (cwnd temporarily one segment wide).
	 */
	tp->snd_nxt = th->th_ack;
	tp->snd_cwnd = tp->t_segsz;
	(void) tcp_output(tp);
	/* Reinflate cwnd: ssthresh plus one segment per duplicate ack. */
	tp->snd_cwnd = tp->snd_ssthresh + tp->t_segsz * tp->t_dupacks;
	if (SEQ_GT(onxt, tp->snd_nxt))
		tp->snd_nxt = onxt;

	return 0;
}
507 | |
/*
 * Reno fast retransmit: reduce the window first, then retransmit the
 * presumed-lost segment via the common machinery.  Always returns 0
 * (Reno never rejects a fast retransmit).
 */
static int
tcp_reno_fast_retransmit(struct tcpcb *tp, const struct tcphdr *th)
{

	/*
	 * We know we're losing at the current
	 * window size so do congestion avoidance
	 * (set ssthresh to half the current window
	 * and pull our congestion window back to
	 * the new ssthresh).
	 */

	tcp_reno_congestion_exp(tp);
	return tcp_reno_do_fast_retransmit(tp, th);
}
523 | |
/*
 * Reno response to a retransmit timeout: record half the current flight
 * as ssthresh and restart from a one-segment loss window, leaving any
 * fast-recovery episode.
 */
static void
tcp_reno_slow_retransmit(struct tcpcb *tp)
{
	u_int win;

	/*
	 * Close the congestion window down to one segment
	 * (we'll open it by one segment for each ack we get).
	 * Since we probably have a window's worth of unacked
	 * data accumulated, this "slow start" keeps us from
	 * dumping all that data as back-to-back packets (which
	 * might overwhelm an intermediate gateway).
	 *
	 * There are two phases to the opening: Initially we
	 * open by one mss on each ack.  This makes the window
	 * size increase exponentially with time.  If the
	 * window is larger than the path can handle, this
	 * exponential growth results in dropped packet(s)
	 * almost immediately.  To get more time between
	 * drops but still "push" the network to take advantage
	 * of improving conditions, we switch from exponential
	 * to linear window opening at some threshhold size.
	 * For a threshhold, we use half the current window
	 * size, truncated to a multiple of the mss.
	 *
	 * (the minimum cwnd that will give us exponential
	 * growth is 2 mss.  We don't allow the threshhold
	 * to go below this.)
	 */

	win = min(tp->snd_wnd, tp->snd_cwnd) / 2 / tp->t_segsz;
	if (win < 2)
		win = 2;
	/* Loss Window MUST be one segment. */
	tp->snd_cwnd = tp->t_segsz;
	tp->snd_ssthresh = win * tp->t_segsz;
	/* Leave fast recovery and reset the ack bookkeeping. */
	tp->t_partialacks = -1;
	tp->t_dupacks = 0;
	tp->t_bytes_acked = 0;

	/* With ECN, tell the peer we reduced the window. */
	if (TCP_ECN_ALLOWED(tp))
		tp->t_flags |= TF_ECN_SND_CWR;
}
567 | |
/*
 * Reno reaction to an ack that advances snd_una: if we were never in
 * fast recovery just clear the dupack count; otherwise exit fast
 * recovery and clamp cwnd back down to ssthresh.
 */
static void
tcp_reno_fast_retransmit_newack(struct tcpcb *tp,
    const struct tcphdr *th)
{
	if (tp->t_partialacks < 0) {
		/*
		 * We were not in fast recovery.  Reset the duplicate ack
		 * counter.
		 */
		tp->t_dupacks = 0;
	} else {
		/*
		 * Clamp the congestion window to the crossover point and
		 * exit fast recovery.
		 */
		if (tp->snd_cwnd > tp->snd_ssthresh)
			tp->snd_cwnd = tp->snd_ssthresh;
		tp->t_partialacks = -1;
		tp->t_dupacks = 0;
		tp->t_bytes_acked = 0;
		/* Keep the forward-most SACK mark from lagging the ack. */
		if (TCP_SACK_ENABLED(tp) && SEQ_GT(th->th_ack, tp->snd_fack))
			tp->snd_fack = th->th_ack;
	}
}
592 | |
/*
 * Reno additive increase on a new ack: slow start (one segment per ack)
 * below ssthresh, congestion avoidance (one segment per window) above
 * it, with an optional byte-counted variant per RFC 3465 (ABC).
 */
static void
tcp_reno_newack(struct tcpcb *tp, const struct tcphdr *th)
{
	/*
	 * When new data is acked, open the congestion window.
	 */

	u_int cw = tp->snd_cwnd;
	u_int incr = tp->t_segsz;

	if (tcp_do_abc) {

		/*
		 * RFC 3465 Appropriate Byte Counting (ABC)
		 */

		int acked = th->th_ack - tp->snd_una;

		if (cw >= tp->snd_ssthresh) {
			/* Congestion avoidance: grow once per cwnd bytes acked. */
			tp->t_bytes_acked += acked;
			if (tp->t_bytes_acked >= cw) {
				/* Time to increase the window. */
				tp->t_bytes_acked -= cw;
			} else {
				/* No need to increase yet. */
				incr = 0;
			}
		} else {
			/*
			 * use 2*SMSS or 1*SMSS for the "L" param,
			 * depending on sysctl setting.
			 *
			 * (See RFC 3465 2.3 Choosing the Limit)
			 */
			u_int abc_lim;

			abc_lim = (tcp_abc_aggressive == 0 ||
			    tp->snd_nxt != tp->snd_max) ? incr : incr * 2;
			incr = min(acked, abc_lim);
		}
	} else {

		/*
		 * If the window gives us less than ssthresh packets
		 * in flight, open exponentially (segsz per packet).
		 * Otherwise open linearly: segsz per window
		 * (segsz^2 / cwnd per packet).
		 */

		if (cw >= tp->snd_ssthresh) {
			incr = incr * incr / cw;
		}
	}

	/* Never grow past the maximum window the peer can be offered. */
	tp->snd_cwnd = min(cw + incr, TCP_MAXWIN << tp->snd_scale);
}
649 | |
/* Reno hook vector. */
const struct tcp_congctl tcp_reno_ctl = {
	.fast_retransmit = tcp_reno_fast_retransmit,
	.slow_retransmit = tcp_reno_slow_retransmit,
	.fast_retransmit_newack = tcp_reno_fast_retransmit_newack,
	.newack = tcp_reno_newack,
	.cong_exp = tcp_reno_congestion_exp,
};
657 | |
658 | /* |
659 | * TCP/NewReno Congestion control. |
660 | */ |
/*
 * NewReno fast retransmit: like Reno, except an ack below snd_high (the
 * highest sequence sent before the last timeout) marks a false fast
 * retransmit; return 1 to tell the caller not to enter fast recovery.
 */
static int
tcp_newreno_fast_retransmit(struct tcpcb *tp, const struct tcphdr *th)
{

	if (SEQ_LT(th->th_ack, tp->snd_high)) {
		/*
		 * False fast retransmit after timeout.
		 * Do not enter fast recovery
		 */
		tp->t_dupacks = 0;
		return 1;
	}
	/*
	 * Fast retransmit is same as reno.
	 */
	return tcp_reno_fast_retransmit(tp, th);
}
678 | |
679 | /* |
680 | * Implement the NewReno response to a new ack, checking for partial acks in |
681 | * fast recovery. |
682 | */ |
683 | static void |
684 | tcp_newreno_fast_retransmit_newack(struct tcpcb *tp, const struct tcphdr *th) |
685 | { |
686 | if (tp->t_partialacks < 0) { |
687 | /* |
688 | * We were not in fast recovery. Reset the duplicate ack |
689 | * counter. |
690 | */ |
691 | tp->t_dupacks = 0; |
692 | } else if (SEQ_LT(th->th_ack, tp->snd_recover)) { |
693 | /* |
694 | * This is a partial ack. Retransmit the first unacknowledged |
695 | * segment and deflate the congestion window by the amount of |
696 | * acknowledged data. Do not exit fast recovery. |
697 | */ |
698 | tcp_seq onxt = tp->snd_nxt; |
699 | u_long ocwnd = tp->snd_cwnd; |
700 | int sack_num_segs = 1, sack_bytes_rxmt = 0; |
701 | |
702 | /* |
703 | * snd_una has not yet been updated and the socket's send |
704 | * buffer has not yet drained off the ACK'd data, so we |
705 | * have to leave snd_una as it was to get the correct data |
706 | * offset in tcp_output(). |
707 | */ |
708 | tp->t_partialacks++; |
709 | TCP_TIMER_DISARM(tp, TCPT_REXMT); |
710 | tp->t_rtttime = 0; |
711 | tp->snd_nxt = th->th_ack; |
712 | |
713 | if (TCP_SACK_ENABLED(tp)) { |
714 | /* |
715 | * Partial ack handling within a sack recovery episode. |
716 | * Keeping this very simple for now. When a partial ack |
717 | * is received, force snd_cwnd to a value that will |
718 | * allow the sender to transmit no more than 2 segments. |
719 | * If necessary, a fancier scheme can be adopted at a |
720 | * later point, but for now, the goal is to prevent the |
721 | * sender from bursting a large amount of data in the |
722 | * midst of sack recovery. |
723 | */ |
724 | |
725 | /* |
726 | * send one or 2 segments based on how much |
727 | * new data was acked |
728 | */ |
729 | if (((th->th_ack - tp->snd_una) / tp->t_segsz) > 2) |
730 | sack_num_segs = 2; |
731 | (void)tcp_sack_output(tp, &sack_bytes_rxmt); |
732 | tp->snd_cwnd = sack_bytes_rxmt + |
733 | (tp->snd_nxt - tp->sack_newdata) + |
734 | sack_num_segs * tp->t_segsz; |
735 | tp->t_flags |= TF_ACKNOW; |
736 | (void) tcp_output(tp); |
737 | } else { |
738 | /* |
739 | * Set snd_cwnd to one segment beyond ACK'd offset |
740 | * snd_una is not yet updated when we're called |
741 | */ |
742 | tp->snd_cwnd = tp->t_segsz + (th->th_ack - tp->snd_una); |
743 | (void) tcp_output(tp); |
744 | tp->snd_cwnd = ocwnd; |
745 | if (SEQ_GT(onxt, tp->snd_nxt)) |
746 | tp->snd_nxt = onxt; |
747 | /* |
748 | * Partial window deflation. Relies on fact that |
749 | * tp->snd_una not updated yet. |
750 | */ |
751 | tp->snd_cwnd -= (th->th_ack - tp->snd_una - |
752 | tp->t_segsz); |
753 | } |
754 | } else { |
755 | /* |
756 | * Complete ack. Inflate the congestion window to ssthresh |
757 | * and exit fast recovery. |
758 | * |
759 | * Window inflation should have left us with approx. |
760 | * snd_ssthresh outstanding data. But in case we |
761 | * would be inclined to send a burst, better to do |
762 | * it via the slow start mechanism. |
763 | */ |
764 | if (SEQ_SUB(tp->snd_max, th->th_ack) < tp->snd_ssthresh) |
765 | tp->snd_cwnd = SEQ_SUB(tp->snd_max, th->th_ack) |
766 | + tp->t_segsz; |
767 | else |
768 | tp->snd_cwnd = tp->snd_ssthresh; |
769 | tp->t_partialacks = -1; |
770 | tp->t_dupacks = 0; |
771 | tp->t_bytes_acked = 0; |
772 | if (TCP_SACK_ENABLED(tp) && SEQ_GT(th->th_ack, tp->snd_fack)) |
773 | tp->snd_fack = th->th_ack; |
774 | } |
775 | } |
776 | |
777 | static void |
778 | tcp_newreno_newack(struct tcpcb *tp, const struct tcphdr *th) |
779 | { |
780 | /* |
781 | * If we are still in fast recovery (meaning we are using |
782 | * NewReno and we have only received partial acks), do not |
783 | * inflate the window yet. |
784 | */ |
785 | if (tp->t_partialacks < 0) |
786 | tcp_reno_newack(tp, th); |
787 | } |
788 | |
789 | |
/* NewReno hook vector: Reno behavior plus partial-ack handling. */
const struct tcp_congctl tcp_newreno_ctl = {
	.fast_retransmit = tcp_newreno_fast_retransmit,
	.slow_retransmit = tcp_reno_slow_retransmit,
	.fast_retransmit_newack = tcp_newreno_fast_retransmit_newack,
	.newack = tcp_newreno_newack,
	.cong_exp = tcp_reno_congestion_exp,
};
797 | |
798 | /* |
799 | * CUBIC - http://tools.ietf.org/html/draft-rhee-tcpm-cubic-02 |
800 | */ |
801 | |
/* Cubic prototypes */
static void tcp_cubic_update_ctime(struct tcpcb *tp); /* stamp congestion epoch */
static uint32_t tcp_cubic_diff_ctime(struct tcpcb *); /* ms since that epoch */
static uint32_t tcp_cubic_cbrt(uint32_t); /* approximate integer cube root */
static ulong tcp_cubic_getW(struct tcpcb *, uint32_t, uint32_t); /* W(t+rtt) */
807 | |
808 | /* Cubic TIME functions - XXX I don't like using timevals and microuptime */ |
809 | /* |
810 | * Set congestion timer to now |
811 | */ |
812 | static void |
813 | tcp_cubic_update_ctime(struct tcpcb *tp) |
814 | { |
815 | struct timeval now_timeval; |
816 | |
817 | getmicrouptime(&now_timeval); |
818 | tp->snd_cubic_ctime = now_timeval.tv_sec * 1000 + |
819 | now_timeval.tv_usec / 1000; |
820 | } |
821 | |
822 | /* |
823 | * miliseconds from last congestion |
824 | */ |
825 | static uint32_t |
826 | tcp_cubic_diff_ctime(struct tcpcb *tp) |
827 | { |
828 | struct timeval now_timeval; |
829 | |
830 | getmicrouptime(&now_timeval); |
831 | return now_timeval.tv_sec * 1000 + now_timeval.tv_usec / 1000 - |
832 | tp->snd_cubic_ctime; |
833 | } |
834 | |
835 | /* |
836 | * Approximate cubic root |
837 | */ |
838 | #define CBRT_ROUNDS 30 |
839 | static uint32_t |
840 | tcp_cubic_cbrt(uint32_t v) |
841 | { |
842 | int i, rounds = CBRT_ROUNDS; |
843 | uint64_t x = v / 3; |
844 | |
845 | /* We fail to calculate correct for small numbers */ |
846 | if (v == 0) |
847 | return 0; |
848 | else if (v < 4) |
849 | return 1; |
850 | |
851 | /* |
852 | * largest x that 2*x^3+3*x fits 64bit |
853 | * Avoid overflow for a time cost |
854 | */ |
855 | if (x > 2097151) |
856 | rounds += 10; |
857 | |
858 | for (i = 0; i < rounds; i++) |
859 | if (rounds == CBRT_ROUNDS) |
860 | x = (v + 2 * x * x * x) / (3 * x * x); |
861 | else |
862 | /* Avoid overflow */ |
863 | x = v / (3 * x * x) + 2 * x / 3; |
864 | |
865 | return (uint32_t)x; |
866 | } |
867 | |
/*
 * Draft Rhee Section 3.1 - get W(t+rtt) - Eq. 1
 *
 * Evaluate the cubic window curve one RTT ahead of the time elapsed
 * since the last congestion event (both arguments in milliseconds).
 */
static ulong
tcp_cubic_getW(struct tcpcb *tp, uint32_t ms_elapsed, uint32_t rtt)
{
	uint32_t K;
	long tK3;

	/* Section 3.1 Eq. 2 */
	K = tcp_cubic_cbrt(tp->snd_cubic_wmax / CUBIC_BETAB *
	    CUBIC_CB / CUBIC_CA);
	/* (t-K)^3 - not clear why is the measure unit mattering */
	tK3 = (long)(ms_elapsed + rtt) - (long)K;
	tK3 = tK3 * tK3 * tK3;

	/* W(t) = C * (t - K)^3 + W_max */
	return CUBIC_CA * tK3 / CUBIC_CB + tp->snd_cubic_wmax;
}
884 | |
/*
 * CUBIC congestion event: timestamp the event, update W_max (applying
 * the fast-convergence reduction when the window is still shrinking),
 * and then reduce cwnd/ssthresh with beta = 0.8.
 */
static void
tcp_cubic_congestion_exp(struct tcpcb *tp)
{

	/*
	 * Congestion - Set WMax and shrink cwnd
	 */
	tcp_cubic_update_ctime(tp);

	/* Section 3.6 - Fast Convergence */
	if (tp->snd_cubic_wmax < tp->snd_cubic_wmax_last) {
		/* Window is still shrinking: release bandwidth faster. */
		tp->snd_cubic_wmax_last = tp->snd_cubic_wmax;
		tp->snd_cubic_wmax = tp->snd_cubic_wmax / 2 +
		    tp->snd_cubic_wmax * CUBIC_BETAA / CUBIC_BETAB / 2;
	} else {
		tp->snd_cubic_wmax_last = tp->snd_cubic_wmax;
		tp->snd_cubic_wmax = tp->snd_cwnd;
	}

	/* Never let W_max fall below one segment. */
	tp->snd_cubic_wmax = max(tp->t_segsz, tp->snd_cubic_wmax);

	/* Shrink CWND */
	tcp_common_congestion_exp(tp, CUBIC_BETAA, CUBIC_BETAB);
}
909 | |
/*
 * CUBIC fast retransmit: like NewReno, reject false fast retransmits
 * after a timeout (ack below snd_high); otherwise record the congestion
 * event and reuse the common Reno retransmit machinery.
 */
static int
tcp_cubic_fast_retransmit(struct tcpcb *tp, const struct tcphdr *th)
{

	if (SEQ_LT(th->th_ack, tp->snd_high)) {
		/* See newreno */
		tp->t_dupacks = 0;
		return 1;
	}

	/*
	 * mark WMax
	 */
	tcp_cubic_congestion_exp(tp);

	/* Do fast retransmit */
	return tcp_reno_do_fast_retransmit(tp, th);
}
928 | |
/*
 * CUBIC new-ack processing.  In steady-state congestion avoidance
 * (cwnd above ssthresh, not in fast recovery, and a usable smoothed
 * RTT), grow cwnd along the cubic curve but never slower than the
 * estimated "TCP friendly" window; in every other case fall back to
 * NewReno behavior.
 */
static void
tcp_cubic_newack(struct tcpcb *tp, const struct tcphdr *th)
{
	uint32_t ms_elapsed, rtt;
	u_long w_tcp;

	/* Congestion avoidance and not in fast recovery and usable rtt */
	if (tp->snd_cwnd > tp->snd_ssthresh && tp->t_partialacks < 0 &&
	    /*
	     * t_srtt is 1/32 units of slow ticks
	     * converting it in ms would be equal to
	     * (t_srtt >> 5) * 1000 / PR_SLOWHZ ~= (t_srtt << 5) / PR_SLOWHZ
	     */
	    (rtt = (tp->t_srtt << 5) / PR_SLOWHZ) > 0) {
		ms_elapsed = tcp_cubic_diff_ctime(tp);

		/*
		 * Compute W_tcp(t).
		 * NOTE(review): unlike the draft's Section 4.2 formula
		 * this is not scaled by the segment size -- confirm the
		 * intended units against the other cwnd arithmetic.
		 */
		w_tcp = tp->snd_cubic_wmax * CUBIC_BETAA / CUBIC_BETAB +
		    ms_elapsed / rtt / 3;

		if (tp->snd_cwnd > w_tcp) {
			/* Not in TCP friendly mode */
			tp->snd_cwnd += (tcp_cubic_getW(tp, ms_elapsed, rtt) -
			    tp->snd_cwnd) / tp->snd_cwnd;
		} else {
			/* friendly TCP mode */
			tp->snd_cwnd = w_tcp;
		}

		/* Make sure we are within limits */
		tp->snd_cwnd = max(tp->snd_cwnd, tp->t_segsz);
		tp->snd_cwnd = min(tp->snd_cwnd, TCP_MAXWIN << tp->snd_scale);
	} else {
		/* Use New Reno */
		tcp_newreno_newack(tp, th);
	}
}
966 | |
/*
 * CUBIC response to a retransmit timeout: record the congestion event
 * (updating W_max) and collapse to a one-segment loss window, leaving
 * any fast-recovery episode.
 */
static void
tcp_cubic_slow_retransmit(struct tcpcb *tp)
{

	/* Timeout - Mark new congestion */
	tcp_cubic_congestion_exp(tp);

	/* Loss Window MUST be one segment. */
	tp->snd_cwnd = tp->t_segsz;
	/* Leave fast recovery and reset the ack bookkeeping. */
	tp->t_partialacks = -1;
	tp->t_dupacks = 0;
	tp->t_bytes_acked = 0;

	/* With ECN, tell the peer we reduced the window. */
	if (TCP_ECN_ALLOWED(tp))
		tp->t_flags |= TF_ECN_SND_CWR;
}
983 | |
/* CUBIC hook vector; partial-ack handling is shared with NewReno. */
const struct tcp_congctl tcp_cubic_ctl = {
	.fast_retransmit = tcp_cubic_fast_retransmit,
	.slow_retransmit = tcp_cubic_slow_retransmit,
	.fast_retransmit_newack = tcp_newreno_fast_retransmit_newack,
	.newack = tcp_cubic_newack,
	.cong_exp = tcp_cubic_congestion_exp,
};
991 | |