/*	$NetBSD: tcp_congctl.c,v 1.21 2016/04/26 08:44:44 ozaki-r Exp $	*/

/*-
 * Copyright (c) 1997, 1998, 1999, 2001, 2005, 2006 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Jason R. Thorpe and Kevin M. Lahey of the Numerical Aerospace Simulation
 * Facility, NASA Ames Research Center.
 * This code is derived from software contributed to The NetBSD Foundation
 * by Charles M. Hannum.
 * This code is derived from software contributed to The NetBSD Foundation
 * by Rui Paulo.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the project nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * @(#)COPYRIGHT	1.1 (NRL) 17 January 1995
 *
 * NRL grants permission for redistribution and use in source and binary
 * forms, with or without modification, of the software and documentation
 * created at NRL provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgements:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 *	This product includes software developed at the Information
 *	Technology Division, US Naval Research Laboratory.
 * 4. Neither the name of the NRL nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS
 * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
 * PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL NRL OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * The views and conclusions contained in the software and documentation
 * are those of the authors and should not be interpreted as representing
 * official policies, either expressed or implied, of the US Naval
 * Research Laboratory (NRL).
 */

/*
 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)tcp_input.c	8.12 (Berkeley) 5/24/95
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: tcp_congctl.c,v 1.21 2016/04/26 08:44:44 ozaki-r Exp $");

#ifdef _KERNEL_OPT
#include "opt_inet.h"
#include "opt_tcp_debug.h"
#include "opt_tcp_congctl.h"
#endif

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/errno.h>
#include <sys/syslog.h>
#include <sys/pool.h>
#include <sys/domain.h>
#include <sys/kernel.h>
#include <sys/mutex.h>

#include <net/if.h>

#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <netinet/in_pcb.h>
#include <netinet/in_var.h>
#include <netinet/ip_var.h>

#ifdef INET6
#ifndef INET
#include <netinet/in.h>
#endif
#include <netinet/ip6.h>
#include <netinet6/ip6_var.h>
#include <netinet6/in6_pcb.h>
#include <netinet6/in6_var.h>
#include <netinet/icmp6.h>
#include <netinet6/nd6.h>
#endif

#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcpip.h>
#include <netinet/tcp_congctl.h>
#ifdef TCP_DEBUG
#include <netinet/tcp_debug.h>
#endif

/*
 * TODO:
 * Consider separating the actual implementations into a separate file.
 */

static void tcp_common_congestion_exp(struct tcpcb *, int, int);

static int tcp_reno_do_fast_retransmit(struct tcpcb *, const struct tcphdr *);
static int tcp_reno_fast_retransmit(struct tcpcb *, const struct tcphdr *);
static void tcp_reno_slow_retransmit(struct tcpcb *);
static void tcp_reno_fast_retransmit_newack(struct tcpcb *,
    const struct tcphdr *);
static void tcp_reno_newack(struct tcpcb *, const struct tcphdr *);
static void tcp_reno_congestion_exp(struct tcpcb *tp);

static int tcp_newreno_fast_retransmit(struct tcpcb *, const struct tcphdr *);
static void tcp_newreno_fast_retransmit_newack(struct tcpcb *,
    const struct tcphdr *);
static void tcp_newreno_newack(struct tcpcb *, const struct tcphdr *);

static int tcp_cubic_fast_retransmit(struct tcpcb *, const struct tcphdr *);
static void tcp_cubic_slow_retransmit(struct tcpcb *tp);
static void tcp_cubic_newack(struct tcpcb *, const struct tcphdr *);
static void tcp_cubic_congestion_exp(struct tcpcb *);

static void tcp_congctl_fillnames(void);

extern int tcprexmtthresh;

MALLOC_DEFINE(M_TCPCONGCTL, "tcpcongctl", "TCP congestion control structures");

/* currently selected global congestion control */
char tcp_congctl_global_name[TCPCC_MAXLEN];

/* available global congestion control algorithms */
char tcp_congctl_avail[10 * TCPCC_MAXLEN];

/*
 * Used to list the available congestion control algorithms.
 */
TAILQ_HEAD(, tcp_congctlent) tcp_congctlhd =
    TAILQ_HEAD_INITIALIZER(tcp_congctlhd);

static struct tcp_congctlent * tcp_congctl_global;

static kmutex_t tcp_congctl_mtx;

void
tcp_congctl_init(void)
{
	int r __diagused;

	mutex_init(&tcp_congctl_mtx, MUTEX_DEFAULT, IPL_NONE);

	/* Base algorithms. */
	r = tcp_congctl_register("reno", &tcp_reno_ctl);
	KASSERT(r == 0);
	r = tcp_congctl_register("newreno", &tcp_newreno_ctl);
	KASSERT(r == 0);
	r = tcp_congctl_register("cubic", &tcp_cubic_ctl);
	KASSERT(r == 0);

	/* NewReno is the default. */
#ifndef TCP_CONGCTL_DEFAULT
#define TCP_CONGCTL_DEFAULT "newreno"
#endif

	r = tcp_congctl_select(NULL, TCP_CONGCTL_DEFAULT);
	KASSERT(r == 0);
}
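
/*
 * Note (informative): the global selection made here is what userland
 * normally drives through sysctl, e.g. something like
 *
 *	sysctl -w net.inet.tcp.congctl.selected=cubic
 *	sysctl net.inet.tcp.congctl.available
 *
 * The sysctl handlers live outside this file; they ultimately call
 * tcp_congctl_select() and read tcp_congctl_avail.
 */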

/*
 * Register a congestion algorithm and select it if we have none.
 */
int
tcp_congctl_register(const char *name, const struct tcp_congctl *tcc)
{
	struct tcp_congctlent *ntcc, *tccp;

	TAILQ_FOREACH(tccp, &tcp_congctlhd, congctl_ent)
		if (!strcmp(name, tccp->congctl_name)) {
			/* name already registered */
			return EEXIST;
		}

	ntcc = malloc(sizeof(*ntcc), M_TCPCONGCTL, M_WAITOK|M_ZERO);

	strlcpy(ntcc->congctl_name, name, sizeof(ntcc->congctl_name) - 1);
	ntcc->congctl_ctl = tcc;

	TAILQ_INSERT_TAIL(&tcp_congctlhd, ntcc, congctl_ent);
	tcp_congctl_fillnames();

	if (TAILQ_FIRST(&tcp_congctlhd) == ntcc)
		tcp_congctl_select(NULL, name);

	return 0;
}
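
/*
 * Usage sketch (hypothetical): a third-party algorithm would plug in
 * roughly as follows; the "vegas" name and tcp_vegas_ctl structure are
 * illustrative only and do not exist in this file.
 *
 *	static const struct tcp_congctl tcp_vegas_ctl = {
 *		.fast_retransmit        = ...,
 *		.slow_retransmit        = ...,
 *		.fast_retransmit_newack = ...,
 *		.newack                 = ...,
 *		.cong_exp               = ...,
 *	};
 *
 *	error = tcp_congctl_register("vegas", &tcp_vegas_ctl);
 *
 * A later tcp_congctl_unregister("vegas") fails with EBUSY while the
 * algorithm is the global default or any connection still references
 * it, and with ENOENT if the name was never registered.
 */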

int
tcp_congctl_unregister(const char *name)
{
	struct tcp_congctlent *tccp, *rtccp;
	unsigned int size;

	rtccp = NULL;
	size = 0;
	TAILQ_FOREACH(tccp, &tcp_congctlhd, congctl_ent) {
		if (!strcmp(name, tccp->congctl_name))
			rtccp = tccp;
		size++;
	}

	if (!rtccp)
		return ENOENT;

	if (size <= 1 || tcp_congctl_global == rtccp || rtccp->congctl_refcnt)
		return EBUSY;

	TAILQ_REMOVE(&tcp_congctlhd, rtccp, congctl_ent);
	free(rtccp, M_TCPCONGCTL);
	tcp_congctl_fillnames();

	return 0;
}

/*
 * Select a congestion algorithm by name.
 */
int
tcp_congctl_select(struct tcpcb *tp, const char *name)
{
	struct tcp_congctlent *tccp, *old_tccp, *new_tccp;
	bool old_found, new_found;

	KASSERT(name);

	old_found = (tp == NULL || tp->t_congctl == NULL);
	old_tccp = NULL;
	new_found = false;
	new_tccp = NULL;

	TAILQ_FOREACH(tccp, &tcp_congctlhd, congctl_ent) {
		if (!old_found && tccp->congctl_ctl == tp->t_congctl) {
			old_tccp = tccp;
			old_found = true;
		}

		if (!new_found && !strcmp(name, tccp->congctl_name)) {
			new_tccp = tccp;
			new_found = true;
		}

		if (new_found && old_found) {
			if (tp) {
				mutex_enter(&tcp_congctl_mtx);
				if (old_tccp)
					old_tccp->congctl_refcnt--;
				tp->t_congctl = new_tccp->congctl_ctl;
				new_tccp->congctl_refcnt++;
				mutex_exit(&tcp_congctl_mtx);
			} else {
				tcp_congctl_global = new_tccp;
				strlcpy(tcp_congctl_global_name,
				    new_tccp->congctl_name,
				    sizeof(tcp_congctl_global_name) - 1);
			}
			return 0;
		}
	}

	return EINVAL;
}

void
tcp_congctl_release(struct tcpcb *tp)
{
	struct tcp_congctlent *tccp;

	KASSERT(tp->t_congctl);

	TAILQ_FOREACH(tccp, &tcp_congctlhd, congctl_ent) {
		if (tccp->congctl_ctl == tp->t_congctl) {
			tccp->congctl_refcnt--;
			return;
		}
	}
}

/*
 * Returns the name of a congestion algorithm.
 */
const char *
tcp_congctl_bystruct(const struct tcp_congctl *tcc)
{
	struct tcp_congctlent *tccp;

	KASSERT(tcc);

	TAILQ_FOREACH(tccp, &tcp_congctlhd, congctl_ent)
		if (tccp->congctl_ctl == tcc)
			return tccp->congctl_name;

	return NULL;
}

static void
tcp_congctl_fillnames(void)
{
	struct tcp_congctlent *tccp;
	const char *delim = " ";

	tcp_congctl_avail[0] = '\0';
	TAILQ_FOREACH(tccp, &tcp_congctlhd, congctl_ent) {
		strlcat(tcp_congctl_avail, tccp->congctl_name,
		    sizeof(tcp_congctl_avail) - 1);
		if (TAILQ_NEXT(tccp, congctl_ent))
			strlcat(tcp_congctl_avail, delim,
			    sizeof(tcp_congctl_avail) - 1);
	}

}

/* ------------------------------------------------------------------------ */

/*
 * Common stuff
 */

/* Window reduction (1-beta) for [New]Reno: 0.5 */
#define RENO_BETAA 1
#define RENO_BETAB 2
/* Window reduction (1-beta) for Cubic: 0.8 */
#define CUBIC_BETAA 4
#define CUBIC_BETAB 5
/* Draft Rhee Section 4.1 */
#define CUBIC_CA 4
#define CUBIC_CB 10
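
/*
 * The pairs above are fixed-point fractions, so the window arithmetic
 * can stay in integers: a reduction factor (1-beta) is expressed as
 * BETAA/BETAB, i.e. [New]Reno keeps 1/2 of the window and Cubic keeps
 * 4/5, and the Cubic scaling constant C = 0.4 from the draft is
 * CUBIC_CA/CUBIC_CB = 4/10.
 */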

static void
tcp_common_congestion_exp(struct tcpcb *tp, int betaa, int betab)
{
	u_int win;

	/*
	 * Reduce the congestion window and the slow start threshold.
	 */
	win = min(tp->snd_wnd, tp->snd_cwnd) * betaa / betab / tp->t_segsz;
	if (win < 2)
		win = 2;

	tp->snd_ssthresh = win * tp->t_segsz;
	tp->snd_recover = tp->snd_max;
	tp->snd_cwnd = tp->snd_ssthresh;

	/*
	 * When using TCP ECN, notify the peer that
	 * we reduced the cwnd.
	 */
	if (TCP_ECN_ALLOWED(tp))
		tp->t_flags |= TF_ECN_SND_CWR;
}
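
/*
 * Worked example (illustrative): with t_segsz = 1000 bytes and
 * min(snd_wnd, snd_cwnd) = 20000 bytes, Reno (betaa/betab = 1/2)
 * leaves win = 10 segments, so ssthresh = cwnd = 10000 bytes, while
 * Cubic (4/5) leaves 16 segments.  The floor of 2 segments is the
 * minimum window that still allows exponential growth out of slow
 * start (see the comment in tcp_reno_slow_retransmit below).
 */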


/* ------------------------------------------------------------------------ */

/*
 * TCP/Reno congestion control.
 */
static void
tcp_reno_congestion_exp(struct tcpcb *tp)
{

	tcp_common_congestion_exp(tp, RENO_BETAA, RENO_BETAB);
}

static int
tcp_reno_do_fast_retransmit(struct tcpcb *tp, const struct tcphdr *th)
{
	/*
	 * Dup acks mean that packets have left the
	 * network (they're now cached at the receiver)
	 * so bump cwnd by the amount in the receiver
	 * to keep a constant cwnd packets in the
	 * network.
	 *
	 * If we are using TCP/SACK, then enter
	 * Fast Recovery if the receiver SACKs
	 * data that is tcprexmtthresh * MSS
	 * bytes past the last ACKed segment,
	 * irrespective of the number of DupAcks.
	 */

	tcp_seq onxt = tp->snd_nxt;

	tp->t_partialacks = 0;
	TCP_TIMER_DISARM(tp, TCPT_REXMT);
	tp->t_rtttime = 0;
	if (TCP_SACK_ENABLED(tp)) {
		tp->t_dupacks = tcprexmtthresh;
		tp->sack_newdata = tp->snd_nxt;
		tp->snd_cwnd = tp->t_segsz;
		(void) tcp_output(tp);
		return 0;
	}
	tp->snd_nxt = th->th_ack;
	tp->snd_cwnd = tp->t_segsz;
	(void) tcp_output(tp);
	tp->snd_cwnd = tp->snd_ssthresh + tp->t_segsz * tp->t_dupacks;
	if (SEQ_GT(onxt, tp->snd_nxt))
		tp->snd_nxt = onxt;

	return 0;
}

static int
tcp_reno_fast_retransmit(struct tcpcb *tp, const struct tcphdr *th)
{

	/*
	 * We know we're losing at the current
	 * window size so do congestion avoidance
	 * (set ssthresh to half the current window
	 * and pull our congestion window back to
	 * the new ssthresh).
	 */

	tcp_reno_congestion_exp(tp);
	return tcp_reno_do_fast_retransmit(tp, th);
}

static void
tcp_reno_slow_retransmit(struct tcpcb *tp)
{
	u_int win;

	/*
	 * Close the congestion window down to one segment
	 * (we'll open it by one segment for each ack we get).
	 * Since we probably have a window's worth of unacked
	 * data accumulated, this "slow start" keeps us from
	 * dumping all that data as back-to-back packets (which
	 * might overwhelm an intermediate gateway).
	 *
	 * There are two phases to the opening: Initially we
	 * open by one mss on each ack.  This makes the window
	 * size increase exponentially with time.  If the
	 * window is larger than the path can handle, this
	 * exponential growth results in dropped packet(s)
	 * almost immediately.  To get more time between
	 * drops but still "push" the network to take advantage
	 * of improving conditions, we switch from exponential
	 * to linear window opening at some threshold size.
	 * For a threshold, we use half the current window
	 * size, truncated to a multiple of the mss.
	 *
	 * (the minimum cwnd that will give us exponential
	 * growth is 2 mss.  We don't allow the threshold
	 * to go below this.)
	 */

	win = min(tp->snd_wnd, tp->snd_cwnd) / 2 / tp->t_segsz;
	if (win < 2)
		win = 2;
	/* Loss Window MUST be one segment. */
	tp->snd_cwnd = tp->t_segsz;
	tp->snd_ssthresh = win * tp->t_segsz;
	tp->t_partialacks = -1;
	tp->t_dupacks = 0;
	tp->t_bytes_acked = 0;

	if (TCP_ECN_ALLOWED(tp))
		tp->t_flags |= TF_ECN_SND_CWR;
}

static void
tcp_reno_fast_retransmit_newack(struct tcpcb *tp,
    const struct tcphdr *th)
{
	if (tp->t_partialacks < 0) {
		/*
		 * We were not in fast recovery.  Reset the duplicate ack
		 * counter.
		 */
		tp->t_dupacks = 0;
	} else {
		/*
		 * Clamp the congestion window to the crossover point and
		 * exit fast recovery.
		 */
		if (tp->snd_cwnd > tp->snd_ssthresh)
			tp->snd_cwnd = tp->snd_ssthresh;
		tp->t_partialacks = -1;
		tp->t_dupacks = 0;
		tp->t_bytes_acked = 0;
		if (TCP_SACK_ENABLED(tp) && SEQ_GT(th->th_ack, tp->snd_fack))
			tp->snd_fack = th->th_ack;
	}
}

static void
tcp_reno_newack(struct tcpcb *tp, const struct tcphdr *th)
{
	/*
	 * When new data is acked, open the congestion window.
	 */

	u_int cw = tp->snd_cwnd;
	u_int incr = tp->t_segsz;

	if (tcp_do_abc) {

		/*
		 * RFC 3465 Appropriate Byte Counting (ABC)
		 */

		int acked = th->th_ack - tp->snd_una;

		if (cw >= tp->snd_ssthresh) {
			tp->t_bytes_acked += acked;
			if (tp->t_bytes_acked >= cw) {
				/* Time to increase the window. */
				tp->t_bytes_acked -= cw;
			} else {
				/* No need to increase yet. */
				incr = 0;
			}
		} else {
			/*
			 * use 2*SMSS or 1*SMSS for the "L" param,
			 * depending on sysctl setting.
			 *
			 * (See RFC 3465 2.3 Choosing the Limit)
			 */
			u_int abc_lim;

			abc_lim = (tcp_abc_aggressive == 0 ||
			    tp->snd_nxt != tp->snd_max) ? incr : incr * 2;
			incr = min(acked, abc_lim);
		}
	} else {

		/*
		 * If the window gives us less than ssthresh packets
		 * in flight, open exponentially (segsz per packet).
		 * Otherwise open linearly: segsz per window
		 * (segsz^2 / cwnd per packet).
		 */

		if (cw >= tp->snd_ssthresh) {
			incr = incr * incr / cw;
		}
	}

	tp->snd_cwnd = min(cw + incr, TCP_MAXWIN << tp->snd_scale);
}
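
/*
 * Worked example (illustrative, with tcp_do_abc = 0): for
 * t_segsz = 1000 and snd_ssthresh = 8000, a 4000-byte cwnd grows by a
 * full segment per ACK (slow start), while a 10000-byte cwnd grows by
 * 1000 * 1000 / 10000 = 100 bytes per ACK, i.e. roughly one segment
 * per window (congestion avoidance).  With ABC enabled, the linear
 * phase instead banks acked bytes in t_bytes_acked and opens the
 * window by one segment only once a full cwnd's worth has been acked.
 */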

const struct tcp_congctl tcp_reno_ctl = {
	.fast_retransmit = tcp_reno_fast_retransmit,
	.slow_retransmit = tcp_reno_slow_retransmit,
	.fast_retransmit_newack = tcp_reno_fast_retransmit_newack,
	.newack = tcp_reno_newack,
	.cong_exp = tcp_reno_congestion_exp,
};

/*
 * TCP/NewReno Congestion control.
 */
static int
tcp_newreno_fast_retransmit(struct tcpcb *tp, const struct tcphdr *th)
{

	if (SEQ_LT(th->th_ack, tp->snd_high)) {
		/*
		 * False fast retransmit after timeout.
		 * Do not enter fast recovery.
		 */
		tp->t_dupacks = 0;
		return 1;
	}
	/*
	 * Fast retransmit is same as reno.
	 */
	return tcp_reno_fast_retransmit(tp, th);
}

/*
 * Implement the NewReno response to a new ack, checking for partial acks in
 * fast recovery.
 */
static void
tcp_newreno_fast_retransmit_newack(struct tcpcb *tp, const struct tcphdr *th)
{
	if (tp->t_partialacks < 0) {
		/*
		 * We were not in fast recovery.  Reset the duplicate ack
		 * counter.
		 */
		tp->t_dupacks = 0;
	} else if (SEQ_LT(th->th_ack, tp->snd_recover)) {
		/*
		 * This is a partial ack.  Retransmit the first unacknowledged
		 * segment and deflate the congestion window by the amount of
		 * acknowledged data.  Do not exit fast recovery.
		 */
		tcp_seq onxt = tp->snd_nxt;
		u_long ocwnd = tp->snd_cwnd;
		int sack_num_segs = 1, sack_bytes_rxmt = 0;

		/*
		 * snd_una has not yet been updated and the socket's send
		 * buffer has not yet drained off the ACK'd data, so we
		 * have to leave snd_una as it was to get the correct data
		 * offset in tcp_output().
		 */
		tp->t_partialacks++;
		TCP_TIMER_DISARM(tp, TCPT_REXMT);
		tp->t_rtttime = 0;
		tp->snd_nxt = th->th_ack;

		if (TCP_SACK_ENABLED(tp)) {
			/*
			 * Partial ack handling within a sack recovery episode.
			 * Keeping this very simple for now.  When a partial ack
			 * is received, force snd_cwnd to a value that will
			 * allow the sender to transmit no more than 2 segments.
			 * If necessary, a fancier scheme can be adopted at a
			 * later point, but for now, the goal is to prevent the
			 * sender from bursting a large amount of data in the
			 * midst of sack recovery.
			 */

			/*
			 * Send one or two segments based on how much
			 * new data was acked.
			 */
			if (((th->th_ack - tp->snd_una) / tp->t_segsz) > 2)
				sack_num_segs = 2;
			(void)tcp_sack_output(tp, &sack_bytes_rxmt);
			tp->snd_cwnd = sack_bytes_rxmt +
			    (tp->snd_nxt - tp->sack_newdata) +
			    sack_num_segs * tp->t_segsz;
			tp->t_flags |= TF_ACKNOW;
			(void) tcp_output(tp);
		} else {
			/*
			 * Set snd_cwnd to one segment beyond the ACK'd offset;
			 * snd_una is not yet updated when we're called.
			 */
			tp->snd_cwnd = tp->t_segsz + (th->th_ack - tp->snd_una);
			(void) tcp_output(tp);
			tp->snd_cwnd = ocwnd;
			if (SEQ_GT(onxt, tp->snd_nxt))
				tp->snd_nxt = onxt;
			/*
			 * Partial window deflation.  Relies on fact that
			 * tp->snd_una not updated yet.
			 */
			tp->snd_cwnd -= (th->th_ack - tp->snd_una -
			    tp->t_segsz);
		}
	} else {
		/*
		 * Complete ack.  Inflate the congestion window to ssthresh
		 * and exit fast recovery.
		 *
		 * Window inflation should have left us with approx.
		 * snd_ssthresh outstanding data.  But in case we
		 * would be inclined to send a burst, better to do
		 * it via the slow start mechanism.
		 */
		if (SEQ_SUB(tp->snd_max, th->th_ack) < tp->snd_ssthresh)
			tp->snd_cwnd = SEQ_SUB(tp->snd_max, th->th_ack)
			    + tp->t_segsz;
		else
			tp->snd_cwnd = tp->snd_ssthresh;
		tp->t_partialacks = -1;
		tp->t_dupacks = 0;
		tp->t_bytes_acked = 0;
		if (TCP_SACK_ENABLED(tp) && SEQ_GT(th->th_ack, tp->snd_fack))
			tp->snd_fack = th->th_ack;
	}
}
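
/*
 * Worked example of the non-SACK partial ack path (illustrative):
 * with t_segsz = 1000, snd_una = 1000 and a partial ACK for 3000,
 * cwnd is forced to (3000 - 1000) + 1000 = 3000 just long enough for
 * tcp_output() to retransmit the first unacked segment, then restored
 * to ocwnd and deflated by 3000 - 1000 - 1000 = 1000 bytes, the newly
 * acked data beyond one segment, in the style of the NewReno window
 * deflation of RFC 2582/3782.
 */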

static void
tcp_newreno_newack(struct tcpcb *tp, const struct tcphdr *th)
{
	/*
	 * If we are still in fast recovery (meaning we are using
	 * NewReno and we have only received partial acks), do not
	 * inflate the window yet.
	 */
	if (tp->t_partialacks < 0)
		tcp_reno_newack(tp, th);
}


const struct tcp_congctl tcp_newreno_ctl = {
	.fast_retransmit = tcp_newreno_fast_retransmit,
	.slow_retransmit = tcp_reno_slow_retransmit,
	.fast_retransmit_newack = tcp_newreno_fast_retransmit_newack,
	.newack = tcp_newreno_newack,
	.cong_exp = tcp_reno_congestion_exp,
};

/*
 * CUBIC - http://tools.ietf.org/html/draft-rhee-tcpm-cubic-02
 */

/* Cubic prototypes */
static void tcp_cubic_update_ctime(struct tcpcb *tp);
static uint32_t tcp_cubic_diff_ctime(struct tcpcb *);
static uint32_t tcp_cubic_cbrt(uint32_t);
static ulong tcp_cubic_getW(struct tcpcb *, uint32_t, uint32_t);

/* Cubic TIME functions - XXX I don't like using timevals and microuptime */
/*
 * Set congestion timer to now
 */
static void
tcp_cubic_update_ctime(struct tcpcb *tp)
{
	struct timeval now_timeval;

	getmicrouptime(&now_timeval);
	tp->snd_cubic_ctime = now_timeval.tv_sec * 1000 +
	    now_timeval.tv_usec / 1000;
}

/*
 * Milliseconds since the last congestion event.
 */
static uint32_t
tcp_cubic_diff_ctime(struct tcpcb *tp)
{
	struct timeval now_timeval;

	getmicrouptime(&now_timeval);
	return now_timeval.tv_sec * 1000 + now_timeval.tv_usec / 1000 -
	    tp->snd_cubic_ctime;
}

/*
 * Approximate cubic root
 */
#define CBRT_ROUNDS 30
static uint32_t
tcp_cubic_cbrt(uint32_t v)
{
	int i, rounds = CBRT_ROUNDS;
	uint64_t x = v / 3;

	/* The iteration fails to produce correct results for small numbers */
	if (v == 0)
		return 0;
	else if (v < 4)
		return 1;

	/*
	 * Largest x for which 2*x^3+3*x fits in 64 bits.
	 * Above this, avoid overflow at the cost of extra rounds.
	 */
	if (x > 2097151)
		rounds += 10;

	for (i = 0; i < rounds; i++)
		if (rounds == CBRT_ROUNDS)
			x = (v + 2 * x * x * x) / (3 * x * x);
		else
			/* Avoid overflow */
			x = v / (3 * x * x) + 2 * x / 3;

	return (uint32_t)x;
}
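
/*
 * The loop above is Newton's method for f(x) = x^3 - v, whose update
 * x' = x - f(x)/f'(x) rearranges to (v + 2*x^3) / (3*x^2); the second
 * branch is the same step split as v/(3*x^2) + 2*x/3 so that x^3 is
 * never formed.  Worked example (illustrative): v = 27 with the seed
 * x = v/3 = 9 steps through 6, 4, 3 under this integer arithmetic and
 * then stays at the exact cube root.
 */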

/* Draft Rhee Section 3.1 - get W(t+rtt) - Eq. 1 */
static ulong
tcp_cubic_getW(struct tcpcb *tp, uint32_t ms_elapsed, uint32_t rtt)
{
	uint32_t K;
	long tK3;

	/* Section 3.1 Eq. 2 */
	K = tcp_cubic_cbrt(tp->snd_cubic_wmax / CUBIC_BETAB *
	    CUBIC_CB / CUBIC_CA);
	/* (t-K)^3 - it is not clear why the unit of measure matters */
	tK3 = (long)(ms_elapsed + rtt) - (long)K;
	tK3 = tK3 * tK3 * tK3;

	return CUBIC_CA * tK3 / CUBIC_CB + tp->snd_cubic_wmax;
}
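
/*
 * Relation to the draft (informative): Eq. 2 defines
 * K = cbrt(W_max * beta / C) with beta = 0.2 and C = 0.4.  The
 * argument built above computes exactly that in integer form:
 * wmax / CUBIC_BETAB is W_max * 0.2, and multiplying by
 * CUBIC_CB / CUBIC_CA divides by C.  The return value is then
 * Eq. 1, W(t) = C * (t - K)^3 + W_max, evaluated at t + rtt.
 */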

static void
tcp_cubic_congestion_exp(struct tcpcb *tp)
{

	/*
	 * Congestion - Set WMax and shrink cwnd
	 */
	tcp_cubic_update_ctime(tp);

	/* Section 3.6 - Fast Convergence */
	if (tp->snd_cubic_wmax < tp->snd_cubic_wmax_last) {
		tp->snd_cubic_wmax_last = tp->snd_cubic_wmax;
		tp->snd_cubic_wmax = tp->snd_cubic_wmax / 2 +
		    tp->snd_cubic_wmax * CUBIC_BETAA / CUBIC_BETAB / 2;
	} else {
		tp->snd_cubic_wmax_last = tp->snd_cubic_wmax;
		tp->snd_cubic_wmax = tp->snd_cwnd;
	}

	tp->snd_cubic_wmax = max(tp->t_segsz, tp->snd_cubic_wmax);

	/* Shrink CWND */
	tcp_common_congestion_exp(tp, CUBIC_BETAA, CUBIC_BETAB);
}

static int
tcp_cubic_fast_retransmit(struct tcpcb *tp, const struct tcphdr *th)
{

	if (SEQ_LT(th->th_ack, tp->snd_high)) {
		/* See newreno */
		tp->t_dupacks = 0;
		return 1;
	}

	/*
	 * mark WMax
	 */
	tcp_cubic_congestion_exp(tp);

	/* Do fast retransmit */
	return tcp_reno_do_fast_retransmit(tp, th);
}

static void
tcp_cubic_newack(struct tcpcb *tp, const struct tcphdr *th)
{
	uint32_t ms_elapsed, rtt;
	u_long w_tcp;

	/* In congestion avoidance, not in fast recovery, with a usable rtt */
	if (tp->snd_cwnd > tp->snd_ssthresh && tp->t_partialacks < 0 &&
	    /*
	     * t_srtt is in 1/32 units of slow ticks;
	     * converting it to ms is roughly
	     * (t_srtt >> 5) * 1000 / PR_SLOWHZ ~= (t_srtt << 5) / PR_SLOWHZ
	     */
	    (rtt = (tp->t_srtt << 5) / PR_SLOWHZ) > 0) {
		ms_elapsed = tcp_cubic_diff_ctime(tp);

		/* Compute W_tcp(t) */
		w_tcp = tp->snd_cubic_wmax * CUBIC_BETAA / CUBIC_BETAB +
		    ms_elapsed / rtt / 3;

		if (tp->snd_cwnd > w_tcp) {
			/* Not in TCP-friendly mode */
			tp->snd_cwnd += (tcp_cubic_getW(tp, ms_elapsed, rtt) -
			    tp->snd_cwnd) / tp->snd_cwnd;
		} else {
			/* TCP-friendly mode */
			tp->snd_cwnd = w_tcp;
		}

		/* Make sure we are within limits */
		tp->snd_cwnd = max(tp->snd_cwnd, tp->t_segsz);
		tp->snd_cwnd = min(tp->snd_cwnd, TCP_MAXWIN << tp->snd_scale);
	} else {
		/* Use NewReno */
		tcp_newreno_newack(tp, th);
	}
}
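
/*
 * Note on w_tcp (informative): the draft's TCP-friendly window is
 * W_tcp(t) = W_max*(1-beta) + 3*beta/(2-beta) * t/RTT.  With
 * beta = 0.2 the slope is 3*0.2/1.8 = 1/3, which is the "/ 3" in the
 * computation above, and the base term is W_max * 4/5.
 */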

static void
tcp_cubic_slow_retransmit(struct tcpcb *tp)
{

	/* Timeout - Mark new congestion */
	tcp_cubic_congestion_exp(tp);

	/* Loss Window MUST be one segment. */
	tp->snd_cwnd = tp->t_segsz;
	tp->t_partialacks = -1;
	tp->t_dupacks = 0;
	tp->t_bytes_acked = 0;

	if (TCP_ECN_ALLOWED(tp))
		tp->t_flags |= TF_ECN_SND_CWR;
}

const struct tcp_congctl tcp_cubic_ctl = {
	.fast_retransmit = tcp_cubic_fast_retransmit,
	.slow_retransmit = tcp_cubic_slow_retransmit,
	.fast_retransmit_newack = tcp_newreno_fast_retransmit_newack,
	.newack = tcp_cubic_newack,
	.cong_exp = tcp_cubic_congestion_exp,
};