Between OS-7329 and OS-7426, we'll be able to add support for the following in LX-branded zones:
The TCP_CONGESTION socket option
/proc/sys/net/ipv4/tcp_allowed_congestion_control
/proc/sys/net/ipv4/tcp_available_congestion_control
/proc/sys/net/ipv4/tcp_congestion_control
Former user commented on 2019-08-27T15:02:43.100-0400:
To test this, I rebooted my lab machine onto a PI with my change. Inside an LX zone, we can see that the different congestion control files exist:
root@5a0d3532-1b43-4459-f8e4-b5fc36a7cd26:~# ls -l /proc/sys/net/ipv4/{tcp_congestion_control,tcp_allowed_congestion_control,tcp_available_congestion_control}
-rw-r--r-- 1 root root 0 Aug 27 18:19 /proc/sys/net/ipv4/tcp_allowed_congestion_control
-rw-r--r-- 1 root root 0 Aug 27 18:19 /proc/sys/net/ipv4/tcp_available_congestion_control
-rw-r--r-- 1 root root 0 Aug 27 18:19 /proc/sys/net/ipv4/tcp_congestion_control
For comparison with my laptop's Alpine Linux installation:
cpm@enlil ~ % ls -l /proc/sys/net/ipv4/{tcp_congestion_control,tcp_allowed_congestion_control,tcp_available_congestion_control}
-rw-r--r-- 1 root root 0 Aug 27 11:19 /proc/sys/net/ipv4/tcp_allowed_congestion_control
-r--r--r-- 1 root root 0 Aug 27 11:19 /proc/sys/net/ipv4/tcp_available_congestion_control
-rw-r--r-- 1 root root 0 Aug 27 11:19 /proc/sys/net/ipv4/tcp_congestion_control
If we look at the contents of the files, then we can see that they list the currently configured algorithm, and list the names of the allowed algorithms (which is for now the same set as the available algorithms):
root@5a0d3532-1b43-4459-f8e4-b5fc36a7cd26:~# tail -n 500 /proc/sys/net/ipv4/{tcp_congestion_control,tcp_allowed_congestion_control,tcp_available_congestion_control}
==> /proc/sys/net/ipv4/tcp_congestion_control <==
sunreno
==> /proc/sys/net/ipv4/tcp_allowed_congestion_control <==
sunreno newreno cubic
==> /proc/sys/net/ipv4/tcp_available_congestion_control <==
sunreno newreno cubic
For comparison with my Alpine Linux installation:
cpm@enlil ~ % tail -n 500 /proc/sys/net/ipv4/{tcp_congestion_control,tcp_allowed_congestion_control,tcp_available_congestion_control}
==> /proc/sys/net/ipv4/tcp_congestion_control <==
cubic
==> /proc/sys/net/ipv4/tcp_allowed_congestion_control <==
reno cubic
==> /proc/sys/net/ipv4/tcp_available_congestion_control <==
reno cubic
I ran a simple pair of C programs (a client, followed by a server) based on the examples in tcp(7P) to help demonstrate the behaviour of changing the algorithm and getsockopt(TCP_CONGESTION):
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <netdb.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
/*
 * Test client: connect to <host> <port> over TCP, print the name of the
 * congestion control algorithm reported by getsockopt(TCP_CONGESTION) for
 * the connected socket, enable SO_KEEPALIVE, and then copy everything the
 * peer sends to stdout until EOF.
 *
 * Returns 0 once the connection has been drained (a read() failure is
 * reported but does not change the exit status), and 1 on a usage error or
 * when lookup, connect, getsockopt() or setsockopt() fails.
 */
int
main(int argc, char *argv[])
{
	struct addrinfo hints, *gair, *p;
	int fd = -1, rv;
	ssize_t rlen;
	char buf[1024];
	char name[40] = "";
	/* getsockopt() takes a socklen_t *, not an int *. */
	socklen_t namelen = sizeof (name);
	int y = 1;

	if (argc != 3) {
		fprintf(stderr, "%s <host> <port>\n", argv[0]);
		return (1);
	}

	memset(&hints, 0, sizeof (hints));
	hints.ai_family = PF_UNSPEC;
	hints.ai_socktype = SOCK_STREAM;

	if ((rv = getaddrinfo(argv[1], argv[2], &hints, &gair)) != 0) {
		fprintf(stderr, "getaddrinfo() failed: %s\n",
		    gai_strerror(rv));
		return (1);
	}

	/* Try each candidate address until one accepts the connection. */
	for (p = gair; p != NULL; p = p->ai_next) {
		if ((fd = socket(
		    p->ai_family,
		    p->ai_socktype,
		    p->ai_protocol)) == -1) {
			perror("socket() failed");
			continue;
		}

		if (connect(fd, p->ai_addr, p->ai_addrlen) == -1) {
			/* Report before close() so errno is still connect()'s. */
			perror("connect() failed");
			close(fd);
			continue;
		}

		break;
	}

	/*
	 * Remember whether the loop found a usable address, then release the
	 * list on both the success and the failure path.
	 */
	rv = (p == NULL);
	freeaddrinfo(gair);

	if (rv) {
		fprintf(stderr, "failed to connect to server\n");
		return (1);
	}

	if (getsockopt(fd, IPPROTO_TCP, TCP_CONGESTION, name, &namelen) == -1) {
		perror("getsockopt(TCP_CONGESTION) failed");
		close(fd);
		return (1);
	}

	/* Make sure the name is terminated even if it filled the buffer. */
	name[sizeof (name) - 1] = '\0';
	printf("using the \"%s\" cc algorithm\n", name);

	if (setsockopt(fd, SOL_SOCKET, SO_KEEPALIVE, &y,
	    sizeof (y)) == -1) {
		perror("setsockopt(SO_KEEPALIVE) failed");
		close(fd);
		return (1);
	}

	while ((rlen = read(fd, buf, sizeof (buf))) > 0) {
		if (fwrite(buf, 1, (size_t)rlen, stdout) != (size_t)rlen) {
			perror("fwrite() failed");
			break;
		}
	}

	if (rlen == -1) {
		perror("read() failed");
	}

	fflush(stdout);

	if (close(fd) == -1) {
		perror("close() failed");
	}

	return (0);
}
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <netdb.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <arpa/inet.h>
/*
 * Print a one-line record to stdout saying how many bytes were sent to the
 * peer described by `s`.
 *
 *   s      peer address; its sa_family selects the output format
 *          (AF_INET: "addr:port", AF_INET6: "[addr]:port", anything
 *          else: "unknown client").
 *   bytes  byte count to report.
 *
 * If the address cannot be rendered as text, "?" is printed in its place
 * rather than reading an uninitialised buffer.
 */
void
logmsg(struct sockaddr *s, int bytes)
{
	/* INET6_ADDRSTRLEN is used for both families' text form. */
	char dq[INET6_ADDRSTRLEN];

	switch (s->sa_family) {
	case AF_INET: {
		struct sockaddr_in *s4 = (struct sockaddr_in *)s;

		if (inet_ntop(AF_INET, &s4->sin_addr, dq,
		    sizeof (dq)) == NULL) {
			(void) snprintf(dq, sizeof (dq), "?");
		}
		fprintf(stdout, "sent %d bytes to %s:%d\n",
		    bytes, dq, ntohs(s4->sin_port));
		break;
	}
	case AF_INET6: {
		struct sockaddr_in6 *s6 = (struct sockaddr_in6 *)s;

		if (inet_ntop(AF_INET6, &s6->sin6_addr, dq,
		    sizeof (dq)) == NULL) {
			(void) snprintf(dq, sizeof (dq), "?");
		}
		fprintf(stdout, "sent %d bytes to [%s]:%d\n",
		    bytes, dq, ntohs(s6->sin6_port));
		break;
	}
	default:
		fprintf(stdout, "sent %d bytes to unknown client\n",
		    bytes);
		break;
	}
}
/*
 * Test server: bind a TCP listening socket on <port>, then accept up to
 * five connection attempts, write <message> to each accepted client, log
 * how many bytes were sent, and close the connection.
 *
 * Returns 0 after the five attempts have been processed, and 1 on a usage
 * error or when lookup, bind or listen fails.
 */
int
main(int argc, char *argv[])
{
	struct addrinfo hints, *gair, *p;
	int sfd = -1, cfd;
	int slen, wlen, rv;

	if (argc != 3) {
		fprintf(stderr, "%s <port> <message>\n", argv[0]);
		return (1);
	}

	slen = strlen(argv[2]);

	memset(&hints, 0, sizeof (hints));
	hints.ai_family = PF_UNSPEC;
	hints.ai_socktype = SOCK_STREAM;
	hints.ai_flags = AI_PASSIVE;

	if ((rv = getaddrinfo(NULL, argv[1], &hints, &gair)) != 0) {
		fprintf(stderr, "getaddrinfo() failed: %s\n",
		    gai_strerror(rv));
		return (1);
	}

	/* Use the first candidate address that can be bound. */
	for (p = gair; p != NULL; p = p->ai_next) {
		if ((sfd = socket(
		    p->ai_family,
		    p->ai_socktype,
		    p->ai_protocol)) == -1) {
			perror("socket() failed");
			continue;
		}

		if (bind(sfd, p->ai_addr, p->ai_addrlen) == -1) {
			/* Report before close() so errno is still bind()'s. */
			perror("bind() failed");
			close(sfd);
			continue;
		}

		break;
	}

	/*
	 * Remember whether the loop found a usable address, then release the
	 * list on both the success and the failure path.
	 */
	rv = (p == NULL);
	freeaddrinfo(gair);

	if (rv) {
		fprintf(stderr, "server failed to bind()\n");
		return (1);
	}

	if (listen(sfd, 1024) != 0) {
		perror("listen() failed");
		close(sfd);
		return (1);
	}

	fprintf(stdout, "waiting for clients...\n");

	/* A failed accept() still counts as one of the five attempts. */
	for (int times = 0; times < 5; times++) {
		struct sockaddr_storage stor;
		socklen_t alen = sizeof (stor);
		struct sockaddr *addr = (struct sockaddr *)&stor;

		if ((cfd = accept(sfd, addr, &alen)) == -1) {
			perror("accept() failed");
			continue;
		}

		/*
		 * write() may send only part of the message, so keep going
		 * until it has all been sent.  A -1 return must end the loop:
		 * adding it to wlen would move the buffer offset backwards
		 * and the loop could never finish.
		 */
		wlen = 0;
		while (wlen < slen) {
			ssize_t n = write(cfd, argv[2] + wlen,
			    (size_t)(slen - wlen));

			if (n == -1) {
				perror("write() failed");
				break;
			}
			wlen += (int)n;
		}

		/* wlen is what was actually sent, even after a failure. */
		logmsg(addr, wlen);

		if (close(cfd) == -1) {
			perror("close(cfd) failed");
		}
	}

	if (close(sfd) == -1) {
		perror("close(sfd) failed");
	}

	fprintf(stdout, "finished.\n");

	return (0);
}
Running the client program, and also updating the current congestion control algorithm, we can see that getsockopt(TCP_CONGESTION) is reporting the right algorithm:
root@5a0d3532-1b43-4459-f8e4-b5fc36a7cd26:~# echo sunreno > /proc/sys/net/ipv4/tcp_congestion_control
root@5a0d3532-1b43-4459-f8e4-b5fc36a7cd26:~# cat /proc/sys/net/ipv4/tcp_congestion_control
sunreno
root@5a0d3532-1b43-4459-f8e4-b5fc36a7cd26:~# ./client 127.0.0.1 8080
using the "sunreno" cc algorithm
hello
root@5a0d3532-1b43-4459-f8e4-b5fc36a7cd26:~# echo newreno > /proc/sys/net/ipv4/tcp_congestion_control
root@5a0d3532-1b43-4459-f8e4-b5fc36a7cd26:~# cat /proc/sys/net/ipv4/tcp_congestion_control
newreno
root@5a0d3532-1b43-4459-f8e4-b5fc36a7cd26:~# ./client 127.0.0.1 8080
using the "newreno" cc algorithm
hello
root@5a0d3532-1b43-4459-f8e4-b5fc36a7cd26:~# echo cubic > /proc/sys/net/ipv4/tcp_congestion_control
root@5a0d3532-1b43-4459-f8e4-b5fc36a7cd26:~# cat /proc/sys/net/ipv4/tcp_congestion_control
cubic
root@5a0d3532-1b43-4459-f8e4-b5fc36a7cd26:~# ./client 127.0.0.1 8080
using the "cubic" cc algorithm
hello
If we try writing nonexistent names to the file, then we get EINVAL:
root@5a0d3532-1b43-4459-f8e4-b5fc36a7cd26:~# echo cubic2 > /proc/sys/net/ipv4/tcp_congestion_control
-bash: echo: write error: Invalid argument
root@5a0d3532-1b43-4459-f8e4-b5fc36a7cd26:~# echo foobar > /proc/sys/net/ipv4/tcp_congestion_control
-bash: echo: write error: Invalid argument
root@5a0d3532-1b43-4459-f8e4-b5fc36a7cd26:~# strace -y -e trace=write echo foobar > /proc/sys/net/ipv4/tcp_congestion_control
write(1</proc/sys/net/ipv4/tcp_congestion_control>, "foobar\n", 7) = -1 EINVAL (Invalid argument)
write(2</dev/pts/4>, "echo: ", 6echo: ) = 6
write(2</dev/pts/4>, "write error", 11write error) = 11
write(2</dev/pts/4>, ": Invalid argument", 18: Invalid argument) = 18
write(2</dev/pts/4>, "\n", 1
) = 1
+++ exited with 1 +++
Compared to Linux:
root@enlil ~ # strace -y -e trace=writev echo foobar > /proc/sys/net/ipv4/tcp_congestion_control
writev(1</proc/sys/net/ipv4/tcp_congestion_control>, [{iov_base="foobar", iov_len=6}, {iov_base="\n", iov_len=1}], 2) = -1 ENOENT (No such file or directory)
writev(2</dev/pts/12>, [{iov_base="echo: ", iov_len=6}, {iov_base=NULL, iov_len=0}], 2echo: ) = 6
writev(2</dev/pts/12>, [{iov_base="write error", iov_len=11}, {iov_base=NULL, iov_len=0}], 2write error) = 11
writev(2</dev/pts/12>, [{iov_base="", iov_len=0}, {iov_base="\n", iov_len=1}], 2
) = 1
+++ exited with 1 +++
If we try writing to the other newly added files, we can see that nothing in them changes:
root@5a0d3532-1b43-4459-f8e4-b5fc36a7cd26:~# echo cubic > /proc/sys/net/ipv4/tcp_allowed_congestion_control
root@5a0d3532-1b43-4459-f8e4-b5fc36a7cd26:~# cat /proc/sys/net/ipv4/tcp_allowed_congestion_control
sunreno newreno cubic
root@5a0d3532-1b43-4459-f8e4-b5fc36a7cd26:~# echo cubic > /proc/sys/net/ipv4/tcp_available_congestion_control
root@5a0d3532-1b43-4459-f8e4-b5fc36a7cd26:~# cat /proc/sys/net/ipv4/tcp_available_congestion_control
sunreno newreno cubic
On Linux, writing to the tcp_allowed_congestion_control file changes the set of allowed algorithms. We don't have a concept of an allowed set of algorithms currently, so there's nothing for this to do.
One of the most useful parts of the TCP_CONGESTION socket option is that it allows setting connection-specific algorithms. iperf uses this to allow comparing the performance of different algorithms. From inside my LX zone, I am able to use all of the available algorithms:
root@5a0d3532-1b43-4459-f8e4-b5fc36a7cd26:~# strace -e trace=setsockopt,getsockopt iperf3 -C sunreno -c 37.153.108.173 -p 11845 -t 1
Connecting to host 37.153.108.173, port 11845
setsockopt(4, SOL_TCP, TCP_CONGESTION, "sunreno", 7) = 0
[ 4] local 172.26.7.5 port 37080 connected to 37.153.108.173 port 11845
getsockopt(4, SOL_TCP, TCP_INFO, 0x7fffffeff864, 0x7fffffeff7d4) = -1 ENOPROTOOPT (Protocol not available)
iperf3: getsockopt - Protocol not available
[ ID] Interval Transfer Bandwidth Retr Cwnd
[ 4] 0.00-1.00 sec 252 KBytes 2.06 Mbits/sec 0 0.00 Bytes
- - - - - - - - - - - - - - - - - - - - - - - - -
[ ID] Interval Transfer Bandwidth Retr
[ 4] 0.00-1.00 sec 252 KBytes 2.06 Mbits/sec 0 sender
[ 4] 0.00-1.00 sec 116 KBytes 948 Kbits/sec receiver
iperf Done.
+++ exited with 0 +++
root@5a0d3532-1b43-4459-f8e4-b5fc36a7cd26:~# strace -e trace=setsockopt,getsockopt iperf3 -C newreno -c 37.153.108.173 -p 11845 -t 1
Connecting to host 37.153.108.173, port 11845
setsockopt(4, SOL_TCP, TCP_CONGESTION, "newreno", 7) = 0
[ 4] local 172.26.7.5 port 42367 connected to 37.153.108.173 port 11845
getsockopt(4, SOL_TCP, TCP_INFO, 0x7fffffeff864, 0x7fffffeff7d4) = -1 ENOPROTOOPT (Protocol not available)
iperf3: getsockopt - Protocol not available
[ ID] Interval Transfer Bandwidth Retr Cwnd
[ 4] 0.00-1.00 sec 378 KBytes 3.09 Mbits/sec 0 0.00 Bytes
- - - - - - - - - - - - - - - - - - - - - - - - -
[ ID] Interval Transfer Bandwidth Retr
[ 4] 0.00-1.00 sec 378 KBytes 3.09 Mbits/sec 0 sender
[ 4] 0.00-1.00 sec 249 KBytes 2.03 Mbits/sec receiver
iperf Done.
+++ exited with 0 +++
root@5a0d3532-1b43-4459-f8e4-b5fc36a7cd26:~# strace -e trace=setsockopt,getsockopt iperf3 -C cubic -c 37.153.108.173 -p 11845 -t 1
Connecting to host 37.153.108.173, port 11845
setsockopt(4, SOL_TCP, TCP_CONGESTION, "cubic", 5) = 0
[ 4] local 172.26.7.5 port 40579 connected to 37.153.108.173 port 11845
getsockopt(4, SOL_TCP, TCP_INFO, 0x7fffffeff864, 0x7fffffeff7d4) = -1 ENOPROTOOPT (Protocol not available)
iperf3: getsockopt - Protocol not available
[ ID] Interval Transfer Bandwidth Retr Cwnd
[ 4] 0.00-1.00 sec 378 KBytes 3.09 Mbits/sec 0 0.00 Bytes
- - - - - - - - - - - - - - - - - - - - - - - - -
[ ID] Interval Transfer Bandwidth Retr
[ 4] 0.00-1.00 sec 378 KBytes 3.09 Mbits/sec 0 sender
[ 4] 0.00-1.00 sec 249 KBytes 2.03 Mbits/sec receiver
iperf Done.
+++ exited with 0 +++
As evidenced here, iperf is only passing in the strlen() of the algorithm name (that is, without the terminating NUL), which illumos#11554 accounts for. The getsockopt() complaints are about the TCP_INFO socket option not being implemented; OS-4525 was previously filed to cover that issue.
Former user commented on 2019-08-27T15:03:24.322-0400 (edited 2019-08-27T15:04:11.075-0400):
Trying to set a nonexistent algorithm with iperf returns ENOENT:
root@5a0d3532-1b43-4459-f8e4-b5fc36a7cd26:~# strace -e trace=setsockopt,getsockopt iperf3 -C cubic4 -c 37.153.108.173 -p 11845 -t 1
Connecting to host 37.153.108.173, port 11845
setsockopt(4, SOL_TCP, TCP_CONGESTION, "cubic4", 6) = -1 ENOENT (No such file or directory)
iperf3: error - unable to set TCP_CONGESTION: Supplied congestion control algorithm not supported on this host
+++ exited with 1 +++
root@5a0d3532-1b43-4459-f8e4-b5fc36a7cd26:~# strace -e trace=setsockopt,getsockopt iperf3 -C foobar -c 37.153.108.173 -p 11845 -t 1
Connecting to host 37.153.108.173, port 11845
setsockopt(4, SOL_TCP, TCP_CONGESTION, "foobar", 6) = -1 ENOENT (No such file or directory)
iperf3: error - unable to set TCP_CONGESTION: Supplied congestion control algorithm not supported on this host
+++ exited with 1 +++
This matches with Linux:
strace -e trace=setsockopt,getsockopt iperf3 -C foobar -c 37.153.108.173 -p 11845 -t 1
getsockopt(4, SOL_TCP, TCP_MAXSEG, [1448], [4]) = 0
Connecting to host 37.153.108.173, port 11845
getsockopt(5, SOL_SOCKET, SO_SNDBUF, [16384], [4]) = 0
getsockopt(5, SOL_SOCKET, SO_RCVBUF, [87380], [4]) = 0
setsockopt(5, SOL_TCP, TCP_CONGESTION, "foobar", 6) = -1 ENOENT (No such file or directory)
iperf3: error - unable to set TCP_CONGESTION: Supplied congestion control algorithm not supported on this host
+++ exited with 1 +++
Jira Bot commented on 2019-08-27T15:11:32.630-0400:
illumos-joyent commit df3850281ea4def494e12172cae5a6181823c77d (branch master, by Cody Peter Mello)
OS-7427 Support Linux congestion control interfaces in LX zones
Reviewed by: Patrick Mooney <patrick.mooney@joyent.com>
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Approved by: Jerry Jelinek <jerry.jelinek@joyent.com>