Issue Type: | Bug |
---|---|
Priority: | 4 - Normal |
Status: | Resolved |
Created at: | 2018-12-05T19:59:10.775Z |
Updated at: | 2019-09-03T09:42:06.175Z |
Created by: | Former user |
---|---|
Reported by: | Former user |
Assigned to: | Former user |
Fixed: A fix for this issue is checked into the tree and tested.
(Resolution Date: 2019-08-27T19:11:42.900Z)
2019-08-29 Zoo York (Release Date: 2019-08-29)
Between OS-7329 and OS-7426, we'll be able to add support for the following in LX-branded zones:
TCP_CONGESTION
socket option
/proc/sys/net/ipv4/tcp_allowed_congestion_control
/proc/sys/net/ipv4/tcp_available_congestion_control
/proc/sys/net/ipv4/tcp_congestion_control
To test this, I rebooted my lab machine onto a platform image (PI) built with my change. Inside an LX zone, we can see that the different congestion control files exist:
root@5a0d3532-1b43-4459-f8e4-b5fc36a7cd26:~# ls -l /proc/sys/net/ipv4/{tcp_congestion_control,tcp_allowed_congestion_control,tcp_available_congestion_control}
-rw-r--r-- 1 root root 0 Aug 27 18:19 /proc/sys/net/ipv4/tcp_allowed_congestion_control
-rw-r--r-- 1 root root 0 Aug 27 18:19 /proc/sys/net/ipv4/tcp_available_congestion_control
-rw-r--r-- 1 root root 0 Aug 27 18:19 /proc/sys/net/ipv4/tcp_congestion_control
For comparison with my laptop's Alpine Linux installation:
cpm@enlil ~ % ls -l /proc/sys/net/ipv4/{tcp_congestion_control,tcp_allowed_congestion_control,tcp_available_congestion_control}
-rw-r--r-- 1 root root 0 Aug 27 11:19 /proc/sys/net/ipv4/tcp_allowed_congestion_control
-r--r--r-- 1 root root 0 Aug 27 11:19 /proc/sys/net/ipv4/tcp_available_congestion_control
-rw-r--r-- 1 root root 0 Aug 27 11:19 /proc/sys/net/ipv4/tcp_congestion_control
If we look at the contents of the files, then we can see that they list the currently configured algorithm, and list the names of the allowed algorithms (which is for now the same set as the available algorithms):
root@5a0d3532-1b43-4459-f8e4-b5fc36a7cd26:~# tail -n 500 /proc/sys/net/ipv4/{tcp_congestion_control,tcp_allowed_congestion_control,tcp_available_congestion_control}
==> /proc/sys/net/ipv4/tcp_congestion_control <==
sunreno
==> /proc/sys/net/ipv4/tcp_allowed_congestion_control <==
sunreno newreno cubic
==> /proc/sys/net/ipv4/tcp_available_congestion_control <==
sunreno newreno cubic
For comparison with my Alpine Linux installation:
cpm@enlil ~ % tail -n 500 /proc/sys/net/ipv4/{tcp_congestion_control,tcp_allowed_congestion_control,tcp_available_congestion_control}
==> /proc/sys/net/ipv4/tcp_congestion_control <==
cubic
==> /proc/sys/net/ipv4/tcp_allowed_congestion_control <==
reno cubic
==> /proc/sys/net/ipv4/tcp_available_congestion_control <==
reno cubic
I ran a simple pair of C programs (a client, followed by a server) based on the examples in tcp(7P) to help demonstrate the behaviour of changing the algorithm and of getsockopt(TCP_CONGESTION):
/*
 * client: connect to <host> <port>, report which congestion control
 * algorithm the connected socket is using (via getsockopt(TCP_CONGESTION)),
 * then copy everything the server sends to stdout until EOF.
 *
 * Exits 1 on usage, resolution, connection or socket option failures;
 * read/close failures are reported on stderr but do not change the exit
 * status.
 */
#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <netdb.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int
main(int argc, char *argv[])
{
	struct addrinfo hints, *gair, *p;
	int fd = -1, rv, connected;
	ssize_t rlen;
	char buf[1024];
	char name[40] = "";
	/* getsockopt() takes a socklen_t *, which is not always an int *. */
	socklen_t namelen = sizeof (name);
	int y = 1;

	if (argc != 3) {
		fprintf(stderr, "%s <host> <port>\n", argv[0]);
		return (1);
	}

	memset(&hints, 0, sizeof (hints));
	hints.ai_family = PF_UNSPEC;
	hints.ai_socktype = SOCK_STREAM;

	if ((rv = getaddrinfo(argv[1], argv[2], &hints, &gair)) != 0) {
		fprintf(stderr, "getaddrinfo() failed: %s\n",
		    gai_strerror(rv));
		return (1);
	}

	/* Try each returned address until one connects. */
	for (p = gair; p != NULL; p = p->ai_next) {
		if ((fd = socket(p->ai_family, p->ai_socktype,
		    p->ai_protocol)) == -1) {
			perror("socket() failed");
			continue;
		}

		if (connect(fd, p->ai_addr, p->ai_addrlen) == -1) {
			/* Report before close() can overwrite errno. */
			perror("connect() failed");
			(void) close(fd);
			fd = -1;
			continue;
		}

		break;
	}

	/*
	 * Record the outcome before freeing the list so that the list is
	 * released on the failure path too, and so that we never inspect
	 * a pointer into freed memory.
	 */
	connected = (p != NULL);
	freeaddrinfo(gair);

	if (!connected) {
		fprintf(stderr, "failed to connect to server\n");
		return (1);
	}

	if (getsockopt(fd, IPPROTO_TCP, TCP_CONGESTION, name,
	    &namelen) == -1) {
		perror("getsockopt(TCP_CONGESTION) failed");
		(void) close(fd);
		return (1);
	}
	/* Guarantee termination even if the full buffer was filled. */
	name[sizeof (name) - 1] = '\0';

	printf("using the \"%s\" cc algorithm\n", name);

	if (setsockopt(fd, SOL_SOCKET, SO_KEEPALIVE, &y, sizeof (y)) == -1) {
		perror("setsockopt(SO_KEEPALIVE) failed");
		(void) close(fd);
		return (1);
	}

	while ((rlen = read(fd, buf, sizeof (buf))) > 0) {
		/* Element size 1 so that a short write is detectable. */
		if (fwrite(buf, 1, (size_t)rlen, stdout) != (size_t)rlen) {
			perror("fwrite() failed");
			break;
		}
	}

	if (rlen == -1) {
		perror("read() failed");
	}

	fflush(stdout);

	if (close(fd) == -1) {
		perror("close() failed");
	}

	return (0);
}
/*
 * server: listen on <port> and send <message> to each of the first five
 * connection attempts, logging the peer address and byte count for every
 * message that was delivered in full.
 */
#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <arpa/inet.h>
#include <errno.h>
#include <netdb.h>
#include <signal.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

/*
 * Print "sent <bytes> bytes to <peer>" on stdout, formatting the peer
 * according to its address family (IPv4, IPv6, or unknown).
 */
void
logmsg(struct sockaddr *s, int bytes)
{
	char dq[INET6_ADDRSTRLEN];

	switch (s->sa_family) {
	case AF_INET: {
		struct sockaddr_in *s4 = (struct sockaddr_in *)s;

		/* Don't print an uninitialised buffer if conversion fails. */
		if (inet_ntop(AF_INET, &s4->sin_addr, dq,
		    sizeof (dq)) == NULL) {
			(void) snprintf(dq, sizeof (dq), "?");
		}
		fprintf(stdout, "sent %d bytes to %s:%d\n", bytes, dq,
		    ntohs(s4->sin_port));
		break;
	}
	case AF_INET6: {
		struct sockaddr_in6 *s6 = (struct sockaddr_in6 *)s;

		if (inet_ntop(AF_INET6, &s6->sin6_addr, dq,
		    sizeof (dq)) == NULL) {
			(void) snprintf(dq, sizeof (dq), "?");
		}
		fprintf(stdout, "sent %d bytes to [%s]:%d\n", bytes, dq,
		    ntohs(s6->sin6_port));
		break;
	}
	default:
		fprintf(stdout, "sent %d bytes to unknown client\n", bytes);
		break;
	}
}

/*
 * Write all len bytes of buf to fd, retrying after short writes and
 * EINTR.  Returns 0 on success, or -1 with errno set by write().
 */
static int
write_all(int fd, const char *buf, size_t len)
{
	size_t off = 0;

	while (off < len) {
		ssize_t n = write(fd, buf + off, len - off);

		if (n == -1) {
			if (errno == EINTR)
				continue;
			return (-1);
		}
		off += (size_t)n;
	}

	return (0);
}

int
main(int argc, char *argv[])
{
	struct addrinfo hints, *gair, *p;
	int sfd = -1, cfd;
	int rv, bound;
	size_t slen;

	if (argc != 3) {
		fprintf(stderr, "%s <port> <message>\n", argv[0]);
		return (1);
	}

	slen = strlen(argv[2]);

	/*
	 * A client that disconnects early must surface as a write() error
	 * (EPIPE) rather than killing the server with SIGPIPE.
	 */
	(void) signal(SIGPIPE, SIG_IGN);

	memset(&hints, 0, sizeof (hints));
	hints.ai_family = PF_UNSPEC;
	hints.ai_socktype = SOCK_STREAM;
	hints.ai_flags = AI_PASSIVE;

	if ((rv = getaddrinfo(NULL, argv[1], &hints, &gair)) != 0) {
		fprintf(stderr, "getaddrinfo() failed: %s\n",
		    gai_strerror(rv));
		return (1);
	}

	/* Bind to the first address that works. */
	for (p = gair; p != NULL; p = p->ai_next) {
		if ((sfd = socket(p->ai_family, p->ai_socktype,
		    p->ai_protocol)) == -1) {
			perror("socket() failed");
			continue;
		}

		if (bind(sfd, p->ai_addr, p->ai_addrlen) == -1) {
			/* Report before close() can overwrite errno. */
			perror("bind() failed");
			(void) close(sfd);
			sfd = -1;
			continue;
		}

		break;
	}

	/* Free the list on both the success and failure paths. */
	bound = (p != NULL);
	freeaddrinfo(gair);

	if (!bound) {
		fprintf(stderr, "server failed to bind()\n");
		return (1);
	}

	if (listen(sfd, 1024) != 0) {
		perror("listen() failed");
		(void) close(sfd);
		return (1);
	}

	fprintf(stdout, "waiting for clients...\n");

	/* Failed accept() calls count towards the five attempts. */
	for (int times = 0; times < 5; times++) {
		struct sockaddr_storage stor;
		socklen_t alen = sizeof (stor);
		struct sockaddr *addr = (struct sockaddr *)&stor;

		if ((cfd = accept(sfd, addr, &alen)) == -1) {
			perror("accept() failed");
			continue;
		}

		if (write_all(cfd, argv[2], slen) == -1) {
			perror("write() failed");
		} else {
			logmsg(addr, (int)slen);
		}

		if (close(cfd) == -1) {
			perror("close(cfd) failed");
		}
	}

	if (close(sfd) == -1) {
		perror("close(sfd) failed");
	}

	fprintf(stdout, "finished.\n");

	return (0);
}
Running the client program while also updating the current congestion control algorithm, we can see that getsockopt(TCP_CONGESTION) reports the right algorithm:
root@5a0d3532-1b43-4459-f8e4-b5fc36a7cd26:~# echo sunreno > /proc/sys/net/ipv4/tcp_congestion_control
root@5a0d3532-1b43-4459-f8e4-b5fc36a7cd26:~# cat /proc/sys/net/ipv4/tcp_congestion_control
sunreno
root@5a0d3532-1b43-4459-f8e4-b5fc36a7cd26:~# ./client 127.0.0.1 8080
using the "sunreno" cc algorithm
hello
root@5a0d3532-1b43-4459-f8e4-b5fc36a7cd26:~# echo newreno > /proc/sys/net/ipv4/tcp_congestion_control
root@5a0d3532-1b43-4459-f8e4-b5fc36a7cd26:~# cat /proc/sys/net/ipv4/tcp_congestion_control
newreno
root@5a0d3532-1b43-4459-f8e4-b5fc36a7cd26:~# ./client 127.0.0.1 8080
using the "newreno" cc algorithm
hello
root@5a0d3532-1b43-4459-f8e4-b5fc36a7cd26:~# echo cubic > /proc/sys/net/ipv4/tcp_congestion_control
root@5a0d3532-1b43-4459-f8e4-b5fc36a7cd26:~# cat /proc/sys/net/ipv4/tcp_congestion_control
cubic
root@5a0d3532-1b43-4459-f8e4-b5fc36a7cd26:~# ./client 127.0.0.1 8080
using the "cubic" cc algorithm
hello
If we try writing nonexistent names to the file, then we get EINVAL:
root@5a0d3532-1b43-4459-f8e4-b5fc36a7cd26:~# echo cubic2 > /proc/sys/net/ipv4/tcp_congestion_control -bash: echo: write error: Invalid argument root@5a0d3532-1b43-4459-f8e4-b5fc36a7cd26:~# echo foobar > /proc/sys/net/ipv4/tcp_congestion_control -bash: echo: write error: Invalid argument root@5a0d3532-1b43-4459-f8e4-b5fc36a7cd26:~# strace -y -e trace=write echo foobar > /proc/sys/net/ipv4/tcp_congestion_control write(1</proc/sys/net/ipv4/tcp_congestion_control>, "foobar\n", 7) = -1 EINVAL (Invalid argument) write(2</dev/pts/4>, "echo: ", 6echo: ) = 6 write(2</dev/pts/4>, "write error", 11write error) = 11 write(2</dev/pts/4>, ": Invalid argument", 18: Invalid argument) = 18 write(2</dev/pts/4>, "\n", 1 ) = 1 +++ exited with 1 +++
Compared to Linux, where the same write fails with ENOENT rather than EINVAL:
root@enlil ~ # strace -y -e trace=writev echo foobar > /proc/sys/net/ipv4/tcp_congestion_control writev(1</proc/sys/net/ipv4/tcp_congestion_control>, [{iov_base="foobar", iov_len=6}, {iov_base="\n", iov_len=1}], 2) = -1 ENOENT (No such file or directory) writev(2</dev/pts/12>, [{iov_base="echo: ", iov_len=6}, {iov_base=NULL, iov_len=0}], 2echo: ) = 6 writev(2</dev/pts/12>, [{iov_base="write error", iov_len=11}, {iov_base=NULL, iov_len=0}], 2write error) = 11 writev(2</dev/pts/12>, [{iov_base="", iov_len=0}, {iov_base="\n", iov_len=1}], 2 ) = 1 +++ exited with 1 +++
If we try writing to the other newly added files, we can see that nothing in them changes:
root@5a0d3532-1b43-4459-f8e4-b5fc36a7cd26:~# echo cubic > /proc/sys/net/ipv4/tcp_allowed_congestion_control
root@5a0d3532-1b43-4459-f8e4-b5fc36a7cd26:~# cat /proc/sys/net/ipv4/tcp_allowed_congestion_control
sunreno newreno cubic
root@5a0d3532-1b43-4459-f8e4-b5fc36a7cd26:~# echo cubic > /proc/sys/net/ipv4/tcp_available_congestion_control
root@5a0d3532-1b43-4459-f8e4-b5fc36a7cd26:~# cat /proc/sys/net/ipv4/tcp_available_congestion_control
sunreno newreno cubic
On Linux, writing to the tcp_allowed_congestion_control file changes the set of allowed algorithms. We don't currently have a concept of an allowed set of algorithms, so there's nothing for this to do.
One of the most useful parts of the TCP_CONGESTION socket option is that it allows setting connection-specific algorithms. iperf uses this to allow comparing the performance of different algorithms. From inside my LX zone, I am able to use all of the available algorithms:
root@5a0d3532-1b43-4459-f8e4-b5fc36a7cd26:~# strace -e trace=setsockopt,getsockopt iperf3 -C sunreno -c 37.153.108.173 -p 11845 -t 1 Connecting to host 37.153.108.173, port 11845 setsockopt(4, SOL_TCP, TCP_CONGESTION, "sunreno", 7) = 0 [ 4] local 172.26.7.5 port 37080 connected to 37.153.108.173 port 11845 getsockopt(4, SOL_TCP, TCP_INFO, 0x7fffffeff864, 0x7fffffeff7d4) = -1 ENOPROTOOPT (Protocol not available) iperf3: getsockopt - Protocol not available [ ID] Interval Transfer Bandwidth Retr Cwnd [ 4] 0.00-1.00 sec 252 KBytes 2.06 Mbits/sec 0 0.00 Bytes - - - - - - - - - - - - - - - - - - - - - - - - - [ ID] Interval Transfer Bandwidth Retr [ 4] 0.00-1.00 sec 252 KBytes 2.06 Mbits/sec 0 sender [ 4] 0.00-1.00 sec 116 KBytes 948 Kbits/sec receiver iperf Done. +++ exited with 0 +++ root@5a0d3532-1b43-4459-f8e4-b5fc36a7cd26:~# strace -e trace=setsockopt,getsockopt iperf3 -C newreno -c 37.153.108.173 -p 11845 -t 1 Connecting to host 37.153.108.173, port 11845 setsockopt(4, SOL_TCP, TCP_CONGESTION, "newreno", 7) = 0 [ 4] local 172.26.7.5 port 42367 connected to 37.153.108.173 port 11845 getsockopt(4, SOL_TCP, TCP_INFO, 0x7fffffeff864, 0x7fffffeff7d4) = -1 ENOPROTOOPT (Protocol not available) iperf3: getsockopt - Protocol not available [ ID] Interval Transfer Bandwidth Retr Cwnd [ 4] 0.00-1.00 sec 378 KBytes 3.09 Mbits/sec 0 0.00 Bytes - - - - - - - - - - - - - - - - - - - - - - - - - [ ID] Interval Transfer Bandwidth Retr [ 4] 0.00-1.00 sec 378 KBytes 3.09 Mbits/sec 0 sender [ 4] 0.00-1.00 sec 249 KBytes 2.03 Mbits/sec receiver iperf Done. 
+++ exited with 0 +++ root@5a0d3532-1b43-4459-f8e4-b5fc36a7cd26:~# strace -e trace=setsockopt,getsockopt iperf3 -C cubic -c 37.153.108.173 -p 11845 -t 1 Connecting to host 37.153.108.173, port 11845 setsockopt(4, SOL_TCP, TCP_CONGESTION, "cubic", 5) = 0 [ 4] local 172.26.7.5 port 40579 connected to 37.153.108.173 port 11845 getsockopt(4, SOL_TCP, TCP_INFO, 0x7fffffeff864, 0x7fffffeff7d4) = -1 ENOPROTOOPT (Protocol not available) iperf3: getsockopt - Protocol not available [ ID] Interval Transfer Bandwidth Retr Cwnd [ 4] 0.00-1.00 sec 378 KBytes 3.09 Mbits/sec 0 0.00 Bytes - - - - - - - - - - - - - - - - - - - - - - - - - [ ID] Interval Transfer Bandwidth Retr [ 4] 0.00-1.00 sec 378 KBytes 3.09 Mbits/sec 0 sender [ 4] 0.00-1.00 sec 249 KBytes 2.03 Mbits/sec receiver iperf Done. +++ exited with 0 +++
As evidenced here, iperf is only passing in the strlen() of the algorithm name (without the terminating NUL), which illumos#11554 accounts for. The getsockopt() complaints are about the TCP_INFO socket option not being implemented. OS-4525 has previously been filed to cover that issue.
Trying to set a nonexistent algorithm with iperf returns ENOENT:
root@5a0d3532-1b43-4459-f8e4-b5fc36a7cd26:~# strace -e trace=setsockopt,getsockopt iperf3 -C cubic4 -c 37.153.108.173 -p 11845 -t 1 Connecting to host 37.153.108.173, port 11845 setsockopt(4, SOL_TCP, TCP_CONGESTION, "cubic4", 6) = -1 ENOENT (No such file or directory) iperf3: error - unable to set TCP_CONGESTION: Supplied congestion control algorithm not supported on this host +++ exited with 1 +++ root@5a0d3532-1b43-4459-f8e4-b5fc36a7cd26:~# strace -e trace=setsockopt,getsockopt iperf3 -C foobar -c 37.153.108.173 -p 11845 -t 1 Connecting to host 37.153.108.173, port 11845 setsockopt(4, SOL_TCP, TCP_CONGESTION, "foobar", 6) = -1 ENOENT (No such file or directory) iperf3: error - unable to set TCP_CONGESTION: Supplied congestion control algorithm not supported on this host +++ exited with 1 +++
This matches the behaviour on Linux:
strace -e trace=setsockopt,getsockopt iperf3 -C foobar -c 37.153.108.173 -p 11845 -t 1 getsockopt(4, SOL_TCP, TCP_MAXSEG, [1448], [4]) = 0 Connecting to host 37.153.108.173, port 11845 getsockopt(5, SOL_SOCKET, SO_SNDBUF, [16384], [4]) = 0 getsockopt(5, SOL_SOCKET, SO_RCVBUF, [87380], [4]) = 0 setsockopt(5, SOL_TCP, TCP_CONGESTION, "foobar", 6) = -1 ENOENT (No such file or directory) iperf3: error - unable to set TCP_CONGESTION: Supplied congestion control algorithm not supported on this host +++ exited with 1 +++
illumos-joyent commit df3850281ea4def494e12172cae5a6181823c77d (branch master, by Cody Peter Mello)
OS-7427 Support Linux congestion control interfaces in LX zones
Reviewed by: Patrick Mooney <patrick.mooney@joyent.com>
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Approved by: Jerry Jelinek <jerry.jelinek@joyent.com>