OS-7427: Support Linux congestion control interfaces in LX zones

Details

Issue Type:Bug
Priority:4 - Normal
Status:Resolved
Created at:2018-12-05T19:59:10.775Z
Updated at:2019-09-03T09:42:06.175Z

People

Created by:Former user
Reported by:Former user
Assigned to:Former user

Resolution

Fixed: A fix for this issue is checked into the tree and tested.
(Resolution Date: 2019-08-27T19:11:42.900Z)

Fix Versions

2019-08-29 Zoo York (Release Date: 2019-08-29)

Related Issues

Labels

lxbrand

Description

Between OS-7329 and OS-7426, we'll be able to add support for the following in LX-branded zones:

Comments

Comment by Former user
Created at 2019-08-27T19:02:43.100Z

To test this, I rebooted my lab machine onto a platform image (PI) containing my change. Inside an LX zone, we can see that the three congestion control files exist:

root@5a0d3532-1b43-4459-f8e4-b5fc36a7cd26:~# ls -l /proc/sys/net/ipv4/{tcp_congestion_control,tcp_allowed_congestion_control,tcp_available_congestion_control}
-rw-r--r-- 1 root root 0 Aug 27 18:19 /proc/sys/net/ipv4/tcp_allowed_congestion_control
-rw-r--r-- 1 root root 0 Aug 27 18:19 /proc/sys/net/ipv4/tcp_available_congestion_control
-rw-r--r-- 1 root root 0 Aug 27 18:19 /proc/sys/net/ipv4/tcp_congestion_control

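For comparison, here is my laptop's Alpine Linux installation (note that tcp_available_congestion_control is read-only there, whereas it is writable in the LX zone):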

cpm@enlil ~ % ls -l /proc/sys/net/ipv4/{tcp_congestion_control,tcp_allowed_congestion_control,tcp_available_congestion_control}
-rw-r--r-- 1 root root 0 Aug 27 11:19 /proc/sys/net/ipv4/tcp_allowed_congestion_control
-r--r--r-- 1 root root 0 Aug 27 11:19 /proc/sys/net/ipv4/tcp_available_congestion_control
-rw-r--r-- 1 root root 0 Aug 27 11:19 /proc/sys/net/ipv4/tcp_congestion_control

If we look at the contents of the files, then we can see that they list the currently configured algorithm, and list the names of the allowed algorithms (which is for now the same set as the available algorithms):

root@5a0d3532-1b43-4459-f8e4-b5fc36a7cd26:~# tail -n 500 /proc/sys/net/ipv4/{tcp_congestion_control,tcp_allowed_congestion_control,tcp_available_congestion_control}
==> /proc/sys/net/ipv4/tcp_congestion_control <==
sunreno

==> /proc/sys/net/ipv4/tcp_allowed_congestion_control <==
sunreno newreno cubic

==> /proc/sys/net/ipv4/tcp_available_congestion_control <==
sunreno newreno cubic

For comparison with my Alpine Linux installation:

cpm@enlil ~ % tail -n 500 /proc/sys/net/ipv4/{tcp_congestion_control,tcp_allowed_congestion_control,tcp_available_congestion_control}
==> /proc/sys/net/ipv4/tcp_congestion_control <==
cubic

==> /proc/sys/net/ipv4/tcp_allowed_congestion_control <==
reno cubic

==> /proc/sys/net/ipv4/tcp_available_congestion_control <==
reno cubic

I used a simple C client and server, based on the examples in tcp(7P), to help demonstrate the behaviour of changing the algorithm and of getsockopt(TCP_CONGESTION). The client is listed first, followed by the server:

#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <netdb.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

/*
 * Test client: connect to <host> <port>, print the congestion control
 * algorithm reported by getsockopt(TCP_CONGESTION) for the connected socket,
 * enable SO_KEEPALIVE, and then copy everything the server sends to stdout
 * until EOF.
 *
 * Returns 1 on a usage error, a failed name lookup, a failure to connect to
 * any returned address, or a failed socket option call.  Returns 0 once the
 * connection has been drained; read(), fwrite() and close() failures are
 * reported on stderr but do not change the exit status.
 */
int
main(int argc, char *argv[])
{
        struct addrinfo hints, *gair, *p;
        char buf[1024];
        char name[40] = "";
        /*
         * socklen_t rather than int: getsockopt() takes a socklen_t *.
         * One byte is held back so that the zero-filled buffer is always
         * NUL-terminated, whatever the kernel copies out.
         */
        socklen_t namelen = sizeof (name) - 1;
        ssize_t rlen;
        int fd = -1;
        int rv;
        int y = 1;

        if (argc != 3) {
                fprintf(stderr, "%s <host> <port>\n", argv[0]);
                return (1);
        }

        memset(&hints, 0, sizeof (hints));
        hints.ai_family = PF_UNSPEC;
        hints.ai_socktype = SOCK_STREAM;

        if ((rv = getaddrinfo(argv[1], argv[2], &hints, &gair)) != 0) {
                fprintf(stderr, "getaddrinfo() failed: %s\n",
                    gai_strerror(rv));
                return (1);
        }

        /* Try each candidate address until one accepts the connection. */
        for (p = gair; p != NULL; p = p->ai_next) {
                fd = socket(p->ai_family, p->ai_socktype, p->ai_protocol);
                if (fd == -1) {
                        perror("socket() failed");
                        continue;
                }

                if (connect(fd, p->ai_addr, p->ai_addrlen) == 0)
                        break;

                /* Report before close(), which may overwrite errno. */
                perror("connect() failed");
                (void) close(fd);
                fd = -1;
        }

        /* The address list is no longer needed, whether or not we connected. */
        freeaddrinfo(gair);

        if (fd == -1) {
                fprintf(stderr, "failed to connect to server\n");
                return (1);
        }

        if (getsockopt(fd, IPPROTO_TCP, TCP_CONGESTION, name,
            &namelen) == -1) {
                perror("getsockopt(TCP_CONGESTION) failed");
                (void) close(fd);
                return (1);
        }
        printf("using the \"%s\" cc algorithm\n", name);

        if (setsockopt(fd, SOL_SOCKET, SO_KEEPALIVE, &y,
            sizeof (y)) == -1) {
                perror("setsockopt(SO_KEEPALIVE) failed");
                (void) close(fd);
                return (1);
        }

        while ((rlen = read(fd, buf, sizeof (buf))) > 0) {
                if (fwrite(buf, 1, (size_t)rlen, stdout) != (size_t)rlen) {
                        perror("fwrite() failed");
                        break;
                }
        }

        if (rlen == -1) {
                perror("read() failed");
        }

        fflush(stdout);

        if (close(fd) == -1) {
                perror("close() failed");
        }

        return (0);
}
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <netdb.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <arpa/inet.h>

/*
 * Print to stdout how many bytes were sent to the peer whose address is `s'.
 * An AF_INET peer is shown as addr:port, an AF_INET6 peer as [addr]:port,
 * and any other address family as "unknown client".
 */
void
logmsg(struct sockaddr *s, int bytes)
{
        /* Sized for the longer (IPv6) form, so it fits either family. */
        char host[INET6_ADDRSTRLEN];

        if (s->sa_family == AF_INET) {
                struct sockaddr_in *in4 = (struct sockaddr_in *)s;

                inet_ntop(AF_INET, &in4->sin_addr, host, sizeof (host));
                printf("sent %d bytes to %s:%d\n",
                    bytes, host, ntohs(in4->sin_port));
        } else if (s->sa_family == AF_INET6) {
                struct sockaddr_in6 *in6 = (struct sockaddr_in6 *)s;

                inet_ntop(AF_INET6, &in6->sin6_addr, host, sizeof (host));
                printf("sent %d bytes to [%s]:%d\n",
                    bytes, host, ntohs(in6->sin6_port));
        } else {
                printf("sent %d bytes to unknown client\n", bytes);
        }
}

/*
 * Test server: bind to <port> on the first usable wildcard address, then
 * accept up to five connection attempts.  Each accepted client is sent
 * <message> in full (or as much as could be written before an error), the
 * byte count is logged via logmsg(), and the connection is closed.
 *
 * Returns 1 on a usage error, a failed name lookup, a failure to bind any
 * returned address, or a listen() failure; otherwise returns 0.  Failures
 * of accept(), write() and close() are reported on stderr and do not change
 * the exit status.
 *
 * NOTE(review): SIGPIPE is left at its default disposition, so a client that
 * disconnects before the message is written can still terminate the server.
 */
int
main(int argc, char *argv[])
{
        struct addrinfo hints, *gair, *p;
        size_t msglen, sent;
        ssize_t n;
        int sfd = -1;
        int cfd;
        int rv;

        if (argc != 3) {
                fprintf(stderr, "%s <port> <message>\n", argv[0]);
                return (1);
        }

        msglen = strlen(argv[2]);

        memset(&hints, 0, sizeof (hints));
        hints.ai_family = PF_UNSPEC;
        hints.ai_socktype = SOCK_STREAM;
        hints.ai_flags = AI_PASSIVE;

        if ((rv = getaddrinfo(NULL, argv[1], &hints, &gair)) != 0) {
                fprintf(stderr, "getaddrinfo() failed: %s\n",
                    gai_strerror(rv));
                return (1);
        }

        /* Try each candidate address until one can be bound. */
        for (p = gair; p != NULL; p = p->ai_next) {
                sfd = socket(p->ai_family, p->ai_socktype, p->ai_protocol);
                if (sfd == -1) {
                        perror("socket() failed");
                        continue;
                }

                if (bind(sfd, p->ai_addr, p->ai_addrlen) == 0)
                        break;

                /* Report before close(), which may overwrite errno. */
                perror("bind() failed");
                (void) close(sfd);
                sfd = -1;
        }

        /* The address list is no longer needed, whether or not we bound. */
        freeaddrinfo(gair);

        if (sfd == -1) {
                fprintf(stderr, "server failed to bind()\n");
                return (1);
        }

        if (listen(sfd, 1024) != 0) {
                perror("listen() failed");
                (void) close(sfd);
                return (1);
        }

        fprintf(stdout, "waiting for clients...\n");

        for (int times = 0; times < 5; times++) {
                struct sockaddr_storage stor;
                socklen_t alen = sizeof (stor);
                struct sockaddr *addr = (struct sockaddr *)&stor;

                if ((cfd = accept(sfd, addr, &alen)) == -1) {
                        perror("accept() failed");
                        continue;
                }

                /*
                 * write() may accept fewer bytes than requested, so keep
                 * going until the whole message is out.  A -1 return must
                 * not be folded into the running offset: stop and log what
                 * was actually sent.
                 */
                sent = 0;
                while (sent < msglen) {
                        n = write(cfd, argv[2] + sent, msglen - sent);
                        if (n == -1) {
                                perror("write() failed");
                                break;
                        }
                        sent += (size_t)n;
                }

                logmsg(addr, (int)sent);

                if (close(cfd) == -1) {
                        perror("close(cfd) failed");
                }
        }

        if (close(sfd) == -1) {
                perror("close(sfd) failed");
        }

        fprintf(stdout, "finished.\n");

        return (0);
}

Running the client program, and also updating the current congestion control algorithm, we can see that getsockopt(TCP_CONGESTION) is reporting the right algorithm:

root@5a0d3532-1b43-4459-f8e4-b5fc36a7cd26:~# echo sunreno > /proc/sys/net/ipv4/tcp_congestion_control
root@5a0d3532-1b43-4459-f8e4-b5fc36a7cd26:~# cat /proc/sys/net/ipv4/tcp_congestion_control
sunreno
root@5a0d3532-1b43-4459-f8e4-b5fc36a7cd26:~# ./client 127.0.0.1 8080
using the "sunreno" cc algorithm
hello
root@5a0d3532-1b43-4459-f8e4-b5fc36a7cd26:~# echo newreno > /proc/sys/net/ipv4/tcp_congestion_control 
root@5a0d3532-1b43-4459-f8e4-b5fc36a7cd26:~# cat /proc/sys/net/ipv4/tcp_congestion_control
newreno
root@5a0d3532-1b43-4459-f8e4-b5fc36a7cd26:~# ./client 127.0.0.1 8080
using the "newreno" cc algorithm
hello
root@5a0d3532-1b43-4459-f8e4-b5fc36a7cd26:~# echo cubic > /proc/sys/net/ipv4/tcp_congestion_control 
root@5a0d3532-1b43-4459-f8e4-b5fc36a7cd26:~# cat /proc/sys/net/ipv4/tcp_congestion_control
cubic
root@5a0d3532-1b43-4459-f8e4-b5fc36a7cd26:~# ./client 127.0.0.1 8080
using the "cubic" cc algorithm
hello

If we try writing nonexistent names to the file, then we get EINVAL:

root@5a0d3532-1b43-4459-f8e4-b5fc36a7cd26:~# echo cubic2 > /proc/sys/net/ipv4/tcp_congestion_control 
-bash: echo: write error: Invalid argument
root@5a0d3532-1b43-4459-f8e4-b5fc36a7cd26:~# echo foobar > /proc/sys/net/ipv4/tcp_congestion_control 
-bash: echo: write error: Invalid argument
root@5a0d3532-1b43-4459-f8e4-b5fc36a7cd26:~# strace -y -e trace=write echo foobar > /proc/sys/net/ipv4/tcp_congestion_control 
write(1</proc/sys/net/ipv4/tcp_congestion_control>, "foobar\n", 7) = -1 EINVAL (Invalid argument)
write(2</dev/pts/4>, "echo: ", 6echo: )       = 6
write(2</dev/pts/4>, "write error", 11write error) = 11
write(2</dev/pts/4>, ": Invalid argument", 18: Invalid argument) = 18
write(2</dev/pts/4>, "\n", 1
)           = 1
+++ exited with 1 +++

Compared to Linux, where the same write fails with ENOENT rather than EINVAL:

root@enlil ~ # strace -y -e trace=writev echo foobar > /proc/sys/net/ipv4/tcp_congestion_control
writev(1</proc/sys/net/ipv4/tcp_congestion_control>, [{iov_base="foobar", iov_len=6}, {iov_base="\n", iov_len=1}], 2) = -1 ENOENT (No such file or directory)
writev(2</dev/pts/12>, [{iov_base="echo: ", iov_len=6}, {iov_base=NULL, iov_len=0}], 2echo: ) = 6
writev(2</dev/pts/12>, [{iov_base="write error", iov_len=11}, {iov_base=NULL, iov_len=0}], 2write error) = 11
writev(2</dev/pts/12>, [{iov_base="", iov_len=0}, {iov_base="\n", iov_len=1}], 2
) = 1
+++ exited with 1 +++

If we try writing to the other newly added files, we can see that nothing in them changes:

root@5a0d3532-1b43-4459-f8e4-b5fc36a7cd26:~# echo cubic > /proc/sys/net/ipv4/tcp_allowed_congestion_control 
root@5a0d3532-1b43-4459-f8e4-b5fc36a7cd26:~# cat /proc/sys/net/ipv4/tcp_allowed_congestion_control
sunreno newreno cubic
root@5a0d3532-1b43-4459-f8e4-b5fc36a7cd26:~# echo cubic > /proc/sys/net/ipv4/tcp_available_congestion_control 
root@5a0d3532-1b43-4459-f8e4-b5fc36a7cd26:~# cat /proc/sys/net/ipv4/tcp_available_congestion_control
sunreno newreno cubic

On Linux, writing to the tcp_allowed_congestion_control file changes the set of allowed algorithms. We don't have a concept of an allowed set of algorithms currently, so there's nothing for this to do.

One of the most useful parts of the TCP_CONGESTION socket option is that it allows setting connection-specific algorithms. iperf uses this to allow comparing the performance of different algorithms. From inside my LX zone, I am able to use all of the available algorithms:

root@5a0d3532-1b43-4459-f8e4-b5fc36a7cd26:~# strace -e trace=setsockopt,getsockopt iperf3 -C sunreno -c 37.153.108.173 -p 11845 -t 1
Connecting to host 37.153.108.173, port 11845
setsockopt(4, SOL_TCP, TCP_CONGESTION, "sunreno", 7) = 0
[  4] local 172.26.7.5 port 37080 connected to 37.153.108.173 port 11845
getsockopt(4, SOL_TCP, TCP_INFO, 0x7fffffeff864, 0x7fffffeff7d4) = -1 ENOPROTOOPT (Protocol not available)
iperf3: getsockopt - Protocol not available
[ ID] Interval           Transfer     Bandwidth       Retr  Cwnd
[  4]   0.00-1.00   sec   252 KBytes  2.06 Mbits/sec    0   0.00 Bytes       
- - - - - - - - - - - - - - - - - - - - - - - - -
[ ID] Interval           Transfer     Bandwidth       Retr
[  4]   0.00-1.00   sec   252 KBytes  2.06 Mbits/sec    0             sender
[  4]   0.00-1.00   sec   116 KBytes   948 Kbits/sec                  receiver

iperf Done.
+++ exited with 0 +++
root@5a0d3532-1b43-4459-f8e4-b5fc36a7cd26:~# strace -e trace=setsockopt,getsockopt iperf3 -C newreno -c 37.153.108.173 -p 11845 -t 1
Connecting to host 37.153.108.173, port 11845
setsockopt(4, SOL_TCP, TCP_CONGESTION, "newreno", 7) = 0
[  4] local 172.26.7.5 port 42367 connected to 37.153.108.173 port 11845
getsockopt(4, SOL_TCP, TCP_INFO, 0x7fffffeff864, 0x7fffffeff7d4) = -1 ENOPROTOOPT (Protocol not available)
iperf3: getsockopt - Protocol not available
[ ID] Interval           Transfer     Bandwidth       Retr  Cwnd
[  4]   0.00-1.00   sec   378 KBytes  3.09 Mbits/sec    0   0.00 Bytes       
- - - - - - - - - - - - - - - - - - - - - - - - -
[ ID] Interval           Transfer     Bandwidth       Retr
[  4]   0.00-1.00   sec   378 KBytes  3.09 Mbits/sec    0             sender
[  4]   0.00-1.00   sec   249 KBytes  2.03 Mbits/sec                  receiver

iperf Done.
+++ exited with 0 +++
root@5a0d3532-1b43-4459-f8e4-b5fc36a7cd26:~# strace -e trace=setsockopt,getsockopt iperf3 -C cubic -c 37.153.108.173 -p 11845 -t 1
Connecting to host 37.153.108.173, port 11845
setsockopt(4, SOL_TCP, TCP_CONGESTION, "cubic", 5) = 0
[  4] local 172.26.7.5 port 40579 connected to 37.153.108.173 port 11845
getsockopt(4, SOL_TCP, TCP_INFO, 0x7fffffeff864, 0x7fffffeff7d4) = -1 ENOPROTOOPT (Protocol not available)
iperf3: getsockopt - Protocol not available
[ ID] Interval           Transfer     Bandwidth       Retr  Cwnd
[  4]   0.00-1.00   sec   378 KBytes  3.09 Mbits/sec    0   0.00 Bytes       
- - - - - - - - - - - - - - - - - - - - - - - - -
[ ID] Interval           Transfer     Bandwidth       Retr
[  4]   0.00-1.00   sec   378 KBytes  3.09 Mbits/sec    0             sender
[  4]   0.00-1.00   sec   249 KBytes  2.03 Mbits/sec                  receiver

iperf Done.
+++ exited with 0 +++

As the setsockopt() calls above show, iperf passes only the strlen() of the algorithm name (that is, a length that excludes the terminating NUL), which illumos#11554 accounts for. The getsockopt() complaints are about the TCP_INFO socket option not being implemented; OS-4525 has previously been filed to cover that issue.


Comment by Former user
Created at 2019-08-27T19:03:24.322Z
Updated at 2019-08-27T19:04:11.075Z

Trying to set a nonexistent algorithm with iperf returns ENOENT:

root@5a0d3532-1b43-4459-f8e4-b5fc36a7cd26:~# strace -e trace=setsockopt,getsockopt iperf3 -C cubic4 -c 37.153.108.173 -p 11845 -t 1
Connecting to host 37.153.108.173, port 11845
setsockopt(4, SOL_TCP, TCP_CONGESTION, "cubic4", 6) = -1 ENOENT (No such file or directory)
iperf3: error - unable to set TCP_CONGESTION: Supplied congestion control algorithm not supported on this host
+++ exited with 1 +++
root@5a0d3532-1b43-4459-f8e4-b5fc36a7cd26:~# strace -e trace=setsockopt,getsockopt iperf3 -C foobar -c 37.153.108.173 -p 11845 -t 1
Connecting to host 37.153.108.173, port 11845
setsockopt(4, SOL_TCP, TCP_CONGESTION, "foobar", 6) = -1 ENOENT (No such file or directory)
iperf3: error - unable to set TCP_CONGESTION: Supplied congestion control algorithm not supported on this host
+++ exited with 1 +++

This matches the behaviour on Linux:

strace -e trace=setsockopt,getsockopt iperf3 -C foobar -c 37.153.108.173 -p 11845 -t 1

getsockopt(4, SOL_TCP, TCP_MAXSEG, [1448], [4]) = 0
Connecting to host 37.153.108.173, port 11845
getsockopt(5, SOL_SOCKET, SO_SNDBUF, [16384], [4]) = 0
getsockopt(5, SOL_SOCKET, SO_RCVBUF, [87380], [4]) = 0
setsockopt(5, SOL_TCP, TCP_CONGESTION, "foobar", 6) = -1 ENOENT (No such file or directory)
iperf3: error - unable to set TCP_CONGESTION: Supplied congestion control algorithm not supported on this host
+++ exited with 1 +++

Comment by Jira Bot
Created at 2019-08-27T19:11:32.630Z

illumos-joyent commit df3850281ea4def494e12172cae5a6181823c77d (branch master, by Cody Peter Mello)

OS-7427 Support Linux congestion control interfaces in LX zones
Reviewed by: Patrick Mooney <patrick.mooney@joyent.com>
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Approved by: Jerry Jelinek <jerry.jelinek@joyent.com>