OS-5655: hrt2ts and friends are too clever for their own good

Details

Issue Type:Improvement
Priority:4 - Normal
Status:Resolved
Created at:2016-09-07T00:15:51.000Z
Updated at:2020-04-09T14:00:19.781Z

People

Created by:Patrick Mooney [X]
Reported by:Patrick Mooney [X]
Assigned to:Patrick Mooney [X]

Resolution

Fixed: A fix for this issue is checked into the tree and tested.
(Resolution Date: 2016-09-20T14:32:59.000Z)

Fix Versions

2016-09-29 Ziggy (Release Date: 2016-09-29)

Related Links

Description

The family of functions for converted between time formats (hrt2ts, ts2hrt, etc) use a variety of tricks in an attempt to increase performance. While these techniques may have been valuable long ago, they appear to be harmful on modern x86_64 CPUs.

These routines should be checked and, when it would improve performance, the "optimizations" should be disabled via #ifdef.

Comments

Comment by Patrick Mooney [X]
Created at 2016-09-07T00:28:48.000Z
Updated at 2016-09-07T00:37:19.000Z

I wrote a crude little test program to benchmark the existing code against new "de-optimized" versions:

#include <stdio.h>
#include <limits.h>
#include <sys/types.h>
#include <sys/time.h>

#define TESTCOUNT       10000000

void
hrt2ts_old(hrtime_t hrt, timespec_t *tsp)
{
        uint32_t sec, nsec, tmp;

        tmp = (uint32_t)(hrt >> 30);
        sec = tmp - (tmp >> 2);
        sec = tmp - (sec >> 5);
        sec = tmp + (sec >> 1);
        sec = tmp - (sec >> 6) + 7;
        sec = tmp - (sec >> 3);
        sec = tmp + (sec >> 1);
        sec = tmp + (sec >> 3);
        sec = tmp + (sec >> 4);
        tmp = (sec << 7) - sec - sec - sec;
        tmp = (tmp << 7) - tmp - tmp - tmp;
        tmp = (tmp << 7) - tmp - tmp - tmp;
        nsec = (uint32_t)hrt - (tmp << 9);
        while (nsec >= NANOSEC) {
                nsec -= NANOSEC;
                sec++;
        }
        tsp->tv_sec = (time_t)sec;
        tsp->tv_nsec = nsec;
}
void
hrt2ts_new(hrtime_t hrt, timespec_t *tsp)
{
        tsp->tv_sec = (time_t)(hrt / NANOSEC);
        tsp->tv_nsec = (hrt % NANOSEC);
}

void
do_test_hrt2ts(timespec_t *tsp, void (*tf)(hrtime_t, timespec_t *))
{
        int i;
        hrtime_t hrt;

        hrt = gethrtime();
        for (i = 0; i < TESTCOUNT; i++) {
                tf(hrt, tsp);
        }
}
void
test_hrt2ts(void (*tf)(hrtime_t, timespec_t *), const char *name)
{
        timespec_t start, end;
        uint64_t diff;
        timespec_t tsp;

        clock_gettime(CLOCK_MONOTONIC, &start);
        do_test_hrt2ts(&tsp, tf);
        clock_gettime(CLOCK_MONOTONIC, &end);
        if (end.tv_nsec < start.tv_nsec) {
                end.tv_sec -= 1;
                end.tv_nsec += NANOSEC;
        }
        diff = end.tv_nsec - start.tv_nsec;
        diff += (end.tv_sec - start.tv_sec) * NANOSEC;
        printf("%s out: %lld %lld\n", name, tsp.tv_sec, tsp.tv_nsec);
        printf("%s: %lld for %d iterations (%lldns per)\n", name, diff, TESTCOUNT, diff/TESTCOUNT);

}


hrtime_t
ts2hrt_old(timespec_t *tsp)
{
        hrtime_t hrt;

        hrt = tsp->tv_sec;
        hrt = (hrt << 7) - hrt - hrt - hrt;
        hrt = (hrt << 7) - hrt - hrt - hrt;
        hrt = (hrt << 7) - hrt - hrt - hrt;
        hrt = (hrt << 9) + tsp->tv_nsec;
        return (hrt);
}
hrtime_t
ts2hrt_new(timespec_t *tsp)
{
        return ((tsp->tv_sec * NANOSEC) + tsp->tv_nsec);
}

hrtime_t do_test_ts2hrt(timespec_t *tsp, hrtime_t (*tf)(timespec_t *))
{
        int i;
        hrtime_t hrt;

        hrt = gethrtime();
        for (i = 0; i < TESTCOUNT; i++) {
                hrt = tf(tsp);
        }
        return (hrt);
}
void test_ts2hrt(hrtime_t (*tf)(timespec_t *), const char *name)
{
        timespec_t start, end;
        timestruc_t tsc;
        uint64_t diff;
        hrtime_t hrt;

        clock_gettime(CLOCK_MONOTONIC, &start);
        hrt = do_test_ts2hrt(&start, tf);
        clock_gettime(CLOCK_MONOTONIC, &end);
        if (end.tv_nsec < start.tv_nsec) {
                end.tv_sec -= 1;
                end.tv_nsec += NANOSEC;
        }
        diff = end.tv_nsec - start.tv_nsec;
        diff += (end.tv_sec - start.tv_sec) * NANOSEC;
        printf("%s out: %lld\n", name, hrt);
        printf("%s: %lld for %d iterations (%lldns per)\n", name, diff, TESTCOUNT, diff/TESTCOUNT);

}

void
hrt2tv_old(hrtime_t hrt, struct timeval *tvp)
{
        uint32_t sec, nsec, tmp;
        uint32_t q, r, t;

        tmp = (uint32_t)(hrt >> 30);
        sec = tmp - (tmp >> 2);
        sec = tmp - (sec >> 5);
        sec = tmp + (sec >> 1);
        sec = tmp - (sec >> 6) + 7;
        sec = tmp - (sec >> 3);
        sec = tmp + (sec >> 1);
        sec = tmp + (sec >> 3);
        sec = tmp + (sec >> 4);
        tmp = (sec << 7) - sec - sec - sec;
        tmp = (tmp << 7) - tmp - tmp - tmp;
        tmp = (tmp << 7) - tmp - tmp - tmp;
        nsec = (uint32_t)hrt - (tmp << 9);
        while (nsec >= NANOSEC) {
                nsec -= NANOSEC;
                sec++;
        }
        tvp->tv_sec = (time_t)sec;

        t = (nsec >> 7) + (nsec >> 8) + (nsec >> 12);
        q = (nsec >> 1) + t + (nsec >> 15) + (t >> 11) + (t >> 14);
        q = q >> 9;
        r = nsec - q*1000;
        tvp->tv_usec = q + ((r + 24) >> 10);

}
void
hrt2tv_new(hrtime_t hrt, struct timeval *tvp)
{
        tvp->tv_sec = (time_t)(hrt / NANOSEC);
        tvp->tv_usec = ((hrt % NANOSEC) / 1000);
}

void
do_test_hrt2tv(struct timeval *tvp, void (*tf)(hrtime_t, struct timeval *))
{
        int i;
        hrtime_t hrt;

        hrt = gethrtime();
        for (i = 0; i < TESTCOUNT; i++) {
                tf(hrt, tvp);
        }
}
void
test_hrt2tv(void (*tf)(hrtime_t, struct timeval *), const char *name)
{
        timespec_t start, end;
        uint64_t diff;
        struct timeval tvp;

        clock_gettime(CLOCK_MONOTONIC, &start);
        do_test_hrt2tv(&tvp, tf);
        clock_gettime(CLOCK_MONOTONIC, &end);
        if (end.tv_nsec < start.tv_nsec) {
                end.tv_sec -= 1;
                end.tv_nsec += NANOSEC;
        }
        diff = end.tv_nsec - start.tv_nsec;
        diff += (end.tv_sec - start.tv_sec) * NANOSEC;
        printf("%s out: %ld %ld\n", name, tvp.tv_sec, tvp.tv_usec);
        printf("%s: %lld for %d iterations (%lldns per)\n", name, diff, TESTCOUNT, diff/TESTCOUNT);

}

int main()
{
        int count;

        test_hrt2ts(hrt2ts_old, "hrt2ts_old");
        test_hrt2ts(hrt2ts_new, "hrt2ts_new");

        test_ts2hrt(ts2hrt_old, "ts2hrt_old");
        test_ts2hrt(ts2hrt_new, "ts2hrt_new");

        test_hrt2tv(hrt2tv_old, "hrt2tv_old");
        test_hrt2tv(hrt2tv_new, "hrt2tv_new");
}

Here are the results for 64-bit:

[pmooney@fbf78115-e35d-41ad-8de1-c7729e7ef507 ~]$ gcc -O2 -m64 test.c
[pmooney@fbf78115-e35d-41ad-8de1-c7729e7ef507 ~]$ ./a.out
hrt2ts_old out: 3999579 457906879
hrt2ts_old: 65166214 for 10000000 iterations (6ns per)
hrt2ts_new out: 3999579 523236995
hrt2ts_new: 27384771 for 10000000 iterations (2ns per)
ts2hrt_old out: 3999579550630130
ts2hrt_old: 37698537 for 10000000 iterations (3ns per)
ts2hrt_new out: 3999579588336809
ts2hrt_new: 20534698 for 10000000 iterations (2ns per)
hrt2tv_old out: 3999579 608879
hrt2tv_old: 132647645 for 10000000 iterations (13ns per)
hrt2tv_new out: 3999579 741541
hrt2tv_new: 42581307 for 10000000 iterations (4ns per)

... and 32-bit:

[pmooney@fbf78115-e35d-41ad-8de1-c7729e7ef507 ~]$ gcc -O2 -m32 test.c
[pmooney@fbf78115-e35d-41ad-8de1-c7729e7ef507 ~]$ ./a.out
hrt2ts_old out: 505795097040587133 -81064690078906408
hrt2ts_old: 93591836 for 10000000 iterations (9ns per)
hrt2ts_new out: 908324255269980541 38664705664
hrt2ts_new: 208671683 for 10000000 iterations (20ns per)
ts2hrt_old out: 4000125420170843
ts2hrt_old: 121201904 for 10000000 iterations (12ns per)
ts2hrt_new out: 4000125541384366
ts2hrt_new: 30899755 for 10000000 iterations (3ns per)
hrt2tv_old out: 4000125 572295
hrt2tv_old: 161604889 for 10000000 iterations (16ns per)
hrt2tv_new out: 4000125 733913
hrt2tv_new: 310336290 for 10000000 iterations (31ns per)

This would suggest that ts2hrt should always use the simplified version on both i386 and amd64 while the hrt2t[vs] functions should be simplified only for amd64.


Comment by Jerry Jelinek [X]
Created at 2016-09-07T12:52:42.000Z
Updated at 2017-12-14T17:24:03.661Z

It looks like it is worthwhile to use the "new" implementation for ts2hrt on 32-bit. Do you want to add that to your fix?


Comment by Patrick Mooney [X]
Created at 2016-09-07T14:32:34.000Z

@jerry Yeah, it was a typo on my part. The addition defined() check was added to the conditional, but I missed changing it from amd64 to i386.


Comment by Patrick Mooney [X]
Created at 2016-09-07T18:46:58.000Z

I checked the output from clang 3.7 and its output was similar to gcc with respect to optimizing the division into cheaper multiplies. There aren't any test results as linker issues prevented me from exploring further.


Comment by Bot Bot [X]
Created at 2016-09-16T21:24:03.000Z

illumos-joyent commit 6a41e72 (branch master, by Patrick Mooney)

OS-5655 hrt2ts and friends are too clever for their own good
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Approved by: Jerry Jelinek <jerry.jelinek@joyent.com>