Project

General

Profile

Actions

Feature #7709

closed

hrt2ts and friends are too clever for their own good

Added by Robert Mustacchi over 4 years ago. Updated about 1 year ago.

Status:
Closed
Priority:
Normal
Category:
kernel
Start date:
2016-12-30
Due date:
% Done:

100%

Estimated time:
Difficulty:
Medium
Tags:
Gerrit CR:

Description

Upstreaming of OS-5655 from SmartOS:

The family of functions for converted between time formats (hrt2ts, ts2hrt, etc) use a variety of tricks in an attempt to increase performance. While these techniques may have been valuable long ago, they appear to be harmful on modern x86_64 CPUs.

These routines should be checked and, when it would improve performance, the "optimizations" should be disabled via #ifdef.


I wrote a crude little test program to benchmark the existing code against new "de-optimized" versions:

#include <stdio.h>
#include <limits.h>
#include <sys/types.h>
#include <sys/time.h>

#define TESTCOUNT       10000000

void
hrt2ts_old(hrtime_t hrt, timespec_t *tsp)
{
        uint32_t sec, nsec, tmp;

        tmp = (uint32_t)(hrt >> 30);
        sec = tmp - (tmp >> 2);
        sec = tmp - (sec >> 5);
        sec = tmp + (sec >> 1);
        sec = tmp - (sec >> 6) + 7;
        sec = tmp - (sec >> 3);
        sec = tmp + (sec >> 1);
        sec = tmp + (sec >> 3);
        sec = tmp + (sec >> 4);
        tmp = (sec << 7) - sec - sec - sec;
        tmp = (tmp << 7) - tmp - tmp - tmp;
        tmp = (tmp << 7) - tmp - tmp - tmp;
        nsec = (uint32_t)hrt - (tmp << 9);
        while (nsec >= NANOSEC) {
                nsec -= NANOSEC;
                sec++;
        }
        tsp->tv_sec = (time_t)sec;
        tsp->tv_nsec = nsec;
}
void
hrt2ts_new(hrtime_t hrt, timespec_t *tsp)
{
        tsp->tv_sec = (time_t)(hrt / NANOSEC);
        tsp->tv_nsec = (hrt % NANOSEC);
}

void
do_test_hrt2ts(timespec_t *tsp, void (*tf)(hrtime_t, timespec_t *))
{
        int i;
        hrtime_t hrt;

        hrt = gethrtime();
        for (i = 0; i < TESTCOUNT; i++) {
                tf(hrt, tsp);
        }
}
void
test_hrt2ts(void (*tf)(hrtime_t, timespec_t *), const char *name)
{
        timespec_t start, end;
        uint64_t diff;
        timespec_t tsp;

        clock_gettime(CLOCK_MONOTONIC, &start);
        do_test_hrt2ts(&tsp, tf);
        clock_gettime(CLOCK_MONOTONIC, &end);
        if (end.tv_nsec < start.tv_nsec) {
                end.tv_sec -= 1;
                end.tv_nsec += NANOSEC;
        }
        diff = end.tv_nsec - start.tv_nsec;
        diff += (end.tv_sec - start.tv_sec) * NANOSEC;
        printf("%s out: %lld %lld\n", name, tsp.tv_sec, tsp.tv_nsec);
        printf("%s: %lld for %d iterations (%lldns per)\n", name, diff, TESTCOUNT, diff/TESTCOUNT);

}

hrtime_t
ts2hrt_old(timespec_t *tsp)
{
        hrtime_t hrt;

        hrt = tsp->tv_sec;
        hrt = (hrt << 7) - hrt - hrt - hrt;
        hrt = (hrt << 7) - hrt - hrt - hrt;
        hrt = (hrt << 7) - hrt - hrt - hrt;
        hrt = (hrt << 9) + tsp->tv_nsec;
        return (hrt);
}
hrtime_t
ts2hrt_new(timespec_t *tsp)
{
        return ((tsp->tv_sec * NANOSEC) + tsp->tv_nsec);
}

hrtime_t do_test_ts2hrt(timespec_t *tsp, hrtime_t (*tf)(timespec_t *))
{
        int i;
        hrtime_t hrt;

        hrt = gethrtime();
        for (i = 0; i < TESTCOUNT; i++) {
                hrt = tf(tsp);
        }
        return (hrt);
}
void test_ts2hrt(hrtime_t (*tf)(timespec_t *), const char *name)
{
        timespec_t start, end;
        timestruc_t tsc;
        uint64_t diff;
        hrtime_t hrt;

        clock_gettime(CLOCK_MONOTONIC, &start);
        hrt = do_test_ts2hrt(&start, tf);
        clock_gettime(CLOCK_MONOTONIC, &end);
        if (end.tv_nsec < start.tv_nsec) {
                end.tv_sec -= 1;
                end.tv_nsec += NANOSEC;
        }
        diff = end.tv_nsec - start.tv_nsec;
        diff += (end.tv_sec - start.tv_sec) * NANOSEC;
        printf("%s out: %lld\n", name, hrt);
        printf("%s: %lld for %d iterations (%lldns per)\n", name, diff, TESTCOUNT, diff/TESTCOUNT);

}

void
hrt2tv_old(hrtime_t hrt, struct timeval *tvp)
{
        uint32_t sec, nsec, tmp;
        uint32_t q, r, t;

        tmp = (uint32_t)(hrt >> 30);
        sec = tmp - (tmp >> 2);
        sec = tmp - (sec >> 5);
        sec = tmp + (sec >> 1);
        sec = tmp - (sec >> 6) + 7;
        sec = tmp - (sec >> 3);
        sec = tmp + (sec >> 1);
        sec = tmp + (sec >> 3);
        sec = tmp + (sec >> 4);
        tmp = (sec << 7) - sec - sec - sec;
        tmp = (tmp << 7) - tmp - tmp - tmp;
        tmp = (tmp << 7) - tmp - tmp - tmp;
        nsec = (uint32_t)hrt - (tmp << 9);
        while (nsec >= NANOSEC) {
                nsec -= NANOSEC;
                sec++;
        }
        tvp->tv_sec = (time_t)sec;

        t = (nsec >> 7) + (nsec >> 8) + (nsec >> 12);
        q = (nsec >> 1) + t + (nsec >> 15) + (t >> 11) + (t >> 14);
        q = q >> 9;
        r = nsec - q*1000;
        tvp->tv_usec = q + ((r + 24) >> 10);

}
void
hrt2tv_new(hrtime_t hrt, struct timeval *tvp)
{
        tvp->tv_sec = (time_t)(hrt / NANOSEC);
        tvp->tv_usec = ((hrt % NANOSEC) / 1000);
}

void
do_test_hrt2tv(struct timeval *tvp, void (*tf)(hrtime_t, struct timeval *))
{
        int i;
        hrtime_t hrt;

        hrt = gethrtime();
        for (i = 0; i < TESTCOUNT; i++) {
                tf(hrt, tvp);
        }
}
void
test_hrt2tv(void (*tf)(hrtime_t, struct timeval *), const char *name)
{
        timespec_t start, end;
        uint64_t diff;
        struct timeval tvp;

        clock_gettime(CLOCK_MONOTONIC, &start);
        do_test_hrt2tv(&tvp, tf);
        clock_gettime(CLOCK_MONOTONIC, &end);
        if (end.tv_nsec < start.tv_nsec) {
                end.tv_sec -= 1;
                end.tv_nsec += NANOSEC;
        }
        diff = end.tv_nsec - start.tv_nsec;
        diff += (end.tv_sec - start.tv_sec) * NANOSEC;
        printf("%s out: %ld %ld\n", name, tvp.tv_sec, tvp.tv_usec);
        printf("%s: %lld for %d iterations (%lldns per)\n", name, diff, TESTCOUNT, diff/TESTCOUNT);

}

int main()
{
        int count;

        test_hrt2ts(hrt2ts_old, "hrt2ts_old");
        test_hrt2ts(hrt2ts_new, "hrt2ts_new");

        test_ts2hrt(ts2hrt_old, "ts2hrt_old");
        test_ts2hrt(ts2hrt_new, "ts2hrt_new");

        test_hrt2tv(hrt2tv_old, "hrt2tv_old");
        test_hrt2tv(hrt2tv_new, "hrt2tv_new");
}

Here are the results for 64-bit:

[pmooney@fbf78115-e35d-41ad-8de1-c7729e7ef507 ~]$ gcc -O2 -m64 test.c
[pmooney@fbf78115-e35d-41ad-8de1-c7729e7ef507 ~]$ ./a.out
hrt2ts_old out: 3999579 457906879
hrt2ts_old: 65166214 for 10000000 iterations (6ns per)
hrt2ts_new out: 3999579 523236995
hrt2ts_new: 27384771 for 10000000 iterations (2ns per)
ts2hrt_old out: 3999579550630130
ts2hrt_old: 37698537 for 10000000 iterations (3ns per)
ts2hrt_new out: 3999579588336809
ts2hrt_new: 20534698 for 10000000 iterations (2ns per)
hrt2tv_old out: 3999579 608879
hrt2tv_old: 132647645 for 10000000 iterations (13ns per)
hrt2tv_new out: 3999579 741541
hrt2tv_new: 42581307 for 10000000 iterations (4ns per)

... and 32-bit:

[pmooney@fbf78115-e35d-41ad-8de1-c7729e7ef507 ~]$ gcc -O2 -m32 test.c
[pmooney@fbf78115-e35d-41ad-8de1-c7729e7ef507 ~]$ ./a.out
hrt2ts_old out: 505795097040587133 -81064690078906408
hrt2ts_old: 93591836 for 10000000 iterations (9ns per)
hrt2ts_new out: 908324255269980541 38664705664
hrt2ts_new: 208671683 for 10000000 iterations (20ns per)
ts2hrt_old out: 4000125420170843
ts2hrt_old: 121201904 for 10000000 iterations (12ns per)
ts2hrt_new out: 4000125541384366
ts2hrt_new: 30899755 for 10000000 iterations (3ns per)
hrt2tv_old out: 4000125 572295
hrt2tv_old: 161604889 for 10000000 iterations (16ns per)
hrt2tv_new out: 4000125 733913
hrt2tv_new: 310336290 for 10000000 iterations (31ns per)

Related issues

Related to illumos gate - Bug #13088: uniqtime() is too clever for its own goodNewshua ll

Actions
Actions #1

Updated by Patrick Mooney about 1 year ago

  • Description updated (diff)
  • Status changed from New to In Progress
Actions #2

Updated by Electric Monk about 1 year ago

  • Gerrit CR set to 878
Actions #3

Updated by Toomas Soome about 1 year ago

Robert Mustacchi wrote:

Upstreaming of OS-5655 from SmartOS:

The family of functions for converted between time formats (hrt2ts, ts2hrt, etc) use a variety of tricks in an attempt to increase performance. While these techniques may have been valuable long ago, they appear to be harmful on modern x86_64 CPUs.

These routines should be checked and, when it would improve performance, the "optimizations" should be disabled via #ifdef.


I wrote a crude little test program to benchmark the existing code against new "de-optimized" versions:

[...]

Here are the results for 64-bit:

[...]

... and 32-bit:

[...]

T4-1, 32-bit:

tsoome@openindiana:~$ gcc -O2 -m32 test.c 
tsoome@openindiana:~$ ./a.out 
hrt2ts_old out: 84495617514786895 999999999
hrt2ts_old: 234592524 for 10000000 iterations (23ns per)
hrt2ts_new out: 84495617749683377 4282277792
hrt2ts_new: 1888438937 for 10000000 iterations (188ns per)
ts2hrt_old out: 19673171174531137
ts2hrt_old: 193794275 for 10000000 iterations (19ns per)
ts2hrt_new out: 19673171368409185
ts2hrt_new: 178229970 for 10000000 iterations (17ns per)
hrt2tv_old out: 19673171 546713
hrt2tv_old: 318258823 for 10000000 iterations (31ns per)
hrt2tv_new out: 19673171 865045
hrt2tv_new: 3042147022 for 10000000 iterations (304ns per)
tsoome@openindiana:~$ ./a.out 
hrt2ts_old out: 84495691227635539 999999999
hrt2ts_old: 234414399 for 10000000 iterations (23ns per)
hrt2ts_new out: 84495691462327677 4273364896
hrt2ts_new: 1922463215 for 10000000 iterations (192ns per)
ts2hrt_old out: 19673188906760687
ts2hrt_old: 193781875 for 10000000 iterations (19ns per)
ts2hrt_new out: 19673189100628807
ts2hrt_new: 178153574 for 10000000 iterations (17ns per)
hrt2tv_old out: 19673189 278864
hrt2tv_old: 318340623 for 10000000 iterations (31ns per)
hrt2tv_new out: 19673189 597285
hrt2tv_new: 2966634967 for 10000000 iterations (296ns per)
tsoome@openindiana:~$ ./a.out 
hrt2ts_old out: 84495729606530991 999999999
hrt2ts_old: 219752353 for 10000000 iterations (21ns per)
hrt2ts_new out: 84495729826572811 4280704928
hrt2ts_new: 1929243810 for 10000000 iterations (192ns per)
ts2hrt_old out: 19673197623070955
ts2hrt_old: 194261624 for 10000000 iterations (19ns per)
ts2hrt_new out: 19673197817370648
ts2hrt_new: 178021785 for 10000000 iterations (17ns per)
hrt2tv_old out: 19673197 995448
hrt2tv_old: 310441440 for 10000000 iterations (31ns per)
hrt2tv_new out: 19673198 305969
hrt2tv_new: 3176957658 for 10000000 iterations (317ns per)
tsoome@openindiana:~$ ./a.out 
hrt2ts_old out: 84495763940848102 999999999
hrt2ts_old: 234558427 for 10000000 iterations (23ns per)
hrt2ts_new out: 84495764175703032 4281229216
hrt2ts_new: 1833719577 for 10000000 iterations (183ns per)
ts2hrt_old out: 19673205516940006
ts2hrt_old: 193646554 for 10000000 iterations (19ns per)
ts2hrt_new out: 19673205710664689
ts2hrt_new: 178147799 for 10000000 iterations (17ns per)
hrt2tv_old out: 19673205 888852
hrt2tv_old: 310525583 for 10000000 iterations (31ns per)
hrt2tv_new out: 19673206 199457
hrt2tv_new: 2906462679 for 10000000 iterations (290ns per)
tsoome@openindiana:~$ 

and 64-bit:
tsoome@openindiana:~$ gcc -O2 -m64 test.c 
tsoome@openindiana:~$ ./a.out 
hrt2ts_old out: 19673374 956585860
hrt2ts_old: 254113953 for 10000000 iterations (25ns per)
hrt2ts_new out: 19673375 210882900
hrt2ts_new: 335590822 for 10000000 iterations (33ns per)
ts2hrt_old out: 19673375546550559
ts2hrt_old: 159719108 for 10000000 iterations (15ns per)
ts2hrt_new out: 19673375706303072
ts2hrt_new: 159336552 for 10000000 iterations (15ns per)
hrt2tv_old out: 19673375 865676
hrt2tv_old: 306799758 for 10000000 iterations (30ns per)
hrt2tv_new out: 19673376 172548
hrt2tv_new: 394085482 for 10000000 iterations (39ns per)
tsoome@openindiana:~$ ./a.out 
hrt2ts_old out: 19673377 944222778
hrt2ts_old: 260693668 for 10000000 iterations (26ns per)
hrt2ts_new out: 19673378 205108440
hrt2ts_new: 335664606 for 10000000 iterations (33ns per)
ts2hrt_old out: 19673378540846722
ts2hrt_old: 159704898 for 10000000 iterations (15ns per)
ts2hrt_new out: 19673378700605260
ts2hrt_new: 159608876 for 10000000 iterations (15ns per)
hrt2tv_old out: 19673378 860253
hrt2tv_old: 316960428 for 10000000 iterations (31ns per)
hrt2tv_new out: 19673379 177288
hrt2tv_new: 400119488 for 10000000 iterations (40ns per)
tsoome@openindiana:~$ ./a.out 
hrt2ts_old out: 19673380 664675660
hrt2ts_old: 252873684 for 10000000 iterations (25ns per)
hrt2ts_new out: 19673380 917712166
hrt2ts_new: 335180784 for 10000000 iterations (33ns per)
ts2hrt_old out: 19673381252977553
ts2hrt_old: 159690377 for 10000000 iterations (15ns per)
ts2hrt_new out: 19673381412702886
ts2hrt_new: 159188291 for 10000000 iterations (15ns per)
hrt2tv_old out: 19673381 571965
hrt2tv_old: 306483761 for 10000000 iterations (30ns per)
hrt2tv_new out: 19673381 878484
hrt2tv_new: 399461253 for 10000000 iterations (39ns per)
tsoome@openindiana:~$ ./a.out 
hrt2ts_old out: 19673383 328477505
hrt2ts_old: 253925602 for 10000000 iterations (25ns per)
hrt2ts_new out: 19673383 582600245
hrt2ts_new: 335217201 for 10000000 iterations (33ns per)
ts2hrt_old out: 19673383917868225
ts2hrt_old: 159576871 for 10000000 iterations (15ns per)
ts2hrt_new out: 19673384077525296
ts2hrt_new: 160282000 for 10000000 iterations (16ns per)
hrt2tv_old out: 19673384 237896
hrt2tv_old: 316315545 for 10000000 iterations (31ns per)
hrt2tv_new out: 19673384 554294
hrt2tv_new: 393278726 for 10000000 iterations (39ns per)
tsoome@openindiana:~$ 

gcc version 7.5.0 (OpenIndiana 7.5.0-il-0)

Actions #4

Updated by Patrick Mooney about 1 year ago

  • Related to Bug #13088: uniqtime() is too clever for its own good added
Actions #5

Updated by Patrick Mooney about 1 year ago

This change has been present in SmartOS for years without presenting a problem. With it built on top of illumos-gate, I booted up a BE and verified (via mdb -k) that hrt2ts was not implemented through the old code. Following that, I dtrace its calls and checked inputs/outputs against calculated values to make sure they looked correct, which they did.

Actions #6

Updated by Electric Monk about 1 year ago

  • Status changed from In Progress to Closed

git commit 88b8d9620aed414dab5fb34108dee58556f060f0

commit  88b8d9620aed414dab5fb34108dee58556f060f0
Author: Patrick Mooney <pmooney@pfmooney.com>
Date:   2020-09-04T20:37:06.000Z

    7709 hrt2ts and friends are too clever for their own good
    Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
    Reviewed by: Jason King <jason.king@joyent.com>
    Reviewed by: Toomas Soome <tsoome@me.com>
    Approved by: Richard Lowe <richlowe@richlowe.net>

Actions

Also available in: Atom PDF