Feature #7709
closedhrt2ts and friends are too clever for their own good
100%
Description
Upstreaming of OS-5655 from SmartOS:
The family of functions for converted between time formats (hrt2ts, ts2hrt, etc) use a variety of tricks in an attempt to increase performance. While these techniques may have been valuable long ago, they appear to be harmful on modern x86_64 CPUs.
These routines should be checked and, when it would improve performance, the "optimizations" should be disabled via #ifdef.
I wrote a crude little test program to benchmark the existing code against new "de-optimized" versions:
#include <stdio.h> #include <limits.h> #include <sys/types.h> #include <sys/time.h> #define TESTCOUNT 10000000 void hrt2ts_old(hrtime_t hrt, timespec_t *tsp) { uint32_t sec, nsec, tmp; tmp = (uint32_t)(hrt >> 30); sec = tmp - (tmp >> 2); sec = tmp - (sec >> 5); sec = tmp + (sec >> 1); sec = tmp - (sec >> 6) + 7; sec = tmp - (sec >> 3); sec = tmp + (sec >> 1); sec = tmp + (sec >> 3); sec = tmp + (sec >> 4); tmp = (sec << 7) - sec - sec - sec; tmp = (tmp << 7) - tmp - tmp - tmp; tmp = (tmp << 7) - tmp - tmp - tmp; nsec = (uint32_t)hrt - (tmp << 9); while (nsec >= NANOSEC) { nsec -= NANOSEC; sec++; } tsp->tv_sec = (time_t)sec; tsp->tv_nsec = nsec; } void hrt2ts_new(hrtime_t hrt, timespec_t *tsp) { tsp->tv_sec = (time_t)(hrt / NANOSEC); tsp->tv_nsec = (hrt % NANOSEC); } void do_test_hrt2ts(timespec_t *tsp, void (*tf)(hrtime_t, timespec_t *)) { int i; hrtime_t hrt; hrt = gethrtime(); for (i = 0; i < TESTCOUNT; i++) { tf(hrt, tsp); } } void test_hrt2ts(void (*tf)(hrtime_t, timespec_t *), const char *name) { timespec_t start, end; uint64_t diff; timespec_t tsp; clock_gettime(CLOCK_MONOTONIC, &start); do_test_hrt2ts(&tsp, tf); clock_gettime(CLOCK_MONOTONIC, &end); if (end.tv_nsec < start.tv_nsec) { end.tv_sec -= 1; end.tv_nsec += NANOSEC; } diff = end.tv_nsec - start.tv_nsec; diff += (end.tv_sec - start.tv_sec) * NANOSEC; printf("%s out: %lld %lld\n", name, tsp.tv_sec, tsp.tv_nsec); printf("%s: %lld for %d iterations (%lldns per)\n", name, diff, TESTCOUNT, diff/TESTCOUNT); } hrtime_t ts2hrt_old(timespec_t *tsp) { hrtime_t hrt; hrt = tsp->tv_sec; hrt = (hrt << 7) - hrt - hrt - hrt; hrt = (hrt << 7) - hrt - hrt - hrt; hrt = (hrt << 7) - hrt - hrt - hrt; hrt = (hrt << 9) + tsp->tv_nsec; return (hrt); } hrtime_t ts2hrt_new(timespec_t *tsp) { return ((tsp->tv_sec * NANOSEC) + tsp->tv_nsec); } hrtime_t do_test_ts2hrt(timespec_t *tsp, hrtime_t (*tf)(timespec_t *)) { int i; hrtime_t hrt; hrt = gethrtime(); for (i = 0; i < TESTCOUNT; i++) { hrt = tf(tsp); } return (hrt); } void test_ts2hrt(hrtime_t (*tf)(timespec_t *), const char *name) { timespec_t start, end; timestruc_t tsc; uint64_t diff; hrtime_t hrt; clock_gettime(CLOCK_MONOTONIC, &start); hrt = do_test_ts2hrt(&start, tf); clock_gettime(CLOCK_MONOTONIC, &end); if (end.tv_nsec < start.tv_nsec) { end.tv_sec -= 1; end.tv_nsec += NANOSEC; } diff = end.tv_nsec - start.tv_nsec; diff += (end.tv_sec - start.tv_sec) * NANOSEC; printf("%s out: %lld\n", name, hrt); printf("%s: %lld for %d iterations (%lldns per)\n", name, diff, TESTCOUNT, diff/TESTCOUNT); } void hrt2tv_old(hrtime_t hrt, struct timeval *tvp) { uint32_t sec, nsec, tmp; uint32_t q, r, t; tmp = (uint32_t)(hrt >> 30); sec = tmp - (tmp >> 2); sec = tmp - (sec >> 5); sec = tmp + (sec >> 1); sec = tmp - (sec >> 6) + 7; sec = tmp - (sec >> 3); sec = tmp + (sec >> 1); sec = tmp + (sec >> 3); sec = tmp + (sec >> 4); tmp = (sec << 7) - sec - sec - sec; tmp = (tmp << 7) - tmp - tmp - tmp; tmp = (tmp << 7) - tmp - tmp - tmp; nsec = (uint32_t)hrt - (tmp << 9); while (nsec >= NANOSEC) { nsec -= NANOSEC; sec++; } tvp->tv_sec = (time_t)sec; t = (nsec >> 7) + (nsec >> 8) + (nsec >> 12); q = (nsec >> 1) + t + (nsec >> 15) + (t >> 11) + (t >> 14); q = q >> 9; r = nsec - q*1000; tvp->tv_usec = q + ((r + 24) >> 10); } void hrt2tv_new(hrtime_t hrt, struct timeval *tvp) { tvp->tv_sec = (time_t)(hrt / NANOSEC); tvp->tv_usec = ((hrt % NANOSEC) / 1000); } void do_test_hrt2tv(struct timeval *tvp, void (*tf)(hrtime_t, struct timeval *)) { int i; hrtime_t hrt; hrt = gethrtime(); for (i = 0; i < TESTCOUNT; i++) { tf(hrt, tvp); } } void test_hrt2tv(void (*tf)(hrtime_t, struct timeval *), const char *name) { timespec_t start, end; uint64_t diff; struct timeval tvp; clock_gettime(CLOCK_MONOTONIC, &start); do_test_hrt2tv(&tvp, tf); clock_gettime(CLOCK_MONOTONIC, &end); if (end.tv_nsec < start.tv_nsec) { end.tv_sec -= 1; end.tv_nsec += NANOSEC; } diff = end.tv_nsec - start.tv_nsec; diff += (end.tv_sec - start.tv_sec) * NANOSEC; printf("%s out: %ld %ld\n", name, tvp.tv_sec, tvp.tv_usec); printf("%s: %lld for %d iterations (%lldns per)\n", name, diff, TESTCOUNT, diff/TESTCOUNT); } int main() { int count; test_hrt2ts(hrt2ts_old, "hrt2ts_old"); test_hrt2ts(hrt2ts_new, "hrt2ts_new"); test_ts2hrt(ts2hrt_old, "ts2hrt_old"); test_ts2hrt(ts2hrt_new, "ts2hrt_new"); test_hrt2tv(hrt2tv_old, "hrt2tv_old"); test_hrt2tv(hrt2tv_new, "hrt2tv_new"); }
Here are the results for 64-bit:
[pmooney@fbf78115-e35d-41ad-8de1-c7729e7ef507 ~]$ gcc -O2 -m64 test.c [pmooney@fbf78115-e35d-41ad-8de1-c7729e7ef507 ~]$ ./a.out hrt2ts_old out: 3999579 457906879 hrt2ts_old: 65166214 for 10000000 iterations (6ns per) hrt2ts_new out: 3999579 523236995 hrt2ts_new: 27384771 for 10000000 iterations (2ns per) ts2hrt_old out: 3999579550630130 ts2hrt_old: 37698537 for 10000000 iterations (3ns per) ts2hrt_new out: 3999579588336809 ts2hrt_new: 20534698 for 10000000 iterations (2ns per) hrt2tv_old out: 3999579 608879 hrt2tv_old: 132647645 for 10000000 iterations (13ns per) hrt2tv_new out: 3999579 741541 hrt2tv_new: 42581307 for 10000000 iterations (4ns per)
... and 32-bit:
[pmooney@fbf78115-e35d-41ad-8de1-c7729e7ef507 ~]$ gcc -O2 -m32 test.c [pmooney@fbf78115-e35d-41ad-8de1-c7729e7ef507 ~]$ ./a.out hrt2ts_old out: 505795097040587133 -81064690078906408 hrt2ts_old: 93591836 for 10000000 iterations (9ns per) hrt2ts_new out: 908324255269980541 38664705664 hrt2ts_new: 208671683 for 10000000 iterations (20ns per) ts2hrt_old out: 4000125420170843 ts2hrt_old: 121201904 for 10000000 iterations (12ns per) ts2hrt_new out: 4000125541384366 ts2hrt_new: 30899755 for 10000000 iterations (3ns per) hrt2tv_old out: 4000125 572295 hrt2tv_old: 161604889 for 10000000 iterations (16ns per) hrt2tv_new out: 4000125 733913 hrt2tv_new: 310336290 for 10000000 iterations (31ns per)
Related issues
Updated by Patrick Mooney over 1 year ago
- Description updated (diff)
- Status changed from New to In Progress
Updated by Toomas Soome over 1 year ago
Robert Mustacchi wrote:
Upstreaming of OS-5655 from SmartOS:
The family of functions for converted between time formats (hrt2ts, ts2hrt, etc) use a variety of tricks in an attempt to increase performance. While these techniques may have been valuable long ago, they appear to be harmful on modern x86_64 CPUs.
These routines should be checked and, when it would improve performance, the "optimizations" should be disabled via #ifdef.
I wrote a crude little test program to benchmark the existing code against new "de-optimized" versions:
[...]
Here are the results for 64-bit:
[...]
... and 32-bit:
[...]
T4-1, 32-bit:
tsoome@openindiana:~$ gcc -O2 -m32 test.c tsoome@openindiana:~$ ./a.out hrt2ts_old out: 84495617514786895 999999999 hrt2ts_old: 234592524 for 10000000 iterations (23ns per) hrt2ts_new out: 84495617749683377 4282277792 hrt2ts_new: 1888438937 for 10000000 iterations (188ns per) ts2hrt_old out: 19673171174531137 ts2hrt_old: 193794275 for 10000000 iterations (19ns per) ts2hrt_new out: 19673171368409185 ts2hrt_new: 178229970 for 10000000 iterations (17ns per) hrt2tv_old out: 19673171 546713 hrt2tv_old: 318258823 for 10000000 iterations (31ns per) hrt2tv_new out: 19673171 865045 hrt2tv_new: 3042147022 for 10000000 iterations (304ns per) tsoome@openindiana:~$ ./a.out hrt2ts_old out: 84495691227635539 999999999 hrt2ts_old: 234414399 for 10000000 iterations (23ns per) hrt2ts_new out: 84495691462327677 4273364896 hrt2ts_new: 1922463215 for 10000000 iterations (192ns per) ts2hrt_old out: 19673188906760687 ts2hrt_old: 193781875 for 10000000 iterations (19ns per) ts2hrt_new out: 19673189100628807 ts2hrt_new: 178153574 for 10000000 iterations (17ns per) hrt2tv_old out: 19673189 278864 hrt2tv_old: 318340623 for 10000000 iterations (31ns per) hrt2tv_new out: 19673189 597285 hrt2tv_new: 2966634967 for 10000000 iterations (296ns per) tsoome@openindiana:~$ ./a.out hrt2ts_old out: 84495729606530991 999999999 hrt2ts_old: 219752353 for 10000000 iterations (21ns per) hrt2ts_new out: 84495729826572811 4280704928 hrt2ts_new: 1929243810 for 10000000 iterations (192ns per) ts2hrt_old out: 19673197623070955 ts2hrt_old: 194261624 for 10000000 iterations (19ns per) ts2hrt_new out: 19673197817370648 ts2hrt_new: 178021785 for 10000000 iterations (17ns per) hrt2tv_old out: 19673197 995448 hrt2tv_old: 310441440 for 10000000 iterations (31ns per) hrt2tv_new out: 19673198 305969 hrt2tv_new: 3176957658 for 10000000 iterations (317ns per) tsoome@openindiana:~$ ./a.out hrt2ts_old out: 84495763940848102 999999999 hrt2ts_old: 234558427 for 10000000 iterations (23ns per) hrt2ts_new out: 84495764175703032 4281229216 hrt2ts_new: 1833719577 for 10000000 iterations (183ns per) ts2hrt_old out: 19673205516940006 ts2hrt_old: 193646554 for 10000000 iterations (19ns per) ts2hrt_new out: 19673205710664689 ts2hrt_new: 178147799 for 10000000 iterations (17ns per) hrt2tv_old out: 19673205 888852 hrt2tv_old: 310525583 for 10000000 iterations (31ns per) hrt2tv_new out: 19673206 199457 hrt2tv_new: 2906462679 for 10000000 iterations (290ns per) tsoome@openindiana:~$
and 64-bit:
tsoome@openindiana:~$ gcc -O2 -m64 test.c tsoome@openindiana:~$ ./a.out hrt2ts_old out: 19673374 956585860 hrt2ts_old: 254113953 for 10000000 iterations (25ns per) hrt2ts_new out: 19673375 210882900 hrt2ts_new: 335590822 for 10000000 iterations (33ns per) ts2hrt_old out: 19673375546550559 ts2hrt_old: 159719108 for 10000000 iterations (15ns per) ts2hrt_new out: 19673375706303072 ts2hrt_new: 159336552 for 10000000 iterations (15ns per) hrt2tv_old out: 19673375 865676 hrt2tv_old: 306799758 for 10000000 iterations (30ns per) hrt2tv_new out: 19673376 172548 hrt2tv_new: 394085482 for 10000000 iterations (39ns per) tsoome@openindiana:~$ ./a.out hrt2ts_old out: 19673377 944222778 hrt2ts_old: 260693668 for 10000000 iterations (26ns per) hrt2ts_new out: 19673378 205108440 hrt2ts_new: 335664606 for 10000000 iterations (33ns per) ts2hrt_old out: 19673378540846722 ts2hrt_old: 159704898 for 10000000 iterations (15ns per) ts2hrt_new out: 19673378700605260 ts2hrt_new: 159608876 for 10000000 iterations (15ns per) hrt2tv_old out: 19673378 860253 hrt2tv_old: 316960428 for 10000000 iterations (31ns per) hrt2tv_new out: 19673379 177288 hrt2tv_new: 400119488 for 10000000 iterations (40ns per) tsoome@openindiana:~$ ./a.out hrt2ts_old out: 19673380 664675660 hrt2ts_old: 252873684 for 10000000 iterations (25ns per) hrt2ts_new out: 19673380 917712166 hrt2ts_new: 335180784 for 10000000 iterations (33ns per) ts2hrt_old out: 19673381252977553 ts2hrt_old: 159690377 for 10000000 iterations (15ns per) ts2hrt_new out: 19673381412702886 ts2hrt_new: 159188291 for 10000000 iterations (15ns per) hrt2tv_old out: 19673381 571965 hrt2tv_old: 306483761 for 10000000 iterations (30ns per) hrt2tv_new out: 19673381 878484 hrt2tv_new: 399461253 for 10000000 iterations (39ns per) tsoome@openindiana:~$ ./a.out hrt2ts_old out: 19673383 328477505 hrt2ts_old: 253925602 for 10000000 iterations (25ns per) hrt2ts_new out: 19673383 582600245 hrt2ts_new: 335217201 for 10000000 iterations (33ns per) ts2hrt_old out: 19673383917868225 ts2hrt_old: 159576871 for 10000000 iterations (15ns per) ts2hrt_new out: 19673384077525296 ts2hrt_new: 160282000 for 10000000 iterations (16ns per) hrt2tv_old out: 19673384 237896 hrt2tv_old: 316315545 for 10000000 iterations (31ns per) hrt2tv_new out: 19673384 554294 hrt2tv_new: 393278726 for 10000000 iterations (39ns per) tsoome@openindiana:~$
gcc version 7.5.0 (OpenIndiana 7.5.0-il-0)
Updated by Patrick Mooney over 1 year ago
- Related to Bug #13088: uniqtime() is too clever for its own good added
Updated by Patrick Mooney over 1 year ago
This change has been present in SmartOS for years without presenting a problem. With it built on top of illumos-gate, I booted up a BE and verified (via mdb -k
) that hrt2ts
was not implemented through the old code. Following that, I dtrace its calls and checked inputs/outputs against calculated values to make sure they looked correct, which they did.
Updated by Electric Monk over 1 year ago
- Status changed from In Progress to Closed
git commit 88b8d9620aed414dab5fb34108dee58556f060f0
commit 88b8d9620aed414dab5fb34108dee58556f060f0 Author: Patrick Mooney <pmooney@pfmooney.com> Date: 2020-09-04T20:37:06.000Z 7709 hrt2ts and friends are too clever for their own good Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com> Reviewed by: Jason King <jason.king@joyent.com> Reviewed by: Toomas Soome <tsoome@me.com> Approved by: Richard Lowe <richlowe@richlowe.net>