Trying different compilers for double precision math...
icc -fast -march=core2
(icc 11.1.069, 32bit)
g++-4.2 -O2 -ffast-math -msse3
(Ubuntu 8.10, 32bit)
g++-4.2 -O2 -ffast-math -msse3 -mfpmath=sse
(Ubuntu 8.10, 32bit)
g++-4.4 -O3 -ffast-math -msse3 -mfpmath=sse -march=core2
(gcc 4.4 early experimental, 32bit)
g++-4.4 -O3 -ffast-math -msse3 -mfpmath=sse -march=core2
(Ubuntu 9.10, 64bit)
log
, exp
, pow
, gcc is
doing better on erfc
, erf
, gamma
-- the accuracy might be different though.
For comparison, the 64bit machine is a bit faster (3 GHz) than the 32bit
one (2.67 GHz).
The Intel compiler is always using SSE instructions and has reasonable
good SSE implementations for log
, exp
,
pow
; the GNU compiler struggles with FPU instructions and
does not have double-precision SSE code, some operations get slower with
SSE, e.g. floor
, tan
, log
.
It is hard to get good overall performance with gcc for 32bit,
64bit fares better.
To speed up log
and exp
, one can use approximate
math functions (at single precision). Shigeo MITSUNARI offers fmath which
implements log
and exp
in SSE with table
lookups. It also can benefit from the Xbyak JIT
compiler.
José Fonseca has a blog entry Fast SSE2 pow: tables or polynomials? which also looks interesting.
icc | gcc-4.2 | gcc-4.2 | (gcc-4.4) | gcc-4.4 | |
float->double | 150 | 120 | 120 | 110 | 70 |
double->float | 110 | 90 | 90 | 90 | 100 |
erf | 5020 | 1960 | 1990 | 1960 | 1080 |
erfc | 9600 | 2310 | 2220 | 2220 | 1270 |
gamma | 12040 | 6340 | 6340 | 6340 | 6570 |
tgamma | 9050 | 12660 | 12570 | 12890 | 9920 |
log | 1900 | 2340 | 4390 | 4430 | 4440 |
fmath::log | 560 | 1170 | 490 | 590 | 450 |
exp | 1290 | 5290 | 6280 | 6380 | 2870 |
fmath::exp | 640 | 600 | 650 | 660 | 620 |
pow | 3660 | 11980 | 11820 | 11680 | 9210 |
fmath::pow | 2170 | 2720 | 1650 | 1800 | 1580 |
sqrt | 2140 | 2550 | 2140 | 2140 | 640 |
cos | 1660 | 3380 | 3870 | 3860 | 1690 |
tan | 2520 | 4840 | 5930 | 5930 | 3480 |
atan | 1600 | 5330 | 5880 | 5630 | 1660 |
mul | 91 | 91 | 86 | 98 | 99 |
div | 1190 | 1400 | 1170 | 1180 | 680 |
add | 92 | 96 | 87 | 93 | 99 |
floor | 356 | 788 | 2177 | 319 | 262 |
fabs | 88 | 90 | 91 | 89 | 96 |
Here's the updated code using the fmath library.
#include <cmath> #include <cstdio> #include <ctime> #include <cstdlib> #include "fmath.hpp" int main() { volatile double a; volatile double b = 0.8234234; volatile double c = 1.43234; volatile float d; clock_t t; const int N = 100000000; t = clock(); for (int i=0; i < N; i++) a = d; printf("float -> double %.3f\n", (clock()-t) / (float) CLOCKS_PER_SEC); t = clock(); for (int i=0; i < N; i++) d = a; printf("double -> float %.3f\n", (clock()-t) / (float) CLOCKS_PER_SEC); t = clock(); for (int i=0; i < N; i++) a = erf(b); printf("erf %.3f\n", (clock()-t) / (float) CLOCKS_PER_SEC); t = clock(); for (int i=0; i < N; i++) a = erfc(b); printf("erfc %.3f\n", (clock()-t) / (float) CLOCKS_PER_SEC); t = clock(); for (int i=0; i < N; i++) a = gamma(b); printf("gamma %.3f\n", (clock()-t) / (float) CLOCKS_PER_SEC); t = clock(); for (int i=0; i < N; i++) a = tgamma(b); printf("tgamma %.3f\n", (clock()-t) / (float) CLOCKS_PER_SEC); t = clock(); for (int i=0; i < N; i++) a = log(b); printf("log %.3f\n", (clock()-t) / (float) CLOCKS_PER_SEC); t = clock(); for (int i=0; i < N; i++) a = fmath::log(b); printf("fmath::log %.3f\n", (clock()-t) / (float) CLOCKS_PER_SEC); t = clock(); for (int i=0; i < N; i++) a = exp(b); printf("exp %.3f\n", (clock()-t) / (float) CLOCKS_PER_SEC); t = clock(); for (int i=0; i < N; i++) a = fmath::exp(b); printf("fmath::exp %.3f\n", (clock()-t) / (float) CLOCKS_PER_SEC); t = clock(); for (int i=0; i < N; i++) a = pow(b, c); printf("pow %.3f\n", (clock()-t) / (float) CLOCKS_PER_SEC); t = clock(); for (int i=0; i < N; i++) a = fmath::exp(fmath::log(b) * c); printf("fmath::pow %.3f\n", (clock()-t) / (float) CLOCKS_PER_SEC); t = clock(); for (int i=0; i < N; i++) a = sqrt(b); printf("sqrt %.3f\n", (clock()-t) / (float) CLOCKS_PER_SEC); t = clock(); for (int i=0; i < N; i++) a = cos(b); printf("cos %.3f\n", (clock()-t) / (float) CLOCKS_PER_SEC); t = clock(); for (int i=0; i < N; i++) a = tan(b); printf("tan %.3f\n", (clock()-t) / (float) CLOCKS_PER_SEC); t = clock(); for (int i=0; i < N; i++) a = atan(b); printf("atan %.3f\n", (clock()-t) / (float) CLOCKS_PER_SEC); t = clock(); for (int i=0; i < 10*N; i++) a = b*c; printf("mul %.3f\n", (clock()-t) / 10 / (float) CLOCKS_PER_SEC); t = clock(); for (int i=0; i < N; i++) a = b/c; printf("div %.3f\n", (clock()-t) / (float) CLOCKS_PER_SEC); t = clock(); for (int i=0; i < 10*N; i++) a = b+c; printf("add %.3f\n", (clock()-t) / 10 / (float) CLOCKS_PER_SEC); t = clock(); for (int i=0; i < 10*N; i++) a = floor(b); printf("floor %.3f\n", (clock()-t) / 10 / (float) CLOCKS_PER_SEC); t = clock(); for (int i=0; i < 10*N; i++) a = fabs(b); printf("fabs %.3f\n", (clock()-t) / 10 / (float) CLOCKS_PER_SEC); return EXIT_SUCCESS; }
posted at: 15:34 | path: /programming | permanent link