28 Mar 2010
Eigen is a C++ template library for linear algebra: vectors, matrices, and related algorithms. The GNU Scientific Library (GSL) is a numerical C library which builds on BLAS for more complicated matrix operations.
Compiled with:
g++ -static -O3 -Wall -I/tmp/eigen -msse3 -ffast-math -mfpmath=sse -o mm
mm.cpp -lgsl /usr/lib/sse2/libcblas.a /usr/lib/sse2/libatlas.a
Note: several other ways to link GSL with BLAS are possible, but all were
slower (Ubuntu 8.10, 32bit).
eigen 0.380
GSL 0.780
#include <cstdlib> #include <cstdio> #include <ctime> #define EIGEN2_SUPPORT #include "Eigen/Core" USING_PART_OF_NAMESPACE_EIGEN #include#include int main() { clock_t t; MatrixXf ea; MatrixXf eb; ea = MatrixXf::Random(18, 18); eb = MatrixXf::Random(18, 256); t = clock(); for (int i = 0; i < 10000; i++) MatrixXf er = ea * eb; printf("eigen %.3f\n", (clock() - t) / (float) CLOCKS_PER_SEC); gsl_matrix *ga = gsl_matrix_alloc(18, 18); gsl_matrix *gb = gsl_matrix_alloc(18, 256); gsl_matrix *gr = gsl_matrix_alloc(18, 256); t = clock(); for (int i = 0; i < 10000; i++) gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, ga, gb, 0.0, gr); printf("GSL %.3f\n", (clock() - t) / (float) CLOCKS_PER_SEC); gsl_matrix_free(ga); gsl_matrix_free(gb); gsl_matrix_free(gr); return EXIT_SUCCESS; }
posted at: 16:32 | path: /programming | permanent link
Trying different compilers for double precision math...
icc -fast -march=core2
(icc 11.1.069, 32bit)
g++-4.2 -O2 -ffast-math -msse3
(Ubuntu 8.10, 32bit)
g++-4.2 -O2 -ffast-math -msse3 -mfpmath=sse
(Ubuntu 8.10, 32bit)
g++-4.4 -O3 -ffast-math -msse3 -mfpmath=sse -march=core2
(gcc 4.4 early experimental, 32bit)
g++-4.4 -O3 -ffast-math -msse3 -mfpmath=sse -march=core2
(Ubuntu 9.10, 64bit)
log
, exp
, pow
, gcc is
doing better on erfc
, erf
, gamma
-- the accuracy might be different though.
For comparison, the 64bit machine is a bit faster (3 GHz) than the 32bit
one (2.67 GHz).
The Intel compiler is always using SSE instructions and has reasonable
good SSE implementations for log
, exp
,
pow
; the GNU compiler struggles with FPU instructions and
does not have double-precision SSE code, some operations get slower with
SSE, e.g. floor
, tan
, log
.
It is hard to get good overall performance with gcc for 32bit,
64bit fares better.
To speed up log
and exp
, one can use approximate
math functions (at single precision). Shigeo MITSUNARI offers fmath which
implements log
and exp
in SSE with table
lookups. It also can benefit from the Xbyak JIT
compiler.
José Fonseca has a blog entry Fast SSE2 pow: tables or polynomials? which also looks interesting.
icc | gcc-4.2 | gcc-4.2 | (gcc-4.4) | gcc-4.4 | |
float->double | 150 | 120 | 120 | 110 | 70 |
double->float | 110 | 90 | 90 | 90 | 100 |
erf | 5020 | 1960 | 1990 | 1960 | 1080 |
erfc | 9600 | 2310 | 2220 | 2220 | 1270 |
gamma | 12040 | 6340 | 6340 | 6340 | 6570 |
tgamma | 9050 | 12660 | 12570 | 12890 | 9920 |
log | 1900 | 2340 | 4390 | 4430 | 4440 |
fmath::log | 560 | 1170 | 490 | 590 | 450 |
exp | 1290 | 5290 | 6280 | 6380 | 2870 |
fmath::exp | 640 | 600 | 650 | 660 | 620 |
pow | 3660 | 11980 | 11820 | 11680 | 9210 |
fmath::pow | 2170 | 2720 | 1650 | 1800 | 1580 |
sqrt | 2140 | 2550 | 2140 | 2140 | 640 |
cos | 1660 | 3380 | 3870 | 3860 | 1690 |
tan | 2520 | 4840 | 5930 | 5930 | 3480 |
atan | 1600 | 5330 | 5880 | 5630 | 1660 |
mul | 91 | 91 | 86 | 98 | 99 |
div | 1190 | 1400 | 1170 | 1180 | 680 |
add | 92 | 96 | 87 | 93 | 99 |
floor | 356 | 788 | 2177 | 319 | 262 |
fabs | 88 | 90 | 91 | 89 | 96 |
Here's the updated code using the fmath library.
#include <cmath> #include <cstdio> #include <ctime> #include <cstdlib> #include "fmath.hpp" int main() { volatile double a; volatile double b = 0.8234234; volatile double c = 1.43234; volatile float d; clock_t t; const int N = 100000000; t = clock(); for (int i=0; i < N; i++) a = d; printf("float -> double %.3f\n", (clock()-t) / (float) CLOCKS_PER_SEC); t = clock(); for (int i=0; i < N; i++) d = a; printf("double -> float %.3f\n", (clock()-t) / (float) CLOCKS_PER_SEC); t = clock(); for (int i=0; i < N; i++) a = erf(b); printf("erf %.3f\n", (clock()-t) / (float) CLOCKS_PER_SEC); t = clock(); for (int i=0; i < N; i++) a = erfc(b); printf("erfc %.3f\n", (clock()-t) / (float) CLOCKS_PER_SEC); t = clock(); for (int i=0; i < N; i++) a = gamma(b); printf("gamma %.3f\n", (clock()-t) / (float) CLOCKS_PER_SEC); t = clock(); for (int i=0; i < N; i++) a = tgamma(b); printf("tgamma %.3f\n", (clock()-t) / (float) CLOCKS_PER_SEC); t = clock(); for (int i=0; i < N; i++) a = log(b); printf("log %.3f\n", (clock()-t) / (float) CLOCKS_PER_SEC); t = clock(); for (int i=0; i < N; i++) a = fmath::log(b); printf("fmath::log %.3f\n", (clock()-t) / (float) CLOCKS_PER_SEC); t = clock(); for (int i=0; i < N; i++) a = exp(b); printf("exp %.3f\n", (clock()-t) / (float) CLOCKS_PER_SEC); t = clock(); for (int i=0; i < N; i++) a = fmath::exp(b); printf("fmath::exp %.3f\n", (clock()-t) / (float) CLOCKS_PER_SEC); t = clock(); for (int i=0; i < N; i++) a = pow(b, c); printf("pow %.3f\n", (clock()-t) / (float) CLOCKS_PER_SEC); t = clock(); for (int i=0; i < N; i++) a = fmath::exp(fmath::log(b) * c); printf("fmath::pow %.3f\n", (clock()-t) / (float) CLOCKS_PER_SEC); t = clock(); for (int i=0; i < N; i++) a = sqrt(b); printf("sqrt %.3f\n", (clock()-t) / (float) CLOCKS_PER_SEC); t = clock(); for (int i=0; i < N; i++) a = cos(b); printf("cos %.3f\n", (clock()-t) / (float) CLOCKS_PER_SEC); t = clock(); for (int i=0; i < N; i++) a = tan(b); printf("tan %.3f\n", (clock()-t) / (float) CLOCKS_PER_SEC); t = clock(); for (int i=0; i < N; i++) a = atan(b); printf("atan %.3f\n", (clock()-t) / (float) CLOCKS_PER_SEC); t = clock(); for (int i=0; i < 10*N; i++) a = b*c; printf("mul %.3f\n", (clock()-t) / 10 / (float) CLOCKS_PER_SEC); t = clock(); for (int i=0; i < N; i++) a = b/c; printf("div %.3f\n", (clock()-t) / (float) CLOCKS_PER_SEC); t = clock(); for (int i=0; i < 10*N; i++) a = b+c; printf("add %.3f\n", (clock()-t) / 10 / (float) CLOCKS_PER_SEC); t = clock(); for (int i=0; i < 10*N; i++) a = floor(b); printf("floor %.3f\n", (clock()-t) / 10 / (float) CLOCKS_PER_SEC); t = clock(); for (int i=0; i < 10*N; i++) a = fabs(b); printf("fabs %.3f\n", (clock()-t) / 10 / (float) CLOCKS_PER_SEC); return EXIT_SUCCESS; }
posted at: 15:34 | path: /programming | permanent link