pmeerw's blog /2010/Mar

Eigen is a C++ template library for linear algebra: vectors, matrices, and related algorithms. The GNU Scientific Library (GSL) is a numerical C library which builds on BLAS for more complicated matrix operations.

Compiled with:
g++ -static -O3 -Wall -I/tmp/eigen -msse3 -ffast-math -mfpmath=sse -o mm mm.cpp -lgsl /usr/lib/sse2/libcblas.a /usr/lib/sse2/libatlas.a
Note: several other ways to link GSL with BLAS are possible, but all were slower (Ubuntu 8.10, 32bit).

eigen 0.380
GSL 0.780

#include <cstdlib>
#include <cstdio>
#include <ctime>

#define EIGEN2_SUPPORT
#include "Eigen/Core"

USING_PART_OF_NAMESPACE_EIGEN

#include 
#include 

int main() {
    clock_t t;

    MatrixXf ea;
    MatrixXf eb;

    ea = MatrixXf::Random(18, 18);    
    eb = MatrixXf::Random(18, 256);

    t = clock();
    for (int i = 0; i < 10000; i++)
        MatrixXf er = ea * eb;
    printf("eigen %.3f\n", (clock() - t) / (float) CLOCKS_PER_SEC);

    gsl_matrix *ga = gsl_matrix_alloc(18, 18);
    gsl_matrix *gb = gsl_matrix_alloc(18, 256);
    gsl_matrix *gr = gsl_matrix_alloc(18, 256);
    
    t = clock();
    for (int i = 0; i < 10000; i++)
    	gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, ga, gb, 0.0, gr);

    printf("GSL %.3f\n", (clock() - t) / (float) CLOCKS_PER_SEC);

    gsl_matrix_free(ga);
    gsl_matrix_free(gb);
    gsl_matrix_free(gr);            

    return EXIT_SUCCESS;
}

posted at: 16:32 | path: /programming | permanent link

Floating-point performance: Compiler comparison and speedup

Trying different compilers for double precision math...

icc -fast -march=core2 (icc 11.1.069, 32bit)
g++-4.2 -O2 -ffast-math -msse3 (Ubuntu 8.10, 32bit)
g++-4.2 -O2 -ffast-math -msse3 -mfpmath=sse (Ubuntu 8.10, 32bit)
g++-4.4 -O3 -ffast-math -msse3 -mfpmath=sse -march=core2 (gcc 4.4 early experimental, 32bit)
g++-4.4 -O3 -ffast-math -msse3 -mfpmath=sse -march=core2 (Ubuntu 9.10, 64bit)

Conclusion: icc is tremendously faster than gcc on log, exp, pow, gcc is doing better on erfc, erf, gamma -- the accuracy might be different though. For comparison, the 64bit machine is a bit faster (3 GHz) than the 32bit one (2.67 GHz).

The Intel compiler is always using SSE instructions and has reasonable good SSE implementations for log, exp, pow; the GNU compiler struggles with FPU instructions and does not have double-precision SSE code, some operations get slower with SSE, e.g. floor, tan, log. It is hard to get good overall performance with gcc for 32bit, 64bit fares better.

To speed up log and exp, one can use approximate math functions (at single precision). Shigeo MITSUNARI offers fmath which implements log and exp in SSE with table lookups. It also can benefit from the Xbyak JIT compiler.

José Fonseca has a blog entry Fast SSE2 pow: tables or polynomials? which also looks interesting.

icc gcc-4.2 gcc-4.2 (gcc-4.4) gcc-4.4

float->double 150 120 120 110 70

double->float 110 90 90 90 100

erf 5020 1960 1990 1960 1080

erfc 9600 2310 2220 2220 1270

gamma 12040 6340 6340 6340 6570

tgamma 9050 12660 12570 12890 9920

log 1900 2340 4390 4430 4440

fmath::log 560 1170 490 590 450

exp 1290 5290 6280 6380 2870

fmath::exp 640 600 650 660 620

pow 3660 11980 11820 11680 9210

fmath::pow 2170 2720 1650 1800 1580

sqrt 2140 2550 2140 2140 640

cos 1660 3380 3870 3860 1690

tan 2520 4840 5930 5930 3480

atan 1600 5330 5880 5630 1660

mul 91 91 86 98 99

div 1190 1400 1170 1180 680

add 92 96 87 93 99

floor 356 788 2177 319 262

fabs 88 90 91 89 96

Here's the updated code using the fmath library.

#include <cmath>
#include <cstdio>
#include <ctime>
#include <cstdlib>

#include "fmath.hpp"

int main() {
    volatile double a;
    volatile double b = 0.8234234;
    volatile double c = 1.43234;
    volatile float d;
    clock_t t;

    const int N = 100000000;

    t = clock();
    for (int i=0; i < N; i++)
        a = d;
    printf("float -> double %.3f\n", (clock()-t) / (float) 
CLOCKS_PER_SEC);    

    t = clock();
    for (int i=0; i < N; i++)
        d = a;
    printf("double -> float %.3f\n", (clock()-t) / (float) 
CLOCKS_PER_SEC);    

    t = clock();
    for (int i=0; i < N; i++)
        a = erf(b);
    printf("erf %.3f\n", (clock()-t) / (float) CLOCKS_PER_SEC);    

    t = clock();
    for (int i=0; i < N; i++)
        a = erfc(b);
    printf("erfc %.3f\n", (clock()-t) / (float) CLOCKS_PER_SEC);    

    t = clock();
    for (int i=0; i < N; i++)
        a = gamma(b);
    printf("gamma %.3f\n", (clock()-t) / (float) CLOCKS_PER_SEC);    

    t = clock();
    for (int i=0; i < N; i++)
        a = tgamma(b);
    printf("tgamma %.3f\n", (clock()-t) / (float) CLOCKS_PER_SEC);     

    t = clock();
    for (int i=0; i < N; i++)
        a = log(b);
    printf("log %.3f\n", (clock()-t) / (float) CLOCKS_PER_SEC);      

    t = clock();
    for (int i=0; i < N; i++)
        a = fmath::log(b);
    printf("fmath::log %.3f\n", (clock()-t) / (float) CLOCKS_PER_SEC);      

    t = clock();
    for (int i=0; i < N; i++)
        a = exp(b);
    printf("exp %.3f\n", (clock()-t) / (float) CLOCKS_PER_SEC);     

    t = clock();
    for (int i=0; i < N; i++)
        a = fmath::exp(b);
    printf("fmath::exp %.3f\n", (clock()-t) / (float) CLOCKS_PER_SEC);     

    t = clock();
    for (int i=0; i < N; i++)
        a = pow(b, c);
    printf("pow %.3f\n", (clock()-t) / (float) CLOCKS_PER_SEC);     

    t = clock();
    for (int i=0; i < N; i++)
        a = fmath::exp(fmath::log(b) * c);
    printf("fmath::pow %.3f\n", (clock()-t) / (float) CLOCKS_PER_SEC);     

    t = clock();
    for (int i=0; i < N; i++)
        a = sqrt(b);
    printf("sqrt %.3f\n", (clock()-t) / (float) CLOCKS_PER_SEC);     

    t = clock();
    for (int i=0; i < N; i++)
        a = cos(b);
    printf("cos %.3f\n", (clock()-t) / (float) CLOCKS_PER_SEC);     

    t = clock();
    for (int i=0; i < N; i++)
        a = tan(b);
    printf("tan %.3f\n", (clock()-t) / (float) CLOCKS_PER_SEC);     

    t = clock();
    for (int i=0; i < N; i++)
        a = atan(b);
    printf("atan %.3f\n", (clock()-t) / (float) CLOCKS_PER_SEC);     

        
    t = clock();
    for (int i=0; i < 10*N; i++)
        a = b*c;
    printf("mul %.3f\n", (clock()-t) / 10 / (float) CLOCKS_PER_SEC);    

    t = clock();
    for (int i=0; i < N; i++)
        a = b/c;
    printf("div %.3f\n", (clock()-t) / (float) CLOCKS_PER_SEC);  

    t = clock();
    for (int i=0; i < 10*N; i++)
        a = b+c;
    printf("add %.3f\n", (clock()-t) / 10 / (float) CLOCKS_PER_SEC);  

    t = clock();
    for (int i=0; i < 10*N; i++)
        a = floor(b);
    printf("floor %.3f\n", (clock()-t) / 10 / (float) CLOCKS_PER_SEC);  

    t = clock();
    for (int i=0; i < 10*N; i++)
        a = fabs(b);
    printf("fabs %.3f\n", (clock()-t) / 10 / (float) CLOCKS_PER_SEC);  

    return EXIT_SUCCESS;
}

posted at: 15:34 | path: /programming | permanent link

Sat, 27 Mar 2010

Floating-point arithmetic performance

Sometimes you really want to know... (continuation of my previous writeup)

Runtime for various double-precision floating-point arithmetic operations, normalized by clock frequency, across several CPU models; P4 and Opteron suck, E8400 is consistently the fastest. Surprisingly, div and floor are quite slow, pow is dead slow.

posted at: 18:10 | path: /programming | permanent link

Fri, 26 Mar 2010

Floating-point arithmetic performance

Sometimes one wants to know...

Results in seconds for 100000000 (1e8) operations on Intel Core2 CPU (E6700), 2.66 GHz, code (see below) was compiled with g++ -static -O2 -Wall -ffast-math -o fptime.cpp, GCC 4.2.4

erf 1.900
erfc 2.120
gamma 6.020
tgamma 12.120
log 2.360
exp 5.290
pow 12.090
sqrt 2.550
cos 3.390
tan 4.840
atan 5.330
mul 0.103
div 1.400
add 0.105
floor 0.792
fabs 0.097

#include <cmath>
#include <cstdio>
#include <ctime>
#include <cstdlib>

using namespace std;

int main() {
    volatile double a;
    volatile double b = 0.8234234;
    volatile double c = 1.43234;
    clock_t t;

    const int N = 100000000;

    t = clock();
    for (int i=0; i < N; i++)
        a = erf(b);
    printf("erf %.3f\n", (clock()-t) / (float) CLOCKS_PER_SEC);    

    t = clock();
    for (int i=0; i < N; i++)
        a = erfc(b);
    printf("erfc %.3f\n", (clock()-t) / (float) CLOCKS_PER_SEC);    

    t = clock();
    for (int i=0; i < N; i++)
        a = gamma(b);
    printf("gamma %.3f\n", (clock()-t) / (float) CLOCKS_PER_SEC);    

    t = clock();
    for (int i=0; i < N; i++)
        a = tgamma(b);
    printf("tgamma %.3f\n", (clock()-t) / (float) CLOCKS_PER_SEC);     

    t = clock();
    for (int i=0; i < N; i++)
        a = log(b);
    printf("log %.3f\n", (clock()-t) / (float) CLOCKS_PER_SEC);      

    t = clock();
    for (int i=0; i < N; i++)
        a = exp(b);
    printf("exp %.3f\n", (clock()-t) / (float) CLOCKS_PER_SEC);     

    t = clock();
    for (int i=0; i < N; i++)
        a = pow(b, c);
    printf("pow %.3f\n", (clock()-t) / (float) CLOCKS_PER_SEC);     

    t = clock();
    for (int i=0; i < N; i++)
        a = sqrt(b);
    printf("sqrt %.3f\n", (clock()-t) / (float) CLOCKS_PER_SEC);     

    t = clock();
    for (int i=0; i < N; i++)
        a = cos(b);
    printf("cos %.3f\n", (clock()-t) / (float) CLOCKS_PER_SEC);     

    t = clock();
    for (int i=0; i < N; i++)
        a = tan(b);
    printf("tan %.3f\n", (clock()-t) / (float) CLOCKS_PER_SEC);     

    t = clock();
    for (int i=0; i < N; i++)
        a = atan(b);
    printf("atan %.3f\n", (clock()-t) / (float) CLOCKS_PER_SEC);     

        
    t = clock();
    for (int i=0; i < 10*N; i++)
        a = b*c;
    printf("mul %.3f\n", (clock()-t) / 10 / (float) CLOCKS_PER_SEC);    

    t = clock();
    for (int i=0; i < N; i++)
        a = b/c;
    printf("div %.3f\n", (clock()-t) / (float) CLOCKS_PER_SEC);  

    t = clock();
    for (int i=0; i < 10*N; i++)
        a = b+c;
    printf("add %.3f\n", (clock()-t) / 10 / (float) CLOCKS_PER_SEC);  

    t = clock();
    for (int i=0; i < 10*N; i++)
        a = floor(b);
    printf("floor %.3f\n", (clock()-t) / 10 / (float) CLOCKS_PER_SEC);  

    t = clock();
    for (int i=0; i < 10*N; i++)
        a = fabs(b);
    printf("fabs %.3f\n", (clock()-t) / 10 / (float) CLOCKS_PER_SEC);  

    return EXIT_SUCCESS;
}

posted at: 19:55 | path: /programming | permanent link

Sat, 20 Mar 2010

NVidia OpenCL toolkit fail

One might think that SDKs are supposed to facilitate programming, which not the case for the NVidia OpenCL SDK. Here is how I got very basic OpenCL code working...
Disclaimer: Linux only, using Ubuntu 8.10 Hardy

First, it is not clear what version of the graphics driver, CUDA and OpenCL SDK you are supposed to install. The latest (stable) graphics drivers do not include OpenCL.so, version 195.36.15 worked for me.

Second, what version of CUDA? cudatoolkit 3.0 worked for me, maybe you are supposed to use cudasdk 2.3? or maybe cudatoolkit 2.3?

Third, what version of OpenCL? the package is convincingly named gpucomputingsdk 2.3a, so maybe to be used with the 2.3 CUDA stuff? CUDA SDK 3.x also contains the relevant OpenCL header files, and there are no libs except for the OpenCL.so shipped with the graphics driver; good documentation is available from the khronos size.

In the end, the OpenCL samples fail to work as

// Create the OpenCL context on a GPU device
cxGPUContext = clCreateContextFromType(0, CL_DEVICE_TYPE_GPU, NULL, 
NULL, &ciErr1);

returns -32 (CL_INVALID_PLATFORM), other samples just crash. Turns out that the first parameter must not be zero; specification says the behaviour is implementation-defined if zero -- great idea to put such code in the SDK samples. Properly calling clGetPlatformIDs() and you are good to go!

Here is some device query code which works for me:

#include "CL/cl.h"

int main() {
    cl_uint n;

    cl_platform_id plat_ids[3];
    cl_int ret = clGetPlatformIDs(3, plat_ids, &n);
    printf("ret = %d, platforms = %d\n", ret, n);
    
    for (cl_uint i = 0; i < n; i++) {
        char buf[1024];
        size_t bytes;

        ret = clGetPlatformInfo(plat_ids[i], CL_PLATFORM_VERSION, sizeof(buf), buf, &bytes);
        printf("ret = %d, bytes = %d, version %s\n", ret, bytes, buf);
        ret = clGetPlatformInfo(plat_ids[i], CL_PLATFORM_NAME, sizeof(buf), buf, &bytes);
        printf("ret = %d, bytes = %d, name %s\n", ret, bytes, buf);
    }

    cl_device_id dev_ids[3];
    ret = clGetDeviceIDs(plat_ids[0], CL_DEVICE_TYPE_GPU, 3, dev_ids, &n);
    printf("ret = %d, devices = %d\n", ret, n);

    char buf[1024];
    size_t bytes;
    ret = clGetDeviceInfo(dev_ids[0], CL_DEVICE_AVAILABLE, sizeof(buf), buf, &bytes);
    printf("ret = %d, bytes = %d, avail %d\n", ret, bytes, *(cl_bool*) &buf);
    ret = clGetDeviceInfo(dev_ids[0], CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(buf), buf, &bytes);
    printf("ret = %d, bytes = %d, global mem %lld\n", ret, bytes, *(cl_ulong*) &buf);
    ret = clGetDeviceInfo(dev_ids[0], CL_DEVICE_LOCAL_MEM_SIZE, sizeof(buf), buf, &bytes);
    printf("ret = %d, bytes = %d, local mem %lld\n", ret, bytes, *(cl_ulong*) &buf);
    ret = clGetDeviceInfo(dev_ids[0], CL_DEVICE_GLOBAL_MEM_CACHE_SIZE, sizeof(buf), buf, &bytes);
    printf("ret = %d, bytes = %d, global mem cache %lld\n", ret, bytes, *(cl_ulong*) &buf);
    ret = clGetDeviceInfo(dev_ids[0], CL_DEVICE_IMAGE_SUPPORT, sizeof(buf), buf, &bytes);
    printf("ret = %d, bytes = %d, image support %d\n", ret, bytes, *(cl_bool*) &buf);
    ret = clGetDeviceInfo(dev_ids[0], CL_DEVICE_IMAGE2D_MAX_HEIGHT, sizeof(buf), buf, &bytes);
    printf("ret = %d, bytes = %d, image 2d max height %d\n", ret, bytes, *(size_t*) &buf);
    ret = clGetDeviceInfo(dev_ids[0], CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof(buf), buf, &bytes);
    printf("ret = %d, bytes = %d, clock freq %d\n", ret, bytes, *(cl_uint*) &buf);
    ret = clGetDeviceInfo(dev_ids[0], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(buf), buf, &bytes);
    printf("ret = %d, bytes = %d, compute units %d\n", ret, bytes, *(cl_uint*) &buf);
    ret = clGetDeviceInfo(dev_ids[0], CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(buf), buf, &bytes);
    printf("ret = %d, bytes = %d, work group size %d\n", ret, bytes, *(size_t*) &buf);
    ret = clGetDeviceInfo(dev_ids[0], CL_DEVICE_NAME, sizeof(buf), buf, &bytes);
    printf("ret = %d, bytes = %d, name %s\n", ret, bytes, buf);


    cl_context_properties props[3];
    props[0] = CL_CONTEXT_PLATFORM;
    props[1] = (cl_context_properties)plat_ids[0];
    props[2] = 0;

    cl_context ctx = clCreateContextFromType(props, CL_DEVICE_TYPE_GPU, 0, 0, &ret);
    printf("ret = %d\n", ret);

    return 0;
}

posted at: 12:30 | path: /rant | permanent link

Sun, 28 Mar 2010

Matrix multiplication: Eigen vs. GSL

Floating-point performance: Compiler comparison and speedup

Sat, 27 Mar 2010

Floating-point arithmetic performance

Fri, 26 Mar 2010

Floating-point arithmetic performance

Sat, 20 Mar 2010

NVidia OpenCL toolkit fail

Tue, 02 Mar 2010

valgrind rocks

	icc	gcc-4.2	gcc-4.2	(gcc-4.4)	gcc-4.4
float->double	150	120	120	110	70
double->float	110	90	90	90	100
erf	5020	1960	1990	1960	1080
erfc	9600	2310	2220	2220	1270
gamma	12040	6340	6340	6340	6570
tgamma	9050	12660	12570	12890	9920
log	1900	2340	4390	4430	4440
fmath::log	560	1170	490	590	450
exp	1290	5290	6280	6380	2870
fmath::exp	640	600	650	660	620
pow	3660	11980	11820	11680	9210
fmath::pow	2170	2720	1650	1800	1580
sqrt	2140	2550	2140	2140	640
cos	1660	3380	3870	3860	1690
tan	2520	4840	5930	5930	3480
atan	1600	5330	5880	5630	1660
mul	91	91	86	98	99
div	1190	1400	1170	1180	680
add	92	96	87	93	99
floor	356	788	2177	319	262
fabs	88	90	91	89	96

<	March 2010					>
Su	Mo	Tu	We	Th	Fr	Sa
	1	2	3	4	5	6
7	8	9	10	11	12	13
14	15	16	17	18	19	20
21	22	23	24	25	26	27
28	29	30	31