This is how I use ARM NEON intrinsics to speed up division and square root operations...
#include <arm_neon.h>

// Approximate per-lane 1/sqrt(x) for four packed floats.
//
// Starts from the hardware reciprocal-square-root estimate and applies one
// Newton-Raphson refinement: vrsqrtsq_f32(a, b) evaluates (3 - a*b) / 2, so
// the result is est * (3 - x*est*est) / 2.
//
// NOTE(review): lanes holding 0 are expected to come out as NaN rather than
// +inf, because the estimate is infinite and x * est becomes 0 * inf.
// Callers that need exact special-value handling should not rely on this.
static inline float32x4_t invsqrtv(float32x4_t x)
{
    const float32x4_t est = vrsqrteq_f32(x);
    const float32x4_t step = vrsqrtsq_f32(vmulq_f32(x, est), est);
    return vmulq_f32(step, est);
}

// Approximate per-lane sqrt(x) for four packed floats, computed as
// x * (1/sqrt(x)).
//
// Lanes equal to zero are passed through unchanged: without the select the
// product would be 0 * inf = NaN, because the reciprocal-square-root
// estimate of 0 is infinite.
static inline float32x4_t sqrtv(float32x4_t x)
{
    const uint32x4_t is_zero = vceqq_f32(x, vdupq_n_f32(0.0f));
    const float32x4_t root = vmulq_f32(x, invsqrtv(x));
    // Take x itself (0.0 or -0.0) in zero lanes, the computed root elsewhere.
    return vbslq_f32(is_zero, x, root);
}

// Approximate per-lane 1/x for four packed floats.
//
// Hardware reciprocal estimate followed by one Newton-Raphson refinement:
// vrecpsq_f32(a, b) evaluates 2 - a*b, so the result is est * (2 - x*est).
static inline float32x4_t invv(float32x4_t x)
{
    const float32x4_t est = vrecpeq_f32(x);
    return vmulq_f32(vrecpsq_f32(x, est), est);
}

// Approximate per-lane x / y for four packed floats, computed as
// x * invv(y). Accuracy is that of invv().
static inline float32x4_t divv(float32x4_t x, float32x4_t y)
{
    return vmulq_f32(x, invv(y));
}

// Horizontal sum of the four lanes of x, returned as a scalar.
//
// Summation order is (x[0] + x[2]) + (x[1] + x[3]): the high and low halves
// are added lane-wise first, then the two partial sums are added pairwise.
static inline float accumv(float32x4_t x)
{
    const float32x2_t pair = vadd_f32(vget_high_f32(x), vget_low_f32(x));
    // Pairwise add of pair with itself leaves pair[0] + pair[1] in lane 0.
    return vget_lane_f32(vpadd_f32(pair, pair), 0);
}
posted at: 10:39 | path: /programming | permanent link