comparison sconv_neon.c @ 3:e889fd0e7769

stuff
author Peter Meerwald <p.meerwald@bct-electronic.com>
date Thu, 05 Jul 2012 17:31:56 +0200
parents b829afbea564
children 07763f536182
comparison
equal deleted inserted replaced
2:09ee6a01a3d3 3:e889fd0e7769
39 39
40 #if defined(__arm__) 40 #if defined(__arm__)
41 41
42 #include "arm_neon.h" 42 #include "arm_neon.h"
43 43
44 void pa_sconv_s16le_from_float32ne(unsigned n, const float *a, int16_t *b) { 44 void pa_sconv_s16le_from_float32ne(unsigned n, const float *src, int16_t *dst) {
45 pa_assert(a); 45 pa_assert(src);
46 pa_assert(b); 46 pa_assert(dst);
47 47
48 for (; n > 0; n--) { 48 for (; n > 0; n--) {
49 float v = *(a++); 49 float v = *(src++);
50 50
51 v = PA_CLAMP_UNLIKELY(v, -1.0f, 1.0f); 51 v = PA_CLAMP_UNLIKELY(v, -1.0f, 1.0f);
52 *(b++) = (int16_t) lrintf(v * 0x7FFF); 52 *(dst++) = (int16_t) lrintf(v * 0x7FFF);
53 } 53 }
54 } 54 }
55 55
56 void pa_sconv_s16le_from_f32ne_neon(unsigned n, const float *a, int16_t *b) { 56 void pa_sconv_s16le_from_f32ne_neon(unsigned n, const float *src, int16_t *dst) {
57 unsigned i; 57 unsigned i = n & 3;
58 58
59 const float32x4_t plusone4 = vdupq_n_f32(1.0f); 59 asm volatile (
60 const float32x4_t minusone4 = vdupq_n_f32(-1.0f); 60 "mov %[n], %[n], lsr #2\n\t"
61 const float32x4_t half4 = vdupq_n_f32(0.5f); 61 "vdup.f32 q2, %[plusone]\n\t"
62 const float32x4_t scale4 = vdupq_n_f32(32767.0f); 62 "vneg.f32 q3, q2\n\t"
63 const uint32x4_t mask4 = vdupq_n_u32(0x80000000); 63 "vdup.f32 q4, %[scale]\n\t"
64 64 "vdup.u32 q5, %[mask]\n\t"
65 for (i = 0; i < n/4; i++) { 65 "vdup.f32 q6, %[half]\n\t"
66 float32x4_t v4 = ((float32x4_t *)a)[i]; 66 "1:\n\t"
67 v4 = vmulq_f32(vmaxq_f32(vminq_f32(v4, plusone4) , minusone4), scale4); 67 "vld1.32 {q0}, [%[src]]!\n\t"
68 68 "vmin.f32 q0, q0, q2\n\t" /* clamp */
69 const float32x4_t w4 = vreinterpretq_f32_u32(vorrq_u32(vandq_u32( 69 "vmax.f32 q0, q0, q3\n\t"
70 vreinterpretq_u32_f32(v4), mask4), vreinterpretq_u32_f32(half4))); 70 "vmul.f32 q0, q0, q4\n\t" /* scale */
71 71 "vand.u32 q1, q0, q5\n\t"
72 ((int16x4_t *)b)[i] = vmovn_s32(vcvtq_s32_f32(vaddq_f32(v4, w4))); 72 "vorr.u32 q1, q1, q6\n\t" /* round */
73 } 73 "vadd.f32 q0, q0, q1\n\t"
74 74 "vcvt.s32.f32 q0, q0\n\t" /* narrow */
75 "vmovn.i32 d0, q0\n\t"
76 "subs %[n], %[n], #1\n\t"
77 "vst1.16 {d0}, [%[dst]]!\n\t"
78 "bgt 1b\n\t"
79 /* output operands (or input operands that get modified) */
80 : [dst] "+r" (dst), [src] "+r" (src), [n] "+r" (n)
81 : [plusone] "r" (1.0f), [scale] "r" (32767.0f),
82 [half] "r" (0.5f), [mask] "r" (0x80000000) /* input operands */
83 : "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", "q6" /* clobber list */
84 );
85
75 // leftovers 86 // leftovers
76 for (i = n & ~3; i < n; i++) { 87 while (i--) {
77 b[i] = (int16_t) lrintf(PA_CLAMP_UNLIKELY(a[i], -1.0f, 1.0f) * 0x7FFF); 88 *dst++ = (int16_t) lrintf(PA_CLAMP_UNLIKELY(*src, -1.0f, 1.0f) * 0x7FFF);
78 } 89 src++;
79 } 90 }
80 91 }
81 void pa_sconv_s16le_to_float32ne(unsigned n, const int16_t *a, float *b) { 92
82 pa_assert(a); 93 void pa_sconv_s16le_to_float32ne(unsigned n, const int16_t *src, float *dst) {
83 pa_assert(b); 94 pa_assert(src);
95 pa_assert(dst);
84 96
85 for (; n > 0; n--) 97 for (; n > 0; n--)
86 *(b++) = ((float) (*(a++)))/(float) 0x7FFF; 98 *(dst++) = ((float) (*(src++)))/(float) 0x7FFF;
87 } 99 }
88 100
89 void pa_sconv_s16le_to_f32ne_neon(unsigned n, const int16_t *a, float *b) { 101 void pa_sconv_s16le_to_f32ne_neon(unsigned n, const int16_t *src, float *dst) {
90 unsigned i; 102 unsigned i = n & 3;
91 103
92 const float32x4_t invscale4 = vdupq_n_f32(1.0f / 0x7FFF); 104 const float invscale = 1.0f / 0x7FFF;
93 105
94 for (i = 0; i < n/4; i++) { 106 asm volatile (
95 ((float32x4_t *)b)[i] = vmulq_f32(vcvtq_f32_s32(vmovl_s16(((int16x4_t *)a)[i])), invscale4); 107 "mov %[n], %[n], lsr #2\n\t"
96 } 108 "vdup.f32 q1, %[invscale]\n\t"
109 "1:\n\t"
110 "vld1.16 {d0}, [%[src]]!\n\t"
111 "vmovl.s16 q0, d0\n\t"
112
113 "vcvt.f32.s32 q0, q0\n\t"
114 "vmul.f32 q0, q0, q1\n\t"
115
116 "subs %[n], %[n], #1\n\t"
117 "vst1.32 {q0}, [%[dst]]!\n\t"
118 "bgt 1b\n\t"
119 /* output operands (or input operands that get modified) */
120 : [dst] "+r" (dst), [src] "+r" (src), [n] "+r" (n)
121 : [invscale] "r" (invscale) /* input operands */
122 : "memory", "cc", "q0", "q1" /* clobber list */
123 );
97 124
98 // leftovers 125 // leftovers
99 const float invscale = 1.0f / 0x7FFF; 126 while (i--) {
100 for (i = n & ~3; i < n; i++) { 127 *dst++ = *src++ * invscale;
101 b[i] = a[i] * invscale;
102 } 128 }
103 } 129 }
104 130
105 #define SAMPLES 1019 131 #define SAMPLES 1019
106 #define TIMES 10000 132 #define TIMES 100000
107 133
108 static void run_test_from(void) { 134 static void run_test_from(void) {
109 int16_t samples[SAMPLES]; 135 int16_t samples[SAMPLES];
110 int16_t samples_ref[SAMPLES]; 136 int16_t samples_ref[SAMPLES];
111 float floats[SAMPLES]; 137 float floats[SAMPLES];
112 int i; 138 int i;
113 pa_usec_t start, stop; 139 pa_usec_t start, stop;
114 pa_convert_func_t func;
115 140
116 pa_log_debug("checking NEON sconv_s16le_from_float(%d)", SAMPLES); 141 pa_log_debug("checking NEON sconv_s16le_from_float(%d)", SAMPLES);
117 142
118 memset(samples_ref, 0, sizeof(samples_ref)); 143 memset(samples_ref, 0, sizeof(samples_ref));
119 memset(samples, 0, sizeof(samples)); 144 memset(samples, 0, sizeof(samples));
120 145
121 for (i = 0; i < SAMPLES; i++) { 146 for (i = 0; i < SAMPLES; i++) {
122 floats[i] = 2.1f * (rand()/(float) RAND_MAX - 0.5f); 147 floats[i] = 2.1f * (rand()/(float) RAND_MAX - 0.5f);
123 } 148 }
124 149
125 func = (pa_convert_func_t) pa_sconv_s16le_from_float32ne; 150 pa_sconv_s16le_from_float32ne(SAMPLES, floats, samples_ref);
126 func(SAMPLES, floats, samples_ref);
127 pa_sconv_s16le_from_f32ne_neon(SAMPLES, floats, samples); 151 pa_sconv_s16le_from_f32ne_neon(SAMPLES, floats, samples);
128 152
129 for (i = 0; i < SAMPLES; i++) { 153 for (i = 0; i < SAMPLES; i++) {
130 if (abs(samples[i] - samples_ref[i]) > 0) { 154 if (abs(samples[i] - samples_ref[i]) > 0) {
131 pa_log_debug("%d: %d != %d (%f)", i, samples[i], samples_ref[i], 155 pa_log_debug("%d: %d != %d (%f)", i, samples[i], samples_ref[i],
140 stop = pa_rtclock_now(); 164 stop = pa_rtclock_now();
141 pa_log_info("NEON: %llu usec.", (long long unsigned int)(stop - start)); 165 pa_log_info("NEON: %llu usec.", (long long unsigned int)(stop - start));
142 166
143 start = pa_rtclock_now(); 167 start = pa_rtclock_now();
144 for (i = 0; i < TIMES; i++) { 168 for (i = 0; i < TIMES; i++) {
145 func(SAMPLES, floats, samples_ref); 169 pa_sconv_s16le_from_float32ne(SAMPLES, floats, samples_ref);
146 } 170 }
147 stop = pa_rtclock_now(); 171 stop = pa_rtclock_now();
148 pa_log_info("ref: %llu usec.", (long long unsigned int)(stop - start)); 172 pa_log_info("ref: %llu usec.", (long long unsigned int)(stop - start));
149 } 173 }
150 174

Repositories maintained by Peter Meerwald, pmeerw@pmeerw.net.