Mercurial > hg > pa-neon
comparison sconv_neon.c @ 3:e889fd0e7769
stuff
author | Peter Meerwald <p.meerwald@bct-electronic.com> |
---|---|
date | Thu, 05 Jul 2012 17:31:56 +0200 |
parents | b829afbea564 |
children | 07763f536182 |
comparison
equal
deleted
inserted
replaced
2:09ee6a01a3d3 | 3:e889fd0e7769 |
---|---|
39 | 39 |
40 #if defined(__arm__) | 40 #if defined(__arm__) |
41 | 41 |
42 #include "arm_neon.h" | 42 #include "arm_neon.h" |
43 | 43 |
44 void pa_sconv_s16le_from_float32ne(unsigned n, const float *a, int16_t *b) { | 44 void pa_sconv_s16le_from_float32ne(unsigned n, const float *src, int16_t *dst) { |
45 pa_assert(a); | 45 pa_assert(src); |
46 pa_assert(b); | 46 pa_assert(dst); |
47 | 47 |
48 for (; n > 0; n--) { | 48 for (; n > 0; n--) { |
49 float v = *(a++); | 49 float v = *(src++); |
50 | 50 |
51 v = PA_CLAMP_UNLIKELY(v, -1.0f, 1.0f); | 51 v = PA_CLAMP_UNLIKELY(v, -1.0f, 1.0f); |
52 *(b++) = (int16_t) lrintf(v * 0x7FFF); | 52 *(dst++) = (int16_t) lrintf(v * 0x7FFF); |
53 } | 53 } |
54 } | 54 } |
55 | 55 |
56 void pa_sconv_s16le_from_f32ne_neon(unsigned n, const float *a, int16_t *b) { | 56 void pa_sconv_s16le_from_f32ne_neon(unsigned n, const float *src, int16_t *dst) { |
57 unsigned i; | 57 unsigned i = n & 3; |
58 | 58 |
59 const float32x4_t plusone4 = vdupq_n_f32(1.0f); | 59 asm volatile ( |
60 const float32x4_t minusone4 = vdupq_n_f32(-1.0f); | 60 "mov %[n], %[n], lsr #2\n\t" |
61 const float32x4_t half4 = vdupq_n_f32(0.5f); | 61 "vdup.f32 q2, %[plusone]\n\t" |
62 const float32x4_t scale4 = vdupq_n_f32(32767.0f); | 62 "vneg.f32 q3, q2\n\t" |
63 const uint32x4_t mask4 = vdupq_n_u32(0x80000000); | 63 "vdup.f32 q4, %[scale]\n\t" |
64 | 64 "vdup.u32 q5, %[mask]\n\t" |
65 for (i = 0; i < n/4; i++) { | 65 "vdup.f32 q6, %[half]\n\t" |
66 float32x4_t v4 = ((float32x4_t *)a)[i]; | 66 "1:\n\t" |
67 v4 = vmulq_f32(vmaxq_f32(vminq_f32(v4, plusone4) , minusone4), scale4); | 67 "vld1.32 {q0}, [%[src]]!\n\t" |
68 | 68 "vmin.f32 q0, q0, q2\n\t" /* clamp */ |
69 const float32x4_t w4 = vreinterpretq_f32_u32(vorrq_u32(vandq_u32( | 69 "vmax.f32 q0, q0, q3\n\t" |
70 vreinterpretq_u32_f32(v4), mask4), vreinterpretq_u32_f32(half4))); | 70 "vmul.f32 q0, q0, q4\n\t" /* scale */ |
71 | 71 "vand.u32 q1, q0, q5\n\t" |
72 ((int16x4_t *)b)[i] = vmovn_s32(vcvtq_s32_f32(vaddq_f32(v4, w4))); | 72 "vorr.u32 q1, q1, q6\n\t" /* round */ |
73 } | 73 "vadd.f32 q0, q0, q1\n\t" |
74 | 74 "vcvt.s32.f32 q0, q0\n\t" /* narrow */ |
75 "vmovn.i32 d0, q0\n\t" | |
76 "subs %[n], %[n], #1\n\t" | |
77 "vst1.16 {d0}, [%[dst]]!\n\t" | |
78 "bgt 1b\n\t" | |
79 /* output operands (or input operands that get modified) */ | |
80 : [dst] "+r" (dst), [src] "+r" (src), [n] "+r" (n) | |
81 : [plusone] "r" (1.0f), [scale] "r" (32767.0f), | |
82 [half] "r" (0.5f), [mask] "r" (0x80000000) /* input operands */ | |
83 : "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", "q6" /* clobber list */ | |
84 ); | |
85 | |
75 // leftovers | 86 // leftovers |
76 for (i = n & ~3; i < n; i++) { | 87 while (i--) { |
77 b[i] = (int16_t) lrintf(PA_CLAMP_UNLIKELY(a[i], -1.0f, 1.0f) * 0x7FFF); | 88 *dst++ = (int16_t) lrintf(PA_CLAMP_UNLIKELY(*src, -1.0f, 1.0f) * 0x7FFF); |
78 } | 89 src++; |
79 } | 90 } |
80 | 91 } |
81 void pa_sconv_s16le_to_float32ne(unsigned n, const int16_t *a, float *b) { | 92 |
82 pa_assert(a); | 93 void pa_sconv_s16le_to_float32ne(unsigned n, const int16_t *src, float *dst) { |
83 pa_assert(b); | 94 pa_assert(src); |
95 pa_assert(dst); | |
84 | 96 |
85 for (; n > 0; n--) | 97 for (; n > 0; n--) |
86 *(b++) = ((float) (*(a++)))/(float) 0x7FFF; | 98 *(dst++) = ((float) (*(src++)))/(float) 0x7FFF; |
87 } | 99 } |
88 | 100 |
89 void pa_sconv_s16le_to_f32ne_neon(unsigned n, const int16_t *a, float *b) { | 101 void pa_sconv_s16le_to_f32ne_neon(unsigned n, const int16_t *src, float *dst) { |
90 unsigned i; | 102 unsigned i = n & 3; |
91 | 103 |
92 const float32x4_t invscale4 = vdupq_n_f32(1.0f / 0x7FFF); | 104 const float invscale = 1.0f / 0x7FFF; |
93 | 105 |
94 for (i = 0; i < n/4; i++) { | 106 asm volatile ( |
95 ((float32x4_t *)b)[i] = vmulq_f32(vcvtq_f32_s32(vmovl_s16(((int16x4_t *)a)[i])), invscale4); | 107 "mov %[n], %[n], lsr #2\n\t" |
96 } | 108 "vdup.f32 q1, %[invscale]\n\t" |
109 "1:\n\t" | |
110 "vld1.16 {d0}, [%[src]]!\n\t" | |
111 "vmovl.s16 q0, d0\n\t" | |
112 | |
113 "vcvt.f32.s32 q0, q0\n\t" | |
114 "vmul.f32 q0, q0, q1\n\t" | |
115 | |
116 "subs %[n], %[n], #1\n\t" | |
117 "vst1.32 {q0}, [%[dst]]!\n\t" | |
118 "bgt 1b\n\t" | |
119 /* output operands (or input operands that get modified) */ | |
120 : [dst] "+r" (dst), [src] "+r" (src), [n] "+r" (n) | |
121 : [invscale] "r" (invscale) /* input operands */ | |
122 : "memory", "cc", "q0", "q1" /* clobber list */ | |
123 ); | |
97 | 124 |
98 // leftovers | 125 // leftovers |
99 const float invscale = 1.0f / 0x7FFF; | 126 while (i--) { |
100 for (i = n & ~3; i < n; i++) { | 127 *dst++ = *src++ * invscale; |
101 b[i] = a[i] * invscale; | |
102 } | 128 } |
103 } | 129 } |
104 | 130 |
105 #define SAMPLES 1019 | 131 #define SAMPLES 1019 |
106 #define TIMES 10000 | 132 #define TIMES 100000 |
107 | 133 |
108 static void run_test_from(void) { | 134 static void run_test_from(void) { |
109 int16_t samples[SAMPLES]; | 135 int16_t samples[SAMPLES]; |
110 int16_t samples_ref[SAMPLES]; | 136 int16_t samples_ref[SAMPLES]; |
111 float floats[SAMPLES]; | 137 float floats[SAMPLES]; |
112 int i; | 138 int i; |
113 pa_usec_t start, stop; | 139 pa_usec_t start, stop; |
114 pa_convert_func_t func; | |
115 | 140 |
116 pa_log_debug("checking NEON sconv_s16le_from_float(%d)", SAMPLES); | 141 pa_log_debug("checking NEON sconv_s16le_from_float(%d)", SAMPLES); |
117 | 142 |
118 memset(samples_ref, 0, sizeof(samples_ref)); | 143 memset(samples_ref, 0, sizeof(samples_ref)); |
119 memset(samples, 0, sizeof(samples)); | 144 memset(samples, 0, sizeof(samples)); |
120 | 145 |
121 for (i = 0; i < SAMPLES; i++) { | 146 for (i = 0; i < SAMPLES; i++) { |
122 floats[i] = 2.1f * (rand()/(float) RAND_MAX - 0.5f); | 147 floats[i] = 2.1f * (rand()/(float) RAND_MAX - 0.5f); |
123 } | 148 } |
124 | 149 |
125 func = (pa_convert_func_t) pa_sconv_s16le_from_float32ne; | 150 pa_sconv_s16le_from_float32ne(SAMPLES, floats, samples_ref); |
126 func(SAMPLES, floats, samples_ref); | |
127 pa_sconv_s16le_from_f32ne_neon(SAMPLES, floats, samples); | 151 pa_sconv_s16le_from_f32ne_neon(SAMPLES, floats, samples); |
128 | 152 |
129 for (i = 0; i < SAMPLES; i++) { | 153 for (i = 0; i < SAMPLES; i++) { |
130 if (abs(samples[i] - samples_ref[i]) > 0) { | 154 if (abs(samples[i] - samples_ref[i]) > 0) { |
131 pa_log_debug("%d: %d != %d (%f)", i, samples[i], samples_ref[i], | 155 pa_log_debug("%d: %d != %d (%f)", i, samples[i], samples_ref[i], |
140 stop = pa_rtclock_now(); | 164 stop = pa_rtclock_now(); |
141 pa_log_info("NEON: %llu usec.", (long long unsigned int)(stop - start)); | 165 pa_log_info("NEON: %llu usec.", (long long unsigned int)(stop - start)); |
142 | 166 |
143 start = pa_rtclock_now(); | 167 start = pa_rtclock_now(); |
144 for (i = 0; i < TIMES; i++) { | 168 for (i = 0; i < TIMES; i++) { |
145 func(SAMPLES, floats, samples_ref); | 169 pa_sconv_s16le_from_float32ne(SAMPLES, floats, samples_ref); |
146 } | 170 } |
147 stop = pa_rtclock_now(); | 171 stop = pa_rtclock_now(); |
148 pa_log_info("ref: %llu usec.", (long long unsigned int)(stop - start)); | 172 pa_log_info("ref: %llu usec.", (long long unsigned int)(stop - start)); |
149 } | 173 } |
150 | 174 |