Mercurial > hg > audiostuff
comparison spandsp-0.0.6pre17/src/vector_int.c @ 4:26cd8f1ef0b1
import spandsp-0.0.6pre17
author | Peter Meerwald <pmeerw@cosy.sbg.ac.at> |
---|---|
date | Fri, 25 Jun 2010 15:50:58 +0200 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
3:c6c5a16ce2f2 | 4:26cd8f1ef0b1 |
---|---|
1 /* | |
2 * SpanDSP - a series of DSP components for telephony | |
3 * | |
4 * vector_int.c - Integer vector arithmetic | |
5 * | |
6 * Written by Steve Underwood <steveu@coppice.org> | |
7 * | |
8 * Copyright (C) 2006 Steve Underwood | |
9 * | |
10 * All rights reserved. | |
11 * | |
12 * This program is free software; you can redistribute it and/or modify | |
13 * it under the terms of the GNU Lesser General Public License version 2.1, | |
14 * as published by the Free Software Foundation. | |
15 * | |
16 * This program is distributed in the hope that it will be useful, | |
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
19 * GNU Lesser General Public License for more details. | |
20 * | |
21 * You should have received a copy of the GNU Lesser General Public | |
22 * License along with this program; if not, write to the Free Software | |
23 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. | |
24 * | |
25 * $Id: vector_int.c,v 1.26.4.1 2009/12/28 11:54:59 steveu Exp $ | |
26 */ | |
27 | |
28 /*! \file */ | |
29 | |
30 #if defined(HAVE_CONFIG_H) | |
31 #include "config.h" | |
32 #endif | |
33 | |
34 #include <inttypes.h> | |
35 #include <stdlib.h> | |
36 #include <stdio.h> | |
37 #include <string.h> | |
38 #if defined(HAVE_TGMATH_H) | |
39 #include <tgmath.h> | |
40 #endif | |
41 #if defined(HAVE_MATH_H) | |
42 #include <math.h> | |
43 #endif | |
44 #include <assert.h> | |
45 | |
46 #include "floating_fudge.h" | |
47 #include "mmx_sse_decs.h" | |
48 | |
49 #include "spandsp/telephony.h" | |
50 #include "spandsp/vector_int.h" | |
51 | |
/*! \brief Compute the dot product of two vectors of int16_t.
    \param x The first vector.
    \param y The second vector.
    \param n The number of elements in each vector.
    \return The sum of x[i]*y[i] for i = 0 .. n - 1, accumulated with 32 bit
            wrap-around (the MMX paths accumulate with paddd, and the portable
            path accumulates in an unsigned 32 bit integer to match). */
SPAN_DECLARE(int32_t) vec_dot_prodi16(const int16_t x[], const int16_t y[], int n)
{
    int32_t z;

#if defined(__GNUC__) && defined(SPANDSP_USE_MMX) && defined(__x86_64__)
    /* The code advances rsi and rdi and uses rdx as scratch, so the pointers are
       declared as read/write operands and rdx plus the MMX registers are listed
       as clobbers. n is widened to 64 bits because it is used as an index in rax. */
    __asm__ __volatile__(
        " emms;\n"
        " pxor %%mm0,%%mm0;\n"
        " leaq -32(%%rsi,%%rax,2),%%rdx;\n"     /* rdx = top - 32 */

        " cmpq %%rdx,%%rsi;\n"
        " ja 1f;\n"

        /* Work in blocks of 16 int16_t's until we are near the end */
        " .p2align 2;\n"
        "2:\n"
        " movq (%%rdi),%%mm1;\n"
        " movq (%%rsi),%%mm2;\n"
        " pmaddwd %%mm2,%%mm1;\n"
        " paddd %%mm1,%%mm0;\n"
        " movq 8(%%rdi),%%mm1;\n"
        " movq 8(%%rsi),%%mm2;\n"
        " pmaddwd %%mm2,%%mm1;\n"
        " paddd %%mm1,%%mm0;\n"
        " movq 16(%%rdi),%%mm1;\n"
        " movq 16(%%rsi),%%mm2;\n"
        " pmaddwd %%mm2,%%mm1;\n"
        " paddd %%mm1,%%mm0;\n"
        " movq 24(%%rdi),%%mm1;\n"
        " movq 24(%%rsi),%%mm2;\n"
        " pmaddwd %%mm2,%%mm1;\n"
        " paddd %%mm1,%%mm0;\n"

        " addq $32,%%rsi;\n"
        " addq $32,%%rdi;\n"
        " cmpq %%rdx,%%rsi;\n"
        " jbe 2b;\n"

        " .p2align 2;\n"
        "1:\n"
        " addq $24,%%rdx;\n"                    /* Now rdx = top - 8 */
        " cmpq %%rdx,%%rsi;\n"
        " ja 3f;\n"

        /* Work in blocks of 4 int16_t's until we are near the end */
        " .p2align 2;\n"
        "4:\n"
        " movq (%%rdi),%%mm1;\n"
        " movq (%%rsi),%%mm2;\n"
        " pmaddwd %%mm2,%%mm1;\n"
        " paddd %%mm1,%%mm0;\n"

        " addq $8,%%rsi;\n"
        " addq $8,%%rdi;\n"
        " cmpq %%rdx,%%rsi;\n"
        " jbe 4b;\n"

        " .p2align 2;\n"
        "3:\n"
        " addq $4,%%rdx;\n"                     /* Now rdx = top - 4 */
        " cmpq %%rdx,%%rsi;\n"
        " ja 5f;\n"

        /* Work in a block of 2 int16_t's */
        " movd (%%rdi),%%mm1;\n"
        " movd (%%rsi),%%mm2;\n"
        " pmaddwd %%mm2,%%mm1;\n"
        " paddd %%mm1,%%mm0;\n"

        " addq $4,%%rsi;\n"
        " addq $4,%%rdi;\n"

        " .p2align 2;\n"
        "5:\n"
        " addq $2,%%rdx;\n"                     /* Now rdx = top - 2 */
        " cmpq %%rdx,%%rsi;\n"
        " ja 6f;\n"

        /* Deal with the very last int16_t, when n is odd */
        " movswl (%%rdi),%%eax;\n"
        " andl $65535,%%eax;\n"
        " movd %%eax,%%mm1;\n"
        " movswl (%%rsi),%%eax;\n"
        " andl $65535,%%eax;\n"
        " movd %%eax,%%mm2;\n"
        " pmaddwd %%mm2,%%mm1;\n"
        " paddd %%mm1,%%mm0;\n"

        " .p2align 2;\n"
        "6:\n"
        /* Merge the pieces of the answer */
        " movq %%mm0,%%mm1;\n"
        " punpckhdq %%mm0,%%mm1;\n"
        " paddd %%mm1,%%mm0;\n"
        /* Et voila, eax has the final result */
        " movd %%mm0,%%eax;\n"

        " emms;\n"
        : "=a" (z), "+S" (x), "+D" (y)
        : "a" ((int64_t) n)
        : "cc", "memory", "rdx", "mm0", "mm1", "mm2"
    );
#elif defined(__GNUC__) && defined(SPANDSP_USE_MMX) && defined(__i386__)
    /* The code advances esi and edi and uses edx as scratch, so the pointers are
       declared as read/write operands and edx plus the MMX registers are listed
       as clobbers. */
    __asm__ __volatile__(
        " emms;\n"
        " pxor %%mm0,%%mm0;\n"
        " leal -32(%%esi,%%eax,2),%%edx;\n"     /* edx = top - 32 */

        " cmpl %%edx,%%esi;\n"
        " ja 1f;\n"

        /* Work in blocks of 16 int16_t's until we are near the end */
        " .p2align 2;\n"
        "2:\n"
        " movq (%%edi),%%mm1;\n"
        " movq (%%esi),%%mm2;\n"
        " pmaddwd %%mm2,%%mm1;\n"
        " paddd %%mm1,%%mm0;\n"
        " movq 8(%%edi),%%mm1;\n"
        " movq 8(%%esi),%%mm2;\n"
        " pmaddwd %%mm2,%%mm1;\n"
        " paddd %%mm1,%%mm0;\n"
        " movq 16(%%edi),%%mm1;\n"
        " movq 16(%%esi),%%mm2;\n"
        " pmaddwd %%mm2,%%mm1;\n"
        " paddd %%mm1,%%mm0;\n"
        " movq 24(%%edi),%%mm1;\n"
        " movq 24(%%esi),%%mm2;\n"
        " pmaddwd %%mm2,%%mm1;\n"
        " paddd %%mm1,%%mm0;\n"

        " addl $32,%%esi;\n"
        " addl $32,%%edi;\n"
        " cmpl %%edx,%%esi;\n"
        " jbe 2b;\n"

        " .p2align 2;\n"
        "1:\n"
        " addl $24,%%edx;\n"                    /* Now edx = top - 8 */
        " cmpl %%edx,%%esi;\n"
        " ja 3f;\n"

        /* Work in blocks of 4 int16_t's until we are near the end */
        " .p2align 2;\n"
        "4:\n"
        " movq (%%edi),%%mm1;\n"
        " movq (%%esi),%%mm2;\n"
        " pmaddwd %%mm2,%%mm1;\n"
        " paddd %%mm1,%%mm0;\n"

        " addl $8,%%esi;\n"
        " addl $8,%%edi;\n"
        " cmpl %%edx,%%esi;\n"
        " jbe 4b;\n"

        " .p2align 2;\n"
        "3:\n"
        " addl $4,%%edx;\n"                     /* Now edx = top - 4 */
        " cmpl %%edx,%%esi;\n"
        " ja 5f;\n"

        /* Work in a block of 2 int16_t's */
        " movd (%%edi),%%mm1;\n"
        " movd (%%esi),%%mm2;\n"
        " pmaddwd %%mm2,%%mm1;\n"
        " paddd %%mm1,%%mm0;\n"

        " addl $4,%%esi;\n"
        " addl $4,%%edi;\n"

        " .p2align 2;\n"
        "5:\n"
        " addl $2,%%edx;\n"                     /* Now edx = top - 2 */
        " cmpl %%edx,%%esi;\n"
        " ja 6f;\n"

        /* Deal with the very last int16_t, when n is odd */
        " movswl (%%edi),%%eax;\n"
        " andl $65535,%%eax;\n"
        " movd %%eax,%%mm1;\n"
        " movswl (%%esi),%%eax;\n"
        " andl $65535,%%eax;\n"
        " movd %%eax,%%mm2;\n"
        " pmaddwd %%mm2,%%mm1;\n"
        " paddd %%mm1,%%mm0;\n"

        " .p2align 2;\n"
        "6:\n"
        /* Merge the pieces of the answer */
        " movq %%mm0,%%mm1;\n"
        " punpckhdq %%mm0,%%mm1;\n"
        " paddd %%mm1,%%mm0;\n"
        /* Et voila, eax has the final result */
        " movd %%mm0,%%eax;\n"

        " emms;\n"
        : "=a" (z), "+S" (x), "+D" (y)
        : "a" (n)
        : "cc", "memory", "edx", "mm0", "mm1", "mm2"
    );
#else
    uint32_t acc;
    int i;

    /* Accumulate in an unsigned type, so a sum which exceeds 32 bits wraps
       instead of being undefined signed overflow. Each individual product of
       two int16_t values always fits in an int32_t. */
    acc = 0;
    for (i = 0;  i < n;  i++)
        acc += (uint32_t) ((int32_t) x[i]*(int32_t) y[i]);
    /*endfor*/
    /* Reinterpret the wrapped sum as signed. Out of range conversion is
       implementation defined; GCC and compatible compilers reduce modulo 2^32. */
    z = (int32_t) acc;
#endif
    return z;
}
/*- End of function --------------------------------------------------------*/
262 | |
/*! \brief Dot product of a circular buffer x with a linear vector y.
    \param x The circular buffer, n elements long.
    \param y The linear vector, n elements long.
    \param n The number of elements in each vector.
    \param pos The index in x which is paired with y[0].
    \return The dot product of x[pos .. n - 1] with y[0 .. n - pos - 1], plus
            the dot product of x[0 .. pos - 1] with y[n - pos .. n - 1]. */
SPAN_DECLARE(int32_t) vec_circular_dot_prodi16(const int16_t x[], const int16_t y[], int n, int pos)
{
    const int tail_len = n - pos;

    /* First the part of x from pos to the end, then the wrapped part from the start. */
    return vec_dot_prodi16(x + pos, y, tail_len) + vec_dot_prodi16(x, y + tail_len, pos);
}
/*- End of function --------------------------------------------------------*/
272 | |
/*! \brief Add a scaled copy of x to y, in place.
    \param x The vector to be scaled.
    \param y The vector which is updated.
    \param n The number of elements in each vector.
    \param error The scaling factor. Each product x[i]*error is shifted right
           by 15 bits before being added to y[i]. */
SPAN_DECLARE(void) vec_lmsi16(const int16_t x[], int16_t y[], int n, int16_t error)
{
    const int32_t gain = error;
    const int16_t *src;
    int16_t *dst;
    int remaining;

    src = x;
    dst = y;
    for (remaining = n;  remaining > 0;  remaining--)
    {
        *dst = (int16_t) (*dst + (int16_t) (((int32_t) *src*gain) >> 15));
        src++;
        dst++;
    }
    /*endfor*/
}
/*- End of function --------------------------------------------------------*/
281 | |
/*! \brief Apply vec_lmsi16() with x treated as a circular buffer.
    \param x The circular buffer, n elements long.
    \param y The vector which is updated, n elements long.
    \param n The number of elements in each vector.
    \param pos The index in x which is paired with y[0].
    \param error The scaling factor passed through to vec_lmsi16(). */
SPAN_DECLARE(void) vec_circular_lmsi16(const int16_t x[], int16_t y[], int n, int pos, int16_t error)
{
    const int tail_len = n - pos;

    /* x[pos .. n - 1] updates the start of y */
    vec_lmsi16(x + pos, y, tail_len, error);
    /* x[0 .. pos - 1] updates the remainder of y */
    vec_lmsi16(x, y + tail_len, pos, error);
}
/*- End of function --------------------------------------------------------*/
288 | |
/*! \brief Find the minimum and maximum values in a vector of int16_t.
    \param x The vector to be searched.
    \param n The number of elements in the vector.
    \param out If not NULL, out[0] receives the maximum value and out[1]
           receives the minimum value.
    \return The larger of the maximum value and the negated minimum value,
            i.e. the largest absolute value found. This can be 32768, when the
            vector contains -32768. */
SPAN_DECLARE(int32_t) vec_min_maxi16(const int16_t x[], int n, int16_t out[])
{
#if defined(__GNUC__) && defined(SPANDSP_USE_MMX) && defined(__x86_64__)
    static const int32_t lower_bound = 0x80008000;
    static const int32_t upper_bound = 0x7FFF7FFF;
    int32_t max;

    /* The output pointer is kept in rdi for the whole sequence, and rdx is a
       declared scratch register. Nothing is pushed on the stack, as a push
       inside inline assembler would overwrite the x86-64 red zone below rsp.
       rsi is advanced by the code, so x is a read/write operand. n is widened
       to 64 bits because it is used as an index in rax. */
    __asm__ __volatile__(
        " emms;\n"
        " leaq -8(%%rsi,%%rax,2),%%rdx;\n"

        " cmpq %%rdx,%%rsi;\n"
        " jbe 2f;\n"
        " movd %[lower],%%mm0;\n"
        " movd %[upper],%%mm1;\n"
        " jmp 1f;\n"

        " .p2align 2;\n"
        "2:\n"
        " movq (%%rsi),%%mm0;\n"    /* mm0 will be max's */
        " movq %%mm0,%%mm1;\n"      /* mm1 will be min's */
        " addq $8,%%rsi;\n"
        " cmpq %%rdx,%%rsi;\n"
        " ja 4f;\n"

        "3:\n"
        " movq (%%rsi),%%mm2;\n"

        " movq %%mm2,%%mm3;\n"
        " pcmpgtw %%mm0,%%mm3;\n"   /* mm3 is bitmask for words where mm2 > mm0 */
        " movq %%mm3,%%mm4;\n"
        " pand %%mm2,%%mm3;\n"      /* mm3 is mm2 masked to new max's */
        " pandn %%mm0,%%mm4;\n"     /* mm4 is mm0 masked to its max's */
        " por %%mm3,%%mm4;\n"
        " movq %%mm4,%%mm0;\n"      /* Now mm0 is updated max's */

        " movq %%mm1,%%mm3;\n"
        " pcmpgtw %%mm2,%%mm3;\n"   /* mm3 is bitmask for words where mm2 < mm1 */
        " pand %%mm3,%%mm2;\n"      /* mm2 is mm2 masked to new min's */
        " pandn %%mm1,%%mm3;\n"     /* mm3 is mm1 masked to its min's */
        " por %%mm3,%%mm2;\n"
        " movq %%mm2,%%mm1;\n"      /* now mm1 is updated min's */

        " addq $8,%%rsi;\n"
        " cmpq %%rdx,%%rsi;\n"
        " jbe 3b;\n"

        " .p2align 2;\n"
        "4:\n"
        /* Merge down the 4-word max/mins to lower 2 words */
        " movq %%mm0,%%mm2;\n"
        " psrlq $32,%%mm2;\n"
        " movq %%mm2,%%mm3;\n"
        " pcmpgtw %%mm0,%%mm3;\n"   /* mm3 is bitmask for words where mm2 > mm0 */
        " pand %%mm3,%%mm2;\n"      /* mm2 is mm2 masked to new max's */
        " pandn %%mm0,%%mm3;\n"     /* mm3 is mm0 masked to its max's */
        " por %%mm3,%%mm2;\n"
        " movq %%mm2,%%mm0;\n"      /* now mm0 is updated max's */

        " movq %%mm1,%%mm2;\n"
        " psrlq $32,%%mm2;\n"
        " movq %%mm1,%%mm3;\n"
        " pcmpgtw %%mm2,%%mm3;\n"   /* mm3 is bitmask for words where mm2 < mm1 */
        " pand %%mm3,%%mm2;\n"      /* mm2 is mm2 masked to new min's */
        " pandn %%mm1,%%mm3;\n"     /* mm3 is mm1 masked to its min's */
        " por %%mm3,%%mm2;\n"
        " movq %%mm2,%%mm1;\n"      /* now mm1 is updated min's */

        " .p2align 2;\n"
        "1:\n"
        " addq $4,%%rdx;\n"         /* now rdx = top-4 */
        " cmpq %%rdx,%%rsi;\n"
        " ja 5f;\n"
        /* Here, there are >= 2 words of input remaining */
        " movd (%%rsi),%%mm2;\n"

        " movq %%mm2,%%mm3;\n"
        " pcmpgtw %%mm0,%%mm3;\n"   /* mm3 is bitmask for words where mm2 > mm0 */
        " movq %%mm3,%%mm4;\n"
        " pand %%mm2,%%mm3;\n"      /* mm3 is mm2 masked to new max's */
        " pandn %%mm0,%%mm4;\n"     /* mm4 is mm0 masked to its max's */
        " por %%mm3,%%mm4;\n"
        " movq %%mm4,%%mm0;\n"      /* now mm0 is updated max's */

        " movq %%mm1,%%mm3;\n"
        " pcmpgtw %%mm2,%%mm3;\n"   /* mm3 is bitmask for words where mm2 < mm1 */
        " pand %%mm3,%%mm2;\n"      /* mm2 is mm2 masked to new min's */
        " pandn %%mm1,%%mm3;\n"     /* mm3 is mm1 masked to its min's */
        " por %%mm3,%%mm2;\n"
        " movq %%mm2,%%mm1;\n"      /* now mm1 is updated min's */

        " addq $4,%%rsi;\n"

        " .p2align 2;\n"
        "5:\n"
        /* Merge down the 2-word max/mins to 1 word */
        " movq %%mm0,%%mm2;\n"
        " psrlq $16,%%mm2;\n"
        " movq %%mm2,%%mm3;\n"
        " pcmpgtw %%mm0,%%mm3;\n"   /* mm3 is bitmask for words where mm2 > mm0 */
        " pand %%mm3,%%mm2;\n"      /* mm2 is mm2 masked to new max's */
        " pandn %%mm0,%%mm3;\n"     /* mm3 is mm0 masked to its max's */
        " por %%mm3,%%mm2;\n"
        " movd %%mm2,%%ecx;\n"      /* cx is max so far */

        " movq %%mm1,%%mm2;\n"
        " psrlq $16,%%mm2;\n"
        " movq %%mm1,%%mm3;\n"
        " pcmpgtw %%mm2,%%mm3;\n"   /* mm3 is bitmask for words where mm2 < mm1 */
        " pand %%mm3,%%mm2;\n"      /* mm2 is mm2 masked to new min's */
        " pandn %%mm1,%%mm3;\n"     /* mm3 is mm1 masked to its min's */
        " por %%mm3,%%mm2;\n"
        " movd %%mm2,%%eax;\n"      /* ax is min so far */

        " addq $2,%%rdx;\n"         /* now rdx = top-2 */
        " cmpq %%rdx,%%rsi;\n"
        " ja 6f;\n"

        /* Here, there is one word of input left */
        " cmpw (%%rsi),%%cx;\n"
        " jge 9f;\n"
        " movw (%%rsi),%%cx;\n"
        " .p2align 2;\n"
        "9:\n"
        " cmpw (%%rsi),%%ax;\n"
        " jle 6f;\n"
        " movw (%%rsi),%%ax;\n"

        " .p2align 2;\n"
        "6:\n"
        /* (finally!) cx is the max, ax the min */
        " movswl %%cx,%%ecx;\n"
        " movswl %%ax,%%eax;\n"

        " testq %%rdi,%%rdi;\n"     /* rdi is the ptr to output max,min vals */
        " jz 7f;\n"
        " movw %%cx,(%%rdi);\n"     /* max */
        " movw %%ax,2(%%rdi);\n"    /* min */
        " .p2align 2;\n"
        "7:\n"
        /* Now calculate max absolute value */
        " negl %%eax;\n"
        " cmpl %%ecx,%%eax;\n"
        " jge 8f;\n"
        " movl %%ecx,%%eax;\n"
        " .p2align 2;\n"
        "8:\n"
        " emms;\n"
        : "=a" (max), "+S" (x)
        : "a" ((int64_t) n), "D" (out), [lower] "m" (lower_bound), [upper] "m" (upper_bound)
        : "cc", "memory", "rcx", "rdx", "mm0", "mm1", "mm2", "mm3", "mm4"
    );
#elif defined(__GNUC__) && defined(SPANDSP_USE_MMX) && defined(__i386__)
    static const int32_t lower_bound = 0x80008000;
    static const int32_t upper_bound = 0x7FFF7FFF;
    int32_t max;

    /* edx (the output pointer) is saved on the stack while it is used as
       scratch, and restored before the results are stored. The i386 ABI has no
       red zone, so the push is harmless here. esi is advanced by the code, so
       x is a read/write operand. */
    __asm__ __volatile__(
        " emms;\n"
        " pushl %%edx;\n"
        " leal -8(%%esi,%%eax,2),%%edx;\n"

        " cmpl %%edx,%%esi;\n"
        " jbe 2f;\n"
        " movd %[lower],%%mm0;\n"
        " movd %[upper],%%mm1;\n"
        " jmp 1f;\n"

        " .p2align 2;\n"
        "2:\n"
        " movq (%%esi),%%mm0;\n"    /* mm0 will be max's */
        " movq %%mm0,%%mm1;\n"      /* mm1 will be min's */
        " addl $8,%%esi;\n"
        " cmpl %%edx,%%esi;\n"
        " ja 4f;\n"

        " .p2align 2;\n"
        "3:\n"
        " movq (%%esi),%%mm2;\n"

        " movq %%mm2,%%mm3;\n"
        " pcmpgtw %%mm0,%%mm3;\n"   /* mm3 is bitmask for words where mm2 > mm0 */
        " movq %%mm3,%%mm4;\n"
        " pand %%mm2,%%mm3;\n"      /* mm3 is mm2 masked to new max's */
        " pandn %%mm0,%%mm4;\n"     /* mm4 is mm0 masked to its max's */
        " por %%mm3,%%mm4;\n"
        " movq %%mm4,%%mm0;\n"      /* Now mm0 is updated max's */

        " movq %%mm1,%%mm3;\n"
        " pcmpgtw %%mm2,%%mm3;\n"   /* mm3 is bitmask for words where mm2 < mm1 */
        " pand %%mm3,%%mm2;\n"      /* mm2 is mm2 masked to new min's */
        " pandn %%mm1,%%mm3;\n"     /* mm3 is mm1 masked to its min's */
        " por %%mm3,%%mm2;\n"
        " movq %%mm2,%%mm1;\n"      /* now mm1 is updated min's */

        " addl $8,%%esi;\n"
        " cmpl %%edx,%%esi;\n"
        " jbe 3b;\n"

        " .p2align 2;\n"
        "4:\n"
        /* Merge down the 4-word max/mins to lower 2 words */
        " movq %%mm0,%%mm2;\n"
        " psrlq $32,%%mm2;\n"
        " movq %%mm2,%%mm3;\n"
        " pcmpgtw %%mm0,%%mm3;\n"   /* mm3 is bitmask for words where mm2 > mm0 */
        " pand %%mm3,%%mm2;\n"      /* mm2 is mm2 masked to new max's */
        " pandn %%mm0,%%mm3;\n"     /* mm3 is mm0 masked to its max's */
        " por %%mm3,%%mm2;\n"
        " movq %%mm2,%%mm0;\n"      /* now mm0 is updated max's */

        " movq %%mm1,%%mm2;\n"
        " psrlq $32,%%mm2;\n"
        " movq %%mm1,%%mm3;\n"
        " pcmpgtw %%mm2,%%mm3;\n"   /* mm3 is bitmask for words where mm2 < mm1 */
        " pand %%mm3,%%mm2;\n"      /* mm2 is mm2 masked to new min's */
        " pandn %%mm1,%%mm3;\n"     /* mm3 is mm1 masked to its min's */
        " por %%mm3,%%mm2;\n"
        " movq %%mm2,%%mm1;\n"      /* now mm1 is updated min's */

        " .p2align 2;\n"
        "1:\n"
        " addl $4,%%edx;\n"         /* now edx = top-4 */
        " cmpl %%edx,%%esi;\n"
        " ja 5f;\n"
        /* Here, there are >= 2 words of input remaining */
        " movd (%%esi),%%mm2;\n"

        " movq %%mm2,%%mm3;\n"
        " pcmpgtw %%mm0,%%mm3;\n"   /* mm3 is bitmask for words where mm2 > mm0 */
        " movq %%mm3,%%mm4;\n"
        " pand %%mm2,%%mm3;\n"      /* mm3 is mm2 masked to new max's */
        " pandn %%mm0,%%mm4;\n"     /* mm4 is mm0 masked to its max's */
        " por %%mm3,%%mm4;\n"
        " movq %%mm4,%%mm0;\n"      /* now mm0 is updated max's */

        " movq %%mm1,%%mm3;\n"
        " pcmpgtw %%mm2,%%mm3;\n"   /* mm3 is bitmask for words where mm2 < mm1 */
        " pand %%mm3,%%mm2;\n"      /* mm2 is mm2 masked to new min's */
        " pandn %%mm1,%%mm3;\n"     /* mm3 is mm1 masked to its min's */
        " por %%mm3,%%mm2;\n"
        " movq %%mm2,%%mm1;\n"      /* now mm1 is updated min's */

        " addl $4,%%esi;\n"

        " .p2align 2;\n"
        "5:\n"
        /* Merge down the 2-word max/mins to 1 word */
        " movq %%mm0,%%mm2;\n"
        " psrlq $16,%%mm2;\n"
        " movq %%mm2,%%mm3;\n"
        " pcmpgtw %%mm0,%%mm3;\n"   /* mm3 is bitmask for words where mm2 > mm0 */
        " pand %%mm3,%%mm2;\n"      /* mm2 is mm2 masked to new max's */
        " pandn %%mm0,%%mm3;\n"     /* mm3 is mm0 masked to its max's */
        " por %%mm3,%%mm2;\n"
        " movd %%mm2,%%ecx;\n"      /* cx is max so far */

        " movq %%mm1,%%mm2;\n"
        " psrlq $16,%%mm2;\n"
        " movq %%mm1,%%mm3;\n"
        " pcmpgtw %%mm2,%%mm3;\n"   /* mm3 is bitmask for words where mm2 < mm1 */
        " pand %%mm3,%%mm2;\n"      /* mm2 is mm2 masked to new min's */
        " pandn %%mm1,%%mm3;\n"     /* mm3 is mm1 masked to its min's */
        " por %%mm3,%%mm2;\n"
        " movd %%mm2,%%eax;\n"      /* ax is min so far */

        " addl $2,%%edx;\n"         /* now edx = top-2 */
        " cmpl %%edx,%%esi;\n"
        " ja 6f;\n"

        /* Here, there is one word of input left */
        " cmpw (%%esi),%%cx;\n"
        " jge 9f;\n"
        " movw (%%esi),%%cx;\n"
        " .p2align 2;\n"
        "9:\n"
        " cmpw (%%esi),%%ax;\n"
        " jle 6f;\n"
        " movw (%%esi),%%ax;\n"

        " .p2align 2;\n"
        "6:\n"
        /* (finally!) cx is the max, ax the min */
        " movswl %%cx,%%ecx;\n"
        " movswl %%ax,%%eax;\n"

        " popl %%edx;\n"            /* ptr to output max,min vals */
        " andl %%edx,%%edx;\n"
        " jz 7f;\n"
        " movw %%cx,(%%edx);\n"     /* max */
        " movw %%ax,2(%%edx);\n"    /* min */
        " .p2align 2;\n"
        "7:\n"
        /* Now calculate max absolute value */
        " negl %%eax;\n"
        " cmpl %%ecx,%%eax;\n"
        " jge 8f;\n"
        " movl %%ecx,%%eax;\n"
        " .p2align 2;\n"
        "8:\n"
        " emms;\n"
        : "=a" (max), "+S" (x)
        : "a" (n), "d" (out), [lower] "m" (lower_bound), [upper] "m" (upper_bound)
        : "cc", "memory", "ecx", "mm0", "mm1", "mm2", "mm3", "mm4"
    );
#else
    int i;
    int16_t min;
    int16_t max;
    int16_t temp;
    int32_t z;

    /* Start from the extremes, so any sample replaces them. If n is not
       positive these initial values are what gets reported. */
    max = INT16_MIN;
    min = INT16_MAX;
    for (i = 0;  i < n;  i++)
    {
        temp = x[i];
        if (temp > max)
            max = temp;
        /*endif*/
        if (temp < min)
            min = temp;
        /*endif*/
    }
    /*endfor*/
    if (out)
    {
        out[0] = max;
        out[1] = min;
    }
    /*endif*/
    /* min is promoted to int, so abs() of -32768 gives 32768 without overflow */
    z = abs(min);
    if (z > max)
        return z;
    /*endif*/
#endif
    return max;
}
/*- End of function --------------------------------------------------------*/
628 /*- End of file ------------------------------------------------------------*/ |