Mercurial > hg > audiostuff
comparison spandsp-0.0.6pre17/src/vector_int.c @ 4:26cd8f1ef0b1
import spandsp-0.0.6pre17
| author | Peter Meerwald <pmeerw@cosy.sbg.ac.at> |
|---|---|
| date | Fri, 25 Jun 2010 15:50:58 +0200 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 3:c6c5a16ce2f2 | 4:26cd8f1ef0b1 |
|---|---|
| 1 /* | |
| 2 * SpanDSP - a series of DSP components for telephony | |
| 3 * | |
| 4 * vector_int.c - Integer vector arithmetic | |
| 5 * | |
| 6 * Written by Steve Underwood <steveu@coppice.org> | |
| 7 * | |
| 8 * Copyright (C) 2006 Steve Underwood | |
| 9 * | |
| 10 * All rights reserved. | |
| 11 * | |
| 12 * This program is free software; you can redistribute it and/or modify | |
| 13 * it under the terms of the GNU Lesser General Public License version 2.1, | |
| 14 * as published by the Free Software Foundation. | |
| 15 * | |
| 16 * This program is distributed in the hope that it will be useful, | |
| 17 * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
| 18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
| 19 * GNU Lesser General Public License for more details. | |
| 20 * | |
| 21 * You should have received a copy of the GNU Lesser General Public | |
| 22 * License along with this program; if not, write to the Free Software | |
| 23 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. | |
| 24 * | |
| 25 * $Id: vector_int.c,v 1.26.4.1 2009/12/28 11:54:59 steveu Exp $ | |
| 26 */ | |
| 27 | |
| 28 /*! \file */ | |
| 29 | |
| 30 #if defined(HAVE_CONFIG_H) | |
| 31 #include "config.h" | |
| 32 #endif | |
| 33 | |
| 34 #include <inttypes.h> | |
| 35 #include <stdlib.h> | |
| 36 #include <stdio.h> | |
| 37 #include <string.h> | |
| 38 #if defined(HAVE_TGMATH_H) | |
| 39 #include <tgmath.h> | |
| 40 #endif | |
| 41 #if defined(HAVE_MATH_H) | |
| 42 #include <math.h> | |
| 43 #endif | |
| 44 #include <assert.h> | |
| 45 | |
| 46 #include "floating_fudge.h" | |
| 47 #include "mmx_sse_decs.h" | |
| 48 | |
| 49 #include "spandsp/telephony.h" | |
| 50 #include "spandsp/vector_int.h" | |
| 51 | |
/*
 * Compute the dot product of two vectors of 16-bit signed integers.
 *
 * x, y  The two input vectors; each must hold at least n elements.
 * n     The number of elements to multiply and accumulate.
 *
 * Returns the sum of x[i]*y[i] for i in [0, n), accumulated in 32 bits.
 * The accumulation wraps modulo 2^32 if the sum exceeds the int32_t range
 * (the MMX paths add with paddd, and the portable path adds in uint32_t so
 * that it wraps the same way instead of invoking signed-overflow undefined
 * behaviour). A value of n <= 0 gives 0 in the portable path.
 *
 * Notes on the inline assembly operand lists:
 *  - The asm advances rsi/rdi (esi/edi), so the pointers are passed as
 *    read-write operands on local copies rather than as input-only operands.
 *  - rdx/edx and mm0-mm2 are used as scratch and are listed as clobbers.
 *  - The asm reads the arrays through the pointers, so "memory" is listed
 *    to make the compiler complete any pending stores before the asm runs.
 *  - On x86-64 the element count is widened to 64 bits before it is placed
 *    in rax, because the address calculation uses the whole of rax.
 */
SPAN_DECLARE(int32_t) vec_dot_prodi16(const int16_t x[], const int16_t y[], int n)
{
    int32_t z;

#if defined(__GNUC__) && defined(SPANDSP_USE_MMX) && defined(__x86_64__)
    const int16_t *xp;
    const int16_t *yp;
    int64_t acc;

    xp = x;
    yp = y;
    acc = n;            /* Sign extend, so all 64 bits of rax are well defined */
    __asm__ __volatile__(
        " emms;\n"
        " pxor %%mm0,%%mm0;\n"
        " leaq -32(%%rsi,%%rax,2),%%rdx;\n"     /* rdx = top - 32 */

        " cmpq %%rdx,%%rsi;\n"
        " ja 1f;\n"

        /* Work in blocks of 16 int16_t's until we are near the end */
        " .p2align 2;\n"
        "2:\n"
        " movq (%%rdi),%%mm1;\n"
        " movq (%%rsi),%%mm2;\n"
        " pmaddwd %%mm2,%%mm1;\n"
        " paddd %%mm1,%%mm0;\n"
        " movq 8(%%rdi),%%mm1;\n"
        " movq 8(%%rsi),%%mm2;\n"
        " pmaddwd %%mm2,%%mm1;\n"
        " paddd %%mm1,%%mm0;\n"
        " movq 16(%%rdi),%%mm1;\n"
        " movq 16(%%rsi),%%mm2;\n"
        " pmaddwd %%mm2,%%mm1;\n"
        " paddd %%mm1,%%mm0;\n"
        " movq 24(%%rdi),%%mm1;\n"
        " movq 24(%%rsi),%%mm2;\n"
        " pmaddwd %%mm2,%%mm1;\n"
        " paddd %%mm1,%%mm0;\n"

        " addq $32,%%rsi;\n"
        " addq $32,%%rdi;\n"
        " cmpq %%rdx,%%rsi;\n"
        " jbe 2b;\n"

        " .p2align 2;\n"
        "1:\n"
        " addq $24,%%rdx;\n"                    /* Now rdx = top - 8 */
        " cmpq %%rdx,%%rsi;\n"
        " ja 3f;\n"

        /* Work in blocks of 4 int16_t's until we are near the end */
        " .p2align 2;\n"
        "4:\n"
        " movq (%%rdi),%%mm1;\n"
        " movq (%%rsi),%%mm2;\n"
        " pmaddwd %%mm2,%%mm1;\n"
        " paddd %%mm1,%%mm0;\n"

        " addq $8,%%rsi;\n"
        " addq $8,%%rdi;\n"
        " cmpq %%rdx,%%rsi;\n"
        " jbe 4b;\n"

        " .p2align 2;\n"
        "3:\n"
        " addq $4,%%rdx;\n"                     /* Now rdx = top - 4 */
        " cmpq %%rdx,%%rsi;\n"
        " ja 5f;\n"

        /* Work in a block of 2 int16_t's */
        " movd (%%rdi),%%mm1;\n"
        " movd (%%rsi),%%mm2;\n"
        " pmaddwd %%mm2,%%mm1;\n"
        " paddd %%mm1,%%mm0;\n"

        " addq $4,%%rsi;\n"
        " addq $4,%%rdi;\n"

        " .p2align 2;\n"
        "5:\n"
        " addq $2,%%rdx;\n"                     /* Now rdx = top - 2 */
        " cmpq %%rdx,%%rsi;\n"
        " ja 6f;\n"

        /* Deal with the very last int16_t, when n is odd. The value is masked
           to 16 bits so the upper word fed to pmaddwd is zero. */
        " movswl (%%rdi),%%eax;\n"
        " andl $65535,%%eax;\n"
        " movd %%eax,%%mm1;\n"
        " movswl (%%rsi),%%eax;\n"
        " andl $65535,%%eax;\n"
        " movd %%eax,%%mm2;\n"
        " pmaddwd %%mm2,%%mm1;\n"
        " paddd %%mm1,%%mm0;\n"

        " .p2align 2;\n"
        "6:\n"
        /* Merge the pieces of the answer */
        " movq %%mm0,%%mm1;\n"
        " punpckhdq %%mm0,%%mm1;\n"
        " paddd %%mm1,%%mm0;\n"
        /* Et voila, eax has the final result */
        " movd %%mm0,%%eax;\n"

        " emms;\n"
        : "+a" (acc), "+S" (xp), "+D" (yp)
        :
        : "rdx", "mm0", "mm1", "mm2", "cc", "memory"
    );
    /* The result was written to eax; keep only those low 32 bits */
    z = (int32_t) acc;
#elif defined(__GNUC__) && defined(SPANDSP_USE_MMX) && defined(__i386__)
    const int16_t *xp;
    const int16_t *yp;

    xp = x;
    yp = y;
    z = n;              /* eax carries n in, and the result out */
    __asm__ __volatile__(
        " emms;\n"
        " pxor %%mm0,%%mm0;\n"
        " leal -32(%%esi,%%eax,2),%%edx;\n"     /* edx = top - 32 */

        " cmpl %%edx,%%esi;\n"
        " ja 1f;\n"

        /* Work in blocks of 16 int16_t's until we are near the end */
        " .p2align 2;\n"
        "2:\n"
        " movq (%%edi),%%mm1;\n"
        " movq (%%esi),%%mm2;\n"
        " pmaddwd %%mm2,%%mm1;\n"
        " paddd %%mm1,%%mm0;\n"
        " movq 8(%%edi),%%mm1;\n"
        " movq 8(%%esi),%%mm2;\n"
        " pmaddwd %%mm2,%%mm1;\n"
        " paddd %%mm1,%%mm0;\n"
        " movq 16(%%edi),%%mm1;\n"
        " movq 16(%%esi),%%mm2;\n"
        " pmaddwd %%mm2,%%mm1;\n"
        " paddd %%mm1,%%mm0;\n"
        " movq 24(%%edi),%%mm1;\n"
        " movq 24(%%esi),%%mm2;\n"
        " pmaddwd %%mm2,%%mm1;\n"
        " paddd %%mm1,%%mm0;\n"

        " addl $32,%%esi;\n"
        " addl $32,%%edi;\n"
        " cmpl %%edx,%%esi;\n"
        " jbe 2b;\n"

        " .p2align 2;\n"
        "1:\n"
        " addl $24,%%edx;\n"                    /* Now edx = top - 8 */
        " cmpl %%edx,%%esi;\n"
        " ja 3f;\n"

        /* Work in blocks of 4 int16_t's until we are near the end */
        " .p2align 2;\n"
        "4:\n"
        " movq (%%edi),%%mm1;\n"
        " movq (%%esi),%%mm2;\n"
        " pmaddwd %%mm2,%%mm1;\n"
        " paddd %%mm1,%%mm0;\n"

        " addl $8,%%esi;\n"
        " addl $8,%%edi;\n"
        " cmpl %%edx,%%esi;\n"
        " jbe 4b;\n"

        " .p2align 2;\n"
        "3:\n"
        " addl $4,%%edx;\n"                     /* Now edx = top - 4 */
        " cmpl %%edx,%%esi;\n"
        " ja 5f;\n"

        /* Work in a block of 2 int16_t's */
        " movd (%%edi),%%mm1;\n"
        " movd (%%esi),%%mm2;\n"
        " pmaddwd %%mm2,%%mm1;\n"
        " paddd %%mm1,%%mm0;\n"

        " addl $4,%%esi;\n"
        " addl $4,%%edi;\n"

        " .p2align 2;\n"
        "5:\n"
        " addl $2,%%edx;\n"                     /* Now edx = top - 2 */
        " cmpl %%edx,%%esi;\n"
        " ja 6f;\n"

        /* Deal with the very last int16_t, when n is odd. The value is masked
           to 16 bits so the upper word fed to pmaddwd is zero. */
        " movswl (%%edi),%%eax;\n"
        " andl $65535,%%eax;\n"
        " movd %%eax,%%mm1;\n"
        " movswl (%%esi),%%eax;\n"
        " andl $65535,%%eax;\n"
        " movd %%eax,%%mm2;\n"
        " pmaddwd %%mm2,%%mm1;\n"
        " paddd %%mm1,%%mm0;\n"

        " .p2align 2;\n"
        "6:\n"
        /* Merge the pieces of the answer */
        " movq %%mm0,%%mm1;\n"
        " punpckhdq %%mm0,%%mm1;\n"
        " paddd %%mm1,%%mm0;\n"
        /* Et voila, eax has the final result */
        " movd %%mm0,%%eax;\n"

        " emms;\n"
        : "+a" (z), "+S" (xp), "+D" (yp)
        :
        : "edx", "mm0", "mm1", "mm2", "cc", "memory"
    );
#else
    uint32_t sum;
    int i;

    /* Each product fits in an int32_t. The running sum is kept unsigned so
       that an overflow wraps (as paddd does above) rather than being
       undefined behaviour. */
    sum = 0;
    for (i = 0;  i < n;  i++)
        sum += (uint32_t) ((int32_t) x[i]*(int32_t) y[i]);
    /*endfor*/
    z = (int32_t) sum;
#endif
    return z;
}
| 261 /*- End of function --------------------------------------------------------*/ | |
| 262 | |
/*
 * Dot product of a circular buffer x (n elements, starting at index pos)
 * with a linear vector y of n elements.
 *
 * The buffer is processed as two linear runs: x[pos..n-1] against
 * y[0..n-pos-1], then x[0..pos-1] against y[n-pos..n-1]. The two partial
 * results are added and returned.
 */
SPAN_DECLARE(int32_t) vec_circular_dot_prodi16(const int16_t x[], const int16_t y[], int n, int pos)
{
    const int tail_len = n - pos;
    int32_t tail_sum;
    int32_t head_sum;

    /* From the current position to the end of the circular buffer */
    tail_sum = vec_dot_prodi16(x + pos, y, tail_len);
    /* From the start of the circular buffer up to the current position */
    head_sum = vec_dot_prodi16(x, y + tail_len, pos);
    return tail_sum + head_sum;
}
| 271 /*- End of function --------------------------------------------------------*/ | |
| 272 | |
/*
 * Apply one LMS-style update step to a vector of 16-bit values.
 *
 * For each of the n elements, x[i] is multiplied by error in 32 bits, the
 * product is shifted right by 15 bits, and the result (converted to int16_t)
 * is added to y[i] in place.
 */
SPAN_DECLARE(void) vec_lmsi16(const int16_t x[], int16_t y[], int n, int16_t error)
{
    const int32_t gain = (int32_t) error;
    int32_t scaled;
    int k;

    k = 0;
    while (k < n)
    {
        scaled = ((int32_t) x[k]*gain) >> 15;
        y[k] += (int16_t) scaled;
        k++;
    }
    /*endwhile*/
}
| 280 /*- End of function --------------------------------------------------------*/ | |
| 281 | |
/*
 * Apply vec_lmsi16() across a circular buffer x (n elements, starting at
 * index pos) and a linear vector y of n elements.
 *
 * The work is split into two linear runs: x[pos..n-1] updates y[0..n-pos-1],
 * then x[0..pos-1] updates y[n-pos..n-1].
 */
SPAN_DECLARE(void) vec_circular_lmsi16(const int16_t x[], int16_t y[], int n, int pos, int16_t error)
{
    const int tail_len = n - pos;

    /* From the current position to the end of the circular buffer */
    vec_lmsi16(x + pos, y, tail_len, error);
    /* From the start of the circular buffer up to the current position */
    vec_lmsi16(x, y + tail_len, pos, error);
}
| 287 /*- End of function --------------------------------------------------------*/ | |
| 288 | |
/*
 * Find the minimum and maximum values in a vector of 16-bit signed integers.
 *
 * x    The input vector, holding at least n elements.
 * n    The number of elements to scan. Callers should pass n >= 1; with
 *      n <= 0 nothing is scanned and the result is not meaningful (the
 *      portable path then reports max = INT16_MIN, min = INT16_MAX and
 *      returns 32767).
 * out  If not NULL, out[0] receives the maximum and out[1] the minimum.
 *
 * Returns the largest magnitude found, i.e. the greater of the maximum and
 * the negated minimum. This can be 32768 when the vector contains -32768.
 *
 * Notes on the inline assembly:
 *  - The output pointer is passed in rdi/edi and tested there. The asm does
 *    not push or pop anything: on x86-64 a push inside inline asm writes
 *    below the stack pointer, into the red zone the compiler may be using
 *    for locals.
 *  - rsi/esi is advanced by the asm, so x is passed as a read-write operand
 *    on a local copy.
 *  - rcx/ecx, rdx/edx and mm0-mm4 are scratch and are listed as clobbers.
 *  - The asm reads x[] and may store to out[], so "memory" is listed.
 *  - On x86-64 the element count is widened to 64 bits before it is placed
 *    in rax, because the address calculation uses the whole of rax.
 */
SPAN_DECLARE(int32_t) vec_min_maxi16(const int16_t x[], int n, int16_t out[])
{
#if defined(__GNUC__) && defined(SPANDSP_USE_MMX) && defined(__x86_64__)
    static const int32_t lower_bound = 0x80008000;
    static const int32_t upper_bound = 0x7FFF7FFF;
    const int16_t *xp;
    int64_t acc;
    int32_t max;

    xp = x;
    acc = n;            /* Sign extend, so all 64 bits of rax are well defined */
    __asm__ __volatile__(
        " emms;\n"
        " leaq -8(%%rsi,%%rax,2),%%rdx;\n"      /* rdx = top - 8 */

        " cmpq %%rdx,%%rsi;\n"
        " jbe 2f;\n"
        /* Fewer than 4 words of input. Seed the max's with the most negative
           value, and the min's with the most positive value. */
        " movd %[lower],%%mm0;\n"
        " movd %[upper],%%mm1;\n"
        " jmp 1f;\n"

        " .p2align 2;\n"
        "2:\n"
        " movq (%%rsi),%%mm0;\n"                /* mm0 will be max's */
        " movq %%mm0,%%mm1;\n"                  /* mm1 will be min's */
        " addq $8,%%rsi;\n"
        " cmpq %%rdx,%%rsi;\n"
        " ja 4f;\n"

        "3:\n"
        " movq (%%rsi),%%mm2;\n"

        " movq %%mm2,%%mm3;\n"
        " pcmpgtw %%mm0,%%mm3;\n"               /* mm3 is bitmask for words where mm2 > mm0 */
        " movq %%mm3,%%mm4;\n"
        " pand %%mm2,%%mm3;\n"                  /* mm3 is mm2 masked to new max's */
        " pandn %%mm0,%%mm4;\n"                 /* mm4 is mm0 masked to its max's */
        " por %%mm3,%%mm4;\n"
        " movq %%mm4,%%mm0;\n"                  /* Now mm0 is updated max's */

        " movq %%mm1,%%mm3;\n"
        " pcmpgtw %%mm2,%%mm3;\n"               /* mm3 is bitmask for words where mm2 < mm1 */
        " pand %%mm3,%%mm2;\n"                  /* mm2 is mm2 masked to new min's */
        " pandn %%mm1,%%mm3;\n"                 /* mm3 is mm1 masked to its min's */
        " por %%mm3,%%mm2;\n"
        " movq %%mm2,%%mm1;\n"                  /* now mm1 is updated min's */

        " addq $8,%%rsi;\n"
        " cmpq %%rdx,%%rsi;\n"
        " jbe 3b;\n"

        " .p2align 2;\n"
        "4:\n"
        /* Merge down the 4-word max/mins to lower 2 words */
        " movq %%mm0,%%mm2;\n"
        " psrlq $32,%%mm2;\n"
        " movq %%mm2,%%mm3;\n"
        " pcmpgtw %%mm0,%%mm3;\n"               /* mm3 is bitmask for words where mm2 > mm0 */
        " pand %%mm3,%%mm2;\n"                  /* mm2 is mm2 masked to new max's */
        " pandn %%mm0,%%mm3;\n"                 /* mm3 is mm0 masked to its max's */
        " por %%mm3,%%mm2;\n"
        " movq %%mm2,%%mm0;\n"                  /* now mm0 is updated max's */

        " movq %%mm1,%%mm2;\n"
        " psrlq $32,%%mm2;\n"
        " movq %%mm1,%%mm3;\n"
        " pcmpgtw %%mm2,%%mm3;\n"               /* mm3 is bitmask for words where mm2 < mm1 */
        " pand %%mm3,%%mm2;\n"                  /* mm2 is mm2 masked to new min's */
        " pandn %%mm1,%%mm3;\n"                 /* mm3 is mm1 masked to its min's */
        " por %%mm3,%%mm2;\n"
        " movq %%mm2,%%mm1;\n"                  /* now mm1 is updated min's */

        " .p2align 2;\n"
        "1:\n"
        " addq $4,%%rdx;\n"                     /* now rdx = top - 4 */
        " cmpq %%rdx,%%rsi;\n"
        " ja 5f;\n"
        /* Here, there are >= 2 words of input remaining */
        " movd (%%rsi),%%mm2;\n"

        " movq %%mm2,%%mm3;\n"
        " pcmpgtw %%mm0,%%mm3;\n"               /* mm3 is bitmask for words where mm2 > mm0 */
        " movq %%mm3,%%mm4;\n"
        " pand %%mm2,%%mm3;\n"                  /* mm3 is mm2 masked to new max's */
        " pandn %%mm0,%%mm4;\n"                 /* mm4 is mm0 masked to its max's */
        " por %%mm3,%%mm4;\n"
        " movq %%mm4,%%mm0;\n"                  /* now mm0 is updated max's */

        " movq %%mm1,%%mm3;\n"
        " pcmpgtw %%mm2,%%mm3;\n"               /* mm3 is bitmask for words where mm2 < mm1 */
        " pand %%mm3,%%mm2;\n"                  /* mm2 is mm2 masked to new min's */
        " pandn %%mm1,%%mm3;\n"                 /* mm3 is mm1 masked to its min's */
        " por %%mm3,%%mm2;\n"
        " movq %%mm2,%%mm1;\n"                  /* now mm1 is updated min's */

        " addq $4,%%rsi;\n"

        " .p2align 2;\n"
        "5:\n"
        /* Merge down the 2-word max/mins to 1 word */
        " movq %%mm0,%%mm2;\n"
        " psrlq $16,%%mm2;\n"
        " movq %%mm2,%%mm3;\n"
        " pcmpgtw %%mm0,%%mm3;\n"               /* mm3 is bitmask for words where mm2 > mm0 */
        " pand %%mm3,%%mm2;\n"                  /* mm2 is mm2 masked to new max's */
        " pandn %%mm0,%%mm3;\n"                 /* mm3 is mm0 masked to its max's */
        " por %%mm3,%%mm2;\n"
        " movd %%mm2,%%ecx;\n"                  /* cx is max so far */

        " movq %%mm1,%%mm2;\n"
        " psrlq $16,%%mm2;\n"
        " movq %%mm1,%%mm3;\n"
        " pcmpgtw %%mm2,%%mm3;\n"               /* mm3 is bitmask for words where mm2 < mm1 */
        " pand %%mm3,%%mm2;\n"                  /* mm2 is mm2 masked to new min's */
        " pandn %%mm1,%%mm3;\n"                 /* mm3 is mm1 masked to its min's */
        " por %%mm3,%%mm2;\n"
        " movd %%mm2,%%eax;\n"                  /* ax is min so far */

        " addq $2,%%rdx;\n"                     /* now rdx = top - 2 */
        " cmpq %%rdx,%%rsi;\n"
        " ja 6f;\n"

        /* Here, there is one word of input left */
        " cmpw (%%rsi),%%cx;\n"
        " jge 9f;\n"
        " movw (%%rsi),%%cx;\n"
        " .p2align 2;\n"
        "9:\n"
        " cmpw (%%rsi),%%ax;\n"
        " jle 6f;\n"
        " movw (%%rsi),%%ax;\n"

        " .p2align 2;\n"
        "6:\n"
        /* (finally!) cx is the max, ax the min */
        " movswl %%cx,%%ecx;\n"
        " movswl %%ax,%%eax;\n"

        " andq %%rdi,%%rdi;\n"                  /* rdi is ptr to output max,min vals */
        " jz 7f;\n"
        " movw %%cx,(%%rdi);\n"                 /* max */
        " movw %%ax,2(%%rdi);\n"                /* min */
        " .p2align 2;\n"
        "7:\n"
        /* Now calculate max absolute value */
        " negl %%eax;\n"
        " cmpl %%ecx,%%eax;\n"
        " jge 8f;\n"
        " movl %%ecx,%%eax;\n"
        " .p2align 2;\n"
        "8:\n"
        " emms;\n"
        : "+a" (acc), "+S" (xp)
        : "D" (out), [lower] "m" (lower_bound), [upper] "m" (upper_bound)
        : "rcx", "rdx", "mm0", "mm1", "mm2", "mm3", "mm4", "cc", "memory"
    );
    /* The result was written to eax; keep only those low 32 bits */
    max = (int32_t) acc;
#elif defined(__GNUC__) && defined(SPANDSP_USE_MMX) && defined(__i386__)
    static const int32_t lower_bound = 0x80008000;
    static const int32_t upper_bound = 0x7FFF7FFF;
    const int16_t *xp;
    int32_t max;

    xp = x;
    max = n;            /* eax carries n in, and the result out */
    __asm__ __volatile__(
        " emms;\n"
        " leal -8(%%esi,%%eax,2),%%edx;\n"      /* edx = top - 8 */

        " cmpl %%edx,%%esi;\n"
        " jbe 2f;\n"
        /* Fewer than 4 words of input. Seed the max's with the most negative
           value, and the min's with the most positive value. */
        " movd %[lower],%%mm0;\n"
        " movd %[upper],%%mm1;\n"
        " jmp 1f;\n"

        " .p2align 2;\n"
        "2:\n"
        " movq (%%esi),%%mm0;\n"                /* mm0 will be max's */
        " movq %%mm0,%%mm1;\n"                  /* mm1 will be min's */
        " addl $8,%%esi;\n"
        " cmpl %%edx,%%esi;\n"
        " ja 4f;\n"

        " .p2align 2;\n"
        "3:\n"
        " movq (%%esi),%%mm2;\n"

        " movq %%mm2,%%mm3;\n"
        " pcmpgtw %%mm0,%%mm3;\n"               /* mm3 is bitmask for words where mm2 > mm0 */
        " movq %%mm3,%%mm4;\n"
        " pand %%mm2,%%mm3;\n"                  /* mm3 is mm2 masked to new max's */
        " pandn %%mm0,%%mm4;\n"                 /* mm4 is mm0 masked to its max's */
        " por %%mm3,%%mm4;\n"
        " movq %%mm4,%%mm0;\n"                  /* Now mm0 is updated max's */

        " movq %%mm1,%%mm3;\n"
        " pcmpgtw %%mm2,%%mm3;\n"               /* mm3 is bitmask for words where mm2 < mm1 */
        " pand %%mm3,%%mm2;\n"                  /* mm2 is mm2 masked to new min's */
        " pandn %%mm1,%%mm3;\n"                 /* mm3 is mm1 masked to its min's */
        " por %%mm3,%%mm2;\n"
        " movq %%mm2,%%mm1;\n"                  /* now mm1 is updated min's */

        " addl $8,%%esi;\n"
        " cmpl %%edx,%%esi;\n"
        " jbe 3b;\n"

        " .p2align 2;\n"
        "4:\n"
        /* Merge down the 4-word max/mins to lower 2 words */
        " movq %%mm0,%%mm2;\n"
        " psrlq $32,%%mm2;\n"
        " movq %%mm2,%%mm3;\n"
        " pcmpgtw %%mm0,%%mm3;\n"               /* mm3 is bitmask for words where mm2 > mm0 */
        " pand %%mm3,%%mm2;\n"                  /* mm2 is mm2 masked to new max's */
        " pandn %%mm0,%%mm3;\n"                 /* mm3 is mm0 masked to its max's */
        " por %%mm3,%%mm2;\n"
        " movq %%mm2,%%mm0;\n"                  /* now mm0 is updated max's */

        " movq %%mm1,%%mm2;\n"
        " psrlq $32,%%mm2;\n"
        " movq %%mm1,%%mm3;\n"
        " pcmpgtw %%mm2,%%mm3;\n"               /* mm3 is bitmask for words where mm2 < mm1 */
        " pand %%mm3,%%mm2;\n"                  /* mm2 is mm2 masked to new min's */
        " pandn %%mm1,%%mm3;\n"                 /* mm3 is mm1 masked to its min's */
        " por %%mm3,%%mm2;\n"
        " movq %%mm2,%%mm1;\n"                  /* now mm1 is updated min's */

        " .p2align 2;\n"
        "1:\n"
        " addl $4,%%edx;\n"                     /* now edx = top - 4 */
        " cmpl %%edx,%%esi;\n"
        " ja 5f;\n"
        /* Here, there are >= 2 words of input remaining */
        " movd (%%esi),%%mm2;\n"

        " movq %%mm2,%%mm3;\n"
        " pcmpgtw %%mm0,%%mm3;\n"               /* mm3 is bitmask for words where mm2 > mm0 */
        " movq %%mm3,%%mm4;\n"
        " pand %%mm2,%%mm3;\n"                  /* mm3 is mm2 masked to new max's */
        " pandn %%mm0,%%mm4;\n"                 /* mm4 is mm0 masked to its max's */
        " por %%mm3,%%mm4;\n"
        " movq %%mm4,%%mm0;\n"                  /* now mm0 is updated max's */

        " movq %%mm1,%%mm3;\n"
        " pcmpgtw %%mm2,%%mm3;\n"               /* mm3 is bitmask for words where mm2 < mm1 */
        " pand %%mm3,%%mm2;\n"                  /* mm2 is mm2 masked to new min's */
        " pandn %%mm1,%%mm3;\n"                 /* mm3 is mm1 masked to its min's */
        " por %%mm3,%%mm2;\n"
        " movq %%mm2,%%mm1;\n"                  /* now mm1 is updated min's */

        " addl $4,%%esi;\n"

        " .p2align 2;\n"
        "5:\n"
        /* Merge down the 2-word max/mins to 1 word */
        " movq %%mm0,%%mm2;\n"
        " psrlq $16,%%mm2;\n"
        " movq %%mm2,%%mm3;\n"
        " pcmpgtw %%mm0,%%mm3;\n"               /* mm3 is bitmask for words where mm2 > mm0 */
        " pand %%mm3,%%mm2;\n"                  /* mm2 is mm2 masked to new max's */
        " pandn %%mm0,%%mm3;\n"                 /* mm3 is mm0 masked to its max's */
        " por %%mm3,%%mm2;\n"
        " movd %%mm2,%%ecx;\n"                  /* cx is max so far */

        " movq %%mm1,%%mm2;\n"
        " psrlq $16,%%mm2;\n"
        " movq %%mm1,%%mm3;\n"
        " pcmpgtw %%mm2,%%mm3;\n"               /* mm3 is bitmask for words where mm2 < mm1 */
        " pand %%mm3,%%mm2;\n"                  /* mm2 is mm2 masked to new min's */
        " pandn %%mm1,%%mm3;\n"                 /* mm3 is mm1 masked to its min's */
        " por %%mm3,%%mm2;\n"
        " movd %%mm2,%%eax;\n"                  /* ax is min so far */

        " addl $2,%%edx;\n"                     /* now edx = top - 2 */
        " cmpl %%edx,%%esi;\n"
        " ja 6f;\n"

        /* Here, there is one word of input left */
        " cmpw (%%esi),%%cx;\n"
        " jge 9f;\n"
        " movw (%%esi),%%cx;\n"
        " .p2align 2;\n"
        "9:\n"
        " cmpw (%%esi),%%ax;\n"
        " jle 6f;\n"
        " movw (%%esi),%%ax;\n"

        " .p2align 2;\n"
        "6:\n"
        /* (finally!) cx is the max, ax the min */
        " movswl %%cx,%%ecx;\n"
        " movswl %%ax,%%eax;\n"

        " andl %%edi,%%edi;\n"                  /* edi is ptr to output max,min vals */
        " jz 7f;\n"
        " movw %%cx,(%%edi);\n"                 /* max */
        " movw %%ax,2(%%edi);\n"                /* min */
        " .p2align 2;\n"
        "7:\n"
        /* Now calculate max absolute value */
        " negl %%eax;\n"
        " cmpl %%ecx,%%eax;\n"
        " jge 8f;\n"
        " movl %%ecx,%%eax;\n"
        " .p2align 2;\n"
        "8:\n"
        " emms;\n"
        : "+a" (max), "+S" (xp)
        : "D" (out), [lower] "m" (lower_bound), [upper] "m" (upper_bound)
        : "ecx", "edx", "mm0", "mm1", "mm2", "mm3", "mm4", "cc", "memory"
    );
#else
    int i;
    int16_t min;
    int16_t max;
    int16_t temp;
    int32_t z;

    /* Start from the opposite extremes, so the first element replaces both */
    max = INT16_MIN;
    min = INT16_MAX;
    for (i = 0;  i < n;  i++)
    {
        temp = x[i];
        if (temp > max)
            max = temp;
        /*endif*/
        if (temp < min)
            min = temp;
        /*endif*/
    }
    /*endfor*/
    if (out)
    {
        out[0] = max;
        out[1] = min;
    }
    /*endif*/
    /* min is promoted to int before abs(), so -32768 gives 32768 */
    z = abs(min);
    if (z > max)
        return z;
    /*endif*/
#endif
    return max;
}
| 627 /*- End of function --------------------------------------------------------*/ | |
| 628 /*- End of file ------------------------------------------------------------*/ |
