653 lines
20 KiB
C
653 lines
20 KiB
C
/*
|
|
* SpanDSP - a series of DSP components for telephony
|
|
*
|
|
* vector_int.c - Integer vector arithmetic
|
|
*
|
|
* Written by Steve Underwood <steveu@coppice.org>
|
|
*
|
|
* Copyright (C) 2006 Steve Underwood
|
|
*
|
|
* All rights reserved.
|
|
*
|
|
* This program is free software; you can redistribute it and/or modify
|
|
* it under the terms of the GNU Lesser General Public License version 2.1,
|
|
* as published by the Free Software Foundation.
|
|
*
|
|
* This program is distributed in the hope that it will be useful,
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
* GNU Lesser General Public License for more details.
|
|
*
|
|
* You should have received a copy of the GNU Lesser General Public
|
|
* License along with this program; if not, write to the Free Software
|
|
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
|
*
|
|
* $Id: vector_int.c,v 1.25 2009/02/21 04:27:46 steveu Exp $
|
|
*/
|
|
|
|
/*! \file */
|
|
|
|
#if defined(HAVE_CONFIG_H)
|
|
#include "config.h"
|
|
#endif
|
|
|
|
#include <inttypes.h>
|
|
#include <stdlib.h>
|
|
#include <stdio.h>
|
|
#include <string.h>
|
|
#if defined(HAVE_TGMATH_H)
|
|
#include <tgmath.h>
|
|
#endif
|
|
#if defined(HAVE_MATH_H)
|
|
#include <math.h>
|
|
#endif
|
|
#include "floating_fudge.h"
|
|
#include <assert.h>
|
|
|
|
#if defined(SPANDSP_USE_MMX)
|
|
#include <mmintrin.h>
|
|
#endif
|
|
#if defined(SPANDSP_USE_SSE)
|
|
#include <xmmintrin.h>
|
|
#endif
|
|
#if defined(SPANDSP_USE_SSE2)
|
|
#include <emmintrin.h>
|
|
#endif
|
|
#if defined(SPANDSP_USE_SSE3)
|
|
#include <pmmintrin.h>
|
|
#endif
|
|
#if defined(SPANDSP_USE_SSE4_1)
|
|
#include <smmintrin.h>
|
|
#endif
|
|
#if defined(SPANDSP_USE_SSE4_2)
|
|
#include <nmmintrin.h>
|
|
#endif
|
|
#if defined(SPANDSP_USE_SSE4A)
|
|
#include <ammintrin.h>
|
|
#endif
|
|
#if defined(SPANDSP_USE_SSE5)
|
|
#include <bmmintrin.h>
|
|
#endif
|
|
|
|
#include "spandsp/telephony.h"
|
|
#include "spandsp/vector_int.h"
|
|
|
|
SPAN_DECLARE(int32_t) vec_dot_prodi16(const int16_t x[], const int16_t y[], int n)
|
|
{
|
|
int32_t z;
|
|
|
|
#if defined(__GNUC__) && defined(SPANDSP_USE_MMX)
|
|
#if defined(__x86_64__)
|
|
__asm__ __volatile__(
|
|
" emms;\n"
|
|
" pxor %%mm0,%%mm0;\n"
|
|
" leaq -32(%%rsi,%%rax,2),%%rdx;\n" /* rdx = top - 32 */
|
|
|
|
" cmpq %%rdx,%%rsi;\n"
|
|
" ja 1f;\n"
|
|
|
|
/* Work in blocks of 16 int16_t's until we are near the end */
|
|
" .p2align 2;\n"
|
|
"2:\n"
|
|
" movq (%%rdi),%%mm1;\n"
|
|
" movq (%%rsi),%%mm2;\n"
|
|
" pmaddwd %%mm2,%%mm1;\n"
|
|
" paddd %%mm1,%%mm0;\n"
|
|
" movq 8(%%rdi),%%mm1;\n"
|
|
" movq 8(%%rsi),%%mm2;\n"
|
|
" pmaddwd %%mm2,%%mm1;\n"
|
|
" paddd %%mm1,%%mm0;\n"
|
|
" movq 16(%%rdi),%%mm1;\n"
|
|
" movq 16(%%rsi),%%mm2;\n"
|
|
" pmaddwd %%mm2,%%mm1;\n"
|
|
" paddd %%mm1,%%mm0;\n"
|
|
" movq 24(%%rdi),%%mm1;\n"
|
|
" movq 24(%%rsi),%%mm2;\n"
|
|
" pmaddwd %%mm2,%%mm1;\n"
|
|
" paddd %%mm1,%%mm0;\n"
|
|
|
|
" addq $32,%%rsi;\n"
|
|
" addq $32,%%rdi;\n"
|
|
" cmpq %%rdx,%%rsi;\n"
|
|
" jbe 2b;\n"
|
|
|
|
" .p2align 2;\n"
|
|
"1:\n"
|
|
" addq $24,%%rdx;\n" /* Now edx = top - 8 */
|
|
" cmpq %%rdx,%%rsi;\n"
|
|
" ja 3f;\n"
|
|
|
|
/* Work in blocks of 4 int16_t's until we are near the end */
|
|
" .p2align 2;\n"
|
|
"4:\n"
|
|
" movq (%%rdi),%%mm1;\n"
|
|
" movq (%%rsi),%%mm2;\n"
|
|
" pmaddwd %%mm2,%%mm1;\n"
|
|
" paddd %%mm1,%%mm0;\n"
|
|
|
|
" addq $8,%%rsi;\n"
|
|
" addq $8,%%rdi;\n"
|
|
" cmpq %%rdx,%%rsi;"
|
|
" jbe 4b;\n"
|
|
|
|
" .p2align 2;\n"
|
|
"3:\n"
|
|
" addq $4,%%rdx;\n" /* Now edx = top - 4 */
|
|
" cmpq %%rdx,%%rsi;\n"
|
|
" ja 5f;\n"
|
|
|
|
/* Work in a block of 2 int16_t's */
|
|
" movd (%%rdi),%%mm1;\n"
|
|
" movd (%%rsi),%%mm2;\n"
|
|
" pmaddwd %%mm2,%%mm1;\n"
|
|
" paddd %%mm1,%%mm0;\n"
|
|
|
|
" addq $4,%%rsi;\n"
|
|
" addq $4,%%rdi;\n"
|
|
|
|
" .p2align 2;\n"
|
|
"5:\n"
|
|
" addq $2,%%rdx;\n" /* Now edx = top - 2 */
|
|
" cmpq %%rdx,%%rsi;\n"
|
|
" ja 6f;\n"
|
|
|
|
/* Deal with the very last int16_t, when n is odd */
|
|
" movswl (%%rdi),%%eax;\n"
|
|
" andl $65535,%%eax;\n"
|
|
" movd %%eax,%%mm1;\n"
|
|
" movswl (%%rsi),%%eax;\n"
|
|
" andl $65535,%%eax;\n"
|
|
" movd %%eax,%%mm2;\n"
|
|
" pmaddwd %%mm2,%%mm1;\n"
|
|
" paddd %%mm1,%%mm0;\n"
|
|
|
|
" .p2align 2;\n"
|
|
"6:\n"
|
|
/* Merge the pieces of the answer */
|
|
" movq %%mm0,%%mm1;\n"
|
|
" punpckhdq %%mm0,%%mm1;\n"
|
|
" paddd %%mm1,%%mm0;\n"
|
|
/* Et voila, eax has the final result */
|
|
" movd %%mm0,%%eax;\n"
|
|
|
|
" emms;\n"
|
|
: "=a" (z)
|
|
: "S" (x), "D" (y), "a" (n)
|
|
: "cc"
|
|
);
|
|
#else
|
|
__asm__ __volatile__(
|
|
" emms;\n"
|
|
" pxor %%mm0,%%mm0;\n"
|
|
" leal -32(%%esi,%%eax,2),%%edx;\n" /* edx = top - 32 */
|
|
|
|
" cmpl %%edx,%%esi;\n"
|
|
" ja 1f;\n"
|
|
|
|
/* Work in blocks of 16 int16_t's until we are near the end */
|
|
" .p2align 2;\n"
|
|
"2:\n"
|
|
" movq (%%edi),%%mm1;\n"
|
|
" movq (%%esi),%%mm2;\n"
|
|
" pmaddwd %%mm2,%%mm1;\n"
|
|
" paddd %%mm1,%%mm0;\n"
|
|
" movq 8(%%edi),%%mm1;\n"
|
|
" movq 8(%%esi),%%mm2;\n"
|
|
" pmaddwd %%mm2,%%mm1;\n"
|
|
" paddd %%mm1,%%mm0;\n"
|
|
" movq 16(%%edi),%%mm1;\n"
|
|
" movq 16(%%esi),%%mm2;\n"
|
|
" pmaddwd %%mm2,%%mm1;\n"
|
|
" paddd %%mm1,%%mm0;\n"
|
|
" movq 24(%%edi),%%mm1;\n"
|
|
" movq 24(%%esi),%%mm2;\n"
|
|
" pmaddwd %%mm2,%%mm1;\n"
|
|
" paddd %%mm1,%%mm0;\n"
|
|
|
|
" addl $32,%%esi;\n"
|
|
" addl $32,%%edi;\n"
|
|
" cmpl %%edx,%%esi;\n"
|
|
" jbe 2b;\n"
|
|
|
|
" .p2align 2;\n"
|
|
"1:\n"
|
|
" addl $24,%%edx;\n" /* Now edx = top - 8 */
|
|
" cmpl %%edx,%%esi;\n"
|
|
" ja 3f;\n"
|
|
|
|
/* Work in blocks of 4 int16_t's until we are near the end */
|
|
" .p2align 2;\n"
|
|
"4:\n"
|
|
" movq (%%edi),%%mm1;\n"
|
|
" movq (%%esi),%%mm2;\n"
|
|
" pmaddwd %%mm2,%%mm1;\n"
|
|
" paddd %%mm1,%%mm0;\n"
|
|
|
|
" addl $8,%%esi;\n"
|
|
" addl $8,%%edi;\n"
|
|
" cmpl %%edx,%%esi;"
|
|
" jbe 4b;\n"
|
|
|
|
" .p2align 2;\n"
|
|
"3:\n"
|
|
" addl $4,%%edx;\n" /* Now edx = top - 4 */
|
|
" cmpl %%edx,%%esi;\n"
|
|
" ja 5f;\n"
|
|
|
|
/* Work in a block of 2 int16_t's */
|
|
" movd (%%edi),%%mm1;\n"
|
|
" movd (%%esi),%%mm2;\n"
|
|
" pmaddwd %%mm2,%%mm1;\n"
|
|
" paddd %%mm1,%%mm0;\n"
|
|
|
|
" addl $4,%%esi;\n"
|
|
" addl $4,%%edi;\n"
|
|
|
|
" .p2align 2;\n"
|
|
"5:\n"
|
|
" addl $2,%%edx;\n" /* Now edx = top - 2 */
|
|
" cmpl %%edx,%%esi;\n"
|
|
" ja 6f;\n"
|
|
|
|
/* Deal with the very last int16_t, when n is odd */
|
|
" movswl (%%edi),%%eax;\n"
|
|
" andl $65535,%%eax;\n"
|
|
" movd %%eax,%%mm1;\n"
|
|
" movswl (%%esi),%%eax;\n"
|
|
" andl $65535,%%eax;\n"
|
|
" movd %%eax,%%mm2;\n"
|
|
" pmaddwd %%mm2,%%mm1;\n"
|
|
" paddd %%mm1,%%mm0;\n"
|
|
|
|
" .p2align 2;\n"
|
|
"6:\n"
|
|
/* Merge the pieces of the answer */
|
|
" movq %%mm0,%%mm1;\n"
|
|
" punpckhdq %%mm0,%%mm1;\n"
|
|
" paddd %%mm1,%%mm0;\n"
|
|
/* Et voila, eax has the final result */
|
|
" movd %%mm0,%%eax;\n"
|
|
|
|
" emms;\n"
|
|
: "=a" (z)
|
|
: "S" (x), "D" (y), "a" (n)
|
|
: "cc"
|
|
);
|
|
#endif
|
|
#else
|
|
int i;
|
|
|
|
z = 0;
|
|
for (i = 0; i < n; i++)
|
|
z += (int32_t) x[i]*(int32_t) y[i];
|
|
#endif
|
|
return z;
|
|
}
|
|
/*- End of function --------------------------------------------------------*/
|
|
|
|
SPAN_DECLARE(int32_t) vec_circular_dot_prodi16(const int16_t x[], const int16_t y[], int n, int pos)
|
|
{
|
|
int32_t z;
|
|
|
|
z = vec_dot_prodi16(&x[pos], &y[0], n - pos);
|
|
z += vec_dot_prodi16(&x[0], &y[n - pos], pos);
|
|
return z;
|
|
}
|
|
/*- End of function --------------------------------------------------------*/
|
|
|
|
SPAN_DECLARE(void) vec_lmsi16(const int16_t x[], int16_t y[], int n, int16_t error)
|
|
{
|
|
int i;
|
|
|
|
for (i = 0; i < n; i++)
|
|
y[i] += (int16_t) (((int32_t) x[i]*(int32_t) error) >> 15);
|
|
}
|
|
/*- End of function --------------------------------------------------------*/
|
|
|
|
SPAN_DECLARE(void) vec_circular_lmsi16(const int16_t x[], int16_t y[], int n, int pos, int16_t error)
|
|
{
|
|
vec_lmsi16(&x[pos], &y[0], n - pos, error);
|
|
vec_lmsi16(&x[0], &y[n - pos], pos, error);
|
|
}
|
|
/*- End of function --------------------------------------------------------*/
|
|
|
|
SPAN_DECLARE(int32_t) vec_min_maxi16(const int16_t x[], int n, int16_t out[])
|
|
{
|
|
#if defined(__GNUC__) && defined(SPANDSP_USE_MMX)
|
|
static const int32_t lower_bound = 0x80008000;
|
|
static const int32_t upper_bound = 0x7FFF7FFF;
|
|
int32_t max;
|
|
|
|
#if defined(__x86_64__)
|
|
__asm__ __volatile__(
|
|
" emms;\n"
|
|
" pushq %%rdx;\n"
|
|
" leaq -8(%%rsi,%%rax,2),%%rdx;\n"
|
|
|
|
" cmpq %%rdx,%%rsi;\n"
|
|
" jbe 2f;\n"
|
|
" movd %[lower],%%mm0;\n"
|
|
" movd %[upper],%%mm1;\n"
|
|
" jmp 1f;\n"
|
|
|
|
" .p2align 2;\n"
|
|
"2:\n"
|
|
" movq (%%rsi),%%mm0;\n" /* mm0 will be max's */
|
|
" movq %%mm0,%%mm1;\n" /* mm1 will be min's */
|
|
" addq $8,%%rsi;\n"
|
|
" cmpq %%rdx,%%rsi;\n"
|
|
" ja 4f;\n"
|
|
|
|
"3:\n"
|
|
" movq (%%rsi),%%mm2;\n"
|
|
|
|
" movq %%mm2,%%mm3;\n"
|
|
" pcmpgtw %%mm0,%%mm3;\n" /* mm3 is bitmask for words where mm2 > mm0 */
|
|
" movq %%mm3,%%mm4;\n"
|
|
" pand %%mm2,%%mm3;\n" /* mm3 is mm2 masked to new max's */
|
|
" pandn %%mm0,%%mm4;\n" /* mm4 is mm0 masked to its max's */
|
|
" por %%mm3,%%mm4;\n"
|
|
" movq %%mm4,%%mm0;\n" /* Now mm0 is updated max's */
|
|
|
|
" movq %%mm1,%%mm3;\n"
|
|
" pcmpgtw %%mm2,%%mm3;\n" /* mm3 is bitmask for words where mm2 < mm1 */
|
|
" pand %%mm3,%%mm2;\n" /* mm2 is mm2 masked to new min's */
|
|
" pandn %%mm1,%%mm3;\n" /* mm3 is mm1 masked to its min's */
|
|
" por %%mm3,%%mm2;\n"
|
|
" movq %%mm2,%%mm1;\n" /* now mm1 is updated min's */
|
|
|
|
" addq $8,%%rsi;\n"
|
|
" cmpq %%rdx,%%rsi;\n"
|
|
" jbe 3b;\n"
|
|
|
|
" .p2align 2;\n"
|
|
"4:\n"
|
|
/* Merge down the 4-word max/mins to lower 2 words */
|
|
" movq %%mm0,%%mm2;\n"
|
|
" psrlq $32,%%mm2;\n"
|
|
" movq %%mm2,%%mm3;\n"
|
|
" pcmpgtw %%mm0,%%mm3;\n" /* mm3 is bitmask for words where mm2 > mm0 */
|
|
" pand %%mm3,%%mm2;\n" /* mm2 is mm2 masked to new max's */
|
|
" pandn %%mm0,%%mm3;\n" /* mm3 is mm0 masked to its max's */
|
|
" por %%mm3,%%mm2;\n"
|
|
" movq %%mm2,%%mm0;\n" /* now mm0 is updated max's */
|
|
|
|
" movq %%mm1,%%mm2;\n"
|
|
" psrlq $32,%%mm2;\n"
|
|
" movq %%mm1,%%mm3;\n"
|
|
" pcmpgtw %%mm2,%%mm3;\n" /* mm3 is bitmask for words where mm2 < mm1 */
|
|
" pand %%mm3,%%mm2;\n" /* mm2 is mm2 masked to new min's */
|
|
" pandn %%mm1,%%mm3;\n" /* mm3 is mm1 masked to its min's */
|
|
" por %%mm3,%%mm2;\n"
|
|
" movq %%mm2,%%mm1;\n" /* now mm1 is updated min's */
|
|
|
|
" .p2align 2;\n"
|
|
"1:\n"
|
|
" addq $4,%%rdx;\n" /* now dx = top-4 */
|
|
" cmpq %%rdx,%%rsi;\n"
|
|
" ja 5f;\n"
|
|
/* Here, there are >= 2 words of input remaining */
|
|
" movd (%%rsi),%%mm2;\n"
|
|
|
|
" movq %%mm2,%%mm3;\n"
|
|
" pcmpgtw %%mm0,%%mm3;\n" /* mm3 is bitmask for words where mm2 > mm0 */
|
|
" movq %%mm3,%%mm4;\n"
|
|
" pand %%mm2,%%mm3;\n" /* mm3 is mm2 masked to new max's */
|
|
" pandn %%mm0,%%mm4;\n" /* mm4 is mm0 masked to its max's */
|
|
" por %%mm3,%%mm4;\n"
|
|
" movq %%mm4,%%mm0;\n" /* now mm0 is updated max's */
|
|
|
|
" movq %%mm1,%%mm3;\n"
|
|
" pcmpgtw %%mm2,%%mm3;\n" /* mm3 is bitmask for words where mm2 < mm1 */
|
|
" pand %%mm3,%%mm2;\n" /* mm2 is mm2 masked to new min's */
|
|
" pandn %%mm1,%%mm3;\n" /* mm3 is mm1 masked to its min's */
|
|
" por %%mm3,%%mm2;\n"
|
|
" movq %%mm2,%%mm1;\n" /* now mm1 is updated min's */
|
|
|
|
" addq $4,%%rsi;\n"
|
|
|
|
" .p2align 2;\n"
|
|
"5:\n"
|
|
/* Merge down the 2-word max/mins to 1 word */
|
|
" movq %%mm0,%%mm2;\n"
|
|
" psrlq $16,%%mm2;\n"
|
|
" movq %%mm2,%%mm3;\n"
|
|
" pcmpgtw %%mm0,%%mm3;\n" /* mm3 is bitmask for words where mm2 > mm0 */
|
|
" pand %%mm3,%%mm2;\n" /* mm2 is mm2 masked to new max's */
|
|
" pandn %%mm0,%%mm3;\n" /* mm3 is mm0 masked to its max's */
|
|
" por %%mm3,%%mm2;\n"
|
|
" movd %%mm2,%%ecx;\n" /* cx is max so far */
|
|
|
|
" movq %%mm1,%%mm2;\n"
|
|
" psrlq $16,%%mm2;\n"
|
|
" movq %%mm1,%%mm3;\n"
|
|
" pcmpgtw %%mm2,%%mm3;\n" /* mm3 is bitmask for words where mm2 < mm1 */
|
|
" pand %%mm3,%%mm2;\n" /* mm2 is mm2 masked to new min's */
|
|
" pandn %%mm1,%%mm3;\n" /* mm3 is mm1 masked to its min's */
|
|
" por %%mm3,%%mm2;\n"
|
|
" movd %%mm2,%%eax;\n" /* ax is min so far */
|
|
|
|
" addq $2,%%rdx;\n" /* now dx = top-2 */
|
|
" cmpq %%rdx,%%rsi;\n"
|
|
" ja 6f;\n"
|
|
|
|
/* Here, there is one word of input left */
|
|
" cmpw (%%rsi),%%cx;\n"
|
|
" jge 9f;\n"
|
|
" movw (%%rsi),%%cx;\n"
|
|
" .p2align 2;\n"
|
|
"9:\n"
|
|
" cmpw (%%rsi),%%ax;\n"
|
|
" jle 6f;\n"
|
|
" movw (%%rsi),%%ax;\n"
|
|
|
|
" .p2align 2;\n"
|
|
"6:\n"
|
|
/* (finally!) cx is the max, ax the min */
|
|
" movswl %%cx,%%ecx;\n"
|
|
" movswl %%ax,%%eax;\n"
|
|
|
|
" popq %%rdx;\n" /* ptr to output max,min vals */
|
|
" andq %%rdx,%%rdx;\n"
|
|
" jz 7f;\n"
|
|
" movw %%cx,(%%rdx);\n" /* max */
|
|
" movw %%ax,2(%%rdx);\n" /* min */
|
|
" .p2align 2;\n"
|
|
"7:\n"
|
|
/* Now calculate max absolute value */
|
|
" negl %%eax;\n"
|
|
" cmpl %%ecx,%%eax;\n"
|
|
" jge 8f;\n"
|
|
" movl %%ecx,%%eax;\n"
|
|
" .p2align 2;\n"
|
|
"8:\n"
|
|
" emms;\n"
|
|
: "=a" (max)
|
|
: "S" (x), "a" (n), "d" (out), [lower] "m" (lower_bound), [upper] "m" (upper_bound)
|
|
: "ecx"
|
|
);
|
|
#else
|
|
__asm__ __volatile__(
|
|
" emms;\n"
|
|
" pushl %%edx;\n"
|
|
" leal -8(%%esi,%%eax,2),%%edx;\n"
|
|
|
|
" cmpl %%edx,%%esi;\n"
|
|
" jbe 2f;\n"
|
|
" movd %[lower],%%mm0;\n"
|
|
" movd %[upper],%%mm1;\n"
|
|
" jmp 1f;\n"
|
|
|
|
" .p2align 2;\n"
|
|
"2:\n"
|
|
" movq (%%esi),%%mm0;\n" /* mm0 will be max's */
|
|
" movq %%mm0,%%mm1;\n" /* mm1 will be min's */
|
|
" addl $8,%%esi;\n"
|
|
" cmpl %%edx,%%esi;\n"
|
|
" ja 4f;\n"
|
|
|
|
" .p2align 2;\n"
|
|
"3:\n"
|
|
" movq (%%esi),%%mm2;\n"
|
|
|
|
" movq %%mm2,%%mm3;\n"
|
|
" pcmpgtw %%mm0,%%mm3;\n" /* mm3 is bitmask for words where mm2 > mm0 */
|
|
" movq %%mm3,%%mm4;\n"
|
|
" pand %%mm2,%%mm3;\n" /* mm3 is mm2 masked to new max's */
|
|
" pandn %%mm0,%%mm4;\n" /* mm4 is mm0 masked to its max's */
|
|
" por %%mm3,%%mm4;\n"
|
|
" movq %%mm4,%%mm0;\n" /* Now mm0 is updated max's */
|
|
|
|
" movq %%mm1,%%mm3;\n"
|
|
" pcmpgtw %%mm2,%%mm3;\n" /* mm3 is bitmask for words where mm2 < mm1 */
|
|
" pand %%mm3,%%mm2;\n" /* mm2 is mm2 masked to new min's */
|
|
" pandn %%mm1,%%mm3;\n" /* mm3 is mm1 masked to its min's */
|
|
" por %%mm3,%%mm2;\n"
|
|
" movq %%mm2,%%mm1;\n" /* now mm1 is updated min's */
|
|
|
|
" addl $8,%%esi;\n"
|
|
" cmpl %%edx,%%esi;\n"
|
|
" jbe 3b;\n"
|
|
|
|
" .p2align 2;\n"
|
|
"4:\n"
|
|
/* Merge down the 4-word max/mins to lower 2 words */
|
|
" movq %%mm0,%%mm2;\n"
|
|
" psrlq $32,%%mm2;\n"
|
|
" movq %%mm2,%%mm3;\n"
|
|
" pcmpgtw %%mm0,%%mm3;\n" /* mm3 is bitmask for words where mm2 > mm0 */
|
|
" pand %%mm3,%%mm2;\n" /* mm2 is mm2 masked to new max's */
|
|
" pandn %%mm0,%%mm3;\n" /* mm3 is mm0 masked to its max's */
|
|
" por %%mm3,%%mm2;\n"
|
|
" movq %%mm2,%%mm0;\n" /* now mm0 is updated max's */
|
|
|
|
" movq %%mm1,%%mm2;\n"
|
|
" psrlq $32,%%mm2;\n"
|
|
" movq %%mm1,%%mm3;\n"
|
|
" pcmpgtw %%mm2,%%mm3;\n" /* mm3 is bitmask for words where mm2 < mm1 */
|
|
" pand %%mm3,%%mm2;\n" /* mm2 is mm2 masked to new min's */
|
|
" pandn %%mm1,%%mm3;\n" /* mm3 is mm1 masked to its min's */
|
|
" por %%mm3,%%mm2;\n"
|
|
" movq %%mm2,%%mm1;\n" /* now mm1 is updated min's */
|
|
|
|
" .p2align 2;\n"
|
|
"1:\n"
|
|
" addl $4,%%edx;\n" /* now dx = top-4 */
|
|
" cmpl %%edx,%%esi;\n"
|
|
" ja 5f;\n"
|
|
/* Here, there are >= 2 words of input remaining */
|
|
" movd (%%esi),%%mm2;\n"
|
|
|
|
" movq %%mm2,%%mm3;\n"
|
|
" pcmpgtw %%mm0,%%mm3;\n" /* mm3 is bitmask for words where mm2 > mm0 */
|
|
" movq %%mm3,%%mm4;\n"
|
|
" pand %%mm2,%%mm3;\n" /* mm3 is mm2 masked to new max's */
|
|
" pandn %%mm0,%%mm4;\n" /* mm4 is mm0 masked to its max's */
|
|
" por %%mm3,%%mm4;\n"
|
|
" movq %%mm4,%%mm0;\n" /* now mm0 is updated max's */
|
|
|
|
" movq %%mm1,%%mm3;\n"
|
|
" pcmpgtw %%mm2,%%mm3;\n" /* mm3 is bitmask for words where mm2 < mm1 */
|
|
" pand %%mm3,%%mm2;\n" /* mm2 is mm2 masked to new min's */
|
|
" pandn %%mm1,%%mm3;\n" /* mm3 is mm1 masked to its min's */
|
|
" por %%mm3,%%mm2;\n"
|
|
" movq %%mm2,%%mm1;\n" /* now mm1 is updated min's */
|
|
|
|
" addl $4,%%esi;\n"
|
|
|
|
" .p2align 2;\n"
|
|
"5:\n"
|
|
/* Merge down the 2-word max/mins to 1 word */
|
|
" movq %%mm0,%%mm2;\n"
|
|
" psrlq $16,%%mm2;\n"
|
|
" movq %%mm2,%%mm3;\n"
|
|
" pcmpgtw %%mm0,%%mm3;\n" /* mm3 is bitmask for words where mm2 > mm0 */
|
|
" pand %%mm3,%%mm2;\n" /* mm2 is mm2 masked to new max's */
|
|
" pandn %%mm0,%%mm3;\n" /* mm3 is mm0 masked to its max's */
|
|
" por %%mm3,%%mm2;\n"
|
|
" movd %%mm2,%%ecx;\n" /* cx is max so far */
|
|
|
|
" movq %%mm1,%%mm2;\n"
|
|
" psrlq $16,%%mm2;\n"
|
|
" movq %%mm1,%%mm3;\n"
|
|
" pcmpgtw %%mm2,%%mm3;\n" /* mm3 is bitmask for words where mm2 < mm1 */
|
|
" pand %%mm3,%%mm2;\n" /* mm2 is mm2 masked to new min's */
|
|
" pandn %%mm1,%%mm3;\n" /* mm3 is mm1 masked to its min's */
|
|
" por %%mm3,%%mm2;\n"
|
|
" movd %%mm2,%%eax;\n" /* ax is min so far */
|
|
|
|
" addl $2,%%edx;\n" /* now dx = top-2 */
|
|
" cmpl %%edx,%%esi;\n"
|
|
" ja 6f;\n"
|
|
|
|
/* Here, there is one word of input left */
|
|
" cmpw (%%esi),%%cx;\n"
|
|
" jge 9f;\n"
|
|
" movw (%%esi),%%cx;\n"
|
|
" .p2align 2;\n"
|
|
"9:\n"
|
|
" cmpw (%%esi),%%ax;\n"
|
|
" jle 6f;\n"
|
|
" movw (%%esi),%%ax;\n"
|
|
|
|
" .p2align 2;\n"
|
|
"6:\n"
|
|
/* (finally!) cx is the max, ax the min */
|
|
" movswl %%cx,%%ecx;\n"
|
|
" movswl %%ax,%%eax;\n"
|
|
|
|
" popl %%edx;\n" /* ptr to output max,min vals */
|
|
" andl %%edx,%%edx;\n"
|
|
" jz 7f;\n"
|
|
" movw %%cx,(%%edx);\n" /* max */
|
|
" movw %%ax,2(%%edx);\n" /* min */
|
|
" .p2align 2;\n"
|
|
"7:\n"
|
|
/* Now calculate max absolute value */
|
|
" negl %%eax;\n"
|
|
" cmpl %%ecx,%%eax;\n"
|
|
" jge 8f;\n"
|
|
" movl %%ecx,%%eax;\n"
|
|
" .p2align 2;\n"
|
|
"8:\n"
|
|
" emms;\n"
|
|
: "=a" (max)
|
|
: "S" (x), "a" (n), "d" (out), [lower] "m" (lower_bound), [upper] "m" (upper_bound)
|
|
: "ecx"
|
|
);
|
|
#endif
|
|
return max;
|
|
#else
|
|
int i;
|
|
int16_t min;
|
|
int16_t max;
|
|
int16_t temp;
|
|
int32_t z;
|
|
|
|
max = INT16_MIN;
|
|
min = INT16_MAX;
|
|
for (i = 0; i < n; i++)
|
|
{
|
|
temp = x[i];
|
|
if (temp > max)
|
|
max = temp;
|
|
/*endif*/
|
|
if (temp < min)
|
|
min = temp;
|
|
/*endif*/
|
|
}
|
|
/*endfor*/
|
|
if (out)
|
|
{
|
|
out[0] = max;
|
|
out[1] = min;
|
|
}
|
|
z = abs(min);
|
|
if (z > max)
|
|
return z;
|
|
return max;
|
|
#endif
|
|
}
|
|
/*- End of function --------------------------------------------------------*/
|
|
/*- End of file ------------------------------------------------------------*/
|