Main Page
Related Pages
Modules
Data Structures
Files
Examples
File List
Globals
libavcodec
x86
fft_sse.c
Go to the documentation of this file.
1
/*
2
* FFT/MDCT transform with SSE optimizations
3
* Copyright (c) 2008 Loren Merritt
4
*
5
* This file is part of Libav.
6
*
7
* Libav is free software; you can redistribute it and/or
8
* modify it under the terms of the GNU Lesser General Public
9
* License as published by the Free Software Foundation; either
10
* version 2.1 of the License, or (at your option) any later version.
11
*
12
* Libav is distributed in the hope that it will be useful,
13
* but WITHOUT ANY WARRANTY; without even the implied warranty of
14
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15
* Lesser General Public License for more details.
16
*
17
* You should have received a copy of the GNU Lesser General Public
18
* License along with Libav; if not, write to the Free Software
19
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20
*/
21
22
#include "
libavutil/x86_cpu.h
"
23
#include "
libavcodec/dsputil.h
"
24
#include "
fft.h
"
25
#include "
config.h
"
26
27
/*
 * Four 32-bit words, each with only bit 31 set.
 * ff_imdct_calc_sse() loads this into %xmm7 and applies it with xorps,
 * which flips the sign bit of every packed single-precision lane.
 * NOTE(review): the first macro argument (16) is presumably the required
 * alignment in bytes -- confirm against the DECLARE_ASM_CONST definition.
 */
DECLARE_ASM_CONST(16, unsigned int, ff_m1m1m1m1)[4] = {
    1U << 31,
    1U << 31,
    1U << 31,
    1U << 31
};
29
30
/*
 * FFT dispatch routines used by the C wrappers in this file.
 * Only the prototypes appear here; NOTE(review): the implementations are
 * presumably provided by a separate assembly source -- confirm.
 * Each takes the buffer to transform and nbits, where the wrappers in this
 * file compute the element count as 1 << nbits.
 */
void ff_fft_dispatch_sse(FFTComplex *z, int nbits);
void ff_fft_dispatch_interleave_sse(FFTComplex *z, int nbits);
void ff_fft_dispatch_interleave_avx(FFTComplex *z, int nbits);
33
34
#if HAVE_AVX
/**
 * AVX FFT entry point: forwards the buffer and the size exponent stored in
 * the context to ff_fft_dispatch_interleave_avx().
 *
 * @param s  FFT context; only s->nbits is read
 * @param z  buffer handed to the dispatch routine
 */
void ff_fft_calc_avx(FFTContext *s, FFTComplex *z)
{
    const int nbits = s->nbits;

    ff_fft_dispatch_interleave_avx(z, nbits);
}
#endif
40
41
/**
 * SSE FFT entry point.
 *
 * The transform is delegated to ff_fft_dispatch_interleave_sse(). When the
 * element count (1 << s->nbits) is at most 16, an extra in-place pass runs
 * over the buffer: for every 32-byte chunk it interleaves the two 16-byte
 * vectors lane by lane (unpcklps/unpckhps) and stores the result back.
 *
 * @param s  FFT context; only s->nbits is read
 * @param z  buffer processed in place; it is accessed with movaps, which
 *           requires 16-byte alignment
 */
void ff_fft_calc_sse(FFTContext *s, FFTComplex *z)
{
    int n = 1 << s->nbits;

    ff_fft_dispatch_interleave_sse(z, s->nbits);

    if (n <= 16) {
        /* Negative byte offset from z+n that counts up to zero, so the
         * "add" that advances it also produces the loop condition.
         * NOTE(review): 8*n assumes sizeof(FFTComplex) == 8 -- confirm. */
        x86_reg i = -8*n;
        __asm__ volatile(
            "1: \n"
            "movaps (%0,%1), %%xmm0 \n"
            "movaps %%xmm0, %%xmm1 \n"
            "unpcklps 16(%0,%1), %%xmm0 \n"
            "unpckhps 16(%0,%1), %%xmm1 \n"
            "movaps %%xmm0, (%0,%1) \n"
            "movaps %%xmm1, 16(%0,%1) \n"
            "add $32, %0 \n"
            "jl 1b \n"
            :"+r"(i)
            :"r"(z+n)
            /* The asm overwrites xmm0/xmm1 and accesses the buffer through
             * a register operand, so both must be declared as clobbered. */
            :XMM_CLOBBERS("%xmm0", "%xmm1",) "memory"
        );
    }
}
65
66
/**
 * Reorder z in place according to s->revtab.
 *
 * Elements are processed two at a time: one 16-byte aligned load fetches
 * z[i] and z[i+1], the low 8 bytes are stored to s->tmp_buf[s->revtab[i]]
 * and the high 8 bytes to s->tmp_buf[s->revtab[i+1]]. Once all elements
 * have been scattered, s->tmp_buf is copied back over z.
 * NOTE(review): this relies on sizeof(FFTComplex) == 8 -- confirm.
 *
 * @param s  FFT context; reads s->nbits and s->revtab, writes s->tmp_buf
 * @param z  buffer of 1 << s->nbits elements, permuted in place; it is read
 *           with movaps, which requires 16-byte alignment
 */
void ff_fft_permute_sse(FFTContext *s, FFTComplex *z)
{
    int n = 1 << s->nbits;
    int i;

    for (i = 0; i < n; i += 2) {
        __asm__ volatile(
            "movaps %2, %%xmm0 \n"
            "movlps %%xmm0, %0 \n"
            "movhps %%xmm0, %1 \n"
            :"=m"(s->tmp_buf[s->revtab[i]]),
             "=m"(s->tmp_buf[s->revtab[i+1]])
            /* movaps reads 16 bytes starting at z[i]; z[i+1] is listed as
             * an (otherwise unreferenced) input so the compiler knows the
             * asm depends on it as well. */
            :"m"(z[i]), "m"(z[i+1])
            /* xmm0 is used as scratch and must be declared as clobbered. */
            XMM_CLOBBERS_ONLY("%xmm0")
        );
    }
    memcpy(z, s->tmp_buf, n*sizeof(FFTComplex));
}
82
83
/**
 * Full IMDCT built on top of the context's half-size routine.
 *
 * s->imdct_half() is called with output + n4 as its destination
 * (n4 = s->mdct_size >> 2). The SSE loop that follows then fills the
 * remaining parts of output by mirroring that data, 16 bytes per step:
 *   - the vector at (output + n4) + k is lane-reversed (shufps $0x1b),
 *     sign-flipped (xorps with ff_m1m1m1m1) and stored at (output + n4) + j;
 *   - the vector at (output + 3*n4) + j is lane-reversed and stored at
 *     (output + 3*n4) + k.
 * j and k are byte offsets; j runs from -n up to 0 while k runs down
 * from n - 16.
 *
 * @param s       context; reads s->mdct_size and calls s->imdct_half
 * @param output  destination buffer; it is accessed with movaps, which
 *                requires 16-byte alignment
 * @param input   coefficients, passed unchanged to s->imdct_half()
 */
void ff_imdct_calc_sse(FFTContext *s, FFTSample *output, const FFTSample *input)
{
    x86_reg j, k;
    long n = s->mdct_size;
    long n4 = n >> 2;

    s->imdct_half(s, output + n4, input);

    j = -n;
    k = n-16;
    __asm__ volatile(
        "movaps "MANGLE(ff_m1m1m1m1)", %%xmm7 \n"
        "1: \n"
        "movaps (%2,%1), %%xmm0 \n"
        "movaps (%3,%0), %%xmm1 \n"
        "shufps $0x1b, %%xmm0, %%xmm0 \n"
        "shufps $0x1b, %%xmm1, %%xmm1 \n"
        "xorps %%xmm7, %%xmm0 \n"
        "movaps %%xmm1, (%3,%1) \n"
        "movaps %%xmm0, (%2,%0) \n"
        "sub $16, %1 \n"
        "add $16, %0 \n"
        "jl 1b \n"
        :"+r"(j), "+r"(k)
        :"r"(output+n4), "r"(output+n4*3)
        /* The loop reads and writes output[] through register operands, so
         * the compiler must be told that memory is accessed in addition to
         * the scratch XMM registers. */
        :XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm7",) "memory"
    );
}
111