mpegvideo_mmx.c
Go to the documentation of this file.
1 /*
2  * The simplest mpeg encoder (well, it was the simplest!)
3  * Copyright (c) 2000,2001 Fabrice Bellard
4  *
5  * Optimized for ia32 CPUs by Nick Kurshev <nickols_k@mail.ru>
6  * h263, mpeg1, mpeg2 dequantizer & draw_edges by Michael Niedermayer <michaelni@gmx.at>
7  *
8  * This file is part of Libav.
9  *
10  * Libav is free software; you can redistribute it and/or
11  * modify it under the terms of the GNU Lesser General Public
12  * License as published by the Free Software Foundation; either
13  * version 2.1 of the License, or (at your option) any later version.
14  *
15  * Libav is distributed in the hope that it will be useful,
16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18  * Lesser General Public License for more details.
19  *
20  * You should have received a copy of the GNU Lesser General Public
21  * License along with Libav; if not, write to the Free Software
22  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23  */
24 
25 #include "libavutil/cpu.h"
26 #include "libavutil/x86_cpu.h"
27 #include "libavcodec/avcodec.h"
28 #include "libavcodec/dsputil.h"
29 #include "libavcodec/mpegvideo.h"
30 #include "dsputil_mmx.h"
31 
32 extern uint16_t inv_zigzag_direct16[64];
33 
34 
36  DCTELEM *block, int n, int qscale)
37 {
/*
 * Dequantizes the coefficients of block[] in place with MMX, 8 at a time:
 * every non-zero coefficient c becomes c*qmul + qadd (c > 0) or
 * c*qmul - qadd (c < 0); zero coefficients stay zero. block[0] is handled
 * separately in C and written back after the asm loop.
 *
 * s      - codec context; reads h263_aic, ac_pred, y_dc_scale, c_dc_scale,
 *          block_last_index[] and inter_scantable.raster_end[]
 * block  - coefficients, modified in place
 * n      - block index; n < 4 selects y_dc_scale, otherwise c_dc_scale
 *          (presumably luma vs. chroma - confirm)
 * qscale - quantizer scale; qmul = 2*qscale, qadd = (qscale-1)|1 (or 0 when
 *          s->h263_aic is set)
 *
 * NOTE(review): the line carrying this function's name and first parameter
 * is not present in this listing.
 */
38  x86_reg level, qmul, qadd, nCoeffs;
39 
40  qmul = qscale << 1;
41 
42  assert(s->block_last_index[n]>=0 || s->h263_aic);
43 
 /* Compute the replacement for block[0] up front; the asm loop below also
  * rewrites block[0], so the value is restored after it. */
44  if (!s->h263_aic) {
45  if (n < 4)
46  level = block[0] * s->y_dc_scale;
47  else
48  level = block[0] * s->c_dc_scale;
49  qadd = (qscale - 1) | 1;
50  }else{
51  qadd = 0;
52  level= block[0];
53  }
 /* nCoeffs is the index of the last coefficient to process (inclusive). */
54  if(s->ac_pred)
55  nCoeffs=63;
56  else
57  nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ];
58 //printf("%d %d ", qmul, qadd);
 /*
  * Operands: %0 = block+nCoeffs, %1 = qmul, %2 = qadd,
  *           %3 = -2*nCoeffs (negative byte index counting up towards 0).
  * Register setup: mm6 = qmul in all four words, mm7 = -qadd in all four
  * words, mm4 = 0.
  * Per 16-bit lane, with m = low 16 bits of block[i]*qmul and
  * p = (block[i] > 0 ? -1 : 0):
  *   t   = (m ^ p) - qadd
  *   out = t ^ p          -> m + qadd for positive input, m - qadd otherwise
  *   out = 0 where t == -qadd (keeps zero inputs at zero)
  * NOTE(review): %3 is listed as an input operand but is modified by the
  * "add" at the end of the loop; GCC's extended-asm rules expect modified
  * operands to be declared as outputs/in-outs - confirm.
  */
59 __asm__ volatile(
60  "movd %1, %%mm6 \n\t" //qmul
61  "packssdw %%mm6, %%mm6 \n\t"
62  "packssdw %%mm6, %%mm6 \n\t" // mm6 = qmul replicated in all 4 words
63  "movd %2, %%mm5 \n\t" //qadd
64  "pxor %%mm7, %%mm7 \n\t"
65  "packssdw %%mm5, %%mm5 \n\t"
66  "packssdw %%mm5, %%mm5 \n\t" // mm5 = qadd replicated in all 4 words
67  "psubw %%mm5, %%mm7 \n\t" // mm7 = -qadd
68  "pxor %%mm4, %%mm4 \n\t" // mm4 = 0
69  ".p2align 4 \n\t"
70  "1: \n\t"
71  "movq (%0, %3), %%mm0 \n\t"
72  "movq 8(%0, %3), %%mm1 \n\t"
73 
74  "pmullw %%mm6, %%mm0 \n\t" // m = block[i]*qmul (low 16 bits)
75  "pmullw %%mm6, %%mm1 \n\t"
76 
77  "movq (%0, %3), %%mm2 \n\t"
78  "movq 8(%0, %3), %%mm3 \n\t"
79 
80  "pcmpgtw %%mm4, %%mm2 \n\t" // block[i] > 0 ? -1 : 0
81  "pcmpgtw %%mm4, %%mm3 \n\t" // block[i] > 0 ? -1 : 0
82 
83  "pxor %%mm2, %%mm0 \n\t"
84  "pxor %%mm3, %%mm1 \n\t"
85 
86  "paddw %%mm7, %%mm0 \n\t" // t = (m ^ p) - qadd
87  "paddw %%mm7, %%mm1 \n\t"
88 
89  "pxor %%mm0, %%mm2 \n\t" // result candidate = t ^ p
90  "pxor %%mm1, %%mm3 \n\t"
91 
92  "pcmpeqw %%mm7, %%mm0 \n\t" // block[i] == 0 ? -1 : 0
93  "pcmpeqw %%mm7, %%mm1 \n\t" // block[i] == 0 ? -1 : 0
94 
95  "pandn %%mm2, %%mm0 \n\t" // clear lanes whose input was zero
96  "pandn %%mm3, %%mm1 \n\t"
97 
98  "movq %%mm0, (%0, %3) \n\t"
99  "movq %%mm1, 8(%0, %3) \n\t"
100 
101  "add $16, %3 \n\t" // next 8 coefficients
102  "jng 1b \n\t" // repeat while the updated byte index is <= 0
103  ::"r" (block+nCoeffs), "rm"(qmul), "rm" (qadd), "r" (2*(-nCoeffs))
104  : "memory"
105  );
 /* Overwrite block[0] with the value computed before the loop. */
106  block[0]= level;
107 }
108 
109 
111  DCTELEM *block, int n, int qscale)
112 {
/*
 * Dequantizes the coefficients of block[] in place with MMX, 8 at a time:
 * every non-zero coefficient c becomes c*qmul + qadd (c > 0) or
 * c*qmul - qadd (c < 0) with qmul = 2*qscale and qadd = (qscale-1)|1;
 * zero coefficients stay zero. Unlike the preceding function, block[0]
 * gets no special treatment.
 *
 * s      - codec context; reads block_last_index[] and
 *          inter_scantable.raster_end[] (h263_aic only inside the assert)
 * block  - coefficients, modified in place
 * n      - block index, used to look up block_last_index[n]
 * qscale - quantizer scale
 *
 * NOTE(review): the line carrying this function's name and first parameter
 * is not present in this listing.
 */
113  x86_reg qmul, qadd, nCoeffs;
114 
115  qmul = qscale << 1;
116  qadd = (qscale - 1) | 1;
117 
118  assert(s->block_last_index[n]>=0 || s->h263_aic);
119 
 /* nCoeffs is the index of the last coefficient to process (inclusive). */
120  nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ];
121 //printf("%d %d ", qmul, qadd);
 /*
  * Operands: %0 = block+nCoeffs, %1 = qmul, %2 = qadd,
  *           %3 = -2*nCoeffs (negative byte index counting up towards 0).
  * Per 16-bit lane, with m = low 16 bits of block[i]*qmul and
  * p = (block[i] > 0 ? -1 : 0):
  *   t   = (m ^ p) - qadd
  *   out = t ^ p          -> m + qadd for positive input, m - qadd otherwise
  *   out = 0 where t == -qadd (keeps zero inputs at zero)
  * NOTE(review): %3 is listed as an input operand but is modified by the
  * "add" at the end of the loop; GCC's extended-asm rules expect modified
  * operands to be declared as outputs/in-outs - confirm.
  */
122 __asm__ volatile(
123  "movd %1, %%mm6 \n\t" //qmul
124  "packssdw %%mm6, %%mm6 \n\t"
125  "packssdw %%mm6, %%mm6 \n\t" // mm6 = qmul replicated in all 4 words
126  "movd %2, %%mm5 \n\t" //qadd
127  "pxor %%mm7, %%mm7 \n\t"
128  "packssdw %%mm5, %%mm5 \n\t"
129  "packssdw %%mm5, %%mm5 \n\t" // mm5 = qadd replicated in all 4 words
130  "psubw %%mm5, %%mm7 \n\t" // mm7 = -qadd
131  "pxor %%mm4, %%mm4 \n\t" // mm4 = 0
132  ".p2align 4 \n\t"
133  "1: \n\t"
134  "movq (%0, %3), %%mm0 \n\t"
135  "movq 8(%0, %3), %%mm1 \n\t"
136 
137  "pmullw %%mm6, %%mm0 \n\t" // m = block[i]*qmul (low 16 bits)
138  "pmullw %%mm6, %%mm1 \n\t"
139 
140  "movq (%0, %3), %%mm2 \n\t"
141  "movq 8(%0, %3), %%mm3 \n\t"
142 
143  "pcmpgtw %%mm4, %%mm2 \n\t" // block[i] > 0 ? -1 : 0
144  "pcmpgtw %%mm4, %%mm3 \n\t" // block[i] > 0 ? -1 : 0
145 
146  "pxor %%mm2, %%mm0 \n\t"
147  "pxor %%mm3, %%mm1 \n\t"
148 
149  "paddw %%mm7, %%mm0 \n\t" // t = (m ^ p) - qadd
150  "paddw %%mm7, %%mm1 \n\t"
151 
152  "pxor %%mm0, %%mm2 \n\t" // result candidate = t ^ p
153  "pxor %%mm1, %%mm3 \n\t"
154 
155  "pcmpeqw %%mm7, %%mm0 \n\t" // block[i] == 0 ? -1 : 0
156  "pcmpeqw %%mm7, %%mm1 \n\t" // block[i] == 0 ? -1 : 0
157 
158  "pandn %%mm2, %%mm0 \n\t" // clear lanes whose input was zero
159  "pandn %%mm3, %%mm1 \n\t"
160 
161  "movq %%mm0, (%0, %3) \n\t"
162  "movq %%mm1, 8(%0, %3) \n\t"
163 
164  "add $16, %3 \n\t" // next 8 coefficients
165  "jng 1b \n\t" // repeat while the updated byte index is <= 0
166  ::"r" (block+nCoeffs), "rm"(qmul), "rm" (qadd), "r" (2*(-nCoeffs))
167  : "memory"
168  );
169 }
170 
171 
172 /*
173  NK:
174  Note: looking at PARANOID:
175  "enable all paranoid tests for rounding, overflows, etc..."
176 
177 #ifdef PARANOID
178  if (level < -2048 || level > 2047)
179  fprintf(stderr, "unquant error %d %d\n", i, level);
180 #endif
181  We can suppose that result of two multiplications can't be greater than 0xFFFF
182  i.e. is 16-bit, so we use here only PMULLW instruction and can avoid
183  a complex multiplication.
184 =====================================================
185  Full formula for multiplication of 2 integer numbers
186  which are represent as high:low words:
187  input: value1 = high1:low1
188  value2 = high2:low2
189  output: value3 = value1*value2
190  value3=high3:low3 (on overflow: modulus 2^32 wrap-around)
191  this mean that for 0x123456 * 0x123456 correct result is 0x766cb0ce4
192  but this algorithm will compute only 0x66cb0ce4
193  this limited by 16-bit size of operands
194  ---------------------------------
195  tlow1 = high1*low2
196  tlow2 = high2*low1
197  tlow1 = tlow1 + tlow2
198  high3:low3 = low1*low2
199  high3 += tlow1
200 */
202  DCTELEM *block, int n, int qscale)
203 {
/*
 * Dequantizes block[] in place with MMX using s->intra_matrix.
 * For every processed coefficient c, with q = low 16 bits of
 * qscale*quant_matrix[i]:
 *   out = sign(c) * ((((|c| * q) >> 3) - 1) | 1),  and 0 when c == 0.
 * block[0] is replaced afterwards by block[0] * y_dc_scale (n < 4) or
 * block[0] * c_dc_scale, computed from the value it had before the loop.
 *
 * s      - codec context; reads block_last_index[],
 *          intra_scantable.raster_end[], intra_matrix, y_dc_scale, c_dc_scale
 * block  - coefficients, modified in place
 * n      - block index
 * qscale - quantizer scale
 *
 * NOTE(review): the line carrying this function's name and first parameter
 * is not present in this listing.
 */
204  x86_reg nCoeffs;
205  const uint16_t *quant_matrix;
206  int block0;
207 
208  assert(s->block_last_index[n]>=0);
209 
 /* Here nCoeffs is one past the last coefficient; the loop below runs while
  * the byte index is still negative. */
210  nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]+1;
211 
212  if (n < 4)
213  block0 = block[0] * s->y_dc_scale;
214  else
215  block0 = block[0] * s->c_dc_scale;
216  /* XXX: only mpeg1 */
217  quant_matrix = s->intra_matrix;
 /*
  * Operands: %0 = block+nCoeffs, %1 = quant_matrix+nCoeffs, %2 = qscale,
  *           %3 = -2*nCoeffs, copied into REG_a as the running byte index.
  * mm7 = 1 in every word, mm6 = qscale in every word.
  */
218 __asm__ volatile(
219  "pcmpeqw %%mm7, %%mm7 \n\t"
220  "psrlw $15, %%mm7 \n\t" // mm7 = 1 in each word
221  "movd %2, %%mm6 \n\t"
222  "packssdw %%mm6, %%mm6 \n\t"
223  "packssdw %%mm6, %%mm6 \n\t" // mm6 = qscale replicated in all 4 words
224  "mov %3, %%"REG_a" \n\t"
225  ".p2align 4 \n\t"
226  "1: \n\t"
227  "movq (%0, %%"REG_a"), %%mm0 \n\t"
228  "movq 8(%0, %%"REG_a"), %%mm1 \n\t"
229  "movq (%1, %%"REG_a"), %%mm4 \n\t"
230  "movq 8(%1, %%"REG_a"), %%mm5 \n\t"
231  "pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i]
232  "pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i]
233  "pxor %%mm2, %%mm2 \n\t"
234  "pxor %%mm3, %%mm3 \n\t"
235  "pcmpgtw %%mm0, %%mm2 \n\t" // block[i] < 0 ? -1 : 0
236  "pcmpgtw %%mm1, %%mm3 \n\t" // block[i] < 0 ? -1 : 0
237  "pxor %%mm2, %%mm0 \n\t"
238  "pxor %%mm3, %%mm1 \n\t"
239  "psubw %%mm2, %%mm0 \n\t" // abs(block[i])
240  "psubw %%mm3, %%mm1 \n\t" // abs(block[i])
241  "pmullw %%mm4, %%mm0 \n\t" // abs(block[i])*q
242  "pmullw %%mm5, %%mm1 \n\t" // abs(block[i])*q
243  "pxor %%mm4, %%mm4 \n\t"
244  "pxor %%mm5, %%mm5 \n\t" // FIXME slow
245  "pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0
246  "pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 : 0
247  "psraw $3, %%mm0 \n\t"
248  "psraw $3, %%mm1 \n\t"
249  "psubw %%mm7, %%mm0 \n\t" // subtract 1 ...
250  "psubw %%mm7, %%mm1 \n\t"
251  "por %%mm7, %%mm0 \n\t" // ... then set bit 0: (x - 1) | 1
252  "por %%mm7, %%mm1 \n\t"
253  "pxor %%mm2, %%mm0 \n\t" // re-apply the original sign
254  "pxor %%mm3, %%mm1 \n\t"
255  "psubw %%mm2, %%mm0 \n\t"
256  "psubw %%mm3, %%mm1 \n\t"
257  "pandn %%mm0, %%mm4 \n\t" // force 0 where the input was 0
258  "pandn %%mm1, %%mm5 \n\t"
259  "movq %%mm4, (%0, %%"REG_a") \n\t"
260  "movq %%mm5, 8(%0, %%"REG_a") \n\t"
261 
262  "add $16, %%"REG_a" \n\t" // next 8 coefficients
263  "js 1b \n\t" // repeat while the updated byte index is negative
264  ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "rm" (qscale), "g" (-2*nCoeffs)
265  : "%"REG_a, "memory"
266  );
 /* The loop also rewrote block[0]; replace it with the value computed above. */
267  block[0]= block0;
268 }
269 
271  DCTELEM *block, int n, int qscale)
272 {
/*
 * Dequantizes block[] in place with MMX using s->inter_matrix.
 * For every processed coefficient c, with q = low 16 bits of
 * qscale*quant_matrix[i]:
 *   out = sign(c) * (((((2*|c| + 1) * q) >> 4) - 1) | 1),  and 0 when c == 0.
 *
 * s      - codec context; reads block_last_index[],
 *          intra_scantable.raster_end[] and inter_matrix
 * block  - coefficients, modified in place
 * n      - block index, used to look up block_last_index[n]
 * qscale - quantizer scale
 *
 * NOTE(review): the line carrying this function's name and first parameter
 * is not present in this listing.
 */
273  x86_reg nCoeffs;
274  const uint16_t *quant_matrix;
275 
276  assert(s->block_last_index[n]>=0);
277 
 /* One past the last coefficient. NOTE(review): this reads intra_scantable
  * even though the matrix used below is inter_matrix - confirm intended. */
278  nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]+1;
279 
280  quant_matrix = s->inter_matrix;
 /*
  * Operands: %0 = block+nCoeffs, %1 = quant_matrix+nCoeffs, %2 = qscale,
  *           %3 = -2*nCoeffs, copied into REG_a as the running byte index.
  * mm7 = 1 in every word, mm6 = qscale in every word.
  */
281 __asm__ volatile(
282  "pcmpeqw %%mm7, %%mm7 \n\t"
283  "psrlw $15, %%mm7 \n\t" // mm7 = 1 in each word
284  "movd %2, %%mm6 \n\t"
285  "packssdw %%mm6, %%mm6 \n\t"
286  "packssdw %%mm6, %%mm6 \n\t" // mm6 = qscale replicated in all 4 words
287  "mov %3, %%"REG_a" \n\t"
288  ".p2align 4 \n\t"
289  "1: \n\t"
290  "movq (%0, %%"REG_a"), %%mm0 \n\t"
291  "movq 8(%0, %%"REG_a"), %%mm1 \n\t"
292  "movq (%1, %%"REG_a"), %%mm4 \n\t"
293  "movq 8(%1, %%"REG_a"), %%mm5 \n\t"
294  "pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i]
295  "pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i]
296  "pxor %%mm2, %%mm2 \n\t"
297  "pxor %%mm3, %%mm3 \n\t"
298  "pcmpgtw %%mm0, %%mm2 \n\t" // block[i] < 0 ? -1 : 0
299  "pcmpgtw %%mm1, %%mm3 \n\t" // block[i] < 0 ? -1 : 0
300  "pxor %%mm2, %%mm0 \n\t"
301  "pxor %%mm3, %%mm1 \n\t"
302  "psubw %%mm2, %%mm0 \n\t" // abs(block[i])
303  "psubw %%mm3, %%mm1 \n\t" // abs(block[i])
304  "paddw %%mm0, %%mm0 \n\t" // abs(block[i])*2
305  "paddw %%mm1, %%mm1 \n\t" // abs(block[i])*2
306  "paddw %%mm7, %%mm0 \n\t" // abs(block[i])*2 + 1
307  "paddw %%mm7, %%mm1 \n\t" // abs(block[i])*2 + 1
308  "pmullw %%mm4, %%mm0 \n\t" // (abs(block[i])*2 + 1)*q
309  "pmullw %%mm5, %%mm1 \n\t" // (abs(block[i])*2 + 1)*q
310  "pxor %%mm4, %%mm4 \n\t"
311  "pxor %%mm5, %%mm5 \n\t" // FIXME slow
312  "pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0
313  "pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 : 0
314  "psraw $4, %%mm0 \n\t"
315  "psraw $4, %%mm1 \n\t"
316  "psubw %%mm7, %%mm0 \n\t" // subtract 1 ...
317  "psubw %%mm7, %%mm1 \n\t"
318  "por %%mm7, %%mm0 \n\t" // ... then set bit 0: (x - 1) | 1
319  "por %%mm7, %%mm1 \n\t"
320  "pxor %%mm2, %%mm0 \n\t" // re-apply the original sign
321  "pxor %%mm3, %%mm1 \n\t"
322  "psubw %%mm2, %%mm0 \n\t"
323  "psubw %%mm3, %%mm1 \n\t"
324  "pandn %%mm0, %%mm4 \n\t" // force 0 where the input was 0
325  "pandn %%mm1, %%mm5 \n\t"
326  "movq %%mm4, (%0, %%"REG_a") \n\t"
327  "movq %%mm5, 8(%0, %%"REG_a") \n\t"
328 
329  "add $16, %%"REG_a" \n\t" // next 8 coefficients
330  "js 1b \n\t" // repeat while the updated byte index is negative
331  ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "rm" (qscale), "g" (-2*nCoeffs)
332  : "%"REG_a, "memory"
333  );
334 }
335 
337  DCTELEM *block, int n, int qscale)
338 {
/*
 * Dequantizes block[] in place with MMX using s->intra_matrix.
 * For every processed coefficient c, with q = low 16 bits of
 * qscale*quant_matrix[i]:
 *   out = sign(c) * ((|c| * q) >> 3),  and 0 when c == 0.
 * block[0] is replaced afterwards by block[0] * y_dc_scale (n < 4) or
 * block[0] * c_dc_scale, computed from the value it had before the loop.
 *
 * s      - codec context; reads alternate_scan, block_last_index[],
 *          intra_scantable.raster_end[], intra_matrix, y_dc_scale, c_dc_scale
 * block  - coefficients, modified in place
 * n      - block index
 * qscale - quantizer scale
 *
 * NOTE(review): the line carrying this function's name and first parameter
 * is not present in this listing.
 */
339  x86_reg nCoeffs;
340  const uint16_t *quant_matrix;
341  int block0;
342 
343  assert(s->block_last_index[n]>=0);
344 
 /* nCoeffs is the index of the last coefficient (inclusive); with
  * alternate_scan the whole block (index 63) is processed. */
345  if(s->alternate_scan) nCoeffs= 63; //FIXME
346  else nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ];
347 
348  if (n < 4)
349  block0 = block[0] * s->y_dc_scale;
350  else
351  block0 = block[0] * s->c_dc_scale;
352  quant_matrix = s->intra_matrix;
 /*
  * Operands: %0 = block+nCoeffs, %1 = quant_matrix+nCoeffs, %2 = qscale,
  *           %3 = -2*nCoeffs, copied into REG_a as the running byte index.
  * mm6 = qscale in every word. mm7 is set to 1 in every word but is not
  * read anywhere in this asm statement.
  */
353 __asm__ volatile(
354  "pcmpeqw %%mm7, %%mm7 \n\t"
355  "psrlw $15, %%mm7 \n\t"
356  "movd %2, %%mm6 \n\t"
357  "packssdw %%mm6, %%mm6 \n\t"
358  "packssdw %%mm6, %%mm6 \n\t" // mm6 = qscale replicated in all 4 words
359  "mov %3, %%"REG_a" \n\t"
360  ".p2align 4 \n\t"
361  "1: \n\t"
362  "movq (%0, %%"REG_a"), %%mm0 \n\t"
363  "movq 8(%0, %%"REG_a"), %%mm1 \n\t"
364  "movq (%1, %%"REG_a"), %%mm4 \n\t"
365  "movq 8(%1, %%"REG_a"), %%mm5 \n\t"
366  "pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i]
367  "pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i]
368  "pxor %%mm2, %%mm2 \n\t"
369  "pxor %%mm3, %%mm3 \n\t"
370  "pcmpgtw %%mm0, %%mm2 \n\t" // block[i] < 0 ? -1 : 0
371  "pcmpgtw %%mm1, %%mm3 \n\t" // block[i] < 0 ? -1 : 0
372  "pxor %%mm2, %%mm0 \n\t"
373  "pxor %%mm3, %%mm1 \n\t"
374  "psubw %%mm2, %%mm0 \n\t" // abs(block[i])
375  "psubw %%mm3, %%mm1 \n\t" // abs(block[i])
376  "pmullw %%mm4, %%mm0 \n\t" // abs(block[i])*q
377  "pmullw %%mm5, %%mm1 \n\t" // abs(block[i])*q
378  "pxor %%mm4, %%mm4 \n\t"
379  "pxor %%mm5, %%mm5 \n\t" // FIXME slow
380  "pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0
381  "pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 : 0
382  "psraw $3, %%mm0 \n\t"
383  "psraw $3, %%mm1 \n\t"
384  "pxor %%mm2, %%mm0 \n\t" // re-apply the original sign
385  "pxor %%mm3, %%mm1 \n\t"
386  "psubw %%mm2, %%mm0 \n\t"
387  "psubw %%mm3, %%mm1 \n\t"
388  "pandn %%mm0, %%mm4 \n\t" // force 0 where the input was 0
389  "pandn %%mm1, %%mm5 \n\t"
390  "movq %%mm4, (%0, %%"REG_a") \n\t"
391  "movq %%mm5, 8(%0, %%"REG_a") \n\t"
392 
393  "add $16, %%"REG_a" \n\t" // next 8 coefficients
394  "jng 1b \n\t" // repeat while the updated byte index is <= 0
395  ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "rm" (qscale), "g" (-2*nCoeffs)
396  : "%"REG_a, "memory"
397  );
 /* The loop also rewrote block[0]; replace it with the value computed above. */
398  block[0]= block0;
399  //Note, we do not do mismatch control for intra as errors cannot accumulate
400 }
401 
403  DCTELEM *block, int n, int qscale)
404 {
/*
 * Dequantizes block[] in place with MMX using s->inter_matrix, then
 * conditionally toggles the lowest bit of block[63].
 * For every processed coefficient c, with q = low 16 bits of
 * qscale*quant_matrix[i]:
 *   out = sign(c) * (((2*|c|*q + q) & 0xFFFF) >> 4),  and 0 when c == 0.
 * While storing, every output word is XORed into mm7 (which starts with
 * 0xFFFF in its lowest word and 0 elsewhere). After the loop the four words
 * of mm7 are folded together and the resulting low bit is XORed into bit 0
 * of block[63] (presumably MPEG-2 mismatch control - confirm).
 *
 * s      - codec context; reads alternate_scan, block_last_index[],
 *          intra_scantable.raster_end[] and inter_matrix
 * block  - coefficients, modified in place
 * n      - block index, used to look up block_last_index[n]
 * qscale - quantizer scale
 *
 * NOTE(review): the line carrying this function's name and first parameter
 * is not present in this listing.
 */
405  x86_reg nCoeffs;
406  const uint16_t *quant_matrix;
407 
408  assert(s->block_last_index[n]>=0);
409 
 /* nCoeffs is the index of the last coefficient (inclusive). */
410  if(s->alternate_scan) nCoeffs= 63; //FIXME
411  else nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ];
412 
413  quant_matrix = s->inter_matrix;
 /*
  * Operands: %0 = block+nCoeffs, %1 = quant_matrix+nCoeffs, %2 = qscale,
  *           %3 = -2*nCoeffs (kept in a register here because it is reused
  *           after the loop; REG_a holds the running copy).
  */
414 __asm__ volatile(
415  "pcmpeqw %%mm7, %%mm7 \n\t"
416  "psrlq $48, %%mm7 \n\t" // mm7 = 0x000000000000FFFF (XOR accumulator)
417  "movd %2, %%mm6 \n\t"
418  "packssdw %%mm6, %%mm6 \n\t"
419  "packssdw %%mm6, %%mm6 \n\t" // mm6 = qscale replicated in all 4 words
420  "mov %3, %%"REG_a" \n\t"
421  ".p2align 4 \n\t"
422  "1: \n\t"
423  "movq (%0, %%"REG_a"), %%mm0 \n\t"
424  "movq 8(%0, %%"REG_a"), %%mm1 \n\t"
425  "movq (%1, %%"REG_a"), %%mm4 \n\t"
426  "movq 8(%1, %%"REG_a"), %%mm5 \n\t"
427  "pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i]
428  "pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i]
429  "pxor %%mm2, %%mm2 \n\t"
430  "pxor %%mm3, %%mm3 \n\t"
431  "pcmpgtw %%mm0, %%mm2 \n\t" // block[i] < 0 ? -1 : 0
432  "pcmpgtw %%mm1, %%mm3 \n\t" // block[i] < 0 ? -1 : 0
433  "pxor %%mm2, %%mm0 \n\t"
434  "pxor %%mm3, %%mm1 \n\t"
435  "psubw %%mm2, %%mm0 \n\t" // abs(block[i])
436  "psubw %%mm3, %%mm1 \n\t" // abs(block[i])
437  "paddw %%mm0, %%mm0 \n\t" // abs(block[i])*2
438  "paddw %%mm1, %%mm1 \n\t" // abs(block[i])*2
439  "pmullw %%mm4, %%mm0 \n\t" // abs(block[i])*2*q
440  "pmullw %%mm5, %%mm1 \n\t" // abs(block[i])*2*q
441  "paddw %%mm4, %%mm0 \n\t" // (abs(block[i])*2 + 1)*q
442  "paddw %%mm5, %%mm1 \n\t" // (abs(block[i])*2 + 1)*q
443  "pxor %%mm4, %%mm4 \n\t"
444  "pxor %%mm5, %%mm5 \n\t" // FIXME slow
445  "pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0
446  "pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 : 0
447  "psrlw $4, %%mm0 \n\t" // logical (unsigned) shift
448  "psrlw $4, %%mm1 \n\t"
449  "pxor %%mm2, %%mm0 \n\t" // re-apply the original sign
450  "pxor %%mm3, %%mm1 \n\t"
451  "psubw %%mm2, %%mm0 \n\t"
452  "psubw %%mm3, %%mm1 \n\t"
453  "pandn %%mm0, %%mm4 \n\t" // force 0 where the input was 0
454  "pandn %%mm1, %%mm5 \n\t"
455  "pxor %%mm4, %%mm7 \n\t" // accumulate XOR of all outputs
456  "pxor %%mm5, %%mm7 \n\t"
457  "movq %%mm4, (%0, %%"REG_a") \n\t"
458  "movq %%mm5, 8(%0, %%"REG_a") \n\t"
459 
460  "add $16, %%"REG_a" \n\t" // next 8 coefficients
461  "jng 1b \n\t" // repeat while the updated byte index is <= 0
 // %0 + %3 is the start of block, so 124(%0, %3) addresses block[62..63].
462  "movd 124(%0, %3), %%mm0 \n\t"
463  "movq %%mm7, %%mm6 \n\t"
464  "psrlq $32, %%mm7 \n\t"
465  "pxor %%mm6, %%mm7 \n\t" // fold high dword into low dword
466  "movq %%mm7, %%mm6 \n\t"
467  "psrlq $16, %%mm7 \n\t"
468  "pxor %%mm6, %%mm7 \n\t" // low word = XOR of all four words
469  "pslld $31, %%mm7 \n\t" // keep only bit 0 of the low dword ...
470  "psrlq $15, %%mm7 \n\t" // ... and move it to bit 16 (bit 0 of block[63])
471  "pxor %%mm7, %%mm0 \n\t"
472  "movd %%mm0, 124(%0, %3) \n\t"
473 
474  ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "rm" (qscale), "r" (-2*nCoeffs)
475  : "%"REG_a, "memory"
476  );
477 }
478 
/*
 * MMX routine over the 64 coefficients of block[] (8 per iteration):
 *  - adds |block[i]| (zero-extended to 32 bits) to sum[i], where
 *    sum = s->dct_error_sum[intra];
 *  - replaces block[i] by sign(block[i]) * max(|block[i]| - offset[i], 0)
 *    (unsigned saturating subtract), where offset = s->dct_offset[intra].
 * s->dct_count[intra] is incremented once per call; intra = s->mb_intra.
 *
 * NOTE(review): the signature line (and opening brace) of this function is
 * not present in this listing.
 * NOTE(review): the asm writes through %0 and %1 but declares neither a
 * "memory" clobber nor memory outputs - confirm this is safe for callers.
 */
480  const int intra= s->mb_intra;
481  int *sum= s->dct_error_sum[intra];
482  uint16_t *offset= s->dct_offset[intra];
483 
484  s->dct_count[intra]++;
485 
 /* Operands: %0 = block (advances), %1 = sum (advances),
  *           %2 = offset (advances), %3 = block+64 (end pointer). */
486  __asm__ volatile(
487  "pxor %%mm7, %%mm7 \n\t" // mm7 = 0, used for zero-extension
488  "1: \n\t"
489  "pxor %%mm0, %%mm0 \n\t"
490  "pxor %%mm1, %%mm1 \n\t"
491  "movq (%0), %%mm2 \n\t"
492  "movq 8(%0), %%mm3 \n\t"
493  "pcmpgtw %%mm2, %%mm0 \n\t" // sign mask: block[i] < 0 ? -1 : 0
494  "pcmpgtw %%mm3, %%mm1 \n\t"
495  "pxor %%mm0, %%mm2 \n\t"
496  "pxor %%mm1, %%mm3 \n\t"
497  "psubw %%mm0, %%mm2 \n\t" // abs(block[i])
498  "psubw %%mm1, %%mm3 \n\t"
499  "movq %%mm2, %%mm4 \n\t" // keep a copy of abs() for the sums
500  "movq %%mm3, %%mm5 \n\t"
501  "psubusw (%2), %%mm2 \n\t" // max(abs - offset[i], 0)
502  "psubusw 8(%2), %%mm3 \n\t"
503  "pxor %%mm0, %%mm2 \n\t" // re-apply the original sign
504  "pxor %%mm1, %%mm3 \n\t"
505  "psubw %%mm0, %%mm2 \n\t"
506  "psubw %%mm1, %%mm3 \n\t"
507  "movq %%mm2, (%0) \n\t" // write the reduced coefficients back
508  "movq %%mm3, 8(%0) \n\t"
509  "movq %%mm4, %%mm2 \n\t"
510  "movq %%mm5, %%mm3 \n\t"
511  "punpcklwd %%mm7, %%mm4 \n\t" // zero-extend abs() words to dwords
512  "punpckhwd %%mm7, %%mm2 \n\t"
513  "punpcklwd %%mm7, %%mm5 \n\t"
514  "punpckhwd %%mm7, %%mm3 \n\t"
515  "paddd (%1), %%mm4 \n\t" // sum[i] += abs(block[i])
516  "paddd 8(%1), %%mm2 \n\t"
517  "paddd 16(%1), %%mm5 \n\t"
518  "paddd 24(%1), %%mm3 \n\t"
519  "movq %%mm4, (%1) \n\t"
520  "movq %%mm2, 8(%1) \n\t"
521  "movq %%mm5, 16(%1) \n\t"
522  "movq %%mm3, 24(%1) \n\t"
523  "add $16, %0 \n\t" // 8 coefficients (16-bit each)
524  "add $32, %1 \n\t" // 8 sums (32-bit each)
525  "add $16, %2 \n\t" // 8 offsets (16-bit each)
526  "cmp %3, %0 \n\t"
527  " jb 1b \n\t" // loop until block reaches block+64
528  : "+r" (block), "+r" (sum), "+r" (offset)
529  : "r"(block+64)
530  );
531 }
532 
/*
 * SSE2 routine over the 64 coefficients of block[] (16 per iteration):
 *  - adds |block[i]| (zero-extended to 32 bits) to sum[i], where
 *    sum = s->dct_error_sum[intra];
 *  - replaces block[i] by sign(block[i]) * max(|block[i]| - offset[i], 0)
 *    (unsigned saturating subtract), where offset = s->dct_offset[intra].
 * s->dct_count[intra] is incremented once per call; intra = s->mb_intra.
 *
 * All vector memory accesses use movdqa or SSE memory operands, so block,
 * sum and offset are assumed to be 16-byte aligned - TODO confirm at the
 * definitions of these buffers.
 *
 * NOTE(review): the signature line (and opening brace) of this function is
 * not present in this listing.
 * NOTE(review): the asm writes through %0 and %1 but no "memory" clobber is
 * visible here (XMM_CLOBBERS_ONLY is defined elsewhere) - confirm.
 */
534  const int intra= s->mb_intra;
535  int *sum= s->dct_error_sum[intra];
536  uint16_t *offset= s->dct_offset[intra];
537 
538  s->dct_count[intra]++;
539 
 /* Operands: %0 = block (advances), %1 = sum (advances),
  *           %2 = offset (advances), %3 = block+64 (end pointer). */
540  __asm__ volatile(
541  "pxor %%xmm7, %%xmm7 \n\t" // xmm7 = 0, used for zero-extension
542  "1: \n\t"
543  "pxor %%xmm0, %%xmm0 \n\t"
544  "pxor %%xmm1, %%xmm1 \n\t"
545  "movdqa (%0), %%xmm2 \n\t"
546  "movdqa 16(%0), %%xmm3 \n\t"
547  "pcmpgtw %%xmm2, %%xmm0 \n\t" // sign mask: block[i] < 0 ? -1 : 0
548  "pcmpgtw %%xmm3, %%xmm1 \n\t"
549  "pxor %%xmm0, %%xmm2 \n\t"
550  "pxor %%xmm1, %%xmm3 \n\t"
551  "psubw %%xmm0, %%xmm2 \n\t" // abs(block[i])
552  "psubw %%xmm1, %%xmm3 \n\t"
553  "movdqa %%xmm2, %%xmm4 \n\t" // keep a copy of abs() for the sums
554  "movdqa %%xmm3, %%xmm5 \n\t"
555  "psubusw (%2), %%xmm2 \n\t" // max(abs - offset[i], 0)
556  "psubusw 16(%2), %%xmm3 \n\t"
557  "pxor %%xmm0, %%xmm2 \n\t" // re-apply the original sign
558  "pxor %%xmm1, %%xmm3 \n\t"
559  "psubw %%xmm0, %%xmm2 \n\t"
560  "psubw %%xmm1, %%xmm3 \n\t"
561  "movdqa %%xmm2, (%0) \n\t" // write the reduced coefficients back
562  "movdqa %%xmm3, 16(%0) \n\t"
563  "movdqa %%xmm4, %%xmm6 \n\t"
564  "movdqa %%xmm5, %%xmm0 \n\t"
565  "punpcklwd %%xmm7, %%xmm4 \n\t" // zero-extend abs() words to dwords
566  "punpckhwd %%xmm7, %%xmm6 \n\t"
567  "punpcklwd %%xmm7, %%xmm5 \n\t"
568  "punpckhwd %%xmm7, %%xmm0 \n\t"
569  "paddd (%1), %%xmm4 \n\t" // sum[i] += abs(block[i])
570  "paddd 16(%1), %%xmm6 \n\t"
571  "paddd 32(%1), %%xmm5 \n\t"
572  "paddd 48(%1), %%xmm0 \n\t"
573  "movdqa %%xmm4, (%1) \n\t"
574  "movdqa %%xmm6, 16(%1) \n\t"
575  "movdqa %%xmm5, 32(%1) \n\t"
576  "movdqa %%xmm0, 48(%1) \n\t"
577  "add $32, %0 \n\t" // 16 coefficients (16-bit each)
578  "add $64, %1 \n\t" // 16 sums (32-bit each)
579  "add $32, %2 \n\t" // 16 offsets (16-bit each)
580  "cmp %3, %0 \n\t"
581  " jb 1b \n\t" // loop until block reaches block+64
582  : "+r" (block), "+r" (sum), "+r" (offset)
583  : "r"(block+64)
 /* XMM_CLOBBERS_ONLY is defined outside this listing; presumably it expands
  * to a clobber list naming these registers where the compiler needs it. */
584  XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2", "%xmm3",
585  "%xmm4", "%xmm5", "%xmm6", "%xmm7")
586  );
587 }
588 
/*
 * Instantiate mpegvideo_mmx_template.c once per instruction-set level.
 * Before each #include the HAVE_* feature macros are overridden and
 * RENAME()/RENAMEl() are redefined so that the template's symbols get a
 * per-variant suffix (_MMX/_mmx, _MMX2/_mmx2, _SSE2/_sse2, _SSSE3/_sse2).
 * The original HAVE_SSSE3 setting is remembered in HAVE_SSSE3_BAK so that the
 * SSSE3 variant is only built when it was enabled to begin with.
 */
589 #if HAVE_SSSE3
590 #define HAVE_SSSE3_BAK
591 #endif
592 #undef HAVE_SSSE3
593 #define HAVE_SSSE3 0
594 
/* Variant 1: plain MMX (SSSE3, SSE2 and MMX2 all forced off). */
595 #undef HAVE_SSE2
596 #undef HAVE_MMX2
597 #define HAVE_SSE2 0
598 #define HAVE_MMX2 0
599 #define RENAME(a) a ## _MMX
600 #define RENAMEl(a) a ## _mmx
601 #include "mpegvideo_mmx_template.c"
602 
/* Variant 2: MMX2 enabled. */
603 #undef HAVE_MMX2
604 #define HAVE_MMX2 1
605 #undef RENAME
606 #undef RENAMEl
607 #define RENAME(a) a ## _MMX2
608 #define RENAMEl(a) a ## _mmx2
609 #include "mpegvideo_mmx_template.c"
610 
/* Variant 3: SSE2 enabled (HAVE_MMX2 is still 1 from the previous variant). */
611 #undef HAVE_SSE2
612 #define HAVE_SSE2 1
613 #undef RENAME
614 #undef RENAMEl
615 #define RENAME(a) a ## _SSE2
616 #define RENAMEl(a) a ## _sse2
617 #include "mpegvideo_mmx_template.c"
618 
/* Variant 4: SSSE3, only when it was enabled originally. Note that RENAMEl
 * keeps the _sse2 suffix here, same as the SSE2 variant. */
619 #ifdef HAVE_SSSE3_BAK
620 #undef HAVE_SSSE3
621 #define HAVE_SSSE3 1
622 #undef RENAME
623 #undef RENAMEl
624 #define RENAME(a) a ## _SSSE3
625 #define RENAMEl(a) a ## _sse2
626 #include "mpegvideo_mmx_template.c"
627 #endif
628 
630 {
/*
 * Runtime dispatch: queries the CPU feature flags and, when MMX is
 * available, installs the x86-optimized routines into the context s.
 * The only assignments visible in this listing select s->dct_quantize.
 *
 * NOTE(review): the signature line and several statements of this function
 * (the bodies of the BITEXACT check and of the SSE2/else branch below) are
 * not present in this listing; as shown the code is incomplete.
 */
631  int mm_flags = av_get_cpu_flags();
632 
633  if (mm_flags & AV_CPU_FLAG_MMX) {
634  const int dct_algo = s->avctx->dct_algo;
635 
 /* NOTE(review): the statement guarded by this condition is missing here. */
640  if(!(s->flags & CODEC_FLAG_BITEXACT))
643 
 /* NOTE(review): both branches are empty in this listing. */
644  if (mm_flags & AV_CPU_FLAG_SSE2) {
646  } else {
648  }
649 
 /* Pick the most capable dct_quantize variant, in the order
  * SSSE3 (only if compiled in) > SSE2 > MMX2 > MMX, but only when the
  * requested DCT algorithm is FF_DCT_AUTO or FF_DCT_MMX. */
650  if(dct_algo==FF_DCT_AUTO || dct_algo==FF_DCT_MMX){
651 #if HAVE_SSSE3
652  if(mm_flags & AV_CPU_FLAG_SSSE3){
653  s->dct_quantize= dct_quantize_SSSE3;
654  } else
655 #endif
656  if(mm_flags & AV_CPU_FLAG_SSE2){
657  s->dct_quantize= dct_quantize_SSE2;
658  } else if(mm_flags & AV_CPU_FLAG_MMX2){
659  s->dct_quantize= dct_quantize_MMX2;
660  } else {
661  s->dct_quantize= dct_quantize_MMX;
662  }
663  }
664  }
665 }