vc1dsp_mmx.c
/*
 * VC-1 and WMV3 - DSP functions MMX-optimized
 * Copyright (c) 2007 Christophe GISQUET <christophe.gisquet@free.fr>
 *
 * Permission is hereby granted, free of charge, to any person
 * obtaining a copy of this software and associated documentation
 * files (the "Software"), to deal in the Software without
 * restriction, including without limitation the rights to use,
 * copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following
 * conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
 * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */

#include "libavutil/cpu.h"
#include "libavutil/x86_cpu.h"
#include "libavcodec/dsputil.h"
#include "dsputil_mmx.h"
#include "libavcodec/vc1dsp.h"

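/* Store wrappers used by the macros below: OP_PUT leaves the result to be
 * stored as-is, while OP_AVG first averages it with the destination via
 * pavgb (which is why the avg_ variants require MMX2). */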
#define OP_PUT(S,D)
#define OP_AVG(S,D) "pavgb " #S ", " #D " \n\t"

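/** Add the bias/rounder kept in %%mm7 to %%mm3 and %%mm4, then arithmetic
 *  shift both right by SHIFT. */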
#define NORMALIZE_MMX(SHIFT) \
    "paddw %%mm7, %%mm3 \n\t" /* +bias-r */ \
    "paddw %%mm7, %%mm4 \n\t" /* +bias-r */ \
    "psraw "SHIFT", %%mm3 \n\t" \
    "psraw "SHIFT", %%mm4 \n\t"

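/** Store helpers: TRANSFER_DO_PACK packs %%mm3/%%mm4 into 8 unsigned bytes
 *  and writes them to (%2); TRANSFER_DONT_PACK keeps the 16-bit intermediates
 *  and writes 16 bytes. OP is OP_PUT or OP_AVG. */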
#define TRANSFER_DO_PACK(OP) \
    "packuswb %%mm4, %%mm3 \n\t" \
    OP((%2), %%mm3) \
    "movq %%mm3, (%2) \n\t"

#define TRANSFER_DONT_PACK(OP) \
    OP(0(%2), %%mm3) \
    OP(8(%2), %%mm4) \
    "movq %%mm3, 0(%2) \n\t" \
    "movq %%mm4, 8(%2) \n\t"

#define DO_UNPACK(reg) "punpcklbw %%mm0, " reg "\n\t"
#define DONT_UNPACK(reg)

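/** Load the 16-bit rounder ROUND and broadcast it to all four words of
 *  %%mm7. */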
#define LOAD_ROUNDER_MMX(ROUND) \
    "movd "ROUND", %%mm7 \n\t" \
    "punpcklwd %%mm7, %%mm7 \n\t" \
    "punpckldq %%mm7, %%mm7 \n\t"

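/** One row of the vertical 1/2-shift filter: with the two centre rows
 *  word-unpacked in %%mm"R1"/%%mm"R2" and %%mm0 zeroed, compute
 *  9*(b+c) - a - d (taps -1,9,9,-1), add the rounder in %%mm7, shift right
 *  by %4 and store the 16-bit result at OFF(%1); %0 advances by one line. */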
#define SHIFT2_LINE(OFF, R0,R1,R2,R3) \
    "paddw %%mm"#R2", %%mm"#R1" \n\t" \
    "movd (%0,%3), %%mm"#R0" \n\t" \
    "pmullw %%mm6, %%mm"#R1" \n\t" \
    "punpcklbw %%mm0, %%mm"#R0" \n\t" \
    "movd (%0,%2), %%mm"#R3" \n\t" \
    "psubw %%mm"#R0", %%mm"#R1" \n\t" \
    "punpcklbw %%mm0, %%mm"#R3" \n\t" \
    "paddw %%mm7, %%mm"#R1" \n\t" \
    "psubw %%mm"#R3", %%mm"#R1" \n\t" \
    "psraw %4, %%mm"#R1" \n\t" \
    "movq %%mm"#R1", "#OFF"(%1) \n\t" \
    "add %2, %0 \n\t"

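/**
 * Purely vertical 1/2-shift interpolation on 8-bit input.
 * Applies the bicubic filter (-1,9,9,-1) down each column and keeps the
 * results as 16-bit values: 8 rows of 12 values are written to dst with a
 * 24-byte row pitch, processed 4 columns at a time. %%mm6 holds the factor 9,
 * %%mm7 the rounder, and %%mm0 must be zero on entry.
 */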
static void vc1_put_ver_16b_shift2_mmx(int16_t *dst,
                                       const uint8_t *src, x86_reg stride,
                                       int rnd, int64_t shift)
{
    __asm__ volatile(
        "mov $3, %%"REG_c" \n\t"
        LOAD_ROUNDER_MMX("%5")
        "movq "MANGLE(ff_pw_9)", %%mm6 \n\t"
        "1: \n\t"
        "movd (%0), %%mm2 \n\t"
        "add %2, %0 \n\t"
        "movd (%0), %%mm3 \n\t"
        "punpcklbw %%mm0, %%mm2 \n\t"
        "punpcklbw %%mm0, %%mm3 \n\t"
        SHIFT2_LINE(  0, 1, 2, 3, 4)
        SHIFT2_LINE( 24, 2, 3, 4, 1)
        SHIFT2_LINE( 48, 3, 4, 1, 2)
        SHIFT2_LINE( 72, 4, 1, 2, 3)
        SHIFT2_LINE( 96, 1, 2, 3, 4)
        SHIFT2_LINE(120, 2, 3, 4, 1)
        SHIFT2_LINE(144, 3, 4, 1, 2)
        SHIFT2_LINE(168, 4, 1, 2, 3)
        "sub %6, %0 \n\t"
        "add $8, %1 \n\t"
        "dec %%"REG_c" \n\t"
        "jnz 1b \n\t"
        : "+r"(src), "+r"(dst)
        : "r"(stride), "r"(-2*stride),
          "m"(shift), "m"(rnd), "r"(9*stride-4)
        : "%"REG_c, "memory"
    );
}

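/**
 * Purely horizontal 1/2-shift interpolation working on the 16-bit
 * intermediates of the vertical pass. The data is already unpacked, so the
 * (-1,9,9,-1) taps are applied directly from memory. A -1024*16 bias folded
 * into the rounder keeps the word sums inside signed 16-bit range; ff_pw_128
 * adds the equivalent value back after the shift by 7, before packing.
 */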
#define VC1_HOR_16b_SHIFT2(OP, OPNAME)\
static void OPNAME ## vc1_hor_16b_shift2_mmx(uint8_t *dst, x86_reg stride,\
                                             const int16_t *src, int rnd)\
{\
    int h = 8;\
\
    src -= 1;\
    rnd -= (-1+9+9-1)*1024; /* Add -1024 bias */\
    __asm__ volatile(\
        LOAD_ROUNDER_MMX("%4")\
        "movq "MANGLE(ff_pw_128)", %%mm6\n\t"\
        "movq "MANGLE(ff_pw_9)", %%mm5 \n\t"\
        "1: \n\t"\
        "movq 2*0+0(%1), %%mm1 \n\t"\
        "movq 2*0+8(%1), %%mm2 \n\t"\
        "movq 2*1+0(%1), %%mm3 \n\t"\
        "movq 2*1+8(%1), %%mm4 \n\t"\
        "paddw 2*3+0(%1), %%mm1 \n\t"\
        "paddw 2*3+8(%1), %%mm2 \n\t"\
        "paddw 2*2+0(%1), %%mm3 \n\t"\
        "paddw 2*2+8(%1), %%mm4 \n\t"\
        "pmullw %%mm5, %%mm3 \n\t"\
        "pmullw %%mm5, %%mm4 \n\t"\
        "psubw %%mm1, %%mm3 \n\t"\
        "psubw %%mm2, %%mm4 \n\t"\
        NORMALIZE_MMX("$7")\
        /* Remove bias */\
        "paddw %%mm6, %%mm3 \n\t"\
        "paddw %%mm6, %%mm4 \n\t"\
        TRANSFER_DO_PACK(OP)\
        "add $24, %1 \n\t"\
        "add %3, %2 \n\t"\
        "decl %0 \n\t"\
        "jnz 1b \n\t"\
        : "+r"(h), "+r" (src), "+r" (dst)\
        : "r"(stride), "m"(rnd)\
        : "memory"\
    );\
}

VC1_HOR_16b_SHIFT2(OP_PUT, put_)
VC1_HOR_16b_SHIFT2(OP_AVG, avg_)

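/**
 * Purely horizontal or vertical 1/2-shift interpolation working directly on
 * 8-bit pixels: taps (-1,9,9,-1), rounded and shifted right by 4. The offset
 * argument selects the filter direction (1 for horizontal, stride for
 * vertical).
 */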
#define VC1_SHIFT2(OP, OPNAME)\
static void OPNAME ## vc1_shift2_mmx(uint8_t *dst, const uint8_t *src,\
                                     x86_reg stride, int rnd, x86_reg offset)\
{\
    rnd = 8-rnd;\
    __asm__ volatile(\
        "mov $8, %%"REG_c" \n\t"\
        LOAD_ROUNDER_MMX("%5")\
        "movq "MANGLE(ff_pw_9)", %%mm6\n\t"\
        "1: \n\t"\
        "movd 0(%0 ), %%mm3 \n\t"\
        "movd 4(%0 ), %%mm4 \n\t"\
        "movd 0(%0,%2), %%mm1 \n\t"\
        "movd 4(%0,%2), %%mm2 \n\t"\
        "add %2, %0 \n\t"\
        "punpcklbw %%mm0, %%mm3 \n\t"\
        "punpcklbw %%mm0, %%mm4 \n\t"\
        "punpcklbw %%mm0, %%mm1 \n\t"\
        "punpcklbw %%mm0, %%mm2 \n\t"\
        "paddw %%mm1, %%mm3 \n\t"\
        "paddw %%mm2, %%mm4 \n\t"\
        "movd 0(%0,%3), %%mm1 \n\t"\
        "movd 4(%0,%3), %%mm2 \n\t"\
        "pmullw %%mm6, %%mm3 \n\t" /* 0,9,9,0*/\
        "pmullw %%mm6, %%mm4 \n\t" /* 0,9,9,0*/\
        "punpcklbw %%mm0, %%mm1 \n\t"\
        "punpcklbw %%mm0, %%mm2 \n\t"\
        "psubw %%mm1, %%mm3 \n\t" /*-1,9,9,0*/\
        "psubw %%mm2, %%mm4 \n\t" /*-1,9,9,0*/\
        "movd 0(%0,%2), %%mm1 \n\t"\
        "movd 4(%0,%2), %%mm2 \n\t"\
        "punpcklbw %%mm0, %%mm1 \n\t"\
        "punpcklbw %%mm0, %%mm2 \n\t"\
        "psubw %%mm1, %%mm3 \n\t" /*-1,9,9,-1*/\
        "psubw %%mm2, %%mm4 \n\t" /*-1,9,9,-1*/\
        NORMALIZE_MMX("$4")\
        "packuswb %%mm4, %%mm3 \n\t"\
        OP((%1), %%mm3)\
        "movq %%mm3, (%1) \n\t"\
        "add %6, %0 \n\t"\
        "add %4, %1 \n\t"\
        "dec %%"REG_c" \n\t"\
        "jnz 1b \n\t"\
        : "+r"(src), "+r"(dst)\
        : "r"(offset), "r"(-2*offset), "g"(stride), "m"(rnd),\
          "g"(stride-offset)\
        : "%"REG_c, "memory"\
    );\
}

VC1_SHIFT2(OP_PUT, put_)
VC1_SHIFT2(OP_AVG, avg_)

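/**
 * Core of the 1/4- and 3/4-shift bicubic interpolation: accumulates
 * 53*A3 + 18*A2 - 3*A1 - 4*A4 (the VC-1 taps -4,53,18,-3, operand order set
 * by the caller) into %%mm3/%%mm4, with %%mm5 = 53 and %%mm6 = 18 preloaded.
 * UNPACK and the MOVQ argument (load instruction plus element size used in
 * the address arithmetic) select between 8-bit and 16-bit input.
 */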
#define MSPEL_FILTER13_CORE(UNPACK, MOVQ, A1, A2, A3, A4) \
    MOVQ "*0+"A1", %%mm1 \n\t" \
    MOVQ "*4+"A1", %%mm2 \n\t" \
    UNPACK("%%mm1") \
    UNPACK("%%mm2") \
    "pmullw "MANGLE(ff_pw_3)", %%mm1\n\t" \
    "pmullw "MANGLE(ff_pw_3)", %%mm2\n\t" \
    MOVQ "*0+"A2", %%mm3 \n\t" \
    MOVQ "*4+"A2", %%mm4 \n\t" \
    UNPACK("%%mm3") \
    UNPACK("%%mm4") \
    "pmullw %%mm6, %%mm3 \n\t" /* *18 */ \
    "pmullw %%mm6, %%mm4 \n\t" /* *18 */ \
    "psubw %%mm1, %%mm3 \n\t" /* 18,-3 */ \
    "psubw %%mm2, %%mm4 \n\t" /* 18,-3 */ \
    MOVQ "*0+"A4", %%mm1 \n\t" \
    MOVQ "*4+"A4", %%mm2 \n\t" \
    UNPACK("%%mm1") \
    UNPACK("%%mm2") \
    "psllw $2, %%mm1 \n\t" /* 4* */ \
    "psllw $2, %%mm2 \n\t" /* 4* */ \
    "psubw %%mm1, %%mm3 \n\t" /* -4,18,-3 */ \
    "psubw %%mm2, %%mm4 \n\t" /* -4,18,-3 */ \
    MOVQ "*0+"A3", %%mm1 \n\t" \
    MOVQ "*4+"A3", %%mm2 \n\t" \
    UNPACK("%%mm1") \
    UNPACK("%%mm2") \
    "pmullw %%mm5, %%mm1 \n\t" /* *53 */ \
    "pmullw %%mm5, %%mm2 \n\t" /* *53 */ \
    "paddw %%mm1, %%mm3 \n\t" /* 4,53,18,-3 */ \
    "paddw %%mm2, %%mm4 \n\t" /* 4,53,18,-3 */

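/**
 * Macro to build the vertical 16-bit version of vc1_put_shift[13].
 * Applies the bicubic 1/4- or 3/4-shift filter down the columns of an 8-bit
 * source and stores 8 rows of 16-bit intermediates into dst with a 24-byte
 * row pitch (11 useful values per row). A1..A4 are the addresses of the four
 * filter taps.
 */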
#define MSPEL_FILTER13_VER_16B(NAME, A1, A2, A3, A4) \
static void \
vc1_put_ver_16b_ ## NAME ## _mmx(int16_t *dst, const uint8_t *src, \
                                 x86_reg src_stride, \
                                 int rnd, int64_t shift) \
{ \
    int h = 8; \
    src -= src_stride; \
    __asm__ volatile( \
        LOAD_ROUNDER_MMX("%5") \
        "movq "MANGLE(ff_pw_53)", %%mm5\n\t" \
        "movq "MANGLE(ff_pw_18)", %%mm6\n\t" \
        ".p2align 3 \n\t" \
        "1: \n\t" \
        MSPEL_FILTER13_CORE(DO_UNPACK, "movd 1", A1, A2, A3, A4) \
        NORMALIZE_MMX("%6") \
        TRANSFER_DONT_PACK(OP_PUT) \
        /* Last 3 (in fact 4) bytes on the line */ \
        "movd 8+"A1", %%mm1 \n\t" \
        DO_UNPACK("%%mm1") \
        "movq %%mm1, %%mm3 \n\t" \
        "paddw %%mm1, %%mm1 \n\t" \
        "paddw %%mm3, %%mm1 \n\t" /* 3* */ \
        "movd 8+"A2", %%mm3 \n\t" \
        DO_UNPACK("%%mm3") \
        "pmullw %%mm6, %%mm3 \n\t" /* *18 */ \
        "psubw %%mm1, %%mm3 \n\t" /*18,-3 */ \
        "movd 8+"A3", %%mm1 \n\t" \
        DO_UNPACK("%%mm1") \
        "pmullw %%mm5, %%mm1 \n\t" /* *53 */ \
        "paddw %%mm1, %%mm3 \n\t" /*53,18,-3 */ \
        "movd 8+"A4", %%mm1 \n\t" \
        DO_UNPACK("%%mm1") \
        "psllw $2, %%mm1 \n\t" /* 4* */ \
        "psubw %%mm1, %%mm3 \n\t" \
        "paddw %%mm7, %%mm3 \n\t" \
        "psraw %6, %%mm3 \n\t" \
        "movq %%mm3, 16(%2) \n\t" \
        "add %3, %1 \n\t" \
        "add $24, %2 \n\t" \
        "decl %0 \n\t" \
        "jnz 1b \n\t" \
        : "+r"(h), "+r" (src), "+r" (dst) \
        : "r"(src_stride), "r"(3*src_stride), \
          "m"(rnd), "m"(shift) \
        : "memory" \
    ); \
}

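/**
 * Macro to build the horizontal 16-bit version of OPNAME ## vc1_hor_16b_shift[13].
 * Filters the 16-bit intermediates horizontally with the same -4,53,18,-3
 * taps, shifts by 7, removes the overflow bias (ff_pw_128) and packs the
 * result to 8-bit pixels with OP.
 */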
#define MSPEL_FILTER13_HOR_16B(NAME, A1, A2, A3, A4, OP, OPNAME) \
static void \
OPNAME ## vc1_hor_16b_ ## NAME ## _mmx(uint8_t *dst, x86_reg stride, \
                                       const int16_t *src, int rnd) \
{ \
    int h = 8; \
    src -= 1; \
    rnd -= (-4+58+13-3)*256; /* Add -256 bias */ \
    __asm__ volatile( \
        LOAD_ROUNDER_MMX("%4") \
        "movq "MANGLE(ff_pw_18)", %%mm6 \n\t" \
        "movq "MANGLE(ff_pw_53)", %%mm5 \n\t" \
        ".p2align 3 \n\t" \
        "1: \n\t" \
        MSPEL_FILTER13_CORE(DONT_UNPACK, "movq 2", A1, A2, A3, A4) \
        NORMALIZE_MMX("$7") \
        /* Remove bias */ \
        "paddw "MANGLE(ff_pw_128)", %%mm3 \n\t" \
        "paddw "MANGLE(ff_pw_128)", %%mm4 \n\t" \
        TRANSFER_DO_PACK(OP) \
        "add $24, %1 \n\t" \
        "add %3, %2 \n\t" \
        "decl %0 \n\t" \
        "jnz 1b \n\t" \
        : "+r"(h), "+r" (src), "+r" (dst) \
        : "r"(stride), "m"(rnd) \
        : "memory" \
    ); \
}

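/**
 * Macro to build the single-pass 8-bit version of OPNAME ## vc1_shift[13]:
 * filters 8x8 pixels directly from src to dst with the -4,53,18,-3 taps,
 * rounding with 32-rnd and shifting right by 6. The offset argument selects
 * horizontal (1) or vertical (stride) filtering.
 */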
#define MSPEL_FILTER13_8B(NAME, A1, A2, A3, A4, OP, OPNAME) \
static void \
OPNAME ## vc1_## NAME ## _mmx(uint8_t *dst, const uint8_t *src, \
                              x86_reg stride, int rnd, x86_reg offset) \
{ \
    int h = 8; \
    src -= offset; \
    rnd = 32-rnd; \
    __asm__ volatile ( \
        LOAD_ROUNDER_MMX("%6") \
        "movq "MANGLE(ff_pw_53)", %%mm5 \n\t" \
        "movq "MANGLE(ff_pw_18)", %%mm6 \n\t" \
        ".p2align 3 \n\t" \
        "1: \n\t" \
        MSPEL_FILTER13_CORE(DO_UNPACK, "movd 1", A1, A2, A3, A4) \
        NORMALIZE_MMX("$6") \
        TRANSFER_DO_PACK(OP) \
        "add %5, %1 \n\t" \
        "add %5, %2 \n\t" \
        "decl %0 \n\t" \
        "jnz 1b \n\t" \
        : "+r"(h), "+r" (src), "+r" (dst) \
        : "r"(offset), "r"(3*offset), "g"(stride), "m"(rnd) \
        : "memory" \
    ); \
}

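/** 1/4-shift bicubic interpolation (shift1): taps applied as -4,53,18,-3. */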
MSPEL_FILTER13_8B     (shift1, "0(%1,%4 )", "0(%1,%3,2)", "0(%1,%3 )", "0(%1 )", OP_PUT, put_)
MSPEL_FILTER13_8B     (shift1, "0(%1,%4 )", "0(%1,%3,2)", "0(%1,%3 )", "0(%1 )", OP_AVG, avg_)
MSPEL_FILTER13_VER_16B(shift1, "0(%1,%4 )", "0(%1,%3,2)", "0(%1,%3 )", "0(%1 )")
MSPEL_FILTER13_HOR_16B(shift1, "2*3(%1)", "2*2(%1)", "2*1(%1)", "2*0(%1)", OP_PUT, put_)
MSPEL_FILTER13_HOR_16B(shift1, "2*3(%1)", "2*2(%1)", "2*1(%1)", "2*0(%1)", OP_AVG, avg_)

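/** 3/4-shift bicubic interpolation (shift3): same taps, mirrored. */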
MSPEL_FILTER13_8B     (shift3, "0(%1 )", "0(%1,%3 )", "0(%1,%3,2)", "0(%1,%4 )", OP_PUT, put_)
MSPEL_FILTER13_8B     (shift3, "0(%1 )", "0(%1,%3 )", "0(%1,%3,2)", "0(%1,%4 )", OP_AVG, avg_)
MSPEL_FILTER13_VER_16B(shift3, "0(%1 )", "0(%1,%3 )", "0(%1,%3,2)", "0(%1,%4 )")
MSPEL_FILTER13_HOR_16B(shift3, "2*0(%1)", "2*1(%1)", "2*2(%1)", "2*3(%1)", OP_PUT, put_)
MSPEL_FILTER13_HOR_16B(shift3, "2*0(%1)", "2*1(%1)", "2*2(%1)", "2*3(%1)", OP_AVG, avg_)

typedef void (*vc1_mspel_mc_filter_ver_16bits)(int16_t *dst, const uint8_t *src, x86_reg src_stride, int rnd, int64_t shift);
typedef void (*vc1_mspel_mc_filter_hor_16bits)(uint8_t *dst, x86_reg dst_stride, const int16_t *src, int rnd);
typedef void (*vc1_mspel_mc_filter_8bits)(uint8_t *dst, const uint8_t *src, x86_reg stride, int rnd, x86_reg offset);

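/**
 * Interpolate fractional-pel values by applying the proper vertical then
 * horizontal filter. When both modes are set, the vertical pass writes
 * 16-bit intermediates into an aligned temporary buffer and the horizontal
 * pass reads them back; otherwise a single 8-bit pass is used.
 *
 * @param dst    Destination buffer for interpolated pixels.
 * @param src    Source buffer.
 * @param stride Stride for both src and dst buffers.
 * @param hmode  Horizontal filter (expressed in quarter-pel shifts).
 * @param vmode  Vertical filter.
 * @param rnd    Rounding bias.
 */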
#define VC1_MSPEL_MC(OP)\
static void OP ## vc1_mspel_mc(uint8_t *dst, const uint8_t *src, int stride,\
                               int hmode, int vmode, int rnd)\
{\
    static const vc1_mspel_mc_filter_ver_16bits vc1_put_shift_ver_16bits[] =\
         { NULL, vc1_put_ver_16b_shift1_mmx, vc1_put_ver_16b_shift2_mmx, vc1_put_ver_16b_shift3_mmx };\
    static const vc1_mspel_mc_filter_hor_16bits vc1_put_shift_hor_16bits[] =\
         { NULL, OP ## vc1_hor_16b_shift1_mmx, OP ## vc1_hor_16b_shift2_mmx, OP ## vc1_hor_16b_shift3_mmx };\
    static const vc1_mspel_mc_filter_8bits vc1_put_shift_8bits[] =\
         { NULL, OP ## vc1_shift1_mmx, OP ## vc1_shift2_mmx, OP ## vc1_shift3_mmx };\
\
    __asm__ volatile(\
        "pxor %%mm0, %%mm0 \n\t"\
        ::: "memory"\
    );\
\
    if (vmode) { /* Vertical filter to apply */\
        if (hmode) { /* Horizontal filter to apply, output to tmp */\
            static const int shift_value[] = { 0, 5, 1, 5 };\
            int shift = (shift_value[hmode]+shift_value[vmode])>>1;\
            int r;\
            DECLARE_ALIGNED(16, int16_t, tmp)[12*8];\
\
            r = (1<<(shift-1)) + rnd-1;\
            vc1_put_shift_ver_16bits[vmode](tmp, src-1, stride, r, shift);\
\
            vc1_put_shift_hor_16bits[hmode](dst, stride, tmp+1, 64-rnd);\
            return;\
        }\
        else { /* No horizontal filter, output 8 lines to dst */\
            vc1_put_shift_8bits[vmode](dst, src, stride, 1-rnd, stride);\
            return;\
        }\
    }\
\
    /* Horizontal mode with no vertical mode */\
    vc1_put_shift_8bits[hmode](dst, src, stride, rnd, 1);\
}

VC1_MSPEL_MC(put_)
VC1_MSPEL_MC(avg_)

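/** Macro to declare the put/avg mspel motion-compensation wrappers for one
 *  (horizontal, vertical) quarter-pel shift pair. */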
#define DECLARE_FUNCTION(a, b) \
static void put_vc1_mspel_mc ## a ## b ## _mmx(uint8_t *dst, const uint8_t *src, int stride, int rnd) { \
     put_vc1_mspel_mc(dst, src, stride, a, b, rnd); \
}\
static void avg_vc1_mspel_mc ## a ## b ## _mmx2(uint8_t *dst, const uint8_t *src, int stride, int rnd) { \
     avg_vc1_mspel_mc(dst, src, stride, a, b, rnd); \
}

DECLARE_FUNCTION(0, 1)
DECLARE_FUNCTION(0, 2)
DECLARE_FUNCTION(0, 3)

DECLARE_FUNCTION(1, 0)
DECLARE_FUNCTION(1, 1)
DECLARE_FUNCTION(1, 2)
DECLARE_FUNCTION(1, 3)

DECLARE_FUNCTION(2, 0)
DECLARE_FUNCTION(2, 1)
DECLARE_FUNCTION(2, 2)
DECLARE_FUNCTION(2, 3)

DECLARE_FUNCTION(3, 0)
DECLARE_FUNCTION(3, 1)
DECLARE_FUNCTION(3, 2)
DECLARE_FUNCTION(3, 3)

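/* DC-only inverse transforms: when only the DC coefficient is non-zero, the
 * full inverse transform reduces to scaling the DC value by the row and
 * column transform constants (for 4x4: (17*dc + 4) >> 3 for the row pass and
 * (17*dc + 64) >> 7 for the column pass), splatting it across a register and
 * adding it to the destination block with unsigned saturation
 * (paddusb/psubusb cover the positive and negative cases). */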
static void vc1_inv_trans_4x4_dc_mmx2(uint8_t *dest, int linesize, DCTELEM *block)
{
    int dc = block[0];
    dc = (17 * dc +  4) >> 3;
    dc = (17 * dc + 64) >> 7;
    __asm__ volatile(
        "movd %0, %%mm0 \n\t"
        "pshufw $0, %%mm0, %%mm0 \n\t"
        "pxor %%mm1, %%mm1 \n\t"
        "psubw %%mm0, %%mm1 \n\t"
        "packuswb %%mm0, %%mm0 \n\t"
        "packuswb %%mm1, %%mm1 \n\t"
        ::"r"(dc)
    );
    __asm__ volatile(
        "movd %0, %%mm2 \n\t"
        "movd %1, %%mm3 \n\t"
        "movd %2, %%mm4 \n\t"
        "movd %3, %%mm5 \n\t"
        "paddusb %%mm0, %%mm2 \n\t"
        "paddusb %%mm0, %%mm3 \n\t"
        "paddusb %%mm0, %%mm4 \n\t"
        "paddusb %%mm0, %%mm5 \n\t"
        "psubusb %%mm1, %%mm2 \n\t"
        "psubusb %%mm1, %%mm3 \n\t"
        "psubusb %%mm1, %%mm4 \n\t"
        "psubusb %%mm1, %%mm5 \n\t"
        "movd %%mm2, %0 \n\t"
        "movd %%mm3, %1 \n\t"
        "movd %%mm4, %2 \n\t"
        "movd %%mm5, %3 \n\t"
        :"+m"(*(uint32_t*)(dest+0*linesize)),
         "+m"(*(uint32_t*)(dest+1*linesize)),
         "+m"(*(uint32_t*)(dest+2*linesize)),
         "+m"(*(uint32_t*)(dest+3*linesize))
    );
}

static void vc1_inv_trans_4x8_dc_mmx2(uint8_t *dest, int linesize, DCTELEM *block)
{
    int dc = block[0];
    dc = (17 * dc +  4) >> 3;
    dc = (12 * dc + 64) >> 7;
    __asm__ volatile(
        "movd %0, %%mm0 \n\t"
        "pshufw $0, %%mm0, %%mm0 \n\t"
        "pxor %%mm1, %%mm1 \n\t"
        "psubw %%mm0, %%mm1 \n\t"
        "packuswb %%mm0, %%mm0 \n\t"
        "packuswb %%mm1, %%mm1 \n\t"
        ::"r"(dc)
    );
    __asm__ volatile(
        "movd %0, %%mm2 \n\t"
        "movd %1, %%mm3 \n\t"
        "movd %2, %%mm4 \n\t"
        "movd %3, %%mm5 \n\t"
        "paddusb %%mm0, %%mm2 \n\t"
        "paddusb %%mm0, %%mm3 \n\t"
        "paddusb %%mm0, %%mm4 \n\t"
        "paddusb %%mm0, %%mm5 \n\t"
        "psubusb %%mm1, %%mm2 \n\t"
        "psubusb %%mm1, %%mm3 \n\t"
        "psubusb %%mm1, %%mm4 \n\t"
        "psubusb %%mm1, %%mm5 \n\t"
        "movd %%mm2, %0 \n\t"
        "movd %%mm3, %1 \n\t"
        "movd %%mm4, %2 \n\t"
        "movd %%mm5, %3 \n\t"
        :"+m"(*(uint32_t*)(dest+0*linesize)),
         "+m"(*(uint32_t*)(dest+1*linesize)),
         "+m"(*(uint32_t*)(dest+2*linesize)),
         "+m"(*(uint32_t*)(dest+3*linesize))
    );
    dest += 4*linesize;
    __asm__ volatile(
        "movd %0, %%mm2 \n\t"
        "movd %1, %%mm3 \n\t"
        "movd %2, %%mm4 \n\t"
        "movd %3, %%mm5 \n\t"
        "paddusb %%mm0, %%mm2 \n\t"
        "paddusb %%mm0, %%mm3 \n\t"
        "paddusb %%mm0, %%mm4 \n\t"
        "paddusb %%mm0, %%mm5 \n\t"
        "psubusb %%mm1, %%mm2 \n\t"
        "psubusb %%mm1, %%mm3 \n\t"
        "psubusb %%mm1, %%mm4 \n\t"
        "psubusb %%mm1, %%mm5 \n\t"
        "movd %%mm2, %0 \n\t"
        "movd %%mm3, %1 \n\t"
        "movd %%mm4, %2 \n\t"
        "movd %%mm5, %3 \n\t"
        :"+m"(*(uint32_t*)(dest+0*linesize)),
         "+m"(*(uint32_t*)(dest+1*linesize)),
         "+m"(*(uint32_t*)(dest+2*linesize)),
         "+m"(*(uint32_t*)(dest+3*linesize))
    );
}

static void vc1_inv_trans_8x4_dc_mmx2(uint8_t *dest, int linesize, DCTELEM *block)
{
    int dc = block[0];
    dc = ( 3 * dc +  1) >> 1;
    dc = (17 * dc + 64) >> 7;
    __asm__ volatile(
        "movd %0, %%mm0 \n\t"
        "pshufw $0, %%mm0, %%mm0 \n\t"
        "pxor %%mm1, %%mm1 \n\t"
        "psubw %%mm0, %%mm1 \n\t"
        "packuswb %%mm0, %%mm0 \n\t"
        "packuswb %%mm1, %%mm1 \n\t"
        ::"r"(dc)
    );
    __asm__ volatile(
        "movq %0, %%mm2 \n\t"
        "movq %1, %%mm3 \n\t"
        "movq %2, %%mm4 \n\t"
        "movq %3, %%mm5 \n\t"
        "paddusb %%mm0, %%mm2 \n\t"
        "paddusb %%mm0, %%mm3 \n\t"
        "paddusb %%mm0, %%mm4 \n\t"
        "paddusb %%mm0, %%mm5 \n\t"
        "psubusb %%mm1, %%mm2 \n\t"
        "psubusb %%mm1, %%mm3 \n\t"
        "psubusb %%mm1, %%mm4 \n\t"
        "psubusb %%mm1, %%mm5 \n\t"
        "movq %%mm2, %0 \n\t"
        "movq %%mm3, %1 \n\t"
        "movq %%mm4, %2 \n\t"
        "movq %%mm5, %3 \n\t"
        :"+m"(*(uint32_t*)(dest+0*linesize)),
         "+m"(*(uint32_t*)(dest+1*linesize)),
         "+m"(*(uint32_t*)(dest+2*linesize)),
         "+m"(*(uint32_t*)(dest+3*linesize))
    );
}

static void vc1_inv_trans_8x8_dc_mmx2(uint8_t *dest, int linesize, DCTELEM *block)
{
    int dc = block[0];
    dc = (3 * dc +  1) >> 1;
    dc = (3 * dc + 16) >> 5;
    __asm__ volatile(
        "movd %0, %%mm0 \n\t"
        "pshufw $0, %%mm0, %%mm0 \n\t"
        "pxor %%mm1, %%mm1 \n\t"
        "psubw %%mm0, %%mm1 \n\t"
        "packuswb %%mm0, %%mm0 \n\t"
        "packuswb %%mm1, %%mm1 \n\t"
        ::"r"(dc)
    );
    __asm__ volatile(
        "movq %0, %%mm2 \n\t"
        "movq %1, %%mm3 \n\t"
        "movq %2, %%mm4 \n\t"
        "movq %3, %%mm5 \n\t"
        "paddusb %%mm0, %%mm2 \n\t"
        "paddusb %%mm0, %%mm3 \n\t"
        "paddusb %%mm0, %%mm4 \n\t"
        "paddusb %%mm0, %%mm5 \n\t"
        "psubusb %%mm1, %%mm2 \n\t"
        "psubusb %%mm1, %%mm3 \n\t"
        "psubusb %%mm1, %%mm4 \n\t"
        "psubusb %%mm1, %%mm5 \n\t"
        "movq %%mm2, %0 \n\t"
        "movq %%mm3, %1 \n\t"
        "movq %%mm4, %2 \n\t"
        "movq %%mm5, %3 \n\t"
        :"+m"(*(uint32_t*)(dest+0*linesize)),
         "+m"(*(uint32_t*)(dest+1*linesize)),
         "+m"(*(uint32_t*)(dest+2*linesize)),
         "+m"(*(uint32_t*)(dest+3*linesize))
    );
    dest += 4*linesize;
    __asm__ volatile(
        "movq %0, %%mm2 \n\t"
        "movq %1, %%mm3 \n\t"
        "movq %2, %%mm4 \n\t"
        "movq %3, %%mm5 \n\t"
        "paddusb %%mm0, %%mm2 \n\t"
        "paddusb %%mm0, %%mm3 \n\t"
        "paddusb %%mm0, %%mm4 \n\t"
        "paddusb %%mm0, %%mm5 \n\t"
        "psubusb %%mm1, %%mm2 \n\t"
        "psubusb %%mm1, %%mm3 \n\t"
        "psubusb %%mm1, %%mm4 \n\t"
        "psubusb %%mm1, %%mm5 \n\t"
        "movq %%mm2, %0 \n\t"
        "movq %%mm3, %1 \n\t"
        "movq %%mm4, %2 \n\t"
        "movq %%mm5, %3 \n\t"
        :"+m"(*(uint32_t*)(dest+0*linesize)),
         "+m"(*(uint32_t*)(dest+1*linesize)),
         "+m"(*(uint32_t*)(dest+2*linesize)),
         "+m"(*(uint32_t*)(dest+3*linesize))
    );
}

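/* In-loop deblocking filters. The 4- and 8-pixel versions are external
 * assembly (built only when yasm is available); the 16-pixel wrappers simply
 * apply the 8-pixel filter twice. */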
#define LOOP_FILTER(EXT) \
void ff_vc1_v_loop_filter4_ ## EXT(uint8_t *src, int stride, int pq); \
void ff_vc1_h_loop_filter4_ ## EXT(uint8_t *src, int stride, int pq); \
void ff_vc1_v_loop_filter8_ ## EXT(uint8_t *src, int stride, int pq); \
void ff_vc1_h_loop_filter8_ ## EXT(uint8_t *src, int stride, int pq); \
\
static void vc1_v_loop_filter16_ ## EXT(uint8_t *src, int stride, int pq) \
{ \
    ff_vc1_v_loop_filter8_ ## EXT(src,   stride, pq); \
    ff_vc1_v_loop_filter8_ ## EXT(src+8, stride, pq); \
} \
\
static void vc1_h_loop_filter16_ ## EXT(uint8_t *src, int stride, int pq) \
{ \
    ff_vc1_h_loop_filter8_ ## EXT(src,          stride, pq); \
    ff_vc1_h_loop_filter8_ ## EXT(src+8*stride, stride, pq); \
}

#if HAVE_YASM
LOOP_FILTER(mmx)
LOOP_FILTER(mmx2)
LOOP_FILTER(sse2)
LOOP_FILTER(ssse3)

void ff_vc1_h_loop_filter8_sse4(uint8_t *src, int stride, int pq);

static void vc1_h_loop_filter16_sse4(uint8_t *src, int stride, int pq)
{
    ff_vc1_h_loop_filter8_sse4(src,          stride, pq);
    ff_vc1_h_loop_filter8_sse4(src+8*stride, stride, pq);
}

#endif

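/* No-rounding chroma motion compensation, implemented outside this file and
 * hooked up below only when yasm support is available. */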
void ff_put_vc1_chroma_mc8_mmx_nornd  (uint8_t *dst, uint8_t *src,
                                       int stride, int h, int x, int y);
void ff_avg_vc1_chroma_mc8_mmx2_nornd (uint8_t *dst, uint8_t *src,
                                       int stride, int h, int x, int y);
void ff_avg_vc1_chroma_mc8_3dnow_nornd(uint8_t *dst, uint8_t *src,
                                       int stride, int h, int x, int y);
void ff_put_vc1_chroma_mc8_ssse3_nornd(uint8_t *dst, uint8_t *src,
                                       int stride, int h, int x, int y);
void ff_avg_vc1_chroma_mc8_ssse3_nornd(uint8_t *dst, uint8_t *src,
                                       int stride, int h, int x, int y);

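/** Set the VC-1 DSP function pointers according to the available CPU
 *  features (MMX, MMX2/3DNow!, and the yasm loop filters up to SSE4). */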
void ff_vc1dsp_init_mmx(VC1DSPContext* dsp)
{
    int mm_flags = av_get_cpu_flags();

    if (mm_flags & AV_CPU_FLAG_MMX) {
        dsp->put_vc1_mspel_pixels_tab[ 0] = ff_put_vc1_mspel_mc00_mmx;
        dsp->put_vc1_mspel_pixels_tab[ 4] = put_vc1_mspel_mc01_mmx;
        dsp->put_vc1_mspel_pixels_tab[ 8] = put_vc1_mspel_mc02_mmx;
        dsp->put_vc1_mspel_pixels_tab[12] = put_vc1_mspel_mc03_mmx;

        dsp->put_vc1_mspel_pixels_tab[ 1] = put_vc1_mspel_mc10_mmx;
        dsp->put_vc1_mspel_pixels_tab[ 5] = put_vc1_mspel_mc11_mmx;
        dsp->put_vc1_mspel_pixels_tab[ 9] = put_vc1_mspel_mc12_mmx;
        dsp->put_vc1_mspel_pixels_tab[13] = put_vc1_mspel_mc13_mmx;

        dsp->put_vc1_mspel_pixels_tab[ 2] = put_vc1_mspel_mc20_mmx;
        dsp->put_vc1_mspel_pixels_tab[ 6] = put_vc1_mspel_mc21_mmx;
        dsp->put_vc1_mspel_pixels_tab[10] = put_vc1_mspel_mc22_mmx;
        dsp->put_vc1_mspel_pixels_tab[14] = put_vc1_mspel_mc23_mmx;

        dsp->put_vc1_mspel_pixels_tab[ 3] = put_vc1_mspel_mc30_mmx;
        dsp->put_vc1_mspel_pixels_tab[ 7] = put_vc1_mspel_mc31_mmx;
        dsp->put_vc1_mspel_pixels_tab[11] = put_vc1_mspel_mc32_mmx;
        dsp->put_vc1_mspel_pixels_tab[15] = put_vc1_mspel_mc33_mmx;
    }

    if (mm_flags & AV_CPU_FLAG_MMX2) {
        dsp->avg_vc1_mspel_pixels_tab[ 0] = ff_avg_vc1_mspel_mc00_mmx2;
        dsp->avg_vc1_mspel_pixels_tab[ 4] = avg_vc1_mspel_mc01_mmx2;
        dsp->avg_vc1_mspel_pixels_tab[ 8] = avg_vc1_mspel_mc02_mmx2;
        dsp->avg_vc1_mspel_pixels_tab[12] = avg_vc1_mspel_mc03_mmx2;

        dsp->avg_vc1_mspel_pixels_tab[ 1] = avg_vc1_mspel_mc10_mmx2;
        dsp->avg_vc1_mspel_pixels_tab[ 5] = avg_vc1_mspel_mc11_mmx2;
        dsp->avg_vc1_mspel_pixels_tab[ 9] = avg_vc1_mspel_mc12_mmx2;
        dsp->avg_vc1_mspel_pixels_tab[13] = avg_vc1_mspel_mc13_mmx2;

        dsp->avg_vc1_mspel_pixels_tab[ 2] = avg_vc1_mspel_mc20_mmx2;
        dsp->avg_vc1_mspel_pixels_tab[ 6] = avg_vc1_mspel_mc21_mmx2;
        dsp->avg_vc1_mspel_pixels_tab[10] = avg_vc1_mspel_mc22_mmx2;
        dsp->avg_vc1_mspel_pixels_tab[14] = avg_vc1_mspel_mc23_mmx2;

        dsp->avg_vc1_mspel_pixels_tab[ 3] = avg_vc1_mspel_mc30_mmx2;
        dsp->avg_vc1_mspel_pixels_tab[ 7] = avg_vc1_mspel_mc31_mmx2;
        dsp->avg_vc1_mspel_pixels_tab[11] = avg_vc1_mspel_mc32_mmx2;
        dsp->avg_vc1_mspel_pixels_tab[15] = avg_vc1_mspel_mc33_mmx2;

        dsp->vc1_inv_trans_8x8_dc = vc1_inv_trans_8x8_dc_mmx2;
        dsp->vc1_inv_trans_4x8_dc = vc1_inv_trans_4x8_dc_mmx2;
        dsp->vc1_inv_trans_8x4_dc = vc1_inv_trans_8x4_dc_mmx2;
        dsp->vc1_inv_trans_4x4_dc = vc1_inv_trans_4x4_dc_mmx2;
    }

#define ASSIGN_LF(EXT) \
        dsp->vc1_v_loop_filter4  = ff_vc1_v_loop_filter4_ ## EXT; \
        dsp->vc1_h_loop_filter4  = ff_vc1_h_loop_filter4_ ## EXT; \
        dsp->vc1_v_loop_filter8  = ff_vc1_v_loop_filter8_ ## EXT; \
        dsp->vc1_h_loop_filter8  = ff_vc1_h_loop_filter8_ ## EXT; \
        dsp->vc1_v_loop_filter16 = vc1_v_loop_filter16_ ## EXT; \
        dsp->vc1_h_loop_filter16 = vc1_h_loop_filter16_ ## EXT

#if HAVE_YASM
    if (mm_flags & AV_CPU_FLAG_MMX) {
        ASSIGN_LF(mmx);
        dsp->put_no_rnd_vc1_chroma_pixels_tab[0] = ff_put_vc1_chroma_mc8_mmx_nornd;
    } else
        return;

    if (mm_flags & AV_CPU_FLAG_MMX2) {
        ASSIGN_LF(mmx2);
        dsp->avg_no_rnd_vc1_chroma_pixels_tab[0] = ff_avg_vc1_chroma_mc8_mmx2_nornd;
    } else if (mm_flags & AV_CPU_FLAG_3DNOW) {
        dsp->avg_no_rnd_vc1_chroma_pixels_tab[0] = ff_avg_vc1_chroma_mc8_3dnow_nornd;
    }

    if (mm_flags & AV_CPU_FLAG_SSE2) {
        dsp->vc1_v_loop_filter8  = ff_vc1_v_loop_filter8_sse2;
        dsp->vc1_h_loop_filter8  = ff_vc1_h_loop_filter8_sse2;
        dsp->vc1_v_loop_filter16 = vc1_v_loop_filter16_sse2;
        dsp->vc1_h_loop_filter16 = vc1_h_loop_filter16_sse2;
    }
    if (mm_flags & AV_CPU_FLAG_SSSE3) {
        ASSIGN_LF(ssse3);
        dsp->put_no_rnd_vc1_chroma_pixels_tab[0] = ff_put_vc1_chroma_mc8_ssse3_nornd;
        dsp->avg_no_rnd_vc1_chroma_pixels_tab[0] = ff_avg_vc1_chroma_mc8_ssse3_nornd;
    }
    if (mm_flags & AV_CPU_FLAG_SSE4) {
        dsp->vc1_h_loop_filter8  = ff_vc1_h_loop_filter8_sse4;
        dsp->vc1_h_loop_filter16 = vc1_h_loop_filter16_sse4;
    }
#endif
}