{0x8000000080000000ULL, 0x8000000080000000ULL};
#define JUMPALIGN() __asm__ volatile (".p2align 3"::)
#define MOVQ_ZERO(regd) __asm__ volatile ("pxor %%" #regd ", %%" #regd ::)
#define MOVQ_BFE(regd) \
    __asm__ volatile ( \
    "pcmpeqd %%" #regd ", %%" #regd " \n\t"\
    "paddb %%" #regd ", %%" #regd " \n\t" ::)

#define MOVQ_BONE(regd) __asm__ volatile ("movq %0, %%" #regd " \n\t" ::"m"(ff_bone))
#define MOVQ_WTWO(regd) __asm__ volatile ("movq %0, %%" #regd " \n\t" ::"m"(ff_wtwo))

/* alternative definitions that build the same constants in-register
   instead of loading them from memory */
#define MOVQ_BONE(regd) \
    __asm__ volatile ( \
    "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
    "psrlw $15, %%" #regd " \n\t" \
    "packuswb %%" #regd ", %%" #regd " \n\t" ::)

#define MOVQ_WTWO(regd) \
    __asm__ volatile ( \
    "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
    "psrlw $15, %%" #regd " \n\t" \
    "psllw $1, %%" #regd " \n\t" ::)
#define PAVGB_MMX_NO_RND(rega, regb, regr, regfe) \
    "movq " #rega ", " #regr " \n\t"\
    "pand " #regb ", " #regr " \n\t"\
    "pxor " #rega ", " #regb " \n\t"\
    "pand " #regfe ", " #regb " \n\t"\
    "psrlq $1, " #regb " \n\t"\
    "paddb " #regb ", " #regr " \n\t"

#define PAVGB_MMX(rega, regb, regr, regfe) \
    "movq " #rega ", " #regr " \n\t"\
    "por " #regb ", " #regr " \n\t"\
    "pxor " #rega ", " #regb " \n\t"\
    "pand " #regfe ", " #regb " \n\t"\
    "psrlq $1, " #regb " \n\t"\
    "psubb " #regb ", " #regr " \n\t"
#define PAVGBP_MMX_NO_RND(rega, regb, regr, regc, regd, regp) \
    "movq " #rega ", " #regr " \n\t"\
    "movq " #regc ", " #regp " \n\t"\
    "pand " #regb ", " #regr " \n\t"\
    "pand " #regd ", " #regp " \n\t"\
    "pxor " #rega ", " #regb " \n\t"\
    "pxor " #regc ", " #regd " \n\t"\
    "pand %%mm6, " #regb " \n\t"\
    "pand %%mm6, " #regd " \n\t"\
    "psrlq $1, " #regb " \n\t"\
    "psrlq $1, " #regd " \n\t"\
    "paddb " #regb ", " #regr " \n\t"\
    "paddb " #regd ", " #regp " \n\t"

#define PAVGBP_MMX(rega, regb, regr, regc, regd, regp) \
    "movq " #rega ", " #regr " \n\t"\
    "movq " #regc ", " #regp " \n\t"\
    "por " #regb ", " #regr " \n\t"\
    "por " #regd ", " #regp " \n\t"\
    "pxor " #rega ", " #regb " \n\t"\
    "pxor " #regc ", " #regd " \n\t"\
    "pand %%mm6, " #regb " \n\t"\
    "pand %%mm6, " #regd " \n\t"\
    "psrlq $1, " #regd " \n\t"\
    "psrlq $1, " #regb " \n\t"\
    "psubb " #regb ", " #regr " \n\t"\
    "psubb " #regd ", " #regp " \n\t"
#define DEF(x, y) x ## _no_rnd_ ## y ## _mmx
#define SET_RND MOVQ_WONE
#define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX_NO_RND(a, b, c, d, e, f)
#define PAVGB(a, b, c, e) PAVGB_MMX_NO_RND(a, b, c, e)
#define OP_AVG(a, b, c, e) PAVGB_MMX(a, b, c, e)
#define DEF(x, y) x ## _ ## y ## _mmx
#define SET_RND MOVQ_WTWO
#define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX(a, b, c, d, e, f)
#define PAVGB(a, b, c, e) PAVGB_MMX(a, b, c, e)
#define DEF(x) x ## _3dnow
#define PAVGB "pavgusb"

#define DEF(x) x ## _mmx2
#define PAVGB "pavgb"

#define put_no_rnd_pixels16_mmx put_pixels16_mmx
#define put_no_rnd_pixels8_mmx put_pixels8_mmx
#define put_pixels16_mmx2 put_pixels16_mmx
#define put_pixels8_mmx2 put_pixels8_mmx
#define put_pixels4_mmx2 put_pixels4_mmx
#define put_no_rnd_pixels16_mmx2 put_no_rnd_pixels16_mmx
#define put_no_rnd_pixels8_mmx2 put_no_rnd_pixels8_mmx
#define put_pixels16_3dnow put_pixels16_mmx
#define put_pixels8_3dnow put_pixels8_mmx
#define put_pixels4_3dnow put_pixels4_mmx
#define put_no_rnd_pixels16_3dnow put_no_rnd_pixels16_mmx
#define put_no_rnd_pixels8_3dnow put_no_rnd_pixels8_mmx
247 "movq %3, %%mm0 \n\t"
248 "movq 8%3, %%mm1 \n\t"
249 "movq 16%3, %%mm2 \n\t"
250 "movq 24%3, %%mm3 \n\t"
251 "movq 32%3, %%mm4 \n\t"
252 "movq 40%3, %%mm5 \n\t"
253 "movq 48%3, %%mm6 \n\t"
254 "movq 56%3, %%mm7 \n\t"
255 "packuswb %%mm1, %%mm0 \n\t"
256 "packuswb %%mm3, %%mm2 \n\t"
257 "packuswb %%mm5, %%mm4 \n\t"
258 "packuswb %%mm7, %%mm6 \n\t"
259 "movq %%mm0, (%0) \n\t"
260 "movq %%mm2, (%0, %1) \n\t"
261 "movq %%mm4, (%0, %1, 2) \n\t"
262 "movq %%mm6, (%0, %2) \n\t"
263 ::
"r" (pix),
"r" ((
x86_reg)line_size),
"r" ((
x86_reg)line_size*3),
"m"(*p)
272 "movq (%3), %%mm0 \n\t"
273 "movq 8(%3), %%mm1 \n\t"
274 "movq 16(%3), %%mm2 \n\t"
275 "movq 24(%3), %%mm3 \n\t"
276 "movq 32(%3), %%mm4 \n\t"
277 "movq 40(%3), %%mm5 \n\t"
278 "movq 48(%3), %%mm6 \n\t"
279 "movq 56(%3), %%mm7 \n\t"
280 "packuswb %%mm1, %%mm0 \n\t"
281 "packuswb %%mm3, %%mm2 \n\t"
282 "packuswb %%mm5, %%mm4 \n\t"
283 "packuswb %%mm7, %%mm6 \n\t"
284 "movq %%mm0, (%0) \n\t"
285 "movq %%mm2, (%0, %1) \n\t"
286 "movq %%mm4, (%0, %1, 2) \n\t"
287 "movq %%mm6, (%0, %2) \n\t"
288 ::
"r" (pix),
"r" ((
x86_reg)line_size),
"r" ((
x86_reg)line_size*3),
"r"(p)
#define put_signed_pixels_clamped_mmx_half(off) \
    "movq "#off"(%2), %%mm1 \n\t"\
    "movq 16+"#off"(%2), %%mm2 \n\t"\
    "movq 32+"#off"(%2), %%mm3 \n\t"\
    "movq 48+"#off"(%2), %%mm4 \n\t"\
    "packsswb 8+"#off"(%2), %%mm1 \n\t"\
    "packsswb 24+"#off"(%2), %%mm2 \n\t"\
    "packsswb 40+"#off"(%2), %%mm3 \n\t"\
    "packsswb 56+"#off"(%2), %%mm4 \n\t"\
    "paddb %%mm0, %%mm1 \n\t"\
    "paddb %%mm0, %%mm2 \n\t"\
    "paddb %%mm0, %%mm3 \n\t"\
    "paddb %%mm0, %%mm4 \n\t"\
    "movq %%mm1, (%0) \n\t"\
    "movq %%mm2, (%0, %3) \n\t"\
    "movq %%mm3, (%0, %3, 2) \n\t"\
    "movq %%mm4, (%0, %1) \n\t"
317 "lea (%3, %3, 2), %1 \n\t"
319 "lea (%0, %3, 4), %0 \n\t"
321 :
"+&r" (pixels),
"=&r" (line_skip3)
322 :
"r" (
block),
"r"(line_skip)
339 "movq (%2), %%mm0 \n\t"
340 "movq 8(%2), %%mm1 \n\t"
341 "movq 16(%2), %%mm2 \n\t"
342 "movq 24(%2), %%mm3 \n\t"
343 "movq %0, %%mm4 \n\t"
344 "movq %1, %%mm6 \n\t"
345 "movq %%mm4, %%mm5 \n\t"
346 "punpcklbw %%mm7, %%mm4 \n\t"
347 "punpckhbw %%mm7, %%mm5 \n\t"
348 "paddsw %%mm4, %%mm0 \n\t"
349 "paddsw %%mm5, %%mm1 \n\t"
350 "movq %%mm6, %%mm5 \n\t"
351 "punpcklbw %%mm7, %%mm6 \n\t"
352 "punpckhbw %%mm7, %%mm5 \n\t"
353 "paddsw %%mm6, %%mm2 \n\t"
354 "paddsw %%mm5, %%mm3 \n\t"
355 "packuswb %%mm1, %%mm0 \n\t"
356 "packuswb %%mm3, %%mm2 \n\t"
357 "movq %%mm0, %0 \n\t"
358 "movq %%mm2, %1 \n\t"
359 :
"+m"(*pix),
"+m"(*(pix+line_size))
370 "lea (%3, %3), %%"REG_a
" \n\t"
373 "movd (%1), %%mm0 \n\t"
374 "movd (%1, %3), %%mm1 \n\t"
375 "movd %%mm0, (%2) \n\t"
376 "movd %%mm1, (%2, %3) \n\t"
377 "add %%"REG_a
", %1 \n\t"
378 "add %%"REG_a
", %2 \n\t"
379 "movd (%1), %%mm0 \n\t"
380 "movd (%1, %3), %%mm1 \n\t"
381 "movd %%mm0, (%2) \n\t"
382 "movd %%mm1, (%2, %3) \n\t"
383 "add %%"REG_a
", %1 \n\t"
384 "add %%"REG_a
", %2 \n\t"
387 :
"+g"(h),
"+r" (pixels),
"+r" (
block)
396 "lea (%3, %3), %%"REG_a
" \n\t"
399 "movq (%1), %%mm0 \n\t"
400 "movq (%1, %3), %%mm1 \n\t"
401 "movq %%mm0, (%2) \n\t"
402 "movq %%mm1, (%2, %3) \n\t"
403 "add %%"REG_a
", %1 \n\t"
404 "add %%"REG_a
", %2 \n\t"
405 "movq (%1), %%mm0 \n\t"
406 "movq (%1, %3), %%mm1 \n\t"
407 "movq %%mm0, (%2) \n\t"
408 "movq %%mm1, (%2, %3) \n\t"
409 "add %%"REG_a
", %1 \n\t"
410 "add %%"REG_a
", %2 \n\t"
413 :
"+g"(h),
"+r" (pixels),
"+r" (
block)
422 "lea (%3, %3), %%"REG_a
" \n\t"
425 "movq (%1), %%mm0 \n\t"
426 "movq 8(%1), %%mm4 \n\t"
427 "movq (%1, %3), %%mm1 \n\t"
428 "movq 8(%1, %3), %%mm5 \n\t"
429 "movq %%mm0, (%2) \n\t"
430 "movq %%mm4, 8(%2) \n\t"
431 "movq %%mm1, (%2, %3) \n\t"
432 "movq %%mm5, 8(%2, %3) \n\t"
433 "add %%"REG_a
", %1 \n\t"
434 "add %%"REG_a
", %2 \n\t"
435 "movq (%1), %%mm0 \n\t"
436 "movq 8(%1), %%mm4 \n\t"
437 "movq (%1, %3), %%mm1 \n\t"
438 "movq 8(%1, %3), %%mm5 \n\t"
439 "movq %%mm0, (%2) \n\t"
440 "movq %%mm4, 8(%2) \n\t"
441 "movq %%mm1, (%2, %3) \n\t"
442 "movq %%mm5, 8(%2, %3) \n\t"
443 "add %%"REG_a
", %1 \n\t"
444 "add %%"REG_a
", %2 \n\t"
447 :
"+g"(h),
"+r" (pixels),
"+r" (
block)
457 "movdqu (%1), %%xmm0 \n\t"
458 "movdqu (%1,%3), %%xmm1 \n\t"
459 "movdqu (%1,%3,2), %%xmm2 \n\t"
460 "movdqu (%1,%4), %%xmm3 \n\t"
461 "lea (%1,%3,4), %1 \n\t"
462 "movdqa %%xmm0, (%2) \n\t"
463 "movdqa %%xmm1, (%2,%3) \n\t"
464 "movdqa %%xmm2, (%2,%3,2) \n\t"
465 "movdqa %%xmm3, (%2,%4) \n\t"
467 "lea (%2,%3,4), %2 \n\t"
469 :
"+g"(h),
"+r" (pixels),
"+r" (
block)
479 "movdqu (%1), %%xmm0 \n\t"
480 "movdqu (%1,%3), %%xmm1 \n\t"
481 "movdqu (%1,%3,2), %%xmm2 \n\t"
482 "movdqu (%1,%4), %%xmm3 \n\t"
483 "lea (%1,%3,4), %1 \n\t"
484 "pavgb (%2), %%xmm0 \n\t"
485 "pavgb (%2,%3), %%xmm1 \n\t"
486 "pavgb (%2,%3,2), %%xmm2 \n\t"
487 "pavgb (%2,%4), %%xmm3 \n\t"
488 "movdqa %%xmm0, (%2) \n\t"
489 "movdqa %%xmm1, (%2,%3) \n\t"
490 "movdqa %%xmm2, (%2,%3,2) \n\t"
491 "movdqa %%xmm3, (%2,%4) \n\t"
493 "lea (%2,%3,4), %2 \n\t"
495 :
"+g"(h),
"+r" (pixels),
"+r" (
block)
#define CLEAR_BLOCKS(name,n) \
static void name(DCTELEM *blocks)\
{\
    __asm__ volatile(\
        "pxor %%mm7, %%mm7 \n\t"\
        "mov %1, %%"REG_a" \n\t"\
        "1: \n\t"\
        "movq %%mm7, (%0, %%"REG_a") \n\t"\
        "movq %%mm7, 8(%0, %%"REG_a") \n\t"\
        "movq %%mm7, 16(%0, %%"REG_a") \n\t"\
        "movq %%mm7, 24(%0, %%"REG_a") \n\t"\
        "add $32, %%"REG_a" \n\t"\
        "js 1b \n\t"\
        : : "r" (((uint8_t *)blocks)+128*n),\
525 "xorps %%xmm0, %%xmm0 \n"
526 "movaps %%xmm0, (%0) \n"
527 "movaps %%xmm0, 16(%0) \n"
528 "movaps %%xmm0, 32(%0) \n"
529 "movaps %%xmm0, 48(%0) \n"
530 "movaps %%xmm0, 64(%0) \n"
531 "movaps %%xmm0, 80(%0) \n"
532 "movaps %%xmm0, 96(%0) \n"
533 "movaps %%xmm0, 112(%0) \n"
542 "xorps %%xmm0, %%xmm0 \n"
543 "mov %1, %%"REG_a
" \n"
545 "movaps %%xmm0, (%0, %%"REG_a
") \n"
546 "movaps %%xmm0, 16(%0, %%"REG_a
") \n"
547 "movaps %%xmm0, 32(%0, %%"REG_a
") \n"
548 "movaps %%xmm0, 48(%0, %%"REG_a
") \n"
549 "movaps %%xmm0, 64(%0, %%"REG_a
") \n"
550 "movaps %%xmm0, 80(%0, %%"REG_a
") \n"
551 "movaps %%xmm0, 96(%0, %%"REG_a
") \n"
552 "movaps %%xmm0, 112(%0, %%"REG_a
") \n"
553 "add $128, %%"REG_a
" \n"
555 : :
"r" (((uint8_t *)blocks)+128*6),
566 "movq (%1, %0), %%mm0 \n\t"
567 "movq (%2, %0), %%mm1 \n\t"
568 "paddb %%mm0, %%mm1 \n\t"
569 "movq %%mm1, (%2, %0) \n\t"
570 "movq 8(%1, %0), %%mm0 \n\t"
571 "movq 8(%2, %0), %%mm1 \n\t"
572 "paddb %%mm0, %%mm1 \n\t"
573 "movq %%mm1, 8(%2, %0) \n\t"
579 :
"r"(src),
"r"(dst),
"r"((
x86_reg)w-15)
582 dst[i+0] += src[i+0];
590 "movq (%2, %0), %%mm0 \n\t"
591 "movq 8(%2, %0), %%mm1 \n\t"
592 "paddb (%3, %0), %%mm0 \n\t"
593 "paddb 8(%3, %0), %%mm1 \n\t"
594 "movq %%mm0, (%1, %0) \n\t"
595 "movq %%mm1, 8(%1, %0) \n\t"
601 :
"r"(dst),
"r"(src1),
"r"(src2),
"r"((
x86_reg)w-15)
604 dst[i] = src1[i] + src2[i];
static void add_hfyu_median_prediction_cmov(uint8_t *dst, const uint8_t *top,
                                            const uint8_t *diff, int w,
                                            int *left, int *left_top) {
    int l = *left & 0xff;
    int tl = *left_top & 0xff;
    "movzbl (%3,%4), %2 \n"
    "add (%6,%4), %b0 \n"
    "mov %b0, (%5,%4) \n"
    : "+&q"(l), "+&q"(tl), "=&r"(t), "=&q"(x), "+&r"(w2)
    : "r"(dst+w), "r"(diff+w), "rm"(top+w)
#define H263_LOOP_FILTER \
    "pxor %%mm7, %%mm7 \n\t"\
    "movq %0, %%mm0 \n\t"\
    "movq %0, %%mm1 \n\t"\
    "movq %3, %%mm2 \n\t"\
    "movq %3, %%mm3 \n\t"\
    "punpcklbw %%mm7, %%mm0 \n\t"\
    "punpckhbw %%mm7, %%mm1 \n\t"\
    "punpcklbw %%mm7, %%mm2 \n\t"\
    "punpckhbw %%mm7, %%mm3 \n\t"\
    "psubw %%mm2, %%mm0 \n\t"\
    "psubw %%mm3, %%mm1 \n\t"\
    "movq %1, %%mm2 \n\t"\
    "movq %1, %%mm3 \n\t"\
    "movq %2, %%mm4 \n\t"\
    "movq %2, %%mm5 \n\t"\
    "punpcklbw %%mm7, %%mm2 \n\t"\
    "punpckhbw %%mm7, %%mm3 \n\t"\
    "punpcklbw %%mm7, %%mm4 \n\t"\
    "punpckhbw %%mm7, %%mm5 \n\t"\
    "psubw %%mm2, %%mm4 \n\t"\
    "psubw %%mm3, %%mm5 \n\t"\
    "psllw $2, %%mm4 \n\t"\
    "psllw $2, %%mm5 \n\t"\
    "paddw %%mm0, %%mm4 \n\t"\
    "paddw %%mm1, %%mm5 \n\t"\
    "pxor %%mm6, %%mm6 \n\t"\
    "pcmpgtw %%mm4, %%mm6 \n\t"\
    "pcmpgtw %%mm5, %%mm7 \n\t"\
    "pxor %%mm6, %%mm4 \n\t"\
    "pxor %%mm7, %%mm5 \n\t"\
    "psubw %%mm6, %%mm4 \n\t"\
    "psubw %%mm7, %%mm5 \n\t"\
    "psrlw $3, %%mm4 \n\t"\
    "psrlw $3, %%mm5 \n\t"\
    "packuswb %%mm5, %%mm4 \n\t"\
    "packsswb %%mm7, %%mm6 \n\t"\
    "pxor %%mm7, %%mm7 \n\t"\
    "movd %4, %%mm2 \n\t"\
    "punpcklbw %%mm2, %%mm2 \n\t"\
    "punpcklbw %%mm2, %%mm2 \n\t"\
    "punpcklbw %%mm2, %%mm2 \n\t"\
    "psubusb %%mm4, %%mm2 \n\t"\
    "movq %%mm2, %%mm3 \n\t"\
    "psubusb %%mm4, %%mm3 \n\t"\
    "psubb %%mm3, %%mm2 \n\t"\
    "movq %1, %%mm3 \n\t"\
    "movq %2, %%mm4 \n\t"\
    "pxor %%mm6, %%mm3 \n\t"\
    "pxor %%mm6, %%mm4 \n\t"\
    "paddusb %%mm2, %%mm3 \n\t"\
    "psubusb %%mm2, %%mm4 \n\t"\
    "pxor %%mm6, %%mm3 \n\t"\
    "pxor %%mm6, %%mm4 \n\t"\
    "paddusb %%mm2, %%mm2 \n\t"\
    "packsswb %%mm1, %%mm0 \n\t"\
    "pcmpgtb %%mm0, %%mm7 \n\t"\
    "pxor %%mm7, %%mm0 \n\t"\
    "psubb %%mm7, %%mm0 \n\t"\
    "movq %%mm0, %%mm1 \n\t"\
    "psubusb %%mm2, %%mm0 \n\t"\
    "psubb %%mm0, %%mm1 \n\t"\
    "pand %5, %%mm1 \n\t"\
    "psrlw $2, %%mm1 \n\t"\
    "pxor %%mm7, %%mm1 \n\t"\
    "psubb %%mm7, %%mm1 \n\t"\
    "movq %0, %%mm5 \n\t"\
    "movq %3, %%mm6 \n\t"\
    "psubb %%mm1, %%mm5 \n\t"\
    "paddb %%mm1, %%mm6 \n\t"
721 "movq %%mm3, %1 \n\t"
722 "movq %%mm4, %2 \n\t"
723 "movq %%mm5, %0 \n\t"
724 "movq %%mm6, %3 \n\t"
725 :
"+m" (*(uint64_t*)(src - 2*stride)),
726 "+m" (*(uint64_t*)(src - 1*
stride)),
727 "+m" (*(uint64_t*)(src + 0*stride)),
728 "+m" (*(uint64_t*)(src + 1*
stride))
    uint8_t *btemp= (uint8_t*)temp;
755 "movq %%mm5, %%mm1 \n\t"
756 "movq %%mm4, %%mm0 \n\t"
757 "punpcklbw %%mm3, %%mm5 \n\t"
758 "punpcklbw %%mm6, %%mm4 \n\t"
759 "punpckhbw %%mm3, %%mm1 \n\t"
760 "punpckhbw %%mm6, %%mm0 \n\t"
761 "movq %%mm5, %%mm3 \n\t"
762 "movq %%mm1, %%mm6 \n\t"
763 "punpcklwd %%mm4, %%mm5 \n\t"
764 "punpcklwd %%mm0, %%mm1 \n\t"
765 "punpckhwd %%mm4, %%mm3 \n\t"
766 "punpckhwd %%mm0, %%mm6 \n\t"
767 "movd %%mm5, (%0) \n\t"
768 "punpckhdq %%mm5, %%mm5 \n\t"
769 "movd %%mm5, (%0,%2) \n\t"
770 "movd %%mm3, (%0,%2,2) \n\t"
771 "punpckhdq %%mm3, %%mm3 \n\t"
772 "movd %%mm3, (%0,%3) \n\t"
773 "movd %%mm1, (%1) \n\t"
774 "punpckhdq %%mm1, %%mm1 \n\t"
775 "movd %%mm1, (%1,%2) \n\t"
776 "movd %%mm6, (%1,%2,2) \n\t"
777 "punpckhdq %%mm6, %%mm6 \n\t"
778 "movd %%mm6, (%1,%3) \n\t"
780 "r" (src + 4*stride),
    uint8_t *ptr, *last_line;
    last_line = buf + (height - 1) * wrap;
    "movd (%0), %%mm0 \n\t"
    "punpcklbw %%mm0, %%mm0 \n\t"
    "punpcklwd %%mm0, %%mm0 \n\t"
    "punpckldq %%mm0, %%mm0 \n\t"
    "movq %%mm0, -8(%0) \n\t"
    "movq -8(%0, %2), %%mm1 \n\t"
    "punpckhbw %%mm1, %%mm1 \n\t"
    "punpckhwd %%mm1, %%mm1 \n\t"
    "punpckhdq %%mm1, %%mm1 \n\t"
    "movq %%mm1, (%0, %2) \n\t"
    "movd (%0), %%mm0 \n\t"
    "punpcklbw %%mm0, %%mm0 \n\t"
    "punpcklwd %%mm0, %%mm0 \n\t"
    "punpckldq %%mm0, %%mm0 \n\t"
    "movq %%mm0, -8(%0) \n\t"
    "movq %%mm0, -16(%0) \n\t"
    "movq -8(%0, %2), %%mm1 \n\t"
    "punpckhbw %%mm1, %%mm1 \n\t"
    "punpckhwd %%mm1, %%mm1 \n\t"
    "punpckhdq %%mm1, %%mm1 \n\t"
    "movq %%mm1, (%0, %2) \n\t"
    "movq %%mm1, 8(%0, %2) \n\t"
    for(i = 0; i < h; i += 4) {
        ptr = buf - (i + 1) * wrap - w;
        "movq (%1, %0), %%mm0 \n\t"
        "movq %%mm0, (%0) \n\t"
        "movq %%mm0, (%0, %2) \n\t"
        "movq %%mm0, (%0, %2, 2) \n\t"
        "movq %%mm0, (%0, %3) \n\t"
    for(i = 0; i < w; i += 4) {
        ptr = last_line + (i + 1) * wrap - w;
        "movq (%1, %0), %%mm0 \n\t"
        "movq %%mm0, (%0) \n\t"
        "movq %%mm0, (%0, %2) \n\t"
        "movq %%mm0, (%0, %2, 2) \n\t"
        "movq %%mm0, (%0, %3) \n\t"
#define PAETH(cpu, abs3)\
static void add_png_paeth_prediction_##cpu(uint8_t *dst, uint8_t *src, uint8_t *top, int w, int bpp)\
    "pxor %%mm7, %%mm7 \n"\
    "movd (%1,%0), %%mm0 \n"\
    "movd (%2,%0), %%mm1 \n"\
    "punpcklbw %%mm7, %%mm0 \n"\
    "punpcklbw %%mm7, %%mm1 \n"\
    "movq %%mm1, %%mm2 \n"\
    "movd (%2,%0), %%mm1 \n"\
    "movq %%mm2, %%mm3 \n"\
    "punpcklbw %%mm7, %%mm1 \n"\
    "movq %%mm2, %%mm4 \n"\
    "psubw %%mm1, %%mm3 \n"\
    "psubw %%mm0, %%mm4 \n"\
    "movq %%mm3, %%mm5 \n"\
    "paddw %%mm4, %%mm5 \n"\
    "movq %%mm4, %%mm6 \n"\
    "pminsw %%mm5, %%mm6 \n"\
    "pcmpgtw %%mm6, %%mm3 \n"\
    "pcmpgtw %%mm5, %%mm4 \n"\
    "movq %%mm4, %%mm6 \n"\
    "pand %%mm3, %%mm4 \n"\
    "pandn %%mm3, %%mm6 \n"\
    "pandn %%mm0, %%mm3 \n"\
    "movd (%3,%0), %%mm0 \n"\
    "pand %%mm1, %%mm6 \n"\
    "pand %%mm4, %%mm2 \n"\
    "punpcklbw %%mm7, %%mm0 \n"\
    "paddw %%mm6, %%mm0 \n"\
    "paddw %%mm2, %%mm3 \n"\
    "paddw %%mm3, %%mm0 \n"\
    "pand %%mm5, %%mm0 \n"\
    "movq %%mm0, %%mm3 \n"\
    "packuswb %%mm3, %%mm3 \n"\
    "movd %%mm3, (%1,%0) \n"\
    :"r"(dst), "r"(top), "r"(src), "r"((x86_reg)bpp), "g"(end),\
936 "psubw %%mm5, %%mm7 \n"\
937 "pmaxsw %%mm7, %%mm5 \n"\
938 "pxor %%mm6, %%mm6 \n"\
939 "pxor %%mm7, %%mm7 \n"\
940 "psubw %%mm3, %%mm6 \n"\
941 "psubw %%mm4, %%mm7 \n"\
942 "pmaxsw %%mm6, %%mm3 \n"\
943 "pmaxsw %%mm7, %%mm4 \n"\
944 "pxor %%mm7, %%mm7 \n"
947 "pabsw %%mm3, %%mm3 \n"\
948 "pabsw %%mm4, %%mm4 \n"\
949 "pabsw %%mm5, %%mm5 \n"
#define QPEL_V_LOW(m3,m4,m5,m6, pw_20, pw_3, rnd, in0, in1, in2, in7, out, OP)\
    "paddw " #m4 ", " #m3 " \n\t" \
    "movq "MANGLE(ff_pw_20)", %%mm4 \n\t" \
    "pmullw " #m3 ", %%mm4 \n\t" \
    "movq "#in7", " #m3 " \n\t" \
    "movq "#in0", %%mm5 \n\t" \
    "paddw " #m3 ", %%mm5 \n\t" \
    "psubw %%mm5, %%mm4 \n\t" \
    "movq "#in1", %%mm5 \n\t" \
    "movq "#in2", %%mm6 \n\t" \
    "paddw " #m6 ", %%mm5 \n\t" \
    "paddw " #m5 ", %%mm6 \n\t" \
    "paddw %%mm6, %%mm6 \n\t" \
    "psubw %%mm6, %%mm5 \n\t" \
    "pmullw "MANGLE(ff_pw_3)", %%mm5 \n\t" \
    "paddw " #rnd ", %%mm4 \n\t" \
    "paddw %%mm4, %%mm5 \n\t" \
    "psraw $5, %%mm5 \n\t"\
    "packuswb %%mm5, %%mm5 \n\t"\
    OP(%%mm5, out, %%mm7, d)
#define QPEL_BASE(OPNAME, ROUNDER, RND, OP_MMX2, OP_3DNOW)\
static void OPNAME ## mpeg4_qpel16_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    "pxor %%mm7, %%mm7 \n\t"\
    "movq (%0), %%mm0 \n\t" \
    "movq %%mm0, %%mm1 \n\t" \
    "movq %%mm0, %%mm2 \n\t" \
    "punpcklbw %%mm7, %%mm0 \n\t" \
    "punpckhbw %%mm7, %%mm1 \n\t" \
    "pshufw $0x90, %%mm0, %%mm5 \n\t" \
    "pshufw $0x41, %%mm0, %%mm6 \n\t" \
    "movq %%mm2, %%mm3 \n\t" \
    "movq %%mm2, %%mm4 \n\t" \
    "psllq $8, %%mm2 \n\t" \
    "psllq $16, %%mm3 \n\t" \
    "psllq $24, %%mm4 \n\t" \
    "punpckhbw %%mm7, %%mm2 \n\t" \
    "punpckhbw %%mm7, %%mm3 \n\t" \
    "punpckhbw %%mm7, %%mm4 \n\t" \
    "paddw %%mm3, %%mm5 \n\t" \
    "paddw %%mm2, %%mm6 \n\t" \
    "paddw %%mm5, %%mm5 \n\t" \
    "psubw %%mm5, %%mm6 \n\t" \
    "pshufw $0x06, %%mm0, %%mm5 \n\t" \
    "pmullw "MANGLE(ff_pw_3)", %%mm6 \n\t" \
    "paddw %%mm4, %%mm0 \n\t" \
    "paddw %%mm1, %%mm5 \n\t" \
    "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" \
    "psubw %%mm5, %%mm0 \n\t" \
    "paddw %6, %%mm6 \n\t"\
    "paddw %%mm6, %%mm0 \n\t" \
    "psraw $5, %%mm0 \n\t"\
    "movq %%mm0, %5 \n\t"\
    "movq 5(%0), %%mm0 \n\t" \
    "movq %%mm0, %%mm5 \n\t" \
    "movq %%mm0, %%mm6 \n\t" \
    "psrlq $8, %%mm0 \n\t" \
    "psrlq $16, %%mm5 \n\t" \
    "punpcklbw %%mm7, %%mm0 \n\t" \
    "punpcklbw %%mm7, %%mm5 \n\t" \
    "paddw %%mm0, %%mm2 \n\t" \
    "paddw %%mm5, %%mm3 \n\t" \
    "paddw %%mm2, %%mm2 \n\t" \
    "psubw %%mm2, %%mm3 \n\t" \
    "movq %%mm6, %%mm2 \n\t" \
    "psrlq $24, %%mm6 \n\t" \
    "punpcklbw %%mm7, %%mm2 \n\t" \
    "punpcklbw %%mm7, %%mm6 \n\t" \
    "pmullw "MANGLE(ff_pw_3)", %%mm3 \n\t" \
    "paddw %%mm2, %%mm1 \n\t" \
    "paddw %%mm6, %%mm4 \n\t" \
    "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" \
    "psubw %%mm4, %%mm3 \n\t" \
    "paddw %6, %%mm1 \n\t"\
    "paddw %%mm1, %%mm3 \n\t" \
    "psraw $5, %%mm3 \n\t"\
    "movq %5, %%mm1 \n\t"\
    "packuswb %%mm3, %%mm1 \n\t"\
    OP_MMX2(%%mm1, (%1), %%mm4, q)\
    "movq 9(%0), %%mm1 \n\t" \
    "movq %%mm1, %%mm4 \n\t" \
    "movq %%mm1, %%mm3 \n\t" \
    "psrlq $8, %%mm1 \n\t" \
    "psrlq $16, %%mm4 \n\t" \
    "punpcklbw %%mm7, %%mm1 \n\t" \
    "punpcklbw %%mm7, %%mm4 \n\t" \
    "paddw %%mm1, %%mm5 \n\t" \
    "paddw %%mm4, %%mm0 \n\t" \
    "paddw %%mm5, %%mm5 \n\t" \
    "psubw %%mm5, %%mm0 \n\t" \
    "movq %%mm3, %%mm5 \n\t" \
    "psrlq $24, %%mm3 \n\t" \
    "pmullw "MANGLE(ff_pw_3)", %%mm0 \n\t" \
    "punpcklbw %%mm7, %%mm3 \n\t" \
    "paddw %%mm3, %%mm2 \n\t" \
    "psubw %%mm2, %%mm0 \n\t" \
    "movq %%mm5, %%mm2 \n\t" \
    "punpcklbw %%mm7, %%mm2 \n\t" \
    "punpckhbw %%mm7, %%mm5 \n\t" \
    "paddw %%mm2, %%mm6 \n\t" \
    "pmullw "MANGLE(ff_pw_20)", %%mm6 \n\t" \
    "paddw %6, %%mm0 \n\t"\
    "paddw %%mm6, %%mm0 \n\t" \
    "psraw $5, %%mm0 \n\t"\
    "paddw %%mm5, %%mm3 \n\t" \
    "pshufw $0xF9, %%mm5, %%mm6 \n\t" \
    "paddw %%mm4, %%mm6 \n\t" \
    "pshufw $0xBE, %%mm5, %%mm4 \n\t" \
    "pshufw $0x6F, %%mm5, %%mm5 \n\t" \
    "paddw %%mm1, %%mm4 \n\t" \
    "paddw %%mm2, %%mm5 \n\t" \
    "paddw %%mm6, %%mm6 \n\t" \
    "psubw %%mm6, %%mm4 \n\t" \
    "pmullw "MANGLE(ff_pw_20)", %%mm3 \n\t" \
    "pmullw "MANGLE(ff_pw_3)", %%mm4 \n\t" \
    "psubw %%mm5, %%mm3 \n\t" \
    "paddw %6, %%mm4 \n\t"\
    "paddw %%mm3, %%mm4 \n\t" \
    "psraw $5, %%mm4 \n\t"\
    "packuswb %%mm4, %%mm0 \n\t"\
    OP_MMX2(%%mm0, 8(%1), %%mm4, q)\
    : "+a"(src), "+c"(dst), "+D"(h)\
    : "d"((x86_reg)srcStride), "S"((x86_reg)dstStride), "m"(temp), "m"(ROUNDER)\
static void OPNAME ## mpeg4_qpel16_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]);\
    temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\
    temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\
    temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]);\
    temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\
    temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]);\
    temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]);\
    temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]);\
    temp[ 8]= (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]);\
    temp[ 9]= (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]);\
    temp[10]= (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]);\
    temp[11]= (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]);\
    temp[12]= (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]);\
    temp[13]= (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]);\
    temp[14]= (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]);\
    temp[15]= (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]);\
    "movq (%0), %%mm0 \n\t"\
    "movq 8(%0), %%mm1 \n\t"\
    "paddw %2, %%mm0 \n\t"\
    "paddw %2, %%mm1 \n\t"\
    "psraw $5, %%mm0 \n\t"\
    "psraw $5, %%mm1 \n\t"\
    "packuswb %%mm1, %%mm0 \n\t"\
    OP_3DNOW(%%mm0, (%1), %%mm1, q)\
    "movq 16(%0), %%mm0 \n\t"\
    "movq 24(%0), %%mm1 \n\t"\
    "paddw %2, %%mm0 \n\t"\
    "paddw %2, %%mm1 \n\t"\
    "psraw $5, %%mm0 \n\t"\
    "psraw $5, %%mm1 \n\t"\
    "packuswb %%mm1, %%mm0 \n\t"\
    OP_3DNOW(%%mm0, 8(%1), %%mm1, q)\
    :: "r"(temp), "r"(dst), "m"(ROUNDER)\
static void OPNAME ## mpeg4_qpel8_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    "pxor %%mm7, %%mm7 \n\t"\
    "movq (%0), %%mm0 \n\t" \
    "movq %%mm0, %%mm1 \n\t" \
    "movq %%mm0, %%mm2 \n\t" \
    "punpcklbw %%mm7, %%mm0 \n\t" \
    "punpckhbw %%mm7, %%mm1 \n\t" \
    "pshufw $0x90, %%mm0, %%mm5 \n\t" \
    "pshufw $0x41, %%mm0, %%mm6 \n\t" \
    "movq %%mm2, %%mm3 \n\t" \
    "movq %%mm2, %%mm4 \n\t" \
    "psllq $8, %%mm2 \n\t" \
    "psllq $16, %%mm3 \n\t" \
    "psllq $24, %%mm4 \n\t" \
    "punpckhbw %%mm7, %%mm2 \n\t" \
    "punpckhbw %%mm7, %%mm3 \n\t" \
    "punpckhbw %%mm7, %%mm4 \n\t" \
    "paddw %%mm3, %%mm5 \n\t" \
    "paddw %%mm2, %%mm6 \n\t" \
    "paddw %%mm5, %%mm5 \n\t" \
    "psubw %%mm5, %%mm6 \n\t" \
    "pshufw $0x06, %%mm0, %%mm5 \n\t" \
    "pmullw "MANGLE(ff_pw_3)", %%mm6 \n\t" \
    "paddw %%mm4, %%mm0 \n\t" \
    "paddw %%mm1, %%mm5 \n\t" \
    "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" \
    "psubw %%mm5, %%mm0 \n\t" \
    "paddw %5, %%mm6 \n\t"\
    "paddw %%mm6, %%mm0 \n\t" \
    "psraw $5, %%mm0 \n\t"\
    "movd 5(%0), %%mm5 \n\t" \
    "punpcklbw %%mm7, %%mm5 \n\t" \
    "pshufw $0xF9, %%mm5, %%mm6 \n\t" \
    "paddw %%mm5, %%mm1 \n\t" \
    "paddw %%mm6, %%mm2 \n\t" \
    "pshufw $0xBE, %%mm5, %%mm6 \n\t" \
    "pshufw $0x6F, %%mm5, %%mm5 \n\t" \
    "paddw %%mm6, %%mm3 \n\t" \
    "paddw %%mm5, %%mm4 \n\t" \
    "paddw %%mm2, %%mm2 \n\t" \
    "psubw %%mm2, %%mm3 \n\t" \
    "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" \
    "pmullw "MANGLE(ff_pw_3)", %%mm3 \n\t" \
    "psubw %%mm4, %%mm3 \n\t" \
    "paddw %5, %%mm1 \n\t"\
    "paddw %%mm1, %%mm3 \n\t" \
    "psraw $5, %%mm3 \n\t"\
    "packuswb %%mm3, %%mm0 \n\t"\
    OP_MMX2(%%mm0, (%1), %%mm4, q)\
    : "+a"(src), "+c"(dst), "+d"(h)\
    : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "m"(ROUNDER)\
static void OPNAME ## mpeg4_qpel8_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]);\
    temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\
    temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\
    temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]);\
    temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\
    temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 8]);\
    temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 8])*3 - (src[ 3]+src[ 7]);\
    temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 8])*6 + (src[ 5]+src[ 7])*3 - (src[ 4]+src[ 6]);\
    "movq (%0), %%mm0 \n\t"\
    "movq 8(%0), %%mm1 \n\t"\
    "paddw %2, %%mm0 \n\t"\
    "paddw %2, %%mm1 \n\t"\
    "psraw $5, %%mm0 \n\t"\
    "psraw $5, %%mm1 \n\t"\
    "packuswb %%mm1, %%mm0 \n\t"\
    OP_3DNOW(%%mm0, (%1), %%mm1, q)\
    :: "r"(temp), "r"(dst), "m"(ROUNDER)\
#define QPEL_OP(OPNAME, ROUNDER, RND, OP, MMX)\
static void OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    uint64_t temp[17*4];\
    uint64_t *temp_ptr= temp;\
    "pxor %%mm7, %%mm7 \n\t"\
    "movq (%0), %%mm0 \n\t"\
    "movq (%0), %%mm1 \n\t"\
    "movq 8(%0), %%mm2 \n\t"\
    "movq 8(%0), %%mm3 \n\t"\
    "punpcklbw %%mm7, %%mm0 \n\t"\
    "punpckhbw %%mm7, %%mm1 \n\t"\
    "punpcklbw %%mm7, %%mm2 \n\t"\
    "punpckhbw %%mm7, %%mm3 \n\t"\
    "movq %%mm0, (%1) \n\t"\
    "movq %%mm1, 17*8(%1) \n\t"\
    "movq %%mm2, 2*17*8(%1) \n\t"\
    "movq %%mm3, 3*17*8(%1) \n\t"\
    : "+r" (src), "+r" (temp_ptr), "+r"(count)\
    : "r" ((x86_reg)srcStride)\
    "movq (%0), %%mm0 \n\t"\
    "movq 8(%0), %%mm1 \n\t"\
    "movq 16(%0), %%mm2 \n\t"\
    "movq 24(%0), %%mm3 \n\t"\
    QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0),  8(%0),   (%0), 32(%0), (%1), OP)\
    QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5,  8(%0),   (%0),   (%0), 40(%0), (%1, %3), OP)\
    QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5,   (%0),   (%0),  8(%0), 48(%0), (%1), OP)\
    QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5,   (%0),  8(%0), 16(%0), 56(%0), (%1, %3), OP)\
    QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5,  8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\
    QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 72(%0), (%1, %3), OP)\
    QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 80(%0), (%1), OP)\
    QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 88(%0), (%1, %3), OP)\
    QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 40(%0), 48(%0), 56(%0), 96(%0), (%1), OP)\
    QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 48(%0), 56(%0), 64(%0),104(%0), (%1, %3), OP)\
    QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 56(%0), 64(%0), 72(%0),112(%0), (%1), OP)\
    QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 64(%0), 72(%0), 80(%0),120(%0), (%1, %3), OP)\
    QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 72(%0), 80(%0), 88(%0),128(%0), (%1), OP)\
    QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 80(%0), 88(%0), 96(%0),128(%0), (%1, %3), OP)\
    QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 88(%0), 96(%0),104(%0),120(%0), (%1), OP)\
    QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 96(%0),104(%0),112(%0),112(%0), (%1, %3), OP)\
    "add $136, %0 \n\t"\
    : "+r"(temp_ptr), "+r"(dst), "+g"(count)\
    : "r"((x86_reg)dstStride), "r"(2*(x86_reg)dstStride), "m"(ROUNDER), "g"(4-14*(x86_reg)dstStride)\
static void OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    uint64_t temp[9*2];\
    uint64_t *temp_ptr= temp;\
    "pxor %%mm7, %%mm7 \n\t"\
    "movq (%0), %%mm0 \n\t"\
    "movq (%0), %%mm1 \n\t"\
    "punpcklbw %%mm7, %%mm0 \n\t"\
    "punpckhbw %%mm7, %%mm1 \n\t"\
    "movq %%mm0, (%1) \n\t"\
    "movq %%mm1, 9*8(%1) \n\t"\
    : "+r" (src), "+r" (temp_ptr), "+r"(count)\
    : "r" ((x86_reg)srcStride)\
    "movq (%0), %%mm0 \n\t"\
    "movq 8(%0), %%mm1 \n\t"\
    "movq 16(%0), %%mm2 \n\t"\
    "movq 24(%0), %%mm3 \n\t"\
    QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0),  8(%0),   (%0), 32(%0), (%1), OP)\
    QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5,  8(%0),   (%0),   (%0), 40(%0), (%1, %3), OP)\
    QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5,   (%0),   (%0),  8(%0), 48(%0), (%1), OP)\
    QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5,   (%0),  8(%0), 16(%0), 56(%0), (%1, %3), OP)\
    QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5,  8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\
    QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 64(%0), (%1, %3), OP)\
    QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 56(%0), (%1), OP)\
    QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 48(%0), (%1, %3), OP)\
    : "+r"(temp_ptr), "+r"(dst), "+g"(count)\
    : "r"((x86_reg)dstStride), "r"(2*(x86_reg)dstStride), "m"(ROUNDER), "g"(4-6*(x86_reg)dstStride)\
static void OPNAME ## qpel8_mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels8_ ## MMX(dst, src, stride, 8);\

static void OPNAME ## qpel8_mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t * const half= (uint8_t*)temp;\
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\
    OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);\

static void OPNAME ## qpel8_mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## mpeg4_qpel8_h_lowpass_ ## MMX(dst, src, stride, stride, 8);\

static void OPNAME ## qpel8_mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t * const half= (uint8_t*)temp;\
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\
    OPNAME ## pixels8_l2_ ## MMX(dst, src+1, half, stride, stride, 8);\

static void OPNAME ## qpel8_mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t * const half= (uint8_t*)temp;\
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\
    OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);\

static void OPNAME ## qpel8_mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, src, stride, stride);\

static void OPNAME ## qpel8_mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t * const half= (uint8_t*)temp;\
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\
    OPNAME ## pixels8_l2_ ## MMX(dst, src+stride, half, stride, stride, 8);\

static void OPNAME ## qpel8_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[8 + 9];\
    uint8_t * const halfH= ((uint8_t*)half) + 64;\
    uint8_t * const halfHV= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
    put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\

static void OPNAME ## qpel8_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[8 + 9];\
    uint8_t * const halfH= ((uint8_t*)half) + 64;\
    uint8_t * const halfHV= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
    put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\

static void OPNAME ## qpel8_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[8 + 9];\
    uint8_t * const halfH= ((uint8_t*)half) + 64;\
    uint8_t * const halfHV= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
    put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\

static void OPNAME ## qpel8_mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[8 + 9];\
    uint8_t * const halfH= ((uint8_t*)half) + 64;\
    uint8_t * const halfHV= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
    put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
static void OPNAME ## qpel8_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[8 + 9];\
    uint8_t * const halfH= ((uint8_t*)half) + 64;\
    uint8_t * const halfHV= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\

static void OPNAME ## qpel8_mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[8 + 9];\
    uint8_t * const halfH= ((uint8_t*)half) + 64;\
    uint8_t * const halfHV= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\

static void OPNAME ## qpel8_mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[8 + 9];\
    uint8_t * const halfH= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
    put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\

static void OPNAME ## qpel8_mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[8 + 9];\
    uint8_t * const halfH= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
    put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\

static void OPNAME ## qpel8_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t * const halfH= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\

static void OPNAME ## qpel16_mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels16_ ## MMX(dst, src, stride, 16);\

static void OPNAME ## qpel16_mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t * const half= (uint8_t*)temp;\
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\
    OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);\

static void OPNAME ## qpel16_mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## mpeg4_qpel16_h_lowpass_ ## MMX(dst, src, stride, stride, 16);\

static void OPNAME ## qpel16_mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t * const half= (uint8_t*)temp;\
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\
    OPNAME ## pixels16_l2_ ## MMX(dst, src+1, half, stride, stride, 16);\

static void OPNAME ## qpel16_mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t * const half= (uint8_t*)temp;\
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\
    OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);\

static void OPNAME ## qpel16_mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, src, stride, stride);\

static void OPNAME ## qpel16_mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t * const half= (uint8_t*)temp;\
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\
    OPNAME ## pixels16_l2_ ## MMX(dst, src+stride, half, stride, stride, 16);\
static void OPNAME ## qpel16_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[16*2 + 17*2];\
    uint8_t * const halfH= ((uint8_t*)half) + 256;\
    uint8_t * const halfHV= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
    put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\

static void OPNAME ## qpel16_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[16*2 + 17*2];\
    uint8_t * const halfH= ((uint8_t*)half) + 256;\
    uint8_t * const halfHV= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
    put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\

static void OPNAME ## qpel16_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[16*2 + 17*2];\
    uint8_t * const halfH= ((uint8_t*)half) + 256;\
    uint8_t * const halfHV= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
    put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\

static void OPNAME ## qpel16_mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[16*2 + 17*2];\
    uint8_t * const halfH= ((uint8_t*)half) + 256;\
    uint8_t * const halfHV= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
    put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\

static void OPNAME ## qpel16_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[16*2 + 17*2];\
    uint8_t * const halfH= ((uint8_t*)half) + 256;\
    uint8_t * const halfHV= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\

static void OPNAME ## qpel16_mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[16*2 + 17*2];\
    uint8_t * const halfH= ((uint8_t*)half) + 256;\
    uint8_t * const halfHV= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\

static void OPNAME ## qpel16_mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[17*2];\
    uint8_t * const halfH= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
    put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\

static void OPNAME ## qpel16_mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[17*2];\
    uint8_t * const halfH= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
    put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\

static void OPNAME ## qpel16_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[17*2];\
    uint8_t * const halfH= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
#define PUT_OP(a,b,temp, size) "mov" #size " " #a ", " #b " \n\t"
#define AVG_3DNOW_OP(a,b,temp, size) \
    "mov" #size " " #b ", " #temp " \n\t"\
    "pavgusb " #temp ", " #a " \n\t"\
    "mov" #size " " #a ", " #b " \n\t"
#define AVG_MMX2_OP(a,b,temp, size) \
    "mov" #size " " #b ", " #temp " \n\t"\
    "pavgb " #temp ", " #a " \n\t"\
    "mov" #size " " #a ", " #b " \n\t"
QPEL_OP(put_       , ff_pw_16, _       , PUT_OP,      3dnow)
QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP,      3dnow)
QPEL_OP(put_       , ff_pw_16, _       , PUT_OP,      mmx2)
QPEL_OP(avg_       , ff_pw_16, _       , AVG_MMX2_OP, mmx2)
QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP,      mmx2)
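
/* 2-tap "qpel": approximates the quarter-pel positions with half-pel
 * averaging; the L3 variants blend three source taps (offsets S0,
 * S0+S1, S0+S2), and several positions simply alias the mc00/mc20/mc02
 * half-pel cases. */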
#define QPEL_2TAP_XY(OPNAME, SIZE, MMX, XY, HPEL)\
static void OPNAME ## 2tap_qpel ## SIZE ## _mc ## XY ## _ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels ## SIZE ## HPEL(dst, src, stride, SIZE);\

#define QPEL_2TAP_L3(OPNAME, SIZE, MMX, XY, S0, S1, S2)\
static void OPNAME ## 2tap_qpel ## SIZE ## _mc ## XY ## _ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## 2tap_qpel ## SIZE ## _l3_ ## MMX(dst, src+S0, stride, SIZE, S1, S2);\

#define QPEL_2TAP(OPNAME, SIZE, MMX)\
QPEL_2TAP_XY(OPNAME, SIZE, MMX, 20, _x2_ ## MMX)\
QPEL_2TAP_XY(OPNAME, SIZE, MMX, 02, _y2_ ## MMX)\
QPEL_2TAP_XY(OPNAME, SIZE, MMX, 22, _xy2_mmx)\
static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc00_ ## MMX =\
    OPNAME ## qpel ## SIZE ## _mc00_ ## MMX;\
static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc21_ ## MMX =\
    OPNAME ## 2tap_qpel ## SIZE ## _mc20_ ## MMX;\
static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc12_ ## MMX =\
    OPNAME ## 2tap_qpel ## SIZE ## _mc02_ ## MMX;\
static void OPNAME ## 2tap_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels ## SIZE ## _y2_ ## MMX(dst, src+1, stride, SIZE);\

static void OPNAME ## 2tap_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels ## SIZE ## _x2_ ## MMX(dst, src+stride, stride, SIZE);\

QPEL_2TAP_L3(OPNAME, SIZE, MMX, 10, 0, 1, 0)\
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 30, 1, -1, 0)\
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 01, 0, stride, 0)\
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 03, stride, -stride, 0)\
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 11, 0, stride, 1)\
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 31, 1, stride, -1)\
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 13, stride, -stride, 1)\
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 33, stride+1, -stride, -1)\
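
/* Edge emulation: the external assembly cores replicate border pixels
 * so that motion compensation may read outside the picture; the C
 * wrapper below clamps the source coordinates and hands off to a core. */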
typedef void emu_edge_core_func (uint8_t *buf, const uint8_t *src,
                                 x86_reg linesize, x86_reg start_y,
                                 x86_reg end_y, x86_reg block_h,
                                 x86_reg start_x, x86_reg end_x,
                                 x86_reg block_w);

extern emu_edge_core_func ff_emu_edge_core_mmx;
extern emu_edge_core_func ff_emu_edge_core_sse;
void emulated_edge_mc(uint8_t *buf, const uint8_t *src, int linesize,
                      int block_w, int block_h, int src_x, int src_y,
                      int w, int h, emu_edge_core_func *core_fn)
    int start_y, start_x, end_y, end_x, src_y_add = 0;
        src_y_add = h - 1 - src_y;
    } else if (src_y <= -block_h) {
        src_y_add = 1 - block_h - src_y;
    } else if (src_x <= -block_w) {
        src += (1 - block_w - src_x);
    start_y = FFMAX(0, -src_y);
    start_x = FFMAX(0, -src_x);
    end_y = FFMIN(block_h, h - src_y);
    end_x = FFMIN(block_w, w - src_x);
    assert(start_x < end_x && block_w > 0);
    assert(start_y < end_y && block_h > 0);
    src += (src_y_add + start_y) * linesize + start_x;
    core_fn(buf, src, linesize, start_y, end_y, block_h, start_x, end_x, block_w);
void emulated_edge_mc_mmx(uint8_t *buf, const uint8_t *src, int linesize,
                          int block_w, int block_h, int src_x, int src_y,
                          int w, int h)
    emulated_edge_mc(buf, src, linesize, block_w, block_h, src_x, src_y,
                     w, h, &ff_emu_edge_core_mmx);
void emulated_edge_mc_sse(uint8_t *buf, const uint8_t *src, int linesize,
                          int block_w, int block_h, int src_x, int src_y,
                          int w, int h)
    emulated_edge_mc(buf, src, linesize, block_w, block_h, src_x, src_y,
                     w, h, &ff_emu_edge_core_sse);
                                    int linesize, int block_w, int block_h,
                                    int src_x, int src_y, int w, int h);
void gmc(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
         int dxx, int dxy, int dyx, int dyy, int shift, int r,
         int width, int height, emulated_edge_mc_func *emu_edge_fn)
    const int ix = ox >> (16+shift);
    const int iy = oy >> (16+shift);
    const int oxs = ox >> 4;
    const int oys = oy >> 4;
    const int dxxs = dxx >> 4;
    const int dxys = dxy >> 4;
    const int dyxs = dyx >> 4;
    const int dyys = dyy >> 4;
    const uint16_t r4[4] = {r, r, r, r};
    const uint16_t dxy4[4] = {dxys, dxys, dxys, dxys};
    const uint16_t dyy4[4] = {dyys, dyys, dyys, dyys};
    const uint64_t shift2 = 2*shift;
    uint8_t edge_buf[(h+1)*stride];

    const int dxw = (dxx - (1<<(16+shift))) * (w-1);
    const int dyh = (dyy - (1<<(16+shift))) * (h-1);
    const int dxh = dxy * (h-1);
    const int dyw = dyx * (w-1);
        ((ox^(ox+dxw)) | (ox^(ox+dxh)) | (ox^(ox+dxw+dxh)) |
         (oy^(oy+dyw)) | (oy^(oy+dyh)) | (oy^(oy+dyw+dyh))) >> (16+shift)
        || (dxx|dxy|dyx|dyy)&15 )
        ff_gmc_c(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r, width, height);
    if( (unsigned)ix >= width-w ||
        (unsigned)iy >= height-h )
        emu_edge_fn(edge_buf, src, stride, w+1, h+1, ix, iy, width, height);
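
/* %%mm6 gets the broadcast interpolation scale; each pixel below is a
 * bilinear blend of a 2x2 source neighborhood with the four weights
 * (s-dx)*(s-dy), dx*(s-dy), (s-dx)*dy and dx*dy, where the per-pixel
 * fractions dx/dy are kept in %%mm4/%%mm5. */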
1784 "movd %0, %%mm6 \n\t"
1785 "pxor %%mm7, %%mm7 \n\t"
1786 "punpcklwd %%mm6, %%mm6 \n\t"
1787 "punpcklwd %%mm6, %%mm6 \n\t"
1791 for(x=0; x<w; x+=4){
1792 uint16_t dx4[4] = { oxs - dxys + dxxs*(x+0),
1793 oxs - dxys + dxxs*(x+1),
1794 oxs - dxys + dxxs*(x+2),
1795 oxs - dxys + dxxs*(x+3) };
1796 uint16_t dy4[4] = { oys - dyys + dyxs*(x+0),
1797 oys - dyys + dyxs*(x+1),
1798 oys - dyys + dyxs*(x+2),
1799 oys - dyys + dyxs*(x+3) };
1803 "movq %0, %%mm4 \n\t"
1804 "movq %1, %%mm5 \n\t"
1805 "paddw %2, %%mm4 \n\t"
1806 "paddw %3, %%mm5 \n\t"
1807 "movq %%mm4, %0 \n\t"
1808 "movq %%mm5, %1 \n\t"
1809 "psrlw $12, %%mm4 \n\t"
1810 "psrlw $12, %%mm5 \n\t"
1811 :
"+m"(*dx4),
"+m"(*dy4)
1812 :
"m"(*dxy4),
"m"(*dyy4)
1816 "movq %%mm6, %%mm2 \n\t"
1817 "movq %%mm6, %%mm1 \n\t"
1818 "psubw %%mm4, %%mm2 \n\t"
1819 "psubw %%mm5, %%mm1 \n\t"
1820 "movq %%mm2, %%mm0 \n\t"
1821 "movq %%mm4, %%mm3 \n\t"
1822 "pmullw %%mm1, %%mm0 \n\t"
1823 "pmullw %%mm5, %%mm3 \n\t"
1824 "pmullw %%mm5, %%mm2 \n\t"
1825 "pmullw %%mm4, %%mm1 \n\t"
1827 "movd %4, %%mm5 \n\t"
1828 "movd %3, %%mm4 \n\t"
1829 "punpcklbw %%mm7, %%mm5 \n\t"
1830 "punpcklbw %%mm7, %%mm4 \n\t"
1831 "pmullw %%mm5, %%mm3 \n\t"
1832 "pmullw %%mm4, %%mm2 \n\t"
1834 "movd %2, %%mm5 \n\t"
1835 "movd %1, %%mm4 \n\t"
1836 "punpcklbw %%mm7, %%mm5 \n\t"
1837 "punpcklbw %%mm7, %%mm4 \n\t"
1838 "pmullw %%mm5, %%mm1 \n\t"
1839 "pmullw %%mm4, %%mm0 \n\t"
1840 "paddw %5, %%mm1 \n\t"
1841 "paddw %%mm3, %%mm2 \n\t"
1842 "paddw %%mm1, %%mm0 \n\t"
1843 "paddw %%mm2, %%mm0 \n\t"
1845 "psrlw %6, %%mm0 \n\t"
1846 "packuswb %%mm0, %%mm0 \n\t"
1847 "movd %%mm0, %0 \n\t"
1850 :
"m"(src[0]),
"m"(src[1]),
1851 "m"(src[stride]),
"m"(src[stride+1]),
static void gmc_mmx(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
                    int dxx, int dxy, int dyx, int dyy, int shift, int r,
                    int width, int height)
    gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
        width, height, &emulated_edge_mc_mmx);
static void gmc_sse(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
                    int dxx, int dxy, int dyx, int dyy, int shift, int r,
                    int width, int height)
    gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
        width, height, &emulated_edge_mc_sse);
static void gmc_mmx(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
                    int dxx, int dxy, int dyx, int dyy, int shift, int r,
                    int width, int height)
    gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
        width, height, &ff_emulated_edge_mc_8);
#define PREFETCH(name, op) \
static void name(void *mem, int stride, int h){\
    const uint8_t *p= mem;\
    do{\
        __asm__ volatile(#op" %0" :: "m"(*p));\
        p+= stride;\
    }while(--h);\
}
PREFETCH(prefetch_mmx2, prefetcht0)
                                  int stride, int h, int x, int y);
                                  int stride, int h, int x, int y);
                                  int stride, int h, int x, int y);
                                  int stride, int h, int x, int y);
                                  int stride, int h, int x, int y);
                                  int stride, int h, int x, int y);
                                  int stride, int h, int x, int y);
                                  int stride, int h, int x, int y);
                                  int stride, int h, int x, int y);
                                  int stride, int h, int x, int y);
                                  int stride, int h, int x, int y);
                                  int stride, int h, int x, int y);

#define CHROMA_MC(OP, NUM, DEPTH, OPT) \
void ff_ ## OP ## _h264_chroma_mc ## NUM ## _ ## DEPTH ## _ ## OPT \
                                      (uint8_t *dst, uint8_t *src,\
                                       int stride, int h, int x, int y);
    avg_pixels8_mmx(dst, src, stride, 8);
    avg_pixels16_mmx(dst, src, stride, 16);
    avg_pixels8_mmx2(dst, src, stride, 8);
static void ff_libmpeg2mmx_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
static void ff_libmpeg2mmx_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
static void ff_libmpeg2mmx2_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
static void ff_libmpeg2mmx2_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
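
/* Branchless Vorbis inverse coupling: pfcmpge (cmpleps in the SSE
 * version below) builds sign masks that select between the mag+ang,
 * mag-ang and swapped assignments without branching per sample pair. */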
    __asm__ volatile("pxor %%mm7, %%mm7":);
    for(i=0; i<blocksize; i+=2) {
        "movq %0, %%mm0 \n\t"
        "movq %1, %%mm1 \n\t"
        "movq %%mm0, %%mm2 \n\t"
        "movq %%mm1, %%mm3 \n\t"
        "pfcmpge %%mm7, %%mm2 \n\t"
        "pfcmpge %%mm7, %%mm3 \n\t"
        "pslld $31, %%mm2 \n\t"
        "pxor %%mm2, %%mm1 \n\t"
        "movq %%mm3, %%mm4 \n\t"
        "pand %%mm1, %%mm3 \n\t"
        "pandn %%mm1, %%mm4 \n\t"
        "pfadd %%mm0, %%mm3 \n\t"
        "pfsub %%mm4, %%mm0 \n\t"
        "movq %%mm3, %1 \n\t"
        "movq %%mm0, %0 \n\t"
        : "+m"(mag[i]), "+m"(ang[i])
    __asm__ volatile("femms");
2040 "movaps %0, %%xmm5 \n\t"
2043 for(i=0; i<blocksize; i+=4) {
2045 "movaps %0, %%xmm0 \n\t"
2046 "movaps %1, %%xmm1 \n\t"
2047 "xorps %%xmm2, %%xmm2 \n\t"
2048 "xorps %%xmm3, %%xmm3 \n\t"
2049 "cmpleps %%xmm0, %%xmm2 \n\t"
2050 "cmpleps %%xmm1, %%xmm3 \n\t"
2051 "andps %%xmm5, %%xmm2 \n\t"
2052 "xorps %%xmm2, %%xmm1 \n\t"
2053 "movaps %%xmm3, %%xmm4 \n\t"
2054 "andps %%xmm1, %%xmm3 \n\t"
2055 "andnps %%xmm1, %%xmm4 \n\t"
2056 "addps %%xmm0, %%xmm3 \n\t"
2057 "subps %%xmm4, %%xmm0 \n\t"
2058 "movaps %%xmm3, %1 \n\t"
2059 "movaps %%xmm0, %0 \n\t"
2060 :
"+m"(mag[i]),
"+m"(ang[i])
#define MIX5(mono,stereo)\
    "movss 0(%2), %%xmm5 \n"\
    "movss 8(%2), %%xmm6 \n"\
    "movss 24(%2), %%xmm7 \n"\
    "shufps $0, %%xmm5, %%xmm5 \n"\
    "shufps $0, %%xmm6, %%xmm6 \n"\
    "shufps $0, %%xmm7, %%xmm7 \n"\
    "movaps (%0,%1), %%xmm0 \n"\
    "movaps 0x400(%0,%1), %%xmm1 \n"\
    "movaps 0x800(%0,%1), %%xmm2 \n"\
    "movaps 0xc00(%0,%1), %%xmm3 \n"\
    "movaps 0x1000(%0,%1), %%xmm4 \n"\
    "mulps %%xmm5, %%xmm0 \n"\
    "mulps %%xmm6, %%xmm1 \n"\
    "mulps %%xmm5, %%xmm2 \n"\
    "mulps %%xmm7, %%xmm3 \n"\
    "mulps %%xmm7, %%xmm4 \n"\
    stereo("addps %%xmm1, %%xmm0 \n")\
    "addps %%xmm1, %%xmm2 \n"\
    "addps %%xmm3, %%xmm0 \n"\
    "addps %%xmm4, %%xmm2 \n"\
    mono("addps %%xmm2, %%xmm0 \n")\
    "movaps %%xmm0, (%0,%1) \n"\
    stereo("movaps %%xmm2, 0x400(%0,%1) \n")\
    :"r"(samples[0]+len), "r"(matrix)\
    :XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3", \
                  "%xmm4", "%xmm5", "%xmm6", "%xmm7",)\
#define MIX_MISC(stereo)\
    "movaps (%3,%0), %%xmm0 \n"\
    stereo("movaps %%xmm0, %%xmm1 \n")\
    "mulps %%xmm4, %%xmm0 \n"\
    stereo("mulps %%xmm5, %%xmm1 \n")\
    "lea 1024(%3,%0), %1 \n"\
    "movaps (%1), %%xmm2 \n"\
    stereo("movaps %%xmm2, %%xmm3 \n")\
    "mulps (%4,%2), %%xmm2 \n"\
    stereo("mulps 16(%4,%2), %%xmm3 \n")\
    "addps %%xmm2, %%xmm0 \n"\
    stereo("addps %%xmm3, %%xmm1 \n")\
    "movaps %%xmm0, (%3,%0) \n"\
    stereo("movaps %%xmm1, 1024(%3,%0) \n")\
    :"+&r"(i), "=&r"(j), "=&r"(k)\
    :"r"(samples[0]+len), "r"(matrix_simd+in_ch), "g"((intptr_t)-32*(in_ch-1))\
    int (*matrix_cmp)[2] = (int(*)[2])matrix;
    i = -len*sizeof(float);
    if (in_ch == 5 && out_ch == 2 &&
        !(matrix_cmp[0][1] | matrix_cmp[2][0] |
          matrix_cmp[3][1] | matrix_cmp[4][0] |
          (matrix_cmp[1][0] ^ matrix_cmp[1][1]) |
          (matrix_cmp[0][0] ^ matrix_cmp[2][1]))) {
    } else if (in_ch == 5 && out_ch == 1 &&
               matrix_cmp[0][0] == matrix_cmp[2][0] &&
               matrix_cmp[3][0] == matrix_cmp[4][0]) {
        j = 2*in_ch*sizeof(float);
        "movss (%2,%0), %%xmm4 \n"
        "movss 4(%2,%0), %%xmm5 \n"
        "shufps $0, %%xmm4, %%xmm4 \n"
        "shufps $0, %%xmm5, %%xmm5 \n"
        "movaps %%xmm4, (%1,%0,4) \n"
        "movaps %%xmm5, 16(%1,%0,4) \n"
        : "r"(matrix_simd), "r"(matrix)
2171 "movq (%2,%0), %%mm0 \n\t"
2172 "movq 8(%2,%0), %%mm1 \n\t"
2173 "pfmul (%3,%0), %%mm0 \n\t"
2174 "pfmul 8(%3,%0), %%mm1 \n\t"
2175 "movq %%mm0, (%1,%0) \n\t"
2176 "movq %%mm1, 8(%1,%0) \n\t"
2181 :
"r"(dst),
"r"(src0),
"r"(src1)
2189 "movaps (%2,%0), %%xmm0 \n\t"
2190 "movaps 16(%2,%0), %%xmm1 \n\t"
2191 "mulps (%3,%0), %%xmm0 \n\t"
2192 "mulps 16(%3,%0), %%xmm1 \n\t"
2193 "movaps %%xmm0, (%1,%0) \n\t"
2194 "movaps %%xmm1, 16(%1,%0) \n\t"
2198 :
"r"(dst),
"r"(src0),
"r"(src1)
2207 "pswapd 8(%1), %%mm0 \n\t"
2208 "pswapd (%1), %%mm1 \n\t"
2209 "pfmul (%3,%0), %%mm0 \n\t"
2210 "pfmul 8(%3,%0), %%mm1 \n\t"
2211 "movq %%mm0, (%2,%0) \n\t"
2212 "movq %%mm1, 8(%2,%0) \n\t"
2216 :
"+r"(i),
"+r"(src1)
2217 :
"r"(dst),
"r"(src0)
2219 __asm__
volatile(
"femms");
2225 "movaps 16(%1), %%xmm0 \n\t"
2226 "movaps (%1), %%xmm1 \n\t"
2227 "shufps $0x1b, %%xmm0, %%xmm0 \n\t"
2228 "shufps $0x1b, %%xmm1, %%xmm1 \n\t"
2229 "mulps (%3,%0), %%xmm0 \n\t"
2230 "mulps 16(%3,%0), %%xmm1 \n\t"
2231 "movaps %%xmm0, (%2,%0) \n\t"
2232 "movaps %%xmm1, 16(%2,%0) \n\t"
2236 :
"+r"(i),
"+r"(src1)
2237 :
"r"(dst),
"r"(src0)
                                  const float *src2, int len){
    "movq (%2,%0), %%mm0 \n\t"
    "movq 8(%2,%0), %%mm1 \n\t"
    "pfmul (%3,%0), %%mm0 \n\t"
    "pfmul 8(%3,%0), %%mm1 \n\t"
    "pfadd (%4,%0), %%mm0 \n\t"
    "pfadd 8(%4,%0), %%mm1 \n\t"
    "movq %%mm0, (%1,%0) \n\t"
    "movq %%mm1, 8(%1,%0) \n\t"
    : "r"(dst), "r"(src0), "r"(src1), "r"(src2)
    __asm__ volatile("femms");
                                const float *src2, int len){
    "movaps (%2,%0), %%xmm0 \n\t"
    "movaps 16(%2,%0), %%xmm1 \n\t"
    "mulps (%3,%0), %%xmm0 \n\t"
    "mulps 16(%3,%0), %%xmm1 \n\t"
    "addps (%4,%0), %%xmm0 \n\t"
    "addps 16(%4,%0), %%xmm1 \n\t"
    "movaps %%xmm0, (%1,%0) \n\t"
    "movaps %%xmm1, 16(%1,%0) \n\t"
    : "r"(dst), "r"(src0), "r"(src1), "r"(src2)
static void vector_fmul_window_3dnow2(float *dst, const float *src0,
                                      const float *src1,
                                      const float *win, int len){
    "pswapd (%5,%1), %%mm1 \n"
    "movq (%5,%0), %%mm0 \n"
    "pswapd (%4,%1), %%mm5 \n"
    "movq (%3,%0), %%mm4 \n"
    "movq %%mm0, %%mm2 \n"
    "movq %%mm1, %%mm3 \n"
    "pfmul %%mm4, %%mm2 \n"
    "pfmul %%mm5, %%mm3 \n"
    "pfmul %%mm4, %%mm1 \n"
    "pfmul %%mm5, %%mm0 \n"
    "pfadd %%mm3, %%mm2 \n"
    "pfsub %%mm0, %%mm1 \n"
    "pswapd %%mm2, %%mm2 \n"
    "movq %%mm1, (%2,%0) \n"
    "movq %%mm2, (%2,%1) \n"
    : "r"(dst+len), "r"(src0+len), "r"(src1), "r"(win+len)
static void vector_fmul_window_sse(float *dst, const float *src0,
                                   const float *src1,
                                   const float *win, int len){
    "movaps (%5,%1), %%xmm1 \n"
    "movaps (%5,%0), %%xmm0 \n"
    "movaps (%4,%1), %%xmm5 \n"
    "movaps (%3,%0), %%xmm4 \n"
    "shufps $0x1b, %%xmm1, %%xmm1 \n"
    "shufps $0x1b, %%xmm5, %%xmm5 \n"
    "movaps %%xmm0, %%xmm2 \n"
    "movaps %%xmm1, %%xmm3 \n"
    "mulps %%xmm4, %%xmm2 \n"
    "mulps %%xmm5, %%xmm3 \n"
    "mulps %%xmm4, %%xmm1 \n"
    "mulps %%xmm5, %%xmm0 \n"
    "addps %%xmm3, %%xmm2 \n"
    "subps %%xmm0, %%xmm1 \n"
    "shufps $0x1b, %%xmm2, %%xmm2 \n"
    "movaps %%xmm1, (%2,%0) \n"
    "movaps %%xmm2, (%2,%1) \n"
    : "r"(dst+len), "r"(src0+len), "r"(src1), "r"(win+len)
2351 "movss %3, %%xmm4 \n"
2352 "movss %4, %%xmm5 \n"
2353 "shufps $0, %%xmm4, %%xmm4 \n"
2354 "shufps $0, %%xmm5, %%xmm5 \n"
2356 "movaps (%2,%0), %%xmm0 \n\t"
2357 "movaps 16(%2,%0), %%xmm1 \n\t"
2358 "movaps 32(%2,%0), %%xmm2 \n\t"
2359 "movaps 48(%2,%0), %%xmm3 \n\t"
2360 "maxps %%xmm4, %%xmm0 \n\t"
2361 "maxps %%xmm4, %%xmm1 \n\t"
2362 "maxps %%xmm4, %%xmm2 \n\t"
2363 "maxps %%xmm4, %%xmm3 \n\t"
2364 "minps %%xmm5, %%xmm0 \n\t"
2365 "minps %%xmm5, %%xmm1 \n\t"
2366 "minps %%xmm5, %%xmm2 \n\t"
2367 "minps %%xmm5, %%xmm3 \n\t"
2368 "movaps %%xmm0, (%1,%0) \n\t"
2369 "movaps %%xmm1, 16(%1,%0) \n\t"
2370 "movaps %%xmm2, 32(%1,%0) \n\t"
2371 "movaps %%xmm3, 48(%1,%0) \n\t"
2375 :
"r"(dst),
"r"(src),
"m"(min),
"m"(max)
                            const int16_t *window, unsigned int len);
                            const int16_t *window, unsigned int len);
                            const int16_t *window, unsigned int len);
                            const int16_t *window, unsigned int len);
                            const int16_t *window, unsigned int len);
                            const int16_t *window, unsigned int len);
                            int32_t max, unsigned int len);
                            int32_t max, unsigned int len);
                            int32_t max, unsigned int len);
                            int32_t max, unsigned int len);
                            const float *src1, int len);
                            const float *src1, int len);
        mm_flags |= (avctx->dsp_mask & 0xffff);
        mm_flags &= ~(avctx->dsp_mask & 0xffff);
    if (mm_flags & AV_CPU_FLAG_MMX) {
        if(mm_flags & AV_CPU_FLAG_MMX2){
            c->idct_put= ff_libmpeg2mmx2_idct_put;
            c->idct_add= ff_libmpeg2mmx2_idct_add;
            c->idct_put= ff_libmpeg2mmx_idct_put;
            c->idct_add= ff_libmpeg2mmx_idct_add;
        if(mm_flags & AV_CPU_FLAG_SSE2){
        if(mm_flags & AV_CPU_FLAG_SSE2){
        } else if(mm_flags & AV_CPU_FLAG_MMX2){
        if (!high_bit_depth) {
        if ((mm_flags & AV_CPU_FLAG_SSE) &&
#define SET_HPEL_FUNCS(PFX, IDX, SIZE, CPU) \
    c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## SIZE ## _ ## CPU; \
    c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## SIZE ## _x2_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## SIZE ## _y2_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## SIZE ## _xy2_ ## CPU
        if (!high_bit_depth) {
#if ARCH_X86_32 || !HAVE_YASM
#if ARCH_X86_32 && HAVE_YASM
        if (!high_bit_depth)
        if (!high_bit_depth)
        if (mm_flags & AV_CPU_FLAG_MMX2) {
            if (!high_bit_depth) {
            if (!high_bit_depth) {
#define SET_QPEL_FUNCS(PFX, IDX, SIZE, CPU, PREFIX) \
    c->PFX ## _pixels_tab[IDX][ 0] = PREFIX ## PFX ## SIZE ## _mc00_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 1] = PREFIX ## PFX ## SIZE ## _mc10_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 2] = PREFIX ## PFX ## SIZE ## _mc20_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 3] = PREFIX ## PFX ## SIZE ## _mc30_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 4] = PREFIX ## PFX ## SIZE ## _mc01_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 5] = PREFIX ## PFX ## SIZE ## _mc11_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 6] = PREFIX ## PFX ## SIZE ## _mc21_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 7] = PREFIX ## PFX ## SIZE ## _mc31_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 8] = PREFIX ## PFX ## SIZE ## _mc02_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 9] = PREFIX ## PFX ## SIZE ## _mc12_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][10] = PREFIX ## PFX ## SIZE ## _mc22_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][11] = PREFIX ## PFX ## SIZE ## _mc32_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][12] = PREFIX ## PFX ## SIZE ## _mc03_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][13] = PREFIX ## PFX ## SIZE ## _mc13_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][14] = PREFIX ## PFX ## SIZE ## _mc23_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][15] = PREFIX ## PFX ## SIZE ## _mc33_ ## CPU
            if (!high_bit_depth) {
            else if (bit_depth == 10) {
        } else if (HAVE_AMD3DNOW && (mm_flags & AV_CPU_FLAG_3DNOW)) {
            if (!high_bit_depth) {
            if (!high_bit_depth) {
#define H264_QPEL_FUNCS(x, y, CPU)\
    c->put_h264_qpel_pixels_tab[0][x+y*4] = put_h264_qpel16_mc##x##y##_##CPU;\
    c->put_h264_qpel_pixels_tab[1][x+y*4] = put_h264_qpel8_mc##x##y##_##CPU;\
    c->avg_h264_qpel_pixels_tab[0][x+y*4] = avg_h264_qpel16_mc##x##y##_##CPU;\
    c->avg_h264_qpel_pixels_tab[1][x+y*4] = avg_h264_qpel8_mc##x##y##_##CPU;
        if((mm_flags & AV_CPU_FLAG_SSE2) && !(mm_flags & AV_CPU_FLAG_3DNOW)){
            if (!high_bit_depth) {
        if(mm_flags & AV_CPU_FLAG_SSE2){
            if (!high_bit_depth) {
#define H264_QPEL_FUNCS_10(x, y, CPU)\
    c->put_h264_qpel_pixels_tab[0][x+y*4] = ff_put_h264_qpel16_mc##x##y##_10_##CPU;\
    c->put_h264_qpel_pixels_tab[1][x+y*4] = ff_put_h264_qpel8_mc##x##y##_10_##CPU;\
    c->avg_h264_qpel_pixels_tab[0][x+y*4] = ff_avg_h264_qpel16_mc##x##y##_10_##CPU;\
    c->avg_h264_qpel_pixels_tab[1][x+y*4] = ff_avg_h264_qpel8_mc##x##y##_10_##CPU;
            if (bit_depth == 10) {
                H264_QPEL_FUNCS_10(1, 0, sse2_cache64)
                H264_QPEL_FUNCS_10(2, 0, sse2_cache64)
                H264_QPEL_FUNCS_10(3, 0, sse2_cache64)
            if (!high_bit_depth) {
            else if (bit_depth == 10) {
                H264_QPEL_FUNCS_10(1, 0, ssse3_cache64)
                H264_QPEL_FUNCS_10(2, 0, ssse3_cache64)
                H264_QPEL_FUNCS_10(3, 0, ssse3_cache64)
        if(mm_flags & AV_CPU_FLAG_MMX2){
        if(mm_flags & AV_CPU_FLAG_SSE){
            if (!high_bit_depth)
        if(mm_flags & AV_CPU_FLAG_SSE2){
        if (mm_flags & AV_CPU_FLAG_SSSE3) {
#if HAVE_AVX && HAVE_YASM
            if (bit_depth == 10) {
                H264_QPEL_FUNCS_10(1, 0, sse2)
                H264_QPEL_FUNCS_10(2, 0, sse2)
                H264_QPEL_FUNCS_10(3, 0, sse2)