/* CPU-feature dispatch for this template instantiation: pick the
 * prefetch, packed-average, non-temporal-store and fence mnemonics
 * matching the target (3DNow!, MMX2, or plain MMX fallbacks that
 * expand to a no-op comment).
 * NOTE(review): this chunk is an extraction with interior lines
 * missing — the #elif/#else/#endif structure is only partially
 * visible here; confirm against the full file before editing. */
35 #if COMPILE_TEMPLATE_AMD3DNOW
36 #define PREFETCH "prefetch"
37 #define PAVGB "pavgusb"
38 #elif COMPILE_TEMPLATE_MMX2
39 #define PREFETCH "prefetchnta"
42 #define PREFETCH " # nop"
45 #if COMPILE_TEMPLATE_AMD3DNOW
52 #if COMPILE_TEMPLATE_MMX2
53 #define MOVNTQ "movntq"
54 #define SFENCE "sfence"
57 #define SFENCE " # nop"
60 #if !COMPILE_TEMPLATE_SSE2
62 #if !COMPILE_TEMPLATE_AMD3DNOW
67 const uint8_t *s = src;
69 const uint8_t *mm_end;
71 __asm__
volatile(
PREFETCH" %0"::
"m"(*s):
"memory");
73 __asm__
volatile(
"movq %0, %%mm7"::
"m"(mask32a):
"memory");
78 "punpckldq 3%1, %%mm0 \n\t"
79 "movd 6%1, %%mm1 \n\t"
80 "punpckldq 9%1, %%mm1 \n\t"
81 "movd 12%1, %%mm2 \n\t"
82 "punpckldq 15%1, %%mm2 \n\t"
83 "movd 18%1, %%mm3 \n\t"
84 "punpckldq 21%1, %%mm3 \n\t"
85 "por %%mm7, %%mm0 \n\t"
86 "por %%mm7, %%mm1 \n\t"
87 "por %%mm7, %%mm2 \n\t"
88 "por %%mm7, %%mm3 \n\t"
99 __asm__
volatile(
SFENCE:::
"memory");
100 __asm__
volatile(
EMMS:::
"memory");
/* STORE_BGR24_MMX: repack four registers of 32-bit pixels (mm0..mm7
 * pairs) into contiguous 24-bit BGR by masking with mask24l/mask24h,
 * shifting the pieces into place, and streaming the result out with
 * MOVNTQ.
 * NOTE(review): fragment — some continuation lines of this macro
 * (e.g. the remaining MOVNTQ stores) are missing from this
 * extraction; the visible lines are left byte-identical. */
109 #define STORE_BGR24_MMX \
110 "psrlq $8, %%mm2 \n\t" \
111 "psrlq $8, %%mm3 \n\t" \
112 "psrlq $8, %%mm6 \n\t" \
113 "psrlq $8, %%mm7 \n\t" \
114 "pand "MANGLE(mask24l)", %%mm0\n\t" \
115 "pand "MANGLE(mask24l)", %%mm1\n\t" \
116 "pand "MANGLE(mask24l)", %%mm4\n\t" \
117 "pand "MANGLE(mask24l)", %%mm5\n\t" \
118 "pand "MANGLE(mask24h)", %%mm2\n\t" \
119 "pand "MANGLE(mask24h)", %%mm3\n\t" \
120 "pand "MANGLE(mask24h)", %%mm6\n\t" \
121 "pand "MANGLE(mask24h)", %%mm7\n\t" \
122 "por %%mm2, %%mm0 \n\t" \
123 "por %%mm3, %%mm1 \n\t" \
124 "por %%mm6, %%mm4 \n\t" \
125 "por %%mm7, %%mm5 \n\t" \
127 "movq %%mm1, %%mm2 \n\t" \
128 "movq %%mm4, %%mm3 \n\t" \
129 "psllq $48, %%mm2 \n\t" \
130 "psllq $32, %%mm3 \n\t" \
131 "pand "MANGLE(mask24hh)", %%mm2\n\t" \
132 "pand "MANGLE(mask24hhh)", %%mm3\n\t" \
133 "por %%mm2, %%mm0 \n\t" \
134 "psrlq $16, %%mm1 \n\t" \
135 "psrlq $32, %%mm4 \n\t" \
136 "psllq $16, %%mm5 \n\t" \
137 "por %%mm3, %%mm1 \n\t" \
138 "pand "MANGLE(mask24hhhh)", %%mm5\n\t" \
139 "por %%mm5, %%mm4 \n\t" \
141 MOVNTQ" %%mm0, %0 \n\t" \
142 MOVNTQ" %%mm1, 8%0 \n\t" \
149 const uint8_t *s = src;
151 const uint8_t *mm_end;
153 __asm__
volatile(
PREFETCH" %0"::
"m"(*s):
"memory");
158 "movq %1, %%mm0 \n\t"
159 "movq 8%1, %%mm1 \n\t"
160 "movq 16%1, %%mm4 \n\t"
161 "movq 24%1, %%mm5 \n\t"
162 "movq %%mm0, %%mm2 \n\t"
163 "movq %%mm1, %%mm3 \n\t"
164 "movq %%mm4, %%mm6 \n\t"
165 "movq %%mm5, %%mm7 \n\t"
173 __asm__
volatile(
SFENCE:::
"memory");
174 __asm__
volatile(
EMMS:::
"memory");
191 register const uint8_t* s=src;
192 register uint8_t* d=dst;
193 register const uint8_t *end;
194 const uint8_t *mm_end;
196 __asm__
volatile(
PREFETCH" %0"::
"m"(*s));
197 __asm__
volatile(
"movq %0, %%mm4"::
"m"(mask15s));
202 "movq %1, %%mm0 \n\t"
203 "movq 8%1, %%mm2 \n\t"
204 "movq %%mm0, %%mm1 \n\t"
205 "movq %%mm2, %%mm3 \n\t"
206 "pand %%mm4, %%mm0 \n\t"
207 "pand %%mm4, %%mm2 \n\t"
208 "paddw %%mm1, %%mm0 \n\t"
209 "paddw %%mm3, %%mm2 \n\t"
218 __asm__
volatile(
SFENCE:::
"memory");
219 __asm__
volatile(
EMMS:::
"memory");
222 register unsigned x= *((
const uint32_t *)s);
223 *((uint32_t *)d) = (x&0x7FFF7FFF) + (x&0x7FE07FE0);
228 register unsigned short x= *((
const uint16_t *)s);
229 *((uint16_t *)d) = (x&0x7FFF) + (x&0x7FE0);
235 register const uint8_t* s=src;
236 register uint8_t* d=dst;
237 register const uint8_t *end;
238 const uint8_t *mm_end;
240 __asm__
volatile(
PREFETCH" %0"::
"m"(*s));
241 __asm__
volatile(
"movq %0, %%mm7"::
"m"(mask15rg));
242 __asm__
volatile(
"movq %0, %%mm6"::
"m"(mask15b));
247 "movq %1, %%mm0 \n\t"
248 "movq 8%1, %%mm2 \n\t"
249 "movq %%mm0, %%mm1 \n\t"
250 "movq %%mm2, %%mm3 \n\t"
251 "psrlq $1, %%mm0 \n\t"
252 "psrlq $1, %%mm2 \n\t"
253 "pand %%mm7, %%mm0 \n\t"
254 "pand %%mm7, %%mm2 \n\t"
255 "pand %%mm6, %%mm1 \n\t"
256 "pand %%mm6, %%mm3 \n\t"
257 "por %%mm1, %%mm0 \n\t"
258 "por %%mm3, %%mm2 \n\t"
267 __asm__
volatile(
SFENCE:::
"memory");
268 __asm__
volatile(
EMMS:::
"memory");
271 register uint32_t x= *((
const uint32_t*)s);
272 *((uint32_t *)d) = ((x>>1)&0x7FE07FE0) | (x&0x001F001F);
277 register uint16_t x= *((
const uint16_t*)s);
278 *((uint16_t *)d) = ((x>>1)&0x7FE0) | (x&0x001F);
284 const uint8_t *s = src;
286 const uint8_t *mm_end;
287 uint16_t *d = (uint16_t *)dst;
290 #if 1 //is faster only if multiplies are reasonably fast (FIXME figure out on which CPUs this is faster, on Athlon it is slightly faster)
292 "movq %3, %%mm5 \n\t"
293 "movq %4, %%mm6 \n\t"
294 "movq %5, %%mm7 \n\t"
299 "movd (%1), %%mm0 \n\t"
300 "movd 4(%1), %%mm3 \n\t"
301 "punpckldq 8(%1), %%mm0 \n\t"
302 "punpckldq 12(%1), %%mm3 \n\t"
303 "movq %%mm0, %%mm1 \n\t"
304 "movq %%mm3, %%mm4 \n\t"
305 "pand %%mm6, %%mm0 \n\t"
306 "pand %%mm6, %%mm3 \n\t"
307 "pmaddwd %%mm7, %%mm0 \n\t"
308 "pmaddwd %%mm7, %%mm3 \n\t"
309 "pand %%mm5, %%mm1 \n\t"
310 "pand %%mm5, %%mm4 \n\t"
311 "por %%mm1, %%mm0 \n\t"
312 "por %%mm4, %%mm3 \n\t"
313 "psrld $5, %%mm0 \n\t"
314 "pslld $11, %%mm3 \n\t"
315 "por %%mm3, %%mm0 \n\t"
323 :
"r" (mm_end),
"m" (mask3216g),
"m" (mask3216br),
"m" (mul3216)
326 __asm__
volatile(
PREFETCH" %0"::
"m"(*src):
"memory");
328 "movq %0, %%mm7 \n\t"
329 "movq %1, %%mm6 \n\t"
330 ::
"m"(red_16mask),
"m"(green_16mask));
334 "movd %1, %%mm0 \n\t"
335 "movd 4%1, %%mm3 \n\t"
336 "punpckldq 8%1, %%mm0 \n\t"
337 "punpckldq 12%1, %%mm3 \n\t"
338 "movq %%mm0, %%mm1 \n\t"
339 "movq %%mm0, %%mm2 \n\t"
340 "movq %%mm3, %%mm4 \n\t"
341 "movq %%mm3, %%mm5 \n\t"
342 "psrlq $3, %%mm0 \n\t"
343 "psrlq $3, %%mm3 \n\t"
344 "pand %2, %%mm0 \n\t"
345 "pand %2, %%mm3 \n\t"
346 "psrlq $5, %%mm1 \n\t"
347 "psrlq $5, %%mm4 \n\t"
348 "pand %%mm6, %%mm1 \n\t"
349 "pand %%mm6, %%mm4 \n\t"
350 "psrlq $8, %%mm2 \n\t"
351 "psrlq $8, %%mm5 \n\t"
352 "pand %%mm7, %%mm2 \n\t"
353 "pand %%mm7, %%mm5 \n\t"
354 "por %%mm1, %%mm0 \n\t"
355 "por %%mm4, %%mm3 \n\t"
356 "por %%mm2, %%mm0 \n\t"
357 "por %%mm5, %%mm3 \n\t"
358 "psllq $16, %%mm3 \n\t"
359 "por %%mm3, %%mm0 \n\t"
361 :
"=m"(*d):
"m"(*s),
"m"(blue_16mask):
"memory");
366 __asm__
volatile(
SFENCE:::
"memory");
367 __asm__
volatile(
EMMS:::
"memory");
369 register int rgb = *(
const uint32_t*)s; s += 4;
370 *d++ = ((rgb&0xFF)>>3) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>8);
376 const uint8_t *s = src;
378 const uint8_t *mm_end;
379 uint16_t *d = (uint16_t *)dst;
381 __asm__
volatile(
PREFETCH" %0"::
"m"(*src):
"memory");
383 "movq %0, %%mm7 \n\t"
384 "movq %1, %%mm6 \n\t"
385 ::
"m"(red_16mask),
"m"(green_16mask));
390 "movd %1, %%mm0 \n\t"
391 "movd 4%1, %%mm3 \n\t"
392 "punpckldq 8%1, %%mm0 \n\t"
393 "punpckldq 12%1, %%mm3 \n\t"
394 "movq %%mm0, %%mm1 \n\t"
395 "movq %%mm0, %%mm2 \n\t"
396 "movq %%mm3, %%mm4 \n\t"
397 "movq %%mm3, %%mm5 \n\t"
398 "psllq $8, %%mm0 \n\t"
399 "psllq $8, %%mm3 \n\t"
400 "pand %%mm7, %%mm0 \n\t"
401 "pand %%mm7, %%mm3 \n\t"
402 "psrlq $5, %%mm1 \n\t"
403 "psrlq $5, %%mm4 \n\t"
404 "pand %%mm6, %%mm1 \n\t"
405 "pand %%mm6, %%mm4 \n\t"
406 "psrlq $19, %%mm2 \n\t"
407 "psrlq $19, %%mm5 \n\t"
408 "pand %2, %%mm2 \n\t"
409 "pand %2, %%mm5 \n\t"
410 "por %%mm1, %%mm0 \n\t"
411 "por %%mm4, %%mm3 \n\t"
412 "por %%mm2, %%mm0 \n\t"
413 "por %%mm5, %%mm3 \n\t"
414 "psllq $16, %%mm3 \n\t"
415 "por %%mm3, %%mm0 \n\t"
417 :
"=m"(*d):
"m"(*s),
"m"(blue_16mask):
"memory");
421 __asm__
volatile(
SFENCE:::
"memory");
422 __asm__
volatile(
EMMS:::
"memory");
424 register int rgb = *(
const uint32_t*)s; s += 4;
425 *d++ = ((rgb&0xF8)<<8) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>19);
431 const uint8_t *s = src;
433 const uint8_t *mm_end;
434 uint16_t *d = (uint16_t *)dst;
437 #if 1 //is faster only if multiplies are reasonably fast (FIXME figure out on which CPUs this is faster, on Athlon it is slightly faster)
439 "movq %3, %%mm5 \n\t"
440 "movq %4, %%mm6 \n\t"
441 "movq %5, %%mm7 \n\t"
446 "movd (%1), %%mm0 \n\t"
447 "movd 4(%1), %%mm3 \n\t"
448 "punpckldq 8(%1), %%mm0 \n\t"
449 "punpckldq 12(%1), %%mm3 \n\t"
450 "movq %%mm0, %%mm1 \n\t"
451 "movq %%mm3, %%mm4 \n\t"
452 "pand %%mm6, %%mm0 \n\t"
453 "pand %%mm6, %%mm3 \n\t"
454 "pmaddwd %%mm7, %%mm0 \n\t"
455 "pmaddwd %%mm7, %%mm3 \n\t"
456 "pand %%mm5, %%mm1 \n\t"
457 "pand %%mm5, %%mm4 \n\t"
458 "por %%mm1, %%mm0 \n\t"
459 "por %%mm4, %%mm3 \n\t"
460 "psrld $6, %%mm0 \n\t"
461 "pslld $10, %%mm3 \n\t"
462 "por %%mm3, %%mm0 \n\t"
470 :
"r" (mm_end),
"m" (mask3215g),
"m" (mask3216br),
"m" (mul3215)
473 __asm__
volatile(
PREFETCH" %0"::
"m"(*src):
"memory");
475 "movq %0, %%mm7 \n\t"
476 "movq %1, %%mm6 \n\t"
477 ::
"m"(red_15mask),
"m"(green_15mask));
481 "movd %1, %%mm0 \n\t"
482 "movd 4%1, %%mm3 \n\t"
483 "punpckldq 8%1, %%mm0 \n\t"
484 "punpckldq 12%1, %%mm3 \n\t"
485 "movq %%mm0, %%mm1 \n\t"
486 "movq %%mm0, %%mm2 \n\t"
487 "movq %%mm3, %%mm4 \n\t"
488 "movq %%mm3, %%mm5 \n\t"
489 "psrlq $3, %%mm0 \n\t"
490 "psrlq $3, %%mm3 \n\t"
491 "pand %2, %%mm0 \n\t"
492 "pand %2, %%mm3 \n\t"
493 "psrlq $6, %%mm1 \n\t"
494 "psrlq $6, %%mm4 \n\t"
495 "pand %%mm6, %%mm1 \n\t"
496 "pand %%mm6, %%mm4 \n\t"
497 "psrlq $9, %%mm2 \n\t"
498 "psrlq $9, %%mm5 \n\t"
499 "pand %%mm7, %%mm2 \n\t"
500 "pand %%mm7, %%mm5 \n\t"
501 "por %%mm1, %%mm0 \n\t"
502 "por %%mm4, %%mm3 \n\t"
503 "por %%mm2, %%mm0 \n\t"
504 "por %%mm5, %%mm3 \n\t"
505 "psllq $16, %%mm3 \n\t"
506 "por %%mm3, %%mm0 \n\t"
508 :
"=m"(*d):
"m"(*s),
"m"(blue_15mask):
"memory");
513 __asm__
volatile(
SFENCE:::
"memory");
514 __asm__
volatile(
EMMS:::
"memory");
516 register int rgb = *(
const uint32_t*)s; s += 4;
517 *d++ = ((rgb&0xFF)>>3) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>9);
523 const uint8_t *s = src;
525 const uint8_t *mm_end;
526 uint16_t *d = (uint16_t *)dst;
528 __asm__
volatile(
PREFETCH" %0"::
"m"(*src):
"memory");
530 "movq %0, %%mm7 \n\t"
531 "movq %1, %%mm6 \n\t"
532 ::
"m"(red_15mask),
"m"(green_15mask));
537 "movd %1, %%mm0 \n\t"
538 "movd 4%1, %%mm3 \n\t"
539 "punpckldq 8%1, %%mm0 \n\t"
540 "punpckldq 12%1, %%mm3 \n\t"
541 "movq %%mm0, %%mm1 \n\t"
542 "movq %%mm0, %%mm2 \n\t"
543 "movq %%mm3, %%mm4 \n\t"
544 "movq %%mm3, %%mm5 \n\t"
545 "psllq $7, %%mm0 \n\t"
546 "psllq $7, %%mm3 \n\t"
547 "pand %%mm7, %%mm0 \n\t"
548 "pand %%mm7, %%mm3 \n\t"
549 "psrlq $6, %%mm1 \n\t"
550 "psrlq $6, %%mm4 \n\t"
551 "pand %%mm6, %%mm1 \n\t"
552 "pand %%mm6, %%mm4 \n\t"
553 "psrlq $19, %%mm2 \n\t"
554 "psrlq $19, %%mm5 \n\t"
555 "pand %2, %%mm2 \n\t"
556 "pand %2, %%mm5 \n\t"
557 "por %%mm1, %%mm0 \n\t"
558 "por %%mm4, %%mm3 \n\t"
559 "por %%mm2, %%mm0 \n\t"
560 "por %%mm5, %%mm3 \n\t"
561 "psllq $16, %%mm3 \n\t"
562 "por %%mm3, %%mm0 \n\t"
564 :
"=m"(*d):
"m"(*s),
"m"(blue_15mask):
"memory");
568 __asm__
volatile(
SFENCE:::
"memory");
569 __asm__
volatile(
EMMS:::
"memory");
571 register int rgb = *(
const uint32_t*)s; s += 4;
572 *d++ = ((rgb&0xF8)<<7) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>19);
578 const uint8_t *s = src;
580 const uint8_t *mm_end;
581 uint16_t *d = (uint16_t *)dst;
583 __asm__
volatile(
PREFETCH" %0"::
"m"(*src):
"memory");
585 "movq %0, %%mm7 \n\t"
586 "movq %1, %%mm6 \n\t"
587 ::
"m"(red_16mask),
"m"(green_16mask));
592 "movd %1, %%mm0 \n\t"
593 "movd 3%1, %%mm3 \n\t"
594 "punpckldq 6%1, %%mm0 \n\t"
595 "punpckldq 9%1, %%mm3 \n\t"
596 "movq %%mm0, %%mm1 \n\t"
597 "movq %%mm0, %%mm2 \n\t"
598 "movq %%mm3, %%mm4 \n\t"
599 "movq %%mm3, %%mm5 \n\t"
600 "psrlq $3, %%mm0 \n\t"
601 "psrlq $3, %%mm3 \n\t"
602 "pand %2, %%mm0 \n\t"
603 "pand %2, %%mm3 \n\t"
604 "psrlq $5, %%mm1 \n\t"
605 "psrlq $5, %%mm4 \n\t"
606 "pand %%mm6, %%mm1 \n\t"
607 "pand %%mm6, %%mm4 \n\t"
608 "psrlq $8, %%mm2 \n\t"
609 "psrlq $8, %%mm5 \n\t"
610 "pand %%mm7, %%mm2 \n\t"
611 "pand %%mm7, %%mm5 \n\t"
612 "por %%mm1, %%mm0 \n\t"
613 "por %%mm4, %%mm3 \n\t"
614 "por %%mm2, %%mm0 \n\t"
615 "por %%mm5, %%mm3 \n\t"
616 "psllq $16, %%mm3 \n\t"
617 "por %%mm3, %%mm0 \n\t"
619 :
"=m"(*d):
"m"(*s),
"m"(blue_16mask):
"memory");
623 __asm__
volatile(
SFENCE:::
"memory");
624 __asm__
volatile(
EMMS:::
"memory");
629 *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
635 const uint8_t *s = src;
637 const uint8_t *mm_end;
638 uint16_t *d = (uint16_t *)dst;
640 __asm__
volatile(
PREFETCH" %0"::
"m"(*src):
"memory");
642 "movq %0, %%mm7 \n\t"
643 "movq %1, %%mm6 \n\t"
644 ::
"m"(red_16mask),
"m"(green_16mask));
649 "movd %1, %%mm0 \n\t"
650 "movd 3%1, %%mm3 \n\t"
651 "punpckldq 6%1, %%mm0 \n\t"
652 "punpckldq 9%1, %%mm3 \n\t"
653 "movq %%mm0, %%mm1 \n\t"
654 "movq %%mm0, %%mm2 \n\t"
655 "movq %%mm3, %%mm4 \n\t"
656 "movq %%mm3, %%mm5 \n\t"
657 "psllq $8, %%mm0 \n\t"
658 "psllq $8, %%mm3 \n\t"
659 "pand %%mm7, %%mm0 \n\t"
660 "pand %%mm7, %%mm3 \n\t"
661 "psrlq $5, %%mm1 \n\t"
662 "psrlq $5, %%mm4 \n\t"
663 "pand %%mm6, %%mm1 \n\t"
664 "pand %%mm6, %%mm4 \n\t"
665 "psrlq $19, %%mm2 \n\t"
666 "psrlq $19, %%mm5 \n\t"
667 "pand %2, %%mm2 \n\t"
668 "pand %2, %%mm5 \n\t"
669 "por %%mm1, %%mm0 \n\t"
670 "por %%mm4, %%mm3 \n\t"
671 "por %%mm2, %%mm0 \n\t"
672 "por %%mm5, %%mm3 \n\t"
673 "psllq $16, %%mm3 \n\t"
674 "por %%mm3, %%mm0 \n\t"
676 :
"=m"(*d):
"m"(*s),
"m"(blue_16mask):
"memory");
680 __asm__
volatile(
SFENCE:::
"memory");
681 __asm__
volatile(
EMMS:::
"memory");
686 *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
692 const uint8_t *s = src;
694 const uint8_t *mm_end;
695 uint16_t *d = (uint16_t *)dst;
697 __asm__
volatile(
PREFETCH" %0"::
"m"(*src):
"memory");
699 "movq %0, %%mm7 \n\t"
700 "movq %1, %%mm6 \n\t"
701 ::
"m"(red_15mask),
"m"(green_15mask));
706 "movd %1, %%mm0 \n\t"
707 "movd 3%1, %%mm3 \n\t"
708 "punpckldq 6%1, %%mm0 \n\t"
709 "punpckldq 9%1, %%mm3 \n\t"
710 "movq %%mm0, %%mm1 \n\t"
711 "movq %%mm0, %%mm2 \n\t"
712 "movq %%mm3, %%mm4 \n\t"
713 "movq %%mm3, %%mm5 \n\t"
714 "psrlq $3, %%mm0 \n\t"
715 "psrlq $3, %%mm3 \n\t"
716 "pand %2, %%mm0 \n\t"
717 "pand %2, %%mm3 \n\t"
718 "psrlq $6, %%mm1 \n\t"
719 "psrlq $6, %%mm4 \n\t"
720 "pand %%mm6, %%mm1 \n\t"
721 "pand %%mm6, %%mm4 \n\t"
722 "psrlq $9, %%mm2 \n\t"
723 "psrlq $9, %%mm5 \n\t"
724 "pand %%mm7, %%mm2 \n\t"
725 "pand %%mm7, %%mm5 \n\t"
726 "por %%mm1, %%mm0 \n\t"
727 "por %%mm4, %%mm3 \n\t"
728 "por %%mm2, %%mm0 \n\t"
729 "por %%mm5, %%mm3 \n\t"
730 "psllq $16, %%mm3 \n\t"
731 "por %%mm3, %%mm0 \n\t"
733 :
"=m"(*d):
"m"(*s),
"m"(blue_15mask):
"memory");
737 __asm__
volatile(
SFENCE:::
"memory");
738 __asm__
volatile(
EMMS:::
"memory");
743 *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
749 const uint8_t *s = src;
751 const uint8_t *mm_end;
752 uint16_t *d = (uint16_t *)dst;
754 __asm__
volatile(
PREFETCH" %0"::
"m"(*src):
"memory");
756 "movq %0, %%mm7 \n\t"
757 "movq %1, %%mm6 \n\t"
758 ::
"m"(red_15mask),
"m"(green_15mask));
763 "movd %1, %%mm0 \n\t"
764 "movd 3%1, %%mm3 \n\t"
765 "punpckldq 6%1, %%mm0 \n\t"
766 "punpckldq 9%1, %%mm3 \n\t"
767 "movq %%mm0, %%mm1 \n\t"
768 "movq %%mm0, %%mm2 \n\t"
769 "movq %%mm3, %%mm4 \n\t"
770 "movq %%mm3, %%mm5 \n\t"
771 "psllq $7, %%mm0 \n\t"
772 "psllq $7, %%mm3 \n\t"
773 "pand %%mm7, %%mm0 \n\t"
774 "pand %%mm7, %%mm3 \n\t"
775 "psrlq $6, %%mm1 \n\t"
776 "psrlq $6, %%mm4 \n\t"
777 "pand %%mm6, %%mm1 \n\t"
778 "pand %%mm6, %%mm4 \n\t"
779 "psrlq $19, %%mm2 \n\t"
780 "psrlq $19, %%mm5 \n\t"
781 "pand %2, %%mm2 \n\t"
782 "pand %2, %%mm5 \n\t"
783 "por %%mm1, %%mm0 \n\t"
784 "por %%mm4, %%mm3 \n\t"
785 "por %%mm2, %%mm0 \n\t"
786 "por %%mm5, %%mm3 \n\t"
787 "psllq $16, %%mm3 \n\t"
788 "por %%mm3, %%mm0 \n\t"
790 :
"=m"(*d):
"m"(*s),
"m"(blue_15mask):
"memory");
794 __asm__
volatile(
SFENCE:::
"memory");
795 __asm__
volatile(
EMMS:::
"memory");
800 *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
828 const uint16_t *mm_end;
830 const uint16_t *s = (
const uint16_t*)src;
831 end = s + src_size/2;
832 __asm__
volatile(
PREFETCH" %0"::
"m"(*s):
"memory");
837 "movq %1, %%mm0 \n\t"
838 "movq %1, %%mm1 \n\t"
839 "movq %1, %%mm2 \n\t"
840 "pand %2, %%mm0 \n\t"
841 "pand %3, %%mm1 \n\t"
842 "pand %4, %%mm2 \n\t"
843 "psllq $3, %%mm0 \n\t"
844 "psrlq $2, %%mm1 \n\t"
845 "psrlq $7, %%mm2 \n\t"
846 "movq %%mm0, %%mm3 \n\t"
847 "movq %%mm1, %%mm4 \n\t"
848 "movq %%mm2, %%mm5 \n\t"
849 "punpcklwd %5, %%mm0 \n\t"
850 "punpcklwd %5, %%mm1 \n\t"
851 "punpcklwd %5, %%mm2 \n\t"
852 "punpckhwd %5, %%mm3 \n\t"
853 "punpckhwd %5, %%mm4 \n\t"
854 "punpckhwd %5, %%mm5 \n\t"
855 "psllq $8, %%mm1 \n\t"
856 "psllq $16, %%mm2 \n\t"
857 "por %%mm1, %%mm0 \n\t"
858 "por %%mm2, %%mm0 \n\t"
859 "psllq $8, %%mm4 \n\t"
860 "psllq $16, %%mm5 \n\t"
861 "por %%mm4, %%mm3 \n\t"
862 "por %%mm5, %%mm3 \n\t"
864 "movq %%mm0, %%mm6 \n\t"
865 "movq %%mm3, %%mm7 \n\t"
867 "movq 8%1, %%mm0 \n\t"
868 "movq 8%1, %%mm1 \n\t"
869 "movq 8%1, %%mm2 \n\t"
870 "pand %2, %%mm0 \n\t"
871 "pand %3, %%mm1 \n\t"
872 "pand %4, %%mm2 \n\t"
873 "psllq $3, %%mm0 \n\t"
874 "psrlq $2, %%mm1 \n\t"
875 "psrlq $7, %%mm2 \n\t"
876 "movq %%mm0, %%mm3 \n\t"
877 "movq %%mm1, %%mm4 \n\t"
878 "movq %%mm2, %%mm5 \n\t"
879 "punpcklwd %5, %%mm0 \n\t"
880 "punpcklwd %5, %%mm1 \n\t"
881 "punpcklwd %5, %%mm2 \n\t"
882 "punpckhwd %5, %%mm3 \n\t"
883 "punpckhwd %5, %%mm4 \n\t"
884 "punpckhwd %5, %%mm5 \n\t"
885 "psllq $8, %%mm1 \n\t"
886 "psllq $16, %%mm2 \n\t"
887 "por %%mm1, %%mm0 \n\t"
888 "por %%mm2, %%mm0 \n\t"
889 "psllq $8, %%mm4 \n\t"
890 "psllq $16, %%mm5 \n\t"
891 "por %%mm4, %%mm3 \n\t"
892 "por %%mm5, %%mm3 \n\t"
895 :
"m"(*s),
"m"(mask15b),
"m"(mask15g),
"m"(mask15r),
"m"(mmx_null)
899 "movq %%mm0, %%mm4 \n\t"
900 "movq %%mm3, %%mm5 \n\t"
901 "movq %%mm6, %%mm0 \n\t"
902 "movq %%mm7, %%mm1 \n\t"
904 "movq %%mm4, %%mm6 \n\t"
905 "movq %%mm5, %%mm7 \n\t"
906 "movq %%mm0, %%mm2 \n\t"
907 "movq %%mm1, %%mm3 \n\t"
917 __asm__
volatile(
SFENCE:::
"memory");
918 __asm__
volatile(
EMMS:::
"memory");
920 register uint16_t bgr;
922 *d++ = (bgr&0x1F)<<3;
923 *d++ = (bgr&0x3E0)>>2;
924 *d++ = (bgr&0x7C00)>>7;
931 const uint16_t *mm_end;
932 uint8_t *d = (uint8_t *)dst;
933 const uint16_t *s = (
const uint16_t *)src;
934 end = s + src_size/2;
935 __asm__
volatile(
PREFETCH" %0"::
"m"(*s):
"memory");
940 "movq %1, %%mm0 \n\t"
941 "movq %1, %%mm1 \n\t"
942 "movq %1, %%mm2 \n\t"
943 "pand %2, %%mm0 \n\t"
944 "pand %3, %%mm1 \n\t"
945 "pand %4, %%mm2 \n\t"
946 "psllq $3, %%mm0 \n\t"
947 "psrlq $3, %%mm1 \n\t"
948 "psrlq $8, %%mm2 \n\t"
949 "movq %%mm0, %%mm3 \n\t"
950 "movq %%mm1, %%mm4 \n\t"
951 "movq %%mm2, %%mm5 \n\t"
952 "punpcklwd %5, %%mm0 \n\t"
953 "punpcklwd %5, %%mm1 \n\t"
954 "punpcklwd %5, %%mm2 \n\t"
955 "punpckhwd %5, %%mm3 \n\t"
956 "punpckhwd %5, %%mm4 \n\t"
957 "punpckhwd %5, %%mm5 \n\t"
958 "psllq $8, %%mm1 \n\t"
959 "psllq $16, %%mm2 \n\t"
960 "por %%mm1, %%mm0 \n\t"
961 "por %%mm2, %%mm0 \n\t"
962 "psllq $8, %%mm4 \n\t"
963 "psllq $16, %%mm5 \n\t"
964 "por %%mm4, %%mm3 \n\t"
965 "por %%mm5, %%mm3 \n\t"
967 "movq %%mm0, %%mm6 \n\t"
968 "movq %%mm3, %%mm7 \n\t"
970 "movq 8%1, %%mm0 \n\t"
971 "movq 8%1, %%mm1 \n\t"
972 "movq 8%1, %%mm2 \n\t"
973 "pand %2, %%mm0 \n\t"
974 "pand %3, %%mm1 \n\t"
975 "pand %4, %%mm2 \n\t"
976 "psllq $3, %%mm0 \n\t"
977 "psrlq $3, %%mm1 \n\t"
978 "psrlq $8, %%mm2 \n\t"
979 "movq %%mm0, %%mm3 \n\t"
980 "movq %%mm1, %%mm4 \n\t"
981 "movq %%mm2, %%mm5 \n\t"
982 "punpcklwd %5, %%mm0 \n\t"
983 "punpcklwd %5, %%mm1 \n\t"
984 "punpcklwd %5, %%mm2 \n\t"
985 "punpckhwd %5, %%mm3 \n\t"
986 "punpckhwd %5, %%mm4 \n\t"
987 "punpckhwd %5, %%mm5 \n\t"
988 "psllq $8, %%mm1 \n\t"
989 "psllq $16, %%mm2 \n\t"
990 "por %%mm1, %%mm0 \n\t"
991 "por %%mm2, %%mm0 \n\t"
992 "psllq $8, %%mm4 \n\t"
993 "psllq $16, %%mm5 \n\t"
994 "por %%mm4, %%mm3 \n\t"
995 "por %%mm5, %%mm3 \n\t"
997 :
"m"(*s),
"m"(
mask16b),
"m"(mask16g),
"m"(mask16r),
"m"(mmx_null)
1001 "movq %%mm0, %%mm4 \n\t"
1002 "movq %%mm3, %%mm5 \n\t"
1003 "movq %%mm6, %%mm0 \n\t"
1004 "movq %%mm7, %%mm1 \n\t"
1006 "movq %%mm4, %%mm6 \n\t"
1007 "movq %%mm5, %%mm7 \n\t"
1008 "movq %%mm0, %%mm2 \n\t"
1009 "movq %%mm1, %%mm3 \n\t"
1019 __asm__
volatile(
SFENCE:::
"memory");
1020 __asm__
volatile(
EMMS:::
"memory");
1022 register uint16_t bgr;
1024 *d++ = (bgr&0x1F)<<3;
1025 *d++ = (bgr&0x7E0)>>3;
1026 *d++ = (bgr&0xF800)>>8;
/* PACK_RGB32: pack three word-planes (mm0/mm1/mm2, with mm7 zeroed
 * and mm6 supplying the fourth/alpha byte lane) down to bytes,
 * interleave them into 32-bit pixels, and stream two quadwords out
 * with MOVNTQ.
 * NOTE(review): fragment — the lines surrounding this macro are
 * missing from this extraction; visible lines left byte-identical. */
1037 #define PACK_RGB32 \
1038 "packuswb %%mm7, %%mm0 \n\t" \
1039 "packuswb %%mm7, %%mm1 \n\t" \
1040 "packuswb %%mm7, %%mm2 \n\t" \
1041 "punpcklbw %%mm1, %%mm0 \n\t" \
1042 "punpcklbw %%mm6, %%mm2 \n\t" \
1043 "movq %%mm0, %%mm3 \n\t" \
1044 "punpcklwd %%mm2, %%mm0 \n\t" \
1045 "punpckhwd %%mm2, %%mm3 \n\t" \
1046 MOVNTQ" %%mm0, %0 \n\t" \
1047 MOVNTQ" %%mm3, 8%0 \n\t" \
1051 const uint16_t *end;
1052 const uint16_t *mm_end;
1054 const uint16_t *s = (
const uint16_t *)src;
1055 end = s + src_size/2;
1056 __asm__
volatile(
PREFETCH" %0"::
"m"(*s):
"memory");
1057 __asm__
volatile(
"pxor %%mm7,%%mm7 \n\t":::
"memory");
1058 __asm__
volatile(
"pcmpeqd %%mm6,%%mm6 \n\t":::
"memory");
1060 while (s < mm_end) {
1063 "movq %1, %%mm0 \n\t"
1064 "movq %1, %%mm1 \n\t"
1065 "movq %1, %%mm2 \n\t"
1066 "pand %2, %%mm0 \n\t"
1067 "pand %3, %%mm1 \n\t"
1068 "pand %4, %%mm2 \n\t"
1069 "psllq $3, %%mm0 \n\t"
1070 "psrlq $2, %%mm1 \n\t"
1071 "psrlq $7, %%mm2 \n\t"
1074 :
"m"(*s),
"m"(mask15b),
"m"(mask15g),
"m"(mask15r)
1079 __asm__
volatile(
SFENCE:::
"memory");
1080 __asm__
volatile(
EMMS:::
"memory");
1082 register uint16_t bgr;
1084 *d++ = (bgr&0x1F)<<3;
1085 *d++ = (bgr&0x3E0)>>2;
1086 *d++ = (bgr&0x7C00)>>7;
1093 const uint16_t *end;
1094 const uint16_t *mm_end;
1096 const uint16_t *s = (
const uint16_t*)src;
1097 end = s + src_size/2;
1098 __asm__
volatile(
PREFETCH" %0"::
"m"(*s):
"memory");
1099 __asm__
volatile(
"pxor %%mm7,%%mm7 \n\t":::
"memory");
1100 __asm__
volatile(
"pcmpeqd %%mm6,%%mm6 \n\t":::
"memory");
1102 while (s < mm_end) {
1105 "movq %1, %%mm0 \n\t"
1106 "movq %1, %%mm1 \n\t"
1107 "movq %1, %%mm2 \n\t"
1108 "pand %2, %%mm0 \n\t"
1109 "pand %3, %%mm1 \n\t"
1110 "pand %4, %%mm2 \n\t"
1111 "psllq $3, %%mm0 \n\t"
1112 "psrlq $3, %%mm1 \n\t"
1113 "psrlq $8, %%mm2 \n\t"
1116 :
"m"(*s),
"m"(
mask16b),
"m"(mask16g),
"m"(mask16r)
1121 __asm__
volatile(
SFENCE:::
"memory");
1122 __asm__
volatile(
EMMS:::
"memory");
1124 register uint16_t bgr;
1126 *d++ = (bgr&0x1F)<<3;
1127 *d++ = (bgr&0x7E0)>>3;
1128 *d++ = (bgr&0xF800)>>8;
1136 const uint8_t *s = src-idx;
1137 uint8_t *d = dst-idx;
1142 "movq %3, %%mm7 \n\t"
1143 "pxor %4, %%mm7 \n\t"
1144 "movq %%mm7, %%mm6 \n\t"
1145 "pxor %5, %%mm7 \n\t"
1149 "movq (%1, %0), %%mm0 \n\t"
1150 "movq 8(%1, %0), %%mm1 \n\t"
1151 # if COMPILE_TEMPLATE_MMX2
1152 "pshufw $177, %%mm0, %%mm3 \n\t"
1153 "pshufw $177, %%mm1, %%mm5 \n\t"
1154 "pand %%mm7, %%mm0 \n\t"
1155 "pand %%mm6, %%mm3 \n\t"
1156 "pand %%mm7, %%mm1 \n\t"
1157 "pand %%mm6, %%mm5 \n\t"
1158 "por %%mm3, %%mm0 \n\t"
1159 "por %%mm5, %%mm1 \n\t"
1161 "movq %%mm0, %%mm2 \n\t"
1162 "movq %%mm1, %%mm4 \n\t"
1163 "pand %%mm7, %%mm0 \n\t"
1164 "pand %%mm6, %%mm2 \n\t"
1165 "pand %%mm7, %%mm1 \n\t"
1166 "pand %%mm6, %%mm4 \n\t"
1167 "movq %%mm2, %%mm3 \n\t"
1168 "movq %%mm4, %%mm5 \n\t"
1169 "pslld $16, %%mm2 \n\t"
1170 "psrld $16, %%mm3 \n\t"
1171 "pslld $16, %%mm4 \n\t"
1172 "psrld $16, %%mm5 \n\t"
1173 "por %%mm2, %%mm0 \n\t"
1174 "por %%mm4, %%mm1 \n\t"
1175 "por %%mm3, %%mm0 \n\t"
1176 "por %%mm5, %%mm1 \n\t"
1178 MOVNTQ" %%mm0, (%2, %0) \n\t"
1179 MOVNTQ" %%mm1, 8(%2, %0) \n\t"
1186 :
"r" (s),
"r" (d),
"m" (mask32b),
"m" (mask32r),
"m" (mmx_one)
1188 for (; idx<15; idx+=4) {
1189 register int v = *(
const uint32_t *)&s[idx],
g = v & 0xff00ff00;
1191 *(uint32_t *)&d[idx] = (v>>16) +
g + (v<<16);
1198 x86_reg mmx_size= 23 - src_size;
1200 "test %%"REG_a
", %%"REG_a
" \n\t"
1202 "movq "MANGLE(mask24r)
", %%mm5 \n\t"
1203 "movq "MANGLE(mask24g)
", %%mm6 \n\t"
1204 "movq "MANGLE(mask24b)
", %%mm7 \n\t"
1208 "movq (%1, %%"REG_a
"), %%mm0 \n\t"
1209 "movq (%1, %%"REG_a
"), %%mm1 \n\t"
1210 "movq 2(%1, %%"REG_a
"), %%mm2 \n\t"
1211 "psllq $16, %%mm0 \n\t"
1212 "pand %%mm5, %%mm0 \n\t"
1213 "pand %%mm6, %%mm1 \n\t"
1214 "pand %%mm7, %%mm2 \n\t"
1215 "por %%mm0, %%mm1 \n\t"
1216 "por %%mm2, %%mm1 \n\t"
1217 "movq 6(%1, %%"REG_a
"), %%mm0 \n\t"
1218 MOVNTQ" %%mm1, (%2, %%"REG_a
") \n\t"
1219 "movq 8(%1, %%"REG_a
"), %%mm1 \n\t"
1220 "movq 10(%1, %%"REG_a
"), %%mm2 \n\t"
1221 "pand %%mm7, %%mm0 \n\t"
1222 "pand %%mm5, %%mm1 \n\t"
1223 "pand %%mm6, %%mm2 \n\t"
1224 "por %%mm0, %%mm1 \n\t"
1225 "por %%mm2, %%mm1 \n\t"
1226 "movq 14(%1, %%"REG_a
"), %%mm0 \n\t"
1227 MOVNTQ" %%mm1, 8(%2, %%"REG_a
") \n\t"
1228 "movq 16(%1, %%"REG_a
"), %%mm1 \n\t"
1229 "movq 18(%1, %%"REG_a
"), %%mm2 \n\t"
1230 "pand %%mm6, %%mm0 \n\t"
1231 "pand %%mm7, %%mm1 \n\t"
1232 "pand %%mm5, %%mm2 \n\t"
1233 "por %%mm0, %%mm1 \n\t"
1234 "por %%mm2, %%mm1 \n\t"
1235 MOVNTQ" %%mm1, 16(%2, %%"REG_a
") \n\t"
1236 "add $24, %%"REG_a
" \n\t"
1240 :
"r" (src-mmx_size),
"r"(dst-mmx_size)
1243 __asm__
volatile(
SFENCE:::
"memory");
1244 __asm__
volatile(
EMMS:::
"memory");
1246 if (mmx_size==23)
return;
1250 src_size= 23-mmx_size;
1253 for (i=0; i<src_size; i+=3) {
1256 dst[i + 1] = src[i + 1];
1257 dst[i + 2] = src[i + 0];
/* Interleave planar Y/U/V into packed YUY2 (Y U Y V byte order, per
 * the punpcklbw/punpckhbw sequence in the body below). One chroma
 * line is reused for vertLumPerChroma luma lines (2 for 4:2:0 input,
 * 1 for 4:2:2 — see the wrappers further down).
 * NOTE(review): fragment — the width/height parameters and parts of
 * the body's loop framing are missing from this extraction. */
1262 static inline void RENAME(yuvPlanartoyuy2)(
const uint8_t *ysrc,
const uint8_t *usrc,
const uint8_t *vsrc, uint8_t *dst,
1264 int lumStride,
int chromStride,
int dstStride,
int vertLumPerChroma)
1268 for (y=0; y<
height; y++) {
1271 "xor %%"REG_a
", %%"REG_a
" \n\t"
1274 PREFETCH" 32(%1, %%"REG_a
", 2) \n\t"
1277 "movq (%2, %%"REG_a
"), %%mm0 \n\t"
1278 "movq %%mm0, %%mm2 \n\t"
1279 "movq (%3, %%"REG_a
"), %%mm1 \n\t"
1280 "punpcklbw %%mm1, %%mm0 \n\t"
1281 "punpckhbw %%mm1, %%mm2 \n\t"
1283 "movq (%1, %%"REG_a
",2), %%mm3 \n\t"
1284 "movq 8(%1, %%"REG_a
",2), %%mm5 \n\t"
1285 "movq %%mm3, %%mm4 \n\t"
1286 "movq %%mm5, %%mm6 \n\t"
1287 "punpcklbw %%mm0, %%mm3 \n\t"
1288 "punpckhbw %%mm0, %%mm4 \n\t"
1289 "punpcklbw %%mm2, %%mm5 \n\t"
1290 "punpckhbw %%mm2, %%mm6 \n\t"
1292 MOVNTQ" %%mm3, (%0, %%"REG_a
", 4) \n\t"
1293 MOVNTQ" %%mm4, 8(%0, %%"REG_a
", 4) \n\t"
1294 MOVNTQ" %%mm5, 16(%0, %%"REG_a
", 4) \n\t"
1295 MOVNTQ" %%mm6, 24(%0, %%"REG_a
", 4) \n\t"
1297 "add $8, %%"REG_a
" \n\t"
1298 "cmp %4, %%"REG_a
" \n\t"
1300 ::
"r"(dst),
"r"(ysrc),
"r"(usrc),
"r"(vsrc),
"g" (chromWidth)
1303 if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1) {
1304 usrc += chromStride;
1305 vsrc += chromStride;
/* YV12 (4:2:0 planar) -> YUY2: thin wrapper around yuvPlanartoyuy2
 * with vertLumPerChroma = 2 (each chroma line serves two luma lines).
 * NOTE(review): fragment — the function's braces and the width/height
 * parameter declarations are missing from this extraction. */
1319 static inline void RENAME(
yv12toyuy2)(
const uint8_t *ysrc,
const uint8_t *usrc,
const uint8_t *vsrc, uint8_t *dst,
1321 int lumStride,
int chromStride,
int dstStride)
1324 RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst,
width,
height, lumStride, chromStride, dstStride, 2);
/* Interleave planar Y/U/V into packed UYVY (U Y V Y byte order — note
 * the body below punpcklbw's luma INTO the chroma registers, the
 * mirror of the YUY2 variant above). Chroma reuse is controlled by
 * vertLumPerChroma, as in yuvPlanartoyuy2.
 * NOTE(review): fragment — width/height parameters and parts of the
 * body's loop framing are missing from this extraction. */
1327 static inline void RENAME(yuvPlanartouyvy)(
const uint8_t *ysrc,
const uint8_t *usrc,
const uint8_t *vsrc, uint8_t *dst,
1329 int lumStride,
int chromStride,
int dstStride,
int vertLumPerChroma)
1333 for (y=0; y<
height; y++) {
1336 "xor %%"REG_a
", %%"REG_a
" \n\t"
1339 PREFETCH" 32(%1, %%"REG_a
", 2) \n\t"
1342 "movq (%2, %%"REG_a
"), %%mm0 \n\t"
1343 "movq %%mm0, %%mm2 \n\t"
1344 "movq (%3, %%"REG_a
"), %%mm1 \n\t"
1345 "punpcklbw %%mm1, %%mm0 \n\t"
1346 "punpckhbw %%mm1, %%mm2 \n\t"
1348 "movq (%1, %%"REG_a
",2), %%mm3 \n\t"
1349 "movq 8(%1, %%"REG_a
",2), %%mm5 \n\t"
1350 "movq %%mm0, %%mm4 \n\t"
1351 "movq %%mm2, %%mm6 \n\t"
1352 "punpcklbw %%mm3, %%mm0 \n\t"
1353 "punpckhbw %%mm3, %%mm4 \n\t"
1354 "punpcklbw %%mm5, %%mm2 \n\t"
1355 "punpckhbw %%mm5, %%mm6 \n\t"
1357 MOVNTQ" %%mm0, (%0, %%"REG_a
", 4) \n\t"
1358 MOVNTQ" %%mm4, 8(%0, %%"REG_a
", 4) \n\t"
1359 MOVNTQ" %%mm2, 16(%0, %%"REG_a
", 4) \n\t"
1360 MOVNTQ" %%mm6, 24(%0, %%"REG_a
", 4) \n\t"
1362 "add $8, %%"REG_a
" \n\t"
1363 "cmp %4, %%"REG_a
" \n\t"
1365 ::
"r"(dst),
"r"(ysrc),
"r"(usrc),
"r"(vsrc),
"g" (chromWidth)
1368 if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1) {
1369 usrc += chromStride;
1370 vsrc += chromStride;
/* YV12 (4:2:0 planar) -> UYVY: thin wrapper around yuvPlanartouyvy
 * with vertLumPerChroma = 2.
 * NOTE(review): fragment — braces and width/height declarations are
 * missing from this extraction. */
1384 static inline void RENAME(
yv12touyvy)(
const uint8_t *ysrc,
const uint8_t *usrc,
const uint8_t *vsrc, uint8_t *dst,
1386 int lumStride,
int chromStride,
int dstStride)
1389 RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst,
width,
height, lumStride, chromStride, dstStride, 2);
/* 4:2:2 planar -> UYVY: wrapper around yuvPlanartouyvy with
 * vertLumPerChroma = 1 (one chroma line per luma line).
 * NOTE(review): fragment — braces and width/height declarations are
 * missing from this extraction. */
1395 static inline void RENAME(
yuv422ptouyvy)(
const uint8_t *ysrc,
const uint8_t *usrc,
const uint8_t *vsrc, uint8_t *dst,
1397 int lumStride,
int chromStride,
int dstStride)
1399 RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst,
width,
height, lumStride, chromStride, dstStride, 1);
/* 4:2:2 planar -> YUY2: wrapper around yuvPlanartoyuy2 with
 * vertLumPerChroma = 1 (one chroma line per luma line).
 * NOTE(review): fragment — braces and width/height declarations are
 * missing from this extraction. */
1405 static inline void RENAME(
yuv422ptoyuy2)(
const uint8_t *ysrc,
const uint8_t *usrc,
const uint8_t *vsrc, uint8_t *dst,
1407 int lumStride,
int chromStride,
int dstStride)
1409 RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst,
width,
height, lumStride, chromStride, dstStride, 1);
/* Packed YUY2 -> planar YV12: the body below processes two source
 * rows per iteration, masking with mm7 (0x00ff words) to split luma
 * from chroma and averaging/storing U and V at half resolution.
 * NOTE(review): fragment — width/height parameters and most of the
 * body framing are missing from this extraction. */
1416 static inline void RENAME(
yuy2toyv12)(
const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1418 int lumStride,
int chromStride,
int srcStride)
1422 for (y=0; y<
height; y+=2) {
1424 "xor %%"REG_a
", %%"REG_a
" \n\t"
1425 "pcmpeqw %%mm7, %%mm7 \n\t"
1426 "psrlw $8, %%mm7 \n\t"
1429 PREFETCH" 64(%0, %%"REG_a
", 4) \n\t"
1430 "movq (%0, %%"REG_a
", 4), %%mm0 \n\t"
1431 "movq 8(%0, %%"REG_a
", 4), %%mm1 \n\t"
1432 "movq %%mm0, %%mm2 \n\t"
1433 "movq %%mm1, %%mm3 \n\t"
1434 "psrlw $8, %%mm0 \n\t"
1435 "psrlw $8, %%mm1 \n\t"
1436 "pand %%mm7, %%mm2 \n\t"
1437 "pand %%mm7, %%mm3 \n\t"
1438 "packuswb %%mm1, %%mm0 \n\t"
1439 "packuswb %%mm3, %%mm2 \n\t"
1441 MOVNTQ" %%mm2, (%1, %%"REG_a
", 2) \n\t"
1443 "movq 16(%0, %%"REG_a
", 4), %%mm1 \n\t"
1444 "movq 24(%0, %%"REG_a
", 4), %%mm2 \n\t"
1445 "movq %%mm1, %%mm3 \n\t"
1446 "movq %%mm2, %%mm4 \n\t"
1447 "psrlw $8, %%mm1 \n\t"
1448 "psrlw $8, %%mm2 \n\t"
1449 "pand %%mm7, %%mm3 \n\t"
1450 "pand %%mm7, %%mm4 \n\t"
1451 "packuswb %%mm2, %%mm1 \n\t"
1452 "packuswb %%mm4, %%mm3 \n\t"
1454 MOVNTQ" %%mm3, 8(%1, %%"REG_a
", 2) \n\t"
1456 "movq %%mm0, %%mm2 \n\t"
1457 "movq %%mm1, %%mm3 \n\t"
1458 "psrlw $8, %%mm0 \n\t"
1459 "psrlw $8, %%mm1 \n\t"
1460 "pand %%mm7, %%mm2 \n\t"
1461 "pand %%mm7, %%mm3 \n\t"
1462 "packuswb %%mm1, %%mm0 \n\t"
1463 "packuswb %%mm3, %%mm2 \n\t"
1465 MOVNTQ" %%mm0, (%3, %%"REG_a
") \n\t"
1466 MOVNTQ" %%mm2, (%2, %%"REG_a
") \n\t"
1468 "add $8, %%"REG_a
" \n\t"
1469 "cmp %4, %%"REG_a
" \n\t"
1471 ::
"r"(src),
"r"(ydst),
"r"(udst),
"r"(vdst),
"g" (chromWidth)
1472 :
"memory",
"%"REG_a
1479 "xor %%"REG_a
", %%"REG_a
" \n\t"
1482 PREFETCH" 64(%0, %%"REG_a
", 4) \n\t"
1483 "movq (%0, %%"REG_a
", 4), %%mm0 \n\t"
1484 "movq 8(%0, %%"REG_a
", 4), %%mm1 \n\t"
1485 "movq 16(%0, %%"REG_a
", 4), %%mm2 \n\t"
1486 "movq 24(%0, %%"REG_a
", 4), %%mm3 \n\t"
1487 "pand %%mm7, %%mm0 \n\t"
1488 "pand %%mm7, %%mm1 \n\t"
1489 "pand %%mm7, %%mm2 \n\t"
1490 "pand %%mm7, %%mm3 \n\t"
1491 "packuswb %%mm1, %%mm0 \n\t"
1492 "packuswb %%mm3, %%mm2 \n\t"
1494 MOVNTQ" %%mm0, (%1, %%"REG_a
", 2) \n\t"
1495 MOVNTQ" %%mm2, 8(%1, %%"REG_a
", 2) \n\t"
1497 "add $8, %%"REG_a
" \n\t"
1498 "cmp %4, %%"REG_a
" \n\t"
1501 ::
"r"(src),
"r"(ydst),
"r"(udst),
"r"(vdst),
"g" (chromWidth)
1502 :
"memory",
"%"REG_a
1504 udst += chromStride;
1505 vdst += chromStride;
1509 __asm__
volatile(
EMMS" \n\t"
1515 #if COMPILE_TEMPLATE_MMX2 || COMPILE_TEMPLATE_AMD3DNOW
/* 2x upscale of a single plane using PAVGB-weighted (1:3 / 3:1)
 * interpolation in both directions; first/last rows and columns get
 * scalar edge handling (visible below as the (3*a + b)>>2 loops).
 * Only built for targets that provide a packed-average instruction
 * (hence the #if above).
 * NOTE(review): fragment — loop framing and parts of the body are
 * missing from this extraction. */
1516 static inline void RENAME(
planar2x)(
const uint8_t *src, uint8_t *dst,
int srcWidth,
int srcHeight,
int srcStride,
int dstStride)
1523 for (x=0; x<srcWidth-1; x++) {
1524 dst[2*x+1]= (3*src[x] + src[x+1])>>2;
1525 dst[2*x+2]= ( src[x] + 3*src[x+1])>>2;
1527 dst[2*srcWidth-1]= src[srcWidth-1];
1531 for (y=1; y<srcHeight; y++) {
1532 const x86_reg mmxSize= srcWidth&~15;
1534 "mov %4, %%"REG_a
" \n\t"
1535 "movq "MANGLE(mmx_ff)
", %%mm0 \n\t"
1536 "movq (%0, %%"REG_a
"), %%mm4 \n\t"
1537 "movq %%mm4, %%mm2 \n\t"
1538 "psllq $8, %%mm4 \n\t"
1539 "pand %%mm0, %%mm2 \n\t"
1540 "por %%mm2, %%mm4 \n\t"
1541 "movq (%1, %%"REG_a
"), %%mm5 \n\t"
1542 "movq %%mm5, %%mm3 \n\t"
1543 "psllq $8, %%mm5 \n\t"
1544 "pand %%mm0, %%mm3 \n\t"
1545 "por %%mm3, %%mm5 \n\t"
1547 "movq (%0, %%"REG_a
"), %%mm0 \n\t"
1548 "movq (%1, %%"REG_a
"), %%mm1 \n\t"
1549 "movq 1(%0, %%"REG_a
"), %%mm2 \n\t"
1550 "movq 1(%1, %%"REG_a
"), %%mm3 \n\t"
1551 PAVGB" %%mm0, %%mm5 \n\t"
1552 PAVGB" %%mm0, %%mm3 \n\t"
1553 PAVGB" %%mm0, %%mm5 \n\t"
1554 PAVGB" %%mm0, %%mm3 \n\t"
1555 PAVGB" %%mm1, %%mm4 \n\t"
1556 PAVGB" %%mm1, %%mm2 \n\t"
1557 PAVGB" %%mm1, %%mm4 \n\t"
1558 PAVGB" %%mm1, %%mm2 \n\t"
1559 "movq %%mm5, %%mm7 \n\t"
1560 "movq %%mm4, %%mm6 \n\t"
1561 "punpcklbw %%mm3, %%mm5 \n\t"
1562 "punpckhbw %%mm3, %%mm7 \n\t"
1563 "punpcklbw %%mm2, %%mm4 \n\t"
1564 "punpckhbw %%mm2, %%mm6 \n\t"
1565 MOVNTQ" %%mm5, (%2, %%"REG_a
", 2) \n\t"
1566 MOVNTQ" %%mm7, 8(%2, %%"REG_a
", 2) \n\t"
1567 MOVNTQ" %%mm4, (%3, %%"REG_a
", 2) \n\t"
1568 MOVNTQ" %%mm6, 8(%3, %%"REG_a
", 2) \n\t"
1569 "add $8, %%"REG_a
" \n\t"
1570 "movq -1(%0, %%"REG_a
"), %%mm4 \n\t"
1571 "movq -1(%1, %%"REG_a
"), %%mm5 \n\t"
1573 ::
"r" (src + mmxSize ),
"r" (src + srcStride + mmxSize ),
1574 "r" (dst + mmxSize*2),
"r" (dst + dstStride + mmxSize*2),
1579 for (x=mmxSize-1; x<srcWidth-1; x++) {
1580 dst[2*x +1]= (3*src[x+0] + src[x+srcStride+1])>>2;
1581 dst[2*x+dstStride+2]= ( src[x+0] + 3*src[x+srcStride+1])>>2;
1582 dst[2*x+dstStride+1]= ( src[x+1] + 3*src[x+srcStride ])>>2;
1583 dst[2*x +2]= (3*src[x+1] + src[x+srcStride ])>>2;
1585 dst[srcWidth*2 -1 ]= (3*src[srcWidth-1] + src[srcWidth-1 + srcStride])>>2;
1586 dst[srcWidth*2 -1 + dstStride]= ( src[srcWidth-1] + 3*src[srcWidth-1 + srcStride])>>2;
1595 for (x=0; x<srcWidth-1; x++) {
1596 dst[2*x+1]= (3*src[x] + src[x+1])>>2;
1597 dst[2*x+2]= ( src[x] + 3*src[x+1])>>2;
1599 dst[2*srcWidth-1]= src[srcWidth-1];
1601 __asm__
volatile(
EMMS" \n\t"
1607 #if !COMPILE_TEMPLATE_AMD3DNOW
/* Packed UYVY -> planar YV12: mirror of yuy2toyv12 — the body below
 * keeps the low ("pand %%mm7") bytes as chroma and the high
 * ("psrlw $8") bytes as luma, the opposite lane split of the YUY2
 * variant.
 * NOTE(review): fragment — width/height parameters and most of the
 * body framing are missing from this extraction. */
1614 static inline void RENAME(uyvytoyv12)(
const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1616 int lumStride,
int chromStride,
int srcStride)
1619 const x86_reg chromWidth= width>>1;
1620 for (y=0; y<
height; y+=2) {
1622 "xor %%"REG_a
", %%"REG_a
" \n\t"
1623 "pcmpeqw %%mm7, %%mm7 \n\t"
1624 "psrlw $8, %%mm7 \n\t"
1627 PREFETCH" 64(%0, %%"REG_a
", 4) \n\t"
1628 "movq (%0, %%"REG_a
", 4), %%mm0 \n\t"
1629 "movq 8(%0, %%"REG_a
", 4), %%mm1 \n\t"
1630 "movq %%mm0, %%mm2 \n\t"
1631 "movq %%mm1, %%mm3 \n\t"
1632 "pand %%mm7, %%mm0 \n\t"
1633 "pand %%mm7, %%mm1 \n\t"
1634 "psrlw $8, %%mm2 \n\t"
1635 "psrlw $8, %%mm3 \n\t"
1636 "packuswb %%mm1, %%mm0 \n\t"
1637 "packuswb %%mm3, %%mm2 \n\t"
1639 MOVNTQ" %%mm2, (%1, %%"REG_a
", 2) \n\t"
1641 "movq 16(%0, %%"REG_a
", 4), %%mm1 \n\t"
1642 "movq 24(%0, %%"REG_a
", 4), %%mm2 \n\t"
1643 "movq %%mm1, %%mm3 \n\t"
1644 "movq %%mm2, %%mm4 \n\t"
1645 "pand %%mm7, %%mm1 \n\t"
1646 "pand %%mm7, %%mm2 \n\t"
1647 "psrlw $8, %%mm3 \n\t"
1648 "psrlw $8, %%mm4 \n\t"
1649 "packuswb %%mm2, %%mm1 \n\t"
1650 "packuswb %%mm4, %%mm3 \n\t"
1652 MOVNTQ" %%mm3, 8(%1, %%"REG_a
", 2) \n\t"
1654 "movq %%mm0, %%mm2 \n\t"
1655 "movq %%mm1, %%mm3 \n\t"
1656 "psrlw $8, %%mm0 \n\t"
1657 "psrlw $8, %%mm1 \n\t"
1658 "pand %%mm7, %%mm2 \n\t"
1659 "pand %%mm7, %%mm3 \n\t"
1660 "packuswb %%mm1, %%mm0 \n\t"
1661 "packuswb %%mm3, %%mm2 \n\t"
1663 MOVNTQ" %%mm0, (%3, %%"REG_a
") \n\t"
1664 MOVNTQ" %%mm2, (%2, %%"REG_a
") \n\t"
1666 "add $8, %%"REG_a
" \n\t"
1667 "cmp %4, %%"REG_a
" \n\t"
1669 ::
"r"(src),
"r"(ydst),
"r"(udst),
"r"(vdst),
"g" (chromWidth)
1670 :
"memory",
"%"REG_a
1677 "xor %%"REG_a
", %%"REG_a
" \n\t"
1680 PREFETCH" 64(%0, %%"REG_a
", 4) \n\t"
1681 "movq (%0, %%"REG_a
", 4), %%mm0 \n\t"
1682 "movq 8(%0, %%"REG_a
", 4), %%mm1 \n\t"
1683 "movq 16(%0, %%"REG_a
", 4), %%mm2 \n\t"
1684 "movq 24(%0, %%"REG_a
", 4), %%mm3 \n\t"
1685 "psrlw $8, %%mm0 \n\t"
1686 "psrlw $8, %%mm1 \n\t"
1687 "psrlw $8, %%mm2 \n\t"
1688 "psrlw $8, %%mm3 \n\t"
1689 "packuswb %%mm1, %%mm0 \n\t"
1690 "packuswb %%mm3, %%mm2 \n\t"
1692 MOVNTQ" %%mm0, (%1, %%"REG_a
", 2) \n\t"
1693 MOVNTQ" %%mm2, 8(%1, %%"REG_a
", 2) \n\t"
1695 "add $8, %%"REG_a
" \n\t"
1696 "cmp %4, %%"REG_a
" \n\t"
1699 ::
"r"(src),
"r"(ydst),
"r"(udst),
"r"(vdst),
"g" (chromWidth)
1700 :
"memory",
"%"REG_a
1702 udst += chromStride;
1703 vdst += chromStride;
1707 __asm__
volatile(
EMMS" \n\t"
/*
 * Convert 24-bit packed RGB to planar YV12 (4:2:0), MMX template.
 * Per pair of rows: a Y pass runs once per row (i=0,1) multiplying each
 * pixel by ff_bgr2YCoeff via pmaddwd; then one chroma pass averages 2x2
 * pixel blocks and produces 4 U and 4 V samples per iteration using
 * ff_bgr2UCoeff / ff_bgr2VCoeff.  Rows that do not fit the y<height-2 loop
 * are handled by the C fallback at the end.
 * NOTE(review): extraction has elided lines (loop labels/branches, braces,
 * #else/#endif halves of the FAST_BGR2YV12 conditionals) and fused original
 * line numbers into the text; only visible code is annotated.
 */
1720 static inline void RENAME(
rgb24toyv12)(
const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1722 int lumStride,
int chromStride,
int srcStride)
1725 const x86_reg chromWidth= width>>1;
1726 for (y=0; y<height-2; y+=2) {
/* Y pass, run for each of the two rows of the pair. */
1728 for (i=0; i<2; i++) {
/* REG_a = -width (negative counter); REG_d = 3*REG_a indexes the
 * 3-byte-per-pixel source relative to src+width*3. */
1730 "mov %2, %%"REG_a
" \n\t"
1731 "movq "MANGLE(ff_bgr2YCoeff)
", %%mm6 \n\t"
1732 "movq "MANGLE(ff_w1111)
", %%mm5 \n\t"
1733 "pxor %%mm7, %%mm7 \n\t"
1734 "lea (%%"REG_a
", %%"REG_a
", 2), %%"REG_d
" \n\t"
/* Load 4 pixels (3 bytes apart), widen bytes to words, and take the
 * dot product with the Y coefficients. */
1738 "movd (%0, %%"REG_d
"), %%mm0 \n\t"
1739 "movd 3(%0, %%"REG_d
"), %%mm1 \n\t"
1740 "punpcklbw %%mm7, %%mm0 \n\t"
1741 "punpcklbw %%mm7, %%mm1 \n\t"
1742 "movd 6(%0, %%"REG_d
"), %%mm2 \n\t"
1743 "movd 9(%0, %%"REG_d
"), %%mm3 \n\t"
1744 "punpcklbw %%mm7, %%mm2 \n\t"
1745 "punpcklbw %%mm7, %%mm3 \n\t"
1746 "pmaddwd %%mm6, %%mm0 \n\t"
1747 "pmaddwd %%mm6, %%mm1 \n\t"
1748 "pmaddwd %%mm6, %%mm2 \n\t"
1749 "pmaddwd %%mm6, %%mm3 \n\t"
/* Precision variant: extra >>8 before summing partial products. */
1750 #ifndef FAST_BGR2YV12
1751 "psrad $8, %%mm0 \n\t"
1752 "psrad $8, %%mm1 \n\t"
1753 "psrad $8, %%mm2 \n\t"
1754 "psrad $8, %%mm3 \n\t"
/* Horizontal add via pmaddwd with ff_w1111, then scale to 8 bits. */
1756 "packssdw %%mm1, %%mm0 \n\t"
1757 "packssdw %%mm3, %%mm2 \n\t"
1758 "pmaddwd %%mm5, %%mm0 \n\t"
1759 "pmaddwd %%mm5, %%mm2 \n\t"
1760 "packssdw %%mm2, %%mm0 \n\t"
1761 "psraw $7, %%mm0 \n\t"
/* Same computation for the next 4 pixels (offsets 12..21). */
1763 "movd 12(%0, %%"REG_d
"), %%mm4 \n\t"
1764 "movd 15(%0, %%"REG_d
"), %%mm1 \n\t"
1765 "punpcklbw %%mm7, %%mm4 \n\t"
1766 "punpcklbw %%mm7, %%mm1 \n\t"
1767 "movd 18(%0, %%"REG_d
"), %%mm2 \n\t"
1768 "movd 21(%0, %%"REG_d
"), %%mm3 \n\t"
1769 "punpcklbw %%mm7, %%mm2 \n\t"
1770 "punpcklbw %%mm7, %%mm3 \n\t"
1771 "pmaddwd %%mm6, %%mm4 \n\t"
1772 "pmaddwd %%mm6, %%mm1 \n\t"
1773 "pmaddwd %%mm6, %%mm2 \n\t"
1774 "pmaddwd %%mm6, %%mm3 \n\t"
1775 #ifndef FAST_BGR2YV12
1776 "psrad $8, %%mm4 \n\t"
1777 "psrad $8, %%mm1 \n\t"
1778 "psrad $8, %%mm2 \n\t"
1779 "psrad $8, %%mm3 \n\t"
1781 "packssdw %%mm1, %%mm4 \n\t"
1782 "packssdw %%mm3, %%mm2 \n\t"
1783 "pmaddwd %%mm5, %%mm4 \n\t"
1784 "pmaddwd %%mm5, %%mm2 \n\t"
/* Advance the 24bpp source index by 8 pixels (24 bytes). */
1785 "add $24, %%"REG_d
" \n\t"
1786 "packssdw %%mm2, %%mm4 \n\t"
1787 "psraw $7, %%mm4 \n\t"
/* Pack 8 Y results to bytes, add the Y offset (e.g. +16), store. */
1789 "packuswb %%mm4, %%mm0 \n\t"
1790 "paddusb "MANGLE(ff_bgr2YOffset)
", %%mm0 \n\t"
1792 MOVNTQ" %%mm0, (%1, %%"REG_a
") \n\t"
1793 "add $8, %%"REG_a
" \n\t"
/* End-relative pointers + negative counter avoid a separate compare. */
1795 : :
"r" (src+width*3),
"r" (ydst+width),
"g" ((
x86_reg)-width)
1796 :
"%"REG_a,
"%"REG_d
/* Chroma pass: REG_a = -chromWidth; REG_d = 6*REG_a (2 pixels * 3 bytes
 * per chroma sample). %0/%1 are the two rows being averaged. */
1803 "mov %4, %%"REG_a
" \n\t"
1804 "movq "MANGLE(ff_w1111)
", %%mm5 \n\t"
1805 "movq "MANGLE(ff_bgr2UCoeff)
", %%mm6 \n\t"
1806 "pxor %%mm7, %%mm7 \n\t"
1807 "lea (%%"REG_a
", %%"REG_a
", 2), %%"REG_d
" \n\t"
1808 "add %%"REG_d
", %%"REG_d
" \n\t"
/* Fast 2x2 average using PAVGB (MMX2/3DNow); the shifted-by-one-pixel
 * psrlq $24 re-average folds in the horizontal neighbour. */
1813 #if COMPILE_TEMPLATE_MMX2 || COMPILE_TEMPLATE_AMD3DNOW
1814 "movq (%0, %%"REG_d
"), %%mm0 \n\t"
1815 "movq (%1, %%"REG_d
"), %%mm1 \n\t"
1816 "movq 6(%0, %%"REG_d
"), %%mm2 \n\t"
1817 "movq 6(%1, %%"REG_d
"), %%mm3 \n\t"
1818 PAVGB" %%mm1, %%mm0 \n\t"
1819 PAVGB" %%mm3, %%mm2 \n\t"
1820 "movq %%mm0, %%mm1 \n\t"
1821 "movq %%mm2, %%mm3 \n\t"
1822 "psrlq $24, %%mm0 \n\t"
1823 "psrlq $24, %%mm2 \n\t"
1824 PAVGB" %%mm1, %%mm0 \n\t"
1825 PAVGB" %%mm3, %%mm2 \n\t"
1826 "punpcklbw %%mm7, %%mm0 \n\t"
1827 "punpcklbw %%mm7, %%mm2 \n\t"
/* Plain-MMX fallback: sum the four pixels of each 2x2 block in words
 * and divide by 4 with psrlw $2. */
1829 "movd (%0, %%"REG_d
"), %%mm0 \n\t"
1830 "movd (%1, %%"REG_d
"), %%mm1 \n\t"
1831 "movd 3(%0, %%"REG_d
"), %%mm2 \n\t"
1832 "movd 3(%1, %%"REG_d
"), %%mm3 \n\t"
1833 "punpcklbw %%mm7, %%mm0 \n\t"
1834 "punpcklbw %%mm7, %%mm1 \n\t"
1835 "punpcklbw %%mm7, %%mm2 \n\t"
1836 "punpcklbw %%mm7, %%mm3 \n\t"
1837 "paddw %%mm1, %%mm0 \n\t"
1838 "paddw %%mm3, %%mm2 \n\t"
1839 "paddw %%mm2, %%mm0 \n\t"
1840 "movd 6(%0, %%"REG_d
"), %%mm4 \n\t"
1841 "movd 6(%1, %%"REG_d
"), %%mm1 \n\t"
1842 "movd 9(%0, %%"REG_d
"), %%mm2 \n\t"
1843 "movd 9(%1, %%"REG_d
"), %%mm3 \n\t"
1844 "punpcklbw %%mm7, %%mm4 \n\t"
1845 "punpcklbw %%mm7, %%mm1 \n\t"
1846 "punpcklbw %%mm7, %%mm2 \n\t"
1847 "punpcklbw %%mm7, %%mm3 \n\t"
1848 "paddw %%mm1, %%mm4 \n\t"
1849 "paddw %%mm3, %%mm2 \n\t"
1850 "paddw %%mm4, %%mm2 \n\t"
1851 "psrlw $2, %%mm0 \n\t"
1852 "psrlw $2, %%mm2 \n\t"
/* U in mm0/mm2 via mm6 (ff_bgr2UCoeff); V via ff_bgr2VCoeff loaded
 * into mm1/mm3 here. */
1854 "movq "MANGLE(ff_bgr2VCoeff)
", %%mm1 \n\t"
1855 "movq "MANGLE(ff_bgr2VCoeff)
", %%mm3 \n\t"
1857 "pmaddwd %%mm0, %%mm1 \n\t"
1858 "pmaddwd %%mm2, %%mm3 \n\t"
1859 "pmaddwd %%mm6, %%mm0 \n\t"
1860 "pmaddwd %%mm6, %%mm2 \n\t"
1861 #ifndef FAST_BGR2YV12
1862 "psrad $8, %%mm0 \n\t"
1863 "psrad $8, %%mm1 \n\t"
1864 "psrad $8, %%mm2 \n\t"
1865 "psrad $8, %%mm3 \n\t"
1867 "packssdw %%mm2, %%mm0 \n\t"
1868 "packssdw %%mm3, %%mm1 \n\t"
1869 "pmaddwd %%mm5, %%mm0 \n\t"
1870 "pmaddwd %%mm5, %%mm1 \n\t"
/* mm0 = U0 U1 V0 V1 as words after this pack/shift. */
1871 "packssdw %%mm1, %%mm0 \n\t"
1872 "psraw $7, %%mm0 \n\t"
/* Second pair of 2x2 blocks (pixel offsets 12..23), same two variants. */
1874 #if COMPILE_TEMPLATE_MMX2 || COMPILE_TEMPLATE_AMD3DNOW
1875 "movq 12(%0, %%"REG_d
"), %%mm4 \n\t"
1876 "movq 12(%1, %%"REG_d
"), %%mm1 \n\t"
1877 "movq 18(%0, %%"REG_d
"), %%mm2 \n\t"
1878 "movq 18(%1, %%"REG_d
"), %%mm3 \n\t"
1879 PAVGB" %%mm1, %%mm4 \n\t"
1880 PAVGB" %%mm3, %%mm2 \n\t"
1881 "movq %%mm4, %%mm1 \n\t"
1882 "movq %%mm2, %%mm3 \n\t"
1883 "psrlq $24, %%mm4 \n\t"
1884 "psrlq $24, %%mm2 \n\t"
1885 PAVGB" %%mm1, %%mm4 \n\t"
1886 PAVGB" %%mm3, %%mm2 \n\t"
1887 "punpcklbw %%mm7, %%mm4 \n\t"
1888 "punpcklbw %%mm7, %%mm2 \n\t"
1890 "movd 12(%0, %%"REG_d
"), %%mm4 \n\t"
1891 "movd 12(%1, %%"REG_d
"), %%mm1 \n\t"
1892 "movd 15(%0, %%"REG_d
"), %%mm2 \n\t"
1893 "movd 15(%1, %%"REG_d
"), %%mm3 \n\t"
1894 "punpcklbw %%mm7, %%mm4 \n\t"
1895 "punpcklbw %%mm7, %%mm1 \n\t"
1896 "punpcklbw %%mm7, %%mm2 \n\t"
1897 "punpcklbw %%mm7, %%mm3 \n\t"
1898 "paddw %%mm1, %%mm4 \n\t"
1899 "paddw %%mm3, %%mm2 \n\t"
1900 "paddw %%mm2, %%mm4 \n\t"
1901 "movd 18(%0, %%"REG_d
"), %%mm5 \n\t"
1902 "movd 18(%1, %%"REG_d
"), %%mm1 \n\t"
1903 "movd 21(%0, %%"REG_d
"), %%mm2 \n\t"
1904 "movd 21(%1, %%"REG_d
"), %%mm3 \n\t"
1905 "punpcklbw %%mm7, %%mm5 \n\t"
1906 "punpcklbw %%mm7, %%mm1 \n\t"
1907 "punpcklbw %%mm7, %%mm2 \n\t"
1908 "punpcklbw %%mm7, %%mm3 \n\t"
1909 "paddw %%mm1, %%mm5 \n\t"
1910 "paddw %%mm3, %%mm2 \n\t"
1911 "paddw %%mm5, %%mm2 \n\t"
/* mm5 was clobbered as a scratch sum above; reload ff_w1111. */
1912 "movq "MANGLE(ff_w1111)
", %%mm5 \n\t"
1913 "psrlw $2, %%mm4 \n\t"
1914 "psrlw $2, %%mm2 \n\t"
1916 "movq "MANGLE(ff_bgr2VCoeff)
", %%mm1 \n\t"
1917 "movq "MANGLE(ff_bgr2VCoeff)
", %%mm3 \n\t"
1919 "pmaddwd %%mm4, %%mm1 \n\t"
1920 "pmaddwd %%mm2, %%mm3 \n\t"
1921 "pmaddwd %%mm6, %%mm4 \n\t"
1922 "pmaddwd %%mm6, %%mm2 \n\t"
1923 #ifndef FAST_BGR2YV12
1924 "psrad $8, %%mm4 \n\t"
1925 "psrad $8, %%mm1 \n\t"
1926 "psrad $8, %%mm2 \n\t"
1927 "psrad $8, %%mm3 \n\t"
1929 "packssdw %%mm2, %%mm4 \n\t"
1930 "packssdw %%mm3, %%mm1 \n\t"
1931 "pmaddwd %%mm5, %%mm4 \n\t"
1932 "pmaddwd %%mm5, %%mm1 \n\t"
1933 "add $24, %%"REG_d
" \n\t"
1934 "packssdw %%mm1, %%mm4 \n\t"
1935 "psraw $7, %%mm4 \n\t"
/* Interleave the two U/V quads, pack to signed bytes, bias into the
 * unsigned range, and store 4 U bytes to udst (%2), 4 V to vdst (%3). */
1937 "movq %%mm0, %%mm1 \n\t"
1938 "punpckldq %%mm4, %%mm0 \n\t"
1939 "punpckhdq %%mm4, %%mm1 \n\t"
1940 "packsswb %%mm1, %%mm0 \n\t"
1941 "paddb "MANGLE(ff_bgr2UVOffset)
", %%mm0 \n\t"
1942 "movd %%mm0, (%2, %%"REG_a
") \n\t"
1943 "punpckhdq %%mm0, %%mm0 \n\t"
1944 "movd %%mm0, (%3, %%"REG_a
") \n\t"
1945 "add $4, %%"REG_a
" \n\t"
1947 : :
"r" (src+chromWidth*6),
"r" (src+srcStride+chromWidth*6),
"r" (udst+chromWidth),
"r" (vdst+chromWidth),
"g" (-chromWidth)
1948 :
"%"REG_a,
"%"REG_d
1951 udst += chromStride;
1952 vdst += chromStride;
/* Exit MMX state before the C fallback below. */
1956 __asm__
volatile(
EMMS" \n\t"
/* Remaining rows (loop stops at height-2) handled in plain C. */
1960 rgb24toyv12_c(src, ydst, udst, vdst, width, height-y, lumStride, chromStride, srcStride);
1964 #if !COMPILE_TEMPLATE_AMD3DNOW
/*
 * Interleave two byte planes: dest[2*i] = src1[i], dest[2*i+1] = src2[i],
 * row by row (see the scalar tail below, which establishes this contract).
 * SSE2 path processes 16 bytes of each source per iteration; the MMX path
 * processes 16 bytes using punpcklbw/punpckhbw pairs.
 * NOTE(review): the signature head was elided by extraction — presumably
 * this is the interleave-bytes routine; confirm against the full source.
 */
1967 int src2Stride,
int dstStride)
1971 for (h=0; h <
height; h++) {
1974 #if COMPILE_TEMPLATE_SSE2
1976 "xor %%"REG_a
", %%"REG_a
" \n\t"
/* Load 16 bytes of src1 twice (low/high halves are split separately)
 * and 16 bytes of src2, then interleave low and high halves. */
1980 "movdqa (%1, %%"REG_a
"), %%xmm0 \n\t"
1981 "movdqa (%1, %%"REG_a
"), %%xmm1 \n\t"
1982 "movdqa (%2, %%"REG_a
"), %%xmm2 \n\t"
1983 "punpcklbw %%xmm2, %%xmm0 \n\t"
1984 "punpckhbw %%xmm2, %%xmm1 \n\t"
/* Non-temporal 16-byte stores; movntdq requires 16-byte alignment of
 * the destination — NOTE(review): alignment guarantee not visible here. */
1985 "movntdq %%xmm0, (%0, %%"REG_a
", 2) \n\t"
1986 "movntdq %%xmm1, 16(%0, %%"REG_a
", 2) \n\t"
1987 "add $16, %%"REG_a
" \n\t"
1988 "cmp %3, %%"REG_a
" \n\t"
1990 ::
"r"(dest),
"r"(src1),
"r"(src2),
"r" ((
x86_reg)width-15)
1991 :
"memory",
"%"REG_a
""
/* MMX fallback: same interleave, 8 bytes per source register. */
1995 "xor %%"REG_a
", %%"REG_a
" \n\t"
1999 "movq (%1, %%"REG_a
"), %%mm0 \n\t"
2000 "movq 8(%1, %%"REG_a
"), %%mm2 \n\t"
2001 "movq %%mm0, %%mm1 \n\t"
2002 "movq %%mm2, %%mm3 \n\t"
2003 "movq (%2, %%"REG_a
"), %%mm4 \n\t"
2004 "movq 8(%2, %%"REG_a
"), %%mm5 \n\t"
2005 "punpcklbw %%mm4, %%mm0 \n\t"
2006 "punpckhbw %%mm4, %%mm1 \n\t"
2007 "punpcklbw %%mm5, %%mm2 \n\t"
2008 "punpckhbw %%mm5, %%mm3 \n\t"
2009 MOVNTQ" %%mm0, (%0, %%"REG_a
", 2) \n\t"
2010 MOVNTQ" %%mm1, 8(%0, %%"REG_a
", 2) \n\t"
2011 MOVNTQ" %%mm2, 16(%0, %%"REG_a
", 2) \n\t"
2012 MOVNTQ" %%mm3, 24(%0, %%"REG_a
", 2) \n\t"
2013 "add $16, %%"REG_a
" \n\t"
2014 "cmp %3, %%"REG_a
" \n\t"
2016 ::
"r"(dest),
"r"(src1),
"r"(src2),
"r" ((
x86_reg)width-15)
2017 :
"memory",
"%"REG_a
/* Scalar tail for the last width%16 bytes of each row. */
2020 for (w= (width&(~15)); w <
width; w++) {
2021 dest[2*w+0] = src1[w];
2022 dest[2*w+1] = src2[w];
2036 #if !COMPILE_TEMPLATE_SSE2
2037 #if !COMPILE_TEMPLATE_AMD3DNOW
/*
 * Upsample two byte planes by 2x in both directions: each source byte is
 * duplicated horizontally (d[2x] = d[2x+1] = s[x], see scalar tails) and
 * each source row feeds two destination rows (srcStride*(y>>1)).
 * The MMX loop duplicates 32 source bytes into 64 output bytes per
 * iteration via punpcklbw/punpckhbw of a register with itself.
 * NOTE(review): signature head elided by extraction; some asm lines of
 * each unrolled store group are missing from this view.
 */
2039 uint8_t *dst1, uint8_t *dst2,
2041 int srcStride1,
int srcStride2,
2042 int dstStride1,
int dstStride2)
/* Source planes are half-size in each dimension. */
2046 w=width/2; h=height/2;
/* Warm the cache for both source planes. */
2050 ::
"m"(*(src1+srcStride1)),
"m"(*(src2+srcStride2)):
"memory");
/* First plane: src1 -> dst1. */
2052 const uint8_t*
s1=src1+srcStride1*(y>>1);
2053 uint8_t* d=dst1+dstStride1*y;
2055 for (;x<w-31;x+=32) {
2058 "movq %1, %%mm0 \n\t"
2059 "movq 8%1, %%mm2 \n\t"
2060 "movq 16%1, %%mm4 \n\t"
2061 "movq 24%1, %%mm6 \n\t"
2062 "movq %%mm0, %%mm1 \n\t"
2063 "movq %%mm2, %%mm3 \n\t"
2064 "movq %%mm4, %%mm5 \n\t"
2065 "movq %%mm6, %%mm7 \n\t"
/* punpck a register with itself: each byte becomes a doubled pair. */
2066 "punpcklbw %%mm0, %%mm0 \n\t"
2067 "punpckhbw %%mm1, %%mm1 \n\t"
2068 "punpcklbw %%mm2, %%mm2 \n\t"
2069 "punpckhbw %%mm3, %%mm3 \n\t"
2070 "punpcklbw %%mm4, %%mm4 \n\t"
2071 "punpckhbw %%mm5, %%mm5 \n\t"
2072 "punpcklbw %%mm6, %%mm6 \n\t"
2073 "punpckhbw %%mm7, %%mm7 \n\t"
2076 MOVNTQ" %%mm2, 16%0 \n\t"
2077 MOVNTQ" %%mm3, 24%0 \n\t"
2078 MOVNTQ" %%mm4, 32%0 \n\t"
2079 MOVNTQ" %%mm5, 40%0 \n\t"
2080 MOVNTQ" %%mm6, 48%0 \n\t"
/* Scalar tail: horizontal 2x duplication. */
2086 for (;x<w;x++) d[2*x]=d[2*x+1]=s1[x];
/* Second plane: src2 -> dst2, identical scheme. */
2089 const uint8_t*
s2=src2+srcStride2*(y>>1);
2090 uint8_t* d=dst2+dstStride2*y;
2092 for (;x<w-31;x+=32) {
2095 "movq %1, %%mm0 \n\t"
2096 "movq 8%1, %%mm2 \n\t"
2097 "movq 16%1, %%mm4 \n\t"
2098 "movq 24%1, %%mm6 \n\t"
2099 "movq %%mm0, %%mm1 \n\t"
2100 "movq %%mm2, %%mm3 \n\t"
2101 "movq %%mm4, %%mm5 \n\t"
2102 "movq %%mm6, %%mm7 \n\t"
2103 "punpcklbw %%mm0, %%mm0 \n\t"
2104 "punpckhbw %%mm1, %%mm1 \n\t"
2105 "punpcklbw %%mm2, %%mm2 \n\t"
2106 "punpckhbw %%mm3, %%mm3 \n\t"
2107 "punpcklbw %%mm4, %%mm4 \n\t"
2108 "punpckhbw %%mm5, %%mm5 \n\t"
2109 "punpcklbw %%mm6, %%mm6 \n\t"
2110 "punpckhbw %%mm7, %%mm7 \n\t"
2113 MOVNTQ" %%mm2, 16%0 \n\t"
2114 MOVNTQ" %%mm3, 24%0 \n\t"
2115 MOVNTQ" %%mm4, 32%0 \n\t"
2116 MOVNTQ" %%mm5, 40%0 \n\t"
2117 MOVNTQ" %%mm6, 48%0 \n\t"
2123 for (;x<w;x++) d[2*x]=d[2*x+1]=s2[x];
/*
 * Merge a full-resolution luma plane (yp) with quarter-resolution chroma
 * planes (up/vp, advanced once per 4 output rows: y>>2) into one packed
 * output stream: the scalar tail shows d[8x+0..7] built from 4 consecutive
 * luma samples interleaved with chroma, i.e. a YUYV-style packing.
 * The MMX loop produces 64 output bytes (8 MOVNTQ stores) from 32 luma and
 * 8+8 chroma bytes per iteration.
 * NOTE(review): signature head and several scalar-tail lines were elided
 * by extraction; only visible code is annotated.
 */
2135 int srcStride1,
int srcStride2,
2136 int srcStride3,
int dstStride)
2142 const uint8_t* yp=src1+srcStride1*y;
/* Chroma rows repeat for 4 luma rows (vertical 1/4 subsampling). */
2143 const uint8_t* up=src2+srcStride2*(y>>2);
2144 const uint8_t* vp=src3+srcStride3*(y>>2);
2145 uint8_t* d=dst+dstStride*y;
/* Load 8 luma (x4 scaling: %0 counts groups of 4 pixels), 8 U, 8 V;
 * widen chroma to doubled pairs via self-unpack. */
2152 "movq (%1, %0, 4), %%mm0 \n\t"
2153 "movq (%2, %0), %%mm1 \n\t"
2154 "movq (%3, %0), %%mm2 \n\t"
2155 "movq %%mm0, %%mm3 \n\t"
2156 "movq %%mm1, %%mm4 \n\t"
2157 "movq %%mm2, %%mm5 \n\t"
2158 "punpcklbw %%mm1, %%mm1 \n\t"
2159 "punpcklbw %%mm2, %%mm2 \n\t"
2160 "punpckhbw %%mm4, %%mm4 \n\t"
2161 "punpckhbw %%mm5, %%mm5 \n\t"
/* Interleave U with V, then luma with the U/V stream; store first 16B. */
2163 "movq %%mm1, %%mm6 \n\t"
2164 "punpcklbw %%mm2, %%mm1 \n\t"
2165 "punpcklbw %%mm1, %%mm0 \n\t"
2166 "punpckhbw %%mm1, %%mm3 \n\t"
2167 MOVNTQ" %%mm0, (%4, %0, 8) \n\t"
2168 MOVNTQ" %%mm3, 8(%4, %0, 8) \n\t"
/* Second group of 8 luma bytes with the high half of the U/V pairs. */
2170 "punpckhbw %%mm2, %%mm6 \n\t"
2171 "movq 8(%1, %0, 4), %%mm0 \n\t"
2172 "movq %%mm0, %%mm3 \n\t"
2173 "punpcklbw %%mm6, %%mm0 \n\t"
2174 "punpckhbw %%mm6, %%mm3 \n\t"
2175 MOVNTQ" %%mm0, 16(%4, %0, 8) \n\t"
2176 MOVNTQ" %%mm3, 24(%4, %0, 8) \n\t"
/* Third group uses the high-unpacked chroma copies (mm4/mm5). */
2178 "movq %%mm4, %%mm6 \n\t"
2179 "movq 16(%1, %0, 4), %%mm0 \n\t"
2180 "movq %%mm0, %%mm3 \n\t"
2181 "punpcklbw %%mm5, %%mm4 \n\t"
2182 "punpcklbw %%mm4, %%mm0 \n\t"
2183 "punpckhbw %%mm4, %%mm3 \n\t"
2184 MOVNTQ" %%mm0, 32(%4, %0, 8) \n\t"
2185 MOVNTQ" %%mm3, 40(%4, %0, 8) \n\t"
/* Fourth group completes the 64 output bytes. */
2187 "punpckhbw %%mm5, %%mm6 \n\t"
2188 "movq 24(%1, %0, 4), %%mm0 \n\t"
2189 "movq %%mm0, %%mm3 \n\t"
2190 "punpcklbw %%mm6, %%mm0 \n\t"
2191 "punpckhbw %%mm6, %%mm3 \n\t"
2192 MOVNTQ" %%mm0, 48(%4, %0, 8) \n\t"
2193 MOVNTQ" %%mm3, 56(%4, %0, 8) \n\t"
2196 :
"r"(yp),
"r" (up),
"r"(vp),
"r"(d)
/* Scalar tail: x2 indexes 4 luma samples per packed group of 8 bytes. */
2200 const int x2 = x<<2;
2203 d[8*x+2] = yp[x2+1];
2205 d[8*x+4] = yp[x2+2];
2207 d[8*x+6] = yp[x2+3];
/*
 * Copy every second byte: dst[i] = src[2*i] (see scalar tail).
 * The MMX loop masks each 16-bit word with 0x00ff (keeping the
 * even/low byte) and packs 32 source bytes down to 16 output bytes
 * per iteration; 'count' appears to run as a negative index toward 0.
 * NOTE(review): prologue (pointer biasing, count setup, loop label) was
 * elided by extraction — the negative offsets (-30, -15, ...) only make
 * sense with that missing setup; confirm against the full source.
 */
2219 static void RENAME(extract_even)(
const uint8_t *src, uint8_t *dst,
x86_reg count)
/* mm7 = 0x00ff per word: selects even-positioned source bytes. */
2228 "pcmpeqw %%mm7, %%mm7 \n\t"
2229 "psrlw $8, %%mm7 \n\t"
2231 "movq -30(%1, %0, 2), %%mm0 \n\t"
2232 "movq -22(%1, %0, 2), %%mm1 \n\t"
2233 "movq -14(%1, %0, 2), %%mm2 \n\t"
2234 "movq -6(%1, %0, 2), %%mm3 \n\t"
2235 "pand %%mm7, %%mm0 \n\t"
2236 "pand %%mm7, %%mm1 \n\t"
2237 "pand %%mm7, %%mm2 \n\t"
2238 "pand %%mm7, %%mm3 \n\t"
2239 "packuswb %%mm1, %%mm0 \n\t"
2240 "packuswb %%mm3, %%mm2 \n\t"
2241 MOVNTQ" %%mm0,-15(%2, %0) \n\t"
2242 MOVNTQ" %%mm2,- 7(%2, %0) \n\t"
2246 :
"r"(src),
"r"(dst)
/* Scalar tail (loop structure elided): even-byte copy. */
2251 dst[count]= src[2*count];
2256 #if !COMPILE_TEMPLATE_AMD3DNOW
/*
 * De-interleave a 4-byte-period stream: dst0[i] = src[4*i+0],
 * dst1[i] = src[4*i+2] (see scalar tail) — e.g. the two chroma channels
 * of a packed 4:2:2 stream whose chroma sits on even byte positions.
 * The MMX loop first keeps even bytes (pand 0x00ff), then splits the
 * result into the two alternating channels with a second mask/shift pass.
 * NOTE(review): prologue and loop label elided by extraction.
 */
2257 static void RENAME(extract_even2)(
const uint8_t *src, uint8_t *dst0, uint8_t *dst1,
x86_reg count)
2266 "pcmpeqw %%mm7, %%mm7 \n\t"
2267 "psrlw $8, %%mm7 \n\t"
/* Keep byte 0 of each 16-bit word -> bytes at even offsets 0,2,4,... */
2269 "movq -28(%1, %0, 4), %%mm0 \n\t"
2270 "movq -20(%1, %0, 4), %%mm1 \n\t"
2271 "movq -12(%1, %0, 4), %%mm2 \n\t"
2272 "movq -4(%1, %0, 4), %%mm3 \n\t"
2273 "pand %%mm7, %%mm0 \n\t"
2274 "pand %%mm7, %%mm1 \n\t"
2275 "pand %%mm7, %%mm2 \n\t"
2276 "pand %%mm7, %%mm3 \n\t"
2277 "packuswb %%mm1, %%mm0 \n\t"
2278 "packuswb %%mm3, %%mm2 \n\t"
/* Second split: psrlw isolates the 4i+2 channel, pand the 4i+0 one. */
2279 "movq %%mm0, %%mm1 \n\t"
2280 "movq %%mm2, %%mm3 \n\t"
2281 "psrlw $8, %%mm0 \n\t"
2282 "psrlw $8, %%mm2 \n\t"
2283 "pand %%mm7, %%mm1 \n\t"
2284 "pand %%mm7, %%mm3 \n\t"
2285 "packuswb %%mm2, %%mm0 \n\t"
2286 "packuswb %%mm3, %%mm1 \n\t"
2287 MOVNTQ" %%mm0,- 7(%3, %0) \n\t"
2288 MOVNTQ" %%mm1,- 7(%2, %0) \n\t"
2292 :
"r"(src),
"r"(dst0),
"r"(dst1)
/* Scalar tail (loop structure elided). */
2297 dst0[count]= src[4*count+0];
2298 dst1[count]= src[4*count+2];
/*
 * Like extract_even2, but averages two source rows first:
 * dst0[i] = avg(src0, src1)[4*i+0], dst1[i] = avg(...)[4*i+2].
 * NOTE(review): the SIMD path uses PAVGB, which rounds up
 * ((a+b+1)>>1), while the scalar tail below truncates ((a+b)>>1) —
 * a one-LSB rounding mismatch between the vector and tail pixels.
 * NOTE(review): prologue and loop label elided by extraction.
 */
2304 static void RENAME(extract_even2avg)(
const uint8_t *src0,
const uint8_t *src1, uint8_t *dst0, uint8_t *dst1,
x86_reg count)
2315 "pcmpeqw %%mm7, %%mm7 \n\t"
2316 "psrlw $8, %%mm7 \n\t"
2318 "movq -28(%1, %0, 4), %%mm0 \n\t"
2319 "movq -20(%1, %0, 4), %%mm1 \n\t"
2320 "movq -12(%1, %0, 4), %%mm2 \n\t"
2321 "movq -4(%1, %0, 4), %%mm3 \n\t"
/* Byte-wise average with the second row. */
2322 PAVGB" -28(%2, %0, 4), %%mm0 \n\t"
2323 PAVGB" -20(%2, %0, 4), %%mm1 \n\t"
2324 PAVGB" -12(%2, %0, 4), %%mm2 \n\t"
2325 PAVGB" - 4(%2, %0, 4), %%mm3 \n\t"
2326 "pand %%mm7, %%mm0 \n\t"
2327 "pand %%mm7, %%mm1 \n\t"
2328 "pand %%mm7, %%mm2 \n\t"
2329 "pand %%mm7, %%mm3 \n\t"
2330 "packuswb %%mm1, %%mm0 \n\t"
2331 "packuswb %%mm3, %%mm2 \n\t"
/* Split the surviving even bytes into the two output channels. */
2332 "movq %%mm0, %%mm1 \n\t"
2333 "movq %%mm2, %%mm3 \n\t"
2334 "psrlw $8, %%mm0 \n\t"
2335 "psrlw $8, %%mm2 \n\t"
2336 "pand %%mm7, %%mm1 \n\t"
2337 "pand %%mm7, %%mm3 \n\t"
2338 "packuswb %%mm2, %%mm0 \n\t"
2339 "packuswb %%mm3, %%mm1 \n\t"
2340 MOVNTQ" %%mm0,- 7(%4, %0) \n\t"
2341 MOVNTQ" %%mm1,- 7(%3, %0) \n\t"
2345 :
"r"(src0),
"r"(src1),
"r"(dst0),
"r"(dst1)
/* Scalar tail: truncating average (see rounding note above). */
2351 dst0[count]= (src0[4*count+0]+src1[4*count+0])>>1;
2352 dst1[count]= (src0[4*count+2]+src1[4*count+2])>>1;
2357 #if !COMPILE_TEMPLATE_AMD3DNOW
/*
 * De-interleave the odd-positioned channels of a 4-byte-period stream:
 * the SIMD path keeps the high byte of each word (psrlw $8), i.e. bytes
 * 4i+1 / 4i+3 of the original stream.
 * NOTE(review): the scalar tail reads src[4*count+0/+2]; an elided
 * pointer adjustment (likely src++ in the prologue) must reconcile this
 * with the odd-byte SIMD extraction — confirm against the full source.
 */
2358 static void RENAME(extract_odd2)(
const uint8_t *src, uint8_t *dst0, uint8_t *dst1,
x86_reg count)
2367 "pcmpeqw %%mm7, %%mm7 \n\t"
2368 "psrlw $8, %%mm7 \n\t"
2370 "movq -28(%1, %0, 4), %%mm0 \n\t"
2371 "movq -20(%1, %0, 4), %%mm1 \n\t"
2372 "movq -12(%1, %0, 4), %%mm2 \n\t"
2373 "movq -4(%1, %0, 4), %%mm3 \n\t"
/* Keep odd bytes (high byte of each word). */
2374 "psrlw $8, %%mm0 \n\t"
2375 "psrlw $8, %%mm1 \n\t"
2376 "psrlw $8, %%mm2 \n\t"
2377 "psrlw $8, %%mm3 \n\t"
2378 "packuswb %%mm1, %%mm0 \n\t"
2379 "packuswb %%mm3, %%mm2 \n\t"
/* Split into the two alternating output channels. */
2380 "movq %%mm0, %%mm1 \n\t"
2381 "movq %%mm2, %%mm3 \n\t"
2382 "psrlw $8, %%mm0 \n\t"
2383 "psrlw $8, %%mm2 \n\t"
2384 "pand %%mm7, %%mm1 \n\t"
2385 "pand %%mm7, %%mm3 \n\t"
2386 "packuswb %%mm2, %%mm0 \n\t"
2387 "packuswb %%mm3, %%mm1 \n\t"
2388 MOVNTQ" %%mm0,- 7(%3, %0) \n\t"
2389 MOVNTQ" %%mm1,- 7(%2, %0) \n\t"
2393 :
"r"(src),
"r"(dst0),
"r"(dst1)
/* Scalar tail (relies on the elided src adjustment noted above). */
2399 dst0[count]= src[4*count+0];
2400 dst1[count]= src[4*count+2];
/*
 * Like extract_odd2, but averages two source rows (PAVGB) before
 * extracting the two odd-positioned channels.
 * NOTE(review): PAVGB rounds up while the scalar tail truncates — a
 * one-LSB rounding mismatch between vector and tail pixels; and as in
 * extract_odd2, an elided prologue pointer adjustment presumably makes
 * the scalar tail's +0/+2 offsets select odd stream bytes — confirm.
 */
2406 static void RENAME(extract_odd2avg)(
const uint8_t *src0,
const uint8_t *src1, uint8_t *dst0, uint8_t *dst1,
x86_reg count)
2417 "pcmpeqw %%mm7, %%mm7 \n\t"
2418 "psrlw $8, %%mm7 \n\t"
2420 "movq -28(%1, %0, 4), %%mm0 \n\t"
2421 "movq -20(%1, %0, 4), %%mm1 \n\t"
2422 "movq -12(%1, %0, 4), %%mm2 \n\t"
2423 "movq -4(%1, %0, 4), %%mm3 \n\t"
/* Byte-wise average with the second row. */
2424 PAVGB" -28(%2, %0, 4), %%mm0 \n\t"
2425 PAVGB" -20(%2, %0, 4), %%mm1 \n\t"
2426 PAVGB" -12(%2, %0, 4), %%mm2 \n\t"
2427 PAVGB" - 4(%2, %0, 4), %%mm3 \n\t"
/* Keep odd bytes (high byte of each word). */
2428 "psrlw $8, %%mm0 \n\t"
2429 "psrlw $8, %%mm1 \n\t"
2430 "psrlw $8, %%mm2 \n\t"
2431 "psrlw $8, %%mm3 \n\t"
2432 "packuswb %%mm1, %%mm0 \n\t"
2433 "packuswb %%mm3, %%mm2 \n\t"
/* Split into the two alternating output channels. */
2434 "movq %%mm0, %%mm1 \n\t"
2435 "movq %%mm2, %%mm3 \n\t"
2436 "psrlw $8, %%mm0 \n\t"
2437 "psrlw $8, %%mm2 \n\t"
2438 "pand %%mm7, %%mm1 \n\t"
2439 "pand %%mm7, %%mm3 \n\t"
2440 "packuswb %%mm2, %%mm0 \n\t"
2441 "packuswb %%mm3, %%mm1 \n\t"
2442 MOVNTQ" %%mm0,- 7(%4, %0) \n\t"
2443 MOVNTQ" %%mm1,- 7(%3, %0) \n\t"
2447 :
"r"(src0),
"r"(src1),
"r"(dst0),
"r"(dst1)
/* Scalar tail: truncating average (see rounding note above). */
2455 dst0[count]= (src0[4*count+0]+src1[4*count+0])>>1;
2456 dst1[count]= (src0[4*count+2]+src1[4*count+2])>>1;
/*
 * Packed 4:2:2 -> planar 4:2:0 wrapper: per output chroma row, averages
 * the current and previous source rows' chroma via extract_odd2avg.
 * chromWidth = ceil(width/2) — the -((-width)>>1) form rounds up for odd
 * widths.  NOTE(review): signature head and luma handling elided by
 * extraction.
 */
2463 int lumStride,
int chromStride,
int srcStride)
/* Ceiling division by 2 (handles odd width). */
2466 const int chromWidth= -((-
width)>>1);
2468 for (y=0; y<
height; y++) {
/* Average this row with the one above to downsample chroma vertically. */
2471 RENAME(extract_odd2avg)(src-srcStride, src, udst, vdst, chromWidth);
2486 #if !COMPILE_TEMPLATE_AMD3DNOW
/*
 * Packed 4:2:2 -> planar 4:2:2 wrapper: one chroma row per source row,
 * extracted with extract_odd2 (no vertical averaging).
 * NOTE(review): signature head and luma handling elided by extraction.
 */
2489 int lumStride,
int chromStride,
int srcStride)
/* Ceiling division by 2 (handles odd width). */
2492 const int chromWidth= -((-
width)>>1);
2494 for (y=0; y<
height; y++) {
2496 RENAME(extract_odd2)(src, udst, vdst, chromWidth);
/*
 * UYVY-style packed 4:2:2 -> planar 4:2:0 wrapper: chroma sits on even
 * byte positions, so extract_even2avg is used, averaging the current and
 * previous rows.  NOTE(review): signature head and luma handling elided
 * by extraction.
 */
2513 int lumStride,
int chromStride,
int srcStride)
/* Ceiling division by 2 (handles odd width). */
2516 const int chromWidth= -((-
width)>>1);
2518 for (y=0; y<
height; y++) {
/* Average this row with the one above to downsample chroma vertically. */
2521 RENAME(extract_even2avg)(src-srcStride, src, udst, vdst, chromWidth);
2536 #if !COMPILE_TEMPLATE_AMD3DNOW
/*
 * UYVY-style packed 4:2:2 -> planar 4:2:2 wrapper: one chroma row per
 * source row via extract_even2 (chroma on even byte positions, no
 * vertical averaging).  NOTE(review): signature head and luma handling
 * elided by extraction.
 */
2539 int lumStride,
int chromStride,
int srcStride)
/* Ceiling division by 2 (handles odd width). */
2542 const int chromWidth= -((-
width)>>1);
2544 for (y=0; y<
height; y++) {
2546 RENAME(extract_even2)(src, udst, vdst, chromWidth);
2564 #if !COMPILE_TEMPLATE_SSE2
2565 #if !COMPILE_TEMPLATE_AMD3DNOW
2595 #if COMPILE_TEMPLATE_MMX2 || COMPILE_TEMPLATE_AMD3DNOW
2604 #if !COMPILE_TEMPLATE_AMD3DNOW