rgb2rgb_template.c
Go to the documentation of this file.
1 /*
2  * software RGB to RGB converter
3  * pluralize by software PAL8 to RGB converter
4  * software YUV to YUV converter
5  * software YUV to RGB converter
6  * Written by Nick Kurshev.
7  * palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at)
8  * lot of big-endian byte order fixes by Alex Beregszaszi
9  *
10  * This file is part of Libav.
11  *
12  * Libav is free software; you can redistribute it and/or
13  * modify it under the terms of the GNU Lesser General Public
14  * License as published by the Free Software Foundation; either
15  * version 2.1 of the License, or (at your option) any later version.
16  *
17  * Libav is distributed in the hope that it will be useful,
18  * but WITHOUT ANY WARRANTY; without even the implied warranty of
19  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20  * Lesser General Public License for more details.
21  *
22  * You should have received a copy of the GNU Lesser General Public
23  * License along with Libav; if not, write to the Free Software
24  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
25  */
26 
27 #include <stddef.h>
28 
29 #undef PREFETCH
30 #undef MOVNTQ
31 #undef EMMS
32 #undef SFENCE
33 #undef PAVGB
34 
35 #if COMPILE_TEMPLATE_AMD3DNOW
36 #define PREFETCH "prefetch"
37 #define PAVGB "pavgusb"
38 #elif COMPILE_TEMPLATE_MMX2
39 #define PREFETCH "prefetchnta"
40 #define PAVGB "pavgb"
41 #else
42 #define PREFETCH " # nop"
43 #endif
44 
45 #if COMPILE_TEMPLATE_AMD3DNOW
46 /* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
47 #define EMMS "femms"
48 #else
49 #define EMMS "emms"
50 #endif
51 
52 #if COMPILE_TEMPLATE_MMX2
53 #define MOVNTQ "movntq"
54 #define SFENCE "sfence"
55 #else
56 #define MOVNTQ "movq"
57 #define SFENCE " # nop"
58 #endif
59 
60 #if !COMPILE_TEMPLATE_SSE2
61 
62 #if !COMPILE_TEMPLATE_AMD3DNOW
63 
64 static inline void RENAME(rgb24tobgr32)(const uint8_t *src, uint8_t *dst, int src_size)
65 {
66  uint8_t *dest = dst;
67  const uint8_t *s = src;
68  const uint8_t *end;
69  const uint8_t *mm_end;
70  end = s + src_size;
71  __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
72  mm_end = end - 23;
73  __asm__ volatile("movq %0, %%mm7"::"m"(mask32a):"memory");
74  while (s < mm_end) {
75  __asm__ volatile(
76  PREFETCH" 32%1 \n\t"
77  "movd %1, %%mm0 \n\t"
78  "punpckldq 3%1, %%mm0 \n\t"
79  "movd 6%1, %%mm1 \n\t"
80  "punpckldq 9%1, %%mm1 \n\t"
81  "movd 12%1, %%mm2 \n\t"
82  "punpckldq 15%1, %%mm2 \n\t"
83  "movd 18%1, %%mm3 \n\t"
84  "punpckldq 21%1, %%mm3 \n\t"
85  "por %%mm7, %%mm0 \n\t"
86  "por %%mm7, %%mm1 \n\t"
87  "por %%mm7, %%mm2 \n\t"
88  "por %%mm7, %%mm3 \n\t"
89  MOVNTQ" %%mm0, %0 \n\t"
90  MOVNTQ" %%mm1, 8%0 \n\t"
91  MOVNTQ" %%mm2, 16%0 \n\t"
92  MOVNTQ" %%mm3, 24%0"
93  :"=m"(*dest)
94  :"m"(*s)
95  :"memory");
96  dest += 32;
97  s += 24;
98  }
99  __asm__ volatile(SFENCE:::"memory");
100  __asm__ volatile(EMMS:::"memory");
101  while (s < end) {
102  *dest++ = *s++;
103  *dest++ = *s++;
104  *dest++ = *s++;
105  *dest++ = 255;
106  }
107 }
108 
109 #define STORE_BGR24_MMX \
110  "psrlq $8, %%mm2 \n\t" \
111  "psrlq $8, %%mm3 \n\t" \
112  "psrlq $8, %%mm6 \n\t" \
113  "psrlq $8, %%mm7 \n\t" \
114  "pand "MANGLE(mask24l)", %%mm0\n\t" \
115  "pand "MANGLE(mask24l)", %%mm1\n\t" \
116  "pand "MANGLE(mask24l)", %%mm4\n\t" \
117  "pand "MANGLE(mask24l)", %%mm5\n\t" \
118  "pand "MANGLE(mask24h)", %%mm2\n\t" \
119  "pand "MANGLE(mask24h)", %%mm3\n\t" \
120  "pand "MANGLE(mask24h)", %%mm6\n\t" \
121  "pand "MANGLE(mask24h)", %%mm7\n\t" \
122  "por %%mm2, %%mm0 \n\t" \
123  "por %%mm3, %%mm1 \n\t" \
124  "por %%mm6, %%mm4 \n\t" \
125  "por %%mm7, %%mm5 \n\t" \
126  \
127  "movq %%mm1, %%mm2 \n\t" \
128  "movq %%mm4, %%mm3 \n\t" \
129  "psllq $48, %%mm2 \n\t" \
130  "psllq $32, %%mm3 \n\t" \
131  "pand "MANGLE(mask24hh)", %%mm2\n\t" \
132  "pand "MANGLE(mask24hhh)", %%mm3\n\t" \
133  "por %%mm2, %%mm0 \n\t" \
134  "psrlq $16, %%mm1 \n\t" \
135  "psrlq $32, %%mm4 \n\t" \
136  "psllq $16, %%mm5 \n\t" \
137  "por %%mm3, %%mm1 \n\t" \
138  "pand "MANGLE(mask24hhhh)", %%mm5\n\t" \
139  "por %%mm5, %%mm4 \n\t" \
140  \
141  MOVNTQ" %%mm0, %0 \n\t" \
142  MOVNTQ" %%mm1, 8%0 \n\t" \
143  MOVNTQ" %%mm4, 16%0"
144 
145 
146 static inline void RENAME(rgb32tobgr24)(const uint8_t *src, uint8_t *dst, int src_size)
147 {
148  uint8_t *dest = dst;
149  const uint8_t *s = src;
150  const uint8_t *end;
151  const uint8_t *mm_end;
152  end = s + src_size;
153  __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
154  mm_end = end - 31;
155  while (s < mm_end) {
156  __asm__ volatile(
157  PREFETCH" 32%1 \n\t"
158  "movq %1, %%mm0 \n\t"
159  "movq 8%1, %%mm1 \n\t"
160  "movq 16%1, %%mm4 \n\t"
161  "movq 24%1, %%mm5 \n\t"
162  "movq %%mm0, %%mm2 \n\t"
163  "movq %%mm1, %%mm3 \n\t"
164  "movq %%mm4, %%mm6 \n\t"
165  "movq %%mm5, %%mm7 \n\t"
167  :"=m"(*dest)
168  :"m"(*s)
169  :"memory");
170  dest += 24;
171  s += 32;
172  }
173  __asm__ volatile(SFENCE:::"memory");
174  __asm__ volatile(EMMS:::"memory");
175  while (s < end) {
176  *dest++ = *s++;
177  *dest++ = *s++;
178  *dest++ = *s++;
179  s++;
180  }
181 }
182 
183 /*
184  original by Strepto/Astral
185  ported to gcc & bugfixed: A'rpi
186  MMX2, 3DNOW optimization by Nick Kurshev
187  32-bit C version, and and&add trick by Michael Niedermayer
188 */
189 static inline void RENAME(rgb15to16)(const uint8_t *src, uint8_t *dst, int src_size)
190 {
191  register const uint8_t* s=src;
192  register uint8_t* d=dst;
193  register const uint8_t *end;
194  const uint8_t *mm_end;
195  end = s + src_size;
196  __asm__ volatile(PREFETCH" %0"::"m"(*s));
197  __asm__ volatile("movq %0, %%mm4"::"m"(mask15s));
198  mm_end = end - 15;
199  while (s<mm_end) {
200  __asm__ volatile(
201  PREFETCH" 32%1 \n\t"
202  "movq %1, %%mm0 \n\t"
203  "movq 8%1, %%mm2 \n\t"
204  "movq %%mm0, %%mm1 \n\t"
205  "movq %%mm2, %%mm3 \n\t"
206  "pand %%mm4, %%mm0 \n\t"
207  "pand %%mm4, %%mm2 \n\t"
208  "paddw %%mm1, %%mm0 \n\t"
209  "paddw %%mm3, %%mm2 \n\t"
210  MOVNTQ" %%mm0, %0 \n\t"
211  MOVNTQ" %%mm2, 8%0"
212  :"=m"(*d)
213  :"m"(*s)
214  );
215  d+=16;
216  s+=16;
217  }
218  __asm__ volatile(SFENCE:::"memory");
219  __asm__ volatile(EMMS:::"memory");
220  mm_end = end - 3;
221  while (s < mm_end) {
222  register unsigned x= *((const uint32_t *)s);
223  *((uint32_t *)d) = (x&0x7FFF7FFF) + (x&0x7FE07FE0);
224  d+=4;
225  s+=4;
226  }
227  if (s < end) {
228  register unsigned short x= *((const uint16_t *)s);
229  *((uint16_t *)d) = (x&0x7FFF) + (x&0x7FE0);
230  }
231 }
232 
233 static inline void RENAME(rgb16to15)(const uint8_t *src, uint8_t *dst, int src_size)
234 {
235  register const uint8_t* s=src;
236  register uint8_t* d=dst;
237  register const uint8_t *end;
238  const uint8_t *mm_end;
239  end = s + src_size;
240  __asm__ volatile(PREFETCH" %0"::"m"(*s));
241  __asm__ volatile("movq %0, %%mm7"::"m"(mask15rg));
242  __asm__ volatile("movq %0, %%mm6"::"m"(mask15b));
243  mm_end = end - 15;
244  while (s<mm_end) {
245  __asm__ volatile(
246  PREFETCH" 32%1 \n\t"
247  "movq %1, %%mm0 \n\t"
248  "movq 8%1, %%mm2 \n\t"
249  "movq %%mm0, %%mm1 \n\t"
250  "movq %%mm2, %%mm3 \n\t"
251  "psrlq $1, %%mm0 \n\t"
252  "psrlq $1, %%mm2 \n\t"
253  "pand %%mm7, %%mm0 \n\t"
254  "pand %%mm7, %%mm2 \n\t"
255  "pand %%mm6, %%mm1 \n\t"
256  "pand %%mm6, %%mm3 \n\t"
257  "por %%mm1, %%mm0 \n\t"
258  "por %%mm3, %%mm2 \n\t"
259  MOVNTQ" %%mm0, %0 \n\t"
260  MOVNTQ" %%mm2, 8%0"
261  :"=m"(*d)
262  :"m"(*s)
263  );
264  d+=16;
265  s+=16;
266  }
267  __asm__ volatile(SFENCE:::"memory");
268  __asm__ volatile(EMMS:::"memory");
269  mm_end = end - 3;
270  while (s < mm_end) {
271  register uint32_t x= *((const uint32_t*)s);
272  *((uint32_t *)d) = ((x>>1)&0x7FE07FE0) | (x&0x001F001F);
273  s+=4;
274  d+=4;
275  }
276  if (s < end) {
277  register uint16_t x= *((const uint16_t*)s);
278  *((uint16_t *)d) = ((x>>1)&0x7FE0) | (x&0x001F);
279  }
280 }
281 
282 static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, int src_size)
283 {
284  const uint8_t *s = src;
285  const uint8_t *end;
286  const uint8_t *mm_end;
287  uint16_t *d = (uint16_t *)dst;
288  end = s + src_size;
289  mm_end = end - 15;
290 #if 1 //is faster only if multiplies are reasonably fast (FIXME figure out on which CPUs this is faster, on Athlon it is slightly faster)
291  __asm__ volatile(
292  "movq %3, %%mm5 \n\t"
293  "movq %4, %%mm6 \n\t"
294  "movq %5, %%mm7 \n\t"
295  "jmp 2f \n\t"
296  ".p2align 4 \n\t"
297  "1: \n\t"
298  PREFETCH" 32(%1) \n\t"
299  "movd (%1), %%mm0 \n\t"
300  "movd 4(%1), %%mm3 \n\t"
301  "punpckldq 8(%1), %%mm0 \n\t"
302  "punpckldq 12(%1), %%mm3 \n\t"
303  "movq %%mm0, %%mm1 \n\t"
304  "movq %%mm3, %%mm4 \n\t"
305  "pand %%mm6, %%mm0 \n\t"
306  "pand %%mm6, %%mm3 \n\t"
307  "pmaddwd %%mm7, %%mm0 \n\t"
308  "pmaddwd %%mm7, %%mm3 \n\t"
309  "pand %%mm5, %%mm1 \n\t"
310  "pand %%mm5, %%mm4 \n\t"
311  "por %%mm1, %%mm0 \n\t"
312  "por %%mm4, %%mm3 \n\t"
313  "psrld $5, %%mm0 \n\t"
314  "pslld $11, %%mm3 \n\t"
315  "por %%mm3, %%mm0 \n\t"
316  MOVNTQ" %%mm0, (%0) \n\t"
317  "add $16, %1 \n\t"
318  "add $8, %0 \n\t"
319  "2: \n\t"
320  "cmp %2, %1 \n\t"
321  " jb 1b \n\t"
322  : "+r" (d), "+r"(s)
323  : "r" (mm_end), "m" (mask3216g), "m" (mask3216br), "m" (mul3216)
324  );
325 #else
326  __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
327  __asm__ volatile(
328  "movq %0, %%mm7 \n\t"
329  "movq %1, %%mm6 \n\t"
330  ::"m"(red_16mask),"m"(green_16mask));
331  while (s < mm_end) {
332  __asm__ volatile(
333  PREFETCH" 32%1 \n\t"
334  "movd %1, %%mm0 \n\t"
335  "movd 4%1, %%mm3 \n\t"
336  "punpckldq 8%1, %%mm0 \n\t"
337  "punpckldq 12%1, %%mm3 \n\t"
338  "movq %%mm0, %%mm1 \n\t"
339  "movq %%mm0, %%mm2 \n\t"
340  "movq %%mm3, %%mm4 \n\t"
341  "movq %%mm3, %%mm5 \n\t"
342  "psrlq $3, %%mm0 \n\t"
343  "psrlq $3, %%mm3 \n\t"
344  "pand %2, %%mm0 \n\t"
345  "pand %2, %%mm3 \n\t"
346  "psrlq $5, %%mm1 \n\t"
347  "psrlq $5, %%mm4 \n\t"
348  "pand %%mm6, %%mm1 \n\t"
349  "pand %%mm6, %%mm4 \n\t"
350  "psrlq $8, %%mm2 \n\t"
351  "psrlq $8, %%mm5 \n\t"
352  "pand %%mm7, %%mm2 \n\t"
353  "pand %%mm7, %%mm5 \n\t"
354  "por %%mm1, %%mm0 \n\t"
355  "por %%mm4, %%mm3 \n\t"
356  "por %%mm2, %%mm0 \n\t"
357  "por %%mm5, %%mm3 \n\t"
358  "psllq $16, %%mm3 \n\t"
359  "por %%mm3, %%mm0 \n\t"
360  MOVNTQ" %%mm0, %0 \n\t"
361  :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
362  d += 4;
363  s += 16;
364  }
365 #endif
366  __asm__ volatile(SFENCE:::"memory");
367  __asm__ volatile(EMMS:::"memory");
368  while (s < end) {
369  register int rgb = *(const uint32_t*)s; s += 4;
370  *d++ = ((rgb&0xFF)>>3) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>8);
371  }
372 }
373 
374 static inline void RENAME(rgb32tobgr16)(const uint8_t *src, uint8_t *dst, int src_size)
375 {
376  const uint8_t *s = src;
377  const uint8_t *end;
378  const uint8_t *mm_end;
379  uint16_t *d = (uint16_t *)dst;
380  end = s + src_size;
381  __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
382  __asm__ volatile(
383  "movq %0, %%mm7 \n\t"
384  "movq %1, %%mm6 \n\t"
385  ::"m"(red_16mask),"m"(green_16mask));
386  mm_end = end - 15;
387  while (s < mm_end) {
388  __asm__ volatile(
389  PREFETCH" 32%1 \n\t"
390  "movd %1, %%mm0 \n\t"
391  "movd 4%1, %%mm3 \n\t"
392  "punpckldq 8%1, %%mm0 \n\t"
393  "punpckldq 12%1, %%mm3 \n\t"
394  "movq %%mm0, %%mm1 \n\t"
395  "movq %%mm0, %%mm2 \n\t"
396  "movq %%mm3, %%mm4 \n\t"
397  "movq %%mm3, %%mm5 \n\t"
398  "psllq $8, %%mm0 \n\t"
399  "psllq $8, %%mm3 \n\t"
400  "pand %%mm7, %%mm0 \n\t"
401  "pand %%mm7, %%mm3 \n\t"
402  "psrlq $5, %%mm1 \n\t"
403  "psrlq $5, %%mm4 \n\t"
404  "pand %%mm6, %%mm1 \n\t"
405  "pand %%mm6, %%mm4 \n\t"
406  "psrlq $19, %%mm2 \n\t"
407  "psrlq $19, %%mm5 \n\t"
408  "pand %2, %%mm2 \n\t"
409  "pand %2, %%mm5 \n\t"
410  "por %%mm1, %%mm0 \n\t"
411  "por %%mm4, %%mm3 \n\t"
412  "por %%mm2, %%mm0 \n\t"
413  "por %%mm5, %%mm3 \n\t"
414  "psllq $16, %%mm3 \n\t"
415  "por %%mm3, %%mm0 \n\t"
416  MOVNTQ" %%mm0, %0 \n\t"
417  :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
418  d += 4;
419  s += 16;
420  }
421  __asm__ volatile(SFENCE:::"memory");
422  __asm__ volatile(EMMS:::"memory");
423  while (s < end) {
424  register int rgb = *(const uint32_t*)s; s += 4;
425  *d++ = ((rgb&0xF8)<<8) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>19);
426  }
427 }
428 
429 static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, int src_size)
430 {
431  const uint8_t *s = src;
432  const uint8_t *end;
433  const uint8_t *mm_end;
434  uint16_t *d = (uint16_t *)dst;
435  end = s + src_size;
436  mm_end = end - 15;
437 #if 1 //is faster only if multiplies are reasonably fast (FIXME figure out on which CPUs this is faster, on Athlon it is slightly faster)
438  __asm__ volatile(
439  "movq %3, %%mm5 \n\t"
440  "movq %4, %%mm6 \n\t"
441  "movq %5, %%mm7 \n\t"
442  "jmp 2f \n\t"
443  ".p2align 4 \n\t"
444  "1: \n\t"
445  PREFETCH" 32(%1) \n\t"
446  "movd (%1), %%mm0 \n\t"
447  "movd 4(%1), %%mm3 \n\t"
448  "punpckldq 8(%1), %%mm0 \n\t"
449  "punpckldq 12(%1), %%mm3 \n\t"
450  "movq %%mm0, %%mm1 \n\t"
451  "movq %%mm3, %%mm4 \n\t"
452  "pand %%mm6, %%mm0 \n\t"
453  "pand %%mm6, %%mm3 \n\t"
454  "pmaddwd %%mm7, %%mm0 \n\t"
455  "pmaddwd %%mm7, %%mm3 \n\t"
456  "pand %%mm5, %%mm1 \n\t"
457  "pand %%mm5, %%mm4 \n\t"
458  "por %%mm1, %%mm0 \n\t"
459  "por %%mm4, %%mm3 \n\t"
460  "psrld $6, %%mm0 \n\t"
461  "pslld $10, %%mm3 \n\t"
462  "por %%mm3, %%mm0 \n\t"
463  MOVNTQ" %%mm0, (%0) \n\t"
464  "add $16, %1 \n\t"
465  "add $8, %0 \n\t"
466  "2: \n\t"
467  "cmp %2, %1 \n\t"
468  " jb 1b \n\t"
469  : "+r" (d), "+r"(s)
470  : "r" (mm_end), "m" (mask3215g), "m" (mask3216br), "m" (mul3215)
471  );
472 #else
473  __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
474  __asm__ volatile(
475  "movq %0, %%mm7 \n\t"
476  "movq %1, %%mm6 \n\t"
477  ::"m"(red_15mask),"m"(green_15mask));
478  while (s < mm_end) {
479  __asm__ volatile(
480  PREFETCH" 32%1 \n\t"
481  "movd %1, %%mm0 \n\t"
482  "movd 4%1, %%mm3 \n\t"
483  "punpckldq 8%1, %%mm0 \n\t"
484  "punpckldq 12%1, %%mm3 \n\t"
485  "movq %%mm0, %%mm1 \n\t"
486  "movq %%mm0, %%mm2 \n\t"
487  "movq %%mm3, %%mm4 \n\t"
488  "movq %%mm3, %%mm5 \n\t"
489  "psrlq $3, %%mm0 \n\t"
490  "psrlq $3, %%mm3 \n\t"
491  "pand %2, %%mm0 \n\t"
492  "pand %2, %%mm3 \n\t"
493  "psrlq $6, %%mm1 \n\t"
494  "psrlq $6, %%mm4 \n\t"
495  "pand %%mm6, %%mm1 \n\t"
496  "pand %%mm6, %%mm4 \n\t"
497  "psrlq $9, %%mm2 \n\t"
498  "psrlq $9, %%mm5 \n\t"
499  "pand %%mm7, %%mm2 \n\t"
500  "pand %%mm7, %%mm5 \n\t"
501  "por %%mm1, %%mm0 \n\t"
502  "por %%mm4, %%mm3 \n\t"
503  "por %%mm2, %%mm0 \n\t"
504  "por %%mm5, %%mm3 \n\t"
505  "psllq $16, %%mm3 \n\t"
506  "por %%mm3, %%mm0 \n\t"
507  MOVNTQ" %%mm0, %0 \n\t"
508  :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
509  d += 4;
510  s += 16;
511  }
512 #endif
513  __asm__ volatile(SFENCE:::"memory");
514  __asm__ volatile(EMMS:::"memory");
515  while (s < end) {
516  register int rgb = *(const uint32_t*)s; s += 4;
517  *d++ = ((rgb&0xFF)>>3) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>9);
518  }
519 }
520 
521 static inline void RENAME(rgb32tobgr15)(const uint8_t *src, uint8_t *dst, int src_size)
522 {
523  const uint8_t *s = src;
524  const uint8_t *end;
525  const uint8_t *mm_end;
526  uint16_t *d = (uint16_t *)dst;
527  end = s + src_size;
528  __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
529  __asm__ volatile(
530  "movq %0, %%mm7 \n\t"
531  "movq %1, %%mm6 \n\t"
532  ::"m"(red_15mask),"m"(green_15mask));
533  mm_end = end - 15;
534  while (s < mm_end) {
535  __asm__ volatile(
536  PREFETCH" 32%1 \n\t"
537  "movd %1, %%mm0 \n\t"
538  "movd 4%1, %%mm3 \n\t"
539  "punpckldq 8%1, %%mm0 \n\t"
540  "punpckldq 12%1, %%mm3 \n\t"
541  "movq %%mm0, %%mm1 \n\t"
542  "movq %%mm0, %%mm2 \n\t"
543  "movq %%mm3, %%mm4 \n\t"
544  "movq %%mm3, %%mm5 \n\t"
545  "psllq $7, %%mm0 \n\t"
546  "psllq $7, %%mm3 \n\t"
547  "pand %%mm7, %%mm0 \n\t"
548  "pand %%mm7, %%mm3 \n\t"
549  "psrlq $6, %%mm1 \n\t"
550  "psrlq $6, %%mm4 \n\t"
551  "pand %%mm6, %%mm1 \n\t"
552  "pand %%mm6, %%mm4 \n\t"
553  "psrlq $19, %%mm2 \n\t"
554  "psrlq $19, %%mm5 \n\t"
555  "pand %2, %%mm2 \n\t"
556  "pand %2, %%mm5 \n\t"
557  "por %%mm1, %%mm0 \n\t"
558  "por %%mm4, %%mm3 \n\t"
559  "por %%mm2, %%mm0 \n\t"
560  "por %%mm5, %%mm3 \n\t"
561  "psllq $16, %%mm3 \n\t"
562  "por %%mm3, %%mm0 \n\t"
563  MOVNTQ" %%mm0, %0 \n\t"
564  :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
565  d += 4;
566  s += 16;
567  }
568  __asm__ volatile(SFENCE:::"memory");
569  __asm__ volatile(EMMS:::"memory");
570  while (s < end) {
571  register int rgb = *(const uint32_t*)s; s += 4;
572  *d++ = ((rgb&0xF8)<<7) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>19);
573  }
574 }
575 
576 static inline void RENAME(rgb24tobgr16)(const uint8_t *src, uint8_t *dst, int src_size)
577 {
578  const uint8_t *s = src;
579  const uint8_t *end;
580  const uint8_t *mm_end;
581  uint16_t *d = (uint16_t *)dst;
582  end = s + src_size;
583  __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
584  __asm__ volatile(
585  "movq %0, %%mm7 \n\t"
586  "movq %1, %%mm6 \n\t"
587  ::"m"(red_16mask),"m"(green_16mask));
588  mm_end = end - 11;
589  while (s < mm_end) {
590  __asm__ volatile(
591  PREFETCH" 32%1 \n\t"
592  "movd %1, %%mm0 \n\t"
593  "movd 3%1, %%mm3 \n\t"
594  "punpckldq 6%1, %%mm0 \n\t"
595  "punpckldq 9%1, %%mm3 \n\t"
596  "movq %%mm0, %%mm1 \n\t"
597  "movq %%mm0, %%mm2 \n\t"
598  "movq %%mm3, %%mm4 \n\t"
599  "movq %%mm3, %%mm5 \n\t"
600  "psrlq $3, %%mm0 \n\t"
601  "psrlq $3, %%mm3 \n\t"
602  "pand %2, %%mm0 \n\t"
603  "pand %2, %%mm3 \n\t"
604  "psrlq $5, %%mm1 \n\t"
605  "psrlq $5, %%mm4 \n\t"
606  "pand %%mm6, %%mm1 \n\t"
607  "pand %%mm6, %%mm4 \n\t"
608  "psrlq $8, %%mm2 \n\t"
609  "psrlq $8, %%mm5 \n\t"
610  "pand %%mm7, %%mm2 \n\t"
611  "pand %%mm7, %%mm5 \n\t"
612  "por %%mm1, %%mm0 \n\t"
613  "por %%mm4, %%mm3 \n\t"
614  "por %%mm2, %%mm0 \n\t"
615  "por %%mm5, %%mm3 \n\t"
616  "psllq $16, %%mm3 \n\t"
617  "por %%mm3, %%mm0 \n\t"
618  MOVNTQ" %%mm0, %0 \n\t"
619  :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
620  d += 4;
621  s += 12;
622  }
623  __asm__ volatile(SFENCE:::"memory");
624  __asm__ volatile(EMMS:::"memory");
625  while (s < end) {
626  const int b = *s++;
627  const int g = *s++;
628  const int r = *s++;
629  *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
630  }
631 }
632 
633 static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, int src_size)
634 {
635  const uint8_t *s = src;
636  const uint8_t *end;
637  const uint8_t *mm_end;
638  uint16_t *d = (uint16_t *)dst;
639  end = s + src_size;
640  __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
641  __asm__ volatile(
642  "movq %0, %%mm7 \n\t"
643  "movq %1, %%mm6 \n\t"
644  ::"m"(red_16mask),"m"(green_16mask));
645  mm_end = end - 15;
646  while (s < mm_end) {
647  __asm__ volatile(
648  PREFETCH" 32%1 \n\t"
649  "movd %1, %%mm0 \n\t"
650  "movd 3%1, %%mm3 \n\t"
651  "punpckldq 6%1, %%mm0 \n\t"
652  "punpckldq 9%1, %%mm3 \n\t"
653  "movq %%mm0, %%mm1 \n\t"
654  "movq %%mm0, %%mm2 \n\t"
655  "movq %%mm3, %%mm4 \n\t"
656  "movq %%mm3, %%mm5 \n\t"
657  "psllq $8, %%mm0 \n\t"
658  "psllq $8, %%mm3 \n\t"
659  "pand %%mm7, %%mm0 \n\t"
660  "pand %%mm7, %%mm3 \n\t"
661  "psrlq $5, %%mm1 \n\t"
662  "psrlq $5, %%mm4 \n\t"
663  "pand %%mm6, %%mm1 \n\t"
664  "pand %%mm6, %%mm4 \n\t"
665  "psrlq $19, %%mm2 \n\t"
666  "psrlq $19, %%mm5 \n\t"
667  "pand %2, %%mm2 \n\t"
668  "pand %2, %%mm5 \n\t"
669  "por %%mm1, %%mm0 \n\t"
670  "por %%mm4, %%mm3 \n\t"
671  "por %%mm2, %%mm0 \n\t"
672  "por %%mm5, %%mm3 \n\t"
673  "psllq $16, %%mm3 \n\t"
674  "por %%mm3, %%mm0 \n\t"
675  MOVNTQ" %%mm0, %0 \n\t"
676  :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
677  d += 4;
678  s += 12;
679  }
680  __asm__ volatile(SFENCE:::"memory");
681  __asm__ volatile(EMMS:::"memory");
682  while (s < end) {
683  const int r = *s++;
684  const int g = *s++;
685  const int b = *s++;
686  *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
687  }
688 }
689 
690 static inline void RENAME(rgb24tobgr15)(const uint8_t *src, uint8_t *dst, int src_size)
691 {
692  const uint8_t *s = src;
693  const uint8_t *end;
694  const uint8_t *mm_end;
695  uint16_t *d = (uint16_t *)dst;
696  end = s + src_size;
697  __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
698  __asm__ volatile(
699  "movq %0, %%mm7 \n\t"
700  "movq %1, %%mm6 \n\t"
701  ::"m"(red_15mask),"m"(green_15mask));
702  mm_end = end - 11;
703  while (s < mm_end) {
704  __asm__ volatile(
705  PREFETCH" 32%1 \n\t"
706  "movd %1, %%mm0 \n\t"
707  "movd 3%1, %%mm3 \n\t"
708  "punpckldq 6%1, %%mm0 \n\t"
709  "punpckldq 9%1, %%mm3 \n\t"
710  "movq %%mm0, %%mm1 \n\t"
711  "movq %%mm0, %%mm2 \n\t"
712  "movq %%mm3, %%mm4 \n\t"
713  "movq %%mm3, %%mm5 \n\t"
714  "psrlq $3, %%mm0 \n\t"
715  "psrlq $3, %%mm3 \n\t"
716  "pand %2, %%mm0 \n\t"
717  "pand %2, %%mm3 \n\t"
718  "psrlq $6, %%mm1 \n\t"
719  "psrlq $6, %%mm4 \n\t"
720  "pand %%mm6, %%mm1 \n\t"
721  "pand %%mm6, %%mm4 \n\t"
722  "psrlq $9, %%mm2 \n\t"
723  "psrlq $9, %%mm5 \n\t"
724  "pand %%mm7, %%mm2 \n\t"
725  "pand %%mm7, %%mm5 \n\t"
726  "por %%mm1, %%mm0 \n\t"
727  "por %%mm4, %%mm3 \n\t"
728  "por %%mm2, %%mm0 \n\t"
729  "por %%mm5, %%mm3 \n\t"
730  "psllq $16, %%mm3 \n\t"
731  "por %%mm3, %%mm0 \n\t"
732  MOVNTQ" %%mm0, %0 \n\t"
733  :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
734  d += 4;
735  s += 12;
736  }
737  __asm__ volatile(SFENCE:::"memory");
738  __asm__ volatile(EMMS:::"memory");
739  while (s < end) {
740  const int b = *s++;
741  const int g = *s++;
742  const int r = *s++;
743  *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
744  }
745 }
746 
747 static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, int src_size)
748 {
749  const uint8_t *s = src;
750  const uint8_t *end;
751  const uint8_t *mm_end;
752  uint16_t *d = (uint16_t *)dst;
753  end = s + src_size;
754  __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
755  __asm__ volatile(
756  "movq %0, %%mm7 \n\t"
757  "movq %1, %%mm6 \n\t"
758  ::"m"(red_15mask),"m"(green_15mask));
759  mm_end = end - 15;
760  while (s < mm_end) {
761  __asm__ volatile(
762  PREFETCH" 32%1 \n\t"
763  "movd %1, %%mm0 \n\t"
764  "movd 3%1, %%mm3 \n\t"
765  "punpckldq 6%1, %%mm0 \n\t"
766  "punpckldq 9%1, %%mm3 \n\t"
767  "movq %%mm0, %%mm1 \n\t"
768  "movq %%mm0, %%mm2 \n\t"
769  "movq %%mm3, %%mm4 \n\t"
770  "movq %%mm3, %%mm5 \n\t"
771  "psllq $7, %%mm0 \n\t"
772  "psllq $7, %%mm3 \n\t"
773  "pand %%mm7, %%mm0 \n\t"
774  "pand %%mm7, %%mm3 \n\t"
775  "psrlq $6, %%mm1 \n\t"
776  "psrlq $6, %%mm4 \n\t"
777  "pand %%mm6, %%mm1 \n\t"
778  "pand %%mm6, %%mm4 \n\t"
779  "psrlq $19, %%mm2 \n\t"
780  "psrlq $19, %%mm5 \n\t"
781  "pand %2, %%mm2 \n\t"
782  "pand %2, %%mm5 \n\t"
783  "por %%mm1, %%mm0 \n\t"
784  "por %%mm4, %%mm3 \n\t"
785  "por %%mm2, %%mm0 \n\t"
786  "por %%mm5, %%mm3 \n\t"
787  "psllq $16, %%mm3 \n\t"
788  "por %%mm3, %%mm0 \n\t"
789  MOVNTQ" %%mm0, %0 \n\t"
790  :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
791  d += 4;
792  s += 12;
793  }
794  __asm__ volatile(SFENCE:::"memory");
795  __asm__ volatile(EMMS:::"memory");
796  while (s < end) {
797  const int r = *s++;
798  const int g = *s++;
799  const int b = *s++;
800  *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
801  }
802 }
803 
804 /*
805  I use less accurate approximation here by simply left-shifting the input
806  value and filling the low order bits with zeroes. This method improves PNG
807  compression but this scheme cannot reproduce white exactly, since it does
808  not generate an all-ones maximum value; the net effect is to darken the
809  image slightly.
810 
811  The better method should be "left bit replication":
812 
813  4 3 2 1 0
814  ---------
815  1 1 0 1 1
816 
817  7 6 5 4 3 2 1 0
818  ----------------
819  1 1 0 1 1 1 1 0
820  |=======| |===|
821  | leftmost bits repeated to fill open bits
822  |
823  original bits
824 */
825 static inline void RENAME(rgb15tobgr24)(const uint8_t *src, uint8_t *dst, int src_size)
826 {
827  const uint16_t *end;
828  const uint16_t *mm_end;
829  uint8_t *d = dst;
830  const uint16_t *s = (const uint16_t*)src;
831  end = s + src_size/2;
832  __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
833  mm_end = end - 7;
834  while (s < mm_end) {
835  __asm__ volatile(
836  PREFETCH" 32%1 \n\t"
837  "movq %1, %%mm0 \n\t"
838  "movq %1, %%mm1 \n\t"
839  "movq %1, %%mm2 \n\t"
840  "pand %2, %%mm0 \n\t"
841  "pand %3, %%mm1 \n\t"
842  "pand %4, %%mm2 \n\t"
843  "psllq $3, %%mm0 \n\t"
844  "psrlq $2, %%mm1 \n\t"
845  "psrlq $7, %%mm2 \n\t"
846  "movq %%mm0, %%mm3 \n\t"
847  "movq %%mm1, %%mm4 \n\t"
848  "movq %%mm2, %%mm5 \n\t"
849  "punpcklwd %5, %%mm0 \n\t"
850  "punpcklwd %5, %%mm1 \n\t"
851  "punpcklwd %5, %%mm2 \n\t"
852  "punpckhwd %5, %%mm3 \n\t"
853  "punpckhwd %5, %%mm4 \n\t"
854  "punpckhwd %5, %%mm5 \n\t"
855  "psllq $8, %%mm1 \n\t"
856  "psllq $16, %%mm2 \n\t"
857  "por %%mm1, %%mm0 \n\t"
858  "por %%mm2, %%mm0 \n\t"
859  "psllq $8, %%mm4 \n\t"
860  "psllq $16, %%mm5 \n\t"
861  "por %%mm4, %%mm3 \n\t"
862  "por %%mm5, %%mm3 \n\t"
863 
864  "movq %%mm0, %%mm6 \n\t"
865  "movq %%mm3, %%mm7 \n\t"
866 
867  "movq 8%1, %%mm0 \n\t"
868  "movq 8%1, %%mm1 \n\t"
869  "movq 8%1, %%mm2 \n\t"
870  "pand %2, %%mm0 \n\t"
871  "pand %3, %%mm1 \n\t"
872  "pand %4, %%mm2 \n\t"
873  "psllq $3, %%mm0 \n\t"
874  "psrlq $2, %%mm1 \n\t"
875  "psrlq $7, %%mm2 \n\t"
876  "movq %%mm0, %%mm3 \n\t"
877  "movq %%mm1, %%mm4 \n\t"
878  "movq %%mm2, %%mm5 \n\t"
879  "punpcklwd %5, %%mm0 \n\t"
880  "punpcklwd %5, %%mm1 \n\t"
881  "punpcklwd %5, %%mm2 \n\t"
882  "punpckhwd %5, %%mm3 \n\t"
883  "punpckhwd %5, %%mm4 \n\t"
884  "punpckhwd %5, %%mm5 \n\t"
885  "psllq $8, %%mm1 \n\t"
886  "psllq $16, %%mm2 \n\t"
887  "por %%mm1, %%mm0 \n\t"
888  "por %%mm2, %%mm0 \n\t"
889  "psllq $8, %%mm4 \n\t"
890  "psllq $16, %%mm5 \n\t"
891  "por %%mm4, %%mm3 \n\t"
892  "por %%mm5, %%mm3 \n\t"
893 
894  :"=m"(*d)
895  :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null)
896  :"memory");
897  /* borrowed 32 to 24 */
898  __asm__ volatile(
899  "movq %%mm0, %%mm4 \n\t"
900  "movq %%mm3, %%mm5 \n\t"
901  "movq %%mm6, %%mm0 \n\t"
902  "movq %%mm7, %%mm1 \n\t"
903 
904  "movq %%mm4, %%mm6 \n\t"
905  "movq %%mm5, %%mm7 \n\t"
906  "movq %%mm0, %%mm2 \n\t"
907  "movq %%mm1, %%mm3 \n\t"
908 
910 
911  :"=m"(*d)
912  :"m"(*s)
913  :"memory");
914  d += 24;
915  s += 8;
916  }
917  __asm__ volatile(SFENCE:::"memory");
918  __asm__ volatile(EMMS:::"memory");
919  while (s < end) {
920  register uint16_t bgr;
921  bgr = *s++;
922  *d++ = (bgr&0x1F)<<3;
923  *d++ = (bgr&0x3E0)>>2;
924  *d++ = (bgr&0x7C00)>>7;
925  }
926 }
927 
928 static inline void RENAME(rgb16tobgr24)(const uint8_t *src, uint8_t *dst, int src_size)
929 {
930  const uint16_t *end;
931  const uint16_t *mm_end;
932  uint8_t *d = (uint8_t *)dst;
933  const uint16_t *s = (const uint16_t *)src;
934  end = s + src_size/2;
935  __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
936  mm_end = end - 7;
937  while (s < mm_end) {
938  __asm__ volatile(
939  PREFETCH" 32%1 \n\t"
940  "movq %1, %%mm0 \n\t"
941  "movq %1, %%mm1 \n\t"
942  "movq %1, %%mm2 \n\t"
943  "pand %2, %%mm0 \n\t"
944  "pand %3, %%mm1 \n\t"
945  "pand %4, %%mm2 \n\t"
946  "psllq $3, %%mm0 \n\t"
947  "psrlq $3, %%mm1 \n\t"
948  "psrlq $8, %%mm2 \n\t"
949  "movq %%mm0, %%mm3 \n\t"
950  "movq %%mm1, %%mm4 \n\t"
951  "movq %%mm2, %%mm5 \n\t"
952  "punpcklwd %5, %%mm0 \n\t"
953  "punpcklwd %5, %%mm1 \n\t"
954  "punpcklwd %5, %%mm2 \n\t"
955  "punpckhwd %5, %%mm3 \n\t"
956  "punpckhwd %5, %%mm4 \n\t"
957  "punpckhwd %5, %%mm5 \n\t"
958  "psllq $8, %%mm1 \n\t"
959  "psllq $16, %%mm2 \n\t"
960  "por %%mm1, %%mm0 \n\t"
961  "por %%mm2, %%mm0 \n\t"
962  "psllq $8, %%mm4 \n\t"
963  "psllq $16, %%mm5 \n\t"
964  "por %%mm4, %%mm3 \n\t"
965  "por %%mm5, %%mm3 \n\t"
966 
967  "movq %%mm0, %%mm6 \n\t"
968  "movq %%mm3, %%mm7 \n\t"
969 
970  "movq 8%1, %%mm0 \n\t"
971  "movq 8%1, %%mm1 \n\t"
972  "movq 8%1, %%mm2 \n\t"
973  "pand %2, %%mm0 \n\t"
974  "pand %3, %%mm1 \n\t"
975  "pand %4, %%mm2 \n\t"
976  "psllq $3, %%mm0 \n\t"
977  "psrlq $3, %%mm1 \n\t"
978  "psrlq $8, %%mm2 \n\t"
979  "movq %%mm0, %%mm3 \n\t"
980  "movq %%mm1, %%mm4 \n\t"
981  "movq %%mm2, %%mm5 \n\t"
982  "punpcklwd %5, %%mm0 \n\t"
983  "punpcklwd %5, %%mm1 \n\t"
984  "punpcklwd %5, %%mm2 \n\t"
985  "punpckhwd %5, %%mm3 \n\t"
986  "punpckhwd %5, %%mm4 \n\t"
987  "punpckhwd %5, %%mm5 \n\t"
988  "psllq $8, %%mm1 \n\t"
989  "psllq $16, %%mm2 \n\t"
990  "por %%mm1, %%mm0 \n\t"
991  "por %%mm2, %%mm0 \n\t"
992  "psllq $8, %%mm4 \n\t"
993  "psllq $16, %%mm5 \n\t"
994  "por %%mm4, %%mm3 \n\t"
995  "por %%mm5, %%mm3 \n\t"
996  :"=m"(*d)
997  :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null)
998  :"memory");
999  /* borrowed 32 to 24 */
1000  __asm__ volatile(
1001  "movq %%mm0, %%mm4 \n\t"
1002  "movq %%mm3, %%mm5 \n\t"
1003  "movq %%mm6, %%mm0 \n\t"
1004  "movq %%mm7, %%mm1 \n\t"
1005 
1006  "movq %%mm4, %%mm6 \n\t"
1007  "movq %%mm5, %%mm7 \n\t"
1008  "movq %%mm0, %%mm2 \n\t"
1009  "movq %%mm1, %%mm3 \n\t"
1010 
1012 
1013  :"=m"(*d)
1014  :"m"(*s)
1015  :"memory");
1016  d += 24;
1017  s += 8;
1018  }
1019  __asm__ volatile(SFENCE:::"memory");
1020  __asm__ volatile(EMMS:::"memory");
1021  while (s < end) {
1022  register uint16_t bgr;
1023  bgr = *s++;
1024  *d++ = (bgr&0x1F)<<3;
1025  *d++ = (bgr&0x7E0)>>3;
1026  *d++ = (bgr&0xF800)>>8;
1027  }
1028 }
1029 
1030 /*
1031  * mm0 = 00 B3 00 B2 00 B1 00 B0
1032  * mm1 = 00 G3 00 G2 00 G1 00 G0
1033  * mm2 = 00 R3 00 R2 00 R1 00 R0
1034  * mm6 = FF FF FF FF FF FF FF FF
1035  * mm7 = 00 00 00 00 00 00 00 00
1036  */
1037 #define PACK_RGB32 \
1038  "packuswb %%mm7, %%mm0 \n\t" /* 00 00 00 00 B3 B2 B1 B0 */ \
1039  "packuswb %%mm7, %%mm1 \n\t" /* 00 00 00 00 G3 G2 G1 G0 */ \
1040  "packuswb %%mm7, %%mm2 \n\t" /* 00 00 00 00 R3 R2 R1 R0 */ \
1041  "punpcklbw %%mm1, %%mm0 \n\t" /* G3 B3 G2 B2 G1 B1 G0 B0 */ \
1042  "punpcklbw %%mm6, %%mm2 \n\t" /* FF R3 FF R2 FF R1 FF R0 */ \
1043  "movq %%mm0, %%mm3 \n\t" \
1044  "punpcklwd %%mm2, %%mm0 \n\t" /* FF R1 G1 B1 FF R0 G0 B0 */ \
1045  "punpckhwd %%mm2, %%mm3 \n\t" /* FF R3 G3 B3 FF R2 G2 B2 */ \
1046  MOVNTQ" %%mm0, %0 \n\t" \
1047  MOVNTQ" %%mm3, 8%0 \n\t" \
1048 
1049 static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, int src_size)
1050 {
1051  const uint16_t *end;
1052  const uint16_t *mm_end;
1053  uint8_t *d = dst;
1054  const uint16_t *s = (const uint16_t *)src;
1055  end = s + src_size/2;
1056  __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
1057  __asm__ volatile("pxor %%mm7,%%mm7 \n\t":::"memory");
1058  __asm__ volatile("pcmpeqd %%mm6,%%mm6 \n\t":::"memory");
1059  mm_end = end - 3;
1060  while (s < mm_end) {
1061  __asm__ volatile(
1062  PREFETCH" 32%1 \n\t"
1063  "movq %1, %%mm0 \n\t"
1064  "movq %1, %%mm1 \n\t"
1065  "movq %1, %%mm2 \n\t"
1066  "pand %2, %%mm0 \n\t"
1067  "pand %3, %%mm1 \n\t"
1068  "pand %4, %%mm2 \n\t"
1069  "psllq $3, %%mm0 \n\t"
1070  "psrlq $2, %%mm1 \n\t"
1071  "psrlq $7, %%mm2 \n\t"
1072  PACK_RGB32
1073  :"=m"(*d)
1074  :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r)
1075  :"memory");
1076  d += 16;
1077  s += 4;
1078  }
1079  __asm__ volatile(SFENCE:::"memory");
1080  __asm__ volatile(EMMS:::"memory");
1081  while (s < end) {
1082  register uint16_t bgr;
1083  bgr = *s++;
1084  *d++ = (bgr&0x1F)<<3;
1085  *d++ = (bgr&0x3E0)>>2;
1086  *d++ = (bgr&0x7C00)>>7;
1087  *d++ = 255;
1088  }
1089 }
1090 
1091 static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, int src_size)
1092 {
1093  const uint16_t *end;
1094  const uint16_t *mm_end;
1095  uint8_t *d = dst;
1096  const uint16_t *s = (const uint16_t*)src;
1097  end = s + src_size/2;
1098  __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
1099  __asm__ volatile("pxor %%mm7,%%mm7 \n\t":::"memory");
1100  __asm__ volatile("pcmpeqd %%mm6,%%mm6 \n\t":::"memory");
1101  mm_end = end - 3;
1102  while (s < mm_end) {
1103  __asm__ volatile(
1104  PREFETCH" 32%1 \n\t"
1105  "movq %1, %%mm0 \n\t"
1106  "movq %1, %%mm1 \n\t"
1107  "movq %1, %%mm2 \n\t"
1108  "pand %2, %%mm0 \n\t"
1109  "pand %3, %%mm1 \n\t"
1110  "pand %4, %%mm2 \n\t"
1111  "psllq $3, %%mm0 \n\t"
1112  "psrlq $3, %%mm1 \n\t"
1113  "psrlq $8, %%mm2 \n\t"
1114  PACK_RGB32
1115  :"=m"(*d)
1116  :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r)
1117  :"memory");
1118  d += 16;
1119  s += 4;
1120  }
1121  __asm__ volatile(SFENCE:::"memory");
1122  __asm__ volatile(EMMS:::"memory");
1123  while (s < end) {
1124  register uint16_t bgr;
1125  bgr = *s++;
1126  *d++ = (bgr&0x1F)<<3;
1127  *d++ = (bgr&0x7E0)>>3;
1128  *d++ = (bgr&0xF800)>>8;
1129  *d++ = 255;
1130  }
1131 }
1132 
1133 static inline void RENAME(shuffle_bytes_2103)(const uint8_t *src, uint8_t *dst, int src_size)
1134 {
1135  x86_reg idx = 15 - src_size;
1136  const uint8_t *s = src-idx;
1137  uint8_t *d = dst-idx;
1138  __asm__ volatile(
1139  "test %0, %0 \n\t"
1140  "jns 2f \n\t"
1141  PREFETCH" (%1, %0) \n\t"
1142  "movq %3, %%mm7 \n\t"
1143  "pxor %4, %%mm7 \n\t"
1144  "movq %%mm7, %%mm6 \n\t"
1145  "pxor %5, %%mm7 \n\t"
1146  ".p2align 4 \n\t"
1147  "1: \n\t"
1148  PREFETCH" 32(%1, %0) \n\t"
1149  "movq (%1, %0), %%mm0 \n\t"
1150  "movq 8(%1, %0), %%mm1 \n\t"
1151 # if COMPILE_TEMPLATE_MMX2
1152  "pshufw $177, %%mm0, %%mm3 \n\t"
1153  "pshufw $177, %%mm1, %%mm5 \n\t"
1154  "pand %%mm7, %%mm0 \n\t"
1155  "pand %%mm6, %%mm3 \n\t"
1156  "pand %%mm7, %%mm1 \n\t"
1157  "pand %%mm6, %%mm5 \n\t"
1158  "por %%mm3, %%mm0 \n\t"
1159  "por %%mm5, %%mm1 \n\t"
1160 # else
1161  "movq %%mm0, %%mm2 \n\t"
1162  "movq %%mm1, %%mm4 \n\t"
1163  "pand %%mm7, %%mm0 \n\t"
1164  "pand %%mm6, %%mm2 \n\t"
1165  "pand %%mm7, %%mm1 \n\t"
1166  "pand %%mm6, %%mm4 \n\t"
1167  "movq %%mm2, %%mm3 \n\t"
1168  "movq %%mm4, %%mm5 \n\t"
1169  "pslld $16, %%mm2 \n\t"
1170  "psrld $16, %%mm3 \n\t"
1171  "pslld $16, %%mm4 \n\t"
1172  "psrld $16, %%mm5 \n\t"
1173  "por %%mm2, %%mm0 \n\t"
1174  "por %%mm4, %%mm1 \n\t"
1175  "por %%mm3, %%mm0 \n\t"
1176  "por %%mm5, %%mm1 \n\t"
1177 # endif
1178  MOVNTQ" %%mm0, (%2, %0) \n\t"
1179  MOVNTQ" %%mm1, 8(%2, %0) \n\t"
1180  "add $16, %0 \n\t"
1181  "js 1b \n\t"
1182  SFENCE" \n\t"
1183  EMMS" \n\t"
1184  "2: \n\t"
1185  : "+&r"(idx)
1186  : "r" (s), "r" (d), "m" (mask32b), "m" (mask32r), "m" (mmx_one)
1187  : "memory");
1188  for (; idx<15; idx+=4) {
1189  register int v = *(const uint32_t *)&s[idx], g = v & 0xff00ff00;
1190  v &= 0xff00ff;
1191  *(uint32_t *)&d[idx] = (v>>16) + g + (v<<16);
1192  }
1193 }
1194 
1195 static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, int src_size)
1196 {
1197  unsigned i;
1198  x86_reg mmx_size= 23 - src_size;
1199  __asm__ volatile (
1200  "test %%"REG_a", %%"REG_a" \n\t"
1201  "jns 2f \n\t"
1202  "movq "MANGLE(mask24r)", %%mm5 \n\t"
1203  "movq "MANGLE(mask24g)", %%mm6 \n\t"
1204  "movq "MANGLE(mask24b)", %%mm7 \n\t"
1205  ".p2align 4 \n\t"
1206  "1: \n\t"
1207  PREFETCH" 32(%1, %%"REG_a") \n\t"
1208  "movq (%1, %%"REG_a"), %%mm0 \n\t" // BGR BGR BG
1209  "movq (%1, %%"REG_a"), %%mm1 \n\t" // BGR BGR BG
1210  "movq 2(%1, %%"REG_a"), %%mm2 \n\t" // R BGR BGR B
1211  "psllq $16, %%mm0 \n\t" // 00 BGR BGR
1212  "pand %%mm5, %%mm0 \n\t"
1213  "pand %%mm6, %%mm1 \n\t"
1214  "pand %%mm7, %%mm2 \n\t"
1215  "por %%mm0, %%mm1 \n\t"
1216  "por %%mm2, %%mm1 \n\t"
1217  "movq 6(%1, %%"REG_a"), %%mm0 \n\t" // BGR BGR BG
1218  MOVNTQ" %%mm1, (%2, %%"REG_a") \n\t" // RGB RGB RG
1219  "movq 8(%1, %%"REG_a"), %%mm1 \n\t" // R BGR BGR B
1220  "movq 10(%1, %%"REG_a"), %%mm2 \n\t" // GR BGR BGR
1221  "pand %%mm7, %%mm0 \n\t"
1222  "pand %%mm5, %%mm1 \n\t"
1223  "pand %%mm6, %%mm2 \n\t"
1224  "por %%mm0, %%mm1 \n\t"
1225  "por %%mm2, %%mm1 \n\t"
1226  "movq 14(%1, %%"REG_a"), %%mm0 \n\t" // R BGR BGR B
1227  MOVNTQ" %%mm1, 8(%2, %%"REG_a") \n\t" // B RGB RGB R
1228  "movq 16(%1, %%"REG_a"), %%mm1 \n\t" // GR BGR BGR
1229  "movq 18(%1, %%"REG_a"), %%mm2 \n\t" // BGR BGR BG
1230  "pand %%mm6, %%mm0 \n\t"
1231  "pand %%mm7, %%mm1 \n\t"
1232  "pand %%mm5, %%mm2 \n\t"
1233  "por %%mm0, %%mm1 \n\t"
1234  "por %%mm2, %%mm1 \n\t"
1235  MOVNTQ" %%mm1, 16(%2, %%"REG_a") \n\t"
1236  "add $24, %%"REG_a" \n\t"
1237  " js 1b \n\t"
1238  "2: \n\t"
1239  : "+a" (mmx_size)
1240  : "r" (src-mmx_size), "r"(dst-mmx_size)
1241  );
1242 
1243  __asm__ volatile(SFENCE:::"memory");
1244  __asm__ volatile(EMMS:::"memory");
1245 
1246  if (mmx_size==23) return; //finished, was multiple of 8
1247 
1248  src+= src_size;
1249  dst+= src_size;
1250  src_size= 23-mmx_size;
1251  src-= src_size;
1252  dst-= src_size;
1253  for (i=0; i<src_size; i+=3) {
1254  register uint8_t x;
1255  x = src[i + 2];
1256  dst[i + 1] = src[i + 1];
1257  dst[i + 2] = src[i + 0];
1258  dst[i + 0] = x;
1259  }
1260 }
1261 
1262 static inline void RENAME(yuvPlanartoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1263  int width, int height,
1264  int lumStride, int chromStride, int dstStride, int vertLumPerChroma)
1265 {
1266  int y;
1267  const x86_reg chromWidth= width>>1;
1268  for (y=0; y<height; y++) {
1269  //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway)
1270  __asm__ volatile(
1271  "xor %%"REG_a", %%"REG_a" \n\t"
1272  ".p2align 4 \n\t"
1273  "1: \n\t"
1274  PREFETCH" 32(%1, %%"REG_a", 2) \n\t"
1275  PREFETCH" 32(%2, %%"REG_a") \n\t"
1276  PREFETCH" 32(%3, %%"REG_a") \n\t"
1277  "movq (%2, %%"REG_a"), %%mm0 \n\t" // U(0)
1278  "movq %%mm0, %%mm2 \n\t" // U(0)
1279  "movq (%3, %%"REG_a"), %%mm1 \n\t" // V(0)
1280  "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1281  "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)
1282 
1283  "movq (%1, %%"REG_a",2), %%mm3 \n\t" // Y(0)
1284  "movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8)
1285  "movq %%mm3, %%mm4 \n\t" // Y(0)
1286  "movq %%mm5, %%mm6 \n\t" // Y(8)
1287  "punpcklbw %%mm0, %%mm3 \n\t" // YUYV YUYV(0)
1288  "punpckhbw %%mm0, %%mm4 \n\t" // YUYV YUYV(4)
1289  "punpcklbw %%mm2, %%mm5 \n\t" // YUYV YUYV(8)
1290  "punpckhbw %%mm2, %%mm6 \n\t" // YUYV YUYV(12)
1291 
1292  MOVNTQ" %%mm3, (%0, %%"REG_a", 4) \n\t"
1293  MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4) \n\t"
1294  MOVNTQ" %%mm5, 16(%0, %%"REG_a", 4) \n\t"
1295  MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4) \n\t"
1296 
1297  "add $8, %%"REG_a" \n\t"
1298  "cmp %4, %%"REG_a" \n\t"
1299  " jb 1b \n\t"
1300  ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
1301  : "%"REG_a
1302  );
1303  if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1) {
1304  usrc += chromStride;
1305  vsrc += chromStride;
1306  }
1307  ysrc += lumStride;
1308  dst += dstStride;
1309  }
1310  __asm__(EMMS" \n\t"
1311  SFENCE" \n\t"
1312  :::"memory");
1313 }
1314 
1319 static inline void RENAME(yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1320  int width, int height,
1321  int lumStride, int chromStride, int dstStride)
1322 {
1323  //FIXME interpolate chroma
1324  RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
1325 }
1326 
1327 static inline void RENAME(yuvPlanartouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1328  int width, int height,
1329  int lumStride, int chromStride, int dstStride, int vertLumPerChroma)
1330 {
1331  int y;
1332  const x86_reg chromWidth= width>>1;
1333  for (y=0; y<height; y++) {
1334  //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway)
1335  __asm__ volatile(
1336  "xor %%"REG_a", %%"REG_a" \n\t"
1337  ".p2align 4 \n\t"
1338  "1: \n\t"
1339  PREFETCH" 32(%1, %%"REG_a", 2) \n\t"
1340  PREFETCH" 32(%2, %%"REG_a") \n\t"
1341  PREFETCH" 32(%3, %%"REG_a") \n\t"
1342  "movq (%2, %%"REG_a"), %%mm0 \n\t" // U(0)
1343  "movq %%mm0, %%mm2 \n\t" // U(0)
1344  "movq (%3, %%"REG_a"), %%mm1 \n\t" // V(0)
1345  "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1346  "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)
1347 
1348  "movq (%1, %%"REG_a",2), %%mm3 \n\t" // Y(0)
1349  "movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8)
1350  "movq %%mm0, %%mm4 \n\t" // Y(0)
1351  "movq %%mm2, %%mm6 \n\t" // Y(8)
1352  "punpcklbw %%mm3, %%mm0 \n\t" // YUYV YUYV(0)
1353  "punpckhbw %%mm3, %%mm4 \n\t" // YUYV YUYV(4)
1354  "punpcklbw %%mm5, %%mm2 \n\t" // YUYV YUYV(8)
1355  "punpckhbw %%mm5, %%mm6 \n\t" // YUYV YUYV(12)
1356 
1357  MOVNTQ" %%mm0, (%0, %%"REG_a", 4) \n\t"
1358  MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4) \n\t"
1359  MOVNTQ" %%mm2, 16(%0, %%"REG_a", 4) \n\t"
1360  MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4) \n\t"
1361 
1362  "add $8, %%"REG_a" \n\t"
1363  "cmp %4, %%"REG_a" \n\t"
1364  " jb 1b \n\t"
1365  ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
1366  : "%"REG_a
1367  );
1368  if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1) {
1369  usrc += chromStride;
1370  vsrc += chromStride;
1371  }
1372  ysrc += lumStride;
1373  dst += dstStride;
1374  }
1375  __asm__(EMMS" \n\t"
1376  SFENCE" \n\t"
1377  :::"memory");
1378 }
1379 
1384 static inline void RENAME(yv12touyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1385  int width, int height,
1386  int lumStride, int chromStride, int dstStride)
1387 {
1388  //FIXME interpolate chroma
1389  RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
1390 }
1391 
1395 static inline void RENAME(yuv422ptouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1396  int width, int height,
1397  int lumStride, int chromStride, int dstStride)
1398 {
1399  RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
1400 }
1401 
1405 static inline void RENAME(yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1406  int width, int height,
1407  int lumStride, int chromStride, int dstStride)
1408 {
1409  RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
1410 }
1411 
1416 static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1417  int width, int height,
1418  int lumStride, int chromStride, int srcStride)
1419 {
1420  int y;
1421  const x86_reg chromWidth= width>>1;
1422  for (y=0; y<height; y+=2) {
1423  __asm__ volatile(
1424  "xor %%"REG_a", %%"REG_a" \n\t"
1425  "pcmpeqw %%mm7, %%mm7 \n\t"
1426  "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
1427  ".p2align 4 \n\t"
1428  "1: \n\t"
1429  PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
1430  "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
1431  "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(4)
1432  "movq %%mm0, %%mm2 \n\t" // YUYV YUYV(0)
1433  "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(4)
1434  "psrlw $8, %%mm0 \n\t" // U0V0 U0V0(0)
1435  "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(4)
1436  "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
1437  "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
1438  "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1439  "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)
1440 
1441  MOVNTQ" %%mm2, (%1, %%"REG_a", 2) \n\t"
1442 
1443  "movq 16(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(8)
1444  "movq 24(%0, %%"REG_a", 4), %%mm2 \n\t" // YUYV YUYV(12)
1445  "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(8)
1446  "movq %%mm2, %%mm4 \n\t" // YUYV YUYV(12)
1447  "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(8)
1448  "psrlw $8, %%mm2 \n\t" // U0V0 U0V0(12)
1449  "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
1450  "pand %%mm7, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
1451  "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
1452  "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)
1453 
1454  MOVNTQ" %%mm3, 8(%1, %%"REG_a", 2) \n\t"
1455 
1456  "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
1457  "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
1458  "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
1459  "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
1460  "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
1461  "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
1462  "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
1463  "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)
1464 
1465  MOVNTQ" %%mm0, (%3, %%"REG_a") \n\t"
1466  MOVNTQ" %%mm2, (%2, %%"REG_a") \n\t"
1467 
1468  "add $8, %%"REG_a" \n\t"
1469  "cmp %4, %%"REG_a" \n\t"
1470  " jb 1b \n\t"
1471  ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
1472  : "memory", "%"REG_a
1473  );
1474 
1475  ydst += lumStride;
1476  src += srcStride;
1477 
1478  __asm__ volatile(
1479  "xor %%"REG_a", %%"REG_a" \n\t"
1480  ".p2align 4 \n\t"
1481  "1: \n\t"
1482  PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
1483  "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
1484  "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(4)
1485  "movq 16(%0, %%"REG_a", 4), %%mm2 \n\t" // YUYV YUYV(8)
1486  "movq 24(%0, %%"REG_a", 4), %%mm3 \n\t" // YUYV YUYV(12)
1487  "pand %%mm7, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
1488  "pand %%mm7, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
1489  "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
1490  "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
1491  "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
1492  "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)
1493 
1494  MOVNTQ" %%mm0, (%1, %%"REG_a", 2) \n\t"
1495  MOVNTQ" %%mm2, 8(%1, %%"REG_a", 2) \n\t"
1496 
1497  "add $8, %%"REG_a" \n\t"
1498  "cmp %4, %%"REG_a" \n\t"
1499  " jb 1b \n\t"
1500 
1501  ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
1502  : "memory", "%"REG_a
1503  );
1504  udst += chromStride;
1505  vdst += chromStride;
1506  ydst += lumStride;
1507  src += srcStride;
1508  }
1509  __asm__ volatile(EMMS" \n\t"
1510  SFENCE" \n\t"
1511  :::"memory");
1512 }
1513 #endif /* !COMPILE_TEMPLATE_AMD3DNOW */
1514 
1515 #if COMPILE_TEMPLATE_MMX2 || COMPILE_TEMPLATE_AMD3DNOW
1516 static inline void RENAME(planar2x)(const uint8_t *src, uint8_t *dst, int srcWidth, int srcHeight, int srcStride, int dstStride)
1517 {
1518  int x,y;
1519 
1520  dst[0]= src[0];
1521 
1522  // first line
1523  for (x=0; x<srcWidth-1; x++) {
1524  dst[2*x+1]= (3*src[x] + src[x+1])>>2;
1525  dst[2*x+2]= ( src[x] + 3*src[x+1])>>2;
1526  }
1527  dst[2*srcWidth-1]= src[srcWidth-1];
1528 
1529  dst+= dstStride;
1530 
1531  for (y=1; y<srcHeight; y++) {
1532  const x86_reg mmxSize= srcWidth&~15;
1533  __asm__ volatile(
1534  "mov %4, %%"REG_a" \n\t"
1535  "movq "MANGLE(mmx_ff)", %%mm0 \n\t"
1536  "movq (%0, %%"REG_a"), %%mm4 \n\t"
1537  "movq %%mm4, %%mm2 \n\t"
1538  "psllq $8, %%mm4 \n\t"
1539  "pand %%mm0, %%mm2 \n\t"
1540  "por %%mm2, %%mm4 \n\t"
1541  "movq (%1, %%"REG_a"), %%mm5 \n\t"
1542  "movq %%mm5, %%mm3 \n\t"
1543  "psllq $8, %%mm5 \n\t"
1544  "pand %%mm0, %%mm3 \n\t"
1545  "por %%mm3, %%mm5 \n\t"
1546  "1: \n\t"
1547  "movq (%0, %%"REG_a"), %%mm0 \n\t"
1548  "movq (%1, %%"REG_a"), %%mm1 \n\t"
1549  "movq 1(%0, %%"REG_a"), %%mm2 \n\t"
1550  "movq 1(%1, %%"REG_a"), %%mm3 \n\t"
1551  PAVGB" %%mm0, %%mm5 \n\t"
1552  PAVGB" %%mm0, %%mm3 \n\t"
1553  PAVGB" %%mm0, %%mm5 \n\t"
1554  PAVGB" %%mm0, %%mm3 \n\t"
1555  PAVGB" %%mm1, %%mm4 \n\t"
1556  PAVGB" %%mm1, %%mm2 \n\t"
1557  PAVGB" %%mm1, %%mm4 \n\t"
1558  PAVGB" %%mm1, %%mm2 \n\t"
1559  "movq %%mm5, %%mm7 \n\t"
1560  "movq %%mm4, %%mm6 \n\t"
1561  "punpcklbw %%mm3, %%mm5 \n\t"
1562  "punpckhbw %%mm3, %%mm7 \n\t"
1563  "punpcklbw %%mm2, %%mm4 \n\t"
1564  "punpckhbw %%mm2, %%mm6 \n\t"
1565  MOVNTQ" %%mm5, (%2, %%"REG_a", 2) \n\t"
1566  MOVNTQ" %%mm7, 8(%2, %%"REG_a", 2) \n\t"
1567  MOVNTQ" %%mm4, (%3, %%"REG_a", 2) \n\t"
1568  MOVNTQ" %%mm6, 8(%3, %%"REG_a", 2) \n\t"
1569  "add $8, %%"REG_a" \n\t"
1570  "movq -1(%0, %%"REG_a"), %%mm4 \n\t"
1571  "movq -1(%1, %%"REG_a"), %%mm5 \n\t"
1572  " js 1b \n\t"
1573  :: "r" (src + mmxSize ), "r" (src + srcStride + mmxSize ),
1574  "r" (dst + mmxSize*2), "r" (dst + dstStride + mmxSize*2),
1575  "g" (-mmxSize)
1576  : "%"REG_a
1577  );
1578 
1579  for (x=mmxSize-1; x<srcWidth-1; x++) {
1580  dst[2*x +1]= (3*src[x+0] + src[x+srcStride+1])>>2;
1581  dst[2*x+dstStride+2]= ( src[x+0] + 3*src[x+srcStride+1])>>2;
1582  dst[2*x+dstStride+1]= ( src[x+1] + 3*src[x+srcStride ])>>2;
1583  dst[2*x +2]= (3*src[x+1] + src[x+srcStride ])>>2;
1584  }
1585  dst[srcWidth*2 -1 ]= (3*src[srcWidth-1] + src[srcWidth-1 + srcStride])>>2;
1586  dst[srcWidth*2 -1 + dstStride]= ( src[srcWidth-1] + 3*src[srcWidth-1 + srcStride])>>2;
1587 
1588  dst+=dstStride*2;
1589  src+=srcStride;
1590  }
1591 
1592  // last line
1593  dst[0]= src[0];
1594 
1595  for (x=0; x<srcWidth-1; x++) {
1596  dst[2*x+1]= (3*src[x] + src[x+1])>>2;
1597  dst[2*x+2]= ( src[x] + 3*src[x+1])>>2;
1598  }
1599  dst[2*srcWidth-1]= src[srcWidth-1];
1600 
1601  __asm__ volatile(EMMS" \n\t"
1602  SFENCE" \n\t"
1603  :::"memory");
1604 }
1605 #endif /* COMPILE_TEMPLATE_MMX2 || COMPILE_TEMPLATE_AMD3DNOW */
1606 
1607 #if !COMPILE_TEMPLATE_AMD3DNOW
1608 
1614 static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1615  int width, int height,
1616  int lumStride, int chromStride, int srcStride)
1617 {
1618  int y;
1619  const x86_reg chromWidth= width>>1;
1620  for (y=0; y<height; y+=2) {
1621  __asm__ volatile(
1622  "xor %%"REG_a", %%"REG_a" \n\t"
1623  "pcmpeqw %%mm7, %%mm7 \n\t"
1624  "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
1625  ".p2align 4 \n\t"
1626  "1: \n\t"
1627  PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
1628  "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // UYVY UYVY(0)
1629  "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // UYVY UYVY(4)
1630  "movq %%mm0, %%mm2 \n\t" // UYVY UYVY(0)
1631  "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(4)
1632  "pand %%mm7, %%mm0 \n\t" // U0V0 U0V0(0)
1633  "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(4)
1634  "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
1635  "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
1636  "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1637  "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)
1638 
1639  MOVNTQ" %%mm2, (%1, %%"REG_a", 2) \n\t"
1640 
1641  "movq 16(%0, %%"REG_a", 4), %%mm1 \n\t" // UYVY UYVY(8)
1642  "movq 24(%0, %%"REG_a", 4), %%mm2 \n\t" // UYVY UYVY(12)
1643  "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(8)
1644  "movq %%mm2, %%mm4 \n\t" // UYVY UYVY(12)
1645  "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(8)
1646  "pand %%mm7, %%mm2 \n\t" // U0V0 U0V0(12)
1647  "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
1648  "psrlw $8, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
1649  "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
1650  "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)
1651 
1652  MOVNTQ" %%mm3, 8(%1, %%"REG_a", 2) \n\t"
1653 
1654  "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
1655  "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
1656  "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
1657  "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
1658  "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
1659  "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
1660  "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
1661  "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)
1662 
1663  MOVNTQ" %%mm0, (%3, %%"REG_a") \n\t"
1664  MOVNTQ" %%mm2, (%2, %%"REG_a") \n\t"
1665 
1666  "add $8, %%"REG_a" \n\t"
1667  "cmp %4, %%"REG_a" \n\t"
1668  " jb 1b \n\t"
1669  ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
1670  : "memory", "%"REG_a
1671  );
1672 
1673  ydst += lumStride;
1674  src += srcStride;
1675 
1676  __asm__ volatile(
1677  "xor %%"REG_a", %%"REG_a" \n\t"
1678  ".p2align 4 \n\t"
1679  "1: \n\t"
1680  PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
1681  "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
1682  "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(4)
1683  "movq 16(%0, %%"REG_a", 4), %%mm2 \n\t" // YUYV YUYV(8)
1684  "movq 24(%0, %%"REG_a", 4), %%mm3 \n\t" // YUYV YUYV(12)
1685  "psrlw $8, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
1686  "psrlw $8, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
1687  "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
1688  "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
1689  "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
1690  "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)
1691 
1692  MOVNTQ" %%mm0, (%1, %%"REG_a", 2) \n\t"
1693  MOVNTQ" %%mm2, 8(%1, %%"REG_a", 2) \n\t"
1694 
1695  "add $8, %%"REG_a" \n\t"
1696  "cmp %4, %%"REG_a" \n\t"
1697  " jb 1b \n\t"
1698 
1699  ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
1700  : "memory", "%"REG_a
1701  );
1702  udst += chromStride;
1703  vdst += chromStride;
1704  ydst += lumStride;
1705  src += srcStride;
1706  }
1707  __asm__ volatile(EMMS" \n\t"
1708  SFENCE" \n\t"
1709  :::"memory");
1710 }
1711 #endif /* !COMPILE_TEMPLATE_AMD3DNOW */
1712 
1720 static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1721  int width, int height,
1722  int lumStride, int chromStride, int srcStride)
1723 {
1724  int y;
1725  const x86_reg chromWidth= width>>1;
1726  for (y=0; y<height-2; y+=2) {
1727  int i;
1728  for (i=0; i<2; i++) {
1729  __asm__ volatile(
1730  "mov %2, %%"REG_a" \n\t"
1731  "movq "MANGLE(ff_bgr2YCoeff)", %%mm6 \n\t"
1732  "movq "MANGLE(ff_w1111)", %%mm5 \n\t"
1733  "pxor %%mm7, %%mm7 \n\t"
1734  "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d" \n\t"
1735  ".p2align 4 \n\t"
1736  "1: \n\t"
1737  PREFETCH" 64(%0, %%"REG_d") \n\t"
1738  "movd (%0, %%"REG_d"), %%mm0 \n\t"
1739  "movd 3(%0, %%"REG_d"), %%mm1 \n\t"
1740  "punpcklbw %%mm7, %%mm0 \n\t"
1741  "punpcklbw %%mm7, %%mm1 \n\t"
1742  "movd 6(%0, %%"REG_d"), %%mm2 \n\t"
1743  "movd 9(%0, %%"REG_d"), %%mm3 \n\t"
1744  "punpcklbw %%mm7, %%mm2 \n\t"
1745  "punpcklbw %%mm7, %%mm3 \n\t"
1746  "pmaddwd %%mm6, %%mm0 \n\t"
1747  "pmaddwd %%mm6, %%mm1 \n\t"
1748  "pmaddwd %%mm6, %%mm2 \n\t"
1749  "pmaddwd %%mm6, %%mm3 \n\t"
1750 #ifndef FAST_BGR2YV12
1751  "psrad $8, %%mm0 \n\t"
1752  "psrad $8, %%mm1 \n\t"
1753  "psrad $8, %%mm2 \n\t"
1754  "psrad $8, %%mm3 \n\t"
1755 #endif
1756  "packssdw %%mm1, %%mm0 \n\t"
1757  "packssdw %%mm3, %%mm2 \n\t"
1758  "pmaddwd %%mm5, %%mm0 \n\t"
1759  "pmaddwd %%mm5, %%mm2 \n\t"
1760  "packssdw %%mm2, %%mm0 \n\t"
1761  "psraw $7, %%mm0 \n\t"
1762 
1763  "movd 12(%0, %%"REG_d"), %%mm4 \n\t"
1764  "movd 15(%0, %%"REG_d"), %%mm1 \n\t"
1765  "punpcklbw %%mm7, %%mm4 \n\t"
1766  "punpcklbw %%mm7, %%mm1 \n\t"
1767  "movd 18(%0, %%"REG_d"), %%mm2 \n\t"
1768  "movd 21(%0, %%"REG_d"), %%mm3 \n\t"
1769  "punpcklbw %%mm7, %%mm2 \n\t"
1770  "punpcklbw %%mm7, %%mm3 \n\t"
1771  "pmaddwd %%mm6, %%mm4 \n\t"
1772  "pmaddwd %%mm6, %%mm1 \n\t"
1773  "pmaddwd %%mm6, %%mm2 \n\t"
1774  "pmaddwd %%mm6, %%mm3 \n\t"
1775 #ifndef FAST_BGR2YV12
1776  "psrad $8, %%mm4 \n\t"
1777  "psrad $8, %%mm1 \n\t"
1778  "psrad $8, %%mm2 \n\t"
1779  "psrad $8, %%mm3 \n\t"
1780 #endif
1781  "packssdw %%mm1, %%mm4 \n\t"
1782  "packssdw %%mm3, %%mm2 \n\t"
1783  "pmaddwd %%mm5, %%mm4 \n\t"
1784  "pmaddwd %%mm5, %%mm2 \n\t"
1785  "add $24, %%"REG_d" \n\t"
1786  "packssdw %%mm2, %%mm4 \n\t"
1787  "psraw $7, %%mm4 \n\t"
1788 
1789  "packuswb %%mm4, %%mm0 \n\t"
1790  "paddusb "MANGLE(ff_bgr2YOffset)", %%mm0 \n\t"
1791 
1792  MOVNTQ" %%mm0, (%1, %%"REG_a") \n\t"
1793  "add $8, %%"REG_a" \n\t"
1794  " js 1b \n\t"
1795  : : "r" (src+width*3), "r" (ydst+width), "g" ((x86_reg)-width)
1796  : "%"REG_a, "%"REG_d
1797  );
1798  ydst += lumStride;
1799  src += srcStride;
1800  }
1801  src -= srcStride*2;
1802  __asm__ volatile(
1803  "mov %4, %%"REG_a" \n\t"
1804  "movq "MANGLE(ff_w1111)", %%mm5 \n\t"
1805  "movq "MANGLE(ff_bgr2UCoeff)", %%mm6 \n\t"
1806  "pxor %%mm7, %%mm7 \n\t"
1807  "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d" \n\t"
1808  "add %%"REG_d", %%"REG_d" \n\t"
1809  ".p2align 4 \n\t"
1810  "1: \n\t"
1811  PREFETCH" 64(%0, %%"REG_d") \n\t"
1812  PREFETCH" 64(%1, %%"REG_d") \n\t"
1813 #if COMPILE_TEMPLATE_MMX2 || COMPILE_TEMPLATE_AMD3DNOW
1814  "movq (%0, %%"REG_d"), %%mm0 \n\t"
1815  "movq (%1, %%"REG_d"), %%mm1 \n\t"
1816  "movq 6(%0, %%"REG_d"), %%mm2 \n\t"
1817  "movq 6(%1, %%"REG_d"), %%mm3 \n\t"
1818  PAVGB" %%mm1, %%mm0 \n\t"
1819  PAVGB" %%mm3, %%mm2 \n\t"
1820  "movq %%mm0, %%mm1 \n\t"
1821  "movq %%mm2, %%mm3 \n\t"
1822  "psrlq $24, %%mm0 \n\t"
1823  "psrlq $24, %%mm2 \n\t"
1824  PAVGB" %%mm1, %%mm0 \n\t"
1825  PAVGB" %%mm3, %%mm2 \n\t"
1826  "punpcklbw %%mm7, %%mm0 \n\t"
1827  "punpcklbw %%mm7, %%mm2 \n\t"
1828 #else
1829  "movd (%0, %%"REG_d"), %%mm0 \n\t"
1830  "movd (%1, %%"REG_d"), %%mm1 \n\t"
1831  "movd 3(%0, %%"REG_d"), %%mm2 \n\t"
1832  "movd 3(%1, %%"REG_d"), %%mm3 \n\t"
1833  "punpcklbw %%mm7, %%mm0 \n\t"
1834  "punpcklbw %%mm7, %%mm1 \n\t"
1835  "punpcklbw %%mm7, %%mm2 \n\t"
1836  "punpcklbw %%mm7, %%mm3 \n\t"
1837  "paddw %%mm1, %%mm0 \n\t"
1838  "paddw %%mm3, %%mm2 \n\t"
1839  "paddw %%mm2, %%mm0 \n\t"
1840  "movd 6(%0, %%"REG_d"), %%mm4 \n\t"
1841  "movd 6(%1, %%"REG_d"), %%mm1 \n\t"
1842  "movd 9(%0, %%"REG_d"), %%mm2 \n\t"
1843  "movd 9(%1, %%"REG_d"), %%mm3 \n\t"
1844  "punpcklbw %%mm7, %%mm4 \n\t"
1845  "punpcklbw %%mm7, %%mm1 \n\t"
1846  "punpcklbw %%mm7, %%mm2 \n\t"
1847  "punpcklbw %%mm7, %%mm3 \n\t"
1848  "paddw %%mm1, %%mm4 \n\t"
1849  "paddw %%mm3, %%mm2 \n\t"
1850  "paddw %%mm4, %%mm2 \n\t"
1851  "psrlw $2, %%mm0 \n\t"
1852  "psrlw $2, %%mm2 \n\t"
1853 #endif
1854  "movq "MANGLE(ff_bgr2VCoeff)", %%mm1 \n\t"
1855  "movq "MANGLE(ff_bgr2VCoeff)", %%mm3 \n\t"
1856 
1857  "pmaddwd %%mm0, %%mm1 \n\t"
1858  "pmaddwd %%mm2, %%mm3 \n\t"
1859  "pmaddwd %%mm6, %%mm0 \n\t"
1860  "pmaddwd %%mm6, %%mm2 \n\t"
1861 #ifndef FAST_BGR2YV12
1862  "psrad $8, %%mm0 \n\t"
1863  "psrad $8, %%mm1 \n\t"
1864  "psrad $8, %%mm2 \n\t"
1865  "psrad $8, %%mm3 \n\t"
1866 #endif
1867  "packssdw %%mm2, %%mm0 \n\t"
1868  "packssdw %%mm3, %%mm1 \n\t"
1869  "pmaddwd %%mm5, %%mm0 \n\t"
1870  "pmaddwd %%mm5, %%mm1 \n\t"
1871  "packssdw %%mm1, %%mm0 \n\t" // V1 V0 U1 U0
1872  "psraw $7, %%mm0 \n\t"
1873 
1874 #if COMPILE_TEMPLATE_MMX2 || COMPILE_TEMPLATE_AMD3DNOW
1875  "movq 12(%0, %%"REG_d"), %%mm4 \n\t"
1876  "movq 12(%1, %%"REG_d"), %%mm1 \n\t"
1877  "movq 18(%0, %%"REG_d"), %%mm2 \n\t"
1878  "movq 18(%1, %%"REG_d"), %%mm3 \n\t"
1879  PAVGB" %%mm1, %%mm4 \n\t"
1880  PAVGB" %%mm3, %%mm2 \n\t"
1881  "movq %%mm4, %%mm1 \n\t"
1882  "movq %%mm2, %%mm3 \n\t"
1883  "psrlq $24, %%mm4 \n\t"
1884  "psrlq $24, %%mm2 \n\t"
1885  PAVGB" %%mm1, %%mm4 \n\t"
1886  PAVGB" %%mm3, %%mm2 \n\t"
1887  "punpcklbw %%mm7, %%mm4 \n\t"
1888  "punpcklbw %%mm7, %%mm2 \n\t"
1889 #else
1890  "movd 12(%0, %%"REG_d"), %%mm4 \n\t"
1891  "movd 12(%1, %%"REG_d"), %%mm1 \n\t"
1892  "movd 15(%0, %%"REG_d"), %%mm2 \n\t"
1893  "movd 15(%1, %%"REG_d"), %%mm3 \n\t"
1894  "punpcklbw %%mm7, %%mm4 \n\t"
1895  "punpcklbw %%mm7, %%mm1 \n\t"
1896  "punpcklbw %%mm7, %%mm2 \n\t"
1897  "punpcklbw %%mm7, %%mm3 \n\t"
1898  "paddw %%mm1, %%mm4 \n\t"
1899  "paddw %%mm3, %%mm2 \n\t"
1900  "paddw %%mm2, %%mm4 \n\t"
1901  "movd 18(%0, %%"REG_d"), %%mm5 \n\t"
1902  "movd 18(%1, %%"REG_d"), %%mm1 \n\t"
1903  "movd 21(%0, %%"REG_d"), %%mm2 \n\t"
1904  "movd 21(%1, %%"REG_d"), %%mm3 \n\t"
1905  "punpcklbw %%mm7, %%mm5 \n\t"
1906  "punpcklbw %%mm7, %%mm1 \n\t"
1907  "punpcklbw %%mm7, %%mm2 \n\t"
1908  "punpcklbw %%mm7, %%mm3 \n\t"
1909  "paddw %%mm1, %%mm5 \n\t"
1910  "paddw %%mm3, %%mm2 \n\t"
1911  "paddw %%mm5, %%mm2 \n\t"
1912  "movq "MANGLE(ff_w1111)", %%mm5 \n\t"
1913  "psrlw $2, %%mm4 \n\t"
1914  "psrlw $2, %%mm2 \n\t"
1915 #endif
1916  "movq "MANGLE(ff_bgr2VCoeff)", %%mm1 \n\t"
1917  "movq "MANGLE(ff_bgr2VCoeff)", %%mm3 \n\t"
1918 
1919  "pmaddwd %%mm4, %%mm1 \n\t"
1920  "pmaddwd %%mm2, %%mm3 \n\t"
1921  "pmaddwd %%mm6, %%mm4 \n\t"
1922  "pmaddwd %%mm6, %%mm2 \n\t"
1923 #ifndef FAST_BGR2YV12
1924  "psrad $8, %%mm4 \n\t"
1925  "psrad $8, %%mm1 \n\t"
1926  "psrad $8, %%mm2 \n\t"
1927  "psrad $8, %%mm3 \n\t"
1928 #endif
1929  "packssdw %%mm2, %%mm4 \n\t"
1930  "packssdw %%mm3, %%mm1 \n\t"
1931  "pmaddwd %%mm5, %%mm4 \n\t"
1932  "pmaddwd %%mm5, %%mm1 \n\t"
1933  "add $24, %%"REG_d" \n\t"
1934  "packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2
1935  "psraw $7, %%mm4 \n\t"
1936 
1937  "movq %%mm0, %%mm1 \n\t"
1938  "punpckldq %%mm4, %%mm0 \n\t"
1939  "punpckhdq %%mm4, %%mm1 \n\t"
1940  "packsswb %%mm1, %%mm0 \n\t"
1941  "paddb "MANGLE(ff_bgr2UVOffset)", %%mm0 \n\t"
1942  "movd %%mm0, (%2, %%"REG_a") \n\t"
1943  "punpckhdq %%mm0, %%mm0 \n\t"
1944  "movd %%mm0, (%3, %%"REG_a") \n\t"
1945  "add $4, %%"REG_a" \n\t"
1946  " js 1b \n\t"
1947  : : "r" (src+chromWidth*6), "r" (src+srcStride+chromWidth*6), "r" (udst+chromWidth), "r" (vdst+chromWidth), "g" (-chromWidth)
1948  : "%"REG_a, "%"REG_d
1949  );
1950 
1951  udst += chromStride;
1952  vdst += chromStride;
1953  src += srcStride*2;
1954  }
1955 
1956  __asm__ volatile(EMMS" \n\t"
1957  SFENCE" \n\t"
1958  :::"memory");
1959 
1960  rgb24toyv12_c(src, ydst, udst, vdst, width, height-y, lumStride, chromStride, srcStride);
1961 }
1962 #endif /* !COMPILE_TEMPLATE_SSE2 */
1963 
1964 #if !COMPILE_TEMPLATE_AMD3DNOW
1965 static void RENAME(interleaveBytes)(const uint8_t *src1, const uint8_t *src2, uint8_t *dest,
1966  int width, int height, int src1Stride,
1967  int src2Stride, int dstStride)
1968 {
1969  int h;
1970 
1971  for (h=0; h < height; h++) {
1972  int w;
1973 
1974 #if COMPILE_TEMPLATE_SSE2
1975  __asm__(
1976  "xor %%"REG_a", %%"REG_a" \n\t"
1977  "1: \n\t"
1978  PREFETCH" 64(%1, %%"REG_a") \n\t"
1979  PREFETCH" 64(%2, %%"REG_a") \n\t"
1980  "movdqa (%1, %%"REG_a"), %%xmm0 \n\t"
1981  "movdqa (%1, %%"REG_a"), %%xmm1 \n\t"
1982  "movdqa (%2, %%"REG_a"), %%xmm2 \n\t"
1983  "punpcklbw %%xmm2, %%xmm0 \n\t"
1984  "punpckhbw %%xmm2, %%xmm1 \n\t"
1985  "movntdq %%xmm0, (%0, %%"REG_a", 2) \n\t"
1986  "movntdq %%xmm1, 16(%0, %%"REG_a", 2) \n\t"
1987  "add $16, %%"REG_a" \n\t"
1988  "cmp %3, %%"REG_a" \n\t"
1989  " jb 1b \n\t"
1990  ::"r"(dest), "r"(src1), "r"(src2), "r" ((x86_reg)width-15)
1991  : "memory", "%"REG_a""
1992  );
1993 #else
1994  __asm__(
1995  "xor %%"REG_a", %%"REG_a" \n\t"
1996  "1: \n\t"
1997  PREFETCH" 64(%1, %%"REG_a") \n\t"
1998  PREFETCH" 64(%2, %%"REG_a") \n\t"
1999  "movq (%1, %%"REG_a"), %%mm0 \n\t"
2000  "movq 8(%1, %%"REG_a"), %%mm2 \n\t"
2001  "movq %%mm0, %%mm1 \n\t"
2002  "movq %%mm2, %%mm3 \n\t"
2003  "movq (%2, %%"REG_a"), %%mm4 \n\t"
2004  "movq 8(%2, %%"REG_a"), %%mm5 \n\t"
2005  "punpcklbw %%mm4, %%mm0 \n\t"
2006  "punpckhbw %%mm4, %%mm1 \n\t"
2007  "punpcklbw %%mm5, %%mm2 \n\t"
2008  "punpckhbw %%mm5, %%mm3 \n\t"
2009  MOVNTQ" %%mm0, (%0, %%"REG_a", 2) \n\t"
2010  MOVNTQ" %%mm1, 8(%0, %%"REG_a", 2) \n\t"
2011  MOVNTQ" %%mm2, 16(%0, %%"REG_a", 2) \n\t"
2012  MOVNTQ" %%mm3, 24(%0, %%"REG_a", 2) \n\t"
2013  "add $16, %%"REG_a" \n\t"
2014  "cmp %3, %%"REG_a" \n\t"
2015  " jb 1b \n\t"
2016  ::"r"(dest), "r"(src1), "r"(src2), "r" ((x86_reg)width-15)
2017  : "memory", "%"REG_a
2018  );
2019 #endif
2020  for (w= (width&(~15)); w < width; w++) {
2021  dest[2*w+0] = src1[w];
2022  dest[2*w+1] = src2[w];
2023  }
2024  dest += dstStride;
2025  src1 += src1Stride;
2026  src2 += src2Stride;
2027  }
2028  __asm__(
2029  EMMS" \n\t"
2030  SFENCE" \n\t"
2031  ::: "memory"
2032  );
2033 }
2034 #endif /* !COMPILE_TEMPLATE_AMD3DNOW */
2035 
2036 #if !COMPILE_TEMPLATE_SSE2
2037 #if !COMPILE_TEMPLATE_AMD3DNOW
2038 static inline void RENAME(vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2,
2039  uint8_t *dst1, uint8_t *dst2,
2040  int width, int height,
2041  int srcStride1, int srcStride2,
2042  int dstStride1, int dstStride2)
2043 {
2044  x86_reg y;
2045  int x,w,h;
2046  w=width/2; h=height/2;
2047  __asm__ volatile(
2048  PREFETCH" %0 \n\t"
2049  PREFETCH" %1 \n\t"
2050  ::"m"(*(src1+srcStride1)),"m"(*(src2+srcStride2)):"memory");
2051  for (y=0;y<h;y++) {
2052  const uint8_t* s1=src1+srcStride1*(y>>1);
2053  uint8_t* d=dst1+dstStride1*y;
2054  x=0;
2055  for (;x<w-31;x+=32) {
2056  __asm__ volatile(
2057  PREFETCH" 32%1 \n\t"
2058  "movq %1, %%mm0 \n\t"
2059  "movq 8%1, %%mm2 \n\t"
2060  "movq 16%1, %%mm4 \n\t"
2061  "movq 24%1, %%mm6 \n\t"
2062  "movq %%mm0, %%mm1 \n\t"
2063  "movq %%mm2, %%mm3 \n\t"
2064  "movq %%mm4, %%mm5 \n\t"
2065  "movq %%mm6, %%mm7 \n\t"
2066  "punpcklbw %%mm0, %%mm0 \n\t"
2067  "punpckhbw %%mm1, %%mm1 \n\t"
2068  "punpcklbw %%mm2, %%mm2 \n\t"
2069  "punpckhbw %%mm3, %%mm3 \n\t"
2070  "punpcklbw %%mm4, %%mm4 \n\t"
2071  "punpckhbw %%mm5, %%mm5 \n\t"
2072  "punpcklbw %%mm6, %%mm6 \n\t"
2073  "punpckhbw %%mm7, %%mm7 \n\t"
2074  MOVNTQ" %%mm0, %0 \n\t"
2075  MOVNTQ" %%mm1, 8%0 \n\t"
2076  MOVNTQ" %%mm2, 16%0 \n\t"
2077  MOVNTQ" %%mm3, 24%0 \n\t"
2078  MOVNTQ" %%mm4, 32%0 \n\t"
2079  MOVNTQ" %%mm5, 40%0 \n\t"
2080  MOVNTQ" %%mm6, 48%0 \n\t"
2081  MOVNTQ" %%mm7, 56%0"
2082  :"=m"(d[2*x])
2083  :"m"(s1[x])
2084  :"memory");
2085  }
2086  for (;x<w;x++) d[2*x]=d[2*x+1]=s1[x];
2087  }
2088  for (y=0;y<h;y++) {
2089  const uint8_t* s2=src2+srcStride2*(y>>1);
2090  uint8_t* d=dst2+dstStride2*y;
2091  x=0;
2092  for (;x<w-31;x+=32) {
2093  __asm__ volatile(
2094  PREFETCH" 32%1 \n\t"
2095  "movq %1, %%mm0 \n\t"
2096  "movq 8%1, %%mm2 \n\t"
2097  "movq 16%1, %%mm4 \n\t"
2098  "movq 24%1, %%mm6 \n\t"
2099  "movq %%mm0, %%mm1 \n\t"
2100  "movq %%mm2, %%mm3 \n\t"
2101  "movq %%mm4, %%mm5 \n\t"
2102  "movq %%mm6, %%mm7 \n\t"
2103  "punpcklbw %%mm0, %%mm0 \n\t"
2104  "punpckhbw %%mm1, %%mm1 \n\t"
2105  "punpcklbw %%mm2, %%mm2 \n\t"
2106  "punpckhbw %%mm3, %%mm3 \n\t"
2107  "punpcklbw %%mm4, %%mm4 \n\t"
2108  "punpckhbw %%mm5, %%mm5 \n\t"
2109  "punpcklbw %%mm6, %%mm6 \n\t"
2110  "punpckhbw %%mm7, %%mm7 \n\t"
2111  MOVNTQ" %%mm0, %0 \n\t"
2112  MOVNTQ" %%mm1, 8%0 \n\t"
2113  MOVNTQ" %%mm2, 16%0 \n\t"
2114  MOVNTQ" %%mm3, 24%0 \n\t"
2115  MOVNTQ" %%mm4, 32%0 \n\t"
2116  MOVNTQ" %%mm5, 40%0 \n\t"
2117  MOVNTQ" %%mm6, 48%0 \n\t"
2118  MOVNTQ" %%mm7, 56%0"
2119  :"=m"(d[2*x])
2120  :"m"(s2[x])
2121  :"memory");
2122  }
2123  for (;x<w;x++) d[2*x]=d[2*x+1]=s2[x];
2124  }
2125  __asm__(
2126  EMMS" \n\t"
2127  SFENCE" \n\t"
2128  ::: "memory"
2129  );
2130 }
2131 
2132 static inline void RENAME(yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3,
2133  uint8_t *dst,
2134  int width, int height,
2135  int srcStride1, int srcStride2,
2136  int srcStride3, int dstStride)
2137 {
2138  x86_reg x;
2139  int y,w,h;
2140  w=width/2; h=height;
2141  for (y=0;y<h;y++) {
2142  const uint8_t* yp=src1+srcStride1*y;
2143  const uint8_t* up=src2+srcStride2*(y>>2);
2144  const uint8_t* vp=src3+srcStride3*(y>>2);
2145  uint8_t* d=dst+dstStride*y;
2146  x=0;
2147  for (;x<w-7;x+=8) {
2148  __asm__ volatile(
2149  PREFETCH" 32(%1, %0) \n\t"
2150  PREFETCH" 32(%2, %0) \n\t"
2151  PREFETCH" 32(%3, %0) \n\t"
2152  "movq (%1, %0, 4), %%mm0 \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
2153  "movq (%2, %0), %%mm1 \n\t" /* U0U1U2U3U4U5U6U7 */
2154  "movq (%3, %0), %%mm2 \n\t" /* V0V1V2V3V4V5V6V7 */
2155  "movq %%mm0, %%mm3 \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
2156  "movq %%mm1, %%mm4 \n\t" /* U0U1U2U3U4U5U6U7 */
2157  "movq %%mm2, %%mm5 \n\t" /* V0V1V2V3V4V5V6V7 */
2158  "punpcklbw %%mm1, %%mm1 \n\t" /* U0U0 U1U1 U2U2 U3U3 */
2159  "punpcklbw %%mm2, %%mm2 \n\t" /* V0V0 V1V1 V2V2 V3V3 */
2160  "punpckhbw %%mm4, %%mm4 \n\t" /* U4U4 U5U5 U6U6 U7U7 */
2161  "punpckhbw %%mm5, %%mm5 \n\t" /* V4V4 V5V5 V6V6 V7V7 */
2162 
2163  "movq %%mm1, %%mm6 \n\t"
2164  "punpcklbw %%mm2, %%mm1 \n\t" /* U0V0 U0V0 U1V1 U1V1*/
2165  "punpcklbw %%mm1, %%mm0 \n\t" /* Y0U0 Y1V0 Y2U0 Y3V0*/
2166  "punpckhbw %%mm1, %%mm3 \n\t" /* Y4U1 Y5V1 Y6U1 Y7V1*/
2167  MOVNTQ" %%mm0, (%4, %0, 8) \n\t"
2168  MOVNTQ" %%mm3, 8(%4, %0, 8) \n\t"
2169 
2170  "punpckhbw %%mm2, %%mm6 \n\t" /* U2V2 U2V2 U3V3 U3V3*/
2171  "movq 8(%1, %0, 4), %%mm0 \n\t"
2172  "movq %%mm0, %%mm3 \n\t"
2173  "punpcklbw %%mm6, %%mm0 \n\t" /* Y U2 Y V2 Y U2 Y V2*/
2174  "punpckhbw %%mm6, %%mm3 \n\t" /* Y U3 Y V3 Y U3 Y V3*/
2175  MOVNTQ" %%mm0, 16(%4, %0, 8) \n\t"
2176  MOVNTQ" %%mm3, 24(%4, %0, 8) \n\t"
2177 
2178  "movq %%mm4, %%mm6 \n\t"
2179  "movq 16(%1, %0, 4), %%mm0 \n\t"
2180  "movq %%mm0, %%mm3 \n\t"
2181  "punpcklbw %%mm5, %%mm4 \n\t"
2182  "punpcklbw %%mm4, %%mm0 \n\t" /* Y U4 Y V4 Y U4 Y V4*/
2183  "punpckhbw %%mm4, %%mm3 \n\t" /* Y U5 Y V5 Y U5 Y V5*/
2184  MOVNTQ" %%mm0, 32(%4, %0, 8) \n\t"
2185  MOVNTQ" %%mm3, 40(%4, %0, 8) \n\t"
2186 
2187  "punpckhbw %%mm5, %%mm6 \n\t"
2188  "movq 24(%1, %0, 4), %%mm0 \n\t"
2189  "movq %%mm0, %%mm3 \n\t"
2190  "punpcklbw %%mm6, %%mm0 \n\t" /* Y U6 Y V6 Y U6 Y V6*/
2191  "punpckhbw %%mm6, %%mm3 \n\t" /* Y U7 Y V7 Y U7 Y V7*/
2192  MOVNTQ" %%mm0, 48(%4, %0, 8) \n\t"
2193  MOVNTQ" %%mm3, 56(%4, %0, 8) \n\t"
2194 
2195  : "+r" (x)
2196  : "r"(yp), "r" (up), "r"(vp), "r"(d)
2197  :"memory");
2198  }
2199  for (; x<w; x++) {
2200  const int x2 = x<<2;
2201  d[8*x+0] = yp[x2];
2202  d[8*x+1] = up[x];
2203  d[8*x+2] = yp[x2+1];
2204  d[8*x+3] = vp[x];
2205  d[8*x+4] = yp[x2+2];
2206  d[8*x+5] = up[x];
2207  d[8*x+6] = yp[x2+3];
2208  d[8*x+7] = vp[x];
2209  }
2210  }
2211  __asm__(
2212  EMMS" \n\t"
2213  SFENCE" \n\t"
2214  ::: "memory"
2215  );
2216 }
2217 #endif /* !COMPILE_TEMPLATE_AMD3DNOW */
2218 
2219 static void RENAME(extract_even)(const uint8_t *src, uint8_t *dst, x86_reg count)
2220 {
2221  dst += count;
2222  src += 2*count;
2223  count= - count;
2224 
2225  if(count <= -16) {
2226  count += 15;
2227  __asm__ volatile(
2228  "pcmpeqw %%mm7, %%mm7 \n\t"
2229  "psrlw $8, %%mm7 \n\t"
2230  "1: \n\t"
2231  "movq -30(%1, %0, 2), %%mm0 \n\t"
2232  "movq -22(%1, %0, 2), %%mm1 \n\t"
2233  "movq -14(%1, %0, 2), %%mm2 \n\t"
2234  "movq -6(%1, %0, 2), %%mm3 \n\t"
2235  "pand %%mm7, %%mm0 \n\t"
2236  "pand %%mm7, %%mm1 \n\t"
2237  "pand %%mm7, %%mm2 \n\t"
2238  "pand %%mm7, %%mm3 \n\t"
2239  "packuswb %%mm1, %%mm0 \n\t"
2240  "packuswb %%mm3, %%mm2 \n\t"
2241  MOVNTQ" %%mm0,-15(%2, %0) \n\t"
2242  MOVNTQ" %%mm2,- 7(%2, %0) \n\t"
2243  "add $16, %0 \n\t"
2244  " js 1b \n\t"
2245  : "+r"(count)
2246  : "r"(src), "r"(dst)
2247  );
2248  count -= 15;
2249  }
2250  while(count<0) {
2251  dst[count]= src[2*count];
2252  count++;
2253  }
2254 }
2255 
2256 #if !COMPILE_TEMPLATE_AMD3DNOW
2257 static void RENAME(extract_even2)(const uint8_t *src, uint8_t *dst0, uint8_t *dst1, x86_reg count)
2258 {
2259  dst0+= count;
2260  dst1+= count;
2261  src += 4*count;
2262  count= - count;
2263  if(count <= -8) {
2264  count += 7;
2265  __asm__ volatile(
2266  "pcmpeqw %%mm7, %%mm7 \n\t"
2267  "psrlw $8, %%mm7 \n\t"
2268  "1: \n\t"
2269  "movq -28(%1, %0, 4), %%mm0 \n\t"
2270  "movq -20(%1, %0, 4), %%mm1 \n\t"
2271  "movq -12(%1, %0, 4), %%mm2 \n\t"
2272  "movq -4(%1, %0, 4), %%mm3 \n\t"
2273  "pand %%mm7, %%mm0 \n\t"
2274  "pand %%mm7, %%mm1 \n\t"
2275  "pand %%mm7, %%mm2 \n\t"
2276  "pand %%mm7, %%mm3 \n\t"
2277  "packuswb %%mm1, %%mm0 \n\t"
2278  "packuswb %%mm3, %%mm2 \n\t"
2279  "movq %%mm0, %%mm1 \n\t"
2280  "movq %%mm2, %%mm3 \n\t"
2281  "psrlw $8, %%mm0 \n\t"
2282  "psrlw $8, %%mm2 \n\t"
2283  "pand %%mm7, %%mm1 \n\t"
2284  "pand %%mm7, %%mm3 \n\t"
2285  "packuswb %%mm2, %%mm0 \n\t"
2286  "packuswb %%mm3, %%mm1 \n\t"
2287  MOVNTQ" %%mm0,- 7(%3, %0) \n\t"
2288  MOVNTQ" %%mm1,- 7(%2, %0) \n\t"
2289  "add $8, %0 \n\t"
2290  " js 1b \n\t"
2291  : "+r"(count)
2292  : "r"(src), "r"(dst0), "r"(dst1)
2293  );
2294  count -= 7;
2295  }
2296  while(count<0) {
2297  dst0[count]= src[4*count+0];
2298  dst1[count]= src[4*count+2];
2299  count++;
2300  }
2301 }
2302 #endif /* !COMPILE_TEMPLATE_AMD3DNOW */
2303 
2304 static void RENAME(extract_even2avg)(const uint8_t *src0, const uint8_t *src1, uint8_t *dst0, uint8_t *dst1, x86_reg count)
2305 {
2306  dst0 += count;
2307  dst1 += count;
2308  src0 += 4*count;
2309  src1 += 4*count;
2310  count= - count;
2311 #ifdef PAVGB
2312  if(count <= -8) {
2313  count += 7;
2314  __asm__ volatile(
2315  "pcmpeqw %%mm7, %%mm7 \n\t"
2316  "psrlw $8, %%mm7 \n\t"
2317  "1: \n\t"
2318  "movq -28(%1, %0, 4), %%mm0 \n\t"
2319  "movq -20(%1, %0, 4), %%mm1 \n\t"
2320  "movq -12(%1, %0, 4), %%mm2 \n\t"
2321  "movq -4(%1, %0, 4), %%mm3 \n\t"
2322  PAVGB" -28(%2, %0, 4), %%mm0 \n\t"
2323  PAVGB" -20(%2, %0, 4), %%mm1 \n\t"
2324  PAVGB" -12(%2, %0, 4), %%mm2 \n\t"
2325  PAVGB" - 4(%2, %0, 4), %%mm3 \n\t"
2326  "pand %%mm7, %%mm0 \n\t"
2327  "pand %%mm7, %%mm1 \n\t"
2328  "pand %%mm7, %%mm2 \n\t"
2329  "pand %%mm7, %%mm3 \n\t"
2330  "packuswb %%mm1, %%mm0 \n\t"
2331  "packuswb %%mm3, %%mm2 \n\t"
2332  "movq %%mm0, %%mm1 \n\t"
2333  "movq %%mm2, %%mm3 \n\t"
2334  "psrlw $8, %%mm0 \n\t"
2335  "psrlw $8, %%mm2 \n\t"
2336  "pand %%mm7, %%mm1 \n\t"
2337  "pand %%mm7, %%mm3 \n\t"
2338  "packuswb %%mm2, %%mm0 \n\t"
2339  "packuswb %%mm3, %%mm1 \n\t"
2340  MOVNTQ" %%mm0,- 7(%4, %0) \n\t"
2341  MOVNTQ" %%mm1,- 7(%3, %0) \n\t"
2342  "add $8, %0 \n\t"
2343  " js 1b \n\t"
2344  : "+r"(count)
2345  : "r"(src0), "r"(src1), "r"(dst0), "r"(dst1)
2346  );
2347  count -= 7;
2348  }
2349 #endif
2350  while(count<0) {
2351  dst0[count]= (src0[4*count+0]+src1[4*count+0])>>1;
2352  dst1[count]= (src0[4*count+2]+src1[4*count+2])>>1;
2353  count++;
2354  }
2355 }
2356 
2357 #if !COMPILE_TEMPLATE_AMD3DNOW
2358 static void RENAME(extract_odd2)(const uint8_t *src, uint8_t *dst0, uint8_t *dst1, x86_reg count)
2359 {
2360  dst0+= count;
2361  dst1+= count;
2362  src += 4*count;
2363  count= - count;
2364  if(count <= -8) {
2365  count += 7;
2366  __asm__ volatile(
2367  "pcmpeqw %%mm7, %%mm7 \n\t"
2368  "psrlw $8, %%mm7 \n\t"
2369  "1: \n\t"
2370  "movq -28(%1, %0, 4), %%mm0 \n\t"
2371  "movq -20(%1, %0, 4), %%mm1 \n\t"
2372  "movq -12(%1, %0, 4), %%mm2 \n\t"
2373  "movq -4(%1, %0, 4), %%mm3 \n\t"
2374  "psrlw $8, %%mm0 \n\t"
2375  "psrlw $8, %%mm1 \n\t"
2376  "psrlw $8, %%mm2 \n\t"
2377  "psrlw $8, %%mm3 \n\t"
2378  "packuswb %%mm1, %%mm0 \n\t"
2379  "packuswb %%mm3, %%mm2 \n\t"
2380  "movq %%mm0, %%mm1 \n\t"
2381  "movq %%mm2, %%mm3 \n\t"
2382  "psrlw $8, %%mm0 \n\t"
2383  "psrlw $8, %%mm2 \n\t"
2384  "pand %%mm7, %%mm1 \n\t"
2385  "pand %%mm7, %%mm3 \n\t"
2386  "packuswb %%mm2, %%mm0 \n\t"
2387  "packuswb %%mm3, %%mm1 \n\t"
2388  MOVNTQ" %%mm0,- 7(%3, %0) \n\t"
2389  MOVNTQ" %%mm1,- 7(%2, %0) \n\t"
2390  "add $8, %0 \n\t"
2391  " js 1b \n\t"
2392  : "+r"(count)
2393  : "r"(src), "r"(dst0), "r"(dst1)
2394  );
2395  count -= 7;
2396  }
2397  src++;
2398  while(count<0) {
2399  dst0[count]= src[4*count+0];
2400  dst1[count]= src[4*count+2];
2401  count++;
2402  }
2403 }
2404 #endif /* !COMPILE_TEMPLATE_AMD3DNOW */
2405 
2406 static void RENAME(extract_odd2avg)(const uint8_t *src0, const uint8_t *src1, uint8_t *dst0, uint8_t *dst1, x86_reg count)
2407 {
2408  dst0 += count;
2409  dst1 += count;
2410  src0 += 4*count;
2411  src1 += 4*count;
2412  count= - count;
2413 #ifdef PAVGB
2414  if(count <= -8) {
2415  count += 7;
2416  __asm__ volatile(
2417  "pcmpeqw %%mm7, %%mm7 \n\t"
2418  "psrlw $8, %%mm7 \n\t"
2419  "1: \n\t"
2420  "movq -28(%1, %0, 4), %%mm0 \n\t"
2421  "movq -20(%1, %0, 4), %%mm1 \n\t"
2422  "movq -12(%1, %0, 4), %%mm2 \n\t"
2423  "movq -4(%1, %0, 4), %%mm3 \n\t"
2424  PAVGB" -28(%2, %0, 4), %%mm0 \n\t"
2425  PAVGB" -20(%2, %0, 4), %%mm1 \n\t"
2426  PAVGB" -12(%2, %0, 4), %%mm2 \n\t"
2427  PAVGB" - 4(%2, %0, 4), %%mm3 \n\t"
2428  "psrlw $8, %%mm0 \n\t"
2429  "psrlw $8, %%mm1 \n\t"
2430  "psrlw $8, %%mm2 \n\t"
2431  "psrlw $8, %%mm3 \n\t"
2432  "packuswb %%mm1, %%mm0 \n\t"
2433  "packuswb %%mm3, %%mm2 \n\t"
2434  "movq %%mm0, %%mm1 \n\t"
2435  "movq %%mm2, %%mm3 \n\t"
2436  "psrlw $8, %%mm0 \n\t"
2437  "psrlw $8, %%mm2 \n\t"
2438  "pand %%mm7, %%mm1 \n\t"
2439  "pand %%mm7, %%mm3 \n\t"
2440  "packuswb %%mm2, %%mm0 \n\t"
2441  "packuswb %%mm3, %%mm1 \n\t"
2442  MOVNTQ" %%mm0,- 7(%4, %0) \n\t"
2443  MOVNTQ" %%mm1,- 7(%3, %0) \n\t"
2444  "add $8, %0 \n\t"
2445  " js 1b \n\t"
2446  : "+r"(count)
2447  : "r"(src0), "r"(src1), "r"(dst0), "r"(dst1)
2448  );
2449  count -= 7;
2450  }
2451 #endif
2452  src0++;
2453  src1++;
2454  while(count<0) {
2455  dst0[count]= (src0[4*count+0]+src1[4*count+0])>>1;
2456  dst1[count]= (src0[4*count+2]+src1[4*count+2])>>1;
2457  count++;
2458  }
2459 }
2460 
2461 static void RENAME(yuyvtoyuv420)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
2462  int width, int height,
2463  int lumStride, int chromStride, int srcStride)
2464 {
2465  int y;
2466  const int chromWidth= -((-width)>>1);
2467 
2468  for (y=0; y<height; y++) {
2469  RENAME(extract_even)(src, ydst, width);
2470  if(y&1) {
2471  RENAME(extract_odd2avg)(src-srcStride, src, udst, vdst, chromWidth);
2472  udst+= chromStride;
2473  vdst+= chromStride;
2474  }
2475 
2476  src += srcStride;
2477  ydst+= lumStride;
2478  }
2479  __asm__(
2480  EMMS" \n\t"
2481  SFENCE" \n\t"
2482  ::: "memory"
2483  );
2484 }
2485 
2486 #if !COMPILE_TEMPLATE_AMD3DNOW
2487 static void RENAME(yuyvtoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
2488  int width, int height,
2489  int lumStride, int chromStride, int srcStride)
2490 {
2491  int y;
2492  const int chromWidth= -((-width)>>1);
2493 
2494  for (y=0; y<height; y++) {
2495  RENAME(extract_even)(src, ydst, width);
2496  RENAME(extract_odd2)(src, udst, vdst, chromWidth);
2497 
2498  src += srcStride;
2499  ydst+= lumStride;
2500  udst+= chromStride;
2501  vdst+= chromStride;
2502  }
2503  __asm__(
2504  EMMS" \n\t"
2505  SFENCE" \n\t"
2506  ::: "memory"
2507  );
2508 }
2509 #endif /* !COMPILE_TEMPLATE_AMD3DNOW */
2510 
2511 static void RENAME(uyvytoyuv420)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
2512  int width, int height,
2513  int lumStride, int chromStride, int srcStride)
2514 {
2515  int y;
2516  const int chromWidth= -((-width)>>1);
2517 
2518  for (y=0; y<height; y++) {
2519  RENAME(extract_even)(src+1, ydst, width);
2520  if(y&1) {
2521  RENAME(extract_even2avg)(src-srcStride, src, udst, vdst, chromWidth);
2522  udst+= chromStride;
2523  vdst+= chromStride;
2524  }
2525 
2526  src += srcStride;
2527  ydst+= lumStride;
2528  }
2529  __asm__(
2530  EMMS" \n\t"
2531  SFENCE" \n\t"
2532  ::: "memory"
2533  );
2534 }
2535 
2536 #if !COMPILE_TEMPLATE_AMD3DNOW
2537 static void RENAME(uyvytoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
2538  int width, int height,
2539  int lumStride, int chromStride, int srcStride)
2540 {
2541  int y;
2542  const int chromWidth= -((-width)>>1);
2543 
2544  for (y=0; y<height; y++) {
2545  RENAME(extract_even)(src+1, ydst, width);
2546  RENAME(extract_even2)(src, udst, vdst, chromWidth);
2547 
2548  src += srcStride;
2549  ydst+= lumStride;
2550  udst+= chromStride;
2551  vdst+= chromStride;
2552  }
2553  __asm__(
2554  EMMS" \n\t"
2555  SFENCE" \n\t"
2556  ::: "memory"
2557  );
2558 }
2559 #endif /* !COMPILE_TEMPLATE_AMD3DNOW */
2560 #endif /* !COMPILE_TEMPLATE_SSE2 */
2561 
2562 static inline void RENAME(rgb2rgb_init)(void)
2563 {
2564 #if !COMPILE_TEMPLATE_SSE2
2565 #if !COMPILE_TEMPLATE_AMD3DNOW
2593 #endif /* !COMPILE_TEMPLATE_SSE2 */
2594 
2595 #if COMPILE_TEMPLATE_MMX2 || COMPILE_TEMPLATE_AMD3DNOW
2597 #endif /* COMPILE_TEMPLATE_MMX2 || COMPILE_TEMPLATE_AMD3DNOW */
2599 
2602 #endif /* COMPILE_TEMPLATE_SSE2 */
2603 
2604 #if !COMPILE_TEMPLATE_AMD3DNOW
2606 #endif /* !COMPILE_TEMPLATE_AMD3DNOW */
2607 }