/*
 * PREFETCH: string holding the prefetch mnemonic pasted into the asm
 * templates. The definition that follows the COMPILE_TEMPLATE_MMX2 check is
 * "prefetchnta"; the other visible definition is an assembler comment
 * (" # nop"), i.e. it emits no instruction.
 */
25 #if COMPILE_TEMPLATE_MMX2
26 #define PREFETCH "prefetchnta"
28 #define PREFETCH " # nop"
/*
 * REAL_MOVNTQ(a,b) stringizes both arguments into one instruction line:
 * "movntq a, b" in the definition that follows the COMPILE_TEMPLATE_MMX2
 * check, plain "movq a, b" in the other one. In the uses visible in this
 * file, a is an MMX register and b a memory operand.
 *
 * MOVNTQ(a,b) only forwards to REAL_MOVNTQ. The extra macro level lets
 * arguments that are themselves macros be expanded before they are
 * stringized.
 */
31 #if COMPILE_TEMPLATE_MMX2
32 #define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
34 #define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
36 #define MOVNTQ(a,b) REAL_MOVNTQ(a,b)
/*
 * YSCALEYUV2PACKEDX_UV: vertical chroma filtering, 16-bit (pmulhw) variant.
 *
 * Visible steps:
 *  - REG_a (the sample offset) is zeroed.
 *  - REG_d points at the filter table at CHR_MMX_FILTER_OFFSET(%0). Each
 *    16-byte entry is read as a source pointer at offset 0 and a coefficient
 *    quad at offset 8.
 *  - mm3 and mm4 both start from the constant at VROUNDER_OFFSET(%0).
 *  - Per entry, four words at (REG_S + REG_a) and four words at
 *    (REG_S + %6 + REG_a) are multiplied by the coefficient (high 16 bits
 *    kept) and added into mm3 and mm4 respectively.
 *  - The closing "test" sets the flags on the next entry's source pointer;
 *    the loop label and the branch using the flags are not among these lines.
 *
 * %0 and %6 are the &c->redDither and uv_off operands listed in
 * YSCALEYUV2PACKEDX_END. mm0, mm2 and mm5 are used as scratch.
 */
38 #define YSCALEYUV2PACKEDX_UV \
40 "xor %%"REG_a", %%"REG_a" \n\t"\
44 "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
45 "mov (%%"REG_d"), %%"REG_S" \n\t"\
46 "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
47 "movq %%mm3, %%mm4 \n\t"\
50 "movq 8(%%"REG_d"), %%mm0 \n\t" \
51 "movq (%%"REG_S", %%"REG_a"), %%mm2 \n\t" \
52 "add %6, %%"REG_S" \n\t" \
53 "movq (%%"REG_S", %%"REG_a"), %%mm5 \n\t" \
54 "add $16, %%"REG_d" \n\t"\
55 "mov (%%"REG_d"), %%"REG_S" \n\t"\
56 "pmulhw %%mm0, %%mm2 \n\t"\
57 "pmulhw %%mm0, %%mm5 \n\t"\
58 "paddw %%mm2, %%mm3 \n\t"\
59 "paddw %%mm5, %%mm4 \n\t"\
60 "test %%"REG_S", %%"REG_S" \n\t"\
/*
 * YSCALEYUV2PACKEDX_YA: vertical filtering of one plane, 16-bit (pmulhw)
 * variant, producing eight words per pass.
 *
 *   offset      string constant: displacement of the filter table from %0
 *   coeff       register receiving the coefficient quad of each entry
 *   src1, src2  scratch registers for the two sample quads
 *   dst1, dst2  accumulators; both start from VROUNDER_OFFSET(%0)
 *
 * The table is walked like in YSCALEYUV2PACKEDX_UV (16-byte entries: source
 * pointer at offset 0, coefficient at offset 8). Samples are read at
 * (REG_S + REG_a*2) and 8 bytes after that; REG_a is used as set by the
 * preceding code and is not modified here. The closing "test" only sets
 * flags on the next source pointer; the branch is not among these lines.
 */
63 #define YSCALEYUV2PACKEDX_YA(offset,coeff,src1,src2,dst1,dst2) \
64 "lea "offset"(%0), %%"REG_d" \n\t"\
65 "mov (%%"REG_d"), %%"REG_S" \n\t"\
66 "movq "VROUNDER_OFFSET"(%0), "#dst1" \n\t"\
67 "movq "#dst1", "#dst2" \n\t"\
70 "movq 8(%%"REG_d"), "#coeff" \n\t" \
71 "movq (%%"REG_S", %%"REG_a", 2), "#src1" \n\t" \
72 "movq 8(%%"REG_S", %%"REG_a", 2), "#src2" \n\t" \
73 "add $16, %%"REG_d" \n\t"\
74 "mov (%%"REG_d"), %%"REG_S" \n\t"\
75 "pmulhw "#coeff", "#src1" \n\t"\
76 "pmulhw "#coeff", "#src2" \n\t"\
77 "paddw "#src1", "#dst1" \n\t"\
78 "paddw "#src2", "#dst2" \n\t"\
79 "test %%"REG_S", %%"REG_S" \n\t"\
/*
 * YSCALEYUV2PACKEDX: the chroma pass (YSCALEYUV2PACKEDX_UV) followed by a
 * YSCALEYUV2PACKEDX_YA pass over the table at LUM_MMX_FILTER_OFFSET. The
 * second pass uses mm0 for the coefficient, mm2/mm5 as sample scratch and
 * leaves its two result quads in mm1 and mm7.
 */
82 #define YSCALEYUV2PACKEDX \
83 YSCALEYUV2PACKEDX_UV \
84 YSCALEYUV2PACKEDX_YA(LUM_MMX_FILTER_OFFSET,%%mm0,%%mm2,%%mm5,%%mm1,%%mm7) \
/*
 * YSCALEYUV2PACKEDX_END: operand and clobber lists of an asm statement.
 * The output list is empty; the inputs are
 *   %0 = &c->redDither (register)
 *   %1, %2, %3 = dummy (memory)
 *   %4 = dest (register)
 *   %5 = dstW_reg (memory)
 *   %6 = uv_off (memory)
 * REG_a, REG_d and REG_S are declared as clobbered.
 */
86 #define YSCALEYUV2PACKEDX_END \
87 :: "r" (&c->redDither), \
88 "m" (dummy), "m" (dummy), "m" (dummy),\
89 "r" (dest), "m" (dstW_reg), "m"(uv_off) \
90 : "%"REG_a, "%"REG_d, "%"REG_S \
/*
 * YSCALEYUV2PACKEDX_ACCURATE_UV: vertical chroma filtering with 32-bit
 * accumulation (pmaddwd).
 *
 * Visible steps:
 *  - REG_a is zeroed; REG_d walks the table at CHR_MMX_FILTER_OFFSET(%0) in
 *    steps of APCK_SIZE bytes. Each entry is read as one source pointer at
 *    offset 0, a second one at APCK_PTR2 and a coefficient quad at APCK_COEF.
 *  - Words from the two source lines are interleaved (punpcklwd/punpckhwd)
 *    and fed to pmaddwd with the coefficient quad, so every 32-bit lane is a
 *    partial sum over both lines.
 *  - mm4/mm5 accumulate the samples at the source pointers, mm6/mm7 the
 *    samples %6 bytes further on.
 *  - After the accumulation the sums are shifted right by 16, packed to
 *    signed words, the constant at VROUNDER_OFFSET(%0) is added and the two
 *    quads are stored to U_TEMP(%0) and V_TEMP(%0).
 *  - The "test" in the body sets flags on the next entry's first pointer;
 *    the loop label and branch are not among these lines.
 *
 * mm0..mm3 are used as scratch.
 */
93 #define YSCALEYUV2PACKEDX_ACCURATE_UV \
95 "xor %%"REG_a", %%"REG_a" \n\t"\
99 "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
100 "mov (%%"REG_d"), %%"REG_S" \n\t"\
101 "pxor %%mm4, %%mm4 \n\t"\
102 "pxor %%mm5, %%mm5 \n\t"\
103 "pxor %%mm6, %%mm6 \n\t"\
104 "pxor %%mm7, %%mm7 \n\t"\
107 "movq (%%"REG_S", %%"REG_a"), %%mm0 \n\t" \
108 "add %6, %%"REG_S" \n\t" \
109 "movq (%%"REG_S", %%"REG_a"), %%mm2 \n\t" \
110 "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
111 "movq (%%"REG_S", %%"REG_a"), %%mm1 \n\t" \
112 "movq %%mm0, %%mm3 \n\t"\
113 "punpcklwd %%mm1, %%mm0 \n\t"\
114 "punpckhwd %%mm1, %%mm3 \n\t"\
115 "movq "STR(APCK_COEF)"(%%"REG_d"),%%mm1 \n\t" \
116 "pmaddwd %%mm1, %%mm0 \n\t"\
117 "pmaddwd %%mm1, %%mm3 \n\t"\
118 "paddd %%mm0, %%mm4 \n\t"\
119 "paddd %%mm3, %%mm5 \n\t"\
120 "add %6, %%"REG_S" \n\t" \
121 "movq (%%"REG_S", %%"REG_a"), %%mm3 \n\t" \
122 "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
123 "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
124 "test %%"REG_S", %%"REG_S" \n\t"\
125 "movq %%mm2, %%mm0 \n\t"\
126 "punpcklwd %%mm3, %%mm2 \n\t"\
127 "punpckhwd %%mm3, %%mm0 \n\t"\
128 "pmaddwd %%mm1, %%mm2 \n\t"\
129 "pmaddwd %%mm1, %%mm0 \n\t"\
130 "paddd %%mm2, %%mm6 \n\t"\
131 "paddd %%mm0, %%mm7 \n\t"\
133 "psrad $16, %%mm4 \n\t"\
134 "psrad $16, %%mm5 \n\t"\
135 "psrad $16, %%mm6 \n\t"\
136 "psrad $16, %%mm7 \n\t"\
137 "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
138 "packssdw %%mm5, %%mm4 \n\t"\
139 "packssdw %%mm7, %%mm6 \n\t"\
140 "paddw %%mm0, %%mm4 \n\t"\
141 "paddw %%mm0, %%mm6 \n\t"\
142 "movq %%mm4, "U_TEMP"(%0) \n\t"\
143 "movq %%mm6, "V_TEMP"(%0) \n\t"\
/*
 * YSCALEYUV2PACKEDX_ACCURATE_YA(offset): vertical filtering of one plane with
 * 32-bit accumulation (pmaddwd), eight samples per pass.
 *
 *   offset  string constant: displacement of the filter table from %0
 *
 * The table is walked in APCK_SIZE steps like in
 * YSCALEYUV2PACKEDX_ACCURATE_UV (pointers at offset 0 and APCK_PTR2,
 * coefficient quad at APCK_COEF). Samples are read at (REG_S + REG_a*2) and
 * 8 bytes after that; REG_a itself is not modified here.
 *
 * mm1/mm5 accumulate the first four samples, mm7/mm6 the next four. After
 * the accumulation the sums are shifted right by 16, packed to signed words
 * and the constant at VROUNDER_OFFSET(%0) is added, leaving the results in
 * mm1 and mm7. The last two instructions reload mm3 and mm4 from U_TEMP(%0)
 * and V_TEMP(%0).
 *
 * The "test" in the body only sets flags on the next entry's first pointer;
 * the loop label and branch are not among these lines.
 */
145 #define YSCALEYUV2PACKEDX_ACCURATE_YA(offset) \
146 "lea "offset"(%0), %%"REG_d" \n\t"\
147 "mov (%%"REG_d"), %%"REG_S" \n\t"\
148 "pxor %%mm1, %%mm1 \n\t"\
149 "pxor %%mm5, %%mm5 \n\t"\
150 "pxor %%mm7, %%mm7 \n\t"\
151 "pxor %%mm6, %%mm6 \n\t"\
154 "movq (%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" \
155 "movq 8(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" \
156 "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
157 "movq (%%"REG_S", %%"REG_a", 2), %%mm4 \n\t" \
158 "movq %%mm0, %%mm3 \n\t"\
159 "punpcklwd %%mm4, %%mm0 \n\t"\
160 "punpckhwd %%mm4, %%mm3 \n\t"\
161 "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm4 \n\t" \
162 "pmaddwd %%mm4, %%mm0 \n\t"\
163 "pmaddwd %%mm4, %%mm3 \n\t"\
164 "paddd %%mm0, %%mm1 \n\t"\
165 "paddd %%mm3, %%mm5 \n\t"\
166 "movq 8(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" \
167 "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
168 "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
169 "test %%"REG_S", %%"REG_S" \n\t"\
170 "movq %%mm2, %%mm0 \n\t"\
171 "punpcklwd %%mm3, %%mm2 \n\t"\
172 "punpckhwd %%mm3, %%mm0 \n\t"\
173 "pmaddwd %%mm4, %%mm2 \n\t"\
174 "pmaddwd %%mm4, %%mm0 \n\t"\
175 "paddd %%mm2, %%mm7 \n\t"\
176 "paddd %%mm0, %%mm6 \n\t"\
178 "psrad $16, %%mm1 \n\t"\
179 "psrad $16, %%mm5 \n\t"\
180 "psrad $16, %%mm7 \n\t"\
181 "psrad $16, %%mm6 \n\t"\
182 "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
183 "packssdw %%mm5, %%mm1 \n\t"\
184 "packssdw %%mm6, %%mm7 \n\t"\
185 "paddw %%mm0, %%mm1 \n\t"\
186 "paddw %%mm0, %%mm7 \n\t"\
187 "movq "U_TEMP"(%0), %%mm3 \n\t"\
188 "movq "V_TEMP"(%0), %%mm4 \n\t"\
/*
 * YSCALEYUV2PACKEDX_ACCURATE: the 32-bit-accumulating chroma pass followed by
 * the matching pass over the table at LUM_MMX_FILTER_OFFSET. Per the two
 * sub-macros, the second pass leaves its results in mm1/mm7 and the chroma
 * results are reloaded into mm3/mm4 from U_TEMP/V_TEMP.
 */
190 #define YSCALEYUV2PACKEDX_ACCURATE \
191 YSCALEYUV2PACKEDX_ACCURATE_UV \
192 YSCALEYUV2PACKEDX_ACCURATE_YA(LUM_MMX_FILTER_OFFSET)
/*
 * YSCALEYUV2RGBX: colour conversion of already filtered samples.
 *
 * Inputs: mm3 and mm4 hold four chroma words each (U_OFFSET(%0) is
 * subtracted from mm3, V_OFFSET(%0) from mm4); mm1 and mm7 hold eight luma
 * words.
 *
 * Visible steps:
 *  - mm3/mm4 are multiplied by UG_COEFF/VG_COEFF and later summed into mm4;
 *    copies in mm2/mm5 are multiplied by UB_COEFF/VR_COEFF.
 *  - Y_OFFSET is subtracted from both luma quads, which are then multiplied
 *    by Y_COEFF.
 *  - Each chroma term is unpacked against itself (punpcklwd/punpckhwd), so
 *    one chroma word is applied to two adjacent luma words, and the luma is
 *    added.
 *  - packuswb saturates to unsigned bytes, leaving eight bytes each in mm2,
 *    mm4 and mm5. The WRITEBGR32 calls in this file pass these as b, g, r.
 *
 * All multiplies are pmulhw (high 16 bits of the product). mm0, mm3 and mm6
 * are used as scratch; mm1 and mm7 keep the scaled luma.
 */
194 #define YSCALEYUV2RGBX \
195 "psubw "U_OFFSET"(%0), %%mm3 \n\t" \
196 "psubw "V_OFFSET"(%0), %%mm4 \n\t" \
197 "movq %%mm3, %%mm2 \n\t" \
198 "movq %%mm4, %%mm5 \n\t" \
199 "pmulhw "UG_COEFF"(%0), %%mm3 \n\t"\
200 "pmulhw "VG_COEFF"(%0), %%mm4 \n\t"\
202 "pmulhw "UB_COEFF"(%0), %%mm2 \n\t"\
203 "pmulhw "VR_COEFF"(%0), %%mm5 \n\t"\
204 "psubw "Y_OFFSET"(%0), %%mm1 \n\t" \
205 "psubw "Y_OFFSET"(%0), %%mm7 \n\t" \
206 "pmulhw "Y_COEFF"(%0), %%mm1 \n\t"\
207 "pmulhw "Y_COEFF"(%0), %%mm7 \n\t"\
209 "paddw %%mm3, %%mm4 \n\t"\
210 "movq %%mm2, %%mm0 \n\t"\
211 "movq %%mm5, %%mm6 \n\t"\
212 "movq %%mm4, %%mm3 \n\t"\
213 "punpcklwd %%mm2, %%mm2 \n\t"\
214 "punpcklwd %%mm5, %%mm5 \n\t"\
215 "punpcklwd %%mm4, %%mm4 \n\t"\
216 "paddw %%mm1, %%mm2 \n\t"\
217 "paddw %%mm1, %%mm5 \n\t"\
218 "paddw %%mm1, %%mm4 \n\t"\
219 "punpckhwd %%mm0, %%mm0 \n\t"\
220 "punpckhwd %%mm6, %%mm6 \n\t"\
221 "punpckhwd %%mm3, %%mm3 \n\t"\
222 "paddw %%mm7, %%mm0 \n\t"\
223 "paddw %%mm7, %%mm6 \n\t"\
224 "paddw %%mm7, %%mm3 \n\t"\
226 "packuswb %%mm0, %%mm2 \n\t"\
227 "packuswb %%mm6, %%mm5 \n\t"\
228 "packuswb %%mm3, %%mm4 \n\t"\
/*
 * REAL_WRITEBGR32 / WRITEBGR32: interleave four 8-byte channel registers into
 * eight 32-bit pixels and store them.
 *
 *   dst, index     base and pixel index; stores go to dst + index*4
 *   dstw           operand compared against index at the end
 *   b, g, r, a     registers holding eight bytes per channel
 *   q0, q2, q3, t  scratch registers
 *
 * The unpack sequence yields the byte order b, g, r, a in memory. Pixels 0-1
 * are written from q0, 2-3 from b, 4-5 from q2 and 6-7 from q3, so b and r
 * are overwritten as well. index is then advanced by 8; the final cmp only
 * sets flags, the branch using them is not among these lines.
 *
 * WRITEBGR32 forwards to REAL_WRITEBGR32 so that arguments which are
 * themselves macros get expanded before being stringized.
 */
230 #define REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) \
231 "movq "#b", "#q2" \n\t" \
232 "movq "#r", "#t" \n\t" \
233 "punpcklbw "#g", "#b" \n\t" \
234 "punpcklbw "#a", "#r" \n\t" \
235 "punpckhbw "#g", "#q2" \n\t" \
236 "punpckhbw "#a", "#t" \n\t" \
237 "movq "#b", "#q0" \n\t" \
238 "movq "#q2", "#q3" \n\t" \
239 "punpcklwd "#r", "#q0" \n\t" \
240 "punpckhwd "#r", "#b" \n\t" \
241 "punpcklwd "#t", "#q2" \n\t" \
242 "punpckhwd "#t", "#q3" \n\t" \
244 MOVNTQ( q0, (dst, index, 4))\
245 MOVNTQ( b, 8(dst, index, 4))\
246 MOVNTQ( q2, 16(dst, index, 4))\
247 MOVNTQ( q3, 24(dst, index, 4))\
249 "add $8, "#index" \n\t"\
250 "cmp "#dstw", "#index" \n\t"\
252 #define WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t)
255 const int16_t **lumSrc,
int lumFilterSize,
256 const int16_t *chrFilter,
const int16_t **chrUSrc,
257 const int16_t **chrVSrc,
258 int chrFilterSize,
const int16_t **alpSrc,
263 x86_reg uv_off = c->uv_off_byte;
268 "movq %%mm2, "U_TEMP"(%0) \n\t"
269 "movq %%mm4, "V_TEMP"(%0) \n\t"
270 "movq %%mm5, "Y_TEMP"(%0) \n\t"
272 "movq "Y_TEMP"(%0), %%mm5 \n\t"
273 "psraw $3, %%mm1 \n\t"
274 "psraw $3, %%mm7 \n\t"
275 "packuswb %%mm7, %%mm1 \n\t"
276 WRITEBGR32(%4, %5, %%REGa, %%mm3, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm2, %%mm6)
281 "pcmpeqd %%mm7, %%mm7 \n\t"
282 WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
288 const int16_t **lumSrc,
int lumFilterSize,
289 const int16_t *chrFilter,
const int16_t **chrUSrc,
290 const int16_t **chrVSrc,
291 int chrFilterSize,
const int16_t **alpSrc,
296 x86_reg uv_off = c->uv_off_byte;
302 "psraw $3, %%mm1 \n\t"
303 "psraw $3, %%mm7 \n\t"
304 "packuswb %%mm7, %%mm1 \n\t"
305 WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
310 "pcmpeqd %%mm7, %%mm7 \n\t"
311 WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
/*
 * REAL_WRITERGB16 / WRITERGB16: pack eight pixels to 16 bits each and store
 * them as two quads at dst + index*2.
 *
 * Fixed register use: mm2, mm4 and mm5 are the channel registers (the same
 * ones passed as b, g, r to WRITEBGR32 elsewhere in this file). mm7 supplies
 * the high byte when mm4 is widened to words; the surrounding code visible
 * in this file clears it with pxor. mm1 and mm3 are scratch.
 *
 * mm2 and mm5 are masked with bF8 and mm4 with bFC (both defined elsewhere;
 * presumably 0xF8 / 0xFC in every byte -- TODO confirm). mm2 is shifted
 * right by 3 and byte-interleaved with mm5 to form words; mm4 is widened,
 * shifted left by 3 and ORed in. With the presumed masks that is a 5-6-5
 * layout with mm2 in the low bits and mm5 in the high bits.
 *
 * index is advanced by 8; the final cmp only sets flags, the branch using
 * them is not among these lines.
 */
316 #define REAL_WRITERGB16(dst, dstw, index) \
317 "pand "MANGLE(bF8)", %%mm2 \n\t" \
318 "pand "MANGLE(bFC)", %%mm4 \n\t" \
319 "pand "MANGLE(bF8)", %%mm5 \n\t" \
320 "psrlq $3, %%mm2 \n\t"\
322 "movq %%mm2, %%mm1 \n\t"\
323 "movq %%mm4, %%mm3 \n\t"\
325 "punpcklbw %%mm7, %%mm3 \n\t"\
326 "punpcklbw %%mm5, %%mm2 \n\t"\
327 "punpckhbw %%mm7, %%mm4 \n\t"\
328 "punpckhbw %%mm5, %%mm1 \n\t"\
330 "psllq $3, %%mm3 \n\t"\
331 "psllq $3, %%mm4 \n\t"\
333 "por %%mm3, %%mm2 \n\t"\
334 "por %%mm4, %%mm1 \n\t"\
336 MOVNTQ(%%mm2, (dst, index, 2))\
337 MOVNTQ(%%mm1, 8(dst, index, 2))\
339 "add $8, "#index" \n\t"\
340 "cmp "#dstw", "#index" \n\t"\
342 #define WRITERGB16(dst, dstw, index) REAL_WRITERGB16(dst, dstw, index)
345 const int16_t **lumSrc,
int lumFilterSize,
346 const int16_t *chrFilter,
const int16_t **chrUSrc,
347 const int16_t **chrVSrc,
348 int chrFilterSize,
const int16_t **alpSrc,
353 x86_reg uv_off = c->uv_off_byte;
357 "pxor %%mm7, %%mm7 \n\t"
369 const int16_t **lumSrc,
int lumFilterSize,
370 const int16_t *chrFilter,
const int16_t **chrUSrc,
371 const int16_t **chrVSrc,
372 int chrFilterSize,
const int16_t **alpSrc,
377 x86_reg uv_off = c->uv_off_byte;
381 "pxor %%mm7, %%mm7 \n\t"
/*
 * REAL_WRITERGB15 / WRITERGB15: pack eight pixels to 16 bits each and store
 * them as two quads at dst + index*2.
 *
 * Same structure as REAL_WRITERGB16 with different masks and shifts: mm2, mm4
 * and mm5 are all masked with bF8 (defined elsewhere; presumably 0xF8 in
 * every byte -- TODO confirm), mm2 is shifted right by 3 and mm5 right by 1
 * before the byte interleave, and the widened mm4 is shifted left by 2
 * before being ORed in. With the presumed mask that is a 5-5-5 layout with
 * mm2 in the low bits and the top bit of every word clear.
 *
 * mm7 supplies the high byte when mm4 is widened to words; the surrounding
 * code visible in this file clears it with pxor. mm1 and mm3 are scratch.
 * index is advanced by 8; the final cmp only sets flags.
 */
392 #define REAL_WRITERGB15(dst, dstw, index) \
393 "pand "MANGLE(bF8)", %%mm2 \n\t" \
394 "pand "MANGLE(bF8)", %%mm4 \n\t" \
395 "pand "MANGLE(bF8)", %%mm5 \n\t" \
396 "psrlq $3, %%mm2 \n\t"\
397 "psrlq $1, %%mm5 \n\t"\
399 "movq %%mm2, %%mm1 \n\t"\
400 "movq %%mm4, %%mm3 \n\t"\
402 "punpcklbw %%mm7, %%mm3 \n\t"\
403 "punpcklbw %%mm5, %%mm2 \n\t"\
404 "punpckhbw %%mm7, %%mm4 \n\t"\
405 "punpckhbw %%mm5, %%mm1 \n\t"\
407 "psllq $2, %%mm3 \n\t"\
408 "psllq $2, %%mm4 \n\t"\
410 "por %%mm3, %%mm2 \n\t"\
411 "por %%mm4, %%mm1 \n\t"\
413 MOVNTQ(%%mm2, (dst, index, 2))\
414 MOVNTQ(%%mm1, 8(dst, index, 2))\
416 "add $8, "#index" \n\t"\
417 "cmp "#dstw", "#index" \n\t"\
419 #define WRITERGB15(dst, dstw, index) REAL_WRITERGB15(dst, dstw, index)
422 const int16_t **lumSrc,
int lumFilterSize,
423 const int16_t *chrFilter,
const int16_t **chrUSrc,
424 const int16_t **chrVSrc,
425 int chrFilterSize,
const int16_t **alpSrc,
430 x86_reg uv_off = c->uv_off_byte;
434 "pxor %%mm7, %%mm7 \n\t"
446 const int16_t **lumSrc,
int lumFilterSize,
447 const int16_t *chrFilter,
const int16_t **chrUSrc,
448 const int16_t **chrVSrc,
449 int chrFilterSize,
const int16_t **alpSrc,
454 x86_reg uv_off = c->uv_off_byte;
458 "pxor %%mm7, %%mm7 \n\t"
/*
 * WRITEBGR24MMX: pack eight pixels into 24 contiguous bytes (three quads at
 * dst, dst+8 and dst+16) using plain MMX shifts and unpacks.
 *
 * Fixed register use: mm2, mm4, mm5 are the channel registers (passed as
 * b, g, r to WRITEBGR32 elsewhere in this file); mm7 provides the fourth
 * byte of each intermediate 32-bit pixel. The shift/OR merging below only
 * yields clean output if that byte is zero; the surrounding code visible in
 * this file clears mm7 with pxor.
 *
 * Steps:
 *  - byte/word unpacks build four registers of two 4-byte pixels each
 *    (mm0, mm2, mm1, mm3 = pixels 0-1, 2-3, 4-5, 6-7);
 *  - psllq $40 + punpckhdq squeezes the pad byte out from between the two
 *    pixels of every register, leaving 6 payload bytes in bytes 1..6;
 *  - the three output quads are assembled with shifts and ORs and stored
 *    through MOVNTQ.
 *
 * dst is advanced by 24 and index by 8; the final cmp only sets flags, the
 * branch using them is not among these lines. All of mm0..mm7 are
 * overwritten.
 */
469 #define WRITEBGR24MMX(dst, dstw, index) \
471 "movq %%mm2, %%mm1 \n\t" \
472 "movq %%mm5, %%mm6 \n\t" \
473 "punpcklbw %%mm4, %%mm2 \n\t" \
474 "punpcklbw %%mm7, %%mm5 \n\t" \
475 "punpckhbw %%mm4, %%mm1 \n\t" \
476 "punpckhbw %%mm7, %%mm6 \n\t" \
477 "movq %%mm2, %%mm0 \n\t" \
478 "movq %%mm1, %%mm3 \n\t" \
479 "punpcklwd %%mm5, %%mm0 \n\t" \
480 "punpckhwd %%mm5, %%mm2 \n\t" \
481 "punpcklwd %%mm6, %%mm1 \n\t" \
482 "punpckhwd %%mm6, %%mm3 \n\t" \
484 "movq %%mm0, %%mm4 \n\t" \
485 "movq %%mm2, %%mm6 \n\t" \
486 "movq %%mm1, %%mm5 \n\t" \
487 "movq %%mm3, %%mm7 \n\t" \
489 "psllq $40, %%mm0 \n\t" \
490 "psllq $40, %%mm2 \n\t" \
491 "psllq $40, %%mm1 \n\t" \
492 "psllq $40, %%mm3 \n\t" \
494 "punpckhdq %%mm4, %%mm0 \n\t" \
495 "punpckhdq %%mm6, %%mm2 \n\t" \
496 "punpckhdq %%mm5, %%mm1 \n\t" \
497 "punpckhdq %%mm7, %%mm3 \n\t" \
499 "psrlq $8, %%mm0 \n\t" \
500 "movq %%mm2, %%mm6 \n\t" \
501 "psllq $40, %%mm2 \n\t" \
502 "por %%mm2, %%mm0 \n\t" \
503 MOVNTQ(%%mm0, (dst))\
505 "psrlq $24, %%mm6 \n\t" \
506 "movq %%mm1, %%mm5 \n\t" \
507 "psllq $24, %%mm1 \n\t" \
508 "por %%mm1, %%mm6 \n\t" \
509 MOVNTQ(%%mm6, 8(dst))\
511 "psrlq $40, %%mm5 \n\t" \
512 "psllq $8, %%mm3 \n\t" \
513 "por %%mm3, %%mm5 \n\t" \
514 MOVNTQ(%%mm5, 16(dst))\
516 "add $24, "#dst" \n\t"\
518 "add $8, "#index" \n\t"\
519 "cmp "#dstw", "#index" \n\t"\
/*
 * WRITEBGR24MMX2: pshufw-based variant of the 24-byte store (three quads at
 * dst, dst+8 and dst+16).
 *
 * Fixed register use: mm2, mm4, mm5 are the channel registers (passed as
 * b, g, r to WRITEBGR32 elsewhere in this file). mm0 and mm7 are loaded with
 * the masks ff_M24A and ff_M24C; ff_M24B is used straight from memory. The
 * mask values are defined elsewhere.
 *
 * For each output quad the words needed from every channel are replicated
 * with pshufw, masked so that only the bytes belonging to that channel
 * survive, and ORed together in mm6 before the MOVNTQ store. mm4 is shifted
 * right by 8 after the first quad, so it does not keep its input value.
 * mm1 and mm3 are scratch.
 *
 * dst is advanced by 24 and index by 8; the final cmp only sets flags, the
 * branch using them is not among these lines.
 */
522 #define WRITEBGR24MMX2(dst, dstw, index) \
524 "movq "MANGLE(ff_M24A)", %%mm0 \n\t"\
525 "movq "MANGLE(ff_M24C)", %%mm7 \n\t"\
526 "pshufw $0x50, %%mm2, %%mm1 \n\t" \
527 "pshufw $0x50, %%mm4, %%mm3 \n\t" \
528 "pshufw $0x00, %%mm5, %%mm6 \n\t" \
530 "pand %%mm0, %%mm1 \n\t" \
531 "pand %%mm0, %%mm3 \n\t" \
532 "pand %%mm7, %%mm6 \n\t" \
534 "psllq $8, %%mm3 \n\t" \
535 "por %%mm1, %%mm6 \n\t"\
536 "por %%mm3, %%mm6 \n\t"\
537 MOVNTQ(%%mm6, (dst))\
539 "psrlq $8, %%mm4 \n\t" \
540 "pshufw $0xA5, %%mm2, %%mm1 \n\t" \
541 "pshufw $0x55, %%mm4, %%mm3 \n\t" \
542 "pshufw $0xA5, %%mm5, %%mm6 \n\t" \
544 "pand "MANGLE(ff_M24B)", %%mm1 \n\t" \
545 "pand %%mm7, %%mm3 \n\t" \
546 "pand %%mm0, %%mm6 \n\t" \
548 "por %%mm1, %%mm3 \n\t" \
549 "por %%mm3, %%mm6 \n\t"\
550 MOVNTQ(%%mm6, 8(dst))\
552 "pshufw $0xFF, %%mm2, %%mm1 \n\t" \
553 "pshufw $0xFA, %%mm4, %%mm3 \n\t" \
554 "pshufw $0xFA, %%mm5, %%mm6 \n\t" \
556 "pand %%mm7, %%mm1 \n\t" \
557 "pand %%mm0, %%mm3 \n\t" \
558 "pand "MANGLE(ff_M24B)", %%mm6 \n\t" \
560 "por %%mm1, %%mm3 \n\t"\
561 "por %%mm3, %%mm6 \n\t"\
562 MOVNTQ(%%mm6, 16(dst))\
564 "add $24, "#dst" \n\t"\
566 "add $8, "#index" \n\t"\
567 "cmp "#dstw", "#index" \n\t"\
/*
 * WRITEBGR24: selects the 24-bit store implementation. The definition that
 * follows the COMPILE_TEMPLATE_MMX2 check maps to WRITEBGR24MMX2 (pshufw
 * based); the other one maps to WRITEBGR24MMX.
 */
570 #if COMPILE_TEMPLATE_MMX2
572 #define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX2(dst, dstw, index)
575 #define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX(dst, dstw, index)
579 const int16_t **lumSrc,
int lumFilterSize,
580 const int16_t *chrFilter,
const int16_t **chrUSrc,
581 const int16_t **chrVSrc,
582 int chrFilterSize,
const int16_t **alpSrc,
587 x86_reg uv_off = c->uv_off_byte;
591 "pxor %%mm7, %%mm7 \n\t"
592 "lea (%%"REG_a
", %%"REG_a
", 2), %%"REG_c
"\n\t"
593 "add %4, %%"REG_c
" \n\t"
595 ::
"r" (&c->redDither),
596 "m" (dummy),
"m" (
dummy),
"m" (dummy),
597 "r" (dest),
"m" (dstW_reg),
"m"(uv_off)
598 :
"%"REG_a,
"%"REG_c,
"%"REG_d,
"%"REG_S
603 const int16_t **lumSrc,
int lumFilterSize,
604 const int16_t *chrFilter,
const int16_t **chrUSrc,
605 const int16_t **chrVSrc,
606 int chrFilterSize,
const int16_t **alpSrc,
611 x86_reg uv_off = c->uv_off_byte;
615 "pxor %%mm7, %%mm7 \n\t"
616 "lea (%%"REG_a
", %%"REG_a
", 2), %%"REG_c
" \n\t"
617 "add %4, %%"REG_c
" \n\t"
619 ::
"r" (&c->redDither),
620 "m" (dummy),
"m" (
dummy),
"m" (dummy),
621 "r" (dest),
"m" (dstW_reg),
"m"(uv_off)
622 :
"%"REG_a,
"%"REG_c,
"%"REG_d,
"%"REG_S
/*
 * REAL_WRITEYUY2 / WRITEYUY2: pack eight luma and four+four chroma words to
 * bytes and store them interleaved as two quads at dst + index*2.
 *
 * Inputs: mm1/mm7 hold the eight luma words, mm3 and mm4 four chroma words
 * each (mm3 is the register the colour macros in this file treat as U, mm4
 * as V). After saturating to bytes, mm3 and mm4 are interleaved byte by
 * byte and the result is interleaved with the luma bytes, which gives the
 * memory order luma, mm3-byte, luma, mm4-byte, ...
 *
 * index is advanced by 8; the final cmp only sets flags, the branch using
 * them is not among these lines. WRITEYUY2 forwards to REAL_WRITEYUY2 so
 * that macro arguments are expanded before being stringized.
 */
626 #define REAL_WRITEYUY2(dst, dstw, index) \
627 "packuswb %%mm3, %%mm3 \n\t"\
628 "packuswb %%mm4, %%mm4 \n\t"\
629 "packuswb %%mm7, %%mm1 \n\t"\
630 "punpcklbw %%mm4, %%mm3 \n\t"\
631 "movq %%mm1, %%mm7 \n\t"\
632 "punpcklbw %%mm3, %%mm1 \n\t"\
633 "punpckhbw %%mm3, %%mm7 \n\t"\
635 MOVNTQ(%%mm1, (dst, index, 2))\
636 MOVNTQ(%%mm7, 8(dst, index, 2))\
638 "add $8, "#index" \n\t"\
639 "cmp "#dstw", "#index" \n\t"\
641 #define WRITEYUY2(dst, dstw, index) REAL_WRITEYUY2(dst, dstw, index)
644 const int16_t **lumSrc,
int lumFilterSize,
645 const int16_t *chrFilter,
const int16_t **chrUSrc,
646 const int16_t **chrVSrc,
647 int chrFilterSize,
const int16_t **alpSrc,
652 x86_reg uv_off = c->uv_off_byte;
656 "psraw $3, %%mm3 \n\t"
657 "psraw $3, %%mm4 \n\t"
658 "psraw $3, %%mm1 \n\t"
659 "psraw $3, %%mm7 \n\t"
665 const int16_t **lumSrc,
int lumFilterSize,
666 const int16_t *chrFilter,
const int16_t **chrUSrc,
667 const int16_t **chrVSrc,
668 int chrFilterSize,
const int16_t **alpSrc,
673 x86_reg uv_off = c->uv_off_byte;
677 "psraw $3, %%mm3 \n\t"
678 "psraw $3, %%mm4 \n\t"
679 "psraw $3, %%mm1 \n\t"
680 "psraw $3, %%mm7 \n\t"
/*
 * REAL_YSCALEYUV2RGB_UV(index, c): blend the chroma of two lines and start
 * the colour conversion.
 *
 *   index  register used as byte offset into the chroma buffers; zeroed here
 *   c      register holding the base address for the *_OFFSET / *_COEFF
 *          displacements
 *
 * Operands %2 and %3 are the two chroma line buffers (ubuf0 / ubuf1 in the
 * operand lists visible in this file). For the samples at index and for
 * those UV_OFF_BYTE(c) bytes further on (index is advanced and then
 * restored), the macro computes
 *     (line3 >> 4) + high16((line2 - line3) * coeff)
 * with coeff read from CHR_MMX_FILTER_OFFSET+8(c). The first result lands in
 * mm3, the second in mm4. U_OFFSET / V_OFFSET are then subtracted, copies
 * are kept in mm2 / mm5, and mm3 / mm4 are multiplied by UG_COEFF / VG_COEFF.
 * mm0 is used as scratch.
 */
685 #define REAL_YSCALEYUV2RGB_UV(index, c) \
686 "xor "#index", "#index" \n\t"\
689 "movq (%2, "#index"), %%mm2 \n\t" \
690 "movq (%3, "#index"), %%mm3 \n\t" \
691 "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \
692 "movq (%2, "#index"), %%mm5 \n\t" \
693 "movq (%3, "#index"), %%mm4 \n\t" \
694 "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \
695 "psubw %%mm3, %%mm2 \n\t" \
696 "psubw %%mm4, %%mm5 \n\t" \
697 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
698 "pmulhw %%mm0, %%mm2 \n\t" \
699 "pmulhw %%mm0, %%mm5 \n\t" \
700 "psraw $4, %%mm3 \n\t" \
701 "psraw $4, %%mm4 \n\t" \
702 "paddw %%mm2, %%mm3 \n\t" \
703 "paddw %%mm5, %%mm4 \n\t" \
704 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" \
705 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" \
706 "movq %%mm3, %%mm2 \n\t" \
707 "movq %%mm4, %%mm5 \n\t" \
708 "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
709 "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
/*
 * REAL_YSCALEYUV2RGB_YA(index, c, b1, b2): blend eight 16-bit samples of two
 * line buffers.
 *
 *   index   register used as sample offset (scaled by 2 in the addressing)
 *   c       register holding the base for LUM_MMX_FILTER_OFFSET
 *   b1, b2  base operands of the two line buffers
 *
 * Computes (b2 >> 4) + high16((b1 - b2) * coeff) with coeff read from
 * LUM_MMX_FILTER_OFFSET+8(c). The first four results land in mm1, the next
 * four in mm7; mm0 and mm6 are scratch.
 */
712 #define REAL_YSCALEYUV2RGB_YA(index, c, b1, b2) \
713 "movq ("#b1", "#index", 2), %%mm0 \n\t" \
714 "movq ("#b2", "#index", 2), %%mm1 \n\t" \
715 "movq 8("#b1", "#index", 2), %%mm6 \n\t" \
716 "movq 8("#b2", "#index", 2), %%mm7 \n\t" \
717 "psubw %%mm1, %%mm0 \n\t" \
718 "psubw %%mm7, %%mm6 \n\t" \
719 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" \
720 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" \
721 "psraw $4, %%mm1 \n\t" \
722 "psraw $4, %%mm7 \n\t" \
723 "paddw %%mm0, %%mm1 \n\t" \
724 "paddw %%mm6, %%mm7 \n\t" \
/*
 * REAL_YSCALEYUV2RGB_COEFF(c): finish the colour conversion started by
 * REAL_YSCALEYUV2RGB_UV.
 *
 *   c  register holding the base for the *_COEFF / Y_OFFSET displacements
 *
 * Inputs: mm2 / mm5 hold the offset-corrected chroma copies, mm3 / mm4 the
 * chroma already multiplied by UG_COEFF / VG_COEFF, mm1 / mm7 eight luma
 * words.
 *
 * mm2 and mm5 are multiplied by UB_COEFF and VR_COEFF, Y_OFFSET is
 * subtracted from the luma which is then multiplied by Y_COEFF, and mm3 is
 * added into mm4. Each chroma term is unpacked against itself so that one
 * chroma word is applied to two adjacent luma words, the luma is added, and
 * packuswb saturates to unsigned bytes. Results: eight bytes each in mm2,
 * mm4 and mm5 (passed as b, g, r to WRITEBGR32 in this file). mm0, mm3 and
 * mm6 are scratch.
 */
726 #define REAL_YSCALEYUV2RGB_COEFF(c) \
727 "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
728 "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
729 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" \
730 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" \
731 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
732 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
734 "paddw %%mm3, %%mm4 \n\t"\
735 "movq %%mm2, %%mm0 \n\t"\
736 "movq %%mm5, %%mm6 \n\t"\
737 "movq %%mm4, %%mm3 \n\t"\
738 "punpcklwd %%mm2, %%mm2 \n\t"\
739 "punpcklwd %%mm5, %%mm5 \n\t"\
740 "punpcklwd %%mm4, %%mm4 \n\t"\
741 "paddw %%mm1, %%mm2 \n\t"\
742 "paddw %%mm1, %%mm5 \n\t"\
743 "paddw %%mm1, %%mm4 \n\t"\
744 "punpckhwd %%mm0, %%mm0 \n\t"\
745 "punpckhwd %%mm6, %%mm6 \n\t"\
746 "punpckhwd %%mm3, %%mm3 \n\t"\
747 "paddw %%mm7, %%mm0 \n\t"\
748 "paddw %%mm7, %%mm6 \n\t"\
749 "paddw %%mm7, %%mm3 \n\t"\
751 "packuswb %%mm0, %%mm2 \n\t"\
752 "packuswb %%mm6, %%mm5 \n\t"\
753 "packuswb %%mm3, %%mm4 \n\t"\
/*
 * YSCALEYUV2RGB_YA forwards to REAL_YSCALEYUV2RGB_YA so that macro arguments
 * are expanded before being stringized.
 *
 * YSCALEYUV2RGB(index, c) chains the three stages: chroma blend
 * (REAL_YSCALEYUV2RGB_UV), blend of the line buffers in operands %0 and %1
 * (REAL_YSCALEYUV2RGB_YA) and the final colour math
 * (REAL_YSCALEYUV2RGB_COEFF).
 */
755 #define YSCALEYUV2RGB_YA(index, c, b1, b2) REAL_YSCALEYUV2RGB_YA(index, c, b1, b2)
757 #define YSCALEYUV2RGB(index, c) \
758 REAL_YSCALEYUV2RGB_UV(index, c) \
759 REAL_YSCALEYUV2RGB_YA(index, c, %0, %1) \
760 REAL_YSCALEYUV2RGB_COEFF(c)
766 const int16_t *ubuf[2],
const int16_t *vbuf[2],
767 const int16_t *abuf[2], uint8_t *dest,
768 int dstW,
int yalpha,
int uvalpha,
int y)
770 const int16_t *buf0 = buf[0], *buf1 = buf[1],
771 *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
774 const int16_t *abuf0 = abuf[0], *abuf1 = abuf[1];
779 "psraw $3, %%mm1 \n\t"
780 "psraw $3, %%mm7 \n\t"
781 "packuswb %%mm7, %%mm1 \n\t"
782 WRITEBGR32(%4, 8280(%5), %%r8, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
783 ::
"c" (buf0),
"d" (buf1),
"S" (ubuf0),
"D" (ubuf1),
"r" (dest),
785 "r" (abuf0),
"r" (abuf1)
789 *(
const uint16_t **)(&c->u_temp)=abuf0;
790 *(
const uint16_t **)(&c->v_temp)=abuf1;
793 "mov %4, %%"REG_b
" \n\t"
794 "push %%"REG_BP
" \n\t"
798 "mov "U_TEMP"(%5), %0 \n\t"
799 "mov "V_TEMP"(%5), %1 \n\t"
801 "psraw $3, %%mm1 \n\t"
802 "psraw $3, %%mm7 \n\t"
803 "packuswb %%mm7, %%mm1 \n\t"
806 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
807 "pop %%"REG_BP
" \n\t"
809 ::
"c" (buf0),
"d" (buf1),
"S" (ubuf0),
"D" (ubuf1),
"m" (dest),
816 "mov %4, %%"REG_b
" \n\t"
817 "push %%"REG_BP
" \n\t"
819 "pcmpeqd %%mm7, %%mm7 \n\t"
820 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
821 "pop %%"REG_BP
" \n\t"
823 ::
"c" (buf0),
"d" (buf1),
"S" (ubuf0),
"D" (ubuf1),
"m" (dest),
830 const int16_t *ubuf[2],
const int16_t *vbuf[2],
831 const int16_t *abuf[2], uint8_t *dest,
832 int dstW,
int yalpha,
int uvalpha,
int y)
834 const int16_t *buf0 = buf[0], *buf1 = buf[1],
835 *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
840 "mov %4, %%"REG_b
" \n\t"
841 "push %%"REG_BP
" \n\t"
843 "pxor %%mm7, %%mm7 \n\t"
845 "pop %%"REG_BP
" \n\t"
847 ::
"c" (buf0),
"d" (buf1),
"S" (ubuf0),
"D" (ubuf1),
"m" (dest),
853 const int16_t *ubuf[2],
const int16_t *vbuf[2],
854 const int16_t *abuf[2], uint8_t *dest,
855 int dstW,
int yalpha,
int uvalpha,
int y)
857 const int16_t *buf0 = buf[0], *buf1 = buf[1],
858 *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
863 "mov %4, %%"REG_b
" \n\t"
864 "push %%"REG_BP
" \n\t"
866 "pxor %%mm7, %%mm7 \n\t"
874 "pop %%"REG_BP
" \n\t"
876 ::
"c" (buf0),
"d" (buf1),
"S" (ubuf0),
"D" (ubuf1),
"m" (dest),
882 const int16_t *ubuf[2],
const int16_t *vbuf[2],
883 const int16_t *abuf[2], uint8_t *dest,
884 int dstW,
int yalpha,
int uvalpha,
int y)
886 const int16_t *buf0 = buf[0], *buf1 = buf[1],
887 *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
892 "mov %4, %%"REG_b
" \n\t"
893 "push %%"REG_BP
" \n\t"
895 "pxor %%mm7, %%mm7 \n\t"
903 "pop %%"REG_BP
" \n\t"
905 ::
"c" (buf0),
"d" (buf1),
"S" (ubuf0),
"D" (ubuf1),
"m" (dest),
/*
 * REAL_YSCALEYUV2PACKED / YSCALEYUV2PACKED(index, c): blend two lines of
 * luma and chroma without colour conversion.
 *
 *   index  register used as sample offset; zeroed here
 *   c      register holding the base for the filter-offset displacements
 *
 * NOTE: the first six instructions load the coefficient quads at
 * CHR_MMX_FILTER_OFFSET+8(c) and LUM_MMX_FILTER_OFFSET+8(c), shift them
 * right by 3 and store them back, so the memory addressed through c is
 * modified every time this sequence runs.
 *
 * Chroma (operands %2 / %3, at index and UV_OFF_BYTE(c) bytes further on):
 *     (line3 >> 7) + high16((line2 - line3) * chroma coeff)  -> mm3, mm4
 * Luma (operands %0 / %1, eight words at index*2):
 *     (line1 >> 7) + high16((line0 - line1) * luma coeff)    -> mm1, mm7
 * mm0, mm2, mm5 and mm6 are scratch.
 *
 * YSCALEYUV2PACKED forwards to REAL_YSCALEYUV2PACKED so that macro arguments
 * are expanded before being stringized.
 */
910 #define REAL_YSCALEYUV2PACKED(index, c) \
911 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
912 "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1 \n\t"\
913 "psraw $3, %%mm0 \n\t"\
914 "psraw $3, %%mm1 \n\t"\
915 "movq %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c") \n\t"\
916 "movq %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c") \n\t"\
917 "xor "#index", "#index" \n\t"\
920 "movq (%2, "#index"), %%mm2 \n\t" \
921 "movq (%3, "#index"), %%mm3 \n\t" \
922 "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \
923 "movq (%2, "#index"), %%mm5 \n\t" \
924 "movq (%3, "#index"), %%mm4 \n\t" \
925 "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \
926 "psubw %%mm3, %%mm2 \n\t" \
927 "psubw %%mm4, %%mm5 \n\t" \
928 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
929 "pmulhw %%mm0, %%mm2 \n\t" \
930 "pmulhw %%mm0, %%mm5 \n\t" \
931 "psraw $7, %%mm3 \n\t" \
932 "psraw $7, %%mm4 \n\t" \
933 "paddw %%mm2, %%mm3 \n\t" \
934 "paddw %%mm5, %%mm4 \n\t" \
935 "movq (%0, "#index", 2), %%mm0 \n\t" \
936 "movq (%1, "#index", 2), %%mm1 \n\t" \
937 "movq 8(%0, "#index", 2), %%mm6 \n\t" \
938 "movq 8(%1, "#index", 2), %%mm7 \n\t" \
939 "psubw %%mm1, %%mm0 \n\t" \
940 "psubw %%mm7, %%mm6 \n\t" \
941 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" \
942 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" \
943 "psraw $7, %%mm1 \n\t" \
944 "psraw $7, %%mm7 \n\t" \
945 "paddw %%mm0, %%mm1 \n\t" \
946 "paddw %%mm6, %%mm7 \n\t" \
948 #define YSCALEYUV2PACKED(index, c) REAL_YSCALEYUV2PACKED(index, c)
951 const int16_t *ubuf[2],
const int16_t *vbuf[2],
952 const int16_t *abuf[2], uint8_t *dest,
953 int dstW,
int yalpha,
int uvalpha,
int y)
955 const int16_t *buf0 = buf[0], *buf1 = buf[1],
956 *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
961 "mov %4, %%"REG_b
" \n\t"
962 "push %%"REG_BP
" \n\t"
965 "pop %%"REG_BP
" \n\t"
967 ::
"c" (buf0),
"d" (buf1),
"S" (ubuf0),
"D" (ubuf1),
"m" (dest),
/*
 * REAL_YSCALEYUV2RGB1 / YSCALEYUV2RGB1(index, c): colour conversion from a
 * single luma line and a single chroma line (no blending).
 *
 *   index  register used as sample offset; zeroed here
 *   c      register holding the base for the *_OFFSET / *_COEFF displacements
 *
 * Chroma is read from operand %2 only: four words at index into mm3 and four
 * words UV_OFF_BYTE(c) bytes further on into mm4 (index is advanced and then
 * restored). Luma is read from operand %0 (eight words at index*2) into
 * mm1 / mm7. All samples are shifted right arithmetically by 4.
 *
 * The rest is the same colour math as REAL_YSCALEYUV2RGB_UV +
 * REAL_YSCALEYUV2RGB_COEFF: offsets subtracted, pmulhw with the UG/VG/UB/VR
 * and Y coefficients, each chroma word applied to two adjacent luma words,
 * packuswb to bytes. Results: eight bytes each in mm2, mm4 and mm5 (passed
 * as b, g, r to WRITEBGR32 in this file). mm0, mm3 and mm6 are scratch.
 *
 * YSCALEYUV2RGB1 forwards to REAL_YSCALEYUV2RGB1 so that macro arguments are
 * expanded before being stringized.
 */
972 #define REAL_YSCALEYUV2RGB1(index, c) \
973 "xor "#index", "#index" \n\t"\
976 "movq (%2, "#index"), %%mm3 \n\t" \
977 "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \
978 "movq (%2, "#index"), %%mm4 \n\t" \
979 "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \
980 "psraw $4, %%mm3 \n\t" \
981 "psraw $4, %%mm4 \n\t" \
982 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" \
983 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" \
984 "movq %%mm3, %%mm2 \n\t" \
985 "movq %%mm4, %%mm5 \n\t" \
986 "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
987 "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
989 "movq (%0, "#index", 2), %%mm1 \n\t" \
990 "movq 8(%0, "#index", 2), %%mm7 \n\t" \
991 "psraw $4, %%mm1 \n\t" \
992 "psraw $4, %%mm7 \n\t" \
993 "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
994 "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
995 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" \
996 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" \
997 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
998 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
1000 "paddw %%mm3, %%mm4 \n\t"\
1001 "movq %%mm2, %%mm0 \n\t"\
1002 "movq %%mm5, %%mm6 \n\t"\
1003 "movq %%mm4, %%mm3 \n\t"\
1004 "punpcklwd %%mm2, %%mm2 \n\t"\
1005 "punpcklwd %%mm5, %%mm5 \n\t"\
1006 "punpcklwd %%mm4, %%mm4 \n\t"\
1007 "paddw %%mm1, %%mm2 \n\t"\
1008 "paddw %%mm1, %%mm5 \n\t"\
1009 "paddw %%mm1, %%mm4 \n\t"\
1010 "punpckhwd %%mm0, %%mm0 \n\t"\
1011 "punpckhwd %%mm6, %%mm6 \n\t"\
1012 "punpckhwd %%mm3, %%mm3 \n\t"\
1013 "paddw %%mm7, %%mm0 \n\t"\
1014 "paddw %%mm7, %%mm6 \n\t"\
1015 "paddw %%mm7, %%mm3 \n\t"\
1017 "packuswb %%mm0, %%mm2 \n\t"\
1018 "packuswb %%mm6, %%mm5 \n\t"\
1019 "packuswb %%mm3, %%mm4 \n\t"\
1021 #define YSCALEYUV2RGB1(index, c) REAL_YSCALEYUV2RGB1(index, c)
/*
 * REAL_YSCALEYUV2RGB1b / YSCALEYUV2RGB1b(index, c): like YSCALEYUV2RGB1, but
 * the chroma is taken from two lines.
 *
 *   index  register used as sample offset; zeroed here
 *   c      register holding the base for the *_OFFSET / *_COEFF displacements
 *
 * Chroma words from operands %2 and %3 are added and the sum is shifted right
 * logically by 5 (psrlw), i.e. the average of the two lines scaled down by 4
 * bits. This is done for the words at index (-> mm3) and for those
 * UV_OFF_BYTE(c) bytes further on (-> mm4). Luma comes from operand %0 only
 * and is shifted right arithmetically by 4.
 *
 * The colour math that follows is identical to REAL_YSCALEYUV2RGB1. Results:
 * eight bytes each in mm2, mm4 and mm5; mm0, mm3 and mm6 are scratch.
 *
 * YSCALEYUV2RGB1b forwards to REAL_YSCALEYUV2RGB1b so that macro arguments
 * are expanded before being stringized.
 */
1024 #define REAL_YSCALEYUV2RGB1b(index, c) \
1025 "xor "#index", "#index" \n\t"\
1028 "movq (%2, "#index"), %%mm2 \n\t" \
1029 "movq (%3, "#index"), %%mm3 \n\t" \
1030 "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \
1031 "movq (%2, "#index"), %%mm5 \n\t" \
1032 "movq (%3, "#index"), %%mm4 \n\t" \
1033 "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \
1034 "paddw %%mm2, %%mm3 \n\t" \
1035 "paddw %%mm5, %%mm4 \n\t" \
1036 "psrlw $5, %%mm3 \n\t" \
1037 "psrlw $5, %%mm4 \n\t" \
1038 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" \
1039 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" \
1040 "movq %%mm3, %%mm2 \n\t" \
1041 "movq %%mm4, %%mm5 \n\t" \
1042 "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
1043 "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
1045 "movq (%0, "#index", 2), %%mm1 \n\t" \
1046 "movq 8(%0, "#index", 2), %%mm7 \n\t" \
1047 "psraw $4, %%mm1 \n\t" \
1048 "psraw $4, %%mm7 \n\t" \
1049 "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
1050 "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
1051 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" \
1052 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" \
1053 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
1054 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
1056 "paddw %%mm3, %%mm4 \n\t"\
1057 "movq %%mm2, %%mm0 \n\t"\
1058 "movq %%mm5, %%mm6 \n\t"\
1059 "movq %%mm4, %%mm3 \n\t"\
1060 "punpcklwd %%mm2, %%mm2 \n\t"\
1061 "punpcklwd %%mm5, %%mm5 \n\t"\
1062 "punpcklwd %%mm4, %%mm4 \n\t"\
1063 "paddw %%mm1, %%mm2 \n\t"\
1064 "paddw %%mm1, %%mm5 \n\t"\
1065 "paddw %%mm1, %%mm4 \n\t"\
1066 "punpckhwd %%mm0, %%mm0 \n\t"\
1067 "punpckhwd %%mm6, %%mm6 \n\t"\
1068 "punpckhwd %%mm3, %%mm3 \n\t"\
1069 "paddw %%mm7, %%mm0 \n\t"\
1070 "paddw %%mm7, %%mm6 \n\t"\
1071 "paddw %%mm7, %%mm3 \n\t"\
1073 "packuswb %%mm0, %%mm2 \n\t"\
1074 "packuswb %%mm6, %%mm5 \n\t"\
1075 "packuswb %%mm3, %%mm4 \n\t"\
1077 #define YSCALEYUV2RGB1b(index, c) REAL_YSCALEYUV2RGB1b(index, c)
/*
 * REAL_YSCALEYUV2RGB1_ALPHA / YSCALEYUV2RGB1_ALPHA(index): load eight 16-bit
 * samples from operand %1 at index*2, shift them right arithmetically by 7
 * and saturate them to eight unsigned bytes in mm7. mm1 is overwritten as
 * scratch. The wrapper exists so that the index argument is macro-expanded
 * before being stringized.
 */
1079 #define REAL_YSCALEYUV2RGB1_ALPHA(index) \
1080 "movq (%1, "#index", 2), %%mm7 \n\t" \
1081 "movq 8(%1, "#index", 2), %%mm1 \n\t" \
1082 "psraw $7, %%mm7 \n\t" \
1083 "psraw $7, %%mm1 \n\t" \
1084 "packuswb %%mm1, %%mm7 \n\t"
1085 #define YSCALEYUV2RGB1_ALPHA(index) REAL_YSCALEYUV2RGB1_ALPHA(index)
1091 const int16_t *ubuf[2],
const int16_t *bguf[2],
1092 const int16_t *abuf0, uint8_t *dest,
1093 int dstW,
int uvalpha,
int y)
1095 const int16_t *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
1096 const int16_t *buf1= buf0;
1098 if (uvalpha < 2048) {
1102 "mov %4, %%"REG_b
" \n\t"
1103 "push %%"REG_BP
" \n\t"
1106 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1107 "pop %%"REG_BP
" \n\t"
1109 ::
"c" (buf0),
"d" (abuf0),
"S" (ubuf0),
"D" (ubuf1),
"m" (dest),
1115 "mov %4, %%"REG_b
" \n\t"
1116 "push %%"REG_BP
" \n\t"
1118 "pcmpeqd %%mm7, %%mm7 \n\t"
1119 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1120 "pop %%"REG_BP
" \n\t"
1122 ::
"c" (buf0),
"d" (buf1),
"S" (ubuf0),
"D" (ubuf1),
"m" (dest),
1130 "mov %4, %%"REG_b
" \n\t"
1131 "push %%"REG_BP
" \n\t"
1134 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1135 "pop %%"REG_BP
" \n\t"
1137 ::
"c" (buf0),
"d" (abuf0),
"S" (ubuf0),
"D" (ubuf1),
"m" (dest),
1143 "mov %4, %%"REG_b
" \n\t"
1144 "push %%"REG_BP
" \n\t"
1146 "pcmpeqd %%mm7, %%mm7 \n\t"
1147 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1148 "pop %%"REG_BP
" \n\t"
1150 ::
"c" (buf0),
"d" (buf1),
"S" (ubuf0),
"D" (ubuf1),
"m" (dest),
1158 const int16_t *ubuf[2],
const int16_t *bguf[2],
1159 const int16_t *abuf0, uint8_t *dest,
1160 int dstW,
int uvalpha,
int y)
1162 const int16_t *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
1163 const int16_t *buf1= buf0;
1165 if (uvalpha < 2048) {
1168 "mov %4, %%"REG_b
" \n\t"
1169 "push %%"REG_BP
" \n\t"
1171 "pxor %%mm7, %%mm7 \n\t"
1173 "pop %%"REG_BP
" \n\t"
1175 ::
"c" (buf0),
"d" (buf1),
"S" (ubuf0),
"D" (ubuf1),
"m" (dest),
1181 "mov %4, %%"REG_b
" \n\t"
1182 "push %%"REG_BP
" \n\t"
1184 "pxor %%mm7, %%mm7 \n\t"
1186 "pop %%"REG_BP
" \n\t"
1188 ::
"c" (buf0),
"d" (buf1),
"S" (ubuf0),
"D" (ubuf1),
"m" (dest),
1195 const int16_t *ubuf[2],
const int16_t *bguf[2],
1196 const int16_t *abuf0, uint8_t *dest,
1197 int dstW,
int uvalpha,
int y)
1199 const int16_t *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
1200 const int16_t *buf1= buf0;
1202 if (uvalpha < 2048) {
1205 "mov %4, %%"REG_b
" \n\t"
1206 "push %%"REG_BP
" \n\t"
1208 "pxor %%mm7, %%mm7 \n\t"
1216 "pop %%"REG_BP
" \n\t"
1218 ::
"c" (buf0),
"d" (buf1),
"S" (ubuf0),
"D" (ubuf1),
"m" (dest),
1224 "mov %4, %%"REG_b
" \n\t"
1225 "push %%"REG_BP
" \n\t"
1227 "pxor %%mm7, %%mm7 \n\t"
1235 "pop %%"REG_BP
" \n\t"
1237 ::
"c" (buf0),
"d" (buf1),
"S" (ubuf0),
"D" (ubuf1),
"m" (dest),
1244 const int16_t *ubuf[2],
const int16_t *bguf[2],
1245 const int16_t *abuf0, uint8_t *dest,
1246 int dstW,
int uvalpha,
int y)
1248 const int16_t *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
1249 const int16_t *buf1= buf0;
1251 if (uvalpha < 2048) {
1254 "mov %4, %%"REG_b
" \n\t"
1255 "push %%"REG_BP
" \n\t"
1257 "pxor %%mm7, %%mm7 \n\t"
1265 "pop %%"REG_BP
" \n\t"
1267 ::
"c" (buf0),
"d" (buf1),
"S" (ubuf0),
"D" (ubuf1),
"m" (dest),
1273 "mov %4, %%"REG_b
" \n\t"
1274 "push %%"REG_BP
" \n\t"
1276 "pxor %%mm7, %%mm7 \n\t"
1284 "pop %%"REG_BP
" \n\t"
1286 ::
"c" (buf0),
"d" (buf1),
"S" (ubuf0),
"D" (ubuf1),
"m" (dest),
/*
 * REAL_YSCALEYUV2PACKED1 / YSCALEYUV2PACKED1(index, c): load one line of luma
 * and chroma for packed output, without blending or colour conversion.
 *
 *   index  register used as sample offset; zeroed here
 *   c      register holding the base for the UV_OFF_BYTE displacement
 *
 * Chroma is read from operand %2: four words at index into mm3 and four words
 * UV_OFF_BYTE(c) bytes further on into mm4 (index is advanced and then
 * restored). Luma is read from operand %0 (eight words at index*2) into
 * mm1 / mm7. Every register is shifted right arithmetically by 7.
 *
 * YSCALEYUV2PACKED1 forwards to REAL_YSCALEYUV2PACKED1 so that macro
 * arguments are expanded before being stringized.
 */
1292 #define REAL_YSCALEYUV2PACKED1(index, c) \
1293 "xor "#index", "#index" \n\t"\
1296 "movq (%2, "#index"), %%mm3 \n\t" \
1297 "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \
1298 "movq (%2, "#index"), %%mm4 \n\t" \
1299 "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \
1300 "psraw $7, %%mm3 \n\t" \
1301 "psraw $7, %%mm4 \n\t" \
1302 "movq (%0, "#index", 2), %%mm1 \n\t" \
1303 "movq 8(%0, "#index", 2), %%mm7 \n\t" \
1304 "psraw $7, %%mm1 \n\t" \
1305 "psraw $7, %%mm7 \n\t" \
1307 #define YSCALEYUV2PACKED1(index, c) REAL_YSCALEYUV2PACKED1(index, c)
/*
 * REAL_YSCALEYUV2PACKED1b / YSCALEYUV2PACKED1b(index, c): like
 * YSCALEYUV2PACKED1, but the chroma is taken from two lines.
 *
 *   index  register used as sample offset; zeroed here
 *   c      register holding the base for the UV_OFF_BYTE displacement
 *
 * Chroma words from operands %2 and %3 are added and the sum is shifted right
 * logically by 8 (psrlw), i.e. the average of the two lines scaled down by 7
 * bits. This is done for the words at index (-> mm3) and for those
 * UV_OFF_BYTE(c) bytes further on (-> mm4); mm2 and mm5 are scratch. Luma
 * comes from operand %0 only (eight words at index*2 -> mm1 / mm7) and is
 * shifted right arithmetically by 7.
 *
 * YSCALEYUV2PACKED1b forwards to REAL_YSCALEYUV2PACKED1b so that macro
 * arguments are expanded before being stringized.
 */
1309 #define REAL_YSCALEYUV2PACKED1b(index, c) \
1310 "xor "#index", "#index" \n\t"\
1313 "movq (%2, "#index"), %%mm2 \n\t" \
1314 "movq (%3, "#index"), %%mm3 \n\t" \
1315 "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \
1316 "movq (%2, "#index"), %%mm5 \n\t" \
1317 "movq (%3, "#index"), %%mm4 \n\t" \
1318 "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \
1319 "paddw %%mm2, %%mm3 \n\t" \
1320 "paddw %%mm5, %%mm4 \n\t" \
1321 "psrlw $8, %%mm3 \n\t" \
1322 "psrlw $8, %%mm4 \n\t" \
1323 "movq (%0, "#index", 2), %%mm1 \n\t" \
1324 "movq 8(%0, "#index", 2), %%mm7 \n\t" \
1325 "psraw $7, %%mm1 \n\t" \
1326 "psraw $7, %%mm7 \n\t"
1327 #define YSCALEYUV2PACKED1b(index, c) REAL_YSCALEYUV2PACKED1b(index, c)
1330 const int16_t *ubuf[2],
const int16_t *bguf[2],
1331 const int16_t *abuf0, uint8_t *dest,
1332 int dstW,
int uvalpha,
int y)
1334 const int16_t *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
1335 const int16_t *buf1= buf0;
1337 if (uvalpha < 2048) {
1340 "mov %4, %%"REG_b
" \n\t"
1341 "push %%"REG_BP
" \n\t"
1344 "pop %%"REG_BP
" \n\t"
1346 ::
"c" (buf0),
"d" (buf1),
"S" (ubuf0),
"D" (ubuf1),
"m" (dest),
1352 "mov %4, %%"REG_b
" \n\t"
1353 "push %%"REG_BP
" \n\t"
1356 "pop %%"REG_BP
" \n\t"
1358 ::
"c" (buf0),
"d" (buf1),
"S" (ubuf0),
"D" (ubuf1),
"m" (dest),
1370 "movq "MANGLE(ff_bgr24toY1Coeff)
", %%mm5 \n\t"
1371 "movq "MANGLE(ff_bgr24toY2Coeff)
", %%mm6 \n\t"
1376 "movq "MANGLE(ff_rgb24toY1Coeff)
", %%mm5 \n\t"
1377 "movq "MANGLE(ff_rgb24toY2Coeff)
", %%mm6 \n\t"
1383 "movq "MANGLE(ff_bgr24toYOffset)
", %%mm4 \n\t"
1384 "mov %2, %%"REG_a
" \n\t"
1385 "pxor %%mm7, %%mm7 \n\t"
1388 "movd (%0), %%mm0 \n\t"
1389 "movd 2(%0), %%mm1 \n\t"
1390 "movd 6(%0), %%mm2 \n\t"
1391 "movd 8(%0), %%mm3 \n\t"
1393 "punpcklbw %%mm7, %%mm0 \n\t"
1394 "punpcklbw %%mm7, %%mm1 \n\t"
1395 "punpcklbw %%mm7, %%mm2 \n\t"
1396 "punpcklbw %%mm7, %%mm3 \n\t"
1397 "pmaddwd %%mm5, %%mm0 \n\t"
1398 "pmaddwd %%mm6, %%mm1 \n\t"
1399 "pmaddwd %%mm5, %%mm2 \n\t"
1400 "pmaddwd %%mm6, %%mm3 \n\t"
1401 "paddd %%mm1, %%mm0 \n\t"
1402 "paddd %%mm3, %%mm2 \n\t"
1403 "paddd %%mm4, %%mm0 \n\t"
1404 "paddd %%mm4, %%mm2 \n\t"
1405 "psrad $15, %%mm0 \n\t"
1406 "psrad $15, %%mm2 \n\t"
1407 "packssdw %%mm2, %%mm0 \n\t"
1408 "packuswb %%mm0, %%mm0 \n\t"
1409 "movd %%mm0, (%1, %%"REG_a
") \n\t"
1410 "add $4, %%"REG_a
" \n\t"
/* NOTE(review): only the signature is visible in this excerpt; the body is
 * not shown. The last parameter is named unused. */
1418 static void RENAME(bgr24ToY)(uint8_t *dst,
const uint8_t *src,
1419 int width, uint32_t *unused)
/* NOTE(review): only the signature is visible in this excerpt; the body is
 * not shown. The parameter list matches that of bgr24ToY. */
1424 static void RENAME(rgb24ToY)(uint8_t *dst,
const uint8_t *src,
1425 int width, uint32_t *unused)
/*
 * NOTE(review): fragment of an inline-asm kernel; the start of the function
 * header, the loop label/branch and the operand list are not visible in this
 * excerpt. Operands 1 and 2 are presumably the two chroma destinations --
 * confirm against the operand list.
 */
1431 const uint8_t *src,
int width,
/* Operand 4 points at a table read in 8-byte entries at offsets 0, 8, 16
 * and 24. The entry at offset 24 is cached in mm6; the other three are used
 * directly as memory operands of pmaddwd below. */
1435 "movq 24(%4), %%mm6 \n\t"
/* REG_a is initialised from operand 3 and indexes both stores. */
1436 "mov %3, %%"REG_a
" \n\t"
/* mm7 = 0, the zero source for the byte unpacks. */
1437 "pxor %%mm7, %%mm7 \n\t"
/* First group: loads at byte offsets 0 and 2 of operand 0, widened to
 * 16-bit lanes. */
1440 "movd (%0), %%mm0 \n\t"
1441 "movd 2(%0), %%mm1 \n\t"
1442 "punpcklbw %%mm7, %%mm0 \n\t"
1443 "punpcklbw %%mm7, %%mm1 \n\t"
/* Keep copies in mm2/mm3 so the same input is multiplied by two different
 * coefficient sets: entries 0 and 8 for mm0/mm1, entry 16 and mm6 for
 * mm2/mm3. The sums end up in mm0 and mm2. */
1444 "movq %%mm0, %%mm2 \n\t"
1445 "movq %%mm1, %%mm3 \n\t"
1446 "pmaddwd (%4), %%mm0 \n\t"
1447 "pmaddwd 8(%4), %%mm1 \n\t"
1448 "pmaddwd 16(%4), %%mm2 \n\t"
1449 "pmaddwd %%mm6, %%mm3 \n\t"
1450 "paddd %%mm1, %%mm0 \n\t"
1451 "paddd %%mm3, %%mm2 \n\t"
/* Second group: same computation for the loads at byte offsets 6 and 8.
 * The sums end up in mm1 and mm4. */
1453 "movd 6(%0), %%mm1 \n\t"
1454 "movd 8(%0), %%mm3 \n\t"
1456 "punpcklbw %%mm7, %%mm1 \n\t"
1457 "punpcklbw %%mm7, %%mm3 \n\t"
1458 "movq %%mm1, %%mm4 \n\t"
1459 "movq %%mm3, %%mm5 \n\t"
1460 "pmaddwd (%4), %%mm1 \n\t"
1461 "pmaddwd 8(%4), %%mm3 \n\t"
1462 "pmaddwd 16(%4), %%mm4 \n\t"
1463 "pmaddwd %%mm6, %%mm5 \n\t"
1464 "paddd %%mm3, %%mm1 \n\t"
1465 "paddd %%mm5, %%mm4 \n\t"
/* Add ff_bgr24toUVOffset to all four sums and shift each right
 * (arithmetic) by 15. */
1467 "movq "MANGLE(ff_bgr24toUVOffset)
", %%mm3 \n\t"
1468 "paddd %%mm3, %%mm0 \n\t"
1469 "paddd %%mm3, %%mm2 \n\t"
1470 "paddd %%mm3, %%mm1 \n\t"
1471 "paddd %%mm3, %%mm4 \n\t"
1472 "psrad $15, %%mm0 \n\t"
1473 "psrad $15, %%mm2 \n\t"
1474 "psrad $15, %%mm1 \n\t"
1475 "psrad $15, %%mm4 \n\t"
/* Merge the two groups per coefficient set (mm0 with mm1, mm2 with mm4),
 * then pack to unsigned saturated bytes. */
1476 "packssdw %%mm1, %%mm0 \n\t"
1477 "packssdw %%mm4, %%mm2 \n\t"
1478 "packuswb %%mm0, %%mm0 \n\t"
1479 "packuswb %%mm2, %%mm2 \n\t"
/* Store 4 bytes from mm0 through operand 1 and 4 bytes from mm2 through
 * operand 2, both indexed by REG_a, then advance REG_a by 4. */
1480 "movd %%mm0, (%1, %%"REG_a
") \n\t"
1481 "movd %%mm2, (%2, %%"REG_a
") \n\t"
1482 "add $4, %%"REG_a
" \n\t"
/* NOTE(review): only the signature and one statement of the body are
 * visible in this excerpt. */
1490 static void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV,
1491 const uint8_t *src1,
const uint8_t *src2,
1492 int width, uint32_t *unused)
/* Both source pointers are required to be the same pointer. */
1495 assert(src1 == src2);
/* NOTE(review): only the signature is visible in this excerpt; the body is
 * not shown. The parameter list matches that of bgr24ToUV. */
1498 static void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV,
1499 const uint8_t *src1,
const uint8_t *src2,
1500 int width, uint32_t *unused)
1506 #if COMPILE_TEMPLATE_MMX2
/*
 * NOTE(review): fragment of a function; the start of its header, several
 * preprocessor conditionals and part of the asm operand list are not visible
 * in this excerpt. Comments describe the visible lines only.
 */
1508 int dstWidth,
const uint8_t *src,
/* Cache c->hLumFilterPos, c->hLumFilter and c->lumMmx2FilterCode in locals;
 * they are handed to the asm statement as memory operands below. */
1511 int32_t *filterPos = c->hLumFilterPos;
1512 int16_t *
filter = c->hLumFilter;
1513 void *mmx2FilterCode= c->lumMmx2FilterCode;
/* Save REG_b into operand 5. NOTE(review): operands 5 and 6 are not part of
 * the visible operand list. */
1524 "mov %%"REG_b
", %5 \n\t"
/* Copy the value at -8(%rsp) into operand 6 via REG_a; the matching sequence
 * near the end of the asm writes it back. */
1526 "mov -8(%%rsp), %%"REG_a
" \n\t"
1527 "mov %%"REG_a
", %6 \n\t"
/* Alternative sequence: the same save of -8(%rsp), but into operand 5.
 * The preprocessor conditions selecting between these are not visible. */
1531 "mov -8(%%rsp), %%"REG_a
" \n\t"
1532 "mov %%"REG_a
", %5 \n\t"
/* Register setup: mm7 = 0, REG_c = src (operand 0), REG_D = dst (operand 1),
 * REG_d = filter (operand 2), REG_b = filterPos (operand 3), REG_a = 0. */
1535 "pxor %%mm7, %%mm7 \n\t"
1536 "mov %0, %%"REG_c
" \n\t"
1537 "mov %1, %%"REG_D
" \n\t"
1538 "mov %2, %%"REG_d
" \n\t"
1539 "mov %3, %%"REG_b
" \n\t"
1540 "xor %%"REG_a
", %%"REG_a
" \n\t"
/* Two alternative definitions of CALL_MMX2_FILTER_CODE follow (the
 * conditional lines separating them are not visible). In the visible lines
 * both load esi from (REG_b), add the 32-bit value at (REG_b + REG_a) to
 * REG_c -- through esi / REG_S in the first variant, directly with addl in
 * the second -- then add REG_a to REG_D and zero REG_a. The macro is expanded
 * eight times afterwards. */
1546 #define CALL_MMX2_FILTER_CODE \
1547 "movl (%%"REG_b"), %%esi \n\t"\
1549 "movl (%%"REG_b", %%"REG_a"), %%esi \n\t"\
1550 "add %%"REG_S", %%"REG_c" \n\t"\
1551 "add %%"REG_a", %%"REG_D" \n\t"\
1552 "xor %%"REG_a", %%"REG_a" \n\t"\
1555 #define CALL_MMX2_FILTER_CODE \
1556 "movl (%%"REG_b"), %%esi \n\t"\
1558 "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
1559 "add %%"REG_a", %%"REG_D" \n\t"\
1560 "xor %%"REG_a", %%"REG_a" \n\t"\
1564 CALL_MMX2_FILTER_CODE
1565 CALL_MMX2_FILTER_CODE
1566 CALL_MMX2_FILTER_CODE
1567 CALL_MMX2_FILTER_CODE
1568 CALL_MMX2_FILTER_CODE
1569 CALL_MMX2_FILTER_CODE
1570 CALL_MMX2_FILTER_CODE
1571 CALL_MMX2_FILTER_CODE
/* Undo the saves from the top of the asm: REG_b is reloaded from operand 5
 * and -8(%rsp) is rewritten from operand 6 (or from operand 5 in the
 * alternative sequence). */
1574 "mov %5, %%"REG_b
" \n\t"
1576 "mov %6, %%"REG_a
" \n\t"
1577 "mov %%"REG_a
", -8(%%rsp) \n\t"
1581 "mov %5, %%"REG_a
" \n\t"
1582 "mov %%"REG_a
", -8(%%rsp) \n\t"
/* No output operands. Inputs 0..4 are src, dst, filter, filterPos and
 * mmx2FilterCode, all passed as memory operands. */
1585 ::
"m" (src),
"m" (dst),
"m" (
filter),
"m" (filterPos),
1586 "m" (mmx2FilterCode)
/* Registers declared as clobbered by the asm statement. */
1593 :
"%"REG_a,
"%"REG_c,
"%"REG_d,
"%"REG_S,
"%"REG_D
/* Starting at the last output index and moving down, every dst[i] for which
 * (i*xInc)>>16 is at or beyond srcW-1 is set to src[srcW-1] multiplied by
 * 128. */
1599 for (i=dstWidth-1; (i*xInc)>>16 >=
srcW-1; i--)
1600 dst[i] = src[
srcW-1]*128;
/*
 * NOTE(review): fragment of a function; the start of its header, several
 * preprocessor conditionals and part of the asm operand list are not visible
 * in this excerpt. Comments describe the visible lines only.
 */
1604 int dstWidth,
const uint8_t *src1,
1605 const uint8_t *src2,
int srcW,
int xInc)
/* Cache c->hChrFilterPos, c->hChrFilter and c->chrMmx2FilterCode in locals;
 * they are handed to the asm statement as memory operands below. */
1607 int32_t *filterPos = c->hChrFilterPos;
1608 int16_t *filter = c->hChrFilter;
1609 void *mmx2FilterCode= c->chrMmx2FilterCode;
/* Save REG_b into operand 7. NOTE(review): operands 7 and 8 are not part of
 * the visible operand list. */
1620 "mov %%"REG_b
", %7 \n\t"
/* Copy the value at -8(%rsp) into operand 8 via REG_a; the matching sequence
 * near the end of the asm writes it back. */
1622 "mov -8(%%rsp), %%"REG_a
" \n\t"
1623 "mov %%"REG_a
", %8 \n\t"
/* Alternative sequence: the same save of -8(%rsp), but into operand 7.
 * The preprocessor conditions selecting between these are not visible. */
1627 "mov -8(%%rsp), %%"REG_a
" \n\t"
1628 "mov %%"REG_a
", %7 \n\t"
/* First pass setup: mm7 = 0, REG_c = src1 (operand 0), REG_D = dst1
 * (operand 1), REG_d = filter (operand 2), REG_b = filterPos (operand 3),
 * REG_a = 0. */
1631 "pxor %%mm7, %%mm7 \n\t"
1632 "mov %0, %%"REG_c
" \n\t"
1633 "mov %1, %%"REG_D
" \n\t"
1634 "mov %2, %%"REG_d
" \n\t"
1635 "mov %3, %%"REG_b
" \n\t"
1636 "xor %%"REG_a
", %%"REG_a
" \n\t"
/* Four expansions of CALL_MMX2_FILTER_CODE for the src1/dst1 pair. */
1641 CALL_MMX2_FILTER_CODE
1642 CALL_MMX2_FILTER_CODE
1643 CALL_MMX2_FILTER_CODE
1644 CALL_MMX2_FILTER_CODE
/* Second pass setup: REG_a = 0, REG_c = src2 (operand 5), REG_D = dst2
 * (operand 6). REG_d and REG_b are not reloaded in the visible lines. */
1645 "xor %%"REG_a
", %%"REG_a
" \n\t"
1646 "mov %5, %%"REG_c
" \n\t"
1647 "mov %6, %%"REG_D
" \n\t"
/* Four expansions of CALL_MMX2_FILTER_CODE for the src2/dst2 pair. */
1652 CALL_MMX2_FILTER_CODE
1653 CALL_MMX2_FILTER_CODE
1654 CALL_MMX2_FILTER_CODE
1655 CALL_MMX2_FILTER_CODE
/* Undo the saves from the top of the asm: REG_b is reloaded from operand 7
 * and -8(%rsp) is rewritten from operand 8 (or from operand 7 in the
 * alternative sequence). */
1658 "mov %7, %%"REG_b
" \n\t"
1660 "mov %8, %%"REG_a
" \n\t"
1661 "mov %%"REG_a
", -8(%%rsp) \n\t"
1665 "mov %7, %%"REG_a
" \n\t"
1666 "mov %%"REG_a
", -8(%%rsp) \n\t"
/* No output operands. Inputs 0..6 are src1, dst1, filter, filterPos,
 * mmx2FilterCode, src2 and dst2, all passed as memory operands. */
1669 ::
"m" (src1),
"m" (dst1),
"m" (
filter),
"m" (filterPos),
1670 "m" (mmx2FilterCode),
"m" (src2),
"m"(dst2)
/* Registers declared as clobbered by the asm statement. */
1677 :
"%"REG_a,
"%"REG_c,
"%"REG_d,
"%"REG_S,
"%"REG_D
/* Starting at the last output index and moving down, every index i for which
 * (i*xInc)>>16 is at or beyond srcW-1 gets dst1[i] and dst2[i] set to the
 * element at srcW-1 of src1 and src2 respectively, multiplied by 128. */
1683 for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) {
1684 dst1[i] = src1[srcW-1]*128;
1685 dst2[i] = src2[srcW-1]*128;
1700 switch (c->dstFormat) {
1711 switch (c->dstFormat) {
1723 switch (c->dstFormat) {
1725 c->yuv2packed1 =
RENAME(yuv2rgb32_1);
1726 c->yuv2packed2 =
RENAME(yuv2rgb32_2);
1729 c->yuv2packed1 =
RENAME(yuv2bgr24_1);
1730 c->yuv2packed2 =
RENAME(yuv2bgr24_2);
1733 c->yuv2packed1 =
RENAME(yuv2rgb555_1);
1734 c->yuv2packed2 =
RENAME(yuv2rgb555_2);
1737 c->yuv2packed1 =
RENAME(yuv2rgb565_1);
1738 c->yuv2packed2 =
RENAME(yuv2rgb565_2);
1741 c->yuv2packed1 =
RENAME(yuv2yuyv422_1);
1742 c->yuv2packed2 =
RENAME(yuv2yuyv422_2);
1750 if (c->srcBpc == 8 && c->dstBpc <= 10) {
1752 #if COMPILE_TEMPLATE_MMX2
1759 c->hyscale_fast =
NULL;
1760 c->hcscale_fast =
NULL;
1761 #if COMPILE_TEMPLATE_MMX2
1766 if (!c->chrSrcHSubSample) {
1774 switch (srcFormat) {