/* Fragment of the put-pixels-clamped MMX routine (see ff_put_pixels_clamped_mmx
 * in the signature list); several original lines are missing from this excerpt.
 *
 * First pass: read 64 bytes (eight quadwords) from the address in %3, pack each
 * pair of quadwords from 16-bit words down to bytes with unsigned saturation
 * (packuswb), and store the four resulting 8-byte rows at %0, %0 + %1,
 * %0 + 2 * %1 and %0 + %2.
 * NOTE(review): the operand list of this first statement is not visible here;
 * presumably it matches the one shown for the second pass below -- confirm. */
44 "movq (%3), %%mm0 \n\t"
45 "movq 8(%3), %%mm1 \n\t"
46 "movq 16(%3), %%mm2 \n\t"
47 "movq 24(%3), %%mm3 \n\t"
48 "movq 32(%3), %%mm4 \n\t"
49 "movq 40(%3), %%mm5 \n\t"
50 "movq 48(%3), %%mm6 \n\t"
51 "movq 56(%3), %%mm7 \n\t"
52 "packuswb %%mm1, %%mm0 \n\t"
53 "packuswb %%mm3, %%mm2 \n\t"
54 "packuswb %%mm5, %%mm4 \n\t"
55 "packuswb %%mm7, %%mm6 \n\t"
56 "movq %%mm0, (%0) \n\t"
57 "movq %%mm2, (%0, %1) \n\t"
58 "movq %%mm4, (%0, %1, 2) \n\t"
59 "movq %%mm6, (%0, %2) \n\t"
/* Second pass: the same load / pack / store sequence as above.
 * NOTE(review): the code that advances the source and destination pointers
 * between the two passes is not visible in this excerpt. */
70 "movq (%3), %%mm0 \n\t"
71 "movq 8(%3), %%mm1 \n\t"
72 "movq 16(%3), %%mm2 \n\t"
73 "movq 24(%3), %%mm3 \n\t"
74 "movq 32(%3), %%mm4 \n\t"
75 "movq 40(%3), %%mm5 \n\t"
76 "movq 48(%3), %%mm6 \n\t"
77 "movq 56(%3), %%mm7 \n\t"
78 "packuswb %%mm1, %%mm0 \n\t"
79 "packuswb %%mm3, %%mm2 \n\t"
80 "packuswb %%mm5, %%mm4 \n\t"
81 "packuswb %%mm7, %%mm6 \n\t"
82 "movq %%mm0, (%0) \n\t"
83 "movq %%mm2, (%0, %1) \n\t"
84 "movq %%mm4, (%0, %1, 2) \n\t"
85 "movq %%mm6, (%0, %2) \n\t"
/* Input operands only (no outputs):
 *   %0 = pix            destination row pointer
 *   %1 = line_size      destination stride, widened to x86_reg
 *   %2 = line_size * 3  precomputed so the fourth row needs no extra scaling
 *   %3 = p              source pointer to the 16-bit coefficients */
86 ::
"r"(pix),
"r"((
x86_reg)line_size),
"r"((
x86_reg)line_size * 3),
"r"(p)
/* Asm-template fragment that converts 64 bytes of 16-bit input, starting at
 * byte offset `off` from %2, into four 8-byte output rows.
 *
 * For each row: load one quadword of words, pack it together with the next
 * quadword in memory down to bytes with signed saturation (packsswb), add the
 * byte vector held in mm0 (paddb, wrapping), and store 8 bytes.
 * Rows are written to %0, %0 + %3, %0 + 2 * %3 and %0 + %1.
 *
 * Operand roles are fixed by the enclosing asm statement; in the visible user
 * of this macro they are %0 = pixels, %1 = line_skip3, %2 = block,
 * %3 = line_skip. mm0 must already hold the bias bytes when the macro runs;
 * mm1-mm4 are overwritten. `off` is pasted textually via the # operator. */
90 #define put_signed_pixels_clamped_mmx_half(off) \
91 "movq "#off"(%2), %%mm1 \n\t" \
92 "movq 16 + "#off"(%2), %%mm2 \n\t" \
93 "movq 32 + "#off"(%2), %%mm3 \n\t" \
94 "movq 48 + "#off"(%2), %%mm4 \n\t" \
95 "packsswb 8 + "#off"(%2), %%mm1 \n\t" \
96 "packsswb 24 + "#off"(%2), %%mm2 \n\t" \
97 "packsswb 40 + "#off"(%2), %%mm3 \n\t" \
98 "packsswb 56 + "#off"(%2), %%mm4 \n\t" \
99 "paddb %%mm0, %%mm1 \n\t" \
100 "paddb %%mm0, %%mm2 \n\t" \
101 "paddb %%mm0, %%mm3 \n\t" \
102 "paddb %%mm0, %%mm4 \n\t" \
103 "movq %%mm1, (%0) \n\t" \
104 "movq %%mm2, (%0, %3) \n\t" \
105 "movq %%mm3, (%0, %3, 2) \n\t" \
106 "movq %%mm4, (%0, %1) \n\t"
/* Fragment of the put-signed-pixels-clamped MMX routine: emits two groups of
 * four rows using put_signed_pixels_clamped_mmx_half.
 * NOTE(review): the instruction that loads mm0 (the byte vector the macro adds
 * to every row) is not visible in this excerpt -- confirm it precedes this. */
/* %1 = %3 + 2 * %3, i.e. three times the stride, for the fourth row of a group. */
116 "lea (%3, %3, 2), %1 \n\t"
/* First four rows, from source byte offset 0. */
117 put_signed_pixels_clamped_mmx_half(0)
/* Advance the destination pointer by four rows. */
118 "lea (%0, %3, 4), %0 \n\t"
/* Next four rows, from source byte offset 64. */
119 put_signed_pixels_clamped_mmx_half(64)
/* Outputs: %0 = pixels is read and written; %1 = line_skip3 is a write-only
 * scratch. Both are early-clobber so they cannot share a register with the
 * inputs, which are still read after these are written. */
120 :
"+&r"(pixels),
"=&r"(line_skip3)
/* Inputs: %2 = block (source), %3 = line_skip (destination stride). */
121 :
"r"(block),
"r"(line_skip)
/* Fragment of the add-pixels-clamped MMX routine: adds 16-bit values read from
 * %2 to two rows of 8 destination bytes and writes the saturated result back.
 * NOTE(review): the %2 operand, the enclosing loop and the zeroing of mm7 are
 * not visible in this excerpt; the unpack steps below only widen bytes to
 * words correctly if mm7 is zero -- confirm. */
/* Load 16 words (32 bytes) of addends. */
139 "movq (%2), %%mm0 \n\t"
140 "movq 8(%2), %%mm1 \n\t"
141 "movq 16(%2), %%mm2 \n\t"
142 "movq 24(%2), %%mm3 \n\t"
/* Load the two destination rows (memory operands %0 and %1). */
143 "movq %0, %%mm4 \n\t"
144 "movq %1, %%mm6 \n\t"
/* Row 0: interleave bytes with mm7 to get low/high word halves, then add with
 * signed saturation. */
145 "movq %%mm4, %%mm5 \n\t"
146 "punpcklbw %%mm7, %%mm4 \n\t"
147 "punpckhbw %%mm7, %%mm5 \n\t"
148 "paddsw %%mm4, %%mm0 \n\t"
149 "paddsw %%mm5, %%mm1 \n\t"
/* Row 1: same treatment. */
150 "movq %%mm6, %%mm5 \n\t"
151 "punpcklbw %%mm7, %%mm6 \n\t"
152 "punpckhbw %%mm7, %%mm5 \n\t"
153 "paddsw %%mm6, %%mm2 \n\t"
154 "paddsw %%mm5, %%mm3 \n\t"
/* Pack the word sums back to bytes with unsigned saturation and store. */
155 "packuswb %%mm1, %%mm0 \n\t"
156 "packuswb %%mm3, %%mm2 \n\t"
157 "movq %%mm0, %0 \n\t"
158 "movq %%mm2, %1 \n\t"
/* Read-write memory operands: %0 = row at pix, %1 = row at pix + line_size. */
159 :
"+m"(*pix),
"+m"(*(pix + line_size))
/* Step down two rows for the next pass. */
162 pix += line_size * 2;
/* Macro generating a function `name(int16_t *blocks)` that stores zeros with
 * MMX quadword writes. Visible parts: mm7 is zeroed, the index register REG_a
 * is loaded from operand %1, 32 bytes of zeros are written at %0 + REG_a per
 * step, and REG_a then advances by 32. Operand %0 is the byte address
 * blocks + 128 * n.
 * NOTE(review): the loop label, the branch and operand %1 are not visible in
 * this excerpt; presumably %1 is a negative byte count so the index runs up
 * to zero -- confirm. */
167 #define CLEAR_BLOCKS(name, n) \
168 void name(int16_t *blocks) \
171 "pxor %%mm7, %%mm7 \n\t" \
172 "mov %1, %%"REG_a" \n\t" \
174 "movq %%mm7, (%0, %%"REG_a") \n\t" \
175 "movq %%mm7, 8(%0, %%"REG_a") \n\t" \
176 "movq %%mm7, 16(%0, %%"REG_a") \n\t" \
177 "movq %%mm7, 24(%0, %%"REG_a") \n\t" \
178 "add $32, %%"REG_a" \n\t" \
180 :: "r"(((uint8_t *)blocks) + 128 * n), \
/* Fragment of the single-block SSE clear routine: zero xmm0, then write eight
 * 16-byte stores covering 128 consecutive bytes starting at %0.
 * movaps faults on addresses that are not 16-byte aligned, so %0 must be
 * 16-byte aligned.
 * NOTE(review): the operand list is not visible here; presumably %0 is the
 * block pointer -- confirm. */
191 "xorps %%xmm0, %%xmm0 \n"
192 "movaps %%xmm0, (%0) \n"
193 "movaps %%xmm0, 16(%0) \n"
194 "movaps %%xmm0, 32(%0) \n"
195 "movaps %%xmm0, 48(%0) \n"
196 "movaps %%xmm0, 64(%0) \n"
197 "movaps %%xmm0, 80(%0) \n"
198 "movaps %%xmm0, 96(%0) \n"
199 "movaps %%xmm0, 112(%0) \n"
/* Fragment of the multi-block SSE clear routine: zero xmm0, load the index
 * register REG_a from operand %1, then write 128 bytes of zeros at %0 + REG_a
 * and advance REG_a by 128.
 * movaps requires every accessed address to be 16-byte aligned.
 * NOTE(review): the loop label, the branch and operand %1 are not visible in
 * this excerpt; presumably %1 is a negative byte count so the index runs up
 * to zero -- confirm. */
208 "xorps %%xmm0, %%xmm0 \n"
209 "mov %1, %%"REG_a
" \n"
/* One 128-byte chunk: eight aligned 16-byte stores. */
211 "movaps %%xmm0, (%0, %%"REG_a
") \n"
212 "movaps %%xmm0, 16(%0, %%"REG_a
") \n"
213 "movaps %%xmm0, 32(%0, %%"REG_a
") \n"
214 "movaps %%xmm0, 48(%0, %%"REG_a
") \n"
215 "movaps %%xmm0, 64(%0, %%"REG_a
") \n"
216 "movaps %%xmm0, 80(%0, %%"REG_a
") \n"
217 "movaps %%xmm0, 96(%0, %%"REG_a
") \n"
218 "movaps %%xmm0, 112(%0, %%"REG_a
") \n"
219 "add $128, %%"REG_a
" \n"
/* Input %0: byte address 128 * 6 bytes past `blocks`, used as the base that
 * the index in REG_a is added to. */
221 ::
"r"(((
uint8_t *)blocks) + 128 * 6),
/* Fragment of the add-bytes MMX routine: dst[k] += src[k] with wrapping byte
 * arithmetic, 16 bytes per pass of the visible instructions.
 * %0 is the running byte index, %1 the source base, %2 the destination base.
 * NOTE(review): the loop label, the compare/branch, the index increment and
 * the output operand list (which would bind %0) are not visible here. */
/* Bytes 0..7 at the current index. */
233 "movq (%1, %0), %%mm0 \n\t"
234 "movq (%2, %0), %%mm1 \n\t"
235 "paddb %%mm0, %%mm1 \n\t"
236 "movq %%mm1, (%2, %0) \n\t"
/* Bytes 8..15 at the current index. */
237 "movq 8(%1, %0), %%mm0 \n\t"
238 "movq 8(%2, %0), %%mm1 \n\t"
239 "paddb %%mm0, %%mm1 \n\t"
240 "movq %%mm1, 8(%2, %0) \n\t"
/* Inputs: %1 = src, %2 = dst, %3 = w - 15 (widened to x86_reg).
 * NOTE(review): %3 is presumably the bound the index is compared against so
 * that a full 16-byte pass never runs past w -- confirm. */
246 :
"r"(src),
"r"(dst),
"r"((
x86_reg)w - 15)
/* Scalar form of the same update; presumably the tail loop for the bytes the
 * 16-byte passes did not cover -- the loop header is not visible here. */
249 dst[i + 0] += src[i + 0];
/* Fragment of the edge-drawing MMX routine (see ff_draw_edges_mmx in the
 * signature list): replicates border pixels outward to the left/right of each
 * row and above/below the image. Many original lines (braces, branch
 * conditions, loop labels, most operand lists) are missing from this excerpt. */
255 int w,
int h,
int sides)
/* Address of the first pixel of the last image row. */
260 last_line = buf + (height - 1) * wrap;
/* Left/right replication, 8 bytes per side.
 * The low byte at (%0) is broadcast to all 8 bytes of mm0 by the three
 * punpckl* steps and stored in the 8 bytes just before %0. The top byte of the
 * quadword ending at %0 + %2 is broadcast by the punpckh* steps and stored in
 * the 8 bytes starting at %0 + %2.
 * NOTE(review): operands are not visible; presumably %0 is the row start and
 * %2 the row width -- confirm. */
266 "movd (%0), %%mm0 \n\t"
267 "punpcklbw %%mm0, %%mm0 \n\t"
268 "punpcklwd %%mm0, %%mm0 \n\t"
269 "punpckldq %%mm0, %%mm0 \n\t"
270 "movq %%mm0, -8(%0) \n\t"
271 "movq -8(%0, %2), %%mm1 \n\t"
272 "punpckhbw %%mm1, %%mm1 \n\t"
273 "punpckhwd %%mm1, %%mm1 \n\t"
274 "punpckhdq %%mm1, %%mm1 \n\t"
275 "movq %%mm1, (%0, %2) \n\t"
/* Same replication, but 16 bytes are written on each side. */
285 "movd (%0), %%mm0 \n\t"
286 "punpcklbw %%mm0, %%mm0 \n\t"
287 "punpcklwd %%mm0, %%mm0 \n\t"
288 "punpckldq %%mm0, %%mm0 \n\t"
289 "movq %%mm0, -8(%0) \n\t"
290 "movq %%mm0, -16(%0) \n\t"
291 "movq -8(%0, %2), %%mm1 \n\t"
292 "punpckhbw %%mm1, %%mm1 \n\t"
293 "punpckhwd %%mm1, %%mm1 \n\t"
294 "punpckhdq %%mm1, %%mm1 \n\t"
295 "movq %%mm1, (%0, %2) \n\t"
296 "movq %%mm1, 8(%0, %2) \n\t"
/* Rows above the image, four rows per asm pass. ptr starts w bytes left of the
 * row that lies (i + 1) rows above buf. */
307 for (i = 0; i < h; i += 4) {
308 ptr = buf - (i + 1) * wrap - w;
/* Copy one quadword from %0 + %1 into four rows: %0, %0 + %2, %0 + 2 * %2 and
 * %0 + %3. */
311 "movq (%1, %0), %%mm0 \n\t"
312 "movq %%mm0, (%0) \n\t"
313 "movq %%mm0, (%0, %2) \n\t"
314 "movq %%mm0, (%0, %2, 2) \n\t"
315 "movq %%mm0, (%0, %3) \n\t"
/* Last two visible inputs: -wrap * 3, and the address ptr + width + 2 * w.
 * NOTE(review): the earlier operands and the loop control are not visible;
 * presumably the final operand is the end address for the horizontal loop. */
321 "r"((
x86_reg) -wrap * 3),
"r"(ptr + width + 2 * w)
/* Rows below the image: same scheme, starting (i + 1) rows past last_line. */
327 for (i = 0; i < h; i += 4) {
328 ptr = last_line + (i + 1) * wrap - w;
331 "movq (%1, %0), %%mm0 \n\t"
332 "movq %%mm0, (%0) \n\t"
333 "movq %%mm0, (%0, %2) \n\t"
334 "movq %%mm0, (%0, %2, 2) \n\t"
335 "movq %%mm0, (%0, %3) \n\t"
342 "r"(ptr + width + 2 * w)
/* Fragment of the MMX global-motion-compensation routine (see ff_gmc_mmx in
 * the signature list). It falls back to ff_gmc_c for inputs it cannot handle;
 * otherwise it interpolates four output pixels at a time from the 2x2 source
 * neighbourhood. Several original lines are missing from this excerpt,
 * including the declaration of `w`, the opening `if (`, the src pointer setup
 * and parts of the asm operand lists. */
349 int stride,
int h,
int ox,
int oy,
350 int dxx,
int dxy,
int dyx,
int dyy,
351 int shift,
int r,
int width,
int height)
/* Integer part of the starting offset. */
354 const int ix = ox >> (16 + shift);
355 const int iy = oy >> (16 + shift);
/* Offsets and per-step deltas with the low 4 bits dropped; these are the
 * values later truncated into 16-bit lanes. */
356 const int oxs = ox >> 4;
357 const int oys = oy >> 4;
358 const int dxxs = dxx >> 4;
359 const int dxys = dxy >> 4;
360 const int dyxs = dyx >> 4;
361 const int dyys = dyy >> 4;
/* Four-lane copies of r and of the per-row deltas, used as 64-bit memory
 * operands by the asm below. */
362 const uint16_t r4[4] = {
r,
r,
r, r };
363 const uint16_t dxy4[4] = { dxys, dxys, dxys, dxys };
364 const uint16_t dyy4[4] = { dyys, dyys, dyys, dyys };
365 const uint64_t
shift2 = 2 * shift;
/* Total change of the offsets across the block width and height. */
368 const int dxw = (dxx - (1 << (16 + shift))) * (w - 1);
369 const int dyh = (dyy - (1 << (16 + shift))) * (h - 1);
370 const int dxh = dxy * (h - 1);
371 const int dyw = dyx * (w - 1);
/* Fallback condition (the opening `if (` is not visible here). Take the C
 * path when any of these holds:
 *  - the bits at or above position 16 + shift of the offset differ between
 *    corners of the block (the XORs expose any changed bit),
 *  - any delta has non-zero low 4 bits, which the >> 4 above would discard,
 *  - ix or iy, compared as unsigned, is not below width - w / height - h. */
373 ((ox ^ (ox + dxw)) | (ox ^ (ox + dxh)) | (ox ^ (ox + dxw + dxh)) |
374 (oy ^ (oy + dyw)) | (oy ^ (oy + dyh)) | (oy ^ (oy + dyw + dyh))) >> (16 + shift)
376 || (dxx | dxy | dyx | dyy) & 15 ||
377 (unsigned)ix >= width - w ||
378 (
unsigned)iy >= height - h) {
380 ff_gmc_c(dst, src,
stride, h, ox, oy, dxx, dxy, dyx, dyy,
381 shift,
r, width, height);
/* Setup: broadcast the low 16 bits of operand %0 to all four word lanes of
 * mm6 and zero mm7 (used below to widen bytes to words).
 * NOTE(review): the operand is not visible; from the arithmetic below it is
 * presumably the fixed-point "one", called s in the comments -- confirm. */
388 "movd %0, %%mm6 \n\t"
389 "pxor %%mm7, %%mm7 \n\t"
390 "punpcklwd %%mm6, %%mm6 \n\t"
391 "punpcklwd %%mm6, %%mm6 \n\t"
/* Four output columns per pass. dx4/dy4 hold the per-column offsets, started
 * one row-delta early because the row loop adds the delta before using them. */
395 for (x = 0; x < w; x += 4) {
396 uint16_t dx4[4] = { oxs - dxys + dxxs * (x + 0),
397 oxs - dxys + dxxs * (x + 1),
398 oxs - dxys + dxxs * (x + 2),
399 oxs - dxys + dxxs * (x + 3) };
400 uint16_t dy4[4] = { oys - dyys + dyxs * (x + 0),
401 oys - dyys + dyxs * (x + 1),
402 oys - dyys + dyxs * (x + 2),
403 oys - dyys + dyxs * (x + 3) };
405 for (y = 0; y < h; y++) {
/* Advance the offsets by one row, store them back, and keep only the top
 * 4 bits of each lane as the fractions dx (mm4) and dy (mm5). */
407 "movq %0, %%mm4 \n\t"
408 "movq %1, %%mm5 \n\t"
409 "paddw %2, %%mm4 \n\t"
410 "paddw %3, %%mm5 \n\t"
411 "movq %%mm4, %0 \n\t"
412 "movq %%mm5, %1 \n\t"
413 "psrlw $12, %%mm4 \n\t"
414 "psrlw $12, %%mm5 \n\t"
415 :
"+m"(*dx4),
"+m"(*dy4)
416 :
"m"(*dxy4),
"m"(*dyy4)
/* Four weights, with s = mm6:
 *   mm0 = (s - dx) * (s - dy)    mm1 = dx * (s - dy)
 *   mm2 = (s - dx) * dy          mm3 = dx * dy */
420 "movq %%mm6, %%mm2 \n\t"
421 "movq %%mm6, %%mm1 \n\t"
422 "psubw %%mm4, %%mm2 \n\t"
423 "psubw %%mm5, %%mm1 \n\t"
424 "movq %%mm2, %%mm0 \n\t"
425 "movq %%mm4, %%mm3 \n\t"
426 "pmullw %%mm1, %%mm0 \n\t"
427 "pmullw %%mm5, %%mm3 \n\t"
428 "pmullw %%mm5, %%mm2 \n\t"
429 "pmullw %%mm4, %%mm1 \n\t"
/* Lower source row: %4 = src[stride + 1] weighted by mm3, %3 = src[stride]
 * weighted by mm2. Four bytes are loaded and widened to words via mm7. */
431 "movd %4, %%mm5 \n\t"
432 "movd %3, %%mm4 \n\t"
433 "punpcklbw %%mm7, %%mm5 \n\t"
434 "punpcklbw %%mm7, %%mm4 \n\t"
435 "pmullw %%mm5, %%mm3 \n\t"
436 "pmullw %%mm4, %%mm2 \n\t"
/* Upper source row: %2 = src[1] weighted by mm1, %1 = src[0] weighted by mm0. */
438 "movd %2, %%mm5 \n\t"
439 "movd %1, %%mm4 \n\t"
440 "punpcklbw %%mm7, %%mm5 \n\t"
441 "punpcklbw %%mm7, %%mm4 \n\t"
442 "pmullw %%mm5, %%mm1 \n\t"
443 "pmullw %%mm4, %%mm0 \n\t"
/* Sum the four products plus operand %5.
 * NOTE(review): %5 and %6 are not visible; presumably %5 is r4 (rounding
 * term) and %6 is shift2 -- confirm. */
444 "paddw %5, %%mm1 \n\t"
445 "paddw %%mm3, %%mm2 \n\t"
446 "paddw %%mm1, %%mm0 \n\t"
447 "paddw %%mm2, %%mm0 \n\t"
/* Scale down, saturate to unsigned bytes and store four pixels. */
449 "psrlw %6, %%mm0 \n\t"
450 "packuswb %%mm0, %%mm0 \n\t"
451 "movd %%mm0, %0 \n\t"
453 :
"=m"(dst[x + y *
stride])
454 :
"m"(src[0]),
"m"(src[1]),
455 "m"(src[stride]),
"m"(src[stride + 1]),
/* Fragment of the SSE float-clipping routine (see ff_vector_clipf_sse in the
 * signature list): dst[k] = each src[k] raised to at least min, then lowered
 * to at most max, 16 floats per pass of the visible instructions.
 * movaps requires the accessed src and dst addresses to be 16-byte aligned.
 * NOTE(review): the loop label, the index update/branch and the operand bound
 * to %0 (the byte index) are not visible in this excerpt. */
465 float min,
float max,
int len)
/* Broadcast min into all four lanes of xmm4 and max into xmm5. */
469 "movss %3, %%xmm4 \n\t"
470 "movss %4, %%xmm5 \n\t"
471 "shufps $0, %%xmm4, %%xmm4 \n\t"
472 "shufps $0, %%xmm5, %%xmm5 \n\t"
/* Load 64 bytes (16 floats) from src at the current index. */
474 "movaps (%2, %0), %%xmm0 \n\t"
475 "movaps 16(%2, %0), %%xmm1 \n\t"
476 "movaps 32(%2, %0), %%xmm2 \n\t"
477 "movaps 48(%2, %0), %%xmm3 \n\t"
/* Lower bound first ... */
478 "maxps %%xmm4, %%xmm0 \n\t"
479 "maxps %%xmm4, %%xmm1 \n\t"
480 "maxps %%xmm4, %%xmm2 \n\t"
481 "maxps %%xmm4, %%xmm3 \n\t"
/* ... then upper bound. */
482 "minps %%xmm5, %%xmm0 \n\t"
483 "minps %%xmm5, %%xmm1 \n\t"
484 "minps %%xmm5, %%xmm2 \n\t"
485 "minps %%xmm5, %%xmm3 \n\t"
/* Store the clipped values to dst at the same index. */
486 "movaps %%xmm0, (%1, %0) \n\t"
487 "movaps %%xmm1, 16(%1, %0) \n\t"
488 "movaps %%xmm2, 32(%1, %0) \n\t"
489 "movaps %%xmm3, 48(%1, %0) \n\t"
/* Inputs: %1 = dst, %2 = src, %3 = min, %4 = max (min/max read from memory). */
493 :
"r"(dst),
"r"(src),
"m"(min),
"m"(max)
void ff_clear_blocks_mmx(int16_t *blocks)
void ff_clear_blocks_sse(int16_t *blocks)
void ff_add_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels, int line_size)
void ff_put_signed_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels, int line_size)
void ff_clear_block_sse(int16_t *block)
void ff_gmc_mmx(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy, int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy, int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
void ff_clear_block_mmx(int16_t *block)
static const int shift2[6]
void ff_draw_edges_mmx(uint8_t *buf, int wrap, int width, int height, int w, int h, int sides)
void ff_put_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels, int line_size)
void ff_vector_clipf_sse(float *dst, const float *src, float min, float max, int len)
void ff_add_bytes_mmx(uint8_t *dst, uint8_t *src, int w)