dsputil_mmx_avg_template.c
Go to the documentation of this file.
1 /*
2  * DSP utils : average functions are compiled twice for 3dnow/mmx2
3  * Copyright (c) 2000, 2001 Fabrice Bellard
4  * Copyright (c) 2002-2004 Michael Niedermayer
5  *
6  * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
7  * mostly rewritten by Michael Niedermayer <michaelni@gmx.at>
8  * and improved by Zdenek Kabelac <kabi@users.sf.net>
9  *
10  * This file is part of Libav.
11  *
12  * Libav is free software; you can redistribute it and/or
13  * modify it under the terms of the GNU Lesser General Public
14  * License as published by the Free Software Foundation; either
15  * version 2.1 of the License, or (at your option) any later version.
16  *
17  * Libav is distributed in the hope that it will be useful,
18  * but WITHOUT ANY WARRANTY; without even the implied warranty of
19  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20  * Lesser General Public License for more details.
21  *
22  * You should have received a copy of the GNU Lesser General Public
23  * License along with Libav; if not, write to the Free Software
24  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
25  */
26 
27 /* XXX: we use explicit registers to avoid a gcc 2.95.2 register asm
28  clobber bug - now it will work with 2.95.2 and also with -fPIC
29  */
/**
 * Horizontal half-pel "put" for an 8-byte-wide block: each output row is the
 * PAVGB of the source row read at offset 0 and the same row read at offset 1.
 *
 * PAVGB and REG_a are macros provided by the file that includes this template.
 * NOTE(review): presumably pavgb/pavgusb and the native-width accumulator
 * register - confirm in the including file.
 *
 * @param block     destination, advanced by line_size per row
 * @param pixels    source, advanced by line_size per row
 * @param line_size byte stride shared by source and destination
 * @param h         row count; four rows are produced per loop iteration and the
 *                  loop stops only when h reaches exactly 0 (jnz), so h must be
 *                  a positive multiple of 4
 *
 * NOTE(review): MMX registers are modified without being listed as clobbers
 * and no emms is issued here - presumably handled by the caller; verify.
 */
30 static void DEF(put_pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
31 {
32  __asm__ volatile(
    /* Operands: %0 = h, %1 = pixels, %2 = block, %3 = line_size. */
    /* REG_a = 2 * line_size, the step for one pair of rows. */
33  "lea (%3, %3), %%"REG_a" \n\t"
34  "1: \n\t"
    /* Rows 0 and 1 of this group. */
35  "movq (%1), %%mm0 \n\t"
36  "movq (%1, %3), %%mm1 \n\t"
37  PAVGB" 1(%1), %%mm0 \n\t"
38  PAVGB" 1(%1, %3), %%mm1 \n\t"
39  "movq %%mm0, (%2) \n\t"
40  "movq %%mm1, (%2, %3) \n\t"
41  "add %%"REG_a", %1 \n\t"
42  "add %%"REG_a", %2 \n\t"
    /* Rows 2 and 3 of this group. */
43  "movq (%1), %%mm0 \n\t"
44  "movq (%1, %3), %%mm1 \n\t"
45  PAVGB" 1(%1), %%mm0 \n\t"
46  PAVGB" 1(%1, %3), %%mm1 \n\t"
47  "add %%"REG_a", %1 \n\t"
48  "movq %%mm0, (%2) \n\t"
49  "movq %%mm1, (%2, %3) \n\t"
50  "add %%"REG_a", %2 \n\t"
    /* Four rows done; loop until h is exactly zero. */
51  "subl $4, %0 \n\t"
52  "jnz 1b \n\t"
53  :"+g"(h), "+S"(pixels), "+D"(block)
54  :"r" ((x86_reg)line_size)
55  :"%"REG_a, "memory");
56 }
57 
/**
 * Write the byte-wise PAVGB average of two sources to dst, 4 bytes per row.
 *
 * src1 is stepped by src1Stride and dst by dstStride; src2 is read as tightly
 * packed rows (it advances by exactly 4 bytes per row).
 *
 * If h is odd, one row is processed up front and h is decremented; the main
 * loop then handles four rows per iteration and exits only when h reaches
 * exactly 0 (jnz).
 * NOTE(review): there is no test between the single-row step and the loop, so
 * only h = 4k or 4k + 1 with k >= 1 terminates after exactly h rows - confirm
 * callers never pass anything else (e.g. h == 1 or h == 2).
 *
 * @param dst        destination rows
 * @param src1       first source, strided
 * @param src2       second source, packed 4 bytes per row
 * @param dstStride  byte stride of dst
 * @param src1Stride byte stride of src1
 * @param h          row count (see the restriction above)
 */
58 static void DEF(put_pixels4_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
59 {
60  __asm__ volatile(
    /* Operands: %0 = h, %1 = src1, %2 = src2, %3 = dst,
     *           %4 = src1Stride, %5 = dstStride. */
    /* Odd h: peel off a single row first. */
61  "testl $1, %0 \n\t"
62  " jz 1f \n\t"
63  "movd (%1), %%mm0 \n\t"
64  "movd (%2), %%mm1 \n\t"
65  "add %4, %1 \n\t"
66  "add $4, %2 \n\t"
67  PAVGB" %%mm1, %%mm0 \n\t"
68  "movd %%mm0, (%3) \n\t"
69  "add %5, %3 \n\t"
70  "decl %0 \n\t"
71  "1: \n\t"
    /* Rows 0 and 1: src2 rows sit at byte offsets 0 and 4. */
72  "movd (%1), %%mm0 \n\t"
73  "add %4, %1 \n\t"
74  "movd (%1), %%mm1 \n\t"
75  "movd (%2), %%mm2 \n\t"
76  "movd 4(%2), %%mm3 \n\t"
77  "add %4, %1 \n\t"
78  PAVGB" %%mm2, %%mm0 \n\t"
79  PAVGB" %%mm3, %%mm1 \n\t"
80  "movd %%mm0, (%3) \n\t"
81  "add %5, %3 \n\t"
82  "movd %%mm1, (%3) \n\t"
83  "add %5, %3 \n\t"
    /* Rows 2 and 3: src2 rows sit at byte offsets 8 and 12. */
84  "movd (%1), %%mm0 \n\t"
85  "add %4, %1 \n\t"
86  "movd (%1), %%mm1 \n\t"
87  "movd 8(%2), %%mm2 \n\t"
88  "movd 12(%2), %%mm3 \n\t"
89  "add %4, %1 \n\t"
90  PAVGB" %%mm2, %%mm0 \n\t"
91  PAVGB" %%mm3, %%mm1 \n\t"
92  "movd %%mm0, (%3) \n\t"
93  "add %5, %3 \n\t"
94  "movd %%mm1, (%3) \n\t"
95  "add %5, %3 \n\t"
    /* Four packed src2 rows consumed. */
96  "add $16, %2 \n\t"
97  "subl $4, %0 \n\t"
98  "jnz 1b \n\t"
    /* h lives in memory when EBX cannot be used, otherwise in EBX. */
99 #if !HAVE_EBX_AVAILABLE //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
100  :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
101 #else
102  :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
103 #endif
104  :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
105  :"memory");
106 }
107 
108 
/**
 * Write the byte-wise PAVGB average of two sources to dst, 8 bytes per row.
 *
 * src1 is stepped by src1Stride and dst by dstStride; src2 is read as tightly
 * packed rows (it advances by exactly 8 bytes per row).
 *
 * If h is odd, one row is processed up front and h is decremented; the main
 * loop then handles four rows per iteration and exits only when h reaches
 * exactly 0 (jnz).
 * NOTE(review): only h = 4k or 4k + 1 with k >= 1 terminates after exactly h
 * rows - confirm callers respect this.
 *
 * @param dst        destination rows
 * @param src1       first source, strided
 * @param src2       second source, packed 8 bytes per row
 * @param dstStride  byte stride of dst
 * @param src1Stride byte stride of src1
 * @param h          row count (see the restriction above)
 */
109 static void DEF(put_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
110 {
111  __asm__ volatile(
    /* Operands: %0 = h, %1 = src1, %2 = src2, %3 = dst,
     *           %4 = src1Stride, %5 = dstStride. */
    /* Odd h: peel off a single row first. */
112  "testl $1, %0 \n\t"
113  " jz 1f \n\t"
114  "movq (%1), %%mm0 \n\t"
115  "movq (%2), %%mm1 \n\t"
116  "add %4, %1 \n\t"
117  "add $8, %2 \n\t"
118  PAVGB" %%mm1, %%mm0 \n\t"
119  "movq %%mm0, (%3) \n\t"
120  "add %5, %3 \n\t"
121  "decl %0 \n\t"
122  "1: \n\t"
    /* Rows 0 and 1: src2 rows at byte offsets 0 and 8. */
123  "movq (%1), %%mm0 \n\t"
124  "add %4, %1 \n\t"
125  "movq (%1), %%mm1 \n\t"
126  "add %4, %1 \n\t"
127  PAVGB" (%2), %%mm0 \n\t"
128  PAVGB" 8(%2), %%mm1 \n\t"
129  "movq %%mm0, (%3) \n\t"
130  "add %5, %3 \n\t"
131  "movq %%mm1, (%3) \n\t"
132  "add %5, %3 \n\t"
    /* Rows 2 and 3: src2 rows at byte offsets 16 and 24. */
133  "movq (%1), %%mm0 \n\t"
134  "add %4, %1 \n\t"
135  "movq (%1), %%mm1 \n\t"
136  "add %4, %1 \n\t"
137  PAVGB" 16(%2), %%mm0 \n\t"
138  PAVGB" 24(%2), %%mm1 \n\t"
139  "movq %%mm0, (%3) \n\t"
140  "add %5, %3 \n\t"
141  "movq %%mm1, (%3) \n\t"
142  "add %5, %3 \n\t"
    /* Four packed src2 rows consumed. */
143  "add $32, %2 \n\t"
144  "subl $4, %0 \n\t"
145  "jnz 1b \n\t"
    /* h lives in memory when EBX cannot be used, otherwise in EBX. */
146 #if !HAVE_EBX_AVAILABLE //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
147  :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
148 #else
149  :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
150 #endif
151  :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
152  :"memory");
153 //the following should be used, though better not with gcc ...
154 /* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
155  :"r"(src1Stride), "r"(dstStride)
156  :"memory");*/
157 }
158 
/**
 * "No rounding" average of two sources written to dst, 8 bytes per row.
 *
 * mm6 is set to all ones; both inputs are bit-complemented (pxor with mm6),
 * combined with PAVGB, and the result is complemented again.
 * NOTE(review): this yields a round-down average provided PAVGB itself rounds
 * up - confirm against the PAVGB definition in the including file.
 *
 * src1 is stepped by src1Stride and dst by dstStride; src2 is read as tightly
 * packed rows (8 bytes per row). An odd h is handled by one single-row step;
 * the main loop then does four rows per iteration and exits only when h is
 * exactly 0.
 * NOTE(review): only h = 4k or 4k + 1 with k >= 1 terminates after exactly h
 * rows - confirm callers respect this.
 *
 * @param dst        destination rows
 * @param src1       first source, strided
 * @param src2       second source, packed 8 bytes per row
 * @param dstStride  byte stride of dst
 * @param src1Stride byte stride of src1
 * @param h          row count (see the restriction above)
 */
159 static void DEF(put_no_rnd_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
160 {
161  __asm__ volatile(
    /* Operands: %0 = h, %1 = src1, %2 = src2, %3 = dst,
     *           %4 = src1Stride, %5 = dstStride. */
    /* mm6 = 0xFF in every byte, used as the complement mask. */
162  "pcmpeqb %%mm6, %%mm6 \n\t"
    /* Odd h: peel off a single row first. */
163  "testl $1, %0 \n\t"
164  " jz 1f \n\t"
165  "movq (%1), %%mm0 \n\t"
166  "movq (%2), %%mm1 \n\t"
167  "add %4, %1 \n\t"
168  "add $8, %2 \n\t"
169  "pxor %%mm6, %%mm0 \n\t"
170  "pxor %%mm6, %%mm1 \n\t"
171  PAVGB" %%mm1, %%mm0 \n\t"
172  "pxor %%mm6, %%mm0 \n\t"
173  "movq %%mm0, (%3) \n\t"
174  "add %5, %3 \n\t"
175  "decl %0 \n\t"
176  "1: \n\t"
    /* Rows 0 and 1: src2 rows at byte offsets 0 and 8. */
177  "movq (%1), %%mm0 \n\t"
178  "add %4, %1 \n\t"
179  "movq (%1), %%mm1 \n\t"
180  "add %4, %1 \n\t"
181  "movq (%2), %%mm2 \n\t"
182  "movq 8(%2), %%mm3 \n\t"
183  "pxor %%mm6, %%mm0 \n\t"
184  "pxor %%mm6, %%mm1 \n\t"
185  "pxor %%mm6, %%mm2 \n\t"
186  "pxor %%mm6, %%mm3 \n\t"
187  PAVGB" %%mm2, %%mm0 \n\t"
188  PAVGB" %%mm3, %%mm1 \n\t"
189  "pxor %%mm6, %%mm0 \n\t"
190  "pxor %%mm6, %%mm1 \n\t"
191  "movq %%mm0, (%3) \n\t"
192  "add %5, %3 \n\t"
193  "movq %%mm1, (%3) \n\t"
194  "add %5, %3 \n\t"
    /* Rows 2 and 3: src2 rows at byte offsets 16 and 24. */
195  "movq (%1), %%mm0 \n\t"
196  "add %4, %1 \n\t"
197  "movq (%1), %%mm1 \n\t"
198  "add %4, %1 \n\t"
199  "movq 16(%2), %%mm2 \n\t"
200  "movq 24(%2), %%mm3 \n\t"
201  "pxor %%mm6, %%mm0 \n\t"
202  "pxor %%mm6, %%mm1 \n\t"
203  "pxor %%mm6, %%mm2 \n\t"
204  "pxor %%mm6, %%mm3 \n\t"
205  PAVGB" %%mm2, %%mm0 \n\t"
206  PAVGB" %%mm3, %%mm1 \n\t"
207  "pxor %%mm6, %%mm0 \n\t"
208  "pxor %%mm6, %%mm1 \n\t"
209  "movq %%mm0, (%3) \n\t"
210  "add %5, %3 \n\t"
211  "movq %%mm1, (%3) \n\t"
212  "add %5, %3 \n\t"
    /* Four packed src2 rows consumed. */
213  "add $32, %2 \n\t"
214  "subl $4, %0 \n\t"
215  "jnz 1b \n\t"
    /* h lives in memory when EBX cannot be used, otherwise in EBX. */
216 #if !HAVE_EBX_AVAILABLE //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
217  :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
218 #else
219  :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
220 #endif
221  :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
222  :"memory");
223 //the following should be used, though better not with gcc ...
224 /* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
225  :"r"(src1Stride), "r"(dstStride)
226  :"memory");*/
227 }
228 
/**
 * Average two sources with PAVGB, then average that result with the bytes
 * already in dst, and store it back to dst - 4 bytes stored per row.
 *
 * src1 is stepped by src1Stride and dst by dstStride; src2 is read as tightly
 * packed rows (it advances by exactly 4 bytes per row).
 *
 * If h is odd, one row is processed up front and h is decremented; the main
 * loop then handles four rows per iteration and exits only when h reaches
 * exactly 0 (jnz).
 * NOTE(review): only h = 4k or 4k + 1 with k >= 1 terminates after exactly h
 * rows - confirm callers respect this.
 * NOTE(review): PAVGB is used with memory operands on src2 and dst here;
 * presumably those loads are wider than the 4 bytes that are stored - confirm
 * the buffers tolerate that.
 *
 * @param dst        destination rows (read and written)
 * @param src1       first source, strided
 * @param src2       second source, packed 4 bytes per row
 * @param dstStride  byte stride of dst
 * @param src1Stride byte stride of src1
 * @param h          row count (see the restriction above)
 */
229 static void DEF(avg_pixels4_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
230 {
231  __asm__ volatile(
    /* Operands: %0 = h, %1 = src1, %2 = src2, %3 = dst,
     *           %4 = src1Stride, %5 = dstStride. */
    /* Odd h: peel off a single row first. */
232  "testl $1, %0 \n\t"
233  " jz 1f \n\t"
234  "movd (%1), %%mm0 \n\t"
235  "movd (%2), %%mm1 \n\t"
236  "add %4, %1 \n\t"
237  "add $4, %2 \n\t"
238  PAVGB" %%mm1, %%mm0 \n\t"
    /* Blend with what is already in dst. */
239  PAVGB" (%3), %%mm0 \n\t"
240  "movd %%mm0, (%3) \n\t"
241  "add %5, %3 \n\t"
242  "decl %0 \n\t"
243  "1: \n\t"
    /* Rows 0 and 1: src2 rows at byte offsets 0 and 4. */
244  "movd (%1), %%mm0 \n\t"
245  "add %4, %1 \n\t"
246  "movd (%1), %%mm1 \n\t"
247  "add %4, %1 \n\t"
248  PAVGB" (%2), %%mm0 \n\t"
249  PAVGB" 4(%2), %%mm1 \n\t"
250  PAVGB" (%3), %%mm0 \n\t"
251  "movd %%mm0, (%3) \n\t"
252  "add %5, %3 \n\t"
253  PAVGB" (%3), %%mm1 \n\t"
254  "movd %%mm1, (%3) \n\t"
255  "add %5, %3 \n\t"
    /* Rows 2 and 3: src2 rows at byte offsets 8 and 12. */
256  "movd (%1), %%mm0 \n\t"
257  "add %4, %1 \n\t"
258  "movd (%1), %%mm1 \n\t"
259  "add %4, %1 \n\t"
260  PAVGB" 8(%2), %%mm0 \n\t"
261  PAVGB" 12(%2), %%mm1 \n\t"
262  PAVGB" (%3), %%mm0 \n\t"
263  "movd %%mm0, (%3) \n\t"
264  "add %5, %3 \n\t"
265  PAVGB" (%3), %%mm1 \n\t"
266  "movd %%mm1, (%3) \n\t"
267  "add %5, %3 \n\t"
    /* Four packed src2 rows consumed. */
268  "add $16, %2 \n\t"
269  "subl $4, %0 \n\t"
270  "jnz 1b \n\t"
    /* h lives in memory when EBX cannot be used, otherwise in EBX. */
271 #if !HAVE_EBX_AVAILABLE //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
272  :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
273 #else
274  :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
275 #endif
276  :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
277  :"memory");
278 }
279 
280 
/**
 * Average two sources with PAVGB, then average that result with the bytes
 * already in dst, and store it back to dst - 8 bytes per row.
 *
 * src1 is stepped by src1Stride and dst by dstStride; src2 is read as tightly
 * packed rows (it advances by exactly 8 bytes per row).
 *
 * If h is odd, one row is processed up front and h is decremented; the main
 * loop then handles four rows per iteration and exits only when h reaches
 * exactly 0 (jnz).
 * NOTE(review): only h = 4k or 4k + 1 with k >= 1 terminates after exactly h
 * rows - confirm callers respect this.
 *
 * @param dst        destination rows (read and written)
 * @param src1       first source, strided
 * @param src2       second source, packed 8 bytes per row
 * @param dstStride  byte stride of dst
 * @param src1Stride byte stride of src1
 * @param h          row count (see the restriction above)
 */
281 static void DEF(avg_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
282 {
283  __asm__ volatile(
    /* Operands: %0 = h, %1 = src1, %2 = src2, %3 = dst,
     *           %4 = src1Stride, %5 = dstStride. */
    /* Odd h: peel off a single row first. */
284  "testl $1, %0 \n\t"
285  " jz 1f \n\t"
286  "movq (%1), %%mm0 \n\t"
287  "movq (%2), %%mm1 \n\t"
288  "add %4, %1 \n\t"
289  "add $8, %2 \n\t"
290  PAVGB" %%mm1, %%mm0 \n\t"
    /* Blend with what is already in dst. */
291  PAVGB" (%3), %%mm0 \n\t"
292  "movq %%mm0, (%3) \n\t"
293  "add %5, %3 \n\t"
294  "decl %0 \n\t"
295  "1: \n\t"
    /* Rows 0 and 1: src2 rows at byte offsets 0 and 8. */
296  "movq (%1), %%mm0 \n\t"
297  "add %4, %1 \n\t"
298  "movq (%1), %%mm1 \n\t"
299  "add %4, %1 \n\t"
300  PAVGB" (%2), %%mm0 \n\t"
301  PAVGB" 8(%2), %%mm1 \n\t"
302  PAVGB" (%3), %%mm0 \n\t"
303  "movq %%mm0, (%3) \n\t"
304  "add %5, %3 \n\t"
305  PAVGB" (%3), %%mm1 \n\t"
306  "movq %%mm1, (%3) \n\t"
307  "add %5, %3 \n\t"
    /* Rows 2 and 3: src2 rows at byte offsets 16 and 24. */
308  "movq (%1), %%mm0 \n\t"
309  "add %4, %1 \n\t"
310  "movq (%1), %%mm1 \n\t"
311  "add %4, %1 \n\t"
312  PAVGB" 16(%2), %%mm0 \n\t"
313  PAVGB" 24(%2), %%mm1 \n\t"
314  PAVGB" (%3), %%mm0 \n\t"
315  "movq %%mm0, (%3) \n\t"
316  "add %5, %3 \n\t"
317  PAVGB" (%3), %%mm1 \n\t"
318  "movq %%mm1, (%3) \n\t"
319  "add %5, %3 \n\t"
    /* Four packed src2 rows consumed. */
320  "add $32, %2 \n\t"
321  "subl $4, %0 \n\t"
322  "jnz 1b \n\t"
    /* h lives in memory when EBX cannot be used, otherwise in EBX. */
323 #if !HAVE_EBX_AVAILABLE //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
324  :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
325 #else
326  :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
327 #endif
328  :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
329  :"memory");
330 //the following should be used, though better not with gcc ...
331 /* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
332  :"r"(src1Stride), "r"(dstStride)
333  :"memory");*/
334 }
335 
/**
 * Horizontal half-pel "put" for a 16-byte-wide block: each output row is the
 * PAVGB of the source row read at offset 0 and the same row read at offset 1,
 * done as two 8-byte halves (source offsets 0/1 and 8/9).
 *
 * @param block     destination, advanced by line_size per row
 * @param pixels    source, advanced by line_size per row
 * @param line_size byte stride shared by source and destination
 * @param h         row count; four rows are produced per loop iteration and the
 *                  loop stops only when h reaches exactly 0 (jnz), so h must be
 *                  a positive multiple of 4
 */
336 static void DEF(put_pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
337 {
338  __asm__ volatile(
    /* Operands: %0 = h, %1 = pixels, %2 = block, %3 = line_size. */
    /* REG_a = 2 * line_size, the step for one pair of rows. */
339  "lea (%3, %3), %%"REG_a" \n\t"
340  "1: \n\t"
    /* Rows 0 and 1: mm0/mm1 hold the left halves, mm2/mm3 the right halves. */
341  "movq (%1), %%mm0 \n\t"
342  "movq (%1, %3), %%mm1 \n\t"
343  "movq 8(%1), %%mm2 \n\t"
344  "movq 8(%1, %3), %%mm3 \n\t"
345  PAVGB" 1(%1), %%mm0 \n\t"
346  PAVGB" 1(%1, %3), %%mm1 \n\t"
347  PAVGB" 9(%1), %%mm2 \n\t"
348  PAVGB" 9(%1, %3), %%mm3 \n\t"
349  "movq %%mm0, (%2) \n\t"
350  "movq %%mm1, (%2, %3) \n\t"
351  "movq %%mm2, 8(%2) \n\t"
352  "movq %%mm3, 8(%2, %3) \n\t"
353  "add %%"REG_a", %1 \n\t"
354  "add %%"REG_a", %2 \n\t"
    /* Rows 2 and 3, same pattern. */
355  "movq (%1), %%mm0 \n\t"
356  "movq (%1, %3), %%mm1 \n\t"
357  "movq 8(%1), %%mm2 \n\t"
358  "movq 8(%1, %3), %%mm3 \n\t"
359  PAVGB" 1(%1), %%mm0 \n\t"
360  PAVGB" 1(%1, %3), %%mm1 \n\t"
361  PAVGB" 9(%1), %%mm2 \n\t"
362  PAVGB" 9(%1, %3), %%mm3 \n\t"
363  "add %%"REG_a", %1 \n\t"
364  "movq %%mm0, (%2) \n\t"
365  "movq %%mm1, (%2, %3) \n\t"
366  "movq %%mm2, 8(%2) \n\t"
367  "movq %%mm3, 8(%2, %3) \n\t"
368  "add %%"REG_a", %2 \n\t"
    /* Four rows done; loop until h is exactly zero. */
369  "subl $4, %0 \n\t"
370  "jnz 1b \n\t"
371  :"+g"(h), "+S"(pixels), "+D"(block)
372  :"r" ((x86_reg)line_size)
373  :"%"REG_a, "memory");
374 }
375 
/**
 * Write the byte-wise PAVGB average of two sources to dst, 16 bytes per row
 * (handled as two 8-byte halves).
 *
 * src1 is stepped by src1Stride and dst by dstStride; src2 is read as tightly
 * packed rows (it advances by exactly 16 bytes per row).
 *
 * If h is odd, one row is processed up front and h is decremented; the main
 * loop then handles two rows per iteration and exits only when h reaches
 * exactly 0 (jnz).
 * NOTE(review): only h = 2k or 2k + 1 with k >= 1 terminates after exactly h
 * rows (h == 1 would fall into the loop with h == 0) - confirm callers.
 *
 * @param dst        destination rows
 * @param src1       first source, strided
 * @param src2       second source, packed 16 bytes per row
 * @param dstStride  byte stride of dst
 * @param src1Stride byte stride of src1
 * @param h          row count (see the restriction above)
 */
376 static void DEF(put_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
377 {
378  __asm__ volatile(
    /* Operands: %0 = h, %1 = src1, %2 = src2, %3 = dst,
     *           %4 = src1Stride, %5 = dstStride. */
    /* Odd h: peel off a single row first. */
379  "testl $1, %0 \n\t"
380  " jz 1f \n\t"
381  "movq (%1), %%mm0 \n\t"
382  "movq 8(%1), %%mm1 \n\t"
383  PAVGB" (%2), %%mm0 \n\t"
384  PAVGB" 8(%2), %%mm1 \n\t"
385  "add %4, %1 \n\t"
386  "add $16, %2 \n\t"
387  "movq %%mm0, (%3) \n\t"
388  "movq %%mm1, 8(%3) \n\t"
389  "add %5, %3 \n\t"
390  "decl %0 \n\t"
391  "1: \n\t"
    /* Row 0: src2 bytes 0..15. */
392  "movq (%1), %%mm0 \n\t"
393  "movq 8(%1), %%mm1 \n\t"
394  "add %4, %1 \n\t"
395  PAVGB" (%2), %%mm0 \n\t"
396  PAVGB" 8(%2), %%mm1 \n\t"
397  "movq %%mm0, (%3) \n\t"
398  "movq %%mm1, 8(%3) \n\t"
399  "add %5, %3 \n\t"
    /* Row 1: src2 bytes 16..31. */
400  "movq (%1), %%mm0 \n\t"
401  "movq 8(%1), %%mm1 \n\t"
402  "add %4, %1 \n\t"
403  PAVGB" 16(%2), %%mm0 \n\t"
404  PAVGB" 24(%2), %%mm1 \n\t"
405  "movq %%mm0, (%3) \n\t"
406  "movq %%mm1, 8(%3) \n\t"
407  "add %5, %3 \n\t"
    /* Two packed src2 rows consumed. */
408  "add $32, %2 \n\t"
409  "subl $2, %0 \n\t"
410  "jnz 1b \n\t"
    /* h lives in memory when EBX cannot be used, otherwise in EBX. */
411 #if !HAVE_EBX_AVAILABLE //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
412  :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
413 #else
414  :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
415 #endif
416  :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
417  :"memory");
418 //the following should be used, though better not with gcc ...
419 /* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
420  :"r"(src1Stride), "r"(dstStride)
421  :"memory");*/
422 }
423 
/**
 * Average two sources with PAVGB, then average that result with the bytes
 * already in dst, and store it back to dst - 16 bytes per row (two 8-byte
 * halves).
 *
 * src1 is stepped by src1Stride and dst by dstStride; src2 is read as tightly
 * packed rows (it advances by exactly 16 bytes per row).
 *
 * If h is odd, one row is processed up front and h is decremented; the main
 * loop then handles two rows per iteration and exits only when h reaches
 * exactly 0 (jnz).
 * NOTE(review): only h = 2k or 2k + 1 with k >= 1 terminates after exactly h
 * rows - confirm callers respect this.
 *
 * @param dst        destination rows (read and written)
 * @param src1       first source, strided
 * @param src2       second source, packed 16 bytes per row
 * @param dstStride  byte stride of dst
 * @param src1Stride byte stride of src1
 * @param h          row count (see the restriction above)
 */
424 static void DEF(avg_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
425 {
426  __asm__ volatile(
    /* Operands: %0 = h, %1 = src1, %2 = src2, %3 = dst,
     *           %4 = src1Stride, %5 = dstStride. */
    /* Odd h: peel off a single row first. */
427  "testl $1, %0 \n\t"
428  " jz 1f \n\t"
429  "movq (%1), %%mm0 \n\t"
430  "movq 8(%1), %%mm1 \n\t"
431  PAVGB" (%2), %%mm0 \n\t"
432  PAVGB" 8(%2), %%mm1 \n\t"
433  "add %4, %1 \n\t"
434  "add $16, %2 \n\t"
    /* Blend with what is already in dst. */
435  PAVGB" (%3), %%mm0 \n\t"
436  PAVGB" 8(%3), %%mm1 \n\t"
437  "movq %%mm0, (%3) \n\t"
438  "movq %%mm1, 8(%3) \n\t"
439  "add %5, %3 \n\t"
440  "decl %0 \n\t"
441  "1: \n\t"
    /* Row 0: src2 bytes 0..15. */
442  "movq (%1), %%mm0 \n\t"
443  "movq 8(%1), %%mm1 \n\t"
444  "add %4, %1 \n\t"
445  PAVGB" (%2), %%mm0 \n\t"
446  PAVGB" 8(%2), %%mm1 \n\t"
447  PAVGB" (%3), %%mm0 \n\t"
448  PAVGB" 8(%3), %%mm1 \n\t"
449  "movq %%mm0, (%3) \n\t"
450  "movq %%mm1, 8(%3) \n\t"
451  "add %5, %3 \n\t"
    /* Row 1: src2 bytes 16..31. */
452  "movq (%1), %%mm0 \n\t"
453  "movq 8(%1), %%mm1 \n\t"
454  "add %4, %1 \n\t"
455  PAVGB" 16(%2), %%mm0 \n\t"
456  PAVGB" 24(%2), %%mm1 \n\t"
457  PAVGB" (%3), %%mm0 \n\t"
458  PAVGB" 8(%3), %%mm1 \n\t"
459  "movq %%mm0, (%3) \n\t"
460  "movq %%mm1, 8(%3) \n\t"
461  "add %5, %3 \n\t"
    /* Two packed src2 rows consumed. */
462  "add $32, %2 \n\t"
463  "subl $2, %0 \n\t"
464  "jnz 1b \n\t"
    /* h lives in memory when EBX cannot be used, otherwise in EBX. */
465 #if !HAVE_EBX_AVAILABLE //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
466  :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
467 #else
468  :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
469 #endif
470  :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
471  :"memory");
472 //the following should be used, though better not with gcc ...
473 /* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
474  :"r"(src1Stride), "r"(dstStride)
475  :"memory");*/
476 }
477 
/**
 * "No rounding" average of two sources written to dst, 16 bytes per row
 * (two 8-byte halves).
 *
 * mm6 is set to all ones; both inputs are bit-complemented (pxor with mm6),
 * combined with PAVGB, and the result is complemented again.
 * NOTE(review): this yields a round-down average provided PAVGB itself rounds
 * up - confirm against the PAVGB definition in the including file.
 *
 * src1 is stepped by src1Stride and dst by dstStride; src2 is read as tightly
 * packed rows (16 bytes per row). An odd h is handled by one single-row step;
 * the main loop then does two rows per iteration and exits only when h is
 * exactly 0.
 * NOTE(review): only h = 2k or 2k + 1 with k >= 1 terminates after exactly h
 * rows - confirm callers respect this.
 *
 * @param dst        destination rows
 * @param src1       first source, strided
 * @param src2       second source, packed 16 bytes per row
 * @param dstStride  byte stride of dst
 * @param src1Stride byte stride of src1
 * @param h          row count (see the restriction above)
 */
478 static void DEF(put_no_rnd_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
479 {
480  __asm__ volatile(
    /* Operands: %0 = h, %1 = src1, %2 = src2, %3 = dst,
     *           %4 = src1Stride, %5 = dstStride. */
    /* mm6 = 0xFF in every byte, used as the complement mask. */
481  "pcmpeqb %%mm6, %%mm6 \n\t"
    /* Odd h: peel off a single row first. */
482  "testl $1, %0 \n\t"
483  " jz 1f \n\t"
484  "movq (%1), %%mm0 \n\t"
485  "movq 8(%1), %%mm1 \n\t"
486  "movq (%2), %%mm2 \n\t"
487  "movq 8(%2), %%mm3 \n\t"
488  "pxor %%mm6, %%mm0 \n\t"
489  "pxor %%mm6, %%mm1 \n\t"
490  "pxor %%mm6, %%mm2 \n\t"
491  "pxor %%mm6, %%mm3 \n\t"
492  PAVGB" %%mm2, %%mm0 \n\t"
493  PAVGB" %%mm3, %%mm1 \n\t"
494  "pxor %%mm6, %%mm0 \n\t"
495  "pxor %%mm6, %%mm1 \n\t"
496  "add %4, %1 \n\t"
497  "add $16, %2 \n\t"
498  "movq %%mm0, (%3) \n\t"
499  "movq %%mm1, 8(%3) \n\t"
500  "add %5, %3 \n\t"
501  "decl %0 \n\t"
502  "1: \n\t"
    /* Row 0: src2 bytes 0..15. */
503  "movq (%1), %%mm0 \n\t"
504  "movq 8(%1), %%mm1 \n\t"
505  "add %4, %1 \n\t"
506  "movq (%2), %%mm2 \n\t"
507  "movq 8(%2), %%mm3 \n\t"
508  "pxor %%mm6, %%mm0 \n\t"
509  "pxor %%mm6, %%mm1 \n\t"
510  "pxor %%mm6, %%mm2 \n\t"
511  "pxor %%mm6, %%mm3 \n\t"
512  PAVGB" %%mm2, %%mm0 \n\t"
513  PAVGB" %%mm3, %%mm1 \n\t"
514  "pxor %%mm6, %%mm0 \n\t"
515  "pxor %%mm6, %%mm1 \n\t"
516  "movq %%mm0, (%3) \n\t"
517  "movq %%mm1, 8(%3) \n\t"
518  "add %5, %3 \n\t"
    /* Row 1: src2 bytes 16..31. */
519  "movq (%1), %%mm0 \n\t"
520  "movq 8(%1), %%mm1 \n\t"
521  "add %4, %1 \n\t"
522  "movq 16(%2), %%mm2 \n\t"
523  "movq 24(%2), %%mm3 \n\t"
524  "pxor %%mm6, %%mm0 \n\t"
525  "pxor %%mm6, %%mm1 \n\t"
526  "pxor %%mm6, %%mm2 \n\t"
527  "pxor %%mm6, %%mm3 \n\t"
528  PAVGB" %%mm2, %%mm0 \n\t"
529  PAVGB" %%mm3, %%mm1 \n\t"
530  "pxor %%mm6, %%mm0 \n\t"
531  "pxor %%mm6, %%mm1 \n\t"
532  "movq %%mm0, (%3) \n\t"
533  "movq %%mm1, 8(%3) \n\t"
534  "add %5, %3 \n\t"
    /* Two packed src2 rows consumed. */
535  "add $32, %2 \n\t"
536  "subl $2, %0 \n\t"
537  "jnz 1b \n\t"
    /* h lives in memory when EBX cannot be used, otherwise in EBX. */
538 #if !HAVE_EBX_AVAILABLE //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
539  :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
540 #else
541  :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
542 #endif
543  :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
544  :"memory");
545 //the following should be used, though better not with gcc ...
546 /* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
547  :"r"(src1Stride), "r"(dstStride)
548  :"memory");*/
549 }
550 
551 /* GL: this function does incorrect rounding if overflow */
/**
 * Approximate "no rounding" horizontal half-pel put, 8 bytes per row.
 *
 * For each row, mm6 is subtracted with unsigned saturation (psubusb) from the
 * bytes read at offset 0, and the result is combined by PAVGB with the bytes
 * read at offset 1.
 * NOTE(review): MOVQ_BONE is defined outside this file; presumably it loads
 * 0x01 into every byte of mm6 - confirm. Under that assumption the saturating
 * subtract is what makes the rounding inexact for some inputs.
 *
 * @param block     destination, advanced by line_size per row
 * @param pixels    source, advanced by line_size per row
 * @param line_size byte stride shared by source and destination
 * @param h         row count; four rows per loop iteration, loop ends only at
 *                  exactly 0 (jnz), so h must be a positive multiple of 4
 */
552 static void DEF(put_no_rnd_pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
553 {
554  MOVQ_BONE(mm6);
555  __asm__ volatile(
    /* Operands: %0 = h, %1 = pixels, %2 = block, %3 = line_size. */
    /* REG_a = 2 * line_size, the step for one pair of rows. */
556  "lea (%3, %3), %%"REG_a" \n\t"
557  "1: \n\t"
    /* Rows 0 and 1: mm0/mm2 = row at offset 0, mm1/mm3 = row at offset 1. */
558  "movq (%1), %%mm0 \n\t"
559  "movq (%1, %3), %%mm2 \n\t"
560  "movq 1(%1), %%mm1 \n\t"
561  "movq 1(%1, %3), %%mm3 \n\t"
562  "add %%"REG_a", %1 \n\t"
563  "psubusb %%mm6, %%mm0 \n\t"
564  "psubusb %%mm6, %%mm2 \n\t"
565  PAVGB" %%mm1, %%mm0 \n\t"
566  PAVGB" %%mm3, %%mm2 \n\t"
567  "movq %%mm0, (%2) \n\t"
568  "movq %%mm2, (%2, %3) \n\t"
    /* Rows 2 and 3, same pattern. */
569  "movq (%1), %%mm0 \n\t"
570  "movq 1(%1), %%mm1 \n\t"
571  "movq (%1, %3), %%mm2 \n\t"
572  "movq 1(%1, %3), %%mm3 \n\t"
573  "add %%"REG_a", %2 \n\t"
574  "add %%"REG_a", %1 \n\t"
575  "psubusb %%mm6, %%mm0 \n\t"
576  "psubusb %%mm6, %%mm2 \n\t"
577  PAVGB" %%mm1, %%mm0 \n\t"
578  PAVGB" %%mm3, %%mm2 \n\t"
579  "movq %%mm0, (%2) \n\t"
580  "movq %%mm2, (%2, %3) \n\t"
581  "add %%"REG_a", %2 \n\t"
    /* Four rows done; loop until h is exactly zero. */
582  "subl $4, %0 \n\t"
583  "jnz 1b \n\t"
584  :"+g"(h), "+S"(pixels), "+D"(block)
585  :"r" ((x86_reg)line_size)
586  :"%"REG_a, "memory");
587 }
588 
/**
 * "No rounding" horizontal half-pel put, 8 bytes per row, using the
 * complement trick: both inputs (row at offset 0 and row at offset 1) are
 * bit-complemented with mm6 = all ones, combined with PAVGB, and the result
 * is complemented again before being stored.
 * NOTE(review): this gives an exact round-down average provided PAVGB rounds
 * up - confirm against the PAVGB definition in the including file.
 *
 * @param block     destination, advanced by 4 * line_size per iteration
 * @param pixels    source, advanced by 4 * line_size per iteration
 * @param line_size byte stride shared by source and destination
 * @param h         row count; four rows are written per iteration and the loop
 *                  repeats while h is still > 0 after subtracting 4 (jg)
 */
589 static void DEF(put_no_rnd_pixels8_x2_exact)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
590 {
591  __asm__ volatile (
    /* Operands: %0 = h, %1 = pixels, %2 = block,
     *           %3 = line_size, %4 = 3 * line_size. */
    /* mm6 = 0xFF in every byte, used as the complement mask. */
592  "pcmpeqb %%mm6, %%mm6 \n\t"
593  "1: \n\t"
    /* Rows 0 and 1. */
594  "movq (%1), %%mm0 \n\t"
595  "movq (%1, %3), %%mm2 \n\t"
596  "movq 1(%1), %%mm1 \n\t"
597  "movq 1(%1, %3), %%mm3 \n\t"
598  "pxor %%mm6, %%mm0 \n\t"
599  "pxor %%mm6, %%mm2 \n\t"
600  "pxor %%mm6, %%mm1 \n\t"
601  "pxor %%mm6, %%mm3 \n\t"
602  PAVGB" %%mm1, %%mm0 \n\t"
603  PAVGB" %%mm3, %%mm2 \n\t"
604  "pxor %%mm6, %%mm0 \n\t"
605  "pxor %%mm6, %%mm2 \n\t"
606  "movq %%mm0, (%2) \n\t"
607  "movq %%mm2, (%2, %3) \n\t"
    /* Rows 2 (offset 2 * line_size) and 3 (offset 3 * line_size). */
608  "movq (%1, %3,2), %%mm0 \n\t"
609  "movq 1(%1, %3,2), %%mm1 \n\t"
610  "movq (%1, %4), %%mm2 \n\t"
611  "movq 1(%1, %4), %%mm3 \n\t"
612  "pxor %%mm6, %%mm0 \n\t"
613  "pxor %%mm6, %%mm1 \n\t"
614  "pxor %%mm6, %%mm2 \n\t"
615  "pxor %%mm6, %%mm3 \n\t"
616  PAVGB" %%mm1, %%mm0 \n\t"
617  PAVGB" %%mm3, %%mm2 \n\t"
618  "pxor %%mm6, %%mm0 \n\t"
619  "pxor %%mm6, %%mm2 \n\t"
620  "movq %%mm0, (%2, %3,2) \n\t"
621  "movq %%mm2, (%2, %4) \n\t"
    /* Advance both pointers by four rows. */
622  "lea (%1, %3,4), %1 \n\t"
623  "lea (%2, %3,4), %2 \n\t"
624  "subl $4, %0 \n\t"
625  "jg 1b \n\t"
626  : "+g"(h), "+r"(pixels), "+r"(block)
627  : "r" ((x86_reg)line_size), "r"((x86_reg)3*line_size)
628  : "memory"
629  );
630 }
631 
/**
 * Vertical half-pel "put" for an 8-byte-wide block: output row r is the PAVGB
 * of source rows r and r + 1, so h + 1 source rows are read in total.
 *
 * The most recently loaded source row is kept in a register across loop
 * iterations so that every source row is loaded only once.
 *
 * @param block     destination, line_size bytes between rows
 * @param pixels    source, line_size bytes between rows
 * @param line_size byte stride shared by source and destination
 * @param h         row count; four rows per loop iteration, loop ends only at
 *                  exactly 0 (jnz), so h must be a positive multiple of 4
 */
632 static void DEF(put_pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
633 {
634  __asm__ volatile(
    /* Operands: %0 = h, %1 = pixels, %2 = block, %3 = line_size. */
    /* REG_a = 2 * line_size. */
635  "lea (%3, %3), %%"REG_a" \n\t"
    /* Prime mm0 with source row 0. */
636  "movq (%1), %%mm0 \n\t"
    /* Bias block back one row: stores below use (%2,%3) and (%2,REG_a). */
637  "sub %3, %2 \n\t"
638  "1: \n\t"
    /* mm1 = next row, mm2 = row after it; emit two averaged rows. */
639  "movq (%1, %3), %%mm1 \n\t"
640  "movq (%1, %%"REG_a"), %%mm2 \n\t"
641  "add %%"REG_a", %1 \n\t"
642  PAVGB" %%mm1, %%mm0 \n\t"
643  PAVGB" %%mm2, %%mm1 \n\t"
644  "movq %%mm0, (%2, %3) \n\t"
645  "movq %%mm1, (%2, %%"REG_a") \n\t"
    /* Second pair: mm2 is carried in, mm0 is reloaded as the new last row. */
646  "movq (%1, %3), %%mm1 \n\t"
647  "movq (%1, %%"REG_a"), %%mm0 \n\t"
648  "add %%"REG_a", %2 \n\t"
649  "add %%"REG_a", %1 \n\t"
650  PAVGB" %%mm1, %%mm2 \n\t"
651  PAVGB" %%mm0, %%mm1 \n\t"
652  "movq %%mm2, (%2, %3) \n\t"
653  "movq %%mm1, (%2, %%"REG_a") \n\t"
654  "add %%"REG_a", %2 \n\t"
    /* Four rows done; loop until h is exactly zero. */
655  "subl $4, %0 \n\t"
656  "jnz 1b \n\t"
657  :"+g"(h), "+S"(pixels), "+D" (block)
658  :"r" ((x86_reg)line_size)
659  :"%"REG_a, "memory");
660 }
661 
662 /* GL: this function does incorrect rounding if overflow */
/**
 * Approximate "no rounding" vertical half-pel put, 8 bytes per row: output
 * row r is built from source rows r and r + 1 with PAVGB, after mm6 has been
 * subtracted with unsigned saturation (psubusb) from every second source row
 * (the row held in mm1 in each half of the loop body).
 * NOTE(review): MOVQ_BONE is defined outside this file; presumably it loads
 * 0x01 into every byte of mm6 - confirm. Under that assumption the saturating
 * subtract is what makes the rounding inexact for some inputs.
 *
 * @param block     destination, line_size bytes between rows
 * @param pixels    source, line_size bytes between rows; h + 1 rows are read
 * @param line_size byte stride shared by source and destination
 * @param h         row count; four rows per loop iteration, loop ends only at
 *                  exactly 0 (jnz), so h must be a positive multiple of 4
 */
663 static void DEF(put_no_rnd_pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
664 {
665  MOVQ_BONE(mm6);
666  __asm__ volatile(
    /* Operands: %0 = h, %1 = pixels, %2 = block, %3 = line_size. */
    /* REG_a = 2 * line_size. */
667  "lea (%3, %3), %%"REG_a" \n\t"
    /* Prime mm0 with source row 0. */
668  "movq (%1), %%mm0 \n\t"
    /* Bias block back one row: stores below use (%2,%3) and (%2,REG_a). */
669  "sub %3, %2 \n\t"
670  "1: \n\t"
671  "movq (%1, %3), %%mm1 \n\t"
672  "movq (%1, %%"REG_a"), %%mm2 \n\t"
673  "add %%"REG_a", %1 \n\t"
    /* The shared middle row is reduced by mm6 before both averages. */
674  "psubusb %%mm6, %%mm1 \n\t"
675  PAVGB" %%mm1, %%mm0 \n\t"
676  PAVGB" %%mm2, %%mm1 \n\t"
677  "movq %%mm0, (%2, %3) \n\t"
678  "movq %%mm1, (%2, %%"REG_a") \n\t"
    /* Second pair: mm2 is carried in, mm0 is reloaded as the new last row. */
679  "movq (%1, %3), %%mm1 \n\t"
680  "movq (%1, %%"REG_a"), %%mm0 \n\t"
681  "add %%"REG_a", %2 \n\t"
682  "add %%"REG_a", %1 \n\t"
683  "psubusb %%mm6, %%mm1 \n\t"
684  PAVGB" %%mm1, %%mm2 \n\t"
685  PAVGB" %%mm0, %%mm1 \n\t"
686  "movq %%mm2, (%2, %3) \n\t"
687  "movq %%mm1, (%2, %%"REG_a") \n\t"
688  "add %%"REG_a", %2 \n\t"
    /* Four rows done; loop until h is exactly zero. */
689  "subl $4, %0 \n\t"
690  "jnz 1b \n\t"
691  :"+g"(h), "+S"(pixels), "+D" (block)
692  :"r" ((x86_reg)line_size)
693  :"%"REG_a, "memory");
694 }
695 
/**
 * "No rounding" vertical half-pel put, 8 bytes per row, using the complement
 * trick: source rows are bit-complemented with mm6 = all ones, adjacent rows
 * are combined with PAVGB, and each result is complemented again before it is
 * stored. The last source row is carried between iterations in complemented
 * form.
 * NOTE(review): this gives an exact round-down average provided PAVGB rounds
 * up - confirm against the PAVGB definition in the including file.
 *
 * @param block     destination, advanced by 4 * line_size per iteration
 * @param pixels    source; h + 1 rows are read
 * @param line_size byte stride shared by source and destination
 * @param h         row count; four rows are written per iteration and the loop
 *                  repeats while h is still > 0 after subtracting 4 (jg)
 */
696 static void DEF(put_no_rnd_pixels8_y2_exact)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
697 {
698  __asm__ volatile (
    /* Operands: %0 = h, %1 = pixels, %2 = block,
     *           %3 = line_size, %4 = 3 * line_size. */
    /* Prime mm0 with the complement of source row 0, then step to row 1. */
699  "movq (%1), %%mm0 \n\t"
700  "pcmpeqb %%mm6, %%mm6 \n\t"
701  "add %3, %1 \n\t"
702  "pxor %%mm6, %%mm0 \n\t"
703  "1: \n\t"
    /* Output rows 0 and 1 of this group. */
704  "movq (%1), %%mm1 \n\t"
705  "movq (%1, %3), %%mm2 \n\t"
706  "pxor %%mm6, %%mm1 \n\t"
707  "pxor %%mm6, %%mm2 \n\t"
708  PAVGB" %%mm1, %%mm0 \n\t"
709  PAVGB" %%mm2, %%mm1 \n\t"
710  "pxor %%mm6, %%mm0 \n\t"
711  "pxor %%mm6, %%mm1 \n\t"
712  "movq %%mm0, (%2) \n\t"
713  "movq %%mm1, (%2, %3) \n\t"
    /* Output rows 2 and 3; mm2 still holds a complemented source row and
     * mm0 is left complemented for the next iteration. */
714  "movq (%1, %3,2), %%mm1 \n\t"
715  "movq (%1, %4), %%mm0 \n\t"
716  "pxor %%mm6, %%mm1 \n\t"
717  "pxor %%mm6, %%mm0 \n\t"
718  PAVGB" %%mm1, %%mm2 \n\t"
719  PAVGB" %%mm0, %%mm1 \n\t"
720  "pxor %%mm6, %%mm2 \n\t"
721  "pxor %%mm6, %%mm1 \n\t"
722  "movq %%mm2, (%2, %3,2) \n\t"
723  "movq %%mm1, (%2, %4) \n\t"
    /* Advance both pointers by four rows. */
724  "lea (%1, %3,4), %1 \n\t"
725  "lea (%2, %3,4), %2 \n\t"
726  "subl $4, %0 \n\t"
727  "jg 1b \n\t"
728  :"+g"(h), "+r"(pixels), "+r" (block)
729  :"r" ((x86_reg)line_size), "r"((x86_reg)3*line_size)
730  :"memory"
731  );
732 }
733 
/**
 * Blend an 8-byte-wide source block into the destination: every row of block
 * is replaced by PAVGB(block row, pixels row).
 *
 * @param block     destination, read and written; advanced by line_size per row
 * @param pixels    source, advanced by line_size per row
 * @param line_size byte stride shared by source and destination
 * @param h         row count; four rows per loop iteration, loop ends only at
 *                  exactly 0 (jnz), so h must be a positive multiple of 4
 */
734 static void DEF(avg_pixels8)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
735 {
736  __asm__ volatile(
    /* Operands: %0 = h, %1 = pixels, %2 = block, %3 = line_size. */
    /* REG_a = 2 * line_size, the step for one pair of rows. */
737  "lea (%3, %3), %%"REG_a" \n\t"
738  "1: \n\t"
    /* Rows 0 and 1: load dst, average with src, store back. */
739  "movq (%2), %%mm0 \n\t"
740  "movq (%2, %3), %%mm1 \n\t"
741  PAVGB" (%1), %%mm0 \n\t"
742  PAVGB" (%1, %3), %%mm1 \n\t"
743  "movq %%mm0, (%2) \n\t"
744  "movq %%mm1, (%2, %3) \n\t"
745  "add %%"REG_a", %1 \n\t"
746  "add %%"REG_a", %2 \n\t"
    /* Rows 2 and 3, same pattern. */
747  "movq (%2), %%mm0 \n\t"
748  "movq (%2, %3), %%mm1 \n\t"
749  PAVGB" (%1), %%mm0 \n\t"
750  PAVGB" (%1, %3), %%mm1 \n\t"
751  "add %%"REG_a", %1 \n\t"
752  "movq %%mm0, (%2) \n\t"
753  "movq %%mm1, (%2, %3) \n\t"
754  "add %%"REG_a", %2 \n\t"
    /* Four rows done; loop until h is exactly zero. */
755  "subl $4, %0 \n\t"
756  "jnz 1b \n\t"
757  :"+g"(h), "+S"(pixels), "+D"(block)
758  :"r" ((x86_reg)line_size)
759  :"%"REG_a, "memory");
760 }
761 
/**
 * Horizontal half-pel "avg" for an 8-byte-wide block: for each row, the source
 * read at offset 0 is combined by PAVGB with the source read at offset 1, and
 * that result is then combined by PAVGB with the existing destination row and
 * stored back.
 *
 * @param block     destination, read and written; advanced by line_size per row
 * @param pixels    source, advanced by line_size per row
 * @param line_size byte stride shared by source and destination
 * @param h         row count; four rows per loop iteration, loop ends only at
 *                  exactly 0 (jnz), so h must be a positive multiple of 4
 */
762 static void DEF(avg_pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
763 {
764  __asm__ volatile(
    /* Operands: %0 = h, %1 = pixels, %2 = block, %3 = line_size. */
    /* REG_a = 2 * line_size, the step for one pair of rows. */
765  "lea (%3, %3), %%"REG_a" \n\t"
766  "1: \n\t"
    /* Rows 0 and 1: horizontal average, then blend with dst. */
767  "movq (%1), %%mm0 \n\t"
768  "movq (%1, %3), %%mm2 \n\t"
769  PAVGB" 1(%1), %%mm0 \n\t"
770  PAVGB" 1(%1, %3), %%mm2 \n\t"
771  PAVGB" (%2), %%mm0 \n\t"
772  PAVGB" (%2, %3), %%mm2 \n\t"
773  "add %%"REG_a", %1 \n\t"
774  "movq %%mm0, (%2) \n\t"
775  "movq %%mm2, (%2, %3) \n\t"
    /* Rows 2 and 3; block is advanced before the dst blend below. */
776  "movq (%1), %%mm0 \n\t"
777  "movq (%1, %3), %%mm2 \n\t"
778  PAVGB" 1(%1), %%mm0 \n\t"
779  PAVGB" 1(%1, %3), %%mm2 \n\t"
780  "add %%"REG_a", %2 \n\t"
781  "add %%"REG_a", %1 \n\t"
782  PAVGB" (%2), %%mm0 \n\t"
783  PAVGB" (%2, %3), %%mm2 \n\t"
784  "movq %%mm0, (%2) \n\t"
785  "movq %%mm2, (%2, %3) \n\t"
786  "add %%"REG_a", %2 \n\t"
    /* Four rows done; loop until h is exactly zero. */
787  "subl $4, %0 \n\t"
788  "jnz 1b \n\t"
789  :"+g"(h), "+S"(pixels), "+D"(block)
790  :"r" ((x86_reg)line_size)
791  :"%"REG_a, "memory");
792 }
793 
/**
 * Vertical half-pel "avg" for an 8-byte-wide block: source rows r and r + 1
 * are combined with PAVGB, and that result is combined by PAVGB with the
 * existing destination row r and stored back. h + 1 source rows are read.
 *
 * The most recently loaded source row is kept in a register across loop
 * iterations; mm3 and mm4 hold the destination rows being blended.
 *
 * @param block     destination, read and written
 * @param pixels    source, line_size bytes between rows
 * @param line_size byte stride shared by source and destination
 * @param h         row count; four rows per loop iteration, loop ends only at
 *                  exactly 0 (jnz), so h must be a positive multiple of 4
 */
794 static void DEF(avg_pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
795 {
796  __asm__ volatile(
    /* Operands: %0 = h, %1 = pixels, %2 = block, %3 = line_size. */
    /* REG_a = 2 * line_size. */
797  "lea (%3, %3), %%"REG_a" \n\t"
    /* Prime mm0 with source row 0. */
798  "movq (%1), %%mm0 \n\t"
    /* Bias block back one row: dst accesses use (%2,%3) and (%2,REG_a). */
799  "sub %3, %2 \n\t"
800  "1: \n\t"
    /* First pair: vertical averages in mm0 and mm1. */
801  "movq (%1, %3), %%mm1 \n\t"
802  "movq (%1, %%"REG_a"), %%mm2 \n\t"
803  "add %%"REG_a", %1 \n\t"
804  PAVGB" %%mm1, %%mm0 \n\t"
805  PAVGB" %%mm2, %%mm1 \n\t"
    /* Blend with the two destination rows and store. */
806  "movq (%2, %3), %%mm3 \n\t"
807  "movq (%2, %%"REG_a"), %%mm4 \n\t"
808  PAVGB" %%mm3, %%mm0 \n\t"
809  PAVGB" %%mm4, %%mm1 \n\t"
810  "movq %%mm0, (%2, %3) \n\t"
811  "movq %%mm1, (%2, %%"REG_a") \n\t"
    /* Second pair: mm2 is carried in, mm0 is reloaded as the new last row. */
812  "movq (%1, %3), %%mm1 \n\t"
813  "movq (%1, %%"REG_a"), %%mm0 \n\t"
814  PAVGB" %%mm1, %%mm2 \n\t"
815  PAVGB" %%mm0, %%mm1 \n\t"
816  "add %%"REG_a", %2 \n\t"
817  "add %%"REG_a", %1 \n\t"
818  "movq (%2, %3), %%mm3 \n\t"
819  "movq (%2, %%"REG_a"), %%mm4 \n\t"
820  PAVGB" %%mm3, %%mm2 \n\t"
821  PAVGB" %%mm4, %%mm1 \n\t"
822  "movq %%mm2, (%2, %3) \n\t"
823  "movq %%mm1, (%2, %%"REG_a") \n\t"
824  "add %%"REG_a", %2 \n\t"
    /* Four rows done; loop until h is exactly zero. */
825  "subl $4, %0 \n\t"
826  "jnz 1b \n\t"
827  :"+g"(h), "+S"(pixels), "+D"(block)
828  :"r" ((x86_reg)line_size)
829  :"%"REG_a, "memory");
830 }
831 
832 /* Note this is not correctly rounded, but this function is only
833  * used for B-frames so it does not matter. */
/**
 * Approximate diagonal (x + y) half-pel "avg" for an 8-byte-wide block.
 *
 * Each source row is first averaged with itself shifted by one byte (PAVGB of
 * offsets 0 and 1); consecutive rows of those horizontal averages are then
 * combined with PAVGB, and the result is combined by PAVGB with the existing
 * destination row and stored back. h + 1 source rows are read.
 *
 * One source row out of every four has mm6 subtracted with unsigned
 * saturation before it is averaged.
 * NOTE(review): MOVQ_BONE is defined outside this file; presumably it loads
 * 0x01 into every byte of mm6 and the subtraction partially offsets the bias
 * of the chained averages - confirm.
 *
 * @param block     destination, read and written
 * @param pixels    source, line_size bytes between rows
 * @param line_size byte stride shared by source and destination
 * @param h         row count; four rows per loop iteration, loop ends only at
 *                  exactly 0 (jnz), so h must be a positive multiple of 4
 */
834 static void DEF(avg_pixels8_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
835 {
836  MOVQ_BONE(mm6);
837  __asm__ volatile(
    /* Operands: %0 = h, %1 = pixels, %2 = block, %3 = line_size. */
    /* REG_a = 2 * line_size. */
838  "lea (%3, %3), %%"REG_a" \n\t"
    /* Prime mm0 with the horizontal average of source row 0. */
839  "movq (%1), %%mm0 \n\t"
840  PAVGB" 1(%1), %%mm0 \n\t"
841  ".p2align 3 \n\t"
842  "1: \n\t"
    /* Horizontal averages of the next two rows; the second one (mm2) has
     * mm6 subtracted from its offset-0 bytes first. */
843  "movq (%1, %%"REG_a"), %%mm2 \n\t"
844  "movq (%1, %3), %%mm1 \n\t"
845  "psubusb %%mm6, %%mm2 \n\t"
846  PAVGB" 1(%1, %3), %%mm1 \n\t"
847  PAVGB" 1(%1, %%"REG_a"), %%mm2 \n\t"
848  "add %%"REG_a", %1 \n\t"
    /* Vertical combine, blend with dst, store two rows. */
849  PAVGB" %%mm1, %%mm0 \n\t"
850  PAVGB" %%mm2, %%mm1 \n\t"
851  PAVGB" (%2), %%mm0 \n\t"
852  PAVGB" (%2, %3), %%mm1 \n\t"
853  "movq %%mm0, (%2) \n\t"
854  "movq %%mm1, (%2, %3) \n\t"
    /* Second pair: mm2 is carried in, mm0 becomes the new last row. */
855  "movq (%1, %3), %%mm1 \n\t"
856  "movq (%1, %%"REG_a"), %%mm0 \n\t"
857  PAVGB" 1(%1, %3), %%mm1 \n\t"
858  PAVGB" 1(%1, %%"REG_a"), %%mm0 \n\t"
859  "add %%"REG_a", %2 \n\t"
860  "add %%"REG_a", %1 \n\t"
861  PAVGB" %%mm1, %%mm2 \n\t"
862  PAVGB" %%mm0, %%mm1 \n\t"
863  PAVGB" (%2), %%mm2 \n\t"
864  PAVGB" (%2, %3), %%mm1 \n\t"
865  "movq %%mm2, (%2) \n\t"
866  "movq %%mm1, (%2, %3) \n\t"
867  "add %%"REG_a", %2 \n\t"
    /* Four rows done; loop until h is exactly zero. */
868  "subl $4, %0 \n\t"
869  "jnz 1b \n\t"
870  :"+g"(h), "+S"(pixels), "+D"(block)
871  :"r" ((x86_reg)line_size)
872  :"%"REG_a, "memory");
873 }
874 
/**
 * Blend a source block into the destination, storing 4 bytes per row: every
 * row of block is replaced by PAVGB(block row, pixels row).
 *
 * Unlike the other loops in this file, the row loop is written in C: each asm
 * statement handles four rows, then both pointers are advanced by
 * 4 * line_size and h is reduced by 4 until it is no longer positive. The
 * body always runs at least once (do/while).
 * NOTE(review): PAVGB is used with a memory operand on pixels; presumably
 * that load is wider than the 4 bytes stored - confirm the source buffer
 * tolerates it.
 *
 * @param block     destination, read and written
 * @param pixels    source
 * @param line_size byte stride shared by source and destination
 * @param h         row count, consumed in groups of four
 */
875 static void DEF(avg_pixels4)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
876 {
877  do {
878  __asm__ volatile(
    /* Operands: %0 = pixels, %1 = block, %2 = line_size,
     *           %3 = 3 * line_size. */
    /* Load four destination rows. */
879  "movd (%1), %%mm0 \n\t"
880  "movd (%1, %2), %%mm1 \n\t"
881  "movd (%1, %2, 2), %%mm2 \n\t"
882  "movd (%1, %3), %%mm3 \n\t"
    /* Average each with the matching source row. */
883  PAVGB" (%0), %%mm0 \n\t"
884  PAVGB" (%0, %2), %%mm1 \n\t"
885  PAVGB" (%0, %2, 2), %%mm2 \n\t"
886  PAVGB" (%0, %3), %%mm3 \n\t"
    /* Store the four rows back to the destination. */
887  "movd %%mm0, (%1) \n\t"
888  "movd %%mm1, (%1, %2) \n\t"
889  "movd %%mm2, (%1, %2, 2) \n\t"
890  "movd %%mm3, (%1, %3) \n\t"
891  ::"S"(pixels), "D"(block),
892  "r" ((x86_reg)line_size), "r"((x86_reg)3L*line_size)
893  :"memory");
    /* Pointers are inputs only to the asm, so they are advanced here in C. */
894  block += 4*line_size;
895  pixels += 4*line_size;
896  h -= 4;
897  } while(h > 0);
898 }
899 
900 //FIXME the following could be optimized too ...
901 static void DEF(put_no_rnd_pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
902  DEF(put_no_rnd_pixels8_x2)(block , pixels , line_size, h);
903  DEF(put_no_rnd_pixels8_x2)(block+8, pixels+8, line_size, h);
904 }
905 static void DEF(put_pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
906  DEF(put_pixels8_y2)(block , pixels , line_size, h);
907  DEF(put_pixels8_y2)(block+8, pixels+8, line_size, h);
908 }
909 static void DEF(put_no_rnd_pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
910  DEF(put_no_rnd_pixels8_y2)(block , pixels , line_size, h);
911  DEF(put_no_rnd_pixels8_y2)(block+8, pixels+8, line_size, h);
912 }
913 static void DEF(avg_pixels16)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
914  DEF(avg_pixels8)(block , pixels , line_size, h);
915  DEF(avg_pixels8)(block+8, pixels+8, line_size, h);
916 }
917 static void DEF(avg_pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
918  DEF(avg_pixels8_x2)(block , pixels , line_size, h);
919  DEF(avg_pixels8_x2)(block+8, pixels+8, line_size, h);
920 }
921 static void DEF(avg_pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
922  DEF(avg_pixels8_y2)(block , pixels , line_size, h);
923  DEF(avg_pixels8_y2)(block+8, pixels+8, line_size, h);
924 }
925 static void DEF(avg_pixels16_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
926  DEF(avg_pixels8_xy2)(block , pixels , line_size, h);
927  DEF(avg_pixels8_xy2)(block+8, pixels+8, line_size, h);
928 }
929 
/*
 * QPEL_2TAP_L3(OPNAME) defines two functions,
 *   DEF(OPNAME ## 2tap_qpel16_l3) - 16 bytes per row (two 8-byte halves)
 *   DEF(OPNAME ## 2tap_qpel8_l3)  -  8 bytes per row
 * with the signature (uint8_t *dst, uint8_t *src, int stride, int h,
 *                     int off1, int off2).
 *
 * For every row the generated code computes
 *   PAVGB(PAVGB(src[off1 + x], src[off2 + x]), src[x])
 * then runs STORE_OP on the destination location (STORE_OP must be defined
 * before the macro is expanded; it may expand to nothing), and finally stores
 * the register to the destination with movq.
 *
 * Asm operands: %0 = h, %1 = src, %2 = off1, %3 = off2, %4 = dst - src,
 * %5 = stride. The destination is addressed as (%1,%4), i.e. relative to the
 * running src pointer, so a single "add %5, %1" advances both source and
 * destination by one row; this implies the same stride for both.
 *
 * One row is processed per iteration; h is decremented and the loop ends only
 * when it reaches exactly 0 (jnz), so h must be positive.
 */
930 #define QPEL_2TAP_L3(OPNAME) \
931 static void DEF(OPNAME ## 2tap_qpel16_l3)(uint8_t *dst, uint8_t *src, int stride, int h, int off1, int off2){\
932  __asm__ volatile(\
933  "1: \n\t"\
934  "movq (%1,%2), %%mm0 \n\t"\
935  "movq 8(%1,%2), %%mm1 \n\t"\
936  PAVGB" (%1,%3), %%mm0 \n\t"\
937  PAVGB" 8(%1,%3), %%mm1 \n\t"\
938  PAVGB" (%1), %%mm0 \n\t"\
939  PAVGB" 8(%1), %%mm1 \n\t"\
940  STORE_OP( (%1,%4),%%mm0)\
941  STORE_OP(8(%1,%4),%%mm1)\
942  "movq %%mm0, (%1,%4) \n\t"\
943  "movq %%mm1, 8(%1,%4) \n\t"\
944  "add %5, %1 \n\t"\
945  "decl %0 \n\t"\
946  "jnz 1b \n\t"\
947  :"+g"(h), "+r"(src)\
948  :"r"((x86_reg)off1), "r"((x86_reg)off2),\
949  "r"((x86_reg)(dst-src)), "r"((x86_reg)stride)\
950  :"memory"\
951  );\
952 }\
953 static void DEF(OPNAME ## 2tap_qpel8_l3)(uint8_t *dst, uint8_t *src, int stride, int h, int off1, int off2){\
954  __asm__ volatile(\
955  "1: \n\t"\
956  "movq (%1,%2), %%mm0 \n\t"\
957  PAVGB" (%1,%3), %%mm0 \n\t"\
958  PAVGB" (%1), %%mm0 \n\t"\
959  STORE_OP((%1,%4),%%mm0)\
960  "movq %%mm0, (%1,%4) \n\t"\
961  "add %5, %1 \n\t"\
962  "decl %0 \n\t"\
963  "jnz 1b \n\t"\
964  :"+g"(h), "+r"(src)\
965  :"r"((x86_reg)off1), "r"((x86_reg)off2),\
966  "r"((x86_reg)(dst-src)), "r"((x86_reg)stride)\
967  :"memory"\
968  );\
969 }
970 
/*
 * Instantiate the 2-tap qpel helpers twice.
 *
 * "avg_" flavour: STORE_OP(a,b) emits one PAVGB with memory operand a and
 * register b, so the computed value is averaged with the bytes already at the
 * destination before the macro's movq writes it back.
 */
971 #define STORE_OP(a,b) PAVGB" "#a","#b" \n\t"
972 QPEL_2TAP_L3(avg_)
973 #undef STORE_OP
/* "put_" flavour: STORE_OP expands to nothing, so the value is stored as is. */
974 #define STORE_OP(a,b)
975 QPEL_2TAP_L3(put_)
976 #undef STORE_OP
/* Both helper macros are local to this template; do not leak them. */
977 #undef QPEL_2TAP_L3