/* dsputil.c — Doxygen source-listing extraction; navigation header removed. */
1 /*
2  * DSP utils
3  * Copyright (c) 2000, 2001 Fabrice Bellard
4  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
5  *
6  * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
7  *
8  * This file is part of Libav.
9  *
10  * Libav is free software; you can redistribute it and/or
11  * modify it under the terms of the GNU Lesser General Public
12  * License as published by the Free Software Foundation; either
13  * version 2.1 of the License, or (at your option) any later version.
14  *
15  * Libav is distributed in the hope that it will be useful,
16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18  * Lesser General Public License for more details.
19  *
20  * You should have received a copy of the GNU Lesser General Public
21  * License along with Libav; if not, write to the Free Software
22  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23  */
24 
30 #include "libavutil/imgutils.h"
31 #include "avcodec.h"
32 #include "dsputil.h"
33 #include "simple_idct.h"
34 #include "faandct.h"
35 #include "faanidct.h"
36 #include "mathops.h"
37 #include "mpegvideo.h"
38 #include "config.h"
39 #include "ac3dec.h"
40 #include "vorbis.h"
41 #include "png.h"
42 
/* Clipping table: presumably indexed as ff_cropTbl[MAX_NEG_CROP + x] to clamp
 * x into 0..255; zero here, filled at runtime by init code not in this
 * listing — TODO confirm against dsputil_init(). */
uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
/* Square table used via (ff_squareTbl + 256)[d] by the SSE/norm functions
 * below, i.e. presumably d*d for d in -256..255; zero until initialized
 * elsewhere — TODO confirm. */
uint32_t ff_squareTbl[512] = {0, };
45 
46 #define BIT_DEPTH 9
47 #include "dsputil_template.c"
48 #undef BIT_DEPTH
49 
50 #define BIT_DEPTH 10
51 #include "dsputil_template.c"
52 #undef BIT_DEPTH
53 
54 #define BIT_DEPTH 8
55 #include "dsputil_template.c"
56 
// 0x7f7f7f7f or 0x7f7f7f7f7f7f7f7f or whatever, depending on the cpu's native arithmetic size
// (~0UL/255 evaluates to 0x0101...01, one byte lane per unsigned long byte)
#define pb_7f (~0UL/255 * 0x7f)
#define pb_80 (~0UL/255 * 0x80)
60 
/* Standard 8x8 zigzag scan order: index = scan position,
 * value = raster (row-major) position. */
const uint8_t ff_zigzag_direct[64] = {
     0,  1,  8, 16,  9,  2,  3, 10,
    17, 24, 32, 25, 18, 11,  4,  5,
    12, 19, 26, 33, 40, 48, 41, 34,
    27, 20, 13,  6,  7, 14, 21, 28,
    35, 42, 49, 56, 57, 50, 43, 36,
    29, 22, 15, 23, 30, 37, 44, 51,
    58, 59, 52, 45, 38, 31, 39, 46,
    53, 60, 61, 54, 47, 55, 62, 63
};
71 
/* Specific zigzag scan for 248 idct. NOTE that unlike the
   specification, we interleave the fields */
const uint8_t ff_zigzag248_direct[64] = {
     0,  8,  1,  9, 16, 24,  2, 10,
    17, 25, 32, 40, 48, 56, 33, 41,
    18, 26,  3, 11,  4, 12, 19, 27,
    34, 42, 49, 57, 50, 58, 35, 43,
    20, 28,  5, 13,  6, 14, 21, 29,
    36, 44, 51, 59, 52, 60, 37, 45,
    22, 30,  7, 15, 23, 31, 38, 46,
    53, 61, 54, 62, 39, 47, 55, 63,
};
84 
/* not permutated inverse zigzag_direct + 1 for MMX quantizer
 * NOTE(review): the table declaration that followed this comment was
 * dropped by the Doxygen extraction (doc line 86); restore from upstream. */
/* Alternate (horizontal-first) scan order for interlaced material. */
const uint8_t ff_alternate_horizontal_scan[64] = {
     0,  1,  2,  3,  8,  9, 16, 17,
    10, 11,  4,  5,  6,  7, 15, 14,
    13, 12, 19, 18, 24, 25, 32, 33,
    26, 27, 20, 21, 22, 23, 28, 29,
    30, 31, 34, 35, 40, 41, 48, 49,
    42, 43, 36, 37, 38, 39, 44, 45,
    46, 47, 50, 51, 56, 57, 58, 59,
    52, 53, 54, 55, 60, 61, 62, 63,
};
98 
/* Alternate (vertical-first) scan order for interlaced material. */
const uint8_t ff_alternate_vertical_scan[64] = {
     0,  8, 16, 24,  1,  9,  2, 10,
    17, 25, 32, 40, 48, 56, 57, 49,
    41, 33, 26, 18,  3, 11,  4, 12,
    19, 27, 34, 42, 50, 58, 35, 43,
    51, 59, 20, 28,  5, 13,  6, 14,
    21, 29, 36, 44, 52, 60, 37, 45,
    53, 61, 22, 30,  7, 15, 23, 31,
    38, 46, 54, 62, 39, 47, 55, 63,
};
109 
/* Input permutation for the simple_idct_mmx */
static const uint8_t simple_mmx_permutation[64]={
        0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
        0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
        0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
        0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
        0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
        0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
        0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
        0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};
121 
/* Per-row column reordering applied by FF_SSE2_IDCT_PERM below. */
static const uint8_t idct_sse2_row_perm[8] = {0, 4, 1, 5, 2, 6, 3, 7};
123 
124 void ff_init_scantable(uint8_t *permutation, ScanTable *st, const uint8_t *src_scantable){
125  int i;
126  int end;
127 
128  st->scantable= src_scantable;
129 
130  for(i=0; i<64; i++){
131  int j;
132  j = src_scantable[i];
133  st->permutated[i] = permutation[j];
134 #if ARCH_PPC
135  st->inverse[j] = i;
136 #endif
137  }
138 
139  end=-1;
140  for(i=0; i<64; i++){
141  int j;
142  j = st->permutated[i];
143  if(j>end) end=j;
144  st->raster_end[i]= end;
145  }
146 }
147 
148 void ff_init_scantable_permutation(uint8_t *idct_permutation,
149  int idct_permutation_type)
150 {
151  int i;
152 
153  switch(idct_permutation_type){
154  case FF_NO_IDCT_PERM:
155  for(i=0; i<64; i++)
156  idct_permutation[i]= i;
157  break;
159  for(i=0; i<64; i++)
160  idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
161  break;
162  case FF_SIMPLE_IDCT_PERM:
163  for(i=0; i<64; i++)
164  idct_permutation[i]= simple_mmx_permutation[i];
165  break;
167  for(i=0; i<64; i++)
168  idct_permutation[i]= ((i&7)<<3) | (i>>3);
169  break;
171  for(i=0; i<64; i++)
172  idct_permutation[i]= (i&0x24) | ((i&3)<<3) | ((i>>3)&3);
173  break;
174  case FF_SSE2_IDCT_PERM:
175  for(i=0; i<64; i++)
176  idct_permutation[i]= (i&0x38) | idct_sse2_row_perm[i&7];
177  break;
178  default:
179  av_log(NULL, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");
180  }
181 }
182 
/* Sum of all 256 pixels of a 16x16 block; rows are line_size bytes apart. */
static int pix_sum_c(uint8_t * pix, int line_size)
{
    int total = 0;
    int row, col;

    for (row = 0; row < 16; row++) {
        for (col = 0; col < 16; col++)
            total += pix[col];
        pix += line_size;
    }
    return total;
}
204 
/* Sum over a 16x16 block of sq[p] for each pixel p, where sq is the
 * runtime-filled ff_squareTbl biased by +256 (presumably p*p — confirm
 * against the table's init code). Rows are line_size bytes apart.
 *
 * NOTE(review): the fast paths read pixels 8 (or 4) at a time through
 * uint64_t/uint32_t pointer casts — a strict-aliasing/alignment hazard
 * by the letter of C11 6.5p7; kept as-is since this listing's callers
 * and build flags are not visible. */
static int pix_norm1_c(uint8_t * pix, int line_size)
{
    int s, i, j;
    uint32_t *sq = ff_squareTbl + 256;

    s = 0;
    for (i = 0; i < 16; i++) {
        for (j = 0; j < 16; j += 8) {
#if 0
            /* reference byte-at-a-time version, disabled */
            s += sq[pix[0]];
            s += sq[pix[1]];
            s += sq[pix[2]];
            s += sq[pix[3]];
            s += sq[pix[4]];
            s += sq[pix[5]];
            s += sq[pix[6]];
            s += sq[pix[7]];
#else
#if HAVE_FAST_64BIT
            /* load 8 pixels in one 64-bit word and index the table per byte */
            register uint64_t x=*(uint64_t*)pix;
            s += sq[x&0xff];
            s += sq[(x>>8)&0xff];
            s += sq[(x>>16)&0xff];
            s += sq[(x>>24)&0xff];
            s += sq[(x>>32)&0xff];
            s += sq[(x>>40)&0xff];
            s += sq[(x>>48)&0xff];
            s += sq[(x>>56)&0xff];
#else
            /* two 32-bit loads cover the same 8 pixels */
            register uint32_t x=*(uint32_t*)pix;
            s += sq[x&0xff];
            s += sq[(x>>8)&0xff];
            s += sq[(x>>16)&0xff];
            s += sq[(x>>24)&0xff];
            x=*(uint32_t*)(pix+4);
            s += sq[x&0xff];
            s += sq[(x>>8)&0xff];
            s += sq[(x>>16)&0xff];
            s += sq[(x>>24)&0xff];
#endif
#endif
            pix += 8;
        }
        pix += line_size - 16;
    }
    return s;
}
252 
/* Byte-swap w 32-bit words from src into dst (may alias).
 * Main loop handles eight words per iteration; tail loop the rest. */
static void bswap_buf(uint32_t *dst, const uint32_t *src, int w){
    int i = 0;

    for (; w - i >= 8; i += 8) {
        dst[i]     = av_bswap32(src[i]);
        dst[i + 1] = av_bswap32(src[i + 1]);
        dst[i + 2] = av_bswap32(src[i + 2]);
        dst[i + 3] = av_bswap32(src[i + 3]);
        dst[i + 4] = av_bswap32(src[i + 4]);
        dst[i + 5] = av_bswap32(src[i + 5]);
        dst[i + 6] = av_bswap32(src[i + 6]);
        dst[i + 7] = av_bswap32(src[i + 7]);
    }
    for (; i < w; i++)
        dst[i] = av_bswap32(src[i]);
}
270 
/* Byte-swap len 16-bit values from src into dst. */
static void bswap16_buf(uint16_t *dst, const uint16_t *src, int len)
{
    int i;

    for (i = 0; i < len; i++)
        dst[i] = av_bswap16(src[i]);
}
276 
277 static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
278 {
279  int s, i;
280  uint32_t *sq = ff_squareTbl + 256;
281 
282  s = 0;
283  for (i = 0; i < h; i++) {
284  s += sq[pix1[0] - pix2[0]];
285  s += sq[pix1[1] - pix2[1]];
286  s += sq[pix1[2] - pix2[2]];
287  s += sq[pix1[3] - pix2[3]];
288  pix1 += line_size;
289  pix2 += line_size;
290  }
291  return s;
292 }
293 
294 static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
295 {
296  int s, i;
297  uint32_t *sq = ff_squareTbl + 256;
298 
299  s = 0;
300  for (i = 0; i < h; i++) {
301  s += sq[pix1[0] - pix2[0]];
302  s += sq[pix1[1] - pix2[1]];
303  s += sq[pix1[2] - pix2[2]];
304  s += sq[pix1[3] - pix2[3]];
305  s += sq[pix1[4] - pix2[4]];
306  s += sq[pix1[5] - pix2[5]];
307  s += sq[pix1[6] - pix2[6]];
308  s += sq[pix1[7] - pix2[7]];
309  pix1 += line_size;
310  pix2 += line_size;
311  }
312  return s;
313 }
314 
315 static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
316 {
317  int s, i;
318  uint32_t *sq = ff_squareTbl + 256;
319 
320  s = 0;
321  for (i = 0; i < h; i++) {
322  s += sq[pix1[ 0] - pix2[ 0]];
323  s += sq[pix1[ 1] - pix2[ 1]];
324  s += sq[pix1[ 2] - pix2[ 2]];
325  s += sq[pix1[ 3] - pix2[ 3]];
326  s += sq[pix1[ 4] - pix2[ 4]];
327  s += sq[pix1[ 5] - pix2[ 5]];
328  s += sq[pix1[ 6] - pix2[ 6]];
329  s += sq[pix1[ 7] - pix2[ 7]];
330  s += sq[pix1[ 8] - pix2[ 8]];
331  s += sq[pix1[ 9] - pix2[ 9]];
332  s += sq[pix1[10] - pix2[10]];
333  s += sq[pix1[11] - pix2[11]];
334  s += sq[pix1[12] - pix2[12]];
335  s += sq[pix1[13] - pix2[13]];
336  s += sq[pix1[14] - pix2[14]];
337  s += sq[pix1[15] - pix2[15]];
338 
339  pix1 += line_size;
340  pix2 += line_size;
341  }
342  return s;
343 }
344 
345 static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
346  const uint8_t *s2, int stride){
347  int i;
348 
349  /* read the pixels */
350  for(i=0;i<8;i++) {
351  block[0] = s1[0] - s2[0];
352  block[1] = s1[1] - s2[1];
353  block[2] = s1[2] - s2[2];
354  block[3] = s1[3] - s2[3];
355  block[4] = s1[4] - s2[4];
356  block[5] = s1[5] - s2[5];
357  block[6] = s1[6] - s2[6];
358  block[7] = s1[7] - s2[7];
359  s1 += stride;
360  s2 += stride;
361  block += 8;
362  }
363 }
364 
365 
366 void ff_put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
367  int line_size)
368 {
369  int i;
370 
371  /* read the pixels */
372  for(i=0;i<8;i++) {
373  pixels[0] = av_clip_uint8(block[0]);
374  pixels[1] = av_clip_uint8(block[1]);
375  pixels[2] = av_clip_uint8(block[2]);
376  pixels[3] = av_clip_uint8(block[3]);
377  pixels[4] = av_clip_uint8(block[4]);
378  pixels[5] = av_clip_uint8(block[5]);
379  pixels[6] = av_clip_uint8(block[6]);
380  pixels[7] = av_clip_uint8(block[7]);
381 
382  pixels += line_size;
383  block += 8;
384  }
385 }
386 
387 static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
388  int line_size)
389 {
390  int i;
391 
392  /* read the pixels */
393  for(i=0;i<4;i++) {
394  pixels[0] = av_clip_uint8(block[0]);
395  pixels[1] = av_clip_uint8(block[1]);
396  pixels[2] = av_clip_uint8(block[2]);
397  pixels[3] = av_clip_uint8(block[3]);
398 
399  pixels += line_size;
400  block += 8;
401  }
402 }
403 
404 static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
405  int line_size)
406 {
407  int i;
408 
409  /* read the pixels */
410  for(i=0;i<2;i++) {
411  pixels[0] = av_clip_uint8(block[0]);
412  pixels[1] = av_clip_uint8(block[1]);
413 
414  pixels += line_size;
415  block += 8;
416  }
417 }
418 
420  uint8_t *restrict pixels,
421  int line_size)
422 {
423  int i, j;
424 
425  for (i = 0; i < 8; i++) {
426  for (j = 0; j < 8; j++) {
427  if (*block < -128)
428  *pixels = 0;
429  else if (*block > 127)
430  *pixels = 255;
431  else
432  *pixels = (uint8_t)(*block + 128);
433  block++;
434  pixels++;
435  }
436  pixels += (line_size - 8);
437  }
438 }
439 
440 void ff_add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
441  int line_size)
442 {
443  int i;
444 
445  /* read the pixels */
446  for(i=0;i<8;i++) {
447  pixels[0] = av_clip_uint8(pixels[0] + block[0]);
448  pixels[1] = av_clip_uint8(pixels[1] + block[1]);
449  pixels[2] = av_clip_uint8(pixels[2] + block[2]);
450  pixels[3] = av_clip_uint8(pixels[3] + block[3]);
451  pixels[4] = av_clip_uint8(pixels[4] + block[4]);
452  pixels[5] = av_clip_uint8(pixels[5] + block[5]);
453  pixels[6] = av_clip_uint8(pixels[6] + block[6]);
454  pixels[7] = av_clip_uint8(pixels[7] + block[7]);
455  pixels += line_size;
456  block += 8;
457  }
458 }
459 
460 static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
461  int line_size)
462 {
463  int i;
464 
465  /* read the pixels */
466  for(i=0;i<4;i++) {
467  pixels[0] = av_clip_uint8(pixels[0] + block[0]);
468  pixels[1] = av_clip_uint8(pixels[1] + block[1]);
469  pixels[2] = av_clip_uint8(pixels[2] + block[2]);
470  pixels[3] = av_clip_uint8(pixels[3] + block[3]);
471  pixels += line_size;
472  block += 8;
473  }
474 }
475 
476 static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
477  int line_size)
478 {
479  int i;
480 
481  /* read the pixels */
482  for(i=0;i<2;i++) {
483  pixels[0] = av_clip_uint8(pixels[0] + block[0]);
484  pixels[1] = av_clip_uint8(pixels[1] + block[1]);
485  pixels += line_size;
486  block += 8;
487  }
488 }
489 
491 {
492  int sum=0, i;
493  for(i=0; i<64; i++)
494  sum+= FFABS(block[i]);
495  return sum;
496 }
497 
/* Fill h rows, 16 bytes each, with a constant value;
 * rows are line_size bytes apart. */
static void fill_block16_c(uint8_t *block, uint8_t value, int line_size, int h)
{
    while (h-- > 0) {
        memset(block, value, 16);
        block += line_size;
    }
}
507 
/* Fill h rows, 8 bytes each, with a constant value;
 * rows are line_size bytes apart. */
static void fill_block8_c(uint8_t *block, uint8_t value, int line_size, int h)
{
    while (h-- > 0) {
        memset(block, value, 8);
        block += line_size;
    }
}
517 
/* Rounding averages used by the pel interpolation code below.
 * Fix: macro parameters are now parenthesized (CERT PRE01-C) so operand
 * expressions with lower precedence than '+' (e.g. '&', '|', '?:')
 * cannot change the meaning of the expansion. Result is unchanged for
 * all existing plain-identifier call sites. */
#define avg2(a,b) (((a) + (b) + 1) >> 1)
#define avg4(a,b,c,d) (((a) + (b) + (c) + (d) + 2) >> 2)
520 
/* 8-pixel-wide bilinear interpolation with 1/16-pel fractional offsets
 * (x16, y16 in 0..15); the four weights sum to 256 so the >>8 normalizes.
 * Used for GMC with a single motion vector. */
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    const int A = (16 - x16) * (16 - y16);
    const int B = x16        * (16 - y16);
    const int C = (16 - x16) * y16;
    const int D = x16        * y16;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            dst[col] = (A * src[col]              + B * src[col + 1] +
                        C * src[stride + col]     + D * src[stride + col + 1] +
                        rounder) >> 8;
        dst += stride;
        src += stride;
    }
}
543 
/* Global motion compensation for one 8-pixel-wide, h-row block.
 * (ox,oy) is the 16.16 fixed-point source position of the first pixel;
 * (dxx,dyx) step the source position per destination column and
 * (dxy,dyy) per destination row, allowing affine warps. shift selects
 * the sub-pel precision (s = 1<<shift sub-pel steps), r is the rounding
 * constant, and width/height bound the valid source area (clamped at
 * the edges). Interpolation is bilinear in the interior and collapses
 * to 1-D (or a plain copy) when one (or both) coordinates fall outside
 * the source. */
void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
              int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
{
    int y, vx, vy;
    const int s= 1<<shift;

    /* convert to inclusive maxima for the clamp/range tests below */
    width--;
    height--;

    for(y=0; y<h; y++){
        int x;

        vx= ox;
        vy= oy;
        for(x=0; x<8; x++){ //XXX FIXME optimize
            int src_x, src_y, frac_x, frac_y, index;

            /* split the 16.16 position into integer and sub-pel parts */
            src_x= vx>>16;
            src_y= vy>>16;
            frac_x= src_x&(s-1);
            frac_y= src_y&(s-1);
            src_x>>=shift;
            src_y>>=shift;

            if((unsigned)src_x < width){
                if((unsigned)src_y < height){
                    /* fully inside: 2-D bilinear interpolation */
                    index= src_x + src_y*stride;
                    dst[y*stride + x]= (  (   src[index         ]*(s-frac_x)
                                            + src[index       +1]*   frac_x )*(s-frac_y)
                                        + (   src[index+stride  ]*(s-frac_x)
                                            + src[index+stride+1]*   frac_x )*   frac_y
                                        + r)>>(shift*2);
                }else{
                    /* outside vertically: clamp y, interpolate in x only */
                    index= src_x + av_clip(src_y, 0, height)*stride;
                    dst[y*stride + x]= (  (   src[index         ]*(s-frac_x)
                                            + src[index       +1]*   frac_x )*s
                                        + r)>>(shift*2);
                }
            }else{
                if((unsigned)src_y < height){
                    /* outside horizontally: clamp x, interpolate in y only */
                    index= av_clip(src_x, 0, width) + src_y*stride;
                    dst[y*stride + x]= (  (   src[index         ]*(s-frac_y)
                                            + src[index+stride  ]*   frac_y )*s
                                        + r)>>(shift*2);
                }else{
                    /* outside both ways: clamp both, plain copy */
                    index= av_clip(src_x, 0, width) + av_clip(src_y, 0, height)*stride;
                    dst[y*stride + x]=    src[index         ];
                }
            }

            vx+= dxx;
            vy+= dyx;
        }
        ox += dxy;
        oy += dyy;
    }
}
601 
/* Full-pel copy: dispatch on block width to the fixed-size copy helpers. */
static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    if (width == 2)
        put_pixels2_8_c(dst, src, stride, height);
    else if (width == 4)
        put_pixels4_8_c(dst, src, stride, height);
    else if (width == 8)
        put_pixels8_8_c(dst, src, stride, height);
    else if (width == 16)
        put_pixels16_8_c(dst, src, stride, height);
}
610 
/* 1/3-pel horizontal interpolation: (2*a + b)/3, rounded,
 * using the fixed-point factor 683/2048 ~= 1/3. */
static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++) {
        for (col = 0; col < width; col++)
            dst[col] = (683 * (2 * src[col] + src[col + 1] + 1)) >> 11;
        src += stride;
        dst += stride;
    }
}
621 
/* 2/3-pel horizontal interpolation: (a + 2*b)/3, rounded,
 * using the fixed-point factor 683/2048 ~= 1/3. */
static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++) {
        for (col = 0; col < width; col++)
            dst[col] = (683 * (src[col] + 2 * src[col + 1] + 1)) >> 11;
        src += stride;
        dst += stride;
    }
}
632 
/* 1/3-pel vertical interpolation: (2*top + bottom)/3, rounded,
 * using the fixed-point factor 683/2048 ~= 1/3. */
static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++) {
        for (col = 0; col < width; col++)
            dst[col] = (683 * (2 * src[col] + src[col + stride] + 1)) >> 11;
        src += stride;
        dst += stride;
    }
}
643 
/* (1/3,1/3)-pel 2-D interpolation of the four surrounding pixels,
 * rounded, using the fixed-point factor 2731/32768 ~= 1/12. */
static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++) {
        for (col = 0; col < width; col++)
            dst[col] = (2731 * (4 * src[col] + 3 * src[col + 1] +
                                3 * src[col + stride] + 2 * src[col + stride + 1] + 6)) >> 15;
        src += stride;
        dst += stride;
    }
}
654 
/* (1/3,2/3)-pel 2-D interpolation of the four surrounding pixels,
 * rounded, using the fixed-point factor 2731/32768 ~= 1/12. */
static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++) {
        for (col = 0; col < width; col++)
            dst[col] = (2731 * (3 * src[col] + 2 * src[col + 1] +
                                4 * src[col + stride] + 3 * src[col + stride + 1] + 6)) >> 15;
        src += stride;
        dst += stride;
    }
}
665 
/* 2/3-pel vertical interpolation: (top + 2*bottom)/3, rounded,
 * using the fixed-point factor 683/2048 ~= 1/3. */
static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++) {
        for (col = 0; col < width; col++)
            dst[col] = (683 * (src[col] + 2 * src[col + stride] + 1)) >> 11;
        src += stride;
        dst += stride;
    }
}
676 
/* (2/3,1/3)-pel 2-D interpolation of the four surrounding pixels,
 * rounded, using the fixed-point factor 2731/32768 ~= 1/12. */
static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++) {
        for (col = 0; col < width; col++)
            dst[col] = (2731 * (3 * src[col] + 4 * src[col + 1] +
                                2 * src[col + stride] + 3 * src[col + stride + 1] + 6)) >> 15;
        src += stride;
        dst += stride;
    }
}
687 
/* (2/3,2/3)-pel 2-D interpolation of the four surrounding pixels,
 * rounded, using the fixed-point factor 2731/32768 ~= 1/12. */
static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++) {
        for (col = 0; col < width; col++)
            dst[col] = (2731 * (2 * src[col] + 3 * src[col + 1] +
                                3 * src[col + stride] + 4 * src[col + stride + 1] + 6)) >> 15;
        src += stride;
        dst += stride;
    }
}
698 
/* Full-pel averaging copy: dispatch on block width to the
 * fixed-size averaging helpers. */
static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    if (width == 2)
        avg_pixels2_8_c(dst, src, stride, height);
    else if (width == 4)
        avg_pixels4_8_c(dst, src, stride, height);
    else if (width == 8)
        avg_pixels8_8_c(dst, src, stride, height);
    else if (width == 16)
        avg_pixels16_8_c(dst, src, stride, height);
}
707 
/* 1/3-pel horizontal interpolation averaged with the existing
 * destination pixel, rounding up. */
static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++) {
        for (col = 0; col < width; col++) {
            int p = (683 * (2 * src[col] + src[col + 1] + 1)) >> 11;
            dst[col] = (dst[col] + p + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
718 
/* 2/3-pel horizontal interpolation averaged with the existing
 * destination pixel, rounding up. */
static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++) {
        for (col = 0; col < width; col++) {
            int p = (683 * (src[col] + 2 * src[col + 1] + 1)) >> 11;
            dst[col] = (dst[col] + p + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
729 
/* 1/3-pel vertical interpolation averaged with the existing
 * destination pixel, rounding up. */
static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++) {
        for (col = 0; col < width; col++) {
            int p = (683 * (2 * src[col] + src[col + stride] + 1)) >> 11;
            dst[col] = (dst[col] + p + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
740 
/* (1/3,1/3)-pel 2-D interpolation averaged with the existing
 * destination pixel, rounding up. */
static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++) {
        for (col = 0; col < width; col++) {
            int p = (2731 * (4 * src[col] + 3 * src[col + 1] +
                             3 * src[col + stride] + 2 * src[col + stride + 1] + 6)) >> 15;
            dst[col] = (dst[col] + p + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
751 
/* (1/3,2/3)-pel 2-D interpolation averaged with the existing
 * destination pixel, rounding up. */
static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++) {
        for (col = 0; col < width; col++) {
            int p = (2731 * (3 * src[col] + 2 * src[col + 1] +
                             4 * src[col + stride] + 3 * src[col + stride + 1] + 6)) >> 15;
            dst[col] = (dst[col] + p + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
762 
/* 2/3-pel vertical interpolation averaged with the existing
 * destination pixel, rounding up. */
static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++) {
        for (col = 0; col < width; col++) {
            int p = (683 * (src[col] + 2 * src[col + stride] + 1)) >> 11;
            dst[col] = (dst[col] + p + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
773 
/* (2/3,1/3)-pel 2-D interpolation averaged with the existing
 * destination pixel, rounding up. */
static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++) {
        for (col = 0; col < width; col++) {
            int p = (2731 * (3 * src[col] + 4 * src[col + 1] +
                             2 * src[col + stride] + 3 * src[col + stride + 1] + 6)) >> 15;
            dst[col] = (dst[col] + p + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
784 
/* (2/3,2/3)-pel 2-D interpolation averaged with the existing
 * destination pixel, rounding up. */
static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++) {
        for (col = 0; col < width; col++) {
            int p = (2731 * (2 * src[col] + 3 * src[col + 1] +
                             3 * src[col + stride] + 4 * src[col + stride + 1] + 6)) >> 15;
            dst[col] = (dst[col] + p + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
795 
796 #define QPEL_MC(r, OPNAME, RND, OP) \
797 static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
798  uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
799  int i;\
800  for(i=0; i<h; i++)\
801  {\
802  OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
803  OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
804  OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
805  OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
806  OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
807  OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
808  OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
809  OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
810  dst+=dstStride;\
811  src+=srcStride;\
812  }\
813 }\
814 \
815 static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
816  const int w=8;\
817  uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
818  int i;\
819  for(i=0; i<w; i++)\
820  {\
821  const int src0= src[0*srcStride];\
822  const int src1= src[1*srcStride];\
823  const int src2= src[2*srcStride];\
824  const int src3= src[3*srcStride];\
825  const int src4= src[4*srcStride];\
826  const int src5= src[5*srcStride];\
827  const int src6= src[6*srcStride];\
828  const int src7= src[7*srcStride];\
829  const int src8= src[8*srcStride];\
830  OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
831  OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
832  OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
833  OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
834  OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
835  OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
836  OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
837  OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
838  dst++;\
839  src++;\
840  }\
841 }\
842 \
843 static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
844  uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
845  int i;\
846  \
847  for(i=0; i<h; i++)\
848  {\
849  OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
850  OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
851  OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
852  OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
853  OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
854  OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
855  OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
856  OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
857  OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
858  OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
859  OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
860  OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
861  OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
862  OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
863  OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
864  OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
865  dst+=dstStride;\
866  src+=srcStride;\
867  }\
868 }\
869 \
870 static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
871  uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
872  int i;\
873  const int w=16;\
874  for(i=0; i<w; i++)\
875  {\
876  const int src0= src[0*srcStride];\
877  const int src1= src[1*srcStride];\
878  const int src2= src[2*srcStride];\
879  const int src3= src[3*srcStride];\
880  const int src4= src[4*srcStride];\
881  const int src5= src[5*srcStride];\
882  const int src6= src[6*srcStride];\
883  const int src7= src[7*srcStride];\
884  const int src8= src[8*srcStride];\
885  const int src9= src[9*srcStride];\
886  const int src10= src[10*srcStride];\
887  const int src11= src[11*srcStride];\
888  const int src12= src[12*srcStride];\
889  const int src13= src[13*srcStride];\
890  const int src14= src[14*srcStride];\
891  const int src15= src[15*srcStride];\
892  const int src16= src[16*srcStride];\
893  OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
894  OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
895  OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
896  OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
897  OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
898  OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
899  OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
900  OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
901  OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
902  OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
903  OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
904  OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
905  OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
906  OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
907  OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
908  OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
909  dst++;\
910  src++;\
911  }\
912 }\
913 \
914 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
915  uint8_t half[64];\
916  put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
917  OPNAME ## pixels8_l2_8(dst, src, half, stride, stride, 8, 8);\
918 }\
919 \
920 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
921  OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
922 }\
923 \
924 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
925  uint8_t half[64];\
926  put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
927  OPNAME ## pixels8_l2_8(dst, src+1, half, stride, stride, 8, 8);\
928 }\
929 \
930 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
931  uint8_t full[16*9];\
932  uint8_t half[64];\
933  copy_block9(full, src, 16, stride, 9);\
934  put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
935  OPNAME ## pixels8_l2_8(dst, full, half, stride, 16, 8, 8);\
936 }\
937 \
938 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
939  uint8_t full[16*9];\
940  copy_block9(full, src, 16, stride, 9);\
941  OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
942 }\
943 \
944 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
945  uint8_t full[16*9];\
946  uint8_t half[64];\
947  copy_block9(full, src, 16, stride, 9);\
948  put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
949  OPNAME ## pixels8_l2_8(dst, full+16, half, stride, 16, 8, 8);\
950 }\
951 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
952  uint8_t full[16*9];\
953  uint8_t halfH[72];\
954  uint8_t halfV[64];\
955  uint8_t halfHV[64];\
956  copy_block9(full, src, 16, stride, 9);\
957  put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
958  put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
959  put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
960  OPNAME ## pixels8_l4_8(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
961 }\
962 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
963  uint8_t full[16*9];\
964  uint8_t halfH[72];\
965  uint8_t halfHV[64];\
966  copy_block9(full, src, 16, stride, 9);\
967  put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
968  put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
969  put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
970  OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
971 }\
972 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
973  uint8_t full[16*9];\
974  uint8_t halfH[72];\
975  uint8_t halfV[64];\
976  uint8_t halfHV[64];\
977  copy_block9(full, src, 16, stride, 9);\
978  put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
979  put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
980  put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
981  OPNAME ## pixels8_l4_8(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
982 }\
983 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
984  uint8_t full[16*9];\
985  uint8_t halfH[72];\
986  uint8_t halfHV[64];\
987  copy_block9(full, src, 16, stride, 9);\
988  put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
989  put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
990  put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
991  OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
992 }\
993 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
994  uint8_t full[16*9];\
995  uint8_t halfH[72];\
996  uint8_t halfV[64];\
997  uint8_t halfHV[64];\
998  copy_block9(full, src, 16, stride, 9);\
999  put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1000  put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1001  put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1002  OPNAME ## pixels8_l4_8(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1003 }\
1004 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1005  uint8_t full[16*9];\
1006  uint8_t halfH[72];\
1007  uint8_t halfHV[64];\
1008  copy_block9(full, src, 16, stride, 9);\
1009  put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1010  put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
1011  put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1012  OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1013 }\
1014 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1015  uint8_t full[16*9];\
1016  uint8_t halfH[72];\
1017  uint8_t halfV[64];\
1018  uint8_t halfHV[64];\
1019  copy_block9(full, src, 16, stride, 9);\
1020  put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full , 8, 16, 9);\
1021  put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1022  put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1023  OPNAME ## pixels8_l4_8(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1024 }\
1025 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1026  uint8_t full[16*9];\
1027  uint8_t halfH[72];\
1028  uint8_t halfHV[64];\
1029  copy_block9(full, src, 16, stride, 9);\
1030  put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1031  put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
1032  put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1033  OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1034 }\
1035 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1036  uint8_t halfH[72];\
1037  uint8_t halfHV[64];\
1038  put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1039  put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1040  OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
1041 }\
1042 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1043  uint8_t halfH[72];\
1044  uint8_t halfHV[64];\
1045  put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1046  put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1047  OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1048 }\
1049 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1050  uint8_t full[16*9];\
1051  uint8_t halfH[72];\
1052  uint8_t halfV[64];\
1053  uint8_t halfHV[64];\
1054  copy_block9(full, src, 16, stride, 9);\
1055  put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1056  put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1057  put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1058  OPNAME ## pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);\
1059 }\
1060 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1061  uint8_t full[16*9];\
1062  uint8_t halfH[72];\
1063  copy_block9(full, src, 16, stride, 9);\
1064  put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1065  put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
1066  OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1067 }\
1068 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1069  uint8_t full[16*9];\
1070  uint8_t halfH[72];\
1071  uint8_t halfV[64];\
1072  uint8_t halfHV[64];\
1073  copy_block9(full, src, 16, stride, 9);\
1074  put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1075  put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1076  put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1077  OPNAME ## pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);\
1078 }\
1079 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1080  uint8_t full[16*9];\
1081  uint8_t halfH[72];\
1082  copy_block9(full, src, 16, stride, 9);\
1083  put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1084  put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
1085  OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1086 }\
1087 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1088  uint8_t halfH[72];\
1089  put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1090  OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1091 }\
1092 \
1093 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1094  uint8_t half[256];\
1095  put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1096  OPNAME ## pixels16_l2_8(dst, src, half, stride, stride, 16, 16);\
1097 }\
1098 \
1099 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1100  OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
1101 }\
1102 \
1103 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1104  uint8_t half[256];\
1105  put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1106  OPNAME ## pixels16_l2_8(dst, src+1, half, stride, stride, 16, 16);\
1107 }\
1108 \
1109 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1110  uint8_t full[24*17];\
1111  uint8_t half[256];\
1112  copy_block17(full, src, 24, stride, 17);\
1113  put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1114  OPNAME ## pixels16_l2_8(dst, full, half, stride, 24, 16, 16);\
1115 }\
1116 \
1117 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1118  uint8_t full[24*17];\
1119  copy_block17(full, src, 24, stride, 17);\
1120  OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
1121 }\
1122 \
1123 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1124  uint8_t full[24*17];\
1125  uint8_t half[256];\
1126  copy_block17(full, src, 24, stride, 17);\
1127  put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1128  OPNAME ## pixels16_l2_8(dst, full+24, half, stride, 24, 16, 16);\
1129 }\
1130 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1131  uint8_t full[24*17];\
1132  uint8_t halfH[272];\
1133  uint8_t halfV[256];\
1134  uint8_t halfHV[256];\
1135  copy_block17(full, src, 24, stride, 17);\
1136  put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1137  put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1138  put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1139  OPNAME ## pixels16_l4_8(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1140 }\
1141 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1142  uint8_t full[24*17];\
1143  uint8_t halfH[272];\
1144  uint8_t halfHV[256];\
1145  copy_block17(full, src, 24, stride, 17);\
1146  put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1147  put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
1148  put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1149  OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
1150 }\
1151 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1152  uint8_t full[24*17];\
1153  uint8_t halfH[272];\
1154  uint8_t halfV[256];\
1155  uint8_t halfHV[256];\
1156  copy_block17(full, src, 24, stride, 17);\
1157  put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1158  put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1159  put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1160  OPNAME ## pixels16_l4_8(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1161 }\
1162 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1163  uint8_t full[24*17];\
1164  uint8_t halfH[272];\
1165  uint8_t halfHV[256];\
1166  copy_block17(full, src, 24, stride, 17);\
1167  put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1168  put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
1169  put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1170  OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
1171 }\
1172 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1173  uint8_t full[24*17];\
1174  uint8_t halfH[272];\
1175  uint8_t halfV[256];\
1176  uint8_t halfHV[256];\
1177  copy_block17(full, src, 24, stride, 17);\
1178  put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1179  put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1180  put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1181  OPNAME ## pixels16_l4_8(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1182 }\
1183 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1184  uint8_t full[24*17];\
1185  uint8_t halfH[272];\
1186  uint8_t halfHV[256];\
1187  copy_block17(full, src, 24, stride, 17);\
1188  put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1189  put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
1190  put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1191  OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1192 }\
1193 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1194  uint8_t full[24*17];\
1195  uint8_t halfH[272];\
1196  uint8_t halfV[256];\
1197  uint8_t halfHV[256];\
1198  copy_block17(full, src, 24, stride, 17);\
1199  put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full , 16, 24, 17);\
1200  put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1201  put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1202  OPNAME ## pixels16_l4_8(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1203 }\
1204 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1205  uint8_t full[24*17];\
1206  uint8_t halfH[272];\
1207  uint8_t halfHV[256];\
1208  copy_block17(full, src, 24, stride, 17);\
1209  put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1210  put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
1211  put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1212  OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1213 }\
1214 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1215  uint8_t halfH[272];\
1216  uint8_t halfHV[256];\
1217  put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1218  put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1219  OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
1220 }\
1221 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1222  uint8_t halfH[272];\
1223  uint8_t halfHV[256];\
1224  put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1225  put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1226  OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1227 }\
1228 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1229  uint8_t full[24*17];\
1230  uint8_t halfH[272];\
1231  uint8_t halfV[256];\
1232  uint8_t halfHV[256];\
1233  copy_block17(full, src, 24, stride, 17);\
1234  put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1235  put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1236  put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1237  OPNAME ## pixels16_l2_8(dst, halfV, halfHV, stride, 16, 16, 16);\
1238 }\
1239 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1240  uint8_t full[24*17];\
1241  uint8_t halfH[272];\
1242  copy_block17(full, src, 24, stride, 17);\
1243  put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1244  put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
1245  OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1246 }\
1247 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1248  uint8_t full[24*17];\
1249  uint8_t halfH[272];\
1250  uint8_t halfV[256];\
1251  uint8_t halfHV[256];\
1252  copy_block17(full, src, 24, stride, 17);\
1253  put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1254  put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1255  put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1256  OPNAME ## pixels16_l2_8(dst, halfV, halfHV, stride, 16, 16, 16);\
1257 }\
1258 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1259  uint8_t full[24*17];\
1260  uint8_t halfH[272];\
1261  copy_block17(full, src, 24, stride, 17);\
1262  put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1263  put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
1264  OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1265 }\
1266 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1267  uint8_t halfH[272];\
1268  put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1269  OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1270 }
1271 
1272 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
1273 #define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
1274 #define op_put(a, b) a = cm[((b) + 16)>>5]
1275 #define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
1276 
1277 QPEL_MC(0, put_ , _ , op_put)
1278 QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
1279 QPEL_MC(0, avg_ , _ , op_avg)
1280 //QPEL_MC(1, avg_no_rnd , _ , op_avg)
1281 #undef op_avg
1282 #undef op_avg_no_rnd
1283 #undef op_put
1284 #undef op_put_no_rnd
1285 
1286 #define put_qpel8_mc00_c ff_put_pixels8x8_c
1287 #define avg_qpel8_mc00_c ff_avg_pixels8x8_c
1288 #define put_qpel16_mc00_c ff_put_pixels16x16_c
1289 #define avg_qpel16_mc00_c ff_avg_pixels16x16_c
1290 #define put_no_rnd_qpel8_mc00_c ff_put_pixels8x8_c
1291 #define put_no_rnd_qpel16_mc00_c ff_put_pixels16x16_8_c
1292 
1293 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
1294  uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
1295  int i;
1296 
1297  for(i=0; i<h; i++){
1298  dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
1299  dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
1300  dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
1301  dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
1302  dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
1303  dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
1304  dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
1305  dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
1306  dst+=dstStride;
1307  src+=srcStride;
1308  }
1309 }
1310 
#if CONFIG_RV40_DECODER
/* RV40 (3,3) quarter-pel cases: thin wrappers that forward to the 8-bit
 * diagonal (xy2) half-pel averaging kernels generated by the template. */
void ff_put_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    put_pixels16_xy2_8_c(dst, src, stride, 16);
}
void ff_avg_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    avg_pixels16_xy2_8_c(dst, src, stride, 16);
}
void ff_put_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    put_pixels8_xy2_8_c(dst, src, stride, 8);
}
void ff_avg_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    avg_pixels8_xy2_8_c(dst, src, stride, 8);
}
#endif /* CONFIG_RV40_DECODER */
1325 
1326 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
1327  uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
1328  int i;
1329 
1330  for(i=0; i<w; i++){
1331  const int src_1= src[ -srcStride];
1332  const int src0 = src[0 ];
1333  const int src1 = src[ srcStride];
1334  const int src2 = src[2*srcStride];
1335  const int src3 = src[3*srcStride];
1336  const int src4 = src[4*srcStride];
1337  const int src5 = src[5*srcStride];
1338  const int src6 = src[6*srcStride];
1339  const int src7 = src[7*srcStride];
1340  const int src8 = src[8*srcStride];
1341  const int src9 = src[9*srcStride];
1342  dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
1343  dst[1*dstStride]= cm[(9*(src1 + src2) - (src0 + src3) + 8)>>4];
1344  dst[2*dstStride]= cm[(9*(src2 + src3) - (src1 + src4) + 8)>>4];
1345  dst[3*dstStride]= cm[(9*(src3 + src4) - (src2 + src5) + 8)>>4];
1346  dst[4*dstStride]= cm[(9*(src4 + src5) - (src3 + src6) + 8)>>4];
1347  dst[5*dstStride]= cm[(9*(src5 + src6) - (src4 + src7) + 8)>>4];
1348  dst[6*dstStride]= cm[(9*(src6 + src7) - (src5 + src8) + 8)>>4];
1349  dst[7*dstStride]= cm[(9*(src7 + src8) - (src6 + src9) + 8)>>4];
1350  src++;
1351  dst++;
1352  }
1353 }
1354 
/* mspel (1,0): quarter-pel left of the half position — combine the source
 * with its horizontally lowpassed version via pixels8_l2. */
static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t half[64];
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2_8(dst, src, half, stride, stride, 8, 8);
}
1360 
/* mspel (2,0): horizontal half-pel — plain horizontal lowpass into dst. */
static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
}
1364 
/* mspel (3,0): quarter-pel right of the half position — combine src+1
 * with the horizontally lowpassed block. */
static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t half[64];
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2_8(dst, src+1, half, stride, stride, 8, 8);
}
1370 
/* mspel (0,2): vertical half-pel — plain vertical lowpass into dst. */
static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
}
1374 
/* mspel (1,2): blend the vertically filtered block with the HV-filtered one.
 * halfH holds 11 rows (8 plus filter margin) of horizontally filtered pixels
 * starting one row above src; halfH+8 skips that leading row so the vertical
 * pass is aligned with src. */
static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    uint8_t halfV[64];
    uint8_t halfHV[64];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
    put_pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);
}
/* mspel (3,2): like mc12 but the vertical-only pass starts at src+1
 * (right half-pel column). */
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    uint8_t halfV[64];
    uint8_t halfHV[64];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
    put_pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);
}
/* mspel (2,2): full HV half-pel — horizontal lowpass (with one row of top
 * margin), then vertical lowpass of the aligned rows straight into dst. */
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
}
1398 
1399 static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
1401  int x;
1402  const int strength= ff_h263_loop_filter_strength[qscale];
1403 
1404  for(x=0; x<8; x++){
1405  int d1, d2, ad1;
1406  int p0= src[x-2*stride];
1407  int p1= src[x-1*stride];
1408  int p2= src[x+0*stride];
1409  int p3= src[x+1*stride];
1410  int d = (p0 - p3 + 4*(p2 - p1)) / 8;
1411 
1412  if (d<-2*strength) d1= 0;
1413  else if(d<- strength) d1=-2*strength - d;
1414  else if(d< strength) d1= d;
1415  else if(d< 2*strength) d1= 2*strength - d;
1416  else d1= 0;
1417 
1418  p1 += d1;
1419  p2 -= d1;
1420  if(p1&256) p1= ~(p1>>31);
1421  if(p2&256) p2= ~(p2>>31);
1422 
1423  src[x-1*stride] = p1;
1424  src[x+0*stride] = p2;
1425 
1426  ad1= FFABS(d1)>>1;
1427 
1428  d2= av_clip((p0-p3)/4, -ad1, ad1);
1429 
1430  src[x-2*stride] = p0 - d2;
1431  src[x+ stride] = p3 + d2;
1432  }
1433  }
1434 }
1435 
1436 static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
1438  int y;
1439  const int strength= ff_h263_loop_filter_strength[qscale];
1440 
1441  for(y=0; y<8; y++){
1442  int d1, d2, ad1;
1443  int p0= src[y*stride-2];
1444  int p1= src[y*stride-1];
1445  int p2= src[y*stride+0];
1446  int p3= src[y*stride+1];
1447  int d = (p0 - p3 + 4*(p2 - p1)) / 8;
1448 
1449  if (d<-2*strength) d1= 0;
1450  else if(d<- strength) d1=-2*strength - d;
1451  else if(d< strength) d1= d;
1452  else if(d< 2*strength) d1= 2*strength - d;
1453  else d1= 0;
1454 
1455  p1 += d1;
1456  p2 -= d1;
1457  if(p1&256) p1= ~(p1>>31);
1458  if(p2&256) p2= ~(p2>>31);
1459 
1460  src[y*stride-1] = p1;
1461  src[y*stride+0] = p2;
1462 
1463  ad1= FFABS(d1)>>1;
1464 
1465  d2= av_clip((p0-p3)/4, -ad1, ad1);
1466 
1467  src[y*stride-2] = p0 - d2;
1468  src[y*stride+1] = p3 + d2;
1469  }
1470  }
1471 }
1472 
/* H.261 in-loop filter: separable (1,2,1)/4 smoothing of an 8x8 block.
 * First a vertical pass into temp[] (border rows are passed through,
 * pre-scaled by 4), then a horizontal pass back into src (border columns
 * only undo the scaling). */
static void h261_loop_filter_c(uint8_t *src, int stride){
    int temp[64];
    int i, j;

    /* vertical pass: top/bottom rows copied (scaled by 4) */
    for (j = 0; j < 8; j++) {
        temp[j]      = 4 * src[j];
        temp[j + 56] = 4 * src[j + 7 * stride];
    }
    for (i = 1; i < 7; i++)
        for (j = 0; j < 8; j++)
            temp[i * 8 + j] = src[(i - 1) * stride + j]
                            + 2 * src[i * stride + j]
                            + src[(i + 1) * stride + j];

    /* horizontal pass: left/right columns only renormalized */
    for (i = 0; i < 8; i++) {
        src[i * stride]     = (temp[i * 8]     + 2) >> 2;
        src[i * stride + 7] = (temp[i * 8 + 7] + 2) >> 2;
        for (j = 1; j < 7; j++)
            src[i * stride + j] = (temp[i * 8 + j - 1]
                                 + 2 * temp[i * 8 + j]
                                 + temp[i * 8 + j + 1] + 8) >> 4;
    }
}
1499 
/* Sum of absolute differences over a 16-wide block of h rows.
 * The first argument matches the me_cmp_func signature and is unused. */
static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0;
    int row, j;

    for (row = 0; row < h; row++) {
        for (j = 0; j < 16; j++)
            sum += abs(pix1[j] - pix2[j]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
1527 
/* SAD of pix1 against the horizontal half-pel interpolation of pix2
 * (rounded average of each pixel and its right neighbour), 16 wide. */
static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0;
    int row, j;

    for (row = 0; row < h; row++) {
        for (j = 0; j < 16; j++)
            sum += abs(pix1[j] - avg2(pix2[j], pix2[j + 1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
1555 
/* SAD of pix1 against the vertical half-pel interpolation of pix2
 * (rounded average of each pixel and the one below), 16 wide. */
static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *pix3 = pix2 + line_size;   /* next row of pix2 */
    int sum = 0;
    int row, j;

    for (row = 0; row < h; row++) {
        for (j = 0; j < 16; j++)
            sum += abs(pix1[j] - avg2(pix2[j], pix3[j]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return sum;
}
1585 
/* SAD of pix1 against the diagonal half-pel interpolation of pix2
 * (rounded average of the 2x2 neighbourhood), 16 wide. */
static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *pix3 = pix2 + line_size;   /* next row of pix2 */
    int sum = 0;
    int row, j;

    for (row = 0; row < h; row++) {
        for (j = 0; j < 16; j++)
            sum += abs(pix1[j] - avg4(pix2[j], pix2[j + 1], pix3[j], pix3[j + 1]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return sum;
}
1615 
/* Sum of absolute differences over an 8-wide block of h rows.
 * The first argument matches the me_cmp_func signature and is unused. */
static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0;
    int row, j;

    for (row = 0; row < h; row++) {
        for (j = 0; j < 8; j++)
            sum += abs(pix1[j] - pix2[j]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
1635 
/* SAD of pix1 against the horizontal half-pel interpolation of pix2, 8 wide. */
static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0;
    int row, j;

    for (row = 0; row < h; row++) {
        for (j = 0; j < 8; j++)
            sum += abs(pix1[j] - avg2(pix2[j], pix2[j + 1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
1655 
/* SAD of pix1 against the vertical half-pel interpolation of pix2, 8 wide. */
static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *pix3 = pix2 + line_size;   /* next row of pix2 */
    int sum = 0;
    int row, j;

    for (row = 0; row < h; row++) {
        for (j = 0; j < 8; j++)
            sum += abs(pix1[j] - avg2(pix2[j], pix3[j]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return sum;
}
1677 
/* SAD of pix1 against the diagonal half-pel interpolation of pix2, 8 wide. */
static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *pix3 = pix2 + line_size;   /* next row of pix2 */
    int sum = 0;
    int row, j;

    for (row = 0; row < h; row++) {
        for (j = 0; j < 8; j++)
            sum += abs(pix1[j] - avg4(pix2[j], pix2[j + 1], pix3[j], pix3[j + 1]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return sum;
}
1699 
1700 static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
1701  MpegEncContext *c = v;
1702  int score1=0;
1703  int score2=0;
1704  int x,y;
1705 
1706  for(y=0; y<h; y++){
1707  for(x=0; x<16; x++){
1708  score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
1709  }
1710  if(y+1<h){
1711  for(x=0; x<15; x++){
1712  score2+= FFABS( s1[x ] - s1[x +stride]
1713  - s1[x+1] + s1[x+1+stride])
1714  -FFABS( s2[x ] - s2[x +stride]
1715  - s2[x+1] + s2[x+1+stride]);
1716  }
1717  }
1718  s1+= stride;
1719  s2+= stride;
1720  }
1721 
1722  if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
1723  else return score1 + FFABS(score2)*8;
1724 }
1725 
1726 static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
1727  MpegEncContext *c = v;
1728  int score1=0;
1729  int score2=0;
1730  int x,y;
1731 
1732  for(y=0; y<h; y++){
1733  for(x=0; x<8; x++){
1734  score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
1735  }
1736  if(y+1<h){
1737  for(x=0; x<7; x++){
1738  score2+= FFABS( s1[x ] - s1[x +stride]
1739  - s1[x+1] + s1[x+1+stride])
1740  -FFABS( s2[x ] - s2[x +stride]
1741  - s2[x+1] + s2[x+1+stride]);
1742  }
1743  }
1744  s1+= stride;
1745  s2+= stride;
1746  }
1747 
1748  if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
1749  else return score1 + FFABS(score2)*8;
1750 }
1751 
/**
 * Evaluate the weighted squared cost of adding @p basis scaled by @p scale
 * to the residual @p rem (fixed point: BASIS_SHIFT in, RECON_SHIFT out).
 *
 * @return accumulated weighted squared error, downscaled by 2 bits
 */
static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
    int i;
    unsigned int sum=0;

    for(i=0; i<8*8; i++){
        // residual plus the rounded, scaled basis coefficient
        int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
        int w= weight[i];
        b>>= RECON_SHIFT;
        assert(-512<b && b<512);

        // weighted squared error, pre-scaled down 4 bits to avoid overflow
        sum += (w*b)*(w*b)>>4;
    }
    return sum>>2;
}
1766 
/**
 * Add @p basis scaled by @p scale onto the residual @p rem in place,
 * with rounding (same fixed-point convention as try_8x8basis_c).
 */
static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
    int i;

    for(i=0; i<8*8; i++){
        rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
    }
}
1774 
1783 void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
1784 {
1785  int i;
1786  DCTELEM temp[64];
1787 
1788  if(last<=0) return;
1789  //if(permutation[1]==1) return; //FIXME it is ok but not clean and might fail for some permutations
1790 
1791  for(i=0; i<=last; i++){
1792  const int j= scantable[i];
1793  temp[j]= block[j];
1794  block[j]=0;
1795  }
1796 
1797  for(i=0; i<=last; i++){
1798  const int j= scantable[i];
1799  const int perm_j= permutation[j];
1800  block[perm_j]= temp[j];
1801  }
1802 }
1803 
/* Comparison function for FF_CMP_ZERO: always reports zero cost. */
static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
    return 0;
}
1807 
/**
 * Fill cmp[0..5] with the comparison functions selected by type
 * (an FF_CMP_* value; only the low byte is used) from the corresponding
 * DSPContext tables.  Unknown types leave the slots zeroed (from the
 * memset) and log an error.
 */
void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
    int i;

    memset(cmp, 0, sizeof(void*)*6);

    for(i=0; i<6; i++){
        switch(type&0xFF){  /* low byte selects the metric */
        case FF_CMP_SAD:
            cmp[i]= c->sad[i];
            break;
        case FF_CMP_SATD:
            cmp[i]= c->hadamard8_diff[i];
            break;
        case FF_CMP_SSE:
            cmp[i]= c->sse[i];
            break;
        case FF_CMP_DCT:
            cmp[i]= c->dct_sad[i];
            break;
        case FF_CMP_DCT264:
            cmp[i]= c->dct264_sad[i];
            break;
        case FF_CMP_DCTMAX:
            cmp[i]= c->dct_max[i];
            break;
        case FF_CMP_PSNR:
            cmp[i]= c->quant_psnr[i];
            break;
        case FF_CMP_BIT:
            cmp[i]= c->bit[i];
            break;
        case FF_CMP_RD:
            cmp[i]= c->rd[i];
            break;
        case FF_CMP_VSAD:
            cmp[i]= c->vsad[i];
            break;
        case FF_CMP_VSSE:
            cmp[i]= c->vsse[i];
            break;
        case FF_CMP_ZERO:
            cmp[i]= zero_cmp;
            break;
        case FF_CMP_NSSE:
            cmp[i]= c->nsse[i];
            break;
#if CONFIG_DWT
        case FF_CMP_W53:
            cmp[i]= c->w53[i];
            break;
        case FF_CMP_W97:
            cmp[i]= c->w97[i];
            break;
#endif
        default:
            av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
        }
    }
}
1867 
/**
 * dst[i] += src[i] (byte-wise, modulo 256) for i in [0, w).
 * The bulk is done one machine word at a time (SWAR): adding the low
 * 7 bits of every byte via (a&pb_7f)+(b&pb_7f) cannot carry across byte
 * lanes, and XORing with ((a^b)&pb_80) restores each byte's MSB.
 * The (int) cast keeps the loop-bound subtraction signed so a small w
 * (< sizeof(long)) does not wrap to a huge unsigned value.
 */
static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
    long i;
    for (i = 0; i <= w - (int) sizeof(long); i += sizeof(long)) {
        long a = *(long*)(src+i);
        long b = *(long*)(dst+i);
        *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
    }
    /* scalar tail for the remaining w % sizeof(long) bytes */
    for(; i<w; i++)
        dst[i+0] += src[i+0];
}
1878 
1879 static void add_bytes_l2_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
1880  long i;
1881  for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
1882  long a = *(long*)(src1+i);
1883  long b = *(long*)(src2+i);
1884  *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
1885  }
1886  for(; i<w; i++)
1887  dst[i] = src1[i]+src2[i];
1888 }
1889 
/**
 * dst[i] = src1[i] - src2[i] (byte-wise, modulo 256) for i in [0, w).
 * On targets without fast unaligned loads a byte loop is used when src2
 * is misaligned.  The word loop does a borrowless per-byte subtraction:
 * (a|pb_80) - (b&pb_7f) subtracts the low 7 bits with every byte's MSB
 * held high so no borrow crosses byte lanes; the final XOR restores the
 * true MSB of each byte.
 */
static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    long i;
#if !HAVE_FAST_UNALIGNED
    if((long)src2 & (sizeof(long)-1)){
        for(i=0; i+7<w; i+=8){
            dst[i+0] = src1[i+0]-src2[i+0];
            dst[i+1] = src1[i+1]-src2[i+1];
            dst[i+2] = src1[i+2]-src2[i+2];
            dst[i+3] = src1[i+3]-src2[i+3];
            dst[i+4] = src1[i+4]-src2[i+4];
            dst[i+5] = src1[i+5]-src2[i+5];
            dst[i+6] = src1[i+6]-src2[i+6];
            dst[i+7] = src1[i+7]-src2[i+7];
        }
    }else
#endif
    for (i = 0; i <= w - (int) sizeof(long); i += sizeof(long)) {
        long a = *(long*)(src1+i);
        long b = *(long*)(src2+i);
        *(long*)(dst+i) = ((a|pb_80) - (b&pb_7f)) ^ ((a^b^pb_80)&pb_80);
    }
    /* scalar tail for whatever either fast path left over */
    for(; i<w; i++)
        dst[i+0] = src1[i+0]-src2[i+0];
}
1914 
/**
 * HuffYUV median-prediction decode:
 * dst[i] = median(left, top, left + top - topleft) + diff[i],
 * where src1 holds the previous line (top) and diff the residuals.
 * *left and *left_top carry the running left / top-left predictor state
 * across calls and are updated before returning.
 */
static void add_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *diff, int w, int *left, int *left_top){
    int i;
    uint8_t l, lt;

    l= *left;
    lt= *left_top;

    for(i=0; i<w; i++){
        /* gradient predictor is masked to 8 bits before the median */
        l= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF) + diff[i];
        lt= src1[i];
        dst[i]= l;
    }

    *left= l;
    *left_top= lt;
}
1931 
/**
 * HuffYUV median-prediction encode (inverse of the function above):
 * dst[i] = src2[i] - median(left, top, left + top - topleft),
 * where src1 is the previous line (top) and src2 the current line.
 * *left and *left_top carry predictor state across calls.
 */
static void sub_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w, int *left, int *left_top){
    int i;
    uint8_t l, lt;

    l= *left;
    lt= *left_top;

    for(i=0; i<w; i++){
        const int pred= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF);
        lt= src1[i];
        l= src2[i];
        dst[i]= l - pred;
    }

    *left= l;
    *left_top= lt;
}
1949 
/**
 * HuffYUV left-prediction decode: running byte-wise prefix sum of src
 * into dst, seeded with acc.  dst stores each partial sum truncated to
 * 8 bits; the full (unmasked) accumulator is returned so the caller can
 * continue across slices.
 */
static int add_hfyu_left_prediction_c(uint8_t *dst, const uint8_t *src, int w, int acc){
    int n;

    for (n = 0; n < w; n++) {
        acc   += src[n];
        dst[n] = acc;
    }

    return acc;
}
1968 
/* Byte offsets of the B/G/R/A channels inside a packed 32-bit pixel. */
#if HAVE_BIGENDIAN
#define B 3
#define G 2
#define R 1
#define A 0
#else
#define B 0
#define G 1
#define R 2
#define A 3
#endif
/**
 * HuffYUV left-prediction decode for packed 32-bit BGRA pixels:
 * per-channel running sums over w pixels.  *red/*green/*blue/*alpha
 * seed the accumulators and receive the final (unmasked) values.
 */
static void add_hfyu_left_prediction_bgr32_c(uint8_t *dst, const uint8_t *src, int w, int *red, int *green, int *blue, int *alpha){
    int n;
    int rs = *red, gs = *green, bs = *blue, as = *alpha;

    for (n = 0; n < w; n++, src += 4, dst += 4) {
        bs += src[B];
        gs += src[G];
        rs += src[R];
        as += src[A];

        dst[B] = bs;
        dst[G] = gs;
        dst[R] = rs;
        dst[A] = as;
    }

    *red   = rs;
    *green = gs;
    *blue  = bs;
    *alpha = as;
}
#undef B
#undef G
#undef R
#undef A
2009 
/* 2-point butterfly writing sum/difference into two outputs. */
#define BUTTERFLY2(o1,o2,i1,i2) \
o1= (i1)+(i2);\
o2= (i1)-(i2);

/* In-place 2-point butterfly: x,y := x+y, x-y. */
#define BUTTERFLY1(x,y) \
{\
    int a,b;\
    a= x;\
    b= y;\
    x= a+b;\
    y= a-b;\
}

/* Final butterfly stage folded into the absolute-value accumulation. */
#define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y)))

/**
 * SATD: sum of absolute values of the 8x8 2-D Hadamard transform of the
 * difference src - dst.  Rows are transformed first, then columns; the
 * last column stage is merged into BUTTERFLYA.  h must be 8.
 */
static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
    int i;
    int temp[64];
    int sum=0;

    assert(h==8);

    for(i=0; i<8; i++){
        //FIXME try pointer walks
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    }

    for(i=0; i<8; i++){
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);

        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);

        sum +=
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
    }
    return sum;
}
2069 
/**
 * Intra SATD: sum of absolute values of the 8x8 2-D Hadamard transform
 * of src itself (no reference block; dummy is unused).  The DC-term
 * contribution (the block mean) is subtracted at the end.  h must be 8.
 */
static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
    int i;
    int temp[64];
    int sum=0;

    assert(h==8);

    for(i=0; i<8; i++){
        //FIXME try pointer walks
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0],src[stride*i+1]);
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2],src[stride*i+3]);
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4],src[stride*i+5]);
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6],src[stride*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    }

    for(i=0; i<8; i++){
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);

        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);

        sum +=
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
    }

    sum -= FFABS(temp[8*0] + temp[8*4]); // -mean
    return sum;
}
2117 
/**
 * DCT-SAD: forward-DCT the pixel difference src1 - src2 and return the
 * sum of absolute transform coefficients.  h must be 8.
 */
static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    LOCAL_ALIGNED_16(DCTELEM, temp, [64]);

    assert(h==8);

    s->dsp.diff_pixels(temp, src1, src2, stride);
    s->dsp.fdct(temp);
    return s->dsp.sum_abs_dctelem(temp);
}
2128 
#if CONFIG_GPL
/* 1-D 8-point integer transform used by dct264_sad8x8_c below
 * (H.264-style 8x8 butterflies, parameterized by SRC/DST macros so the
 * same body serves rows and columns). */
#define DCT8_1D {\
    const int s07 = SRC(0) + SRC(7);\
    const int s16 = SRC(1) + SRC(6);\
    const int s25 = SRC(2) + SRC(5);\
    const int s34 = SRC(3) + SRC(4);\
    const int a0 = s07 + s34;\
    const int a1 = s16 + s25;\
    const int a2 = s07 - s34;\
    const int a3 = s16 - s25;\
    const int d07 = SRC(0) - SRC(7);\
    const int d16 = SRC(1) - SRC(6);\
    const int d25 = SRC(2) - SRC(5);\
    const int d34 = SRC(3) - SRC(4);\
    const int a4 = d16 + d25 + (d07 + (d07>>1));\
    const int a5 = d07 - d34 - (d25 + (d25>>1));\
    const int a6 = d07 + d34 - (d16 + (d16>>1));\
    const int a7 = d16 - d25 + (d34 + (d34>>1));\
    DST(0, a0 + a1     ) ;\
    DST(1, a4 + (a7>>2)) ;\
    DST(2, a2 + (a3>>1)) ;\
    DST(3, a5 + (a6>>2)) ;\
    DST(4, a0 - a1     ) ;\
    DST(5, a6 - (a5>>2)) ;\
    DST(6, (a2>>1) - a3 ) ;\
    DST(7, (a4>>2) - a7 ) ;\
}

/**
 * SAD in the transform domain of the integer 8x8 transform above:
 * rows are transformed in place, then absolute column-transform outputs
 * are accumulated directly via the DST macro.
 */
static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    DCTELEM dct[8][8];
    int i;
    int sum=0;

    s->dsp.diff_pixels(dct[0], src1, src2, stride);

#define SRC(x) dct[i][x]
#define DST(x,v) dct[i][x]= v
    for( i = 0; i < 8; i++ )
        DCT8_1D
#undef SRC
#undef DST

#define SRC(x) dct[x][i]
#define DST(x,v) sum += FFABS(v)
    for( i = 0; i < 8; i++ )
        DCT8_1D
#undef SRC
#undef DST
    return sum;
}
#endif
2181 
/**
 * Forward-DCT the pixel difference src1 - src2 and return the largest
 * absolute transform coefficient.  h must be 8.
 */
static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
    int sum=0, i;

    assert(h==8);

    s->dsp.diff_pixels(temp, src1, src2, stride);
    s->dsp.fdct(temp);

    for(i=0; i<64; i++)
        sum= FFMAX(sum, FFABS(temp[i]));

    return sum;
}
2197 
/**
 * Quantization-error score: run the difference block through
 * quantize -> dequantize -> IDCT and return the SSE between the
 * round-tripped block and the original difference (a PSNR-style metric
 * of the quantization loss at the current qscale).
 */
static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    LOCAL_ALIGNED_16(DCTELEM, temp, [64*2]);
    DCTELEM * const bak = temp+64;  /* untouched copy of the difference */
    int sum=0, i;

    assert(h==8);
    s->mb_intra=0;

    s->dsp.diff_pixels(temp, src1, src2, stride);

    memcpy(bak, temp, 64*sizeof(DCTELEM));

    s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
    s->dct_unquantize_inter(s, temp, 0, s->qscale);
    ff_simple_idct_8(temp); //FIXME

    for(i=0; i<64; i++)
        sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);

    return sum;
}
2220 
/**
 * Rate-distortion score for an 8x8 block: DCT-quantize the difference
 * src1 - src2, estimate the bit cost of the coefficients with the
 * encoder's VLC length tables, then dequantize / IDCT and measure the
 * actual reconstruction SSE.  Returns distortion plus qscale²-weighted
 * bits (109/128 is an empirical weighting factor).
 */
static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    const uint8_t *scantable= s->intra_scantable.permutated;
    LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
    LOCAL_ALIGNED_16(uint8_t, lsrc1, [64]);
    LOCAL_ALIGNED_16(uint8_t, lsrc2, [64]);
    int i, last, run, bits, level, distortion, start_i;
    const int esc_length= s->ac_esc_length;
    uint8_t * length;
    uint8_t * last_length;

    assert(h==8);

    /* work on local copies so the idct_add reconstruction below does not
     * touch the caller's buffers */
    copy_block8(lsrc1, src1, 8, stride, 8);
    copy_block8(lsrc2, src2, 8, stride, 8);

    s->dsp.diff_pixels(temp, lsrc1, lsrc2, 8);

    s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);

    bits=0;

    if (s->mb_intra) {
        start_i = 1;  /* DC is coded separately */
        length     = s->intra_ac_vlc_length;
        last_length= s->intra_ac_vlc_last_length;
        bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
    } else {
        start_i = 0;
        length     = s->inter_ac_vlc_length;
        last_length= s->inter_ac_vlc_last_length;
    }

    if(last>=start_i){
        run=0;
        for(i=start_i; i<last; i++){
            int j= scantable[i];
            level= temp[j];

            if(level){
                level+=64;  /* bias into the table's 0..127 index range */
                if((level&(~127)) == 0){
                    bits+= length[UNI_AC_ENC_INDEX(run, level)];
                }else
                    bits+= esc_length;  /* out of range: escape coded */
                run=0;
            }else
                run++;
        }
        /* the last coefficient uses the dedicated "last" length table */
        i= scantable[last];

        level= temp[i] + 64;

        assert(level - 64);

        if((level&(~127)) == 0){
            bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
        }else
            bits+= esc_length;

    }

    if(last>=0){
        if(s->mb_intra)
            s->dct_unquantize_intra(s, temp, 0, s->qscale);
        else
            s->dct_unquantize_inter(s, temp, 0, s->qscale);
    }

    s->dsp.idct_add(lsrc2, 8, temp);

    distortion= s->dsp.sse[1](NULL, lsrc2, lsrc1, 8, 8);

    return distortion + ((bits*s->qscale*s->qscale*109 + 64)>>7);
}
2296 
/**
 * Bit-cost score for an 8x8 block: DCT-quantize the difference
 * src1 - src2 and return the estimated number of bits needed to code
 * the coefficients with the encoder's VLC length tables (no distortion
 * term — compare rd8x8_c above).
 */
static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    const uint8_t *scantable= s->intra_scantable.permutated;
    LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
    int i, last, run, bits, level, start_i;
    const int esc_length= s->ac_esc_length;
    uint8_t * length;
    uint8_t * last_length;

    assert(h==8);

    s->dsp.diff_pixels(temp, src1, src2, stride);

    s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);

    bits=0;

    if (s->mb_intra) {
        start_i = 1;  /* DC is coded separately */
        length     = s->intra_ac_vlc_length;
        last_length= s->intra_ac_vlc_last_length;
        bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
    } else {
        start_i = 0;
        length     = s->inter_ac_vlc_length;
        last_length= s->inter_ac_vlc_last_length;
    }

    if(last>=start_i){
        run=0;
        for(i=start_i; i<last; i++){
            int j= scantable[i];
            level= temp[j];

            if(level){
                level+=64;  /* bias into the table's 0..127 index range */
                if((level&(~127)) == 0){
                    bits+= length[UNI_AC_ENC_INDEX(run, level)];
                }else
                    bits+= esc_length;  /* out of range: escape coded */
                run=0;
            }else
                run++;
        }
        /* the last coefficient uses the dedicated "last" length table */
        i= scantable[last];

        level= temp[i] + 64;

        assert(level - 64);

        if((level&(~127)) == 0){
            bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
        }else
            bits+= esc_length;
    }

    return bits;
}
2355 
/* Generates vsad_intra8_c / vsad_intra16_c: sum of absolute
 * line-to-line differences within a single block — a cheap measure of
 * vertical activity (size is the block width). */
#define VSAD_INTRA(size) \
static int vsad_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
    int score=0;                                                              \
    int x,y;                                                                  \
                                                                              \
    for(y=1; y<h; y++){                                                       \
        for(x=0; x<size; x+=4){                                               \
            score+= FFABS(s[x  ] - s[x  +stride]) + FFABS(s[x+1] - s[x+1+stride]) \
                   +FFABS(s[x+2] - s[x+2+stride]) + FFABS(s[x+3] - s[x+3+stride]); \
        }                                                                     \
        s+= stride;                                                           \
    }                                                                         \
                                                                              \
    return score;                                                             \
}
VSAD_INTRA(8)
VSAD_INTRA(16)
2373 
/**
 * Vertical SAD of the difference signal over a 16-pixel-wide block:
 * sums |d(x,y) - d(x,y+1)| where d = s1 - s2, i.e. how much the
 * residual changes from one line to the next.
 */
static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int total = 0;
    int x, y;

    for (y = 1; y < h; y++) {
        for (x = 0; x < 16; x++) {
            int d = s1[x] - s2[x] - s1[x + stride] + s2[x + stride];
            total += FFABS(d);
        }
        s1 += stride;
        s2 += stride;
    }

    return total;
}
2388 
/* Square helper for the vertical-SSE metrics below. */
#define SQ(a) ((a)*(a))
/* Generates vsse_intra8_c / vsse_intra16_c: sum of squared
 * line-to-line differences within a single block (squared counterpart
 * of VSAD_INTRA above). */
#define VSSE_INTRA(size) \
static int vsse_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
    int score=0;                                                              \
    int x,y;                                                                  \
                                                                              \
    for(y=1; y<h; y++){                                                       \
        for(x=0; x<size; x+=4){                                               \
            score+= SQ(s[x  ] - s[x  +stride]) + SQ(s[x+1] - s[x+1+stride])   \
                   +SQ(s[x+2] - s[x+2+stride]) + SQ(s[x+3] - s[x+3+stride]);  \
        }                                                                     \
        s+= stride;                                                           \
    }                                                                         \
                                                                              \
    return score;                                                             \
}
VSSE_INTRA(8)
VSSE_INTRA(16)
2407 
/**
 * Vertical SSE of the difference signal over a 16-pixel-wide block:
 * sums squared line-to-line changes of s1 - s2 (see SQ above).
 */
static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int score=0;
    int x,y;

    for(y=1; y<h; y++){
        for(x=0; x<16; x++){
            score+= SQ(s1[x  ] - s2[x ] - s1[x  +stride] + s2[x +stride]);
        }
        s1+= stride;
        s2+= stride;
    }

    return score;
}
2422 
/** Sum of squared differences between an int8 and an int16 vector. */
static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
                               int size){
    int total = 0;
    int n;

    for (n = 0; n < size; n++) {
        int d = pix1[n] - pix2[n];
        total += d * d;
    }
    return total;
}
2431 
/* Build the 16x16 comparators from the 8x8 ones — WRAPPER8_16_SQ
 * (declared in dsputil.h) presumably sums the four 8x8 quadrant calls;
 * verify against the macro definition. */
WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
#if CONFIG_GPL
WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
#endif
WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c)
WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
WRAPPER8_16_SQ(rd8x8_c, rd16_c)
WRAPPER8_16_SQ(bit8x8_c, bit16_c)
2442 
/** Element-wise product: dst[i] = src0[i] * src1[i] for len floats. */
static void vector_fmul_c(float *dst, const float *src0, const float *src1, int len){
    int n = 0;
    while (n < len) {
        dst[n] = src0[n] * src1[n];
        n++;
    }
}
2448 
/** dst[i] = src0[i] * src1[len-1-i] — multiply by src1 in reverse order. */
static void vector_fmul_reverse_c(float *dst, const float *src0, const float *src1, int len){
    int n;
    for (n = 0; n < len; n++)
        dst[n] = src0[n] * src1[len - 1 - n];
}
2455 
/** Fused multiply-add: dst[i] = src0[i] * src1[i] + src2[i]. */
static void vector_fmul_add_c(float *dst, const float *src0, const float *src1, const float *src2, int len){
    int n = 0;
    while (n < len) {
        dst[n] = src0[n] * src1[n] + src2[n];
        n++;
    }
}
2461 
/**
 * Overlap-add windowing: combines len samples of src0 with len samples
 * of src1 (read in reverse) through the 2*len-entry window win,
 * producing 2*len output samples:
 *   dst[k]           = src0[k]*win[2*len-1-k] - src1[len-1-k]*win[k]
 *   dst[2*len-1-k]   = src0[k]*win[k]        + src1[len-1-k]*win[2*len-1-k]
 * The access order matches the original pointer-offset formulation, so
 * in-place use (dst aliasing src0) behaves identically.
 */
static void vector_fmul_window_c(float *dst, const float *src0,
                                 const float *src1, const float *win, int len)
{
    int k;
    for (k = 0; k < len; k++) {
        int   j  = len - 1 - k;
        float s0 = src0[k];
        float s1 = src1[j];
        float wk = win[k];
        float wj = win[len + j];
        dst[k]       = s0 * wj - s1 * wk;
        dst[len + j] = s0 * wk + s1 * wj;
    }
}
2478 
/** Scale a float vector: dst[i] = src[i] * mul. */
static void vector_fmul_scalar_c(float *dst, const float *src, float mul,
                                 int len)
{
    int n = 0;
    while (n < len) {
        dst[n] = src[n] * mul;
        n++;
    }
}
2486 
/** Multiply-accumulate a scaled vector: dst[i] += src[i] * mul. */
static void vector_fmac_scalar_c(float *dst, const float *src, float mul,
                                 int len)
{
    int n = 0;
    while (n < len) {
        dst[n] += src[n] * mul;
        n++;
    }
}
2494 
/**
 * In-place butterflies over two non-aliasing vectors:
 * v1[i], v2[i] := v1[i]+v2[i], v1[i]-v2[i].
 */
static void butterflies_float_c(float *restrict v1, float *restrict v2,
                                int len)
{
    int n;
    for (n = 0; n < len; n++) {
        float sum  = v1[n] + v2[n];
        float diff = v1[n] - v2[n];
        v1[n] = sum;
        v2[n] = diff;
    }
}
2505 
/**
 * Butterflies with interleaved output:
 * dst[2i] = src0[i]+src1[i], dst[2i+1] = src0[i]-src1[i].
 */
static void butterflies_float_interleave_c(float *dst, const float *src0,
                                           const float *src1, int len)
{
    int n;
    for (n = 0; n < len; n++) {
        dst[2 * n]     = src0[n] + src1[n];
        dst[2 * n + 1] = src0[n] - src1[n];
    }
}
2517 
/** Dot product of two float vectors of length len. */
static float scalarproduct_float_c(const float *v1, const float *v2, int len)
{
    float acc = 0.0;
    int n;

    for (n = 0; n < len; n++)
        acc += v1[n] * v2[n];

    return acc;
}
2528 
/**
 * Clip one float, handled as its raw 32-bit pattern, against the bit
 * patterns mini (of a negative min) and maxi (of a positive max) —
 * only valid for min < 0 < max (see vector_clipf_c).  For negative
 * floats the sign bit makes the unsigned value large, so a > mini
 * catches values below min; flipping the sign bit (a^(1U<<31)) orders
 * the remaining positive range for the comparison against maxisign.
 * NOTE(review): relies on IEEE-754 single-precision layout — confirm
 * for non-IEEE targets.
 */
static inline uint32_t clipf_c_one(uint32_t a, uint32_t mini,
                                   uint32_t maxi, uint32_t maxisign)
{

    if(a > mini) return mini;
    else if((a^(1U<<31)) > maxisign) return maxi;
    else return a;
}
2537 
/**
 * Clip len floats to [*min, *max] using the bit-pattern trick above;
 * requires *min < 0 < *max.  Processes 8 elements per iteration, so
 * callers pass len as a multiple of 8 (see vector_clipf_c).
 * NOTE(review): the uint32_t* casts type-pun float storage — a strict
 * aliasing violation in ISO C; presumably relied on project-wide
 * compiler flags.
 */
static void vector_clipf_c_opposite_sign(float *dst, const float *src, float *min, float *max, int len){
    int i;
    uint32_t mini = *(uint32_t*)min;
    uint32_t maxi = *(uint32_t*)max;
    uint32_t maxisign = maxi ^ (1U<<31);
    uint32_t *dsti = (uint32_t*)dst;
    const uint32_t *srci = (const uint32_t*)src;
    for(i=0; i<len; i+=8) {
        dsti[i + 0] = clipf_c_one(srci[i + 0], mini, maxi, maxisign);
        dsti[i + 1] = clipf_c_one(srci[i + 1], mini, maxi, maxisign);
        dsti[i + 2] = clipf_c_one(srci[i + 2], mini, maxi, maxisign);
        dsti[i + 3] = clipf_c_one(srci[i + 3], mini, maxi, maxisign);
        dsti[i + 4] = clipf_c_one(srci[i + 4], mini, maxi, maxisign);
        dsti[i + 5] = clipf_c_one(srci[i + 5], mini, maxi, maxisign);
        dsti[i + 6] = clipf_c_one(srci[i + 6], mini, maxi, maxisign);
        dsti[i + 7] = clipf_c_one(srci[i + 7], mini, maxi, maxisign);
    }
}
/**
 * Clip len floats to [min, max], 8 per iteration (len is expected to be
 * a multiple of 8).  When min < 0 < max the integer bit-pattern fast
 * path above is used; otherwise plain av_clipf.
 */
static void vector_clipf_c(float *dst, const float *src, float min, float max, int len){
    int i;
    if(min < 0 && max > 0) {
        vector_clipf_c_opposite_sign(dst, src, &min, &max, len);
    } else {
        for(i=0; i < len; i+=8) {
            dst[i    ] = av_clipf(src[i    ], min, max);
            dst[i + 1] = av_clipf(src[i + 1], min, max);
            dst[i + 2] = av_clipf(src[i + 2], min, max);
            dst[i + 3] = av_clipf(src[i + 3], min, max);
            dst[i + 4] = av_clipf(src[i + 4], min, max);
            dst[i + 5] = av_clipf(src[i + 5], min, max);
            dst[i + 6] = av_clipf(src[i + 6], min, max);
            dst[i + 7] = av_clipf(src[i + 7], min, max);
        }
    }
}
2573 
/**
 * Dot product of two int16 vectors, with each elementwise product
 * arithmetically shifted right by shift before accumulation.
 */
static int32_t scalarproduct_int16_c(const int16_t * v1, const int16_t * v2, int order, int shift)
{
    int acc = 0;
    int n;

    for (n = 0; n < order; n++)
        acc += (v1[n] * v2[n]) >> shift;

    return acc;
}
2583 
/**
 * Returns the dot product v1·v2 while simultaneously updating
 * v1[i] += mul * v3[i] in place.  Each element of v1 is read for the
 * product before it is updated.
 */
static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul)
{
    int acc = 0;
    int n;

    for (n = 0; n < order; n++) {
        acc   += v1[n] * v2[n];
        v1[n] += mul * v3[n];
    }
    return acc;
}
2593 
/**
 * Apply a symmetric Q15 window: for the front half,
 * output[i] = round(input[i] * window[i] / 2^15), and the same window
 * value is mirrored onto the matching sample from the back
 * (output[len-i-1]).  Only the first len/2 window entries are read.
 */
static void apply_window_int16_c(int16_t *output, const int16_t *input,
                                 const int16_t *window, unsigned int len)
{
    int i;
    int len2 = len >> 1;

    for (i = 0; i < len2; i++) {
        int16_t w = window[i];
        /* (1 << 14) rounds before the 15-bit downshift */
        output[i] = (MUL16(input[i], w) + (1 << 14)) >> 15;
        output[len-i-1] = (MUL16(input[len-i-1], w) + (1 << 14)) >> 15;
    }
}
2606 
/**
 * Clip each of len int32 values to [min, max], 8 per iteration.
 * NOTE(review): len is unsigned and the loop is do/while, so the caller
 * must pass a nonzero multiple of 8 or the loop over/underruns — verify
 * callers honor this contract.
 */
static void vector_clip_int32_c(int32_t *dst, const int32_t *src, int32_t min,
                                int32_t max, unsigned int len)
{
    do {
        *dst++ = av_clip(*src++, min, max);
        *dst++ = av_clip(*src++, min, max);
        *dst++ = av_clip(*src++, min, max);
        *dst++ = av_clip(*src++, min, max);
        *dst++ = av_clip(*src++, min, max);
        *dst++ = av_clip(*src++, min, max);
        *dst++ = av_clip(*src++, min, max);
        *dst++ = av_clip(*src++, min, max);
        len -= 8;
    } while (len > 0);
}
2622 
/* Fixed-point 1-D IDCT constants: W_k = round(2048*sqrt(2)*cos(k*pi/16)). */
#define W0 2048
#define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */
#define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */
#define W3 2408 /* 2048*sqrt (2)*cos (3*pi/16) */
#define W4 2048 /* 2048*sqrt (2)*cos (4*pi/16) */
#define W5 1609 /* 2048*sqrt (2)*cos (5*pi/16) */
#define W6 1108 /* 2048*sqrt (2)*cos (6*pi/16) */
#define W7 565  /* 2048*sqrt (2)*cos (7*pi/16) */

/**
 * 1-D row IDCT for the WMV2 variant: even/odd butterfly decomposition
 * with an 8-bit rounding shift at the end of each output.
 */
static void wmv2_idct_row(short * b)
{
    int s1,s2;
    int a0,a1,a2,a3,a4,a5,a6,a7;
    /*step 1*/
    a1 = W1*b[1]+W7*b[7];
    a7 = W7*b[1]-W1*b[7];
    a5 = W5*b[5]+W3*b[3];
    a3 = W3*b[5]-W5*b[3];
    a2 = W2*b[2]+W6*b[6];
    a6 = W6*b[2]-W2*b[6];
    a0 = W0*b[0]+W0*b[4];
    a4 = W0*b[0]-W0*b[4];
    /*step 2*/
    s1 = (181*(a1-a5+a7-a3)+128)>>8;//1,3,5,7,
    s2 = (181*(a1-a5-a7+a3)+128)>>8;
    /*step 3*/
    b[0] = (a0+a2+a1+a5 + (1<<7))>>8;
    b[1] = (a4+a6 +s1   + (1<<7))>>8;
    b[2] = (a4-a6 +s2   + (1<<7))>>8;
    b[3] = (a0-a2+a7+a3 + (1<<7))>>8;
    b[4] = (a0-a2-a7-a3 + (1<<7))>>8;
    b[5] = (a4-a6 -s2   + (1<<7))>>8;
    b[6] = (a4+a6 -s1   + (1<<7))>>8;
    b[7] = (a0+a2-a1-a5 + (1<<7))>>8;
}
/**
 * 1-D column IDCT for the WMV2 variant (stride 8 between samples).
 * Keeps extra precision through step 1 (>>3 with rounding) and applies
 * the final 14-bit rounding shift on output.
 */
static void wmv2_idct_col(short * b)
{
    int s1,s2;
    int a0,a1,a2,a3,a4,a5,a6,a7;
    /*step 1, with extended precision*/
    a1 = (W1*b[8*1]+W7*b[8*7] + 4)>>3;
    a7 = (W7*b[8*1]-W1*b[8*7] + 4)>>3;
    a5 = (W5*b[8*5]+W3*b[8*3] + 4)>>3;
    a3 = (W3*b[8*5]-W5*b[8*3] + 4)>>3;
    a2 = (W2*b[8*2]+W6*b[8*6] + 4)>>3;
    a6 = (W6*b[8*2]-W2*b[8*6] + 4)>>3;
    a0 = (W0*b[8*0]+W0*b[8*4]    )>>3;
    a4 = (W0*b[8*0]-W0*b[8*4]    )>>3;
    /*step 2*/
    s1 = (181*(a1-a5+a7-a3)+128)>>8;
    s2 = (181*(a1-a5-a7+a3)+128)>>8;
    /*step 3*/
    b[8*0] = (a0+a2+a1+a5 + (1<<13))>>14;
    b[8*1] = (a4+a6 +s1   + (1<<13))>>14;
    b[8*2] = (a4-a6 +s2   + (1<<13))>>14;
    b[8*3] = (a0-a2+a7+a3 + (1<<13))>>14;

    b[8*4] = (a0-a2-a7-a3 + (1<<13))>>14;
    b[8*5] = (a4-a6 -s2   + (1<<13))>>14;
    b[8*6] = (a4+a6 -s1   + (1<<13))>>14;
    b[8*7] = (a0+a2-a1-a5 + (1<<13))>>14;
}
/** Full 8x8 WMV2 IDCT: a 1-D pass over each row, then over each column. */
void ff_wmv2_idct_c(short * block){
    int row, col;

    for (row = 0; row < 8; row++)
        wmv2_idct_row(block + 8 * row);

    for (col = 0; col < 8; col++)
        wmv2_idct_col(block + col);
}
2695 /* XXX: those functions should be suppressed ASAP when all IDCTs are
2696  converted */
/* WMV2 IDCT followed by clamped store (put) / accumulate (add) into the
 * destination picture. */
static void ff_wmv2_idct_put_c(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_wmv2_idct_c(block);
    ff_put_pixels_clamped_c(block, dest, line_size);
}
static void ff_wmv2_idct_add_c(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_wmv2_idct_c(block);
    ff_add_pixels_clamped_c(block, dest, line_size);
}
/* Reference integer IDCT (j_rev_dct) followed by clamped store (put) /
 * accumulate (add). */
static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct (block);
    ff_put_pixels_clamped_c(block, dest, line_size);
}
static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct (block);
    ff_add_pixels_clamped_c(block, dest, line_size);
}
2717 
/* 4x4 reduced-resolution IDCT wrappers (selected for avctx->lowres==1,
 * see the init code below). */
static void ff_jref_idct4_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct4 (block);
    put_pixels_clamped4_c(block, dest, line_size);
}
static void ff_jref_idct4_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct4 (block);
    add_pixels_clamped4_c(block, dest, line_size);
}
2728 
/* 2x2 reduced-resolution IDCT wrappers (selected for avctx->lowres==2). */
static void ff_jref_idct2_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct2 (block);
    put_pixels_clamped2_c(block, dest, line_size);
}
static void ff_jref_idct2_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct2 (block);
    add_pixels_clamped2_c(block, dest, line_size);
}
2739 
/* 1x1 IDCT wrappers (avctx->lowres==3): only the DC coefficient
 * survives, so the output pixel is just (block[0] + 4) >> 3, clipped. */
static void ff_jref_idct1_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    dest[0] = av_clip_uint8((block[0] + 4)>>3);
}
static void ff_jref_idct1_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    dest[0] = av_clip_uint8(dest[0] + ((block[0] + 4)>>3));
}
2748 
/* No-op stub installed as the default prefetch function (see init below). */
static void just_return(void *mem av_unused, int stride av_unused, int h av_unused) { return; }
2750 
2751 /* init static data */
2753 {
2754  int i;
2755 
2756  for(i=0;i<256;i++) ff_cropTbl[i + MAX_NEG_CROP] = i;
2757  for(i=0;i<MAX_NEG_CROP;i++) {
2758  ff_cropTbl[i] = 0;
2759  ff_cropTbl[i + MAX_NEG_CROP + 256] = 255;
2760  }
2761 
2762  for(i=0;i<512;i++) {
2763  ff_squareTbl[i] = (i - 256) * (i - 256);
2764  }
2765 
2766  for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
2767 }
2768 
2770  static int did_fail=0;
2771  LOCAL_ALIGNED_16(int, aligned, [4]);
2772 
2773  if((intptr_t)aligned & 15){
2774  if(!did_fail){
2775 #if HAVE_MMX || HAVE_ALTIVEC
2777  "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
2778  "and may be very slow or crash. This is not a bug in libavcodec,\n"
2779  "but in the compiler. You may try recompiling using gcc >= 4.2.\n"
2780  "Do not report crashes to Libav developers.\n");
2781 #endif
2782  did_fail=1;
2783  }
2784  return -1;
2785  }
2786  return 0;
2787 }
2788 
2790 {
2791  int i, j;
2792 
2794 
2795 #if CONFIG_ENCODERS
2796  if (avctx->bits_per_raw_sample == 10) {
2799  } else {
2800  if(avctx->dct_algo==FF_DCT_FASTINT) {
2801  c->fdct = fdct_ifast;
2802  c->fdct248 = fdct_ifast248;
2803  }
2804  else if(avctx->dct_algo==FF_DCT_FAAN) {
2805  c->fdct = ff_faandct;
2806  c->fdct248 = ff_faandct248;
2807  }
2808  else {
2809  c->fdct = ff_jpeg_fdct_islow_8; //slow/accurate/default
2811  }
2812  }
2813 #endif //CONFIG_ENCODERS
2814 
2815  if(avctx->lowres==1){
2818  c->idct = j_rev_dct4;
2820  }else if(avctx->lowres==2){
2823  c->idct = j_rev_dct2;
2825  }else if(avctx->lowres==3){
2828  c->idct = j_rev_dct1;
2830  }else{
2831  if (avctx->bits_per_raw_sample == 10) {
2834  c->idct = ff_simple_idct_10;
2836  } else {
2837  if(avctx->idct_algo==FF_IDCT_INT){
2840  c->idct = j_rev_dct;
2843  avctx->idct_algo==FF_IDCT_VP3){
2846  c->idct = ff_vp3_idct_c;
2848  }else if(avctx->idct_algo==FF_IDCT_WMV2){
2851  c->idct = ff_wmv2_idct_c;
2853  }else if(avctx->idct_algo==FF_IDCT_FAAN){
2856  c->idct = ff_faanidct;
2858  }else if(CONFIG_EATGQ_DECODER && avctx->idct_algo==FF_IDCT_EA) {
2861  }else{ //accurate/default
2864  c->idct = ff_simple_idct_8;
2866  }
2867  }
2868  }
2869 
2875  c->gmc1 = gmc1_c;
2876  c->gmc = ff_gmc_c;
2877  c->pix_sum = pix_sum_c;
2878  c->pix_norm1 = pix_norm1_c;
2879 
2881  c->fill_block_tab[1] = fill_block8_c;
2882 
2883  /* TODO [0] 16 [1] 8 */
2884  c->pix_abs[0][0] = pix_abs16_c;
2885  c->pix_abs[0][1] = pix_abs16_x2_c;
2886  c->pix_abs[0][2] = pix_abs16_y2_c;
2887  c->pix_abs[0][3] = pix_abs16_xy2_c;
2888  c->pix_abs[1][0] = pix_abs8_c;
2889  c->pix_abs[1][1] = pix_abs8_x2_c;
2890  c->pix_abs[1][2] = pix_abs8_y2_c;
2891  c->pix_abs[1][3] = pix_abs8_xy2_c;
2892 
2902 
2912 
2913 #define dspfunc(PFX, IDX, NUM) \
2914  c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
2915  c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
2916  c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
2917  c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
2918  c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
2919  c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
2920  c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
2921  c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
2922  c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
2923  c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
2924  c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
2925  c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
2926  c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
2927  c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
2928  c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
2929  c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
2930 
2931  dspfunc(put_qpel, 0, 16);
2932  dspfunc(put_no_rnd_qpel, 0, 16);
2933 
2934  dspfunc(avg_qpel, 0, 16);
2935  /* dspfunc(avg_no_rnd_qpel, 0, 16); */
2936 
2937  dspfunc(put_qpel, 1, 8);
2938  dspfunc(put_no_rnd_qpel, 1, 8);
2939 
2940  dspfunc(avg_qpel, 1, 8);
2941  /* dspfunc(avg_no_rnd_qpel, 1, 8); */
2942 
2943 #undef dspfunc
2944 
2945 #if CONFIG_MLP_DECODER || CONFIG_TRUEHD_DECODER
2946  ff_mlp_init(c, avctx);
2947 #endif
2948 #if CONFIG_WMV2_DECODER || CONFIG_VC1_DECODER
2949  ff_intrax8dsp_init(c,avctx);
2950 #endif
2951 

/* Install a block-compare function pair into the DSPContext:
 * slot [0] is the 16x16 variant, slot [1] the 8x8 variant.
 * Token pasting builds e.g. dct_sad16_c / dct_sad8x8_c. */
#define SET_CMP_FUNC(name) \
    c->name[0]= name ## 16_c;\
    c->name[1]= name ## 8x8_c;

    SET_CMP_FUNC(hadamard8_diff)
    c->hadamard8_diff[4]= hadamard8_intra16_c;
    /* NOTE(review): a line setting another hadamard8_diff intra slot is
     * elided in this excerpt. */
    SET_CMP_FUNC(dct_sad)
    SET_CMP_FUNC(dct_max)
#if CONFIG_GPL
    SET_CMP_FUNC(dct264_sad)
#endif
    /* Plain sum-of-absolute-differences and sum-of-squared-errors
     * comparators; sse[2] handles 4-pixel-wide blocks. */
    c->sad[0]= pix_abs16_c;
    c->sad[1]= pix_abs8_c;
    c->sse[0]= sse16_c;
    c->sse[1]= sse8_c;
    c->sse[2]= sse4_c;
    SET_CMP_FUNC(quant_psnr)
    SET_CMP_FUNC(rd)
    SET_CMP_FUNC(bit)
    /* Vertical SAD/SSE comparators; slots 4/5 are the intra variants. */
    c->vsad[0]= vsad16_c;
    c->vsad[4]= vsad_intra16_c;
    c->vsad[5]= vsad_intra8_c;
    c->vsse[0]= vsse16_c;
    c->vsse[4]= vsse_intra16_c;
    c->vsse[5]= vsse_intra8_c;
    c->nsse[0]= nsse16_c;
    c->nsse[1]= nsse8_c;
#if CONFIG_DWT
    /* NOTE(review): the DWT comparator init call is elided in this
     * excerpt. */
#endif
2994 
    /* Byte-wise helpers and endianness swaps used by lossless codecs.
     * NOTE(review): several assignments between these lines (e.g. the
     * diff/add HuffYUV helpers) are elided in this excerpt. */
    c->add_bytes= add_bytes_c;
    c->bswap_buf= bswap_buf;
    c->bswap16_buf = bswap16_buf;
#if CONFIG_PNG_DECODER
    /* NOTE(review): the PNG predictor assignment is elided in this
     * excerpt. */
#endif

    /* NOTE(review): the opening `if (...) {` matching this brace (an
     * H.263-related conditional) is elided in this excerpt. */
    }

    if (CONFIG_VP3_DECODER) {
        /* NOTE(review): the VP3 IDCT-add/put assignments inside this
         * conditional are elided in this excerpt. */
    }

    /* Decoder-specific float helpers, compiled in on demand.
     * NOTE(review): the bodies of these conditionals are elided here. */
#if CONFIG_VORBIS_DECODER
#endif
#if CONFIG_AC3_DECODER
#endif

    /* Plane down-scalers: shrink[n] reduces by a factor of 2^n per axis;
     * factor 1 is a plain copy. */
    c->shrink[0]= av_image_copy_plane;
    c->shrink[1]= ff_shrink22;
    c->shrink[2]= ff_shrink44;
    c->shrink[3]= ff_shrink88;

    /* Default prefetch is a no-op; arch-specific init may override it. */
    c->prefetch= just_return;

    /* Clear the 2-tap qpel tables so the fallback loop below can detect
     * which entries an arch-specific init did NOT fill. */
    memset(c->put_2tap_qpel_pixels_tab, 0, sizeof(c->put_2tap_qpel_pixels_tab));
    memset(c->avg_2tap_qpel_pixels_tab, 0, sizeof(c->avg_2tap_qpel_pixels_tab));
3054 
/* Name builders for the bit-depth-templated implementations generated by
 * dsputil_template.c: FUNC appends "_<depth>", FUNCC "_<depth>_c". */
#undef FUNC
#undef FUNCC
#define FUNC(f, depth) f ## _ ## depth
#define FUNCC(f, depth) f ## _ ## depth ## _c

/* Fill a half-pel pixel-op table row: plain copy plus the x2/y2/xy2
 * half-pel interpolated variants, for the given block size and depth. */
#define dspfunc1(PFX, IDX, NUM, depth)\
    c->PFX ## _pixels_tab[IDX][0] = FUNCC(PFX ## _pixels ## NUM        , depth);\
    c->PFX ## _pixels_tab[IDX][1] = FUNCC(PFX ## _pixels ## NUM ## _x2 , depth);\
    c->PFX ## _pixels_tab[IDX][2] = FUNCC(PFX ## _pixels ## NUM ## _y2 , depth);\
    c->PFX ## _pixels_tab[IDX][3] = FUNCC(PFX ## _pixels ## NUM ## _xy2, depth)

/* Fill a quarter-pel table row (all 16 mcXY sub-pixel positions) with the
 * bit-depth-templated C implementations; depth-aware counterpart of the
 * earlier dspfunc macro. */
#define dspfunc2(PFX, IDX, NUM, depth)\
    c->PFX ## _pixels_tab[IDX][ 0] = FUNCC(PFX ## NUM ## _mc00, depth);\
    c->PFX ## _pixels_tab[IDX][ 1] = FUNCC(PFX ## NUM ## _mc10, depth);\
    c->PFX ## _pixels_tab[IDX][ 2] = FUNCC(PFX ## NUM ## _mc20, depth);\
    c->PFX ## _pixels_tab[IDX][ 3] = FUNCC(PFX ## NUM ## _mc30, depth);\
    c->PFX ## _pixels_tab[IDX][ 4] = FUNCC(PFX ## NUM ## _mc01, depth);\
    c->PFX ## _pixels_tab[IDX][ 5] = FUNCC(PFX ## NUM ## _mc11, depth);\
    c->PFX ## _pixels_tab[IDX][ 6] = FUNCC(PFX ## NUM ## _mc21, depth);\
    c->PFX ## _pixels_tab[IDX][ 7] = FUNCC(PFX ## NUM ## _mc31, depth);\
    c->PFX ## _pixels_tab[IDX][ 8] = FUNCC(PFX ## NUM ## _mc02, depth);\
    c->PFX ## _pixels_tab[IDX][ 9] = FUNCC(PFX ## NUM ## _mc12, depth);\
    c->PFX ## _pixels_tab[IDX][10] = FUNCC(PFX ## NUM ## _mc22, depth);\
    c->PFX ## _pixels_tab[IDX][11] = FUNCC(PFX ## NUM ## _mc32, depth);\
    c->PFX ## _pixels_tab[IDX][12] = FUNCC(PFX ## NUM ## _mc03, depth);\
    c->PFX ## _pixels_tab[IDX][13] = FUNCC(PFX ## NUM ## _mc13, depth);\
    c->PFX ## _pixels_tab[IDX][14] = FUNCC(PFX ## NUM ## _mc23, depth);\
    c->PFX ## _pixels_tab[IDX][15] = FUNCC(PFX ## NUM ## _mc33, depth)
3083 

/* Install every bit-depth-dependent function pointer for the given pixel
 * depth (8/9/10). The dct argument selects the DCT coefficient width
 * suffix (_16 or _32) for the functions whose block element size depends
 * on it (get_pixels, clear_block(s), add_pixels*). */
#define BIT_DEPTH_FUNCS(depth, dct)\
    c->get_pixels                    = FUNCC(get_pixels   ## dct , depth);\
    c->draw_edges                    = FUNCC(draw_edges          , depth);\
    c->emulated_edge_mc              = FUNC (ff_emulated_edge_mc , depth);\
    c->clear_block                   = FUNCC(clear_block  ## dct , depth);\
    c->clear_blocks                  = FUNCC(clear_blocks ## dct , depth);\
    c->add_pixels8                   = FUNCC(add_pixels8  ## dct , depth);\
    c->add_pixels4                   = FUNCC(add_pixels4  ## dct , depth);\
    c->put_no_rnd_pixels_l2[0]       = FUNCC(put_no_rnd_pixels16_l2, depth);\
    c->put_no_rnd_pixels_l2[1]       = FUNCC(put_no_rnd_pixels8_l2 , depth);\
\
    c->put_h264_chroma_pixels_tab[0] = FUNCC(put_h264_chroma_mc8 , depth);\
    c->put_h264_chroma_pixels_tab[1] = FUNCC(put_h264_chroma_mc4 , depth);\
    c->put_h264_chroma_pixels_tab[2] = FUNCC(put_h264_chroma_mc2 , depth);\
    c->avg_h264_chroma_pixels_tab[0] = FUNCC(avg_h264_chroma_mc8 , depth);\
    c->avg_h264_chroma_pixels_tab[1] = FUNCC(avg_h264_chroma_mc4 , depth);\
    c->avg_h264_chroma_pixels_tab[2] = FUNCC(avg_h264_chroma_mc2 , depth);\
\
    /* Half-pel tables: indices 0..3 are 16/8/4/2-pixel-wide blocks. */\
    dspfunc1(put       , 0, 16, depth);\
    dspfunc1(put       , 1,  8, depth);\
    dspfunc1(put       , 2,  4, depth);\
    dspfunc1(put       , 3,  2, depth);\
    dspfunc1(put_no_rnd, 0, 16, depth);\
    dspfunc1(put_no_rnd, 1,  8, depth);\
    dspfunc1(avg       , 0, 16, depth);\
    dspfunc1(avg       , 1,  8, depth);\
    dspfunc1(avg       , 2,  4, depth);\
    dspfunc1(avg       , 3,  2, depth);\
    dspfunc1(avg_no_rnd, 0, 16, depth);\
    dspfunc1(avg_no_rnd, 1,  8, depth);\
\
    /* H.264 quarter-pel tables for 16/8/4/2-wide blocks (no 2-wide avg). */\
    dspfunc2(put_h264_qpel, 0, 16, depth);\
    dspfunc2(put_h264_qpel, 1,  8, depth);\
    dspfunc2(put_h264_qpel, 2,  4, depth);\
    dspfunc2(put_h264_qpel, 3,  2, depth);\
    dspfunc2(avg_h264_qpel, 0, 16, depth);\
    dspfunc2(avg_h264_qpel, 1,  8, depth);\
    dspfunc2(avg_h264_qpel, 2,  4, depth);
3123 
    /* Select the function set matching the stream's pixel bit depth.
     * For 9/10-bit content the DCT coefficient width (16 vs 32 bit) also
     * selects between the _16 and _32 template instantiations. */
    switch (avctx->bits_per_raw_sample) {
    case 9:
        if (c->dct_bits == 32) {
            BIT_DEPTH_FUNCS(9, _32);
        } else {
            BIT_DEPTH_FUNCS(9, _16);
        }
        break;
    case 10:
        if (c->dct_bits == 32) {
            BIT_DEPTH_FUNCS(10, _32);
        } else {
            BIT_DEPTH_FUNCS(10, _16);
        }
        break;
    default:
        av_log(avctx, AV_LOG_DEBUG, "Unsupported bit depth: %d\n", avctx->bits_per_raw_sample);
        /* fall through: unsupported depths use the 8-bit functions */
    case 8:
        BIT_DEPTH_FUNCS(8, _16);
        break;
    }
3145 

    /* Let each enabled architecture override the C defaults with its
     * optimized implementations (the ARCH_*/HAVE_*/CONFIG_* symbols are
     * compile-time 0/1, so dead branches are removed by the compiler). */
    if (HAVE_MMX)        dsputil_init_mmx   (c, avctx);
    if (ARCH_ARM)        dsputil_init_arm   (c, avctx);
    if (CONFIG_MLIB)     dsputil_init_mlib  (c, avctx);
    if (HAVE_VIS)        dsputil_init_vis   (c, avctx);
    if (ARCH_ALPHA)      dsputil_init_alpha (c, avctx);
    if (ARCH_PPC)        dsputil_init_ppc   (c, avctx);
    if (HAVE_MMI)        dsputil_init_mmi   (c, avctx);
    if (ARCH_SH4)        dsputil_init_sh4   (c, avctx);
    if (ARCH_BFIN)       dsputil_init_bfin  (c, avctx);

    /* Any 2-tap qpel entry left NULL by the arch inits (the tables were
     * zeroed above) falls back to the matching H.264 qpel function. */
    for (i = 0; i < 4; i++) {
        for (j = 0; j < 16; j++) {
            if(!c->put_2tap_qpel_pixels_tab[i][j])
                c->put_2tap_qpel_pixels_tab[i][j] =
                    c->put_h264_qpel_pixels_tab[i][j];
            if(!c->avg_2tap_qpel_pixels_tab[i][j])
                c->avg_2tap_qpel_pixels_tab[i][j] =
                    c->avg_h264_qpel_pixels_tab[i][j];
        }
    }

    /* NOTE(review): trailing statements before this closing brace are
     * elided in this excerpt. */
}