postprocess.c
Go to the documentation of this file.
1 /*
2  * Copyright (C) 2001-2003 Michael Niedermayer (michaelni@gmx.at)
3  *
4  * AltiVec optimizations (C) 2004 Romain Dolbeau <romain@dolbeau.org>
5  *
6  * This file is part of Libav.
7  *
8  * Libav is free software; you can redistribute it and/or modify
9  * it under the terms of the GNU General Public License as published by
10  * the Free Software Foundation; either version 2 of the License, or
11  * (at your option) any later version.
12  *
13  * Libav is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  * GNU General Public License for more details.
17  *
18  * You should have received a copy of the GNU General Public License
19  * along with Libav; if not, write to the Free Software
20  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21  */
22 
28 /*
29  C MMX MMX2 3DNow AltiVec
30 isVertDC Ec Ec Ec
31 isVertMinMaxOk Ec Ec Ec
32 doVertLowPass E e e Ec
33 doVertDefFilter Ec Ec e e Ec
34 isHorizDC Ec Ec Ec
35 isHorizMinMaxOk a E Ec
36 doHorizLowPass E e e Ec
37 doHorizDefFilter Ec Ec e e Ec
38 do_a_deblock Ec E Ec E
39 deRing E e e* Ecp
40 Vertical RKAlgo1 E a a
41 Horizontal RKAlgo1 a a
42 Vertical X1# a E E
43 Horizontal X1# a E E
44 LinIpolDeinterlace e E E*
45 CubicIpolDeinterlace a e e*
46 LinBlendDeinterlace e E E*
47 MedianDeinterlace# E Ec Ec
48 TempDeNoiser# E e e Ec
49 
50 * I do not have a 3DNow! CPU -> it is untested, but no one said it does not work so it seems to work
51 # more or less selfinvented filters so the exactness is not too meaningful
52 E = Exact implementation
53 e = almost exact implementation (slightly different rounding,...)
54 a = alternative / approximate impl
55 c = checked against the other implementations (-vo md5)
56 p = partially optimized, still some work to do
57 */
58 
59 /*
60 TODO:
61 reduce the time wasted on the mem transfer
62 unroll stuff if instructions depend too much on the prior one
63 move YScale thing to the end instead of fixing QP
64 write a faster and higher quality deblocking filter :)
65 make the main loop more flexible (variable number of blocks at once;
66  the per-block if/else handling is slowing things down)
67 compare the quality & speed of all filters
68 split this huge file
69 optimize c versions
70 try to unroll inner for(x=0 ... loop to avoid these damn if(x ... checks
71 ...
72 */
73 
74 //Changelog: use git log
75 
76 #include "config.h"
77 #include "libavutil/avutil.h"
78 #include <inttypes.h>
79 #include <stdio.h>
80 #include <stdlib.h>
81 #include <string.h>
82 //#undef HAVE_MMX2
83 //#define HAVE_AMD3DNOW
84 //#undef HAVE_MMX
85 //#undef ARCH_X86
86 //#define DEBUG_BRIGHTNESS
87 #include "postprocess.h"
88 #include "postprocess_internal.h"
89 #include "libavutil/avstring.h"
90 
91 unsigned postproc_version(void)
92 {
94 }
95 
96 const char *postproc_configuration(void)
97 {
98  return LIBAV_CONFIGURATION;
99 }
100 
101 const char *postproc_license(void)
102 {
103 #define LICENSE_PREFIX "libpostproc license: "
104  return LICENSE_PREFIX LIBAV_LICENSE + sizeof(LICENSE_PREFIX) - 1;
105 }
106 
107 #if HAVE_ALTIVEC_H
108 #include <altivec.h>
109 #endif
110 
/* Size in bytes of the scratch buffer the mode string is copied into
 * by pp_get_mode_by_name_and_quality(). */
111 #define GET_MODE_BUFFER_SIZE 500
/* Capacity of the per-filter array of not-yet-recognised options. */
112 #define OPTIONS_ARRAY_SIZE 10
/* Number of rows/columns visited by the C block filters in this file. */
113 #define BLOCK_SIZE 8
114 #define TEMP_STRIDE 8
115 //#define NUM_BLOCKS_AT_ONCE 16 //not used yet
116 
/* 64-bit constants for the x86 code paths: wNN replicate a 16-bit value
 * four times, bNN replicate a byte value eight times.
 * NOTE(review): presumably consumed by the inline-asm templates -- confirm. */
117 #if ARCH_X86
118 DECLARE_ASM_CONST(8, uint64_t, w05)= 0x0005000500050005LL;
119 DECLARE_ASM_CONST(8, uint64_t, w04)= 0x0004000400040004LL;
120 DECLARE_ASM_CONST(8, uint64_t, w20)= 0x0020002000200020LL;
121 DECLARE_ASM_CONST(8, uint64_t, b00)= 0x0000000000000000LL;
122 DECLARE_ASM_CONST(8, uint64_t, b01)= 0x0101010101010101LL;
123 DECLARE_ASM_CONST(8, uint64_t, b02)= 0x0202020202020202LL;
124 DECLARE_ASM_CONST(8, uint64_t, b08)= 0x0808080808080808LL;
125 DECLARE_ASM_CONST(8, uint64_t, b80)= 0x8080808080808080LL;
126 #endif
127 
/* NOTE(review): not referenced in this file; presumably the dering
 * threshold used by the included template -- confirm. */
128 DECLARE_ASM_CONST(8, int, deringThreshold)= 20;
129 
130 
131 static struct PPFilter filters[]=
132 {
133  {"hb", "hdeblock", 1, 1, 3, H_DEBLOCK},
134  {"vb", "vdeblock", 1, 2, 4, V_DEBLOCK},
135 /* {"hr", "rkhdeblock", 1, 1, 3, H_RK1_FILTER},
136  {"vr", "rkvdeblock", 1, 2, 4, V_RK1_FILTER},*/
137  {"h1", "x1hdeblock", 1, 1, 3, H_X1_FILTER},
138  {"v1", "x1vdeblock", 1, 2, 4, V_X1_FILTER},
139  {"ha", "ahdeblock", 1, 1, 3, H_A_DEBLOCK},
140  {"va", "avdeblock", 1, 2, 4, V_A_DEBLOCK},
141  {"dr", "dering", 1, 5, 6, DERING},
142  {"al", "autolevels", 0, 1, 2, LEVEL_FIX},
143  {"lb", "linblenddeint", 1, 1, 4, LINEAR_BLEND_DEINT_FILTER},
144  {"li", "linipoldeint", 1, 1, 4, LINEAR_IPOL_DEINT_FILTER},
145  {"ci", "cubicipoldeint", 1, 1, 4, CUBIC_IPOL_DEINT_FILTER},
146  {"md", "mediandeint", 1, 1, 4, MEDIAN_DEINT_FILTER},
147  {"fd", "ffmpegdeint", 1, 1, 4, FFMPEG_DEINT_FILTER},
148  {"l5", "lowpass5", 1, 1, 4, LOWPASS5_DEINT_FILTER},
149  {"tn", "tmpnoise", 1, 7, 8, TEMP_NOISE_FILTER},
150  {"fq", "forcequant", 1, 0, 0, FORCE_QUANT},
151  {NULL, NULL,0,0,0,0} //End Marker
152 };
153 
/**
 * Alias expansion table for mode strings.
 *
 * Entries come in pairs: element 2*i is an alias name and element 2*i+1 is
 * the filter list it expands to. A single NULL in an alias slot ends the
 * table. Both the strings and the pointer array itself are read-only.
 */
static const char * const replaceTable[]=
{
    "default", "hb:a,vb:a,dr:a",
    "de",      "hb:a,vb:a,dr:a",
    "fast",    "h1:a,v1:a,dr:a",
    "fa",      "h1:a,v1:a,dr:a",
    "ac",      "ha:a:128:7,va:a,dr:a",
    NULL //End Marker
};
163 
164 
165 #if ARCH_X86
/** Issue the x86 PREFETCHNTA instruction for the address held in p. */
static inline void prefetchnta(void *p)
{
    __asm__ volatile(
        "prefetchnta (%0)\n\t"
        : /* no outputs */
        : "r" (p)
    );
}
172 
/** Issue the x86 PREFETCHT0 instruction for the address held in p. */
static inline void prefetcht0(void *p)
{
    __asm__ volatile(
        "prefetcht0 (%0)\n\t"
        : /* no outputs */
        : "r" (p)
    );
}
179 
/** Issue the x86 PREFETCHT1 instruction for the address held in p. */
static inline void prefetcht1(void *p)
{
    __asm__ volatile(
        "prefetcht1 (%0)\n\t"
        : /* no outputs */
        : "r" (p)
    );
}
186 
/** Issue the x86 PREFETCHT2 instruction for the address held in p. */
static inline void prefetcht2(void *p)
{
    __asm__ volatile(
        "prefetcht2 (%0)\n\t"
        : /* no outputs */
        : "r" (p)
    );
}
193 #endif
194 
195 /* The horizontal functions exist only in C because the MMX
196  * code is faster with vertical filters and transposing. */
197 
201 static inline int isHorizDC_C(uint8_t src[], int stride, PPContext *c)
202 {
203  int numEq= 0;
204  int y;
205  const int dcOffset= ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1;
206  const int dcThreshold= dcOffset*2 + 1;
207 
208  for(y=0; y<BLOCK_SIZE; y++){
209  if(((unsigned)(src[0] - src[1] + dcOffset)) < dcThreshold) numEq++;
210  if(((unsigned)(src[1] - src[2] + dcOffset)) < dcThreshold) numEq++;
211  if(((unsigned)(src[2] - src[3] + dcOffset)) < dcThreshold) numEq++;
212  if(((unsigned)(src[3] - src[4] + dcOffset)) < dcThreshold) numEq++;
213  if(((unsigned)(src[4] - src[5] + dcOffset)) < dcThreshold) numEq++;
214  if(((unsigned)(src[5] - src[6] + dcOffset)) < dcThreshold) numEq++;
215  if(((unsigned)(src[6] - src[7] + dcOffset)) < dcThreshold) numEq++;
216  src+= stride;
217  }
218  return numEq > c->ppMode.flatnessThreshold;
219 }
220 
224 static inline int isVertDC_C(uint8_t src[], int stride, PPContext *c)
225 {
226  int numEq= 0;
227  int y;
228  const int dcOffset= ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1;
229  const int dcThreshold= dcOffset*2 + 1;
230 
231  src+= stride*4; // src points to begin of the 8x8 Block
232  for(y=0; y<BLOCK_SIZE-1; y++){
233  if(((unsigned)(src[0] - src[0+stride] + dcOffset)) < dcThreshold) numEq++;
234  if(((unsigned)(src[1] - src[1+stride] + dcOffset)) < dcThreshold) numEq++;
235  if(((unsigned)(src[2] - src[2+stride] + dcOffset)) < dcThreshold) numEq++;
236  if(((unsigned)(src[3] - src[3+stride] + dcOffset)) < dcThreshold) numEq++;
237  if(((unsigned)(src[4] - src[4+stride] + dcOffset)) < dcThreshold) numEq++;
238  if(((unsigned)(src[5] - src[5+stride] + dcOffset)) < dcThreshold) numEq++;
239  if(((unsigned)(src[6] - src[6+stride] + dcOffset)) < dcThreshold) numEq++;
240  if(((unsigned)(src[7] - src[7+stride] + dcOffset)) < dcThreshold) numEq++;
241  src+= stride;
242  }
243  return numEq > c->ppMode.flatnessThreshold;
244 }
245 
/**
 * Sparse range check on 8 consecutive rows.
 *
 * One pair of columns is probed per row; the pattern of four pairs is
 * applied to rows 0-3 and again to rows 4-7. The check fails as soon as a
 * probed difference (biased by 2*QP) exceeds 4*QP in the unsigned compare.
 *
 * @param src    first pixel of the first row
 * @param stride distance between two rows
 * @param QP     quantiser controlling the allowed range
 * @return 1 if every probed pair is within range, 0 otherwise
 */
static inline int isHorizMinMaxOk_C(uint8_t src[], int stride, int QP)
{
    /* {minuend column, subtrahend column} for rows 0..3 of each half */
    static const int probe[4][2] = { {0, 5}, {2, 7}, {4, 1}, {6, 3} };
    int row;

    for (row = 0; row < 8; row++) {
        const int *cols = probe[row & 3];
        const uint8_t *line = src + row*stride;

        if ((unsigned)(line[cols[0]] - line[cols[1]] + 2*QP) > 4*QP)
            return 0;
    }
    return 1;
}
261 
262 static inline int isVertMinMaxOk_C(uint8_t src[], int stride, int QP)
263 {
264  int x;
265  src+= stride*4;
266  for(x=0; x<BLOCK_SIZE; x+=4){
267  if((unsigned)(src[ x + 0*stride] - src[ x + 5*stride] + 2*QP) > 4*QP) return 0;
268  if((unsigned)(src[1+x + 2*stride] - src[1+x + 7*stride] + 2*QP) > 4*QP) return 0;
269  if((unsigned)(src[2+x + 4*stride] - src[2+x + 1*stride] + 2*QP) > 4*QP) return 0;
270  if((unsigned)(src[3+x + 6*stride] - src[3+x + 3*stride] + 2*QP) > 4*QP) return 0;
271  }
272  return 1;
273 }
274 
275 static inline int horizClassify_C(uint8_t src[], int stride, PPContext *c)
276 {
277  if( isHorizDC_C(src, stride, c) ){
278  if( isHorizMinMaxOk_C(src, stride, c->QP) )
279  return 1;
280  else
281  return 0;
282  }else{
283  return 2;
284  }
285 }
286 
287 static inline int vertClassify_C(uint8_t src[], int stride, PPContext *c)
288 {
289  if( isVertDC_C(src, stride, c) ){
290  if( isVertMinMaxOk_C(src, stride, c->QP) )
291  return 1;
292  else
293  return 0;
294  }else{
295  return 2;
296  }
297 }
298 
299 static inline void doHorizDefFilter_C(uint8_t dst[], int stride, PPContext *c)
300 {
301  int y;
302  for(y=0; y<BLOCK_SIZE; y++){
303  const int middleEnergy= 5*(dst[4] - dst[3]) + 2*(dst[2] - dst[5]);
304 
305  if(FFABS(middleEnergy) < 8*c->QP){
306  const int q=(dst[3] - dst[4])/2;
307  const int leftEnergy= 5*(dst[2] - dst[1]) + 2*(dst[0] - dst[3]);
308  const int rightEnergy= 5*(dst[6] - dst[5]) + 2*(dst[4] - dst[7]);
309 
310  int d= FFABS(middleEnergy) - FFMIN( FFABS(leftEnergy), FFABS(rightEnergy) );
311  d= FFMAX(d, 0);
312 
313  d= (5*d + 32) >> 6;
314  d*= FFSIGN(-middleEnergy);
315 
316  if(q>0)
317  {
318  d= d<0 ? 0 : d;
319  d= d>q ? q : d;
320  }
321  else
322  {
323  d= d>0 ? 0 : d;
324  d= d<q ? q : d;
325  }
326 
327  dst[3]-= d;
328  dst[4]+= d;
329  }
330  dst+= stride;
331  }
332 }
333 
338 static inline void doHorizLowPass_C(uint8_t dst[], int stride, PPContext *c)
339 {
340  int y;
341  for(y=0; y<BLOCK_SIZE; y++){
342  const int first= FFABS(dst[-1] - dst[0]) < c->QP ? dst[-1] : dst[0];
343  const int last= FFABS(dst[8] - dst[7]) < c->QP ? dst[8] : dst[7];
344 
345  int sums[10];
346  sums[0] = 4*first + dst[0] + dst[1] + dst[2] + 4;
347  sums[1] = sums[0] - first + dst[3];
348  sums[2] = sums[1] - first + dst[4];
349  sums[3] = sums[2] - first + dst[5];
350  sums[4] = sums[3] - first + dst[6];
351  sums[5] = sums[4] - dst[0] + dst[7];
352  sums[6] = sums[5] - dst[1] + last;
353  sums[7] = sums[6] - dst[2] + last;
354  sums[8] = sums[7] - dst[3] + last;
355  sums[9] = sums[8] - dst[4] + last;
356 
357  dst[0]= (sums[0] + sums[2] + 2*dst[0])>>4;
358  dst[1]= (sums[1] + sums[3] + 2*dst[1])>>4;
359  dst[2]= (sums[2] + sums[4] + 2*dst[2])>>4;
360  dst[3]= (sums[3] + sums[5] + 2*dst[3])>>4;
361  dst[4]= (sums[4] + sums[6] + 2*dst[4])>>4;
362  dst[5]= (sums[5] + sums[7] + 2*dst[5])>>4;
363  dst[6]= (sums[6] + sums[8] + 2*dst[6])>>4;
364  dst[7]= (sums[7] + sums[9] + 2*dst[7])>>4;
365 
366  dst+= stride;
367  }
368 }
369 
378 static inline void horizX1Filter(uint8_t *src, int stride, int QP)
379 {
380  int y;
381  static uint64_t *lut= NULL;
382  if(lut==NULL)
383  {
384  int i;
385  lut = av_malloc(256*8);
386  for(i=0; i<256; i++)
387  {
388  int v= i < 128 ? 2*i : 2*(i-256);
389 /*
390 //Simulate 112242211 9-Tap filter
391  uint64_t a= (v/16) & 0xFF;
392  uint64_t b= (v/8) & 0xFF;
393  uint64_t c= (v/4) & 0xFF;
394  uint64_t d= (3*v/8) & 0xFF;
395 */
396 //Simulate piecewise linear interpolation
397  uint64_t a= (v/16) & 0xFF;
398  uint64_t b= (v*3/16) & 0xFF;
399  uint64_t c= (v*5/16) & 0xFF;
400  uint64_t d= (7*v/16) & 0xFF;
401  uint64_t A= (0x100 - a)&0xFF;
402  uint64_t B= (0x100 - b)&0xFF;
403  uint64_t C= (0x100 - c)&0xFF;
404  uint64_t D= (0x100 - c)&0xFF;
405 
406  lut[i] = (a<<56) | (b<<48) | (c<<40) | (d<<32) |
407  (D<<24) | (C<<16) | (B<<8) | (A);
408  //lut[i] = (v<<32) | (v<<24);
409  }
410  }
411 
412  for(y=0; y<BLOCK_SIZE; y++){
413  int a= src[1] - src[2];
414  int b= src[3] - src[4];
415  int c= src[5] - src[6];
416 
417  int d= FFMAX(FFABS(b) - (FFABS(a) + FFABS(c))/2, 0);
418 
419  if(d < QP){
420  int v = d * FFSIGN(-b);
421 
422  src[1] +=v/8;
423  src[2] +=v/4;
424  src[3] +=3*v/8;
425  src[4] -=3*v/8;
426  src[5] -=v/4;
427  src[6] -=v/8;
428  }
429  src+=stride;
430  }
431 }
432 
/**
 * Deblocking filter that chooses, line by line, between a low-pass filter
 * and the default edge filter.
 *
 * Samples of one line are addressed as src[k*step]; the function advances
 * by stride from one line to the next and handles 8 lines. For each line
 * the number of neighbouring sample pairs (k = -1..8) that are "equal"
 * within the dcOffset tolerance is counted:
 *  - if the count exceeds ppMode.flatnessThreshold and the min/max spread
 *    of samples 0..7 is below 2*QP, samples 0..7 are low-pass filtered;
 *  - if the count does not exceed the threshold, the samples 3 and 4 next
 *    to the edge are corrected when the middle energy is below 8*QP.
 *
 * @param src    pointer 4 samples (4*step) before the first filtered sample
 * @param step   distance between consecutive samples of one line
 * @param stride distance between consecutive lines
 * @param c      context supplying QP, nonBQP and the ppMode thresholds
 */
436 static av_always_inline void do_a_deblock_C(uint8_t *src, int step, int stride, PPContext *c){
437  int y;
438  const int QP= c->QP;
439  const int dcOffset= ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1;
440  const int dcThreshold= dcOffset*2 + 1;
441 //START_TIMER
442  src+= step*4; // src points to begin of the 8x8 Block
443  for(y=0; y<8; y++){
444  int numEq= 0;
445 
/* count neighbouring pairs whose biased difference passes the unsigned
 * threshold test; samples -1 and 8 lie outside the 8 filtered samples */
446  if(((unsigned)(src[-1*step] - src[0*step] + dcOffset)) < dcThreshold) numEq++;
447  if(((unsigned)(src[ 0*step] - src[1*step] + dcOffset)) < dcThreshold) numEq++;
448  if(((unsigned)(src[ 1*step] - src[2*step] + dcOffset)) < dcThreshold) numEq++;
449  if(((unsigned)(src[ 2*step] - src[3*step] + dcOffset)) < dcThreshold) numEq++;
450  if(((unsigned)(src[ 3*step] - src[4*step] + dcOffset)) < dcThreshold) numEq++;
451  if(((unsigned)(src[ 4*step] - src[5*step] + dcOffset)) < dcThreshold) numEq++;
452  if(((unsigned)(src[ 5*step] - src[6*step] + dcOffset)) < dcThreshold) numEq++;
453  if(((unsigned)(src[ 6*step] - src[7*step] + dcOffset)) < dcThreshold) numEq++;
454  if(((unsigned)(src[ 7*step] - src[8*step] + dcOffset)) < dcThreshold) numEq++;
455  if(numEq > c->ppMode.flatnessThreshold){
456  int min, max, x;
457 
/* min/max of samples 0..7, comparing them two at a time */
458  if(src[0] > src[step]){
459  max= src[0];
460  min= src[step];
461  }else{
462  max= src[step];
463  min= src[0];
464  }
465  for(x=2; x<8; x+=2){
466  if(src[x*step] > src[(x+1)*step]){
467  if(src[x *step] > max) max= src[ x *step];
468  if(src[(x+1)*step] < min) min= src[(x+1)*step];
469  }else{
470  if(src[(x+1)*step] > max) max= src[(x+1)*step];
471  if(src[ x *step] < min) min= src[ x *step];
472  }
473  }
474  if(max-min < 2*QP){
/* border values: use the outside neighbour only if it is within QP of
 * the adjacent inner sample */
475  const int first= FFABS(src[-1*step] - src[0]) < QP ? src[-1*step] : src[0];
476  const int last= FFABS(src[8*step] - src[7*step]) < QP ? src[8*step] : src[7*step];
477 
/* running sums over a sliding window; all of them are computed from the
 * unmodified samples before any sample is written back */
478  int sums[10];
479  sums[0] = 4*first + src[0*step] + src[1*step] + src[2*step] + 4;
480  sums[1] = sums[0] - first + src[3*step];
481  sums[2] = sums[1] - first + src[4*step];
482  sums[3] = sums[2] - first + src[5*step];
483  sums[4] = sums[3] - first + src[6*step];
484  sums[5] = sums[4] - src[0*step] + src[7*step];
485  sums[6] = sums[5] - src[1*step] + last;
486  sums[7] = sums[6] - src[2*step] + last;
487  sums[8] = sums[7] - src[3*step] + last;
488  sums[9] = sums[8] - src[4*step] + last;
489 
490  src[0*step]= (sums[0] + sums[2] + 2*src[0*step])>>4;
491  src[1*step]= (sums[1] + sums[3] + 2*src[1*step])>>4;
492  src[2*step]= (sums[2] + sums[4] + 2*src[2*step])>>4;
493  src[3*step]= (sums[3] + sums[5] + 2*src[3*step])>>4;
494  src[4*step]= (sums[4] + sums[6] + 2*src[4*step])>>4;
495  src[5*step]= (sums[5] + sums[7] + 2*src[5*step])>>4;
496  src[6*step]= (sums[6] + sums[8] + 2*src[6*step])>>4;
497  src[7*step]= (sums[7] + sums[9] + 2*src[7*step])>>4;
498  }
499  }else{
/* line is not flat: correct only the two samples next to the edge */
500  const int middleEnergy= 5*(src[4*step] - src[3*step]) + 2*(src[2*step] - src[5*step]);
501 
502  if(FFABS(middleEnergy) < 8*QP){
503  const int q=(src[3*step] - src[4*step])/2;
504  const int leftEnergy= 5*(src[2*step] - src[1*step]) + 2*(src[0*step] - src[3*step]);
505  const int rightEnergy= 5*(src[6*step] - src[5*step]) + 2*(src[4*step] - src[7*step]);
506 
507  int d= FFABS(middleEnergy) - FFMIN( FFABS(leftEnergy), FFABS(rightEnergy) );
508  d= FFMAX(d, 0);
509 
510  d= (5*d + 32) >> 6;
511  d*= FFSIGN(-middleEnergy);
512 
/* clamp the correction between 0 and q, whichever sign q has */
513  if(q>0){
514  d= d<0 ? 0 : d;
515  d= d>q ? q : d;
516  }else{
517  d= d>0 ? 0 : d;
518  d= d<q ? q : d;
519  }
520 
521  src[3*step]-= d;
522  src[4*step]+= d;
523  }
524  }
525 
526  src += stride;
527  }
528 /*if(step==16){
529  STOP_TIMER("step16")
530 }else{
531  STOP_TIMER("stepX")
532 }*/
533 }
534 
535 //Note: we have C, MMX, MMX2, 3DNOW version there is no 3DNOW+MMX2 one
536 //Plain C versions
/* Decide which variants of the template are compiled. With
 * CONFIG_RUNTIME_CPUDETECT every variant available for the architecture
 * is built; otherwise only the one matching the HAVE_* build flags. */
537 #if !(HAVE_MMX || HAVE_ALTIVEC) || CONFIG_RUNTIME_CPUDETECT
538 #define COMPILE_C
539 #endif
540 
541 #if HAVE_ALTIVEC
542 #define COMPILE_ALTIVEC
543 #endif //HAVE_ALTIVEC
544 
545 #if ARCH_X86
546 
547 #if (HAVE_MMX && !HAVE_AMD3DNOW && !HAVE_MMX2) || CONFIG_RUNTIME_CPUDETECT
548 #define COMPILE_MMX
549 #endif
550 
551 #if HAVE_MMX2 || CONFIG_RUNTIME_CPUDETECT
552 #define COMPILE_MMX2
553 #endif
554 
555 #if (HAVE_AMD3DNOW && !HAVE_MMX2) || CONFIG_RUNTIME_CPUDETECT
556 #define COMPILE_3DNOW
557 #endif
558 #endif /* ARCH_X86 */
559 
/* Reset every feature flag to 0; each instantiation below switches on
 * exactly the flags of the variant it builds. */
560 #undef HAVE_MMX
561 #define HAVE_MMX 0
562 #undef HAVE_MMX2
563 #define HAVE_MMX2 0
564 #undef HAVE_AMD3DNOW
565 #define HAVE_AMD3DNOW 0
566 #undef HAVE_ALTIVEC
567 #define HAVE_ALTIVEC 0
568 
/* Each section includes postprocess_template.c once; RENAME() appends the
 * variant suffix to the names the template wraps in it. */
569 #ifdef COMPILE_C
570 #define RENAME(a) a ## _C
571 #include "postprocess_template.c"
572 #endif
573 
574 #ifdef COMPILE_ALTIVEC
575 #undef RENAME
576 #undef HAVE_ALTIVEC
577 #define HAVE_ALTIVEC 1
578 #define RENAME(a) a ## _altivec
/* NOTE(review): the listing numbering skips a line here (578 -> 580);
 * a directive may be missing from this copy -- check upstream. */
580 #include "postprocess_template.c"
581 #endif
582 
583 //MMX versions
584 #ifdef COMPILE_MMX
585 #undef RENAME
586 #undef HAVE_MMX
587 #define HAVE_MMX 1
588 #define RENAME(a) a ## _MMX
589 #include "postprocess_template.c"
590 #endif
591 
592 //MMX2 versions
593 #ifdef COMPILE_MMX2
594 #undef RENAME
595 #undef HAVE_MMX
596 #undef HAVE_MMX2
597 #define HAVE_MMX 1
598 #define HAVE_MMX2 1
599 #define RENAME(a) a ## _MMX2
600 #include "postprocess_template.c"
601 #endif
602 
603 //3DNOW versions
604 #ifdef COMPILE_3DNOW
605 #undef RENAME
606 #undef HAVE_MMX
607 #undef HAVE_MMX2
608 #undef HAVE_AMD3DNOW
609 #define HAVE_MMX 1
610 #define HAVE_MMX2 0
611 #define HAVE_AMD3DNOW 1
612 #define RENAME(a) a ## _3DNow
613 #include "postprocess_template.c"
614 #endif
615 
616 // minor note: the HAVE_xyz is messed up after that line so do not use it.
617 
618 static inline void postProcess(const uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
619  const QP_STORE_T QPs[], int QPStride, int isColor, pp_mode *vm, pp_context *vc)
620 {
621  PPContext *c= (PPContext *)vc;
622  PPMode *ppMode= (PPMode *)vm;
623  c->ppMode= *ppMode; //FIXME
624 
625  // Using ifs here as they are faster than function pointers although the
626  // difference would not be measurable here but it is much better because
627  // someone might exchange the CPU whithout restarting MPlayer ;)
628 #if CONFIG_RUNTIME_CPUDETECT
629 #if ARCH_X86
630  // ordered per speed fastest first
631  if(c->cpuCaps & PP_CPU_CAPS_MMX2)
632  postProcess_MMX2(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
633  else if(c->cpuCaps & PP_CPU_CAPS_3DNOW)
634  postProcess_3DNow(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
635  else if(c->cpuCaps & PP_CPU_CAPS_MMX)
636  postProcess_MMX(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
637  else
638  postProcess_C(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
639 #else
640 #if HAVE_ALTIVEC
642  postProcess_altivec(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
643  else
644 #endif
645  postProcess_C(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
646 #endif
647 #else /* CONFIG_RUNTIME_CPUDETECT */
648 #if HAVE_MMX2
649  postProcess_MMX2(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
650 #elif HAVE_AMD3DNOW
651  postProcess_3DNow(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
652 #elif HAVE_MMX
653  postProcess_MMX(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
654 #elif HAVE_ALTIVEC
655  postProcess_altivec(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
656 #else
657  postProcess_C(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
658 #endif
659 #endif /* !CONFIG_RUNTIME_CPUDETECT */
660 }
661 
662 //static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
663 // QP_STORE_T QPs[], int QPStride, int isColor, struct PPMode *ppMode);
664 
665 /* -pp Command line Help
666 */
/**
 * Help text describing the mode string syntax.
 *
 * The filter and option names listed here are spelled exactly as the
 * parser compares them (strcmp, case-sensitive), so that every name shown
 * to the user is accepted when typed as printed.
 */
const char pp_help[] =
"Available postprocessing filters:\n"
"Filters Options\n"
"short long name short long option Description\n"
"* * a autoq CPU power dependent enabler\n"
" c chrom chrominance filtering enabled\n"
" y nochrom chrominance filtering disabled\n"
" n noluma luma filtering disabled\n"
"hb hdeblock (2 threshold) horizontal deblocking filter\n"
" 1. difference factor: default=32, higher -> more deblocking\n"
" 2. flatness threshold: default=39, lower -> more deblocking\n"
" the h & v deblocking filters share these\n"
" so you can't set different thresholds for h / v\n"
"vb vdeblock (2 threshold) vertical deblocking filter\n"
"ha ahdeblock (2 threshold) horizontal deblocking filter\n"
"va avdeblock (2 threshold) vertical deblocking filter\n"
"h1 x1hdeblock experimental h deblock filter 1\n"
"v1 x1vdeblock experimental v deblock filter 1\n"
"dr dering deringing filter\n"
"al autolevels automatic brightness / contrast\n"
" f fullyrange stretch luminance to (0..255)\n"
"lb linblenddeint linear blend deinterlacer\n"
"li linipoldeint linear interpolating deinterlace\n"
"ci cubicipoldeint cubic interpolating deinterlacer\n"
"md mediandeint median deinterlacer\n"
"fd ffmpegdeint ffmpeg deinterlacer\n"
"l5 lowpass5 FIR lowpass deinterlacer\n"
"de default hb:a,vb:a,dr:a\n"
"fa fast h1:a,v1:a,dr:a\n"
"ac ha:a:128:7,va:a,dr:a\n"
"tn tmpnoise (3 threshold) temporal noise reducer\n"
" 1. <= 2. <= 3. larger -> stronger filtering\n"
"fq forcequant <quantizer> force quantizer\n"
"Usage:\n"
"<filterName>[:<option>[:<option>...]][[,|/][-]<filterName>[:<option>...]]...\n"
"long form example:\n"
"vdeblock:autoq/hdeblock:autoq/linblenddeint default,-vdeblock\n"
"short form example:\n"
"vb:a/hb:a/lb de,-vb\n"
"more examples:\n"
"tn:64:128:256\n"
"\n"
;
710 
711 pp_mode *pp_get_mode_by_name_and_quality(const char *name, int quality)
712 {
713  char temp[GET_MODE_BUFFER_SIZE];
714  char *p= temp;
715  static const char filterDelimiters[] = ",/";
716  static const char optionDelimiters[] = ":";
717  struct PPMode *ppMode;
718  char *filterToken;
719 
720  ppMode= av_malloc(sizeof(PPMode));
721 
722  ppMode->lumMode= 0;
723  ppMode->chromMode= 0;
724  ppMode->maxTmpNoise[0]= 700;
725  ppMode->maxTmpNoise[1]= 1500;
726  ppMode->maxTmpNoise[2]= 3000;
727  ppMode->maxAllowedY= 234;
728  ppMode->minAllowedY= 16;
729  ppMode->baseDcDiff= 256/8;
730  ppMode->flatnessThreshold= 56-16-1;
731  ppMode->maxClippedThreshold= 0.01;
732  ppMode->error=0;
733 
734  memset(temp, 0, GET_MODE_BUFFER_SIZE);
735  av_strlcpy(temp, name, GET_MODE_BUFFER_SIZE - 1);
736 
737  av_log(NULL, AV_LOG_DEBUG, "pp: %s\n", name);
738 
739  for(;;){
740  char *filterName;
741  int q= 1000000; //PP_QUALITY_MAX;
742  int chrom=-1;
743  int luma=-1;
744  char *option;
746  int i;
747  int filterNameOk=0;
748  int numOfUnknownOptions=0;
749  int enable=1; //does the user want us to enabled or disabled the filter
750 
751  filterToken= strtok(p, filterDelimiters);
752  if(filterToken == NULL) break;
753  p+= strlen(filterToken) + 1; // p points to next filterToken
754  filterName= strtok(filterToken, optionDelimiters);
755  av_log(NULL, AV_LOG_DEBUG, "pp: %s::%s\n", filterToken, filterName);
756 
757  if(*filterName == '-'){
758  enable=0;
759  filterName++;
760  }
761 
762  for(;;){ //for all options
763  option= strtok(NULL, optionDelimiters);
764  if(option == NULL) break;
765 
766  av_log(NULL, AV_LOG_DEBUG, "pp: option: %s\n", option);
767  if(!strcmp("autoq", option) || !strcmp("a", option)) q= quality;
768  else if(!strcmp("nochrom", option) || !strcmp("y", option)) chrom=0;
769  else if(!strcmp("chrom", option) || !strcmp("c", option)) chrom=1;
770  else if(!strcmp("noluma", option) || !strcmp("n", option)) luma=0;
771  else{
772  options[numOfUnknownOptions] = option;
773  numOfUnknownOptions++;
774  }
775  if(numOfUnknownOptions >= OPTIONS_ARRAY_SIZE-1) break;
776  }
777  options[numOfUnknownOptions] = NULL;
778 
779  /* replace stuff from the replace Table */
780  for(i=0; replaceTable[2*i]!=NULL; i++){
781  if(!strcmp(replaceTable[2*i], filterName)){
782  int newlen= strlen(replaceTable[2*i + 1]);
783  int plen;
784  int spaceLeft;
785 
786  if(p==NULL) p= temp, *p=0; //last filter
787  else p--, *p=','; //not last filter
788 
789  plen= strlen(p);
790  spaceLeft= p - temp + plen;
791  if(spaceLeft + newlen >= GET_MODE_BUFFER_SIZE - 1){
792  ppMode->error++;
793  break;
794  }
795  memmove(p + newlen, p, plen+1);
796  memcpy(p, replaceTable[2*i + 1], newlen);
797  filterNameOk=1;
798  }
799  }
800 
801  for(i=0; filters[i].shortName!=NULL; i++){
802  if( !strcmp(filters[i].longName, filterName)
803  || !strcmp(filters[i].shortName, filterName)){
804  ppMode->lumMode &= ~filters[i].mask;
805  ppMode->chromMode &= ~filters[i].mask;
806 
807  filterNameOk=1;
808  if(!enable) break; // user wants to disable it
809 
810  if(q >= filters[i].minLumQuality && luma)
811  ppMode->lumMode|= filters[i].mask;
812  if(chrom==1 || (chrom==-1 && filters[i].chromDefault))
813  if(q >= filters[i].minChromQuality)
814  ppMode->chromMode|= filters[i].mask;
815 
816  if(filters[i].mask == LEVEL_FIX){
817  int o;
818  ppMode->minAllowedY= 16;
819  ppMode->maxAllowedY= 234;
820  for(o=0; options[o]!=NULL; o++){
821  if( !strcmp(options[o],"fullyrange")
822  ||!strcmp(options[o],"f")){
823  ppMode->minAllowedY= 0;
824  ppMode->maxAllowedY= 255;
825  numOfUnknownOptions--;
826  }
827  }
828  }
829  else if(filters[i].mask == TEMP_NOISE_FILTER)
830  {
831  int o;
832  int numOfNoises=0;
833 
834  for(o=0; options[o]!=NULL; o++){
835  char *tail;
836  ppMode->maxTmpNoise[numOfNoises]=
837  strtol(options[o], &tail, 0);
838  if(tail!=options[o]){
839  numOfNoises++;
840  numOfUnknownOptions--;
841  if(numOfNoises >= 3) break;
842  }
843  }
844  }
845  else if(filters[i].mask == V_DEBLOCK || filters[i].mask == H_DEBLOCK
846  || filters[i].mask == V_A_DEBLOCK || filters[i].mask == H_A_DEBLOCK){
847  int o;
848 
849  for(o=0; options[o]!=NULL && o<2; o++){
850  char *tail;
851  int val= strtol(options[o], &tail, 0);
852  if(tail==options[o]) break;
853 
854  numOfUnknownOptions--;
855  if(o==0) ppMode->baseDcDiff= val;
856  else ppMode->flatnessThreshold= val;
857  }
858  }
859  else if(filters[i].mask == FORCE_QUANT){
860  int o;
861  ppMode->forcedQuant= 15;
862 
863  for(o=0; options[o]!=NULL && o<1; o++){
864  char *tail;
865  int val= strtol(options[o], &tail, 0);
866  if(tail==options[o]) break;
867 
868  numOfUnknownOptions--;
869  ppMode->forcedQuant= val;
870  }
871  }
872  }
873  }
874  if(!filterNameOk) ppMode->error++;
875  ppMode->error += numOfUnknownOptions;
876  }
877 
878  av_log(NULL, AV_LOG_DEBUG, "pp: lumMode=%X, chromMode=%X\n", ppMode->lumMode, ppMode->chromMode);
879  if(ppMode->error){
880  av_log(NULL, AV_LOG_ERROR, "%d errors in postprocess string \"%s\"\n", ppMode->error, name);
881  av_free(ppMode);
882  return NULL;
883  }
884  return ppMode;
885 }
886 
887 void pp_free_mode(pp_mode *mode){
888  av_free(mode);
889 }
890 
/**
 * Replace the buffer at *p with a new zero-filled one.
 *
 * The old buffer is freed and its contents are not carried over.
 *
 * @param p         location of the buffer pointer; receives the result of
 *                  av_mallocz()
 * @param alignment not used by this implementation
 * @param size      size of the new buffer in bytes
 */
static void reallocAlign(void **p, int alignment, int size)
{
    void *fresh;

    av_free(*p);
    fresh = av_mallocz(size);
    *p    = fresh;
}
895 
896 static void reallocBuffers(PPContext *c, int width, int height, int stride, int qpStride){
897  int mbWidth = (width+15)>>4;
898  int mbHeight= (height+15)>>4;
899  int i;
900 
901  c->stride= stride;
902  c->qpStride= qpStride;
903 
904  reallocAlign((void **)&c->tempDst, 8, stride*24);
905  reallocAlign((void **)&c->tempSrc, 8, stride*24);
906  reallocAlign((void **)&c->tempBlocks, 8, 2*16*8);
907  reallocAlign((void **)&c->yHistogram, 8, 256*sizeof(uint64_t));
908  for(i=0; i<256; i++)
909  c->yHistogram[i]= width*height/64*15/256;
910 
911  for(i=0; i<3; i++){
912  //Note: The +17*1024 is just there so I do not have to worry about r/w over the end.
913  reallocAlign((void **)&c->tempBlurred[i], 8, stride*mbHeight*16 + 17*1024);
914  reallocAlign((void **)&c->tempBlurredPast[i], 8, 256*((height+7)&(~7))/2 + 17*1024);//FIXME size
915  }
916 
917  reallocAlign((void **)&c->deintTemp, 8, 2*width+32);
918  reallocAlign((void **)&c->nonBQPTable, 8, qpStride*mbHeight*sizeof(QP_STORE_T));
919  reallocAlign((void **)&c->stdQPTable, 8, qpStride*mbHeight*sizeof(QP_STORE_T));
920  reallocAlign((void **)&c->forcedQPTable, 8, mbWidth*sizeof(QP_STORE_T));
921 }
922 
/**
 * Name callback for the logging class.
 *
 * @param ptr ignored
 * @return the constant string "postproc"
 */
static const char *context_to_name(void *ptr)
{
    static const char label[] = "postproc";

    return label;
}
926 
/* Logging class for this library: class name "Postproc", item names from
 * context_to_name(), third initialiser NULL.
 * NOTE(review): presumably meant to be attached to PPContext so av_log()
 * can be called with a context -- confirm where it is assigned. */
927 static const AVClass av_codec_context_class = { "Postproc", context_to_name, NULL };
928 
929 pp_context *pp_get_context(int width, int height, int cpuCaps){
930  PPContext *c= av_malloc(sizeof(PPContext));
931  int stride= FFALIGN(width, 16); //assumed / will realloc if needed
932  int qpStride= (width+15)/16 + 2; //assumed / will realloc if needed
933 
934  memset(c, 0, sizeof(PPContext));
936  c->cpuCaps= cpuCaps;
937  if(cpuCaps&PP_FORMAT){
938  c->hChromaSubSample= cpuCaps&0x3;
939  c->vChromaSubSample= (cpuCaps>>4)&0x3;
940  }else{
941  c->hChromaSubSample= 1;
942  c->vChromaSubSample= 1;
943  }
944 
945  reallocBuffers(c, width, height, stride, qpStride);
946 
947  c->frameNum=-1;
948 
949  return c;
950 }
951 
952 void pp_free_context(void *vc){
953  PPContext *c = (PPContext*)vc;
954  int i;
955 
956  for(i=0; i<3; i++) av_free(c->tempBlurred[i]);
957  for(i=0; i<3; i++) av_free(c->tempBlurredPast[i]);
958 
959  av_free(c->tempBlocks);
960  av_free(c->yHistogram);
961  av_free(c->tempDst);
962  av_free(c->tempSrc);
963  av_free(c->deintTemp);
964  av_free(c->stdQPTable);
965  av_free(c->nonBQPTable);
967 
968  memset(c, 0, sizeof(PPContext));
969 
970  av_free(c);
971 }
972 
/**
 * Postprocess one picture consisting of three planes.
 *
 * Steps, in order:
 *  1. grow the context buffers if the picture or QP strides exceed the
 *     sizes the context was allocated for;
 *  2. if no QP table was given or FORCE_QUANT is set in the luma mode, use
 *     the context's forcedQPTable (stride 0) filled with the forced
 *     quantiser, or with 1;
 *  3. if pict_type has PP_PICT_TYPE_QP2 set, halve all QP values into
 *     stdQPTable and use that table instead;
 *  4. unless (pict_type&7)==3, store the QP values masked with 0x3F in
 *     nonBQPTable;
 *  5. filter plane 0, then either filter planes 1 and 2 (chromMode != 0)
 *     or copy them unchanged, using the dimensions shifted by the
 *     context's chroma subsampling.
 *
 * @param src, srcStride  source planes and their line sizes
 * @param dst, dstStride  destination planes and their line sizes
 * @param width, height   dimensions of plane 0
 * @param QP_store        QP table, may be NULL
 * @param QPStride        line size of the QP table, may be negative
 * @param vm              mode; must point to a PPMode
 * @param vc              context; must point to a PPContext
 * @param pict_type       picture type, optionally ORed with PP_PICT_TYPE_QP2
 */
973 void pp_postprocess(const uint8_t * src[3], const int srcStride[3],
974  uint8_t * dst[3], const int dstStride[3],
975  int width, int height,
976  const QP_STORE_T *QP_store, int QPStride,
977  pp_mode *vm, void *vc, int pict_type)
978 {
979  int mbWidth = (width+15)>>4;
980  int mbHeight= (height+15)>>4;
981  PPMode *mode = (PPMode*)vm;
982  PPContext *c = (PPContext*)vc;
983  int minStride= FFMAX(FFABS(srcStride[0]), FFABS(dstStride[0]));
984  int absQPStride = FFABS(QPStride);
985 
986  // c->stride and c->QPStride are always positive
987  if(c->stride < minStride || c->qpStride < absQPStride)
988  reallocBuffers(c, width, height,
989  FFMAX(minStride, c->stride),
990  FFMAX(c->qpStride, absQPStride));
991 
992  if(QP_store==NULL || (mode->lumMode & FORCE_QUANT)){
993  int i;
994  QP_store= c->forcedQPTable;
/* stride 0: the single row of forcedQPTable serves every macroblock row */
995  absQPStride = QPStride = 0;
996  if(mode->lumMode & FORCE_QUANT)
997  for(i=0; i<mbWidth; i++) c->forcedQPTable[i]= mode->forcedQuant;
998  else
999  for(i=0; i<mbWidth; i++) c->forcedQPTable[i]= 1;
1000  }
1001 
1002  if(pict_type & PP_PICT_TYPE_QP2){
1003  int i;
1004  const int count= mbHeight * absQPStride;
/* halve four QP bytes per 32-bit word; the mask clears the bit shifted in
 * from the neighbouring byte. The remaining count%4 entries follow. */
1005  for(i=0; i<(count>>2); i++){
1006  ((uint32_t*)c->stdQPTable)[i] = (((const uint32_t*)QP_store)[i]>>1) & 0x7F7F7F7F;
1007  }
1008  for(i<<=2; i<count; i++){
1009  c->stdQPTable[i] = QP_store[i]>>1;
1010  }
1011  QP_store= c->stdQPTable;
1012  QPStride= absQPStride;
1013  }
1014 
/* disabled debug dump of the QP table */
1015  if(0){
1016  int x,y;
1017  for(y=0; y<mbHeight; y++){
1018  for(x=0; x<mbWidth; x++){
1019  av_log(c, AV_LOG_INFO, "%2d ", QP_store[x + y*QPStride]);
1020  }
1021  av_log(c, AV_LOG_INFO, "\n");
1022  }
1023  av_log(c, AV_LOG_INFO, "\n");
1024  }
1025 
/* NOTE(review): type 3 is presumably a B picture, going by the name
 * nonBQPTable -- confirm */
1026  if((pict_type&7)!=3){
1027  if (QPStride >= 0){
1028  int i;
1029  const int count= mbHeight * QPStride;
/* four QP bytes per 32-bit word, then the remaining count%4 entries */
1030  for(i=0; i<(count>>2); i++){
1031  ((uint32_t*)c->nonBQPTable)[i] = ((const uint32_t*)QP_store)[i] & 0x3F3F3F3F;
1032  }
1033  for(i<<=2; i<count; i++){
1034  c->nonBQPTable[i] = QP_store[i] & 0x3F;
1035  }
1036  } else {
/* negative stride: copy entry by entry into a table with positive stride */
1037  int i,j;
1038  for(i=0; i<mbHeight; i++) {
1039  for(j=0; j<absQPStride; j++) {
1040  c->nonBQPTable[i*absQPStride+j] = QP_store[i*QPStride+j] & 0x3F;
1041  }
1042  }
1043  }
1044  }
1045 
1046  av_log(c, AV_LOG_DEBUG, "using npp filters 0x%X/0x%X\n",
1047  mode->lumMode, mode->chromMode);
1048 
1049  postProcess(src[0], srcStride[0], dst[0], dstStride[0],
1050  width, height, QP_store, QPStride, 0, mode, c);
1051 
/* from here on width/height are those of planes 1 and 2 */
1052  width = (width )>>c->hChromaSubSample;
1053  height = (height)>>c->vChromaSubSample;
1054 
1055  if(mode->chromMode){
1056  postProcess(src[1], srcStride[1], dst[1], dstStride[1],
1057  width, height, QP_store, QPStride, 1, mode, c);
1058  postProcess(src[2], srcStride[2], dst[2], dstStride[2],
1059  width, height, QP_store, QPStride, 2, mode, c);
1060  }
/* planes 1 and 2 are not filtered: copy them, in one go when source and
 * destination share the same line size, otherwise line by line */
1061  else if(srcStride[1] == dstStride[1] && srcStride[2] == dstStride[2]){
1062  linecpy(dst[1], src[1], height, srcStride[1]);
1063  linecpy(dst[2], src[2], height, srcStride[2]);
1064  }else{
1065  int y;
1066  for(y=0; y<height; y++){
1067  memcpy(&(dst[1][y*dstStride[1]]), &(src[1][y*srcStride[1]]), width);
1068  memcpy(&(dst[2][y*dstStride[2]]), &(src[2][y*srcStride[2]]), width);
1069  }
1070  }
1071 }