yuv2rgb_altivec.c
Go to the documentation of this file.
1 /*
2  * AltiVec acceleration for colorspace conversion
3  *
4  * copyright (C) 2004 Marc Hoffman <marc.hoffman@analog.com>
5  *
6  * This file is part of Libav.
7  *
8  * Libav is free software; you can redistribute it and/or
9  * modify it under the terms of the GNU Lesser General Public
10  * License as published by the Free Software Foundation; either
11  * version 2.1 of the License, or (at your option) any later version.
12  *
13  * Libav is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16  * Lesser General Public License for more details.
17  *
18  * You should have received a copy of the GNU Lesser General Public
19  * License along with Libav; if not, write to the Free Software
20  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21  */
22 
23 /*
24 Convert I420 YV12 to RGB in various formats,
25  it rejects images that are not in 420 formats,
26  it rejects images that don't have widths of multiples of 16,
27  it rejects images that don't have heights of multiples of 2.
28 Reject defers to C simulation code.
29 
30 Lots of optimizations to be done here.
31 
32 1. Need to fix saturation code. I just couldn't get it to fly with packs
33  and adds, so we currently use max/min to clip.
34 
35 2. The inefficient use of chroma loading needs a bit of brushing up.
36 
37 3. Analysis of pipeline stalls needs to be done. Use shark to identify
38  pipeline stalls.
39 
40 
41 MODIFIED to calculate coeffs from currently selected color space.
42 MODIFIED core to be a macro where you specify the output format.
43 ADDED UYVY conversion which is never called due to some thing in swscale.
44 CORRECTED algorithm selection to be strict on input formats.
45 ADDED runtime detection of AltiVec.
46 
47 ADDED altivec_yuv2packedX vertical scl + RGB converter
48 
49 March 27,2004
50 PERFORMANCE ANALYSIS
51 
52 The C version uses 25% of the processor or ~250Mips for D1 video rawvideo
53 used as test.
54 The AltiVec version uses 10% of the processor or ~100Mips for D1 video
55 same sequence.
56 
57 720 * 480 * 30 ~10MPS
58 
59 so we have roughly 10 clocks per pixel. This is too high, something has
60 to be wrong.
61 
62 OPTIMIZED clip codes to utilize vec_max and vec_packs removing the
63 need for vec_min.
64 
65 OPTIMIZED DST OUTPUT cache/DMA controls. We are pretty much guaranteed to have
66 the input video frame, it was just decompressed so it probably resides in L1
67 caches. However, we are creating the output video stream. This needs to use the
68 DSTST instruction to optimize for the cache. We couple this with the fact that
69 we are not going to be visiting the input buffer again so we mark it Least
70 Recently Used. This shaves 25% of the processor cycles off.
71 
72 Now memcpy is the largest mips consumer in the system, probably due
73 to the inefficient X11 stuff.
74 
75 GL libraries seem to be very slow on this machine 1.33Ghz PB running
76 Jaguar, this is not the case for my 1Ghz PB. I thought it might be
77 a versioning issue, however I have libGL.1.2.dylib for both
78 machines. (We need to figure this out now.)
79 
80 GL2 libraries work now with patch for RGB32.
81 
82 NOTE: quartz vo driver ARGB32_to_RGB24 consumes 30% of the processor.
83 
84 Integrated luma prescaling adjustment for saturation/contrast/brightness
85 adjustment.
86 */
87 
88 #include <stdio.h>
89 #include <stdlib.h>
90 #include <string.h>
91 #include <inttypes.h>
92 #include <assert.h>
93 #include "config.h"
94 #include "libswscale/rgb2rgb.h"
95 #include "libswscale/swscale.h"
97 #include "libavutil/cpu.h"
98 #include "yuv2rgb_altivec.h"
99 
100 #undef PROFILE_THE_BEAST
101 #undef INC_SCALING
102 
103 typedef unsigned char ubyte;
104 typedef signed char sbyte;
105 
106 
107 /* RGB interleaver, 16 planar pels 8-bit samples per channel in
108  homogeneous vector registers x0,x1,x2 are interleaved with the
109  following technique:
110 
111  o0 = vec_mergeh (x0,x1);
112  o1 = vec_perm (o0, x2, perm_rgb_0);
113  o2 = vec_perm (o0, x2, perm_rgb_1);
114  o3 = vec_mergel (x0,x1);
115  o4 = vec_perm (o3,o2,perm_rgb_2);
116  o5 = vec_perm (o3,o2,perm_rgb_3);
117 
118  perm_rgb_0: o0(RG).h v1(B) --> o1*
119  0 1 2 3 4
120  rgbr|gbrg|brgb|rgbr
121  0010 0100 1001 0010
122  0102 3145 2673 894A
123 
124  perm_rgb_1: o0(RG).h v1(B) --> o2
125  0 1 2 3 4
126  gbrg|brgb|bbbb|bbbb
127  0100 1001 1111 1111
128  B5CD 6EF7 89AB CDEF
129 
130  perm_rgb_2: o3(RG).l o2(rgbB.l) --> o4*
131  0 1 2 3 4
132  gbrg|brgb|rgbr|gbrg
133  1111 1111 0010 0100
134  89AB CDEF 0182 3945
135 
136  perm_rgb_3: o3(RG).l o2(rgbB.l) ---> o5*
137  0 1 2 3 4
138  brgb|rgbr|gbrg|brgb
139  1001 0010 0100 1001
140  a67b 89cA BdCD eEFf
141 
142 */
143 static
144 const vector unsigned char
145  perm_rgb_0 = {0x00,0x01,0x10,0x02,0x03,0x11,0x04,0x05,
146  0x12,0x06,0x07,0x13,0x08,0x09,0x14,0x0a},
147  perm_rgb_1 = {0x0b,0x15,0x0c,0x0d,0x16,0x0e,0x0f,0x17,
148  0x18,0x19,0x1a,0x1b,0x1c,0x1d,0x1e,0x1f},
149  perm_rgb_2 = {0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,
150  0x00,0x01,0x18,0x02,0x03,0x19,0x04,0x05},
151  perm_rgb_3 = {0x1a,0x06,0x07,0x1b,0x08,0x09,0x1c,0x0a,
152  0x0b,0x1d,0x0c,0x0d,0x1e,0x0e,0x0f,0x1f};
153 
/* Interleave three 16-sample channel vectors into 48 packed bytes.
   Note the parameter order: the FIRST argument is x2 and the THIRD is x0.
   The packed byte order is x0[0] x1[0] x2[0] x0[1] x1[1] x2[1] ...;
   the three result vectors are assigned to the lvalues y0, y1, y2. */
#define vec_merge3(x2,x1,x0,y0,y1,y2) \
do { \
    __typeof__(x0) m3_pair_hi, m3_rest, m3_pair_lo; \
    /* x0/x1 pairs for samples 0..7 */ \
    m3_pair_hi = vec_mergeh (x0, x1); \
    y0         = vec_perm (m3_pair_hi, x2, perm_rgb_0); \
    /* left-over pair bytes followed by x2[8..15] */ \
    m3_rest    = vec_perm (m3_pair_hi, x2, perm_rgb_1); \
    /* x0/x1 pairs for samples 8..15 */ \
    m3_pair_lo = vec_mergel (x0, x1); \
    y1         = vec_perm (m3_pair_lo, m3_rest, perm_rgb_2); \
    y2         = vec_perm (m3_pair_lo, m3_rest, perm_rgb_3); \
} while (0)
164 
/* Pack 16 pixels from the channel vectors x0,x1,x2 into 3-byte pixels whose
   memory order is x2,x1,x0, store the 48 bytes through ptr with vec_st and
   advance ptr by three elements.  ptr must be a modifiable lvalue. */
#define vec_mstbgr24(x0,x1,x2,ptr) \
do { \
    __typeof__(x0) s24_a, s24_b, s24_c; \
    vec_merge3 (x0, x1, x2, s24_a, s24_b, s24_c); \
    vec_st (s24_a,  0, ptr); \
    vec_st (s24_b, 16, ptr); \
    vec_st (s24_c, 32, ptr); \
    ptr += 3; \
} while (0)
173 
/* Pack 16 pixels from the channel vectors x0,x1,x2 into 3-byte pixels whose
   memory order is x0,x1,x2, store the 48 bytes through ptr with vec_st and
   advance ptr by three elements.  ptr must be a modifiable lvalue. */
#define vec_mstrgb24(x0,x1,x2,ptr) \
do { \
    __typeof__(x0) s24_a, s24_b, s24_c; \
    vec_merge3 (x2, x1, x0, s24_a, s24_b, s24_c); \
    vec_st (s24_a,  0, ptr); \
    vec_st (s24_b, 16, ptr); \
    vec_st (s24_c, 32, ptr); \
    ptr += 3; \
} while (0)
182 
183 /* pack the pixels in rgb0 format
184  msb R
185  lsb 0
186 */
/* Interleave four 16-sample channel vectors of type T into sixteen 4-byte
   pixels with memory order x0,x1,x2,x3, store them as four consecutive
   vectors at ptr (byte offsets 0, 16, 32, 48) and advance ptr by 4 elements.
   ptr must be a modifiable lvalue. */
#define vec_mstrgb32(T,x0,x1,x2,x3,ptr) \
do { \
    T m32_01, m32_23, m32_px; \
    /* pixels 0..7: byte pairs first, then merge the pairs as 16-bit units */ \
    m32_01 = vec_mergeh (x0, x1); \
    m32_23 = vec_mergeh (x2, x3); \
    m32_px = (T)vec_mergeh ((vector unsigned short)m32_01, \
                            (vector unsigned short)m32_23); \
    vec_st (m32_px, 0*16, (T *)ptr); \
    m32_px = (T)vec_mergel ((vector unsigned short)m32_01, \
                            (vector unsigned short)m32_23); \
    vec_st (m32_px, 1*16, (T *)ptr); \
    /* pixels 8..15 */ \
    m32_01 = vec_mergel (x0, x1); \
    m32_23 = vec_mergel (x2, x3); \
    m32_px = (T)vec_mergeh ((vector unsigned short)m32_01, \
                            (vector unsigned short)m32_23); \
    vec_st (m32_px, 2*16, (T *)ptr); \
    m32_px = (T)vec_mergel ((vector unsigned short)m32_01, \
                            (vector unsigned short)m32_23); \
    vec_st (m32_px, 3*16, (T *)ptr); \
    ptr += 4; \
} while (0)
204 
205 /*
206 
207  | 1 0 1.4021 | | Y |
208  | 1 -0.3441 -0.7142 |x| Cb|
209  | 1 1.7718 0 | | Cr|
210 
211 
212  Y: [-128 127]
213  Cb/Cr : [-128 127]
214 
215  typical yuv conversion work on Y: 0-255 this version has been optimized for jpeg decode.
216 
217 */
218 
219 
220 
221 
/* Zero-extend bytes 0..7 of x to eight 16-bit lanes: every result lane is a
   zero byte followed by the corresponding source byte. */
#define vec_unh(x) \
    ((vector signed short) vec_mergeh ((__typeof__(x)){0}, x))
/* Zero-extend bytes 8..15 of x to eight 16-bit lanes: every result lane is a
   zero byte followed by the corresponding source byte. */
#define vec_unl(x) \
    ((vector signed short) vec_mergel ((__typeof__(x)){0}, x))
232 
/* Clamp every signed 16-bit lane of x to the range [16, 235]:
   the upper bound is applied first, then the lower bound. */
#define vec_clip_s16(x) \
    vec_max (vec_min (x, \
                      ((vector signed short){235,235,235,235, \
                                             235,235,235,235})), \
             ((vector signed short){ 16, 16, 16, 16, \
                                     16, 16, 16, 16}))
236 
/* Pack two vectors of signed 16-bit lanes (x, then y) into one vector of 16
   unsigned bytes.  Negative lanes are raised to 0 with vec_max before the
   lanes are handed to vec_packs as unsigned shorts. */
#define vec_packclp(x,y) \
    (vector unsigned char) \
        vec_packs ((vector unsigned short) \
                       vec_max (x, ((vector signed short) {0})), \
                   (vector unsigned short) \
                       vec_max (y, ((vector signed short) {0})))
241 
242 //#define out_pixels(a,b,c,ptr) vec_mstrgb32(__typeof__(a),((__typeof__ (a)){255}),a,a,a,ptr)
243 
244 
245 static inline void cvtyuvtoRGB (SwsContext *c,
246  vector signed short Y, vector signed short U, vector signed short V,
247  vector signed short *R, vector signed short *G, vector signed short *B)
248 {
249  vector signed short vx,ux,uvx;
250 
251  Y = vec_mradds (Y, c->CY, c->OY);
252  U = vec_sub (U,(vector signed short)
253  vec_splat((vector signed short){128},0));
254  V = vec_sub (V,(vector signed short)
255  vec_splat((vector signed short){128},0));
256 
257  // ux = (CBU*(u<<c->CSHIFT)+0x4000)>>15;
258  ux = vec_sl (U, c->CSHIFT);
259  *B = vec_mradds (ux, c->CBU, Y);
260 
261  // vx = (CRV*(v<<c->CSHIFT)+0x4000)>>15;
262  vx = vec_sl (V, c->CSHIFT);
263  *R = vec_mradds (vx, c->CRV, Y);
264 
265  // uvx = ((CGU*u) + (CGV*v))>>15;
266  uvx = vec_mradds (U, c->CGU, Y);
267  *G = vec_mradds (V, c->CGV, uvx);
268 }
269 
270 
271 /*
272  ------------------------------------------------------------------------------
273  CS converters
274  ------------------------------------------------------------------------------
275 */
276 
277 
/*
 * DEFCSP420_CVT(name, out_pixels) defines
 *
 *   static int altivec_<name>(SwsContext *c, const unsigned char **in,
 *                             int *instrides, int srcSliceY, int srcSliceH,
 *                             unsigned char **oplanes, int *outstrides)
 *
 * which converts a slice of planar 4:2:0 input (in[0] = Y, in[1] = U,
 * in[2] = V) to packed output in oplanes[0], starting at output row
 * srcSliceY.  Two luma rows sharing one chroma row are handled per outer
 * iteration, 16 pixels per inner iteration; out_pixels(R,G,B,ptr) stores one
 * 16-pixel group and advances ptr.  Only whole groups of 16 pixels and whole
 * row pairs are converted.  Returns srcSliceH.
 *
 * All row pointers are derived from the strides at the start of every row
 * pair.  Stepping the output pointers by one stride after the inner loop had
 * already moved them by one row width only lands on the right row when the
 * output stride equals the packed row width, so padded destinations were
 * written at the wrong addresses.
 */
#define DEFCSP420_CVT(name,out_pixels) \
static int altivec_##name (SwsContext *c, \
                           const unsigned char **in, int *instrides, \
                           int srcSliceY, int srcSliceH, \
                           unsigned char **oplanes, int *outstrides) \
{ \
    int w = c->srcW; \
    int h = srcSliceH; \
    int i,j; \
    vector unsigned char y0,y1; \
 \
    vector signed char u,v; \
 \
    vector signed short Y0,Y1,Y2,Y3; \
    vector signed short U,V; \
    vector signed short vx,ux,uvx; \
    vector signed short vx0,ux0,uvx0; \
    vector signed short vx1,ux1,uvx1; \
    vector signed short R0,G0,B0; \
    vector signed short R1,G1,B1; \
    vector unsigned char R,G,B; \
 \
    vector unsigned char *y1ivP, *y2ivP, *uivP, *vivP; \
    vector unsigned char align_perm; \
 \
    /* local copies of the conversion coefficients */ \
    vector signed short \
        lCY  = c->CY, \
        lOY  = c->OY, \
        lCRV = c->CRV, \
        lCBU = c->CBU, \
        lCGU = c->CGU, \
        lCGV = c->CGV; \
 \
    vector unsigned short lCSHIFT = c->CSHIFT; \
 \
    const ubyte *y1i, *y2i, *ui, *vi; \
    vector unsigned char *oute, *outo; \
 \
    for (i=0;i<h/2;i++) { \
        /* even/odd luma rows and the chroma row they share */ \
        y1i = in[0] + 2*i*instrides[0]; \
        y2i = y1i + instrides[0]; \
        ui  = in[1] + i*instrides[1]; \
        vi  = in[2] + i*instrides[2]; \
 \
        /* even/odd output rows */ \
        oute = (vector unsigned char *) \
            (oplanes[0] + (srcSliceY + 2*i) * outstrides[0]); \
        outo = (vector unsigned char *) \
            (oplanes[0] + (srcSliceY + 2*i + 1) * outstrides[0]); \
 \
        /* data-stream touch hints for the two output rows */ \
        vec_dstst (outo, (0x02000002|(((w*3+32)/32)<<16)), 0); \
        vec_dstst (oute, (0x02000002|(((w*3+32)/32)<<16)), 1); \
 \
        for (j=0;j<w/16;j++) { \
 \
            y1ivP = (vector unsigned char *)y1i; \
            y2ivP = (vector unsigned char *)y2i; \
            uivP  = (vector unsigned char *)ui; \
            vivP  = (vector unsigned char *)vi; \
 \
            /* unaligned loads: two aligned vectors merged by vec_lvsl */ \
            align_perm = vec_lvsl (0, y1i); \
            y0 = (vector unsigned char) \
                vec_perm (y1ivP[0], y1ivP[1], align_perm); \
 \
            align_perm = vec_lvsl (0, y2i); \
            y1 = (vector unsigned char) \
                vec_perm (y2ivP[0], y2ivP[1], align_perm); \
 \
            align_perm = vec_lvsl (0, ui); \
            u = (vector signed char) \
                vec_perm (uivP[0], uivP[1], align_perm); \
 \
            align_perm = vec_lvsl (0, vi); \
            v = (vector signed char) \
                vec_perm (vivP[0], vivP[1], align_perm); \
 \
            /* centre chroma around zero; {128} sets lane 0 only, hence */ \
            /* the explicit splat                                       */ \
            u = (vector signed char) \
                vec_sub (u,(vector signed char) \
                         vec_splat((vector signed char){128},0)); \
            v = (vector signed char) \
                vec_sub (v,(vector signed char) \
                         vec_splat((vector signed char){128},0)); \
 \
            /* only the first 8 chroma samples are used per 16 pixels */ \
            U = vec_unpackh (u); \
            V = vec_unpackh (v); \
 \
            Y0 = vec_unh (y0); \
            Y1 = vec_unl (y0); \
            Y2 = vec_unh (y1); \
            Y3 = vec_unl (y1); \
 \
            Y0 = vec_mradds (Y0, lCY, lOY); \
            Y1 = vec_mradds (Y1, lCY, lOY); \
            Y2 = vec_mradds (Y2, lCY, lOY); \
            Y3 = vec_mradds (Y3, lCY, lOY); \
 \
            /* ux = (CBU*(u<<CSHIFT)+0x4000)>>15 */ \
            ux = vec_sl (U, lCSHIFT); \
            ux = vec_mradds (ux, lCBU, (vector signed short){0}); \
            ux0 = vec_mergeh (ux,ux); \
            ux1 = vec_mergel (ux,ux); \
 \
            /* vx = (CRV*(v<<CSHIFT)+0x4000)>>15; */ \
            vx = vec_sl (V, lCSHIFT); \
            vx = vec_mradds (vx, lCRV, (vector signed short){0}); \
            vx0 = vec_mergeh (vx,vx); \
            vx1 = vec_mergel (vx,vx); \
 \
            /* uvx = ((CGU*u) + (CGV*v))>>15 */ \
            uvx = vec_mradds (U, lCGU, (vector signed short){0}); \
            uvx = vec_mradds (V, lCGV, uvx); \
            uvx0 = vec_mergeh (uvx,uvx); \
            uvx1 = vec_mergel (uvx,uvx); \
 \
            /* even row */ \
            R0 = vec_add (Y0,vx0); \
            G0 = vec_add (Y0,uvx0); \
            B0 = vec_add (Y0,ux0); \
            R1 = vec_add (Y1,vx1); \
            G1 = vec_add (Y1,uvx1); \
            B1 = vec_add (Y1,ux1); \
 \
            R = vec_packclp (R0,R1); \
            G = vec_packclp (G0,G1); \
            B = vec_packclp (B0,B1); \
 \
            out_pixels(R,G,B,oute); \
 \
            /* odd row, same chroma terms */ \
            R0 = vec_add (Y2,vx0); \
            G0 = vec_add (Y2,uvx0); \
            B0 = vec_add (Y2,ux0); \
            R1 = vec_add (Y3,vx1); \
            G1 = vec_add (Y3,uvx1); \
            B1 = vec_add (Y3,ux1); \
            R = vec_packclp (R0,R1); \
            G = vec_packclp (G0,G1); \
            B = vec_packclp (B0,B1); \
 \
            out_pixels(R,G,B,outo); \
 \
            y1i += 16; \
            y2i += 16; \
            ui  += 8; \
            vi  += 8; \
        } \
    } \
    return srcSliceH; \
}
441 
442 
443 #define out_abgr(a,b,c,ptr) vec_mstrgb32(__typeof__(a),((__typeof__ (a)){255}),c,b,a,ptr)
444 #define out_bgra(a,b,c,ptr) vec_mstrgb32(__typeof__(a),c,b,a,((__typeof__ (a)){255}),ptr)
445 #define out_rgba(a,b,c,ptr) vec_mstrgb32(__typeof__(a),a,b,c,((__typeof__ (a)){255}),ptr)
446 #define out_argb(a,b,c,ptr) vec_mstrgb32(__typeof__(a),((__typeof__ (a)){255}),a,b,c,ptr)
447 #define out_rgb24(a,b,c,ptr) vec_mstrgb24(a,b,c,ptr)
448 #define out_bgr24(a,b,c,ptr) vec_mstbgr24(a,b,c,ptr)
449 
450 DEFCSP420_CVT (yuv2_abgr, out_abgr)
451 DEFCSP420_CVT (yuv2_bgra, out_bgra)
452 DEFCSP420_CVT (yuv2_rgba, out_rgba)
453 DEFCSP420_CVT (yuv2_argb, out_argb)
454 DEFCSP420_CVT (yuv2_rgb24, out_rgb24)
455 DEFCSP420_CVT (yuv2_bgr24, out_bgr24)
456 
457 
458 // uyvy|uyvy|uyvy|uyvy
459 // 0123 4567 89ab cdef
460 static
461 const vector unsigned char
462  demux_u = {0x10,0x00,0x10,0x00,
463  0x10,0x04,0x10,0x04,
464  0x10,0x08,0x10,0x08,
465  0x10,0x0c,0x10,0x0c},
466  demux_v = {0x10,0x02,0x10,0x02,
467  0x10,0x06,0x10,0x06,
468  0x10,0x0A,0x10,0x0A,
469  0x10,0x0E,0x10,0x0E},
470  demux_y = {0x10,0x01,0x10,0x03,
471  0x10,0x05,0x10,0x07,
472  0x10,0x09,0x10,0x0B,
473  0x10,0x0D,0x10,0x0F};
474 
475 /*
476  this is so I can play live CCIR raw video
477 */
479  const unsigned char **in, int *instrides,
480  int srcSliceY, int srcSliceH,
481  unsigned char **oplanes, int *outstrides)
482 {
483  int w = c->srcW;
484  int h = srcSliceH;
485  int i,j;
486  vector unsigned char uyvy;
487  vector signed short Y,U,V;
488  vector signed short R0,G0,B0,R1,G1,B1;
489  vector unsigned char R,G,B;
490  vector unsigned char *out;
491  const ubyte *img;
492 
493  img = in[0];
494  out = (vector unsigned char *)(oplanes[0]+srcSliceY*outstrides[0]);
495 
496  for (i=0;i<h;i++) {
497  for (j=0;j<w/16;j++) {
498  uyvy = vec_ld (0, img);
499  U = (vector signed short)
500  vec_perm (uyvy, (vector unsigned char){0}, demux_u);
501 
502  V = (vector signed short)
503  vec_perm (uyvy, (vector unsigned char){0}, demux_v);
504 
505  Y = (vector signed short)
506  vec_perm (uyvy, (vector unsigned char){0}, demux_y);
507 
508  cvtyuvtoRGB (c, Y,U,V,&R0,&G0,&B0);
509 
510  uyvy = vec_ld (16, img);
511  U = (vector signed short)
512  vec_perm (uyvy, (vector unsigned char){0}, demux_u);
513 
514  V = (vector signed short)
515  vec_perm (uyvy, (vector unsigned char){0}, demux_v);
516 
517  Y = (vector signed short)
518  vec_perm (uyvy, (vector unsigned char){0}, demux_y);
519 
520  cvtyuvtoRGB (c, Y,U,V,&R1,&G1,&B1);
521 
522  R = vec_packclp (R0,R1);
523  G = vec_packclp (G0,G1);
524  B = vec_packclp (B0,B1);
525 
526  // vec_mstbgr24 (R,G,B, out);
527  out_rgba (R,G,B,out);
528 
529  img += 32;
530  }
531  }
532  return srcSliceH;
533 }
534 
535 
536 
537 /* Ok currently the acceleration routine only supports
538  inputs of widths a multiple of 16
539  and heights a multiple of 2
540 
541  So we just fall back to the C codes for this.
542 */
544 {
546  return NULL;
547 
548  /*
549  and this seems not to matter too much I tried a bunch of
550  videos with abnormal widths and MPlayer crashes elsewhere.
551  mplayer -vo x11 -rawvideo on:w=350:h=240 raw-350x240.eyuv
552  boom with X11 bad match.
553 
554  */
555  if ((c->srcW & 0xf) != 0) return NULL;
556 
557  switch (c->srcFormat) {
558  case PIX_FMT_YUV410P:
559  case PIX_FMT_YUV420P:
560  /*case IMGFMT_CLPL: ??? */
561  case PIX_FMT_GRAY8:
562  case PIX_FMT_NV12:
563  case PIX_FMT_NV21:
564  if ((c->srcH & 0x1) != 0)
565  return NULL;
566 
567  switch(c->dstFormat) {
568  case PIX_FMT_RGB24:
569  av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space RGB24\n");
570  return altivec_yuv2_rgb24;
571  case PIX_FMT_BGR24:
572  av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space BGR24\n");
573  return altivec_yuv2_bgr24;
574  case PIX_FMT_ARGB:
575  av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space ARGB\n");
576  return altivec_yuv2_argb;
577  case PIX_FMT_ABGR:
578  av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space ABGR\n");
579  return altivec_yuv2_abgr;
580  case PIX_FMT_RGBA:
581  av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space RGBA\n");
582  return altivec_yuv2_rgba;
583  case PIX_FMT_BGRA:
584  av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space BGRA\n");
585  return altivec_yuv2_bgra;
586  default: return NULL;
587  }
588  break;
589 
590  case PIX_FMT_UYVY422:
591  switch(c->dstFormat) {
592  case PIX_FMT_BGR32:
593  av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space UYVY -> RGB32\n");
594  return altivec_uyvy_rgb32;
595  default: return NULL;
596  }
597  break;
598 
599  }
600  return NULL;
601 }
602 
603 void ff_yuv2rgb_init_tables_altivec(SwsContext *c, const int inv_table[4], int brightness, int contrast, int saturation)
604 {
605  union {
606  DECLARE_ALIGNED(16, signed short, tmp)[8];
607  vector signed short vec;
608  } buf;
609 
610  buf.tmp[0] = ((0xffffLL) * contrast>>8)>>9; //cy
611  buf.tmp[1] = -256*brightness; //oy
612  buf.tmp[2] = (inv_table[0]>>3) *(contrast>>16)*(saturation>>16); //crv
613  buf.tmp[3] = (inv_table[1]>>3) *(contrast>>16)*(saturation>>16); //cbu
614  buf.tmp[4] = -((inv_table[2]>>1)*(contrast>>16)*(saturation>>16)); //cgu
615  buf.tmp[5] = -((inv_table[3]>>1)*(contrast>>16)*(saturation>>16)); //cgv
616 
617 
618  c->CSHIFT = (vector unsigned short)vec_splat_u16(2);
619  c->CY = vec_splat ((vector signed short)buf.vec, 0);
620  c->OY = vec_splat ((vector signed short)buf.vec, 1);
621  c->CRV = vec_splat ((vector signed short)buf.vec, 2);
622  c->CBU = vec_splat ((vector signed short)buf.vec, 3);
623  c->CGU = vec_splat ((vector signed short)buf.vec, 4);
624  c->CGV = vec_splat ((vector signed short)buf.vec, 5);
625  return;
626 }
627 
628 
629 static av_always_inline void
630 ff_yuv2packedX_altivec(SwsContext *c, const int16_t *lumFilter,
631  const int16_t **lumSrc, int lumFilterSize,
632  const int16_t *chrFilter, const int16_t **chrUSrc,
633  const int16_t **chrVSrc, int chrFilterSize,
634  const int16_t **alpSrc, uint8_t *dest,
635  int dstW, int dstY, enum PixelFormat target)
636 {
637  int i,j;
638  vector signed short X,X0,X1,Y0,U0,V0,Y1,U1,V1,U,V;
639  vector signed short R0,G0,B0,R1,G1,B1;
640 
641  vector unsigned char R,G,B;
642  vector unsigned char *out,*nout;
643 
644  vector signed short RND = vec_splat_s16(1<<3);
645  vector unsigned short SCL = vec_splat_u16(4);
646  DECLARE_ALIGNED(16, unsigned int, scratch)[16];
647 
648  vector signed short *YCoeffs, *CCoeffs;
649 
650  YCoeffs = c->vYCoeffsBank+dstY*lumFilterSize;
651  CCoeffs = c->vCCoeffsBank+dstY*chrFilterSize;
652 
653  out = (vector unsigned char *)dest;
654 
655  for (i=0; i<dstW; i+=16) {
656  Y0 = RND;
657  Y1 = RND;
658  /* extract 16 coeffs from lumSrc */
659  for (j=0; j<lumFilterSize; j++) {
660  X0 = vec_ld (0, &lumSrc[j][i]);
661  X1 = vec_ld (16, &lumSrc[j][i]);
662  Y0 = vec_mradds (X0, YCoeffs[j], Y0);
663  Y1 = vec_mradds (X1, YCoeffs[j], Y1);
664  }
665 
666  U = RND;
667  V = RND;
668  /* extract 8 coeffs from U,V */
669  for (j=0; j<chrFilterSize; j++) {
670  X = vec_ld (0, &chrUSrc[j][i/2]);
671  U = vec_mradds (X, CCoeffs[j], U);
672  X = vec_ld (0, &chrVSrc[j][i/2]);
673  V = vec_mradds (X, CCoeffs[j], V);
674  }
675 
676  /* scale and clip signals */
677  Y0 = vec_sra (Y0, SCL);
678  Y1 = vec_sra (Y1, SCL);
679  U = vec_sra (U, SCL);
680  V = vec_sra (V, SCL);
681 
682  Y0 = vec_clip_s16 (Y0);
683  Y1 = vec_clip_s16 (Y1);
684  U = vec_clip_s16 (U);
685  V = vec_clip_s16 (V);
686 
687  /* now we have
688  Y0= y0 y1 y2 y3 y4 y5 y6 y7 Y1= y8 y9 y10 y11 y12 y13 y14 y15
689  U= u0 u1 u2 u3 u4 u5 u6 u7 V= v0 v1 v2 v3 v4 v5 v6 v7
690 
691  Y0= y0 y1 y2 y3 y4 y5 y6 y7 Y1= y8 y9 y10 y11 y12 y13 y14 y15
692  U0= u0 u0 u1 u1 u2 u2 u3 u3 U1= u4 u4 u5 u5 u6 u6 u7 u7
693  V0= v0 v0 v1 v1 v2 v2 v3 v3 V1= v4 v4 v5 v5 v6 v6 v7 v7
694  */
695 
696  U0 = vec_mergeh (U,U);
697  V0 = vec_mergeh (V,V);
698 
699  U1 = vec_mergel (U,U);
700  V1 = vec_mergel (V,V);
701 
702  cvtyuvtoRGB (c, Y0,U0,V0,&R0,&G0,&B0);
703  cvtyuvtoRGB (c, Y1,U1,V1,&R1,&G1,&B1);
704 
705  R = vec_packclp (R0,R1);
706  G = vec_packclp (G0,G1);
707  B = vec_packclp (B0,B1);
708 
709  switch(target) {
710  case PIX_FMT_ABGR: out_abgr (R,G,B,out); break;
711  case PIX_FMT_BGRA: out_bgra (R,G,B,out); break;
712  case PIX_FMT_RGBA: out_rgba (R,G,B,out); break;
713  case PIX_FMT_ARGB: out_argb (R,G,B,out); break;
714  case PIX_FMT_RGB24: out_rgb24 (R,G,B,out); break;
715  case PIX_FMT_BGR24: out_bgr24 (R,G,B,out); break;
716  default:
717  {
718  /* If this is reached, the caller should have called yuv2packedXinC
719  instead. */
720  static int printed_error_message;
721  if (!printed_error_message) {
722  av_log(c, AV_LOG_ERROR, "altivec_yuv2packedX doesn't support %s output\n",
724  printed_error_message=1;
725  }
726  return;
727  }
728  }
729  }
730 
731  if (i < dstW) {
732  i -= 16;
733 
734  Y0 = RND;
735  Y1 = RND;
736  /* extract 16 coeffs from lumSrc */
737  for (j=0; j<lumFilterSize; j++) {
738  X0 = vec_ld (0, &lumSrc[j][i]);
739  X1 = vec_ld (16, &lumSrc[j][i]);
740  Y0 = vec_mradds (X0, YCoeffs[j], Y0);
741  Y1 = vec_mradds (X1, YCoeffs[j], Y1);
742  }
743 
744  U = RND;
745  V = RND;
746  /* extract 8 coeffs from U,V */
747  for (j=0; j<chrFilterSize; j++) {
748  X = vec_ld (0, &chrUSrc[j][i/2]);
749  U = vec_mradds (X, CCoeffs[j], U);
750  X = vec_ld (0, &chrVSrc[j][i/2]);
751  V = vec_mradds (X, CCoeffs[j], V);
752  }
753 
754  /* scale and clip signals */
755  Y0 = vec_sra (Y0, SCL);
756  Y1 = vec_sra (Y1, SCL);
757  U = vec_sra (U, SCL);
758  V = vec_sra (V, SCL);
759 
760  Y0 = vec_clip_s16 (Y0);
761  Y1 = vec_clip_s16 (Y1);
762  U = vec_clip_s16 (U);
763  V = vec_clip_s16 (V);
764 
765  /* now we have
766  Y0= y0 y1 y2 y3 y4 y5 y6 y7 Y1= y8 y9 y10 y11 y12 y13 y14 y15
767  U = u0 u1 u2 u3 u4 u5 u6 u7 V = v0 v1 v2 v3 v4 v5 v6 v7
768 
769  Y0= y0 y1 y2 y3 y4 y5 y6 y7 Y1= y8 y9 y10 y11 y12 y13 y14 y15
770  U0= u0 u0 u1 u1 u2 u2 u3 u3 U1= u4 u4 u5 u5 u6 u6 u7 u7
771  V0= v0 v0 v1 v1 v2 v2 v3 v3 V1= v4 v4 v5 v5 v6 v6 v7 v7
772  */
773 
774  U0 = vec_mergeh (U,U);
775  V0 = vec_mergeh (V,V);
776 
777  U1 = vec_mergel (U,U);
778  V1 = vec_mergel (V,V);
779 
780  cvtyuvtoRGB (c, Y0,U0,V0,&R0,&G0,&B0);
781  cvtyuvtoRGB (c, Y1,U1,V1,&R1,&G1,&B1);
782 
783  R = vec_packclp (R0,R1);
784  G = vec_packclp (G0,G1);
785  B = vec_packclp (B0,B1);
786 
787  nout = (vector unsigned char *)scratch;
788  switch(target) {
789  case PIX_FMT_ABGR: out_abgr (R,G,B,nout); break;
790  case PIX_FMT_BGRA: out_bgra (R,G,B,nout); break;
791  case PIX_FMT_RGBA: out_rgba (R,G,B,nout); break;
792  case PIX_FMT_ARGB: out_argb (R,G,B,nout); break;
793  case PIX_FMT_RGB24: out_rgb24 (R,G,B,nout); break;
794  case PIX_FMT_BGR24: out_bgr24 (R,G,B,nout); break;
795  default:
796  /* Unreachable, I think. */
797  av_log(c, AV_LOG_ERROR, "altivec_yuv2packedX doesn't support %s output\n",
799  return;
800  }
801 
802  memcpy (&((uint32_t*)dest)[i], scratch, (dstW-i)/4);
803  }
804 
805 }
806 
/* YUV2PACKEDX_WRAPPER(suffix, pixfmt) defines the externally visible
   function ff_yuv2<suffix>_X_altivec(), which forwards all of its arguments
   to ff_yuv2packedX_altivec() with pixfmt as the output layout. */
#define YUV2PACKEDX_WRAPPER(suffix, pixfmt) \
void ff_yuv2 ## suffix ## _X_altivec(SwsContext *c, \
                                     const int16_t *lumFilter, \
                                     const int16_t **lumSrc, \
                                     int lumFilterSize, \
                                     const int16_t *chrFilter, \
                                     const int16_t **chrUSrc, \
                                     const int16_t **chrVSrc, \
                                     int chrFilterSize, \
                                     const int16_t **alpSrc, \
                                     uint8_t *dest, \
                                     int dstW, int dstY) \
{ \
    ff_yuv2packedX_altivec(c, \
                           lumFilter, lumSrc, lumFilterSize, \
                           chrFilter, chrUSrc, chrVSrc, chrFilterSize, \
                           alpSrc, dest, dstW, dstY, \
                           pixfmt); \
}
819