h264_template_altivec.c
/*
 * Copyright (c) 2004 Romain Dolbeau <romain@dolbeau.org>
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#ifdef DEBUG
/* abort in debug builds if ptr is not 16-byte aligned */
#define ASSERT_ALIGNED(ptr) assert(!((unsigned long)ptr&0x0000000F));
#else
#define ASSERT_ALIGNED(ptr) ;
#endif

/* this code assumes that stride % 16 == 0 */

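/*
 * Per-row core of the 8x8 chroma interpolation.  Each output pixel is the
 * H.264 bilinear blend of four neighbours:
 *     dst = (A*p00 + B*p01 + C*p10 + D*p11 + bias) >> 6
 * with A = (8-x)(8-y), B = x(8-y), C = (8-x)y, D = xy.  BIAS1 seeds the
 * multiply-add chain and BIAS2 post-processes the sum, so the caller can
 * pick normal rounding (v32ss, noop) or the VC-1 no-rounding variant
 * (0, add28).  OP_U8_ALTIVEC is supplied by the file that includes this
 * template (typically a plain store for "put" or an average against the
 * existing destination for "avg").
 */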
#define CHROMA_MC8_ALTIVEC_CORE(BIAS1, BIAS2) \
    vsrc2ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc2uc);\
    vsrc3ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc3uc);\
\
    psum = vec_mladd(vA, vsrc0ssH, BIAS1);\
    psum = vec_mladd(vB, vsrc1ssH, psum);\
    psum = vec_mladd(vC, vsrc2ssH, psum);\
    psum = vec_mladd(vD, vsrc3ssH, psum);\
    psum = BIAS2(psum);\
    psum = vec_sr(psum, v6us);\
\
    vdst = vec_ld(0, dst);\
    ppsum = (vec_u8)vec_pack(psum, psum);\
    vfdst = vec_perm(vdst, ppsum, fperm);\
\
    OP_U8_ALTIVEC(fsum, vfdst, vdst);\
\
    vec_st(fsum, 0, dst);\
\
    vsrc0ssH = vsrc2ssH;\
    vsrc1ssH = vsrc3ssH;\
\
    dst += stride;\
    src += stride;

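/*
 * Simplified per-row core for the 1-D cases (x == 0 or y == 0): only two
 * taps remain, vA and vE = vB + vC, so the blend reduces to
 *     dst = (A*p0 + E*p1 + 32) >> 6.
 */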
#define CHROMA_MC8_ALTIVEC_CORE_SIMPLE \
\
    vsrc0ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc0uc);\
    vsrc1ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc1uc);\
\
    psum = vec_mladd(vA, vsrc0ssH, v32ss);\
    psum = vec_mladd(vE, vsrc1ssH, psum);\
    psum = vec_sr(psum, v6us);\
\
    vdst = vec_ld(0, dst);\
    ppsum = (vec_u8)vec_pack(psum, psum);\
    vfdst = vec_perm(vdst, ppsum, fperm);\
\
    OP_U8_ALTIVEC(fsum, vfdst, vdst);\
\
    vec_st(fsum, 0, dst);\
\
    dst += stride;\
    src += stride;

#define noop(a) a
#define add28(a) vec_add(v28ss, a)

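/*
 * This file is a template: the PREFIX_* function names and OP_U8_ALTIVEC
 * are defined by the file that includes it.  The sketch below shows one
 * plausible instantiation; the exact macro names used by the real
 * including file may differ, so treat it only as an illustration of the
 * mechanism, not as the actual caller.
 *
 *     #define PUT_OP_U8_ALTIVEC(d, s, dst)  d = s
 *     #define AVG_OP_U8_ALTIVEC(d, s, dst)  d = vec_avg(s, dst)
 *
 *     #define OP_U8_ALTIVEC                  PUT_OP_U8_ALTIVEC
 *     #define PREFIX_h264_chroma_mc8_altivec put_h264_chroma_mc8_altivec
 *     #include "h264_template_altivec.c"
 *     #undef OP_U8_ALTIVEC
 *     #undef PREFIX_h264_chroma_mc8_altivec
 *
 *     #define OP_U8_ALTIVEC                  AVG_OP_U8_ALTIVEC
 *     #define PREFIX_h264_chroma_mc8_altivec avg_h264_chroma_mc8_altivec
 *     #include "h264_template_altivec.c"
 *     #undef OP_U8_ALTIVEC
 *     #undef PREFIX_h264_chroma_mc8_altivec
 */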
#ifdef PREFIX_h264_chroma_mc8_altivec
static void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src,
                                           int stride, int h, int x, int y) {
    DECLARE_ALIGNED(16, signed int, ABCD)[4] =
                        {((8 - x) * (8 - y)),
                         ((    x) * (8 - y)),
                         ((8 - x) * (    y)),
                         ((    x) * (    y))};
    register int i;
    vec_u8 fperm;
    const vec_s32 vABCD = vec_ld(0, ABCD);
    const vec_s16 vA = vec_splat((vec_s16)vABCD, 1);
    const vec_s16 vB = vec_splat((vec_s16)vABCD, 3);
    const vec_s16 vC = vec_splat((vec_s16)vABCD, 5);
    const vec_s16 vD = vec_splat((vec_s16)vABCD, 7);
    LOAD_ZERO;
    const vec_s16 v32ss = vec_sl(vec_splat_s16(1),vec_splat_u16(5));
    const vec_u16 v6us = vec_splat_u16(6);
    register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1;
    register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0;

    vec_u8 vsrcAuc, av_uninit(vsrcBuc), vsrcperm0, vsrcperm1;
    vec_u8 vsrc0uc, vsrc1uc;
    vec_s16 vsrc0ssH, vsrc1ssH;
    vec_u8 vsrcCuc, vsrc2uc, vsrc3uc;
    vec_s16 vsrc2ssH, vsrc3ssH, psum;
    vec_u8 vdst, ppsum, vfdst, fsum;

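    /*
     * Only 8 bytes per row are produced, but AltiVec stores write aligned
     * 16-byte vectors.  fperm merges the 8 new bytes into whichever half of
     * the loaded destination vector dst falls in, so the neighbouring 8
     * bytes are written back unchanged.
     */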
    if (((unsigned long)dst) % 16 == 0) {
        fperm = (vec_u8){0x10, 0x11, 0x12, 0x13,
                         0x14, 0x15, 0x16, 0x17,
                         0x08, 0x09, 0x0A, 0x0B,
                         0x0C, 0x0D, 0x0E, 0x0F};
    } else {
        fperm = (vec_u8){0x00, 0x01, 0x02, 0x03,
                         0x04, 0x05, 0x06, 0x07,
                         0x18, 0x19, 0x1A, 0x1B,
                         0x1C, 0x1D, 0x1E, 0x1F};
    }

    vsrcAuc = vec_ld(0, src);

    if (loadSecond)
        vsrcBuc = vec_ld(16, src);
    vsrcperm0 = vec_lvsl(0, src);
    vsrcperm1 = vec_lvsl(1, src);

    vsrc0uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm0);
    if (reallyBadAlign)
        vsrc1uc = vsrcBuc;
    else
        vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1);

    vsrc0ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc0uc);
    vsrc1ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc1uc);

    if (ABCD[3]) {
        if (!loadSecond) {// -> !reallyBadAlign
            for (i = 0 ; i < h ; i++) {
                vsrcCuc = vec_ld(stride + 0, src);
                vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
                vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);

                CHROMA_MC8_ALTIVEC_CORE(v32ss, noop)
            }
        } else {
            vec_u8 vsrcDuc;
            for (i = 0 ; i < h ; i++) {
                vsrcCuc = vec_ld(stride + 0, src);
                vsrcDuc = vec_ld(stride + 16, src);
                vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
                if (reallyBadAlign)
                    vsrc3uc = vsrcDuc;
                else
                    vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);

                CHROMA_MC8_ALTIVEC_CORE(v32ss, noop)
            }
        }
    } else {
        const vec_s16 vE = vec_add(vB, vC);
        if (ABCD[2]) { // x == 0 B == 0
            if (!loadSecond) {// -> !reallyBadAlign
                for (i = 0 ; i < h ; i++) {
                    vsrcCuc = vec_ld(stride + 0, src);
                    vsrc1uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
                    CHROMA_MC8_ALTIVEC_CORE_SIMPLE

                    vsrc0uc = vsrc1uc;
                }
            } else {
                vec_u8 vsrcDuc;
                for (i = 0 ; i < h ; i++) {
                    vsrcCuc = vec_ld(stride + 0, src);
                    vsrcDuc = vec_ld(stride + 15, src);
                    vsrc1uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
                    CHROMA_MC8_ALTIVEC_CORE_SIMPLE

                    vsrc0uc = vsrc1uc;
                }
            }
        } else { // y == 0 C == 0
            if (!loadSecond) {// -> !reallyBadAlign
                for (i = 0 ; i < h ; i++) {
                    vsrcCuc = vec_ld(0, src);
                    vsrc0uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
                    vsrc1uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);

                    CHROMA_MC8_ALTIVEC_CORE_SIMPLE
                }
            } else {
                vec_u8 vsrcDuc;
                for (i = 0 ; i < h ; i++) {
                    vsrcCuc = vec_ld(0, src);
                    vsrcDuc = vec_ld(15, src);
                    vsrc0uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
                    if (reallyBadAlign)
                        vsrc1uc = vsrcDuc;
                    else
                        vsrc1uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);

                    CHROMA_MC8_ALTIVEC_CORE_SIMPLE
                }
            }
        }
    }
}
#endif

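/*
 * VC-1/WMV3 variant of the 8x8 chroma interpolation: same filter, but with
 * the "no rounding" bias, i.e. (sum + 28) >> 6 instead of (sum + 32) >> 6,
 * obtained by seeding the core with 0 and adding v28ss afterwards via the
 * add28 wrapper.
 */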
/* this code assumes that stride % 16 == 0 */
#ifdef PREFIX_no_rnd_vc1_chroma_mc8_altivec
static void PREFIX_no_rnd_vc1_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, int stride, int h, int x, int y) {
    DECLARE_ALIGNED(16, signed int, ABCD)[4] =
                        {((8 - x) * (8 - y)),
                         ((    x) * (8 - y)),
                         ((8 - x) * (    y)),
                         ((    x) * (    y))};
    register int i;
    vec_u8 fperm;
    const vec_s32 vABCD = vec_ld(0, ABCD);
    const vec_s16 vA = vec_splat((vec_s16)vABCD, 1);
    const vec_s16 vB = vec_splat((vec_s16)vABCD, 3);
    const vec_s16 vC = vec_splat((vec_s16)vABCD, 5);
    const vec_s16 vD = vec_splat((vec_s16)vABCD, 7);
    LOAD_ZERO;
    const vec_s16 v28ss = vec_sub(vec_sl(vec_splat_s16(1),vec_splat_u16(5)),vec_splat_s16(4));
    const vec_u16 v6us = vec_splat_u16(6);
    register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1;
    register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0;

    vec_u8 vsrcAuc, av_uninit(vsrcBuc), vsrcperm0, vsrcperm1;
    vec_u8 vsrc0uc, vsrc1uc;
    vec_s16 vsrc0ssH, vsrc1ssH;
    vec_u8 vsrcCuc, vsrc2uc, vsrc3uc;
    vec_s16 vsrc2ssH, vsrc3ssH, psum;
    vec_u8 vdst, ppsum, vfdst, fsum;

    if (((unsigned long)dst) % 16 == 0) {
        fperm = (vec_u8){0x10, 0x11, 0x12, 0x13,
                         0x14, 0x15, 0x16, 0x17,
                         0x08, 0x09, 0x0A, 0x0B,
                         0x0C, 0x0D, 0x0E, 0x0F};
    } else {
        fperm = (vec_u8){0x00, 0x01, 0x02, 0x03,
                         0x04, 0x05, 0x06, 0x07,
                         0x18, 0x19, 0x1A, 0x1B,
                         0x1C, 0x1D, 0x1E, 0x1F};
    }

    vsrcAuc = vec_ld(0, src);

    if (loadSecond)
        vsrcBuc = vec_ld(16, src);
    vsrcperm0 = vec_lvsl(0, src);
    vsrcperm1 = vec_lvsl(1, src);

    vsrc0uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm0);
    if (reallyBadAlign)
        vsrc1uc = vsrcBuc;
    else
        vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1);

    vsrc0ssH = (vec_s16)vec_mergeh(zero_u8v, (vec_u8)vsrc0uc);
    vsrc1ssH = (vec_s16)vec_mergeh(zero_u8v, (vec_u8)vsrc1uc);

    if (!loadSecond) {// -> !reallyBadAlign
        for (i = 0 ; i < h ; i++) {
            vsrcCuc = vec_ld(stride + 0, src);

            vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
            vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);

            CHROMA_MC8_ALTIVEC_CORE(vec_splat_s16(0), add28)
        }
    } else {
        vec_u8 vsrcDuc;
        for (i = 0 ; i < h ; i++) {
            vsrcCuc = vec_ld(stride + 0, src);
            vsrcDuc = vec_ld(stride + 16, src);

            vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
            if (reallyBadAlign)
                vsrc3uc = vsrcDuc;
            else
                vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);

            CHROMA_MC8_ALTIVEC_CORE(vec_splat_s16(0), add28)
        }
    }
}
#endif

#undef noop
#undef add28
#undef CHROMA_MC8_ALTIVEC_CORE

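/*
 * 16x16 luma half-pel filters.  H.264 uses the 6-tap kernel
 * [1 -5 20 20 -5 1]; in the plain horizontal and vertical cases the result
 * is rounded with (sum + 16) >> 5, which is what the v20ss/v5ss/v16ss
 * constants and the final vec_sra by v5us implement below.
 */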
/* this code assumes stride % 16 == 0 */
#ifdef PREFIX_h264_qpel16_h_lowpass_altivec
static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) {
    register int i;

    LOAD_ZERO;
    const vec_u8 permM2 = vec_lvsl(-2, src);
    const vec_u8 permM1 = vec_lvsl(-1, src);
    const vec_u8 permP0 = vec_lvsl(+0, src);
    const vec_u8 permP1 = vec_lvsl(+1, src);
    const vec_u8 permP2 = vec_lvsl(+2, src);
    const vec_u8 permP3 = vec_lvsl(+3, src);
    const vec_s16 v5ss = vec_splat_s16(5);
    const vec_u16 v5us = vec_splat_u16(5);
    const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
    const vec_s16 v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));

    vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;

    register int align = ((((unsigned long)src) - 2) % 16);

    vec_s16 srcP0A, srcP0B, srcP1A, srcP1B,
            srcP2A, srcP2B, srcP3A, srcP3B,
            srcM1A, srcM1B, srcM2A, srcM2B,
            sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
            pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
            psumA, psumB, sumA, sumB;

    vec_u8 sum, vdst, fsum;

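    /*
     * Unaligned source handling: two (or three) aligned vec_ld loads cover
     * src-2 .. src+18, and vec_perm with the precomputed lvsl patterns
     * extracts the six shifted views srcM2..srcP3.  The cases for align
     * 11..15 are those where some of the six windows either line up exactly
     * with srcR2 (no permute needed) or spill into a third aligned vector
     * srcR3.
     */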
    for (i = 0 ; i < 16 ; i ++) {
        vec_u8 srcR1 = vec_ld(-2, src);
        vec_u8 srcR2 = vec_ld(14, src);

        switch (align) {
        default: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = vec_perm(srcR1, srcR2, permP3);
        } break;
        case 11: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = srcR2;
        } break;
        case 12: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = srcR2;
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 13: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = srcR2;
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 14: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = srcR2;
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 15: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = srcR2;
            srcP0 = vec_perm(srcR2, srcR3, permP0);
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        }

        srcP0A = (vec_s16) vec_mergeh(zero_u8v, srcP0);
        srcP0B = (vec_s16) vec_mergel(zero_u8v, srcP0);
        srcP1A = (vec_s16) vec_mergeh(zero_u8v, srcP1);
        srcP1B = (vec_s16) vec_mergel(zero_u8v, srcP1);

        srcP2A = (vec_s16) vec_mergeh(zero_u8v, srcP2);
        srcP2B = (vec_s16) vec_mergel(zero_u8v, srcP2);
        srcP3A = (vec_s16) vec_mergeh(zero_u8v, srcP3);
        srcP3B = (vec_s16) vec_mergel(zero_u8v, srcP3);

        srcM1A = (vec_s16) vec_mergeh(zero_u8v, srcM1);
        srcM1B = (vec_s16) vec_mergel(zero_u8v, srcM1);
        srcM2A = (vec_s16) vec_mergeh(zero_u8v, srcM2);
        srcM2B = (vec_s16) vec_mergel(zero_u8v, srcM2);

        sum1A = vec_adds(srcP0A, srcP1A);
        sum1B = vec_adds(srcP0B, srcP1B);
        sum2A = vec_adds(srcM1A, srcP2A);
        sum2B = vec_adds(srcM1B, srcP2B);
        sum3A = vec_adds(srcM2A, srcP3A);
        sum3B = vec_adds(srcM2B, srcP3B);

        pp1A = vec_mladd(sum1A, v20ss, v16ss);
        pp1B = vec_mladd(sum1B, v20ss, v16ss);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        pp3A = vec_add(sum3A, pp1A);
        pp3B = vec_add(sum3B, pp1B);

        psumA = vec_sub(pp3A, pp2A);
        psumB = vec_sub(pp3B, pp2B);

        sumA = vec_sra(psumA, v5us);
        sumB = vec_sra(psumB, v5us);

        sum = vec_packsu(sumA, sumB);

        ASSERT_ALIGNED(dst);
        vdst = vec_ld(0, dst);

        OP_U8_ALTIVEC(fsum, sum, vdst);

        vec_st(fsum, 0, dst);

        src += srcStride;
        dst += dstStride;
    }
}
#endif

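/*
 * Vertical 6-tap half-pel filter.  The five rows around the current output
 * row are kept in registers (srcM2ss*..srcP2ss*) and shifted down by one
 * row per iteration, so each iteration only has to load the new srcP3 row.
 */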
/* this code assumes stride % 16 == 0 */
#ifdef PREFIX_h264_qpel16_v_lowpass_altivec
static void PREFIX_h264_qpel16_v_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) {
    register int i;

    LOAD_ZERO;
    const vec_u8 perm = vec_lvsl(0, src);
    const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
    const vec_u16 v5us = vec_splat_u16(5);
    const vec_s16 v5ss = vec_splat_s16(5);
    const vec_s16 v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));

    uint8_t *srcbis = src - (srcStride * 2);

    const vec_u8 srcM2a = vec_ld(0, srcbis);
    const vec_u8 srcM2b = vec_ld(16, srcbis);
    const vec_u8 srcM2 = vec_perm(srcM2a, srcM2b, perm);
    //srcbis += srcStride;
    const vec_u8 srcM1a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcM1b = vec_ld(16, srcbis);
    const vec_u8 srcM1 = vec_perm(srcM1a, srcM1b, perm);
    //srcbis += srcStride;
    const vec_u8 srcP0a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcP0b = vec_ld(16, srcbis);
    const vec_u8 srcP0 = vec_perm(srcP0a, srcP0b, perm);
    //srcbis += srcStride;
    const vec_u8 srcP1a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcP1b = vec_ld(16, srcbis);
    const vec_u8 srcP1 = vec_perm(srcP1a, srcP1b, perm);
    //srcbis += srcStride;
    const vec_u8 srcP2a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcP2b = vec_ld(16, srcbis);
    const vec_u8 srcP2 = vec_perm(srcP2a, srcP2b, perm);
    //srcbis += srcStride;

    vec_s16 srcM2ssA = (vec_s16) vec_mergeh(zero_u8v, srcM2);
    vec_s16 srcM2ssB = (vec_s16) vec_mergel(zero_u8v, srcM2);
    vec_s16 srcM1ssA = (vec_s16) vec_mergeh(zero_u8v, srcM1);
    vec_s16 srcM1ssB = (vec_s16) vec_mergel(zero_u8v, srcM1);
    vec_s16 srcP0ssA = (vec_s16) vec_mergeh(zero_u8v, srcP0);
    vec_s16 srcP0ssB = (vec_s16) vec_mergel(zero_u8v, srcP0);
    vec_s16 srcP1ssA = (vec_s16) vec_mergeh(zero_u8v, srcP1);
    vec_s16 srcP1ssB = (vec_s16) vec_mergel(zero_u8v, srcP1);
    vec_s16 srcP2ssA = (vec_s16) vec_mergeh(zero_u8v, srcP2);
    vec_s16 srcP2ssB = (vec_s16) vec_mergel(zero_u8v, srcP2);

    vec_s16 pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
            psumA, psumB, sumA, sumB,
            srcP3ssA, srcP3ssB,
            sum1A, sum1B, sum2A, sum2B, sum3A, sum3B;

    vec_u8 sum, vdst, fsum, srcP3a, srcP3b, srcP3;

    for (i = 0 ; i < 16 ; i++) {
        srcP3a = vec_ld(0, srcbis += srcStride);
        srcP3b = vec_ld(16, srcbis);
        srcP3 = vec_perm(srcP3a, srcP3b, perm);
        srcP3ssA = (vec_s16) vec_mergeh(zero_u8v, srcP3);
        srcP3ssB = (vec_s16) vec_mergel(zero_u8v, srcP3);
        //srcbis += srcStride;

        sum1A = vec_adds(srcP0ssA, srcP1ssA);
        sum1B = vec_adds(srcP0ssB, srcP1ssB);
        sum2A = vec_adds(srcM1ssA, srcP2ssA);
        sum2B = vec_adds(srcM1ssB, srcP2ssB);
        sum3A = vec_adds(srcM2ssA, srcP3ssA);
        sum3B = vec_adds(srcM2ssB, srcP3ssB);

        srcM2ssA = srcM1ssA;
        srcM2ssB = srcM1ssB;
        srcM1ssA = srcP0ssA;
        srcM1ssB = srcP0ssB;
        srcP0ssA = srcP1ssA;
        srcP0ssB = srcP1ssB;
        srcP1ssA = srcP2ssA;
        srcP1ssB = srcP2ssB;
        srcP2ssA = srcP3ssA;
        srcP2ssB = srcP3ssB;

        pp1A = vec_mladd(sum1A, v20ss, v16ss);
        pp1B = vec_mladd(sum1B, v20ss, v16ss);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        pp3A = vec_add(sum3A, pp1A);
        pp3B = vec_add(sum3B, pp1B);

        psumA = vec_sub(pp3A, pp2A);
        psumB = vec_sub(pp3B, pp2B);

        sumA = vec_sra(psumA, v5us);
        sumB = vec_sra(psumB, v5us);

        sum = vec_packsu(sumA, sumB);

        ASSERT_ALIGNED(dst);
        vdst = vec_ld(0, dst);

        OP_U8_ALTIVEC(fsum, sum, vdst);

        vec_st(fsum, 0, dst);

        dst += dstStride;
    }
}
#endif

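/*
 * Combined horizontal+vertical (2-D) 6-tap filter.  The first pass filters
 * 21 rows horizontally without the rounding shift and stores the 16-bit
 * intermediates in tmp; the second pass filters tmp vertically in 32-bit
 * precision and rounds with (sum + 512) >> 10.
 */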
/* this code assumes stride % 16 == 0 *and* tmp is properly aligned */
#ifdef PREFIX_h264_qpel16_hv_lowpass_altivec
static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp, uint8_t * src, int dstStride, int tmpStride, int srcStride) {
    register int i;
    LOAD_ZERO;
    const vec_u8 permM2 = vec_lvsl(-2, src);
    const vec_u8 permM1 = vec_lvsl(-1, src);
    const vec_u8 permP0 = vec_lvsl(+0, src);
    const vec_u8 permP1 = vec_lvsl(+1, src);
    const vec_u8 permP2 = vec_lvsl(+2, src);
    const vec_u8 permP3 = vec_lvsl(+3, src);
    const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
    const vec_u32 v10ui = vec_splat_u32(10);
    const vec_s16 v5ss = vec_splat_s16(5);
    const vec_s16 v1ss = vec_splat_s16(1);
    const vec_s32 v512si = vec_sl(vec_splat_s32(1),vec_splat_u32(9));
    const vec_u32 v16ui = vec_sl(vec_splat_u32(1),vec_splat_u32(4));

    register int align = ((((unsigned long)src) - 2) % 16);

    vec_s16 srcP0A, srcP0B, srcP1A, srcP1B,
            srcP2A, srcP2B, srcP3A, srcP3B,
            srcM1A, srcM1B, srcM2A, srcM2B,
            sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
            pp1A, pp1B, pp2A, pp2B, psumA, psumB;

    const vec_u8 mperm = (const vec_u8)
        {0x00, 0x08, 0x01, 0x09, 0x02, 0x0A, 0x03, 0x0B,
         0x04, 0x0C, 0x05, 0x0D, 0x06, 0x0E, 0x07, 0x0F};
    int16_t *tmpbis = tmp;

    vec_s16 tmpM1ssA, tmpM1ssB, tmpM2ssA, tmpM2ssB,
            tmpP0ssA, tmpP0ssB, tmpP1ssA, tmpP1ssB,
            tmpP2ssA, tmpP2ssB;

    vec_s32 pp1Ae, pp1Ao, pp1Be, pp1Bo, pp2Ae, pp2Ao, pp2Be, pp2Bo,
            pp3Ae, pp3Ao, pp3Be, pp3Bo, pp1cAe, pp1cAo, pp1cBe, pp1cBo,
            pp32Ae, pp32Ao, pp32Be, pp32Bo, sumAe, sumAo, sumBe, sumBo,
            ssumAe, ssumAo, ssumBe, ssumBo;
    vec_u8 fsum, sumv, sum, vdst;
    vec_s16 ssume, ssumo;

    src -= (2 * srcStride);
    for (i = 0 ; i < 21 ; i ++) {
        vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
        vec_u8 srcR1 = vec_ld(-2, src);
        vec_u8 srcR2 = vec_ld(14, src);

        switch (align) {
        default: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = vec_perm(srcR1, srcR2, permP3);
        } break;
        case 11: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = srcR2;
        } break;
        case 12: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = srcR2;
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 13: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = srcR2;
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 14: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = srcR2;
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 15: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = srcR2;
            srcP0 = vec_perm(srcR2, srcR3, permP0);
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        }

        srcP0A = (vec_s16) vec_mergeh(zero_u8v, srcP0);
        srcP0B = (vec_s16) vec_mergel(zero_u8v, srcP0);
        srcP1A = (vec_s16) vec_mergeh(zero_u8v, srcP1);
        srcP1B = (vec_s16) vec_mergel(zero_u8v, srcP1);

        srcP2A = (vec_s16) vec_mergeh(zero_u8v, srcP2);
        srcP2B = (vec_s16) vec_mergel(zero_u8v, srcP2);
        srcP3A = (vec_s16) vec_mergeh(zero_u8v, srcP3);
        srcP3B = (vec_s16) vec_mergel(zero_u8v, srcP3);

        srcM1A = (vec_s16) vec_mergeh(zero_u8v, srcM1);
        srcM1B = (vec_s16) vec_mergel(zero_u8v, srcM1);
        srcM2A = (vec_s16) vec_mergeh(zero_u8v, srcM2);
        srcM2B = (vec_s16) vec_mergel(zero_u8v, srcM2);

        sum1A = vec_adds(srcP0A, srcP1A);
        sum1B = vec_adds(srcP0B, srcP1B);
        sum2A = vec_adds(srcM1A, srcP2A);
        sum2B = vec_adds(srcM1B, srcP2B);
        sum3A = vec_adds(srcM2A, srcP3A);
        sum3B = vec_adds(srcM2B, srcP3B);

        pp1A = vec_mladd(sum1A, v20ss, sum3A);
        pp1B = vec_mladd(sum1B, v20ss, sum3B);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        psumA = vec_sub(pp1A, pp2A);
        psumB = vec_sub(pp1B, pp2B);

        vec_st(psumA, 0, tmp);
        vec_st(psumB, 16, tmp);

        src += srcStride;
        tmp += tmpStride; /* int16_t*, and stride is 16, so it's OK here */
    }

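    /*
     * Second pass: filter the 16-bit intermediates vertically.  The products
     * can exceed 16 bits, so even/odd lanes are widened to 32 bits with
     * vec_mule/vec_mulo, combined, rounded with (sum + 512) >> 10, and
     * re-interleaved through mperm before packing back to bytes.
     */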
    tmpM2ssA = vec_ld(0, tmpbis);
    tmpM2ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpM1ssA = vec_ld(0, tmpbis);
    tmpM1ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP0ssA = vec_ld(0, tmpbis);
    tmpP0ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP1ssA = vec_ld(0, tmpbis);
    tmpP1ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP2ssA = vec_ld(0, tmpbis);
    tmpP2ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;

    for (i = 0 ; i < 16 ; i++) {
        const vec_s16 tmpP3ssA = vec_ld(0, tmpbis);
        const vec_s16 tmpP3ssB = vec_ld(16, tmpbis);

        const vec_s16 sum1A = vec_adds(tmpP0ssA, tmpP1ssA);
        const vec_s16 sum1B = vec_adds(tmpP0ssB, tmpP1ssB);
        const vec_s16 sum2A = vec_adds(tmpM1ssA, tmpP2ssA);
        const vec_s16 sum2B = vec_adds(tmpM1ssB, tmpP2ssB);
        const vec_s16 sum3A = vec_adds(tmpM2ssA, tmpP3ssA);
        const vec_s16 sum3B = vec_adds(tmpM2ssB, tmpP3ssB);

        tmpbis += tmpStride;

        tmpM2ssA = tmpM1ssA;
        tmpM2ssB = tmpM1ssB;
        tmpM1ssA = tmpP0ssA;
        tmpM1ssB = tmpP0ssB;
        tmpP0ssA = tmpP1ssA;
        tmpP0ssB = tmpP1ssB;
        tmpP1ssA = tmpP2ssA;
        tmpP1ssB = tmpP2ssB;
        tmpP2ssA = tmpP3ssA;
        tmpP2ssB = tmpP3ssB;

        pp1Ae = vec_mule(sum1A, v20ss);
        pp1Ao = vec_mulo(sum1A, v20ss);
        pp1Be = vec_mule(sum1B, v20ss);
        pp1Bo = vec_mulo(sum1B, v20ss);

        pp2Ae = vec_mule(sum2A, v5ss);
        pp2Ao = vec_mulo(sum2A, v5ss);
        pp2Be = vec_mule(sum2B, v5ss);
        pp2Bo = vec_mulo(sum2B, v5ss);

        pp3Ae = vec_sra((vec_s32)sum3A, v16ui);
        pp3Ao = vec_mulo(sum3A, v1ss);
        pp3Be = vec_sra((vec_s32)sum3B, v16ui);
        pp3Bo = vec_mulo(sum3B, v1ss);

        pp1cAe = vec_add(pp1Ae, v512si);
        pp1cAo = vec_add(pp1Ao, v512si);
        pp1cBe = vec_add(pp1Be, v512si);
        pp1cBo = vec_add(pp1Bo, v512si);

        pp32Ae = vec_sub(pp3Ae, pp2Ae);
        pp32Ao = vec_sub(pp3Ao, pp2Ao);
        pp32Be = vec_sub(pp3Be, pp2Be);
        pp32Bo = vec_sub(pp3Bo, pp2Bo);

        sumAe = vec_add(pp1cAe, pp32Ae);
        sumAo = vec_add(pp1cAo, pp32Ao);
        sumBe = vec_add(pp1cBe, pp32Be);
        sumBo = vec_add(pp1cBo, pp32Bo);

        ssumAe = vec_sra(sumAe, v10ui);
        ssumAo = vec_sra(sumAo, v10ui);
        ssumBe = vec_sra(sumBe, v10ui);
        ssumBo = vec_sra(sumBo, v10ui);

        ssume = vec_packs(ssumAe, ssumBe);
        ssumo = vec_packs(ssumAo, ssumBo);

        sumv = vec_packsu(ssume, ssumo);
        sum = vec_perm(sumv, sumv, mperm);

        ASSERT_ALIGNED(dst);
        vdst = vec_ld(0, dst);

        OP_U8_ALTIVEC(fsum, sum, vdst);

        vec_st(fsum, 0, dst);

        dst += dstStride;
    }
}
#endif