#define ASSERT_ALIGNED(ptr) assert(!((unsigned long)ptr&0x0000000F));
#define ASSERT_ALIGNED(ptr) ;
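
/* Per-row core of the 8-wide chroma MC.  For each of the 8 output pixels it
 * computes the H.264 bilinear interpolation
 *     dst[i] = (A*s[i] + B*s[i+1] + C*s[i+stride] + D*s[i+stride+1] + bias) >> 6
 * vsrc0ssH/vsrc1ssH hold the current row at offsets 0 and 1 widened to 16 bits,
 * vsrc2ssH/vsrc3ssH the row below.  BIAS1 seeds the accumulator (32 for H.264
 * rounding), BIAS2 is a hook for an extra bias (see add28 below), and
 * OP_U8_ALTIVEC merges the packed result into the half of the destination
 * vector selected by fperm (put or avg variant of the including code). */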
#define CHROMA_MC8_ALTIVEC_CORE(BIAS1, BIAS2) \
        vsrc2ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc2uc);\
        vsrc3ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc3uc);\
        psum = vec_mladd(vA, vsrc0ssH, BIAS1);\
        psum = vec_mladd(vB, vsrc1ssH, psum);\
        psum = vec_mladd(vC, vsrc2ssH, psum);\
        psum = vec_mladd(vD, vsrc3ssH, psum);\
        psum = BIAS2(psum);\
        psum = vec_sr(psum, v6us);\
        vdst = vec_ld(0, dst);\
        ppsum = (vec_u8)vec_pack(psum, psum);\
        vfdst = vec_perm(vdst, ppsum, fperm);\
        OP_U8_ALTIVEC(fsum, vfdst, vdst);\
        vec_st(fsum, 0, dst);\
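
/* Simplified core for the one-dimensional cases (x == 0 or y == 0), where
 * only two taps are non-zero: vA and vE = vB + vC. */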
#define CHROMA_MC8_ALTIVEC_CORE_SIMPLE \
        vsrc0ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc0uc);\
        vsrc1ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc1uc);\
        psum = vec_mladd(vA, vsrc0ssH, v32ss);\
        psum = vec_mladd(vE, vsrc1ssH, psum);\
        psum = vec_sr(psum, v6us);\
        vdst = vec_ld(0, dst);\
        ppsum = (vec_u8)vec_pack(psum, psum);\
        vfdst = vec_perm(vdst, ppsum, fperm);\
        OP_U8_ALTIVEC(fsum, vfdst, vdst);\
        vec_st(fsum, 0, dst);\
#define add28(a) vec_add(v28ss, a)
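
/* BIAS2 helper for the VC-1 "no rounding" variant: adds v28ss (32 - 4 = 28)
 * to the accumulator.  The H.264 variant instead passes v32ss as BIAS1 and
 * leaves BIAS2 a no-op. */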
#ifdef PREFIX_h264_chroma_mc8_altivec
static void PREFIX_h264_chroma_mc8_altivec(uint8_t *dst, uint8_t *src,
                                           int stride, int h, int x, int y) {
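    /* Bilinear chroma weights as defined by H.264:
     *   A = (8-x)*(8-y), B = x*(8-y), C = (8-x)*y, D = x*y
     * so that dst = (A*s00 + B*s01 + C*s10 + D*s11 + 32) >> 6. */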
    const vec_s32 vABCD = vec_ld(0, ABCD);
    const vec_s16 v32ss = vec_sl(vec_splat_s16(1),vec_splat_u16(5));
    const vec_u16 v6us  = vec_splat_u16(6);
    register int loadSecond     = (((unsigned long)src) % 16) <= 7 ? 0 : 1;
    register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0;
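    /* loadSecond: the 9 source bytes needed per row (offsets 0..8) straddle
     * two 16-byte blocks, so a second aligned load is required.
     * reallyBadAlign: src % 16 == 15, where vec_lvsl(1, src) wraps to index 0
     * and would select from the first vector; the second vector is then used
     * directly instead of going through vec_perm. */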
    vec_u8 vsrcCuc, vsrc2uc, vsrc3uc;
    vec_s16 vsrc2ssH, vsrc3ssH, psum;
    vec_u8 vdst, ppsum, vfdst, fsum;
    if (((unsigned long)dst) % 16 == 0) {
        fperm = (vec_u8){0x10, 0x11, 0x12, 0x13,
                         0x14, 0x15, 0x16, 0x17,
                         0x08, 0x09, 0x0A, 0x0B,
                         0x0C, 0x0D, 0x0E, 0x0F};
    } else {
        fperm = (vec_u8){0x00, 0x01, 0x02, 0x03,
                         0x04, 0x05, 0x06, 0x07,
                         0x18, 0x19, 0x1A, 0x1B,
                         0x1C, 0x1D, 0x1E, 0x1F};
    }
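
    /* fperm routes the 8 packed result bytes into the half of the aligned
     * 16-byte destination vector that belongs to this block: the low half
     * when dst is 16-byte aligned, the high half otherwise (assuming
     * dst % 16 is either 0 or 8, i.e. the stride is a multiple of 16). */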
    vsrcAuc = vec_ld(0, src);
    vsrcBuc = vec_ld(16, src);
    vsrcperm0 = vec_lvsl(0, src);
    vsrcperm1 = vec_lvsl(1, src);

    vsrc0uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm0);
    vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1);
    for (i = 0 ; i < h ; i++) {
        vsrcCuc = vec_ld(stride + 0, src);
        vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
        vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);

    for (i = 0 ; i < h ; i++) {
        vsrcCuc = vec_ld(stride + 0, src);
        vsrcDuc = vec_ld(stride + 16, src);
        vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
        vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);
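
    /* D == 0 (x == 0 or y == 0): the filter degenerates to two taps, vA and
     * vE = vB + vC.  The loops below differ only in whether the second
     * sample comes from the row below (x == 0) or the next column (y == 0). */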
    const vec_s16 vE = vec_add(vB, vC);

    for (i = 0 ; i < h ; i++) {
        vsrcCuc = vec_ld(stride + 0, src);
        vsrc1uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);

    for (i = 0 ; i < h ; i++) {
        vsrcCuc = vec_ld(stride + 0, src);
        vsrcDuc = vec_ld(stride + 15, src);
        vsrc1uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);

    for (i = 0 ; i < h ; i++) {
        vsrcCuc = vec_ld(0, src);
        vsrc0uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
        vsrc1uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);

    for (i = 0 ; i < h ; i++) {
        vsrcCuc = vec_ld(0, src);
        vsrcDuc = vec_ld(15, src);
        vsrc0uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
        vsrc1uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);
#ifdef PREFIX_no_rnd_vc1_chroma_mc8_altivec
static void PREFIX_no_rnd_vc1_chroma_mc8_altivec(uint8_t *dst, uint8_t *src,
                                                 int stride, int h, int x, int y) {
                        {((8 - x) * (8 - y)),
    const vec_s32 vABCD = vec_ld(0, ABCD);
    const vec_s16 v28ss = vec_sub(vec_sl(vec_splat_s16(1),vec_splat_u16(5)),vec_splat_s16(4));
    const vec_u16 v6us  = vec_splat_u16(6);
    register int loadSecond     = (((unsigned long)src) % 16) <= 7 ? 0 : 1;
    register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0;

    vec_u8 vsrcCuc, vsrc2uc, vsrc3uc;
    vec_s16 vsrc2ssH, vsrc3ssH, psum;
    vec_u8 vdst, ppsum, vfdst, fsum;
    if (((unsigned long)dst) % 16 == 0) {
        fperm = (vec_u8){0x10, 0x11, 0x12, 0x13,
                         0x14, 0x15, 0x16, 0x17,
                         0x08, 0x09, 0x0A, 0x0B,
                         0x0C, 0x0D, 0x0E, 0x0F};
    } else {
        fperm = (vec_u8){0x00, 0x01, 0x02, 0x03,
                         0x04, 0x05, 0x06, 0x07,
                         0x18, 0x19, 0x1A, 0x1B,
                         0x1C, 0x1D, 0x1E, 0x1F};
    }
    vsrcAuc = vec_ld(0, src);
    vsrcBuc = vec_ld(16, src);
    vsrcperm0 = vec_lvsl(0, src);
    vsrcperm1 = vec_lvsl(1, src);

    vsrc0uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm0);
    vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1);

    for (i = 0 ; i < h ; i++) {
        vsrcCuc = vec_ld(stride + 0, src);
        vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
        vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);

    for (i = 0 ; i < h ; i++) {
        vsrcCuc = vec_ld(stride + 0, src);
        vsrcDuc = vec_ld(stride + 16, src);
        vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
        vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);
#undef CHROMA_MC8_ALTIVEC_CORE
#ifdef PREFIX_h264_qpel16_h_lowpass_altivec
    const vec_u8 permM2 = vec_lvsl(-2, src);
    const vec_u8 permM1 = vec_lvsl(-1, src);
    const vec_u8 permP0 = vec_lvsl(+0, src);
    const vec_u8 permP1 = vec_lvsl(+1, src);
    const vec_u8 permP2 = vec_lvsl(+2, src);
    const vec_u8 permP3 = vec_lvsl(+3, src);
    const vec_s16 v5ss  = vec_splat_s16(5);
    const vec_u16 v5us  = vec_splat_u16(5);
    const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
    const vec_s16 v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));
    vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;

    register int align = ((((unsigned long)src) - 2) % 16);

    vec_s16 srcP0A, srcP0B, srcP1A, srcP1B,
            srcP2A, srcP2B, srcP3A, srcP3B,
            srcM1A, srcM1B, srcM2A, srcM2B,
            sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
            pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
            psumA, psumB, sumA, sumB;
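
    /* Half-pel horizontal 6-tap filter with taps (1, -5, 20, 20, -5, 1):
     *   dst[i] = clip(((P0+P1)*20 - (M1+P2)*5 + (M2+P3) + 16) >> 5)
     * computed on two 8-element halves (A: pixels 0-7, B: pixels 8-15). */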
    for (i = 0 ; i < 16 ; i ++) {
        vec_u8 srcR1 = vec_ld(-2, src);
        vec_u8 srcR2 = vec_ld(14, src);
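
        /* Build the six shifted source vectors srcM2 .. srcP3 from the
         * aligned loads; depending on how (src - 2) is aligned, a third
         * load (srcR3) is needed for the right-most taps. */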
        srcM2 = vec_perm(srcR1, srcR2, permM2);
        srcM1 = vec_perm(srcR1, srcR2, permM1);
        srcP0 = vec_perm(srcR1, srcR2, permP0);
        srcP1 = vec_perm(srcR1, srcR2, permP1);
        srcP2 = vec_perm(srcR1, srcR2, permP2);
        srcP3 = vec_perm(srcR1, srcR2, permP3);

        srcM2 = vec_perm(srcR1, srcR2, permM2);
        srcM1 = vec_perm(srcR1, srcR2, permM1);
        srcP0 = vec_perm(srcR1, srcR2, permP0);
        srcP1 = vec_perm(srcR1, srcR2, permP1);
        srcP2 = vec_perm(srcR1, srcR2, permP2);

        vec_u8 srcR3 = vec_ld(30, src);
        srcM2 = vec_perm(srcR1, srcR2, permM2);
        srcM1 = vec_perm(srcR1, srcR2, permM1);
        srcP0 = vec_perm(srcR1, srcR2, permP0);
        srcP1 = vec_perm(srcR1, srcR2, permP1);
        srcP3 = vec_perm(srcR2, srcR3, permP3);

        vec_u8 srcR3 = vec_ld(30, src);
        srcM2 = vec_perm(srcR1, srcR2, permM2);
        srcM1 = vec_perm(srcR1, srcR2, permM1);
        srcP0 = vec_perm(srcR1, srcR2, permP0);
        srcP2 = vec_perm(srcR2, srcR3, permP2);
        srcP3 = vec_perm(srcR2, srcR3, permP3);

        vec_u8 srcR3 = vec_ld(30, src);
        srcM2 = vec_perm(srcR1, srcR2, permM2);
        srcM1 = vec_perm(srcR1, srcR2, permM1);
        srcP1 = vec_perm(srcR2, srcR3, permP1);
        srcP2 = vec_perm(srcR2, srcR3, permP2);
        srcP3 = vec_perm(srcR2, srcR3, permP3);

        vec_u8 srcR3 = vec_ld(30, src);
        srcM2 = vec_perm(srcR1, srcR2, permM2);
        srcP0 = vec_perm(srcR2, srcR3, permP0);
        srcP1 = vec_perm(srcR2, srcR3, permP1);
        srcP2 = vec_perm(srcR2, srcR3, permP2);
        srcP3 = vec_perm(srcR2, srcR3, permP3);
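
        /* The shifted byte vectors are then zero-extended into the signed
         * 16-bit A/B halves (srcM2A/B .. srcP3A/B) consumed below. */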
        sum1A = vec_adds(srcP0A, srcP1A);
        sum1B = vec_adds(srcP0B, srcP1B);
        sum2A = vec_adds(srcM1A, srcP2A);
        sum2B = vec_adds(srcM1B, srcP2B);
        sum3A = vec_adds(srcM2A, srcP3A);
        sum3B = vec_adds(srcM2B, srcP3B);

        pp1A = vec_mladd(sum1A, v20ss, v16ss);
        pp1B = vec_mladd(sum1B, v20ss, v16ss);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        pp3A = vec_add(sum3A, pp1A);
        pp3B = vec_add(sum3B, pp1B);

        psumA = vec_sub(pp3A, pp2A);
        psumB = vec_sub(pp3B, pp2B);

        sumA = vec_sra(psumA, v5us);
        sumB = vec_sra(psumB, v5us);

        sum = vec_packsu(sumA, sumB);

        vdst = vec_ld(0, dst);

        vec_st(fsum, 0, dst);
#ifdef PREFIX_h264_qpel16_v_lowpass_altivec
    const vec_u8 perm   = vec_lvsl(0, src);
    const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
    const vec_u16 v5us  = vec_splat_u16(5);
    const vec_s16 v5ss  = vec_splat_s16(5);
    const vec_s16 v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));

    uint8_t *srcbis = src - (srcStride * 2);
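
    /* Vertical 6-tap: rows M2 .. P2 are preloaded once; each loop iteration
     * below loads only the new bottom row (P3) and reuses the previous five,
     * rotating the row registers. */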
    const vec_u8 srcM2a = vec_ld(0, srcbis);
    const vec_u8 srcM2b = vec_ld(16, srcbis);
    const vec_u8 srcM2  = vec_perm(srcM2a, srcM2b, perm);
    const vec_u8 srcM1a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcM1b = vec_ld(16, srcbis);
    const vec_u8 srcM1  = vec_perm(srcM1a, srcM1b, perm);
    const vec_u8 srcP0a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcP0b = vec_ld(16, srcbis);
    const vec_u8 srcP0  = vec_perm(srcP0a, srcP0b, perm);
    const vec_u8 srcP1a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcP1b = vec_ld(16, srcbis);
    const vec_u8 srcP1  = vec_perm(srcP1a, srcP1b, perm);
    const vec_u8 srcP2a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcP2b = vec_ld(16, srcbis);
    const vec_u8 srcP2  = vec_perm(srcP2a, srcP2b, perm);
    vec_s16 pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
            psumA, psumB, sumA, sumB,
            sum1A, sum1B, sum2A, sum2B, sum3A, sum3B;

    vec_u8 sum, vdst, fsum, srcP3a, srcP3b, srcP3;
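
    /* Each iteration filters one 16-pixel row with the same
     * (1, -5, 20, 20, -5, 1) kernel as the horizontal pass, again split
     * into A (pixels 0-7) and B (pixels 8-15) halves. */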
    for (i = 0 ; i < 16 ; i++) {
        srcP3a = vec_ld(0, srcbis += srcStride);
        srcP3b = vec_ld(16, srcbis);
        srcP3  = vec_perm(srcP3a, srcP3b, perm);
        sum1A = vec_adds(srcP0ssA, srcP1ssA);
        sum1B = vec_adds(srcP0ssB, srcP1ssB);
        sum2A = vec_adds(srcM1ssA, srcP2ssA);
        sum2B = vec_adds(srcM1ssB, srcP2ssB);
        sum3A = vec_adds(srcM2ssA, srcP3ssA);
        sum3B = vec_adds(srcM2ssB, srcP3ssB);
        pp1A = vec_mladd(sum1A, v20ss, v16ss);
        pp1B = vec_mladd(sum1B, v20ss, v16ss);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        pp3A = vec_add(sum3A, pp1A);
        pp3B = vec_add(sum3B, pp1B);

        psumA = vec_sub(pp3A, pp2A);
        psumB = vec_sub(pp3B, pp2B);

        sumA = vec_sra(psumA, v5us);
        sumB = vec_sra(psumB, v5us);

        sum = vec_packsu(sumA, sumB);

        vdst = vec_ld(0, dst);

        vec_st(fsum, 0, dst);
#ifdef PREFIX_h264_qpel16_hv_lowpass_altivec
    const vec_u8 permM2 = vec_lvsl(-2, src);
    const vec_u8 permM1 = vec_lvsl(-1, src);
    const vec_u8 permP0 = vec_lvsl(+0, src);
    const vec_u8 permP1 = vec_lvsl(+1, src);
    const vec_u8 permP2 = vec_lvsl(+2, src);
    const vec_u8 permP3 = vec_lvsl(+3, src);
    const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
    const vec_u32 v10ui = vec_splat_u32(10);
    const vec_s16 v5ss  = vec_splat_s16(5);
    const vec_s16 v1ss  = vec_splat_s16(1);
    const vec_s32 v512si = vec_sl(vec_splat_s32(1),vec_splat_u32(9));
    const vec_u32 v16ui  = vec_sl(vec_splat_u32(1),vec_splat_u32(4));
    register int align = ((((unsigned long)src) - 2) % 16);
    vec_s16 srcP0A, srcP0B, srcP1A, srcP1B,
            srcP2A, srcP2B, srcP3A, srcP3B,
            srcM1A, srcM1B, srcM2A, srcM2B,
            sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
            pp1A, pp1B, pp2A, pp2B, psumA, psumB;
    const vec_u8 mperm = (const vec_u8)
        {0x00, 0x08, 0x01, 0x09, 0x02, 0x0A, 0x03, 0x0B,
         0x04, 0x0C, 0x05, 0x0D, 0x06, 0x0E, 0x07, 0x0F};
    int16_t *tmpbis = tmp;
    vec_s16 tmpM1ssA, tmpM1ssB, tmpM2ssA, tmpM2ssB,
            tmpP0ssA, tmpP0ssB, tmpP1ssA, tmpP1ssB,
            tmpP2ssA, tmpP2ssB;
    vec_s32 pp1Ae, pp1Ao, pp1Be, pp1Bo, pp2Ae, pp2Ao, pp2Be, pp2Bo,
            pp3Ae, pp3Ao, pp3Be, pp3Bo, pp1cAe, pp1cAo, pp1cBe, pp1cBo,
            pp32Ae, pp32Ao, pp32Be, pp32Bo, sumAe, sumAo, sumBe, sumBo,
            ssumAe, ssumAo, ssumBe, ssumBo;
    vec_u8 fsum, sumv, sum, vdst;
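
    /* Two-pass 2D (half-pel in both directions) filter: the first pass runs
     * the horizontal 6-tap over 21 rows (16 output rows plus 5 of margin) and
     * stores the unscaled 16-bit sums in tmp; the second pass applies the
     * vertical 6-tap to tmp with 32-bit intermediates, then rounds with +512
     * and shifts right by 10. */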
    src -= (2 * srcStride);
    for (i = 0 ; i < 21 ; i ++) {
        vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
        vec_u8 srcR1 = vec_ld(-2, src);
        vec_u8 srcR2 = vec_ld(14, src);
        srcM2 = vec_perm(srcR1, srcR2, permM2);
        srcM1 = vec_perm(srcR1, srcR2, permM1);
        srcP0 = vec_perm(srcR1, srcR2, permP0);
        srcP1 = vec_perm(srcR1, srcR2, permP1);
        srcP2 = vec_perm(srcR1, srcR2, permP2);
        srcP3 = vec_perm(srcR1, srcR2, permP3);

        srcM2 = vec_perm(srcR1, srcR2, permM2);
        srcM1 = vec_perm(srcR1, srcR2, permM1);
        srcP0 = vec_perm(srcR1, srcR2, permP0);
        srcP1 = vec_perm(srcR1, srcR2, permP1);
        srcP2 = vec_perm(srcR1, srcR2, permP2);

        vec_u8 srcR3 = vec_ld(30, src);
        srcM2 = vec_perm(srcR1, srcR2, permM2);
        srcM1 = vec_perm(srcR1, srcR2, permM1);
        srcP0 = vec_perm(srcR1, srcR2, permP0);
        srcP1 = vec_perm(srcR1, srcR2, permP1);
        srcP3 = vec_perm(srcR2, srcR3, permP3);

        vec_u8 srcR3 = vec_ld(30, src);
        srcM2 = vec_perm(srcR1, srcR2, permM2);
        srcM1 = vec_perm(srcR1, srcR2, permM1);
        srcP0 = vec_perm(srcR1, srcR2, permP0);
        srcP2 = vec_perm(srcR2, srcR3, permP2);
        srcP3 = vec_perm(srcR2, srcR3, permP3);

        vec_u8 srcR3 = vec_ld(30, src);
        srcM2 = vec_perm(srcR1, srcR2, permM2);
        srcM1 = vec_perm(srcR1, srcR2, permM1);
        srcP1 = vec_perm(srcR2, srcR3, permP1);
        srcP2 = vec_perm(srcR2, srcR3, permP2);
        srcP3 = vec_perm(srcR2, srcR3, permP3);

        vec_u8 srcR3 = vec_ld(30, src);
        srcM2 = vec_perm(srcR1, srcR2, permM2);
        srcP0 = vec_perm(srcR2, srcR3, permP0);
        srcP1 = vec_perm(srcR2, srcR3, permP1);
        srcP2 = vec_perm(srcR2, srcR3, permP2);
        srcP3 = vec_perm(srcR2, srcR3, permP3);
        sum1A = vec_adds(srcP0A, srcP1A);
        sum1B = vec_adds(srcP0B, srcP1B);
        sum2A = vec_adds(srcM1A, srcP2A);
        sum2B = vec_adds(srcM1B, srcP2B);
        sum3A = vec_adds(srcM2A, srcP3A);
        sum3B = vec_adds(srcM2B, srcP3B);

        pp1A = vec_mladd(sum1A, v20ss, sum3A);
        pp1B = vec_mladd(sum1B, v20ss, sum3B);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        psumA = vec_sub(pp1A, pp2A);
        psumB = vec_sub(pp1B, pp2B);

        vec_st(psumA, 0, tmp);
        vec_st(psumB, 16, tmp);
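
    /* Second pass: run the same 6-tap kernel vertically over the 16-bit
     * intermediates in tmp.  The products no longer fit in 16 bits, so the
     * even and odd lanes are widened to 32 bits with vec_mule/vec_mulo and
     * processed separately. */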
    tmpM2ssA = vec_ld(0, tmpbis);
    tmpM2ssB = vec_ld(16, tmpbis);
    tmpM1ssA = vec_ld(0, tmpbis);
    tmpM1ssB = vec_ld(16, tmpbis);
    tmpP0ssA = vec_ld(0, tmpbis);
    tmpP0ssB = vec_ld(16, tmpbis);
    tmpP1ssA = vec_ld(0, tmpbis);
    tmpP1ssB = vec_ld(16, tmpbis);
    tmpP2ssA = vec_ld(0, tmpbis);
    tmpP2ssB = vec_ld(16, tmpbis);
    for (i = 0 ; i < 16 ; i++) {
        const vec_s16 tmpP3ssA = vec_ld(0, tmpbis);
        const vec_s16 tmpP3ssB = vec_ld(16, tmpbis);

        const vec_s16 sum1A = vec_adds(tmpP0ssA, tmpP1ssA);
        const vec_s16 sum1B = vec_adds(tmpP0ssB, tmpP1ssB);
        const vec_s16 sum2A = vec_adds(tmpM1ssA, tmpP2ssA);
        const vec_s16 sum2B = vec_adds(tmpM1ssB, tmpP2ssB);
        const vec_s16 sum3A = vec_adds(tmpM2ssA, tmpP3ssA);
        const vec_s16 sum3B = vec_adds(tmpM2ssB, tmpP3ssB);
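
        /* Widen to 32 bits: vec_mule/vec_mulo give the even/odd products of
         * the 20x and 5x terms.  For the unit-weight term, shifting the
         * 32-bit words right by 16 sign-extends the even-indexed 16-bit
         * elements (big-endian lane order), and a multiply by 1 extracts the
         * odd-indexed ones. */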
        pp1Ae = vec_mule(sum1A, v20ss);
        pp1Ao = vec_mulo(sum1A, v20ss);
        pp1Be = vec_mule(sum1B, v20ss);
        pp1Bo = vec_mulo(sum1B, v20ss);

        pp2Ae = vec_mule(sum2A, v5ss);
        pp2Ao = vec_mulo(sum2A, v5ss);
        pp2Be = vec_mule(sum2B, v5ss);
        pp2Bo = vec_mulo(sum2B, v5ss);

        pp3Ae = vec_sra((vec_s32)sum3A, v16ui);
        pp3Ao = vec_mulo(sum3A, v1ss);
        pp3Be = vec_sra((vec_s32)sum3B, v16ui);
        pp3Bo = vec_mulo(sum3B, v1ss);
        pp1cAe = vec_add(pp1Ae, v512si);
        pp1cAo = vec_add(pp1Ao, v512si);
        pp1cBe = vec_add(pp1Be, v512si);
        pp1cBo = vec_add(pp1Bo, v512si);

        pp32Ae = vec_sub(pp3Ae, pp2Ae);
        pp32Ao = vec_sub(pp3Ao, pp2Ao);
        pp32Be = vec_sub(pp3Be, pp2Be);
        pp32Bo = vec_sub(pp3Bo, pp2Bo);

        sumAe = vec_add(pp1cAe, pp32Ae);
        sumAo = vec_add(pp1cAo, pp32Ao);
        sumBe = vec_add(pp1cBe, pp32Be);
        sumBo = vec_add(pp1cBo, pp32Bo);

        ssumAe = vec_sra(sumAe, v10ui);
        ssumAo = vec_sra(sumAo, v10ui);
        ssumBe = vec_sra(sumBe, v10ui);
        ssumBo = vec_sra(sumBo, v10ui);
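
        /* Pack the even/odd 32-bit results back to 16 bits, saturate to
         * unsigned bytes, and let mperm re-interleave them into pixel order
         * for the final store. */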
        ssume = vec_packs(ssumAe, ssumBe);
        ssumo = vec_packs(ssumAo, ssumBo);

        sumv = vec_packsu(ssume, ssumo);
        sum  = vec_perm(sumv, sumv, mperm);

        vdst = vec_ld(0, dst);

        vec_st(fsum, 0, dst);