    __typeof__(a) swap_temp=a; \
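
// Transposes a 4x4 matrix held in four vectors of four elements each:
// two rounds of vec_mergeh/vec_mergel interleave rows into columns.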
#define TRANSPOSE4(a,b,c,d) \
do { \
    __typeof__(a) _trans_ach = vec_mergeh(a, c); \
    __typeof__(a) _trans_acl = vec_mergel(a, c); \
    __typeof__(a) _trans_bdh = vec_mergeh(b, d); \
    __typeof__(a) _trans_bdl = vec_mergel(b, d); \
 \
    a = vec_mergeh(_trans_ach, _trans_bdh); \
    b = vec_mergel(_trans_ach, _trans_bdh); \
    c = vec_mergeh(_trans_acl, _trans_bdl); \
    d = vec_mergel(_trans_acl, _trans_bdl); \
} while (0)
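
// Loads a single 32-bit value from a possibly unaligned address and
// broadcasts it to all four elements: vec_lvsl/vec_perm rotate the loaded
// quadword so the target value lands in element 0, then vec_splat copies it.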
#define LOAD4(vec, address) \
{ \
    __typeof__(vec)* _load_addr = (__typeof__(vec)*)(address); \
    vector unsigned char _perm_vec = vec_lvsl(0,(address)); \
    vec = vec_ld(0, _load_addr); \
    vec = vec_perm(vec, vec, _perm_vec); \
    vec = vec_splat(vec, 0); \
}

#define FOUROF(a) {a,a,a,a}

                         int qscale, int* overflow)
{
    int lastNonZero;
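
    // Each 8-element row of the block is split across two float vectors:
    // row* holds the four high elements, alt* the four low elements.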
    vector float row0, row1, row2, row3, row4, row5, row6, row7;
    vector float alt0, alt1, alt2, alt3, alt4, alt5, alt6, alt7;
    const vector float zero = (const vector float)FOUROF(0.);
    int oldBaseValue = 0;

    vector signed short data0, data1, data2, data3, data4, data5, data6, data7;
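
    // Eight aligned vec_ld loads cover the 64 16-bit coefficients
    // (vec_ld assumes the block is 16-byte aligned).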
    data0 = vec_ld(0, data);
    data1 = vec_ld(16, data);
    data2 = vec_ld(32, data);
    data3 = vec_ld(48, data);
    data4 = vec_ld(64, data);
    data5 = vec_ld(80, data);
    data6 = vec_ld(96, data);
    data7 = vec_ld(112, data);
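
    // Transpose the data before we start.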
    TRANSPOSE8(data0, data1, data2, data3, data4, data5, data6, data7);
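
    // Convert to floats: vec_unpackh widens the four high shorts of each
    // vector (-> row*), vec_unpackl the four low shorts (-> alt*).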
    row0 = vec_ctf(vec_unpackh(data0), 0);
    alt0 = vec_ctf(vec_unpackl(data0), 0);
    row1 = vec_ctf(vec_unpackh(data1), 0);
    alt1 = vec_ctf(vec_unpackl(data1), 0);
    row2 = vec_ctf(vec_unpackh(data2), 0);
    alt2 = vec_ctf(vec_unpackl(data2), 0);
    row3 = vec_ctf(vec_unpackh(data3), 0);
    alt3 = vec_ctf(vec_unpackl(data3), 0);
    row4 = vec_ctf(vec_unpackh(data4), 0);
    alt4 = vec_ctf(vec_unpackl(data4), 0);
    row5 = vec_ctf(vec_unpackh(data5), 0);
    alt5 = vec_ctf(vec_unpackl(data5), 0);
    row6 = vec_ctf(vec_unpackh(data6), 0);
    alt6 = vec_ctf(vec_unpackl(data6), 0);
    row7 = vec_ctf(vec_unpackh(data7), 0);
    alt7 = vec_ctf(vec_unpackl(data7), 0);
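
    // DCT constants, four copies of each so they apply vector-wide.
    // These appear to be the cosine constants of the scalar LLM/IJG DCT;
    // the values the scalar code subtracts are stored negated, so every
    // step below can use a fused multiply-add (vec_madd).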
    const vector float vec_0_298631336 = (vector float)FOUROF(0.298631336f);
    const vector float vec_0_390180644 = (vector float)FOUROF(-0.390180644f);
    const vector float vec_0_541196100 = (vector float)FOUROF(0.541196100f);
    const vector float vec_0_765366865 = (vector float)FOUROF(0.765366865f);
    const vector float vec_0_899976223 = (vector float)FOUROF(-0.899976223f);
    const vector float vec_1_175875602 = (vector float)FOUROF(1.175875602f);
    const vector float vec_1_501321110 = (vector float)FOUROF(1.501321110f);
    const vector float vec_1_847759065 = (vector float)FOUROF(-1.847759065f);
    const vector float vec_1_961570560 = (vector float)FOUROF(-1.961570560f);
    const vector float vec_2_053119869 = (vector float)FOUROF(2.053119869f);
    const vector float vec_2_562915447 = (vector float)FOUROF(-2.562915447f);
    const vector float vec_3_072711026 = (vector float)FOUROF(3.072711026f);

    int whichPass, whichHalf;
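
    // Two passes of the 1-D DCT (rows, then columns); within each pass the
    // two four-column halves of the block (row*, then alt*) are processed
    // in turn.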
    for(whichPass = 1; whichPass<=2; whichPass++) {
        for(whichHalf = 1; whichHalf<=2; whichHalf++) {
            vector float tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
            vector float tmp10, tmp11, tmp12, tmp13;
            vector float z1, z2, z3, z4, z5;
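
            // Stage 1 butterflies: the sums (tmp0..tmp3) feed the even
            // part, the differences (tmp4..tmp7) feed the odd part.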
            tmp0 = vec_add(row0, row7);
            tmp7 = vec_sub(row0, row7);
            tmp3 = vec_add(row3, row4);
            tmp4 = vec_sub(row3, row4);
            tmp1 = vec_add(row1, row6);
            tmp6 = vec_sub(row1, row6);
            tmp2 = vec_add(row2, row5);
            tmp5 = vec_sub(row2, row5);

            tmp10 = vec_add(tmp0, tmp3);
            tmp13 = vec_sub(tmp0, tmp3);
            tmp11 = vec_add(tmp1, tmp2);
            tmp12 = vec_sub(tmp1, tmp2);
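
            // Even part: coefficients 0 and 4 are plain sums/differences;
            // 2 and 6 share the z1 rotation term.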
            row0 = vec_add(tmp10, tmp11);
            row4 = vec_sub(tmp10, tmp11);

            z1 = vec_madd(vec_add(tmp12, tmp13), vec_0_541196100, (vector float)zero);
            row2 = vec_madd(tmp13, vec_0_765366865, z1);
            row6 = vec_madd(tmp12, vec_1_847759065, z1);
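
            // Odd part: z5 is the rotation term shared by z3 and z4.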
            z1 = vec_add(tmp4, tmp7);
            z2 = vec_add(tmp5, tmp6);
            z3 = vec_add(tmp4, tmp6);
            z4 = vec_add(tmp5, tmp7);

            z5 = vec_madd(vec_add(z3, z4), vec_1_175875602, (vector float)zero);

            z3 = vec_madd(z3, vec_1_961570560, z5);
            z4 = vec_madd(z4, vec_0_390180644, z5);
            row7 = vec_madd(tmp4, vec_0_298631336, vec_madd(z1, vec_0_899976223, z3));
            row5 = vec_madd(tmp5, vec_2_053119869, vec_madd(z2, vec_2_562915447, z4));
            row3 = vec_madd(tmp6, vec_3_072711026, vec_madd(z2, vec_2_562915447, z3));
            row1 = vec_madd(z1, vec_0_899976223, vec_madd(tmp7, vec_1_501321110, z4));

            if (whichPass == 1) {

        const vector signed int* qmat;
        vector float bias, negBias;

        vector signed int baseVector;
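
        // Cache the DC term (element 0 of row0) before quantizing;
        // it is requantized separately at the end.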
        baseVector = vec_cts(vec_splat(row0, 0), 0);
        vec_ste(baseVector, 0, &oldBaseValue);

        vector signed int biasInt;
        const vector float negOneFloat = (vector float)FOUROF(-1.0f);
        LOAD4(biasInt, biasAddr);
        bias = vec_ctf(biasInt, QUANT_BIAS_SHIFT);
        negBias = vec_madd(bias, negOneFloat, zero);

        vector float q0, q1, q2, q3, q4, q5, q6, q7;
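
        // Branchless quantize with rounding away from zero: compute both
        // value*q + bias and value*q - bias, then pick by sign with
        // vec_cmpgt/vec_sel.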
        row0 = vec_sel(vec_madd(row0, q0, negBias), vec_madd(row0, q0, bias),
                       vec_cmpgt(row0, zero));
        row1 = vec_sel(vec_madd(row1, q1, negBias), vec_madd(row1, q1, bias),
                       vec_cmpgt(row1, zero));
        row2 = vec_sel(vec_madd(row2, q2, negBias), vec_madd(row2, q2, bias),
                       vec_cmpgt(row2, zero));
        row3 = vec_sel(vec_madd(row3, q3, negBias), vec_madd(row3, q3, bias),
                       vec_cmpgt(row3, zero));
        row4 = vec_sel(vec_madd(row4, q4, negBias), vec_madd(row4, q4, bias),
                       vec_cmpgt(row4, zero));
        row5 = vec_sel(vec_madd(row5, q5, negBias), vec_madd(row5, q5, bias),
                       vec_cmpgt(row5, zero));
        row6 = vec_sel(vec_madd(row6, q6, negBias), vec_madd(row6, q6, bias),
                       vec_cmpgt(row6, zero));
        row7 = vec_sel(vec_madd(row7, q7, negBias), vec_madd(row7, q7, bias),
                       vec_cmpgt(row7, zero));
        alt0 = vec_sel(vec_madd(alt0, q0, negBias), vec_madd(alt0, q0, bias),
                       vec_cmpgt(alt0, zero));
        alt1 = vec_sel(vec_madd(alt1, q1, negBias), vec_madd(alt1, q1, bias),
                       vec_cmpgt(alt1, zero));
        alt2 = vec_sel(vec_madd(alt2, q2, negBias), vec_madd(alt2, q2, bias),
                       vec_cmpgt(alt2, zero));
        alt3 = vec_sel(vec_madd(alt3, q3, negBias), vec_madd(alt3, q3, bias),
                       vec_cmpgt(alt3, zero));
        alt4 = vec_sel(vec_madd(alt4, q4, negBias), vec_madd(alt4, q4, bias),
                       vec_cmpgt(alt4, zero));
        alt5 = vec_sel(vec_madd(alt5, q5, negBias), vec_madd(alt5, q5, bias),
                       vec_cmpgt(alt5, zero));
        alt6 = vec_sel(vec_madd(alt6, q6, negBias), vec_madd(alt6, q6, bias),
                       vec_cmpgt(alt6, zero));
        alt7 = vec_sel(vec_madd(alt7, q7, negBias), vec_madd(alt7, q7, bias),
                       vec_cmpgt(alt7, zero));

        vector signed short data0, data1, data2, data3, data4, data5, data6, data7;
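
        // Back to 16-bit: vec_cts truncates each float half to int32,
        // vec_pack joins the row*/alt* halves into one vector of shorts.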
        data0 = vec_pack(vec_cts(row0, 0), vec_cts(alt0, 0));
        data1 = vec_pack(vec_cts(row1, 0), vec_cts(alt1, 0));
        data2 = vec_pack(vec_cts(row2, 0), vec_cts(alt2, 0));
        data3 = vec_pack(vec_cts(row3, 0), vec_cts(alt3, 0));
        data4 = vec_pack(vec_cts(row4, 0), vec_cts(alt4, 0));
        data5 = vec_pack(vec_cts(row5, 0), vec_cts(alt5, 0));
        data6 = vec_pack(vec_cts(row6, 0), vec_cts(alt6, 0));
        data7 = vec_pack(vec_cts(row7, 0), vec_cts(alt7, 0));

        vector signed int max_q_int, min_q_int;
        vector signed short max_q, min_q;

        max_q = vec_pack(max_q_int, max_q_int);
        min_q = vec_pack(min_q_int, min_q_int);
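
        // Saturate every coefficient to the legal [min_q, max_q] range.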
        data0 = vec_max(vec_min(data0, max_q), min_q);
        data1 = vec_max(vec_min(data1, max_q), min_q);
        data2 = vec_max(vec_min(data2, max_q), min_q);
        data3 = vec_max(vec_min(data3, max_q), min_q);
        data4 = vec_max(vec_min(data4, max_q), min_q);
        data5 = vec_max(vec_min(data5, max_q), min_q);
        data6 = vec_max(vec_min(data6, max_q), min_q);
        data7 = vec_max(vec_min(data7, max_q), min_q);

        vector bool char zero_01, zero_23, zero_45, zero_67;
        vector signed char scanIndexes_01, scanIndexes_23, scanIndexes_45, scanIndexes_67;
        vector signed char negOne = vec_splat_s8(-1);
        vector signed char* scanPtr =
                (vector signed char*)(s->intra_scantable.inverse);
        signed char lastNonZeroChar;
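
        // Find the index of the last nonzero coefficient in scan order:
        // build byte masks of the zero positions, replace those positions
        // in the inverse-scan index table with -1, then take the maximum
        // of the surviving indexes.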
        zero_01 = vec_pack(vec_cmpeq(data0, (vector signed short)zero),
                           vec_cmpeq(data1, (vector signed short)zero));
        zero_23 = vec_pack(vec_cmpeq(data2, (vector signed short)zero),
                           vec_cmpeq(data3, (vector signed short)zero));
        zero_45 = vec_pack(vec_cmpeq(data4, (vector signed short)zero),
                           vec_cmpeq(data5, (vector signed short)zero));
        zero_67 = vec_pack(vec_cmpeq(data6, (vector signed short)zero),
                           vec_cmpeq(data7, (vector signed short)zero));
        scanIndexes_01 = vec_sel(scanPtr[0], negOne, zero_01);
        scanIndexes_23 = vec_sel(scanPtr[1], negOne, zero_23);
        scanIndexes_45 = vec_sel(scanPtr[2], negOne, zero_45);
        scanIndexes_67 = vec_sel(scanPtr[3], negOne, zero_67);

        scanIndexes_01 = vec_max(scanIndexes_01, scanIndexes_23);
        scanIndexes_45 = vec_max(scanIndexes_45, scanIndexes_67);

        scanIndexes_01 = vec_max(scanIndexes_01, scanIndexes_45);
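
        // Horizontal max: four merge/max rounds fold the 16 bytes of
        // scanIndexes_01 down to a single maximum (log2(16) = 4 rounds).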
        scanIndexes_01 = vec_max(vec_mergeh(scanIndexes_01, negOne),
                                 vec_mergel(scanIndexes_01, negOne));
        scanIndexes_01 = vec_max(vec_mergeh(scanIndexes_01, negOne),
                                 vec_mergel(scanIndexes_01, negOne));
        scanIndexes_01 = vec_max(vec_mergeh(scanIndexes_01, negOne),
                                 vec_mergel(scanIndexes_01, negOne));
        scanIndexes_01 = vec_max(vec_mergeh(scanIndexes_01, negOne),
                                 vec_mergel(scanIndexes_01, negOne));

        scanIndexes_01 = vec_splat(scanIndexes_01, 0);

        vec_ste(scanIndexes_01, 0, &lastNonZeroChar);
        lastNonZero = lastNonZeroChar;
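
        // Undo the initial transpose and store the quantized block.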
        TRANSPOSE8(data0, data1, data2, data3, data4, data5, data6, data7);

        vec_st(data0, 0, data);
        vec_st(data1, 16, data);
        vec_st(data2, 32, data);
        vec_st(data3, 48, data);
        vec_st(data4, 64, data);
        vec_st(data5, 80, data);
        vec_st(data6, 96, data);
        vec_st(data7, 112, data);
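
    // Requantize the cached DC value: (x + 4) >> 3 divides by 8
    // with rounding.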
    data[0] = (oldBaseValue + 4) >> 3;

    if ((lastNonZero > 0) &&

    int i, level, qmul, qadd;
    int nCoeffs;

    qadd = (qscale - 1) | 1;
    qmul = qscale << 1;

    register const vector signed short vczero = (const vector signed short)vec_splat_s16(0);
    register vector signed short blockv, qmulv, qaddv, nqaddv, temp1;
    register vector bool short blockv_null, blockv_neg;
    register short backup_0 = block[0];
    register int j = 0;
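
    // Broadcast qmul/qadd into every lane (qmul8/qadd8 are aligned scalar
    // copies of qmul and qadd); nqaddv is -qadd, applied to negative
    // coefficients.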
    qmulv = vec_splat((vec_s16)vec_lde(0, &qmul8), 0);
    qaddv = vec_splat((vec_s16)vec_lde(0, &qadd8), 0);
    nqaddv = vec_sub(vczero, qaddv);
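
    // Vector loop, eight coefficients at a time:
    // level' = level*qmul + (level < 0 ? -qadd : qadd), leaving zero
    // coefficients untouched via the blockv_null select.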
    for(; (j + 7) <= nCoeffs ; j+=8) {
        blockv = vec_ld(j << 1, block);
        blockv_neg = vec_cmplt(blockv, vczero);
        blockv_null = vec_cmpeq(blockv, vczero);

        temp1 = vec_sel(qaddv, nqaddv, blockv_neg);
        temp1 = vec_mladd(blockv, qmulv, temp1);

        blockv = vec_sel(temp1, blockv, blockv_null);
        vec_st(blockv, j << 1, block);
    }
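
    // Scalar tail: handle any coefficients left over after the
    // vector loop.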
    for(; j <= nCoeffs ; j++) {
        level = block[j];
        if (level) {
            if (level < 0) level = level * qmul - qadd;
            else           level = level * qmul + qadd;
            block[j] = level;
        }
    }

        "to use AltiVec DCT. Reverting to non-AltiVec version.\n");

        "to use AltiVec DCT. Reverting to non-AltiVec version.\n");