dsputil_iwmmxt_rnd_template.c
Go to the documentation of this file.
1 /*
2  * iWMMXt optimized DSP utils
3  * copyright (c) 2004 AGAWA Koji
4  *
5  * This file is part of Libav.
6  *
7  * Libav is free software; you can redistribute it and/or
8  * modify it under the terms of the GNU Lesser General Public
9  * License as published by the Free Software Foundation; either
10  * version 2.1 of the License, or (at your option) any later version.
11  *
12  * Libav is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15  * Lesser General Public License for more details.
16  *
17  * You should have received a copy of the GNU Lesser General Public
18  * License along with Libav; if not, write to the Free Software
19  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20  */
21 
22 void DEF(put, pixels8)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
23 {
24  int stride = line_size;
25  __asm__ volatile (
26  "and r12, %[pixels], #7 \n\t"
27  "bic %[pixels], %[pixels], #7 \n\t"
28  "tmcr wcgr1, r12 \n\t"
29  "add r4, %[pixels], %[line_size] \n\t"
30  "add r5, %[block], %[line_size] \n\t"
31  "mov %[line_size], %[line_size], lsl #1 \n\t"
32  "1: \n\t"
33  "wldrd wr0, [%[pixels]] \n\t"
34  "subs %[h], %[h], #2 \n\t"
35  "wldrd wr1, [%[pixels], #8] \n\t"
36  "add %[pixels], %[pixels], %[line_size] \n\t"
37  "wldrd wr3, [r4] \n\t"
38  "pld [%[pixels]] \n\t"
39  "pld [%[pixels], #32] \n\t"
40  "wldrd wr4, [r4, #8] \n\t"
41  "add r4, r4, %[line_size] \n\t"
42  "walignr1 wr8, wr0, wr1 \n\t"
43  "pld [r4] \n\t"
44  "pld [r4, #32] \n\t"
45  "walignr1 wr10, wr3, wr4 \n\t"
46  "wstrd wr8, [%[block]] \n\t"
47  "add %[block], %[block], %[line_size] \n\t"
48  "wstrd wr10, [r5] \n\t"
49  "add r5, r5, %[line_size] \n\t"
50  "bne 1b \n\t"
51  : [block]"+r"(block), [pixels]"+r"(pixels), [line_size]"+r"(stride), [h]"+r"(h)
52  :
53  : "memory", "r4", "r5", "r12");
54 }
55 
56 void DEF(avg, pixels8)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
57 {
58  int stride = line_size;
59  __asm__ volatile (
60  "and r12, %[pixels], #7 \n\t"
61  "bic %[pixels], %[pixels], #7 \n\t"
62  "tmcr wcgr1, r12 \n\t"
63  "add r4, %[pixels], %[line_size] \n\t"
64  "add r5, %[block], %[line_size] \n\t"
65  "mov %[line_size], %[line_size], lsl #1 \n\t"
66  "1: \n\t"
67  "wldrd wr0, [%[pixels]] \n\t"
68  "subs %[h], %[h], #2 \n\t"
69  "wldrd wr1, [%[pixels], #8] \n\t"
70  "add %[pixels], %[pixels], %[line_size] \n\t"
71  "wldrd wr3, [r4] \n\t"
72  "pld [%[pixels]] \n\t"
73  "pld [%[pixels], #32] \n\t"
74  "wldrd wr4, [r4, #8] \n\t"
75  "add r4, r4, %[line_size] \n\t"
76  "walignr1 wr8, wr0, wr1 \n\t"
77  "wldrd wr0, [%[block]] \n\t"
78  "wldrd wr2, [r5] \n\t"
79  "pld [r4] \n\t"
80  "pld [r4, #32] \n\t"
81  "walignr1 wr10, wr3, wr4 \n\t"
82  WAVG2B" wr8, wr8, wr0 \n\t"
83  WAVG2B" wr10, wr10, wr2 \n\t"
84  "wstrd wr8, [%[block]] \n\t"
85  "add %[block], %[block], %[line_size] \n\t"
86  "wstrd wr10, [r5] \n\t"
87  "pld [%[block]] \n\t"
88  "pld [%[block], #32] \n\t"
89  "add r5, r5, %[line_size] \n\t"
90  "pld [r5] \n\t"
91  "pld [r5, #32] \n\t"
92  "bne 1b \n\t"
93  : [block]"+r"(block), [pixels]"+r"(pixels), [line_size]"+r"(stride), [h]"+r"(h)
94  :
95  : "memory", "r4", "r5", "r12");
96 }
97 
98 void DEF(put, pixels16)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
99 {
100  int stride = line_size;
101  __asm__ volatile (
102  "and r12, %[pixels], #7 \n\t"
103  "bic %[pixels], %[pixels], #7 \n\t"
104  "tmcr wcgr1, r12 \n\t"
105  "add r4, %[pixels], %[line_size] \n\t"
106  "add r5, %[block], %[line_size] \n\t"
107  "mov %[line_size], %[line_size], lsl #1 \n\t"
108  "1: \n\t"
109  "wldrd wr0, [%[pixels]] \n\t"
110  "wldrd wr1, [%[pixels], #8] \n\t"
111  "subs %[h], %[h], #2 \n\t"
112  "wldrd wr2, [%[pixels], #16] \n\t"
113  "add %[pixels], %[pixels], %[line_size] \n\t"
114  "wldrd wr3, [r4] \n\t"
115  "pld [%[pixels]] \n\t"
116  "pld [%[pixels], #32] \n\t"
117  "walignr1 wr8, wr0, wr1 \n\t"
118  "wldrd wr4, [r4, #8] \n\t"
119  "walignr1 wr9, wr1, wr2 \n\t"
120  "wldrd wr5, [r4, #16] \n\t"
121  "add r4, r4, %[line_size] \n\t"
122  "pld [r4] \n\t"
123  "pld [r4, #32] \n\t"
124  "walignr1 wr10, wr3, wr4 \n\t"
125  "wstrd wr8, [%[block]] \n\t"
126  "walignr1 wr11, wr4, wr5 \n\t"
127  "wstrd wr9, [%[block], #8] \n\t"
128  "add %[block], %[block], %[line_size] \n\t"
129  "wstrd wr10, [r5] \n\t"
130  "wstrd wr11, [r5, #8] \n\t"
131  "add r5, r5, %[line_size] \n\t"
132  "bne 1b \n\t"
133  : [block]"+r"(block), [pixels]"+r"(pixels), [line_size]"+r"(stride), [h]"+r"(h)
134  :
135  : "memory", "r4", "r5", "r12");
136 }
137 
138 void DEF(avg, pixels16)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
139 {
140  int stride = line_size;
141  __asm__ volatile (
142  "pld [%[pixels]] \n\t"
143  "pld [%[pixels], #32] \n\t"
144  "pld [%[block]] \n\t"
145  "pld [%[block], #32] \n\t"
146  "and r12, %[pixels], #7 \n\t"
147  "bic %[pixels], %[pixels], #7 \n\t"
148  "tmcr wcgr1, r12 \n\t"
149  "add r4, %[pixels], %[line_size]\n\t"
150  "add r5, %[block], %[line_size] \n\t"
151  "mov %[line_size], %[line_size], lsl #1 \n\t"
152  "1: \n\t"
153  "wldrd wr0, [%[pixels]] \n\t"
154  "wldrd wr1, [%[pixels], #8] \n\t"
155  "subs %[h], %[h], #2 \n\t"
156  "wldrd wr2, [%[pixels], #16] \n\t"
157  "add %[pixels], %[pixels], %[line_size] \n\t"
158  "wldrd wr3, [r4] \n\t"
159  "pld [%[pixels]] \n\t"
160  "pld [%[pixels], #32] \n\t"
161  "walignr1 wr8, wr0, wr1 \n\t"
162  "wldrd wr4, [r4, #8] \n\t"
163  "walignr1 wr9, wr1, wr2 \n\t"
164  "wldrd wr5, [r4, #16] \n\t"
165  "add r4, r4, %[line_size] \n\t"
166  "wldrd wr0, [%[block]] \n\t"
167  "pld [r4] \n\t"
168  "wldrd wr1, [%[block], #8] \n\t"
169  "pld [r4, #32] \n\t"
170  "wldrd wr2, [r5] \n\t"
171  "walignr1 wr10, wr3, wr4 \n\t"
172  "wldrd wr3, [r5, #8] \n\t"
173  WAVG2B" wr8, wr8, wr0 \n\t"
174  WAVG2B" wr9, wr9, wr1 \n\t"
175  WAVG2B" wr10, wr10, wr2 \n\t"
176  "wstrd wr8, [%[block]] \n\t"
177  "walignr1 wr11, wr4, wr5 \n\t"
178  WAVG2B" wr11, wr11, wr3 \n\t"
179  "wstrd wr9, [%[block], #8] \n\t"
180  "add %[block], %[block], %[line_size] \n\t"
181  "wstrd wr10, [r5] \n\t"
182  "pld [%[block]] \n\t"
183  "pld [%[block], #32] \n\t"
184  "wstrd wr11, [r5, #8] \n\t"
185  "add r5, r5, %[line_size] \n\t"
186  "pld [r5] \n\t"
187  "pld [r5, #32] \n\t"
188  "bne 1b \n\t"
189  : [block]"+r"(block), [pixels]"+r"(pixels), [line_size]"+r"(stride), [h]"+r"(h)
190  :
191  : "memory", "r4", "r5", "r12");
192 }
193 
194 void DEF(put, pixels8_x2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
195 {
196  int stride = line_size;
197  // [wr0 wr1 wr2 wr3] for previous line
198  // [wr4 wr5 wr6 wr7] for current line
199  SET_RND(wr15); // =2 for rnd and =1 for no_rnd version
200  __asm__ volatile(
201  "pld [%[pixels]] \n\t"
202  "pld [%[pixels], #32] \n\t"
203  "and r12, %[pixels], #7 \n\t"
204  "bic %[pixels], %[pixels], #7 \n\t"
205  "tmcr wcgr1, r12 \n\t"
206  "add r12, r12, #1 \n\t"
207  "add r4, %[pixels], %[line_size]\n\t"
208  "tmcr wcgr2, r12 \n\t"
209  "add r5, %[block], %[line_size] \n\t"
210  "mov %[line_size], %[line_size], lsl #1 \n\t"
211 
212  "1: \n\t"
213  "wldrd wr10, [%[pixels]] \n\t"
214  "cmp r12, #8 \n\t"
215  "wldrd wr11, [%[pixels], #8] \n\t"
216  "add %[pixels], %[pixels], %[line_size] \n\t"
217  "wldrd wr13, [r4] \n\t"
218  "pld [%[pixels]] \n\t"
219  "wldrd wr14, [r4, #8] \n\t"
220  "pld [%[pixels], #32] \n\t"
221  "add r4, r4, %[line_size] \n\t"
222  "walignr1 wr0, wr10, wr11 \n\t"
223  "pld [r4] \n\t"
224  "pld [r4, #32] \n\t"
225  "walignr1 wr2, wr13, wr14 \n\t"
226  "wmoveq wr4, wr11 \n\t"
227  "wmoveq wr6, wr14 \n\t"
228  "walignr2ne wr4, wr10, wr11 \n\t"
229  "walignr2ne wr6, wr13, wr14 \n\t"
230  WAVG2B" wr0, wr0, wr4 \n\t"
231  WAVG2B" wr2, wr2, wr6 \n\t"
232  "wstrd wr0, [%[block]] \n\t"
233  "subs %[h], %[h], #2 \n\t"
234  "wstrd wr2, [r5] \n\t"
235  "add %[block], %[block], %[line_size] \n\t"
236  "add r5, r5, %[line_size] \n\t"
237  "bne 1b \n\t"
238  : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride)
239  :
240  : "r4", "r5", "r12", "memory");
241 }
242 
243 void DEF(put, pixels16_x2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
244 {
245  int stride = line_size;
246  // [wr0 wr1 wr2 wr3] for previous line
247  // [wr4 wr5 wr6 wr7] for current line
248  SET_RND(wr15); // =2 for rnd and =1 for no_rnd version
249  __asm__ volatile(
250  "pld [%[pixels]] \n\t"
251  "pld [%[pixels], #32] \n\t"
252  "and r12, %[pixels], #7 \n\t"
253  "bic %[pixels], %[pixels], #7 \n\t"
254  "tmcr wcgr1, r12 \n\t"
255  "add r12, r12, #1 \n\t"
256  "add r4, %[pixels], %[line_size]\n\t"
257  "tmcr wcgr2, r12 \n\t"
258  "add r5, %[block], %[line_size] \n\t"
259  "mov %[line_size], %[line_size], lsl #1 \n\t"
260 
261  "1: \n\t"
262  "wldrd wr10, [%[pixels]] \n\t"
263  "cmp r12, #8 \n\t"
264  "wldrd wr11, [%[pixels], #8] \n\t"
265  "wldrd wr12, [%[pixels], #16] \n\t"
266  "add %[pixels], %[pixels], %[line_size] \n\t"
267  "wldrd wr13, [r4] \n\t"
268  "pld [%[pixels]] \n\t"
269  "wldrd wr14, [r4, #8] \n\t"
270  "pld [%[pixels], #32] \n\t"
271  "wldrd wr15, [r4, #16] \n\t"
272  "add r4, r4, %[line_size] \n\t"
273  "walignr1 wr0, wr10, wr11 \n\t"
274  "pld [r4] \n\t"
275  "pld [r4, #32] \n\t"
276  "walignr1 wr1, wr11, wr12 \n\t"
277  "walignr1 wr2, wr13, wr14 \n\t"
278  "walignr1 wr3, wr14, wr15 \n\t"
279  "wmoveq wr4, wr11 \n\t"
280  "wmoveq wr5, wr12 \n\t"
281  "wmoveq wr6, wr14 \n\t"
282  "wmoveq wr7, wr15 \n\t"
283  "walignr2ne wr4, wr10, wr11 \n\t"
284  "walignr2ne wr5, wr11, wr12 \n\t"
285  "walignr2ne wr6, wr13, wr14 \n\t"
286  "walignr2ne wr7, wr14, wr15 \n\t"
287  WAVG2B" wr0, wr0, wr4 \n\t"
288  WAVG2B" wr1, wr1, wr5 \n\t"
289  "wstrd wr0, [%[block]] \n\t"
290  WAVG2B" wr2, wr2, wr6 \n\t"
291  "wstrd wr1, [%[block], #8] \n\t"
292  WAVG2B" wr3, wr3, wr7 \n\t"
293  "add %[block], %[block], %[line_size] \n\t"
294  "wstrd wr2, [r5] \n\t"
295  "subs %[h], %[h], #2 \n\t"
296  "wstrd wr3, [r5, #8] \n\t"
297  "add r5, r5, %[line_size] \n\t"
298  "bne 1b \n\t"
299  : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride)
300  :
301  : "r4", "r5", "r12", "memory");
302 }
303 
304 void DEF(avg, pixels8_x2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
305 {
306  int stride = line_size;
307  // [wr0 wr1 wr2 wr3] for previous line
308  // [wr4 wr5 wr6 wr7] for current line
309  SET_RND(wr15); // =2 for rnd and =1 for no_rnd version
310  __asm__ volatile(
311  "pld [%[pixels]] \n\t"
312  "pld [%[pixels], #32] \n\t"
313  "pld [%[block]] \n\t"
314  "pld [%[block], #32] \n\t"
315  "and r12, %[pixels], #7 \n\t"
316  "bic %[pixels], %[pixels], #7 \n\t"
317  "tmcr wcgr1, r12 \n\t"
318  "add r12, r12, #1 \n\t"
319  "add r4, %[pixels], %[line_size]\n\t"
320  "tmcr wcgr2, r12 \n\t"
321  "add r5, %[block], %[line_size] \n\t"
322  "mov %[line_size], %[line_size], lsl #1 \n\t"
323  "pld [r5] \n\t"
324  "pld [r5, #32] \n\t"
325 
326  "1: \n\t"
327  "wldrd wr10, [%[pixels]] \n\t"
328  "cmp r12, #8 \n\t"
329  "wldrd wr11, [%[pixels], #8] \n\t"
330  "add %[pixels], %[pixels], %[line_size] \n\t"
331  "wldrd wr13, [r4] \n\t"
332  "pld [%[pixels]] \n\t"
333  "wldrd wr14, [r4, #8] \n\t"
334  "pld [%[pixels], #32] \n\t"
335  "add r4, r4, %[line_size] \n\t"
336  "walignr1 wr0, wr10, wr11 \n\t"
337  "pld [r4] \n\t"
338  "pld [r4, #32] \n\t"
339  "walignr1 wr2, wr13, wr14 \n\t"
340  "wmoveq wr4, wr11 \n\t"
341  "wmoveq wr6, wr14 \n\t"
342  "walignr2ne wr4, wr10, wr11 \n\t"
343  "wldrd wr10, [%[block]] \n\t"
344  "walignr2ne wr6, wr13, wr14 \n\t"
345  "wldrd wr12, [r5] \n\t"
346  WAVG2B" wr0, wr0, wr4 \n\t"
347  WAVG2B" wr2, wr2, wr6 \n\t"
348  WAVG2B" wr0, wr0, wr10 \n\t"
349  WAVG2B" wr2, wr2, wr12 \n\t"
350  "wstrd wr0, [%[block]] \n\t"
351  "subs %[h], %[h], #2 \n\t"
352  "wstrd wr2, [r5] \n\t"
353  "add %[block], %[block], %[line_size] \n\t"
354  "add r5, r5, %[line_size] \n\t"
355  "pld [%[block]] \n\t"
356  "pld [%[block], #32] \n\t"
357  "pld [r5] \n\t"
358  "pld [r5, #32] \n\t"
359  "bne 1b \n\t"
360  : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride)
361  :
362  : "r4", "r5", "r12", "memory");
363 }
364 
365 void DEF(avg, pixels16_x2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
366 {
367  int stride = line_size;
368  // [wr0 wr1 wr2 wr3] for previous line
369  // [wr4 wr5 wr6 wr7] for current line
370  SET_RND(wr15); // =2 for rnd and =1 for no_rnd version
371  __asm__ volatile(
372  "pld [%[pixels]] \n\t"
373  "pld [%[pixels], #32] \n\t"
374  "pld [%[block]] \n\t"
375  "pld [%[block], #32] \n\t"
376  "and r12, %[pixels], #7 \n\t"
377  "bic %[pixels], %[pixels], #7 \n\t"
378  "tmcr wcgr1, r12 \n\t"
379  "add r12, r12, #1 \n\t"
380  "add r4, %[pixels], %[line_size]\n\t"
381  "tmcr wcgr2, r12 \n\t"
382  "add r5, %[block], %[line_size] \n\t"
383  "mov %[line_size], %[line_size], lsl #1 \n\t"
384  "pld [r5] \n\t"
385  "pld [r5, #32] \n\t"
386 
387  "1: \n\t"
388  "wldrd wr10, [%[pixels]] \n\t"
389  "cmp r12, #8 \n\t"
390  "wldrd wr11, [%[pixels], #8] \n\t"
391  "wldrd wr12, [%[pixels], #16] \n\t"
392  "add %[pixels], %[pixels], %[line_size] \n\t"
393  "wldrd wr13, [r4] \n\t"
394  "pld [%[pixels]] \n\t"
395  "wldrd wr14, [r4, #8] \n\t"
396  "pld [%[pixels], #32] \n\t"
397  "wldrd wr15, [r4, #16] \n\t"
398  "add r4, r4, %[line_size] \n\t"
399  "walignr1 wr0, wr10, wr11 \n\t"
400  "pld [r4] \n\t"
401  "pld [r4, #32] \n\t"
402  "walignr1 wr1, wr11, wr12 \n\t"
403  "walignr1 wr2, wr13, wr14 \n\t"
404  "walignr1 wr3, wr14, wr15 \n\t"
405  "wmoveq wr4, wr11 \n\t"
406  "wmoveq wr5, wr12 \n\t"
407  "wmoveq wr6, wr14 \n\t"
408  "wmoveq wr7, wr15 \n\t"
409  "walignr2ne wr4, wr10, wr11 \n\t"
410  "walignr2ne wr5, wr11, wr12 \n\t"
411  "walignr2ne wr6, wr13, wr14 \n\t"
412  "walignr2ne wr7, wr14, wr15 \n\t"
413  "wldrd wr10, [%[block]] \n\t"
414  WAVG2B" wr0, wr0, wr4 \n\t"
415  "wldrd wr11, [%[block], #8] \n\t"
416  WAVG2B" wr1, wr1, wr5 \n\t"
417  "wldrd wr12, [r5] \n\t"
418  WAVG2B" wr2, wr2, wr6 \n\t"
419  "wldrd wr13, [r5, #8] \n\t"
420  WAVG2B" wr3, wr3, wr7 \n\t"
421  WAVG2B" wr0, wr0, wr10 \n\t"
422  WAVG2B" wr1, wr1, wr11 \n\t"
423  WAVG2B" wr2, wr2, wr12 \n\t"
424  WAVG2B" wr3, wr3, wr13 \n\t"
425  "wstrd wr0, [%[block]] \n\t"
426  "subs %[h], %[h], #2 \n\t"
427  "wstrd wr1, [%[block], #8] \n\t"
428  "add %[block], %[block], %[line_size] \n\t"
429  "wstrd wr2, [r5] \n\t"
430  "pld [%[block]] \n\t"
431  "wstrd wr3, [r5, #8] \n\t"
432  "add r5, r5, %[line_size] \n\t"
433  "pld [%[block], #32] \n\t"
434  "pld [r5] \n\t"
435  "pld [r5, #32] \n\t"
436  "bne 1b \n\t"
437  : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride)
438  :
439  :"r4", "r5", "r12", "memory");
440 }
441 
/*
 * Vertical half-pel interpolation of an 8-byte-wide block, averaged into the
 * destination: each output row is WAVG2B(WAVG2B(row[y], row[y + 1]), block).
 * The first source row is loaded before the loop; the loop body is unrolled
 * twice and alternates which of wr0 / wr4 holds the newest row, so every
 * source row is loaded only once. The loop exits only when h reaches exactly
 * zero after subtracting 2, so h must be a positive even number.
 * pixels may have any byte alignment (realigned with walignr1 / wcgr1).
 * NOTE(review): WAVG2B is a mnemonic macro defined outside this block;
 * presumably the rounding / non-rounding byte average -- confirm.
 */
442 void DEF(avg, pixels8_y2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
443 {
444  int stride = line_size; /* copy: line_size is const, the operand is declared read-write */
445  // wr0 and wr4 alternately hold the previous and the current source row
446  // wr8 is the result row, wr10/wr11 are scratch (raw source words, then destination)
447  __asm__ volatile(
448  "pld [%[pixels]] \n\t"
449  "pld [%[pixels], #32] \n\t"
450  "and r12, %[pixels], #7 \n\t" /* r12 = misalignment of pixels */
451  "tmcr wcgr1, r12 \n\t" /* alignment amount for walignr1 */
452  "bic %[pixels], %[pixels], #7 \n\t" /* round pixels down to 8 bytes */
453 
454  "wldrd wr10, [%[pixels]] \n\t" /* prologue: load the first source row into wr0 */
455  "wldrd wr11, [%[pixels], #8] \n\t"
456  "pld [%[block]] \n\t"
457  "add %[pixels], %[pixels], %[line_size] \n\t"
458  "walignr1 wr0, wr10, wr11 \n\t"
459  "pld [%[pixels]] \n\t"
460  "pld [%[pixels], #32] \n\t"
461 
462  "1: \n\t"
463  "wldrd wr10, [%[pixels]] \n\t" /* first half: new row goes to wr4, wr0 is the previous one */
464  "wldrd wr11, [%[pixels], #8] \n\t"
465  "add %[pixels], %[pixels], %[line_size] \n\t"
466  "pld [%[pixels]] \n\t"
467  "pld [%[pixels], #32] \n\t"
468  "walignr1 wr4, wr10, wr11 \n\t"
469  "wldrd wr10, [%[block]] \n\t" /* current destination row */
470  WAVG2B" wr8, wr0, wr4 \n\t"
471  WAVG2B" wr8, wr8, wr10 \n\t"
472  "wstrd wr8, [%[block]] \n\t"
473  "add %[block], %[block], %[line_size] \n\t"
474 
475  "wldrd wr10, [%[pixels]] \n\t" /* second half: new row goes to wr0, wr4 is the previous one */
476  "wldrd wr11, [%[pixels], #8] \n\t"
477  "pld [%[block]] \n\t"
478  "add %[pixels], %[pixels], %[line_size] \n\t"
479  "pld [%[pixels]] \n\t"
480  "pld [%[pixels], #32] \n\t"
481  "walignr1 wr0, wr10, wr11 \n\t"
482  "wldrd wr10, [%[block]] \n\t" /* current destination row */
483  WAVG2B" wr8, wr0, wr4 \n\t"
484  WAVG2B" wr8, wr8, wr10 \n\t"
485  "wstrd wr8, [%[block]] \n\t"
486  "add %[block], %[block], %[line_size] \n\t"
487 
488  "subs %[h], %[h], #2 \n\t" /* two rows done; sets the flags tested by bne */
489  "pld [%[block]] \n\t"
490  "bne 1b \n\t"
491  : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride)
492  :
493  : "cc", "memory", "r12");
494 }
495 
496 void DEF(put, pixels16_y2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
497 {
498  int stride = line_size;
499  // [wr0 wr1 wr2 wr3] for previous line
500  // [wr4 wr5 wr6 wr7] for current line
501  __asm__ volatile(
502  "pld [%[pixels]] \n\t"
503  "pld [%[pixels], #32] \n\t"
504  "and r12, %[pixels], #7 \n\t"
505  "tmcr wcgr1, r12 \n\t"
506  "bic %[pixels], %[pixels], #7 \n\t"
507 
508  "wldrd wr10, [%[pixels]] \n\t"
509  "wldrd wr11, [%[pixels], #8] \n\t"
510  "wldrd wr12, [%[pixels], #16] \n\t"
511  "add %[pixels], %[pixels], %[line_size] \n\t"
512  "pld [%[pixels]] \n\t"
513  "pld [%[pixels], #32] \n\t"
514  "walignr1 wr0, wr10, wr11 \n\t"
515  "walignr1 wr1, wr11, wr12 \n\t"
516 
517  "1: \n\t"
518  "wldrd wr10, [%[pixels]] \n\t"
519  "wldrd wr11, [%[pixels], #8] \n\t"
520  "wldrd wr12, [%[pixels], #16] \n\t"
521  "add %[pixels], %[pixels], %[line_size] \n\t"
522  "pld [%[pixels]] \n\t"
523  "pld [%[pixels], #32] \n\t"
524  "walignr1 wr4, wr10, wr11 \n\t"
525  "walignr1 wr5, wr11, wr12 \n\t"
526  WAVG2B" wr8, wr0, wr4 \n\t"
527  WAVG2B" wr9, wr1, wr5 \n\t"
528  "wstrd wr8, [%[block]] \n\t"
529  "wstrd wr9, [%[block], #8] \n\t"
530  "add %[block], %[block], %[line_size] \n\t"
531 
532  "wldrd wr10, [%[pixels]] \n\t"
533  "wldrd wr11, [%[pixels], #8] \n\t"
534  "wldrd wr12, [%[pixels], #16] \n\t"
535  "add %[pixels], %[pixels], %[line_size] \n\t"
536  "pld [%[pixels]] \n\t"
537  "pld [%[pixels], #32] \n\t"
538  "walignr1 wr0, wr10, wr11 \n\t"
539  "walignr1 wr1, wr11, wr12 \n\t"
540  WAVG2B" wr8, wr0, wr4 \n\t"
541  WAVG2B" wr9, wr1, wr5 \n\t"
542  "wstrd wr8, [%[block]] \n\t"
543  "wstrd wr9, [%[block], #8] \n\t"
544  "add %[block], %[block], %[line_size] \n\t"
545 
546  "subs %[h], %[h], #2 \n\t"
547  "bne 1b \n\t"
548  : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride)
549  :
550  : "r4", "r5", "r12", "memory");
551 }
552 
553 void DEF(avg, pixels16_y2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
554 {
555  int stride = line_size;
556  // [wr0 wr1 wr2 wr3] for previous line
557  // [wr4 wr5 wr6 wr7] for current line
558  __asm__ volatile(
559  "pld [%[pixels]] \n\t"
560  "pld [%[pixels], #32] \n\t"
561  "and r12, %[pixels], #7 \n\t"
562  "tmcr wcgr1, r12 \n\t"
563  "bic %[pixels], %[pixels], #7 \n\t"
564 
565  "wldrd wr10, [%[pixels]] \n\t"
566  "wldrd wr11, [%[pixels], #8] \n\t"
567  "pld [%[block]] \n\t"
568  "wldrd wr12, [%[pixels], #16] \n\t"
569  "add %[pixels], %[pixels], %[line_size] \n\t"
570  "pld [%[pixels]] \n\t"
571  "pld [%[pixels], #32] \n\t"
572  "walignr1 wr0, wr10, wr11 \n\t"
573  "walignr1 wr1, wr11, wr12 \n\t"
574 
575  "1: \n\t"
576  "wldrd wr10, [%[pixels]] \n\t"
577  "wldrd wr11, [%[pixels], #8] \n\t"
578  "wldrd wr12, [%[pixels], #16] \n\t"
579  "add %[pixels], %[pixels], %[line_size] \n\t"
580  "pld [%[pixels]] \n\t"
581  "pld [%[pixels], #32] \n\t"
582  "walignr1 wr4, wr10, wr11 \n\t"
583  "walignr1 wr5, wr11, wr12 \n\t"
584  "wldrd wr10, [%[block]] \n\t"
585  "wldrd wr11, [%[block], #8] \n\t"
586  WAVG2B" wr8, wr0, wr4 \n\t"
587  WAVG2B" wr9, wr1, wr5 \n\t"
588  WAVG2B" wr8, wr8, wr10 \n\t"
589  WAVG2B" wr9, wr9, wr11 \n\t"
590  "wstrd wr8, [%[block]] \n\t"
591  "wstrd wr9, [%[block], #8] \n\t"
592  "add %[block], %[block], %[line_size] \n\t"
593 
594  "wldrd wr10, [%[pixels]] \n\t"
595  "wldrd wr11, [%[pixels], #8] \n\t"
596  "pld [%[block]] \n\t"
597  "wldrd wr12, [%[pixels], #16] \n\t"
598  "add %[pixels], %[pixels], %[line_size] \n\t"
599  "pld [%[pixels]] \n\t"
600  "pld [%[pixels], #32] \n\t"
601  "walignr1 wr0, wr10, wr11 \n\t"
602  "walignr1 wr1, wr11, wr12 \n\t"
603  "wldrd wr10, [%[block]] \n\t"
604  "wldrd wr11, [%[block], #8] \n\t"
605  WAVG2B" wr8, wr0, wr4 \n\t"
606  WAVG2B" wr9, wr1, wr5 \n\t"
607  WAVG2B" wr8, wr8, wr10 \n\t"
608  WAVG2B" wr9, wr9, wr11 \n\t"
609  "wstrd wr8, [%[block]] \n\t"
610  "wstrd wr9, [%[block], #8] \n\t"
611  "add %[block], %[block], %[line_size] \n\t"
612 
613  "subs %[h], %[h], #2 \n\t"
614  "pld [%[block]] \n\t"
615  "bne 1b \n\t"
616  : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride)
617  :
618  : "r4", "r5", "r12", "memory");
619 }
620 
621 void DEF(put, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
622 {
623  // [wr0 wr1 wr2 wr3] for previous line
624  // [wr4 wr5 wr6 wr7] for current line
625  SET_RND(wr15); // =2 for rnd and =1 for no_rnd version
626  __asm__ volatile(
627  "pld [%[pixels]] \n\t"
628  "mov r12, #2 \n\t"
629  "pld [%[pixels], #32] \n\t"
630  "tmcr wcgr0, r12 \n\t" /* for shift value */
631  "and r12, %[pixels], #7 \n\t"
632  "bic %[pixels], %[pixels], #7 \n\t"
633  "tmcr wcgr1, r12 \n\t"
634 
635  // [wr0 wr1 wr2 wr3] <= *
636  // [wr4 wr5 wr6 wr7]
637  "wldrd wr12, [%[pixels]] \n\t"
638  "add r12, r12, #1 \n\t"
639  "wldrd wr13, [%[pixels], #8] \n\t"
640  "tmcr wcgr2, r12 \n\t"
641  "add %[pixels], %[pixels], %[line_size] \n\t"
642  "cmp r12, #8 \n\t"
643  "pld [%[pixels]] \n\t"
644  "pld [%[pixels], #32] \n\t"
645  "walignr1 wr2, wr12, wr13 \n\t"
646  "wmoveq wr10, wr13 \n\t"
647  "walignr2ne wr10, wr12, wr13 \n\t"
648  "wunpckelub wr0, wr2 \n\t"
649  "wunpckehub wr1, wr2 \n\t"
650  "wunpckelub wr8, wr10 \n\t"
651  "wunpckehub wr9, wr10 \n\t"
652  "waddhus wr0, wr0, wr8 \n\t"
653  "waddhus wr1, wr1, wr9 \n\t"
654 
655  "1: \n\t"
656  // [wr0 wr1 wr2 wr3]
657  // [wr4 wr5 wr6 wr7] <= *
658  "wldrd wr12, [%[pixels]] \n\t"
659  "cmp r12, #8 \n\t"
660  "wldrd wr13, [%[pixels], #8] \n\t"
661  "add %[pixels], %[pixels], %[line_size] \n\t"
662  "walignr1 wr6, wr12, wr13 \n\t"
663  "pld [%[pixels]] \n\t"
664  "pld [%[pixels], #32] \n\t"
665  "wmoveq wr10, wr13 \n\t"
666  "walignr2ne wr10, wr12, wr13 \n\t"
667  "wunpckelub wr4, wr6 \n\t"
668  "wunpckehub wr5, wr6 \n\t"
669  "wunpckelub wr8, wr10 \n\t"
670  "wunpckehub wr9, wr10 \n\t"
671  "waddhus wr4, wr4, wr8 \n\t"
672  "waddhus wr5, wr5, wr9 \n\t"
673  "waddhus wr8, wr0, wr4 \n\t"
674  "waddhus wr9, wr1, wr5 \n\t"
675  "waddhus wr8, wr8, wr15 \n\t"
676  "waddhus wr9, wr9, wr15 \n\t"
677  "wsrlhg wr8, wr8, wcgr0 \n\t"
678  "wsrlhg wr9, wr9, wcgr0 \n\t"
679  "wpackhus wr8, wr8, wr9 \n\t"
680  "wstrd wr8, [%[block]] \n\t"
681  "add %[block], %[block], %[line_size] \n\t"
682 
683  // [wr0 wr1 wr2 wr3] <= *
684  // [wr4 wr5 wr6 wr7]
685  "wldrd wr12, [%[pixels]] \n\t"
686  "wldrd wr13, [%[pixels], #8] \n\t"
687  "add %[pixels], %[pixels], %[line_size] \n\t"
688  "walignr1 wr2, wr12, wr13 \n\t"
689  "pld [%[pixels]] \n\t"
690  "pld [%[pixels], #32] \n\t"
691  "wmoveq wr10, wr13 \n\t"
692  "walignr2ne wr10, wr12, wr13 \n\t"
693  "wunpckelub wr0, wr2 \n\t"
694  "wunpckehub wr1, wr2 \n\t"
695  "wunpckelub wr8, wr10 \n\t"
696  "wunpckehub wr9, wr10 \n\t"
697  "waddhus wr0, wr0, wr8 \n\t"
698  "waddhus wr1, wr1, wr9 \n\t"
699  "waddhus wr8, wr0, wr4 \n\t"
700  "waddhus wr9, wr1, wr5 \n\t"
701  "waddhus wr8, wr8, wr15 \n\t"
702  "waddhus wr9, wr9, wr15 \n\t"
703  "wsrlhg wr8, wr8, wcgr0 \n\t"
704  "wsrlhg wr9, wr9, wcgr0 \n\t"
705  "wpackhus wr8, wr8, wr9 \n\t"
706  "subs %[h], %[h], #2 \n\t"
707  "wstrd wr8, [%[block]] \n\t"
708  "add %[block], %[block], %[line_size] \n\t"
709  "bne 1b \n\t"
710  : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block)
711  : [line_size]"r"(line_size)
712  : "r12", "memory");
713 }
714 
715 void DEF(put, pixels16_xy2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
716 {
717  // [wr0 wr1 wr2 wr3] for previous line
718  // [wr4 wr5 wr6 wr7] for current line
719  SET_RND(wr15); // =2 for rnd and =1 for no_rnd version
720  __asm__ volatile(
721  "pld [%[pixels]] \n\t"
722  "mov r12, #2 \n\t"
723  "pld [%[pixels], #32] \n\t"
724  "tmcr wcgr0, r12 \n\t" /* for shift value */
725  /* alignment */
726  "and r12, %[pixels], #7 \n\t"
727  "bic %[pixels], %[pixels], #7 \n\t"
728  "tmcr wcgr1, r12 \n\t"
729  "add r12, r12, #1 \n\t"
730  "tmcr wcgr2, r12 \n\t"
731 
732  // [wr0 wr1 wr2 wr3] <= *
733  // [wr4 wr5 wr6 wr7]
734  "wldrd wr12, [%[pixels]] \n\t"
735  "cmp r12, #8 \n\t"
736  "wldrd wr13, [%[pixels], #8] \n\t"
737  "wldrd wr14, [%[pixels], #16] \n\t"
738  "add %[pixels], %[pixels], %[line_size] \n\t"
739  "pld [%[pixels]] \n\t"
740  "walignr1 wr2, wr12, wr13 \n\t"
741  "pld [%[pixels], #32] \n\t"
742  "walignr1 wr3, wr13, wr14 \n\t"
743  "wmoveq wr10, wr13 \n\t"
744  "wmoveq wr11, wr14 \n\t"
745  "walignr2ne wr10, wr12, wr13 \n\t"
746  "walignr2ne wr11, wr13, wr14 \n\t"
747  "wunpckelub wr0, wr2 \n\t"
748  "wunpckehub wr1, wr2 \n\t"
749  "wunpckelub wr2, wr3 \n\t"
750  "wunpckehub wr3, wr3 \n\t"
751  "wunpckelub wr8, wr10 \n\t"
752  "wunpckehub wr9, wr10 \n\t"
753  "wunpckelub wr10, wr11 \n\t"
754  "wunpckehub wr11, wr11 \n\t"
755  "waddhus wr0, wr0, wr8 \n\t"
756  "waddhus wr1, wr1, wr9 \n\t"
757  "waddhus wr2, wr2, wr10 \n\t"
758  "waddhus wr3, wr3, wr11 \n\t"
759 
760  "1: \n\t"
761  // [wr0 wr1 wr2 wr3]
762  // [wr4 wr5 wr6 wr7] <= *
763  "wldrd wr12, [%[pixels]] \n\t"
764  "cmp r12, #8 \n\t"
765  "wldrd wr13, [%[pixels], #8] \n\t"
766  "wldrd wr14, [%[pixels], #16] \n\t"
767  "add %[pixels], %[pixels], %[line_size] \n\t"
768  "walignr1 wr6, wr12, wr13 \n\t"
769  "pld [%[pixels]] \n\t"
770  "pld [%[pixels], #32] \n\t"
771  "walignr1 wr7, wr13, wr14 \n\t"
772  "wmoveq wr10, wr13 \n\t"
773  "wmoveq wr11, wr14 \n\t"
774  "walignr2ne wr10, wr12, wr13 \n\t"
775  "walignr2ne wr11, wr13, wr14 \n\t"
776  "wunpckelub wr4, wr6 \n\t"
777  "wunpckehub wr5, wr6 \n\t"
778  "wunpckelub wr6, wr7 \n\t"
779  "wunpckehub wr7, wr7 \n\t"
780  "wunpckelub wr8, wr10 \n\t"
781  "wunpckehub wr9, wr10 \n\t"
782  "wunpckelub wr10, wr11 \n\t"
783  "wunpckehub wr11, wr11 \n\t"
784  "waddhus wr4, wr4, wr8 \n\t"
785  "waddhus wr5, wr5, wr9 \n\t"
786  "waddhus wr6, wr6, wr10 \n\t"
787  "waddhus wr7, wr7, wr11 \n\t"
788  "waddhus wr8, wr0, wr4 \n\t"
789  "waddhus wr9, wr1, wr5 \n\t"
790  "waddhus wr10, wr2, wr6 \n\t"
791  "waddhus wr11, wr3, wr7 \n\t"
792  "waddhus wr8, wr8, wr15 \n\t"
793  "waddhus wr9, wr9, wr15 \n\t"
794  "waddhus wr10, wr10, wr15 \n\t"
795  "waddhus wr11, wr11, wr15 \n\t"
796  "wsrlhg wr8, wr8, wcgr0 \n\t"
797  "wsrlhg wr9, wr9, wcgr0 \n\t"
798  "wsrlhg wr10, wr10, wcgr0 \n\t"
799  "wsrlhg wr11, wr11, wcgr0 \n\t"
800  "wpackhus wr8, wr8, wr9 \n\t"
801  "wpackhus wr9, wr10, wr11 \n\t"
802  "wstrd wr8, [%[block]] \n\t"
803  "wstrd wr9, [%[block], #8] \n\t"
804  "add %[block], %[block], %[line_size] \n\t"
805 
806  // [wr0 wr1 wr2 wr3] <= *
807  // [wr4 wr5 wr6 wr7]
808  "wldrd wr12, [%[pixels]] \n\t"
809  "wldrd wr13, [%[pixels], #8] \n\t"
810  "wldrd wr14, [%[pixels], #16] \n\t"
811  "add %[pixels], %[pixels], %[line_size] \n\t"
812  "walignr1 wr2, wr12, wr13 \n\t"
813  "pld [%[pixels]] \n\t"
814  "pld [%[pixels], #32] \n\t"
815  "walignr1 wr3, wr13, wr14 \n\t"
816  "wmoveq wr10, wr13 \n\t"
817  "wmoveq wr11, wr14 \n\t"
818  "walignr2ne wr10, wr12, wr13 \n\t"
819  "walignr2ne wr11, wr13, wr14 \n\t"
820  "wunpckelub wr0, wr2 \n\t"
821  "wunpckehub wr1, wr2 \n\t"
822  "wunpckelub wr2, wr3 \n\t"
823  "wunpckehub wr3, wr3 \n\t"
824  "wunpckelub wr8, wr10 \n\t"
825  "wunpckehub wr9, wr10 \n\t"
826  "wunpckelub wr10, wr11 \n\t"
827  "wunpckehub wr11, wr11 \n\t"
828  "waddhus wr0, wr0, wr8 \n\t"
829  "waddhus wr1, wr1, wr9 \n\t"
830  "waddhus wr2, wr2, wr10 \n\t"
831  "waddhus wr3, wr3, wr11 \n\t"
832  "waddhus wr8, wr0, wr4 \n\t"
833  "waddhus wr9, wr1, wr5 \n\t"
834  "waddhus wr10, wr2, wr6 \n\t"
835  "waddhus wr11, wr3, wr7 \n\t"
836  "waddhus wr8, wr8, wr15 \n\t"
837  "waddhus wr9, wr9, wr15 \n\t"
838  "waddhus wr10, wr10, wr15 \n\t"
839  "waddhus wr11, wr11, wr15 \n\t"
840  "wsrlhg wr8, wr8, wcgr0 \n\t"
841  "wsrlhg wr9, wr9, wcgr0 \n\t"
842  "wsrlhg wr10, wr10, wcgr0 \n\t"
843  "wsrlhg wr11, wr11, wcgr0 \n\t"
844  "wpackhus wr8, wr8, wr9 \n\t"
845  "wpackhus wr9, wr10, wr11 \n\t"
846  "wstrd wr8, [%[block]] \n\t"
847  "wstrd wr9, [%[block], #8] \n\t"
848  "add %[block], %[block], %[line_size] \n\t"
849 
850  "subs %[h], %[h], #2 \n\t"
851  "bne 1b \n\t"
852  : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block)
853  : [line_size]"r"(line_size)
854  : "r12", "memory");
855 }
856 
857 void DEF(avg, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
858 {
859  // [wr0 wr1 wr2 wr3] for previous line
860  // [wr4 wr5 wr6 wr7] for current line
861  SET_RND(wr15); // =2 for rnd and =1 for no_rnd version
862  __asm__ volatile(
863  "pld [%[block]] \n\t"
864  "pld [%[block], #32] \n\t"
865  "pld [%[pixels]] \n\t"
866  "mov r12, #2 \n\t"
867  "pld [%[pixels], #32] \n\t"
868  "tmcr wcgr0, r12 \n\t" /* for shift value */
869  "and r12, %[pixels], #7 \n\t"
870  "bic %[pixels], %[pixels], #7 \n\t"
871  "tmcr wcgr1, r12 \n\t"
872 
873  // [wr0 wr1 wr2 wr3] <= *
874  // [wr4 wr5 wr6 wr7]
875  "wldrd wr12, [%[pixels]] \n\t"
876  "add r12, r12, #1 \n\t"
877  "wldrd wr13, [%[pixels], #8] \n\t"
878  "tmcr wcgr2, r12 \n\t"
879  "add %[pixels], %[pixels], %[line_size] \n\t"
880  "cmp r12, #8 \n\t"
881  "pld [%[pixels]] \n\t"
882  "pld [%[pixels], #32] \n\t"
883  "walignr1 wr2, wr12, wr13 \n\t"
884  "wmoveq wr10, wr13 \n\t"
885  "walignr2ne wr10, wr12, wr13 \n\t"
886  "wunpckelub wr0, wr2 \n\t"
887  "wunpckehub wr1, wr2 \n\t"
888  "wunpckelub wr8, wr10 \n\t"
889  "wunpckehub wr9, wr10 \n\t"
890  "waddhus wr0, wr0, wr8 \n\t"
891  "waddhus wr1, wr1, wr9 \n\t"
892 
893  "1: \n\t"
894  // [wr0 wr1 wr2 wr3]
895  // [wr4 wr5 wr6 wr7] <= *
896  "wldrd wr12, [%[pixels]] \n\t"
897  "cmp r12, #8 \n\t"
898  "wldrd wr13, [%[pixels], #8] \n\t"
899  "add %[pixels], %[pixels], %[line_size] \n\t"
900  "walignr1 wr6, wr12, wr13 \n\t"
901  "pld [%[pixels]] \n\t"
902  "pld [%[pixels], #32] \n\t"
903  "wmoveq wr10, wr13 \n\t"
904  "walignr2ne wr10, wr12, wr13 \n\t"
905  "wunpckelub wr4, wr6 \n\t"
906  "wunpckehub wr5, wr6 \n\t"
907  "wunpckelub wr8, wr10 \n\t"
908  "wunpckehub wr9, wr10 \n\t"
909  "waddhus wr4, wr4, wr8 \n\t"
910  "waddhus wr5, wr5, wr9 \n\t"
911  "waddhus wr8, wr0, wr4 \n\t"
912  "waddhus wr9, wr1, wr5 \n\t"
913  "waddhus wr8, wr8, wr15 \n\t"
914  "waddhus wr9, wr9, wr15 \n\t"
915  "wldrd wr12, [%[block]] \n\t"
916  "wsrlhg wr8, wr8, wcgr0 \n\t"
917  "wsrlhg wr9, wr9, wcgr0 \n\t"
918  "wpackhus wr8, wr8, wr9 \n\t"
919  WAVG2B" wr8, wr8, wr12 \n\t"
920  "wstrd wr8, [%[block]] \n\t"
921  "add %[block], %[block], %[line_size] \n\t"
922  "wldrd wr12, [%[pixels]] \n\t"
923  "pld [%[block]] \n\t"
924  "pld [%[block], #32] \n\t"
925 
926  // [wr0 wr1 wr2 wr3] <= *
927  // [wr4 wr5 wr6 wr7]
928  "wldrd wr13, [%[pixels], #8] \n\t"
929  "add %[pixels], %[pixels], %[line_size] \n\t"
930  "walignr1 wr2, wr12, wr13 \n\t"
931  "pld [%[pixels]] \n\t"
932  "pld [%[pixels], #32] \n\t"
933  "wmoveq wr10, wr13 \n\t"
934  "walignr2ne wr10, wr12, wr13 \n\t"
935  "wunpckelub wr0, wr2 \n\t"
936  "wunpckehub wr1, wr2 \n\t"
937  "wunpckelub wr8, wr10 \n\t"
938  "wunpckehub wr9, wr10 \n\t"
939  "waddhus wr0, wr0, wr8 \n\t"
940  "waddhus wr1, wr1, wr9 \n\t"
941  "waddhus wr8, wr0, wr4 \n\t"
942  "waddhus wr9, wr1, wr5 \n\t"
943  "waddhus wr8, wr8, wr15 \n\t"
944  "waddhus wr9, wr9, wr15 \n\t"
945  "wldrd wr12, [%[block]] \n\t"
946  "wsrlhg wr8, wr8, wcgr0 \n\t"
947  "wsrlhg wr9, wr9, wcgr0 \n\t"
948  "wpackhus wr8, wr8, wr9 \n\t"
949  "subs %[h], %[h], #2 \n\t"
950  WAVG2B" wr8, wr8, wr12 \n\t"
951  "wstrd wr8, [%[block]] \n\t"
952  "add %[block], %[block], %[line_size] \n\t"
953  "pld [%[block]] \n\t"
954  "pld [%[block], #32] \n\t"
955  "bne 1b \n\t"
956  : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block)
957  : [line_size]"r"(line_size)
958  : "r12", "memory");
959 }
960 
961 void DEF(avg, pixels16_xy2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
962 {
963  // [wr0 wr1 wr2 wr3] for previous line
964  // [wr4 wr5 wr6 wr7] for current line
965  SET_RND(wr15); // =2 for rnd and =1 for no_rnd version
966  __asm__ volatile(
967  "pld [%[block]] \n\t"
968  "pld [%[block], #32] \n\t"
969  "pld [%[pixels]] \n\t"
970  "mov r12, #2 \n\t"
971  "pld [%[pixels], #32] \n\t"
972  "tmcr wcgr0, r12 \n\t" /* for shift value */
973  /* alignment */
974  "and r12, %[pixels], #7 \n\t"
975  "bic %[pixels], %[pixels], #7 \n\t"
976  "tmcr wcgr1, r12 \n\t"
977  "add r12, r12, #1 \n\t"
978  "tmcr wcgr2, r12 \n\t"
979 
980  // [wr0 wr1 wr2 wr3] <= *
981  // [wr4 wr5 wr6 wr7]
982  "wldrd wr12, [%[pixels]] \n\t"
983  "cmp r12, #8 \n\t"
984  "wldrd wr13, [%[pixels], #8] \n\t"
985  "wldrd wr14, [%[pixels], #16] \n\t"
986  "add %[pixels], %[pixels], %[line_size] \n\t"
987  "pld [%[pixels]] \n\t"
988  "walignr1 wr2, wr12, wr13 \n\t"
989  "pld [%[pixels], #32] \n\t"
990  "walignr1 wr3, wr13, wr14 \n\t"
991  "wmoveq wr10, wr13 \n\t"
992  "wmoveq wr11, wr14 \n\t"
993  "walignr2ne wr10, wr12, wr13 \n\t"
994  "walignr2ne wr11, wr13, wr14 \n\t"
995  "wunpckelub wr0, wr2 \n\t"
996  "wunpckehub wr1, wr2 \n\t"
997  "wunpckelub wr2, wr3 \n\t"
998  "wunpckehub wr3, wr3 \n\t"
999  "wunpckelub wr8, wr10 \n\t"
1000  "wunpckehub wr9, wr10 \n\t"
1001  "wunpckelub wr10, wr11 \n\t"
1002  "wunpckehub wr11, wr11 \n\t"
1003  "waddhus wr0, wr0, wr8 \n\t"
1004  "waddhus wr1, wr1, wr9 \n\t"
1005  "waddhus wr2, wr2, wr10 \n\t"
1006  "waddhus wr3, wr3, wr11 \n\t"
1007 
1008  "1: \n\t"
1009  // [wr0 wr1 wr2 wr3]
1010  // [wr4 wr5 wr6 wr7] <= *
1011  "wldrd wr12, [%[pixels]] \n\t"
1012  "cmp r12, #8 \n\t"
1013  "wldrd wr13, [%[pixels], #8] \n\t"
1014  "wldrd wr14, [%[pixels], #16] \n\t"
1015  "add %[pixels], %[pixels], %[line_size] \n\t"
1016  "walignr1 wr6, wr12, wr13 \n\t"
1017  "pld [%[pixels]] \n\t"
1018  "pld [%[pixels], #32] \n\t"
1019  "walignr1 wr7, wr13, wr14 \n\t"
1020  "wmoveq wr10, wr13 \n\t"
1021  "wmoveq wr11, wr14 \n\t"
1022  "walignr2ne wr10, wr12, wr13 \n\t"
1023  "walignr2ne wr11, wr13, wr14 \n\t"
1024  "wunpckelub wr4, wr6 \n\t"
1025  "wunpckehub wr5, wr6 \n\t"
1026  "wunpckelub wr6, wr7 \n\t"
1027  "wunpckehub wr7, wr7 \n\t"
1028  "wunpckelub wr8, wr10 \n\t"
1029  "wunpckehub wr9, wr10 \n\t"
1030  "wunpckelub wr10, wr11 \n\t"
1031  "wunpckehub wr11, wr11 \n\t"
1032  "waddhus wr4, wr4, wr8 \n\t"
1033  "waddhus wr5, wr5, wr9 \n\t"
1034  "waddhus wr6, wr6, wr10 \n\t"
1035  "waddhus wr7, wr7, wr11 \n\t"
1036  "waddhus wr8, wr0, wr4 \n\t"
1037  "waddhus wr9, wr1, wr5 \n\t"
1038  "waddhus wr10, wr2, wr6 \n\t"
1039  "waddhus wr11, wr3, wr7 \n\t"
1040  "waddhus wr8, wr8, wr15 \n\t"
1041  "waddhus wr9, wr9, wr15 \n\t"
1042  "waddhus wr10, wr10, wr15 \n\t"
1043  "waddhus wr11, wr11, wr15 \n\t"
1044  "wsrlhg wr8, wr8, wcgr0 \n\t"
1045  "wsrlhg wr9, wr9, wcgr0 \n\t"
1046  "wldrd wr12, [%[block]] \n\t"
1047  "wldrd wr13, [%[block], #8] \n\t"
1048  "wsrlhg wr10, wr10, wcgr0 \n\t"
1049  "wsrlhg wr11, wr11, wcgr0 \n\t"
1050  "wpackhus wr8, wr8, wr9 \n\t"
1051  "wpackhus wr9, wr10, wr11 \n\t"
1052  WAVG2B" wr8, wr8, wr12 \n\t"
1053  WAVG2B" wr9, wr9, wr13 \n\t"
1054  "wstrd wr8, [%[block]] \n\t"
1055  "wstrd wr9, [%[block], #8] \n\t"
1056  "add %[block], %[block], %[line_size] \n\t"
1057 
1058  // [wr0 wr1 wr2 wr3] <= *
1059  // [wr4 wr5 wr6 wr7]
1060  "wldrd wr12, [%[pixels]] \n\t"
1061  "pld [%[block]] \n\t"
1062  "wldrd wr13, [%[pixels], #8] \n\t"
1063  "pld [%[block], #32] \n\t"
1064  "wldrd wr14, [%[pixels], #16] \n\t"
1065  "add %[pixels], %[pixels], %[line_size] \n\t"
1066  "walignr1 wr2, wr12, wr13 \n\t"
1067  "pld [%[pixels]] \n\t"
1068  "pld [%[pixels], #32] \n\t"
1069  "walignr1 wr3, wr13, wr14 \n\t"
1070  "wmoveq wr10, wr13 \n\t"
1071  "wmoveq wr11, wr14 \n\t"
1072  "walignr2ne wr10, wr12, wr13 \n\t"
1073  "walignr2ne wr11, wr13, wr14 \n\t"
1074  "wunpckelub wr0, wr2 \n\t"
1075  "wunpckehub wr1, wr2 \n\t"
1076  "wunpckelub wr2, wr3 \n\t"
1077  "wunpckehub wr3, wr3 \n\t"
1078  "wunpckelub wr8, wr10 \n\t"
1079  "wunpckehub wr9, wr10 \n\t"
1080  "wunpckelub wr10, wr11 \n\t"
1081  "wunpckehub wr11, wr11 \n\t"
1082  "waddhus wr0, wr0, wr8 \n\t"
1083  "waddhus wr1, wr1, wr9 \n\t"
1084  "waddhus wr2, wr2, wr10 \n\t"
1085  "waddhus wr3, wr3, wr11 \n\t"
1086  "waddhus wr8, wr0, wr4 \n\t"
1087  "waddhus wr9, wr1, wr5 \n\t"
1088  "waddhus wr10, wr2, wr6 \n\t"
1089  "waddhus wr11, wr3, wr7 \n\t"
1090  "waddhus wr8, wr8, wr15 \n\t"
1091  "waddhus wr9, wr9, wr15 \n\t"
1092  "waddhus wr10, wr10, wr15 \n\t"
1093  "waddhus wr11, wr11, wr15 \n\t"
1094  "wsrlhg wr8, wr8, wcgr0 \n\t"
1095  "wsrlhg wr9, wr9, wcgr0 \n\t"
1096  "wldrd wr12, [%[block]] \n\t"
1097  "wldrd wr13, [%[block], #8] \n\t"
1098  "wsrlhg wr10, wr10, wcgr0 \n\t"
1099  "wsrlhg wr11, wr11, wcgr0 \n\t"
1100  "wpackhus wr8, wr8, wr9 \n\t"
1101  "wpackhus wr9, wr10, wr11 \n\t"
1102  WAVG2B" wr8, wr8, wr12 \n\t"
1103  WAVG2B" wr9, wr9, wr13 \n\t"
1104  "wstrd wr8, [%[block]] \n\t"
1105  "wstrd wr9, [%[block], #8] \n\t"
1106  "add %[block], %[block], %[line_size] \n\t"
1107  "subs %[h], %[h], #2 \n\t"
1108  "pld [%[block]] \n\t"
1109  "pld [%[block], #32] \n\t"
1110  "bne 1b \n\t"
1111  : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block)
1112  : [line_size]"r"(line_size)
1113  : "r12", "memory");
1114 }