FFmpeg  4.0
simple_idct_mmi.c
Go to the documentation of this file.
1 /*
2  * Loongson SIMD optimized simple idct
3  *
4  * Copyright (c) 2015 Loongson Technology Corporation Limited
5  * Copyright (c) 2015 Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
6  * Zhang Shuangshuang <zhangshuangshuang@ict.ac.cn>
7  *
8  * This file is part of FFmpeg.
9  *
10  * FFmpeg is free software; you can redistribute it and/or
11  * modify it under the terms of the GNU Lesser General Public
12  * License as published by the Free Software Foundation; either
13  * version 2.1 of the License, or (at your option) any later version.
14  *
15  * FFmpeg is distributed in the hope that it will be useful,
16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18  * Lesser General Public License for more details.
19  *
20  * You should have received a copy of the GNU Lesser General Public
21  * License along with FFmpeg; if not, write to the Free Software
22  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23  */
24 
25 #include "idctdsp_mips.h"
26 #include "constants.h"
27 
/* Fixed-point cosine table: Ci = cos(i*pi/16) * sqrt(2) scaled by 2^14,
 * rounded as noted per line (C4 is biased down by 0.5 instead of up). */
28 #define C0 23170 // i=0: cos(0*M_PI/16)*sqrt(2)*(1<<14) + 0.5
29 #define C1 22725 // i=1: cos(1*M_PI/16)*sqrt(2)*(1<<14) + 0.5
30 #define C2 21407 // i=2: cos(2*M_PI/16)*sqrt(2)*(1<<14) + 0.5
31 #define C3 19266 // i=3: cos(3*M_PI/16)*sqrt(2)*(1<<14) + 0.5
32 #define C4 16383 // i=4: cos(4*M_PI/16)*sqrt(2)*(1<<14) - 0.5 (exactly 1<<14 before the bias)
33 #define C5 12873 // i=5: cos(5*M_PI/16)*sqrt(2)*(1<<14) + 0.5
34 #define C6 8867 // i=6: cos(6*M_PI/16)*sqrt(2)*(1<<14) + 0.5
35 #define C7 4520 // i=7: cos(7*M_PI/16)*sqrt(2)*(1<<14) + 0.5
36 
/* Right-shift amounts of the two passes. ROW_SHIFT feeds the rounding
 * constants in coeffs[]; the assembly itself passes the literals 11 and 20
 * to its macros, so COL_SHIFT is not referenced by the code visible here. */
37 #define ROW_SHIFT 11
38 #define COL_SHIFT 20
39 
/*
 * Constant table read by the inline assembly through operand %2.
 * Each row is four int16_t (8 bytes) and is loaded with "ldc1 $fN, off(%2)",
 * so the row order and the byte offsets noted below must not change.
 */
40 DECLARE_ALIGNED(8, static const int16_t, coeffs)[]= {
41  1<<(ROW_SHIFT-1), 0, 1<<(ROW_SHIFT-1), 0, /* offset 0: rounder passed as (%2) to Z_COND_IDCT */
42  1<<(ROW_SHIFT-1), 1, 1<<(ROW_SHIFT-1), 0, /* offset 8: rounder passed as 8(%2) to DC_COND_IDCT; NOTE(review): the lone 1 presumably adds 1<<16 to the first 32-bit lane to pre-round the DC term for the column pass - confirm */
43  C4, C4, C4, C4, /* offset 16 */
44  C4, -C4, C4, -C4, /* offset 24 */
45  C2, C6, C2, C6, /* offset 32 */
46  C6, -C2, C6, -C2, /* offset 40 */
47  C1, C3, C1, C3, /* offset 48 */
48  C5, C7, C5, C7, /* offset 56 */
49  C3, -C7, C3, -C7, /* offset 64 */
50  -C1, -C5, -C1, -C5, /* offset 72 */
51  C5, -C1, C5, -C1, /* offset 80 */
52  C7, C3, C7, C3, /* offset 88 */
53  C7, -C5, C7, -C5, /* offset 96 */
54  C3, -C1, C3, -C1 /* offset 104 */
55 };
56 
57 void ff_simple_idct_mmi(int16_t *block)
58 {
59  DECLARE_ALIGNED(8, int64_t, align_tmp)[16];
60  int16_t * const temp= (int16_t*)align_tmp;
61 
62  __asm__ volatile (
63 #undef DC_COND_IDCT
64 #define DC_COND_IDCT(src0, src4, src1, src5, dst, rounder, rarg, shift) \
65  "ldc1 $f0, " #src0 " \n\t" /* R4 R0 r4 r0 */\
66  "ldc1 $f2, " #src4 " \n\t" /* R6 R2 r6 r2 */\
67  "ldc1 $f4, " #src1 " \n\t" /* R3 R1 r3 r1 */\
68  "ldc1 $f6, " #src5 " \n\t" /* R7 R5 r7 r5 */\
69  "ldc1 $f8, %3 \n\t" \
70  "and $f8, $f8, $f0 \n\t" \
71  "or $f8, $f8, $f2 \n\t" \
72  "or $f8, $f8, $f4 \n\t" \
73  "or $f8, $f8, $f6 \n\t" \
74  "packsswh $f8, $f8, $f8 \n\t" \
75  "li $11, " #shift " \n\t" \
76  "mfc1 $10, $f8 \n\t" \
77  "mtc1 $11, $f18 \n\t" \
78  "beqz $10, 1f \n\t" \
79  "ldc1 $f8, 16(%2) \n\t" /* C4 C4 C4 C4 */\
80  "pmaddhw $f8, $f8, $f0 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
81  "ldc1 $f10, 24(%2) \n\t" /* -C4 C4 -C4 C4 */\
82  "pmaddhw $f0, $f0, $f10 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
83  "ldc1 $f10, 32(%2) \n\t" /* C6 C2 C6 C2 */\
84  "pmaddhw $f10, $f10, $f2 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
85  "ldc1 $f12, 40(%2) \n\t" /* -C2 C6 -C2 C6 */\
86  "pmaddhw $f2, $f2, $f12 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
87  "ldc1 $f14, 48(%2) \n\t" /* C3 C1 C3 C1 */\
88  "ldc1 $f16, " #rarg " \n\t" \
89  "pmaddhw $f14, $f14, $f4 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
90  #rounder " $f8, $f8, $f16 \n\t" \
91  "mov.d $f12, $f8 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
92  "paddw $f8, $f8, $f10 \n\t" /* A0 a0 */\
93  "psubw $f12, $f12, $f10 \n\t" /* A3 a3 */\
94  "ldc1 $f10, 56(%2) \n\t" /* C7 C5 C7 C5 */\
95  "ldc1 $f16, " #rarg " \n\t" \
96  "pmaddhw $f10, $f10, $f6 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
97  #rounder " $f0, $f0, $f16 \n\t" \
98  "paddw $f2, $f2, $f0 \n\t" /* A1 a1 */\
99  "ldc1 $f16, 64(%2) \n\t" \
100  "paddw $f0, $f0, $f0 \n\t" \
101  "psubw $f0, $f0, $f2 \n\t" /* A2 a2 */\
102  "pmaddhw $f4, $f4, $f16 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
103  "paddw $f14, $f14, $f10 \n\t" /* B0 b0 */\
104  "ldc1 $f10, 72(%2) \n\t" /* -C5 -C1 -C5 -C1 */\
105  "pmaddhw $f10, $f10, $f6 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
106  "paddw $f14, $f14, $f8 \n\t" /* A0+B0 a0+b0 */\
107  "paddw $f8, $f8, $f8 \n\t" /* 2A0 2a0 */\
108  "psubw $f8, $f8, $f14 \n\t" /* A0-B0 a0-b0 */\
109  "paddw $f10, $f10, $f4 \n\t" /* B1 b1 */\
110  "psraw $f14, $f14, $f18 \n\t" \
111  "psraw $f8, $f8, $f18 \n\t" \
112  "mov.d $f4, $f2 \n\t" /* A1 a1 */\
113  "paddw $f2, $f2, $f10 \n\t" /* A1+B1 a1+b1 */\
114  "psubw $f4, $f4, $f10 \n\t" /* A1-B1 a1-b1 */\
115  "psraw $f2, $f2, $f18 \n\t" \
116  "psraw $f4, $f4, $f18 \n\t" \
117  "packsswh $f14, $f14, $f2 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\
118  "packsswh $f4, $f4, $f8 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\
119  "sdc1 $f14, " #dst " \n\t" \
120  "ldc1 $f2, " #src1 " \n\t" /* R3 R1 r3 r1 */\
121  "ldc1 $f8, 80(%2) \n\t" /* -C1 C5 -C1 C5 */\
122  "sdc1 $f4, 24+" #dst " \n\t" \
123  "pmaddhw $f8, $f8, $f2 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
124  "ldc1 $f16, 96(%2) \n\t" \
125  "ldc1 $f14, 88(%2) \n\t" /* C3 C7 C3 C7 */\
126  "pmaddhw $f2, $f2, $f16 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
127  "pmaddhw $f14, $f14, $f6 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
128  "ldc1 $f16, 104(%2) \n\t" \
129  "mov.d $f4, $f0 \n\t" /* A2 a2 */\
130  "pmaddhw $f6, $f6, $f16 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
131  "paddw $f8, $f8, $f14 \n\t" /* B2 b2 */\
132  "paddw $f4, $f4, $f8 \n\t" /* A2+B2 a2+b2 */\
133  "psubw $f0, $f0, $f8 \n\t" /* a2-B2 a2-b2 */\
134  "psraw $f4, $f4, $f18 \n\t" \
135  "psraw $f0, $f0, $f18 \n\t" \
136  "mov.d $f8, $f12 \n\t" /* A3 a3 */\
137  "paddw $f6, $f6, $f2 \n\t" /* B3 b3 */\
138  "paddw $f12, $f12, $f6 \n\t" /* A3+B3 a3+b3 */\
139  "psubw $f8, $f8, $f6 \n\t" /* a3-B3 a3-b3 */\
140  "psraw $f12, $f12, $f18 \n\t" \
141  "packsswh $f4, $f4, $f12 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\
142  "sdc1 $f4, 8+" #dst " \n\t" \
143  "psraw $f8, $f8, $f18 \n\t" \
144  "packsswh $f8, $f8, $f0 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
145  "sdc1 $f8, 16+" #dst " \n\t" \
146  "b 2f \n\t" \
147  "1: \n\t" \
148  "li $10, 16 \n\t" \
149  "mtc1 $10, $f16 \n\t" \
150  "psllw $f0, $f0, $f16 \n\t" \
151  "ldc1 $f16, %4 \n\t" \
152  "paddw $f0, $f0, $f16 \n\t" \
153  "li $10, 13 \n\t" \
154  "mtc1 $10, $f16 \n\t" \
155  "psraw $f0, $f0, $f16 \n\t" \
156  "packsswh $f0, $f0, $f0 \n\t" \
157  "sdc1 $f0, " #dst " \n\t" \
158  "sdc1 $f0, 8+" #dst " \n\t" \
159  "sdc1 $f0, 16+" #dst " \n\t" \
160  "sdc1 $f0, 24+" #dst " \n\t" \
161  "2: \n\t"
162 
163 #undef Z_COND_IDCT
164 #define Z_COND_IDCT(src0, src4, src1, src5, dst, rounder, rarg, shift, bt) \
165  "ldc1 $f0, " #src0 " \n\t" /* R4 R0 r4 r0 */\
166  "ldc1 $f2, " #src4 " \n\t" /* R6 R2 r6 r2 */\
167  "ldc1 $f4, " #src1 " \n\t" /* R3 R1 r3 r1 */\
168  "ldc1 $f6, " #src5 " \n\t" /* R7 R5 r7 r5 */\
169  "mov.d $f8, $f0 \n\t" \
170  "or $f8, $f8, $f2 \n\t" \
171  "or $f8, $f8, $f4 \n\t" \
172  "or $f8, $f8, $f6 \n\t" \
173  "packsswh $f8, $f8, $f8 \n\t" \
174  "mfc1 $10, $f8 \n\t" \
175  "beqz $10, " #bt " \n\t" \
176  "ldc1 $f8, 16(%2) \n\t" /* C4 C4 C4 C4 */\
177  "pmaddhw $f8, $f8, $f0 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
178  "ldc1 $f10, 24(%2) \n\t" /* -C4 C4 -C4 C4 */\
179  "pmaddhw $f0, $f0, $f10 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
180  "ldc1 $f10, 32(%2) \n\t" /* C6 C2 C6 C2 */\
181  "pmaddhw $f10, $f10, $f2 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
182  "ldc1 $f12, 40(%2) \n\t" /* -C2 C6 -C2 C6 */\
183  "pmaddhw $f2, $f2, $f12 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
184  "ldc1 $f14, 48(%2) \n\t" /* C3 C1 C3 C1 */\
185  "ldc1 $f16, " #rarg " \n\t" \
186  "pmaddhw $f14, $f14, $f4 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
187  #rounder " $f8, $f8, $f16 \n\t" \
188  "mov.d $f12, $f8 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
189  "paddw $f8, $f8, $f10 \n\t" /* A0 a0 */\
190  "psubw $f12, $f12, $f10 \n\t" /* A3 a3 */\
191  "ldc1 $f10, 56(%2) \n\t" /* C7 C5 C7 C5 */\
192  "ldc1 $f16, " #rarg " \n\t" \
193  "pmaddhw $f10, $f10, $f6 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
194  #rounder " $f0, $f0, $f16 \n\t" \
195  "paddw $f2, $f2, $f0 \n\t" /* A1 a1 */\
196  "paddw $f0, $f0, $f0 \n\t" \
197  "ldc1 $f16, 64(%2) \n\t" \
198  "psubw $f0, $f0, $f2 \n\t" /* A2 a2 */\
199  "pmaddhw $f4, $f4, $f16 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
200  "paddw $f14, $f14, $f10 \n\t" /* B0 b0 */\
201  "ldc1 $f10, 72(%2) \n\t" /* -C5 -C1 -C5 -C1 */\
202  "pmaddhw $f10, $f10, $f6 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
203  "paddw $f14, $f14, $f8 \n\t" /* A0+B0 a0+b0 */\
204  "paddw $f8, $f8, $f8 \n\t" /* 2A0 2a0 */\
205  "li $10, " #shift " \n\t" \
206  "psubw $f8, $f8, $f14 \n\t" /* A0-B0 a0-b0 */\
207  "mtc1 $10, $f18 \n\t" \
208  "paddw $f10, $f10, $f4 \n\t" /* B1 b1 */\
209  "psraw $f14, $f14, $f18 \n\t" \
210  "psraw $f8, $f8, $f18 \n\t" \
211  "mov.d $f4, $f2 \n\t" /* A1 a1 */\
212  "paddw $f2, $f2, $f10 \n\t" /* A1+B1 a1+b1 */\
213  "psubw $f4, $f4, $f10 \n\t" /* A1-B1 a1-b1 */\
214  "psraw $f2, $f2, $f18 \n\t" \
215  "psraw $f4, $f4, $f18 \n\t" \
216  "packsswh $f14, $f14, $f2 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\
217  "packsswh $f4, $f4, $f8 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\
218  "sdc1 $f14, " #dst " \n\t" \
219  "ldc1 $f2, " #src1 " \n\t" /* R3 R1 r3 r1 */\
220  "ldc1 $f8, 80(%2) \n\t" /* -C1 C5 -C1 C5 */\
221  "sdc1 $f4, 24+" #dst " \n\t" \
222  "pmaddhw $f8, $f8, $f2 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
223  "ldc1 $f16, 96(%2) \n\t" \
224  "ldc1 $f14, 88(%2) \n\t" /* C3 C7 C3 C7 */\
225  "pmaddhw $f2, $f2, $f16 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
226  "pmaddhw $f14, $f14, $f6 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
227  "ldc1 $f16, 104(%2) \n\t" \
228  "mov.d $f4, $f0 \n\t" /* A2 a2 */\
229  "pmaddhw $f6, $f6, $f16 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
230  "paddw $f8, $f8, $f14 \n\t" /* B2 b2 */\
231  "paddw $f4, $f4, $f8 \n\t" /* A2+B2 a2+b2 */\
232  "psubw $f0, $f0, $f8 \n\t" /* a2-B2 a2-b2 */\
233  "psraw $f4, $f4, $f18 \n\t" \
234  "psraw $f0, $f0, $f18 \n\t" \
235  "mov.d $f8, $f12 \n\t" /* A3 a3 */\
236  "paddw $f6, $f6, $f2 \n\t" /* B3 b3 */\
237  "paddw $f12, $f12, $f6 \n\t" /* A3+B3 a3+b3 */\
238  "psubw $f8, $f8, $f6 \n\t" /* a3-B3 a3-b3 */\
239  "psraw $f12, $f12, $f18 \n\t" \
240  "packsswh $f4, $f4, $f12 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\
241  "sdc1 $f4, 8+" #dst " \n\t" \
242  "psraw $f8, $f8, $f18 \n\t" \
243  "packsswh $f8, $f8, $f0 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
244  "sdc1 $f8, 16+" #dst " \n\t" \
245 
246  //IDCT( src0, src4, src1, src5, dst, rounder, shift)
247  DC_COND_IDCT(0(%0), 8(%0), 16(%0), 24(%0), 0(%1), paddw,8(%2), 11)
248  Z_COND_IDCT(32(%0), 40(%0), 48(%0), 56(%0), 32(%1), paddw,(%2), 11, 4f)
249  Z_COND_IDCT(64(%0), 72(%0), 80(%0), 88(%0), 64(%1), paddw,(%2), 11, 2f)
250  Z_COND_IDCT(96(%0),104(%0),112(%0),120(%0), 96(%1), paddw,(%2), 11, 1f)
251 
252 #undef IDCT
253 #define IDCT(src0, src4, src1, src5, dst, shift) \
254  "ldc1 $f0, " #src0 " \n\t" /* R4 R0 r4 r0 */\
255  "ldc1 $f2, " #src4 " \n\t" /* R6 R2 r6 r2 */\
256  "ldc1 $f4, " #src1 " \n\t" /* R3 R1 r3 r1 */\
257  "ldc1 $f6, " #src5 " \n\t" /* R7 R5 r7 r5 */\
258  "ldc1 $f8, 16(%2) \n\t" /* C4 C4 C4 C4 */\
259  "pmaddhw $f8, $f8, $f0 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
260  "ldc1 $f10, 24(%2) \n\t" /* -C4 C4 -C4 C4 */\
261  "pmaddhw $f0, $f0, $f10 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
262  "ldc1 $f10, 32(%2) \n\t" /* C6 C2 C6 C2 */\
263  "pmaddhw $f10, $f10, $f2 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
264  "ldc1 $f12, 40(%2) \n\t" /* -C2 C6 -C2 C6 */\
265  "pmaddhw $f2, $f2, $f12 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
266  "mov.d $f12, $f8 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
267  "ldc1 $f14, 48(%2) \n\t" /* C3 C1 C3 C1 */\
268  "pmaddhw $f14, $f14, $f4 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
269  "paddw $f8, $f8, $f10 \n\t" /* A0 a0 */\
270  "psubw $f12, $f12, $f10 \n\t" /* A3 a3 */\
271  "mov.d $f10, $f0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
272  "paddw $f0, $f0, $f2 \n\t" /* A1 a1 */\
273  "psubw $f10, $f10, $f2 \n\t" /* A2 a2 */\
274  "ldc1 $f2, 56(%2) \n\t" /* C7 C5 C7 C5 */\
275  "ldc1 $f16, 64(%2) \n\t" \
276  "pmaddhw $f2, $f2, $f6 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
277  "pmaddhw $f4, $f4, $f16 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
278  "li $10, " #shift " \n\t" \
279  "paddw $f14, $f14, $f2 \n\t" /* B0 b0 */\
280  "ldc1 $f2, 72(%2) \n\t" /* -C5 -C1 -C5 -C1 */\
281  "mtc1 $10, $f18 \n\t" \
282  "pmaddhw $f2, $f2, $f6 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
283  "paddw $f14, $f14, $f8 \n\t" /* A0+B0 a0+b0 */\
284  "paddw $f8, $f8, $f8 \n\t" /* 2A0 2a0 */\
285  "psubw $f8, $f8, $f14 \n\t" /* A0-B0 a0-b0 */\
286  "paddw $f2, $f2, $f4 \n\t" /* B1 b1 */\
287  "psraw $f14, $f14, $f18 \n\t" \
288  "psraw $f8, $f8, $f18 \n\t" \
289  "mov.d $f4, $f0 \n\t" /* A1 a1 */\
290  "paddw $f0, $f0, $f2 \n\t" /* A1+B1 a1+b1 */\
291  "psubw $f4, $f4, $f2 \n\t" /* A1-B1 a1-b1 */\
292  "psraw $f0, $f0, $f18 \n\t" \
293  "psraw $f4, $f4, $f18 \n\t" \
294  "packsswh $f14, $f14, $f14 \n\t" /* A0+B0 a0+b0 */\
295  "swc1 $f14, " #dst " \n\t" \
296  "packsswh $f0, $f0, $f0 \n\t" /* A1+B1 a1+b1 */\
297  "swc1 $f0, 16+" #dst " \n\t" \
298  "packsswh $f4, $f4, $f4 \n\t" /* A1-B1 a1-b1 */\
299  "swc1 $f4, 96+" #dst " \n\t" \
300  "packsswh $f8, $f8, $f8 \n\t" /* A0-B0 a0-b0 */\
301  "swc1 $f8, 112+" #dst " \n\t" \
302  "ldc1 $f0, " #src1 " \n\t" /* R3 R1 r3 r1 */\
303  "ldc1 $f8, 80(%2) \n\t" /* -C1 C5 -C1 C5 */\
304  "pmaddhw $f8, $f8, $f0 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
305  "ldc1 $f16, 96(%2) \n\t" \
306  "ldc1 $f14, 88(%2) \n\t" /* C3 C7 C3 C7 */\
307  "pmaddhw $f0, $f0, $f16 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
308  "pmaddhw $f14, $f14, $f6 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
309  "ldc1 $f16, 104(%2) \n\t" \
310  "mov.d $f4, $f10 \n\t" /* A2 a2 */\
311  "pmaddhw $f6, $f6, $f16 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
312  "paddw $f8, $f8, $f14 \n\t" /* B2 b2 */\
313  "paddw $f4, $f4, $f8 \n\t" /* A2+B2 a2+b2 */\
314  "psubw $f10, $f10, $f8 \n\t" /* a2-B2 a2-b2 */\
315  "psraw $f4, $f4, $f18 \n\t" \
316  "psraw $f10, $f10, $f18 \n\t" \
317  "mov.d $f8, $f12 \n\t" /* A3 a3 */\
318  "paddw $f6, $f6, $f0 \n\t" /* B3 b3 */\
319  "paddw $f12, $f12, $f6 \n\t" /* A3+B3 a3+b3 */\
320  "psubw $f8, $f8, $f6 \n\t" /* a3-B3 a3-b3 */\
321  "psraw $f12, $f12, $f18 \n\t" \
322  "psraw $f8, $f8, $f18 \n\t" \
323  "packsswh $f4, $f4, $f4 \n\t" /* A2+B2 a2+b2 */\
324  "packsswh $f12, $f12, $f12 \n\t" /* A3+B3 a3+b3 */\
325  "swc1 $f4, 32+" #dst " \n\t" \
326  "packsswh $f8, $f8, $f8 \n\t" /* A3-B3 a3-b3 */\
327  "packsswh $f10, $f10, $f10 \n\t" /* A2-B2 a2-b2 */\
328  "swc1 $f12, 48+" #dst " \n\t" \
329  "swc1 $f8, 64+" #dst " \n\t" \
330  "swc1 $f10, 80+" #dst " \n\t"
331 
332  //IDCT( src0, src4, src1, src5, dst, shift)
333  IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
334  IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
335  IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
336  IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
337  "b 9f \n\t"
338 
339  "# .p2align 4 \n\t"
340  "4: \n\t"
341  Z_COND_IDCT(64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddw,(%2), 11, 6f)
342  Z_COND_IDCT(96(%0),104(%0),112(%0),120(%0), 96(%1),paddw,(%2), 11, 5f)
343 
344 #undef IDCT
345 #define IDCT(src0, src4, src1, src5, dst, shift) \
346  "ldc1 $f0, " #src0 " \n\t" /* R4 R0 r4 r0 */\
347  "ldc1 $f2, " #src4 " \n\t" /* R6 R2 r6 r2 */\
348  "ldc1 $f6, " #src5 " \n\t" /* R7 R5 r7 r5 */\
349  "ldc1 $f8, 16(%2) \n\t" /* C4 C4 C4 C4 */\
350  "pmaddhw $f8, $f8, $f0 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
351  "ldc1 $f10, 24(%2) \n\t" /* -C4 C4 -C4 C4 */\
352  "pmaddhw $f0, $f0, $f10 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
353  "ldc1 $f10, 32(%2) \n\t" /* C6 C2 C6 C2 */\
354  "pmaddhw $f10, $f10, $f2 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
355  "ldc1 $f12, 40(%2) \n\t" /* -C2 C6 -C2 C6 */\
356  "pmaddhw $f2, $f2, $f12 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
357  "mov.d $f12, $f8 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
358  "paddw $f8, $f8, $f10 \n\t" /* A0 a0 */\
359  "psubw $f12, $f12, $f10 \n\t" /* A3 a3 */\
360  "mov.d $f10, $f0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
361  "paddw $f0, $f0, $f2 \n\t" /* A1 a1 */\
362  "psubw $f10, $f10, $f2 \n\t" /* A2 a2 */\
363  "ldc1 $f2, 56(%2) \n\t" /* C7 C5 C7 C5 */\
364  "li $10, " #shift " \n\t" \
365  "pmaddhw $f2, $f2, $f6 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
366  "ldc1 $f14, 72(%2) \n\t" /* -C5 -C1 -C5 -C1 */\
367  "mtc1 $10, $f18 \n\t" \
368  "pmaddhw $f14, $f14, $f6 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
369  "paddw $f2, $f2, $f8 \n\t" /* A0+B0 a0+b0 */\
370  "paddw $f8, $f8, $f8 \n\t" /* 2A0 2a0 */\
371  "psubw $f8, $f8, $f2 \n\t" /* A0-B0 a0-b0 */\
372  "psraw $f2, $f2, $f18 \n\t" \
373  "psraw $f8, $f8, $f18 \n\t" \
374  "mov.d $f4, $f0 \n\t" /* A1 a1 */\
375  "paddw $f0, $f0, $f14 \n\t" /* A1+B1 a1+b1 */\
376  "psubw $f4, $f4, $f14 \n\t" /* A1-B1 a1-b1 */\
377  "psraw $f0, $f0, $f18 \n\t" \
378  "psraw $f4, $f4, $f18 \n\t" \
379  "packsswh $f2, $f2, $f2 \n\t" /* A0+B0 a0+b0 */\
380  "swc1 $f2, " #dst " \n\t" \
381  "packsswh $f0, $f0, $f0 \n\t" /* A1+B1 a1+b1 */\
382  "swc1 $f0, 16+" #dst " \n\t" \
383  "packsswh $f4, $f4, $f4 \n\t" /* A1-B1 a1-b1 */\
384  "swc1 $f4, 96+" #dst " \n\t" \
385  "packsswh $f8, $f8, $f8 \n\t" /* A0-B0 a0-b0 */\
386  "swc1 $f8, 112+" #dst " \n\t" \
387  "ldc1 $f2, 88(%2) \n\t" /* C3 C7 C3 C7 */\
388  "ldc1 $f16, 104(%2) \n\t" \
389  "pmaddhw $f2, $f2, $f6 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
390  "mov.d $f4, $f10 \n\t" /* A2 a2 */\
391  "pmaddhw $f6, $f6, $f16 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
392  "paddw $f4, $f4, $f2 \n\t" /* A2+B2 a2+b2 */\
393  "psubw $f10, $f10, $f2 \n\t" /* a2-B2 a2-b2 */\
394  "psraw $f4, $f4, $f18 \n\t" \
395  "psraw $f10, $f10, $f18 \n\t" \
396  "mov.d $f2, $f12 \n\t" /* A3 a3 */\
397  "paddw $f12, $f12, $f6 \n\t" /* A3+B3 a3+b3 */\
398  "psubw $f2, $f2, $f6 \n\t" /* a3-B3 a3-b3 */\
399  "psraw $f12, $f12, $f18 \n\t" \
400  "psraw $f2, $f2, $f18 \n\t" \
401  "packsswh $f4, $f4, $f4 \n\t" /* A2+B2 a2+b2 */\
402  "packsswh $f12, $f12, $f12 \n\t" /* A3+B3 a3+b3 */\
403  "swc1 $f4, 32+" #dst " \n\t" \
404  "packsswh $f2, $f2, $f2 \n\t" /* A3-B3 a3-b3 */\
405  "packsswh $f10, $f10, $f10 \n\t" /* A2-B2 a2-b2 */\
406  "swc1 $f12, 48+" #dst " \n\t" \
407  "swc1 $f2, 64+" #dst " \n\t" \
408  "swc1 $f10, 80+" #dst " \n\t"
409 
410  //IDCT( src0, src4, src1, src5, dst, shift)
411  IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
412  IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
413  IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
414  IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
415  "b 9f \n\t"
416 
417  "# .p2align 4 \n\t"
418  "6: \n\t"
419  Z_COND_IDCT(96(%0),104(%0),112(%0),120(%0), 96(%1),paddw,(%2), 11, 7f)
420 
421 #undef IDCT
422 #define IDCT(src0, src4, src1, src5, dst, shift) \
423  "ldc1 $f0, " #src0 " \n\t" /* R4 R0 r4 r0 */\
424  "ldc1 $f6, " #src5 " \n\t" /* R7 R5 r7 r5 */\
425  "ldc1 $f8, 16(%2) \n\t" /* C4 C4 C4 C4 */\
426  "pmaddhw $f8, $f8, $f0 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
427  "ldc1 $f10, 24(%2) \n\t" /* -C4 C4 -C4 C4 */\
428  "pmaddhw $f0, $f0, $f10 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
429  "mov.d $f12, $f8 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
430  "mov.d $f10, $f0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
431  "ldc1 $f2, 56(%2) \n\t" /* C7 C5 C7 C5 */\
432  "pmaddhw $f2, $f2, $f6 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
433  "ldc1 $f14, 72(%2) \n\t" /* -C5 -C1 -C5 -C1 */\
434  "li $10, " #shift " \n\t" \
435  "pmaddhw $f14, $f14, $f6 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
436  "paddw $f2, $f2, $f8 \n\t" /* A0+B0 a0+b0 */\
437  "mtc1 $10, $f18 \n\t" \
438  "paddw $f8, $f8, $f8 \n\t" /* 2A0 2a0 */\
439  "psubw $f8, $f8, $f2 \n\t" /* A0-B0 a0-b0 */\
440  "psraw $f2, $f2, $f18 \n\t" \
441  "psraw $f8, $f8, $f18 \n\t" \
442  "mov.d $f4, $f0 \n\t" /* A1 a1 */\
443  "paddw $f0, $f0, $f14 \n\t" /* A1+B1 a1+b1 */\
444  "psubw $f4, $f4, $f14 \n\t" /* A1-B1 a1-b1 */\
445  "psraw $f0, $f0, $f18 \n\t" \
446  "psraw $f4, $f4, $f18 \n\t" \
447  "packsswh $f2, $f2, $f2 \n\t" /* A0+B0 a0+b0 */\
448  "swc1 $f2, " #dst " \n\t" \
449  "packsswh $f0, $f0, $f0 \n\t" /* A1+B1 a1+b1 */\
450  "swc1 $f0, 16+" #dst " \n\t" \
451  "packsswh $f4, $f4, $f4 \n\t" /* A1-B1 a1-b1 */\
452  "swc1 $f4, 96+" #dst " \n\t" \
453  "packsswh $f8, $f8, $f8 \n\t" /* A0-B0 a0-b0 */\
454  "swc1 $f8, 112+" #dst " \n\t" \
455  "ldc1 $f2, 88(%2) \n\t" /* C3 C7 C3 C7 */\
456  "ldc1 $f16, 104(%2) \n\t" \
457  "pmaddhw $f2, $f2, $f6 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
458  "mov.d $f4, $f10 \n\t" /* A2 a2 */\
459  "pmaddhw $f6, $f6, $f16 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
460  "paddw $f4, $f4, $f2 \n\t" /* A2+B2 a2+b2 */\
461  "psubw $f10, $f10, $f2 \n\t" /* a2-B2 a2-b2 */\
462  "psraw $f4, $f4, $f18 \n\t" \
463  "psraw $f10, $f10, $f18 \n\t" \
464  "mov.d $f2, $f12 \n\t" /* A3 a3 */\
465  "paddw $f12, $f12, $f6 \n\t" /* A3+B3 a3+b3 */\
466  "psubw $f2, $f2, $f6 \n\t" /* a3-B3 a3-b3 */\
467  "psraw $f12, $f12, $f18 \n\t" \
468  "psraw $f2, $f2, $f18 \n\t" \
469  "packsswh $f4, $f4, $f4 \n\t" /* A2+B2 a2+b2 */\
470  "packsswh $f12, $f12, $f12 \n\t" /* A3+B3 a3+b3 */\
471  "swc1 $f4, 32+" #dst " \n\t" \
472  "packsswh $f2, $f2, $f2 \n\t" /* A3-B3 a3-b3 */\
473  "packsswh $f10, $f10, $f10 \n\t" /* A2-B2 a2-b2 */\
474  "swc1 $f12, 48+" #dst " \n\t" \
475  "swc1 $f2, 64+" #dst " \n\t" \
476  "swc1 $f10, 80+" #dst " \n\t"
477 
478  //IDCT( src0, src4, src1, src5, dst, shift)
479  IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
480  IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
481  IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
482  IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
483  "b 9f \n\t"
484 
485  "# .p2align 4 \n\t"
486  "2: \n\t"
487  Z_COND_IDCT(96(%0),104(%0),112(%0),120(%0), 96(%1),paddw,(%2), 11, 3f)
488 
489 #undef IDCT
490 #define IDCT(src0, src4, src1, src5, dst, shift) \
491  "ldc1 $f0, " #src0 " \n\t" /* R4 R0 r4 r0 */\
492  "ldc1 $f4, " #src1 " \n\t" /* R3 R1 r3 r1 */\
493  "ldc1 $f6, " #src5 " \n\t" /* R7 R5 r7 r5 */\
494  "ldc1 $f8, 16(%2) \n\t" /* C4 C4 C4 C4 */\
495  "pmaddhw $f8, $f8, $f0 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
496  "ldc1 $f10, 24(%2) \n\t" /* -C4 C4 -C4 C4 */\
497  "pmaddhw $f0, $f0, $f10 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
498  "mov.d $f12, $f8 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
499  "ldc1 $f14, 48(%2) \n\t" /* C3 C1 C3 C1 */\
500  "pmaddhw $f14, $f14, $f4 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
501  "mov.d $f10, $f0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
502  "ldc1 $f2, 56(%2) \n\t" /* C7 C5 C7 C5 */\
503  "pmaddhw $f2, $f2, $f6 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
504  "ldc1 $f16, 64(%2) \n\t" \
505  "pmaddhw $f4, $f4, $f16 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
506  "paddw $f14, $f14, $f2 \n\t" /* B0 b0 */\
507  "ldc1 $f2, 72(%2) \n\t" /* -C5 -C1 -C5 -C1 */\
508  "li $10, " #shift " \n\t" \
509  "pmaddhw $f2, $f2, $f6 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
510  "paddw $f14, $f14, $f8 \n\t" /* A0+B0 a0+b0 */\
511  "mtc1 $10, $f18 \n\t" \
512  "paddw $f8, $f8, $f8 \n\t" /* 2A0 2a0 */\
513  "psubw $f8, $f8, $f14 \n\t" /* A0-B0 a0-b0 */\
514  "paddw $f2, $f2, $f4 \n\t" /* B1 b1 */\
515  "psraw $f14, $f14, $f18 \n\t" \
516  "psraw $f8, $f8, $f18 \n\t" \
517  "mov.d $f4, $f0 \n\t" /* A1 a1 */\
518  "paddw $f0, $f0, $f2 \n\t" /* A1+B1 a1+b1 */\
519  "psubw $f4, $f4, $f2 \n\t" /* A1-B1 a1-b1 */\
520  "psraw $f0, $f0, $f18 \n\t" \
521  "psraw $f4, $f4, $f18 \n\t" \
522  "packsswh $f14, $f14, $f14 \n\t" /* A0+B0 a0+b0 */\
523  "swc1 $f14, " #dst " \n\t" \
524  "packsswh $f0, $f0, $f0 \n\t" /* A1+B1 a1+b1 */\
525  "swc1 $f0, 16+" #dst " \n\t" \
526  "packsswh $f4, $f4, $f4 \n\t" /* A1-B1 a1-b1 */\
527  "swc1 $f4, 96+" #dst " \n\t" \
528  "packsswh $f8, $f8, $f8 \n\t" /* A0-B0 a0-b0 */\
529  "swc1 $f8, 112+" #dst " \n\t" \
530  "ldc1 $f0, " #src1 " \n\t" /* R3 R1 r3 r1 */\
531  "ldc1 $f8, 80(%2) \n\t" /* -C1 C5 -C1 C5 */\
532  "pmaddhw $f8, $f8, $f0 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
533  "ldc1 $f14, 88(%2) \n\t" /* C3 C7 C3 C7 */\
534  "ldc1 $f16, 96(%2) \n\t" \
535  "pmaddhw $f0, $f0, $f16 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
536  "pmaddhw $f14, $f14, $f6 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
537  "mov.d $f4, $f10 \n\t" /* A2 a2 */\
538  "ldc1 $f16, 104(%2) \n\t" \
539  "pmaddhw $f6, $f6, $f16 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
540  "paddw $f8, $f8, $f14 \n\t" /* B2 b2 */\
541  "paddw $f4, $f4, $f8 \n\t" /* A2+B2 a2+b2 */\
542  "psubw $f10, $f10, $f8 \n\t" /* a2-B2 a2-b2 */\
543  "psraw $f4, $f4, $f18 \n\t" \
544  "psraw $f10, $f10, $f18 \n\t" \
545  "mov.d $f8, $f12 \n\t" /* A3 a3 */\
546  "paddw $f6, $f6, $f0 \n\t" /* B3 b3 */\
547  "paddw $f12, $f12, $f6 \n\t" /* A3+B3 a3+b3 */\
548  "psubw $f8, $f8, $f6 \n\t" /* a3-B3 a3-b3 */\
549  "psraw $f12, $f12, $f18 \n\t" \
550  "psraw $f8, $f8, $f18 \n\t" \
551  "packsswh $f4, $f4, $f4 \n\t" /* A2+B2 a2+b2 */\
552  "packsswh $f12, $f12, $f12 \n\t" /* A3+B3 a3+b3 */\
553  "swc1 $f4, 32+" #dst " \n\t" \
554  "packsswh $f8, $f8, $f8 \n\t" /* A3-B3 a3-b3 */\
555  "packsswh $f10, $f10, $f10 \n\t" /* A2-B2 a2-b2 */\
556  "swc1 $f12, 48+" #dst " \n\t" \
557  "swc1 $f8, 64+" #dst " \n\t" \
558  "swc1 $f10, 80+" #dst " \n\t"
559 
560  //IDCT( src0, src4, src1, src5, dst, shift)
561  IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
562  IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
563  IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
564  IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
565  "b 9f \n\t"
566 
567  "# .p2align 4 \n\t"
568  "3: \n\t"
569 
570 #undef IDCT
571 #define IDCT(src0, src4, src1, src5, dst, shift) \
572  "ldc1 $f0, " #src0 " \n\t" /* R4 R0 r4 r0 */\
573  "ldc1 $f4, " #src1 " \n\t" /* R3 R1 r3 r1 */\
574  "ldc1 $f8, 16(%2) \n\t" /* C4 C4 C4 C4 */\
575  "pmaddhw $f8, $f8, $f0 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
576  "ldc1 $f10, 24(%2) \n\t" /* -C4 C4 -C4 C4 */\
577  "pmaddhw $f0, $f0, $f10 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
578  "mov.d $f12, $f8 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
579  "ldc1 $f14, 48(%2) \n\t" /* C3 C1 C3 C1 */\
580  "pmaddhw $f14, $f14, $f4 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
581  "mov.d $f10, $f0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
582  "ldc1 $f6, 64(%2) \n\t" \
583  "pmaddhw $f6, $f6, $f4 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
584  "li $10, " #shift " \n\t" \
585  "paddw $f14, $f14, $f8 \n\t" /* A0+B0 a0+b0 */\
586  "mtc1 $10, $f18 \n\t" \
587  "paddw $f8, $f8, $f8 \n\t" /* 2A0 2a0 */\
588  "psubw $f8, $f8, $f14 \n\t" /* A0-B0 a0-b0 */\
589  "psraw $f14, $f14, $f18 \n\t" \
590  "psraw $f8, $f8, $f18 \n\t" \
591  "mov.d $f2, $f0 \n\t" /* A1 a1 */\
592  "paddw $f0, $f0, $f6 \n\t" /* A1+B1 a1+b1 */\
593  "psubw $f2, $f2, $f6 \n\t" /* A1-B1 a1-b1 */\
594  "psraw $f0, $f0, $f18 \n\t" \
595  "psraw $f2, $f2, $f18 \n\t" \
596  "packsswh $f14, $f14, $f14 \n\t" /* A0+B0 a0+b0 */\
597  "swc1 $f14, " #dst " \n\t" \
598  "packsswh $f0, $f0, $f0 \n\t" /* A1+B1 a1+b1 */\
599  "swc1 $f0, 16+" #dst " \n\t" \
600  "packsswh $f2, $f2, $f2 \n\t" /* A1-B1 a1-b1 */\
601  "swc1 $f2, 96+" #dst " \n\t" \
602  "packsswh $f8, $f8, $f8 \n\t" /* A0-B0 a0-b0 */\
603  "swc1 $f8, 112+" #dst " \n\t" \
604  "ldc1 $f8, 80(%2) \n\t" /* -C1 C5 -C1 C5 */\
605  "ldc1 $f16, 96(%2) \n\t" \
606  "pmaddhw $f8, $f8, $f4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
607  "pmaddhw $f4, $f4, $f16 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
608  "mov.d $f2, $f10 \n\t" /* A2 a2 */\
609  "paddw $f2, $f2, $f8 \n\t" /* A2+B2 a2+b2 */\
610  "psubw $f10, $f10, $f8 \n\t" /* a2-B2 a2-b2 */\
611  "psraw $f2, $f2, $f18 \n\t" \
612  "psraw $f10, $f10, $f18 \n\t" \
613  "mov.d $f8, $f12 \n\t" /* A3 a3 */\
614  "paddw $f12, $f12, $f4 \n\t" /* A3+B3 a3+b3 */\
615  "psubw $f8, $f8, $f4 \n\t" /* a3-B3 a3-b3 */\
616  "psraw $f12, $f12, $f18 \n\t" \
617  "psraw $f8, $f8, $f18 \n\t" \
618  "packsswh $f2, $f2, $f2 \n\t" /* A2+B2 a2+b2 */\
619  "packsswh $f12, $f12, $f12 \n\t" /* A3+B3 a3+b3 */\
620  "swc1 $f2, 32+" #dst " \n\t" \
621  "packsswh $f8, $f8, $f8 \n\t" /* A3-B3 a3-b3 */\
622  "packsswh $f10, $f10, $f10 \n\t" /* A2-B2 a2-b2 */\
623  "swc1 $f12, 48+" #dst " \n\t" \
624  "swc1 $f8, 64+" #dst " \n\t" \
625  "swc1 $f10, 80+" #dst " \n\t"
626 
627  //IDCT( src0, src4, src1, src5, dst, shift)
628  IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
629  IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
630  IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
631  IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
632  "b 9f \n\t"
633 
634  "# .p2align 4 \n\t"
635  "5: \n\t"
636 
637 #undef IDCT
638 #define IDCT(src0, src4, src1, src5, dst, shift) \
639  "ldc1 $f0, " #src0 " \n\t" /* R4 R0 r4 r0 */\
640  "ldc1 $f2, " #src4 " \n\t" /* R6 R2 r6 r2 */\
641  "ldc1 $f8, 16(%2) \n\t" /* C4 C4 C4 C4 */\
642  "pmaddhw $f8, $f8, $f0 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
643  "ldc1 $f10, 24(%2) \n\t" /* -C4 C4 -C4 C4 */\
644  "pmaddhw $f0, $f0, $f10 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
645  "ldc1 $f10, 32(%2) \n\t" /* C6 C2 C6 C2 */\
646  "pmaddhw $f10, $f10, $f2 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
647  "ldc1 $f12, 40(%2) \n\t" /* -C2 C6 -C2 C6 */\
648  "pmaddhw $f2, $f2, $f12 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
649  "mov.d $f12, $f8 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
650  "paddw $f8, $f8, $f10 \n\t" /* A0 a0 */\
651  "psubw $f12, $f12, $f10 \n\t" /* A3 a3 */\
652  "mov.d $f10, $f0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
653  "paddw $f0, $f0, $f2 \n\t" /* A1 a1 */\
654  "psubw $f10, $f10, $f2 \n\t" /* A2 a2 */\
655  "ldc1 $f4, 8+" #src0 " \n\t" /* R4 R0 r4 r0 */\
656  "ldc1 $f6, 8+" #src4 " \n\t" /* R6 R2 r6 r2 */\
657  "ldc1 $f2, 16(%2) \n\t" /* C4 C4 C4 C4 */\
658  "pmaddhw $f2, $f2, $f4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
659  "ldc1 $f14, 24(%2) \n\t" /* -C4 C4 -C4 C4 */\
660  "pmaddhw $f4, $f4, $f14 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
661  "ldc1 $f14, 32(%2) \n\t" /* C6 C2 C6 C2 */\
662  "ldc1 $f16, 40(%2) \n\t" \
663  "pmaddhw $f14, $f14, $f6 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
664  "pmaddhw $f6, $f6, $f16 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
665  "paddw $f14, $f14, $f2 \n\t" /* A0 a0 */\
666  "paddw $f2, $f2, $f2 \n\t" /* 2C0 2c0 */\
667  "psubw $f2, $f2, $f14 \n\t" /* A3 a3 */\
668  "li $10, " #shift " \n\t" \
669  "paddw $f6, $f6, $f4 \n\t" /* A1 a1 */\
670  "mtc1 $10, $f18 \n\t" \
671  "paddw $f4, $f4, $f4 \n\t" /* 2C1 2c1 */\
672  "psubw $f4, $f4, $f6 \n\t" /* A2 a2 */\
673  "psraw $f8, $f8, $f18 \n\t" \
674  "psraw $f14, $f14, $f18 \n\t" \
675  "psraw $f6, $f6, $f18 \n\t" \
676  "packsswh $f8, $f8, $f14 \n\t" /* A0 a0 */\
677  "sdc1 $f8, " #dst " \n\t" \
678  "psraw $f0, $f0, $f18 \n\t" \
679  "packsswh $f0, $f0, $f6 \n\t" /* A1 a1 */\
680  "sdc1 $f0, 16+" #dst " \n\t" \
681  "sdc1 $f0, 96+" #dst " \n\t" \
682  "sdc1 $f8, 112+" #dst " \n\t" \
683  "psraw $f10, $f10, $f18 \n\t" \
684  "psraw $f12, $f12, $f18 \n\t" \
685  "psraw $f4, $f4, $f18 \n\t" \
686  "packsswh $f10, $f10, $f4 \n\t" /* A2-B2 a2-b2 */\
687  "sdc1 $f10, 32+" #dst " \n\t" \
688  "psraw $f2, $f2, $f18 \n\t" \
689  "packsswh $f12, $f12, $f2 \n\t" /* A3+B3 a3+b3 */\
690  "sdc1 $f12, 48+" #dst " \n\t" \
691  "sdc1 $f12, 64+" #dst " \n\t" \
692  "sdc1 $f10, 80+" #dst " \n\t"
693 
694  //IDCT( src0, src4, src1, src5, dst, shift)
695  IDCT( 0(%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
696  IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
697  "b 9f \n\t"
698 
699  "# .p2align 4 \n\t"
700  "1: \n\t"
701 
702 #undef IDCT
703 #define IDCT(src0, src4, src1, src5, dst, shift) \
704  "ldc1 $f0, " #src0 " \n\t" /* R4 R0 r4 r0 */\
705  "ldc1 $f2, " #src4 " \n\t" /* R6 R2 r6 r2 */\
706  "ldc1 $f4, " #src1 " \n\t" /* R3 R1 r3 r1 */\
707  "ldc1 $f8, 16(%2) \n\t" /* C4 C4 C4 C4 */\
708  "li $10, " #shift " \n\t" \
709  "pmaddhw $f8, $f8, $f0 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
710  "ldc1 $f10, 24(%2) \n\t" /* -C4 C4 -C4 C4 */\
711  "mtc1 $10, $f18 \n\t" \
712  "pmaddhw $f0, $f0, $f10 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
713  "ldc1 $f10, 32(%2) \n\t" /* C6 C2 C6 C2 */\
714  "pmaddhw $f10, $f10, $f2 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
715  "ldc1 $f12, 40(%2) \n\t" /* -C2 C6 -C2 C6 */\
716  "pmaddhw $f2, $f2, $f12 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
717  "mov.d $f12, $f8 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
718  "ldc1 $f14, 48(%2) \n\t" /* C3 C1 C3 C1 */\
719  "pmaddhw $f14, $f14, $f4 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
720  "paddw $f8, $f8, $f10 \n\t" /* A0 a0 */\
721  "psubw $f12, $f12, $f10 \n\t" /* A3 a3 */\
722  "mov.d $f10, $f0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
723  "paddw $f0, $f0, $f2 \n\t" /* A1 a1 */\
724  "psubw $f10, $f10, $f2 \n\t" /* A2 a2 */\
725  "ldc1 $f2, 64(%2) \n\t" \
726  "pmaddhw $f2, $f2, $f4 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
727  "paddw $f14, $f14, $f8 \n\t" /* A0+B0 a0+b0 */\
728  "paddw $f8, $f8, $f8 \n\t" /* 2A0 2a0 */\
729  "psubw $f8, $f8, $f14 \n\t" /* A0-B0 a0-b0 */\
730  "psraw $f14, $f14, $f18 \n\t" \
731  "psraw $f8, $f8, $f18 \n\t" \
732  "mov.d $f6, $f0 \n\t" /* A1 a1 */\
733  "paddw $f0, $f0, $f2 \n\t" /* A1+B1 a1+b1 */\
734  "psubw $f6, $f6, $f2 \n\t" /* A1-B1 a1-b1 */\
735  "psraw $f0, $f0, $f18 \n\t" \
736  "psraw $f6, $f6, $f18 \n\t" \
737  "packsswh $f14, $f14, $f14 \n\t" /* A0+B0 a0+b0 */\
738  "swc1 $f14, " #dst " \n\t" \
739  "packsswh $f0, $f0, $f0 \n\t" /* A1+B1 a1+b1 */\
740  "swc1 $f0, 16+" #dst " \n\t" \
741  "packsswh $f6, $f6, $f6 \n\t" /* A1-B1 a1-b1 */\
742  "swc1 $f6, 96+" #dst " \n\t" \
743  "packsswh $f8, $f8, $f8 \n\t" /* A0-B0 a0-b0 */\
744  "swc1 $f8, 112+" #dst " \n\t" \
745  "ldc1 $f8, 80(%2) \n\t" /* -C1 C5 -C1 C5 */\
746  "ldc1 $f16, 96(%2) \n\t" \
747  "pmaddhw $f8, $f8, $f4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
748  "pmaddhw $f4, $f4, $f16 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
749  "mov.d $f6, $f10 \n\t" /* A2 a2 */\
750  "paddw $f6, $f6, $f8 \n\t" /* A2+B2 a2+b2 */\
751  "psubw $f10, $f10, $f8 \n\t" /* a2-B2 a2-b2 */\
752  "psraw $f6, $f6, $f18 \n\t" \
753  "psraw $f10, $f10, $f18 \n\t" \
754  "mov.d $f8, $f12 \n\t" /* A3 a3 */\
755  "paddw $f12, $f12, $f4 \n\t" /* A3+B3 a3+b3 */\
756  "psubw $f8, $f8, $f4 \n\t" /* a3-B3 a3-b3 */\
757  "psraw $f12, $f12, $f18 \n\t" \
758  "packsswh $f6, $f6, $f6 \n\t" /* A2+B2 a2+b2 */\
759  "swc1 $f6, 32+" #dst " \n\t" \
760  "psraw $f8, $f8, $f18 \n\t" \
761  "packsswh $f12, $f12, $f12 \n\t" /* A3+B3 a3+b3 */\
762  "swc1 $f12, 48+" #dst " \n\t" \
763  "packsswh $f8, $f8, $f8 \n\t" /* A3-B3 a3-b3 */\
764  "packsswh $f10, $f10, $f10 \n\t" /* A2-B2 a2-b2 */\
765  "swc1 $f8, 64+" #dst " \n\t" \
766  "swc1 $f10, 80+" #dst " \n\t"
767 
768  //IDCT( src0, src4, src1, src5, dst, shift)
769  IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
770  IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
771  IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
772  IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
773  "b 9f \n\t"
774 
775  "# .p2align 4 \n\t"
776  "7: \n\t"
777 
778 #undef IDCT
779 #define IDCT(src0, src4, src1, src5, dst, shift) \
780  "ldc1 $f0, " #src0 " \n\t" /* R4 R0 r4 r0 */\
781  "ldc1 $f8, 16(%2) \n\t" /* C4 C4 C4 C4 */\
782  "li $10, " #shift " \n\t" \
783  "pmaddhw $f8, $f8, $f0 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
784  "mtc1 $10, $f18 \n\t" \
785  "ldc1 $f10, 24(%2) \n\t" /* -C4 C4 -C4 C4 */\
786  "pmaddhw $f0, $f0, $f10 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
787  "psraw $f8, $f8, $f18 \n\t" \
788  "psraw $f0, $f0, $f18 \n\t" \
789  "ldc1 $f4, 8+" #src0 " \n\t" /* R4 R0 r4 r0 */\
790  "ldc1 $f2, 16(%2) \n\t" /* C4 C4 C4 C4 */\
791  "pmaddhw $f2, $f2, $f4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
792  "ldc1 $f14, 24(%2) \n\t" /* -C4 C4 -C4 C4 */\
793  "pmaddhw $f4, $f4, $f14 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
794  "ldc1 $f14, 32(%2) \n\t" /* C6 C2 C6 C2 */\
795  "psraw $f2, $f2, $f18 \n\t" \
796  "packsswh $f8, $f8, $f2 \n\t" /* A0 a0 */\
797  "sdc1 $f8, " #dst " \n\t" \
798  "psraw $f4, $f4, $f18 \n\t" \
799  "packsswh $f0, $f0, $f4 \n\t" /* A1 a1 */\
800  "sdc1 $f0, 16+" #dst " \n\t" \
801  "sdc1 $f0, 96+" #dst " \n\t" \
802  "sdc1 $f8, 112+" #dst " \n\t" \
803  "sdc1 $f0, 32+" #dst " \n\t" \
804  "sdc1 $f8, 48+" #dst " \n\t" \
805  "sdc1 $f8, 64+" #dst " \n\t" \
806  "sdc1 $f0, 80+" #dst " \n\t"
807 
808  //IDCT( src0, src4, src1, src5, dst, shift)
809  IDCT( 0(%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
810  IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
811 
812  "9: \n\t"
813  ::"r"(block),"r"(temp),"r"(coeffs),"m"(ff_wm1010),"m"(ff_d40000)
814  : "$10","$11"
815  );
816 }
const uint64_t ff_wm1010
Definition: constants.c:67
#define DC_COND_IDCT(src0, src4, src1, src5, dst, rounder, rarg, shift)
else temp
Definition: vf_mcdeint.c:256
#define C3
static int16_t block[64]
Definition: dct.c:115
#define C5
#define DECLARE_ALIGNED(n, t, v)
Declare a variable that is aligned in memory.
Definition: mem.h:112
#define C6
#define Z_COND_IDCT(src0, src4, src1, src5, dst, rounder, rarg, shift, bt)
#define C1
#define C2
#define C4
#define IDCT(src0, src4, src1, src5, dst, shift)
const uint64_t ff_d40000
Definition: constants.c:68
#define ROW_SHIFT
void ff_simple_idct_mmi(int16_t *block)
static const int16_t coeffs[]
#define C7