FFmpeg  4.0
generic_macros_msa.h
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2015 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
3  *
4  * This file is part of FFmpeg.
5  *
6  * FFmpeg is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * FFmpeg is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with FFmpeg; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19  */
20 
21 #ifndef AVUTIL_MIPS_GENERIC_MACROS_MSA_H
22 #define AVUTIL_MIPS_GENERIC_MACROS_MSA_H
23 
24 #include <stdint.h>
25 #include <msa.h>
26 
/* Base alignment constant (16). */
#define ALIGNMENT 16
/* Variable attribute requesting an alignment of twice 'align'
   (e.g. ALLOC_ALIGNED(16) asks for 32). */
#define ALLOC_ALIGNED(align) __attribute__ ((aligned(2 * (align))))
29 
/* Description : Load one value of type RTYPE
   Arguments   : Inputs - RTYPE (type of the value to load)
                        - psrc  (source pointer to load from)
   Details     : 'psrc' is cast to 'RTYPE *' and dereferenced.
                 The expansion is fully parenthesized so that it behaves as a
                 single primary expression (postfix operators such as '.' or
                 '[]' apply to the loaded value, not to the pointer).
                 LD_UB .. LD_SW fix RTYPE to the matching MSA vector type.
*/
#define LD_V(RTYPE, psrc) (*((RTYPE *)(psrc)))
#define LD_UB(...) LD_V(v16u8, __VA_ARGS__)
#define LD_SB(...) LD_V(v16i8, __VA_ARGS__)
#define LD_UH(...) LD_V(v8u16, __VA_ARGS__)
#define LD_SH(...) LD_V(v8i16, __VA_ARGS__)
#define LD_UW(...) LD_V(v4u32, __VA_ARGS__)
#define LD_SW(...) LD_V(v4i32, __VA_ARGS__)
37 
/* Description : Store one value of type RTYPE
   Arguments   : Inputs - RTYPE (type used for the store)
                        - in    (value to store)
                        - pdst  (destination pointer to store to)
   Details     : 'pdst' is cast to 'RTYPE *' and 'in' is assigned through it.
                 The whole assignment is parenthesized so that the macro can
                 be used as an operand (e.g. after a cast or in a conditional
                 expression) without changing what is assigned.
                 ST_UB .. ST_SW fix RTYPE to the matching MSA vector type.
*/
#define ST_V(RTYPE, in, pdst) (*((RTYPE *)(pdst)) = (in))
#define ST_UB(...) ST_V(v16u8, __VA_ARGS__)
#define ST_SB(...) ST_V(v16i8, __VA_ARGS__)
#define ST_UH(...) ST_V(v8u16, __VA_ARGS__)
#define ST_SH(...) ST_V(v8i16, __VA_ARGS__)
#define ST_UW(...) ST_V(v4u32, __VA_ARGS__)
#define ST_SW(...) ST_V(v4i32, __VA_ARGS__)
45 
/* Scalar memory access helpers.
   LH / LW / LD evaluate to the 16 / 32 / 64-bit value read from 'psrc'.
   SH / SW / SD store 'val' as a 16 / 32 / 64-bit value at 'pdst'.
   For __mips_isa_rev >= 6 they are plain pointer dereferences; otherwise they
   are built on the ulh / ulw / uld / ush / usw assembler macros.
   In the inline-asm variants the memory operand is typed as an array covering
   every byte the instruction touches (2, 4 or 8), so the compiler knows the
   full extent of the access instead of only the first byte. */
#if (__mips_isa_rev >= 6)
    #define LH(psrc) \
    ( { \
        uint16_t val_lh_m = *(uint16_t *)(psrc); \
        val_lh_m; \
    } )

    #define LW(psrc) \
    ( { \
        uint32_t val_lw_m = *(uint32_t *)(psrc); \
        val_lw_m; \
    } )

    #if (__mips == 64)
        #define LD(psrc) \
        ( { \
            uint64_t val_ld_m = *(uint64_t *)(psrc); \
            val_ld_m; \
        } )
    #else // !(__mips == 64)
        /* Two 32-bit loads: the word at 'psrc' becomes the low half and the
           word at 'psrc + 4' the high half of the result. */
        #define LD(psrc) \
        ( { \
            uint8_t *psrc_ld_m = (uint8_t *) (psrc); \
            uint32_t val0_ld_m, val1_ld_m; \
            uint64_t val_ld_m = 0; \
            \
            val0_ld_m = LW(psrc_ld_m); \
            val1_ld_m = LW(psrc_ld_m + 4); \
            \
            val_ld_m = (uint64_t) (val1_ld_m); \
            val_ld_m = (uint64_t) ((val_ld_m << 32) & 0xFFFFFFFF00000000); \
            val_ld_m = (uint64_t) (val_ld_m | (uint64_t) val0_ld_m); \
            \
            val_ld_m; \
        } )
    #endif // (__mips == 64)

    /* These expand to a complete statement including its ';'. The embedded
       semicolon is kept because invocations may omit their own. */
    #define SH(val, pdst) *(uint16_t *)(pdst) = (val);
    #define SW(val, pdst) *(uint32_t *)(pdst) = (val);
    #define SD(val, pdst) *(uint64_t *)(pdst) = (val);

#else // !(__mips_isa_rev >= 6)
    #define LH(psrc) \
    ( { \
        uint8_t *psrc_lh_m = (uint8_t *) (psrc); \
        uint16_t val_lh_m; \
        \
        __asm__ volatile ( \
            "ulh %[val_lh_m], %[psrc_lh_m] \n\t" \
            \
            : [val_lh_m] "=r" (val_lh_m) \
            : [psrc_lh_m] "m" (*(const uint8_t (*)[2]) psrc_lh_m) \
        ); \
        \
        val_lh_m; \
    } )

    #define LW(psrc) \
    ( { \
        uint8_t *psrc_lw_m = (uint8_t *) (psrc); \
        uint32_t val_lw_m; \
        \
        __asm__ volatile ( \
            "ulw %[val_lw_m], %[psrc_lw_m] \n\t" \
            \
            : [val_lw_m] "=r" (val_lw_m) \
            : [psrc_lw_m] "m" (*(const uint8_t (*)[4]) psrc_lw_m) \
        ); \
        \
        val_lw_m; \
    } )

    #if (__mips == 64)
        #define LD(psrc) \
        ( { \
            uint8_t *psrc_ld_m = (uint8_t *) (psrc); \
            uint64_t val_ld_m = 0; \
            \
            __asm__ volatile ( \
                "uld %[val_ld_m], %[psrc_ld_m] \n\t" \
                \
                : [val_ld_m] "=r" (val_ld_m) \
                : [psrc_ld_m] "m" (*(const uint8_t (*)[8]) psrc_ld_m) \
            ); \
            \
            val_ld_m; \
        } )
    #else // !(__mips == 64)
        /* Two 32-bit loads: the word at 'psrc' becomes the low half and the
           word at 'psrc + 4' the high half of the result. */
        #define LD(psrc) \
        ( { \
            uint8_t *psrc_ld_m = (uint8_t *) (psrc); \
            uint32_t val0_ld_m, val1_ld_m; \
            uint64_t val_ld_m = 0; \
            \
            val0_ld_m = LW(psrc_ld_m); \
            val1_ld_m = LW(psrc_ld_m + 4); \
            \
            val_ld_m = (uint64_t) (val1_ld_m); \
            val_ld_m = (uint64_t) ((val_ld_m << 32) & 0xFFFFFFFF00000000); \
            val_ld_m = (uint64_t) (val_ld_m | (uint64_t) val0_ld_m); \
            \
            val_ld_m; \
        } )
    #endif // (__mips == 64)

    #define SH(val, pdst) \
    { \
        uint8_t *pdst_sh_m = (uint8_t *) (pdst); \
        uint16_t val_sh_m = (val); \
        \
        __asm__ volatile ( \
            "ush %[val_sh_m], %[pdst_sh_m] \n\t" \
            \
            : [pdst_sh_m] "=m" (*(uint8_t (*)[2]) pdst_sh_m) \
            : [val_sh_m] "r" (val_sh_m) \
        ); \
    }

    #define SW(val, pdst) \
    { \
        uint8_t *pdst_sw_m = (uint8_t *) (pdst); \
        uint32_t val_sw_m = (val); \
        \
        __asm__ volatile ( \
            "usw %[val_sw_m], %[pdst_sw_m] \n\t" \
            \
            : [pdst_sw_m] "=m" (*(uint8_t (*)[4]) pdst_sw_m) \
            : [val_sw_m] "r" (val_sw_m) \
        ); \
    }

    /* 'val' is captured once as a 64-bit value, so it is evaluated a single
       time and the shift by 32 is always performed on a 64-bit operand.
       The low word goes to 'pdst', the high word to 'pdst + 4'. */
    #define SD(val, pdst) \
    { \
        uint8_t *pdst_sd_m = (uint8_t *) (pdst); \
        uint64_t val_sd_m = (val); \
        uint32_t val0_sd_m, val1_sd_m; \
        \
        val0_sd_m = (uint32_t) (val_sd_m & 0x00000000FFFFFFFF); \
        val1_sd_m = (uint32_t) ((val_sd_m >> 32) & 0x00000000FFFFFFFF); \
        \
        SW(val0_sd_m, pdst_sd_m); \
        SW(val1_sd_m, pdst_sd_m + 4); \
    }
#endif // (__mips_isa_rev >= 6)
189 
/* Description : Load 4 words with stride
   Arguments   : Inputs  - psrc   (source pointer to load from)
                         - stride (added to psrc between loads; may be any
                                   expression, it is parenthesized here)
                 Outputs - out0, out1, out2, out3
   Details     : Loads word in 'out0' from (psrc)
                 Loads word in 'out1' from (psrc + stride)
                 Loads word in 'out2' from (psrc + 2 * stride)
                 Loads word in 'out3' from (psrc + 3 * stride)
*/
#define LW4(psrc, stride, out0, out1, out2, out3) \
{ \
    out0 = LW((psrc)); \
    out1 = LW((psrc) + (stride)); \
    out2 = LW((psrc) + 2 * (stride)); \
    out3 = LW((psrc) + 3 * (stride)); \
}
206 
/* Load 2 words with stride: 'out0' from (psrc), 'out1' from (psrc + stride).
   'stride' is parenthesized so that any expression can be passed. */
#define LW2(psrc, stride, out0, out1) \
{ \
    out0 = LW((psrc)); \
    out1 = LW((psrc) + (stride)); \
}
212 
/* Description : Load double words with stride
   Arguments   : Inputs  - psrc   (source pointer to load from)
                         - stride (added to psrc between loads; may be any
                                   expression, it is parenthesized here)
                 Outputs - out0, out1 (LD2); out0, out1, out2, out3 (LD4)
   Details     : Loads double word in 'out0' from (psrc)
                 Loads double word in 'out1' from (psrc + stride)
                 LD4 additionally loads 'out2' from (psrc + 2 * stride)
                 and 'out3' from (psrc + 3 * stride)
*/
#define LD2(psrc, stride, out0, out1) \
{ \
    out0 = LD((psrc)); \
    out1 = LD((psrc) + (stride)); \
}
#define LD4(psrc, stride, out0, out1, out2, out3) \
{ \
    LD2((psrc), (stride), out0, out1); \
    LD2((psrc) + 2 * (stride), (stride), out2, out3); \
}
230 
/* Description : Store 4 words with stride
   Arguments   : Inputs - in0, in1, in2, in3, pdst, stride
   Details     : Stores word from 'in0' to (pdst)
                 Stores word from 'in1' to (pdst + stride)
                 Stores word from 'in2' to (pdst + 2 * stride)
                 Stores word from 'in3' to (pdst + 3 * stride)
                 'stride' is parenthesized so that any expression can be
                 passed, and every SW invocation is terminated explicitly
                 rather than relying on the shape of SW's expansion.
*/
#define SW4(in0, in1, in2, in3, pdst, stride) \
{ \
    SW(in0, (pdst)); \
    SW(in1, (pdst) + (stride)); \
    SW(in2, (pdst) + 2 * (stride)); \
    SW(in3, (pdst) + 3 * (stride)); \
}
245 
/* Description : Store 4 double words with stride
   Arguments   : Inputs - in0, in1, in2, in3, pdst, stride
   Details     : Stores double word from 'in0' to (pdst)
                 Stores double word from 'in1' to (pdst + stride)
                 Stores double word from 'in2' to (pdst + 2 * stride)
                 Stores double word from 'in3' to (pdst + 3 * stride)
                 'stride' is parenthesized so that any expression can be
                 passed, and every SD invocation is terminated explicitly
                 rather than relying on the shape of SD's expansion.
*/
#define SD4(in0, in1, in2, in3, pdst, stride) \
{ \
    SD(in0, (pdst)); \
    SD(in1, (pdst) + (stride)); \
    SD(in2, (pdst) + 2 * (stride)); \
    SD(in3, (pdst) + 3 * (stride)); \
}
260 
/* Description : Load vector elements with stride
   Arguments   : Inputs  - psrc   (source pointer to load from)
                         - stride (added to psrc between loads; may be any
                                   expression, it is parenthesized here)
                 Outputs - out0, out1, ... (2 to 16 outputs depending on the
                           macro)
                 Return Type - as per RTYPE
   Details     : Loads elements in 'out0' from (psrc)
                 Loads elements in 'out1' from (psrc + stride)
                 and, for the wider variants, 'outN' from (psrc + N * stride).
                 The wider variants are composed from the narrower ones.
*/
#define LD_V2(RTYPE, psrc, stride, out0, out1) \
{ \
    out0 = LD_V(RTYPE, (psrc)); \
    out1 = LD_V(RTYPE, (psrc) + (stride)); \
}
#define LD_UB2(...) LD_V2(v16u8, __VA_ARGS__)
#define LD_SB2(...) LD_V2(v16i8, __VA_ARGS__)
#define LD_UH2(...) LD_V2(v8u16, __VA_ARGS__)
#define LD_SH2(...) LD_V2(v8i16, __VA_ARGS__)
#define LD_SW2(...) LD_V2(v4i32, __VA_ARGS__)

#define LD_V3(RTYPE, psrc, stride, out0, out1, out2) \
{ \
    LD_V2(RTYPE, (psrc), (stride), out0, out1); \
    out2 = LD_V(RTYPE, (psrc) + 2 * (stride)); \
}
#define LD_UB3(...) LD_V3(v16u8, __VA_ARGS__)
#define LD_SB3(...) LD_V3(v16i8, __VA_ARGS__)

#define LD_V4(RTYPE, psrc, stride, out0, out1, out2, out3) \
{ \
    LD_V2(RTYPE, (psrc), (stride), out0, out1); \
    LD_V2(RTYPE, (psrc) + 2 * (stride), (stride), out2, out3); \
}
#define LD_UB4(...) LD_V4(v16u8, __VA_ARGS__)
#define LD_SB4(...) LD_V4(v16i8, __VA_ARGS__)
#define LD_UH4(...) LD_V4(v8u16, __VA_ARGS__)
#define LD_SH4(...) LD_V4(v8i16, __VA_ARGS__)

#define LD_V5(RTYPE, psrc, stride, out0, out1, out2, out3, out4) \
{ \
    LD_V4(RTYPE, (psrc), (stride), out0, out1, out2, out3); \
    out4 = LD_V(RTYPE, (psrc) + 4 * (stride)); \
}
#define LD_UB5(...) LD_V5(v16u8, __VA_ARGS__)
#define LD_SB5(...) LD_V5(v16i8, __VA_ARGS__)

#define LD_V6(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5) \
{ \
    LD_V4(RTYPE, (psrc), (stride), out0, out1, out2, out3); \
    LD_V2(RTYPE, (psrc) + 4 * (stride), (stride), out4, out5); \
}
#define LD_UB6(...) LD_V6(v16u8, __VA_ARGS__)
#define LD_SB6(...) LD_V6(v16i8, __VA_ARGS__)
#define LD_UH6(...) LD_V6(v8u16, __VA_ARGS__)
#define LD_SH6(...) LD_V6(v8i16, __VA_ARGS__)

#define LD_V7(RTYPE, psrc, stride, \
              out0, out1, out2, out3, out4, out5, out6) \
{ \
    LD_V5(RTYPE, (psrc), (stride), out0, out1, out2, out3, out4); \
    LD_V2(RTYPE, (psrc) + 5 * (stride), (stride), out5, out6); \
}
#define LD_UB7(...) LD_V7(v16u8, __VA_ARGS__)
#define LD_SB7(...) LD_V7(v16i8, __VA_ARGS__)

#define LD_V8(RTYPE, psrc, stride, \
              out0, out1, out2, out3, out4, out5, out6, out7) \
{ \
    LD_V4(RTYPE, (psrc), (stride), out0, out1, out2, out3); \
    LD_V4(RTYPE, (psrc) + 4 * (stride), (stride), out4, out5, out6, out7); \
}
#define LD_UB8(...) LD_V8(v16u8, __VA_ARGS__)
#define LD_SB8(...) LD_V8(v16i8, __VA_ARGS__)
#define LD_UH8(...) LD_V8(v8u16, __VA_ARGS__)
#define LD_SH8(...) LD_V8(v8i16, __VA_ARGS__)

#define LD_V16(RTYPE, psrc, stride, \
               out0, out1, out2, out3, out4, out5, out6, out7, \
               out8, out9, out10, out11, out12, out13, out14, out15) \
{ \
    LD_V8(RTYPE, (psrc), (stride), \
          out0, out1, out2, out3, out4, out5, out6, out7); \
    LD_V8(RTYPE, (psrc) + 8 * (stride), (stride), \
          out8, out9, out10, out11, out12, out13, out14, out15); \
}
#define LD_SH16(...) LD_V16(v8i16, __VA_ARGS__)
346 
/* Description : Load as 4x4 block of signed halfword elements from 1D source
                 data into 4 vectors (Each vector with 4 signed halfwords)
   Arguments   : Inputs  - psrc
                 Outputs - out0, out1, out2, out3
   Details     : 'out0' is loaded from (psrc) and 'out2' from (psrc + 8),
                 the offset being in units of psrc's pointee type.
                 'out1' and 'out3' are produced from 'out0' and 'out2'
                 respectively with __msa_ilvl_d(x, x).
                 'psrc' is parenthesized so that any pointer expression can
                 be passed; note that it is evaluated twice.
*/
#define LD4x4_SH(psrc, out0, out1, out2, out3) \
{ \
    out0 = LD_SH((psrc)); \
    out2 = LD_SH((psrc) + 8); \
    out1 = (v8i16) __msa_ilvl_d((v2i64) (out0), (v2i64) (out0)); \
    out3 = (v8i16) __msa_ilvl_d((v2i64) (out2), (v2i64) (out2)); \
}
359 
/* Description : Store vectors with stride
   Arguments   : Inputs  - in0, in1, ... (2 to 8 inputs depending on the macro)
                         - stride (added to pdst between stores; may be any
                                   expression, it is parenthesized here)
                 Outputs - pdst (destination pointer to store to)
   Details     : Stores elements from 'in0' to (pdst)
                 Stores elements from 'in1' to (pdst + stride)
                 and, for the wider variants, 'inN' to (pdst + N * stride).
                 The wider variants are composed from the narrower ones.
*/
#define ST_V2(RTYPE, in0, in1, pdst, stride) \
{ \
    ST_V(RTYPE, in0, (pdst)); \
    ST_V(RTYPE, in1, (pdst) + (stride)); \
}
#define ST_UB2(...) ST_V2(v16u8, __VA_ARGS__)
#define ST_SB2(...) ST_V2(v16i8, __VA_ARGS__)
#define ST_UH2(...) ST_V2(v8u16, __VA_ARGS__)
#define ST_SH2(...) ST_V2(v8i16, __VA_ARGS__)
#define ST_SW2(...) ST_V2(v4i32, __VA_ARGS__)

#define ST_V4(RTYPE, in0, in1, in2, in3, pdst, stride) \
{ \
    ST_V2(RTYPE, in0, in1, (pdst), (stride)); \
    ST_V2(RTYPE, in2, in3, (pdst) + 2 * (stride), (stride)); \
}
#define ST_UB4(...) ST_V4(v16u8, __VA_ARGS__)
#define ST_SB4(...) ST_V4(v16i8, __VA_ARGS__)
#define ST_SH4(...) ST_V4(v8i16, __VA_ARGS__)
#define ST_SW4(...) ST_V4(v4i32, __VA_ARGS__)

#define ST_V6(RTYPE, in0, in1, in2, in3, in4, in5, pdst, stride) \
{ \
    ST_V4(RTYPE, in0, in1, in2, in3, (pdst), (stride)); \
    ST_V2(RTYPE, in4, in5, (pdst) + 4 * (stride), (stride)); \
}
#define ST_SH6(...) ST_V6(v8i16, __VA_ARGS__)

#define ST_V8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) \
{ \
    ST_V4(RTYPE, in0, in1, in2, in3, (pdst), (stride)); \
    ST_V4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * (stride), (stride)); \
}
#define ST_UB8(...) ST_V8(v16u8, __VA_ARGS__)
#define ST_SH8(...) ST_V8(v8i16, __VA_ARGS__)
#define ST_SW8(...) ST_V8(v4i32, __VA_ARGS__)
402 
/* Description : Store as 2x4 byte block to destination memory from input vector
   Arguments   : Inputs - in, stidx, pdst, stride
   Details     : Index stidx halfword element from 'in' vector is copied and
                 stored on first line (pdst)
                 Index stidx+1 halfword element from 'in' vector is copied and
                 stored on second line (pdst + stride)
                 Index stidx+2 halfword element from 'in' vector is copied and
                 stored on third line (pdst + 2 * stride)
                 Index stidx+3 halfword element from 'in' vector is copied and
                 stored on fourth line (pdst + 3 * stride)
                 'stride' is applied to a byte pointer. All arguments are
                 parenthesized so that expressions can be passed.
*/
#define ST2x4_UB(in, stidx, pdst, stride) \
{ \
    uint16_t out0_m, out1_m, out2_m, out3_m; \
    uint8_t *pblk_2x4_m = (uint8_t *) (pdst); \
    \
    out0_m = __msa_copy_u_h((v8i16) (in), (stidx)); \
    out1_m = __msa_copy_u_h((v8i16) (in), (stidx) + 1); \
    out2_m = __msa_copy_u_h((v8i16) (in), (stidx) + 2); \
    out3_m = __msa_copy_u_h((v8i16) (in), (stidx) + 3); \
    \
    SH(out0_m, pblk_2x4_m); \
    SH(out1_m, pblk_2x4_m + (stride)); \
    SH(out2_m, pblk_2x4_m + 2 * (stride)); \
    SH(out3_m, pblk_2x4_m + 3 * (stride)); \
}
430 
/* Description : Store as 4x2 byte block to destination memory from input vector
   Arguments   : Inputs - in, pdst, stride
   Details     : Index 0 word element from input vector is copied and stored
                 on first line (pdst)
                 Index 1 word element from input vector is copied and stored
                 on second line (pdst + stride)
                 'stride' is applied to a byte pointer. 'in' and 'stride' are
                 parenthesized so that expressions can be passed.
*/
#define ST4x2_UB(in, pdst, stride) \
{ \
    uint32_t out0_m, out1_m; \
    uint8_t *pblk_4x2_m = (uint8_t *) (pdst); \
    \
    out0_m = __msa_copy_u_w((v4i32) (in), 0); \
    out1_m = __msa_copy_u_w((v4i32) (in), 1); \
    \
    SW(out0_m, pblk_4x2_m); \
    SW(out1_m, pblk_4x2_m + (stride)); \
}
450 
/* Description : Store as 4x4 byte block to destination memory from input vector
   Arguments   : Inputs - in0, in1, idx0, idx1, idx2, idx3, pdst, stride
   Details     : Idx0 word element from input vector 'in0' is copied and stored
                 on first line
                 Idx1 word element from input vector 'in0' is copied and stored
                 on second line
                 Idx2 word element from input vector 'in1' is copied and stored
                 on third line
                 Idx3 word element from input vector 'in1' is copied and stored
                 on fourth line
                 Lines are 'stride' bytes apart. Vector, index and stride
                 arguments are parenthesized so that expressions can be passed.
*/
#define ST4x4_UB(in0, in1, idx0, idx1, idx2, idx3, pdst, stride) \
{ \
    uint32_t out0_m, out1_m, out2_m, out3_m; \
    uint8_t *pblk_4x4_m = (uint8_t *) (pdst); \
    \
    out0_m = __msa_copy_u_w((v4i32) (in0), (idx0)); \
    out1_m = __msa_copy_u_w((v4i32) (in0), (idx1)); \
    out2_m = __msa_copy_u_w((v4i32) (in1), (idx2)); \
    out3_m = __msa_copy_u_w((v4i32) (in1), (idx3)); \
    \
    SW4(out0_m, out1_m, out2_m, out3_m, pblk_4x4_m, (stride)); \
}
/* Store as 4x8 byte block: word elements 0..3 of 'in0' go to the first four
   lines and word elements 0..3 of 'in1' to the next four lines, with lines
   'stride' bytes apart. */
#define ST4x8_UB(in0, in1, pdst, stride) \
{ \
    uint8_t *pblk_4x8 = (uint8_t *) (pdst); \
    \
    ST4x4_UB((in0), (in0), 0, 1, 2, 3, pblk_4x8, (stride)); \
    ST4x4_UB((in1), (in1), 0, 1, 2, 3, pblk_4x8 + 4 * (stride), (stride)); \
}
482 
/* Description : Store as 6x4 byte block to destination memory from input
                 vectors
   Arguments   : Inputs - in0, in1, pdst, stride
   Details     : Index 0 word element from input vector 'in0' is copied and
                 stored on first line followed by index 2 halfword element
                 Index 2 word element from input vector 'in0' is copied and
                 stored on second line followed by index 6 halfword element
                 Index 0 word element from input vector 'in1' is copied and
                 stored on third line followed by index 2 halfword element
                 Index 2 word element from input vector 'in1' is copied and
                 stored on fourth line followed by index 6 halfword element
                 On each line the halfword is written 4 bytes after the word;
                 lines are 'stride' bytes apart.
*/
#define ST6x4_UB(in0, in1, pdst, stride) \
{ \
    uint32_t out0_m, out1_m, out2_m, out3_m; \
    uint16_t out4_m, out5_m, out6_m, out7_m; \
    uint8_t *pblk_6x4_m = (uint8_t *) (pdst); \
    \
    out0_m = __msa_copy_u_w((v4i32) (in0), 0); \
    out1_m = __msa_copy_u_w((v4i32) (in0), 2); \
    out2_m = __msa_copy_u_w((v4i32) (in1), 0); \
    out3_m = __msa_copy_u_w((v4i32) (in1), 2); \
    \
    out4_m = __msa_copy_u_h((v8i16) (in0), 2); \
    out5_m = __msa_copy_u_h((v8i16) (in0), 6); \
    out6_m = __msa_copy_u_h((v8i16) (in1), 2); \
    out7_m = __msa_copy_u_h((v8i16) (in1), 6); \
    \
    SW(out0_m, pblk_6x4_m); \
    SH(out4_m, (pblk_6x4_m + 4)); \
    pblk_6x4_m += (stride); \
    SW(out1_m, pblk_6x4_m); \
    SH(out5_m, (pblk_6x4_m + 4)); \
    pblk_6x4_m += (stride); \
    SW(out2_m, pblk_6x4_m); \
    SH(out6_m, (pblk_6x4_m + 4)); \
    pblk_6x4_m += (stride); \
    SW(out3_m, pblk_6x4_m); \
    SH(out7_m, (pblk_6x4_m + 4)); \
}
524 
/* Description : Store as 8x1 byte block to destination memory from input vector
   Arguments   : Inputs - in, pdst
   Details     : Index 0 double word element from input vector 'in' is copied
                 and stored to destination memory at (pdst).
                 'in' and 'pdst' are parenthesized so that expressions can be
                 passed.
*/
#define ST8x1_UB(in, pdst) \
{ \
    uint64_t out0_m; \
    out0_m = __msa_copy_u_d((v2i64) (in), 0); \
    SD(out0_m, (pdst)); \
}
536 
/* Description : Store as 8x2 byte block to destination memory from input vector
   Arguments   : Inputs - in, pdst, stride
   Details     : Index 0 double word element from input vector 'in' is copied
                 and stored to destination memory at (pdst)
                 Index 1 double word element from input vector 'in' is copied
                 and stored to destination memory at (pdst + stride)
                 'stride' is applied to a byte pointer. 'in' and 'stride' are
                 parenthesized so that expressions can be passed.
*/
#define ST8x2_UB(in, pdst, stride) \
{ \
    uint64_t out0_m, out1_m; \
    uint8_t *pblk_8x2_m = (uint8_t *) (pdst); \
    \
    out0_m = __msa_copy_u_d((v2i64) (in), 0); \
    out1_m = __msa_copy_u_d((v2i64) (in), 1); \
    \
    SD(out0_m, pblk_8x2_m); \
    SD(out1_m, pblk_8x2_m + (stride)); \
}
555 
/* Description : Store as 8x4 byte block to destination memory from input
                 vectors
   Arguments   : Inputs - in0, in1, pdst, stride
   Details     : Index 0 double word element from input vector 'in0' is copied
                 and stored to destination memory at (pblk_8x4_m)
                 Index 1 double word element from input vector 'in0' is copied
                 and stored to destination memory at (pblk_8x4_m + stride)
                 Index 0 double word element from input vector 'in1' is copied
                 and stored to destination memory at (pblk_8x4_m + 2 * stride)
                 Index 1 double word element from input vector 'in1' is copied
                 and stored to destination memory at (pblk_8x4_m + 3 * stride)
                 pblk_8x4_m is 'pdst' viewed as a byte pointer. Vector and
                 stride arguments are parenthesized so that expressions can be
                 passed.
*/
#define ST8x4_UB(in0, in1, pdst, stride) \
{ \
    uint64_t out0_m, out1_m, out2_m, out3_m; \
    uint8_t *pblk_8x4_m = (uint8_t *) (pdst); \
    \
    out0_m = __msa_copy_u_d((v2i64) (in0), 0); \
    out1_m = __msa_copy_u_d((v2i64) (in0), 1); \
    out2_m = __msa_copy_u_d((v2i64) (in1), 0); \
    out3_m = __msa_copy_u_d((v2i64) (in1), 1); \
    \
    SD4(out0_m, out1_m, out2_m, out3_m, pblk_8x4_m, (stride)); \
}
/* Store as 8x8 byte block: 'in0', 'in1' fill the first four lines and
   'in2', 'in3' the next four, each pair handled as in ST8x4_UB. */
#define ST8x8_UB(in0, in1, in2, in3, pdst, stride) \
{ \
    uint8_t *pblk_8x8_m = (uint8_t *) (pdst); \
    \
    ST8x4_UB((in0), (in1), pblk_8x8_m, (stride)); \
    ST8x4_UB((in2), (in3), pblk_8x8_m + 4 * (stride), (stride)); \
}
/* Store as 12x4 byte block: an 8x4 block from 'in0', 'in1' at (pdst) and a
   4x4 block from word elements 0..3 of 'in2' at (pdst + 8). */
#define ST12x4_UB(in0, in1, in2, pdst, stride) \
{ \
    uint8_t *pblk_12x4_m = (uint8_t *) (pdst); \
    \
    /* left 8x4 */ \
    ST8x4_UB((in0), (in1), pblk_12x4_m, (stride)); \
    /* right 4x4 */ \
    ST4x4_UB((in2), (in2), 0, 1, 2, 3, pblk_12x4_m + 8, (stride)); \
}
596 
/* Description : Store as 12x8 byte block to destination memory from
                 input vectors
   Arguments   : Inputs - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride
   Details     : Index 0 double word element from input vector 'in0' is copied
                 and stored to destination memory at (pblk_12x8_m) followed by
                 index 2 word element from same input vector 'in0' at
                 (pblk_12x8_m + 8)
                 Similar to remaining lines; pblk_12x8_m starts at 'pdst' and
                 advances by 'stride' bytes per line.
                 Vector and stride arguments are parenthesized so that
                 expressions can be passed.
*/
#define ST12x8_UB(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) \
{ \
    uint64_t out0_m, out1_m, out2_m, out3_m; \
    uint64_t out4_m, out5_m, out6_m, out7_m; \
    uint32_t out8_m, out9_m, out10_m, out11_m; \
    uint32_t out12_m, out13_m, out14_m, out15_m; \
    uint8_t *pblk_12x8_m = (uint8_t *) (pdst); \
    \
    out0_m = __msa_copy_u_d((v2i64) (in0), 0); \
    out1_m = __msa_copy_u_d((v2i64) (in1), 0); \
    out2_m = __msa_copy_u_d((v2i64) (in2), 0); \
    out3_m = __msa_copy_u_d((v2i64) (in3), 0); \
    out4_m = __msa_copy_u_d((v2i64) (in4), 0); \
    out5_m = __msa_copy_u_d((v2i64) (in5), 0); \
    out6_m = __msa_copy_u_d((v2i64) (in6), 0); \
    out7_m = __msa_copy_u_d((v2i64) (in7), 0); \
    \
    out8_m = __msa_copy_u_w((v4i32) (in0), 2); \
    out9_m = __msa_copy_u_w((v4i32) (in1), 2); \
    out10_m = __msa_copy_u_w((v4i32) (in2), 2); \
    out11_m = __msa_copy_u_w((v4i32) (in3), 2); \
    out12_m = __msa_copy_u_w((v4i32) (in4), 2); \
    out13_m = __msa_copy_u_w((v4i32) (in5), 2); \
    out14_m = __msa_copy_u_w((v4i32) (in6), 2); \
    out15_m = __msa_copy_u_w((v4i32) (in7), 2); \
    \
    SD(out0_m, pblk_12x8_m); \
    SW(out8_m, pblk_12x8_m + 8); \
    pblk_12x8_m += (stride); \
    SD(out1_m, pblk_12x8_m); \
    SW(out9_m, pblk_12x8_m + 8); \
    pblk_12x8_m += (stride); \
    SD(out2_m, pblk_12x8_m); \
    SW(out10_m, pblk_12x8_m + 8); \
    pblk_12x8_m += (stride); \
    SD(out3_m, pblk_12x8_m); \
    SW(out11_m, pblk_12x8_m + 8); \
    pblk_12x8_m += (stride); \
    SD(out4_m, pblk_12x8_m); \
    SW(out12_m, pblk_12x8_m + 8); \
    pblk_12x8_m += (stride); \
    SD(out5_m, pblk_12x8_m); \
    SW(out13_m, pblk_12x8_m + 8); \
    pblk_12x8_m += (stride); \
    SD(out6_m, pblk_12x8_m); \
    SW(out14_m, pblk_12x8_m + 8); \
    pblk_12x8_m += (stride); \
    SD(out7_m, pblk_12x8_m); \
    SW(out15_m, pblk_12x8_m + 8); \
}
656 
/* Description : average with rounding (in0 + in1 + 1) / 2.
   Arguments   : Inputs  - in0, in1, in2, in3 (AVER_UB4: in0 .. in7)
                 Outputs - out0, out1 (AVER_UB4: out0 .. out3)
                 Return Type - as per RTYPE
   Details     : Each byte element from 'in0' vector is added with each byte
                 element from 'in1' vector. The addition of the elements plus 1
                 (for rounding) is done unsigned with full precision,
                 i.e. the result has one extra bit. Unsigned division by 2
                 (or logical shift right by one bit) is performed before writing
                 the result to vector 'out0'
                 Similar for the pair of 'in2' and 'in3'
                 Input arguments are parenthesized before being cast so that
                 expressions can be passed.
*/
#define AVER_UB2(RTYPE, in0, in1, in2, in3, out0, out1) \
{ \
    out0 = (RTYPE) __msa_aver_u_b((v16u8) (in0), (v16u8) (in1)); \
    out1 = (RTYPE) __msa_aver_u_b((v16u8) (in2), (v16u8) (in3)); \
}
#define AVER_UB2_UB(...) AVER_UB2(v16u8, __VA_ARGS__)

#define AVER_UB4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
                 out0, out1, out2, out3) \
{ \
    AVER_UB2(RTYPE, in0, in1, in2, in3, out0, out1); \
    AVER_UB2(RTYPE, in4, in5, in6, in7, out2, out3); \
}
#define AVER_UB4_UB(...) AVER_UB4(v16u8, __VA_ARGS__)
683 
/* Description : Immediate number of columns to slide with zero
   Arguments   : Inputs  - in0, in1, slide_val (SLDI_B3_0: in0 .. in2,
                           SLDI_B4_0: in0 .. in3)
                 Outputs - out0, out1 (one output per input)
                 Return Type - as per RTYPE
   Details     : Byte elements from 'zero_m' vector are slid into 'in0' by
                 number of elements specified by 'slide_val'
                 Similar for the remaining inputs.
                 Input arguments are parenthesized before being cast so that
                 expressions can be passed.
*/
#define SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val) \
{ \
    v16i8 zero_m = { 0 }; \
    out0 = (RTYPE) __msa_sldi_b((v16i8) zero_m, (v16i8) (in0), (slide_val)); \
    out1 = (RTYPE) __msa_sldi_b((v16i8) zero_m, (v16i8) (in1), (slide_val)); \
}
#define SLDI_B2_0_UB(...) SLDI_B2_0(v16u8, __VA_ARGS__)
#define SLDI_B2_0_SB(...) SLDI_B2_0(v16i8, __VA_ARGS__)
#define SLDI_B2_0_SW(...) SLDI_B2_0(v4i32, __VA_ARGS__)

#define SLDI_B3_0(RTYPE, in0, in1, in2, out0, out1, out2, slide_val) \
{ \
    v16i8 zero_m = { 0 }; \
    SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val); \
    out2 = (RTYPE) __msa_sldi_b((v16i8) zero_m, (v16i8) (in2), (slide_val)); \
}
#define SLDI_B3_0_UB(...) SLDI_B3_0(v16u8, __VA_ARGS__)
#define SLDI_B3_0_SB(...) SLDI_B3_0(v16i8, __VA_ARGS__)

#define SLDI_B4_0(RTYPE, in0, in1, in2, in3, \
                  out0, out1, out2, out3, slide_val) \
{ \
    SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val); \
    SLDI_B2_0(RTYPE, in2, in3, out2, out3, slide_val); \
}
#define SLDI_B4_0_UB(...) SLDI_B4_0(v16u8, __VA_ARGS__)
#define SLDI_B4_0_SB(...) SLDI_B4_0(v16i8, __VA_ARGS__)
#define SLDI_B4_0_SH(...) SLDI_B4_0(v8i16, __VA_ARGS__)
719 
/* Description : Immediate number of columns to slide
   Arguments   : Inputs  - in0_0, in0_1, in1_0, in1_1, slide_val
                           (SLDI_B3 adds in0_2 and in1_2)
                 Outputs - out0, out1 (SLDI_B3 adds out2)
                 Return Type - as per RTYPE
   Details     : Byte elements from 'in0_0' vector are slid into 'in1_0' by
                 number of elements specified by 'slide_val'
                 Similar for the pairs (in0_1, in1_1) and (in0_2, in1_2).
                 Input arguments are parenthesized before being cast so that
                 expressions can be passed.
*/
#define SLDI_B2(RTYPE, in0_0, in0_1, in1_0, in1_1, out0, out1, slide_val) \
{ \
    out0 = (RTYPE) __msa_sldi_b((v16i8) (in0_0), (v16i8) (in1_0), \
                                (slide_val)); \
    out1 = (RTYPE) __msa_sldi_b((v16i8) (in0_1), (v16i8) (in1_1), \
                                (slide_val)); \
}
#define SLDI_B2_UB(...) SLDI_B2(v16u8, __VA_ARGS__)
#define SLDI_B2_SB(...) SLDI_B2(v16i8, __VA_ARGS__)
#define SLDI_B2_SH(...) SLDI_B2(v8i16, __VA_ARGS__)

#define SLDI_B3(RTYPE, in0_0, in0_1, in0_2, in1_0, in1_1, in1_2, \
                out0, out1, out2, slide_val) \
{ \
    SLDI_B2(RTYPE, in0_0, in0_1, in1_0, in1_1, out0, out1, slide_val); \
    out2 = (RTYPE) __msa_sldi_b((v16i8) (in0_2), (v16i8) (in1_2), \
                                (slide_val)); \
}
#define SLDI_B3_SB(...) SLDI_B3(v16i8, __VA_ARGS__)
#define SLDI_B3_UH(...) SLDI_B3(v8u16, __VA_ARGS__)
744 
/* Description : Shuffle byte vector elements as per mask vector
   Arguments   : Inputs  - in0, in1, in2, in3, mask0, mask1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Selective byte elements from in0 & in1 are copied to out0 as
                 per control vector mask0
                 Selective byte elements from in2 & in3 are copied to out1 as
                 per control vector mask1
                 VSHF_B3 adds a third (in4, in5, mask2) -> out2 shuffle.
                 VSHF_B4 shuffles the same pair (in0, in1) with four masks
                 mask0 .. mask3 into out0 .. out3.
                 Input arguments are parenthesized before being cast so that
                 expressions can be passed.
*/
#define VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1) \
{ \
    out0 = (RTYPE) __msa_vshf_b((v16i8) (mask0), (v16i8) (in1), \
                                (v16i8) (in0)); \
    out1 = (RTYPE) __msa_vshf_b((v16i8) (mask1), (v16i8) (in3), \
                                (v16i8) (in2)); \
}
#define VSHF_B2_UB(...) VSHF_B2(v16u8, __VA_ARGS__)
#define VSHF_B2_SB(...) VSHF_B2(v16i8, __VA_ARGS__)
#define VSHF_B2_UH(...) VSHF_B2(v8u16, __VA_ARGS__)
#define VSHF_B2_SH(...) VSHF_B2(v8i16, __VA_ARGS__)

#define VSHF_B3(RTYPE, in0, in1, in2, in3, in4, in5, mask0, mask1, mask2, \
                out0, out1, out2) \
{ \
    VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1); \
    out2 = (RTYPE) __msa_vshf_b((v16i8) (mask2), (v16i8) (in5), \
                                (v16i8) (in4)); \
}
#define VSHF_B3_SB(...) VSHF_B3(v16i8, __VA_ARGS__)

#define VSHF_B4(RTYPE, in0, in1, mask0, mask1, mask2, mask3, \
                out0, out1, out2, out3) \
{ \
    VSHF_B2(RTYPE, in0, in1, in0, in1, mask0, mask1, out0, out1); \
    VSHF_B2(RTYPE, in0, in1, in0, in1, mask2, mask3, out2, out3); \
}
#define VSHF_B4_SB(...) VSHF_B4(v16i8, __VA_ARGS__)
#define VSHF_B4_SH(...) VSHF_B4(v8i16, __VA_ARGS__)
780 
/* Description : Shuffle halfword vector elements as per mask vector
   Arguments   : Inputs  - in0, in1, in2, in3, mask0, mask1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Selective halfword elements from in0 & in1 are copied to out0
                 as per control vector mask0
                 Selective halfword elements from in2 & in3 are copied to out1
                 as per control vector mask1
                 VSHF_H3 adds a third (in4, in5, mask2) -> out2 shuffle.
                 Input arguments are parenthesized before being cast so that
                 expressions can be passed.
*/
#define VSHF_H2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1) \
{ \
    out0 = (RTYPE) __msa_vshf_h((v8i16) (mask0), (v8i16) (in1), \
                                (v8i16) (in0)); \
    out1 = (RTYPE) __msa_vshf_h((v8i16) (mask1), (v8i16) (in3), \
                                (v8i16) (in2)); \
}
#define VSHF_H2_SH(...) VSHF_H2(v8i16, __VA_ARGS__)

#define VSHF_H3(RTYPE, in0, in1, in2, in3, in4, in5, mask0, mask1, mask2, \
                out0, out1, out2) \
{ \
    VSHF_H2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1); \
    out2 = (RTYPE) __msa_vshf_h((v8i16) (mask2), (v8i16) (in5), \
                                (v8i16) (in4)); \
}
#define VSHF_H3_SH(...) VSHF_H3(v8i16, __VA_ARGS__)
804 
/* Description : Shuffle word vector elements as per mask vector
   Arguments   : Inputs  - in0, in1, in2, in3, mask0, mask1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Selective word elements from in0 & in1 are copied to out0 as
                 per control vector mask0
                 Selective word elements from in2 & in3 are copied to out1 as
                 per control vector mask1
                 Input arguments are parenthesized before being cast so that
                 expressions can be passed.
*/
#define VSHF_W2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1) \
{ \
    out0 = (RTYPE) __msa_vshf_w((v4i32) (mask0), (v4i32) (in1), \
                                (v4i32) (in0)); \
    out1 = (RTYPE) __msa_vshf_w((v4i32) (mask1), (v4i32) (in3), \
                                (v4i32) (in2)); \
}
#define VSHF_W2_SB(...) VSHF_W2(v16i8, __VA_ARGS__)
820 
/* Description : Dot product of byte vector elements
   Arguments   : Inputs  - mult0, mult1
                           cnst0, cnst1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Unsigned byte elements from mult0 are multiplied with
                 unsigned byte elements from cnst0 producing a result
                 twice the size of input i.e. unsigned halfword.
                 Then this multiplication results of adjacent odd-even elements
                 are added together and stored to the out vector
                 Similar for mult1 and cnst1; DOTP_UB4 handles four
                 (mult, cnst) pairs.
                 Input arguments are parenthesized before being cast so that
                 expressions can be passed.
*/
#define DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \
{ \
    out0 = (RTYPE) __msa_dotp_u_h((v16u8) (mult0), (v16u8) (cnst0)); \
    out1 = (RTYPE) __msa_dotp_u_h((v16u8) (mult1), (v16u8) (cnst1)); \
}
#define DOTP_UB2_UH(...) DOTP_UB2(v8u16, __VA_ARGS__)

#define DOTP_UB4(RTYPE, mult0, mult1, mult2, mult3, \
                 cnst0, cnst1, cnst2, cnst3, \
                 out0, out1, out2, out3) \
{ \
    DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \
    DOTP_UB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \
}
#define DOTP_UB4_UH(...) DOTP_UB4(v8u16, __VA_ARGS__)
848 
849 /* Description : Dot product of byte vector elements
850  Arguments : Inputs - mult0, mult1
851  cnst0, cnst1
852  Outputs - out0, out1
853  Return Type - as per RTYPE
854  Details : Signed byte elements from mult0 are multiplied with
855  signed byte elements from cnst0 producing a result
856  twice the size of input i.e. signed halfword.
857  Then this multiplication results of adjacent odd-even elements
858  are added together and stored to the out vector
859  (2 signed halfword results)
860 */
861 #define DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \
862 { \
863  out0 = (RTYPE) __msa_dotp_s_h((v16i8) mult0, (v16i8) cnst0); \
864  out1 = (RTYPE) __msa_dotp_s_h((v16i8) mult1, (v16i8) cnst1); \
865 }
866 #define DOTP_SB2_SH(...) DOTP_SB2(v8i16, __VA_ARGS__)
867 
/* Three-output form of DOTP_SB2: the first two outputs come from DOTP_SB2,
   the third is computed directly from 'mult2' and 'cnst2'. */
#define DOTP_SB3(RTYPE, mult0, mult1, mult2, cnst0, cnst1, cnst2, \
                 out0, out1, out2) \
{ \
    DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \
    out2 = (RTYPE) __msa_dotp_s_h((v16i8) (mult2), (v16i8) (cnst2)); \
}
#define DOTP_SB3_SH(...) DOTP_SB3(v8i16, __VA_ARGS__)
875 
/* Four-output form of DOTP_SB2: the (mult0, mult1) pair is processed first,
   then the (mult2, mult3) pair. */
#define DOTP_SB4(RTYPE, mult0, mult1, mult2, mult3, \
                 cnst0, cnst1, cnst2, cnst3, \
                 out0, out1, out2, out3) \
{ \
    DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \
    DOTP_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \
}
#define DOTP_SB4_SH(...) DOTP_SB4(v8i16, __VA_ARGS__)
883 
/* Description : Dot product of signed halfword vector elements
   Arguments   : Inputs  - mult0, mult1
                           cnst0, cnst1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Each signed halfword element of 'mult0' is multiplied by the
                 matching signed halfword element of 'cnst0', giving products
                 twice the input width (signed word).
                 The two products of every adjacent even/odd element pair
                 are added together and the sums are written to 'out0'.
                 'out1' is produced the same way from 'mult1' and 'cnst1'.
*/
#define DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \
{ \
    out0 = (RTYPE) __msa_dotp_s_w((v8i16) (mult0), (v8i16) (cnst0)); \
    out1 = (RTYPE) __msa_dotp_s_w((v8i16) (mult1), (v8i16) (cnst1)); \
}
#define DOTP_SH2_SW(...) DOTP_SH2(v4i32, __VA_ARGS__)
902 
/* Four-output form of DOTP_SH2: the (mult0, mult1) pair is processed first,
   then the (mult2, mult3) pair. */
#define DOTP_SH4(RTYPE, mult0, mult1, mult2, mult3, \
                 cnst0, cnst1, cnst2, cnst3, \
                 out0, out1, out2, out3) \
{ \
    DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \
    DOTP_SH2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \
}
#define DOTP_SH4_SW(...) DOTP_SH4(v4i32, __VA_ARGS__)
911 
/* Description : Dot product & accumulate of signed byte vector elements
   Arguments   : Inputs  - mult0, mult1
                           cnst0, cnst1
                 Outputs - out0, out1 (also read as accumulators)
                 Return Type - as per RTYPE
   Details     : Each signed byte element of 'mult0' is multiplied by the
                 matching signed byte element of 'cnst0', giving products
                 twice the input width (signed halfword).
                 The two products of every adjacent even/odd element pair
                 are added to the current contents of 'out0', and the
                 result is stored back into 'out0'.
                 'out1' is updated the same way from 'mult1' and 'cnst1'.
*/
#define DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \
{ \
    out0 = (RTYPE) __msa_dpadd_s_h((v8i16) (out0), \
                                   (v16i8) (mult0), (v16i8) (cnst0)); \
    out1 = (RTYPE) __msa_dpadd_s_h((v8i16) (out1), \
                                   (v16i8) (mult1), (v16i8) (cnst1)); \
}
#define DPADD_SB2_SH(...) DPADD_SB2(v8i16, __VA_ARGS__)
932 
/* Four-accumulator form of DPADD_SB2: (out0, out1) are updated first,
   then (out2, out3). */
#define DPADD_SB4(RTYPE, mult0, mult1, mult2, mult3, \
                  cnst0, cnst1, cnst2, cnst3, \
                  out0, out1, out2, out3) \
{ \
    DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \
    DPADD_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \
}
#define DPADD_SB4_SH(...) DPADD_SB4(v8i16, __VA_ARGS__)
940 
/* Description : Dot product & accumulate of unsigned byte vector elements
   Arguments   : Inputs  - mult0, mult1
                           cnst0, cnst1
                 Outputs - out0, out1 (also read as accumulators)
                 Return Type - as per RTYPE
   Details     : Each unsigned byte element of 'mult0' is multiplied by the
                 matching unsigned byte element of 'cnst0', giving products
                 twice the input width (unsigned halfword).
                 The two products of every adjacent even/odd element pair
                 are added to the current contents of 'out0', and the
                 result is stored back into 'out0'.
                 'out1' is updated the same way from 'mult1' and 'cnst1'.
*/
#define DPADD_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \
{ \
    out0 = (RTYPE) __msa_dpadd_u_h((v8u16) (out0), \
                                   (v16u8) (mult0), (v16u8) (cnst0)); \
    out1 = (RTYPE) __msa_dpadd_u_h((v8u16) (out1), \
                                   (v16u8) (mult1), (v16u8) (cnst1)); \
}
#define DPADD_UB2_UH(...) DPADD_UB2(v8u16, __VA_ARGS__)
961 
/* Description : Dot product & accumulate of signed halfword vector elements
   Arguments   : Inputs  - mult0, mult1
                           cnst0, cnst1
                 Outputs - out0, out1 (also read as accumulators)
                 Return Type - as per RTYPE
   Details     : Each signed halfword element of 'mult0' is multiplied by the
                 matching signed halfword element of 'cnst0', giving products
                 twice the input width (signed word).
                 The two products of every adjacent even/odd element pair
                 are added to the current contents of 'out0', and the
                 result is stored back into 'out0'.
                 'out1' is updated the same way from 'mult1' and 'cnst1'.
*/
#define DPADD_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \
{ \
    out0 = (RTYPE) __msa_dpadd_s_w((v4i32) (out0), \
                                   (v8i16) (mult0), (v8i16) (cnst0)); \
    out1 = (RTYPE) __msa_dpadd_s_w((v4i32) (out1), \
                                   (v8i16) (mult1), (v8i16) (cnst1)); \
}
#define DPADD_SH2_SW(...) DPADD_SH2(v4i32, __VA_ARGS__)
982 
/* Four-accumulator form of DPADD_SH2: (out0, out1) are updated first,
   then (out2, out3). */
#define DPADD_SH4(RTYPE, mult0, mult1, mult2, mult3, \
                  cnst0, cnst1, cnst2, cnst3, \
                  out0, out1, out2, out3) \
{ \
    DPADD_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \
    DPADD_SH2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \
}
#define DPADD_SH4_SW(...) DPADD_SH4(v4i32, __VA_ARGS__)
990 
/* Description : Element-wise minimum of unsigned halfword vectors
   Arguments   : Inputs  - in0, in1, min_vec
                 Outputs - in0, in1 (in place)
                 Return Type - as per RTYPE
   Details     : Every unsigned halfword element of 'in0' is compared with
                 the matching element of 'min_vec' and the smaller of the
                 two is written back to 'in0'. 'in1' is handled the same
                 way against the same 'min_vec'.
                 'min_vec' is cast to v8u16 like the other operand.
*/
#define MIN_UH2(RTYPE, in0, in1, min_vec) \
{ \
    in0 = (RTYPE) __msa_min_u_h((v8u16) (in0), (v8u16) (min_vec)); \
    in1 = (RTYPE) __msa_min_u_h((v8u16) (in1), (v8u16) (min_vec)); \
}
#define MIN_UH2_UH(...) MIN_UH2(v8u16, __VA_ARGS__)
1005 
/* Four-vector form of MIN_UH2: all four vectors are limited in place
   against the same 'min_vec'. */
#define MIN_UH4(RTYPE, in0, in1, in2, in3, min_vec) \
{ \
    MIN_UH2(RTYPE, in0, in1, min_vec); \
    MIN_UH2(RTYPE, in2, in3, min_vec); \
}
#define MIN_UH4_UH(...) MIN_UH4(v8u16, __VA_ARGS__)
1012 
/* Description : Clips all signed halfword elements of the input vector to
                 the range [min, max]
                 out = ((in) < (min)) ? (min) : (((in) > (max)) ? (max) : (in))
   Arguments   : Inputs  - in  (input vector)
                         - min (vector of lower thresholds)
                         - max (vector of upper thresholds)
                 Outputs - out_m (value of the expression: clipped vector)
                 Return Type - signed halfword
   Details     : The lower bound is applied first (element-wise maximum with
                 'min'), then the upper bound (element-wise minimum with
                 'max').
*/
#define CLIP_SH(in, min, max) \
( { \
    v8i16 out_m; \
    \
    out_m = __msa_max_s_h((v8i16) (min), (v8i16) (in)); \
    out_m = __msa_min_s_h((v8i16) (max), (v8i16) out_m); \
    out_m; \
} )
1029 
/* Description : Clips all signed halfword elements of the input vector to
                 the range [0, 255]
   Arguments   : Inputs  - in (input vector)
                 Outputs - out_m (value of the expression: clipped vector)
                 Return Type - signed halfword
   Details     : Negative elements are raised to 0 with an immediate
                 maximum, then the result is limited to 255 by an
                 element-wise minimum against a vector of 255s.
*/
#define CLIP_SH_0_255(in) \
( { \
    v8i16 max_m = __msa_ldi_h(255); \
    v8i16 out_m; \
    \
    out_m = __msa_maxi_s_h((v8i16) (in), 0); \
    out_m = __msa_min_s_h((v8i16) max_m, (v8i16) out_m); \
    out_m; \
} )
/* In-place CLIP_SH_0_255 on two vectors. */
#define CLIP_SH2_0_255(in0, in1) \
{ \
    in0 = CLIP_SH_0_255(in0); \
    in1 = CLIP_SH_0_255(in1); \
}
/* In-place CLIP_SH_0_255 on four vectors, handled as two pairs. */
#define CLIP_SH4_0_255(in0, in1, in2, in3) \
{ \
    CLIP_SH2_0_255(in0, in1); \
    CLIP_SH2_0_255(in2, in3); \
}
1055 
/* Description : Clips all signed halfword elements of the input vector to
                 the range [0, 255], using unsigned saturation for the
                 upper bound
   Arguments   : Inputs  - in (input vector)
                 Outputs - out_m (value of the expression: clipped vector)
                 Return Type - signed halfword
   Details     : Negative elements are raised to 0 with an immediate
                 maximum; the now non-negative elements are then saturated
                 as unsigned values to 8 bits (saturation argument 7).
*/
#define CLIP_SH_0_255_MAX_SATU(in) \
( { \
    v8i16 out_m; \
    \
    out_m = __msa_maxi_s_h((v8i16) (in), 0); \
    out_m = (v8i16) __msa_sat_u_h((v8u16) out_m, 7); \
    out_m; \
} )
/* In-place CLIP_SH_0_255_MAX_SATU on two vectors. */
#define CLIP_SH2_0_255_MAX_SATU(in0, in1) \
{ \
    in0 = CLIP_SH_0_255_MAX_SATU(in0); \
    in1 = CLIP_SH_0_255_MAX_SATU(in1); \
}
/* In-place CLIP_SH_0_255_MAX_SATU on four vectors, handled as two pairs. */
#define CLIP_SH4_0_255_MAX_SATU(in0, in1, in2, in3) \
{ \
    CLIP_SH2_0_255_MAX_SATU(in0, in1); \
    CLIP_SH2_0_255_MAX_SATU(in2, in3); \
}
1074 
/* Description : Clips all signed word elements of the input vector to the
                 range [0, 255]
   Arguments   : Inputs  - in (input vector)
                 Outputs - out_m (value of the expression: clipped vector)
                 Return Type - signed word
   Details     : Negative elements are raised to 0 with an immediate
                 maximum, then the result is limited to 255 by an
                 element-wise minimum against a vector of 255s.
*/
#define CLIP_SW_0_255(in) \
( { \
    v4i32 max_m = __msa_ldi_w(255); \
    v4i32 out_m; \
    \
    out_m = __msa_maxi_s_w((v4i32) (in), 0); \
    out_m = __msa_min_s_w((v4i32) max_m, (v4i32) out_m); \
    out_m; \
} )
1090 
/* Description : Clips all signed word elements of the input vector to the
                 range [0, 255], using unsigned saturation for the upper
                 bound
   Arguments   : Inputs  - in (input vector)
                 Outputs - out_m (value of the expression: clipped vector)
                 Return Type - signed word
   Details     : Negative elements are raised to 0 with an immediate
                 maximum; the now non-negative elements are then saturated
                 as unsigned values to 8 bits (saturation argument 7).
*/
#define CLIP_SW_0_255_MAX_SATU(in) \
( { \
    v4i32 out_m; \
    \
    out_m = __msa_maxi_s_w((v4i32) (in), 0); \
    out_m = (v4i32) __msa_sat_u_w((v4u32) out_m, 7); \
    out_m; \
} )
/* In-place CLIP_SW_0_255_MAX_SATU on two vectors. */
#define CLIP_SW2_0_255_MAX_SATU(in0, in1) \
{ \
    in0 = CLIP_SW_0_255_MAX_SATU(in0); \
    in1 = CLIP_SW_0_255_MAX_SATU(in1); \
}
/* In-place CLIP_SW_0_255_MAX_SATU on four vectors, handled as two pairs. */
#define CLIP_SW4_0_255_MAX_SATU(in0, in1, in2, in3) \
{ \
    CLIP_SW2_0_255_MAX_SATU(in0, in1); \
    CLIP_SW2_0_255_MAX_SATU(in2, in3); \
}
1109 
/* Description : Sum of the 4 signed word elements of a vector
   Arguments   : Inputs  - in (signed word vector)
                 Outputs - sum_m (value of the expression: i32 sum)
                 Return Type - signed word
   Details     : A horizontal add of 'in' with itself yields two doubleword
                 partial sums. Doubleword element 1 is replicated and added
                 to the partial sums, and word element 0 of that result is
                 returned as the total.
*/
#define HADD_SW_S32(in) \
( { \
    v2i64 res0_m, res1_m; \
    int32_t sum_m; \
    \
    res0_m = __msa_hadd_s_d((v4i32) (in), (v4i32) (in)); \
    res1_m = __msa_splati_d(res0_m, 1); \
    res0_m += res1_m; \
    sum_m = __msa_copy_s_w((v4i32) res0_m, 0); \
    sum_m; \
} )
1128 
/* Description : Sum of the 8 unsigned halfword elements of a vector
   Arguments   : Inputs  - in (unsigned halfword vector)
                 Outputs - sum_m (value of the expression: u32 sum)
                 Return Type - unsigned word
   Details     : A horizontal add of 'in' with itself yields four word
                 partial sums; a second horizontal add reduces them to two
                 doubleword partial sums. Doubleword element 1 is
                 replicated and added to the partial sums, and word
                 element 0 of that result is returned as the total.
*/
#define HADD_UH_U32(in) \
( { \
    v4u32 res_m; \
    v2u64 res0_m, res1_m; \
    uint32_t sum_m; \
    \
    res_m = __msa_hadd_u_w((v8u16) (in), (v8u16) (in)); \
    res0_m = __msa_hadd_u_d(res_m, res_m); \
    res1_m = (v2u64) __msa_splati_d((v2i64) res0_m, 1); \
    res0_m += res1_m; \
    sum_m = __msa_copy_u_w((v4i32) res0_m, 0); \
    sum_m; \
} )
1149 
/* Description : Horizontal addition of signed byte vector elements
   Arguments   : Inputs  - in0, in1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Each odd signed byte element of 'in0' is added to the
                 adjacent even signed byte element of 'in0' (pairwise) and
                 the halfword sums are stored in 'out0'.
                 'out1' is produced the same way from 'in1'.
*/
#define HADD_SB2(RTYPE, in0, in1, out0, out1) \
{ \
    out0 = (RTYPE) __msa_hadd_s_h((v16i8) (in0), (v16i8) (in0)); \
    out1 = (RTYPE) __msa_hadd_s_h((v16i8) (in1), (v16i8) (in1)); \
}
#define HADD_SB2_SH(...) HADD_SB2(v8i16, __VA_ARGS__)
1164 
/* Four-vector form of HADD_SB2: (in0, in1) are reduced first, then
   (in2, in3). */
#define HADD_SB4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3) \
{ \
    HADD_SB2(RTYPE, in0, in1, out0, out1); \
    HADD_SB2(RTYPE, in2, in3, out2, out3); \
}
#define HADD_SB4_UH(...) HADD_SB4(v8u16, __VA_ARGS__)
#define HADD_SB4_SH(...) HADD_SB4(v8i16, __VA_ARGS__)
1172 
/* Description : Horizontal addition of unsigned byte vector elements
   Arguments   : Inputs  - in0, in1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Each odd unsigned byte element of 'in0' is added to the
                 adjacent even unsigned byte element of 'in0' (pairwise)
                 and the halfword sums are stored in 'out0'.
                 'out1' is produced the same way from 'in1'.
*/
#define HADD_UB2(RTYPE, in0, in1, out0, out1) \
{ \
    out0 = (RTYPE) __msa_hadd_u_h((v16u8) (in0), (v16u8) (in0)); \
    out1 = (RTYPE) __msa_hadd_u_h((v16u8) (in1), (v16u8) (in1)); \
}
#define HADD_UB2_UH(...) HADD_UB2(v8u16, __VA_ARGS__)
1187 
/* Three-vector form of HADD_UB2: the first two outputs come from HADD_UB2,
   the third is computed directly from 'in2'. */
#define HADD_UB3(RTYPE, in0, in1, in2, out0, out1, out2) \
{ \
    HADD_UB2(RTYPE, in0, in1, out0, out1); \
    out2 = (RTYPE) __msa_hadd_u_h((v16u8) (in2), (v16u8) (in2)); \
}
#define HADD_UB3_UH(...) HADD_UB3(v8u16, __VA_ARGS__)
1194 
/* Four-vector form of HADD_UB2: (in0, in1) are reduced first, then
   (in2, in3). */
#define HADD_UB4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3) \
{ \
    HADD_UB2(RTYPE, in0, in1, out0, out1); \
    HADD_UB2(RTYPE, in2, in3, out2, out3); \
}
#define HADD_UB4_UB(...) HADD_UB4(v16u8, __VA_ARGS__)
#define HADD_UB4_UH(...) HADD_UB4(v8u16, __VA_ARGS__)
#define HADD_UB4_SH(...) HADD_UB4(v8i16, __VA_ARGS__)
1203 
/* Description : Horizontal subtraction of unsigned byte vector elements
   Arguments   : Inputs  - in0, in1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : For every adjacent byte pair of 'in0', the even unsigned
                 byte element is subtracted from the odd unsigned byte
                 element (odd - even) and the halfword difference is
                 stored in 'out0'.
                 'out1' is produced the same way from 'in1'.
*/
#define HSUB_UB2(RTYPE, in0, in1, out0, out1) \
{ \
    out0 = (RTYPE) __msa_hsub_u_h((v16u8) (in0), (v16u8) (in0)); \
    out1 = (RTYPE) __msa_hsub_u_h((v16u8) (in1), (v16u8) (in1)); \
}
#define HSUB_UB2_UH(...) HSUB_UB2(v8u16, __VA_ARGS__)
#define HSUB_UB2_SH(...) HSUB_UB2(v8i16, __VA_ARGS__)
1219 
/* Four-vector form of HSUB_UB2: (in0, in1) are processed first, then
   (in2, in3). */
#define HSUB_UB4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3) \
{ \
    HSUB_UB2(RTYPE, in0, in1, out0, out1); \
    HSUB_UB2(RTYPE, in2, in3, out2, out3); \
}
#define HSUB_UB4_UH(...) HSUB_UB4(v8u16, __VA_ARGS__)
#define HSUB_UB4_SH(...) HSUB_UB4(v8i16, __VA_ARGS__)
1227 
/* Description : SAD (Sum of Absolute Differences) over two vector pairs
   Arguments   : Inputs  - in0, in1, ref0, ref1 (unsigned byte src & ref)
                 Outputs - sad_m (value of the expression: halfword vector
                           holding the partial SADs)
                 Return Type - unsigned halfword
   Details     : The absolute differences between the byte elements of
                 'in0' and 'ref0', and between those of 'in1' and 'ref1',
                 are computed. For each set of 16 differences, adjacent
                 even/odd pairs are added to give 8 halfword sums, and the
                 sums of both sets are accumulated into the result.
*/
#define SAD_UB2_UH(in0, in1, ref0, ref1) \
( { \
    v16u8 diff0_m, diff1_m; \
    v8u16 sad_m = { 0 }; \
    \
    diff0_m = __msa_asub_u_b((v16u8) (in0), (v16u8) (ref0)); \
    diff1_m = __msa_asub_u_b((v16u8) (in1), (v16u8) (ref1)); \
    \
    sad_m += __msa_hadd_u_h((v16u8) diff0_m, (v16u8) diff0_m); \
    sad_m += __msa_hadd_u_h((v16u8) diff1_m, (v16u8) diff1_m); \
    \
    sad_m; \
} )
1250 
/* Description : Insert two word values into word elements 0 and 1 of the
                 destination vector
   Arguments   : Inputs  - in0, in1 (values handed to __msa_insert_w as the
                           element to insert; presumably 32-bit integers)
                 Outputs - out (destination vector, modified in place)
                 Return Type - as per RTYPE
   Details     : 'in0' is written to word element 0 and 'in1' to word
                 element 1 of 'out'; the other elements of 'out' keep
                 whatever they held before.
*/
#define INSERT_W2(RTYPE, in0, in1, out) \
{ \
    out = (RTYPE) __msa_insert_w((v4i32) (out), 0, (in0)); \
    out = (RTYPE) __msa_insert_w((v4i32) (out), 1, (in1)); \
}
#define INSERT_W2_UB(...) INSERT_W2(v16u8, __VA_ARGS__)
#define INSERT_W2_SB(...) INSERT_W2(v16i8, __VA_ARGS__)
1264 
/* Insert four word values: 'in0'..'in3' are written to word elements 0..3
   of 'out', in that order. */
#define INSERT_W4(RTYPE, in0, in1, in2, in3, out) \
{ \
    out = (RTYPE) __msa_insert_w((v4i32) (out), 0, (in0)); \
    out = (RTYPE) __msa_insert_w((v4i32) (out), 1, (in1)); \
    out = (RTYPE) __msa_insert_w((v4i32) (out), 2, (in2)); \
    out = (RTYPE) __msa_insert_w((v4i32) (out), 3, (in3)); \
}
#define INSERT_W4_UB(...) INSERT_W4(v16u8, __VA_ARGS__)
#define INSERT_W4_SB(...) INSERT_W4(v16i8, __VA_ARGS__)
#define INSERT_W4_SH(...) INSERT_W4(v8i16, __VA_ARGS__)
#define INSERT_W4_SW(...) INSERT_W4(v4i32, __VA_ARGS__)
1276 
/* Description : Insert two double word values into double word elements
                 0 and 1 of the destination vector
   Arguments   : Inputs  - in0, in1 (values handed to __msa_insert_d as the
                           element to insert; presumably 64-bit integers)
                 Outputs - out (destination vector, modified in place)
                 Return Type - as per RTYPE
   Details     : 'in0' is written to double word element 0 and 'in1' to
                 double word element 1 of 'out'.
*/
#define INSERT_D2(RTYPE, in0, in1, out) \
{ \
    out = (RTYPE) __msa_insert_d((v2i64) (out), 0, (in0)); \
    out = (RTYPE) __msa_insert_d((v2i64) (out), 1, (in1)); \
}
#define INSERT_D2_UB(...) INSERT_D2(v16u8, __VA_ARGS__)
#define INSERT_D2_SB(...) INSERT_D2(v16i8, __VA_ARGS__)
#define INSERT_D2_SH(...) INSERT_D2(v8i16, __VA_ARGS__)
#define INSERT_D2_SD(...) INSERT_D2(v2i64, __VA_ARGS__)
1292 
/* Description : Interleave even byte elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : The even byte elements of 'in0' and the even byte
                 elements of 'in1' are interleaved and written to 'out0'.
                 The even byte elements of 'in2' and 'in3' are interleaved
                 the same way and written to 'out1'.
                 Note the operand order: the second vector of each pair is
                 passed to the intrinsic first.
*/
#define ILVEV_B2(RTYPE, in0, in1, in2, in3, out0, out1) \
{ \
    out0 = (RTYPE) __msa_ilvev_b((v16i8) (in1), (v16i8) (in0)); \
    out1 = (RTYPE) __msa_ilvev_b((v16i8) (in3), (v16i8) (in2)); \
}
#define ILVEV_B2_UB(...) ILVEV_B2(v16u8, __VA_ARGS__)
#define ILVEV_B2_SB(...) ILVEV_B2(v16i8, __VA_ARGS__)
#define ILVEV_B2_SH(...) ILVEV_B2(v8i16, __VA_ARGS__)
#define ILVEV_B2_SD(...) ILVEV_B2(v2i64, __VA_ARGS__)
1311 
/* Description : Interleave even halfword elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : The even halfword elements of 'in0' and the even halfword
                 elements of 'in1' are interleaved and written to 'out0'.
                 The even halfword elements of 'in2' and 'in3' are
                 interleaved the same way and written to 'out1'.
                 Note the operand order: the second vector of each pair is
                 passed to the intrinsic first.
*/
#define ILVEV_H2(RTYPE, in0, in1, in2, in3, out0, out1) \
{ \
    out0 = (RTYPE) __msa_ilvev_h((v8i16) (in1), (v8i16) (in0)); \
    out1 = (RTYPE) __msa_ilvev_h((v8i16) (in3), (v8i16) (in2)); \
}
#define ILVEV_H2_UB(...) ILVEV_H2(v16u8, __VA_ARGS__)
#define ILVEV_H2_SH(...) ILVEV_H2(v8i16, __VA_ARGS__)
#define ILVEV_H2_SW(...) ILVEV_H2(v4i32, __VA_ARGS__)
1329 
/* Description : Interleave even word elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : The even word elements of 'in0' and the even word
                 elements of 'in1' are interleaved and written to 'out0'.
                 The even word elements of 'in2' and 'in3' are interleaved
                 the same way and written to 'out1'.
                 Note the operand order: the second vector of each pair is
                 passed to the intrinsic first.
*/
#define ILVEV_W2(RTYPE, in0, in1, in2, in3, out0, out1) \
{ \
    out0 = (RTYPE) __msa_ilvev_w((v4i32) (in1), (v4i32) (in0)); \
    out1 = (RTYPE) __msa_ilvev_w((v4i32) (in3), (v4i32) (in2)); \
}
#define ILVEV_W2_UB(...) ILVEV_W2(v16u8, __VA_ARGS__)
#define ILVEV_W2_SB(...) ILVEV_W2(v16i8, __VA_ARGS__)
#define ILVEV_W2_UH(...) ILVEV_W2(v8u16, __VA_ARGS__)
#define ILVEV_W2_SD(...) ILVEV_W2(v2i64, __VA_ARGS__)
1348 
/* Description : Interleave even double word elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : The even double word elements of 'in0' and the even
                 double word elements of 'in1' are interleaved and written
                 to 'out0'. The even double word elements of 'in2' and
                 'in3' are interleaved the same way and written to 'out1'.
                 Note the operand order: the second vector of each pair is
                 passed to the intrinsic first.
*/
#define ILVEV_D2(RTYPE, in0, in1, in2, in3, out0, out1) \
{ \
    out0 = (RTYPE) __msa_ilvev_d((v2i64) (in1), (v2i64) (in0)); \
    out1 = (RTYPE) __msa_ilvev_d((v2i64) (in3), (v2i64) (in2)); \
}
#define ILVEV_D2_UB(...) ILVEV_D2(v16u8, __VA_ARGS__)
#define ILVEV_D2_SB(...) ILVEV_D2(v16i8, __VA_ARGS__)
#define ILVEV_D2_SW(...) ILVEV_D2(v4i32, __VA_ARGS__)
1366 
/* Description : Interleave left half of byte elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : The left-half byte elements of 'in0' and the left-half
                 byte elements of 'in1' are interleaved and written to
                 'out0'. The left-half byte elements of 'in2' and 'in3'
                 are interleaved the same way and written to 'out1'.
*/
#define ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1) \
{ \
    out0 = (RTYPE) __msa_ilvl_b((v16i8) (in0), (v16i8) (in1)); \
    out1 = (RTYPE) __msa_ilvl_b((v16i8) (in2), (v16i8) (in3)); \
}
#define ILVL_B2_UB(...) ILVL_B2(v16u8, __VA_ARGS__)
#define ILVL_B2_SB(...) ILVL_B2(v16i8, __VA_ARGS__)
#define ILVL_B2_UH(...) ILVL_B2(v8u16, __VA_ARGS__)
#define ILVL_B2_SH(...) ILVL_B2(v8i16, __VA_ARGS__)
1385 
/* Four-output form of ILVL_B2: the (in0..in3) group is interleaved first,
   then the (in4..in7) group. */
#define ILVL_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
                out0, out1, out2, out3) \
{ \
    ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1); \
    ILVL_B2(RTYPE, in4, in5, in6, in7, out2, out3); \
}
#define ILVL_B4_UB(...) ILVL_B4(v16u8, __VA_ARGS__)
#define ILVL_B4_SB(...) ILVL_B4(v16i8, __VA_ARGS__)
#define ILVL_B4_UH(...) ILVL_B4(v8u16, __VA_ARGS__)
#define ILVL_B4_SH(...) ILVL_B4(v8i16, __VA_ARGS__)
1396 
/* Description : Interleave left half of halfword elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : The left-half halfword elements of 'in0' and the
                 left-half halfword elements of 'in1' are interleaved and
                 written to 'out0'. The left-half halfword elements of
                 'in2' and 'in3' are interleaved the same way and written
                 to 'out1'.
*/
#define ILVL_H2(RTYPE, in0, in1, in2, in3, out0, out1) \
{ \
    out0 = (RTYPE) __msa_ilvl_h((v8i16) (in0), (v8i16) (in1)); \
    out1 = (RTYPE) __msa_ilvl_h((v8i16) (in2), (v8i16) (in3)); \
}
#define ILVL_H2_SH(...) ILVL_H2(v8i16, __VA_ARGS__)
#define ILVL_H2_SW(...) ILVL_H2(v4i32, __VA_ARGS__)
1413 
/* Four-output form of ILVL_H2: the (in0..in3) group is interleaved first,
   then the (in4..in7) group. */
#define ILVL_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
                out0, out1, out2, out3) \
{ \
    ILVL_H2(RTYPE, in0, in1, in2, in3, out0, out1); \
    ILVL_H2(RTYPE, in4, in5, in6, in7, out2, out3); \
}
#define ILVL_H4_SH(...) ILVL_H4(v8i16, __VA_ARGS__)
#define ILVL_H4_SW(...) ILVL_H4(v4i32, __VA_ARGS__)
1422 
/* Description : Interleave left half of word elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : The left-half word elements of 'in0' and the left-half
                 word elements of 'in1' are interleaved and written to
                 'out0'. The left-half word elements of 'in2' and 'in3'
                 are interleaved the same way and written to 'out1'.
*/
#define ILVL_W2(RTYPE, in0, in1, in2, in3, out0, out1) \
{ \
    out0 = (RTYPE) __msa_ilvl_w((v4i32) (in0), (v4i32) (in1)); \
    out1 = (RTYPE) __msa_ilvl_w((v4i32) (in2), (v4i32) (in3)); \
}
#define ILVL_W2_UB(...) ILVL_W2(v16u8, __VA_ARGS__)
#define ILVL_W2_SB(...) ILVL_W2(v16i8, __VA_ARGS__)
#define ILVL_W2_SH(...) ILVL_W2(v8i16, __VA_ARGS__)
1440 
/* Description : Interleave right half of byte elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : The right-half byte elements of 'in0' and the right-half
                 byte elements of 'in1' are interleaved and written to
                 'out0'. The right-half byte elements of 'in2' and 'in3'
                 are interleaved the same way and written to 'out1'.
                 The 3-, 4- and 8-output variants repeat this for further
                 input pairs.
*/
#define ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1) \
{ \
    out0 = (RTYPE) __msa_ilvr_b((v16i8) (in0), (v16i8) (in1)); \
    out1 = (RTYPE) __msa_ilvr_b((v16i8) (in2), (v16i8) (in3)); \
}
#define ILVR_B2_UB(...) ILVR_B2(v16u8, __VA_ARGS__)
#define ILVR_B2_SB(...) ILVR_B2(v16i8, __VA_ARGS__)
#define ILVR_B2_UH(...) ILVR_B2(v8u16, __VA_ARGS__)
#define ILVR_B2_SH(...) ILVR_B2(v8i16, __VA_ARGS__)
#define ILVR_B2_SW(...) ILVR_B2(v4i32, __VA_ARGS__)
1461 
/* Three-output form of ILVR_B2: the first two outputs come from ILVR_B2,
   the third interleaves the right-half bytes of 'in4' and 'in5'. */
#define ILVR_B3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2) \
{ \
    ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1); \
    out2 = (RTYPE) __msa_ilvr_b((v16i8) (in4), (v16i8) (in5)); \
}
#define ILVR_B3_UB(...) ILVR_B3(v16u8, __VA_ARGS__)
#define ILVR_B3_SB(...) ILVR_B3(v16i8, __VA_ARGS__)
#define ILVR_B3_UH(...) ILVR_B3(v8u16, __VA_ARGS__)
#define ILVR_B3_SH(...) ILVR_B3(v8i16, __VA_ARGS__)
1471 
/* Four-output form of ILVR_B2: the (in0..in3) group is interleaved first,
   then the (in4..in7) group. */
#define ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
                out0, out1, out2, out3) \
{ \
    ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1); \
    ILVR_B2(RTYPE, in4, in5, in6, in7, out2, out3); \
}
#define ILVR_B4_UB(...) ILVR_B4(v16u8, __VA_ARGS__)
#define ILVR_B4_SB(...) ILVR_B4(v16i8, __VA_ARGS__)
#define ILVR_B4_UH(...) ILVR_B4(v8u16, __VA_ARGS__)
#define ILVR_B4_SH(...) ILVR_B4(v8i16, __VA_ARGS__)
#define ILVR_B4_SW(...) ILVR_B4(v4i32, __VA_ARGS__)
1483 
/* Eight-output form of ILVR_B2: the sixteen inputs are consumed as eight
   consecutive pairs, in order, each pair producing one output. */
#define ILVR_B8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
                in8, in9, in10, in11, in12, in13, in14, in15, \
                out0, out1, out2, out3, out4, out5, out6, out7) \
{ \
    ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1); \
    ILVR_B2(RTYPE, in4, in5, in6, in7, out2, out3); \
    ILVR_B2(RTYPE, in8, in9, in10, in11, out4, out5); \
    ILVR_B2(RTYPE, in12, in13, in14, in15, out6, out7); \
}
#define ILVR_B8_UH(...) ILVR_B8(v8u16, __VA_ARGS__)
1494 
/* Description : Interleave right half of halfword elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : The right-half halfword elements of 'in0' and the
                 right-half halfword elements of 'in1' are interleaved and
                 written to 'out0'. The right-half halfword elements of
                 'in2' and 'in3' are interleaved the same way and written
                 to 'out1'.
                 The 3- and 4-output variants repeat this for further
                 input pairs.
*/
#define ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1) \
{ \
    out0 = (RTYPE) __msa_ilvr_h((v8i16) (in0), (v8i16) (in1)); \
    out1 = (RTYPE) __msa_ilvr_h((v8i16) (in2), (v8i16) (in3)); \
}
#define ILVR_H2_SH(...) ILVR_H2(v8i16, __VA_ARGS__)
#define ILVR_H2_SW(...) ILVR_H2(v4i32, __VA_ARGS__)
1512 
/* Three-output form of ILVR_H2: the first two outputs come from ILVR_H2,
   the third interleaves the right-half halfwords of 'in4' and 'in5'. */
#define ILVR_H3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2) \
{ \
    ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1); \
    out2 = (RTYPE) __msa_ilvr_h((v8i16) (in4), (v8i16) (in5)); \
}
#define ILVR_H3_SH(...) ILVR_H3(v8i16, __VA_ARGS__)
1519 
/* Four-output form of ILVR_H2: the (in0..in3) group is interleaved first,
   then the (in4..in7) group. */
#define ILVR_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
                out0, out1, out2, out3) \
{ \
    ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1); \
    ILVR_H2(RTYPE, in4, in5, in6, in7, out2, out3); \
}
#define ILVR_H4_SH(...) ILVR_H4(v8i16, __VA_ARGS__)
#define ILVR_H4_SW(...) ILVR_H4(v4i32, __VA_ARGS__)
1528 
/* Interleave right half of word elements: 'out0' is built from the
   (in0, in1) pair and 'out1' from the (in2, in3) pair via __msa_ilvr_w. */
#define ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1) \
{ \
    out0 = (RTYPE) __msa_ilvr_w((v4i32) (in0), (v4i32) (in1)); \
    out1 = (RTYPE) __msa_ilvr_w((v4i32) (in2), (v4i32) (in3)); \
}
#define ILVR_W2_UB(...) ILVR_W2(v16u8, __VA_ARGS__)
#define ILVR_W2_SB(...) ILVR_W2(v16i8, __VA_ARGS__)
#define ILVR_W2_SH(...) ILVR_W2(v8i16, __VA_ARGS__)
1537 
/* Four-output form of ILVR_W2: the (in0..in3) group is interleaved first,
   then the (in4..in7) group. */
#define ILVR_W4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
                out0, out1, out2, out3) \
{ \
    ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1); \
    ILVR_W2(RTYPE, in4, in5, in6, in7, out2, out3); \
}
#define ILVR_W4_SB(...) ILVR_W4(v16i8, __VA_ARGS__)
#define ILVR_W4_UB(...) ILVR_W4(v16u8, __VA_ARGS__)
1546 
/* Description : Interleave right half of double word elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : The right-half double word element of 'in0' and the
                 right-half double word element of 'in1' are interleaved
                 and written to 'out0'. The right-half double word
                 elements of 'in2' and 'in3' are interleaved the same way
                 and written to 'out1'.
                 The 3- and 4-output variants repeat this for further
                 input pairs.
*/
#define ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1) \
{ \
    out0 = (RTYPE) __msa_ilvr_d((v2i64) (in0), (v2i64) (in1)); \
    out1 = (RTYPE) __msa_ilvr_d((v2i64) (in2), (v2i64) (in3)); \
}
#define ILVR_D2_UB(...) ILVR_D2(v16u8, __VA_ARGS__)
#define ILVR_D2_SB(...) ILVR_D2(v16i8, __VA_ARGS__)
#define ILVR_D2_SH(...) ILVR_D2(v8i16, __VA_ARGS__)
1564 
/* Three-output form of ILVR_D2: the first two outputs come from ILVR_D2,
   the third interleaves the right-half double words of 'in4' and 'in5'. */
#define ILVR_D3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2) \
{ \
    ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1); \
    out2 = (RTYPE) __msa_ilvr_d((v2i64) (in4), (v2i64) (in5)); \
}
#define ILVR_D3_SB(...) ILVR_D3(v16i8, __VA_ARGS__)
1571 
/* Four-output form of ILVR_D2: the (in0..in3) group is interleaved first,
   then the (in4..in7) group. */
#define ILVR_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
                out0, out1, out2, out3) \
{ \
    ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1); \
    ILVR_D2(RTYPE, in4, in5, in6, in7, out2, out3); \
}
#define ILVR_D4_SB(...) ILVR_D4(v16i8, __VA_ARGS__)
#define ILVR_D4_UB(...) ILVR_D4(v16u8, __VA_ARGS__)
1580 
/* Description : Interleave left half of double word elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : The left-half double word element of 'in0' and the
                 left-half double word element of 'in1' are interleaved
                 and written to 'out0'. The left-half double word elements
                 of 'in2' and 'in3' are interleaved the same way and
                 written to 'out1'.
*/
#define ILVL_D2(RTYPE, in0, in1, in2, in3, out0, out1) \
{ \
    out0 = (RTYPE) __msa_ilvl_d((v2i64) (in0), (v2i64) (in1)); \
    out1 = (RTYPE) __msa_ilvl_d((v2i64) (in2), (v2i64) (in3)); \
}
#define ILVL_D2_UB(...) ILVL_D2(v16u8, __VA_ARGS__)
#define ILVL_D2_SB(...) ILVL_D2(v16i8, __VA_ARGS__)
#define ILVL_D2_SH(...) ILVL_D2(v8i16, __VA_ARGS__)
1598 
/* Description : Interleave both right and left halves of two input vectors
   Arguments   : Inputs  - in0, in1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : The right-half byte elements of 'in0' and 'in1' are
                 interleaved and stored to 'out0'.
                 The left-half byte elements of 'in0' and 'in1' are
                 interleaved and stored to 'out1'.
                 'in0' and 'in1' are each expanded twice, and 'out0' is
                 assigned before they are read again for 'out1'.
*/
#define ILVRL_B2(RTYPE, in0, in1, out0, out1) \
{ \
    out0 = (RTYPE) __msa_ilvr_b((v16i8) (in0), (v16i8) (in1)); \
    out1 = (RTYPE) __msa_ilvl_b((v16i8) (in0), (v16i8) (in1)); \
}
#define ILVRL_B2_UB(...) ILVRL_B2(v16u8, __VA_ARGS__)
#define ILVRL_B2_SB(...) ILVRL_B2(v16i8, __VA_ARGS__)
#define ILVRL_B2_UH(...) ILVRL_B2(v8u16, __VA_ARGS__)
#define ILVRL_B2_SH(...) ILVRL_B2(v8i16, __VA_ARGS__)
#define ILVRL_B2_SW(...) ILVRL_B2(v4i32, __VA_ARGS__)
1618 
/* Halfword form of ILVRL_B2: 'out0' receives the right-half halfword
   interleave of 'in0' and 'in1', 'out1' the left-half interleave.
   'in0' and 'in1' are each expanded twice. */
#define ILVRL_H2(RTYPE, in0, in1, out0, out1) \
{ \
    out0 = (RTYPE) __msa_ilvr_h((v8i16) (in0), (v8i16) (in1)); \
    out1 = (RTYPE) __msa_ilvl_h((v8i16) (in0), (v8i16) (in1)); \
}
#define ILVRL_H2_UB(...) ILVRL_H2(v16u8, __VA_ARGS__)
#define ILVRL_H2_SB(...) ILVRL_H2(v16i8, __VA_ARGS__)
#define ILVRL_H2_SH(...) ILVRL_H2(v8i16, __VA_ARGS__)
#define ILVRL_H2_SW(...) ILVRL_H2(v4i32, __VA_ARGS__)
1628 
/* Word form of ILVRL_B2: 'out0' receives the right-half word interleave
   of 'in0' and 'in1', 'out1' the left-half interleave.
   'in0' and 'in1' are each expanded twice. */
#define ILVRL_W2(RTYPE, in0, in1, out0, out1) \
{ \
    out0 = (RTYPE) __msa_ilvr_w((v4i32) (in0), (v4i32) (in1)); \
    out1 = (RTYPE) __msa_ilvl_w((v4i32) (in0), (v4i32) (in1)); \
}
#define ILVRL_W2_UB(...) ILVRL_W2(v16u8, __VA_ARGS__)
#define ILVRL_W2_SH(...) ILVRL_W2(v8i16, __VA_ARGS__)
#define ILVRL_W2_SW(...) ILVRL_W2(v4i32, __VA_ARGS__)
1637 
/* Description : Element-wise maximum of signed halfword vectors against a
                 5-bit signed immediate value
   Arguments   : Inputs  - in0, in1, max_val
                 Outputs - in0, in1 (in place)
                 Return Type - as per RTYPE
   Details     : Every signed halfword element of 'in0' is compared with
                 'max_val' and the larger of the two is written back to
                 'in0'. 'in1' is handled the same way.
                 The 4- and 8-vector variants apply this to more vectors.
*/
#define MAXI_SH2(RTYPE, in0, in1, max_val) \
{ \
    in0 = (RTYPE) __msa_maxi_s_h((v8i16) (in0), (max_val)); \
    in1 = (RTYPE) __msa_maxi_s_h((v8i16) (in1), (max_val)); \
}
#define MAXI_SH2_UH(...) MAXI_SH2(v8u16, __VA_ARGS__)
#define MAXI_SH2_SH(...) MAXI_SH2(v8i16, __VA_ARGS__)
1653 
/* Four-vector form of MAXI_SH2: all four vectors are updated in place
   against the same 'max_val'. */
#define MAXI_SH4(RTYPE, in0, in1, in2, in3, max_val) \
{ \
    MAXI_SH2(RTYPE, in0, in1, max_val); \
    MAXI_SH2(RTYPE, in2, in3, max_val); \
}
#define MAXI_SH4_UH(...) MAXI_SH4(v8u16, __VA_ARGS__)
#define MAXI_SH4_SH(...) MAXI_SH4(v8i16, __VA_ARGS__)
1661 
1662 #define MAXI_SH8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, max_val) \
1663 { \
1664  MAXI_SH4(RTYPE, in0, in1, in2, in3, max_val); \
1665  MAXI_SH4(RTYPE, in4, in5, in6, in7, max_val); \
1666 }
1667 #define MAXI_SH8_UH(...) MAXI_SH8(v8u16, __VA_ARGS__)
1668 #define MAXI_SH8_SH(...) MAXI_SH8(v8i16, __VA_ARGS__)
1669 
/* Description : Saturate the unsigned halfword element values to the max
                 unsigned value of (sat_val+1 bits)
                 The element data width remains unchanged
   Arguments   : Inputs  - in0, in1, in2, in3, sat_val
                 Outputs - in0, in1, in2, in3 (in place)
                 Return Type - as per RTYPE
   Details     : Each unsigned halfword element from 'in0' is saturated to the
                 value generated with (sat_val+1) bit range
                 Results are written back in place to the original vectors
                 SAT_UH4 / SAT_UH8 apply SAT_UH2 to 2 / 4 vector pairs
*/
#define SAT_UH2(RTYPE, in0, in1, sat_val) \
{ \
    in0 = (RTYPE) __msa_sat_u_h((v8u16) (in0), (sat_val)); \
    in1 = (RTYPE) __msa_sat_u_h((v8u16) (in1), (sat_val)); \
}
#define SAT_UH2_UH(...) SAT_UH2(v8u16, __VA_ARGS__)
#define SAT_UH2_SH(...) SAT_UH2(v8i16, __VA_ARGS__)

#define SAT_UH4(RTYPE, in0, in1, in2, in3, sat_val) \
{ \
    SAT_UH2(RTYPE, in0, in1, sat_val); \
    SAT_UH2(RTYPE, in2, in3, sat_val); \
}
#define SAT_UH4_UH(...) SAT_UH4(v8u16, __VA_ARGS__)
#define SAT_UH4_SH(...) SAT_UH4(v8i16, __VA_ARGS__)

#define SAT_UH8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, sat_val) \
{ \
    SAT_UH4(RTYPE, in0, in1, in2, in3, sat_val); \
    SAT_UH4(RTYPE, in4, in5, in6, in7, sat_val); \
}
#define SAT_UH8_UH(...) SAT_UH8(v8u16, __VA_ARGS__)
#define SAT_UH8_SH(...) SAT_UH8(v8i16, __VA_ARGS__)
1703 
/* Description : Saturate the signed halfword element values to the signed
                 range of (sat_val+1 bits)
                 The element data width remains unchanged
   Arguments   : Inputs  - in0, in1, in2, in3, sat_val
                 Outputs - in0, in1, in2, in3 (in place)
                 Return Type - as per RTYPE
   Details     : Each signed halfword element from 'in0' is saturated with
                 __msa_sat_s_h to the signed (sat_val+1) bit range
                 Results are written back in place to the original vectors
                 SAT_SH3 / SAT_SH4 extend SAT_SH2 to 3 / 4 vectors
*/
#define SAT_SH2(RTYPE, in0, in1, sat_val) \
{ \
    in0 = (RTYPE) __msa_sat_s_h((v8i16) (in0), (sat_val)); \
    in1 = (RTYPE) __msa_sat_s_h((v8i16) (in1), (sat_val)); \
}
#define SAT_SH2_SH(...) SAT_SH2(v8i16, __VA_ARGS__)

#define SAT_SH3(RTYPE, in0, in1, in2, sat_val) \
{ \
    SAT_SH2(RTYPE, in0, in1, sat_val); \
    in2 = (RTYPE) __msa_sat_s_h((v8i16) (in2), (sat_val)); \
}
#define SAT_SH3_SH(...) SAT_SH3(v8i16, __VA_ARGS__)

#define SAT_SH4(RTYPE, in0, in1, in2, in3, sat_val) \
{ \
    SAT_SH2(RTYPE, in0, in1, sat_val); \
    SAT_SH2(RTYPE, in2, in3, sat_val); \
}
#define SAT_SH4_SH(...) SAT_SH4(v8i16, __VA_ARGS__)
1734 
/* Description : Saturate the signed word element values to the signed
                 range of (sat_val+1 bits)
                 The element data width remains unchanged
   Arguments   : Inputs  - in0, in1, in2, in3, sat_val
                 Outputs - in0, in1, in2, in3 (in place)
                 Return Type - as per RTYPE
   Details     : Each signed word element from 'in0' is saturated with
                 __msa_sat_s_w to the signed (sat_val+1) bit range
                 Results are written back in place to the original vectors
                 SAT_SW4 applies SAT_SW2 to 2 vector pairs
*/
#define SAT_SW2(RTYPE, in0, in1, sat_val) \
{ \
    in0 = (RTYPE) __msa_sat_s_w((v4i32) (in0), (sat_val)); \
    in1 = (RTYPE) __msa_sat_s_w((v4i32) (in1), (sat_val)); \
}
#define SAT_SW2_SW(...) SAT_SW2(v4i32, __VA_ARGS__)

#define SAT_SW4(RTYPE, in0, in1, in2, in3, sat_val) \
{ \
    SAT_SW2(RTYPE, in0, in1, sat_val); \
    SAT_SW2(RTYPE, in2, in3, sat_val); \
}
#define SAT_SW4_SW(...) SAT_SW4(v4i32, __VA_ARGS__)
1758 
/* Description : Indexed halfword element values are replicated to all
                 elements in output vector
   Arguments   : Inputs  - in, idx0, idx1 (, idx2, idx3)
                 Outputs - out0, out1 (, out2, out3)
                 Return Type - as per RTYPE
   Details     : 'idx0' element value from 'in' vector is replicated to all
                 elements in 'out0' vector; likewise 'idxN' into 'outN'
                 Valid index range for halfword operation is 0-7
                 'in' is expanded once per output, so it must be free of
                 side effects
*/
#define SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1) \
{ \
    out0 = (RTYPE) __msa_splati_h((v8i16) (in), (idx0)); \
    out1 = (RTYPE) __msa_splati_h((v8i16) (in), (idx1)); \
}
#define SPLATI_H2_SB(...) SPLATI_H2(v16i8, __VA_ARGS__)
#define SPLATI_H2_SH(...) SPLATI_H2(v8i16, __VA_ARGS__)

#define SPLATI_H3(RTYPE, in, idx0, idx1, idx2, \
                  out0, out1, out2) \
{ \
    SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1); \
    out2 = (RTYPE) __msa_splati_h((v8i16) (in), (idx2)); \
}
#define SPLATI_H3_SB(...) SPLATI_H3(v16i8, __VA_ARGS__)
#define SPLATI_H3_SH(...) SPLATI_H3(v8i16, __VA_ARGS__)

#define SPLATI_H4(RTYPE, in, idx0, idx1, idx2, idx3, \
                  out0, out1, out2, out3) \
{ \
    SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1); \
    SPLATI_H2(RTYPE, in, idx2, idx3, out2, out3); \
}
#define SPLATI_H4_SB(...) SPLATI_H4(v16i8, __VA_ARGS__)
#define SPLATI_H4_SH(...) SPLATI_H4(v8i16, __VA_ARGS__)
1793 
/* Description : Indexed word element values are replicated to all
                 elements in output vector
   Arguments   : Inputs  - in, stidx
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : 'stidx' element value from 'in' vector is replicated to all
                 elements in 'out0' vector
                 'stidx + 1' element value from 'in' vector is replicated to
                 all elements in 'out1' vector
                 Valid index range for word operation is 0-3, so 'stidx'
                 itself must not exceed 2
                 SPLATI_W4 splats elements 0, 1, 2, 3 of 'in' into
                 out0, out1, out2, out3 respectively
*/
#define SPLATI_W2(RTYPE, in, stidx, out0, out1) \
{ \
    out0 = (RTYPE) __msa_splati_w((v4i32) (in), (stidx)); \
    out1 = (RTYPE) __msa_splati_w((v4i32) (in), ((stidx) + 1)); \
}
#define SPLATI_W2_SH(...) SPLATI_W2(v8i16, __VA_ARGS__)
#define SPLATI_W2_SW(...) SPLATI_W2(v4i32, __VA_ARGS__)

#define SPLATI_W4(RTYPE, in, out0, out1, out2, out3) \
{ \
    SPLATI_W2(RTYPE, in, 0, out0, out1); \
    SPLATI_W2(RTYPE, in, 2, out2, out3); \
}
#define SPLATI_W4_SH(...) SPLATI_W4(v8i16, __VA_ARGS__)
#define SPLATI_W4_SW(...) SPLATI_W4(v4i32, __VA_ARGS__)
1820 
/* Description : Pack even byte elements of vector pairs
   Arguments   : Inputs  - in0, in1, in2, in3 (, in4 ... in7)
                 Outputs - out0, out1 (, out2, out3)
                 Return Type - as per RTYPE
   Details     : Even byte elements of in0 are copied to the left half of
                 out0 & even byte elements of in1 are copied to the right
                 half of out0.
                 Even byte elements of in2 are copied to the left half of
                 out1 & even byte elements of in3 are copied to the right
                 half of out1.
                 PCKEV_B3 / PCKEV_B4 do the same for 3 / 4 input pairs
*/
#define PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1) \
{ \
    out0 = (RTYPE) __msa_pckev_b((v16i8) (in0), (v16i8) (in1)); \
    out1 = (RTYPE) __msa_pckev_b((v16i8) (in2), (v16i8) (in3)); \
}
#define PCKEV_B2_SB(...) PCKEV_B2(v16i8, __VA_ARGS__)
#define PCKEV_B2_UB(...) PCKEV_B2(v16u8, __VA_ARGS__)
#define PCKEV_B2_SH(...) PCKEV_B2(v8i16, __VA_ARGS__)
#define PCKEV_B2_SW(...) PCKEV_B2(v4i32, __VA_ARGS__)

#define PCKEV_B3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2) \
{ \
    PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1); \
    out2 = (RTYPE) __msa_pckev_b((v16i8) (in4), (v16i8) (in5)); \
}
#define PCKEV_B3_UB(...) PCKEV_B3(v16u8, __VA_ARGS__)
#define PCKEV_B3_SB(...) PCKEV_B3(v16i8, __VA_ARGS__)

#define PCKEV_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
                 out0, out1, out2, out3) \
{ \
    PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1); \
    PCKEV_B2(RTYPE, in4, in5, in6, in7, out2, out3); \
}
#define PCKEV_B4_SB(...) PCKEV_B4(v16i8, __VA_ARGS__)
#define PCKEV_B4_UB(...) PCKEV_B4(v16u8, __VA_ARGS__)
#define PCKEV_B4_SH(...) PCKEV_B4(v8i16, __VA_ARGS__)
#define PCKEV_B4_SW(...) PCKEV_B4(v4i32, __VA_ARGS__)
1860 
/* Description : Pack even halfword elements of vector pairs
   Arguments   : Inputs  - in0, in1, in2, in3 (, in4 ... in7)
                 Outputs - out0, out1 (, out2, out3)
                 Return Type - as per RTYPE
   Details     : Even halfword elements of in0 are copied to the left half of
                 out0 & even halfword elements of in1 are copied to the right
                 half of out0.
                 Even halfword elements of in2 are copied to the left half of
                 out1 & even halfword elements of in3 are copied to the right
                 half of out1.
                 PCKEV_H4 does the same for 4 input pairs
*/
#define PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1) \
{ \
    out0 = (RTYPE) __msa_pckev_h((v8i16) (in0), (v8i16) (in1)); \
    out1 = (RTYPE) __msa_pckev_h((v8i16) (in2), (v8i16) (in3)); \
}
#define PCKEV_H2_SH(...) PCKEV_H2(v8i16, __VA_ARGS__)
#define PCKEV_H2_SW(...) PCKEV_H2(v4i32, __VA_ARGS__)

#define PCKEV_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
                 out0, out1, out2, out3) \
{ \
    PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1); \
    PCKEV_H2(RTYPE, in4, in5, in6, in7, out2, out3); \
}
#define PCKEV_H4_SH(...) PCKEV_H4(v8i16, __VA_ARGS__)
#define PCKEV_H4_SW(...) PCKEV_H4(v4i32, __VA_ARGS__)
1888 
/* Description : Pack even double word elements of vector pairs
   Arguments   : Inputs  - in0, in1, in2, in3 (, in4 ... in7)
                 Outputs - out0, out1 (, out2, out3)
                 Return Type - as per RTYPE
   Details     : Even double word element of in0 is copied to the left half
                 of out0 & even double word element of in1 is copied to the
                 right half of out0.
                 Even double word element of in2 is copied to the left half
                 of out1 & even double word element of in3 is copied to the
                 right half of out1.
                 PCKEV_D4 does the same for 4 input pairs
*/
#define PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1) \
{ \
    out0 = (RTYPE) __msa_pckev_d((v2i64) (in0), (v2i64) (in1)); \
    out1 = (RTYPE) __msa_pckev_d((v2i64) (in2), (v2i64) (in3)); \
}
#define PCKEV_D2_UB(...) PCKEV_D2(v16u8, __VA_ARGS__)
#define PCKEV_D2_SB(...) PCKEV_D2(v16i8, __VA_ARGS__)
#define PCKEV_D2_SH(...) PCKEV_D2(v8i16, __VA_ARGS__)

#define PCKEV_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
                 out0, out1, out2, out3) \
{ \
    PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1); \
    PCKEV_D2(RTYPE, in4, in5, in6, in7, out2, out3); \
}
#define PCKEV_D4_UB(...) PCKEV_D4(v16u8, __VA_ARGS__)
1916 
/* Description : Pack odd double word elements of vector pairs
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : out0 = __msa_pckod_d(in0, in1): the odd (index 1) double
                 word element of in0 is copied to the left half of out0 &
                 the odd double word element of in1 is copied to the right
                 half of out0.
                 out1 = __msa_pckod_d(in2, in3), likewise.
                 Passing the same vector for both inputs of a pair therefore
                 replicates its index 1 double word element into both halves
*/
#define PCKOD_D2(RTYPE, in0, in1, in2, in3, out0, out1) \
{ \
    out0 = (RTYPE) __msa_pckod_d((v2i64) (in0), (v2i64) (in1)); \
    out1 = (RTYPE) __msa_pckod_d((v2i64) (in2), (v2i64) (in3)); \
}
#define PCKOD_D2_UB(...) PCKOD_D2(v16u8, __VA_ARGS__)
#define PCKOD_D2_SH(...) PCKOD_D2(v8i16, __VA_ARGS__)
#define PCKOD_D2_SD(...) PCKOD_D2(v2i64, __VA_ARGS__)
1934 
/* Description : Each byte element is logically xor'ed with immediate 128
   Arguments   : Inputs  - in0, in1 (, in2 ... in7)
                 Outputs - in0, in1 (, in2 ... in7) (in-place)
                 Return Type - as per RTYPE
   Details     : Each unsigned byte element from input vector 'in0' is
                 logically xor'ed with 128 and result is in-place stored in
                 'in0' vector
                 Each unsigned byte element from input vector 'in1' is
                 logically xor'ed with 128 and result is in-place stored in
                 'in1' vector
                 Similar for other pairs
                 XORI_B3_128 ... XORI_B8_128 are composed from the 2/3/4
                 vector variants and handle 3 ... 8 vectors
*/
#define XORI_B2_128(RTYPE, in0, in1) \
{ \
    in0 = (RTYPE) __msa_xori_b((v16u8) (in0), 128); \
    in1 = (RTYPE) __msa_xori_b((v16u8) (in1), 128); \
}
#define XORI_B2_128_UB(...) XORI_B2_128(v16u8, __VA_ARGS__)
#define XORI_B2_128_SB(...) XORI_B2_128(v16i8, __VA_ARGS__)
#define XORI_B2_128_SH(...) XORI_B2_128(v8i16, __VA_ARGS__)

#define XORI_B3_128(RTYPE, in0, in1, in2) \
{ \
    XORI_B2_128(RTYPE, in0, in1); \
    in2 = (RTYPE) __msa_xori_b((v16u8) (in2), 128); \
}
#define XORI_B3_128_SB(...) XORI_B3_128(v16i8, __VA_ARGS__)

#define XORI_B4_128(RTYPE, in0, in1, in2, in3) \
{ \
    XORI_B2_128(RTYPE, in0, in1); \
    XORI_B2_128(RTYPE, in2, in3); \
}
#define XORI_B4_128_UB(...) XORI_B4_128(v16u8, __VA_ARGS__)
#define XORI_B4_128_SB(...) XORI_B4_128(v16i8, __VA_ARGS__)
#define XORI_B4_128_SH(...) XORI_B4_128(v8i16, __VA_ARGS__)

#define XORI_B5_128(RTYPE, in0, in1, in2, in3, in4) \
{ \
    XORI_B3_128(RTYPE, in0, in1, in2); \
    XORI_B2_128(RTYPE, in3, in4); \
}
#define XORI_B5_128_SB(...) XORI_B5_128(v16i8, __VA_ARGS__)

#define XORI_B6_128(RTYPE, in0, in1, in2, in3, in4, in5) \
{ \
    XORI_B4_128(RTYPE, in0, in1, in2, in3); \
    XORI_B2_128(RTYPE, in4, in5); \
}
#define XORI_B6_128_SB(...) XORI_B6_128(v16i8, __VA_ARGS__)

#define XORI_B7_128(RTYPE, in0, in1, in2, in3, in4, in5, in6) \
{ \
    XORI_B4_128(RTYPE, in0, in1, in2, in3); \
    XORI_B3_128(RTYPE, in4, in5, in6); \
}
#define XORI_B7_128_SB(...) XORI_B7_128(v16i8, __VA_ARGS__)

#define XORI_B8_128(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7) \
{ \
    XORI_B4_128(RTYPE, in0, in1, in2, in3); \
    XORI_B4_128(RTYPE, in4, in5, in6, in7); \
}
#define XORI_B8_128_SB(...) XORI_B8_128(v16i8, __VA_ARGS__)
#define XORI_B8_128_UB(...) XORI_B8_128(v16u8, __VA_ARGS__)
2000 
/* Description : Addition of signed halfword elements and signed saturation
   Arguments   : Inputs  - in0, in1, in2, in3 (, in4 ... in7)
                 Outputs - out0, out1 (, out2, out3)
                 Return Type - as per RTYPE
   Details     : Signed halfword elements from 'in0' are added to signed
                 halfword elements of 'in1'. The result is then signed
                 saturated between -32768 to +32767 (as per halfword data
                 type) and written to 'out0'
                 Similar for other pairs
                 ADDS_SH4 does the same for 4 input pairs
*/
#define ADDS_SH2(RTYPE, in0, in1, in2, in3, out0, out1) \
{ \
    out0 = (RTYPE) __msa_adds_s_h((v8i16) (in0), (v8i16) (in1)); \
    out1 = (RTYPE) __msa_adds_s_h((v8i16) (in2), (v8i16) (in3)); \
}
#define ADDS_SH2_SH(...) ADDS_SH2(v8i16, __VA_ARGS__)

#define ADDS_SH4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
                 out0, out1, out2, out3) \
{ \
    ADDS_SH2(RTYPE, in0, in1, in2, in3, out0, out1); \
    ADDS_SH2(RTYPE, in4, in5, in6, in7, out2, out3); \
}
#define ADDS_SH4_UH(...) ADDS_SH4(v8u16, __VA_ARGS__)
#define ADDS_SH4_SH(...) ADDS_SH4(v8i16, __VA_ARGS__)
2025 
/* Description : Shift left all elements of vector (generic for all data types)
   Arguments   : Inputs  - in0, in1, in2, in3, shift
                 Outputs - in0, in1, in2, in3 (in place)
                 Return Type - as per input vector RTYPE
   Details     : Each element of vector 'in0' is left shifted by 'shift' and
                 result is in place written to 'in0'
                 Similar for other pairs
                 The plain C '<<' operator is used, and 'shift' is
                 parenthesized so that an expression argument such as
                 'n & 3' is applied as a whole
*/
#define SLLI_2V(in0, in1, shift) \
{ \
    in0 = (in0) << (shift); \
    in1 = (in1) << (shift); \
}
#define SLLI_4V(in0, in1, in2, in3, shift) \
{ \
    SLLI_2V(in0, in1, shift); \
    SLLI_2V(in2, in3, shift); \
}
2046 
/* Description : Arithmetic shift right all elements of vector
                 (generic for all data types)
   Arguments   : Inputs  - in0, in1, in2, in3, shift
                 Outputs - in0, in1, in2, in3 (in place)
                 Return Type - as per input vector RTYPE
   Details     : Each element of vector 'in0' is right shifted by 'shift' and
                 result is in place written to 'in0'
                 Here, 'shift' is GP variable passed in
                 Similar for other pairs
                 The plain C '>>' operator is used, and 'shift' is
                 parenthesized so that an expression argument such as
                 'n & 3' is applied as a whole
*/
#define SRA_4V(in0, in1, in2, in3, shift) \
{ \
    in0 = (in0) >> (shift); \
    in1 = (in1) >> (shift); \
    in2 = (in2) >> (shift); \
    in3 = (in3) >> (shift); \
}
2064 
/* Description : Shift right logical all halfword elements of vector
   Arguments   : Inputs  - in0, in1, in2, in3, shift
                 Outputs - in0, in1, in2, in3 (in place)
                 Return Type - as per RTYPE
   Details     : Each element of vector 'in0' is shifted right logical by
                 number of bits respective element holds in vector 'shift' and
                 result is in place written to 'in0'
                 Here, 'shift' is a vector passed in
                 Similar for other pairs
*/
#define SRL_H4(RTYPE, in0, in1, in2, in3, shift) \
{ \
    in0 = (RTYPE) __msa_srl_h((v8i16) (in0), (v8i16) (shift)); \
    in1 = (RTYPE) __msa_srl_h((v8i16) (in1), (v8i16) (shift)); \
    in2 = (RTYPE) __msa_srl_h((v8i16) (in2), (v8i16) (shift)); \
    in3 = (RTYPE) __msa_srl_h((v8i16) (in3), (v8i16) (shift)); \
}
#define SRL_H4_UH(...) SRL_H4(v8u16, __VA_ARGS__)

/* Description : Shift right logical rounded, all halfword elements of vector
   Arguments   : Inputs  - in0, in1, in2, in3 (, in4 ... in7), shift
                 Outputs - in0, in1, in2, in3 (, in4 ... in7) (in place)
                 Return Type - as per RTYPE
   Details     : Same as SRL_H4 but through __msa_srlr_h, the rounding
                 variant of the logical right shift
                 Here, 'shift' is a vector passed in
                 SRLR_H8 applies SRLR_H4 to two groups of 4 vectors
*/
#define SRLR_H4(RTYPE, in0, in1, in2, in3, shift) \
{ \
    in0 = (RTYPE) __msa_srlr_h((v8i16) (in0), (v8i16) (shift)); \
    in1 = (RTYPE) __msa_srlr_h((v8i16) (in1), (v8i16) (shift)); \
    in2 = (RTYPE) __msa_srlr_h((v8i16) (in2), (v8i16) (shift)); \
    in3 = (RTYPE) __msa_srlr_h((v8i16) (in3), (v8i16) (shift)); \
}
#define SRLR_H4_UH(...) SRLR_H4(v8u16, __VA_ARGS__)
#define SRLR_H4_SH(...) SRLR_H4(v8i16, __VA_ARGS__)

#define SRLR_H8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, shift) \
{ \
    SRLR_H4(RTYPE, in0, in1, in2, in3, shift); \
    SRLR_H4(RTYPE, in4, in5, in6, in7, shift); \
}
#define SRLR_H8_UH(...) SRLR_H8(v8u16, __VA_ARGS__)
#define SRLR_H8_SH(...) SRLR_H8(v8i16, __VA_ARGS__)
2101 
/* Description : Shift right arithmetic rounded halfwords
   Arguments   : Inputs  - in0, in1 (, in2, in3), shift
                 Outputs - in0, in1 (, in2, in3) (in place)
                 Return Type - as per RTYPE
   Details     : Each element of vector 'in0' is shifted right arithmetic by
                 number of bits respective element holds in vector 'shift'.
                 The last discarded bit is added to shifted value for rounding
                 and the result is in place written to 'in0'
                 Here, 'shift' is a vector passed in
                 Similar for other pairs
                 SRAR_H3 / SRAR_H4 extend SRAR_H2 to 3 / 4 vectors
*/
#define SRAR_H2(RTYPE, in0, in1, shift) \
{ \
    in0 = (RTYPE) __msa_srar_h((v8i16) (in0), (v8i16) (shift)); \
    in1 = (RTYPE) __msa_srar_h((v8i16) (in1), (v8i16) (shift)); \
}
#define SRAR_H2_UH(...) SRAR_H2(v8u16, __VA_ARGS__)
#define SRAR_H2_SH(...) SRAR_H2(v8i16, __VA_ARGS__)

#define SRAR_H3(RTYPE, in0, in1, in2, shift) \
{ \
    SRAR_H2(RTYPE, in0, in1, shift); \
    in2 = (RTYPE) __msa_srar_h((v8i16) (in2), (v8i16) (shift)); \
}
#define SRAR_H3_SH(...) SRAR_H3(v8i16, __VA_ARGS__)

#define SRAR_H4(RTYPE, in0, in1, in2, in3, shift) \
{ \
    SRAR_H2(RTYPE, in0, in1, shift); \
    SRAR_H2(RTYPE, in2, in3, shift); \
}
#define SRAR_H4_UH(...) SRAR_H4(v8u16, __VA_ARGS__)
#define SRAR_H4_SH(...) SRAR_H4(v8i16, __VA_ARGS__)
2135 
/* Description : Shift right arithmetic rounded words
   Arguments   : Inputs  - in0, in1 (, in2, in3), shift
                 Outputs - in0, in1 (, in2, in3) (in place)
                 Return Type - as per RTYPE
   Details     : Each element of vector 'in0' is shifted right arithmetic by
                 number of bits respective element holds in vector 'shift'.
                 The last discarded bit is added to shifted value for rounding
                 and the result is in place written to 'in0'
                 Here, 'shift' is a vector passed in
                 Similar for other pairs
                 SRAR_W4 applies SRAR_W2 to 2 vector pairs
*/
#define SRAR_W2(RTYPE, in0, in1, shift) \
{ \
    in0 = (RTYPE) __msa_srar_w((v4i32) (in0), (v4i32) (shift)); \
    in1 = (RTYPE) __msa_srar_w((v4i32) (in1), (v4i32) (shift)); \
}
#define SRAR_W2_SW(...) SRAR_W2(v4i32, __VA_ARGS__)

#define SRAR_W4(RTYPE, in0, in1, in2, in3, shift) \
{ \
    SRAR_W2(RTYPE, in0, in1, shift); \
    SRAR_W2(RTYPE, in2, in3, shift); \
}
#define SRAR_W4_SW(...) SRAR_W4(v4i32, __VA_ARGS__)
2160 
/* Description : Shift right arithmetic rounded halfwords (immediate)
   Arguments   : Inputs  - in0, in1, in2, in3, shift
                 Outputs - in0, in1, in2, in3 (in place)
                 Return Type - as per RTYPE
   Details     : Each element of vector 'in0' is shifted right arithmetic by
                 value in 'shift'.
                 The last discarded bit is added to shifted value for rounding
                 and the result is in place written to 'in0'
                 Similar for other pairs
                 SRARI_H4 applies SRARI_H2 to 2 vector pairs
*/
#define SRARI_H2(RTYPE, in0, in1, shift) \
{ \
    in0 = (RTYPE) __msa_srari_h((v8i16) (in0), (shift)); \
    in1 = (RTYPE) __msa_srari_h((v8i16) (in1), (shift)); \
}
#define SRARI_H2_UH(...) SRARI_H2(v8u16, __VA_ARGS__)
#define SRARI_H2_SH(...) SRARI_H2(v8i16, __VA_ARGS__)

#define SRARI_H4(RTYPE, in0, in1, in2, in3, shift) \
{ \
    SRARI_H2(RTYPE, in0, in1, shift); \
    SRARI_H2(RTYPE, in2, in3, shift); \
}
#define SRARI_H4_UH(...) SRARI_H4(v8u16, __VA_ARGS__)
#define SRARI_H4_SH(...) SRARI_H4(v8i16, __VA_ARGS__)
2186 
/* Description : Shift right arithmetic rounded words (immediate)
   Arguments   : Inputs  - in0, in1 (, in2, in3), shift
                 Outputs - in0, in1 (, in2, in3) (in place)
                 Return Type - as per RTYPE
   Details     : Each element of vector 'in0' is shifted right arithmetic by
                 value in 'shift'.
                 The last discarded bit is added to shifted value for rounding
                 and the result is in place written to 'in0'
                 Similar for other pairs
                 SRARI_W4 applies SRARI_W2 to 2 vector pairs
*/
#define SRARI_W2(RTYPE, in0, in1, shift) \
{ \
    in0 = (RTYPE) __msa_srari_w((v4i32) (in0), (shift)); \
    in1 = (RTYPE) __msa_srari_w((v4i32) (in1), (shift)); \
}
#define SRARI_W2_SW(...) SRARI_W2(v4i32, __VA_ARGS__)

#define SRARI_W4(RTYPE, in0, in1, in2, in3, shift) \
{ \
    SRARI_W2(RTYPE, in0, in1, shift); \
    SRARI_W2(RTYPE, in2, in3, shift); \
}
#define SRARI_W4_SH(...) SRARI_W4(v8i16, __VA_ARGS__)
#define SRARI_W4_SW(...) SRARI_W4(v4i32, __VA_ARGS__)
2211 
/* Description : Multiplication of pairs of vectors
   Arguments   : Inputs  - in0, in1, in2, in3 (, in4 ... in7)
                 Outputs - out0, out1 (, out2, out3)
   Details     : Each element from 'in0' is multiplied with elements from 'in1'
                 and result is written to 'out0'
                 Similar for other pairs
                 Inputs are parenthesized, so expression arguments such as
                 'a + b' are multiplied as a whole
                 MUL4 applies MUL2 to 2 groups of pairs
*/
#define MUL2(in0, in1, in2, in3, out0, out1) \
{ \
    out0 = (in0) * (in1); \
    out1 = (in2) * (in3); \
}
#define MUL4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3) \
{ \
    MUL2(in0, in1, in2, in3, out0, out1); \
    MUL2(in4, in5, in6, in7, out2, out3); \
}
2229 
/* Description : Addition of 2 pairs of vectors
   Arguments   : Inputs  - in0, in1, in2, in3 (, in4 ... in7)
                 Outputs - out0, out1 (, out2, out3)
   Details     : Each element from 2 pairs vectors is added and 2 results are
                 produced: out0 = in0 + in1, out1 = in2 + in3
                 Inputs are parenthesized, so expression arguments such as
                 'a << 1' are added as a whole
                 ADD4 applies ADD2 to 2 groups of pairs
*/
#define ADD2(in0, in1, in2, in3, out0, out1) \
{ \
    out0 = (in0) + (in1); \
    out1 = (in2) + (in3); \
}
#define ADD4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3) \
{ \
    ADD2(in0, in1, in2, in3, out0, out1); \
    ADD2(in4, in5, in6, in7, out2, out3); \
}
2246 
/* Description : Subtraction of 2 pairs of vectors
   Arguments   : Inputs  - in0, in1, in2, in3 (, in4 ... in7)
                 Outputs - out0, out1 (, out2, out3)
   Details     : Each element from 2 pairs vectors is subtracted and 2 results
                 are produced: out0 = in0 - in1, out1 = in2 - in3
                 Inputs are parenthesized, so an expression subtrahend such
                 as 'a - b' is subtracted as a whole
                 SUB4 applies SUB2 to 2 groups of pairs
*/
#define SUB2(in0, in1, in2, in3, out0, out1) \
{ \
    out0 = (in0) - (in1); \
    out1 = (in2) - (in3); \
}
#define SUB4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3) \
{ \
    SUB2(in0, in1, in2, in3, out0, out1); \
    SUB2(in4, in5, in6, in7, out2, out3); \
}
2265 
/* Description : Sign extend byte elements from right half of the vector
   Arguments   : Input  - in  (byte vector)
                 Output - out (sign extended halfword vector)
                 Return Type - signed halfword
   Details     : A per-byte 'in < 0' mask is built with __msa_clti_s_b and
                 right-interleaved with 'in' itself, which generates
                 8 halfword elements keeping sign intact
                 'in' is expanded twice; the macro declares a local 'sign_m'
*/
#define UNPCK_R_SB_SH(in, out) \
{ \
    v16i8 sign_m; \
    \
    sign_m = __msa_clti_s_b((v16i8) (in), 0); \
    out = (v8i16) __msa_ilvr_b(sign_m, (v16i8) (in)); \
}
2281 
/* Description : Sign extend halfword elements from right half of the vector
   Arguments   : Input  - in  (input halfword vector)
                 Output - out (sign extended word vector)
                 Return Type - signed word
   Details     : A per-halfword 'in < 0' mask is built with __msa_clti_s_h
                 and right-interleaved with 'in' itself, which generates
                 4 word elements keeping sign intact
                 'in' is expanded twice; the macro declares a local 'sign_m'
*/
#define UNPCK_R_SH_SW(in, out) \
{ \
    v8i16 sign_m; \
    \
    sign_m = __msa_clti_s_h((v8i16) (in), 0); \
    out = (v4i32) __msa_ilvr_h(sign_m, (v8i16) (in)); \
}
2297 
/* Description : Sign extend byte elements from input vector and return
                 halfword results in pair of vectors
   Arguments   : Input   - in (1 input byte vector)
                 Outputs - out0, out1 (sign extended 2 halfword vectors)
                 Return Type - signed halfword
   Details     : A per-byte 'in < 0' mask is built with __msa_clti_s_b and
                 interleaved right with 'in' to generate 8 signed halfword
                 elements in 'out0'
                 Then interleaved left with 'in' to generate 8 signed
                 halfword elements in 'out1'
                 The macro declares a local 'tmp_m'
*/
#define UNPCK_SB_SH(in, out0, out1) \
{ \
    v16i8 tmp_m; \
    \
    tmp_m = __msa_clti_s_b((v16i8) (in), 0); \
    ILVRL_B2_SH(tmp_m, in, out0, out1); \
}
2316 
/* Description : Zero extend unsigned byte elements to halfword elements
   Arguments   : Input   - in (1 input unsigned byte vector)
                 Outputs - out0, out1 (unsigned 2 halfword vectors)
                 Return Type - signed halfword
   Details     : 'in' is interleaved with an all-zero byte vector
                 Zero extended right half of vector is returned in 'out0'
                 Zero extended left half of vector is returned in 'out1'
                 The macro declares a local 'zero_m'
*/
#define UNPCK_UB_SH(in, out0, out1) \
{ \
    v16i8 zero_m = { 0 }; \
    \
    ILVRL_B2_SH(zero_m, in, out0, out1); \
}
2330 
/* Description : Sign extend halfword elements from input vector and return
                 result in pair of vectors
   Arguments   : Input   - in (1 input halfword vector)
                 Outputs - out0, out1 (sign extended 2 word vectors)
                 Return Type - signed word
   Details     : A per-halfword 'in < 0' mask is built with __msa_clti_s_h
                 and interleaved right with 'in' to generate 4 signed word
                 elements in 'out0'
                 Then interleaved left with 'in' to generate 4 signed word
                 elements in 'out1'
                 The macro declares a local 'tmp_m'
*/
#define UNPCK_SH_SW(in, out0, out1) \
{ \
    v8i16 tmp_m; \
    \
    tmp_m = __msa_clti_s_h((v8i16) (in), 0); \
    ILVRL_H2_SW(tmp_m, in, out0, out1); \
}
2349 
/* Description : Swap two variables
   Arguments   : Inputs  - in0, in1
                 Outputs - in0, in1 (in-place)
   Details     : The values of 'in0' and 'in1' are exchanged through a
                 temporary of the type of 'in0'.
                 A temporary is used instead of the xor trick because xor
                 swapping zeroes the value when both arguments designate
                 the same object (e.g. SWAP(v[i], v[j]) with i == j)
                 Both arguments must be modifiable lvalues; the macro
                 declares a local 'swap_tmp_m'
*/
#define SWAP(in0, in1) \
{ \
    __typeof__(in0) swap_tmp_m = (in0); \
    \
    (in0) = (in1); \
    (in1) = swap_tmp_m; \
}
2361 
/* Description : Butterfly of 4 input vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1, out2, out3
   Details     : Butterfly operation
                 out0 = in0 + in3, out1 = in1 + in2,
                 out2 = in1 - in2, out3 = in0 - in3
                 Outputs are assigned in that order and every input is read
                 twice, so an output must not alias an input that a later
                 statement still reads
*/
#define BUTTERFLY_4(in0, in1, in2, in3, out0, out1, out2, out3) \
{ \
    out0 = (in0) + (in3); \
    out1 = (in1) + (in2); \
    \
    out2 = (in1) - (in2); \
    out3 = (in0) - (in3); \
}
2375 
/* Description : Butterfly of 8 input vectors
   Arguments   : Inputs  - in0 ... in7
                 Outputs - out0 .. out7
   Details     : Butterfly operation
                 out(k)     = in(k) + in(7 - k) for k = 0 .. 3
                 out(7 - k) = in(k) - in(7 - k) for k = 0 .. 3
                 All sums are assigned before the differences and every
                 input is read twice, so an output must not alias an input
                 that a later statement still reads
*/
#define BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7, \
                    out0, out1, out2, out3, out4, out5, out6, out7) \
{ \
    out0 = (in0) + (in7); \
    out1 = (in1) + (in6); \
    out2 = (in2) + (in5); \
    out3 = (in3) + (in4); \
    \
    out4 = (in3) - (in4); \
    out5 = (in2) - (in5); \
    out6 = (in1) - (in6); \
    out7 = (in0) - (in7); \
}
2394 
/* Description : Butterfly of 16 input vectors
   Arguments   : Inputs  - in0 ... in15
                 Outputs - out0 .. out15
   Details     : Butterfly operation
                 out(k)      = in(k) + in(15 - k) for k = 0 .. 7
                 out(15 - k) = in(k) - in(15 - k) for k = 0 .. 7
                 All sums are assigned before the differences and every
                 input is read twice, so an output must not alias an input
                 that a later statement still reads
*/
#define BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, \
                     in8, in9, in10, in11, in12, in13, in14, in15, \
                     out0, out1, out2, out3, out4, out5, out6, out7, \
                     out8, out9, out10, out11, out12, out13, out14, out15) \
{ \
    out0 = (in0) + (in15); \
    out1 = (in1) + (in14); \
    out2 = (in2) + (in13); \
    out3 = (in3) + (in12); \
    out4 = (in4) + (in11); \
    out5 = (in5) + (in10); \
    out6 = (in6) + (in9); \
    out7 = (in7) + (in8); \
    \
    out8 = (in7) - (in8); \
    out9 = (in6) - (in9); \
    out10 = (in5) - (in10); \
    out11 = (in4) - (in11); \
    out12 = (in3) - (in12); \
    out13 = (in2) - (in13); \
    out14 = (in1) - (in14); \
    out15 = (in0) - (in15); \
}
2423 
/* Description : Transposes input 4x4 byte block
   Arguments   : Inputs  - in0, in1, in2, in3      (input 4x4 byte block)
                 Outputs - out0, out1, out2, out3  (output 4x4 byte block)
                 Return Type - unsigned byte
   Details     : The four inputs are combined by double word and byte
                 interleaves into a single vector written to 'out0'.
                 'out1', 'out2' and 'out3' are each produced from the
                 previous output by __msa_sldi_b with a zero vector and a
                 byte count of 4, so the statement order must be kept
*/
#define TRANSPOSE4x4_UB_UB(in0, in1, in2, in3, out0, out1, out2, out3) \
{ \
    v16i8 zero_m = { 0 }; \
    v16i8 s0_m, s1_m, s2_m, s3_m; \
    \
    ILVR_D2_SB(in1, in0, in3, in2, s0_m, s1_m); \
    ILVRL_B2_SB(s1_m, s0_m, s2_m, s3_m); \
    \
    out0 = (v16u8) __msa_ilvr_b(s3_m, s2_m); \
    out1 = (v16u8) __msa_sldi_b(zero_m, (v16i8) out0, 4); \
    out2 = (v16u8) __msa_sldi_b(zero_m, (v16i8) out1, 4); \
    out3 = (v16u8) __msa_sldi_b(zero_m, (v16i8) out2, 4); \
}
2443 
/* Description : Transposes input 8x4 byte block into 4x8
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                           (input 8x4 byte block)
                 Outputs - out0, out1, out2, out3 (output 4x8 byte block)
                 Return Type - as per RTYPE
   Details     : Rows are paired (in0/in4, in1/in5, in2/in6, in3/in7) with
                 even word interleaves, then merged through byte, halfword
                 and word interleaves into 'out0' and 'out2'.
                 'out1' and 'out3' are derived from 'out0' and 'out2' with
                 __msa_ilvl_d, so they must be computed last
*/
#define TRANSPOSE8x4_UB(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
                        out0, out1, out2, out3) \
{ \
    v16i8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
    \
    ILVEV_W2_SB(in0, in4, in1, in5, tmp0_m, tmp1_m); \
    tmp2_m = __msa_ilvr_b(tmp1_m, tmp0_m); \
    ILVEV_W2_SB(in2, in6, in3, in7, tmp0_m, tmp1_m); \
    \
    tmp3_m = __msa_ilvr_b(tmp1_m, tmp0_m); \
    ILVRL_H2_SB(tmp3_m, tmp2_m, tmp0_m, tmp1_m); \
    \
    ILVRL_W2(RTYPE, tmp1_m, tmp0_m, out0, out2); \
    out1 = (RTYPE) __msa_ilvl_d((v2i64) out2, (v2i64) out0); \
    out3 = (RTYPE) __msa_ilvl_d((v2i64) out0, (v2i64) out2); \
}
#define TRANSPOSE8x4_UB_UB(...) TRANSPOSE8x4_UB(v16u8, __VA_ARGS__)
#define TRANSPOSE8x4_UB_UH(...) TRANSPOSE8x4_UB(v8u16, __VA_ARGS__)
2468 
/* Description : Transposes input 8x8 byte block
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                           (input 8x8 byte block)
                 Outputs - out0, out1, out2, out3, out4, out5, out6, out7
                           (output 8x8 byte block)
                 Return Type - as per RTYPE
   Details     : Rows are merged through two rounds of byte interleaves and
                 one round of word interleaves into the even outputs
                 (out0, out2, out4, out6).
                 The odd outputs are then derived from the even ones by
                 SLDI_B2_0 with a byte count of 8, so they must be computed
                 last
*/
#define TRANSPOSE8x8_UB(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
                        out0, out1, out2, out3, out4, out5, out6, out7) \
{ \
    v16i8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
    v16i8 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \
    \
    ILVR_B4_SB(in2, in0, in3, in1, in6, in4, in7, in5, \
               tmp0_m, tmp1_m, tmp2_m, tmp3_m); \
    ILVRL_B2_SB(tmp1_m, tmp0_m, tmp4_m, tmp5_m); \
    ILVRL_B2_SB(tmp3_m, tmp2_m, tmp6_m, tmp7_m); \
    ILVRL_W2(RTYPE, tmp6_m, tmp4_m, out0, out2); \
    ILVRL_W2(RTYPE, tmp7_m, tmp5_m, out4, out6); \
    SLDI_B2_0(RTYPE, out0, out2, out1, out3, 8); \
    SLDI_B2_0(RTYPE, out4, out6, out5, out7, 8); \
}
#define TRANSPOSE8x8_UB_UB(...) TRANSPOSE8x8_UB(v16u8, __VA_ARGS__)
#define TRANSPOSE8x8_UB_UH(...) TRANSPOSE8x8_UB(v8u16, __VA_ARGS__)
2494 
/* Description : Transposes 16x4 block into 4x16 with byte elements in vectors
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7,
                           in8, in9, in10, in11, in12, in13, in14, in15
                 Outputs - out0, out1, out2, out3
                 Return Type - unsigned byte
   Details     : Rows k, k+4, k+8 and k+12 (k = 0 .. 3) are gathered into one
                 vector each by even word and even double word interleaves;
                 the four gathered vectors are then split by even/odd byte
                 and halfword interleaves into the outputs.
                 'out1' and 'out3' are used as scratch for the first two
                 gathered vectors and are overwritten with their final
                 values at the end, so the statement order must be kept
*/
#define TRANSPOSE16x4_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7, \
                            in8, in9, in10, in11, in12, in13, in14, in15, \
                            out0, out1, out2, out3) \
{ \
    v2i64 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
    \
    ILVEV_W2_SD(in0, in4, in8, in12, tmp0_m, tmp1_m); \
    out1 = (v16u8) __msa_ilvev_d(tmp1_m, tmp0_m); \
    \
    ILVEV_W2_SD(in1, in5, in9, in13, tmp0_m, tmp1_m); \
    out3 = (v16u8) __msa_ilvev_d(tmp1_m, tmp0_m); \
    \
    ILVEV_W2_SD(in2, in6, in10, in14, tmp0_m, tmp1_m); \
    \
    tmp2_m = __msa_ilvev_d(tmp1_m, tmp0_m); \
    ILVEV_W2_SD(in3, in7, in11, in15, tmp0_m, tmp1_m); \
    \
    tmp3_m = __msa_ilvev_d(tmp1_m, tmp0_m); \
    ILVEV_B2_SD(out1, out3, tmp2_m, tmp3_m, tmp0_m, tmp1_m); \
    out0 = (v16u8) __msa_ilvev_h((v8i16) tmp1_m, (v8i16) tmp0_m); \
    out2 = (v16u8) __msa_ilvod_h((v8i16) tmp1_m, (v8i16) tmp0_m); \
    \
    tmp0_m = (v2i64) __msa_ilvod_b((v16i8) out3, (v16i8) out1); \
    tmp1_m = (v2i64) __msa_ilvod_b((v16i8) tmp3_m, (v16i8) tmp2_m); \
    out1 = (v16u8) __msa_ilvev_h((v8i16) tmp1_m, (v8i16) tmp0_m); \
    out3 = (v16u8) __msa_ilvod_h((v8i16) tmp1_m, (v8i16) tmp0_m); \
}
2529 
2530 /* Description : Transposes 16x8 block into 8x16 with byte elements in vectors
2531  Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7,
2532  in8, in9, in10, in11, in12, in13, in14, in15
2533  Outputs - out0, out1, out2, out3, out4, out5, out6, out7
2534  Return Type - unsigned byte
2535  Details : Inputs inN and in(N+8) are first combined with even doubleword interleaves into the output variables, which serve as scratch. Even/odd byte, halfword and word interleaves then produce the final out0..out7. Statement order matters because out5 and out7 are reused as scratch before receiving their final values.
2536 */
2537 #define TRANSPOSE16x8_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7, \
2538  in8, in9, in10, in11, in12, in13, in14, in15, \
2539  out0, out1, out2, out3, out4, out5, out6, out7) \
2540 { \
2541  v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
2542  v16u8 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \
2543  /* Scratch use of the outputs, in reverse order: out7 <- (in0,in8) ... out0 <- (in7,in15). */ \
2544  ILVEV_D2_UB(in0, in8, in1, in9, out7, out6); \
2545  ILVEV_D2_UB(in2, in10, in3, in11, out5, out4); \
2546  ILVEV_D2_UB(in4, in12, in5, in13, out3, out2); \
2547  ILVEV_D2_UB(in6, in14, in7, in15, out1, out0); \
2548  /* Even bytes go to tmp0_m, tmp1_m, out5, out7; odd bytes go to tmp4_m..tmp7_m. */ \
2549  tmp0_m = (v16u8) __msa_ilvev_b((v16i8) out6, (v16i8) out7); \
2550  tmp4_m = (v16u8) __msa_ilvod_b((v16i8) out6, (v16i8) out7); \
2551  tmp1_m = (v16u8) __msa_ilvev_b((v16i8) out4, (v16i8) out5); \
2552  tmp5_m = (v16u8) __msa_ilvod_b((v16i8) out4, (v16i8) out5); \
2553  out5 = (v16u8) __msa_ilvev_b((v16i8) out2, (v16i8) out3); \
2554  tmp6_m = (v16u8) __msa_ilvod_b((v16i8) out2, (v16i8) out3); \
2555  out7 = (v16u8) __msa_ilvev_b((v16i8) out0, (v16i8) out1); \
2556  tmp7_m = (v16u8) __msa_ilvod_b((v16i8) out0, (v16i8) out1); \
2557  /* Even halfwords of the even-byte vectors -> out0 and out4. */ \
2558  ILVEV_H2_UB(tmp0_m, tmp1_m, out5, out7, tmp2_m, tmp3_m); \
2559  out0 = (v16u8) __msa_ilvev_w((v4i32) tmp3_m, (v4i32) tmp2_m); \
2560  out4 = (v16u8) __msa_ilvod_w((v4i32) tmp3_m, (v4i32) tmp2_m); \
2561  /* Odd halfwords of the even-byte vectors -> out2 and out6 (last read of scratch out5/out7). */ \
2562  tmp2_m = (v16u8) __msa_ilvod_h((v8i16) tmp1_m, (v8i16) tmp0_m); \
2563  tmp3_m = (v16u8) __msa_ilvod_h((v8i16) out7, (v8i16) out5); \
2564  out2 = (v16u8) __msa_ilvev_w((v4i32) tmp3_m, (v4i32) tmp2_m); \
2565  out6 = (v16u8) __msa_ilvod_w((v4i32) tmp3_m, (v4i32) tmp2_m); \
2566  /* Even halfwords of the odd-byte vectors -> out1 and out5. */ \
2567  ILVEV_H2_UB(tmp4_m, tmp5_m, tmp6_m, tmp7_m, tmp2_m, tmp3_m); \
2568  out1 = (v16u8) __msa_ilvev_w((v4i32) tmp3_m, (v4i32) tmp2_m); \
2569  out5 = (v16u8) __msa_ilvod_w((v4i32) tmp3_m, (v4i32) tmp2_m); \
2570  \
2571  /* Odd halfwords of the odd-byte vectors -> out3 and out7. */ \
2572  tmp2_m = (v16u8) __msa_ilvod_h((v8i16) tmp5_m, (v8i16) tmp4_m); \
2573  tmp3_m = (v16u8) __msa_ilvod_h((v8i16) tmp7_m, (v8i16) tmp6_m); \
2574  \
2575  out3 = (v16u8) __msa_ilvev_w((v4i32) tmp3_m, (v4i32) tmp2_m); \
2576  out7 = (v16u8) __msa_ilvod_w((v4i32) tmp3_m, (v4i32) tmp2_m); \
2577 }
2578 
2579 /* Description : Transposes 4x4 block with half word elements in vectors
2580  Arguments : Inputs - in0, in1, in2, in3
2581  Outputs - out0, out1, out2, out3
2582  Return Type - signed halfword
2583  Details : (in0,in1) and (in2,in3) are halfword-interleaved and the two results word-interleaved into out0 and out2. out1 and out3 are then formed with __msa_ilvl_d from the left (upper) doublewords of out0 and out2.
2584 */
2585 #define TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, out0, out1, out2, out3) \
2586 { \
2587  v8i16 s0_m, s1_m; \
2588  \
2589  ILVR_H2_SH(in1, in0, in3, in2, s0_m, s1_m); \
2590  ILVRL_W2_SH(s1_m, s0_m, out0, out2); \
2591  out1 = (v8i16) __msa_ilvl_d((v2i64) out0, (v2i64) out0); /* both operands are out0 */ \
2592  out3 = (v8i16) __msa_ilvl_d((v2i64) out0, (v2i64) out2); /* pairs out0 with out2 */ \
2593 }
2594 
2595 /* Description : Transposes 8x8 block with half word elements in vectors
2596  Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7
2597  Outputs - out0, out1, out2, out3, out4, out5, out6, out7
2598  Return Type - as per RTYPE
2599  Details : in4..in7 are halfword-interleaved in two stages into tmp0_m..tmp3_m, and in0..in3 likewise into tmp4_m..tmp7_m. Even outputs pack the even doublewords of each (tmpN, tmpN+4) pair and odd outputs pack the odd doublewords. No output is written until every input has been read.
2600 */
2601 #define TRANSPOSE8x8_H(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
2602  out0, out1, out2, out3, out4, out5, out6, out7) \
2603 { \
2604  v8i16 s0_m, s1_m; \
2605  v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
2606  v8i16 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \
2607  /* s0_m/s1_m are scratch, refilled before each ILVRL_H2_SH. */ \
2608  ILVR_H2_SH(in6, in4, in7, in5, s0_m, s1_m); \
2609  ILVRL_H2_SH(s1_m, s0_m, tmp0_m, tmp1_m); \
2610  ILVL_H2_SH(in6, in4, in7, in5, s0_m, s1_m); \
2611  ILVRL_H2_SH(s1_m, s0_m, tmp2_m, tmp3_m); \
2612  ILVR_H2_SH(in2, in0, in3, in1, s0_m, s1_m); \
2613  ILVRL_H2_SH(s1_m, s0_m, tmp4_m, tmp5_m); \
2614  ILVL_H2_SH(in2, in0, in3, in1, s0_m, s1_m); \
2615  ILVRL_H2_SH(s1_m, s0_m, tmp6_m, tmp7_m); \
2616  PCKEV_D4(RTYPE, tmp0_m, tmp4_m, tmp1_m, tmp5_m, tmp2_m, tmp6_m, \
2617  tmp3_m, tmp7_m, out0, out2, out4, out6); /* even outputs */ \
2618  out1 = (RTYPE) __msa_pckod_d((v2i64) tmp0_m, (v2i64) tmp4_m); /* odd outputs use the same operand pairs */ \
2619  out3 = (RTYPE) __msa_pckod_d((v2i64) tmp1_m, (v2i64) tmp5_m); \
2620  out5 = (RTYPE) __msa_pckod_d((v2i64) tmp2_m, (v2i64) tmp6_m); \
2621  out7 = (RTYPE) __msa_pckod_d((v2i64) tmp3_m, (v2i64) tmp7_m); \
2622 }
2623 #define TRANSPOSE8x8_UH_UH(...) TRANSPOSE8x8_H(v8u16, __VA_ARGS__) /* outputs cast to v8u16 */
2624 #define TRANSPOSE8x8_SH_SH(...) TRANSPOSE8x8_H(v8i16, __VA_ARGS__) /* outputs cast to v8i16 */
2625 
2626 /* Description : Transposes 4x4 block with word elements in vectors
2627  Arguments : Inputs - in0, in1, in2, in3
2628  Outputs - out0, out1, out2, out3
2629  Return Type - signed word
2630  Details : (in0,in1) and (in2,in3) are word-interleaved into s0_m..s3_m; each output then joins one doubleword from the first pair's result with the matching doubleword from the second pair's result. No output is written until all inputs have been read.
2631 */
2632 #define TRANSPOSE4x4_SW_SW(in0, in1, in2, in3, out0, out1, out2, out3) \
2633 { \
2634  v4i32 s0_m, s1_m, s2_m, s3_m; \
2635  \
2636  ILVRL_W2_SW(in1, in0, s0_m, s1_m); \
2637  ILVRL_W2_SW(in3, in2, s2_m, s3_m); \
2638  /* out0/out1: right and left doublewords of (s0_m, s2_m); out2/out3: the same for (s1_m, s3_m). */ \
2639  out0 = (v4i32) __msa_ilvr_d((v2i64) s2_m, (v2i64) s0_m); \
2640  out1 = (v4i32) __msa_ilvl_d((v2i64) s2_m, (v2i64) s0_m); \
2641  out2 = (v4i32) __msa_ilvr_d((v2i64) s3_m, (v2i64) s1_m); \
2642  out3 = (v4i32) __msa_ilvl_d((v2i64) s3_m, (v2i64) s1_m); \
2643 }
2644 
2645 /* Description : Average byte elements from pair of vectors and store 8x4 byte
2646  block in destination memory
2647  Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride
2648  Details : The inputs are consumed as four pairs: (in0, in1), (in2, in3),
2649  (in4, in5) and (in6, in7).
2650  Each pair is averaged per byte element, (a + b)/2, with
2651  __msa_ave_u_b; the four results are kept in 'tmp0_m',
2652  'tmp1_m', 'tmp2_m' and 'tmp3_m' respectively.
2653  Doubleword 0 of every result is then extracted into
2654  'out0_m' .. 'out3_m'.
2655  These four 64-bit values are handed to SD4 together with
2656  'pdst' and 'stride', i.e. they are stored in destination
2657  memory as an 8x4 byte block.
2658 */
2659 #define AVE_ST8x4_UB(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) \
2660 { \
2661  /* Per-byte averages of the four input pairs. */ \
2662  v16u8 tmp0_m = __msa_ave_u_b((v16u8) in0, (v16u8) in1); \
2663  v16u8 tmp1_m = __msa_ave_u_b((v16u8) in2, (v16u8) in3); \
2664  v16u8 tmp2_m = __msa_ave_u_b((v16u8) in4, (v16u8) in5); \
2665  v16u8 tmp3_m = __msa_ave_u_b((v16u8) in6, (v16u8) in7); \
2666  \
2667  /* Only doubleword 0 of each average is stored. */ \
2668  uint64_t out0_m = __msa_copy_u_d((v2i64) tmp0_m, 0); \
2669  uint64_t out1_m = __msa_copy_u_d((v2i64) tmp1_m, 0); \
2670  uint64_t out2_m = __msa_copy_u_d((v2i64) tmp2_m, 0); \
2671  uint64_t out3_m = __msa_copy_u_d((v2i64) tmp3_m, 0); \
2672  \
2673  SD4(out0_m, out1_m, out2_m, out3_m, pdst, stride); \
2674 }
2675 
2676 /* Description : Average byte elements from pair of vectors and store 16x4 byte
2677  block in destination memory
2678  Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride
2679  Details : The inputs are consumed as four pairs: (in0, in1), (in2, in3),
2680  (in4, in5) and (in6, in7).
2681  Each pair is averaged per byte element, (a + b)/2, with
2682  __msa_ave_u_b.
2683  The four results are kept in 'tmp0_m', 'tmp1_m', 'tmp2_m'
2684  and 'tmp3_m' respectively.
2685  Unlike AVE_ST8x4_UB no doubleword is extracted: the whole
2686  vectors are passed on.
2687  ST_UB4 receives the four results together with 'pdst' and
2688  'stride', i.e. they are stored as a 16x4 byte block.
2689 */
2690 #define AVE_ST16x4_UB(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) \
2691 { \
2692  /* Per-byte averages of the four input pairs. */ \
2693  v16u8 tmp0_m = __msa_ave_u_b((v16u8) in0, (v16u8) in1); \
2694  v16u8 tmp1_m = __msa_ave_u_b((v16u8) in2, (v16u8) in3); \
2695  v16u8 tmp2_m = __msa_ave_u_b((v16u8) in4, (v16u8) in5); \
2696  v16u8 tmp3_m = __msa_ave_u_b((v16u8) in6, (v16u8) in7); \
2697  \
2698  /* Whole vectors are stored, one per row. */ \
2699  ST_UB4(tmp0_m, tmp1_m, tmp2_m, tmp3_m, pdst, stride); \
2700 }
2701 
2702 /* Description : Average rounded byte elements from pair of vectors and store
2703  8x4 byte block in destination memory
2704  Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride
2705  Details : Each byte element from input vector pair 'in0' and 'in1' are
2706  average rounded (a + b + 1)/2 and stored in 'tp0_m'
2707  Each byte element from input vector pair 'in2' and 'in3' are
2708  average rounded (a + b + 1)/2 and stored in 'tp1_m'
2709  Each byte element from input vector pair 'in4' and 'in5' are
2710  average rounded (a + b + 1)/2 and stored in 'tp2_m'
2711  Each byte element from input vector pair 'in6' and 'in7' are
2712  average rounded (a + b + 1)/2 and stored in 'tp3_m'
2713  Doubleword 0 of each of the 4 results is stored in
2714  destination memory (via SD4) as 8x4 byte block
2715 */
2716 #define AVER_ST8x4_UB(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) \
2717 { \
2718  uint64_t out0_m, out1_m, out2_m, out3_m; \
2719  v16u8 tp0_m, tp1_m, tp2_m, tp3_m; \
2720  /* NOTE: AVER_DST_ST8x4_UB passes its own tmp*_m and dst*_m variables into this macro, so the locals here presumably use the distinct tp*_m names to avoid shadowing them. */ \
2721  AVER_UB4_UB(in0, in1, in2, in3, in4, in5, in6, in7, \
2722  tp0_m, tp1_m, tp2_m, tp3_m); \
2723  /* Only doubleword 0 of each averaged vector is stored. */ \
2724  out0_m = __msa_copy_u_d((v2i64) tp0_m, 0); \
2725  out1_m = __msa_copy_u_d((v2i64) tp1_m, 0); \
2726  out2_m = __msa_copy_u_d((v2i64) tp2_m, 0); \
2727  out3_m = __msa_copy_u_d((v2i64) tp3_m, 0); \
2728  SD4(out0_m, out1_m, out2_m, out3_m, pdst, stride); \
2729 }
2730 
2731 /* Description : Average rounded byte elements from pair of vectors and store
2732  16x4 byte block in destination memory
2733  Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride
2734  Details : Each byte element from input vector pair 'in0' and 'in1' are
2735  average rounded (a + b + 1)/2 and stored in 't0_m'
2736  Each byte element from input vector pair 'in2' and 'in3' are
2737  average rounded (a + b + 1)/2 and stored in 't1_m'
2738  Each byte element from input vector pair 'in4' and 'in5' are
2739  average rounded (a + b + 1)/2 and stored in 't2_m'
2740  Each byte element from input vector pair 'in6' and 'in7' are
2741  average rounded (a + b + 1)/2 and stored in 't3_m'
2742  The vector results from all 4 vectors are stored in
2743  destination memory (via ST_UB4) as 16x4 byte block
2744 */
2745 #define AVER_ST16x4_UB(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) \
2746 { \
2747  v16u8 t0_m, t1_m, t2_m, t3_m; \
2748  /* NOTE: AVER_DST_ST16x4_UB passes its own tmp*_m and dst*_m variables into this macro, so the locals here presumably use the distinct t*_m names to avoid shadowing them. */ \
2749  AVER_UB4_UB(in0, in1, in2, in3, in4, in5, in6, in7, \
2750  t0_m, t1_m, t2_m, t3_m); \
2751  ST_UB4(t0_m, t1_m, t2_m, t3_m, pdst, stride); \
2752 }
2753 
2754 /* Description : Average rounded byte elements from pair of vectors,
2755  average rounded with destination and store 8x4 byte block
2756  in destination memory
2757  Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride
2758  Details : Four vectors 'dst0_m' .. 'dst3_m' are first loaded from 'pdst'
2759  with LD_UB4, using 'stride' between them
2760  Each byte element from input vector pairs ('in0', 'in1'),
2761  ('in2', 'in3'), ('in4', 'in5') and ('in6', 'in7') are
2762  average rounded (a + b + 1)/2 and stored in 'tmp0_m',
2763  'tmp1_m', 'tmp2_m' and 'tmp3_m' respectively
2764  AVER_ST8x4_UB then average rounds each 'dstN_m' with the
2765  matching 'tmpN_m'
2766  The half vector results from all 4 vectors are stored in
2767  destination memory as 8x4 byte block
2768 */
2769 #define AVER_DST_ST8x4_UB(in0, in1, in2, in3, in4, in5, in6, in7, \
2770  pdst, stride) \
2771 { \
2772  v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
2773  v16u8 dst0_m, dst1_m, dst2_m, dst3_m; \
2774  /* The destination is read before anything is written back to it. */ \
2775  LD_UB4(pdst, stride, dst0_m, dst1_m, dst2_m, dst3_m); \
2776  AVER_UB4_UB(in0, in1, in2, in3, in4, in5, in6, in7, \
2777  tmp0_m, tmp1_m, tmp2_m, tmp3_m); \
2778  AVER_ST8x4_UB(dst0_m, tmp0_m, dst1_m, tmp1_m, \
2779  dst2_m, tmp2_m, dst3_m, tmp3_m, pdst, stride); \
2780 }
2781 
2782 /* Description : Average rounded byte elements from pair of vectors,
2783  average rounded with destination and store 16x4 byte block
2784  in destination memory
2785  Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride
2786  Details : Four vectors 'dst0_m' .. 'dst3_m' are first loaded from 'pdst'
2787  with LD_UB4, using 'stride' between them
2788  Each byte element from input vector pairs ('in0', 'in1'),
2789  ('in2', 'in3'), ('in4', 'in5') and ('in6', 'in7') are
2790  average rounded (a + b + 1)/2 and stored in 'tmp0_m',
2791  'tmp1_m', 'tmp2_m' and 'tmp3_m' respectively
2792  AVER_ST16x4_UB then average rounds each 'dstN_m' with the
2793  matching 'tmpN_m'
2794  The vector results from all 4 vectors are stored in
2795  destination memory as 16x4 byte block
2796 */
2797 #define AVER_DST_ST16x4_UB(in0, in1, in2, in3, in4, in5, in6, in7, \
2798  pdst, stride) \
2799 { \
2800  v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
2801  v16u8 dst0_m, dst1_m, dst2_m, dst3_m; \
2802  /* The destination is read before anything is written back to it. */ \
2803  LD_UB4(pdst, stride, dst0_m, dst1_m, dst2_m, dst3_m); \
2804  AVER_UB4_UB(in0, in1, in2, in3, in4, in5, in6, in7, \
2805  tmp0_m, tmp1_m, tmp2_m, tmp3_m); \
2806  AVER_ST16x4_UB(dst0_m, tmp0_m, dst1_m, tmp1_m, \
2807  dst2_m, tmp2_m, dst3_m, tmp3_m, pdst, stride); \
2808 }
2809 
2810 /* Description : Add block 4x4
2811  Arguments : Inputs - in0, in1, in2, in3, pdst, stride
2812  Details : The right doublewords of (in0, in1) and of (in2, in3) are joined into two halfword vectors. Four 32-bit words are loaded from 'pdst' with LW4, widened from bytes to halfwords by interleaving with zero, added to those two vectors,
2813  clipped between 0-255, packed back to bytes and stored as four 32-bit words with SW4.
2814 */
2815 #define ADDBLK_ST4x4_UB(in0, in1, in2, in3, pdst, stride) \
2816 { \
2817  uint32_t src0_m, src1_m, src2_m, src3_m; \
2818  uint32_t out0_m, out1_m, out2_m, out3_m; \
2819  v8i16 inp0_m, inp1_m, res0_m, res1_m; \
2820  v16i8 dst0_m = { 0 }; \
2821  v16i8 dst1_m = { 0 }; \
2822  v16i8 zero_m = { 0 }; \
2823  /* Every helper invocation is terminated with ';' so the body does not depend on how a helper expands. */ \
2824  ILVR_D2_SH(in1, in0, in3, in2, inp0_m, inp1_m); \
2825  LW4(pdst, stride, src0_m, src1_m, src2_m, src3_m); \
2826  INSERT_W2_SB(src0_m, src1_m, dst0_m); \
2827  INSERT_W2_SB(src2_m, src3_m, dst1_m); \
2828  ILVR_B2_SH(zero_m, dst0_m, zero_m, dst1_m, res0_m, res1_m); \
2829  ADD2(res0_m, inp0_m, res1_m, inp1_m, res0_m, res1_m); \
2830  CLIP_SH2_0_255(res0_m, res1_m); \
2831  PCKEV_B2_SB(res0_m, res0_m, res1_m, res1_m, dst0_m, dst1_m); \
2832  /* Words 0 and 1 of each packed vector are the four output rows. */ \
2833  out0_m = __msa_copy_u_w((v4i32) dst0_m, 0); \
2834  out1_m = __msa_copy_u_w((v4i32) dst0_m, 1); \
2835  out2_m = __msa_copy_u_w((v4i32) dst1_m, 0); \
2836  out3_m = __msa_copy_u_w((v4i32) dst1_m, 1); \
2837  SW4(out0_m, out1_m, out2_m, out3_m, pdst, stride); \
2838 }
2839 
2840 /* Description : Dot product and addition of 3 input vector pairs
2841  Arguments : Inputs - in0, in1, in2, coeff0, coeff1, coeff2 (all cast to v16i8)
2842  Outputs - out0_m
2843  Return Type - signed halfword
2844  Details : Dot product of 'in0' with 'coeff0' (__msa_dotp_s_h)
2845  Dot product of 'in1' with 'coeff1', accumulated (__msa_dpadd_s_h)
2846  Dot product of 'in2' with 'coeff2' (__msa_dotp_s_h)
2847  The third product is added to the sum with __msa_adds_s_h
2848 
2849  out0_m = (in0 * coeff0) + (in1 * coeff1) + (in2 * coeff2)
2850 */
2851 #define DPADD_SH3_SH(in0, in1, in2, coeff0, coeff1, coeff2) \
2852 ( { \
2853  /* in0 . coeff0 seeds the accumulator. */ \
2854  v8i16 out0_m = __msa_dotp_s_h((v16i8) in0, (v16i8) coeff0); \
2855  \
2856  /* in1 . coeff1 is folded in by the dot-product-add intrinsic. */ \
2857  out0_m = __msa_dpadd_s_h(out0_m, (v16i8) in1, (v16i8) coeff1); \
2858  /* in2 . coeff2 is computed on its own and added with adds_s_h. */ \
2859  out0_m = __msa_adds_s_h(out0_m, __msa_dotp_s_h((v16i8) in2, (v16i8) coeff2)); \
2860  \
2861  out0_m; \
2862 } )
2863 
2864 /* Description : Pack even elements of input vectors & xor with 128
2865  Arguments : Inputs - in0, in1
2866  Outputs - value of the macro expression
2867  Return Type - unsigned byte
2868  Details : Even byte elements of 'in0' and 'in1' (both cast to signed
2869  byte vectors) are packed together in one vector, and every
2870  byte of that vector is then xor'ed with 128
2871 */
2872 #define PCKEV_XORI128_UB(in0, in1) \
2873 ( \
2874  /* pack first, then xor each byte with 128; no named temporary is needed */ \
2875  (v16u8) __msa_xori_b( \
2876  (v16u8) __msa_pckev_b((v16i8) in1, (v16i8) in0), \
2877  128) \
2878 )
2879 
2880 /* Description : Converts inputs to unsigned bytes, interleave, average & store
2881  as 8x4 unsigned byte block
2882  Arguments : Inputs - in0, in1, in2, in3, dst0, dst1, pdst, stride
2883  Details : (in0, in1) and (in2, in3) are each converted with PCKEV_XORI128_UB, averaged against 'dst0' and 'dst1' with AVER_UB2_UB, and the two results are stored through ST8x4_UB. */
2884 #define CONVERT_UB_AVG_ST8x4_UB(in0, in1, in2, in3, \
2885  dst0, dst1, pdst, stride) \
2886 { \
2887  v16u8 tmp0_m, tmp1_m; \
2888  uint8_t *pdst_m = (uint8_t *) (pdst); /* 'pdst' is evaluated once, here */ \
2889  \
2890  tmp0_m = PCKEV_XORI128_UB(in0, in1); \
2891  tmp1_m = PCKEV_XORI128_UB(in2, in3); \
2892  AVER_UB2_UB(tmp0_m, dst0, tmp1_m, dst1, tmp0_m, tmp1_m); /* tmp0_m/tmp1_m are both inputs and outputs */ \
2893  ST8x4_UB(tmp0_m, tmp1_m, pdst_m, stride); \
2894 }
2895 
2896 /* Description : Pack even byte elements, extract 0 & 2 index words from pair
2897  of results and store 4 words in destination memory as per
2898  stride
2899  Arguments : Inputs - in0, in1, in2, in3, pdst, stride
2900  Details : (in0, in1) are packed into 'tmp0_m' and (in2, in3) into 'tmp1_m' by PCKEV_B2_SB; the four extracted words are stored through SW4. */
2901 #define PCKEV_ST4x4_UB(in0, in1, in2, in3, pdst, stride) \
2902 { \
2903  uint32_t out0_m, out1_m, out2_m, out3_m; \
2904  v16i8 tmp0_m, tmp1_m; \
2905  \
2906  PCKEV_B2_SB(in1, in0, in3, in2, tmp0_m, tmp1_m); \
2907  /* Words 0 and 2 of each packed vector become the four stored rows. */ \
2908  out0_m = __msa_copy_u_w((v4i32) tmp0_m, 0); \
2909  out1_m = __msa_copy_u_w((v4i32) tmp0_m, 2); \
2910  out2_m = __msa_copy_u_w((v4i32) tmp1_m, 0); \
2911  out3_m = __msa_copy_u_w((v4i32) tmp1_m, 2); \
2912  \
2913  SW4(out0_m, out1_m, out2_m, out3_m, pdst, stride); \
2914 }
2915 
2916 /* Description : Pack even byte elements of 'in0' and 'in1' into one signed
2917  byte vector and store that vector at 'pdst'
2918  Arguments : Inputs - in0, in1, pdst
2919 */
2920 #define PCKEV_ST_SB(in0, in1, pdst) \
2921 { \
2922  /* The packed vector is stored directly; no named temporary is needed. */ \
2923  ST_SB(__msa_pckev_b((v16i8) in1, (v16i8) in0), \
2924  (pdst)); \
2925 }
2926 
2927 /* Description : Horizontal 2 tap filter kernel code
2928  Arguments : Inputs - in0, in1, mask, coeff, shift; the macro evaluates to a v8u16 vector
2929 */
2930 #define HORIZ_2TAP_FILT_UH(in0, in1, mask, coeff, shift) \
2931 ( { \
2932  v16i8 tmp0_m = __msa_vshf_b((v16i8) mask, (v16i8) in1, (v16i8) in0); \
2933  v8u16 tmp1_m = __msa_dotp_u_h((v16u8) tmp0_m, (v16u8) coeff); \
2934  \
2935  /* Rounding shift, then unsigned saturate; both use 'shift' as their immediate operand. */ \
2936  tmp1_m = (v8u16) __msa_srari_h((v8i16) tmp1_m, shift); \
2937  tmp1_m = __msa_sat_u_h(tmp1_m, shift); \
2938  \
2939  /* value of the statement expression */ \
2940  tmp1_m; \
2941 } )
2942 #endif /* AVUTIL_MIPS_GENERIC_MACROS_MSA_H */