FFmpeg  4.0
vp9_mc_msa.c
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2015 - 2017 Shivraj Patil (Shivraj.Patil@imgtec.com)
3  *
4  * This file is part of FFmpeg.
5  *
6  * FFmpeg is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * FFmpeg is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with FFmpeg; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19  */
20 
21 #include "libavcodec/vp9dsp.h"
23 #include "vp9dsp_mips.h"
24 
/* Byte-index patterns that the filters in this file load as shuffle masks.
 * Each 16-byte row lists overlapping index pairs (i, i + 1); callers build
 * three more masks from a loaded row by adding 2, 4 and 6 to it. */
static const uint8_t mc_filt_mask_arr[16 * 3] = {
    /* offset 0: 8 width cases */
    0, 1, 1, 2, 2, 3, 3, 4,
    4, 5, 5, 6, 6, 7, 7, 8,
    /* offset 16: 4 width cases, index pairs starting at 0 and at 16 */
    0, 1, 1, 2, 2, 3, 3, 4,
    16, 17, 17, 18, 18, 19, 19, 20,
    /* offset 32: 4 width cases, index pairs starting at 8 and at 24 */
    8, 9, 9, 10, 10, 11, 11, 12,
    24, 25, 25, 26, 26, 27, 27, 28
};
33 
/* Two-tap coefficient pairs. Entry k holds {120 - 8 * k, 8 + 8 * k}, so the
 * two taps of every entry add up to 128. */
static const int8_t vp9_bilinear_filters_msa[15][2] = {
    { 120,   8 }, { 112,  16 }, { 104,  24 },
    {  96,  32 }, {  88,  40 }, {  80,  48 },
    {  72,  56 }, {  64,  64 }, {  56,  72 },
    {  48,  80 }, {  40,  88 }, {  32,  96 },
    {  24, 104 }, {  16, 112 }, {   8, 120 }
};
51 
/* Description : 8-tap dot-product accumulation
 * Arguments   : Inputs  - vec0, vec1, vec2, vec3     (data vectors)
 *                         filt0, filt1, filt2, filt3 (coefficient vectors)
 *               Return  - v8i16 vector holding the combined sums
 * Details     : vec0 * filt0 + vec1 * filt1 and vec2 * filt2 + vec3 * filt3
 *               are accumulated in two separate halfword vectors using
 *               __msa_dotp_s_h / __msa_dpadd_s_h, then merged with
 *               __msa_adds_s_h.
 *               The temporaries use the "_m" suffix, like the other macros in
 *               this file, so a caller argument named tmp0 or tmp1 is not
 *               captured. Every argument is parenthesized before the cast so
 *               that expression arguments expand as intended.
 */
#define FILT_8TAP_DPADD_S_H(vec0, vec1, vec2, vec3,                          \
                            filt0, filt1, filt2, filt3)                      \
( {                                                                          \
    v8i16 tmp0_m, tmp1_m;                                                    \
                                                                             \
    tmp0_m = __msa_dotp_s_h((v16i8) (vec0), (v16i8) (filt0));                \
    tmp0_m = __msa_dpadd_s_h(tmp0_m, (v16i8) (vec1), (v16i8) (filt1));       \
    tmp1_m = __msa_dotp_s_h((v16i8) (vec2), (v16i8) (filt2));                \
    tmp1_m = __msa_dpadd_s_h(tmp1_m, (v16i8) (vec3), (v16i8) (filt3));       \
    tmp0_m = __msa_adds_s_h(tmp0_m, tmp1_m);                                 \
                                                                             \
    tmp0_m;                                                                  \
} )
65 
/* Description : Horizontal 8-tap filter of one pair of source vectors
 * Arguments   : Inputs  - src0, src1                       (source vectors)
 *                         mask0, mask1, mask2, mask3       (shuffle masks)
 *                         filt_h0, filt_h1, filt_h2, filt_h3 (coefficients)
 *               Return  - v8i16 result vector
 * Details     : The sources are shuffled with each of the four masks, the
 *               shuffled vectors go through FILT_8TAP_DPADD_S_H, and the sum
 *               is passed to __msa_srari_h(.., 7) and then __msa_sat_s_h(.., 7).
 */
#define HORIZ_8TAP_FILT(src0, src1, mask0, mask1, mask2, mask3,              \
                        filt_h0, filt_h1, filt_h2, filt_h3)                  \
( {                                                                          \
    v8i16 hz_out_m;                                                          \
    v16i8 vec0_m, vec1_m, vec2_m, vec3_m;                                    \
                                                                             \
    VSHF_B4_SB(src0, src1, mask0, mask1, mask2, mask3,                       \
               vec0_m, vec1_m, vec2_m, vec3_m);                              \
    hz_out_m = FILT_8TAP_DPADD_S_H(vec0_m, vec1_m, vec2_m, vec3_m,           \
                                   filt_h0, filt_h1, filt_h2, filt_h3);      \
                                                                             \
    hz_out_m = __msa_sat_s_h(__msa_srari_h(hz_out_m, 7), 7);                 \
                                                                             \
    hz_out_m;                                                                \
} )
82 
/* Description : Horizontal 8-tap sums for four source vectors, 4-width layout
 * Arguments   : Inputs  - src0, src1, src2, src3       (source vectors)
 *                         mask0, mask1, mask2, mask3   (shuffle masks)
 *                         filt0, filt1, filt2, filt3   (coefficients)
 *               Outputs - out0, out1                   (halfword sums)
 * Details     : (src0, src1) and (src2, src3) are shuffled pairwise with each
 *               mask. The mask0/mask1 products accumulate in one pair of
 *               registers and the mask2/mask3 products in another; the two
 *               pairs are combined with ADDS_SH2_SH. No rounding is done here.
 */
#define HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3,                   \
                                   mask0, mask1, mask2, mask3,               \
                                   filt0, filt1, filt2, filt3,               \
                                   out0, out1)                               \
{                                                                            \
    v8i16 res0_m, res1_m, res2_m, res3_m;                                    \
    v16i8 vec0_m, vec1_m, vec2_m, vec3_m;                                    \
    v16i8 vec4_m, vec5_m, vec6_m, vec7_m;                                    \
                                                                             \
    /* one shuffle pair per mask */                                          \
    VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0_m, vec1_m);        \
    VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2_m, vec3_m);        \
    VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4_m, vec5_m);        \
    VSHF_B2_SB(src0, src1, src2, src3, mask3, mask3, vec6_m, vec7_m);        \
                                                                             \
    /* taps for filt0/filt1 */                                               \
    DOTP_SB2_SH(vec0_m, vec1_m, filt0, filt0, res0_m, res1_m);               \
    DPADD_SB2_SH(vec2_m, vec3_m, filt1, filt1, res0_m, res1_m);              \
    /* taps for filt2/filt3 */                                               \
    DOTP_SB2_SH(vec4_m, vec5_m, filt2, filt2, res2_m, res3_m);               \
    DPADD_SB2_SH(vec6_m, vec7_m, filt3, filt3, res2_m, res3_m);              \
                                                                             \
    ADDS_SH2_SH(res0_m, res2_m, res1_m, res3_m, out0, out1);                 \
}
101 
/* Description : Horizontal 8-tap sums for four source vectors, 8-width layout
 * Arguments   : Inputs  - src0, src1, src2, src3       (source vectors)
 *                         mask0, mask1, mask2, mask3   (shuffle masks)
 *                         filt0, filt1, filt2, filt3   (coefficients)
 *               Outputs - out0, out1, out2, out3       (halfword sums)
 * Details     : Each source is shuffled with itself under every mask. The
 *               mask0/mask1 products accumulate in res0..res3 and the
 *               mask2/mask3 products in res4..res7; ADDS_SH4_SH combines the
 *               two groups. No rounding is done here.
 */
#define HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3,                   \
                                   mask0, mask1, mask2, mask3,               \
                                   filt0, filt1, filt2, filt3,               \
                                   out0, out1, out2, out3)                   \
{                                                                            \
    v8i16 res0_m, res1_m, res2_m, res3_m;                                    \
    v8i16 res4_m, res5_m, res6_m, res7_m;                                    \
    v16i8 vec0_m, vec1_m, vec2_m, vec3_m;                                    \
    v16i8 vec4_m, vec5_m, vec6_m, vec7_m;                                    \
                                                                             \
    /* filt0 then filt1 into res0..res3 */                                   \
    VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m);        \
    VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m);        \
    DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m,                              \
                filt0, filt0, filt0, filt0,                                  \
                res0_m, res1_m, res2_m, res3_m);                             \
    VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4_m, vec5_m);        \
    VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6_m, vec7_m);        \
    DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m,                             \
                 filt1, filt1, filt1, filt1,                                 \
                 res0_m, res1_m, res2_m, res3_m);                            \
                                                                             \
    /* filt2 then filt3 into res4..res7 */                                   \
    VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0_m, vec1_m);        \
    VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2_m, vec3_m);        \
    DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m,                              \
                filt2, filt2, filt2, filt2,                                  \
                res4_m, res5_m, res6_m, res7_m);                             \
    VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec4_m, vec5_m);        \
    VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec6_m, vec7_m);        \
    DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m,                             \
                 filt3, filt3, filt3, filt3,                                 \
                 res4_m, res5_m, res6_m, res7_m);                            \
                                                                             \
    ADDS_SH4_SH(res0_m, res4_m, res1_m, res5_m, res2_m, res6_m, res3_m,      \
                res7_m, out0, out1, out2, out3);                             \
}
129 
/* Description : Pack two halfword vectors with PCKEV_XORI128_UB, average the
 *               packed bytes with dst and store one vector at pdst
 * Arguments   : Inputs  - in0, in1 (halfword vectors), dst (vector to average
 *                         with), pdst (store address)
 * Details     : in1 is passed as the first and in0 as the second argument of
 *               PCKEV_XORI128_UB; the averaging uses __msa_aver_u_b.
 */
#define PCKEV_XORI128_AVG_ST_UB(in0, in1, dst, pdst)                         \
{                                                                            \
    v16u8 tmp_m = PCKEV_XORI128_UB(in1, in0);                                \
                                                                             \
    tmp_m = __msa_aver_u_b(tmp_m, (v16u8) dst);                              \
    ST_UB(tmp_m, (pdst));                                                    \
}
138 
/* Description : Pack the even bytes of two vectors, average the packed bytes
 *               with dst and store one vector at pdst
 * Arguments   : Inputs  - in0, in1 (vectors to pack), dst (vector to average
 *                         with), pdst (store address)
 * Details     : Packing uses __msa_pckev_b(in0, in1) and the averaging uses
 *               __msa_aver_u_b.
 */
#define PCKEV_AVG_ST_UB(in0, in1, dst, pdst)                                 \
{                                                                            \
    v16u8 tmp_m = (v16u8) __msa_pckev_b((v16i8) in0, (v16i8) in1);           \
                                                                             \
    tmp_m = __msa_aver_u_b(tmp_m, (v16u8) dst);                              \
    ST_UB(tmp_m, (pdst));                                                    \
}
147 
/* Description : Pack four vectors into two, average them with dst0/dst1 and
 *               store the result through ST8x4_UB
 * Arguments   : Inputs  - in0, in1, in2, in3 (vectors to pack),
 *                         dst0, dst1 (vectors to average with),
 *                         pdst (store address), stride (row stride)
 * Details     : pdst is evaluated once into a local uint8_t pointer before it
 *               is handed to ST8x4_UB.
 */
#define PCKEV_AVG_ST8x4_UB(in0, in1, in2, in3, dst0, dst1,                   \
                           pdst, stride)                                     \
{                                                                            \
    uint8_t *pdst_m = (uint8_t *) (pdst);                                    \
    v16u8 tmp0_m, tmp1_m;                                                    \
                                                                             \
    PCKEV_B2_UB(in1, in0, in3, in2, tmp0_m, tmp1_m);                         \
    AVER_UB2_UB(tmp0_m, dst0, tmp1_m, dst1, tmp0_m, tmp1_m);                 \
    ST8x4_UB(tmp0_m, tmp1_m, pdst_m, stride);                                \
}
158 
159 static void common_hz_8t_4x4_msa(const uint8_t *src, int32_t src_stride,
160  uint8_t *dst, int32_t dst_stride,
161  const int8_t *filter)
162 {
163  v16u8 mask0, mask1, mask2, mask3, out;
164  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
165  v8i16 filt, out0, out1;
166 
167  mask0 = LD_UB(&mc_filt_mask_arr[16]);
168  src -= 3;
169 
170  /* rearranging filter */
171  filt = LD_SH(filter);
172  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
173 
174  mask1 = mask0 + 2;
175  mask2 = mask0 + 4;
176  mask3 = mask0 + 6;
177 
178  LD_SB4(src, src_stride, src0, src1, src2, src3);
179  XORI_B4_128_SB(src0, src1, src2, src3);
180  HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
181  mask3, filt0, filt1, filt2, filt3, out0, out1);
182  SRARI_H2_SH(out0, out1, 7);
183  SAT_SH2_SH(out0, out1, 7);
184  out = PCKEV_XORI128_UB(out0, out1);
185  ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
186 }
187 
188 static void common_hz_8t_4x8_msa(const uint8_t *src, int32_t src_stride,
189  uint8_t *dst, int32_t dst_stride,
190  const int8_t *filter)
191 {
192  v16i8 filt0, filt1, filt2, filt3;
193  v16i8 src0, src1, src2, src3;
194  v16u8 mask0, mask1, mask2, mask3, out;
195  v8i16 filt, out0, out1, out2, out3;
196 
197  mask0 = LD_UB(&mc_filt_mask_arr[16]);
198  src -= 3;
199 
200  /* rearranging filter */
201  filt = LD_SH(filter);
202  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
203 
204  mask1 = mask0 + 2;
205  mask2 = mask0 + 4;
206  mask3 = mask0 + 6;
207 
208  LD_SB4(src, src_stride, src0, src1, src2, src3);
209  XORI_B4_128_SB(src0, src1, src2, src3);
210  src += (4 * src_stride);
211  HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
212  mask3, filt0, filt1, filt2, filt3, out0, out1);
213  LD_SB4(src, src_stride, src0, src1, src2, src3);
214  XORI_B4_128_SB(src0, src1, src2, src3);
215  HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
216  mask3, filt0, filt1, filt2, filt3, out2, out3);
217  SRARI_H4_SH(out0, out1, out2, out3, 7);
218  SAT_SH4_SH(out0, out1, out2, out3, 7);
219  out = PCKEV_XORI128_UB(out0, out1);
220  ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
221  dst += (4 * dst_stride);
222  out = PCKEV_XORI128_UB(out2, out3);
223  ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
224 }
225 
/* Horizontal 8-tap filter for 4-pixel-wide blocks.
 *
 * Dispatches on height: 4 and 8 have dedicated kernels; any other height
 * leaves dst untouched. */
static void common_hz_8t_4w_msa(const uint8_t *src, int32_t src_stride,
                                uint8_t *dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height)
{
    switch (height) {
    case 4:
        common_hz_8t_4x4_msa(src, src_stride, dst, dst_stride, filter);
        break;
    case 8:
        common_hz_8t_4x8_msa(src, src_stride, dst, dst_stride, filter);
        break;
    default:
        break;
    }
}
236 
237 static void common_hz_8t_8x4_msa(const uint8_t *src, int32_t src_stride,
238  uint8_t *dst, int32_t dst_stride,
239  const int8_t *filter)
240 {
241  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
242  v16u8 mask0, mask1, mask2, mask3, tmp0, tmp1;
243  v8i16 filt, out0, out1, out2, out3;
244 
245  mask0 = LD_UB(&mc_filt_mask_arr[0]);
246  src -= 3;
247 
248  /* rearranging filter */
249  filt = LD_SH(filter);
250  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
251 
252  mask1 = mask0 + 2;
253  mask2 = mask0 + 4;
254  mask3 = mask0 + 6;
255 
256  LD_SB4(src, src_stride, src0, src1, src2, src3);
257  XORI_B4_128_SB(src0, src1, src2, src3);
258  HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
259  mask3, filt0, filt1, filt2, filt3, out0, out1,
260  out2, out3);
261  SRARI_H4_SH(out0, out1, out2, out3, 7);
262  SAT_SH4_SH(out0, out1, out2, out3, 7);
263  tmp0 = PCKEV_XORI128_UB(out0, out1);
264  tmp1 = PCKEV_XORI128_UB(out2, out3);
265  ST8x4_UB(tmp0, tmp1, dst, dst_stride);
266 }
267 
268 static void common_hz_8t_8x8mult_msa(const uint8_t *src, int32_t src_stride,
269  uint8_t *dst, int32_t dst_stride,
270  const int8_t *filter, int32_t height)
271 {
272  uint32_t loop_cnt;
273  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
274  v16u8 mask0, mask1, mask2, mask3, tmp0, tmp1;
275  v8i16 filt, out0, out1, out2, out3;
276 
277  mask0 = LD_UB(&mc_filt_mask_arr[0]);
278  src -= 3;
279 
280  /* rearranging filter */
281  filt = LD_SH(filter);
282  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
283 
284  mask1 = mask0 + 2;
285  mask2 = mask0 + 4;
286  mask3 = mask0 + 6;
287 
288  for (loop_cnt = (height >> 2); loop_cnt--;) {
289  LD_SB4(src, src_stride, src0, src1, src2, src3);
290  XORI_B4_128_SB(src0, src1, src2, src3);
291  src += (4 * src_stride);
292  HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
293  mask3, filt0, filt1, filt2, filt3, out0,
294  out1, out2, out3);
295  SRARI_H4_SH(out0, out1, out2, out3, 7);
296  SAT_SH4_SH(out0, out1, out2, out3, 7);
297  tmp0 = PCKEV_XORI128_UB(out0, out1);
298  tmp1 = PCKEV_XORI128_UB(out2, out3);
299  ST8x4_UB(tmp0, tmp1, dst, dst_stride);
300  dst += (4 * dst_stride);
301  }
302 }
303 
/* Horizontal 8-tap filter for 8-pixel-wide blocks.
 *
 * A height of exactly 4 uses the single-tile kernel; every other height goes
 * to the kernel that iterates four rows at a time. */
static void common_hz_8t_8w_msa(const uint8_t *src, int32_t src_stride,
                                uint8_t *dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height)
{
    if (height != 4) {
        common_hz_8t_8x8mult_msa(src, src_stride, dst, dst_stride, filter,
                                 height);
        return;
    }

    common_hz_8t_8x4_msa(src, src_stride, dst, dst_stride, filter);
}
315 
316 static void common_hz_8t_16w_msa(const uint8_t *src, int32_t src_stride,
317  uint8_t *dst, int32_t dst_stride,
318  const int8_t *filter, int32_t height)
319 {
320  uint32_t loop_cnt;
321  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
322  v16u8 mask0, mask1, mask2, mask3, out;
323  v8i16 filt, out0, out1, out2, out3;
324 
325  mask0 = LD_UB(&mc_filt_mask_arr[0]);
326  src -= 3;
327 
328  /* rearranging filter */
329  filt = LD_SH(filter);
330  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
331 
332  mask1 = mask0 + 2;
333  mask2 = mask0 + 4;
334  mask3 = mask0 + 6;
335 
336  for (loop_cnt = (height >> 1); loop_cnt--;) {
337  LD_SB2(src, src_stride, src0, src2);
338  LD_SB2(src + 8, src_stride, src1, src3);
339  XORI_B4_128_SB(src0, src1, src2, src3);
340  src += (2 * src_stride);
341  HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
342  mask3, filt0, filt1, filt2, filt3, out0,
343  out1, out2, out3);
344  SRARI_H4_SH(out0, out1, out2, out3, 7);
345  SAT_SH4_SH(out0, out1, out2, out3, 7);
346  out = PCKEV_XORI128_UB(out0, out1);
347  ST_UB(out, dst);
348  dst += dst_stride;
349  out = PCKEV_XORI128_UB(out2, out3);
350  ST_UB(out, dst);
351  dst += dst_stride;
352  }
353 }
354 
355 static void common_hz_8t_32w_msa(const uint8_t *src, int32_t src_stride,
356  uint8_t *dst, int32_t dst_stride,
357  const int8_t *filter, int32_t height)
358 {
359  uint32_t loop_cnt;
360  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
361  v16u8 mask0, mask1, mask2, mask3, out;
362  v8i16 filt, out0, out1, out2, out3;
363 
364  mask0 = LD_UB(&mc_filt_mask_arr[0]);
365  src -= 3;
366 
367  /* rearranging filter */
368  filt = LD_SH(filter);
369  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
370 
371  mask1 = mask0 + 2;
372  mask2 = mask0 + 4;
373  mask3 = mask0 + 6;
374 
375  for (loop_cnt = (height >> 1); loop_cnt--;) {
376  src0 = LD_SB(src);
377  src2 = LD_SB(src + 16);
378  src3 = LD_SB(src + 24);
379  src1 = __msa_sldi_b(src2, src0, 8);
380  src += src_stride;
381  XORI_B4_128_SB(src0, src1, src2, src3);
382  HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
383  mask3, filt0, filt1, filt2, filt3, out0,
384  out1, out2, out3);
385  SRARI_H4_SH(out0, out1, out2, out3, 7);
386  SAT_SH4_SH(out0, out1, out2, out3, 7);
387 
388  src0 = LD_SB(src);
389  src2 = LD_SB(src + 16);
390  src3 = LD_SB(src + 24);
391  src1 = __msa_sldi_b(src2, src0, 8);
392  src += src_stride;
393 
394  out = PCKEV_XORI128_UB(out0, out1);
395  ST_UB(out, dst);
396  out = PCKEV_XORI128_UB(out2, out3);
397  ST_UB(out, dst + 16);
398  dst += dst_stride;
399 
400  XORI_B4_128_SB(src0, src1, src2, src3);
401  HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
402  mask3, filt0, filt1, filt2, filt3, out0,
403  out1, out2, out3);
404  SRARI_H4_SH(out0, out1, out2, out3, 7);
405  SAT_SH4_SH(out0, out1, out2, out3, 7);
406  out = PCKEV_XORI128_UB(out0, out1);
407  ST_UB(out, dst);
408  out = PCKEV_XORI128_UB(out2, out3);
409  ST_UB(out, dst + 16);
410  dst += dst_stride;
411  }
412 }
413 
414 static void common_hz_8t_64w_msa(const uint8_t *src, int32_t src_stride,
415  uint8_t *dst, int32_t dst_stride,
416  const int8_t *filter, int32_t height)
417 {
418  int32_t loop_cnt;
419  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
420  v16u8 mask0, mask1, mask2, mask3, out;
421  v8i16 filt, out0, out1, out2, out3;
422 
423  mask0 = LD_UB(&mc_filt_mask_arr[0]);
424  src -= 3;
425 
426  /* rearranging filter */
427  filt = LD_SH(filter);
428  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
429 
430  mask1 = mask0 + 2;
431  mask2 = mask0 + 4;
432  mask3 = mask0 + 6;
433 
434  for (loop_cnt = height; loop_cnt--;) {
435  src0 = LD_SB(src);
436  src2 = LD_SB(src + 16);
437  src3 = LD_SB(src + 24);
438  src1 = __msa_sldi_b(src2, src0, 8);
439 
440  XORI_B4_128_SB(src0, src1, src2, src3);
441  HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
442  mask2, mask3, filt0, filt1, filt2, filt3,
443  out0, out1, out2, out3);
444  SRARI_H4_SH(out0, out1, out2, out3, 7);
445  SAT_SH4_SH(out0, out1, out2, out3, 7);
446  out = PCKEV_XORI128_UB(out0, out1);
447  ST_UB(out, dst);
448  out = PCKEV_XORI128_UB(out2, out3);
449  ST_UB(out, dst + 16);
450 
451  src0 = LD_SB(src + 32);
452  src2 = LD_SB(src + 48);
453  src3 = LD_SB(src + 56);
454  src1 = __msa_sldi_b(src2, src0, 8);
455  src += src_stride;
456 
457  XORI_B4_128_SB(src0, src1, src2, src3);
458  HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
459  mask2, mask3, filt0, filt1, filt2, filt3,
460  out0, out1, out2, out3);
461  SRARI_H4_SH(out0, out1, out2, out3, 7);
462  SAT_SH4_SH(out0, out1, out2, out3, 7);
463  out = PCKEV_XORI128_UB(out0, out1);
464  ST_UB(out, dst + 32);
465  out = PCKEV_XORI128_UB(out2, out3);
466  ST_UB(out, dst + 48);
467  dst += dst_stride;
468  }
469 }
470 
471 static void common_vt_8t_4w_msa(const uint8_t *src, int32_t src_stride,
472  uint8_t *dst, int32_t dst_stride,
473  const int8_t *filter, int32_t height)
474 {
475  uint32_t loop_cnt;
476  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
477  v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
478  v16i8 src65_r, src87_r, src109_r, src2110, src4332, src6554, src8776;
479  v16i8 src10998, filt0, filt1, filt2, filt3;
480  v16u8 out;
481  v8i16 filt, out10, out32;
482 
483  src -= (3 * src_stride);
484 
485  filt = LD_SH(filter);
486  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
487 
488  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
489  src += (7 * src_stride);
490 
491  ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
492  src54_r, src21_r);
493  ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
494  ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, src2110,
495  src4332, src6554);
496  XORI_B3_128_SB(src2110, src4332, src6554);
497 
498  for (loop_cnt = (height >> 2); loop_cnt--;) {
499  LD_SB4(src, src_stride, src7, src8, src9, src10);
500  src += (4 * src_stride);
501 
502  ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
503  src87_r, src98_r, src109_r);
504  ILVR_D2_SB(src87_r, src76_r, src109_r, src98_r, src8776, src10998);
505  XORI_B2_128_SB(src8776, src10998);
506  out10 = FILT_8TAP_DPADD_S_H(src2110, src4332, src6554, src8776, filt0,
507  filt1, filt2, filt3);
508  out32 = FILT_8TAP_DPADD_S_H(src4332, src6554, src8776, src10998, filt0,
509  filt1, filt2, filt3);
510  SRARI_H2_SH(out10, out32, 7);
511  SAT_SH2_SH(out10, out32, 7);
512  out = PCKEV_XORI128_UB(out10, out32);
513  ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
514  dst += (4 * dst_stride);
515 
516  src2110 = src6554;
517  src4332 = src8776;
518  src6554 = src10998;
519  src6 = src10;
520  }
521 }
522 
523 static void common_vt_8t_8w_msa(const uint8_t *src, int32_t src_stride,
524  uint8_t *dst, int32_t dst_stride,
525  const int8_t *filter, int32_t height)
526 {
527  uint32_t loop_cnt;
528  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
529  v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
530  v16i8 src65_r, src87_r, src109_r, filt0, filt1, filt2, filt3;
531  v16u8 tmp0, tmp1;
532  v8i16 filt, out0_r, out1_r, out2_r, out3_r;
533 
534  src -= (3 * src_stride);
535 
536  filt = LD_SH(filter);
537  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
538 
539  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
540  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
541  src += (7 * src_stride);
542  ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
543  src54_r, src21_r);
544  ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
545 
546  for (loop_cnt = (height >> 2); loop_cnt--;) {
547  LD_SB4(src, src_stride, src7, src8, src9, src10);
548  XORI_B4_128_SB(src7, src8, src9, src10);
549  src += (4 * src_stride);
550 
551  ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
552  src87_r, src98_r, src109_r);
553  out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, filt0,
554  filt1, filt2, filt3);
555  out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0,
556  filt1, filt2, filt3);
557  out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0,
558  filt1, filt2, filt3);
559  out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, filt0,
560  filt1, filt2, filt3);
561  SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 7);
562  SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
563  tmp0 = PCKEV_XORI128_UB(out0_r, out1_r);
564  tmp1 = PCKEV_XORI128_UB(out2_r, out3_r);
565  ST8x4_UB(tmp0, tmp1, dst, dst_stride);
566  dst += (4 * dst_stride);
567 
568  src10_r = src54_r;
569  src32_r = src76_r;
570  src54_r = src98_r;
571  src21_r = src65_r;
572  src43_r = src87_r;
573  src65_r = src109_r;
574  src6 = src10;
575  }
576 }
577 
578 static void common_vt_8t_16w_msa(const uint8_t *src, int32_t src_stride,
579  uint8_t *dst, int32_t dst_stride,
580  const int8_t *filter, int32_t height)
581 {
582  uint32_t loop_cnt;
583  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
584  v16i8 filt0, filt1, filt2, filt3;
585  v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
586  v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
587  v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
588  v16u8 tmp0, tmp1, tmp2, tmp3;
589  v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
590 
591  src -= (3 * src_stride);
592 
593  filt = LD_SH(filter);
594  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
595 
596  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
597  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
598  src += (7 * src_stride);
599  ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
600  src54_r, src21_r);
601  ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
602  ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l, src32_l,
603  src54_l, src21_l);
604  ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
605 
606  for (loop_cnt = (height >> 2); loop_cnt--;) {
607  LD_SB4(src, src_stride, src7, src8, src9, src10);
608  XORI_B4_128_SB(src7, src8, src9, src10);
609  src += (4 * src_stride);
610 
611  ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
612  src87_r, src98_r, src109_r);
613  ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l,
614  src87_l, src98_l, src109_l);
615  out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, filt0,
616  filt1, filt2, filt3);
617  out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0,
618  filt1, filt2, filt3);
619  out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0,
620  filt1, filt2, filt3);
621  out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, filt0,
622  filt1, filt2, filt3);
623  out0_l = FILT_8TAP_DPADD_S_H(src10_l, src32_l, src54_l, src76_l, filt0,
624  filt1, filt2, filt3);
625  out1_l = FILT_8TAP_DPADD_S_H(src21_l, src43_l, src65_l, src87_l, filt0,
626  filt1, filt2, filt3);
627  out2_l = FILT_8TAP_DPADD_S_H(src32_l, src54_l, src76_l, src98_l, filt0,
628  filt1, filt2, filt3);
629  out3_l = FILT_8TAP_DPADD_S_H(src43_l, src65_l, src87_l, src109_l, filt0,
630  filt1, filt2, filt3);
631  SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 7);
632  SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 7);
633  SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
634  SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
635  PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
636  out3_r, tmp0, tmp1, tmp2, tmp3);
637  XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
638  ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
639  dst += (4 * dst_stride);
640 
641  src10_r = src54_r;
642  src32_r = src76_r;
643  src54_r = src98_r;
644  src21_r = src65_r;
645  src43_r = src87_r;
646  src65_r = src109_r;
647  src10_l = src54_l;
648  src32_l = src76_l;
649  src54_l = src98_l;
650  src21_l = src65_l;
651  src43_l = src87_l;
652  src65_l = src109_l;
653  src6 = src10;
654  }
655 }
656 
657 static void common_vt_8t_16w_mult_msa(const uint8_t *src, int32_t src_stride,
658  uint8_t *dst, int32_t dst_stride,
659  const int8_t *filter, int32_t height,
660  int32_t width)
661 {
662  const uint8_t *src_tmp;
663  uint8_t *dst_tmp;
664  uint32_t loop_cnt, cnt;
665  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
666  v16i8 filt0, filt1, filt2, filt3;
667  v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
668  v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
669  v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
670  v16u8 tmp0, tmp1, tmp2, tmp3;
671  v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
672 
673  src -= (3 * src_stride);
674 
675  filt = LD_SH(filter);
676  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
677 
678  for (cnt = (width >> 4); cnt--;) {
679  src_tmp = src;
680  dst_tmp = dst;
681 
682  LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
683  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
684  src_tmp += (7 * src_stride);
685  ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r,
686  src32_r, src54_r, src21_r);
687  ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
688  ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l,
689  src32_l, src54_l, src21_l);
690  ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
691 
692  for (loop_cnt = (height >> 2); loop_cnt--;) {
693  LD_SB4(src_tmp, src_stride, src7, src8, src9, src10);
694  XORI_B4_128_SB(src7, src8, src9, src10);
695  src_tmp += (4 * src_stride);
696  ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
697  src87_r, src98_r, src109_r);
698  ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l,
699  src87_l, src98_l, src109_l);
700  out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r,
701  filt0, filt1, filt2, filt3);
702  out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r,
703  filt0, filt1, filt2, filt3);
704  out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r,
705  filt0, filt1, filt2, filt3);
706  out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r,
707  filt0, filt1, filt2, filt3);
708  out0_l = FILT_8TAP_DPADD_S_H(src10_l, src32_l, src54_l, src76_l,
709  filt0, filt1, filt2, filt3);
710  out1_l = FILT_8TAP_DPADD_S_H(src21_l, src43_l, src65_l, src87_l,
711  filt0, filt1, filt2, filt3);
712  out2_l = FILT_8TAP_DPADD_S_H(src32_l, src54_l, src76_l, src98_l,
713  filt0, filt1, filt2, filt3);
714  out3_l = FILT_8TAP_DPADD_S_H(src43_l, src65_l, src87_l, src109_l,
715  filt0, filt1, filt2, filt3);
716  SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 7);
717  SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 7);
718  SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
719  SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
720  PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
721  out3_r, tmp0, tmp1, tmp2, tmp3);
722  XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
723  ST_UB4(tmp0, tmp1, tmp2, tmp3, dst_tmp, dst_stride);
724  dst_tmp += (4 * dst_stride);
725 
726  src10_r = src54_r;
727  src32_r = src76_r;
728  src54_r = src98_r;
729  src21_r = src65_r;
730  src43_r = src87_r;
731  src65_r = src109_r;
732  src10_l = src54_l;
733  src32_l = src76_l;
734  src54_l = src98_l;
735  src21_l = src65_l;
736  src43_l = src87_l;
737  src65_l = src109_l;
738  src6 = src10;
739  }
740 
741  src += 16;
742  dst += 16;
743  }
744 }
745 
/* Vertical 8-tap filter for 32-pixel-wide blocks: forwards to the 16-column
 * strip kernel with a width of 32. */
static void common_vt_8t_32w_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    const int32_t width = 32;

    common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, filter,
                              height, width);
}
753 
/* Vertical 8-tap filter for 64-pixel-wide blocks: forwards to the 16-column
 * strip kernel with a width of 64. */
static void common_vt_8t_64w_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    const int32_t width = 64;

    common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, filter,
                              height, width);
}
761 
762 static void common_hv_8ht_8vt_4w_msa(const uint8_t *src, int32_t src_stride,
763  uint8_t *dst, int32_t dst_stride,
764  const int8_t *filter_horiz,
765  const int8_t *filter_vert,
766  int32_t height)
767 {
768  uint32_t loop_cnt;
769  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
770  v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3;
771  v16u8 mask0, mask1, mask2, mask3, out;
772  v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
773  v8i16 hz_out7, hz_out8, hz_out9, tmp0, tmp1, out0, out1, out2, out3, out4;
774  v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3;
775 
776  mask0 = LD_UB(&mc_filt_mask_arr[16]);
777  src -= (3 + 3 * src_stride);
778 
779  /* rearranging filter */
780  filt = LD_SH(filter_horiz);
781  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt_hz0, filt_hz1, filt_hz2, filt_hz3);
782 
783  mask1 = mask0 + 2;
784  mask2 = mask0 + 4;
785  mask3 = mask0 + 6;
786 
787  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
788  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
789  src += (7 * src_stride);
790 
791  hz_out0 = HORIZ_8TAP_FILT(src0, src1, mask0, mask1, mask2, mask3, filt_hz0,
792  filt_hz1, filt_hz2, filt_hz3);
793  hz_out2 = HORIZ_8TAP_FILT(src2, src3, mask0, mask1, mask2, mask3, filt_hz0,
794  filt_hz1, filt_hz2, filt_hz3);
795  hz_out4 = HORIZ_8TAP_FILT(src4, src5, mask0, mask1, mask2, mask3, filt_hz0,
796  filt_hz1, filt_hz2, filt_hz3);
797  hz_out5 = HORIZ_8TAP_FILT(src5, src6, mask0, mask1, mask2, mask3, filt_hz0,
798  filt_hz1, filt_hz2, filt_hz3);
799  SLDI_B2_SH(hz_out2, hz_out4, hz_out0, hz_out2, hz_out1, hz_out3, 8);
800 
801  filt = LD_SH(filter_vert);
802  SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3);
803 
804  ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1);
805  out2 = (v8i16) __msa_ilvev_b((v16i8) hz_out5, (v16i8) hz_out4);
806 
807  for (loop_cnt = (height >> 2); loop_cnt--;) {
808  LD_SB4(src, src_stride, src7, src8, src9, src10);
809  XORI_B4_128_SB(src7, src8, src9, src10);
810  src += (4 * src_stride);
811 
812  hz_out7 = HORIZ_8TAP_FILT(src7, src8, mask0, mask1, mask2, mask3,
813  filt_hz0, filt_hz1, filt_hz2, filt_hz3);
814  hz_out6 = (v8i16) __msa_sldi_b((v16i8) hz_out7, (v16i8) hz_out5, 8);
815  out3 = (v8i16) __msa_ilvev_b((v16i8) hz_out7, (v16i8) hz_out6);
816  tmp0 = FILT_8TAP_DPADD_S_H(out0, out1, out2, out3, filt_vt0, filt_vt1,
817  filt_vt2, filt_vt3);
818 
819  hz_out9 = HORIZ_8TAP_FILT(src9, src10, mask0, mask1, mask2, mask3,
820  filt_hz0, filt_hz1, filt_hz2, filt_hz3);
821  hz_out8 = (v8i16) __msa_sldi_b((v16i8) hz_out9, (v16i8) hz_out7, 8);
822  out4 = (v8i16) __msa_ilvev_b((v16i8) hz_out9, (v16i8) hz_out8);
823  tmp1 = FILT_8TAP_DPADD_S_H(out1, out2, out3, out4, filt_vt0, filt_vt1,
824  filt_vt2, filt_vt3);
825  SRARI_H2_SH(tmp0, tmp1, 7);
826  SAT_SH2_SH(tmp0, tmp1, 7);
827  out = PCKEV_XORI128_UB(tmp0, tmp1);
828  ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
829  dst += (4 * dst_stride);
830 
831  hz_out5 = hz_out9;
832  out0 = out2;
833  out1 = out3;
834  out2 = out4;
835  }
836 }
837 
/*
 * Horizontal followed by vertical 8-tap filtering of one 8-column strip
 * (width per the _8w name and the ST8x4_UB store). dst is written, never read.
 *
 * src, src_stride:  source pointer and row pitch; src is moved back by
 *                   3 columns and 3 rows before the first load.
 * dst, dst_stride:  destination pointer and row pitch.
 * filter_horiz:     horizontal coefficients, loaded as one vector with LD_SH.
 * filter_vert:      vertical coefficients, loaded the same way.
 * height:           row count; the loop runs (height >> 2) times and advances
 *                   dst by four rows per pass, so height % 4 rows are not
 *                   produced.
 */
838 static void common_hv_8ht_8vt_8w_msa(const uint8_t *src, int32_t src_stride,
839  uint8_t *dst, int32_t dst_stride,
840  const int8_t *filter_horiz,
841  const int8_t *filter_vert,
842  int32_t height)
843 {
844  uint32_t loop_cnt;
845  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
846  v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3;
847  v16u8 mask0, mask1, mask2, mask3, vec0, vec1;
848  v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3;
849  v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
850  v8i16 hz_out7, hz_out8, hz_out9, hz_out10, tmp0, tmp1, tmp2, tmp3;
851  v8i16 out0, out1, out2, out3, out4, out5, out6, out7, out8, out9;
852 
 /* First 16 entries of the mask table: the "8 width cases" shuffle. */
853  mask0 = LD_UB(&mc_filt_mask_arr[0]);
854  src -= (3 + 3 * src_stride);
855 
856  /* rearranging filter */
857  filt = LD_SH(filter_horiz);
858  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt_hz0, filt_hz1, filt_hz2, filt_hz3);
859 
 /* The remaining three masks are mask0 with every index shifted by 2, 4, 6. */
860  mask1 = mask0 + 2;
861  mask2 = mask0 + 4;
862  mask3 = mask0 + 6;
863 
 /* Seven rows are loaded and filtered horizontally before the loop. */
864  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
865  src += (7 * src_stride);
866 
867  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
868  hz_out0 = HORIZ_8TAP_FILT(src0, src0, mask0, mask1, mask2, mask3, filt_hz0,
869  filt_hz1, filt_hz2, filt_hz3);
870  hz_out1 = HORIZ_8TAP_FILT(src1, src1, mask0, mask1, mask2, mask3, filt_hz0,
871  filt_hz1, filt_hz2, filt_hz3);
872  hz_out2 = HORIZ_8TAP_FILT(src2, src2, mask0, mask1, mask2, mask3, filt_hz0,
873  filt_hz1, filt_hz2, filt_hz3);
874  hz_out3 = HORIZ_8TAP_FILT(src3, src3, mask0, mask1, mask2, mask3, filt_hz0,
875  filt_hz1, filt_hz2, filt_hz3);
876  hz_out4 = HORIZ_8TAP_FILT(src4, src4, mask0, mask1, mask2, mask3, filt_hz0,
877  filt_hz1, filt_hz2, filt_hz3);
878  hz_out5 = HORIZ_8TAP_FILT(src5, src5, mask0, mask1, mask2, mask3, filt_hz0,
879  filt_hz1, filt_hz2, filt_hz3);
880  hz_out6 = HORIZ_8TAP_FILT(src6, src6, mask0, mask1, mask2, mask3, filt_hz0,
881  filt_hz1, filt_hz2, filt_hz3);
882 
883  filt = LD_SH(filter_vert);
884  SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3);
885 
 /*
  * Combine neighbouring horizontal results. out0-out2 feed tmp0 and tmp2,
  * out4-out6 feed tmp1 and tmp3 in the loop below.
  */
886  ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1);
887  ILVEV_B2_SH(hz_out4, hz_out5, hz_out1, hz_out2, out2, out4);
888  ILVEV_B2_SH(hz_out3, hz_out4, hz_out5, hz_out6, out5, out6);
889 
890  for (loop_cnt = (height >> 2); loop_cnt--;) {
891  LD_SB4(src, src_stride, src7, src8, src9, src10);
892  src += (4 * src_stride);
893 
894  XORI_B4_128_SB(src7, src8, src9, src10);
895 
 /* Each new row is filtered horizontally, paired with the previous
  * horizontal result, and then run through the vertical filter. */
896  hz_out7 = HORIZ_8TAP_FILT(src7, src7, mask0, mask1, mask2, mask3,
897  filt_hz0, filt_hz1, filt_hz2, filt_hz3);
898  out3 = (v8i16) __msa_ilvev_b((v16i8) hz_out7, (v16i8) hz_out6);
899  tmp0 = FILT_8TAP_DPADD_S_H(out0, out1, out2, out3, filt_vt0, filt_vt1,
900  filt_vt2, filt_vt3);
901 
902  hz_out8 = HORIZ_8TAP_FILT(src8, src8, mask0, mask1, mask2, mask3,
903  filt_hz0, filt_hz1, filt_hz2, filt_hz3);
904  out7 = (v8i16) __msa_ilvev_b((v16i8) hz_out8, (v16i8) hz_out7);
905  tmp1 = FILT_8TAP_DPADD_S_H(out4, out5, out6, out7, filt_vt0, filt_vt1,
906  filt_vt2, filt_vt3);
907 
908  hz_out9 = HORIZ_8TAP_FILT(src9, src9, mask0, mask1, mask2, mask3,
909  filt_hz0, filt_hz1, filt_hz2, filt_hz3);
910  out8 = (v8i16) __msa_ilvev_b((v16i8) hz_out9, (v16i8) hz_out8);
911  tmp2 = FILT_8TAP_DPADD_S_H(out1, out2, out3, out8, filt_vt0,
912  filt_vt1, filt_vt2, filt_vt3);
913 
914  hz_out10 = HORIZ_8TAP_FILT(src10, src10, mask0, mask1, mask2, mask3,
915  filt_hz0, filt_hz1, filt_hz2, filt_hz3);
916  out9 = (v8i16) __msa_ilvev_b((v16i8) hz_out10, (v16i8) hz_out9);
917  tmp3 = FILT_8TAP_DPADD_S_H(out5, out6, out7, out9, filt_vt0, filt_vt1,
918  filt_vt2, filt_vt3);
 /* Shift by 7 and saturate (per the SRARI/SAT macro names -- confirm
  * the exact rounding in the macro header) before packing to bytes. */
919  SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7);
920  SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
921  vec0 = PCKEV_XORI128_UB(tmp0, tmp1);
922  vec1 = PCKEV_XORI128_UB(tmp2, tmp3);
923  ST8x4_UB(vec0, vec1, dst, dst_stride);
924  dst += (4 * dst_stride);
925 
 /* Carry the newest horizontal result and row pairs into the next pass. */
926  hz_out6 = hz_out10;
927  out0 = out2;
928  out1 = out3;
929  out2 = out8;
930  out4 = out6;
931  out5 = out7;
932  out6 = out9;
933  }
934 }
935 
/*
 * 16-column variant of the combined horizontal + vertical 8-tap filter.
 *
 * The work is delegated to common_hv_8ht_8vt_8w_msa(), which is invoked once
 * per 8-column strip (two strips here) with src and dst offset by the strip's
 * starting column. All other arguments are forwarded unchanged.
 */
static void common_hv_8ht_8vt_16w_msa(const uint8_t *src, int32_t src_stride,
                                      uint8_t *dst, int32_t dst_stride,
                                      const int8_t *filter_horiz,
                                      const int8_t *filter_vert,
                                      int32_t height)
{
    int32_t col;

    for (col = 0; col < 16; col += 8) {
        common_hv_8ht_8vt_8w_msa(src + col, src_stride, dst + col, dst_stride,
                                 filter_horiz, filter_vert, height);
    }
}
952 
/*
 * 32-column variant of the combined horizontal + vertical 8-tap filter.
 *
 * The work is delegated to common_hv_8ht_8vt_8w_msa(), which is invoked once
 * per 8-column strip (four strips here) with src and dst offset by the strip's
 * starting column. All other arguments are forwarded unchanged.
 */
static void common_hv_8ht_8vt_32w_msa(const uint8_t *src, int32_t src_stride,
                                      uint8_t *dst, int32_t dst_stride,
                                      const int8_t *filter_horiz,
                                      const int8_t *filter_vert,
                                      int32_t height)
{
    int32_t col;

    for (col = 0; col < 32; col += 8) {
        common_hv_8ht_8vt_8w_msa(src + col, src_stride, dst + col, dst_stride,
                                 filter_horiz, filter_vert, height);
    }
}
969 
/*
 * 64-column variant of the combined horizontal + vertical 8-tap filter.
 *
 * The work is delegated to common_hv_8ht_8vt_8w_msa(), which is invoked once
 * per 8-column strip (eight strips here) with src and dst offset by the
 * strip's starting column. All other arguments are forwarded unchanged.
 */
static void common_hv_8ht_8vt_64w_msa(const uint8_t *src, int32_t src_stride,
                                      uint8_t *dst, int32_t dst_stride,
                                      const int8_t *filter_horiz,
                                      const int8_t *filter_vert,
                                      int32_t height)
{
    int32_t col;

    for (col = 0; col < 64; col += 8) {
        common_hv_8ht_8vt_8w_msa(src + col, src_stride, dst + col, dst_stride,
                                 filter_horiz, filter_vert, height);
    }
}
986 
/*
 * NOTE(review): the opening line of this definition (return type, name and
 * the `src` parameter used below) is not present in this listing. The
 * parameter list matches the call to common_hz_8t_and_aver_dst_4x4_msa()
 * made further down in this file -- confirm against the original source.
 *
 * Horizontal 8-tap filtering of four rows (HORIZ_8TAP_4WID_4VECS_FILT); the
 * packed result is averaged with the bytes already in dst (__msa_aver_u_b)
 * and stored back to dst.
 *
 * src_stride / dst_stride: row pitch of src / dst.
 * dst:    read (one 32-bit word from each of four rows) and then overwritten.
 * filter: coefficients, loaded as one vector with LD_SH.
 */
988  int32_t src_stride,
989  uint8_t *dst, int32_t dst_stride,
990  const int8_t *filter)
991 {
992  uint32_t tp0, tp1, tp2, tp3;
993  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
994  v16u8 dst0, res;
995  v16u8 mask0, mask1, mask2, mask3;
996  v8i16 filt, res0, res1;
997 
 /* Entries 16..31 of the mask table: the first "4 width cases" shuffle. */
998  mask0 = LD_UB(&mc_filt_mask_arr[16]);
 /* Step back three columns before loading. */
999  src -= 3;
1000 
1001  /* rearranging filter */
1002  filt = LD_SH(filter);
1003  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1004 
1005  mask1 = mask0 + 2;
1006  mask2 = mask0 + 4;
1007  mask3 = mask0 + 6;
1008 
1009  LD_SB4(src, src_stride, src0, src1, src2, src3);
1010  XORI_B4_128_SB(src0, src1, src2, src3);
1011  HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
1012  mask3, filt0, filt1, filt2, filt3, res0, res1);
 /* Gather the four existing dst rows into one vector for the average. */
1013  LW4(dst, dst_stride, tp0, tp1, tp2, tp3);
1014  INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
1015  SRARI_H2_SH(res0, res1, 7);
1016  SAT_SH2_SH(res0, res1, 7);
1017  res = PCKEV_XORI128_UB(res0, res1);
1018  res = (v16u8) __msa_aver_u_b(res, dst0);
1019  ST4x4_UB(res, res, 0, 1, 2, 3, dst, dst_stride);
1020 }
1021 
/*
 * NOTE(review): the opening line of this definition (return type, name and
 * the `src` parameter used below) is not present in this listing. The
 * parameter list matches the call to common_hz_8t_and_aver_dst_4x8_msa()
 * made further down in this file -- confirm against the original source.
 *
 * Horizontal 8-tap filtering of eight rows, handled as two groups of four
 * (two HORIZ_8TAP_4WID_4VECS_FILT invocations); the packed result is averaged
 * with the bytes already in dst and stored back with ST4x8_UB.
 *
 * src_stride / dst_stride: row pitch of src / dst.
 * dst:    read (one 32-bit word from each of eight rows) and then overwritten.
 * filter: coefficients, loaded as one vector with LD_SH.
 */
1023  int32_t src_stride,
1024  uint8_t *dst, int32_t dst_stride,
1025  const int8_t *filter)
1026 {
1027  uint32_t tp0, tp1, tp2, tp3;
1028  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
1029  v16u8 mask0, mask1, mask2, mask3, res0, res1, res2, res3;
1030  v16u8 dst0, dst1;
1031  v8i16 filt, vec0, vec1, vec2, vec3;
1032 
 /* Entries 16..31 of the mask table: the first "4 width cases" shuffle. */
1033  mask0 = LD_UB(&mc_filt_mask_arr[16]);
1034  src -= 3;
1035 
1036  /* rearranging filter */
1037  filt = LD_SH(filter);
1038  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1039 
1040  mask1 = mask0 + 2;
1041  mask2 = mask0 + 4;
1042  mask3 = mask0 + 6;
1043 
1044  LD_SB4(src, src_stride, src0, src1, src2, src3);
1045  XORI_B4_128_SB(src0, src1, src2, src3);
1046  src += (4 * src_stride);
 /* dst rows 0-3 go to dst0, rows 4-7 to dst1. */
1047  LW4(dst, dst_stride, tp0, tp1, tp2, tp3);
1048  INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
1049  LW4(dst + 4 * dst_stride, dst_stride, tp0, tp1, tp2, tp3);
1050  INSERT_W4_UB(tp0, tp1, tp2, tp3, dst1);
1051  HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
1052  mask3, filt0, filt1, filt2, filt3, vec0, vec1);
 /* Second group of four source rows reuses src0..src3. */
1053  LD_SB4(src, src_stride, src0, src1, src2, src3);
1054  XORI_B4_128_SB(src0, src1, src2, src3);
1055  HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
1056  mask3, filt0, filt1, filt2, filt3, vec2, vec3);
1057  SRARI_H4_SH(vec0, vec1, vec2, vec3, 7);
1058  SAT_SH4_SH(vec0, vec1, vec2, vec3, 7);
1059  PCKEV_B4_UB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3,
1060  res0, res1, res2, res3);
 /* Merge the four packed halves into two full vectors (res0, res2). */
1061  ILVR_D2_UB(res1, res0, res3, res2, res0, res2);
1062  XORI_B2_128_UB(res0, res2);
1063  AVER_UB2_UB(res0, dst0, res2, dst1, res0, res2);
1064  ST4x8_UB(res0, res2, dst, dst_stride);
1065 }
1066 
/*
 * NOTE(review): the opening line of this definition (return type, name and
 * the `src` parameter used below) is not present in this listing.
 *
 * Height dispatcher for the 4-wide horizontal 8-tap "filter and average with
 * dst" kernels: height 4 goes to the 4x4 kernel, height 8 to the 4x8 kernel.
 * Any other height does nothing.
 */
1068  int32_t src_stride,
1069  uint8_t *dst, int32_t dst_stride,
1070  const int8_t *filter,
1071  int32_t height)
1072 {
1073  if (4 == height) {
1074  common_hz_8t_and_aver_dst_4x4_msa(src, src_stride, dst, dst_stride,
1075  filter);
1076  } else if (8 == height) {
1077  common_hz_8t_and_aver_dst_4x8_msa(src, src_stride, dst, dst_stride,
1078  filter);
1079  }
1080 }
1081 
/*
 * NOTE(review): the opening line of this definition (return type, name and
 * the `src` parameter used below) is not present in this listing.
 *
 * Horizontal 8-tap filtering, four rows per loop pass
 * (HORIZ_8TAP_8WID_4VECS_FILT), with the result averaged against the existing
 * dst rows and stored by CONVERT_UB_AVG_ST8x4_UB.
 *
 * src_stride / dst_stride: row pitch of src / dst.
 * dst:    read (one 64-bit word per row) and then overwritten.
 * filter: coefficients, loaded as one vector with LD_SH.
 * height: row count; the loop runs (height >> 2) times, so height % 4 rows
 *         are not produced.
 */
1083  int32_t src_stride,
1084  uint8_t *dst, int32_t dst_stride,
1085  const int8_t *filter,
1086  int32_t height)
1087 {
1088  int32_t loop_cnt;
 /* NOTE(review): tp0..tp3 are int64_t here, while the other kernels in this
  * file that use LD4/INSERT_D2_UB declare them uint64_t. */
1089  int64_t tp0, tp1, tp2, tp3;
1090  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
1091  v16u8 mask0, mask1, mask2, mask3, dst0, dst1;
1092  v8i16 filt, out0, out1, out2, out3;
1093 
 /* First 16 entries of the mask table: the "8 width cases" shuffle. */
1094  mask0 = LD_UB(&mc_filt_mask_arr[0]);
1095  src -= 3;
1096 
1097  /* rearranging filter */
1098  filt = LD_SH(filter);
1099  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1100 
1101  mask1 = mask0 + 2;
1102  mask2 = mask0 + 4;
1103  mask3 = mask0 + 6;
1104 
1105  for (loop_cnt = (height >> 2); loop_cnt--;) {
1106  LD_SB4(src, src_stride, src0, src1, src2, src3);
1107  XORI_B4_128_SB(src0, src1, src2, src3);
1108  src += (4 * src_stride);
1109  HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
1110  mask3, filt0, filt1, filt2, filt3, out0,
1111  out1, out2, out3);
 /* Two dst rows per vector: rows 0-1 in dst0, rows 2-3 in dst1. */
1112  LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
1113  INSERT_D2_UB(tp0, tp1, dst0);
1114  INSERT_D2_UB(tp2, tp3, dst1);
1115  SRARI_H4_SH(out0, out1, out2, out3, 7);
1116  SAT_SH4_SH(out0, out1, out2, out3, 7);
1117  CONVERT_UB_AVG_ST8x4_UB(out0, out1, out2, out3, dst0, dst1,
1118  dst, dst_stride);
1119  dst += (4 * dst_stride);
1120  }
1121 }
1122 
/*
 * NOTE(review): the opening line of this definition (return type, name and
 * the `src` parameter used below) is not present in this listing.
 *
 * Horizontal 8-tap filtering of a 16-byte-wide strip, two rows per loop pass,
 * with each output row averaged against the existing dst row and stored by
 * PCKEV_XORI128_AVG_ST_UB.
 *
 * src_stride / dst_stride: row pitch of src / dst.
 * dst:    read (one full vector per row) and then overwritten.
 * filter: coefficients, loaded as one vector with LD_SH.
 * height: row count; the loop runs (height >> 1) times, so an odd final row
 *         is not produced.
 */
1124  int32_t src_stride,
1125  uint8_t *dst, int32_t dst_stride,
1126  const int8_t *filter,
1127  int32_t height)
1128 {
1129  int32_t loop_cnt;
1130  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
1131  v16u8 mask0, mask1, mask2, mask3, dst0, dst1;
1132  v8i16 filt, out0, out1, out2, out3;
1133  v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1134  v8i16 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
1135 
1136  mask0 = LD_UB(&mc_filt_mask_arr[0]);
1137  src -= 3;
1138 
1139  /* rearranging filter */
1140  filt = LD_SH(filter);
1141  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1142 
1143  mask1 = mask0 + 2;
1144  mask2 = mask0 + 4;
1145  mask3 = mask0 + 6;
1146 
1147  for (loop_cnt = height >> 1; loop_cnt--;) {
 /* src0/src1 cover the first row (offsets 0 and 8), src2/src3 the
  * second row. */
1148  LD_SB2(src, src_stride, src0, src2);
1149  LD_SB2(src + 8, src_stride, src1, src3);
1150  src += (2 * src_stride);
1151 
1152  XORI_B4_128_SB(src0, src1, src2, src3);
1153  VSHF_B4_SH(src0, src0, mask0, mask1, mask2, mask3, vec0, vec4, vec8,
1154  vec12);
1155  VSHF_B4_SH(src1, src1, mask0, mask1, mask2, mask3, vec1, vec5, vec9,
1156  vec13);
1157  VSHF_B4_SH(src2, src2, mask0, mask1, mask2, mask3, vec2, vec6, vec10,
1158  vec14);
1159  VSHF_B4_SH(src3, src3, mask0, mask1, mask2, mask3, vec3, vec7, vec11,
1160  vec15);
 /* vec0..vec3 collect the filt0 and filt1 products, vec8..vec11 the
  * filt2 and filt3 products; ADDS_SH4_SH then combines the two halves. */
1161  DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0,
1162  vec1, vec2, vec3);
1163  DOTP_SB4_SH(vec8, vec9, vec10, vec11, filt2, filt2, filt2, filt2, vec8,
1164  vec9, vec10, vec11);
1165  DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, vec0,
1166  vec1, vec2, vec3);
1167  DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt3, filt3, filt3, filt3,
1168  vec8, vec9, vec10, vec11);
1169  ADDS_SH4_SH(vec0, vec8, vec1, vec9, vec2, vec10, vec3, vec11, out0,
1170  out1, out2, out3);
1171  LD_UB2(dst, dst_stride, dst0, dst1);
1172  SRARI_H4_SH(out0, out1, out2, out3, 7);
1173  SAT_SH4_SH(out0, out1, out2, out3, 7);
1174  PCKEV_XORI128_AVG_ST_UB(out1, out0, dst0, dst);
1175  dst += dst_stride;
1176  PCKEV_XORI128_AVG_ST_UB(out3, out2, dst1, dst);
1177  dst += dst_stride;
1178  }
1179 }
1180 
/*
 * NOTE(review): the opening line of this definition (return type, name and
 * the `src` parameter used below) is not present in this listing.
 *
 * Horizontal 8-tap filtering of a 32-byte-wide strip, one row per loop pass,
 * with the two 16-byte output halves averaged against the existing dst bytes
 * and stored by PCKEV_XORI128_AVG_ST_UB.
 *
 * src_stride / dst_stride: row pitch of src / dst.
 * dst:    read (two vectors per row, at offsets 0 and 16) and overwritten.
 * filter: coefficients, loaded as one vector with LD_SH.
 * height: number of rows; the loop runs exactly height times.
 */
1182  int32_t src_stride,
1183  uint8_t *dst, int32_t dst_stride,
1184  const int8_t *filter,
1185  int32_t height)
1186 {
1187  uint32_t loop_cnt;
1188  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
1189  v16u8 dst1, dst2, mask0, mask1, mask2, mask3;
1190  v8i16 filt, out0, out1, out2, out3;
1191  v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1192  v8i16 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
1193 
1194  mask0 = LD_UB(&mc_filt_mask_arr[0]);
1195  src -= 3;
1196 
1197  /* rearranging filter */
1198  filt = LD_SH(filter);
1199  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1200 
1201  mask1 = mask0 + 2;
1202  mask2 = mask0 + 4;
1203  mask3 = mask0 + 6;
1204 
1205  for (loop_cnt = height; loop_cnt--;) {
 /* Three loads at offsets 0, 16 and 24; src1 is derived from src0 and
  * src2 with a byte slide of 8 instead of a fourth load (presumably
  * yielding the bytes at offset 8 -- confirm __msa_sldi_b operand
  * order). */
1206  src0 = LD_SB(src);
1207  src2 = LD_SB(src + 16);
1208  src3 = LD_SB(src + 24);
1209  src1 = __msa_sldi_b(src2, src0, 8);
1210  src += src_stride;
1211 
1212  XORI_B4_128_SB(src0, src1, src2, src3);
1213  VSHF_B4_SH(src0, src0, mask0, mask1, mask2, mask3, vec0, vec4, vec8,
1214  vec12);
1215  VSHF_B4_SH(src1, src1, mask0, mask1, mask2, mask3, vec1, vec5, vec9,
1216  vec13);
1217  VSHF_B4_SH(src2, src2, mask0, mask1, mask2, mask3, vec2, vec6, vec10,
1218  vec14);
1219  VSHF_B4_SH(src3, src3, mask0, mask1, mask2, mask3, vec3, vec7, vec11,
1220  vec15);
 /* vec0..vec3 collect the filt0 and filt1 products, vec8..vec11 the
  * filt2 and filt3 products; ADDS_SH4_SH then combines the two halves. */
1221  DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0,
1222  vec1, vec2, vec3);
1223  DOTP_SB4_SH(vec8, vec9, vec10, vec11, filt2, filt2, filt2, filt2, vec8,
1224  vec9, vec10, vec11);
1225  DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, vec0,
1226  vec1, vec2, vec3);
1227  DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt3, filt3, filt3, filt3,
1228  vec8, vec9, vec10, vec11);
1229  ADDS_SH4_SH(vec0, vec8, vec1, vec9, vec2, vec10, vec3, vec11, out0,
1230  out1, out2, out3);
1231  SRARI_H4_SH(out0, out1, out2, out3, 7);
1232  SAT_SH4_SH(out0, out1, out2, out3, 7);
1233  LD_UB2(dst, 16, dst1, dst2);
1234  PCKEV_XORI128_AVG_ST_UB(out1, out0, dst1, dst);
1235  PCKEV_XORI128_AVG_ST_UB(out3, out2, dst2, dst + 16);
1236  dst += dst_stride;
1237  }
1238 }
1239 
/*
 * NOTE(review): the opening line of this definition (return type, name and
 * the `src` parameter used below) is not present in this listing.
 *
 * Horizontal 8-tap filtering of a 64-byte-wide strip, one row per outer loop
 * pass. Each row is processed as two 32-byte halves (inner loop, byte offset
 * cnt << 5); the output is averaged against the existing dst bytes and stored
 * by PCKEV_XORI128_AVG_ST_UB.
 *
 * src_stride / dst_stride: row pitch of src / dst.
 * dst:    read and then overwritten.
 * filter: coefficients, loaded as one vector with LD_SH.
 * height: number of rows; the outer loop runs exactly height times.
 */
1241  int32_t src_stride,
1242  uint8_t *dst, int32_t dst_stride,
1243  const int8_t *filter,
1244  int32_t height)
1245 {
1246  uint32_t loop_cnt, cnt;
1247  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
1248  v16u8 dst1, dst2, mask0, mask1, mask2, mask3;
1249  v8i16 filt, out0, out1, out2, out3;
1250  v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1251  v8i16 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
1252 
1253  mask0 = LD_UB(&mc_filt_mask_arr[0]);
1254  src -= 3;
1255 
1256  /* rearranging filter */
1257  filt = LD_SH(filter);
1258  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1259 
1260  mask1 = mask0 + 2;
1261  mask2 = mask0 + 4;
1262  mask3 = mask0 + 6;
1263 
1264  for (loop_cnt = height; loop_cnt--;) {
1265  for (cnt = 0; cnt < 2; ++cnt) {
 /* Loads at offsets 0, 16 and 24 within the current 32-byte half;
  * src1 is derived from src0 and src2 with a byte slide of 8. */
1266  src0 = LD_SB(&src[cnt << 5]);
1267  src2 = LD_SB(&src[16 + (cnt << 5)]);
1268  src3 = LD_SB(&src[24 + (cnt << 5)]);
1269  src1 = __msa_sldi_b(src2, src0, 8);
1270 
1271  XORI_B4_128_SB(src0, src1, src2, src3);
1272  VSHF_B4_SH(src0, src0, mask0, mask1, mask2, mask3, vec0, vec4, vec8,
1273  vec12);
1274  VSHF_B4_SH(src1, src1, mask0, mask1, mask2, mask3, vec1, vec5, vec9,
1275  vec13);
1276  VSHF_B4_SH(src2, src2, mask0, mask1, mask2, mask3, vec2, vec6,
1277  vec10, vec14);
1278  VSHF_B4_SH(src3, src3, mask0, mask1, mask2, mask3, vec3, vec7,
1279  vec11, vec15);
1280  DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
1281  vec0, vec1, vec2, vec3);
1282  DOTP_SB4_SH(vec8, vec9, vec10, vec11, filt2, filt2, filt2, filt2,
1283  vec8, vec9, vec10, vec11);
1284  DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1,
1285  vec0, vec1, vec2, vec3);
1286  DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt3, filt3, filt3, filt3,
1287  vec8, vec9, vec10, vec11);
1288  ADDS_SH4_SH(vec0, vec8, vec1, vec9, vec2, vec10, vec3, vec11, out0,
1289  out1, out2, out3);
1290  SRARI_H4_SH(out0, out1, out2, out3, 7);
1291  SAT_SH4_SH(out0, out1, out2, out3, 7);
1292  LD_UB2(&dst[cnt << 5], 16, dst1, dst2);
1293  PCKEV_XORI128_AVG_ST_UB(out1, out0, dst1, &dst[cnt << 5]);
1294  PCKEV_XORI128_AVG_ST_UB(out3, out2, dst2, &dst[16 + (cnt << 5)]);
1295  }
1296 
 /* Pointers advance once per row, after both halves are done. */
1297  src += src_stride;
1298  dst += dst_stride;
1299  }
1300 }
1301 
/*
 * NOTE(review): the opening line of this definition (return type, name and
 * the `src` parameter used below) is not present in this listing.
 *
 * Vertical 8-tap filtering, four output rows per loop pass, with the result
 * averaged against the existing dst bytes (__msa_aver_u_b) and stored by
 * ST4x4_UB.
 *
 * src_stride / dst_stride: row pitch of src / dst.
 * dst:    read (one 32-bit word from each of four rows) and overwritten.
 * filter: coefficients, loaded as one vector with LD_SH.
 * height: row count; the loop runs (height >> 2) times, so height % 4 rows
 *         are not produced.
 */
1303  int32_t src_stride,
1304  uint8_t *dst, int32_t dst_stride,
1305  const int8_t *filter,
1306  int32_t height)
1307 {
1308  uint32_t loop_cnt;
1309  uint32_t tp0, tp1, tp2, tp3;
1310  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1311  v16u8 dst0, out;
1312  v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
1313  v16i8 src65_r, src87_r, src109_r, src2110, src4332, src6554, src8776;
1314  v16i8 src10998, filt0, filt1, filt2, filt3;
1315  v8i16 filt, out10, out32;
1316 
 /* Start three rows above the first output row. */
1317  src -= (3 * src_stride);
1318 
1319  filt = LD_SH(filter);
1320  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1321 
1322  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
1323  src += (7 * src_stride);
1324 
 /* Interleave adjacent rows, then join two interleaved pairs per vector
  * (e.g. src2110 is built from src21_r and src10_r). The XOR with 128 is
  * applied to the joined vectors rather than to the raw rows. */
1325  ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
1326  src54_r, src21_r);
1327  ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
1328  ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, src2110,
1329  src4332, src6554);
1330  XORI_B3_128_SB(src2110, src4332, src6554);
1331 
1332  for (loop_cnt = (height >> 2); loop_cnt--;) {
1333  LD_SB4(src, src_stride, src7, src8, src9, src10);
1334  src += (4 * src_stride);
1335 
1336  LW4(dst, dst_stride, tp0, tp1, tp2, tp3);
1337  INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
1338  ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
1339  src87_r, src98_r, src109_r);
1340  ILVR_D2_SB(src87_r, src76_r, src109_r, src98_r, src8776, src10998);
1341  XORI_B2_128_SB(src8776, src10998);
1342  out10 = FILT_8TAP_DPADD_S_H(src2110, src4332, src6554, src8776, filt0,
1343  filt1, filt2, filt3);
1344  out32 = FILT_8TAP_DPADD_S_H(src4332, src6554, src8776, src10998, filt0,
1345  filt1, filt2, filt3);
1346  SRARI_H2_SH(out10, out32, 7);
1347  SAT_SH2_SH(out10, out32, 7);
1348  out = PCKEV_XORI128_UB(out10, out32);
1349  out = __msa_aver_u_b(out, dst0);
1350 
1351  ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
1352  dst += (4 * dst_stride);
1353 
 /* Slide the row history down by four rows; src6 keeps the last raw
  * row for the next pass's first interleave. */
1354  src2110 = src6554;
1355  src4332 = src8776;
1356  src6554 = src10998;
1357  src6 = src10;
1358  }
1359 }
1360 
/*
 * NOTE(review): the opening line of this definition (return type, name and
 * the `src` parameter used below) is not present in this listing.
 *
 * Vertical 8-tap filtering, four output rows per loop pass, with the result
 * averaged against the existing dst rows and stored by
 * CONVERT_UB_AVG_ST8x4_UB.
 *
 * src_stride / dst_stride: row pitch of src / dst.
 * dst:    read (one 64-bit word per row) and then overwritten.
 * filter: coefficients, loaded as one vector with LD_SH.
 * height: row count; the loop runs (height >> 2) times, so height % 4 rows
 *         are not produced.
 */
1362  int32_t src_stride,
1363  uint8_t *dst, int32_t dst_stride,
1364  const int8_t *filter,
1365  int32_t height)
1366 {
1367  uint32_t loop_cnt;
1368  uint64_t tp0, tp1, tp2, tp3;
1369  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1370  v16u8 dst0, dst1;
1371  v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
1372  v16i8 src65_r, src87_r, src109_r, filt0, filt1, filt2, filt3;
1373  v8i16 filt, out0, out1, out2, out3;
1374 
 /* Start three rows above the first output row. */
1375  src -= (3 * src_stride);
1376 
1377  filt = LD_SH(filter);
1378  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1379 
1380  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
1381  src += (7 * src_stride);
1382 
 /* Interleave each pair of adjacent rows (srcNM_r is built from rows N
  * and M). */
1383  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1384  ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
1385  src54_r, src21_r);
1386  ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
1387 
1388  for (loop_cnt = (height >> 2); loop_cnt--;) {
1389  LD_SB4(src, src_stride, src7, src8, src9, src10);
1390  src += (4 * src_stride);
1391 
 /* Two dst rows per vector: rows 0-1 in dst0, rows 2-3 in dst1. */
1392  LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
1393  INSERT_D2_UB(tp0, tp1, dst0);
1394  INSERT_D2_UB(tp2, tp3, dst1);
1395  XORI_B4_128_SB(src7, src8, src9, src10);
1396  ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
1397  src87_r, src98_r, src109_r);
1398  out0 = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, filt0,
1399  filt1, filt2, filt3);
1400  out1 = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0,
1401  filt1, filt2, filt3);
1402  out2 = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0,
1403  filt1, filt2, filt3);
1404  out3 = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, filt0,
1405  filt1, filt2, filt3);
1406  SRARI_H4_SH(out0, out1, out2, out3, 7);
1407  SAT_SH4_SH(out0, out1, out2, out3, 7);
1408  CONVERT_UB_AVG_ST8x4_UB(out0, out1, out2, out3, dst0, dst1,
1409  dst, dst_stride);
1410  dst += (4 * dst_stride);
1411 
 /* Slide the interleaved row history down by four rows. */
1412  src10_r = src54_r;
1413  src32_r = src76_r;
1414  src54_r = src98_r;
1415  src21_r = src65_r;
1416  src43_r = src87_r;
1417  src65_r = src109_r;
1418  src6 = src10;
1419  }
1420 }
1421 
/*
 * NOTE(review): the opening line of this definition (return type, name and
 * the `src` parameter used below) is not present in this listing. The
 * parameter list matches the calls to common_vt_8t_and_aver_dst_16w_mult_msa()
 * made further down in this file -- confirm against the original source.
 *
 * Vertical 8-tap filtering over a region that is a multiple of 16 bytes wide,
 * with the result averaged against the existing dst bytes (AVER_UB4_UB) and
 * stored with ST_UB4.
 *
 * src_stride / dst_stride: row pitch of src / dst.
 * dst:    read (four full vectors per pass) and then overwritten.
 * filter: coefficients, loaded as one vector with LD_SH.
 * height: row count; the inner loop runs (height >> 2) times, so height % 4
 *         rows are not produced.
 * width:  byte width; the outer loop runs (width >> 4) times, so width % 16
 *         columns are not produced.
 */
1423  int32_t src_stride,
1424  uint8_t *dst,
1425  int32_t dst_stride,
1426  const int8_t *filter,
1427  int32_t height,
1428  int32_t width)
1429 {
1430  const uint8_t *src_tmp;
1431  uint8_t *dst_tmp;
1432  uint32_t loop_cnt, cnt;
1433  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1434  v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
1435  v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
1436  v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
1437  v16i8 filt0, filt1, filt2, filt3;
1438  v16u8 dst0, dst1, dst2, dst3, tmp0, tmp1, tmp2, tmp3;
1439  v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l, filt;
1440 
 /* Start three rows above the first output row. */
1441  src -= (3 * src_stride);
1442 
1443  filt = LD_SH(filter);
1444  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1445 
 /* Outer loop: one 16-byte column strip at a time. */
1446  for (cnt = (width >> 4); cnt--;) {
1447  src_tmp = src;
1448  dst_tmp = dst;
1449 
1450  LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
1451  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1452  src_tmp += (7 * src_stride);
1453 
 /* Row pairs are interleaved twice: the _r set via ILVR and the _l set
  * via ILVL (presumably the right and left vector halves). */
1454  ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r,
1455  src32_r, src54_r, src21_r);
1456  ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
1457  ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l,
1458  src32_l, src54_l, src21_l);
1459  ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
1460 
 /* Inner loop: four output rows per pass within the current strip. */
1461  for (loop_cnt = (height >> 2); loop_cnt--;) {
1462  LD_SB4(src_tmp, src_stride, src7, src8, src9, src10);
1463  src_tmp += (4 * src_stride);
1464 
1465  LD_UB4(dst_tmp, dst_stride, dst0, dst1, dst2, dst3);
1466  XORI_B4_128_SB(src7, src8, src9, src10);
1467  ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
1468  src87_r, src98_r, src109_r);
1469  ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l,
1470  src87_l, src98_l, src109_l);
1471  out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r,
1472  filt0, filt1, filt2, filt3);
1473  out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r,
1474  filt0, filt1, filt2, filt3);
1475  out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r,
1476  filt0, filt1, filt2, filt3);
1477  out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r,
1478  filt0, filt1, filt2, filt3);
1479  out0_l = FILT_8TAP_DPADD_S_H(src10_l, src32_l, src54_l, src76_l,
1480  filt0, filt1, filt2, filt3);
1481  out1_l = FILT_8TAP_DPADD_S_H(src21_l, src43_l, src65_l, src87_l,
1482  filt0, filt1, filt2, filt3);
1483  out2_l = FILT_8TAP_DPADD_S_H(src32_l, src54_l, src76_l, src98_l,
1484  filt0, filt1, filt2, filt3);
1485  out3_l = FILT_8TAP_DPADD_S_H(src43_l, src65_l, src87_l, src109_l,
1486  filt0, filt1, filt2, filt3);
1487  SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 7);
1488  SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 7);
1489  SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
1490  SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
1491  PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
1492  out3_r, tmp0, tmp1, tmp2, tmp3);
1493  XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
 /* The averaged rows are written back into dst0..dst3 before the
  * store. */
1494  AVER_UB4_UB(tmp0, dst0, tmp1, dst1, tmp2, dst2, tmp3, dst3,
1495  dst0, dst1, dst2, dst3);
1496  ST_UB4(dst0, dst1, dst2, dst3, dst_tmp, dst_stride);
1497  dst_tmp += (4 * dst_stride);
1498 
 /* Slide both interleaved row histories down by four rows. */
1499  src10_r = src54_r;
1500  src32_r = src76_r;
1501  src54_r = src98_r;
1502  src21_r = src65_r;
1503  src43_r = src87_r;
1504  src65_r = src109_r;
1505  src10_l = src54_l;
1506  src32_l = src76_l;
1507  src54_l = src98_l;
1508  src21_l = src65_l;
1509  src43_l = src87_l;
1510  src65_l = src109_l;
1511  src6 = src10;
1512  }
1513 
 /* Next 16-byte strip. */
1514  src += 16;
1515  dst += 16;
1516  }
1517 }
1518 
/*
 * NOTE(review): the opening line of this definition (return type, name and
 * the `src` parameter used below) is not present in this listing.
 *
 * Thin wrapper: forwards every argument to
 * common_vt_8t_and_aver_dst_16w_mult_msa() with a fixed width of 16.
 */
1520  int32_t src_stride,
1521  uint8_t *dst, int32_t dst_stride,
1522  const int8_t *filter,
1523  int32_t height)
1524 {
1525  common_vt_8t_and_aver_dst_16w_mult_msa(src, src_stride, dst, dst_stride,
1526  filter, height, 16);
1527 }
1528 
/*
 * NOTE(review): the opening line of this definition (return type, name and
 * the `src` parameter used below) is not present in this listing.
 *
 * Thin wrapper: forwards every argument to
 * common_vt_8t_and_aver_dst_16w_mult_msa() with a fixed width of 32.
 */
1530  int32_t src_stride,
1531  uint8_t *dst, int32_t dst_stride,
1532  const int8_t *filter,
1533  int32_t height)
1534 {
1535  common_vt_8t_and_aver_dst_16w_mult_msa(src, src_stride, dst, dst_stride,
1536  filter, height, 32);
1537 }
1538 
/*
 * NOTE(review): the opening line of this definition (return type, name and
 * the `src` parameter used below) is not present in this listing.
 *
 * Thin wrapper: forwards every argument to
 * common_vt_8t_and_aver_dst_16w_mult_msa() with a fixed width of 64.
 */
1540  int32_t src_stride,
1541  uint8_t *dst, int32_t dst_stride,
1542  const int8_t *filter,
1543  int32_t height)
1544 {
1545  common_vt_8t_and_aver_dst_16w_mult_msa(src, src_stride, dst, dst_stride,
1546  filter, height, 64);
1547 }
1548 
/*
 * NOTE(review): the opening line of this definition (return type, name and
 * the `src` parameter used below) is not present in this listing.
 *
 * Horizontal followed by vertical 8-tap filtering, four output rows per loop
 * pass, with the result averaged against the existing dst bytes
 * (__msa_aver_u_b) and stored by ST4x4_UB.
 *
 * src_stride / dst_stride: row pitch of src / dst.
 * dst:          read (one 32-bit word from each of four rows) and overwritten.
 * filter_horiz: horizontal coefficients, loaded as one vector with LD_SH.
 * filter_vert:  vertical coefficients, loaded the same way.
 * height:       row count; the loop runs (height >> 2) times, so height % 4
 *               rows are not produced.
 */
1550  int32_t src_stride,
1551  uint8_t *dst,
1552  int32_t dst_stride,
1553  const int8_t *filter_horiz,
1554  const int8_t *filter_vert,
1555  int32_t height)
1556 {
1557  uint32_t loop_cnt;
1558  uint32_t tp0, tp1, tp2, tp3;
1559  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1560  v16u8 dst0, res, mask0, mask1, mask2, mask3;
1561  v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3;
1562  v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
1563  v8i16 hz_out7, hz_out8, hz_out9, res0, res1, vec0, vec1, vec2, vec3, vec4;
1564  v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3;
1565 
 /* Entries 16..31 of the mask table: the first "4 width cases" shuffle. */
1566  mask0 = LD_UB(&mc_filt_mask_arr[16]);
 /* Step back three columns and three rows before loading. */
1567  src -= (3 + 3 * src_stride);
1568 
1569  /* rearranging filter */
1570  filt = LD_SH(filter_horiz);
1571  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt_hz0, filt_hz1, filt_hz2, filt_hz3);
1572 
1573  mask1 = mask0 + 2;
1574  mask2 = mask0 + 4;
1575  mask3 = mask0 + 6;
1576 
1577  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
1578  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1579  src += (7 * src_stride);
1580 
 /* Each HORIZ_8TAP_FILT call takes two source rows. hz_out1 and hz_out3
  * are not filtered directly; SLDI_B2_SH derives them from neighbouring
  * results with a byte slide of 8. */
1581  hz_out0 = HORIZ_8TAP_FILT(src0, src1, mask0, mask1, mask2, mask3, filt_hz0,
1582  filt_hz1, filt_hz2, filt_hz3);
1583  hz_out2 = HORIZ_8TAP_FILT(src2, src3, mask0, mask1, mask2, mask3, filt_hz0,
1584  filt_hz1, filt_hz2, filt_hz3);
1585  hz_out4 = HORIZ_8TAP_FILT(src4, src5, mask0, mask1, mask2, mask3, filt_hz0,
1586  filt_hz1, filt_hz2, filt_hz3);
1587  hz_out5 = HORIZ_8TAP_FILT(src5, src6, mask0, mask1, mask2, mask3, filt_hz0,
1588  filt_hz1, filt_hz2, filt_hz3);
1589  SLDI_B2_SH(hz_out2, hz_out4, hz_out0, hz_out2, hz_out1, hz_out3, 8);
1590 
1591  filt = LD_SH(filter_vert);
1592  SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3);
1593 
1594  ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
1595  vec2 = (v8i16) __msa_ilvev_b((v16i8) hz_out5, (v16i8) hz_out4);
1596 
1597  for (loop_cnt = (height >> 2); loop_cnt--;) {
1598  LD_SB4(src, src_stride, src7, src8, src9, src10);
1599  XORI_B4_128_SB(src7, src8, src9, src10);
1600  src += (4 * src_stride);
1601 
1602  LW4(dst, dst_stride, tp0, tp1, tp2, tp3);
1603  INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
1604  hz_out7 = HORIZ_8TAP_FILT(src7, src8, mask0, mask1, mask2, mask3,
1605  filt_hz0, filt_hz1, filt_hz2, filt_hz3);
1606  hz_out6 = (v8i16) __msa_sldi_b((v16i8) hz_out7, (v16i8) hz_out5, 8);
1607  vec3 = (v8i16) __msa_ilvev_b((v16i8) hz_out7, (v16i8) hz_out6);
1608  res0 = FILT_8TAP_DPADD_S_H(vec0, vec1, vec2, vec3, filt_vt0, filt_vt1,
1609  filt_vt2, filt_vt3);
1610 
1611  hz_out9 = HORIZ_8TAP_FILT(src9, src10, mask0, mask1, mask2, mask3,
1612  filt_hz0, filt_hz1, filt_hz2, filt_hz3);
1613  hz_out8 = (v8i16) __msa_sldi_b((v16i8) hz_out9, (v16i8) hz_out7, 8);
1614  vec4 = (v8i16) __msa_ilvev_b((v16i8) hz_out9, (v16i8) hz_out8);
1615  res1 = FILT_8TAP_DPADD_S_H(vec1, vec2, vec3, vec4, filt_vt0, filt_vt1,
1616  filt_vt2, filt_vt3);
1617 
1618  SRARI_H2_SH(res0, res1, 7);
1619  SAT_SH2_SH(res0, res1, 7);
1620  res = PCKEV_XORI128_UB(res0, res1);
1621  res = (v16u8) __msa_aver_u_b(res, dst0);
1622  ST4x4_UB(res, res, 0, 1, 2, 3, dst, dst_stride);
1623  dst += (4 * dst_stride);
1624 
 /* Carry the newest horizontal result and row pairs into the next
  * pass. */
1625  hz_out5 = hz_out9;
1626  vec0 = vec2;
1627  vec1 = vec3;
1628  vec2 = vec4;
1629  }
1630 }
1631 
/*
 * NOTE(review): the opening line of this definition (return type, name and
 * the `src` parameter used below) is not present in this listing. The
 * parameter list matches the calls to common_hv_8ht_8vt_and_aver_dst_8w_msa()
 * made further down in this file -- confirm against the original source.
 *
 * Horizontal followed by vertical 8-tap filtering, four output rows per loop
 * pass, with the result averaged against the existing dst rows and stored by
 * CONVERT_UB_AVG_ST8x4_UB.
 *
 * src_stride / dst_stride: row pitch of src / dst.
 * dst:          read (one 64-bit word per row) and then overwritten.
 * filter_horiz: horizontal coefficients, loaded as one vector with LD_SH.
 * filter_vert:  vertical coefficients, loaded the same way.
 * height:       row count; the loop runs (height >> 2) times, so height % 4
 *               rows are not produced.
 */
1633  int32_t src_stride,
1634  uint8_t *dst,
1635  int32_t dst_stride,
1636  const int8_t *filter_horiz,
1637  const int8_t *filter_vert,
1638  int32_t height)
1639 {
1640  uint32_t loop_cnt;
1641  uint64_t tp0, tp1, tp2, tp3;
1642  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1643  v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3;
1644  v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3;
1645  v16u8 dst0, dst1, mask0, mask1, mask2, mask3;
1646  v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
1647  v8i16 hz_out7, hz_out8, hz_out9, hz_out10, tmp0, tmp1, tmp2, tmp3;
1648  v8i16 out0, out1, out2, out3, out4, out5, out6, out7, out8, out9;
1649 
 /* First 16 entries of the mask table: the "8 width cases" shuffle. */
1650  mask0 = LD_UB(&mc_filt_mask_arr[0]);
 /* Step back three columns and three rows before loading. */
1651  src -= (3 + 3 * src_stride);
1652 
1653  /* rearranging filter */
1654  filt = LD_SH(filter_horiz);
1655  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt_hz0, filt_hz1, filt_hz2, filt_hz3);
1656 
1657  mask1 = mask0 + 2;
1658  mask2 = mask0 + 4;
1659  mask3 = mask0 + 6;
1660 
 /* Seven rows are loaded and filtered horizontally before the loop. */
1661  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
1662  src += (7 * src_stride);
1663 
1664  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1665  hz_out0 = HORIZ_8TAP_FILT(src0, src0, mask0, mask1, mask2, mask3, filt_hz0,
1666  filt_hz1, filt_hz2, filt_hz3);
1667  hz_out1 = HORIZ_8TAP_FILT(src1, src1, mask0, mask1, mask2, mask3, filt_hz0,
1668  filt_hz1, filt_hz2, filt_hz3);
1669  hz_out2 = HORIZ_8TAP_FILT(src2, src2, mask0, mask1, mask2, mask3, filt_hz0,
1670  filt_hz1, filt_hz2, filt_hz3);
1671  hz_out3 = HORIZ_8TAP_FILT(src3, src3, mask0, mask1, mask2, mask3, filt_hz0,
1672  filt_hz1, filt_hz2, filt_hz3);
1673  hz_out4 = HORIZ_8TAP_FILT(src4, src4, mask0, mask1, mask2, mask3, filt_hz0,
1674  filt_hz1, filt_hz2, filt_hz3);
1675  hz_out5 = HORIZ_8TAP_FILT(src5, src5, mask0, mask1, mask2, mask3, filt_hz0,
1676  filt_hz1, filt_hz2, filt_hz3);
1677  hz_out6 = HORIZ_8TAP_FILT(src6, src6, mask0, mask1, mask2, mask3, filt_hz0,
1678  filt_hz1, filt_hz2, filt_hz3);
1679 
1680  filt = LD_SH(filter_vert);
1681  SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3);
1682 
 /*
  * Combine neighbouring horizontal results. out0-out2 feed tmp0 and tmp2,
  * out4-out6 feed tmp1 and tmp3 in the loop below.
  */
1683  ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1);
1684  ILVEV_B2_SH(hz_out4, hz_out5, hz_out1, hz_out2, out2, out4);
1685  ILVEV_B2_SH(hz_out3, hz_out4, hz_out5, hz_out6, out5, out6);
1686 
1687  for (loop_cnt = (height >> 2); loop_cnt--;) {
1688  LD_SB4(src, src_stride, src7, src8, src9, src10);
1689  XORI_B4_128_SB(src7, src8, src9, src10);
1690  src += (4 * src_stride);
1691 
 /* Two dst rows per vector: rows 0-1 in dst0, rows 2-3 in dst1. */
1692  LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
1693  INSERT_D2_UB(tp0, tp1, dst0);
1694  INSERT_D2_UB(tp2, tp3, dst1);
1695 
1696  hz_out7 = HORIZ_8TAP_FILT(src7, src7, mask0, mask1, mask2, mask3,
1697  filt_hz0, filt_hz1, filt_hz2, filt_hz3);
1698  out3 = (v8i16) __msa_ilvev_b((v16i8) hz_out7, (v16i8) hz_out6);
1699  tmp0 = FILT_8TAP_DPADD_S_H(out0, out1, out2, out3, filt_vt0, filt_vt1,
1700  filt_vt2, filt_vt3);
1701 
1702  hz_out8 = HORIZ_8TAP_FILT(src8, src8, mask0, mask1, mask2, mask3,
1703  filt_hz0, filt_hz1, filt_hz2, filt_hz3);
1704  out7 = (v8i16) __msa_ilvev_b((v16i8) hz_out8, (v16i8) hz_out7);
1705  tmp1 = FILT_8TAP_DPADD_S_H(out4, out5, out6, out7, filt_vt0, filt_vt1,
1706  filt_vt2, filt_vt3);
1707 
1708  hz_out9 = HORIZ_8TAP_FILT(src9, src9, mask0, mask1, mask2, mask3,
1709  filt_hz0, filt_hz1, filt_hz2, filt_hz3);
1710  out8 = (v8i16) __msa_ilvev_b((v16i8) hz_out9, (v16i8) hz_out8);
1711  tmp2 = FILT_8TAP_DPADD_S_H(out1, out2, out3, out8, filt_vt0, filt_vt1,
1712  filt_vt2, filt_vt3);
1713 
1714  hz_out10 = HORIZ_8TAP_FILT(src10, src10, mask0, mask1, mask2, mask3,
1715  filt_hz0, filt_hz1, filt_hz2, filt_hz3);
1716  out9 = (v8i16) __msa_ilvev_b((v16i8) hz_out10, (v16i8) hz_out9);
1717  tmp3 = FILT_8TAP_DPADD_S_H(out5, out6, out7, out9, filt_vt0, filt_vt1,
1718  filt_vt2, filt_vt3);
1719 
1720  SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7);
1721  SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
1722  CONVERT_UB_AVG_ST8x4_UB(tmp0, tmp1, tmp2, tmp3, dst0, dst1,
1723  dst, dst_stride);
1724  dst += (4 * dst_stride);
1725 
 /* Carry the newest horizontal result and row pairs into the next
  * pass. */
1726  hz_out6 = hz_out10;
1727  out0 = out2;
1728  out1 = out3;
1729  out2 = out8;
1730  out4 = out6;
1731  out5 = out7;
1732  out6 = out9;
1733  }
1734 }
1735 
/*
 * NOTE(review): the opening line of this definition (return type, name and
 * the `src` parameter used below) is not present in this listing.
 *
 * Calls common_hv_8ht_8vt_and_aver_dst_8w_msa() twice, advancing src and dst
 * by 8 bytes after each call; all other arguments are forwarded unchanged.
 */
1737  int32_t src_stride,
1738  uint8_t *dst,
1739  int32_t dst_stride,
1740  const int8_t *filter_horiz,
1741  const int8_t *filter_vert,
1742  int32_t height)
1743 {
1744  int32_t multiple8_cnt;
1745 
1746  for (multiple8_cnt = 2; multiple8_cnt--;) {
1747  common_hv_8ht_8vt_and_aver_dst_8w_msa(src, src_stride, dst, dst_stride,
1748  filter_horiz, filter_vert,
1749  height);
1750 
1751  src += 8;
1752  dst += 8;
1753  }
1754 }
1755 
/*
 * NOTE(review): the opening line of this definition (return type, name and
 * the `src` parameter used below) is not present in this listing.
 *
 * Calls common_hv_8ht_8vt_and_aver_dst_8w_msa() four times, advancing src and
 * dst by 8 bytes after each call; all other arguments are forwarded unchanged.
 */
1757  int32_t src_stride,
1758  uint8_t *dst,
1759  int32_t dst_stride,
1760  const int8_t *filter_horiz,
1761  const int8_t *filter_vert,
1762  int32_t height)
1763 {
1764  int32_t multiple8_cnt;
1765 
1766  for (multiple8_cnt = 4; multiple8_cnt--;) {
1767  common_hv_8ht_8vt_and_aver_dst_8w_msa(src, src_stride, dst, dst_stride,
1768  filter_horiz, filter_vert,
1769  height);
1770 
1771  src += 8;
1772  dst += 8;
1773  }
1774 }
1775 
1777  int32_t src_stride,
1778  uint8_t *dst,
1779  int32_t dst_stride,
1780  const int8_t *filter_horiz,
1781  const int8_t *filter_vert,
1782  int32_t height)
1783 {
1784  int32_t multiple8_cnt;
1785 
1786  for (multiple8_cnt = 8; multiple8_cnt--;) {
1787  common_hv_8ht_8vt_and_aver_dst_8w_msa(src, src_stride, dst, dst_stride,
1788  filter_horiz, filter_vert,
1789  height);
1790 
1791  src += 8;
1792  dst += 8;
1793  }
1794 }
1795 
1796 static void common_hz_2t_4x4_msa(const uint8_t *src, int32_t src_stride,
1797  uint8_t *dst, int32_t dst_stride,
1798  const int8_t *filter)
1799 {
1800  v16i8 src0, src1, src2, src3, mask;
1801  v16u8 filt0, vec0, vec1, res0, res1;
1802  v8u16 vec2, vec3, filt;
1803 
1804  mask = LD_SB(&mc_filt_mask_arr[16]);
1805 
1806  /* rearranging filter */
1807  filt = LD_UH(filter);
1808  filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
1809 
1810  LD_SB4(src, src_stride, src0, src1, src2, src3);
1811  VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
1812  DOTP_UB2_UH(vec0, vec1, filt0, filt0, vec2, vec3);
1813  SRARI_H2_UH(vec2, vec3, 7);
1814  PCKEV_B2_UB(vec2, vec2, vec3, vec3, res0, res1);
1815  ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
1816 }
1817 
1818 static void common_hz_2t_4x8_msa(const uint8_t *src, int32_t src_stride,
1819  uint8_t *dst, int32_t dst_stride,
1820  const int8_t *filter)
1821 {
1822  v16u8 vec0, vec1, vec2, vec3, filt0;
1823  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
1824  v16i8 res0, res1, res2, res3;
1825  v8u16 vec4, vec5, vec6, vec7, filt;
1826 
1827  mask = LD_SB(&mc_filt_mask_arr[16]);
1828 
1829  /* rearranging filter */
1830  filt = LD_UH(filter);
1831  filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
1832 
1833  LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
1834  VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
1835  VSHF_B2_UB(src4, src5, src6, src7, mask, mask, vec2, vec3);
1836  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
1837  vec4, vec5, vec6, vec7);
1838  SRARI_H4_UH(vec4, vec5, vec6, vec7, 7);
1839  PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7,
1840  res0, res1, res2, res3);
1841  ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
1842  dst += (4 * dst_stride);
1843  ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride);
1844 }
1845 
1846 void ff_put_bilin_4h_msa(uint8_t *dst, ptrdiff_t dst_stride,
1847  const uint8_t *src, ptrdiff_t src_stride,
1848  int height, int mx, int my)
1849 {
1850  const int8_t *filter = vp9_bilinear_filters_msa[mx - 1];
1851 
1852  if (4 == height) {
1853  common_hz_2t_4x4_msa(src, src_stride, dst, dst_stride, filter);
1854  } else if (8 == height) {
1855  common_hz_2t_4x8_msa(src, src_stride, dst, dst_stride, filter);
1856  }
1857 }
1858 
1859 static void common_hz_2t_8x4_msa(const uint8_t *src, int32_t src_stride,
1860  uint8_t *dst, int32_t dst_stride,
1861  const int8_t *filter)
1862 {
1863  v16u8 filt0;
1864  v16i8 src0, src1, src2, src3, mask;
1865  v8u16 vec0, vec1, vec2, vec3, filt;
1866 
1867  mask = LD_SB(&mc_filt_mask_arr[0]);
1868 
1869  /* rearranging filter */
1870  filt = LD_UH(filter);
1871  filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
1872 
1873  LD_SB4(src, src_stride, src0, src1, src2, src3);
1874  VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
1875  VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
1876  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
1877  vec0, vec1, vec2, vec3);
1878  SRARI_H4_UH(vec0, vec1, vec2, vec3, 7);
1879  PCKEV_B2_SB(vec1, vec0, vec3, vec2, src0, src1);
1880  ST8x4_UB(src0, src1, dst, dst_stride);
1881 }
1882 
1883 static void common_hz_2t_8x8mult_msa(const uint8_t *src, int32_t src_stride,
1884  uint8_t *dst, int32_t dst_stride,
1885  const int8_t *filter, int32_t height)
1886 {
1887  v16u8 filt0;
1888  v16i8 src0, src1, src2, src3, mask, out0, out1;
1889  v8u16 vec0, vec1, vec2, vec3, filt;
1890 
1891  mask = LD_SB(&mc_filt_mask_arr[0]);
1892 
1893  /* rearranging filter */
1894  filt = LD_UH(filter);
1895  filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
1896 
1897  LD_SB4(src, src_stride, src0, src1, src2, src3);
1898  src += (4 * src_stride);
1899 
1900  VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
1901  VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
1902  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
1903  vec0, vec1, vec2, vec3);
1904  SRARI_H4_UH(vec0, vec1, vec2, vec3, 7);
1905  LD_SB4(src, src_stride, src0, src1, src2, src3);
1906  src += (4 * src_stride);
1907 
1908  PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
1909  ST8x4_UB(out0, out1, dst, dst_stride);
1910  dst += (4 * dst_stride);
1911 
1912  VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
1913  VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
1914  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
1915  vec0, vec1, vec2, vec3);
1916  SRARI_H4_UH(vec0, vec1, vec2, vec3, 7);
1917  PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
1918  ST8x4_UB(out0, out1, dst, dst_stride);
1919  dst += (4 * dst_stride);
1920 
1921  if (16 == height) {
1922  LD_SB4(src, src_stride, src0, src1, src2, src3);
1923  src += (4 * src_stride);
1924 
1925  VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
1926  VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
1927  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
1928  vec0, vec1, vec2, vec3);
1929  SRARI_H4_UH(vec0, vec1, vec2, vec3, 7);
1930  LD_SB4(src, src_stride, src0, src1, src2, src3);
1931  src += (4 * src_stride);
1932 
1933  PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
1934  ST8x4_UB(out0, out1, dst, dst_stride);
1935 
1936  VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
1937  VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
1938  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
1939  vec0, vec1, vec2, vec3);
1940  SRARI_H4_UH(vec0, vec1, vec2, vec3, 7);
1941  PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
1942  ST8x4_UB(out0, out1, dst + 4 * dst_stride, dst_stride);
1943  }
1944 }
1945 
1946 void ff_put_bilin_8h_msa(uint8_t *dst, ptrdiff_t dst_stride,
1947  const uint8_t *src, ptrdiff_t src_stride,
1948  int height, int mx, int my)
1949 {
1950  const int8_t *filter = vp9_bilinear_filters_msa[mx - 1];
1951 
1952  if (4 == height) {
1953  common_hz_2t_8x4_msa(src, src_stride, dst, dst_stride, filter);
1954  } else {
1955  common_hz_2t_8x8mult_msa(src, src_stride, dst, dst_stride, filter,
1956  height);
1957  }
1958 }
1959 
1960 void ff_put_bilin_16h_msa(uint8_t *dst, ptrdiff_t dst_stride,
1961  const uint8_t *src, ptrdiff_t src_stride,
1962  int height, int mx, int my)
1963 {
1964  uint32_t loop_cnt;
1965  const int8_t *filter = vp9_bilinear_filters_msa[mx - 1];
1966  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
1967  v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1968  v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt;
1969 
1970  mask = LD_SB(&mc_filt_mask_arr[0]);
1971 
1972  loop_cnt = (height >> 2) - 1;
1973 
1974  /* rearranging filter */
1975  filt = LD_UH(filter);
1976  filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
1977 
1978  LD_SB4(src, src_stride, src0, src2, src4, src6);
1979  LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
1980  src += (4 * src_stride);
1981 
1982  VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
1983  VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
1984  VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
1985  VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
1986  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
1987  out0, out1, out2, out3);
1988  DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
1989  out4, out5, out6, out7);
1990  SRARI_H4_UH(out0, out1, out2, out3, 7);
1991  SRARI_H4_UH(out4, out5, out6, out7, 7);
1992  PCKEV_ST_SB(out0, out1, dst);
1993  dst += dst_stride;
1994  PCKEV_ST_SB(out2, out3, dst);
1995  dst += dst_stride;
1996  PCKEV_ST_SB(out4, out5, dst);
1997  dst += dst_stride;
1998  PCKEV_ST_SB(out6, out7, dst);
1999  dst += dst_stride;
2000 
2001  for (; loop_cnt--;) {
2002  LD_SB4(src, src_stride, src0, src2, src4, src6);
2003  LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
2004  src += (4 * src_stride);
2005 
2006  VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
2007  VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
2008  VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
2009  VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
2010  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
2011  out0, out1, out2, out3);
2012  DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
2013  out4, out5, out6, out7);
2014  SRARI_H4_UH(out0, out1, out2, out3, 7);
2015  SRARI_H4_UH(out4, out5, out6, out7, 7);
2016  PCKEV_ST_SB(out0, out1, dst);
2017  dst += dst_stride;
2018  PCKEV_ST_SB(out2, out3, dst);
2019  dst += dst_stride;
2020  PCKEV_ST_SB(out4, out5, dst);
2021  dst += dst_stride;
2022  PCKEV_ST_SB(out6, out7, dst);
2023  dst += dst_stride;
2024  }
2025 }
2026 
2027 void ff_put_bilin_32h_msa(uint8_t *dst, ptrdiff_t dst_stride,
2028  const uint8_t *src, ptrdiff_t src_stride,
2029  int height, int mx, int my)
2030 {
2031  uint32_t loop_cnt;
2032  const int8_t *filter = vp9_bilinear_filters_msa[mx - 1];
2033  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
2034  v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
2035  v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt;
2036 
2037  mask = LD_SB(&mc_filt_mask_arr[0]);
2038 
2039  /* rearranging filter */
2040  filt = LD_UH(filter);
2041  filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
2042 
2043  for (loop_cnt = height >> 1; loop_cnt--;) {
2044  src0 = LD_SB(src);
2045  src2 = LD_SB(src + 16);
2046  src3 = LD_SB(src + 24);
2047  src1 = __msa_sldi_b(src2, src0, 8);
2048  src += src_stride;
2049  src4 = LD_SB(src);
2050  src6 = LD_SB(src + 16);
2051  src7 = LD_SB(src + 24);
2052  src5 = __msa_sldi_b(src6, src4, 8);
2053  src += src_stride;
2054 
2055  VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
2056  VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
2057  VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
2058  VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
2059  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
2060  out0, out1, out2, out3);
2061  DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
2062  out4, out5, out6, out7);
2063  SRARI_H4_UH(out0, out1, out2, out3, 7);
2064  SRARI_H4_UH(out4, out5, out6, out7, 7);
2065  PCKEV_ST_SB(out0, out1, dst);
2066  PCKEV_ST_SB(out2, out3, dst + 16);
2067  dst += dst_stride;
2068  PCKEV_ST_SB(out4, out5, dst);
2069  PCKEV_ST_SB(out6, out7, dst + 16);
2070  dst += dst_stride;
2071  }
2072 }
2073 
2074 void ff_put_bilin_64h_msa(uint8_t *dst, ptrdiff_t dst_stride,
2075  const uint8_t *src, ptrdiff_t src_stride,
2076  int height, int mx, int my)
2077 {
2078  uint32_t loop_cnt;
2079  const int8_t *filter = vp9_bilinear_filters_msa[mx - 1];
2080  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
2081  v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
2082  v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt;
2083 
2084  mask = LD_SB(&mc_filt_mask_arr[0]);
2085 
2086  /* rearranging filter */
2087  filt = LD_UH(filter);
2088  filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
2089 
2090  for (loop_cnt = height; loop_cnt--;) {
2091  src0 = LD_SB(src);
2092  src2 = LD_SB(src + 16);
2093  src4 = LD_SB(src + 32);
2094  src6 = LD_SB(src + 48);
2095  src7 = LD_SB(src + 56);
2096  SLDI_B3_SB(src2, src4, src6, src0, src2, src4, src1, src3, src5, 8);
2097  src += src_stride;
2098 
2099  VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
2100  VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
2101  VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
2102  VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
2103  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
2104  out0, out1, out2, out3);
2105  DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
2106  out4, out5, out6, out7);
2107  SRARI_H4_UH(out0, out1, out2, out3, 7);
2108  SRARI_H4_UH(out4, out5, out6, out7, 7);
2109  PCKEV_ST_SB(out0, out1, dst);
2110  PCKEV_ST_SB(out2, out3, dst + 16);
2111  PCKEV_ST_SB(out4, out5, dst + 32);
2112  PCKEV_ST_SB(out6, out7, dst + 48);
2113  dst += dst_stride;
2114  }
2115 }
2116 
2117 static void common_vt_2t_4x4_msa(const uint8_t *src, int32_t src_stride,
2118  uint8_t *dst, int32_t dst_stride,
2119  const int8_t *filter)
2120 {
2121  v16i8 src0, src1, src2, src3, src4;
2122  v16i8 src10_r, src32_r, src21_r, src43_r, src2110, src4332;
2123  v16u8 filt0;
2124  v8i16 filt;
2125  v8u16 tmp0, tmp1;
2126 
2127  filt = LD_SH(filter);
2128  filt0 = (v16u8) __msa_splati_h(filt, 0);
2129 
2130  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
2131  src += (5 * src_stride);
2132 
2133  ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
2134  src10_r, src21_r, src32_r, src43_r);
2135  ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
2136  DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1);
2137  SRARI_H2_UH(tmp0, tmp1, 7);
2138  SAT_UH2_UH(tmp0, tmp1, 7);
2139  src2110 = __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
2140  ST4x4_UB(src2110, src2110, 0, 1, 2, 3, dst, dst_stride);
2141 }
2142 
2143 static void common_vt_2t_4x8_msa(const uint8_t *src, int32_t src_stride,
2144  uint8_t *dst, int32_t dst_stride,
2145  const int8_t *filter)
2146 {
2147  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
2148  v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r;
2149  v16i8 src65_r, src87_r, src2110, src4332, src6554, src8776;
2150  v8u16 tmp0, tmp1, tmp2, tmp3;
2151  v16u8 filt0;
2152  v8i16 filt;
2153 
2154  filt = LD_SH(filter);
2155  filt0 = (v16u8) __msa_splati_h(filt, 0);
2156 
2157  LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
2158  src += (8 * src_stride);
2159 
2160  src8 = LD_SB(src);
2161  src += src_stride;
2162 
2163  ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
2164  src32_r, src43_r);
2165  ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
2166  src76_r, src87_r);
2167  ILVR_D4_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r,
2168  src87_r, src76_r, src2110, src4332, src6554, src8776);
2169  DOTP_UB4_UH(src2110, src4332, src6554, src8776, filt0, filt0, filt0, filt0,
2170  tmp0, tmp1, tmp2, tmp3);
2171  SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
2172  SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
2173  PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, src2110, src4332);
2174  ST4x4_UB(src2110, src2110, 0, 1, 2, 3, dst, dst_stride);
2175  ST4x4_UB(src4332, src4332, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride);
2176 }
2177 
2178 void ff_put_bilin_4v_msa(uint8_t *dst, ptrdiff_t dst_stride,
2179  const uint8_t *src, ptrdiff_t src_stride,
2180  int height, int mx, int my)
2181 {
2182  const int8_t *filter = vp9_bilinear_filters_msa[my - 1];
2183 
2184  if (4 == height) {
2185  common_vt_2t_4x4_msa(src, src_stride, dst, dst_stride, filter);
2186  } else if (8 == height) {
2187  common_vt_2t_4x8_msa(src, src_stride, dst, dst_stride, filter);
2188  }
2189 }
2190 
2191 static void common_vt_2t_8x4_msa(const uint8_t *src, int32_t src_stride,
2192  uint8_t *dst, int32_t dst_stride,
2193  const int8_t *filter)
2194 {
2195  v16u8 src0, src1, src2, src3, src4, vec0, vec1, vec2, vec3, filt0;
2196  v16i8 out0, out1;
2197  v8u16 tmp0, tmp1, tmp2, tmp3;
2198  v8i16 filt;
2199 
2200  /* rearranging filter_y */
2201  filt = LD_SH(filter);
2202  filt0 = (v16u8) __msa_splati_h(filt, 0);
2203 
2204  LD_UB5(src, src_stride, src0, src1, src2, src3, src4);
2205  ILVR_B2_UB(src1, src0, src2, src1, vec0, vec1);
2206  ILVR_B2_UB(src3, src2, src4, src3, vec2, vec3);
2207  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
2208  tmp0, tmp1, tmp2, tmp3);
2209  SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
2210  SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
2211  PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
2212  ST8x4_UB(out0, out1, dst, dst_stride);
2213 }
2214 
2215 static void common_vt_2t_8x8mult_msa(const uint8_t *src, int32_t src_stride,
2216  uint8_t *dst, int32_t dst_stride,
2217  const int8_t *filter, int32_t height)
2218 {
2219  uint32_t loop_cnt;
2220  v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
2221  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
2222  v16i8 out0, out1;
2223  v8u16 tmp0, tmp1, tmp2, tmp3;
2224  v8i16 filt;
2225 
2226  /* rearranging filter_y */
2227  filt = LD_SH(filter);
2228  filt0 = (v16u8) __msa_splati_h(filt, 0);
2229 
2230  src0 = LD_UB(src);
2231  src += src_stride;
2232 
2233  for (loop_cnt = (height >> 3); loop_cnt--;) {
2234  LD_UB8(src, src_stride, src1, src2, src3, src4, src5, src6, src7, src8);
2235  src += (8 * src_stride);
2236 
2237  ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3,
2238  vec0, vec1, vec2, vec3);
2239  ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7,
2240  vec4, vec5, vec6, vec7);
2241  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
2242  tmp0, tmp1, tmp2, tmp3);
2243  SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
2244  SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
2245  PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
2246  ST8x4_UB(out0, out1, dst, dst_stride);
2247  dst += (4 * dst_stride);
2248 
2249  DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
2250  tmp0, tmp1, tmp2, tmp3);
2251  SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
2252  SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
2253  PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
2254  ST8x4_UB(out0, out1, dst, dst_stride);
2255  dst += (4 * dst_stride);
2256 
2257  src0 = src8;
2258  }
2259 }
2260 
2261 void ff_put_bilin_8v_msa(uint8_t *dst, ptrdiff_t dst_stride,
2262  const uint8_t *src, ptrdiff_t src_stride,
2263  int height, int mx, int my)
2264 {
2265  const int8_t *filter = vp9_bilinear_filters_msa[my - 1];
2266 
2267  if (4 == height) {
2268  common_vt_2t_8x4_msa(src, src_stride, dst, dst_stride, filter);
2269  } else {
2270  common_vt_2t_8x8mult_msa(src, src_stride, dst, dst_stride, filter,
2271  height);
2272  }
2273 }
2274 
2275 void ff_put_bilin_16v_msa(uint8_t *dst, ptrdiff_t dst_stride,
2276  const uint8_t *src, ptrdiff_t src_stride,
2277  int height, int mx, int my)
2278 {
2279  uint32_t loop_cnt;
2280  const int8_t *filter = vp9_bilinear_filters_msa[my - 1];
2281  v16u8 src0, src1, src2, src3, src4;
2282  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
2283  v8u16 tmp0, tmp1, tmp2, tmp3;
2284  v8i16 filt;
2285 
2286  /* rearranging filter_y */
2287  filt = LD_SH(filter);
2288  filt0 = (v16u8) __msa_splati_h(filt, 0);
2289 
2290  src0 = LD_UB(src);
2291  src += src_stride;
2292 
2293  for (loop_cnt = (height >> 2); loop_cnt--;) {
2294  LD_UB4(src, src_stride, src1, src2, src3, src4);
2295  src += (4 * src_stride);
2296 
2297  ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
2298  ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
2299  DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
2300  SRARI_H2_UH(tmp0, tmp1, 7);
2301  SAT_UH2_UH(tmp0, tmp1, 7);
2302  PCKEV_ST_SB(tmp0, tmp1, dst);
2303  dst += dst_stride;
2304 
2305  ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6);
2306  ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
2307  DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
2308  SRARI_H2_UH(tmp2, tmp3, 7);
2309  SAT_UH2_UH(tmp2, tmp3, 7);
2310  PCKEV_ST_SB(tmp2, tmp3, dst);
2311  dst += dst_stride;
2312 
2313  DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
2314  SRARI_H2_UH(tmp0, tmp1, 7);
2315  SAT_UH2_UH(tmp0, tmp1, 7);
2316  PCKEV_ST_SB(tmp0, tmp1, dst);
2317  dst += dst_stride;
2318 
2319  DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
2320  SRARI_H2_UH(tmp2, tmp3, 7);
2321  SAT_UH2_UH(tmp2, tmp3, 7);
2322  PCKEV_ST_SB(tmp2, tmp3, dst);
2323  dst += dst_stride;
2324 
2325  src0 = src4;
2326  }
2327 }
2328 
2329 void ff_put_bilin_32v_msa(uint8_t *dst, ptrdiff_t dst_stride,
2330  const uint8_t *src, ptrdiff_t src_stride,
2331  int height, int mx, int my)
2332 {
2333  uint32_t loop_cnt;
2334  const int8_t *filter = vp9_bilinear_filters_msa[my - 1];
2335  v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9;
2336  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
2337  v8u16 tmp0, tmp1, tmp2, tmp3;
2338  v8i16 filt;
2339 
2340  /* rearranging filter_y */
2341  filt = LD_SH(filter);
2342  filt0 = (v16u8) __msa_splati_h(filt, 0);
2343 
2344  src0 = LD_UB(src);
2345  src5 = LD_UB(src + 16);
2346  src += src_stride;
2347 
2348  for (loop_cnt = (height >> 2); loop_cnt--;) {
2349  LD_UB4(src, src_stride, src1, src2, src3, src4);
2350  ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
2351  ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
2352 
2353  LD_UB4(src + 16, src_stride, src6, src7, src8, src9);
2354  src += (4 * src_stride);
2355 
2356  DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
2357  SRARI_H2_UH(tmp0, tmp1, 7);
2358  SAT_UH2_UH(tmp0, tmp1, 7);
2359  PCKEV_ST_SB(tmp0, tmp1, dst);
2360  DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
2361  SRARI_H2_UH(tmp2, tmp3, 7);
2362  SAT_UH2_UH(tmp2, tmp3, 7);
2363  PCKEV_ST_SB(tmp2, tmp3, dst + dst_stride);
2364 
2365  ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6);
2366  ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
2367  DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
2368  SRARI_H2_UH(tmp0, tmp1, 7);
2369  SAT_UH2_UH(tmp0, tmp1, 7);
2370  PCKEV_ST_SB(tmp0, tmp1, dst + 2 * dst_stride);
2371 
2372  DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
2373  SRARI_H2_UH(tmp2, tmp3, 7);
2374  SAT_UH2_UH(tmp2, tmp3, 7);
2375  PCKEV_ST_SB(tmp2, tmp3, dst + 3 * dst_stride);
2376 
2377  ILVR_B2_UB(src6, src5, src7, src6, vec0, vec2);
2378  ILVL_B2_UB(src6, src5, src7, src6, vec1, vec3);
2379  DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
2380  SRARI_H2_UH(tmp0, tmp1, 7);
2381  SAT_UH2_UH(tmp0, tmp1, 7);
2382  PCKEV_ST_SB(tmp0, tmp1, dst + 16);
2383 
2384  DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
2385  SRARI_H2_UH(tmp2, tmp3, 7);
2386  SAT_UH2_UH(tmp2, tmp3, 7);
2387  PCKEV_ST_SB(tmp2, tmp3, dst + 16 + dst_stride);
2388 
2389  ILVR_B2_UB(src8, src7, src9, src8, vec4, vec6);
2390  ILVL_B2_UB(src8, src7, src9, src8, vec5, vec7);
2391  DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
2392  SRARI_H2_UH(tmp0, tmp1, 7);
2393  SAT_UH2_UH(tmp0, tmp1, 7);
2394  PCKEV_ST_SB(tmp0, tmp1, dst + 16 + 2 * dst_stride);
2395 
2396  DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
2397  SRARI_H2_UH(tmp2, tmp3, 7);
2398  SAT_UH2_UH(tmp2, tmp3, 7);
2399  PCKEV_ST_SB(tmp2, tmp3, dst + 16 + 3 * dst_stride);
2400  dst += (4 * dst_stride);
2401 
2402  src0 = src4;
2403  src5 = src9;
2404  }
2405 }
2406 
2407 void ff_put_bilin_64v_msa(uint8_t *dst, ptrdiff_t dst_stride,
2408  const uint8_t *src, ptrdiff_t src_stride,
2409  int height, int mx, int my)
2410 {
2411  uint32_t loop_cnt;
2412  const int8_t *filter = vp9_bilinear_filters_msa[my - 1];
2413  v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
2414  v16u8 src11, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
2415  v8u16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
2416  v8i16 filt;
2417 
2418  /* rearranging filter_y */
2419  filt = LD_SH(filter);
2420  filt0 = (v16u8) __msa_splati_h(filt, 0);
2421 
2422  LD_UB4(src, 16, src0, src3, src6, src9);
2423  src += src_stride;
2424 
2425  for (loop_cnt = (height >> 1); loop_cnt--;) {
2426  LD_UB2(src, src_stride, src1, src2);
2427  LD_UB2(src + 16, src_stride, src4, src5);
2428  LD_UB2(src + 32, src_stride, src7, src8);
2429  LD_UB2(src + 48, src_stride, src10, src11);
2430  src += (2 * src_stride);
2431 
2432  ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
2433  ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
2434  DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
2435  SRARI_H2_UH(tmp0, tmp1, 7);
2436  SAT_UH2_UH(tmp0, tmp1, 7);
2437  PCKEV_ST_SB(tmp0, tmp1, dst);
2438 
2439  DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
2440  SRARI_H2_UH(tmp2, tmp3, 7);
2441  SAT_UH2_UH(tmp2, tmp3, 7);
2442  PCKEV_ST_SB(tmp2, tmp3, dst + dst_stride);
2443 
2444  ILVR_B2_UB(src4, src3, src5, src4, vec4, vec6);
2445  ILVL_B2_UB(src4, src3, src5, src4, vec5, vec7);
2446  DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp4, tmp5);
2447  SRARI_H2_UH(tmp4, tmp5, 7);
2448  SAT_UH2_UH(tmp4, tmp5, 7);
2449  PCKEV_ST_SB(tmp4, tmp5, dst + 16);
2450 
2451  DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp6, tmp7);
2452  SRARI_H2_UH(tmp6, tmp7, 7);
2453  SAT_UH2_UH(tmp6, tmp7, 7);
2454  PCKEV_ST_SB(tmp6, tmp7, dst + 16 + dst_stride);
2455 
2456  ILVR_B2_UB(src7, src6, src8, src7, vec0, vec2);
2457  ILVL_B2_UB(src7, src6, src8, src7, vec1, vec3);
2458  DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
2459  SRARI_H2_UH(tmp0, tmp1, 7);
2460  SAT_UH2_UH(tmp0, tmp1, 7);
2461  PCKEV_ST_SB(tmp0, tmp1, dst + 32);
2462 
2463  DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
2464  SRARI_H2_UH(tmp2, tmp3, 7);
2465  SAT_UH2_UH(tmp2, tmp3, 7);
2466  PCKEV_ST_SB(tmp2, tmp3, dst + 32 + dst_stride);
2467 
2468  ILVR_B2_UB(src10, src9, src11, src10, vec4, vec6);
2469  ILVL_B2_UB(src10, src9, src11, src10, vec5, vec7);
2470  DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp4, tmp5);
2471  SRARI_H2_UH(tmp4, tmp5, 7);
2472  SAT_UH2_UH(tmp4, tmp5, 7);
2473  PCKEV_ST_SB(tmp4, tmp5, dst + 48);
2474 
2475  DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp6, tmp7);
2476  SRARI_H2_UH(tmp6, tmp7, 7);
2477  SAT_UH2_UH(tmp6, tmp7, 7);
2478  PCKEV_ST_SB(tmp6, tmp7, dst + 48 + dst_stride);
2479  dst += (2 * dst_stride);
2480 
2481  src0 = src2;
2482  src3 = src5;
2483  src6 = src8;
2484  src9 = src11;
2485  }
2486 }
2487 
2488 static void common_hv_2ht_2vt_4x4_msa(const uint8_t *src, int32_t src_stride,
2489  uint8_t *dst, int32_t dst_stride,
2490  const int8_t *filter_horiz, const int8_t *filter_vert)
2491 {
2492  v16i8 src0, src1, src2, src3, src4, mask;
2493  v16u8 filt_vt, filt_hz, vec0, vec1, res0, res1;
2494  v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, filt, tmp0, tmp1;
2495 
2496  mask = LD_SB(&mc_filt_mask_arr[16]);
2497 
2498  /* rearranging filter */
2499  filt = LD_UH(filter_horiz);
2500  filt_hz = (v16u8) __msa_splati_h((v8i16) filt, 0);
2501 
2502  filt = LD_UH(filter_vert);
2503  filt_vt = (v16u8) __msa_splati_h((v8i16) filt, 0);
2504 
2505  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
2506  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, 7);
2507  hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, 7);
2508  hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, 7);
2509  hz_out1 = (v8u16) __msa_sldi_b((v16i8) hz_out2, (v16i8) hz_out0, 8);
2510  hz_out3 = (v8u16) __msa_pckod_d((v2i64) hz_out4, (v2i64) hz_out2);
2511 
2512  ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
2513  DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
2514  SRARI_H2_UH(tmp0, tmp1, 7);
2515  SAT_UH2_UH(tmp0, tmp1, 7);
2516  PCKEV_B2_UB(tmp0, tmp0, tmp1, tmp1, res0, res1);
2517  ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
2518 }
2519 
2520 static void common_hv_2ht_2vt_4x8_msa(const uint8_t *src, int32_t src_stride,
2521  uint8_t *dst, int32_t dst_stride,
2522  const int8_t *filter_horiz, const int8_t *filter_vert)
2523 {
2524  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask;
2525  v16i8 res0, res1, res2, res3;
2526  v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3;
2527  v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
2528  v8u16 hz_out7, hz_out8, vec4, vec5, vec6, vec7, filt;
2529 
2530  mask = LD_SB(&mc_filt_mask_arr[16]);
2531 
2532  /* rearranging filter */
2533  filt = LD_UH(filter_horiz);
2534  filt_hz = (v16u8) __msa_splati_h((v8i16) filt, 0);
2535 
2536  filt = LD_UH(filter_vert);
2537  filt_vt = (v16u8) __msa_splati_h((v8i16) filt, 0);
2538 
2539  LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
2540  src += (8 * src_stride);
2541  src8 = LD_SB(src);
2542 
2543  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, 7);
2544  hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, 7);
2545  hz_out4 = HORIZ_2TAP_FILT_UH(src4, src5, mask, filt_hz, 7);
2546  hz_out6 = HORIZ_2TAP_FILT_UH(src6, src7, mask, filt_hz, 7);
2547  hz_out8 = HORIZ_2TAP_FILT_UH(src8, src8, mask, filt_hz, 7);
2548  SLDI_B3_UH(hz_out2, hz_out4, hz_out6, hz_out0, hz_out2, hz_out4, hz_out1,
2549  hz_out3, hz_out5, 8);
2550  hz_out7 = (v8u16) __msa_pckod_d((v2i64) hz_out8, (v2i64) hz_out6);
2551 
2552  ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
2553  ILVEV_B2_UB(hz_out4, hz_out5, hz_out6, hz_out7, vec2, vec3);
2554  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt_vt, filt_vt, filt_vt, filt_vt,
2555  vec4, vec5, vec6, vec7);
2556  SRARI_H4_UH(vec4, vec5, vec6, vec7, 7);
2557  SAT_UH4_UH(vec4, vec5, vec6, vec7, 7);
2558  PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7,
2559  res0, res1, res2, res3);
2560  ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
2561  dst += (4 * dst_stride);
2562  ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride);
2563 }
2564 
2565 void ff_put_bilin_4hv_msa(uint8_t *dst, ptrdiff_t dst_stride,
2566  const uint8_t *src, ptrdiff_t src_stride,
2567  int height, int mx, int my)
2568 {
2569  const int8_t *filter_horiz = vp9_bilinear_filters_msa[mx - 1];
2570  const int8_t *filter_vert = vp9_bilinear_filters_msa[my - 1];
2571 
2572  if (4 == height) {
2573  common_hv_2ht_2vt_4x4_msa(src, src_stride, dst, dst_stride,
2574  filter_horiz, filter_vert);
2575  } else if (8 == height) {
2576  common_hv_2ht_2vt_4x8_msa(src, src_stride, dst, dst_stride,
2577  filter_horiz, filter_vert);
2578  }
2579 }
2580 
2581 static void common_hv_2ht_2vt_8x4_msa(const uint8_t *src, int32_t src_stride,
2582  uint8_t *dst, int32_t dst_stride,
2583  const int8_t *filter_horiz, const int8_t *filter_vert)
2584 {
2585  v16i8 src0, src1, src2, src3, src4, mask, out0, out1;
2586  v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3;
2587  v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;
2588  v8i16 filt;
2589 
2590  mask = LD_SB(&mc_filt_mask_arr[0]);
2591 
2592  /* rearranging filter */
2593  filt = LD_SH(filter_horiz);
2594  filt_hz = (v16u8) __msa_splati_h(filt, 0);
2595 
2596  filt = LD_SH(filter_vert);
2597  filt_vt = (v16u8) __msa_splati_h(filt, 0);
2598 
2599  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
2600 
2601  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, 7);
2602  hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, 7);
2603  vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
2604  tmp0 = __msa_dotp_u_h(vec0, filt_vt);
2605 
2606  hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, 7);
2607  vec1 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
2608  tmp1 = __msa_dotp_u_h(vec1, filt_vt);
2609 
2610  hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, 7);
2611  vec2 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
2612  tmp2 = __msa_dotp_u_h(vec2, filt_vt);
2613 
2614  hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, 7);
2615  vec3 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
2616  tmp3 = __msa_dotp_u_h(vec3, filt_vt);
2617 
2618  SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
2619  SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
2620  PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
2621  ST8x4_UB(out0, out1, dst, dst_stride);
2622 }
2623 
2624 static void common_hv_2ht_2vt_8x8mult_msa(const uint8_t *src, int32_t src_stride,
2625  uint8_t *dst, int32_t dst_stride,
2626  const int8_t *filter_horiz, const int8_t *filter_vert,
2627  int32_t height)
2628 {
2629  uint32_t loop_cnt;
2630  v16i8 src0, src1, src2, src3, src4, mask, out0, out1;
2631  v16u8 filt_hz, filt_vt, vec0;
2632  v8u16 hz_out0, hz_out1, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8;
2633  v8i16 filt;
2634 
2635  mask = LD_SB(&mc_filt_mask_arr[0]);
2636 
2637  /* rearranging filter */
2638  filt = LD_SH(filter_horiz);
2639  filt_hz = (v16u8) __msa_splati_h(filt, 0);
2640 
2641  filt = LD_SH(filter_vert);
2642  filt_vt = (v16u8) __msa_splati_h(filt, 0);
2643 
2644  src0 = LD_SB(src);
2645  src += src_stride;
2646 
2647  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, 7);
2648 
2649  for (loop_cnt = (height >> 3); loop_cnt--;) {
2650  LD_SB4(src, src_stride, src1, src2, src3, src4);
2651  src += (4 * src_stride);
2652 
2653  hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, 7);
2654  vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
2655  tmp1 = __msa_dotp_u_h(vec0, filt_vt);
2656 
2657  hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, 7);
2658  vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
2659  tmp2 = __msa_dotp_u_h(vec0, filt_vt);
2660 
2661  SRARI_H2_UH(tmp1, tmp2, 7);
2662  SAT_UH2_UH(tmp1, tmp2, 7);
2663 
2664  hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, 7);
2665  vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
2666  tmp3 = __msa_dotp_u_h(vec0, filt_vt);
2667 
2668  hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, 7);
2669  LD_SB4(src, src_stride, src1, src2, src3, src4);
2670  src += (4 * src_stride);
2671  vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
2672  tmp4 = __msa_dotp_u_h(vec0, filt_vt);
2673 
2674  SRARI_H2_UH(tmp3, tmp4, 7);
2675  SAT_UH2_UH(tmp3, tmp4, 7);
2676  PCKEV_B2_SB(tmp2, tmp1, tmp4, tmp3, out0, out1);
2677  ST8x4_UB(out0, out1, dst, dst_stride);
2678  dst += (4 * dst_stride);
2679 
2680  hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, 7);
2681  vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
2682  tmp5 = __msa_dotp_u_h(vec0, filt_vt);
2683 
2684  hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, 7);
2685  vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
2686  tmp6 = __msa_dotp_u_h(vec0, filt_vt);
2687 
2688  hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, 7);
2689  vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
2690  tmp7 = __msa_dotp_u_h(vec0, filt_vt);
2691 
2692  hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, 7);
2693  vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
2694  tmp8 = __msa_dotp_u_h(vec0, filt_vt);
2695 
2696  SRARI_H4_UH(tmp5, tmp6, tmp7, tmp8, 7);
2697  SAT_UH4_UH(tmp5, tmp6, tmp7, tmp8, 7);
2698  PCKEV_B2_SB(tmp6, tmp5, tmp8, tmp7, out0, out1);
2699  ST8x4_UB(out0, out1, dst, dst_stride);
2700  dst += (4 * dst_stride);
2701  }
2702 }
2703 
2704 void ff_put_bilin_8hv_msa(uint8_t *dst, ptrdiff_t dst_stride,
2705  const uint8_t *src, ptrdiff_t src_stride,
2706  int height, int mx, int my)
2707 {
2708  const int8_t *filter_horiz = vp9_bilinear_filters_msa[mx - 1];
2709  const int8_t *filter_vert = vp9_bilinear_filters_msa[my - 1];
2710 
2711  if (4 == height) {
2712  common_hv_2ht_2vt_8x4_msa(src, src_stride, dst, dst_stride,
2713  filter_horiz, filter_vert);
2714  } else {
2715  common_hv_2ht_2vt_8x8mult_msa(src, src_stride, dst, dst_stride,
2716  filter_horiz, filter_vert, height);
2717  }
2718 }
2719 
2720 void ff_put_bilin_16hv_msa(uint8_t *dst, ptrdiff_t dst_stride,
2721  const uint8_t *src, ptrdiff_t src_stride,
2722  int height, int mx, int my)
2723 {
2724  uint32_t loop_cnt;
2725  const int8_t *filter_horiz = vp9_bilinear_filters_msa[mx - 1];
2726  const int8_t *filter_vert = vp9_bilinear_filters_msa[my - 1];
2727  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
2728  v16u8 filt_hz, filt_vt, vec0, vec1;
2729  v8u16 tmp1, tmp2, hz_out0, hz_out1, hz_out2, hz_out3;
2730  v8i16 filt;
2731 
2732  mask = LD_SB(&mc_filt_mask_arr[0]);
2733 
2734  /* rearranging filter */
2735  filt = LD_SH(filter_horiz);
2736  filt_hz = (v16u8) __msa_splati_h(filt, 0);
2737 
2738  filt = LD_SH(filter_vert);
2739  filt_vt = (v16u8) __msa_splati_h(filt, 0);
2740 
2741  LD_SB2(src, 8, src0, src1);
2742  src += src_stride;
2743 
2744  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, 7);
2745  hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, 7);
2746 
2747 
2748  for (loop_cnt = (height >> 2); loop_cnt--;) {
2749  LD_SB4(src, src_stride, src0, src2, src4, src6);
2750  LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
2751  src += (4 * src_stride);
2752 
2753  hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, 7);
2754  hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, 7);
2755  ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
2756  DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
2757  SRARI_H2_UH(tmp1, tmp2, 7);
2758  SAT_UH2_UH(tmp1, tmp2, 7);
2759  PCKEV_ST_SB(tmp1, tmp2, dst);
2760  dst += dst_stride;
2761 
2762  hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, 7);
2763  hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, 7);
2764  ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
2765  DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
2766  SRARI_H2_UH(tmp1, tmp2, 7);
2767  SAT_UH2_UH(tmp1, tmp2, 7);
2768  PCKEV_ST_SB(tmp1, tmp2, dst);
2769  dst += dst_stride;
2770 
2771  hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, 7);
2772  hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, 7);
2773  ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
2774  DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
2775  SRARI_H2_UH(tmp1, tmp2, 7);
2776  SAT_UH2_UH(tmp1, tmp2, 7);
2777  PCKEV_ST_SB(tmp1, tmp2, dst);
2778  dst += dst_stride;
2779 
2780  hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, 7);
2781  hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, 7);
2782  ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
2783  DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
2784  SRARI_H2_UH(tmp1, tmp2, 7);
2785  SAT_UH2_UH(tmp1, tmp2, 7);
2786  PCKEV_ST_SB(tmp1, tmp2, dst);
2787  dst += dst_stride;
2788  }
2789 }
2790 
/*
 * 32-wide bilinear "put" (horizontal + vertical): the block is processed
 * as two adjacent 16-pixel-wide columns, each delegated to
 * ff_put_bilin_16hv_msa with the same height and filter indices.
 */
void ff_put_bilin_32hv_msa(uint8_t *dst, ptrdiff_t dst_stride,
                           const uint8_t *src, ptrdiff_t src_stride,
                           int height, int mx, int my)
{
    int32_t col;

    for (col = 0; col < 32; col += 16) {
        ff_put_bilin_16hv_msa(dst + col, dst_stride, src + col, src_stride,
                              height, mx, my);
    }
}
2804 
/*
 * 64-wide bilinear "put" (horizontal + vertical): the block is processed
 * as four adjacent 16-pixel-wide columns, each delegated to
 * ff_put_bilin_16hv_msa with the same height and filter indices.
 */
void ff_put_bilin_64hv_msa(uint8_t *dst, ptrdiff_t dst_stride,
                           const uint8_t *src, ptrdiff_t src_stride,
                           int height, int mx, int my)
{
    int32_t col;

    for (col = 0; col < 64; col += 16) {
        ff_put_bilin_16hv_msa(dst + col, dst_stride, src + col, src_stride,
                              height, mx, my);
    }
}
2818 
2820  int32_t src_stride,
2821  uint8_t *dst, int32_t dst_stride,
2822  const int8_t *filter)
2823 {
2824  uint32_t tp0, tp1, tp2, tp3;
2825  v16i8 src0, src1, src2, src3, mask;
2826  v16u8 filt0, dst0, vec0, vec1, res;
2827  v8u16 vec2, vec3, filt;
2828 
2829  mask = LD_SB(&mc_filt_mask_arr[16]);
2830 
2831  /* rearranging filter */
2832  filt = LD_UH(filter);
2833  filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
2834 
2835  LD_SB4(src, src_stride, src0, src1, src2, src3);
2836  LW4(dst, dst_stride, tp0, tp1, tp2, tp3);
2837  INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
2838  VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
2839  DOTP_UB2_UH(vec0, vec1, filt0, filt0, vec2, vec3);
2840  SRARI_H2_UH(vec2, vec3, 7);
2841 
2842  res = (v16u8) __msa_pckev_b((v16i8) vec3, (v16i8) vec2);
2843  res = (v16u8) __msa_aver_u_b(res, dst0);
2844 
2845  ST4x4_UB(res, res, 0, 1, 2, 3, dst, dst_stride);
2846 }
2847 
2849  int32_t src_stride,
2850  uint8_t *dst, int32_t dst_stride,
2851  const int8_t *filter)
2852 {
2853  uint32_t tp0, tp1, tp2, tp3;
2854  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
2855  v16u8 filt0, vec0, vec1, vec2, vec3, res0, res1, res2, res3;
2856  v16u8 dst0, dst1;
2857  v8u16 vec4, vec5, vec6, vec7, filt;
2858 
2859  mask = LD_SB(&mc_filt_mask_arr[16]);
2860 
2861  /* rearranging filter */
2862  filt = LD_UH(filter);
2863  filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
2864 
2865  LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
2866  LW4(dst, dst_stride, tp0, tp1, tp2, tp3);
2867  INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
2868  LW4(dst + 4 * dst_stride, dst_stride, tp0, tp1, tp2, tp3);
2869  INSERT_W4_UB(tp0, tp1, tp2, tp3, dst1);
2870  VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
2871  VSHF_B2_UB(src4, src5, src6, src7, mask, mask, vec2, vec3);
2872  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec4, vec5,
2873  vec6, vec7);
2874  SRARI_H4_UH(vec4, vec5, vec6, vec7, 7);
2875  PCKEV_B4_UB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, res0, res1,
2876  res2, res3);
2877  ILVR_D2_UB(res1, res0, res3, res2, res0, res2);
2878  AVER_UB2_UB(res0, dst0, res2, dst1, res0, res2);
2879  ST4x8_UB(res0, res2, dst, dst_stride);
2880 }
2881 
2882 void ff_avg_bilin_4h_msa(uint8_t *dst, ptrdiff_t dst_stride,
2883  const uint8_t *src, ptrdiff_t src_stride,
2884  int height, int mx, int my)
2885 {
2886  const int8_t *filter = vp9_bilinear_filters_msa[mx - 1];
2887 
2888  if (4 == height) {
2889  common_hz_2t_and_aver_dst_4x4_msa(src, src_stride, dst, dst_stride,
2890  filter);
2891  } else if (8 == height) {
2892  common_hz_2t_and_aver_dst_4x8_msa(src, src_stride, dst, dst_stride,
2893  filter);
2894  }
2895 }
2896 
2898  int32_t src_stride,
2899  uint8_t *dst, int32_t dst_stride,
2900  const int8_t *filter)
2901 {
2902  int64_t tp0, tp1, tp2, tp3;
2903  v16i8 src0, src1, src2, src3, mask;
2904  v16u8 filt0, dst0, dst1;
2905  v8u16 vec0, vec1, vec2, vec3, filt;
2906 
2907  mask = LD_SB(&mc_filt_mask_arr[0]);
2908 
2909  /* rearranging filter */
2910  filt = LD_UH(filter);
2911  filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
2912 
2913  LD_SB4(src, src_stride, src0, src1, src2, src3);
2914  VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
2915  VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
2916  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
2917  vec0, vec1, vec2, vec3);
2918  SRARI_H4_UH(vec0, vec1, vec2, vec3, 7);
2919  LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
2920  INSERT_D2_UB(tp0, tp1, dst0);
2921  INSERT_D2_UB(tp2, tp3, dst1);
2922  PCKEV_AVG_ST8x4_UB(vec0, vec1, vec2, vec3, dst0, dst1, dst, dst_stride);
2923 }
2924 
2926  int32_t src_stride,
2927  uint8_t *dst,
2928  int32_t dst_stride,
2929  const int8_t *filter,
2930  int32_t height)
2931 {
2932  int64_t tp0, tp1, tp2, tp3;
2933  v16i8 src0, src1, src2, src3, mask;
2934  v16u8 filt0, dst0, dst1;
2935  v8u16 vec0, vec1, vec2, vec3, filt;
2936 
2937  mask = LD_SB(&mc_filt_mask_arr[0]);
2938 
2939  /* rearranging filter */
2940  filt = LD_UH(filter);
2941  filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
2942 
2943  LD_SB4(src, src_stride, src0, src1, src2, src3);
2944  src += (4 * src_stride);
2945  VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
2946  VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
2947  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
2948  vec2, vec3);
2949  SRARI_H4_UH(vec0, vec1, vec2, vec3, 7);
2950  LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
2951  INSERT_D2_UB(tp0, tp1, dst0);
2952  INSERT_D2_UB(tp2, tp3, dst1);
2953  LD_SB4(src, src_stride, src0, src1, src2, src3);
2954  src += (4 * src_stride);
2955  PCKEV_AVG_ST8x4_UB(vec0, vec1, vec2, vec3, dst0, dst1, dst, dst_stride);
2956  dst += (4 * dst_stride);
2957 
2958  VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
2959  VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
2960  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
2961  vec2, vec3);
2962  SRARI_H4_UH(vec0, vec1, vec2, vec3, 7);
2963  LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
2964  INSERT_D2_UB(tp0, tp1, dst0);
2965  INSERT_D2_UB(tp2, tp3, dst1);
2966  PCKEV_AVG_ST8x4_UB(vec0, vec1, vec2, vec3, dst0, dst1, dst, dst_stride);
2967  dst += (4 * dst_stride);
2968 
2969  if (16 == height) {
2970  LD_SB4(src, src_stride, src0, src1, src2, src3);
2971  src += (4 * src_stride);
2972 
2973  VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
2974  VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
2975  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0,
2976  vec1, vec2, vec3);
2977  SRARI_H4_UH(vec0, vec1, vec2, vec3, 7);
2978  LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
2979  INSERT_D2_UB(tp0, tp1, dst0);
2980  INSERT_D2_UB(tp2, tp3, dst1);
2981  LD_SB4(src, src_stride, src0, src1, src2, src3);
2982  PCKEV_AVG_ST8x4_UB(vec0, vec1, vec2, vec3, dst0, dst1, dst, dst_stride);
2983  dst += (4 * dst_stride);
2984 
2985  VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
2986  VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
2987  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0,
2988  vec1, vec2, vec3);
2989  SRARI_H4_UH(vec0, vec1, vec2, vec3, 7);
2990  LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
2991  INSERT_D2_UB(tp0, tp1, dst0);
2992  INSERT_D2_UB(tp2, tp3, dst1);
2993  PCKEV_AVG_ST8x4_UB(vec0, vec1, vec2, vec3, dst0, dst1, dst, dst_stride);
2994  }
2995 }
2996 
2997 void ff_avg_bilin_8h_msa(uint8_t *dst, ptrdiff_t dst_stride,
2998  const uint8_t *src, ptrdiff_t src_stride,
2999  int height, int mx, int my)
3000 {
3001  const int8_t *filter = vp9_bilinear_filters_msa[mx - 1];
3002 
3003  if (4 == height) {
3004  common_hz_2t_and_aver_dst_8x4_msa(src, src_stride, dst, dst_stride,
3005  filter);
3006  } else {
3007  common_hz_2t_and_aver_dst_8x8mult_msa(src, src_stride, dst, dst_stride,
3008  filter, height);
3009  }
3010 }
3011 
3012 void ff_avg_bilin_16h_msa(uint8_t *dst, ptrdiff_t dst_stride,
3013  const uint8_t *src, ptrdiff_t src_stride,
3014  int height, int mx, int my)
3015 {
3016  uint32_t loop_cnt;
3017  const int8_t *filter = vp9_bilinear_filters_msa[mx - 1];
3018  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
3019  v16u8 filt0, dst0, dst1, dst2, dst3;
3020  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
3021  v8u16 res0, res1, res2, res3, res4, res5, res6, res7, filt;
3022 
3023  mask = LD_SB(&mc_filt_mask_arr[0]);
3024 
3025  /* rearranging filter */
3026  filt = LD_UH(filter);
3027  filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
3028 
3029  LD_SB4(src, src_stride, src0, src2, src4, src6);
3030  LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
3031  src += (4 * src_stride);
3032 
3033  VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
3034  VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
3035  VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
3036  VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
3037  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, res0, res1,
3038  res2, res3);
3039  DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, res4, res5,
3040  res6, res7);
3041  SRARI_H4_UH(res0, res1, res2, res3, 7);
3042  SRARI_H4_UH(res4, res5, res6, res7, 7);
3043  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
3044  PCKEV_AVG_ST_UB(res1, res0, dst0, dst);
3045  dst += dst_stride;
3046  PCKEV_AVG_ST_UB(res3, res2, dst1, dst);
3047  dst += dst_stride;
3048  PCKEV_AVG_ST_UB(res5, res4, dst2, dst);
3049  dst += dst_stride;
3050  PCKEV_AVG_ST_UB(res7, res6, dst3, dst);
3051  dst += dst_stride;
3052 
3053  for (loop_cnt = (height >> 2) - 1; loop_cnt--;) {
3054  LD_SB4(src, src_stride, src0, src2, src4, src6);
3055  LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
3056  src += (4 * src_stride);
3057 
3058  VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
3059  VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
3060  VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
3061  VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
3062  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, res0,
3063  res1, res2, res3);
3064  DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, res4,
3065  res5, res6, res7);
3066  SRARI_H4_UH(res0, res1, res2, res3, 7);
3067  SRARI_H4_UH(res4, res5, res6, res7, 7);
3068  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
3069  PCKEV_AVG_ST_UB(res1, res0, dst0, dst);
3070  dst += dst_stride;
3071  PCKEV_AVG_ST_UB(res3, res2, dst1, dst);
3072  dst += dst_stride;
3073  PCKEV_AVG_ST_UB(res5, res4, dst2, dst);
3074  dst += dst_stride;
3075  PCKEV_AVG_ST_UB(res7, res6, dst3, dst);
3076  dst += dst_stride;
3077  }
3078 }
3079 
3080 void ff_avg_bilin_32h_msa(uint8_t *dst, ptrdiff_t dst_stride,
3081  const uint8_t *src, ptrdiff_t src_stride,
3082  int height, int mx, int my)
3083 {
3084  uint32_t loop_cnt;
3085  const int8_t *filter = vp9_bilinear_filters_msa[mx - 1];
3086  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
3087  v16u8 filt0, dst0, dst1, dst2, dst3;
3088  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
3089  v8u16 res0, res1, res2, res3, res4, res5, res6, res7, filt;
3090 
3091  mask = LD_SB(&mc_filt_mask_arr[0]);
3092 
3093  /* rearranging filter */
3094  filt = LD_UH(filter);
3095  filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
3096 
3097  for (loop_cnt = (height >> 1); loop_cnt--;) {
3098  src0 = LD_SB(src);
3099  src2 = LD_SB(src + 16);
3100  src3 = LD_SB(src + 24);
3101  src1 = __msa_sldi_b(src2, src0, 8);
3102  src += src_stride;
3103  src4 = LD_SB(src);
3104  src6 = LD_SB(src + 16);
3105  src7 = LD_SB(src + 24);
3106  src5 = __msa_sldi_b(src6, src4, 8);
3107  src += src_stride;
3108 
3109  VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
3110  VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
3111  VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
3112  VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
3113  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
3114  res0, res1, res2, res3);
3115  DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
3116  res4, res5, res6, res7);
3117  SRARI_H4_UH(res0, res1, res2, res3, 7);
3118  SRARI_H4_UH(res4, res5, res6, res7, 7);
3119  LD_UB2(dst, 16, dst0, dst1);
3120  PCKEV_AVG_ST_UB(res1, res0, dst0, dst);
3121  PCKEV_AVG_ST_UB(res3, res2, dst1, (dst + 16));
3122  dst += dst_stride;
3123  LD_UB2(dst, 16, dst2, dst3);
3124  PCKEV_AVG_ST_UB(res5, res4, dst2, dst);
3125  PCKEV_AVG_ST_UB(res7, res6, dst3, (dst + 16));
3126  dst += dst_stride;
3127  }
3128 }
3129 
3130 void ff_avg_bilin_64h_msa(uint8_t *dst, ptrdiff_t dst_stride,
3131  const uint8_t *src, ptrdiff_t src_stride,
3132  int height, int mx, int my)
3133 {
3134  uint32_t loop_cnt;
3135  const int8_t *filter = vp9_bilinear_filters_msa[mx - 1];
3136  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
3137  v16u8 filt0, dst0, dst1, dst2, dst3;
3138  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
3139  v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt;
3140 
3141  mask = LD_SB(&mc_filt_mask_arr[0]);
3142 
3143  /* rearranging filter */
3144  filt = LD_UH(filter);
3145  filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
3146 
3147  for (loop_cnt = height; loop_cnt--;) {
3148  LD_SB4(src, 16, src0, src2, src4, src6);
3149  src7 = LD_SB(src + 56);
3150  SLDI_B3_SB(src2, src4, src6, src0, src2, src4, src1, src3, src5, 8);
3151  src += src_stride;
3152 
3153  VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
3154  VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
3155  VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
3156  VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
3157  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
3158  out0, out1, out2, out3);
3159  DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
3160  out4, out5, out6, out7);
3161  SRARI_H4_UH(out0, out1, out2, out3, 7);
3162  SRARI_H4_UH(out4, out5, out6, out7, 7);
3163  LD_UB4(dst, 16, dst0, dst1, dst2, dst3);
3164  PCKEV_AVG_ST_UB(out1, out0, dst0, dst);
3165  PCKEV_AVG_ST_UB(out3, out2, dst1, dst + 16);
3166  PCKEV_AVG_ST_UB(out5, out4, dst2, dst + 32);
3167  PCKEV_AVG_ST_UB(out7, out6, dst3, dst + 48);
3168  dst += dst_stride;
3169  }
3170 }
3171 
3173  int32_t src_stride,
3174  uint8_t *dst, int32_t dst_stride,
3175  const int8_t *filter)
3176 {
3177  uint32_t tp0, tp1, tp2, tp3;
3178  v16i8 src0, src1, src2, src3, src4;
3179  v16u8 dst0, out, filt0, src2110, src4332;
3180  v16i8 src10_r, src32_r, src21_r, src43_r;
3181  v8i16 filt;
3182  v8u16 tmp0, tmp1;
3183 
3184  filt = LD_SH(filter);
3185  filt0 = (v16u8) __msa_splati_h(filt, 0);
3186 
3187  LD_SB4(src, src_stride, src0, src1, src2, src3);
3188  src += (4 * src_stride);
3189 
3190  src4 = LD_SB(src);
3191  src += src_stride;
3192 
3193  LW4(dst, dst_stride, tp0, tp1, tp2, tp3);
3194  INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
3195  ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
3196  src10_r, src21_r, src32_r, src43_r);
3197  ILVR_D2_UB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
3198  DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1);
3199  SRARI_H2_UH(tmp0, tmp1, 7);
3200  SAT_UH2_UH(tmp0, tmp1, 7);
3201 
3202  out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
3203  out = __msa_aver_u_b(out, dst0);
3204 
3205  ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
3206 }
3207 
3209  int32_t src_stride,
3210  uint8_t *dst, int32_t dst_stride,
3211  const int8_t *filter)
3212 {
3213  uint32_t tp0, tp1, tp2, tp3;
3214  v16u8 dst0, dst1;
3215  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src87_r;
3216  v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
3217  v16u8 src2110, src4332, src6554, src8776, filt0;
3218  v8u16 tmp0, tmp1, tmp2, tmp3;
3219  v8i16 filt;
3220 
3221  filt = LD_SH(filter);
3222  filt0 = (v16u8) __msa_splati_h(filt, 0);
3223 
3224  LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
3225  src += (8 * src_stride);
3226  src8 = LD_SB(src);
3227 
3228  LW4(dst, dst_stride, tp0, tp1, tp2, tp3);
3229  INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
3230  LW4(dst + 4 * dst_stride, dst_stride, tp0, tp1, tp2, tp3);
3231  INSERT_W4_UB(tp0, tp1, tp2, tp3, dst1);
3232  ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
3233  src32_r, src43_r);
3234  ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
3235  src76_r, src87_r);
3236  ILVR_D4_UB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r,
3237  src87_r, src76_r, src2110, src4332, src6554, src8776);
3238  DOTP_UB4_UH(src2110, src4332, src6554, src8776, filt0, filt0, filt0, filt0,
3239  tmp0, tmp1, tmp2, tmp3);
3240  SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
3241  SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
3242  PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src2110, src4332);
3243  AVER_UB2_UB(src2110, dst0, src4332, dst1, src2110, src4332);
3244  ST4x8_UB(src2110, src4332, dst, dst_stride);
3245 }
3246 
3247 void ff_avg_bilin_4v_msa(uint8_t *dst, ptrdiff_t dst_stride,
3248  const uint8_t *src, ptrdiff_t src_stride,
3249  int height, int mx, int my)
3250 {
3251  const int8_t *filter = vp9_bilinear_filters_msa[my - 1];
3252 
3253  if (4 == height) {
3254  common_vt_2t_and_aver_dst_4x4_msa(src, src_stride, dst, dst_stride,
3255  filter);
3256  } else if (8 == height) {
3257  common_vt_2t_and_aver_dst_4x8_msa(src, src_stride, dst, dst_stride,
3258  filter);
3259  }
3260 }
3261 
3263  int32_t src_stride,
3264  uint8_t *dst,
3265  int32_t dst_stride,
3266  const int8_t *filter)
3267 {
3268  int64_t tp0, tp1, tp2, tp3;
3269  v16u8 src0, src1, src2, src3, src4;
3270  v16u8 dst0, dst1, vec0, vec1, vec2, vec3, filt0;
3271  v8u16 tmp0, tmp1, tmp2, tmp3;
3272  v8i16 filt;
3273 
3274  /* rearranging filter_y */
3275  filt = LD_SH(filter);
3276  filt0 = (v16u8) __msa_splati_h(filt, 0);
3277 
3278  LD_UB5(src, src_stride, src0, src1, src2, src3, src4);
3279  LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
3280  INSERT_D2_UB(tp0, tp1, dst0);
3281  INSERT_D2_UB(tp2, tp3, dst1);
3282  ILVR_B2_UB(src1, src0, src2, src1, vec0, vec1);
3283  ILVR_B2_UB(src3, src2, src4, src3, vec2, vec3);
3284  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
3285  tmp0, tmp1, tmp2, tmp3);
3286  SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
3287  SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
3288  PCKEV_AVG_ST8x4_UB(tmp0, tmp1, tmp2, tmp3, dst0, dst1, dst, dst_stride);
3289 }
3290 
3292  int32_t src_stride,
3293  uint8_t *dst,
3294  int32_t dst_stride,
3295  const int8_t *filter,
3296  int32_t height)
3297 {
3298  uint32_t loop_cnt;
3299  int64_t tp0, tp1, tp2, tp3;
3300  v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
3301  v16u8 dst0, dst1, dst2, dst3;
3302  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
3303  v8u16 tmp0, tmp1, tmp2, tmp3;
3304  v8i16 filt;
3305 
3306  /* rearranging filter_y */
3307  filt = LD_SH(filter);
3308  filt0 = (v16u8) __msa_splati_h(filt, 0);
3309 
3310  src0 = LD_UB(src);
3311  src += src_stride;
3312 
3313  for (loop_cnt = (height >> 3); loop_cnt--;) {
3314  LD_UB8(src, src_stride, src1, src2, src3, src4, src5, src6, src7, src8);
3315  src += (8 * src_stride);
3316 
3317  LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
3318  INSERT_D2_UB(tp0, tp1, dst0);
3319  INSERT_D2_UB(tp2, tp3, dst1);
3320  LD4(dst + 4 * dst_stride, dst_stride, tp0, tp1, tp2, tp3);
3321  INSERT_D2_UB(tp0, tp1, dst2);
3322  INSERT_D2_UB(tp2, tp3, dst3);
3323 
3324  ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3,
3325  vec0, vec1, vec2, vec3);
3326  ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7,
3327  vec4, vec5, vec6, vec7);
3328  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
3329  tmp0, tmp1, tmp2, tmp3);
3330  SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
3331  SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
3332  PCKEV_AVG_ST8x4_UB(tmp0, tmp1, tmp2, tmp3, dst0, dst1, dst, dst_stride);
3333  dst += (4 * dst_stride);
3334 
3335  DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
3336  tmp0, tmp1, tmp2, tmp3);
3337  SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
3338  SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
3339  PCKEV_AVG_ST8x4_UB(tmp0, tmp1, tmp2, tmp3, dst2, dst3, dst, dst_stride);
3340  dst += (4 * dst_stride);
3341 
3342  src0 = src8;
3343  }
3344 }
3345 
3346 void ff_avg_bilin_8v_msa(uint8_t *dst, ptrdiff_t dst_stride,
3347  const uint8_t *src, ptrdiff_t src_stride,
3348  int height, int mx, int my)
3349 {
3350  const int8_t *filter = vp9_bilinear_filters_msa[my - 1];
3351 
3352  if (4 == height) {
3353  common_vt_2t_and_aver_dst_8x4_msa(src, src_stride, dst, dst_stride,
3354  filter);
3355  } else {
3356  common_vt_2t_and_aver_dst_8x8mult_msa(src, src_stride, dst, dst_stride,
3357  filter, height);
3358  }
3359 }
3360 
3361 void ff_avg_bilin_16v_msa(uint8_t *dst, ptrdiff_t dst_stride,
3362  const uint8_t *src, ptrdiff_t src_stride,
3363  int height, int mx, int my)
3364 {
3365  uint32_t loop_cnt;
3366  const int8_t *filter = vp9_bilinear_filters_msa[my - 1];
3367  v16u8 src0, src1, src2, src3, src4, dst0, dst1, dst2, dst3, filt0;
3368  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
3369  v8u16 tmp0, tmp1, tmp2, tmp3, filt;
3370 
3371  /* rearranging filter_y */
3372  filt = LD_UH(filter);
3373  filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
3374 
3375  src0 = LD_UB(src);
3376  src += src_stride;
3377 
3378  for (loop_cnt = (height >> 2); loop_cnt--;) {
3379  LD_UB4(src, src_stride, src1, src2, src3, src4);
3380  src += (4 * src_stride);
3381 
3382  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
3383  ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
3384  ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
3385  DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
3386  SRARI_H2_UH(tmp0, tmp1, 7);
3387  SAT_UH2_UH(tmp0, tmp1, 7);
3388  PCKEV_AVG_ST_UB(tmp1, tmp0, dst0, dst);
3389  dst += dst_stride;
3390 
3391  ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6);
3392  ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
3393  DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
3394  SRARI_H2_UH(tmp2, tmp3, 7);
3395  SAT_UH2_UH(tmp2, tmp3, 7);
3396  PCKEV_AVG_ST_UB(tmp3, tmp2, dst1, dst);
3397  dst += dst_stride;
3398 
3399  DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
3400  SRARI_H2_UH(tmp0, tmp1, 7);
3401  SAT_UH2_UH(tmp0, tmp1, 7);
3402  PCKEV_AVG_ST_UB(tmp1, tmp0, dst2, dst);
3403  dst += dst_stride;
3404 
3405  DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
3406  SRARI_H2_UH(tmp2, tmp3, 7);
3407  SAT_UH2_UH(tmp2, tmp3, 7);
3408  PCKEV_AVG_ST_UB(tmp3, tmp2, dst3, dst);
3409  dst += dst_stride;
3410 
3411  src0 = src4;
3412  }
3413 }
3414 
3415 void ff_avg_bilin_32v_msa(uint8_t *dst, ptrdiff_t dst_stride,
3416  const uint8_t *src, ptrdiff_t src_stride,
3417  int height, int mx, int my)
3418 {
3419  uint32_t loop_cnt;
3420  const int8_t *filter = vp9_bilinear_filters_msa[my - 1];
3421  v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9;
3422  v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
3423  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
3424  v8u16 tmp0, tmp1, tmp2, tmp3, filt;
3425 
3426  /* rearranging filter_y */
3427  filt = LD_UH(filter);
3428  filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
3429 
3430  LD_UB2(src, 16, src0, src5);
3431  src += src_stride;
3432 
3433  for (loop_cnt = (height >> 2); loop_cnt--;) {
3434  LD_UB4(src, src_stride, src1, src2, src3, src4);
3435  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
3436  ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
3437  ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
3438 
3439  LD_UB4(src + 16, src_stride, src6, src7, src8, src9);
3440  LD_UB4(dst + 16, dst_stride, dst4, dst5, dst6, dst7);
3441  src += (4 * src_stride);
3442 
3443  DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
3444  SRARI_H2_UH(tmp0, tmp1, 7);
3445  SAT_UH2_UH(tmp0, tmp1, 7);
3446  PCKEV_AVG_ST_UB(tmp1, tmp0, dst0, dst);
3447 
3448  DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
3449  SRARI_H2_UH(tmp2, tmp3, 7);
3450  SAT_UH2_UH(tmp2, tmp3, 7);
3451  PCKEV_AVG_ST_UB(tmp3, tmp2, dst1, dst + dst_stride);
3452 
3453  ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6);
3454  ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
3455  DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
3456  SRARI_H2_UH(tmp0, tmp1, 7);
3457  SAT_UH2_UH(tmp0, tmp1, 7);
3458  PCKEV_AVG_ST_UB(tmp1, tmp0, dst2, dst + 2 * dst_stride);
3459 
3460  DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
3461  SRARI_H2_UH(tmp2, tmp3, 7);
3462  SAT_UH2_UH(tmp2, tmp3, 7);
3463  PCKEV_AVG_ST_UB(tmp3, tmp2, dst3, dst + 3 * dst_stride);
3464 
3465  ILVR_B2_UB(src6, src5, src7, src6, vec0, vec2);
3466  ILVL_B2_UB(src6, src5, src7, src6, vec1, vec3);
3467  DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
3468  SRARI_H2_UH(tmp0, tmp1, 7);
3469  SAT_UH2_UH(tmp0, tmp1, 7);
3470  PCKEV_AVG_ST_UB(tmp1, tmp0, dst4, dst + 16);
3471 
3472  DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
3473  SRARI_H2_UH(tmp2, tmp3, 7);
3474  SAT_UH2_UH(tmp2, tmp3, 7);
3475  PCKEV_AVG_ST_UB(tmp3, tmp2, dst5, dst + 16 + dst_stride);
3476 
3477  ILVR_B2_UB(src8, src7, src9, src8, vec4, vec6);
3478  ILVL_B2_UB(src8, src7, src9, src8, vec5, vec7);
3479  DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
3480  SRARI_H2_UH(tmp0, tmp1, 7);
3481  SAT_UH2_UH(tmp0, tmp1, 7);
3482  PCKEV_AVG_ST_UB(tmp1, tmp0, dst6, dst + 16 + 2 * dst_stride);
3483 
3484  DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
3485  SRARI_H2_UH(tmp2, tmp3, 7);
3486  SAT_UH2_UH(tmp2, tmp3, 7);
3487  PCKEV_AVG_ST_UB(tmp3, tmp2, dst7, dst + 16 + 3 * dst_stride);
3488  dst += (4 * dst_stride);
3489 
3490  src0 = src4;
3491  src5 = src9;
3492  }
3493 }
3494 
3495 void ff_avg_bilin_64v_msa(uint8_t *dst, ptrdiff_t dst_stride,
3496  const uint8_t *src, ptrdiff_t src_stride,
3497  int height, int mx, int my)
3498 {
3499  uint32_t loop_cnt;
3500  const int8_t *filter = vp9_bilinear_filters_msa[my - 1];
3501  v16u8 src0, src1, src2, src3, src4, src5;
3502  v16u8 src6, src7, src8, src9, src10, src11, filt0;
3503  v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
3504  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
3505  v8u16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
3506  v8u16 filt;
3507 
3508  /* rearranging filter_y */
3509  filt = LD_UH(filter);
3510  filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
3511 
3512  LD_UB4(src, 16, src0, src3, src6, src9);
3513  src += src_stride;
3514 
3515  for (loop_cnt = (height >> 1); loop_cnt--;) {
3516  LD_UB2(src, src_stride, src1, src2);
3517  LD_UB2(dst, dst_stride, dst0, dst1);
3518  LD_UB2(src + 16, src_stride, src4, src5);
3519  LD_UB2(dst + 16, dst_stride, dst2, dst3);
3520  LD_UB2(src + 32, src_stride, src7, src8);
3521  LD_UB2(dst + 32, dst_stride, dst4, dst5);
3522  LD_UB2(src + 48, src_stride, src10, src11);
3523  LD_UB2(dst + 48, dst_stride, dst6, dst7);
3524  src += (2 * src_stride);
3525 
3526  ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
3527  ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
3528  DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
3529  SRARI_H2_UH(tmp0, tmp1, 7);
3530  SAT_UH2_UH(tmp0, tmp1, 7);
3531  PCKEV_AVG_ST_UB(tmp1, tmp0, dst0, dst);
3532 
3533  DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
3534  SRARI_H2_UH(tmp2, tmp3, 7);
3535  SAT_UH2_UH(tmp2, tmp3, 7);
3536  PCKEV_AVG_ST_UB(tmp3, tmp2, dst1, dst + dst_stride);
3537 
3538  ILVR_B2_UB(src4, src3, src5, src4, vec4, vec6);
3539  ILVL_B2_UB(src4, src3, src5, src4, vec5, vec7);
3540  DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp4, tmp5);
3541  SRARI_H2_UH(tmp4, tmp5, 7);
3542  SAT_UH2_UH(tmp4, tmp5, 7);
3543  PCKEV_AVG_ST_UB(tmp5, tmp4, dst2, dst + 16);
3544 
3545  DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp6, tmp7);
3546  SRARI_H2_UH(tmp6, tmp7, 7);
3547  SAT_UH2_UH(tmp6, tmp7, 7);
3548  PCKEV_AVG_ST_UB(tmp7, tmp6, dst3, dst + 16 + dst_stride);
3549 
3550  ILVR_B2_UB(src7, src6, src8, src7, vec0, vec2);
3551  ILVL_B2_UB(src7, src6, src8, src7, vec1, vec3);
3552  DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
3553  SRARI_H2_UH(tmp0, tmp1, 7);
3554  SAT_UH2_UH(tmp0, tmp1, 7);
3555  PCKEV_AVG_ST_UB(tmp1, tmp0, dst4, dst + 32);
3556 
3557  DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
3558  SRARI_H2_UH(tmp2, tmp3, 7);
3559  SAT_UH2_UH(tmp2, tmp3, 7);
3560  PCKEV_AVG_ST_UB(tmp3, tmp2, dst5, dst + 32 + dst_stride);
3561 
3562  ILVR_B2_UB(src10, src9, src11, src10, vec4, vec6);
3563  ILVL_B2_UB(src10, src9, src11, src10, vec5, vec7);
3564  DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp4, tmp5);
3565  SRARI_H2_UH(tmp4, tmp5, 7);
3566  SAT_UH2_UH(tmp4, tmp5, 7);
3567  PCKEV_AVG_ST_UB(tmp5, tmp4, dst6, (dst + 48));
3568 
3569  DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp6, tmp7);
3570  SRARI_H2_UH(tmp6, tmp7, 7);
3571  SAT_UH2_UH(tmp6, tmp7, 7);
3572  PCKEV_AVG_ST_UB(tmp7, tmp6, dst7, dst + 48 + dst_stride);
3573  dst += (2 * dst_stride);
3574 
3575  src0 = src2;
3576  src3 = src5;
3577  src6 = src8;
3578  src9 = src11;
3579  }
3580 }
3581 
3583  int32_t src_stride,
3584  uint8_t *dst,
3585  int32_t dst_stride,
3586  const int8_t *filter_horiz,
3587  const int8_t *filter_vert)
3588 {
3589  uint32_t tp0, tp1, tp2, tp3;
3590  v16i8 src0, src1, src2, src3, src4, mask;
3591  v16u8 filt_hz, filt_vt, vec0, vec1;
3592  v16u8 dst0, out;
3593  v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, tmp0, tmp1, filt;
3594 
3595  mask = LD_SB(&mc_filt_mask_arr[16]);
3596 
3597  /* rearranging filter */
3598  filt = LD_UH(filter_horiz);
3599  filt_hz = (v16u8) __msa_splati_h((v8i16) filt, 0);
3600 
3601  filt = LD_UH(filter_vert);
3602  filt_vt = (v16u8) __msa_splati_h((v8i16) filt, 0);
3603 
3604  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
3605 
3606  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, 7);
3607  hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, 7);
3608  hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, 7);
3609  hz_out1 = (v8u16) __msa_sldi_b((v16i8) hz_out2, (v16i8) hz_out0, 8);
3610  hz_out3 = (v8u16) __msa_pckod_d((v2i64) hz_out4, (v2i64) hz_out2);
3611  ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
3612 
3613  LW4(dst, dst_stride, tp0, tp1, tp2, tp3);
3614  INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
3615 
3616  DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
3617  SRARI_H2_UH(tmp0, tmp1, 7);
3618  SAT_UH2_UH(tmp0, tmp1, 7);
3619 
3620  out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
3621  out = __msa_aver_u_b(out, dst0);
3622 
3623  ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
3624 }
3625 
3627  int32_t src_stride,
3628  uint8_t *dst,
3629  int32_t dst_stride,
3630  const int8_t *filter_horiz,
3631  const int8_t *filter_vert)
3632 {
3633  uint32_t tp0, tp1, tp2, tp3;
3634  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask;
3635  v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3, res0, res1;
3636  v16u8 dst0, dst1;
3637  v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
3638  v8u16 hz_out7, hz_out8, tmp0, tmp1, tmp2, tmp3;
3639  v8i16 filt;
3640 
3641  mask = LD_SB(&mc_filt_mask_arr[16]);
3642 
3643  /* rearranging filter */
3644  filt = LD_SH(filter_horiz);
3645  filt_hz = (v16u8) __msa_splati_h(filt, 0);
3646 
3647  filt = LD_SH(filter_vert);
3648  filt_vt = (v16u8) __msa_splati_h(filt, 0);
3649 
3650  LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
3651  src += (8 * src_stride);
3652  src8 = LD_SB(src);
3653 
3654  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, 7);
3655  hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, 7);
3656  hz_out4 = HORIZ_2TAP_FILT_UH(src4, src5, mask, filt_hz, 7);
3657  hz_out6 = HORIZ_2TAP_FILT_UH(src6, src7, mask, filt_hz, 7);
3658  hz_out8 = HORIZ_2TAP_FILT_UH(src8, src8, mask, filt_hz, 7);
3659  SLDI_B3_UH(hz_out2, hz_out4, hz_out6, hz_out0, hz_out2, hz_out4, hz_out1,
3660  hz_out3, hz_out5, 8);
3661  hz_out7 = (v8u16) __msa_pckod_d((v2i64) hz_out8, (v2i64) hz_out6);
3662 
3663  LW4(dst, dst_stride, tp0, tp1, tp2, tp3);
3664  INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
3665  LW4(dst + 4 * dst_stride, dst_stride, tp0, tp1, tp2, tp3);
3666  INSERT_W4_UB(tp0, tp1, tp2, tp3, dst1);
3667  ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
3668  ILVEV_B2_UB(hz_out4, hz_out5, hz_out6, hz_out7, vec2, vec3);
3669  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt_vt, filt_vt, filt_vt, filt_vt,
3670  tmp0, tmp1, tmp2, tmp3);
3671  SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
3672  SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
3673  PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, res0, res1);
3674  AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
3675  ST4x8_UB(res0, res1, dst, dst_stride);
3676 }
3677 
3678 void ff_avg_bilin_4hv_msa(uint8_t *dst, ptrdiff_t dst_stride,
3679  const uint8_t *src, ptrdiff_t src_stride,
3680  int height, int mx, int my)
3681 {
3682  const int8_t *filter_horiz = vp9_bilinear_filters_msa[mx - 1];
3683  const int8_t *filter_vert = vp9_bilinear_filters_msa[my - 1];
3684 
3685  if (4 == height) {
3686  common_hv_2ht_2vt_and_aver_dst_4x4_msa(src, src_stride, dst, dst_stride,
3687  filter_horiz, filter_vert);
3688  } else if (8 == height) {
3689  common_hv_2ht_2vt_and_aver_dst_4x8_msa(src, src_stride, dst, dst_stride,
3690  filter_horiz, filter_vert);
3691  }
3692 }
3693 
3695  int32_t src_stride,
3696  uint8_t *dst,
3697  int32_t dst_stride,
3698  const int8_t *filter_horiz,
3699  const int8_t *filter_vert)
3700 {
3701  uint64_t tp0, tp1, tp2, tp3;
3702  v16i8 src0, src1, src2, src3, src4, mask;
3703  v16u8 filt_hz, filt_vt, dst0, dst1, vec0, vec1, vec2, vec3;
3704  v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;
3705  v8i16 filt;
3706 
3707  mask = LD_SB(&mc_filt_mask_arr[0]);
3708 
3709  /* rearranging filter */
3710  filt = LD_SH(filter_horiz);
3711  filt_hz = (v16u8) __msa_splati_h(filt, 0);
3712 
3713  filt = LD_SH(filter_vert);
3714  filt_vt = (v16u8) __msa_splati_h(filt, 0);
3715 
3716  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
3717  src += (5 * src_stride);
3718 
3719  LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
3720  INSERT_D2_UB(tp0, tp1, dst0);
3721  INSERT_D2_UB(tp2, tp3, dst1);
3722  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, 7);
3723  hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, 7);
3724  vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
3725  tmp0 = __msa_dotp_u_h(vec0, filt_vt);
3726 
3727  hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, 7);
3728  vec1 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
3729  tmp1 = __msa_dotp_u_h(vec1, filt_vt);
3730 
3731  hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, 7);
3732  vec2 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
3733  tmp2 = __msa_dotp_u_h(vec2, filt_vt);
3734 
3735  hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, 7);
3736  vec3 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
3737  tmp3 = __msa_dotp_u_h(vec3, filt_vt);
3738 
3739  SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
3740  SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
3741  PCKEV_AVG_ST8x4_UB(tmp0, tmp1, tmp2, tmp3, dst0, dst1, dst, dst_stride);
3742 }
3743 
3745  int32_t src_stride,
3746  uint8_t *dst,
3747  int32_t dst_stride,
3748  const int8_t *filter_horiz,
3749  const int8_t *filter_vert,
3750  int32_t height)
3751 {
3752  uint32_t loop_cnt;
3753  uint64_t tp0, tp1, tp2, tp3;
3754  v16i8 src0, src1, src2, src3, src4, mask;
3755  v16u8 filt_hz, filt_vt, vec0, dst0, dst1;
3756  v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;
3757  v8i16 filt;
3758 
3759  mask = LD_SB(&mc_filt_mask_arr[0]);
3760 
3761  /* rearranging filter */
3762  filt = LD_SH(filter_horiz);
3763  filt_hz = (v16u8) __msa_splati_h(filt, 0);
3764 
3765  filt = LD_SH(filter_vert);
3766  filt_vt = (v16u8) __msa_splati_h(filt, 0);
3767 
3768  src0 = LD_SB(src);
3769  src += src_stride;
3770 
3771  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, 7);
3772 
3773  for (loop_cnt = (height >> 2); loop_cnt--;) {
3774  LD_SB4(src, src_stride, src1, src2, src3, src4);
3775  src += (4 * src_stride);
3776 
3777  hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, 7);
3778  vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
3779  tmp0 = __msa_dotp_u_h(vec0, filt_vt);
3780 
3781  hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, 7);
3782  vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
3783  tmp1 = __msa_dotp_u_h(vec0, filt_vt);
3784 
3785  SRARI_H2_UH(tmp0, tmp1, 7);
3786  SAT_UH2_UH(tmp0, tmp1, 7);
3787 
3788  hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, 7);
3789  vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
3790  tmp2 = __msa_dotp_u_h(vec0, filt_vt);
3791 
3792  hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, 7);
3793  vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
3794  tmp3 = __msa_dotp_u_h(vec0, filt_vt);
3795 
3796  SRARI_H2_UH(tmp2, tmp3, 7);
3797  SAT_UH2_UH(tmp2, tmp3, 7);
3798  LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
3799  INSERT_D2_UB(tp0, tp1, dst0);
3800  INSERT_D2_UB(tp2, tp3, dst1);
3801  PCKEV_AVG_ST8x4_UB(tmp0, tmp1, tmp2, tmp3, dst0, dst1, dst, dst_stride);
3802  dst += (4 * dst_stride);
3803  }
3804 }
3805 
3806 void ff_avg_bilin_8hv_msa(uint8_t *dst, ptrdiff_t dst_stride,
3807  const uint8_t *src, ptrdiff_t src_stride,
3808  int height, int mx, int my)
3809 {
3810  const int8_t *filter_horiz = vp9_bilinear_filters_msa[mx - 1];
3811  const int8_t *filter_vert = vp9_bilinear_filters_msa[my - 1];
3812 
3813  if (4 == height) {
3814  common_hv_2ht_2vt_and_aver_dst_8x4_msa(src, src_stride, dst, dst_stride,
3815  filter_horiz, filter_vert);
3816  } else {
3818  dst, dst_stride,
3819  filter_horiz, filter_vert,
3820  height);
3821  }
3822 }
3823 
3824 void ff_avg_bilin_16hv_msa(uint8_t *dst, ptrdiff_t dst_stride,
3825  const uint8_t *src, ptrdiff_t src_stride,
3826  int height, int mx, int my)
3827 {
3828  uint32_t loop_cnt;
3829  const int8_t *filter_horiz = vp9_bilinear_filters_msa[mx - 1];
3830  const int8_t *filter_vert = vp9_bilinear_filters_msa[my - 1];
3831  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
3832  v16u8 filt_hz, filt_vt, vec0, vec1, dst0, dst1, dst2, dst3;
3833  v8u16 hz_out0, hz_out1, hz_out2, hz_out3, tmp0, tmp1;
3834  v8i16 filt;
3835 
3836  mask = LD_SB(&mc_filt_mask_arr[0]);
3837 
3838  /* rearranging filter */
3839  filt = LD_SH(filter_horiz);
3840  filt_hz = (v16u8) __msa_splati_h(filt, 0);
3841 
3842  filt = LD_SH(filter_vert);
3843  filt_vt = (v16u8) __msa_splati_h(filt, 0);
3844 
3845  LD_SB2(src, 8, src0, src1);
3846  src += src_stride;
3847 
3848  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, 7);
3849  hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, 7);
3850 
3851  for (loop_cnt = (height >> 2); loop_cnt--;) {
3852  LD_SB4(src, src_stride, src0, src2, src4, src6);
3853  LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
3854  src += (4 * src_stride);
3855  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
3856 
3857  hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, 7);
3858  hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, 7);
3859  ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
3860  DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
3861  SRARI_H2_UH(tmp0, tmp1, 7);
3862  SAT_UH2_UH(tmp0, tmp1, 7);
3863  PCKEV_AVG_ST_UB(tmp1, tmp0, dst0, dst);
3864  dst += dst_stride;
3865 
3866  hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, 7);
3867  hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, 7);
3868  ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
3869  DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
3870  SRARI_H2_UH(tmp0, tmp1, 7);
3871  SAT_UH2_UH(tmp0, tmp1, 7);
3872  PCKEV_AVG_ST_UB(tmp1, tmp0, dst1, dst);
3873  dst += dst_stride;
3874 
3875  hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, 7);
3876  hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, 7);
3877  ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
3878  DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
3879  SRARI_H2_UH(tmp0, tmp1, 7);
3880  SAT_UH2_UH(tmp0, tmp1, 7);
3881  PCKEV_AVG_ST_UB(tmp1, tmp0, dst2, dst);
3882  dst += dst_stride;
3883 
3884  hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, 7);
3885  hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, 7);
3886  ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
3887  DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
3888  SRARI_H2_UH(tmp0, tmp1, 7);
3889  SAT_UH2_UH(tmp0, tmp1, 7);
3890  PCKEV_AVG_ST_UB(tmp1, tmp0, dst3, dst);
3891  dst += dst_stride;
3892  }
3893 }
3894 
/**
 * 32-pixel-wide horizontal+vertical bilinear filter with averaging into
 * dst, implemented as two side-by-side calls of the 16-pixel-wide kernel.
 *
 * Parameters are forwarded unchanged to ff_avg_bilin_16hv_msa, with dst and
 * src offset by the column of each 16-pixel stripe.
 */
void ff_avg_bilin_32hv_msa(uint8_t *dst, ptrdiff_t dst_stride,
                           const uint8_t *src, ptrdiff_t src_stride,
                           int height, int mx, int my)
{
    int32_t col;

    for (col = 0; col < 32; col += 16) {
        ff_avg_bilin_16hv_msa(dst + col, dst_stride, src + col, src_stride,
                              height, mx, my);
    }
}
3908 
/**
 * 64-pixel-wide horizontal+vertical bilinear filter with averaging into
 * dst, implemented as four side-by-side calls of the 16-pixel-wide kernel.
 *
 * Parameters are forwarded unchanged to ff_avg_bilin_16hv_msa, with dst and
 * src offset by the column of each 16-pixel stripe.
 */
void ff_avg_bilin_64hv_msa(uint8_t *dst, ptrdiff_t dst_stride,
                           const uint8_t *src, ptrdiff_t src_stride,
                           int height, int mx, int my)
{
    int32_t col;

    for (col = 0; col < 64; col += 16) {
        ff_avg_bilin_16hv_msa(dst + col, dst_stride, src + col, src_stride,
                              height, mx, my);
    }
}
3922 
/**
 * Copy an 8-byte-wide block row by row using 64-bit loads and stores.
 * Heights that are multiples of 8 are copied eight rows per iteration,
 * other multiples of 4 four rows per iteration; any other height copies
 * nothing.
 *
 * @param src        first source row
 * @param src_stride byte distance between consecutive src rows
 * @param dst        first destination row
 * @param dst_stride byte distance between consecutive dst rows
 * @param height     number of rows
 */
static void copy_width8_msa(const uint8_t *src, int32_t src_stride,
                            uint8_t *dst, int32_t dst_stride,
                            int32_t height)
{
    int32_t groups;
    uint64_t row0, row1, row2, row3, row4, row5, row6, row7;

    if (0 == height % 8) {
        groups = height >> 3;
        while (groups--) {
            /* all eight rows are read before any of them is written */
            LD4(src, src_stride, row0, row1, row2, row3);
            src += (4 * src_stride);
            LD4(src, src_stride, row4, row5, row6, row7);
            src += (4 * src_stride);

            SD4(row0, row1, row2, row3, dst, dst_stride);
            dst += (4 * dst_stride);
            SD4(row4, row5, row6, row7, dst, dst_stride);
            dst += (4 * dst_stride);
        }
        return;
    }

    if (0 == height % 4) {
        groups = (height / 4);
        while (groups--) {
            LD4(src, src_stride, row0, row1, row2, row3);
            src += (4 * src_stride);

            SD4(row0, row1, row2, row3, dst, dst_stride);
            dst += (4 * dst_stride);
        }
    }
}
3952 
3953 static void copy_width16_msa(const uint8_t *src, int32_t src_stride,
3954  uint8_t *dst, int32_t dst_stride,
3955  int32_t height)
3956 {
3957  int32_t cnt;
3958  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
3959 
3960  if (8 == height) {
3961  LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
3962  ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
3963  } else if (16 == height) {
3964  LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
3965  src += (8 * src_stride);
3966  ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
3967  dst += (8 * dst_stride);
3968  LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
3969  src += (8 * src_stride);
3970  ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
3971  dst += (8 * dst_stride);
3972  } else if (32 == height) {
3973  LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
3974  src += (8 * src_stride);
3975  ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
3976  dst += (8 * dst_stride);
3977  LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
3978  src += (8 * src_stride);
3979  ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
3980  dst += (8 * dst_stride);
3981  LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
3982  src += (8 * src_stride);
3983  ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
3984  dst += (8 * dst_stride);
3985  LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
3986  ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
3987  } else if (0 == height % 4) {
3988  for (cnt = (height >> 2); cnt--;) {
3989  LD_UB4(src, src_stride, src0, src1, src2, src3);
3990  src += (4 * src_stride);
3991  ST_UB4(src0, src1, src2, src3, dst, dst_stride);
3992  dst += (4 * dst_stride);
3993  }
3994  }
3995 }
3996 
3997 static void copy_width32_msa(const uint8_t *src, int32_t src_stride,
3998  uint8_t *dst, int32_t dst_stride,
3999  int32_t height)
4000 {
4001  int32_t cnt;
4002  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
4003 
4004  if (0 == height % 8) {
4005  for (cnt = (height >> 3); cnt--;) {
4006  LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
4007  ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
4008  LD_UB8(src + 16, src_stride, src0, src1, src2, src3, src4, src5, src6,
4009  src7);
4010  src += (8 * src_stride);
4011  ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst + 16,
4012  dst_stride);
4013  dst += (8 * dst_stride);
4014  }
4015  } else if (0 == height % 4) {
4016  for (cnt = (height >> 2); cnt--;) {
4017  LD_UB4(src, src_stride, src0, src1, src2, src3);
4018  LD_UB4(src + 16, src_stride, src4, src5, src6, src7);
4019  src += (4 * src_stride);
4020  ST_UB4(src0, src1, src2, src3, dst, dst_stride);
4021  ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride);
4022  dst += (4 * dst_stride);
4023  }
4024  }
4025 }
4026 
4027 static void copy_width64_msa(const uint8_t *src, int32_t src_stride,
4028  uint8_t *dst, int32_t dst_stride,
4029  int32_t height)
4030 {
4031  int32_t cnt;
4032  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
4033  v16u8 src8, src9, src10, src11, src12, src13, src14, src15;
4034 
4035  for (cnt = (height >> 2); cnt--;) {
4036  LD_UB4(src, 16, src0, src1, src2, src3);
4037  src += src_stride;
4038  LD_UB4(src, 16, src4, src5, src6, src7);
4039  src += src_stride;
4040  LD_UB4(src, 16, src8, src9, src10, src11);
4041  src += src_stride;
4042  LD_UB4(src, 16, src12, src13, src14, src15);
4043  src += src_stride;
4044 
4045  ST_UB4(src0, src1, src2, src3, dst, 16);
4046  dst += dst_stride;
4047  ST_UB4(src4, src5, src6, src7, dst, 16);
4048  dst += dst_stride;
4049  ST_UB4(src8, src9, src10, src11, dst, 16);
4050  dst += dst_stride;
4051  ST_UB4(src12, src13, src14, src15, dst, 16);
4052  dst += dst_stride;
4053  }
4054 }
4055 
4056 static void avg_width4_msa(const uint8_t *src, int32_t src_stride,
4057  uint8_t *dst, int32_t dst_stride,
4058  int32_t height)
4059 {
4060  uint32_t tp0, tp1, tp2, tp3;
4061  v16u8 src0 = { 0 }, src1 = { 0 }, dst0 = { 0 }, dst1 = { 0 };
4062 
4063  if (8 == height) {
4064  LW4(src, src_stride, tp0, tp1, tp2, tp3);
4065  src += 4 * src_stride;
4066  INSERT_W4_UB(tp0, tp1, tp2, tp3, src0);
4067  LW4(src, src_stride, tp0, tp1, tp2, tp3);
4068  INSERT_W4_UB(tp0, tp1, tp2, tp3, src1);
4069  LW4(dst, dst_stride, tp0, tp1, tp2, tp3);
4070  INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
4071  LW4(dst + 4 * dst_stride, dst_stride, tp0, tp1, tp2, tp3);
4072  INSERT_W4_UB(tp0, tp1, tp2, tp3, dst1);
4073  AVER_UB2_UB(src0, dst0, src1, dst1, dst0, dst1);
4074  ST4x8_UB(dst0, dst1, dst, dst_stride);
4075  } else if (4 == height) {
4076  LW4(src, src_stride, tp0, tp1, tp2, tp3);
4077  INSERT_W4_UB(tp0, tp1, tp2, tp3, src0);
4078  LW4(dst, dst_stride, tp0, tp1, tp2, tp3);
4079  INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
4080  dst0 = __msa_aver_u_b(src0, dst0);
4081  ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dst, dst_stride);
4082  }
4083 }
4084 
4085 static void avg_width8_msa(const uint8_t *src, int32_t src_stride,
4086  uint8_t *dst, int32_t dst_stride,
4087  int32_t height)
4088 {
4089  int32_t cnt;
4090  uint64_t tp0, tp1, tp2, tp3, tp4, tp5, tp6, tp7;
4091  v16u8 src0, src1, src2, src3;
4092  v16u8 dst0, dst1, dst2, dst3;
4093 
4094  if (0 == (height % 8)) {
4095  for (cnt = (height >> 3); cnt--;) {
4096  LD4(src, src_stride, tp0, tp1, tp2, tp3);
4097  src += 4 * src_stride;
4098  LD4(src, src_stride, tp4, tp5, tp6, tp7);
4099  src += 4 * src_stride;
4100  INSERT_D2_UB(tp0, tp1, src0);
4101  INSERT_D2_UB(tp2, tp3, src1);
4102  INSERT_D2_UB(tp4, tp5, src2);
4103  INSERT_D2_UB(tp6, tp7, src3);
4104  LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
4105  LD4(dst + 4 * dst_stride, dst_stride, tp4, tp5, tp6, tp7);
4106  INSERT_D2_UB(tp0, tp1, dst0);
4107  INSERT_D2_UB(tp2, tp3, dst1);
4108  INSERT_D2_UB(tp4, tp5, dst2);
4109  INSERT_D2_UB(tp6, tp7, dst3);
4110  AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, dst0,
4111  dst1, dst2, dst3);
4112  ST8x8_UB(dst0, dst1, dst2, dst3, dst, dst_stride);
4113  dst += 8 * dst_stride;
4114  }
4115  } else if (4 == height) {
4116  LD4(src, src_stride, tp0, tp1, tp2, tp3);
4117  INSERT_D2_UB(tp0, tp1, src0);
4118  INSERT_D2_UB(tp2, tp3, src1);
4119  LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
4120  INSERT_D2_UB(tp0, tp1, dst0);
4121  INSERT_D2_UB(tp2, tp3, dst1);
4122  AVER_UB2_UB(src0, dst0, src1, dst1, dst0, dst1);
4123  ST8x4_UB(dst0, dst1, dst, dst_stride);
4124  }
4125 }
4126 
4127 static void avg_width16_msa(const uint8_t *src, int32_t src_stride,
4128  uint8_t *dst, int32_t dst_stride,
4129  int32_t height)
4130 {
4131  int32_t cnt;
4132  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
4133  v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
4134 
4135  if (0 == (height % 8)) {
4136  for (cnt = (height / 8); cnt--;) {
4137  LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
4138  src += (8 * src_stride);
4139  LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
4140 
4141  AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3,
4142  dst0, dst1, dst2, dst3);
4143  AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7,
4144  dst4, dst5, dst6, dst7);
4145  ST_UB8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst, dst_stride);
4146  dst += (8 * dst_stride);
4147  }
4148  } else if (0 == (height % 4)) {
4149  for (cnt = (height / 4); cnt--;) {
4150  LD_UB4(src, src_stride, src0, src1, src2, src3);
4151  src += (4 * src_stride);
4152  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
4153 
4154  AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3,
4155  dst0, dst1, dst2, dst3);
4156  ST_UB4(dst0, dst1, dst2, dst3, dst, dst_stride);
4157  dst += (4 * dst_stride);
4158  }
4159  }
4160 }
4161 
4162 static void avg_width32_msa(const uint8_t *src, int32_t src_stride,
4163  uint8_t *dst, int32_t dst_stride,
4164  int32_t height)
4165 {
4166  int32_t cnt;
4167  uint8_t *dst_dup = dst;
4168  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
4169  v16u8 src8, src9, src10, src11, src12, src13, src14, src15;
4170  v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
4171  v16u8 dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15;
4172 
4173  if (0 == (height % 8)) {
4174  for (cnt = (height / 8); cnt--;) {
4175  LD_UB4(src, src_stride, src0, src2, src4, src6);
4176  LD_UB4(src + 16, src_stride, src1, src3, src5, src7);
4177  src += (4 * src_stride);
4178  LD_UB4(dst_dup, dst_stride, dst0, dst2, dst4, dst6);
4179  LD_UB4(dst_dup + 16, dst_stride, dst1, dst3, dst5, dst7);
4180  dst_dup += (4 * dst_stride);
4181  LD_UB4(src, src_stride, src8, src10, src12, src14);
4182  LD_UB4(src + 16, src_stride, src9, src11, src13, src15);
4183  src += (4 * src_stride);
4184  LD_UB4(dst_dup, dst_stride, dst8, dst10, dst12, dst14);
4185  LD_UB4(dst_dup + 16, dst_stride, dst9, dst11, dst13, dst15);
4186  dst_dup += (4 * dst_stride);
4187 
4188  AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3,
4189  dst0, dst1, dst2, dst3);
4190  AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7,
4191  dst4, dst5, dst6, dst7);
4192  AVER_UB4_UB(src8, dst8, src9, dst9, src10, dst10, src11, dst11,
4193  dst8, dst9, dst10, dst11);
4194  AVER_UB4_UB(src12, dst12, src13, dst13, src14, dst14, src15, dst15,
4195  dst12, dst13, dst14, dst15);
4196 
4197  ST_UB4(dst0, dst2, dst4, dst6, dst, dst_stride);
4198  ST_UB4(dst1, dst3, dst5, dst7, dst + 16, dst_stride);
4199  dst += (4 * dst_stride);
4200  ST_UB4(dst8, dst10, dst12, dst14, dst, dst_stride);
4201  ST_UB4(dst9, dst11, dst13, dst15, dst + 16, dst_stride);
4202  dst += (4 * dst_stride);
4203  }
4204  } else if (0 == (height % 4)) {
4205  for (cnt = (height / 4); cnt--;) {
4206  LD_UB4(src, src_stride, src0, src2, src4, src6);
4207  LD_UB4(src + 16, src_stride, src1, src3, src5, src7);
4208  src += (4 * src_stride);
4209  LD_UB4(dst_dup, dst_stride, dst0, dst2, dst4, dst6);
4210  LD_UB4(dst_dup + 16, dst_stride, dst1, dst3, dst5, dst7);
4211  dst_dup += (4 * dst_stride);
4212 
4213  AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3,
4214  dst0, dst1, dst2, dst3);
4215  AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7,
4216  dst4, dst5, dst6, dst7);
4217 
4218  ST_UB4(dst0, dst2, dst4, dst6, dst, dst_stride);
4219  ST_UB4(dst1, dst3, dst5, dst7, dst + 16, dst_stride);
4220  dst += (4 * dst_stride);
4221  }
4222  }
4223 }
4224 
4225 static void avg_width64_msa(const uint8_t *src, int32_t src_stride,
4226  uint8_t *dst, int32_t dst_stride,
4227  int32_t height)
4228 {
4229  int32_t cnt;
4230  uint8_t *dst_dup = dst;
4231  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
4232  v16u8 src8, src9, src10, src11, src12, src13, src14, src15;
4233  v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
4234  v16u8 dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15;
4235 
4236  for (cnt = (height / 4); cnt--;) {
4237  LD_UB4(src, 16, src0, src1, src2, src3);
4238  src += src_stride;
4239  LD_UB4(src, 16, src4, src5, src6, src7);
4240  src += src_stride;
4241  LD_UB4(src, 16, src8, src9, src10, src11);
4242  src += src_stride;
4243  LD_UB4(src, 16, src12, src13, src14, src15);
4244  src += src_stride;
4245 
4246  LD_UB4(dst_dup, 16, dst0, dst1, dst2, dst3);
4247  dst_dup += dst_stride;
4248  LD_UB4(dst_dup, 16, dst4, dst5, dst6, dst7);
4249  dst_dup += dst_stride;
4250  LD_UB4(dst_dup, 16, dst8, dst9, dst10, dst11);
4251  dst_dup += dst_stride;
4252  LD_UB4(dst_dup, 16, dst12, dst13, dst14, dst15);
4253  dst_dup += dst_stride;
4254 
4255  AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3,
4256  dst0, dst1, dst2, dst3);
4257  AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7,
4258  dst4, dst5, dst6, dst7);
4259  AVER_UB4_UB(src8, dst8, src9, dst9, src10, dst10, src11, dst11,
4260  dst8, dst9, dst10, dst11);
4261  AVER_UB4_UB(src12, dst12, src13, dst13, src14, dst14, src15, dst15,
4262  dst12, dst13, dst14, dst15);
4263 
4264  ST_UB4(dst0, dst1, dst2, dst3, dst, 16);
4265  dst += dst_stride;
4266  ST_UB4(dst4, dst5, dst6, dst7, dst, 16);
4267  dst += dst_stride;
4268  ST_UB4(dst8, dst9, dst10, dst11, dst, 16);
4269  dst += dst_stride;
4270  ST_UB4(dst12, dst13, dst14, dst15, dst, 16);
4271  dst += dst_stride;
4272  }
4273 }
4274 
4275 static const int8_t vp9_subpel_filters_msa[3][15][8] = {
4276  [FILTER_8TAP_REGULAR] = {
4277  {0, 1, -5, 126, 8, -3, 1, 0},
4278  {-1, 3, -10, 122, 18, -6, 2, 0},
4279  {-1, 4, -13, 118, 27, -9, 3, -1},
4280  {-1, 4, -16, 112, 37, -11, 4, -1},
4281  {-1, 5, -18, 105, 48, -14, 4, -1},
4282  {-1, 5, -19, 97, 58, -16, 5, -1},
4283  {-1, 6, -19, 88, 68, -18, 5, -1},
4284  {-1, 6, -19, 78, 78, -19, 6, -1},
4285  {-1, 5, -18, 68, 88, -19, 6, -1},
4286  {-1, 5, -16, 58, 97, -19, 5, -1},
4287  {-1, 4, -14, 48, 105, -18, 5, -1},
4288  {-1, 4, -11, 37, 112, -16, 4, -1},
4289  {-1, 3, -9, 27, 118, -13, 4, -1},
4290  {0, 2, -6, 18, 122, -10, 3, -1},
4291  {0, 1, -3, 8, 126, -5, 1, 0},
4292  }, [FILTER_8TAP_SHARP] = {
4293  {-1, 3, -7, 127, 8, -3, 1, 0},
4294  {-2, 5, -13, 125, 17, -6, 3, -1},
4295  {-3, 7, -17, 121, 27, -10, 5, -2},
4296  {-4, 9, -20, 115, 37, -13, 6, -2},
4297  {-4, 10, -23, 108, 48, -16, 8, -3},
4298  {-4, 10, -24, 100, 59, -19, 9, -3},
4299  {-4, 11, -24, 90, 70, -21, 10, -4},
4300  {-4, 11, -23, 80, 80, -23, 11, -4},
4301  {-4, 10, -21, 70, 90, -24, 11, -4},
4302  {-3, 9, -19, 59, 100, -24, 10, -4},
4303  {-3, 8, -16, 48, 108, -23, 10, -4},
4304  {-2, 6, -13, 37, 115, -20, 9, -4},
4305  {-2, 5, -10, 27, 121, -17, 7, -3},
4306  {-1, 3, -6, 17, 125, -13, 5, -2},
4307  {0, 1, -3, 8, 127, -7, 3, -1},
4308  }, [FILTER_8TAP_SMOOTH] = {
4309  {-3, -1, 32, 64, 38, 1, -3, 0},
4310  {-2, -2, 29, 63, 41, 2, -3, 0},
4311  {-2, -2, 26, 63, 43, 4, -4, 0},
4312  {-2, -3, 24, 62, 46, 5, -4, 0},
4313  {-2, -3, 21, 60, 49, 7, -4, 0},
4314  {-1, -4, 18, 59, 51, 9, -4, 0},
4315  {-1, -4, 16, 57, 53, 12, -4, -1},
4316  {-1, -4, 14, 55, 55, 14, -4, -1},
4317  {-1, -4, 12, 53, 57, 16, -4, -1},
4318  {0, -4, 9, 51, 59, 18, -4, -1},
4319  {0, -4, 7, 49, 60, 21, -3, -2},
4320  {0, -4, 5, 46, 62, 24, -3, -2},
4321  {0, -4, 4, 43, 63, 26, -2, -2},
4322  {0, -3, 2, 41, 63, 29, -2, -2},
4323  {0, -3, 1, 38, 64, 32, -1, -3},
4324  }
4325 };
4326 
/*
 * Generates the six public 8-tap entry points for one block width and one
 * filter type:
 *   ff_{put,avg}_8tap_<type>_<SIZE>{h,v,hv}_msa
 * Each wrapper looks up its taps in vp9_subpel_filters_msa[type_idx]
 * (row mx - 1 for horizontal, my - 1 for vertical) and forwards to the
 * matching common_*_<SIZE>w_msa kernel. The "put" variants call the plain
 * kernels, the "avg" variants the *_and_aver_dst_* kernels.
 */
#define VP9_8TAP_MIPS_MSA_FUNC(SIZE, type, type_idx) \
void ff_put_8tap_##type##_##SIZE##h_msa(uint8_t *dst, ptrdiff_t dststride, \
                                        const uint8_t *src, \
                                        ptrdiff_t srcstride, \
                                        int h, int mx, int my) \
{ \
    const int8_t *h_taps = vp9_subpel_filters_msa[type_idx][mx-1]; \
 \
    common_hz_8t_##SIZE##w_msa(src, srcstride, dst, dststride, h_taps, h); \
} \
 \
void ff_put_8tap_##type##_##SIZE##v_msa(uint8_t *dst, ptrdiff_t dststride, \
                                        const uint8_t *src, \
                                        ptrdiff_t srcstride, \
                                        int h, int mx, int my) \
{ \
    const int8_t *v_taps = vp9_subpel_filters_msa[type_idx][my-1]; \
 \
    common_vt_8t_##SIZE##w_msa(src, srcstride, dst, dststride, v_taps, h); \
} \
 \
void ff_put_8tap_##type##_##SIZE##hv_msa(uint8_t *dst, ptrdiff_t dststride, \
                                         const uint8_t *src, \
                                         ptrdiff_t srcstride, \
                                         int h, int mx, int my) \
{ \
    const int8_t *h_taps = vp9_subpel_filters_msa[type_idx][mx-1]; \
    const int8_t *v_taps = vp9_subpel_filters_msa[type_idx][my-1]; \
 \
    common_hv_8ht_8vt_##SIZE##w_msa(src, srcstride, dst, dststride, \
                                    h_taps, v_taps, h); \
} \
 \
void ff_avg_8tap_##type##_##SIZE##h_msa(uint8_t *dst, ptrdiff_t dststride, \
                                        const uint8_t *src, \
                                        ptrdiff_t srcstride, \
                                        int h, int mx, int my) \
{ \
    const int8_t *h_taps = vp9_subpel_filters_msa[type_idx][mx-1]; \
 \
    common_hz_8t_and_aver_dst_##SIZE##w_msa(src, srcstride, dst, \
                                            dststride, h_taps, h); \
} \
 \
void ff_avg_8tap_##type##_##SIZE##v_msa(uint8_t *dst, ptrdiff_t dststride, \
                                        const uint8_t *src, \
                                        ptrdiff_t srcstride, \
                                        int h, int mx, int my) \
{ \
    const int8_t *v_taps = vp9_subpel_filters_msa[type_idx][my-1]; \
 \
    common_vt_8t_and_aver_dst_##SIZE##w_msa(src, srcstride, dst, dststride, \
                                            v_taps, h); \
} \
 \
void ff_avg_8tap_##type##_##SIZE##hv_msa(uint8_t *dst, ptrdiff_t dststride, \
                                         const uint8_t *src, \
                                         ptrdiff_t srcstride, \
                                         int h, int mx, int my) \
{ \
    const int8_t *h_taps = vp9_subpel_filters_msa[type_idx][mx-1]; \
    const int8_t *v_taps = vp9_subpel_filters_msa[type_idx][my-1]; \
 \
    common_hv_8ht_8vt_and_aver_dst_##SIZE##w_msa(src, srcstride, dst, \
                                                 dststride, h_taps, \
                                                 v_taps, h); \
}
4394 
/*
 * Generates ff_avg<SIZE>_msa: a public wrapper that forwards to
 * avg_width<SIZE>_msa. mx and my are accepted for signature compatibility
 * and ignored.
 */
#define VP9_AVG_MIPS_MSA_FUNC(SIZE) \
void ff_avg##SIZE##_msa(uint8_t *dst, ptrdiff_t dststride, \
                        const uint8_t *src, ptrdiff_t srcstride, \
                        int h, int mx, int my) \
{ \
    avg_width##SIZE##_msa(src, srcstride, dst, dststride, h); \
}

/*
 * Generates ff_copy<SIZE>_msa (forwarding to copy_width<SIZE>_msa) followed
 * by ff_avg<SIZE>_msa, the latter through VP9_AVG_MIPS_MSA_FUNC so the two
 * generators cannot drift apart.
 */
#define VP9_COPY_AVG_MIPS_MSA_FUNC(SIZE) \
void ff_copy##SIZE##_msa(uint8_t *dst, ptrdiff_t dststride, \
                         const uint8_t *src, ptrdiff_t srcstride, \
                         int h, int mx, int my) \
{ \
    copy_width##SIZE##_msa(src, srcstride, dst, dststride, h); \
} \
 \
VP9_AVG_MIPS_MSA_FUNC(SIZE)
4420 
4426 
4432 
4438 
4444 
4445 #undef VP9_8TAP_MIPS_MSA_FUNC
4446 #undef VP9_COPY_AVG_MIPS_MSA_FUNC
4447 #undef VP9_AVG_MIPS_MSA_FUNC
void ff_put_bilin_8v_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp9_mc_msa.c:2261
void ff_avg_bilin_64h_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp9_mc_msa.c:3130
static void common_hv_8ht_8vt_and_aver_dst_64w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_horiz, const int8_t *filter_vert, int32_t height)
Definition: vp9_mc_msa.c:1776
static void common_hz_8t_and_aver_dst_4x8_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
Definition: vp9_mc_msa.c:1022
void ff_avg_bilin_4hv_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp9_mc_msa.c:3678
static void common_vt_8t_32w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: vp9_mc_msa.c:746
#define SLDI_B2_SH(...)
#define VP9_COPY_AVG_MIPS_MSA_FUNC(SIZE)
Definition: vp9_mc_msa.c:4395
#define VSHF_B4_SH(...)
static void common_hv_2ht_2vt_8x8mult_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_horiz, const int8_t *filter_vert, int32_t height)
Definition: vp9_mc_msa.c:2624
void ff_avg_bilin_8hv_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp9_mc_msa.c:3806
static void common_vt_8t_8w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: vp9_mc_msa.c:523
#define PCKEV_AVG_ST8x4_UB(in0, in1, in2, in3, dst0, dst1, pdst, stride)
Definition: vp9_mc_msa.c:148
#define XORI_B2_128_SB(...)
static void common_hz_8t_4x4_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
Definition: vp9_mc_msa.c:159
static void copy_width8_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height)
Definition: vp9_mc_msa.c:3923
#define PCKEV_XORI128_UB(in0, in1)
static void common_hv_2ht_2vt_and_aver_dst_4x4_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_horiz, const int8_t *filter_vert)
Definition: vp9_mc_msa.c:3582
static void common_hz_2t_and_aver_dst_4x4_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
Definition: vp9_mc_msa.c:2819
#define LD_SB(...)
#define XORI_B3_128_SB(...)
#define SLDI_B3_UH(...)
#define HORIZ_8TAP_FILT(src0, src1, mask0, mask1, mask2, mask3, filt_h0, filt_h1, filt_h2, filt_h3)
Definition: vp9_mc_msa.c:66
#define VP9_8TAP_MIPS_MSA_FUNC(SIZE, type, type_idx)
Definition: vp9_mc_msa.c:4327
#define ILVR_D2_UB(...)
void ff_put_bilin_4hv_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp9_mc_msa.c:2565
void ff_put_bilin_32h_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp9_mc_msa.c:2027
#define LD_UB4(...)
void ff_avg_bilin_16v_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp9_mc_msa.c:3361
#define DPADD_SB4_SH(...)
#define ILVR_B2_SB(...)
#define src
Definition: vp8dsp.c:254
static void copy_width64_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height)
Definition: vp9_mc_msa.c:4027
static void common_vt_2t_and_aver_dst_4x8_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
Definition: vp9_mc_msa.c:3208
#define CONVERT_UB_AVG_ST8x4_UB(in0, in1, in2, in3, dst0, dst1, pdst, stride)
static void common_hv_8ht_8vt_and_aver_dst_4w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_horiz, const int8_t *filter_vert, int32_t height)
Definition: vp9_mc_msa.c:1549
static void common_hv_8ht_8vt_and_aver_dst_32w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_horiz, const int8_t *filter_vert, int32_t height)
Definition: vp9_mc_msa.c:1756
#define LD_SB2(...)
#define ST4x4_UB(in0, in1, idx0, idx1, idx2, idx3, pdst, stride)
static void common_hz_8t_and_aver_dst_4w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: vp9_mc_msa.c:1067
static const int8_t vp9_subpel_filters_msa[3][15][8]
Definition: vp9_mc_msa.c:4275
#define XORI_B4_128_UB(...)
static void filter(int16_t *output, ptrdiff_t out_stride, int16_t *low, ptrdiff_t low_stride, int16_t *high, ptrdiff_t high_stride, int len, int clip)
Definition: cfhd.c:114
static void common_hz_2t_and_aver_dst_8x8mult_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: vp9_mc_msa.c:2925
#define PCKEV_ST_SB(in0, in1, pdst)
static void common_hz_2t_4x8_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
Definition: vp9_mc_msa.c:1818
#define ILVR_D2_SB(...)
static void common_hv_2ht_2vt_and_aver_dst_8x8mult_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_horiz, const int8_t *filter_vert, int32_t height)
Definition: vp9_mc_msa.c:3744
uint8_t
#define LD4(psrc, stride, out0, out1, out2, out3)
#define LD_UB2(...)
#define SRARI_H4_SH(...)
#define XORI_B2_128_UB(...)
void ff_put_bilin_64hv_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp9_mc_msa.c:2805
#define SPLATI_H4_SH(...)
#define ILVL_B2_SB(...)
#define height
#define LD_SH(...)
static void common_vt_2t_and_aver_dst_4x4_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
Definition: vp9_mc_msa.c:3172
static void common_vt_2t_8x4_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
Definition: vp9_mc_msa.c:2191
static void common_vt_8t_64w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: vp9_mc_msa.c:754
static void common_hz_8t_8x8mult_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: vp9_mc_msa.c:268
#define PCKEV_AVG_ST_UB(in0, in1, dst, pdst)
Definition: vp9_mc_msa.c:139
#define LD_UB5(...)
void ff_put_bilin_8hv_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp9_mc_msa.c:2704
#define ILVR_D3_SB(...)
#define ILVR_D4_SB(...)
#define LD_SB8(...)
void ff_put_bilin_16v_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp9_mc_msa.c:2275
static void common_vt_8t_16w_mult_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t width)
Definition: vp9_mc_msa.c:657
#define PCKEV_B2_SB(...)
#define FILT_8TAP_DPADD_S_H(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3)
Definition: vp9_mc_msa.c:52
static void common_vt_8t_4w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: vp9_mc_msa.c:471
void ff_put_bilin_16hv_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp9_mc_msa.c:2720
static void common_hv_8ht_8vt_64w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_horiz, const int8_t *filter_vert, int32_t height)
Definition: vp9_mc_msa.c:970
void ff_put_bilin_64v_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp9_mc_msa.c:2407
static void common_hz_2t_and_aver_dst_8x4_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
Definition: vp9_mc_msa.c:2897
static const uint16_t mask[17]
Definition: lzw.c:38
static void common_hv_8ht_8vt_and_aver_dst_8w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_horiz, const int8_t *filter_vert, int32_t height)
Definition: vp9_mc_msa.c:1632
static void common_hz_2t_8x4_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
Definition: vp9_mc_msa.c:1859
static void common_hv_8ht_8vt_4w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_horiz, const int8_t *filter_vert, int32_t height)
Definition: vp9_mc_msa.c:762
static void common_vt_8t_and_aver_dst_8w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: vp9_mc_msa.c:1361
#define XORI_B7_128_SB(...)
void ff_avg_bilin_32hv_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp9_mc_msa.c:3895
uint16_t width
Definition: gdv.c:47
#define XORI_B4_128_SB(...)
static void common_hz_8t_8w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: vp9_mc_msa.c:304
static void common_hv_8ht_8vt_16w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_horiz, const int8_t *filter_vert, int32_t height)
Definition: vp9_mc_msa.c:936
static void common_hz_8t_and_aver_dst_16w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: vp9_mc_msa.c:1123
static void common_vt_8t_and_aver_dst_16w_mult_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t width)
Definition: vp9_mc_msa.c:1422
static void common_hz_8t_8x4_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
Definition: vp9_mc_msa.c:237
static void common_hz_8t_64w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: vp9_mc_msa.c:414
static void common_hv_2ht_2vt_8x4_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_horiz, const int8_t *filter_vert)
Definition: vp9_mc_msa.c:2581
#define SRARI_H2_SH(...)
#define ILVR_B4_UB(...)
void ff_avg_bilin_32h_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp9_mc_msa.c:3080
static void common_hv_8ht_8vt_32w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_horiz, const int8_t *filter_vert, int32_t height)
Definition: vp9_mc_msa.c:953
static void common_vt_2t_4x8_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
Definition: vp9_mc_msa.c:2143
#define LD_UB8(...)
#define HORIZ_2TAP_FILT_UH(in0, in1, mask, coeff, shift)
void ff_put_bilin_4v_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp9_mc_msa.c:2178
#define SRARI_H2_UH(...)
#define VSHF_B2_UH(...)
int32_t
void ff_avg_bilin_8h_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp9_mc_msa.c:2997
void ff_avg_bilin_32v_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp9_mc_msa.c:3415
#define PCKEV_B4_SB(...)
#define AVER_UB2_UB(...)
static void common_hv_2ht_2vt_4x4_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_horiz, const int8_t *filter_vert)
Definition: vp9_mc_msa.c:2488
static void avg_width16_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height)
Definition: vp9_mc_msa.c:4127
static void common_hz_8t_16w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: vp9_mc_msa.c:316
#define ST_UB(...)
static void common_hv_8ht_8vt_and_aver_dst_16w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_horiz, const int8_t *filter_vert, int32_t height)
Definition: vp9_mc_msa.c:1736
static void common_vt_8t_and_aver_dst_32w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: vp9_mc_msa.c:1529
#define SAT_SH4_SH(...)
#define SPLATI_H4_SB(...)
#define LD_SB4(...)
#define PCKEV_B4_UB(...)
#define INSERT_W4_UB(...)
static void common_hv_2ht_2vt_4x8_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_horiz, const int8_t *filter_vert)
Definition: vp9_mc_msa.c:2520
void ff_put_bilin_64h_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp9_mc_msa.c:2074
#define ST_UB8(...)
#define AVER_UB4_UB(...)
void ff_put_bilin_8h_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp9_mc_msa.c:1946
#define ST_UB4(...)
static void common_hz_8t_and_aver_dst_32w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: vp9_mc_msa.c:1181
static void common_vt_8t_16w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: vp9_mc_msa.c:578
#define src1
Definition: h264pred.c:139
#define HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3, filt0, filt1, filt2, filt3, out0, out1)
Definition: vp9_mc_msa.c:83
static void common_hz_8t_and_aver_dst_4x4_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
Definition: vp9_mc_msa.c:987
void ff_put_bilin_32v_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp9_mc_msa.c:2329
#define ILVL_B4_SB(...)
#define SAT_SH2_SH(...)
static void common_vt_2t_4x4_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
Definition: vp9_mc_msa.c:2117
static void copy_width16_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height)
Definition: vp9_mc_msa.c:3953
static void common_hz_8t_32w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: vp9_mc_msa.c:355
static const int8_t vp9_bilinear_filters_msa[15][2]
Definition: vp9_mc_msa.c:34
static void avg_width8_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height)
Definition: vp9_mc_msa.c:4085
#define HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3, filt0, filt1, filt2, filt3, out0, out1, out2, out3)
Definition: vp9_mc_msa.c:102
static void avg_width64_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height)
Definition: vp9_mc_msa.c:4225
#define ILVR_D4_UB(...)
void ff_put_bilin_4h_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp9_mc_msa.c:1846
static void avg_width32_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height)
Definition: vp9_mc_msa.c:4162
#define DOTP_SB4_SH(...)
#define DOTP_UB2_UH(...)
void ff_put_bilin_16h_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp9_mc_msa.c:1960
static void copy_width32_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height)
Definition: vp9_mc_msa.c:3997
#define PCKEV_XORI128_AVG_ST_UB(in0, in1, dst, pdst)
Definition: vp9_mc_msa.c:130
#define SRARI_H4_UH(...)
#define src0
Definition: h264pred.c:138
static void common_vt_2t_and_aver_dst_8x8mult_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: vp9_mc_msa.c:3291
static void common_hv_8ht_8vt_8w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_horiz, const int8_t *filter_vert, int32_t height)
Definition: vp9_mc_msa.c:838
#define SD4(in0, in1, in2, in3, pdst, stride)
void ff_avg_bilin_16h_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp9_mc_msa.c:3012
static void common_vt_2t_8x8mult_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: vp9_mc_msa.c:2215
static void common_vt_2t_and_aver_dst_8x4_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
Definition: vp9_mc_msa.c:3262
static const int8_t filt[NUMTAPS]
Definition: af_earwax.c:39
static void common_hz_8t_4x8_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
Definition: vp9_mc_msa.c:188
static void common_vt_8t_and_aver_dst_4w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: vp9_mc_msa.c:1302
#define ST4x8_UB(in0, in1, pdst, stride)
#define LD_SB7(...)
#define LD_SB5(...)
void ff_avg_bilin_4v_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp9_mc_msa.c:3247
static void common_hz_8t_and_aver_dst_64w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: vp9_mc_msa.c:1240
static void common_hv_2ht_2vt_and_aver_dst_8x4_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_horiz, const int8_t *filter_vert)
Definition: vp9_mc_msa.c:3694
#define INSERT_D2_UB(...)
#define ILVEV_B2_SH(...)
#define LW4(psrc, stride, out0, out1, out2, out3)
#define ILVEV_B2_UB(...)
#define ST8x4_UB(in0, in1, pdst, stride)
#define ILVL_B2_UB(...)
void ff_avg_bilin_4h_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp9_mc_msa.c:2882
#define SAT_UH2_UH(...)
void ff_avg_bilin_8v_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp9_mc_msa.c:3346
#define ST8x8_UB(in0, in1, in2, in3, pdst, stride)
#define SAT_UH4_UH(...)
void ff_avg_bilin_64v_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp9_mc_msa.c:3495
static void common_hz_8t_4w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: vp9_mc_msa.c:226
#define SLDI_B3_SB(...)
#define LD_UB(...)
static void common_hz_8t_and_aver_dst_8w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: vp9_mc_msa.c:1082
#define DOTP_UB4_UH(...)
#define VSHF_B2_UB(...)
void ff_avg_bilin_64hv_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp9_mc_msa.c:3909
#define ILVR_B4_SB(...)
static void common_hz_2t_8x8mult_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: vp9_mc_msa.c:1883
static void common_vt_8t_and_aver_dst_16w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: vp9_mc_msa.c:1519
FILE * out
Definition: movenc.c:54
static void common_hv_2ht_2vt_and_aver_dst_4x8_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_horiz, const int8_t *filter_vert)
Definition: vp9_mc_msa.c:3626
void ff_avg_bilin_16hv_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp9_mc_msa.c:3824
static void common_hz_2t_and_aver_dst_4x8_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
Definition: vp9_mc_msa.c:2848
static const uint8_t mc_filt_mask_arr[16 *3]
Definition: vp9_mc_msa.c:25
static void common_hz_2t_4x4_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
Definition: vp9_mc_msa.c:1796
#define LD_UH(...)
#define VP9_AVG_MIPS_MSA_FUNC(SIZE)
Definition: vp9_mc_msa.c:4412
void ff_put_bilin_32hv_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp9_mc_msa.c:2791
#define PCKEV_B2_UB(...)
static void avg_width4_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height)
Definition: vp9_mc_msa.c:4056
#define ILVR_B2_UB(...)
static void common_vt_8t_and_aver_dst_64w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: vp9_mc_msa.c:1539
#define ADDS_SH4_SH(...)