FFmpeg  4.0
vp8_mc_msa.c
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2015 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
3  *
4  * This file is part of FFmpeg.
5  *
6  * FFmpeg is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * FFmpeg is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with FFmpeg; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19  */
20 
21 #include "libavcodec/vp8dsp.h"
23 #include "vp8dsp_mips.h"
24 
/*
 * Byte-index patterns for the horizontal filters: three rows of 16 entries.
 * Every row lists overlapping index pairs (i, i + 1).  The functions in this
 * file load one row as a vector and derive the patterns for the following
 * taps by adding 2 and 4 to each index.
 */
static const uint8_t mc_filt_mask_arr[16 * 3] = {
    /* offset 0: loaded by the 8- and 16-pixel-wide functions */
    0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,

    /* offset 16: loaded by the 4-pixel-wide functions; the second half of
     * the row uses indices from 16 up (presumably addressing the second
     * shuffle operand, i.e. another picture row -- confirm against VSHF) */
    0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20,

    /* offset 32: same layout as the previous row, shifted by 8 bytes */
    8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
};
33 
/*
 * Sub-pixel interpolation coefficients, one 8-byte row per fractional
 * position; the functions in this file pick a row with [mx - 1] or [my - 1].
 * Rows 0, 2, 4 and 6 carry four coefficients, rows 1, 3 and 5 carry six; the
 * remaining entries are zero.  Every filter row sums to 128, which matches
 * the rounding shift by 7 applied after filtering.
 *
 * The rows are fetched with a whole-vector load (LD_SH into a v8i16, i.e.
 * 16 bytes), so loading row 6 also touches the 8 bytes that follow it.  The
 * trailing all-zero row exists only to keep that read inside the table; it
 * is padding and is not meant to be selected as a filter.
 */
static const int8_t subpel_filters_msa[8][8] = {
    { -6, 123,  12,  -1,   0,  0, 0, 0 },
    {  2, -11, 108,  36,  -8,  1, 0, 0 },   /* New 1/4 pel 6 tap filter */
    { -9,  93,  50,  -6,   0,  0, 0, 0 },
    {  3, -16,  77,  77, -16,  3, 0, 0 },   /* New 1/2 pel 6 tap filter */
    { -6,  50,  93,  -9,   0,  0, 0, 0 },
    {  1,  -8,  36, 108, -11,  2, 0, 0 },   /* New 1/4 pel 6 tap filter */
    { -1,  12, 123,  -6,   0,  0, 0, 0 },
    {  0,   0,   0,   0,   0,  0, 0, 0 },   /* padding for 16-byte loads */
};
43 
/*
 * Two-tap (bilinear) weight pairs, one row per fractional position.
 * Each pair sums to 128.  No user of this table is visible in this part of
 * the file.
 */
static const int8_t bilinear_filters_msa[7][2] = {
    { 112,  16 },
    {  96,  32 },
    {  80,  48 },
    {  64,  64 },
    {  48,  80 },
    {  32,  96 },
    {  16, 112 }
};
53 
/*
 * Horizontal 6-tap filter used for the intermediate rows of the h6v6 paths.
 * Gathers three byte patterns from src0/src1 with mask0..mask2, accumulates
 * them against filt_h0..filt_h2, then applies the rounding shift by 7 and
 * the saturation step.  Evaluates to the resulting v8i16 vector.
 */
#define HORIZ_6TAP_FILT(src0, src1, mask0, mask1, mask2,                     \
                        filt_h0, filt_h1, filt_h2)                           \
( {                                                                          \
    v8i16 hz_out_m;                                                          \
    v16i8 vec0_m, vec1_m, vec2_m;                                            \
                                                                             \
    VSHF_B3_SB(src0, src1, src0, src1, src0, src1, mask0, mask1, mask2,      \
               vec0_m, vec1_m, vec2_m);                                      \
    hz_out_m = DPADD_SH3_SH(vec0_m, vec1_m, vec2_m,                          \
                            filt_h0, filt_h1, filt_h2);                      \
    hz_out_m = __msa_sat_s_h(__msa_srari_h(hz_out_m, 7), 7);                 \
                                                                             \
    hz_out_m;                                                                \
} )
70 
/*
 * Horizontal 6-tap accumulation for four 4-pixel-wide rows.
 * src0/src1 are shuffled together into out0 and src2/src3 into out1, once
 * per mask; the three shuffled sets are weighted by filt0, filt1 and filt2.
 * No rounding or saturation is done here -- the callers in this file apply
 * SRARI/SAT afterwards.
 */
#define HORIZ_6TAP_4WID_4VECS_FILT(src0, src1, src2, src3,                   \
                                   mask0, mask1, mask2,                      \
                                   filt0, filt1, filt2,                      \
                                   out0, out1)                               \
{                                                                            \
    v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m;                    \
                                                                             \
    /* gather the inputs of all three taps first */                         \
    VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0_m, vec1_m);        \
    VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2_m, vec3_m);        \
    VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4_m, vec5_m);        \
                                                                             \
    /* then start the sums with filt0 and add the filt1/filt2 parts */      \
    DOTP_SB2_SH(vec0_m, vec1_m, filt0, filt0, out0, out1);                   \
    DPADD_SB2_SH(vec2_m, vec3_m, filt1, filt1, out0, out1);                  \
    DPADD_SB2_SH(vec4_m, vec5_m, filt2, filt2, out0, out1);                  \
}
85 
/*
 * Horizontal 6-tap accumulation for four 8-pixel-wide inputs.
 * Each source vector is shuffled with itself, so out0..out3 depend on
 * src0..src3 respectively.  No rounding or saturation is done here -- the
 * callers in this file apply SRARI/SAT afterwards.
 */
#define HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3,                   \
                                   mask0, mask1, mask2,                      \
                                   filt0, filt1, filt2,                      \
                                   out0, out1, out2, out3)                   \
{                                                                            \
    v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m;    \
                                                                             \
    /* first coefficient pair starts the sums */                            \
    VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m);        \
    VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m);        \
    DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0,  \
                out0, out1, out2, out3);                                     \
                                                                             \
    /* second coefficient pair */                                           \
    VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0_m, vec1_m);        \
    VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2_m, vec3_m);        \
    DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt1, filt1, filt1, filt1, \
                 out0, out1, out2, out3);                                    \
                                                                             \
    /* third coefficient pair */                                            \
    VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec4_m, vec5_m);        \
    VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec6_m, vec7_m);        \
    DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt2, filt2, filt2, filt2, \
                 out0, out1, out2, out3);                                    \
}
106 
/*
 * 4-tap accumulation: signed dot product of vec0 with filt0, plus the
 * signed dot product of vec1 with filt1.  Evaluates to a v8i16 vector;
 * rounding and saturation are left to the caller.
 *
 * Written as a single expression without a local temporary, so an argument
 * can never be captured by a same-named variable inside the macro, and every
 * argument is parenthesised before being cast.
 */
#define FILT_4TAP_DPADD_S_H(vec0, vec1, filt0, filt1)                        \
    (__msa_dpadd_s_h(__msa_dotp_s_h((v16i8) (vec0), (v16i8) (filt0)),        \
                     (v16i8) (vec1), (v16i8) (filt1)))
116 
/*
 * Horizontal 4-tap filter: gathers two byte patterns from src0/src1 with
 * mask0 and mask1, accumulates them against filt_h0/filt_h1, then applies
 * the rounding shift by 7 and the saturation step.  Evaluates to the
 * resulting v8i16 vector.
 */
#define HORIZ_4TAP_FILT(src0, src1, mask0, mask1, filt_h0, filt_h1)          \
( {                                                                          \
    v8i16 hz_out_m;                                                          \
    v16i8 vec0_m, vec1_m;                                                    \
                                                                             \
    VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0_m, vec1_m);        \
    hz_out_m = FILT_4TAP_DPADD_S_H(vec0_m, vec1_m, filt_h0, filt_h1);        \
    hz_out_m = __msa_sat_s_h(__msa_srari_h(hz_out_m, 7), 7);                 \
                                                                             \
    hz_out_m;                                                                \
} )
130 
/*
 * Horizontal 4-tap accumulation for four 4-pixel-wide rows.
 * src0/src1 are shuffled together into out0 and src2/src3 into out1, once
 * per mask.  No rounding or saturation is done here -- the callers in this
 * file apply SRARI/SAT afterwards.
 */
#define HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3,                   \
                                   mask0, mask1, filt0, filt1,               \
                                   out0, out1)                               \
{                                                                            \
    v16i8 vec0_m, vec1_m, vec2_m, vec3_m;                                    \
                                                                             \
    /* gather the inputs of both taps first */                              \
    VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0_m, vec1_m);        \
    VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2_m, vec3_m);        \
                                                                             \
    /* start the sums with filt0, then add the filt1 part */                \
    DOTP_SB2_SH(vec0_m, vec1_m, filt0, filt0, out0, out1);                   \
    DPADD_SB2_SH(vec2_m, vec3_m, filt1, filt1, out0, out1);                  \
}
142 
/*
 * Horizontal 4-tap accumulation for four 8-pixel-wide inputs.
 * Each source vector is shuffled with itself, so out0..out3 depend on
 * src0..src3 respectively.  The scratch vectors are reused for the second
 * tap, so the first dot product must be issued before they are overwritten.
 * No rounding or saturation is done here.
 */
#define HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3,                   \
                                   mask0, mask1, filt0, filt1,               \
                                   out0, out1, out2, out3)                   \
{                                                                            \
    v16i8 vec0_m, vec1_m, vec2_m, vec3_m;                                    \
                                                                             \
    /* first coefficient pair starts the sums */                            \
    VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m);        \
    VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m);        \
    DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0,  \
                out0, out1, out2, out3);                                     \
                                                                             \
    /* second coefficient pair */                                           \
    VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2_m, vec3_m);        \
    VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0_m, vec1_m);        \
    DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt1, filt1, filt1, filt1, \
                 out0, out1, out2, out3);                                    \
}
158 
159 static void common_hz_6t_4x4_msa(uint8_t *src, int32_t src_stride,
160  uint8_t *dst, int32_t dst_stride,
161  const int8_t *filter)
162 {
163  v16i8 src0, src1, src2, src3, filt0, filt1, filt2;
164  v16u8 mask0, mask1, mask2, out;
165  v8i16 filt, out0, out1;
166 
167  mask0 = LD_UB(&mc_filt_mask_arr[16]);
168  src -= 2;
169 
170  /* rearranging filter */
171  filt = LD_SH(filter);
172  SPLATI_H3_SB(filt, 0, 1, 2, filt0, filt1, filt2);
173 
174  mask1 = mask0 + 2;
175  mask2 = mask0 + 4;
176 
177  LD_SB4(src, src_stride, src0, src1, src2, src3);
178  XORI_B4_128_SB(src0, src1, src2, src3);
179  HORIZ_6TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
180  filt0, filt1, filt2, out0, out1);
181  SRARI_H2_SH(out0, out1, 7);
182  SAT_SH2_SH(out0, out1, 7);
183  out = PCKEV_XORI128_UB(out0, out1);
184  ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
185 }
186 
187 static void common_hz_6t_4x8_msa(uint8_t *src, int32_t src_stride,
188  uint8_t *dst, int32_t dst_stride,
189  const int8_t *filter)
190 {
191  v16i8 src0, src1, src2, src3, filt0, filt1, filt2;
192  v16u8 mask0, mask1, mask2, out;
193  v8i16 filt, out0, out1, out2, out3;
194 
195  mask0 = LD_UB(&mc_filt_mask_arr[16]);
196  src -= 2;
197 
198  /* rearranging filter */
199  filt = LD_SH(filter);
200  SPLATI_H3_SB(filt, 0, 1, 2, filt0, filt1, filt2);
201 
202  mask1 = mask0 + 2;
203  mask2 = mask0 + 4;
204 
205  LD_SB4(src, src_stride, src0, src1, src2, src3);
206  XORI_B4_128_SB(src0, src1, src2, src3);
207  src += (4 * src_stride);
208  HORIZ_6TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
209  filt0, filt1, filt2, out0, out1);
210  LD_SB4(src, src_stride, src0, src1, src2, src3);
211  XORI_B4_128_SB(src0, src1, src2, src3);
212  HORIZ_6TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
213  filt0, filt1, filt2, out2, out3);
214  SRARI_H4_SH(out0, out1, out2, out3, 7);
215  SAT_SH4_SH(out0, out1, out2, out3, 7);
216  out = PCKEV_XORI128_UB(out0, out1);
217  ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
218  dst += (4 * dst_stride);
219  out = PCKEV_XORI128_UB(out2, out3);
220  ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
221 }
222 
223 void ff_put_vp8_epel4_h6_msa(uint8_t *dst, ptrdiff_t dst_stride,
224  uint8_t *src, ptrdiff_t src_stride,
225  int height, int mx, int my)
226 {
227  const int8_t *filter = subpel_filters_msa[mx - 1];
228 
229  if (4 == height) {
230  common_hz_6t_4x4_msa(src, src_stride, dst, dst_stride, filter);
231  } else if (8 == height) {
232  common_hz_6t_4x8_msa(src, src_stride, dst, dst_stride, filter);
233  }
234 }
235 
236 void ff_put_vp8_epel8_h6_msa(uint8_t *dst, ptrdiff_t dst_stride,
237  uint8_t *src, ptrdiff_t src_stride,
238  int height, int mx, int my)
239 {
240  uint32_t loop_cnt;
241  const int8_t *filter = subpel_filters_msa[mx - 1];
242  v16i8 src0, src1, src2, src3, filt0, filt1, filt2;
243  v16u8 mask0, mask1, mask2, tmp0, tmp1;
244  v8i16 filt, out0, out1, out2, out3;
245 
246  mask0 = LD_UB(&mc_filt_mask_arr[0]);
247 
248  src -= 2;
249 
250  /* rearranging filter */
251  filt = LD_SH(filter);
252  SPLATI_H3_SB(filt, 0, 1, 2, filt0, filt1, filt2);
253 
254  mask1 = mask0 + 2;
255  mask2 = mask0 + 4;
256 
257  LD_SB4(src, src_stride, src0, src1, src2, src3);
258  XORI_B4_128_SB(src0, src1, src2, src3);
259  src += (4 * src_stride);
260  HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
261  filt0, filt1, filt2, out0, out1, out2, out3);
262  SRARI_H4_SH(out0, out1, out2, out3, 7);
263  SAT_SH4_SH(out0, out1, out2, out3, 7);
264  tmp0 = PCKEV_XORI128_UB(out0, out1);
265  tmp1 = PCKEV_XORI128_UB(out2, out3);
266  ST8x4_UB(tmp0, tmp1, dst, dst_stride);
267  dst += (4 * dst_stride);
268 
269  for (loop_cnt = (height >> 2) - 1; loop_cnt--;) {
270  LD_SB4(src, src_stride, src0, src1, src2, src3);
271  XORI_B4_128_SB(src0, src1, src2, src3);
272  src += (4 * src_stride);
273  HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
274  filt0, filt1, filt2, out0, out1, out2, out3);
275  SRARI_H4_SH(out0, out1, out2, out3, 7);
276  SAT_SH4_SH(out0, out1, out2, out3, 7);
277  tmp0 = PCKEV_XORI128_UB(out0, out1);
278  tmp1 = PCKEV_XORI128_UB(out2, out3);
279  ST8x4_UB(tmp0, tmp1, dst, dst_stride);
280  dst += (4 * dst_stride);
281  }
282 }
283 
284 void ff_put_vp8_epel16_h6_msa(uint8_t *dst, ptrdiff_t dst_stride,
285  uint8_t *src, ptrdiff_t src_stride,
286  int height, int mx, int my)
287 {
288  uint32_t loop_cnt;
289  const int8_t *filter = subpel_filters_msa[mx - 1];
290  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, filt0, filt1, filt2;
291  v16u8 mask0, mask1, mask2, out;
292  v8i16 filt, out0, out1, out2, out3, out4, out5, out6, out7;
293 
294  mask0 = LD_UB(&mc_filt_mask_arr[0]);
295  src -= 2;
296 
297  /* rearranging filter */
298  filt = LD_SH(filter);
299  SPLATI_H3_SB(filt, 0, 1, 2, filt0, filt1, filt2);
300 
301  mask1 = mask0 + 2;
302  mask2 = mask0 + 4;
303 
304  for (loop_cnt = (height >> 2); loop_cnt--;) {
305  LD_SB4(src, src_stride, src0, src2, src4, src6);
306  LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
307  XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
308  src += (4 * src_stride);
309 
310  HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
311  filt0, filt1, filt2, out0, out1, out2, out3);
312  HORIZ_6TAP_8WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1, mask2,
313  filt0, filt1, filt2, out4, out5, out6, out7);
314  SRARI_H4_SH(out0, out1, out2, out3, 7);
315  SRARI_H4_SH(out4, out5, out6, out7, 7);
316  SAT_SH4_SH(out0, out1, out2, out3, 7);
317  SAT_SH4_SH(out4, out5, out6, out7, 7);
318  out = PCKEV_XORI128_UB(out0, out1);
319  ST_UB(out, dst);
320  dst += dst_stride;
321  out = PCKEV_XORI128_UB(out2, out3);
322  ST_UB(out, dst);
323  dst += dst_stride;
324  out = PCKEV_XORI128_UB(out4, out5);
325  ST_UB(out, dst);
326  dst += dst_stride;
327  out = PCKEV_XORI128_UB(out6, out7);
328  ST_UB(out, dst);
329  dst += dst_stride;
330  }
331 }
332 
333 void ff_put_vp8_epel4_v6_msa(uint8_t *dst, ptrdiff_t dst_stride,
334  uint8_t *src, ptrdiff_t src_stride,
335  int height, int mx, int my)
336 {
337  uint32_t loop_cnt;
338  const int8_t *filter = subpel_filters_msa[my - 1];
339  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
340  v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
341  v16i8 src87_r, src2110, src4332, src6554, src8776, filt0, filt1, filt2;
342  v16u8 out;
343  v8i16 filt, out10, out32;
344 
345  src -= (2 * src_stride);
346 
347  filt = LD_SH(filter);
348  SPLATI_H3_SB(filt, 0, 1, 2, filt0, filt1, filt2);
349 
350  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
351  src += (5 * src_stride);
352 
353  ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
354  src32_r, src43_r);
355  ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
356  XORI_B2_128_SB(src2110, src4332);
357 
358  for (loop_cnt = (height >> 2); loop_cnt--;) {
359  LD_SB4(src, src_stride, src5, src6, src7, src8);
360  src += (4 * src_stride);
361 
362  ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r,
363  src65_r, src76_r, src87_r);
364  ILVR_D2_SB(src65_r, src54_r, src87_r, src76_r, src6554, src8776);
365  XORI_B2_128_SB(src6554, src8776);
366  out10 = DPADD_SH3_SH(src2110, src4332, src6554, filt0, filt1, filt2);
367  out32 = DPADD_SH3_SH(src4332, src6554, src8776, filt0, filt1, filt2);
368  SRARI_H2_SH(out10, out32, 7);
369  SAT_SH2_SH(out10, out32, 7);
370  out = PCKEV_XORI128_UB(out10, out32);
371  ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
372  dst += (4 * dst_stride);
373 
374  src2110 = src6554;
375  src4332 = src8776;
376  src4 = src8;
377  }
378 }
379 
380 void ff_put_vp8_epel8_v6_msa(uint8_t *dst, ptrdiff_t dst_stride,
381  uint8_t *src, ptrdiff_t src_stride,
382  int height, int mx, int my)
383 {
384  uint32_t loop_cnt;
385  const int8_t *filter = subpel_filters_msa[my - 1];
386  v16i8 src0, src1, src2, src3, src4, src7, src8, src9, src10;
387  v16i8 src10_r, src32_r, src76_r, src98_r, src21_r, src43_r, src87_r;
388  v16i8 src109_r, filt0, filt1, filt2;
389  v16u8 tmp0, tmp1;
390  v8i16 filt, out0_r, out1_r, out2_r, out3_r;
391 
392  src -= (2 * src_stride);
393 
394  filt = LD_SH(filter);
395  SPLATI_H3_SB(filt, 0, 1, 2, filt0, filt1, filt2);
396 
397  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
398  src += (5 * src_stride);
399 
400  XORI_B5_128_SB(src0, src1, src2, src3, src4);
401  ILVR_B4_SB(src1, src0, src3, src2, src2, src1, src4, src3,
402  src10_r, src32_r, src21_r, src43_r);
403 
404  for (loop_cnt = (height >> 2); loop_cnt--;) {
405  LD_SB4(src, src_stride, src7, src8, src9, src10);
406  XORI_B4_128_SB(src7, src8, src9, src10);
407  src += (4 * src_stride);
408 
409  ILVR_B4_SB(src7, src4, src8, src7, src9, src8, src10, src9, src76_r,
410  src87_r, src98_r, src109_r);
411  out0_r = DPADD_SH3_SH(src10_r, src32_r, src76_r, filt0, filt1, filt2);
412  out1_r = DPADD_SH3_SH(src21_r, src43_r, src87_r, filt0, filt1, filt2);
413  out2_r = DPADD_SH3_SH(src32_r, src76_r, src98_r, filt0, filt1, filt2);
414  out3_r = DPADD_SH3_SH(src43_r, src87_r, src109_r, filt0, filt1, filt2);
415  SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 7);
416  SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
417  tmp0 = PCKEV_XORI128_UB(out0_r, out1_r);
418  tmp1 = PCKEV_XORI128_UB(out2_r, out3_r);
419  ST8x4_UB(tmp0, tmp1, dst, dst_stride);
420  dst += (4 * dst_stride);
421 
422  src10_r = src76_r;
423  src32_r = src98_r;
424  src21_r = src87_r;
425  src43_r = src109_r;
426  src4 = src10;
427  }
428 }
429 
430 void ff_put_vp8_epel16_v6_msa(uint8_t *dst, ptrdiff_t dst_stride,
431  uint8_t *src, ptrdiff_t src_stride,
432  int height, int mx, int my)
433 {
434  uint32_t loop_cnt;
435  const int8_t *filter = subpel_filters_msa[my - 1];
436  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
437  v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
438  v16i8 src87_r, src10_l, src32_l, src54_l, src76_l, src21_l, src43_l;
439  v16i8 src65_l, src87_l, filt0, filt1, filt2;
440  v16u8 tmp0, tmp1, tmp2, tmp3;
441  v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l, filt;
442 
443  src -= (2 * src_stride);
444 
445  filt = LD_SH(filter);
446  SPLATI_H3_SB(filt, 0, 1, 2, filt0, filt1, filt2);
447 
448  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
449  src += (5 * src_stride);
450 
451  XORI_B5_128_SB(src0, src1, src2, src3, src4);
452  ILVR_B4_SB(src1, src0, src3, src2, src4, src3, src2, src1, src10_r,
453  src32_r, src43_r, src21_r);
454  ILVL_B4_SB(src1, src0, src3, src2, src4, src3, src2, src1, src10_l,
455  src32_l, src43_l, src21_l);
456 
457  for (loop_cnt = (height >> 2); loop_cnt--;) {
458  LD_SB4(src, src_stride, src5, src6, src7, src8);
459  src += (4 * src_stride);
460 
461  XORI_B4_128_SB(src5, src6, src7, src8);
462  ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r,
463  src65_r, src76_r, src87_r);
464  ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_l,
465  src65_l, src76_l, src87_l);
466  out0_r = DPADD_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1,
467  filt2);
468  out1_r = DPADD_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1,
469  filt2);
470  out2_r = DPADD_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1,
471  filt2);
472  out3_r = DPADD_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1,
473  filt2);
474  out0_l = DPADD_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1,
475  filt2);
476  out1_l = DPADD_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1,
477  filt2);
478  out2_l = DPADD_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1,
479  filt2);
480  out3_l = DPADD_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1,
481  filt2);
482  SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 7);
483  SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 7);
484  SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
485  SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
486  PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
487  out3_r, tmp0, tmp1, tmp2, tmp3);
488  XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
489  ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
490  dst += (4 * dst_stride);
491 
492  src10_r = src54_r;
493  src32_r = src76_r;
494  src21_r = src65_r;
495  src43_r = src87_r;
496  src10_l = src54_l;
497  src32_l = src76_l;
498  src21_l = src65_l;
499  src43_l = src87_l;
500  src4 = src8;
501  }
502 }
503 
504 void ff_put_vp8_epel4_h6v6_msa(uint8_t *dst, ptrdiff_t dst_stride,
505  uint8_t *src, ptrdiff_t src_stride,
506  int height, int mx, int my)
507 {
508  uint32_t loop_cnt;
509  const int8_t *filter_horiz = subpel_filters_msa[mx - 1];
510  const int8_t *filter_vert = subpel_filters_msa[my - 1];
511  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
512  v16i8 filt_hz0, filt_hz1, filt_hz2;
513  v16u8 mask0, mask1, mask2, out;
514  v8i16 tmp0, tmp1;
515  v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
516  v8i16 hz_out7, filt, filt_vt0, filt_vt1, filt_vt2, out0, out1, out2, out3;
517 
518  mask0 = LD_UB(&mc_filt_mask_arr[16]);
519  src -= (2 + 2 * src_stride);
520 
521  /* rearranging filter */
522  filt = LD_SH(filter_horiz);
523  SPLATI_H3_SB(filt, 0, 1, 2, filt_hz0, filt_hz1, filt_hz2);
524 
525  filt = LD_SH(filter_vert);
526  SPLATI_H3_SH(filt, 0, 1, 2, filt_vt0, filt_vt1, filt_vt2);
527 
528  mask1 = mask0 + 2;
529  mask2 = mask0 + 4;
530 
531  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
532  src += (5 * src_stride);
533 
534  XORI_B5_128_SB(src0, src1, src2, src3, src4);
535  hz_out0 = HORIZ_6TAP_FILT(src0, src1, mask0, mask1, mask2, filt_hz0,
536  filt_hz1, filt_hz2);
537  hz_out2 = HORIZ_6TAP_FILT(src2, src3, mask0, mask1, mask2, filt_hz0,
538  filt_hz1, filt_hz2);
539  hz_out1 = (v8i16) __msa_sldi_b((v16i8) hz_out2, (v16i8) hz_out0, 8);
540  hz_out3 = HORIZ_6TAP_FILT(src3, src4, mask0, mask1, mask2, filt_hz0,
541  filt_hz1, filt_hz2);
542  ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1);
543 
544  for (loop_cnt = (height >> 2); loop_cnt--;) {
545  LD_SB2(src, src_stride, src5, src6);
546  src += (2 * src_stride);
547 
548  XORI_B2_128_SB(src5, src6);
549  hz_out5 = HORIZ_6TAP_FILT(src5, src6, mask0, mask1, mask2, filt_hz0,
550  filt_hz1, filt_hz2);
551  hz_out4 = (v8i16) __msa_sldi_b((v16i8) hz_out5, (v16i8) hz_out3, 8);
552 
553  LD_SB2(src, src_stride, src7, src8);
554  src += (2 * src_stride);
555 
556  XORI_B2_128_SB(src7, src8);
557  hz_out7 = HORIZ_6TAP_FILT(src7, src8, mask0, mask1, mask2, filt_hz0,
558  filt_hz1, filt_hz2);
559  hz_out6 = (v8i16) __msa_sldi_b((v16i8) hz_out7, (v16i8) hz_out5, 8);
560 
561  out2 = (v8i16) __msa_ilvev_b((v16i8) hz_out5, (v16i8) hz_out4);
562  tmp0 = DPADD_SH3_SH(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2);
563 
564  out3 = (v8i16) __msa_ilvev_b((v16i8) hz_out7, (v16i8) hz_out6);
565  tmp1 = DPADD_SH3_SH(out1, out2, out3, filt_vt0, filt_vt1, filt_vt2);
566 
567  SRARI_H2_SH(tmp0, tmp1, 7);
568  SAT_SH2_SH(tmp0, tmp1, 7);
569  out = PCKEV_XORI128_UB(tmp0, tmp1);
570  ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
571  dst += (4 * dst_stride);
572 
573  hz_out3 = hz_out7;
574  out0 = out2;
575  out1 = out3;
576  }
577 }
578 
579 void ff_put_vp8_epel8_h6v6_msa(uint8_t *dst, ptrdiff_t dst_stride,
580  uint8_t *src, ptrdiff_t src_stride,
581  int height, int mx, int my)
582 {
583  uint32_t loop_cnt;
584  const int8_t *filter_horiz = subpel_filters_msa[mx - 1];
585  const int8_t *filter_vert = subpel_filters_msa[my - 1];
586  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
587  v16i8 filt_hz0, filt_hz1, filt_hz2;
588  v16u8 mask0, mask1, mask2, vec0, vec1;
589  v8i16 filt, filt_vt0, filt_vt1, filt_vt2;
590  v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
591  v8i16 hz_out7, hz_out8, out0, out1, out2, out3, out4, out5, out6, out7;
592  v8i16 tmp0, tmp1, tmp2, tmp3;
593 
594  mask0 = LD_UB(&mc_filt_mask_arr[0]);
595  src -= (2 + 2 * src_stride);
596 
597  /* rearranging filter */
598  filt = LD_SH(filter_horiz);
599  SPLATI_H3_SB(filt, 0, 1, 2, filt_hz0, filt_hz1, filt_hz2);
600 
601  mask1 = mask0 + 2;
602  mask2 = mask0 + 4;
603 
604  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
605  src += (5 * src_stride);
606 
607  XORI_B5_128_SB(src0, src1, src2, src3, src4);
608  hz_out0 = HORIZ_6TAP_FILT(src0, src0, mask0, mask1, mask2, filt_hz0,
609  filt_hz1, filt_hz2);
610  hz_out1 = HORIZ_6TAP_FILT(src1, src1, mask0, mask1, mask2, filt_hz0,
611  filt_hz1, filt_hz2);
612  hz_out2 = HORIZ_6TAP_FILT(src2, src2, mask0, mask1, mask2, filt_hz0,
613  filt_hz1, filt_hz2);
614  hz_out3 = HORIZ_6TAP_FILT(src3, src3, mask0, mask1, mask2, filt_hz0,
615  filt_hz1, filt_hz2);
616  hz_out4 = HORIZ_6TAP_FILT(src4, src4, mask0, mask1, mask2, filt_hz0,
617  filt_hz1, filt_hz2);
618 
619  filt = LD_SH(filter_vert);
620  SPLATI_H3_SH(filt, 0, 1, 2, filt_vt0, filt_vt1, filt_vt2);
621 
622  ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1);
623  ILVEV_B2_SH(hz_out1, hz_out2, hz_out3, hz_out4, out3, out4);
624 
625  for (loop_cnt = (height >> 2); loop_cnt--;) {
626  LD_SB4(src, src_stride, src5, src6, src7, src8);
627  src += (4 * src_stride);
628 
629  XORI_B4_128_SB(src5, src6, src7, src8);
630  hz_out5 = HORIZ_6TAP_FILT(src5, src5, mask0, mask1, mask2, filt_hz0,
631  filt_hz1, filt_hz2);
632  out2 = (v8i16) __msa_ilvev_b((v16i8) hz_out5, (v16i8) hz_out4);
633  tmp0 = DPADD_SH3_SH(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2);
634 
635  hz_out6 = HORIZ_6TAP_FILT(src6, src6, mask0, mask1, mask2, filt_hz0,
636  filt_hz1, filt_hz2);
637  out5 = (v8i16) __msa_ilvev_b((v16i8) hz_out6, (v16i8) hz_out5);
638  tmp1 = DPADD_SH3_SH(out3, out4, out5, filt_vt0, filt_vt1, filt_vt2);
639 
640  hz_out7 = HORIZ_6TAP_FILT(src7, src7, mask0, mask1, mask2, filt_hz0,
641  filt_hz1, filt_hz2);
642  out7 = (v8i16) __msa_ilvev_b((v16i8) hz_out7, (v16i8) hz_out6);
643  tmp2 = DPADD_SH3_SH(out1, out2, out7, filt_vt0, filt_vt1, filt_vt2);
644 
645  hz_out8 = HORIZ_6TAP_FILT(src8, src8, mask0, mask1, mask2, filt_hz0,
646  filt_hz1, filt_hz2);
647  out6 = (v8i16) __msa_ilvev_b((v16i8) hz_out8, (v16i8) hz_out7);
648  tmp3 = DPADD_SH3_SH(out4, out5, out6, filt_vt0, filt_vt1, filt_vt2);
649 
650  SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7);
651  SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
652  vec0 = PCKEV_XORI128_UB(tmp0, tmp1);
653  vec1 = PCKEV_XORI128_UB(tmp2, tmp3);
654  ST8x4_UB(vec0, vec1, dst, dst_stride);
655  dst += (4 * dst_stride);
656 
657  hz_out4 = hz_out8;
658  out0 = out2;
659  out1 = out7;
660  out3 = out5;
661  out4 = out6;
662  }
663 }
664 
665 
/**
 * 16-pixel-wide 2-D (h6v6) prediction, done as two independent 8-pixel-wide
 * columns: bytes 0..7 first, then bytes 8..15.
 *
 * @param dst        destination block
 * @param dst_stride byte distance between destination rows
 * @param src        source block
 * @param src_stride byte distance between source rows
 * @param height     number of rows, forwarded unchanged
 * @param mx         horizontal fraction, forwarded unchanged
 * @param my         vertical fraction, forwarded unchanged
 */
void ff_put_vp8_epel16_h6v6_msa(uint8_t *dst, ptrdiff_t dst_stride,
                                uint8_t *src, ptrdiff_t src_stride,
                                int height, int mx, int my)
{
    /* left half */
    ff_put_vp8_epel8_h6v6_msa(dst, dst_stride, src, src_stride, height,
                              mx, my);
    /* right half */
    ff_put_vp8_epel8_h6v6_msa(dst + 8, dst_stride, src + 8, src_stride, height,
                              mx, my);
}
680 
681 static void common_hz_4t_4x4_msa(uint8_t *src, int32_t src_stride,
682  uint8_t *dst, int32_t dst_stride,
683  const int8_t *filter)
684 {
685  v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1;
686  v8i16 filt, out0, out1;
687  v16u8 out;
688 
689  mask0 = LD_SB(&mc_filt_mask_arr[16]);
690  src -= 1;
691 
692  /* rearranging filter */
693  filt = LD_SH(filter);
694  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
695 
696  mask1 = mask0 + 2;
697 
698  LD_SB4(src, src_stride, src0, src1, src2, src3);
699  XORI_B4_128_SB(src0, src1, src2, src3);
700  HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
701  filt0, filt1, out0, out1);
702  SRARI_H2_SH(out0, out1, 7);
703  SAT_SH2_SH(out0, out1, 7);
704  out = PCKEV_XORI128_UB(out0, out1);
705  ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
706 }
707 
708 static void common_hz_4t_4x8_msa(uint8_t *src, int32_t src_stride,
709  uint8_t *dst, int32_t dst_stride,
710  const int8_t *filter)
711 {
712  v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1;
713  v16u8 out;
714  v8i16 filt, out0, out1, out2, out3;
715 
716  mask0 = LD_SB(&mc_filt_mask_arr[16]);
717  src -= 1;
718 
719  /* rearranging filter */
720  filt = LD_SH(filter);
721  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
722 
723  mask1 = mask0 + 2;
724 
725  LD_SB4(src, src_stride, src0, src1, src2, src3);
726  src += (4 * src_stride);
727 
728  XORI_B4_128_SB(src0, src1, src2, src3);
729  HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
730  filt0, filt1, out0, out1);
731  LD_SB4(src, src_stride, src0, src1, src2, src3);
732  XORI_B4_128_SB(src0, src1, src2, src3);
733  HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
734  filt0, filt1, out2, out3);
735  SRARI_H4_SH(out0, out1, out2, out3, 7);
736  SAT_SH4_SH(out0, out1, out2, out3, 7);
737  out = PCKEV_XORI128_UB(out0, out1);
738  ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
739  dst += (4 * dst_stride);
740  out = PCKEV_XORI128_UB(out2, out3);
741  ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
742 }
743 
744 static void common_hz_4t_4x16_msa(uint8_t *src, int32_t src_stride,
745  uint8_t *dst, int32_t dst_stride,
746  const int8_t *filter)
747 {
748  v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
749  v16i8 filt0, filt1, mask0, mask1;
750  v16u8 out;
751  v8i16 filt, out0, out1, out2, out3;
752 
753  mask0 = LD_SB(&mc_filt_mask_arr[16]);
754  src -= 1;
755 
756  /* rearranging filter */
757  filt = LD_SH(filter);
758  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
759 
760  mask1 = mask0 + 2;
761 
762  LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
763  src += (8 * src_stride);
764  XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
765  HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
766  filt0, filt1, out0, out1);
767  HORIZ_4TAP_4WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1,
768  filt0, filt1, out2, out3);
769  SRARI_H4_SH(out0, out1, out2, out3, 7);
770  SAT_SH4_SH(out0, out1, out2, out3, 7);
771  out = PCKEV_XORI128_UB(out0, out1);
772  ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
773  dst += (4 * dst_stride);
774  out = PCKEV_XORI128_UB(out2, out3);
775  ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
776  dst += (4 * dst_stride);
777 
778  LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
779  src += (8 * src_stride);
780  XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
781  HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
782  filt0, filt1, out0, out1);
783  HORIZ_4TAP_4WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1,
784  filt0, filt1, out2, out3);
785  SRARI_H4_SH(out0, out1, out2, out3, 7);
786  SAT_SH4_SH(out0, out1, out2, out3, 7);
787  out = PCKEV_XORI128_UB(out0, out1);
788  ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
789  dst += (4 * dst_stride);
790  out = PCKEV_XORI128_UB(out2, out3);
791  ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
792 }
793 
794 void ff_put_vp8_epel4_h4_msa(uint8_t *dst, ptrdiff_t dst_stride,
795  uint8_t *src, ptrdiff_t src_stride,
796  int height, int mx, int my)
797 {
798  const int8_t *filter = subpel_filters_msa[mx - 1];
799 
800  if (4 == height) {
801  common_hz_4t_4x4_msa(src, src_stride, dst, dst_stride, filter);
802  } else if (8 == height) {
803  common_hz_4t_4x8_msa(src, src_stride, dst, dst_stride, filter);
804  } else if (16 == height) {
805  common_hz_4t_4x16_msa(src, src_stride, dst, dst_stride, filter);
806  }
807 }
808 
809 void ff_put_vp8_epel8_h4_msa(uint8_t *dst, ptrdiff_t dst_stride,
810  uint8_t *src, ptrdiff_t src_stride,
811  int height, int mx, int my)
812 {
813  uint32_t loop_cnt;
814  const int8_t *filter = subpel_filters_msa[mx - 1];
815  v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1;
816  v16u8 tmp0, tmp1;
817  v8i16 filt, out0, out1, out2, out3;
818 
819  mask0 = LD_SB(&mc_filt_mask_arr[0]);
820  src -= 1;
821 
822  /* rearranging filter */
823  filt = LD_SH(filter);
824  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
825 
826  mask1 = mask0 + 2;
827 
828  for (loop_cnt = (height >> 2); loop_cnt--;) {
829  LD_SB4(src, src_stride, src0, src1, src2, src3);
830  src += (4 * src_stride);
831 
832  XORI_B4_128_SB(src0, src1, src2, src3);
833  HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0,
834  filt1, out0, out1, out2, out3);
835  SRARI_H4_SH(out0, out1, out2, out3, 7);
836  SAT_SH4_SH(out0, out1, out2, out3, 7);
837  tmp0 = PCKEV_XORI128_UB(out0, out1);
838  tmp1 = PCKEV_XORI128_UB(out2, out3);
839  ST8x4_UB(tmp0, tmp1, dst, dst_stride);
840  dst += (4 * dst_stride);
841  }
842 }
843 
844 void ff_put_vp8_epel16_h4_msa(uint8_t *dst, ptrdiff_t dst_stride,
845  uint8_t *src, ptrdiff_t src_stride,
846  int height, int mx, int my)
847 {
848  uint32_t loop_cnt;
849  const int8_t *filter = subpel_filters_msa[mx - 1];
850  v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
851  v16i8 filt0, filt1, mask0, mask1;
852  v8i16 filt, out0, out1, out2, out3, out4, out5, out6, out7;
853  v16u8 out;
854 
855  mask0 = LD_SB(&mc_filt_mask_arr[0]);
856  src -= 1;
857 
858  /* rearranging filter */
859  filt = LD_SH(filter);
860  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
861 
862  mask1 = mask0 + 2;
863 
864  for (loop_cnt = (height >> 2); loop_cnt--;) {
865  LD_SB4(src, src_stride, src0, src2, src4, src6);
866  LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
867  src += (4 * src_stride);
868 
869  XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
870  HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0,
871  filt1, out0, out1, out2, out3);
872  HORIZ_4TAP_8WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1, filt0,
873  filt1, out4, out5, out6, out7);
874  SRARI_H4_SH(out0, out1, out2, out3, 7);
875  SRARI_H4_SH(out4, out5, out6, out7, 7);
876  SAT_SH4_SH(out0, out1, out2, out3, 7);
877  SAT_SH4_SH(out4, out5, out6, out7, 7);
878  out = PCKEV_XORI128_UB(out0, out1);
879  ST_UB(out, dst);
880  dst += dst_stride;
881  out = PCKEV_XORI128_UB(out2, out3);
882  ST_UB(out, dst);
883  dst += dst_stride;
884  out = PCKEV_XORI128_UB(out4, out5);
885  ST_UB(out, dst);
886  dst += dst_stride;
887  out = PCKEV_XORI128_UB(out6, out7);
888  ST_UB(out, dst);
889  dst += dst_stride;
890  }
891 }
892 
893 void ff_put_vp8_epel4_v4_msa(uint8_t *dst, ptrdiff_t dst_stride,
894  uint8_t *src, ptrdiff_t src_stride,
895  int height, int mx, int my)
896 {
897  uint32_t loop_cnt;
898  const int8_t *filter = subpel_filters_msa[my - 1];
899  v16i8 src0, src1, src2, src3, src4, src5;
900  v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r;
901  v16i8 src2110, src4332, filt0, filt1;
902  v8i16 filt, out10, out32;
903  v16u8 out;
904 
905  src -= src_stride;
906 
907  filt = LD_SH(filter);
908  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
909 
910  LD_SB3(src, src_stride, src0, src1, src2);
911  src += (3 * src_stride);
912 
913  ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
914 
915  src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
916  src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
917 
918  for (loop_cnt = (height >> 2); loop_cnt--;) {
919  LD_SB3(src, src_stride, src3, src4, src5);
920  src += (3 * src_stride);
921  ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
922  src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_r, (v2i64) src32_r);
923  src4332 = (v16i8) __msa_xori_b((v16u8) src4332, 128);
924  out10 = FILT_4TAP_DPADD_S_H(src2110, src4332, filt0, filt1);
925 
926  src2 = LD_SB(src);
927  src += (src_stride);
928  ILVR_B2_SB(src5, src4, src2, src5, src54_r, src65_r);
929  src2110 = (v16i8) __msa_ilvr_d((v2i64) src65_r, (v2i64) src54_r);
930  src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
931  out32 = FILT_4TAP_DPADD_S_H(src4332, src2110, filt0, filt1);
932  SRARI_H2_SH(out10, out32, 7);
933  SAT_SH2_SH(out10, out32, 7);
934  out = PCKEV_XORI128_UB(out10, out32);
935  ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
936  dst += (4 * dst_stride);
937  }
938 }
939 
940 void ff_put_vp8_epel8_v4_msa(uint8_t *dst, ptrdiff_t dst_stride,
941  uint8_t *src, ptrdiff_t src_stride,
942  int height, int mx, int my)
943 {
944  uint32_t loop_cnt;
945  const int8_t *filter = subpel_filters_msa[my - 1];
946  v16i8 src0, src1, src2, src7, src8, src9, src10;
947  v16i8 src10_r, src72_r, src98_r, src21_r, src87_r, src109_r, filt0, filt1;
948  v16u8 tmp0, tmp1;
949  v8i16 filt, out0_r, out1_r, out2_r, out3_r;
950 
951  src -= src_stride;
952 
953  filt = LD_SH(filter);
954  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
955 
956  LD_SB3(src, src_stride, src0, src1, src2);
957  src += (3 * src_stride);
958 
959  XORI_B3_128_SB(src0, src1, src2);
960  ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
961 
962  for (loop_cnt = (height >> 2); loop_cnt--;) {
963  LD_SB4(src, src_stride, src7, src8, src9, src10);
964  src += (4 * src_stride);
965 
966  XORI_B4_128_SB(src7, src8, src9, src10);
967  ILVR_B4_SB(src7, src2, src8, src7, src9, src8, src10, src9,
968  src72_r, src87_r, src98_r, src109_r);
969  out0_r = FILT_4TAP_DPADD_S_H(src10_r, src72_r, filt0, filt1);
970  out1_r = FILT_4TAP_DPADD_S_H(src21_r, src87_r, filt0, filt1);
971  out2_r = FILT_4TAP_DPADD_S_H(src72_r, src98_r, filt0, filt1);
972  out3_r = FILT_4TAP_DPADD_S_H(src87_r, src109_r, filt0, filt1);
973  SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 7);
974  SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
975  tmp0 = PCKEV_XORI128_UB(out0_r, out1_r);
976  tmp1 = PCKEV_XORI128_UB(out2_r, out3_r);
977  ST8x4_UB(tmp0, tmp1, dst, dst_stride);
978  dst += (4 * dst_stride);
979 
980  src10_r = src98_r;
981  src21_r = src109_r;
982  src2 = src10;
983  }
984 }
985 
986 void ff_put_vp8_epel16_v4_msa(uint8_t *dst, ptrdiff_t dst_stride,
987  uint8_t *src, ptrdiff_t src_stride,
988  int height, int mx, int my)
989 {
990  uint32_t loop_cnt;
991  const int8_t *filter = subpel_filters_msa[my - 1];
992  v16i8 src0, src1, src2, src3, src4, src5, src6;
993  v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r, src10_l;
994  v16i8 src32_l, src54_l, src21_l, src43_l, src65_l, filt0, filt1;
995  v16u8 tmp0, tmp1, tmp2, tmp3;
996  v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
997 
998  src -= src_stride;
999 
1000  filt = LD_SH(filter);
1001  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
1002 
1003  LD_SB3(src, src_stride, src0, src1, src2);
1004  src += (3 * src_stride);
1005 
1006  XORI_B3_128_SB(src0, src1, src2);
1007  ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
1008  ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
1009 
1010  for (loop_cnt = (height >> 2); loop_cnt--;) {
1011  LD_SB4(src, src_stride, src3, src4, src5, src6);
1012  src += (4 * src_stride);
1013 
1014  XORI_B4_128_SB(src3, src4, src5, src6);
1015  ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
1016  src32_r, src43_r, src54_r, src65_r);
1017  ILVL_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
1018  src32_l, src43_l, src54_l, src65_l);
1019  out0_r = FILT_4TAP_DPADD_S_H(src10_r, src32_r, filt0, filt1);
1020  out1_r = FILT_4TAP_DPADD_S_H(src21_r, src43_r, filt0, filt1);
1021  out2_r = FILT_4TAP_DPADD_S_H(src32_r, src54_r, filt0, filt1);
1022  out3_r = FILT_4TAP_DPADD_S_H(src43_r, src65_r, filt0, filt1);
1023  out0_l = FILT_4TAP_DPADD_S_H(src10_l, src32_l, filt0, filt1);
1024  out1_l = FILT_4TAP_DPADD_S_H(src21_l, src43_l, filt0, filt1);
1025  out2_l = FILT_4TAP_DPADD_S_H(src32_l, src54_l, filt0, filt1);
1026  out3_l = FILT_4TAP_DPADD_S_H(src43_l, src65_l, filt0, filt1);
1027  SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 7);
1028  SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 7);
1029  SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
1030  SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
1031  PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
1032  out3_r, tmp0, tmp1, tmp2, tmp3);
1033  XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
1034  ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
1035  dst += (4 * dst_stride);
1036 
1037  src10_r = src54_r;
1038  src21_r = src65_r;
1039  src10_l = src54_l;
1040  src21_l = src65_l;
1041  src2 = src6;
1042  }
1043 }
1044 
1045 void ff_put_vp8_epel4_h4v4_msa(uint8_t *dst, ptrdiff_t dst_stride,
1046  uint8_t *src, ptrdiff_t src_stride,
1047  int height, int mx, int my)
1048 {
1049  uint32_t loop_cnt;
1050  const int8_t *filter_horiz = subpel_filters_msa[mx - 1];
1051  const int8_t *filter_vert = subpel_filters_msa[my - 1];
1052  v16i8 src0, src1, src2, src3, src4, src5, src6, filt_hz0, filt_hz1;
1053  v16u8 mask0, mask1, out;
1054  v8i16 filt, filt_vt0, filt_vt1, tmp0, tmp1, vec0, vec1, vec2;
1055  v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5;
1056 
1057  mask0 = LD_UB(&mc_filt_mask_arr[16]);
1058  src -= (1 + 1 * src_stride);
1059 
1060  /* rearranging filter */
1061  filt = LD_SH(filter_horiz);
1062  SPLATI_H2_SB(filt, 0, 1, filt_hz0, filt_hz1);
1063 
1064  mask1 = mask0 + 2;
1065 
1066  LD_SB3(src, src_stride, src0, src1, src2);
1067  src += (3 * src_stride);
1068 
1069  XORI_B3_128_SB(src0, src1, src2);
1070  hz_out0 = HORIZ_4TAP_FILT(src0, src1, mask0, mask1, filt_hz0, filt_hz1);
1071  hz_out1 = HORIZ_4TAP_FILT(src1, src2, mask0, mask1, filt_hz0, filt_hz1);
1072  vec0 = (v8i16) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
1073 
1074  filt = LD_SH(filter_vert);
1075  SPLATI_H2_SH(filt, 0, 1, filt_vt0, filt_vt1);
1076 
1077  for (loop_cnt = (height >> 2); loop_cnt--;) {
1078  LD_SB4(src, src_stride, src3, src4, src5, src6);
1079  src += (4 * src_stride);
1080 
1081  XORI_B2_128_SB(src3, src4);
1082  hz_out3 = HORIZ_4TAP_FILT(src3, src4, mask0, mask1, filt_hz0, filt_hz1);
1083  hz_out2 = (v8i16) __msa_sldi_b((v16i8) hz_out3, (v16i8) hz_out1, 8);
1084  vec1 = (v8i16) __msa_ilvev_b((v16i8) hz_out3, (v16i8) hz_out2);
1085  tmp0 = FILT_4TAP_DPADD_S_H(vec0, vec1, filt_vt0, filt_vt1);
1086 
1087  XORI_B2_128_SB(src5, src6);
1088  hz_out5 = HORIZ_4TAP_FILT(src5, src6, mask0, mask1, filt_hz0, filt_hz1);
1089  hz_out4 = (v8i16) __msa_sldi_b((v16i8) hz_out5, (v16i8) hz_out3, 8);
1090  vec2 = (v8i16) __msa_ilvev_b((v16i8) hz_out5, (v16i8) hz_out4);
1091  tmp1 = FILT_4TAP_DPADD_S_H(vec1, vec2, filt_vt0, filt_vt1);
1092 
1093  SRARI_H2_SH(tmp0, tmp1, 7);
1094  SAT_SH2_SH(tmp0, tmp1, 7);
1095  out = PCKEV_XORI128_UB(tmp0, tmp1);
1096  ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
1097  dst += (4 * dst_stride);
1098 
1099  hz_out1 = hz_out5;
1100  vec0 = vec2;
1101  }
1102 }
1103 
1104 void ff_put_vp8_epel8_h4v4_msa(uint8_t *dst, ptrdiff_t dst_stride,
1105  uint8_t *src, ptrdiff_t src_stride,
1106  int height, int mx, int my)
1107 {
1108  uint32_t loop_cnt;
1109  const int8_t *filter_horiz = subpel_filters_msa[mx - 1];
1110  const int8_t *filter_vert = subpel_filters_msa[my - 1];
1111  v16i8 src0, src1, src2, src3, src4, src5, src6, filt_hz0, filt_hz1;
1112  v16u8 mask0, mask1, out0, out1;
1113  v8i16 filt, filt_vt0, filt_vt1, tmp0, tmp1, tmp2, tmp3;
1114  v8i16 hz_out0, hz_out1, hz_out2, hz_out3;
1115  v8i16 vec0, vec1, vec2, vec3, vec4;
1116 
1117  mask0 = LD_UB(&mc_filt_mask_arr[0]);
1118  src -= (1 + 1 * src_stride);
1119 
1120  /* rearranging filter */
1121  filt = LD_SH(filter_horiz);
1122  SPLATI_H2_SB(filt, 0, 1, filt_hz0, filt_hz1);
1123 
1124  mask1 = mask0 + 2;
1125 
1126  LD_SB3(src, src_stride, src0, src1, src2);
1127  src += (3 * src_stride);
1128 
1129  XORI_B3_128_SB(src0, src1, src2);
1130  hz_out0 = HORIZ_4TAP_FILT(src0, src0, mask0, mask1, filt_hz0, filt_hz1);
1131  hz_out1 = HORIZ_4TAP_FILT(src1, src1, mask0, mask1, filt_hz0, filt_hz1);
1132  hz_out2 = HORIZ_4TAP_FILT(src2, src2, mask0, mask1, filt_hz0, filt_hz1);
1133  ILVEV_B2_SH(hz_out0, hz_out1, hz_out1, hz_out2, vec0, vec2);
1134 
1135  filt = LD_SH(filter_vert);
1136  SPLATI_H2_SH(filt, 0, 1, filt_vt0, filt_vt1);
1137 
1138  for (loop_cnt = (height >> 2); loop_cnt--;) {
1139  LD_SB4(src, src_stride, src3, src4, src5, src6);
1140  src += (4 * src_stride);
1141 
1142  XORI_B4_128_SB(src3, src4, src5, src6);
1143  hz_out3 = HORIZ_4TAP_FILT(src3, src3, mask0, mask1, filt_hz0, filt_hz1);
1144  vec1 = (v8i16) __msa_ilvev_b((v16i8) hz_out3, (v16i8) hz_out2);
1145  tmp0 = FILT_4TAP_DPADD_S_H(vec0, vec1, filt_vt0, filt_vt1);
1146 
1147  hz_out0 = HORIZ_4TAP_FILT(src4, src4, mask0, mask1, filt_hz0, filt_hz1);
1148  vec3 = (v8i16) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out3);
1149  tmp1 = FILT_4TAP_DPADD_S_H(vec2, vec3, filt_vt0, filt_vt1);
1150 
1151  hz_out1 = HORIZ_4TAP_FILT(src5, src5, mask0, mask1, filt_hz0, filt_hz1);
1152  vec4 = (v8i16) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
1153  tmp2 = FILT_4TAP_DPADD_S_H(vec1, vec4, filt_vt0, filt_vt1);
1154 
1155  hz_out2 = HORIZ_4TAP_FILT(src6, src6, mask0, mask1, filt_hz0, filt_hz1);
1156  ILVEV_B2_SH(hz_out3, hz_out0, hz_out1, hz_out2, vec0, vec1);
1157  tmp3 = FILT_4TAP_DPADD_S_H(vec0, vec1, filt_vt0, filt_vt1);
1158 
1159  SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7);
1160  SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
1161  out0 = PCKEV_XORI128_UB(tmp0, tmp1);
1162  out1 = PCKEV_XORI128_UB(tmp2, tmp3);
1163  ST8x4_UB(out0, out1, dst, dst_stride);
1164  dst += (4 * dst_stride);
1165 
1166  vec0 = vec4;
1167  vec2 = vec1;
1168  }
1169 }
1170 
/**
 * VP8 "put" motion compensation, 16 pixels wide, 4-tap horizontal and
 * 4-tap vertical filter.
 *
 * Implemented as two calls of the 8-wide routine: one on the block as
 * given, one on the columns starting 8 bytes further right.
 * All parameters are forwarded unchanged apart from that column offset.
 */
void ff_put_vp8_epel16_h4v4_msa(uint8_t *dst, ptrdiff_t dst_stride,
                                uint8_t *src, ptrdiff_t src_stride,
                                int height, int mx, int my)
{
    /* left 8 columns */
    ff_put_vp8_epel8_h4v4_msa(dst, dst_stride, src, src_stride, height,
                              mx, my);
    /* right 8 columns */
    ff_put_vp8_epel8_h4v4_msa(dst + 8, dst_stride, src + 8, src_stride,
                              height, mx, my);
}
1185 
1186 void ff_put_vp8_epel4_h6v4_msa(uint8_t *dst, ptrdiff_t dst_stride,
1187  uint8_t *src, ptrdiff_t src_stride,
1188  int height, int mx, int my)
1189 {
1190  uint32_t loop_cnt;
1191  const int8_t *filter_horiz = subpel_filters_msa[mx - 1];
1192  const int8_t *filter_vert = subpel_filters_msa[my - 1];
1193  v16i8 src0, src1, src2, src3, src4, src5, src6;
1194  v16i8 filt_hz0, filt_hz1, filt_hz2;
1195  v16u8 res0, res1, mask0, mask1, mask2;
1196  v8i16 filt, filt_vt0, filt_vt1, tmp0, tmp1, vec0, vec1, vec2;
1197  v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5;
1198 
1199  mask0 = LD_UB(&mc_filt_mask_arr[16]);
1200  src -= (2 + 1 * src_stride);
1201 
1202  /* rearranging filter */
1203  filt = LD_SH(filter_horiz);
1204  SPLATI_H3_SB(filt, 0, 1, 2, filt_hz0, filt_hz1, filt_hz2);
1205 
1206  mask1 = mask0 + 2;
1207  mask2 = mask0 + 4;
1208 
1209  LD_SB3(src, src_stride, src0, src1, src2);
1210  src += (3 * src_stride);
1211 
1212  XORI_B3_128_SB(src0, src1, src2);
1213  hz_out0 = HORIZ_6TAP_FILT(src0, src1, mask0, mask1, mask2, filt_hz0,
1214  filt_hz1, filt_hz2);
1215  hz_out1 = HORIZ_6TAP_FILT(src1, src2, mask0, mask1, mask2, filt_hz0,
1216  filt_hz1, filt_hz2);
1217  vec0 = (v8i16) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
1218 
1219  filt = LD_SH(filter_vert);
1220  SPLATI_H2_SH(filt, 0, 1, filt_vt0, filt_vt1);
1221 
1222  for (loop_cnt = (height >> 2); loop_cnt--;) {
1223  LD_SB4(src, src_stride, src3, src4, src5, src6);
1224  src += (4 * src_stride);
1225 
1226  XORI_B4_128_SB(src3, src4, src5, src6);
1227  hz_out3 = HORIZ_6TAP_FILT(src3, src4, mask0, mask1, mask2, filt_hz0,
1228  filt_hz1, filt_hz2);
1229  hz_out2 = (v8i16) __msa_sldi_b((v16i8) hz_out3, (v16i8) hz_out1, 8);
1230  vec1 = (v8i16) __msa_ilvev_b((v16i8) hz_out3, (v16i8) hz_out2);
1231  tmp0 = FILT_4TAP_DPADD_S_H(vec0, vec1, filt_vt0, filt_vt1);
1232 
1233  hz_out5 = HORIZ_6TAP_FILT(src5, src6, mask0, mask1, mask2, filt_hz0,
1234  filt_hz1, filt_hz2);
1235  hz_out4 = (v8i16) __msa_sldi_b((v16i8) hz_out5, (v16i8) hz_out3, 8);
1236  vec2 = (v8i16) __msa_ilvev_b((v16i8) hz_out5, (v16i8) hz_out4);
1237  tmp1 = FILT_4TAP_DPADD_S_H(vec1, vec2, filt_vt0, filt_vt1);
1238 
1239  SRARI_H2_SH(tmp0, tmp1, 7);
1240  SAT_SH2_SH(tmp0, tmp1, 7);
1241  PCKEV_B2_UB(tmp0, tmp0, tmp1, tmp1, res0, res1);
1242  XORI_B2_128_UB(res0, res1);
1243  ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
1244  dst += (4 * dst_stride);
1245 
1246  hz_out1 = hz_out5;
1247  vec0 = vec2;
1248  }
1249 }
1250 
1251 void ff_put_vp8_epel8_h6v4_msa(uint8_t *dst, ptrdiff_t dst_stride,
1252  uint8_t *src, ptrdiff_t src_stride,
1253  int height, int mx, int my)
1254 {
1255  uint32_t loop_cnt;
1256  const int8_t *filter_horiz = subpel_filters_msa[mx - 1];
1257  const int8_t *filter_vert = subpel_filters_msa[my - 1];
1258  v16i8 src0, src1, src2, src3, src4, src5, src6;
1259  v16i8 filt_hz0, filt_hz1, filt_hz2, mask0, mask1, mask2;
1260  v8i16 filt, filt_vt0, filt_vt1, hz_out0, hz_out1, hz_out2, hz_out3;
1261  v8i16 tmp0, tmp1, tmp2, tmp3, vec0, vec1, vec2, vec3;
1262  v16u8 out0, out1;
1263 
1264  mask0 = LD_SB(&mc_filt_mask_arr[0]);
1265  src -= (2 + src_stride);
1266 
1267  /* rearranging filter */
1268  filt = LD_SH(filter_horiz);
1269  SPLATI_H3_SB(filt, 0, 1, 2, filt_hz0, filt_hz1, filt_hz2);
1270 
1271  mask1 = mask0 + 2;
1272  mask2 = mask0 + 4;
1273 
1274  LD_SB3(src, src_stride, src0, src1, src2);
1275  src += (3 * src_stride);
1276 
1277  XORI_B3_128_SB(src0, src1, src2);
1278  hz_out0 = HORIZ_6TAP_FILT(src0, src0, mask0, mask1, mask2, filt_hz0,
1279  filt_hz1, filt_hz2);
1280  hz_out1 = HORIZ_6TAP_FILT(src1, src1, mask0, mask1, mask2, filt_hz0,
1281  filt_hz1, filt_hz2);
1282  hz_out2 = HORIZ_6TAP_FILT(src2, src2, mask0, mask1, mask2, filt_hz0,
1283  filt_hz1, filt_hz2);
1284  ILVEV_B2_SH(hz_out0, hz_out1, hz_out1, hz_out2, vec0, vec2);
1285 
1286  filt = LD_SH(filter_vert);
1287  SPLATI_H2_SH(filt, 0, 1, filt_vt0, filt_vt1);
1288 
1289  for (loop_cnt = (height >> 2); loop_cnt--;) {
1290  LD_SB4(src, src_stride, src3, src4, src5, src6);
1291  src += (4 * src_stride);
1292 
1293  XORI_B4_128_SB(src3, src4, src5, src6);
1294 
1295  hz_out3 = HORIZ_6TAP_FILT(src3, src3, mask0, mask1, mask2, filt_hz0,
1296  filt_hz1, filt_hz2);
1297  vec1 = (v8i16) __msa_ilvev_b((v16i8) hz_out3, (v16i8) hz_out2);
1298  tmp0 = FILT_4TAP_DPADD_S_H(vec0, vec1, filt_vt0, filt_vt1);
1299 
1300  hz_out0 = HORIZ_6TAP_FILT(src4, src4, mask0, mask1, mask2, filt_hz0,
1301  filt_hz1, filt_hz2);
1302  vec3 = (v8i16) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out3);
1303  tmp1 = FILT_4TAP_DPADD_S_H(vec2, vec3, filt_vt0, filt_vt1);
1304 
1305  hz_out1 = HORIZ_6TAP_FILT(src5, src5, mask0, mask1, mask2, filt_hz0,
1306  filt_hz1, filt_hz2);
1307  vec0 = (v8i16) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
1308  tmp2 = FILT_4TAP_DPADD_S_H(vec1, vec0, filt_vt0, filt_vt1);
1309 
1310  hz_out2 = HORIZ_6TAP_FILT(src6, src6, mask0, mask1, mask2, filt_hz0,
1311  filt_hz1, filt_hz2);
1312  ILVEV_B2_SH(hz_out3, hz_out0, hz_out1, hz_out2, vec1, vec2);
1313  tmp3 = FILT_4TAP_DPADD_S_H(vec1, vec2, filt_vt0, filt_vt1);
1314 
1315  SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7);
1316  SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
1317  out0 = PCKEV_XORI128_UB(tmp0, tmp1);
1318  out1 = PCKEV_XORI128_UB(tmp2, tmp3);
1319  ST8x4_UB(out0, out1, dst, dst_stride);
1320  dst += (4 * dst_stride);
1321  }
1322 }
1323 
/**
 * VP8 "put" motion compensation, 16 pixels wide, 6-tap horizontal and
 * 4-tap vertical filter.
 *
 * Implemented as two calls of the 8-wide routine: one on the block as
 * given, one on the columns starting 8 bytes further right.
 * All parameters are forwarded unchanged apart from that column offset.
 */
void ff_put_vp8_epel16_h6v4_msa(uint8_t *dst, ptrdiff_t dst_stride,
                                uint8_t *src, ptrdiff_t src_stride,
                                int height, int mx, int my)
{
    /* left 8 columns */
    ff_put_vp8_epel8_h6v4_msa(dst, dst_stride, src, src_stride, height,
                              mx, my);
    /* right 8 columns */
    ff_put_vp8_epel8_h6v4_msa(dst + 8, dst_stride, src + 8, src_stride,
                              height, mx, my);
}
1338 
1339 void ff_put_vp8_epel4_h4v6_msa(uint8_t *dst, ptrdiff_t dst_stride,
1340  uint8_t *src, ptrdiff_t src_stride,
1341  int height, int mx, int my)
1342 {
1343  uint32_t loop_cnt;
1344  const int8_t *filter_horiz = subpel_filters_msa[mx - 1];
1345  const int8_t *filter_vert = subpel_filters_msa[my - 1];
1346  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
1347  v16i8 filt_hz0, filt_hz1, mask0, mask1;
1348  v16u8 out;
1349  v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
1350  v8i16 hz_out7, tmp0, tmp1, out0, out1, out2, out3;
1351  v8i16 filt, filt_vt0, filt_vt1, filt_vt2;
1352 
1353  mask0 = LD_SB(&mc_filt_mask_arr[16]);
1354 
1355  src -= (1 + 2 * src_stride);
1356 
1357  /* rearranging filter */
1358  filt = LD_SH(filter_horiz);
1359  SPLATI_H2_SB(filt, 0, 1, filt_hz0, filt_hz1);
1360 
1361  mask1 = mask0 + 2;
1362 
1363  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
1364  src += (5 * src_stride);
1365 
1366  XORI_B5_128_SB(src0, src1, src2, src3, src4);
1367  hz_out0 = HORIZ_4TAP_FILT(src0, src1, mask0, mask1, filt_hz0, filt_hz1);
1368  hz_out2 = HORIZ_4TAP_FILT(src2, src3, mask0, mask1, filt_hz0, filt_hz1);
1369  hz_out3 = HORIZ_4TAP_FILT(src3, src4, mask0, mask1, filt_hz0, filt_hz1);
1370  hz_out1 = (v8i16) __msa_sldi_b((v16i8) hz_out2, (v16i8) hz_out0, 8);
1371  ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1);
1372 
1373  filt = LD_SH(filter_vert);
1374  SPLATI_H3_SH(filt, 0, 1, 2, filt_vt0, filt_vt1, filt_vt2);
1375 
1376  for (loop_cnt = (height >> 2); loop_cnt--;) {
1377  LD_SB4(src, src_stride, src5, src6, src7, src8);
1378  XORI_B4_128_SB(src5, src6, src7, src8);
1379  src += (4 * src_stride);
1380 
1381  hz_out5 = HORIZ_4TAP_FILT(src5, src6, mask0, mask1, filt_hz0, filt_hz1);
1382  hz_out4 = (v8i16) __msa_sldi_b((v16i8) hz_out5, (v16i8) hz_out3, 8);
1383  out2 = (v8i16) __msa_ilvev_b((v16i8) hz_out5, (v16i8) hz_out4);
1384  tmp0 = DPADD_SH3_SH(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2);
1385 
1386  hz_out7 = HORIZ_4TAP_FILT(src7, src8, mask0, mask1, filt_hz0, filt_hz1);
1387  hz_out6 = (v8i16) __msa_sldi_b((v16i8) hz_out7, (v16i8) hz_out5, 8);
1388  out3 = (v8i16) __msa_ilvev_b((v16i8) hz_out7, (v16i8) hz_out6);
1389  tmp1 = DPADD_SH3_SH(out1, out2, out3, filt_vt0, filt_vt1, filt_vt2);
1390 
1391  SRARI_H2_SH(tmp0, tmp1, 7);
1392  SAT_SH2_SH(tmp0, tmp1, 7);
1393  out = PCKEV_XORI128_UB(tmp0, tmp1);
1394  ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
1395  dst += (4 * dst_stride);
1396 
1397  hz_out3 = hz_out7;
1398  out0 = out2;
1399  out1 = out3;
1400  }
1401 }
1402 
1403 void ff_put_vp8_epel8_h4v6_msa(uint8_t *dst, ptrdiff_t dst_stride,
1404  uint8_t *src, ptrdiff_t src_stride,
1405  int height, int mx, int my)
1406 {
1407  uint32_t loop_cnt;
1408  const int8_t *filter_horiz = subpel_filters_msa[mx - 1];
1409  const int8_t *filter_vert = subpel_filters_msa[my - 1];
1410  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
1411  v16i8 filt_hz0, filt_hz1, mask0, mask1;
1412  v8i16 filt, filt_vt0, filt_vt1, filt_vt2, tmp0, tmp1, tmp2, tmp3;
1413  v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
1414  v8i16 hz_out7, hz_out8, out0, out1, out2, out3, out4, out5, out6, out7;
1415  v16u8 vec0, vec1;
1416 
1417  mask0 = LD_SB(&mc_filt_mask_arr[0]);
1418  src -= (1 + 2 * src_stride);
1419 
1420  /* rearranging filter */
1421  filt = LD_SH(filter_horiz);
1422  SPLATI_H2_SB(filt, 0, 1, filt_hz0, filt_hz1);
1423 
1424  mask1 = mask0 + 2;
1425 
1426  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
1427  src += (5 * src_stride);
1428 
1429  XORI_B5_128_SB(src0, src1, src2, src3, src4);
1430  hz_out0 = HORIZ_4TAP_FILT(src0, src0, mask0, mask1, filt_hz0, filt_hz1);
1431  hz_out1 = HORIZ_4TAP_FILT(src1, src1, mask0, mask1, filt_hz0, filt_hz1);
1432  hz_out2 = HORIZ_4TAP_FILT(src2, src2, mask0, mask1, filt_hz0, filt_hz1);
1433  hz_out3 = HORIZ_4TAP_FILT(src3, src3, mask0, mask1, filt_hz0, filt_hz1);
1434  hz_out4 = HORIZ_4TAP_FILT(src4, src4, mask0, mask1, filt_hz0, filt_hz1);
1435  ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1);
1436  ILVEV_B2_SH(hz_out1, hz_out2, hz_out3, hz_out4, out3, out4);
1437 
1438  filt = LD_SH(filter_vert);
1439  SPLATI_H3_SH(filt, 0, 1, 2, filt_vt0, filt_vt1, filt_vt2);
1440 
1441  for (loop_cnt = (height >> 2); loop_cnt--;) {
1442  LD_SB4(src, src_stride, src5, src6, src7, src8);
1443  src += (4 * src_stride);
1444 
1445  XORI_B4_128_SB(src5, src6, src7, src8);
1446 
1447  hz_out5 = HORIZ_4TAP_FILT(src5, src5, mask0, mask1, filt_hz0, filt_hz1);
1448  out2 = (v8i16) __msa_ilvev_b((v16i8) hz_out5, (v16i8) hz_out4);
1449  tmp0 = DPADD_SH3_SH(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2);
1450 
1451  hz_out6 = HORIZ_4TAP_FILT(src6, src6, mask0, mask1, filt_hz0, filt_hz1);
1452  out5 = (v8i16) __msa_ilvev_b((v16i8) hz_out6, (v16i8) hz_out5);
1453  tmp1 = DPADD_SH3_SH(out3, out4, out5, filt_vt0, filt_vt1, filt_vt2);
1454 
1455  hz_out7 = HORIZ_4TAP_FILT(src7, src7, mask0, mask1, filt_hz0, filt_hz1);
1456  out6 = (v8i16) __msa_ilvev_b((v16i8) hz_out7, (v16i8) hz_out6);
1457  tmp2 = DPADD_SH3_SH(out1, out2, out6, filt_vt0, filt_vt1, filt_vt2);
1458 
1459  hz_out8 = HORIZ_4TAP_FILT(src8, src8, mask0, mask1, filt_hz0, filt_hz1);
1460  out7 = (v8i16) __msa_ilvev_b((v16i8) hz_out8, (v16i8) hz_out7);
1461  tmp3 = DPADD_SH3_SH(out4, out5, out7, filt_vt0, filt_vt1, filt_vt2);
1462 
1463  SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7);
1464  SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
1465  vec0 = PCKEV_XORI128_UB(tmp0, tmp1);
1466  vec1 = PCKEV_XORI128_UB(tmp2, tmp3);
1467  ST8x4_UB(vec0, vec1, dst, dst_stride);
1468  dst += (4 * dst_stride);
1469 
1470  hz_out4 = hz_out8;
1471  out0 = out2;
1472  out1 = out6;
1473  out3 = out5;
1474  out4 = out7;
1475  }
1476 }
1477 
/**
 * VP8 "put" motion compensation, 16 pixels wide, 4-tap horizontal and
 * 6-tap vertical filter.
 *
 * Implemented as two calls of the 8-wide routine: one on the block as
 * given, one on the columns starting 8 bytes further right.
 * All parameters are forwarded unchanged apart from that column offset.
 */
void ff_put_vp8_epel16_h4v6_msa(uint8_t *dst, ptrdiff_t dst_stride,
                                uint8_t *src, ptrdiff_t src_stride,
                                int height, int mx, int my)
{
    /* left 8 columns */
    ff_put_vp8_epel8_h4v6_msa(dst, dst_stride, src, src_stride, height,
                              mx, my);
    /* right 8 columns */
    ff_put_vp8_epel8_h4v6_msa(dst + 8, dst_stride, src + 8, src_stride,
                              height, mx, my);
}
1492 
1493 static void common_hz_2t_4x4_msa(uint8_t *src, int32_t src_stride,
1494  uint8_t *dst, int32_t dst_stride,
1495  const int8_t *filter)
1496 {
1497  v16i8 src0, src1, src2, src3, mask;
1498  v16u8 filt0, vec0, vec1, res0, res1;
1499  v8u16 vec2, vec3, filt;
1500 
1501  mask = LD_SB(&mc_filt_mask_arr[16]);
1502 
1503  /* rearranging filter */
1504  filt = LD_UH(filter);
1505  filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
1506 
1507  LD_SB4(src, src_stride, src0, src1, src2, src3);
1508  VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
1509  DOTP_UB2_UH(vec0, vec1, filt0, filt0, vec2, vec3);
1510  SRARI_H2_UH(vec2, vec3, 7);
1511  PCKEV_B2_UB(vec2, vec2, vec3, vec3, res0, res1);
1512  ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
1513 }
1514 
1515 static void common_hz_2t_4x8_msa(uint8_t *src, int32_t src_stride,
1516  uint8_t *dst, int32_t dst_stride,
1517  const int8_t *filter)
1518 {
1519  v16u8 vec0, vec1, vec2, vec3, filt0;
1520  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
1521  v16i8 res0, res1, res2, res3;
1522  v8u16 vec4, vec5, vec6, vec7, filt;
1523 
1524  mask = LD_SB(&mc_filt_mask_arr[16]);
1525 
1526  /* rearranging filter */
1527  filt = LD_UH(filter);
1528  filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
1529 
1530  LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
1531  VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
1532  VSHF_B2_UB(src4, src5, src6, src7, mask, mask, vec2, vec3);
1533  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
1534  vec4, vec5, vec6, vec7);
1535  SRARI_H4_UH(vec4, vec5, vec6, vec7, 7);
1536  PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7,
1537  res0, res1, res2, res3);
1538  ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
1539  dst += (4 * dst_stride);
1540  ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride);
1541 }
1542 
1543 void ff_put_vp8_bilinear4_h_msa(uint8_t *dst, ptrdiff_t dst_stride,
1544  uint8_t *src, ptrdiff_t src_stride,
1545  int height, int mx, int my)
1546 {
1547  const int8_t *filter = bilinear_filters_msa[mx - 1];
1548 
1549  if (4 == height) {
1550  common_hz_2t_4x4_msa(src, src_stride, dst, dst_stride, filter);
1551  } else if (8 == height) {
1552  common_hz_2t_4x8_msa(src, src_stride, dst, dst_stride, filter);
1553  }
1554 }
1555 
1556 static void common_hz_2t_8x4_msa(uint8_t *src, int32_t src_stride,
1557  uint8_t *dst, int32_t dst_stride,
1558  const int8_t *filter)
1559 {
1560  v16u8 filt0;
1561  v16i8 src0, src1, src2, src3, mask;
1562  v8u16 vec0, vec1, vec2, vec3, filt;
1563 
1564  mask = LD_SB(&mc_filt_mask_arr[0]);
1565 
1566  /* rearranging filter */
1567  filt = LD_UH(filter);
1568  filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
1569 
1570  LD_SB4(src, src_stride, src0, src1, src2, src3);
1571  VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
1572  VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
1573  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
1574  vec0, vec1, vec2, vec3);
1575  SRARI_H4_UH(vec0, vec1, vec2, vec3, 7);
1576  PCKEV_B2_SB(vec1, vec0, vec3, vec2, src0, src1);
1577  ST8x4_UB(src0, src1, dst, dst_stride);
1578 }
1579 
1580 static void common_hz_2t_8x8mult_msa(uint8_t *src, int32_t src_stride,
1581  uint8_t *dst, int32_t dst_stride,
1582  const int8_t *filter, int32_t height)
1583 {
1584  v16u8 filt0;
1585  v16i8 src0, src1, src2, src3, mask, out0, out1;
1586  v8u16 vec0, vec1, vec2, vec3, filt;
1587 
1588  mask = LD_SB(&mc_filt_mask_arr[0]);
1589 
1590  /* rearranging filter */
1591  filt = LD_UH(filter);
1592  filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
1593 
1594  LD_SB4(src, src_stride, src0, src1, src2, src3);
1595  src += (4 * src_stride);
1596 
1597  VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
1598  VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
1599  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
1600  vec0, vec1, vec2, vec3);
1601  SRARI_H4_UH(vec0, vec1, vec2, vec3, 7);
1602 
1603  LD_SB4(src, src_stride, src0, src1, src2, src3);
1604  src += (4 * src_stride);
1605 
1606  PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
1607  ST8x4_UB(out0, out1, dst, dst_stride);
1608  dst += (4 * dst_stride);
1609 
1610  VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
1611  VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
1612  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
1613  vec0, vec1, vec2, vec3);
1614  SRARI_H4_UH(vec0, vec1, vec2, vec3, 7);
1615  PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
1616  ST8x4_UB(out0, out1, dst, dst_stride);
1617  dst += (4 * dst_stride);
1618 
1619  if (16 == height) {
1620  LD_SB4(src, src_stride, src0, src1, src2, src3);
1621  src += (4 * src_stride);
1622 
1623  VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
1624  VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
1625  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
1626  vec0, vec1, vec2, vec3);
1627  SRARI_H4_UH(vec0, vec1, vec2, vec3, 7);
1628  LD_SB4(src, src_stride, src0, src1, src2, src3);
1629  src += (4 * src_stride);
1630 
1631  PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
1632  ST8x4_UB(out0, out1, dst, dst_stride);
1633 
1634  VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
1635  VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
1636  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
1637  vec0, vec1, vec2, vec3);
1638  SRARI_H4_UH(vec0, vec1, vec2, vec3, 7);
1639  PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
1640  ST8x4_UB(out0, out1, dst + 4 * dst_stride, dst_stride);
1641  }
1642 }
1643 
1644 void ff_put_vp8_bilinear8_h_msa(uint8_t *dst, ptrdiff_t dst_stride,
1645  uint8_t *src, ptrdiff_t src_stride,
1646  int height, int mx, int my)
1647 {
1648  const int8_t *filter = bilinear_filters_msa[mx - 1];
1649 
1650  if (4 == height) {
1651  common_hz_2t_8x4_msa(src, src_stride, dst, dst_stride, filter);
1652  } else {
1653  common_hz_2t_8x8mult_msa(src, src_stride, dst, dst_stride, filter,
1654  height);
1655  }
1656 }
1657 
1658 void ff_put_vp8_bilinear16_h_msa(uint8_t *dst, ptrdiff_t dst_stride,
1659  uint8_t *src, ptrdiff_t src_stride,
1660  int height, int mx, int my)
1661 {
1662  uint32_t loop_cnt;
1663  const int8_t *filter = bilinear_filters_msa[mx - 1];
1664  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
1665  v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1666  v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt;
1667 
1668  mask = LD_SB(&mc_filt_mask_arr[0]);
1669 
1670  loop_cnt = (height >> 2) - 1;
1671 
1672  /* rearranging filter */
1673  filt = LD_UH(filter);
1674  filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
1675 
1676  LD_SB4(src, src_stride, src0, src2, src4, src6);
1677  LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
1678  src += (4 * src_stride);
1679 
1680  VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
1681  VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
1682  VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
1683  VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
1684  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
1685  out0, out1, out2, out3);
1686  DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
1687  out4, out5, out6, out7);
1688  SRARI_H4_UH(out0, out1, out2, out3, 7);
1689  SRARI_H4_UH(out4, out5, out6, out7, 7);
1690  PCKEV_ST_SB(out0, out1, dst);
1691  dst += dst_stride;
1692  PCKEV_ST_SB(out2, out3, dst);
1693  dst += dst_stride;
1694  PCKEV_ST_SB(out4, out5, dst);
1695  dst += dst_stride;
1696  PCKEV_ST_SB(out6, out7, dst);
1697  dst += dst_stride;
1698 
1699  for (; loop_cnt--;) {
1700  LD_SB4(src, src_stride, src0, src2, src4, src6);
1701  LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
1702  src += (4 * src_stride);
1703 
1704  VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
1705  VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
1706  VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
1707  VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
1708  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
1709  out0, out1, out2, out3);
1710  DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
1711  out4, out5, out6, out7);
1712  SRARI_H4_UH(out0, out1, out2, out3, 7);
1713  SRARI_H4_UH(out4, out5, out6, out7, 7);
1714  PCKEV_ST_SB(out0, out1, dst);
1715  dst += dst_stride;
1716  PCKEV_ST_SB(out2, out3, dst);
1717  dst += dst_stride;
1718  PCKEV_ST_SB(out4, out5, dst);
1719  dst += dst_stride;
1720  PCKEV_ST_SB(out6, out7, dst);
1721  dst += dst_stride;
1722  }
1723 }
1724 
1725 static void common_vt_2t_4x4_msa(uint8_t *src, int32_t src_stride,
1726  uint8_t *dst, int32_t dst_stride,
1727  const int8_t *filter)
1728 {
1729  v16i8 src0, src1, src2, src3, src4;
1730  v16i8 src10_r, src32_r, src21_r, src43_r, src2110, src4332;
1731  v16u8 filt0;
1732  v8i16 filt;
1733  v8u16 tmp0, tmp1;
1734 
1735  filt = LD_SH(filter);
1736  filt0 = (v16u8) __msa_splati_h(filt, 0);
1737 
1738  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
1739  src += (5 * src_stride);
1740 
1741  ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
1742  src10_r, src21_r, src32_r, src43_r);
1743  ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
1744  DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1);
1745  SRARI_H2_UH(tmp0, tmp1, 7);
1746  SAT_UH2_UH(tmp0, tmp1, 7);
1747  src2110 = __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
1748  ST4x4_UB(src2110, src2110, 0, 1, 2, 3, dst, dst_stride);
1749 }
1750 
1751 static void common_vt_2t_4x8_msa(uint8_t *src, int32_t src_stride,
1752  uint8_t *dst, int32_t dst_stride,
1753  const int8_t *filter)
1754 {
1755  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
1756  v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r;
1757  v16i8 src65_r, src87_r, src2110, src4332, src6554, src8776;
1758  v8u16 tmp0, tmp1, tmp2, tmp3;
1759  v16u8 filt0;
1760  v8i16 filt;
1761 
1762  filt = LD_SH(filter);
1763  filt0 = (v16u8) __msa_splati_h(filt, 0);
1764 
1765  LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
1766  src += (8 * src_stride);
1767 
1768  src8 = LD_SB(src);
1769  src += src_stride;
1770 
1771  ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
1772  src32_r, src43_r);
1773  ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
1774  src76_r, src87_r);
1775  ILVR_D4_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r,
1776  src87_r, src76_r, src2110, src4332, src6554, src8776);
1777  DOTP_UB4_UH(src2110, src4332, src6554, src8776, filt0, filt0, filt0, filt0,
1778  tmp0, tmp1, tmp2, tmp3);
1779  SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
1780  SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
1781  PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, src2110, src4332);
1782  ST4x4_UB(src2110, src2110, 0, 1, 2, 3, dst, dst_stride);
1783  ST4x4_UB(src4332, src4332, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride);
1784 }
1785 
1786 void ff_put_vp8_bilinear4_v_msa(uint8_t *dst, ptrdiff_t dst_stride,
1787  uint8_t *src, ptrdiff_t src_stride,
1788  int height, int mx, int my)
1789 {
1790  const int8_t *filter = bilinear_filters_msa[my - 1];
1791 
1792  if (4 == height) {
1793  common_vt_2t_4x4_msa(src, src_stride, dst, dst_stride, filter);
1794  } else if (8 == height) {
1795  common_vt_2t_4x8_msa(src, src_stride, dst, dst_stride, filter);
1796  }
1797 }
1798 
1799 static void common_vt_2t_8x4_msa(uint8_t *src, int32_t src_stride,
1800  uint8_t *dst, int32_t dst_stride,
1801  const int8_t *filter)
1802 {
1803  v16u8 src0, src1, src2, src3, src4, vec0, vec1, vec2, vec3, filt0;
1804  v16i8 out0, out1;
1805  v8u16 tmp0, tmp1, tmp2, tmp3;
1806  v8i16 filt;
1807 
1808  /* rearranging filter_y */
1809  filt = LD_SH(filter);
1810  filt0 = (v16u8) __msa_splati_h(filt, 0);
1811 
1812  LD_UB5(src, src_stride, src0, src1, src2, src3, src4);
1813  ILVR_B2_UB(src1, src0, src2, src1, vec0, vec1);
1814  ILVR_B2_UB(src3, src2, src4, src3, vec2, vec3);
1815  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
1816  tmp0, tmp1, tmp2, tmp3);
1817  SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
1818  SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
1819  PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
1820  ST8x4_UB(out0, out1, dst, dst_stride);
1821 }
1822 
1823 static void common_vt_2t_8x8mult_msa(uint8_t *src, int32_t src_stride,
1824  uint8_t *dst, int32_t dst_stride,
1825  const int8_t *filter, int32_t height)
1826 {
1827  uint32_t loop_cnt;
1828  v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
1829  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
1830  v16i8 out0, out1;
1831  v8u16 tmp0, tmp1, tmp2, tmp3;
1832  v8i16 filt;
1833 
1834  /* rearranging filter_y */
1835  filt = LD_SH(filter);
1836  filt0 = (v16u8) __msa_splati_h(filt, 0);
1837 
1838  src0 = LD_UB(src);
1839  src += src_stride;
1840 
1841  for (loop_cnt = (height >> 3); loop_cnt--;) {
1842  LD_UB8(src, src_stride, src1, src2, src3, src4, src5, src6, src7, src8);
1843  src += (8 * src_stride);
1844 
1845  ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3,
1846  vec0, vec1, vec2, vec3);
1847  ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7,
1848  vec4, vec5, vec6, vec7);
1849  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
1850  tmp0, tmp1, tmp2, tmp3);
1851  SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
1852  SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
1853  PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
1854  ST8x4_UB(out0, out1, dst, dst_stride);
1855  dst += (4 * dst_stride);
1856 
1857  DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
1858  tmp0, tmp1, tmp2, tmp3);
1859  SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
1860  SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
1861  PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
1862  ST8x4_UB(out0, out1, dst, dst_stride);
1863  dst += (4 * dst_stride);
1864 
1865  src0 = src8;
1866  }
1867 }
1868 
1869 void ff_put_vp8_bilinear8_v_msa(uint8_t *dst, ptrdiff_t dst_stride,
1870  uint8_t *src, ptrdiff_t src_stride,
1871  int height, int mx, int my)
1872 {
1873  const int8_t *filter = bilinear_filters_msa[my - 1];
1874 
1875  if (4 == height) {
1876  common_vt_2t_8x4_msa(src, src_stride, dst, dst_stride, filter);
1877  } else {
1878  common_vt_2t_8x8mult_msa(src, src_stride, dst, dst_stride, filter,
1879  height);
1880  }
1881 }
1882 
1883 void ff_put_vp8_bilinear16_v_msa(uint8_t *dst, ptrdiff_t dst_stride,
1884  uint8_t *src, ptrdiff_t src_stride,
1885  int height, int mx, int my)
1886 {
1887  uint32_t loop_cnt;
1888  const int8_t *filter = bilinear_filters_msa[my - 1];
1889  v16u8 src0, src1, src2, src3, src4;
1890  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
1891  v8u16 tmp0, tmp1, tmp2, tmp3;
1892  v8i16 filt;
1893 
1894  /* rearranging filter_y */
1895  filt = LD_SH(filter);
1896  filt0 = (v16u8) __msa_splati_h(filt, 0);
1897 
1898  src0 = LD_UB(src);
1899  src += src_stride;
1900 
1901  for (loop_cnt = (height >> 2); loop_cnt--;) {
1902  LD_UB4(src, src_stride, src1, src2, src3, src4);
1903  src += (4 * src_stride);
1904 
1905  ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
1906  ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
1907  DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
1908  SRARI_H2_UH(tmp0, tmp1, 7);
1909  SAT_UH2_UH(tmp0, tmp1, 7);
1910  PCKEV_ST_SB(tmp0, tmp1, dst);
1911  dst += dst_stride;
1912 
1913  ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6);
1914  ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
1915  DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
1916  SRARI_H2_UH(tmp2, tmp3, 7);
1917  SAT_UH2_UH(tmp2, tmp3, 7);
1918  PCKEV_ST_SB(tmp2, tmp3, dst);
1919  dst += dst_stride;
1920 
1921  DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
1922  SRARI_H2_UH(tmp0, tmp1, 7);
1923  SAT_UH2_UH(tmp0, tmp1, 7);
1924  PCKEV_ST_SB(tmp0, tmp1, dst);
1925  dst += dst_stride;
1926 
1927  DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
1928  SRARI_H2_UH(tmp2, tmp3, 7);
1929  SAT_UH2_UH(tmp2, tmp3, 7);
1930  PCKEV_ST_SB(tmp2, tmp3, dst);
1931  dst += dst_stride;
1932 
1933  src0 = src4;
1934  }
1935 }
1936 
1938  uint8_t *dst, int32_t dst_stride,
1939  const int8_t *filter_horiz,
1940  const int8_t *filter_vert)
1941 {
1942  v16i8 src0, src1, src2, src3, src4, mask;
1943  v16u8 filt_vt, filt_hz, vec0, vec1, res0, res1;
1944  v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, filt, tmp0, tmp1;
1945 
1946  mask = LD_SB(&mc_filt_mask_arr[16]);
1947 
1948  /* rearranging filter */
1949  filt = LD_UH(filter_horiz);
1950  filt_hz = (v16u8) __msa_splati_h((v8i16) filt, 0);
1951 
1952  filt = LD_UH(filter_vert);
1953  filt_vt = (v16u8) __msa_splati_h((v8i16) filt, 0);
1954 
1955  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
1956  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, 7);
1957  hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, 7);
1958  hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, 7);
1959  hz_out1 = (v8u16) __msa_sldi_b((v16i8) hz_out2, (v16i8) hz_out0, 8);
1960  hz_out3 = (v8u16) __msa_pckod_d((v2i64) hz_out4, (v2i64) hz_out2);
1961 
1962  ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
1963  DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
1964  SRARI_H2_UH(tmp0, tmp1, 7);
1965  SAT_UH2_UH(tmp0, tmp1, 7);
1966  PCKEV_B2_UB(tmp0, tmp0, tmp1, tmp1, res0, res1);
1967  ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
1968 }
1969 
1971  uint8_t *dst, int32_t dst_stride,
1972  const int8_t *filter_horiz,
1973  const int8_t *filter_vert)
1974 {
1975  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask;
1976  v16i8 res0, res1, res2, res3;
1977  v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3;
1978  v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
1979  v8u16 hz_out7, hz_out8, vec4, vec5, vec6, vec7, filt;
1980 
1981  mask = LD_SB(&mc_filt_mask_arr[16]);
1982 
1983  /* rearranging filter */
1984  filt = LD_UH(filter_horiz);
1985  filt_hz = (v16u8) __msa_splati_h((v8i16) filt, 0);
1986 
1987  filt = LD_UH(filter_vert);
1988  filt_vt = (v16u8) __msa_splati_h((v8i16) filt, 0);
1989 
1990  LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
1991  src += (8 * src_stride);
1992  src8 = LD_SB(src);
1993 
1994  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, 7);
1995  hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, 7);
1996  hz_out4 = HORIZ_2TAP_FILT_UH(src4, src5, mask, filt_hz, 7);
1997  hz_out6 = HORIZ_2TAP_FILT_UH(src6, src7, mask, filt_hz, 7);
1998  hz_out8 = HORIZ_2TAP_FILT_UH(src8, src8, mask, filt_hz, 7);
1999  SLDI_B3_UH(hz_out2, hz_out4, hz_out6, hz_out0, hz_out2, hz_out4, hz_out1,
2000  hz_out3, hz_out5, 8);
2001  hz_out7 = (v8u16) __msa_pckod_d((v2i64) hz_out8, (v2i64) hz_out6);
2002 
2003  ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
2004  ILVEV_B2_UB(hz_out4, hz_out5, hz_out6, hz_out7, vec2, vec3);
2005  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt_vt, filt_vt, filt_vt, filt_vt,
2006  vec4, vec5, vec6, vec7);
2007  SRARI_H4_UH(vec4, vec5, vec6, vec7, 7);
2008  SAT_UH4_UH(vec4, vec5, vec6, vec7, 7);
2009  PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7,
2010  res0, res1, res2, res3);
2011  ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
2012  dst += (4 * dst_stride);
2013  ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride);
2014 }
2015 
2016 void ff_put_vp8_bilinear4_hv_msa(uint8_t *dst, ptrdiff_t dst_stride,
2017  uint8_t *src, ptrdiff_t src_stride,
2018  int height, int mx, int my)
2019 {
2020  const int8_t *filter_horiz = bilinear_filters_msa[mx - 1];
2021  const int8_t *filter_vert = bilinear_filters_msa[my - 1];
2022 
2023  if (4 == height) {
2024  common_hv_2ht_2vt_4x4_msa(src, src_stride, dst, dst_stride,
2025  filter_horiz, filter_vert);
2026  } else if (8 == height) {
2027  common_hv_2ht_2vt_4x8_msa(src, src_stride, dst, dst_stride,
2028  filter_horiz, filter_vert);
2029  }
2030 }
2031 
2033  uint8_t *dst, int32_t dst_stride,
2034  const int8_t *filter_horiz,
2035  const int8_t *filter_vert)
2036 {
2037  v16i8 src0, src1, src2, src3, src4, mask, out0, out1;
2038  v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3;
2039  v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;
2040  v8i16 filt;
2041 
2042  mask = LD_SB(&mc_filt_mask_arr[0]);
2043 
2044  /* rearranging filter */
2045  filt = LD_SH(filter_horiz);
2046  filt_hz = (v16u8) __msa_splati_h(filt, 0);
2047 
2048  filt = LD_SH(filter_vert);
2049  filt_vt = (v16u8) __msa_splati_h(filt, 0);
2050 
2051  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
2052 
2053  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, 7);
2054  hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, 7);
2055  vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
2056  tmp0 = __msa_dotp_u_h(vec0, filt_vt);
2057 
2058  hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, 7);
2059  vec1 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
2060  tmp1 = __msa_dotp_u_h(vec1, filt_vt);
2061 
2062  hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, 7);
2063  vec2 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
2064  tmp2 = __msa_dotp_u_h(vec2, filt_vt);
2065 
2066  hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, 7);
2067  vec3 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
2068  tmp3 = __msa_dotp_u_h(vec3, filt_vt);
2069 
2070  SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
2071  SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
2072  PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
2073  ST8x4_UB(out0, out1, dst, dst_stride);
2074 }
2075 
2077  uint8_t *dst, int32_t dst_stride,
2078  const int8_t *filter_horiz,
2079  const int8_t *filter_vert,
2080  int32_t height)
2081 {
2082  uint32_t loop_cnt;
2083  v16i8 src0, src1, src2, src3, src4, mask, out0, out1;
2084  v16u8 filt_hz, filt_vt, vec0;
2085  v8u16 hz_out0, hz_out1, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8;
2086  v8i16 filt;
2087 
2088  mask = LD_SB(&mc_filt_mask_arr[0]);
2089 
2090  /* rearranging filter */
2091  filt = LD_SH(filter_horiz);
2092  filt_hz = (v16u8) __msa_splati_h(filt, 0);
2093 
2094  filt = LD_SH(filter_vert);
2095  filt_vt = (v16u8) __msa_splati_h(filt, 0);
2096 
2097  src0 = LD_SB(src);
2098  src += src_stride;
2099 
2100  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, 7);
2101 
2102  for (loop_cnt = (height >> 3); loop_cnt--;) {
2103  LD_SB4(src, src_stride, src1, src2, src3, src4);
2104  src += (4 * src_stride);
2105 
2106  hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, 7);
2107  vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
2108  tmp1 = __msa_dotp_u_h(vec0, filt_vt);
2109 
2110  hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, 7);
2111  vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
2112  tmp2 = __msa_dotp_u_h(vec0, filt_vt);
2113 
2114  SRARI_H2_UH(tmp1, tmp2, 7);
2115  SAT_UH2_UH(tmp1, tmp2, 7);
2116 
2117  hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, 7);
2118  vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
2119  tmp3 = __msa_dotp_u_h(vec0, filt_vt);
2120 
2121  hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, 7);
2122  LD_SB4(src, src_stride, src1, src2, src3, src4);
2123  src += (4 * src_stride);
2124  vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
2125  tmp4 = __msa_dotp_u_h(vec0, filt_vt);
2126 
2127  SRARI_H2_UH(tmp3, tmp4, 7);
2128  SAT_UH2_UH(tmp3, tmp4, 7);
2129  PCKEV_B2_SB(tmp2, tmp1, tmp4, tmp3, out0, out1);
2130  ST8x4_UB(out0, out1, dst, dst_stride);
2131  dst += (4 * dst_stride);
2132 
2133  hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, 7);
2134  vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
2135  tmp5 = __msa_dotp_u_h(vec0, filt_vt);
2136 
2137  hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, 7);
2138  vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
2139  tmp6 = __msa_dotp_u_h(vec0, filt_vt);
2140 
2141  hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, 7);
2142  vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
2143  tmp7 = __msa_dotp_u_h(vec0, filt_vt);
2144 
2145  hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, 7);
2146  vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
2147  tmp8 = __msa_dotp_u_h(vec0, filt_vt);
2148 
2149  SRARI_H4_UH(tmp5, tmp6, tmp7, tmp8, 7);
2150  SAT_UH4_UH(tmp5, tmp6, tmp7, tmp8, 7);
2151  PCKEV_B2_SB(tmp6, tmp5, tmp8, tmp7, out0, out1);
2152  ST8x4_UB(out0, out1, dst, dst_stride);
2153  dst += (4 * dst_stride);
2154  }
2155 }
2156 
2157 void ff_put_vp8_bilinear8_hv_msa(uint8_t *dst, ptrdiff_t dst_stride,
2158  uint8_t *src, ptrdiff_t src_stride,
2159  int height, int mx, int my)
2160 {
2161  const int8_t *filter_horiz = bilinear_filters_msa[mx - 1];
2162  const int8_t *filter_vert = bilinear_filters_msa[my - 1];
2163 
2164  if (4 == height) {
2165  common_hv_2ht_2vt_8x4_msa(src, src_stride, dst, dst_stride,
2166  filter_horiz, filter_vert);
2167  } else {
2168  common_hv_2ht_2vt_8x8mult_msa(src, src_stride, dst, dst_stride,
2169  filter_horiz, filter_vert, height);
2170  }
2171 }
2172 
2173 void ff_put_vp8_bilinear16_hv_msa(uint8_t *dst, ptrdiff_t dst_stride,
2174  uint8_t *src, ptrdiff_t src_stride,
2175  int height, int mx, int my)
2176 {
2177  uint32_t loop_cnt;
2178  const int8_t *filter_horiz = bilinear_filters_msa[mx - 1];
2179  const int8_t *filter_vert = bilinear_filters_msa[my - 1];
2180  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
2181  v16u8 filt_hz, filt_vt, vec0, vec1;
2182  v8u16 tmp1, tmp2, hz_out0, hz_out1, hz_out2, hz_out3;
2183  v8i16 filt;
2184 
2185  mask = LD_SB(&mc_filt_mask_arr[0]);
2186 
2187  /* rearranging filter */
2188  filt = LD_SH(filter_horiz);
2189  filt_hz = (v16u8) __msa_splati_h(filt, 0);
2190 
2191  filt = LD_SH(filter_vert);
2192  filt_vt = (v16u8) __msa_splati_h(filt, 0);
2193 
2194  LD_SB2(src, 8, src0, src1);
2195  src += src_stride;
2196 
2197  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, 7);
2198  hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, 7);
2199 
2200 
2201  for (loop_cnt = (height >> 2); loop_cnt--;) {
2202  LD_SB4(src, src_stride, src0, src2, src4, src6);
2203  LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
2204  src += (4 * src_stride);
2205 
2206  hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, 7);
2207  hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, 7);
2208  ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
2209  DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
2210  SRARI_H2_UH(tmp1, tmp2, 7);
2211  SAT_UH2_UH(tmp1, tmp2, 7);
2212  PCKEV_ST_SB(tmp1, tmp2, dst);
2213  dst += dst_stride;
2214 
2215  hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, 7);
2216  hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, 7);
2217  ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
2218  DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
2219  SRARI_H2_UH(tmp1, tmp2, 7);
2220  SAT_UH2_UH(tmp1, tmp2, 7);
2221  PCKEV_ST_SB(tmp1, tmp2, dst);
2222  dst += dst_stride;
2223 
2224  hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, 7);
2225  hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, 7);
2226  ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
2227  DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
2228  SRARI_H2_UH(tmp1, tmp2, 7);
2229  SAT_UH2_UH(tmp1, tmp2, 7);
2230  PCKEV_ST_SB(tmp1, tmp2, dst);
2231  dst += dst_stride;
2232 
2233  hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, 7);
2234  hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, 7);
2235  ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
2236  DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
2237  SRARI_H2_UH(tmp1, tmp2, 7);
2238  SAT_UH2_UH(tmp1, tmp2, 7);
2239  PCKEV_ST_SB(tmp1, tmp2, dst);
2240  dst += dst_stride;
2241  }
2242 }
2243 
2244 void ff_put_vp8_pixels8_msa(uint8_t *dst, ptrdiff_t dst_stride,
2245  uint8_t *src, ptrdiff_t src_stride,
2246  int height, int mx, int my)
2247 {
2248  int32_t cnt;
2249  uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
2250  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
2251 
2252  if (0 == height % 8) {
2253  for (cnt = height >> 3; cnt--;) {
2254  LD_UB8(src, src_stride,
2255  src0, src1, src2, src3, src4, src5, src6, src7);
2256  src += (8 * src_stride);
2257 
2258  out0 = __msa_copy_u_d((v2i64) src0, 0);
2259  out1 = __msa_copy_u_d((v2i64) src1, 0);
2260  out2 = __msa_copy_u_d((v2i64) src2, 0);
2261  out3 = __msa_copy_u_d((v2i64) src3, 0);
2262  out4 = __msa_copy_u_d((v2i64) src4, 0);
2263  out5 = __msa_copy_u_d((v2i64) src5, 0);
2264  out6 = __msa_copy_u_d((v2i64) src6, 0);
2265  out7 = __msa_copy_u_d((v2i64) src7, 0);
2266 
2267  SD4(out0, out1, out2, out3, dst, dst_stride);
2268  dst += (4 * dst_stride);
2269  SD4(out4, out5, out6, out7, dst, dst_stride);
2270  dst += (4 * dst_stride);
2271  }
2272  } else if (0 == height % 4) {
2273  for (cnt = (height / 4); cnt--;) {
2274  LD_UB4(src, src_stride, src0, src1, src2, src3);
2275  src += (4 * src_stride);
2276  out0 = __msa_copy_u_d((v2i64) src0, 0);
2277  out1 = __msa_copy_u_d((v2i64) src1, 0);
2278  out2 = __msa_copy_u_d((v2i64) src2, 0);
2279  out3 = __msa_copy_u_d((v2i64) src3, 0);
2280 
2281  SD4(out0, out1, out2, out3, dst, dst_stride);
2282  dst += (4 * dst_stride);
2283  }
2284  }
2285 }
2286 
2287 static void copy_16multx8mult_msa(uint8_t *src, int32_t src_stride,
2288  uint8_t *dst, int32_t dst_stride,
2290 {
2291  int32_t cnt, loop_cnt;
2292  uint8_t *src_tmp, *dst_tmp;
2293  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
2294 
2295  for (cnt = (width >> 4); cnt--;) {
2296  src_tmp = src;
2297  dst_tmp = dst;
2298 
2299  for (loop_cnt = (height >> 3); loop_cnt--;) {
2300  LD_UB8(src_tmp, src_stride,
2301  src0, src1, src2, src3, src4, src5, src6, src7);
2302  src_tmp += (8 * src_stride);
2303 
2304  ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7,
2305  dst_tmp, dst_stride);
2306  dst_tmp += (8 * dst_stride);
2307  }
2308 
2309  src += 16;
2310  dst += 16;
2311  }
2312 }
2313 
2314 void ff_put_vp8_pixels16_msa(uint8_t *dst, ptrdiff_t dst_stride,
2315  uint8_t *src, ptrdiff_t src_stride,
2316  int height, int mx, int my)
2317 {
2318  int32_t cnt;
2319  v16u8 src0, src1, src2, src3;
2320 
2321  if (0 == height % 8) {
2322  copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 16);
2323  } else if (0 == height % 4) {
2324  for (cnt = (height >> 2); cnt--;) {
2325  LD_UB4(src, src_stride, src0, src1, src2, src3);
2326  src += (4 * src_stride);
2327 
2328  ST_UB4(src0, src1, src2, src3, dst, dst_stride);
2329  dst += (4 * dst_stride);
2330  }
2331  }
2332 }
void ff_put_vp8_epel4_h6v4_msa(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp8_mc_msa.c:1186
static void common_hz_4t_4x16_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
Definition: vp8_mc_msa.c:744
void ff_put_vp8_bilinear8_h_msa(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp8_mc_msa.c:1644
#define XORI_B5_128_SB(...)
#define XORI_B8_128_SB(...)
#define SPLATI_H3_SH(...)
void ff_put_vp8_epel16_h4v6_msa(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp8_mc_msa.c:1478
static void common_hz_2t_4x8_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
Definition: vp8_mc_msa.c:1515
static void common_hz_4t_4x8_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
Definition: vp8_mc_msa.c:708
static void common_vt_2t_4x8_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
Definition: vp8_mc_msa.c:1751
#define XORI_B2_128_SB(...)
#define PCKEV_XORI128_UB(in0, in1)
#define LD_SB(...)
#define XORI_B3_128_SB(...)
#define FILT_4TAP_DPADD_S_H(vec0, vec1, filt0, filt1)
Definition: vp8_mc_msa.c:107
void ff_put_vp8_pixels8_msa(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp8_mc_msa.c:2244
#define SLDI_B3_UH(...)
void ff_put_vp8_epel4_h4v4_msa(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp8_mc_msa.c:1045
void ff_put_vp8_epel8_h4_msa(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp8_mc_msa.c:809
#define LD_UB4(...)
#define ILVR_B2_SB(...)
#define SPLATI_H2_SH(...)
#define src
Definition: vp8dsp.c:254
#define LD_SB2(...)
#define ST4x4_UB(in0, in1, idx0, idx1, idx2, idx3, pdst, stride)
static void common_vt_2t_8x4_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
Definition: vp8_mc_msa.c:1799
void ff_put_vp8_bilinear4_hv_msa(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp8_mc_msa.c:2016
#define DPADD_SH3_SH(in0, in1, in2, coeff0, coeff1, coeff2)
#define XORI_B4_128_UB(...)
static void common_hv_2ht_2vt_8x4_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_horiz, const int8_t *filter_vert)
Definition: vp8_mc_msa.c:2032
void ff_put_vp8_epel8_v6_msa(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp8_mc_msa.c:380
void ff_put_vp8_epel16_v4_msa(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp8_mc_msa.c:986
static void filter(int16_t *output, ptrdiff_t out_stride, int16_t *low, ptrdiff_t low_stride, int16_t *high, ptrdiff_t high_stride, int len, int clip)
Definition: cfhd.c:114
#define PCKEV_ST_SB(in0, in1, pdst)
#define ILVR_D2_SB(...)
uint8_t
#define HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, filt0, filt1, filt2, out0, out1, out2, out3)
Definition: vp8_mc_msa.c:86
void ff_put_vp8_epel16_h4_msa(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp8_mc_msa.c:844
void ff_put_vp8_epel4_v6_msa(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp8_mc_msa.c:333
VP8 compatible video decoder.
static void common_vt_2t_4x4_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
Definition: vp8_mc_msa.c:1725
#define SRARI_H4_SH(...)
static void common_vt_2t_8x8mult_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: vp8_mc_msa.c:1823
#define XORI_B2_128_UB(...)
void ff_put_vp8_epel16_h6v6_msa(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp8_mc_msa.c:666
#define ILVL_B2_SB(...)
#define height
#define LD_SH(...)
static const int8_t bilinear_filters_msa[7][2]
Definition: vp8_mc_msa.c:44
#define LD_UB5(...)
#define ILVR_D4_SB(...)
#define LD_SB8(...)
void ff_put_vp8_bilinear16_hv_msa(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp8_mc_msa.c:2173
static void common_hz_2t_4x4_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
Definition: vp8_mc_msa.c:1493
#define PCKEV_B2_SB(...)
static void copy_16multx8mult_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height, int32_t width)
Definition: vp8_mc_msa.c:2287
void ff_put_vp8_bilinear8_v_msa(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp8_mc_msa.c:1869
static void common_hv_2ht_2vt_8x8mult_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_horiz, const int8_t *filter_vert, int32_t height)
Definition: vp8_mc_msa.c:2076
void ff_put_vp8_epel16_h4v4_msa(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp8_mc_msa.c:1171
static const int8_t subpel_filters_msa[7][8]
Definition: vp8_mc_msa.c:34
void ff_put_vp8_epel8_v4_msa(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp8_mc_msa.c:940
static const uint16_t mask[17]
Definition: lzw.c:38
#define HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0, filt1, out0, out1)
Definition: vp8_mc_msa.c:131
static void common_hz_4t_4x4_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
Definition: vp8_mc_msa.c:681
#define SPLATI_H2_SB(...)
uint16_t width
Definition: gdv.c:47
void ff_put_vp8_epel8_h6v6_msa(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp8_mc_msa.c:579
#define XORI_B4_128_SB(...)
void ff_put_vp8_bilinear4_h_msa(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp8_mc_msa.c:1543
void ff_put_vp8_bilinear4_v_msa(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp8_mc_msa.c:1786
#define HORIZ_6TAP_FILT(src0, src1, mask0, mask1, mask2, filt_h0, filt_h1, filt_h2)
Definition: vp8_mc_msa.c:54
void ff_put_vp8_pixels16_msa(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp8_mc_msa.c:2314
#define SRARI_H2_SH(...)
void ff_put_vp8_epel8_h4v6_msa(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp8_mc_msa.c:1403
#define ILVR_B4_UB(...)
void ff_put_vp8_bilinear8_hv_msa(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp8_mc_msa.c:2157
#define LD_UB8(...)
#define HORIZ_2TAP_FILT_UH(in0, in1, mask, coeff, shift)
static const uint8_t mc_filt_mask_arr[16 *3]
Definition: vp8_mc_msa.c:25
#define SRARI_H2_UH(...)
void ff_put_vp8_epel4_h6v6_msa(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp8_mc_msa.c:504
void ff_put_vp8_bilinear16_h_msa(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp8_mc_msa.c:1658
#define VSHF_B2_UH(...)
int32_t
#define PCKEV_B4_SB(...)
#define LD_SB3(...)
void ff_put_vp8_epel16_v6_msa(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp8_mc_msa.c:430
#define ST_UB(...)
#define HORIZ_4TAP_FILT(src0, src1, mask0, mask1, filt_h0, filt_h1)
Definition: vp8_mc_msa.c:117
#define SAT_SH4_SH(...)
#define LD_SB4(...)
#define PCKEV_B4_UB(...)
static void common_hz_6t_4x4_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
Definition: vp8_mc_msa.c:159
void ff_put_vp8_epel4_h4_msa(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp8_mc_msa.c:794
#define ST_UB8(...)
#define HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0, filt1, out0, out1, out2, out3)
Definition: vp8_mc_msa.c:143
#define ST_UB4(...)
void ff_put_vp8_epel16_h6_msa(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp8_mc_msa.c:284
#define src1
Definition: h264pred.c:139
#define ILVL_B4_SB(...)
#define SAT_SH2_SH(...)
void ff_put_vp8_epel8_h6_msa(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp8_mc_msa.c:236
void ff_put_vp8_epel8_h4v4_msa(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp8_mc_msa.c:1104
#define DOTP_UB2_UH(...)
#define SRARI_H4_UH(...)
#define src0
Definition: h264pred.c:138
static void common_hz_2t_8x8mult_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: vp8_mc_msa.c:1580
#define SD4(in0, in1, in2, in3, pdst, stride)
static const int8_t filt[NUMTAPS]
Definition: af_earwax.c:39
void ff_put_vp8_epel16_h6v4_msa(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp8_mc_msa.c:1324
#define HORIZ_6TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, filt0, filt1, filt2, out0, out1)
Definition: vp8_mc_msa.c:71
#define LD_SB5(...)
#define ILVEV_B2_SH(...)
#define ILVEV_B2_UB(...)
#define ST8x4_UB(in0, in1, pdst, stride)
#define ILVL_B2_UB(...)
#define SAT_UH2_UH(...)
#define SAT_UH4_UH(...)
#define LD_UB(...)
#define SPLATI_H3_SB(...)
#define DOTP_UB4_UH(...)
#define VSHF_B2_UB(...)
static void common_hz_6t_4x8_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
Definition: vp8_mc_msa.c:187
#define ILVR_B4_SB(...)
void ff_put_vp8_epel8_h6v4_msa(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp8_mc_msa.c:1251
FILE * out
Definition: movenc.c:54
static void common_hv_2ht_2vt_4x8_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_horiz, const int8_t *filter_vert)
Definition: vp8_mc_msa.c:1970
void ff_put_vp8_bilinear16_v_msa(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp8_mc_msa.c:1883
static void common_hz_2t_8x4_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
Definition: vp8_mc_msa.c:1556
void ff_put_vp8_epel4_v4_msa(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp8_mc_msa.c:893
void ff_put_vp8_epel4_h6_msa(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp8_mc_msa.c:223
#define LD_UH(...)
#define PCKEV_B2_UB(...)
#define ILVR_B2_UB(...)
static void common_hv_2ht_2vt_4x4_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_horiz, const int8_t *filter_vert)
Definition: vp8_mc_msa.c:1937
void ff_put_vp8_epel4_h4v6_msa(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp8_mc_msa.c:1339