hevc_mc_uni_msa.c
/*
 * Copyright (c) 2015 - 2017 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/mips/generic_macros_msa.h"
#include "libavcodec/mips/hevcdsp_mips.h"
#include "libavcodec/mips/hevc_macros_msa.h"

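/* Byte-shuffle indices for the VSHF.B-based horizontal filters below.
 * Each mask selects the eight overlapping (pixel, pixel + 1) byte pairs
 * that feed one column of the dot products; indices 16..31 pick bytes
 * from the second source vector of the shuffle, which is how the 4-width
 * cases pack two rows into one vector. */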
static const uint8_t ff_hevc_mask_arr[16 * 3] __attribute__((aligned(0x40))) = {
    /* 8 width cases */
    0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
    /* 4 width cases */
    0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20,
    /* 4 width cases */
    8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
};

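/* Horizontal 8-tap filter for 4-pixel-wide blocks: four input rows are
 * packed pairwise (src0/src1, src2/src3), shuffled through mask0..mask3
 * to form the tap neighbourhoods, and accumulated with signed byte dot
 * products into two vectors of eight 16-bit sums. */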
#define HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, \
                                   mask0, mask1, mask2, mask3, \
                                   filt0, filt1, filt2, filt3, \
                                   out0, out1) \
{ \
    v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \
 \
    VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0_m, vec1_m); \
    DOTP_SB2_SH(vec0_m, vec1_m, filt0, filt0, out0, out1); \
    VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2_m, vec3_m); \
    DPADD_SB2_SH(vec2_m, vec3_m, filt1, filt1, out0, out1); \
    VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4_m, vec5_m); \
    DPADD_SB2_SH(vec4_m, vec5_m, filt2, filt2, out0, out1); \
    VSHF_B2_SB(src0, src1, src2, src3, mask3, mask3, vec6_m, vec7_m); \
    DPADD_SB2_SH(vec6_m, vec7_m, filt3, filt3, out0, out1); \
}

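/* Same 8-tap horizontal filter for 8-pixel-wide blocks: each of the four
 * rows is shuffled against itself, producing four vectors of eight
 * 16-bit sums, one per row. */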
#define HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, \
                                   mask0, mask1, mask2, mask3, \
                                   filt0, filt1, filt2, filt3, \
                                   out0, out1, out2, out3) \
{ \
    v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \
 \
    VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m); \
    VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m); \
    DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0, \
                out0, out1, out2, out3); \
    VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0_m, vec1_m); \
    VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2_m, vec3_m); \
    DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt2, filt2, filt2, filt2, \
                 out0, out1, out2, out3); \
    VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4_m, vec5_m); \
    VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6_m, vec7_m); \
    DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt1, filt1, filt1, filt1, \
                 out0, out1, out2, out3); \
    VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec4_m, vec5_m); \
    VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec6_m, vec7_m); \
    DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt3, filt3, filt3, filt3, \
                 out0, out1, out2, out3); \
}

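/* 4-tap variants of the two macros above: only mask0/mask1 and
 * filt0/filt1 are applied. */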
#define HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, \
                                   mask0, mask1, filt0, filt1, \
                                   out0, out1) \
{ \
    v16i8 vec0_m, vec1_m, vec2_m, vec3_m; \
 \
    VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0_m, vec1_m); \
    DOTP_SB2_SH(vec0_m, vec1_m, filt0, filt0, out0, out1); \
    VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2_m, vec3_m); \
    DPADD_SB2_SH(vec2_m, vec3_m, filt1, filt1, out0, out1); \
}

#define HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, \
                                   mask0, mask1, filt0, filt1, \
                                   out0, out1, out2, out3) \
{ \
    v16i8 vec0_m, vec1_m, vec2_m, vec3_m; \
 \
    VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m); \
    VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m); \
    DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0, \
                out0, out1, out2, out3); \
    VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0_m, vec1_m); \
    VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2_m, vec3_m); \
    DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt1, filt1, filt1, filt1, \
                 out0, out1, out2, out3); \
}

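/* Plain block copies for the unfiltered prediction cases. Widths 8..64
 * use the widest loads/stores available per row. Note that
 * copy_width12_msa ignores its height argument and always copies
 * 16 rows, apparently the only height this path is called with. */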
static void copy_width8_msa(uint8_t *src, int32_t src_stride,
                            uint8_t *dst, int32_t dst_stride,
                            int32_t height)
{
    int32_t cnt;
    uint64_t out0, out1, out2, out3, out4, out5, out6, out7;

    if (2 == height) {
        LD2(src, src_stride, out0, out1);
        SD(out0, dst);
        dst += dst_stride;
        SD(out1, dst);
    } else if (6 == height) {
        LD4(src, src_stride, out0, out1, out2, out3);
        src += (4 * src_stride);
        SD4(out0, out1, out2, out3, dst, dst_stride);
        dst += (4 * dst_stride);
        LD2(src, src_stride, out0, out1);
        SD(out0, dst);
        dst += dst_stride;
        SD(out1, dst);
    } else if (0 == (height % 8)) {
        for (cnt = (height >> 3); cnt--;) {
            LD4(src, src_stride, out0, out1, out2, out3);
            src += (4 * src_stride);
            LD4(src, src_stride, out4, out5, out6, out7);
            src += (4 * src_stride);
            SD4(out0, out1, out2, out3, dst, dst_stride);
            dst += (4 * dst_stride);
            SD4(out4, out5, out6, out7, dst, dst_stride);
            dst += (4 * dst_stride);
        }
    } else if (0 == (height % 4)) {
        for (cnt = (height >> 2); cnt--;) {
            LD4(src, src_stride, out0, out1, out2, out3);
            src += (4 * src_stride);
            SD4(out0, out1, out2, out3, dst, dst_stride);
            dst += (4 * dst_stride);
        }
    }
}

static void copy_width12_msa(uint8_t *src, int32_t src_stride,
                             uint8_t *dst, int32_t dst_stride,
                             int32_t height)
{
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;

    LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
    src += (8 * src_stride);
    ST12x8_UB(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
    dst += (8 * dst_stride);
    LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
    ST12x8_UB(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
}

static void copy_width16_msa(uint8_t *src, int32_t src_stride,
                             uint8_t *dst, int32_t dst_stride,
                             int32_t height)
{
    int32_t cnt;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;

    if (12 == height) {
        LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
        src += (8 * src_stride);
        ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
        dst += (8 * dst_stride);
        LD_UB4(src, src_stride, src0, src1, src2, src3);
        src += (4 * src_stride);
        ST_UB4(src0, src1, src2, src3, dst, dst_stride);
        dst += (4 * dst_stride);
    } else if (0 == (height % 8)) {
        for (cnt = (height >> 3); cnt--;) {
            LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6,
                   src7);
            src += (8 * src_stride);
            ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst,
                   dst_stride);
            dst += (8 * dst_stride);
        }
    } else if (0 == (height % 4)) {
        for (cnt = (height >> 2); cnt--;) {
            LD_UB4(src, src_stride, src0, src1, src2, src3);
            src += (4 * src_stride);
            ST_UB4(src0, src1, src2, src3, dst, dst_stride);
            dst += (4 * dst_stride);
        }
    }
}

static void copy_width24_msa(uint8_t *src, int32_t src_stride,
                             uint8_t *dst, int32_t dst_stride,
                             int32_t height)
{
    int32_t cnt;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
    uint64_t out0, out1, out2, out3, out4, out5, out6, out7;

    for (cnt = 4; cnt--;) {
        LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
        LD4(src + 16, src_stride, out0, out1, out2, out3);
        src += (4 * src_stride);
        LD4(src + 16, src_stride, out4, out5, out6, out7);
        src += (4 * src_stride);

        ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
        SD4(out0, out1, out2, out3, dst + 16, dst_stride);
        dst += (4 * dst_stride);
        SD4(out4, out5, out6, out7, dst + 16, dst_stride);
        dst += (4 * dst_stride);
    }
}

static void copy_width32_msa(uint8_t *src, int32_t src_stride,
                             uint8_t *dst, int32_t dst_stride,
                             int32_t height)
{
    int32_t cnt;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;

    for (cnt = (height >> 2); cnt--;) {
        LD_UB4(src, src_stride, src0, src1, src2, src3);
        LD_UB4(src + 16, src_stride, src4, src5, src6, src7);
        src += (4 * src_stride);
        ST_UB4(src0, src1, src2, src3, dst, dst_stride);
        ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride);
        dst += (4 * dst_stride);
    }
}

static void copy_width48_msa(uint8_t *src, int32_t src_stride,
                             uint8_t *dst, int32_t dst_stride,
                             int32_t height)
{
    int32_t cnt;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16u8 src11;

    for (cnt = (height >> 2); cnt--;) {
        LD_UB4(src, src_stride, src0, src1, src2, src3);
        LD_UB4(src + 16, src_stride, src4, src5, src6, src7);
        LD_UB4(src + 32, src_stride, src8, src9, src10, src11);
        src += (4 * src_stride);

        ST_UB4(src0, src1, src2, src3, dst, dst_stride);
        ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride);
        ST_UB4(src8, src9, src10, src11, dst + 32, dst_stride);
        dst += (4 * dst_stride);
    }
}

static void copy_width64_msa(uint8_t *src, int32_t src_stride,
                             uint8_t *dst, int32_t dst_stride,
                             int32_t height)
{
    int32_t cnt;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16u8 src8, src9, src10, src11, src12, src13, src14, src15;

    for (cnt = (height >> 2); cnt--;) {
        LD_UB4(src, 16, src0, src1, src2, src3);
        src += src_stride;
        LD_UB4(src, 16, src4, src5, src6, src7);
        src += src_stride;
        LD_UB4(src, 16, src8, src9, src10, src11);
        src += src_stride;
        LD_UB4(src, 16, src12, src13, src14, src15);
        src += src_stride;

        ST_UB4(src0, src1, src2, src3, dst, 16);
        dst += dst_stride;
        ST_UB4(src4, src5, src6, src7, dst, 16);
        dst += dst_stride;
        ST_UB4(src8, src9, src10, src11, dst, 16);
        dst += dst_stride;
        ST_UB4(src12, src13, src14, src15, dst, 16);
        dst += dst_stride;
    }
}

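/* Horizontal 8-tap filters. All of them follow the same recipe: back src
 * up by 3 so the taps are centred, xor the unsigned pixels with 128 so
 * signed byte dot products can be used, accumulate the taps, round-shift
 * by 6, saturate, then pack and xor with 128 again to return to unsigned
 * pixels (the 128 bias cancels exactly because the 8 taps sum to 64).
 * A scalar sketch of what each output pixel computes, assuming
 * av_clip_uint8() from libavutil; not part of the MSA path:
 *
 *     int k, sum = 0;
 *     for (k = 0; k < 8; k++)
 *         sum += src[x + k - 3] * filter[k];
 *     dst[x] = av_clip_uint8((sum + 32) >> 6);
 */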
static void common_hz_8t_4x4_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter)
{
    v16u8 mask0, mask1, mask2, mask3, out;
    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
    v8i16 filt, out0, out1;

    mask0 = LD_UB(&ff_hevc_mask_arr[16]);
    src -= 3;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    LD_SB4(src, src_stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, out0, out1);
    SRARI_H2_SH(out0, out1, 6);
    SAT_SH2_SH(out0, out1, 7);
    out = PCKEV_XORI128_UB(out0, out1);
    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
}

static void common_hz_8t_4x8_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter)
{
    v16i8 filt0, filt1, filt2, filt3;
    v16i8 src0, src1, src2, src3;
    v16u8 mask0, mask1, mask2, mask3, out;
    v8i16 filt, out0, out1, out2, out3;

    mask0 = LD_UB(&ff_hevc_mask_arr[16]);
    src -= 3;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    LD_SB4(src, src_stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    src += (4 * src_stride);
    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, out0, out1);
    LD_SB4(src, src_stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, out2, out3);
    SRARI_H4_SH(out0, out1, out2, out3, 6);
    SAT_SH4_SH(out0, out1, out2, out3, 7);
    out = PCKEV_XORI128_UB(out0, out1);
    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
    dst += (4 * dst_stride);
    out = PCKEV_XORI128_UB(out2, out3);
    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
}

static void common_hz_8t_4x16_msa(uint8_t *src, int32_t src_stride,
                                  uint8_t *dst, int32_t dst_stride,
                                  const int8_t *filter)
{
    v16u8 mask0, mask1, mask2, mask3, out;
    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
    v8i16 filt, out0, out1, out2, out3;

    mask0 = LD_UB(&ff_hevc_mask_arr[16]);
    src -= 3;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    LD_SB4(src, src_stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    src += (4 * src_stride);
    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, out0, out1);
    LD_SB4(src, src_stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    src += (4 * src_stride);
    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, out2, out3);
    SRARI_H4_SH(out0, out1, out2, out3, 6);
    SAT_SH4_SH(out0, out1, out2, out3, 7);
    out = PCKEV_XORI128_UB(out0, out1);
    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
    dst += (4 * dst_stride);
    out = PCKEV_XORI128_UB(out2, out3);
    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
    dst += (4 * dst_stride);

    LD_SB4(src, src_stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    src += (4 * src_stride);
    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, out0, out1);
    LD_SB4(src, src_stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    src += (4 * src_stride);
    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, out2, out3);

    SRARI_H4_SH(out0, out1, out2, out3, 6);
    SAT_SH4_SH(out0, out1, out2, out3, 7);
    out = PCKEV_XORI128_UB(out0, out1);
    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
    dst += (4 * dst_stride);
    out = PCKEV_XORI128_UB(out2, out3);
    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
}

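/* Height dispatch for the 4-wide horizontal case; only heights 4, 8 and
 * 16 are expected here, and anything else falls through without
 * writing. */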
static void common_hz_8t_4w_msa(uint8_t *src, int32_t src_stride,
                                uint8_t *dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height)
{
    if (4 == height) {
        common_hz_8t_4x4_msa(src, src_stride, dst, dst_stride, filter);
    } else if (8 == height) {
        common_hz_8t_4x8_msa(src, src_stride, dst, dst_stride, filter);
    } else if (16 == height) {
        common_hz_8t_4x16_msa(src, src_stride, dst, dst_stride, filter);
    }
}

static void common_hz_8t_8w_msa(uint8_t *src, int32_t src_stride,
                                uint8_t *dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
    v16u8 mask0, mask1, mask2, mask3, tmp0, tmp1;
    v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m;
    v8i16 filt, out0, out1, out2, out3;

    mask0 = LD_UB(&ff_hevc_mask_arr[0]);
    src -= 3;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src0, src1, src2, src3);
        XORI_B4_128_SB(src0, src1, src2, src3);
        src += (4 * src_stride);

        VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m);
        VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m);
        DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0,
                    out0, out1, out2, out3);
        VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0_m, vec1_m);
        VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2_m, vec3_m);
        DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt2, filt2, filt2, filt2,
                     out0, out1, out2, out3);
        VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4_m, vec5_m);
        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6_m, vec7_m);
        DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt1, filt1, filt1, filt1,
                     out0, out1, out2, out3);
        VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec4_m, vec5_m);
        VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec6_m, vec7_m);
        DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt3, filt3, filt3, filt3,
                     out0, out1, out2, out3);

        SRARI_H4_SH(out0, out1, out2, out3, 6);
        SAT_SH4_SH(out0, out1, out2, out3, 7);
        tmp0 = PCKEV_XORI128_UB(out0, out1);
        tmp1 = PCKEV_XORI128_UB(out2, out3);
        ST8x4_UB(tmp0, tmp1, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}

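/* 12-wide rows are filtered as an 8-wide part (mask00 and mask1..mask3)
 * plus a 4-wide part (mask0 and mask4..mask6, two rows per vector), then
 * stored as an 8-byte and a 4-byte chunk per row. The loop is hard-coded
 * for 16 rows, four per iteration. */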
static void common_hz_8t_12w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16u8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask00;
    v16u8 tmp0, tmp1, tmp2;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 filt0, filt1, filt2, filt3;
    v8i16 filt, out0, out1, out2, out3, out4, out5;

    mask00 = LD_UB(&ff_hevc_mask_arr[0]);
    mask0 = LD_UB(&ff_hevc_mask_arr[16]);

    src -= 3;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask00 + 2;
    mask2 = mask00 + 4;
    mask3 = mask00 + 6;
    mask4 = mask0 + 2;
    mask5 = mask0 + 4;
    mask6 = mask0 + 6;

    for (loop_cnt = 4; loop_cnt--;) {
        /* 8 width */
        LD_SB4(src, src_stride, src0, src1, src2, src3);
        /* 4 width */
        LD_SB4(src + 8, src_stride, src4, src5, src6, src7);

        XORI_B4_128_SB(src0, src1, src2, src3);
        XORI_B4_128_SB(src4, src5, src6, src7);
        src += (4 * src_stride);

        VSHF_B2_SB(src0, src0, src1, src1, mask00, mask00, vec0, vec1);
        VSHF_B2_SB(src2, src2, src3, src3, mask00, mask00, vec2, vec3);
        DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0,
                    out1, out2, out3);
        VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0, vec1);
        VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, out0,
                     out1, out2, out3);
        VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec5);
        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec7);
        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, out0,
                     out1, out2, out3);
        VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec4, vec5);
        VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec6, vec7);
        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt3, filt3, filt3, filt3, out0,
                     out1, out2, out3);

        /* 4 width */
        VSHF_B2_SB(src4, src5, src6, src7, mask0, mask0, vec0, vec1);
        DOTP_SB2_SH(vec0, vec1, filt0, filt0, out4, out5);
        VSHF_B2_SB(src4, src5, src6, src7, mask4, mask4, vec2, vec3);
        DPADD_SB2_SH(vec2, vec3, filt1, filt1, out4, out5);
        VSHF_B2_SB(src4, src5, src6, src7, mask5, mask5, vec4, vec5);
        DPADD_SB2_SH(vec4, vec5, filt2, filt2, out4, out5);
        VSHF_B2_SB(src4, src5, src6, src7, mask6, mask6, vec6, vec7);
        DPADD_SB2_SH(vec6, vec7, filt3, filt3, out4, out5);

        SRARI_H4_SH(out0, out1, out2, out3, 6);
        SRARI_H2_SH(out4, out5, 6);
        SAT_SH4_SH(out0, out1, out2, out3, 7);
        SAT_SH2_SH(out4, out5, 7);
        tmp0 = PCKEV_XORI128_UB(out0, out1);
        tmp1 = PCKEV_XORI128_UB(out2, out3);
        tmp2 = PCKEV_XORI128_UB(out4, out5);

        ST8x4_UB(tmp0, tmp1, dst, dst_stride);
        ST4x4_UB(tmp2, tmp2, 0, 1, 2, 3, dst + 8, dst_stride);
        dst += (4 * dst_stride);
    }
}

static void common_hz_8t_16w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16u8 mask0, mask1, mask2, mask3, out;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16i8 filt0, filt1, filt2, filt3;
    v8i16 filt, out0, out1, out2, out3;

    mask0 = LD_UB(&ff_hevc_mask_arr[0]);
    src -= 3;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB2(src, src_stride, src0, src2);
        LD_SB2(src + 8, src_stride, src1, src3);
        src += (2 * src_stride);

        LD_SB2(src, src_stride, src4, src6);
        LD_SB2(src + 8, src_stride, src5, src7);
        src += (2 * src_stride);

        XORI_B4_128_SB(src0, src1, src2, src3);
        XORI_B4_128_SB(src4, src5, src6, src7);
        HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                                   mask3, filt0, filt1, filt2, filt3, out0,
                                   out1, out2, out3);
        SRARI_H4_SH(out0, out1, out2, out3, 6);
        SAT_SH4_SH(out0, out1, out2, out3, 7);
        out = PCKEV_XORI128_UB(out0, out1);
        ST_UB(out, dst);
        dst += dst_stride;
        out = PCKEV_XORI128_UB(out2, out3);
        ST_UB(out, dst);
        dst += dst_stride;

        HORIZ_8TAP_8WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1, mask2,
                                   mask3, filt0, filt1, filt2, filt3, out0,
                                   out1, out2, out3);
        SRARI_H4_SH(out0, out1, out2, out3, 6);
        SAT_SH4_SH(out0, out1, out2, out3, 7);
        out = PCKEV_XORI128_UB(out0, out1);
        ST_UB(out, dst);
        dst += dst_stride;
        out = PCKEV_XORI128_UB(out2, out3);
        ST_UB(out, dst);
        dst += dst_stride;
    }
}

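/* For 24-wide rows the second 16-byte load overlaps the first, and
 * masks 4..7 (mask0 + 8..14) shuffle across the two source vectors to
 * filter the pixels that straddle the 16-byte boundary. The loop is
 * hard-coded for 32 rows, two per iteration. */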
static void common_hz_8t_24w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
    v16u8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7, out;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
    v16i8 vec11;
    v8i16 out0, out1, out2, out3, out8, out9, filt;

    mask0 = LD_UB(&ff_hevc_mask_arr[0]);
    src -= 3;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;
    mask4 = mask0 + 8;
    mask5 = mask0 + 10;
    mask6 = mask0 + 12;
    mask7 = mask0 + 14;

    for (loop_cnt = 16; loop_cnt--;) {
        LD_SB2(src, src_stride, src0, src2);
        LD_SB2(src + 16, src_stride, src1, src3);
        XORI_B4_128_SB(src0, src1, src2, src3);
        src += (2 * src_stride);
        VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec8);
        VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec9);
        VSHF_B2_SB(src0, src1, src2, src3, mask4, mask4, vec1, vec3);
        DOTP_SB4_SH(vec0, vec8, vec2, vec9, filt0, filt0, filt0, filt0, out0,
                    out8, out2, out9);
        DOTP_SB2_SH(vec1, vec3, filt0, filt0, out1, out3);
        VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0, vec8);
        VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2, vec9);
        VSHF_B2_SB(src0, src1, src2, src3, mask6, mask6, vec1, vec3);
        DPADD_SB4_SH(vec0, vec8, vec2, vec9, filt2, filt2, filt2, filt2,
                     out0, out8, out2, out9);
        DPADD_SB2_SH(vec1, vec3, filt2, filt2, out1, out3);
        VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec10);
        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec11);
        VSHF_B2_SB(src0, src1, src2, src3, mask5, mask5, vec5, vec7);
        DPADD_SB4_SH(vec4, vec10, vec6, vec11, filt1, filt1, filt1, filt1,
                     out0, out8, out2, out9);
        DPADD_SB2_SH(vec5, vec7, filt1, filt1, out1, out3);
        VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec4, vec10);
        VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec6, vec11);
        VSHF_B2_SB(src0, src1, src2, src3, mask7, mask7, vec5, vec7);
        DPADD_SB4_SH(vec4, vec10, vec6, vec11, filt3, filt3, filt3, filt3,
                     out0, out8, out2, out9);
        DPADD_SB2_SH(vec5, vec7, filt3, filt3, out1, out3);
        SRARI_H4_SH(out0, out8, out2, out9, 6);
        SRARI_H2_SH(out1, out3, 6);
        SAT_SH4_SH(out0, out8, out2, out9, 7);
        SAT_SH2_SH(out1, out3, 7);
        out = PCKEV_XORI128_UB(out8, out9);
        ST8x2_UB(out, dst + 16, dst_stride);
        out = PCKEV_XORI128_UB(out0, out1);
        ST_UB(out, dst);
        dst += dst_stride;
        out = PCKEV_XORI128_UB(out2, out3);
        ST_UB(out, dst);
        dst += dst_stride;
    }
}

static void common_hz_8t_32w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16u8 mask0, mask1, mask2, mask3, out;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16i8 filt0, filt1, filt2, filt3;
    v8i16 filt, out0, out1, out2, out3;

    mask0 = LD_UB(&ff_hevc_mask_arr[0]);
    src -= 3;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        src0 = LD_SB(src);
        src1 = LD_SB(src + 8);
        src2 = LD_SB(src + 16);
        src3 = LD_SB(src + 24);
        src += src_stride;
        XORI_B4_128_SB(src0, src1, src2, src3);

        src4 = LD_SB(src);
        src5 = LD_SB(src + 8);
        src6 = LD_SB(src + 16);
        src7 = LD_SB(src + 24);
        src += src_stride;
        XORI_B4_128_SB(src4, src5, src6, src7);

        HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                                   mask3, filt0, filt1, filt2, filt3, out0,
                                   out1, out2, out3);
        SRARI_H4_SH(out0, out1, out2, out3, 6);
        SAT_SH4_SH(out0, out1, out2, out3, 7);

        out = PCKEV_XORI128_UB(out0, out1);
        ST_UB(out, dst);
        out = PCKEV_XORI128_UB(out2, out3);
        ST_UB(out, dst + 16);
        dst += dst_stride;

        HORIZ_8TAP_8WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1, mask2,
                                   mask3, filt0, filt1, filt2, filt3, out0,
                                   out1, out2, out3);
        SRARI_H4_SH(out0, out1, out2, out3, 6);
        SAT_SH4_SH(out0, out1, out2, out3, 7);
        out = PCKEV_XORI128_UB(out0, out1);
        ST_UB(out, dst);
        out = PCKEV_XORI128_UB(out2, out3);
        ST_UB(out, dst + 16);
        dst += dst_stride;
    }
}

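/* 48-wide rows: loads land at +0, +8, +16, +32 and +40, so pixels 24..31
 * straddle the src2/src3 loads and are filtered from that pair with the
 * straddling masks (mask4..mask7); everything else uses the per-vector
 * masks. The loop is hard-coded for 64 rows, one per iteration. */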
static void common_hz_8t_48w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3, vec0, vec1, vec2;
    v16i8 src4;
    v16u8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7, out;
    v8i16 filt, out0, out1, out2, out3;

    mask0 = LD_UB(&ff_hevc_mask_arr[0]);
    src -= 3;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;
    mask4 = mask0 + 8;
    mask5 = mask0 + 10;
    mask6 = mask0 + 12;
    mask7 = mask0 + 14;

    for (loop_cnt = 64; loop_cnt--;) {
        src0 = LD_SB(src);
        src1 = LD_SB(src + 8);
        src2 = LD_SB(src + 16);
        src3 = LD_SB(src + 32);
        src4 = LD_SB(src + 40);
        src += src_stride;

        XORI_B4_128_SB(src0, src1, src2, src3);
        src4 = (v16i8) __msa_xori_b((v16u8) src4, 128);

        VSHF_B3_SB(src0, src0, src1, src1, src2, src2, mask0, mask0, mask0,
                   vec0, vec1, vec2);
        DOTP_SB3_SH(vec0, vec1, vec2, filt0, filt0, filt0, out0, out1, out2);
        VSHF_B3_SB(src0, src0, src1, src1, src2, src2, mask1, mask1, mask1,
                   vec0, vec1, vec2);
        DPADD_SB2_SH(vec0, vec1, filt1, filt1, out0, out1);
        out2 = __msa_dpadd_s_h(out2, vec2, filt1);
        VSHF_B3_SB(src0, src0, src1, src1, src2, src2, mask2, mask2, mask2,
                   vec0, vec1, vec2);
        DPADD_SB2_SH(vec0, vec1, filt2, filt2, out0, out1);
        out2 = __msa_dpadd_s_h(out2, vec2, filt2);

        VSHF_B3_SB(src0, src0, src1, src1, src2, src2, mask3, mask3, mask3,
                   vec0, vec1, vec2);
        DPADD_SB2_SH(vec0, vec1, filt3, filt3, out0, out1);
        out2 = __msa_dpadd_s_h(out2, vec2, filt3);

        SRARI_H2_SH(out0, out1, 6);
        out3 = __msa_srari_h(out2, 6);
        SAT_SH3_SH(out0, out1, out3, 7);
        out = PCKEV_XORI128_UB(out0, out1);
        ST_UB(out, dst);

        VSHF_B3_SB(src2, src3, src3, src3, src4, src4, mask4, mask0, mask0,
                   vec0, vec1, vec2);
        DOTP_SB3_SH(vec0, vec1, vec2, filt0, filt0, filt0, out0, out1, out2);
        VSHF_B3_SB(src2, src3, src3, src3, src4, src4, mask5, mask1, mask1,
                   vec0, vec1, vec2);
        DPADD_SB2_SH(vec0, vec1, filt1, filt1, out0, out1);
        out2 = __msa_dpadd_s_h(out2, vec2, filt1);
        VSHF_B3_SB(src2, src3, src3, src3, src4, src4, mask6, mask2, mask2,
                   vec0, vec1, vec2);
        DPADD_SB2_SH(vec0, vec1, filt2, filt2, out0, out1);
        out2 = __msa_dpadd_s_h(out2, vec2, filt2);
        VSHF_B3_SB(src2, src3, src3, src3, src4, src4, mask7, mask3, mask3,
                   vec0, vec1, vec2);
        DPADD_SB2_SH(vec0, vec1, filt3, filt3, out0, out1);
        out2 = __msa_dpadd_s_h(out2, vec2, filt3);

        SRARI_H2_SH(out0, out1, 6);
        out2 = __msa_srari_h(out2, 6);
        SAT_SH3_SH(out0, out1, out2, 7);
        out = PCKEV_XORI128_UB(out3, out0);
        ST_UB(out, dst + 16);
        out = PCKEV_XORI128_UB(out1, out2);
        ST_UB(out, dst + 32);
        dst += dst_stride;
    }
}

static void common_hz_8t_64w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    int32_t loop_cnt;
    v16u8 mask0, mask1, mask2, mask3, out;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 filt0, filt1, filt2, filt3;
    v8i16 res0, res1, res2, res3, filt;

    mask0 = LD_UB(&ff_hevc_mask_arr[0]);
    src -= 3;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    for (loop_cnt = height; loop_cnt--;) {
        LD_SB8(src, 8, src0, src1, src2, src3, src4, src5, src6, src7);
        src += src_stride;

        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);

        VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
        VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
        DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, res0,
                    res1, res2, res3);
        VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0, vec1);
        VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, res0,
                     res1, res2, res3);
        VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec5);
        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec7);
        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, res0,
                     res1, res2, res3);
        VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec4, vec5);
        VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec6, vec7);
        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt3, filt3, filt3, filt3, res0,
                     res1, res2, res3);

        SRARI_H4_SH(res0, res1, res2, res3, 6);
        SAT_SH4_SH(res0, res1, res2, res3, 7);
        out = PCKEV_XORI128_UB(res0, res1);
        ST_UB(out, dst);
        out = PCKEV_XORI128_UB(res2, res3);
        ST_UB(out, dst + 16);

        VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0, vec1);
        VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec2, vec3);
        DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, res0,
                    res1, res2, res3);
        VSHF_B2_SB(src4, src4, src5, src5, mask2, mask2, vec0, vec1);
        VSHF_B2_SB(src6, src6, src7, src7, mask2, mask2, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, res0,
                     res1, res2, res3);
        VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec4, vec5);
        VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec6, vec7);
        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, res0,
                     res1, res2, res3);
        VSHF_B2_SB(src4, src4, src5, src5, mask3, mask3, vec4, vec5);
        VSHF_B2_SB(src6, src6, src7, src7, mask3, mask3, vec6, vec7);
        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt3, filt3, filt3, filt3, res0,
                     res1, res2, res3);

        SRARI_H4_SH(res0, res1, res2, res3, 6);
        SAT_SH4_SH(res0, res1, res2, res3, 7);
        out = PCKEV_XORI128_UB(res0, res1);
        ST_UB(out, dst + 32);
        out = PCKEV_XORI128_UB(res2, res3);
        ST_UB(out, dst + 48);
        dst += dst_stride;
    }
}

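/* Vertical 8-tap filters. Instead of shuffles, neighbouring rows are
 * interleaved (ILVR/ILVL) into byte pairs so the same signed dot-product
 * macros can walk down a column; the interleaved history is rotated at
 * the bottom of each loop so only the new rows are loaded per
 * iteration. */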
static void common_vt_8t_4w_msa(uint8_t *src, int32_t src_stride,
                                uint8_t *dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16u8 out0, out1;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 src11, src12, src13, src14;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
    v16i8 src65_r, src87_r, src109_r, src2110, src4332, src6554, src8776;
    v16i8 src1110_r, src1211_r, src1312_r, src1413_r, src12111110, src14131312;
    v16i8 src10998, filt0, filt1, filt2, filt3;
    v8i16 filt, out10, out32, out54, out76;

    src -= (3 * src_stride);

    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src += (7 * src_stride);

    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
               src54_r, src21_r);
    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
    ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, src2110,
               src4332, src6554);
    XORI_B3_128_SB(src2110, src4332, src6554);

    for (loop_cnt = (height >> 3); loop_cnt--;) {
        LD_SB4(src, src_stride, src7, src8, src9, src10);
        src += (4 * src_stride);
        LD_SB4(src, src_stride, src11, src12, src13, src14);
        src += (4 * src_stride);

        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
                   src87_r, src98_r, src109_r);
        ILVR_B4_SB(src11, src10, src12, src11, src13, src12, src14, src13,
                   src1110_r, src1211_r, src1312_r, src1413_r);
        ILVR_D2_SB(src87_r, src76_r, src109_r, src98_r, src8776, src10998);
        ILVR_D2_SB(src1211_r, src1110_r, src1413_r, src1312_r,
                   src12111110, src14131312);
        XORI_B2_128_SB(src8776, src10998);
        XORI_B2_128_SB(src12111110, src14131312);

        DOTP_SB2_SH(src2110, src4332, filt0, filt0, out10, out32);
        DOTP_SB2_SH(src6554, src8776, filt0, filt0, out54, out76);
        DPADD_SB2_SH(src4332, src6554, filt1, filt1, out10, out32);
        DPADD_SB2_SH(src8776, src10998, filt1, filt1, out54, out76);
        DPADD_SB2_SH(src6554, src8776, filt2, filt2, out10, out32);
        DPADD_SB2_SH(src10998, src12111110, filt2, filt2, out54, out76);
        DPADD_SB2_SH(src8776, src10998, filt3, filt3, out10, out32);
        DPADD_SB2_SH(src12111110, src14131312, filt3, filt3, out54, out76);
        SRARI_H2_SH(out10, out32, 6);
        SRARI_H2_SH(out54, out76, 6);
        SAT_SH2_SH(out10, out32, 7);
        SAT_SH2_SH(out54, out76, 7);
        out0 = PCKEV_XORI128_UB(out10, out32);
        out1 = PCKEV_XORI128_UB(out54, out76);
        ST4x4_UB(out0, out0, 0, 1, 2, 3, dst, dst_stride);
        dst += (4 * dst_stride);
        ST4x4_UB(out1, out1, 0, 1, 2, 3, dst, dst_stride);
        dst += (4 * dst_stride);

        src2110 = src10998;
        src4332 = src12111110;
        src6554 = src14131312;
        src6 = src14;
    }
}

static void common_vt_8t_8w_msa(uint8_t *src, int32_t src_stride,
                                uint8_t *dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
    v16i8 src65_r, src87_r, src109_r, filt0, filt1, filt2, filt3;
    v16u8 tmp0, tmp1;
    v8i16 filt, out0_r, out1_r, out2_r, out3_r;

    src -= (3 * src_stride);

    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
    src += (7 * src_stride);
    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
               src54_r, src21_r);
    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src7, src8, src9, src10);
        XORI_B4_128_SB(src7, src8, src9, src10);
        src += (4 * src_stride);

        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
                   src87_r, src98_r, src109_r);
        DOTP_SB4_SH(src10_r, src21_r, src32_r, src43_r, filt0, filt0, filt0,
                    filt0, out0_r, out1_r, out2_r, out3_r);
        DPADD_SB4_SH(src32_r, src43_r, src54_r, src65_r, filt1, filt1, filt1,
                     filt1, out0_r, out1_r, out2_r, out3_r);
        DPADD_SB4_SH(src54_r, src65_r, src76_r, src87_r, filt2, filt2, filt2,
                     filt2, out0_r, out1_r, out2_r, out3_r);
        DPADD_SB4_SH(src76_r, src87_r, src98_r, src109_r, filt3, filt3, filt3,
                     filt3, out0_r, out1_r, out2_r, out3_r);
        SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
        tmp0 = PCKEV_XORI128_UB(out0_r, out1_r);
        tmp1 = PCKEV_XORI128_UB(out2_r, out3_r);
        ST8x4_UB(tmp0, tmp1, dst, dst_stride);
        dst += (4 * dst_stride);

        src10_r = src54_r;
        src32_r = src76_r;
        src54_r = src98_r;
        src21_r = src65_r;
        src43_r = src87_r;
        src65_r = src109_r;
        src6 = src10;
    }
}

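/* 12-wide vertical case: the full 16-wide filter is computed (right and
 * left interleaves), but only 8 + 4 bytes per row are stored, via
 * doubleword and word copies out of the packed vectors. The loop is
 * hard-coded for 16 rows, four per iteration. */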
static void common_vt_8t_12w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    uint32_t out2, out3;
    uint64_t out0, out1;
    v16u8 tmp0, tmp1, tmp2, tmp3;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 filt0, filt1, filt2, filt3;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
    v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
    v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
    v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;

    src -= (3 * src_stride);

    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src += (7 * src_stride);

    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);

    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
               src54_r, src21_r);
    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
    ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l, src32_l,
               src54_l, src21_l);
    ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);

    for (loop_cnt = 4; loop_cnt--;) {
        LD_SB4(src, src_stride, src7, src8, src9, src10);
        XORI_B4_128_SB(src7, src8, src9, src10);
        src += (4 * src_stride);

        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
                   src87_r, src98_r, src109_r);
        ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l,
                   src87_l, src98_l, src109_l);
        out0_r = HEVC_FILT_8TAP_SH(src10_r, src32_r, src54_r, src76_r, filt0,
                                   filt1, filt2, filt3);
        out1_r = HEVC_FILT_8TAP_SH(src21_r, src43_r, src65_r, src87_r, filt0,
                                   filt1, filt2, filt3);
        out2_r = HEVC_FILT_8TAP_SH(src32_r, src54_r, src76_r, src98_r, filt0,
                                   filt1, filt2, filt3);
        out3_r = HEVC_FILT_8TAP_SH(src43_r, src65_r, src87_r, src109_r, filt0,
                                   filt1, filt2, filt3);
        out0_l = HEVC_FILT_8TAP_SH(src10_l, src32_l, src54_l, src76_l, filt0,
                                   filt1, filt2, filt3);
        out1_l = HEVC_FILT_8TAP_SH(src21_l, src43_l, src65_l, src87_l, filt0,
                                   filt1, filt2, filt3);
        out2_l = HEVC_FILT_8TAP_SH(src32_l, src54_l, src76_l, src98_l, filt0,
                                   filt1, filt2, filt3);
        out3_l = HEVC_FILT_8TAP_SH(src43_l, src65_l, src87_l, src109_l, filt0,
                                   filt1, filt2, filt3);
        SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
        SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 6);
        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
        SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
        PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
                    out3_r, tmp0, tmp1, tmp2, tmp3);
        XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);

        out0 = __msa_copy_u_d((v2i64) tmp0, 0);
        out1 = __msa_copy_u_d((v2i64) tmp1, 0);
        out2 = __msa_copy_u_w((v4i32) tmp0, 2);
        out3 = __msa_copy_u_w((v4i32) tmp1, 2);
        SD(out0, dst);
        SW(out2, (dst + 8));
        dst += dst_stride;
        SD(out1, dst);
        SW(out3, (dst + 8));
        dst += dst_stride;
        out0 = __msa_copy_u_d((v2i64) tmp2, 0);
        out1 = __msa_copy_u_d((v2i64) tmp3, 0);
        out2 = __msa_copy_u_w((v4i32) tmp2, 2);
        out3 = __msa_copy_u_w((v4i32) tmp3, 2);
        SD(out0, dst);
        SW(out2, (dst + 8));
        dst += dst_stride;
        SD(out1, dst);
        SW(out3, (dst + 8));
        dst += dst_stride;

        src10_r = src54_r;
        src32_r = src76_r;
        src54_r = src98_r;
        src21_r = src65_r;
        src43_r = src87_r;
        src65_r = src109_r;
        src10_l = src54_l;
        src32_l = src76_l;
        src54_l = src98_l;
        src21_l = src65_l;
        src43_l = src87_l;
        src65_l = src109_l;
        src6 = src10;
    }
}

static void common_vt_8t_16w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 filt0, filt1, filt2, filt3;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
    v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
    v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
    v16u8 tmp0, tmp1, tmp2, tmp3;
    v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;

    src -= (3 * src_stride);

    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
    src += (7 * src_stride);
    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
               src54_r, src21_r);
    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
    ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l, src32_l,
               src54_l, src21_l);
    ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src7, src8, src9, src10);
        XORI_B4_128_SB(src7, src8, src9, src10);
        src += (4 * src_stride);

        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
                   src87_r, src98_r, src109_r);
        ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l,
                   src87_l, src98_l, src109_l);
        out0_r = HEVC_FILT_8TAP_SH(src10_r, src32_r, src54_r, src76_r, filt0,
                                   filt1, filt2, filt3);
        out1_r = HEVC_FILT_8TAP_SH(src21_r, src43_r, src65_r, src87_r, filt0,
                                   filt1, filt2, filt3);
        out2_r = HEVC_FILT_8TAP_SH(src32_r, src54_r, src76_r, src98_r, filt0,
                                   filt1, filt2, filt3);
        out3_r = HEVC_FILT_8TAP_SH(src43_r, src65_r, src87_r, src109_r, filt0,
                                   filt1, filt2, filt3);
        out0_l = HEVC_FILT_8TAP_SH(src10_l, src32_l, src54_l, src76_l, filt0,
                                   filt1, filt2, filt3);
        out1_l = HEVC_FILT_8TAP_SH(src21_l, src43_l, src65_l, src87_l, filt0,
                                   filt1, filt2, filt3);
        out2_l = HEVC_FILT_8TAP_SH(src32_l, src54_l, src76_l, src98_l, filt0,
                                   filt1, filt2, filt3);
        out3_l = HEVC_FILT_8TAP_SH(src43_l, src65_l, src87_l, src109_l, filt0,
                                   filt1, filt2, filt3);
        SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
        SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 6);
        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
        SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
        PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
                    out3_r, tmp0, tmp1, tmp2, tmp3);
        XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
        ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
        dst += (4 * dst_stride);

        src10_r = src54_r;
        src32_r = src76_r;
        src54_r = src98_r;
        src21_r = src65_r;
        src43_r = src87_r;
        src65_r = src109_r;
        src10_l = src54_l;
        src32_l = src76_l;
        src54_l = src98_l;
        src21_l = src65_l;
        src43_l = src87_l;
        src65_l = src109_l;
        src6 = src10;
    }
}

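/* Generic vertical 8-tap for any multiple of 16 in width: the 16-wide
 * loop above, wrapped in an outer loop that steps 16 columns at a
 * time. */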
static void common_vt_8t_16w_mult_msa(uint8_t *src, int32_t src_stride,
                                      uint8_t *dst, int32_t dst_stride,
                                      const int8_t *filter, int32_t height,
                                      int32_t width)
{
    uint8_t *src_tmp;
    uint8_t *dst_tmp;
    uint32_t loop_cnt, cnt;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 filt0, filt1, filt2, filt3;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
    v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
    v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
    v16u8 tmp0, tmp1, tmp2, tmp3;
    v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;

    src -= (3 * src_stride);

    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    for (cnt = (width >> 4); cnt--;) {
        src_tmp = src;
        dst_tmp = dst;

        LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
        XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
        src_tmp += (7 * src_stride);
        ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r,
                   src32_r, src54_r, src21_r);
        ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
        ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l,
                   src32_l, src54_l, src21_l);
        ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);

        for (loop_cnt = (height >> 2); loop_cnt--;) {
            LD_SB4(src_tmp, src_stride, src7, src8, src9, src10);
            XORI_B4_128_SB(src7, src8, src9, src10);
            src_tmp += (4 * src_stride);
            ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
                       src87_r, src98_r, src109_r);
            ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l,
                       src87_l, src98_l, src109_l);
            out0_r = HEVC_FILT_8TAP_SH(src10_r, src32_r, src54_r, src76_r,
                                       filt0, filt1, filt2, filt3);
            out1_r = HEVC_FILT_8TAP_SH(src21_r, src43_r, src65_r, src87_r,
                                       filt0, filt1, filt2, filt3);
            out2_r = HEVC_FILT_8TAP_SH(src32_r, src54_r, src76_r, src98_r,
                                       filt0, filt1, filt2, filt3);
            out3_r = HEVC_FILT_8TAP_SH(src43_r, src65_r, src87_r, src109_r,
                                       filt0, filt1, filt2, filt3);
            out0_l = HEVC_FILT_8TAP_SH(src10_l, src32_l, src54_l, src76_l,
                                       filt0, filt1, filt2, filt3);
            out1_l = HEVC_FILT_8TAP_SH(src21_l, src43_l, src65_l, src87_l,
                                       filt0, filt1, filt2, filt3);
            out2_l = HEVC_FILT_8TAP_SH(src32_l, src54_l, src76_l, src98_l,
                                       filt0, filt1, filt2, filt3);
            out3_l = HEVC_FILT_8TAP_SH(src43_l, src65_l, src87_l, src109_l,
                                       filt0, filt1, filt2, filt3);
            SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
            SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 6);
            SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
            SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
            PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
                        out3_r, tmp0, tmp1, tmp2, tmp3);
            XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
            ST_UB4(tmp0, tmp1, tmp2, tmp3, dst_tmp, dst_stride);
            dst_tmp += (4 * dst_stride);

            src10_r = src54_r;
            src32_r = src76_r;
            src54_r = src98_r;
            src21_r = src65_r;
            src43_r = src87_r;
            src65_r = src109_r;
            src10_l = src54_l;
            src32_l = src76_l;
            src54_l = src98_l;
            src21_l = src65_l;
            src43_l = src87_l;
            src65_l = src109_l;
            src6 = src10;
        }

        src += 16;
        dst += 16;
    }
}

static void common_vt_8t_24w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, filter, height,
                              16);

    common_vt_8t_8w_msa(src + 16, src_stride, dst + 16, dst_stride, filter,
                        height);
}

static void common_vt_8t_32w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, filter, height,
                              32);
}

static void common_vt_8t_48w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, filter, height,
                              48);
}

static void common_vt_8t_64w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, filter, height,
                              64);
}

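/* Combined horizontal + vertical (HV) 8-tap filters: the horizontal pass
 * leaves unnormalized 16-bit intermediates, the vertical pass accumulates
 * them in 32 bits, and the result is shifted down by 6 twice (once plain,
 * once rounding) before saturating and packing back to 8-bit pixels. */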
static void hevc_hv_uni_8t_4w_msa(uint8_t *src,
                                  int32_t src_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter_x,
                                  const int8_t *filter_y,
                                  int32_t height)
{
    uint32_t loop_cnt;
    v16u8 out0, out1;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16i8 src9, src10, src11, src12, src13, src14;
    v8i16 filt0, filt1, filt2, filt3;
    v8i16 filt_h0, filt_h1, filt_h2, filt_h3;
    v16i8 mask1, mask2, mask3;
    v8i16 filter_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
    v8i16 dst30, dst41, dst52, dst63, dst66, dst117, dst128, dst139, dst1410;
    v8i16 dst10_r, dst32_r, dst54_r, dst76_r, dst98_r, dst1110_r, dst1312_r;
    v8i16 dst21_r, dst43_r, dst65_r, dst87_r, dst109_r, dst1211_r, dst1413_r;
    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r, dst6_r, dst7_r;
    v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);

    src -= ((3 * src_stride) + 3);
    filter_vec = LD_SH(filter_x);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    filter_vec = LD_SH(filter_y);
    UNPCK_R_SB_SH(filter_vec, filter_vec);

    SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src += (7 * src_stride);
    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);

    VSHF_B4_SB(src0, src3, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
    VSHF_B4_SB(src1, src4, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
    VSHF_B4_SB(src2, src5, mask0, mask1, mask2, mask3,
               vec8, vec9, vec10, vec11);
    VSHF_B4_SB(src3, src6, mask0, mask1, mask2, mask3,
               vec12, vec13, vec14, vec15);

    dst30 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                              filt3);
    dst41 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
                              filt3);
    dst52 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
                              filt3);
    dst63 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2,
                              filt3);

    ILVRL_H2_SH(dst41, dst30, dst10_r, dst43_r);
    ILVRL_H2_SH(dst52, dst41, dst21_r, dst54_r);
    ILVRL_H2_SH(dst63, dst52, dst32_r, dst65_r);

    dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1);

    for (loop_cnt = height >> 3; loop_cnt--;) {
        LD_SB8(src, src_stride, src7, src8, src9, src10, src11, src12, src13,
               src14);
        src += (8 * src_stride);
        XORI_B8_128_SB(src7, src8, src9, src10, src11, src12, src13, src14);

        VSHF_B4_SB(src7, src11, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        VSHF_B4_SB(src8, src12, mask0, mask1, mask2, mask3,
                   vec4, vec5, vec6, vec7);
        VSHF_B4_SB(src9, src13, mask0, mask1, mask2, mask3,
                   vec8, vec9, vec10, vec11);
        VSHF_B4_SB(src10, src14, mask0, mask1, mask2, mask3,
                   vec12, vec13, vec14, vec15);

        dst117 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                   filt3);
        dst128 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
                                   filt3);
        dst139 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1,
                                   filt2, filt3);
        dst1410 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
                                    filt2, filt3);

        dst76_r = __msa_ilvr_h(dst117, dst66);
        ILVRL_H2_SH(dst128, dst117, dst87_r, dst1211_r);
        ILVRL_H2_SH(dst139, dst128, dst98_r, dst1312_r);
        ILVRL_H2_SH(dst1410, dst139, dst109_r, dst1413_r);
        dst117 = (v8i16) __msa_splati_d((v2i64) dst117, 1);
        dst1110_r = __msa_ilvr_h(dst117, dst1410);

        dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r, filt_h0,
                                filt_h1, filt_h2, filt_h3);
        dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r, filt_h0,
                                filt_h1, filt_h2, filt_h3);
        dst2_r = HEVC_FILT_8TAP(dst32_r, dst54_r, dst76_r, dst98_r, filt_h0,
                                filt_h1, filt_h2, filt_h3);
        dst3_r = HEVC_FILT_8TAP(dst43_r, dst65_r, dst87_r, dst109_r, filt_h0,
                                filt_h1, filt_h2, filt_h3);
        dst4_r = HEVC_FILT_8TAP(dst54_r, dst76_r, dst98_r, dst1110_r, filt_h0,
                                filt_h1, filt_h2, filt_h3);
        dst5_r = HEVC_FILT_8TAP(dst65_r, dst87_r, dst109_r, dst1211_r, filt_h0,
                                filt_h1, filt_h2, filt_h3);
        dst6_r = HEVC_FILT_8TAP(dst76_r, dst98_r, dst1110_r, dst1312_r, filt_h0,
                                filt_h1, filt_h2, filt_h3);
        dst7_r = HEVC_FILT_8TAP(dst87_r, dst109_r, dst1211_r, dst1413_r,
                                filt_h0, filt_h1, filt_h2, filt_h3);

        SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
        SRA_4V(dst4_r, dst5_r, dst6_r, dst7_r, 6);
        SRARI_W4_SW(dst0_r, dst1_r, dst2_r, dst3_r, 6);
        SRARI_W4_SW(dst4_r, dst5_r, dst6_r, dst7_r, 6);
        SAT_SW4_SW(dst0_r, dst1_r, dst2_r, dst3_r, 7);
        SAT_SW4_SW(dst4_r, dst5_r, dst6_r, dst7_r, 7);
        PCKEV_H2_SW(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
        PCKEV_H2_SW(dst5_r, dst4_r, dst7_r, dst6_r, dst4_r, dst5_r);
        out0 = PCKEV_XORI128_UB(dst0_r, dst1_r);
        out1 = PCKEV_XORI128_UB(dst4_r, dst5_r);
        ST4x4_UB(out0, out0, 0, 1, 2, 3, dst, dst_stride);
        dst += (4 * dst_stride);
        ST4x4_UB(out1, out1, 0, 1, 2, 3, dst, dst_stride);
        dst += (4 * dst_stride);

        dst10_r = dst98_r;
        dst32_r = dst1110_r;
        dst54_r = dst1312_r;
        dst21_r = dst109_r;
        dst43_r = dst1211_r;
        dst65_r = dst1413_r;
        dst66 = (v8i16) __msa_splati_d((v2i64) dst1410, 1);
    }
}

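/* HV 8-tap for widths that are multiples of 8, two rows per inner
 * iteration; the outer loop walks 8-column stripes, keeping seven rows
 * of horizontal intermediates (dst0..dst6) live across iterations. */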
1476 static void hevc_hv_uni_8t_8multx2mult_msa(uint8_t *src,
1477  int32_t src_stride,
1478  uint8_t *dst,
1479  int32_t dst_stride,
1480  const int8_t *filter_x,
1481  const int8_t *filter_y,
1482  int32_t height, int32_t width)
1483 {
1484  uint32_t loop_cnt, cnt;
1485  uint8_t *src_tmp;
1486  uint8_t *dst_tmp;
1487  v16u8 out;
1488  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
1489  v8i16 filt0, filt1, filt2, filt3;
1490  v8i16 filt_h0, filt_h1, filt_h2, filt_h3;
1491  v16i8 mask1, mask2, mask3;
1492  v8i16 filter_vec;
1493  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1494  v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
1495  v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
1496  v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
1497  v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
1498  v8i16 dst10_l, dst32_l, dst54_l, dst76_l;
1499  v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
1500  v8i16 dst21_l, dst43_l, dst65_l, dst87_l;
1501  v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
1502 
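    /* the 8-tap window spans 3 rows above and 3 pixels left of each output
       sample, hence the (3 * stride + 3) rewind below */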
1503  src -= ((3 * src_stride) + 3);
1504 
1505  filter_vec = LD_SH(filter_x);
1506  SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1507 
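    /* sign-extend the 8-bit vertical taps to 16 bits, then splat them in
       adjacent pairs as 32-bit words for the vertical dot products */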
1508  filter_vec = LD_SH(filter_y);
1509  UNPCK_R_SB_SH(filter_vec, filter_vec);
1510 
1511  SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
1512 
1513  mask1 = mask0 + 2;
1514  mask2 = mask0 + 4;
1515  mask3 = mask0 + 6;
1516 
1517  for (cnt = width >> 3; cnt--;) {
1518  src_tmp = src;
1519  dst_tmp = dst;
1520 
1521  LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
1522  src_tmp += (7 * src_stride);
1523  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1524 
1525  /* row 0 row 1 row 2 row 3 */
1526  VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
1527  vec0, vec1, vec2, vec3);
1528  VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
1529  vec4, vec5, vec6, vec7);
1530  VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
1531  vec8, vec9, vec10, vec11);
1532  VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
1533  vec12, vec13, vec14, vec15);
1534  dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1535  filt3);
1536  dst1 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
1537  filt3);
1538  dst2 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
1539  filt3);
1540  dst3 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
1541  filt2, filt3);
1542 
1543  VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3,
1544  vec0, vec1, vec2, vec3);
1545  VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3,
1546  vec4, vec5, vec6, vec7);
1547  VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3,
1548  vec8, vec9, vec10, vec11);
1549  dst4 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1550  filt3);
1551  dst5 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
1552  filt3);
1553  dst6 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
1554  filt3);
1555 
1556  for (loop_cnt = height >> 1; loop_cnt--;) {
1557  LD_SB2(src_tmp, src_stride, src7, src8);
1558  XORI_B2_128_SB(src7, src8);
1559  src_tmp += 2 * src_stride;
1560 
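        /* interleave consecutive rows of horizontal results so each vector
           holds vertically adjacent 16-bit samples; the _r/_l halves feed
           the low/high four output columns */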
1561  ILVR_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1,
1562  dst10_r, dst32_r, dst54_r, dst21_r);
1563  ILVL_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1,
1564  dst10_l, dst32_l, dst54_l, dst21_l);
1565  ILVR_H2_SH(dst4, dst3, dst6, dst5, dst43_r, dst65_r);
1566  ILVL_H2_SH(dst4, dst3, dst6, dst5, dst43_l, dst65_l);
1567 
1568  VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3,
1569  vec0, vec1, vec2, vec3);
1570  dst7 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1,
1571  filt2, filt3);
1572 
1573  ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l);
1574  dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r,
1575  filt_h0, filt_h1, filt_h2, filt_h3);
1576  dst0_l = HEVC_FILT_8TAP(dst10_l, dst32_l, dst54_l, dst76_l,
1577  filt_h0, filt_h1, filt_h2, filt_h3);
1578  dst0_r >>= 6;
1579  dst0_l >>= 6;
1580 
1581  VSHF_B4_SB(src8, src8, mask0, mask1, mask2, mask3,
1582  vec0, vec1, vec2, vec3);
1583  dst8 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1,
1584  filt2, filt3);
1585 
1586  ILVRL_H2_SH(dst8, dst7, dst87_r, dst87_l);
1587  dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r,
1588  filt_h0, filt_h1, filt_h2, filt_h3);
1589  dst1_l = HEVC_FILT_8TAP(dst21_l, dst43_l, dst65_l, dst87_l,
1590  filt_h0, filt_h1, filt_h2, filt_h3);
1591  dst1_r >>= 6;
1592  dst1_l >>= 6;
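        /* the >>= 6 shifts drop the second-stage filter scaling; SRARI adds
           the rounded uni-prediction shift and SAT clamps to the signed
           8-bit range before the pack back to pixels */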
1593  SRARI_W4_SW(dst0_r, dst0_l, dst1_r, dst1_l, 6);
1594  SAT_SW4_SW(dst0_r, dst0_l, dst1_r, dst1_l, 7);
1595 
1596  PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0, dst1);
1597  out = PCKEV_XORI128_UB(dst0, dst1);
1598  ST8x2_UB(out, dst_tmp, dst_stride);
1599  dst_tmp += (2 * dst_stride);
1600 
1601  dst0 = dst2;
1602  dst1 = dst3;
1603  dst2 = dst4;
1604  dst3 = dst5;
1605  dst4 = dst6;
1606  dst5 = dst7;
1607  dst6 = dst8;
1608  }
1609 
1610  src += 8;
1611  dst += 8;
1612  }
1613 }
1614 
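/* fixed-width entry points: widths 8, 16, 24, 32, 48 and 64 forward to the
 * stripe kernel above; width 12 gets its own routine below. */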
1615 static void hevc_hv_uni_8t_8w_msa(uint8_t *src,
1616  int32_t src_stride,
1617  uint8_t *dst,
1618  int32_t dst_stride,
1619  const int8_t *filter_x,
1620  const int8_t *filter_y,
1621  int32_t height)
1622 {
1623  hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
1624  filter_x, filter_y, height, 8);
1625 }
1626 
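/* 12-width hv case: one 8-column stripe (same flow as the loop above)
 * followed by a 4-column stripe that packs two input rows per vector. */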
1627 static void hevc_hv_uni_8t_12w_msa(uint8_t *src,
1628  int32_t src_stride,
1629  uint8_t *dst,
1630  int32_t dst_stride,
1631  const int8_t *filter_x,
1632  const int8_t *filter_y,
1633  int32_t height)
1634 {
1635  uint32_t loop_cnt;
1636  uint8_t *src_tmp, *dst_tmp;
1637  v16u8 out0, out1;
1638  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1639  v16i8 src11, src12, src13, src14;
1640  v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
1641  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1642  v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
1643  v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
1644  v8i16 dst30, dst41, dst52, dst63, dst66, dst117, dst128, dst139, dst1410;
1645  v8i16 filt0, filt1, filt2, filt3, filt_h0, filt_h1, filt_h2, filt_h3;
1646  v8i16 dst10_r, dst32_r, dst54_r, dst76_r, dst21_r, dst43_r, dst65_r;
1647  v8i16 dst10_l, dst32_l, dst54_l, dst76_l, dst21_l, dst43_l, dst65_l;
1648  v8i16 dst87_r, dst98_r, dst1110_r, dst1312_r, dst109_r, dst1211_r;
1649  v8i16 dst1413_r, dst87_l, filter_vec;
1650  v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r, dst6_r, dst7_r;
1651  v4i32 dst0_l, dst1_l;
1652 
1653  src -= ((3 * src_stride) + 3);
1654 
1655  filter_vec = LD_SH(filter_x);
1656  SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1657 
1658  filter_vec = LD_SH(filter_y);
1659  UNPCK_R_SB_SH(filter_vec, filter_vec);
1660 
1661  SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
1662 
1663  mask0 = LD_SB(ff_hevc_mask_arr);
1664  mask1 = mask0 + 2;
1665  mask2 = mask0 + 4;
1666  mask3 = mask0 + 6;
1667 
1668  src_tmp = src;
1669  dst_tmp = dst;
1670 
1671  LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
1672  src_tmp += (7 * src_stride);
1673  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1674 
1675  /* row 0 row 1 row 2 row 3 */
1676  VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
1677  VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
1678  VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3, vec8, vec9, vec10,
1679  vec11);
1680  VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3, vec12, vec13, vec14,
1681  vec15);
1682  dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1683  filt3);
1684  dst1 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
1685  filt3);
1686  dst2 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
1687  filt3);
1688  dst3 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
1689  filt2, filt3);
1690 
1691  VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
1692  VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
1693  VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3, vec8, vec9, vec10,
1694  vec11);
1695  dst4 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1696  filt3);
1697  dst5 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
1698  filt3);
1699  dst6 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
1700  filt3);
1701 
1702  for (loop_cnt = 8; loop_cnt--;) {
1703  LD_SB2(src_tmp, src_stride, src7, src8);
1704  XORI_B2_128_SB(src7, src8);
1705  src_tmp += 2 * src_stride;
1706 
1707  ILVR_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1, dst10_r,
1708  dst32_r, dst54_r, dst21_r);
1709  ILVL_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1, dst10_l,
1710  dst32_l, dst54_l, dst21_l);
1711  ILVR_H2_SH(dst4, dst3, dst6, dst5, dst43_r, dst65_r);
1712  ILVL_H2_SH(dst4, dst3, dst6, dst5, dst43_l, dst65_l);
1713 
1714  VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
1715  vec3);
1716  dst7 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1717  filt3);
1718 
1719  ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l);
1720  dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r,
1721  filt_h0, filt_h1, filt_h2, filt_h3);
1722  dst0_l = HEVC_FILT_8TAP(dst10_l, dst32_l, dst54_l, dst76_l,
1723  filt_h0, filt_h1, filt_h2, filt_h3);
1724  dst0_r >>= 6;
1725  dst0_l >>= 6;
1726 
1727  VSHF_B4_SB(src8, src8, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
1728  vec3);
1729  dst8 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1730  filt3);
1731 
1732  ILVRL_H2_SH(dst8, dst7, dst87_r, dst87_l);
1733  dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r,
1734  filt_h0, filt_h1, filt_h2, filt_h3);
1735  dst1_l = HEVC_FILT_8TAP(dst21_l, dst43_l, dst65_l, dst87_l,
1736  filt_h0, filt_h1, filt_h2, filt_h3);
1737  dst1_r >>= 6;
1738  dst1_l >>= 6;
1739  SRARI_W4_SW(dst0_r, dst0_l, dst1_r, dst1_l, 6);
1740  SAT_SW4_SW(dst0_r, dst0_l, dst1_r, dst1_l, 7);
1741 
1742  PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0, dst1);
1743  out0 = PCKEV_XORI128_UB(dst0, dst1);
1744  ST8x2_UB(out0, dst_tmp, dst_stride);
1745  dst_tmp += (2 * dst_stride);
1746 
1747  dst0 = dst2;
1748  dst1 = dst3;
1749  dst2 = dst4;
1750  dst3 = dst5;
1751  dst4 = dst6;
1752  dst5 = dst7;
1753  dst6 = dst8;
1754  }
1755 
1756  src += 8;
1757  dst += 8;
1758 
1759  mask4 = LD_SB(ff_hevc_mask_arr + 16);
1760  mask5 = mask4 + 2;
1761  mask6 = mask4 + 4;
1762  mask7 = mask4 + 6;
1763 
1764  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
1765  src += (7 * src_stride);
1766  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1767 
1768  VSHF_B4_SB(src0, src3, mask4, mask5, mask6, mask7, vec0, vec1, vec2, vec3);
1769  VSHF_B4_SB(src1, src4, mask4, mask5, mask6, mask7, vec4, vec5, vec6, vec7);
1770  VSHF_B4_SB(src2, src5, mask4, mask5, mask6, mask7, vec8, vec9, vec10,
1771  vec11);
1772  VSHF_B4_SB(src3, src6, mask4, mask5, mask6, mask7, vec12, vec13, vec14,
1773  vec15);
1774 
1775  dst30 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1776  filt3);
1777  dst41 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
1778  filt3);
1779  dst52 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
1780  filt3);
1781  dst63 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2,
1782  filt3);
1783 
1784  ILVRL_H2_SH(dst41, dst30, dst10_r, dst43_r);
1785  ILVRL_H2_SH(dst52, dst41, dst21_r, dst54_r);
1786  ILVRL_H2_SH(dst63, dst52, dst32_r, dst65_r);
1787 
1788  dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1);
1789 
1790  for (loop_cnt = 2; loop_cnt--;) {
1791  LD_SB8(src, src_stride, src7, src8, src9, src10, src11, src12, src13,
1792  src14);
1793  src += (8 * src_stride);
1794  XORI_B8_128_SB(src7, src8, src9, src10, src11, src12, src13, src14);
1795 
1796  VSHF_B4_SB(src7, src11, mask4, mask5, mask6, mask7, vec0, vec1, vec2,
1797  vec3);
1798  VSHF_B4_SB(src8, src12, mask4, mask5, mask6, mask7, vec4, vec5, vec6,
1799  vec7);
1800  VSHF_B4_SB(src9, src13, mask4, mask5, mask6, mask7, vec8, vec9, vec10,
1801  vec11);
1802  VSHF_B4_SB(src10, src14, mask4, mask5, mask6, mask7, vec12, vec13,
1803  vec14, vec15);
1804 
1805  dst117 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1806  filt3);
1807  dst128 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
1808  filt3);
1809  dst139 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1,
1810  filt2, filt3);
1811  dst1410 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
1812  filt2, filt3);
1813 
1814  dst76_r = __msa_ilvr_h(dst117, dst66);
1815  ILVRL_H2_SH(dst128, dst117, dst87_r, dst1211_r);
1816  ILVRL_H2_SH(dst139, dst128, dst98_r, dst1312_r);
1817  ILVRL_H2_SH(dst1410, dst139, dst109_r, dst1413_r);
1818  dst117 = (v8i16) __msa_splati_d((v2i64) dst117, 1);
1819  dst1110_r = __msa_ilvr_h(dst117, dst1410);
1820 
1821  dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r, filt_h0,
1822  filt_h1, filt_h2, filt_h3);
1823  dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r, filt_h0,
1824  filt_h1, filt_h2, filt_h3);
1825  dst2_r = HEVC_FILT_8TAP(dst32_r, dst54_r, dst76_r, dst98_r, filt_h0,
1826  filt_h1, filt_h2, filt_h3);
1827  dst3_r = HEVC_FILT_8TAP(dst43_r, dst65_r, dst87_r, dst109_r, filt_h0,
1828  filt_h1, filt_h2, filt_h3);
1829  dst4_r = HEVC_FILT_8TAP(dst54_r, dst76_r, dst98_r, dst1110_r, filt_h0,
1830  filt_h1, filt_h2, filt_h3);
1831  dst5_r = HEVC_FILT_8TAP(dst65_r, dst87_r, dst109_r, dst1211_r, filt_h0,
1832  filt_h1, filt_h2, filt_h3);
1833  dst6_r = HEVC_FILT_8TAP(dst76_r, dst98_r, dst1110_r, dst1312_r, filt_h0,
1834  filt_h1, filt_h2, filt_h3);
1835  dst7_r = HEVC_FILT_8TAP(dst87_r, dst109_r, dst1211_r, dst1413_r,
1836  filt_h0, filt_h1, filt_h2, filt_h3);
1837 
1838  SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
1839  SRA_4V(dst4_r, dst5_r, dst6_r, dst7_r, 6);
1840  SRARI_W4_SW(dst0_r, dst1_r, dst2_r, dst3_r, 6);
1841  SRARI_W4_SW(dst4_r, dst5_r, dst6_r, dst7_r, 6);
1842  SAT_SW4_SW(dst0_r, dst1_r, dst2_r, dst3_r, 7);
1843  SAT_SW4_SW(dst4_r, dst5_r, dst6_r, dst7_r, 7);
1844  PCKEV_H2_SW(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
1845  PCKEV_H2_SW(dst5_r, dst4_r, dst7_r, dst6_r, dst4_r, dst5_r);
1846  out0 = PCKEV_XORI128_UB(dst0_r, dst1_r);
1847  out1 = PCKEV_XORI128_UB(dst4_r, dst5_r);
1848  ST4x4_UB(out0, out0, 0, 1, 2, 3, dst, dst_stride);
1849  dst += (4 * dst_stride);
1850  ST4x4_UB(out1, out1, 0, 1, 2, 3, dst, dst_stride);
1851  dst += (4 * dst_stride);
1852 
1853  dst10_r = dst98_r;
1854  dst32_r = dst1110_r;
1855  dst54_r = dst1312_r;
1856  dst21_r = dst109_r;
1857  dst43_r = dst1211_r;
1858  dst65_r = dst1413_r;
1859  dst66 = (v8i16) __msa_splati_d((v2i64) dst1410, 1);
1860  }
1861 }
1862 
1863 static void hevc_hv_uni_8t_16w_msa(uint8_t *src,
1864  int32_t src_stride,
1865  uint8_t *dst,
1866  int32_t dst_stride,
1867  const int8_t *filter_x,
1868  const int8_t *filter_y,
1869  int32_t height)
1870 {
1871  hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
1872  filter_x, filter_y, height, 16);
1873 }
1874 
1875 static void hevc_hv_uni_8t_24w_msa(uint8_t *src,
1876  int32_t src_stride,
1877  uint8_t *dst,
1878  int32_t dst_stride,
1879  const int8_t *filter_x,
1880  const int8_t *filter_y,
1881  int32_t height)
1882 {
1883  hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
1884  filter_x, filter_y, height, 24);
1885 }
1886 
1887 static void hevc_hv_uni_8t_32w_msa(uint8_t *src,
1888  int32_t src_stride,
1889  uint8_t *dst,
1890  int32_t dst_stride,
1891  const int8_t *filter_x,
1892  const int8_t *filter_y,
1893  int32_t height)
1894 {
1895  hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
1896  filter_x, filter_y, height, 32);
1897 }
1898 
1899 static void hevc_hv_uni_8t_48w_msa(uint8_t *src,
1900  int32_t src_stride,
1901  uint8_t *dst,
1902  int32_t dst_stride,
1903  const int8_t *filter_x,
1904  const int8_t *filter_y,
1905  int32_t height)
1906 {
1907  hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
1908  filter_x, filter_y, height, 48);
1909 }
1910 
1911 static void hevc_hv_uni_8t_64w_msa(uint8_t *src,
1912  int32_t src_stride,
1913  uint8_t *dst,
1914  int32_t dst_stride,
1915  const int8_t *filter_x,
1916  const int8_t *filter_y,
1917  int32_t height)
1918 {
1919  hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
1920  filter_x, filter_y, height, 64);
1921 }
1922 
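/* 4-tap (epel) horizontal uni-prediction kernels follow. The 4-width
 * variants shuffle two rows into one vector so a single dot product covers
 * both; the small fixed heights get dedicated unrolled versions. */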
1923 static void common_hz_4t_4x2_msa(uint8_t *src, int32_t src_stride,
1924  uint8_t *dst, int32_t dst_stride,
1925  const int8_t *filter)
1926 {
1927  v16i8 filt0, filt1, src0, src1, mask0, mask1, vec0, vec1;
1928  v16u8 out;
1929  v8i16 filt, res0;
1930 
1931  mask0 = LD_SB(&ff_hevc_mask_arr[16]);
1932  src -= 1;
1933 
1934  /* rearranging filter */
1935  filt = LD_SH(filter);
1936  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
1937 
1938  mask1 = mask0 + 2;
1939 
1940  LD_SB2(src, src_stride, src0, src1);
1941  XORI_B2_128_SB(src0, src1);
1942  VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
1943  res0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
1944  res0 = __msa_srari_h(res0, 6);
1945  res0 = __msa_sat_s_h(res0, 7);
1946  out = PCKEV_XORI128_UB(res0, res0);
1947  ST4x2_UB(out, dst, dst_stride);
1948 }
1949 
1950 static void common_hz_4t_4x4_msa(uint8_t *src, int32_t src_stride,
1951  uint8_t *dst, int32_t dst_stride,
1952  const int8_t *filter)
1953 {
1954  v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1;
1955  v8i16 filt, out0, out1;
1956  v16u8 out;
1957 
1958  mask0 = LD_SB(&ff_hevc_mask_arr[16]);
1959  src -= 1;
1960 
1961  /* rearranging filter */
1962  filt = LD_SH(filter);
1963  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
1964 
1965  mask1 = mask0 + 2;
1966 
1967  LD_SB4(src, src_stride, src0, src1, src2, src3);
1968  XORI_B4_128_SB(src0, src1, src2, src3);
1969  HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
1970  filt0, filt1, out0, out1);
1971  SRARI_H2_SH(out0, out1, 6);
1972  SAT_SH2_SH(out0, out1, 7);
1973  out = PCKEV_XORI128_UB(out0, out1);
1974  ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
1975 }
1976 
1977 static void common_hz_4t_4x8_msa(uint8_t *src, int32_t src_stride,
1978  uint8_t *dst, int32_t dst_stride,
1979  const int8_t *filter)
1980 {
1981  v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1;
1982  v16u8 out;
1983  v8i16 filt, out0, out1, out2, out3;
1984 
1985  mask0 = LD_SB(&ff_hevc_mask_arr[16]);
1986  src -= 1;
1987 
1988  /* rearranging filter */
1989  filt = LD_SH(filter);
1990  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
1991 
1992  mask1 = mask0 + 2;
1993 
1994  LD_SB4(src, src_stride, src0, src1, src2, src3);
1995  src += (4 * src_stride);
1996 
1997  XORI_B4_128_SB(src0, src1, src2, src3);
1998  HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
1999  filt0, filt1, out0, out1);
2000  LD_SB4(src, src_stride, src0, src1, src2, src3);
2001  XORI_B4_128_SB(src0, src1, src2, src3);
2002  HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
2003  filt0, filt1, out2, out3);
2004  SRARI_H4_SH(out0, out1, out2, out3, 6);
2005  SAT_SH4_SH(out0, out1, out2, out3, 7);
2006  out = PCKEV_XORI128_UB(out0, out1);
2007  ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
2008  dst += (4 * dst_stride);
2009  out = PCKEV_XORI128_UB(out2, out3);
2010  ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
2011 }
2012 
2013 static void common_hz_4t_4x16_msa(uint8_t *src, int32_t src_stride,
2014  uint8_t *dst, int32_t dst_stride,
2015  const int8_t *filter)
2016 {
2017  v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
2018  v16i8 filt0, filt1, mask0, mask1;
2019  v16u8 out;
2020  v8i16 filt, out0, out1, out2, out3;
2021 
2022  mask0 = LD_SB(&ff_hevc_mask_arr[16]);
2023  src -= 1;
2024 
2025  /* rearranging filter */
2026  filt = LD_SH(filter);
2027  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2028 
2029  mask1 = mask0 + 2;
2030 
2031  LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
2032  src += (8 * src_stride);
2033  XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
2034  HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
2035  filt0, filt1, out0, out1);
2036  HORIZ_4TAP_4WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1,
2037  filt0, filt1, out2, out3);
2038  SRARI_H4_SH(out0, out1, out2, out3, 6);
2039  SAT_SH4_SH(out0, out1, out2, out3, 7);
2040  out = PCKEV_XORI128_UB(out0, out1);
2041  ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
2042  dst += (4 * dst_stride);
2043  out = PCKEV_XORI128_UB(out2, out3);
2044  ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
2045  dst += (4 * dst_stride);
2046 
2047  LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
2048  src += (8 * src_stride);
2049  XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
2050  HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
2051  filt0, filt1, out0, out1);
2052  HORIZ_4TAP_4WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1,
2053  filt0, filt1, out2, out3);
2054  SRARI_H4_SH(out0, out1, out2, out3, 6);
2055  SAT_SH4_SH(out0, out1, out2, out3, 7);
2056  out = PCKEV_XORI128_UB(out0, out1);
2057  ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
2058  dst += (4 * dst_stride);
2059  out = PCKEV_XORI128_UB(out2, out3);
2060  ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
2061 }
2062 
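/* For reference, a scalar sketch of what one 4-tap horizontal output pixel
 * computes; epel_h_pixel is a hypothetical helper, not part of this file,
 * and av_clip_uint8() is the libavutil clamp:
 *
 *     static inline uint8_t epel_h_pixel(const uint8_t *src,
 *                                        const int8_t *filter)
 *     {
 *         int sum = src[-1] * filter[0] + src[0] * filter[1] +
 *                   src[1] * filter[2] + src[2] * filter[3];
 *         return av_clip_uint8((sum + 32) >> 6);
 *     }
 *
 * (sum + 32) >> 6 is what SRARI_H(..., 6) computes; the vector code does 4
 * to 16 pixels per instruction on 128-biased samples (the XORI tricks) so
 * signed multiplies apply, and since the taps sum to 64 the bias cancels. */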
2063 static void common_hz_4t_4w_msa(uint8_t *src, int32_t src_stride,
2064  uint8_t *dst, int32_t dst_stride,
2065  const int8_t *filter, int32_t height)
2066 {
2067  if (2 == height) {
2068  common_hz_4t_4x2_msa(src, src_stride, dst, dst_stride, filter);
2069  } else if (4 == height) {
2070  common_hz_4t_4x4_msa(src, src_stride, dst, dst_stride, filter);
2071  } else if (8 == height) {
2072  common_hz_4t_4x8_msa(src, src_stride, dst, dst_stride, filter);
2073  } else if (16 == height) {
2074  common_hz_4t_4x16_msa(src, src_stride, dst, dst_stride, filter);
2075  }
2076 }
2077 
2078 static void common_hz_4t_6w_msa(uint8_t *src, int32_t src_stride,
2079  uint8_t *dst, int32_t dst_stride,
2080  const int8_t *filter, int32_t height)
2081 {
2082  v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1;
2083  v16u8 out4, out5;
2084  v8i16 filt, out0, out1, out2, out3;
2085 
2086  mask0 = LD_SB(&ff_hevc_mask_arr[0]);
2087  src -= 1;
2088 
2089  /* rearranging filter */
2090  filt = LD_SH(filter);
2091  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2092 
2093  mask1 = mask0 + 2;
2094 
2095  LD_SB4(src, src_stride, src0, src1, src2, src3);
2096  src += (4 * src_stride);
2097 
2098  XORI_B4_128_SB(src0, src1, src2, src3);
2099  HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0,
2100  filt1, out0, out1, out2, out3);
2101  SRARI_H4_SH(out0, out1, out2, out3, 6);
2102  SAT_SH4_SH(out0, out1, out2, out3, 7);
2103  out4 = PCKEV_XORI128_UB(out0, out1);
2104  out5 = PCKEV_XORI128_UB(out2, out3);
2105  ST6x4_UB(out4, out5, dst, dst_stride);
2106  dst += (4 * dst_stride);
2107 
2108  LD_SB4(src, src_stride, src0, src1, src2, src3);
2109  src += (4 * src_stride);
2110 
2111  XORI_B4_128_SB(src0, src1, src2, src3);
2112  HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0,
2113  filt1, out0, out1, out2, out3);
2114  SRARI_H4_SH(out0, out1, out2, out3, 6);
2115  SAT_SH4_SH(out0, out1, out2, out3, 7);
2116  out4 = PCKEV_XORI128_UB(out0, out1);
2117  out5 = PCKEV_XORI128_UB(out2, out3);
2118  ST6x4_UB(out4, out5, dst, dst_stride);
2119  dst += (4 * dst_stride);
2120 }
2121 
2122 static void common_hz_4t_8x2mult_msa(uint8_t *src, int32_t src_stride,
2123  uint8_t *dst, int32_t dst_stride,
2124  const int8_t *filter, int32_t height)
2125 {
2126  uint32_t loop_cnt;
2127  v16i8 src0, src1, filt0, filt1, mask0, mask1;
2128  v16u8 out;
2129  v8i16 filt, vec0, vec1, vec2, vec3;
2130 
2131  mask0 = LD_SB(&ff_hevc_mask_arr[0]);
2132  src -= 1;
2133 
2134  filt = LD_SH(filter);
2135  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2136 
2137  mask1 = mask0 + 2;
2138 
2139  for (loop_cnt = (height >> 1); loop_cnt--;) {
2140  LD_SB2(src, src_stride, src0, src1);
2141  src += (2 * src_stride);
2142 
2143  XORI_B2_128_SB(src0, src1);
2144  VSHF_B2_SH(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
2145  DOTP_SB2_SH(vec0, vec1, filt0, filt0, vec0, vec1);
2146  VSHF_B2_SH(src0, src0, src1, src1, mask1, mask1, vec2, vec3);
2147  DPADD_SB2_SH(vec2, vec3, filt1, filt1, vec0, vec1);
2148  SRARI_H2_SH(vec0, vec1, 6);
2149  SAT_SH2_SH(vec0, vec1, 7);
2150  out = PCKEV_XORI128_UB(vec0, vec1);
2151  ST8x2_UB(out, dst, dst_stride);
2152  dst += (2 * dst_stride);
2153  }
2154 }
2155 
2156 static void common_hz_4t_8x4mult_msa(uint8_t *src, int32_t src_stride,
2157  uint8_t *dst, int32_t dst_stride,
2158  const int8_t *filter, int32_t height)
2159 {
2160  uint32_t loop_cnt;
2161  v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1;
2162  v16u8 tmp0, tmp1;
2163  v8i16 filt, out0, out1, out2, out3;
2164 
2165  mask0 = LD_SB(&ff_hevc_mask_arr[0]);
2166  src -= 1;
2167 
2168  /* rearranging filter */
2169  filt = LD_SH(filter);
2170  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2171 
2172  mask1 = mask0 + 2;
2173 
2174  for (loop_cnt = (height >> 2); loop_cnt--;) {
2175  LD_SB4(src, src_stride, src0, src1, src2, src3);
2176  src += (4 * src_stride);
2177 
2178  XORI_B4_128_SB(src0, src1, src2, src3);
2179  HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0,
2180  filt1, out0, out1, out2, out3);
2181  SRARI_H4_SH(out0, out1, out2, out3, 6);
2182  SAT_SH4_SH(out0, out1, out2, out3, 7);
2183  tmp0 = PCKEV_XORI128_UB(out0, out1);
2184  tmp1 = PCKEV_XORI128_UB(out2, out3);
2185  ST8x4_UB(tmp0, tmp1, dst, dst_stride);
2186  dst += (4 * dst_stride);
2187  }
2188 }
2189 
2190 static void common_hz_4t_8w_msa(uint8_t *src, int32_t src_stride,
2191  uint8_t *dst, int32_t dst_stride,
2192  const int8_t *filter, int32_t height)
2193 {
2194  if ((2 == height) || (6 == height)) {
2195  common_hz_4t_8x2mult_msa(src, src_stride, dst, dst_stride, filter,
2196  height);
2197  } else {
2198  common_hz_4t_8x4mult_msa(src, src_stride, dst, dst_stride, filter,
2199  height);
2200  }
2201 }
2202 
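/* 12-width horizontal case: mask2/mask3 (from the second half of
 * ff_hevc_mask_arr) pull columns 8-11 of two rows into one vector, stored
 * at dst + 8, while the plain masks cover the first eight columns. */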
2203 static void common_hz_4t_12w_msa(uint8_t *src, int32_t src_stride,
2204  uint8_t *dst, int32_t dst_stride,
2205  const int8_t *filter, int32_t height)
2206 {
2207  uint32_t loop_cnt;
2208  v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1, mask2, mask3;
2209  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
2210  v16i8 vec10, vec11;
2211  v16u8 tmp0, tmp1;
2212  v8i16 filt, out0, out1, out2, out3, out4, out5;
2213 
2214  mask0 = LD_SB(&ff_hevc_mask_arr[0]);
2215  mask2 = LD_SB(&ff_hevc_mask_arr[32]);
2216 
2217  src -= 1;
2218 
2219  /* rearranging filter */
2220  filt = LD_SH(filter);
2221  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2222 
2223  mask1 = mask0 + 2;
2224  mask3 = mask2 + 2;
2225 
2226  for (loop_cnt = 4; loop_cnt--;) {
2227  LD_SB4(src, src_stride, src0, src1, src2, src3);
2228  src += (4 * src_stride);
2229 
2230  XORI_B4_128_SB(src0, src1, src2, src3);
2231  VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec0, vec1);
2232  DOTP_SB2_SH(vec0, vec1, filt0, filt0, out0, out1);
2233  VSHF_B2_SB(src0, src1, src2, src3, mask3, mask3, vec2, vec3);
2234  DPADD_SB2_SH(vec2, vec3, filt1, filt1, out0, out1);
2235  SRARI_H2_SH(out0, out1, 6);
2236  SAT_SH2_SH(out0, out1, 7);
2237  tmp0 = PCKEV_XORI128_UB(out0, out1);
2238  ST4x4_UB(tmp0, tmp0, 0, 1, 2, 3, dst + 8, dst_stride);
2239 
2240  VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec4, vec5);
2241  VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec6, vec7);
2242  DOTP_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
2243  out2, out3, out4, out5);
2244  VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec8, vec9);
2245  VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec10, vec11);
2246  DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt1, filt1, filt1, filt1,
2247  out2, out3, out4, out5);
2248  SRARI_H4_SH(out2, out3, out4, out5, 6);
2249  SAT_SH4_SH(out2, out3, out4, out5, 7);
2250  tmp0 = PCKEV_XORI128_UB(out2, out3);
2251  tmp1 = PCKEV_XORI128_UB(out4, out5);
2252  ST8x4_UB(tmp0, tmp1, dst, dst_stride);
2253  dst += (4 * dst_stride);
2254  }
2255 }
2256 
2257 static void common_hz_4t_16w_msa(uint8_t *src, int32_t src_stride,
2258  uint8_t *dst, int32_t dst_stride,
2259  const int8_t *filter, int32_t height)
2260 {
2261  uint32_t loop_cnt;
2262  v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
2263  v16i8 filt0, filt1, mask0, mask1;
2264  v16i8 vec0_m, vec1_m, vec2_m, vec3_m;
2265  v8i16 filt, out0, out1, out2, out3, out4, out5, out6, out7;
2266  v16u8 out;
2267 
2268  mask0 = LD_SB(&ff_hevc_mask_arr[0]);
2269  src -= 1;
2270 
2271  /* rearranging filter */
2272  filt = LD_SH(filter);
2273  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2274 
2275  mask1 = mask0 + 2;
2276 
2277  for (loop_cnt = (height >> 2); loop_cnt--;) {
2278  LD_SB4(src, src_stride, src0, src2, src4, src6);
2279  LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
2280  src += (4 * src_stride);
2281 
2282  XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
2283 
2284  VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m);
2285  VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m);
2286  DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0,
2287  out0, out1, out2, out3);
2288  VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0_m, vec1_m);
2289  VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2_m, vec3_m);
2290  DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt1, filt1, filt1, filt1,
2291  out0, out1, out2, out3);
2292  SRARI_H4_SH(out0, out1, out2, out3, 6);
2293  SAT_SH4_SH(out0, out1, out2, out3, 7);
2294  out = PCKEV_XORI128_UB(out0, out1);
2295  ST_UB(out, dst);
2296  dst += dst_stride;
2297  out = PCKEV_XORI128_UB(out2, out3);
2298  ST_UB(out, dst);
2299  dst += dst_stride;
2300 
2301  VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0_m, vec1_m);
2302  VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec2_m, vec3_m);
2303  DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0,
2304  out4, out5, out6, out7);
2305  VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec0_m, vec1_m);
2306  VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec2_m, vec3_m);
2307  DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt1, filt1, filt1, filt1,
2308  out4, out5, out6, out7);
2309  SRARI_H4_SH(out4, out5, out6, out7, 6);
2310  SAT_SH4_SH(out4, out5, out6, out7, 7);
2311  out = PCKEV_XORI128_UB(out4, out5);
2312  ST_UB(out, dst);
2313  dst += dst_stride;
2314  out = PCKEV_XORI128_UB(out6, out7);
2315  ST_UB(out, dst);
2316  dst += dst_stride;
2317  }
2318 }
2319 
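/* 24-width horizontal case: a 16-column pass that uses mask00/mask11 to
 * shuffle across the 16-byte register boundary, plus an 8-column pass
 * written at dst + 16 through dst1. */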
2320 static void common_hz_4t_24w_msa(uint8_t *src, int32_t src_stride,
2321  uint8_t *dst, int32_t dst_stride,
2322  const int8_t *filter, int32_t height)
2323 {
2324  uint8_t *dst1 = dst + 16;
2325  uint32_t loop_cnt;
2326  v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
2327  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
2328  v16i8 filt0, filt1, mask0, mask1, mask00, mask11;
2329  v8i16 filt, out0, out1, out2, out3;
2330  v16u8 tmp0, tmp1;
2331 
2332  mask0 = LD_SB(&ff_hevc_mask_arr[0]);
2333  src -= 1;
2334 
2335  /* rearranging filter */
2336  filt = LD_SH(filter);
2337  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2338 
2339  mask1 = mask0 + 2;
2340  mask00 = mask0 + 8;
2341  mask11 = mask0 + 10;
2342 
2343  for (loop_cnt = 8; loop_cnt--;) {
2344  LD_SB4(src, src_stride, src0, src2, src4, src6);
2345  LD_SB4(src + 16, src_stride, src1, src3, src5, src7);
2346  src += (4 * src_stride);
2347 
2348  XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
2349  VSHF_B2_SB(src0, src0, src0, src1, mask0, mask00, vec0, vec1);
2350  VSHF_B2_SB(src2, src2, src2, src3, mask0, mask00, vec2, vec3);
2351  VSHF_B2_SB(src0, src0, src0, src1, mask1, mask11, vec4, vec5);
2352  VSHF_B2_SB(src2, src2, src2, src3, mask1, mask11, vec6, vec7);
2353  DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
2354  out0, out1, out2, out3);
2355  DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1,
2356  out0, out1, out2, out3);
2357  SRARI_H4_SH(out0, out1, out2, out3, 6);
2358  SAT_SH4_SH(out0, out1, out2, out3, 7);
2359  tmp0 = PCKEV_XORI128_UB(out0, out1);
2360  ST_UB(tmp0, dst);
2361  dst += dst_stride;
2362  tmp0 = PCKEV_XORI128_UB(out2, out3);
2363  ST_UB(tmp0, dst);
2364  dst += dst_stride;
2365 
2366  VSHF_B2_SB(src4, src4, src4, src5, mask0, mask00, vec0, vec1);
2367  VSHF_B2_SB(src6, src6, src6, src7, mask0, mask00, vec2, vec3);
2368  VSHF_B2_SB(src4, src4, src4, src5, mask1, mask11, vec4, vec5);
2369  VSHF_B2_SB(src6, src6, src6, src7, mask1, mask11, vec6, vec7);
2370  DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
2371  out0, out1, out2, out3);
2372  DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1,
2373  out0, out1, out2, out3);
2374  SRARI_H4_SH(out0, out1, out2, out3, 6);
2375  SAT_SH4_SH(out0, out1, out2, out3, 7);
2376  tmp0 = PCKEV_XORI128_UB(out0, out1);
2377  ST_UB(tmp0, dst);
2378  dst += dst_stride;
2379  tmp0 = PCKEV_XORI128_UB(out2, out3);
2380  ST_UB(tmp0, dst);
2381  dst += dst_stride;
2382 
2383  /* 8 width */
2384  VSHF_B2_SB(src1, src1, src3, src3, mask0, mask0, vec0, vec1);
2385  VSHF_B2_SB(src5, src5, src7, src7, mask0, mask0, vec2, vec3);
2386  VSHF_B2_SB(src1, src1, src3, src3, mask1, mask1, vec4, vec5);
2387  VSHF_B2_SB(src5, src5, src7, src7, mask1, mask1, vec6, vec7);
2388 
2389  DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
2390  out0, out1, out2, out3);
2391  DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1,
2392  out0, out1, out2, out3);
2393 
2394  SRARI_H4_SH(out0, out1, out2, out3, 6);
2395  SAT_SH4_SH(out0, out1, out2, out3, 7);
2396  tmp0 = PCKEV_XORI128_UB(out0, out1);
2397  tmp1 = PCKEV_XORI128_UB(out2, out3);
2398  ST8x4_UB(tmp0, tmp1, dst1, dst_stride);
2399  dst1 += (4 * dst_stride);
2400  }
2401 }
2402 
2403 static void common_hz_4t_32w_msa(uint8_t *src, int32_t src_stride,
2404  uint8_t *dst, int32_t dst_stride,
2405  const int8_t *filter, int32_t height)
2406 {
2407  uint32_t loop_cnt;
2408  v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
2409  v16i8 filt0, filt1, mask0, mask1;
2410  v16u8 out;
2411  v16i8 vec0_m, vec1_m, vec2_m, vec3_m;
2412  v8i16 filt, out0, out1, out2, out3, out4, out5, out6, out7;
2413 
2414  mask0 = LD_SB(&ff_hevc_mask_arr[0]);
2415  src -= 1;
2416 
2417  /* rearranging filter */
2418  filt = LD_SH(filter);
2419  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2420 
2421  mask1 = mask0 + 2;
2422 
2423  for (loop_cnt = (height >> 1); loop_cnt--;) {
2424  src0 = LD_SB(src);
2425  src1 = LD_SB(src + 8);
2426  src2 = LD_SB(src + 16);
2427  src3 = LD_SB(src + 24);
2428  src += src_stride;
2429  src4 = LD_SB(src);
2430  src5 = LD_SB(src + 8);
2431  src6 = LD_SB(src + 16);
2432  src7 = LD_SB(src + 24);
2433  src += src_stride;
2434 
2435  XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
2436 
2437  VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m);
2438  VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m);
2439  DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0,
2440  out0, out1, out2, out3);
2441  VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0_m, vec1_m);
2442  VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2_m, vec3_m);
2443  DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt1, filt1, filt1, filt1,
2444  out0, out1, out2, out3);
2445 
2446  VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0_m, vec1_m);
2447  VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec2_m, vec3_m);
2448  DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0,
2449  out4, out5, out6, out7);
2450  VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec0_m, vec1_m);
2451  VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec2_m, vec3_m);
2452  DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt1, filt1, filt1, filt1,
2453  out4, out5, out6, out7);
2454  SRARI_H4_SH(out0, out1, out2, out3, 6);
2455  SRARI_H4_SH(out4, out5, out6, out7, 6);
2456  SAT_SH4_SH(out0, out1, out2, out3, 7);
2457  SAT_SH4_SH(out4, out5, out6, out7, 7);
2458  out = PCKEV_XORI128_UB(out0, out1);
2459  ST_UB(out, dst);
2460  out = PCKEV_XORI128_UB(out2, out3);
2461  ST_UB(out, dst + 16);
2462  dst += dst_stride;
2463  out = PCKEV_XORI128_UB(out4, out5);
2464  ST_UB(out, dst);
2465  out = PCKEV_XORI128_UB(out6, out7);
2466  ST_UB(out, dst + 16);
2467  dst += dst_stride;
2468  }
2469 }
2470 
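/* 4-tap (epel) vertical uni-prediction kernels: rows are byte-interleaved
 * so the column filter becomes the same dot-product pattern as the
 * horizontal one. */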
2471 static void common_vt_4t_4x2_msa(uint8_t *src, int32_t src_stride,
2472  uint8_t *dst, int32_t dst_stride,
2473  const int8_t *filter)
2474 {
2475  v16i8 src0, src1, src2, src3, src4, src10_r, src32_r, src21_r, src43_r;
2476  v16i8 src2110, src4332, filt0, filt1;
2477  v16u8 out;
2478  v8i16 filt, out10;
2479 
2480  src -= src_stride;
2481 
2482  filt = LD_SH(filter);
2483  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2484 
2485  LD_SB3(src, src_stride, src0, src1, src2);
2486  src += (3 * src_stride);
2487 
2488  ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
2489  src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
2490  src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
2491  LD_SB2(src, src_stride, src3, src4);
2492  ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
2493  src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_r, (v2i64) src32_r);
2494  src4332 = (v16i8) __msa_xori_b((v16u8) src4332, 128);
2495  out10 = HEVC_FILT_4TAP_SH(src2110, src4332, filt0, filt1);
2496  out10 = __msa_srari_h(out10, 6);
2497  out10 = __msa_sat_s_h(out10, 7);
2498  out = PCKEV_XORI128_UB(out10, out10);
2499  ST4x2_UB(out, dst, dst_stride);
2500 }
2501 
2502 static void common_vt_4t_4x4multiple_msa(uint8_t *src, int32_t src_stride,
2503  uint8_t *dst, int32_t dst_stride,
2504  const int8_t *filter, int32_t height)
2505 {
2506  uint32_t loop_cnt;
2507  v16i8 src0, src1, src2, src3, src4, src5;
2508  v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r;
2509  v16i8 src2110, src4332, filt0, filt1;
2510  v8i16 filt, out10, out32;
2511  v16u8 out;
2512 
2513  src -= src_stride;
2514 
2515  filt = LD_SH(filter);
2516  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2517 
2518  LD_SB3(src, src_stride, src0, src1, src2);
2519  src += (3 * src_stride);
2520 
2521  ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
2522 
2523  src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
2524  src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
2525 
2526  for (loop_cnt = (height >> 2); loop_cnt--;) {
2527  LD_SB3(src, src_stride, src3, src4, src5);
2528  src += (3 * src_stride);
2529  ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
2530  src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_r, (v2i64) src32_r);
2531  src4332 = (v16i8) __msa_xori_b((v16u8) src4332, 128);
2532  out10 = HEVC_FILT_4TAP_SH(src2110, src4332, filt0, filt1);
2533 
2534  src2 = LD_SB(src);
2535  src += (src_stride);
2536  ILVR_B2_SB(src5, src4, src2, src5, src54_r, src65_r);
2537  src2110 = (v16i8) __msa_ilvr_d((v2i64) src65_r, (v2i64) src54_r);
2538  src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
2539  out32 = HEVC_FILT_4TAP_SH(src4332, src2110, filt0, filt1);
2540  SRARI_H2_SH(out10, out32, 6);
2541  SAT_SH2_SH(out10, out32, 7);
2542  out = PCKEV_XORI128_UB(out10, out32);
2543  ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
2544  dst += (4 * dst_stride);
2545  }
2546 }
2547 
2548 static void common_vt_4t_4w_msa(uint8_t *src, int32_t src_stride,
2549  uint8_t *dst, int32_t dst_stride,
2550  const int8_t *filter, int32_t height)
2551 {
2552  if (2 == height) {
2553  common_vt_4t_4x2_msa(src, src_stride, dst, dst_stride, filter);
2554  } else {
2555  common_vt_4t_4x4multiple_msa(src, src_stride, dst, dst_stride, filter,
2556  height);
2557  }
2558 }
2559 
2560 static void common_vt_4t_6w_msa(uint8_t *src, int32_t src_stride,
2561  uint8_t *dst, int32_t dst_stride,
2562  const int8_t *filter, int32_t height)
2563 {
2564  v16u8 out0, out1;
2565  v16i8 src0, src1, src2, src3, src4, src5, src6;
2566  v16i8 src10_r, src32_r, src21_r, src43_r, src54_r, src65_r;
2567  v8i16 dst0_r, dst1_r, dst2_r, dst3_r, filt0, filt1, filter_vec;
2568 
2569  src -= src_stride;
2570 
2571  filter_vec = LD_SH(filter);
2572  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2573 
2574  LD_SB3(src, src_stride, src0, src1, src2);
2575  src += (3 * src_stride);
2576  XORI_B3_128_SB(src0, src1, src2);
2577  ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
2578 
2579  LD_SB2(src, src_stride, src3, src4);
2580  src += (2 * src_stride);
2581  XORI_B2_128_SB(src3, src4);
2582  ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
2583 
2584  dst0_r = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
2585  dst1_r = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
2586 
2587  LD_SB2(src, src_stride, src5, src6);
2588  src += (2 * src_stride);
2589  XORI_B2_128_SB(src5, src6);
2590  ILVR_B2_SB(src5, src4, src6, src5, src54_r, src65_r);
2591 
2592  dst2_r = HEVC_FILT_4TAP_SH(src32_r, src54_r, filt0, filt1);
2593  dst3_r = HEVC_FILT_4TAP_SH(src43_r, src65_r, filt0, filt1);
2594 
2595  SRARI_H4_SH(dst0_r, dst1_r, dst2_r, dst3_r, 6);
2596  SAT_SH4_SH(dst0_r, dst1_r, dst2_r, dst3_r, 7);
2597  out0 = PCKEV_XORI128_UB(dst0_r, dst1_r);
2598  out1 = PCKEV_XORI128_UB(dst2_r, dst3_r);
2599  ST6x4_UB(out0, out1, dst, dst_stride);
2600  dst += (4 * dst_stride);
2601 
2602  LD_SB2(src, src_stride, src3, src4);
2603  src += (2 * src_stride);
2604  XORI_B2_128_SB(src3, src4);
2605  ILVR_B2_SB(src3, src6, src4, src3, src32_r, src43_r);
2606 
2607  dst0_r = HEVC_FILT_4TAP_SH(src54_r, src32_r, filt0, filt1);
2608  dst1_r = HEVC_FILT_4TAP_SH(src65_r, src43_r, filt0, filt1);
2609 
2610  LD_SB2(src, src_stride, src5, src6);
2611  src += (2 * src_stride);
2612  XORI_B2_128_SB(src5, src6);
2613  ILVR_B2_SB(src5, src4, src6, src5, src54_r, src65_r);
2614 
2615  dst2_r = HEVC_FILT_4TAP_SH(src32_r, src54_r, filt0, filt1);
2616  dst3_r = HEVC_FILT_4TAP_SH(src43_r, src65_r, filt0, filt1);
2617 
2618  SRARI_H4_SH(dst0_r, dst1_r, dst2_r, dst3_r, 6);
2619  SAT_SH4_SH(dst0_r, dst1_r, dst2_r, dst3_r, 7);
2620  out0 = PCKEV_XORI128_UB(dst0_r, dst1_r);
2621  out1 = PCKEV_XORI128_UB(dst2_r, dst3_r);
2622  ST6x4_UB(out0, out1, dst, dst_stride);
2623 }
2624 
2625 static void common_vt_4t_8x2_msa(uint8_t *src, int32_t src_stride,
2626  uint8_t *dst, int32_t dst_stride,
2627  const int8_t *filter)
2628 {
2629  v16i8 src0, src1, src2, src3, src4;
2630  v8i16 src01, src12, src23, src34, tmp0, tmp1, filt, filt0, filt1;
2631  v16u8 out;
2632 
2633  src -= src_stride;
2634 
2635  /* rearranging filter_y */
2636  filt = LD_SH(filter);
2637  SPLATI_H2_SH(filt, 0, 1, filt0, filt1);
2638 
2639  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
2640  XORI_B5_128_SB(src0, src1, src2, src3, src4);
2641  ILVR_B2_SH(src1, src0, src3, src2, src01, src23);
2642  tmp0 = HEVC_FILT_4TAP_SH(src01, src23, filt0, filt1);
2643  ILVR_B2_SH(src2, src1, src4, src3, src12, src34);
2644  tmp1 = HEVC_FILT_4TAP_SH(src12, src34, filt0, filt1);
2645  SRARI_H2_SH(tmp0, tmp1, 6);
2646  SAT_SH2_SH(tmp0, tmp1, 7);
2647  out = PCKEV_XORI128_UB(tmp0, tmp1);
2648  ST8x2_UB(out, dst, dst_stride);
2649 }
2650 
2651 static void common_vt_4t_8x6_msa(uint8_t *src, int32_t src_stride,
2652  uint8_t *dst, int32_t dst_stride,
2653  const int8_t *filter)
2654 {
2655  uint32_t loop_cnt;
2656  uint64_t out0, out1, out2;
2657  v16i8 src0, src1, src2, src3, src4, src5;
2658  v8i16 vec0, vec1, vec2, vec3, vec4, tmp0, tmp1, tmp2;
2659  v8i16 filt, filt0, filt1;
2660 
2661  src -= src_stride;
2662 
2663  /* rearranging filter_y */
2664  filt = LD_SH(filter);
2665  SPLATI_H2_SH(filt, 0, 1, filt0, filt1);
2666 
2667  LD_SB3(src, src_stride, src0, src1, src2);
2668  src += (3 * src_stride);
2669 
2670  XORI_B3_128_SB(src0, src1, src2);
2671  ILVR_B2_SH(src1, src0, src2, src1, vec0, vec2);
2672 
2673  for (loop_cnt = 2; loop_cnt--;) {
2674  LD_SB3(src, src_stride, src3, src4, src5);
2675  src += (3 * src_stride);
2676 
2677  XORI_B3_128_SB(src3, src4, src5);
2678  ILVR_B3_SH(src3, src2, src4, src3, src5, src4, vec1, vec3, vec4);
2679  tmp0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
2680  tmp1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
2681  tmp2 = HEVC_FILT_4TAP_SH(vec1, vec4, filt0, filt1);
2682  SRARI_H2_SH(tmp0, tmp1, 6);
2683  tmp2 = __msa_srari_h(tmp2, 6);
2684  SAT_SH3_SH(tmp0, tmp1, tmp2, 7);
2685  PCKEV_B2_SH(tmp1, tmp0, tmp2, tmp2, tmp0, tmp2);
2686  XORI_B2_128_SH(tmp0, tmp2);
2687 
2688  out0 = __msa_copy_u_d((v2i64) tmp0, 0);
2689  out1 = __msa_copy_u_d((v2i64) tmp0, 1);
2690  out2 = __msa_copy_u_d((v2i64) tmp2, 0);
2691  SD(out0, dst);
2692  dst += dst_stride;
2693  SD(out1, dst);
2694  dst += dst_stride;
2695  SD(out2, dst);
2696  dst += dst_stride;
2697 
2698  src2 = src5;
2699  vec0 = vec3;
2700  vec2 = vec4;
2701  }
2702 }
2703 
2704 static void common_vt_4t_8x4mult_msa(uint8_t *src, int32_t src_stride,
2705  uint8_t *dst, int32_t dst_stride,
2706  const int8_t *filter, int32_t height)
2707 {
2708  uint32_t loop_cnt;
2709  v16i8 src0, src1, src2, src7, src8, src9, src10;
2710  v16i8 src10_r, src72_r, src98_r, src21_r, src87_r, src109_r, filt0, filt1;
2711  v16u8 tmp0, tmp1;
2712  v8i16 filt, out0_r, out1_r, out2_r, out3_r;
2713 
2714  src -= src_stride;
2715 
2716  filt = LD_SH(filter);
2717  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2718 
2719  LD_SB3(src, src_stride, src0, src1, src2);
2720  src += (3 * src_stride);
2721 
2722  XORI_B3_128_SB(src0, src1, src2);
2723  ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
2724 
2725  for (loop_cnt = (height >> 2); loop_cnt--;) {
2726  LD_SB4(src, src_stride, src7, src8, src9, src10);
2727  src += (4 * src_stride);
2728 
2729  XORI_B4_128_SB(src7, src8, src9, src10);
2730  ILVR_B4_SB(src7, src2, src8, src7, src9, src8, src10, src9,
2731  src72_r, src87_r, src98_r, src109_r);
2732  out0_r = HEVC_FILT_4TAP_SH(src10_r, src72_r, filt0, filt1);
2733  out1_r = HEVC_FILT_4TAP_SH(src21_r, src87_r, filt0, filt1);
2734  out2_r = HEVC_FILT_4TAP_SH(src72_r, src98_r, filt0, filt1);
2735  out3_r = HEVC_FILT_4TAP_SH(src87_r, src109_r, filt0, filt1);
2736  SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
2737  SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
2738  tmp0 = PCKEV_XORI128_UB(out0_r, out1_r);
2739  tmp1 = PCKEV_XORI128_UB(out2_r, out3_r);
2740  ST8x4_UB(tmp0, tmp1, dst, dst_stride);
2741  dst += (4 * dst_stride);
2742 
2743  src10_r = src98_r;
2744  src21_r = src109_r;
2745  src2 = src10;
2746  }
2747 }
2748 
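/* height dispatch for the 8-width vertical case: heights 2 and 6 have
 * dedicated kernels, all other heights take the four-rows-per-iteration
 * loop. */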
2749 static void common_vt_4t_8w_msa(uint8_t *src, int32_t src_stride,
2750  uint8_t *dst, int32_t dst_stride,
2751  const int8_t *filter, int32_t height)
2752 {
2753  if (2 == height) {
2754  common_vt_4t_8x2_msa(src, src_stride, dst, dst_stride, filter);
2755  } else if (6 == height) {
2756  common_vt_4t_8x6_msa(src, src_stride, dst, dst_stride, filter);
2757  } else {
2758  common_vt_4t_8x4mult_msa(src, src_stride, dst, dst_stride,
2759  filter, height);
2760  }
2761 }
2762 
2763 static void common_vt_4t_12w_msa(uint8_t *src, int32_t src_stride,
2764  uint8_t *dst, int32_t dst_stride,
2765  const int8_t *filter, int32_t height)
2766 {
2767  uint32_t loop_cnt;
2768  v16i8 src0, src1, src2, src3, src4, src5, src6;
2769  v16u8 out0, out1;
2770  v16i8 src10_r, src32_r, src21_r, src43_r, src54_r, src65_r;
2771  v16i8 src10_l, src32_l, src54_l, src21_l, src43_l, src65_l;
2772  v16i8 src2110, src4332, src6554;
2773  v8i16 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, filt0, filt1;
2774  v8i16 filter_vec;
2775 
2776  src -= (1 * src_stride);
2777 
2778  filter_vec = LD_SH(filter);
2779  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2780 
2781  LD_SB3(src, src_stride, src0, src1, src2);
2782  src += (3 * src_stride);
2783 
2784  XORI_B3_128_SB(src0, src1, src2);
2785  ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
2786  ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
2787  src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_l, (v2i64) src10_l);
2788 
2789  for (loop_cnt = 4; loop_cnt--;) {
2790  LD_SB4(src, src_stride, src3, src4, src5, src6);
2791  src += (4 * src_stride);
2792 
2793  XORI_B4_128_SB(src3, src4, src5, src6);
2794  ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
2795  ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
2796  src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_l, (v2i64) src32_l);
2797  ILVR_B2_SB(src5, src4, src6, src5, src54_r, src65_r);
2798  ILVL_B2_SB(src5, src4, src6, src5, src54_l, src65_l);
2799  src6554 = (v16i8) __msa_ilvr_d((v2i64) src65_l, (v2i64) src54_l);
2800 
2801  dst0_r = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
2802  dst1_r = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
2803  dst0_l = HEVC_FILT_4TAP_SH(src2110, src4332, filt0, filt1);
2804  dst2_r = HEVC_FILT_4TAP_SH(src32_r, src54_r, filt0, filt1);
2805  dst3_r = HEVC_FILT_4TAP_SH(src43_r, src65_r, filt0, filt1);
2806  dst1_l = HEVC_FILT_4TAP_SH(src4332, src6554, filt0, filt1);
2807 
2808  SRARI_H4_SH(dst0_r, dst1_r, dst2_r, dst3_r, 6);
2809  SRARI_H2_SH(dst0_l, dst1_l, 6);
2810  SAT_SH4_SH(dst0_r, dst1_r, dst2_r, dst3_r, 7);
2811  SAT_SH2_SH(dst0_l, dst1_l, 7);
2812  out0 = PCKEV_XORI128_UB(dst0_r, dst1_r);
2813  out1 = PCKEV_XORI128_UB(dst2_r, dst3_r);
2814  ST8x4_UB(out0, out1, dst, dst_stride);
2815  out0 = PCKEV_XORI128_UB(dst0_l, dst1_l);
2816  ST4x4_UB(out0, out0, 0, 1, 2, 3, dst + 8, dst_stride);
2817  dst += (4 * dst_stride);
2818 
2819  src2 = src6;
2820  src10_r = src54_r;
2821  src21_r = src65_r;
2822  src2110 = src6554;
2823  }
2824 }
2825 
2826 static void common_vt_4t_16w_msa(uint8_t *src, int32_t src_stride,
2827  uint8_t *dst, int32_t dst_stride,
2828  const int8_t *filter, int32_t height)
2829 {
2830  uint32_t loop_cnt;
2831  v16i8 src0, src1, src2, src3, src4, src5, src6;
2832  v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r, src10_l;
2833  v16i8 src32_l, src54_l, src21_l, src43_l, src65_l, filt0, filt1;
2834  v16u8 tmp0, tmp1, tmp2, tmp3;
2835  v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
2836 
2837  src -= src_stride;
2838 
2839  filt = LD_SH(filter);
2840  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2841 
2842  LD_SB3(src, src_stride, src0, src1, src2);
2843  src += (3 * src_stride);
2844 
2845  XORI_B3_128_SB(src0, src1, src2);
2846  ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
2847  ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
2848 
2849  for (loop_cnt = (height >> 2); loop_cnt--;) {
2850  LD_SB4(src, src_stride, src3, src4, src5, src6);
2851  src += (4 * src_stride);
2852 
2853  XORI_B4_128_SB(src3, src4, src5, src6);
2854  ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
2855  src32_r, src43_r, src54_r, src65_r);
2856  ILVL_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
2857  src32_l, src43_l, src54_l, src65_l);
2858  out0_r = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
2859  out1_r = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
2860  out2_r = HEVC_FILT_4TAP_SH(src32_r, src54_r, filt0, filt1);
2861  out3_r = HEVC_FILT_4TAP_SH(src43_r, src65_r, filt0, filt1);
2862  out0_l = HEVC_FILT_4TAP_SH(src10_l, src32_l, filt0, filt1);
2863  out1_l = HEVC_FILT_4TAP_SH(src21_l, src43_l, filt0, filt1);
2864  out2_l = HEVC_FILT_4TAP_SH(src32_l, src54_l, filt0, filt1);
2865  out3_l = HEVC_FILT_4TAP_SH(src43_l, src65_l, filt0, filt1);
2866  SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
2867  SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 6);
2868  SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
2869  SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
2870  PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
2871  out3_r, tmp0, tmp1, tmp2, tmp3);
2872  XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
2873  ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
2874  dst += (4 * dst_stride);
2875 
2876  src10_r = src54_r;
2877  src21_r = src65_r;
2878  src10_l = src54_l;
2879  src21_l = src65_l;
2880  src2 = src6;
2881  }
2882 }
2883 
2884 static void common_vt_4t_24w_msa(uint8_t *src, int32_t src_stride,
2885  uint8_t *dst, int32_t dst_stride,
2886  const int8_t *filter, int32_t height)
2887 {
2888  uint32_t loop_cnt;
2889  uint64_t out0, out1;
2890  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
2891  v16i8 src11, filt0, filt1;
2892  v16i8 src10_r, src32_r, src76_r, src98_r, src21_r, src43_r, src87_r;
2893  v16i8 src109_r, src10_l, src32_l, src21_l, src43_l;
2894  v16u8 out;
2895  v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l;
2896 
2897  src -= src_stride;
2898 
2899  filt = LD_SH(filter);
2900  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2901 
2902  /* 16 width */
2903  LD_SB3(src, src_stride, src0, src1, src2);
2904  XORI_B3_128_SB(src0, src1, src2);
2905  ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
2906  ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
2907 
2908  /* 8 width */
2909  LD_SB3(src + 16, src_stride, src6, src7, src8);
2910  src += (3 * src_stride);
2911  XORI_B3_128_SB(src6, src7, src8);
2912  ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
2913 
2914  for (loop_cnt = 8; loop_cnt--;) {
2915  /* 16 width */
2916  LD_SB2(src, src_stride, src3, src4);
2917  XORI_B2_128_SB(src3, src4);
2918  ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
2919  ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
2920 
2921  /* 8 width */
2922  LD_SB2(src + 16, src_stride, src9, src10);
2923  src += (2 * src_stride);
2924  XORI_B2_128_SB(src9, src10);
2925  ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
2926 
2927  /* 16 width */
2928  out0_r = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
2929  out0_l = HEVC_FILT_4TAP_SH(src10_l, src32_l, filt0, filt1);
2930  out1_r = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
2931  out1_l = HEVC_FILT_4TAP_SH(src21_l, src43_l, filt0, filt1);
2932 
2933  /* 8 width */
2934  out2_r = HEVC_FILT_4TAP_SH(src76_r, src98_r, filt0, filt1);
2935  out3_r = HEVC_FILT_4TAP_SH(src87_r, src109_r, filt0, filt1);
2936 
2937  /* 16 + 8 width */
2938  SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
2939  SRARI_H2_SH(out0_l, out1_l, 6);
2940  SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
2941  SAT_SH2_SH(out0_l, out1_l, 7);
2942  out = PCKEV_XORI128_UB(out0_r, out0_l);
2943  ST_UB(out, dst);
2944  PCKEV_B2_SH(out2_r, out2_r, out3_r, out3_r, out2_r, out3_r);
2945  XORI_B2_128_SH(out2_r, out3_r);
2946  out0 = __msa_copy_u_d((v2i64) out2_r, 0);
2947  out1 = __msa_copy_u_d((v2i64) out3_r, 0);
2948  SD(out0, dst + 16);
2949  dst += dst_stride;
2950  out = PCKEV_XORI128_UB(out1_r, out1_l);
2951  ST_UB(out, dst);
2952  SD(out1, dst + 16);
2953  dst += dst_stride;
2954 
2955  /* 16 width */
2956  LD_SB2(src, src_stride, src5, src2);
2957  XORI_B2_128_SB(src5, src2);
2958  ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
2959  ILVL_B2_SB(src5, src4, src2, src5, src10_l, src21_l);
2960 
2961  /* 8 width */
2962  LD_SB2(src + 16, src_stride, src11, src8);
2963  src += (2 * src_stride);
2964  XORI_B2_128_SB(src11, src8);
2965  ILVR_B2_SB(src11, src10, src8, src11, src76_r, src87_r);
2966 
2967  /* 16 width */
2968  out0_r = HEVC_FILT_4TAP_SH(src32_r, src10_r, filt0, filt1);
2969  out0_l = HEVC_FILT_4TAP_SH(src32_l, src10_l, filt0, filt1);
2970  out1_r = HEVC_FILT_4TAP_SH(src43_r, src21_r, filt0, filt1);
2971  out1_l = HEVC_FILT_4TAP_SH(src43_l, src21_l, filt0, filt1);
2972 
2973  /* 8 width */
2974  out2_r = HEVC_FILT_4TAP_SH(src98_r, src76_r, filt0, filt1);
2975  out3_r = HEVC_FILT_4TAP_SH(src109_r, src87_r, filt0, filt1);
2976 
2977  /* 16 + 8 width */
2978  SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
2979  SRARI_H2_SH(out0_l, out1_l, 6);
2980  SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
2981  SAT_SH2_SH(out0_l, out1_l, 7);
2982  out = PCKEV_XORI128_UB(out0_r, out0_l);
2983  ST_UB(out, dst);
2984  out = PCKEV_XORI128_UB(out2_r, out2_r);
2985  ST8x1_UB(out, dst + 16);
2986  dst += dst_stride;
2987  out = PCKEV_XORI128_UB(out1_r, out1_l);
2988  ST_UB(out, dst);
2989  out = PCKEV_XORI128_UB(out3_r, out3_r);
2990  ST8x1_UB(out, dst + 16);
2991  dst += dst_stride;
2992  }
2993 }
2994 
2995 static void common_vt_4t_32w_msa(uint8_t *src, int32_t src_stride,
2996  uint8_t *dst, int32_t dst_stride,
2997  const int8_t *filter, int32_t height)
2998 {
2999  uint32_t loop_cnt;
3000  v16i8 src0, src1, src2, src3, src4, src6, src7, src8, src9, src10;
3001  v16i8 src10_r, src32_r, src76_r, src98_r;
3002  v16i8 src21_r, src43_r, src87_r, src109_r;
3003  v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
3004  v16i8 src10_l, src32_l, src76_l, src98_l;
3005  v16i8 src21_l, src43_l, src87_l, src109_l;
3006  v8i16 filt;
3007  v16i8 filt0, filt1;
3008  v16u8 out;
3009 
3010  src -= src_stride;
3011 
3012  filt = LD_SH(filter);
3013  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
3014 
3015  /* 16 width */
3016  LD_SB3(src, src_stride, src0, src1, src2);
3017  XORI_B3_128_SB(src0, src1, src2);
3018 
3019  ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3020  ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
3021 
3022  /* next 16 width */
3023  LD_SB3(src + 16, src_stride, src6, src7, src8);
3024  src += (3 * src_stride);
3025 
3026  XORI_B3_128_SB(src6, src7, src8);
3027  ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
3028  ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l);
3029 
3030  for (loop_cnt = (height >> 1); loop_cnt--;) {
3031  /* 16 width */
3032  LD_SB2(src, src_stride, src3, src4);
3033  XORI_B2_128_SB(src3, src4);
3034  ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
3035  ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
3036 
3037  /* 16 width */
3038  out0_r = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
3039  out0_l = HEVC_FILT_4TAP_SH(src10_l, src32_l, filt0, filt1);
3040  out1_r = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
3041  out1_l = HEVC_FILT_4TAP_SH(src21_l, src43_l, filt0, filt1);
3042 
3043  /* 16 width */
3044  SRARI_H4_SH(out0_r, out1_r, out0_l, out1_l, 6);
3045  SAT_SH4_SH(out0_r, out1_r, out0_l, out1_l, 7);
3046  out = PCKEV_XORI128_UB(out0_r, out0_l);
3047  ST_UB(out, dst);
3048  out = PCKEV_XORI128_UB(out1_r, out1_l);
3049  ST_UB(out, dst + dst_stride);
3050 
3051  src10_r = src32_r;
3052  src21_r = src43_r;
3053  src10_l = src32_l;
3054  src21_l = src43_l;
3055  src2 = src4;
3056 
3057  /* next 16 width */
3058  LD_SB2(src + 16, src_stride, src9, src10);
3059  src += (2 * src_stride);
3060  XORI_B2_128_SB(src9, src10);
3061  ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
3062  ILVL_B2_SB(src9, src8, src10, src9, src98_l, src109_l);
3063 
3064  /* next 16 width */
3065  out2_r = HEVC_FILT_4TAP_SH(src76_r, src98_r, filt0, filt1);
3066  out2_l = HEVC_FILT_4TAP_SH(src76_l, src98_l, filt0, filt1);
3067  out3_r = HEVC_FILT_4TAP_SH(src87_r, src109_r, filt0, filt1);
3068  out3_l = HEVC_FILT_4TAP_SH(src87_l, src109_l, filt0, filt1);
3069 
3070  /* next 16 width */
3071  SRARI_H4_SH(out2_r, out3_r, out2_l, out3_l, 6);
3072  SAT_SH4_SH(out2_r, out3_r, out2_l, out3_l, 7);
3073  out = PCKEV_XORI128_UB(out2_r, out2_l);
3074  ST_UB(out, dst + 16);
3075  out = PCKEV_XORI128_UB(out3_r, out3_l);
3076  ST_UB(out, dst + 16 + dst_stride);
3077 
3078  dst += 2 * dst_stride;
3079 
3080  src76_r = src98_r;
3081  src87_r = src109_r;
3082  src76_l = src98_l;
3083  src87_l = src109_l;
3084  src8 = src10;
3085  }
3086 }
3087 
3088 static void hevc_hv_uni_4t_4x2_msa(uint8_t *src,
3089  int32_t src_stride,
3090  uint8_t *dst,
3091  int32_t dst_stride,
3092  const int8_t *filter_x,
3093  const int8_t *filter_y)
3094 {
3095  v16u8 out;
3096  v16i8 src0, src1, src2, src3, src4;
3097  v8i16 filt0, filt1;
3098  v8i16 filt_h0, filt_h1;
3099  v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
3100  v16i8 mask1;
3101  v8i16 filter_vec, tmp;
3102  v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
3103  v8i16 dst20, dst31, dst42, dst10, dst32, dst21, dst43;
3104  v4i32 dst0, dst1;
3105 
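 /* 4x2 horizontal+vertical (hv) case: back up one row and one column so
  * the first taps of both 4-tap windows are covered. */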
3106  src -= (src_stride + 1);
3107 
3108  filter_vec = LD_SH(filter_x);
3109  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3110 
3111  filter_vec = LD_SH(filter_y);
3112  UNPCK_R_SB_SH(filter_vec, filter_vec);
3113 
3114  SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
3115 
3116  mask1 = mask0 + 2;
3117 
3118  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
3119  XORI_B5_128_SB(src0, src1, src2, src3, src4);
3120 
3121  VSHF_B2_SB(src0, src2, src0, src2, mask0, mask1, vec0, vec1);
3122  VSHF_B2_SB(src1, src3, src1, src3, mask0, mask1, vec2, vec3);
3123  VSHF_B2_SB(src2, src4, src2, src4, mask0, mask1, vec4, vec5);
3124 
3125  dst20 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3126  dst31 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3127  dst42 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
3128 
3129  ILVRL_H2_SH(dst31, dst20, dst10, dst32);
3130  ILVRL_H2_SH(dst42, dst31, dst21, dst43);
3131 
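 /* Second-stage vertical 4-tap on the 16-bit horizontal results: drop
  * the first-stage headroom (>> 6), then round (SRARI by 6), saturate
  * and pack back to unsigned bytes. */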
3132  dst0 = HEVC_FILT_4TAP(dst10, dst32, filt_h0, filt_h1);
3133  dst1 = HEVC_FILT_4TAP(dst21, dst43, filt_h0, filt_h1);
3134  dst0 >>= 6;
3135  dst1 >>= 6;
3136  tmp = __msa_pckev_h((v8i16) dst1, (v8i16) dst0);
3137  tmp = __msa_srari_h(tmp, 6);
3138  tmp = __msa_sat_s_h(tmp, 7);
3139  out = PCKEV_XORI128_UB(tmp, tmp);
3140  ST4x2_UB(out, dst, dst_stride);
3141 }
3142 
3143 static void hevc_hv_uni_4t_4x4_msa(uint8_t *src,
3144  int32_t src_stride,
3145  uint8_t *dst,
3146  int32_t dst_stride,
3147  const int8_t *filter_x,
3148  const int8_t *filter_y)
3149 {
3150  v16u8 out;
3151  v16i8 src0, src1, src2, src3, src4, src5, src6;
3152  v8i16 filt0, filt1;
3153  v8i16 filt_h0, filt_h1;
3154  v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
3155  v16i8 mask1;
3156  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
3157  v8i16 filter_vec, tmp0, tmp1;
3158  v8i16 dst30, dst41, dst52, dst63;
3159  v8i16 dst10, dst32, dst54, dst21, dst43, dst65;
3160  v4i32 dst0, dst1, dst2, dst3;
3161 
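 /* 4x4 hv case: seven input rows run through both 4-tap stages to
  * produce four output rows. */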
3162  src -= (src_stride + 1);
3163 
3164  filter_vec = LD_SH(filter_x);
3165  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3166 
3167  filter_vec = LD_SH(filter_y);
3168  UNPCK_R_SB_SH(filter_vec, filter_vec);
3169 
3170  SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
3171 
3172  mask1 = mask0 + 2;
3173 
3174  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
3175  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
3176 
3177  VSHF_B2_SB(src0, src3, src0, src3, mask0, mask1, vec0, vec1);
3178  VSHF_B2_SB(src1, src4, src1, src4, mask0, mask1, vec2, vec3);
3179  VSHF_B2_SB(src2, src5, src2, src5, mask0, mask1, vec4, vec5);
3180  VSHF_B2_SB(src3, src6, src3, src6, mask0, mask1, vec6, vec7);
3181 
3182  dst30 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3183  dst41 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3184  dst52 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
3185  dst63 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
3186 
3187  ILVRL_H2_SH(dst41, dst30, dst10, dst43);
3188  ILVRL_H2_SH(dst52, dst41, dst21, dst54);
3189  ILVRL_H2_SH(dst63, dst52, dst32, dst65);
3190  dst0 = HEVC_FILT_4TAP(dst10, dst32, filt_h0, filt_h1);
3191  dst1 = HEVC_FILT_4TAP(dst21, dst43, filt_h0, filt_h1);
3192  dst2 = HEVC_FILT_4TAP(dst32, dst54, filt_h0, filt_h1);
3193  dst3 = HEVC_FILT_4TAP(dst43, dst65, filt_h0, filt_h1);
3194  SRA_4V(dst0, dst1, dst2, dst3, 6);
3195  PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp0, tmp1);
3196  SRARI_H2_SH(tmp0, tmp1, 6);
3197  SAT_SH2_SH(tmp0, tmp1, 7);
3198  out = PCKEV_XORI128_UB(tmp0, tmp1);
3199  ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
3200 }
3201 
3202 static void hevc_hv_uni_4t_4multx8mult_msa(uint8_t *src,
3203  int32_t src_stride,
3204  uint8_t *dst,
3205  int32_t dst_stride,
3206  const int8_t *filter_x,
3207  const int8_t *filter_y,
3208  int32_t height)
3209 {
3210  uint32_t loop_cnt;
3211  v16u8 out0, out1;
3212  v16i8 src0, src1, src2, src3, src4, src5;
3213  v16i8 src6, src7, src8, src9, src10;
3214  v8i16 filt0, filt1;
3215  v8i16 filt_h0, filt_h1;
3216  v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
3217  v16i8 mask1;
3218  v8i16 filter_vec, tmp0, tmp1, tmp2, tmp3;
3219  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
3220  v8i16 dst10, dst21, dst22, dst73, dst84, dst95, dst106;
3221  v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r, dst6_r, dst7_r;
3222  v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
3223  v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
3224  v8i16 dst98_r, dst109_r;
3225 
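 /* 4-wide hv case for heights that are a multiple of 8: three rows of
  * context are preloaded, then each loop pass consumes eight new rows
  * and stores eight output rows. */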
3226  src -= (src_stride + 1);
3227 
3228  filter_vec = LD_SH(filter_x);
3229  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3230 
3231  filter_vec = LD_SH(filter_y);
3232  UNPCK_R_SB_SH(filter_vec, filter_vec);
3233 
3234  SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
3235 
3236  mask1 = mask0 + 2;
3237 
3238  LD_SB3(src, src_stride, src0, src1, src2);
3239  src += (3 * src_stride);
3240 
3241  XORI_B3_128_SB(src0, src1, src2);
3242 
3243  VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
3244  VSHF_B2_SB(src1, src2, src1, src2, mask0, mask1, vec2, vec3);
3245  dst10 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3246  dst21 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3247  ILVRL_H2_SH(dst21, dst10, dst10_r, dst21_r);
3248  dst22 = (v8i16) __msa_splati_d((v2i64) dst21, 1);
3249 
3250  for (loop_cnt = height >> 3; loop_cnt--;) {
3251  LD_SB8(src, src_stride,
3252  src3, src4, src5, src6, src7, src8, src9, src10);
3253  src += (8 * src_stride);
3254 
3255  XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);
3256 
3257  VSHF_B2_SB(src3, src7, src3, src7, mask0, mask1, vec0, vec1);
3258  VSHF_B2_SB(src4, src8, src4, src8, mask0, mask1, vec2, vec3);
3259  VSHF_B2_SB(src5, src9, src5, src9, mask0, mask1, vec4, vec5);
3260  VSHF_B2_SB(src6, src10, src6, src10, mask0, mask1, vec6, vec7);
3261 
3262  dst73 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3263  dst84 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3264  dst95 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
3265  dst106 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
3266 
3267  dst32_r = __msa_ilvr_h(dst73, dst22);
3268  ILVRL_H2_SH(dst84, dst73, dst43_r, dst87_r);
3269  ILVRL_H2_SH(dst95, dst84, dst54_r, dst98_r);
3270  ILVRL_H2_SH(dst106, dst95, dst65_r, dst109_r);
3271  dst22 = (v8i16) __msa_splati_d((v2i64) dst73, 1);
3272  dst76_r = __msa_ilvr_h(dst22, dst106);
3273 
3274  dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
3275  dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
3276  dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
3277  dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
3278  dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
3279  dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
3280  dst6_r = HEVC_FILT_4TAP(dst76_r, dst98_r, filt_h0, filt_h1);
3281  dst7_r = HEVC_FILT_4TAP(dst87_r, dst109_r, filt_h0, filt_h1);
3282  SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
3283  SRA_4V(dst4_r, dst5_r, dst6_r, dst7_r, 6);
3284  PCKEV_H4_SH(dst1_r, dst0_r, dst3_r, dst2_r,
3285  dst5_r, dst4_r, dst7_r, dst6_r,
3286  tmp0, tmp1, tmp2, tmp3);
3287  SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 6);
3288  SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
3289  out0 = PCKEV_XORI128_UB(tmp0, tmp1);
3290  out1 = PCKEV_XORI128_UB(tmp2, tmp3);
3291  ST4x8_UB(out0, out1, dst, dst_stride);
3292  dst += (8 * dst_stride);
3293 
3294  dst10_r = dst98_r;
3295  dst21_r = dst109_r;
3296  dst22 = (v8i16) __msa_splati_d((v2i64) dst106, 1);
3297  }
3298 }
3299 
3300 static void hevc_hv_uni_4t_4w_msa(uint8_t *src,
3301  int32_t src_stride,
3302  uint8_t *dst,
3303  int32_t dst_stride,
3304  const int8_t *filter_x,
3305  const int8_t *filter_y,
3306  int32_t height)
3307 {
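 /* Dispatch on block height; only 2, 4 and multiples of 8 are handled,
  * which presumably covers every 4-wide chroma size the decoder emits. */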
3308  if (2 == height) {
3309  hevc_hv_uni_4t_4x2_msa(src, src_stride, dst, dst_stride,
3310  filter_x, filter_y);
3311  } else if (4 == height) {
3312  hevc_hv_uni_4t_4x4_msa(src, src_stride, dst, dst_stride,
3313  filter_x, filter_y);
3314  } else if (0 == (height % 8)) {
3315  hevc_hv_uni_4t_4multx8mult_msa(src, src_stride, dst, dst_stride,
3316  filter_x, filter_y, height);
3317  }
3318 }
3319 
3320 static void hevc_hv_uni_4t_6w_msa(uint8_t *src,
3321  int32_t src_stride,
3322  uint8_t *dst,
3323  int32_t dst_stride,
3324  const int8_t *filter_x,
3325  const int8_t *filter_y,
3326  int32_t height)
3327 {
3328  v16u8 out0, out1, out2;
3329  v16i8 src0, src1, src2, src3, src4, src5, src6;
3330  v16i8 src7, src8, src9, src10;
3331  v8i16 filt0, filt1;
3332  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
3333  v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
3334  v16i8 mask1;
3335  v8i16 filt_h0, filt_h1, filter_vec;
3336  v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6, dsth7, dsth8, dsth9;
3337  v8i16 dsth10, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
3338  v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
3339  v4i32 dst4_r, dst5_r, dst6_r, dst7_r;
3340  v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
3341  v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
3342  v8i16 dst54_r, dst76_r, dst98_r, dst65_r, dst87_r, dst109_r;
3343  v8i16 dst98_l, dst65_l, dst54_l, dst76_l, dst87_l, dst109_l;
3344  v8i16 dst1021_l, dst3243_l, dst5465_l, dst7687_l, dst98109_l;
3345 
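 /* 6-wide hv case, computed as an 8-wide block: the packed _r results
  * give columns 0..3 (ST4x8) and the _l results supply columns 4..5
  * via the two ST2x4 stores. */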
3346  src -= (src_stride + 1);
3347 
3348  filter_vec = LD_SH(filter_x);
3349  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3350 
3351  filter_vec = LD_SH(filter_y);
3352  UNPCK_R_SB_SH(filter_vec, filter_vec);
3353 
3354  SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
3355 
3356  mask1 = mask0 + 2;
3357 
3358  LD_SB3(src, src_stride, src0, src1, src2);
3359  src += (3 * src_stride);
3360 
3361  XORI_B3_128_SB(src0, src1, src2);
3362 
3363  VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3364  VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
3365  VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
3366 
3367  dsth0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3368  dsth1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3369  dsth2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
3370 
3371  ILVRL_H2_SH(dsth1, dsth0, dst10_r, dst10_l);
3372  ILVRL_H2_SH(dsth2, dsth1, dst21_r, dst21_l);
3373 
3374  LD_SB8(src, src_stride, src3, src4, src5, src6, src7, src8, src9, src10);
3375  XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);
3376 
3377  VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3378  VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
3379  VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
3380  VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
3381 
3382  dsth3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3383  dsth4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3384  dsth5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
3385  dsth6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
3386 
3387  VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
3388  VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec2, vec3);
3389  VSHF_B2_SB(src9, src9, src9, src9, mask0, mask1, vec4, vec5);
3390  VSHF_B2_SB(src10, src10, src10, src10, mask0, mask1, vec6, vec7);
3391 
3392  dsth7 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3393  dsth8 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3394  dsth9 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
3395  dsth10 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
3396 
3397  ILVRL_H2_SH(dsth3, dsth2, dst32_r, dst32_l);
3398  ILVRL_H2_SH(dsth4, dsth3, dst43_r, dst43_l);
3399  ILVRL_H2_SH(dsth5, dsth4, dst54_r, dst54_l);
3400  ILVRL_H2_SH(dsth6, dsth5, dst65_r, dst65_l);
3401  ILVRL_H2_SH(dsth7, dsth6, dst76_r, dst76_l);
3402  ILVRL_H2_SH(dsth8, dsth7, dst87_r, dst87_l);
3403  ILVRL_H2_SH(dsth9, dsth8, dst98_r, dst98_l);
3404  ILVRL_H2_SH(dsth10, dsth9, dst109_r, dst109_l);
3405 
3406  PCKEV_D2_SH(dst21_l, dst10_l, dst43_l, dst32_l, dst1021_l, dst3243_l);
3407  PCKEV_D2_SH(dst65_l, dst54_l, dst87_l, dst76_l, dst5465_l, dst7687_l);
3408  dst98109_l = (v8i16) __msa_pckev_d((v2i64) dst109_l, (v2i64) dst98_l);
3409 
3410  dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
3411  dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
3412  dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
3413  dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
3414  dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
3415  dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
3416  dst6_r = HEVC_FILT_4TAP(dst76_r, dst98_r, filt_h0, filt_h1);
3417  dst7_r = HEVC_FILT_4TAP(dst87_r, dst109_r, filt_h0, filt_h1);
3418  dst0_l = HEVC_FILT_4TAP(dst1021_l, dst3243_l, filt_h0, filt_h1);
3419  dst1_l = HEVC_FILT_4TAP(dst3243_l, dst5465_l, filt_h0, filt_h1);
3420  dst2_l = HEVC_FILT_4TAP(dst5465_l, dst7687_l, filt_h0, filt_h1);
3421  dst3_l = HEVC_FILT_4TAP(dst7687_l, dst98109_l, filt_h0, filt_h1);
3422  SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
3423  SRA_4V(dst4_r, dst5_r, dst6_r, dst7_r, 6);
3424  SRA_4V(dst0_l, dst1_l, dst2_l, dst3_l, 6);
3425  PCKEV_H2_SH(dst1_r, dst0_r, dst3_r, dst2_r, tmp0, tmp1);
3426  PCKEV_H2_SH(dst5_r, dst4_r, dst7_r, dst6_r, tmp2, tmp3);
3427  PCKEV_H2_SH(dst1_l, dst0_l, dst3_l, dst2_l, tmp4, tmp5);
3428  SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 6);
3429  SRARI_H2_SH(tmp4, tmp5, 6);
3430  SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
3431  SAT_SH2_SH(tmp4, tmp5, 7);
3432  out0 = PCKEV_XORI128_UB(tmp0, tmp1);
3433  out1 = PCKEV_XORI128_UB(tmp2, tmp3);
3434  out2 = PCKEV_XORI128_UB(tmp4, tmp5);
3435  ST4x8_UB(out0, out1, dst, dst_stride);
3436  ST2x4_UB(out2, 0, dst + 4, dst_stride);
3437  dst += 4 * dst_stride;
3438  ST2x4_UB(out2, 4, dst + 4, dst_stride);
3439 }
3440 
3441 static void hevc_hv_uni_4t_8x2_msa(uint8_t *src,
3442  int32_t src_stride,
3443  uint8_t *dst,
3444  int32_t dst_stride,
3445  const int8_t *filter_x,
3446  const int8_t *filter_y)
3447 {
3448  v16u8 out;
3449  v16i8 src0, src1, src2, src3, src4;
3450  v8i16 filt0, filt1;
3451  v8i16 filt_h0, filt_h1, filter_vec;
3452  v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
3453  v16i8 mask1;
3454  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
3455  v8i16 dst0, dst1, dst2, dst3, dst4;
3456  v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
3457  v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
3458  v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
3459  v8i16 out0_r, out1_r;
3460 
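 /* 8x2 hv case: five input rows yield two 8-pixel output rows. */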
3461  src -= (src_stride + 1);
3462 
3463  filter_vec = LD_SH(filter_x);
3464  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3465 
3466  filter_vec = LD_SH(filter_y);
3467  UNPCK_R_SB_SH(filter_vec, filter_vec);
3468 
3469  SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
3470 
3471  mask1 = mask0 + 2;
3472 
3473  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
3474  XORI_B5_128_SB(src0, src1, src2, src3, src4);
3475 
3476  VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3477  VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
3478  VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
3479  VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
3480  VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec8, vec9);
3481 
3482  dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3483  dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3484  dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
3485  dst3 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
3486  dst4 = HEVC_FILT_4TAP_SH(vec8, vec9, filt0, filt1);
3487  ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
3488  ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
3489  ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
3490  ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
3491  dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
3492  dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
3493  dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
3494  dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
3495  SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
3496  PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, out0_r, out1_r);
3497  SRARI_H2_SH(out0_r, out1_r, 6);
3498  SAT_SH2_SH(out0_r, out1_r, 7);
3499  out = PCKEV_XORI128_UB(out0_r, out1_r);
3500  ST8x2_UB(out, dst, dst_stride);
3501 }
3502 
3503 static void hevc_hv_uni_4t_8multx4_msa(uint8_t *src,
3504  int32_t src_stride,
3505  uint8_t *dst,
3506  int32_t dst_stride,
3507  const int8_t *filter_x,
3508  const int8_t *filter_y,
3509  int32_t width8mult)
3510 {
3511  uint32_t cnt;
3512  v16u8 out0, out1;
3513  v16i8 src0, src1, src2, src3, src4, src5, src6, mask0, mask1;
3514  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
3515  v8i16 filt0, filt1, filt_h0, filt_h1, filter_vec;
3516  v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, tmp0, tmp1, tmp2, tmp3;
3517  v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
3518  v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
3519  v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;
3520 
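 /* hv filter for width = 8 * width8mult at a fixed height of 4; each
  * loop pass handles one 8-pixel-wide column. */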
3521  src -= (src_stride + 1);
3522 
3523  filter_vec = LD_SH(filter_x);
3524  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3525 
3526  filter_vec = LD_SH(filter_y);
3527  UNPCK_R_SB_SH(filter_vec, filter_vec);
3528 
3529  SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
3530 
3531  mask0 = LD_SB(ff_hevc_mask_arr);
3532  mask1 = mask0 + 2;
3533 
3534  for (cnt = width8mult; cnt--;) {
3535  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
3536  src += 8;
3537  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
3538 
3539  VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3540  VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
3541  VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
3542 
3543  dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3544  dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3545  dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
3546 
3547  ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
3548  ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
3549 
3550  VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3551  VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
3552  VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
3553  VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
3554 
3555  dst3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3556  dst4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3557  dst5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
3558  dst6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
3559 
3560  ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
3561  ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
3562  ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l);
3563  ILVRL_H2_SH(dst6, dst5, dst65_r, dst65_l);
3564 
3565  dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
3566  dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
3567  dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
3568  dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
3569  dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
3570  dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
3571  dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
3572  dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
3573 
3574  SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
3575  SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
3576 
3577  PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l,
3578  dst3_r, tmp0, tmp1, tmp2, tmp3);
3579  SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 6);
3580  SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
3581  out0 = PCKEV_XORI128_UB(tmp0, tmp1);
3582  out1 = PCKEV_XORI128_UB(tmp2, tmp3);
3583  ST8x4_UB(out0, out1, dst, dst_stride);
3584  dst += 8;
3585  }
3586 }
3587 
3588 static void hevc_hv_uni_4t_8x6_msa(uint8_t *src,
3589  int32_t src_stride,
3590  uint8_t *dst,
3591  int32_t dst_stride,
3592  const int8_t *filter_x,
3593  const int8_t *filter_y)
3594 {
3595  v16u8 out0, out1, out2;
3596  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
3597  v8i16 filt0, filt1;
3598  v8i16 filt_h0, filt_h1, filter_vec;
3599  v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
3600  v16i8 mask1;
3601  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
3602  v16i8 vec10, vec11, vec12, vec13, vec14, vec15, vec16, vec17;
3603  v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
3604  v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
3605  v4i32 dst4_r, dst4_l, dst5_r, dst5_l;
3606  v8i16 dst10_r, dst32_r, dst10_l, dst32_l;
3607  v8i16 dst21_r, dst43_r, dst21_l, dst43_l;
3608  v8i16 dst54_r, dst54_l, dst65_r, dst65_l;
3609  v8i16 dst76_r, dst76_l, dst87_r, dst87_l;
3610  v8i16 out0_r, out1_r, out2_r, out3_r, out4_r, out5_r;
3611 
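 /* 8x6 hv case: nine input rows, fully unrolled, produce six rows. */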
3612  src -= (src_stride + 1);
3613 
3614  filter_vec = LD_SH(filter_x);
3615  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3616 
3617  filter_vec = LD_SH(filter_y);
3618  UNPCK_R_SB_SH(filter_vec, filter_vec);
3619 
3620  SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
3621 
3622  mask1 = mask0 + 2;
3623 
3624  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
3625  src += (5 * src_stride);
3626  LD_SB4(src, src_stride, src5, src6, src7, src8);
3627 
3628  XORI_B5_128_SB(src0, src1, src2, src3, src4);
3629  XORI_B4_128_SB(src5, src6, src7, src8);
3630 
3631  VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3632  VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
3633  VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
3634  VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
3635  VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec8, vec9);
3636  VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec10, vec11);
3637  VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec12, vec13);
3638  VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec14, vec15);
3639  VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec16, vec17);
3640 
3641  dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3642  dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3643  dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
3644  dst3 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
3645  dst4 = HEVC_FILT_4TAP_SH(vec8, vec9, filt0, filt1);
3646  dst5 = HEVC_FILT_4TAP_SH(vec10, vec11, filt0, filt1);
3647  dst6 = HEVC_FILT_4TAP_SH(vec12, vec13, filt0, filt1);
3648  dst7 = HEVC_FILT_4TAP_SH(vec14, vec15, filt0, filt1);
3649  dst8 = HEVC_FILT_4TAP_SH(vec16, vec17, filt0, filt1);
3650 
3651  ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
3652  ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
3653  ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
3654  ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
3655  ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l);
3656  ILVRL_H2_SH(dst6, dst5, dst65_r, dst65_l);
3657  ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l);
3658  ILVRL_H2_SH(dst8, dst7, dst87_r, dst87_l);
3659 
3660  dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
3661  dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
3662  dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
3663  dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
3664  dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
3665  dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
3666  dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
3667  dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
3668  dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
3669  dst4_l = HEVC_FILT_4TAP(dst54_l, dst76_l, filt_h0, filt_h1);
3670  dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
3671  dst5_l = HEVC_FILT_4TAP(dst65_l, dst87_l, filt_h0, filt_h1);
3672 
3673  SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
3674  SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
3675  SRA_4V(dst4_r, dst4_l, dst5_r, dst5_l, 6);
3676  PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r,
3677  dst2_l, dst2_r, dst3_l, dst3_r, out0_r, out1_r, out2_r, out3_r);
3678  PCKEV_H2_SH(dst4_l, dst4_r, dst5_l, dst5_r, out4_r, out5_r);
3679  SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
3680  SRARI_H2_SH(out4_r, out5_r, 6);
3681  SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
3682  SAT_SH2_SH(out4_r, out5_r, 7);
3683  out0 = PCKEV_XORI128_UB(out0_r, out1_r);
3684  out1 = PCKEV_XORI128_UB(out2_r, out3_r);
3685  out2 = PCKEV_XORI128_UB(out4_r, out5_r);
3686 
3687  ST8x4_UB(out0, out1, dst, dst_stride);
3688  dst += (4 * dst_stride);
3689  ST8x2_UB(out2, dst, dst_stride);
3690 }
3691 
3692 static void hevc_hv_uni_4t_8multx4mult_msa(uint8_t *src,
3693  int32_t src_stride,
3694  uint8_t *dst,
3695  int32_t dst_stride,
3696  const int8_t *filter_x,
3697  const int8_t *filter_y,
3698  int32_t height,
3699  int32_t width8mult)
3700 {
3701  uint32_t loop_cnt, cnt;
3702  uint8_t *src_tmp;
3703  uint8_t *dst_tmp;
3704  v16u8 out0, out1;
3705  v16i8 src0, src1, src2, src3, src4, src5, src6;
3706  v8i16 filt0, filt1;
3707  v8i16 filt_h0, filt_h1, filter_vec;
3708  v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
3709  v16i8 mask1;
3710  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
3711  v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
3712  v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
3713  v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
3714  v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
3715  v8i16 dst54_r, dst54_l, dst65_r, dst65_l, dst6;
3716  v8i16 out0_r, out1_r, out2_r, out3_r;
3717 
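 /* General hv case for width = 8 * width8mult and heights that are a
  * multiple of 4: the outer loop walks 8-pixel columns while the inner
  * loop emits four rows per pass, recycling three rows of vertical
  * context between passes. */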
3718  src -= (src_stride + 1);
3719 
3720  filter_vec = LD_SH(filter_x);
3721  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3722 
3723  filter_vec = LD_SH(filter_y);
3724  UNPCK_R_SB_SH(filter_vec, filter_vec);
3725 
3726  SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
3727 
3728  mask1 = mask0 + 2;
3729 
3730  for (cnt = width8mult; cnt--;) {
3731  src_tmp = src;
3732  dst_tmp = dst;
3733 
3734  LD_SB3(src_tmp, src_stride, src0, src1, src2);
3735  src_tmp += (3 * src_stride);
3736 
3737  XORI_B3_128_SB(src0, src1, src2);
3738 
3739  VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3740  VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
3741  VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
3742 
3743  dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3744  dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3745  dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
3746 
3747  ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
3748  ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
3749 
3750  for (loop_cnt = (height >> 2); loop_cnt--;) {
3751  LD_SB4(src_tmp, src_stride, src3, src4, src5, src6);
3752  src_tmp += (4 * src_stride);
3753 
3754  XORI_B4_128_SB(src3, src4, src5, src6);
3755 
3756  VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3757  VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
3758  VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
3759  VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
3760 
3761  dst3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3762  dst4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3763  dst5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
3764  dst6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
3765 
3766  ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
3767  ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
3768  ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l);
3769  ILVRL_H2_SH(dst6, dst5, dst65_r, dst65_l);
3770 
3771  dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
3772  dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
3773  dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
3774  dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
3775  dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
3776  dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
3777  dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
3778  dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
3779 
3780  SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
3781  SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
3782 
3783  PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r,
3784  dst2_l, dst2_r, dst3_l, dst3_r,
3785  out0_r, out1_r, out2_r, out3_r);
3786 
3787  SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
3788  SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
3789  out0 = PCKEV_XORI128_UB(out0_r, out1_r);
3790  out1 = PCKEV_XORI128_UB(out2_r, out3_r);
3791  ST8x4_UB(out0, out1, dst_tmp, dst_stride);
3792  dst_tmp += (4 * dst_stride);
3793 
3794  dst10_r = dst54_r;
3795  dst10_l = dst54_l;
3796  dst21_r = dst65_r;
3797  dst21_l = dst65_l;
3798  dst2 = dst6;
3799  }
3800 
3801  src += 8;
3802  dst += 8;
3803  }
3804 }
3805 
3806 static void hevc_hv_uni_4t_8w_msa(uint8_t *src,
3807  int32_t src_stride,
3808  uint8_t *dst,
3809  int32_t dst_stride,
3810  const int8_t *filter_x,
3811  const int8_t *filter_y,
3812  int32_t height)
3813 {
3814  if (2 == height) {
3815  hevc_hv_uni_4t_8x2_msa(src, src_stride, dst, dst_stride,
3816  filter_x, filter_y);
3817  } else if (4 == height) {
3818  hevc_hv_uni_4t_8multx4_msa(src, src_stride, dst, dst_stride,
3819  filter_x, filter_y, 1);
3820  } else if (6 == height) {
3821  hevc_hv_uni_4t_8x6_msa(src, src_stride, dst, dst_stride,
3822  filter_x, filter_y);
3823  } else if (0 == (height % 4)) {
3824  hevc_hv_uni_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
3825  filter_x, filter_y, height, 1);
3826  }
3827 }
3828 
3829 static void hevc_hv_uni_4t_12w_msa(uint8_t *src,
3830  int32_t src_stride,
3831  uint8_t *dst,
3832  int32_t dst_stride,
3833  const int8_t *filter_x,
3834  const int8_t *filter_y,
3835  int32_t height)
3836 {
3837  uint32_t loop_cnt;
3838  uint8_t *src_tmp, *dst_tmp;
3839  v16u8 out0, out1;
3840  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
3841  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
3842  v16i8 mask0, mask1, mask2, mask3;
3843  v8i16 filt0, filt1, filt_h0, filt_h1, filter_vec, tmp0, tmp1, tmp2, tmp3;
3844  v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6;
3845  v8i16 dst10, dst21, dst22, dst73, dst84, dst95, dst106;
3846  v8i16 dst76_r, dst98_r, dst87_r, dst109_r;
3847  v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
3848  v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;
3849  v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
3850  v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
3851 
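 /* 12-wide hv case with fixed loop counts covering 16 rows: the left
  * 8-wide column is filtered first (four rows per pass), then the
  * remaining 4-wide column reuses the 4-width shuffle masks with eight
  * rows per pass. */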
3852  src -= (src_stride + 1);
3853 
3854  filter_vec = LD_SH(filter_x);
3855  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3856 
3857  filter_vec = LD_SH(filter_y);
3858  UNPCK_R_SB_SH(filter_vec, filter_vec);
3859 
3860  SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
3861 
3862  mask0 = LD_SB(ff_hevc_mask_arr);
3863  mask1 = mask0 + 2;
3864 
3865  src_tmp = src;
3866  dst_tmp = dst;
3867 
3868  LD_SB3(src_tmp, src_stride, src0, src1, src2);
3869  src_tmp += (3 * src_stride);
3870 
3871  XORI_B3_128_SB(src0, src1, src2);
3872 
3873  VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3874  VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
3875  VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
3876 
3877  dsth0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3878  dsth1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3879  dsth2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
3880 
3881  ILVRL_H2_SH(dsth1, dsth0, dst10_r, dst10_l);
3882  ILVRL_H2_SH(dsth2, dsth1, dst21_r, dst21_l);
3883 
3884  for (loop_cnt = 4; loop_cnt--;) {
3885  LD_SB4(src_tmp, src_stride, src3, src4, src5, src6);
3886  src_tmp += (4 * src_stride);
3887  XORI_B4_128_SB(src3, src4, src5, src6);
3888 
3889  VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3890  VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
3891  VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
3892  VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
3893 
3894  dsth3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3895  dsth4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3896  dsth5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
3897  dsth6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
3898 
3899  ILVRL_H2_SH(dsth3, dsth2, dst32_r, dst32_l);
3900  ILVRL_H2_SH(dsth4, dsth3, dst43_r, dst43_l);
3901  ILVRL_H2_SH(dsth5, dsth4, dst54_r, dst54_l);
3902  ILVRL_H2_SH(dsth6, dsth5, dst65_r, dst65_l);
3903 
3904  dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
3905  dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
3906  dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
3907  dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
3908  dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
3909  dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
3910  dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
3911  dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
3912 
3913  SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
3914  SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
3915 
3916  PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l,
3917  dst3_r, tmp0, tmp1, tmp2, tmp3);
3918  SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 6);
3919  SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
3920  out0 = PCKEV_XORI128_UB(tmp0, tmp1);
3921  out1 = PCKEV_XORI128_UB(tmp2, tmp3);
3922  ST8x4_UB(out0, out1, dst_tmp, dst_stride);
3923  dst_tmp += (4 * dst_stride);
3924 
3925  dst10_r = dst54_r;
3926  dst10_l = dst54_l;
3927  dst21_r = dst65_r;
3928  dst21_l = dst65_l;
3929  dsth2 = dsth6;
3930  }
3931 
3932  src += 8;
3933  dst += 8;
3934 
3935  mask2 = LD_SB(ff_hevc_mask_arr + 16);
3936  mask3 = mask2 + 2;
3937 
3938  LD_SB3(src, src_stride, src0, src1, src2);
3939  src += (3 * src_stride);
3940  XORI_B3_128_SB(src0, src1, src2);
3941  VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);
3942  VSHF_B2_SB(src1, src2, src1, src2, mask2, mask3, vec2, vec3);
3943 
3944  dst10 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3945  dst21 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3946 
3947  ILVRL_H2_SH(dst21, dst10, dst10_r, dst21_r);
3948  dst22 = (v8i16) __msa_splati_d((v2i64) dst21, 1);
3949 
3950  for (loop_cnt = 2; loop_cnt--;) {
3951  LD_SB8(src, src_stride,
3952  src3, src4, src5, src6, src7, src8, src9, src10);
3953  src += (8 * src_stride);
3954  XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);
3955  VSHF_B2_SB(src3, src7, src3, src7, mask2, mask3, vec0, vec1);
3956  VSHF_B2_SB(src4, src8, src4, src8, mask2, mask3, vec2, vec3);
3957  VSHF_B2_SB(src5, src9, src5, src9, mask2, mask3, vec4, vec5);
3958  VSHF_B2_SB(src6, src10, src6, src10, mask2, mask3, vec6, vec7);
3959 
3960  dst73 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3961  dst84 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3962  dst95 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
3963  dst106 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
3964 
3965  dst32_r = __msa_ilvr_h(dst73, dst22);
3966  ILVRL_H2_SH(dst84, dst73, dst43_r, dst87_r);
3967  ILVRL_H2_SH(dst95, dst84, dst54_r, dst98_r);
3968  ILVRL_H2_SH(dst106, dst95, dst65_r, dst109_r);
3969  dst22 = (v8i16) __msa_splati_d((v2i64) dst73, 1);
3970  dst76_r = __msa_ilvr_h(dst22, dst106);
3971 
3972  dst0 = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
3973  dst1 = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
3974  dst2 = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
3975  dst3 = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
3976  dst4 = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
3977  dst5 = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
3978  dst6 = HEVC_FILT_4TAP(dst76_r, dst98_r, filt_h0, filt_h1);
3979  dst7 = HEVC_FILT_4TAP(dst87_r, dst109_r, filt_h0, filt_h1);
3980  SRA_4V(dst0, dst1, dst2, dst3, 6);
3981  SRA_4V(dst4, dst5, dst6, dst7, 6);
3982  PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
3983  tmp0, tmp1, tmp2, tmp3);
3984  SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 6);
3985  SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
3986  out0 = PCKEV_XORI128_UB(tmp0, tmp1);
3987  out1 = PCKEV_XORI128_UB(tmp2, tmp3);
3988  ST4x8_UB(out0, out1, dst, dst_stride);
3989  dst += (8 * dst_stride);
3990 
3991  dst10_r = dst98_r;
3992  dst21_r = dst109_r;
3993  dst22 = (v8i16) __msa_splati_d((v2i64) dst106, 1);
3994  }
3995 }
3996 
3997 static void hevc_hv_uni_4t_16w_msa(uint8_t *src,
3998  int32_t src_stride,
3999  uint8_t *dst,
4000  int32_t dst_stride,
4001  const int8_t *filter_x,
4002  const int8_t *filter_y,
4003  int32_t height)
4004 {
4005  if (4 == height) {
4006  hevc_hv_uni_4t_8multx4_msa(src, src_stride, dst, dst_stride, filter_x,
4007  filter_y, 2);
4008  } else {
4009  hevc_hv_uni_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
4010  filter_x, filter_y, height, 2);
4011  }
4012 }
4013 
4014 static void hevc_hv_uni_4t_24w_msa(uint8_t *src,
4015  int32_t src_stride,
4016  uint8_t *dst,
4017  int32_t dst_stride,
4018  const int8_t *filter_x,
4019  const int8_t *filter_y,
4020  int32_t height)
4021 {
4022  hevc_hv_uni_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
4023  filter_x, filter_y, height, 3);
4024 }
4025 
4026 static void hevc_hv_uni_4t_32w_msa(uint8_t *src,
4027  int32_t src_stride,
4028  uint8_t *dst,
4029  int32_t dst_stride,
4030  const int8_t *filter_x,
4031  const int8_t *filter_y,
4032  int32_t height)
4033 {
4034  hevc_hv_uni_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
4035  filter_x, filter_y, height, 4);
4036 }
4037 
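/* The macros below generate the ff_hevc_put_hevc_uni_* entry points used
 * by the HEVC decoder and forward them to the width-specific MSA helpers
 * above. */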
4038 #define UNI_MC_COPY(WIDTH) \
4039 void ff_hevc_put_hevc_uni_pel_pixels##WIDTH##_8_msa(uint8_t *dst, \
4040  ptrdiff_t dst_stride, \
4041  uint8_t *src, \
4042  ptrdiff_t src_stride, \
4043  int height, \
4044  intptr_t mx, \
4045  intptr_t my, \
4046  int width) \
4047 { \
4048  copy_width##WIDTH##_msa(src, src_stride, dst, dst_stride, height); \
4049 }
4050 
4051 UNI_MC_COPY(8);
4052 UNI_MC_COPY(12);
4053 UNI_MC_COPY(16);
4054 UNI_MC_COPY(24);
4055 UNI_MC_COPY(32);
4056 UNI_MC_COPY(48);
4057 UNI_MC_COPY(64);
4058 
4059 #undef UNI_MC_COPY
4060 
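/* FILT_DIR is mx for the horizontal (hz) variants and my for the vertical
 * (vt) ones; the fractional-pel offset minus one indexes the matching
 * ff_hevc_qpel_filters / ff_hevc_epel_filters entry. */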
4061 #define UNI_MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR) \
4062 void ff_hevc_put_hevc_uni_##PEL##_##DIR##WIDTH##_8_msa(uint8_t *dst, \
4063  ptrdiff_t dst_stride, \
4064  uint8_t *src, \
4065  ptrdiff_t src_stride, \
4066  int height, \
4067  intptr_t mx, \
4068  intptr_t my, \
4069  int width) \
4070 { \
4071  const int8_t *filter = ff_hevc_##PEL##_filters[FILT_DIR - 1]; \
4072  \
4073  common_##DIR1##_##TAP##t_##WIDTH##w_msa(src, src_stride, dst, dst_stride, \
4074  filter, height); \
4075 }
4076 
4077 UNI_MC(qpel, h, 4, 8, hz, mx);
4078 UNI_MC(qpel, h, 8, 8, hz, mx);
4079 UNI_MC(qpel, h, 12, 8, hz, mx);
4080 UNI_MC(qpel, h, 16, 8, hz, mx);
4081 UNI_MC(qpel, h, 24, 8, hz, mx);
4082 UNI_MC(qpel, h, 32, 8, hz, mx);
4083 UNI_MC(qpel, h, 48, 8, hz, mx);
4084 UNI_MC(qpel, h, 64, 8, hz, mx);
4085 
4086 UNI_MC(qpel, v, 4, 8, vt, my);
4087 UNI_MC(qpel, v, 8, 8, vt, my);
4088 UNI_MC(qpel, v, 12, 8, vt, my);
4089 UNI_MC(qpel, v, 16, 8, vt, my);
4090 UNI_MC(qpel, v, 24, 8, vt, my);
4091 UNI_MC(qpel, v, 32, 8, vt, my);
4092 UNI_MC(qpel, v, 48, 8, vt, my);
4093 UNI_MC(qpel, v, 64, 8, vt, my);
4094 
4095 UNI_MC(epel, h, 4, 4, hz, mx);
4096 UNI_MC(epel, h, 6, 4, hz, mx);
4097 UNI_MC(epel, h, 8, 4, hz, mx);
4098 UNI_MC(epel, h, 12, 4, hz, mx);
4099 UNI_MC(epel, h, 16, 4, hz, mx);
4100 UNI_MC(epel, h, 24, 4, hz, mx);
4101 UNI_MC(epel, h, 32, 4, hz, mx);
4102 
4103 UNI_MC(epel, v, 4, 4, vt, my);
4104 UNI_MC(epel, v, 6, 4, vt, my);
4105 UNI_MC(epel, v, 8, 4, vt, my);
4106 UNI_MC(epel, v, 12, 4, vt, my);
4107 UNI_MC(epel, v, 16, 4, vt, my);
4108 UNI_MC(epel, v, 24, 4, vt, my);
4109 UNI_MC(epel, v, 32, 4, vt, my);
4110 
4111 #undef UNI_MC
4112 
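/* The hv wrappers look up independent horizontal (mx) and vertical (my)
 * filters and hand both to the two-stage helpers. */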
4113 #define UNI_MC_HV(PEL, WIDTH, TAP) \
4114 void ff_hevc_put_hevc_uni_##PEL##_hv##WIDTH##_8_msa(uint8_t *dst, \
4115  ptrdiff_t dst_stride, \
4116  uint8_t *src, \
4117  ptrdiff_t src_stride, \
4118  int height, \
4119  intptr_t mx, \
4120  intptr_t my, \
4121  int width) \
4122 { \
4123  const int8_t *filter_x = ff_hevc_##PEL##_filters[mx - 1]; \
4124  const int8_t *filter_y = ff_hevc_##PEL##_filters[my - 1]; \
4125  \
4126  hevc_hv_uni_##TAP##t_##WIDTH##w_msa(src, src_stride, dst, dst_stride, \
4127  filter_x, filter_y, height); \
4128 }
4129 
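/* For reference, a sketch of what one instantiation expands to — e.g.
 * UNI_MC_HV(epel, 4, 4) yields (modulo whitespace) roughly:
 *
 *   void ff_hevc_put_hevc_uni_epel_hv4_8_msa(uint8_t *dst,
 *                                            ptrdiff_t dst_stride,
 *                                            uint8_t *src,
 *                                            ptrdiff_t src_stride,
 *                                            int height, intptr_t mx,
 *                                            intptr_t my, int width)
 *   {
 *       const int8_t *filter_x = ff_hevc_epel_filters[mx - 1];
 *       const int8_t *filter_y = ff_hevc_epel_filters[my - 1];
 *
 *       hevc_hv_uni_4t_4w_msa(src, src_stride, dst, dst_stride,
 *                             filter_x, filter_y, height);
 *   }
 */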
4130 UNI_MC_HV(qpel, 4, 8);
4131 UNI_MC_HV(qpel, 8, 8);
4132 UNI_MC_HV(qpel, 12, 8);
4133 UNI_MC_HV(qpel, 16, 8);
4134 UNI_MC_HV(qpel, 24, 8);
4135 UNI_MC_HV(qpel, 32, 8);
4136 UNI_MC_HV(qpel, 48, 8);
4137 UNI_MC_HV(qpel, 64, 8);
4138 
4139 UNI_MC_HV(epel, 4, 4);
4140 UNI_MC_HV(epel, 6, 4);
4141 UNI_MC_HV(epel, 8, 4);
4142 UNI_MC_HV(epel, 12, 4);
4143 UNI_MC_HV(epel, 16, 4);
4144 UNI_MC_HV(epel, 24, 4);
4145 UNI_MC_HV(epel, 32, 4);
4146 
4147 #undef UNI_MC_HV