/*
 * Copyright (c) 2015 - 2017 Shivraj Patil (Shivraj.Patil@imgtec.com)
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavcodec/vp9dsp.h"
#include "libavutil/mips/generic_macros_msa.h"
#include "vp9dsp_mips.h"

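/* filter4: the 4-tap loop filter applied to p1, p0, q0, q1. Pixels are
 * mapped to signed range (XOR 0x80), a clipped delta is formed from
 * 3 * (q0 - p0) plus the hev-gated (p1 - q1) term, p0/q0 are adjusted by
 * the saturated, >>3 delta, and p1/q1 are additionally adjusted (by the
 * rounded half-delta) only where hev is not set. */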
#define VP9_LPF_FILTER4_4W(p1_in, p0_in, q0_in, q1_in, mask_in, hev_in, \
                           p1_out, p0_out, q0_out, q1_out) \
{ \
    v16i8 p1_m, p0_m, q0_m, q1_m, q0_sub_p0, filt, filt1, filt2; \
    const v16i8 cnst4b = __msa_ldi_b(4); \
    const v16i8 cnst3b = __msa_ldi_b(3); \
 \
    p1_m = (v16i8) __msa_xori_b(p1_in, 0x80); \
    p0_m = (v16i8) __msa_xori_b(p0_in, 0x80); \
    q0_m = (v16i8) __msa_xori_b(q0_in, 0x80); \
    q1_m = (v16i8) __msa_xori_b(q1_in, 0x80); \
 \
    /* (p1 - q1) term, kept only where hev is set */ \
    filt = __msa_subs_s_b(p1_m, q1_m); \
    filt = filt & (v16i8) hev_in; \
 \
    /* filt = clip(filt + 3 * (q0 - p0)), gated by the filter mask */ \
    q0_sub_p0 = __msa_subs_s_b(q0_m, p0_m); \
    filt = __msa_adds_s_b(filt, q0_sub_p0); \
    filt = __msa_adds_s_b(filt, q0_sub_p0); \
    filt = __msa_adds_s_b(filt, q0_sub_p0); \
    filt = filt & (v16i8) mask_in; \
 \
    filt1 = __msa_adds_s_b(filt, cnst4b); \
    filt1 >>= 3; \
 \
    filt2 = __msa_adds_s_b(filt, cnst3b); \
    filt2 >>= 3; \
 \
    q0_m = __msa_subs_s_b(q0_m, filt1); \
    q0_out = __msa_xori_b((v16u8) q0_m, 0x80); \
    p0_m = __msa_adds_s_b(p0_m, filt2); \
    p0_out = __msa_xori_b((v16u8) p0_m, 0x80); \
 \
    /* p1/q1 are only adjusted where hev is not set */ \
    filt = __msa_srari_b(filt1, 1); \
    hev_in = __msa_xori_b((v16u8) hev_in, 0xff); \
    filt = filt & (v16i8) hev_in; \
 \
    q1_m = __msa_subs_s_b(q1_m, filt); \
    q1_out = __msa_xori_b((v16u8) q1_m, 0x80); \
    p1_m = __msa_adds_s_b(p1_m, filt); \
    p1_out = __msa_xori_b((v16u8) p1_m, 0x80); \
}

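/* flat4: builds the "flat" selection mask for the 8-tap filter. flat_out
 * must hold max(|p1 - p0|, |q1 - q0|) on entry (as left by LPF_MASK_HEV);
 * a pixel is flat when that value and |p2 - p0|, |q2 - q0|, |p3 - p0|,
 * |q3 - q0| are all <= 1. Note the macro also ANDs with the caller's
 * local variable `mask`, which it uses implicitly. */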
#define VP9_FLAT4(p3_in, p2_in, p0_in, q0_in, q2_in, q3_in, flat_out) \
{ \
    v16u8 tmp, p2_a_sub_p0, q2_a_sub_q0, p3_a_sub_p0, q3_a_sub_q0; \
    v16u8 zero_in = { 0 }; \
 \
    tmp = __msa_ori_b(zero_in, 1); \
    p2_a_sub_p0 = __msa_asub_u_b(p2_in, p0_in); \
    q2_a_sub_q0 = __msa_asub_u_b(q2_in, q0_in); \
    p3_a_sub_p0 = __msa_asub_u_b(p3_in, p0_in); \
    q3_a_sub_q0 = __msa_asub_u_b(q3_in, q0_in); \
 \
    p2_a_sub_p0 = __msa_max_u_b(p2_a_sub_p0, q2_a_sub_q0); \
    flat_out = __msa_max_u_b(p2_a_sub_p0, flat_out); \
    p3_a_sub_p0 = __msa_max_u_b(p3_a_sub_p0, q3_a_sub_q0); \
    flat_out = __msa_max_u_b(p3_a_sub_p0, flat_out); \
 \
    flat_out = (tmp < (v16u8) flat_out); \
    flat_out = __msa_xori_b(flat_out, 0xff); \
    flat_out = flat_out & (mask); \
}

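/* flat2: extends the flatness test to the outer pixels for the 16-tap
 * filter. A pixel qualifies when |p4..p7 - p0| and |q4..q7 - q0| are all
 * <= 1 and it was already selected by the flat4 mask (flat_in). */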
#define VP9_FLAT5(p7_in, p6_in, p5_in, p4_in, p0_in, q0_in, q4_in, \
                  q5_in, q6_in, q7_in, flat_in, flat2_out) \
{ \
    v16u8 tmp, zero_in = { 0 }; \
    v16u8 p4_a_sub_p0, q4_a_sub_q0, p5_a_sub_p0, q5_a_sub_q0; \
    v16u8 p6_a_sub_p0, q6_a_sub_q0, p7_a_sub_p0, q7_a_sub_q0; \
 \
    tmp = __msa_ori_b(zero_in, 1); \
    p4_a_sub_p0 = __msa_asub_u_b(p4_in, p0_in); \
    q4_a_sub_q0 = __msa_asub_u_b(q4_in, q0_in); \
    p5_a_sub_p0 = __msa_asub_u_b(p5_in, p0_in); \
    q5_a_sub_q0 = __msa_asub_u_b(q5_in, q0_in); \
    p6_a_sub_p0 = __msa_asub_u_b(p6_in, p0_in); \
    q6_a_sub_q0 = __msa_asub_u_b(q6_in, q0_in); \
    p7_a_sub_p0 = __msa_asub_u_b(p7_in, p0_in); \
    q7_a_sub_q0 = __msa_asub_u_b(q7_in, q0_in); \
 \
    p4_a_sub_p0 = __msa_max_u_b(p4_a_sub_p0, q4_a_sub_q0); \
    flat2_out = __msa_max_u_b(p5_a_sub_p0, q5_a_sub_q0); \
    flat2_out = __msa_max_u_b(p4_a_sub_p0, flat2_out); \
    p6_a_sub_p0 = __msa_max_u_b(p6_a_sub_p0, q6_a_sub_q0); \
    flat2_out = __msa_max_u_b(p6_a_sub_p0, flat2_out); \
    p7_a_sub_p0 = __msa_max_u_b(p7_a_sub_p0, q7_a_sub_q0); \
    flat2_out = __msa_max_u_b(p7_a_sub_p0, flat2_out); \
 \
    flat2_out = (tmp < (v16u8) flat2_out); \
    flat2_out = __msa_xori_b(flat2_out, 0xff); \
    flat2_out = flat2_out & flat_in; \
}

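/* filter8: the 8-tap smoothing filter, operating on pixels widened to
 * 16 bits. Each output is a rounded, 3-bit-shifted 7-tap sum, e.g.
 * p2' = (3 * p3 + 2 * p2 + p1 + p0 + q0 + 4) >> 3; the sums are built
 * incrementally so common sub-expressions are reused. */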
#define VP9_FILTER8(p3_in, p2_in, p1_in, p0_in, \
                    q0_in, q1_in, q2_in, q3_in, \
                    p2_filt8_out, p1_filt8_out, p0_filt8_out, \
                    q0_filt8_out, q1_filt8_out, q2_filt8_out) \
{ \
    v8u16 tmp0, tmp1, tmp2; \
 \
    tmp2 = p2_in + p1_in + p0_in; \
    tmp0 = p3_in << 1; \
 \
    tmp0 = tmp0 + tmp2 + q0_in; \
    tmp1 = tmp0 + p3_in + p2_in; \
    p2_filt8_out = (v8i16) __msa_srari_h((v8i16) tmp1, 3); \
 \
    tmp1 = tmp0 + p1_in + q1_in; \
    p1_filt8_out = (v8i16) __msa_srari_h((v8i16) tmp1, 3); \
 \
    tmp1 = q2_in + q1_in + q0_in; \
    tmp2 = tmp2 + tmp1; \
    tmp0 = tmp2 + (p0_in); \
    tmp0 = tmp0 + (p3_in); \
    p0_filt8_out = (v8i16) __msa_srari_h((v8i16) tmp0, 3); \
 \
    tmp0 = q2_in + q3_in; \
    tmp0 = p0_in + tmp1 + tmp0; \
    tmp1 = q3_in + q3_in; \
    tmp1 = tmp1 + tmp0; \
    q2_filt8_out = (v8i16) __msa_srari_h((v8i16) tmp1, 3); \
 \
    tmp0 = tmp2 + q3_in; \
    tmp1 = tmp0 + q0_in; \
    q0_filt8_out = (v8i16) __msa_srari_h((v8i16) tmp1, 3); \
 \
    tmp1 = tmp0 - p2_in; \
    tmp0 = q1_in + q3_in; \
    tmp1 = tmp0 + tmp1; \
    q1_filt8_out = (v8i16) __msa_srari_h((v8i16) tmp1, 3); \
}

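/* mask/hev: per-pixel loop-filter decisions. hev_out marks high edge
 * variance (max(|p1 - p0|, |q1 - q0|) > thresh). mask_out is the main
 * filter mask: 2 * |p0 - q0| + |p1 - q1| / 2 must stay within b_limit
 * and every neighbour delta within limit. flat_out is left holding
 * max(|p1 - p0|, |q1 - q0|) for a following VP9_FLAT4. */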
#define LPF_MASK_HEV(p3_in, p2_in, p1_in, p0_in, \
                     q0_in, q1_in, q2_in, q3_in, \
                     limit_in, b_limit_in, thresh_in, \
                     hev_out, mask_out, flat_out) \
{ \
    v16u8 p3_asub_p2_m, p2_asub_p1_m, p1_asub_p0_m, q1_asub_q0_m; \
    v16u8 p1_asub_q1_m, p0_asub_q0_m, q3_asub_q2_m, q2_asub_q1_m; \
 \
    /* absolute subtraction of pixel values */ \
    p3_asub_p2_m = __msa_asub_u_b(p3_in, p2_in); \
    p2_asub_p1_m = __msa_asub_u_b(p2_in, p1_in); \
    p1_asub_p0_m = __msa_asub_u_b(p1_in, p0_in); \
    q1_asub_q0_m = __msa_asub_u_b(q1_in, q0_in); \
    q2_asub_q1_m = __msa_asub_u_b(q2_in, q1_in); \
    q3_asub_q2_m = __msa_asub_u_b(q3_in, q2_in); \
    p0_asub_q0_m = __msa_asub_u_b(p0_in, q0_in); \
    p1_asub_q1_m = __msa_asub_u_b(p1_in, q1_in); \
 \
    /* calculation of hev */ \
    flat_out = __msa_max_u_b(p1_asub_p0_m, q1_asub_q0_m); \
    hev_out = thresh_in < (v16u8) flat_out; \
 \
    /* calculation of mask */ \
    p0_asub_q0_m = __msa_adds_u_b(p0_asub_q0_m, p0_asub_q0_m); \
    p1_asub_q1_m >>= 1; \
    p0_asub_q0_m = __msa_adds_u_b(p0_asub_q0_m, p1_asub_q1_m); \
 \
    mask_out = b_limit_in < p0_asub_q0_m; \
    mask_out = __msa_max_u_b(flat_out, mask_out); \
    p3_asub_p2_m = __msa_max_u_b(p3_asub_p2_m, p2_asub_p1_m); \
    mask_out = __msa_max_u_b(p3_asub_p2_m, mask_out); \
    q2_asub_q1_m = __msa_max_u_b(q2_asub_q1_m, q3_asub_q2_m); \
    mask_out = __msa_max_u_b(q2_asub_q1_m, mask_out); \
 \
    mask_out = limit_in < (v16u8) mask_out; \
    mask_out = __msa_xori_b(mask_out, 0xff); \
}

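/* 4-tap loop filter across a horizontal edge, 8 pixels wide. The full
 * 16-lane filter is computed, but only the low 8 bytes of each output
 * row are stored back. */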
void ff_loop_filter_v_4_8_msa(uint8_t *src, ptrdiff_t pitch,
                              int32_t b_limit_ptr,
                              int32_t limit_ptr,
                              int32_t thresh_ptr)
{
    uint64_t p1_d, p0_d, q0_d, q1_d;
    v16u8 mask, hev, flat, thresh, b_limit, limit;
    v16u8 p3, p2, p1, p0, q3, q2, q1, q0, p1_out, p0_out, q0_out, q1_out;

    /* load vector elements */
    LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);

    thresh = (v16u8) __msa_fill_b(thresh_ptr);
    b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
    limit = (v16u8) __msa_fill_b(limit_ptr);

    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
                 hev, mask, flat);
    VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
                       q1_out);

    p1_d = __msa_copy_u_d((v2i64) p1_out, 0);
    p0_d = __msa_copy_u_d((v2i64) p0_out, 0);
    q0_d = __msa_copy_u_d((v2i64) q0_out, 0);
    q1_d = __msa_copy_u_d((v2i64) q1_out, 0);
    SD4(p1_d, p0_d, q0_d, q1_d, (src - 2 * pitch), pitch);
}

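/* Two adjacent 8-pixel 4-tap horizontal-edge filters in one 16-lane pass;
 * the second edge's thresh/b_limit/limit arrive in bits 8-15 of the
 * scalar arguments. */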
void ff_loop_filter_v_44_16_msa(uint8_t *src, ptrdiff_t pitch,
                                int32_t b_limit_ptr,
                                int32_t limit_ptr,
                                int32_t thresh_ptr)
{
    v16u8 mask, hev, flat, thresh0, b_limit0, limit0, thresh1, b_limit1, limit1;
    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;

    /* load vector elements */
    LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);

    thresh0 = (v16u8) __msa_fill_b(thresh_ptr);
    thresh1 = (v16u8) __msa_fill_b(thresh_ptr >> 8);
    thresh0 = (v16u8) __msa_ilvr_d((v2i64) thresh1, (v2i64) thresh0);

    b_limit0 = (v16u8) __msa_fill_b(b_limit_ptr);
    b_limit1 = (v16u8) __msa_fill_b(b_limit_ptr >> 8);
    b_limit0 = (v16u8) __msa_ilvr_d((v2i64) b_limit1, (v2i64) b_limit0);

    limit0 = (v16u8) __msa_fill_b(limit_ptr);
    limit1 = (v16u8) __msa_fill_b(limit_ptr >> 8);
    limit0 = (v16u8) __msa_ilvr_d((v2i64) limit1, (v2i64) limit0);

    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0,
                 hev, mask, flat);
    VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1);

    ST_UB4(p1, p0, q0, q1, (src - 2 * pitch), pitch);
}

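/* 8-tap loop filter across a horizontal edge, 8 pixels wide. filter4 is
 * always applied; where the flat4 mask is set, the filter8 output
 * (computed on 16-bit-widened pixels) replaces it via __msa_bmnz_v. */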
void ff_loop_filter_v_8_8_msa(uint8_t *src, ptrdiff_t pitch,
                              int32_t b_limit_ptr,
                              int32_t limit_ptr,
                              int32_t thresh_ptr)
{
    uint64_t p2_d, p1_d, p0_d, q0_d, q1_d, q2_d;
    v16u8 mask, hev, flat, thresh, b_limit, limit;
    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
    v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
    v8i16 p2_filter8, p1_filter8, p0_filter8;
    v8i16 q0_filter8, q1_filter8, q2_filter8;
    v8u16 p3_r, p2_r, p1_r, p0_r, q3_r, q2_r, q1_r, q0_r;
    v16i8 zero = { 0 };

    /* load vector elements */
    LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);

    thresh = (v16u8) __msa_fill_b(thresh_ptr);
    b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
    limit = (v16u8) __msa_fill_b(limit_ptr);

    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
                 hev, mask, flat);
    VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
    VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
                       q1_out);

    flat = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) flat);

    /* if flat is zero for all pixels, then no need to calculate other filter */
    if (__msa_test_bz_v(flat)) {
        p1_d = __msa_copy_u_d((v2i64) p1_out, 0);
        p0_d = __msa_copy_u_d((v2i64) p0_out, 0);
        q0_d = __msa_copy_u_d((v2i64) q0_out, 0);
        q1_d = __msa_copy_u_d((v2i64) q1_out, 0);
        SD4(p1_d, p0_d, q0_d, q1_d, (src - 2 * pitch), pitch);
    } else {
        ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
                   zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r,
                   q2_r, q3_r);
        VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filter8,
                    p1_filter8, p0_filter8, q0_filter8, q1_filter8, q2_filter8);

        /* convert 16 bit output data into 8 bit */
        PCKEV_B4_SH(zero, p2_filter8, zero, p1_filter8, zero, p0_filter8,
                    zero, q0_filter8, p2_filter8, p1_filter8, p0_filter8,
                    q0_filter8);
        PCKEV_B2_SH(zero, q1_filter8, zero, q2_filter8, q1_filter8, q2_filter8);

        /* store pixel values */
        p2_out = __msa_bmnz_v(p2, (v16u8) p2_filter8, flat);
        p1_out = __msa_bmnz_v(p1_out, (v16u8) p1_filter8, flat);
        p0_out = __msa_bmnz_v(p0_out, (v16u8) p0_filter8, flat);
        q0_out = __msa_bmnz_v(q0_out, (v16u8) q0_filter8, flat);
        q1_out = __msa_bmnz_v(q1_out, (v16u8) q1_filter8, flat);
        q2_out = __msa_bmnz_v(q2, (v16u8) q2_filter8, flat);

        p2_d = __msa_copy_u_d((v2i64) p2_out, 0);
        p1_d = __msa_copy_u_d((v2i64) p1_out, 0);
        p0_d = __msa_copy_u_d((v2i64) p0_out, 0);
        q0_d = __msa_copy_u_d((v2i64) q0_out, 0);
        q1_d = __msa_copy_u_d((v2i64) q1_out, 0);
        q2_d = __msa_copy_u_d((v2i64) q2_out, 0);

        src -= 3 * pitch;

        SD4(p2_d, p1_d, p0_d, q0_d, src, pitch);
        src += (4 * pitch);
        SD(q1_d, src);
        src += pitch;
        SD(q2_d, src);
    }
}

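/* Dual 8-pixel 8-tap horizontal-edge filter (16 lanes). The right/left
 * halves of the widened pixels are filtered separately and re-packed. */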
void ff_loop_filter_v_88_16_msa(uint8_t *src, ptrdiff_t pitch,
                                int32_t b_limit_ptr,
                                int32_t limit_ptr,
                                int32_t thresh_ptr)
{
    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
    v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
    v16u8 flat, mask, hev, tmp, thresh, b_limit, limit;
    v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
    v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
    v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r;
    v8i16 q0_filt8_r, q1_filt8_r, q2_filt8_r;
    v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l;
    v8i16 q0_filt8_l, q1_filt8_l, q2_filt8_l;
    v16u8 zero = { 0 };

    /* load vector elements */
    LD_UB8(src - (4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);

    thresh = (v16u8) __msa_fill_b(thresh_ptr);
    tmp = (v16u8) __msa_fill_b(thresh_ptr >> 8);
    thresh = (v16u8) __msa_ilvr_d((v2i64) tmp, (v2i64) thresh);

    b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
    tmp = (v16u8) __msa_fill_b(b_limit_ptr >> 8);
    b_limit = (v16u8) __msa_ilvr_d((v2i64) tmp, (v2i64) b_limit);

    limit = (v16u8) __msa_fill_b(limit_ptr);
    tmp = (v16u8) __msa_fill_b(limit_ptr >> 8);
    limit = (v16u8) __msa_ilvr_d((v2i64) tmp, (v2i64) limit);

    /* mask and hev */
    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
                 hev, mask, flat);
    VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
    VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
                       q1_out);

    /* if flat is zero for all pixels, then no need to calculate other filter */
    if (__msa_test_bz_v(flat)) {
        ST_UB4(p1_out, p0_out, q0_out, q1_out, (src - 2 * pitch), pitch);
    } else {
        ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
                   zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r,
                   q2_r, q3_r);
        VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
                    p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);

        ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l,
                   p0_l);
        ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l,
                   q3_l);
        VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
                    p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);

        /* convert 16 bit output data into 8 bit */
        PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l,
                    p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r,
                    p0_filt8_r, q0_filt8_r);
        PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r,
                    q1_filt8_r, q2_filt8_r);

        /* store pixel values */
        p2_out = __msa_bmnz_v(p2, (v16u8) p2_filt8_r, flat);
        p1_out = __msa_bmnz_v(p1_out, (v16u8) p1_filt8_r, flat);
        p0_out = __msa_bmnz_v(p0_out, (v16u8) p0_filt8_r, flat);
        q0_out = __msa_bmnz_v(q0_out, (v16u8) q0_filt8_r, flat);
        q1_out = __msa_bmnz_v(q1_out, (v16u8) q1_filt8_r, flat);
        q2_out = __msa_bmnz_v(q2, (v16u8) q2_filt8_r, flat);

        src -= 3 * pitch;

        ST_UB4(p2_out, p1_out, p0_out, q0_out, src, pitch);
        src += (4 * pitch);
        ST_UB2(q1_out, q2_out, src, pitch);
        src += (2 * pitch);
    }
}

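/* Mixed horizontal-edge filter: 8-tap on the first (low-lane) 8 pixels,
 * 4-tap on the second 8, so filter8 only needs the right-half data and
 * the flat mask's upper half is cleared. */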
void ff_loop_filter_v_84_16_msa(uint8_t *src, ptrdiff_t pitch,
                                int32_t b_limit_ptr,
                                int32_t limit_ptr,
                                int32_t thresh_ptr)
{
    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
    v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
    v16u8 flat, mask, hev, tmp, thresh, b_limit, limit;
    v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
    v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r;
    v8i16 q0_filt8_r, q1_filt8_r, q2_filt8_r;
    v16u8 zero = { 0 };

    /* load vector elements */
    LD_UB8(src - (4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);

    thresh = (v16u8) __msa_fill_b(thresh_ptr);
    tmp = (v16u8) __msa_fill_b(thresh_ptr >> 8);
    thresh = (v16u8) __msa_ilvr_d((v2i64) tmp, (v2i64) thresh);

    b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
    tmp = (v16u8) __msa_fill_b(b_limit_ptr >> 8);
    b_limit = (v16u8) __msa_ilvr_d((v2i64) tmp, (v2i64) b_limit);

    limit = (v16u8) __msa_fill_b(limit_ptr);
    tmp = (v16u8) __msa_fill_b(limit_ptr >> 8);
    limit = (v16u8) __msa_ilvr_d((v2i64) tmp, (v2i64) limit);

    /* mask and hev */
    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
                 hev, mask, flat);
    VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
    VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
                       q1_out);

    flat = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) flat);

    /* if flat is zero for all pixels, then no need to calculate other filter */
    if (__msa_test_bz_v(flat)) {
        ST_UB4(p1_out, p0_out, q0_out, q1_out, (src - 2 * pitch), pitch);
    } else {
        ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
                   zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r,
                   q2_r, q3_r);
        VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
                    p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);

        /* convert 16 bit output data into 8 bit */
        PCKEV_B4_SH(p2_filt8_r, p2_filt8_r, p1_filt8_r, p1_filt8_r,
                    p0_filt8_r, p0_filt8_r, q0_filt8_r, q0_filt8_r,
                    p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r);
        PCKEV_B2_SH(q1_filt8_r, q1_filt8_r, q2_filt8_r, q2_filt8_r,
                    q1_filt8_r, q2_filt8_r);

        /* store pixel values */
        p2_out = __msa_bmnz_v(p2, (v16u8) p2_filt8_r, flat);
        p1_out = __msa_bmnz_v(p1_out, (v16u8) p1_filt8_r, flat);
        p0_out = __msa_bmnz_v(p0_out, (v16u8) p0_filt8_r, flat);
        q0_out = __msa_bmnz_v(q0_out, (v16u8) q0_filt8_r, flat);
        q1_out = __msa_bmnz_v(q1_out, (v16u8) q1_filt8_r, flat);
        q2_out = __msa_bmnz_v(q2, (v16u8) q2_filt8_r, flat);

        src -= 3 * pitch;

        ST_UB4(p2_out, p1_out, p0_out, q0_out, src, pitch);
        src += (4 * pitch);
        ST_UB2(q1_out, q2_out, src, pitch);
        src += (2 * pitch);
    }
}

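/* Mixed horizontal-edge filter: 4-tap on the first 8 pixels, 8-tap on the
 * second 8, so filter8 only needs the left-half data and the flat mask's
 * lower half is cleared. */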
void ff_loop_filter_v_48_16_msa(uint8_t *src, ptrdiff_t pitch,
                                int32_t b_limit_ptr,
                                int32_t limit_ptr,
                                int32_t thresh_ptr)
{
    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
    v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
    v16u8 flat, mask, hev, tmp, thresh, b_limit, limit;
    v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
    v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l;
    v8i16 q0_filt8_l, q1_filt8_l, q2_filt8_l;
    v16u8 zero = { 0 };

    /* load vector elements */
    LD_UB8(src - (4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);

    thresh = (v16u8) __msa_fill_b(thresh_ptr);
    tmp = (v16u8) __msa_fill_b(thresh_ptr >> 8);
    thresh = (v16u8) __msa_ilvr_d((v2i64) tmp, (v2i64) thresh);

    b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
    tmp = (v16u8) __msa_fill_b(b_limit_ptr >> 8);
    b_limit = (v16u8) __msa_ilvr_d((v2i64) tmp, (v2i64) b_limit);

    limit = (v16u8) __msa_fill_b(limit_ptr);
    tmp = (v16u8) __msa_fill_b(limit_ptr >> 8);
    limit = (v16u8) __msa_ilvr_d((v2i64) tmp, (v2i64) limit);

    /* mask and hev */
    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
                 hev, mask, flat);
    VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
    VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
                       q1_out);

    flat = (v16u8) __msa_insve_d((v2i64) flat, 0, (v2i64) zero);

    /* if flat is zero for all pixels, then no need to calculate other filter */
    if (__msa_test_bz_v(flat)) {
        ST_UB4(p1_out, p0_out, q0_out, q1_out, (src - 2 * pitch), pitch);
    } else {
        ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l,
                   p0_l);
        ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l,
                   q3_l);
        VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
                    p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);

        /* convert 16 bit output data into 8 bit */
        PCKEV_B4_SH(p2_filt8_l, p2_filt8_l, p1_filt8_l, p1_filt8_l,
                    p0_filt8_l, p0_filt8_l, q0_filt8_l, q0_filt8_l,
                    p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l);
        PCKEV_B2_SH(q1_filt8_l, q1_filt8_l, q2_filt8_l, q2_filt8_l,
                    q1_filt8_l, q2_filt8_l);

        /* store pixel values */
        p2_out = __msa_bmnz_v(p2, (v16u8) p2_filt8_l, flat);
        p1_out = __msa_bmnz_v(p1_out, (v16u8) p1_filt8_l, flat);
        p0_out = __msa_bmnz_v(p0_out, (v16u8) p0_filt8_l, flat);
        q0_out = __msa_bmnz_v(q0_out, (v16u8) q0_filt8_l, flat);
        q1_out = __msa_bmnz_v(q1_out, (v16u8) q1_filt8_l, flat);
        q2_out = __msa_bmnz_v(q2, (v16u8) q2_filt8_l, flat);

        src -= 3 * pitch;

        ST_UB4(p2_out, p1_out, p0_out, q0_out, src, pitch);
        src += (4 * pitch);
        ST_UB2(q1_out, q2_out, src, pitch);
        src += (2 * pitch);
    }
}

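/* First pass of the 16-pixel-wide 16-tap horizontal-edge filter: runs
 * filter4/filter8 and, unless the flat mask allows an early exit
 * (return 1), stashes the six filter8 rows plus the flat mask in the
 * 16-byte-strided filter48 scratch buffer for the second pass. */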
static int32_t vp9_hz_lpf_t4_and_t8_16w(uint8_t *src, ptrdiff_t pitch,
                                        uint8_t *filter48,
                                        int32_t b_limit_ptr,
                                        int32_t limit_ptr,
                                        int32_t thresh_ptr)
{
    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
    v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
    v16u8 flat, mask, hev, thresh, b_limit, limit;
    v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
    v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
    v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r;
    v8i16 q0_filt8_r, q1_filt8_r, q2_filt8_r;
    v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l;
    v8i16 q0_filt8_l, q1_filt8_l, q2_filt8_l;
    v16u8 zero = { 0 };

    /* load vector elements */
    LD_UB8(src - (4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);

    thresh = (v16u8) __msa_fill_b(thresh_ptr);
    b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
    limit = (v16u8) __msa_fill_b(limit_ptr);

    /* mask and hev */
    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
                 hev, mask, flat);
    VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
    VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
                       q1_out);

    /* if flat is zero for all pixels, then no need to calculate other filter */
    if (__msa_test_bz_v(flat)) {
        ST_UB4(p1_out, p0_out, q0_out, q1_out, (src - 2 * pitch), pitch);

        return 1;
    } else {
        ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
                   zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r,
                   q2_r, q3_r);
        VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
                    p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);

        ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l,
                   p0_l);
        ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l,
                   q3_l);
        VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
                    p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);

        /* convert 16 bit output data into 8 bit */
        PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l,
                    p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r,
                    p0_filt8_r, q0_filt8_r);
        PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r, q1_filt8_r,
                    q2_filt8_r);

        /* store pixel values */
        p2_out = __msa_bmnz_v(p2, (v16u8) p2_filt8_r, flat);
        p1_out = __msa_bmnz_v(p1_out, (v16u8) p1_filt8_r, flat);
        p0_out = __msa_bmnz_v(p0_out, (v16u8) p0_filt8_r, flat);
        q0_out = __msa_bmnz_v(q0_out, (v16u8) q0_filt8_r, flat);
        q1_out = __msa_bmnz_v(q1_out, (v16u8) q1_filt8_r, flat);
        q2_out = __msa_bmnz_v(q2, (v16u8) q2_filt8_r, flat);

        ST_UB4(p2_out, p1_out, p0_out, q0_out, filter48, 16);
        filter48 += (4 * 16);
        ST_UB2(q1_out, q2_out, filter48, 16);
        filter48 += (2 * 16);
        ST_UB(flat, filter48);

        return 0;
    }
}

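/* Second pass: the 15-tap "filter16" smoothing. Each output row is a
 * rounded, 4-bit-shifted sum over p7..q7 that is updated incrementally
 * (add the incoming tap, subtract the outgoing one) as the window slides
 * from p6 down to q6; flat2 selects between this, the stored filter8
 * result, and the unfiltered pixel. */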
static void vp9_hz_lpf_t16_16w(uint8_t *src, ptrdiff_t pitch, uint8_t *filter48)
{
    v16u8 flat, flat2, filter8;
    v16i8 zero = { 0 };
    v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
    v8u16 p7_r_in, p6_r_in, p5_r_in, p4_r_in;
    v8u16 p3_r_in, p2_r_in, p1_r_in, p0_r_in;
    v8u16 q7_r_in, q6_r_in, q5_r_in, q4_r_in;
    v8u16 q3_r_in, q2_r_in, q1_r_in, q0_r_in;
    v8u16 p7_l_in, p6_l_in, p5_l_in, p4_l_in;
    v8u16 p3_l_in, p2_l_in, p1_l_in, p0_l_in;
    v8u16 q7_l_in, q6_l_in, q5_l_in, q4_l_in;
    v8u16 q3_l_in, q2_l_in, q1_l_in, q0_l_in;
    v8u16 tmp0_r, tmp1_r, tmp0_l, tmp1_l;
    v8i16 l_out, r_out;

    flat = LD_UB(filter48 + 96);

    LD_UB8((src - 8 * pitch), pitch, p7, p6, p5, p4, p3, p2, p1, p0);
    LD_UB8(src, pitch, q0, q1, q2, q3, q4, q5, q6, q7);
    VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);

    /* if flat2 is zero for all pixels, then no need to calculate other filter */
    if (__msa_test_bz_v(flat2)) {
        LD_UB4(filter48, 16, p2, p1, p0, q0);
        LD_UB2(filter48 + 4 * 16, 16, q1, q2);

        src -= 3 * pitch;
        ST_UB4(p2, p1, p0, q0, src, pitch);
        src += (4 * pitch);
        ST_UB2(q1, q2, src, pitch);
    } else {
        src -= 7 * pitch;

        ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2,
                   zero, p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in,
                   p3_r_in, p2_r_in, p1_r_in, p0_r_in);

        q0_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q0);

        /* p6 */
        tmp0_r = p7_r_in << 3;
        tmp0_r -= p7_r_in;
        tmp0_r += p6_r_in;
        tmp0_r += q0_r_in;
        tmp1_r = p6_r_in + p5_r_in;
        tmp1_r += p4_r_in;
        tmp1_r += p3_r_in;
        tmp1_r += p2_r_in;
        tmp1_r += p1_r_in;
        tmp1_r += p0_r_in;
        tmp1_r += tmp0_r;
        r_out = __msa_srari_h((v8i16) tmp1_r, 4);

        ILVL_B4_UH(zero, p7, zero, p6, zero, p5, zero, p4, p7_l_in, p6_l_in,
                   p5_l_in, p4_l_in);
        ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l_in, p2_l_in,
                   p1_l_in, p0_l_in);
        q0_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q0);

        tmp0_l = p7_l_in << 3;
        tmp0_l -= p7_l_in;
        tmp0_l += p6_l_in;
        tmp0_l += q0_l_in;
        tmp1_l = p6_l_in + p5_l_in;
        tmp1_l += p4_l_in;
        tmp1_l += p3_l_in;
        tmp1_l += p2_l_in;
        tmp1_l += p1_l_in;
        tmp1_l += p0_l_in;
        tmp1_l += tmp0_l;
        l_out = __msa_srari_h((v8i16) tmp1_l, 4);

        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
        p6 = __msa_bmnz_v(p6, (v16u8) r_out, flat2);
        ST_UB(p6, src);
        src += pitch;

        /* p5 */
        q1_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q1);
        tmp0_r = p5_r_in - p6_r_in;
        tmp0_r += q1_r_in;
        tmp0_r -= p7_r_in;
        tmp1_r += tmp0_r;
        r_out = __msa_srari_h((v8i16) tmp1_r, 4);

        q1_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q1);
        tmp0_l = p5_l_in - p6_l_in;
        tmp0_l += q1_l_in;
        tmp0_l -= p7_l_in;
        tmp1_l += tmp0_l;
        l_out = __msa_srari_h((v8i16) tmp1_l, 4);

        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
        p5 = __msa_bmnz_v(p5, (v16u8) r_out, flat2);
        ST_UB(p5, src);
        src += pitch;

        /* p4 */
        q2_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q2);
        tmp0_r = p4_r_in - p5_r_in;
        tmp0_r += q2_r_in;
        tmp0_r -= p7_r_in;
        tmp1_r += tmp0_r;
        r_out = (v8i16) __msa_srari_h((v8i16) tmp1_r, 4);

        q2_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q2);
        tmp0_l = p4_l_in - p5_l_in;
        tmp0_l += q2_l_in;
        tmp0_l -= p7_l_in;
        tmp1_l += tmp0_l;
        l_out = __msa_srari_h((v8i16) tmp1_l, 4);

        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
        p4 = __msa_bmnz_v(p4, (v16u8) r_out, flat2);
        ST_UB(p4, src);
        src += pitch;

        /* p3 */
        q3_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q3);
        tmp0_r = p3_r_in - p4_r_in;
        tmp0_r += q3_r_in;
        tmp0_r -= p7_r_in;
        tmp1_r += tmp0_r;
        r_out = __msa_srari_h((v8i16) tmp1_r, 4);

        q3_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q3);
        tmp0_l = p3_l_in - p4_l_in;
        tmp0_l += q3_l_in;
        tmp0_l -= p7_l_in;
        tmp1_l += tmp0_l;
        l_out = __msa_srari_h((v8i16) tmp1_l, 4);

        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
        p3 = __msa_bmnz_v(p3, (v16u8) r_out, flat2);
        ST_UB(p3, src);
        src += pitch;

        /* p2 */
        q4_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q4);
        filter8 = LD_UB(filter48);
        tmp0_r = p2_r_in - p3_r_in;
        tmp0_r += q4_r_in;
        tmp0_r -= p7_r_in;
        tmp1_r += tmp0_r;
        r_out = __msa_srari_h((v8i16) tmp1_r, 4);

        q4_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q4);
        tmp0_l = p2_l_in - p3_l_in;
        tmp0_l += q4_l_in;
        tmp0_l -= p7_l_in;
        tmp1_l += tmp0_l;
        l_out = __msa_srari_h((v8i16) tmp1_l, 4);

        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
        filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
        ST_UB(filter8, src);
        src += pitch;

        /* p1 */
        q5_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q5);
        filter8 = LD_UB(filter48 + 16);
        tmp0_r = p1_r_in - p2_r_in;
        tmp0_r += q5_r_in;
        tmp0_r -= p7_r_in;
        tmp1_r += tmp0_r;
        r_out = __msa_srari_h((v8i16) tmp1_r, 4);

        q5_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q5);
        tmp0_l = p1_l_in - p2_l_in;
        tmp0_l += q5_l_in;
        tmp0_l -= p7_l_in;
        tmp1_l += tmp0_l;
        l_out = __msa_srari_h((v8i16) tmp1_l, 4);

        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
        filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
        ST_UB(filter8, src);
        src += pitch;

        /* p0 */
        q6_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q6);
        filter8 = LD_UB(filter48 + 32);
        tmp0_r = p0_r_in - p1_r_in;
        tmp0_r += q6_r_in;
        tmp0_r -= p7_r_in;
        tmp1_r += tmp0_r;
        r_out = __msa_srari_h((v8i16) tmp1_r, 4);

        q6_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q6);
        tmp0_l = p0_l_in - p1_l_in;
        tmp0_l += q6_l_in;
        tmp0_l -= p7_l_in;
        tmp1_l += tmp0_l;
        l_out = __msa_srari_h((v8i16) tmp1_l, 4);

        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
        filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
        ST_UB(filter8, src);
        src += pitch;

        /* q0 */
        q7_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q7);
        filter8 = LD_UB(filter48 + 48);
        tmp0_r = q7_r_in - p0_r_in;
        tmp0_r += q0_r_in;
        tmp0_r -= p7_r_in;
        tmp1_r += tmp0_r;
        r_out = __msa_srari_h((v8i16) tmp1_r, 4);

        q7_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q7);
        tmp0_l = q7_l_in - p0_l_in;
        tmp0_l += q0_l_in;
        tmp0_l -= p7_l_in;
        tmp1_l += tmp0_l;
        l_out = __msa_srari_h((v8i16) tmp1_l, 4);

        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
        filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
        ST_UB(filter8, src);
        src += pitch;

        /* q1 */
        filter8 = LD_UB(filter48 + 64);
        tmp0_r = q7_r_in - q0_r_in;
        tmp0_r += q1_r_in;
        tmp0_r -= p6_r_in;
        tmp1_r += tmp0_r;
        r_out = __msa_srari_h((v8i16) tmp1_r, 4);

        tmp0_l = q7_l_in - q0_l_in;
        tmp0_l += q1_l_in;
        tmp0_l -= p6_l_in;
        tmp1_l += tmp0_l;
        l_out = __msa_srari_h((v8i16) tmp1_l, 4);

        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
        filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
        ST_UB(filter8, src);
        src += pitch;

        /* q2 */
        filter8 = LD_UB(filter48 + 80);
        tmp0_r = q7_r_in - q1_r_in;
        tmp0_r += q2_r_in;
        tmp0_r -= p5_r_in;
        tmp1_r += tmp0_r;
        r_out = __msa_srari_h((v8i16) tmp1_r, 4);

        tmp0_l = q7_l_in - q1_l_in;
        tmp0_l += q2_l_in;
        tmp0_l -= p5_l_in;
        tmp1_l += tmp0_l;
        l_out = __msa_srari_h((v8i16) tmp1_l, 4);

        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
        filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
        ST_UB(filter8, src);
        src += pitch;

        /* q3 */
        tmp0_r = q7_r_in - q2_r_in;
        tmp0_r += q3_r_in;
        tmp0_r -= p4_r_in;
        tmp1_r += tmp0_r;
        r_out = __msa_srari_h((v8i16) tmp1_r, 4);

        tmp0_l = q7_l_in - q2_l_in;
        tmp0_l += q3_l_in;
        tmp0_l -= p4_l_in;
        tmp1_l += tmp0_l;
        l_out = __msa_srari_h((v8i16) tmp1_l, 4);

        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
        q3 = __msa_bmnz_v(q3, (v16u8) r_out, flat2);
        ST_UB(q3, src);
        src += pitch;

        /* q4 */
        tmp0_r = q7_r_in - q3_r_in;
        tmp0_r += q4_r_in;
        tmp0_r -= p3_r_in;
        tmp1_r += tmp0_r;
        r_out = __msa_srari_h((v8i16) tmp1_r, 4);

        tmp0_l = q7_l_in - q3_l_in;
        tmp0_l += q4_l_in;
        tmp0_l -= p3_l_in;
        tmp1_l += tmp0_l;
        l_out = __msa_srari_h((v8i16) tmp1_l, 4);

        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
        q4 = __msa_bmnz_v(q4, (v16u8) r_out, flat2);
        ST_UB(q4, src);
        src += pitch;

        /* q5 */
        tmp0_r = q7_r_in - q4_r_in;
        tmp0_r += q5_r_in;
        tmp0_r -= p2_r_in;
        tmp1_r += tmp0_r;
        r_out = __msa_srari_h((v8i16) tmp1_r, 4);

        tmp0_l = q7_l_in - q4_l_in;
        tmp0_l += q5_l_in;
        tmp0_l -= p2_l_in;
        tmp1_l += tmp0_l;
        l_out = __msa_srari_h((v8i16) tmp1_l, 4);

        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
        q5 = __msa_bmnz_v(q5, (v16u8) r_out, flat2);
        ST_UB(q5, src);
        src += pitch;

        /* q6 */
        tmp0_r = q7_r_in - q5_r_in;
        tmp0_r += q6_r_in;
        tmp0_r -= p1_r_in;
        tmp1_r += tmp0_r;
        r_out = __msa_srari_h((v8i16) tmp1_r, 4);

        tmp0_l = q7_l_in - q5_l_in;
        tmp0_l += q6_l_in;
        tmp0_l -= p1_l_in;
        tmp1_l += tmp0_l;
        l_out = __msa_srari_h((v8i16) tmp1_l, 4);

        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
        q6 = __msa_bmnz_v(q6, (v16u8) r_out, flat2);
        ST_UB(q6, src);
    }
}

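/* Full 16-tap loop filter across a horizontal edge, 16 pixels wide, split
 * into the two passes above with an early exit when no pixel needs more
 * than filter4/filter8. */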
void ff_loop_filter_v_16_16_msa(uint8_t *src, ptrdiff_t pitch,
                                int32_t b_limit_ptr,
                                int32_t limit_ptr,
                                int32_t thresh_ptr)
{
    uint8_t filter48[16 * 8] ALLOC_ALIGNED(ALIGNMENT);
    uint8_t early_exit = 0;

    early_exit = vp9_hz_lpf_t4_and_t8_16w(src, pitch, &filter48[0],
                                          b_limit_ptr, limit_ptr, thresh_ptr);

    if (0 == early_exit) {
        vp9_hz_lpf_t16_16w(src, pitch, filter48);
    }
}

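/* 16-tap horizontal-edge filter, 8 pixels wide. Same structure as the
 * 16-wide version but single-pass and without a scratch buffer: only the
 * right (low) halves are widened, and results are written back 8 bytes
 * at a time. */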
void ff_loop_filter_v_16_8_msa(uint8_t *src, ptrdiff_t pitch,
                               int32_t b_limit_ptr,
                               int32_t limit_ptr,
                               int32_t thresh_ptr)
{
    uint64_t p2_d, p1_d, p0_d, q0_d, q1_d, q2_d;
    uint64_t dword0, dword1;
    v16u8 flat2, mask, hev, flat, thresh, b_limit, limit;
    v16u8 p3, p2, p1, p0, q3, q2, q1, q0, p7, p6, p5, p4, q4, q5, q6, q7;
    v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
    v16u8 p0_filter16, p1_filter16;
    v8i16 p2_filter8, p1_filter8, p0_filter8;
    v8i16 q0_filter8, q1_filter8, q2_filter8;
    v8u16 p7_r, p6_r, p5_r, p4_r, q7_r, q6_r, q5_r, q4_r;
    v8u16 p3_r, p2_r, p1_r, p0_r, q3_r, q2_r, q1_r, q0_r;
    v16i8 zero = { 0 };
    v8u16 tmp0, tmp1, tmp2;

    /* load vector elements */
    LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);

    thresh = (v16u8) __msa_fill_b(thresh_ptr);
    b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
    limit = (v16u8) __msa_fill_b(limit_ptr);

    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
                 hev, mask, flat);
    VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
    VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
                       q1_out);

    flat = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) flat);

    /* if flat is zero for all pixels, then no need to calculate other filter */
    if (__msa_test_bz_v(flat)) {
        p1_d = __msa_copy_u_d((v2i64) p1_out, 0);
        p0_d = __msa_copy_u_d((v2i64) p0_out, 0);
        q0_d = __msa_copy_u_d((v2i64) q0_out, 0);
        q1_d = __msa_copy_u_d((v2i64) q1_out, 0);
        SD4(p1_d, p0_d, q0_d, q1_d, src - 2 * pitch, pitch);
    } else {
        /* convert 8 bit input data into 16 bit */
        ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero,
                   q1, zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r,
                   q1_r, q2_r, q3_r);
        VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r,
                    p2_filter8, p1_filter8, p0_filter8, q0_filter8,
                    q1_filter8, q2_filter8);

        /* convert 16 bit output data into 8 bit */
        PCKEV_B4_SH(zero, p2_filter8, zero, p1_filter8, zero, p0_filter8,
                    zero, q0_filter8, p2_filter8, p1_filter8, p0_filter8,
                    q0_filter8);
        PCKEV_B2_SH(zero, q1_filter8, zero, q2_filter8, q1_filter8,
                    q2_filter8);

        /* store pixel values */
        p2_out = __msa_bmnz_v(p2, (v16u8) p2_filter8, flat);
        p1_out = __msa_bmnz_v(p1_out, (v16u8) p1_filter8, flat);
        p0_out = __msa_bmnz_v(p0_out, (v16u8) p0_filter8, flat);
        q0_out = __msa_bmnz_v(q0_out, (v16u8) q0_filter8, flat);
        q1_out = __msa_bmnz_v(q1_out, (v16u8) q1_filter8, flat);
        q2_out = __msa_bmnz_v(q2, (v16u8) q2_filter8, flat);

        /* load 16 vector elements */
        LD_UB4((src - 8 * pitch), pitch, p7, p6, p5, p4);
        LD_UB4(src + (4 * pitch), pitch, q4, q5, q6, q7);

        VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);

        /* if flat2 is zero for all pixels, then no need to calculate other filter */
        if (__msa_test_bz_v(flat2)) {
            p2_d = __msa_copy_u_d((v2i64) p2_out, 0);
            p1_d = __msa_copy_u_d((v2i64) p1_out, 0);
            p0_d = __msa_copy_u_d((v2i64) p0_out, 0);
            q0_d = __msa_copy_u_d((v2i64) q0_out, 0);
            q1_d = __msa_copy_u_d((v2i64) q1_out, 0);
            q2_d = __msa_copy_u_d((v2i64) q2_out, 0);

            SD4(p2_d, p1_d, p0_d, q0_d, src - 3 * pitch, pitch);
            SD(q1_d, src + pitch);
            SD(q2_d, src + 2 * pitch);
        } else {
            /* LSB(right) 8 pixel operation */
            ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, q4,
                       zero, q5, zero, q6, zero, q7, p7_r, p6_r, p5_r, p4_r,
                       q4_r, q5_r, q6_r, q7_r);

            tmp0 = p7_r << 3;
            tmp0 -= p7_r;
            tmp0 += p6_r;
            tmp0 += q0_r;

            src -= 7 * pitch;

            /* calculation of p6 and p5 */
            tmp1 = p6_r + p5_r + p4_r + p3_r;
            tmp1 += (p2_r + p1_r + p0_r);
            tmp1 += tmp0;
            p0_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
            tmp0 = p5_r - p6_r + q1_r - p7_r;
            tmp1 += tmp0;
            p1_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
            PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16,
                        p0_filter16, p1_filter16);
            p0_filter16 = __msa_bmnz_v(p6, p0_filter16, flat2);
            p1_filter16 = __msa_bmnz_v(p5, p1_filter16, flat2);
            dword0 = __msa_copy_u_d((v2i64) p0_filter16, 0);
            dword1 = __msa_copy_u_d((v2i64) p1_filter16, 0);
            SD(dword0, src);
            src += pitch;
            SD(dword1, src);
            src += pitch;

            /* calculation of p4 and p3 */
            tmp0 = p4_r - p5_r + q2_r - p7_r;
            tmp2 = p3_r - p4_r + q3_r - p7_r;
            tmp1 += tmp0;
            p0_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
            tmp1 += tmp2;
            p1_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
            PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16,
                        p0_filter16, p1_filter16);
            p0_filter16 = __msa_bmnz_v(p4, p0_filter16, flat2);
            p1_filter16 = __msa_bmnz_v(p3, p1_filter16, flat2);
            dword0 = __msa_copy_u_d((v2i64) p0_filter16, 0);
            dword1 = __msa_copy_u_d((v2i64) p1_filter16, 0);
            SD(dword0, src);
            src += pitch;
            SD(dword1, src);
            src += pitch;

            /* calculation of p2 and p1 */
            tmp0 = p2_r - p3_r + q4_r - p7_r;
            tmp2 = p1_r - p2_r + q5_r - p7_r;
            tmp1 += tmp0;
            p0_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
            tmp1 += tmp2;
            p1_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
            PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16,
                        p0_filter16, p1_filter16);
            p0_filter16 = __msa_bmnz_v(p2_out, p0_filter16, flat2);
            p1_filter16 = __msa_bmnz_v(p1_out, p1_filter16, flat2);
            dword0 = __msa_copy_u_d((v2i64) p0_filter16, 0);
            dword1 = __msa_copy_u_d((v2i64) p1_filter16, 0);
            SD(dword0, src);
            src += pitch;
            SD(dword1, src);
            src += pitch;

            /* calculation of p0 and q0 */
            tmp0 = (p0_r - p1_r) + (q6_r - p7_r);
            tmp2 = (q7_r - p0_r) + (q0_r - p7_r);
            tmp1 += tmp0;
            p0_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
            tmp1 += tmp2;
            p1_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
            PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16,
                        p0_filter16, p1_filter16);
            p0_filter16 = __msa_bmnz_v(p0_out, p0_filter16, flat2);
            p1_filter16 = __msa_bmnz_v(q0_out, p1_filter16, flat2);
            dword0 = __msa_copy_u_d((v2i64) p0_filter16, 0);
            dword1 = __msa_copy_u_d((v2i64) p1_filter16, 0);
            SD(dword0, src);
            src += pitch;
            SD(dword1, src);
            src += pitch;

            /* calculation of q1 and q2 */
            tmp0 = q7_r - q0_r + q1_r - p6_r;
            tmp2 = q7_r - q1_r + q2_r - p5_r;
            tmp1 += tmp0;
            p0_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
            tmp1 += tmp2;
            p1_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
            PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16,
                        p0_filter16, p1_filter16);
            p0_filter16 = __msa_bmnz_v(q1_out, p0_filter16, flat2);
            p1_filter16 = __msa_bmnz_v(q2_out, p1_filter16, flat2);
            dword0 = __msa_copy_u_d((v2i64) p0_filter16, 0);
            dword1 = __msa_copy_u_d((v2i64) p1_filter16, 0);
            SD(dword0, src);
            src += pitch;
            SD(dword1, src);
            src += pitch;

            /* calculation of q3 and q4 */
            tmp0 = (q7_r - q2_r) + (q3_r - p4_r);
            tmp2 = (q7_r - q3_r) + (q4_r - p3_r);
            tmp1 += tmp0;
            p0_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
            tmp1 += tmp2;
            p1_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
            PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16,
                        p0_filter16, p1_filter16);
            p0_filter16 = __msa_bmnz_v(q3, p0_filter16, flat2);
            p1_filter16 = __msa_bmnz_v(q4, p1_filter16, flat2);
            dword0 = __msa_copy_u_d((v2i64) p0_filter16, 0);
            dword1 = __msa_copy_u_d((v2i64) p1_filter16, 0);
            SD(dword0, src);
            src += pitch;
            SD(dword1, src);
            src += pitch;

            /* calculation of q5 and q6 */
            tmp0 = (q7_r - q4_r) + (q5_r - p2_r);
            tmp2 = (q7_r - q5_r) + (q6_r - p1_r);
            tmp1 += tmp0;
            p0_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
            tmp1 += tmp2;
            p1_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
            PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16,
                        p0_filter16, p1_filter16);
            p0_filter16 = __msa_bmnz_v(q5, p0_filter16, flat2);
            p1_filter16 = __msa_bmnz_v(q6, p1_filter16, flat2);
            dword0 = __msa_copy_u_d((v2i64) p0_filter16, 0);
            dword1 = __msa_copy_u_d((v2i64) p1_filter16, 0);
            SD(dword0, src);
            src += pitch;
            SD(dword1, src);
        }
    }
}

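/* 4-tap loop filter across a vertical edge, 8 rows. The 8x8 block around
 * the edge is transposed so the row filter can be reused, then the four
 * filtered columns are transposed back via interleaves and stored as
 * 4-byte chunks. */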
void ff_loop_filter_h_4_8_msa(uint8_t *src, ptrdiff_t pitch,
                              int32_t b_limit_ptr,
                              int32_t limit_ptr,
                              int32_t thresh_ptr)
{
    v16u8 mask, hev, flat, limit, thresh, b_limit;
    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
    v8i16 vec0, vec1, vec2, vec3;

    LD_UB8((src - 4), pitch, p3, p2, p1, p0, q0, q1, q2, q3);

    thresh = (v16u8) __msa_fill_b(thresh_ptr);
    b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
    limit = (v16u8) __msa_fill_b(limit_ptr);

    TRANSPOSE8x8_UB_UB(p3, p2, p1, p0, q0, q1, q2, q3,
                       p3, p2, p1, p0, q0, q1, q2, q3);
    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
                 hev, mask, flat);
    VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1);
    ILVR_B2_SH(p0, p1, q1, q0, vec0, vec1);
    ILVRL_H2_SH(vec1, vec0, vec2, vec3);

    src -= 2;
    ST4x4_UB(vec2, vec2, 0, 1, 2, 3, src, pitch);
    src += 4 * pitch;
    ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch);
}

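/* Dual 4-tap vertical-edge filter over 16 rows, using a 16x8 transpose
 * and per-half parameters packed in bits 8-15 of the scalar arguments. */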
void ff_loop_filter_h_44_16_msa(uint8_t *src, ptrdiff_t pitch,
                                int32_t b_limit_ptr,
                                int32_t limit_ptr,
                                int32_t thresh_ptr)
{
    v16u8 mask, hev, flat;
    v16u8 thresh0, b_limit0, limit0, thresh1, b_limit1, limit1;
    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
    v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
    v16u8 row8, row9, row10, row11, row12, row13, row14, row15;
    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;

    LD_UB8(src - 4, pitch, row0, row1, row2, row3, row4, row5, row6, row7);
    LD_UB8(src - 4 + (8 * pitch), pitch,
           row8, row9, row10, row11, row12, row13, row14, row15);

    TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7,
                        row8, row9, row10, row11, row12, row13, row14, row15,
                        p3, p2, p1, p0, q0, q1, q2, q3);

    thresh0 = (v16u8) __msa_fill_b(thresh_ptr);
    thresh1 = (v16u8) __msa_fill_b(thresh_ptr >> 8);
    thresh0 = (v16u8) __msa_ilvr_d((v2i64) thresh1, (v2i64) thresh0);

    b_limit0 = (v16u8) __msa_fill_b(b_limit_ptr);
    b_limit1 = (v16u8) __msa_fill_b(b_limit_ptr >> 8);
    b_limit0 = (v16u8) __msa_ilvr_d((v2i64) b_limit1, (v2i64) b_limit0);

    limit0 = (v16u8) __msa_fill_b(limit_ptr);
    limit1 = (v16u8) __msa_fill_b(limit_ptr >> 8);
    limit0 = (v16u8) __msa_ilvr_d((v2i64) limit1, (v2i64) limit0);

    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0,
                 hev, mask, flat);
    VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1);
    ILVR_B2_SH(p0, p1, q1, q0, tmp0, tmp1);
    ILVRL_H2_SH(tmp1, tmp0, tmp2, tmp3);
    ILVL_B2_SH(p0, p1, q1, q0, tmp0, tmp1);
    ILVRL_H2_SH(tmp1, tmp0, tmp4, tmp5);

    src -= 2;

    ST4x8_UB(tmp2, tmp3, src, pitch);
    src += (8 * pitch);
    ST4x8_UB(tmp4, tmp5, src, pitch);
}

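/* 8-tap vertical-edge filter over 8 rows: transpose, filter as in the
 * horizontal-edge case, then scatter the six output columns back with
 * 4-byte and 2-byte stores. */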
void ff_loop_filter_h_8_8_msa(uint8_t *src, ptrdiff_t pitch,
                              int32_t b_limit_ptr,
                              int32_t limit_ptr,
                              int32_t thresh_ptr)
{
    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
    v16u8 p1_out, p0_out, q0_out, q1_out;
    v16u8 flat, mask, hev, thresh, b_limit, limit;
    v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
    v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r;
    v8i16 q0_filt8_r, q1_filt8_r, q2_filt8_r;
    v16u8 zero = { 0 };
    v8i16 vec0, vec1, vec2, vec3, vec4;

    /* load vector elements */
    LD_UB8(src - 4, pitch, p3, p2, p1, p0, q0, q1, q2, q3);

    TRANSPOSE8x8_UB_UB(p3, p2, p1, p0, q0, q1, q2, q3,
                       p3, p2, p1, p0, q0, q1, q2, q3);

    thresh = (v16u8) __msa_fill_b(thresh_ptr);
    b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
    limit = (v16u8) __msa_fill_b(limit_ptr);

    /* mask and hev */
    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
                 hev, mask, flat);
    /* flat4 */
    VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
    /* filter4 */
    VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
                       q1_out);

    flat = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) flat);

    /* if flat is zero for all pixels, then no need to calculate other filter */
    if (__msa_test_bz_v(flat)) {
        /* store 4 pixels p1 - q1 */
        ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
        ILVRL_H2_SH(vec1, vec0, vec2, vec3);

        src -= 2;
        ST4x4_UB(vec2, vec2, 0, 1, 2, 3, src, pitch);
        src += 4 * pitch;
        ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch);
    } else {
        ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
                   zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r,
                   q3_r);
        VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
                    p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
        /* convert 16 bit output data into 8 bit */
        PCKEV_B4_SH(p2_filt8_r, p2_filt8_r, p1_filt8_r, p1_filt8_r, p0_filt8_r,
                    p0_filt8_r, q0_filt8_r, q0_filt8_r, p2_filt8_r, p1_filt8_r,
                    p0_filt8_r, q0_filt8_r);
        PCKEV_B2_SH(q1_filt8_r, q1_filt8_r, q2_filt8_r, q2_filt8_r, q1_filt8_r,
                    q2_filt8_r);

        /* store pixel values */
        p2 = __msa_bmnz_v(p2, (v16u8) p2_filt8_r, flat);
        p1 = __msa_bmnz_v(p1_out, (v16u8) p1_filt8_r, flat);
        p0 = __msa_bmnz_v(p0_out, (v16u8) p0_filt8_r, flat);
        q0 = __msa_bmnz_v(q0_out, (v16u8) q0_filt8_r, flat);
        q1 = __msa_bmnz_v(q1_out, (v16u8) q1_filt8_r, flat);
        q2 = __msa_bmnz_v(q2, (v16u8) q2_filt8_r, flat);

        /* store 6 pixels p2 - q2 */
        ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1);
        ILVRL_H2_SH(vec1, vec0, vec2, vec3);
        vec4 = (v8i16) __msa_ilvr_b((v16i8) q2, (v16i8) q1);

        src -= 3;
        ST4x4_UB(vec2, vec2, 0, 1, 2, 3, src, pitch);
        ST2x4_UB(vec4, 0, src + 4, pitch);
        src += (4 * pitch);
        ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch);
        ST2x4_UB(vec4, 4, src + 4, pitch);
    }
}

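/* Dual 8-tap vertical-edge filter over 16 rows, with per-half parameters
 * packed as in the 88 horizontal-edge variant. */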
void ff_loop_filter_h_88_16_msa(uint8_t *src, ptrdiff_t pitch,
                                int32_t b_limit_ptr,
                                int32_t limit_ptr,
                                int32_t thresh_ptr)
{
    uint8_t *temp_src;
    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
    v16u8 p1_out, p0_out, q0_out, q1_out;
    v16u8 flat, mask, hev, thresh, b_limit, limit;
    v16u8 row4, row5, row6, row7, row12, row13, row14, row15;
    v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
    v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
    v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r;
    v8i16 q0_filt8_r, q1_filt8_r, q2_filt8_r;
    v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l;
    v8i16 q0_filt8_l, q1_filt8_l, q2_filt8_l;
    v16u8 zero = { 0 };
    v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;

    temp_src = src - 4;

    LD_UB8(temp_src, pitch, p0, p1, p2, p3, row4, row5, row6, row7);
    temp_src += (8 * pitch);
    LD_UB8(temp_src, pitch, q3, q2, q1, q0, row12, row13, row14, row15);

    /* transpose 16x8 matrix into 8x16 */
    TRANSPOSE16x8_UB_UB(p0, p1, p2, p3, row4, row5, row6, row7,
                        q3, q2, q1, q0, row12, row13, row14, row15,
                        p3, p2, p1, p0, q0, q1, q2, q3);

    thresh = (v16u8) __msa_fill_b(thresh_ptr);
    vec0 = (v8i16) __msa_fill_b(thresh_ptr >> 8);
    thresh = (v16u8) __msa_ilvr_d((v2i64) vec0, (v2i64) thresh);

    b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
    vec0 = (v8i16) __msa_fill_b(b_limit_ptr >> 8);
    b_limit = (v16u8) __msa_ilvr_d((v2i64) vec0, (v2i64) b_limit);

    limit = (v16u8) __msa_fill_b(limit_ptr);
    vec0 = (v8i16) __msa_fill_b(limit_ptr >> 8);
    limit = (v16u8) __msa_ilvr_d((v2i64) vec0, (v2i64) limit);

    /* mask and hev */
    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
                 hev, mask, flat);
    /* flat4 */
    VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
    /* filter4 */
    VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
                       q1_out);

    /* if flat is zero for all pixels, then no need to calculate other filter */
    if (__msa_test_bz_v(flat)) {
        ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
        ILVRL_H2_SH(vec1, vec0, vec2, vec3);
        ILVL_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
        ILVRL_H2_SH(vec1, vec0, vec4, vec5);

        src -= 2;
        ST4x8_UB(vec2, vec3, src, pitch);
        src += 8 * pitch;
        ST4x8_UB(vec4, vec5, src, pitch);
    } else {
        ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
                   zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r,
                   q3_r);
        VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
                    p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);

        ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l,
                   p0_l);
        ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l,
                   q3_l);

        /* filter8 */
        VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
                    p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);

        /* convert 16 bit output data into 8 bit */
        PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l,
                    p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r,
                    p0_filt8_r, q0_filt8_r);
        PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r, q1_filt8_r,
                    q2_filt8_r);

        /* store pixel values */
        p2 = __msa_bmnz_v(p2, (v16u8) p2_filt8_r, flat);
        p1 = __msa_bmnz_v(p1_out, (v16u8) p1_filt8_r, flat);
        p0 = __msa_bmnz_v(p0_out, (v16u8) p0_filt8_r, flat);
        q0 = __msa_bmnz_v(q0_out, (v16u8) q0_filt8_r, flat);
        q1 = __msa_bmnz_v(q1_out, (v16u8) q1_filt8_r, flat);
        q2 = __msa_bmnz_v(q2, (v16u8) q2_filt8_r, flat);

        ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1);
        ILVRL_H2_SH(vec1, vec0, vec3, vec4);
        ILVL_B2_SH(p1, p2, q0, p0, vec0, vec1);
        ILVRL_H2_SH(vec1, vec0, vec6, vec7);
        ILVRL_B2_SH(q2, q1, vec2, vec5);

        src -= 3;
        ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch);
        ST2x4_UB(vec2, 0, src + 4, pitch);
        src += (4 * pitch);
        ST4x4_UB(vec4, vec4, 0, 1, 2, 3, src, pitch);
        ST2x4_UB(vec2, 4, src + 4, pitch);
        src += (4 * pitch);
        ST4x4_UB(vec6, vec6, 0, 1, 2, 3, src, pitch);
        ST2x4_UB(vec5, 0, src + 4, pitch);
        src += (4 * pitch);
        ST4x4_UB(vec7, vec7, 0, 1, 2, 3, src, pitch);
        ST2x4_UB(vec5, 4, src + 4, pitch);
    }
}

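/* Mixed vertical-edge filter over 16 rows: 8-tap on the first 8 rows,
 * 4-tap on the remaining 8 (the flat mask's upper half is cleared). */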
1468 void ff_loop_filter_h_84_16_msa(uint8_t *src, ptrdiff_t pitch,
1469  int32_t b_limit_ptr,
1470  int32_t limit_ptr,
1471  int32_t thresh_ptr)
1472 {
1473  uint8_t *temp_src;
1474  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
1475  v16u8 p1_out, p0_out, q0_out, q1_out;
1476  v16u8 flat, mask, hev, thresh, b_limit, limit;
1477  v16u8 row4, row5, row6, row7, row12, row13, row14, row15;
1478  v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
1479  v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r;
1480  v8i16 q0_filt8_r, q1_filt8_r, q2_filt8_r;
1481  v16u8 zero = { 0 };
1482  v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1483 
1484  temp_src = src - 4;
1485 
1486  LD_UB8(temp_src, pitch, p0, p1, p2, p3, row4, row5, row6, row7);
1487  temp_src += (8 * pitch);
1488  LD_UB8(temp_src, pitch, q3, q2, q1, q0, row12, row13, row14, row15);
1489 
1490  /* transpose 16x8 matrix into 8x16 */
1491  TRANSPOSE16x8_UB_UB(p0, p1, p2, p3, row4, row5, row6, row7,
1492  q3, q2, q1, q0, row12, row13, row14, row15,
1493  p3, p2, p1, p0, q0, q1, q2, q3);
1494 
1495  thresh = (v16u8) __msa_fill_b(thresh_ptr);
1496  vec0 = (v8i16) __msa_fill_b(thresh_ptr >> 8);
1497  thresh = (v16u8) __msa_ilvr_d((v2i64) vec0, (v2i64) thresh);
1498 
1499  b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
1500  vec0 = (v8i16) __msa_fill_b(b_limit_ptr >> 8);
1501  b_limit = (v16u8) __msa_ilvr_d((v2i64) vec0, (v2i64) b_limit);
1502 
1503  limit = (v16u8) __msa_fill_b(limit_ptr);
1504  vec0 = (v8i16) __msa_fill_b(limit_ptr >> 8);
1505  limit = (v16u8) __msa_ilvr_d((v2i64) vec0, (v2i64) limit);
1506 
1507  /* mask and hev */
1508  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
1509  hev, mask, flat);
1510  /* flat4 */
1511  VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
1512  /* filter4 */
1513  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
1514  q1_out);
1515 
1516  flat = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) flat);
1517 
1518  /* if flat is zero for all pixels, then no need to calculate other filter */
1519  if (__msa_test_bz_v(flat)) {
1520  ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
1521  ILVRL_H2_SH(vec1, vec0, vec2, vec3);
1522  ILVL_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
1523  ILVRL_H2_SH(vec1, vec0, vec4, vec5);
1524 
1525  src -= 2;
1526  ST4x8_UB(vec2, vec3, src, pitch);
1527  src += 8 * pitch;
1528  ST4x8_UB(vec4, vec5, src, pitch);
1529  } else {
1530  ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
1531  zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r,
1532  q3_r);
1533  VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
1534  p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
1535 
1536  /* convert 16 bit output data into 8 bit */
1537  PCKEV_B4_SH(p2_filt8_r, p2_filt8_r, p1_filt8_r, p1_filt8_r,
1538  p0_filt8_r, p0_filt8_r, q0_filt8_r, q0_filt8_r,
1539  p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r);
1540  PCKEV_B2_SH(q1_filt8_r, q1_filt8_r, q2_filt8_r, q2_filt8_r,
1541  q1_filt8_r, q2_filt8_r);
1542 
1543  /* store pixel values */
1544  p2 = __msa_bmnz_v(p2, (v16u8) p2_filt8_r, flat);
1545  p1 = __msa_bmnz_v(p1_out, (v16u8) p1_filt8_r, flat);
1546  p0 = __msa_bmnz_v(p0_out, (v16u8) p0_filt8_r, flat);
1547  q0 = __msa_bmnz_v(q0_out, (v16u8) q0_filt8_r, flat);
1548  q1 = __msa_bmnz_v(q1_out, (v16u8) q1_filt8_r, flat);
1549  q2 = __msa_bmnz_v(q2, (v16u8) q2_filt8_r, flat);
1550 
1551  ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1);
1552  ILVRL_H2_SH(vec1, vec0, vec3, vec4);
1553  ILVL_B2_SH(p1, p2, q0, p0, vec0, vec1);
1554  ILVRL_H2_SH(vec1, vec0, vec6, vec7);
1555  ILVRL_B2_SH(q2, q1, vec2, vec5);
1556 
1557  src -= 3;
1558  ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch);
1559  ST2x4_UB(vec2, 0, src + 4, pitch);
1560  src += (4 * pitch);
1561  ST4x4_UB(vec4, vec4, 0, 1, 2, 3, src, pitch);
1562  ST2x4_UB(vec2, 4, src + 4, pitch);
1563  src += (4 * pitch);
1564  ST4x4_UB(vec6, vec6, 0, 1, 2, 3, src, pitch);
1565  ST2x4_UB(vec5, 0, src + 4, pitch);
1566  src += (4 * pitch);
1567  ST4x4_UB(vec7, vec7, 0, 1, 2, 3, src, pitch);
1568  ST2x4_UB(vec5, 4, src + 4, pitch);
1569  }
1570 }
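/*
 * NOTE (editorial): in the "84"/"48" variants each 32-bit *_ptr argument
 * packs two parameters: byte 0 applies to the first 8 transposed rows and
 * byte 1 to the remaining 8.  The __msa_fill_b()/__msa_ilvr_d() sequence
 * above builds that split vector.  A scalar sketch of the packing
 * (illustrative helper, not part of this file; assumes <stdint.h>):
 */
static void lpf_pack_dual_param(uint8_t vec[16], int32_t param)
{
    int i;

    for (i = 0; i < 8; i++)
        vec[i] = (uint8_t) (param & 0xff);        /* lanes 0..7  */
    for (i = 8; i < 16; i++)
        vec[i] = (uint8_t) ((param >> 8) & 0xff); /* lanes 8..15 */
}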
1571 
1572 void ff_loop_filter_h_48_16_msa(uint8_t *src, ptrdiff_t pitch,
1573  int32_t b_limit_ptr,
1574  int32_t limit_ptr,
1575  int32_t thresh_ptr)
1576 {
1577  uint8_t *temp_src;
1578  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
1579  v16u8 p1_out, p0_out, q0_out, q1_out;
1580  v16u8 flat, mask, hev, thresh, b_limit, limit;
1581  v16u8 row4, row5, row6, row7, row12, row13, row14, row15;
1582  v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
1583  v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l;
1584  v8i16 q0_filt8_l, q1_filt8_l, q2_filt8_l;
1585  v16u8 zero = { 0 };
1586  v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1587 
1588  temp_src = src - 4;
1589 
1590  LD_UB8(temp_src, pitch, p0, p1, p2, p3, row4, row5, row6, row7);
1591  temp_src += (8 * pitch);
1592  LD_UB8(temp_src, pitch, q3, q2, q1, q0, row12, row13, row14, row15);
1593 
1594  /* transpose 16x8 matrix into 8x16 */
1595  TRANSPOSE16x8_UB_UB(p0, p1, p2, p3, row4, row5, row6, row7,
1596  q3, q2, q1, q0, row12, row13, row14, row15,
1597  p3, p2, p1, p0, q0, q1, q2, q3);
1598 
1599  thresh = (v16u8) __msa_fill_b(thresh_ptr);
1600  vec0 = (v8i16) __msa_fill_b(thresh_ptr >> 8);
1601  thresh = (v16u8) __msa_ilvr_d((v2i64) vec0, (v2i64) thresh);
1602 
1603  b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
1604  vec0 = (v8i16) __msa_fill_b(b_limit_ptr >> 8);
1605  b_limit = (v16u8) __msa_ilvr_d((v2i64) vec0, (v2i64) b_limit);
1606 
1607  limit = (v16u8) __msa_fill_b(limit_ptr);
1608  vec0 = (v8i16) __msa_fill_b(limit_ptr >> 8);
1609  limit = (v16u8) __msa_ilvr_d((v2i64) vec0, (v2i64) limit);
1610 
1611  /* mask and hev */
1612  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
1613  hev, mask, flat);
1614  /* flat4 */
1615  VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
1616  /* filter4 */
1617  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
1618  q1_out);
1619 
1620  flat = (v16u8) __msa_insve_d((v2i64) flat, 0, (v2i64) zero);
1621 
1622  /* if flat is zero for all pixels, then there is no need to apply the wider filter */
1623  if (__msa_test_bz_v(flat)) {
1624  ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
1625  ILVRL_H2_SH(vec1, vec0, vec2, vec3);
1626  ILVL_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
1627  ILVRL_H2_SH(vec1, vec0, vec4, vec5);
1628 
1629  src -= 2;
1630  ST4x8_UB(vec2, vec3, src, pitch);
1631  src += 8 * pitch;
1632  ST4x8_UB(vec4, vec5, src, pitch);
1633  } else {
1634  ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l,
1635  p0_l);
1636  ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l,
1637  q3_l);
1638 
1639  VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
1640  p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
1641 
1642  /* convert 16 bit output data into 8 bit */
1643  PCKEV_B4_SH(p2_filt8_l, p2_filt8_l, p1_filt8_l, p1_filt8_l,
1644  p0_filt8_l, p0_filt8_l, q0_filt8_l, q0_filt8_l,
1645  p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l);
1646  PCKEV_B2_SH(q1_filt8_l, q1_filt8_l, q2_filt8_l, q2_filt8_l,
1647  q1_filt8_l, q2_filt8_l);
1648 
1649  /* store pixel values */
1650  p2 = __msa_bmnz_v(p2, (v16u8) p2_filt8_l, flat);
1651  p1 = __msa_bmnz_v(p1_out, (v16u8) p1_filt8_l, flat);
1652  p0 = __msa_bmnz_v(p0_out, (v16u8) p0_filt8_l, flat);
1653  q0 = __msa_bmnz_v(q0_out, (v16u8) q0_filt8_l, flat);
1654  q1 = __msa_bmnz_v(q1_out, (v16u8) q1_filt8_l, flat);
1655  q2 = __msa_bmnz_v(q2, (v16u8) q2_filt8_l, flat);
1656 
1657  ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1);
1658  ILVRL_H2_SH(vec1, vec0, vec3, vec4);
1659  ILVL_B2_SH(p1, p2, q0, p0, vec0, vec1);
1660  ILVRL_H2_SH(vec1, vec0, vec6, vec7);
1661  ILVRL_B2_SH(q2, q1, vec2, vec5);
1662 
1663  src -= 3;
1664  ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch);
1665  ST2x4_UB(vec2, 0, src + 4, pitch);
1666  src += (4 * pitch);
1667  ST4x4_UB(vec4, vec4, 0, 1, 2, 3, src, pitch);
1668  ST2x4_UB(vec2, 4, src + 4, pitch);
1669  src += (4 * pitch);
1670  ST4x4_UB(vec6, vec6, 0, 1, 2, 3, src, pitch);
1671  ST2x4_UB(vec5, 0, src + 4, pitch);
1672  src += (4 * pitch);
1673  ST4x4_UB(vec7, vec7, 0, 1, 2, 3, src, pitch);
1674  ST2x4_UB(vec5, 4, src + 4, pitch);
1675  }
1676 }
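/*
 * NOTE (editorial): the "48" variant above differs from "84" only in which
 * half of the flat mask survives: __msa_insve_d(flat, 0, zero) clears
 * lanes 0..7 (the first edge stays on the 4-tap path), whereas "84" used
 * __msa_ilvr_d(zero, flat) to clear lanes 8..15 instead.  Accordingly the
 * filter8 path here works on the left (ILVL) byte halves of the vectors.
 */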
1677 
1678 static void vp9_transpose_16x8_to_8x16(uint8_t *input, int32_t in_pitch,
1679  uint8_t *output, int32_t out_pitch)
1680 {
1681  v16u8 p7_org, p6_org, p5_org, p4_org, p3_org, p2_org, p1_org, p0_org;
1682  v16i8 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
1683  v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
1684 
1685  LD_UB8(input, in_pitch,
1686  p7_org, p6_org, p5_org, p4_org, p3_org, p2_org, p1_org, p0_org);
1687  /* 8x8 transpose */
1688  TRANSPOSE8x8_UB_UB(p7_org, p6_org, p5_org, p4_org, p3_org, p2_org, p1_org,
1689  p0_org, p7, p6, p5, p4, p3, p2, p1, p0);
1690  /* 8x8 transpose */
1691  ILVL_B4_SB(p5_org, p7_org, p4_org, p6_org, p1_org, p3_org, p0_org, p2_org,
1692  tmp0, tmp1, tmp2, tmp3);
1693  ILVR_B2_SB(tmp1, tmp0, tmp3, tmp2, tmp4, tmp6);
1694  ILVL_B2_SB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp7);
1695  ILVR_W2_UB(tmp6, tmp4, tmp7, tmp5, q0, q4);
1696  ILVL_W2_UB(tmp6, tmp4, tmp7, tmp5, q2, q6);
1697  SLDI_B4_0_UB(q0, q2, q4, q6, q1, q3, q5, q7, 8);
1698 
1699  ST_UB8(p7, p6, p5, p4, p3, p2, p1, p0, output, out_pitch);
1700  output += (8 * out_pitch);
1701  ST_UB8(q0, q1, q2, q3, q4, q5, q6, q7, output, out_pitch);
1702 }
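/*
 * NOTE (editorial): scalar model of the shuffles above (illustrative
 * helper, not part of this file).  Only the first 8 bytes of each output
 * row carry transposed data; the remainder of each stored vector is
 * don't-care padding.
 */
static void transpose_16x8_to_8x16_ref(const uint8_t *in, int32_t in_pitch,
                                       uint8_t *out, int32_t out_pitch)
{
    int r, c;

    for (r = 0; r < 8; r++)           /* 8 input rows     */
        for (c = 0; c < 16; c++)      /* 16 input columns */
            out[c * out_pitch + r] = in[r * in_pitch + c];
}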
1703 
1704 static void vp9_transpose_8x16_to_16x8(uint8_t *input, int32_t in_pitch,
1705  uint8_t *output, int32_t out_pitch)
1706 {
1707  v16u8 p7_o, p6_o, p5_o, p4_o, p3_o, p2_o, p1_o, p0_o;
1708  v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
1709 
1710  LD_UB8(input, in_pitch, p7, p6, p5, p4, p3, p2, p1, p0);
1711  LD_UB8(input + (8 * in_pitch), in_pitch, q0, q1, q2, q3, q4, q5, q6, q7);
1712  TRANSPOSE16x8_UB_UB(p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5,
1713  q6, q7, p7_o, p6_o, p5_o, p4_o, p3_o, p2_o, p1_o, p0_o);
1714  ST_UB8(p7_o, p6_o, p5_o, p4_o, p3_o, p2_o, p1_o, p0_o, output, out_pitch);
1715 }
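/*
 * NOTE (editorial): scalar model of the inverse write-back (illustrative
 * helper, not part of this file): the 8 meaningful columns of the 8x16
 * scratch block return to 8 rows of 16 pixels.
 */
static void transpose_8x16_to_16x8_ref(const uint8_t *in, int32_t in_pitch,
                                       uint8_t *out, int32_t out_pitch)
{
    int r, c;

    for (r = 0; r < 16; r++)          /* 16 input rows        */
        for (c = 0; c < 8; c++)       /* 8 meaningful columns */
            out[c * out_pitch + r] = in[r * in_pitch + c];
}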
1716 
1717 static void vp9_transpose_16x16(uint8_t *input, int32_t in_pitch,
1718  uint8_t *output, int32_t out_pitch)
1719 {
1720  v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
1721  v16u8 row8, row9, row10, row11, row12, row13, row14, row15;
1722  v8i16 tmp0, tmp1, tmp4, tmp5, tmp6, tmp7;
1723  v4i32 tmp2, tmp3;
1724  v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
1725 
1726  LD_UB8(input, in_pitch, row0, row1, row2, row3, row4, row5, row6, row7);
1727  input += (8 * in_pitch);
1728  LD_UB8(input, in_pitch,
1729  row8, row9, row10, row11, row12, row13, row14, row15);
1730 
1731  TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7,
1732  row8, row9, row10, row11, row12, row13, row14, row15,
1733  p7, p6, p5, p4, p3, p2, p1, p0);
1734 
1735  /* transpose 16x8 matrix into 8x16 */
1736  /* total: 8 intermediate registers and 32 instructions */
1737  q7 = (v16u8) __msa_ilvod_d((v2i64) row8, (v2i64) row0);
1738  q6 = (v16u8) __msa_ilvod_d((v2i64) row9, (v2i64) row1);
1739  q5 = (v16u8) __msa_ilvod_d((v2i64) row10, (v2i64) row2);
1740  q4 = (v16u8) __msa_ilvod_d((v2i64) row11, (v2i64) row3);
1741  q3 = (v16u8) __msa_ilvod_d((v2i64) row12, (v2i64) row4);
1742  q2 = (v16u8) __msa_ilvod_d((v2i64) row13, (v2i64) row5);
1743  q1 = (v16u8) __msa_ilvod_d((v2i64) row14, (v2i64) row6);
1744  q0 = (v16u8) __msa_ilvod_d((v2i64) row15, (v2i64) row7);
1745 
1746  ILVEV_B2_SH(q7, q6, q5, q4, tmp0, tmp1);
1747  tmp4 = (v8i16) __msa_ilvod_b((v16i8) q6, (v16i8) q7);
1748  tmp5 = (v8i16) __msa_ilvod_b((v16i8) q4, (v16i8) q5);
1749 
1750  ILVEV_B2_UB(q3, q2, q1, q0, q5, q7);
1751  tmp6 = (v8i16) __msa_ilvod_b((v16i8) q2, (v16i8) q3);
1752  tmp7 = (v8i16) __msa_ilvod_b((v16i8) q0, (v16i8) q1);
1753 
1754  ILVEV_H2_SW(tmp0, tmp1, q5, q7, tmp2, tmp3);
1755  q0 = (v16u8) __msa_ilvev_w(tmp3, tmp2);
1756  q4 = (v16u8) __msa_ilvod_w(tmp3, tmp2);
1757 
1758  tmp2 = (v4i32) __msa_ilvod_h(tmp1, tmp0);
1759  tmp3 = (v4i32) __msa_ilvod_h((v8i16) q7, (v8i16) q5);
1760  q2 = (v16u8) __msa_ilvev_w(tmp3, tmp2);
1761  q6 = (v16u8) __msa_ilvod_w(tmp3, tmp2);
1762 
1763  ILVEV_H2_SW(tmp4, tmp5, tmp6, tmp7, tmp2, tmp3);
1764  q1 = (v16u8) __msa_ilvev_w(tmp3, tmp2);
1765  q5 = (v16u8) __msa_ilvod_w(tmp3, tmp2);
1766 
1767  tmp2 = (v4i32) __msa_ilvod_h(tmp5, tmp4);
1768  tmp3 = (v4i32) __msa_ilvod_h(tmp7, tmp6);
1769  q3 = (v16u8) __msa_ilvev_w(tmp3, tmp2);
1770  q7 = (v16u8) __msa_ilvod_w(tmp3, tmp2);
1771 
1772  ST_UB8(p7, p6, p5, p4, p3, p2, p1, p0, output, out_pitch);
1773  output += (8 * out_pitch);
1774  ST_UB8(q0, q1, q2, q3, q4, q5, q6, q7, output, out_pitch);
1775 }
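/*
 * NOTE (editorial): scalar model of the 16x16 in-register transpose above
 * (illustrative helper, not part of this file).  Because a square
 * transpose is its own inverse, the same routine serves for both the load
 * and the write-back in ff_loop_filter_h_16_16_msa().
 */
static void transpose_16x16_ref(const uint8_t *in, int32_t in_pitch,
                                uint8_t *out, int32_t out_pitch)
{
    int r, c;

    for (r = 0; r < 16; r++)
        for (c = 0; c < 16; c++)
            out[c * out_pitch + r] = in[r * in_pitch + c];
}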
1776 
1777 static int32_t vp9_vt_lpf_t4_and_t8_8w(uint8_t *src, uint8_t *filter48,
1778  uint8_t *src_org, int32_t pitch_org,
1779  int32_t b_limit_ptr,
1780  int32_t limit_ptr,
1781  int32_t thresh_ptr)
1782 {
1783  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
1784  v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
1785  v16u8 flat, mask, hev, thresh, b_limit, limit;
1786  v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
1787  v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r;
1788  v8i16 q0_filt8_r, q1_filt8_r, q2_filt8_r;
1789  v16i8 zero = { 0 };
1790  v8i16 vec0, vec1, vec2, vec3;
1791 
1792  /* load vector elements */
1793  LD_UB8(src - (4 * 16), 16, p3, p2, p1, p0, q0, q1, q2, q3);
1794 
1795  thresh = (v16u8) __msa_fill_b(thresh_ptr);
1796  b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
1797  limit = (v16u8) __msa_fill_b(limit_ptr);
1798 
1799  /* mask and hev */
1800  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
1801  hev, mask, flat);
1802  /* flat4 */
1803  VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
1804  /* filter4 */
1805  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
1806  q1_out);
1807 
1808  flat = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) flat);
1809 
1810  /* if flat is zero for all pixels, then there is no need to apply the wider filter */
1811  if (__msa_test_bz_v(flat)) {
1812  ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
1813  ILVRL_H2_SH(vec1, vec0, vec2, vec3);
1814  ST4x8_UB(vec2, vec3, (src_org - 2), pitch_org);
1815  return 1;
1816  } else {
1817  ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
1818  zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r,
1819  q3_r);
1820  VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
1821  p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
1822 
1823  /* convert 16 bit output data into 8 bit */
1824  p2_r = (v8u16) __msa_pckev_b((v16i8) p2_filt8_r, (v16i8) p2_filt8_r);
1825  p1_r = (v8u16) __msa_pckev_b((v16i8) p1_filt8_r, (v16i8) p1_filt8_r);
1826  p0_r = (v8u16) __msa_pckev_b((v16i8) p0_filt8_r, (v16i8) p0_filt8_r);
1827  q0_r = (v8u16) __msa_pckev_b((v16i8) q0_filt8_r, (v16i8) q0_filt8_r);
1828  q1_r = (v8u16) __msa_pckev_b((v16i8) q1_filt8_r, (v16i8) q1_filt8_r);
1829  q2_r = (v8u16) __msa_pckev_b((v16i8) q2_filt8_r, (v16i8) q2_filt8_r);
1830 
1831  /* store pixel values */
1832  p2_out = __msa_bmnz_v(p2, (v16u8) p2_r, flat);
1833  p1_out = __msa_bmnz_v(p1_out, (v16u8) p1_r, flat);
1834  p0_out = __msa_bmnz_v(p0_out, (v16u8) p0_r, flat);
1835  q0_out = __msa_bmnz_v(q0_out, (v16u8) q0_r, flat);
1836  q1_out = __msa_bmnz_v(q1_out, (v16u8) q1_r, flat);
1837  q2_out = __msa_bmnz_v(q2, (v16u8) q2_r, flat);
1838 
1839  ST_UB4(p2_out, p1_out, p0_out, q0_out, filter48, 16);
1840  filter48 += (4 * 16);
1841  ST_UB2(q1_out, q2_out, filter48, 16);
1842  filter48 += (2 * 16);
1843  ST_UB(flat, filter48);
1844 
1845  return 0;
1846  }
1847 }
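/*
 * NOTE (editorial): the filter48 scratch written above is seven 16-byte
 * slots, read back by vp9_vt_lpf_t16_8w():
 *
 *   filter48 + 0 * 16 .. 5 * 16 : filter8 results p2, p1, p0, q0, q1, q2
 *   filter48 + 6 * 16           : flat (per-pixel filter8/filter4 select)
 */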
1848 
1849 static int32_t vp9_vt_lpf_t16_8w(uint8_t *src, uint8_t *src_org, ptrdiff_t pitch,
1850  uint8_t *filter48)
1851 {
1852  v16i8 zero = { 0 };
1853  v16u8 filter8, flat, flat2;
1854  v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
1855  v8u16 p7_r_in, p6_r_in, p5_r_in, p4_r_in;
1856  v8u16 p3_r_in, p2_r_in, p1_r_in, p0_r_in;
1857  v8u16 q7_r_in, q6_r_in, q5_r_in, q4_r_in;
1858  v8u16 q3_r_in, q2_r_in, q1_r_in, q0_r_in;
1859  v8u16 tmp0_r, tmp1_r;
1860  v8i16 r_out;
1861 
1862  flat = LD_UB(filter48 + 6 * 16);
1863 
1864  LD_UB8((src - 8 * 16), 16, p7, p6, p5, p4, p3, p2, p1, p0);
1865  LD_UB8(src, 16, q0, q1, q2, q3, q4, q5, q6, q7);
1866 
1867  VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);
1868 
1869  /* if flat2 is zero for all pixels, then there is no need to apply the wider filter */
1870  if (__msa_test_bz_v(flat2)) {
1871  v8i16 vec0, vec1, vec2, vec3, vec4;
1872 
1873  LD_UB4(filter48, 16, p2, p1, p0, q0);
1874  LD_UB2(filter48 + 4 * 16, 16, q1, q2);
1875 
1876  ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1);
1877  ILVRL_H2_SH(vec1, vec0, vec3, vec4);
1878  vec2 = (v8i16) __msa_ilvr_b((v16i8) q2, (v16i8) q1);
1879 
1880  src_org -= 3;
1881  ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src_org, pitch);
1882  ST2x4_UB(vec2, 0, (src_org + 4), pitch);
1883  src_org += (4 * pitch);
1884  ST4x4_UB(vec4, vec4, 0, 1, 2, 3, src_org, pitch);
1885  ST2x4_UB(vec2, 4, (src_org + 4), pitch);
1886 
1887  return 1;
1888  } else {
1889  src -= 7 * 16;
1890 
1891  ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2,
1892  zero, p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in,
1893  p3_r_in, p2_r_in, p1_r_in, p0_r_in);
1894  q0_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q0);
1895 
1896  tmp0_r = p7_r_in << 3;
1897  tmp0_r -= p7_r_in;
1898  tmp0_r += p6_r_in;
1899  tmp0_r += q0_r_in;
1900  tmp1_r = p6_r_in + p5_r_in;
1901  tmp1_r += p4_r_in;
1902  tmp1_r += p3_r_in;
1903  tmp1_r += p2_r_in;
1904  tmp1_r += p1_r_in;
1905  tmp1_r += p0_r_in;
1906  tmp1_r += tmp0_r;
1907 
1908  r_out = __msa_srari_h((v8i16) tmp1_r, 4);
1909  r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
1910  p6 = __msa_bmnz_v(p6, (v16u8) r_out, flat2);
1911  ST8x1_UB(p6, src);
1912  src += 16;
1913 
1914  /* p5 */
1915  q1_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q1);
1916  tmp0_r = p5_r_in - p6_r_in;
1917  tmp0_r += q1_r_in;
1918  tmp0_r -= p7_r_in;
1919  tmp1_r += tmp0_r;
1920  r_out = __msa_srari_h((v8i16) tmp1_r, 4);
1921  r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
1922  p5 = __msa_bmnz_v(p5, (v16u8) r_out, flat2);
1923  ST8x1_UB(p5, src);
1924  src += 16;
1925 
1926  /* p4 */
1927  q2_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q2);
1928  tmp0_r = p4_r_in - p5_r_in;
1929  tmp0_r += q2_r_in;
1930  tmp0_r -= p7_r_in;
1931  tmp1_r += tmp0_r;
1932  r_out = __msa_srari_h((v8i16) tmp1_r, 4);
1933  r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
1934  p4 = __msa_bmnz_v(p4, (v16u8) r_out, flat2);
1935  ST8x1_UB(p4, src);
1936  src += 16;
1937 
1938  /* p3 */
1939  q3_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q3);
1940  tmp0_r = p3_r_in - p4_r_in;
1941  tmp0_r += q3_r_in;
1942  tmp0_r -= p7_r_in;
1943  tmp1_r += tmp0_r;
1944  r_out = __msa_srari_h((v8i16) tmp1_r, 4);
1945  r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
1946  p3 = __msa_bmnz_v(p3, (v16u8) r_out, flat2);
1947  ST8x1_UB(p3, src);
1948  src += 16;
1949 
1950  /* p2 */
1951  q4_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q4);
1952  filter8 = LD_UB(filter48);
1953  tmp0_r = p2_r_in - p3_r_in;
1954  tmp0_r += q4_r_in;
1955  tmp0_r -= p7_r_in;
1956  tmp1_r += tmp0_r;
1957  r_out = __msa_srari_h((v8i16) tmp1_r, 4);
1958  r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
1959  filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
1960  ST8x1_UB(filter8, src);
1961  src += 16;
1962 
1963  /* p1 */
1964  q5_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q5);
1965  filter8 = LD_UB(filter48 + 16);
1966  tmp0_r = p1_r_in - p2_r_in;
1967  tmp0_r += q5_r_in;
1968  tmp0_r -= p7_r_in;
1969  tmp1_r += tmp0_r;
1970  r_out = __msa_srari_h((v8i16) tmp1_r, 4);
1971  r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
1972  filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
1973  ST8x1_UB(filter8, src);
1974  src += 16;
1975 
1976  /* p0 */
1977  q6_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q6);
1978  filter8 = LD_UB(filter48 + 32);
1979  tmp0_r = p0_r_in - p1_r_in;
1980  tmp0_r += q6_r_in;
1981  tmp0_r -= p7_r_in;
1982  tmp1_r += tmp0_r;
1983  r_out = __msa_srari_h((v8i16) tmp1_r, 4);
1984  r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
1985  filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
1986  ST8x1_UB(filter8, src);
1987  src += 16;
1988 
1989  /* q0 */
1990  q7_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q7);
1991  filter8 = LD_UB(filter48 + 48);
1992  tmp0_r = q7_r_in - p0_r_in;
1993  tmp0_r += q0_r_in;
1994  tmp0_r -= p7_r_in;
1995  tmp1_r += tmp0_r;
1996  r_out = __msa_srari_h((v8i16) tmp1_r, 4);
1997  r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
1998  filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
1999  ST8x1_UB(filter8, src);
2000  src += 16;
2001 
2002  /* q1 */
2003  filter8 = LD_UB(filter48 + 64);
2004  tmp0_r = q7_r_in - q0_r_in;
2005  tmp0_r += q1_r_in;
2006  tmp0_r -= p6_r_in;
2007  tmp1_r += tmp0_r;
2008  r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2009  r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
2010  filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
2011  ST8x1_UB(filter8, src);
2012  src += 16;
2013 
2014  /* q2 */
2015  filter8 = LD_UB(filter48 + 80);
2016  tmp0_r = q7_r_in - q1_r_in;
2017  tmp0_r += q2_r_in;
2018  tmp0_r -= p5_r_in;
2019  tmp1_r += tmp0_r;
2020  r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2021  r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
2022  filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
2023  ST8x1_UB(filter8, src);
2024  src += 16;
2025 
2026  /* q3 */
2027  tmp0_r = q7_r_in - q2_r_in;
2028  tmp0_r += q3_r_in;
2029  tmp0_r -= p4_r_in;
2030  tmp1_r += tmp0_r;
2031  r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2032  r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
2033  q3 = __msa_bmnz_v(q3, (v16u8) r_out, flat2);
2034  ST8x1_UB(q3, src);
2035  src += 16;
2036 
2037  /* q4 */
2038  tmp0_r = q7_r_in - q3_r_in;
2039  tmp0_r += q4_r_in;
2040  tmp0_r -= p3_r_in;
2041  tmp1_r += tmp0_r;
2042  r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2043  r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
2044  q4 = __msa_bmnz_v(q4, (v16u8) r_out, flat2);
2045  ST8x1_UB(q4, src);
2046  src += 16;
2047 
2048  /* q5 */
2049  tmp0_r = q7_r_in - q4_r_in;
2050  tmp0_r += q5_r_in;
2051  tmp0_r -= p2_r_in;
2052  tmp1_r += tmp0_r;
2053  r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2054  r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
2055  q5 = __msa_bmnz_v(q5, (v16u8) r_out, flat2);
2056  ST8x1_UB(q5, src);
2057  src += 16;
2058 
2059  /* q6 */
2060  tmp0_r = q7_r_in - q5_r_in;
2061  tmp0_r += q6_r_in;
2062  tmp0_r -= p1_r_in;
2063  tmp1_r += tmp0_r;
2064  r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2065  r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
2066  q6 = __msa_bmnz_v(q6, (v16u8) r_out, flat2);
2067  ST8x1_UB(q6, src);
2068 
2069  return 0;
2070  }
2071 }
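/*
 * NOTE (editorial): scalar model of the sliding wide-filter window
 * computed above (illustrative helper, not part of this file).  The MSA
 * code keeps the window sum in tmp1_r and updates it with two adds and
 * two subtracts per output instead of re-summing; __msa_srari_h(sum, 4)
 * is the (sum + 8) >> 4 rounding and __msa_bmnz_v() the per-pixel flat2
 * select.
 */
static void vp9_wide_filter_ref(const uint8_t x[16],  /* x[0..15] = p7..q7   */
                                const uint8_t f2[14], /* flat2, per pixel    */
                                uint8_t out[14])      /* out[0..13] = p6..q6 */
{
    int i, j, sum;

    for (i = 0; i < 14; i++) {
        sum = x[i + 1];                  /* centre tap is counted twice */
        for (j = i - 6; j <= i + 8; j++) /* 15 edge-clamped neighbours  */
            sum += x[j < 0 ? 0 : (j > 15 ? 15 : j)];
        out[i] = f2[i] ? (uint8_t) ((sum + 8) >> 4) : x[i + 1];
    }
}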
2072 
2073 void ff_loop_filter_h_16_8_msa(uint8_t *src, ptrdiff_t pitch,
2074  int32_t b_limit_ptr,
2075  int32_t limit_ptr,
2076  int32_t thresh_ptr)
2077 {
2078  uint8_t early_exit = 0;
2079  uint8_t transposed_input[16 * 24] ALLOC_ALIGNED(ALIGNMENT);
2080  uint8_t *filter48 = &transposed_input[16 * 16];
2081 
2082  vp9_transpose_16x8_to_8x16(src - 8, pitch, transposed_input, 16);
2083 
2084  early_exit = vp9_vt_lpf_t4_and_t8_8w((transposed_input + 16 * 8),
2085  &filter48[0], src, pitch,
2086  b_limit_ptr, limit_ptr, thresh_ptr);
2087 
2088  if (0 == early_exit) {
2089  early_exit = vp9_vt_lpf_t16_8w((transposed_input + 16 * 8), src, pitch,
2090  &filter48[0]);
2091 
2092  if (0 == early_exit) {
2093  vp9_transpose_8x16_to_16x8(transposed_input, 16, src - 8, pitch);
2094  }
2095  }
2096 }
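/*
 * NOTE (editorial): vertical-edge filtering reuses the horizontal-edge
 * arithmetic by transposing the block around the edge into a scratch
 * buffer with a fixed pitch of 16, filtering there, and transposing back.
 * The first 16 * 16 bytes of transposed_input hold the pixels; the
 * remaining 8 * 16 bytes back the filter48 scratch area.  Each early_exit
 * return above means the corresponding helper has already written the few
 * affected columns straight to src, so the write-back transpose is
 * skipped.
 */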
2097 
2098 static int32_t vp9_vt_lpf_t4_and_t8_16w(uint8_t *src, uint8_t *filter48,
2099  uint8_t *src_org, ptrdiff_t pitch,
2100  int32_t b_limit_ptr,
2101  int32_t limit_ptr,
2102  int32_t thresh_ptr)
2103 {
2104  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
2105  v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
2106  v16u8 flat, mask, hev, thresh, b_limit, limit;
2107  v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
2108  v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
2109  v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r;
2110  v8i16 q0_filt8_r, q1_filt8_r, q2_filt8_r;
2111  v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l;
2112  v8i16 q0_filt8_l, q1_filt8_l, q2_filt8_l;
2113  v16i8 zero = { 0 };
2114  v8i16 vec0, vec1, vec2, vec3, vec4, vec5;
2115 
2116  /* load vector elements */
2117  LD_UB8(src - (4 * 16), 16, p3, p2, p1, p0, q0, q1, q2, q3);
2118 
2119  thresh = (v16u8) __msa_fill_b(thresh_ptr);
2120  b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
2121  limit = (v16u8) __msa_fill_b(limit_ptr);
2122 
2123  /* mask and hev */
2124  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
2125  hev, mask, flat);
2126  /* flat4 */
2127  VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
2128  /* filter4 */
2129  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
2130  q1_out);
2131 
2132  /* if flat is zero for all pixels, then there is no need to apply the wider filter */
2133  if (__msa_test_bz_v(flat)) {
2134  ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
2135  ILVRL_H2_SH(vec1, vec0, vec2, vec3);
2136  ILVL_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
2137  ILVRL_H2_SH(vec1, vec0, vec4, vec5);
2138 
2139  src_org -= 2;
2140  ST4x8_UB(vec2, vec3, src_org, pitch);
2141  src_org += 8 * pitch;
2142  ST4x8_UB(vec4, vec5, src_org, pitch);
2143 
2144  return 1;
2145  } else {
2146  ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
2147  zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r,
2148  q3_r);
2149  VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
2150  p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
2151  ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l,
2152  p0_l);
2153  ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l,
2154  q3_l);
2155  VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
2156  p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
2157 
2158  /* convert 16 bit output data into 8 bit */
2159  PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l,
2160  p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r,
2161  p0_filt8_r, q0_filt8_r);
2162  PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r, q1_filt8_r,
2163  q2_filt8_r);
2164 
2165  /* store pixel values */
2166  p2_out = __msa_bmnz_v(p2, (v16u8) p2_filt8_r, flat);
2167  p1_out = __msa_bmnz_v(p1_out, (v16u8) p1_filt8_r, flat);
2168  p0_out = __msa_bmnz_v(p0_out, (v16u8) p0_filt8_r, flat);
2169  q0_out = __msa_bmnz_v(q0_out, (v16u8) q0_filt8_r, flat);
2170  q1_out = __msa_bmnz_v(q1_out, (v16u8) q1_filt8_r, flat);
2171  q2_out = __msa_bmnz_v(q2, (v16u8) q2_filt8_r, flat);
2172 
2173  ST_UB4(p2_out, p1_out, p0_out, q0_out, filter48, 16);
2174  filter48 += (4 * 16);
2175  ST_UB2(q1_out, q2_out, filter48, 16);
2176  filter48 += (2 * 16);
2177  ST_UB(flat, filter48);
2178 
2179  return 0;
2180  }
2181 }
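/*
 * NOTE (editorial): identical in structure to vp9_vt_lpf_t4_and_t8_8w(),
 * but all 16 transposed rows are live, so VP9_FILTER8 runs on both the
 * right (ILVR) and left (ILVL) byte halves and PCKEV_B4_SH/PCKEV_B2_SH
 * merge the two 16-bit halves back into full 16-lane vectors.
 */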
2182 
2183 static int32_t vp9_vt_lpf_t16_16w(uint8_t *src, uint8_t *src_org, ptrdiff_t pitch,
2184  uint8_t *filter48)
2185 {
2186  v16u8 flat, flat2, filter8;
2187  v16i8 zero = { 0 };
2188  v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
2189  v8u16 p7_r_in, p6_r_in, p5_r_in, p4_r_in;
2190  v8u16 p3_r_in, p2_r_in, p1_r_in, p0_r_in;
2191  v8u16 q7_r_in, q6_r_in, q5_r_in, q4_r_in;
2192  v8u16 q3_r_in, q2_r_in, q1_r_in, q0_r_in;
2193  v8u16 p7_l_in, p6_l_in, p5_l_in, p4_l_in;
2194  v8u16 p3_l_in, p2_l_in, p1_l_in, p0_l_in;
2195  v8u16 q7_l_in, q6_l_in, q5_l_in, q4_l_in;
2196  v8u16 q3_l_in, q2_l_in, q1_l_in, q0_l_in;
2197  v8u16 tmp0_r, tmp1_r, tmp0_l, tmp1_l;
2198  v8i16 l_out, r_out;
2199 
2200  flat = LD_UB(filter48 + 6 * 16);
2201 
2202  LD_UB8((src - 8 * 16), 16, p7, p6, p5, p4, p3, p2, p1, p0);
2203  LD_UB8(src, 16, q0, q1, q2, q3, q4, q5, q6, q7);
2204 
2205  VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);
2206 
2207  /* if flat2 is zero for all pixels, then there is no need to apply the wider filter */
2208  if (__msa_test_bz_v(flat2)) {
2209  v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
2210 
2211  LD_UB4(filter48, 16, p2, p1, p0, q0);
2212  LD_UB2(filter48 + 4 * 16, 16, q1, q2);
2213 
2214  ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1);
2215  ILVRL_H2_SH(vec1, vec0, vec3, vec4);
2216  ILVL_B2_SH(p1, p2, q0, p0, vec0, vec1);
2217  ILVRL_H2_SH(vec1, vec0, vec6, vec7);
2218  ILVRL_B2_SH(q2, q1, vec2, vec5);
2219 
2220  src_org -= 3;
2221  ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src_org, pitch);
2222  ST2x4_UB(vec2, 0, (src_org + 4), pitch);
2223  src_org += (4 * pitch);
2224  ST4x4_UB(vec4, vec4, 0, 1, 2, 3, src_org, pitch);
2225  ST2x4_UB(vec2, 4, (src_org + 4), pitch);
2226  src_org += (4 * pitch);
2227  ST4x4_UB(vec6, vec6, 0, 1, 2, 3, src_org, pitch);
2228  ST2x4_UB(vec5, 0, (src_org + 4), pitch);
2229  src_org += (4 * pitch);
2230  ST4x4_UB(vec7, vec7, 0, 1, 2, 3, src_org, pitch);
2231  ST2x4_UB(vec5, 4, (src_org + 4), pitch);
2232 
2233  return 1;
2234  } else {
2235  src -= 7 * 16;
2236 
2237  ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2,
2238  zero, p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in,
2239  p3_r_in, p2_r_in, p1_r_in, p0_r_in);
2240  q0_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q0);
2241 
2242  tmp0_r = p7_r_in << 3;
2243  tmp0_r -= p7_r_in;
2244  tmp0_r += p6_r_in;
2245  tmp0_r += q0_r_in;
2246  tmp1_r = p6_r_in + p5_r_in;
2247  tmp1_r += p4_r_in;
2248  tmp1_r += p3_r_in;
2249  tmp1_r += p2_r_in;
2250  tmp1_r += p1_r_in;
2251  tmp1_r += p0_r_in;
2252  tmp1_r += tmp0_r;
2253  r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2254 
2255  ILVL_B4_UH(zero, p7, zero, p6, zero, p5, zero, p4, p7_l_in, p6_l_in,
2256  p5_l_in, p4_l_in);
2257  ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l_in, p2_l_in,
2258  p1_l_in, p0_l_in);
2259  q0_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q0);
2260 
2261  tmp0_l = p7_l_in << 3;
2262  tmp0_l -= p7_l_in;
2263  tmp0_l += p6_l_in;
2264  tmp0_l += q0_l_in;
2265  tmp1_l = p6_l_in + p5_l_in;
2266  tmp1_l += p4_l_in;
2267  tmp1_l += p3_l_in;
2268  tmp1_l += p2_l_in;
2269  tmp1_l += p1_l_in;
2270  tmp1_l += p0_l_in;
2271  tmp1_l += tmp0_l;
2272  l_out = __msa_srari_h((v8i16) tmp1_l, 4);
2273 
2274  r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
2275  p6 = __msa_bmnz_v(p6, (v16u8) r_out, flat2);
2276  ST_UB(p6, src);
2277  src += 16;
2278 
2279  /* p5 */
2280  q1_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q1);
2281  tmp0_r = p5_r_in - p6_r_in;
2282  tmp0_r += q1_r_in;
2283  tmp0_r -= p7_r_in;
2284  tmp1_r += tmp0_r;
2285  r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2286  q1_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q1);
2287  tmp0_l = p5_l_in - p6_l_in;
2288  tmp0_l += q1_l_in;
2289  tmp0_l -= p7_l_in;
2290  tmp1_l += tmp0_l;
2291  l_out = __msa_srari_h((v8i16) tmp1_l, 4);
2292  r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
2293  p5 = __msa_bmnz_v(p5, (v16u8) r_out, flat2);
2294  ST_UB(p5, src);
2295  src += 16;
2296 
2297  /* p4 */
2298  q2_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q2);
2299  tmp0_r = p4_r_in - p5_r_in;
2300  tmp0_r += q2_r_in;
2301  tmp0_r -= p7_r_in;
2302  tmp1_r += tmp0_r;
2303  r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2304  q2_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q2);
2305  tmp0_l = p4_l_in - p5_l_in;
2306  tmp0_l += q2_l_in;
2307  tmp0_l -= p7_l_in;
2308  tmp1_l += tmp0_l;
2309  l_out = __msa_srari_h((v8i16) tmp1_l, 4);
2310  r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
2311  p4 = __msa_bmnz_v(p4, (v16u8) r_out, flat2);
2312  ST_UB(p4, src);
2313  src += 16;
2314 
2315  /* p3 */
2316  q3_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q3);
2317  tmp0_r = p3_r_in - p4_r_in;
2318  tmp0_r += q3_r_in;
2319  tmp0_r -= p7_r_in;
2320  tmp1_r += tmp0_r;
2321  r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2322  q3_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q3);
2323  tmp0_l = p3_l_in - p4_l_in;
2324  tmp0_l += q3_l_in;
2325  tmp0_l -= p7_l_in;
2326  tmp1_l += tmp0_l;
2327  l_out = __msa_srari_h((v8i16) tmp1_l, 4);
2328  r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
2329  p3 = __msa_bmnz_v(p3, (v16u8) r_out, flat2);
2330  ST_UB(p3, src);
2331  src += 16;
2332 
2333  /* p2 */
2334  q4_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q4);
2335  filter8 = LD_UB(filter48);
2336  tmp0_r = p2_r_in - p3_r_in;
2337  tmp0_r += q4_r_in;
2338  tmp0_r -= p7_r_in;
2339  tmp1_r += tmp0_r;
2340  r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2341  q4_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q4);
2342  tmp0_l = p2_l_in - p3_l_in;
2343  tmp0_l += q4_l_in;
2344  tmp0_l -= p7_l_in;
2345  tmp1_l += tmp0_l;
2346  l_out = __msa_srari_h((v8i16) tmp1_l, 4);
2347  r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
2348  filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
2349  ST_UB(filter8, src);
2350  src += 16;
2351 
2352  /* p1 */
2353  q5_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q5);
2354  filter8 = LD_UB(filter48 + 16);
2355  tmp0_r = p1_r_in - p2_r_in;
2356  tmp0_r += q5_r_in;
2357  tmp0_r -= p7_r_in;
2358  tmp1_r += tmp0_r;
2359  r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2360  q5_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q5);
2361  tmp0_l = p1_l_in - p2_l_in;
2362  tmp0_l += q5_l_in;
2363  tmp0_l -= p7_l_in;
2364  tmp1_l += tmp0_l;
2365  l_out = __msa_srari_h((v8i16) tmp1_l, 4);
2366  r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
2367  filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
2368  ST_UB(filter8, src);
2369  src += 16;
2370 
2371  /* p0 */
2372  q6_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q6);
2373  filter8 = LD_UB(filter48 + 32);
2374  tmp0_r = p0_r_in - p1_r_in;
2375  tmp0_r += q6_r_in;
2376  tmp0_r -= p7_r_in;
2377  tmp1_r += tmp0_r;
2378  r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2379  q6_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q6);
2380  tmp0_l = p0_l_in - p1_l_in;
2381  tmp0_l += q6_l_in;
2382  tmp0_l -= p7_l_in;
2383  tmp1_l += tmp0_l;
2384  l_out = __msa_srari_h((v8i16) tmp1_l, 4);
2385  r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
2386  filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
2387  ST_UB(filter8, src);
2388  src += 16;
2389 
2390  /* q0 */
2391  q7_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q7);
2392  filter8 = LD_UB(filter48 + 48);
2393  tmp0_r = q7_r_in - p0_r_in;
2394  tmp0_r += q0_r_in;
2395  tmp0_r -= p7_r_in;
2396  tmp1_r += tmp0_r;
2397  r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2398  q7_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q7);
2399  tmp0_l = q7_l_in - p0_l_in;
2400  tmp0_l += q0_l_in;
2401  tmp0_l -= p7_l_in;
2402  tmp1_l += tmp0_l;
2403  l_out = __msa_srari_h((v8i16) tmp1_l, 4);
2404  r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
2405  filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
2406  ST_UB(filter8, src);
2407  src += 16;
2408 
2409  /* q1 */
2410  filter8 = LD_UB(filter48 + 64);
2411  tmp0_r = q7_r_in - q0_r_in;
2412  tmp0_r += q1_r_in;
2413  tmp0_r -= p6_r_in;
2414  tmp1_r += tmp0_r;
2415  r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2416  tmp0_l = q7_l_in - q0_l_in;
2417  tmp0_l += q1_l_in;
2418  tmp0_l -= p6_l_in;
2419  tmp1_l += tmp0_l;
2420  l_out = __msa_srari_h((v8i16) tmp1_l, 4);
2421  r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
2422  filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
2423  ST_UB(filter8, src);
2424  src += 16;
2425 
2426  /* q2 */
2427  filter8 = LD_UB(filter48 + 80);
2428  tmp0_r = q7_r_in - q1_r_in;
2429  tmp0_r += q2_r_in;
2430  tmp0_r -= p5_r_in;
2431  tmp1_r += tmp0_r;
2432  r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2433  tmp0_l = q7_l_in - q1_l_in;
2434  tmp0_l += q2_l_in;
2435  tmp0_l -= p5_l_in;
2436  tmp1_l += tmp0_l;
2437  l_out = __msa_srari_h((v8i16) tmp1_l, 4);
2438  r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
2439  filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
2440  ST_UB(filter8, src);
2441  src += 16;
2442 
2443  /* q3 */
2444  tmp0_r = q7_r_in - q2_r_in;
2445  tmp0_r += q3_r_in;
2446  tmp0_r -= p4_r_in;
2447  tmp1_r += tmp0_r;
2448  r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2449  tmp0_l = q7_l_in - q2_l_in;
2450  tmp0_l += q3_l_in;
2451  tmp0_l -= p4_l_in;
2452  tmp1_l += tmp0_l;
2453  l_out = __msa_srari_h((v8i16) tmp1_l, 4);
2454  r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
2455  q3 = __msa_bmnz_v(q3, (v16u8) r_out, flat2);
2456  ST_UB(q3, src);
2457  src += 16;
2458 
2459  /* q4 */
2460  tmp0_r = q7_r_in - q3_r_in;
2461  tmp0_r += q4_r_in;
2462  tmp0_r -= p3_r_in;
2463  tmp1_r += tmp0_r;
2464  r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2465  tmp0_l = q7_l_in - q3_l_in;
2466  tmp0_l += q4_l_in;
2467  tmp0_l -= p3_l_in;
2468  tmp1_l += tmp0_l;
2469  l_out = __msa_srari_h((v8i16) tmp1_l, 4);
2470  r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
2471  q4 = __msa_bmnz_v(q4, (v16u8) r_out, flat2);
2472  ST_UB(q4, src);
2473  src += 16;
2474 
2475  /* q5 */
2476  tmp0_r = q7_r_in - q4_r_in;
2477  tmp0_r += q5_r_in;
2478  tmp0_r -= p2_r_in;
2479  tmp1_r += tmp0_r;
2480  r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2481  tmp0_l = q7_l_in - q4_l_in;
2482  tmp0_l += q5_l_in;
2483  tmp0_l -= p2_l_in;
2484  tmp1_l += tmp0_l;
2485  l_out = __msa_srari_h((v8i16) tmp1_l, 4);
2486  r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
2487  q5 = __msa_bmnz_v(q5, (v16u8) r_out, flat2);
2488  ST_UB(q5, src);
2489  src += 16;
2490 
2491  /* q6 */
2492  tmp0_r = q7_r_in - q5_r_in;
2493  tmp0_r += q6_r_in;
2494  tmp0_r -= p1_r_in;
2495  tmp1_r += tmp0_r;
2496  r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2497  tmp0_l = q7_l_in - q5_l_in;
2498  tmp0_l += q6_l_in;
2499  tmp0_l -= p1_l_in;
2500  tmp1_l += tmp0_l;
2501  l_out = __msa_srari_h((v8i16) tmp1_l, 4);
2502  r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
2503  q6 = __msa_bmnz_v(q6, (v16u8) r_out, flat2);
2504  ST_UB(q6, src);
2505 
2506  return 0;
2507  }
2508 }
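/*
 * NOTE (editorial): same sliding-window wide filter as vp9_vt_lpf_t16_8w(),
 * duplicated for both byte halves: r_out and l_out carry the rounded sums
 * of the right and left 8 lanes, and __msa_pckev_b(l_out, r_out) packs
 * their even (low) bytes back into one 16-pixel vector before the flat2
 * select.
 */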
2509 
2510 void ff_loop_filter_h_16_16_msa(uint8_t *src, ptrdiff_t pitch,
2511  int32_t b_limit_ptr,
2512  int32_t limit_ptr,
2513  int32_t thresh_ptr)
2514 {
2515  uint8_t early_exit = 0;
2516  uint8_t transposed_input[16 * 24] ALLOC_ALIGNED(ALIGNMENT);
2517  uint8_t *filter48 = &transposed_input[16 * 16];
2518 
2519  vp9_transpose_16x16((src - 8), pitch, &transposed_input[0], 16);
2520 
2521  early_exit = vp9_vt_lpf_t4_and_t8_16w((transposed_input + 16 * 8),
2522  &filter48[0], src, pitch,
2523  b_limit_ptr, limit_ptr, thresh_ptr);
2524 
2525  if (0 == early_exit) {
2526  early_exit = vp9_vt_lpf_t16_16w((transposed_input + 16 * 8), src, pitch,
2527  &filter48[0]);
2528 
2529  if (0 == early_exit) {
2530  vp9_transpose_16x16(transposed_input, 16, (src - 8), pitch);
2531  }
2532  }
2533 }