/*
 * Copyright (c) 2015 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavcodec/vp8dsp.h"
#include "libavutil/mips/generic_macros_msa.h" /* LD_UB8/ST_UB4/transpose helpers */
#include "vp8dsp_mips.h"

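/*
 * MSA (MIPS SIMD Architecture) versions of the VP8 in-loop deblocking
 * filters: the normal and simple filters, for horizontal and vertical
 * edges, on the 16-wide luma plane and on the 8-wide U/V chroma planes
 * (which are processed in pairs).
 */

/* Decision mask for the simple filter: a byte of the mask is all ones
 * where 2 * |p0 - q0| + |p1 - q1| / 2 <= b_limit, evaluated with
 * saturating unsigned byte arithmetic. */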
#define VP8_SIMPLE_MASK(p1, p0, q0, q1, b_limit, mask) \
{ \
    v16u8 p1_a_sub_q1, p0_a_sub_q0; \
    \
    p0_a_sub_q0 = __msa_asub_u_b(p0, q0); \
    p1_a_sub_q1 = __msa_asub_u_b(p1, q1); \
    p1_a_sub_q1 = (v16u8) __msa_srli_b((v16i8) p1_a_sub_q1, 1); \
    p0_a_sub_q0 = __msa_adds_u_b(p0_a_sub_q0, p0_a_sub_q0); \
    mask = __msa_adds_u_b(p0_a_sub_q0, p1_a_sub_q1); \
    mask = ((v16u8) mask <= b_limit); \
}

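/* Common 4-tap filter applied to the two pixels on each side of the edge.
 * Pixels are biased into signed range (^ 0x80), then
 * filt = sat8((clamp(p1 - q1) & hev) + 3 * (q0 - p0)) & mask,
 * q0 -= (filt + 4) >> 3 and p0 += (filt + 3) >> 3. Where hev is clear, the
 * outer pixels also move by the rounded half-step (filt1 + 1) >> 1. The
 * 16-bit 3 * (q0 - p0) term is produced by a byte dot product of the
 * duplicated difference with the 0x0003 halfword pattern. */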
#define VP8_LPF_FILTER4_4W(p1_in_out, p0_in_out, q0_in_out, q1_in_out, \
                           mask_in, hev_in) \
{ \
    v16i8 p1_m, p0_m, q0_m, q1_m, q0_sub_p0, filt_sign; \
    v16i8 filt, filt1, filt2, cnst4b, cnst3b; \
    v8i16 q0_sub_p0_r, q0_sub_p0_l, filt_l, filt_r, cnst3h; \
    \
    p1_m = (v16i8) __msa_xori_b(p1_in_out, 0x80); \
    p0_m = (v16i8) __msa_xori_b(p0_in_out, 0x80); \
    q0_m = (v16i8) __msa_xori_b(q0_in_out, 0x80); \
    q1_m = (v16i8) __msa_xori_b(q1_in_out, 0x80); \
    \
    filt = __msa_subs_s_b(p1_m, q1_m); \
    \
    filt = filt & (v16i8) hev_in; \
    \
    q0_sub_p0 = q0_m - p0_m; \
    filt_sign = __msa_clti_s_b(filt, 0); \
    \
    cnst3h = __msa_ldi_h(3); \
    q0_sub_p0_r = (v8i16) __msa_ilvr_b(q0_sub_p0, q0_sub_p0); \
    q0_sub_p0_r = __msa_dotp_s_h((v16i8) q0_sub_p0_r, (v16i8) cnst3h); \
    filt_r = (v8i16) __msa_ilvr_b(filt_sign, filt); \
    filt_r += q0_sub_p0_r; \
    filt_r = __msa_sat_s_h(filt_r, 7); \
    \
    q0_sub_p0_l = (v8i16) __msa_ilvl_b(q0_sub_p0, q0_sub_p0); \
    q0_sub_p0_l = __msa_dotp_s_h((v16i8) q0_sub_p0_l, (v16i8) cnst3h); \
    filt_l = (v8i16) __msa_ilvl_b(filt_sign, filt); \
    filt_l += q0_sub_p0_l; \
    filt_l = __msa_sat_s_h(filt_l, 7); \
    \
    filt = __msa_pckev_b((v16i8) filt_l, (v16i8) filt_r); \
    filt = filt & (v16i8) mask_in; \
    \
    cnst4b = __msa_ldi_b(4); \
    filt1 = __msa_adds_s_b(filt, cnst4b); \
    filt1 >>= 3; \
    \
    cnst3b = __msa_ldi_b(3); \
    filt2 = __msa_adds_s_b(filt, cnst3b); \
    filt2 >>= 3; \
    \
    q0_m = __msa_subs_s_b(q0_m, filt1); \
    q0_in_out = __msa_xori_b((v16u8) q0_m, 0x80); \
    p0_m = __msa_adds_s_b(p0_m, filt2); \
    p0_in_out = __msa_xori_b((v16u8) p0_m, 0x80); \
    \
    filt = __msa_srari_b(filt1, 1); \
    hev_in = __msa_xori_b((v16u8) hev_in, 0xff); \
    filt = filt & (v16i8) hev_in; \
    \
    q1_m = __msa_subs_s_b(q1_m, filt); \
    q1_in_out = __msa_xori_b((v16u8) q1_m, 0x80); \
    p1_m = __msa_adds_s_b(p1_m, filt); \
    p1_in_out = __msa_xori_b((v16u8) p1_m, 0x80); \
}

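/* Filter core of the simple loop filter: the same filt computation as
 * VP8_LPF_FILTER4_4W but without the hev gating, and only p0/q0 are
 * written back. The widened 3 * (q0 - p0) term is built here with an
 * explicit sign extension and halfword multiply instead of a dot product. */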
#define VP8_SIMPLE_FILT(p1_in, p0_in, q0_in, q1_in, mask) \
{ \
    v16i8 p1_m, p0_m, q0_m, q1_m, q0_sub_p0, q0_sub_p0_sign; \
    v16i8 filt, filt1, filt2, cnst4b, cnst3b, filt_sign; \
    v8i16 q0_sub_p0_r, q0_sub_p0_l, filt_l, filt_r, cnst3h; \
    \
    p1_m = (v16i8) __msa_xori_b(p1_in, 0x80); \
    p0_m = (v16i8) __msa_xori_b(p0_in, 0x80); \
    q0_m = (v16i8) __msa_xori_b(q0_in, 0x80); \
    q1_m = (v16i8) __msa_xori_b(q1_in, 0x80); \
    \
    filt = __msa_subs_s_b(p1_m, q1_m); \
    \
    q0_sub_p0 = q0_m - p0_m; \
    filt_sign = __msa_clti_s_b(filt, 0); \
    \
    cnst3h = __msa_ldi_h(3); \
    q0_sub_p0_sign = __msa_clti_s_b(q0_sub_p0, 0); \
    q0_sub_p0_r = (v8i16) __msa_ilvr_b(q0_sub_p0_sign, q0_sub_p0); \
    q0_sub_p0_r *= cnst3h; \
    filt_r = (v8i16) __msa_ilvr_b(filt_sign, filt); \
    filt_r += q0_sub_p0_r; \
    filt_r = __msa_sat_s_h(filt_r, 7); \
    \
    q0_sub_p0_l = (v8i16) __msa_ilvl_b(q0_sub_p0_sign, q0_sub_p0); \
    q0_sub_p0_l *= cnst3h; \
    filt_l = (v8i16) __msa_ilvl_b(filt_sign, filt); \
    filt_l += q0_sub_p0_l; \
    filt_l = __msa_sat_s_h(filt_l, 7); \
    \
    filt = __msa_pckev_b((v16i8) filt_l, (v16i8) filt_r); \
    filt = filt & (v16i8) (mask); \
    \
    cnst4b = __msa_ldi_b(4); \
    filt1 = __msa_adds_s_b(filt, cnst4b); \
    filt1 >>= 3; \
    \
    cnst3b = __msa_ldi_b(3); \
    filt2 = __msa_adds_s_b(filt, cnst3b); \
    filt2 >>= 3; \
    \
    q0_m = __msa_subs_s_b(q0_m, filt1); \
    p0_m = __msa_adds_s_b(p0_m, filt2); \
    q0_in = __msa_xori_b((v16u8) q0_m, 0x80); \
    p0_in = __msa_xori_b((v16u8) p0_m, 0x80); \
}

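/* Normal filter for macroblock edges. High edge-variance pixels take the
 * common 4-tap filter; for the remaining pixels,
 * w = sat8(clamp(p1 - q1) + 3 * (q0 - p0)) & mask & ~hev drives the wider
 * smoothing: p0/q0 move by (27 * w + 63) >> 7, p1/q1 by (18 * w + 63) >> 7
 * and p2/q2 by (9 * w + 63) >> 7. */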
#define VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev) \
{ \
    v16i8 p2_m, p1_m, p0_m, q2_m, q1_m, q0_m; \
    v16i8 filt, q0_sub_p0, cnst4b, cnst3b; \
    v16i8 u, filt1, filt2, filt_sign, q0_sub_p0_sign; \
    v8i16 q0_sub_p0_r, q0_sub_p0_l, filt_r, u_r, u_l, filt_l; \
    v8i16 cnst3h, cnst27h, cnst18h, cnst63h; \
    \
    cnst3h = __msa_ldi_h(3); \
    \
    p2_m = (v16i8) __msa_xori_b(p2, 0x80); \
    p1_m = (v16i8) __msa_xori_b(p1, 0x80); \
    p0_m = (v16i8) __msa_xori_b(p0, 0x80); \
    q0_m = (v16i8) __msa_xori_b(q0, 0x80); \
    q1_m = (v16i8) __msa_xori_b(q1, 0x80); \
    q2_m = (v16i8) __msa_xori_b(q2, 0x80); \
    \
    filt = __msa_subs_s_b(p1_m, q1_m); \
    q0_sub_p0 = q0_m - p0_m; \
    q0_sub_p0_sign = __msa_clti_s_b(q0_sub_p0, 0); \
    filt_sign = __msa_clti_s_b(filt, 0); \
    \
    /* right part */ \
    q0_sub_p0_r = (v8i16) __msa_ilvr_b(q0_sub_p0_sign, q0_sub_p0); \
    q0_sub_p0_r *= cnst3h; \
    filt_r = (v8i16) __msa_ilvr_b(filt_sign, filt); \
    filt_r = filt_r + q0_sub_p0_r; \
    filt_r = __msa_sat_s_h(filt_r, 7); \
    \
    /* left part */ \
    q0_sub_p0_l = (v8i16) __msa_ilvl_b(q0_sub_p0_sign, q0_sub_p0); \
    q0_sub_p0_l *= cnst3h; \
    filt_l = (v8i16) __msa_ilvl_b(filt_sign, filt); \
    filt_l = filt_l + q0_sub_p0_l; \
    filt_l = __msa_sat_s_h(filt_l, 7); \
    \
    /* combine left and right part */ \
    filt = __msa_pckev_b((v16i8) filt_l, (v16i8) filt_r); \
    filt = filt & (v16i8) mask; \
    filt2 = filt & (v16i8) hev; \
    \
    /* filt_val &= ~hev */ \
    hev = __msa_xori_b(hev, 0xff); \
    filt = filt & (v16i8) hev; \
    cnst4b = __msa_ldi_b(4); \
    filt1 = __msa_adds_s_b(filt2, cnst4b); \
    filt1 >>= 3; \
    cnst3b = __msa_ldi_b(3); \
    filt2 = __msa_adds_s_b(filt2, cnst3b); \
    filt2 >>= 3; \
    q0_m = __msa_subs_s_b(q0_m, filt1); \
    p0_m = __msa_adds_s_b(p0_m, filt2); \
    \
    filt_sign = __msa_clti_s_b(filt, 0); \
    ILVRL_B2_SH(filt_sign, filt, filt_r, filt_l); \
    \
    cnst27h = __msa_ldi_h(27); \
    cnst63h = __msa_ldi_h(63); \
    \
    /* right part */ \
    u_r = filt_r * cnst27h; \
    u_r += cnst63h; \
    u_r >>= 7; \
    u_r = __msa_sat_s_h(u_r, 7); \
    /* left part */ \
    u_l = filt_l * cnst27h; \
    u_l += cnst63h; \
    u_l >>= 7; \
    u_l = __msa_sat_s_h(u_l, 7); \
    /* combine left and right part */ \
    u = __msa_pckev_b((v16i8) u_l, (v16i8) u_r); \
    q0_m = __msa_subs_s_b(q0_m, u); \
    q0 = __msa_xori_b((v16u8) q0_m, 0x80); \
    p0_m = __msa_adds_s_b(p0_m, u); \
    p0 = __msa_xori_b((v16u8) p0_m, 0x80); \
    /* right part */ \
    cnst18h = __msa_ldi_h(18); \
    u_r = filt_r * cnst18h; \
    u_r += cnst63h; \
    u_r >>= 7; \
    u_r = __msa_sat_s_h(u_r, 7); \
    \
    /* left part */ \
    u_l = filt_l * cnst18h; \
    u_l += cnst63h; \
    u_l >>= 7; \
    u_l = __msa_sat_s_h(u_l, 7); \
    /* combine left and right part */ \
    u = __msa_pckev_b((v16i8) u_l, (v16i8) u_r); \
    q1_m = __msa_subs_s_b(q1_m, u); \
    q1 = __msa_xori_b((v16u8) q1_m, 0x80); \
    p1_m = __msa_adds_s_b(p1_m, u); \
    p1 = __msa_xori_b((v16u8) p1_m, 0x80); \
    /* right part: (9 * w + 63) >> 7 computed as (w << 3) + w + 63 */ \
    u_r = filt_r << 3; \
    u_r += filt_r + cnst63h; \
    u_r >>= 7; \
    u_r = __msa_sat_s_h(u_r, 7); \
    \
    /* left part */ \
    u_l = filt_l << 3; \
    u_l += filt_l + cnst63h; \
    u_l >>= 7; \
    u_l = __msa_sat_s_h(u_l, 7); \
    /* combine left and right part */ \
    u = __msa_pckev_b((v16i8) u_l, (v16i8) u_r); \
    q2_m = __msa_subs_s_b(q2_m, u); \
    q2 = __msa_xori_b((v16u8) q2_m, 0x80); \
    p2_m = __msa_adds_s_b(p2_m, u); \
    p2 = __msa_xori_b((v16u8) p2_m, 0x80); \
}

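/* Decision masks for the normal filter. hev_out (high edge variance) is
 * set where max(|p1 - p0|, |q1 - q0|) > thresh; mask_out is set where
 * every adjacent-pixel difference is within limit and
 * 2 * |p0 - q0| + |p1 - q1| / 2 is within b_limit. flat_out carries the
 * intermediate max(|p1 - p0|, |q1 - q0|). */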
#define LPF_MASK_HEV(p3_in, p2_in, p1_in, p0_in, \
                     q0_in, q1_in, q2_in, q3_in, \
                     limit_in, b_limit_in, thresh_in, \
                     hev_out, mask_out, flat_out) \
{ \
    v16u8 p3_asub_p2_m, p2_asub_p1_m, p1_asub_p0_m, q1_asub_q0_m; \
    v16u8 p1_asub_q1_m, p0_asub_q0_m, q3_asub_q2_m, q2_asub_q1_m; \
    \
    /* absolute subtraction of pixel values */ \
    p3_asub_p2_m = __msa_asub_u_b((p3_in), (p2_in)); \
    p2_asub_p1_m = __msa_asub_u_b((p2_in), (p1_in)); \
    p1_asub_p0_m = __msa_asub_u_b((p1_in), (p0_in)); \
    q1_asub_q0_m = __msa_asub_u_b((q1_in), (q0_in)); \
    q2_asub_q1_m = __msa_asub_u_b((q2_in), (q1_in)); \
    q3_asub_q2_m = __msa_asub_u_b((q3_in), (q2_in)); \
    p0_asub_q0_m = __msa_asub_u_b((p0_in), (q0_in)); \
    p1_asub_q1_m = __msa_asub_u_b((p1_in), (q1_in)); \
    /* calculation of hev */ \
    flat_out = __msa_max_u_b(p1_asub_p0_m, q1_asub_q0_m); \
    hev_out = (thresh_in) < (v16u8) flat_out; \
    /* calculation of mask */ \
    p0_asub_q0_m = __msa_adds_u_b(p0_asub_q0_m, p0_asub_q0_m); \
    p1_asub_q1_m >>= 1; \
    p0_asub_q0_m = __msa_adds_u_b(p0_asub_q0_m, p1_asub_q1_m); \
    mask_out = (b_limit_in) < p0_asub_q0_m; \
    mask_out = __msa_max_u_b(flat_out, mask_out); \
    p3_asub_p2_m = __msa_max_u_b(p3_asub_p2_m, p2_asub_p1_m); \
    mask_out = __msa_max_u_b(p3_asub_p2_m, mask_out); \
    q2_asub_q1_m = __msa_max_u_b(q2_asub_q1_m, q3_asub_q2_m); \
    mask_out = __msa_max_u_b(q2_asub_q1_m, mask_out); \
    mask_out = (limit_in) < (v16u8) mask_out; \
    mask_out = __msa_xori_b(mask_out, 0xff); \
}

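/* Stores the 6 bytes of one filtered row: a 4-byte word taken from in0 at
 * pdst, then a 2-byte halfword taken from in1 at pdst + stride (callers
 * pass stride = 4, giving 6 consecutive bytes). */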
#define VP8_ST6x1_UB(in0, in0_idx, in1, in1_idx, pdst, stride) \
{ \
    uint16_t tmp0_h; \
    uint32_t tmp0_w; \
    \
    tmp0_w = __msa_copy_u_w((v4i32) in0, in0_idx); \
    tmp0_h = __msa_copy_u_h((v8i16) in1, in1_idx); \
    SW(tmp0_w, pdst); \
    SH(tmp0_h, pdst + stride); \
}

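/* Normal loop filter on a horizontal luma macroblock edge: filters 16
 * columns, three pixels on each side of the edge, in one pass. */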
void ff_vp8_v_loop_filter16_msa(uint8_t *src, ptrdiff_t pitch, int b_limit_in,
                                int limit_in, int thresh_in)
{
    uint8_t *temp_src;
    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
    v16u8 mask, hev, flat, thresh, limit, b_limit;

    b_limit = (v16u8) __msa_fill_b(b_limit_in);
    limit = (v16u8) __msa_fill_b(limit_in);
    thresh = (v16u8) __msa_fill_b(thresh_in);
    /* load vector elements */
    temp_src = src - (pitch << 2);
    LD_UB8(temp_src, pitch, p3, p2, p1, p0, q0, q1, q2, q3);
    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
                 hev, mask, flat);
    VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev);
    /* store vector elements */
    temp_src = src - 3 * pitch;
    ST_UB4(p2, p1, p0, q0, temp_src, pitch);
    temp_src += (4 * pitch);
    ST_UB2(q1, q2, temp_src, pitch);
}

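/* Normal loop filter on a horizontal chroma macroblock edge. The 8-wide
 * U and V rows are packed side by side into single 16-byte vectors so
 * both planes are filtered in one pass, then split again for storing. */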
void ff_vp8_v_loop_filter8uv_msa(uint8_t *src_u, uint8_t *src_v,
                                 ptrdiff_t pitch, int b_limit_in, int limit_in,
                                 int thresh_in)
{
    uint8_t *temp_src;
    uint64_t p2_d, p1_d, p0_d, q0_d, q1_d, q2_d;
    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
    v16u8 mask, hev, flat, thresh, limit, b_limit;
    v16u8 p3_u, p2_u, p1_u, p0_u, q3_u, q2_u, q1_u, q0_u;
    v16u8 p3_v, p2_v, p1_v, p0_v, q3_v, q2_v, q1_v, q0_v;

    b_limit = (v16u8) __msa_fill_b(b_limit_in);
    limit = (v16u8) __msa_fill_b(limit_in);
    thresh = (v16u8) __msa_fill_b(thresh_in);

    temp_src = src_u - (pitch << 2);
    LD_UB8(temp_src, pitch, p3_u, p2_u, p1_u, p0_u, q0_u, q1_u, q2_u, q3_u);
    temp_src = src_v - (pitch << 2);
    LD_UB8(temp_src, pitch, p3_v, p2_v, p1_v, p0_v, q0_v, q1_v, q2_v, q3_v);

    /* right 8 elements of p3 are U pixels, left 8 elements are V pixels */
    ILVR_D4_UB(p3_v, p3_u, p2_v, p2_u, p1_v, p1_u, p0_v, p0_u, p3, p2, p1, p0);
    ILVR_D4_UB(q0_v, q0_u, q1_v, q1_u, q2_v, q2_u, q3_v, q3_u, q0, q1, q2, q3);
    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
                 hev, mask, flat);
    VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev);

    p2_d = __msa_copy_u_d((v2i64) p2, 0);
    p1_d = __msa_copy_u_d((v2i64) p1, 0);
    p0_d = __msa_copy_u_d((v2i64) p0, 0);
    q0_d = __msa_copy_u_d((v2i64) q0, 0);
    q1_d = __msa_copy_u_d((v2i64) q1, 0);
    q2_d = __msa_copy_u_d((v2i64) q2, 0);
    src_u -= (pitch * 3);
    SD4(p2_d, p1_d, p0_d, q0_d, src_u, pitch);
    src_u += 4 * pitch;
    SD(q1_d, src_u);
    src_u += pitch;
    SD(q2_d, src_u);

    p2_d = __msa_copy_u_d((v2i64) p2, 1);
    p1_d = __msa_copy_u_d((v2i64) p1, 1);
    p0_d = __msa_copy_u_d((v2i64) p0, 1);
    q0_d = __msa_copy_u_d((v2i64) q0, 1);
    q1_d = __msa_copy_u_d((v2i64) q1, 1);
    q2_d = __msa_copy_u_d((v2i64) q2, 1);
    src_v -= (pitch * 3);
    SD4(p2_d, p1_d, p0_d, q0_d, src_v, pitch);
    src_v += 4 * pitch;
    SD(q1_d, src_v);
    src_v += pitch;
    SD(q2_d, src_v);
}

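/* Normal loop filter on a vertical luma macroblock edge: 16 rows are
 * loaded and transposed so the row-oriented filter above can be reused;
 * the six modified columns are written back row by row with
 * VP8_ST6x1_UB. */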
void ff_vp8_h_loop_filter16_msa(uint8_t *src, ptrdiff_t pitch, int b_limit_in,
                                int limit_in, int thresh_in)
{
    uint8_t *temp_src;
    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
    v16u8 mask, hev, flat, thresh, limit, b_limit;
    v16u8 row0, row1, row2, row3, row4, row5, row6, row7, row8;
    v16u8 row9, row10, row11, row12, row13, row14, row15;
    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;

    b_limit = (v16u8) __msa_fill_b(b_limit_in);
    limit = (v16u8) __msa_fill_b(limit_in);
    thresh = (v16u8) __msa_fill_b(thresh_in);
    temp_src = src - 4;
    LD_UB8(temp_src, pitch, row0, row1, row2, row3, row4, row5, row6, row7);
    temp_src += (8 * pitch);
    LD_UB8(temp_src, pitch,
           row8, row9, row10, row11, row12, row13, row14, row15);
    TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7,
                        row8, row9, row10, row11, row12, row13, row14, row15,
                        p3, p2, p1, p0, q0, q1, q2, q3);

    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
                 hev, mask, flat);
    VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev);
    ILVR_B2_SH(p1, p2, q0, p0, tmp0, tmp1);
    ILVRL_H2_SH(tmp1, tmp0, tmp3, tmp4);
    ILVL_B2_SH(p1, p2, q0, p0, tmp0, tmp1);
    ILVRL_H2_SH(tmp1, tmp0, tmp6, tmp7);
    ILVRL_B2_SH(q2, q1, tmp2, tmp5);

    temp_src = src - 3;
    VP8_ST6x1_UB(tmp3, 0, tmp2, 0, temp_src, 4);
    temp_src += pitch;
    VP8_ST6x1_UB(tmp3, 1, tmp2, 1, temp_src, 4);
    temp_src += pitch;
    VP8_ST6x1_UB(tmp3, 2, tmp2, 2, temp_src, 4);
    temp_src += pitch;
    VP8_ST6x1_UB(tmp3, 3, tmp2, 3, temp_src, 4);
    temp_src += pitch;
    VP8_ST6x1_UB(tmp4, 0, tmp2, 4, temp_src, 4);
    temp_src += pitch;
    VP8_ST6x1_UB(tmp4, 1, tmp2, 5, temp_src, 4);
    temp_src += pitch;
    VP8_ST6x1_UB(tmp4, 2, tmp2, 6, temp_src, 4);
    temp_src += pitch;
    VP8_ST6x1_UB(tmp4, 3, tmp2, 7, temp_src, 4);
    temp_src += pitch;
    VP8_ST6x1_UB(tmp6, 0, tmp5, 0, temp_src, 4);
    temp_src += pitch;
    VP8_ST6x1_UB(tmp6, 1, tmp5, 1, temp_src, 4);
    temp_src += pitch;
    VP8_ST6x1_UB(tmp6, 2, tmp5, 2, temp_src, 4);
    temp_src += pitch;
    VP8_ST6x1_UB(tmp6, 3, tmp5, 3, temp_src, 4);
    temp_src += pitch;
    VP8_ST6x1_UB(tmp7, 0, tmp5, 4, temp_src, 4);
    temp_src += pitch;
    VP8_ST6x1_UB(tmp7, 1, tmp5, 5, temp_src, 4);
    temp_src += pitch;
    VP8_ST6x1_UB(tmp7, 2, tmp5, 6, temp_src, 4);
    temp_src += pitch;
    VP8_ST6x1_UB(tmp7, 3, tmp5, 7, temp_src, 4);
}

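/* Normal loop filter on a vertical chroma macroblock edge: 8 U rows and
 * 8 V rows are transposed together into p3..q3 and filtered in one pass. */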
void ff_vp8_h_loop_filter8uv_msa(uint8_t *src_u, uint8_t *src_v,
                                 ptrdiff_t pitch, int b_limit_in, int limit_in,
                                 int thresh_in)
{
    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
    v16u8 mask, hev, flat, thresh, limit, b_limit;
    v16u8 row0, row1, row2, row3, row4, row5, row6, row7, row8;
    v16u8 row9, row10, row11, row12, row13, row14, row15;
    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;

    b_limit = (v16u8) __msa_fill_b(b_limit_in);
    limit = (v16u8) __msa_fill_b(limit_in);
    thresh = (v16u8) __msa_fill_b(thresh_in);

    LD_UB8(src_u - 4, pitch, row0, row1, row2, row3, row4, row5, row6, row7);
    LD_UB8(src_v - 4, pitch,
           row8, row9, row10, row11, row12, row13, row14, row15);
    TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7,
                        row8, row9, row10, row11, row12, row13, row14, row15,
                        p3, p2, p1, p0, q0, q1, q2, q3);

    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
                 hev, mask, flat);
    VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev);

    ILVR_B2_SH(p1, p2, q0, p0, tmp0, tmp1);
    ILVRL_H2_SH(tmp1, tmp0, tmp3, tmp4);
    ILVL_B2_SH(p1, p2, q0, p0, tmp0, tmp1);
    ILVRL_H2_SH(tmp1, tmp0, tmp6, tmp7);
    ILVRL_B2_SH(q2, q1, tmp2, tmp5);

    src_u -= 3;
    VP8_ST6x1_UB(tmp3, 0, tmp2, 0, src_u, 4);
    src_u += pitch;
    VP8_ST6x1_UB(tmp3, 1, tmp2, 1, src_u, 4);
    src_u += pitch;
    VP8_ST6x1_UB(tmp3, 2, tmp2, 2, src_u, 4);
    src_u += pitch;
    VP8_ST6x1_UB(tmp3, 3, tmp2, 3, src_u, 4);
    src_u += pitch;
    VP8_ST6x1_UB(tmp4, 0, tmp2, 4, src_u, 4);
    src_u += pitch;
    VP8_ST6x1_UB(tmp4, 1, tmp2, 5, src_u, 4);
    src_u += pitch;
    VP8_ST6x1_UB(tmp4, 2, tmp2, 6, src_u, 4);
    src_u += pitch;
    VP8_ST6x1_UB(tmp4, 3, tmp2, 7, src_u, 4);

    src_v -= 3;
    VP8_ST6x1_UB(tmp6, 0, tmp5, 0, src_v, 4);
    src_v += pitch;
    VP8_ST6x1_UB(tmp6, 1, tmp5, 1, src_v, 4);
    src_v += pitch;
    VP8_ST6x1_UB(tmp6, 2, tmp5, 2, src_v, 4);
    src_v += pitch;
    VP8_ST6x1_UB(tmp6, 3, tmp5, 3, src_v, 4);
    src_v += pitch;
    VP8_ST6x1_UB(tmp7, 0, tmp5, 4, src_v, 4);
    src_v += pitch;
    VP8_ST6x1_UB(tmp7, 1, tmp5, 5, src_v, 4);
    src_v += pitch;
    VP8_ST6x1_UB(tmp7, 2, tmp5, 6, src_v, 4);
    src_v += pitch;
    VP8_ST6x1_UB(tmp7, 3, tmp5, 7, src_v, 4);
}

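/* Simple loop filter on a horizontal edge; only p0 and q0 are modified. */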
void ff_vp8_v_loop_filter_simple_msa(uint8_t *src, ptrdiff_t pitch,
                                     int b_limit_ptr)
{
    v16u8 p1, p0, q1, q0;
    v16u8 mask, b_limit;

    b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
    /* load vector elements */
    LD_UB4(src - (pitch << 1), pitch, p1, p0, q0, q1);
    VP8_SIMPLE_MASK(p1, p0, q0, q1, b_limit, mask);
    VP8_SIMPLE_FILT(p1, p0, q0, q1, mask);
    ST_UB2(p0, q0, (src - pitch), pitch);
}

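/* Simple loop filter on a vertical edge: a 16x4 transpose yields p1..q1,
 * and the two modified columns are stored back as 2-byte pairs with
 * ST2x4_UB. */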
void ff_vp8_h_loop_filter_simple_msa(uint8_t *src, ptrdiff_t pitch,
                                     int b_limit_ptr)
{
    uint8_t *temp_src;
    v16u8 p1, p0, q1, q0;
    v16u8 mask, b_limit;
    v16u8 row0, row1, row2, row3, row4, row5, row6, row7, row8;
    v16u8 row9, row10, row11, row12, row13, row14, row15;
    v8i16 tmp0, tmp1;

    b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
    temp_src = src - 2;
    LD_UB8(temp_src, pitch, row0, row1, row2, row3, row4, row5, row6, row7);
    temp_src += (8 * pitch);
    LD_UB8(temp_src, pitch,
           row8, row9, row10, row11, row12, row13, row14, row15);
    TRANSPOSE16x4_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7,
                        row8, row9, row10, row11, row12, row13, row14, row15,
                        p1, p0, q0, q1);
    VP8_SIMPLE_MASK(p1, p0, q0, q1, b_limit, mask);
    VP8_SIMPLE_FILT(p1, p0, q0, q1, mask);
    ILVRL_B2_SH(q0, p0, tmp1, tmp0);

    src -= 1;
    ST2x4_UB(tmp1, 0, src, pitch);
    src += 4 * pitch;
    ST2x4_UB(tmp1, 4, src, pitch);
    src += 4 * pitch;
    ST2x4_UB(tmp0, 0, src, pitch);
    src += 4 * pitch;
    ST2x4_UB(tmp0, 4, src, pitch);
    src += 4 * pitch;
}

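/* Inner-edge (subblock) loop filter on a horizontal chroma edge, using
 * the common 4-tap filter on packed U/V vectors; the four modified rows
 * are written back with SD4 using a negative stride. */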
void ff_vp8_v_loop_filter8uv_inner_msa(uint8_t *src_u, uint8_t *src_v,
                                       ptrdiff_t pitch, int b_limit_in,
                                       int limit_in, int thresh_in)
{
    uint64_t p1_d, p0_d, q0_d, q1_d;
    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
    v16u8 mask, hev, flat, thresh, limit, b_limit;
    v16u8 p3_u, p2_u, p1_u, p0_u, q3_u, q2_u, q1_u, q0_u;
    v16u8 p3_v, p2_v, p1_v, p0_v, q3_v, q2_v, q1_v, q0_v;

    thresh = (v16u8) __msa_fill_b(thresh_in);
    limit = (v16u8) __msa_fill_b(limit_in);
    b_limit = (v16u8) __msa_fill_b(b_limit_in);

    src_u = src_u - (pitch << 2);
    LD_UB8(src_u, pitch, p3_u, p2_u, p1_u, p0_u, q0_u, q1_u, q2_u, q3_u);
    src_u += (5 * pitch);
    src_v = src_v - (pitch << 2);
    LD_UB8(src_v, pitch, p3_v, p2_v, p1_v, p0_v, q0_v, q1_v, q2_v, q3_v);
    src_v += (5 * pitch);

    /* right 8 elements of p3 are U pixels and
       left 8 elements of p3 are V pixels */
    ILVR_D4_UB(p3_v, p3_u, p2_v, p2_u, p1_v, p1_u, p0_v, p0_u, p3, p2, p1, p0);
    ILVR_D4_UB(q0_v, q0_u, q1_v, q1_u, q2_v, q2_u, q3_v, q3_u, q0, q1, q2, q3);
    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
                 hev, mask, flat);
    VP8_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev);

    p1_d = __msa_copy_u_d((v2i64) p1, 0);
    p0_d = __msa_copy_u_d((v2i64) p0, 0);
    q0_d = __msa_copy_u_d((v2i64) q0, 0);
    q1_d = __msa_copy_u_d((v2i64) q1, 0);
    SD4(q1_d, q0_d, p0_d, p1_d, src_u, (- pitch));

    p1_d = __msa_copy_u_d((v2i64) p1, 1);
    p0_d = __msa_copy_u_d((v2i64) p0, 1);
    q0_d = __msa_copy_u_d((v2i64) q0, 1);
    q1_d = __msa_copy_u_d((v2i64) q1, 1);
    SD4(q1_d, q0_d, p0_d, p1_d, src_v, (- pitch));
}

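/* Inner-edge loop filter on a vertical chroma edge, via a 16x8 transpose
 * of the combined U/V rows; the four modified columns go back with
 * ST4x4_UB. */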
void ff_vp8_h_loop_filter8uv_inner_msa(uint8_t *src_u, uint8_t *src_v,
                                       ptrdiff_t pitch, int b_limit_in,
                                       int limit_in, int thresh_in)
{
    uint8_t *temp_src_u, *temp_src_v;
    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
    v16u8 mask, hev, flat, thresh, limit, b_limit;
    v16u8 row0, row1, row2, row3, row4, row5, row6, row7, row8;
    v16u8 row9, row10, row11, row12, row13, row14, row15;
    v4i32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;

    thresh = (v16u8) __msa_fill_b(thresh_in);
    limit = (v16u8) __msa_fill_b(limit_in);
    b_limit = (v16u8) __msa_fill_b(b_limit_in);

    LD_UB8(src_u - 4, pitch, row0, row1, row2, row3, row4, row5, row6, row7);
    LD_UB8(src_v - 4, pitch,
           row8, row9, row10, row11, row12, row13, row14, row15);
    TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7,
                        row8, row9, row10, row11, row12, row13, row14, row15,
                        p3, p2, p1, p0, q0, q1, q2, q3);

    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
                 hev, mask, flat);
    VP8_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev);
    ILVR_B2_SW(p0, p1, q1, q0, tmp0, tmp1);
    ILVRL_H2_SW(tmp1, tmp0, tmp2, tmp3);
    tmp0 = (v4i32) __msa_ilvl_b((v16i8) p0, (v16i8) p1);
    tmp1 = (v4i32) __msa_ilvl_b((v16i8) q1, (v16i8) q0);
    ILVRL_H2_SW(tmp1, tmp0, tmp4, tmp5);

    temp_src_u = src_u - 2;
    ST4x4_UB(tmp2, tmp2, 0, 1, 2, 3, temp_src_u, pitch);
    temp_src_u += 4 * pitch;
    ST4x4_UB(tmp3, tmp3, 0, 1, 2, 3, temp_src_u, pitch);

    temp_src_v = src_v - 2;
    ST4x4_UB(tmp4, tmp4, 0, 1, 2, 3, temp_src_v, pitch);
    temp_src_v += 4 * pitch;
    ST4x4_UB(tmp5, tmp5, 0, 1, 2, 3, temp_src_v, pitch);
}

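/* Inner-edge loop filter on a horizontal luma edge; e, i and h are
 * broadcast as the b_limit, limit and hev threshold vectors. */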
void ff_vp8_v_loop_filter16_inner_msa(uint8_t *src, ptrdiff_t pitch,
                                      int32_t e, int32_t i, int32_t h)
{
    v16u8 mask, hev, flat;
    v16u8 thresh, b_limit, limit;
    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;

    /* load vector elements */
    LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
    thresh = (v16u8) __msa_fill_b(h);
    b_limit = (v16u8) __msa_fill_b(e);
    limit = (v16u8) __msa_fill_b(i);

    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
                 hev, mask, flat);
    VP8_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev);

    ST_UB4(p1, p0, q0, q1, (src - 2 * pitch), pitch);
}

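/* Inner-edge loop filter on a vertical luma edge: transpose, apply the
 * common 4-tap filter, then store the four modified columns with
 * ST4x8_UB. */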
void ff_vp8_h_loop_filter16_inner_msa(uint8_t *src, ptrdiff_t pitch,
                                      int32_t e, int32_t i, int32_t h)
{
    v16u8 mask, hev, flat;
    v16u8 thresh, b_limit, limit;
    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
    v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
    v16u8 row8, row9, row10, row11, row12, row13, row14, row15;
    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;

    LD_UB8(src - 4, pitch, row0, row1, row2, row3, row4, row5, row6, row7);
    LD_UB8(src - 4 + (8 * pitch), pitch,
           row8, row9, row10, row11, row12, row13, row14, row15);
    TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7,
                        row8, row9, row10, row11, row12, row13, row14, row15,
                        p3, p2, p1, p0, q0, q1, q2, q3);

    thresh = (v16u8) __msa_fill_b(h);
    b_limit = (v16u8) __msa_fill_b(e);
    limit = (v16u8) __msa_fill_b(i);

    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
                 hev, mask, flat);
    VP8_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev);
    ILVR_B2_SH(p0, p1, q1, q0, tmp0, tmp1);
    ILVRL_H2_SH(tmp1, tmp0, tmp2, tmp3);
    ILVL_B2_SH(p0, p1, q1, q0, tmp0, tmp1);
    ILVRL_H2_SH(tmp1, tmp0, tmp4, tmp5);

    src -= 2;
    ST4x8_UB(tmp2, tmp3, src, pitch);
    src += (8 * pitch);
    ST4x8_UB(tmp4, tmp5, src, pitch);
}