FFmpeg 4.0
h264dsp_msa.c
1 /*
2  * Copyright (c) 2015 - 2017 Parag Salasakar (Parag.Salasakar@imgtec.com)
3  *
4  * This file is part of FFmpeg.
5  *
6  * FFmpeg is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * FFmpeg is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with FFmpeg; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19  */
20 
21 #include "libavutil/mips/generic_macros_msa.h"
22 #include "h264dsp_mips.h"
23 
24 static void avc_wgt_4x2_msa(uint8_t *data, int32_t stride,
25  int32_t log2_denom, int32_t src_weight,
26  int32_t offset_in)
27 {
28  uint32_t tp0, tp1, offset_val;
29  v16u8 zero = { 0 };
30  v16u8 src0 = { 0 };
31  v8i16 src0_r, tmp0, wgt, denom, offset;
32 
33  offset_val = (unsigned) offset_in << log2_denom;
34 
35  wgt = __msa_fill_h(src_weight);
36  offset = __msa_fill_h(offset_val);
37  denom = __msa_fill_h(log2_denom);
38 
39  LW2(data, stride, tp0, tp1);
40  INSERT_W2_UB(tp0, tp1, src0);
41  src0_r = (v8i16) __msa_ilvr_b((v16i8) zero, (v16i8) src0);
42  tmp0 = wgt * src0_r;
43  tmp0 = __msa_adds_s_h(tmp0, offset);
44  tmp0 = __msa_maxi_s_h(tmp0, 0);
45  tmp0 = __msa_srlr_h(tmp0, denom);
46  tmp0 = (v8i16) __msa_sat_u_h((v8u16) tmp0, 7);
47  src0 = (v16u8) __msa_pckev_b((v16i8) tmp0, (v16i8) tmp0);
48  ST4x2_UB(src0, data, stride);
49 }
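/*
 * Illustrative scalar model (not part of the original FFmpeg source) of the
 * per-pixel operation vectorized by the avc_wgt_* routines here: H.264
 * explicit weighted prediction with a single reference.  The helper name and
 * the rounding model assumed for __msa_srlr_h ((x + (1 << (n - 1))) >> n for
 * n > 0, x unchanged for n == 0) are illustration-only assumptions.
 */
static inline uint8_t avc_wgt_pixel_ref(uint8_t px, int32_t log2_denom,
                                        int32_t weight, int32_t offset)
{
    /* weight the sample and add the pre-scaled offset, as the vector code
       does with MUL/ADDS before the rounded shift */
    int32_t val = px * weight + (offset * (1 << log2_denom));

    if (val < 0)
        val = 0;
    if (log2_denom > 0)
        val = (val + (1 << (log2_denom - 1))) >> log2_denom;

    return val > 255 ? 255 : (uint8_t) val;
}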
50 
51 static void avc_wgt_4x4_msa(uint8_t *data, int32_t stride, int32_t log2_denom,
52  int32_t src_weight, int32_t offset_in)
53 {
54  uint32_t tp0, tp1, tp2, tp3, offset_val;
55  v16u8 src0 = { 0 };
56  v8i16 src0_r, src1_r, tmp0, tmp1, wgt, denom, offset;
57 
58  offset_val = (unsigned) offset_in << log2_denom;
59 
60  wgt = __msa_fill_h(src_weight);
61  offset = __msa_fill_h(offset_val);
62  denom = __msa_fill_h(log2_denom);
63 
64  LW4(data, stride, tp0, tp1, tp2, tp3);
65  INSERT_W4_UB(tp0, tp1, tp2, tp3, src0);
66  UNPCK_UB_SH(src0, src0_r, src1_r);
67  MUL2(wgt, src0_r, wgt, src1_r, tmp0, tmp1);
68  ADDS_SH2_SH(tmp0, offset, tmp1, offset, tmp0, tmp1);
69  MAXI_SH2_SH(tmp0, tmp1, 0);
70  tmp0 = __msa_srlr_h(tmp0, denom);
71  tmp1 = __msa_srlr_h(tmp1, denom);
72  SAT_UH2_SH(tmp0, tmp1, 7);
73  src0 = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
74  ST4x4_UB(src0, src0, 0, 1, 2, 3, data, stride);
75 }
76 
77 static void avc_wgt_4x8_msa(uint8_t *data, int32_t stride, int32_t log2_denom,
78  int32_t src_weight, int32_t offset_in)
79 {
80  uint32_t tp0, tp1, tp2, tp3, offset_val;
81  v16u8 src0 = { 0 }, src1 = { 0 };
82  v8i16 src0_r, src1_r, src2_r, src3_r, tmp0, tmp1, tmp2, tmp3;
83  v8i16 wgt, denom, offset;
84 
85  offset_val = (unsigned) offset_in << log2_denom;
86 
87  wgt = __msa_fill_h(src_weight);
88  offset = __msa_fill_h(offset_val);
89  denom = __msa_fill_h(log2_denom);
90 
91  LW4(data, stride, tp0, tp1, tp2, tp3);
92  INSERT_W4_UB(tp0, tp1, tp2, tp3, src0);
93  LW4(data + 4 * stride, stride, tp0, tp1, tp2, tp3);
94  INSERT_W4_UB(tp0, tp1, tp2, tp3, src1);
95  UNPCK_UB_SH(src0, src0_r, src1_r);
96  UNPCK_UB_SH(src1, src2_r, src3_r);
97  MUL4(wgt, src0_r, wgt, src1_r, wgt, src2_r, wgt, src3_r, tmp0, tmp1, tmp2,
98  tmp3);
99  ADDS_SH4_SH(tmp0, offset, tmp1, offset, tmp2, offset, tmp3, offset, tmp0,
100  tmp1, tmp2, tmp3);
101  MAXI_SH4_SH(tmp0, tmp1, tmp2, tmp3, 0);
102  SRLR_H4_SH(tmp0, tmp1, tmp2, tmp3, denom);
103  SAT_UH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
104  PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1);
105  ST4x8_UB(src0, src1, data, stride);
106 }
107 
108 static void avc_wgt_8x4_msa(uint8_t *data, int32_t stride, int32_t log2_denom,
109  int32_t src_weight, int32_t offset_in)
110 {
111  uint32_t offset_val;
112  uint64_t tp0, tp1, tp2, tp3;
113  v16u8 src0 = { 0 }, src1 = { 0 };
114  v8i16 src0_r, src1_r, src2_r, src3_r, tmp0, tmp1, tmp2, tmp3;
115  v8i16 wgt, denom, offset;
116 
117  offset_val = (unsigned) offset_in << log2_denom;
118 
119  wgt = __msa_fill_h(src_weight);
120  offset = __msa_fill_h(offset_val);
121  denom = __msa_fill_h(log2_denom);
122 
123  LD4(data, stride, tp0, tp1, tp2, tp3);
124  INSERT_D2_UB(tp0, tp1, src0);
125  INSERT_D2_UB(tp2, tp3, src1);
126  UNPCK_UB_SH(src0, src0_r, src1_r);
127  UNPCK_UB_SH(src1, src2_r, src3_r);
128  MUL4(wgt, src0_r, wgt, src1_r, wgt, src2_r, wgt, src3_r, tmp0, tmp1, tmp2,
129  tmp3);
130  ADDS_SH4_SH(tmp0, offset, tmp1, offset, tmp2, offset, tmp3, offset, tmp0,
131  tmp1, tmp2, tmp3);
132  MAXI_SH4_SH(tmp0, tmp1, tmp2, tmp3, 0);
133  SRLR_H4_SH(tmp0, tmp1, tmp2, tmp3, denom);
134  SAT_UH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
135  PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1);
136  ST8x4_UB(src0, src1, data, stride);
137 }
138 
139 static void avc_wgt_8x8_msa(uint8_t *data, int32_t stride, int32_t log2_denom,
140  int32_t src_weight, int32_t offset_in)
141 {
142  uint32_t offset_val;
143  uint64_t tp0, tp1, tp2, tp3;
144  v16u8 src0 = { 0 }, src1 = { 0 }, src2 = { 0 }, src3 = { 0 };
145  v8i16 src0_r, src1_r, src2_r, src3_r, src4_r, src5_r, src6_r, src7_r;
146  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
147  v8i16 wgt, denom, offset;
148 
149  offset_val = (unsigned) offset_in << log2_denom;
150 
151  wgt = __msa_fill_h(src_weight);
152  offset = __msa_fill_h(offset_val);
153  denom = __msa_fill_h(log2_denom);
154 
155  LD4(data, stride, tp0, tp1, tp2, tp3);
156  INSERT_D2_UB(tp0, tp1, src0);
157  INSERT_D2_UB(tp2, tp3, src1);
158  LD4(data + 4 * stride, stride, tp0, tp1, tp2, tp3);
159  INSERT_D2_UB(tp0, tp1, src2);
160  INSERT_D2_UB(tp2, tp3, src3);
161  UNPCK_UB_SH(src0, src0_r, src1_r);
162  UNPCK_UB_SH(src1, src2_r, src3_r);
163  UNPCK_UB_SH(src2, src4_r, src5_r);
164  UNPCK_UB_SH(src3, src6_r, src7_r);
165  MUL4(wgt, src0_r, wgt, src1_r, wgt, src2_r, wgt, src3_r, tmp0, tmp1, tmp2,
166  tmp3);
167  MUL4(wgt, src4_r, wgt, src5_r, wgt, src6_r, wgt, src7_r, tmp4, tmp5, tmp6,
168  tmp7);
169  ADDS_SH4_SH(tmp0, offset, tmp1, offset, tmp2, offset, tmp3, offset, tmp0,
170  tmp1, tmp2, tmp3);
171  ADDS_SH4_SH(tmp4, offset, tmp5, offset, tmp6, offset, tmp7, offset, tmp4,
172  tmp5, tmp6, tmp7);
173  MAXI_SH8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, 0);
174  SRLR_H8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, denom);
175  SAT_UH8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, 7);
176  PCKEV_B4_UB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6, src0, src1,
177  src2, src3);
178  ST8x8_UB(src0, src1, src2, src3, data, stride);
179 }
180 
181 static void avc_wgt_8x16_msa(uint8_t *data, int32_t stride, int32_t log2_denom,
182  int32_t src_weight, int32_t offset_in)
183 {
184  uint32_t offset_val, cnt;
185  uint64_t tp0, tp1, tp2, tp3;
186  v16u8 src0 = { 0 }, src1 = { 0 }, src2 = { 0 }, src3 = { 0 };
187  v8i16 src0_r, src1_r, src2_r, src3_r, src4_r, src5_r, src6_r, src7_r;
188  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
189  v8i16 wgt, denom, offset;
190 
191  offset_val = (unsigned) offset_in << log2_denom;
192 
193  wgt = __msa_fill_h(src_weight);
194  offset = __msa_fill_h(offset_val);
195  denom = __msa_fill_h(log2_denom);
196 
197  for (cnt = 2; cnt--;) {
198  LD4(data, stride, tp0, tp1, tp2, tp3);
199  INSERT_D2_UB(tp0, tp1, src0);
200  INSERT_D2_UB(tp2, tp3, src1);
201  LD4(data + 4 * stride, stride, tp0, tp1, tp2, tp3);
202  INSERT_D2_UB(tp0, tp1, src2);
203  INSERT_D2_UB(tp2, tp3, src3);
204  UNPCK_UB_SH(src0, src0_r, src1_r);
205  UNPCK_UB_SH(src1, src2_r, src3_r);
206  UNPCK_UB_SH(src2, src4_r, src5_r);
207  UNPCK_UB_SH(src3, src6_r, src7_r);
208  MUL4(wgt, src0_r, wgt, src1_r, wgt, src2_r, wgt, src3_r, tmp0, tmp1,
209  tmp2, tmp3);
210  MUL4(wgt, src4_r, wgt, src5_r, wgt, src6_r, wgt, src7_r, tmp4, tmp5,
211  tmp6, tmp7);
212  ADDS_SH4_SH(tmp0, offset, tmp1, offset, tmp2, offset, tmp3, offset,
213  tmp0, tmp1, tmp2, tmp3);
214  ADDS_SH4_SH(tmp4, offset, tmp5, offset, tmp6, offset, tmp7, offset,
215  tmp4, tmp5, tmp6, tmp7);
216  MAXI_SH8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, 0);
217  SRLR_H8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, denom);
218  SAT_UH8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, 7);
219  PCKEV_B4_UB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6, src0, src1,
220  src2, src3);
221  ST8x8_UB(src0, src1, src2, src3, data, stride);
222  data += 8 * stride;
223  }
224 }
225 
226 static void avc_biwgt_4x2_msa(uint8_t *src, uint8_t *dst, int32_t stride,
227  int32_t log2_denom, int32_t src_weight,
228  int32_t dst_weight, int32_t offset_in)
229 {
230  uint32_t tp0, tp1;
231  v16i8 src_wgt, dst_wgt, wgt, vec0;
232  v16u8 src0 = { 0 }, dst0 = { 0 };
233  v8i16 tmp0, denom, offset, max255 = __msa_ldi_h(255);
234 
235  offset_in = (unsigned) ((offset_in + 1) | 1) << log2_denom;
236  offset_in += (128 * (src_weight + dst_weight));
237 
238  src_wgt = __msa_fill_b(src_weight);
239  dst_wgt = __msa_fill_b(dst_weight);
240  offset = __msa_fill_h(offset_in);
241  denom = __msa_fill_h(log2_denom + 1);
242 
243  wgt = __msa_ilvev_b(dst_wgt, src_wgt);
244 
245  LW2(src, stride, tp0, tp1);
246  INSERT_W2_UB(tp0, tp1, src0);
247  LW2(dst, stride, tp0, tp1);
248  INSERT_W2_UB(tp0, tp1, dst0);
249  XORI_B2_128_UB(src0, dst0);
250  vec0 = (v16i8) __msa_ilvr_b((v16i8) dst0, (v16i8) src0);
251  tmp0 = __msa_dpadd_s_h(offset, wgt, vec0);
252  tmp0 >>= denom;
253  tmp0 = __msa_maxi_s_h(tmp0, 0);
254  tmp0 = __msa_min_s_h(max255, tmp0);
255  dst0 = (v16u8) __msa_pckev_b((v16i8) tmp0, (v16i8) tmp0);
256  ST4x2_UB(dst0, dst, stride);
257 }
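/*
 * Illustrative scalar model (not from the original source) of the per-pixel
 * H.264 bidirectional weighted prediction computed by the avc_biwgt_*
 * routines here.  The vector code works on (pixel - 128) values, produced by
 * the XORI_*_128 steps, and folds the 128 * (src_weight + dst_weight) bias
 * back into the offset, so the plain arithmetic below is equivalent.  The
 * helper name is an assumption.
 */
static inline uint8_t avc_biwgt_pixel_ref(uint8_t src_px, uint8_t dst_px,
                                          int32_t log2_denom,
                                          int32_t src_weight,
                                          int32_t dst_weight,
                                          int32_t offset_in)
{
    int32_t offset = ((offset_in + 1) | 1) << log2_denom;
    int32_t val = (src_px * src_weight + dst_px * dst_weight + offset)
                  >> (log2_denom + 1);

    if (val < 0)
        val = 0;
    return val > 255 ? 255 : (uint8_t) val;
}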
258 
259 static void avc_biwgt_4x4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
260  int32_t log2_denom, int32_t src_weight,
261  int32_t dst_weight, int32_t offset_in)
262 {
263  uint32_t tp0, tp1, tp2, tp3;
264  v16i8 src_wgt, dst_wgt, wgt, vec0, vec1;
265  v16u8 src0, dst0;
266  v8i16 tmp0, tmp1, denom, offset;
267 
268  offset_in = (unsigned) ((offset_in + 1) | 1) << log2_denom;
269  offset_in += (128 * (src_weight + dst_weight));
270 
271  src_wgt = __msa_fill_b(src_weight);
272  dst_wgt = __msa_fill_b(dst_weight);
273  offset = __msa_fill_h(offset_in);
274  denom = __msa_fill_h(log2_denom + 1);
275 
276  wgt = __msa_ilvev_b(dst_wgt, src_wgt);
277 
278  LW4(src, stride, tp0, tp1, tp2, tp3);
279  INSERT_W4_UB(tp0, tp1, tp2, tp3, src0);
280  LW4(dst, stride, tp0, tp1, tp2, tp3);
281  INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
282  XORI_B2_128_UB(src0, dst0);
283  ILVRL_B2_SB(dst0, src0, vec0, vec1);
284  tmp0 = __msa_dpadd_s_h(offset, wgt, vec0);
285  tmp1 = __msa_dpadd_s_h(offset, wgt, vec1);
286  tmp0 >>= denom;
287  tmp1 >>= denom;
288  CLIP_SH2_0_255(tmp0, tmp1);
289  dst0 = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
290  ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dst, stride);
291 }
292 
293 static void avc_biwgt_4x8_msa(uint8_t *src, uint8_t *dst, int32_t stride,
294  int32_t log2_denom, int32_t src_weight,
295  int32_t dst_weight, int32_t offset_in)
296 {
297  uint32_t tp0, tp1, tp2, tp3;
298  v16i8 src_wgt, dst_wgt, wgt, vec0, vec1, vec2, vec3;
299  v16u8 src0, src1, dst0, dst1;
300  v8i16 tmp0, tmp1, tmp2, tmp3, denom, offset;
301 
302  offset_in = (unsigned) ((offset_in + 1) | 1) << log2_denom;
303  offset_in += (128 * (src_weight + dst_weight));
304 
305  src_wgt = __msa_fill_b(src_weight);
306  dst_wgt = __msa_fill_b(dst_weight);
307  offset = __msa_fill_h(offset_in);
308  denom = __msa_fill_h(log2_denom + 1);
309  wgt = __msa_ilvev_b(dst_wgt, src_wgt);
310 
311  LW4(src, stride, tp0, tp1, tp2, tp3);
312  src += 4 * stride;
313  INSERT_W4_UB(tp0, tp1, tp2, tp3, src0);
314  LW4(src, stride, tp0, tp1, tp2, tp3);
315  INSERT_W4_UB(tp0, tp1, tp2, tp3, src1);
316  LW4(dst, stride, tp0, tp1, tp2, tp3);
317  INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
318  LW4(dst + 4 * stride, stride, tp0, tp1, tp2, tp3);
319  INSERT_W4_UB(tp0, tp1, tp2, tp3, dst1);
320  XORI_B4_128_UB(src0, src1, dst0, dst1);
321  ILVRL_B2_SB(dst0, src0, vec0, vec1);
322  ILVRL_B2_SB(dst1, src1, vec2, vec3);
323  tmp0 = __msa_dpadd_s_h(offset, wgt, vec0);
324  tmp1 = __msa_dpadd_s_h(offset, wgt, vec1);
325  tmp2 = __msa_dpadd_s_h(offset, wgt, vec2);
326  tmp3 = __msa_dpadd_s_h(offset, wgt, vec3);
327  SRA_4V(tmp0, tmp1, tmp2, tmp3, denom);
328  CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
329  PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, dst0, dst1);
330  ST4x8_UB(dst0, dst1, dst, stride);
331 }
332 
333 static void avc_biwgt_8x4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
334  int32_t log2_denom, int32_t src_weight,
335  int32_t dst_weight, int32_t offset_in)
336 {
337  uint64_t tp0, tp1, tp2, tp3;
338  v16i8 src_wgt, dst_wgt, wgt, vec0, vec1, vec2, vec3;
339  v16u8 src0, src1, dst0, dst1;
340  v8i16 tmp0, tmp1, tmp2, tmp3, denom, offset;
341 
342  offset_in = (unsigned) ((offset_in + 1) | 1) << log2_denom;
343  offset_in += (128 * (src_weight + dst_weight));
344 
345  src_wgt = __msa_fill_b(src_weight);
346  dst_wgt = __msa_fill_b(dst_weight);
347  offset = __msa_fill_h(offset_in);
348  denom = __msa_fill_h(log2_denom + 1);
349 
350  wgt = __msa_ilvev_b(dst_wgt, src_wgt);
351 
352  LD4(src, stride, tp0, tp1, tp2, tp3);
353  INSERT_D2_UB(tp0, tp1, src0);
354  INSERT_D2_UB(tp2, tp3, src1);
355  LD4(dst, stride, tp0, tp1, tp2, tp3);
356  INSERT_D2_UB(tp0, tp1, dst0);
357  INSERT_D2_UB(tp2, tp3, dst1);
358  XORI_B4_128_UB(src0, src1, dst0, dst1);
359  ILVRL_B2_SB(dst0, src0, vec0, vec1);
360  ILVRL_B2_SB(dst1, src1, vec2, vec3);
361  tmp0 = __msa_dpadd_s_h(offset, wgt, vec0);
362  tmp1 = __msa_dpadd_s_h(offset, wgt, vec1);
363  tmp2 = __msa_dpadd_s_h(offset, wgt, vec2);
364  tmp3 = __msa_dpadd_s_h(offset, wgt, vec3);
365  SRA_4V(tmp0, tmp1, tmp2, tmp3, denom);
366  CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
367  PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, dst0, dst1);
368  ST8x4_UB(dst0, dst1, dst, stride);
369 }
370 
371 static void avc_biwgt_8x8_msa(uint8_t *src, uint8_t *dst, int32_t stride,
372  int32_t log2_denom, int32_t src_weight,
373  int32_t dst_weight, int32_t offset_in)
374 {
375  uint64_t tp0, tp1, tp2, tp3;
376  v16i8 src_wgt, dst_wgt, wgt, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
377  v16u8 src0, src1, src2, src3, dst0, dst1, dst2, dst3;
378  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, denom, offset;
379 
380  offset_in = (unsigned) ((offset_in + 1) | 1) << log2_denom;
381  offset_in += (128 * (src_weight + dst_weight));
382 
383  src_wgt = __msa_fill_b(src_weight);
384  dst_wgt = __msa_fill_b(dst_weight);
385  offset = __msa_fill_h(offset_in);
386  denom = __msa_fill_h(log2_denom + 1);
387  wgt = __msa_ilvev_b(dst_wgt, src_wgt);
388 
389  LD4(src, stride, tp0, tp1, tp2, tp3);
390  INSERT_D2_UB(tp0, tp1, src0);
391  INSERT_D2_UB(tp2, tp3, src1);
392  LD4(src + 4 * stride, stride, tp0, tp1, tp2, tp3);
393  INSERT_D2_UB(tp0, tp1, src2);
394  INSERT_D2_UB(tp2, tp3, src3);
395  LD4(dst, stride, tp0, tp1, tp2, tp3);
396  INSERT_D2_UB(tp0, tp1, dst0);
397  INSERT_D2_UB(tp2, tp3, dst1);
398  LD4(dst + 4 * stride, stride, tp0, tp1, tp2, tp3);
399  INSERT_D2_UB(tp0, tp1, dst2);
400  INSERT_D2_UB(tp2, tp3, dst3);
401  XORI_B8_128_UB(src0, src1, src2, src3, dst0, dst1, dst2, dst3);
402  ILVRL_B2_SB(dst0, src0, vec0, vec1);
403  ILVRL_B2_SB(dst1, src1, vec2, vec3);
404  ILVRL_B2_SB(dst2, src2, vec4, vec5);
405  ILVRL_B2_SB(dst3, src3, vec6, vec7);
406  tmp0 = __msa_dpadd_s_h(offset, wgt, vec0);
407  tmp1 = __msa_dpadd_s_h(offset, wgt, vec1);
408  tmp2 = __msa_dpadd_s_h(offset, wgt, vec2);
409  tmp3 = __msa_dpadd_s_h(offset, wgt, vec3);
410  tmp4 = __msa_dpadd_s_h(offset, wgt, vec4);
411  tmp5 = __msa_dpadd_s_h(offset, wgt, vec5);
412  tmp6 = __msa_dpadd_s_h(offset, wgt, vec6);
413  tmp7 = __msa_dpadd_s_h(offset, wgt, vec7);
414  SRA_4V(tmp0, tmp1, tmp2, tmp3, denom);
415  SRA_4V(tmp4, tmp5, tmp6, tmp7, denom);
416  CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
417  CLIP_SH4_0_255(tmp4, tmp5, tmp6, tmp7);
418  PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, dst0, dst1);
419  PCKEV_B2_UB(tmp5, tmp4, tmp7, tmp6, dst2, dst3);
420  ST8x8_UB(dst0, dst1, dst2, dst3, dst, stride);
421 }
422 
423 static void avc_biwgt_8x16_msa(uint8_t *src, uint8_t *dst, int32_t stride,
424  int32_t log2_denom, int32_t src_weight,
425  int32_t dst_weight, int32_t offset_in)
426 {
427  uint8_t cnt;
428  uint64_t tp0, tp1, tp2, tp3;
429  v16i8 src_wgt, dst_wgt, wgt;
430  v16u8 src0, src1, src2, src3;
431  v16u8 dst0, dst1, dst2, dst3;
432  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
433  v8i16 temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
434  v8i16 denom, offset;
435 
436  offset_in = (unsigned) ((offset_in + 1) | 1) << log2_denom;
437  offset_in += (128 * (src_weight + dst_weight));
438 
439  src_wgt = __msa_fill_b(src_weight);
440  dst_wgt = __msa_fill_b(dst_weight);
441  offset = __msa_fill_h(offset_in);
442  denom = __msa_fill_h(log2_denom + 1);
443  wgt = __msa_ilvev_b(dst_wgt, src_wgt);
444 
445  for (cnt = 2; cnt--;) {
446  LD4(src, stride, tp0, tp1, tp2, tp3);
447  src += 4 * stride;
448  INSERT_D2_UB(tp0, tp1, src0);
449  INSERT_D2_UB(tp2, tp3, src1);
450  LD4(src, stride, tp0, tp1, tp2, tp3);
451  src += 4 * stride;
452  INSERT_D2_UB(tp0, tp1, src2);
453  INSERT_D2_UB(tp2, tp3, src3);
454  LD4(dst, stride, tp0, tp1, tp2, tp3);
455  INSERT_D2_UB(tp0, tp1, dst0);
456  INSERT_D2_UB(tp2, tp3, dst1);
457  LD4(dst + 4 * stride, stride, tp0, tp1, tp2, tp3);
458  INSERT_D2_UB(tp0, tp1, dst2);
459  INSERT_D2_UB(tp2, tp3, dst3);
460  XORI_B4_128_UB(src0, src1, src2, src3);
461  XORI_B4_128_UB(dst0, dst1, dst2, dst3);
462  ILVR_B4_SB(dst0, src0, dst1, src1, dst2, src2, dst3, src3,
463  vec0, vec2, vec4, vec6);
464  ILVL_B4_SB(dst0, src0, dst1, src1, dst2, src2, dst3, src3,
465  vec1, vec3, vec5, vec7);
466 
467  temp0 = __msa_dpadd_s_h(offset, wgt, vec0);
468  temp1 = __msa_dpadd_s_h(offset, wgt, vec1);
469  temp2 = __msa_dpadd_s_h(offset, wgt, vec2);
470  temp3 = __msa_dpadd_s_h(offset, wgt, vec3);
471  temp4 = __msa_dpadd_s_h(offset, wgt, vec4);
472  temp5 = __msa_dpadd_s_h(offset, wgt, vec5);
473  temp6 = __msa_dpadd_s_h(offset, wgt, vec6);
474  temp7 = __msa_dpadd_s_h(offset, wgt, vec7);
475 
476  SRA_4V(temp0, temp1, temp2, temp3, denom);
477  SRA_4V(temp4, temp5, temp6, temp7, denom);
478  CLIP_SH4_0_255(temp0, temp1, temp2, temp3);
479  CLIP_SH4_0_255(temp4, temp5, temp6, temp7);
480  PCKEV_B4_UB(temp1, temp0, temp3, temp2, temp5, temp4, temp7, temp6,
481  dst0, dst1, dst2, dst3);
482  ST8x8_UB(dst0, dst1, dst2, dst3, dst, stride);
483  dst += 8 * stride;
484  }
485 }
486 
487 #define AVC_LPF_P0P1P2_OR_Q0Q1Q2(p3_or_q3_org_in, p0_or_q0_org_in, \
488  q3_or_p3_org_in, p1_or_q1_org_in, \
489  p2_or_q2_org_in, q1_or_p1_org_in, \
490  p0_or_q0_out, p1_or_q1_out, p2_or_q2_out) \
491 { \
492  v8i16 threshold; \
493  v8i16 const3 = __msa_ldi_h(3); \
494  \
495  threshold = (p0_or_q0_org_in) + (q3_or_p3_org_in); \
496  threshold += (p1_or_q1_org_in); \
497  \
498  (p0_or_q0_out) = threshold << 1; \
499  (p0_or_q0_out) += (p2_or_q2_org_in); \
500  (p0_or_q0_out) += (q1_or_p1_org_in); \
501  (p0_or_q0_out) = __msa_srari_h((p0_or_q0_out), 3); \
502  \
503  (p1_or_q1_out) = (p2_or_q2_org_in) + threshold; \
504  (p1_or_q1_out) = __msa_srari_h((p1_or_q1_out), 2); \
505  \
506  (p2_or_q2_out) = (p2_or_q2_org_in) * const3; \
507  (p2_or_q2_out) += (p3_or_q3_org_in); \
508  (p2_or_q2_out) += (p3_or_q3_org_in); \
509  (p2_or_q2_out) += threshold; \
510  (p2_or_q2_out) = __msa_srari_h((p2_or_q2_out), 3); \
511 }
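/*
 * Reference note (added for clarity; the expansion below is not spelled out
 * in the original source): with the argument binding used by the callers
 * further down, the AVC_LPF_P0P1P2_OR_Q0Q1Q2 macro above produces the H.264
 * strong (bS = 4) luma filter taps for one edge side, e.g. for the p side:
 *
 *   p0' = (p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4) >> 3
 *   p1' = (p2 +   p1 +   p0 +   q0      + 2) >> 2
 *   p2' = (2*p3 + 3*p2 + p1 + p0 + q0   + 4) >> 3
 *
 * The q-side outputs follow by swapping the roles of p and q.
 */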
512 
513 /* data[-u32_img_width] = (uint8_t)((2 * p1 + p0 + q1 + 2) >> 2); */
514 #define AVC_LPF_P0_OR_Q0(p0_or_q0_org_in, q1_or_p1_org_in, \
515  p1_or_q1_org_in, p0_or_q0_out) \
516 { \
517  (p0_or_q0_out) = (p0_or_q0_org_in) + (q1_or_p1_org_in); \
518  (p0_or_q0_out) += (p1_or_q1_org_in); \
519  (p0_or_q0_out) += (p1_or_q1_org_in); \
520  (p0_or_q0_out) = __msa_srari_h((p0_or_q0_out), 2); \
521 }
522 
523 #define AVC_LPF_P1_OR_Q1(p0_or_q0_org_in, q0_or_p0_org_in, \
524  p1_or_q1_org_in, p2_or_q2_org_in, \
525  negate_tc_in, tc_in, p1_or_q1_out) \
526 { \
527  v8i16 clip3, temp; \
528  \
529  clip3 = (v8i16) __msa_aver_u_h((v8u16) p0_or_q0_org_in, \
530  (v8u16) q0_or_p0_org_in); \
531  temp = p1_or_q1_org_in << 1; \
532  clip3 = clip3 - temp; \
533  clip3 = __msa_ave_s_h(p2_or_q2_org_in, clip3); \
534  clip3 = CLIP_SH(clip3, negate_tc_in, tc_in); \
535  p1_or_q1_out = p1_or_q1_org_in + clip3; \
536 }
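/*
 * Reference note (added; not spelled out in the original source): the
 * AVC_LPF_P1_OR_Q1 macro above implements the H.264 p1/q1 update that is
 * applied when |p2 - p0| < beta (resp. |q2 - q0| < beta):
 *
 *   p1' = p1 + clip3(-tc0, tc0, (p2 + ((p0 + q0 + 1) >> 1) - 2*p1) >> 1)
 *
 * __msa_aver_u_h supplies the rounded (p0 + q0 + 1) >> 1 average and
 * __msa_ave_s_h the final, unrounded, shift by one.
 */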
537 
538 #define AVC_LPF_P0Q0(q0_or_p0_org_in, p0_or_q0_org_in, \
539  p1_or_q1_org_in, q1_or_p1_org_in, \
540  negate_threshold_in, threshold_in, \
541  p0_or_q0_out, q0_or_p0_out) \
542 { \
543  v8i16 q0_sub_p0, p1_sub_q1, delta; \
544  \
545  q0_sub_p0 = q0_or_p0_org_in - p0_or_q0_org_in; \
546  p1_sub_q1 = p1_or_q1_org_in - q1_or_p1_org_in; \
547  q0_sub_p0 <<= 2; \
548  p1_sub_q1 += 4; \
549  delta = q0_sub_p0 + p1_sub_q1; \
550  delta >>= 3; \
551  \
552  delta = CLIP_SH(delta, negate_threshold_in, threshold_in); \
553  \
554  p0_or_q0_out = p0_or_q0_org_in + delta; \
555  q0_or_p0_out = q0_or_p0_org_in - delta; \
556  \
557  CLIP_SH2_0_255(p0_or_q0_out, q0_or_p0_out); \
558 }
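/*
 * Reference note (added; not spelled out in the original source): the
 * AVC_LPF_P0Q0 macro above is the H.264 normal-filter (bS < 4) p0/q0 update,
 *
 *   delta = clip3(-tc, tc, ((q0 - p0) * 4 + (p1 - q1) + 4) >> 3)
 *   p0'   = clip_uint8(p0 + delta)
 *   q0'   = clip_uint8(q0 - delta)
 *
 * where tc is the threshold already widened by the callers for the p2/q2
 * side conditions.
 */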
559 
560 #define AVC_LPF_H_CHROMA_422(src, stride, tc_val, alpha, beta, res) \
561 { \
562  uint32_t load0, load1, load2, load3; \
563  v16u8 src0 = { 0 }; \
564  v16u8 src1 = { 0 }; \
565  v16u8 src2 = { 0 }; \
566  v16u8 src3 = { 0 }; \
567  v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0; \
568  v16u8 is_less_than, is_less_than_alpha, is_less_than_beta; \
569  v8i16 tc, q0_sub_p0, p1_sub_q1, delta; \
570  v8i16 res0_r, res1_r; \
571  v16i8 zeros = { 0 }; \
572  v16u8 res0, res1; \
573  \
574  LW4((src - 2), stride, load0, load1, load2, load3); \
575  src0 = (v16u8) __msa_insert_w((v4i32) src0, 0, load0); \
576  src1 = (v16u8) __msa_insert_w((v4i32) src1, 0, load1); \
577  src2 = (v16u8) __msa_insert_w((v4i32) src2, 0, load2); \
578  src3 = (v16u8) __msa_insert_w((v4i32) src3, 0, load3); \
579  \
580  TRANSPOSE4x4_UB_UB(src0, src1, src2, src3, src0, src1, src2, src3); \
581  \
582  p0_asub_q0 = __msa_asub_u_b(src2, src1); \
583  p1_asub_p0 = __msa_asub_u_b(src1, src0); \
584  q1_asub_q0 = __msa_asub_u_b(src2, src3); \
585  \
586  tc = __msa_fill_h(tc_val); \
587  \
588  is_less_than_alpha = (p0_asub_q0 < alpha); \
589  is_less_than_beta = (p1_asub_p0 < beta); \
590  is_less_than = is_less_than_alpha & is_less_than_beta; \
591  is_less_than_beta = (q1_asub_q0 < beta); \
592  is_less_than = is_less_than_beta & is_less_than; \
593  \
594  ILVR_B2_SH(src2, src1, src0, src3, q0_sub_p0, p1_sub_q1); \
595  HSUB_UB2_SH(q0_sub_p0, p1_sub_q1, q0_sub_p0, p1_sub_q1); \
596  \
597  q0_sub_p0 <<= 2; \
598  delta = q0_sub_p0 + p1_sub_q1; \
599  delta = __msa_srari_h(delta, 3); \
600  \
601  delta = CLIP_SH(delta, -tc, tc); \
602  \
603  ILVR_B2_SH(zeros, src1, zeros, src2, res0_r, res1_r); \
604  \
605  res0_r += delta; \
606  res1_r -= delta; \
607  \
608  CLIP_SH2_0_255(res0_r, res1_r); \
609  PCKEV_B2_UB(res0_r, res0_r, res1_r, res1_r, res0, res1); \
610  \
611  res0 = __msa_bmnz_v(src1, res0, is_less_than); \
612  res1 = __msa_bmnz_v(src2, res1, is_less_than); \
613  \
614  res = (v16u8) __msa_ilvr_b((v16i8) res1, (v16i8) res0); \
615 }
616 
617 #define TRANSPOSE2x4_B_UB(in0, in1, out0, out1, out2, out3) \
618 { \
619  v16i8 zero_m = { 0 }; \
620  \
621  out0 = (v16u8) __msa_ilvr_b((v16i8) in1, (v16i8) in0); \
622  out1 = (v16u8) __msa_sldi_b(zero_m, (v16i8) out0, 2); \
623  SLDI_B2_0_UB(out1, out2, out2, out3, 2); \
624 }
625 
626 #define AVC_LPF_H_2BYTE_CHROMA_422(src, stride, tc_val, alpha, beta, res) \
627 { \
628  uint32_t load0, load1; \
629  v16u8 src0 = { 0 }; \
630  v16u8 src1 = { 0 }; \
631  v16u8 src2 = { 0 }; \
632  v16u8 src3 = { 0 }; \
633  v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0; \
634  v16u8 is_less_than, is_less_than_alpha, is_less_than_beta; \
635  v8i16 tc, q0_sub_p0, p1_sub_q1, delta, res0_r, res1_r; \
636  v16i8 zeros = { 0 }; \
637  v16u8 res0, res1; \
638  \
639  load0 = LW(src - 2); \
640  load1 = LW(src - 2 + stride); \
641  \
642  src0 = (v16u8) __msa_insert_w((v4i32) src0, 0, load0); \
643  src1 = (v16u8) __msa_insert_w((v4i32) src1, 0, load1); \
644  \
645  TRANSPOSE2x4_B_UB(src0, src1, src0, src1, src2, src3); \
646  \
647  p0_asub_q0 = __msa_asub_u_b(src2, src1); \
648  p1_asub_p0 = __msa_asub_u_b(src1, src0); \
649  q1_asub_q0 = __msa_asub_u_b(src2, src3); \
650  \
651  tc = __msa_fill_h(tc_val); \
652  \
653  is_less_than_alpha = (p0_asub_q0 < alpha); \
654  is_less_than_beta = (p1_asub_p0 < beta); \
655  is_less_than = is_less_than_alpha & is_less_than_beta; \
656  is_less_than_beta = (q1_asub_q0 < beta); \
657  is_less_than = is_less_than_beta & is_less_than; \
658  \
659  ILVR_B2_SH(src2, src1, src0, src3, q0_sub_p0, p1_sub_q1); \
660  HSUB_UB2_SH(q0_sub_p0, p1_sub_q1, q0_sub_p0, p1_sub_q1); \
661  \
662  q0_sub_p0 <<= 2; \
663  delta = q0_sub_p0 + p1_sub_q1; \
664  delta = __msa_srari_h(delta, 3); \
665  delta = CLIP_SH(delta, -tc, tc); \
666  \
667  ILVR_B2_SH(zeros, src1, zeros, src2, res0_r, res1_r); \
668  \
669  res0_r += delta; \
670  res1_r -= delta; \
671  \
672  CLIP_SH2_0_255(res0_r, res1_r); \
673  PCKEV_B2_UB(res0_r, res0_r, res1_r, res1_r, res0, res1); \
674  \
675  res0 = __msa_bmnz_v(src1, res0, is_less_than); \
676  res1 = __msa_bmnz_v(src2, res1, is_less_than); \
677  \
678  res = (v16u8) __msa_ilvr_b((v16i8) res1, (v16i8) res0); \
679 }
680 
681 static void avc_loopfilter_luma_intra_edge_hor_msa(uint8_t *data,
682  uint8_t alpha_in,
683  uint8_t beta_in,
684  uint32_t img_width)
685 {
686  v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0;
687  v16u8 is_less_than, is_less_than_beta, is_less_than_alpha;
688  v16u8 p1_org, p0_org, q0_org, q1_org;
689 
690  LD_UB4(data - (img_width << 1), img_width, p1_org, p0_org, q0_org, q1_org);
691 
692  p0_asub_q0 = __msa_asub_u_b(p0_org, q0_org);
693  p1_asub_p0 = __msa_asub_u_b(p1_org, p0_org);
694  q1_asub_q0 = __msa_asub_u_b(q1_org, q0_org);
695 
696  is_less_than_alpha = (p0_asub_q0 < alpha_in);
697  is_less_than_beta = (p1_asub_p0 < beta_in);
698  is_less_than = is_less_than_beta & is_less_than_alpha;
699  is_less_than_beta = (q1_asub_q0 < beta_in);
700  is_less_than = is_less_than_beta & is_less_than;
701 
702  if (!__msa_test_bz_v(is_less_than)) {
703  v16u8 p2_asub_p0, q2_asub_q0, p0, q0, negate_is_less_than_beta;
704  v8i16 p0_r = { 0 };
705  v8i16 q0_r = { 0 };
706  v8i16 p0_l = { 0 };
707  v8i16 q0_l = { 0 };
708  v16i8 zero = { 0 };
709  v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r;
710  v8i16 p1_org_l, p0_org_l, q0_org_l, q1_org_l;
711  v16u8 q2_org = LD_UB(data + (2 * img_width));
712  v16u8 p2_org = LD_UB(data - (3 * img_width));
713  v16u8 tmp_flag = (v16u8)__msa_fill_b((alpha_in >> 2) + 2);
714 
715  UNPCK_UB_SH(p1_org, p1_org_r, p1_org_l);
716  UNPCK_UB_SH(p0_org, p0_org_r, p0_org_l);
717  UNPCK_UB_SH(q0_org, q0_org_r, q0_org_l);
718 
719  tmp_flag = (p0_asub_q0 < tmp_flag);
720 
721  p2_asub_p0 = __msa_asub_u_b(p2_org, p0_org);
722  is_less_than_beta = (p2_asub_p0 < beta_in);
723  is_less_than_beta = is_less_than_beta & tmp_flag;
724  negate_is_less_than_beta = __msa_xori_b(is_less_than_beta, 0xff);
725  is_less_than_beta = is_less_than_beta & is_less_than;
726  negate_is_less_than_beta = negate_is_less_than_beta & is_less_than;
727 
728  q1_org_r = (v8i16) __msa_ilvr_b(zero, (v16i8) q1_org);
729  q1_org_l = (v8i16) __msa_ilvl_b(zero, (v16i8) q1_org);
730 
731  /* combine and store */
732  if (!__msa_test_bz_v(is_less_than_beta)) {
733  v8i16 p3_org_l, p3_org_r;
734  v16u8 p3_org = LD_UB(data - (img_width << 2));
735  v16u8 p2, p1;
736  v8i16 p2_r = { 0 };
737  v8i16 p2_l = { 0 };
738  v8i16 p1_r = { 0 };
739  v8i16 p1_l = { 0 };
740 
741  ILVR_B2_SH(zero, p3_org, zero, p2_org, p3_org_r, p2_r);
742  AVC_LPF_P0P1P2_OR_Q0Q1Q2(p3_org_r, p0_org_r, q0_org_r, p1_org_r,
743  p2_r, q1_org_r, p0_r, p1_r, p2_r);
744 
745  ILVL_B2_SH(zero, p3_org, zero, p2_org, p3_org_l, p2_l);
746  AVC_LPF_P0P1P2_OR_Q0Q1Q2(p3_org_l, p0_org_l, q0_org_l, p1_org_l,
747  p2_l, q1_org_l, p0_l, p1_l, p2_l);
748 
749  PCKEV_B3_UB(p0_l, p0_r, p1_l, p1_r, p2_l, p2_r, p0, p1, p2);
750 
751  p0_org = __msa_bmnz_v(p0_org, p0, is_less_than_beta);
752  p1_org = __msa_bmnz_v(p1_org, p1, is_less_than_beta);
753  p2_org = __msa_bmnz_v(p2_org, p2, is_less_than_beta);
754 
755  ST_UB(p1_org, data - (2 * img_width));
756  ST_UB(p2_org, data - (3 * img_width));
757  }
758 
759  AVC_LPF_P0_OR_Q0(p0_org_r, q1_org_r, p1_org_r, p0_r);
760  AVC_LPF_P0_OR_Q0(p0_org_l, q1_org_l, p1_org_l, p0_l);
761 
762  /* combine */
763  p0 = (v16u8) __msa_pckev_b((v16i8) p0_l, (v16i8) p0_r);
764  p0_org = __msa_bmnz_v(p0_org, p0, negate_is_less_than_beta);
765 
766  ST_UB(p0_org, data - img_width);
767 
768  /* if (tmpFlag && (unsigned)ABS(q2-q0) < thresholds->beta_in) */
769  q2_asub_q0 = __msa_asub_u_b(q2_org, q0_org);
770  is_less_than_beta = (q2_asub_q0 < beta_in);
771  is_less_than_beta = is_less_than_beta & tmp_flag;
772  negate_is_less_than_beta = __msa_xori_b(is_less_than_beta, 0xff);
773  is_less_than_beta = is_less_than_beta & is_less_than;
774  negate_is_less_than_beta = negate_is_less_than_beta & is_less_than;
775 
776  /* combine and store */
777  if (!__msa_test_bz_v(is_less_than_beta)) {
778  v8i16 q3_org_r, q3_org_l;
779  v16u8 q3_org = LD_UB(data + (3 * img_width));
780  v16u8 q1, q2;
781  v8i16 q2_r = { 0 };
782  v8i16 q2_l = { 0 };
783  v8i16 q1_r = { 0 };
784  v8i16 q1_l = { 0 };
785 
786  ILVR_B2_SH(zero, q3_org, zero, q2_org, q3_org_r, q2_r);
787  AVC_LPF_P0P1P2_OR_Q0Q1Q2(q3_org_r, q0_org_r, p0_org_r, q1_org_r,
788  q2_r, p1_org_r, q0_r, q1_r, q2_r);
789 
790  ILVL_B2_SH(zero, q3_org, zero, q2_org, q3_org_l, q2_l);
791  AVC_LPF_P0P1P2_OR_Q0Q1Q2(q3_org_l, q0_org_l, p0_org_l, q1_org_l,
792  q2_l, p1_org_l, q0_l, q1_l, q2_l);
793 
794  PCKEV_B3_UB(q0_l, q0_r, q1_l, q1_r, q2_l, q2_r, q0, q1, q2);
795  q0_org = __msa_bmnz_v(q0_org, q0, is_less_than_beta);
796  q1_org = __msa_bmnz_v(q1_org, q1, is_less_than_beta);
797  q2_org = __msa_bmnz_v(q2_org, q2, is_less_than_beta);
798 
799  ST_UB(q1_org, data + img_width);
800  ST_UB(q2_org, data + 2 * img_width);
801  }
802 
803  AVC_LPF_P0_OR_Q0(q0_org_r, p1_org_r, q1_org_r, q0_r);
804  AVC_LPF_P0_OR_Q0(q0_org_l, p1_org_l, q1_org_l, q0_l);
805 
806  /* combine */
807  q0 = (v16u8) __msa_pckev_b((v16i8) q0_l, (v16i8) q0_r);
808  q0_org = __msa_bmnz_v(q0_org, q0, negate_is_less_than_beta);
809 
810  ST_UB(q0_org, data);
811  }
812 }
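/*
 * Illustrative scalar sketch (not part of the original source) of the
 * per-column decision logic that the intra (bS = 4) luma edge filter above
 * expresses with vector compare masks and __msa_bmnz_v selects.  The helper
 * name and the availability of FFABS (libavutil/common.h) in this translation
 * unit are assumptions made for the sketch only.
 */
static inline int avc_intra_luma_filter_mode(int p2, int p1, int p0,
                                             int q0, int q1,
                                             int alpha, int beta)
{
    if (!(FFABS(p0 - q0) < alpha &&
          FFABS(p1 - p0) < beta && FFABS(q1 - q0) < beta))
        return 0;  /* column left untouched */
    if (FFABS(p0 - q0) < (alpha >> 2) + 2 && FFABS(p2 - p0) < beta)
        return 2;  /* strong branch: p0, p1, p2 rewritten (AVC_LPF_P0P1P2_OR_Q0Q1Q2) */
    return 1;      /* weak branch: only p0 rewritten (AVC_LPF_P0_OR_Q0) */
}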
813 
814 static void avc_loopfilter_luma_intra_edge_ver_msa(uint8_t *data,
815  uint8_t alpha_in,
816  uint8_t beta_in,
817  uint32_t img_width)
818 {
819  uint8_t *src = data - 4;
820  v16u8 alpha, beta, p0_asub_q0;
821  v16u8 is_less_than_alpha, is_less_than, is_less_than_beta;
822  v16u8 p3_org, p2_org, p1_org, p0_org, q0_org, q1_org, q2_org, q3_org;
823  v16u8 p1_asub_p0, q1_asub_q0;
824 
825 
826  {
827  v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
828  v16u8 row8, row9, row10, row11, row12, row13, row14, row15;
829 
830  LD_UB8(src, img_width, row0, row1, row2, row3, row4, row5, row6, row7);
831  LD_UB8(src + (8 * img_width), img_width,
832  row8, row9, row10, row11, row12, row13, row14, row15);
833 
834  TRANSPOSE16x8_UB_UB(row0, row1, row2, row3,
835  row4, row5, row6, row7,
836  row8, row9, row10, row11,
837  row12, row13, row14, row15,
838  p3_org, p2_org, p1_org, p0_org,
839  q0_org, q1_org, q2_org, q3_org);
840  }
841 
842  p0_asub_q0 = __msa_asub_u_b(p0_org, q0_org);
843  p1_asub_p0 = __msa_asub_u_b(p1_org, p0_org);
844  q1_asub_q0 = __msa_asub_u_b(q1_org, q0_org);
845 
846  alpha = (v16u8) __msa_fill_b(alpha_in);
847  beta = (v16u8) __msa_fill_b(beta_in);
848 
849  is_less_than_alpha = (p0_asub_q0 < alpha);
850  is_less_than_beta = (p1_asub_p0 < beta);
851  is_less_than = is_less_than_beta & is_less_than_alpha;
852  is_less_than_beta = (q1_asub_q0 < beta);
853  is_less_than = is_less_than_beta & is_less_than;
854 
855  if (!__msa_test_bz_v(is_less_than)) {
856  v8i16 p0_r = { 0 };
857  v8i16 q0_r = { 0 };
858  v8i16 p0_l = { 0 };
859  v8i16 q0_l = { 0 };
860  v16i8 zero = { 0 };
861  v16u8 tmp_flag, p0, q0, p2_asub_p0, q2_asub_q0;
862  v16u8 negate_is_less_than_beta;
863  v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r;
864  v8i16 p1_org_l, p0_org_l, q0_org_l, q1_org_l;
865 
866  UNPCK_UB_SH(p1_org, p1_org_r, p1_org_l);
867  UNPCK_UB_SH(p0_org, p0_org_r, p0_org_l);
868  UNPCK_UB_SH(q0_org, q0_org_r, q0_org_l);
869  UNPCK_UB_SH(q1_org, q1_org_r, q1_org_l);
870 
871  tmp_flag = alpha >> 2;
872  tmp_flag = tmp_flag + 2;
873  tmp_flag = (p0_asub_q0 < tmp_flag);
874 
875  p2_asub_p0 = __msa_asub_u_b(p2_org, p0_org);
876  is_less_than_beta = (p2_asub_p0 < beta);
877  is_less_than_beta = tmp_flag & is_less_than_beta;
878  negate_is_less_than_beta = __msa_xori_b(is_less_than_beta, 0xff);
879  is_less_than_beta = is_less_than_beta & is_less_than;
880  negate_is_less_than_beta = negate_is_less_than_beta & is_less_than;
881 
882  if (!__msa_test_bz_v(is_less_than_beta)) {
883  v16u8 p2, p1;
884  v8i16 p3_org_r, p3_org_l;
885  v8i16 p2_l = { 0 };
886  v8i16 p2_r = { 0 };
887  v8i16 p1_l = { 0 };
888  v8i16 p1_r = { 0 };
889 
890  ILVR_B2_SH(zero, p3_org, zero, p2_org, p3_org_r, p2_r);
891  AVC_LPF_P0P1P2_OR_Q0Q1Q2(p3_org_r, p0_org_r, q0_org_r, p1_org_r,
892  p2_r, q1_org_r, p0_r, p1_r, p2_r);
893 
894  ILVL_B2_SH(zero, p3_org, zero, p2_org, p3_org_l, p2_l);
895  AVC_LPF_P0P1P2_OR_Q0Q1Q2(p3_org_l, p0_org_l, q0_org_l, p1_org_l,
896  p2_l, q1_org_l, p0_l, p1_l, p2_l);
897 
898  PCKEV_B3_UB(p0_l, p0_r, p1_l, p1_r, p2_l, p2_r, p0, p1, p2);
899  p0_org = __msa_bmnz_v(p0_org, p0, is_less_than_beta);
900  p1_org = __msa_bmnz_v(p1_org, p1, is_less_than_beta);
901  p2_org = __msa_bmnz_v(p2_org, p2, is_less_than_beta);
902  }
903 
904  AVC_LPF_P0_OR_Q0(p0_org_r, q1_org_r, p1_org_r, p0_r);
905  AVC_LPF_P0_OR_Q0(p0_org_l, q1_org_l, p1_org_l, p0_l);
906 
907  p0 = (v16u8) __msa_pckev_b((v16i8) p0_l, (v16i8) p0_r);
908  p0_org = __msa_bmnz_v(p0_org, p0, negate_is_less_than_beta);
909 
910  q2_asub_q0 = __msa_asub_u_b(q2_org, q0_org);
911  is_less_than_beta = (q2_asub_q0 < beta);
912 
913  is_less_than_beta = is_less_than_beta & tmp_flag;
914  negate_is_less_than_beta = __msa_xori_b(is_less_than_beta, 0xff);
915 
916  is_less_than_beta = is_less_than_beta & is_less_than;
917  negate_is_less_than_beta = negate_is_less_than_beta & is_less_than;
918 
919  if (!__msa_test_bz_v(is_less_than_beta)) {
920  v16u8 q1, q2;
921  v8i16 q3_org_r, q3_org_l;
922  v8i16 q1_l = { 0 };
923  v8i16 q1_r = { 0 };
924  v8i16 q2_l = { 0 };
925  v8i16 q2_r = { 0 };
926 
927  ILVR_B2_SH(zero, q3_org, zero, q2_org, q3_org_r, q2_r);
928  AVC_LPF_P0P1P2_OR_Q0Q1Q2(q3_org_r, q0_org_r, p0_org_r, q1_org_r,
929  q2_r, p1_org_r, q0_r, q1_r, q2_r);
930 
931  ILVL_B2_SH(zero, q3_org, zero, q2_org, q3_org_l, q2_l);
932  AVC_LPF_P0P1P2_OR_Q0Q1Q2(q3_org_l, q0_org_l, p0_org_l, q1_org_l,
933  q2_l, p1_org_l, q0_l, q1_l, q2_l);
934 
935  PCKEV_B3_UB(q0_l, q0_r, q1_l, q1_r, q2_l, q2_r, q0, q1, q2);
936  q0_org = __msa_bmnz_v(q0_org, q0, is_less_than_beta);
937  q1_org = __msa_bmnz_v(q1_org, q1, is_less_than_beta);
938  q2_org = __msa_bmnz_v(q2_org, q2, is_less_than_beta);
939  }
940 
941  AVC_LPF_P0_OR_Q0(q0_org_r, p1_org_r, q1_org_r, q0_r);
942  AVC_LPF_P0_OR_Q0(q0_org_l, p1_org_l, q1_org_l, q0_l);
943 
944  q0 = (v16u8) __msa_pckev_b((v16i8) q0_l, (v16i8) q0_r);
945  q0_org = __msa_bmnz_v(q0_org, q0, negate_is_less_than_beta);
946 
947  {
948  v8i16 tp0, tp1, tp2, tp3, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
949 
950  ILVRL_B2_SH(p1_org, p2_org, tp0, tp2);
951  ILVRL_B2_SH(q0_org, p0_org, tp1, tp3);
952  ILVRL_B2_SH(q2_org, q1_org, tmp2, tmp5);
953 
954  ILVRL_H2_SH(tp1, tp0, tmp3, tmp4);
955  ILVRL_H2_SH(tp3, tp2, tmp6, tmp7);
956 
957  src = data - 3;
958  ST4x4_UB(tmp3, tmp3, 0, 1, 2, 3, src, img_width);
959  ST2x4_UB(tmp2, 0, src + 4, img_width);
960  src += 4 * img_width;
961  ST4x4_UB(tmp4, tmp4, 0, 1, 2, 3, src, img_width);
962  ST2x4_UB(tmp2, 4, src + 4, img_width);
963  src += 4 * img_width;
964 
965  ST4x4_UB(tmp6, tmp6, 0, 1, 2, 3, src, img_width);
966  ST2x4_UB(tmp5, 0, src + 4, img_width);
967  src += 4 * img_width;
968  ST4x4_UB(tmp7, tmp7, 0, 1, 2, 3, src, img_width);
969  ST2x4_UB(tmp5, 4, src + 4, img_width);
970  }
971  }
972 }
973 
974 static void avc_h_loop_filter_luma_mbaff_intra_msa(uint8_t *src, int32_t stride,
975  int32_t alpha_in,
976  int32_t beta_in)
977 {
978  uint64_t load0, load1;
979  uint32_t out0, out2;
980  uint16_t out1, out3;
981  v8u16 src0_r, src1_r, src2_r, src3_r, src4_r, src5_r, src6_r, src7_r;
982  v8u16 dst0_r, dst1_r, dst4_r, dst5_r;
983  v8u16 dst2_x_r, dst2_y_r, dst3_x_r, dst3_y_r;
984  v16u8 dst0, dst1, dst4, dst5, dst2_x, dst2_y, dst3_x, dst3_y;
985  v8i16 tmp0, tmp1, tmp2, tmp3;
986  v16u8 alpha, beta;
987  v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0, p2_asub_p0, q2_asub_q0;
988  v16u8 is_less_than, is_less_than_alpha, is_less_than_beta;
989  v16u8 is_less_than_beta1, is_less_than_beta2;
990  v16i8 src0 = { 0 };
991  v16i8 src1 = { 0 };
992  v16i8 src2 = { 0 };
993  v16i8 src3 = { 0 };
994  v16i8 src4 = { 0 };
995  v16i8 src5 = { 0 };
996  v16i8 src6 = { 0 };
997  v16i8 src7 = { 0 };
998  v16i8 zeros = { 0 };
999 
1000  load0 = LD(src - 4);
1001  load1 = LD(src + stride - 4);
1002  src0 = (v16i8) __msa_insert_d((v2i64) src0, 0, load0);
1003  src1 = (v16i8) __msa_insert_d((v2i64) src1, 0, load1);
1004 
1005  load0 = LD(src + (2 * stride) - 4);
1006  load1 = LD(src + (3 * stride) - 4);
1007  src2 = (v16i8) __msa_insert_d((v2i64) src2, 0, load0);
1008  src3 = (v16i8) __msa_insert_d((v2i64) src3, 0, load1);
1009 
1010  load0 = LD(src + (4 * stride) - 4);
1011  load1 = LD(src + (5 * stride) - 4);
1012  src4 = (v16i8) __msa_insert_d((v2i64) src4, 0, load0);
1013  src5 = (v16i8) __msa_insert_d((v2i64) src5, 0, load1);
1014 
1015  load0 = LD(src + (6 * stride) - 4);
1016  load1 = LD(src + (7 * stride) - 4);
1017  src6 = (v16i8) __msa_insert_d((v2i64) src6, 0, load0);
1018  src7 = (v16i8) __msa_insert_d((v2i64) src7, 0, load1);
1019 
1020  ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src7, src6,
1021  src0, src1, src2, src3);
1022 
1023  ILVR_H2_SH(src1, src0, src3, src2, tmp0, tmp2);
1024  ILVL_H2_SH(src1, src0, src3, src2, tmp1, tmp3);
1025 
1026  ILVR_W2_SB(tmp2, tmp0, tmp3, tmp1, src6, src3);
1027  ILVL_W2_SB(tmp2, tmp0, tmp3, tmp1, src1, src5);
1028  SLDI_B4_0_SB(src6, src1, src3, src5, src0, src2, src4, src7, 8);
1029 
1030  p0_asub_q0 = __msa_asub_u_b((v16u8) src2, (v16u8) src3);
1031  p1_asub_p0 = __msa_asub_u_b((v16u8) src1, (v16u8) src2);
1032  q1_asub_q0 = __msa_asub_u_b((v16u8) src4, (v16u8) src3);
1033 
1034  alpha = (v16u8) __msa_fill_b(alpha_in);
1035  beta = (v16u8) __msa_fill_b(beta_in);
1036 
1037  is_less_than_alpha = (p0_asub_q0 < alpha);
1038  is_less_than_beta = (p1_asub_p0 < beta);
1039  is_less_than = is_less_than_alpha & is_less_than_beta;
1040  is_less_than_beta = (q1_asub_q0 < beta);
1041  is_less_than = is_less_than & is_less_than_beta;
1042 
1043  alpha >>= 2;
1044  alpha += 2;
1045 
1046  is_less_than_alpha = (p0_asub_q0 < alpha);
1047 
1048  p2_asub_p0 = __msa_asub_u_b((v16u8) src0, (v16u8) src2);
1049  is_less_than_beta1 = (p2_asub_p0 < beta);
1050  q2_asub_q0 = __msa_asub_u_b((v16u8) src5, (v16u8) src3);
1051  is_less_than_beta2 = (q2_asub_q0 < beta);
1052 
1053  ILVR_B4_UH(zeros, src0, zeros, src1, zeros, src2, zeros, src3,
1054  src0_r, src1_r, src2_r, src3_r);
1055  ILVR_B4_UH(zeros, src4, zeros, src5, zeros, src6, zeros, src7,
1056  src4_r, src5_r, src6_r, src7_r);
1057 
1058  dst2_x_r = src1_r + src2_r + src3_r;
1059  dst2_x_r = src0_r + (2 * (dst2_x_r)) + src4_r;
1060  dst2_x_r = (v8u16) __msa_srari_h((v8i16) dst2_x_r, 3);
1061  dst1_r = src0_r + src1_r + src2_r + src3_r;
1062  dst1_r = (v8u16) __msa_srari_h((v8i16) dst1_r, 2);
1063 
1064  dst0_r = (2 * src6_r) + (3 * src0_r);
1065  dst0_r += src1_r + src2_r + src3_r;
1066  dst0_r = (v8u16) __msa_srari_h((v8i16) dst0_r, 3);
1067  dst2_y_r = (2 * src1_r) + src2_r + src4_r;
1068  dst2_y_r = (v8u16) __msa_srari_h((v8i16) dst2_y_r, 2);
1069 
1070  PCKEV_B2_UB(dst2_x_r, dst2_x_r, dst2_y_r, dst2_y_r, dst2_x, dst2_y);
1071  dst2_x = __msa_bmnz_v(dst2_y, dst2_x, is_less_than_beta1);
1072 
1073  dst3_x_r = src2_r + src3_r + src4_r;
1074  dst3_x_r = src1_r + (2 * dst3_x_r) + src5_r;
1075  dst3_x_r = (v8u16) __msa_srari_h((v8i16) dst3_x_r, 3);
1076  dst4_r = src2_r + src3_r + src4_r + src5_r;
1077  dst4_r = (v8u16) __msa_srari_h((v8i16) dst4_r, 2);
1078 
1079  dst5_r = (2 * src7_r) + (3 * src5_r);
1080  dst5_r += src4_r + src3_r + src2_r;
1081  dst5_r = (v8u16) __msa_srari_h((v8i16) dst5_r, 3);
1082  dst3_y_r = (2 * src4_r) + src3_r + src1_r;
1083  dst3_y_r = (v8u16) __msa_srari_h((v8i16) dst3_y_r, 2);
1084 
1085  PCKEV_B2_UB(dst3_x_r, dst3_x_r, dst3_y_r, dst3_y_r, dst3_x, dst3_y);
1086  dst3_x = __msa_bmnz_v(dst3_y, dst3_x, is_less_than_beta2);
1087 
1088  dst2_y_r = (2 * src1_r) + src2_r + src4_r;
1089  dst2_y_r = (v8u16) __msa_srari_h((v8i16) dst2_y_r, 2);
1090  dst3_y_r = (2 * src4_r) + src3_r + src1_r;
1091  dst3_y_r = (v8u16) __msa_srari_h((v8i16) dst3_y_r, 2);
1092 
1093  PCKEV_B2_UB(dst2_y_r, dst2_y_r, dst3_y_r, dst3_y_r, dst2_y, dst3_y);
1094 
1095  dst2_x = __msa_bmnz_v(dst2_y, dst2_x, is_less_than_alpha);
1096  dst3_x = __msa_bmnz_v(dst3_y, dst3_x, is_less_than_alpha);
1097  dst2_x = __msa_bmnz_v((v16u8) src2, dst2_x, is_less_than);
1098  dst3_x = __msa_bmnz_v((v16u8) src3, dst3_x, is_less_than);
1099 
1100  is_less_than = is_less_than_alpha & is_less_than;
1101  dst1 = (v16u8) __msa_pckev_b((v16i8) dst1_r, (v16i8) dst1_r);
1102  is_less_than_beta1 = is_less_than_beta1 & is_less_than;
1103  dst1 = __msa_bmnz_v((v16u8) src1, dst1, is_less_than_beta1);
1104 
1105  dst0 = (v16u8) __msa_pckev_b((v16i8) dst0_r, (v16i8) dst0_r);
1106  dst0 = __msa_bmnz_v((v16u8) src0, dst0, is_less_than_beta1);
1107  dst4 = (v16u8) __msa_pckev_b((v16i8) dst4_r, (v16i8) dst4_r);
1108  is_less_than_beta2 = is_less_than_beta2 & is_less_than;
1109  dst4 = __msa_bmnz_v((v16u8) src4, dst4, is_less_than_beta2);
1110  dst5 = (v16u8) __msa_pckev_b((v16i8) dst5_r, (v16i8) dst5_r);
1111  dst5 = __msa_bmnz_v((v16u8) src5, dst5, is_less_than_beta2);
1112 
1113  ILVR_B2_UB(dst1, dst0, dst3_x, dst2_x, dst0, dst1);
1114  dst2_x = (v16u8) __msa_ilvr_b((v16i8) dst5, (v16i8) dst4);
1115  ILVRL_H2_SH(dst1, dst0, tmp0, tmp1);
1116  ILVRL_H2_SH(zeros, dst2_x, tmp2, tmp3);
1117 
1118  ILVR_W2_UB(tmp2, tmp0, tmp3, tmp1, dst0, dst4);
1119  SLDI_B2_0_UB(dst0, dst4, dst1, dst5, 8);
1120  dst2_x = (v16u8) __msa_ilvl_w((v4i32) tmp2, (v4i32) tmp0);
1121  dst2_y = (v16u8) __msa_ilvl_w((v4i32) tmp3, (v4i32) tmp1);
1122  SLDI_B2_0_UB(dst2_x, dst2_y, dst3_x, dst3_y, 8);
1123 
1124  out0 = __msa_copy_u_w((v4i32) dst0, 0);
1125  out1 = __msa_copy_u_h((v8i16) dst0, 2);
1126  out2 = __msa_copy_u_w((v4i32) dst1, 0);
1127  out3 = __msa_copy_u_h((v8i16) dst1, 2);
1128 
1129  SW(out0, (src - 3));
1130  SH(out1, (src + 1));
1131  src += stride;
1132  SW(out2, (src - 3));
1133  SH(out3, (src + 1));
1134  src += stride;
1135 
1136  out0 = __msa_copy_u_w((v4i32) dst2_x, 0);
1137  out1 = __msa_copy_u_h((v8i16) dst2_x, 2);
1138  out2 = __msa_copy_u_w((v4i32) dst3_x, 0);
1139  out3 = __msa_copy_u_h((v8i16) dst3_x, 2);
1140 
1141  SW(out0, (src - 3));
1142  SH(out1, (src + 1));
1143  src += stride;
1144  SW(out2, (src - 3));
1145  SH(out3, (src + 1));
1146  src += stride;
1147 
1148  out0 = __msa_copy_u_w((v4i32) dst4, 0);
1149  out1 = __msa_copy_u_h((v8i16) dst4, 2);
1150  out2 = __msa_copy_u_w((v4i32) dst5, 0);
1151  out3 = __msa_copy_u_h((v8i16) dst5, 2);
1152 
1153  SW(out0, (src - 3));
1154  SH(out1, (src + 1));
1155  src += stride;
1156  SW(out2, (src - 3));
1157  SH(out3, (src + 1));
1158  src += stride;
1159 
1160  out0 = __msa_copy_u_w((v4i32) dst2_y, 0);
1161  out1 = __msa_copy_u_h((v8i16) dst2_y, 2);
1162  out2 = __msa_copy_u_w((v4i32) dst3_y, 0);
1163  out3 = __msa_copy_u_h((v8i16) dst3_y, 2);
1164 
1165  SW(out0, (src - 3));
1166  SH(out1, (src + 1));
1167  src += stride;
1168  SW(out2, (src - 3));
1169  SH(out3, (src + 1));
1170 }
1171 
1172 static void avc_loopfilter_cb_or_cr_intra_edge_hor_msa(uint8_t *data_cb_or_cr,
1173  uint8_t alpha_in,
1174  uint8_t beta_in,
1175  uint32_t img_width)
1176 {
1177  v16u8 alpha, beta;
1178  v16u8 is_less_than;
1179  v8i16 p0_or_q0, q0_or_p0;
1180  v16u8 p1_or_q1_org, p0_or_q0_org, q0_or_p0_org, q1_or_p1_org;
1181  v16i8 zero = { 0 };
1182  v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0;
1183  v16u8 is_less_than_alpha, is_less_than_beta;
1184  v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r;
1185 
1186  alpha = (v16u8) __msa_fill_b(alpha_in);
1187  beta = (v16u8) __msa_fill_b(beta_in);
1188 
1189  LD_UB4(data_cb_or_cr - (img_width << 1), img_width,
1190  p1_or_q1_org, p0_or_q0_org, q0_or_p0_org, q1_or_p1_org);
1191 
1192  p0_asub_q0 = __msa_asub_u_b(p0_or_q0_org, q0_or_p0_org);
1193  p1_asub_p0 = __msa_asub_u_b(p1_or_q1_org, p0_or_q0_org);
1194  q1_asub_q0 = __msa_asub_u_b(q1_or_p1_org, q0_or_p0_org);
1195 
1196  is_less_than_alpha = (p0_asub_q0 < alpha);
1197  is_less_than_beta = (p1_asub_p0 < beta);
1198  is_less_than = is_less_than_beta & is_less_than_alpha;
1199  is_less_than_beta = (q1_asub_q0 < beta);
1200  is_less_than = is_less_than_beta & is_less_than;
1201 
1202  is_less_than = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) is_less_than);
1203 
1204  if (!__msa_test_bz_v(is_less_than)) {
1205  ILVR_B4_SH(zero, p1_or_q1_org, zero, p0_or_q0_org, zero, q0_or_p0_org,
1206  zero, q1_or_p1_org, p1_org_r, p0_org_r, q0_org_r, q1_org_r);
1207  AVC_LPF_P0_OR_Q0(p0_org_r, q1_org_r, p1_org_r, p0_or_q0);
1208  AVC_LPF_P0_OR_Q0(q0_org_r, p1_org_r, q1_org_r, q0_or_p0);
1209  PCKEV_B2_SH(zero, p0_or_q0, zero, q0_or_p0, p0_or_q0, q0_or_p0);
1210 
1211  p0_or_q0_org =
1212  __msa_bmnz_v(p0_or_q0_org, (v16u8) p0_or_q0, is_less_than);
1213  q0_or_p0_org =
1214  __msa_bmnz_v(q0_or_p0_org, (v16u8) q0_or_p0, is_less_than);
1215 
1216  ST_UB(q0_or_p0_org, data_cb_or_cr);
1217  ST_UB(p0_or_q0_org, data_cb_or_cr - img_width);
1218  }
1219 }
1220 
1221 static void avc_loopfilter_cb_or_cr_intra_edge_ver_msa(uint8_t *data_cb_or_cr,
1222  uint8_t alpha_in,
1223  uint8_t beta_in,
1224  uint32_t img_width)
1225 {
1226  v8i16 tmp1;
1227  v16u8 alpha, beta, is_less_than;
1228  v8i16 p0_or_q0, q0_or_p0;
1229  v16u8 p1_or_q1_org, p0_or_q0_org, q0_or_p0_org, q1_or_p1_org;
1230  v16i8 zero = { 0 };
1231  v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0;
1232  v16u8 is_less_than_alpha, is_less_than_beta;
1233  v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r;
1234 
1235  {
1236  v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
1237 
1238  LD_UB8((data_cb_or_cr - 2), img_width,
1239  row0, row1, row2, row3, row4, row5, row6, row7);
1240 
1241  TRANSPOSE8x4_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7,
1242  p1_or_q1_org, p0_or_q0_org,
1243  q0_or_p0_org, q1_or_p1_org);
1244  }
1245 
1246  alpha = (v16u8) __msa_fill_b(alpha_in);
1247  beta = (v16u8) __msa_fill_b(beta_in);
1248 
1249  p0_asub_q0 = __msa_asub_u_b(p0_or_q0_org, q0_or_p0_org);
1250  p1_asub_p0 = __msa_asub_u_b(p1_or_q1_org, p0_or_q0_org);
1251  q1_asub_q0 = __msa_asub_u_b(q1_or_p1_org, q0_or_p0_org);
1252 
1253  is_less_than_alpha = (p0_asub_q0 < alpha);
1254  is_less_than_beta = (p1_asub_p0 < beta);
1255  is_less_than = is_less_than_beta & is_less_than_alpha;
1256  is_less_than_beta = (q1_asub_q0 < beta);
1257  is_less_than = is_less_than_beta & is_less_than;
1258  is_less_than = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) is_less_than);
1259 
1260  if (!__msa_test_bz_v(is_less_than)) {
1261  ILVR_B4_SH(zero, p1_or_q1_org, zero, p0_or_q0_org, zero, q0_or_p0_org,
1262  zero, q1_or_p1_org, p1_org_r, p0_org_r, q0_org_r, q1_org_r);
1263 
1264  AVC_LPF_P0_OR_Q0(p0_org_r, q1_org_r, p1_org_r, p0_or_q0);
1265  AVC_LPF_P0_OR_Q0(q0_org_r, p1_org_r, q1_org_r, q0_or_p0);
1266 
1267  /* convert 16 bit output into 8 bit output */
1268  PCKEV_B2_SH(zero, p0_or_q0, zero, q0_or_p0, p0_or_q0, q0_or_p0);
1269 
1270  p0_or_q0_org =
1271  __msa_bmnz_v(p0_or_q0_org, (v16u8) p0_or_q0, is_less_than);
1272  q0_or_p0_org =
1273  __msa_bmnz_v(q0_or_p0_org, (v16u8) q0_or_p0, is_less_than);
1274  tmp1 = (v8i16) __msa_ilvr_b((v16i8) q0_or_p0_org, (v16i8) p0_or_q0_org);
1275 
1276  data_cb_or_cr -= 1;
1277  ST2x4_UB(tmp1, 0, data_cb_or_cr, img_width);
1278  data_cb_or_cr += 4 * img_width;
1279  ST2x4_UB(tmp1, 4, data_cb_or_cr, img_width);
1280  }
1281 }
1282 
1283 static void avc_loopfilter_luma_inter_edge_ver_msa(uint8_t *data,
1284  uint8_t bs0, uint8_t bs1,
1285  uint8_t bs2, uint8_t bs3,
1286  uint8_t tc0, uint8_t tc1,
1287  uint8_t tc2, uint8_t tc3,
1288  uint8_t alpha_in,
1289  uint8_t beta_in,
1290  uint32_t img_width)
1291 {
1292  v16u8 tmp_vec, bs = { 0 };
1293 
1294  tmp_vec = (v16u8) __msa_fill_b(bs0);
1295  bs = (v16u8) __msa_insve_w((v4i32) bs, 0, (v4i32) tmp_vec);
1296  tmp_vec = (v16u8) __msa_fill_b(bs1);
1297  bs = (v16u8) __msa_insve_w((v4i32) bs, 1, (v4i32) tmp_vec);
1298  tmp_vec = (v16u8) __msa_fill_b(bs2);
1299  bs = (v16u8) __msa_insve_w((v4i32) bs, 2, (v4i32) tmp_vec);
1300  tmp_vec = (v16u8) __msa_fill_b(bs3);
1301  bs = (v16u8) __msa_insve_w((v4i32) bs, 3, (v4i32) tmp_vec);
1302 
1303  if (!__msa_test_bz_v(bs)) {
1304  uint8_t *src = data - 4;
1305  v16u8 p3_org, p2_org, p1_org, p0_org, q0_org, q1_org, q2_org, q3_org;
1306  v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0, alpha, beta;
1307  v16u8 is_less_than, is_less_than_beta, is_less_than_alpha;
1308  v16u8 is_bs_greater_than0;
1309  v16u8 tc = { 0 };
1310  v16i8 zero = { 0 };
1311 
1312  tmp_vec = (v16u8) __msa_fill_b(tc0);
1313  tc = (v16u8) __msa_insve_w((v4i32) tc, 0, (v4i32) tmp_vec);
1314  tmp_vec = (v16u8) __msa_fill_b(tc1);
1315  tc = (v16u8) __msa_insve_w((v4i32) tc, 1, (v4i32) tmp_vec);
1316  tmp_vec = (v16u8) __msa_fill_b(tc2);
1317  tc = (v16u8) __msa_insve_w((v4i32) tc, 2, (v4i32) tmp_vec);
1318  tmp_vec = (v16u8) __msa_fill_b(tc3);
1319  tc = (v16u8) __msa_insve_w((v4i32) tc, 3, (v4i32) tmp_vec);
1320 
1321  is_bs_greater_than0 = (zero < bs);
1322 
1323  {
1324  v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
1325  v16u8 row8, row9, row10, row11, row12, row13, row14, row15;
1326 
1327  LD_UB8(src, img_width,
1328  row0, row1, row2, row3, row4, row5, row6, row7);
1329  src += (8 * img_width);
1330  LD_UB8(src, img_width,
1331  row8, row9, row10, row11, row12, row13, row14, row15);
1332 
1333  TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7,
1334  row8, row9, row10, row11,
1335  row12, row13, row14, row15,
1336  p3_org, p2_org, p1_org, p0_org,
1337  q0_org, q1_org, q2_org, q3_org);
1338  }
1339 
1340  p0_asub_q0 = __msa_asub_u_b(p0_org, q0_org);
1341  p1_asub_p0 = __msa_asub_u_b(p1_org, p0_org);
1342  q1_asub_q0 = __msa_asub_u_b(q1_org, q0_org);
1343 
1344  alpha = (v16u8) __msa_fill_b(alpha_in);
1345  beta = (v16u8) __msa_fill_b(beta_in);
1346 
1347  is_less_than_alpha = (p0_asub_q0 < alpha);
1348  is_less_than_beta = (p1_asub_p0 < beta);
1349  is_less_than = is_less_than_beta & is_less_than_alpha;
1350  is_less_than_beta = (q1_asub_q0 < beta);
1351  is_less_than = is_less_than_beta & is_less_than;
1352  is_less_than = is_less_than & is_bs_greater_than0;
1353 
1354  if (!__msa_test_bz_v(is_less_than)) {
1355  v16i8 negate_tc, sign_negate_tc;
1356  v16u8 p0, q0, p2_asub_p0, q2_asub_q0;
1357  v8i16 tc_r, tc_l, negate_tc_r, i16_negatetc_l;
1358  v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r;
1359  v8i16 p1_org_l, p0_org_l, q0_org_l, q1_org_l;
1360  v8i16 p0_r, q0_r, p0_l, q0_l;
1361 
1362  negate_tc = zero - (v16i8) tc;
1363  sign_negate_tc = __msa_clti_s_b(negate_tc, 0);
1364 
1365  ILVRL_B2_SH(sign_negate_tc, negate_tc, negate_tc_r, i16_negatetc_l);
1366 
1367  UNPCK_UB_SH(tc, tc_r, tc_l);
1368  UNPCK_UB_SH(p1_org, p1_org_r, p1_org_l);
1369  UNPCK_UB_SH(p0_org, p0_org_r, p0_org_l);
1370  UNPCK_UB_SH(q0_org, q0_org_r, q0_org_l);
1371 
1372  p2_asub_p0 = __msa_asub_u_b(p2_org, p0_org);
1373  is_less_than_beta = (p2_asub_p0 < beta);
1374  is_less_than_beta = is_less_than_beta & is_less_than;
1375 
1376  if (!__msa_test_bz_v(is_less_than_beta)) {
1377  v16u8 p1;
1378  v8i16 p1_r = { 0 };
1379  v8i16 p1_l = { 0 };
1380  v8i16 p2_org_r = (v8i16) __msa_ilvr_b(zero, (v16i8) p2_org);
1381  v8i16 p2_org_l = (v8i16) __msa_ilvl_b(zero, (v16i8) p2_org);
1382 
1383  AVC_LPF_P1_OR_Q1(p0_org_r, q0_org_r, p1_org_r, p2_org_r,
1384  negate_tc_r, tc_r, p1_r);
1385  AVC_LPF_P1_OR_Q1(p0_org_l, q0_org_l, p1_org_l, p2_org_l,
1386  i16_negatetc_l, tc_l, p1_l);
1387 
1388  p1 = (v16u8) __msa_pckev_b((v16i8) p1_l, (v16i8) p1_r);
1389  p1_org = __msa_bmnz_v(p1_org, p1, is_less_than_beta);
1390 
1391  is_less_than_beta = __msa_andi_b(is_less_than_beta, 1);
1392  tc = tc + is_less_than_beta;
1393  }
1394 
1395  q2_asub_q0 = __msa_asub_u_b(q2_org, q0_org);
1396  is_less_than_beta = (q2_asub_q0 < beta);
1397  is_less_than_beta = is_less_than_beta & is_less_than;
1398 
1399  q1_org_r = (v8i16) __msa_ilvr_b(zero, (v16i8) q1_org);
1400  q1_org_l = (v8i16) __msa_ilvl_b(zero, (v16i8) q1_org);
1401 
1402  if (!__msa_test_bz_v(is_less_than_beta)) {
1403  v16u8 q1;
1404  v8i16 q1_r = { 0 };
1405  v8i16 q1_l = { 0 };
1406  v8i16 q2_org_r = (v8i16) __msa_ilvr_b(zero, (v16i8) q2_org);
1407  v8i16 q2_org_l = (v8i16) __msa_ilvl_b(zero, (v16i8) q2_org);
1408 
1409  AVC_LPF_P1_OR_Q1(p0_org_r, q0_org_r, q1_org_r, q2_org_r,
1410  negate_tc_r, tc_r, q1_r);
1411  AVC_LPF_P1_OR_Q1(p0_org_l, q0_org_l, q1_org_l, q2_org_l,
1412  i16_negatetc_l, tc_l, q1_l);
1413 
1414  q1 = (v16u8) __msa_pckev_b((v16i8) q1_l, (v16i8) q1_r);
1415  q1_org = __msa_bmnz_v(q1_org, q1, is_less_than_beta);
1416 
1417  is_less_than_beta = __msa_andi_b(is_less_than_beta, 1);
1418  tc = tc + is_less_than_beta;
1419  }
1420 
1421  {
1422  v8i16 threshold_r, negate_thresh_r;
1423  v8i16 threshold_l, negate_thresh_l;
1424  v16i8 negate_thresh, sign_negate_thresh;
1425 
1426  negate_thresh = zero - (v16i8) tc;
1427  sign_negate_thresh = __msa_clti_s_b(negate_thresh, 0);
1428 
1429  ILVR_B2_SH(zero, tc, sign_negate_thresh, negate_thresh,
1430  threshold_r, negate_thresh_r);
1431 
1432  AVC_LPF_P0Q0(q0_org_r, p0_org_r, p1_org_r, q1_org_r,
1433  negate_thresh_r, threshold_r, p0_r, q0_r);
1434 
1435  threshold_l = (v8i16) __msa_ilvl_b(zero, (v16i8) tc);
1436  negate_thresh_l = (v8i16) __msa_ilvl_b(sign_negate_thresh,
1437  negate_thresh);
1438 
1439  AVC_LPF_P0Q0(q0_org_l, p0_org_l, p1_org_l, q1_org_l,
1440  negate_thresh_l, threshold_l, p0_l, q0_l);
1441  }
1442 
1443  PCKEV_B2_UB(p0_l, p0_r, q0_l, q0_r, p0, q0);
1444 
1445  p0_org = __msa_bmnz_v(p0_org, p0, is_less_than);
1446  q0_org = __msa_bmnz_v(q0_org, q0, is_less_than);
1447 
1448  {
1449  v16i8 tp0, tp1, tp2, tp3;
1450  v8i16 tmp2, tmp5;
1451  v4i32 tmp3, tmp4, tmp6, tmp7;
1452  uint32_t out0, out2;
1453  uint16_t out1, out3;
1454 
1455  src = data - 3;
1456 
1457  ILVRL_B2_SB(p1_org, p2_org, tp0, tp2);
1458  ILVRL_B2_SB(q0_org, p0_org, tp1, tp3);
1459  ILVRL_B2_SH(q2_org, q1_org, tmp2, tmp5);
1460 
1461  ILVRL_H2_SW(tp1, tp0, tmp3, tmp4);
1462  ILVRL_H2_SW(tp3, tp2, tmp6, tmp7);
1463 
1464  out0 = __msa_copy_u_w(tmp3, 0);
1465  out1 = __msa_copy_u_h(tmp2, 0);
1466  out2 = __msa_copy_u_w(tmp3, 1);
1467  out3 = __msa_copy_u_h(tmp2, 1);
1468 
1469  SW(out0, src);
1470  SH(out1, (src + 4));
1471  src += img_width;
1472  SW(out2, src);
1473  SH(out3, (src + 4));
1474 
1475  out0 = __msa_copy_u_w(tmp3, 2);
1476  out1 = __msa_copy_u_h(tmp2, 2);
1477  out2 = __msa_copy_u_w(tmp3, 3);
1478  out3 = __msa_copy_u_h(tmp2, 3);
1479 
1480  src += img_width;
1481  SW(out0, src);
1482  SH(out1, (src + 4));
1483  src += img_width;
1484  SW(out2, src);
1485  SH(out3, (src + 4));
1486 
1487  out0 = __msa_copy_u_w(tmp4, 0);
1488  out1 = __msa_copy_u_h(tmp2, 4);
1489  out2 = __msa_copy_u_w(tmp4, 1);
1490  out3 = __msa_copy_u_h(tmp2, 5);
1491 
1492  src += img_width;
1493  SW(out0, src);
1494  SH(out1, (src + 4));
1495  src += img_width;
1496  SW(out2, src);
1497  SH(out3, (src + 4));
1498 
1499  out0 = __msa_copy_u_w(tmp4, 2);
1500  out1 = __msa_copy_u_h(tmp2, 6);
1501  out2 = __msa_copy_u_w(tmp4, 3);
1502  out3 = __msa_copy_u_h(tmp2, 7);
1503 
1504  src += img_width;
1505  SW(out0, src);
1506  SH(out1, (src + 4));
1507  src += img_width;
1508  SW(out2, src);
1509  SH(out3, (src + 4));
1510 
1511  out0 = __msa_copy_u_w(tmp6, 0);
1512  out1 = __msa_copy_u_h(tmp5, 0);
1513  out2 = __msa_copy_u_w(tmp6, 1);
1514  out3 = __msa_copy_u_h(tmp5, 1);
1515 
1516  src += img_width;
1517  SW(out0, src);
1518  SH(out1, (src + 4));
1519  src += img_width;
1520  SW(out2, src);
1521  SH(out3, (src + 4));
1522 
1523  out0 = __msa_copy_u_w(tmp6, 2);
1524  out1 = __msa_copy_u_h(tmp5, 2);
1525  out2 = __msa_copy_u_w(tmp6, 3);
1526  out3 = __msa_copy_u_h(tmp5, 3);
1527 
1528  src += img_width;
1529  SW(out0, src);
1530  SH(out1, (src + 4));
1531  src += img_width;
1532  SW(out2, src);
1533  SH(out3, (src + 4));
1534 
1535  out0 = __msa_copy_u_w(tmp7, 0);
1536  out1 = __msa_copy_u_h(tmp5, 4);
1537  out2 = __msa_copy_u_w(tmp7, 1);
1538  out3 = __msa_copy_u_h(tmp5, 5);
1539 
1540  src += img_width;
1541  SW(out0, src);
1542  SH(out1, (src + 4));
1543  src += img_width;
1544  SW(out2, src);
1545  SH(out3, (src + 4));
1546 
1547  out0 = __msa_copy_u_w(tmp7, 2);
1548  out1 = __msa_copy_u_h(tmp5, 6);
1549  out2 = __msa_copy_u_w(tmp7, 3);
1550  out3 = __msa_copy_u_h(tmp5, 7);
1551 
1552  src += img_width;
1553  SW(out0, src);
1554  SH(out1, (src + 4));
1555  src += img_width;
1556  SW(out2, src);
1557  SH(out3, (src + 4));
1558  }
1559  }
1560  }
1561 }
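/*
 * Illustrative scalar sketch (not from the original source) of the per-column
 * inter (bS = 1..3) luma filtering that the luma inter edge filters in this
 * file vectorize.  tc0 comes from the caller's tc table and is widened by one
 * for each p2/q2 side condition that holds, exactly as the
 * "tc = tc + is_less_than_beta" steps do above.  The helper name and the use
 * of FFABS/av_clip_uint8 (libavutil/common.h) are assumptions for this sketch.
 */
static inline void avc_inter_luma_filter_p0q0_ref(uint8_t *p0, uint8_t *q0,
                                                  int p2, int p1, int q1, int q2,
                                                  int tc0, int alpha, int beta)
{
    int tc = tc0, delta;

    /* the vector code additionally requires bS > 0 for the column */
    if (!(FFABS(*p0 - *q0) < alpha &&
          FFABS(p1 - *p0) < beta && FFABS(q1 - *q0) < beta))
        return;

    if (FFABS(p2 - *p0) < beta)
        tc++;   /* p1 is also updated via AVC_LPF_P1_OR_Q1 in the vector code */
    if (FFABS(q2 - *q0) < beta)
        tc++;   /* q1 is updated symmetrically */

    delta = ((*q0 - *p0) * 4 + (p1 - q1) + 4) >> 3;
    delta = delta < -tc ? -tc : (delta > tc ? tc : delta);

    *p0 = av_clip_uint8(*p0 + delta);
    *q0 = av_clip_uint8(*q0 - delta);
}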
1562 
1563 static void avc_loopfilter_luma_inter_edge_hor_msa(uint8_t *data,
1564  uint8_t bs0, uint8_t bs1,
1565  uint8_t bs2, uint8_t bs3,
1566  uint8_t tc0, uint8_t tc1,
1567  uint8_t tc2, uint8_t tc3,
1568  uint8_t alpha_in,
1569  uint8_t beta_in,
1570  uint32_t image_width)
1571 {
1572  v16u8 tmp_vec;
1573  v16u8 bs = { 0 };
1574 
1575  tmp_vec = (v16u8) __msa_fill_b(bs0);
1576  bs = (v16u8) __msa_insve_w((v4i32) bs, 0, (v4i32) tmp_vec);
1577  tmp_vec = (v16u8) __msa_fill_b(bs1);
1578  bs = (v16u8) __msa_insve_w((v4i32) bs, 1, (v4i32) tmp_vec);
1579  tmp_vec = (v16u8) __msa_fill_b(bs2);
1580  bs = (v16u8) __msa_insve_w((v4i32) bs, 2, (v4i32) tmp_vec);
1581  tmp_vec = (v16u8) __msa_fill_b(bs3);
1582  bs = (v16u8) __msa_insve_w((v4i32) bs, 3, (v4i32) tmp_vec);
1583 
1584  if (!__msa_test_bz_v(bs)) {
1585  v16u8 alpha, beta, is_less_than, is_less_than_beta;
1586  v16u8 p0, q0, p2_org, p1_org, p0_org, q0_org, q1_org, q2_org;
1587  v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0;
1588  v16u8 is_less_than_alpha, is_bs_greater_than0;
1589  v8i16 p0_r, q0_r, p0_l, q0_l;
1590  v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r;
1591  v8i16 p1_org_l, p0_org_l, q0_org_l, q1_org_l;
1592  v16i8 zero = { 0 };
1593  v16i8 tc = { 0 };
1594 
1595  tmp_vec = (v16u8) __msa_fill_b(tc0);
1596  tc = (v16i8) __msa_insve_w((v4i32) tc, 0, (v4i32) tmp_vec);
1597  tmp_vec = (v16u8) __msa_fill_b(tc1);
1598  tc = (v16i8) __msa_insve_w((v4i32) tc, 1, (v4i32) tmp_vec);
1599  tmp_vec = (v16u8) __msa_fill_b(tc2);
1600  tc = (v16i8) __msa_insve_w((v4i32) tc, 2, (v4i32) tmp_vec);
1601  tmp_vec = (v16u8) __msa_fill_b(tc3);
1602  tc = (v16i8) __msa_insve_w((v4i32) tc, 3, (v4i32) tmp_vec);
1603 
1604  alpha = (v16u8) __msa_fill_b(alpha_in);
1605  beta = (v16u8) __msa_fill_b(beta_in);
1606 
1607  LD_UB5(data - (3 * image_width), image_width,
1608  p2_org, p1_org, p0_org, q0_org, q1_org);
1609 
1610  is_bs_greater_than0 = ((v16u8) zero < bs);
1611  p0_asub_q0 = __msa_asub_u_b(p0_org, q0_org);
1612  p1_asub_p0 = __msa_asub_u_b(p1_org, p0_org);
1613  q1_asub_q0 = __msa_asub_u_b(q1_org, q0_org);
1614 
1615  is_less_than_alpha = (p0_asub_q0 < alpha);
1616  is_less_than_beta = (p1_asub_p0 < beta);
1617  is_less_than = is_less_than_beta & is_less_than_alpha;
1618  is_less_than_beta = (q1_asub_q0 < beta);
1619  is_less_than = is_less_than_beta & is_less_than;
1620  is_less_than = is_less_than & is_bs_greater_than0;
1621 
1622  if (!__msa_test_bz_v(is_less_than)) {
1623  v16i8 sign_negate_tc, negate_tc;
1624  v8i16 negate_tc_r, i16_negatetc_l, tc_l, tc_r;
1625  v16u8 p2_asub_p0, q2_asub_q0;
1626 
1627  q2_org = LD_UB(data + (2 * image_width));
1628  negate_tc = zero - tc;
1629  sign_negate_tc = __msa_clti_s_b(negate_tc, 0);
1630 
1631  ILVRL_B2_SH(sign_negate_tc, negate_tc, negate_tc_r, i16_negatetc_l);
1632 
1633  UNPCK_UB_SH(tc, tc_r, tc_l);
1634  UNPCK_UB_SH(p1_org, p1_org_r, p1_org_l);
1635  UNPCK_UB_SH(p0_org, p0_org_r, p0_org_l);
1636  UNPCK_UB_SH(q0_org, q0_org_r, q0_org_l);
1637 
1638  p2_asub_p0 = __msa_asub_u_b(p2_org, p0_org);
1639  is_less_than_beta = (p2_asub_p0 < beta);
1640  is_less_than_beta = is_less_than_beta & is_less_than;
1641 
1642  if (!__msa_test_bz_v(is_less_than_beta)) {
1643  v16u8 p1;
1644  v8i16 p1_r = { 0 };
1645  v8i16 p1_l = { 0 };
1646  v8i16 p2_org_r = (v8i16) __msa_ilvr_b(zero, (v16i8) p2_org);
1647  v8i16 p2_org_l = (v8i16) __msa_ilvl_b(zero, (v16i8) p2_org);
1648 
1649  AVC_LPF_P1_OR_Q1(p0_org_r, q0_org_r, p1_org_r, p2_org_r,
1650  negate_tc_r, tc_r, p1_r);
1651  AVC_LPF_P1_OR_Q1(p0_org_l, q0_org_l, p1_org_l, p2_org_l,
1652  i16_negatetc_l, tc_l, p1_l);
1653 
1654  p1 = (v16u8) __msa_pckev_b((v16i8) p1_l, (v16i8) p1_r);
1655  p1_org = __msa_bmnz_v(p1_org, p1, is_less_than_beta);
1656  ST_UB(p1_org, data - (2 * image_width));
1657 
1658  is_less_than_beta = __msa_andi_b(is_less_than_beta, 1);
1659  tc = tc + (v16i8) is_less_than_beta;
1660  }
1661 
1662  q2_asub_q0 = __msa_asub_u_b(q2_org, q0_org);
1663  is_less_than_beta = (q2_asub_q0 < beta);
1664  is_less_than_beta = is_less_than_beta & is_less_than;
1665 
1666  q1_org_r = (v8i16) __msa_ilvr_b(zero, (v16i8) q1_org);
1667  q1_org_l = (v8i16) __msa_ilvl_b(zero, (v16i8) q1_org);
1668 
1669  if (!__msa_test_bz_v(is_less_than_beta)) {
1670  v16u8 q1;
1671  v8i16 q1_r = { 0 };
1672  v8i16 q1_l = { 0 };
1673  v8i16 q2_org_r = (v8i16) __msa_ilvr_b(zero, (v16i8) q2_org);
1674  v8i16 q2_org_l = (v8i16) __msa_ilvl_b(zero, (v16i8) q2_org);
1675 
1676  AVC_LPF_P1_OR_Q1(p0_org_r, q0_org_r, q1_org_r, q2_org_r,
1677  negate_tc_r, tc_r, q1_r);
1678  AVC_LPF_P1_OR_Q1(p0_org_l, q0_org_l, q1_org_l, q2_org_l,
1679  i16_negatetc_l, tc_l, q1_l);
1680 
1681  q1 = (v16u8) __msa_pckev_b((v16i8) q1_l, (v16i8) q1_r);
1682  q1_org = __msa_bmnz_v(q1_org, q1, is_less_than_beta);
1683  ST_UB(q1_org, data + image_width);
1684 
1685  is_less_than_beta = __msa_andi_b(is_less_than_beta, 1);
1686  tc = tc + (v16i8) is_less_than_beta;
1687  }
1688  {
1689  v16i8 negate_thresh, sign_negate_thresh;
1690  v8i16 threshold_r, threshold_l;
1691  v8i16 negate_thresh_l, negate_thresh_r;
1692 
1693  negate_thresh = zero - tc;
1694  sign_negate_thresh = __msa_clti_s_b(negate_thresh, 0);
1695 
1696  ILVR_B2_SH(zero, tc, sign_negate_thresh, negate_thresh,
1697  threshold_r, negate_thresh_r);
1698  AVC_LPF_P0Q0(q0_org_r, p0_org_r, p1_org_r, q1_org_r,
1699  negate_thresh_r, threshold_r, p0_r, q0_r);
1700 
1701  threshold_l = (v8i16) __msa_ilvl_b(zero, tc);
1702  negate_thresh_l = (v8i16) __msa_ilvl_b(sign_negate_thresh,
1703  negate_thresh);
1704  AVC_LPF_P0Q0(q0_org_l, p0_org_l, p1_org_l, q1_org_l,
1705  negate_thresh_l, threshold_l, p0_l, q0_l);
1706  }
1707 
1708  PCKEV_B2_UB(p0_l, p0_r, q0_l, q0_r, p0, q0);
1709 
1710  p0_org = __msa_bmnz_v(p0_org, p0, is_less_than);
1711  q0_org = __msa_bmnz_v(q0_org, q0, is_less_than);
1712 
1713  ST_UB(p0_org, (data - image_width));
1714  ST_UB(q0_org, data);
1715  }
1716  }
1717 }
1718 
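/* Vertical luma edge filter for MBAFF rows (8 lines, bS < 4).  Each pair of
 * lines is guarded by one tc0[] entry: a negative value skips that pair.
 * Eight bytes starting at data - 3 are loaded per active line, transposed
 * into p2..q2 column vectors, filtered with tc-based clipping, and the
 * results are written back as 4-byte stores (p1 p0 q0 q1) at data - 2. */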
1719 static void avc_h_loop_filter_luma_mbaff_msa(uint8_t *in, int32_t stride,
1720  int32_t alpha_in, int32_t beta_in,
1721  int8_t *tc0)
1722 {
1723  uint8_t *data = in;
1724  uint32_t out0, out1, out2, out3;
1725  uint64_t load;
1726  uint32_t tc_val;
1727  v16u8 alpha, beta;
1728  v16i8 inp0 = { 0 };
1729  v16i8 inp1 = { 0 };
1730  v16i8 inp2 = { 0 };
1731  v16i8 inp3 = { 0 };
1732  v16i8 inp4 = { 0 };
1733  v16i8 inp5 = { 0 };
1734  v16i8 inp6 = { 0 };
1735  v16i8 inp7 = { 0 };
1736  v16i8 src0, src1, src2, src3;
1737  v8i16 src4, src5, src6, src7;
1738  v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0, p2_asub_p0, q2_asub_q0;
1739  v16u8 is_less_than, is_less_than_alpha, is_less_than_beta;
1740  v16u8 is_less_than_beta1, is_less_than_beta2;
1741  v8i16 tc, tc_orig_r, tc_plus1;
1742  v16u8 is_tc_orig1, is_tc_orig2, tc_orig = { 0 };
1743  v8i16 p0_ilvr_q0, p0_add_q0, q0_sub_p0, p1_sub_q1;
1744  v8u16 src2_r, src3_r;
1745  v8i16 p2_r, p1_r, q2_r, q1_r;
1746  v16u8 p2, q2, p0, q0;
1747  v4i32 dst0, dst1;
1748  v16i8 zeros = { 0 };
1749 
1750  alpha = (v16u8) __msa_fill_b(alpha_in);
1751  beta = (v16u8) __msa_fill_b(beta_in);
1752 
1753  if (tc0[0] < 0) {
1754  data += (2 * stride);
1755  } else {
1756  load = LD(data - 3);
1757  inp0 = (v16i8) __msa_insert_d((v2i64) inp0, 0, load);
1758  load = LD(data - 3 + stride);
1759  inp1 = (v16i8) __msa_insert_d((v2i64) inp1, 0, load);
1760  data += (2 * stride);
1761  }
1762 
1763  if (tc0[1] < 0) {
1764  data += (2 * stride);
1765  } else {
1766  load = LD(data - 3);
1767  inp2 = (v16i8) __msa_insert_d((v2i64) inp2, 0, load);
1768  load = LD(data - 3 + stride);
1769  inp3 = (v16i8) __msa_insert_d((v2i64) inp3, 0, load);
1770  data += (2 * stride);
1771  }
1772 
1773  if (tc0[2] < 0) {
1774  data += (2 * stride);
1775  } else {
1776  load = LD(data - 3);
1777  inp4 = (v16i8) __msa_insert_d((v2i64) inp4, 0, load);
1778  load = LD(data - 3 + stride);
1779  inp5 = (v16i8) __msa_insert_d((v2i64) inp5, 0, load);
1780  data += (2 * stride);
1781  }
1782 
1783  if (tc0[3] < 0) {
1784  data += (2 * stride);
1785  } else {
1786  load = LD(data - 3);
1787  inp6 = (v16i8) __msa_insert_d((v2i64) inp6, 0, load);
1788  load = LD(data - 3 + stride);
1789  inp7 = (v16i8) __msa_insert_d((v2i64) inp7, 0, load);
1790  data += (2 * stride);
1791  }
1792 
1793  ILVR_B4_SB(inp1, inp0, inp3, inp2, inp5, inp4, inp7, inp6,
1794  src0, src1, src2, src3);
1795 
1796  ILVR_H2_SH(src1, src0, src3, src2, src4, src6);
1797  ILVL_H2_SH(src1, src0, src3, src2, src5, src7);
1798 
1799  src0 = (v16i8) __msa_ilvr_w((v4i32) src6, (v4i32) src4);
1800  src1 = __msa_sldi_b(zeros, (v16i8) src0, 8);
1801  src2 = (v16i8) __msa_ilvl_w((v4i32) src6, (v4i32) src4);
1802  src3 = __msa_sldi_b(zeros, (v16i8) src2, 8);
1803  src4 = (v8i16) __msa_ilvr_w((v4i32) src7, (v4i32) src5);
1804  src5 = (v8i16) __msa_sldi_b(zeros, (v16i8) src4, 8);
1805 
1806  p0_asub_q0 = __msa_asub_u_b((v16u8) src2, (v16u8) src3);
1807  p1_asub_p0 = __msa_asub_u_b((v16u8) src1, (v16u8) src2);
1808  q1_asub_q0 = __msa_asub_u_b((v16u8) src4, (v16u8) src3);
1809  p2_asub_p0 = __msa_asub_u_b((v16u8) src0, (v16u8) src2);
1810  q2_asub_q0 = __msa_asub_u_b((v16u8) src5, (v16u8) src3);
1811 
1812  is_less_than_alpha = (p0_asub_q0 < alpha);
1813  is_less_than_beta = (p1_asub_p0 < beta);
1814  is_less_than = is_less_than_alpha & is_less_than_beta;
1815  is_less_than_beta = (q1_asub_q0 < beta);
1816  is_less_than = is_less_than_beta & is_less_than;
1817 
1818  is_less_than_beta1 = (p2_asub_p0 < beta);
1819  is_less_than_beta2 = (q2_asub_q0 < beta);
1820 
1821  p0_ilvr_q0 = (v8i16) __msa_ilvr_b((v16i8) src3, (v16i8) src2);
1822  p0_add_q0 = (v8i16) __msa_hadd_u_h((v16u8) p0_ilvr_q0, (v16u8) p0_ilvr_q0);
1823  p0_add_q0 = __msa_srari_h(p0_add_q0, 1);
1824 
1825  ILVR_B2_SH(zeros, src0, zeros, src1, p2_r, p1_r);
1826  p2_r += p0_add_q0;
1827  p2_r >>= 1;
1828  p2_r -= p1_r;
1829  ILVR_B2_SH(zeros, src5, zeros, src4, q2_r, q1_r);
1830  q2_r += p0_add_q0;
1831  q2_r >>= 1;
1832  q2_r -= q1_r;
1833 
1834  tc_val = LW(tc0);
1835  tc_orig = (v16u8) __msa_insert_w((v4i32) tc_orig, 0, tc_val);
1836  tc_orig = (v16u8) __msa_ilvr_b((v16i8) tc_orig, (v16i8) tc_orig);
1837  is_tc_orig1 = tc_orig;
1838  is_tc_orig2 = tc_orig;
1839  tc_orig_r = (v8i16) __msa_ilvr_b(zeros, (v16i8) tc_orig);
1840  tc = tc_orig_r;
1841 
1842  p2_r = CLIP_SH(p2_r, -tc_orig_r, tc_orig_r);
1843  q2_r = CLIP_SH(q2_r, -tc_orig_r, tc_orig_r);
1844 
1845  p2_r += p1_r;
1846  q2_r += q1_r;
1847 
1848  PCKEV_B2_UB(p2_r, p2_r, q2_r, q2_r, p2, q2);
1849 
1850  is_tc_orig1 = (zeros < is_tc_orig1);
1851  is_tc_orig2 = is_tc_orig1;
1852  is_tc_orig1 = is_less_than_beta1 & is_tc_orig1;
1853  is_tc_orig2 = is_less_than_beta2 & is_tc_orig2;
1854  is_tc_orig1 = is_less_than & is_tc_orig1;
1855  is_tc_orig2 = is_less_than & is_tc_orig2;
1856 
1857  p2 = __msa_bmnz_v((v16u8) src1, p2, is_tc_orig1);
1858  q2 = __msa_bmnz_v((v16u8) src4, q2, is_tc_orig2);
1859 
1860  q0_sub_p0 = __msa_hsub_u_h((v16u8) p0_ilvr_q0, (v16u8) p0_ilvr_q0);
1861  q0_sub_p0 <<= 2;
1862  p1_sub_q1 = p1_r - q1_r;
1863  q0_sub_p0 += p1_sub_q1;
1864  q0_sub_p0 = __msa_srari_h(q0_sub_p0, 3);
1865 
1866  tc_plus1 = tc + 1;
1867  is_less_than_beta1 = (v16u8) __msa_ilvr_b((v16i8) is_less_than_beta1,
1868  (v16i8) is_less_than_beta1);
1869  tc = (v8i16) __msa_bmnz_v((v16u8) tc, (v16u8) tc_plus1, is_less_than_beta1);
1870  tc_plus1 = tc + 1;
1871  is_less_than_beta2 = (v16u8) __msa_ilvr_b((v16i8) is_less_than_beta2,
1872  (v16i8) is_less_than_beta2);
1873  tc = (v8i16) __msa_bmnz_v((v16u8) tc, (v16u8) tc_plus1, is_less_than_beta2);
1874 
1875  q0_sub_p0 = CLIP_SH(q0_sub_p0, -tc, tc);
1876 
1877  ILVR_B2_UH(zeros, src2, zeros, src3, src2_r, src3_r);
1878  src2_r += q0_sub_p0;
1879  src3_r -= q0_sub_p0;
1880 
1881  src2_r = (v8u16) CLIP_SH_0_255(src2_r);
1882  src3_r = (v8u16) CLIP_SH_0_255(src3_r);
1883 
1884  PCKEV_B2_UB(src2_r, src2_r, src3_r, src3_r, p0, q0);
1885 
1886  p0 = __msa_bmnz_v((v16u8) src2, p0, is_less_than);
1887  q0 = __msa_bmnz_v((v16u8) src3, q0, is_less_than);
1888 
1889  ILVR_B2_UB(p0, p2, q2, q0, p2, q2);
1890 
1891  ILVRL_H2_SW(q2, p2, dst0, dst1);
1892 
1893  data = in;
1894 
1895  out0 = __msa_copy_u_w(dst0, 0);
1896  out1 = __msa_copy_u_w(dst0, 1);
1897  out2 = __msa_copy_u_w(dst0, 2);
1898  out3 = __msa_copy_u_w(dst0, 3);
1899 
1900  if (tc0[0] < 0) {
1901  data += (2 * stride);
1902  } else {
1903  SW(out0, (data - 2));
1904  data += stride;
1905  SW(out1, (data - 2));
1906  data += stride;
1907  }
1908 
1909  if (tc0[1] < 0) {
1910  data += (2 * stride);
1911  } else {
1912  SW(out2, (data - 2));
1913  data += stride;
1914  SW(out3, (data - 2));
1915  data += stride;
1916  }
1917 
1918  out0 = __msa_copy_u_w(dst1, 0);
1919  out1 = __msa_copy_u_w(dst1, 1);
1920  out2 = __msa_copy_u_w(dst1, 2);
1921  out3 = __msa_copy_u_w(dst1, 3);
1922 
1923  if (tc0[2] < 0) {
1924  data += (2 * stride);
1925  } else {
1926  SW(out0, (data - 2));
1927  data += stride;
1928  SW(out1, (data - 2));
1929  data += stride;
1930  }
1931 
1932  if (tc0[3] >= 0) {
1933  SW(out2, (data - 2));
1934  data += stride;
1935  SW(out3, (data - 2));
1936  }
1937 }
1938 
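/* Horizontal chroma edge filter (bS < 4) for one Cb or Cr plane, 8 pixels
 * wide.  Only p0/q0 are modified: the per-group bs/tc values are expanded to
 * halfword lanes, the alpha/beta/bS comparisons build a mask, and
 * AVC_LPF_P0Q0 produces the clipped results that are merged back wherever
 * the mask is set. */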
1939 static void avc_loopfilter_cb_or_cr_inter_edge_hor_msa(uint8_t *data,
1940  uint8_t bs0, uint8_t bs1,
1941  uint8_t bs2, uint8_t bs3,
1942  uint8_t tc0, uint8_t tc1,
1943  uint8_t tc2, uint8_t tc3,
1944  uint8_t alpha_in,
1945  uint8_t beta_in,
1946  uint32_t img_width)
1947 {
1948  v16u8 alpha, beta;
1949  v8i16 tmp_vec;
1950  v8i16 bs = { 0 };
1951  v8i16 tc = { 0 };
1952  v16u8 p0, q0, p0_asub_q0, p1_asub_p0, q1_asub_q0;
1953  v16u8 is_less_than;
1954  v16u8 is_less_than_beta, is_less_than_alpha, is_bs_greater_than0;
1955  v8i16 p0_r, q0_r;
1956  v16u8 p1_org, p0_org, q0_org, q1_org;
1957  v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r;
1958  v16i8 negate_tc, sign_negate_tc;
1959  v8i16 tc_r, negate_tc_r;
1960  v16i8 zero = { 0 };
1961 
1962  tmp_vec = (v8i16) __msa_fill_b(bs0);
1963  bs = __msa_insve_h(bs, 0, tmp_vec);
1964  tmp_vec = (v8i16) __msa_fill_b(bs1);
1965  bs = __msa_insve_h(bs, 1, tmp_vec);
1966  tmp_vec = (v8i16) __msa_fill_b(bs2);
1967  bs = __msa_insve_h(bs, 2, tmp_vec);
1968  tmp_vec = (v8i16) __msa_fill_b(bs3);
1969  bs = __msa_insve_h(bs, 3, tmp_vec);
1970 
1971  if (!__msa_test_bz_v((v16u8) bs)) {
1972  tmp_vec = (v8i16) __msa_fill_b(tc0);
1973  tc = __msa_insve_h(tc, 0, tmp_vec);
1974  tmp_vec = (v8i16) __msa_fill_b(tc1);
1975  tc = __msa_insve_h(tc, 1, tmp_vec);
1976  tmp_vec = (v8i16) __msa_fill_b(tc2);
1977  tc = __msa_insve_h(tc, 2, tmp_vec);
1978  tmp_vec = (v8i16) __msa_fill_b(tc3);
1979  tc = __msa_insve_h(tc, 3, tmp_vec);
1980 
1981  is_bs_greater_than0 = (v16u8) (zero < (v16i8) bs);
1982 
1983  alpha = (v16u8) __msa_fill_b(alpha_in);
1984  beta = (v16u8) __msa_fill_b(beta_in);
1985 
1986  LD_UB4(data - (img_width << 1), img_width,
1987  p1_org, p0_org, q0_org, q1_org);
1988 
1989  p0_asub_q0 = __msa_asub_u_b(p0_org, q0_org);
1990  p1_asub_p0 = __msa_asub_u_b(p1_org, p0_org);
1991  q1_asub_q0 = __msa_asub_u_b(q1_org, q0_org);
1992 
1993  is_less_than_alpha = (p0_asub_q0 < alpha);
1994  is_less_than_beta = (p1_asub_p0 < beta);
1995  is_less_than = is_less_than_beta & is_less_than_alpha;
1996  is_less_than_beta = (q1_asub_q0 < beta);
1997  is_less_than = is_less_than_beta & is_less_than;
1998  is_less_than = is_less_than & is_bs_greater_than0;
1999 
2000  is_less_than = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) is_less_than);
2001 
2002  if (!__msa_test_bz_v(is_less_than)) {
2003  negate_tc = zero - (v16i8) tc;
2004  sign_negate_tc = __msa_clti_s_b(negate_tc, 0);
2005 
2006  ILVR_B2_SH(zero, tc, sign_negate_tc, negate_tc, tc_r, negate_tc_r);
2007 
2008  ILVR_B4_SH(zero, p1_org, zero, p0_org, zero, q0_org, zero, q1_org,
2009  p1_org_r, p0_org_r, q0_org_r, q1_org_r);
2010 
2011  AVC_LPF_P0Q0(q0_org_r, p0_org_r, p1_org_r, q1_org_r, negate_tc_r,
2012  tc_r, p0_r, q0_r);
2013 
2014  PCKEV_B2_UB(zero, p0_r, zero, q0_r, p0, q0);
2015 
2016  p0_org = __msa_bmnz_v(p0_org, p0, is_less_than);
2017  q0_org = __msa_bmnz_v(q0_org, q0, is_less_than);
2018 
2019  ST_UB(q0_org, data);
2020  ST_UB(p0_org, (data - img_width));
2021  }
2022  }
2023 }
2024 
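/* Vertical chroma edge filter (bS < 4) for one Cb or Cr plane.  Eight rows of
 * four pixels around the column edge are loaded, transposed to p1/p0/q0/q1
 * vectors, filtered like the horizontal case, and the updated p0/q0 pair is
 * written back as 2-byte stores per row at data - 1. */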
2025 static void avc_loopfilter_cb_or_cr_inter_edge_ver_msa(uint8_t *data,
2026  uint8_t bs0, uint8_t bs1,
2027  uint8_t bs2, uint8_t bs3,
2028  uint8_t tc0, uint8_t tc1,
2029  uint8_t tc2, uint8_t tc3,
2030  uint8_t alpha_in,
2031  uint8_t beta_in,
2032  uint32_t img_width)
2033 {
2034  uint8_t *src;
2035  v16u8 alpha, beta;
2036  v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0;
2037  v16u8 is_less_than, is_less_than_beta, is_less_than_alpha;
2038  v16u8 p0, q0;
2039  v8i16 p0_r = { 0 };
2040  v8i16 q0_r = { 0 };
2041  v16u8 p1_org, p0_org, q0_org, q1_org;
2042  v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r;
2043  v16u8 is_bs_greater_than0;
2044  v8i16 tc_r, negate_tc_r;
2045  v16i8 negate_tc, sign_negate_tc;
2046  v16i8 zero = { 0 };
2047  v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
2048  v8i16 tmp1, tmp_vec, bs = { 0 };
2049  v8i16 tc = { 0 };
2050 
2051  tmp_vec = (v8i16) __msa_fill_b(bs0);
2052  bs = __msa_insve_h(bs, 0, tmp_vec);
2053  tmp_vec = (v8i16) __msa_fill_b(bs1);
2054  bs = __msa_insve_h(bs, 1, tmp_vec);
2055  tmp_vec = (v8i16) __msa_fill_b(bs2);
2056  bs = __msa_insve_h(bs, 2, tmp_vec);
2057  tmp_vec = (v8i16) __msa_fill_b(bs3);
2058  bs = __msa_insve_h(bs, 3, tmp_vec);
2059 
2060  if (!__msa_test_bz_v((v16u8) bs)) {
2061  tmp_vec = (v8i16) __msa_fill_b(tc0);
2062  tc = __msa_insve_h(tc, 0, tmp_vec);
2063  tmp_vec = (v8i16) __msa_fill_b(tc1);
2064  tc = __msa_insve_h(tc, 1, tmp_vec);
2065  tmp_vec = (v8i16) __msa_fill_b(tc2);
2066  tc = __msa_insve_h(tc, 2, tmp_vec);
2067  tmp_vec = (v8i16) __msa_fill_b(tc3);
2068  tc = __msa_insve_h(tc, 3, tmp_vec);
2069 
2070  is_bs_greater_than0 = (v16u8) (zero < (v16i8) bs);
2071 
2072  LD_UB8((data - 2), img_width,
2073  row0, row1, row2, row3, row4, row5, row6, row7);
2074 
2075  TRANSPOSE8x4_UB_UB(row0, row1, row2, row3,
2076  row4, row5, row6, row7,
2077  p1_org, p0_org, q0_org, q1_org);
2078 
2079  p0_asub_q0 = __msa_asub_u_b(p0_org, q0_org);
2080  p1_asub_p0 = __msa_asub_u_b(p1_org, p0_org);
2081  q1_asub_q0 = __msa_asub_u_b(q1_org, q0_org);
2082 
2083  alpha = (v16u8) __msa_fill_b(alpha_in);
2084  beta = (v16u8) __msa_fill_b(beta_in);
2085 
2086  is_less_than_alpha = (p0_asub_q0 < alpha);
2087  is_less_than_beta = (p1_asub_p0 < beta);
2088  is_less_than = is_less_than_beta & is_less_than_alpha;
2089  is_less_than_beta = (q1_asub_q0 < beta);
2090  is_less_than = is_less_than_beta & is_less_than;
2091  is_less_than = is_bs_greater_than0 & is_less_than;
2092 
2093  is_less_than = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) is_less_than);
2094 
2095  if (!__msa_test_bz_v(is_less_than)) {
2096  ILVR_B4_SH(zero, p1_org, zero, p0_org, zero, q0_org, zero, q1_org,
2097  p1_org_r, p0_org_r, q0_org_r, q1_org_r);
2098 
2099  negate_tc = zero - (v16i8) tc;
2100  sign_negate_tc = __msa_clti_s_b(negate_tc, 0);
2101 
2102  ILVR_B2_SH(sign_negate_tc, negate_tc, zero, tc, negate_tc_r, tc_r);
2103 
2104  AVC_LPF_P0Q0(q0_org_r, p0_org_r, p1_org_r, q1_org_r, negate_tc_r,
2105  tc_r, p0_r, q0_r);
2106 
2107  PCKEV_B2_UB(zero, p0_r, zero, q0_r, p0, q0);
2108 
2109  p0_org = __msa_bmnz_v(p0_org, p0, is_less_than);
2110  q0_org = __msa_bmnz_v(q0_org, q0, is_less_than);
2111  tmp1 = (v8i16) __msa_ilvr_b((v16i8) q0_org, (v16i8) p0_org);
2112  src = data - 1;
2113  ST2x4_UB(tmp1, 0, src, img_width);
2114  src += 4 * img_width;
2115  ST2x4_UB(tmp1, 4, src, img_width);
2116  }
2117  }
2118 }
2119 
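/* Vertical chroma edge filter for 4:2:2 content: the edge is processed in
 * four groups of four rows, each controlled by one tc0[] entry (the group is
 * skipped when the entry is not positive), using the AVC_LPF_H_CHROMA_422
 * macro and 2-byte stores at src - 1. */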
2120 static void avc_h_loop_filter_chroma422_msa(uint8_t *src, int32_t stride,
2121  int32_t alpha_in, int32_t beta_in,
2122  int8_t *tc0)
2123 {
2124  int32_t col, tc_val;
2125  v16u8 alpha, beta, res;
2126 
2127  alpha = (v16u8) __msa_fill_b(alpha_in);
2128  beta = (v16u8) __msa_fill_b(beta_in);
2129 
2130  for (col = 0; col < 4; col++) {
2131  tc_val = (tc0[col] - 1) + 1;
2132 
2133  if (tc_val <= 0) {
2134  src += (4 * stride);
2135  continue;
2136  }
2137 
2138  AVC_LPF_H_CHROMA_422(src, stride, tc_val, alpha, beta, res);
2139  ST2x4_UB(res, 0, (src - 1), stride);
2140  src += (4 * stride);
2141  }
2142 }
2143 
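/* MBAFF variant of the 4:2:2 chroma edge filter: the same per-group tc0[]
 * gating, with the two filtered pixel pairs of each processed group written
 * back via halfword (SH) stores at src - 1. */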
2144 static void avc_h_loop_filter_chroma422_mbaff_msa(uint8_t *src, int32_t stride,
2145  int32_t alpha_in,
2146  int32_t beta_in,
2147  int8_t *tc0)
2148 {
2149  int32_t col, tc_val;
2150  int16_t out0, out1;
2151  v16u8 alpha, beta, res;
2152 
2153  alpha = (v16u8) __msa_fill_b(alpha_in);
2154  beta = (v16u8) __msa_fill_b(beta_in);
2155 
2156  for (col = 0; col < 4; col++) {
2157  tc_val = (tc0[col] - 1) + 1;
2158 
2159  if (tc_val <= 0) {
2160  src += 4 * stride;
2161  continue;
2162  }
2163 
2164  AVC_LPF_H_2BYTE_CHROMA_422(src, stride, tc_val, alpha, beta, res);
2165 
2166  out0 = __msa_copy_s_h((v8i16) res, 0);
2167  out1 = __msa_copy_s_h((v8i16) res, 1);
2168 
2169  SH(out0, (src - 1));
2170  src += stride;
2171  SH(out1, (src - 1));
2172  src += stride;
2173  }
2174 }
2175 
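/* Public H264DSPContext entry points for the inter (bS < 4) loop filters.
 * FFmpeg signals "bS == 0" for a 4-pixel group with a negative tc value, so
 * these wrappers translate each tc[i] < 0 into bs = 0 before calling the MSA
 * edge filters above.  A hypothetical call for one vertical luma edge would
 * look like:
 *
 *     ff_h264_h_lpf_luma_inter_msa(mb_ptr, linesize, alpha, beta, tc0);
 *
 * where mb_ptr, linesize, alpha, beta and tc0 come from the slice decoder. */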
2176 void ff_h264_h_lpf_luma_inter_msa(uint8_t *data, int img_width,
2177  int alpha, int beta, int8_t *tc)
2178 {
2179  uint8_t bs0 = 1;
2180  uint8_t bs1 = 1;
2181  uint8_t bs2 = 1;
2182  uint8_t bs3 = 1;
2183 
2184  if (tc[0] < 0)
2185  bs0 = 0;
2186  if (tc[1] < 0)
2187  bs1 = 0;
2188  if (tc[2] < 0)
2189  bs2 = 0;
2190  if (tc[3] < 0)
2191  bs3 = 0;
2192 
2193  avc_loopfilter_luma_inter_edge_ver_msa(data, bs0, bs1, bs2, bs3,
2194  tc[0], tc[1], tc[2], tc[3],
2195  alpha, beta, img_width);
2196 }
2197 
2198 void ff_h264_v_lpf_luma_inter_msa(uint8_t *data, int img_width,
2199  int alpha, int beta, int8_t *tc)
2200 {
2201 
2202  uint8_t bs0 = 1;
2203  uint8_t bs1 = 1;
2204  uint8_t bs2 = 1;
2205  uint8_t bs3 = 1;
2206 
2207  if (tc[0] < 0)
2208  bs0 = 0;
2209  if (tc[1] < 0)
2210  bs1 = 0;
2211  if (tc[2] < 0)
2212  bs2 = 0;
2213  if (tc[3] < 0)
2214  bs3 = 0;
2215 
2216  avc_loopfilter_luma_inter_edge_hor_msa(data, bs0, bs1, bs2, bs3,
2217  tc[0], tc[1], tc[2], tc[3],
2218  alpha, beta, img_width);
2219 }
2220 
2221 void ff_h264_h_lpf_chroma_inter_msa(uint8_t *data, int img_width,
2222  int alpha, int beta, int8_t *tc)
2223 {
2224  uint8_t bs0 = 1;
2225  uint8_t bs1 = 1;
2226  uint8_t bs2 = 1;
2227  uint8_t bs3 = 1;
2228 
2229  if (tc[0] < 0)
2230  bs0 = 0;
2231  if (tc[1] < 0)
2232  bs1 = 0;
2233  if (tc[2] < 0)
2234  bs2 = 0;
2235  if (tc[3] < 0)
2236  bs3 = 0;
2237 
2238  avc_loopfilter_cb_or_cr_inter_edge_ver_msa(data, bs0, bs1, bs2, bs3,
2239  tc[0], tc[1], tc[2], tc[3],
2240  alpha, beta, img_width);
2241 }
2242 
2243 void ff_h264_v_lpf_chroma_inter_msa(uint8_t *data, int img_width,
2244  int alpha, int beta, int8_t *tc)
2245 {
2246  uint8_t bs0 = 1;
2247  uint8_t bs1 = 1;
2248  uint8_t bs2 = 1;
2249  uint8_t bs3 = 1;
2250 
2251  if (tc[0] < 0)
2252  bs0 = 0;
2253  if (tc[1] < 0)
2254  bs1 = 0;
2255  if (tc[2] < 0)
2256  bs2 = 0;
2257  if (tc[3] < 0)
2258  bs3 = 0;
2259 
2260  avc_loopfilter_cb_or_cr_inter_edge_hor_msa(data, bs0, bs1, bs2, bs3,
2261  tc[0], tc[1], tc[2], tc[3],
2262  alpha, beta, img_width);
2263 }
2264 
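/* Public entry points for the intra (bS == 4) luma and chroma edge filters:
 * they forward alpha, beta and the stride to the strong-filter MSA routines
 * defined earlier in this file. */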
2265 void ff_h264_h_lpf_luma_intra_msa(uint8_t *data, int img_width,
2266  int alpha, int beta)
2267 {
2268  avc_loopfilter_luma_intra_edge_ver_msa(data, (uint8_t) alpha,
2269  (uint8_t) beta,
2270  (unsigned int) img_width);
2271 }
2272 
2273 void ff_h264_v_lpf_luma_intra_msa(uint8_t *data, int img_width,
2274  int alpha, int beta)
2275 {
2276  avc_loopfilter_luma_intra_edge_hor_msa(data, (uint8_t) alpha,
2277  (uint8_t) beta,
2278  (unsigned int) img_width);
2279 }
2280 
2281 void ff_h264_h_lpf_chroma_intra_msa(uint8_t *data, int img_width,
2282  int alpha, int beta)
2283 {
2284  avc_loopfilter_cb_or_cr_intra_edge_ver_msa(data, (uint8_t) alpha,
2285  (uint8_t) beta,
2286  (unsigned int) img_width);
2287 }
2288 
2289 void ff_h264_v_lpf_chroma_intra_msa(uint8_t *data, int img_width,
2290  int alpha, int beta)
2291 {
2292  avc_loopfilter_cb_or_cr_intra_edge_hor_msa(data, (uint8_t) alpha,
2293  (uint8_t) beta,
2294  (unsigned int) img_width);
2295 }
2296 
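/* Thin wrappers exposing the 4:2:2 chroma and MBAFF vertical-edge filters
 * through the H264DSPContext function pointers. */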
2297 void ff_h264_h_loop_filter_chroma422_msa(uint8_t *src,
2298  int32_t ystride,
2299  int32_t alpha, int32_t beta,
2300  int8_t *tc0)
2301 {
2302  avc_h_loop_filter_chroma422_msa(src, ystride, alpha, beta, tc0);
2303 }
2304 
2305 void ff_h264_h_loop_filter_chroma422_mbaff_msa(uint8_t *src,
2306  int32_t ystride,
2307  int32_t alpha,
2308  int32_t beta,
2309  int8_t *tc0)
2310 {
2311  avc_h_loop_filter_chroma422_mbaff_msa(src, ystride, alpha, beta, tc0);
2312 }
2313 
2314 void ff_h264_h_loop_filter_luma_mbaff_msa(uint8_t *src,
2315  int32_t ystride,
2316  int32_t alpha,
2317  int32_t beta,
2318  int8_t *tc0)
2319 {
2320  avc_h_loop_filter_luma_mbaff_msa(src, ystride, alpha, beta, tc0);
2321 }
2322 
2323 void ff_h264_h_loop_filter_luma_mbaff_intra_msa(uint8_t *src,
2324  int32_t ystride,
2325  int32_t alpha,
2326  int32_t beta)
2327 {
2328  avc_h_loop_filter_luma_mbaff_intra_msa(src, ystride, alpha, beta);
2329 }
2330 
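/* Explicit weighted prediction for a 16-pixel-wide block (height 8 or 16).
 * Pixels are widened to halfwords and scaled in place:
 *
 *     tmp = max(pix * weight_src + (offset << log2_denom), 0);
 *     out = saturate_u8(round_shift_right(tmp, log2_denom));
 *
 * which mirrors the MUL4 / ADDS / MAXI / SRLR / SAT sequence below before the
 * results are packed back to bytes and stored over the source rows.  The 8-
 * and 4-pixel-wide entry points that follow dispatch on height to the
 * avc_wgt_* helpers defined earlier in the file. */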
2331 void ff_weight_h264_pixels16_8_msa(uint8_t *src, ptrdiff_t stride,
2332  int height, int log2_denom,
2333  int weight_src, int offset_in)
2334 {
2335  uint32_t offset_val;
2336  v16i8 zero = { 0 };
2337  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
2338  v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
2339  v8i16 src0_l, src1_l, src2_l, src3_l, src0_r, src1_r, src2_r, src3_r;
2340  v8i16 src4_l, src5_l, src6_l, src7_l, src4_r, src5_r, src6_r, src7_r;
2341  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
2342  v8i16 tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
2343  v8i16 wgt, denom, offset;
2344 
2345  offset_val = (unsigned) offset_in << log2_denom;
2346 
2347  wgt = __msa_fill_h(weight_src);
2348  offset = __msa_fill_h(offset_val);
2349  denom = __msa_fill_h(log2_denom);
2350 
2351  LD_UB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7);
2352  ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, src0_r, src1_r,
2353  src2_r, src3_r);
2354  ILVL_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, src0_l, src1_l,
2355  src2_l, src3_l);
2356  ILVR_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7, src4_r, src5_r,
2357  src6_r, src7_r);
2358  ILVL_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7, src4_l, src5_l,
2359  src6_l, src7_l);
2360  MUL4(wgt, src0_r, wgt, src0_l, wgt, src1_r, wgt, src1_l, tmp0, tmp1, tmp2,
2361  tmp3);
2362  MUL4(wgt, src2_r, wgt, src2_l, wgt, src3_r, wgt, src3_l, tmp4, tmp5, tmp6,
2363  tmp7);
2364  MUL4(wgt, src4_r, wgt, src4_l, wgt, src5_r, wgt, src5_l, tmp8, tmp9, tmp10,
2365  tmp11);
2366  MUL4(wgt, src6_r, wgt, src6_l, wgt, src7_r, wgt, src7_l, tmp12, tmp13,
2367  tmp14, tmp15);
2368  ADDS_SH4_SH(tmp0, offset, tmp1, offset, tmp2, offset, tmp3, offset, tmp0,
2369  tmp1, tmp2, tmp3);
2370  ADDS_SH4_SH(tmp4, offset, tmp5, offset, tmp6, offset, tmp7, offset, tmp4,
2371  tmp5, tmp6, tmp7);
2372  ADDS_SH4_SH(tmp8, offset, tmp9, offset, tmp10, offset, tmp11, offset, tmp8,
2373  tmp9, tmp10, tmp11);
2374  ADDS_SH4_SH(tmp12, offset, tmp13, offset, tmp14, offset, tmp15, offset,
2375  tmp12, tmp13, tmp14, tmp15);
2376  MAXI_SH8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, 0);
2377  MAXI_SH8_SH(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, 0);
2378  SRLR_H8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, denom);
2379  SRLR_H8_SH(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, denom);
2380  SAT_UH8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, 7);
2381  SAT_UH8_SH(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, 7);
2382  PCKEV_B4_UB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6, dst0, dst1,
2383  dst2, dst3);
2384  PCKEV_B4_UB(tmp9, tmp8, tmp11, tmp10, tmp13, tmp12, tmp15, tmp14, dst4,
2385  dst5, dst6, dst7);
2386  ST_UB8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, src, stride);
2387  src += 8 * stride;
2388 
2389  if (16 == height) {
2390  LD_UB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7);
2391  ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, src0_r,
2392  src1_r, src2_r, src3_r);
2393  ILVL_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, src0_l,
2394  src1_l, src2_l, src3_l);
2395  ILVR_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7, src4_r,
2396  src5_r, src6_r, src7_r);
2397  ILVL_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7, src4_l,
2398  src5_l, src6_l, src7_l);
2399  MUL4(wgt, src0_r, wgt, src0_l, wgt, src1_r, wgt, src1_l, tmp0, tmp1,
2400  tmp2, tmp3);
2401  MUL4(wgt, src2_r, wgt, src2_l, wgt, src3_r, wgt, src3_l, tmp4, tmp5,
2402  tmp6, tmp7);
2403  MUL4(wgt, src4_r, wgt, src4_l, wgt, src5_r, wgt, src5_l, tmp8, tmp9,
2404  tmp10, tmp11);
2405  MUL4(wgt, src6_r, wgt, src6_l, wgt, src7_r, wgt, src7_l, tmp12, tmp13,
2406  tmp14, tmp15);
2407  ADDS_SH4_SH(tmp0, offset, tmp1, offset, tmp2, offset, tmp3, offset,
2408  tmp0, tmp1, tmp2, tmp3);
2409  ADDS_SH4_SH(tmp4, offset, tmp5, offset, tmp6, offset, tmp7, offset,
2410  tmp4, tmp5, tmp6, tmp7);
2411  ADDS_SH4_SH(tmp8, offset, tmp9, offset, tmp10, offset, tmp11, offset,
2412  tmp8, tmp9, tmp10, tmp11);
2413  ADDS_SH4_SH(tmp12, offset, tmp13, offset, tmp14, offset, tmp15, offset,
2414  tmp12, tmp13, tmp14, tmp15);
2415  MAXI_SH8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, 0);
2416  MAXI_SH8_SH(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, 0);
2417  SRLR_H8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, denom);
2418  SRLR_H8_SH(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, denom);
2419  SAT_UH8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, 7);
2420  SAT_UH8_SH(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, 7);
2421  PCKEV_B4_UB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6, dst0, dst1,
2422  dst2, dst3);
2423  PCKEV_B4_UB(tmp9, tmp8, tmp11, tmp10, tmp13, tmp12, tmp15, tmp14, dst4,
2424  dst5, dst6, dst7);
2425  ST_UB8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, src, stride);
2426  }
2427 }
2428 
2429 void ff_weight_h264_pixels8_8_msa(uint8_t *src, ptrdiff_t stride,
2430  int height, int log2_denom,
2431  int weight_src, int offset)
2432 {
2433  if (4 == height) {
2434  avc_wgt_8x4_msa(src, stride, log2_denom, weight_src, offset);
2435  } else if (8 == height) {
2436  avc_wgt_8x8_msa(src, stride, log2_denom, weight_src, offset);
2437  } else {
2438  avc_wgt_8x16_msa(src, stride, log2_denom, weight_src, offset);
2439  }
2440 }
2441 
2442 void ff_weight_h264_pixels4_8_msa(uint8_t *src, ptrdiff_t stride,
2443  int height, int log2_denom,
2444  int weight_src, int offset)
2445 {
2446  if (2 == height) {
2447  avc_wgt_4x2_msa(src, stride, log2_denom, weight_src, offset);
2448  } else if (4 == height) {
2449  avc_wgt_4x4_msa(src, stride, log2_denom, weight_src, offset);
2450  } else {
2451  avc_wgt_4x8_msa(src, stride, log2_denom, weight_src, offset);
2452  }
2453 }
2454 
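/* Bidirectional weighted prediction, 16 pixels wide.  Source and destination
 * pixels are XORed with 128 to make them signed, interleaved pairwise, and
 * combined with one dot-product-accumulate per halfword lane:
 *
 *     tmp = dpadd(offset, {weight_src, weight_dst}, {src ^ 128, dst ^ 128});
 *     out = clip_u8(tmp >> (log2_denom + 1));
 *
 * where offset already folds in the 128 * (weight_src + weight_dst) bias and
 * the rounding term, matching the ILVEV/XORI/DPADD/SRA/CLIP sequence below.
 * The 8- and 4-pixel entry points that follow dispatch on height to the
 * avc_biwgt_* helpers defined earlier in the file. */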
2455 void ff_biweight_h264_pixels16_8_msa(uint8_t *dst, uint8_t *src,
2456  ptrdiff_t stride, int height,
2457  int log2_denom, int weight_dst,
2458  int weight_src, int offset_in)
2459 {
2460  v16i8 src_wgt, dst_wgt, wgt;
2461  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
2462  v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
2463  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
2464  v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
2465  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
2466  v8i16 tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
2467  v8i16 denom, offset;
2468 
2469  offset_in = (unsigned) ((offset_in + 1) | 1) << log2_denom;
2470  offset_in += (128 * (weight_src + weight_dst));
2471 
2472  src_wgt = __msa_fill_b(weight_src);
2473  dst_wgt = __msa_fill_b(weight_dst);
2474  offset = __msa_fill_h(offset_in);
2475  denom = __msa_fill_h(log2_denom + 1);
2476 
2477  wgt = __msa_ilvev_b(dst_wgt, src_wgt);
2478 
2479  LD_UB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7);
2480  src += 8 * stride;
2481  LD_UB8(dst, stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
2482  XORI_B8_128_UB(src0, src1, src2, src3, src4, src5, src6, src7);
2483  XORI_B8_128_UB(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
2484  ILVR_B4_SB(dst0, src0, dst1, src1, dst2, src2, dst3, src3, vec0, vec2, vec4,
2485  vec6);
2486  ILVL_B4_SB(dst0, src0, dst1, src1, dst2, src2, dst3, src3, vec1, vec3, vec5,
2487  vec7);
2488  ILVR_B4_SB(dst4, src4, dst5, src5, dst6, src6, dst7, src7, vec8, vec10,
2489  vec12, vec14);
2490  ILVL_B4_SB(dst4, src4, dst5, src5, dst6, src6, dst7, src7, vec9, vec11,
2491  vec13, vec15);
2492  tmp0 = __msa_dpadd_s_h(offset, wgt, vec0);
2493  tmp1 = __msa_dpadd_s_h(offset, wgt, vec1);
2494  tmp2 = __msa_dpadd_s_h(offset, wgt, vec2);
2495  tmp3 = __msa_dpadd_s_h(offset, wgt, vec3);
2496  tmp4 = __msa_dpadd_s_h(offset, wgt, vec4);
2497  tmp5 = __msa_dpadd_s_h(offset, wgt, vec5);
2498  tmp6 = __msa_dpadd_s_h(offset, wgt, vec6);
2499  tmp7 = __msa_dpadd_s_h(offset, wgt, vec7);
2500  tmp8 = __msa_dpadd_s_h(offset, wgt, vec8);
2501  tmp9 = __msa_dpadd_s_h(offset, wgt, vec9);
2502  tmp10 = __msa_dpadd_s_h(offset, wgt, vec10);
2503  tmp11 = __msa_dpadd_s_h(offset, wgt, vec11);
2504  tmp12 = __msa_dpadd_s_h(offset, wgt, vec12);
2505  tmp13 = __msa_dpadd_s_h(offset, wgt, vec13);
2506  tmp14 = __msa_dpadd_s_h(offset, wgt, vec14);
2507  tmp15 = __msa_dpadd_s_h(offset, wgt, vec15);
2508  SRA_4V(tmp0, tmp1, tmp2, tmp3, denom);
2509  SRA_4V(tmp4, tmp5, tmp6, tmp7, denom);
2510  SRA_4V(tmp8, tmp9, tmp10, tmp11, denom);
2511  SRA_4V(tmp12, tmp13, tmp14, tmp15, denom);
2512  CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
2513  CLIP_SH4_0_255(tmp4, tmp5, tmp6, tmp7);
2514  CLIP_SH4_0_255(tmp8, tmp9, tmp10, tmp11);
2515  CLIP_SH4_0_255(tmp12, tmp13, tmp14, tmp15);
2516  PCKEV_B4_UB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6, dst0, dst1,
2517  dst2, dst3);
2518  PCKEV_B4_UB(tmp9, tmp8, tmp11, tmp10, tmp13, tmp12, tmp15, tmp14, dst4,
2519  dst5, dst6, dst7);
2520  ST_UB8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst, stride);
2521  dst += 8 * stride;
2522 
2523  if (16 == height) {
2524  LD_UB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7);
2525  LD_UB8(dst, stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
2526  XORI_B8_128_UB(src0, src1, src2, src3, src4, src5, src6, src7);
2527  XORI_B8_128_UB(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
2528  ILVR_B4_SB(dst0, src0, dst1, src1, dst2, src2, dst3, src3, vec0, vec2,
2529  vec4, vec6);
2530  ILVL_B4_SB(dst0, src0, dst1, src1, dst2, src2, dst3, src3, vec1, vec3,
2531  vec5, vec7);
2532  ILVR_B4_SB(dst4, src4, dst5, src5, dst6, src6, dst7, src7, vec8, vec10,
2533  vec12, vec14);
2534  ILVL_B4_SB(dst4, src4, dst5, src5, dst6, src6, dst7, src7, vec9, vec11,
2535  vec13, vec15);
2536  tmp0 = __msa_dpadd_s_h(offset, wgt, vec0);
2537  tmp1 = __msa_dpadd_s_h(offset, wgt, vec1);
2538  tmp2 = __msa_dpadd_s_h(offset, wgt, vec2);
2539  tmp3 = __msa_dpadd_s_h(offset, wgt, vec3);
2540  tmp4 = __msa_dpadd_s_h(offset, wgt, vec4);
2541  tmp5 = __msa_dpadd_s_h(offset, wgt, vec5);
2542  tmp6 = __msa_dpadd_s_h(offset, wgt, vec6);
2543  tmp7 = __msa_dpadd_s_h(offset, wgt, vec7);
2544  tmp8 = __msa_dpadd_s_h(offset, wgt, vec8);
2545  tmp9 = __msa_dpadd_s_h(offset, wgt, vec9);
2546  tmp10 = __msa_dpadd_s_h(offset, wgt, vec10);
2547  tmp11 = __msa_dpadd_s_h(offset, wgt, vec11);
2548  tmp12 = __msa_dpadd_s_h(offset, wgt, vec12);
2549  tmp13 = __msa_dpadd_s_h(offset, wgt, vec13);
2550  tmp14 = __msa_dpadd_s_h(offset, wgt, vec14);
2551  tmp15 = __msa_dpadd_s_h(offset, wgt, vec15);
2552  SRA_4V(tmp0, tmp1, tmp2, tmp3, denom);
2553  SRA_4V(tmp4, tmp5, tmp6, tmp7, denom);
2554  SRA_4V(tmp8, tmp9, tmp10, tmp11, denom);
2555  SRA_4V(tmp12, tmp13, tmp14, tmp15, denom);
2556  CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
2557  CLIP_SH4_0_255(tmp4, tmp5, tmp6, tmp7);
2558  CLIP_SH4_0_255(tmp8, tmp9, tmp10, tmp11);
2559  CLIP_SH4_0_255(tmp12, tmp13, tmp14, tmp15);
2560  PCKEV_B4_UB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6, dst0, dst1,
2561  dst2, dst3);
2562  PCKEV_B4_UB(tmp9, tmp8, tmp11, tmp10, tmp13, tmp12, tmp15, tmp14, dst4,
2563  dst5, dst6, dst7);
2564  ST_UB8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst, stride);
2565  }
2566 }
2567 
2568 void ff_biweight_h264_pixels8_8_msa(uint8_t *dst, uint8_t *src,
2569  ptrdiff_t stride, int height,
2570  int log2_denom, int weight_dst,
2571  int weight_src, int offset)
2572 {
2573  if (4 == height) {
2574  avc_biwgt_8x4_msa(src, dst, stride, log2_denom, weight_src, weight_dst,
2575  offset);
2576  } else if (8 == height) {
2577  avc_biwgt_8x8_msa(src, dst, stride, log2_denom, weight_src, weight_dst,
2578  offset);
2579  } else {
2580  avc_biwgt_8x16_msa(src, dst, stride, log2_denom, weight_src, weight_dst,
2581  offset);
2582  }
2583 }
2584 
2585 void ff_biweight_h264_pixels4_8_msa(uint8_t *dst, uint8_t *src,
2586  ptrdiff_t stride, int height,
2587  int log2_denom, int weight_dst,
2588  int weight_src, int offset)
2589 {
2590  if (2 == height) {
2591  avc_biwgt_4x2_msa(src, dst, stride, log2_denom, weight_src, weight_dst,
2592  offset);
2593  } else if (4 == height) {
2594  avc_biwgt_4x4_msa(src, dst, stride, log2_denom, weight_src, weight_dst,
2595  offset);
2596  } else {
2597  avc_biwgt_4x8_msa(src, dst, stride, log2_denom, weight_src, weight_dst,
2598  offset);
2599  }
2600 }