26 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12,
27 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 9, 7, 10, 8, 11,
28 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10,
31 0, 5, 1, 6, 2, 7, 3, 8, 16, 21, 17, 22, 18, 23, 19, 24,
32 1, 4, 2, 5, 3, 6, 4, 7, 17, 20, 18, 21, 19, 22, 20, 23,
33 2, 3, 3, 4, 4, 5, 5, 6, 18, 19, 19, 20, 20, 21, 21, 22,
36 #define AVC_CALC_DPADD_B_6PIX_2COEFF_SH(vec0, vec1, vec2, vec3, vec4, vec5, \ 39 v16i8 tmp0_m, tmp1_m; \ 40 v16i8 minus5b_m = __msa_ldi_b(-5); \ 41 v16i8 plus20b_m = __msa_ldi_b(20); \ 43 ILVRL_B2_SB(vec5, vec0, tmp0_m, tmp1_m); \ 44 HADD_SB2_SH(tmp0_m, tmp1_m, out1, out2); \ 45 ILVRL_B2_SB(vec4, vec1, tmp0_m, tmp1_m); \ 46 DPADD_SB2_SH(tmp0_m, tmp1_m, minus5b_m, minus5b_m, out1, out2); \ 47 ILVRL_B2_SB(vec3, vec2, tmp0_m, tmp1_m); \ 48 DPADD_SB2_SH(tmp0_m, tmp1_m, plus20b_m, plus20b_m, out1, out2); \ 51 #define AVC_HORZ_FILTER_SH(in0, in1, mask0, mask1, mask2) \ 55 v16i8 minus5b = __msa_ldi_b(-5); \ 56 v16i8 plus20b = __msa_ldi_b(20); \ 58 tmp0_m = __msa_vshf_b((v16i8) mask0, in1, in0); \ 59 out0_m = __msa_hadd_s_h(tmp0_m, tmp0_m); \ 61 tmp0_m = __msa_vshf_b((v16i8) mask1, in1, in0); \ 62 out0_m = __msa_dpadd_s_h(out0_m, minus5b, tmp0_m); \ 64 tmp0_m = __msa_vshf_b((v16i8) mask2, in1, in0); \ 65 out0_m = __msa_dpadd_s_h(out0_m, plus20b, tmp0_m); \ 70 #define AVC_DOT_SH3_SH(in0, in1, in2, coeff0, coeff1, coeff2) \ 74 out0_m = __msa_dotp_s_h((v16i8) in0, (v16i8) coeff0); \ 75 out0_m = __msa_dpadd_s_h(out0_m, (v16i8) in1, (v16i8) coeff1); \ 76 out0_m = __msa_dpadd_s_h(out0_m, (v16i8) in2, (v16i8) coeff2); \ 81 #define AVC_DOT_SW3_SW(in0, in1, in2, coeff0, coeff1, coeff2) \ 85 out0_m = __msa_dotp_s_w((v8i16) in0, (v8i16) coeff0); \ 86 out0_m = __msa_dpadd_s_w(out0_m, (v8i16) in1, (v8i16) coeff1); \ 87 out0_m = __msa_dpadd_s_w(out0_m, (v8i16) in2, (v8i16) coeff2); \ 88 out0_m = __msa_srari_w(out0_m, 10); \ 89 out0_m = __msa_sat_s_w(out0_m, 7); \ 96 const int16_t filt_const0 = 0xfb01;
97 const int16_t filt_const1 = 0x1414;
98 const int16_t filt_const2 = 0x1fb;
100 v16i8 src_hz0, src_hz1, src_hz2, src_hz3, src_vt7, src_vt8;
101 v16i8 src_vt0, src_vt1, src_vt2, src_vt3, src_vt4, src_vt5, src_vt6;
102 v16i8 src_vt10_r, src_vt32_r, src_vt54_r, src_vt76_r;
103 v16i8 mask0, mask1, mask2, filt0, filt1, filt2;
104 v8i16 hz_out0, hz_out1, vt_out0, vt_out1, out0, out1;
106 filt0 = (v16i8) __msa_fill_h(filt_const0);
107 filt1 = (v16i8) __msa_fill_h(filt_const1);
108 filt2 = (v16i8) __msa_fill_h(filt_const2);
112 LD_SB5(src_y, stride, src_vt0, src_vt1, src_vt2, src_vt3, src_vt4);
115 src_vt0 = (v16i8) __msa_insve_w((v4i32) src_vt0, 1, (v4i32) src_vt1);
116 src_vt1 = (v16i8) __msa_insve_w((v4i32) src_vt1, 1, (v4i32) src_vt2);
117 src_vt2 = (v16i8) __msa_insve_w((v4i32) src_vt2, 1, (v4i32) src_vt3);
118 src_vt3 = (v16i8) __msa_insve_w((v4i32) src_vt3, 1, (v4i32) src_vt4);
122 LD_SB4(src_x, stride, src_hz0, src_hz1, src_hz2, src_hz3);
130 LD_SB4(src_y, stride, src_vt5, src_vt6, src_vt7, src_vt8);
132 src_vt4 = (v16i8) __msa_insve_w((v4i32) src_vt4, 1, (v4i32) src_vt5);
133 src_vt5 = (v16i8) __msa_insve_w((v4i32) src_vt5, 1, (v4i32) src_vt6);
134 src_vt6 = (v16i8) __msa_insve_w((v4i32) src_vt6, 1, (v4i32) src_vt7);
135 src_vt7 = (v16i8) __msa_insve_w((v4i32) src_vt7, 1, (v4i32) src_vt8);
138 ILVR_B2_SB(src_vt1, src_vt0, src_vt3, src_vt2, src_vt10_r, src_vt32_r);
139 ILVR_B2_SB(src_vt5, src_vt4, src_vt7, src_vt6, src_vt54_r, src_vt76_r);
140 vt_out0 =
AVC_DOT_SH3_SH(src_vt10_r, src_vt32_r, src_vt54_r, filt0, filt1,
142 vt_out1 =
AVC_DOT_SH3_SH(src_vt32_r, src_vt54_r, src_vt76_r, filt0, filt1,
147 out0 = __msa_srari_h((hz_out0 + vt_out0), 1);
148 out1 = __msa_srari_h((hz_out1 + vt_out1), 1);
152 ST4x4_UB(out, out, 0, 1, 2, 3, dst, stride);
158 const int16_t filt_const0 = 0xfb01;
159 const int16_t filt_const1 = 0x1414;
160 const int16_t filt_const2 = 0x1fb;
162 v16i8 src_hz0, src_hz1, src_hz2, src_hz3, mask0, mask1, mask2;
163 v16i8 src_vt0, src_vt1, src_vt2, src_vt3, src_vt4, src_vt5, src_vt6;
164 v16i8 src_vt7, src_vt8, src_vt9, src_vt10, src_vt11, src_vt12;
165 v16i8 src_vt10_r, src_vt21_r, src_vt32_r, src_vt43_r, src_vt54_r;
166 v16i8 src_vt65_r, src_vt76_r, src_vt87_r, src_vt98_r, src_vt109_r;
167 v16i8 src_vt1110_r, src_vt1211_r, filt0, filt1, filt2;
168 v8i16 hz_out0, hz_out1, hz_out2, hz_out3, vt_out0, vt_out1, vt_out2;
169 v8i16 vt_out3, tmp0, tmp1, tmp2, tmp3;
171 filt0 = (v16i8) __msa_fill_h(filt_const0);
172 filt1 = (v16i8) __msa_fill_h(filt_const1);
173 filt2 = (v16i8) __msa_fill_h(filt_const2);
176 LD_SB5(src_y, stride, src_vt0, src_vt1, src_vt2, src_vt3, src_vt4);
181 LD_SB4(src_x, stride, src_hz0, src_hz1, src_hz2, src_hz3);
190 SRARI_H4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 5);
191 SAT_SH4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 7);
193 LD_SB4(src_y, stride, src_vt5, src_vt6, src_vt7, src_vt8);
197 ILVR_B4_SB(src_vt1, src_vt0, src_vt2, src_vt1, src_vt3, src_vt2, src_vt4,
198 src_vt3, src_vt10_r, src_vt21_r, src_vt32_r, src_vt43_r);
199 ILVR_B4_SB(src_vt5, src_vt4, src_vt6, src_vt5, src_vt7, src_vt6, src_vt8,
200 src_vt7, src_vt54_r, src_vt65_r, src_vt76_r, src_vt87_r);
201 vt_out0 =
AVC_DOT_SH3_SH(src_vt10_r, src_vt32_r, src_vt54_r, filt0, filt1,
203 vt_out1 =
AVC_DOT_SH3_SH(src_vt21_r, src_vt43_r, src_vt65_r, filt0, filt1,
205 vt_out2 =
AVC_DOT_SH3_SH(src_vt32_r, src_vt54_r, src_vt76_r, filt0, filt1,
207 vt_out3 =
AVC_DOT_SH3_SH(src_vt43_r, src_vt65_r, src_vt87_r, filt0, filt1,
209 SRARI_H4_SH(vt_out0, vt_out1, vt_out2, vt_out3, 5);
210 SAT_SH4_SH(vt_out0, vt_out1, vt_out2, vt_out3, 7);
212 tmp0 = __msa_srari_h((hz_out0 + vt_out0), 1);
213 tmp1 = __msa_srari_h((hz_out1 + vt_out1), 1);
214 tmp2 = __msa_srari_h((hz_out2 + vt_out2), 1);
215 tmp3 = __msa_srari_h((hz_out3 + vt_out3), 1);
217 LD_SB4(src_x, stride, src_hz0, src_hz1, src_hz2, src_hz3);
226 LD_SB4(src_y, stride, src_vt9, src_vt10, src_vt11, src_vt12);
234 SRARI_H4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 5);
235 SAT_SH4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 7);
237 ILVR_B4_SB(src_vt9, src_vt8, src_vt10, src_vt9, src_vt11, src_vt10,
238 src_vt12, src_vt11, src_vt98_r, src_vt109_r, src_vt1110_r,
240 vt_out0 =
AVC_DOT_SH3_SH(src_vt54_r, src_vt76_r, src_vt98_r, filt0, filt1,
242 vt_out1 =
AVC_DOT_SH3_SH(src_vt65_r, src_vt87_r, src_vt109_r, filt0, filt1,
244 vt_out2 =
AVC_DOT_SH3_SH(src_vt76_r, src_vt98_r, src_vt1110_r, filt0, filt1,
246 vt_out3 =
AVC_DOT_SH3_SH(src_vt87_r, src_vt109_r, src_vt1211_r, filt0,
248 SRARI_H4_SH(vt_out0, vt_out1, vt_out2, vt_out3, 5);
249 SAT_SH4_SH(vt_out0, vt_out1, vt_out2, vt_out3, 7);
251 tmp0 = __msa_srari_h((hz_out0 + vt_out0), 1);
252 tmp1 = __msa_srari_h((hz_out1 + vt_out1), 1);
253 tmp2 = __msa_srari_h((hz_out2 + vt_out2), 1);
254 tmp3 = __msa_srari_h((hz_out3 + vt_out3), 1);
267 const int16_t filt_const0 = 0xfb01;
268 const int16_t filt_const1 = 0x1414;
269 const int16_t filt_const2 = 0x1fb;
270 const uint8_t *src_x_tmp = src_x;
271 const uint8_t *src_y_tmp = src_y;
273 uint32_t multiple8_cnt, loop_cnt;
275 v16i8 src_hz0, src_hz1, src_hz2, src_hz3, mask0, mask1, mask2;
276 v16i8 src_vt0, src_vt1, src_vt2, src_vt3, src_vt4, src_vt5, src_vt6;
277 v16i8 src_vt7, src_vt8;
278 v16i8 src_vt10_r, src_vt21_r, src_vt32_r, src_vt43_r, src_vt54_r;
279 v16i8 src_vt65_r, src_vt76_r, src_vt87_r, filt0, filt1, filt2;
280 v8i16 hz_out0, hz_out1, hz_out2, hz_out3, vt_out0, vt_out1, vt_out2;
281 v8i16 vt_out3, out0, out1, out2, out3;
283 filt0 = (v16i8) __msa_fill_h(filt_const0);
284 filt1 = (v16i8) __msa_fill_h(filt_const1);
285 filt2 = (v16i8) __msa_fill_h(filt_const2);
289 for (multiple8_cnt = 2; multiple8_cnt--;) {
294 LD_SB5(src_y, stride, src_vt0, src_vt1, src_vt2, src_vt3, src_vt4);
299 for (loop_cnt = 4; loop_cnt--;) {
300 LD_SB4(src_x, stride, src_hz0, src_hz1, src_hz2, src_hz3);
308 SRARI_H4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 5);
309 SAT_SH4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 7);
311 LD_SB4(src_y, stride, src_vt5, src_vt6, src_vt7, src_vt8);
315 ILVR_B4_SB(src_vt1, src_vt0, src_vt2, src_vt1, src_vt3, src_vt2,
316 src_vt4, src_vt3, src_vt10_r, src_vt21_r, src_vt32_r,
318 ILVR_B4_SB(src_vt5, src_vt4, src_vt6, src_vt5, src_vt7, src_vt6,
319 src_vt8, src_vt7, src_vt54_r, src_vt65_r, src_vt76_r,
321 vt_out0 =
AVC_DOT_SH3_SH(src_vt10_r, src_vt32_r, src_vt54_r, filt0,
323 vt_out1 =
AVC_DOT_SH3_SH(src_vt21_r, src_vt43_r, src_vt65_r, filt0,
325 vt_out2 =
AVC_DOT_SH3_SH(src_vt32_r, src_vt54_r, src_vt76_r, filt0,
327 vt_out3 =
AVC_DOT_SH3_SH(src_vt43_r, src_vt65_r, src_vt87_r, filt0,
329 SRARI_H4_SH(vt_out0, vt_out1, vt_out2, vt_out3, 5);
330 SAT_SH4_SH(vt_out0, vt_out1, vt_out2, vt_out3, 7);
332 out0 = __msa_srari_h((hz_out0 + vt_out0), 1);
333 out1 = __msa_srari_h((hz_out1 + vt_out1), 1);
334 out2 = __msa_srari_h((hz_out2 + vt_out2), 1);
335 out3 = __msa_srari_h((hz_out3 + vt_out3), 1);
361 uint32_t tp0, tp1, tp2, tp3;
362 const int16_t filt_const0 = 0xfb01;
363 const int16_t filt_const1 = 0x1414;
364 const int16_t filt_const2 = 0x1fb;
365 v16u8 res, dst0 = { 0 };
366 v16i8 src_hz0, src_hz1, src_hz2, src_hz3, src_vt7, src_vt8;
367 v16i8 src_vt0, src_vt1, src_vt2, src_vt3, src_vt4, src_vt5, src_vt6;
368 v16i8 src_vt10_r, src_vt32_r, src_vt54_r, src_vt76_r;
369 v16i8 mask0, mask1, mask2, filt0, filt1, filt2;
370 v8i16 hz_out0, hz_out1, vt_out0, vt_out1, res0, res1;
372 filt0 = (v16i8) __msa_fill_h(filt_const0);
373 filt1 = (v16i8) __msa_fill_h(filt_const1);
374 filt2 = (v16i8) __msa_fill_h(filt_const2);
378 LD_SB5(src_y, stride, src_vt0, src_vt1, src_vt2, src_vt3, src_vt4);
381 src_vt0 = (v16i8) __msa_insve_w((v4i32) src_vt0, 1, (v4i32) src_vt1);
382 src_vt1 = (v16i8) __msa_insve_w((v4i32) src_vt1, 1, (v4i32) src_vt2);
383 src_vt2 = (v16i8) __msa_insve_w((v4i32) src_vt2, 1, (v4i32) src_vt3);
384 src_vt3 = (v16i8) __msa_insve_w((v4i32) src_vt3, 1, (v4i32) src_vt4);
388 LD_SB4(src_x, stride, src_hz0, src_hz1, src_hz2, src_hz3);
396 LD_SB4(src_y, stride, src_vt5, src_vt6, src_vt7, src_vt8);
398 src_vt4 = (v16i8) __msa_insve_w((v4i32) src_vt4, 1, (v4i32) src_vt5);
399 src_vt5 = (v16i8) __msa_insve_w((v4i32) src_vt5, 1, (v4i32) src_vt6);
400 src_vt6 = (v16i8) __msa_insve_w((v4i32) src_vt6, 1, (v4i32) src_vt7);
401 src_vt7 = (v16i8) __msa_insve_w((v4i32) src_vt7, 1, (v4i32) src_vt8);
404 ILVR_B2_SB(src_vt1, src_vt0, src_vt3, src_vt2, src_vt10_r, src_vt32_r);
405 ILVR_B2_SB(src_vt5, src_vt4, src_vt7, src_vt6, src_vt54_r, src_vt76_r);
406 vt_out0 =
AVC_DOT_SH3_SH(src_vt10_r, src_vt32_r, src_vt54_r, filt0, filt1,
408 vt_out1 =
AVC_DOT_SH3_SH(src_vt32_r, src_vt54_r, src_vt76_r, filt0, filt1,
412 LW4(dst, stride, tp0, tp1, tp2, tp3);
415 res1 = __msa_srari_h((hz_out1 + vt_out1), 1);
416 res0 = __msa_srari_h((hz_out0 + vt_out0), 1);
420 dst0 = __msa_aver_u_b(res, dst0);
422 ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dst, stride);
430 const int16_t filt_const0 = 0xfb01;
431 const int16_t filt_const1 = 0x1414;
432 const int16_t filt_const2 = 0x1fb;
433 uint64_t tp0, tp1, tp2, tp3;
434 v16u8 out0, out1, dst0 = { 0 }, dst1 = { 0 };
435 v16i8 src_hz0, src_hz1, src_hz2, src_hz3, src_vt0, src_vt1, src_vt2;
436 v16i8 src_vt3, src_vt4, src_vt5, src_vt6, src_vt7, src_vt8;
437 v16i8 src_vt9, src_vt10, src_vt11, src_vt12, mask0, mask1, mask2;
438 v16i8 src_vt10_r, src_vt21_r, src_vt32_r, src_vt43_r, src_vt54_r;
439 v16i8 src_vt65_r, src_vt76_r, src_vt87_r, src_vt98_r, src_vt109_r;
440 v16i8 src_vt1110_r, src_vt1211_r, filt0, filt1, filt2;
441 v8i16 hz_out0, hz_out1, hz_out2, hz_out3, vt_out0, vt_out1, vt_out2;
442 v8i16 vt_out3, tmp0, tmp1, tmp2, tmp3;
444 filt0 = (v16i8) __msa_fill_h(filt_const0);
445 filt1 = (v16i8) __msa_fill_h(filt_const1);
446 filt2 = (v16i8) __msa_fill_h(filt_const2);
449 LD_SB5(src_y, stride, src_vt0, src_vt1, src_vt2, src_vt3, src_vt4);
454 LD_SB4(src_x, stride, src_hz0, src_hz1, src_hz2, src_hz3);
463 SRARI_H4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 5);
464 SAT_SH4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 7);
466 LD_SB4(src_y, stride, src_vt5, src_vt6, src_vt7, src_vt8);
470 ILVR_B4_SB(src_vt1, src_vt0, src_vt2, src_vt1, src_vt3, src_vt2, src_vt4,
471 src_vt3, src_vt10_r, src_vt21_r, src_vt32_r, src_vt43_r);
472 ILVR_B4_SB(src_vt5, src_vt4, src_vt6, src_vt5, src_vt7, src_vt6, src_vt8,
473 src_vt7, src_vt54_r, src_vt65_r, src_vt76_r, src_vt87_r);
474 vt_out0 =
AVC_DOT_SH3_SH(src_vt10_r, src_vt32_r, src_vt54_r, filt0, filt1,
476 vt_out1 =
AVC_DOT_SH3_SH(src_vt21_r, src_vt43_r, src_vt65_r, filt0, filt1,
478 vt_out2 =
AVC_DOT_SH3_SH(src_vt32_r, src_vt54_r, src_vt76_r, filt0, filt1,
480 vt_out3 =
AVC_DOT_SH3_SH(src_vt43_r, src_vt65_r, src_vt87_r, filt0, filt1,
482 SRARI_H4_SH(vt_out0, vt_out1, vt_out2, vt_out3, 5);
483 SAT_SH4_SH(vt_out0, vt_out1, vt_out2, vt_out3, 7);
485 tmp0 = __msa_srari_h((hz_out0 + vt_out0), 1);
486 tmp1 = __msa_srari_h((hz_out1 + vt_out1), 1);
487 tmp2 = __msa_srari_h((hz_out2 + vt_out2), 1);
488 tmp3 = __msa_srari_h((hz_out3 + vt_out3), 1);
490 LD_SB4(src_x, stride, src_hz0, src_hz1, src_hz2, src_hz3);
493 LD4(dst, stride, tp0, tp1, tp2, tp3);
504 LD_SB4(src_y, stride, src_vt9, src_vt10, src_vt11, src_vt12);
512 SRARI_H4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 5);
513 SAT_SH4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 7);
515 ILVR_B4_SB(src_vt9, src_vt8, src_vt10, src_vt9, src_vt11, src_vt10,
516 src_vt12, src_vt11, src_vt98_r, src_vt109_r, src_vt1110_r,
518 vt_out0 =
AVC_DOT_SH3_SH(src_vt54_r, src_vt76_r, src_vt98_r, filt0, filt1,
520 vt_out1 =
AVC_DOT_SH3_SH(src_vt65_r, src_vt87_r, src_vt109_r, filt0, filt1,
522 vt_out2 =
AVC_DOT_SH3_SH(src_vt76_r, src_vt98_r, src_vt1110_r, filt0, filt1,
524 vt_out3 =
AVC_DOT_SH3_SH(src_vt87_r, src_vt109_r, src_vt1211_r, filt0,
526 SRARI_H4_SH(vt_out0, vt_out1, vt_out2, vt_out3, 5);
527 SAT_SH4_SH(vt_out0, vt_out1, vt_out2, vt_out3, 7);
529 tmp0 = __msa_srari_h((hz_out0 + vt_out0), 1);
530 tmp1 = __msa_srari_h((hz_out1 + vt_out1), 1);
531 tmp2 = __msa_srari_h((hz_out2 + vt_out2), 1);
532 tmp3 = __msa_srari_h((hz_out3 + vt_out3), 1);
534 LD4(dst, stride, tp0, tp1, tp2, tp3);
551 const int16_t filt_const0 = 0xfb01;
552 const int16_t filt_const1 = 0x1414;
553 const int16_t filt_const2 = 0x1fb;
554 const uint8_t *src_x_tmp = src_x;
555 const uint8_t *src_y_tmp = src_y;
557 uint32_t multiple8_cnt, loop_cnt;
558 uint64_t tp0, tp1, tp2, tp3;
559 v16u8 tmp0, tmp1, dst0 = { 0 }, dst1 = { 0 };
560 v16i8 src_hz0, src_hz1, src_hz2, src_hz3, mask0, mask1, mask2;
561 v16i8 src_vt0, src_vt1, src_vt2, src_vt3, src_vt4, src_vt5, src_vt6;
562 v16i8 src_vt7, src_vt8;
563 v16i8 src_vt10_r, src_vt21_r, src_vt32_r, src_vt43_r, src_vt54_r;
564 v16i8 src_vt65_r, src_vt76_r, src_vt87_r, filt0, filt1, filt2;
565 v8i16 hz_out0, hz_out1, hz_out2, hz_out3, vt_out0, vt_out1, vt_out2;
566 v8i16 vt_out3, out0, out1, out2, out3;
568 filt0 = (v16i8) __msa_fill_h(filt_const0);
569 filt1 = (v16i8) __msa_fill_h(filt_const1);
570 filt2 = (v16i8) __msa_fill_h(filt_const2);
574 for (multiple8_cnt = 2; multiple8_cnt--;) {
579 LD_SB5(src_y, stride, src_vt0, src_vt1, src_vt2, src_vt3, src_vt4);
584 for (loop_cnt = 4; loop_cnt--;) {
585 LD_SB4(src_x, stride, src_hz0, src_hz1, src_hz2, src_hz3);
593 SRARI_H4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 5);
594 SAT_SH4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 7);
596 LD_SB4(src_y, stride, src_vt5, src_vt6, src_vt7, src_vt8);
600 ILVR_B4_SB(src_vt1, src_vt0, src_vt2, src_vt1, src_vt3, src_vt2,
601 src_vt4, src_vt3, src_vt10_r, src_vt21_r, src_vt32_r,
603 ILVR_B4_SB(src_vt5, src_vt4, src_vt6, src_vt5, src_vt7, src_vt6,
604 src_vt8, src_vt7, src_vt54_r, src_vt65_r, src_vt76_r,
606 vt_out0 =
AVC_DOT_SH3_SH(src_vt10_r, src_vt32_r, src_vt54_r, filt0,
608 vt_out1 =
AVC_DOT_SH3_SH(src_vt21_r, src_vt43_r, src_vt65_r, filt0,
610 vt_out2 =
AVC_DOT_SH3_SH(src_vt32_r, src_vt54_r, src_vt76_r, filt0,
612 vt_out3 =
AVC_DOT_SH3_SH(src_vt43_r, src_vt65_r, src_vt87_r, filt0,
614 SRARI_H4_SH(vt_out0, vt_out1, vt_out2, vt_out3, 5);
615 SAT_SH4_SH(vt_out0, vt_out1, vt_out2, vt_out3, 7);
617 out0 = __msa_srari_h((hz_out0 + vt_out0), 1);
618 out1 = __msa_srari_h((hz_out1 + vt_out1), 1);
619 out2 = __msa_srari_h((hz_out2 + vt_out2), 1);
620 out3 = __msa_srari_h((hz_out3 + vt_out3), 1);
622 LD4(dst, stride, tp0, tp1, tp2, tp3);
649 v16u8
src0,
src1, src2, src3, src4, src5, src6, src7;
650 v16u8 src8, src9, src10, src11, src12, src13, src14, src15;
652 LD_UB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7);
654 LD_UB8(src, stride, src8, src9, src10, src11, src12, src13, src14, src15);
656 ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, stride);
658 ST_UB8(src8, src9, src10, src11, src12, src13, src14, src15, dst, stride);
664 uint64_t
src0,
src1, src2, src3, src4, src5, src6, src7;
666 LD4(src, stride, src0, src1, src2, src3);
668 LD4(src, stride, src4, src5, src6, src7);
669 SD4(src0, src1, src2, src3, dst, stride);
671 SD4(src4, src5, src6, src7, dst, stride);
677 v16u8
src0,
src1, src2, src3, src4, src5, src6, src7;
678 v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
680 LD_UB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7);
682 LD_UB8(dst, stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
684 AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, dst0, dst1,
686 AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7, dst4, dst5,
688 ST_UB8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst, stride);
691 LD_UB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7);
692 LD_UB8(dst, stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
694 AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, dst0, dst1,
696 AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7, dst4, dst5,
698 ST_UB8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst, stride);
704 uint64_t tp0, tp1, tp2, tp3, tp4, tp5, tp6, tp7;
705 v16u8
src0 = { 0 },
src1 = { 0 }, src2 = { 0 }, src3 = { 0 };
706 v16u8 dst0 = { 0 }, dst1 = { 0 }, dst2 = { 0 }, dst3 = { 0 };
708 LD4(src, stride, tp0, tp1, tp2, tp3);
710 LD4(src, stride, tp4, tp5, tp6, tp7);
716 LD4(dst, stride, tp0, tp1, tp2, tp3);
717 LD4(dst + 4 * stride, stride, tp4, tp5, tp6, tp7);
723 AVER_UB4_UB(src0, dst0,
src1, dst1, src2, dst2, src3, dst3, dst0, dst1,
726 ST8x8_UB(dst0, dst1, dst2, dst3, dst, stride);
732 uint32_t tp0, tp1, tp2, tp3;
733 v16u8
src0 = { 0 }, dst0 = { 0 };
735 LW4(src, stride, tp0, tp1, tp2, tp3);
737 LW4(dst, stride, tp0, tp1, tp2, tp3);
740 dst0 = __msa_aver_u_b(src0, dst0);
742 ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dst, stride);
749 v16i8 dst0, dst1, dst2, dst3,
src0,
src1, src2, src3, src4, src5, src6;
750 v16i8 mask0, mask1, mask2, mask3, mask4, mask5, src7, vec11;
751 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
752 v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
753 v16i8 minus5b = __msa_ldi_b(-5);
754 v16i8 plus20b = __msa_ldi_b(20);
762 for (loop_cnt = 4; loop_cnt--;) {
763 LD_SB2(src, 16, src0, src1);
765 LD_SB2(src, 16, src2, src3);
767 LD_SB2(src, 16, src4, src5);
769 LD_SB2(src, 16, src6, src7);
773 VSHF_B2_SB(src0, src0, src0, src1, mask0, mask3, vec0, vec3);
774 VSHF_B2_SB(src2, src2, src2, src3, mask0, mask3, vec6, vec9);
775 VSHF_B2_SB(src0, src0, src0, src1, mask1, mask4, vec1, vec4);
776 VSHF_B2_SB(src2, src2, src2, src3, mask1, mask4, vec7, vec10);
777 VSHF_B2_SB(src0, src0, src0, src1, mask2, mask5, vec2, vec5);
778 VSHF_B2_SB(src2, src2, src2, src3, mask2, mask5, vec8, vec11);
779 HADD_SB4_SH(vec0, vec3, vec6, vec9, res0, res1, res2, res3);
780 DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
781 minus5b, res0, res1, res2, res3);
782 DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
783 plus20b, res0, res1, res2, res3);
784 VSHF_B2_SB(src4, src4, src4, src5, mask0, mask3, vec0, vec3);
785 VSHF_B2_SB(src6, src6, src6, src7, mask0, mask3, vec6, vec9);
786 VSHF_B2_SB(src4, src4, src4, src5, mask1, mask4, vec1, vec4);
787 VSHF_B2_SB(src6, src6, src6, src7, mask1, mask4, vec7, vec10);
788 VSHF_B2_SB(src4, src4, src4, src5, mask2, mask5, vec2, vec5);
789 VSHF_B2_SB(src6, src6, src6, src7, mask2, mask5, vec8, vec11);
790 HADD_SB4_SH(vec0, vec3, vec6, vec9, res4, res5, res6, res7);
791 DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
792 minus5b, res4, res5, res6, res7);
793 DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
794 plus20b, res4, res5, res6, res7);
795 SLDI_B2_SB(src1, src3, src0, src2, src0, src2, 2);
796 SLDI_B2_SB(src5, src7, src4, src6, src4, src6, 2);
803 dst0 = __msa_aver_s_b(dst0, src0);
804 dst1 = __msa_aver_s_b(dst1, src2);
805 dst2 = __msa_aver_s_b(dst2, src4);
806 dst3 = __msa_aver_s_b(dst3, src6);
808 ST_SB4(dst0, dst1, dst2, dst3, dst, stride);
817 v16i8 dst0, dst1, dst2, dst3,
src0,
src1, src2, src3, src4, src5, src6;
818 v16i8 mask0, mask1, mask2, mask3, mask4, mask5, src7, vec11;
819 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
820 v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
821 v16i8 minus5b = __msa_ldi_b(-5);
822 v16i8 plus20b = __msa_ldi_b(20);
830 for (loop_cnt = 4; loop_cnt--;) {
831 LD_SB2(src, 16, src0, src1);
833 LD_SB2(src, 16, src2, src3);
835 LD_SB2(src, 16, src4, src5);
837 LD_SB2(src, 16, src6, src7);
841 VSHF_B2_SB(src0, src0, src0, src1, mask0, mask3, vec0, vec3);
842 VSHF_B2_SB(src2, src2, src2, src3, mask0, mask3, vec6, vec9);
843 VSHF_B2_SB(src0, src0, src0, src1, mask1, mask4, vec1, vec4);
844 VSHF_B2_SB(src2, src2, src2, src3, mask1, mask4, vec7, vec10);
845 VSHF_B2_SB(src0, src0, src0, src1, mask2, mask5, vec2, vec5);
846 VSHF_B2_SB(src2, src2, src2, src3, mask2, mask5, vec8, vec11);
847 HADD_SB4_SH(vec0, vec3, vec6, vec9, res0, res1, res2, res3);
848 DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
849 minus5b, res0, res1, res2, res3);
850 DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
851 plus20b, res0, res1, res2, res3);
852 VSHF_B2_SB(src4, src4, src4, src5, mask0, mask3, vec0, vec3);
853 VSHF_B2_SB(src6, src6, src6, src7, mask0, mask3, vec6, vec9);
854 VSHF_B2_SB(src4, src4, src4, src5, mask1, mask4, vec1, vec4);
855 VSHF_B2_SB(src6, src6, src6, src7, mask1, mask4, vec7, vec10);
856 VSHF_B2_SB(src4, src4, src4, src5, mask2, mask5, vec2, vec5);
857 VSHF_B2_SB(src6, src6, src6, src7, mask2, mask5, vec8, vec11);
858 HADD_SB4_SH(vec0, vec3, vec6, vec9, res4, res5, res6, res7);
859 DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
860 minus5b, res4, res5, res6, res7);
861 DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
862 plus20b, res4, res5, res6, res7);
863 SLDI_B2_SB(src1, src3, src0, src2, src0, src2, 3);
864 SLDI_B2_SB(src5, src7, src4, src6, src4, src6, 3);
871 dst0 = __msa_aver_s_b(dst0, src0);
872 dst1 = __msa_aver_s_b(dst1, src2);
873 dst2 = __msa_aver_s_b(dst2, src4);
874 dst3 = __msa_aver_s_b(dst3, src6);
876 ST_SB4(dst0, dst1, dst2, dst3, dst, stride);
884 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, mask0, mask1, mask2;
885 v16i8 tmp0, tmp1, tmp2, tmp3, vec11;
886 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
887 v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
888 v16i8 minus5b = __msa_ldi_b(-5);
889 v16i8 plus20b = __msa_ldi_b(20);
892 LD_SB8(src - 2, stride, src0, src1, src2, src3, src4, src5, src6, src7);
894 VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
895 VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
896 HADD_SB4_SH(vec0, vec1, vec2, vec3, res0, res1, res2, res3);
897 VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec5);
898 VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec7);
899 DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
900 res0, res1, res2, res3);
901 VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec8, vec9);
902 VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec10, vec11);
903 DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b, plus20b,
904 res0, res1, res2, res3);
905 VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0, vec1);
906 VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec2, vec3);
907 HADD_SB4_SH(vec0, vec1, vec2, vec3, res4, res5, res6, res7);
908 VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec4, vec5);
909 VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec6, vec7);
910 DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
911 res4, res5, res6, res7);
912 VSHF_B2_SB(src4, src4, src5, src5, mask2, mask2, vec8, vec9);
913 VSHF_B2_SB(src6, src6, src7, src7, mask2, mask2, vec10, vec11);
914 DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b, plus20b,
915 res4, res5, res6, res7);
916 SLDI_B2_SB(src0, src1, src0, src1, src0, src1, 2);
917 SLDI_B2_SB(src2, src3, src2, src3, src2, src3, 2);
918 SLDI_B2_SB(src4, src5, src4, src5, src4, src5, 2);
919 SLDI_B2_SB(src6, src7, src6, src7, src6, src7, 2);
928 tmp0 = __msa_aver_s_b(tmp0, src0);
929 tmp1 = __msa_aver_s_b(tmp1, src1);
930 tmp2 = __msa_aver_s_b(tmp2, src4);
931 tmp3 = __msa_aver_s_b(tmp3, src5);
933 ST8x8_UB(tmp0, tmp1, tmp2, tmp3, dst, stride);
939 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, mask0, mask1, mask2;
940 v16i8 tmp0, tmp1, tmp2, tmp3, vec11;
941 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
942 v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
943 v16i8 minus5b = __msa_ldi_b(-5);
944 v16i8 plus20b = __msa_ldi_b(20);
947 LD_SB8(src - 2, stride, src0, src1, src2, src3, src4, src5, src6, src7);
949 VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
950 VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
951 HADD_SB4_SH(vec0, vec1, vec2, vec3, res0, res1, res2, res3);
952 VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec5);
953 VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec7);
954 DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
955 res0, res1, res2, res3);
956 VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec8, vec9);
957 VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec10, vec11);
958 DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b, plus20b,
959 res0, res1, res2, res3);
960 VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0, vec1);
961 VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec2, vec3);
962 HADD_SB4_SH(vec0, vec1, vec2, vec3, res4, res5, res6, res7);
963 VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec4, vec5);
964 VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec6, vec7);
965 DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
966 res4, res5, res6, res7);
967 VSHF_B2_SB(src4, src4, src5, src5, mask2, mask2, vec8, vec9);
968 VSHF_B2_SB(src6, src6, src7, src7, mask2, mask2, vec10, vec11);
969 DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b, plus20b,
970 res4, res5, res6, res7);
971 SLDI_B2_SB(src0, src1, src0, src1, src0, src1, 3);
972 SLDI_B2_SB(src2, src3, src2, src3, src2, src3, 3);
973 SLDI_B2_SB(src4, src5, src4, src5, src4, src5, 3);
974 SLDI_B2_SB(src6, src7, src6, src7, src6, src7, 3);
983 tmp0 = __msa_aver_s_b(tmp0, src0);
984 tmp1 = __msa_aver_s_b(tmp1, src1);
985 tmp2 = __msa_aver_s_b(tmp2, src4);
986 tmp3 = __msa_aver_s_b(tmp3, src5);
988 ST8x8_UB(tmp0, tmp1, tmp2, tmp3, dst, stride);
994 v16i8
src0,
src1, src2, src3, res, mask0, mask1, mask2;
995 v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
997 v16i8 minus5b = __msa_ldi_b(-5);
998 v16i8 plus20b = __msa_ldi_b(20);
1001 LD_SB4(src - 2, stride, src0, src1, src2, src3);
1003 VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0, vec1);
1005 VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2, vec3);
1006 DPADD_SB2_SH(vec2, vec3, minus5b, minus5b, res0, res1);
1007 VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4, vec5);
1008 DPADD_SB2_SH(vec4, vec5, plus20b, plus20b, res0, res1);
1011 res = __msa_pckev_b((v16i8) res1, (v16i8) res0);
1012 SLDI_B2_SB(src0, src1, src0, src1, src0, src1, 2);
1013 SLDI_B2_SB(src2, src3, src2, src3, src2, src3, 2);
1014 src0 = (v16i8) __msa_insve_w((v4i32)
src0, 1, (v4i32) src1);
1015 src1 = (v16i8) __msa_insve_w((v4i32) src2, 1, (v4i32) src3);
1016 src0 = (v16i8) __msa_insve_d((v2i64)
src0, 1, (v2i64) src1);
1017 res = __msa_aver_s_b(res, src0);
1018 res = (v16i8) __msa_xori_b((v16u8) res, 128);
1019 ST4x4_UB(res, res, 0, 1, 2, 3, dst, stride);
1025 v16i8
src0,
src1, src2, src3, res, mask0, mask1, mask2;
1026 v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
1028 v16i8 minus5b = __msa_ldi_b(-5);
1029 v16i8 plus20b = __msa_ldi_b(20);
1032 LD_SB4(src - 2, stride, src0, src1, src2, src3);
1034 VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0, vec1);
1036 VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2, vec3);
1037 DPADD_SB2_SH(vec2, vec3, minus5b, minus5b, res0, res1);
1038 VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4, vec5);
1039 DPADD_SB2_SH(vec4, vec5, plus20b, plus20b, res0, res1);
1042 res = __msa_pckev_b((v16i8) res1, (v16i8) res0);
1043 SLDI_B2_SB(src0, src1, src0, src1, src0, src1, 3);
1044 SLDI_B2_SB(src2, src3, src2, src3, src2, src3, 3);
1045 src0 = (v16i8) __msa_insve_w((v4i32)
src0, 1, (v4i32) src1);
1046 src1 = (v16i8) __msa_insve_w((v4i32) src2, 1, (v4i32) src3);
1047 src0 = (v16i8) __msa_insve_d((v2i64)
src0, 1, (v2i64) src1);
1048 res = __msa_aver_s_b(res, src0);
1049 res = (v16i8) __msa_xori_b((v16u8) res, 128);
1050 ST4x4_UB(res, res, 0, 1, 2, 3, dst, stride);
1057 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, mask0, mask1, mask2;
1058 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
1060 v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
1061 v16i8 minus5b = __msa_ldi_b(-5);
1062 v16i8 plus20b = __msa_ldi_b(20);
1067 for (loop_cnt = 4; loop_cnt--;) {
1068 LD_SB2(src, 8, src0, src1);
1070 LD_SB2(src, 8, src2, src3);
1072 LD_SB2(src, 8, src4, src5);
1074 LD_SB2(src, 8, src6, src7);
1078 VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec3);
1079 VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec6, vec9);
1080 VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec1, vec4);
1081 VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec7, vec10);
1082 VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec2, vec5);
1083 VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec8, vec11);
1084 HADD_SB4_SH(vec0, vec3, vec6, vec9, res0, res1, res2, res3);
1085 DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
1086 minus5b, res0, res1, res2, res3);
1087 DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
1088 plus20b, res0, res1, res2, res3);
1089 VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0, vec3);
1090 VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec6, vec9);
1091 VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec1, vec4);
1092 VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec7, vec10);
1093 VSHF_B2_SB(src4, src4, src5, src5, mask2, mask2, vec2, vec5);
1094 VSHF_B2_SB(src6, src6, src7, src7, mask2, mask2, vec8, vec11);
1095 HADD_SB4_SH(vec0, vec3, vec6, vec9, res4, res5, res6, res7);
1096 DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
1097 minus5b, res4, res5, res6, res7);
1098 DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
1099 plus20b, res4, res5, res6, res7);
1104 PCKEV_B4_SB(res1, res0, res3, res2, res5, res4, res7, res6, vec0, vec1,
1107 ST_SB4(vec0, vec1, vec2, vec3, dst, stride);
/*
 * Horizontal 6-tap pass over eight source rows (body only: the function
 * header and several statements are absent from this listing).
 * Each row is shuffled three times (mask0, mask1, mask2).  The mask0 result
 * seeds a 16-bit accumulator via HADD_SB4_SH, then the mask1 and mask2
 * results are accumulated with the byte multipliers -5 and +20.
 * NOTE(review): no visible line assigns mask0..mask2, declares vec11, or
 * produces out0..out3 -- confirm against the complete source.
 */
1115 v16u8 out0, out1, out2, out3;
1116 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, mask0, mask1, mask2;
1117 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
1119 v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
/* Byte vectors built from the immediates -5 and 20 (the tap multipliers). */
1120 v16i8 minus5b = __msa_ldi_b(-5);
1121 v16i8 plus20b = __msa_ldi_b(20);
/* Load eight rows, starting two bytes before src and stepping by stride. */
1124 LD_SB8(src - 2, stride, src0, src1, src2, src3, src4, src5, src6, src7);
/* Rows 0..3: mask0 shuffle seeds res0..res3. */
1126 VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
1127 VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
1128 HADD_SB4_SH(vec0, vec1, vec2, vec3, res0, res1, res2, res3);
/* Rows 0..3: mask1 shuffle, accumulated with the -5 multiplier. */
1129 VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec5);
1130 VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec7);
1131 DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
1132 res0, res1, res2, res3);
/* Rows 0..3: mask2 shuffle, accumulated with the +20 multiplier. */
1133 VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec8, vec9);
1134 VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec10, vec11);
1135 DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b,
1136 plus20b, res0, res1, res2, res3);
/* Rows 4..7: same three steps, accumulating into res4..res7. */
1137 VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0, vec1);
1138 VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec2, vec3);
1139 HADD_SB4_SH(vec0, vec1, vec2, vec3, res4, res5, res6, res7);
1140 VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec4, vec5);
1141 VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec6, vec7);
1142 DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
1143 res4, res5, res6, res7);
1144 VSHF_B2_SB(src4, src4, src5, src5, mask2, mask2, vec8, vec9);
1145 VSHF_B2_SB(src6, src6, src7, src7, mask2, mask2, vec10, vec11);
1146 DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b,
1147 plus20b, res4, res5, res6, res7);
/* Write out0..out3 to dst; presumably an 8x8 byte block, going by the
 * ST8x8_UB name (macro defined outside this excerpt). */
1156 ST8x8_UB(out0, out1, out2, out3, dst, stride);
/*
 * Horizontal 6-tap pass over four source rows (body only: the function
 * header and several statements are absent from this listing).
 * Each VSHF_B2_SB call here takes two different rows per shuffle
 * (src0 with src1, src2 with src3), so res0 and res1 each cover two rows.
 * NOTE(review): res0/res1 and out are not declared, mask0..mask2 are not
 * assigned, and the step that seeds res0/res1 from vec0/vec1 is not visible
 * -- confirm against the complete source.
 */
1163 v16i8
src0,
src1, src2, src3, mask0, mask1, mask2;
1164 v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
/* Byte vectors built from the immediates -5 and 20 (the tap multipliers). */
1166 v16i8 minus5b = __msa_ldi_b(-5);
1167 v16i8 plus20b = __msa_ldi_b(20);
/* Load four rows, starting two bytes before src and stepping by stride. */
1170 LD_SB4(src - 2, stride, src0, src1, src2, src3);
/* mask0 shuffle of both row pairs. */
1172 VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0, vec1);
/* mask1 shuffle, accumulated with the -5 multiplier. */
1174 VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2, vec3);
1175 DPADD_SB2_SH(vec2, vec3, minus5b, minus5b, res0, res1);
/* mask2 shuffle, accumulated with the +20 multiplier. */
1176 VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4, vec5);
1177 DPADD_SB2_SH(vec4, vec5, plus20b, plus20b, res0, res1);
/* Store from the single vector out (passed twice, indices 0..3) to dst;
 * presumably a 4x4 byte block, going by the ST4x4_UB name. */
1181 ST4x4_UB(out, out, 0, 1, 2, 3, dst, stride);
/*
 * Vertical 6-tap pass that yields four result vectors per loop iteration and
 * then averages them with the source rows src2, src3, src4 and src5.
 * Body only: the function header, the loop's closing brace and several
 * statements are absent from this listing.
 */
/* Each 16-bit constant packs two signed byte taps, low byte first:
 * 0xfb01 -> (1, -5), 0x1414 -> (20, 20), 0x01fb -> (-5, 1).
 * NOTE(review): 0xfb01 is outside the int16_t range, so this initialiser
 * relies on implementation-defined narrowing. */
1188 int16_t filt_const0 = 0xfb01;
1189 int16_t filt_const1 = 0x1414;
1190 int16_t filt_const2 = 0x1fb;
1191 v16u8 res0, res1, res2, res3;
1192 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8;
1193 v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
1194 v16i8 src87_r, src10_l, src32_l, src54_l, src76_l, src21_l, src43_l;
1195 v16i8 src65_l, src87_l, filt0, filt1, filt2;
1196 v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
/* Replicate each tap pair across a whole vector. */
1198 filt0 = (v16i8) __msa_fill_h(filt_const0);
1199 filt1 = (v16i8) __msa_fill_h(filt_const1);
1200 filt2 = (v16i8) __msa_fill_h(filt_const2);
/* Move src back by two rows, then load the first five rows. */
1202 src -= (stride * 2);
1204 LD_SB5(src, stride, src0, src1, src2, src3, src4);
/* Pair adjacent rows (src1 with src0, src2 with src1, ...) into the *_r and
 * *_l operands; presumably right/left byte interleaves -- the macro bodies
 * are outside this excerpt. */
1208 ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
1210 ILVL_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_l, src21_l,
/* Four iterations; each one loads four further rows (src5..src8). */
1213 for (loop_cnt = 4; loop_cnt--;) {
1214 LD_SB4(src, stride, src5, src6, src7, src8);
1218 ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r,
1219 src65_r, src76_r, src87_r);
1220 ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_l,
1221 src65_l, src76_l, src87_l);
/* AVC_DOT_SH3_SH computes in0*filt0 + in1*filt1 + in2*filt2 as signed dot
 * products; the three row-pair operands of each call span six consecutive
 * rows. */
1222 out0_r =
AVC_DOT_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
1223 out1_r =
AVC_DOT_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
1224 out2_r =
AVC_DOT_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
1225 out3_r =
AVC_DOT_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
1226 out0_l =
AVC_DOT_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2);
1227 out1_l =
AVC_DOT_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2);
1228 out2_l =
AVC_DOT_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2);
1229 out3_l =
AVC_DOT_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2);
/* Saturate the 16-bit sums (SAT_SH4_SH with argument 7). */
1231 SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
1233 SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
/* Combine each *_l / *_r pair into one byte vector. */
1234 PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
1235 out3_r, res0, res1, res2, res3);
/* Signed byte average of each filtered row with src2..src5. */
1236 res0 = (v16u8) __msa_aver_s_b((v16i8) res0, src2);
1237 res1 = (v16u8) __msa_aver_s_b((v16i8) res1, src3);
1238 res2 = (v16u8) __msa_aver_s_b((v16i8) res2, src4);
1239 res3 = (v16u8) __msa_aver_s_b((v16i8) res3, src5);
/* Store the four result vectors to dst, stepping by stride. */
1241 ST_UB4(res0, res1, res2, res3, dst, stride);
/*
 * Vertical 6-tap pass that yields four result vectors per loop iteration and
 * then averages them with the source rows src3, src4, src5 and src6.
 * Body only: the function header, the loop's closing brace and several
 * statements are absent from this listing.
 */
/* Each 16-bit constant packs two signed byte taps, low byte first:
 * 0xfb01 -> (1, -5), 0x1414 -> (20, 20), 0x01fb -> (-5, 1).
 * NOTE(review): 0xfb01 is outside the int16_t range, so this initialiser
 * relies on implementation-defined narrowing. */
1262 int16_t filt_const0 = 0xfb01;
1263 int16_t filt_const1 = 0x1414;
1264 int16_t filt_const2 = 0x1fb;
1265 v16u8 res0, res1, res2, res3;
1266 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8;
1267 v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
1268 v16i8 src87_r, src10_l, src32_l, src54_l, src76_l, src21_l, src43_l;
1269 v16i8 src65_l, src87_l, filt0, filt1, filt2;
1270 v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
/* Replicate each tap pair across a whole vector. */
1272 filt0 = (v16i8) __msa_fill_h(filt_const0);
1273 filt1 = (v16i8) __msa_fill_h(filt_const1);
1274 filt2 = (v16i8) __msa_fill_h(filt_const2);
/* Move src back by two rows, then load the first five rows. */
1276 src -= (stride * 2);
1278 LD_SB5(src, stride, src0, src1, src2, src3, src4);
/* Pair adjacent rows into the *_r and *_l operands; presumably right/left
 * byte interleaves -- the macro bodies are outside this excerpt. */
1282 ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
1284 ILVL_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_l, src21_l,
/* Four iterations; each one loads four further rows (src5..src8). */
1287 for (loop_cnt = 4; loop_cnt--;) {
1288 LD_SB4(src, stride, src5, src6, src7, src8);
1292 ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r,
1293 src65_r, src76_r, src87_r);
1294 ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_l,
1295 src65_l, src76_l, src87_l);
/* AVC_DOT_SH3_SH computes in0*filt0 + in1*filt1 + in2*filt2 as signed dot
 * products; each call's operands span six consecutive rows. */
1296 out0_r =
AVC_DOT_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
1297 out1_r =
AVC_DOT_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
1298 out2_r =
AVC_DOT_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
1299 out3_r =
AVC_DOT_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
1300 out0_l =
AVC_DOT_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2);
1301 out1_l =
AVC_DOT_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2);
1302 out2_l =
AVC_DOT_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2);
1303 out3_l =
AVC_DOT_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2);
/* Saturate the 16-bit sums (SAT_SH4_SH with argument 7). */
1305 SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
1307 SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
/* Combine each *_l / *_r pair into one byte vector. */
1308 PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
1309 out3_r, res0, res1, res2, res3);
/* Signed byte average of each filtered row with src3..src6. */
1310 res0 = (v16u8) __msa_aver_s_b((v16i8) res0, src3);
1311 res1 = (v16u8) __msa_aver_s_b((v16i8) res1, src4);
1312 res2 = (v16u8) __msa_aver_s_b((v16i8) res2, src5);
1313 res3 = (v16u8) __msa_aver_s_b((v16i8) res3, src6);
/* Store the four result vectors to dst, stepping by stride. */
1315 ST_UB4(res0, res1, res2, res3, dst, stride);
/*
 * Vertical 6-tap pass over thirteen loaded rows producing eight filtered
 * rows, which are packed two per vector and averaged with tmp0..tmp3.
 * Body only: the function header and several statements are absent from
 * this listing.
 * NOTE(review): tmp0..tmp3 are not assigned on any visible line, so which
 * source rows feed the average cannot be told from here.
 */
/* Each 16-bit constant packs two signed byte taps, low byte first:
 * 0xfb01 -> (1, -5), 0x1414 -> (20, 20), 0x01fb -> (-5, 1).
 * NOTE(review): 0xfb01 is outside the int16_t range, so this initialiser
 * relies on implementation-defined narrowing. */
1334 const int16_t filt_const0 = 0xfb01;
1335 const int16_t filt_const1 = 0x1414;
1336 const int16_t filt_const2 = 0x1fb;
1337 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1338 v16i8 src11, src12, src10_r, src32_r, src54_r, src65_r, src76_r, src98_r;
1339 v16i8 src21_r, src43_r, src87_r, src109_r, src1211_r, src1110_r;
1340 v16i8 tmp0, tmp1, tmp2, tmp3, filt0, filt1, filt2, out0, out1, out2, out3;
1341 v8i16 out0_r, out1_r, out2_r, out3_r, out4_r, out5_r, out6_r, out7_r;
/* Replicate each tap pair across a whole vector. */
1343 filt0 = (v16i8) __msa_fill_h(filt_const0);
1344 filt1 = (v16i8) __msa_fill_h(filt_const1);
1345 filt2 = (v16i8) __msa_fill_h(filt_const2);
/* Move src back by two rows, then load rows src0..src4 and src5..src12. */
1347 src -= (stride * 2);
1349 LD_SB5(src, stride, src0, src1, src2, src3, src4);
1351 LD_SB8(src, stride, src5, src6, src7, src8, src9, src10, src11, src12);
/* Presumably XORs each byte with 128, going by the macro name -- TODO
 * confirm. */
1352 XORI_B8_128_SB(src5, src6, src7, src8, src9, src10, src11, src12);
/* Pair adjacent rows into the *_r operands (only the ILVR form is used
 * here). */
1354 ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
1356 ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
1358 ILVR_B4_SB(src9, src8, src10, src9, src11, src10, src12, src11, src98_r,
1359 src109_r, src1110_r, src1211_r);
/* Eight outputs; AVC_DOT_SH3_SH computes in0*filt0 + in1*filt1 + in2*filt2,
 * with each call's operands spanning six consecutive rows. */
1360 out0_r =
AVC_DOT_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
1361 out1_r =
AVC_DOT_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
1362 out2_r =
AVC_DOT_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
1363 out3_r =
AVC_DOT_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
1364 out4_r =
AVC_DOT_SH3_SH(src54_r, src76_r, src98_r, filt0, filt1, filt2);
1365 out5_r =
AVC_DOT_SH3_SH(src65_r, src87_r, src109_r, filt0, filt1, filt2);
1366 out6_r =
AVC_DOT_SH3_SH(src76_r, src98_r, src1110_r, filt0, filt1, filt2);
1367 out7_r =
AVC_DOT_SH3_SH(src87_r, src109_r, src1211_r, filt0, filt1, filt2);
/* Saturate the 16-bit sums (SAT_SH4_SH with argument 7). */
1372 SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
1373 SAT_SH4_SH(out4_r, out5_r, out6_r, out7_r, 7);
/* Pack two 16-bit result vectors into each byte vector out0..out3. */
1374 PCKEV_B2_SB(out1_r, out0_r, out3_r, out2_r, out0, out1);
1375 PCKEV_B2_SB(out5_r, out4_r, out7_r, out6_r, out2, out3);
/* Signed byte average with tmp0..tmp3. */
1376 out0 = __msa_aver_s_b(out0, tmp0);
1377 out1 = __msa_aver_s_b(out1, tmp1);
1378 out2 = __msa_aver_s_b(out2, tmp2);
1379 out3 = __msa_aver_s_b(out3, tmp3);
/* Write out0..out3 to dst; presumably an 8x8 byte block, going by the
 * ST8x8_UB name. */
1381 ST8x8_UB(out0, out1, out2, out3, dst, stride);
/*
 * Vertical 6-tap pass over thirteen loaded rows producing eight filtered
 * rows, which are packed two per vector and averaged with tmp0..tmp3.
 * Body only: the function header and several statements are absent from
 * this listing.
 * NOTE(review): tmp0..tmp3 are not assigned on any visible line, so which
 * source rows feed the average cannot be told from here.
 */
/* Each 16-bit constant packs two signed byte taps, low byte first:
 * 0xfb01 -> (1, -5), 0x1414 -> (20, 20), 0x01fb -> (-5, 1).
 * NOTE(review): 0xfb01 is outside the int16_t range, so this initialiser
 * relies on implementation-defined narrowing. */
1387 const int16_t filt_const0 = 0xfb01;
1388 const int16_t filt_const1 = 0x1414;
1389 const int16_t filt_const2 = 0x1fb;
1390 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1391 v16i8 src11, src12, src10_r, src32_r, src54_r, src65_r, src76_r, src98_r;
1392 v16i8 src21_r, src43_r, src87_r, src109_r, src1211_r, src1110_r;
1393 v16i8 filt0, filt1, filt2, out0, out1, out2, out3, tmp0, tmp1, tmp2, tmp3;
1394 v8i16 out0_r, out1_r, out2_r, out3_r, out4_r, out5_r, out6_r, out7_r;
/* Replicate each tap pair across a whole vector. */
1396 filt0 = (v16i8) __msa_fill_h(filt_const0);
1397 filt1 = (v16i8) __msa_fill_h(filt_const1);
1398 filt2 = (v16i8) __msa_fill_h(filt_const2);
/* Move src back by two rows, then load rows src0..src4 and src5..src12. */
1400 src -= (stride * 2);
1402 LD_SB5(src, stride, src0, src1, src2, src3, src4);
1404 LD_SB8(src, stride, src5, src6, src7, src8, src9, src10, src11, src12);
/* Presumably XORs each byte with 128, going by the macro name -- TODO
 * confirm. */
1406 XORI_B8_128_SB(src5, src6, src7, src8, src9, src10, src11, src12);
/* Pair adjacent rows into the *_r operands (only the ILVR form is used
 * here). */
1407 ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
1409 ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
1411 ILVR_B4_SB(src9, src8, src10, src9, src11, src10, src12, src11, src98_r,
1412 src109_r, src1110_r, src1211_r);
/* Eight outputs; AVC_DOT_SH3_SH computes in0*filt0 + in1*filt1 + in2*filt2,
 * with each call's operands spanning six consecutive rows. */
1413 out0_r =
AVC_DOT_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
1414 out1_r =
AVC_DOT_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
1415 out2_r =
AVC_DOT_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
1416 out3_r =
AVC_DOT_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
1417 out4_r =
AVC_DOT_SH3_SH(src54_r, src76_r, src98_r, filt0, filt1, filt2);
1418 out5_r =
AVC_DOT_SH3_SH(src65_r, src87_r, src109_r, filt0, filt1, filt2);
1419 out6_r =
AVC_DOT_SH3_SH(src76_r, src98_r, src1110_r, filt0, filt1, filt2);
1420 out7_r =
AVC_DOT_SH3_SH(src87_r, src109_r, src1211_r, filt0, filt1, filt2);
/* Saturate the 16-bit sums (SAT_SH4_SH with argument 7). */
1425 SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
1426 SAT_SH4_SH(out4_r, out5_r, out6_r, out7_r, 7);
/* Pack two 16-bit result vectors into each byte vector out0..out3. */
1427 PCKEV_B2_SB(out1_r, out0_r, out3_r, out2_r, out0, out1);
1428 PCKEV_B2_SB(out5_r, out4_r, out7_r, out6_r, out2, out3);
/* Signed byte average with tmp0..tmp3. */
1429 out0 = __msa_aver_s_b(out0, tmp0);
1430 out1 = __msa_aver_s_b(out1, tmp1);
1431 out2 = __msa_aver_s_b(out2, tmp2);
1432 out3 = __msa_aver_s_b(out3, tmp3);
/* Write out0..out3 to dst; presumably an 8x8 byte block, going by the
 * ST8x8_UB name. */
1434 ST8x8_UB(out0, out1, out2, out3, dst, stride);
/*
 * Vertical 6-tap pass over nine loaded rows yielding two 16-bit result
 * vectors (out10, out32), followed by an unsigned byte average with data
 * gathered from src2, src3, src4 and src5.
 * Body only: the function header and several statements are absent from
 * this listing.
 * NOTE(review): out10, out32 and out are not declared, and out is not
 * assigned, on any visible line.
 */
/* Each 16-bit constant packs two signed byte taps, low byte first:
 * 0xfb01 -> (1, -5), 0x1414 -> (20, 20), 0x01fb -> (-5, 1).
 * NOTE(review): 0xfb01 is outside the int16_t range, so this initialiser
 * relies on implementation-defined narrowing. */
1440 int16_t filt_const0 = 0xfb01;
1441 int16_t filt_const1 = 0x1414;
1442 int16_t filt_const2 = 0x1fb;
1444 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8;
1445 v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
1446 v16i8 src87_r, src2110, src4332, src6554, src8776, filt0, filt1, filt2;
/* Replicate each tap pair across a whole vector. */
1449 filt0 = (v16i8) __msa_fill_h(filt_const0);
1450 filt1 = (v16i8) __msa_fill_h(filt_const1);
1451 filt2 = (v16i8) __msa_fill_h(filt_const2);
/* Move src back by two rows, then load the first five rows. */
1453 src -= (stride * 2);
1455 LD_SB5(src, stride, src0, src1, src2, src3, src4);
/* Pair adjacent rows, then merge two row-pair vectors into one
 * (src21_r with src10_r -> src2110, src43_r with src32_r -> src4332). */
1457 ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
1459 ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
/* Same treatment for the next four rows (src5..src8). */
1461 LD_SB4(src, stride, src5, src6, src7, src8);
1462 ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
1464 ILVR_D2_SB(src65_r, src54_r, src87_r, src76_r, src6554, src8776);
/* AVC_DOT_SH3_SH computes in0*filt0 + in1*filt1 + in2*filt2 as signed dot
 * products. */
1466 out10 =
AVC_DOT_SH3_SH(src2110, src4332, src6554, filt0, filt1, filt2);
1467 out32 =
AVC_DOT_SH3_SH(src4332, src6554, src8776, filt0, filt1, filt2);
/* Reuse src32_r/src54_r as scratch: gather word 0 of src2, src3, src4 and
 * src5 into a single vector (insve copies element 0 of the last operand into
 * the indexed element of the first). */
1471 src32_r = (v16i8) __msa_insve_w((v4i32) src2, 1, (v4i32) src3);
1472 src54_r = (v16i8) __msa_insve_w((v4i32) src4, 1, (v4i32) src5);
1473 src32_r = (v16i8) __msa_insve_d((v2i64) src32_r, 1, (v2i64) src54_r);
/* Unsigned byte average of out with the gathered source words. */
1474 out = __msa_aver_u_b(out, (v16u8) src32_r);
/* Store from out (passed twice, indices 0..3) to dst; presumably a 4x4 byte
 * block, going by the ST4x4_UB name. */
1475 ST4x4_UB(out, out, 0, 1, 2, 3, dst, stride);
/*
 * Vertical 6-tap pass over nine loaded rows yielding two 16-bit result
 * vectors (out10, out32), followed by an unsigned byte average with data
 * gathered from src3, src4, src5 and src6.
 * Body only: the function header and several statements are absent from
 * this listing.
 * NOTE(review): out10, out32 and out are not declared, and out is not
 * assigned, on any visible line.
 */
/* Each 16-bit constant packs two signed byte taps, low byte first:
 * 0xfb01 -> (1, -5), 0x1414 -> (20, 20), 0x01fb -> (-5, 1).
 * NOTE(review): 0xfb01 is outside the int16_t range, so this initialiser
 * relies on implementation-defined narrowing. */
1481 int16_t filt_const0 = 0xfb01;
1482 int16_t filt_const1 = 0x1414;
1483 int16_t filt_const2 = 0x1fb;
1485 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8;
1486 v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
1487 v16i8 src87_r, src2110, src4332, src6554, src8776, filt0, filt1, filt2;
/* Replicate each tap pair across a whole vector. */
1490 filt0 = (v16i8) __msa_fill_h(filt_const0);
1491 filt1 = (v16i8) __msa_fill_h(filt_const1);
1492 filt2 = (v16i8) __msa_fill_h(filt_const2);
/* Move src back by two rows, then load the first five rows. */
1494 src -= (stride * 2);
1496 LD_SB5(src, stride, src0, src1, src2, src3, src4);
/* Pair adjacent rows, then merge two row-pair vectors into one
 * (src21_r with src10_r -> src2110, src43_r with src32_r -> src4332). */
1498 ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
1500 ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
/* Same treatment for the next four rows (src5..src8). */
1502 LD_SB4(src, stride, src5, src6, src7, src8);
1503 ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
1505 ILVR_D2_SB(src65_r, src54_r, src87_r, src76_r, src6554, src8776);
/* AVC_DOT_SH3_SH computes in0*filt0 + in1*filt1 + in2*filt2 as signed dot
 * products. */
1507 out10 =
AVC_DOT_SH3_SH(src2110, src4332, src6554, filt0, filt1, filt2);
1508 out32 =
AVC_DOT_SH3_SH(src4332, src6554, src8776, filt0, filt1, filt2);
/* Reuse src32_r/src54_r as scratch: gather word 0 of src3, src4, src5 and
 * src6 into a single vector (insve copies element 0 of the last operand into
 * the indexed element of the first). */
1512 src32_r = (v16i8) __msa_insve_w((v4i32) src3, 1, (v4i32) src4);
1513 src54_r = (v16i8) __msa_insve_w((v4i32) src5, 1, (v4i32) src6);
1514 src32_r = (v16i8) __msa_insve_d((v2i64) src32_r, 1, (v2i64) src54_r);
/* Unsigned byte average of out with the gathered source words. */
1515 out = __msa_aver_u_b(out, (v16u8) src32_r);
/* Store from out (passed twice, indices 0..3) to dst; presumably a 4x4 byte
 * block, going by the ST4x4_UB name. */
1516 ST4x4_UB(out, out, 0, 1, 2, 3, dst, stride);
/*
 * Two-stage pass, run as 2 outer x 4 inner iterations: rows of hz_out*
 * values are paired up and filtered (results land in dst0/dst2/dst4/dst6
 * via tmp0/tmp1), then averaged with rounded-down-shifted copies of
 * hz_out2..hz_out5.
 * Body only: the function header, both closing braces and many statements
 * are absent from this listing.
 * NOTE(review): the lines that compute hz_out0..hz_out8 and tmp0/tmp1, and
 * the final store, are not visible; filt0..filt2 are set up here but their
 * use is on elided lines.
 */
1601 uint32_t multiple8_cnt, loop_cnt;
/* Each 32-bit constant packs two signed 16-bit taps, low half first:
 * 0xfffb0001 -> (1, -5), 0x00140014 -> (20, 20), 0x0001fffb -> (-5, 1).
 * NOTE(review): 0xfffb0001 is outside the int32_t range, so this
 * initialiser relies on implementation-defined conversion. */
1602 const int32_t filt_const0 = 0xfffb0001;
1603 const int32_t filt_const1 = 0x140014;
1604 const int32_t filt_const2 = 0x1fffb;
1606 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8, mask0, mask1;
1608 v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
1609 v8i16 hz_out7, hz_out8, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
1610 v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
1611 v8i16 hz_out65_r, hz_out76_r, hz_out87_r, hz_out10_l, hz_out21_l;
1612 v8i16 hz_out32_l, hz_out43_l, hz_out54_l, hz_out65_l, hz_out76_l;
1613 v8i16 hz_out87_l, filt0, filt1, filt2;
/* Replicate each tap pair across a whole vector. */
1616 filt0 = (v8i16) __msa_fill_w(filt_const0);
1617 filt1 = (v8i16) __msa_fill_w(filt_const1);
1618 filt2 = (v8i16) __msa_fill_w(filt_const2);
/* Outer loop: two iterations, each starting with a five-row load. */
1622 for (multiple8_cnt = 2; multiple8_cnt--;) {
1626 LD_SB5(src, stride, src0, src1, src2, src3, src4);
/* Inner loop: four iterations, each loading four further rows. */
1636 for (loop_cnt = 4; loop_cnt--;) {
1637 LD_SB4(src, stride, src5, src6, src7, src8);
/* Pair adjacent hz_out rows into the *_r and *_l operands. */
1647 ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2,
1648 hz_out4, hz_out3, hz_out10_r, hz_out21_r, hz_out32_r,
1650 ILVL_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2,
1651 hz_out4, hz_out3, hz_out10_l, hz_out21_l, hz_out32_l,
1653 ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6,
1654 hz_out8, hz_out7, hz_out54_r, hz_out65_r, hz_out76_r,
1656 ILVL_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6,
1657 hz_out8, hz_out7, hz_out54_l, hz_out65_l, hz_out76_l,
/* Pack the even 16-bit lanes of tmp1 and tmp0 into one vector per row. */
1664 dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
1669 dst2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
1674 dst4 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
1679 dst6 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
/* Rounded arithmetic right shift by 5 of hz_out2..hz_out5. */
1681 dst1 = __msa_srari_h(hz_out2, 5);
1682 dst3 = __msa_srari_h(hz_out3, 5);
1683 dst5 = __msa_srari_h(hz_out4, 5);
1684 dst7 = __msa_srari_h(hz_out5, 5);
/* Signed 16-bit average of each packed row with its shifted counterpart;
 * results are compacted into dst0..dst3. */
1687 dst0 = __msa_aver_s_h(dst0, dst1);
1688 dst1 = __msa_aver_s_h(dst2, dst3);
1689 dst2 = __msa_aver_s_h(dst4, dst5);
1690 dst3 = __msa_aver_s_h(dst6, dst7);
/*
 * Two-stage pass, run as 2 outer x 4 inner iterations: rows of hz_out*
 * values are paired up and filtered (results land in dst0/dst2/dst4/dst6
 * via tmp0/tmp1), then averaged with rounded-down-shifted copies of
 * hz_out3..hz_out6.
 * Body only: the function header, both closing braces and many statements
 * are absent from this listing.
 * NOTE(review): the lines that compute hz_out0..hz_out8 and tmp0/tmp1, and
 * the final store, are not visible; filt0..filt2 are set up here but their
 * use is on elided lines.
 */
1714 uint32_t multiple8_cnt, loop_cnt;
/* Each 32-bit constant packs two signed 16-bit taps, low half first:
 * 0xfffb0001 -> (1, -5), 0x00140014 -> (20, 20), 0x0001fffb -> (-5, 1).
 * NOTE(review): 0xfffb0001 is outside the int32_t range, so this
 * initialiser relies on implementation-defined conversion. */
1715 const int32_t filt_const0 = 0xfffb0001;
1716 const int32_t filt_const1 = 0x140014;
1717 const int32_t filt_const2 = 0x1fffb;
1719 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8, mask0, mask1;
1721 v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
1722 v8i16 hz_out7, hz_out8, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
1723 v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
1724 v8i16 hz_out65_r, hz_out76_r, hz_out87_r, hz_out10_l, hz_out21_l;
1725 v8i16 hz_out32_l, hz_out43_l, hz_out54_l, hz_out65_l, hz_out76_l;
1726 v8i16 hz_out87_l, filt0, filt1, filt2;
/* Replicate each tap pair across a whole vector. */
1729 filt0 = (v8i16) __msa_fill_w(filt_const0);
1730 filt1 = (v8i16) __msa_fill_w(filt_const1);
1731 filt2 = (v8i16) __msa_fill_w(filt_const2);
/* Outer loop: two iterations, each starting with a five-row load. */
1735 for (multiple8_cnt = 2; multiple8_cnt--;) {
1739 LD_SB5(src, stride, src0, src1, src2, src3, src4);
/* Inner loop: four iterations, each loading four further rows. */
1749 for (loop_cnt = 4; loop_cnt--;) {
1750 LD_SB4(src, stride, src5, src6, src7, src8);
/* Pair adjacent hz_out rows into the *_r and *_l operands. */
1760 ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2,
1761 hz_out4, hz_out3, hz_out10_r, hz_out21_r, hz_out32_r,
1763 ILVL_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2,
1764 hz_out4, hz_out3, hz_out10_l, hz_out21_l, hz_out32_l,
1766 ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6,
1767 hz_out8, hz_out7, hz_out54_r, hz_out65_r, hz_out76_r,
1769 ILVL_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6,
1770 hz_out8, hz_out7, hz_out54_l, hz_out65_l, hz_out76_l,
/* Pack the even 16-bit lanes of tmp1 and tmp0 into one vector per row. */
1777 dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
1782 dst2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
1787 dst4 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
1792 dst6 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
/* Rounded arithmetic right shift by 5 of hz_out3..hz_out6. */
1794 dst1 = __msa_srari_h(hz_out3, 5);
1795 dst3 = __msa_srari_h(hz_out4, 5);
1796 dst5 = __msa_srari_h(hz_out5, 5);
1797 dst7 = __msa_srari_h(hz_out6, 5);
/* Signed 16-bit average of each packed row with its shifted counterpart;
 * results are compacted into dst0..dst3. */
1800 dst0 = __msa_aver_s_h(dst0, dst1);
1801 dst1 = __msa_aver_s_h(dst2, dst3);
1802 dst2 = __msa_aver_s_h(dst4, dst5);
1803 dst3 = __msa_aver_s_h(dst6, dst7);
/*
 * Two-stage pass handled as two groups of four output rows.  In each group
 * the paired hz_out rows go through AVC_DOT_SW3_SW (three 32-bit dot
 * products, then a rounded shift by 10 and a saturate), the right/left
 * halves are packed into dst0..dst3, and the result is averaged with
 * shifted-and-saturated hz_out rows: hz_out2..hz_out5 for the first group,
 * hz_out6..hz_out9 for the second.
 * Body only: the function header and many statements are absent from this
 * listing.
 * NOTE(review): the lines computing hz_out0..hz_out12, the declarations of
 * tmp0/tmp1, the last argument line of every AVC_DOT_SW3_SW call and the
 * stores are not visible.
 */
/* Each 32-bit constant packs two signed 16-bit taps, low half first:
 * 0xfffb0001 -> (1, -5), 0x00140014 -> (20, 20), 0x0001fffb -> (-5, 1).
 * NOTE(review): 0xfffb0001 is outside the int32_t range, so this
 * initialiser relies on implementation-defined conversion. */
1825 const int32_t filt_const0 = 0xfffb0001;
1826 const int32_t filt_const1 = 0x140014;
1827 const int32_t filt_const2 = 0x1fffb;
1829 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1830 v16i8 src11, src12, mask0, mask1, mask2;
1831 v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
1832 v8i16 hz_out7, hz_out8, hz_out9, hz_out10, hz_out11, hz_out12;
1833 v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
1834 v8i16 hz_out65_r, hz_out76_r, hz_out87_r, hz_out89_r, hz_out910_r;
1835 v8i16 hz_out1110_r, hz_out1211_r, dst0, dst1, dst2, dst3;
1836 v8i16 hz_out10_l, hz_out21_l, hz_out32_l, hz_out43_l, hz_out54_l;
1837 v8i16 hz_out65_l, hz_out76_l, hz_out87_l, hz_out89_l, hz_out910_l;
1838 v8i16 hz_out1110_l, hz_out1211_l, filt0, filt1, filt2;
/* Replicate each tap pair across a whole vector. */
1843 filt0 = (v8i16) __msa_fill_w(filt_const0);
1844 filt1 = (v8i16) __msa_fill_w(filt_const1);
1845 filt2 = (v8i16) __msa_fill_w(filt_const2);
/* Move src back by two rows and two bytes, then load nine rows. */
1847 src -= ((2 *
stride) + 2);
1849 LD_SB5(src, stride, src0, src1, src2, src3, src4);
1859 LD_SB4(src, stride, src5, src6, src7, src8);
/* Pair adjacent hz_out rows into the *_r and *_l operands. */
1868 ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
1869 hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r);
1870 ILVL_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
1871 hz_out3, hz_out10_l, hz_out21_l, hz_out32_l, hz_out43_l);
1872 ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
1873 hz_out7, hz_out54_r, hz_out65_r, hz_out76_r, hz_out87_r);
1874 ILVL_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
1875 hz_out7, hz_out54_l, hz_out65_l, hz_out76_l, hz_out87_l);
/* First group: filter the right and left halves of each output row, then
 * pack the even 16-bit lanes of the two 32-bit results together. */
1877 tmp0 =
AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, filt1,
1879 tmp1 =
AVC_DOT_SW3_SW(hz_out10_l, hz_out32_l, hz_out54_l, filt0, filt1,
1881 dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
1882 tmp0 =
AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, filt1,
1884 tmp1 =
AVC_DOT_SW3_SW(hz_out21_l, hz_out43_l, hz_out65_l, filt0, filt1,
1886 dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
1887 tmp0 =
AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, filt1,
1889 tmp1 =
AVC_DOT_SW3_SW(hz_out32_l, hz_out54_l, hz_out76_l, filt0, filt1,
1891 dst2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
1892 tmp0 =
AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, filt1,
1894 tmp1 =
AVC_DOT_SW3_SW(hz_out43_l, hz_out65_l, hz_out87_l, filt0, filt1,
1896 dst3 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
/* Round-shift hz_out2..hz_out5 by 5 and saturate them in place, then take
 * the signed 16-bit average with dst0..dst3. */
1898 SRARI_H4_SH(hz_out2, hz_out3, hz_out4, hz_out5, 5);
1899 SAT_SH4_SH(hz_out2, hz_out3, hz_out4, hz_out5, 7);
1901 dst0 = __msa_aver_s_h(dst0, hz_out2);
1902 dst1 = __msa_aver_s_h(dst1, hz_out3);
1903 dst2 = __msa_aver_s_h(dst2, hz_out4);
1904 dst3 = __msa_aver_s_h(dst3, hz_out5);
/* Second group: four more rows (src9..src12). */
1911 LD_SB4(src, stride, src9, src10, src11, src12);
1917 ILVR_H4_SH(hz_out9, hz_out8, hz_out10, hz_out9, hz_out11, hz_out10,
1918 hz_out12, hz_out11, hz_out89_r, hz_out910_r, hz_out1110_r,
1920 ILVL_H4_SH(hz_out9, hz_out8, hz_out10, hz_out9, hz_out11, hz_out10,
1921 hz_out12, hz_out11, hz_out89_l, hz_out910_l, hz_out1110_l,
1923 tmp0 =
AVC_DOT_SW3_SW(hz_out54_r, hz_out76_r, hz_out89_r, filt0, filt1,
1925 tmp1 =
AVC_DOT_SW3_SW(hz_out54_l, hz_out76_l, hz_out89_l, filt0, filt1,
1927 dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
1928 tmp0 =
AVC_DOT_SW3_SW(hz_out65_r, hz_out87_r, hz_out910_r, filt0, filt1,
1930 tmp1 =
AVC_DOT_SW3_SW(hz_out65_l, hz_out87_l, hz_out910_l, filt0, filt1,
1932 dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
1933 tmp0 =
AVC_DOT_SW3_SW(hz_out76_r, hz_out89_r, hz_out1110_r, filt0, filt1,
1935 tmp1 =
AVC_DOT_SW3_SW(hz_out76_l, hz_out89_l, hz_out1110_l, filt0, filt1,
1937 dst2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
1938 tmp0 =
AVC_DOT_SW3_SW(hz_out87_r, hz_out910_r, hz_out1211_r, filt0, filt1,
1940 tmp1 =
AVC_DOT_SW3_SW(hz_out87_l, hz_out910_l, hz_out1211_l, filt0, filt1,
1942 dst3 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
/* Round-shift hz_out6..hz_out9 by 5 and saturate them in place, then take
 * the signed 16-bit average with dst0..dst3. */
1944 SRARI_H4_SH(hz_out6, hz_out7, hz_out8, hz_out9, 5);
1945 SAT_SH4_SH(hz_out6, hz_out7, hz_out8, hz_out9, 7);
1947 dst0 = __msa_aver_s_h(dst0, hz_out6);
1948 dst1 = __msa_aver_s_h(dst1, hz_out7);
1949 dst2 = __msa_aver_s_h(dst2, hz_out8);
1950 dst3 = __msa_aver_s_h(dst3, hz_out9);
/*
 * Two-stage pass handled as two groups of four output rows.  In each group
 * the paired hz_out rows go through AVC_DOT_SW3_SW (three 32-bit dot
 * products, then a rounded shift by 10 and a saturate), the right/left
 * halves are packed into dst0..dst3, and the result is averaged with
 * shifted-and-saturated hz_out rows: hz_out3..hz_out6 for the first group,
 * hz_out7..hz_out10 for the second.
 * Body only: the function header and many statements are absent from this
 * listing.
 * NOTE(review): the lines computing hz_out0..hz_out12, the declarations of
 * tmp0/tmp1, the last argument line of every AVC_DOT_SW3_SW call and the
 * stores are not visible.
 */
/* Each 32-bit constant packs two signed 16-bit taps, low half first:
 * 0xfffb0001 -> (1, -5), 0x00140014 -> (20, 20), 0x0001fffb -> (-5, 1).
 * NOTE(review): 0xfffb0001 is outside the int32_t range, so this
 * initialiser relies on implementation-defined conversion. */
1960 const int32_t filt_const0 = 0xfffb0001;
1961 const int32_t filt_const1 = 0x140014;
1962 const int32_t filt_const2 = 0x1fffb;
1964 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1965 v16i8 src11, src12, mask0, mask1, mask2;
1966 v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
1967 v8i16 hz_out7, hz_out8, hz_out9, hz_out10, hz_out11, hz_out12;
1968 v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
1969 v8i16 hz_out65_r, hz_out76_r, hz_out87_r, hz_out89_r, hz_out910_r;
1970 v8i16 hz_out1110_r, hz_out1211_r, dst0, dst1, dst2, dst3;
1971 v8i16 hz_out10_l, hz_out21_l, hz_out32_l, hz_out43_l, hz_out54_l;
1972 v8i16 hz_out65_l, hz_out76_l, hz_out87_l, hz_out89_l, hz_out910_l;
1973 v8i16 hz_out1110_l, hz_out1211_l, filt0, filt1, filt2;
/* Replicate each tap pair across a whole vector. */
1978 filt0 = (v8i16) __msa_fill_w(filt_const0);
1979 filt1 = (v8i16) __msa_fill_w(filt_const1);
1980 filt2 = (v8i16) __msa_fill_w(filt_const2);
/* Move src back by two rows and two bytes, then load nine rows. */
1982 src -= ((2 *
stride) + 2);
1984 LD_SB5(src, stride, src0, src1, src2, src3, src4);
1994 LD_SB4(src, stride, src5, src6, src7, src8);
/* Pair adjacent hz_out rows into the *_r and *_l operands. */
2003 ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
2004 hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r);
2005 ILVL_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
2006 hz_out3, hz_out10_l, hz_out21_l, hz_out32_l, hz_out43_l);
2007 ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
2008 hz_out7, hz_out54_r, hz_out65_r, hz_out76_r, hz_out87_r);
2009 ILVL_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
2010 hz_out7, hz_out54_l, hz_out65_l, hz_out76_l, hz_out87_l);
/* First group: filter the right and left halves of each output row, then
 * pack the even 16-bit lanes of the two 32-bit results together. */
2012 tmp0 =
AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, filt1,
2014 tmp1 =
AVC_DOT_SW3_SW(hz_out10_l, hz_out32_l, hz_out54_l, filt0, filt1,
2016 dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
2017 tmp0 =
AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, filt1,
2019 tmp1 =
AVC_DOT_SW3_SW(hz_out21_l, hz_out43_l, hz_out65_l, filt0, filt1,
2021 dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
2022 tmp0 =
AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, filt1,
2024 tmp1 =
AVC_DOT_SW3_SW(hz_out32_l, hz_out54_l, hz_out76_l, filt0, filt1,
2026 dst2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
2027 tmp0 =
AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, filt1,
2029 tmp1 =
AVC_DOT_SW3_SW(hz_out43_l, hz_out65_l, hz_out87_l, filt0, filt1,
2031 dst3 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
/* Round-shift hz_out3..hz_out6 by 5 and saturate them in place, then take
 * the signed 16-bit average with dst0..dst3. */
2033 SRARI_H4_SH(hz_out3, hz_out4, hz_out5, hz_out6, 5);
2034 SAT_SH4_SH(hz_out3, hz_out4, hz_out5, hz_out6, 7);
2036 dst0 = __msa_aver_s_h(dst0, hz_out3);
2037 dst1 = __msa_aver_s_h(dst1, hz_out4);
2038 dst2 = __msa_aver_s_h(dst2, hz_out5);
2039 dst3 = __msa_aver_s_h(dst3, hz_out6);
/* Second group: four more rows (src9..src12). */
2046 LD_SB4(src, stride, src9, src10, src11, src12);
2052 ILVR_H4_SH(hz_out9, hz_out8, hz_out10, hz_out9, hz_out11, hz_out10,
2053 hz_out12, hz_out11, hz_out89_r, hz_out910_r, hz_out1110_r,
2055 ILVL_H4_SH(hz_out9, hz_out8, hz_out10, hz_out9, hz_out11, hz_out10,
2056 hz_out12, hz_out11, hz_out89_l, hz_out910_l, hz_out1110_l,
2058 tmp0 =
AVC_DOT_SW3_SW(hz_out54_r, hz_out76_r, hz_out89_r, filt0, filt1,
2060 tmp1 =
AVC_DOT_SW3_SW(hz_out54_l, hz_out76_l, hz_out89_l, filt0, filt1,
2062 dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
2063 tmp0 =
AVC_DOT_SW3_SW(hz_out65_r, hz_out87_r, hz_out910_r, filt0, filt1,
2065 tmp1 =
AVC_DOT_SW3_SW(hz_out65_l, hz_out87_l, hz_out910_l, filt0, filt1,
2067 dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
2068 tmp0 =
AVC_DOT_SW3_SW(hz_out76_r, hz_out89_r, hz_out1110_r, filt0, filt1,
2070 tmp1 =
AVC_DOT_SW3_SW(hz_out76_l, hz_out89_l, hz_out1110_l, filt0, filt1,
2072 dst2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
2073 tmp0 =
AVC_DOT_SW3_SW(hz_out87_r, hz_out910_r, hz_out1211_r, filt0, filt1,
2075 tmp1 =
AVC_DOT_SW3_SW(hz_out87_l, hz_out910_l, hz_out1211_l, filt0, filt1,
2077 dst3 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
/* Round-shift hz_out7..hz_out10 by 5 and saturate them in place, then take
 * the signed 16-bit average with dst0..dst3. */
2079 SRARI_H4_SH(hz_out7, hz_out8, hz_out9, hz_out10, 5);
2080 SAT_SH4_SH(hz_out7, hz_out8, hz_out9, hz_out10, 7);
2082 dst0 = __msa_aver_s_h(dst0, hz_out7);
2083 dst1 = __msa_aver_s_h(dst1, hz_out8);
2084 dst2 = __msa_aver_s_h(dst2, hz_out9);
2085 dst3 = __msa_aver_s_h(dst3, hz_out10);
/*
 * Two-stage pass for a narrow block: paired hz_out rows go through
 * AVC_DOT_SW3_SW (three 32-bit dot products, then a rounded shift by 10 and
 * a saturate), two results are packed into each of dst0 and dst1, and those
 * are averaged with hz_out2 and hz_out4.
 * Body only: the function header and many statements are absent from this
 * listing.
 * NOTE(review): the lines computing hz_out0/2/4/6/8, any adjustment made to
 * hz_out2/hz_out4 before the average, the declarations of tmp0/tmp1/res and
 * the last argument line of every AVC_DOT_SW3_SW call are not visible.
 */
/* Each 32-bit constant packs two signed 16-bit taps, low half first:
 * 0xfffb0001 -> (1, -5), 0x00140014 -> (20, 20), 0x0001fffb -> (-5, 1).
 * NOTE(review): 0xfffb0001 is outside the int32_t range, so this
 * initialiser relies on implementation-defined conversion. */
2095 const int32_t filt_const0 = 0xfffb0001;
2096 const int32_t filt_const1 = 0x140014;
2097 const int32_t filt_const2 = 0x1fffb;
2099 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8;
2100 v16i8 mask0, mask1, mask2;
2101 v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
2102 v8i16 hz_out7, hz_out8, dst0, dst1, filt0, filt1, filt2;
2103 v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
2104 v8i16 hz_out65_r, hz_out76_r, hz_out87_r;
/* Replicate each tap pair across a whole vector. */
2109 filt0 = (v8i16) __msa_fill_w(filt_const0);
2110 filt1 = (v8i16) __msa_fill_w(filt_const1);
2111 filt2 = (v8i16) __msa_fill_w(filt_const2);
/* Move src back by two rows and two bytes, then load nine rows. */
2113 src -= ((2 *
stride) + 2);
2115 LD_SB5(src, stride, src0, src1, src2, src3, src4);
2117 LD_SB4(src, stride, src5, src6, src7, src8);
/* Derive the odd-numbered rows hz_out1/3/5/7 from hz_out0/2/4/6; presumably
 * by taking the upper doubleword, going by the PCKOD_D name -- TODO
 * confirm. */
2127 PCKOD_D2_SH(hz_out0, hz_out0, hz_out2, hz_out2, hz_out1, hz_out3);
2128 PCKOD_D2_SH(hz_out4, hz_out4, hz_out6, hz_out6, hz_out5, hz_out7);
/* Pair adjacent hz_out rows into the *_r operands. */
2130 ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
2131 hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r);
2132 ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
2133 hz_out7, hz_out54_r, hz_out65_r, hz_out76_r, hz_out87_r);
/* Filter two output rows at a time and pack their even 16-bit lanes. */
2135 tmp0 =
AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, filt1,
2137 tmp1 =
AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, filt1,
2139 dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
2140 tmp0 =
AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, filt1,
2142 tmp1 =
AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, filt1,
2144 dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
/* Signed 16-bit average with hz_out2 and hz_out4. */
2149 dst0 = __msa_aver_s_h(dst0, hz_out2);
2150 dst1 = __msa_aver_s_h(dst1, hz_out4);
/* Store from res (passed twice, indices 0..3) to dst; presumably a 4x4 byte
 * block, going by the ST4x4_UB name. */
2153 ST4x4_UB(res, res, 0, 1, 2, 3, dst, stride);
/*
 * Two-stage pass for a narrow block: paired hz_out rows go through
 * AVC_DOT_SW3_SW (three 32-bit dot products, then a rounded shift by 10 and
 * a saturate), two results are packed into each of dst0 and dst1, and those
 * are averaged with hz_out0/hz_out1 after these have been rebuilt from
 * hz_out3..hz_out6 by PCKEV_D2_SH.
 * Body only: the function header and many statements are absent from this
 * listing.
 * NOTE(review): the lines computing hz_out0/2/4/6/8, any adjustment made to
 * the rebuilt hz_out0/hz_out1 before the average, the declarations of
 * tmp0/tmp1/res and the last argument line of every AVC_DOT_SW3_SW call are
 * not visible.
 */
/* Each 32-bit constant packs two signed 16-bit taps, low half first:
 * 0xfffb0001 -> (1, -5), 0x00140014 -> (20, 20), 0x0001fffb -> (-5, 1).
 * NOTE(review): 0xfffb0001 is outside the int32_t range, so this
 * initialiser relies on implementation-defined conversion. */
2159 const int32_t filt_const0 = 0xfffb0001;
2160 const int32_t filt_const1 = 0x140014;
2161 const int32_t filt_const2 = 0x1fffb;
2163 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8;
2164 v16i8 mask0, mask1, mask2;
2165 v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
2166 v8i16 hz_out7, hz_out8, dst0, dst1, filt0, filt1, filt2;
2167 v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
2168 v8i16 hz_out65_r, hz_out76_r, hz_out87_r;
/* Replicate each tap pair across a whole vector. */
2173 filt0 = (v8i16) __msa_fill_w(filt_const0);
2174 filt1 = (v8i16) __msa_fill_w(filt_const1);
2175 filt2 = (v8i16) __msa_fill_w(filt_const2);
/* Move src back by two rows and two bytes, then load nine rows. */
2177 src -= ((2 *
stride) + 2);
2179 LD_SB5(src, stride, src0, src1, src2, src3, src4);
2181 LD_SB4(src, stride, src5, src6, src7, src8);
/* Derive the odd-numbered rows hz_out1/3/5/7 from hz_out0/2/4/6; presumably
 * by taking the upper doubleword, going by the PCKOD_D name -- TODO
 * confirm. */
2191 PCKOD_D2_SH(hz_out0, hz_out0, hz_out2, hz_out2, hz_out1, hz_out3);
2192 PCKOD_D2_SH(hz_out4, hz_out4, hz_out6, hz_out6, hz_out5, hz_out7);
/* Pair adjacent hz_out rows into the *_r operands. */
2194 ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
2195 hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r);
2196 ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
2197 hz_out7, hz_out54_r, hz_out65_r, hz_out76_r, hz_out87_r);
/* Filter two output rows at a time and pack their even 16-bit lanes. */
2199 tmp0 =
AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, filt1,
2201 tmp1 =
AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, filt1,
2203 dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
2204 tmp0 =
AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, filt1,
2206 tmp1 =
AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, filt1,
2208 dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
/* Overwrite hz_out0/hz_out1 with doublewords combined from hz_out3/hz_out4
 * and hz_out5/hz_out6. */
2210 PCKEV_D2_SH(hz_out4, hz_out3, hz_out6, hz_out5, hz_out0, hz_out1);
/* Signed 16-bit average with the rebuilt hz_out0 and hz_out1. */
2214 dst0 = __msa_aver_s_h(dst0, hz_out0);
2215 dst1 = __msa_aver_s_h(dst1, hz_out1);
/* Store from res (passed twice, indices 0..3) to dst; presumably a 4x4 byte
 * block, going by the ST4x4_UB name. */
2218 ST4x4_UB(res, res, 0, 1, 2, 3, dst, stride);
/*
 * Vertical 6-tap pass that yields four result vectors per loop iteration
 * and stores them directly; no averaging step appears in the visible lines.
 * Body only: the function header, the loop's closing brace and several
 * statements are absent from this listing.
 */
/* Each 16-bit constant packs two signed byte taps, low byte first:
 * 0xfb01 -> (1, -5), 0x1414 -> (20, 20), 0x01fb -> (-5, 1).
 * NOTE(review): 0xfb01 is outside the int16_t range, so this initialiser
 * relies on implementation-defined narrowing. */
2225 int16_t filt_const0 = 0xfb01;
2226 int16_t filt_const1 = 0x1414;
2227 int16_t filt_const2 = 0x1fb;
2228 v16u8 res0, res1, res2, res3;
2229 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8;
2230 v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
2231 v16i8 src87_r, src10_l, src32_l, src54_l, src76_l, src21_l, src43_l;
2232 v16i8 src65_l, src87_l, filt0, filt1, filt2;
2233 v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
/* Replicate each tap pair across a whole vector. */
2235 filt0 = (v16i8) __msa_fill_h(filt_const0);
2236 filt1 = (v16i8) __msa_fill_h(filt_const1);
2237 filt2 = (v16i8) __msa_fill_h(filt_const2);
/* Move src back by two rows, then load the first five rows. */
2238 src -= (stride * 2);
2240 LD_SB5(src, stride, src0, src1, src2, src3, src4);
/* Pair adjacent rows into the *_r and *_l operands; presumably right/left
 * byte interleaves -- the macro bodies are outside this excerpt. */
2244 ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
2246 ILVL_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_l, src21_l,
/* Four iterations; each one loads four further rows (src5..src8). */
2249 for (loop_cnt = 4; loop_cnt--;) {
2250 LD_SB4(src, stride, src5, src6, src7, src8);
2254 ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r,
2255 src65_r, src76_r, src87_r);
2256 ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_l,
2257 src65_l, src76_l, src87_l);
/* AVC_DOT_SH3_SH computes in0*filt0 + in1*filt1 + in2*filt2 as signed dot
 * products; each call's operands span six consecutive rows. */
2258 out0_r =
AVC_DOT_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
2259 out1_r =
AVC_DOT_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
2260 out2_r =
AVC_DOT_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
2261 out3_r =
AVC_DOT_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
2262 out0_l =
AVC_DOT_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2);
2263 out1_l =
AVC_DOT_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2);
2264 out2_l =
AVC_DOT_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2);
2265 out3_l =
AVC_DOT_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2);
/* Saturate the 16-bit sums (SAT_SH4_SH with argument 7). */
2267 SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
2269 SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
/* Combine each *_l / *_r pair into one byte vector. */
2270 PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
2271 out3_r, res0, res1, res2, res3);
/* Store the four result vectors to dst, stepping by stride. */
2273 ST_UB4(res0, res1, res2, res3, dst, stride);
/*
 * Vertical 6-tap pass over thirteen loaded rows producing eight filtered
 * rows, stored through out0..out3.
 * Body only: the function header and several statements are absent from
 * this listing.
 * NOTE(review): out0..out3 are not assigned on any visible line.
 * NOTE(review): the digits in some operand names do not match the rows they
 * hold.  Assuming the ILVR_B4_SB outputs follow the input pair order,
 * src76_r and src87_r receive the (src5, src4) and (src6, src5) pairs, and
 * src89_r/src910_r receive (src9, src8) and (src10, src9).  Read the
 * dot-product operands by data flow rather than by name.
 */
/* Each 16-bit constant packs two signed byte taps, low byte first:
 * 0xfb01 -> (1, -5), 0x1414 -> (20, 20), 0x01fb -> (-5, 1).
 * NOTE(review): 0xfb01 is outside the int16_t range, so this initialiser
 * relies on implementation-defined narrowing. */
2291 const int16_t filt_const0 = 0xfb01;
2292 const int16_t filt_const1 = 0x1414;
2293 const int16_t filt_const2 = 0x1fb;
2294 v16u8 out0, out1, out2, out3;
2295 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
2296 v16i8 src11, src12, src10_r, src21_r, src32_r, src43_r, src76_r, src87_r;
2297 v16i8 src98_r, src109_r, src89_r, src910_r, src1110_r, src1211_r;
2298 v16i8 filt0, filt1, filt2;
2299 v8i16 out0_r, out1_r, out2_r, out3_r, out4_r, out5_r, out6_r, out7_r;
/* Replicate each tap pair across a whole vector. */
2301 filt0 = (v16i8) __msa_fill_h(filt_const0);
2302 filt1 = (v16i8) __msa_fill_h(filt_const1);
2303 filt2 = (v16i8) __msa_fill_h(filt_const2);
/* Move src back by two rows, then load rows src0..src7 and src8..src12. */
2305 src -= (stride * 2);
2307 LD_SB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7);
2309 LD_SB5(src, stride, src8, src9, src10, src11, src12);
/* Pair adjacent rows into the *_r operands (only the ILVR form is used
 * here). */
2310 ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
2312 ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src76_r, src87_r,
2314 ILVR_B4_SB(src9, src8, src10, src9, src11, src10, src12, src11, src89_r,
2315 src910_r, src1110_r, src1211_r);
/* Eight outputs; AVC_DOT_SH3_SH computes in0*filt0 + in1*filt1 +
 * in2*filt2 as signed dot products. */
2319 out0_r =
AVC_DOT_SH3_SH(src10_r, src32_r, src76_r, filt0, filt1, filt2);
2320 out1_r =
AVC_DOT_SH3_SH(src21_r, src43_r, src87_r, filt0, filt1, filt2);
2321 out2_r =
AVC_DOT_SH3_SH(src32_r, src76_r, src98_r, filt0, filt1, filt2);
2322 out3_r =
AVC_DOT_SH3_SH(src43_r, src87_r, src109_r, filt0, filt1, filt2);
2323 out4_r =
AVC_DOT_SH3_SH(src76_r, src98_r, src89_r, filt0, filt1, filt2);
2324 out5_r =
AVC_DOT_SH3_SH(src87_r, src109_r, src910_r, filt0, filt1, filt2);
2325 out6_r =
AVC_DOT_SH3_SH(src98_r, src89_r, src1110_r, filt0, filt1, filt2);
2326 out7_r =
AVC_DOT_SH3_SH(src109_r, src910_r, src1211_r, filt0, filt1, filt2);
/* Saturate the 16-bit sums (SAT_SH4_SH with argument 7). */
2329 SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
2330 SAT_SH4_SH(out4_r, out5_r, out6_r, out7_r, 7);
/* Write out0..out3 to dst; presumably an 8x8 byte block, going by the
 * ST8x8_UB name. */
2335 ST8x8_UB(out0, out1, out2, out3, dst, stride);
2341 const int16_t filt_const0 = 0xfb01;
2342 const int16_t filt_const1 = 0x1414;
2343 const int16_t filt_const2 = 0x1fb;
2345 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8;
2346 v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
2347 v16i8 src87_r, src2110, src4332, src6554, src8776, filt0, filt1, filt2;
2350 filt0 = (v16i8) __msa_fill_h(filt_const0);
2351 filt1 = (v16i8) __msa_fill_h(filt_const1);
2352 filt2 = (v16i8) __msa_fill_h(filt_const2);
2354 src -= (stride * 2);
2356 LD_SB5(src, stride, src0, src1, src2, src3, src4);
2358 LD_SB4(src, stride, src5, src6, src7, src8);
2360 ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
2362 ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
2364 ILVR_D4_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, src87_r,
2365 src76_r, src2110, src4332, src6554, src8776);
2367 out10 =
AVC_DOT_SH3_SH(src2110, src4332, src6554, filt0, filt1, filt2);
2368 out32 =
AVC_DOT_SH3_SH(src4332, src6554, src8776, filt0, filt1, filt2);
2372 ST4x4_UB(out, out, 0, 1, 2, 3, dst, stride);
2380 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
2382 v8i16 vt_res0, vt_res1, vt_res2, vt_res3, dst0, dst1, dst2, dst3, mask3;
2383 v8i16 shf_vec0, shf_vec1, shf_vec2, shf_vec3, shf_vec4, shf_vec5, shf_vec6;
2384 v8i16 shf_vec7, shf_vec8, shf_vec9, shf_vec10, shf_vec11, mask4, mask5;
2385 v4i32 hz_res0, hz_res1, hz_res2, hz_res3;
2386 v8i16 mask0 = { 0, 5, 1, 6, 2, 7, 3, 8 };
2387 v8i16 mask1 = { 1, 4, 2, 5, 3, 6, 4, 7 };
2388 v8i16 mask2 = { 2, 3, 3, 4, 4, 5, 5, 6 };
2389 v8i16 minus5h = __msa_ldi_h(-5);
2390 v8i16 plus20h = __msa_ldi_h(20);
2396 src -= ((2 *
stride) + 2);
2398 LD_SB5(src, stride, src0, src1, src2, src3, src4);
2399 LD_SB5(src + 8, stride, src7, src8, src9, src10, src11);
2404 for (row = 16; row--;) {
2405 LD_SB2(src, 8, src5, src6);
2413 VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0,
2414 mask1, mask2, shf_vec0, shf_vec1, shf_vec2);
2415 VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0,
2416 mask1, mask2, shf_vec3, shf_vec4, shf_vec5);
2417 VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask3,
2418 mask4, mask5, shf_vec6, shf_vec7, shf_vec8);
2419 VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask3,
2420 mask4, mask5, shf_vec9, shf_vec10, shf_vec11);
2421 hz_res0 = __msa_hadd_s_w(shf_vec0, shf_vec0);
2422 hz_res1 = __msa_hadd_s_w(shf_vec3, shf_vec3);
2423 hz_res2 = __msa_hadd_s_w(shf_vec6, shf_vec6);
2424 hz_res3 = __msa_hadd_s_w(shf_vec9, shf_vec9);
2425 DPADD_SH2_SW(shf_vec1, shf_vec2, minus5h, plus20h, hz_res0, hz_res0);
2426 DPADD_SH2_SW(shf_vec4, shf_vec5, minus5h, plus20h, hz_res1, hz_res1);
2427 DPADD_SH2_SW(shf_vec7, shf_vec8, minus5h, plus20h, hz_res2, hz_res2);
2428 DPADD_SH2_SW(shf_vec10, shf_vec11, minus5h, plus20h, hz_res3, hz_res3);
2429 SRARI_W4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 10);
2430 SAT_SW4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 7);
2431 dst0 = __msa_srari_h(shf_vec2, 5);
2432 dst1 = __msa_srari_h(shf_vec5, 5);
2433 dst2 = __msa_srari_h(shf_vec8, 5);
2434 dst3 = __msa_srari_h(shf_vec11, 5);
2437 PCKEV_H2_SH(hz_res2, hz_res0, hz_res3, hz_res1, dst2, dst3);
2438 dst0 = __msa_aver_s_h(dst2, dst0);
2439 dst1 = __msa_aver_s_h(dst3, dst1);
2462 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
2464 v8i16 vt_res0, vt_res1, vt_res2, vt_res3, dst0, dst1, dst2, dst3, mask3;
2465 v8i16 shf_vec0, shf_vec1, shf_vec2, shf_vec3, shf_vec4, shf_vec5, shf_vec6;
2466 v8i16 shf_vec7, shf_vec8, shf_vec9, shf_vec10, shf_vec11, mask4, mask5;
2467 v4i32 hz_res0, hz_res1, hz_res2, hz_res3;
2468 v8i16 mask0 = { 0, 5, 1, 6, 2, 7, 3, 8 };
2469 v8i16 mask1 = { 1, 4, 2, 5, 3, 6, 4, 7 };
2470 v8i16 mask2 = { 2, 3, 3, 4, 4, 5, 5, 6 };
2471 v8i16 minus5h = __msa_ldi_h(-5);
2472 v8i16 plus20h = __msa_ldi_h(20);
2478 src -= ((2 *
stride) + 2);
2480 LD_SB5(src, stride, src0, src1, src2, src3, src4);
2481 LD_SB5(src + 8, stride, src7, src8, src9, src10, src11);
2486 for (row = 16; row--;) {
2487 LD_SB2(src, 8, src5, src6);
2495 VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0,
2496 mask1, mask2, shf_vec0, shf_vec1, shf_vec2);
2497 VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0,
2498 mask1, mask2, shf_vec3, shf_vec4, shf_vec5);
2499 VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask3,
2500 mask4, mask5, shf_vec6, shf_vec7, shf_vec8);
2501 VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask3,
2502 mask4, mask5, shf_vec9, shf_vec10, shf_vec11);
2503 hz_res0 = __msa_hadd_s_w(shf_vec0, shf_vec0);
2504 hz_res1 = __msa_hadd_s_w(shf_vec3, shf_vec3);
2505 hz_res2 = __msa_hadd_s_w(shf_vec6, shf_vec6);
2506 hz_res3 = __msa_hadd_s_w(shf_vec9, shf_vec9);
2507 DPADD_SH2_SW(shf_vec1, shf_vec2, minus5h, plus20h, hz_res0, hz_res0);
2508 DPADD_SH2_SW(shf_vec4, shf_vec5, minus5h, plus20h, hz_res1, hz_res1);
2509 DPADD_SH2_SW(shf_vec7, shf_vec8, minus5h, plus20h, hz_res2, hz_res2);
2510 DPADD_SH2_SW(shf_vec10, shf_vec11, minus5h, plus20h, hz_res3, hz_res3);
2511 SRARI_W4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 10);
2512 SAT_SW4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 7);
2513 dst0 = __msa_srari_h(shf_vec2, 5);
2514 dst1 = __msa_srari_h(shf_vec5, 5);
2515 dst2 = __msa_srari_h(shf_vec8, 5);
2516 dst3 = __msa_srari_h(shf_vec11, 5);
2518 dst0 = __msa_pckod_h(dst2, dst0);
2519 dst1 = __msa_pckod_h(dst3, dst1);
2520 PCKEV_H2_SH(hz_res2, hz_res0, hz_res3, hz_res1, dst2, dst3);
2521 dst0 = __msa_aver_s_h(dst2, dst0);
2522 dst1 = __msa_aver_s_h(dst3, dst1);
2545 v16i8
src0,
src1, src2, src3, src4, src5, src6;
2546 v8i16 vt_res0, vt_res1, vt_res2, vt_res3, dst0, dst1, dst2, dst3;
2547 v8i16 shf_vec0, shf_vec1, shf_vec2, shf_vec3, shf_vec4, shf_vec5, shf_vec6;
2548 v8i16 shf_vec7, shf_vec8, shf_vec9, shf_vec10, shf_vec11;
2549 v8i16 mask3, mask4, mask5;
2550 v4i32 hz_res0, hz_res1, hz_res2, hz_res3;
2551 v8i16 mask0 = { 0, 5, 1, 6, 2, 7, 3, 8 };
2552 v8i16 mask1 = { 1, 4, 2, 5, 3, 6, 4, 7 };
2553 v8i16 mask2 = { 2, 3, 3, 4, 4, 5, 5, 6 };
2554 v8i16 minus5h = __msa_ldi_h(-5);
2555 v8i16 plus20h = __msa_ldi_h(20);
2561 src -= ((2 *
stride) + 2);
2563 LD_SB5(src, stride, src0, src1, src2, src3, src4);
2567 for (row = 4; row--;) {
2568 LD_SB2(src, stride, src5, src6);
2576 VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0,
2577 mask1, mask2, shf_vec0, shf_vec1, shf_vec2);
2578 VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0,
2579 mask1, mask2, shf_vec3, shf_vec4, shf_vec5);
2580 VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask3,
2581 mask4, mask5, shf_vec6, shf_vec7, shf_vec8);
2582 VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask3,
2583 mask4, mask5, shf_vec9, shf_vec10, shf_vec11);
2584 hz_res0 = __msa_hadd_s_w(shf_vec0, shf_vec0);
2585 hz_res1 = __msa_hadd_s_w(shf_vec3, shf_vec3);
2586 hz_res2 = __msa_hadd_s_w(shf_vec6, shf_vec6);
2587 hz_res3 = __msa_hadd_s_w(shf_vec9, shf_vec9);
2588 DPADD_SH2_SW(shf_vec1, shf_vec2, minus5h, plus20h, hz_res0, hz_res0);
2589 DPADD_SH2_SW(shf_vec4, shf_vec5, minus5h, plus20h, hz_res1, hz_res1);
2590 DPADD_SH2_SW(shf_vec7, shf_vec8, minus5h, plus20h, hz_res2, hz_res2);
2591 DPADD_SH2_SW(shf_vec10, shf_vec11, minus5h, plus20h, hz_res3, hz_res3);
2592 SRARI_W4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 10);
2593 SAT_SW4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 7);
2594 dst0 = __msa_srari_h(shf_vec2, 5);
2595 dst1 = __msa_srari_h(shf_vec5, 5);
2596 dst2 = __msa_srari_h(shf_vec8, 5);
2597 dst3 = __msa_srari_h(shf_vec11, 5);
2600 PCKEV_H2_SH(hz_res2, hz_res0, hz_res3, hz_res1, dst2, dst3);
2601 dst0 = __msa_aver_s_h(dst2, dst0);
2602 dst1 = __msa_aver_s_h(dst3, dst1);
2620 v16i8
src0,
src1, src2, src3, src4, src5, src6;
2621 v8i16 vt_res0, vt_res1, vt_res2, vt_res3, dst0, dst1, dst2, dst3;
2622 v8i16 shf_vec0, shf_vec1, shf_vec2, shf_vec3, shf_vec4, shf_vec5, shf_vec6;
2623 v8i16 shf_vec7, shf_vec8, shf_vec9, shf_vec10, shf_vec11;
2624 v8i16 mask3, mask4, mask5;
2625 v4i32 hz_res0, hz_res1, hz_res2, hz_res3;
2626 v8i16 mask0 = { 0, 5, 1, 6, 2, 7, 3, 8 };
2627 v8i16 mask1 = { 1, 4, 2, 5, 3, 6, 4, 7 };
2628 v8i16 mask2 = { 2, 3, 3, 4, 4, 5, 5, 6 };
2629 v8i16 minus5h = __msa_ldi_h(-5);
2630 v8i16 plus20h = __msa_ldi_h(20);
2636 src -= ((2 *
stride) + 2);
2638 LD_SB5(src, stride, src0, src1, src2, src3, src4);
2642 for (row = 4; row--;) {
2643 LD_SB2(src, stride, src5, src6);
2651 VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0,
2652 mask1, mask2, shf_vec0, shf_vec1, shf_vec2);
2653 VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0,
2654 mask1, mask2, shf_vec3, shf_vec4, shf_vec5);
2655 VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask3,
2656 mask4, mask5, shf_vec6, shf_vec7, shf_vec8);
2657 VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask3,
2658 mask4, mask5, shf_vec9, shf_vec10, shf_vec11);
2659 hz_res0 = __msa_hadd_s_w(shf_vec0, shf_vec0);
2660 hz_res1 = __msa_hadd_s_w(shf_vec3, shf_vec3);
2661 hz_res2 = __msa_hadd_s_w(shf_vec6, shf_vec6);
2662 hz_res3 = __msa_hadd_s_w(shf_vec9, shf_vec9);
2663 DPADD_SH2_SW(shf_vec1, shf_vec2, minus5h, plus20h, hz_res0, hz_res0);
2664 DPADD_SH2_SW(shf_vec4, shf_vec5, minus5h, plus20h, hz_res1, hz_res1);
2665 DPADD_SH2_SW(shf_vec7, shf_vec8, minus5h, plus20h, hz_res2, hz_res2);
2666 DPADD_SH2_SW(shf_vec10, shf_vec11, minus5h, plus20h, hz_res3, hz_res3);
2667 SRARI_W4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 10);
2668 SAT_SW4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 7);
2669 dst0 = __msa_srari_h(shf_vec2, 5);
2670 dst1 = __msa_srari_h(shf_vec5, 5);
2671 dst2 = __msa_srari_h(shf_vec8, 5);
2672 dst3 = __msa_srari_h(shf_vec11, 5);
2674 dst0 = __msa_pckod_h(dst2, dst0);
2675 dst1 = __msa_pckod_h(dst3, dst1);
2676 PCKEV_H2_SH(hz_res2, hz_res0, hz_res3, hz_res1, dst2, dst3);
2677 dst0 = __msa_aver_s_h(dst2, dst0);
2678 dst1 = __msa_aver_s_h(dst3, dst1);
2694 const int16_t filt_const0 = 0xfb01;
2695 const int16_t filt_const1 = 0x1414;
2696 const int16_t filt_const2 = 0x1fb;
2698 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8;
2699 v16i8 src10_r, src21_r, src32_r, src43_r, src54_r, src65_r, src76_r;
2700 v16i8 src87_r, src10_l, src21_l, src32_l, src43_l, src54_l, src65_l;
2701 v16i8 src76_l, src87_l, filt0, filt1, filt2;
2702 v8i16 vt_res0, vt_res1, vt_res2, vt_res3, dst0, dst1, dst2, dst3, shf_vec7;
2703 v8i16 shf_vec0, shf_vec1, shf_vec2, shf_vec3, shf_vec4, shf_vec5, shf_vec6;
2704 v4i32 hz_res0, hz_res1, hz_res2, hz_res3;
2705 v8i16 mask0 = { 0, 5, 1, 6, 2, 7, 3, 8 };
2706 v8i16 mask1 = { 1, 4, 2, 5, 3, 6, 4, 7 };
2707 v8i16 mask2 = { 2, 3, 3, 4, 4, 5, 5, 6 };
2708 v8i16 minus5h = __msa_ldi_h(-5);
2709 v8i16 plus20h = __msa_ldi_h(20);
2710 v8i16 zeros = { 0 };
2712 filt0 = (v16i8) __msa_fill_h(filt_const0);
2713 filt1 = (v16i8) __msa_fill_h(filt_const1);
2714 filt2 = (v16i8) __msa_fill_h(filt_const2);
2716 src -= ((2 *
stride) + 2);
2718 LD_SB5(src, stride, src0, src1, src2, src3, src4);
2721 LD_SB4(src, stride, src5, src6, src7, src8);
2724 ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
2726 ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
2728 ILVL_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_l, src21_l,
2730 ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_l, src65_l,
2732 vt_res0 =
AVC_DOT_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
2733 vt_res1 =
AVC_DOT_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2);
2734 vt_res2 =
AVC_DOT_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
2735 vt_res3 =
AVC_DOT_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2);
2736 VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0,
2737 mask1, mask2, shf_vec0, shf_vec1, shf_vec2);
2738 VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0,
2739 mask1, mask2, shf_vec3, shf_vec4, shf_vec5);
2740 hz_res0 = __msa_hadd_s_w(shf_vec0, shf_vec0);
2741 DPADD_SH2_SW(shf_vec1, shf_vec2, minus5h, plus20h, hz_res0, hz_res0);
2742 hz_res1 = __msa_hadd_s_w(shf_vec3, shf_vec3);
2743 DPADD_SH2_SW(shf_vec4, shf_vec5, minus5h, plus20h, hz_res1, hz_res1);
2745 vt_res0 =
AVC_DOT_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
2746 vt_res1 =
AVC_DOT_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2);
2747 vt_res2 =
AVC_DOT_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
2748 vt_res3 =
AVC_DOT_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2);
2749 VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0,
2750 mask1, mask2, shf_vec0, shf_vec1, shf_vec6);
2751 VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0,
2752 mask1, mask2, shf_vec3, shf_vec4, shf_vec7);
2753 hz_res2 = __msa_hadd_s_w(shf_vec0, shf_vec0);
2754 DPADD_SH2_SW(shf_vec1, shf_vec6, minus5h, plus20h, hz_res2, hz_res2);
2755 hz_res3 = __msa_hadd_s_w(shf_vec3, shf_vec3);
2756 DPADD_SH2_SW(shf_vec4, shf_vec7, minus5h, plus20h, hz_res3, hz_res3);
2763 dst0 = __msa_srari_h(shf_vec2, 5);
2764 dst1 = __msa_srari_h(shf_vec5, 5);
2765 dst2 = __msa_srari_h(shf_vec6, 5);
2766 dst3 = __msa_srari_h(shf_vec7, 5);
2770 ILVEV_H2_SH(dst0, zeros, dst1, zeros, dst0, dst1);
2771 ILVEV_H2_SH(dst2, zeros, dst3, zeros, dst2, dst3);
2773 hz_res0 = __msa_aver_s_w(hz_res0, (v4i32) dst0);
2774 hz_res1 = __msa_aver_s_w(hz_res1, (v4i32) dst1);
2775 hz_res2 = __msa_aver_s_w(hz_res2, (v4i32) dst2);
2776 hz_res3 = __msa_aver_s_w(hz_res3, (v4i32) dst3);
2778 PCKEV_H2_SH(hz_res1, hz_res0, hz_res3, hz_res2, dst0, dst2);
2780 ST4x4_UB(out, out, 0, 1, 2, 3, dst, stride);
2786 const int16_t filt_const0 = 0xfb01;
2787 const int16_t filt_const1 = 0x1414;
2788 const int16_t filt_const2 = 0x1fb;
2790 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8;
2791 v16i8 src10_r, src21_r, src32_r, src43_r, src54_r, src65_r, src76_r;
2792 v16i8 src87_r, src10_l, src21_l, src32_l, src43_l, src54_l, src65_l;
2793 v16i8 src76_l, src87_l, filt0, filt1, filt2;
2794 v8i16 vt_res0, vt_res1, vt_res2, vt_res3, dst0, dst1, dst2, dst3, shf_vec7;
2795 v8i16 shf_vec0, shf_vec1, shf_vec2, shf_vec3, shf_vec4, shf_vec5, shf_vec6;
2796 v4i32 hz_res0, hz_res1, hz_res2, hz_res3;
2797 v8i16 mask0 = { 0, 5, 1, 6, 2, 7, 3, 8 };
2798 v8i16 mask1 = { 1, 4, 2, 5, 3, 6, 4, 7 };
2799 v8i16 mask2 = { 2, 3, 3, 4, 4, 5, 5, 6 };
2800 v8i16 minus5h = __msa_ldi_h(-5);
2801 v8i16 plus20h = __msa_ldi_h(20);
2802 v8i16 zeros = { 0 };
2804 filt0 = (v16i8) __msa_fill_h(filt_const0);
2805 filt1 = (v16i8) __msa_fill_h(filt_const1);
2806 filt2 = (v16i8) __msa_fill_h(filt_const2);
2808 src -= ((2 *
stride) + 2);
2810 LD_SB5(src, stride, src0, src1, src2, src3, src4);
2813 LD_SB4(src, stride, src5, src6, src7, src8);
2816 ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
2818 ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
2820 ILVL_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_l, src21_l,
2822 ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_l, src65_l,
2825 vt_res0 =
AVC_DOT_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
2826 vt_res1 =
AVC_DOT_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2);
2827 vt_res2 =
AVC_DOT_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
2828 vt_res3 =
AVC_DOT_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2);
2829 VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0,
2830 mask1, mask2, shf_vec0, shf_vec1, shf_vec2);
2831 VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0,
2832 mask1, mask2, shf_vec3, shf_vec4, shf_vec5);
2833 hz_res0 = __msa_hadd_s_w(shf_vec0, shf_vec0);
2834 DPADD_SH2_SW(shf_vec1, shf_vec2, minus5h, plus20h, hz_res0, hz_res0);
2835 hz_res1 = __msa_hadd_s_w(shf_vec3, shf_vec3);
2836 DPADD_SH2_SW(shf_vec4, shf_vec5, minus5h, plus20h, hz_res1, hz_res1);
2838 vt_res0 =
AVC_DOT_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
2839 vt_res1 =
AVC_DOT_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2);
2840 vt_res2 =
AVC_DOT_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
2841 vt_res3 =
AVC_DOT_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2);
2842 VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0,
2843 mask1, mask2, shf_vec0, shf_vec1, shf_vec6);
2844 VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0,
2845 mask1, mask2, shf_vec3, shf_vec4, shf_vec7);
2846 hz_res2 = __msa_hadd_s_w(shf_vec0, shf_vec0);
2847 DPADD_SH2_SW(shf_vec1, shf_vec6, minus5h, plus20h, hz_res2, hz_res2);
2848 hz_res3 = __msa_hadd_s_w(shf_vec3, shf_vec3);
2849 DPADD_SH2_SW(shf_vec4, shf_vec7, minus5h, plus20h, hz_res3, hz_res3);
2856 dst0 = __msa_srari_h(shf_vec2, 5);
2857 dst1 = __msa_srari_h(shf_vec5, 5);
2858 dst2 = __msa_srari_h(shf_vec6, 5);
2859 dst3 = __msa_srari_h(shf_vec7, 5);
2864 dst0 = __msa_ilvod_h(zeros, dst0);
2865 dst1 = __msa_ilvod_h(zeros, dst1);
2866 dst2 = __msa_ilvod_h(zeros, dst2);
2867 dst3 = __msa_ilvod_h(zeros, dst3);
2869 hz_res0 = __msa_aver_s_w(hz_res0, (v4i32) dst0);
2870 hz_res1 = __msa_aver_s_w(hz_res1, (v4i32) dst1);
2871 hz_res2 = __msa_aver_s_w(hz_res2, (v4i32) dst2);
2872 hz_res3 = __msa_aver_s_w(hz_res3, (v4i32) dst3);
2874 PCKEV_H2_SH(hz_res1, hz_res0, hz_res3, hz_res2, dst0, dst2);
2876 ST4x4_UB(out, out, 0, 1, 2, 3, dst, stride);
2882 const int32_t filt_const0 = 0xfffb0001;
2883 const int32_t filt_const1 = 0x140014;
2884 const int32_t filt_const2 = 0x1fffb;
2887 uint32_t multiple8_cnt, loop_cnt;
2889 v16i8
src0,
src1, src2, src3, src4, mask0, mask1, mask2;
2890 v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
2891 v8i16 hz_out7, hz_out8, dst0, dst1, dst2, dst3;
2892 v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
2893 v8i16 hz_out65_r, hz_out76_r, hz_out87_r, hz_out10_l, hz_out21_l;
2894 v8i16 hz_out32_l, hz_out43_l, hz_out54_l, hz_out65_l, hz_out76_l;
2895 v8i16 hz_out87_l, filt0, filt1, filt2;
2898 filt0 = (v8i16) __msa_fill_w(filt_const0);
2899 filt1 = (v8i16) __msa_fill_w(filt_const1);
2900 filt2 = (v8i16) __msa_fill_w(filt_const2);
2904 for (multiple8_cnt = 2; multiple8_cnt--;) {
2908 LD_SB5(src, stride, src0, src1, src2, src3, src4);
2918 for (loop_cnt = 4; loop_cnt--;) {
2919 LD_SB4(src, stride, src0, src1, src2, src3);
2928 ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2,
2929 hz_out4, hz_out3, hz_out10_r, hz_out21_r, hz_out32_r,
2931 ILVL_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2,
2932 hz_out4, hz_out3, hz_out10_l, hz_out21_l, hz_out32_l,
2934 ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6,
2935 hz_out8, hz_out7, hz_out54_r, hz_out65_r, hz_out76_r,
2937 ILVL_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6,
2938 hz_out8, hz_out7, hz_out54_l, hz_out65_l, hz_out76_l,
2945 dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
2950 dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
2955 dst2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
2960 dst3 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
2982 const int32_t filt_const0 = 0xfffb0001;
2983 const int32_t filt_const1 = 0x140014;
2984 const int32_t filt_const2 = 0x1fffb;
2986 v16i8
src0,
src1, src2, src3, src4, mask0, mask1, mask2;
2987 v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
2988 v8i16 hz_out7, hz_out8, hz_out9, hz_out10, hz_out11, hz_out12;
2989 v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
2990 v8i16 hz_out65_r, hz_out76_r, hz_out87_r, hz_out89_r, hz_out910_r;
2991 v8i16 hz_out1110_r, hz_out1211_r, dst0, dst1, dst2, dst3;
2992 v8i16 hz_out10_l, hz_out21_l, hz_out32_l, hz_out43_l, hz_out54_l;
2993 v8i16 hz_out65_l, hz_out76_l, hz_out87_l, hz_out89_l, hz_out910_l;
2994 v8i16 hz_out1110_l, hz_out1211_l, filt0, filt1, filt2;
2997 filt0 = (v8i16) __msa_fill_w(filt_const0);
2998 filt1 = (v8i16) __msa_fill_w(filt_const1);
2999 filt2 = (v8i16) __msa_fill_w(filt_const2);
3003 src -= ((2 *
stride) + 2);
3004 LD_SB5(src, stride, src0, src1, src2, src3, src4);
3014 LD_SB4(src, stride, src0, src1, src2, src3);
3021 ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
3022 hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r);
3023 ILVL_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
3024 hz_out3, hz_out10_l, hz_out21_l, hz_out32_l, hz_out43_l);
3025 ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
3026 hz_out7, hz_out54_r, hz_out65_r, hz_out76_r, hz_out87_r);
3027 ILVL_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
3028 hz_out7, hz_out54_l, hz_out65_l, hz_out76_l, hz_out87_l);
3030 tmp0 =
AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, filt1,
3032 tmp1 =
AVC_DOT_SW3_SW(hz_out10_l, hz_out32_l, hz_out54_l, filt0, filt1,
3034 dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
3035 tmp0 =
AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, filt1,
3037 tmp1 =
AVC_DOT_SW3_SW(hz_out21_l, hz_out43_l, hz_out65_l, filt0, filt1,
3039 dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
3040 tmp0 =
AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, filt1,
3042 tmp1 =
AVC_DOT_SW3_SW(hz_out32_l, hz_out54_l, hz_out76_l, filt0, filt1,
3044 dst2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
3045 tmp0 =
AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, filt1,
3047 tmp1 =
AVC_DOT_SW3_SW(hz_out43_l, hz_out65_l, hz_out87_l, filt0, filt1,
3049 dst3 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
3055 LD_SB4(src, stride, src0, src1, src2, src3);
3061 ILVR_H4_SH(hz_out9, hz_out8, hz_out10, hz_out9, hz_out11, hz_out10,
3062 hz_out12, hz_out11, hz_out89_r, hz_out910_r, hz_out1110_r,
3064 ILVL_H4_SH(hz_out9, hz_out8, hz_out10, hz_out9, hz_out11, hz_out10,
3065 hz_out12, hz_out11, hz_out89_l, hz_out910_l, hz_out1110_l,
3067 tmp0 =
AVC_DOT_SW3_SW(hz_out54_r, hz_out76_r, hz_out89_r, filt0, filt1,
3069 tmp1 =
AVC_DOT_SW3_SW(hz_out54_l, hz_out76_l, hz_out89_l, filt0, filt1,
3071 dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
3072 tmp0 =
AVC_DOT_SW3_SW(hz_out65_r, hz_out87_r, hz_out910_r, filt0, filt1,
3074 tmp1 =
AVC_DOT_SW3_SW(hz_out65_l, hz_out87_l, hz_out910_l, filt0, filt1,
3076 dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
3077 tmp0 =
AVC_DOT_SW3_SW(hz_out76_r, hz_out89_r, hz_out1110_r, filt0, filt1,
3079 tmp1 =
AVC_DOT_SW3_SW(hz_out76_l, hz_out89_l, hz_out1110_l, filt0, filt1,
3081 dst2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
3082 tmp0 =
AVC_DOT_SW3_SW(hz_out87_r, hz_out910_r, hz_out1211_r, filt0, filt1,
3084 tmp1 =
AVC_DOT_SW3_SW(hz_out87_l, hz_out910_l, hz_out1211_l, filt0, filt1,
3086 dst3 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
3095 const int32_t filt_const0 = 0xfffb0001;
3096 const int32_t filt_const1 = 0x140014;
3097 const int32_t filt_const2 = 0x1fffb;
3099 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8;
3100 v16i8 mask0, mask1, mask2;
3101 v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
3102 v8i16 hz_out7, hz_out8, dst0, dst1, filt0, filt1, filt2;
3103 v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
3104 v8i16 hz_out65_r, hz_out76_r, hz_out87_r;
3109 filt0 = (v8i16) __msa_fill_w(filt_const0);
3110 filt1 = (v8i16) __msa_fill_w(filt_const1);
3111 filt2 = (v8i16) __msa_fill_w(filt_const2);
3113 src -= ((2 *
stride) + 2);
3115 LD_SB5(src, stride, src0, src1, src2, src3, src4);
3117 LD_SB4(src, stride, src5, src6, src7, src8);
3126 PCKOD_D2_SH(hz_out0, hz_out0, hz_out2, hz_out2, hz_out1, hz_out3);
3127 PCKOD_D2_SH(hz_out4, hz_out4, hz_out6, hz_out6, hz_out5, hz_out7);
3128 ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
3129 hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r);
3130 ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
3131 hz_out7, hz_out54_r, hz_out65_r, hz_out76_r, hz_out87_r);
3133 tmp0 =
AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, filt1,
3135 tmp1 =
AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, filt1,
3137 dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
3138 tmp0 =
AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, filt1,
3140 tmp1 =
AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, filt1,
3142 dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
3144 ST4x4_UB(res, res, 0, 1, 2, 3, dst, stride);
3151 v16u8 dst0, dst1, dst2, dst3;
3152 v16i8 out0, out1, out2, out3,
src0,
src1, src2, src3, src4, src5, src6;
3153 v16i8 mask0, mask1, mask2, mask3, mask4, mask5, src7, vec11;
3154 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
3155 v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
3156 v16i8 minus5b = __msa_ldi_b(-5);
3157 v16i8 plus20b = __msa_ldi_b(20);
3165 for (loop_cnt = 4; loop_cnt--;) {
3166 LD_SB2(src, 16, src0, src1);
3168 LD_SB2(src, 16, src2, src3);
3170 LD_SB2(src, 16, src4, src5);
3172 LD_SB2(src, 16, src6, src7);
3175 LD_UB4(dst, stride, dst0, dst1, dst2, dst3);
3177 VSHF_B2_SB(src0, src0, src0, src1, mask0, mask3, vec0, vec3);
3178 VSHF_B2_SB(src2, src2, src2, src3, mask0, mask3, vec6, vec9);
3179 VSHF_B2_SB(src0, src0, src0, src1, mask1, mask4, vec1, vec4);
3180 VSHF_B2_SB(src2, src2, src2, src3, mask1, mask4, vec7, vec10);
3181 VSHF_B2_SB(src0, src0, src0, src1, mask2, mask5, vec2, vec5);
3182 VSHF_B2_SB(src2, src2, src2, src3, mask2, mask5, vec8, vec11);
3183 HADD_SB4_SH(vec0, vec3, vec6, vec9, res0, res1, res2, res3);
3184 DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
3185 minus5b, res0, res1, res2, res3);
3186 DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
3187 plus20b, res0, res1, res2, res3);
3188 VSHF_B2_SB(src4, src4, src4, src5, mask0, mask3, vec0, vec3);
3189 VSHF_B2_SB(src6, src6, src6, src7, mask0, mask3, vec6, vec9);
3190 VSHF_B2_SB(src4, src4, src4, src5, mask1, mask4, vec1, vec4);
3191 VSHF_B2_SB(src6, src6, src6, src7, mask1, mask4, vec7, vec10);
3192 VSHF_B2_SB(src4, src4, src4, src5, mask2, mask5, vec2, vec5);
3193 VSHF_B2_SB(src6, src6, src6, src7, mask2, mask5, vec8, vec11);
3194 HADD_SB4_SH(vec0, vec3, vec6, vec9, res4, res5, res6, res7);
3195 DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
3196 minus5b, res4, res5, res6, res7);
3197 DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
3198 plus20b, res4, res5, res6, res7);
3199 SLDI_B2_SB(src1, src3, src0, src2, src0, src2, 2);
3200 SLDI_B2_SB(src5, src7, src4, src6, src4, src6, 2);
3207 out0 = __msa_aver_s_b(out0, src0);
3208 out1 = __msa_aver_s_b(out1, src2);
3209 out2 = __msa_aver_s_b(out2, src4);
3210 out3 = __msa_aver_s_b(out3, src6);
3214 ST_UB4(dst0, dst1, dst2, dst3, dst, stride);
3223 v16u8 dst0, dst1, dst2, dst3;
3224 v16i8 out0, out1, out2, out3,
src0,
src1, src2, src3, src4, src5, src6;
3225 v16i8 mask0, mask1, mask2, mask3, mask4, mask5, src7, vec11;
3226 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
3227 v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
3228 v16i8 minus5b = __msa_ldi_b(-5);
3229 v16i8 plus20b = __msa_ldi_b(20);
3237 for (loop_cnt = 4; loop_cnt--;) {
3238 LD_SB2(src, 16, src0, src1);
3240 LD_SB2(src, 16, src2, src3);
3242 LD_SB2(src, 16, src4, src5);
3244 LD_SB2(src, 16, src6, src7);
3247 LD_UB4(dst, stride, dst0, dst1, dst2, dst3);
3249 VSHF_B2_SB(src0, src0, src0, src1, mask0, mask3, vec0, vec3);
3250 VSHF_B2_SB(src2, src2, src2, src3, mask0, mask3, vec6, vec9);
3251 VSHF_B2_SB(src0, src0, src0, src1, mask1, mask4, vec1, vec4);
3252 VSHF_B2_SB(src2, src2, src2, src3, mask1, mask4, vec7, vec10);
3253 VSHF_B2_SB(src0, src0, src0, src1, mask2, mask5, vec2, vec5);
3254 VSHF_B2_SB(src2, src2, src2, src3, mask2, mask5, vec8, vec11);
3255 HADD_SB4_SH(vec0, vec3, vec6, vec9, res0, res1, res2, res3);
3256 DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
3257 minus5b, res0, res1, res2, res3);
3258 DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
3259 plus20b, res0, res1, res2, res3);
3260 VSHF_B2_SB(src4, src4, src4, src5, mask0, mask3, vec0, vec3);
3261 VSHF_B2_SB(src6, src6, src6, src7, mask0, mask3, vec6, vec9);
3262 VSHF_B2_SB(src4, src4, src4, src5, mask1, mask4, vec1, vec4);
3263 VSHF_B2_SB(src6, src6, src6, src7, mask1, mask4, vec7, vec10);
3264 VSHF_B2_SB(src4, src4, src4, src5, mask2, mask5, vec2, vec5);
3265 VSHF_B2_SB(src6, src6, src6, src7, mask2, mask5, vec8, vec11);
3266 HADD_SB4_SH(vec0, vec3, vec6, vec9, res4, res5, res6, res7);
3267 DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
3268 minus5b, res4, res5, res6, res7);
3269 DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
3270 plus20b, res4, res5, res6, res7);
3271 SLDI_B2_SB(src1, src3, src0, src2, src0, src2, 3);
3272 SLDI_B2_SB(src5, src7, src4, src6, src4, src6, 3);
3279 out0 = __msa_aver_s_b(out0, src0);
3280 out1 = __msa_aver_s_b(out1, src2);
3281 out2 = __msa_aver_s_b(out2, src4);
3282 out3 = __msa_aver_s_b(out3, src6);
3286 ST_UB4(dst0, dst1, dst2, dst3, dst, stride);
/*
 * Interior of an 8-row horizontal filter routine.  The function signature,
 * the mask0..mask2 loads and the statements that produce tmp0..tmp3 and fill
 * dst0..dst3 are outside this excerpt.
 * Visible flow: load 8 rows from (src - 2), feed three shuffled views of each
 * row (mask0, mask1, mask2) into res0..res7 with the weights -5 and +20,
 * average with the source rows slid by 2 bytes, then read 8 rows of dst and
 * store the result.
 * NOTE(review): VSHF_*, HADD_*, DPADD_*, SLDI_*, LD4 and ST8x8_UB are macros
 * defined elsewhere; they are described here only by how they are called.
 */
3294 uint64_t tp0, tp1, tp2, tp3;
3295 v16u8 dst0 = { 0 }, dst1 = { 0 }, dst2 = { 0 }, dst3 = { 0 };
3296 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, mask0, mask1, mask2;
3297 v16i8 tmp0, tmp1, tmp2, tmp3, vec11;
3298 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
3299 v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
/* Byte-replicated multipliers used by the two DPADD steps. */
3300 v16i8 minus5b = __msa_ldi_b(-5);
3301 v16i8 plus20b = __msa_ldi_b(20);
/* Rows are read starting 2 bytes before src (presumably for the left taps). */
3304 LD_SB8(src - 2, stride, src0, src1, src2, src3, src4, src5, src6, src7);
/* Rows 0-3: mask0 view seeds res0..res3, mask1 view is accumulated with -5,
 * mask2 view is accumulated with +20. */
3306 VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
3307 VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
3308 HADD_SB4_SH(vec0, vec1, vec2, vec3, res0, res1, res2, res3);
3309 VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec5);
3310 VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec7);
3311 DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
3312 res0, res1, res2, res3);
3313 VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec8, vec9);
3314 VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec10, vec11);
3315 DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b, plus20b,
3316 res0, res1, res2, res3);
/* Rows 4-7: same three steps, accumulating into res4..res7. */
3317 VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0, vec1);
3318 VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec2, vec3);
3319 HADD_SB4_SH(vec0, vec1, vec2, vec3, res4, res5, res6, res7);
3320 VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec4, vec5);
3321 VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec6, vec7);
3322 DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
3323 res4, res5, res6, res7);
3324 VSHF_B2_SB(src4, src4, src5, src5, mask2, mask2, vec8, vec9);
3325 VSHF_B2_SB(src6, src6, src7, src7, mask2, mask2, vec10, vec11);
3326 DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b, plus20b,
3327 res4, res5, res6, res7);
/* Slide every source row by 2 bytes; the slid rows feed the average below.
 * NOTE(review): src0/src1/src4/src5 are presumably repacked on lines that
 * are not part of this excerpt before being averaged. */
3328 SLDI_B2_SB(src0, src1, src0, src1, src0, src1, 2);
3329 SLDI_B2_SB(src2, src3, src2, src3, src2, src3, 2);
3330 SLDI_B2_SB(src4, src5, src4, src5, src4, src5, 2);
3331 SLDI_B2_SB(src6, src7, src6, src7, src6, src7, 2);
/* Signed byte-wise average of the filtered data with the source samples. */
3340 tmp0 = __msa_aver_s_b(tmp0, src0);
3341 tmp1 = __msa_aver_s_b(tmp1, src1);
3342 tmp2 = __msa_aver_s_b(tmp2, src4);
3343 tmp3 = __msa_aver_s_b(tmp3, src5);
/* Four 64-bit loads from dst rows 0-3, then four more from rows 4-7. */
3345 LD4(dst, stride, tp0, tp1, tp2, tp3);
3348 LD4(dst + 4 * stride, stride, tp0, tp1, tp2, tp3);
3353 ST8x8_UB(dst0, dst1, dst2, dst3, dst, stride);
/*
 * Interior of an 8-row horizontal filter routine.  The function signature,
 * the mask0..mask2 loads and the statements that produce tmp0..tmp3 and fill
 * dst0..dst3 are outside this excerpt.
 * Visible flow: load 8 rows from (src - 2), feed three shuffled views of each
 * row (mask0, mask1, mask2) into res0..res7 with the weights -5 and +20,
 * average with the source rows slid by 3 bytes, then read 8 rows of dst and
 * store the result.
 * NOTE(review): VSHF_*, HADD_*, DPADD_*, SLDI_*, LD4 and ST8x8_UB are macros
 * defined elsewhere; they are described here only by how they are called.
 */
3359 uint64_t tp0, tp1, tp2, tp3;
3360 v16u8 dst0 = { 0 }, dst1 = { 0 }, dst2 = { 0 }, dst3 = { 0 };
3361 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, mask0, mask1, mask2;
3362 v16i8 tmp0, tmp1, tmp2, tmp3, vec11;
3363 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
3364 v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
/* Byte-replicated multipliers used by the two DPADD steps. */
3365 v16i8 minus5b = __msa_ldi_b(-5);
3366 v16i8 plus20b = __msa_ldi_b(20);
/* Rows are read starting 2 bytes before src (presumably for the left taps). */
3369 LD_SB8(src - 2, stride, src0, src1, src2, src3, src4, src5, src6, src7);
/* Rows 0-3: mask0 view seeds res0..res3, mask1 view is accumulated with -5,
 * mask2 view is accumulated with +20. */
3371 VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
3372 VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
3373 HADD_SB4_SH(vec0, vec1, vec2, vec3, res0, res1, res2, res3);
3374 VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec5);
3375 VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec7);
3376 DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
3377 res0, res1, res2, res3);
3378 VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec8, vec9);
3379 VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec10, vec11);
3380 DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b, plus20b,
3381 res0, res1, res2, res3);
/* Rows 4-7: same three steps, accumulating into res4..res7. */
3382 VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0, vec1);
3383 VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec2, vec3);
3384 HADD_SB4_SH(vec0, vec1, vec2, vec3, res4, res5, res6, res7);
3385 VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec4, vec5);
3386 VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec6, vec7);
3387 DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
3388 res4, res5, res6, res7);
3389 VSHF_B2_SB(src4, src4, src5, src5, mask2, mask2, vec8, vec9);
3390 VSHF_B2_SB(src6, src6, src7, src7, mask2, mask2, vec10, vec11);
3391 DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b, plus20b,
3392 res4, res5, res6, res7);
/* Slide every source row by 3 bytes; the slid rows feed the average below.
 * NOTE(review): src0/src1/src4/src5 are presumably repacked on lines that
 * are not part of this excerpt before being averaged. */
3393 SLDI_B2_SB(src0, src1, src0, src1, src0, src1, 3);
3394 SLDI_B2_SB(src2, src3, src2, src3, src2, src3, 3);
3395 SLDI_B2_SB(src4, src5, src4, src5, src4, src5, 3);
3396 SLDI_B2_SB(src6, src7, src6, src7, src6, src7, 3);
/* Signed byte-wise average of the filtered data with the source samples. */
3405 tmp0 = __msa_aver_s_b(tmp0, src0);
3406 tmp1 = __msa_aver_s_b(tmp1, src1);
3407 tmp2 = __msa_aver_s_b(tmp2, src4);
3408 tmp3 = __msa_aver_s_b(tmp3, src5);
/* Four 64-bit loads from dst rows 0-3, then four more from rows 4-7. */
3410 LD4(dst, stride, tp0, tp1, tp2, tp3);
3413 LD4(dst + 4 * stride, stride, tp0, tp1, tp2, tp3);
3418 ST8x8_UB(dst0, dst1, dst2, dst3, dst, stride);
/*
 * Interior of a 4-row horizontal filter routine.  The function signature, the
 * declarations of out0/out1/dst0, the mask loads and the step that seeds
 * out0/out1 are outside this excerpt.
 * Visible flow: load 4 rows from (src - 2), accumulate shuffled row pairs into
 * out0/out1 with the weights -5 and +20, pack to bytes, average with source
 * samples taken 2 bytes into each row, then average with four 32-bit rows
 * read from dst and store 4x4.
 * NOTE(review): VSHF_*, DPADD_*, SLDI_*, LW4 and ST4x4_UB are macros defined
 * elsewhere; they are described here only by how they are called.
 */
3424 uint32_t tp0, tp1, tp2, tp3;
3426 v16i8
src0,
src1, src2, src3, res, vec0, vec1, vec2, vec3, vec4, vec5;
3427 v16i8 mask0, mask1, mask2;
3429 v16i8 minus5b = __msa_ldi_b(-5);
3430 v16i8 plus20b = __msa_ldi_b(20);
3433 LD_SB4(src - 2, stride, src0, src1, src2, src3);
/* Each shuffle takes two rows at once (rows 0+1 and rows 2+3). */
3435 VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0, vec1);
3437 VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2, vec3);
3438 DPADD_SB2_SH(vec2, vec3, minus5b, minus5b, out0, out1);
3439 VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4, vec5);
3440 DPADD_SB2_SH(vec4, vec5, plus20b, plus20b, out0, out1);
/* Keep the even (low) byte of every 16-bit lane of out0 and out1. */
3443 res = __msa_pckev_b((v16i8) out1, (v16i8) out0);
3444 SLDI_B2_SB(src0, src1, src0, src1, src0, src1, 2);
3445 SLDI_B2_SB(src2, src3, src2, src3, src2, src3, 2);
/* Gather word 0 of each slid row into a single vector: rows 0 and 1 end up
 * in the low doubleword of src0, rows 2 and 3 in the high doubleword. */
3446 src0 = (v16i8) __msa_insve_w((v4i32)
src0, 1, (v4i32) src1);
3447 src1 = (v16i8) __msa_insve_w((v4i32) src2, 1, (v4i32) src3);
3448 src0 = (v16i8) __msa_insve_d((v2i64)
src0, 1, (v2i64) src1);
3449 res = __msa_aver_s_b(res, src0);
/* XOR with 128 toggles the top bit of every byte.
 * NOTE(review): presumably undoes a signed bias applied on lines not shown. */
3450 res = (v16i8) __msa_xori_b((v16u8) res, 128);
3451 LW4(dst, stride, tp0, tp1, tp2, tp3);
/* Unsigned byte-wise average with the data already in dst0. */
3453 dst0 = __msa_aver_u_b((v16u8) res, dst0);
3454 ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dst, stride);
/*
 * Interior of a 4-row horizontal filter routine.  The function signature, the
 * declarations of out0/out1/dst0, the mask loads and the step that seeds
 * out0/out1 are outside this excerpt.
 * Visible flow: load 4 rows from (src - 2), accumulate shuffled row pairs into
 * out0/out1 with the weights -5 and +20, pack to bytes, average with source
 * samples taken 3 bytes into each row, then average with four 32-bit rows
 * read from dst and store 4x4.
 * NOTE(review): VSHF_*, DPADD_*, SLDI_*, LW4 and ST4x4_UB are macros defined
 * elsewhere; they are described here only by how they are called.
 */
3460 uint32_t tp0, tp1, tp2, tp3;
3462 v16i8
src0,
src1, src2, src3, res, vec0, vec1, vec2, vec3, vec4, vec5;
3463 v16i8 mask0, mask1, mask2;
3465 v16i8 minus5b = __msa_ldi_b(-5);
3466 v16i8 plus20b = __msa_ldi_b(20);
3469 LD_SB4(src - 2, stride, src0, src1, src2, src3);
/* Each shuffle takes two rows at once (rows 0+1 and rows 2+3). */
3471 VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0, vec1);
3473 VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2, vec3);
3474 DPADD_SB2_SH(vec2, vec3, minus5b, minus5b, out0, out1);
3475 VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4, vec5);
3476 DPADD_SB2_SH(vec4, vec5, plus20b, plus20b, out0, out1);
/* Keep the even (low) byte of every 16-bit lane of out0 and out1. */
3479 res = __msa_pckev_b((v16i8) out1, (v16i8) out0);
3480 SLDI_B2_SB(src0, src1, src0, src1, src0, src1, 3);
3481 SLDI_B2_SB(src2, src3, src2, src3, src2, src3, 3);
/* Gather word 0 of each slid row into a single vector: rows 0 and 1 end up
 * in the low doubleword of src0, rows 2 and 3 in the high doubleword. */
3482 src0 = (v16i8) __msa_insve_w((v4i32)
src0, 1, (v4i32) src1);
3483 src1 = (v16i8) __msa_insve_w((v4i32) src2, 1, (v4i32) src3);
3484 src0 = (v16i8) __msa_insve_d((v2i64)
src0, 1, (v2i64) src1);
3485 res = __msa_aver_s_b(res, src0);
/* XOR with 128 toggles the top bit of every byte.
 * NOTE(review): presumably undoes a signed bias applied on lines not shown. */
3486 res = (v16i8) __msa_xori_b((v16u8) res, 128);
3487 LW4(dst, stride, tp0, tp1, tp2, tp3);
/* Unsigned byte-wise average with the data already in dst0. */
3489 dst0 = __msa_aver_u_b((v16u8) res, dst0);
3490 ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dst, stride);
/*
 * Interior of a horizontal filter routine that works on 16-byte-wide rows.
 * The function signature, the declarations of vec11 and loop_cnt, the mask
 * loads, the src/dst pointer updates, the tail of the PCKEV_B4_SB call and the
 * closing brace of the loop are outside this excerpt.
 * Visible flow per iteration: load four rows as two vectors each, build the
 * mask0/mask1/mask2 views, accumulate them into res0..res7 with the weights
 * -5 and +20, pack, and store four rows to dst after reading four rows of dst.
 * NOTE(review): LD_*, VSHF_*, HADD_*, DPADD_*, PCKEV_* and ST_UB4 are macros
 * defined elsewhere; they are described here only by how they are called.
 */
3497 v16u8 dst0, dst1, dst2, dst3;
3498 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, mask0, mask1, mask2;
3499 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
3501 v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
3502 v16i8 minus5b = __msa_ldi_b(-5);
3503 v16i8 plus20b = __msa_ldi_b(20);
/* Counts down from 4; the body runs four times. */
3508 for (loop_cnt = 4; loop_cnt--;) {
/* Each row is fetched as a pair of vectors with 8 passed as the second
 * argument (presumably the byte distance between the two loads). */
3509 LD_SB2(src, 8, src0, src1);
3511 LD_SB2(src, 8, src2, src3);
3513 LD_SB2(src, 8, src4, src5);
3515 LD_SB2(src, 8, src6, src7);
3518 LD_UB4(dst, stride, dst0, dst1, dst2, dst3);
/* First two rows (src0..src3) accumulate into res0..res3. */
3520 VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec3);
3521 VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec6, vec9);
3522 VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec1, vec4);
3523 VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec7, vec10);
3524 VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec2, vec5);
3525 VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec8, vec11);
3526 HADD_SB4_SH(vec0, vec3, vec6, vec9, res0, res1, res2, res3);
3527 DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
3528 minus5b, res0, res1, res2, res3);
3529 DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
3530 plus20b, res0, res1, res2, res3);
/* Last two rows (src4..src7) accumulate into res4..res7. */
3531 VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0, vec3);
3532 VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec6, vec9);
3533 VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec1, vec4);
3534 VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec7, vec10);
3535 VSHF_B2_SB(src4, src4, src5, src5, mask2, mask2, vec2, vec5);
3536 VSHF_B2_SB(src6, src6, src7, src7, mask2, mask2, vec8, vec11);
3537 HADD_SB4_SH(vec0, vec3, vec6, vec9, res4, res5, res6, res7);
3538 DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
3539 minus5b, res4, res5, res6, res7);
3540 DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
3541 plus20b, res4, res5, res6, res7);
/* Pairs of 16-bit results are packed back to bytes (call continues on a
 * line that is not part of this excerpt). */
3546 PCKEV_B4_SB(res1, res0, res3, res2, res5, res4, res7, res6, vec0, vec1,
3551 ST_UB4(dst0, dst1, dst2, dst3, dst, stride);
/*
 * Interior of an 8-row horizontal filter routine.  The function signature,
 * the declaration of vec11, the mask loads and the statements that turn
 * res0..res7 into out0..out7 and combine them with dst are outside this
 * excerpt.
 * Visible flow: load 8 rows from (src - 2), accumulate the mask0/mask1/mask2
 * views of each row into res0..res7 with the weights -5 and +20, read 8 rows
 * of dst as 64-bit values, and store out0/out1/out4/out5.
 * NOTE(review): LD_*, VSHF_*, HADD_*, DPADD_*, LD4 and ST8x8_UB are macros
 * defined elsewhere; they are described here only by how they are called.
 */
3559 uint64_t tp0, tp1, tp2, tp3;
3560 v16u8 out0, out1, out2 = { 0 }, out3 = { 0 };
3561 v16u8 out4, out5, out6 = { 0 }, out7 = { 0 };
3562 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, mask0, mask1, mask2;
3563 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
3565 v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
3566 v16i8 minus5b = __msa_ldi_b(-5);
3567 v16i8 plus20b = __msa_ldi_b(20);
/* Rows are read starting 2 bytes before src (presumably for the left taps). */
3571 LD_SB8(src - 2, stride, src0, src1, src2, src3, src4, src5, src6, src7);
/* Rows 0-3: mask0 view seeds res0..res3, then -5 and +20 weighted views. */
3573 VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
3574 VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
3575 HADD_SB4_SH(vec0, vec1, vec2, vec3, res0, res1, res2, res3);
3576 VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec5);
3577 VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec7);
3578 DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
3579 res0, res1, res2, res3);
3580 VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec8, vec9);
3581 VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec10, vec11);
3582 DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b, plus20b,
3583 res0, res1, res2, res3);
/* Rows 4-7: same three steps, accumulating into res4..res7. */
3584 VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0, vec1);
3585 VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec2, vec3);
3586 HADD_SB4_SH(vec0, vec1, vec2, vec3, res4, res5, res6, res7);
3587 VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec4, vec5);
3588 VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec6, vec7);
3589 DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
3590 res4, res5, res6, res7);
3591 VSHF_B2_SB(src4, src4, src5, src5, mask2, mask2, vec8, vec9);
3592 VSHF_B2_SB(src6, src6, src7, src7, mask2, mask2, vec10, vec11);
3593 DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b, plus20b,
3594 res4, res5, res6, res7);
/* Four 64-bit loads from dst rows 0-3, then four more from rows 4-7. */
3603 LD4(dst, stride, tp0, tp1, tp2, tp3);
3606 LD4(dst + 4 * stride, stride, tp0, tp1, tp2, tp3);
3611 ST8x8_UB(out0, out1, out4, out5, dst, stride);
/*
 * Interior of a 4-row horizontal filter routine.  The function signature, the
 * declarations of res0/res1, the mask loads, the step that seeds res0/res1
 * and the statements that produce `res` and fill dst0 are outside this
 * excerpt.
 * Visible flow: load 4 rows from (src - 2), accumulate shuffled row pairs into
 * res0/res1 with the weights -5 and +20, read four 32-bit rows from dst,
 * average `res` with dst0 and store 4x4.
 * NOTE(review): LD_SB4, VSHF_*, DPADD_*, LW4 and ST4x4_UB are macros defined
 * elsewhere; they are described here only by how they are called.
 */
3617 uint32_t tp0, tp1, tp2, tp3;
3618 v16u8 res, dst0 = { 0 };
3619 v16i8
src0,
src1, src2, src3, vec0, vec1, vec2, vec3, vec4, vec5;
3620 v16i8 mask0, mask1, mask2;
3622 v16i8 minus5b = __msa_ldi_b(-5);
3623 v16i8 plus20b = __msa_ldi_b(20);
3626 LD_SB4(src - 2, stride, src0, src1, src2, src3);
/* Each shuffle takes two rows at once (rows 0+1 and rows 2+3). */
3628 VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0, vec1);
3630 VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2, vec3);
3631 DPADD_SB2_SH(vec2, vec3, minus5b, minus5b, res0, res1);
3632 VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4, vec5);
3633 DPADD_SB2_SH(vec4, vec5, plus20b, plus20b, res0, res1);
3637 LW4(dst, stride, tp0, tp1, tp2, tp3);
/* Unsigned byte-wise average of the filtered result with dst0. */
3639 res = __msa_aver_u_b(res, dst0);
3640 ST4x4_UB(res, res, 0, 1, 2, 3, dst, stride);
/*
 * Interior of a vertical filter routine that works on 16-byte-wide rows.  The
 * function signature, the declaration of loop_cnt, the src/dst pointer
 * updates, the tails of the first ILVR_B4_SB/ILVL_B4_SB calls, the step that
 * merges res0..res3 into dst0..dst3 and the closing brace of the loop are
 * outside this excerpt.
 * Visible flow: rewind src by two rows, load 5 rows, interleave neighbouring
 * rows, then in each of four iterations load 4 more rows, form 8 three-term
 * dot products (right and left halves of 4 output rows), pack them to bytes,
 * average with src2..src5, read four rows of dst and store four rows.
 */
/* Each constant packs two signed byte taps: 0x01/0xfb (1 and -5), 0x14/0x14
 * (20 and 20), 0xfb/0x01 (-5 and 1).  0xfb01 exceeds INT16_MAX, so its
 * conversion to int16_t is implementation-defined. */
3647 int16_t filt_const0 = 0xfb01;
3648 int16_t filt_const1 = 0x1414;
3649 int16_t filt_const2 = 0x1fb;
3650 v16u8 res0, res1, res2, res3, dst0, dst1, dst2, dst3;
3651 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8;
3652 v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
3653 v16i8 src87_r, src10_l, src32_l, src54_l, src76_l, src21_l, src43_l;
3654 v16i8 src65_l, src87_l, filt0, filt1, filt2;
3655 v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
/* Replicate each 16-bit tap pair across a whole vector. */
3657 filt0 = (v16i8) __msa_fill_h(filt_const0);
3658 filt1 = (v16i8) __msa_fill_h(filt_const1);
3659 filt2 = (v16i8) __msa_fill_h(filt_const2);
/* Start two rows above the position passed in. */
3661 src -= (stride * 2);
3663 LD_SB5(src, stride, src0, src1, src2, src3, src4);
/* srcNM_r / srcNM_l pair row N with row M (right and left halves). */
3667 ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
3669 ILVL_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_l, src21_l,
/* Counts down from 4; the body runs four times. */
3672 for (loop_cnt = 4; loop_cnt--;) {
3673 LD_SB4(src, stride, src5, src6, src7, src8);
3677 ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r,
3678 src65_r, src76_r, src87_r);
3679 ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_l,
3680 src65_l, src76_l, src87_l);
/* Each output is dot(pair0, filt0) + dot(pair1, filt1) + dot(pair2, filt2),
 * i.e. six consecutive rows contribute to one output row. */
3681 out0_r =
AVC_DOT_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
3682 out1_r =
AVC_DOT_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
3683 out2_r =
AVC_DOT_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
3684 out3_r =
AVC_DOT_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
3685 out0_l =
AVC_DOT_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2);
3686 out1_l =
AVC_DOT_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2);
3687 out2_l =
AVC_DOT_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2);
3688 out3_l =
AVC_DOT_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2);
3690 SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
3692 SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
3693 PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
3694 out3_r, res0, res1, res2, res3);
/* Signed byte-wise average of each filtered row with source rows
 * src2..src5. */
3695 res0 = (v16u8) __msa_aver_s_b((v16i8) res0, src2);
3696 res1 = (v16u8) __msa_aver_s_b((v16i8) res1, src3);
3697 res2 = (v16u8) __msa_aver_s_b((v16i8) res2, src4);
3698 res3 = (v16u8) __msa_aver_s_b((v16i8) res3, src5);
3699 LD_UB4(dst, stride, dst0, dst1, dst2, dst3);
3703 ST_UB4(dst0, dst1, dst2, dst3, dst, stride);
/*
 * Interior of a vertical filter routine that works on 16-byte-wide rows.  The
 * function signature, the declaration of loop_cnt, the src/dst pointer
 * updates, the tails of the first ILVR_B4_SB/ILVL_B4_SB calls, the step that
 * merges res0..res3 into dst0..dst3 and the closing brace of the loop are
 * outside this excerpt.
 * Visible flow: rewind src by two rows, load 5 rows, interleave neighbouring
 * rows, then in each of four iterations load 4 more rows, form 8 three-term
 * dot products (right and left halves of 4 output rows), pack them to bytes,
 * average with src3..src6, read four rows of dst and store four rows.
 */
/* Each constant packs two signed byte taps: 0x01/0xfb (1 and -5), 0x14/0x14
 * (20 and 20), 0xfb/0x01 (-5 and 1).  0xfb01 exceeds INT16_MAX, so its
 * conversion to int16_t is implementation-defined. */
3724 int16_t filt_const0 = 0xfb01;
3725 int16_t filt_const1 = 0x1414;
3726 int16_t filt_const2 = 0x1fb;
3727 v16u8 res0, res1, res2, res3, dst0, dst1, dst2, dst3;
3728 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8;
3729 v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
3730 v16i8 src87_r, src10_l, src32_l, src54_l, src76_l, src21_l, src43_l;
3731 v16i8 src65_l, src87_l, filt0, filt1, filt2;
3732 v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
/* Replicate each 16-bit tap pair across a whole vector. */
3734 filt0 = (v16i8) __msa_fill_h(filt_const0);
3735 filt1 = (v16i8) __msa_fill_h(filt_const1);
3736 filt2 = (v16i8) __msa_fill_h(filt_const2);
/* Start two rows above the position passed in. */
3738 src -= (stride * 2);
3740 LD_SB5(src, stride, src0, src1, src2, src3, src4);
/* srcNM_r / srcNM_l pair row N with row M (right and left halves). */
3744 ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
3746 ILVL_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_l, src21_l,
/* Counts down from 4; the body runs four times. */
3749 for (loop_cnt = 4; loop_cnt--;) {
3750 LD_SB4(src, stride, src5, src6, src7, src8);
3754 ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r,
3755 src65_r, src76_r, src87_r);
3756 ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_l,
3757 src65_l, src76_l, src87_l);
/* Each output is dot(pair0, filt0) + dot(pair1, filt1) + dot(pair2, filt2),
 * i.e. six consecutive rows contribute to one output row. */
3758 out0_r =
AVC_DOT_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
3759 out1_r =
AVC_DOT_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
3760 out2_r =
AVC_DOT_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
3761 out3_r =
AVC_DOT_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
3762 out0_l =
AVC_DOT_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2);
3763 out1_l =
AVC_DOT_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2);
3764 out2_l =
AVC_DOT_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2);
3765 out3_l =
AVC_DOT_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2);
3767 SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
3769 SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
3770 PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
3771 out3_r, res0, res1, res2, res3);
/* Signed byte-wise average of each filtered row with source rows
 * src3..src6. */
3772 res0 = (v16u8) __msa_aver_s_b((v16i8) res0, src3);
3773 res1 = (v16u8) __msa_aver_s_b((v16i8) res1, src4);
3774 res2 = (v16u8) __msa_aver_s_b((v16i8) res2, src5);
3775 res3 = (v16u8) __msa_aver_s_b((v16i8) res3, src6);
3776 LD_UB4(dst, stride, dst0, dst1, dst2, dst3);
3780 ST_UB4(dst0, dst1, dst2, dst3, dst, stride);
/*
 * Interior of an 8-row vertical filter routine.  The function signature, the
 * src pointer updates, the tail of the first ILVR_B4_SB call, the statements
 * that set tmp0/tmp1 and fill dst0..dst3, and the tail of the AVER_UB4_UB
 * call are outside this excerpt.
 * Visible flow: rewind src by two rows, load 5 rows then 8 more, interleave
 * neighbouring rows, form 8 three-term dot products (one per output row),
 * pack to bytes, average with tmp0..tmp3 (packed source rows), average with
 * dst and store 8 rows.
 */
3799 uint64_t tp0, tp1, tp2, tp3;
/* Each constant packs two signed byte taps: 0x01/0xfb (1 and -5), 0x14/0x14
 * (20 and 20), 0xfb/0x01 (-5 and 1).  0xfb01 exceeds INT16_MAX, so its
 * conversion to int16_t is implementation-defined. */
3800 const int16_t filt_const0 = 0xfb01;
3801 const int16_t filt_const1 = 0x1414;
3802 const int16_t filt_const2 = 0x1fb;
3803 v16u8 dst0 = { 0 }, dst1 = { 0 }, dst2 = { 0 }, dst3 = { 0 };
3804 v16i8
src0,
src1, src2, src3, src4, src7, src8, src9, src10, src11, src12;
3805 v16i8 src13, src14, tmp0, tmp1, tmp2, tmp3, src109_r;
3806 v16i8 src10_r, src32_r, src76_r, src98_r, src21_r, src43_r, src87_r;
3807 v16i8 filt0, filt1, filt2, out0, out1, out2, out3;
3808 v8i16 out0_r, out1_r, out2_r, out3_r, out4_r, out5_r, out6_r, out7_r;
/* Replicate each 16-bit tap pair across a whole vector. */
3810 filt0 = (v16i8) __msa_fill_h(filt_const0);
3811 filt1 = (v16i8) __msa_fill_h(filt_const1);
3812 filt2 = (v16i8) __msa_fill_h(filt_const2);
/* Start two rows above the position passed in. */
3814 src -= (stride * 2);
3816 LD_SB5(src, stride, src0, src1, src2, src3, src4);
3820 ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
/* The row numbering skips 5 and 6: src7 is paired directly with src4. */
3822 LD_SB8(src, stride, src7, src8, src9, src10, src11, src12, src13, src14);
3823 XORI_B8_128_SB(src7, src8, src9, src10, src11, src12, src13, src14);
3824 ILVR_B4_SB(src7, src4, src8, src7, src9, src8, src10, src9, src76_r,
3825 src87_r, src98_r, src109_r);
/* Output rows 0-3. */
3826 out0_r =
AVC_DOT_SH3_SH(src10_r, src32_r, src76_r, filt0, filt1, filt2);
3827 out1_r =
AVC_DOT_SH3_SH(src21_r, src43_r, src87_r, filt0, filt1, filt2);
3828 out2_r =
AVC_DOT_SH3_SH(src32_r, src76_r, src98_r, filt0, filt1, filt2);
3829 out3_r =
AVC_DOT_SH3_SH(src43_r, src87_r, src109_r, filt0, filt1, filt2);
/* src10_r..src43_r are reused here to hold the pairs built from
 * src10..src14, so the names no longer match their contents. */
3831 ILVR_B4_SB(src11, src10, src12, src11, src13, src12, src14, src13, src10_r,
3832 src21_r, src32_r, src43_r);
/* Output rows 4-7. */
3833 out4_r =
AVC_DOT_SH3_SH(src76_r, src98_r, src10_r, filt0, filt1, filt2);
3834 out5_r =
AVC_DOT_SH3_SH(src87_r, src109_r, src21_r, filt0, filt1, filt2);
3835 out6_r =
AVC_DOT_SH3_SH(src98_r, src10_r, src32_r, filt0, filt1, filt2);
3836 out7_r =
AVC_DOT_SH3_SH(src109_r, src21_r, src43_r, filt0, filt1, filt2);
/* Pack source rows src8..src11 two per vector for the average below. */
3837 PCKEV_D2_SB(src9, src8, src11, src10, tmp2, tmp3);
3840 SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
3841 SAT_SH4_SH(out4_r, out5_r, out6_r, out7_r, 7);
/* Four 64-bit loads from dst rows 0-3, then four more from rows 4-7. */
3843 LD4(dst, stride, tp0, tp1, tp2, tp3);
3846 LD4(dst + 4 * stride, stride, tp0, tp1, tp2, tp3);
3850 PCKEV_B2_SB(out1_r, out0_r, out3_r, out2_r, out0, out1);
3851 PCKEV_B2_SB(out5_r, out4_r, out7_r, out6_r, out2, out3);
/* Signed byte-wise average of the filtered rows with the packed source rows. */
3852 out0 = __msa_aver_s_b(out0, tmp0);
3853 out1 = __msa_aver_s_b(out1, tmp1);
3854 out2 = __msa_aver_s_b(out2, tmp2);
3855 out3 = __msa_aver_s_b(out3, tmp3);
3857 AVER_UB4_UB(out0, dst0, out1, dst1, out2, dst2, out3, dst3, dst0, dst1,
3859 ST8x8_UB(dst0, dst1, dst2, dst3, dst, stride);
/*
 * Interior of an 8-row vertical filter routine.  The function signature, the
 * src pointer updates, the tail of the first ILVR_B4_SB call, the statements
 * that set tmp0/tmp1 and fill dst0..dst3, and the tail of the AVER_UB4_UB
 * call are outside this excerpt.
 * Visible flow: rewind src by two rows, load 5 rows then 8 more, interleave
 * neighbouring rows, form 8 three-term dot products (one per output row),
 * pack to bytes, average with tmp0..tmp3 (packed source rows), average with
 * dst and store 8 rows.
 */
3865 uint64_t tp0, tp1, tp2, tp3;
/* Each constant packs two signed byte taps: 0x01/0xfb (1 and -5), 0x14/0x14
 * (20 and 20), 0xfb/0x01 (-5 and 1).  0xfb01 exceeds INT16_MAX, so its
 * conversion to int16_t is implementation-defined. */
3866 const int16_t filt_const0 = 0xfb01;
3867 const int16_t filt_const1 = 0x1414;
3868 const int16_t filt_const2 = 0x1fb;
3869 v16u8 dst0 = { 0 }, dst1 = { 0 }, dst2 = { 0 }, dst3 = { 0 };
3870 v16i8
src0,
src1, src2, src3, src4, src7, src8, src9, src10, src11, src12;
3871 v16i8 src13, src14, tmp0, tmp1, tmp2, tmp3, src109_r;
3872 v16i8 src10_r, src32_r, src76_r, src98_r, src21_r, src43_r, src87_r;
3873 v16i8 filt0, filt1, filt2, out0, out1, out2, out3;
3874 v8i16 out0_r, out1_r, out2_r, out3_r, out4_r, out5_r, out6_r, out7_r;
/* Replicate each 16-bit tap pair across a whole vector. */
3876 filt0 = (v16i8) __msa_fill_h(filt_const0);
3877 filt1 = (v16i8) __msa_fill_h(filt_const1);
3878 filt2 = (v16i8) __msa_fill_h(filt_const2);
/* Start two rows above the position passed in. */
3880 src -= (stride * 2);
3882 LD_SB5(src, stride, src0, src1, src2, src3, src4);
3886 ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
/* The row numbering skips 5 and 6: src7 is paired directly with src4. */
3888 LD_SB8(src, stride, src7, src8, src9, src10, src11, src12, src13, src14);
3889 XORI_B8_128_SB(src7, src8, src9, src10, src11, src12, src13, src14);
3890 ILVR_B4_SB(src7, src4, src8, src7, src9, src8, src10, src9, src76_r,
3891 src87_r, src98_r, src109_r);
/* Output rows 0-3. */
3892 out0_r =
AVC_DOT_SH3_SH(src10_r, src32_r, src76_r, filt0, filt1, filt2);
3893 out1_r =
AVC_DOT_SH3_SH(src21_r, src43_r, src87_r, filt0, filt1, filt2);
3894 out2_r =
AVC_DOT_SH3_SH(src32_r, src76_r, src98_r, filt0, filt1, filt2);
3895 out3_r =
AVC_DOT_SH3_SH(src43_r, src87_r, src109_r, filt0, filt1, filt2);
/* src10_r..src43_r are reused here to hold the pairs built from
 * src10..src14, so the names no longer match their contents. */
3897 ILVR_B4_SB(src11, src10, src12, src11, src13, src12, src14, src13, src10_r,
3898 src21_r, src32_r, src43_r);
/* Output rows 4-7. */
3899 out4_r =
AVC_DOT_SH3_SH(src76_r, src98_r, src10_r, filt0, filt1, filt2);
3900 out5_r =
AVC_DOT_SH3_SH(src87_r, src109_r, src21_r, filt0, filt1, filt2);
3901 out6_r =
AVC_DOT_SH3_SH(src98_r, src10_r, src32_r, filt0, filt1, filt2);
3902 out7_r =
AVC_DOT_SH3_SH(src109_r, src21_r, src43_r, filt0, filt1, filt2);
/* Pack source rows src9..src12 two per vector for the average below. */
3903 PCKEV_D2_SB(src10, src9, src12, src11, tmp2, tmp3);
3906 SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
3907 SAT_SH4_SH(out4_r, out5_r, out6_r, out7_r, 7);
/* Four 64-bit loads from dst rows 0-3, then four more from rows 4-7. */
3909 LD4(dst, stride, tp0, tp1, tp2, tp3);
3912 LD4(dst + 4 * stride, stride, tp0, tp1, tp2, tp3);
3916 PCKEV_B2_SB(out1_r, out0_r, out3_r, out2_r, out0, out1);
3917 PCKEV_B2_SB(out5_r, out4_r, out7_r, out6_r, out2, out3);
/* Signed byte-wise average of the filtered rows with the packed source rows. */
3918 out0 = __msa_aver_s_b(out0, tmp0);
3919 out1 = __msa_aver_s_b(out1, tmp1);
3920 out2 = __msa_aver_s_b(out2, tmp2);
3921 out3 = __msa_aver_s_b(out3, tmp3);
3923 AVER_UB4_UB(out0, dst0, out1, dst1, out2, dst2, out3, dst3, dst0, dst1,
3925 ST8x8_UB(dst0, dst1, dst2, dst3, dst, stride);
/*
 * Interior of a 4-row vertical filter routine.  The function signature, the
 * declarations of out10/out32, the src pointer updates, the tails of both
 * ILVR_B4_SB calls and the statements that produce `res` and fill dst0 are
 * outside this excerpt.
 * Visible flow: rewind src by two rows, load 5 rows then 4 more, interleave
 * neighbouring rows and join two interleaved pairs per vector, compute two
 * three-term dot products (two output rows each), gather word 0 of source
 * rows src2..src5, average `res` with them and with dst, and store 4x4.
 */
3931 uint32_t tp0, tp1, tp2, tp3;
/* Each constant packs two signed byte taps: 0x01/0xfb (1 and -5), 0x14/0x14
 * (20 and 20), 0xfb/0x01 (-5 and 1).  0xfb01 exceeds INT16_MAX, so its
 * conversion to int16_t is implementation-defined. */
3932 int16_t filt_const0 = 0xfb01;
3933 int16_t filt_const1 = 0x1414;
3934 int16_t filt_const2 = 0x1fb;
3935 v16u8 res, dst0 = { 0 };
3936 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8;
3937 v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
3938 v16i8 src87_r, src2110, src4332, src6554, src8776, filt0, filt1, filt2;
/* Replicate each 16-bit tap pair across a whole vector. */
3941 filt0 = (v16i8) __msa_fill_h(filt_const0);
3942 filt1 = (v16i8) __msa_fill_h(filt_const1);
3943 filt2 = (v16i8) __msa_fill_h(filt_const2);
/* Start two rows above the position passed in. */
3945 src -= (stride * 2);
3946 LD_SB5(src, stride, src0, src1, src2, src3, src4);
3949 ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
/* srcABCD joins interleaved pair AB with pair CD in one vector. */
3951 ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
3953 LD_SB4(src, stride, src5, src6, src7, src8);
3954 ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
3956 ILVR_D2_SB(src65_r, src54_r, src87_r, src76_r, src6554, src8776);
/* src32_r and src54_r are reused as scratch: word 0 of rows src2..src5 is
 * gathered into src32_r for the average below. */
3958 src32_r = (v16i8) __msa_insve_w((v4i32) src2, 1, (v4i32) src3);
3959 src54_r = (v16i8) __msa_insve_w((v4i32) src4, 1, (v4i32) src5);
3960 src32_r = (v16i8) __msa_insve_d((v2i64) src32_r, 1, (v2i64) src54_r);
3961 out10 =
AVC_DOT_SH3_SH(src2110, src4332, src6554, filt0, filt1, filt2);
3962 out32 =
AVC_DOT_SH3_SH(src4332, src6554, src8776, filt0, filt1, filt2);
3965 LW4(dst, stride, tp0, tp1, tp2, tp3);
/* Unsigned byte-wise averages: first with the source rows, then with dst0. */
3968 res = __msa_aver_u_b(res, (v16u8) src32_r);
3969 dst0 = __msa_aver_u_b(res, dst0);
3970 ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dst, stride);
/*
 * Interior of a 4-row vertical filter routine.  The function signature, the
 * declarations of out10/out32, the src pointer updates, the tails of both
 * ILVR_B4_SB calls and the statements that produce `res` and fill dst0 are
 * outside this excerpt.
 * Visible flow: rewind src by two rows, load 5 rows then 4 more, interleave
 * neighbouring rows and join two interleaved pairs per vector, compute two
 * three-term dot products (two output rows each), gather word 0 of source
 * rows src3..src6, average `res` with them and with dst, and store 4x4.
 */
3976 uint32_t tp0, tp1, tp2, tp3;
/* Each constant packs two signed byte taps: 0x01/0xfb (1 and -5), 0x14/0x14
 * (20 and 20), 0xfb/0x01 (-5 and 1).  0xfb01 exceeds INT16_MAX, so its
 * conversion to int16_t is implementation-defined. */
3977 int16_t filt_const0 = 0xfb01;
3978 int16_t filt_const1 = 0x1414;
3979 int16_t filt_const2 = 0x1fb;
3980 v16u8 res, dst0 = { 0 };
3981 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8;
3982 v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
3983 v16i8 src87_r, src2110, src4332, src6554, src8776, filt0, filt1, filt2;
/* Replicate each 16-bit tap pair across a whole vector. */
3986 filt0 = (v16i8) __msa_fill_h(filt_const0);
3987 filt1 = (v16i8) __msa_fill_h(filt_const1);
3988 filt2 = (v16i8) __msa_fill_h(filt_const2);
/* Start two rows above the position passed in. */
3990 src -= (stride * 2);
3992 LD_SB5(src, stride, src0, src1, src2, src3, src4);
3995 ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
/* srcABCD joins interleaved pair AB with pair CD in one vector. */
3997 ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
3999 LD_SB4(src, stride, src5, src6, src7, src8);
4000 ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
4002 ILVR_D2_SB(src65_r, src54_r, src87_r, src76_r, src6554, src8776);
4004 out10 =
AVC_DOT_SH3_SH(src2110, src4332, src6554, filt0, filt1, filt2);
4005 out32 =
AVC_DOT_SH3_SH(src4332, src6554, src8776, filt0, filt1, filt2);
4008 LW4(dst, stride, tp0, tp1, tp2, tp3);
/* src32_r and src54_r are reused as scratch: word 0 of rows src3..src6 is
 * gathered into src32_r for the average below. */
4011 src32_r = (v16i8) __msa_insve_w((v4i32) src3, 1, (v4i32) src4);
4012 src54_r = (v16i8) __msa_insve_w((v4i32) src5, 1, (v4i32) src6);
4013 src32_r = (v16i8) __msa_insve_d((v2i64) src32_r, 1, (v2i64) src54_r);
/* Unsigned byte-wise averages: first with the source rows, then with dst0. */
4014 res = __msa_aver_u_b(res, (v16u8) src32_r);
4015 dst0 = __msa_aver_u_b(res, dst0);
4016 ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dst, stride);
4031 src - (stride * 2) +
4048 src - (stride * 2) +
4065 src - (stride * 2) +
4066 sizeof(
uint8_t), dst, stride);
4081 src - (stride * 2) +
4082 sizeof(
uint8_t), dst, stride);
4098 src - (stride * 2) +
4099 sizeof(
uint8_t), dst, stride);
4114 src - (stride * 2) +
4115 sizeof(
uint8_t), dst, stride);
/*
 * Interior of a two-pass filter routine with an outer loop of 2 and an inner
 * loop of 4 iterations.  The function signature, the mask loads, the
 * statements that produce hz_out0..hz_out8 and out0/out1, the pointer updates,
 * the stores, the tails of several macro calls and the closing braces of both
 * loops are outside this excerpt.
 * Visible flow per inner iteration: load two rows, interleave neighbouring
 * hz_out rows, run three-term 32-bit dot products on them (AVC_DOT_SW3_SW),
 * pack to 16 bit, average with hz_out2/hz_out3 rounded-shifted by 5, average
 * with two rows of dst; then repeat for two more rows using hz_out4/hz_out5.
 */
4121 uint64_t tp0, tp1, tp2, tp3;
4124 uint32_t multiple8_cnt, loop_cnt;
/* Each constant packs two signed 16-bit taps: 0x0001/0xfffb (1 and -5),
 * 0x0014/0x0014 (20 and 20), 0xfffb/0x0001 (-5 and 1).  0xfffb0001 exceeds
 * INT32_MAX, so its conversion to int32_t is implementation-defined. */
4125 const int32_t filt_const0 = 0xfffb0001;
4126 const int32_t filt_const1 = 0x140014;
4127 const int32_t filt_const2 = 0x1fffb;
4128 v16u8 out0, out1, dst0 = { 0 }, dst1 = { 0 };
4129 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8, mask0, mask1;
4131 v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
4132 v8i16 hz_out7, hz_out8, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
4133 v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
4134 v8i16 hz_out65_r, hz_out76_r, hz_out87_r, hz_out10_l, hz_out21_l;
4135 v8i16 hz_out32_l, hz_out43_l, hz_out54_l, hz_out65_l, hz_out76_l;
4136 v8i16 hz_out87_l, filt0, filt1, filt2;
4137 v4i32 tmp0_w, tmp1_w;
/* Replicate each 32-bit tap pair across a whole vector. */
4139 filt0 = (v8i16) __msa_fill_w(filt_const0);
4140 filt1 = (v8i16) __msa_fill_w(filt_const1);
4141 filt2 = (v8i16) __msa_fill_w(filt_const2);
/* Outer loop runs twice (NOTE(review): presumably once per 8-column half). */
4145 for (multiple8_cnt = 2; multiple8_cnt--;) {
4149 LD_SB5(src, stride, src0, src1, src2, src3, src4);
/* Inner loop runs four times. */
4159 for (loop_cnt = 4; loop_cnt--;) {
4160 LD_SB2(src, stride, src5, src6);
/* hz_outNM_r / hz_outNM_l pair row N with row M (right and left halves). */
4166 ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2,
4167 hz_out4, hz_out3, hz_out10_r, hz_out21_r, hz_out32_r,
4169 ILVL_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2,
4170 hz_out4, hz_out3, hz_out10_l, hz_out21_l, hz_out32_l,
4172 ILVR_H2_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out54_r,
4174 ILVL_H2_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out54_l,
4176 tmp0_w =
AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0,
4178 tmp1_w =
AVC_DOT_SW3_SW(hz_out10_l, hz_out32_l, hz_out54_l, filt0,
4180 tmp0 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
4181 tmp0_w =
AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0,
4183 tmp1_w =
AVC_DOT_SW3_SW(hz_out21_l, hz_out43_l, hz_out65_l, filt0,
4185 tmp2 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
/* Rounded right shift by 5 of the single-pass rows hz_out2 and hz_out3. */
4187 tmp1 = __msa_srari_h(hz_out2, 5);
4188 tmp3 = __msa_srari_h(hz_out3, 5);
/* Signed 16-bit average of the two-pass result with the single-pass rows. */
4191 tmp0 = __msa_aver_s_h(tmp0, tmp1);
4192 tmp1 = __msa_aver_s_h(tmp2, tmp3);
4194 LD2(dst, stride, tp0, tp1);
4198 dst0 = __msa_aver_u_b(out0, dst0);
/* Second pair of rows in this iteration. */
4202 LD_SB2(src, stride, src7, src8);
4208 ILVR_H2_SH(hz_out7, hz_out6, hz_out8, hz_out7, hz_out76_r,
4210 ILVL_H2_SH(hz_out7, hz_out6, hz_out8, hz_out7, hz_out76_l,
4212 tmp0_w =
AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0,
4214 tmp1_w =
AVC_DOT_SW3_SW(hz_out32_l, hz_out54_l, hz_out76_l, filt0,
4216 tmp4 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
4217 tmp0_w =
AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0,
4219 tmp1_w =
AVC_DOT_SW3_SW(hz_out43_l, hz_out65_l, hz_out87_l, filt0,
4221 tmp6 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
4223 tmp5 = __msa_srari_h(hz_out4, 5);
4224 tmp7 = __msa_srari_h(hz_out5, 5);
4227 tmp2 = __msa_aver_s_h(tmp4, tmp5);
4228 tmp3 = __msa_aver_s_h(tmp6, tmp7);
4230 LD2(dst, stride, tp2, tp3);
4234 dst1 = __msa_aver_u_b(out1, dst1);
/*
 * Interior of a two-pass filter routine with an outer loop of 2 and an inner
 * loop of 4 iterations.  The function signature, the mask loads, the
 * statements that produce hz_out0..hz_out8 and out0/out1, the pointer updates,
 * the stores, the tails of several macro calls and the closing braces of both
 * loops are outside this excerpt.
 * Visible flow per inner iteration: load two rows, interleave neighbouring
 * hz_out rows, run three-term 32-bit dot products on them (AVC_DOT_SW3_SW),
 * pack to 16 bit, average with hz_out3/hz_out4 rounded-shifted by 5, average
 * with two rows of dst; then repeat for two more rows using hz_out5/hz_out6.
 */
4253 uint64_t tp0, tp1, tp2, tp3;
4256 uint32_t multiple8_cnt, loop_cnt;
/* Each constant packs two signed 16-bit taps: 0x0001/0xfffb (1 and -5),
 * 0x0014/0x0014 (20 and 20), 0xfffb/0x0001 (-5 and 1).  0xfffb0001 exceeds
 * INT32_MAX, so its conversion to int32_t is implementation-defined. */
4257 const int32_t filt_const0 = 0xfffb0001;
4258 const int32_t filt_const1 = 0x140014;
4259 const int32_t filt_const2 = 0x1fffb;
4260 v16u8 out0, out1, dst0 = { 0 }, dst1 = { 0 };
4261 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8, mask0, mask1;
4263 v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
4264 v8i16 hz_out7, hz_out8, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
4265 v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
4266 v8i16 hz_out65_r, hz_out76_r, hz_out87_r, hz_out10_l, hz_out21_l;
4267 v8i16 hz_out32_l, hz_out43_l, hz_out54_l, hz_out65_l, hz_out76_l;
4268 v8i16 hz_out87_l, filt0, filt1, filt2;
4269 v4i32 tmp0_w, tmp1_w;
/* Replicate each 32-bit tap pair across a whole vector. */
4271 filt0 = (v8i16) __msa_fill_w(filt_const0);
4272 filt1 = (v8i16) __msa_fill_w(filt_const1);
4273 filt2 = (v8i16) __msa_fill_w(filt_const2);
/* Outer loop runs twice (NOTE(review): presumably once per 8-column half). */
4277 for (multiple8_cnt = 2; multiple8_cnt--;) {
4281 LD_SB5(src, stride, src0, src1, src2, src3, src4);
/* Inner loop runs four times. */
4291 for (loop_cnt = 4; loop_cnt--;) {
4292 LD_SB2(src, stride, src5, src6);
/* hz_outNM_r / hz_outNM_l pair row N with row M (right and left halves). */
4298 ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2,
4299 hz_out4, hz_out3, hz_out10_r, hz_out21_r, hz_out32_r,
4301 ILVL_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2,
4302 hz_out4, hz_out3, hz_out10_l, hz_out21_l, hz_out32_l,
4304 ILVR_H2_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out54_r, hz_out65_r);
4305 ILVL_H2_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out54_l, hz_out65_l);
4307 tmp0_w =
AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0,
4309 tmp1_w =
AVC_DOT_SW3_SW(hz_out10_l, hz_out32_l, hz_out54_l, filt0,
4311 tmp0 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
4312 tmp0_w =
AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0,
4314 tmp1_w =
AVC_DOT_SW3_SW(hz_out21_l, hz_out43_l, hz_out65_l, filt0,
4316 tmp2 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
/* Rounded right shift by 5 of the single-pass rows hz_out3 and hz_out4. */
4318 tmp1 = __msa_srari_h(hz_out3, 5);
4319 tmp3 = __msa_srari_h(hz_out4, 5);
/* Signed 16-bit average of the two-pass result with the single-pass rows. */
4322 tmp0 = __msa_aver_s_h(tmp0, tmp1);
4323 tmp1 = __msa_aver_s_h(tmp2, tmp3);
4325 LD2(dst, stride, tp0, tp1);
4328 dst0 = __msa_aver_u_b(out0, dst0);
/* Second pair of rows in this iteration. */
4332 LD_SB2(src, stride, src7, src8);
4338 ILVR_H2_SH(hz_out7, hz_out6, hz_out8, hz_out7, hz_out76_r,
4340 ILVL_H2_SH(hz_out7, hz_out6, hz_out8, hz_out7, hz_out76_l,
4342 tmp0_w =
AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0,
4344 tmp1_w =
AVC_DOT_SW3_SW(hz_out32_l, hz_out54_l, hz_out76_l, filt0,
4346 tmp4 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
4347 tmp0_w =
AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0,
4349 tmp1_w =
AVC_DOT_SW3_SW(hz_out43_l, hz_out65_l, hz_out87_l, filt0,
4351 tmp6 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
4353 tmp5 = __msa_srari_h(hz_out5, 5);
4354 tmp7 = __msa_srari_h(hz_out6, 5);
4357 tmp2 = __msa_aver_s_h(tmp4, tmp5);
4358 tmp3 = __msa_aver_s_h(tmp6, tmp7);
4360 LD2(dst, stride, tp2, tp3);
4363 dst1 = __msa_aver_u_b(out1, dst1);
/*
 * Interior of a two-pass filter routine that handles 8 rows in two groups of
 * four.  The function signature, the mask loads, the statements that produce
 * hz_out0..hz_out12 and out0/out1, the pointer updates, the stores and the
 * tails of several macro calls are outside this excerpt.
 * Visible flow: step src back by two rows and two bytes, load 5 + 4 rows,
 * interleave neighbouring hz_out rows, run three-term 32-bit dot products
 * (AVC_DOT_SW3_SW) for four output rows, average them with hz_out2..hz_out5
 * after a rounded shift by 5 and a saturate, read four rows of dst; then do
 * the same for the next four rows using hz_out6..hz_out9.
 */
/* Each constant packs two signed 16-bit taps: 0x0001/0xfffb (1 and -5),
 * 0x0014/0x0014 (20 and 20), 0xfffb/0x0001 (-5 and 1).  0xfffb0001 exceeds
 * INT32_MAX, so its conversion to int32_t is implementation-defined. */
4382 const int32_t filt_const0 = 0xfffb0001;
4383 const int32_t filt_const1 = 0x140014;
4384 const int32_t filt_const2 = 0x1fffb;
4385 uint64_t tp0, tp1, tp2, tp3;
4386 v16u8 dst0 = { 0 }, dst1 = { 0 }, out0, out1;
4387 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
4388 v16i8 src11, src12, mask0, mask1, mask2;
4389 v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
4390 v8i16 hz_out7, hz_out8, hz_out9, hz_out10, hz_out11, hz_out12;
4391 v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
4392 v8i16 hz_out65_r, hz_out76_r, hz_out87_r, hz_out89_r, hz_out910_r;
4393 v8i16 hz_out1110_r, hz_out1211_r, tmp0, tmp1, tmp2, tmp3;
4394 v8i16 hz_out10_l, hz_out21_l, hz_out32_l, hz_out43_l, hz_out54_l;
4395 v8i16 hz_out65_l, hz_out76_l, hz_out87_l, hz_out89_l, hz_out910_l;
4396 v8i16 hz_out1110_l, hz_out1211_l, filt0, filt1, filt2;
4397 v4i32 tmp0_w, tmp1_w;
/* Replicate each 32-bit tap pair across a whole vector. */
4401 filt0 = (v8i16) __msa_fill_w(filt_const0);
4402 filt1 = (v8i16) __msa_fill_w(filt_const1);
4403 filt2 = (v8i16) __msa_fill_w(filt_const2);
/* Start two rows above and two bytes left of the position passed in. */
4405 src -= ((2 *
stride) + 2);
4407 LD_SB5(src, stride, src0, src1, src2, src3, src4);
4417 LD_SB4(src, stride, src5, src6, src7, src8);
/* hz_outNM_r / hz_outNM_l pair row N with row M (right and left halves). */
4426 ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
4427 hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r);
4428 ILVL_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
4429 hz_out3, hz_out10_l, hz_out21_l, hz_out32_l, hz_out43_l);
4430 ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
4431 hz_out7, hz_out54_r, hz_out65_r, hz_out76_r, hz_out87_r);
4432 ILVL_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
4433 hz_out7, hz_out54_l, hz_out65_l, hz_out76_l, hz_out87_l);
/* Output rows 0-3: one right-half and one left-half dot product per row,
 * packed back to 16-bit lanes in tmp0..tmp3. */
4435 tmp0_w =
AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, filt1,
4437 tmp1_w =
AVC_DOT_SW3_SW(hz_out10_l, hz_out32_l, hz_out54_l, filt0, filt1,
4439 tmp0 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
4440 tmp0_w =
AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, filt1,
4442 tmp1_w =
AVC_DOT_SW3_SW(hz_out21_l, hz_out43_l, hz_out65_l, filt0, filt1,
4444 tmp1 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
4445 tmp0_w =
AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, filt1,
4447 tmp1_w =
AVC_DOT_SW3_SW(hz_out32_l, hz_out54_l, hz_out76_l, filt0, filt1,
4449 tmp2 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
4450 tmp0_w =
AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, filt1,
4452 tmp1_w =
AVC_DOT_SW3_SW(hz_out43_l, hz_out65_l, hz_out87_l, filt0, filt1,
4454 tmp3 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
/* hz_out2..hz_out5 are modified in place here; their interleaved copies
 * were taken above. */
4456 SRARI_H4_SH(hz_out2, hz_out3, hz_out4, hz_out5, 5);
4457 SAT_SH4_SH(hz_out2, hz_out3, hz_out4, hz_out5, 7);
4459 LD4(dst, stride, tp0, tp1, tp2, tp3);
/* Signed 16-bit average of the two-pass result with the single-pass rows. */
4463 tmp0 = __msa_aver_s_h(tmp0, hz_out2);
4464 tmp1 = __msa_aver_s_h(tmp1, hz_out3);
4465 tmp2 = __msa_aver_s_h(tmp2, hz_out4);
4466 tmp3 = __msa_aver_s_h(tmp3, hz_out5);
/* Second group of four rows. */
4474 LD_SB4(src, stride, src9, src10, src11, src12);
/* Note the naming: hz_out89_* and hz_out910_* list the rows in ascending
 * order, unlike the other hz_outNM_* names in this block. */
4480 ILVR_H4_SH(hz_out9, hz_out8, hz_out10, hz_out9, hz_out11, hz_out10,
4481 hz_out12, hz_out11, hz_out89_r, hz_out910_r, hz_out1110_r,
4483 ILVL_H4_SH(hz_out9, hz_out8, hz_out10, hz_out9, hz_out11, hz_out10,
4484 hz_out12, hz_out11, hz_out89_l, hz_out910_l, hz_out1110_l,
4486 tmp0_w =
AVC_DOT_SW3_SW(hz_out54_r, hz_out76_r, hz_out89_r, filt0, filt1,
4488 tmp1_w =
AVC_DOT_SW3_SW(hz_out54_l, hz_out76_l, hz_out89_l, filt0, filt1,
4490 tmp0 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
4491 tmp0_w =
AVC_DOT_SW3_SW(hz_out65_r, hz_out87_r, hz_out910_r, filt0, filt1,
4493 tmp1_w =
AVC_DOT_SW3_SW(hz_out65_l, hz_out87_l, hz_out910_l, filt0, filt1,
4495 tmp1 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
4496 tmp0_w =
AVC_DOT_SW3_SW(hz_out76_r, hz_out89_r, hz_out1110_r, filt0, filt1,
4498 tmp1_w =
AVC_DOT_SW3_SW(hz_out76_l, hz_out89_l, hz_out1110_l, filt0, filt1,
4500 tmp2 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
4501 tmp0_w =
AVC_DOT_SW3_SW(hz_out87_r, hz_out910_r, hz_out1211_r, filt0, filt1,
4503 tmp1_w =
AVC_DOT_SW3_SW(hz_out87_l, hz_out910_l, hz_out1211_l, filt0, filt1,
4505 tmp3 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
/* hz_out6..hz_out9 are modified in place here; their interleaved copies
 * were taken above. */
4507 SRARI_H4_SH(hz_out6, hz_out7, hz_out8, hz_out9, 5);
4508 SAT_SH4_SH(hz_out6, hz_out7, hz_out8, hz_out9, 7);
4510 LD4(dst, stride, tp0, tp1, tp2, tp3);
4514 tmp0 = __msa_aver_s_h(tmp0, hz_out6);
4515 tmp1 = __msa_aver_s_h(tmp1, hz_out7);
4516 tmp2 = __msa_aver_s_h(tmp2, hz_out8);
4517 tmp3 = __msa_aver_s_h(tmp3, hz_out9);
/* NOTE(review): the function header and several statements of this body are
 * missing from this excerpt (the embedded numbering jumps, e.g. 4543 to 4547
 * and 4553 to 4563).  The statements that set mask0..mask2 and hz_out0..12
 * are therefore not visible; the comments describe the visible lines only. */
/* Each 32-bit constant packs two 16-bit values:
 * 0xfffb0001 = {0x0001, 0xfffb (-5)}, 0x140014 = {20, 20},
 * 0x1fffb = {0xfffb (-5), 0x0001}. */
4528 const int32_t filt_const0 = 0xfffb0001;
4529 const int32_t filt_const1 = 0x140014;
4530 const int32_t filt_const2 = 0x1fffb;
4531 uint64_t tp0, tp1, tp2, tp3;
4532 v16u8 dst0 = { 0 }, dst1 = { 0 }, out0, out1;
4533 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
4534 v16i8 src11, src12, mask0, mask1, mask2;
4535 v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
4536 v8i16 hz_out7, hz_out8, hz_out9, hz_out10, hz_out11, hz_out12;
4537 v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
4538 v8i16 hz_out65_r, hz_out76_r, hz_out87_r, hz_out89_r, hz_out910_r;
4539 v8i16 hz_out1110_r, hz_out1211_r, tmp0, tmp1, tmp2, tmp3;
4540 v8i16 hz_out10_l, hz_out21_l, hz_out32_l, hz_out43_l, hz_out54_l;
4541 v8i16 hz_out65_l, hz_out76_l, hz_out87_l, hz_out89_l, hz_out910_l;
4542 v8i16 hz_out1110_l, hz_out1211_l, filt0, filt1, filt2;
4543 v4i32 tmp0_w, tmp1_w;
/* Replicate each packed coefficient pair into every 32-bit lane and view the
 * result as halfwords, the operand type AVC_DOT_SW3_SW casts to. */
4547 filt0 = (v8i16) __msa_fill_w(filt_const0);
4548 filt1 = (v8i16) __msa_fill_w(filt_const1);
4549 filt2 = (v8i16) __msa_fill_w(filt_const2);
/* Move src back by two rows and two bytes before loading. */
4551 src -= ((2 *
stride) + 2);
4553 LD_SB5(src, stride, src0, src1, src2, src3, src4);
4563 LD_SB4(src, stride, src5, src6, src7, src8);
/* Interleave each hz_out vector with its predecessor (1 with 0, 2 with 1,
 * ...); ILVR_* results go to the *_r names and ILVL_* results to *_l. */
4572 ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
4573 hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r);
4574 ILVL_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
4575 hz_out3, hz_out10_l, hz_out21_l, hz_out32_l, hz_out43_l);
4576 ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
4577 hz_out7, hz_out54_r, hz_out65_r, hz_out76_r, hz_out87_r);
4578 ILVL_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
4579 hz_out7, hz_out54_l, hz_out65_l, hz_out76_l, hz_out87_l);
/* AVC_DOT_SW3_SW (see the macro near the top of the file) forms a three-term
 * signed dot product into 32-bit lanes, applies a rounding shift by 10 and
 * saturates.  __msa_pckev_h then packs the _r and _l word results into one
 * halfword vector.  The last argument of each call below is cut off in this
 * excerpt. */
4581 tmp0_w =
AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, filt1,
4583 tmp1_w =
AVC_DOT_SW3_SW(hz_out10_l, hz_out32_l, hz_out54_l, filt0, filt1,
4585 tmp0 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
4586 tmp0_w =
AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, filt1,
4588 tmp1_w =
AVC_DOT_SW3_SW(hz_out21_l, hz_out43_l, hz_out65_l, filt0, filt1,
4590 tmp1 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
4591 tmp0_w =
AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, filt1,
4593 tmp1_w =
AVC_DOT_SW3_SW(hz_out32_l, hz_out54_l, hz_out76_l, filt0, filt1,
4595 tmp2 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
4596 tmp0_w =
AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, filt1,
4598 tmp1_w =
AVC_DOT_SW3_SW(hz_out43_l, hz_out65_l, hz_out87_l, filt0, filt1,
4600 tmp3 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
/* hz_out3..hz_out6 go through SRARI (shift 5) and SAT (argument 7),
 * presumably updated in place since no outputs are named, and are then
 * averaged with tmp0..tmp3. */
4602 SRARI_H4_SH(hz_out3, hz_out4, hz_out5, hz_out6, 5);
4603 SAT_SH4_SH(hz_out3, hz_out4, hz_out5, hz_out6, 7);
/* Fetch four 64-bit values from dst, one per stride step (LD4 is defined
 * outside this excerpt). */
4605 LD4(dst, stride, tp0, tp1, tp2, tp3);
4609 tmp0 = __msa_aver_s_h(tmp0, hz_out3);
4610 tmp1 = __msa_aver_s_h(tmp1, hz_out4);
4611 tmp2 = __msa_aver_s_h(tmp2, hz_out5);
4612 tmp3 = __msa_aver_s_h(tmp3, hz_out6);
/* Second pass: the same sequence using src9..src12 and the hz_out5..12
 * interleaves; the averaging partners become hz_out7..hz_out10. */
4620 LD_SB4(src, stride, src9, src10, src11, src12);
4626 ILVR_H4_SH(hz_out9, hz_out8, hz_out10, hz_out9, hz_out11, hz_out10,
4627 hz_out12, hz_out11, hz_out89_r, hz_out910_r, hz_out1110_r,
4629 ILVL_H4_SH(hz_out9, hz_out8, hz_out10, hz_out9, hz_out11, hz_out10,
4630 hz_out12, hz_out11, hz_out89_l, hz_out910_l, hz_out1110_l,
4632 tmp0_w =
AVC_DOT_SW3_SW(hz_out54_r, hz_out76_r, hz_out89_r, filt0, filt1,
4634 tmp1_w =
AVC_DOT_SW3_SW(hz_out54_l, hz_out76_l, hz_out89_l, filt0, filt1,
4636 tmp0 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
4637 tmp0_w =
AVC_DOT_SW3_SW(hz_out65_r, hz_out87_r, hz_out910_r, filt0, filt1,
4639 tmp1_w =
AVC_DOT_SW3_SW(hz_out65_l, hz_out87_l, hz_out910_l, filt0, filt1,
4641 tmp1 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
4642 tmp0_w =
AVC_DOT_SW3_SW(hz_out76_r, hz_out89_r, hz_out1110_r, filt0, filt1,
4644 tmp1_w =
AVC_DOT_SW3_SW(hz_out76_l, hz_out89_l, hz_out1110_l, filt0, filt1,
4646 tmp2 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
4647 tmp0_w =
AVC_DOT_SW3_SW(hz_out87_r, hz_out910_r, hz_out1211_r, filt0, filt1,
4649 tmp1_w =
AVC_DOT_SW3_SW(hz_out87_l, hz_out910_l, hz_out1211_l, filt0, filt1,
4651 tmp3 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
4653 SRARI_H4_SH(hz_out7, hz_out8, hz_out9, hz_out10, 5);
4654 SAT_SH4_SH(hz_out7, hz_out8, hz_out9, hz_out10, 7);
4656 LD4(dst, stride, tp0, tp1, tp2, tp3);
4660 tmp0 = __msa_aver_s_h(tmp0, hz_out7);
4661 tmp1 = __msa_aver_s_h(tmp1, hz_out8);
4662 tmp2 = __msa_aver_s_h(tmp2, hz_out9);
4663 tmp3 = __msa_aver_s_h(tmp3, hz_out10);
/* NOTE(review): the function header and several statements of this body are
 * missing from this excerpt (the embedded numbering jumps, e.g. 4684 to 4689
 * and 4697 to 4707).  The declarations of tmp0/tmp1 and the statements that
 * produce mask0..2, hz_out0/2/4/6/8 and res are not visible. */
4674 uint32_t tp0, tp1, tp2, tp3;
/* Each 32-bit constant packs two 16-bit values:
 * 0xfffb0001 = {0x0001, 0xfffb (-5)}, 0x140014 = {20, 20},
 * 0x1fffb = {0xfffb (-5), 0x0001}. */
4675 const int32_t filt_const0 = 0xfffb0001;
4676 const int32_t filt_const1 = 0x140014;
4677 const int32_t filt_const2 = 0x1fffb;
4678 v16u8 res,
out = { 0 };
4679 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8;
4680 v16i8 mask0, mask1, mask2;
4681 v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
4682 v8i16 hz_out7, hz_out8, dst0, dst1, filt0, filt1, filt2;
4683 v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
4684 v8i16 hz_out65_r, hz_out76_r, hz_out87_r;
/* Replicate each packed coefficient pair into every 32-bit lane. */
4689 filt0 = (v8i16) __msa_fill_w(filt_const0);
4690 filt1 = (v8i16) __msa_fill_w(filt_const1);
4691 filt2 = (v8i16) __msa_fill_w(filt_const2);
/* Move src back by two rows and two bytes before loading. */
4693 src -= ((2 *
stride) + 2);
4695 LD_SB5(src, stride, src0, src1, src2, src3, src4);
4697 LD_SB4(src, stride, src5, src6, src7, src8);
/* hz_out1/3/5/7 are derived from hz_out0/2/4/6 with PCKOD_D2_SH
 * (presumably the upper doubleword of each source - confirm against the
 * macro definition). */
4707 PCKOD_D2_SH(hz_out0, hz_out0, hz_out2, hz_out2, hz_out1, hz_out3);
4708 PCKOD_D2_SH(hz_out4, hz_out4, hz_out6, hz_out6, hz_out5, hz_out7);
/* Interleave each hz_out vector with its predecessor; only the ILVR
 * variants are used in this body. */
4710 ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
4711 hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r);
4712 ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
4713 hz_out7, hz_out54_r, hz_out65_r, hz_out76_r, hz_out87_r);
/* AVC_DOT_SW3_SW (see the macro near the top of the file): three-term signed
 * dot product into 32-bit lanes, rounding shift by 10, then saturation.
 * Two results are packed into one halfword vector by __msa_pckev_h.  The
 * last argument of each call below is cut off in this excerpt. */
4715 tmp0 =
AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, filt1,
4717 tmp1 =
AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, filt1,
4719 dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
4720 tmp0 =
AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, filt1,
4722 tmp1 =
AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, filt1,
4724 dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
/* Average the packed results with hz_out2 and hz_out4. */
4729 dst0 = __msa_aver_s_h(dst0, hz_out2);
4730 dst1 = __msa_aver_s_h(dst1, hz_out4);
/* Fetch four 32-bit words from dst, average the byte result with `out`, and
 * store a 4x4 block back to dst.  The lines that fill `out` from tp0..tp3
 * and compute `res` are not visible here. */
4731 LW4(dst, stride, tp0, tp1, tp2, tp3);
4734 res = __msa_aver_u_b(res, out);
4735 ST4x4_UB(res, res, 0, 1, 2, 3, dst, stride);
/* NOTE(review): the function header and several statements of this body are
 * missing from this excerpt (the embedded numbering jumps, e.g. 4751 to 4756
 * and 4764 to 4774).  The declarations of tmp0/tmp1 and the statements that
 * produce mask0..2, hz_out0/2/4/6/8 and res are not visible. */
/* Each 32-bit constant packs two 16-bit values:
 * 0xfffb0001 = {0x0001, 0xfffb (-5)}, 0x140014 = {20, 20},
 * 0x1fffb = {0xfffb (-5), 0x0001}. */
4741 const int32_t filt_const0 = 0xfffb0001;
4742 const int32_t filt_const1 = 0x140014;
4743 const int32_t filt_const2 = 0x1fffb;
4744 uint32_t tp0, tp1, tp2, tp3;
4745 v16u8 res,
out = { 0 };
4746 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8;
4747 v16i8 mask0, mask1, mask2;
4748 v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
4749 v8i16 hz_out7, hz_out8, dst0, dst1, filt0, filt1, filt2;
4750 v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
4751 v8i16 hz_out65_r, hz_out76_r, hz_out87_r;
/* Replicate each packed coefficient pair into every 32-bit lane. */
4756 filt0 = (v8i16) __msa_fill_w(filt_const0);
4757 filt1 = (v8i16) __msa_fill_w(filt_const1);
4758 filt2 = (v8i16) __msa_fill_w(filt_const2);
/* Move src back by two rows and two bytes before loading. */
4760 src -= ((2 *
stride) + 2);
4762 LD_SB5(src, stride, src0, src1, src2, src3, src4);
4764 LD_SB4(src, stride, src5, src6, src7, src8);
/* hz_out1/3/5/7 are derived from hz_out0/2/4/6 with PCKOD_D2_SH
 * (presumably the upper doubleword of each source - confirm against the
 * macro definition). */
4774 PCKOD_D2_SH(hz_out0, hz_out0, hz_out2, hz_out2, hz_out1, hz_out3);
4775 PCKOD_D2_SH(hz_out4, hz_out4, hz_out6, hz_out6, hz_out5, hz_out7);
/* Interleave each hz_out vector with its predecessor; only the ILVR
 * variants are used in this body. */
4777 ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
4778 hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r);
4779 ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
4780 hz_out7, hz_out54_r, hz_out65_r, hz_out76_r, hz_out87_r);
/* AVC_DOT_SW3_SW (see the macro near the top of the file): three-term signed
 * dot product into 32-bit lanes, rounding shift by 10, then saturation.
 * Two results are packed into one halfword vector by __msa_pckev_h.  The
 * last argument of each call below is cut off in this excerpt. */
4782 tmp0 =
AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, filt1,
4784 tmp1 =
AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, filt1,
4786 dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
4787 tmp0 =
AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, filt1,
4789 tmp1 =
AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, filt1,
4791 dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
/* hz_out0 and hz_out1 are reused as scratch: they receive the PCKEV_D2_SH
 * combinations of hz_out3/hz_out4 and hz_out5/hz_out6, which become the
 * averaging partners below. */
4793 PCKEV_D2_SH(hz_out4, hz_out3, hz_out6, hz_out5, hz_out0, hz_out1);
4797 dst0 = __msa_aver_s_h(dst0, hz_out0);
4798 dst1 = __msa_aver_s_h(dst1, hz_out1);
/* Fetch four 32-bit words from dst, average the byte result with `out`, and
 * store a 4x4 block back to dst.  The lines that fill `out` from tp0..tp3
 * and compute `res` are not visible here. */
4799 LW4(dst, stride, tp0, tp1, tp2, tp3);
4802 res = __msa_aver_u_b(res, out);
4803 ST4x4_UB(res, res, 0, 1, 2, 3, dst, stride);
/* NOTE(review): the function header, the declaration of loop_cnt, the end of
 * the loop and several statements are missing from this excerpt (the embedded
 * numbering jumps, e.g. 4825 to 4829 and 4857 to 4861). */
/* Each 16-bit constant packs two 8-bit values:
 * 0xfb01 = {0x01, 0xfb (-5)}, 0x1414 = {20, 20}, 0x1fb = {0xfb (-5), 0x01}.
 * 0xfb01 is above INT16_MAX, so the initialiser relies on the usual
 * two's-complement conversion to int16_t. */
4810 int16_t filt_const0 = 0xfb01;
4811 int16_t filt_const1 = 0x1414;
4812 int16_t filt_const2 = 0x1fb;
4813 v16u8 res0, res1, res2, res3, dst0, dst1, dst2, dst3;
4814 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8;
4815 v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
4816 v16i8 src87_r, src10_l, src32_l, src54_l, src76_l, src21_l, src43_l;
4817 v16i8 src65_l, src87_l, filt0, filt1, filt2;
4818 v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
/* Replicate each packed coefficient pair into every 16-bit lane and view the
 * result as bytes, the operand type AVC_DOT_SH3_SH casts to. */
4820 filt0 = (v16i8) __msa_fill_h(filt_const0);
4821 filt1 = (v16i8) __msa_fill_h(filt_const1);
4822 filt2 = (v16i8) __msa_fill_h(filt_const2);
/* Move src back by two rows before the first load. */
4823 src -= (stride * 2);
4825 LD_SB5(src, stride, src0, src1, src2, src3, src4);
/* Byte-interleave each loaded vector with the one before it; the output
 * lists of these two calls are cut off in this excerpt. */
4829 ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
4831 ILVL_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_l, src21_l,
/* Four iterations; each loads four more vectors (src5..src8), produces four
 * outputs and ends with a four-vector store to dst. */
4834 for (loop_cnt = 4; loop_cnt--;) {
4835 LD_SB4(src, stride, src5, src6, src7, src8);
4839 ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r,
4840 src65_r, src76_r, src87_r);
4841 ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_l,
4842 src65_l, src76_l, src87_l);
/* AVC_DOT_SH3_SH (see the macro near the top of the file) is a three-term
 * signed byte dot product accumulated into 16-bit lanes. */
4843 out0_r =
AVC_DOT_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
4844 out1_r =
AVC_DOT_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
4845 out2_r =
AVC_DOT_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
4846 out3_r =
AVC_DOT_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
4847 out0_l =
AVC_DOT_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2);
4848 out1_l =
AVC_DOT_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2);
4849 out2_l =
AVC_DOT_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2);
4850 out3_l =
AVC_DOT_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2);
4852 SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
4854 SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
/* Load the current dst contents, pack the _l/_r halfword results to bytes
 * and store four vectors back to dst.  The statements between the pack and
 * the store are not visible in this excerpt. */
4855 LD_UB4(dst, stride, dst0, dst1, dst2, dst3);
4856 PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
4857 out3_r, res0, res1, res2, res3);
4861 ST_UB4(res0, res1, res2, res3, dst, stride);
/* NOTE(review): the function header and several statements of this body are
 * missing from this excerpt (the embedded numbering jumps, e.g. 4896 to 4900
 * and 4932 to 4937).  The statements that fill dst0..dst3 from tp0..tp3 and
 * produce out0..out3 are not visible. */
4879 uint64_t tp0, tp1, tp2, tp3;
/* Each 16-bit constant packs two 8-bit values:
 * 0xfb01 = {0x01, 0xfb (-5)}, 0x1414 = {20, 20}, 0x1fb = {0xfb (-5), 0x01}.
 * 0xfb01 is above INT16_MAX, so the initialiser relies on the usual
 * two's-complement conversion to int16_t. */
4880 const int16_t filt_const0 = 0xfb01;
4881 const int16_t filt_const1 = 0x1414;
4882 const int16_t filt_const2 = 0x1fb;
4883 v16u8 dst0 = { 0 }, dst1 = { 0 }, dst2 = { 0 }, dst3 = { 0 };
4884 v16u8 out0, out1, out2, out3;
4885 v16i8
src0,
src1, src2, src3, src4, src7, src8, src9, src10, src109_r;
4886 v16i8 src10_r, src32_r, src76_r, src98_r, src21_r, src43_r, src87_r;
4887 v16i8 filt0, filt1, filt2;
4888 v8i16 out0_r, out1_r, out2_r, out3_r, out4_r, out5_r, out6_r, out7_r;
/* Replicate each packed coefficient pair into every 16-bit lane. */
4890 filt0 = (v16i8) __msa_fill_h(filt_const0);
4891 filt1 = (v16i8) __msa_fill_h(filt_const1);
4892 filt2 = (v16i8) __msa_fill_h(filt_const2);
/* Move src back by two rows before the first load. */
4894 src -= (stride * 2);
4896 LD_SB5(src, stride, src0, src1, src2, src3, src4);
/* Byte-interleave each loaded vector with the one before it; the output list
 * of this call is cut off in this excerpt. */
4900 ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
/* The next four vectors are named src7..src10 (there is no src5/src6 in this
 * body); src7 is interleaved directly with src4. */
4903 LD_SB4(src, stride, src7, src8, src9, src10);
4906 ILVR_B4_SB(src7, src4, src8, src7, src9, src8, src10, src9, src76_r,
4907 src87_r, src98_r, src109_r);
/* AVC_DOT_SH3_SH (see the macro near the top of the file) is a three-term
 * signed byte dot product accumulated into 16-bit lanes. */
4908 out0_r =
AVC_DOT_SH3_SH(src10_r, src32_r, src76_r, filt0, filt1, filt2);
4909 out1_r =
AVC_DOT_SH3_SH(src21_r, src43_r, src87_r, filt0, filt1, filt2);
4910 out2_r =
AVC_DOT_SH3_SH(src32_r, src76_r, src98_r, filt0, filt1, filt2);
4911 out3_r =
AVC_DOT_SH3_SH(src43_r, src87_r, src109_r, filt0, filt1, filt2);
/* src0..src3 and src10_r..src43_r are reused for the following four vectors;
 * the new src0 is interleaved with src10 from the previous load. */
4913 LD_SB4(src, stride, src0, src1, src2, src3);
4915 ILVR_B4_SB(src0, src10, src1, src0, src2, src1, src3, src2, src10_r,
4916 src21_r, src32_r, src43_r);
4917 out4_r =
AVC_DOT_SH3_SH(src76_r, src98_r, src10_r, filt0, filt1, filt2);
4918 out5_r =
AVC_DOT_SH3_SH(src87_r, src109_r, src21_r, filt0, filt1, filt2);
4919 out6_r =
AVC_DOT_SH3_SH(src98_r, src10_r, src32_r, filt0, filt1, filt2);
4920 out7_r =
AVC_DOT_SH3_SH(src109_r, src21_r, src43_r, filt0, filt1, filt2);
/* Two LD4 calls read 64-bit values from dst: first at dst, then starting
 * four strides further on. */
4922 LD4(dst, stride, tp0, tp1, tp2, tp3);
4925 LD4(dst + 4 * stride, stride, tp0, tp1, tp2, tp3);
4931 SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
4932 SAT_SH4_SH(out4_r, out5_r, out6_r, out7_r, 7);
/* Average out0..out3 with dst0..dst3 (the result list is cut off in this
 * excerpt) and store an 8x8 block to dst. */
4937 AVER_UB4_UB(out0, dst0, out1, dst1, out2, dst2, out3, dst3, dst0, dst1,
4939 ST8x8_UB(dst0, dst1, dst2, dst3, dst, stride);
/* NOTE(review): the function header and several statements of this body are
 * missing from this excerpt (the embedded numbering jumps, e.g. 4952 to 4955
 * and 4973 to 4976).  The declarations of out10/out32 and the statements
 * that compute res and fill dst0 from tp0..tp3 are not visible. */
4945 uint32_t tp0, tp1, tp2, tp3;
/* Each 16-bit constant packs two 8-bit values:
 * 0xfb01 = {0x01, 0xfb (-5)}, 0x1414 = {20, 20}, 0x1fb = {0xfb (-5), 0x01}.
 * 0xfb01 is above INT16_MAX, so the initialiser relies on the usual
 * two's-complement conversion to int16_t. */
4946 int16_t filt_const0 = 0xfb01;
4947 int16_t filt_const1 = 0x1414;
4948 int16_t filt_const2 = 0x1fb;
4949 v16u8 res, dst0 = { 0 };
4950 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8;
4951 v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
4952 v16i8 src87_r, src2110, src4332, src6554, src8776, filt0, filt1, filt2;
/* Replicate each packed coefficient pair into every 16-bit lane. */
4955 filt0 = (v16i8) __msa_fill_h(filt_const0);
4956 filt1 = (v16i8) __msa_fill_h(filt_const1);
4957 filt2 = (v16i8) __msa_fill_h(filt_const2);
/* Move src back by two rows before the first load. */
4959 src -= (stride * 2);
4960 LD_SB5(src, stride, src0, src1, src2, src3, src4);
/* Byte-interleave adjacent vectors, then combine two interleaved pairs per
 * vector with ILVR_D2_SB (src2110 from src21_r/src10_r, and so on).  The
 * output lists of the ILVR_B4_SB calls are cut off in this excerpt. */
4963 ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
4965 ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
4967 LD_SB4(src, stride, src5, src6, src7, src8);
4968 ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
4970 ILVR_D2_SB(src65_r, src54_r, src87_r, src76_r, src6554, src8776);
/* AVC_DOT_SH3_SH (see the macro near the top of the file) is a three-term
 * signed byte dot product accumulated into 16-bit lanes. */
4972 out10 =
AVC_DOT_SH3_SH(src2110, src4332, src6554, filt0, filt1, filt2);
4973 out32 =
AVC_DOT_SH3_SH(src4332, src6554, src8776, filt0, filt1, filt2);
/* Fetch four 32-bit words from dst, average res with dst0 and store a 4x4
 * block back to dst. */
4976 LW4(dst, stride, tp0, tp1, tp2, tp3);
4979 dst0 = __msa_aver_u_b(res, dst0);
4980 ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dst, stride);
/* NOTE(review): the function header, the end of the loop and several
 * statements are missing from this excerpt (the embedded numbering jumps,
 * e.g. 4988 to 4990, 4998 to 5004 and 5013 to 5022).  The declarations of
 * src11, row, out and dst0 and the statements that set mask3..mask5 and
 * vt_res0..vt_res3 are not visible. */
4988 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
4990 v8i16 vt_res0, vt_res1, vt_res2, vt_res3, tmp0, tmp1, tmp2, tmp3, mask3;
4991 v8i16 shf_vec0, shf_vec1, shf_vec2, shf_vec3, shf_vec4, shf_vec5, shf_vec6;
4992 v8i16 shf_vec7, shf_vec8, shf_vec9, shf_vec10, shf_vec11, mask4, mask5;
4993 v4i32 hz_res0, hz_res1, hz_res2, hz_res3;
/* Halfword shuffle patterns; indices run up to 8, so they select from the
 * pair of source vectors handed to VSHF_H3_SH. */
4994 v8i16 mask0 = { 0, 5, 1, 6, 2, 7, 3, 8 };
4995 v8i16 mask1 = { 1, 4, 2, 5, 3, 6, 4, 7 };
4996 v8i16 mask2 = { 2, 3, 3, 4, 4, 5, 5, 6 };
/* Halfword coefficients -5 and 20 for the DPADD accumulation below. */
4997 v8i16 minus5h = __msa_ldi_h(-5);
4998 v8i16 plus20h = __msa_ldi_h(20);
/* Move src back by two rows and two bytes before loading. */
5004 src -= ((2 *
stride) + 2);
/* Prime five vectors at src and five more at src + 8. */
5006 LD_SB5(src, stride, src0, src1, src2, src3, src4);
5007 LD_SB5(src + 8, stride, src7, src8, src9, src10, src11);
/* 16 iterations; each one loads two vectors with a pitch of 8 bytes. */
5012 for (row = 16; row--;) {
5013 LD_SB2(src, 8, src5, src6);
/* Build three shuffled views of each vt_res pair, using mask0..2 and then
 * mask3..5. */
5022 VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0,
5023 mask1, mask2, shf_vec0, shf_vec1, shf_vec2);
5024 VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0,
5025 mask1, mask2, shf_vec3, shf_vec4, shf_vec5);
5026 VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask3,
5027 mask4, mask5, shf_vec6, shf_vec7, shf_vec8);
5028 VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask3,
5029 mask4, mask5, shf_vec9, shf_vec10, shf_vec11);
/* Pairwise-add the first view into 32-bit lanes, then accumulate the second
 * view times -5 and the third view times 20 into the same result. */
5030 hz_res0 = __msa_hadd_s_w(shf_vec0, shf_vec0);
5031 hz_res1 = __msa_hadd_s_w(shf_vec3, shf_vec3);
5032 hz_res2 = __msa_hadd_s_w(shf_vec6, shf_vec6);
5033 hz_res3 = __msa_hadd_s_w(shf_vec9, shf_vec9);
5034 DPADD_SH2_SW(shf_vec1, shf_vec2, minus5h, plus20h, hz_res0, hz_res0);
5035 DPADD_SH2_SW(shf_vec4, shf_vec5, minus5h, plus20h, hz_res1, hz_res1);
5036 DPADD_SH2_SW(shf_vec7, shf_vec8, minus5h, plus20h, hz_res2, hz_res2);
5037 DPADD_SH2_SW(shf_vec10, shf_vec11, minus5h, plus20h, hz_res3, hz_res3);
5038 SRARI_W4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 10);
5039 SAT_SW4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 7);
/* Separately round-shift the third shuffled view by 5. */
5040 tmp0 = __msa_srari_h(shf_vec2, 5);
5041 tmp1 = __msa_srari_h(shf_vec5, 5);
5042 tmp2 = __msa_srari_h(shf_vec8, 5);
5043 tmp3 = __msa_srari_h(shf_vec11, 5);
/* tmp2/tmp3 are overwritten with the packed hz_res values and averaged with
 * tmp0/tmp1.  The statements between embedded lines 5043 and 5046 are not
 * visible in this excerpt. */
5046 PCKEV_H2_SH(hz_res2, hz_res0, hz_res3, hz_res1, tmp2, tmp3);
5047 tmp0 = __msa_aver_s_h(tmp2, tmp0);
5048 tmp1 = __msa_aver_s_h(tmp3, tmp1);
/* Average the byte result with dst0. */
5050 out = __msa_aver_u_b(out, dst0);
/* NOTE(review): the function header, the end of the loop and several
 * statements are missing from this excerpt (the embedded numbering jumps,
 * e.g. 5072 to 5074, 5082 to 5088 and 5097 to 5106).  The declarations of
 * src11, row, out and dst0 and the statements that set mask3..mask5 and
 * vt_res0..vt_res3 are not visible. */
5072 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
5074 v8i16 vt_res0, vt_res1, vt_res2, vt_res3, tmp0, tmp1, tmp2, tmp3, mask3;
5075 v8i16 shf_vec0, shf_vec1, shf_vec2, shf_vec3, shf_vec4, shf_vec5, shf_vec6;
5076 v8i16 shf_vec7, shf_vec8, shf_vec9, shf_vec10, shf_vec11, mask4, mask5;
5077 v4i32 hz_res0, hz_res1, hz_res2, hz_res3;
/* Halfword shuffle patterns; indices run up to 8, so they select from the
 * pair of source vectors handed to VSHF_H3_SH. */
5078 v8i16 mask0 = { 0, 5, 1, 6, 2, 7, 3, 8 };
5079 v8i16 mask1 = { 1, 4, 2, 5, 3, 6, 4, 7 };
5080 v8i16 mask2 = { 2, 3, 3, 4, 4, 5, 5, 6 };
/* Halfword coefficients -5 and 20 for the DPADD accumulation below. */
5081 v8i16 minus5h = __msa_ldi_h(-5);
5082 v8i16 plus20h = __msa_ldi_h(20);
/* Move src back by two rows and two bytes before loading. */
5088 src -= ((2 *
stride) + 2);
/* Prime five vectors at src and five more at src + 8. */
5090 LD_SB5(src, stride, src0, src1, src2, src3, src4);
5091 LD_SB5(src + 8, stride, src7, src8, src9, src10, src11);
/* 16 iterations; each one loads two vectors with a pitch of 8 bytes. */
5096 for (row = 16; row--;) {
5097 LD_SB2(src, 8, src5, src6);
/* Build three shuffled views of each vt_res pair, using mask0..2 and then
 * mask3..5. */
5106 VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0,
5107 mask1, mask2, shf_vec0, shf_vec1, shf_vec2);
5108 VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0,
5109 mask1, mask2, shf_vec3, shf_vec4, shf_vec5);
5110 VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask3,
5111 mask4, mask5, shf_vec6, shf_vec7, shf_vec8);
5112 VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask3,
5113 mask4, mask5, shf_vec9, shf_vec10, shf_vec11);
/* Pairwise-add the first view into 32-bit lanes, then accumulate the second
 * view times -5 and the third view times 20 into the same result. */
5114 hz_res0 = __msa_hadd_s_w(shf_vec0, shf_vec0);
5115 hz_res1 = __msa_hadd_s_w(shf_vec3, shf_vec3);
5116 hz_res2 = __msa_hadd_s_w(shf_vec6, shf_vec6);
5117 hz_res3 = __msa_hadd_s_w(shf_vec9, shf_vec9);
5118 DPADD_SH2_SW(shf_vec1, shf_vec2, minus5h, plus20h, hz_res0, hz_res0);
5119 DPADD_SH2_SW(shf_vec4, shf_vec5, minus5h, plus20h, hz_res1, hz_res1);
5120 DPADD_SH2_SW(shf_vec7, shf_vec8, minus5h, plus20h, hz_res2, hz_res2);
5121 DPADD_SH2_SW(shf_vec10, shf_vec11, minus5h, plus20h, hz_res3, hz_res3);
5122 SRARI_W4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 10);
5123 SAT_SW4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 7);
/* Separately round-shift the third shuffled view by 5. */
5124 tmp0 = __msa_srari_h(shf_vec2, 5);
5125 tmp1 = __msa_srari_h(shf_vec5, 5);
5126 tmp2 = __msa_srari_h(shf_vec8, 5);
5127 tmp3 = __msa_srari_h(shf_vec11, 5);
/* Keep the odd-indexed halfwords of the shifted values (pckod_h), then
 * overwrite tmp2/tmp3 with the packed hz_res values and average the two. */
5129 tmp0 = __msa_pckod_h(tmp2, tmp0);
5130 tmp1 = __msa_pckod_h(tmp3, tmp1);
5131 PCKEV_H2_SH(hz_res2, hz_res0, hz_res3, hz_res1, tmp2, tmp3);
5132 tmp0 = __msa_aver_s_h(tmp2, tmp0);
5133 tmp1 = __msa_aver_s_h(tmp3, tmp1);
/* Average the byte result with dst0. */
5135 out = __msa_aver_u_b(out, dst0);
/* NOTE(review): the function header, the end of the loop and several
 * statements are missing from this excerpt (the embedded numbering jumps,
 * e.g. 5168 to 5174, 5181 to 5189 and 5211 to 5215).  The declarations of
 * row, tp0 and tp1 and the statements that set mask3..mask5,
 * vt_res0..vt_res3 and out are not visible. */
5157 v16u8
out, dst0 = { 0 };
5158 v16i8
src0,
src1, src2, src3, src4, src5, src6;
5159 v8i16 vt_res0, vt_res1, vt_res2, vt_res3, tmp0, tmp1, tmp2, tmp3;
5160 v8i16 shf_vec0, shf_vec1, shf_vec2, shf_vec3, shf_vec4, shf_vec5, shf_vec6;
5161 v8i16 shf_vec7, shf_vec8, shf_vec9, shf_vec10, shf_vec11;
5162 v8i16 mask3, mask4, mask5;
5163 v4i32 hz_res0, hz_res1, hz_res2, hz_res3;
/* Halfword shuffle patterns; indices run up to 8, so they select from the
 * pair of source vectors handed to VSHF_H3_SH. */
5164 v8i16 mask0 = { 0, 5, 1, 6, 2, 7, 3, 8 };
5165 v8i16 mask1 = { 1, 4, 2, 5, 3, 6, 4, 7 };
5166 v8i16 mask2 = { 2, 3, 3, 4, 4, 5, 5, 6 };
/* Halfword coefficients -5 and 20 for the DPADD accumulation below. */
5167 v8i16 minus5h = __msa_ldi_h(-5);
5168 v8i16 plus20h = __msa_ldi_h(20);
/* Move src back by two rows and two bytes before loading. */
5174 src -= ((2 *
stride) + 2);
5176 LD_SB5(src, stride, src0, src1, src2, src3, src4);
/* Four iterations; each one loads two vectors with a pitch of stride. */
5180 for (row = 4; row--;) {
5181 LD_SB2(src, stride, src5, src6);
/* Build three shuffled views of each vt_res pair, using mask0..2 and then
 * mask3..5. */
5189 VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0,
5190 mask1, mask2, shf_vec0, shf_vec1, shf_vec2);
5191 VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0,
5192 mask1, mask2, shf_vec3, shf_vec4, shf_vec5);
5193 VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask3,
5194 mask4, mask5, shf_vec6, shf_vec7, shf_vec8);
5195 VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask3,
5196 mask4, mask5, shf_vec9, shf_vec10, shf_vec11);
/* Pairwise-add the first view into 32-bit lanes, then accumulate the second
 * view times -5 and the third view times 20 into the same result. */
5197 hz_res0 = __msa_hadd_s_w(shf_vec0, shf_vec0);
5198 hz_res1 = __msa_hadd_s_w(shf_vec3, shf_vec3);
5199 hz_res2 = __msa_hadd_s_w(shf_vec6, shf_vec6);
5200 hz_res3 = __msa_hadd_s_w(shf_vec9, shf_vec9);
5201 DPADD_SH2_SW(shf_vec1, shf_vec2, minus5h, plus20h, hz_res0, hz_res0);
5202 DPADD_SH2_SW(shf_vec4, shf_vec5, minus5h, plus20h, hz_res1, hz_res1);
5203 DPADD_SH2_SW(shf_vec7, shf_vec8, minus5h, plus20h, hz_res2, hz_res2);
5204 DPADD_SH2_SW(shf_vec10, shf_vec11, minus5h, plus20h, hz_res3, hz_res3);
5205 SRARI_W4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 10);
5206 SAT_SW4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 7);
/* Separately round-shift the third shuffled view by 5. */
5207 tmp0 = __msa_srari_h(shf_vec2, 5);
5208 tmp1 = __msa_srari_h(shf_vec5, 5);
5209 tmp2 = __msa_srari_h(shf_vec8, 5);
5210 tmp3 = __msa_srari_h(shf_vec11, 5);
/* Fetch two values from dst, one per stride step (LD2 is defined outside
 * this excerpt). */
5211 LD2(dst, stride, tp0, tp1);
/* tmp2/tmp3 are overwritten with the packed hz_res values and averaged with
 * tmp0/tmp1; the statements between embedded lines 5211 and 5215 are not
 * visible. */
5215 PCKEV_H2_SH(hz_res2, hz_res0, hz_res3, hz_res1, tmp2, tmp3);
5216 tmp0 = __msa_aver_s_h(tmp2, tmp0);
5217 tmp1 = __msa_aver_s_h(tmp3, tmp1);
/* Average the byte result with dst0. */
5219 out = __msa_aver_u_b(out, dst0);
/* NOTE(review): the function header, the end of the loop and several
 * statements are missing from this excerpt (the embedded numbering jumps,
 * e.g. 5247 to 5253, 5260 to 5268 and 5290 to 5293).  The declarations of
 * row, tp0 and tp1 and the statements that set mask3..mask5,
 * vt_res0..vt_res3 and out are not visible. */
5236 v16u8
out, dst0 = { 0 };
5237 v16i8
src0,
src1, src2, src3, src4, src5, src6;
5238 v8i16 vt_res0, vt_res1, vt_res2, vt_res3, tmp0, tmp1, tmp2, tmp3;
5239 v8i16 shf_vec0, shf_vec1, shf_vec2, shf_vec3, shf_vec4, shf_vec5, shf_vec6;
5240 v8i16 shf_vec7, shf_vec8, shf_vec9, shf_vec10, shf_vec11;
5241 v8i16 mask3, mask4, mask5;
5242 v4i32 hz_res0, hz_res1, hz_res2, hz_res3;
/* Halfword shuffle patterns; indices run up to 8, so they select from the
 * pair of source vectors handed to VSHF_H3_SH. */
5243 v8i16 mask0 = { 0, 5, 1, 6, 2, 7, 3, 8 };
5244 v8i16 mask1 = { 1, 4, 2, 5, 3, 6, 4, 7 };
5245 v8i16 mask2 = { 2, 3, 3, 4, 4, 5, 5, 6 };
/* Halfword coefficients -5 and 20 for the DPADD accumulation below. */
5246 v8i16 minus5h = __msa_ldi_h(-5);
5247 v8i16 plus20h = __msa_ldi_h(20);
/* Move src back by two rows and two bytes before loading. */
5253 src -= ((2 *
stride) + 2);
5255 LD_SB5(src, stride, src0, src1, src2, src3, src4);
/* Four iterations; each one loads two vectors with a pitch of stride. */
5259 for (row = 4; row--;) {
5260 LD_SB2(src, stride, src5, src6);
/* Build three shuffled views of each vt_res pair, using mask0..2 and then
 * mask3..5. */
5268 VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0,
5269 mask1, mask2, shf_vec0, shf_vec1, shf_vec2);
5270 VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0,
5271 mask1, mask2, shf_vec3, shf_vec4, shf_vec5);
5272 VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask3,
5273 mask4, mask5, shf_vec6, shf_vec7, shf_vec8);
5274 VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask3,
5275 mask4, mask5, shf_vec9, shf_vec10, shf_vec11);
/* Pairwise-add the first view into 32-bit lanes, then accumulate the second
 * view times -5 and the third view times 20 into the same result. */
5276 hz_res0 = __msa_hadd_s_w(shf_vec0, shf_vec0);
5277 hz_res1 = __msa_hadd_s_w(shf_vec3, shf_vec3);
5278 hz_res2 = __msa_hadd_s_w(shf_vec6, shf_vec6);
5279 hz_res3 = __msa_hadd_s_w(shf_vec9, shf_vec9);
5280 DPADD_SH2_SW(shf_vec1, shf_vec2, minus5h, plus20h, hz_res0, hz_res0);
5281 DPADD_SH2_SW(shf_vec4, shf_vec5, minus5h, plus20h, hz_res1, hz_res1);
5282 DPADD_SH2_SW(shf_vec7, shf_vec8, minus5h, plus20h, hz_res2, hz_res2);
5283 DPADD_SH2_SW(shf_vec10, shf_vec11, minus5h, plus20h, hz_res3, hz_res3);
5284 SRARI_W4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 10);
5285 SAT_SW4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 7);
/* Separately round-shift the third shuffled view by 5. */
5286 tmp0 = __msa_srari_h(shf_vec2, 5);
5287 tmp1 = __msa_srari_h(shf_vec5, 5);
5288 tmp2 = __msa_srari_h(shf_vec8, 5);
5289 tmp3 = __msa_srari_h(shf_vec11, 5);
/* Fetch two values from dst, one per stride step (LD2 is defined outside
 * this excerpt). */
5290 LD2(dst, stride, tp0, tp1);
/* Keep the odd-indexed halfwords of the shifted values (pckod_h), then
 * overwrite tmp2/tmp3 with the packed hz_res values and average the two. */
5293 tmp0 = __msa_pckod_h(tmp2, tmp0);
5294 tmp1 = __msa_pckod_h(tmp3, tmp1);
5295 PCKEV_H2_SH(hz_res2, hz_res0, hz_res3, hz_res1, tmp2, tmp3);
5296 tmp0 = __msa_aver_s_h(tmp2, tmp0);
5297 tmp1 = __msa_aver_s_h(tmp3, tmp1);
/* Average the byte result with dst0. */
5299 out = __msa_aver_u_b(out, dst0);
/* NOTE(review): the function header and several statements of this body are
 * missing from this excerpt (the embedded numbering jumps, e.g. 5339 to 5342,
 * 5377 to 5384 and 5399 to 5401).  The statements that fill dstv from
 * tp0..tp3 and compute out are not visible. */
5314 uint32_t tp0, tp1, tp2, tp3;
/* Each 16-bit constant packs two 8-bit values:
 * 0xfb01 = {0x01, 0xfb (-5)}, 0x1414 = {20, 20}, 0x1fb = {0xfb (-5), 0x01}.
 * 0xfb01 is above INT16_MAX, so the initialiser relies on the usual
 * two's-complement conversion to int16_t. */
5315 const int16_t filt_const0 = 0xfb01;
5316 const int16_t filt_const1 = 0x1414;
5317 const int16_t filt_const2 = 0x1fb;
5318 v16u8
out, dstv = { 0 };
5319 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8;
5320 v16i8 src10_r, src21_r, src32_r, src43_r, src54_r, src65_r, src76_r;
5321 v16i8 src87_r, src10_l, src21_l, src32_l, src43_l, src54_l, src65_l;
5322 v16i8 src76_l, src87_l, filt0, filt1, filt2;
5323 v8i16 vt_res0, vt_res1, vt_res2, vt_res3, dst0, dst1, dst2, dst3, shf_vec7;
5324 v8i16 shf_vec0, shf_vec1, shf_vec2, shf_vec3, shf_vec4, shf_vec5, shf_vec6;
5325 v4i32 hz_res0, hz_res1, hz_res2, hz_res3;
/* Halfword shuffle patterns; indices run up to 8, so they select from the
 * pair of source vectors handed to VSHF_H3_SH. */
5326 v8i16 mask0 = { 0, 5, 1, 6, 2, 7, 3, 8 };
5327 v8i16 mask1 = { 1, 4, 2, 5, 3, 6, 4, 7 };
5328 v8i16 mask2 = { 2, 3, 3, 4, 4, 5, 5, 6 };
/* Halfword coefficients -5 and 20 for the DPADD accumulation below. */
5329 v8i16 minus5h = __msa_ldi_h(-5);
5330 v8i16 plus20h = __msa_ldi_h(20);
5331 v8i16 zeros = { 0 };
/* Replicate each packed coefficient pair into every 16-bit lane. */
5333 filt0 = (v16i8) __msa_fill_h(filt_const0);
5334 filt1 = (v16i8) __msa_fill_h(filt_const1);
5335 filt2 = (v16i8) __msa_fill_h(filt_const2);
/* Move src back by two rows and two bytes before loading. */
5337 src -= ((2 *
stride) + 2);
5339 LD_SB5(src, stride, src0, src1, src2, src3, src4);
5342 LD_SB4(src, stride, src5, src6, src7, src8);
/* Byte-interleave each loaded vector with the one before it (ILVR and ILVL
 * variants); the output lists of these calls are cut off in this excerpt. */
5345 ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
5347 ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
5349 ILVL_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_l, src21_l,
5351 ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_l, src65_l,
/* First half: AVC_DOT_SH3_SH (see the macro near the top of the file) gives
 * three-term byte dot products in 16-bit lanes for the 10/32/54 and 21/43/65
 * interleaves. */
5353 vt_res0 =
AVC_DOT_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
5354 vt_res1 =
AVC_DOT_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2);
5355 vt_res2 =
AVC_DOT_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
5356 vt_res3 =
AVC_DOT_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2);
/* Shuffle each vt_res pair three ways, pairwise-add the first view into
 * 32-bit lanes, then accumulate the second view times -5 and the third view
 * times 20. */
5357 VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0,
5358 mask1, mask2, shf_vec0, shf_vec1, shf_vec2);
5359 VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0,
5360 mask1, mask2, shf_vec3, shf_vec4, shf_vec5);
5361 hz_res0 = __msa_hadd_s_w(shf_vec0, shf_vec0);
5362 DPADD_SH2_SW(shf_vec1, shf_vec2, minus5h, plus20h, hz_res0, hz_res0);
5363 hz_res1 = __msa_hadd_s_w(shf_vec3, shf_vec3);
5364 DPADD_SH2_SW(shf_vec4, shf_vec5, minus5h, plus20h, hz_res1, hz_res1);
/* Second half: the same steps on the 32/54/76 and 43/65/87 interleaves.  The
 * third shuffled views go to shf_vec6/shf_vec7 so that shf_vec2/shf_vec5
 * from the first half stay available. */
5366 vt_res0 =
AVC_DOT_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
5367 vt_res1 =
AVC_DOT_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2);
5368 vt_res2 =
AVC_DOT_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
5369 vt_res3 =
AVC_DOT_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2);
5370 VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0,
5371 mask1, mask2, shf_vec0, shf_vec1, shf_vec6);
5372 VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0,
5373 mask1, mask2, shf_vec3, shf_vec4, shf_vec7);
5374 hz_res2 = __msa_hadd_s_w(shf_vec0, shf_vec0);
5375 DPADD_SH2_SW(shf_vec1, shf_vec6, minus5h, plus20h, hz_res2, hz_res2);
5376 hz_res3 = __msa_hadd_s_w(shf_vec3, shf_vec3);
5377 DPADD_SH2_SW(shf_vec4, shf_vec7, minus5h, plus20h, hz_res3, hz_res3);
/* Round-shift the four third views by 5. */
5384 dst0 = __msa_srari_h(shf_vec2, 5);
5385 dst1 = __msa_srari_h(shf_vec5, 5);
5386 dst2 = __msa_srari_h(shf_vec6, 5);
5387 dst3 = __msa_srari_h(shf_vec7, 5);
/* Interleave the even halfwords of dst0..dst3 with zeros, presumably to
 * place them in 32-bit lanes for the word-wise average below - confirm
 * against the ILVEV_H2_SH definition. */
5391 ILVEV_H2_SH(dst0, zeros, dst1, zeros, dst0, dst1);
5392 ILVEV_H2_SH(dst2, zeros, dst3, zeros, dst2, dst3);
5394 hz_res0 = __msa_aver_s_w(hz_res0, (v4i32) dst0);
5395 hz_res1 = __msa_aver_s_w(hz_res1, (v4i32) dst1);
5396 hz_res2 = __msa_aver_s_w(hz_res2, (v4i32) dst2);
5397 hz_res3 = __msa_aver_s_w(hz_res3, (v4i32) dst3);
/* Fetch four 32-bit words from dst, pack the averaged words to halfwords,
 * average the byte result with dstv and store a 4x4 block back to dst. */
5399 LW4(dst, stride, tp0, tp1, tp2, tp3);
5401 PCKEV_H2_SH(hz_res1, hz_res0, hz_res3, hz_res2, dst0, dst2);
5403 out = __msa_aver_u_b(out, dstv);
5404 ST4x4_UB(out, out, 0, 1, 2, 3, dst, stride);
/* NOTE(review): the function header and several statements of this body are
 * missing from this excerpt (the embedded numbering jumps, e.g. 5435 to 5438,
 * 5473 to 5480 and 5498 to 5500).  The statements that fill dstv from
 * tp0..tp3 and compute out are not visible. */
5410 uint32_t tp0, tp1, tp2, tp3;
/* Each 16-bit constant packs two 8-bit values:
 * 0xfb01 = {0x01, 0xfb (-5)}, 0x1414 = {20, 20}, 0x1fb = {0xfb (-5), 0x01}.
 * 0xfb01 is above INT16_MAX, so the initialiser relies on the usual
 * two's-complement conversion to int16_t. */
5411 const int16_t filt_const0 = 0xfb01;
5412 const int16_t filt_const1 = 0x1414;
5413 const int16_t filt_const2 = 0x1fb;
5414 v16u8
out, dstv = { 0 };
5415 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8;
5416 v16i8 src10_r, src21_r, src32_r, src43_r, src54_r, src65_r, src76_r;
5417 v16i8 src87_r, src10_l, src21_l, src32_l, src43_l, src54_l, src65_l;
5418 v16i8 src76_l, src87_l, filt0, filt1, filt2;
5419 v8i16 vt_res0, vt_res1, vt_res2, vt_res3, dst0, dst1, dst2, dst3, shf_vec7;
5420 v8i16 shf_vec0, shf_vec1, shf_vec2, shf_vec3, shf_vec4, shf_vec5, shf_vec6;
5421 v4i32 hz_res0, hz_res1, hz_res2, hz_res3;
/* Halfword shuffle patterns; indices run up to 8, so they select from the
 * pair of source vectors handed to VSHF_H3_SH. */
5422 v8i16 mask0 = { 0, 5, 1, 6, 2, 7, 3, 8 };
5423 v8i16 mask1 = { 1, 4, 2, 5, 3, 6, 4, 7 };
5424 v8i16 mask2 = { 2, 3, 3, 4, 4, 5, 5, 6 };
/* Halfword coefficients -5 and 20 for the DPADD accumulation below. */
5425 v8i16 minus5h = __msa_ldi_h(-5);
5426 v8i16 plus20h = __msa_ldi_h(20);
5427 v8i16 zeros = { 0 };
/* Replicate each packed coefficient pair into every 16-bit lane. */
5429 filt0 = (v16i8) __msa_fill_h(filt_const0);
5430 filt1 = (v16i8) __msa_fill_h(filt_const1);
5431 filt2 = (v16i8) __msa_fill_h(filt_const2);
/* Move src back by two rows and two bytes before loading. */
5433 src -= ((2 *
stride) + 2);
5435 LD_SB5(src, stride, src0, src1, src2, src3, src4);
5438 LD_SB4(src, stride, src5, src6, src7, src8);
/* Byte-interleave each loaded vector with the one before it (ILVR and ILVL
 * variants); the output lists of these calls are cut off in this excerpt. */
5441 ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
5443 ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
5445 ILVL_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_l, src21_l,
5447 ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_l, src65_l,
/* First half: AVC_DOT_SH3_SH (see the macro near the top of the file) gives
 * three-term byte dot products in 16-bit lanes for the 10/32/54 and 21/43/65
 * interleaves. */
5449 vt_res0 =
AVC_DOT_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
5450 vt_res1 =
AVC_DOT_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2);
5451 vt_res2 =
AVC_DOT_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
5452 vt_res3 =
AVC_DOT_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2);
/* Shuffle each vt_res pair three ways, pairwise-add the first view into
 * 32-bit lanes, then accumulate the second view times -5 and the third view
 * times 20. */
5453 VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0,
5454 mask1, mask2, shf_vec0, shf_vec1, shf_vec2);
5455 VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0,
5456 mask1, mask2, shf_vec3, shf_vec4, shf_vec5);
5457 hz_res0 = __msa_hadd_s_w(shf_vec0, shf_vec0);
5458 DPADD_SH2_SW(shf_vec1, shf_vec2, minus5h, plus20h, hz_res0, hz_res0);
5459 hz_res1 = __msa_hadd_s_w(shf_vec3, shf_vec3);
5460 DPADD_SH2_SW(shf_vec4, shf_vec5, minus5h, plus20h, hz_res1, hz_res1);
/* Second half: the same steps on the 32/54/76 and 43/65/87 interleaves.  The
 * third shuffled views go to shf_vec6/shf_vec7 so that shf_vec2/shf_vec5
 * from the first half stay available. */
5462 vt_res0 =
AVC_DOT_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
5463 vt_res1 =
AVC_DOT_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2);
5464 vt_res2 =
AVC_DOT_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
5465 vt_res3 =
AVC_DOT_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2);
5466 VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0,
5467 mask1, mask2, shf_vec0, shf_vec1, shf_vec6);
5468 VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0,
5469 mask1, mask2, shf_vec3, shf_vec4, shf_vec7);
5470 hz_res2 = __msa_hadd_s_w(shf_vec0, shf_vec0);
5471 DPADD_SH2_SW(shf_vec1, shf_vec6, minus5h, plus20h, hz_res2, hz_res2);
5472 hz_res3 = __msa_hadd_s_w(shf_vec3, shf_vec3);
5473 DPADD_SH2_SW(shf_vec4, shf_vec7, minus5h, plus20h, hz_res3, hz_res3);
/* Round-shift the four third views by 5. */
5480 dst0 = __msa_srari_h(shf_vec2, 5);
5481 dst1 = __msa_srari_h(shf_vec5, 5);
5482 dst2 = __msa_srari_h(shf_vec6, 5);
5483 dst3 = __msa_srari_h(shf_vec7, 5);
/* Interleave the odd halfwords of dst0..dst3 with zeros, presumably to place
 * them in 32-bit lanes for the word-wise average below - confirm the operand
 * order against the intrinsic reference. */
5488 dst0 = __msa_ilvod_h(zeros, dst0);
5489 dst1 = __msa_ilvod_h(zeros, dst1);
5490 dst2 = __msa_ilvod_h(zeros, dst2);
5491 dst3 = __msa_ilvod_h(zeros, dst3);
5493 hz_res0 = __msa_aver_s_w(hz_res0, (v4i32) dst0);
5494 hz_res1 = __msa_aver_s_w(hz_res1, (v4i32) dst1);
5495 hz_res2 = __msa_aver_s_w(hz_res2, (v4i32) dst2);
5496 hz_res3 = __msa_aver_s_w(hz_res3, (v4i32) dst3);
/* Fetch four 32-bit words from dst, pack the averaged words to halfwords,
 * average the byte result with dstv and store a 4x4 block back to dst. */
5498 LW4(dst, stride, tp0, tp1, tp2, tp3);
5500 PCKEV_H2_SH(hz_res1, hz_res0, hz_res3, hz_res2, dst0, dst2);
5502 out = __msa_aver_u_b(out, dstv);
5503 ST4x4_UB(out, out, 0, 1, 2, 3, dst, stride);
/* Interior of a routine whose signature, braces and several statements are
 * not visible in this excerpt (the embedded line numbers skip).
 * NOTE(review): the 2 x 4 loop nest plus the LD4() read from dst suggest the
 * 16x16 centre-position (horizontal + vertical) averaging variant, presumably
 * ff_avg_h264_qpel16_mc22_msa - confirm against the full file. */
/* Each 32-bit constant packs two signed 16-bit taps (low halfword, high
 * halfword): 0xfffb0001 -> (1, -5), 0x140014 -> (20, 20), 0x1fffb -> (-5, 1).
 * Read in order they give the six taps 1, -5, 20, 20, -5, 1.
 * NOTE(review): 0xfffb0001 exceeds INT32_MAX, so this initializer relies on
 * implementation-defined unsigned-to-signed conversion. */
5509 const int32_t filt_const0 = 0xfffb0001;
5510 const int32_t filt_const1 = 0x140014;
5511 const int32_t filt_const2 = 0x1fffb;
/* tp0..tp3 receive the four 64-bit values read from dst by LD4() below. */
5514 uint64_t tp0, tp1, tp2, tp3;
5515 uint32_t multiple8_cnt, loop_cnt;
5516 v16u8 dst0, dst1, out0, out1;
5517 v16i8
src0,
src1, src2, src3, src4, mask0, mask1, mask2;
/* hz_outN: per-row intermediates; hz_outBA_r / hz_outBA_l: the two outputs of
 * interleaving rows B and A (see the ILVR_H4_SH / ILVL_H4_SH calls below). */
5518 v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
5519 v8i16 hz_out7, hz_out8, res0, res1, res2, res3;
5520 v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
5521 v8i16 hz_out65_r, hz_out76_r, hz_out87_r, hz_out10_l, hz_out21_l;
5522 v8i16 hz_out32_l, hz_out43_l, hz_out54_l, hz_out65_l, hz_out76_l;
5523 v8i16 hz_out87_l, filt0, filt1, filt2;
/* Broadcast each packed tap pair to every 32-bit lane, then view the vector
 * as halfwords. The consumers of filt0..filt2 are not visible in this
 * excerpt. */
5526 filt0 = (v8i16) __msa_fill_w(filt_const0);
5527 filt1 = (v8i16) __msa_fill_w(filt_const1);
5528 filt2 = (v8i16) __msa_fill_w(filt_const2);
/* Outer loop: two passes (counter post-decremented in the condition).
 * NOTE(review): presumably one pass per 8-column half - confirm. */
5532 for (multiple8_cnt = 2; multiple8_cnt--;) {
/* Prime the pass with five source rows (LD_SB5 is defined elsewhere). */
5536 LD_SB5(src, stride, src0, src1, src2, src3, src4);
/* Inner loop: four iterations per outer pass, each loading four more rows. */
5546 for (loop_cnt = 4; loop_cnt--;) {
5547 LD_SB4(src, stride, src0, src1, src2, src3);
/* Pair each hz_out row with its predecessor (1/0, 2/1, 3/2, 4/3, then 5/4 ..
 * 8/7). The final output argument of each call is on a line missing from
 * this excerpt. */
5555 ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2,
5556 hz_out4, hz_out3, hz_out10_r, hz_out21_r, hz_out32_r,
5558 ILVL_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2,
5559 hz_out4, hz_out3, hz_out10_l, hz_out21_l, hz_out32_l,
5561 ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6,
5562 hz_out8, hz_out7, hz_out54_r, hz_out65_r, hz_out76_r,
5564 ILVL_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6,
5565 hz_out8, hz_out7, hz_out54_l, hz_out65_l, hz_out76_l,
/* Each resN packs the even-indexed halfwords of tmp0 and tmp1 into one v8i16.
 * The declarations of tmp0/tmp1 and the statements that compute them are not
 * visible in this excerpt. */
5572 res0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
5577 res1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
5582 res2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
5587 res3 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
/* Read four stride-separated 64-bit values from dst into tp0..tp3.
 * NOTE(review): presumably the existing destination pixels to be averaged
 * with the filtered result; the averaging and store are not visible here. */
5589 LD4(dst, stride, tp0, tp1, tp2, tp3);
/* Interior of a routine whose signature, braces and several statements are
 * not visible in this excerpt (the embedded line numbers skip).
 * NOTE(review): two unrolled groups of four result rows plus LD4() reads from
 * dst suggest the 8x8 centre-position averaging variant, presumably
 * ff_avg_h264_qpel8_mc22_msa - confirm against the full file. */
/* Each 32-bit constant packs two signed 16-bit taps (low halfword, high
 * halfword): 0xfffb0001 -> (1, -5), 0x140014 -> (20, 20), 0x1fffb -> (-5, 1).
 * NOTE(review): 0xfffb0001 exceeds INT32_MAX, so this initializer relies on
 * implementation-defined unsigned-to-signed conversion. */
5613 const int32_t filt_const0 = 0xfffb0001;
5614 const int32_t filt_const1 = 0x140014;
5615 const int32_t filt_const2 = 0x1fffb;
5616 uint64_t tp0, tp1, tp2, tp3;
/* dst0/dst1 start zeroed; the statements that fill them are not visible. */
5617 v16u8 out0, out1, dst0 = { 0 }, dst1 = { 0 };
5618 v16i8
src0,
src1, src2, src3, src4, mask0, mask1, mask2;
/* hz_out0..hz_out12: thirteen per-row intermediates. */
5619 v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
5620 v8i16 hz_out7, hz_out8, hz_out9, hz_out10, hz_out11, hz_out12;
/* Naming irregularity: hz_out89_* and hz_out910_* hold the (9, 8) and (10, 9)
 * row pairs, i.e. the same operand order as hz_out87_*, hz_out1110_*, etc.
 * (see the ILVR_H4_SH / ILVL_H4_SH calls that write them). */
5621 v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
5622 v8i16 hz_out65_r, hz_out76_r, hz_out87_r, hz_out89_r, hz_out910_r;
5623 v8i16 hz_out1110_r, hz_out1211_r, res0, res1, res2, res3;
5624 v8i16 hz_out10_l, hz_out21_l, hz_out32_l, hz_out43_l, hz_out54_l;
5625 v8i16 hz_out65_l, hz_out76_l, hz_out87_l, hz_out89_l, hz_out910_l;
5626 v8i16 hz_out1110_l, hz_out1211_l, filt0, filt1, filt2;
/* Broadcast each packed tap pair to every 32-bit lane; these become the
 * coeff0..coeff2 operands of AVC_DOT_SW3_SW below. */
5629 filt0 = (v8i16) __msa_fill_w(filt_const0);
5630 filt1 = (v8i16) __msa_fill_w(filt_const1);
5631 filt2 = (v8i16) __msa_fill_w(filt_const2);
/* Step back two rows and two bytes so loading starts above and to the left
 * of the position src pointed at on entry. */
5635 src -= ((2 *
stride) + 2);
/* Five rows first, then four more (LD_SB5 / LD_SB4 are defined elsewhere). */
5636 LD_SB5(src, stride, src0, src1, src2, src3, src4);
5646 LD_SB4(src, stride, src0, src1, src2, src3);
/* Pair each hz_out row with its predecessor for rows 0..8. */
5653 ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
5654 hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r);
5655 ILVL_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
5656 hz_out3, hz_out10_l, hz_out21_l, hz_out32_l, hz_out43_l);
5657 ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
5658 hz_out7, hz_out54_r, hz_out65_r, hz_out76_r, hz_out87_r);
5659 ILVL_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
5660 hz_out7, hz_out54_l, hz_out65_l, hz_out76_l, hz_out87_l);
/* First group of four results. AVC_DOT_SW3_SW (defined near the top of the
 * file) sums three halfword dot products into words, applies a rounding right
 * shift by 10 and a signed saturate. Each call combines three row pairs, i.e.
 * six consecutive hz_out rows: res0 <- rows 0..5, res1 <- 1..6,
 * res2 <- 2..7, res3 <- 3..8. The trailing coefficient argument of every call
 * is on a line missing from this excerpt, as are the tmp0/tmp1 declarations.
 * __msa_pckev_h then packs the even-indexed halfwords of the _r and _l
 * results into one v8i16. */
5662 tmp0 =
AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, filt1,
5664 tmp1 =
AVC_DOT_SW3_SW(hz_out10_l, hz_out32_l, hz_out54_l, filt0, filt1,
5666 res0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
5667 tmp0 =
AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, filt1,
5669 tmp1 =
AVC_DOT_SW3_SW(hz_out21_l, hz_out43_l, hz_out65_l, filt0, filt1,
5671 res1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
5672 tmp0 =
AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, filt1,
5674 tmp1 =
AVC_DOT_SW3_SW(hz_out32_l, hz_out54_l, hz_out76_l, filt0, filt1,
5676 res2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
5677 tmp0 =
AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, filt1,
5679 tmp1 =
AVC_DOT_SW3_SW(hz_out43_l, hz_out65_l, hz_out87_l, filt0, filt1,
5681 res3 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
/* Read four 64-bit values from dst.
 * NOTE(review): presumably the existing pixels averaged with res0..res3 in
 * statements not visible here - confirm. */
5682 LD4(dst, stride, tp0, tp1, tp2, tp3);
/* Second group: four more rows are loaded, extending hz_out to rows 9..12. */
5691 LD_SB4(src, stride, src0, src1, src2, src3);
5697 ILVR_H4_SH(hz_out9, hz_out8, hz_out10, hz_out9, hz_out11, hz_out10,
5698 hz_out12, hz_out11, hz_out89_r, hz_out910_r, hz_out1110_r,
5700 ILVL_H4_SH(hz_out9, hz_out8, hz_out10, hz_out9, hz_out11, hz_out10,
5701 hz_out12, hz_out11, hz_out89_l, hz_out910_l, hz_out1110_l,
/* Same pattern shifted by four rows: res0 <- rows 4..9, res1 <- 5..10,
 * res2 <- 6..11, res3 <- 7..12. */
5703 tmp0 =
AVC_DOT_SW3_SW(hz_out54_r, hz_out76_r, hz_out89_r, filt0, filt1,
5705 tmp1 =
AVC_DOT_SW3_SW(hz_out54_l, hz_out76_l, hz_out89_l, filt0, filt1,
5707 res0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
5708 tmp0 =
AVC_DOT_SW3_SW(hz_out65_r, hz_out87_r, hz_out910_r, filt0, filt1,
5710 tmp1 =
AVC_DOT_SW3_SW(hz_out65_l, hz_out87_l, hz_out910_l, filt0, filt1,
5712 res1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
5713 tmp0 =
AVC_DOT_SW3_SW(hz_out76_r, hz_out89_r, hz_out1110_r, filt0, filt1,
5715 tmp1 =
AVC_DOT_SW3_SW(hz_out76_l, hz_out89_l, hz_out1110_l, filt0, filt1,
5717 res2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
5718 tmp0 =
AVC_DOT_SW3_SW(hz_out87_r, hz_out910_r, hz_out1211_r, filt0, filt1,
5720 tmp1 =
AVC_DOT_SW3_SW(hz_out87_l, hz_out910_l, hz_out1211_l, filt0, filt1,
5722 res3 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
/* Second read of four 64-bit values from dst (any dst advance between the two
 * reads is not visible in this excerpt). */
5723 LD4(dst, stride, tp0, tp1, tp2, tp3);
/* Interior of a routine whose signature, braces and several statements are
 * not visible in this excerpt (the embedded line numbers skip).
 * NOTE(review): 32-bit LW4() reads from dst and the final ST4x4_UB() suggest
 * the 4x4 centre-position averaging variant, presumably
 * ff_avg_h264_qpel4_mc22_msa - confirm against the full file. */
/* Each 32-bit constant packs two signed 16-bit taps (low halfword, high
 * halfword): 0xfffb0001 -> (1, -5), 0x140014 -> (20, 20), 0x1fffb -> (-5, 1).
 * NOTE(review): 0xfffb0001 exceeds INT32_MAX, so this initializer relies on
 * implementation-defined unsigned-to-signed conversion. */
5735 const int32_t filt_const0 = 0xfffb0001;
5736 const int32_t filt_const1 = 0x140014;
5737 const int32_t filt_const2 = 0x1fffb;
/* tp0..tp3 receive the four 32-bit values read from dst by LW4() below. */
5738 uint32_t tp0, tp1, tp2, tp3;
/* dst0 starts zeroed; the statement that fills it is not visible here. */
5739 v16u8 res, dst0 = { 0 };
/* Nine source rows are kept at once (src0..src8). */
5740 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8;
5741 v16i8 mask0, mask1, mask2;
5742 v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
5743 v8i16 hz_out7, hz_out8, res0, res1, filt0, filt1, filt2;
/* Only the _r interleave outputs are declared in this routine. */
5744 v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
5745 v8i16 hz_out65_r, hz_out76_r, hz_out87_r;
/* Broadcast each packed tap pair to every 32-bit lane; these become the
 * coeff0..coeff2 operands of AVC_DOT_SW3_SW below. */
5750 filt0 = (v8i16) __msa_fill_w(filt_const0);
5751 filt1 = (v8i16) __msa_fill_w(filt_const1);
5752 filt2 = (v8i16) __msa_fill_w(filt_const2);
/* Step back two rows and two bytes so loading starts above and to the left
 * of the position src pointed at on entry. */
5754 src -= ((2 *
stride) + 2);
/* Rows 0..4, then rows 5..8 (any src advance between the two loads is on a
 * line missing from this excerpt). */
5756 LD_SB5(src, stride, src0, src1, src2, src3, src4);
5758 LD_SB4(src, stride, src5, src6, src7, src8);
/* The odd-numbered intermediates are derived from the even-numbered ones:
 * hz_out1 from hz_out0, hz_out3 from hz_out2, hz_out5 from hz_out4 and
 * hz_out7 from hz_out6.
 * NOTE(review): PCKOD_D2_SH is defined elsewhere; presumably each even vector
 * carries two rows and the odd doubleword is split out - confirm. */
5767 PCKOD_D2_SH(hz_out0, hz_out0, hz_out2, hz_out2, hz_out1, hz_out3);
5768 PCKOD_D2_SH(hz_out4, hz_out4, hz_out6, hz_out6, hz_out5, hz_out7);
/* Pair each hz_out row with its predecessor for rows 0..8. */
5769 ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
5770 hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r);
5771 ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
5772 hz_out7, hz_out54_r, hz_out65_r, hz_out76_r, hz_out87_r);
/* AVC_DOT_SW3_SW (defined near the top of the file) sums three halfword dot
 * products into words, applies a rounding right shift by 10 and a signed
 * saturate. tmp0 combines rows 0..5 and tmp1 rows 1..6; res0 packs the
 * even-indexed halfwords of both. The trailing coefficient argument of every
 * call is on a line missing from this excerpt, as are the tmp0/tmp1
 * declarations. */
5774 tmp0 =
AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, filt1,
5776 tmp1 =
AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, filt1,
5778 res0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
/* Likewise for rows 2..7 (tmp0) and rows 3..8 (tmp1), packed into res1. */
5779 tmp0 =
AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, filt1,
5781 tmp1 =
AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, filt1,
5783 res1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
/* Read four 32-bit values from dst. The statements that place them in dst0
 * and that build res from res0/res1 are not visible in this excerpt. */
5784 LW4(dst, stride, tp0, tp1, tp2, tp3);
/* Unsigned byte-wise average of res with dst0, then store through
 * ST4x4_UB (defined elsewhere) using word indices 0..3 of res. */
5787 res = __msa_aver_u_b(res, dst0);
5788 ST4x4_UB(res, res, 0, 1, 2, 3, dst, stride);
void ff_put_h264_qpel8_mc32_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
void ff_put_h264_qpel8_mc02_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
void ff_avg_h264_qpel4_mc10_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
void ff_put_h264_qpel8_mc13_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
#define XORI_B5_128_SB(...)
#define XORI_B8_128_SB(...)
void ff_avg_h264_qpel16_mc20_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
void ff_avg_h264_qpel16_mc30_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
static void avc_luma_hv_qrt_and_aver_dst_8x8_msa(const uint8_t *src_x, const uint8_t *src_y, uint8_t *dst, int32_t stride)
#define AVC_DOT_SW3_SW(in0, in1, in2, coeff0, coeff1, coeff2)
void ff_avg_h264_qpel4_mc00_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
void ff_avg_h264_qpel4_mc20_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
void ff_avg_h264_qpel16_mc23_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
void ff_put_h264_qpel4_mc23_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
#define XORI_B2_128_SB(...)
static void avc_luma_hv_qrt_and_aver_dst_16x16_msa(const uint8_t *src_x, const uint8_t *src_y, uint8_t *dst, int32_t stride)
#define PCKEV_XORI128_UB(in0, in1)
void ff_avg_h264_qpel4_mc22_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
void ff_avg_h264_qpel8_mc12_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
void ff_avg_h264_qpel16_mc32_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
#define DPADD_SB4_SH(...)
void ff_put_h264_qpel4_mc01_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
void ff_put_h264_qpel16_mc13_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
void ff_avg_h264_qpel4_mc02_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
void ff_put_h264_qpel8_mc23_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
void ff_avg_h264_qpel8_mc03_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
#define ST4x4_UB(in0, in1, idx0, idx1, idx2, idx3, pdst, stride)
void ff_avg_h264_qpel4_mc12_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
void ff_avg_h264_qpel16_mc22_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
void ff_put_h264_qpel16_mc10_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
#define XORI_B4_128_UB(...)
void ff_avg_h264_qpel4_mc13_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
void ff_put_h264_qpel16_mc00_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
#define LD4(psrc, stride, out0, out1, out2, out3)
void ff_put_h264_qpel4_mc32_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
void ff_put_h264_qpel4_mc03_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
void ff_avg_h264_qpel8_mc23_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
void ff_avg_h264_qpel4_mc11_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
void ff_avg_h264_qpel4_mc01_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
void ff_avg_h264_qpel16_mc02_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
void ff_avg_h264_qpel16_mc11_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
void ff_put_h264_qpel16_mc22_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
void ff_put_h264_qpel16_mc01_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
void ff_avg_h264_qpel16_mc13_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
static int aligned(int val)
void ff_put_h264_qpel4_mc13_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
#define ST8x2_UB(in, pdst, stride)
void ff_put_h264_qpel8_mc01_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
#define XORI_B4_128_SB(...)
void ff_put_h264_qpel4_mc02_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
#define AVC_HORZ_FILTER_SH(in0, in1, mask0, mask1, mask2)
void ff_avg_h264_qpel16_mc00_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
void ff_put_h264_qpel8_mc11_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
#define DPADD_SB2_SH(...)
void ff_put_h264_qpel16_mc31_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
void ff_avg_h264_qpel4_mc21_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
void ff_avg_h264_qpel8_mc31_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
void ff_avg_h264_qpel8_mc30_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
void ff_put_h264_qpel8_mc31_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
void ff_put_h264_qpel4_mc31_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
static void avc_luma_hv_qrt_and_aver_dst_4x4_msa(const uint8_t *src_x, const uint8_t *src_y, uint8_t *dst, int32_t stride)
#define AVC_DOT_SH3_SH(in0, in1, in2, coeff0, coeff1, coeff2)
void ff_put_h264_qpel4_mc21_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
void ff_put_h264_qpel16_mc23_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
void ff_avg_h264_qpel16_mc01_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
void ff_put_h264_qpel16_mc12_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
void ff_avg_h264_qpel8_mc11_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
void ff_put_h264_qpel8_mc22_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
void ff_put_h264_qpel8_mc10_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
#define INSERT_W4_UB(...)
void ff_avg_h264_qpel8_mc21_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
void ff_avg_h264_qpel16_mc10_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
static void avc_luma_hv_qrt_8x8_msa(const uint8_t *src_x, const uint8_t *src_y, uint8_t *dst, int32_t stride)
void ff_put_h264_qpel4_mc12_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
static void avc_luma_hv_qrt_4x4_msa(const uint8_t *src_x, const uint8_t *src_y, uint8_t *dst, int32_t stride)
void ff_avg_h264_qpel16_mc21_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
void ff_put_h264_qpel4_mc22_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
void ff_avg_h264_qpel4_mc32_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
void ff_put_h264_qpel8_mc21_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
void ff_put_h264_qpel8_mc30_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
void ff_avg_h264_qpel8_mc33_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
void ff_put_h264_qpel4_mc30_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
void ff_avg_h264_qpel4_mc23_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
void ff_put_h264_qpel8_mc00_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
void ff_put_h264_qpel4_mc20_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
void ff_avg_h264_qpel8_mc32_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
#define AVC_CALC_DPADD_B_6PIX_2COEFF_SH(vec0, vec1, vec2, vec3, vec4, vec5, out1, out2)
void ff_avg_h264_qpel4_mc30_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
void ff_avg_h264_qpel16_mc33_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
void ff_put_h264_qpel4_mc11_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
void ff_put_h264_qpel8_mc03_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
void ff_put_h264_qpel8_mc33_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
void ff_avg_h264_qpel8_mc22_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
#define SD4(in0, in1, in2, in3, pdst, stride)
#define DPADD_SH2_SW(...)
static void avc_luma_hv_qrt_16x16_msa(const uint8_t *src_x, const uint8_t *src_y, uint8_t *dst, int32_t stride)
void ff_avg_h264_qpel16_mc31_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
void ff_avg_h264_qpel4_mc03_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
void ff_put_h264_qpel8_mc20_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
#define INSERT_D2_UB(...)
#define LW4(psrc, stride, out0, out1, out2, out3)
#define ST8x4_UB(in0, in1, pdst, stride)
void ff_put_h264_qpel16_mc02_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
#define ST8x8_UB(in0, in1, in2, in3, pdst, stride)
void ff_avg_h264_qpel8_mc00_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
void ff_put_h264_qpel4_mc10_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
void ff_avg_h264_qpel8_mc01_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
void ff_put_h264_qpel16_mc30_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
void ff_put_h264_qpel16_mc03_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
void ff_avg_h264_qpel4_mc33_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
void ff_avg_h264_qpel8_mc02_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
void ff_put_h264_qpel16_mc33_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
void ff_put_h264_qpel16_mc21_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
void ff_avg_h264_qpel8_mc20_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
void ff_put_h264_qpel4_mc33_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
void ff_avg_h264_qpel4_mc31_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
void ff_avg_h264_qpel16_mc03_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
void ff_put_h264_qpel16_mc11_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
void ff_avg_h264_qpel8_mc13_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
void ff_put_h264_qpel16_mc20_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
#define LD2(psrc, stride, out0, out1)
static const uint8_t luma_mask_arr[16 *6]
void ff_avg_h264_qpel8_mc10_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
void ff_put_h264_qpel16_mc32_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
void ff_put_h264_qpel8_mc12_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
void ff_avg_h264_qpel16_mc12_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)