static const uint8_t chroma_mask_arr[16 * 5] = {
    0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20,
    0, 2, 2, 4, 4, 6, 6, 8, 16, 18, 18, 20, 20, 22, 22, 24,
    0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
    0, 1, 1, 2, 16, 17, 17, 18, 4, 5, 5, 6, 6, 7, 7, 8,
    0, 1, 1, 2, 16, 17, 17, 18, 16, 17, 17, 18, 18, 19, 19, 20
};
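
/* Shuffle control rows for the vshf.b-based horizontal filters.  Each row
 * lists byte indices that pair every pixel with its right neighbour, so a
 * single dot product evaluates both filter taps at once; indices below 16
 * select bytes from one shuffle source vector and indices 16..31 from the
 * other, which lets two narrow rows share a single 16-byte vector. */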

static void avc_chroma_hz_2x2_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                  uint32_t coeff0, uint32_t coeff1)
{
    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);

    LD_SB2(src, stride, src0, src1);

    src0 = __msa_vshf_b(mask, src1, src0);

    res_r = __msa_dotp_u_h((v16u8) src0, coeff_vec);
    res_r <<= 3;
    res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
    res_r = __msa_sat_u_h(res_r, 7);
    res = (v8i16) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);

    out0 = __msa_copy_u_h(res, 0);
    out1 = __msa_copy_u_h(res, 2);
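
/* For reference: H.264 chroma MC is a bilinear filter whose two horizontal
 * taps sum to 8 (x and 8 - x), i.e. dst[i] = (tapA * src[i] +
 * tapB * src[i + 1] + 4) >> 3.  The kernels compute the equivalent
 * ((v << 3) + 32) >> 6 instead: the dot product is scaled by 8 and then
 * rounded with __msa_srari_h(..., 6) and saturated back to 8 bits. */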

static void avc_chroma_hz_2x4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                  uint32_t coeff0, uint32_t coeff1)
{
    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);

    LD_UB4(src, stride, src0, src1, src2, src3);

    VSHF_B2_UB(src0, src1, src2, src3, mask, mask, src0, src2);

    src0 = (v16u8) __msa_ilvr_d((v2i64) src2, (v2i64) src0);

    res_r = __msa_dotp_u_h(src0, coeff_vec);
    res_r <<= 3;
    res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
    res_r = __msa_sat_u_h(res_r, 7);
    res = (v8i16) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);

static void avc_chroma_hz_2w_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                 uint32_t coeff0, uint32_t coeff1,
                                 int32_t height)
{
    if (2 == height) {
        avc_chroma_hz_2x2_msa(src, dst, stride, coeff0, coeff1);
    } else if (4 == height) {
        avc_chroma_hz_2x4_msa(src, dst, stride, coeff0, coeff1);
    }
}
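
/* Every *_2w/_4w/_8w helper follows this shape: dispatch on the block
 * height to a fixed-size kernel, so the kernels themselves stay
 * branch-free. */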

static void avc_chroma_hz_4x2_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                  uint32_t coeff0, uint32_t coeff1)
{
    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);

    LD_SB2(src, stride, src0, src1);

    src0 = __msa_vshf_b(mask, src1, src0);
    res_r = __msa_dotp_u_h((v16u8) src0, coeff_vec);
    res_r <<= 3;
    res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
    res_r = __msa_sat_u_h(res_r, 7);
    res = (v4i32) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);

static void avc_chroma_hz_4x4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                  uint32_t coeff0, uint32_t coeff1)
{
    v8u16 res0_r, res1_r;
    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);

    LD_UB4(src, stride, src0, src1, src2, src3);
    VSHF_B2_UB(src0, src1, src2, src3, mask, mask, src0, src2);
    DOTP_UB2_UH(src0, src2, coeff_vec, coeff_vec, res0_r, res1_r);

    out = (v16u8) __msa_pckev_b((v16i8) res1_r, (v16i8) res0_r);
    ST4x4_UB(out, out, 0, 1, 2, 3, dst, stride);

static void avc_chroma_hz_4x8_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                  uint32_t coeff0, uint32_t coeff1)
{
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, out0, out1;
    v8u16 res0, res1, res2, res3;
    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);

    LD_UB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7);
    VSHF_B2_UB(src0, src1, src2, src3, mask, mask, src0, src2);
    VSHF_B2_UB(src4, src5, src6, src7, mask, mask, src4, src6);
    DOTP_UB2_UH(src0, src2, coeff_vec, coeff_vec, res0, res1);
    DOTP_UB2_UH(src4, src6, coeff_vec, coeff_vec, res2, res3);
    SLLI_4V(res0, res1, res2, res3, 3);
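
/* Eight 4-wide rows become four 8-element dot products: each VSHF_B2_UB
 * packs two rows into one vector, and SLLI_4V + SRARI_H4_UH later apply
 * the same (v * 8 + 32) >> 6 rounding to four result vectors at a time. */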

static void avc_chroma_hz_4w_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                 uint32_t coeff0, uint32_t coeff1,
                                 int32_t height)
{
    if (2 == height) {
        avc_chroma_hz_4x2_msa(src, dst, stride, coeff0, coeff1);
    } else if (4 == height) {
        avc_chroma_hz_4x4_msa(src, dst, stride, coeff0, coeff1);
    } else if (8 == height) {
        avc_chroma_hz_4x8_msa(src, dst, stride, coeff0, coeff1);
    }
}

static void avc_chroma_hz_8x4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                  uint32_t coeff0, uint32_t coeff1)
{
    v16u8 src0, src1, src2, src3, out0, out1;
    v8u16 res0, res1, res2, res3;
    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);

    LD_UB4(src, stride, src0, src1, src2, src3);
    VSHF_B2_UB(src0, src0, src1, src1, mask, mask, src0, src1);
    VSHF_B2_UB(src2, src2, src3, src3, mask, mask, src2, src3);
    DOTP_UB4_UH(src0, src1, src2, src3, coeff_vec, coeff_vec, coeff_vec,
                coeff_vec, res0, res1, res2, res3);
    SLLI_4V(res0, res1, res2, res3, 3);

static void avc_chroma_hz_8x8_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                  uint32_t coeff0, uint32_t coeff1)
{
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16u8 out0, out1, out2, out3;
    v8u16 res0, res1, res2, res3, res4, res5, res6, res7;
    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);

    LD_UB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7);
    VSHF_B2_UB(src0, src0, src1, src1, mask, mask, src0, src1);
    VSHF_B2_UB(src2, src2, src3, src3, mask, mask, src2, src3);
    VSHF_B2_UB(src4, src4, src5, src5, mask, mask, src4, src5);
    VSHF_B2_UB(src6, src6, src7, src7, mask, mask, src6, src7);
    DOTP_UB4_UH(src0, src1, src2, src3, coeff_vec, coeff_vec, coeff_vec,
                coeff_vec, res0, res1, res2, res3);
    DOTP_UB4_UH(src4, src5, src6, src7, coeff_vec, coeff_vec, coeff_vec,
                coeff_vec, res4, res5, res6, res7);
    SLLI_4V(res0, res1, res2, res3, 3);
    SLLI_4V(res4, res5, res6, res7, 3);

    ST8x8_UB(out0, out1, out2, out3, dst, stride);

static void avc_chroma_hz_nonmult_msa(uint8_t *src, uint8_t *dst,
                                      int32_t stride, uint32_t coeff0,
                                      uint32_t coeff1, int32_t height)
{
    uint32_t row;
    v16u8 src0, src1, src2, src3, out0, out1;
    v8u16 res0, res1, res2, res3;
    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);

    for (row = height >> 2; row--;) {
        LD_UB4(src, stride, src0, src1, src2, src3);

        VSHF_B2_UB(src0, src0, src1, src1, mask, mask, src0, src1);
        VSHF_B2_UB(src2, src2, src3, src3, mask, mask, src2, src3);
        DOTP_UB4_UH(src0, src1, src2, src3, coeff_vec, coeff_vec, coeff_vec,
                    coeff_vec, res0, res1, res2, res3);
        SLLI_4V(res0, res1, res2, res3, 3);
    }

    if (0 != (height % 4)) {
        for (row = (height % 4); row--;) {
            src0 = (v16u8) __msa_vshf_b(mask, (v16i8) src0, (v16i8) src0);

            res0 = __msa_dotp_u_h(src0, coeff_vec);
            res0 <<= 3;
            res0 = (v8u16) __msa_srari_h((v8i16) res0, 6);
            res0 = __msa_sat_u_h(res0, 7);
            res0 = (v8u16) __msa_pckev_b((v16i8) res0, (v16i8) res0);
        }
    }
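
/* Fallback for 8-wide blocks whose height is not handled by a fixed-size
 * kernel: full groups of four rows go through the vector path, then any
 * height % 4 leftover rows are filtered one vector at a time. */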

static void avc_chroma_hz_8w_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                 uint32_t coeff0, uint32_t coeff1,
                                 int32_t height)
{
    if (4 == height) {
        avc_chroma_hz_8x4_msa(src, dst, stride, coeff0, coeff1);
    } else if (8 == height) {
        avc_chroma_hz_8x8_msa(src, dst, stride, coeff0, coeff1);
    } else {
        avc_chroma_hz_nonmult_msa(src, dst, stride, coeff0, coeff1, height);
    }
}

static void avc_chroma_vt_2x2_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                  uint32_t coeff0, uint32_t coeff1)
{
    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);

    LD_SB3(src, stride, src0, src1, src2);

    ILVR_B2_UB(src1, src0, src2, src1, tmp0, tmp1);

    tmp0 = (v16u8) __msa_ilvr_d((v2i64) tmp1, (v2i64) tmp0);

    res_r = __msa_dotp_u_h(tmp0, coeff_vec);
    res_r <<= 3;
    res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
    res_r = __msa_sat_u_h(res_r, 7);
    res = (v8i16) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);

    out0 = __msa_copy_u_h(res, 0);
    out1 = __msa_copy_u_h(res, 2);
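
/* The vertical kernels reuse the horizontal machinery: ILVR_B2/ILVR_B4
 * interleave vertically adjacent rows byte by byte, so the same two-tap
 * dot product now combines each pixel with the one directly below it. */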

static void avc_chroma_vt_2x4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                  uint32_t coeff0, uint32_t coeff1)
{
    v16u8 tmp0, tmp1, tmp2, tmp3;
    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);

    LD_UB5(src, stride, src0, src1, src2, src3, src4);
    ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3,
               tmp0, tmp1, tmp2, tmp3);
    ILVR_W2_UB(tmp1, tmp0, tmp3, tmp2, tmp0, tmp2);

    tmp0 = (v16u8) __msa_ilvr_d((v2i64) tmp2, (v2i64) tmp0);

    res_r = __msa_dotp_u_h(tmp0, coeff_vec);
    res_r <<= 3;
    res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
    res_r = __msa_sat_u_h(res_r, 7);

    res = (v8i16) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);

static void avc_chroma_vt_2w_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                 uint32_t coeff0, uint32_t coeff1,
                                 int32_t height)
{
    if (2 == height) {
        avc_chroma_vt_2x2_msa(src, dst, stride, coeff0, coeff1);
    } else if (4 == height) {
        avc_chroma_vt_2x4_msa(src, dst, stride, coeff0, coeff1);
    }
}

static void avc_chroma_vt_4x2_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                  uint32_t coeff0, uint32_t coeff1)
{
    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);

    LD_UB3(src, stride, src0, src1, src2);
    ILVR_B2_UB(src1, src0, src2, src1, tmp0, tmp1);

    tmp0 = (v16u8) __msa_ilvr_d((v2i64) tmp1, (v2i64) tmp0);
    res_r = __msa_dotp_u_h(tmp0, coeff_vec);
    res_r <<= 3;
    res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
    res_r = __msa_sat_u_h(res_r, 7);
    res = (v4i32) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);

static void avc_chroma_vt_4x4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                  uint32_t coeff0, uint32_t coeff1)
{
    v16u8 tmp0, tmp1, tmp2, tmp3;
    v8u16 res0_r, res1_r;
    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);

    LD_UB5(src, stride, src0, src1, src2, src3, src4);
    ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, tmp0, tmp1, tmp2,
               tmp3);
    ILVR_D2_UB(tmp1, tmp0, tmp3, tmp2, tmp0, tmp2);
    DOTP_UB2_UH(tmp0, tmp2, coeff_vec, coeff_vec, res0_r, res1_r);

    out = (v16u8) __msa_pckev_b((v16i8) res1_r, (v16i8) res0_r);
    ST4x4_UB(out, out, 0, 1, 2, 3, dst, stride);

static void avc_chroma_vt_4x8_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                  uint32_t coeff0, uint32_t coeff1)
{
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16u8 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, out0, out1;
    v8u16 res0, res1, res2, res3;
    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);

    LD_UB5(src, stride, src0, src1, src2, src3, src4);
    src += (5 * stride);
    LD_UB4(src, stride, src5, src6, src7, src8);
    ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, tmp0, tmp1, tmp2,
               tmp3);
    ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7, tmp4, tmp5, tmp6,
               tmp7);
    ILVR_D2_UB(tmp1, tmp0, tmp3, tmp2, tmp0, tmp2);
    ILVR_D2_UB(tmp5, tmp4, tmp7, tmp6, tmp4, tmp6);
    DOTP_UB2_UH(tmp0, tmp2, coeff_vec, coeff_vec, res0, res1);
    DOTP_UB2_UH(tmp4, tmp6, coeff_vec, coeff_vec, res2, res3);
    SLLI_4V(res0, res1, res2, res3, 3);

static void avc_chroma_vt_4w_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                 uint32_t coeff0, uint32_t coeff1,
                                 int32_t height)
{
    if (2 == height) {
        avc_chroma_vt_4x2_msa(src, dst, stride, coeff0, coeff1);
    } else if (4 == height) {
        avc_chroma_vt_4x4_msa(src, dst, stride, coeff0, coeff1);
    } else if (8 == height) {
        avc_chroma_vt_4x8_msa(src, dst, stride, coeff0, coeff1);
    }
}

static void avc_chroma_vt_8x4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                  uint32_t coeff0, uint32_t coeff1)
{
    v16u8 src0, src1, src2, src3, src4, out0, out1;
    v8u16 res0, res1, res2, res3;
    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);

    LD_UB5(src, stride, src0, src1, src2, src3, src4);
    ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, src0, src1, src2,
               src3);
    DOTP_UB4_UH(src0, src1, src2, src3, coeff_vec, coeff_vec, coeff_vec,
                coeff_vec, res0, res1, res2, res3);
    SLLI_4V(res0, res1, res2, res3, 3);

static void avc_chroma_vt_8x8_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                  uint32_t coeff0, uint32_t coeff1)
{
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16u8 out0, out1, out2, out3;
    v8u16 res0, res1, res2, res3, res4, res5, res6, res7;
    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);

    LD_UB5(src, stride, src0, src1, src2, src3, src4);
    src += (5 * stride);
    LD_UB4(src, stride, src5, src6, src7, src8);
    ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3,
               src0, src1, src2, src3);
    ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7,
               src4, src5, src6, src7);
    DOTP_UB4_UH(src0, src1, src2, src3, coeff_vec, coeff_vec, coeff_vec,
                coeff_vec, res0, res1, res2, res3);
    DOTP_UB4_UH(src4, src5, src6, src7, coeff_vec, coeff_vec, coeff_vec,
                coeff_vec, res4, res5, res6, res7);
    SLLI_4V(res0, res1, res2, res3, 3);
    SLLI_4V(res4, res5, res6, res7, 3);

    ST8x8_UB(out0, out1, out2, out3, dst, stride);

static void avc_chroma_vt_8w_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                 uint32_t coeff0, uint32_t coeff1,
                                 int32_t height)
{
    if (4 == height) {
        avc_chroma_vt_8x4_msa(src, dst, stride, coeff0, coeff1);
    } else if (8 == height) {
        avc_chroma_vt_8x8_msa(src, dst, stride, coeff0, coeff1);
    }
}

static void avc_chroma_hv_2x2_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                  uint32_t coef_hor0, uint32_t coef_hor1,
                                  uint32_t coef_ver0, uint32_t coef_ver1)
{
    v8u16 res_hz0, res_hz1, res_vt0, res_vt1;
    v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
    v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
    v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
    v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
    v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);

    LD_UB3(src, stride, src0, src1, src2);
    VSHF_B2_UB(src0, src1, src1, src2, mask, mask, src0, src1);
    DOTP_UB2_UH(src0, src1, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1);
    MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1);

    res_vt0 += res_vt1;
    res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6);
    res_vt0 = __msa_sat_u_h(res_vt0, 7);
    res_vert = (v8i16) __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);

    out0 = __msa_copy_u_h(res_vert, 0);
    out1 = __msa_copy_u_h(res_vert, 1);
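
/* The 2-D (hv) kernels filter horizontally first (byte dot product into
 * 16-bit res_hz*), then apply the vertical taps as halfword multiplies.
 * Both tap pairs sum to 8, so the combined scale is 64 and a single
 * __msa_srari_h(..., 6) gives the (v + 32) >> 6 rounding of the spec
 * without any extra shift. */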

static void avc_chroma_hv_2x4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                  uint32_t coef_hor0, uint32_t coef_hor1,
                                  uint32_t coef_ver0, uint32_t coef_ver1)
{
    v16u8 tmp0, tmp1, tmp2, tmp3;
    v8u16 res_hz0, res_hz1, res_vt0, res_vt1;
    v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
    v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
    v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
    v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
    v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);

    LD_UB5(src, stride, src0, src1, src2, src3, src4);

    VSHF_B2_UB(src0, src1, src2, src3, mask, mask, tmp0, tmp1);
    VSHF_B2_UB(src1, src2, src3, src4, mask, mask, tmp2, tmp3);
    ILVR_D2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1);
    DOTP_UB2_UH(src0, src1, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1);
    MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1);

    res_vt0 += res_vt1;
    res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6);
    res_vt0 = __msa_sat_u_h(res_vt0, 7);

    res = (v8i16) __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);

static void avc_chroma_hv_2w_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                 uint32_t coef_hor0, uint32_t coef_hor1,
                                 uint32_t coef_ver0, uint32_t coef_ver1,
                                 int32_t height)
{
    if (2 == height) {
        avc_chroma_hv_2x2_msa(src, dst, stride, coef_hor0, coef_hor1,
                              coef_ver0, coef_ver1);
    } else if (4 == height) {
        avc_chroma_hv_2x4_msa(src, dst, stride, coef_hor0, coef_hor1,
                              coef_ver0, coef_ver1);
    }
}

static void avc_chroma_hv_4x2_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                  uint32_t coef_hor0, uint32_t coef_hor1,
                                  uint32_t coef_ver0, uint32_t coef_ver1)
{
    v8u16 res_hz0, res_hz1, res_vt0, res_vt1;
    v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
    v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
    v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
    v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
    v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);

    LD_UB3(src, stride, src0, src1, src2);
    VSHF_B2_UB(src0, src1, src1, src2, mask, mask, src0, src1);
    DOTP_UB2_UH(src0, src1, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1);
    MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1);

    res_vt0 += res_vt1;
    res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6);
    res_vt0 = __msa_sat_u_h(res_vt0, 7);
    res = (v4i32) __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);

static void avc_chroma_hv_4x4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                  uint32_t coef_hor0, uint32_t coef_hor1,
                                  uint32_t coef_ver0, uint32_t coef_ver1)
{
    v8u16 res_hz0, res_hz1, res_hz2, res_hz3;
    v8u16 res_vt0, res_vt1, res_vt2, res_vt3;
    v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
    v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
    v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
    v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
    v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);

    LD_UB5(src, stride, src0, src1, src2, src3, src4);
    VSHF_B2_UB(src0, src1, src1, src2, mask, mask, src0, src1);
    VSHF_B2_UB(src2, src3, src3, src4, mask, mask, src2, src3);
    DOTP_UB4_UH(src0, src1, src2, src3, coeff_hz_vec, coeff_hz_vec,
                coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1, res_hz2,
                res_hz3);
    MUL4(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_hz2, coeff_vt_vec1,
         res_hz3, coeff_vt_vec0, res_vt0, res_vt1, res_vt2, res_vt3);
    ADD2(res_vt0, res_vt1, res_vt2, res_vt3, res_vt0, res_vt1);

    PCKEV_B2_SW(res_vt0, res_vt0, res_vt1, res_vt1, res0, res1);
    ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, stride);

static void avc_chroma_hv_4x8_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                  uint32_t coef_hor0, uint32_t coef_hor1,
                                  uint32_t coef_ver0, uint32_t coef_ver1)
{
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, res0, res1;
    v8u16 res_hz0, res_hz1, res_hz2, res_hz3, res_hz4, res_hz5, res_hz6, res_hz7;
    v8u16 res_vt0, res_vt1, res_vt2, res_vt3, res_vt4, res_vt5, res_vt6, res_vt7;
    v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
    v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
    v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
    v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
    v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);

    LD_UB5(src, stride, src0, src1, src2, src3, src4);
    src += (5 * stride);
    LD_UB4(src, stride, src5, src6, src7, src8);

    VSHF_B2_UB(src0, src1, src1, src2, mask, mask, src0, src1);
    VSHF_B2_UB(src2, src3, src3, src4, mask, mask, src2, src3);
    VSHF_B2_UB(src4, src5, src5, src6, mask, mask, src4, src5);
    VSHF_B2_UB(src6, src7, src7, src8, mask, mask, src6, src7);
    DOTP_UB4_UH(src0, src1, src2, src3, coeff_hz_vec, coeff_hz_vec,
                coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1, res_hz2, res_hz3);
    DOTP_UB4_UH(src4, src5, src6, src7, coeff_hz_vec, coeff_hz_vec,
                coeff_hz_vec, coeff_hz_vec, res_hz4, res_hz5, res_hz6, res_hz7);
    MUL4(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_hz2, coeff_vt_vec1,
         res_hz3, coeff_vt_vec0, res_vt0, res_vt1, res_vt2, res_vt3);
    MUL4(res_hz4, coeff_vt_vec1, res_hz5, coeff_vt_vec0, res_hz6, coeff_vt_vec1,
         res_hz7, coeff_vt_vec0, res_vt4, res_vt5, res_vt6, res_vt7);
    ADD2(res_vt0, res_vt1, res_vt2, res_vt3, res_vt0, res_vt1);
    ADD2(res_vt4, res_vt5, res_vt6, res_vt7, res_vt2, res_vt3);
    SRARI_H4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 6);
    SAT_UH4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 7);
    PCKEV_B2_UB(res_vt1, res_vt0, res_vt3, res_vt2, res0, res1);

static void avc_chroma_hv_4w_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                 uint32_t coef_hor0, uint32_t coef_hor1,
                                 uint32_t coef_ver0, uint32_t coef_ver1,
                                 int32_t height)
{
    if (2 == height) {
        avc_chroma_hv_4x2_msa(src, dst, stride, coef_hor0, coef_hor1,
                              coef_ver0, coef_ver1);
    } else if (4 == height) {
        avc_chroma_hv_4x4_msa(src, dst, stride, coef_hor0, coef_hor1,
                              coef_ver0, coef_ver1);
    } else if (8 == height) {
        avc_chroma_hv_4x8_msa(src, dst, stride, coef_hor0, coef_hor1,
                              coef_ver0, coef_ver1);
    }
}

static void avc_chroma_hv_8x4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                  uint32_t coef_hor0, uint32_t coef_hor1,
                                  uint32_t coef_ver0, uint32_t coef_ver1)
{
    v16u8 src0, src1, src2, src3, src4, out0, out1;
    v8u16 res_hz0, res_hz1, res_hz2, res_hz3, res_hz4;
    v8u16 res_vt0, res_vt1, res_vt2, res_vt3;
    v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
    v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
    v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
    v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
    v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);

    src0 = (v16u8) __msa_vshf_b(mask, (v16i8) src0, (v16i8) src0);
    res_hz0 = __msa_dotp_u_h(src0, coeff_hz_vec);

    LD_UB4(src, stride, src1, src2, src3, src4);

    VSHF_B2_UB(src1, src1, src2, src2, mask, mask, src1, src2);
    VSHF_B2_UB(src3, src3, src4, src4, mask, mask, src3, src4);
    DOTP_UB4_UH(src1, src2, src3, src4, coeff_hz_vec, coeff_hz_vec,
                coeff_hz_vec, coeff_hz_vec, res_hz1, res_hz2, res_hz3, res_hz4);
    MUL4(res_hz1, coeff_vt_vec0, res_hz2, coeff_vt_vec0, res_hz3, coeff_vt_vec0,
         res_hz4, coeff_vt_vec0, res_vt0, res_vt1, res_vt2, res_vt3);

    res_vt0 += (res_hz0 * coeff_vt_vec1);
    res_vt1 += (res_hz1 * coeff_vt_vec1);
    res_vt2 += (res_hz2 * coeff_vt_vec1);
    res_vt3 += (res_hz3 * coeff_vt_vec1);

    SRARI_H4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 6);
    SAT_UH4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 7);
    PCKEV_B2_UB(res_vt1, res_vt0, res_vt3, res_vt2, out0, out1);
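
/* The 8-wide hv kernels pipeline the rows: the row above the block is
 * filtered once into res_hz0, and each output row adds the previous row's
 * horizontal result scaled by coeff_vt_vec1 to the current row's result
 * scaled by coeff_vt_vec0 (the two vertical taps again sum to 8). */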

static void avc_chroma_hv_8x8_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                  uint32_t coef_hor0, uint32_t coef_hor1,
                                  uint32_t coef_ver0, uint32_t coef_ver1)
{
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16u8 out0, out1, out2, out3;
    v8u16 res_hz0, res_hz1, res_hz2, res_hz3, res_hz4;
    v8u16 res_hz5, res_hz6, res_hz7, res_hz8;
    v8u16 res_vt0, res_vt1, res_vt2, res_vt3;
    v8u16 res_vt4, res_vt5, res_vt6, res_vt7;
    v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
    v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
    v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
    v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
    v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);

    LD_UB5(src, stride, src0, src1, src2, src3, src4);
    src += (5 * stride);
    LD_UB4(src, stride, src5, src6, src7, src8);
    src0 = (v16u8) __msa_vshf_b(mask, (v16i8) src0, (v16i8) src0);
    VSHF_B2_UB(src1, src1, src2, src2, mask, mask, src1, src2);
    VSHF_B2_UB(src3, src3, src4, src4, mask, mask, src3, src4);
    VSHF_B2_UB(src5, src5, src6, src6, mask, mask, src5, src6);
    VSHF_B2_UB(src7, src7, src8, src8, mask, mask, src7, src8);
    res_hz0 = __msa_dotp_u_h(src0, coeff_hz_vec);
    DOTP_UB4_UH(src1, src2, src3, src4, coeff_hz_vec, coeff_hz_vec,
                coeff_hz_vec, coeff_hz_vec, res_hz1, res_hz2, res_hz3,
                res_hz4);
    DOTP_UB4_UH(src5, src6, src7, src8, coeff_hz_vec, coeff_hz_vec,
                coeff_hz_vec, coeff_hz_vec, res_hz5, res_hz6, res_hz7, res_hz8);
    MUL4(res_hz1, coeff_vt_vec0, res_hz2, coeff_vt_vec0, res_hz3,
         coeff_vt_vec0, res_hz4, coeff_vt_vec0, res_vt0, res_vt1, res_vt2,
         res_vt3);
    MUL4(res_hz5, coeff_vt_vec0, res_hz6, coeff_vt_vec0, res_hz7,
         coeff_vt_vec0, res_hz8, coeff_vt_vec0, res_vt4, res_vt5, res_vt6,
         res_vt7);
    res_vt0 += (res_hz0 * coeff_vt_vec1);
    res_vt1 += (res_hz1 * coeff_vt_vec1);
    res_vt2 += (res_hz2 * coeff_vt_vec1);
    res_vt3 += (res_hz3 * coeff_vt_vec1);
    res_vt4 += (res_hz4 * coeff_vt_vec1);
    res_vt5 += (res_hz5 * coeff_vt_vec1);
    res_vt6 += (res_hz6 * coeff_vt_vec1);
    res_vt7 += (res_hz7 * coeff_vt_vec1);
    SRARI_H4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 6);
    SRARI_H4_UH(res_vt4, res_vt5, res_vt6, res_vt7, 6);
    SAT_UH4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 7);
    SAT_UH4_UH(res_vt4, res_vt5, res_vt6, res_vt7, 7);
    PCKEV_B2_UB(res_vt1, res_vt0, res_vt3, res_vt2, out0, out1);
    PCKEV_B2_UB(res_vt5, res_vt4, res_vt7, res_vt6, out2, out3);
    ST8x8_UB(out0, out1, out2, out3, dst, stride);

static void avc_chroma_hv_8w_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                 uint32_t coef_hor0, uint32_t coef_hor1,
                                 uint32_t coef_ver0, uint32_t coef_ver1,
                                 int32_t height)
{
    if (4 == height) {
        avc_chroma_hv_8x4_msa(src, dst, stride, coef_hor0, coef_hor1,
                              coef_ver0, coef_ver1);
    } else if (8 == height) {
        avc_chroma_hv_8x8_msa(src, dst, stride, coef_hor0, coef_hor1,
                              coef_ver0, coef_ver1);
    }
}

static void avc_chroma_hz_and_aver_dst_2x2_msa(uint8_t *src, uint8_t *dst,
                                               int32_t stride, uint32_t coeff0,
                                               uint32_t coeff1)
{
    uint16_t out0, out1;
    v16u8 dst_data = { 0 };
    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);

    LD_SB2(src, stride, src0, src1);

    out0 = LH(dst);
    out1 = LH(dst + stride);

    dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 0, out0);
    dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 2, out1);

    src0 = __msa_vshf_b(mask, src1, src0);

    res_r = __msa_dotp_u_h((v16u8) src0, coeff_vec);
    res_r <<= 3;
    res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
    res_r = __msa_sat_u_h(res_r, 7);

    res = (v16u8) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
    dst_data = __msa_aver_u_b(res, dst_data);

    out0 = __msa_copy_u_h((v8i16) dst_data, 0);
    out1 = __msa_copy_u_h((v8i16) dst_data, 2);
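
/* The *_and_aver_dst_* kernels back the avg MC entry points: the existing
 * dst pixels are gathered into a vector and __msa_aver_u_b blends them
 * with the filtered result using the rounding average (a + b + 1) >> 1. */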

static void avc_chroma_hz_and_aver_dst_2x4_msa(uint8_t *src, uint8_t *dst,
                                               int32_t stride, uint32_t coeff0,
                                               uint32_t coeff1)
{
    uint16_t tp0, tp1, tp2, tp3;
    v16u8 dst0, dst_data = { 0 };
    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);

    LD_UB4(src, stride, src0, src1, src2, src3);

    tp0 = LH(dst);
    tp1 = LH(dst + stride);
    tp2 = LH(dst + 2 * stride);
    tp3 = LH(dst + 3 * stride);
    dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 0, tp0);
    dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 1, tp1);
    dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 2, tp2);
    dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 3, tp3);

    VSHF_B2_UB(src0, src1, src2, src3, mask, mask, src0, src2);

    src0 = (v16u8) __msa_ilvr_d((v2i64) src2, (v2i64) src0);

    res_r = __msa_dotp_u_h(src0, coeff_vec);
    res_r <<= 3;
    res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
    res_r = __msa_sat_u_h(res_r, 7);

    dst0 = (v16u8) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
    dst0 = __msa_aver_u_b(dst0, dst_data);

static void avc_chroma_hz_and_aver_dst_2w_msa(uint8_t *src, uint8_t *dst,
                                              int32_t stride, uint32_t coeff0,
                                              uint32_t coeff1, int32_t height)
{
    if (2 == height) {
        avc_chroma_hz_and_aver_dst_2x2_msa(src, dst, stride, coeff0, coeff1);
    } else if (4 == height) {
        avc_chroma_hz_and_aver_dst_2x4_msa(src, dst, stride, coeff0, coeff1);
    }
}

static void avc_chroma_hz_and_aver_dst_4x2_msa(uint8_t *src, uint8_t *dst,
                                               int32_t stride, uint32_t coeff0,
                                               uint32_t coeff1)
{
    uint32_t load0, load1;
    v16u8 dst_data = { 0 };
    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);

    LD_SB2(src, stride, src0, src1);

    LW2(dst, stride, load0, load1);
    INSERT_W2_UB(load0, load1, dst_data);

    src0 = __msa_vshf_b(mask, src1, src0);

    res_r = __msa_dotp_u_h((v16u8) src0, coeff_vec);
    res_r <<= 3;
    res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
    res_r = __msa_sat_u_h(res_r, 7);
    res = __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
    dst_data = __msa_aver_u_b((v16u8) res, dst_data);

static void avc_chroma_hz_and_aver_dst_4x4_msa(uint8_t *src, uint8_t *dst,
                                               int32_t stride, uint32_t coeff0,
                                               uint32_t coeff1)
{
    uint32_t tp0, tp1, tp2, tp3;
    v16u8 out, dst_data = { 0 };
    v8u16 res0_r, res1_r;
    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);

    LD_UB4(src, stride, src0, src1, src2, src3);
    LW4(dst, stride, tp0, tp1, tp2, tp3);
    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst_data);

    VSHF_B2_UB(src0, src1, src2, src3, mask, mask, src0, src2);
    DOTP_UB2_UH(src0, src2, coeff_vec, coeff_vec, res0_r, res1_r);

    out = (v16u8) __msa_pckev_b((v16i8) res1_r, (v16i8) res0_r);
    out = __msa_aver_u_b(out, dst_data);
    ST4x4_UB(out, out, 0, 1, 2, 3, dst, stride);

static void avc_chroma_hz_and_aver_dst_4x8_msa(uint8_t *src, uint8_t *dst,
                                               int32_t stride, uint32_t coeff0,
                                               uint32_t coeff1)
{
    uint32_t tp0, tp1, tp2, tp3;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, out0, out1;
    v16u8 dst0 = { 0 }, dst1 = { 0 };
    v8u16 res0, res1, res2, res3;
    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);

    LD_UB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7);
    LW4(dst, stride, tp0, tp1, tp2, tp3);
    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
    LW4(dst + 4 * stride, stride, tp0, tp1, tp2, tp3);
    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst1);

    VSHF_B2_UB(src0, src1, src2, src3, mask, mask, src0, src2);
    VSHF_B2_UB(src4, src5, src6, src7, mask, mask, src4, src6);
    DOTP_UB2_UH(src0, src2, coeff_vec, coeff_vec, res0, res1);
    DOTP_UB2_UH(src4, src6, coeff_vec, coeff_vec, res2, res3);
    SLLI_4V(res0, res1, res2, res3, 3);

static void avc_chroma_hz_and_aver_dst_4w_msa(uint8_t *src, uint8_t *dst,
                                              int32_t stride, uint32_t coeff0,
                                              uint32_t coeff1, int32_t height)
{
    if (2 == height) {
        avc_chroma_hz_and_aver_dst_4x2_msa(src, dst, stride, coeff0, coeff1);
    } else if (4 == height) {
        avc_chroma_hz_and_aver_dst_4x4_msa(src, dst, stride, coeff0, coeff1);
    } else if (8 == height) {
        avc_chroma_hz_and_aver_dst_4x8_msa(src, dst, stride, coeff0, coeff1);
    }
}

static void avc_chroma_hz_and_aver_dst_8x4_msa(uint8_t *src, uint8_t *dst,
                                               int32_t stride, uint32_t coeff0,
                                               uint32_t coeff1)
{
    uint64_t tp0, tp1, tp2, tp3;
    v16u8 src0, src1, src2, src3, out0, out1;
    v16u8 dst0 = { 0 }, dst1 = { 0 };
    v8u16 res0, res1, res2, res3;
    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);

    LD_UB4(src, stride, src0, src1, src2, src3);
    LD4(dst, stride, tp0, tp1, tp2, tp3);
    INSERT_D2_UB(tp0, tp1, dst0);
    INSERT_D2_UB(tp2, tp3, dst1);

    VSHF_B2_UB(src0, src0, src1, src1, mask, mask, src0, src1);
    VSHF_B2_UB(src2, src2, src3, src3, mask, mask, src2, src3);
    DOTP_UB4_UH(src0, src1, src2, src3, coeff_vec, coeff_vec, coeff_vec,
                coeff_vec, res0, res1, res2, res3);
    SLLI_4V(res0, res1, res2, res3, 3);
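
/* For width 8, two dst rows fit in one vector: LD4 reads four 64-bit rows
 * and INSERT_D2_UB packs them pairwise into dst0/dst1 for the averaging
 * step. */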

static void avc_chroma_hz_and_aver_dst_8x8_msa(uint8_t *src, uint8_t *dst,
                                               int32_t stride, uint32_t coeff0,
                                               uint32_t coeff1)
{
    uint64_t tp0, tp1, tp2, tp3;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16u8 out0, out1, out2, out3;
    v16u8 dst0 = { 0 }, dst1 = { 0 }, dst2 = { 0 }, dst3 = { 0 };
    v8u16 res0, res1, res2, res3, res4, res5, res6, res7;
    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);

    LD_UB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7);
    LD4(dst, stride, tp0, tp1, tp2, tp3);
    INSERT_D2_UB(tp0, tp1, dst0);
    INSERT_D2_UB(tp2, tp3, dst1);
    LD4(dst + 4 * stride, stride, tp0, tp1, tp2, tp3);
    INSERT_D2_UB(tp0, tp1, dst2);
    INSERT_D2_UB(tp2, tp3, dst3);

    VSHF_B2_UB(src0, src0, src1, src1, mask, mask, src0, src1);
    VSHF_B2_UB(src2, src2, src3, src3, mask, mask, src2, src3);
    VSHF_B2_UB(src4, src4, src5, src5, mask, mask, src4, src5);
    VSHF_B2_UB(src6, src6, src7, src7, mask, mask, src6, src7);
    DOTP_UB4_UH(src0, src1, src2, src3, coeff_vec, coeff_vec, coeff_vec,
                coeff_vec, res0, res1, res2, res3);
    DOTP_UB4_UH(src4, src5, src6, src7, coeff_vec, coeff_vec, coeff_vec,
                coeff_vec, res4, res5, res6, res7);
    SLLI_4V(res0, res1, res2, res3, 3);
    SLLI_4V(res4, res5, res6, res7, 3);

    ST8x8_UB(out0, out1, out2, out3, dst, stride);

static void avc_chroma_hz_and_aver_dst_8w_msa(uint8_t *src, uint8_t *dst,
                                              int32_t stride, uint32_t coeff0,
                                              uint32_t coeff1, int32_t height)
{
    if (4 == height) {
        avc_chroma_hz_and_aver_dst_8x4_msa(src, dst, stride, coeff0, coeff1);
    } else if (8 == height) {
        avc_chroma_hz_and_aver_dst_8x8_msa(src, dst, stride, coeff0, coeff1);
    }
}

static void avc_chroma_vt_and_aver_dst_2x2_msa(uint8_t *src, uint8_t *dst,
                                               int32_t stride, uint32_t coeff0,
                                               uint32_t coeff1)
{
    uint16_t out0, out1;
    v16i8 src0, src1, src2, tmp0, tmp1, res;
    v16u8 dst_data = { 0 };
    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);

    LD_SB3(src, stride, src0, src1, src2);

    out0 = LH(dst);
    out1 = LH(dst + stride);

    dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 0, out0);
    dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 2, out1);

    ILVR_B2_SB(src1, src0, src2, src1, tmp0, tmp1);

    tmp0 = (v16i8) __msa_ilvr_d((v2i64) tmp1, (v2i64) tmp0);
    res_r = __msa_dotp_u_h((v16u8) tmp0, coeff_vec);
    res_r <<= 3;
    res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
    res_r = __msa_sat_u_h(res_r, 7);
    res = __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
    out = (v8i16) __msa_aver_u_b((v16u8) res, dst_data);
    out0 = __msa_copy_u_h(out, 0);
    out1 = __msa_copy_u_h(out, 2);

static void avc_chroma_vt_and_aver_dst_2x4_msa(uint8_t *src, uint8_t *dst,
                                               int32_t stride, uint32_t coeff0,
                                               uint32_t coeff1)
{
    uint16_t tp0, tp1, tp2, tp3;
    v16i8 src0, src1, src2, src3, src4;
    v16u8 tmp0, tmp1, tmp2, tmp3;
    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
    v16u8 dst_data = { 0 };

    LD_SB5(src, stride, src0, src1, src2, src3, src4);

    tp0 = LH(dst);
    tp1 = LH(dst + stride);
    tp2 = LH(dst + 2 * stride);
    tp3 = LH(dst + 3 * stride);
    dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 0, tp0);
    dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 1, tp1);
    dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 2, tp2);
    dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 3, tp3);

    ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3,
               tmp0, tmp1, tmp2, tmp3);
    ILVR_W2_UB(tmp1, tmp0, tmp3, tmp2, tmp0, tmp2);

    tmp0 = (v16u8) __msa_ilvr_d((v2i64) tmp2, (v2i64) tmp0);

    res_r = __msa_dotp_u_h(tmp0, coeff_vec);
    res_r <<= 3;
    res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
    res_r = __msa_sat_u_h(res_r, 7);

    res = (v8i16) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
    res = (v8i16) __msa_aver_u_b((v16u8) res, dst_data);

static void avc_chroma_vt_and_aver_dst_2w_msa(uint8_t *src, uint8_t *dst,
                                              int32_t stride, uint32_t coeff0,
                                              uint32_t coeff1, int32_t height)
{
    if (2 == height) {
        avc_chroma_vt_and_aver_dst_2x2_msa(src, dst, stride, coeff0, coeff1);
    } else if (4 == height) {
        avc_chroma_vt_and_aver_dst_2x4_msa(src, dst, stride, coeff0, coeff1);
    }
}

static void avc_chroma_vt_and_aver_dst_4x2_msa(uint8_t *src, uint8_t *dst,
                                               int32_t stride, uint32_t coeff0,
                                               uint32_t coeff1)
{
    uint32_t load0, load1;
    v16u8 src0, src1, src2, tmp0, tmp1;
    v16u8 dst_data = { 0 };
    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);

    LD_UB3(src, stride, src0, src1, src2);

    LW2(dst, stride, load0, load1);
    INSERT_W2_UB(load0, load1, dst_data);

    ILVR_B2_UB(src1, src0, src2, src1, tmp0, tmp1);

    tmp0 = (v16u8) __msa_ilvr_d((v2i64) tmp1, (v2i64) tmp0);

    res_r = __msa_dotp_u_h(tmp0, coeff_vec);
    res_r <<= 3;
    res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
    res_r = __msa_sat_u_h(res_r, 7);
    res = (v16u8) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
    res = __msa_aver_u_b(res, dst_data);

static void avc_chroma_vt_and_aver_dst_4x4_msa(uint8_t *src, uint8_t *dst,
                                               int32_t stride, uint32_t coeff0,
                                               uint32_t coeff1)
{
    uint32_t tp0, tp1, tp2, tp3;
    v16u8 src0, src1, src2, src3, src4;
    v16u8 tmp0, tmp1, tmp2, tmp3;
    v16u8 out, dst0 = { 0 };
    v8u16 res0_r, res1_r;
    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);

    LD_UB5(src, stride, src0, src1, src2, src3, src4);
    LW4(dst, stride, tp0, tp1, tp2, tp3);
    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);

    ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, tmp0, tmp1, tmp2,
               tmp3);
    ILVR_D2_UB(tmp1, tmp0, tmp3, tmp2, tmp0, tmp2);
    DOTP_UB2_UH(tmp0, tmp2, coeff_vec, coeff_vec, res0_r, res1_r);

    out = (v16u8) __msa_pckev_b((v16i8) res1_r, (v16i8) res0_r);
    out = __msa_aver_u_b(out, dst0);
    ST4x4_UB(out, out, 0, 1, 2, 3, dst, stride);

static void avc_chroma_vt_and_aver_dst_4x8_msa(uint8_t *src, uint8_t *dst,
                                               int32_t stride, uint32_t coeff0,
                                               uint32_t coeff1)
{
    uint32_t tp0, tp1, tp2, tp3;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16u8 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, out0, out1;
    v16u8 dst0 = { 0 }, dst1 = { 0 };
    v8u16 res0, res1, res2, res3;
    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);

    LD_UB5(src, stride, src0, src1, src2, src3, src4);
    src += (5 * stride);
    LD_UB4(src, stride, src5, src6, src7, src8);
    LW4(dst, stride, tp0, tp1, tp2, tp3);
    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
    LW4(dst + 4 * stride, stride, tp0, tp1, tp2, tp3);
    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst1);

    ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, tmp0, tmp1, tmp2,
               tmp3);
    ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7, tmp4, tmp5, tmp6,
               tmp7);
    ILVR_D2_UB(tmp1, tmp0, tmp3, tmp2, tmp0, tmp2);
    ILVR_D2_UB(tmp5, tmp4, tmp7, tmp6, tmp4, tmp6);
    DOTP_UB2_UH(tmp0, tmp2, coeff_vec, coeff_vec, res0, res1);
    DOTP_UB2_UH(tmp4, tmp6, coeff_vec, coeff_vec, res2, res3);
    SLLI_4V(res0, res1, res2, res3, 3);

static void avc_chroma_vt_and_aver_dst_4w_msa(uint8_t *src, uint8_t *dst,
                                              int32_t stride, uint32_t coeff0,
                                              uint32_t coeff1, int32_t height)
{
    if (2 == height) {
        avc_chroma_vt_and_aver_dst_4x2_msa(src, dst, stride, coeff0, coeff1);
    } else if (4 == height) {
        avc_chroma_vt_and_aver_dst_4x4_msa(src, dst, stride, coeff0, coeff1);
    } else if (8 == height) {
        avc_chroma_vt_and_aver_dst_4x8_msa(src, dst, stride, coeff0, coeff1);
    }
}

static void avc_chroma_vt_and_aver_dst_8x4_msa(uint8_t *src, uint8_t *dst,
                                               int32_t stride, uint32_t coeff0,
                                               uint32_t coeff1)
{
    uint64_t tp0, tp1, tp2, tp3;
    v16u8 src0, src1, src2, src3, src4;
    v8u16 res0, res1, res2, res3;
    v16u8 dst0 = { 0 }, dst1 = { 0 };
    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);

    LD_UB5(src, stride, src0, src1, src2, src3, src4);
    LD4(dst, stride, tp0, tp1, tp2, tp3);
    INSERT_D2_UB(tp0, tp1, dst0);
    INSERT_D2_UB(tp2, tp3, dst1);

    ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3,
               src0, src1, src2, src3);
    DOTP_UB4_UH(src0, src1, src2, src3, coeff_vec, coeff_vec, coeff_vec,
                coeff_vec, res0, res1, res2, res3);
    SLLI_4V(res0, res1, res2, res3, 3);

static void avc_chroma_vt_and_aver_dst_8x8_msa(uint8_t *src, uint8_t *dst,
                                               int32_t stride, uint32_t coeff0,
                                               uint32_t coeff1)
{
    uint64_t tp0, tp1, tp2, tp3;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16u8 out0, out1, out2, out3;
    v16u8 dst0 = { 0 }, dst1 = { 0 }, dst2 = { 0 }, dst3 = { 0 };
    v8u16 res0, res1, res2, res3, res4, res5, res6, res7;
    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);

    LD_UB5(src, stride, src0, src1, src2, src3, src4);
    src += (5 * stride);
    LD_UB4(src, stride, src5, src6, src7, src8);
    LD4(dst, stride, tp0, tp1, tp2, tp3);
    INSERT_D2_UB(tp0, tp1, dst0);
    INSERT_D2_UB(tp2, tp3, dst1);
    LD4(dst + 4 * stride, stride, tp0, tp1, tp2, tp3);
    INSERT_D2_UB(tp0, tp1, dst2);
    INSERT_D2_UB(tp2, tp3, dst3);

    ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3,
               src0, src1, src2, src3);
    ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7,
               src4, src5, src6, src7);
    DOTP_UB4_UH(src0, src1, src2, src3, coeff_vec, coeff_vec, coeff_vec,
                coeff_vec, res0, res1, res2, res3);
    DOTP_UB4_UH(src4, src5, src6, src7, coeff_vec, coeff_vec, coeff_vec,
                coeff_vec, res4, res5, res6, res7);
    SLLI_4V(res0, res1, res2, res3, 3);
    SLLI_4V(res4, res5, res6, res7, 3);

    ST8x8_UB(out0, out1, out2, out3, dst, stride);

static void avc_chroma_vt_and_aver_dst_8w_msa(uint8_t *src, uint8_t *dst,
                                              int32_t stride, uint32_t coeff0,
                                              uint32_t coeff1, int32_t height)
{
    if (4 == height) {
        avc_chroma_vt_and_aver_dst_8x4_msa(src, dst, stride, coeff0, coeff1);
    } else if (8 == height) {
        avc_chroma_vt_and_aver_dst_8x8_msa(src, dst, stride, coeff0, coeff1);
    }
}

static void avc_chroma_hv_and_aver_dst_2x2_msa(uint8_t *src, uint8_t *dst,
                                               int32_t stride,
                                               uint32_t coef_hor0,
                                               uint32_t coef_hor1,
                                               uint32_t coef_ver0,
                                               uint32_t coef_ver1)
{
    uint16_t out0, out1;
    v16u8 dst0 = { 0 };
    v8u16 res_hz0, res_hz1, res_vt0, res_vt1;
    v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
    v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
    v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
    v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
    v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);

    LD_UB3(src, stride, src0, src1, src2);

    out0 = LH(dst);
    out1 = LH(dst + stride);
    dst0 = (v16u8) __msa_insert_h((v8i16) dst0, 0, out0);
    dst0 = (v16u8) __msa_insert_h((v8i16) dst0, 1, out1);
    VSHF_B2_UB(src0, src1, src1, src2, mask, mask, src0, src1);
    DOTP_UB2_UH(src0, src1, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1);
    MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1);

    res_vt0 += res_vt1;
    res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6);
    res_vt0 = __msa_sat_u_h(res_vt0, 7);
    res = __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);
    dst0 = __msa_aver_u_b((v16u8) res, dst0);
    out0 = __msa_copy_u_h((v8i16) dst0, 0);
    out1 = __msa_copy_u_h((v8i16) dst0, 1);

static void avc_chroma_hv_and_aver_dst_2x4_msa(uint8_t *src, uint8_t *dst,
                                               int32_t stride,
                                               uint32_t coef_hor0,
                                               uint32_t coef_hor1,
                                               uint32_t coef_ver0,
                                               uint32_t coef_ver1)
{
    uint16_t tp0, tp1, tp2, tp3;
    v16u8 src0, src1, src2, src3, src4;
    v16u8 tmp0, tmp1, tmp2, tmp3;
    v16u8 dst0 = { 0 };
    v8u16 res_hz0, res_hz1, res_vt0, res_vt1;
    v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
    v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
    v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
    v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
    v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);

    LD_UB5(src, stride, src0, src1, src2, src3, src4);

    tp0 = LH(dst);
    tp1 = LH(dst + stride);
    tp2 = LH(dst + 2 * stride);
    tp3 = LH(dst + 3 * stride);
    dst0 = (v16u8) __msa_insert_h((v8i16) dst0, 0, tp0);
    dst0 = (v16u8) __msa_insert_h((v8i16) dst0, 1, tp1);
    dst0 = (v16u8) __msa_insert_h((v8i16) dst0, 2, tp2);
    dst0 = (v16u8) __msa_insert_h((v8i16) dst0, 3, tp3);
    VSHF_B2_UB(src0, src1, src2, src3, mask, mask, tmp0, tmp1);
    VSHF_B2_UB(src1, src2, src3, src4, mask, mask, tmp2, tmp3);
    ILVR_D2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1);
    DOTP_UB2_UH(src0, src1, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1);
    MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1);

    res_vt0 += res_vt1;
    res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6);
    res_vt0 = __msa_sat_u_h(res_vt0, 7);
    res = __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);
    dst0 = __msa_aver_u_b((v16u8) res, dst0);

static void avc_chroma_hv_and_aver_dst_2w_msa(uint8_t *src, uint8_t *dst,
                                              int32_t stride,
                                              uint32_t coef_hor0,
                                              uint32_t coef_hor1,
                                              uint32_t coef_ver0,
                                              uint32_t coef_ver1,
                                              int32_t height)
{
    if (2 == height) {
        avc_chroma_hv_and_aver_dst_2x2_msa(src, dst, stride, coef_hor0,
                                           coef_hor1, coef_ver0, coef_ver1);
    } else if (4 == height) {
        avc_chroma_hv_and_aver_dst_2x4_msa(src, dst, stride, coef_hor0,
                                           coef_hor1, coef_ver0, coef_ver1);
    }
}

static void avc_chroma_hv_and_aver_dst_4x2_msa(uint8_t *src, uint8_t *dst,
                                               int32_t stride,
                                               uint32_t coef_hor0,
                                               uint32_t coef_hor1,
                                               uint32_t coef_ver0,
                                               uint32_t coef_ver1)
{
    uint32_t tp0, tp1;
    v16u8 dst0, dst_data = { 0 };
    v8u16 res_hz0, res_hz1, res_vt0, res_vt1;
    v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
    v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
    v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
    v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
    v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);

    LD_UB3(src, stride, src0, src1, src2);
    LW2(dst, stride, tp0, tp1);
    INSERT_W2_UB(tp0, tp1, dst_data);

    VSHF_B2_UB(src0, src1, src1, src2, mask, mask, src0, src1);
    DOTP_UB2_UH(src0, src1, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1);
    MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1);

    res_vt0 += res_vt1;
    res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6);
    res_vt0 = __msa_sat_u_h(res_vt0, 7);
    dst0 = (v16u8) __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);
    dst0 = __msa_aver_u_b(dst0, dst_data);

static void avc_chroma_hv_and_aver_dst_4x4_msa(uint8_t *src, uint8_t *dst,
                                               int32_t stride,
                                               uint32_t coef_hor0,
                                               uint32_t coef_hor1,
                                               uint32_t coef_ver0,
                                               uint32_t coef_ver1)
{
    uint32_t tp0, tp1, tp2, tp3;
    v16u8 src0, src1, src2, src3, src4;
    v16u8 out, dst_data = { 0 };
    v8u16 res_hz0, res_hz1, res_hz2, res_hz3;
    v8u16 res_vt0, res_vt1, res_vt2, res_vt3;
    v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
    v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
    v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
    v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
    v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);

    LD_UB5(src, stride, src0, src1, src2, src3, src4);
    LW4(dst, stride, tp0, tp1, tp2, tp3);
    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst_data);

    VSHF_B2_UB(src0, src1, src1, src2, mask, mask, src0, src1);
    VSHF_B2_UB(src2, src3, src3, src4, mask, mask, src2, src3);
    DOTP_UB4_UH(src0, src1, src2, src3, coeff_hz_vec, coeff_hz_vec,
                coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1, res_hz2,
                res_hz3);
    MUL4(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_hz2, coeff_vt_vec1,
         res_hz3, coeff_vt_vec0, res_vt0, res_vt1, res_vt2, res_vt3);
    ADD2(res_vt0, res_vt1, res_vt2, res_vt3, res_vt0, res_vt1);

    out = (v16u8) __msa_pckev_b((v16i8) res_vt1, (v16i8) res_vt0);
    out = __msa_aver_u_b(out, dst_data);
    ST4x4_UB(out, out, 0, 1, 2, 3, dst, stride);

static void avc_chroma_hv_and_aver_dst_4x8_msa(uint8_t *src, uint8_t *dst,
                                               int32_t stride,
                                               uint32_t coef_hor0,
                                               uint32_t coef_hor1,
                                               uint32_t coef_ver0,
                                               uint32_t coef_ver1)
{
    uint32_t tp0, tp1, tp2, tp3;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, res0, res1;
    v16u8 dst0 = { 0 }, dst1 = { 0 };
    v8u16 res_hz0, res_hz1, res_hz2, res_hz3, res_hz4, res_hz5, res_hz6, res_hz7;
    v8u16 res_vt0, res_vt1, res_vt2, res_vt3, res_vt4, res_vt5, res_vt6, res_vt7;
    v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
    v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
    v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
    v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
    v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);

    LD_UB5(src, stride, src0, src1, src2, src3, src4);
    src += (5 * stride);
    LD_UB4(src, stride, src5, src6, src7, src8);
    LW4(dst, stride, tp0, tp1, tp2, tp3);
    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
    LW4(dst + 4 * stride, stride, tp0, tp1, tp2, tp3);
    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst1);

    VSHF_B2_UB(src0, src1, src1, src2, mask, mask, src0, src1);
    VSHF_B2_UB(src2, src3, src3, src4, mask, mask, src2, src3);
    VSHF_B2_UB(src4, src5, src5, src6, mask, mask, src4, src5);
    VSHF_B2_UB(src6, src7, src7, src8, mask, mask, src6, src7);
    DOTP_UB4_UH(src0, src1, src2, src3, coeff_hz_vec, coeff_hz_vec,
                coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1, res_hz2, res_hz3);
    DOTP_UB4_UH(src4, src5, src6, src7, coeff_hz_vec, coeff_hz_vec,
                coeff_hz_vec, coeff_hz_vec, res_hz4, res_hz5, res_hz6, res_hz7);
    MUL4(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_hz2, coeff_vt_vec1,
         res_hz3, coeff_vt_vec0, res_vt0, res_vt1, res_vt2, res_vt3);
    MUL4(res_hz4, coeff_vt_vec1, res_hz5, coeff_vt_vec0, res_hz6, coeff_vt_vec1,
         res_hz7, coeff_vt_vec0, res_vt4, res_vt5, res_vt6, res_vt7);
    ADD2(res_vt0, res_vt1, res_vt2, res_vt3, res_vt0, res_vt1);
    ADD2(res_vt4, res_vt5, res_vt6, res_vt7, res_vt2, res_vt3);
    SRARI_H4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 6);
    SAT_UH4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 7);
    PCKEV_B2_UB(res_vt1, res_vt0, res_vt3, res_vt2, res0, res1);

static void avc_chroma_hv_and_aver_dst_4w_msa(uint8_t *src, uint8_t *dst,
                                              int32_t stride,
                                              uint32_t coef_hor0,
                                              uint32_t coef_hor1,
                                              uint32_t coef_ver0,
                                              uint32_t coef_ver1,
                                              int32_t height)
{
    if (2 == height) {
        avc_chroma_hv_and_aver_dst_4x2_msa(src, dst, stride, coef_hor0,
                                           coef_hor1, coef_ver0, coef_ver1);
    } else if (4 == height) {
        avc_chroma_hv_and_aver_dst_4x4_msa(src, dst, stride, coef_hor0,
                                           coef_hor1, coef_ver0, coef_ver1);
    } else if (8 == height) {
        avc_chroma_hv_and_aver_dst_4x8_msa(src, dst, stride, coef_hor0,
                                           coef_hor1, coef_ver0, coef_ver1);
    }
}

static void avc_chroma_hv_and_aver_dst_8x4_msa(uint8_t *src, uint8_t *dst,
                                               int32_t stride,
                                               uint32_t coef_hor0,
                                               uint32_t coef_hor1,
                                               uint32_t coef_ver0,
                                               uint32_t coef_ver1)
{
    uint64_t tp0, tp1, tp2, tp3;
    v16u8 src0, src1, src2, src3, src4, out0, out1;
    v8u16 res_hz0, res_hz1, res_hz2;
    v8u16 res_hz3, res_hz4;
    v8u16 res_vt0, res_vt1, res_vt2, res_vt3;
    v16u8 dst0 = { 0 }, dst1 = { 0 };
    v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
    v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
    v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
    v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
    v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);

    src0 = (v16u8) __msa_vshf_b(mask, (v16i8) src0, (v16i8) src0);
    res_hz0 = __msa_dotp_u_h(src0, coeff_hz_vec);
    LD_UB4(src, stride, src1, src2, src3, src4);

    LD4(dst, stride, tp0, tp1, tp2, tp3);
    INSERT_D2_UB(tp0, tp1, dst0);
    INSERT_D2_UB(tp2, tp3, dst1);

    VSHF_B2_UB(src1, src1, src2, src2, mask, mask, src1, src2);
    VSHF_B2_UB(src3, src3, src4, src4, mask, mask, src3, src4);
    DOTP_UB4_UH(src1, src2, src3, src4, coeff_hz_vec, coeff_hz_vec,
                coeff_hz_vec, coeff_hz_vec, res_hz1, res_hz2, res_hz3, res_hz4);
    MUL4(res_hz1, coeff_vt_vec0, res_hz2, coeff_vt_vec0, res_hz3, coeff_vt_vec0,
         res_hz4, coeff_vt_vec0, res_vt0, res_vt1, res_vt2, res_vt3);
    res_vt0 += (res_hz0 * coeff_vt_vec1);
    res_vt1 += (res_hz1 * coeff_vt_vec1);
    res_vt2 += (res_hz2 * coeff_vt_vec1);
    res_vt3 += (res_hz3 * coeff_vt_vec1);
    SRARI_H4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 6);
    SAT_UH4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 7);
    PCKEV_B2_UB(res_vt1, res_vt0, res_vt3, res_vt2, out0, out1);

static void avc_chroma_hv_and_aver_dst_8x8_msa(uint8_t *src, uint8_t *dst,
                                               int32_t stride,
                                               uint32_t coef_hor0,
                                               uint32_t coef_hor1,
                                               uint32_t coef_ver0,
                                               uint32_t coef_ver1)
{
    uint64_t tp0, tp1, tp2, tp3;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16u8 out0, out1, out2, out3;
    v16u8 dst0 = { 0 }, dst1 = { 0 }, dst2 = { 0 }, dst3 = { 0 };
    v8u16 res_hz0, res_hz1, res_hz2, res_hz3, res_hz4;
    v8u16 res_hz5, res_hz6, res_hz7, res_hz8;
    v8u16 res_vt0, res_vt1, res_vt2, res_vt3;
    v8u16 res_vt4, res_vt5, res_vt6, res_vt7;
    v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
    v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
    v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
    v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
    v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);

    LD_UB5(src, stride, src0, src1, src2, src3, src4);
    src += (5 * stride);
    LD_UB4(src, stride, src5, src6, src7, src8);
    src0 = (v16u8) __msa_vshf_b(mask, (v16i8) src0, (v16i8) src0);
    VSHF_B2_UB(src1, src1, src2, src2, mask, mask, src1, src2);
    VSHF_B2_UB(src3, src3, src4, src4, mask, mask, src3, src4);
    VSHF_B2_UB(src5, src5, src6, src6, mask, mask, src5, src6);
    VSHF_B2_UB(src7, src7, src8, src8, mask, mask, src7, src8);
    res_hz0 = __msa_dotp_u_h(src0, coeff_hz_vec);
    DOTP_UB4_UH(src1, src2, src3, src4, coeff_hz_vec, coeff_hz_vec,
                coeff_hz_vec, coeff_hz_vec, res_hz1, res_hz2, res_hz3,
                res_hz4);
    DOTP_UB4_UH(src5, src6, src7, src8, coeff_hz_vec, coeff_hz_vec,
                coeff_hz_vec, coeff_hz_vec, res_hz5, res_hz6, res_hz7, res_hz8);
    MUL4(res_hz1, coeff_vt_vec0, res_hz2, coeff_vt_vec0, res_hz3,
         coeff_vt_vec0, res_hz4, coeff_vt_vec0, res_vt0, res_vt1, res_vt2,
         res_vt3);
    MUL4(res_hz5, coeff_vt_vec0, res_hz6, coeff_vt_vec0, res_hz7,
         coeff_vt_vec0, res_hz8, coeff_vt_vec0, res_vt4, res_vt5, res_vt6,
         res_vt7);
    LD4(dst, stride, tp0, tp1, tp2, tp3);
    INSERT_D2_UB(tp0, tp1, dst0);
    INSERT_D2_UB(tp2, tp3, dst1);
    LD4(dst + 4 * stride, stride, tp0, tp1, tp2, tp3);
    INSERT_D2_UB(tp0, tp1, dst2);
    INSERT_D2_UB(tp2, tp3, dst3);
    res_vt0 += (res_hz0 * coeff_vt_vec1);
    res_vt1 += (res_hz1 * coeff_vt_vec1);
    res_vt2 += (res_hz2 * coeff_vt_vec1);
    res_vt3 += (res_hz3 * coeff_vt_vec1);
    res_vt4 += (res_hz4 * coeff_vt_vec1);
    res_vt5 += (res_hz5 * coeff_vt_vec1);
    res_vt6 += (res_hz6 * coeff_vt_vec1);
    res_vt7 += (res_hz7 * coeff_vt_vec1);
    SRARI_H4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 6);
    SRARI_H4_UH(res_vt4, res_vt5, res_vt6, res_vt7, 6);
    SAT_UH4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 7);
    SAT_UH4_UH(res_vt4, res_vt5, res_vt6, res_vt7, 7);
    PCKEV_B2_UB(res_vt1, res_vt0, res_vt3, res_vt2, out0, out1);
    PCKEV_B2_UB(res_vt5, res_vt4, res_vt7, res_vt6, out2, out3);
    AVER_UB4_UB(out0, dst0, out1, dst1, out2, dst2, out3, dst3, out0, out1,
                out2, out3);
    ST8x8_UB(out0, out1, out2, out3, dst, stride);

static void avc_chroma_hv_and_aver_dst_8w_msa(uint8_t *src, uint8_t *dst,
                                              int32_t stride,
                                              uint32_t coef_hor0,
                                              uint32_t coef_hor1,
                                              uint32_t coef_ver0,
                                              uint32_t coef_ver1,
                                              int32_t height)
{
    if (4 == height) {
        avc_chroma_hv_and_aver_dst_8x4_msa(src, dst, stride, coef_hor0,
                                           coef_hor1, coef_ver0, coef_ver1);
    } else if (8 == height) {
        avc_chroma_hv_and_aver_dst_8x8_msa(src, dst, stride, coef_hor0,
                                           coef_hor1, coef_ver0, coef_ver1);
    }
}

static void copy_width4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                            int32_t height)
{
    uint32_t tp0, tp1, tp2, tp3, tp4, tp5, tp6, tp7;

    if (8 == height) {
        LW4(src, stride, tp0, tp1, tp2, tp3);
        src += 4 * stride;
        LW4(src, stride, tp4, tp5, tp6, tp7);
        SW4(tp0, tp1, tp2, tp3, dst, stride);
        dst += 4 * stride;
        SW4(tp4, tp5, tp6, tp7, dst, stride);
    } else if (4 == height) {
        LW4(src, stride, tp0, tp1, tp2, tp3);
        SW4(tp0, tp1, tp2, tp3, dst, stride);
    } else if (2 == height) {
        LW2(src, stride, tp0, tp1);
        SW(tp0, dst);
        dst += stride;
        SW(tp1, dst);
    }
}
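
/* With x == 0 && y == 0 no filtering is needed, so the put path degenerates
 * to straight copies using 32-bit (width 4) or 64-bit (width 8) scalar
 * loads and stores. */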

static void copy_width8_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                            int32_t height)
{
    uint64_t src0, src1, src2, src3, src4, src5, src6, src7;

    if (8 == height) {
        LD4(src, stride, src0, src1, src2, src3);
        src += 4 * stride;
        LD4(src, stride, src4, src5, src6, src7);
        SD4(src0, src1, src2, src3, dst, stride);
        dst += 4 * stride;
        SD4(src4, src5, src6, src7, dst, stride);
    } else if (4 == height) {
        LD4(src, stride, src0, src1, src2, src3);
        SD4(src0, src1, src2, src3, dst, stride);
    }
}

static void avg_width4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                           int32_t height)
{
    uint32_t tp0, tp1, tp2, tp3;
    v16u8 src0 = { 0 }, src1 = { 0 }, dst0 = { 0 }, dst1 = { 0 };

    if (8 == height) {
        LW4(src, stride, tp0, tp1, tp2, tp3);
        src += 4 * stride;
        INSERT_W4_UB(tp0, tp1, tp2, tp3, src0);
        LW4(src, stride, tp0, tp1, tp2, tp3);
        INSERT_W4_UB(tp0, tp1, tp2, tp3, src1);
        LW4(dst, stride, tp0, tp1, tp2, tp3);
        INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
        LW4(dst + 4 * stride, stride, tp0, tp1, tp2, tp3);
        INSERT_W4_UB(tp0, tp1, tp2, tp3, dst1);
        AVER_UB2_UB(src0, dst0, src1, dst1, dst0, dst1);
        ST4x8_UB(dst0, dst1, dst, stride);
    } else if (4 == height) {
        LW4(src, stride, tp0, tp1, tp2, tp3);
        INSERT_W4_UB(tp0, tp1, tp2, tp3, src0);
        LW4(dst, stride, tp0, tp1, tp2, tp3);
        INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
        dst0 = __msa_aver_u_b(src0, dst0);
        ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dst, stride);
    } else if (2 == height) {
        LW2(src, stride, tp0, tp1);
        INSERT_W2_UB(tp0, tp1, src0);
        LW2(dst, stride, tp0, tp1);
        INSERT_W2_UB(tp0, tp1, dst0);
        dst0 = __msa_aver_u_b(src0, dst0);
        ST4x2_UB(dst0, dst, stride);
    }
}
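
/* Same idea for the unfiltered avg path: load both blocks, blend with
 * __msa_aver_u_b, store.  The { 0 } initializers keep the unused upper
 * vector lanes defined before the partial INSERT_* fills. */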

static void avg_width8_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                           int32_t height)
{
    uint64_t tp0, tp1, tp2, tp3, tp4, tp5, tp6, tp7;
    v16u8 src0 = { 0 }, src1 = { 0 }, src2 = { 0 }, src3 = { 0 };
    v16u8 dst0 = { 0 }, dst1 = { 0 }, dst2 = { 0 }, dst3 = { 0 };

    if (8 == height) {
        LD4(src, stride, tp0, tp1, tp2, tp3);
        src += 4 * stride;
        LD4(src, stride, tp4, tp5, tp6, tp7);
        INSERT_D2_UB(tp0, tp1, src0);
        INSERT_D2_UB(tp2, tp3, src1);
        INSERT_D2_UB(tp4, tp5, src2);
        INSERT_D2_UB(tp6, tp7, src3);
        LD4(dst, stride, tp0, tp1, tp2, tp3);
        LD4(dst + 4 * stride, stride, tp4, tp5, tp6, tp7);
        INSERT_D2_UB(tp0, tp1, dst0);
        INSERT_D2_UB(tp2, tp3, dst1);
        INSERT_D2_UB(tp4, tp5, dst2);
        INSERT_D2_UB(tp6, tp7, dst3);
        AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, dst0, dst1,
                    dst2, dst3);
        ST8x8_UB(dst0, dst1, dst2, dst3, dst, stride);
    } else if (4 == height) {
        LD4(src, stride, tp0, tp1, tp2, tp3);
        INSERT_D2_UB(tp0, tp1, src0);
        INSERT_D2_UB(tp2, tp3, src1);
        LD4(dst, stride, tp0, tp1, tp2, tp3);

void ff_put_h264_chroma_mc8_msa(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
                                int height, int x, int y)
{
    av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
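
/* In all six public entry points, x and y are the eighth-pel chroma MV
 * fractions; av_assert2() (an assert for speed-critical code) checks that
 * they lie in [0, 8) before a width/height-specialized kernel is picked. */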

void ff_put_h264_chroma_mc4_msa(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
                                int height, int x, int y)
{
    av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);

void ff_put_h264_chroma_mc2_msa(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
                                int height, int x, int y)
{
    int32_t cnt;

    av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);

    /* x == 0 && y == 0: no filtering, plain halfword copy */
    for (cnt = height; cnt--;) {
        *((uint16_t *) dst) = *((uint16_t *) src);

        src += stride;
        dst += stride;
    }

void ff_avg_h264_chroma_mc8_msa(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
                                int height, int x, int y)
{
    av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);

void ff_avg_h264_chroma_mc4_msa(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
                                int height, int x, int y)
{
    av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);

void ff_avg_h264_chroma_mc2_msa(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
                                int height, int x, int y)
{
    int32_t cnt;

    av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);

    /* x == 0 && y == 0: no filtering, just the rounding average with dst */
    for (cnt = height; cnt--;) {
        dst[0] = (dst[0] + src[0] + 1) >> 1;
        dst[1] = (dst[1] + src[1] + 1) >> 1;

        src += stride;
        dst += stride;
    }
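
/* A minimal scalar model of the interpolation every kernel above vectorizes
 * (an illustrative sketch, not part of the original file; the helper name is
 * hypothetical).  Per the H.264 spec, with fractions x, y in [0, 8): */
static av_unused void chroma_mc_scalar_ref(uint8_t *dst, const uint8_t *src,
                                           ptrdiff_t stride, int w, int h,
                                           int x, int y)
{
    int i, j;

    for (j = 0; j < h; j++) {
        for (i = 0; i < w; i++) {
            /* bilinear blend of the four neighbouring samples; +32 rounds,
             * scale 64 = 8 * 8 */
            dst[i] = ((8 - x) * (8 - y) * src[i]
                      + x * (8 - y) * src[i + 1]
                      + (8 - x) * y * src[i + stride]
                      + x * y * src[i + stride + 1] + 32) >> 6;
        }
        src += stride;
        dst += stride;
    }
}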