    0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
    0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20
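/*
 * The two byte rows above are the tail of a shuffle-mask table used by the
 * VSHF gathers below (8-wide and 4-wide patterns). The helper macros that
 * follow do the bi-weighted blend: the two weights sit as halfwords in each
 * 32-bit lane of 'wgt' and the combined offset is preloaded into the
 * accumulator, so one dot-product-add computes
 *     vec * weight0 + in * weight1 + offset
 * per lane, which is then rounding-right-shifted by 'rnd' and clipped to
 * [0, 255]. The _MAX_SATU variants differ only in the saturating clip
 * macro. The first kernel body after the macros is the 4-wide bi-weighted
 * copy.
 */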
#define HEVC_BIW_RND_CLIP2(in0, in1, vec0, vec1, wgt, rnd, offset,       \
                           out0, out1)                                   \
{                                                                        \
    v4i32 out0_r, out1_r, out0_l, out1_l;                                \
                                                                         \
    ILVR_H2_SW(in0, vec0, in1, vec1, out0_r, out1_r);                    \
    ILVL_H2_SW(in0, vec0, in1, vec1, out0_l, out1_l);                    \
                                                                         \
    out0_r = __msa_dpadd_s_w(offset, (v8i16) out0_r, (v8i16) wgt);       \
    out1_r = __msa_dpadd_s_w(offset, (v8i16) out1_r, (v8i16) wgt);       \
    out0_l = __msa_dpadd_s_w(offset, (v8i16) out0_l, (v8i16) wgt);       \
    out1_l = __msa_dpadd_s_w(offset, (v8i16) out1_l, (v8i16) wgt);       \
                                                                         \
    SRAR_W4_SW(out0_r, out1_r, out0_l, out1_l, rnd);                     \
    PCKEV_H2_SH(out0_l, out0_r, out1_l, out1_r, out0, out1);             \
    CLIP_SH2_0_255(out0, out1);                                          \
}

#define HEVC_BIW_RND_CLIP4(in0, in1, in2, in3, vec0, vec1, vec2, vec3,       \
                           wgt, rnd, offset, out0, out1, out2, out3)         \
{                                                                            \
    HEVC_BIW_RND_CLIP2(in0, in1, vec0, vec1, wgt, rnd, offset, out0, out1);  \
    HEVC_BIW_RND_CLIP2(in2, in3, vec2, vec3, wgt, rnd, offset, out2, out3);  \
}

#define HEVC_BIW_RND_CLIP2_MAX_SATU(in0, in1, vec0, vec1, wgt, rnd,      \
                                    offset, out0, out1)                  \
{                                                                        \
    v4i32 out0_r, out1_r, out0_l, out1_l;                                \
                                                                         \
    ILVR_H2_SW(in0, vec0, in1, vec1, out0_r, out1_r);                    \
    ILVL_H2_SW(in0, vec0, in1, vec1, out0_l, out1_l);                    \
    out0_r = __msa_dpadd_s_w(offset, (v8i16) out0_r, (v8i16) wgt);       \
    out1_r = __msa_dpadd_s_w(offset, (v8i16) out1_r, (v8i16) wgt);       \
    out0_l = __msa_dpadd_s_w(offset, (v8i16) out0_l, (v8i16) wgt);       \
    out1_l = __msa_dpadd_s_w(offset, (v8i16) out1_l, (v8i16) wgt);       \
    SRAR_W4_SW(out0_r, out1_r, out0_l, out1_l, rnd);                     \
    PCKEV_H2_SH(out0_l, out0_r, out1_l, out1_r, out0, out1);             \
    CLIP_SH2_0_255_MAX_SATU(out0, out1);                                 \
}

#define HEVC_BIW_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, vec0, vec1, vec2,  \
                                    vec3, wgt, rnd, offset, out0, out1,    \
                                    out2, out3)                            \
{                                                                          \
    HEVC_BIW_RND_CLIP2_MAX_SATU(in0, in1, vec0, vec1, wgt, rnd, offset,    \
                                out0, out1);                               \
    HEVC_BIW_RND_CLIP2_MAX_SATU(in2, in3, vec2, vec3, wgt, rnd, offset,    \
                                out2, out3);                               \
}

    uint32_t loop_cnt, tp0, tp1, tp2, tp3;
    uint64_t tpd0, tpd1, tpd2, tpd3;
    v8i16 in0 = { 0 }, in1 = { 0 }, in2 = { 0 }, in3 = { 0 };
    v8i16 dst0, dst1, dst2, dst3, weight_vec;
    v4i32 dst0_r, dst0_l, offset_vec, rnd_vec;

    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);

    offset_vec = __msa_fill_w(offset);
    weight_vec = (v8i16) __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    if (2 == height) {
        LW2(src0_ptr, src_stride, tp0, tp1);
        LD2(src1_ptr, src2_stride, tpd0, tpd1);

        dst0 = (v8i16) __msa_ilvr_b(zero, src0);

        dst0_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst0_r, weight_vec);
        dst0_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst0_l, weight_vec);

        dst0 = (v8i16) __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);

        out0 = (v16u8) __msa_pckev_b((v16i8) dst0, (v16i8) dst0);
    } else if (4 == height) {
        LW4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
        LD4(src1_ptr, src2_stride, tpd0, tpd1, tpd2, tpd3);

        HEVC_BIW_RND_CLIP2(dst0, dst1, in0, in1, weight_vec, rnd_vec,
                           offset_vec, dst0, dst1);
        out0 = (v16u8) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
        ST4x4_UB(out0, out0, 0, 1, 2, 3, dst, dst_stride);
    } else if (0 == height % 8) {
        for (loop_cnt = (height >> 3); loop_cnt--;) {
            LW4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
            src0_ptr += 4 * src_stride;

            LW4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
            src0_ptr += 4 * src_stride;

            LD4(src1_ptr, src2_stride, tpd0, tpd1, tpd2, tpd3);
            src1_ptr += (4 * src2_stride);

            LD4(src1_ptr, src2_stride, tpd0, tpd1, tpd2, tpd3);
            src1_ptr += (4 * src2_stride);

            SLLI_4V(dst0, dst1, dst2, dst3, 6);
            HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3, in0, in1, in2,
                               in3, weight_vec, rnd_vec, offset_vec,
                               dst0, dst1, dst2, dst3);

            ST4x8_UB(out0, out1, dst, dst_stride);
            dst += (8 * dst_stride);
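/* Bi-weighted copy, 6-wide: blends the (<< 6) promoted reference rows with
 * the 16-bit rows from src1_ptr, four rows per iteration, storing 6 of
 * each 8 computed bytes per row. */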
    uint64_t tp0, tp1, tp2, tp3;
    v8i16 in0, in1, in2, in3;
    v8i16 dst0, dst1, dst2, dst3;
    v4i32 offset_vec, weight_vec, rnd_vec;

    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);

    weight_vec = __msa_fill_w(weight);
    offset_vec = __msa_fill_w(offset);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
        src0_ptr += (4 * src_stride);

        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        src1_ptr += (4 * src2_stride);

        SLLI_4V(dst0, dst1, dst2, dst3, 6);
        HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
                           in0, in1, in2, in3,
                           weight_vec, rnd_vec, offset_vec,
                           dst0, dst1, dst2, dst3);

        ST6x4_UB(out0, out1, dst, dst_stride);
        dst += (4 * dst_stride);
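/* Bi-weighted copy, 8-wide, with special cases for heights 2 and 6 and a
 * generic multiple-of-4 row loop. */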
    uint64_t tp0, tp1, tp2, tp3;
    v16u8 out0, out1, out2;
    v16i8 src0 = { 0 }, src1 = { 0 }, src2 = { 0 };
    v8i16 in0, in1, in2, in3, in4, in5;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
    v4i32 offset_vec, weight_vec, rnd_vec;

    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    if (2 == height) {
        LD2(src0_ptr, src_stride, tp0, tp1);
        LD_SH2(src1_ptr, src2_stride, in0, in1);

        HEVC_BIW_RND_CLIP2(dst0, dst1, in0, in1,
                           weight_vec, rnd_vec, offset_vec,
                           dst0, dst1);

        out0 = (v16u8) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
    } else if (6 == height) {
        LD4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
        src0_ptr += 4 * src_stride;

        LD2(src0_ptr, src_stride, tp0, tp1);

        LD_SH6(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5);
        SLLI_4V(dst0, dst1, dst2, dst3, 6);
        HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
                           in0, in1, in2, in3,
                           weight_vec, rnd_vec, offset_vec, dst0, dst1,
                           dst2, dst3);
        HEVC_BIW_RND_CLIP2(dst4, dst5, in4, in5, weight_vec, rnd_vec,
                           offset_vec, dst4, dst5);
        PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
        ST8x4_UB(out0, out1, dst, dst_stride);
        dst += (4 * dst_stride);
    } else if (0 == height % 4) {
        for (loop_cnt = (height >> 2); loop_cnt--;) {
            LD4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
            src0_ptr += (4 * src_stride);

            LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
            src1_ptr += (4 * src2_stride);

            SLLI_4V(dst0, dst1, dst2, dst3, 6);
            HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3, in0, in1, in2,
                               in3, weight_vec, rnd_vec, offset_vec,
                               dst0, dst1, dst2, dst3);

            ST8x4_UB(out0, out1, dst, dst_stride);
            dst += (4 * dst_stride);
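/* Bi-weighted copy, 12-wide: the left 8 columns and right 4 columns of
 * each row are blended separately (dst0..dst3 vs dst4/dst5). */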
    v16u8 out0, out1, out2;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
    v4i32 offset_vec, weight_vec, rnd_vec;

    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    for (loop_cnt = (16 >> 2); loop_cnt--;) {
        LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
        src0_ptr += (4 * src_stride);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7);
        src1_ptr += (4 * src2_stride);

        ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
                   dst0, dst1, dst2, dst3);
        SLLI_4V(dst0, dst1, dst2, dst3, 6);
        ILVL_W2_SB(src1, src0, src3, src2, src0, src1);
        ILVR_B2_SH(zero, src0, zero, src1, dst4, dst5);

        HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
                           in0, in1, in2, in3,
                           weight_vec, rnd_vec, offset_vec, dst0, dst1,
                           dst2, dst3);
        HEVC_BIW_RND_CLIP2(dst4, dst5, in4, in5, weight_vec, rnd_vec,
                           offset_vec, dst4, dst5);
        PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
        ST12x4_UB(out0, out1, out2, dst, dst_stride);
        dst += (4 * dst_stride);
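/* Bi-weighted copy, 16-wide: right/left byte interleaves split each row
 * into two 8-column halves, processed with the saturating clip variant. */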
    v16u8 out0, out1, out2, out3;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
    v4i32 offset_vec, weight_vec, rnd_vec;

    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
        src0_ptr += (4 * src_stride);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7);
        src1_ptr += (4 * src2_stride);
        ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, tmp0, tmp1,
                   tmp2, tmp3);
        ILVL_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, tmp4, tmp5,
                   tmp6, tmp7);
        SLLI_4V(tmp0, tmp1, tmp2, tmp3, 6);
        SLLI_4V(tmp4, tmp5, tmp6, tmp7, 6);
        HEVC_BIW_RND_CLIP4_MAX_SATU(tmp0, tmp1, tmp4, tmp5, in0, in1, in4, in5,
                                    weight_vec, rnd_vec, offset_vec, tmp0, tmp1,
                                    tmp4, tmp5);
        HEVC_BIW_RND_CLIP4_MAX_SATU(tmp2, tmp3, tmp6, tmp7, in2, in3, in6, in7,
                                    weight_vec, rnd_vec, offset_vec, tmp2, tmp3,
                                    tmp6, tmp7);

        ST_UB4(out0, out1, out2, out3, dst, dst_stride);
        dst += (4 * dst_stride);
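/* Bi-weighted copy, 24-wide: 16 + 8 columns per row; loop_cnt = 8
 * iterations of four rows each. */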
    v16u8 out0, out1, out2, out3, out4, out5;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, zero = { 0 };
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8, dst9, dst10;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, dst11;
    v4i32 offset_vec, weight_vec, rnd_vec;

    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    for (loop_cnt = 8; loop_cnt--;) {
        LD_SB4(src0_ptr, src_stride, src0, src1, src4, src5);
        LD_SB4(src0_ptr + 16, src_stride, src2, src3, src6, src7);
        src0_ptr += (4 * src_stride);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7);
        LD_SH4(src1_ptr + 16, src2_stride, in8, in9, in10, in11);
        src1_ptr += (4 * src2_stride);

        ILVR_B2_SH(zero, src2, zero, src3, dst4, dst5);
        ILVR_B2_SH(zero, src6, zero, src7, dst10, dst11);
        SLLI_4V(dst0, dst1, dst2, dst3, 6);
        SLLI_4V(dst4, dst5, dst6, dst7, 6);
        SLLI_4V(dst8, dst9, dst10, dst11, 6);
        HEVC_BIW_RND_CLIP4_MAX_SATU(dst0, dst1, dst2, dst3, in0, in1, in2, in3,
                                    weight_vec, rnd_vec, offset_vec, dst0, dst1,
                                    dst2, dst3);
        HEVC_BIW_RND_CLIP4_MAX_SATU(dst4, dst5, dst6, dst7, in4, in5, in6, in7,
                                    weight_vec, rnd_vec, offset_vec, dst4, dst5,
                                    dst6, dst7);
        HEVC_BIW_RND_CLIP4_MAX_SATU(dst8, dst9, dst10, dst11, in8, in9, in10,
                                    in11, weight_vec, rnd_vec, offset_vec,
                                    dst8, dst9, dst10, dst11);
        PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
        PCKEV_B3_UB(dst7, dst6, dst9, dst8, dst11, dst10, out3, out4, out5);
        ST_UB4(out0, out1, out3, out4, dst, dst_stride);
        ST8x4_UB(out2, out5, dst + 16, dst_stride);
        dst += (4 * dst_stride);
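/* Bi-weighted copy, 32-wide: two 16-byte vectors per row, two rows per
 * iteration. */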
    v16u8 out0, out1, out2, out3;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
    v4i32 offset_vec, weight_vec, rnd_vec;

    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        LD_SB2(src0_ptr, 16, src0, src1);
        src0_ptr += src_stride;
        LD_SB2(src0_ptr, 16, src2, src3);
        src0_ptr += src_stride;
        LD_SH4(src1_ptr, 8, in0, in1, in2, in3);
        src1_ptr += src2_stride;
        LD_SH4(src1_ptr, 8, in4, in5, in6, in7);
        src1_ptr += src2_stride;

        SLLI_4V(tmp0, tmp1, tmp2, tmp3, 6);
        SLLI_4V(tmp4, tmp5, tmp6, tmp7, 6);
        HEVC_BIW_RND_CLIP4_MAX_SATU(tmp0, tmp4, tmp1, tmp5, in0, in1, in2, in3,
                                    weight_vec, rnd_vec, offset_vec, tmp0, tmp4,
                                    tmp1, tmp5);
        HEVC_BIW_RND_CLIP4_MAX_SATU(tmp2, tmp6, tmp3, tmp7, in4, in5, in6, in7,
                                    weight_vec, rnd_vec, offset_vec, tmp2, tmp6,
                                    tmp3, tmp7);

        ST_UB2(out0, out1, dst, 16);
        dst += dst_stride;
        ST_UB2(out2, out3, dst, 16);
        dst += dst_stride;
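/* Bi-weighted copy, 48-wide: three 16-byte vectors per row, one row per
 * iteration over 64 rows. */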
    v16u8 out0, out1, out2;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, in0, in1, in2, in3, in4, in5;
    v4i32 offset_vec, weight_vec, rnd_vec;

    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    for (loop_cnt = 64; loop_cnt--;) {
        LD_SB3(src0_ptr, 16, src0, src1, src2);
        src0_ptr += src_stride;
        LD_SH6(src1_ptr, 8, in0, in1, in2, in3, in4, in5);
        src1_ptr += src2_stride;

        SLLI_4V(dst0, dst1, dst2, dst3, 6);
        HEVC_BIW_RND_CLIP4_MAX_SATU(dst0, dst1, dst2, dst3, in0, in1, in2, in3,
                                    weight_vec, rnd_vec, offset_vec, dst0, dst1,
                                    dst2, dst3);
        HEVC_BIW_RND_CLIP2_MAX_SATU(dst4, dst5, in4, in5, weight_vec, rnd_vec,
                                    offset_vec, dst4, dst5);
        PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
        ST_UB2(out0, out1, dst, 16);
        ST_UB(out2, dst + 32);
        dst += dst_stride;
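/* Bi-weighted copy, 64-wide: four 16-byte vectors per row, one row per
 * iteration. */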
    v16u8 out0, out1, out2, out3;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
    v4i32 offset_vec, weight_vec, rnd_vec;

    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    for (loop_cnt = height; loop_cnt--;) {
        LD_SB4(src0_ptr, 16, src0, src1, src2, src3);
        src0_ptr += src_stride;
        LD_SH8(src1_ptr, 8, in0, in1, in2, in3, in4, in5, in6, in7);
        src1_ptr += src2_stride;

        ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, tmp0, tmp1,
                   tmp2, tmp3);
        ILVL_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, tmp4, tmp5,
                   tmp6, tmp7);
        SLLI_4V(tmp0, tmp1, tmp2, tmp3, 6);
        SLLI_4V(tmp4, tmp5, tmp6, tmp7, 6);
        HEVC_BIW_RND_CLIP4_MAX_SATU(tmp0, tmp4, tmp1, tmp5, in0, in1, in2, in3,
                                    weight_vec, rnd_vec, offset_vec, tmp0, tmp4,
                                    tmp1, tmp5);
        HEVC_BIW_RND_CLIP4_MAX_SATU(tmp2, tmp6, tmp3, tmp7, in4, in5, in6, in7,
                                    weight_vec, rnd_vec, offset_vec, tmp2, tmp6,
                                    tmp3, tmp7);

        ST_UB4(out0, out1, out2, out3, dst, 16);
        dst += dst_stride;
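/*
 * Horizontal 8-tap bi-weighted filters. The 8-bit source is biased to the
 * signed range (XORI with 128), gathered into tap order with VSHF masks
 * and filtered with dot-product accumulations; 'constant = 128 * weight1'
 * feeds the offset correction for that bias. This first body handles
 * 4-wide blocks, packing two rows per VSHF gather.
 */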
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask1, mask2, mask3;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 in0, in1, in2, in3;
    v8i16 filter_vec, out0, out1;
    v4i32 weight_vec, offset_vec, rnd_vec;

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);
    constant = 128 * weight1;

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
        src0_ptr += (4 * src_stride);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        src1_ptr += (4 * src2_stride);

        VSHF_B4_SB(src0, src1, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        VSHF_B4_SB(src2, src3, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);

        HEVC_BIW_RND_CLIP2(dst0, dst1, in0, in1,
                           weight_vec, rnd_vec, offset_vec,
                           out0, out1);

        out0 = (v8i16) __msa_pckev_b((v16i8) out1, (v16i8) out0);
        ST4x4_UB(out0, out0, 0, 1, 2, 3, dst, dst_stride);
        dst += (4 * dst_stride);
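/* Horizontal 8-tap bi-weighted filter, 8-wide: four rows per iteration,
 * one VSHF gather per row. */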
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask1, mask2, mask3;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 in0, in1, in2, in3;
    v8i16 filter_vec, out0, out1, out2, out3;
    v4i32 weight_vec, offset_vec, rnd_vec;

    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);
    constant = 128 * weight1;

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
        src0_ptr += (4 * src_stride);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        src1_ptr += (4 * src2_stride);

        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);

        HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3, in0, in1, in2, in3,
                           weight_vec, rnd_vec, offset_vec,
                           out0, out1, out2, out3);

        ST8x4_UB(out0, out1, dst, dst_stride);
        dst += (4 * dst_stride);
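/* Horizontal 8-tap bi-weighted filter, 12-wide: the left 8 columns use
 * mask0..mask3 per row; the right 4 columns pack two rows per gather via
 * mask4..mask7. */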
    v16i8 src0, src1, src2, src3, vec0, vec1, vec2, vec3;
    v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
    v8i16 filt0, filt1, filt2, filt3, out0, out1, out2, out3;
    v8i16 dst0, dst1, dst2, dst3, in0, in1, in2, in3, filter_vec;
    v4i32 weight_vec, offset_vec, rnd_vec;

    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);
    constant = 128 * weight1;

    offset = (offset0 + offset1) << rnd_val;

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    for (loop_cnt = 4; loop_cnt--;) {
        LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);

        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
                   vec3);
        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
                   vec3);
        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
                   vec3);
        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
                   vec3);

        HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3, in0, in1, in2, in3,
                           weight_vec, rnd_vec, offset_vec, out0, out1, out2,
                           out3);

        ST8x4_UB(out0, out1, dst, dst_stride);

        LD_SB4(src0_ptr + 8, src_stride, src0, src1, src2, src3);
        src0_ptr += (4 * src_stride);
        LD_SH4(src1_ptr + 8, src2_stride, in0, in1, in2, in3);
        src1_ptr += (4 * src2_stride);

        VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7, vec0, vec1, vec2,
                   vec3);
        VSHF_B4_SB(src2, src3, mask4, mask5, mask6, mask7, vec0, vec1, vec2,
                   vec3);

        HEVC_BIW_RND_CLIP2(dst0, dst1, in0, in1, weight_vec, rnd_vec,
                           offset_vec, out0, out1);
        out0 = (v8i16) __msa_pckev_b((v16i8) out1, (v16i8) out0);
        ST4x4_UB(out0, out0, 0, 1, 2, 3, dst + 8, dst_stride);
        dst += (4 * dst_stride);
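/* Horizontal 8-tap bi-weighted filter, 16-wide: two 8-pixel vectors per
 * row, two rows per iteration. */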
    v8i16 in0, in1, in2, in3;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask1, mask2, mask3;
    v8i16 filter_vec, out0, out1, out2, out3;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2, dst3;
    v4i32 weight_vec, offset_vec, rnd_vec;
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };

    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);
    constant = 128 * weight1;

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        LD_SB2(src0_ptr, 8, src0, src1);
        src0_ptr += src_stride;
        LD_SB2(src0_ptr, 8, src2, src3);
        src0_ptr += src_stride;
        LD_SH2(src1_ptr, 8, in0, in1);
        src1_ptr += src2_stride;
        LD_SH2(src1_ptr, 8, in2, in3);
        src1_ptr += src2_stride;

        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);

        HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3, in0, in1, in2, in3,
                           weight_vec, rnd_vec, offset_vec,
                           out0, out1, out2, out3);

        ST_SH2(out0, out1, dst, dst_stride);
        dst += (2 * dst_stride);
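/* Horizontal 8-tap bi-weighted filter, 24-wide: software-pipelined; the
 * next row's loads are issued before the current row's 16 + 8 results are
 * stored, with an epilogue for the final row (loop_cnt = 31 + 1). */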
    v8i16 in0, in1, in2;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2;
    v4i32 dst2_r, dst2_l;
    v8i16 filter_vec, out0, out1, out2;
    v4i32 weight_vec, offset_vec, rnd_vec;

    src0_ptr = src0_ptr - 3;
    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);
    constant = 128 * weight1;

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    LD_SB2(src0_ptr, 16, src0, src1);
    src0_ptr += src_stride;
    LD_SH2(src1_ptr, 8, in0, in1);
    in2 = LD_SH(src1_ptr + 16);
    src1_ptr += src2_stride;

    for (loop_cnt = 31; loop_cnt--;) {
        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
                   vec0, vec1, vec2, vec3);
        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);

        HEVC_BIW_RND_CLIP2(dst0, dst1, in0, in1,
                           weight_vec, rnd_vec, offset_vec,
                           out0, out1);

        dst2_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst2_r,
                                 (v8i16) weight_vec);
        dst2_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst2_l,
                                 (v8i16) weight_vec);

        dst2_r = (v4i32) __msa_pckev_h((v8i16) dst2_l, (v8i16) dst2_r);

        LD_SB2(src0_ptr, 16, src0, src1);
        src0_ptr += src_stride;
        LD_SH2(src1_ptr, 8, in0, in1);
        in2 = LD_SH(src1_ptr + 16);
        src1_ptr += src2_stride;

        dst_val0 = __msa_copy_u_d((v2i64) out2, 0);
        SD(dst_val0, dst + 16);
        dst += dst_stride;
    }

    VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
    VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7, vec0, vec1, vec2, vec3);
    VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);

    dst2_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst2_r, (v8i16) weight_vec);
    dst2_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst2_l, (v8i16) weight_vec);

    dst2_r = (v4i32) __msa_pckev_h((v8i16) dst2_l, (v8i16) dst2_r);

    dst_val0 = __msa_copy_u_d((v2i64) out2, 0);
    SD(dst_val0, dst + 16);
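/* Horizontal 8-tap bi-weighted filter, 32-wide: one row per iteration;
 * src2 reloads the bytes at offset 24 so a single-register mask can filter
 * the final 8 columns. */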
    v8i16 in0, in1, in2, in3;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 filter_vec, out0, out1, out2, out3;
    v4i32 weight_vec, offset_vec, rnd_vec;

    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);
    constant = 128 * weight1;

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    for (loop_cnt = height; loop_cnt--;) {
        LD_SB2(src0_ptr, 16, src0, src1);
        src2 = LD_SB(src0_ptr + 24);
        src0_ptr += src_stride;
        LD_SH4(src1_ptr, 8, in0, in1, in2, in3);
        src1_ptr += src2_stride;

        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
                   vec0, vec1, vec2, vec3);
        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);

        HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3, in0, in1, in2, in3,
                           weight_vec, rnd_vec, offset_vec,
                           out0, out1, out2, out3);

        ST_SH2(out0, out1, dst, 16);
        dst += dst_stride;
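/* Horizontal 8-tap bi-weighted filter, 48-wide: the 32-wide pattern plus
 * an extra 16 columns filtered from src3/src4. */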
    v16i8 src0, src1, src2, src3, src4;
    v8i16 in0, in1, in2, in3;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 filter_vec, out0, out1, out2, out3;
    v4i32 weight_vec, offset_vec, rnd_vec;

    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);
    constant = 128 * weight1;

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    for (loop_cnt = 64; loop_cnt--;) {
        LD_SB2(src0_ptr, 16, src0, src1);
        src2 = LD_SB(src0_ptr + 24);
        LD_SH4(src1_ptr, 8, in0, in1, in2, in3);

        LD_SB2(src0_ptr + 32, 8, src3, src4);
        src0_ptr += src_stride;

        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
                   vec0, vec1, vec2, vec3);
        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);

        HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3, in0, in1, in2, in3,
                           weight_vec, rnd_vec, offset_vec,
                           out0, out1, out2, out3);

        ST_SH2(out0, out1, dst, 16);

        LD_SH2(src1_ptr + 32, 8, in2, in3);
        src1_ptr += src2_stride;

        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);

        HEVC_BIW_RND_CLIP2(dst0, dst1, in2, in3,
                           weight_vec, rnd_vec, offset_vec,
                           out0, out1);

        out0 = (v8i16) __msa_pckev_b((v16i8) out1, (v16i8) out0);
        ST_SH(out0, dst + 32);
        dst += dst_stride;
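/* Horizontal 8-tap bi-weighted filter, 64-wide: an inner cnt = 2 loop
 * applies the 32-wide pattern to both halves of each row. */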
    int16_t *src1_ptr_tmp;
    uint32_t loop_cnt, cnt;
    v8i16 in0, in1, in2, in3;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 filter_vec, out0, out1, out2, out3;
    v4i32 weight_vec, offset_vec, rnd_vec;

    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);
    constant = 128 * weight1;

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    for (loop_cnt = height; loop_cnt--;) {
        src0_ptr_tmp = src0_ptr;
        dst_tmp = dst;
        src1_ptr_tmp = src1_ptr;

        for (cnt = 2; cnt--;) {
            LD_SB2(src0_ptr_tmp, 16, src0, src1);
            src2 = LD_SB(src0_ptr_tmp + 24);

            LD_SH4(src1_ptr_tmp, 8, in0, in1, in2, in3);

            VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
                       vec0, vec1, vec2, vec3);
            VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
                       vec0, vec1, vec2, vec3);
            VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
                       vec0, vec1, vec2, vec3);
            VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
                       vec0, vec1, vec2, vec3);

            HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3, in0, in1, in2, in3,
                               weight_vec, rnd_vec, offset_vec,
                               out0, out1, out2, out3);

            ST_SH2(out0, out1, dst_tmp, 16);
        }

        src0_ptr += src_stride;
        src1_ptr += src2_stride;
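/*
 * Vertical 8-tap bi-weighted filters. Rows are interleaved pairwise so the
 * column filter becomes byte dot products over sliding row pairs; the
 * signed-range bias is undone by folding (128 << 6) * weight1 into
 * offset_vec. This body handles 4-wide blocks, eight rows per iteration,
 * with two 4-pixel columns packed per vector.
 */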
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 src11, src12, src13, src14;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
    v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
    v16i8 src1110_r, src1211_r, src1312_r, src1413_r;
    v16i8 src2110, src4332, src6554, src8776, src10998;
    v16i8 src12111110, src14131312;
    v8i16 dst10, dst32, dst54, dst76;
    v8i16 filt0, filt1, filt2, filt3;
    v8i16 filter_vec, out0, out1, out2, out3;
    v4i32 weight_vec, weight1_vec, offset_vec, rnd_vec, const_vec;

    src0_ptr -= (3 * src_stride);
    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);

    const_vec = __msa_ldi_w(128);
    const_vec <<= 6;
    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);
    weight1_vec = __msa_fill_w(weight1);
    offset_vec += const_vec * weight1_vec;

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src0_ptr += (7 * src_stride);

    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
               src10_r, src32_r, src54_r, src21_r);
    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
    ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r,
               src2110, src4332, src6554);

    for (loop_cnt = (height >> 3); loop_cnt--;) {
        LD_SB8(src0_ptr, src_stride,
               src7, src8, src9, src10, src11, src12, src13, src14);
        src0_ptr += (8 * src_stride);
        LD_SH8(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5, in6, in7);
        src1_ptr += (8 * src2_stride);

        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
                   src76_r, src87_r, src98_r, src109_r);
        ILVR_B4_SB(src11, src10, src12, src11, src13, src12, src14, src13,
                   src1110_r, src1211_r, src1312_r, src1413_r);
        ILVR_D4_SB(src87_r, src76_r, src109_r, src98_r, src1211_r, src1110_r,
                   src1413_r, src1312_r,
                   src8776, src10998, src12111110, src14131312);

        DOTP_SB4_SH(src2110, src4332, src6554, src8776, filt0, filt0, filt0,
                    filt0, dst10, dst32, dst54, dst76);
        DPADD_SB4_SH(src4332, src6554, src8776, src10998, filt1, filt1, filt1,
                     filt1, dst10, dst32, dst54, dst76);
        DPADD_SB4_SH(src6554, src8776, src10998, src12111110, filt2, filt2,
                     filt2, filt2, dst10, dst32, dst54, dst76);
        DPADD_SB4_SH(src8776, src10998, src12111110, src14131312, filt3, filt3,
                     filt3, filt3, dst10, dst32, dst54, dst76);

        HEVC_BIW_RND_CLIP4(dst10, dst32, dst54, dst76, in0, in1, in2, in3,
                           weight_vec, rnd_vec, offset_vec,
                           out0, out1, out2, out3);

        ST4x8_UB(out0, out1, dst, dst_stride);
        dst += (8 * dst_stride);

        src2110 = src10998;
        src4332 = src12111110;
        src6554 = src14131312;
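/* Vertical 8-tap bi-weighted filter, 8-wide: four rows per iteration
 * using right interleaves only. */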
    v16i8 src0, src1, src2, src3, src4, src5;
    v16i8 src6, src7, src8, src9, src10;
    v8i16 in0, in1, in2, in3;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
    v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
    v8i16 tmp0, tmp1, tmp2, tmp3;
    v8i16 filt0, filt1, filt2, filt3;
    v8i16 filter_vec, out0, out1, out2, out3;
    v4i32 weight_vec, weight1_vec, offset_vec, rnd_vec, const_vec;

    src0_ptr -= (3 * src_stride);
    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);

    const_vec = __msa_ldi_w(128);
    const_vec <<= 6;
    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);
    weight1_vec = __msa_fill_w(weight1);
    offset_vec += const_vec * weight1_vec;

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src0_ptr += (7 * src_stride);

    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
               src10_r, src32_r, src54_r, src21_r);
    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src0_ptr, src_stride, src7, src8, src9, src10);
        src0_ptr += (4 * src_stride);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        src1_ptr += (4 * src2_stride);

        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
                   src76_r, src87_r, src98_r, src109_r);

        DOTP_SB4_SH(src10_r, src21_r, src32_r, src43_r, filt0, filt0, filt0,
                    filt0, tmp0, tmp1, tmp2, tmp3);
        DPADD_SB4_SH(src32_r, src43_r, src54_r, src65_r, filt1, filt1, filt1,
                     filt1, tmp0, tmp1, tmp2, tmp3);
        DPADD_SB4_SH(src54_r, src65_r, src76_r, src87_r, filt2, filt2, filt2,
                     filt2, tmp0, tmp1, tmp2, tmp3);
        DPADD_SB4_SH(src76_r, src87_r, src98_r, src109_r, filt3, filt3, filt3,
                     filt3, tmp0, tmp1, tmp2, tmp3);

        HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3, in0, in1, in2, in3,
                           weight_vec, rnd_vec, offset_vec,
                           out0, out1, out2, out3);

        ST8x4_UB(out0, out1, dst, dst_stride);
        dst += (4 * dst_stride);
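/* Vertical 8-tap bi-weighted filter, 12-wide: 8 columns via the _r
 * interleaves plus 4 columns packed from the _l halves
 * (src2110/src4332/...), two rows per iteration. */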
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v8i16 in0, in1, in2, in3;
    v16i8 src10_r, src32_r, src54_r, src76_r;
    v16i8 src21_r, src43_r, src65_r, src87_r;
    v8i16 tmp0, tmp1, tmp2;
    v16i8 src10_l, src32_l, src54_l, src76_l;
    v16i8 src21_l, src43_l, src65_l, src87_l;
    v16i8 src2110, src4332, src6554, src8776;
    v8i16 filt0, filt1, filt2, filt3;
    v8i16 out0, out1, out2, filter_vec;
    v4i32 dst2_r, dst2_l;
    v4i32 weight_vec, weight1_vec, offset_vec, rnd_vec, const_vec;

    src0_ptr -= (3 * src_stride);
    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);

    const_vec = __msa_ldi_w(128);
    const_vec <<= 6;
    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);
    weight1_vec = __msa_fill_w(weight1);
    offset_vec += const_vec * weight1_vec;

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src0_ptr += (7 * src_stride);

    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
               src10_r, src32_r, src54_r, src21_r);
    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
    ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
               src10_l, src32_l, src54_l, src21_l);
    ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
    ILVR_D3_SB(src21_l, src10_l, src43_l, src32_l, src65_l, src54_l,
               src2110, src4332, src6554);

    for (loop_cnt = 8; loop_cnt--;) {
        LD_SB2(src0_ptr, src_stride, src7, src8);
        src0_ptr += (2 * src_stride);
        LD_SH2(src1_ptr, src2_stride, in0, in1);
        LD_SH2((src1_ptr + 8), src2_stride, in2, in3);
        src1_ptr += (2 * src2_stride);
        in2 = (v8i16) __msa_ilvr_d((v2i64) in3, (v2i64) in2);

        ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
        ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l);
        src8776 = (v16i8) __msa_ilvr_d((v2i64) src87_l, (v2i64) src76_l);

        DOTP_SB3_SH(src10_r, src21_r, src2110, filt0, filt0, filt0,
                    tmp0, tmp1, tmp2);
        DPADD_SB2_SH(src32_r, src43_r, filt1, filt1, tmp0, tmp1);
        tmp2 = __msa_dpadd_s_h(tmp2, src4332, (v16i8) filt1);
        DPADD_SB2_SH(src54_r, src65_r, filt2, filt2, tmp0, tmp1);
        tmp2 = __msa_dpadd_s_h(tmp2, src6554, (v16i8) filt2);
        DPADD_SB2_SH(src76_r, src87_r, filt3, filt3, tmp0, tmp1);
        tmp2 = __msa_dpadd_s_h(tmp2, src8776, (v16i8) filt3);

        HEVC_BIW_RND_CLIP2(tmp0, tmp1, in0, in1,
                           weight_vec, rnd_vec, offset_vec,
                           out0, out1);

        dst2_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst2_r,
                                 (v8i16) weight_vec);
        dst2_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst2_l,
                                 (v8i16) weight_vec);

        dst2_r = (v4i32) __msa_pckev_h((v8i16) dst2_l, (v8i16) dst2_r);

        ST4x2_UB(out2, dst + 8, dst_stride);
        dst += (2 * dst_stride);
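/* Vertical 8-tap bi-weighted filter for any multiple of 16 columns: an
 * outer loop steps 16-column stripes, the inner loop filters two rows
 * using both right and left interleaves. */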
    int16_t *src1_ptr_tmp;
    uint32_t loop_cnt, cnt;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v8i16 in0, in1, in2, in3;
    v16i8 src10_r, src32_r, src54_r, src76_r;
    v16i8 src21_r, src43_r, src65_r, src87_r;
    v16i8 src10_l, src32_l, src54_l, src76_l;
    v16i8 src21_l, src43_l, src65_l, src87_l;
    v8i16 tmp0, tmp1, tmp2, tmp3;
    v8i16 filt0, filt1, filt2, filt3;
    v8i16 out0, out1, out2, out3;
    v4i32 weight_vec, weight1_vec, offset_vec, rnd_vec, const_vec;

    src0_ptr -= (3 * src_stride);

    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);

    const_vec = __msa_ldi_w(128);
    const_vec <<= 6;
    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);
    weight1_vec = __msa_fill_w(weight1);
    offset_vec += const_vec * weight1_vec;

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    for (cnt = (width >> 4); cnt--;) {
        src0_ptr_tmp = src0_ptr;
        src1_ptr_tmp = src1_ptr;
        dst_tmp = dst;

        LD_SB7(src0_ptr_tmp, src_stride,
               src0, src1, src2, src3, src4, src5, src6);
        src0_ptr_tmp += (7 * src_stride);

        ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
                   src10_r, src32_r, src54_r, src21_r);
        ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
        ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
                   src10_l, src32_l, src54_l, src21_l);
        ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);

        for (loop_cnt = (height >> 1); loop_cnt--;) {
            LD_SB2(src0_ptr_tmp, src_stride, src7, src8);
            src0_ptr_tmp += (2 * src_stride);
            LD_SH2(src1_ptr_tmp, src2_stride, in0, in1);
            LD_SH2((src1_ptr_tmp + 8), src2_stride, in2, in3);
            src1_ptr_tmp += (2 * src2_stride);

            ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
            ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l);

            DOTP_SB4_SH(src10_r, src21_r, src10_l, src21_l, filt0, filt0,
                        filt0, filt0, tmp0, tmp1, tmp2, tmp3);
            DPADD_SB4_SH(src32_r, src43_r, src32_l, src43_l, filt1, filt1,
                         filt1, filt1, tmp0, tmp1, tmp2, tmp3);
            DPADD_SB4_SH(src54_r, src65_r, src54_l, src65_l, filt2, filt2,
                         filt2, filt2, tmp0, tmp1, tmp2, tmp3);
            DPADD_SB4_SH(src76_r, src87_r, src76_l, src87_l, filt3, filt3,
                         filt3, filt3, tmp0, tmp1, tmp2, tmp3);

            HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3, in0, in1, in2, in3,
                               weight_vec, rnd_vec, offset_vec,
                               out0, out1, out2, out3);

            ST_SH2(out0, out1, dst_tmp, dst_stride);
            dst_tmp += (2 * dst_stride);
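/* Width-specific vertical wrappers (16/24/32/48/64 columns) dispatching to
 * the 16-column-multiple kernel above, plus an extra 8-wide call for the
 * 24-column case; the reconstructed callee names follow FFmpeg's
 * hevc_mc_biw_msa.c. */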
    hevc_vt_biwgt_8t_16multx2mult_msa(src0_ptr, src_stride,
                                      src1_ptr, src2_stride,
                                      dst, dst_stride, filter, height,
                                      weight0, weight1, offset0, offset1,
                                      rnd_val, 16);

    hevc_vt_biwgt_8t_16multx2mult_msa(src0_ptr, src_stride,
                                      src1_ptr, src2_stride,
                                      dst, dst_stride, filter, height,
                                      weight0, weight1, offset0, offset1,
                                      rnd_val, 16);
    hevc_vt_biwgt_8t_8w_msa(src0_ptr + 16, src_stride,
                            src1_ptr + 16, src2_stride,
                            dst + 16, dst_stride, filter, height,
                            weight0, weight1, offset0, offset1, rnd_val);

    hevc_vt_biwgt_8t_16multx2mult_msa(src0_ptr, src_stride,
                                      src1_ptr, src2_stride,
                                      dst, dst_stride, filter, height,
                                      weight0, weight1, offset0, offset1,
                                      rnd_val, 32);

    hevc_vt_biwgt_8t_16multx2mult_msa(src0_ptr, src_stride,
                                      src1_ptr, src2_stride,
                                      dst, dst_stride, filter, height,
                                      weight0, weight1, offset0, offset1,
                                      rnd_val, 48);

    hevc_vt_biwgt_8t_16multx2mult_msa(src0_ptr, src_stride,
                                      src1_ptr, src2_stride,
                                      dst, dst_stride, filter, height,
                                      weight0, weight1, offset0, offset1,
                                      rnd_val, 64);
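/*
 * 2-D (horizontal then vertical) 8-tap bi-weighted filters. The horizontal
 * pass produces 16-bit intermediates for seven priming rows, the vertical
 * pass runs HEVC_FILT_8TAP over interleaved column histories, and the
 * result is blended with src1_ptr exactly as in the 1-D kernels. Function
 * names and full signatures in this hv section are reconstructed from the
 * parameter fragments left in this excerpt. This body handles 4-wide
 * blocks.
 */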
static void hevc_hv_biwgt_8t_4w_msa(uint8_t *src0_ptr, int32_t src_stride,
                                    int16_t *src1_ptr, int32_t src2_stride,
                                    uint8_t *dst, int32_t dst_stride,
                                    const int8_t *filter_x,
                                    const int8_t *filter_y,
                                    int32_t height, int32_t weight0,
                                    int32_t weight1, int32_t offset0,
                                    int32_t offset1, int32_t rnd_val)
{
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v8i16 in0 = { 0 }, in1 = { 0 };
    v8i16 filt0, filt1, filt2, filt3;
    v8i16 filt_h0, filt_h1, filt_h2, filt_h3;
    v16i8 mask1, mask2, mask3;
    v8i16 filter_vec, weight_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
    v8i16 dst30, dst41, dst52, dst63, dst66, dst87;
    v8i16 tmp0, tmp1, tmp2, tmp3;
    v8i16 dst10, dst32, dst54, dst76;
    v8i16 dst21, dst43, dst65, dst97, dst108, dst109, dst98;
    v4i32 offset_vec, rnd_vec, const_vec, dst0, dst1, dst2, dst3;

    src0_ptr -= ((3 * src_stride) + 3);

    filter_vec = LD_SH(filter_x);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    filter_vec = LD_SH(filter_y);

    SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);

    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);

    const_vec = __msa_fill_w((128 * weight1));
    const_vec <<= 6;
    offset_vec = __msa_fill_w(offset);
    rnd_vec = __msa_fill_w(rnd_val + 1);
    offset_vec += const_vec;
    weight_vec = (v8i16) __msa_fill_w(weight);

    LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src0_ptr += (7 * src_stride);

    VSHF_B4_SB(src0, src3, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
    VSHF_B4_SB(src1, src4, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
    VSHF_B4_SB(src2, src5, mask0, mask1, mask2, mask3,
               vec8, vec9, vec10, vec11);
    VSHF_B4_SB(src3, src6, mask0, mask1, mask2, mask3,
               vec12, vec13, vec14, vec15);

    dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1);

    for (loop_cnt = height >> 2; loop_cnt--;) {
        LD_SB4(src0_ptr, src_stride, src7, src8, src9, src10);
        src0_ptr += (4 * src_stride);

        LD2(src1_ptr, src2_stride, tp0, tp1);
        src1_ptr += (2 * src2_stride);
        LD2(src1_ptr, src2_stride, tp0, tp1);
        src1_ptr += (2 * src2_stride);

        VSHF_B4_SB(src7, src9, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        VSHF_B4_SB(src8, src10, mask0, mask1, mask2, mask3,
                   vec4, vec5, vec6, vec7);

        dst76 = __msa_ilvr_h(dst97, dst66);

        dst66 = (v8i16) __msa_splati_d((v2i64) dst97, 1);
        dst98 = __msa_ilvr_h(dst66, dst108);

        dst0 = HEVC_FILT_8TAP(dst10, dst32, dst54, dst76, filt_h0, filt_h1,
                              filt_h2, filt_h3);
        dst1 = HEVC_FILT_8TAP(dst21, dst43, dst65, dst87, filt_h0, filt_h1,
                              filt_h2, filt_h3);
        dst2 = HEVC_FILT_8TAP(dst32, dst54, dst76, dst98, filt_h0, filt_h1,
                              filt_h2, filt_h3);
        dst3 = HEVC_FILT_8TAP(dst43, dst65, dst87, dst109, filt_h0, filt_h1,
                              filt_h2, filt_h3);
        SRA_4V(dst0, dst1, dst2, dst3, 6);

        dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
        dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
        dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
        dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);

        out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
        ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
        dst += (4 * dst_stride);

        dst66 = (v8i16) __msa_splati_d((v2i64) dst108, 1);
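/* 2-D 8-tap bi-weighted filter for multiples of 8 columns, processed as
 * width8mult stripes of 8. */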
static void hevc_hv_biwgt_8t_8multx2mult_msa(uint8_t *src0_ptr,
                                             int32_t src_stride,
                                             int16_t *src1_ptr,
                                             int32_t src2_stride,
                                             uint8_t *dst, int32_t dst_stride,
                                             const int8_t *filter_x,
                                             const int8_t *filter_y,
                                             int32_t height, int32_t weight0,
                                             int32_t weight1, int32_t offset0,
                                             int32_t offset1, int32_t rnd_val,
                                             int32_t width8mult)
{
    uint32_t loop_cnt, cnt;
    int16_t *src1_ptr_tmp;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v8i16 filt0, filt1, filt2, filt3;
    v8i16 filt_h0, filt_h1, filt_h2, filt_h3;
    v16i8 mask1, mask2, mask3;
    v8i16 filter_vec, weight_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
    v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
    v8i16 tmp0, tmp1, tmp2, tmp3;
    v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
    v8i16 dst10_l, dst32_l, dst54_l, dst76_l;
    v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
    v8i16 dst21_l, dst43_l, dst65_l, dst87_l;
    v4i32 offset_vec, rnd_vec, const_vec;

    src0_ptr -= ((3 * src_stride) + 3);

    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);

    const_vec = __msa_fill_w((128 * weight1));
    const_vec <<= 6;
    offset_vec = __msa_fill_w(offset);
    rnd_vec = __msa_fill_w(rnd_val + 1);
    offset_vec += const_vec;
    weight_vec = (v8i16) __msa_fill_w(weight);

    filter_vec = LD_SH(filter_x);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    filter_vec = LD_SH(filter_y);

    SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);

    for (cnt = width8mult; cnt--;) {
        src0_ptr_tmp = src0_ptr;
        src1_ptr_tmp = src1_ptr;
        dst_tmp = dst;

        LD_SB7(src0_ptr_tmp, src_stride,
               src0, src1, src2, src3, src4, src5, src6);
        src0_ptr_tmp += (7 * src_stride);

        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
                   vec4, vec5, vec6, vec7);
        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
                   vec8, vec9, vec10, vec11);
        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
                   vec12, vec13, vec14, vec15);

        VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3,
                   vec4, vec5, vec6, vec7);
        VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3,
                   vec8, vec9, vec10, vec11);

        for (loop_cnt = height >> 1; loop_cnt--;) {
            LD_SB2(src0_ptr_tmp, src_stride, src7, src8);

            src0_ptr_tmp += 2 * src_stride;

            LD_SH2(src1_ptr_tmp, src2_stride, in0, in1);
            src1_ptr_tmp += (2 * src2_stride);

            ILVR_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1, dst10_r,
                       dst32_r, dst54_r, dst21_r);
            ILVL_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1, dst10_l,
                       dst32_l, dst54_l, dst21_l);
            ILVR_H2_SH(dst4, dst3, dst6, dst5, dst43_r, dst65_r);
            ILVL_H2_SH(dst4, dst3, dst6, dst5, dst43_l, dst65_l);

            VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3,
                       vec0, vec1, vec2, vec3);

            dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r,
                                    filt_h0, filt_h1, filt_h2, filt_h3);
            dst0_l = HEVC_FILT_8TAP(dst10_l, dst32_l, dst54_l, dst76_l,
                                    filt_h0, filt_h1, filt_h2, filt_h3);

            VSHF_B4_SB(src8, src8, mask0, mask1, mask2, mask3,
                       vec0, vec1, vec2, vec3);

            dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r,
                                    filt_h0, filt_h1, filt_h2, filt_h3);
            dst1_l = HEVC_FILT_8TAP(dst21_l, dst43_l, dst65_l, dst87_l,
                                    filt_h0, filt_h1, filt_h2, filt_h3);

            PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, tmp1, tmp3);

            dst0_r = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
            dst0_l = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
            dst1_r = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
            dst1_l = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
            SRAR_W4_SW(dst0_l, dst0_r, dst1_l, dst1_r, rnd_vec);

            PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, tmp0, tmp1);
            out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
            ST8x2_UB(out, dst_tmp, dst_stride);
            dst_tmp += (2 * dst_stride);
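/* 8-wide wrapper: a single 8-column stripe. */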
static void hevc_hv_biwgt_8t_8w_msa(uint8_t *src0_ptr, int32_t src_stride,
                                    int16_t *src1_ptr, int32_t src2_stride,
                                    uint8_t *dst, int32_t dst_stride,
                                    const int8_t *filter_x,
                                    const int8_t *filter_y,
                                    int32_t height, int32_t weight0,
                                    int32_t weight1, int32_t offset0,
                                    int32_t offset1, int32_t rnd_val)
{
    hevc_hv_biwgt_8t_8multx2mult_msa(src0_ptr, src_stride,
                                     src1_ptr, src2_stride,
                                     dst, dst_stride, filter_x, filter_y,
                                     height, weight0, weight1, offset0,
                                     offset1, rnd_val, 1);
}
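/* 12-wide: an 8-column stripe via the dsth path, then a 4-column stripe
 * that pairs rows per vector using mask4..mask7. */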
static void hevc_hv_biwgt_8t_12w_msa(uint8_t *src0_ptr, int32_t src_stride,
                                     int16_t *src1_ptr, int32_t src2_stride,
                                     uint8_t *dst, int32_t dst_stride,
                                     const int8_t *filter_x,
                                     const int8_t *filter_y,
                                     int32_t height, int32_t weight0,
                                     int32_t weight1, int32_t offset0,
                                     int32_t offset1, int32_t rnd_val)
{
    uint8_t *src0_ptr_tmp, *dst_tmp;
    int16_t *src1_ptr_tmp;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
    v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
    v8i16 in0 = { 0 }, in1 = { 0 };
    v8i16 filter_vec, weight_vec, tmp0, tmp1, tmp2, tmp3;
    v8i16 filt0, filt1, filt2, filt3, filt_h0, filt_h1, filt_h2, filt_h3;
    v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6, dsth7, dsth8;
    v8i16 dst10_r, dst32_r, dst54_r, dst76_r, dst21_r, dst43_r, dst65_r;
    v8i16 dst10_l, dst32_l, dst54_l, dst76_l, dst21_l, dst43_l, dst65_l;
    v8i16 dst30, dst41, dst52, dst63, dst66, dst87, dst10, dst32, dst54, dst76;
    v8i16 dst21, dst43, dst65, dst97, dst108, dst109, dst98, dst87_r, dst87_l;
    v4i32 offset_vec, rnd_vec, const_vec, dst0, dst1, dst2, dst3;

    src0_ptr -= ((3 * src_stride) + 3);

    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);

    const_vec = __msa_fill_w((128 * weight1));
    const_vec <<= 6;
    offset_vec = __msa_fill_w(offset);
    rnd_vec = __msa_fill_w(rnd_val + 1);
    offset_vec += const_vec;
    weight_vec = (v8i16) __msa_fill_w(weight);

    filter_vec = LD_SH(filter_x);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    filter_vec = LD_SH(filter_y);

    SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);

    src0_ptr_tmp = src0_ptr;
    dst_tmp = dst;
    src1_ptr_tmp = src1_ptr;

    LD_SB7(src0_ptr_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src0_ptr_tmp += (7 * src_stride);

    VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
    VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
    VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3, vec8, vec9, vec10,
               vec11);
    VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3, vec12, vec13, vec14,
               vec15);

    VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
    VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
    VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3, vec8, vec9, vec10,
               vec11);

    for (loop_cnt = 8; loop_cnt--;) {
        LD_SB2(src0_ptr_tmp, src_stride, src7, src8);
        src0_ptr_tmp += (2 * src_stride);

        LD_SH2(src1_ptr_tmp, src2_stride, in0, in1);
        src1_ptr_tmp += (2 * src2_stride);

        ILVR_H4_SH(dsth1, dsth0, dsth3, dsth2, dsth5, dsth4, dsth2, dsth1,
                   dst10_r, dst32_r, dst54_r, dst21_r);
        ILVL_H4_SH(dsth1, dsth0, dsth3, dsth2, dsth5, dsth4, dsth2, dsth1,
                   dst10_l, dst32_l, dst54_l, dst21_l);
        ILVR_H2_SH(dsth4, dsth3, dsth6, dsth5, dst43_r, dst65_r);
        ILVL_H2_SH(dsth4, dsth3, dsth6, dsth5, dst43_l, dst65_l);

        VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
                   vec3);

        dst0 = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r, filt_h0,
                              filt_h1, filt_h2, filt_h3);
        dst1 = HEVC_FILT_8TAP(dst10_l, dst32_l, dst54_l, dst76_l, filt_h0,
                              filt_h1, filt_h2, filt_h3);

        VSHF_B4_SB(src8, src8, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
                   vec3);

        dst2 = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r, filt_h0,
                              filt_h1, filt_h2, filt_h3);
        dst3 = HEVC_FILT_8TAP(dst21_l, dst43_l, dst65_l, dst87_l, filt_h0,
                              filt_h1, filt_h2, filt_h3);

        dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
        dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
        dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
        dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);

        out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
        ST8x2_UB(out, dst_tmp, dst_stride);
        dst_tmp += (2 * dst_stride);
    }

    LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src0_ptr += (7 * src_stride);

    VSHF_B4_SB(src0, src3, mask4, mask5, mask6, mask7, vec0, vec1, vec2, vec3);
    VSHF_B4_SB(src1, src4, mask4, mask5, mask6, mask7, vec4, vec5, vec6, vec7);
    VSHF_B4_SB(src2, src5, mask4, mask5, mask6, mask7, vec8, vec9, vec10,
               vec11);
    VSHF_B4_SB(src3, src6, mask4, mask5, mask6, mask7, vec12, vec13, vec14,
               vec15);

    dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1);

    for (loop_cnt = 4; loop_cnt--;) {
        LD_SB4(src0_ptr, src_stride, src7, src8, src9, src10);
        src0_ptr += (4 * src_stride);

        LD2(src1_ptr, src2_stride, tp0, tp1);
        src1_ptr += (2 * src2_stride);
        LD2(src1_ptr, src2_stride, tp0, tp1);
        src1_ptr += (2 * src2_stride);

        VSHF_B4_SB(src7, src9, mask4, mask5, mask6, mask7, vec0, vec1, vec2,
                   vec3);
        VSHF_B4_SB(src8, src10, mask4, mask5, mask6, mask7, vec4, vec5, vec6,
                   vec7);

        dst76 = __msa_ilvr_h(dst97, dst66);

        dst66 = (v8i16) __msa_splati_d((v2i64) dst97, 1);
        dst98 = __msa_ilvr_h(dst66, dst108);

        dst0 = HEVC_FILT_8TAP(dst10, dst32, dst54, dst76, filt_h0, filt_h1,
                              filt_h2, filt_h3);
        dst1 = HEVC_FILT_8TAP(dst21, dst43, dst65, dst87, filt_h0, filt_h1,
                              filt_h2, filt_h3);
        dst2 = HEVC_FILT_8TAP(dst32, dst54, dst76, dst98, filt_h0, filt_h1,
                              filt_h2, filt_h3);
        dst3 = HEVC_FILT_8TAP(dst43, dst65, dst87, dst109, filt_h0, filt_h1,
                              filt_h2, filt_h3);
        SRA_4V(dst0, dst1, dst2, dst3, 6);

        dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
        dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
        dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
        dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);

        out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
        ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
        dst += (4 * dst_stride);

        dst66 = (v8i16) __msa_splati_d((v2i64) dst108, 1);
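/* Width wrappers for 16/24/32/48/64 columns: 2, 3, 4, 6 and 8 stripes of
 * 8 columns respectively. */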
static void hevc_hv_biwgt_8t_16w_msa(uint8_t *src0_ptr, int32_t src_stride,
                                     int16_t *src1_ptr, int32_t src2_stride,
                                     uint8_t *dst, int32_t dst_stride,
                                     const int8_t *filter_x,
                                     const int8_t *filter_y,
                                     int32_t height, int32_t weight0,
                                     int32_t weight1, int32_t offset0,
                                     int32_t offset1, int32_t rnd_val)
{
    hevc_hv_biwgt_8t_8multx2mult_msa(src0_ptr, src_stride,
                                     src1_ptr, src2_stride,
                                     dst, dst_stride, filter_x, filter_y,
                                     height, weight0, weight1, offset0,
                                     offset1, rnd_val, 2);
}

static void hevc_hv_biwgt_8t_24w_msa(uint8_t *src0_ptr, int32_t src_stride,
                                     int16_t *src1_ptr, int32_t src2_stride,
                                     uint8_t *dst, int32_t dst_stride,
                                     const int8_t *filter_x,
                                     const int8_t *filter_y,
                                     int32_t height, int32_t weight0,
                                     int32_t weight1, int32_t offset0,
                                     int32_t offset1, int32_t rnd_val)
{
    hevc_hv_biwgt_8t_8multx2mult_msa(src0_ptr, src_stride,
                                     src1_ptr, src2_stride,
                                     dst, dst_stride, filter_x, filter_y,
                                     height, weight0, weight1, offset0,
                                     offset1, rnd_val, 3);
}

static void hevc_hv_biwgt_8t_32w_msa(uint8_t *src0_ptr, int32_t src_stride,
                                     int16_t *src1_ptr, int32_t src2_stride,
                                     uint8_t *dst, int32_t dst_stride,
                                     const int8_t *filter_x,
                                     const int8_t *filter_y,
                                     int32_t height, int32_t weight0,
                                     int32_t weight1, int32_t offset0,
                                     int32_t offset1, int32_t rnd_val)
{
    hevc_hv_biwgt_8t_8multx2mult_msa(src0_ptr, src_stride,
                                     src1_ptr, src2_stride,
                                     dst, dst_stride, filter_x, filter_y,
                                     height, weight0, weight1, offset0,
                                     offset1, rnd_val, 4);
}

static void hevc_hv_biwgt_8t_48w_msa(uint8_t *src0_ptr, int32_t src_stride,
                                     int16_t *src1_ptr, int32_t src2_stride,
                                     uint8_t *dst, int32_t dst_stride,
                                     const int8_t *filter_x,
                                     const int8_t *filter_y,
                                     int32_t height, int32_t weight0,
                                     int32_t weight1, int32_t offset0,
                                     int32_t offset1, int32_t rnd_val)
{
    hevc_hv_biwgt_8t_8multx2mult_msa(src0_ptr, src_stride,
                                     src1_ptr, src2_stride,
                                     dst, dst_stride, filter_x, filter_y,
                                     height, weight0, weight1, offset0,
                                     offset1, rnd_val, 6);
}

static void hevc_hv_biwgt_8t_64w_msa(uint8_t *src0_ptr, int32_t src_stride,
                                     int16_t *src1_ptr, int32_t src2_stride,
                                     uint8_t *dst, int32_t dst_stride,
                                     const int8_t *filter_x,
                                     const int8_t *filter_y,
                                     int32_t height, int32_t weight0,
                                     int32_t weight1, int32_t offset0,
                                     int32_t offset1, int32_t rnd_val)
{
    hevc_hv_biwgt_8t_8multx2mult_msa(src0_ptr, src_stride,
                                     src1_ptr, src2_stride,
                                     dst, dst_stride, filter_x, filter_y,
                                     height, weight0, weight1, offset0,
                                     offset1, rnd_val, 8);
}
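/*
 * Horizontal 4-tap bi-weighted filters (chroma). Same weighting scheme as
 * the 8-tap kernels, with two-tap-pair VSHF gathers. This body handles a
 * single 4x2 block.
 */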
    v16i8 mask1, vec0, vec1;
    v4i32 dst0_r, dst0_l;
    v8i16 out0, filter_vec;
    v4i32 weight_vec, offset_vec, rnd_vec;

    filter_vec = LD_SH(filter);

    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);
    constant = 128 * weight1;

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    LD_SB2(src0_ptr, src_stride, src0, src1);
    LD_SH2(src1_ptr, src2_stride, in0, in1);
    in0 = (v8i16) __msa_ilvr_d((v2i64) in1, (v2i64) in0);

    VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);

    dst0_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst0_r, (v8i16) weight_vec);
    dst0_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst0_l, (v8i16) weight_vec);

    dst0_r = (v4i32) __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);

    out0 = (v8i16) __msa_pckev_b((v16i8) out0, (v16i8) out0);
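/* Horizontal 4-tap bi-weighted filter, 4x4 block. */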
    v8i16 in0, in1, in2, in3;
    v4i32 weight_vec, offset_vec, rnd_vec;

    filter_vec = LD_SH(filter);

    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);
    constant = 128 * weight1;

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);

    LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);

    VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src2, src3, src2, src3, mask0, mask1, vec0, vec1);

    HEVC_BIW_RND_CLIP2(dst0, dst1, in0, in1,
                       weight_vec, rnd_vec, offset_vec,
                       dst0, dst1);

    dst0 = (v8i16) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
    ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dst, dst_stride);
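/* Horizontal 4-tap bi-weighted filter, 4-wide, height a multiple of 8. */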
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v4i32 weight_vec, offset_vec, rnd_vec;

    filter_vec = LD_SH(filter);

    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);
    constant = 128 * weight1;

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    for (loop_cnt = (height >> 3); loop_cnt--;) {
        LD_SB8(src0_ptr, src_stride,
               src0, src1, src2, src3, src4, src5, src6, src7);
        src0_ptr += (8 * src_stride);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        src1_ptr += (4 * src2_stride);
        LD_SH4(src1_ptr, src2_stride, in4, in5, in6, in7);
        src1_ptr += (4 * src2_stride);

        VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src2, src3, src2, src3, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src4, src5, src4, src5, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src6, src7, src6, src7, mask0, mask1, vec0, vec1);

        HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3, in0, in1, in2, in3,
                           weight_vec, rnd_vec, offset_vec,
                           dst0, dst1, dst2, dst3);

        ST4x8_UB(dst0, dst1, dst, dst_stride);
        dst += (8 * dst_stride);
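/* 4-wide dispatcher selecting the 4x2, 4x4 or 4x8-multiple body by height;
 * the reconstructed call heads follow FFmpeg's naming. */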
    if (2 == height) {
        hevc_hz_biwgt_4t_4x2_msa(src0_ptr, src_stride,
                                 src1_ptr, src2_stride,
                                 dst, dst_stride, filter,
                                 weight0, weight1, offset0, offset1, rnd_val);
    } else if (4 == height) {
        hevc_hz_biwgt_4t_4x4_msa(src0_ptr, src_stride,
                                 src1_ptr, src2_stride,
                                 dst, dst_stride, filter,
                                 weight0, weight1, offset0, offset1, rnd_val);
    } else if (0 == (height % 8)) {
        hevc_hz_biwgt_4t_4x8multiple_msa(src0_ptr, src_stride,
                                         src1_ptr, src2_stride,
                                         dst, dst_stride, filter, height,
                                         weight0, weight1, offset0, offset1,
                                         rnd_val);
    }
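/* Horizontal 4-tap bi-weighted filter, 6-wide: computes 8 columns, stores
 * 6 per row. */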
    v8i16 in0, in1, in2, in3;
    v8i16 dst0, dst1, dst2, dst3;
    v4i32 weight_vec, offset_vec, rnd_vec;

    filter_vec = LD_SH(filter);

    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);
    constant = 128 * weight1;

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    for (loop_cnt = 2; loop_cnt--;) {
        LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
        src0_ptr += (4 * src_stride);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        src1_ptr += (4 * src2_stride);

        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);

        HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3, in0, in1, in2, in3,
                           weight_vec, rnd_vec, offset_vec,
                           dst0, dst1, dst2, dst3);

        ST6x4_UB(dst0, dst1, dst, dst_stride);
        dst += (4 * dst_stride);
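/* Horizontal 4-tap bi-weighted filter, 8x2 block. */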
    v16i8 mask1, vec0, vec1;
    v4i32 weight_vec, offset_vec, rnd_vec;

    filter_vec = LD_SH(filter);

    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);
    constant = 128 * weight1;

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    LD_SB2(src0_ptr, src_stride, src0, src1);
    LD_SH2(src1_ptr, src2_stride, in0, in1);

    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);

    HEVC_BIW_RND_CLIP2(dst0, dst1, in0, in1,
                       weight_vec, rnd_vec, offset_vec,
                       dst0, dst1);

    dst0 = (v8i16) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
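/* Horizontal 4-tap bi-weighted filter, 8x6 block: four rows via CLIP4 plus
 * two rows via CLIP2. */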
    v16i8 src0, src1, src2, src3, src4, src5;
    v8i16 in0, in1, in2, in3, in4, in5;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
    v4i32 weight_vec, offset_vec, rnd_vec;

    filter_vec = LD_SH(filter);

    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);
    constant = 128 * weight1;

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    LD_SB6(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5);

    LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
    src1_ptr += (4 * src2_stride);
    LD_SH2(src1_ptr, src2_stride, in4, in5);

    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);

    HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3, in0, in1, in2, in3,
                       weight_vec, rnd_vec, offset_vec,
                       dst0, dst1, dst2, dst3);
    HEVC_BIW_RND_CLIP2(dst4, dst5, in4, in5,
                       weight_vec, rnd_vec, offset_vec,
                       dst4, dst5);

    dst3 = (v8i16) __msa_pckev_b((v16i8) dst5, (v16i8) dst4);
    ST8x4_UB(dst0, dst1, dst, dst_stride);
    dst += (4 * dst_stride);
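/* Horizontal 4-tap bi-weighted filter, 8-wide, height a multiple of 4. */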
    v8i16 in0, in1, in2, in3;
    v8i16 dst0, dst1, dst2, dst3;
    v4i32 weight_vec, offset_vec, rnd_vec;

    filter_vec = LD_SH(filter);

    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);
    constant = 128 * weight1;

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
        src0_ptr += (4 * src_stride);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        src1_ptr += (4 * src2_stride);

        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);

        HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3, in0, in1, in2, in3,
                           weight_vec, rnd_vec, offset_vec,
                           dst0, dst1, dst2, dst3);

        ST8x4_UB(dst0, dst1, dst, dst_stride);
        dst += (4 * dst_stride);
3143 dst, dst_stride, filter,
3144 weight0, weight1, offset0, offset1, rnd_val);
3145 } else if (6 == height) {
3147 dst, dst_stride, filter,
3148 weight0, weight1, offset0, offset1, rnd_val);
3149 } else if (0 == (height % 4)) {
3151 src1_ptr, src2_stride,
3152 dst, dst_stride, filter, height,
3153 weight0, weight1, offset0, offset1,
3176 v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
3179 8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
3183 v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
3185 v4i32 weight_vec, offset_vec, rnd_vec;
3189 filter_vec = LD_SH(filter);
3192 offset = (offset0 + offset1) << rnd_val;
3193 weight0 = weight0 & 0x0000FFFF;
3194 weight = weight0 | (weight1 << 16);
3195 constant = 128 * weight1;
3199 offset_vec = __msa_fill_w(offset);
3200 weight_vec = __msa_fill_w(weight);
3201 rnd_vec = __msa_fill_w(rnd_val + 1);
3206 for (loop_cnt = 4; loop_cnt--;) {
3207 LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
3208 src0_ptr += (4 * src_stride);
3209 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
3210 LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7);
3211 src1_ptr += (4 * src2_stride);
3215 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3217 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
3219 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
3221 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3223 VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);
3225 VSHF_B2_SB(src2, src3, src2, src3, mask2, mask3, vec0, vec1);
3230 weight_vec, rnd_vec, offset_vec,
3231 dst0, dst1, dst2, dst3);
3233 weight_vec, rnd_vec, offset_vec,
3237 dst3 = (v8i16) __msa_pckev_b((v16i8) dst5, (v16i8) dst4);
3238 ST12x4_UB(dst0, dst1, dst3, dst, dst_stride);
3239 dst += (4 * dst_stride);
3259 v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
3260 v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
3264 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
3267 v4i32 weight_vec, offset_vec, rnd_vec;
3271 filter_vec = LD_SH(filter);
3274 offset = (offset0 + offset1) << rnd_val;
3275 weight0 = weight0 & 0x0000FFFF;
3276 weight = weight0 | (weight1 << 16);
3277 constant = 128 * weight1;
3281 offset_vec = __msa_fill_w(offset);
3282 weight_vec = __msa_fill_w(weight);
3283 rnd_vec = __msa_fill_w(rnd_val + 1);
3287 for (loop_cnt = (height >> 2); loop_cnt--;) {
3288 LD_SB4(src0_ptr, src_stride, src0, src2, src4, src6);
3289 LD_SB4(src0_ptr + 8, src_stride, src1, src3, src5, src7);
3290 src0_ptr += (4 * src_stride);
3291 LD_SH4(src1_ptr, src2_stride, in0, in2, in4, in6);
3292 LD_SH4(src1_ptr + 8, src2_stride, in1, in3, in5, in7);
3293 src1_ptr += (4 * src2_stride);
3296 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3298 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
3300 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
3302 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3304 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
3306 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
3308 VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
3310 VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
3314 weight_vec, rnd_vec, offset_vec,
3315 dst0, dst1, dst2, dst3);
3318 ST_SH2(dst0, dst1, dst, dst_stride);
3319 dst += (2 * dst_stride);
3323 weight_vec, rnd_vec, offset_vec,
3324 dst0, dst1, dst2, dst3);
3327 ST_SH2(dst0, dst1, dst, dst_stride);
3328 dst += (2 * dst_stride);
3351 v16i8 mask1, mask2, mask3;
3353 v8i16 dst0, dst1, dst2, dst3;
3354 v8i16 in0, in1, in2, in3, in4, in5;
3356 v4i32 weight_vec, offset_vec, rnd_vec;
3360 filter_vec = LD_SH(filter);
3363 offset = (offset0 + offset1) << rnd_val;
3364 weight0 = weight0 & 0x0000FFFF;
3365 weight = weight0 | (weight1 << 16);
3366 constant = 128 * weight1;
3370 offset_vec = __msa_fill_w(offset);
3371 weight_vec = __msa_fill_w(weight);
3372 rnd_vec = __msa_fill_w(rnd_val + 1);
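/* 16 iterations of two rows each: this 24-wide path appears to assume a
 * fixed height of 32 (the only 24-wide PU size) instead of looping on the
 * height argument. */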
3378 for (loop_cnt = 16; loop_cnt--;) {
3379 LD_SB2(src0_ptr, src_stride, src0, src2);
3380 LD_SB2(src0_ptr + 16, src_stride, src1, src3);
3381 src0_ptr += (2 * src_stride);
3382 LD_SH2(src1_ptr, src2_stride, in0, in2);
3383 LD_SH2(src1_ptr + 8, src2_stride, in1, in3);
3384 LD_SH2(src1_ptr + 16, src2_stride, in4, in5);
3385 src1_ptr += (2 * src2_stride);
3388 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3390 VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);
3392 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
3394 VSHF_B2_SB(src2, src3, src2, src3, mask2, mask3, vec0, vec1);
3398 weight_vec, rnd_vec, offset_vec,
3399 dst0, dst1, dst2, dst3);
3402 ST_SH2(dst0, dst1, dst, dst_stride);
3405 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
3407 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3410 weight_vec, rnd_vec, offset_vec,
3413 dst0 = (v8i16) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
3414 ST8x2_UB(dst0, (dst + 16), dst_stride);
3415 dst += (2 * dst_stride);
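/* Each pair of 24-wide rows is written as two 16-byte ST_SH2 stores for
 * columns 0-15 plus an 8x2 byte store at column 16. */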
3438 v16i8 mask1, mask2, mask3;
3439 v8i16 dst0, dst1, dst2, dst3;
3441 v8i16 in0, in1, in2, in3;
3443 v4i32 weight_vec, offset_vec, rnd_vec;
3447 filter_vec = LD_SH(filter);
3450 offset = (offset0 + offset1) << rnd_val;
3451 weight0 = weight0 & 0x0000FFFF;
3452 weight = weight0 | (weight1 << 16);
3453 constant = 128 * weight1;
3457 offset_vec = __msa_fill_w(offset);
3458 weight_vec = __msa_fill_w(weight);
3459 rnd_vec = __msa_fill_w(rnd_val + 1);
3465 for (loop_cnt = height; loop_cnt--;) {
3466 LD_SB2(src0_ptr, 16, src0, src1);
3467 src2 = LD_SB(src0_ptr + 24);
3468 src0_ptr += src_stride;
3469 LD_SH4(src1_ptr, 8, in0, in1, in2, in3);
3470 src1_ptr += src2_stride;
3473 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3475 VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);
3477 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
3479 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
3483 weight_vec, rnd_vec, offset_vec,
3484 dst0, dst1, dst2, dst3);
3487 ST_SH2(dst0, dst1, dst, 16);
3506 v16i8 src0, src1, src2, src3, src4;
3507 v8i16 in0, in1, dst10;
3508 v16i8 src10_r, src32_r, src21_r, src43_r, src2110, src4332;
3509 v4i32 dst10_r, dst10_l;
3511 v8i16 filter_vec, out;
3512 v4i32 weight_vec, offset_vec, rnd_vec;
3514 src0_ptr -= src_stride;
3516 offset = (offset0 + offset1) << rnd_val;
3517 weight0 = weight0 & 0x0000FFFF;
3518 weight = weight0 | (weight1 << 16);
3519 constant = 128 * weight1;
3523 offset_vec = __msa_fill_w(offset);
3524 weight_vec = __msa_fill_w(weight);
3525 rnd_vec = __msa_fill_w(rnd_val + 1);
3527 filter_vec = LD_SH(filter);
3530 LD_SB3(src0_ptr, src_stride, src0, src1, src2);
3531 src0_ptr += (3 * src_stride);
3532 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3533 src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
3534 src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
3535 LD_SB2(src0_ptr, src_stride, src3, src4);
3536 src0_ptr += (2 * src_stride);
3537 LD_SH2(src1_ptr, src2_stride, in0, in1);
3538 src1_ptr += (2 * src2_stride);
3540 in0 = (v8i16) __msa_ilvr_d((v2i64) in1, (v2i64) in0);
3541 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
3542 src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_r, (v2i64) src32_r);
3543 src4332 = (v16i8) __msa_xori_b((v16u8) src4332, 128);
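/* For width 4 the vertical filter input is built by byte-interleaving
 * consecutive rows (src32_r, src43_r) and then merging two row pairs per
 * vector with ilvr_d, so one dot product covers two taps of two output rows. */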
3548 dst10_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst10_r, (v8i16) weight_vec);
3549 dst10_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst10_l, (v8i16) weight_vec);
3551 dst10_r = (v4i32) __msa_pckev_h((v8i16) dst10_l, (v8i16) dst10_r);
3553 out = (v8i16) __msa_pckev_b((v16i8) out, (v16i8) out);
3571 v16i8 src0, src1, src2, src3, src4, src5, src6;
3572 v8i16 in0, in1, in2, in3;
3573 v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r;
3574 v16i8 src2110, src4332, src6554;
3578 v4i32 weight_vec, offset_vec, rnd_vec;
3580 src0_ptr -= src_stride;
3582 offset = (offset0 + offset1) << rnd_val;
3583 weight0 = weight0 & 0x0000FFFF;
3584 weight = weight0 | (weight1 << 16);
3585 constant = 128 * weight1;
3589 offset_vec = __msa_fill_w(offset);
3590 weight_vec = __msa_fill_w(weight);
3591 rnd_vec = __msa_fill_w(rnd_val + 1);
3593 filter_vec = LD_SH(filter);
3596 LD_SB3(src0_ptr, src_stride, src0, src1, src2);
3597 src0_ptr += (3 * src_stride);
3598 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3599 src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
3600 src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
3602 LD_SB4(src0_ptr, src_stride, src3, src4, src5, src6);
3603 src0_ptr += (4 * src_stride);
3604 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
3605 src1_ptr += (4 * src2_stride);
3607 ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
3608 src32_r, src43_r, src54_r, src65_r);
3609 ILVR_D2_SB(src43_r, src32_r, src65_r, src54_r, src4332, src6554);
3616 weight_vec, rnd_vec, offset_vec,
3619 dst10 = (v8i16) __msa_pckev_b((v16i8) dst32, (v16i8) dst10);
3620 ST4x4_UB(dst10, dst10, 0, 1, 2, 3, dst, dst_stride);
3621 dst += (4 * dst_stride);
3640 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9;
3641 v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
3642 v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
3643 v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
3644 v16i8 src2110, src4332, src6554, src8776;
3645 v8i16 dst10, dst32, dst54, dst76;
3648 v4i32 weight_vec, offset_vec, rnd_vec;
3650 src0_ptr -= src_stride;
3652 offset = (offset0 + offset1) << rnd_val;
3653 weight0 = weight0 & 0x0000FFFF;
3654 weight = weight0 | (weight1 << 16);
3655 constant = 128 * weight1;
3659 offset_vec = __msa_fill_w(offset);
3660 weight_vec = __msa_fill_w(weight);
3661 rnd_vec = __msa_fill_w(rnd_val + 1);
3663 filter_vec = LD_SH(filter);
3666 LD_SB3(src0_ptr, src_stride, src0, src1, src2);
3667 src0_ptr += (3 * src_stride);
3668 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3669 src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
3670 src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
3672 for (loop_cnt = (height >> 3); loop_cnt--;) {
3673 LD_SB6(src0_ptr, src_stride, src3, src4, src5, src6, src7, src8);
3674 src0_ptr += (6 * src_stride);
3675 LD_SH8(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5, in6, in7);
3676 src1_ptr += (8 * src2_stride);
3681 ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
3682 src32_r, src43_r, src54_r, src65_r);
3683 ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
3684 ILVR_D3_SB(src43_r, src32_r, src65_r, src54_r, src87_r, src76_r,
3685 src4332, src6554, src8776);
3692 LD_SB2(src0_ptr, src_stride, src9, src2);
3693 src0_ptr += (2 * src_stride);
3694 ILVR_B2_SB(src9, src8, src2, src9, src98_r, src109_r);
3695 src2110 = (v16i8) __msa_ilvr_d((v2i64) src109_r, (v2i64) src98_r);
3696 src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
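/* The newest two rows are re-interleaved into src2110, carrying the filter
 * history across the loop boundary for the next 8-row iteration. */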
3701 weight_vec, rnd_vec, offset_vec,
3702 dst10, dst32, dst54, dst76);
3704 PCKEV_B2_SH(dst32, dst10, dst76, dst54, dst10, dst32);
3705 ST4x8_UB(dst10, dst32, dst, dst_stride);
3706 dst += (8 * dst_stride);
3726 dst, dst_stride, filter,
3727 weight0, weight1, offset0, offset1, rnd_val);
3728 } else if (4 == height) {
3730 dst, dst_stride, filter,
3731 weight0, weight1, offset0, offset1, rnd_val);
3732 } else if (0 == (height % 8)) {
3734 src1_ptr, src2_stride,
3735 dst, dst_stride, filter, height,
3736 weight0, weight1, offset0, offset1,
3757 v16i8 src0, src1, src2, src3, src4;
3758 v8i16 in0, in1, in2, in3;
3759 v16i8 src10_r, src32_r, src21_r, src43_r;
3760 v8i16 tmp0, tmp1, tmp2, tmp3;
3763 v4i32 weight_vec, offset_vec, rnd_vec;
3765 src0_ptr -= src_stride;
3767 offset = (offset0 + offset1) << rnd_val;
3768 weight0 = weight0 & 0x0000FFFF;
3769 weight = weight0 | (weight1 << 16);
3770 constant = 128 * weight1;
3774 offset_vec = __msa_fill_w(offset);
3775 weight_vec = __msa_fill_w(weight);
3776 rnd_vec = __msa_fill_w(rnd_val + 1);
3778 filter_vec = LD_SH(filter);
3781 LD_SB3(src0_ptr, src_stride, src0, src1, src2);
3782 src0_ptr += (3 * src_stride);
3784 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3786 for (loop_cnt = (height >> 2); loop_cnt--;) {
3787 LD_SB2(src0_ptr, src_stride, src3, src4);
3788 src0_ptr += (2 * src_stride);
3789 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
3790 src1_ptr += (4 * src2_stride);
3792 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
3797 LD_SB2(src0_ptr, src_stride, src1, src2);
3798 src0_ptr += (2 * src_stride);
3800 ILVR_B2_SB(src1, src4, src2, src1, src10_r, src21_r);
3806 weight_vec, rnd_vec, offset_vec,
3807 tmp0, tmp1, tmp2, tmp3);
3810 ST6x4_UB(tmp0, tmp1, dst, dst_stride);
3811 dst += (4 * dst_stride);
3829 v16i8 src0, src1, src2, src3, src4;
3830 v8i16 in0, in1, tmp0, tmp1;
3831 v16i8 src10_r, src32_r, src21_r, src43_r;
3834 v4i32 weight_vec, offset_vec, rnd_vec;
3836 src0_ptr -= src_stride;
3838 offset = (offset0 + offset1) << rnd_val;
3839 weight0 = weight0 & 0x0000FFFF;
3840 weight = weight0 | (weight1 << 16);
3841 constant = 128 * weight1;
3845 offset_vec = __msa_fill_w(offset);
3846 weight_vec = __msa_fill_w(weight);
3847 rnd_vec = __msa_fill_w(rnd_val + 1);
3849 filter_vec = LD_SH(filter);
3852 LD_SB3(src0_ptr, src_stride, src0, src1, src2);
3853 src0_ptr += (3 * src_stride);
3855 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3857 LD_SB2(src0_ptr, src_stride, src3, src4);
3858 LD_SH2(src1_ptr, src2_stride, in0, in1);
3860 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
3865 weight_vec, rnd_vec, offset_vec,
3868 tmp0 = (v8i16) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
3886 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
3887 v8i16 in0, in1, in2, in3, in4, in5;
3888 v16i8 src10_r, src32_r, src54_r, src76_r;
3889 v16i8 src21_r, src43_r, src65_r, src87_r;
3890 v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
3893 v4i32 weight_vec, offset_vec, rnd_vec;
3895 src0_ptr -= src_stride;
3897 offset = (offset0 + offset1) << rnd_val;
3898 weight0 = weight0 & 0x0000FFFF;
3899 weight = weight0 | (weight1 << 16);
3900 constant = 128 * weight1;
3904 offset_vec = __msa_fill_w(offset);
3905 weight_vec = __msa_fill_w(weight);
3906 rnd_vec = __msa_fill_w(rnd_val + 1);
3908 filter_vec = LD_SH(filter);
3911 LD_SB3(src0_ptr, src_stride, src0, src1, src2);
3912 src0_ptr += (3 * src_stride);
3914 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3916 LD_SB6(src0_ptr, src_stride, src3, src4, src5, src6, src7, src8);
3917 LD_SH6(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5);
3919 ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
3920 src32_r, src43_r, src54_r, src65_r);
3921 ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
3931 weight_vec, rnd_vec, offset_vec,
3932 tmp0, tmp1, tmp2, tmp3);
3934 weight_vec, rnd_vec, offset_vec,
3938 tmp3 = (v8i16) __msa_pckev_b((v16i8) tmp5, (v16i8) tmp4);
3939 ST8x4_UB(tmp0, tmp1, dst, dst_stride);
3940 dst += (4 * dst_stride);
3960 v16i8 src0, src1, src2, src3, src4;
3961 v8i16 in0, in1, in2, in3;
3962 v16i8 src10_r, src32_r, src21_r, src43_r;
3963 v8i16 tmp0, tmp1, tmp2, tmp3;
3966 v4i32 weight_vec, offset_vec, rnd_vec;
3968 src0_ptr -= src_stride;
3970 offset = (offset0 + offset1) << rnd_val;
3971 weight0 = weight0 & 0x0000FFFF;
3972 weight = weight0 | (weight1 << 16);
3973 constant = 128 * weight1;
3977 offset_vec = __msa_fill_w(offset);
3978 weight_vec = __msa_fill_w(weight);
3979 rnd_vec = __msa_fill_w(rnd_val + 1);
3981 filter_vec = LD_SH(filter);
3984 LD_SB3(src0_ptr, src_stride, src0, src1, src2);
3985 src0_ptr += (3 * src_stride);
3987 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3989 for (loop_cnt = (height >> 2); loop_cnt--;) {
3990 LD_SB2(src0_ptr, src_stride, src3, src4);
3991 src0_ptr += (2 * src_stride);
3992 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
3993 src1_ptr += (4 * src2_stride);
3995 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
4000 LD_SB2(src0_ptr, src_stride, src1, src2);
4001 src0_ptr += (2 * src_stride);
4003 ILVR_B2_SB(src1, src4, src2, src1, src10_r, src21_r);
4009 weight_vec, rnd_vec, offset_vec,
4010 tmp0, tmp1, tmp2, tmp3);
4013 ST8x4_UB(tmp0, tmp1, dst, dst_stride);
4014 dst += (4 * dst_stride);
4034 dst, dst_stride, filter,
4035 weight0, weight1, offset0, offset1, rnd_val);
4036 } else if (6 == height) {
4038 dst, dst_stride, filter,
4039 weight0, weight1, offset0, offset1, rnd_val);
4042 src1_ptr, src2_stride,
4043 dst, dst_stride, filter, height,
4044 weight0, weight1, offset0, offset1,
4065 v16i8 src0, src1, src2, src3, src4, src5;
4066 v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
4067 v16i8 src10_r, src32_r, src21_r, src43_r;
4068 v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
4069 v16i8 src10_l, src32_l, src54_l, src21_l, src43_l, src65_l;
4070 v16i8 src2110, src4332;
4073 v4i32 weight_vec, offset_vec, rnd_vec;
4075 src0_ptr -= (1 * src_stride);
4077 offset = (offset0 + offset1) << rnd_val;
4078 weight0 = weight0 & 0x0000FFFF;
4079 weight = weight0 | (weight1 << 16);
4080 constant = 128 * weight1;
4084 offset_vec = __msa_fill_w(offset);
4085 weight_vec = __msa_fill_w(weight);
4086 rnd_vec = __msa_fill_w(rnd_val + 1);
4088 filter_vec = LD_SH(filter);
4091 LD_SB3(src0_ptr, src_stride, src0, src1, src2);
4092 src0_ptr += (3 * src_stride);
4094 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
4095 ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
4096 src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_l, (v2i64) src10_l);
4098 for (loop_cnt = (height >> 2); loop_cnt--;) {
4099 LD_SB2(src0_ptr, src_stride, src3, src4);
4100 src0_ptr += (2 * src_stride);
4101 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
4102 LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7);
4103 src1_ptr += (4 * src2_stride);
4107 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
4108 ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
4109 src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_l, (v2i64) src32_l);
4115 LD_SB2(src0_ptr, src_stride, src5, src2);
4116 src0_ptr += (2 * src_stride);
4118 ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
4119 ILVL_B2_SB(src5, src4, src2, src5, src54_l, src65_l);
4120 src2110 = (v16i8) __msa_ilvr_d((v2i64) src65_l, (v2i64) src54_l);
4127 weight_vec, rnd_vec, offset_vec,
4128 tmp0, tmp1, tmp2, tmp3);
4130 weight_vec, rnd_vec, offset_vec,
4134 tmp2 = (v8i16) __msa_pckev_b((v16i8) tmp5, (v16i8) tmp4);
4135 ST12x4_UB(tmp0, tmp1, tmp2, dst, dst_stride);
4136 dst += (4 * dst_stride);
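/* Width 12: the right (.r) interleaves feed the 8-wide column block while
 * the left (.l) interleaves, packed pairwise above, cover columns 8-11;
 * ST12x4_UB writes both parts together. */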
4156 v16i8 src0, src1, src2, src3, src4, src5;
4157 v8i16 in0, in1, in2, in3;
4158 v16i8 src10_r, src32_r, src21_r, src43_r;
4159 v16i8 src10_l, src32_l, src21_l, src43_l;
4160 v8i16 tmp0, tmp1, tmp2, tmp3;
4163 v4i32 weight_vec, offset_vec, rnd_vec;
4165 src0_ptr -= src_stride;
4167 offset = (offset0 + offset1) << rnd_val;
4168 weight0 = weight0 & 0x0000FFFF;
4169 weight = weight0 | (weight1 << 16);
4170 constant = 128 * weight1;
4174 offset_vec = __msa_fill_w(offset);
4175 weight_vec = __msa_fill_w(weight);
4176 rnd_vec = __msa_fill_w(rnd_val + 1);
4178 filter_vec = LD_SH(filter);
4181 LD_SB3(src0_ptr, src_stride, src0, src1, src2);
4182 src0_ptr += (3 * src_stride);
4184 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
4185 ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
4187 for (loop_cnt = (height >> 2); loop_cnt--;) {
4188 LD_SB2(src0_ptr, src_stride, src3, src4);
4189 src0_ptr += (2 * src_stride);
4190 LD_SH2(src1_ptr, src2_stride, in0, in1);
4191 LD_SH2(src1_ptr + 8, src2_stride, in2, in3);
4192 src1_ptr += (2 * src2_stride);
4194 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
4195 ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
4204 weight_vec, rnd_vec, offset_vec,
4205 tmp0, tmp1, tmp2, tmp3);
4207 ST_SH2(tmp0, tmp1, dst, dst_stride);
4208 dst += (2 * dst_stride);
4209 LD_SB2(src0_ptr, src_stride, src5, src2);
4210 src0_ptr += (2 * src_stride);
4212 LD_SH2(src1_ptr, src2_stride, in0, in1);
4213 LD_SH2(src1_ptr + 8, src2_stride, in2, in3);
4214 src1_ptr += (2 * src2_stride);
4216 ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
4217 ILVL_B2_SB(src5, src4, src2, src5, src10_l, src21_l);
4225 weight_vec, rnd_vec, offset_vec,
4226 tmp0, tmp1, tmp2, tmp3);
4229 ST_SH2(tmp0, tmp1, dst, dst_stride);
4230 dst += (2 * dst_stride);
4250 v16i8 src0, src1, src2, src3, src4, src5;
4251 v16i8 src6, src7, src8, src9, src10, src11;
4252 v8i16 in0, in1, in2, in3, in4, in5;
4253 v16i8 src10_r, src32_r, src76_r, src98_r;
4254 v16i8 src10_l, src32_l, src21_l, src43_l;
4255 v16i8 src21_r, src43_r, src87_r, src109_r;
4256 v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
4259 v4i32 weight_vec, offset_vec, rnd_vec;
4261 src0_ptr -= src_stride;
4263 offset = (offset0 + offset1) << rnd_val;
4264 weight0 = weight0 & 0x0000FFFF;
4265 weight = weight0 | (weight1 << 16);
4266 constant = 128 * weight1;
4270 offset_vec = __msa_fill_w(offset);
4271 weight_vec = __msa_fill_w(weight);
4272 rnd_vec = __msa_fill_w(rnd_val + 1);
4274 filter_vec = LD_SH(filter);
4278 LD_SB3(src0_ptr, src_stride, src0, src1, src2);
4280 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
4281 ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
4283 LD_SB3(src0_ptr + 16, src_stride, src6, src7, src8);
4284 src0_ptr += (3 * src_stride);
4286 ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
4288 for (loop_cnt = (height >> 2); loop_cnt--;) {
4290 LD_SB2(src0_ptr, src_stride, src3, src4);
4291 LD_SH2(src1_ptr, src2_stride, in0, in1);
4292 LD_SH2(src1_ptr + 8, src2_stride, in2, in3);
4294 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
4295 ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
4298 LD_SB2(src0_ptr + 16, src_stride, src9, src10);
4299 src0_ptr += (2 * src_stride);
4300 LD_SH2(src1_ptr + 16, src2_stride, in4, in5);
4301 src1_ptr += (2 * src2_stride);
4303 ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
4315 weight_vec, rnd_vec, offset_vec,
4316 tmp0, tmp1, tmp4, tmp5);
4319 weight_vec, rnd_vec, offset_vec,
4324 tmp2 = (v8i16) __msa_pckev_b((v16i8) tmp3, (v16i8) tmp2);
4325 ST_SH2(tmp0, tmp1, dst, dst_stride);
4326 ST8x2_UB(tmp2, dst + 16, dst_stride);
4327 dst += (2 * dst_stride);
4330 LD_SB2(src0_ptr, src_stride, src5, src2);
4331 LD_SH2(src1_ptr, src2_stride, in0, in1);
4332 LD_SH2(src1_ptr + 8, src2_stride, in2, in3);
4334 ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
4335 ILVL_B2_SB(src5, src4, src2, src5, src10_l, src21_l);
4337 LD_SB2(src0_ptr + 16, src_stride, src11, src8);
4338 src0_ptr += (2 * src_stride);
4339 LD_SH2(src1_ptr + 16, src2_stride, in4, in5);
4340 src1_ptr += (2 * src2_stride);
4342 ILVR_B2_SB(src11, src10, src8, src11, src76_r, src87_r);
4354 weight_vec, rnd_vec, offset_vec,
4355 tmp0, tmp1, tmp4, tmp5);
4358 weight_vec, rnd_vec, offset_vec,
4364 tmp2 = (v8i16) __msa_pckev_b((v16i8) tmp3, (v16i8) tmp2);
4365 ST_SH2(tmp0, tmp1, dst, dst_stride);
4366 ST8x2_UB(tmp2, dst + 16, dst_stride);
4367 dst += (2 * dst_stride);
4388 v16i8 src0, src1, src2, src3, src4, src6, src7, src8, src9, src10;
4389 v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
4390 v16i8 src10_r, src32_r, src76_r, src98_r;
4391 v16i8 src21_r, src43_r, src87_r, src109_r;
4392 v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
4393 v16i8 src10_l, src32_l, src76_l, src98_l;
4394 v16i8 src21_l, src43_l, src87_l, src109_l;
4397 v4i32 weight_vec, offset_vec, rnd_vec;
4399 src0_ptr -= src_stride;
4401 offset = (offset0 + offset1) << rnd_val;
4402 weight0 = weight0 & 0x0000FFFF;
4403 weight = weight0 | (weight1 << 16);
4404 constant = 128 * weight1;
4408 offset_vec = __msa_fill_w(offset);
4409 weight_vec = __msa_fill_w(weight);
4410 rnd_vec = __msa_fill_w(rnd_val + 1);
4412 filter_vec = LD_SH(filter);
4416 LD_SB3(src0_ptr, src_stride, src0, src1, src2);
4418 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
4419 ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
4421 LD_SB3(src0_ptr + 16, src_stride, src6, src7, src8);
4422 src0_ptr += (3 * src_stride);
4424 ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
4425 ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l);
4427 for (loop_cnt = (height >> 1); loop_cnt--;) {
4429 LD_SB2(src0_ptr, src_stride, src3, src4);
4430 LD_SH2(src1_ptr, src2_stride, in0, in1);
4431 LD_SH2(src1_ptr + 8, src2_stride, in2, in3);
4433 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
4434 ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
4444 weight_vec, rnd_vec, offset_vec,
4445 tmp0, tmp1, tmp4, tmp5);
4448 ST_SH2(tmp0, tmp1, dst, dst_stride);
4449 dst += (2 * dst_stride);
4458 LD_SB2(src0_ptr + 16, src_stride, src9, src10);
4459 src0_ptr += (2 * src_stride);
4460 LD_SH2(src1_ptr + 16, src2_stride, in4, in5);
4461 LD_SH2(src1_ptr + 24, src2_stride, in6, in7);
4462 src1_ptr += (2 * src2_stride);
4464 ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
4465 ILVL_B2_SB(src9, src8, src10, src9, src98_l, src109_l);
4474 weight_vec, rnd_vec, offset_vec,
4475 tmp2, tmp3, tmp6, tmp7);
4479 ST_SH2(tmp2, tmp3, dst_tmp, dst_stride);
4480 dst_tmp += (2 * dst_stride);
4496 const int8_t *filter_x,
4497 const int8_t *filter_y,
4508 v16i8 src0, src1, src2, src3, src4;
4510 v8i16 filt_h0, filt_h1;
4513 v8i16 filter_vec, tmp, weight_vec;
4514 v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
4515 v8i16 dst20, dst31, dst42, dst10, dst32, dst21, dst43, tmp0, tmp1;
4516 v4i32 dst0, dst1, offset_vec, rnd_vec, const_vec;
4518 src0_ptr -= (src_stride + 1);
4520 filter_vec = LD_SH(filter_x);
4523 filter_vec = LD_SH(filter_y);
4530 offset = (offset0 + offset1) << rnd_val;
4531 weight0 = weight0 & 0x0000FFFF;
4532 weight = weight0 | (weight1 << 16);
4534 const_vec = __msa_fill_w((128 * weight1));
4536 offset_vec = __msa_fill_w(offset);
4537 weight_vec = (v8i16) __msa_fill_w(weight);
4538 rnd_vec = __msa_fill_w(rnd_val + 1);
4539 offset_vec += const_vec;
4541 LD_SB5(src0_ptr, src_stride, src0, src1, src2, src3, src4);
4544 VSHF_B2_SB(src0, src2, src0, src2, mask0, mask1, vec0, vec1);
4545 VSHF_B2_SB(src1, src3, src1, src3, mask0, mask1, vec2, vec3);
4546 VSHF_B2_SB(src2, src4, src2, src4, mask0, mask1, vec4, vec5);
4559 dst0 = (v4i32) __msa_pckev_h((v8i16) dst1, (v8i16) dst0);
4561 LD2(src1_ptr, src2_stride, tp0, tp1);
4565 dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
4566 dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
4568 tmp = __msa_pckev_h((v8i16) dst1, (v8i16) dst0);
4570 out = (v16u8) __msa_pckev_b((v16i8) tmp, (v16i8) tmp);
4580 const int8_t *filter_x,
4581 const int8_t *filter_y,
4591 v8i16 in0 = { 0 }, in1 = { 0 };
4592 v16i8 src0, src1, src2, src3, src4, src5, src6;
4594 v8i16 filt_h0, filt_h1;
4597 v8i16 filter_vec, weight_vec;
4598 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
4599 v8i16 tmp0, tmp1, tmp2, tmp3;
4600 v8i16 dst30, dst41, dst52, dst63;
4601 v8i16 dst10, dst32, dst54, dst21, dst43, dst65;
4602 v4i32 offset_vec, rnd_vec, const_vec;
4603 v4i32 dst0, dst1, dst2, dst3;
4605 src0_ptr -= (src_stride + 1);
4607 filter_vec = LD_SH(filter_x);
4610 filter_vec = LD_SH(filter_y);
4617 offset = (offset0 + offset1) << rnd_val;
4618 weight0 = weight0 & 0x0000FFFF;
4619 weight = weight0 | (weight1 << 16);
4621 const_vec = __msa_fill_w((128 * weight1));
4623 offset_vec = __msa_fill_w(offset);
4624 weight_vec = (v8i16) __msa_fill_w(weight);
4625 rnd_vec = __msa_fill_w(rnd_val + 1);
4626 offset_vec += const_vec;
4628 LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
4631 VSHF_B2_SB(src0, src3, src0, src3, mask0, mask1, vec0, vec1);
4632 VSHF_B2_SB(src1, src4, src1, src4, mask0, mask1, vec2, vec3);
4633 VSHF_B2_SB(src2, src5, src2, src5, mask0, mask1, vec4, vec5);
4634 VSHF_B2_SB(src3, src6, src3, src6, mask0, mask1, vec6, vec7);
4648 SRA_4V(dst0, dst1, dst2, dst3, 6);
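/* The 6-bit shift removes the vertical filter gain, returning the samples
 * to the same scale as the stored first-pass (src1_ptr) data before the
 * weighted combine below. */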
4651 LD2(src1_ptr, src2_stride, tp0, tp1);
4653 src1_ptr += (2 * src2_stride);
4654 LD2(src1_ptr, src2_stride, tp0, tp1);
4660 dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
4661 dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
4662 dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
4663 dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
4667 out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
4668 ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
4677 const int8_t *filter_x,
4678 const int8_t *filter_y,
4690 v8i16 in0 = { 0 }, in1 = { 0 }, in2 = { 0 }, in3 = { 0 };
4691 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
4693 v8i16 filt_h0, filt_h1;
4694 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
4697 v8i16 filter_vec, weight_vec;
4698 v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
4699 v8i16 dst10, dst21, dst22, dst73, dst84, dst95, dst106;
4700 v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
4701 v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
4702 v8i16 dst98_r, dst109_r;
4703 v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
4704 v4i32 offset_vec, rnd_vec, const_vec;
4706 src0_ptr -= (src_stride + 1);
4708 filter_vec = LD_SH(filter_x);
4711 filter_vec = LD_SH(filter_y);
4718 offset = (offset0 + offset1) << rnd_val;
4719 weight0 = weight0 & 0x0000FFFF;
4720 weight = weight0 | (weight1 << 16);
4722 const_vec = __msa_fill_w((128 * weight1));
4724 offset_vec = __msa_fill_w(offset);
4725 weight_vec = (v8i16) __msa_fill_w(weight);
4726 rnd_vec = __msa_fill_w(rnd_val + 1);
4727 offset_vec += const_vec;
4729 LD_SB3(src0_ptr, src_stride, src0, src1, src2);
4730 src0_ptr += (3 * src_stride);
4733 VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
4734 VSHF_B2_SB(src1, src2, src1, src2, mask0, mask1, vec2, vec3);
4738 dst22 = (v8i16) __msa_splati_d((v2i64) dst21, 1);
4740 for (loop_cnt = height >> 3; loop_cnt--;) {
4741 LD_SB8(src0_ptr, src_stride,
4742 src3, src4, src5, src6, src7, src8, src9, src10);
4743 src0_ptr += (8 * src_stride);
4745 VSHF_B2_SB(src3, src7, src3, src7, mask0, mask1, vec0, vec1);
4746 VSHF_B2_SB(src4, src8, src4, src8, mask0, mask1, vec2, vec3);
4747 VSHF_B2_SB(src5, src9, src5, src9, mask0, mask1, vec4, vec5);
4748 VSHF_B2_SB(src6, src10, src6, src10, mask0, mask1, vec6, vec7);
4755 dst32_r = __msa_ilvr_h(dst73, dst22);
4759 dst22 = (v8i16) __msa_splati_d((v2i64) dst73, 1);
4760 dst76_r = __msa_ilvr_h(dst22, dst106);
4762 LD2(src1_ptr, src2_stride, tp0, tp1);
4763 src1_ptr += 2 * src2_stride;
4765 LD2(src1_ptr, src2_stride, tp0, tp1);
4766 src1_ptr += 2 * src2_stride;
4769 LD2(src1_ptr, src2_stride, tp0, tp1);
4770 src1_ptr += 2 * src2_stride;
4772 LD2(src1_ptr, src2_stride, tp0, tp1);
4773 src1_ptr += 2 * src2_stride;
4784 SRA_4V(dst0, dst1, dst2, dst3, 6);
4785 SRA_4V(dst4, dst5, dst6, dst7, 6);
4786 PCKEV_H4_SW(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, dst0, dst1,
4792 dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
4793 dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
4794 dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
4795 dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
4796 dst4 = __msa_dpadd_s_w(offset_vec, tmp4, weight_vec);
4797 dst5 = __msa_dpadd_s_w(offset_vec, tmp5, weight_vec);
4798 dst6 = __msa_dpadd_s_w(offset_vec, tmp6, weight_vec);
4799 dst7 = __msa_dpadd_s_w(offset_vec, tmp7, weight_vec);
4802 PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, tmp0, tmp1,
4806 ST4x8_UB(out0, out1, dst, dst_stride);
4807 dst += (8 * dst_stride);
4811 dst22 = (v8i16) __msa_splati_d((v2i64) dst106, 1);
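/* dst22 keeps the upper half of the newest horizontal-filter output so the
 * next 8-row iteration can splice it into its first vertical interleave. */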
4821 const int8_t *filter_x,
4822 const int8_t *filter_y,
4832 dst, dst_stride, filter_x, filter_y,
4833 weight0, weight1, offset0, offset1, rnd_val);
4834 } else if (4 == height) {
4836 dst, dst_stride, filter_x, filter_y,
4837 weight0, weight1, offset0, offset1, rnd_val);
4838 } else if (0 == (height % 8)) {
4840 src1_ptr, src2_stride,
4841 dst, dst_stride, filter_x, filter_y,
4842 height, weight0, weight1,
4843 offset0, offset1, rnd_val);
4853 const int8_t *filter_x,
4854 const int8_t *filter_y,
4862 uint32_t tpw0, tpw1, tpw2, tpw3;
4865 v16u8 out0, out1, out2;
4866 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
4867 v8i16 in0 = { 0 }, in1 = { 0 }, in2 = { 0 }, in3 = { 0 };
4868 v8i16 in4 = { 0 }, in5 = { 0 };
4870 v8i16 filt_h0, filt_h1, filter_vec;
4871 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
4874 v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6, dsth7, dsth8, dsth9;
4875 v8i16 dsth10, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, weight_vec;
4876 v8i16 dst10_r, dst32_r, dst54_r, dst76_r, dst98_r, dst21_r, dst43_r;
4877 v8i16 dst65_r, dst87_r, dst109_r, dst10_l, dst32_l, dst54_l, dst76_l;
4878 v8i16 dst98_l, dst21_l, dst43_l, dst65_l, dst87_l, dst109_l;
4879 v8i16 dst1021_l, dst3243_l, dst5465_l, dst7687_l, dst98109_l;
4880 v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
4881 v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
4882 v4i32 dst4_r, dst5_r, dst6_r, dst7_r;
4883 v4i32 offset_vec, rnd_vec, const_vec;
4885 src0_ptr -= (src_stride + 1);
4887 filter_vec = LD_SH(filter_x);
4890 filter_vec = LD_SH(filter_y);
4897 offset = (offset0 + offset1) << rnd_val;
4898 weight0 = weight0 & 0x0000FFFF;
4899 weight = weight0 | (weight1 << 16);
4901 const_vec = __msa_fill_w((128 * weight1));
4903 offset_vec = __msa_fill_w(offset);
4904 weight_vec = (v8i16) __msa_fill_w(weight);
4905 rnd_vec = __msa_fill_w(rnd_val + 1);
4906 offset_vec += const_vec;
4908 LD_SB3(src0_ptr, src_stride, src0, src1, src2);
4909 src0_ptr += (3 * src_stride);
4912 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
4913 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
4914 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
4922 LD_SB8(src0_ptr, src_stride, src3, src4, src5, src6, src7, src8, src9,
4926 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
4927 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
4928 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
4929 VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
4936 VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
4937 VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec2, vec3);
4938 VSHF_B2_SB(src9, src9, src9, src9, mask0, mask1, vec4, vec5);
4939 VSHF_B2_SB(src10, src10, src10, src10, mask0, mask1, vec6, vec7);
4954 PCKEV_D2_SH(dst21_l, dst10_l, dst43_l, dst32_l, dst1021_l, dst3243_l);
4955 PCKEV_D2_SH(dst65_l, dst54_l, dst87_l, dst76_l, dst5465_l, dst7687_l);
4956 dst98109_l = (v8i16) __msa_pckev_d((v2i64) dst109_l, (v2i64) dst98_l);
4969 dst3_l = HEVC_FILT_4TAP(dst7687_l, dst98109_l, filt_h0, filt_h1);
4970 SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
4971 SRA_4V(dst4_r, dst5_r, dst6_r, dst7_r, 6);
4972 SRA_4V(dst0_l, dst1_l, dst2_l, dst3_l, 6);
4973 PCKEV_H2_SW(dst1_r, dst0_r, dst3_r, dst2_r, dst0, dst1);
4974 PCKEV_H2_SW(dst5_r, dst4_r, dst7_r, dst6_r, dst2, dst3);
4976 LD2(src1_ptr, src2_stride, tp0, tp1);
4978 LD2(src1_ptr + 2 * src2_stride, src2_stride, tp0, tp1);
4981 LD2(src1_ptr + 4 * src2_stride, src2_stride, tp0, tp1);
4983 LD2(src1_ptr + 6 * src2_stride, src2_stride, tp0, tp1);
4990 dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
4991 dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
4992 dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
4993 dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
4994 dst4 = __msa_dpadd_s_w(offset_vec, tmp4, weight_vec);
4995 dst5 = __msa_dpadd_s_w(offset_vec, tmp5, weight_vec);
4996 dst6 = __msa_dpadd_s_w(offset_vec, tmp6, weight_vec);
4997 dst7 = __msa_dpadd_s_w(offset_vec, tmp7, weight_vec);
5000 PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, tmp0, tmp1,
5004 ST4x8_UB(out0, out1, dst, dst_stride);
5006 PCKEV_H2_SW(dst1_l, dst0_l, dst3_l, dst2_l, dst4, dst5);
5008 LW4(src1_ptr + 4, src2_stride, tpw0, tpw1, tpw2, tpw3);
5009 src1_ptr += (4 * src2_stride);
5011 LW4(src1_ptr + 4, src2_stride, tpw0, tpw1, tpw2, tpw3);
5017 dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
5018 dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
5019 dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
5020 dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
5025 out2 = (v16u8) __msa_pckev_b((v16i8) tmp5, (v16i8) tmp4);
5026 ST2x4_UB(out2, 0, dst + 4, dst_stride);
5027 dst += 4 * dst_stride;
5028 ST2x4_UB(out2, 4, dst + 4, dst_stride);
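/* The 6-wide output is split into a 4-wide store (columns 0-3, above) and
 * 2x4 stores of the left-half results for columns 4-5. */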
5037 const int8_t *filter_x,
5038 const int8_t *filter_y,
5047 v16i8 src0, src1, src2, src3, src4;
5049 v8i16 filt_h0, filt_h1;
5052 v8i16 filter_vec, weight_vec;
5053 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
5054 v8i16 dst0, dst1, dst2, dst3, dst4;
5056 v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
5057 v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
5058 v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
5059 v8i16 tmp0, tmp1, tmp2, tmp3;
5060 v4i32 offset_vec, rnd_vec, const_vec;
5062 src0_ptr -= (src_stride + 1);
5064 filter_vec = LD_SH(filter_x);
5067 filter_vec = LD_SH(filter_y);
5074 offset = (offset0 + offset1) << rnd_val;
5075 weight0 = weight0 & 0x0000FFFF;
5076 weight = weight0 | (weight1 << 16);
5078 const_vec = __msa_fill_w((128 * weight1));
5080 offset_vec = __msa_fill_w(offset);
5081 weight_vec = (v8i16) __msa_fill_w(weight);
5082 rnd_vec = __msa_fill_w(rnd_val + 1);
5083 offset_vec += const_vec;
5085 LD_SB5(src0_ptr, src_stride, src0, src1, src2, src3, src4);
5088 LD_SH2(src1_ptr, src2_stride, in0, in1);
5090 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
5091 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
5092 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
5093 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
5094 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec8, vec9);
5110 SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
5111 PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, tmp1, tmp3);
5116 dst0_r = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
5117 dst0_l = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
5118 dst1_r = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
5119 dst1_l = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
5120 SRAR_W4_SW(dst0_r, dst0_l, dst1_r, dst1_l, rnd_vec);
5121 PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, tmp0, tmp1);
5123 out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
5133 const int8_t *filter_x,
5134 const int8_t *filter_y,
5145 v16i8 src0, src1, src2, src3, src4, src5, src6, mask0, mask1;
5146 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
5147 v8i16 filt0, filt1, filt_h0, filt_h1, filter_vec, weight_vec;
5148 v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6;
5149 v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, in0, in1, in2, in3;
5150 v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
5151 v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;
5152 v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
5153 v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
5154 v4i32 offset_vec, rnd_vec, const_vec;
5156 src0_ptr -= (src_stride + 1);
5158 filter_vec = LD_SH(filter_x);
5161 filter_vec = LD_SH(filter_y);
5169 offset = (offset0 + offset1) << rnd_val;
5170 weight0 = weight0 & 0x0000FFFF;
5171 weight = weight0 | (weight1 << 16);
5173 const_vec = __msa_fill_w((128 * weight1));
5175 offset_vec = __msa_fill_w(offset);
5176 rnd_vec = __msa_fill_w(rnd_val + 1);
5177 offset_vec += const_vec;
5178 weight_vec = (v8i16) __msa_fill_w(weight);
5180 for (cnt = width8mult; cnt--;) {
5181 LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
5185 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
5188 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
5189 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
5190 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
5199 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
5200 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
5201 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
5202 VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
5223 SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
5224 SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
5225 PCKEV_H4_SW(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l,
5226 dst3_r, dst0, dst1, dst2, dst3);
5232 dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
5233 dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
5234 dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
5235 dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
5236 dst4 = __msa_dpadd_s_w(offset_vec, tmp4, weight_vec);
5237 dst5 = __msa_dpadd_s_w(offset_vec, tmp5, weight_vec);
5238 dst6 = __msa_dpadd_s_w(offset_vec, tmp6, weight_vec);
5239 dst7 = __msa_dpadd_s_w(offset_vec, tmp7, weight_vec);
5242 PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
5243 tmp0, tmp1, tmp2, tmp3);
5246 ST8x4_UB(out0, out1, dst, dst_stride);
5257 const int8_t *filter_x,
5258 const int8_t *filter_y,
5266 v16u8 out0, out1, out2;
5267 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
5269 v8i16 filt_h0, filt_h1;
5272 v8i16 filter_vec, weight_vec;
5273 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
5274 v16i8 vec10, vec11, vec12, vec13, vec14, vec15, vec16, vec17;
5275 v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6, dsth7, dsth8;
5276 v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
5277 v4i32 dst4_r, dst4_l, dst5_r, dst5_l;
5278 v8i16 dst10_r, dst32_r, dst10_l, dst32_l;
5279 v8i16 dst21_r, dst43_r, dst21_l, dst43_l;
5280 v8i16 dst54_r, dst54_l, dst65_r, dst65_l;
5281 v8i16 dst76_r, dst76_l, dst87_r, dst87_l;
5282 v8i16 in0, in1, in2, in3, in4, in5;
5283 v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
5284 v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
5285 v4i32 offset_vec, rnd_vec, const_vec;
5287 src0_ptr -= (src_stride + 1);
5289 filter_vec = LD_SH(filter_x);
5292 filter_vec = LD_SH(filter_y);
5299 offset = (offset0 + offset1) << rnd_val;
5300 weight0 = weight0 & 0x0000FFFF;
5301 weight = weight0 | (weight1 << 16);
5303 const_vec = __msa_fill_w((128 * weight1));
5305 offset_vec = __msa_fill_w(offset);
5306 weight_vec = (v8i16) __msa_fill_w(weight);
5307 rnd_vec = __msa_fill_w(rnd_val + 1);
5308 offset_vec += const_vec;
5310 LD_SB5(src0_ptr, src_stride, src0, src1, src2, src3, src4);
5311 src0_ptr += (5 * src_stride);
5312 LD_SB4(src0_ptr, src_stride, src5, src6, src7, src8);
5317 LD_SH6(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5);
5319 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
5320 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
5321 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
5322 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
5323 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec8, vec9);
5324 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec10, vec11);
5325 VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec12, vec13);
5326 VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec14, vec15);
5327 VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec16, vec17);
5361 SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
5362 SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
5363 SRA_4V(dst4_r, dst4_l, dst5_r, dst5_l, 6);
5364 PCKEV_H4_SW(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l, dst3_r,
5365 dst0, dst1, dst2, dst3);
5371 dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
5372 dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
5373 dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
5374 dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
5375 dst4 = __msa_dpadd_s_w(offset_vec, tmp4, weight_vec);
5376 dst5 = __msa_dpadd_s_w(offset_vec, tmp5, weight_vec);
5377 dst6 = __msa_dpadd_s_w(offset_vec, tmp6, weight_vec);
5378 dst7 = __msa_dpadd_s_w(offset_vec, tmp7, weight_vec);
5381 PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
5382 tmp0, tmp1, tmp2, tmp3);
5386 PCKEV_H2_SW(dst4_l, dst4_r, dst5_l, dst5_r, dst0, dst1);
5389 dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
5390 dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
5391 dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
5392 dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
5396 out2 = (v16u8) __msa_pckev_b((v16i8) tmp5, (v16i8) tmp4);
5397 ST8x4_UB(out0, out1, dst, dst_stride);
5398 dst += (4 * dst_stride);
5408 const int8_t *filter_x,
5409 const int8_t *filter_y,
5422 int16_t *src1_ptr_tmp;
5425 v16i8 src0, src1, src2, src3, src4, src5, src6;
5426 v8i16 in0, in1, in2, in3;
5428 v8i16 filt_h0, filt_h1;
5432 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
5433 v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6;
5434 v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
5435 v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
5436 v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
5437 v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
5438 v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l, weight_vec;
5439 v4i32 offset_vec, rnd_vec, const_vec;
5441 src0_ptr -= (src_stride + 1);
5443 filter_vec = LD_SH(filter_x);
5446 filter_vec = LD_SH(filter_y);
5453 offset = (offset0 + offset1) << rnd_val;
5454 weight0 = weight0 & 0x0000FFFF;
5455 weight = weight0 | (weight1 << 16);
5457 const_vec = __msa_fill_w((128 * weight1));
5459 offset_vec = __msa_fill_w(offset);
5460 weight_vec = (v8i16) __msa_fill_w(weight);
5461 rnd_vec = __msa_fill_w(rnd_val + 1);
5462 offset_vec += const_vec;
5464 for (cnt = width >> 3; cnt--;) {
5465 src0_ptr_tmp = src0_ptr;
5466 src1_ptr_tmp = src1_ptr;
5469 LD_SB3(src0_ptr_tmp, src_stride, src0, src1, src2);
5470 src0_ptr_tmp += (3 * src_stride);
5473 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
5474 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
5475 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
5483 for (loop_cnt = height >> 2; loop_cnt--;) {
5484 LD_SB4(src0_ptr_tmp, src_stride, src3, src4, src5, src6);
5485 src0_ptr_tmp += (4 * src_stride);
5486 LD_SH4(src1_ptr_tmp, src2_stride, in0, in1, in2, in3);
5487 src1_ptr_tmp += (4 * src2_stride);
5490 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
5491 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
5492 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
5493 VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
5514 SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
5515 SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
5516 PCKEV_H4_SW(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l,
5517 dst3_r, dst0, dst1, dst2, dst3);
5522 dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
5523 dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
5524 dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
5525 dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
5526 dst4 = __msa_dpadd_s_w(offset_vec, tmp4, weight_vec);
5527 dst5 = __msa_dpadd_s_w(offset_vec, tmp5, weight_vec);
5528 dst6 = __msa_dpadd_s_w(offset_vec, tmp6, weight_vec);
5529 dst7 = __msa_dpadd_s_w(offset_vec, tmp7, weight_vec);
5532 PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
5533 tmp0, tmp1, tmp2, tmp3);
5536 ST8x4_UB(out0, out1, dst_tmp, dst_stride);
5537 dst_tmp += (4 * dst_stride);
5558 const int8_t *filter_x,
5559 const int8_t *filter_y,
5569 dst, dst_stride, filter_x, filter_y,
5570 weight0, weight1, offset0, offset1, rnd_val);
5571 } else if (4 == height) {
5573 src2_stride, dst, dst_stride, filter_x,
5574 filter_y, weight0, weight1, offset0,
5575 offset1, rnd_val, 1);
5576 } else if (6 == height) {
5578 dst, dst_stride, filter_x, filter_y,
5579 weight0, weight1, offset0, offset1, rnd_val);
5580 } else if (0 == (height % 4)) {
5582 src1_ptr, src2_stride,
5583 dst, dst_stride, filter_x, filter_y,
5585 weight1, offset0, offset1, rnd_val, 8);
5595 const int8_t *filter_x,
5596 const int8_t *filter_y,
5607 uint8_t *src0_ptr_tmp, *dst_tmp;
5608 int16_t *src1_ptr_tmp;
5610 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
5611 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
5612 v16i8 mask0, mask1, mask2, mask3;
5613 v8i16 filt0, filt1, filt_h0, filt_h1, filter_vec;
5614 v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
5615 v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6, weight_vec;
5616 v8i16 dst10, dst21, dst22, dst73, dst84, dst95, dst106;
5617 v8i16 dst76_r, dst98_r, dst87_r, dst109_r;
5618 v8i16 in0 = { 0 }, in1 = { 0 }, in2 = { 0 }, in3 = { 0 };
5619 v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
5620 v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;
5621 v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
5622 v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
5623 v4i32 offset_vec, rnd_vec, const_vec;
5625 src0_ptr -= (src_stride + 1);
5627 filter_vec = LD_SH(filter_x);
5630 filter_vec = LD_SH(filter_y);
5638 offset = (offset0 + offset1) << rnd_val;
5639 weight0 = weight0 & 0x0000FFFF;
5640 weight = weight0 | (weight1 << 16);
5642 const_vec = __msa_fill_w((128 * weight1));
5644 offset_vec = __msa_fill_w(offset);
5645 rnd_vec = __msa_fill_w(rnd_val + 1);
5646 offset_vec += const_vec;
5647 weight_vec = (v8i16) __msa_fill_w(weight);
5649 src0_ptr_tmp = src0_ptr;
5651 src1_ptr_tmp = src1_ptr;
5653 LD_SB3(src0_ptr_tmp, src_stride, src0, src1, src2);
5654 src0_ptr_tmp += (3 * src_stride);
5658 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
5659 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
5660 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
5669 for (loop_cnt = 4; loop_cnt--;) {
5670 LD_SB4(src0_ptr_tmp, src_stride, src3, src4, src5, src6);
5671 src0_ptr_tmp += (4 * src_stride);
5674 LD_SH4(src1_ptr_tmp, src2_stride, in0, in1, in2, in3);
5675 src1_ptr_tmp += (4 * src2_stride);
5677 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
5678 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
5679 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
5680 VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
5701 SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
5702 SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
5703 PCKEV_H4_SW(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l,
5704 dst3_r, dst0, dst1, dst2, dst3);
5709 dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
5710 dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
5711 dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
5712 dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
5713 dst4 = __msa_dpadd_s_w(offset_vec, tmp4, weight_vec);
5714 dst5 = __msa_dpadd_s_w(offset_vec, tmp5, weight_vec);
5715 dst6 = __msa_dpadd_s_w(offset_vec, tmp6, weight_vec);
5716 dst7 = __msa_dpadd_s_w(offset_vec, tmp7, weight_vec);
5719 PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
5720 tmp0, tmp1, tmp2, tmp3);
5723 ST8x4_UB(out0, out1, dst_tmp, dst_stride);
5724 dst_tmp += (4 * dst_stride);
5740 LD_SB3(src0_ptr, src_stride, src0, src1, src2);
5741 src0_ptr += (3 * src_stride);
5743 VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);
5744 VSHF_B2_SB(src1, src2, src1, src2, mask2, mask3, vec2, vec3);
5750 dst22 = (v8i16) __msa_splati_d((v2i64) dst21, 1);
5752 for (loop_cnt = 2; loop_cnt--;) {
5753 LD_SB8(src0_ptr, src_stride, src3, src4, src5, src6, src7, src8, src9,
5755 src0_ptr += (8 * src_stride);
5757 VSHF_B2_SB(src3, src7, src3, src7, mask2, mask3, vec0, vec1);
5758 VSHF_B2_SB(src4, src8, src4, src8, mask2, mask3, vec2, vec3);
5759 VSHF_B2_SB(src5, src9, src5, src9, mask2, mask3, vec4, vec5);
5760 VSHF_B2_SB(src6, src10, src6, src10, mask2, mask3, vec6, vec7);
5767 dst32_r = __msa_ilvr_h(dst73, dst22);
5771 dst22 = (v8i16) __msa_splati_d((v2i64) dst73, 1);
5772 dst76_r = __msa_ilvr_h(dst22, dst106);
5774 LD2(src1_ptr, src2_stride, tp0, tp1);
5775 src1_ptr += 2 * src2_stride;
5777 LD2(src1_ptr, src2_stride, tp0, tp1);
5778 src1_ptr += 2 * src2_stride;
5781 LD2(src1_ptr, src2_stride, tp0, tp1);
5782 src1_ptr += 2 * src2_stride;
5784 LD2(src1_ptr, src2_stride, tp0, tp1);
5785 src1_ptr += 2 * src2_stride;
5797 SRA_4V(dst0, dst1, dst2, dst3, 6);
5798 SRA_4V(dst4, dst5, dst6, dst7, 6);
5799 PCKEV_H4_SW(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
5800 dst0, dst1, dst2, dst3);
5805 dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
5806 dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
5807 dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
5808 dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
5809 dst4 = __msa_dpadd_s_w(offset_vec, tmp4, weight_vec);
5810 dst5 = __msa_dpadd_s_w(offset_vec, tmp5, weight_vec);
5811 dst6 = __msa_dpadd_s_w(offset_vec, tmp6, weight_vec);
5812 dst7 = __msa_dpadd_s_w(offset_vec, tmp7, weight_vec);
5815 PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
5816 tmp0, tmp1, tmp2, tmp3);
5819 ST4x8_UB(out0, out1, dst, dst_stride);
5820 dst += (8 * dst_stride);
5824 dst22 = (v8i16) __msa_splati_d((v2i64) dst106, 1);
5834 const int8_t *filter_x,
5835 const int8_t *filter_y,
5845 src2_stride, dst, dst_stride, filter_x,
5846 filter_y, weight0, weight1, offset0,
5847 offset1, rnd_val, 2);
5850 src2_stride, dst, dst_stride,
5851 filter_x, filter_y, height, weight0,
5852 weight1, offset0, offset1, rnd_val, 16);
5862 const int8_t *filter_x,
5863 const int8_t *filter_y,
5872 src1_ptr, src2_stride,
5874 filter_x, filter_y, height, weight0,
5875 weight1, offset0, offset1, rnd_val, 24);
5884 const int8_t *filter_x,
5885 const int8_t *filter_y,
5894 src1_ptr, src2_stride,
5896 filter_x, filter_y, height, weight0,
5897 weight1, offset0, offset1, rnd_val, 32);
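/* The macros below stamp out the public ff_hevc_put_hevc_bi_w_* entry
 * points, translate denom into the log2Wd rounding value, and route each
 * block width to the matching hand-written kernel above. */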
5900 #define BI_W_MC_COPY(WIDTH) \
5901 void ff_hevc_put_hevc_bi_w_pel_pixels##WIDTH##_8_msa(uint8_t *dst, \
5902 ptrdiff_t dst_stride, \
5904 ptrdiff_t src_stride, \
5905 int16_t *src_16bit, \
5916 int shift = 14 + 1 - 8; \
5917 int log2Wd = denom + shift - 1; \
5919 hevc_biwgt_copy_##WIDTH##w_msa(src, src_stride, src_16bit, MAX_PB_SIZE, \
5920 dst, dst_stride, height, \
5921 weight0, weight1, offset0, \
5937 #define BI_W_MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR) \
5938 void ff_hevc_put_hevc_bi_w_##PEL##_##DIR##WIDTH##_8_msa(uint8_t *dst, \
5944 int16_t *src_16bit, \
5955 const int8_t *filter = ff_hevc_##PEL##_filters[FILT_DIR - 1]; \
5956 int log2Wd = denom + 14 - 8; \
5958 hevc_##DIR1##_biwgt_##TAP##t_##WIDTH##w_msa(src, src_stride, src_16bit, \
5959 MAX_PB_SIZE, dst, dst_stride, \
5960 filter, height, weight0, \
5961 weight1, offset0, offset1, \
5974 BI_W_MC(qpel, v, 4, 8, vt, my);
5975 BI_W_MC(qpel, v, 8, 8, vt, my);
5976 BI_W_MC(qpel, v, 12, 8, vt, my);
5977 BI_W_MC(qpel, v, 16, 8, vt, my);
5978 BI_W_MC(qpel, v, 24, 8, vt, my);
5979 BI_W_MC(qpel, v, 32, 8, vt, my);
5980 BI_W_MC(qpel, v, 48, 8, vt, my);
5981 BI_W_MC(qpel, v, 64, 8, vt, my);
5991 BI_W_MC(epel, v, 4, 4, vt, my);
5992 BI_W_MC(epel, v, 8, 4, vt, my);
5993 BI_W_MC(epel, v, 6, 4, vt, my);
5994 BI_W_MC(epel, v, 12, 4, vt, my);
5995 BI_W_MC(epel, v, 16, 4, vt, my);
5996 BI_W_MC(epel, v, 24, 4, vt, my);
5997 BI_W_MC(epel, v, 32, 4, vt, my);
6001 #define BI_W_MC_HV(PEL, WIDTH, TAP) \
6002 void ff_hevc_put_hevc_bi_w_##PEL##_hv##WIDTH##_8_msa(uint8_t *dst, \
6003 ptrdiff_t dst_stride, \
6005 ptrdiff_t src_stride, \
6006 int16_t *src_16bit, \
6017 const int8_t *filter_x = ff_hevc_##PEL##_filters[mx - 1]; \
6018 const int8_t *filter_y = ff_hevc_##PEL##_filters[my - 1]; \
6019 int log2Wd = denom + 14 - 8; \
6021 hevc_hv_biwgt_##TAP##t_##WIDTH##w_msa(src, src_stride, src_16bit, \
6022 MAX_PB_SIZE, dst, dst_stride, \
6023 filter_x, filter_y, height, \
6024 weight0, weight1, offset0, \
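/* For reference, a minimal scalar sketch (not part of the original file;
 * the function name and standalone form are ours, and it assumes <stdint.h>)
 * of the per-pixel operation every kernel above implements: the HEVC
 * bi-directional weighted average of a 14-bit first-pass sample p0
 * (src1_ptr) and a filtered second-reference sample p1, with rounding and
 * an 8-bit clip. */
static inline uint8_t bi_weight_pixel_ref(int32_t p0, int32_t p1,
                                          int32_t weight0, int32_t weight1,
                                          int32_t offset0, int32_t offset1,
                                          int32_t rnd_val)
{
    /* offset_vec in the vector code: (offset0 + offset1) << rnd_val */
    int32_t off = (offset0 + offset1) << rnd_val;
    /* dpadd + SRAR: weighted sum, then rounded arithmetic shift by rnd_val + 1 */
    int32_t val = (p0 * weight0 + p1 * weight1 + off + (1 << rnd_val))
                  >> (rnd_val + 1);
    /* CLIP_SH*_0_255: saturate to the 8-bit output range */
    return (uint8_t) (val < 0 ? 0 : (val > 255 ? 255 : val));
}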
#define UNPCK_R_SB_SH(in, out)
static void hevc_hv_biwgt_4t_16w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
#define DPADD_SB4_SH(...)
#define SPLATI_H2_SH(...)
static void hevc_hz_biwgt_4t_8x6_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
static void hevc_vt_biwgt_4t_4w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
#define ST4x4_UB(in0, in1, idx0, idx1, idx2, idx3, pdst, stride)
#define HEVC_BIW_RND_CLIP2(in0, in1, vec0, vec1, wgt, rnd, offset, out0, out1)
#define SRA_4V(in0, in1, in2, in3, shift)
#define LD4(psrc, stride, out0, out1, out2, out3)
#define SPLATI_W2_SH(...)
static void hevc_hv_biwgt_4t_8w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
#define CLIP_SH_0_255(in)
static void hevc_hv_biwgt_4t_4multx8mult_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
#define SPLATI_H4_SH(...)
static void hevc_biwgt_copy_12w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
static const uint8_t ff_hevc_mask_arr[16 * 2]
#define CLIP_SH4_0_255_MAX_SATU(in0, in1, in2, in3)
static void hevc_biwgt_copy_32w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
static void hevc_hz_biwgt_4t_8x4multiple_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
static void hevc_hv_biwgt_8t_8multx2mult_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val, int32_t width8mult)
static void hevc_vt_biwgt_4t_4x4_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
static void hevc_hz_biwgt_8t_4w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
static void hevc_vt_biwgt_4t_4x8multiple_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
static void hevc_vt_biwgt_8t_48w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
static void hevc_vt_biwgt_4t_8w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
static void hevc_hz_biwgt_4t_32w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
static void hevc_vt_biwgt_8t_16multx2mult_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val, int32_t width)
static void hevc_vt_biwgt_4t_8x6_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
static void hevc_hz_biwgt_8t_24w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
#define ST8x2_UB(in, pdst, stride)
static void hevc_hz_biwgt_4t_4x8multiple_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
#define CLIP_SH_0_255_MAX_SATU(in)
static void hevc_hz_biwgt_8t_32w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
#define XORI_B7_128_SB(...)
#define LW2(psrc, stride, out0, out1)
#define XORI_B4_128_SB(...)
static void hevc_vt_biwgt_8t_4w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
static void hevc_hv_biwgt_4t_8multx4_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val, int32_t width8mult)
#define SPLATI_W4_SH(...)
#define DPADD_SB2_SH(...)
static void hevc_hz_biwgt_4t_16w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
static void hevc_hv_biwgt_4t_8multx4mult_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val, int32_t width)
#define HEVC_BIW_RND_CLIP4(in0, in1, in2, in3, vec0, vec1, vec2, vec3, wgt, rnd, offset, out0, out1, out2, out3)
#define INSERT_D2_SH(...)
static void hevc_hv_biwgt_4t_32w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
static void hevc_vt_biwgt_4t_8x2_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
static void hevc_hv_biwgt_4t_4w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
static void hevc_vt_biwgt_8t_8w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
#define ST2x4_UB(in, stidx, pdst, stride)
static void hevc_vt_biwgt_4t_12w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
#define INSERT_W4_SH(...)
static void hevc_hz_biwgt_4t_8w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
static void hevc_vt_biwgt_8t_12w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
static void hevc_hv_biwgt_8t_12w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
static void hevc_vt_biwgt_4t_8x4multiple_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
static void hevc_hv_biwgt_4t_8x6_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
#define BI_W_MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR)
static void hevc_biwgt_copy_48w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
static void hevc_hz_biwgt_4t_12w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
static void hevc_vt_biwgt_4t_4x2_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
static void hevc_vt_biwgt_8t_64w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
static void hevc_vt_biwgt_4t_6w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
static void hevc_biwgt_copy_6w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
static void hevc_vt_biwgt_4t_32w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
static void hevc_hv_biwgt_4t_24w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
static void hevc_hz_biwgt_4t_8x2_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
static void hevc_biwgt_copy_8w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
#define INSERT_W2_SB(...)
static void hevc_hv_biwgt_4t_8x2_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
static void hevc_vt_biwgt_8t_24w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
#define HEVC_BIW_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, vec0, vec1, vec2, vec3, wgt, rnd, offset, out0, out1, out2, out3)
static void hevc_hv_biwgt_8t_64w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
#define SLLI_4V(in0, in1, in2, in3, shift)
#define ST4x8_UB(in0, in1, pdst, stride)
#define ST6x4_UB(in0, in1, pdst, stride)
static void hevc_hv_biwgt_4t_4x2_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
#define HEVC_FILT_4TAP(in0, in1, filt0, filt1)
static void hevc_vt_biwgt_4t_24w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
#define LW4(psrc, stride, out0, out1, out2, out3)
static void hevc_hz_biwgt_4t_4x2_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
#define ST8x4_UB(in0, in1, pdst, stride)
#define HEVC_FILT_8TAP(in0, in1, in2, in3, filt0, filt1, filt2, filt3)
#define INSERT_D2_SB(...)
#define BI_W_MC_HV(PEL, WIDTH, TAP)
#define XORI_B6_128_SB(...)
static void hevc_hz_biwgt_4t_24w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
#define HEVC_FILT_8TAP_SH(in0, in1, in2, in3, filt0, filt1, filt2, filt3)
static void hevc_vt_biwgt_8t_32w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
#define ST12x4_UB(in0, in1, in2, pdst, stride)
static void hevc_hv_biwgt_4t_4x4_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
#define BI_W_MC_COPY(WIDTH)
static void hevc_biwgt_copy_24w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
#define SLLI_2V(in0, in1, shift)
static void hevc_hv_biwgt_8t_24w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
static void hevc_hv_biwgt_8t_8w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
#define ST4x2_UB(in, pdst, stride)
#define INSERT_W4_SB(...)
#define LD2(psrc, stride, out0, out1)
static void hevc_biwgt_copy_16w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
#define CLIP_SH2_0_255_MAX_SATU(in0, in1)
static void hevc_hv_biwgt_8t_32w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
#define HEVC_FILT_4TAP_SH(in0, in1, filt0, filt1)
#define CLIP_SW4_0_255_MAX_SATU(in0, in1, in2, in3)
static void hevc_hz_biwgt_4t_6w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
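/*
 * Illustrative scalar reference (hypothetical, checkasm-style) for the
 * biwgt copy kernels indexed above: it applies biwgt_clip_pixel() from the
 * earlier sketch to a whole block, feeding the first source as 8-bit
 * samples shifted left by 6 to match the 14-bit intermediate, exactly as
 * the copy kernels do before the weighted rounding step. Usable as a golden
 * model when validating an MSA build; names and layout are assumptions.
 */
static void biwgt_copy_ref(const uint8_t *src0, ptrdiff_t src0_stride,
                           const int16_t *src1, ptrdiff_t src1_stride,
                           uint8_t *dst, ptrdiff_t dst_stride,
                           int width, int height, int weight0, int weight1,
                           int offset0, int offset1, int log2Wd)
{
    for (int y = 0; y < height; y++) {
        for (int x = 0; x < width; x++)
            dst[x] = biwgt_clip_pixel(src0[x] << 6, src1[x], weight0,
                                      weight1, offset0, offset1, log2Wd);
        src0 += src0_stride;
        src1 += src1_stride;
        dst  += dst_stride;
    }
}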