0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20,
8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
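/* Shuffle-mask table: the first row drives the 8-column filters; the two
 * rows below it drive the 4-column filters, where indices >= 16 pick bytes
 * from the second source operand of VSHF. The HORIZ_*TAP_*WID macros that
 * follow build the horizontal FIR sums from VSHF byte shuffles plus
 * DOTP/DPADD signed-byte dot products accumulating into halfwords. */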
#define HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3,                \
                                   mask0, mask1, mask2, mask3,            \
                                   filt0, filt1, filt2, filt3,            \
                                   out0, out1)                            \
{                                                                         \
    v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \
                                                                          \
    VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0_m, vec1_m);     \
    DOTP_SB2_SH(vec0_m, vec1_m, filt0, filt0, out0, out1);                \
    VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2_m, vec3_m);     \
    DPADD_SB2_SH(vec2_m, vec3_m, filt1, filt1, out0, out1);               \
    VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4_m, vec5_m);     \
    DPADD_SB2_SH(vec4_m, vec5_m, filt2, filt2, out0, out1);               \
    VSHF_B2_SB(src0, src1, src2, src3, mask3, mask3, vec6_m, vec7_m);     \
    DPADD_SB2_SH(vec6_m, vec7_m, filt3, filt3, out0, out1);               \
}

#define HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3,                \
                                   mask0, mask1, mask2, mask3,            \
                                   filt0, filt1, filt2, filt3,            \
                                   out0, out1, out2, out3)                \
{                                                                         \
    v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \
                                                                          \
    VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m);     \
    VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m);     \
    DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0,      \
                filt0, out0, out1, out2, out3);                           \
    VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0_m, vec1_m);     \
    VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2_m, vec3_m);     \
    DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt2, filt2, filt2,     \
                 filt2, out0, out1, out2, out3);                          \
    VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4_m, vec5_m);     \
    VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6_m, vec7_m);     \
    DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt1, filt1, filt1,     \
                 filt1, out0, out1, out2, out3);                          \
    VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec4_m, vec5_m);     \
    VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec6_m, vec7_m);     \
    DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt3, filt3, filt3,     \
                 filt3, out0, out1, out2, out3);                          \
}

#define HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3,                \
                                   mask0, mask1, filt0, filt1,            \
                                   out0, out1)                            \
{                                                                         \
    v16i8 vec0_m, vec1_m, vec2_m, vec3_m;                                 \
                                                                          \
    VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0_m, vec1_m);     \
    DOTP_SB2_SH(vec0_m, vec1_m, filt0, filt0, out0, out1);                \
    VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2_m, vec3_m);     \
    DPADD_SB2_SH(vec2_m, vec3_m, filt1, filt1, out0, out1);               \
}

#define HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3,                \
                                   mask0, mask1, filt0, filt1,            \
                                   out0, out1, out2, out3)                \
{                                                                         \
    v16i8 vec0_m, vec1_m, vec2_m, vec3_m;                                 \
                                                                          \
    VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m);     \
    VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m);     \
    DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0,      \
                filt0, out0, out1, out2, out3);                           \
    VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0_m, vec1_m);     \
    VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2_m, vec3_m);     \
    DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt1, filt1, filt1,     \
                 filt1, out0, out1, out2, out3);                          \
}

uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
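/* 8-byte-wide block copy: rows travel as 64-bit scalars (LD2/LD4 in,
 * SD/SD4 out); heights 2 and 6 are special-cased, then multiples of 8
 * and of 4 are looped. */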
if (2 == height) {
    LD2(src, src_stride, out0, out1);
    SD(out0, dst);
    dst += dst_stride;
    SD(out1, dst);
} else if (6 == height) {
    LD4(src, src_stride, out0, out1, out2, out3);
    src += (4 * src_stride);
    SD4(out0, out1, out2, out3, dst, dst_stride);
    dst += (4 * dst_stride);
    LD2(src, src_stride, out0, out1);
    SD(out0, dst);
    dst += dst_stride;
    SD(out1, dst);
} else if (0 == (height % 8)) {
    for (cnt = (height >> 3); cnt--;) {
        LD4(src, src_stride, out0, out1, out2, out3);
        src += (4 * src_stride);
        LD4(src, src_stride, out4, out5, out6, out7);
        src += (4 * src_stride);
        SD4(out0, out1, out2, out3, dst, dst_stride);
        dst += (4 * dst_stride);
        SD4(out4, out5, out6, out7, dst, dst_stride);
        dst += (4 * dst_stride);
    }
} else if (0 == (height % 4)) {
    for (cnt = (height >> 2); cnt--;) {
        LD4(src, src_stride, out0, out1, out2, out3);
        src += (4 * src_stride);
        SD4(out0, out1, out2, out3, dst, dst_stride);
        dst += (4 * dst_stride);
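/* 12-byte-wide copy: ST12x8_UB stores the low 12 bytes of each of eight
 * rows; two 8-row batches cover the 16-row block. */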
v16u8 src0, src1, src2, src3, src4, src5, src6, src7;

LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
src += (8 * src_stride);
ST12x8_UB(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
dst += (8 * dst_stride);
LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
ST12x8_UB(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
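/* 16-byte-wide copy: one full vector per row; height 12, then multiples
 * of 8 and of 4. */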
v16u8 src0, src1, src2, src3, src4, src5, src6, src7;

LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
src += (8 * src_stride);
ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
dst += (8 * dst_stride);
LD_UB4(src, src_stride, src0, src1, src2, src3);
src += (4 * src_stride);
ST_UB4(src0, src1, src2, src3, dst, dst_stride);
dst += (4 * dst_stride);
} else if (0 == (height % 8)) {
    for (cnt = (height >> 3); cnt--;) {
        LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6,
               src7);
        src += (8 * src_stride);
        ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst,
               dst_stride);
        dst += (8 * dst_stride);
    }
} else if (0 == (height % 4)) {
    for (cnt = (height >> 2); cnt--;) {
        LD_UB4(src, src_stride, src0, src1, src2, src3);
        src += (4 * src_stride);
        ST_UB4(src0, src1, src2, src3, dst, dst_stride);
        dst += (4 * dst_stride);
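/* 24-byte-wide copy: a 16-byte vector plus an 8-byte scalar per row. */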
v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
uint64_t out0, out1, out2, out3, out4, out5, out6, out7;

for (cnt = 4; cnt--;) {
    LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
    LD4(src + 16, src_stride, out0, out1, out2, out3);
    src += (4 * src_stride);
    LD4(src + 16, src_stride, out4, out5, out6, out7);
    src += (4 * src_stride);

    ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
    SD4(out0, out1, out2, out3, dst + 16, dst_stride);
    dst += (4 * dst_stride);
    SD4(out4, out5, out6, out7, dst + 16, dst_stride);
    dst += (4 * dst_stride);
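/* 32-byte-wide copy: two 16-byte vectors per row. */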
v16u8 src0, src1, src2, src3, src4, src5, src6, src7;

for (cnt = (height >> 2); cnt--;) {
    LD_UB4(src, src_stride, src0, src1, src2, src3);
    LD_UB4(src + 16, src_stride, src4, src5, src6, src7);
    src += (4 * src_stride);
    ST_UB4(src0, src1, src2, src3, dst, dst_stride);
    ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride);
    dst += (4 * dst_stride);
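/* 48-byte-wide copy: three 16-byte vectors per row. */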
v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10,
      src11;

for (cnt = (height >> 2); cnt--;) {
    LD_UB4(src, src_stride, src0, src1, src2, src3);
    LD_UB4(src + 16, src_stride, src4, src5, src6, src7);
    LD_UB4(src + 32, src_stride, src8, src9, src10, src11);
    src += (4 * src_stride);

    ST_UB4(src0, src1, src2, src3, dst, dst_stride);
    ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride);
    ST_UB4(src8, src9, src10, src11, dst + 32, dst_stride);
    dst += (4 * dst_stride);
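/* 64-byte-wide copy: four 16-byte vectors per row, loaded and stored
 * with an element stride of 16 bytes. */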
v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
v16u8 src8, src9, src10, src11, src12, src13, src14, src15;

for (cnt = (height >> 2); cnt--;) {
    LD_UB4(src, 16, src0, src1, src2, src3);
    src += src_stride;
    LD_UB4(src, 16, src4, src5, src6, src7);
    src += src_stride;
    LD_UB4(src, 16, src8, src9, src10, src11);
    src += src_stride;
    LD_UB4(src, 16, src12, src13, src14, src15);
    src += src_stride;

    ST_UB4(src0, src1, src2, src3, dst, 16);
    dst += dst_stride;
    ST_UB4(src4, src5, src6, src7, dst, 16);
    dst += dst_stride;
    ST_UB4(src8, src9, src10, src11, dst, 16);
    dst += dst_stride;
    ST_UB4(src12, src13, src14, src15, dst, 16);
    dst += dst_stride;
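/* Horizontal 8-tap filter, 4x4 block. The srari-by-6 rounding used
 * throughout equals (x + 32) >> 6, matching filter taps that sum to 64;
 * sat_s_h then clamps before packing to bytes. */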
v16u8 mask0, mask1, mask2, mask3, out;
v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
v8i16 filt, out0, out1;

filt = LD_SH(filter);
SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

LD_SB4(src, src_stride, src0, src1, src2, src3);
HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                           mask3, filt0, filt1, filt2, filt3, out0, out1);
ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
v16i8 filt0, filt1, filt2, filt3;
v16u8 mask0, mask1, mask2, mask3, out;
v8i16 filt, out0, out1, out2, out3;

filt = LD_SH(filter);
SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

LD_SB4(src, src_stride, src0, src1, src2, src3);
src += (4 * src_stride);
HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                           mask3, filt0, filt1, filt2, filt3, out0, out1);
LD_SB4(src, src_stride, src0, src1, src2, src3);
HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                           mask3, filt0, filt1, filt2, filt3, out2, out3);
ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
dst += (4 * dst_stride);
ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
v16u8 mask0, mask1, mask2, mask3, out;
v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
v8i16 filt, out0, out1, out2, out3;

filt = LD_SH(filter);
SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

LD_SB4(src, src_stride, src0, src1, src2, src3);
src += (4 * src_stride);
HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                           mask3, filt0, filt1, filt2, filt3, out0, out1);
LD_SB4(src, src_stride, src0, src1, src2, src3);
src += (4 * src_stride);
HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                           mask3, filt0, filt1, filt2, filt3, out2, out3);
ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
dst += (4 * dst_stride);
ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
dst += (4 * dst_stride);

LD_SB4(src, src_stride, src0, src1, src2, src3);
src += (4 * src_stride);
HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                           mask3, filt0, filt1, filt2, filt3, out0, out1);
LD_SB4(src, src_stride, src0, src1, src2, src3);
src += (4 * src_stride);
HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                           mask3, filt0, filt1, filt2, filt3, out2, out3);
ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
dst += (4 * dst_stride);
ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
} else if (8 == height) {
} else if (16 == height) {
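/* Height dispatch for the 4-column case; next comes the 8-column
 * horizontal 8-tap filter, with the HORIZ_8TAP_8WID_4VECS_FILT ladder
 * written out inline. */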
v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
v16u8 mask0, mask1, mask2, mask3, tmp0, tmp1;
v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m;
v8i16 filt, out0, out1, out2, out3;

filt = LD_SH(filter);
SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_SB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);

    VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m);
    VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m);
    DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0,
                out0, out1, out2, out3);
    VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0_m, vec1_m);
    VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2_m, vec3_m);
    DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt2, filt2, filt2, filt2,
                 out0, out1, out2, out3);
    VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4_m, vec5_m);
    VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6_m, vec7_m);
    DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt1, filt1, filt1, filt1,
                 out0, out1, out2, out3);
    VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec4_m, vec5_m);
    VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec6_m, vec7_m);
    DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt3, filt3, filt3, filt3,
                 out0, out1, out2, out3);
    ST8x4_UB(tmp0, tmp1, dst, dst_stride);
    dst += (4 * dst_stride);
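/* 12-column horizontal 8-tap: an 8-column pass (mask00..mask3) plus a
 * 4-column pass (mask0, mask4..mask6) per group of four rows. */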
v16u8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask00;
v16u8 tmp0, tmp1, tmp2;
v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
v16i8 filt0, filt1, filt2, filt3;
v8i16 filt, out0, out1, out2, out3, out4, out5;

filt = LD_SH(filter);
SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

for (loop_cnt = 4; loop_cnt--;) {
    LD_SB4(src, src_stride, src0, src1, src2, src3);
    LD_SB4(src + 8, src_stride, src4, src5, src6, src7);
    src += (4 * src_stride);

    VSHF_B2_SB(src0, src0, src1, src1, mask00, mask00, vec0, vec1);
    VSHF_B2_SB(src2, src2, src3, src3, mask00, mask00, vec2, vec3);
    DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0,
                out1, out2, out3);
    VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0, vec1);
    VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2, vec3);
    DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, out0,
                 out1, out2, out3);
    VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec5);
    VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec7);
    DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, out0,
                 out1, out2, out3);
    VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec4, vec5);
    VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec6, vec7);
    DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt3, filt3, filt3, filt3, out0,
                 out1, out2, out3);

    VSHF_B2_SB(src4, src5, src6, src7, mask0, mask0, vec0, vec1);
    DOTP_SB2_SH(vec0, vec1, filt0, filt0, out4, out5);
    VSHF_B2_SB(src4, src5, src6, src7, mask4, mask4, vec2, vec3);
    DPADD_SB2_SH(vec2, vec3, filt1, filt1, out4, out5);
    VSHF_B2_SB(src4, src5, src6, src7, mask5, mask5, vec4, vec5);
    DPADD_SB2_SH(vec4, vec5, filt2, filt2, out4, out5);
    VSHF_B2_SB(src4, src5, src6, src7, mask6, mask6, vec6, vec7);
    DPADD_SB2_SH(vec6, vec7, filt3, filt3, out4, out5);
    ST8x4_UB(tmp0, tmp1, dst, dst_stride);
    ST4x4_UB(tmp2, tmp2, 0, 1, 2, 3, dst + 8, dst_stride);
    dst += (4 * dst_stride);
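/* 16-column horizontal 8-tap: two 8-column macro invocations per pair
 * of rows. */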
v16u8 mask0, mask1, mask2, mask3, out;
v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
v16i8 filt0, filt1, filt2, filt3;
v8i16 filt, out0, out1, out2, out3;

filt = LD_SH(filter);
SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_SB2(src, src_stride, src0, src2);
    LD_SB2(src + 8, src_stride, src1, src3);
    src += (2 * src_stride);

    LD_SB2(src, src_stride, src4, src6);
    LD_SB2(src + 8, src_stride, src5, src7);
    src += (2 * src_stride);
    HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, out0,
                               out1, out2, out3);
    HORIZ_8TAP_8WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, out0,
                               out1, out2, out3);
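/* 24-column horizontal 8-tap: vec/out8/out9 carry the extra 8 columns;
 * mask4..mask7 shuffle across the two source vectors. */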
v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
v16u8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7, out;
v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10,
      vec11;
v8i16 out0, out1, out2, out3, out8, out9, filt;

filt = LD_SH(filter);
SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

for (loop_cnt = 16; loop_cnt--;) {
    LD_SB2(src, src_stride, src0, src2);
    LD_SB2(src + 16, src_stride, src1, src3);
    src += (2 * src_stride);
    VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec8);
    VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec9);
    VSHF_B2_SB(src0, src1, src2, src3, mask4, mask4, vec1, vec3);
    DOTP_SB4_SH(vec0, vec8, vec2, vec9, filt0, filt0, filt0, filt0, out0,
                out8, out2, out9);
    DOTP_SB2_SH(vec1, vec3, filt0, filt0, out1, out3);

    VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0, vec8);
    VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2, vec9);
    VSHF_B2_SB(src0, src1, src2, src3, mask6, mask6, vec1, vec3);
    DPADD_SB4_SH(vec0, vec8, vec2, vec9, filt2, filt2, filt2, filt2,
                 out0, out8, out2, out9);
    DPADD_SB2_SH(vec1, vec3, filt2, filt2, out1, out3);

    VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec10);
    VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec11);
    VSHF_B2_SB(src0, src1, src2, src3, mask5, mask5, vec5, vec7);
    DPADD_SB4_SH(vec4, vec10, vec6, vec11, filt1, filt1, filt1, filt1,
                 out0, out8, out2, out9);
    DPADD_SB2_SH(vec5, vec7, filt1, filt1, out1, out3);

    VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec4, vec10);
    VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec6, vec11);
    VSHF_B2_SB(src0, src1, src2, src3, mask7, mask7, vec5, vec7);
    DPADD_SB4_SH(vec4, vec10, vec6, vec11, filt3, filt3, filt3, filt3,
                 out0, out8, out2, out9);
    DPADD_SB2_SH(vec5, vec7, filt3, filt3, out1, out3);
    ST8x2_UB(out, dst + 16, dst_stride);
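/* 32-column horizontal 8-tap: each row is filtered as two 16-column
 * halves built from four 16-byte loads. */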
v16u8 mask0, mask1, mask2, mask3, out;
v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
v16i8 filt0, filt1, filt2, filt3;
v8i16 filt, out0, out1, out2, out3;

filt = LD_SH(filter);
SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

for (loop_cnt = (height >> 1); loop_cnt--;) {
    src0 = LD_SB(src);
    src1 = LD_SB(src + 8);
    src2 = LD_SB(src + 16);
    src3 = LD_SB(src + 24);
    src += src_stride;
    src4 = LD_SB(src);
    src5 = LD_SB(src + 8);
    src6 = LD_SB(src + 16);
    src7 = LD_SB(src + 24);
    src += src_stride;

    HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, out0,
                               out1, out2, out3);
    ST_UB(out, dst + 16);
    HORIZ_8TAP_8WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, out0,
                               out1, out2, out3);
    ST_UB(out, dst + 16);
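/* 48-column horizontal 8-tap: VSHF_B3/DOTP_SB3 process three vectors per
 * step; out2 is rounded on its own. */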
v16i8 src0, src1, src2, src3, src4, filt0, filt1, filt2, filt3, vec0,
      vec1, vec2;
v16u8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7, out;
v8i16 filt, out0, out1, out2, out3;

filt = LD_SH(filter);
SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

for (loop_cnt = 64; loop_cnt--;) {
    src0 = LD_SB(src);
    src1 = LD_SB(src + 8);
    src2 = LD_SB(src + 16);
    src3 = LD_SB(src + 32);
    src4 = LD_SB(src + 40);
    src4 = (v16i8) __msa_xori_b((v16u8) src4, 128);

    VSHF_B3_SB(src0, src0, src1, src1, src2, src2, mask0, mask0, mask0,
               vec0, vec1, vec2);
    DOTP_SB3_SH(vec0, vec1, vec2, filt0, filt0, filt0, out0, out1, out2);
    VSHF_B3_SB(src0, src0, src1, src1, src2, src2, mask1, mask1, mask1,
               vec0, vec1, vec2);
    DPADD_SB2_SH(vec0, vec1, filt1, filt1, out0, out1);
    out2 = __msa_dpadd_s_h(out2, vec2, filt1);
    VSHF_B3_SB(src0, src0, src1, src1, src2, src2, mask2, mask2, mask2,
               vec0, vec1, vec2);
    DPADD_SB2_SH(vec0, vec1, filt2, filt2, out0, out1);
    out2 = __msa_dpadd_s_h(out2, vec2, filt2);
    VSHF_B3_SB(src0, src0, src1, src1, src2, src2, mask3, mask3, mask3,
               vec0, vec1, vec2);
    DPADD_SB2_SH(vec0, vec1, filt3, filt3, out0, out1);
    out2 = __msa_dpadd_s_h(out2, vec2, filt3);
    out3 = __msa_srari_h(out2, 6);

    VSHF_B3_SB(src2, src3, src3, src3, src4, src4, mask4, mask0, mask0,
               vec0, vec1, vec2);
    DOTP_SB3_SH(vec0, vec1, vec2, filt0, filt0, filt0, out0, out1, out2);
    VSHF_B3_SB(src2, src3, src3, src3, src4, src4, mask5, mask1, mask1,
               vec0, vec1, vec2);
    DPADD_SB2_SH(vec0, vec1, filt1, filt1, out0, out1);
    out2 = __msa_dpadd_s_h(out2, vec2, filt1);
    VSHF_B3_SB(src2, src3, src3, src3, src4, src4, mask6, mask2, mask2,
               vec0, vec1, vec2);
    DPADD_SB2_SH(vec0, vec1, filt2, filt2, out0, out1);
    out2 = __msa_dpadd_s_h(out2, vec2, filt2);
    VSHF_B3_SB(src2, src3, src3, src3, src4, src4, mask7, mask3, mask3,
               vec0, vec1, vec2);
    DPADD_SB2_SH(vec0, vec1, filt3, filt3, out0, out1);
    out2 = __msa_dpadd_s_h(out2, vec2, filt3);
    out2 = __msa_srari_h(out2, 6);
    ST_UB(out, dst + 16);
    ST_UB(out, dst + 32);
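/* 64-column horizontal 8-tap: one row per iteration from eight 16-byte
 * loads at an 8-byte pitch. */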
v16u8 mask0, mask1, mask2, mask3, out;
v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
v16i8 filt0, filt1, filt2, filt3;
v8i16 res0, res1, res2, res3, filt;

filt = LD_SH(filter);
SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

for (loop_cnt = height; loop_cnt--;) {
    LD_SB8(src, 8, src0, src1, src2, src3, src4, src5, src6, src7);

    VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
    VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
    DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, res0,
                res1, res2, res3);
    VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0, vec1);
    VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2, vec3);
    DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, res0,
                 res1, res2, res3);
    VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec5);
    VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec7);
    DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, res0,
                 res1, res2, res3);
    VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec4, vec5);
    VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec6, vec7);
    DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt3, filt3, filt3, filt3, res0,
                 res1, res2, res3);
    ST_UB(out, dst + 16);

    VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0, vec1);
    VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec2, vec3);
    DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, res0,
                res1, res2, res3);
    VSHF_B2_SB(src4, src4, src5, src5, mask2, mask2, vec0, vec1);
    VSHF_B2_SB(src6, src6, src7, src7, mask2, mask2, vec2, vec3);
    DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, res0,
                 res1, res2, res3);
    VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec4, vec5);
    VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec6, vec7);
    DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, res0,
                 res1, res2, res3);
    VSHF_B2_SB(src4, src4, src5, src5, mask3, mask3, vec4, vec5);
    VSHF_B2_SB(src6, src6, src7, src7, mask3, mask3, vec6, vec7);
    DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt3, filt3, filt3, filt3, res0,
                 res1, res2, res3);
    ST_UB(out, dst + 32);
    ST_UB(out, dst + 48);
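/* Vertical 8-tap, 4-column: rows are byte-interleaved pairwise (ILVR_B*)
 * and the pairs merged doubleword-wise (ILVR_D*) so each dot product
 * covers two output rows at once. */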
v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
v16i8 src11, src12, src13, src14;
v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
v16i8 src65_r, src87_r, src109_r, src2110, src4332, src6554, src8776;
v16i8 src1110_r, src1211_r, src1312_r, src1413_r, src12111110, src14131312;
v16i8 src10998, filt0, filt1, filt2, filt3;
v8i16 filt, out10, out32, out54, out76;

src -= (3 * src_stride);

filt = LD_SH(filter);
SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
src += (7 * src_stride);

ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
           src54_r, src21_r);
ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, src2110,
           src4332, src6554);
for (loop_cnt = (height >> 3); loop_cnt--;) {
    LD_SB4(src, src_stride, src7, src8, src9, src10);
    src += (4 * src_stride);
    LD_SB4(src, src_stride, src11, src12, src13, src14);
    src += (4 * src_stride);

    ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
               src87_r, src98_r, src109_r);
    ILVR_B4_SB(src11, src10, src12, src11, src13, src12, src14, src13,
               src1110_r, src1211_r, src1312_r, src1413_r);
    ILVR_D2_SB(src87_r, src76_r, src109_r, src98_r, src8776, src10998);
    ILVR_D2_SB(src1211_r, src1110_r, src1413_r, src1312_r,
               src12111110, src14131312);
    DOTP_SB2_SH(src2110, src4332, filt0, filt0, out10, out32);
    DOTP_SB2_SH(src6554, src8776, filt0, filt0, out54, out76);
    DPADD_SB2_SH(src4332, src6554, filt1, filt1, out10, out32);
    DPADD_SB2_SH(src8776, src10998, filt1, filt1, out54, out76);
    DPADD_SB2_SH(src6554, src8776, filt2, filt2, out10, out32);
    DPADD_SB2_SH(src10998, src12111110, filt2, filt2, out54, out76);
    DPADD_SB2_SH(src8776, src10998, filt3, filt3, out10, out32);
    DPADD_SB2_SH(src12111110, src14131312, filt3, filt3, out54, out76);
    ST4x4_UB(out0, out0, 0, 1, 2, 3, dst, dst_stride);
    dst += (4 * dst_stride);
    ST4x4_UB(out1, out1, 0, 1, 2, 3, dst, dst_stride);
    dst += (4 * dst_stride);

    src4332 = src12111110;
    src6554 = src14131312;
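/* Vertical 8-tap, 8-column: the right-interleaved row pairs slide down
 * by two source rows per output row. */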
v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
v16i8 src65_r, src87_r, src109_r, filt0, filt1, filt2, filt3;
v8i16 filt, out0_r, out1_r, out2_r, out3_r;

src -= (3 * src_stride);

filt = LD_SH(filter);
SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
src += (7 * src_stride);
ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
           src54_r, src21_r);
ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_SB4(src, src_stride, src7, src8, src9, src10);
    src += (4 * src_stride);

    ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
               src87_r, src98_r, src109_r);
    DOTP_SB4_SH(src10_r, src21_r, src32_r, src43_r, filt0, filt0, filt0,
                filt0, out0_r, out1_r, out2_r, out3_r);
    DPADD_SB4_SH(src32_r, src43_r, src54_r, src65_r, filt1, filt1, filt1,
                 filt1, out0_r, out1_r, out2_r, out3_r);
    DPADD_SB4_SH(src54_r, src65_r, src76_r, src87_r, filt2, filt2, filt2,
                 filt2, out0_r, out1_r, out2_r, out3_r);
    DPADD_SB4_SH(src76_r, src87_r, src98_r, src109_r, filt3, filt3, filt3,
                 filt3, out0_r, out1_r, out2_r, out3_r);
    SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
    ST8x4_UB(tmp0, tmp1, dst, dst_stride);
    dst += (4 * dst_stride);
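/* Vertical 8-tap, 12-column: right interleaves produce the low 8 columns,
 * left interleaves the 4-column tail; results leave through 64-bit and
 * 32-bit scalar copies. */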
uint32_t out2, out3;
uint64_t out0, out1;
v16u8 tmp0, tmp1, tmp2, tmp3;
v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
v16i8 filt0, filt1, filt2, filt3;
v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;

src -= (3 * src_stride);

filt = LD_SH(filter);
SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
src += (7 * src_stride);

ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
           src54_r, src21_r);
ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l, src32_l,
           src54_l, src21_l);
ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
for (loop_cnt = 4; loop_cnt--;) {
    LD_SB4(src, src_stride, src7, src8, src9, src10);
    src += (4 * src_stride);

    ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
               src87_r, src98_r, src109_r);
    ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l,
               src87_l, src98_l, src109_l);
    out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, filt0,
                                 filt1, filt2, filt3);
    out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0,
                                 filt1, filt2, filt3);
    out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0,
                                 filt1, filt2, filt3);
    out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, filt0,
                                 filt1, filt2, filt3);
    out0_l = FILT_8TAP_DPADD_S_H(src10_l, src32_l, src54_l, src76_l, filt0,
                                 filt1, filt2, filt3);
    out1_l = FILT_8TAP_DPADD_S_H(src21_l, src43_l, src65_l, src87_l, filt0,
                                 filt1, filt2, filt3);
    out2_l = FILT_8TAP_DPADD_S_H(src32_l, src54_l, src76_l, src98_l, filt0,
                                 filt1, filt2, filt3);
    out3_l = FILT_8TAP_DPADD_S_H(src43_l, src65_l, src87_l, src109_l, filt0,
                                 filt1, filt2, filt3);
    SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
    SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
    PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
                out3_r, tmp0, tmp1, tmp2, tmp3);

    out0 = __msa_copy_u_d((v2i64) tmp0, 0);
    out1 = __msa_copy_u_d((v2i64) tmp1, 0);
    out2 = __msa_copy_u_w((v4i32) tmp0, 2);
    out3 = __msa_copy_u_w((v4i32) tmp1, 2);
    SD(out0, dst);
    SW(out2, (dst + 8));
    dst += dst_stride;
    SD(out1, dst);
    SW(out3, (dst + 8));
    dst += dst_stride;

    out0 = __msa_copy_u_d((v2i64) tmp2, 0);
    out1 = __msa_copy_u_d((v2i64) tmp3, 0);
    out2 = __msa_copy_u_w((v4i32) tmp2, 2);
    out3 = __msa_copy_u_w((v4i32) tmp3, 2);
    SD(out0, dst);
    SW(out2, (dst + 8));
    dst += dst_stride;
    SD(out1, dst);
    SW(out3, (dst + 8));
    dst += dst_stride;
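/* Vertical 8-tap, 16-column: both interleave halves are filtered and
 * repacked with PCKEV_B4_UB. */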
v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
v16i8 filt0, filt1, filt2, filt3;
v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
v16u8 tmp0, tmp1, tmp2, tmp3;
v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;

src -= (3 * src_stride);

filt = LD_SH(filter);
SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
src += (7 * src_stride);
ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
           src54_r, src21_r);
ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l, src32_l,
           src54_l, src21_l);
ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_SB4(src, src_stride, src7, src8, src9, src10);
    src += (4 * src_stride);

    ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
               src87_r, src98_r, src109_r);
    ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l,
               src87_l, src98_l, src109_l);
    out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, filt0,
                                 filt1, filt2, filt3);
    out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0,
                                 filt1, filt2, filt3);
    out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0,
                                 filt1, filt2, filt3);
    out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, filt0,
                                 filt1, filt2, filt3);
    out0_l = FILT_8TAP_DPADD_S_H(src10_l, src32_l, src54_l, src76_l, filt0,
                                 filt1, filt2, filt3);
    out1_l = FILT_8TAP_DPADD_S_H(src21_l, src43_l, src65_l, src87_l, filt0,
                                 filt1, filt2, filt3);
    out2_l = FILT_8TAP_DPADD_S_H(src32_l, src54_l, src76_l, src98_l, filt0,
                                 filt1, filt2, filt3);
    out3_l = FILT_8TAP_DPADD_S_H(src43_l, src65_l, src87_l, src109_l, filt0,
                                 filt1, filt2, filt3);
    SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
    SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
    PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
                out3_r, tmp0, tmp1, tmp2, tmp3);

    ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
    dst += (4 * dst_stride);
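/* The same 16-column kernel wrapped in a width/16 loop (src_tmp/dst_tmp
 * step across the row) for wider blocks. */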
uint32_t loop_cnt, cnt;
v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
v16i8 filt0, filt1, filt2, filt3;
v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
v16u8 tmp0, tmp1, tmp2, tmp3;
v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;

src -= (3 * src_stride);

filt = LD_SH(filter);
SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

for (cnt = (width >> 4); cnt--;) {
    LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src_tmp += (7 * src_stride);
    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r,
               src32_r, src54_r, src21_r);
    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
    ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l,
               src32_l, src54_l, src21_l);
    ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src_tmp, src_stride, src7, src8, src9, src10);
        src_tmp += (4 * src_stride);
        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
                   src87_r, src98_r, src109_r);
        ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l,
                   src87_l, src98_l, src109_l);
        out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r,
                                     filt0, filt1, filt2, filt3);
        out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r,
                                     filt0, filt1, filt2, filt3);
        out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r,
                                     filt0, filt1, filt2, filt3);
        out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r,
                                     filt0, filt1, filt2, filt3);
        out0_l = FILT_8TAP_DPADD_S_H(src10_l, src32_l, src54_l, src76_l,
                                     filt0, filt1, filt2, filt3);
        out1_l = FILT_8TAP_DPADD_S_H(src21_l, src43_l, src65_l, src87_l,
                                     filt0, filt1, filt2, filt3);
        out2_l = FILT_8TAP_DPADD_S_H(src32_l, src54_l, src76_l, src98_l,
                                     filt0, filt1, filt2, filt3);
        out3_l = FILT_8TAP_DPADD_S_H(src43_l, src65_l, src87_l, src109_l,
                                     filt0, filt1, filt2, filt3);
        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
        SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
        PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
                    out3_r, tmp0, tmp1, tmp2, tmp3);

        ST_UB4(tmp0, tmp1, tmp2, tmp3, dst_tmp, dst_stride);
        dst_tmp += (4 * dst_stride);
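/* 2D 8-tap filter, 4-column: a horizontal pass produces halfword rows
 * (dst30..dst1410 each hold two interleaved rows), then HEVC_FILT_8TAP
 * filters vertically at 32-bit precision and SRA_4V by 6 removes the
 * first-stage headroom. */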
const int8_t *filter_x,
const int8_t *filter_y,

v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
v16i8 src9, src10, src11, src12, src13, src14;
v8i16 filt0, filt1, filt2, filt3;
v8i16 filt_h0, filt_h1, filt_h2, filt_h3;
v16i8 mask1, mask2, mask3;
v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
v8i16 dst30, dst41, dst52, dst63, dst66, dst117, dst128, dst139, dst1410;
v8i16 dst10_r, dst32_r, dst54_r, dst76_r, dst98_r, dst1110_r, dst1312_r;
v8i16 dst21_r, dst43_r, dst65_r, dst87_r, dst109_r, dst1211_r, dst1413_r;
v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r, dst6_r, dst7_r;

src -= ((3 * src_stride) + 3);
filter_vec = LD_SH(filter_x);
SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

filter_vec = LD_SH(filter_y);
SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
src += (7 * src_stride);

VSHF_B4_SB(src0, src3, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
VSHF_B4_SB(src1, src4, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
VSHF_B4_SB(src2, src5, mask0, mask1, mask2, mask3,
           vec8, vec9, vec10, vec11);
VSHF_B4_SB(src3, src6, mask0, mask1, mask2, mask3,
           vec12, vec13, vec14, vec15);
dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1);
for (loop_cnt = height >> 3; loop_cnt--;) {
    LD_SB8(src, src_stride, src7, src8, src9, src10, src11, src12, src13,
           src14);
    src += (8 * src_stride);
    XORI_B8_128_SB(src7, src8, src9, src10, src11, src12, src13, src14);

    VSHF_B4_SB(src7, src11, mask0, mask1, mask2, mask3,
               vec0, vec1, vec2, vec3);
    VSHF_B4_SB(src8, src12, mask0, mask1, mask2, mask3,
               vec4, vec5, vec6, vec7);
    VSHF_B4_SB(src9, src13, mask0, mask1, mask2, mask3,
               vec8, vec9, vec10, vec11);
    VSHF_B4_SB(src10, src14, mask0, mask1, mask2, mask3,
               vec12, vec13, vec14, vec15);
    dst76_r = __msa_ilvr_h(dst117, dst66);
    ILVRL_H2_SH(dst1410, dst139, dst109_r, dst1413_r);
    dst117 = (v8i16) __msa_splati_d((v2i64) dst117, 1);
    dst1110_r = __msa_ilvr_h(dst117, dst1410);

    dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r, filt_h0,
                            filt_h1, filt_h2, filt_h3);
    dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r, filt_h0,
                            filt_h1, filt_h2, filt_h3);
    dst2_r = HEVC_FILT_8TAP(dst32_r, dst54_r, dst76_r, dst98_r, filt_h0,
                            filt_h1, filt_h2, filt_h3);
    dst3_r = HEVC_FILT_8TAP(dst43_r, dst65_r, dst87_r, dst109_r, filt_h0,
                            filt_h1, filt_h2, filt_h3);
    dst4_r = HEVC_FILT_8TAP(dst54_r, dst76_r, dst98_r, dst1110_r, filt_h0,
                            filt_h1, filt_h2, filt_h3);
    dst5_r = HEVC_FILT_8TAP(dst65_r, dst87_r, dst109_r, dst1211_r, filt_h0,
                            filt_h1, filt_h2, filt_h3);
    dst6_r = HEVC_FILT_8TAP(dst76_r, dst98_r, dst1110_r, dst1312_r, filt_h0,
                            filt_h1, filt_h2, filt_h3);
    dst7_r = HEVC_FILT_8TAP(dst87_r, dst109_r, dst1211_r, dst1413_r,
                            filt_h0, filt_h1, filt_h2, filt_h3);

    SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
    SRA_4V(dst4_r, dst5_r, dst6_r, dst7_r, 6);
    SAT_SW4_SW(dst0_r, dst1_r, dst2_r, dst3_r, 7);
    SAT_SW4_SW(dst4_r, dst5_r, dst6_r, dst7_r, 7);
    PCKEV_H2_SW(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
    PCKEV_H2_SW(dst5_r, dst4_r, dst7_r, dst6_r, dst4_r, dst5_r);
    ST4x4_UB(out0, out0, 0, 1, 2, 3, dst, dst_stride);
    dst += (4 * dst_stride);
    ST4x4_UB(out1, out1, 0, 1, 2, 3, dst, dst_stride);
    dst += (4 * dst_stride);

    dst32_r = dst1110_r;
    dst54_r = dst1312_r;
    dst43_r = dst1211_r;
    dst65_r = dst1413_r;
    dst66 = (v8i16) __msa_splati_d((v2i64) dst1410, 1);
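/* Generic 2D 8-tap kernel processing the block in 8-column stripes, two
 * output rows per inner iteration; the fixed-width wrappers below feed
 * it the stripe count. */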
const int8_t *filter_x,
const int8_t *filter_y,

uint32_t loop_cnt, cnt;
v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
v8i16 filt0, filt1, filt2, filt3;
v8i16 filt_h0, filt_h1, filt_h2, filt_h3;
v16i8 mask1, mask2, mask3;
v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
v8i16 dst10_l, dst32_l, dst54_l, dst76_l;
v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
v8i16 dst21_l, dst43_l, dst65_l, dst87_l;

src -= ((3 * src_stride) + 3);

filter_vec = LD_SH(filter_x);
SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

filter_vec = LD_SH(filter_y);
SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
for (cnt = width >> 3; cnt--;) {
    LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src_tmp += (7 * src_stride);

    VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
               vec0, vec1, vec2, vec3);
    VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
               vec4, vec5, vec6, vec7);
    VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
               vec8, vec9, vec10, vec11);
    VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
               vec12, vec13, vec14, vec15);
    VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3,
               vec0, vec1, vec2, vec3);
    VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3,
               vec4, vec5, vec6, vec7);
    VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3,
               vec8, vec9, vec10, vec11);
    for (loop_cnt = height >> 1; loop_cnt--;) {
        LD_SB2(src_tmp, src_stride, src7, src8);
        src_tmp += 2 * src_stride;

        ILVR_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1,
                   dst10_r, dst32_r, dst54_r, dst21_r);
        ILVL_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1,
                   dst10_l, dst32_l, dst54_l, dst21_l);
        ILVR_H2_SH(dst4, dst3, dst6, dst5, dst43_r, dst65_r);
        ILVL_H2_SH(dst4, dst3, dst6, dst5, dst43_l, dst65_l);

        VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r,
                                filt_h0, filt_h1, filt_h2, filt_h3);
        dst0_l = HEVC_FILT_8TAP(dst10_l, dst32_l, dst54_l, dst76_l,
                                filt_h0, filt_h1, filt_h2, filt_h3);

        VSHF_B4_SB(src8, src8, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r,
                                filt_h0, filt_h1, filt_h2, filt_h3);
        dst1_l = HEVC_FILT_8TAP(dst21_l, dst43_l, dst65_l, dst87_l,
                                filt_h0, filt_h1, filt_h2, filt_h3);
        SAT_SW4_SW(dst0_r, dst0_l, dst1_r, dst1_l, 7);
        PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0, dst1);
        ST8x2_UB(out, dst_tmp, dst_stride);
        dst_tmp += (2 * dst_stride);
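/* 8-column wrapper over the stripe kernel, then the 12-column kernel:
 * 8 columns go through the stripe loop, the remaining 4 reuse the
 * 4-column path with mask4..mask7. */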
const int8_t *filter_x,
const int8_t *filter_y,

filter_x, filter_y, height, 8);
const int8_t *filter_x,
const int8_t *filter_y,

v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
v16i8 src11, src12, src13, src14;
v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
v8i16 dst30, dst41, dst52, dst63, dst66, dst117, dst128, dst139, dst1410;
v8i16 filt0, filt1, filt2, filt3, filt_h0, filt_h1, filt_h2, filt_h3;
v8i16 dst10_r, dst32_r, dst54_r, dst76_r, dst21_r, dst43_r, dst65_r;
v8i16 dst10_l, dst32_l, dst54_l, dst76_l, dst21_l, dst43_l, dst65_l;
v8i16 dst87_r, dst98_r, dst1110_r, dst1312_r, dst109_r, dst1211_r;
v8i16 dst1413_r, dst87_l, filter_vec;
v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r, dst6_r, dst7_r;
v4i32 dst0_l, dst1_l;

src -= ((3 * src_stride) + 3);

filter_vec = LD_SH(filter_x);
SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

filter_vec = LD_SH(filter_y);
SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
src_tmp += (7 * src_stride);

VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3, vec8, vec9, vec10,
           vec11);
VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3, vec12, vec13, vec14,
           vec15);
VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3, vec8, vec9, vec10,
           vec11);
1702 for (loop_cnt = 8; loop_cnt--;) {
1703 LD_SB2(src_tmp, src_stride, src7, src8);
1705 src_tmp += 2 * src_stride;
1707 ILVR_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1, dst10_r,
1708 dst32_r, dst54_r, dst21_r);
1709 ILVL_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1, dst10_l,
1710 dst32_l, dst54_l, dst21_l);
1711 ILVR_H2_SH(dst4, dst3, dst6, dst5, dst43_r, dst65_r);
1712 ILVL_H2_SH(dst4, dst3, dst6, dst5, dst43_l, dst65_l);
1714 VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
1721 filt_h0, filt_h1, filt_h2, filt_h3);
1723 filt_h0, filt_h1, filt_h2, filt_h3);
1727 VSHF_B4_SB(src8, src8, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
1734 filt_h0, filt_h1, filt_h2, filt_h3);
1736 filt_h0, filt_h1, filt_h2, filt_h3);
1740 SAT_SW4_SW(dst0_r, dst0_l, dst1_r, dst1_l, 7);
1742 PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0, dst1);
1744 ST8x2_UB(out0, dst_tmp, dst_stride);
1745 dst_tmp += (2 * dst_stride);
LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
src += (7 * src_stride);

VSHF_B4_SB(src0, src3, mask4, mask5, mask6, mask7, vec0, vec1, vec2, vec3);
VSHF_B4_SB(src1, src4, mask4, mask5, mask6, mask7, vec4, vec5, vec6, vec7);
VSHF_B4_SB(src2, src5, mask4, mask5, mask6, mask7, vec8, vec9, vec10,
           vec11);
VSHF_B4_SB(src3, src6, mask4, mask5, mask6, mask7, vec12, vec13, vec14,
           vec15);
dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1);
for (loop_cnt = 2; loop_cnt--;) {
    LD_SB8(src, src_stride, src7, src8, src9, src10, src11, src12, src13,
           src14);
    src += (8 * src_stride);
    XORI_B8_128_SB(src7, src8, src9, src10, src11, src12, src13, src14);

    VSHF_B4_SB(src7, src11, mask4, mask5, mask6, mask7, vec0, vec1, vec2,
               vec3);
    VSHF_B4_SB(src8, src12, mask4, mask5, mask6, mask7, vec4, vec5, vec6,
               vec7);
    VSHF_B4_SB(src9, src13, mask4, mask5, mask6, mask7, vec8, vec9, vec10,
               vec11);
    VSHF_B4_SB(src10, src14, mask4, mask5, mask6, mask7, vec12, vec13,
               vec14, vec15);
    dst76_r = __msa_ilvr_h(dst117, dst66);
    ILVRL_H2_SH(dst1410, dst139, dst109_r, dst1413_r);
    dst117 = (v8i16) __msa_splati_d((v2i64) dst117, 1);
    dst1110_r = __msa_ilvr_h(dst117, dst1410);

    dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r, filt_h0,
                            filt_h1, filt_h2, filt_h3);
    dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r, filt_h0,
                            filt_h1, filt_h2, filt_h3);
    dst2_r = HEVC_FILT_8TAP(dst32_r, dst54_r, dst76_r, dst98_r, filt_h0,
                            filt_h1, filt_h2, filt_h3);
    dst3_r = HEVC_FILT_8TAP(dst43_r, dst65_r, dst87_r, dst109_r, filt_h0,
                            filt_h1, filt_h2, filt_h3);
    dst4_r = HEVC_FILT_8TAP(dst54_r, dst76_r, dst98_r, dst1110_r, filt_h0,
                            filt_h1, filt_h2, filt_h3);
    dst5_r = HEVC_FILT_8TAP(dst65_r, dst87_r, dst109_r, dst1211_r, filt_h0,
                            filt_h1, filt_h2, filt_h3);
    dst6_r = HEVC_FILT_8TAP(dst76_r, dst98_r, dst1110_r, dst1312_r, filt_h0,
                            filt_h1, filt_h2, filt_h3);
    dst7_r = HEVC_FILT_8TAP(dst87_r, dst109_r, dst1211_r, dst1413_r,
                            filt_h0, filt_h1, filt_h2, filt_h3);

    SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
    SRA_4V(dst4_r, dst5_r, dst6_r, dst7_r, 6);
    SAT_SW4_SW(dst0_r, dst1_r, dst2_r, dst3_r, 7);
    SAT_SW4_SW(dst4_r, dst5_r, dst6_r, dst7_r, 7);
    PCKEV_H2_SW(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
    PCKEV_H2_SW(dst5_r, dst4_r, dst7_r, dst6_r, dst4_r, dst5_r);
    ST4x4_UB(out0, out0, 0, 1, 2, 3, dst, dst_stride);
    dst += (4 * dst_stride);
    ST4x4_UB(out1, out1, 0, 1, 2, 3, dst, dst_stride);
    dst += (4 * dst_stride);

    dst32_r = dst1110_r;
    dst54_r = dst1312_r;
    dst43_r = dst1211_r;
    dst65_r = dst1413_r;
    dst66 = (v8i16) __msa_splati_d((v2i64) dst1410, 1);
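/* Fixed-width wrappers: forward to the stripe kernel with width 16, 24,
 * 32, 48 or 64. */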
const int8_t *filter_x,
const int8_t *filter_y,

filter_x, filter_y, height, 16);

const int8_t *filter_x,
const int8_t *filter_y,

filter_x, filter_y, height, 24);

const int8_t *filter_x,
const int8_t *filter_y,

filter_x, filter_y, height, 32);

const int8_t *filter_x,
const int8_t *filter_y,

filter_x, filter_y, height, 48);

const int8_t *filter_x,
const int8_t *filter_y,

filter_x, filter_y, height, 64);
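/* Horizontal 4-tap filters: only two DOTP/DPADD stages per output.
 * The 4x2, 4x4, 4x8 and 4x16 block kernels follow. */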
v16i8 filt0, filt1, src0, src1, mask0, mask1, vec0, vec1;

filt = LD_SH(filter);

LD_SB2(src, src_stride, src0, src1);
VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
res0 = __msa_srari_h(res0, 6);
res0 = __msa_sat_s_h(res0, 7);
v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1;
v8i16 filt, out0, out1;

filt = LD_SH(filter);

LD_SB4(src, src_stride, src0, src1, src2, src3);
HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
                           filt0, filt1, out0, out1);
ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1;
v8i16 filt, out0, out1, out2, out3;

filt = LD_SH(filter);

LD_SB4(src, src_stride, src0, src1, src2, src3);
src += (4 * src_stride);
HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
                           filt0, filt1, out0, out1);
LD_SB4(src, src_stride, src0, src1, src2, src3);
HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
                           filt0, filt1, out2, out3);
ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
dst += (4 * dst_stride);
ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
v16i8 filt0, filt1, mask0, mask1;
v8i16 filt, out0, out1, out2, out3;

filt = LD_SH(filter);

LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
src += (8 * src_stride);
HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
                           filt0, filt1, out0, out1);
HORIZ_4TAP_4WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1,
                           filt0, filt1, out2, out3);
ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
dst += (4 * dst_stride);
ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
dst += (4 * dst_stride);

LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
src += (8 * src_stride);
HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
                           filt0, filt1, out0, out1);
HORIZ_4TAP_4WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1,
                           filt0, filt1, out2, out3);
ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
dst += (4 * dst_stride);
ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
} else if (4 == height) {
} else if (8 == height) {
} else if (16 == height) {
v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1;
v8i16 filt, out0, out1, out2, out3;

filt = LD_SH(filter);

LD_SB4(src, src_stride, src0, src1, src2, src3);
src += (4 * src_stride);
HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0,
                           filt1, out0, out1, out2, out3);
ST6x4_UB(out4, out5, dst, dst_stride);
dst += (4 * dst_stride);

LD_SB4(src, src_stride, src0, src1, src2, src3);
src += (4 * src_stride);
HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0,
                           filt1, out0, out1, out2, out3);
ST6x4_UB(out4, out5, dst, dst_stride);
dst += (4 * dst_stride);
v16i8 src0, src1, filt0, filt1, mask0, mask1;
v8i16 filt, vec0, vec1, vec2, vec3;

filt = LD_SH(filter);

for (loop_cnt = (height >> 1); loop_cnt--;) {
    LD_SB2(src, src_stride, src0, src1);
    src += (2 * src_stride);

    VSHF_B2_SH(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
    DOTP_SB2_SH(vec0, vec1, filt0, filt0, vec0, vec1);
    VSHF_B2_SH(src0, src0, src1, src1, mask1, mask1, vec2, vec3);
    dst += (2 * dst_stride);
v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1;
v8i16 filt, out0, out1, out2, out3;

filt = LD_SH(filter);

for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_SB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);
    HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0,
                               filt1, out0, out1, out2, out3);
    ST8x4_UB(tmp0, tmp1, dst, dst_stride);
    dst += (4 * dst_stride);
if ((2 == height) || (6 == height)) {
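/* 12-column horizontal 4-tap: mask2/mask3 cover the 4-column tail,
 * mask0/mask1 the 8-column body. */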
v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1, mask2, mask3;
v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
v16i8 vec10, vec11;
v8i16 filt, out0, out1, out2, out3, out4, out5;

filt = LD_SH(filter);

for (loop_cnt = 4; loop_cnt--;) {
    LD_SB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);

    VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec0, vec1);
    DOTP_SB2_SH(vec0, vec1, filt0, filt0, out0, out1);
    VSHF_B2_SB(src0, src1, src2, src3, mask3, mask3, vec2, vec3);
    ST4x4_UB(tmp0, tmp0, 0, 1, 2, 3, dst + 8, dst_stride);

    VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec4, vec5);
    VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec6, vec7);
    DOTP_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
                out2, out3, out4, out5);
    VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec8, vec9);
    VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec10, vec11);
    DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt1, filt1, filt1, filt1,
                 out2, out3, out4, out5);
    ST8x4_UB(tmp0, tmp1, dst, dst_stride);
    dst += (4 * dst_stride);
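/* 16-column horizontal 4-tap: two 8-column groups per quad of rows. */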
v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
v16i8 filt0, filt1, mask0, mask1;
v16i8 vec0_m, vec1_m, vec2_m, vec3_m;
v8i16 filt, out0, out1, out2, out3, out4, out5, out6, out7;

filt = LD_SH(filter);

for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_SB4(src, src_stride, src0, src2, src4, src6);
    LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
    src += (4 * src_stride);

    VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m);
    VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m);
    DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0,
                out0, out1, out2, out3);
    VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0_m, vec1_m);
    VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2_m, vec3_m);
    DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt1, filt1, filt1, filt1,
                 out0, out1, out2, out3);

    VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0_m, vec1_m);
    VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec2_m, vec3_m);
    DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0,
                out4, out5, out6, out7);
    VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec0_m, vec1_m);
    VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec2_m, vec3_m);
    DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt1, filt1, filt1, filt1,
                 out4, out5, out6, out7);
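/* 24-column horizontal 4-tap: mask00/mask11 shuffle across the 16-byte
 * vector boundary between paired loads. */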
v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
v16i8 filt0, filt1, mask0, mask1, mask00, mask11;
v8i16 filt, out0, out1, out2, out3;

filt = LD_SH(filter);

mask11 = mask0 + 10;

for (loop_cnt = 8; loop_cnt--;) {
    LD_SB4(src, src_stride, src0, src2, src4, src6);
    LD_SB4(src + 16, src_stride, src1, src3, src5, src7);
    src += (4 * src_stride);

    VSHF_B2_SB(src0, src0, src0, src1, mask0, mask00, vec0, vec1);
    VSHF_B2_SB(src2, src2, src2, src3, mask0, mask00, vec2, vec3);
    VSHF_B2_SB(src0, src0, src0, src1, mask1, mask11, vec4, vec5);
    VSHF_B2_SB(src2, src2, src2, src3, mask1, mask11, vec6, vec7);
    DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
                out0, out1, out2, out3);
    DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1,
                 out0, out1, out2, out3);

    VSHF_B2_SB(src4, src4, src4, src5, mask0, mask00, vec0, vec1);
    VSHF_B2_SB(src6, src6, src6, src7, mask0, mask00, vec2, vec3);
    VSHF_B2_SB(src4, src4, src4, src5, mask1, mask11, vec4, vec5);
    VSHF_B2_SB(src6, src6, src6, src7, mask1, mask11, vec6, vec7);
    DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
                out0, out1, out2, out3);
    DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1,
                 out0, out1, out2, out3);

    VSHF_B2_SB(src1, src1, src3, src3, mask0, mask0, vec0, vec1);
    VSHF_B2_SB(src5, src5, src7, src7, mask0, mask0, vec2, vec3);
    VSHF_B2_SB(src1, src1, src3, src3, mask1, mask1, vec4, vec5);
    VSHF_B2_SB(src5, src5, src7, src7, mask1, mask1, vec6, vec7);

    DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
                out0, out1, out2, out3);
    DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1,
                 out0, out1, out2, out3);
    ST8x4_UB(tmp0, tmp1, dst1, dst_stride);
    dst1 += (4 * dst_stride);
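/* 32-column horizontal 4-tap: two 16-column halves per row pair. */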
v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
v16i8 filt0, filt1, mask0, mask1;
v16i8 vec0_m, vec1_m, vec2_m, vec3_m;
v8i16 filt, out0, out1, out2, out3, out4, out5, out6, out7;

filt = LD_SH(filter);

for (loop_cnt = (height >> 1); loop_cnt--;) {
    src0 = LD_SB(src);
    src1 = LD_SB(src + 8);
    src2 = LD_SB(src + 16);
    src3 = LD_SB(src + 24);
    src += src_stride;
    src4 = LD_SB(src);
    src5 = LD_SB(src + 8);
    src6 = LD_SB(src + 16);
    src7 = LD_SB(src + 24);
    src += src_stride;

    VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m);
    VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m);
    DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0,
                out0, out1, out2, out3);
    VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0_m, vec1_m);
    VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2_m, vec3_m);
    DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt1, filt1, filt1, filt1,
                 out0, out1, out2, out3);

    VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0_m, vec1_m);
    VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec2_m, vec3_m);
    DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0,
                out4, out5, out6, out7);
    VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec0_m, vec1_m);
    VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec2_m, vec3_m);
    DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt1, filt1, filt1, filt1,
                 out4, out5, out6, out7);
    ST_UB(out, dst + 16);
    ST_UB(out, dst + 16);
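/* Vertical 4-tap filters: 4-column kernels first (row pairs interleaved
 * bytewise, then merged doubleword-wise, as in the 8-tap versions). */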
v16i8 src0, src1, src2, src3, src4, src10_r, src32_r, src21_r, src43_r;
v16i8 src2110, src4332, filt0, filt1;

filt = LD_SH(filter);

LD_SB3(src, src_stride, src0, src1, src2);
src += (3 * src_stride);

ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
LD_SB2(src, src_stride, src3, src4);
ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_r, (v2i64) src32_r);
src4332 = (v16i8) __msa_xori_b((v16u8) src4332, 128);
out10 = __msa_srari_h(out10, 6);
out10 = __msa_sat_s_h(out10, 7);
v16i8 src0, src1, src2, src3, src4, src5;
v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r;
v16i8 src2110, src4332, filt0, filt1;
v8i16 filt, out10, out32;

filt = LD_SH(filter);

LD_SB3(src, src_stride, src0, src1, src2);
src += (3 * src_stride);

ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);

src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);

for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_SB3(src, src_stride, src3, src4, src5);
    src += (3 * src_stride);
    ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
    src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_r, (v2i64) src32_r);
    src4332 = (v16i8) __msa_xori_b((v16u8) src4332, 128);
    src += (src_stride);
    ILVR_B2_SB(src5, src4, src2, src5, src54_r, src65_r);
    src2110 = (v16i8) __msa_ilvr_d((v2i64) src65_r, (v2i64) src54_r);
    src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
    dst += (4 * dst_stride);
v16i8 src0, src1, src2, src3, src4, src5, src6;
v16i8 src10_r, src32_r, src21_r, src43_r, src54_r, src65_r;
v8i16 dst0_r, dst1_r, dst2_r, dst3_r, filt0, filt1, filter_vec;

filter_vec = LD_SH(filter);

LD_SB3(src, src_stride, src0, src1, src2);
src += (3 * src_stride);

ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);

LD_SB2(src, src_stride, src3, src4);
src += (2 * src_stride);

ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);

LD_SB2(src, src_stride, src5, src6);
src += (2 * src_stride);

ILVR_B2_SB(src5, src4, src6, src5, src54_r, src65_r);
SAT_SH4_SH(dst0_r, dst1_r, dst2_r, dst3_r, 7);
ST6x4_UB(out0, out1, dst, dst_stride);
dst += (4 * dst_stride);

LD_SB2(src, src_stride, src3, src4);
src += (2 * src_stride);

ILVR_B2_SB(src3, src6, src4, src3, src32_r, src43_r);

LD_SB2(src, src_stride, src5, src6);
src += (2 * src_stride);

ILVR_B2_SB(src5, src4, src6, src5, src54_r, src65_r);
SAT_SH4_SH(dst0_r, dst1_r, dst2_r, dst3_r, 7);
ST6x4_UB(out0, out1, dst, dst_stride);
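/* 8-column vertical 4-tap kernels: height 2, height 6, then the
 * multiple-of-4 loop. */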
v16i8 src0, src1, src2, src3, src4;
v8i16 src01, src12, src23, src34, tmp0, tmp1, filt, filt0, filt1;

filt = LD_SH(filter);

LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
ILVR_B2_SH(src1, src0, src3, src2, src01, src23);
ILVR_B2_SH(src2, src1, src4, src3, src12, src34);
uint64_t out0, out1, out2;
v16i8 src0, src1, src2, src3, src4, src5;
v8i16 vec0, vec1, vec2, vec3, vec4, tmp0, tmp1, tmp2;
v8i16 filt, filt0, filt1;

filt = LD_SH(filter);

LD_SB3(src, src_stride, src0, src1, src2);
src += (3 * src_stride);

ILVR_B2_SH(src1, src0, src2, src1, vec0, vec2);

for (loop_cnt = 2; loop_cnt--;) {
    LD_SB3(src, src_stride, src3, src4, src5);
    src += (3 * src_stride);

    ILVR_B3_SH(src3, src2, src4, src3, src5, src4, vec1, vec3, vec4);
    tmp2 = __msa_srari_h(tmp2, 6);

    out0 = __msa_copy_u_d((v2i64) tmp0, 0);
    out1 = __msa_copy_u_d((v2i64) tmp0, 1);
    out2 = __msa_copy_u_d((v2i64) tmp2, 0);
v16i8 src0, src1, src2, src7, src8, src9, src10;
v16i8 src10_r, src72_r, src98_r, src21_r, src87_r, src109_r, filt0, filt1;
v8i16 filt, out0_r, out1_r, out2_r, out3_r;

filt = LD_SH(filter);

LD_SB3(src, src_stride, src0, src1, src2);
src += (3 * src_stride);

ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);

for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_SB4(src, src_stride, src7, src8, src9, src10);
    src += (4 * src_stride);

    ILVR_B4_SB(src7, src2, src8, src7, src9, src8, src10, src9,
               src72_r, src87_r, src98_r, src109_r);
    SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
    ST8x4_UB(tmp0, tmp1, dst, dst_stride);
    dst += (4 * dst_stride);
} else if (6 == height) {
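/* 12-column vertical 4-tap: low 8 columns from right interleaves, the
 * 4-column tail from left interleaves merged doubleword-wise. */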
v16i8 src0, src1, src2, src3, src4, src5, src6;
v16i8 src10_r, src32_r, src21_r, src43_r, src54_r, src65_r;
v16i8 src10_l, src32_l, src54_l, src21_l, src43_l, src65_l;
v16i8 src2110, src4332, src6554;
v8i16 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, filt0, filt1;

src -= (1 * src_stride);

filter_vec = LD_SH(filter);

LD_SB3(src, src_stride, src0, src1, src2);
src += (3 * src_stride);

ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_l, (v2i64) src10_l);

for (loop_cnt = 4; loop_cnt--;) {
    LD_SB4(src, src_stride, src3, src4, src5, src6);
    src += (4 * src_stride);

    ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
    ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
    src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_l, (v2i64) src32_l);
    ILVR_B2_SB(src5, src4, src6, src5, src54_r, src65_r);
    ILVL_B2_SB(src5, src4, src6, src5, src54_l, src65_l);
    src6554 = (v16i8) __msa_ilvr_d((v2i64) src65_l, (v2i64) src54_l);

    SAT_SH4_SH(dst0_r, dst1_r, dst2_r, dst3_r, 7);

    ST8x4_UB(out0, out1, dst, dst_stride);
    ST4x4_UB(out0, out0, 0, 1, 2, 3, dst + 8, dst_stride);
    dst += (4 * dst_stride);
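/* Vertical 4-tap, 16 columns: right and left byte interleaves are
 * filtered separately and PCKEV_B4 merges them back for four full-width
 * stores per iteration. */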
v16i8 src0, src1, src2, src3, src4, src5, src6;
v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r, src10_l;
v16i8 src32_l, src54_l, src21_l, src43_l, src65_l, filt0, filt1;
v16u8 tmp0, tmp1, tmp2, tmp3;
v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;

filt = LD_SH(filter);

LD_SB3(src, src_stride, src0, src1, src2);
src += (3 * src_stride);

ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);

for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_SB4(src, src_stride, src3, src4, src5, src6);
    src += (4 * src_stride);

    ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
               src32_r, src43_r, src54_r, src65_r);
    ILVL_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
               src32_l, src43_l, src54_l, src65_l);

    SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
    SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
    PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
                out3_r, tmp0, tmp1, tmp2, tmp3);

    ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
    dst += (4 * dst_stride);
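/* Vertical 4-tap, 24 columns: a 16-wide strip (full interleaves) and an
 * 8-wide strip at src + 16 (right interleaves only) advance in lockstep;
 * the narrow strip leaves through 64-bit GPR copies. */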
uint64_t out0, out1;
v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
v16i8 src11, filt0, filt1;
v16i8 src10_r, src32_r, src76_r, src98_r, src21_r, src43_r, src87_r;
v16i8 src109_r, src10_l, src32_l, src21_l, src43_l;
v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l;

filt = LD_SH(filter);

LD_SB3(src, src_stride, src0, src1, src2);
ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);

LD_SB3(src + 16, src_stride, src6, src7, src8);
src += (3 * src_stride);
ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);

for (loop_cnt = 8; loop_cnt--;) {
    LD_SB2(src, src_stride, src3, src4);
    ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
    ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);

    LD_SB2(src + 16, src_stride, src9, src10);
    src += (2 * src_stride);
    ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);

    SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
    PCKEV_B2_SH(out2_r, out2_r, out3_r, out3_r, out2_r, out3_r);

    out0 = __msa_copy_u_d((v2i64) out2_r, 0);
    out1 = __msa_copy_u_d((v2i64) out3_r, 0);

    LD_SB2(src, src_stride, src5, src2);
    ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
    ILVL_B2_SB(src5, src4, src2, src5, src10_l, src21_l);

    LD_SB2(src + 16, src_stride, src11, src8);
    src += (2 * src_stride);
    ILVR_B2_SB(src11, src10, src8, src11, src76_r, src87_r);

    SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
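/* Vertical 4-tap, 32 columns: two 16-wide strips (src and src + 16) are
 * filtered two rows at a time, each issuing a 16-byte store per row. */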
v16i8 src0, src1, src2, src3, src4, src6, src7, src8, src9, src10;
v16i8 src10_r, src32_r, src76_r, src98_r;
v16i8 src21_r, src43_r, src87_r, src109_r;
v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
v16i8 src10_l, src32_l, src76_l, src98_l;
v16i8 src21_l, src43_l, src87_l, src109_l;

filt = LD_SH(filter);

LD_SB3(src, src_stride, src0, src1, src2);
ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);

LD_SB3(src + 16, src_stride, src6, src7, src8);
src += (3 * src_stride);

ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l);

for (loop_cnt = (height >> 1); loop_cnt--;) {
    LD_SB2(src, src_stride, src3, src4);
    ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
    ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);

    SAT_SH4_SH(out0_r, out1_r, out0_l, out1_l, 7);

    ST_UB(out, dst + dst_stride);

    LD_SB2(src + 16, src_stride, src9, src10);
    src += (2 * src_stride);
    ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
    ILVL_B2_SB(src9, src8, src10, src9, src98_l, src109_l);

    SAT_SH4_SH(out2_r, out3_r, out2_l, out3_l, 7);

    ST_UB(out, dst + 16);
    ST_UB(out, dst + 16 + dst_stride);

    dst += 2 * dst_stride;
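/* The HV (2-D separable) 4-tap paths start here: src is rewound by one
 * row and one column so the 4-tap window is centered, the horizontal
 * pass runs through VSHF shuffles, and its results are then filtered
 * vertically with filt_h0/filt_h1 before rounding by 6. */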
const int8_t *filter_x,
const int8_t *filter_y)

v16i8 src0, src1, src2, src3, src4;
v8i16 filt_h0, filt_h1;
v8i16 filter_vec, tmp;
v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
v8i16 dst20, dst31, dst42, dst10, dst32, dst21, dst43;

src -= (src_stride + 1);

filter_vec = LD_SH(filter_x);

filter_vec = LD_SH(filter_y);

LD_SB5(src, src_stride, src0, src1, src2, src3, src4);

VSHF_B2_SB(src0, src2, src0, src2, mask0, mask1, vec0, vec1);
VSHF_B2_SB(src1, src3, src1, src3, mask0, mask1, vec2, vec3);
VSHF_B2_SB(src2, src4, src2, src4, mask0, mask1, vec4, vec5);

tmp = __msa_pckev_h((v8i16) dst1, (v8i16) dst0);
tmp = __msa_srari_h(tmp, 6);
tmp = __msa_sat_s_h(tmp, 7);
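/* HV 4-tap, 4x4: seven input rows produce four outputs; rows three apart
 * are shuffled into one vector so each dot product covers two output
 * rows at once. */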
const int8_t *filter_x,
const int8_t *filter_y)

v16i8 src0, src1, src2, src3, src4, src5, src6;
v8i16 filt_h0, filt_h1;
v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
v8i16 filter_vec, tmp0, tmp1;
v8i16 dst30, dst41, dst52, dst63;
v8i16 dst10, dst32, dst54, dst21, dst43, dst65;
v4i32 dst0, dst1, dst2, dst3;

src -= (src_stride + 1);

filter_vec = LD_SH(filter_x);

filter_vec = LD_SH(filter_y);

LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);

VSHF_B2_SB(src0, src3, src0, src3, mask0, mask1, vec0, vec1);
VSHF_B2_SB(src1, src4, src1, src4, mask0, mask1, vec2, vec3);
VSHF_B2_SB(src2, src5, src2, src5, mask0, mask1, vec4, vec5);
VSHF_B2_SB(src3, src6, src3, src6, mask0, mask1, vec6, vec7);

SRA_4V(dst0, dst1, dst2, dst3, 6);

ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
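/* HV 4-tap, 4 columns, multiple-of-8 heights: eight new rows per
 * iteration; dst22 carries the newest horizontal result across loop
 * iterations so the vertical filter history is preserved. */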
const int8_t *filter_x,
const int8_t *filter_y,

v16i8 src0, src1, src2, src3, src4, src5;
v16i8 src6, src7, src8, src9, src10;
v8i16 filt_h0, filt_h1;
v8i16 filter_vec, tmp0, tmp1, tmp2, tmp3;
v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
v8i16 dst10, dst21, dst22, dst73, dst84, dst95, dst106;
v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r, dst6_r, dst7_r;
v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
v8i16 dst98_r, dst109_r;

src -= (src_stride + 1);

filter_vec = LD_SH(filter_x);

filter_vec = LD_SH(filter_y);

LD_SB3(src, src_stride, src0, src1, src2);
src += (3 * src_stride);

VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
VSHF_B2_SB(src1, src2, src1, src2, mask0, mask1, vec2, vec3);

dst22 = (v8i16) __msa_splati_d((v2i64) dst21, 1);

for (loop_cnt = height >> 3; loop_cnt--;) {
    LD_SB8(src, src_stride,
           src3, src4, src5, src6, src7, src8, src9, src10);
    src += (8 * src_stride);

    VSHF_B2_SB(src3, src7, src3, src7, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src4, src8, src4, src8, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src5, src9, src5, src9, mask0, mask1, vec4, vec5);
    VSHF_B2_SB(src6, src10, src6, src10, mask0, mask1, vec6, vec7);

    dst32_r = __msa_ilvr_h(dst73, dst22);
    dst22 = (v8i16) __msa_splati_d((v2i64) dst73, 1);
    dst76_r = __msa_ilvr_h(dst22, dst106);

    SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
    SRA_4V(dst4_r, dst5_r, dst6_r, dst7_r, 6);
    PCKEV_H4_SH(dst1_r, dst0_r, dst3_r, dst2_r,
                dst5_r, dst4_r, dst7_r, dst6_r,
                tmp0, tmp1, tmp2, tmp3);

    ST4x8_UB(out0, out1, dst, dst_stride);
    dst += (8 * dst_stride);

    dst22 = (v8i16) __msa_splati_d((v2i64) dst106, 1);
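/* Width-4 HV dispatcher: fixed short-height kernels, the multiple-of-8
 * loop kernel otherwise. */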
const int8_t *filter_x,
const int8_t *filter_y,

    filter_x, filter_y);
} else if (4 == height) {
    filter_x, filter_y);
} else if (0 == (height % 8)) {
    filter_x, filter_y, height);
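/* HV 4-tap, 6 columns: computed as a 4-wide core stored with ST4x8 plus
 * a 2-wide remainder stored with ST2x4 at dst + 4. */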
const int8_t *filter_x,
const int8_t *filter_y,

v16u8 out0, out1, out2;
v16i8 src0, src1, src2, src3, src4, src5, src6;
v16i8 src7, src8, src9, src10;
v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
v8i16 filt_h0, filt_h1, filter_vec;
v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6, dsth7, dsth8, dsth9;
v8i16 dsth10, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
v4i32 dst4_r, dst5_r, dst6_r, dst7_r;
v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
v8i16 dst54_r, dst76_r, dst98_r, dst65_r, dst87_r, dst109_r;
v8i16 dst98_l, dst65_l, dst54_l, dst76_l, dst87_l, dst109_l;
v8i16 dst1021_l, dst3243_l, dst5465_l, dst7687_l, dst98109_l;

src -= (src_stride + 1);

filter_vec = LD_SH(filter_x);

filter_vec = LD_SH(filter_y);

LD_SB3(src, src_stride, src0, src1, src2);
src += (3 * src_stride);

VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);

LD_SB8(src, src_stride, src3, src4, src5, src6, src7, src8, src9, src10);

VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);

VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec2, vec3);
VSHF_B2_SB(src9, src9, src9, src9, mask0, mask1, vec4, vec5);
VSHF_B2_SB(src10, src10, src10, src10, mask0, mask1, vec6, vec7);

PCKEV_D2_SH(dst21_l, dst10_l, dst43_l, dst32_l, dst1021_l, dst3243_l);
PCKEV_D2_SH(dst65_l, dst54_l, dst87_l, dst76_l, dst5465_l, dst7687_l);
dst98109_l = (v8i16) __msa_pckev_d((v2i64) dst109_l, (v2i64) dst98_l);

dst3_l = HEVC_FILT_4TAP(dst7687_l, dst98109_l, filt_h0, filt_h1);
SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
SRA_4V(dst4_r, dst5_r, dst6_r, dst7_r, 6);
SRA_4V(dst0_l, dst1_l, dst2_l, dst3_l, 6);
PCKEV_H2_SH(dst1_r, dst0_r, dst3_r, dst2_r, tmp0, tmp1);
PCKEV_H2_SH(dst5_r, dst4_r, dst7_r, dst6_r, tmp2, tmp3);
PCKEV_H2_SH(dst1_l, dst0_l, dst3_l, dst2_l, tmp4, tmp5);

ST4x8_UB(out0, out1, dst, dst_stride);
ST2x4_UB(out2, 0, dst + 4, dst_stride);
dst += 4 * dst_stride;
ST2x4_UB(out2, 4, dst + 4, dst_stride);
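/* HV 4-tap, 8x2: five rows, each shuffled against itself for the
 * 8-column horizontal pass; two vertical outputs are packed and
 * written. */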
const int8_t *filter_x,
const int8_t *filter_y)

v16i8 src0, src1, src2, src3, src4;
v8i16 filt_h0, filt_h1, filter_vec;
v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
v8i16 dst0, dst1, dst2, dst3, dst4;
v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
v8i16 out0_r, out1_r;

src -= (src_stride + 1);

filter_vec = LD_SH(filter_x);

filter_vec = LD_SH(filter_y);

LD_SB5(src, src_stride, src0, src1, src2, src3, src4);

VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec8, vec9);

SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, out0_r, out1_r);
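/* HV 4-tap, 8-column strips with fixed height 4: the width8mult loop
 * walks the strips, each loading seven rows and storing one 8x4 tile. */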
const int8_t *filter_x,
const int8_t *filter_y,

v16i8 src0, src1, src2, src3, src4, src5, src6, mask0, mask1;
v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
v8i16 filt0, filt1, filt_h0, filt_h1, filter_vec;
v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, tmp0, tmp1, tmp2, tmp3;
v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;

src -= (src_stride + 1);

filter_vec = LD_SH(filter_x);

filter_vec = LD_SH(filter_y);

for (cnt = width8mult; cnt--;) {
    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);

    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);

    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
    VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);

    SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
    SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);

    PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l,
                dst3_r, tmp0, tmp1, tmp2, tmp3);

    ST8x4_UB(out0, out1, dst, dst_stride);
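/* HV 4-tap, 8x6: nine rows are loaded in one shot (5 + 4) and shuffled
 * into eighteen vectors for the horizontal pass; six vertical outputs
 * are rounded, packed and stored. */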
const int8_t *filter_x,
const int8_t *filter_y)

v16u8 out0, out1, out2;
v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
v8i16 filt_h0, filt_h1, filter_vec;
v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
v16i8 vec10, vec11, vec12, vec13, vec14, vec15, vec16, vec17;
v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
v4i32 dst4_r, dst4_l, dst5_r, dst5_l;
v8i16 dst10_r, dst32_r, dst10_l, dst32_l;
v8i16 dst21_r, dst43_r, dst21_l, dst43_l;
v8i16 dst54_r, dst54_l, dst65_r, dst65_l;
v8i16 dst76_r, dst76_l, dst87_r, dst87_l;
v8i16 out0_r, out1_r, out2_r, out3_r, out4_r, out5_r;

src -= (src_stride + 1);

filter_vec = LD_SH(filter_x);

filter_vec = LD_SH(filter_y);

LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
src += (5 * src_stride);
LD_SB4(src, src_stride, src5, src6, src7, src8);

VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec8, vec9);
VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec10, vec11);
VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec12, vec13);
VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec14, vec15);
VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec16, vec17);

SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
SRA_4V(dst4_r, dst4_l, dst5_r, dst5_l, 6);
PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r,
            dst2_l, dst2_r, dst3_l, dst3_r, out0_r, out1_r, out2_r, out3_r);
PCKEV_H2_SH(dst4_l, dst4_r, dst5_l, dst5_r, out4_r, out5_r);

SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);

ST8x4_UB(out0, out1, dst, dst_stride);
dst += (4 * dst_stride);
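/* HV 4-tap, 8-column strips by multiple-of-4 heights: the outer cnt
 * loop steps 8 columns at a time through src_tmp/dst_tmp, the inner
 * loop emits one 8x4 tile per pass. */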
const int8_t *filter_x,
const int8_t *filter_y,

uint32_t loop_cnt, cnt;
v16i8 src0, src1, src2, src3, src4, src5, src6;
v8i16 filt_h0, filt_h1, filter_vec;
v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
v8i16 dst54_r, dst54_l, dst65_r, dst65_l, dst6;
v8i16 out0_r, out1_r, out2_r, out3_r;

src -= (src_stride + 1);

filter_vec = LD_SH(filter_x);

filter_vec = LD_SH(filter_y);

for (cnt = width8mult; cnt--;) {
    LD_SB3(src_tmp, src_stride, src0, src1, src2);
    src_tmp += (3 * src_stride);

    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src_tmp, src_stride, src3, src4, src5, src6);
        src_tmp += (4 * src_stride);

        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
        VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
        VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);

        SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
        SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);

        PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r,
                    dst2_l, dst2_r, dst3_l, dst3_r,
                    out0_r, out1_r, out2_r, out3_r);

        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);

        ST8x4_UB(out0, out1, dst_tmp, dst_stride);
        dst_tmp += (4 * dst_stride);
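/* Width-8 HV dispatcher: fixed-height kernels for 2, 4 and 6; otherwise
 * the multiple-of-4 strip kernel with width8mult = 1. */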
const int8_t *filter_x,
const int8_t *filter_y,

    filter_x, filter_y);
} else if (4 == height) {
    filter_x, filter_y, 1);
} else if (6 == height) {
    filter_x, filter_y);
} else if (0 == (height % 4)) {
    filter_x, filter_y, height, 1);
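/* HV 4-tap, 12 columns: an 8-wide strip filtered with mask0/mask1
 * through src_tmp/dst_tmp, followed by a 4-wide remainder filtered with
 * mask2/mask3 that consumes eight rows per iteration. */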
const int8_t *filter_x,
const int8_t *filter_y,

v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
v16i8 mask0, mask1, mask2, mask3;
v8i16 filt0, filt1, filt_h0, filt_h1, filter_vec, tmp0, tmp1, tmp2, tmp3;
v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6;
v8i16 dst10, dst21, dst22, dst73, dst84, dst95, dst106;
v8i16 dst76_r, dst98_r, dst87_r, dst109_r;
v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;
v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;

src -= (src_stride + 1);

filter_vec = LD_SH(filter_x);

filter_vec = LD_SH(filter_y);

LD_SB3(src_tmp, src_stride, src0, src1, src2);
src_tmp += (3 * src_stride);

VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);

for (loop_cnt = 4; loop_cnt--;) {
    LD_SB4(src_tmp, src_stride, src3, src4, src5, src6);
    src_tmp += (4 * src_stride);

    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
    VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);

    SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
    SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);

    PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l,
                dst3_r, tmp0, tmp1, tmp2, tmp3);

    ST8x4_UB(out0, out1, dst_tmp, dst_stride);
    dst_tmp += (4 * dst_stride);

LD_SB3(src, src_stride, src0, src1, src2);
src += (3 * src_stride);

VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);
VSHF_B2_SB(src1, src2, src1, src2, mask2, mask3, vec2, vec3);

dst22 = (v8i16) __msa_splati_d((v2i64) dst21, 1);

for (loop_cnt = 2; loop_cnt--;) {
    LD_SB8(src, src_stride,
           src3, src4, src5, src6, src7, src8, src9, src10);
    src += (8 * src_stride);

    VSHF_B2_SB(src3, src7, src3, src7, mask2, mask3, vec0, vec1);
    VSHF_B2_SB(src4, src8, src4, src8, mask2, mask3, vec2, vec3);
    VSHF_B2_SB(src5, src9, src5, src9, mask2, mask3, vec4, vec5);
    VSHF_B2_SB(src6, src10, src6, src10, mask2, mask3, vec6, vec7);

    dst32_r = __msa_ilvr_h(dst73, dst22);
    dst22 = (v8i16) __msa_splati_d((v2i64) dst73, 1);
    dst76_r = __msa_ilvr_h(dst22, dst106);

    SRA_4V(dst0, dst1, dst2, dst3, 6);
    SRA_4V(dst4, dst5, dst6, dst7, 6);
    PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
                tmp0, tmp1, tmp2, tmp3);

    ST4x8_UB(out0, out1, dst, dst_stride);
    dst += (8 * dst_stride);

    dst22 = (v8i16) __msa_splati_d((v2i64) dst106, 1);
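/* The 16-, 24- and 32-column HV wrappers below reuse the 8-column strip
 * kernel with width8mult = 2, 3 and 4 respectively. */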
const int8_t *filter_x,
const int8_t *filter_y,

    filter_x, filter_y, height, 2);

const int8_t *filter_x,
const int8_t *filter_y,

    filter_x, filter_y, height, 3);

const int8_t *filter_x,
const int8_t *filter_y,

    filter_x, filter_y, height, 4);
#define UNI_MC_COPY(WIDTH)                                                 \
void ff_hevc_put_hevc_uni_pel_pixels##WIDTH##_8_msa(uint8_t *dst,          \
                                                    ptrdiff_t dst_stride,  \
                                                    uint8_t *src,          \
                                                    ptrdiff_t src_stride,  \
                                                    int height,            \
                                                    intptr_t mx,           \
                                                    intptr_t my,           \
                                                    int width)             \
{                                                                          \
    copy_width##WIDTH##_msa(src, src_stride, dst, dst_stride, height);     \
}

#define UNI_MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR)                          \
void ff_hevc_put_hevc_uni_##PEL##_##DIR##WIDTH##_8_msa(uint8_t *dst,          \
                                                       ptrdiff_t dst_stride,  \
                                                       uint8_t *src,          \
                                                       ptrdiff_t src_stride,  \
                                                       int height,            \
                                                       intptr_t mx,           \
                                                       intptr_t my,           \
                                                       int width)             \
{                                                                             \
    const int8_t *filter = ff_hevc_##PEL##_filters[FILT_DIR - 1];             \
                                                                              \
    common_##DIR1##_##TAP##t_##WIDTH##w_msa(src, src_stride, dst, dst_stride, \
                                            filter, height);                  \
}

UNI_MC(qpel, h, 4, 8, hz, mx);
UNI_MC(qpel, h, 8, 8, hz, mx);
UNI_MC(qpel, h, 12, 8, hz, mx);
UNI_MC(qpel, h, 16, 8, hz, mx);
UNI_MC(qpel, h, 24, 8, hz, mx);
UNI_MC(qpel, h, 32, 8, hz, mx);
UNI_MC(qpel, h, 48, 8, hz, mx);
UNI_MC(qpel, h, 64, 8, hz, mx);
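/* For reference, the first instantiation above, UNI_MC(qpel, h, 4, 8, hz,
 * mx), expands to roughly the following (sketch, parameter list
 * abbreviated):
 *
 *     void ff_hevc_put_hevc_uni_qpel_h4_8_msa(uint8_t *dst, ...)
 *     {
 *         const int8_t *filter = ff_hevc_qpel_filters[mx - 1];
 *
 *         common_hz_8t_4w_msa(src, src_stride, dst, dst_stride,
 *                             filter, height);
 *     }
 */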
UNI_MC(qpel, v, 4, 8, vt, my);
UNI_MC(qpel, v, 8, 8, vt, my);
UNI_MC(qpel, v, 12, 8, vt, my);
UNI_MC(qpel, v, 16, 8, vt, my);
UNI_MC(qpel, v, 24, 8, vt, my);
UNI_MC(qpel, v, 32, 8, vt, my);
UNI_MC(qpel, v, 48, 8, vt, my);
UNI_MC(qpel, v, 64, 8, vt, my);
UNI_MC(epel, h, 4, 4, hz, mx);
UNI_MC(epel, h, 6, 4, hz, mx);
UNI_MC(epel, h, 8, 4, hz, mx);
UNI_MC(epel, h, 12, 4, hz, mx);
UNI_MC(epel, h, 16, 4, hz, mx);
UNI_MC(epel, h, 24, 4, hz, mx);
UNI_MC(epel, h, 32, 4, hz, mx);
UNI_MC(epel, v, 4, 4, vt, my);
UNI_MC(epel, v, 6, 4, vt, my);
UNI_MC(epel, v, 8, 4, vt, my);
UNI_MC(epel, v, 12, 4, vt, my);
UNI_MC(epel, v, 16, 4, vt, my);
UNI_MC(epel, v, 24, 4, vt, my);
UNI_MC(epel, v, 32, 4, vt, my);

#define UNI_MC_HV(PEL, WIDTH, TAP)                                        \
void ff_hevc_put_hevc_uni_##PEL##_hv##WIDTH##_8_msa(uint8_t *dst,         \
                                                    ptrdiff_t dst_stride, \
                                                    uint8_t *src,         \
                                                    ptrdiff_t src_stride, \
                                                    int height,           \
                                                    intptr_t mx,          \
                                                    intptr_t my,          \
                                                    int width)            \
{                                                                         \
    const int8_t *filter_x = ff_hevc_##PEL##_filters[mx - 1];             \
    const int8_t *filter_y = ff_hevc_##PEL##_filters[my - 1];             \
                                                                          \
    hevc_hv_uni_##TAP##t_##WIDTH##w_msa(src, src_stride, dst, dst_stride, \
                                        filter_x, filter_y, height);      \
}
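/* For reference, an instantiation such as UNI_MC_HV(epel, 8, 4) expands
 * to roughly the following (sketch, parameter list abbreviated):
 *
 *     void ff_hevc_put_hevc_uni_epel_hv8_8_msa(uint8_t *dst, ...)
 *     {
 *         const int8_t *filter_x = ff_hevc_epel_filters[mx - 1];
 *         const int8_t *filter_y = ff_hevc_epel_filters[my - 1];
 *
 *         hevc_hv_uni_4t_8w_msa(src, src_stride, dst, dst_stride,
 *                               filter_x, filter_y, height);
 *     }
 */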