28 uint8_t *p3 = src - (stride << 2);
29 uint8_t *p2 = src - ((stride << 1) + stride);
30 uint8_t *p1 = src - (stride << 1);
34 uint8_t *q2 = src + (stride << 1);
35 uint8_t *q3 = src + (stride << 1) + stride;
37 int32_t dp00, dq00, dp30, dq30, d00, d30;
39 int32_t dp04, dq04, dp34, dq34, d04, d34;
40 int32_t tc0, p_is_pcm0, q_is_pcm0, beta30, beta20, tc250;
42 uint64_t dst_val0, dst_val1;
43 v16u8 dst0, dst1, dst2, dst3, dst4, dst5;
44 v2i64 cmp0, cmp1, cmp2, p_is_pcm_vec, q_is_pcm_vec;
49 v8i16 diff0, diff1, delta0, delta1, delta2, abs_delta0;
51 v8u16 p3_src, p2_src, p1_src, p0_src, q0_src, q1_src, q2_src, q3_src;
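/* dp00/dq00 and dp30/dq30 are the HEVC deblocking activity measures
 * |p2 - 2*p1 + p0| and |q2 - 2*q1 + q0| for columns 0 and 3 of the first
 * 4-pixel segment; dp04..dq34 repeat the same measures on columns 4 and 7
 * of the second segment. */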
53 dp00 = abs(p2[0] - (p1[0] << 1) + p0[0]);
54 dq00 = abs(q2[0] - (q1[0] << 1) + q0[0]);
55 dp30 = abs(p2[3] - (p1[3] << 1) + p0[3]);
56 dq30 = abs(q2[3] - (q1[3] << 1) + q0[3]);
59 dp04 = abs(p2[4] - (p1[4] << 1) + p0[4]);
60 dq04 = abs(q2[4] - (q1[4] << 1) + q0[4]);
61 dp34 = abs(p2[7] - (p1[7] << 1) + p0[7]);
62 dq34 = abs(q2[7] - (q1[7] << 1) + q0[7]);
66 p_is_pcm0 = p_is_pcm[0];
67 p_is_pcm4 = p_is_pcm[1];
68 q_is_pcm0 = q_is_pcm[0];
69 q_is_pcm4 = q_is_pcm[1];
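/* The scalar is_pcm flags are broadcast per 64-bit half (first half for the
 * first 4-column segment, second half for the second) and turned into byte
 * select masks with ceqi_d(x, 0): all ones where the side is not PCM and may
 * be modified, all zeros where it must be written back unchanged. */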
71 cmp0 = __msa_fill_d(p_is_pcm0);
72 cmp1 = __msa_fill_d(p_is_pcm4);
73 p_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0);
74 p_is_pcm_vec = __msa_ceqi_d(p_is_pcm_vec, 0);
76 d0030 = (d00 + d30) >= beta;
77 d0434 = (d04 + d34) >= beta;
79 cmp0 = (v2i64) __msa_fill_w(d0030);
80 cmp1 = (v2i64) __msa_fill_w(d0434);
81 cmp3 = (v2i64) __msa_ilvev_w((v4i32) cmp1, (v4i32) cmp0);
82 cmp3 = (v2i64) __msa_ceqi_w((v4i32) cmp3, 0);
84 if ((!p_is_pcm0 || !p_is_pcm4 || !q_is_pcm0 || !q_is_pcm4) &&
91 cmp0 = __msa_fill_d(q_is_pcm0);
92 cmp1 = __msa_fill_d(q_is_pcm4);
93 q_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0);
94 q_is_pcm_vec = __msa_ceqi_d(q_is_pcm_vec, 0);
99 tc250 = ((tc0 * 5 + 1) >> 1);
101 tc254 = ((tc4 * 5 + 1) >> 1);
103 cmp0 = (v2i64) __msa_fill_h(tc0);
104 cmp1 = (v2i64) __msa_fill_h(tc4);
106 ILVR_B4_UH(zero, p3_src, zero, p2_src, zero, p1_src, zero, p0_src,
107 p3_src, p2_src, p1_src, p0_src);
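/* flag0/flag1 are the per-segment strong-filter decisions: both sampled
 * lines of a segment must satisfy |p3 - p0| + |q3 - q0| < beta >> 3 (beta30),
 * |p0 - q0| < (5 * tc + 1) >> 1 (tc250/tc254) and 2 * d < beta >> 2 (beta20),
 * matching the HEVC strong-filter test. */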
113 flag0 = abs(p3[0] - p0[0]) + abs(q3[0] - q0[0]) < beta30 &&
114 abs(p0[0] - q0[0]) < tc250;
115 flag0 = flag0 && (abs(p3[3] - p0[3]) + abs(q3[3] - q0[3]) < beta30 &&
116 abs(p0[3] - q0[3]) < tc250 && (d00 << 1) < beta20 &&
117 (d30 << 1) < beta20);
119 tc_pos = (v8i16) __msa_ilvev_d(cmp1, cmp0);
120 ILVR_B4_UH(zero, q0_src, zero, q1_src, zero, q2_src, zero, q3_src,
121 q0_src, q1_src, q2_src, q3_src);
122 flag1 = abs(p3[4] - p0[4]) + abs(q3[4] - q0[4]) < beta30 &&
123 abs(p0[4] - q0[4]) < tc254;
124 flag1 = flag1 && (abs(p3[7] - p0[7]) + abs(q3[7] - q0[7]) < beta30 &&
125 abs(p0[7] - q0[7]) < tc254 && (d04 << 1) < beta20 &&
126 (d34 << 1) < beta20);
128 cmp0 = (v2i64) __msa_fill_w(flag0);
129 cmp1 = (v2i64) __msa_fill_w(flag1);
130 cmp2 = (v2i64) __msa_ilvev_w((v4i32) cmp1, (v4i32) cmp0);
131 cmp2 = (v2i64) __msa_ceqi_w((v4i32) cmp2, 0);
133 if (flag0 && flag1) {
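/* Strong filter: with temp0 = p1 + p0 + q0, the rounded averages below are
 * the HEVC strong-filter outputs
 *   p2' = (2*p3 + 3*p2 + p1 + p0 + q0 + 4) >> 3
 *   p1' = (p2 + p1 + p0 + q0 + 2) >> 2
 *   p0' = (p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4) >> 3
 * each applied as a delta clamped to [tc_neg, tc_pos] (nominally +/- 2*tc);
 * the q side is handled symmetrically further down. */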
139 temp0 = (p1_src + p0_src + q0_src);
140 temp1 = ((p3_src + p2_src) << 1) + p2_src + temp0;
141 temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
142 temp2 = (v8i16) (temp1 - p2_src);
temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
144 dst0 = (v16u8) (temp2 + (v8i16) p2_src);
146 temp1 = temp0 + p2_src;
147 temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 2);
148 temp2 = (v8i16) (temp1 - p1_src);
temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
150 dst1 = (v16u8) (temp2 + (v8i16) p1_src);
152 temp1 = (temp0 << 1) + p2_src + q1_src;
153 temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
154 temp2 = (v8i16) (temp1 - p0_src);
temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
156 dst2 = (v16u8) (temp2 + (v8i16) p0_src);
158 dst0 = __msa_bmz_v(dst0, (v16u8) p2_src, (v16u8) p_is_pcm_vec);
159 dst1 = __msa_bmz_v(dst1, (v16u8) p1_src, (v16u8) p_is_pcm_vec);
160 dst2 = __msa_bmz_v(dst2, (v16u8) p0_src, (v16u8) p_is_pcm_vec);
163 temp0 = (q1_src + p0_src + q0_src);
165 temp1 = ((q3_src + q2_src) << 1) + q2_src + temp0;
166 temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
167 temp2 = (v8i16) (temp1 - q2_src);
temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
169 dst5 = (v16u8) (temp2 + (v8i16) q2_src);
171 temp1 = temp0 + q2_src;
172 temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 2);
173 temp2 = (v8i16) (temp1 - q1_src);
temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
175 dst4 = (v16u8) (temp2 + (v8i16) q1_src);
177 temp1 = (temp0 << 1) + p1_src + q2_src;
178 temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
179 temp2 = (v8i16) (temp1 - q0_src);
temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
181 dst3 = (v16u8) (temp2 + (v8i16) q0_src);
183 dst3 = __msa_bmz_v(dst3, (v16u8) q0_src, (v16u8) q_is_pcm_vec);
184 dst4 = __msa_bmz_v(dst4, (v16u8) q1_src, (v16u8) q_is_pcm_vec);
185 dst5 = __msa_bmz_v(dst5, (v16u8) q2_src, (v16u8) q_is_pcm_vec);
189 dst2 = (v16u8) __msa_pckev_b((v16i8) dst5, (v16i8) dst4);
192 PCKEV_B2_UB(p1_src, p2_src, q0_src, p0_src, dst3, dst4);
193 dst5 = (v16u8) __msa_pckev_b((v16i8) q2_src, (v16i8) q1_src);
195 dst0 = __msa_bmz_v(dst0, dst3, (v16u8) cmp3);
196 dst1 = __msa_bmz_v(dst1, dst4, (v16u8) cmp3);
197 dst2 = __msa_bmz_v(dst2, dst5, (v16u8) cmp3);
199 dst_val0 = __msa_copy_u_d((v2i64) dst2, 0);
200 dst_val1 = __msa_copy_u_d((v2i64) dst2, 1);
} else if (flag0 == flag1) { /* weak filter only */
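/* Weak filter: delta0 is the HEVC weak-filter delta
 *   delta0 = (9 * (q0 - p0) - 3 * (q1 - p1) + 8) >> 4
 * built from diff0 * 9 and diff1 * 3 via shifts plus a rounding shift by 4.
 * It is only applied where |delta0| < 10 * tc (the abs_delta0 test), clamped
 * to [-tc, tc], added to p0 and subtracted from q0; p1/q1 receive the
 * secondary correction gated by the (beta + (beta >> 1)) >> 3 side test. */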
212 diff0 = (v8i16) (q0_src - p0_src);
213 diff1 = (v8i16) (q1_src - p1_src);
214 diff0 = (diff0 << 3) + diff0;
215 diff1 = (diff1 << 1) + diff1;
216 delta0 = diff0 - diff1;
217 delta0 = __msa_srari_h(delta0, 4);
219 temp1 = (v8u16) ((tc_pos << 3) + (tc_pos << 1));
220 abs_delta0 = __msa_add_a_h(delta0, (v8i16) zero);
221 abs_delta0 = (v8u16) abs_delta0 < temp1;
delta0 = CLIP_SH(delta0, tc_neg, tc_pos);
225 temp0 = (v8u16) (delta0 + p0_src);
227 temp0 = (v8u16) __msa_bmz_v((v16u8) temp0, (v16u8) p0_src,
228 (v16u8) p_is_pcm_vec);
230 temp2 = (v8i16) (q0_src - delta0);
232 temp2 = (v8i16) __msa_bmz_v((v16u8) temp2, (v16u8) q0_src,
233 (v16u8) q_is_pcm_vec);
235 p_is_pcm_vec = ~p_is_pcm_vec;
236 q_is_pcm_vec = ~q_is_pcm_vec;
237 tmp = (beta + (beta >> 1)) >> 3;
238 cmp0 = __msa_fill_d(dp00 + dp30 < tmp);
239 cmp1 = __msa_fill_d(dp04 + dp34 < tmp);
240 cmp0 = __msa_ilvev_d(cmp1, cmp0);
241 cmp0 = __msa_ceqi_d(cmp0, 0);
242 p_is_pcm_vec = p_is_pcm_vec | cmp0;
244 cmp0 = __msa_fill_d(dq00 + dq30 < tmp);
245 cmp1 = __msa_fill_d(dq04 + dq34 < tmp);
246 cmp0 = __msa_ilvev_d(cmp1, cmp0);
247 cmp0 = __msa_ceqi_d(cmp0, 0);
248 q_is_pcm_vec = q_is_pcm_vec | cmp0;
253 delta1 = (v8i16) __msa_aver_u_h(p2_src, p0_src);
254 delta1 -= (v8i16) p1_src;
delta1 = CLIP_SH(delta1, tc_neg, tc_pos);
258 delta1 = (v8i16) p1_src + (v8i16) delta1;
260 delta1 = (v8i16) __msa_bmnz_v((v16u8) delta1, (v16u8) p1_src,
261 (v16u8) p_is_pcm_vec);
263 delta2 = (v8i16) __msa_aver_u_h(q0_src, q2_src);
264 delta2 = delta2 - (v8i16) q1_src;
265 delta2 = delta2 - delta0;
266 delta2 = delta2 >> 1;
delta2 = CLIP_SH(delta2, tc_neg, tc_pos);
268 delta2 = (v8i16) q1_src + (v8i16) delta2;
270 delta2 = (v8i16) __msa_bmnz_v((v16u8) delta2, (v16u8) q1_src,
271 (v16u8) q_is_pcm_vec);
273 dst1 = (v16u8) __msa_bmz_v((v16u8) delta1, (v16u8) p1_src,
275 dst2 = (v16u8) __msa_bmz_v((v16u8) temp0, (v16u8) p0_src,
277 dst3 = (v16u8) __msa_bmz_v((v16u8) temp2, (v16u8) q0_src,
279 dst4 = (v16u8) __msa_bmz_v((v16u8) delta2, (v16u8) q1_src,
285 PCKEV_B2_UB(p0_src, p1_src, q1_src, q0_src, dst2, dst3);
287 dst0 = __msa_bmz_v(dst0, dst2, (v16u8) cmp3);
288 dst1 = __msa_bmz_v(dst1, dst3, (v16u8) cmp3);
299 temp0 = (p1_src + p0_src + q0_src);
300 temp1 = ((p3_src + p2_src) << 1) + p2_src + temp0;
301 temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
302 temp2 = (v8i16) (temp1 - p2_src);
temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
304 dst0 = (v16u8) (temp2 + (v8i16) p2_src);
306 temp1 = temp0 + p2_src;
307 temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 2);
308 temp2 = (v8i16) (temp1 - p1_src);
temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
310 dst1 = (v16u8) (temp2 + (v8i16) p1_src);
312 temp1 = (temp0 << 1) + p2_src + q1_src;
313 temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
314 temp2 = (v8i16) (temp1 - p0_src);
temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
316 dst2 = (v16u8) (temp2 + (v8i16) p0_src);
318 dst0 = __msa_bmz_v(dst0, (v16u8) p2_src, (v16u8) p_is_pcm_vec);
319 dst1 = __msa_bmz_v(dst1, (v16u8) p1_src, (v16u8) p_is_pcm_vec);
320 dst2 = __msa_bmz_v(dst2, (v16u8) p0_src, (v16u8) p_is_pcm_vec);
323 temp0 = (q1_src + p0_src + q0_src);
325 temp1 = ((q3_src + q2_src) << 1) + q2_src + temp0;
326 temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
327 temp2 = (v8i16) (temp1 - q2_src);
temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
329 dst5 = (v16u8) (temp2 + (v8i16) q2_src);
331 temp1 = temp0 + q2_src;
332 temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 2);
333 temp2 = (v8i16) (temp1 - q1_src);
temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
335 dst4 = (v16u8) (temp2 + (v8i16) q1_src);
337 temp1 = (temp0 << 1) + p1_src + q2_src;
338 temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
339 temp2 = (v8i16) (temp1 - q0_src);
temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
341 dst3 = (v16u8) (temp2 + (v8i16) q0_src);
343 dst3 = __msa_bmz_v(dst3, (v16u8) q0_src, (v16u8) q_is_pcm_vec);
344 dst4 = __msa_bmz_v(dst4, (v16u8) q1_src, (v16u8) q_is_pcm_vec);
345 dst5 = __msa_bmz_v(dst5, (v16u8) q2_src, (v16u8) q_is_pcm_vec);
349 dst2 = (v16u8) __msa_pckev_b((v16i8) dst5, (v16i8) dst4);
356 diff0 = (v8i16) (q0_src - p0_src);
357 diff1 = (v8i16) (q1_src - p1_src);
358 diff0 = (diff0 << 3) + diff0;
359 diff1 = (diff1 << 1) + diff1;
360 delta0 = diff0 - diff1;
361 delta0 = __msa_srari_h(delta0, 4);
363 temp1 = (v8u16) ((tc_pos << 3) + (tc_pos << 1));
364 abs_delta0 = __msa_add_a_h(delta0, (v8i16) zero);
365 abs_delta0 = (v8u16) abs_delta0 < temp1;
delta0 = CLIP_SH(delta0, tc_neg, tc_pos);
369 temp0 = (v8u16) (delta0 + p0_src);
371 temp0 = (v8u16) __msa_bmz_v((v16u8) temp0, (v16u8) p0_src,
372 (v16u8) p_is_pcm_vec);
374 temp2 = (v8i16) (q0_src - delta0);
376 temp2 = (v8i16) __msa_bmz_v((v16u8) temp2, (v16u8) q0_src,
377 (v16u8) q_is_pcm_vec);
379 p_is_pcm_vec = ~p_is_pcm_vec;
380 q_is_pcm_vec = ~q_is_pcm_vec;
381 tmp = (beta + (beta >> 1)) >> 3;
382 cmp0 = __msa_fill_d(dp00 + dp30 < tmp);
383 cmp1 = __msa_fill_d(dp04 + dp34 < tmp);
384 cmp0 = __msa_ilvev_d(cmp1, cmp0);
385 p_is_pcm_vec = p_is_pcm_vec | __msa_ceqi_d(cmp0, 0);
387 cmp0 = __msa_fill_d(dq00 + dq30 < tmp);
388 cmp1 = __msa_fill_d(dq04 + dq34 < tmp);
389 cmp0 = __msa_ilvev_d(cmp1, cmp0);
390 q_is_pcm_vec = q_is_pcm_vec | __msa_ceqi_d(cmp0, 0);
395 delta1 = (v8i16) __msa_aver_u_h(p2_src, p0_src);
396 delta1 -= (v8i16) p1_src;
delta1 = CLIP_SH(delta1, tc_neg, tc_pos);
400 delta1 = (v8i16) p1_src + (v8i16) delta1;
402 delta1 = (v8i16) __msa_bmnz_v((v16u8) delta1, (v16u8) p1_src,
403 (v16u8) p_is_pcm_vec);
405 delta2 = (v8i16) __msa_aver_u_h(q0_src, q2_src);
406 delta2 = delta2 - (v8i16) q1_src;
407 delta2 = delta2 - delta0;
408 delta2 = delta2 >> 1;
delta2 = CLIP_SH(delta2, tc_neg, tc_pos);
410 delta2 = (v8i16) q1_src + (v8i16) delta2;
412 delta2 = (v8i16) __msa_bmnz_v((v16u8) delta2, (v16u8) q1_src,
413 (v16u8) q_is_pcm_vec);
415 delta1 = (v8i16) __msa_bmz_v((v16u8) delta1, (v16u8) p1_src,
417 temp0 = (v8u16) __msa_bmz_v((v16u8) temp0, (v16u8) p0_src,
419 temp2 = (v8i16) __msa_bmz_v((v16u8) temp2, (v16u8) q0_src,
421 delta2 = (v8i16) __msa_bmz_v((v16u8) delta2, (v16u8) q1_src,
426 PCKEV_B2_UB(delta1, p2_src, temp2, temp0, dst3, dst4);
427 dst5 = (v16u8) __msa_pckev_b((v16i8) q2_src, (v16i8) delta2);
430 dst0 = __msa_bmnz_v(dst0, dst3, (v16u8) cmp2);
431 dst1 = __msa_bmnz_v(dst1, dst4, (v16u8) cmp2);
432 dst2 = __msa_bmnz_v(dst2, dst5, (v16u8) cmp2);
435 PCKEV_B2_UB(p1_src, p2_src, q0_src, p0_src, dst3, dst4);
436 dst5 = (v16u8) __msa_pckev_b((v16i8) q2_src, (v16i8) q1_src);
438 dst0 = __msa_bmz_v(dst0, dst3, (v16u8) cmp3);
439 dst1 = __msa_bmz_v(dst1, dst4, (v16u8) cmp3);
440 dst2 = __msa_bmz_v(dst2, dst5, (v16u8) cmp3);
442 dst_val0 = __msa_copy_u_d((v2i64) dst2, 0);
443 dst_val1 = __msa_copy_u_d((v2i64) dst2, 1);
460 uint8_t *p1 = src + (stride << 2);
465 int32_t dp00, dq00, dp30, dq30, d00, d30;
467 int32_t dp04, dq04, dp34, dq34, d04, d34;
468 int32_t tc0, p_is_pcm0, q_is_pcm0, beta30, beta20, tc250;
int32_t tc4, p_is_pcm4, q_is_pcm4, tc254, tmp;
470 v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
471 v2i64 cmp0, cmp1, cmp2, p_is_pcm_vec, q_is_pcm_vec;
475 v8i16 tc_pos, tc_neg;
476 v8i16 diff0, diff1, delta0, delta1, delta2, abs_delta0;
478 v8u16 p3_src, p2_src, p1_src, p0_src, q0_src, q1_src, q2_src, q3_src;
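/* Vertical-edge variant: the edge runs down the middle of each row, so the
 * eight samples p3..q3 of one line are the consecutive bytes around src in
 * that row. p3/p2/p1/p0 act as row pointers here (p1 = src + 4 * stride
 * above) and the bracketed offsets select the p/q samples within a row; the
 * loaded rows are transposed into per-sample vectors before filtering and
 * the results written back with narrow per-row stores. */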
480 dp00 = abs(p3[-3] - (p3[-2] << 1) + p3[-1]);
481 dq00 = abs(p3[2] - (p3[1] << 1) + p3[0]);
482 dp30 = abs(p2[-3] - (p2[-2] << 1) + p2[-1]);
483 dq30 = abs(p2[2] - (p2[1] << 1) + p2[0]);
486 p_is_pcm0 = p_is_pcm[0];
487 q_is_pcm0 = q_is_pcm[0];
489 dp04 = abs(p1[-3] - (p1[-2] << 1) + p1[-1]);
490 dq04 = abs(p1[2] - (p1[1] << 1) + p1[0]);
491 dp34 = abs(p0[-3] - (p0[-2] << 1) + p0[-1]);
492 dq34 = abs(p0[2] - (p0[1] << 1) + p0[0]);
495 p_is_pcm4 = p_is_pcm[1];
496 q_is_pcm4 = q_is_pcm[1];
498 cmp0 = __msa_fill_d(p_is_pcm0);
499 cmp1 = __msa_fill_d(p_is_pcm4);
500 p_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0);
501 p_is_pcm_vec = __msa_ceqi_d(p_is_pcm_vec, 0);
503 d0030 = (d00 + d30) >= beta;
504 d0434 = (d04 + d34) >= beta;
506 cmp0 = __msa_fill_d(d0030);
507 cmp1 = __msa_fill_d(d0434);
508 cmp3 = __msa_ilvev_d(cmp1, cmp0);
509 cmp3 = (v2i64) __msa_ceqi_d(cmp3, 0);
511 if ((!p_is_pcm0 || !p_is_pcm4 || !q_is_pcm0 || !q_is_pcm4) &&
512 (!d0030 || !d0434)) {
514 LD_UH8(src, stride, p3_src, p2_src, p1_src, p0_src, q0_src, q1_src,
517 cmp0 = __msa_fill_d(q_is_pcm0);
518 cmp1 = __msa_fill_d(q_is_pcm4);
519 q_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0);
520 q_is_pcm_vec = __msa_ceqi_d(q_is_pcm_vec, 0);
525 tc250 = ((tc0 * 5 + 1) >> 1);
528 tc254 = ((tc4 * 5 + 1) >> 1);
529 cmp0 = (v2i64) __msa_fill_h(tc0 << 1);
530 cmp1 = (v2i64) __msa_fill_h(tc4 << 1);
531 tc_pos = (v8i16) __msa_ilvev_d(cmp1, cmp0);
534 q2_src, q3_src, p3_src, p2_src, p1_src, p0_src,
535 q0_src, q1_src, q2_src, q3_src);
537 flag0 = abs(p3[-4] - p3[-1]) + abs(p3[3] - p3[0]) < beta30 &&
538 abs(p3[-1] - p3[0]) < tc250;
539 flag0 = flag0 && (abs(p2[-4] - p2[-1]) + abs(p2[3] - p2[0]) < beta30 &&
540 abs(p2[-1] - p2[0]) < tc250 && (d00 << 1) < beta20 &&
541 (d30 << 1) < beta20);
542 cmp0 = __msa_fill_d(flag0);
543 ILVR_B4_UH(zero, p3_src, zero, p2_src, zero, p1_src, zero, p0_src,
544 p3_src, p2_src, p1_src, p0_src);
546 flag1 = abs(p1[-4] - p1[-1]) + abs(p1[3] - p1[0]) < beta30 &&
547 abs(p1[-1] - p1[0]) < tc254;
548 flag1 = flag1 && (abs(p0[-4] - p0[-1]) + abs(p0[3] - p0[0]) < beta30 &&
549 abs(p0[-1] - p0[0]) < tc254 && (d04 << 1) < beta20 &&
550 (d34 << 1) < beta20);
551 ILVR_B4_UH(zero, q0_src, zero, q1_src, zero, q2_src, zero, q3_src,
552 q0_src, q1_src, q2_src, q3_src);
554 cmp1 = __msa_fill_d(flag1);
555 cmp2 = __msa_ilvev_d(cmp1, cmp0);
556 cmp2 = __msa_ceqi_d(cmp2, 0);
558 if (flag0 && flag1) {
563 temp0 = (p1_src + p0_src + q0_src);
565 temp1 = ((p3_src + p2_src) << 1) + p2_src + temp0;
566 temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
567 temp2 = (v8i16) (temp1 - p2_src);
temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
569 dst0 = (v16u8) (temp2 + (v8i16) p2_src);
571 temp1 = temp0 + p2_src;
572 temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 2);
573 temp2 = (v8i16) (temp1 - p1_src);
temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
575 dst1 = (v16u8) (temp2 + (v8i16) p1_src);
577 temp1 = (temp0 << 1) + p2_src + q1_src;
578 temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
579 temp2 = (v8i16) (temp1 - p0_src);
temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
581 dst2 = (v16u8) (temp2 + (v8i16) p0_src);
583 dst0 = __msa_bmz_v(dst0, (v16u8) p2_src, (v16u8) p_is_pcm_vec);
584 dst1 = __msa_bmz_v(dst1, (v16u8) p1_src, (v16u8) p_is_pcm_vec);
585 dst2 = __msa_bmz_v(dst2, (v16u8) p0_src, (v16u8) p_is_pcm_vec);
588 temp0 = (q1_src + p0_src + q0_src);
589 temp1 = ((q3_src + q2_src) << 1) + q2_src + temp0;
590 temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
591 temp2 = (v8i16) (temp1 - q2_src);
temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
593 dst5 = (v16u8) (temp2 + (v8i16) q2_src);
595 temp1 = temp0 + q2_src;
596 temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 2);
597 temp2 = (v8i16) (temp1 - q1_src);
temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
599 dst4 = (v16u8) (temp2 + (v8i16) q1_src);
601 temp1 = (temp0 << 1) + p1_src + q2_src;
602 temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
603 temp2 = (v8i16) (temp1 - q0_src);
temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
605 dst3 = (v16u8) (temp2 + (v8i16) q0_src);
607 dst3 = __msa_bmz_v(dst3, (v16u8) q0_src, (v16u8) q_is_pcm_vec);
608 dst4 = __msa_bmz_v(dst4, (v16u8) q1_src, (v16u8) q_is_pcm_vec);
609 dst5 = __msa_bmz_v(dst5, (v16u8) q2_src, (v16u8) q_is_pcm_vec);
} else if (flag0 == flag1) { /* weak filter only */
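/* Weak filter for the vertical edge: same delta0, 10*tc and side-activity
 * logic as in the horizontal path; only the final interleaving and the
 * narrow per-row stores differ. */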
616 diff0 = (v8i16) (q0_src - p0_src);
617 diff1 = (v8i16) (q1_src - p1_src);
618 diff0 = (diff0 << 3) + diff0;
619 diff1 = (diff1 << 1) + diff1;
620 delta0 = diff0 - diff1;
621 delta0 = __msa_srari_h(delta0, 4);
623 temp1 = (v8u16) ((tc_pos << 3) + (tc_pos << 1));
624 abs_delta0 = __msa_add_a_h(delta0, (v8i16) zero);
625 abs_delta0 = (v8u16) abs_delta0 < temp1;
delta0 = CLIP_SH(delta0, tc_neg, tc_pos);
628 temp0 = (v8u16) (delta0 + p0_src);
630 temp0 = (v8u16) __msa_bmz_v((v16u8) temp0, (v16u8) p0_src,
631 (v16u8) p_is_pcm_vec);
633 temp2 = (v8i16) (q0_src - delta0);
635 temp2 = (v8i16) __msa_bmz_v((v16u8) temp2, (v16u8) q0_src,
636 (v16u8) q_is_pcm_vec);
638 tmp = ((beta + (beta >> 1)) >> 3);
639 cmp0 = __msa_fill_d(!p_is_pcm0 && ((dp00 + dp30) < tmp));
640 cmp1 = __msa_fill_d(!p_is_pcm4 && ((dp04 + dp34) < tmp));
641 p_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0);
642 p_is_pcm_vec = __msa_ceqi_d(p_is_pcm_vec, 0);
cmp0 = (v2i64) __msa_fill_h((!q_is_pcm0) && (dq00 + dq30 < tmp));
cmp1 = (v2i64) __msa_fill_h((!q_is_pcm4) && (dq04 + dq34 < tmp));
646 q_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0);
647 q_is_pcm_vec = __msa_ceqi_d(q_is_pcm_vec, 0);
652 delta1 = (v8i16) __msa_aver_u_h(p2_src, p0_src);
653 delta1 -= (v8i16) p1_src;
delta1 = CLIP_SH(delta1, tc_neg, tc_pos);
657 delta1 = (v8i16) p1_src + (v8i16) delta1;
659 delta1 = (v8i16) __msa_bmnz_v((v16u8) delta1, (v16u8) p1_src,
660 (v16u8) p_is_pcm_vec);
662 delta2 = (v8i16) __msa_aver_u_h(q0_src, q2_src);
663 delta2 = delta2 - (v8i16) q1_src;
664 delta2 = delta2 - delta0;
665 delta2 = delta2 >> 1;
delta2 = CLIP_SH(delta2, tc_neg, tc_pos);
667 delta2 = (v8i16) q1_src + (v8i16) delta2;
669 delta2 = (v8i16) __msa_bmnz_v((v16u8) delta2, (v16u8) q1_src,
670 (v16u8) q_is_pcm_vec);
672 dst0 = __msa_bmz_v((v16u8) delta1, (v16u8) p1_src,
674 dst1 = __msa_bmz_v((v16u8) temp0, (v16u8) p0_src,
676 dst2 = __msa_bmz_v((v16u8) temp2, (v16u8) q0_src,
678 dst3 = __msa_bmz_v((v16u8) delta2, (v16u8) q1_src,
682 dst0 = __msa_bmz_v(dst0, (v16u8) p1_src, (v16u8) cmp3);
683 dst1 = __msa_bmz_v(dst1, (v16u8) p0_src, (v16u8) cmp3);
684 dst2 = __msa_bmz_v(dst2, (v16u8) q0_src, (v16u8) cmp3);
685 dst3 = __msa_bmz_v(dst3, (v16u8) q1_src, (v16u8) cmp3);
695 tmp2 = __msa_copy_u_w((v4i32) dst0, 0);
696 tmp3 = __msa_copy_u_w((v4i32) dst0, 1);
702 tmp2 = __msa_copy_u_w((v4i32) dst0, 2);
703 tmp3 = __msa_copy_u_w((v4i32) dst0, 3);
709 tmp2 = __msa_copy_u_w((v4i32) dst1, 0);
710 tmp3 = __msa_copy_u_w((v4i32) dst1, 1);
716 tmp2 = __msa_copy_u_w((v4i32) dst1, 2);
717 tmp3 = __msa_copy_u_w((v4i32) dst1, 3);
728 temp0 = (p1_src + p0_src + q0_src);
730 temp1 = ((p3_src + p2_src) << 1) + p2_src + temp0;
731 temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
732 temp2 = (v8i16) (temp1 - p2_src);
temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
734 dst0 = (v16u8) (temp2 + (v8i16) p2_src);
736 temp1 = temp0 + p2_src;
737 temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 2);
738 temp2 = (v8i16) (temp1 - p1_src);
temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
740 dst1 = (v16u8) (temp2 + (v8i16) p1_src);
742 temp1 = (temp0 << 1) + p2_src + q1_src;
743 temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
744 temp2 = (v8i16) (temp1 - p0_src);
temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
746 dst2 = (v16u8) (temp2 + (v8i16) p0_src);
748 dst0 = __msa_bmz_v(dst0, (v16u8) p2_src, (v16u8) p_is_pcm_vec);
749 dst1 = __msa_bmz_v(dst1, (v16u8) p1_src, (v16u8) p_is_pcm_vec);
750 dst2 = __msa_bmz_v(dst2, (v16u8) p0_src, (v16u8) p_is_pcm_vec);
753 temp0 = (q1_src + p0_src + q0_src);
754 temp1 = ((q3_src + q2_src) << 1) + q2_src + temp0;
755 temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
756 temp2 = (v8i16) (temp1 - q2_src);
temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
758 dst5 = (v16u8) (temp2 + (v8i16) q2_src);
760 temp1 = temp0 + q2_src;
761 temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 2);
762 temp2 = (v8i16) (temp1 - q1_src);
temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
764 dst4 = (v16u8) (temp2 + (v8i16) q1_src);
766 temp1 = (temp0 << 1) + p1_src + q2_src;
767 temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
768 temp2 = (v8i16) (temp1 - q0_src);
temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
770 dst3 = (v16u8) (temp2 + (v8i16) q0_src);
772 dst3 = __msa_bmz_v(dst3, (v16u8) q0_src, (v16u8) q_is_pcm_vec);
773 dst4 = __msa_bmz_v(dst4, (v16u8) q1_src, (v16u8) q_is_pcm_vec);
774 dst5 = __msa_bmz_v(dst5, (v16u8) q2_src, (v16u8) q_is_pcm_vec);
781 diff0 = (v8i16) (q0_src - p0_src);
782 diff1 = (v8i16) (q1_src - p1_src);
783 diff0 = (diff0 << 3) + diff0;
784 diff1 = (diff1 << 1) + diff1;
785 delta0 = diff0 - diff1;
786 delta0 = __msa_srari_h(delta0, 4);
788 temp1 = (v8u16) ((tc_pos << 3) + (tc_pos << 1));
789 abs_delta0 = __msa_add_a_h(delta0, (v8i16) zero);
790 abs_delta0 = (v8u16) abs_delta0 < temp1;
delta0 = CLIP_SH(delta0, tc_neg, tc_pos);
794 temp0 = (v8u16) (delta0 + p0_src);
796 temp0 = (v8u16) __msa_bmz_v((v16u8) temp0, (v16u8) p0_src,
797 (v16u8) p_is_pcm_vec);
799 temp2 = (v8i16) (q0_src - delta0);
801 temp2 = (v8i16) __msa_bmz_v((v16u8) temp2, (v16u8) q0_src,
802 (v16u8) q_is_pcm_vec);
804 tmp = (beta + (beta >> 1)) >> 3;
805 cmp0 = __msa_fill_d(!p_is_pcm0 && ((dp00 + dp30) < tmp));
806 cmp1 = __msa_fill_d(!p_is_pcm4 && ((dp04 + dp34) < tmp));
807 p_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0);
808 p_is_pcm_vec = __msa_ceqi_d(p_is_pcm_vec, 0);
cmp0 = (v2i64) __msa_fill_h((!q_is_pcm0) && (dq00 + dq30 < tmp));
cmp1 = (v2i64) __msa_fill_h((!q_is_pcm4) && (dq04 + dq34 < tmp));
812 q_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0);
813 q_is_pcm_vec = __msa_ceqi_d(q_is_pcm_vec, 0);
818 delta1 = (v8i16) __msa_aver_u_h(p2_src, p0_src);
819 delta1 -= (v8i16) p1_src;
delta1 = CLIP_SH(delta1, tc_neg, tc_pos);
823 delta1 = (v8i16) p1_src + (v8i16) delta1;
825 delta1 = (v8i16) __msa_bmnz_v((v16u8) delta1, (v16u8) p1_src,
826 (v16u8) p_is_pcm_vec);
828 delta2 = (v8i16) __msa_aver_u_h(q0_src, q2_src);
829 delta2 = delta2 - (v8i16) q1_src;
830 delta2 = delta2 - delta0;
831 delta2 = delta2 >> 1;
delta2 = CLIP_SH(delta2, tc_neg, tc_pos);
833 delta2 = (v8i16) q1_src + (v8i16) delta2;
835 delta2 = (v8i16) __msa_bmnz_v((v16u8) delta2, (v16u8) q1_src,
836 (v16u8) q_is_pcm_vec);
837 delta1 = (v8i16) __msa_bmz_v((v16u8) delta1, (v16u8) p1_src,
839 temp0 = (v8u16) __msa_bmz_v((v16u8) temp0, (v16u8) p0_src,
841 temp2 = (v8i16) __msa_bmz_v((v16u8) temp2, (v16u8) q0_src,
843 delta2 = (v8i16) __msa_bmz_v((v16u8) delta2, (v16u8) q1_src,
848 dst2 = __msa_bmnz_v(dst2, (v16u8) temp0, (v16u8) cmp2);
849 dst3 = __msa_bmnz_v(dst3, (v16u8) temp2, (v16u8) cmp2);
850 dst1 = __msa_bmnz_v(dst1, (v16u8) delta1, (v16u8) cmp2);
851 dst4 = __msa_bmnz_v(dst4, (v16u8) delta2, (v16u8) cmp2);
852 dst0 = __msa_bmnz_v(dst0, (v16u8) p2_src, (v16u8) cmp2);
853 dst5 = __msa_bmnz_v(dst5, (v16u8) q2_src, (v16u8) cmp2);
856 dst0 = __msa_bmz_v(dst0, (v16u8) p2_src, (v16u8) cmp3);
857 dst1 = __msa_bmz_v(dst1, (v16u8) p1_src, (v16u8) cmp3);
858 dst2 = __msa_bmz_v(dst2, (v16u8) p0_src, (v16u8) cmp3);
859 dst3 = __msa_bmz_v(dst3, (v16u8) q0_src, (v16u8) cmp3);
860 dst4 = __msa_bmz_v(dst4, (v16u8) q1_src, (v16u8) cmp3);
861 dst5 = __msa_bmz_v(dst5, (v16u8) q2_src, (v16u8) cmp3);
864 PCKEV_B4_UB(dst2, dst0, dst3, dst1, dst4, dst4, dst5, dst5, dst0, dst1,
875 tmp2 = __msa_copy_u_w((v4i32) dst0, 0);
876 tmp3 = __msa_copy_u_w((v4i32) dst0, 1);
877 tmp0 = __msa_copy_u_h((v8i16) dst2, 0);
878 tmp1 = __msa_copy_u_h((v8i16) dst2, 2);
886 tmp2 = __msa_copy_u_w((v4i32) dst0, 2);
887 tmp3 = __msa_copy_u_w((v4i32) dst0, 3);
888 tmp0 = __msa_copy_u_h((v8i16) dst2, 4);
889 tmp1 = __msa_copy_u_h((v8i16) dst2, 6);
897 tmp2 = __msa_copy_u_w((v4i32) dst1, 0);
898 tmp3 = __msa_copy_u_w((v4i32) dst1, 1);
899 tmp0 = __msa_copy_u_h((v8i16) dst3, 0);
900 tmp1 = __msa_copy_u_h((v8i16) dst3, 2);
908 tmp2 = __msa_copy_u_w((v4i32) dst1, 2);
909 tmp3 = __msa_copy_u_w((v4i32) dst1, 3);
910 tmp0 = __msa_copy_u_h((v8i16) dst3, 4);
911 tmp1 = __msa_copy_u_h((v8i16) dst3, 6);
924 uint8_t *p1_ptr = src - (stride << 1);
928 v2i64 cmp0, cmp1, p_is_pcm_vec, q_is_pcm_vec;
v8u16 p1, p0, q0, q1;
930 v8i16 tc_pos, tc_neg;
v8i16 temp0, temp1, delta;
934 if (!(tc[0] <= 0) || !(tc[1] <= 0)) {
935 cmp0 = (v2i64) __msa_fill_h(tc[0]);
936 cmp1 = (v2i64) __msa_fill_h(tc[1]);
937 tc_pos = (v8i16) __msa_ilvev_d(cmp1, cmp0);
940 cmp0 = __msa_fill_d(p_is_pcm[0]);
941 cmp1 = __msa_fill_d(p_is_pcm[1]);
942 p_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0);
943 p_is_pcm_vec = __msa_ceqi_d(p_is_pcm_vec, 0);
945 cmp0 = __msa_fill_d(q_is_pcm[0]);
946 cmp1 = __msa_fill_d(q_is_pcm[1]);
947 q_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0);
948 q_is_pcm_vec = __msa_ceqi_d(q_is_pcm_vec, 0);
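/* Chroma weak filter: delta = ((((q0 - p0) << 2) + p1 - q1 + 4) >> 3),
 * clamped to [-tc, tc], then added to p0 and subtracted from q0. Halves
 * whose tc is <= 0 (clei test below) or that belong to a PCM block are
 * written back unchanged via the bmz/bmnz selects. */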
955 ILVR_B4_UH(zero, p1, zero, p0, zero, q0, zero, q1, p1, p0, q0, q1);
957 temp0 = (v8i16) (q0 - p0);
958 temp1 = (v8i16) (p1 - q1);
961 delta = __msa_srari_h((v8i16) temp0, 3);
delta = CLIP_SH(delta, tc_neg, tc_pos);
temp0 = (v8i16) ((v8i16) p0 + delta);
966 temp0 = (v8i16) __msa_bmz_v((v16u8) temp0, (v16u8) p0,
967 (v16u8) p_is_pcm_vec);
temp1 = (v8i16) ((v8i16) q0 - delta);
971 temp1 = (v8i16) __msa_bmz_v((v16u8) temp1, (v16u8) q0,
972 (v16u8) q_is_pcm_vec);
974 tc_pos = (v8i16) __msa_clei_s_d((v2i64) tc_pos, 0);
975 temp0 = (v8i16) __msa_bmnz_v((v16u8) temp0, (v16u8) p0, (v16u8) tc_pos);
976 temp1 = (v8i16) __msa_bmnz_v((v16u8) temp1, (v16u8) q0, (v16u8) tc_pos);
978 temp0 = (v8i16) __msa_pckev_b((v16i8) temp1, (v16i8) temp0);
987 v2i64 cmp0, cmp1, p_is_pcm_vec, q_is_pcm_vec;
v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
v8u16 p1, p0, q0, q1;
990 v8i16 tc_pos, tc_neg;
v8i16 temp0, temp1, delta;
994 if (!(tc[0] <= 0) || !(tc[1] <= 0)) {
995 cmp0 = (v2i64) __msa_fill_h(tc[0]);
996 cmp1 = (v2i64) __msa_fill_h(tc[1]);
997 tc_pos = (v8i16) __msa_ilvev_d(cmp1, cmp0);
1000 cmp0 = __msa_fill_d(p_is_pcm[0]);
1001 cmp1 = __msa_fill_d(p_is_pcm[1]);
1002 p_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0);
1003 p_is_pcm_vec = __msa_ceqi_d(p_is_pcm_vec, 0);
1005 cmp0 = __msa_fill_d(q_is_pcm[0]);
1006 cmp1 = __msa_fill_d(q_is_pcm[1]);
1007 q_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0);
1008 q_is_pcm_vec = __msa_ceqi_d(q_is_pcm_vec, 0);
1011 LD_UB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7);
1014 ILVR_B4_UH(zero, p1, zero, p0, zero, q0, zero, q1, p1, p0, q0, q1);
1016 temp0 = (v8i16) (q0 - p0);
1017 temp1 = (v8i16) (p1 - q1);
1020 delta = __msa_srari_h((v8i16) temp0, 3);
delta = CLIP_SH(delta, tc_neg, tc_pos);
temp0 = (v8i16) ((v8i16) p0 + delta);
1025 temp0 = (v8i16) __msa_bmz_v((v16u8) temp0, (v16u8) p0,
1026 (v16u8) p_is_pcm_vec);
temp1 = (v8i16) ((v8i16) q0 - delta);
1030 temp1 = (v8i16) __msa_bmz_v((v16u8) temp1, (v16u8) q0,
1031 (v16u8) q_is_pcm_vec);
1033 tc_pos = (v8i16) __msa_clei_s_d((v2i64) tc_pos, 0);
1034 temp0 = (v8i16) __msa_bmnz_v((v16u8) temp0, (v16u8) p0, (v16u8) tc_pos);
1035 temp1 = (v8i16) __msa_bmnz_v((v16u8) temp1, (v16u8) q0, (v16u8) tc_pos);
1037 temp0 = (v8i16) __msa_ilvev_b((v16i8) temp1, (v16i8) temp0);
1049 int16_t *sao_offset_val,
1053 v16i8 src0_r, src1_r;
1055 v16i8 dst0, offset0, offset1;
offset_val = LD_SB(sao_offset_val + 1);
1059 offset_val = (v16i8) __msa_pckev_d((v2i64) offset_val, (v2i64) offset_val);
1061 offset_val = __msa_pckev_b(offset_val, offset_val);
offset1 = (v16i8) __msa_insve_w((v4i32) zero, 3, (v4i32) offset_val);
1063 offset0 = __msa_sld_b(offset1, zero, 28 - ((sao_left_class) & 31));
1064 offset1 = __msa_sld_b(zero, offset1, 28 - ((sao_left_class) & 31));
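/* Band offset: for 8-bit samples the band index is pixel >> 3 (32 bands).
 * The four signed offsets from sao_offset_val[1..4] are packed to bytes and
 * rotated by the two sld_b shifts into a 32-entry {offset0, offset1} table,
 * so that vshf_b indexed by the band index returns the offset for the four
 * bands starting at sao_left_class and zero elsewhere; the SWAP below
 * appears to cover the start positions where the rotation lands the window
 * in the other half of the table. The xori with 128 moves pixels into a
 * signed range so adds_s_b saturates correctly before converting back. */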
1067 LD_UB4(src, src_stride, src0, src1, src2, src3);
1069 if (!((sao_left_class > 12) & (sao_left_class < 29))) {
1070 SWAP(offset0, offset1);
for (height -= 4; height; height -= 4) {
1074 src += (4 * src_stride);
1076 ILVEV_D2_SB(src0, src1, src2, src3, src0_r, src1_r);
1078 src0_r = (v16i8) __msa_pckev_w((v4i32) src1_r, (v4i32) src0_r);
1079 mask = __msa_srli_b(src0_r, 3);
1080 offset = __msa_vshf_b(mask, offset1, offset0);
1082 src0_r = (v16i8) __msa_xori_b((v16u8) src0_r, 128);
1083 dst0 = __msa_adds_s_b(src0_r, offset);
1084 dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
1087 LD_UB4(src, src_stride, src0, src1, src2, src3);
1090 ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dst, dst_stride);
1091 dst += (4 * dst_stride);
1094 ILVEV_D2_SB(src0, src1, src2, src3, src0_r, src1_r);
1096 src0_r = (v16i8) __msa_pckev_w((v4i32) src1_r, (v4i32) src0_r);
1097 mask = __msa_srli_b(src0_r, 3);
1098 offset = __msa_vshf_b(mask, offset1, offset0);
1100 src0_r = (v16i8) __msa_xori_b((v16u8) src0_r, 128);
1101 dst0 = __msa_adds_s_b(src0_r, offset);
1102 dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
1105 ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dst, dst_stride);
1111 int16_t *sao_offset_val,
1115 v16i8 src0_r, src1_r, mask0, mask1;
1116 v16i8 offset_mask0, offset_mask1, offset_val;
1117 v16i8 offset0, offset1, dst0, dst1;
offset_val = LD_SB(sao_offset_val + 1);
1121 offset_val = (v16i8) __msa_pckev_d((v2i64) offset_val, (v2i64) offset_val);
1122 offset_val = __msa_pckev_b(offset_val, offset_val);
offset1 = (v16i8) __msa_insve_w((v4i32) zero, 3, (v4i32) offset_val);
1124 offset0 = __msa_sld_b(offset1, zero, 28 - ((sao_left_class) & 31));
1125 offset1 = __msa_sld_b(zero, offset1, 28 - ((sao_left_class) & 31));
1128 LD_UB4(src, src_stride, src0, src1, src2, src3);
1130 if (!((sao_left_class > 12) & (sao_left_class < 29))) {
1131 SWAP(offset0, offset1);
for (height -= 4; height; height -= 4) {
1135 src += src_stride << 2;
1137 ILVR_D2_SB(src1, src0, src3, src2, src0_r, src1_r);
1139 mask0 = __msa_srli_b(src0_r, 3);
1140 mask1 = __msa_srli_b(src1_r, 3);
1142 offset_mask0 = __msa_vshf_b(mask0, offset1, offset0);
1143 offset_mask1 = __msa_vshf_b(mask1, offset1, offset0);
1146 LD_UB4(src, src_stride, src0, src1, src2, src3);
1150 dst0 = __msa_adds_s_b(src0_r, offset_mask0);
1151 dst1 = __msa_adds_s_b(src1_r, offset_mask1);
1156 ST8x4_UB(dst0, dst1, dst, dst_stride);
1157 dst += dst_stride << 2;
1160 ILVR_D2_SB(src1, src0, src3, src2, src0_r, src1_r);
1162 mask0 = __msa_srli_b(src0_r, 3);
1163 mask1 = __msa_srli_b(src1_r, 3);
1165 offset_mask0 = __msa_vshf_b(mask0, offset1, offset0);
1166 offset_mask1 = __msa_vshf_b(mask1, offset1, offset0);
1170 dst0 = __msa_adds_s_b(src0_r, offset_mask0);
1171 dst1 = __msa_adds_s_b(src1_r, offset_mask1);
1176 ST8x4_UB(dst0, dst1, dst, dst_stride);
1184 int16_t *sao_offset_val,
1189 v16i8 out0, out1, out2, out3;
1190 v16i8 mask0, mask1, mask2, mask3;
1191 v16i8 tmp0, tmp1, tmp2, tmp3, offset_val;
1192 v16i8 offset0, offset1;
offset_val = LD_SB(sao_offset_val + 1);
1196 offset_val = (v16i8) __msa_pckev_d((v2i64) offset_val, (v2i64) offset_val);
1197 offset_val = __msa_pckev_b(offset_val, offset_val);
offset1 = (v16i8) __msa_insve_w((v4i32) zero, 3, (v4i32) offset_val);
1199 offset0 = __msa_sld_b(offset1, zero, 28 - ((sao_left_class) & 31));
1200 offset1 = __msa_sld_b(zero, offset1, 28 - ((sao_left_class) & 31));
1202 if (!((sao_left_class > 12) & (sao_left_class < 29))) {
1203 SWAP(offset0, offset1);
1206 while (height > 0) {
1208 LD_UB4(src, src_stride, src0, src1, src2, src3);
for (w_cnt = 16; w_cnt < width; w_cnt += 16) {
1211 mask0 = __msa_srli_b((v16i8) src0, 3);
1212 mask1 = __msa_srli_b((v16i8) src1, 3);
1213 mask2 = __msa_srli_b((v16i8) src2, 3);
1214 mask3 = __msa_srli_b((v16i8) src3, 3);
1216 VSHF_B2_SB(offset0, offset1, offset0, offset1, mask0, mask1,
1218 VSHF_B2_SB(offset0, offset1, offset0, offset1, mask2, mask3,
1222 out0 = __msa_adds_s_b((v16i8) src0, tmp0);
1223 out1 = __msa_adds_s_b((v16i8) src1, tmp1);
1224 out2 = __msa_adds_s_b((v16i8) src2, tmp2);
1225 out3 = __msa_adds_s_b((v16i8) src3, tmp3);
1228 LD_UB4(src + w_cnt, src_stride, src0, src1, src2, src3);
1232 ST_SB4(out0, out1, out2, out3, dst + w_cnt - 16, dst_stride);
1235 mask0 = __msa_srli_b((v16i8) src0, 3);
1236 mask1 = __msa_srli_b((v16i8) src1, 3);
1237 mask2 = __msa_srli_b((v16i8) src2, 3);
1238 mask3 = __msa_srli_b((v16i8) src3, 3);
1240 VSHF_B2_SB(offset0, offset1, offset0, offset1, mask0, mask1, tmp0,
1242 VSHF_B2_SB(offset0, offset1, offset0, offset1, mask2, mask3, tmp2,
1246 out0 = __msa_adds_s_b((v16i8) src0, tmp0);
1247 out1 = __msa_adds_s_b((v16i8) src1, tmp1);
1248 out2 = __msa_adds_s_b((v16i8) src2, tmp2);
1249 out3 = __msa_adds_s_b((v16i8) src3, tmp3);
1253 ST_SB4(out0, out1, out2, out3, dst + w_cnt - 16, dst_stride);
1255 src += src_stride << 2;
1256 dst += dst_stride << 2;
1265 int16_t *sao_offset_val,
1268 uint32_t dst_val0, dst_val1;
1269 v16u8 cmp_minus10, diff_minus10, diff_minus11, src_minus10, src_minus11;
1270 v16i8 edge_idx = { 1, 2, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
v16i8 sao_offset = LD_SB(sao_offset_val);
1273 v16u8 const1 = (v16u8) __msa_ldi_b(1);
1276 sao_offset = __msa_pckev_b(sao_offset, sao_offset);
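/* Edge offset, 0-degree (horizontal neighbours): for each pixel c with
 * neighbours a and b, sign(c - a) and sign(c - b) are built as -1/0/+1
 * bytes (nor_v yields the "not equal" mask, bmnz_v overwrites it with +1
 * where the neighbour is smaller), summed with 2 to give the edge class
 * 0..4, remapped through edge_idx and used by vshf_b to gather the
 * per-class offset, which is added with a saturating add in the xori-128
 * signed domain. */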
1280 LD_UB2(src, src_stride, src_minus10, src_minus11);
for (height -= 2; height; height -= 2) {
1283 src += (2 * src_stride);
1285 src_minus10 = (v16u8) __msa_pckev_d((v2i64) src_minus11,
1286 (v2i64) src_minus10);
1288 src0 = (v16i8) __msa_sldi_b(zero, (v16i8) src_minus10, 1);
1289 src_plus10 = (v16i8) __msa_sldi_b(zero, (v16i8) src_minus10, 2);
1291 cmp_minus10 = ((v16u8) src0 == src_minus10);
1292 diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
1293 cmp_minus10 = (src_minus10 < (v16u8) src0);
1294 diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);
1296 cmp_minus10 = ((v16u8) src0 == (v16u8) src_plus10);
1297 diff_minus11 = __msa_nor_v(cmp_minus10, cmp_minus10);
cmp_minus10 = ((v16u8) src_plus10 < (v16u8) src0);
1299 diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus10);
1301 offset = (v16i8) diff_minus10 + (v16i8) diff_minus11 + 2;
1304 LD_UB2(src, src_stride, src_minus10, src_minus11);
1306 VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset, offset, offset,
src0 = (v16i8) __msa_xori_b((v16u8) src0, 128);
1310 dst0 = __msa_adds_s_b(src0, offset);
1311 dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
1313 dst_val0 = __msa_copy_u_w((v4i32) dst0, 0);
1314 dst_val1 = __msa_copy_u_w((v4i32) dst0, 2);
1321 src_minus10 = (v16u8) __msa_pckev_d((v2i64) src_minus11,
1322 (v2i64) src_minus10);
1324 src0 = (v16i8) __msa_sldi_b(zero, (v16i8) src_minus10, 1);
1325 src_plus10 = (v16i8) __msa_sldi_b(zero, (v16i8) src_minus10, 2);
1327 cmp_minus10 = ((v16u8) src0 == src_minus10);
1328 diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
1329 cmp_minus10 = (src_minus10 < (v16u8) src0);
1330 diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);
1332 cmp_minus10 = ((v16u8) src0 == (v16u8) src_plus10);
1333 diff_minus11 = __msa_nor_v(cmp_minus10, cmp_minus10);
cmp_minus10 = ((v16u8) src_plus10 < (v16u8) src0);
1335 diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus10);
1337 offset = (v16i8) diff_minus10 + (v16i8) diff_minus11 + 2;
1338 VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset, offset, offset,
src0 = (v16i8) __msa_xori_b((v16u8) src0, 128);
1342 dst0 = __msa_adds_s_b(src0, offset);
1343 dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
1345 dst_val0 = __msa_copy_u_w((v4i32) dst0, 0);
1346 dst_val1 = __msa_copy_u_w((v4i32) dst0, 2);
1357 int16_t *sao_offset_val,
1360 uint64_t dst_val0, dst_val1;
1361 v16i8 edge_idx = { 1, 2, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
1362 v16u8 const1 = (v16u8) __msa_ldi_b(1);
1363 v16u8 cmp_minus10, diff_minus10, diff_minus11;
v16u8 src0, src1, dst0, src_minus10, src_minus11, src_plus10, src_plus11;
1367 sao_offset = __msa_pckev_b(sao_offset, sao_offset);
1371 LD_UB2(src, src_stride, src_minus10, src_minus11);
for (height -= 2; height; height -= 2) {
1374 src += (src_stride << 1);
1377 SLDI_B2_0_UB(src_minus10, src_minus11, src_plus10, src_plus11, 2);
1379 PCKEV_D2_UB(src_minus11, src_minus10, src_plus11, src_plus10,
1380 src_minus10, src_plus10);
src0 = (v16u8) __msa_pckev_d((v2i64) src1, (v2i64) src0);
1383 cmp_minus10 = (src0 == src_minus10);
1384 diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
cmp_minus10 = (src_minus10 < src0);
1386 diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);
1388 cmp_minus10 = (src0 == src_plus10);
1389 diff_minus11 = __msa_nor_v(cmp_minus10, cmp_minus10);
cmp_minus10 = (src_plus10 < src0);
1391 diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus10);
1393 offset = (v16i8) diff_minus10 + (v16i8) diff_minus11 + 2;
1396 LD_UB2(src, src_stride, src_minus10, src_minus11);
1398 VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset, offset, offset,
1401 src0 = __msa_xori_b(src0, 128);
dst0 = (v16u8) __msa_adds_s_b((v16i8) src0, offset);
1403 dst0 = __msa_xori_b(dst0, 128);
1405 dst_val0 = __msa_copy_u_d((v2i64) dst0, 0);
1406 dst_val1 = __msa_copy_u_d((v2i64) dst0, 1);
1414 SLDI_B2_0_UB(src_minus10, src_minus11, src_plus10, src_plus11, 2);
1416 PCKEV_D2_UB(src_minus11, src_minus10, src_plus11, src_plus10, src_minus10,
src0 = (v16u8) __msa_pckev_d((v2i64) src1, (v2i64) src0);
1420 cmp_minus10 = ((v16u8) src0 == src_minus10);
1421 diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
1422 cmp_minus10 = (src_minus10 < (v16u8) src0);
1423 diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);
1425 cmp_minus10 = (src0 == src_plus10);
1426 diff_minus11 = __msa_nor_v(cmp_minus10, cmp_minus10);
cmp_minus10 = (src_plus10 < src0);
1428 diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus10);
1430 offset = (v16i8) diff_minus10 + (v16i8) diff_minus11 + 2;
1432 VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset, offset, offset,
1435 src0 = __msa_xori_b(src0, 128);
dst0 = (v16u8) __msa_adds_s_b((v16i8) src0, offset);
1437 dst0 = __msa_xori_b(dst0, 128);
1439 dst_val0 = __msa_copy_u_d((v2i64) dst0, 0);
1440 dst_val1 = __msa_copy_u_d((v2i64) dst0, 1);
1450 int16_t *sao_offset_val,
1454 uint8_t *dst_ptr, *src_minus1;
1456 v16i8 edge_idx = { 1, 2, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
1457 v16u8 const1 = (v16u8) __msa_ldi_b(1);
1459 v16u8 cmp_minus10, cmp_plus10, diff_minus10, diff_plus10, cmp_minus11;
1460 v16u8 cmp_plus11, diff_minus11, diff_plus11, cmp_minus12, cmp_plus12;
1461 v16u8 diff_minus12, diff_plus12, cmp_minus13, cmp_plus13, diff_minus13;
1463 v16u8 src10, src11, src12, src13, dst0, dst1, dst2, dst3;
1464 v16u8 src_minus10, src_minus11, src_minus12, src_minus13;
1465 v16i8 offset_mask0, offset_mask1, offset_mask2, offset_mask3;
1466 v16i8 src_zero0, src_zero1, src_zero2, src_zero3;
1467 v16i8 src_plus10, src_plus11, src_plus12, src_plus13;
sao_offset = LD_SB(sao_offset_val);
1470 sao_offset = __msa_pckev_b(sao_offset, sao_offset);
for (; height; height -= 4) {
1473 src_minus1 = src - 1;
1474 LD_UB4(src_minus1, src_stride,
1475 src_minus10, src_minus11, src_minus12, src_minus13);
for (v_cnt = 0; v_cnt < width; v_cnt += 16) {
1479 dst_ptr = dst + v_cnt;
1480 LD_UB4(src_minus1, src_stride, src10, src11, src12, src13);
1482 SLDI_B2_SB(src10, src11, src_minus10, src_minus11, src_zero0,
1484 SLDI_B2_SB(src12, src13, src_minus12, src_minus13, src_zero2,
1486 SLDI_B2_SB(src10, src11, src_minus10, src_minus11, src_plus10,
1488 SLDI_B2_SB(src12, src13, src_minus12, src_minus13, src_plus12,
1491 cmp_minus10 = ((v16u8) src_zero0 == src_minus10);
1492 cmp_plus10 = ((v16u8) src_zero0 == (v16u8) src_plus10);
1493 cmp_minus11 = ((v16u8) src_zero1 == src_minus11);
1494 cmp_plus11 = ((v16u8) src_zero1 == (v16u8) src_plus11);
1495 cmp_minus12 = ((v16u8) src_zero2 == src_minus12);
1496 cmp_plus12 = ((v16u8) src_zero2 == (v16u8) src_plus12);
1497 cmp_minus13 = ((v16u8) src_zero3 == src_minus13);
1498 cmp_plus13 = ((v16u8) src_zero3 == (v16u8) src_plus13);
1500 diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
1501 diff_plus10 = __msa_nor_v(cmp_plus10, cmp_plus10);
1502 diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
1503 diff_plus11 = __msa_nor_v(cmp_plus11, cmp_plus11);
1504 diff_minus12 = __msa_nor_v(cmp_minus12, cmp_minus12);
1505 diff_plus12 = __msa_nor_v(cmp_plus12, cmp_plus12);
1506 diff_minus13 = __msa_nor_v(cmp_minus13, cmp_minus13);
1507 diff_plus13 = __msa_nor_v(cmp_plus13, cmp_plus13);
1509 cmp_minus10 = (src_minus10 < (v16u8) src_zero0);
1510 cmp_plus10 = ((v16u8) src_plus10 < (v16u8) src_zero0);
1511 cmp_minus11 = (src_minus11 < (v16u8) src_zero1);
1512 cmp_plus11 = ((v16u8) src_plus11 < (v16u8) src_zero1);
1513 cmp_minus12 = (src_minus12 < (v16u8) src_zero2);
1514 cmp_plus12 = ((v16u8) src_plus12 < (v16u8) src_zero2);
1515 cmp_minus13 = (src_minus13 < (v16u8) src_zero3);
1516 cmp_plus13 = ((v16u8) src_plus13 < (v16u8) src_zero3);
1518 diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);
1519 diff_plus10 = __msa_bmnz_v(diff_plus10, const1, cmp_plus10);
1520 diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);
1521 diff_plus11 = __msa_bmnz_v(diff_plus11, const1, cmp_plus11);
1522 diff_minus12 = __msa_bmnz_v(diff_minus12, const1, cmp_minus12);
1523 diff_plus12 = __msa_bmnz_v(diff_plus12, const1, cmp_plus12);
1524 diff_minus13 = __msa_bmnz_v(diff_minus13, const1, cmp_minus13);
1525 diff_plus13 = __msa_bmnz_v(diff_plus13, const1, cmp_plus13);
1527 offset_mask0 = 2 + (v16i8) diff_minus10 + (v16i8) diff_plus10;
1528 VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset, offset_mask0,
1529 offset_mask0, offset_mask0, offset_mask0);
1530 offset_mask1 = 2 + (v16i8) diff_minus11 + (v16i8) diff_plus11;
1531 VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset, offset_mask1,
1532 offset_mask1, offset_mask1, offset_mask1);
1533 offset_mask2 = 2 + (v16i8) diff_minus12 + (v16i8) diff_plus12;
1534 VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset, offset_mask2,
1535 offset_mask2, offset_mask2, offset_mask2);
1536 offset_mask3 = 2 + (v16i8) diff_minus13 + (v16i8) diff_plus13;
1537 VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset, offset_mask3,
1538 offset_mask3, offset_mask3, offset_mask3);
1542 dst0 = (v16u8) __msa_adds_s_b((v16i8) src_zero0, offset_mask0);
1543 dst1 = (v16u8) __msa_adds_s_b((v16i8) src_zero1, offset_mask1);
1544 dst2 = (v16u8) __msa_adds_s_b((v16i8) src_zero2, offset_mask2);
1545 dst3 = (v16u8) __msa_adds_s_b((v16i8) src_zero3, offset_mask3);
1549 src_minus10 = src10;
1550 ST_UB(dst0, dst_ptr);
1551 src_minus11 = src11;
1552 ST_UB(dst1, dst_ptr + dst_stride);
1553 src_minus12 = src12;
1554 ST_UB(dst2, dst_ptr + (dst_stride << 1));
1555 src_minus13 = src13;
1556 ST_UB(dst3, dst_ptr + (dst_stride * 3));
1559 src += (src_stride << 2);
1560 dst += (dst_stride << 2);
1568 int16_t *sao_offset_val,
1571 uint32_t dst_val0, dst_val1;
1572 v16i8 edge_idx = { 1, 2, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
1573 v16u8 const1 = (v16u8) __msa_ldi_b(1);
v16i8 sao_offset = LD_SB(sao_offset_val);
1576 v16u8 cmp_minus10, diff_minus10, cmp_minus11, diff_minus11;
1577 v16u8 src_minus10, src_minus11, src10, src11;
1578 v16i8 src_zero0, src_zero1;
1580 v8i16 offset_mask0, offset_mask1;
1582 sao_offset = __msa_pckev_b(sao_offset, sao_offset);
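/* Edge offset, 90-degree: the neighbours are the samples directly above and
 * below, so each row is interleaved (ilvr_b) with its neighbouring rows and
 * the same sign / edge-class / offset-gather sequence as in the 0-degree
 * case is applied, two rows per iteration. */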
1585 LD_UB2(src - src_stride, src_stride, src_minus10, src_minus11);
1586 LD_UB2(src + src_stride, src_stride, src10, src11);
for (height -= 2; height; height -= 2) {
1589 src += (src_stride << 1);
1591 src_minus10 = (v16u8) __msa_ilvr_b((v16i8) src10, (v16i8) src_minus10);
1592 src_zero0 = __msa_ilvr_b((v16i8) src_minus11, (v16i8) src_minus11);
1593 src_minus11 = (v16u8) __msa_ilvr_b((v16i8) src11, (v16i8) src_minus11);
1594 src_zero1 = __msa_ilvr_b((v16i8) src10, (v16i8) src10);
1596 cmp_minus10 = ((v16u8) src_zero0 == src_minus10);
1597 diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
1598 cmp_minus10 = (src_minus10 < (v16u8) src_zero0);
1599 diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);
1601 cmp_minus11 = ((v16u8) src_zero1 == src_minus11);
1602 diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
1603 cmp_minus11 = (src_minus11 < (v16u8) src_zero1);
1604 diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);
1606 offset_mask0 = (v8i16) (__msa_hadd_u_h(diff_minus10, diff_minus10) + 2);
1607 offset_mask1 = (v8i16) (__msa_hadd_u_h(diff_minus11, diff_minus11) + 2);
1609 offset = __msa_pckev_b((v16i8) offset_mask1, (v16i8) offset_mask0);
1610 dst0 = __msa_pckev_b((v16i8) src_zero1, (v16i8) src_zero0);
1612 VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset, offset, offset,
1615 dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
1616 dst0 = __msa_adds_s_b(dst0, offset);
1617 dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
1619 src_minus10 = src10;
1620 src_minus11 = src11;
1623 LD_UB2(src + src_stride, src_stride, src10, src11);
1625 dst_val0 = __msa_copy_u_w((v4i32) dst0, 0);
1626 dst_val1 = __msa_copy_u_w((v4i32) dst0, 2);
1634 src_minus10 = (v16u8) __msa_ilvr_b((v16i8) src10, (v16i8) src_minus10);
1635 src_zero0 = __msa_ilvr_b((v16i8) src_minus11, (v16i8) src_minus11);
1636 src_minus11 = (v16u8) __msa_ilvr_b((v16i8) src11, (v16i8) src_minus11);
1637 src_zero1 = __msa_ilvr_b((v16i8) src10, (v16i8) src10);
1639 cmp_minus10 = ((v16u8) src_zero0 == src_minus10);
1640 diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
1641 cmp_minus10 = (src_minus10 < (v16u8) src_zero0);
1642 diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);
1644 cmp_minus11 = ((v16u8) src_zero1 == src_minus11);
1645 diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
1646 cmp_minus11 = (src_minus11 < (v16u8) src_zero1);
1647 diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);
1649 offset_mask0 = (v8i16) (__msa_hadd_u_h(diff_minus10, diff_minus10) + 2);
1650 offset_mask1 = (v8i16) (__msa_hadd_u_h(diff_minus11, diff_minus11) + 2);
1652 offset = __msa_pckev_b((v16i8) offset_mask1, (v16i8) offset_mask0);
1653 dst0 = __msa_pckev_b((v16i8) src_zero1, (v16i8) src_zero0);
1655 VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset, offset,
1656 offset, offset, offset);
1658 dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
1659 dst0 = __msa_adds_s_b(dst0, offset);
1660 dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
1662 dst_val0 = __msa_copy_u_w((v4i32) dst0, 0);
1663 dst_val1 = __msa_copy_u_w((v4i32) dst0, 2);
1673 int16_t *sao_offset_val,
1676 uint64_t dst_val0, dst_val1;
1677 v16i8 edge_idx = { 1, 2, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
1678 v16u8 const1 = (v16u8) __msa_ldi_b(1);
1680 v16i8 src_zero0, src_zero1, dst0;
1681 v16u8 cmp_minus10, diff_minus10, cmp_minus11, diff_minus11;
1682 v16u8 src_minus10, src_minus11, src10, src11;
1683 v8i16 offset_mask0, offset_mask1;
1685 sao_offset = __msa_pckev_b(sao_offset, sao_offset);
1688 LD_UB2(src - src_stride, src_stride, src_minus10, src_minus11);
1689 LD_UB2(src + src_stride, src_stride, src10, src11);
for (height -= 2; height; height -= 2) {
1692 src += (src_stride << 1);
1694 src_minus10 = (v16u8) __msa_ilvr_b((v16i8) src10, (v16i8) src_minus10);
1695 src_zero0 = __msa_ilvr_b((v16i8) src_minus11, (v16i8) src_minus11);
1696 src_minus11 = (v16u8) __msa_ilvr_b((v16i8) src11, (v16i8) src_minus11);
1697 src_zero1 = __msa_ilvr_b((v16i8) src10, (v16i8) src10);
1699 cmp_minus10 = ((v16u8) src_zero0 == src_minus10);
1700 diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
1701 cmp_minus10 = (src_minus10 < (v16u8) src_zero0);
1702 diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);
1704 cmp_minus11 = ((v16u8) src_zero1 == src_minus11);
1705 diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
1706 cmp_minus11 = (src_minus11 < (v16u8) src_zero1);
1707 diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);
1709 offset_mask0 = (v8i16) (__msa_hadd_u_h(diff_minus10, diff_minus10) + 2);
1710 offset_mask1 = (v8i16) (__msa_hadd_u_h(diff_minus11, diff_minus11) + 2);
1712 offset = __msa_pckev_b((v16i8) offset_mask1, (v16i8) offset_mask0);
1713 dst0 = __msa_pckev_b((v16i8) src_zero1, (v16i8) src_zero0);
1715 VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset, offset,
1716 offset, offset, offset);
1718 dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
1719 dst0 = __msa_adds_s_b(dst0, offset);
1720 dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
1722 src_minus10 = src10;
1723 src_minus11 = src11;
1726 LD_UB2(src + src_stride, src_stride, src10, src11);
1728 dst_val0 = __msa_copy_u_d((v2i64) dst0, 0);
1729 dst_val1 = __msa_copy_u_d((v2i64) dst0, 1);
1736 src_minus10 = (v16u8) __msa_ilvr_b((v16i8) src10, (v16i8) src_minus10);
1737 src_zero0 = __msa_ilvr_b((v16i8) src_minus11, (v16i8) src_minus11);
1738 src_minus11 = (v16u8) __msa_ilvr_b((v16i8) src11, (v16i8) src_minus11);
1739 src_zero1 = __msa_ilvr_b((v16i8) src10, (v16i8) src10);
1741 cmp_minus10 = ((v16u8) src_zero0 == src_minus10);
1742 diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
1743 cmp_minus10 = (src_minus10 < (v16u8) src_zero0);
1744 diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);
1746 cmp_minus11 = ((v16u8) src_zero1 == src_minus11);
1747 diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
1748 cmp_minus11 = (src_minus11 < (v16u8) src_zero1);
1749 diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);
1751 offset_mask0 = (v8i16) (__msa_hadd_u_h(diff_minus10, diff_minus10) + 2);
1752 offset_mask1 = (v8i16) (__msa_hadd_u_h(diff_minus11, diff_minus11) + 2);
1754 offset = __msa_pckev_b((v16i8) offset_mask1, (v16i8) offset_mask0);
1755 dst0 = __msa_pckev_b((v16i8) src_zero1, (v16i8) src_zero0);
1757 VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset, offset, offset,
1760 dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
1761 dst0 = __msa_adds_s_b(dst0, offset);
1762 dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
1764 dst_val0 = __msa_copy_u_d((v2i64) dst0, 0);
1765 dst_val1 = __msa_copy_u_d((v2i64) dst0, 1);
1783 v16i8 edge_idx = { 1, 2, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
1784 v16u8 const1 = (v16u8) __msa_ldi_b(1);
1785 v16u8 cmp_minus10, cmp_plus10, diff_minus10, diff_plus10, cmp_minus11;
1786 v16u8 cmp_plus11, diff_minus11, diff_plus11, cmp_minus12, cmp_plus12;
1787 v16u8 diff_minus12, diff_plus12, cmp_minus13, cmp_plus13, diff_minus13;
1789 v16u8 src10, src_minus10, dst0, src11, src_minus11, dst1;
1790 v16u8 src12, dst2, src13, dst3;
1791 v16i8 offset_mask0, offset_mask1, offset_mask2, offset_mask3, sao_offset;
sao_offset = LD_SB(sao_offset_val);
1794 sao_offset = __msa_pckev_b(sao_offset, sao_offset);
for (v_cnt = 0; v_cnt < width; v_cnt += 16) {
1797 src = src_orig + v_cnt;
1798 dst = dst_orig + v_cnt;
1800 LD_UB2(src - src_stride, src_stride, src_minus10, src_minus11);
1802 for (h_cnt = (height >> 2); h_cnt--;) {
1803 LD_UB4(src + src_stride, src_stride, src10, src11, src12, src13);
1805 cmp_minus10 = (src_minus11 == src_minus10);
1806 cmp_plus10 = (src_minus11 == src10);
1807 cmp_minus11 = (src10 == src_minus11);
1808 cmp_plus11 = (src10 == src11);
1809 cmp_minus12 = (src11 == src10);
1810 cmp_plus12 = (src11 == src12);
1811 cmp_minus13 = (src12 == src11);
1812 cmp_plus13 = (src12 == src13);
1814 diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
1815 diff_plus10 = __msa_nor_v(cmp_plus10, cmp_plus10);
1816 diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
1817 diff_plus11 = __msa_nor_v(cmp_plus11, cmp_plus11);
1818 diff_minus12 = __msa_nor_v(cmp_minus12, cmp_minus12);
1819 diff_plus12 = __msa_nor_v(cmp_plus12, cmp_plus12);
1820 diff_minus13 = __msa_nor_v(cmp_minus13, cmp_minus13);
1821 diff_plus13 = __msa_nor_v(cmp_plus13, cmp_plus13);
1823 cmp_minus10 = (src_minus10 < src_minus11);
1824 cmp_plus10 = (src10 < src_minus11);
1825 cmp_minus11 = (src_minus11 < src10);
1826 cmp_plus11 = (src11 < src10);
1827 cmp_minus12 = (src10 < src11);
1828 cmp_plus12 = (src12 < src11);
1829 cmp_minus13 = (src11 < src12);
1830 cmp_plus13 = (src13 < src12);
1832 diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);
1833 diff_plus10 = __msa_bmnz_v(diff_plus10, const1, cmp_plus10);
1834 diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);
1835 diff_plus11 = __msa_bmnz_v(diff_plus11, const1, cmp_plus11);
1836 diff_minus12 = __msa_bmnz_v(diff_minus12, const1, cmp_minus12);
1837 diff_plus12 = __msa_bmnz_v(diff_plus12, const1, cmp_plus12);
1838 diff_minus13 = __msa_bmnz_v(diff_minus13, const1, cmp_minus13);
1839 diff_plus13 = __msa_bmnz_v(diff_plus13, const1, cmp_plus13);
1841 offset_mask0 = 2 + (v16i8) diff_minus10 + (v16i8) diff_plus10;
1842 VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset,
1843 offset_mask0, offset_mask0, offset_mask0, offset_mask0);
1844 offset_mask1 = 2 + (v16i8) diff_minus11 + (v16i8) diff_plus11;
1845 VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset,
1846 offset_mask1, offset_mask1, offset_mask1, offset_mask1);
1847 offset_mask2 = 2 + (v16i8) diff_minus12 + (v16i8) diff_plus12;
1848 VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset,
1849 offset_mask2, offset_mask2, offset_mask2, offset_mask2);
1850 offset_mask3 = 2 + (v16i8) diff_minus13 + (v16i8) diff_plus13;
1851 VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset,
1852 offset_mask3, offset_mask3, offset_mask3, offset_mask3);
1854 src_minus10 = src12;
1857 dst0 = (v16u8) __msa_adds_s_b((v16i8) src_minus11, offset_mask0);
1858 dst1 = (v16u8) __msa_adds_s_b((v16i8) src10, offset_mask1);
1859 dst2 = (v16u8) __msa_adds_s_b((v16i8) src11, offset_mask2);
1860 dst3 = (v16u8) __msa_adds_s_b((v16i8) src12, offset_mask3);
1863 src_minus11 = src13;
1865 ST_UB4(dst0, dst1, dst2, dst3, dst, dst_stride);
1867 src += (src_stride << 2);
1868 dst += (dst_stride << 2);
1877 int16_t *sao_offset_val,
1881 uint32_t dst_val0, dst_val1;
1882 v16i8 edge_idx = { 1, 2, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
1883 v16u8 const1 = (v16u8) __msa_ldi_b(1);
1885 v16u8 cmp_minus10, diff_minus10, src_minus10, cmp_minus11, diff_minus11;
1886 v16u8 src_minus11, src10, src11;
1887 v16i8 src_plus0, src_zero0, src_plus1, src_zero1, dst0;
1888 v8i16 offset_mask0, offset_mask1;
1890 sao_offset = __msa_pckev_b(sao_offset, sao_offset);
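/* Edge offset, diagonal variant: the neighbour samples come from the rows
 * above and below combined with the 1-byte sldi shifts, so each pixel is
 * compared against its two diagonal neighbours; the sign, edge-class and
 * offset-gather steps match the 0- and 90-degree filters. */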
1895 LD_UB2(src_orig - src_stride, src_stride, src_minus10, src_minus11);
1896 LD_UB2(src_orig + src_stride, src_stride, src10, src11);
for (height -= 2; height; height -= 2) {
1899 src_orig += (src_stride << 1);
1901 SLDI_B2_0_SB(src_minus11, src10, src_zero0, src_zero1, 1);
1904 ILVR_B2_UB(src_plus0, src_minus10, src_plus1, src_minus11, src_minus10,
1906 ILVR_B2_SB(src_zero0, src_zero0, src_zero1, src_zero1, src_zero0,
1909 cmp_minus10 = ((v16u8) src_zero0 == src_minus10);
1910 diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
1911 cmp_minus10 = (src_minus10 < (v16u8) src_zero0);
1912 diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);
1914 cmp_minus11 = ((v16u8) src_zero1 == src_minus11);
1915 diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
1916 cmp_minus11 = (src_minus11 < (v16u8) src_zero1);
1917 diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);
1919 offset_mask0 = (v8i16) (__msa_hadd_u_h(diff_minus10, diff_minus10) + 2);
1920 offset_mask1 = (v8i16) (__msa_hadd_u_h(diff_minus11, diff_minus11) + 2);
1922 offset = __msa_pckev_b((v16i8) offset_mask1, (v16i8) offset_mask0);
1923 dst0 = __msa_pckev_b((v16i8) src_zero1, (v16i8) src_zero0);
1925 VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset, offset,
1926 offset, offset, offset);
1928 dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
1929 dst0 = __msa_adds_s_b(dst0, offset);
1930 dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
1932 src_minus10 = src10;
1933 src_minus11 = src11;
1936 LD_UB2(src_orig + src_stride, src_stride, src10, src11);
1938 dst_val0 = __msa_copy_u_w((v4i32) dst0, 0);
1939 dst_val1 = __msa_copy_u_w((v4i32) dst0, 2);
1947 SLDI_B2_0_SB(src_minus11, src10, src_zero0, src_zero1, 1);
1950 ILVR_B2_UB(src_plus0, src_minus10, src_plus1, src_minus11, src_minus10,
1952 ILVR_B2_SB(src_zero0, src_zero0, src_zero1, src_zero1, src_zero0,
1955 cmp_minus10 = ((v16u8) src_zero0 == src_minus10);
1956 diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
1957 cmp_minus10 = (src_minus10 < (v16u8) src_zero0);
1958 diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);
1960 cmp_minus11 = ((v16u8) src_zero1 == src_minus11);
1961 diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
1962 cmp_minus11 = (src_minus11 < (v16u8) src_zero1);
1963 diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);
1965 offset_mask0 = (v8i16) (__msa_hadd_u_h(diff_minus10, diff_minus10) + 2);
1966 offset_mask1 = (v8i16) (__msa_hadd_u_h(diff_minus11, diff_minus11) + 2);
1968 offset = __msa_pckev_b((v16i8) offset_mask1, (v16i8) offset_mask0);
1969 dst0 = __msa_pckev_b((v16i8) src_zero1, (v16i8) src_zero0);
1971 VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset, offset, offset,
1974 dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
1975 dst0 = __msa_adds_s_b(dst0, offset);
1976 dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
1978 dst_val0 = __msa_copy_u_w((v4i32) dst0, 0);
1979 dst_val1 = __msa_copy_u_w((v4i32) dst0, 2);
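/*
 * The NOR / compare / BMNZ sequence above builds, per byte and without
 * branches, the value sign(center - neighbour): the negated equality mask is
 * -1 wherever the pixels differ, and BMNZ then overwrites it with +1 wherever
 * the neighbour is smaller.  Because the 4-wide kernels interleave the minus
 * and plus neighbours, __msa_hadd_u_h sums the two signs of each pixel into
 * one halfword, and the +2 bias turns that into the 0..4 edge index (only the
 * low byte survives the following pack).  A scalar model of one such byte,
 * with an illustrative helper name:
 */
#include <stdint.h>

static int8_t sign_from_masks(uint8_t center, uint8_t neighbor)
{
    uint8_t ne = (uint8_t) (0xFFu * (center != neighbor)); /* ~(center == neighbour) */
    uint8_t lt = (uint8_t) (0xFFu * (neighbor < center));  /* neighbour < centre     */

    /* bmnz: take the constant 1 where lt is set, keep ne elsewhere */
    return (int8_t) ((ne & (uint8_t) ~lt) | (1u & lt));
}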
1989 int16_t *sao_offset_val,
1993 uint64_t dst_val0, dst_val1;
1994 v16i8 edge_idx = { 1, 2, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
1995 v16u8 const1 = (v16u8) __msa_ldi_b(1);
1997 v16u8 cmp_minus10, diff_minus10, cmp_minus11, diff_minus11;
1998 v16u8 src_minus10, src10, src_minus11, src11;
1999 v16i8 src_zero0, src_plus10, src_zero1, src_plus11, dst0;
2000 v8i16 offset_mask0, offset_mask1;
2002 sao_offset = __msa_pckev_b(sao_offset, sao_offset);
2006 LD_UB2(src_orig - src_stride, src_stride, src_minus10, src_minus11);
2007 LD_UB2(src_orig + src_stride, src_stride, src10, src11);
2009 for (height -= 2; height; height -= 2) {
2010 src_orig += (src_stride << 1);
2012 SLDI_B2_0_SB(src_minus11, src10, src_zero0, src_zero1, 1);
2015 ILVR_B2_UB(src_plus10, src_minus10, src_plus11, src_minus11,
2016 src_minus10, src_minus11);
2017 ILVR_B2_SB(src_zero0, src_zero0, src_zero1, src_zero1,
2018 src_zero0, src_zero1);
2020 cmp_minus10 = ((v16u8) src_zero0 == src_minus10);
2021 diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
2022 cmp_minus10 = (src_minus10 < (v16u8) src_zero0);
2023 diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);
2025 cmp_minus11 = ((v16u8) src_zero1 == src_minus11);
2026 diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
2027 cmp_minus11 = (src_minus11 < (v16u8) src_zero1);
2028 diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);
2030 offset_mask0 = (v8i16) (__msa_hadd_u_h(diff_minus10, diff_minus10) + 2);
2031 offset_mask1 = (v8i16) (__msa_hadd_u_h(diff_minus11, diff_minus11) + 2);
2033 offset = __msa_pckev_b((v16i8) offset_mask1, (v16i8) offset_mask0);
2034 dst0 = __msa_pckev_b((v16i8) src_zero1, (v16i8) src_zero0);
2036 VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset, offset, offset,
2039 dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
2040 dst0 = __msa_adds_s_b(dst0, offset);
2041 dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
2043 src_minus10 = src10;
2044 src_minus11 = src11;
2047 LD_UB2(src_orig + src_stride, src_stride, src10, src11);
2049 dst_val0 = __msa_copy_u_d((v2i64) dst0, 0);
2050 dst_val1 = __msa_copy_u_d((v2i64) dst0, 1);
2057 SLDI_B2_0_SB(src_minus11, src10, src_zero0, src_zero1, 1);
2059 ILVR_B2_UB(src_plus10, src_minus10, src_plus11, src_minus11, src_minus10,
2061 ILVR_B2_SB(src_zero0, src_zero0, src_zero1, src_zero1, src_zero0,
2064 cmp_minus10 = ((v16u8) src_zero0 == src_minus10);
2065 diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
2066 cmp_minus10 = (src_minus10 < (v16u8) src_zero0);
2067 diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);
2069 cmp_minus11 = ((v16u8) src_zero1 == src_minus11);
2070 diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
2071 cmp_minus11 = (src_minus11 < (v16u8) src_zero1);
2072 diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);
2074 offset_mask0 = (v8i16) (__msa_hadd_u_h(diff_minus10, diff_minus10) + 2);
2075 offset_mask1 = (v8i16) (__msa_hadd_u_h(diff_minus11, diff_minus11) + 2);
2077 offset = __msa_pckev_b((v16i8) offset_mask1, (v16i8) offset_mask0);
2078 dst0 = __msa_pckev_b((v16i8) src_zero1, (v16i8) src_zero0);
2080 VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset, offset, offset,
2083 dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
2084 dst0 = __msa_adds_s_b(dst0, offset);
2085 dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
2087 src_minus10 = src10;
2088 src_minus11 = src11;
2091 LD_UB2(src_orig + src_stride, src_stride, src10, src11);
2093 dst_val0 = __msa_copy_u_d((v2i64) dst0, 0);
2094 dst_val1 = __msa_copy_u_d((v2i64) dst0, 1);
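/*
 * The XOR-with-128 pair around __msa_adds_s_b is the usual trick for adding
 * signed offsets to unsigned pixels with saturation: biasing by 0x80 maps
 * [0,255] onto [-128,127], the signed saturating add clamps there, and the
 * second XOR removes the bias again.  A scalar equivalent (helper name
 * illustrative):
 */
#include <stdint.h>

static uint8_t add_offset_sat_u8(uint8_t pix, int8_t off)
{
    int sum = (int8_t) (pix ^ 0x80) + off;     /* [0,255] -> [-128,127], add */

    if (sum >  127) sum =  127;                /* adds_s_b saturation        */
    if (sum < -128) sum = -128;

    return (uint8_t) (sum ^ 0x80);             /* undo the bias -> [0,255]   */
}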
2112 v16i8 edge_idx = { 1, 2, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
2113 v16u8 const1 = (v16u8) __msa_ldi_b(1);
2114 v16u8 cmp_minus10, cmp_plus10, diff_minus10, diff_plus10, cmp_minus11;
2115 v16u8 cmp_plus11, diff_minus11, diff_plus11, cmp_minus12, cmp_plus12;
2116 v16u8 diff_minus12, diff_plus12, cmp_minus13, cmp_plus13, diff_minus13;
2117 v16u8 diff_plus13, src_minus14, src_plus13;
2118 v16i8 offset_mask0, offset_mask1, offset_mask2, offset_mask3;
2119 v16u8 src10, src_minus10, dst0, src11, src_minus11, dst1;
2120 v16u8 src12, src_minus12, dst2, src13, src_minus13, dst3;
2121 v16i8 src_zero0, src_plus10, src_zero1, src_plus11, src_zero2, src_plus12;
2122 v16i8 src_zero3, sao_offset;
2124 sao_offset = LD_SB(sao_offset_val);
2125 sao_offset = __msa_pckev_b(sao_offset, sao_offset);
2127 for (; height; height -= 4) {
2130 LD_UB4(src_orig, src_stride, src_minus11, src_minus12, src_minus13,
2133 for (v_cnt = 0; v_cnt < width; v_cnt += 16) {
2134 src_minus10 = LD_UB(src_orig - src_stride);
2135 LD_UB4(src_orig + 16, src_stride, src10, src11, src12, src13);
2136 src_plus13 = LD_UB(src + 1 + v_cnt + (src_stride << 2));
2139 SLDI_B2_SB(src10, src11, src_minus11, src_minus12, src_zero0,
2141 SLDI_B2_SB(src12, src13, src_minus13, src_minus14, src_zero2,
2143 SLDI_B2_SB(src11, src12, src_minus12, src_minus13, src_plus10,
2146 src_plus12 = __msa_sldi_b((v16i8) src13, (v16i8) src_minus14, 2);
2148 cmp_minus10 = ((v16u8) src_zero0 == src_minus10);
2149 cmp_plus10 = ((v16u8) src_zero0 == (v16u8) src_plus10);
2150 cmp_minus11 = ((v16u8) src_zero1 == src_minus11);
2151 cmp_plus11 = ((v16u8) src_zero1 == (v16u8) src_plus11);
2152 cmp_minus12 = ((v16u8) src_zero2 == src_minus12);
2153 cmp_plus12 = ((v16u8) src_zero2 == (v16u8) src_plus12);
2154 cmp_minus13 = ((v16u8) src_zero3 == src_minus13);
2155 cmp_plus13 = ((v16u8) src_zero3 == src_plus13);
2157 diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
2158 diff_plus10 = __msa_nor_v(cmp_plus10, cmp_plus10);
2159 diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
2160 diff_plus11 = __msa_nor_v(cmp_plus11, cmp_plus11);
2161 diff_minus12 = __msa_nor_v(cmp_minus12, cmp_minus12);
2162 diff_plus12 = __msa_nor_v(cmp_plus12, cmp_plus12);
2163 diff_minus13 = __msa_nor_v(cmp_minus13, cmp_minus13);
2164 diff_plus13 = __msa_nor_v(cmp_plus13, cmp_plus13);
2166 cmp_minus10 = (src_minus10 < (v16u8) src_zero0);
2167 cmp_plus10 = ((v16u8) src_plus10 < (v16u8) src_zero0);
2168 cmp_minus11 = (src_minus11 < (v16u8) src_zero1);
2169 cmp_plus11 = ((v16u8) src_plus11 < (v16u8) src_zero1);
2170 cmp_minus12 = (src_minus12 < (v16u8) src_zero2);
2171 cmp_plus12 = ((v16u8) src_plus12 < (v16u8) src_zero2);
2172 cmp_minus13 = (src_minus13 < (v16u8) src_zero3);
2173 cmp_plus13 = (src_plus13 < (v16u8) src_zero3);
2175 diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);
2176 diff_plus10 = __msa_bmnz_v(diff_plus10, const1, cmp_plus10);
2177 diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);
2178 diff_plus11 = __msa_bmnz_v(diff_plus11, const1, cmp_plus11);
2179 diff_minus12 = __msa_bmnz_v(diff_minus12, const1, cmp_minus12);
2180 diff_plus12 = __msa_bmnz_v(diff_plus12, const1, cmp_plus12);
2181 diff_minus13 = __msa_bmnz_v(diff_minus13, const1, cmp_minus13);
2182 diff_plus13 = __msa_bmnz_v(diff_plus13, const1, cmp_plus13);
2184 offset_mask0 = 2 + (v16i8) diff_minus10 + (v16i8) diff_plus10;
2185 offset_mask1 = 2 + (v16i8) diff_minus11 + (v16i8) diff_plus11;
2186 offset_mask2 = 2 + (v16i8) diff_minus12 + (v16i8) diff_plus12;
2187 offset_mask3 = 2 + (v16i8) diff_minus13 + (v16i8) diff_plus13;
2189 VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset,
2190 offset_mask0, offset_mask0, offset_mask0, offset_mask0);
2191 VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset,
2192 offset_mask1, offset_mask1, offset_mask1, offset_mask1);
2193 VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset,
2194 offset_mask2, offset_mask2, offset_mask2, offset_mask2);
2195 VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset,
2196 offset_mask3, offset_mask3, offset_mask3, offset_mask3);
2200 dst0 = (v16u8) __msa_adds_s_b((v16i8) src_zero0, offset_mask0);
2201 dst1 = (v16u8) __msa_adds_s_b((v16i8) src_zero1, offset_mask1);
2202 dst2 = (v16u8) __msa_adds_s_b((v16i8) src_zero2, offset_mask2);
2203 dst3 = (v16u8) __msa_adds_s_b((v16i8) src_zero3, offset_mask3);
2207 src_minus11 = src10;
2208 src_minus12 = src11;
2209 src_minus13 = src12;
2210 src_minus14 = src13;
2212 ST_UB4(dst0, dst1, dst2, dst3, dst_orig, dst_stride);
2216 src += (src_stride << 2);
2217 dst += (dst_stride << 2);
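/*
 * A scalar model of the byte-slide used throughout the 16-multiple kernels,
 * assuming the operand order of the SLDI_B2 macros in this file: the result
 * is a 16-byte window taken at offset n from the 32-byte run formed by the
 * second operand followed by the first.  Sliding a row that was loaded at
 * x-1 by one byte yields the centre pixels, sliding by two yields the x+1
 * neighbours.
 */
#include <stdint.h>

static void sldi_b_model(uint8_t out[16], const uint8_t hi[16],
                         const uint8_t lo[16], int n)
{
    for (int i = 0; i < 16; i++)
        out[i] = (i + n < 16) ? lo[i + n] : hi[i + n - 16];
}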
2225 int16_t *sao_offset_val,
2229 uint32_t dst_val0, dst_val1;
2230 v16i8 edge_idx = { 1, 2, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
2231 v16u8 const1 = (v16u8) __msa_ldi_b(1);
2233 v16i8 src_zero0, src_zero1, dst0;
2234 v16u8 cmp_minus10, diff_minus10, cmp_minus11, diff_minus11;
2235 v16u8 src_minus10, src10, src_minus11, src11;
2236 v8i16 offset_mask0, offset_mask1;
2238 sao_offset = __msa_pckev_b(sao_offset, sao_offset);
2242 LD_UB2(src_orig - src_stride, src_stride, src_minus10, src_minus11);
2243 LD_UB2(src_orig + src_stride, src_stride, src10, src11);
2245 for (height -= 2; height; height -= 2) {
2246 src_orig += (src_stride << 1);
2248 SLDI_B2_0_SB(src_minus11, src10, src_zero0, src_zero1, 1);
2249 SLDI_B2_0_UB(src_minus10, src_minus11, src_minus10, src_minus11, 2);
2251 ILVR_B2_UB(src10, src_minus10, src11, src_minus11, src_minus10,
2253 ILVR_B2_SB(src_zero0, src_zero0, src_zero1, src_zero1, src_zero0,
2256 cmp_minus10 = ((v16u8) src_zero0 == src_minus10);
2257 diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
2258 cmp_minus10 = (src_minus10 < (v16u8) src_zero0);
2259 diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);
2261 cmp_minus11 = ((v16u8) src_zero1 == src_minus11);
2262 diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
2263 cmp_minus11 = (src_minus11 < (v16u8) src_zero1);
2264 diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);
2266 offset_mask0 = (v8i16) (__msa_hadd_u_h(diff_minus10, diff_minus10) + 2);
2267 offset_mask1 = (v8i16) (__msa_hadd_u_h(diff_minus11, diff_minus11) + 2);
2269 offset = __msa_pckev_b((v16i8) offset_mask1, (v16i8) offset_mask0);
2270 dst0 = __msa_pckev_b((v16i8) src_zero1, (v16i8) src_zero0);
2272 VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset, offset, offset,
2275 dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
2276 dst0 = __msa_adds_s_b(dst0, offset);
2277 dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
2279 src_minus10 = src10;
2280 src_minus11 = src11;
2283 LD_UB2(src_orig + src_stride, src_stride, src10, src11);
2285 dst_val0 = __msa_copy_u_w((v4i32) dst0, 0);
2286 dst_val1 = __msa_copy_u_w((v4i32) dst0, 2);
2295 SLDI_B2_0_SB(src_minus11, src10, src_zero0, src_zero1, 1);
2296 SLDI_B2_0_UB(src_minus10, src_minus11, src_minus10, src_minus11, 2);
2298 ILVR_B2_UB(src10, src_minus10, src11, src_minus11, src_minus10,
2300 ILVR_B2_SB(src_zero0, src_zero0, src_zero1, src_zero1, src_zero0,
2303 cmp_minus10 = ((v16u8) src_zero0 == src_minus10);
2304 diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
2305 cmp_minus10 = (src_minus10 < (v16u8) src_zero0);
2306 diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);
2308 cmp_minus11 = ((v16u8) src_zero1 == src_minus11);
2309 diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
2310 cmp_minus11 = (src_minus11 < (v16u8) src_zero1);
2311 diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);
2313 offset_mask0 = (v8i16) (__msa_hadd_u_h(diff_minus10, diff_minus10) + 2);
2314 offset_mask1 = (v8i16) (__msa_hadd_u_h(diff_minus11, diff_minus11) + 2);
2316 offset = __msa_pckev_b((v16i8) offset_mask1, (v16i8) offset_mask0);
2317 dst0 = __msa_pckev_b((v16i8) src_zero1, (v16i8) src_zero0);
2319 VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset, offset, offset,
2322 dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
2323 dst0 = __msa_adds_s_b(dst0, offset);
2324 dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
2326 dst_val0 = __msa_copy_u_w((v4i32) dst0, 0);
2327 dst_val1 = __msa_copy_u_w((v4i32) dst0, 2);
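/*
 * In the 135 degree kernels the rows above the centre are slid by two bytes
 * and the rows below are used unshifted, so the neighbours are one pixel up
 * and to the right and one pixel down and to the left.  A plain-C reference
 * for one row (helper name illustrative; the caller must provide one pixel
 * of valid padding on each side of the rows):
 */
#include <stdint.h>

static void sao_edge_135_row_ref(uint8_t *dst, const uint8_t *src,
                                 const uint8_t *src_up, const uint8_t *src_down,
                                 const int16_t offset[5], int width)
{
    static const uint8_t idx_map[5] = { 1, 2, 0, 3, 4 };

    for (int x = 0; x < width; x++) {
        int a = src_up[x + 1];      /* up-right neighbour  */
        int b = src_down[x - 1];    /* down-left neighbour */
        int c = src[x];
        int edge = 2 + ((c > a) - (c < a)) + ((c > b) - (c < b));
        int val  = c + offset[idx_map[edge]];

        dst[x] = val < 0 ? 0 : val > 255 ? 255 : val;
    }
}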
2339 int16_t *sao_offset_val,
2343 uint64_t dst_val0, dst_val1;
2344 v16i8 edge_idx = { 1, 2, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
2345 v16u8 const1 = (v16u8) __msa_ldi_b(1);
2347 v16u8 cmp_minus10, diff_minus10, cmp_minus11, diff_minus11;
2348 v16u8 src_minus10, src10, src_minus11, src11;
2349 v16i8 src_zero0, src_zero1, dst0;
2350 v8i16 offset_mask0, offset_mask1;
2352 sao_offset = __msa_pckev_b(sao_offset, sao_offset);
2356 LD_UB2(src_orig - src_stride, src_stride, src_minus10, src_minus11);
2357 LD_UB2(src_orig + src_stride, src_stride, src10, src11);
2359 for (height -= 2; height; height -= 2) {
2360 src_orig += (src_stride << 1);
2362 SLDI_B2_0_SB(src_minus11, src10, src_zero0, src_zero1, 1);
2363 SLDI_B2_0_UB(src_minus10, src_minus11, src_minus10, src_minus11, 2);
2364 ILVR_B2_UB(src10, src_minus10, src11, src_minus11, src_minus10,
2366 ILVR_B2_SB(src_zero0, src_zero0, src_zero1, src_zero1, src_zero0,
2369 cmp_minus10 = ((v16u8) src_zero0 == src_minus10);
2370 diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
2371 cmp_minus10 = (src_minus10 < (v16u8) src_zero0);
2372 diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);
2374 cmp_minus11 = ((v16u8) src_zero1 == src_minus11);
2375 diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
2376 cmp_minus11 = (src_minus11 < (v16u8) src_zero1);
2377 diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);
2379 offset_mask0 = (v8i16) (__msa_hadd_u_h(diff_minus10, diff_minus10) + 2);
2380 offset_mask1 = (v8i16) (__msa_hadd_u_h(diff_minus11, diff_minus11) + 2);
2382 offset = __msa_pckev_b((v16i8) offset_mask1, (v16i8) offset_mask0);
2383 dst0 = __msa_pckev_b((v16i8) src_zero1, (v16i8) src_zero0);
2385 VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset, offset, offset,
2388 dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
2389 dst0 = __msa_adds_s_b(dst0, offset);
2390 dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
2392 src_minus10 = src10;
2393 src_minus11 = src11;
2396 LD_UB2(src_orig + src_stride, src_stride, src10, src11);
2398 dst_val0 = __msa_copy_u_d((v2i64) dst0, 0);
2399 dst_val1 = __msa_copy_u_d((v2i64) dst0, 1);
2407 SLDI_B2_0_SB(src_minus11, src10, src_zero0, src_zero1, 1);
2408 SLDI_B2_0_UB(src_minus10, src_minus11, src_minus10, src_minus11, 2);
2409 ILVR_B2_UB(src10, src_minus10, src11, src_minus11, src_minus10,
2411 ILVR_B2_SB(src_zero0, src_zero0, src_zero1, src_zero1, src_zero0,
2414 cmp_minus10 = ((v16u8) src_zero0 == src_minus10);
2415 diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
2416 cmp_minus10 = (src_minus10 < (v16u8) src_zero0);
2417 diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);
2419 cmp_minus11 = ((v16u8) src_zero1 == src_minus11);
2420 diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
2421 cmp_minus11 = (src_minus11 < (v16u8) src_zero1);
2422 diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);
2424 offset_mask0 = (v8i16) (__msa_hadd_u_h(diff_minus10, diff_minus10) + 2);
2425 offset_mask1 = (v8i16) (__msa_hadd_u_h(diff_minus11, diff_minus11) + 2);
2427 offset = __msa_pckev_b((v16i8) offset_mask1, (v16i8) offset_mask0);
2428 dst0 = __msa_pckev_b((v16i8) src_zero1, (v16i8) src_zero0);
2430 VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset, offset, offset,
2433 dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
2434 dst0 = __msa_adds_s_b(dst0, offset);
2435 dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
2437 dst_val0 = __msa_copy_u_d((v2i64) dst0, 0);
2438 dst_val1 = __msa_copy_u_d((v2i64) dst0, 1);
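/*
 * The 8-wide kernels pack two filtered rows into one vector and extract them
 * as two 64-bit words with __msa_copy_u_d; the elided store lines presumably
 * write those words out one row at a time.  A portable sketch of that final
 * step (helper name illustrative, assuming little-endian lane order):
 */
#include <stddef.h>
#include <stdint.h>
#include <string.h>

static void store_two_rows_w8(uint8_t *dst, ptrdiff_t dst_stride,
                              uint64_t row0, uint64_t row1)
{
    memcpy(dst, &row0, 8);               /* row from __msa_copy_u_d(dst0, 0) */
    memcpy(dst + dst_stride, &row1, 8);  /* row from __msa_copy_u_d(dst0, 1) */
}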
2457 v16i8 edge_idx = { 1, 2, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
2458 v16u8 const1 = (v16u8) __msa_ldi_b(1);
2459 v16u8 dst0, dst1, dst2, dst3;
2460 v16u8 cmp_minus10, cmp_minus11, cmp_minus12, cmp_minus13, cmp_plus10;
2461 v16u8 cmp_plus11, cmp_plus12, cmp_plus13, diff_minus10, diff_minus11;
2462 v16u8 diff_minus12, diff_minus13, diff_plus10, diff_plus11, diff_plus12;
2463 v16u8 diff_plus13, src10, src11, src12, src13, src_minus10, src_minus11;
2464 v16u8 src_plus10, src_plus11, src_plus12, src_plus13;
2465 v16i8 src_minus12, src_minus13, src_zero0, src_zero1, src_zero2, src_zero3;
2466 v16i8 offset_mask0, offset_mask1, offset_mask2, offset_mask3, sao_offset;
2468 sao_offset = LD_SB(sao_offset_val);
2469 sao_offset = __msa_pckev_b(sao_offset, sao_offset);
2471 for (; height; height -= 4) {
2475 LD_UB4(src_orig, src_stride, src_minus11, src_plus10, src_plus11,
2478 for (v_cnt = 0; v_cnt < width; v_cnt += 16) {
2479 src_minus10 = LD_UB(src_orig + 2 - src_stride);
2480 LD_UB4(src_orig + 16, src_stride, src10, src11, src12, src13);
2481 src_plus13 = LD_UB(src_orig + (src_stride << 2));
2484 src_zero0 = __msa_sldi_b((v16i8) src10, (v16i8) src_minus11, 1);
2485 cmp_minus10 = ((v16u8) src_zero0 == src_minus10);
2486 cmp_plus10 = ((v16u8) src_zero0 == src_plus10);
2488 src_zero1 = __msa_sldi_b((v16i8) src11, (v16i8) src_plus10, 1);
2489 src_minus11 = (v16u8) __msa_sldi_b((v16i8) src10,
2490 (v16i8) src_minus11, 2);
2491 cmp_minus11 = ((v16u8) src_zero1 == src_minus11);
2492 cmp_plus11 = ((v16u8) src_zero1 == src_plus11);
2494 src_zero2 = __msa_sldi_b((v16i8) src12, (v16i8) src_plus11, 1);
2495 src_minus12 = __msa_sldi_b((v16i8) src11, (v16i8) src_plus10, 2);
2496 cmp_minus12 = ((v16u8) src_zero2 == (v16u8) src_minus12);
2497 cmp_plus12 = ((v16u8) src_zero2 == src_plus12);
2499 src_zero3 = __msa_sldi_b((v16i8) src13, (v16i8) src_plus12, 1);
2500 src_minus13 = __msa_sldi_b((v16i8) src12, (v16i8) src_plus11, 2);
2501 cmp_minus13 = ((v16u8) src_zero3 == (v16u8) src_minus13);
2502 cmp_plus13 = ((v16u8) src_zero3 == src_plus13);
2504 diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
2505 diff_plus10 = __msa_nor_v(cmp_plus10, cmp_plus10);
2506 diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
2507 diff_plus11 = __msa_nor_v(cmp_plus11, cmp_plus11);
2508 diff_minus12 = __msa_nor_v(cmp_minus12, cmp_minus12);
2509 diff_plus12 = __msa_nor_v(cmp_plus12, cmp_plus12);
2510 diff_minus13 = __msa_nor_v(cmp_minus13, cmp_minus13);
2511 diff_plus13 = __msa_nor_v(cmp_plus13, cmp_plus13);
2513 cmp_minus10 = (src_minus10 < (v16u8) src_zero0);
2514 cmp_plus10 = (src_plus10 < (v16u8) src_zero0);
2515 cmp_minus11 = (src_minus11 < (v16u8) src_zero1);
2516 cmp_plus11 = (src_plus11 < (v16u8) src_zero1);
2517 cmp_minus12 = ((v16u8) src_minus12 < (v16u8) src_zero2);
2518 cmp_plus12 = (src_plus12 < (v16u8) src_zero2);
2519 cmp_minus13 = ((v16u8) src_minus13 < (v16u8) src_zero3);
2520 cmp_plus13 = (src_plus13 < (v16u8) src_zero3);
2522 diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);
2523 diff_plus10 = __msa_bmnz_v(diff_plus10, const1, cmp_plus10);
2524 diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);
2525 diff_plus11 = __msa_bmnz_v(diff_plus11, const1, cmp_plus11);
2526 diff_minus12 = __msa_bmnz_v(diff_minus12, const1, cmp_minus12);
2527 diff_plus12 = __msa_bmnz_v(diff_plus12, const1, cmp_plus12);
2528 diff_minus13 = __msa_bmnz_v(diff_minus13, const1, cmp_minus13);
2529 diff_plus13 = __msa_bmnz_v(diff_plus13, const1, cmp_plus13);
2531 offset_mask0 = 2 + (v16i8) diff_minus10 + (v16i8) diff_plus10;
2532 offset_mask1 = 2 + (v16i8) diff_minus11 + (v16i8) diff_plus11;
2533 offset_mask2 = 2 + (v16i8) diff_minus12 + (v16i8) diff_plus12;
2534 offset_mask3 = 2 + (v16i8) diff_minus13 + (v16i8) diff_plus13;
2536 VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset,
2537 offset_mask0, offset_mask0, offset_mask0, offset_mask0);
2538 VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset,
2539 offset_mask1, offset_mask1, offset_mask1, offset_mask1);
2540 VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset,
2541 offset_mask2, offset_mask2, offset_mask2, offset_mask2);
2542 VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset,
2543 offset_mask3, offset_mask3, offset_mask3, offset_mask3);
2547 dst0 = (v16u8) __msa_adds_s_b((v16i8) src_zero0, offset_mask0);
2548 dst1 = (v16u8) __msa_adds_s_b((v16i8) src_zero1, offset_mask1);
2549 dst2 = (v16u8) __msa_adds_s_b((v16i8) src_zero2, offset_mask2);
2550 dst3 = (v16u8) __msa_adds_s_b((v16i8) src_zero3, offset_mask3);
2554 src_minus11 = src10;
2559 ST_UB4(dst0, dst1, dst2, dst3, dst_orig, dst_stride);
2563 src += (src_stride << 2);
2564 dst += (dst_stride << 2);
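/*
 * Blocking used by the 16-pixel-multiple kernels above: four rows per outer
 * iteration, sixteen columns per inner iteration.  The sketch below only
 * shows that loop structure; process_16x4 stands in for the vector body and
 * is an illustrative callback, not a function from this file.
 */
#include <stddef.h>
#include <stdint.h>

typedef void (*block_fn)(uint8_t *dst, ptrdiff_t dst_stride,
                         const uint8_t *src, ptrdiff_t src_stride);

static void sao_edge_16mult_skeleton(uint8_t *dst, ptrdiff_t dst_stride,
                                     const uint8_t *src, ptrdiff_t src_stride,
                                     int width, int height, block_fn process_16x4)
{
    for (; height; height -= 4) {
        const uint8_t *src_orig = src;
        uint8_t *dst_orig = dst;

        for (int v_cnt = 0; v_cnt < width; v_cnt += 16) {
            process_16x4(dst_orig, dst_stride, src_orig, src_stride);
            src_orig += 16;
            dst_orig += 16;
        }
        src += src_stride << 2;    /* advance four rows */
        dst += dst_stride << 2;
    }
}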
2569 ptrdiff_t src_stride,
2577 ptrdiff_t src_stride,
2585 ptrdiff_t src_stride,
2593 ptrdiff_t src_stride,
2601 ptrdiff_t stride_dst, ptrdiff_t stride_src,
2602 int16_t *sao_offset_val, int sao_left_class,
2607 sao_left_class, sao_offset_val,
2608 width - (width % 16), height);
2609 dst += width - (width % 16);
2610 src += width - (width % 16);
2616 sao_left_class, sao_offset_val, height);
2624 sao_left_class, sao_offset_val, height);
2629 ptrdiff_t stride_dst,
2630 int16_t *sao_offset_val,
2633 ptrdiff_t stride_src = (2 * 64 + 32) / sizeof(uint8_t);
2641 width - (width % 16),
2643 dst += width - (width % 16);
2644 src += width - (width % 16);
2651 sao_offset_val, height);
2660 sao_offset_val, height);
2669 width - (width % 16),
2671 dst += width - (width % 16);
2672 src += width - (width % 16);
2679 sao_offset_val, height);
2688 sao_offset_val, height);
2697 width - (width % 16),
2699 dst += width - (width % 16);
2700 src += width - (width % 16);
2707 sao_offset_val, height);
2716 sao_offset_val, height);
2725 width - (width % 16),
2727 dst += width - (width % 16);
2728 src += width - (width % 16);
2735 sao_offset_val, height);
2744 sao_offset_val, height);
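/*
 * ff_hevc_sao_edge_filter_8_msa reads from the temporary CTB copy that the
 * HEVC SAO code keeps at a fixed stride of (2 * 64 + 32) bytes, selects the
 * neighbour direction from 'eo', and splits the width the same way as the
 * band-filter wrapper: a 16-pixel-multiple bulk followed by an 8- or 4-pixel
 * tail.  The table below is only an illustrative summary of the standard
 * edge-offset classes, i.e. the (dx, dy) neighbour pairs that the
 * per-direction kernels hard-code.
 */
#include <stdint.h>

typedef struct { int8_t a_dx, a_dy, b_dx, b_dy; } SAOEdgeDir;

static const SAOEdgeDir sao_eo_dir[4] = {
    { -1,  0,  1,  0 },   /* class 0: horizontal (left / right)        */
    {  0, -1,  0,  1 },   /* class 1: vertical (above / below)         */
    { -1, -1,  1,  1 },   /* class 2: top-left / bottom-right diagonal */
    {  1, -1, -1,  1 },   /* class 3: top-right / bottom-left diagonal */
};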