FFmpeg  4.0
vp9_intra_msa.c
Go to the documentation of this file.
/*
 * Copyright (c) 2015 Shivraj Patil (Shivraj.Patil@imgtec.com)
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
20 
21 #include "libavcodec/vp9dsp.h"
23 #include "vp9dsp_mips.h"
24 
/*
 * In-place subtraction on two halfword vectors using __msa_subs_u_h:
 *   out0 = __msa_subs_u_h(out0, in0)
 *   out1 = __msa_subs_u_h(out1, in1)
 *
 * out0 and out1 are both read and written, so they must be lvalues.
 * The body is wrapped in do { } while (0) so the expansion is exactly one
 * statement and remains valid as the body of an unbraced if/else; callers
 * terminate the invocation with ';' as before.
 */
#define IPRED_SUBS_UH2_UH(in0, in1, out0, out1)       \
do {                                                  \
    (out0) = __msa_subs_u_h((out0), (in0));           \
    (out1) = __msa_subs_u_h((out1), (in1));           \
} while (0)
30 
31 void ff_vert_16x16_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *left,
32  const uint8_t *src)
33 {
34  uint32_t row;
35  v16u8 src0;
36 
37  src0 = LD_UB(src);
38 
39  for (row = 16; row--;) {
40  ST_UB(src0, dst);
41  dst += dst_stride;
42  }
43 }
44 
45 void ff_vert_32x32_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *left,
46  const uint8_t *src)
47 {
48  uint32_t row;
49  v16u8 src1, src2;
50 
51  src1 = LD_UB(src);
52  src2 = LD_UB(src + 16);
53 
54  for (row = 32; row--;) {
55  ST_UB2(src1, src2, dst, 16);
56  dst += dst_stride;
57  }
58 }
59 
60 void ff_hor_16x16_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src,
61  const uint8_t *top)
62 {
63  uint32_t row, inp;
64  v16u8 src0, src1, src2, src3;
65 
66  src += 12;
67  for (row = 4; row--;) {
68  inp = LW(src);
69  src -= 4;
70 
71  src0 = (v16u8) __msa_fill_b(inp >> 24);
72  src1 = (v16u8) __msa_fill_b(inp >> 16);
73  src2 = (v16u8) __msa_fill_b(inp >> 8);
74  src3 = (v16u8) __msa_fill_b(inp);
75 
76  ST_UB4(src0, src1, src2, src3, dst, dst_stride);
77  dst += (4 * dst_stride);
78  }
79 }
80 
81 void ff_hor_32x32_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src,
82  const uint8_t *top)
83 {
84  uint32_t row, inp;
85  v16u8 src0, src1, src2, src3;
86 
87  src += 28;
88  for (row = 8; row--;) {
89  inp = LW(src);
90  src -= 4;
91 
92  src0 = (v16u8) __msa_fill_b(inp >> 24);
93  src1 = (v16u8) __msa_fill_b(inp >> 16);
94  src2 = (v16u8) __msa_fill_b(inp >> 8);
95  src3 = (v16u8) __msa_fill_b(inp);
96 
97  ST_UB2(src0, src0, dst, 16);
98  dst += dst_stride;
99  ST_UB2(src1, src1, dst, 16);
100  dst += dst_stride;
101  ST_UB2(src2, src2, dst, 16);
102  dst += dst_stride;
103  ST_UB2(src3, src3, dst, 16);
104  dst += dst_stride;
105  }
106 }
107 
108 void ff_dc_4x4_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src_left,
109  const uint8_t *src_top)
110 {
111  uint32_t val0, val1;
112  v16i8 store, src = { 0 };
113  v8u16 sum_h;
114  v4u32 sum_w;
115  v2u64 sum_d;
116 
117  val0 = LW(src_top);
118  val1 = LW(src_left);
119  INSERT_W2_SB(val0, val1, src);
120  sum_h = __msa_hadd_u_h((v16u8) src, (v16u8) src);
121  sum_w = __msa_hadd_u_w(sum_h, sum_h);
122  sum_d = __msa_hadd_u_d(sum_w, sum_w);
123  sum_w = (v4u32) __msa_srari_w((v4i32) sum_d, 3);
124  store = __msa_splati_b((v16i8) sum_w, 0);
125  val0 = __msa_copy_u_w((v4i32) store, 0);
126 
127  SW4(val0, val0, val0, val0, dst, dst_stride);
128 }
129 
130 #define INTRA_DC_TL_4x4(dir) \
131 void ff_dc_##dir##_4x4_msa(uint8_t *dst, ptrdiff_t dst_stride, \
132  const uint8_t *left, \
133  const uint8_t *top) \
134 { \
135  uint32_t val0; \
136  v16i8 store, data = { 0 }; \
137  v8u16 sum_h; \
138  v4u32 sum_w; \
139  \
140  val0 = LW(dir); \
141  data = (v16i8) __msa_insert_w((v4i32) data, 0, val0); \
142  sum_h = __msa_hadd_u_h((v16u8) data, (v16u8) data); \
143  sum_w = __msa_hadd_u_w(sum_h, sum_h); \
144  sum_w = (v4u32) __msa_srari_w((v4i32) sum_w, 2); \
145  store = __msa_splati_b((v16i8) sum_w, 0); \
146  val0 = __msa_copy_u_w((v4i32) store, 0); \
147  \
148  SW4(val0, val0, val0, val0, dst, dst_stride); \
149 }
150 INTRA_DC_TL_4x4(top);
151 INTRA_DC_TL_4x4(left);
152 
153 void ff_dc_8x8_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src_left,
154  const uint8_t *src_top)
155 {
156  uint64_t val0, val1;
157  v16i8 store;
158  v16u8 src = { 0 };
159  v8u16 sum_h;
160  v4u32 sum_w;
161  v2u64 sum_d;
162 
163  val0 = LD(src_top);
164  val1 = LD(src_left);
165  INSERT_D2_UB(val0, val1, src);
166  sum_h = __msa_hadd_u_h(src, src);
167  sum_w = __msa_hadd_u_w(sum_h, sum_h);
168  sum_d = __msa_hadd_u_d(sum_w, sum_w);
169  sum_w = (v4u32) __msa_pckev_w((v4i32) sum_d, (v4i32) sum_d);
170  sum_d = __msa_hadd_u_d(sum_w, sum_w);
171  sum_w = (v4u32) __msa_srari_w((v4i32) sum_d, 4);
172  store = __msa_splati_b((v16i8) sum_w, 0);
173  val0 = __msa_copy_u_d((v2i64) store, 0);
174 
175  SD4(val0, val0, val0, val0, dst, dst_stride);
176  dst += (4 * dst_stride);
177  SD4(val0, val0, val0, val0, dst, dst_stride);
178 }
179 
180 #define INTRA_DC_TL_8x8(dir) \
181 void ff_dc_##dir##_8x8_msa(uint8_t *dst, ptrdiff_t dst_stride, \
182  const uint8_t *left, \
183  const uint8_t *top) \
184 { \
185  uint64_t val0; \
186  v16i8 store; \
187  v16u8 data = { 0 }; \
188  v8u16 sum_h; \
189  v4u32 sum_w; \
190  v2u64 sum_d; \
191  \
192  val0 = LD(dir); \
193  data = (v16u8) __msa_insert_d((v2i64) data, 0, val0); \
194  sum_h = __msa_hadd_u_h(data, data); \
195  sum_w = __msa_hadd_u_w(sum_h, sum_h); \
196  sum_d = __msa_hadd_u_d(sum_w, sum_w); \
197  sum_w = (v4u32) __msa_srari_w((v4i32) sum_d, 3); \
198  store = __msa_splati_b((v16i8) sum_w, 0); \
199  val0 = __msa_copy_u_d((v2i64) store, 0); \
200  \
201  SD4(val0, val0, val0, val0, dst, dst_stride); \
202  dst += (4 * dst_stride); \
203  SD4(val0, val0, val0, val0, dst, dst_stride); \
204 }
205 
206 INTRA_DC_TL_8x8(top);
207 INTRA_DC_TL_8x8(left);
208 
209 void ff_dc_16x16_msa(uint8_t *dst, ptrdiff_t dst_stride,
210  const uint8_t *src_left, const uint8_t *src_top)
211 {
212  v16u8 top, left, out;
213  v8u16 sum_h, sum_top, sum_left;
214  v4u32 sum_w;
215  v2u64 sum_d;
216 
217  top = LD_UB(src_top);
218  left = LD_UB(src_left);
219  HADD_UB2_UH(top, left, sum_top, sum_left);
220  sum_h = sum_top + sum_left;
221  sum_w = __msa_hadd_u_w(sum_h, sum_h);
222  sum_d = __msa_hadd_u_d(sum_w, sum_w);
223  sum_w = (v4u32) __msa_pckev_w((v4i32) sum_d, (v4i32) sum_d);
224  sum_d = __msa_hadd_u_d(sum_w, sum_w);
225  sum_w = (v4u32) __msa_srari_w((v4i32) sum_d, 5);
226  out = (v16u8) __msa_splati_b((v16i8) sum_w, 0);
227 
228  ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);
229  dst += (8 * dst_stride);
230  ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);
231 }
232 
233 #define INTRA_DC_TL_16x16(dir) \
234 void ff_dc_##dir##_16x16_msa(uint8_t *dst, ptrdiff_t dst_stride, \
235  const uint8_t *left, \
236  const uint8_t *top) \
237 { \
238  v16u8 data, out; \
239  v8u16 sum_h; \
240  v4u32 sum_w; \
241  v2u64 sum_d; \
242  \
243  data = LD_UB(dir); \
244  sum_h = __msa_hadd_u_h(data, data); \
245  sum_w = __msa_hadd_u_w(sum_h, sum_h); \
246  sum_d = __msa_hadd_u_d(sum_w, sum_w); \
247  sum_w = (v4u32) __msa_pckev_w((v4i32) sum_d, (v4i32) sum_d); \
248  sum_d = __msa_hadd_u_d(sum_w, sum_w); \
249  sum_w = (v4u32) __msa_srari_w((v4i32) sum_d, 4); \
250  out = (v16u8) __msa_splati_b((v16i8) sum_w, 0); \
251  \
252  ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride); \
253  dst += (8 * dst_stride); \
254  ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride); \
255 }
256 INTRA_DC_TL_16x16(top);
257 INTRA_DC_TL_16x16(left);
258 
259 void ff_dc_32x32_msa(uint8_t *dst, ptrdiff_t dst_stride,
260  const uint8_t *src_left, const uint8_t *src_top)
261 {
262  uint32_t row;
263  v16u8 top0, top1, left0, left1, out;
264  v8u16 sum_h, sum_top0, sum_top1, sum_left0, sum_left1;
265  v4u32 sum_w;
266  v2u64 sum_d;
267 
268  LD_UB2(src_top, 16, top0, top1);
269  LD_UB2(src_left, 16, left0, left1);
270  HADD_UB2_UH(top0, top1, sum_top0, sum_top1);
271  HADD_UB2_UH(left0, left1, sum_left0, sum_left1);
272  sum_h = sum_top0 + sum_top1;
273  sum_h += sum_left0 + sum_left1;
274  sum_w = __msa_hadd_u_w(sum_h, sum_h);
275  sum_d = __msa_hadd_u_d(sum_w, sum_w);
276  sum_w = (v4u32) __msa_pckev_w((v4i32) sum_d, (v4i32) sum_d);
277  sum_d = __msa_hadd_u_d(sum_w, sum_w);
278  sum_w = (v4u32) __msa_srari_w((v4i32) sum_d, 6);
279  out = (v16u8) __msa_splati_b((v16i8) sum_w, 0);
280 
281  for (row = 16; row--;)
282  {
283  ST_UB2(out, out, dst, 16);
284  dst += dst_stride;
285  ST_UB2(out, out, dst, 16);
286  dst += dst_stride;
287  }
288 }
289 
290 #define INTRA_DC_TL_32x32(dir) \
291 void ff_dc_##dir##_32x32_msa(uint8_t *dst, ptrdiff_t dst_stride, \
292  const uint8_t *left, \
293  const uint8_t *top) \
294 { \
295  uint32_t row; \
296  v16u8 data0, data1, out; \
297  v8u16 sum_h, sum_data0, sum_data1; \
298  v4u32 sum_w; \
299  v2u64 sum_d; \
300  \
301  LD_UB2(dir, 16, data0, data1); \
302  HADD_UB2_UH(data0, data1, sum_data0, sum_data1); \
303  sum_h = sum_data0 + sum_data1; \
304  sum_w = __msa_hadd_u_w(sum_h, sum_h); \
305  sum_d = __msa_hadd_u_d(sum_w, sum_w); \
306  sum_w = (v4u32) __msa_pckev_w((v4i32) sum_d, (v4i32) sum_d); \
307  sum_d = __msa_hadd_u_d(sum_w, sum_w); \
308  sum_w = (v4u32) __msa_srari_w((v4i32) sum_d, 5); \
309  out = (v16u8) __msa_splati_b((v16i8) sum_w, 0); \
310  \
311  for (row = 16; row--;) \
312  { \
313  ST_UB2(out, out, dst, 16); \
314  dst += dst_stride; \
315  ST_UB2(out, out, dst, 16); \
316  dst += dst_stride; \
317  } \
318 }
319 INTRA_DC_TL_32x32(top);
320 INTRA_DC_TL_32x32(left);
321 
/*
 * Generates ff_dc_<val>_16x16_msa(), which fills 16 rows of dst with the
 * vector produced by __msa_ldi_b(val). Neither `left` nor `top` is read.
 *
 * `val` is pasted into the function name and also passed to __msa_ldi_b,
 * so it has to be a token that is valid in both positions (for example a
 * decimal literal).
 *
 * NOTE(review): no invocation of this macro is visible in this listing
 * (listing lines 333-335 are absent) — confirm the instantiations exist.
 */
#define INTRA_PREDICT_VALDC_16X16_MSA(val)                                \
void ff_dc_##val##_16x16_msa(uint8_t *dst, ptrdiff_t dst_stride,          \
                             const uint8_t *left, const uint8_t *top)     \
{                                                                         \
    v16u8 fill = (v16u8) __msa_ldi_b(val);                                \
    uint32_t half;                                                        \
                                                                          \
    /* Two groups of eight rows. */                                       \
    for (half = 0; half < 2; half++) {                                    \
        ST_UB8(fill, fill, fill, fill, fill, fill, fill, fill,            \
               dst, dst_stride);                                          \
        dst += (8 * dst_stride);                                          \
    }                                                                     \
}
332 
336 
/*
 * Generates ff_dc_<val>_32x32_msa(), which fills 32 rows of dst, two
 * vectors per row (offsets 0 and 16), with the vector produced by
 * __msa_ldi_b(val). Neither `left` nor `top` is read.
 *
 * `val` is pasted into the function name and also passed to __msa_ldi_b,
 * so it has to be a token that is valid in both positions (for example a
 * decimal literal).
 *
 * NOTE(review): no invocation of this macro is visible in this listing
 * (listing lines 353-355 are absent) — confirm the instantiations exist.
 */
#define INTRA_PREDICT_VALDC_32X32_MSA(val)                                \
void ff_dc_##val##_32x32_msa(uint8_t *dst, ptrdiff_t dst_stride,          \
                             const uint8_t *left, const uint8_t *top)     \
{                                                                         \
    v16u8 fill = (v16u8) __msa_ldi_b(val);                                \
    uint32_t rows_left;                                                   \
                                                                          \
    for (rows_left = 32; rows_left--;) {                                  \
        ST_UB2(fill, fill, dst, 16);                                      \
        dst += dst_stride;                                                \
    }                                                                     \
}
352 
356 
357 void ff_tm_4x4_msa(uint8_t *dst, ptrdiff_t dst_stride,
358  const uint8_t *src_left, const uint8_t *src_top_ptr)
359 {
360  uint32_t left;
361  uint8_t top_left = src_top_ptr[-1];
362  v16i8 src_top, src_left0, src_left1, src_left2, src_left3, tmp0, tmp1;
363  v16u8 src0, src1, src2, src3;
364  v8u16 src_top_left, vec0, vec1, vec2, vec3;
365 
366  src_top_left = (v8u16) __msa_fill_h(top_left);
367  src_top = LD_SB(src_top_ptr);
368  left = LW(src_left);
369  src_left0 = __msa_fill_b(left >> 24);
370  src_left1 = __msa_fill_b(left >> 16);
371  src_left2 = __msa_fill_b(left >> 8);
372  src_left3 = __msa_fill_b(left);
373 
374  ILVR_B4_UB(src_left0, src_top, src_left1, src_top, src_left2, src_top,
375  src_left3, src_top, src0, src1, src2, src3);
376  HADD_UB4_UH(src0, src1, src2, src3, vec0, vec1, vec2, vec3);
377  IPRED_SUBS_UH2_UH(src_top_left, src_top_left, vec0, vec1);
378  IPRED_SUBS_UH2_UH(src_top_left, src_top_left, vec2, vec3);
379  SAT_UH4_UH(vec0, vec1, vec2, vec3, 7);
380  PCKEV_B2_SB(vec1, vec0, vec3, vec2, tmp0, tmp1);
381  ST4x4_UB(tmp0, tmp1, 0, 2, 0, 2, dst, dst_stride);
382 }
383 
384 void ff_tm_8x8_msa(uint8_t *dst, ptrdiff_t dst_stride,
385  const uint8_t *src_left, const uint8_t *src_top_ptr)
386 {
387  uint8_t top_left = src_top_ptr[-1];
388  uint32_t loop_cnt, left;
389  v16i8 src_top, src_left0, src_left1, src_left2, src_left3, tmp0, tmp1;
390  v8u16 src_top_left, vec0, vec1, vec2, vec3;
391  v16u8 src0, src1, src2, src3;
392 
393  src_top = LD_SB(src_top_ptr);
394  src_top_left = (v8u16) __msa_fill_h(top_left);
395 
396  src_left += 4;
397  for (loop_cnt = 2; loop_cnt--;) {
398  left = LW(src_left);
399  src_left0 = __msa_fill_b(left >> 24);
400  src_left1 = __msa_fill_b(left >> 16);
401  src_left2 = __msa_fill_b(left >> 8);
402  src_left3 = __msa_fill_b(left);
403  src_left -= 4;
404 
405  ILVR_B4_UB(src_left0, src_top, src_left1, src_top, src_left2, src_top,
406  src_left3, src_top, src0, src1, src2, src3);
407  HADD_UB4_UH(src0, src1, src2, src3, vec0, vec1, vec2, vec3);
408  IPRED_SUBS_UH2_UH(src_top_left, src_top_left, vec0, vec1);
409  IPRED_SUBS_UH2_UH(src_top_left, src_top_left, vec2, vec3);
410  SAT_UH4_UH(vec0, vec1, vec2, vec3, 7);
411  PCKEV_B2_SB(vec1, vec0, vec3, vec2, tmp0, tmp1);
412  ST8x4_UB(tmp0, tmp1, dst, dst_stride);
413  dst += (4 * dst_stride);
414  }
415 }
416 
417 void ff_tm_16x16_msa(uint8_t *dst, ptrdiff_t dst_stride,
418  const uint8_t *src_left, const uint8_t *src_top_ptr)
419 {
420  uint8_t top_left = src_top_ptr[-1];
421  uint32_t loop_cnt, left;
422  v16i8 src_top, src_left0, src_left1, src_left2, src_left3;
423  v8u16 src_top_left, res_r, res_l;
424 
425  src_top = LD_SB(src_top_ptr);
426  src_top_left = (v8u16) __msa_fill_h(top_left);
427 
428  src_left += 12;
429  for (loop_cnt = 4; loop_cnt--;) {
430  left = LW(src_left);
431  src_left0 = __msa_fill_b(left >> 24);
432  src_left1 = __msa_fill_b(left >> 16);
433  src_left2 = __msa_fill_b(left >> 8);
434  src_left3 = __msa_fill_b(left);
435  src_left -= 4;
436 
437  ILVRL_B2_UH(src_left0, src_top, res_r, res_l);
438  HADD_UB2_UH(res_r, res_l, res_r, res_l);
439  IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r, res_l);
440 
441  SAT_UH2_UH(res_r, res_l, 7);
442  PCKEV_ST_SB(res_r, res_l, dst);
443  dst += dst_stride;
444 
445  ILVRL_B2_UH(src_left1, src_top, res_r, res_l);
446  HADD_UB2_UH(res_r, res_l, res_r, res_l);
447  IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r, res_l);
448  SAT_UH2_UH(res_r, res_l, 7);
449  PCKEV_ST_SB(res_r, res_l, dst);
450  dst += dst_stride;
451 
452  ILVRL_B2_UH(src_left2, src_top, res_r, res_l);
453  HADD_UB2_UH(res_r, res_l, res_r, res_l);
454  IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r, res_l);
455  SAT_UH2_UH(res_r, res_l, 7);
456  PCKEV_ST_SB(res_r, res_l, dst);
457  dst += dst_stride;
458 
459  ILVRL_B2_UH(src_left3, src_top, res_r, res_l);
460  HADD_UB2_UH(res_r, res_l, res_r, res_l);
461  IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r, res_l);
462  SAT_UH2_UH(res_r, res_l, 7);
463  PCKEV_ST_SB(res_r, res_l, dst);
464  dst += dst_stride;
465  }
466 }
467 
468 void ff_tm_32x32_msa(uint8_t *dst, ptrdiff_t dst_stride,
469  const uint8_t *src_left, const uint8_t *src_top_ptr)
470 {
471  uint8_t top_left = src_top_ptr[-1];
472  uint32_t loop_cnt, left;
473  v16i8 src_top0, src_top1, src_left0, src_left1, src_left2, src_left3;
474  v8u16 src_top_left, res_r0, res_r1, res_l0, res_l1;
475 
476  src_top0 = LD_SB(src_top_ptr);
477  src_top1 = LD_SB(src_top_ptr + 16);
478  src_top_left = (v8u16) __msa_fill_h(top_left);
479 
480  src_left += 28;
481  for (loop_cnt = 8; loop_cnt--;) {
482  left = LW(src_left);
483  src_left0 = __msa_fill_b(left >> 24);
484  src_left1 = __msa_fill_b(left >> 16);
485  src_left2 = __msa_fill_b(left >> 8);
486  src_left3 = __msa_fill_b(left);
487  src_left -= 4;
488 
489  ILVR_B2_UH(src_left0, src_top0, src_left0, src_top1, res_r0, res_r1);
490  ILVL_B2_UH(src_left0, src_top0, src_left0, src_top1, res_l0, res_l1);
491  HADD_UB4_UH(res_r0, res_l0, res_r1, res_l1, res_r0, res_l0, res_r1,
492  res_l1);
493  IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r0, res_l0);
494  IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r1, res_l1);
495  SAT_UH4_UH(res_r0, res_l0, res_r1, res_l1, 7);
496  PCKEV_ST_SB(res_r0, res_l0, dst);
497  PCKEV_ST_SB(res_r1, res_l1, dst + 16);
498  dst += dst_stride;
499 
500  ILVR_B2_UH(src_left1, src_top0, src_left1, src_top1, res_r0, res_r1);
501  ILVL_B2_UH(src_left1, src_top0, src_left1, src_top1, res_l0, res_l1);
502  HADD_UB4_UH(res_r0, res_l0, res_r1, res_l1, res_r0, res_l0, res_r1,
503  res_l1);
504  IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r0, res_l0);
505  IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r1, res_l1);
506  SAT_UH4_UH(res_r0, res_l0, res_r1, res_l1, 7);
507  PCKEV_ST_SB(res_r0, res_l0, dst);
508  PCKEV_ST_SB(res_r1, res_l1, dst + 16);
509  dst += dst_stride;
510 
511  ILVR_B2_UH(src_left2, src_top0, src_left2, src_top1, res_r0, res_r1);
512  ILVL_B2_UH(src_left2, src_top0, src_left2, src_top1, res_l0, res_l1);
513  HADD_UB4_UH(res_r0, res_l0, res_r1, res_l1, res_r0, res_l0, res_r1,
514  res_l1);
515  IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r0, res_l0);
516  IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r1, res_l1);
517  SAT_UH4_UH(res_r0, res_l0, res_r1, res_l1, 7);
518  PCKEV_ST_SB(res_r0, res_l0, dst);
519  PCKEV_ST_SB(res_r1, res_l1, dst + 16);
520  dst += dst_stride;
521 
522  ILVR_B2_UH(src_left3, src_top0, src_left3, src_top1, res_r0, res_r1);
523  ILVL_B2_UH(src_left3, src_top0, src_left3, src_top1, res_l0, res_l1);
524  HADD_UB4_UH(res_r0, res_l0, res_r1, res_l1, res_r0, res_l0, res_r1,
525  res_l1);
526  IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r0, res_l0);
527  IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r1, res_l1);
528  SAT_UH4_UH(res_r0, res_l0, res_r1, res_l1, 7);
529  PCKEV_ST_SB(res_r0, res_l0, dst);
530  PCKEV_ST_SB(res_r1, res_l1, dst + 16);
531  dst += dst_stride;
532  }
533 }
void ff_vert_32x32_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *left, const uint8_t *src)
Definition: vp9_intra_msa.c:45
void ff_dc_4x4_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src_left, const uint8_t *src_top)
#define INTRA_PREDICT_VALDC_16X16_MSA(val)
void ff_hor_32x32_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, const uint8_t *top)
Definition: vp9_intra_msa.c:81
#define LW(psrc)
#define LD_SB(...)
static void sum_d(const int *input, int *output, int len)
Definition: dcadct.c:51
#define src
Definition: vp8dsp.c:254
#define ST4x4_UB(in0, in1, idx0, idx1, idx2, idx3, pdst, stride)
#define INTRA_DC_TL_32x32(dir)
void ff_vert_16x16_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *left, const uint8_t *src)
Definition: vp9_intra_msa.c:31
#define PCKEV_ST_SB(in0, in1, pdst)
#define ILVL_B2_UH(...)
uint8_t
void ff_dc_8x8_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src_left, const uint8_t *src_top)
#define INTRA_DC_TL_16x16(dir)
#define LD_UB2(...)
#define INTRA_DC_TL_8x8(dir)
#define PCKEV_B2_SB(...)
#define IPRED_SUBS_UH2_UH(in0, in1, out0, out1)
Definition: vp9_intra_msa.c:25
#define ILVRL_B2_UH(...)
#define SW4(in0, in1, in2, in3, pdst, stride)
void ff_hor_16x16_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, const uint8_t *top)
Definition: vp9_intra_msa.c:60
void ff_tm_4x4_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src_left, const uint8_t *src_top_ptr)
#define ILVR_B4_UB(...)
#define ST_UB(...)
#define ST_UB2(...)
#define ST_UB8(...)
#define ST_UB4(...)
#define ILVR_B2_UH(...)
#define src1
Definition: h264pred.c:139
void ff_tm_32x32_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src_left, const uint8_t *src_top_ptr)
void ff_tm_16x16_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src_left, const uint8_t *src_top_ptr)
#define HADD_UB2_UH(...)
void ff_dc_16x16_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src_left, const uint8_t *src_top)
#define src0
Definition: h264pred.c:138
#define LD(psrc)
#define INSERT_W2_SB(...)
#define INTRA_DC_TL_4x4(dir)
#define SD4(in0, in1, in2, in3, pdst, stride)
void ff_tm_8x8_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src_left, const uint8_t *src_top_ptr)
#define INSERT_D2_UB(...)
#define ST8x4_UB(in0, in1, pdst, stride)
#define SAT_UH2_UH(...)
#define SAT_UH4_UH(...)
#define LD_UB(...)
#define HADD_UB4_UH(...)
FILE * out
Definition: movenc.c:54
#define INTRA_PREDICT_VALDC_32X32_MSA(val)
void ff_dc_32x32_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src_left, const uint8_t *src_top)