FFmpeg  4.0
mpegvideo_msa.c
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2015 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
3  *
4  * This file is part of FFmpeg.
5  *
6  * FFmpeg is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * FFmpeg is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with FFmpeg; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19  */
20 
22 #include "h263dsp_mips.h"
23 
24 static void h263_dct_unquantize_msa(int16_t *block, int16_t qmul,
25  int16_t qadd, int8_t n_coeffs,
26  uint8_t loop_start)
27 {
28  int16_t *block_dup = block;
29  int32_t level, cnt;
30  v8i16 block_vec, qmul_vec, qadd_vec, sub;
31  v8i16 add, mask, mul, zero_mask;
32 
33  qmul_vec = __msa_fill_h(qmul);
34  qadd_vec = __msa_fill_h(qadd);
35  for (cnt = 0; cnt < (n_coeffs >> 3); cnt++) {
36  block_vec = LD_SH(block_dup + loop_start);
37  mask = __msa_clti_s_h(block_vec, 0);
38  zero_mask = __msa_ceqi_h(block_vec, 0);
39  mul = block_vec * qmul_vec;
40  sub = mul - qadd_vec;
41  add = mul + qadd_vec;
42  add = (v8i16) __msa_bmnz_v((v16u8) add, (v16u8) sub, (v16u8) mask);
43  block_vec = (v8i16) __msa_bmnz_v((v16u8) add, (v16u8) block_vec,
44  (v16u8) zero_mask);
45  ST_SH(block_vec, block_dup + loop_start);
46  block_dup += 8;
47  }
48 
49  cnt = ((n_coeffs >> 3) * 8) + loop_start;
50 
51  for (; cnt <= n_coeffs; cnt++) {
52  level = block[cnt];
53  if (level) {
54  if (level < 0) {
55  level = level * qmul - qadd;
56  } else {
57  level = level * qmul + qadd;
58  }
59  block[cnt] = level;
60  }
61  }
62 }
63 
65  int32_t qscale,
66  const int16_t *quant_matrix)
67 {
68  int32_t cnt, sum_res = -1;
69  v8i16 block_vec, block_neg, qscale_vec, mask;
70  v8i16 block_org0, block_org1, block_org2, block_org3;
71  v8i16 quant_m0, quant_m1, quant_m2, quant_m3;
72  v8i16 sum, mul, zero_mask;
73  v4i32 mul_vec, qscale_l, qscale_r, quant_m_r, quant_m_l;
74  v4i32 block_l, block_r, sad;
75 
76  qscale_vec = __msa_fill_h(qscale);
77  for (cnt = 0; cnt < 2; cnt++) {
78  LD_SH4(block, 8, block_org0, block_org1, block_org2, block_org3);
79  LD_SH4(quant_matrix, 8, quant_m0, quant_m1, quant_m2, quant_m3);
80  mask = __msa_clti_s_h(block_org0, 0);
81  zero_mask = __msa_ceqi_h(block_org0, 0);
82  block_neg = -block_org0;
83  block_vec = (v8i16) __msa_bmnz_v((v16u8) block_org0, (v16u8) block_neg,
84  (v16u8) mask);
85  block_vec <<= 1;
86  block_vec += 1;
87  UNPCK_SH_SW(block_vec, block_r, block_l);
88  UNPCK_SH_SW(qscale_vec, qscale_r, qscale_l);
89  UNPCK_SH_SW(quant_m0, quant_m_r, quant_m_l);
90  mul_vec = block_l * qscale_l;
91  mul_vec *= quant_m_l;
92  block_l = mul_vec >> 4;
93  mul_vec = block_r * qscale_r;
94  mul_vec *= quant_m_r;
95  block_r = mul_vec >> 4;
96  mul = (v8i16) __msa_pckev_h((v8i16) block_l, (v8i16) block_r);
97  block_neg = - mul;
98  sum = (v8i16) __msa_bmnz_v((v16u8) mul, (v16u8) block_neg,
99  (v16u8) mask);
100  sum = (v8i16) __msa_bmnz_v((v16u8) sum, (v16u8) block_org0,
101  (v16u8) zero_mask);
102  ST_SH(sum, block);
103  block += 8;
104  quant_matrix += 8;
105  sad = __msa_hadd_s_w(sum, sum);
106  sum_res += HADD_SW_S32(sad);
107  mask = __msa_clti_s_h(block_org1, 0);
108  zero_mask = __msa_ceqi_h(block_org1, 0);
109  block_neg = - block_org1;
110  block_vec = (v8i16) __msa_bmnz_v((v16u8) block_org1, (v16u8) block_neg,
111  (v16u8) mask);
112  block_vec <<= 1;
113  block_vec += 1;
114  UNPCK_SH_SW(block_vec, block_r, block_l);
115  UNPCK_SH_SW(qscale_vec, qscale_r, qscale_l);
116  UNPCK_SH_SW(quant_m1, quant_m_r, quant_m_l);
117  mul_vec = block_l * qscale_l;
118  mul_vec *= quant_m_l;
119  block_l = mul_vec >> 4;
120  mul_vec = block_r * qscale_r;
121  mul_vec *= quant_m_r;
122  block_r = mul_vec >> 4;
123  mul = __msa_pckev_h((v8i16) block_l, (v8i16) block_r);
124  block_neg = - mul;
125  sum = (v8i16) __msa_bmnz_v((v16u8) mul, (v16u8) block_neg,
126  (v16u8) mask);
127  sum = (v8i16) __msa_bmnz_v((v16u8) sum, (v16u8) block_org1,
128  (v16u8) zero_mask);
129  ST_SH(sum, block);
130 
131  block += 8;
132  quant_matrix += 8;
133  sad = __msa_hadd_s_w(sum, sum);
134  sum_res += HADD_SW_S32(sad);
135  mask = __msa_clti_s_h(block_org2, 0);
136  zero_mask = __msa_ceqi_h(block_org2, 0);
137  block_neg = - block_org2;
138  block_vec = (v8i16) __msa_bmnz_v((v16u8) block_org2, (v16u8) block_neg,
139  (v16u8) mask);
140  block_vec <<= 1;
141  block_vec += 1;
142  UNPCK_SH_SW(block_vec, block_r, block_l);
143  UNPCK_SH_SW(qscale_vec, qscale_r, qscale_l);
144  UNPCK_SH_SW(quant_m2, quant_m_r, quant_m_l);
145  mul_vec = block_l * qscale_l;
146  mul_vec *= quant_m_l;
147  block_l = mul_vec >> 4;
148  mul_vec = block_r * qscale_r;
149  mul_vec *= quant_m_r;
150  block_r = mul_vec >> 4;
151  mul = __msa_pckev_h((v8i16) block_l, (v8i16) block_r);
152  block_neg = - mul;
153  sum = (v8i16) __msa_bmnz_v((v16u8) mul, (v16u8) block_neg,
154  (v16u8) mask);
155  sum = (v8i16) __msa_bmnz_v((v16u8) sum, (v16u8) block_org2,
156  (v16u8) zero_mask);
157  ST_SH(sum, block);
158 
159  block += 8;
160  quant_matrix += 8;
161  sad = __msa_hadd_s_w(sum, sum);
162  sum_res += HADD_SW_S32(sad);
163  mask = __msa_clti_s_h(block_org3, 0);
164  zero_mask = __msa_ceqi_h(block_org3, 0);
165  block_neg = - block_org3;
166  block_vec = (v8i16) __msa_bmnz_v((v16u8) block_org3, (v16u8) block_neg,
167  (v16u8) mask);
168  block_vec <<= 1;
169  block_vec += 1;
170  UNPCK_SH_SW(block_vec, block_r, block_l);
171  UNPCK_SH_SW(qscale_vec, qscale_r, qscale_l);
172  UNPCK_SH_SW(quant_m3, quant_m_r, quant_m_l);
173  mul_vec = block_l * qscale_l;
174  mul_vec *= quant_m_l;
175  block_l = mul_vec >> 4;
176  mul_vec = block_r * qscale_r;
177  mul_vec *= quant_m_r;
178  block_r = mul_vec >> 4;
179  mul = __msa_pckev_h((v8i16) block_l, (v8i16) block_r);
180  block_neg = - mul;
181  sum = (v8i16) __msa_bmnz_v((v16u8) mul, (v16u8) block_neg,
182  (v16u8) mask);
183  sum = (v8i16) __msa_bmnz_v((v16u8) sum, (v16u8) block_org3,
184  (v16u8) zero_mask);
185  ST_SH(sum, block);
186 
187  block += 8;
188  quant_matrix += 8;
189  sad = __msa_hadd_s_w(sum, sum);
190  sum_res += HADD_SW_S32(sad);
191  }
192 
193  return sum_res;
194 }
195 
197  int16_t *block, int32_t index,
198  int32_t qscale)
199 {
200  int32_t qmul, qadd;
201  int32_t nCoeffs;
202 
203  av_assert2(s->block_last_index[index] >= 0 || s->h263_aic);
204 
205  qmul = qscale << 1;
206 
207  if (!s->h263_aic) {
208  block[0] *= index < 4 ? s->y_dc_scale : s->c_dc_scale;
209  qadd = (qscale - 1) | 1;
210  } else {
211  qadd = 0;
212  }
213  if (s->ac_pred)
214  nCoeffs = 63;
215  else
217 
218  h263_dct_unquantize_msa(block, qmul, qadd, nCoeffs, 1);
219 }
220 
222  int16_t *block, int32_t index,
223  int32_t qscale)
224 {
225  int32_t qmul, qadd;
226  int32_t nCoeffs;
227 
228  av_assert2(s->block_last_index[index] >= 0);
229 
230  qadd = (qscale - 1) | 1;
231  qmul = qscale << 1;
232 
234 
235  h263_dct_unquantize_msa(block, qmul, qadd, nCoeffs, 0);
236 }
237 
239  int16_t *block, int32_t index,
240  int32_t qscale)
241 {
242  const uint16_t *quant_matrix;
243  int32_t sum = -1;
244 
245  quant_matrix = s->inter_matrix;
246 
247  sum = mpeg2_dct_unquantize_inter_msa(block, qscale, quant_matrix);
248 
249  block[63] ^= sum & 1;
250 }
void ff_dct_unquantize_h263_intra_msa(MpegEncContext *s, int16_t *block, int32_t index, int32_t qscale)
const char * s
Definition: avisynth_c.h:768
uint8_t raster_end[64]
Definition: idctdsp.h:34
int h263_aic
Advanced INTRA Coding (AIC)
Definition: mpegvideo.h:87
static int16_t block[64]
Definition: dct.c:115
uint8_t
#define av_assert2(cond)
assert() equivalent, that does lie in speed critical code.
Definition: avassert.h:64
#define LD_SH(...)
static const uint16_t mask[17]
Definition: lzw.c:38
static int32_t mpeg2_dct_unquantize_inter_msa(int16_t *block, int32_t qscale, const int16_t *quant_matrix)
Definition: mpegvideo_msa.c:64
int32_t
int block_last_index[12]
last non zero coefficient in block
Definition: mpegvideo.h:86
void ff_dct_unquantize_mpeg2_inter_msa(MpegEncContext *s, int16_t *block, int32_t index, int32_t qscale)
#define UNPCK_SH_SW(in, out0, out1)
uint16_t inter_matrix[64]
Definition: mpegvideo.h:302
void ff_dct_unquantize_h263_inter_msa(MpegEncContext *s, int16_t *block, int32_t index, int32_t qscale)
#define ST_SH(...)
int index
Definition: gxfenc.c:89
uint8_t level
Definition: svq3.c:207
MpegEncContext.
Definition: mpegvideo.h:81
#define LD_SH4(...)
ScanTable inter_scantable
if inter == intra then intra should be used to reduce the cache usage
Definition: mpegvideo.h:90
static void h263_dct_unquantize_msa(int16_t *block, int16_t qmul, int16_t qadd, int8_t n_coeffs, uint8_t loop_start)
Definition: mpegvideo_msa.c:24
#define HADD_SW_S32(in)