FFmpeg  4.0
wmavoice.c
Go to the documentation of this file.
1 /*
2  * Windows Media Audio Voice decoder.
3  * Copyright (c) 2009 Ronald S. Bultje
4  *
5  * This file is part of FFmpeg.
6  *
7  * FFmpeg is free software; you can redistribute it and/or
8  * modify it under the terms of the GNU Lesser General Public
9  * License as published by the Free Software Foundation; either
10  * version 2.1 of the License, or (at your option) any later version.
11  *
12  * FFmpeg is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15  * Lesser General Public License for more details.
16  *
17  * You should have received a copy of the GNU Lesser General Public
18  * License along with FFmpeg; if not, write to the Free Software
19  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20  */
21 
22 /**
23  * @file
24  * @brief Windows Media Audio Voice compatible decoder
25  * @author Ronald S. Bultje <rsbultje@gmail.com>
26  */
27 
28 #include <math.h>
29 
31 #include "libavutil/float_dsp.h"
32 #include "libavutil/mem.h"
33 #include "libavutil/thread.h"
34 #include "avcodec.h"
35 #include "internal.h"
36 #include "get_bits.h"
37 #include "put_bits.h"
38 #include "wmavoice_data.h"
39 #include "celp_filters.h"
40 #include "acelp_vectors.h"
41 #include "acelp_filters.h"
42 #include "lsp.h"
43 #include "dct.h"
44 #include "rdft.h"
45 #include "sinewin.h"
46 
47 #define MAX_BLOCKS 8 ///< maximum number of blocks per frame
48 #define MAX_LSPS 16 ///< maximum filter order
49 #define MAX_LSPS_ALIGN16 16 ///< same as #MAX_LSPS; needs to be multiple
50  ///< of 16 for ASM input buffer alignment
51 #define MAX_FRAMES 3 ///< maximum number of frames per superframe
52 #define MAX_FRAMESIZE 160 ///< maximum number of samples per frame
53 #define MAX_SIGNAL_HISTORY 416 ///< maximum excitation signal history
54 #define MAX_SFRAMESIZE (MAX_FRAMESIZE * MAX_FRAMES)
55  ///< maximum number of samples per superframe
56 #define SFRAME_CACHE_MAXSIZE 256 ///< maximum cache size for frame data that
57  ///< was split over two packets
58 #define VLC_NBITS 6 ///< number of bits to read per VLC iteration
59 
60 /**
61  * Frame type VLC coding.
62  */
64 
65 /**
66  * Adaptive codebook types.
67  */
68 enum {
69  ACB_TYPE_NONE = 0, ///< no adaptive codebook (only hardcoded fixed)
70  ACB_TYPE_ASYMMETRIC = 1, ///< adaptive codebook with per-frame pitch, which
71  ///< we interpolate to get a per-sample pitch.
72  ///< Signal is generated using an asymmetric sinc
73  ///< window function
74  ///< @note see #wmavoice_ipol1_coeffs
75  ACB_TYPE_HAMMING = 2 ///< Per-block pitch with signal generation using
76  ///< a Hamming sinc window function
77  ///< @note see #wmavoice_ipol2_coeffs
78 };
79 
80 /**
81  * Fixed codebook types.
82  */
83 enum {
84  FCB_TYPE_SILENCE = 0, ///< comfort noise during silence
85  ///< generated from a hardcoded (fixed) codebook
86  ///< with per-frame (low) gain values
87  FCB_TYPE_HARDCODED = 1, ///< hardcoded (fixed) codebook with per-block
88  ///< gain values
89  FCB_TYPE_AW_PULSES = 2, ///< Pitch-adaptive window (AW) pulse signals,
90  ///< used in particular for low-bitrate streams
91  FCB_TYPE_EXC_PULSES = 3, ///< Innovation (fixed) codebook pulse sets in
92  ///< combinations of either single pulses or
93  ///< pulse pairs
94 };
95 
96 /**
97  * Description of frame types.
98  */
99 static const struct frame_type_desc {
100  uint8_t n_blocks; ///< amount of blocks per frame (each block
101  ///< (contains 160/#n_blocks samples)
102  uint8_t log_n_blocks; ///< log2(#n_blocks)
103  uint8_t acb_type; ///< Adaptive codebook type (ACB_TYPE_*)
104  uint8_t fcb_type; ///< Fixed codebook type (FCB_TYPE_*)
105  uint8_t dbl_pulses; ///< how many pulse vectors have pulse pairs
106  ///< (rather than just one single pulse)
107  ///< only if #fcb_type == #FCB_TYPE_EXC_PULSES
108 } frame_descs[17] = {
109  { 1, 0, ACB_TYPE_NONE, FCB_TYPE_SILENCE, 0 },
110  { 2, 1, ACB_TYPE_NONE, FCB_TYPE_HARDCODED, 0 },
126 };
127 
128 /**
129  * WMA Voice decoding context.
130  */
131 typedef struct WMAVoiceContext {
132  /**
133  * @name Global values specified in the stream header / extradata or used all over.
134  * @{
135  */
136  GetBitContext gb; ///< packet bitreader. During decoder init,
137  ///< it contains the extradata from the
138  ///< demuxer. During decoding, it contains
139  ///< packet data.
140  int8_t vbm_tree[25]; ///< converts VLC codes to frame type
141 
142  int spillover_bitsize; ///< number of bits used to specify
143  ///< #spillover_nbits in the packet header
144  ///< = ceil(log2(ctx->block_align << 3))
145  int history_nsamples; ///< number of samples in history for signal
146  ///< prediction (through ACB)
147 
148  /* postfilter specific values */
149  int do_apf; ///< whether to apply the averaged
150  ///< projection filter (APF)
151  int denoise_strength; ///< strength of denoising in Wiener filter
152  ///< [0-11]
153  int denoise_tilt_corr; ///< Whether to apply tilt correction to the
154  ///< Wiener filter coefficients (postfilter)
155  int dc_level; ///< Predicted amount of DC noise, based
156  ///< on which a DC removal filter is used
157 
158  int lsps; ///< number of LSPs per frame [10 or 16]
159  int lsp_q_mode; ///< defines quantizer defaults [0, 1]
160  int lsp_def_mode; ///< defines different sets of LSP defaults
161  ///< [0, 1]
162 
163  int min_pitch_val; ///< base value for pitch parsing code
164  int max_pitch_val; ///< max value + 1 for pitch parsing
165  int pitch_nbits; ///< number of bits used to specify the
166  ///< pitch value in the frame header
167  int block_pitch_nbits; ///< number of bits used to specify the
168  ///< first block's pitch value
169  int block_pitch_range; ///< range of the block pitch
170  int block_delta_pitch_nbits; ///< number of bits used to specify the
171  ///< delta pitch between this and the last
172  ///< block's pitch value, used in all but
173  ///< first block
174  int block_delta_pitch_hrange; ///< 1/2 range of the delta (full range is
175  ///< from -this to +this-1)
176  uint16_t block_conv_table[4]; ///< boundaries for block pitch unit/scale
177  ///< conversion
178 
179  /**
180  * @}
181  *
182  * @name Packet values specified in the packet header or related to a packet.
183  *
184  * A packet is considered to be a single unit of data provided to this
185  * decoder by the demuxer.
186  * @{
187  */
188  int spillover_nbits; ///< number of bits of the previous packet's
189  ///< last superframe preceding this
190  ///< packet's first full superframe (useful
191  ///< for re-synchronization also)
192  int has_residual_lsps; ///< if set, superframes contain one set of
193  ///< LSPs that cover all frames, encoded as
194  ///< independent and residual LSPs; if not
195  ///< set, each frame contains its own, fully
196  ///< independent, LSPs
197  int skip_bits_next; ///< number of bits to skip at the next call
198  ///< to #wmavoice_decode_packet() (since
199  ///< they're part of the previous superframe)
200 
202  ///< cache for superframe data split over
203  ///< multiple packets
204  int sframe_cache_size; ///< set to >0 if we have data from an
205  ///< (incomplete) superframe from a previous
206  ///< packet that spilled over in the current
207  ///< packet; specifies the amount of bits in
208  ///< #sframe_cache
209  PutBitContext pb; ///< bitstream writer for #sframe_cache
210 
211  /**
212  * @}
213  *
214  * @name Frame and superframe values
215  * Superframe and frame data - these can change from frame to frame,
216  * although some of them do in that case serve as a cache / history for
217  * the next frame or superframe.
218  * @{
219  */
220  double prev_lsps[MAX_LSPS]; ///< LSPs of the last frame of the previous
221  ///< superframe
222  int last_pitch_val; ///< pitch value of the previous frame
223  int last_acb_type; ///< frame type [0-2] of the previous frame
224  int pitch_diff_sh16; ///< ((cur_pitch_val - #last_pitch_val)
225  ///< << 16) / #MAX_FRAMESIZE
226  float silence_gain; ///< set for use in blocks if #ACB_TYPE_NONE
227 
228  int aw_idx_is_ext; ///< whether the AW index was encoded in
229  ///< 8 bits (instead of 6)
230  int aw_pulse_range; ///< the range over which #aw_pulse_set1()
231  ///< can apply the pulse, relative to the
232  ///< value in aw_first_pulse_off. The exact
233  ///< position of the first AW-pulse is within
234  ///< [pulse_off, pulse_off + this], and
235  ///< depends on bitstream values; [16 or 24]
236  int aw_n_pulses[2]; ///< number of AW-pulses in each block; note
237  ///< that this number can be negative (in
238  ///< which case it basically means "zero")
239  int aw_first_pulse_off[2]; ///< index of first sample to which to
240  ///< apply AW-pulses, or -0xff if unset
241  int aw_next_pulse_off_cache; ///< the position (relative to start of the
242  ///< second block) at which pulses should
243  ///< start to be positioned, serves as a
244  ///< cache for pitch-adaptive window pulses
245  ///< between blocks
246 
247  int frame_cntr; ///< current frame index [0 - 0xFFFE]; is
248  ///< only used for comfort noise in #pRNG()
249  int nb_superframes; ///< number of superframes in current packet
250  float gain_pred_err[6]; ///< cache for gain prediction
251  float excitation_history[MAX_SIGNAL_HISTORY];
252  ///< cache of the signal of previous
253  ///< superframes, used as a history for
254  ///< signal generation
255  float synth_history[MAX_LSPS]; ///< see #excitation_history
256  /**
257  * @}
258  *
259  * @name Postfilter values
260  *
261  * Variables used for postfilter implementation, mostly history for
262  * smoothing and so on, and context variables for FFT/iFFT.
263  * @{
264  */
265  RDFTContext rdft, irdft; ///< contexts for FFT-calculation in the
266  ///< postfilter (for denoise filter)
267  DCTContext dct, dst; ///< contexts for phase shift (in Hilbert
268  ///< transform, part of postfilter)
269  float sin[511], cos[511]; ///< 8-bit cosine/sine windows over [-pi,pi]
270  ///< range
271  float postfilter_agc; ///< gain control memory, used in
272  ///< #adaptive_gain_control()
273  float dcf_mem[2]; ///< DC filter history
274  float zero_exc_pf[MAX_SIGNAL_HISTORY + MAX_SFRAMESIZE];
275  ///< zero filter output (i.e. excitation)
276  ///< by postfilter
277  float denoise_filter_cache[MAX_FRAMESIZE];
278  int denoise_filter_cache_size; ///< samples in #denoise_filter_cache
279  DECLARE_ALIGNED(32, float, tilted_lpcs_pf)[0x80];
280  ///< aligned buffer for LPC tilting
281  DECLARE_ALIGNED(32, float, denoise_coeffs_pf)[0x80];
282  ///< aligned buffer for denoise coefficients
283  DECLARE_ALIGNED(32, float, synth_filter_out_buf)[0x80 + MAX_LSPS_ALIGN16];
284  ///< aligned buffer for postfilter speech
285  ///< synthesis
286  /**
287  * @}
288  */
290 
291 /**
292  * Set up the variable bit mode (VBM) tree from container extradata.
293  * @param gb bit I/O context.
294  * The bit context (s->gb) should be loaded with byte 23-46 of the
295  * container extradata (i.e. the ones containing the VBM tree).
296  * @param vbm_tree pointer to array to which the decoded VBM tree will be
297  * written.
298  * @return 0 on success, <0 on error.
299  */
300 static av_cold int decode_vbmtree(GetBitContext *gb, int8_t vbm_tree[25])
301 {
302  int cntr[8] = { 0 }, n, res;
303 
304  memset(vbm_tree, 0xff, sizeof(vbm_tree[0]) * 25);
305  for (n = 0; n < 17; n++) {
306  res = get_bits(gb, 3);
307  if (cntr[res] > 3) // should be >= 3 + (res == 7))
308  return -1;
309  vbm_tree[res * 3 + cntr[res]++] = n;
310  }
311  return 0;
312 }
313 
315 {
316  static const uint8_t bits[] = {
317  2, 2, 2, 4, 4, 4,
318  6, 6, 6, 8, 8, 8,
319  10, 10, 10, 12, 12, 12,
320  14, 14, 14, 14
321  };
322  static const uint16_t codes[] = {
323  0x0000, 0x0001, 0x0002, // 00/01/10
324  0x000c, 0x000d, 0x000e, // 11+00/01/10
325  0x003c, 0x003d, 0x003e, // 1111+00/01/10
326  0x00fc, 0x00fd, 0x00fe, // 111111+00/01/10
327  0x03fc, 0x03fd, 0x03fe, // 11111111+00/01/10
328  0x0ffc, 0x0ffd, 0x0ffe, // 1111111111+00/01/10
329  0x3ffc, 0x3ffd, 0x3ffe, 0x3fff // 111111111111+xx
330  };
331 
332  INIT_VLC_STATIC(&frame_type_vlc, VLC_NBITS, sizeof(bits),
333  bits, 1, 1, codes, 2, 2, 132);
334 }
335 
337 {
338  WMAVoiceContext *s = ctx->priv_data;
339  int n;
340 
341  s->postfilter_agc = 0;
342  s->sframe_cache_size = 0;
343  s->skip_bits_next = 0;
344  for (n = 0; n < s->lsps; n++)
345  s->prev_lsps[n] = M_PI * (n + 1.0) / (s->lsps + 1.0);
346  memset(s->excitation_history, 0,
347  sizeof(*s->excitation_history) * MAX_SIGNAL_HISTORY);
348  memset(s->synth_history, 0,
349  sizeof(*s->synth_history) * MAX_LSPS);
350  memset(s->gain_pred_err, 0,
351  sizeof(s->gain_pred_err));
352 
353  if (s->do_apf) {
354  memset(&s->synth_filter_out_buf[MAX_LSPS_ALIGN16 - s->lsps], 0,
355  sizeof(*s->synth_filter_out_buf) * s->lsps);
356  memset(s->dcf_mem, 0,
357  sizeof(*s->dcf_mem) * 2);
358  memset(s->zero_exc_pf, 0,
359  sizeof(*s->zero_exc_pf) * s->history_nsamples);
360  memset(s->denoise_filter_cache, 0, sizeof(s->denoise_filter_cache));
361  }
362 }
363 
364 /**
365  * Set up decoder with parameters from demuxer (extradata etc.).
366  */
368 {
369  static AVOnce init_static_once = AV_ONCE_INIT;
370  int n, flags, pitch_range, lsp16_flag;
371  WMAVoiceContext *s = ctx->priv_data;
372 
373  ff_thread_once(&init_static_once, wmavoice_init_static_data);
374 
375  /**
376  * Extradata layout:
377  * - byte 0-18: WMAPro-in-WMAVoice extradata (see wmaprodec.c),
378  * - byte 19-22: flags field (annoyingly in LE; see below for known
379  * values),
380  * - byte 23-46: variable bitmode tree (really just 17 * 3 bits,
381  * rest is 0).
382  */
383  if (ctx->extradata_size != 46) {
384  av_log(ctx, AV_LOG_ERROR,
385  "Invalid extradata size %d (should be 46)\n",
386  ctx->extradata_size);
387  return AVERROR_INVALIDDATA;
388  }
389  if (ctx->block_align <= 0) {
390  av_log(ctx, AV_LOG_ERROR, "Invalid block alignment %d.\n", ctx->block_align);
391  return AVERROR_INVALIDDATA;
392  }
393 
394  flags = AV_RL32(ctx->extradata + 18);
395  s->spillover_bitsize = 3 + av_ceil_log2(ctx->block_align);
396  s->do_apf = flags & 0x1;
397  if (s->do_apf) {
398  ff_rdft_init(&s->rdft, 7, DFT_R2C);
399  ff_rdft_init(&s->irdft, 7, IDFT_C2R);
400  ff_dct_init(&s->dct, 6, DCT_I);
401  ff_dct_init(&s->dst, 6, DST_I);
402 
403  ff_sine_window_init(s->cos, 256);
404  memcpy(&s->sin[255], s->cos, 256 * sizeof(s->cos[0]));
405  for (n = 0; n < 255; n++) {
406  s->sin[n] = -s->sin[510 - n];
407  s->cos[510 - n] = s->cos[n];
408  }
409  }
410  s->denoise_strength = (flags >> 2) & 0xF;
411  if (s->denoise_strength >= 12) {
412  av_log(ctx, AV_LOG_ERROR,
413  "Invalid denoise filter strength %d (max=11)\n",
414  s->denoise_strength);
415  return AVERROR_INVALIDDATA;
416  }
417  s->denoise_tilt_corr = !!(flags & 0x40);
418  s->dc_level = (flags >> 7) & 0xF;
419  s->lsp_q_mode = !!(flags & 0x2000);
420  s->lsp_def_mode = !!(flags & 0x4000);
421  lsp16_flag = flags & 0x1000;
422  if (lsp16_flag) {
423  s->lsps = 16;
424  } else {
425  s->lsps = 10;
426  }
427  for (n = 0; n < s->lsps; n++)
428  s->prev_lsps[n] = M_PI * (n + 1.0) / (s->lsps + 1.0);
429 
430  init_get_bits(&s->gb, ctx->extradata + 22, (ctx->extradata_size - 22) << 3);
431  if (decode_vbmtree(&s->gb, s->vbm_tree) < 0) {
432  av_log(ctx, AV_LOG_ERROR, "Invalid VBM tree; broken extradata?\n");
433  return AVERROR_INVALIDDATA;
434  }
435 
436  s->min_pitch_val = ((ctx->sample_rate << 8) / 400 + 50) >> 8;
437  s->max_pitch_val = ((ctx->sample_rate << 8) * 37 / 2000 + 50) >> 8;
438  pitch_range = s->max_pitch_val - s->min_pitch_val;
439  if (pitch_range <= 0) {
440  av_log(ctx, AV_LOG_ERROR, "Invalid pitch range; broken extradata?\n");
441  return AVERROR_INVALIDDATA;
442  }
443  s->pitch_nbits = av_ceil_log2(pitch_range);
444  s->last_pitch_val = 40;
446  s->history_nsamples = s->max_pitch_val + 8;
447 
449  int min_sr = ((((1 << 8) - 50) * 400) + 0xFF) >> 8,
450  max_sr = ((((MAX_SIGNAL_HISTORY - 8) << 8) + 205) * 2000 / 37) >> 8;
451 
452  av_log(ctx, AV_LOG_ERROR,
453  "Unsupported samplerate %d (min=%d, max=%d)\n",
454  ctx->sample_rate, min_sr, max_sr); // 322-22097 Hz
455 
456  return AVERROR(ENOSYS);
457  }
458 
459  s->block_conv_table[0] = s->min_pitch_val;
460  s->block_conv_table[1] = (pitch_range * 25) >> 6;
461  s->block_conv_table[2] = (pitch_range * 44) >> 6;
462  s->block_conv_table[3] = s->max_pitch_val - 1;
463  s->block_delta_pitch_hrange = (pitch_range >> 3) & ~0xF;
464  if (s->block_delta_pitch_hrange <= 0) {
465  av_log(ctx, AV_LOG_ERROR, "Invalid delta pitch hrange; broken extradata?\n");
466  return AVERROR_INVALIDDATA;
467  }
468  s->block_delta_pitch_nbits = 1 + av_ceil_log2(s->block_delta_pitch_hrange);
470  s->block_conv_table[3] + 1 +
471  2 * (s->block_conv_table[1] - 2 * s->min_pitch_val);
472  s->block_pitch_nbits = av_ceil_log2(s->block_pitch_range);
473 
474  ctx->channels = 1;
477 
478  return 0;
479 }
480 
481 /**
482  * @name Postfilter functions
483  * Postfilter functions (gain control, wiener denoise filter, DC filter,
484  * kalman smoothening, plus surrounding code to wrap it)
485  * @{
486  */
487 /**
488  * Adaptive gain control (as used in postfilter).
489  *
490  * Identical to #ff_adaptive_gain_control() in acelp_vectors.c, except
491  * that the energy here is calculated using sum(abs(...)), whereas the
492  * other codecs (e.g. AMR-NB, SIPRO) use sqrt(dotproduct(...)).
493  *
494  * @param out output buffer for filtered samples
495  * @param in input buffer containing the samples as they are after the
496  * postfilter steps so far
497  * @param speech_synth input buffer containing speech synth before postfilter
498  * @param size input buffer size
499  * @param alpha exponential filter factor
500  * @param gain_mem pointer to filter memory (single float)
501  */
502 static void adaptive_gain_control(float *out, const float *in,
503  const float *speech_synth,
504  int size, float alpha, float *gain_mem)
505 {
506  int i;
507  float speech_energy = 0.0, postfilter_energy = 0.0, gain_scale_factor;
508  float mem = *gain_mem;
509 
510  for (i = 0; i < size; i++) {
511  speech_energy += fabsf(speech_synth[i]);
512  postfilter_energy += fabsf(in[i]);
513  }
514  gain_scale_factor = postfilter_energy == 0.0 ? 0.0 :
515  (1.0 - alpha) * speech_energy / postfilter_energy;
516 
517  for (i = 0; i < size; i++) {
518  mem = alpha * mem + gain_scale_factor;
519  out[i] = in[i] * mem;
520  }
521 
522  *gain_mem = mem;
523 }
524 
525 /**
526  * Kalman smoothing function.
527  *
528  * This function looks back pitch +/- 3 samples back into history to find
529  * the best fitting curve (that one giving the optimal gain of the two
530  * signals, i.e. the highest dot product between the two), and then
531  * uses that signal history to smoothen the output of the speech synthesis
532  * filter.
533  *
534  * @param s WMA Voice decoding context
535  * @param pitch pitch of the speech signal
536  * @param in input speech signal
537  * @param out output pointer for smoothened signal
538  * @param size input/output buffer size
539  *
540  * @returns -1 if no smoothening took place, e.g. because no optimal
541  * fit could be found, or 0 on success.
542  */
543 static int kalman_smoothen(WMAVoiceContext *s, int pitch,
544  const float *in, float *out, int size)
545 {
546  int n;
547  float optimal_gain = 0, dot;
548  const float *ptr = &in[-FFMAX(s->min_pitch_val, pitch - 3)],
549  *end = &in[-FFMIN(s->max_pitch_val, pitch + 3)],
550  *best_hist_ptr = NULL;
551 
552  /* find best fitting point in history */
553  do {
554  dot = avpriv_scalarproduct_float_c(in, ptr, size);
555  if (dot > optimal_gain) {
556  optimal_gain = dot;
557  best_hist_ptr = ptr;
558  }
559  } while (--ptr >= end);
560 
561  if (optimal_gain <= 0)
562  return -1;
563  dot = avpriv_scalarproduct_float_c(best_hist_ptr, best_hist_ptr, size);
564  if (dot <= 0) // would be 1.0
565  return -1;
566 
567  if (optimal_gain <= dot) {
568  dot = dot / (dot + 0.6 * optimal_gain); // 0.625-1.000
569  } else
570  dot = 0.625;
571 
572  /* actual smoothing */
573  for (n = 0; n < size; n++)
574  out[n] = best_hist_ptr[n] + dot * (in[n] - best_hist_ptr[n]);
575 
576  return 0;
577 }
578 
579 /**
580  * Get the tilt factor of a formant filter from its transfer function
581  * @see #tilt_factor() in amrnbdec.c, which does essentially the same,
582  * but somehow (??) it does a speech synthesis filter in the
583  * middle, which is missing here
584  *
585  * @param lpcs LPC coefficients
586  * @param n_lpcs Size of LPC buffer
587  * @returns the tilt factor
588  */
589 static float tilt_factor(const float *lpcs, int n_lpcs)
590 {
591  float rh0, rh1;
592 
593  rh0 = 1.0 + avpriv_scalarproduct_float_c(lpcs, lpcs, n_lpcs);
594  rh1 = lpcs[0] + avpriv_scalarproduct_float_c(lpcs, &lpcs[1], n_lpcs - 1);
595 
596  return rh1 / rh0;
597 }
598 
599 /**
600  * Derive denoise filter coefficients (in real domain) from the LPCs.
601  */
602 static void calc_input_response(WMAVoiceContext *s, float *lpcs,
603  int fcb_type, float *coeffs, int remainder)
604 {
605  float last_coeff, min = 15.0, max = -15.0;
606  float irange, angle_mul, gain_mul, range, sq;
607  int n, idx;
608 
609  /* Create frequency power spectrum of speech input (i.e. RDFT of LPCs) */
610  s->rdft.rdft_calc(&s->rdft, lpcs);
611 #define log_range(var, assign) do { \
612  float tmp = log10f(assign); var = tmp; \
613  max = FFMAX(max, tmp); min = FFMIN(min, tmp); \
614  } while (0)
615  log_range(last_coeff, lpcs[1] * lpcs[1]);
616  for (n = 1; n < 64; n++)
617  log_range(lpcs[n], lpcs[n * 2] * lpcs[n * 2] +
618  lpcs[n * 2 + 1] * lpcs[n * 2 + 1]);
619  log_range(lpcs[0], lpcs[0] * lpcs[0]);
620 #undef log_range
621  range = max - min;
622  lpcs[64] = last_coeff;
623 
624  /* Now, use this spectrum to pick out these frequencies with higher
625  * (relative) power/energy (which we then take to be "not noise"),
626  * and set up a table (still in lpc[]) of (relative) gains per frequency.
627  * These frequencies will be maintained, while others ("noise") will be
628  * decreased in the filter output. */
629  irange = 64.0 / range; // so irange*(max-value) is in the range [0, 63]
630  gain_mul = range * (fcb_type == FCB_TYPE_HARDCODED ? (5.0 / 13.0) :
631  (5.0 / 14.7));
632  angle_mul = gain_mul * (8.0 * M_LN10 / M_PI);
633  for (n = 0; n <= 64; n++) {
634  float pwr;
635 
636  idx = FFMAX(0, lrint((max - lpcs[n]) * irange) - 1);
638  lpcs[n] = angle_mul * pwr;
639 
640  /* 70.57 =~ 1/log10(1.0331663) */
641  idx = (pwr * gain_mul - 0.0295) * 70.570526123;
642  if (idx > 127) { // fall back if index falls outside table range
643  coeffs[n] = wmavoice_energy_table[127] *
644  powf(1.0331663, idx - 127);
645  } else
646  coeffs[n] = wmavoice_energy_table[FFMAX(0, idx)];
647  }
648 
649  /* calculate the Hilbert transform of the gains, which we do (since this
650  * is a sine input) by doing a phase shift (in theory, H(sin())=cos()).
651  * Hilbert_Transform(RDFT(x)) = Laplace_Transform(x), which calculates the
652  * "moment" of the LPCs in this filter. */
653  s->dct.dct_calc(&s->dct, lpcs);
654  s->dst.dct_calc(&s->dst, lpcs);
655 
656  /* Split out the coefficient indexes into phase/magnitude pairs */
657  idx = 255 + av_clip(lpcs[64], -255, 255);
658  coeffs[0] = coeffs[0] * s->cos[idx];
659  idx = 255 + av_clip(lpcs[64] - 2 * lpcs[63], -255, 255);
660  last_coeff = coeffs[64] * s->cos[idx];
661  for (n = 63;; n--) {
662  idx = 255 + av_clip(-lpcs[64] - 2 * lpcs[n - 1], -255, 255);
663  coeffs[n * 2 + 1] = coeffs[n] * s->sin[idx];
664  coeffs[n * 2] = coeffs[n] * s->cos[idx];
665 
666  if (!--n) break;
667 
668  idx = 255 + av_clip( lpcs[64] - 2 * lpcs[n - 1], -255, 255);
669  coeffs[n * 2 + 1] = coeffs[n] * s->sin[idx];
670  coeffs[n * 2] = coeffs[n] * s->cos[idx];
671  }
672  coeffs[1] = last_coeff;
673 
674  /* move into real domain */
675  s->irdft.rdft_calc(&s->irdft, coeffs);
676 
677  /* tilt correction and normalize scale */
678  memset(&coeffs[remainder], 0, sizeof(coeffs[0]) * (128 - remainder));
679  if (s->denoise_tilt_corr) {
680  float tilt_mem = 0;
681 
682  coeffs[remainder - 1] = 0;
683  ff_tilt_compensation(&tilt_mem,
684  -1.8 * tilt_factor(coeffs, remainder - 1),
685  coeffs, remainder);
686  }
687  sq = (1.0 / 64.0) * sqrtf(1 / avpriv_scalarproduct_float_c(coeffs, coeffs,
688  remainder));
689  for (n = 0; n < remainder; n++)
690  coeffs[n] *= sq;
691 }
692 
693 /**
694  * This function applies a Wiener filter on the (noisy) speech signal as
695  * a means to denoise it.
696  *
697  * - take RDFT of LPCs to get the power spectrum of the noise + speech;
698  * - using this power spectrum, calculate (for each frequency) the Wiener
699  * filter gain, which depends on the frequency power and desired level
700  * of noise subtraction (when set too high, this leads to artifacts)
701  * We can do this symmetrically over the X-axis (so 0-4kHz is the inverse
702  * of 4-8kHz);
703  * - by doing a phase shift, calculate the Hilbert transform of this array
704  * of per-frequency filter-gains to get the filtering coefficients;
705  * - smoothen/normalize/de-tilt these filter coefficients as desired;
706  * - take RDFT of noisy sound, apply the coefficients and take its IRDFT
707  * to get the denoised speech signal;
708  * - the leftover (i.e. output of the IRDFT on denoised speech data beyond
709  * the frame boundary) are saved and applied to subsequent frames by an
710  * overlap-add method (otherwise you get clicking-artifacts).
711  *
712  * @param s WMA Voice decoding context
713  * @param fcb_type Frame (codebook) type
714  * @param synth_pf input: the noisy speech signal, output: denoised speech
715  * data; should be 16-byte aligned (for ASM purposes)
716  * @param size size of the speech data
717  * @param lpcs LPCs used to synthesize this frame's speech data
718  */
720  float *synth_pf, int size,
721  const float *lpcs)
722 {
723  int remainder, lim, n;
724 
725  if (fcb_type != FCB_TYPE_SILENCE) {
726  float *tilted_lpcs = s->tilted_lpcs_pf,
727  *coeffs = s->denoise_coeffs_pf, tilt_mem = 0;
728 
729  tilted_lpcs[0] = 1.0;
730  memcpy(&tilted_lpcs[1], lpcs, sizeof(lpcs[0]) * s->lsps);
731  memset(&tilted_lpcs[s->lsps + 1], 0,
732  sizeof(tilted_lpcs[0]) * (128 - s->lsps - 1));
733  ff_tilt_compensation(&tilt_mem, 0.7 * tilt_factor(lpcs, s->lsps),
734  tilted_lpcs, s->lsps + 2);
735 
736  /* The IRDFT output (127 samples for 7-bit filter) beyond the frame
737  * size is applied to the next frame. All input beyond this is zero,
738  * and thus all output beyond this will go towards zero, hence we can
739  * limit to min(size-1, 127-size) as a performance consideration. */
740  remainder = FFMIN(127 - size, size - 1);
741  calc_input_response(s, tilted_lpcs, fcb_type, coeffs, remainder);
742 
743  /* apply coefficients (in frequency spectrum domain), i.e. complex
744  * number multiplication */
745  memset(&synth_pf[size], 0, sizeof(synth_pf[0]) * (128 - size));
746  s->rdft.rdft_calc(&s->rdft, synth_pf);
747  s->rdft.rdft_calc(&s->rdft, coeffs);
748  synth_pf[0] *= coeffs[0];
749  synth_pf[1] *= coeffs[1];
750  for (n = 1; n < 64; n++) {
751  float v1 = synth_pf[n * 2], v2 = synth_pf[n * 2 + 1];
752  synth_pf[n * 2] = v1 * coeffs[n * 2] - v2 * coeffs[n * 2 + 1];
753  synth_pf[n * 2 + 1] = v2 * coeffs[n * 2] + v1 * coeffs[n * 2 + 1];
754  }
755  s->irdft.rdft_calc(&s->irdft, synth_pf);
756  }
757 
758  /* merge filter output with the history of previous runs */
759  if (s->denoise_filter_cache_size) {
760  lim = FFMIN(s->denoise_filter_cache_size, size);
761  for (n = 0; n < lim; n++)
762  synth_pf[n] += s->denoise_filter_cache[n];
763  s->denoise_filter_cache_size -= lim;
764  memmove(s->denoise_filter_cache, &s->denoise_filter_cache[size],
766  }
767 
768  /* move remainder of filter output into a cache for future runs */
769  if (fcb_type != FCB_TYPE_SILENCE) {
770  lim = FFMIN(remainder, s->denoise_filter_cache_size);
771  for (n = 0; n < lim; n++)
772  s->denoise_filter_cache[n] += synth_pf[size + n];
773  if (lim < remainder) {
774  memcpy(&s->denoise_filter_cache[lim], &synth_pf[size + lim],
775  sizeof(s->denoise_filter_cache[0]) * (remainder - lim));
776  s->denoise_filter_cache_size = remainder;
777  }
778  }
779 }
780 
781 /**
782  * Averaging projection filter, the postfilter used in WMAVoice.
783  *
784  * This uses the following steps:
785  * - A zero-synthesis filter (generate excitation from synth signal)
786  * - Kalman smoothing on excitation, based on pitch
787  * - Re-synthesized smoothened output
788  * - Iterative Wiener denoise filter
789  * - Adaptive gain filter
790  * - DC filter
791  *
792  * @param s WMAVoice decoding context
793  * @param synth Speech synthesis output (before postfilter)
794  * @param samples Output buffer for filtered samples
795  * @param size Buffer size of synth & samples
796  * @param lpcs Generated LPCs used for speech synthesis
797  * @param zero_exc_pf destination for zero synthesis filter (16-byte aligned)
798  * @param fcb_type Frame type (silence, hardcoded, AW-pulses or FCB-pulses)
799  * @param pitch Pitch of the input signal
800  */
801 static void postfilter(WMAVoiceContext *s, const float *synth,
802  float *samples, int size,
803  const float *lpcs, float *zero_exc_pf,
804  int fcb_type, int pitch)
805 {
806  float synth_filter_in_buf[MAX_FRAMESIZE / 2],
807  *synth_pf = &s->synth_filter_out_buf[MAX_LSPS_ALIGN16],
808  *synth_filter_in = zero_exc_pf;
809 
810  av_assert0(size <= MAX_FRAMESIZE / 2);
811 
812  /* generate excitation from input signal */
813  ff_celp_lp_zero_synthesis_filterf(zero_exc_pf, lpcs, synth, size, s->lsps);
814 
815  if (fcb_type >= FCB_TYPE_AW_PULSES &&
816  !kalman_smoothen(s, pitch, zero_exc_pf, synth_filter_in_buf, size))
817  synth_filter_in = synth_filter_in_buf;
818 
819  /* re-synthesize speech after smoothening, and keep history */
820  ff_celp_lp_synthesis_filterf(synth_pf, lpcs,
821  synth_filter_in, size, s->lsps);
822  memcpy(&synth_pf[-s->lsps], &synth_pf[size - s->lsps],
823  sizeof(synth_pf[0]) * s->lsps);
824 
825  wiener_denoise(s, fcb_type, synth_pf, size, lpcs);
826 
827  adaptive_gain_control(samples, synth_pf, synth, size, 0.99,
828  &s->postfilter_agc);
829 
830  if (s->dc_level > 8) {
831  /* remove ultra-low frequency DC noise / highpass filter;
832  * coefficients are identical to those used in SIPR decoding,
833  * and very closely resemble those used in AMR-NB decoding. */
835  (const float[2]) { -1.99997, 1.0 },
836  (const float[2]) { -1.9330735188, 0.93589198496 },
837  0.93980580475, s->dcf_mem, size);
838  }
839 }
840 /**
841  * @}
842  */
843 
844 /**
845  * Dequantize LSPs
846  * @param lsps output pointer to the array that will hold the LSPs
847  * @param num number of LSPs to be dequantized
848  * @param values quantized values, contains n_stages values
849  * @param sizes range (i.e. max value) of each quantized value
850  * @param n_stages number of dequantization runs
851  * @param table dequantization table to be used
852  * @param mul_q LSF multiplier
853  * @param base_q base (lowest) LSF values
854  */
855 static void dequant_lsps(double *lsps, int num,
856  const uint16_t *values,
857  const uint16_t *sizes,
858  int n_stages, const uint8_t *table,
859  const double *mul_q,
860  const double *base_q)
861 {
862  int n, m;
863 
864  memset(lsps, 0, num * sizeof(*lsps));
865  for (n = 0; n < n_stages; n++) {
866  const uint8_t *t_off = &table[values[n] * num];
867  double base = base_q[n], mul = mul_q[n];
868 
869  for (m = 0; m < num; m++)
870  lsps[m] += base + mul * t_off[m];
871 
872  table += sizes[n] * num;
873  }
874 }
875 
876 /**
877  * @name LSP dequantization routines
878  * LSP dequantization routines, for 10/16LSPs and independent/residual coding.
879  * lsp10i() consumes 24 bits; lsp10r() consumes an additional 24 bits;
880  * lsp16i() consumes 34 bits; lsp16r() consumes an additional 26 bits.
881  * @{
882  */
883 /**
884  * Parse 10 independently-coded LSPs.
885  */
886 static void dequant_lsp10i(GetBitContext *gb, double *lsps)
887 {
888  static const uint16_t vec_sizes[4] = { 256, 64, 32, 32 };
889  static const double mul_lsf[4] = {
890  5.2187144800e-3, 1.4626986422e-3,
891  9.6179549166e-4, 1.1325736225e-3
892  };
893  static const double base_lsf[4] = {
894  M_PI * -2.15522e-1, M_PI * -6.1646e-2,
895  M_PI * -3.3486e-2, M_PI * -5.7408e-2
896  };
897  uint16_t v[4];
898 
899  v[0] = get_bits(gb, 8);
900  v[1] = get_bits(gb, 6);
901  v[2] = get_bits(gb, 5);
902  v[3] = get_bits(gb, 5);
903 
904  dequant_lsps(lsps, 10, v, vec_sizes, 4, wmavoice_dq_lsp10i,
905  mul_lsf, base_lsf);
906 }
907 
908 /**
909  * Parse 10 independently-coded LSPs, and then derive the tables to
910  * generate LSPs for the other frames from them (residual coding).
911  */
913  double *i_lsps, const double *old,
914  double *a1, double *a2, int q_mode)
915 {
916  static const uint16_t vec_sizes[3] = { 128, 64, 64 };
917  static const double mul_lsf[3] = {
918  2.5807601174e-3, 1.2354460219e-3, 1.1763821673e-3
919  };
920  static const double base_lsf[3] = {
921  M_PI * -1.07448e-1, M_PI * -5.2706e-2, M_PI * -5.1634e-2
922  };
923  const float (*ipol_tab)[2][10] = q_mode ?
925  uint16_t interpol, v[3];
926  int n;
927 
928  dequant_lsp10i(gb, i_lsps);
929 
930  interpol = get_bits(gb, 5);
931  v[0] = get_bits(gb, 7);
932  v[1] = get_bits(gb, 6);
933  v[2] = get_bits(gb, 6);
934 
935  for (n = 0; n < 10; n++) {
936  double delta = old[n] - i_lsps[n];
937  a1[n] = ipol_tab[interpol][0][n] * delta + i_lsps[n];
938  a1[10 + n] = ipol_tab[interpol][1][n] * delta + i_lsps[n];
939  }
940 
941  dequant_lsps(a2, 20, v, vec_sizes, 3, wmavoice_dq_lsp10r,
942  mul_lsf, base_lsf);
943 }
944 
945 /**
946  * Parse 16 independently-coded LSPs.
947  */
948 static void dequant_lsp16i(GetBitContext *gb, double *lsps)
949 {
950  static const uint16_t vec_sizes[5] = { 256, 64, 128, 64, 128 };
951  static const double mul_lsf[5] = {
952  3.3439586280e-3, 6.9908173703e-4,
953  3.3216608306e-3, 1.0334960326e-3,
954  3.1899104283e-3
955  };
956  static const double base_lsf[5] = {
957  M_PI * -1.27576e-1, M_PI * -2.4292e-2,
958  M_PI * -1.28094e-1, M_PI * -3.2128e-2,
959  M_PI * -1.29816e-1
960  };
961  uint16_t v[5];
962 
963  v[0] = get_bits(gb, 8);
964  v[1] = get_bits(gb, 6);
965  v[2] = get_bits(gb, 7);
966  v[3] = get_bits(gb, 6);
967  v[4] = get_bits(gb, 7);
968 
969  dequant_lsps( lsps, 5, v, vec_sizes, 2,
970  wmavoice_dq_lsp16i1, mul_lsf, base_lsf);
971  dequant_lsps(&lsps[5], 5, &v[2], &vec_sizes[2], 2,
972  wmavoice_dq_lsp16i2, &mul_lsf[2], &base_lsf[2]);
973  dequant_lsps(&lsps[10], 6, &v[4], &vec_sizes[4], 1,
974  wmavoice_dq_lsp16i3, &mul_lsf[4], &base_lsf[4]);
975 }
976 
977 /**
978  * Parse 16 independently-coded LSPs, and then derive the tables to
979  * generate LSPs for the other frames from them (residual coding).
980  */
982  double *i_lsps, const double *old,
983  double *a1, double *a2, int q_mode)
984 {
985  static const uint16_t vec_sizes[3] = { 128, 128, 128 };
986  static const double mul_lsf[3] = {
987  1.2232979501e-3, 1.4062241527e-3, 1.6114744851e-3
988  };
989  static const double base_lsf[3] = {
990  M_PI * -5.5830e-2, M_PI * -5.2908e-2, M_PI * -5.4776e-2
991  };
992  const float (*ipol_tab)[2][16] = q_mode ?
994  uint16_t interpol, v[3];
995  int n;
996 
997  dequant_lsp16i(gb, i_lsps);
998 
999  interpol = get_bits(gb, 5);
1000  v[0] = get_bits(gb, 7);
1001  v[1] = get_bits(gb, 7);
1002  v[2] = get_bits(gb, 7);
1003 
1004  for (n = 0; n < 16; n++) {
1005  double delta = old[n] - i_lsps[n];
1006  a1[n] = ipol_tab[interpol][0][n] * delta + i_lsps[n];
1007  a1[16 + n] = ipol_tab[interpol][1][n] * delta + i_lsps[n];
1008  }
1009 
1010  dequant_lsps( a2, 10, v, vec_sizes, 1,
1011  wmavoice_dq_lsp16r1, mul_lsf, base_lsf);
1012  dequant_lsps(&a2[10], 10, &v[1], &vec_sizes[1], 1,
1013  wmavoice_dq_lsp16r2, &mul_lsf[1], &base_lsf[1]);
1014  dequant_lsps(&a2[20], 12, &v[2], &vec_sizes[2], 1,
1015  wmavoice_dq_lsp16r3, &mul_lsf[2], &base_lsf[2]);
1016 }
1017 
1018 /**
1019  * @}
1020  * @name Pitch-adaptive window coding functions
1021  * The next few functions are for pitch-adaptive window coding.
1022  * @{
1023  */
1024 /**
1025  * Parse the offset of the first pitch-adaptive window pulses, and
1026  * the distribution of pulses between the two blocks in this frame.
1027  * @param s WMA Voice decoding context private data
1028  * @param gb bit I/O context
1029  * @param pitch pitch for each block in this frame
1030  */
1032  const int *pitch)
1033 {
1034  static const int16_t start_offset[94] = {
1035  -11, -9, -7, -5, -3, -1, 1, 3, 5, 7, 9, 11,
1036  13, 15, 18, 17, 19, 20, 21, 22, 23, 24, 25, 26,
1037  27, 28, 29, 30, 31, 32, 33, 35, 37, 39, 41, 43,
1038  45, 47, 49, 51, 53, 55, 57, 59, 61, 63, 65, 67,
1039  69, 71, 73, 75, 77, 79, 81, 83, 85, 87, 89, 91,
1040  93, 95, 97, 99, 101, 103, 105, 107, 109, 111, 113, 115,
1041  117, 119, 121, 123, 125, 127, 129, 131, 133, 135, 137, 139,
1042  141, 143, 145, 147, 149, 151, 153, 155, 157, 159
1043  };
1044  int bits, offset;
1045 
1046  /* position of pulse */
1047  s->aw_idx_is_ext = 0;
1048  if ((bits = get_bits(gb, 6)) >= 54) {
1049  s->aw_idx_is_ext = 1;
1050  bits += (bits - 54) * 3 + get_bits(gb, 2);
1051  }
1052 
1053  /* for a repeated pulse at pulse_off with a pitch_lag of pitch[], count
1054  * the distribution of the pulses in each block contained in this frame. */
1055  s->aw_pulse_range = FFMIN(pitch[0], pitch[1]) > 32 ? 24 : 16;
1056  for (offset = start_offset[bits]; offset < 0; offset += pitch[0]) ;
1057  s->aw_n_pulses[0] = (pitch[0] - 1 + MAX_FRAMESIZE / 2 - offset) / pitch[0];
1058  s->aw_first_pulse_off[0] = offset - s->aw_pulse_range / 2;
1059  offset += s->aw_n_pulses[0] * pitch[0];
1060  s->aw_n_pulses[1] = (pitch[1] - 1 + MAX_FRAMESIZE - offset) / pitch[1];
1061  s->aw_first_pulse_off[1] = offset - (MAX_FRAMESIZE + s->aw_pulse_range) / 2;
1062 
1063  /* if continuing from a position before the block, reset position to
1064  * start of block (when corrected for the range over which it can be
1065  * spread in aw_pulse_set1()). */
1066  if (start_offset[bits] < MAX_FRAMESIZE / 2) {
1067  while (s->aw_first_pulse_off[1] - pitch[1] + s->aw_pulse_range > 0)
1068  s->aw_first_pulse_off[1] -= pitch[1];
1069  if (start_offset[bits] < 0)
1070  while (s->aw_first_pulse_off[0] - pitch[0] + s->aw_pulse_range > 0)
1071  s->aw_first_pulse_off[0] -= pitch[0];
1072  }
1073 }
1074 
1075 /**
1076  * Apply second set of pitch-adaptive window pulses.
1077  * @param s WMA Voice decoding context private data
1078  * @param gb bit I/O context
1079  * @param block_idx block index in frame [0, 1]
1080  * @param fcb structure containing fixed codebook vector info
1081  * @return -1 on error, 0 otherwise
1082  */
1084  int block_idx, AMRFixed *fcb)
1085 {
1086  uint16_t use_mask_mem[9]; // only 5 are used, rest is padding
1087  uint16_t *use_mask = use_mask_mem + 2;
1088  /* in this function, idx is the index in the 80-bit (+ padding) use_mask
1089  * bit-array. Since use_mask consists of 16-bit values, the lower 4 bits
1090  * of idx are the position of the bit within a particular item in the
1091  * array (0 being the most significant bit, and 15 being the least
1092  * significant bit), and the remainder (>> 4) is the index in the
1093  * use_mask[]-array. This is faster and uses less memory than using a
1094  * 80-byte/80-int array. */
1095  int pulse_off = s->aw_first_pulse_off[block_idx],
1096  pulse_start, n, idx, range, aidx, start_off = 0;
1097 
1098  /* set offset of first pulse to within this block */
1099  if (s->aw_n_pulses[block_idx] > 0)
1100  while (pulse_off + s->aw_pulse_range < 1)
1101  pulse_off += fcb->pitch_lag;
1102 
1103  /* find range per pulse */
1104  if (s->aw_n_pulses[0] > 0) {
1105  if (block_idx == 0) {
1106  range = 32;
1107  } else /* block_idx = 1 */ {
1108  range = 8;
1109  if (s->aw_n_pulses[block_idx] > 0)
1110  pulse_off = s->aw_next_pulse_off_cache;
1111  }
1112  } else
1113  range = 16;
1114  pulse_start = s->aw_n_pulses[block_idx] > 0 ? pulse_off - range / 2 : 0;
1115 
1116  /* aw_pulse_set1() already applies pulses around pulse_off (to be exactly,
1117  * in the range of [pulse_off, pulse_off + s->aw_pulse_range], and thus
1118  * we exclude that range from being pulsed again in this function. */
1119  memset(&use_mask[-2], 0, 2 * sizeof(use_mask[0]));
1120  memset( use_mask, -1, 5 * sizeof(use_mask[0]));
1121  memset(&use_mask[5], 0, 2 * sizeof(use_mask[0]));
1122  if (s->aw_n_pulses[block_idx] > 0)
1123  for (idx = pulse_off; idx < MAX_FRAMESIZE / 2; idx += fcb->pitch_lag) {
1124  int excl_range = s->aw_pulse_range; // always 16 or 24
1125  uint16_t *use_mask_ptr = &use_mask[idx >> 4];
1126  int first_sh = 16 - (idx & 15);
1127  *use_mask_ptr++ &= 0xFFFFu << first_sh;
1128  excl_range -= first_sh;
1129  if (excl_range >= 16) {
1130  *use_mask_ptr++ = 0;
1131  *use_mask_ptr &= 0xFFFF >> (excl_range - 16);
1132  } else
1133  *use_mask_ptr &= 0xFFFF >> excl_range;
1134  }
1135 
1136  /* find the 'aidx'th offset that is not excluded */
1137  aidx = get_bits(gb, s->aw_n_pulses[0] > 0 ? 5 - 2 * block_idx : 4);
1138  for (n = 0; n <= aidx; pulse_start++) {
1139  for (idx = pulse_start; idx < 0; idx += fcb->pitch_lag) ;
1140  if (idx >= MAX_FRAMESIZE / 2) { // find from zero
1141  if (use_mask[0]) idx = 0x0F;
1142  else if (use_mask[1]) idx = 0x1F;
1143  else if (use_mask[2]) idx = 0x2F;
1144  else if (use_mask[3]) idx = 0x3F;
1145  else if (use_mask[4]) idx = 0x4F;
1146  else return -1;
1147  idx -= av_log2_16bit(use_mask[idx >> 4]);
1148  }
1149  if (use_mask[idx >> 4] & (0x8000 >> (idx & 15))) {
1150  use_mask[idx >> 4] &= ~(0x8000 >> (idx & 15));
1151  n++;
1152  start_off = idx;
1153  }
1154  }
1155 
1156  fcb->x[fcb->n] = start_off;
1157  fcb->y[fcb->n] = get_bits1(gb) ? -1.0 : 1.0;
1158  fcb->n++;
1159 
1160  /* set offset for next block, relative to start of that block */
1161  n = (MAX_FRAMESIZE / 2 - start_off) % fcb->pitch_lag;
1162  s->aw_next_pulse_off_cache = n ? fcb->pitch_lag - n : 0;
1163  return 0;
1164 }
1165 
1166 /**
1167  * Apply first set of pitch-adaptive window pulses.
1168  * @param s WMA Voice decoding context private data
1169  * @param gb bit I/O context
1170  * @param block_idx block index in frame [0, 1]
1171  * @param fcb storage location for fixed codebook pulse info
1172  */
1174  int block_idx, AMRFixed *fcb)
1175 {
1176  int val = get_bits(gb, 12 - 2 * (s->aw_idx_is_ext && !block_idx));
1177  float v;
1178 
1179  if (s->aw_n_pulses[block_idx] > 0) {
1180  int n, v_mask, i_mask, sh, n_pulses;
1181 
1182  if (s->aw_pulse_range == 24) { // 3 pulses, 1:sign + 3:index each
1183  n_pulses = 3;
1184  v_mask = 8;
1185  i_mask = 7;
1186  sh = 4;
1187  } else { // 4 pulses, 1:sign + 2:index each
1188  n_pulses = 4;
1189  v_mask = 4;
1190  i_mask = 3;
1191  sh = 3;
1192  }
1193 
1194  for (n = n_pulses - 1; n >= 0; n--, val >>= sh) {
1195  fcb->y[fcb->n] = (val & v_mask) ? -1.0 : 1.0;
1196  fcb->x[fcb->n] = (val & i_mask) * n_pulses + n +
1197  s->aw_first_pulse_off[block_idx];
1198  while (fcb->x[fcb->n] < 0)
1199  fcb->x[fcb->n] += fcb->pitch_lag;
1200  if (fcb->x[fcb->n] < MAX_FRAMESIZE / 2)
1201  fcb->n++;
1202  }
1203  } else {
1204  int num2 = (val & 0x1FF) >> 1, delta, idx;
1205 
1206  if (num2 < 1 * 79) { delta = 1; idx = num2 + 1; }
1207  else if (num2 < 2 * 78) { delta = 3; idx = num2 + 1 - 1 * 77; }
1208  else if (num2 < 3 * 77) { delta = 5; idx = num2 + 1 - 2 * 76; }
1209  else { delta = 7; idx = num2 + 1 - 3 * 75; }
1210  v = (val & 0x200) ? -1.0 : 1.0;
1211 
1212  fcb->no_repeat_mask |= 3 << fcb->n;
1213  fcb->x[fcb->n] = idx - delta;
1214  fcb->y[fcb->n] = v;
1215  fcb->x[fcb->n + 1] = idx;
1216  fcb->y[fcb->n + 1] = (val & 1) ? -v : v;
1217  fcb->n += 2;
1218  }
1219 }
1220 
1221 /**
1222  * @}
1223  *
1224  * Generate a random number from frame_cntr and block_idx, which will live
1225  * in the range [0, 1000 - block_size] (so it can be used as an index in a
1226  * table of size 1000 of which you want to read block_size entries).
1227  *
1228  * @param frame_cntr current frame number
1229  * @param block_num current block index
1230  * @param block_size amount of entries we want to read from a table
1231  * that has 1000 entries
1232  * @return a (non-)random number in the [0, 1000 - block_size] range.
1233  */
1234 static int pRNG(int frame_cntr, int block_num, int block_size)
1235 {
1236  /* array to simplify the calculation of z:
1237  * y = (x % 9) * 5 + 6;
1238  * z = (49995 * x) / y;
1239  * Since y only has 9 values, we can remove the division by using a
1240  * LUT and using FASTDIV-style divisions. For each of the 9 values
1241  * of y, we can rewrite z as:
1242  * z = x * (49995 / y) + x * ((49995 % y) / y)
1243  * In this table, each col represents one possible value of y, the
1244  * first number is 49995 / y, and the second is the FASTDIV variant
1245  * of 49995 % y / y. */
1246  static const unsigned int div_tbl[9][2] = {
1247  { 8332, 3 * 715827883U }, // y = 6
1248  { 4545, 0 * 390451573U }, // y = 11
1249  { 3124, 11 * 268435456U }, // y = 16
1250  { 2380, 15 * 204522253U }, // y = 21
1251  { 1922, 23 * 165191050U }, // y = 26
1252  { 1612, 23 * 138547333U }, // y = 31
1253  { 1388, 27 * 119304648U }, // y = 36
1254  { 1219, 16 * 104755300U }, // y = 41
1255  { 1086, 39 * 93368855U } // y = 46
1256  };
1257  unsigned int z, y, x = MUL16(block_num, 1877) + frame_cntr;
1258  if (x >= 0xFFFF) x -= 0xFFFF; // max value of x is 8*1877+0xFFFE=0x13AA6,
1259  // so this is effectively a modulo (%)
1260  y = x - 9 * MULH(477218589, x); // x % 9
1261  z = (uint16_t) (x * div_tbl[y][0] + UMULH(x, div_tbl[y][1]));
1262  // z = x * 49995 / (y * 5 + 6)
1263  return z % (1000 - block_size);
1264 }
1265 
1266 /**
1267  * Parse hardcoded signal for a single block.
1268  * @note see #synth_block().
1269  */
1271  int block_idx, int size,
1272  const struct frame_type_desc *frame_desc,
1273  float *excitation)
1274 {
1275  float gain;
1276  int n, r_idx;
1277 
1278  av_assert0(size <= MAX_FRAMESIZE);
1279 
1280  /* Set the offset from which we start reading wmavoice_std_codebook */
1281  if (frame_desc->fcb_type == FCB_TYPE_SILENCE) {
1282  r_idx = pRNG(s->frame_cntr, block_idx, size);
1283  gain = s->silence_gain;
1284  } else /* FCB_TYPE_HARDCODED */ {
1285  r_idx = get_bits(gb, 8);
1286  gain = wmavoice_gain_universal[get_bits(gb, 6)];
1287  }
1288 
1289  /* Clear gain prediction parameters */
1290  memset(s->gain_pred_err, 0, sizeof(s->gain_pred_err));
1291 
1292  /* Apply gain to hardcoded codebook and use that as excitation signal */
1293  for (n = 0; n < size; n++)
1294  excitation[n] = wmavoice_std_codebook[r_idx + n] * gain;
1295 }
1296 
1297 /**
1298  * Parse FCB/ACB signal for a single block.
1299  * @note see #synth_block().
1300  */
1302  int block_idx, int size,
1303  int block_pitch_sh2,
1304  const struct frame_type_desc *frame_desc,
1305  float *excitation)
1306 {
1307  static const float gain_coeff[6] = {
1308  0.8169, -0.06545, 0.1726, 0.0185, -0.0359, 0.0458
1309  };
1310  float pulses[MAX_FRAMESIZE / 2], pred_err, acb_gain, fcb_gain;
1311  int n, idx, gain_weight;
1312  AMRFixed fcb;
1313 
1314  av_assert0(size <= MAX_FRAMESIZE / 2);
1315  memset(pulses, 0, sizeof(*pulses) * size);
1316 
1317  fcb.pitch_lag = block_pitch_sh2 >> 2;
1318  fcb.pitch_fac = 1.0;
1319  fcb.no_repeat_mask = 0;
1320  fcb.n = 0;
1321 
1322  /* For the other frame types, this is where we apply the innovation
1323  * (fixed) codebook pulses of the speech signal. */
1324  if (frame_desc->fcb_type == FCB_TYPE_AW_PULSES) {
1325  aw_pulse_set1(s, gb, block_idx, &fcb);
1326  if (aw_pulse_set2(s, gb, block_idx, &fcb)) {
1327  /* Conceal the block with silence and return.
1328  * Skip the correct amount of bits to read the next
1329  * block from the correct offset. */
1330  int r_idx = pRNG(s->frame_cntr, block_idx, size);
1331 
1332  for (n = 0; n < size; n++)
1333  excitation[n] =
1334  wmavoice_std_codebook[r_idx + n] * s->silence_gain;
1335  skip_bits(gb, 7 + 1);
1336  return;
1337  }
1338  } else /* FCB_TYPE_EXC_PULSES */ {
1339  int offset_nbits = 5 - frame_desc->log_n_blocks;
1340 
1341  fcb.no_repeat_mask = -1;
1342  /* similar to ff_decode_10_pulses_35bits(), but with single pulses
1343  * (instead of double) for a subset of pulses */
1344  for (n = 0; n < 5; n++) {
1345  float sign;
1346  int pos1, pos2;
1347 
1348  sign = get_bits1(gb) ? 1.0 : -1.0;
1349  pos1 = get_bits(gb, offset_nbits);
1350  fcb.x[fcb.n] = n + 5 * pos1;
1351  fcb.y[fcb.n++] = sign;
1352  if (n < frame_desc->dbl_pulses) {
1353  pos2 = get_bits(gb, offset_nbits);
1354  fcb.x[fcb.n] = n + 5 * pos2;
1355  fcb.y[fcb.n++] = (pos1 < pos2) ? -sign : sign;
1356  }
1357  }
1358  }
1359  ff_set_fixed_vector(pulses, &fcb, 1.0, size);
1360 
1361  /* Calculate gain for adaptive & fixed codebook signal.
1362  * see ff_amr_set_fixed_gain(). */
1363  idx = get_bits(gb, 7);
1365  gain_coeff, 6) -
1366  5.2409161640 + wmavoice_gain_codebook_fcb[idx]);
1367  acb_gain = wmavoice_gain_codebook_acb[idx];
1368  pred_err = av_clipf(wmavoice_gain_codebook_fcb[idx],
1369  -2.9957322736 /* log(0.05) */,
1370  1.6094379124 /* log(5.0) */);
1371 
1372  gain_weight = 8 >> frame_desc->log_n_blocks;
1373  memmove(&s->gain_pred_err[gain_weight], s->gain_pred_err,
1374  sizeof(*s->gain_pred_err) * (6 - gain_weight));
1375  for (n = 0; n < gain_weight; n++)
1376  s->gain_pred_err[n] = pred_err;
1377 
1378  /* Calculation of adaptive codebook */
1379  if (frame_desc->acb_type == ACB_TYPE_ASYMMETRIC) {
1380  int len;
1381  for (n = 0; n < size; n += len) {
1382  int next_idx_sh16;
1383  int abs_idx = block_idx * size + n;
1384  int pitch_sh16 = (s->last_pitch_val << 16) +
1385  s->pitch_diff_sh16 * abs_idx;
1386  int pitch = (pitch_sh16 + 0x6FFF) >> 16;
1387  int idx_sh16 = ((pitch << 16) - pitch_sh16) * 8 + 0x58000;
1388  idx = idx_sh16 >> 16;
1389  if (s->pitch_diff_sh16) {
1390  if (s->pitch_diff_sh16 > 0) {
1391  next_idx_sh16 = (idx_sh16) &~ 0xFFFF;
1392  } else
1393  next_idx_sh16 = (idx_sh16 + 0x10000) &~ 0xFFFF;
1394  len = av_clip((idx_sh16 - next_idx_sh16) / s->pitch_diff_sh16 / 8,
1395  1, size - n);
1396  } else
1397  len = size;
1398 
1399  ff_acelp_interpolatef(&excitation[n], &excitation[n - pitch],
1401  idx, 9, len);
1402  }
1403  } else /* ACB_TYPE_HAMMING */ {
1404  int block_pitch = block_pitch_sh2 >> 2;
1405  idx = block_pitch_sh2 & 3;
1406  if (idx) {
1407  ff_acelp_interpolatef(excitation, &excitation[-block_pitch],
1409  idx, 8, size);
1410  } else
1411  av_memcpy_backptr((uint8_t *) excitation, sizeof(float) * block_pitch,
1412  sizeof(float) * size);
1413  }
1414 
1415  /* Interpolate ACB/FCB and use as excitation signal */
1416  ff_weighted_vector_sumf(excitation, excitation, pulses,
1417  acb_gain, fcb_gain, size);
1418 }
1419 
1420 /**
1421  * Parse data in a single block.
1422  *
1423  * @param s WMA Voice decoding context private data
1424  * @param gb bit I/O context
1425  * @param block_idx index of the to-be-read block
1426  * @param size amount of samples to be read in this block
1427  * @param block_pitch_sh2 pitch for this block << 2
1428  * @param lsps LSPs for (the end of) this frame
1429  * @param prev_lsps LSPs for the last frame
1430  * @param frame_desc frame type descriptor
1431  * @param excitation target memory for the ACB+FCB interpolated signal
1432  * @param synth target memory for the speech synthesis filter output
1433  * @return 0 on success, <0 on error.
1434  */
1436  int block_idx, int size,
1437  int block_pitch_sh2,
1438  const double *lsps, const double *prev_lsps,
1439  const struct frame_type_desc *frame_desc,
1440  float *excitation, float *synth)
1441 {
1442  double i_lsps[MAX_LSPS];
1443  float lpcs[MAX_LSPS];
1444  float fac;
1445  int n;
1446 
1447  if (frame_desc->acb_type == ACB_TYPE_NONE)
1448  synth_block_hardcoded(s, gb, block_idx, size, frame_desc, excitation);
1449  else
1450  synth_block_fcb_acb(s, gb, block_idx, size, block_pitch_sh2,
1451  frame_desc, excitation);
1452 
1453  /* convert interpolated LSPs to LPCs */
1454  fac = (block_idx + 0.5) / frame_desc->n_blocks;
1455  for (n = 0; n < s->lsps; n++) // LSF -> LSP
1456  i_lsps[n] = cos(prev_lsps[n] + fac * (lsps[n] - prev_lsps[n]));
1457  ff_acelp_lspd2lpc(i_lsps, lpcs, s->lsps >> 1);
1458 
1459  /* Speech synthesis */
1460  ff_celp_lp_synthesis_filterf(synth, lpcs, excitation, size, s->lsps);
1461 }
1462 
1463 /**
1464  * Synthesize output samples for a single frame.
1465  *
1466  * @param ctx WMA Voice decoder context
1467  * @param gb bit I/O context (s->gb or one for cross-packet superframes)
1468  * @param frame_idx Frame number within superframe [0-2]
1469  * @param samples pointer to output sample buffer, has space for at least 160
1470  * samples
1471  * @param lsps LSP array
1472  * @param prev_lsps array of previous frame's LSPs
1473  * @param excitation target buffer for excitation signal
1474  * @param synth target buffer for synthesized speech data
1475  * @return 0 on success, <0 on error.
1476  */
1477 static int synth_frame(AVCodecContext *ctx, GetBitContext *gb, int frame_idx,
1478  float *samples,
1479  const double *lsps, const double *prev_lsps,
1480  float *excitation, float *synth)
1481 {
1482  WMAVoiceContext *s = ctx->priv_data;
1483  int n, n_blocks_x2, log_n_blocks_x2, av_uninit(cur_pitch_val);
1484  int pitch[MAX_BLOCKS], av_uninit(last_block_pitch);
1485 
1486  /* Parse frame type ("frame header"), see frame_descs */
1487  int bd_idx = s->vbm_tree[get_vlc2(gb, frame_type_vlc.table, 6, 3)], block_nsamples;
1488 
1489  if (bd_idx < 0) {
1490  av_log(ctx, AV_LOG_ERROR,
1491  "Invalid frame type VLC code, skipping\n");
1492  return AVERROR_INVALIDDATA;
1493  }
1494 
1495  block_nsamples = MAX_FRAMESIZE / frame_descs[bd_idx].n_blocks;
1496 
1497  /* Pitch calculation for ACB_TYPE_ASYMMETRIC ("pitch-per-frame") */
1498  if (frame_descs[bd_idx].acb_type == ACB_TYPE_ASYMMETRIC) {
1499  /* Pitch is provided per frame, which is interpreted as the pitch of
1500  * the last sample of the last block of this frame. We can interpolate
1501  * the pitch of other blocks (and even pitch-per-sample) by gradually
1502  * incrementing/decrementing prev_frame_pitch to cur_pitch_val. */
1503  n_blocks_x2 = frame_descs[bd_idx].n_blocks << 1;
1504  log_n_blocks_x2 = frame_descs[bd_idx].log_n_blocks + 1;
1505  cur_pitch_val = s->min_pitch_val + get_bits(gb, s->pitch_nbits);
1506  cur_pitch_val = FFMIN(cur_pitch_val, s->max_pitch_val - 1);
1507  if (s->last_acb_type == ACB_TYPE_NONE ||
1508  20 * abs(cur_pitch_val - s->last_pitch_val) >
1509  (cur_pitch_val + s->last_pitch_val))
1510  s->last_pitch_val = cur_pitch_val;
1511 
1512  /* pitch per block */
1513  for (n = 0; n < frame_descs[bd_idx].n_blocks; n++) {
1514  int fac = n * 2 + 1;
1515 
1516  pitch[n] = (MUL16(fac, cur_pitch_val) +
1517  MUL16((n_blocks_x2 - fac), s->last_pitch_val) +
1518  frame_descs[bd_idx].n_blocks) >> log_n_blocks_x2;
1519  }
1520 
1521  /* "pitch-diff-per-sample" for calculation of pitch per sample */
1522  s->pitch_diff_sh16 =
1523  ((cur_pitch_val - s->last_pitch_val) << 16) / MAX_FRAMESIZE;
1524  }
1525 
1526  /* Global gain (if silence) and pitch-adaptive window coordinates */
1527  switch (frame_descs[bd_idx].fcb_type) {
1528  case FCB_TYPE_SILENCE:
1530  break;
1531  case FCB_TYPE_AW_PULSES:
1532  aw_parse_coords(s, gb, pitch);
1533  break;
1534  }
1535 
1536  for (n = 0; n < frame_descs[bd_idx].n_blocks; n++) {
1537  int bl_pitch_sh2;
1538 
1539  /* Pitch calculation for ACB_TYPE_HAMMING ("pitch-per-block") */
1540  switch (frame_descs[bd_idx].acb_type) {
1541  case ACB_TYPE_HAMMING: {
1542  /* Pitch is given per block. Per-block pitches are encoded as an
1543  * absolute value for the first block, and then delta values
1544  * relative to this value) for all subsequent blocks. The scale of
1545  * this pitch value is semi-logarithmic compared to its use in the
1546  * decoder, so we convert it to normal scale also. */
1547  int block_pitch,
1548  t1 = (s->block_conv_table[1] - s->block_conv_table[0]) << 2,
1549  t2 = (s->block_conv_table[2] - s->block_conv_table[1]) << 1,
1550  t3 = s->block_conv_table[3] - s->block_conv_table[2] + 1;
1551 
1552  if (n == 0) {
1553  block_pitch = get_bits(gb, s->block_pitch_nbits);
1554  } else
1555  block_pitch = last_block_pitch - s->block_delta_pitch_hrange +
1557  /* Convert last_ so that any next delta is within _range */
1558  last_block_pitch = av_clip(block_pitch,
1560  s->block_pitch_range -
1562 
1563  /* Convert semi-log-style scale back to normal scale */
1564  if (block_pitch < t1) {
1565  bl_pitch_sh2 = (s->block_conv_table[0] << 2) + block_pitch;
1566  } else {
1567  block_pitch -= t1;
1568  if (block_pitch < t2) {
1569  bl_pitch_sh2 =
1570  (s->block_conv_table[1] << 2) + (block_pitch << 1);
1571  } else {
1572  block_pitch -= t2;
1573  if (block_pitch < t3) {
1574  bl_pitch_sh2 =
1575  (s->block_conv_table[2] + block_pitch) << 2;
1576  } else
1577  bl_pitch_sh2 = s->block_conv_table[3] << 2;
1578  }
1579  }
1580  pitch[n] = bl_pitch_sh2 >> 2;
1581  break;
1582  }
1583 
1584  case ACB_TYPE_ASYMMETRIC: {
1585  bl_pitch_sh2 = pitch[n] << 2;
1586  break;
1587  }
1588 
1589  default: // ACB_TYPE_NONE has no pitch
1590  bl_pitch_sh2 = 0;
1591  break;
1592  }
1593 
1594  synth_block(s, gb, n, block_nsamples, bl_pitch_sh2,
1595  lsps, prev_lsps, &frame_descs[bd_idx],
1596  &excitation[n * block_nsamples],
1597  &synth[n * block_nsamples]);
1598  }
1599 
1600  /* Averaging projection filter, if applicable. Else, just copy samples
1601  * from synthesis buffer */
1602  if (s->do_apf) {
1603  double i_lsps[MAX_LSPS];
1604  float lpcs[MAX_LSPS];
1605 
1606  for (n = 0; n < s->lsps; n++) // LSF -> LSP
1607  i_lsps[n] = cos(0.5 * (prev_lsps[n] + lsps[n]));
1608  ff_acelp_lspd2lpc(i_lsps, lpcs, s->lsps >> 1);
1609  postfilter(s, synth, samples, 80, lpcs,
1610  &s->zero_exc_pf[s->history_nsamples + MAX_FRAMESIZE * frame_idx],
1611  frame_descs[bd_idx].fcb_type, pitch[0]);
1612 
1613  for (n = 0; n < s->lsps; n++) // LSF -> LSP
1614  i_lsps[n] = cos(lsps[n]);
1615  ff_acelp_lspd2lpc(i_lsps, lpcs, s->lsps >> 1);
1616  postfilter(s, &synth[80], &samples[80], 80, lpcs,
1617  &s->zero_exc_pf[s->history_nsamples + MAX_FRAMESIZE * frame_idx + 80],
1618  frame_descs[bd_idx].fcb_type, pitch[0]);
1619  } else
1620  memcpy(samples, synth, 160 * sizeof(synth[0]));
1621 
1622  /* Cache values for next frame */
1623  s->frame_cntr++;
1624  if (s->frame_cntr >= 0xFFFF) s->frame_cntr -= 0xFFFF; // i.e. modulo (%)
1625  s->last_acb_type = frame_descs[bd_idx].acb_type;
1626  switch (frame_descs[bd_idx].acb_type) {
1627  case ACB_TYPE_NONE:
1628  s->last_pitch_val = 0;
1629  break;
1630  case ACB_TYPE_ASYMMETRIC:
1631  s->last_pitch_val = cur_pitch_val;
1632  break;
1633  case ACB_TYPE_HAMMING:
1634  s->last_pitch_val = pitch[frame_descs[bd_idx].n_blocks - 1];
1635  break;
1636  }
1637 
1638  return 0;
1639 }
1640 
1641 /**
1642  * Ensure minimum value for first item, maximum value for last value,
1643  * proper spacing between each value and proper ordering.
1644  *
1645  * @param lsps array of LSPs
1646  * @param num size of LSP array
1647  *
1648  * @note basically a double version of #ff_acelp_reorder_lsf(), might be
1649  * useful to put in a generic location later on. Parts are also
1650  * present in #ff_set_min_dist_lsf() + #ff_sort_nearly_sorted_floats(),
1651  * which is in float.
1652  */
1653 static void stabilize_lsps(double *lsps, int num)
1654 {
1655  int n, m, l;
1656 
1657  /* set minimum value for first, maximum value for last and minimum
1658  * spacing between LSF values.
1659  * Very similar to ff_set_min_dist_lsf(), but in double. */
1660  lsps[0] = FFMAX(lsps[0], 0.0015 * M_PI);
1661  for (n = 1; n < num; n++)
1662  lsps[n] = FFMAX(lsps[n], lsps[n - 1] + 0.0125 * M_PI);
1663  lsps[num - 1] = FFMIN(lsps[num - 1], 0.9985 * M_PI);
1664 
1665  /* reorder (looks like one-time / non-recursed bubblesort).
1666  * Very similar to ff_sort_nearly_sorted_floats(), but in double. */
1667  for (n = 1; n < num; n++) {
1668  if (lsps[n] < lsps[n - 1]) {
1669  for (m = 1; m < num; m++) {
1670  double tmp = lsps[m];
1671  for (l = m - 1; l >= 0; l--) {
1672  if (lsps[l] <= tmp) break;
1673  lsps[l + 1] = lsps[l];
1674  }
1675  lsps[l + 1] = tmp;
1676  }
1677  break;
1678  }
1679  }
1680 }
1681 
1682 /**
1683  * Synthesize output samples for a single superframe. If we have any data
1684  * cached in s->sframe_cache, that will be used instead of whatever is loaded
1685  * in s->gb.
1686  *
1687  * WMA Voice superframes contain 3 frames, each containing 160 audio samples,
1688  * to give a total of 480 samples per frame. See #synth_frame() for frame
1689  * parsing. In addition to 3 frames, superframes can also contain the LSPs
1690  * (if these are globally specified for all frames (residually); they can
1691  * also be specified individually per-frame. See the s->has_residual_lsps
1692  * option), and can specify the number of samples encoded in this superframe
1693  * (if less than 480), usually used to prevent blanks at track boundaries.
1694  *
1695  * @param ctx WMA Voice decoder context
1696  * @return 0 on success, <0 on error or 1 if there was not enough data to
1697  * fully parse the superframe
1698  */
1700  int *got_frame_ptr)
1701 {
1702  WMAVoiceContext *s = ctx->priv_data;
1703  GetBitContext *gb = &s->gb, s_gb;
1704  int n, res, n_samples = MAX_SFRAMESIZE;
1705  double lsps[MAX_FRAMES][MAX_LSPS];
1706  const double *mean_lsf = s->lsps == 16 ?
1708  float excitation[MAX_SIGNAL_HISTORY + MAX_SFRAMESIZE + 12];
1709  float synth[MAX_LSPS + MAX_SFRAMESIZE];
1710  float *samples;
1711 
1712  memcpy(synth, s->synth_history,
1713  s->lsps * sizeof(*synth));
1714  memcpy(excitation, s->excitation_history,
1715  s->history_nsamples * sizeof(*excitation));
1716 
1717  if (s->sframe_cache_size > 0) {
1718  gb = &s_gb;
1720  s->sframe_cache_size = 0;
1721  }
1722 
1723  /* First bit is speech/music bit, it differentiates between WMAVoice
1724  * speech samples (the actual codec) and WMAVoice music samples, which
1725  * are really WMAPro-in-WMAVoice-superframes. I've never seen those in
1726  * the wild yet. */
1727  if (!get_bits1(gb)) {
1728  avpriv_request_sample(ctx, "WMAPro-in-WMAVoice");
1729  return AVERROR_PATCHWELCOME;
1730  }
1731 
1732  /* (optional) nr. of samples in superframe; always <= 480 and >= 0 */
1733  if (get_bits1(gb)) {
1734  if ((n_samples = get_bits(gb, 12)) > MAX_SFRAMESIZE) {
1735  av_log(ctx, AV_LOG_ERROR,
1736  "Superframe encodes > %d samples (%d), not allowed\n",
1737  MAX_SFRAMESIZE, n_samples);
1738  return AVERROR_INVALIDDATA;
1739  }
1740  }
1741 
1742  /* Parse LSPs, if global for the superframe (can also be per-frame). */
1743  if (s->has_residual_lsps) {
1744  double prev_lsps[MAX_LSPS], a1[MAX_LSPS * 2], a2[MAX_LSPS * 2];
1745 
1746  for (n = 0; n < s->lsps; n++)
1747  prev_lsps[n] = s->prev_lsps[n] - mean_lsf[n];
1748 
1749  if (s->lsps == 10) {
1750  dequant_lsp10r(gb, lsps[2], prev_lsps, a1, a2, s->lsp_q_mode);
1751  } else /* s->lsps == 16 */
1752  dequant_lsp16r(gb, lsps[2], prev_lsps, a1, a2, s->lsp_q_mode);
1753 
1754  for (n = 0; n < s->lsps; n++) {
1755  lsps[0][n] = mean_lsf[n] + (a1[n] - a2[n * 2]);
1756  lsps[1][n] = mean_lsf[n] + (a1[s->lsps + n] - a2[n * 2 + 1]);
1757  lsps[2][n] += mean_lsf[n];
1758  }
1759  for (n = 0; n < 3; n++)
1760  stabilize_lsps(lsps[n], s->lsps);
1761  }
1762 
1763  /* synth_superframe can run multiple times per packet
1764  * free potential previous frame */
1765  av_frame_unref(frame);
1766 
1767  /* get output buffer */
1768  frame->nb_samples = MAX_SFRAMESIZE;
1769  if ((res = ff_get_buffer(ctx, frame, 0)) < 0)
1770  return res;
1771  frame->nb_samples = n_samples;
1772  samples = (float *)frame->data[0];
1773 
1774  /* Parse frames, optionally preceded by per-frame (independent) LSPs. */
1775  for (n = 0; n < 3; n++) {
1776  if (!s->has_residual_lsps) {
1777  int m;
1778 
1779  if (s->lsps == 10) {
1780  dequant_lsp10i(gb, lsps[n]);
1781  } else /* s->lsps == 16 */
1782  dequant_lsp16i(gb, lsps[n]);
1783 
1784  for (m = 0; m < s->lsps; m++)
1785  lsps[n][m] += mean_lsf[m];
1786  stabilize_lsps(lsps[n], s->lsps);
1787  }
1788 
1789  if ((res = synth_frame(ctx, gb, n,
1790  &samples[n * MAX_FRAMESIZE],
1791  lsps[n], n == 0 ? s->prev_lsps : lsps[n - 1],
1792  &excitation[s->history_nsamples + n * MAX_FRAMESIZE],
1793  &synth[s->lsps + n * MAX_FRAMESIZE]))) {
1794  *got_frame_ptr = 0;
1795  return res;
1796  }
1797  }
1798 
1799  /* Statistics? FIXME - we don't check for length, a slight overrun
1800  * will be caught by internal buffer padding, and anything else
1801  * will be skipped, not read. */
1802  if (get_bits1(gb)) {
1803  res = get_bits(gb, 4);
1804  skip_bits(gb, 10 * (res + 1));
1805  }
1806 
1807  if (get_bits_left(gb) < 0) {
1808  wmavoice_flush(ctx);
1809  return AVERROR_INVALIDDATA;
1810  }
1811 
1812  *got_frame_ptr = 1;
1813 
1814  /* Update history */
1815  memcpy(s->prev_lsps, lsps[2],
1816  s->lsps * sizeof(*s->prev_lsps));
1817  memcpy(s->synth_history, &synth[MAX_SFRAMESIZE],
1818  s->lsps * sizeof(*synth));
1819  memcpy(s->excitation_history, &excitation[MAX_SFRAMESIZE],
1820  s->history_nsamples * sizeof(*excitation));
1821  if (s->do_apf)
1822  memmove(s->zero_exc_pf, &s->zero_exc_pf[MAX_SFRAMESIZE],
1823  s->history_nsamples * sizeof(*s->zero_exc_pf));
1824 
1825  return 0;
1826 }
1827 
1828 /**
1829  * Parse the packet header at the start of each packet (input data to this
1830  * decoder).
1831  *
1832  * @param s WMA Voice decoding context private data
1833  * @return <0 on error, nb_superframes on success.
1834  */
1836 {
1837  GetBitContext *gb = &s->gb;
1838  unsigned int res, n_superframes = 0;
1839 
1840  skip_bits(gb, 4); // packet sequence number
1841  s->has_residual_lsps = get_bits1(gb);
1842  do {
1843  res = get_bits(gb, 6); // number of superframes per packet
1844  // (minus first one if there is spillover)
1845  n_superframes += res;
1846  } while (res == 0x3F);
1848 
1849  return get_bits_left(gb) >= 0 ? n_superframes : AVERROR_INVALIDDATA;
1850 }
1851 
1852 /**
1853  * Copy (unaligned) bits from gb/data/size to pb.
1854  *
1855  * @param pb target buffer to copy bits into
1856  * @param data source buffer to copy bits from
1857  * @param size size of the source data, in bytes
1858  * @param gb bit I/O context specifying the current position in the source.
1859  * data. This function might use this to align the bit position to
1860  * a whole-byte boundary before calling #avpriv_copy_bits() on aligned
1861  * source data
1862  * @param nbits the amount of bits to copy from source to target
1863  *
1864  * @note after calling this function, the current position in the input bit
1865  * I/O context is undefined.
1866  */
1867 static void copy_bits(PutBitContext *pb,
1868  const uint8_t *data, int size,
1869  GetBitContext *gb, int nbits)
1870 {
1871  int rmn_bytes, rmn_bits;
1872 
1873  rmn_bits = rmn_bytes = get_bits_left(gb);
1874  if (rmn_bits < nbits)
1875  return;
1876  if (nbits > pb->size_in_bits - put_bits_count(pb))
1877  return;
1878  rmn_bits &= 7; rmn_bytes >>= 3;
1879  if ((rmn_bits = FFMIN(rmn_bits, nbits)) > 0)
1880  put_bits(pb, rmn_bits, get_bits(gb, rmn_bits));
1881  avpriv_copy_bits(pb, data + size - rmn_bytes,
1882  FFMIN(nbits - rmn_bits, rmn_bytes << 3));
1883 }
1884 
1885 /**
1886  * Packet decoding: a packet is anything that the (ASF) demuxer contains,
1887  * and we expect that the demuxer / application provides it to us as such
1888  * (else you'll probably get garbage as output). Every packet has a size of
1889  * ctx->block_align bytes, starts with a packet header (see
1890  * #parse_packet_header()), and then a series of superframes. Superframe
1891  * boundaries may exceed packets, i.e. superframes can split data over
1892  * multiple (two) packets.
1893  *
1894  * For more information about frames, see #synth_superframe().
1895  */
1897  int *got_frame_ptr, AVPacket *avpkt)
1898 {
1899  WMAVoiceContext *s = ctx->priv_data;
1900  GetBitContext *gb = &s->gb;
1901  int size, res, pos;
1902 
1903  /* Packets are sometimes a multiple of ctx->block_align, with a packet
1904  * header at each ctx->block_align bytes. However, FFmpeg's ASF demuxer
1905  * feeds us ASF packets, which may concatenate multiple "codec" packets
1906  * in a single "muxer" packet, so we artificially emulate that by
1907  * capping the packet size at ctx->block_align. */
1908  for (size = avpkt->size; size > ctx->block_align; size -= ctx->block_align);
1909  init_get_bits(&s->gb, avpkt->data, size << 3);
1910 
1911  /* size == ctx->block_align is used to indicate whether we are dealing with
1912  * a new packet or a packet of which we already read the packet header
1913  * previously. */
1914  if (!(size % ctx->block_align)) { // new packet header
1915  if (!size) {
1916  s->spillover_nbits = 0;
1917  s->nb_superframes = 0;
1918  } else {
1919  if ((res = parse_packet_header(s)) < 0)
1920  return res;
1921  s->nb_superframes = res;
1922  }
1923 
1924  /* If the packet header specifies a s->spillover_nbits, then we want
1925  * to push out all data of the previous packet (+ spillover) before
1926  * continuing to parse new superframes in the current packet. */
1927  if (s->sframe_cache_size > 0) {
1928  int cnt = get_bits_count(gb);
1929  if (cnt + s->spillover_nbits > avpkt->size * 8) {
1930  s->spillover_nbits = avpkt->size * 8 - cnt;
1931  }
1932  copy_bits(&s->pb, avpkt->data, size, gb, s->spillover_nbits);
1933  flush_put_bits(&s->pb);
1935  if ((res = synth_superframe(ctx, data, got_frame_ptr)) == 0 &&
1936  *got_frame_ptr) {
1937  cnt += s->spillover_nbits;
1938  s->skip_bits_next = cnt & 7;
1939  res = cnt >> 3;
1940  return res;
1941  } else
1942  skip_bits_long (gb, s->spillover_nbits - cnt +
1943  get_bits_count(gb)); // resync
1944  } else if (s->spillover_nbits) {
1945  skip_bits_long(gb, s->spillover_nbits); // resync
1946  }
1947  } else if (s->skip_bits_next)
1948  skip_bits(gb, s->skip_bits_next);
1949 
1950  /* Try parsing superframes in current packet */
1951  s->sframe_cache_size = 0;
1952  s->skip_bits_next = 0;
1953  pos = get_bits_left(gb);
1954  if (s->nb_superframes-- == 0) {
1955  *got_frame_ptr = 0;
1956  return size;
1957  } else if (s->nb_superframes > 0) {
1958  if ((res = synth_superframe(ctx, data, got_frame_ptr)) < 0) {
1959  return res;
1960  } else if (*got_frame_ptr) {
1961  int cnt = get_bits_count(gb);
1962  s->skip_bits_next = cnt & 7;
1963  res = cnt >> 3;
1964  return res;
1965  }
1966  } else if ((s->sframe_cache_size = pos) > 0) {
1967  /* ... cache it for spillover in next packet */
1969  copy_bits(&s->pb, avpkt->data, size, gb, s->sframe_cache_size);
1970  // FIXME bad - just copy bytes as whole and add use the
1971  // skip_bits_next field
1972  }
1973 
1974  return size;
1975 }
1976 
1978 {
1979  WMAVoiceContext *s = ctx->priv_data;
1980 
1981  if (s->do_apf) {
1982  ff_rdft_end(&s->rdft);
1983  ff_rdft_end(&s->irdft);
1984  ff_dct_end(&s->dct);
1985  ff_dct_end(&s->dst);
1986  }
1987 
1988  return 0;
1989 }
1990 
1992  .name = "wmavoice",
1993  .long_name = NULL_IF_CONFIG_SMALL("Windows Media Audio Voice"),
1994  .type = AVMEDIA_TYPE_AUDIO,
1995  .id = AV_CODEC_ID_WMAVOICE,
1996  .priv_data_size = sizeof(WMAVoiceContext),
1998  .close = wmavoice_decode_end,
2001  .flush = wmavoice_flush,
2002 };
RDFTContext rdft
Definition: wmavoice.c:265
Description of frame types.
Definition: wmavoice.c:99
static void aw_pulse_set1(WMAVoiceContext *s, GetBitContext *gb, int block_idx, AMRFixed *fcb)
Apply first set of pitch-adaptive window pulses.
Definition: wmavoice.c:1173
av_cold void ff_rdft_end(RDFTContext *s)
Definition: rdft.c:114
static const uint8_t wmavoice_dq_lsp16r2[0x500]
#define NULL
Definition: coverity.c:32
const char const char void * val
Definition: avisynth_c.h:771
int do_apf
whether to apply the averaged projection filter (APF)
Definition: wmavoice.c:149
const char * s
Definition: avisynth_c.h:768
#define AVERROR_INVALIDDATA
Invalid data found when processing input.
Definition: error.h:59
static int pRNG(int frame_cntr, int block_num, int block_size)
Generate a random number from frame_cntr and block_idx, which will live in the range [0...
Definition: wmavoice.c:1234
static av_cold int decode_vbmtree(GetBitContext *gb, int8_t vbm_tree[25])
Set up the variable bit mode (VBM) tree from container extradata.
Definition: wmavoice.c:300
void ff_celp_lp_synthesis_filterf(float *out, const float *filter_coeffs, const float *in, int buffer_length, int filter_length)
LP synthesis filter.
Definition: celp_filters.c:84
float gain_pred_err[6]
cache for gain prediction
Definition: wmavoice.c:250
int size
static float alpha(float a)
This structure describes decoded (raw) audio or video data.
Definition: frame.h:218
Per-block pitch with signal generation using a Hamming sinc window function.
Definition: wmavoice.c:75
int aw_next_pulse_off_cache
the position (relative to start of the second block) at which pulses should start to be positioned...
Definition: wmavoice.c:241
int nb_superframes
number of superframes in current packet
Definition: wmavoice.c:249
static void flush(AVCodecContext *avctx)
float postfilter_agc
gain control memory, used in adaptive_gain_control()
Definition: wmavoice.c:271
void ff_acelp_apply_order_2_transfer_function(float *out, const float *in, const float zero_coeffs[2], const float pole_coeffs[2], float gain, float mem[2], int n)
Apply an order 2 rational transfer function in-place.
static void put_bits(Jpeg2000EncoderContext *s, int val, int n)
put n times val bit
Definition: j2kenc.c:207
static unsigned int get_bits(GetBitContext *s, int n)
Read 1-25 bits.
Definition: get_bits.h:269
static void postfilter(WMAVoiceContext *s, const float *synth, float *samples, int size, const float *lpcs, float *zero_exc_pf, int fcb_type, int pitch)
Averaging projection filter, the postfilter used in WMAVoice.
Definition: wmavoice.c:801
Memory handling functions.
void ff_weighted_vector_sumf(float *out, const float *in_a, const float *in_b, float weight_coeff_a, float weight_coeff_b, int length)
float implementation of weighted sum of two vectors.
static void skip_bits_long(GetBitContext *s, int n)
Skips the specified number of bits.
Definition: get_bits.h:212
static av_cold int init(AVCodecContext *avctx)
Definition: avrndec.c:35
#define INIT_VLC_STATIC(vlc, bits, a, b, c, d, e, f, g, static_size)
Definition: vlc.h:75
float synth_filter_out_buf[0x80+MAX_LSPS_ALIGN16]
aligned buffer for postfilter speech synthesis
Definition: wmavoice.c:283
static void aw_parse_coords(WMAVoiceContext *s, GetBitContext *gb, const int *pitch)
Parse the offset of the first pitch-adaptive window pulses, and the distribution of pulses between th...
Definition: wmavoice.c:1031
static const int8_t pulses[4]
Number of non-zero pulses in the MP-MLQ excitation.
Definition: g723_1.h:720
int x[10]
Definition: acelp_vectors.h:55
int size
Definition: avcodec.h:1431
int aw_n_pulses[2]
number of AW-pulses in each block; note that this number can be negative (in which case it basically ...
Definition: wmavoice.c:236
static av_cold void wmavoice_init_static_data(void)
Definition: wmavoice.c:314
static int interpol(MBContext *s, uint32_t *color, int x, int y, int linesize)
void avpriv_copy_bits(PutBitContext *pb, const uint8_t *src, int length)
Copy the content of src to the bitstream.
Definition: bitstream.c:64
static void stabilize_lsps(double *lsps, int num)
Ensure minimum value for first item, maximum value for last value, proper spacing between each value ...
Definition: wmavoice.c:1653
static const float wmavoice_gain_codebook_fcb[128]
static const uint8_t wmavoice_dq_lsp16i1[0x640]
#define a1
Definition: regdef.h:47
static const uint8_t wmavoice_dq_lsp16r1[0x500]
int spillover_nbits
number of bits of the previous packet&#39;s last superframe preceding this packet&#39;s first full superframe...
Definition: wmavoice.c:188
void ff_set_fixed_vector(float *out, const AMRFixed *in, float scale, int size)
Add fixed vector to an array from a sparse representation.
int block_pitch_nbits
number of bits used to specify the first block&#39;s pitch value
Definition: wmavoice.c:167
static const uint8_t wmavoice_dq_lsp16i3[0x300]
float pitch_fac
Definition: acelp_vectors.h:59
comfort noise during silence generated from a hardcoded (fixed) codebook with per-frame (low) gain va...
Definition: wmavoice.c:84
static int synth_frame(AVCodecContext *ctx, GetBitContext *gb, int frame_idx, float *samples, const double *lsps, const double *prev_lsps, float *excitation, float *synth)
Synthesize output samples for a single frame.
Definition: wmavoice.c:1477
static void calc_input_response(WMAVoiceContext *s, float *lpcs, int fcb_type, float *coeffs, int remainder)
Derive denoise filter coefficients (in real domain) from the LPCs.
Definition: wmavoice.c:602
static void dequant_lsp10i(GetBitContext *gb, double *lsps)
Parse 10 independently-coded LSPs.
Definition: wmavoice.c:886
AVCodec.
Definition: avcodec.h:3408
#define MAX_LSPS_ALIGN16
same as MAX_LSPS; needs to be multiple
Definition: wmavoice.c:49
int block_align
number of bytes per packet if constant and known or 0 Used by some WAV based audio codecs...
Definition: avcodec.h:2210
static void decode(AVCodecContext *dec_ctx, AVPacket *pkt, AVFrame *frame, FILE *outfile)
Definition: decode_audio.c:42
static int aw_pulse_set2(WMAVoiceContext *s, GetBitContext *gb, int block_idx, AMRFixed *fcb)
Apply second set of pitch-adaptive window pulses.
Definition: wmavoice.c:1083
static const float wmavoice_ipol1_coeffs[17 *9]
adaptive codebook with per-frame pitch, which we interpolate to get a per-sample pitch.
Definition: wmavoice.c:70
static const uint8_t wmavoice_dq_lsp16i2[0x3c0]
#define AV_CODEC_CAP_DELAY
Encoder or decoder requires flushing with NULL input at the end in order to give the complete and cor...
Definition: avcodec.h:984
#define av_assert0(cond)
assert() equivalent, that is always enabled.
Definition: avassert.h:37
int spillover_bitsize
number of bits used to specify spillover_nbits in the packet header = ceil(log2(ctx->block_align << 3...
Definition: wmavoice.c:142
void void avpriv_request_sample(void *avc, const char *msg,...) av_printf_format(2
Log a generic warning message about a missing feature.
int block_delta_pitch_nbits
number of bits used to specify the delta pitch between this and the last block&#39;s pitch value...
Definition: wmavoice.c:170
enum AVSampleFormat sample_fmt
audio sample format
Definition: avcodec.h:2181
int mem
Definition: avisynth_c.h:821
uint8_t
#define av_cold
Definition: attributes.h:82
Sparse representation for the algebraic codebook (fixed) vector.
Definition: acelp_vectors.h:53
static const uint8_t wmavoice_dq_lsp16r3[0x600]
float delta
DCTContext dct
Definition: wmavoice.c:267
static const float wmavoice_gain_codebook_acb[128]
hardcoded (fixed) codebook with per-block gain values
Definition: wmavoice.c:87
uint8_t log_n_blocks
log2(n_blocks)
Definition: wmavoice.c:102
int aw_first_pulse_off[2]
index of first sample to which to apply AW-pulses, or -0xff if unset
Definition: wmavoice.c:239
static av_cold int end(AVCodecContext *avctx)
Definition: avrndec.c:90
int has_residual_lsps
if set, superframes contain one set of LSPs that cover all frames, encoded as independent and residua...
Definition: wmavoice.c:192
float tilted_lpcs_pf[0x80]
aligned buffer for LPC tilting
Definition: wmavoice.c:279
uint8_t * extradata
some codecs need / can use extradata like Huffman tables.
Definition: avcodec.h:1618
static float tilt_factor(const float *lpcs, int n_lpcs)
Get the tilt factor of a formant filter from its transfer function.
Definition: wmavoice.c:589
#define u(width, name, range_min, range_max)
Definition: cbs_h2645.c:344
static const uint8_t wmavoice_dq_lsp10r[0x1400]
static AVFrame * frame
const char data[16]
Definition: mxf.c:90
static void dequant_lsps(double *lsps, int num, const uint16_t *values, const uint16_t *sizes, int n_stages, const uint8_t *table, const double *mul_q, const double *base_q)
Dequantize LSPs.
Definition: wmavoice.c:855
#define DECLARE_ALIGNED(n, t, v)
Declare a variable that is aligned in memory.
Definition: mem.h:112
static const float wmavoice_ipol2_coeffs[32]
Hamming-window sinc function (num = 32, x = [ 0, 31 ]): (0.54 + 0.46 * cos(2 * M_PI * x / (num - 1)))...
uint8_t * data
Definition: avcodec.h:1430
static int get_bits_count(const GetBitContext *s)
Definition: get_bits.h:200
static int flags
Definition: log.c:55
float dcf_mem[2]
DC filter history.
Definition: wmavoice.c:273
void av_memcpy_backptr(uint8_t *dst, int back, int cnt)
Overlapping memcpy() implementation.
Definition: mem.c:414
bitstream reader API header.
static av_cold void wmavoice_flush(AVCodecContext *ctx)
Definition: wmavoice.c:336
float synth_history[MAX_LSPS]
see excitation_history
Definition: wmavoice.c:255
Pitch-adaptive window (AW) pulse signals, used in particular for low-bitrate streams.
Definition: wmavoice.c:89
double prev_lsps[MAX_LSPS]
LSPs of the last frame of the previous superframe.
Definition: wmavoice.c:220
static void copy_bits(PutBitContext *pb, const uint8_t *data, int size, GetBitContext *gb, int nbits)
Copy (unaligned) bits from gb/data/size to pb.
Definition: wmavoice.c:1867
#define AVOnce
Definition: thread.h:159
#define av_log(a,...)
#define expf(x)
Definition: libm.h:283
#define U(x)
Definition: vp56_arith.h:37
static int get_bits_left(GetBitContext *gb)
Definition: get_bits.h:596
int size_in_bits
Definition: put_bits.h:39
#define AV_LOG_ERROR
Something went wrong and cannot losslessly be recovered.
Definition: log.h:176
static const double wmavoice_mean_lsf16[2][16]
int sframe_cache_size
set to >0 if we have data from an (incomplete) superframe from a previous packet that spilled over in...
Definition: wmavoice.c:204
static const float wmavoice_lsp10_intercoeff_b[32][2][10]
int block_pitch_range
range of the block pitch
Definition: wmavoice.c:169
static const float wmavoice_std_codebook[1000]
no adaptive codebook (only hardcoded fixed)
Definition: wmavoice.c:69
static const int sizes[][2]
Definition: img2dec.c:51
int last_acb_type
frame type [0-2] of the previous frame
Definition: wmavoice.c:223
#define AVERROR(e)
Definition: error.h:43
static const struct endianess table[]
#define NULL_IF_CONFIG_SMALL(x)
Return NULL if CONFIG_SMALL is true, otherwise the argument without modification. ...
Definition: internal.h:186
static const float wmavoice_gain_silence[256]
int denoise_filter_cache_size
samples in denoise_filter_cache
Definition: wmavoice.c:278
int history_nsamples
number of samples in history for signal prediction (through ACB)
Definition: wmavoice.c:145
static const uint8_t wmavoice_dq_lsp10i[0xf00]
Definition: wmavoice_data.h:33
static const float wmavoice_lsp10_intercoeff_a[32][2][10]
#define t1
Definition: regdef.h:29
static const float wmavoice_energy_table[128]
LUT for 1.071575641632 * pow(1.0331663, n - 127)
Windows Media Voice (WMAVoice) tables.
Definition: avfft.h:73
const char * name
Name of the codec implementation.
Definition: avcodec.h:3415
int no_repeat_mask
Definition: acelp_vectors.h:57
int denoise_tilt_corr
Whether to apply tilt correction to the Wiener filter coefficients (postfilter)
Definition: wmavoice.c:153
int aw_idx_is_ext
whether the AW index was encoded in 8 bits (instead of 6)
Definition: wmavoice.c:228
#define t3
Definition: regdef.h:31
static const uint8_t offset[127][2]
Definition: vf_spp.c:92
#define FFMAX(a, b)
Definition: common.h:94
uint16_t block_conv_table[4]
boundaries for block pitch unit/scale conversion
Definition: wmavoice.c:176
#define MUL16(ra, rb)
Definition: mathops.h:88
DCTContext dst
contexts for phase shift (in Hilbert transform, part of postfilter)
Definition: wmavoice.c:267
int lsp_def_mode
defines different sets of LSP defaults [0, 1]
Definition: wmavoice.c:160
Definition: vlc.h:26
uint64_t channel_layout
Audio channel layout.
Definition: avcodec.h:2224
static int put_bits_count(PutBitContext *s)
Definition: put_bits.h:85
#define powf(x, y)
Definition: libm.h:50
int skip_bits_next
number of bits to skip at the next call to wmavoice_decode_packet() (since they&#39;re part of the previo...
Definition: wmavoice.c:197
static void dequant_lsp16r(GetBitContext *gb, double *i_lsps, const double *old, double *a1, double *a2, int q_mode)
Parse 16 independently-coded LSPs, and then derive the tables to generate LSPs for the other frames f...
Definition: wmavoice.c:981
int min_pitch_val
base value for pitch parsing code
Definition: wmavoice.c:163
WMA Voice decoding context.
Definition: wmavoice.c:131
static void wiener_denoise(WMAVoiceContext *s, int fcb_type, float *synth_pf, int size, const float *lpcs)
This function applies a Wiener filter on the (noisy) speech signal as a means to denoise it...
Definition: wmavoice.c:719
int denoise_strength
strength of denoising in Wiener filter [0-11]
Definition: wmavoice.c:151
uint8_t sframe_cache[SFRAME_CACHE_MAXSIZE+AV_INPUT_BUFFER_PADDING_SIZE]
cache for superframe data split over multiple packets
Definition: wmavoice.c:201
audio channel layout utility functions
Definition: avfft.h:97
#define FFMIN(a, b)
Definition: common.h:96
#define log_range(var, assign)
#define MAX_LSPS
maximum filter order
Definition: wmavoice.c:48
static VLC frame_type_vlc
Frame type VLC coding.
Definition: wmavoice.c:63
int pitch_nbits
number of bits used to specify the pitch value in the frame header
Definition: wmavoice.c:165
#define MAX_BLOCKS
maximum number of blocks per frame
Definition: wmavoice.c:47
float denoise_coeffs_pf[0x80]
aligned buffer for denoise coefficients
Definition: wmavoice.c:281
void(* dct_calc)(struct DCTContext *s, FFTSample *data)
Definition: dct.h:38
static void dequant_lsp10r(GetBitContext *gb, double *i_lsps, const double *old, double *a1, double *a2, int q_mode)
Parse 10 independently-coded LSPs, and then derive the tables to generate LSPs for the other frames f...
Definition: wmavoice.c:912
float y[10]
Definition: acelp_vectors.h:56
AVFormatContext * ctx
Definition: movenc.c:48
static av_always_inline unsigned UMULH(unsigned a, unsigned b)
Definition: mathops.h:68
#define a2
Definition: regdef.h:48
Definition: dct.h:32
float sin[511]
Definition: wmavoice.c:269
static av_always_inline int get_vlc2(GetBitContext *s, VLC_TYPE(*table)[2], int bits, int max_depth)
Parse a vlc code.
Definition: get_bits.h:563
#define AV_RL32
Definition: intreadwrite.h:146
Definition: avfft.h:72
int n
Definition: avisynth_c.h:684
void(* rdft_calc)(struct RDFTContext *s, FFTSample *z)
Definition: rdft.h:38
if(ret< 0)
Definition: vf_mcdeint.c:279
static int kalman_smoothen(WMAVoiceContext *s, int pitch, const float *in, float *out, int size)
Kalman smoothing function.
Definition: wmavoice.c:543
void ff_tilt_compensation(float *mem, float tilt, float *samples, int size)
Apply tilt compensation filter, 1 - tilt * z-1.
static const float wmavoice_gain_universal[64]
void ff_acelp_lspd2lpc(const double *lsp, float *lpc, int lp_half_order)
Reconstruct LPC coefficients from the line spectral pair frequencies.
Definition: lsp.c:209
static av_cold int wmavoice_decode_init(AVCodecContext *ctx)
Set up decoder with parameters from demuxer (extradata etc.).
Definition: wmavoice.c:367
#define AVERROR_PATCHWELCOME
Not yet implemented in FFmpeg, patches welcome.
Definition: error.h:62
static const uint8_t last_coeff[3]
Definition: qdm2data.h:257
static const struct frame_type_desc frame_descs[17]
#define AV_ONCE_INIT
Definition: thread.h:160
float denoise_filter_cache[MAX_FRAMESIZE]
Definition: wmavoice.c:277
Libavcodec external API header.
int sample_rate
samples per second
Definition: avcodec.h:2173
void AAC_RENAME() ff_sine_window_init(INTFLOAT *window, int n)
Generate a sine window.
static int wmavoice_decode_packet(AVCodecContext *ctx, void *data, int *got_frame_ptr, AVPacket *avpkt)
Packet decoding: a packet is anything that the (ASF) demuxer contains, and we expect that the demuxer...
Definition: wmavoice.c:1896
main external API structure.
Definition: avcodec.h:1518
static int parse_packet_header(WMAVoiceContext *s)
Parse the packet header at the start of each packet (input data to this decoder). ...
Definition: wmavoice.c:1835
Innovation (fixed) codebook pulse sets in combinations of either single pulses or pulse pairs...
Definition: wmavoice.c:91
int ff_get_buffer(AVCodecContext *avctx, AVFrame *frame, int flags)
Get a buffer for a frame.
Definition: decode.c:1891
AVCodec ff_wmavoice_decoder
Definition: wmavoice.c:1991
int8_t vbm_tree[25]
converts VLC codes to frame type
Definition: wmavoice.c:140
int extradata_size
Definition: avcodec.h:1619
static unsigned int get_bits1(GetBitContext *s)
Definition: get_bits.h:321
static void synth_block(WMAVoiceContext *s, GetBitContext *gb, int block_idx, int size, int block_pitch_sh2, const double *lsps, const double *prev_lsps, const struct frame_type_desc *frame_desc, float *excitation, float *synth)
Parse data in a single block.
Definition: wmavoice.c:1435
static av_cold int wmavoice_decode_end(AVCodecContext *ctx)
Definition: wmavoice.c:1977
uint8_t pi<< 24) CONV_FUNC_GROUP(AV_SAMPLE_FMT_FLT, float, AV_SAMPLE_FMT_U8, uint8_t,(*(const uint8_t *) pi - 0x80) *(1.0f/(1<< 7))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_DBL, double, AV_SAMPLE_FMT_U8, uint8_t,(*(const uint8_t *) pi - 0x80) *(1.0/(1<< 7))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_U8, uint8_t, AV_SAMPLE_FMT_S16, int16_t,(*(const int16_t *) pi >> 8)+0x80) CONV_FUNC_GROUP(AV_SAMPLE_FMT_FLT, float, AV_SAMPLE_FMT_S16, int16_t, *(const int16_t *) pi *(1.0f/(1<< 15))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_DBL, double, AV_SAMPLE_FMT_S16, int16_t, *(const int16_t *) pi *(1.0/(1<< 15))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_U8, uint8_t, AV_SAMPLE_FMT_S32, int32_t,(*(const int32_t *) pi >> 24)+0x80) CONV_FUNC_GROUP(AV_SAMPLE_FMT_FLT, float, AV_SAMPLE_FMT_S32, int32_t, *(const int32_t *) pi *(1.0f/(1U<< 31))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_DBL, double, AV_SAMPLE_FMT_S32, int32_t, *(const int32_t *) pi *(1.0/(1U<< 31))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_U8, uint8_t, AV_SAMPLE_FMT_FLT, float, av_clip_uint8(lrintf(*(const float *) pi *(1<< 7))+0x80)) CONV_FUNC_GROUP(AV_SAMPLE_FMT_S16, int16_t, AV_SAMPLE_FMT_FLT, float, av_clip_int16(lrintf(*(const float *) pi *(1<< 15)))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_S32, int32_t, AV_SAMPLE_FMT_FLT, float, av_clipl_int32(llrintf(*(const float *) pi *(1U<< 31)))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_U8, uint8_t, AV_SAMPLE_FMT_DBL, double, av_clip_uint8(lrint(*(const double *) pi *(1<< 7))+0x80)) CONV_FUNC_GROUP(AV_SAMPLE_FMT_S16, int16_t, AV_SAMPLE_FMT_DBL, double, av_clip_int16(lrint(*(const double *) pi *(1<< 15)))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_S32, int32_t, AV_SAMPLE_FMT_DBL, double, av_clipl_int32(llrint(*(const double *) pi *(1U<< 31)))) #define SET_CONV_FUNC_GROUP(ofmt, ifmt) static void set_generic_function(AudioConvert *ac) { } void ff_audio_convert_free(AudioConvert **ac) { if(! *ac) return;ff_dither_free(&(*ac) ->dc);av_freep(ac);} AudioConvert *ff_audio_convert_alloc(AVAudioResampleContext *avr, enum AVSampleFormat out_fmt, enum AVSampleFormat in_fmt, int channels, int sample_rate, int apply_map) { AudioConvert *ac;int in_planar, out_planar;ac=av_mallocz(sizeof(*ac));if(!ac) return NULL;ac->avr=avr;ac->out_fmt=out_fmt;ac->in_fmt=in_fmt;ac->channels=channels;ac->apply_map=apply_map;if(avr->dither_method !=AV_RESAMPLE_DITHER_NONE &&av_get_packed_sample_fmt(out_fmt)==AV_SAMPLE_FMT_S16 &&av_get_bytes_per_sample(in_fmt) > 2) { ac->dc=ff_dither_alloc(avr, out_fmt, in_fmt, channels, sample_rate, apply_map);if(!ac->dc) { av_free(ac);return NULL;} return ac;} in_planar=ff_sample_fmt_is_planar(in_fmt, channels);out_planar=ff_sample_fmt_is_planar(out_fmt, channels);if(in_planar==out_planar) { ac->func_type=CONV_FUNC_TYPE_FLAT;ac->planes=in_planar ? ac->channels :1;} else if(in_planar) ac->func_type=CONV_FUNC_TYPE_INTERLEAVE;else ac->func_type=CONV_FUNC_TYPE_DEINTERLEAVE;set_generic_function(ac);if(ARCH_AARCH64) ff_audio_convert_init_aarch64(ac);if(ARCH_ARM) ff_audio_convert_init_arm(ac);if(ARCH_X86) ff_audio_convert_init_x86(ac);return ac;} int ff_audio_convert(AudioConvert *ac, AudioData *out, AudioData *in) { int use_generic=1;int len=in->nb_samples;int p;if(ac->dc) { av_log(ac->avr, AV_LOG_TRACE, "%d samples - audio_convert: %s to %s (dithered)\", len, av_get_sample_fmt_name(ac->in_fmt), av_get_sample_fmt_name(ac->out_fmt));return ff_convert_dither(ac-> in
static void skip_bits(GetBitContext *s, int n)
Definition: get_bits.h:314
av_cold int ff_dct_init(DCTContext *s, int nbits, enum DCTTransformType inverse)
Set up DCT.
Definition: dct.c:177
#define AV_CODEC_CAP_SUBFRAMES
Codec can output multiple frames per AVPacket Normally demuxers return one frame at a time...
Definition: avcodec.h:1002
int pitch_diff_sh16
((cur_pitch_val - last_pitch_val) << 16) / MAX_FRAMESIZE
Definition: wmavoice.c:224
static int init_get_bits(GetBitContext *s, const uint8_t *buffer, int bit_size)
Initialize GetBitContext.
Definition: get_bits.h:433
#define MAX_SFRAMESIZE
maximum number of samples per superframe
Definition: wmavoice.c:54
int lsp_q_mode
defines quantizer defaults [0, 1]
Definition: wmavoice.c:159
int frame_cntr
current frame index [0 - 0xFFFE]; is only used for comfort noise in pRNG()
Definition: wmavoice.c:247
void ff_celp_lp_zero_synthesis_filterf(float *out, const float *filter_coeffs, const float *in, int buffer_length, int filter_length)
LP zero synthesis filter.
Definition: celp_filters.c:199
float avpriv_scalarproduct_float_c(const float *v1, const float *v2, int len)
Return the scalar product of two vectors.
Definition: float_dsp.c:116
static void adaptive_gain_control(float *out, const float *in, const float *speech_synth, int size, float alpha, float *gain_mem)
Adaptive gain control (as used in postfilter).
Definition: wmavoice.c:502
static const float mean_lsf[10]
Definition: siprdata.h:27
#define SFRAME_CACHE_MAXSIZE
maximum cache size for frame data that
Definition: wmavoice.c:56
#define av_log2_16bit
Definition: intmath.h:84
void av_frame_unref(AVFrame *frame)
Unreference all the buffers referenced by frame and reset the frame fields.
Definition: frame.c:551
uint8_t fcb_type
Fixed codebook type (FCB_TYPE_*)
Definition: wmavoice.c:104
static void dequant_lsp16i(GetBitContext *gb, double *lsps)
Parse 16 independently-coded LSPs.
Definition: wmavoice.c:948
RDFTContext irdft
contexts for FFT-calculation in the postfilter (for denoise filter)
Definition: wmavoice.c:265
uint8_t * data[AV_NUM_DATA_POINTERS]
pointer to the picture/channel planes.
Definition: frame.h:232
static int synth_superframe(AVCodecContext *ctx, AVFrame *frame, int *got_frame_ptr)
Synthesize output samples for a single superframe.
Definition: wmavoice.c:1699
#define M_LN10
Definition: mathematics.h:43
static void synth_block_hardcoded(WMAVoiceContext *s, GetBitContext *gb, int block_idx, int size, const struct frame_type_desc *frame_desc, float *excitation)
Parse hardcoded signal for a single block.
Definition: wmavoice.c:1270
uint8_t n_blocks
amount of blocks per frame (each block (contains 160/n_blocks samples)
Definition: wmavoice.c:100
common internal api header.
static void flush_put_bits(PutBitContext *s)
Pad the end of the output stream with zeros.
Definition: put_bits.h:101
int pitch_lag
Definition: acelp_vectors.h:58
float excitation_history[MAX_SIGNAL_HISTORY]
cache of the signal of previous superframes, used as a history for signal generation ...
Definition: wmavoice.c:251
static void init_put_bits(PutBitContext *s, uint8_t *buffer, int buffer_size)
Initialize the PutBitContext s.
Definition: put_bits.h:48
int last_pitch_val
pitch value of the previous frame
Definition: wmavoice.c:222
#define AV_INPUT_BUFFER_PADDING_SIZE
Required number of additionally allocated bytes at the end of the input bitstream for decoding...
Definition: avcodec.h:773
void * priv_data
Definition: avcodec.h:1545
#define MAX_FRAMESIZE
maximum number of samples per frame
Definition: wmavoice.c:52
float silence_gain
set for use in blocks if ACB_TYPE_NONE
Definition: wmavoice.c:226
static const double wmavoice_mean_lsf10[2][10]
static const int16_t coeffs[]
int len
int channels
number of audio channels
Definition: avcodec.h:2174
static int ff_thread_once(char *control, void(*routine)(void))
Definition: thread.h:162
VLC_TYPE(* table)[2]
code, bits
Definition: vlc.h:28
#define lrint
Definition: tablegen.h:53
av_cold void ff_dct_end(DCTContext *s)
Definition: dct.c:220
void ff_acelp_interpolatef(float *out, const float *in, const float *filter_coeffs, int precision, int frac_pos, int filter_length, int length)
Floating point version of ff_acelp_interpolate()
Definition: acelp_filters.c:78
int block_delta_pitch_hrange
1/2 range of the delta (full range is from -this to +this-1)
Definition: wmavoice.c:174
int max_pitch_val
max value + 1 for pitch parsing
Definition: wmavoice.c:164
#define av_uninit(x)
Definition: attributes.h:148
int lsps
number of LSPs per frame [10 or 16]
Definition: wmavoice.c:158
FILE * out
Definition: movenc.c:54
#define MAX_FRAMES
maximum number of frames per superframe
Definition: wmavoice.c:51
static const float wmavoice_lsp16_intercoeff_b[32][2][16]
PutBitContext pb
bitstream writer for sframe_cache
Definition: wmavoice.c:209
#define M_PI
Definition: mathematics.h:52
uint8_t acb_type
Adaptive codebook type (ACB_TYPE_*)
Definition: wmavoice.c:103
static const float wmavoice_denoise_power_table[12][64]
LUT for f(x,y) = pow((y + 6.9) / 64, 0.025 * (x + 1)).
int dc_level
Predicted amount of DC noise, based on which a DC removal filter is used.
Definition: wmavoice.c:155
#define VLC_NBITS
number of bits to read per VLC iteration
Definition: wmavoice.c:58
static const float wmavoice_lsp16_intercoeff_a[32][2][16]
Definition: avfft.h:96
float cos[511]
8-bit cosine/sine windows over [-pi,pi] range
Definition: wmavoice.c:269
#define AV_CH_LAYOUT_MONO
av_cold int ff_rdft_init(RDFTContext *s, int nbits, enum RDFTransformType trans)
Set up a real FFT.
Definition: rdft.c:88
int aw_pulse_range
the range over which aw_pulse_set1() can apply the pulse, relative to the value in aw_first_pulse_off...
Definition: wmavoice.c:230
float min
This structure stores compressed data.
Definition: avcodec.h:1407
int nb_samples
number of audio samples (per channel) described by this frame
Definition: frame.h:284
float zero_exc_pf[MAX_SIGNAL_HISTORY+MAX_SFRAMESIZE]
zero filter output (i.e.
Definition: wmavoice.c:274
#define AV_CODEC_CAP_DR1
Codec uses get_buffer() for allocating buffers and supports custom allocators.
Definition: avcodec.h:959
for(j=16;j >0;--j)
static void synth_block_fcb_acb(WMAVoiceContext *s, GetBitContext *gb, int block_idx, int size, int block_pitch_sh2, const struct frame_type_desc *frame_desc, float *excitation)
Parse FCB/ACB signal for a single block.
Definition: wmavoice.c:1301
uint8_t dbl_pulses
how many pulse vectors have pulse pairs (rather than just one single pulse) only if fcb_type == FCB_T...
Definition: wmavoice.c:105
#define t2
Definition: regdef.h:30
#define MAX_SIGNAL_HISTORY
maximum excitation signal history
Definition: wmavoice.c:53
#define MULH
Definition: mathops.h:42
GetBitContext gb
packet bitreader.
Definition: wmavoice.c:136
static uint8_t tmp[11]
Definition: aes_ctr.c:26
bitstream writer API