Source code for librosa.feature.spectral

#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Spectral feature extraction"""

import numpy as np
import scipy
import scipy.signal

from .. import util
from .. import filters
from ..util.exceptions import ParameterError

from ..core.time_frequency import fft_frequencies
from ..core.audio import zero_crossings, to_mono
from ..core.spectrum import power_to_db, _spectrogram
from ..core.constantq import cqt, hybrid_cqt
from ..core.pitch import estimate_tuning


__all__ = ['spectral_centroid',
           'spectral_bandwidth',
           'spectral_contrast',
           'spectral_rolloff',
           'spectral_flatness',
           'poly_features',
           'rmse',
           'zero_crossing_rate',
           'chroma_stft',
           'chroma_cqt',
           'chroma_cens',
           'melspectrogram',
           'mfcc',
           'tonnetz']


# -- Spectral features -- #
[docs]def spectral_centroid(y=None, sr=22050, S=None, n_fft=2048, hop_length=512, freq=None): '''Compute the spectral centroid. Each frame of a magnitude spectrogram is normalized and treated as a distribution over frequency bins, from which the mean (centroid) is extracted per frame. Parameters ---------- y : np.ndarray [shape=(n,)] or None audio time series sr : number > 0 [scalar] audio sampling rate of `y` S : np.ndarray [shape=(d, t)] or None (optional) spectrogram magnitude n_fft : int > 0 [scalar] FFT window size hop_length : int > 0 [scalar] hop length for STFT. See `librosa.core.stft` for details. freq : None or np.ndarray [shape=(d,) or shape=(d, t)] Center frequencies for spectrogram bins. If `None`, then FFT bin center frequencies are used. Otherwise, it can be a single array of `d` center frequencies, or a matrix of center frequencies as constructed by `librosa.core.ifgram` Returns ------- centroid : np.ndarray [shape=(1, t)] centroid frequencies See Also -------- librosa.core.stft Short-time Fourier Transform librosa.core.ifgram Instantaneous-frequency spectrogram Examples -------- From time-series input: >>> y, sr = librosa.load(librosa.util.example_audio_file()) >>> cent = librosa.feature.spectral_centroid(y=y, sr=sr) >>> cent array([[ 4382.894, 626.588, ..., 5037.07 , 5413.398]]) From spectrogram input: >>> S, phase = librosa.magphase(librosa.stft(y=y)) >>> librosa.feature.spectral_centroid(S=S) array([[ 4382.894, 626.588, ..., 5037.07 , 5413.398]]) Using variable bin center frequencies: >>> y, sr = librosa.load(librosa.util.example_audio_file()) >>> if_gram, D = librosa.ifgram(y) >>> librosa.feature.spectral_centroid(S=np.abs(D), freq=if_gram) array([[ 4420.719, 625.769, ..., 5011.86 , 5221.492]]) Plot the result >>> import matplotlib.pyplot as plt >>> plt.figure() >>> plt.subplot(2, 1, 1) >>> plt.semilogy(cent.T, label='Spectral centroid') >>> plt.ylabel('Hz') >>> plt.xticks([]) >>> plt.xlim([0, cent.shape[-1]]) >>> plt.legend() >>> plt.subplot(2, 1, 2) >>> librosa.display.specshow(librosa.amplitude_to_db(S, ref=np.max), ... y_axis='log', x_axis='time') >>> plt.title('log Power spectrogram') >>> plt.tight_layout() ''' S, n_fft = _spectrogram(y=y, S=S, n_fft=n_fft, hop_length=hop_length) if not np.isrealobj(S): raise ParameterError('Spectral centroid is only defined ' 'with real-valued input') elif np.any(S < 0): raise ParameterError('Spectral centroid is only defined ' 'with non-negative energies') # Compute the center frequencies of each bin if freq is None: freq = fft_frequencies(sr=sr, n_fft=n_fft) if freq.ndim == 1: freq = freq.reshape((-1, 1)) # Column-normalize S return np.sum(freq * util.normalize(S, norm=1, axis=0), axis=0, keepdims=True)
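# Illustrative sketch (not part of the librosa API): the centroid above is a
# frequency-weighted mean of each spectrogram column after L1-normalization.
# A minimal numpy-only version (hypothetical helper, assuming a magnitude
# spectrogram `S` and sampling rate `sr`):
import numpy as np

def centroid_sketch(S, sr, n_fft=2048):
    # FFT bin center frequencies, equivalent to fft_frequencies(sr=sr, n_fft=n_fft)
    freq = np.linspace(0, sr / 2.0, num=1 + n_fft // 2)
    # Column-normalize so each frame is a distribution over frequency bins
    weights = S / np.maximum(S.sum(axis=0, keepdims=True), 1e-10)
    return np.sum(freq[:, np.newaxis] * weights, axis=0, keepdims=True)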
[docs]def spectral_bandwidth(y=None, sr=22050, S=None, n_fft=2048, hop_length=512, freq=None, centroid=None, norm=True, p=2): '''Compute p'th-order spectral bandwidth: (sum_k S[k] * (freq[k] - centroid)**p)**(1/p) Parameters ---------- y : np.ndarray [shape=(n,)] or None audio time series sr : number > 0 [scalar] audio sampling rate of `y` S : np.ndarray [shape=(d, t)] or None (optional) spectrogram magnitude n_fft : int > 0 [scalar] FFT window size hop_length : int > 0 [scalar] hop length for STFT. See `librosa.core.stft` for details. freq : None or np.ndarray [shape=(d,) or shape=(d, t)] Center frequencies for spectrogram bins. If `None`, then FFT bin center frequencies are used. Otherwise, it can be a single array of `d` center frequencies, or a matrix of center frequencies as constructed by `librosa.core.ifgram` centroid : None or np.ndarray [shape=(1, t)] pre-computed centroid frequencies norm : bool Normalize per-frame spectral energy (sum to one) p : float > 0 Power to raise deviation from spectral centroid. Returns ------- bandwidth : np.ndarray [shape=(1, t)] frequency bandwidth for each frame Examples -------- From time-series input >>> y, sr = librosa.load(librosa.util.example_audio_file()) >>> spec_bw = librosa.feature.spectral_bandwidth(y=y, sr=sr) >>> spec_bw array([[ 3379.878, 1429.486, ..., 3235.214, 3080.148]]) From spectrogram input >>> S, phase = librosa.magphase(librosa.stft(y=y)) >>> librosa.feature.spectral_bandwidth(S=S) array([[ 3379.878, 1429.486, ..., 3235.214, 3080.148]]) Using variable bin center frequencies >>> if_gram, D = librosa.ifgram(y) >>> librosa.feature.spectral_bandwidth(S=np.abs(D), freq=if_gram) array([[ 3380.011, 1429.11 , ..., 3235.22 , 3080.148]]) Plot the result >>> import matplotlib.pyplot as plt >>> plt.figure() >>> plt.subplot(2, 1, 1) >>> plt.semilogy(spec_bw.T, label='Spectral bandwidth') >>> plt.ylabel('Hz') >>> plt.xticks([]) >>> plt.xlim([0, spec_bw.shape[-1]]) >>> plt.legend() >>> plt.subplot(2, 1, 2) >>> librosa.display.specshow(librosa.amplitude_to_db(S, ref=np.max), ... y_axis='log', x_axis='time') >>> plt.title('log Power spectrogram') >>> plt.tight_layout() ''' S, n_fft = _spectrogram(y=y, S=S, n_fft=n_fft, hop_length=hop_length) if not np.isrealobj(S): raise ParameterError('Spectral bandwidth is only defined ' 'with real-valued input') elif np.any(S < 0): raise ParameterError('Spectral bandwidth is only defined ' 'with non-negative energies') if centroid is None: centroid = spectral_centroid(y=y, sr=sr, S=S, n_fft=n_fft, hop_length=hop_length, freq=freq) # Compute the center frequencies of each bin if freq is None: freq = fft_frequencies(sr=sr, n_fft=n_fft) if freq.ndim == 1: deviation = np.abs(np.subtract.outer(freq, centroid[0])) else: deviation = np.abs(freq - centroid[0]) # Column-normalize S if norm: S = util.normalize(S, norm=1, axis=0) return np.sum(S * deviation**p, axis=0, keepdims=True)**(1./p)
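# Illustrative sketch (not part of the librosa API): bandwidth is the weighted
# p-norm of each bin's deviation from the frame centroid. Hypothetical helper,
# assuming `S`, `sr`, and a centroid row `cent` as produced above:
import numpy as np

def bandwidth_sketch(S, sr, cent, n_fft=2048, p=2):
    freq = np.linspace(0, sr / 2.0, num=1 + n_fft // 2)
    deviation = np.abs(freq[:, np.newaxis] - cent)                   # (d, t)
    weights = S / np.maximum(S.sum(axis=0, keepdims=True), 1e-10)    # norm=True case
    return np.sum(weights * deviation**p, axis=0, keepdims=True)**(1.0 / p)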
def spectral_contrast(y=None, sr=22050, S=None, n_fft=2048, hop_length=512,
                      freq=None, fmin=200.0, n_bands=6, quantile=0.02,
                      linear=False):
    '''Compute spectral contrast [1]_

    .. [1] Jiang, Dan-Ning, Lie Lu, Hong-Jiang Zhang, Jian-Hua Tao, and
           Lian-Hong Cai.
           "Music type classification by spectral contrast feature."
           In Multimedia and Expo, 2002. ICME'02. Proceedings.
           2002 IEEE International Conference on, vol. 1, pp. 113-116. IEEE, 2002.

    Parameters
    ----------
    y : np.ndarray [shape=(n,)] or None
        audio time series

    sr : number > 0 [scalar]
        audio sampling rate of `y`

    S : np.ndarray [shape=(d, t)] or None
        (optional) spectrogram magnitude

    n_fft : int > 0 [scalar]
        FFT window size

    hop_length : int > 0 [scalar]
        hop length for STFT. See `librosa.core.stft` for details.

    freq : None or np.ndarray [shape=(d,)]
        Center frequencies for spectrogram bins.
        If `None`, then FFT bin center frequencies are used.
        Otherwise, it can be a single array of `d` center frequencies.

    fmin : float > 0
        Frequency cutoff for the first bin `[0, fmin]`
        Subsequent bins will cover `[fmin, 2*fmin]`, `[2*fmin, 4*fmin]`, etc.

    n_bands : int > 1
        number of frequency bands

    quantile : float in (0, 1)
        quantile for determining peaks and valleys

    linear : bool
        If `True`, return the linear difference of magnitudes:
        `peaks - valleys`.

        If `False`, return the logarithmic difference:
        `log(peaks) - log(valleys)`.

    Returns
    -------
    contrast : np.ndarray [shape=(n_bands + 1, t)]
        each row of spectral contrast values corresponds to a given
        octave-based frequency

    Examples
    --------
    >>> y, sr = librosa.load(librosa.util.example_audio_file())
    >>> S = np.abs(librosa.stft(y))
    >>> contrast = librosa.feature.spectral_contrast(S=S, sr=sr)

    >>> import matplotlib.pyplot as plt
    >>> plt.figure()
    >>> plt.subplot(2, 1, 1)
    >>> librosa.display.specshow(librosa.amplitude_to_db(S,
    ...                                                  ref=np.max),
    ...                          y_axis='log')
    >>> plt.colorbar(format='%+2.0f dB')
    >>> plt.title('Power spectrogram')
    >>> plt.subplot(2, 1, 2)
    >>> librosa.display.specshow(contrast, x_axis='time')
    >>> plt.colorbar()
    >>> plt.ylabel('Frequency bands')
    >>> plt.title('Spectral contrast')
    >>> plt.tight_layout()
    '''

    S, n_fft = _spectrogram(y=y, S=S, n_fft=n_fft, hop_length=hop_length)

    # Compute the center frequencies of each bin
    if freq is None:
        freq = fft_frequencies(sr=sr, n_fft=n_fft)

    freq = np.atleast_1d(freq)

    if freq.ndim != 1 or len(freq) != S.shape[0]:
        raise ParameterError('freq.shape mismatch: expected '
                             '({:d},)'.format(S.shape[0]))

    if n_bands < 1 or not isinstance(n_bands, int):
        raise ParameterError('n_bands must be a positive integer')

    if not 0.0 < quantile < 1.0:
        raise ParameterError('quantile must lie in the range (0, 1)')

    if fmin <= 0:
        raise ParameterError('fmin must be a positive number')

    octa = np.zeros(n_bands + 2)
    octa[1:] = fmin * (2.0**np.arange(0, n_bands + 1))

    if np.any(octa[:-1] >= 0.5 * sr):
        raise ParameterError('Frequency band exceeds Nyquist. '
                             'Reduce either fmin or n_bands.')

    valley = np.zeros((n_bands + 1, S.shape[1]))
    peak = np.zeros_like(valley)

    for k, (f_low, f_high) in enumerate(zip(octa[:-1], octa[1:])):
        current_band = np.logical_and(freq >= f_low, freq <= f_high)

        idx = np.flatnonzero(current_band)

        if k > 0:
            current_band[idx[0] - 1] = True

        if k == n_bands:
            current_band[idx[-1] + 1:] = True

        sub_band = S[current_band]

        if k < n_bands:
            sub_band = sub_band[:-1]

        # Always take at least one bin from each side
        idx = np.rint(quantile * np.sum(current_band))
        idx = int(np.maximum(idx, 1))

        sortedr = np.sort(sub_band, axis=0)

        valley[k] = np.mean(sortedr[:idx], axis=0)
        peak[k] = np.mean(sortedr[-idx:], axis=0)

    if linear:
        return peak - valley
    else:
        return power_to_db(peak) - power_to_db(valley)
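# Illustrative sketch (not part of the librosa API): within one octave band,
# contrast compares the mean of the strongest bins (peak) against the mean of
# the weakest bins (valley), roughly in dB. Hypothetical helper for a single
# band `S_band` of shape (d_band, t), ignoring the band-construction details:
import numpy as np

def band_contrast_sketch(S_band, quantile=0.02):
    idx = max(int(round(quantile * S_band.shape[0])), 1)   # at least one bin per side
    S_sorted = np.sort(S_band, axis=0)
    valley = S_sorted[:idx].mean(axis=0)
    peak = S_sorted[-idx:].mean(axis=0)
    return 10.0 * (np.log10(peak + 1e-10) - np.log10(valley + 1e-10))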
[docs]def spectral_rolloff(y=None, sr=22050, S=None, n_fft=2048, hop_length=512, freq=None, roll_percent=0.85): '''Compute roll-off frequency Parameters ---------- y : np.ndarray [shape=(n,)] or None audio time series sr : number > 0 [scalar] audio sampling rate of `y` S : np.ndarray [shape=(d, t)] or None (optional) spectrogram magnitude n_fft : int > 0 [scalar] FFT window size hop_length : int > 0 [scalar] hop length for STFT. See `librosa.core.stft` for details. freq : None or np.ndarray [shape=(d,) or shape=(d, t)] Center frequencies for spectrogram bins. If `None`, then FFT bin center frequencies are used. Otherwise, it can be a single array of `d` center frequencies, .. note:: `freq` is assumed to be sorted in increasing order roll_percent : float [0 < roll_percent < 1] Roll-off percentage. Returns ------- rolloff : np.ndarray [shape=(1, t)] roll-off frequency for each frame Examples -------- From time-series input >>> y, sr = librosa.load(librosa.util.example_audio_file()) >>> rolloff = librosa.feature.spectral_rolloff(y=y, sr=sr) >>> rolloff array([[ 8376.416, 968.994, ..., 8925.513, 9108.545]]) From spectrogram input >>> S, phase = librosa.magphase(librosa.stft(y)) >>> librosa.feature.spectral_rolloff(S=S, sr=sr) array([[ 8376.416, 968.994, ..., 8925.513, 9108.545]]) >>> # With a higher roll percentage: >>> y, sr = librosa.load(librosa.util.example_audio_file()) >>> librosa.feature.spectral_rolloff(y=y, sr=sr, roll_percent=0.95) array([[ 10012.939, 3003.882, ..., 10034.473, 10077.539]]) >>> import matplotlib.pyplot as plt >>> plt.figure() >>> plt.subplot(2, 1, 1) >>> plt.semilogy(rolloff.T, label='Roll-off frequency') >>> plt.ylabel('Hz') >>> plt.xticks([]) >>> plt.xlim([0, rolloff.shape[-1]]) >>> plt.legend() >>> plt.subplot(2, 1, 2) >>> librosa.display.specshow(librosa.amplitude_to_db(S, ref=np.max), ... y_axis='log', x_axis='time') >>> plt.title('log Power spectrogram') >>> plt.tight_layout() ''' if not 0.0 < roll_percent < 1.0: raise ParameterError('roll_percent must lie in the range (0, 1)') S, n_fft = _spectrogram(y=y, S=S, n_fft=n_fft, hop_length=hop_length) if not np.isrealobj(S): raise ParameterError('Spectral rolloff is only defined ' 'with real-valued input') elif np.any(S < 0): raise ParameterError('Spectral rolloff is only defined ' 'with non-negative energies') # Compute the center frequencies of each bin if freq is None: freq = fft_frequencies(sr=sr, n_fft=n_fft) # Make sure that frequency can be broadcast if freq.ndim == 1: freq = freq.reshape((-1, 1)) total_energy = np.cumsum(S, axis=0) threshold = roll_percent * total_energy[-1] ind = np.where(total_energy < threshold, np.nan, 1) return np.nanmin(ind * freq, axis=0, keepdims=True)
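# Illustrative sketch (not part of the librosa API): the roll-off frequency is
# the lowest bin frequency at which the cumulative energy of a frame reaches
# roll_percent of its total. Hypothetical helper, assuming `S` and `sr`:
import numpy as np

def rolloff_sketch(S, sr, n_fft=2048, roll_percent=0.85):
    freq = np.linspace(0, sr / 2.0, num=1 + n_fft // 2)
    cum_energy = np.cumsum(S, axis=0)
    threshold = roll_percent * cum_energy[-1]
    # First bin per frame where the running sum crosses the threshold
    bin_idx = np.argmax(cum_energy >= threshold, axis=0)
    return freq[bin_idx][np.newaxis, :]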
[docs]def spectral_flatness(y=None, S=None, n_fft=2048, hop_length=512, amin=1e-10, power=2.0): '''Compute spectral flatness Spectral flatness (or tonality coefficient) is a measure to quantify how much noise-like a sound is, as opposed to being tone-like [1]_. A high spectral flatness (closer to 1.0) indicates the spectrum is similar to white noise. It is often converted to decibel. .. [1] Dubnov, Shlomo "Generalization of spectral flatness measure for non-gaussian linear processes" IEEE Signal Processing Letters, 2004, Vol. 11. Parameters ---------- y : np.ndarray [shape=(n,)] or None audio time series S : np.ndarray [shape=(d, t)] or None (optional) pre-computed spectrogram magnitude n_fft : int > 0 [scalar] FFT window size hop_length : int > 0 [scalar] hop length for STFT. See `librosa.core.stft` for details. amin : float > 0 [scalar] minimum threshold for `S` (=added noise floor for numerical stability) power : float > 0 [scalar] Exponent for the magnitude spectrogram. e.g., 1 for energy, 2 for power, etc. Power spectrogram is usually used for computing spectral flatness. Returns ------- flatness : np.ndarray [shape=(1, t)] spectral flatness for each frame. The returned value is in [0, 1] and often converted to dB scale. Examples -------- From time-series input >>> y, sr = librosa.load(librosa.util.example_audio_file()) >>> flatness = librosa.feature.spectral_flatness(y=y) >>> flatness array([[ 1.00000e+00, 5.82299e-03, 5.64624e-04, ..., 9.99063e-01, 1.00000e+00, 1.00000e+00]], dtype=float32) From spectrogram input >>> S, phase = librosa.magphase(librosa.stft(y)) >>> librosa.feature.spectral_flatness(S=S) array([[ 1.00000e+00, 5.82299e-03, 5.64624e-04, ..., 9.99063e-01, 1.00000e+00, 1.00000e+00]], dtype=float32) From power spectrogram input >>> S, phase = librosa.magphase(librosa.stft(y)) >>> S_power = S ** 2 >>> librosa.feature.spectral_flatness(S=S_power, power=1.0) array([[ 1.00000e+00, 5.82299e-03, 5.64624e-04, ..., 9.99063e-01, 1.00000e+00, 1.00000e+00]], dtype=float32) ''' if amin <= 0: raise ParameterError('amin must be strictly positive') S, n_fft = _spectrogram(y=y, S=S, n_fft=n_fft, hop_length=hop_length, power=1.) if not np.isrealobj(S): raise ParameterError('Spectral flatness is only defined ' 'with real-valued input') elif np.any(S < 0): raise ParameterError('Spectral flatness is only defined ' 'with non-negative energies') S_thresh = np.maximum(amin, S ** power) gmean = np.exp(np.mean(np.log(S_thresh), axis=0, keepdims=True)) amean = np.mean(S_thresh, axis=0, keepdims=True) return gmean / amean
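# Illustrative sketch (not part of the librosa API): flatness is the ratio of
# the geometric mean to the arithmetic mean of the (floored) power spectrum in
# each frame. Hypothetical helper, assuming a magnitude spectrogram `S`:
import numpy as np

def flatness_sketch(S, amin=1e-10, power=2.0):
    S_pow = np.maximum(S**power, amin)
    gmean = np.exp(np.mean(np.log(S_pow), axis=0, keepdims=True))
    amean = np.mean(S_pow, axis=0, keepdims=True)
    return gmean / amean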
[docs]def rmse(y=None, S=None, frame_length=2048, hop_length=512, center=True, pad_mode='reflect'): '''Compute root-mean-square (RMS) energy for each frame, either from the audio samples `y` or from a spectrogram `S`. Computing the energy from audio samples is faster as it doesn't require a STFT calculation. However, using a spectrogram will give a more accurate representation of energy over time because its frames can be windowed, thus prefer using `S` if it's already available. Parameters ---------- y : np.ndarray [shape=(n,)] or None (optional) audio time series. Required if `S` is not input. S : np.ndarray [shape=(d, t)] or None (optional) spectrogram magnitude. Required if `y` is not input. frame_length : int > 0 [scalar] length of analysis frame (in samples) for energy calculation hop_length : int > 0 [scalar] hop length for STFT. See `librosa.core.stft` for details. center : bool If `True` and operating on time-domain input (`y`), pad the signal by `frame_length//2` on either side. If operating on spectrogram input, this has no effect. pad_mode : str Padding mode for centered analysis. See `np.pad` for valid values. Returns ------- rms : np.ndarray [shape=(1, t)] RMS value for each frame Examples -------- >>> y, sr = librosa.load(librosa.util.example_audio_file()) >>> librosa.feature.rmse(y=y) array([[ 0. , 0.056, ..., 0. , 0. ]], dtype=float32) Or from spectrogram input >>> S, phase = librosa.magphase(librosa.stft(y)) >>> rms = librosa.feature.rmse(S=S) >>> import matplotlib.pyplot as plt >>> plt.figure() >>> plt.subplot(2, 1, 1) >>> plt.semilogy(rms.T, label='RMS Energy') >>> plt.xticks([]) >>> plt.xlim([0, rms.shape[-1]]) >>> plt.legend(loc='best') >>> plt.subplot(2, 1, 2) >>> librosa.display.specshow(librosa.amplitude_to_db(S, ref=np.max), ... y_axis='log', x_axis='time') >>> plt.title('log Power spectrogram') >>> plt.tight_layout() Use a STFT window of constant ones and no frame centering to get consistent results with the RMS energy computed from the audio samples `y` >>> S = librosa.magphase(librosa.stft(y, window=np.ones, center=False))[0] >>> librosa.feature.rmse(S=S) ''' if y is not None and S is not None: raise ValueError('Either `y` or `S` should be input.') if y is not None: y = to_mono(y) if center: y = np.pad(y, int(frame_length // 2), mode=pad_mode) x = util.frame(y, frame_length=frame_length, hop_length=hop_length) elif S is not None: x, _ = _spectrogram(y=y, S=S, n_fft=frame_length, hop_length=hop_length) else: raise ValueError('Either `y` or `S` must be input.') return np.sqrt(np.mean(np.abs(x)**2, axis=0, keepdims=True))
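# Illustrative sketch (not part of the librosa API): for time-domain input, RMS
# energy is the root of the mean squared sample value in each frame.
# Hypothetical helper with no centering or padding, assuming a mono signal `y`
# with len(y) >= frame_length:
import numpy as np

def rmse_sketch(y, frame_length=2048, hop_length=512):
    n_frames = 1 + (len(y) - frame_length) // hop_length
    frames = np.stack([y[i * hop_length:i * hop_length + frame_length]
                       for i in range(n_frames)], axis=1)    # (frame_length, t)
    return np.sqrt(np.mean(frames**2, axis=0, keepdims=True))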
[docs]def poly_features(y=None, sr=22050, S=None, n_fft=2048, hop_length=512, order=1, freq=None): '''Get coefficients of fitting an nth-order polynomial to the columns of a spectrogram. Parameters ---------- y : np.ndarray [shape=(n,)] or None audio time series sr : number > 0 [scalar] audio sampling rate of `y` S : np.ndarray [shape=(d, t)] or None (optional) spectrogram magnitude n_fft : int > 0 [scalar] FFT window size hop_length : int > 0 [scalar] hop length for STFT. See `librosa.core.stft` for details. order : int > 0 order of the polynomial to fit freq : None or np.ndarray [shape=(d,) or shape=(d, t)] Center frequencies for spectrogram bins. If `None`, then FFT bin center frequencies are used. Otherwise, it can be a single array of `d` center frequencies, or a matrix of center frequencies as constructed by `librosa.core.ifgram` Returns ------- coefficients : np.ndarray [shape=(order+1, t)] polynomial coefficients for each frame. `coeffecients[0]` corresponds to the highest degree (`order`), `coefficients[1]` corresponds to the next highest degree (`order-1`), down to the constant term `coefficients[order]`. Examples -------- >>> y, sr = librosa.load(librosa.util.example_audio_file()) >>> S = np.abs(librosa.stft(y)) Fit a degree-0 polynomial (constant) to each frame >>> p0 = librosa.feature.poly_features(S=S, order=0) Fit a linear polynomial to each frame >>> p1 = librosa.feature.poly_features(S=S, order=1) Fit a quadratic to each frame >>> p2 = librosa.feature.poly_features(S=S, order=2) Plot the results for comparison >>> import matplotlib.pyplot as plt >>> plt.figure(figsize=(8, 8)) >>> ax = plt.subplot(4,1,1) >>> plt.plot(p2[2], label='order=2', alpha=0.8) >>> plt.plot(p1[1], label='order=1', alpha=0.8) >>> plt.plot(p0[0], label='order=0', alpha=0.8) >>> plt.xticks([]) >>> plt.ylabel('Constant') >>> plt.legend() >>> plt.subplot(4,1,2, sharex=ax) >>> plt.plot(p2[1], label='order=2', alpha=0.8) >>> plt.plot(p1[0], label='order=1', alpha=0.8) >>> plt.xticks([]) >>> plt.ylabel('Linear') >>> plt.subplot(4,1,3, sharex=ax) >>> plt.plot(p2[0], label='order=2', alpha=0.8) >>> plt.xticks([]) >>> plt.ylabel('Quadratic') >>> plt.subplot(4,1,4, sharex=ax) >>> librosa.display.specshow(librosa.amplitude_to_db(S, ref=np.max), ... y_axis='log') >>> plt.tight_layout() ''' S, n_fft = _spectrogram(y=y, S=S, n_fft=n_fft, hop_length=hop_length) # Compute the center frequencies of each bin if freq is None: freq = fft_frequencies(sr=sr, n_fft=n_fft) # If frequencies are constant over frames, then we only need to fit once if freq.ndim == 1: coefficients = np.polyfit(freq, S, order) else: # Else, fit each frame independently and stack the results coefficients = np.concatenate([[np.polyfit(freq[:, i], S[:, i], order)] for i in range(S.shape[1])], axis=0).T return coefficients
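# Illustrative sketch (not part of the librosa API): the returned coefficients
# are ordered highest degree first, so each frame's fit can be evaluated back at
# the bin frequencies with np.polyval. Hypothetical helper, assuming coefficients
# `p` from poly_features and a 1-D array `freq` of bin center frequencies:
import numpy as np

def eval_poly_fit(p, freq):
    # One fitted spectrum per frame, shape (len(freq), t)
    return np.stack([np.polyval(p[:, i], freq) for i in range(p.shape[1])], axis=1)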
[docs]def zero_crossing_rate(y, frame_length=2048, hop_length=512, center=True, **kwargs): '''Compute the zero-crossing rate of an audio time series. Parameters ---------- y : np.ndarray [shape=(n,)] Audio time series frame_length : int > 0 Length of the frame over which to compute zero crossing rates hop_length : int > 0 Number of samples to advance for each frame center : bool If `True`, frames are centered by padding the edges of `y`. This is similar to the padding in `librosa.core.stft`, but uses edge-value copies instead of reflection. kwargs : additional keyword arguments See `librosa.core.zero_crossings` .. note:: By default, the `pad` parameter is set to `False`, which differs from the default specified by `librosa.core.zero_crossings`. Returns ------- zcr : np.ndarray [shape=(1, t)] `zcr[0, i]` is the fraction of zero crossings in the `i` th frame See Also -------- librosa.core.zero_crossings Compute zero-crossings in a time-series Examples -------- >>> y, sr = librosa.load(librosa.util.example_audio_file()) >>> librosa.feature.zero_crossing_rate(y) array([[ 0.134, 0.139, ..., 0.387, 0.322]]) ''' util.valid_audio(y) if center: y = np.pad(y, int(frame_length // 2), mode='edge') y_framed = util.frame(y, frame_length, hop_length) kwargs['axis'] = 0 kwargs.setdefault('pad', False) crossings = zero_crossings(y_framed, **kwargs) return np.mean(crossings, axis=0, keepdims=True)
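# Illustrative sketch (not part of the librosa API): the zero-crossing rate of a
# single frame is the fraction of adjacent sample pairs whose signs differ.
# Hypothetical helper, ignoring librosa's optional thresholding and padding:
import numpy as np

def zcr_frame_sketch(frame):
    signs = np.signbit(frame)
    crossings = np.sum(signs[1:] != signs[:-1])
    return crossings / float(len(frame))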
# -- Chroma -- #
[docs]def chroma_stft(y=None, sr=22050, S=None, norm=np.inf, n_fft=2048, hop_length=512, tuning=None, **kwargs): """Compute a chromagram from a waveform or power spectrogram. This implementation is derived from `chromagram_E` [1]_ .. [1] Ellis, Daniel P.W. "Chroma feature analysis and synthesis" 2007/04/21 http://labrosa.ee.columbia.edu/matlab/chroma-ansyn/ Parameters ---------- y : np.ndarray [shape=(n,)] or None audio time series sr : number > 0 [scalar] sampling rate of `y` S : np.ndarray [shape=(d, t)] or None power spectrogram norm : float or None Column-wise normalization. See `librosa.util.normalize` for details. If `None`, no normalization is performed. n_fft : int > 0 [scalar] FFT window size if provided `y, sr` instead of `S` hop_length : int > 0 [scalar] hop length if provided `y, sr` instead of `S` tuning : float in `[-0.5, 0.5)` [scalar] or None. Deviation from A440 tuning in fractional bins (cents). If `None`, it is automatically estimated. kwargs : additional keyword arguments Arguments to parameterize chroma filters. See `librosa.filters.chroma` for details. Returns ------- chromagram : np.ndarray [shape=(n_chroma, t)] Normalized energy for each chroma bin at each frame. See Also -------- librosa.filters.chroma Chroma filter bank construction librosa.util.normalize Vector normalization Examples -------- >>> y, sr = librosa.load(librosa.util.example_audio_file()) >>> librosa.feature.chroma_stft(y=y, sr=sr) array([[ 0.974, 0.881, ..., 0.925, 1. ], [ 1. , 0.841, ..., 0.882, 0.878], ..., [ 0.658, 0.985, ..., 0.878, 0.764], [ 0.969, 0.92 , ..., 0.974, 0.915]]) Use an energy (magnitude) spectrum instead of power spectrogram >>> S = np.abs(librosa.stft(y)) >>> chroma = librosa.feature.chroma_stft(S=S, sr=sr) >>> chroma array([[ 0.884, 0.91 , ..., 0.861, 0.858], [ 0.963, 0.785, ..., 0.968, 0.896], ..., [ 0.871, 1. , ..., 0.928, 0.829], [ 1. , 0.982, ..., 0.93 , 0.878]]) Use a pre-computed power spectrogram with a larger frame >>> S = np.abs(librosa.stft(y, n_fft=4096))**2 >>> chroma = librosa.feature.chroma_stft(S=S, sr=sr) >>> chroma array([[ 0.685, 0.477, ..., 0.961, 0.986], [ 0.674, 0.452, ..., 0.952, 0.926], ..., [ 0.844, 0.575, ..., 0.934, 0.869], [ 0.793, 0.663, ..., 0.964, 0.972]]) >>> import matplotlib.pyplot as plt >>> plt.figure(figsize=(10, 4)) >>> librosa.display.specshow(chroma, y_axis='chroma', x_axis='time') >>> plt.colorbar() >>> plt.title('Chromagram') >>> plt.tight_layout() """ S, n_fft = _spectrogram(y=y, S=S, n_fft=n_fft, hop_length=hop_length, power=2) n_chroma = kwargs.get('n_chroma', 12) if tuning is None: tuning = estimate_tuning(S=S, sr=sr, bins_per_octave=n_chroma) # Get the filter bank if 'A440' not in kwargs: kwargs['A440'] = 440.0 * 2.0**(float(tuning) / n_chroma) chromafb = filters.chroma(sr, n_fft, **kwargs) # Compute raw chroma raw_chroma = np.dot(chromafb, S) # Compute normalization factor for each frame return util.normalize(raw_chroma, norm=norm, axis=0)
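# Illustrative sketch (not part of the librosa API): a chromagram projects each
# spectrogram column onto pitch classes with a chroma filter bank, then
# normalizes per frame. Hypothetical helper reusing librosa's own filter bank
# and skipping the tuning estimation, assuming a power spectrogram `S` and `sr`:
import numpy as np
import librosa

def chroma_sketch(S, sr, n_fft=2048):
    chroma_fb = librosa.filters.chroma(sr, n_fft)             # (12, 1 + n_fft // 2)
    raw = chroma_fb.dot(S)
    # norm=np.inf: scale each column by its maximum
    return raw / np.maximum(raw.max(axis=0, keepdims=True), 1e-10)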
[docs]def chroma_cqt(y=None, sr=22050, C=None, hop_length=512, fmin=None, norm=np.inf, threshold=0.0, tuning=None, n_chroma=12, n_octaves=7, window=None, bins_per_octave=None, cqt_mode='full'): r'''Constant-Q chromagram Parameters ---------- y : np.ndarray [shape=(n,)] audio time series sr : number > 0 sampling rate of `y` C : np.ndarray [shape=(d, t)] [Optional] a pre-computed constant-Q spectrogram hop_length : int > 0 number of samples between successive chroma frames fmin : float > 0 minimum frequency to analyze in the CQT. Default: 'C1' ~= 32.7 Hz norm : int > 0, +-np.inf, or None Column-wise normalization of the chromagram. threshold : float Pre-normalization energy threshold. Values below the threshold are discarded, resulting in a sparse chromagram. tuning : float Deviation (in cents) from A440 tuning n_chroma : int > 0 Number of chroma bins to produce n_octaves : int > 0 Number of octaves to analyze above `fmin` window : None or np.ndarray Optional window parameter to `filters.cq_to_chroma` bins_per_octave : int > 0 Number of bins per octave in the CQT. Default: matches `n_chroma` cqt_mode : ['full', 'hybrid'] Constant-Q transform mode Returns ------- chromagram : np.ndarray [shape=(n_chroma, t)] The output chromagram See Also -------- librosa.util.normalize librosa.core.cqt librosa.core.hybrid_cqt chroma_stft Examples -------- Compare a long-window STFT chromagram to the CQT chromagram >>> y, sr = librosa.load(librosa.util.example_audio_file(), ... offset=10, duration=15) >>> chroma_stft = librosa.feature.chroma_stft(y=y, sr=sr, ... n_chroma=12, n_fft=4096) >>> chroma_cq = librosa.feature.chroma_cqt(y=y, sr=sr) >>> import matplotlib.pyplot as plt >>> plt.figure() >>> plt.subplot(2,1,1) >>> librosa.display.specshow(chroma_stft, y_axis='chroma') >>> plt.title('chroma_stft') >>> plt.colorbar() >>> plt.subplot(2,1,2) >>> librosa.display.specshow(chroma_cq, y_axis='chroma', x_axis='time') >>> plt.title('chroma_cqt') >>> plt.colorbar() >>> plt.tight_layout() ''' cqt_func = {'full': cqt, 'hybrid': hybrid_cqt} if bins_per_octave is None: bins_per_octave = n_chroma # Build the CQT if we don't have one already if C is None: C = np.abs(cqt_func[cqt_mode](y, sr=sr, hop_length=hop_length, fmin=fmin, n_bins=n_octaves * bins_per_octave, bins_per_octave=bins_per_octave, tuning=tuning)) # Map to chroma cq_to_chr = filters.cq_to_chroma(C.shape[0], bins_per_octave=bins_per_octave, n_chroma=n_chroma, fmin=fmin, window=window) chroma = cq_to_chr.dot(C) if threshold is not None: chroma[chroma < threshold] = 0.0 # Normalize if norm is not None: chroma = util.normalize(chroma, norm=norm, axis=0) return chroma
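# Illustrative sketch (not part of the librosa API): with one CQT bin per
# semitone, the chroma projection essentially folds octaves together by summing
# every 12th bin; filters.cq_to_chroma additionally rotates the result so that
# bin 0 lines up with pitch class C. Hypothetical helper, assuming a magnitude
# CQT `C` with 12 bins per octave:
import numpy as np

def fold_octaves_sketch(C, n_chroma=12):
    n_octaves = C.shape[0] // n_chroma
    folded = C[:n_octaves * n_chroma].reshape(n_octaves, n_chroma, -1).sum(axis=0)
    return folded / np.maximum(folded.max(axis=0, keepdims=True), 1e-10)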
[docs]def chroma_cens(y=None, sr=22050, C=None, hop_length=512, fmin=None, tuning=None, n_chroma=12, n_octaves=7, bins_per_octave=None, cqt_mode='full', window=None, norm=2, win_len_smooth=41): r'''Computes the chroma variant "Chroma Energy Normalized" (CENS), following [1]_. .. [1] Meinard Müller and Sebastian Ewert Chroma Toolbox: MATLAB implementations for extracting variants of chroma-based audio features In Proceedings of the International Conference on Music Information Retrieval (ISMIR), 2011. Parameters ---------- y : np.ndarray [shape=(n,)] audio time series sr : number > 0 sampling rate of `y` C : np.ndarray [shape=(d, t)] [Optional] a pre-computed constant-Q spectrogram hop_length : int > 0 number of samples between successive chroma frames fmin : float > 0 minimum frequency to analyze in the CQT. Default: 'C1' ~= 32.7 Hz norm : int > 0, +-np.inf, or None Column-wise normalization of the chromagram. tuning : float Deviation (in cents) from A440 tuning n_chroma : int > 0 Number of chroma bins to produce n_octaves : int > 0 Number of octaves to analyze above `fmin` window : None or np.ndarray Optional window parameter to `filters.cq_to_chroma` bins_per_octave : int > 0 Number of bins per octave in the CQT. Default: matches `n_chroma` cqt_mode : ['full', 'hybrid'] Constant-Q transform mode win_len_smooth : int > 0 Length of temporal smoothing window. Default: 41 Returns ------- chroma_cens : np.ndarray [shape=(n_chroma, t)] The output cens-chromagram See Also -------- chroma_cqt Compute a chromagram from a constant-Q transform. chroma_stft Compute a chromagram from an STFT spectrogram or waveform. Examples -------- Compare standard cqt chroma to CENS. >>> y, sr = librosa.load(librosa.util.example_audio_file(), ... offset=10, duration=15) >>> chroma_cens = librosa.feature.chroma_cens(y=y, sr=sr) >>> chroma_cq = librosa.feature.chroma_cqt(y=y, sr=sr) >>> import matplotlib.pyplot as plt >>> plt.figure() >>> plt.subplot(2,1,1) >>> librosa.display.specshow(chroma_cq, y_axis='chroma') >>> plt.title('chroma_cq') >>> plt.colorbar() >>> plt.subplot(2,1,2) >>> librosa.display.specshow(chroma_cens, y_axis='chroma', x_axis='time') >>> plt.title('chroma_cens') >>> plt.colorbar() >>> plt.tight_layout() ''' chroma = chroma_cqt(y=y, C=C, sr=sr, hop_length=hop_length, fmin=fmin, bins_per_octave=bins_per_octave, tuning=tuning, norm=None, n_chroma=n_chroma, n_octaves=n_octaves, cqt_mode=cqt_mode, window=window) # L1-Normalization chroma = util.normalize(chroma, norm=1, axis=0) # Quantize amplitudes QUANT_STEPS = [0.4, 0.2, 0.1, 0.05] QUANT_WEIGHTS = [0.25, 0.25, 0.25, 0.25] chroma_quant = np.zeros_like(chroma) for cur_quant_step_idx, cur_quant_step in enumerate(QUANT_STEPS): chroma_quant += (chroma > cur_quant_step) * QUANT_WEIGHTS[cur_quant_step_idx] # Apply temporal smoothing win = scipy.signal.hanning(win_len_smooth + 2) win /= np.sum(win) win = np.atleast_2d(win) cens = scipy.signal.convolve2d(chroma_quant, win, mode='same', boundary='fill') # L2-Normalization return util.normalize(cens, norm=norm, axis=0)
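# Illustrative sketch (not part of the librosa API): the CENS quantization step
# maps each L1-normalized chroma value to 0, 0.25, 0.5, 0.75 or 1 according to
# the thresholds 0.05, 0.1, 0.2 and 0.4, before temporal smoothing. Hypothetical
# helper, assuming an L1-normalized chromagram `chroma`:
import numpy as np

def cens_quantize_sketch(chroma):
    steps = [0.4, 0.2, 0.1, 0.05]
    weights = [0.25, 0.25, 0.25, 0.25]
    quant = np.zeros_like(chroma)
    for step, w in zip(steps, weights):
        quant += w * (chroma > step)
    return quant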
[docs]def tonnetz(y=None, sr=22050, chroma=None): '''Computes the tonal centroid features (tonnetz), following the method of [1]_. .. [1] Harte, C., Sandler, M., & Gasser, M. (2006). "Detecting Harmonic Change in Musical Audio." In Proceedings of the 1st ACM Workshop on Audio and Music Computing Multimedia (pp. 21-26). Santa Barbara, CA, USA: ACM Press. doi:10.1145/1178723.1178727. Parameters ---------- y : np.ndarray [shape=(n,)] or None Audio time series. sr : number > 0 [scalar] sampling rate of `y` chroma : np.ndarray [shape=(n_chroma, t)] or None Normalized energy for each chroma bin at each frame. If `None`, a cqt chromagram is performed. Returns ------- tonnetz : np.ndarray [shape(6, t)] Tonal centroid features for each frame. Tonnetz dimensions: - 0: Fifth x-axis - 1: Fifth y-axis - 2: Minor x-axis - 3: Minor y-axis - 4: Major x-axis - 5: Major y-axis See Also -------- chroma_cqt Compute a chromagram from a constant-Q transform. chroma_stft Compute a chromagram from an STFT spectrogram or waveform. Examples -------- Compute tonnetz features from the harmonic component of a song >>> y, sr = librosa.load(librosa.util.example_audio_file()) >>> y = librosa.effects.harmonic(y) >>> tonnetz = librosa.feature.tonnetz(y=y, sr=sr) >>> tonnetz array([[-0.073, -0.053, ..., -0.054, -0.073], [ 0.001, 0.001, ..., -0.054, -0.062], ..., [ 0.039, 0.034, ..., 0.044, 0.064], [ 0.005, 0.002, ..., 0.011, 0.017]]) Compare the tonnetz features to `chroma_cqt` >>> import matplotlib.pyplot as plt >>> plt.subplot(2, 1, 1) >>> librosa.display.specshow(tonnetz, y_axis='tonnetz') >>> plt.colorbar() >>> plt.title('Tonal Centroids (Tonnetz)') >>> plt.subplot(2, 1, 2) >>> librosa.display.specshow(librosa.feature.chroma_cqt(y, sr=sr), ... y_axis='chroma', x_axis='time') >>> plt.colorbar() >>> plt.title('Chroma') >>> plt.tight_layout() ''' if y is None and chroma is None: raise ParameterError('Either the audio samples or the chromagram must be ' 'passed as an argument.') if chroma is None: chroma = chroma_cqt(y=y, sr=sr) # Generate Transformation matrix dim_map = np.linspace(0, 12, num=chroma.shape[0], endpoint=False) scale = np.asarray([7. / 6, 7. / 6, 3. / 2, 3. / 2, 2. / 3, 2. / 3]) V = np.multiply.outer(scale, dim_map) # Even rows compute sin() V[::2] -= 0.5 R = np.array([1, 1, # Fifths 1, 1, # Minor 0.5, 0.5]) # Major phi = R[:, np.newaxis] * np.cos(np.pi * V) # Do the transform to tonnetz return phi.dot(util.normalize(chroma, norm=1, axis=0))
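# Illustrative sketch (not part of the librosa API): the tonnetz projection
# multiplies an L1-normalized chromagram by a fixed 6 x 12 basis whose row pairs
# are sine/cosine patterns at intervals of a fifth, a minor third and a major
# third (the cos(pi * V) construction above). Hypothetical helper for a 12-bin
# chromagram `chroma`:
import numpy as np

def tonnetz_sketch(chroma):
    pc = np.arange(12)
    basis = np.vstack([
        np.sin(pc * 7.0 * np.pi / 6), np.cos(pc * 7.0 * np.pi / 6),             # fifths
        np.sin(pc * 3.0 * np.pi / 2), np.cos(pc * 3.0 * np.pi / 2),             # minor thirds
        0.5 * np.sin(pc * 2.0 * np.pi / 3), 0.5 * np.cos(pc * 2.0 * np.pi / 3)  # major thirds
    ])
    chroma = chroma / np.maximum(chroma.sum(axis=0, keepdims=True), 1e-10)
    return basis.dot(chroma)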
# -- Mel spectrogram and MFCCs -- #
[docs]def mfcc(y=None, sr=22050, S=None, n_mfcc=20, **kwargs): """Mel-frequency cepstral coefficients Parameters ---------- y : np.ndarray [shape=(n,)] or None audio time series sr : number > 0 [scalar] sampling rate of `y` S : np.ndarray [shape=(d, t)] or None log-power Mel spectrogram n_mfcc: int > 0 [scalar] number of MFCCs to return kwargs : additional keyword arguments Arguments to `melspectrogram`, if operating on time series input Returns ------- M : np.ndarray [shape=(n_mfcc, t)] MFCC sequence See Also -------- melspectrogram Examples -------- Generate mfccs from a time series >>> y, sr = librosa.load(librosa.util.example_audio_file()) >>> librosa.feature.mfcc(y=y, sr=sr) array([[ -5.229e+02, -4.944e+02, ..., -5.229e+02, -5.229e+02], [ 7.105e-15, 3.787e+01, ..., -7.105e-15, -7.105e-15], ..., [ 1.066e-14, -7.500e+00, ..., 1.421e-14, 1.421e-14], [ 3.109e-14, -5.058e+00, ..., 2.931e-14, 2.931e-14]]) Use a pre-computed log-power Mel spectrogram >>> S = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128, ... fmax=8000) >>> librosa.feature.mfcc(S=librosa.power_to_db(S)) array([[ -5.207e+02, -4.898e+02, ..., -5.207e+02, -5.207e+02], [ -2.576e-14, 4.054e+01, ..., -3.997e-14, -3.997e-14], ..., [ 7.105e-15, -3.534e+00, ..., 0.000e+00, 0.000e+00], [ 3.020e-14, -2.613e+00, ..., 3.553e-14, 3.553e-14]]) Get more components >>> mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=40) Visualize the MFCC series >>> import matplotlib.pyplot as plt >>> plt.figure(figsize=(10, 4)) >>> librosa.display.specshow(mfccs, x_axis='time') >>> plt.colorbar() >>> plt.title('MFCC') >>> plt.tight_layout() """ if S is None: S = power_to_db(melspectrogram(y=y, sr=sr, **kwargs)) return np.dot(filters.dct(n_mfcc, S.shape[0]), S)
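# Illustrative sketch (not part of the librosa API): MFCCs are an orthonormal
# type-II DCT of the log-power mel spectrogram, keeping the first n_mfcc rows.
# Hypothetical helper that approximates power_to_db with a plain 10*log10,
# assuming a mel power spectrogram `M` of shape (n_mels, t):
import numpy as np
import scipy.fftpack

def mfcc_sketch(M, n_mfcc=20):
    log_M = 10.0 * np.log10(np.maximum(M, 1e-10))
    return scipy.fftpack.dct(log_M, axis=0, type=2, norm='ortho')[:n_mfcc]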
[docs]def melspectrogram(y=None, sr=22050, S=None, n_fft=2048, hop_length=512, power=2.0, **kwargs): """Compute a mel-scaled spectrogram. If a spectrogram input `S` is provided, then it is mapped directly onto the mel basis `mel_f` by `mel_f.dot(S)`. If a time-series input `y, sr` is provided, then its magnitude spectrogram `S` is first computed, and then mapped onto the mel scale by `mel_f.dot(S**power)`. By default, `power=2` operates on a power spectrum. Parameters ---------- y : np.ndarray [shape=(n,)] or None audio time-series sr : number > 0 [scalar] sampling rate of `y` S : np.ndarray [shape=(d, t)] spectrogram n_fft : int > 0 [scalar] length of the FFT window hop_length : int > 0 [scalar] number of samples between successive frames. See `librosa.core.stft` power : float > 0 [scalar] Exponent for the magnitude melspectrogram. e.g., 1 for energy, 2 for power, etc. kwargs : additional keyword arguments Mel filter bank parameters. See `librosa.filters.mel` for details. Returns ------- S : np.ndarray [shape=(n_mels, t)] Mel spectrogram See Also -------- librosa.filters.mel Mel filter bank construction librosa.core.stft Short-time Fourier Transform Examples -------- >>> y, sr = librosa.load(librosa.util.example_audio_file()) >>> librosa.feature.melspectrogram(y=y, sr=sr) array([[ 2.891e-07, 2.548e-03, ..., 8.116e-09, 5.633e-09], [ 1.986e-07, 1.162e-02, ..., 9.332e-08, 6.716e-09], ..., [ 3.668e-09, 2.029e-08, ..., 3.208e-09, 2.864e-09], [ 2.561e-10, 2.096e-09, ..., 7.543e-10, 6.101e-10]]) Using a pre-computed power spectrogram >>> D = np.abs(librosa.stft(y))**2 >>> S = librosa.feature.melspectrogram(S=D) >>> # Passing through arguments to the Mel filters >>> S = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128, ... fmax=8000) >>> import matplotlib.pyplot as plt >>> plt.figure(figsize=(10, 4)) >>> librosa.display.specshow(librosa.power_to_db(S, ... ref=np.max), ... y_axis='mel', fmax=8000, ... x_axis='time') >>> plt.colorbar(format='%+2.0f dB') >>> plt.title('Mel spectrogram') >>> plt.tight_layout() """ S, n_fft = _spectrogram(y=y, S=S, n_fft=n_fft, hop_length=hop_length, power=power) # Build a Mel filter mel_basis = filters.mel(sr, n_fft, **kwargs) return np.dot(mel_basis, S)
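# Illustrative sketch (not part of the librosa API): a mel spectrogram is a mel
# filter bank applied to a power spectrogram. Hypothetical helper reusing
# librosa's own STFT and filter bank, assuming an audio signal `y` and `sr`:
import numpy as np
import librosa

def melspec_sketch(y, sr, n_fft=2048, hop_length=512, n_mels=128):
    S = np.abs(librosa.stft(y, n_fft=n_fft, hop_length=hop_length))**2
    mel_fb = librosa.filters.mel(sr, n_fft, n_mels=n_mels)    # (n_mels, 1 + n_fft // 2)
    return mel_fb.dot(S)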