#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Core IO, DSP and utility functions."""
import os
import six
import audioread
import numpy as np
import scipy.signal
import scipy.fftpack as fft
import resampy
from .time_frequency import frames_to_samples, time_to_samples
from .. import cache
from .. import util
from ..util.exceptions import ParameterError
__all__ = ['load', 'to_mono', 'resample', 'get_duration',
'autocorrelate', 'zero_crossings', 'clicks']
# Resampling bandwidths as percentage of Nyquist
BW_BEST = resampy.filters.get_filter('kaiser_best')[2]
BW_FASTEST = resampy.filters.get_filter('kaiser_fast')[2]
# -- CORE ROUTINES --#
# Load should never be cached, since we cannot verify that the contents of
# 'path' are unchanged across calls.
[docs]def load(path, sr=22050, mono=True, offset=0.0, duration=None,
dtype=np.float32, res_type='kaiser_best'):
"""Load an audio file as a floating point time series.
Audio will be automatically resampled to the given rate
(default `sr=22050`).
To preserve the native sampling rate of the file, use `sr=None`.
Parameters
----------
path : string
path to the input file.
Any format supported by `audioread` will work.
sr : number > 0 [scalar]
target sampling rate
'None' uses the native sampling rate
mono : bool
convert signal to mono
offset : float
start reading after this time (in seconds)
duration : float
only load up to this much audio (in seconds)
dtype : numeric type
data type of `y`
res_type : str
resample type (see note)
.. note::
By default, this uses `resampy`'s high-quality mode ('kaiser_best').
To use a faster method, set `res_type='kaiser_fast'`.
To use `scipy.signal.resample`, set `res_type='scipy'`.
Returns
-------
y : np.ndarray [shape=(n,) or (2, n)]
audio time series
sr : number > 0 [scalar]
sampling rate of `y`
Examples
--------
>>> # Load a wav file
>>> filename = librosa.util.example_audio_file()
>>> y, sr = librosa.load(filename)
>>> y
array([ -4.756e-06, -6.020e-06, ..., -1.040e-06, 0.000e+00], dtype=float32)
>>> sr
22050
>>> # Load a wav file and resample to 11 KHz
>>> filename = librosa.util.example_audio_file()
>>> y, sr = librosa.load(filename, sr=11025)
>>> y
array([ -2.077e-06, -2.928e-06, ..., -4.395e-06, 0.000e+00], dtype=float32)
>>> sr
11025
>>> # Load 5 seconds of a wav file, starting 15 seconds in
>>> filename = librosa.util.example_audio_file()
>>> y, sr = librosa.load(filename, offset=15.0, duration=5.0)
>>> y
array([ 0.069, 0.1 , ..., -0.101, 0. ], dtype=float32)
>>> sr
22050
"""
y = []
with audioread.audio_open(os.path.realpath(path)) as input_file:
sr_native = input_file.samplerate
n_channels = input_file.channels
s_start = int(np.round(sr_native * offset)) * n_channels
if duration is None:
s_end = np.inf
else:
s_end = s_start + (int(np.round(sr_native * duration))
* n_channels)
n = 0
for frame in input_file:
frame = util.buf_to_float(frame, dtype=dtype)
n_prev = n
n = n + len(frame)
if n < s_start:
# offset is after the current frame
# keep reading
continue
if s_end < n_prev:
# we're off the end. stop reading
break
if s_end < n:
# the end is in this frame. crop.
frame = frame[:s_end - n_prev]
if n_prev <= s_start <= n:
# beginning is in this frame
frame = frame[(s_start - n_prev):]
# tack on the current frame
y.append(frame)
if y:
y = np.concatenate(y)
if n_channels > 1:
y = y.reshape((-1, n_channels)).T
if mono:
y = to_mono(y)
if sr is not None:
y = resample(y, sr_native, sr, res_type=res_type)
else:
sr = sr_native
# Final cleanup for dtype and contiguity
y = np.ascontiguousarray(y, dtype=dtype)
return (y, sr)
[docs]@cache(level=20)
def to_mono(y):
'''Force an audio signal down to mono.
Parameters
----------
y : np.ndarray [shape=(2,n) or shape=(n,)]
audio time series, either stereo or mono
Returns
-------
y_mono : np.ndarray [shape=(n,)]
`y` as a monophonic time-series
Notes
-----
This function caches at level 20.
Examples
--------
>>> y, sr = librosa.load(librosa.util.example_audio_file(), mono=False)
>>> y.shape
(2, 1355168)
>>> y_mono = librosa.to_mono(y)
>>> y_mono.shape
(1355168,)
'''
# Validate the buffer. Stereo is ok here.
util.valid_audio(y, mono=False)
if y.ndim > 1:
y = np.mean(y, axis=0)
return y
[docs]@cache(level=20)
def resample(y, orig_sr, target_sr, res_type='kaiser_best', fix=True, scale=False, **kwargs):
"""Resample a time series from orig_sr to target_sr
Parameters
----------
y : np.ndarray [shape=(n,) or shape=(2, n)]
audio time series. Can be mono or stereo.
orig_sr : number > 0 [scalar]
original sampling rate of `y`
target_sr : number > 0 [scalar]
target sampling rate
res_type : str
resample type (see note)
.. note::
By default, this uses `resampy`'s high-quality mode ('kaiser_best').
To use a faster method, set `res_type='kaiser_fast'`.
To use `scipy.signal.resample`, set `res_type='scipy'`.
fix : bool
adjust the length of the resampled signal to be of size exactly
`ceil(target_sr * len(y) / orig_sr)`
scale : bool
Scale the resampled signal so that `y` and `y_hat` have approximately
equal total energy.
kwargs : additional keyword arguments
If `fix==True`, additional keyword arguments to pass to
`librosa.util.fix_length`.
Returns
-------
y_hat : np.ndarray [shape=(n * target_sr / orig_sr,)]
`y` resampled from `orig_sr` to `target_sr`
See Also
--------
librosa.util.fix_length
scipy.signal.resample
resampy.resample
Notes
-----
This function caches at level 20.
Examples
--------
Downsample from 22 KHz to 8 KHz
>>> y, sr = librosa.load(librosa.util.example_audio_file(), sr=22050)
>>> y_8k = librosa.resample(y, sr, 8000)
>>> y.shape, y_8k.shape
((1355168,), (491671,))
"""
# First, validate the audio buffer
util.valid_audio(y, mono=False)
if orig_sr == target_sr:
return y
ratio = float(target_sr) / orig_sr
n_samples = int(np.ceil(y.shape[-1] * ratio))
if res_type == 'scipy':
y_hat = scipy.signal.resample(y, n_samples, axis=-1)
else:
y_hat = resampy.resample(y, orig_sr, target_sr, filter=res_type, axis=-1)
if fix:
y_hat = util.fix_length(y_hat, n_samples, **kwargs)
if scale:
y_hat /= np.sqrt(ratio)
return np.ascontiguousarray(y_hat, dtype=y.dtype)
[docs]def get_duration(y=None, sr=22050, S=None, n_fft=2048, hop_length=512,
center=True, filename=None):
"""Compute the duration (in seconds) of an audio time series,
feature matrix, or filename.
Examples
--------
>>> # Load the example audio file
>>> y, sr = librosa.load(librosa.util.example_audio_file())
>>> librosa.get_duration(y=y, sr=sr)
61.44
>>> # Or directly from an audio file
>>> librosa.get_duration(filename=librosa.util.example_audio_file())
61.4
>>> # Or compute duration from an STFT matrix
>>> y, sr = librosa.load(librosa.util.example_audio_file())
>>> S = librosa.stft(y)
>>> librosa.get_duration(S=S, sr=sr)
61.44
>>> # Or a non-centered STFT matrix
>>> S_left = librosa.stft(y, center=False)
>>> librosa.get_duration(S=S_left, sr=sr)
61.3471201814059
Parameters
----------
y : np.ndarray [shape=(n,), (2, n)] or None
audio time series
sr : number > 0 [scalar]
audio sampling rate of `y`
S : np.ndarray [shape=(d, t)] or None
STFT matrix, or any STFT-derived matrix (e.g., chromagram
or mel spectrogram).
n_fft : int > 0 [scalar]
FFT window size for `S`
hop_length : int > 0 [ scalar]
number of audio samples between columns of `S`
center : boolean
- If `True`, `S[:, t]` is centered at `y[t * hop_length]`
- If `False`, then `S[:, t]` begins at `y[t * hop_length]`
filename : str
If provided, all other parameters are ignored, and the
duration is calculated directly from the audio file.
Note that this avoids loading the contents into memory,
and is therefore useful for querying the duration of
long files.
Returns
-------
d : float >= 0
Duration (in seconds) of the input time series or spectrogram.
"""
if filename is not None:
with audioread.audio_open(filename) as fdesc:
return fdesc.duration
if y is None:
assert S is not None
n_frames = S.shape[1]
n_samples = n_fft + hop_length * (n_frames - 1)
# If centered, we lose half a window from each end of S
if center:
n_samples = n_samples - 2 * int(n_fft / 2)
else:
# Validate the audio buffer. Stereo is okay here.
util.valid_audio(y, mono=False)
if y.ndim == 1:
n_samples = len(y)
else:
n_samples = y.shape[-1]
return float(n_samples) / sr
[docs]@cache(level=20)
def autocorrelate(y, max_size=None, axis=-1):
"""Bounded auto-correlation
Parameters
----------
y : np.ndarray
array to autocorrelate
max_size : int > 0 or None
maximum correlation lag.
If unspecified, defaults to `y.shape[axis]` (unbounded)
axis : int
The axis along which to autocorrelate.
By default, the last axis (-1) is taken.
Returns
-------
z : np.ndarray
truncated autocorrelation `y*y` along the specified axis.
If `max_size` is specified, then `z.shape[axis]` is bounded
to `max_size`.
Notes
-----
This function caches at level 20.
Examples
--------
Compute full autocorrelation of y
>>> y, sr = librosa.load(librosa.util.example_audio_file(), offset=20, duration=10)
>>> librosa.autocorrelate(y)
array([ 3.226e+03, 3.217e+03, ..., 8.277e-04, 3.575e-04], dtype=float32)
Compute onset strength auto-correlation up to 4 seconds
>>> import matplotlib.pyplot as plt
>>> odf = librosa.onset.onset_strength(y=y, sr=sr, hop_length=512)
>>> ac = librosa.autocorrelate(odf, max_size=4* sr / 512)
>>> plt.plot(ac)
>>> plt.title('Auto-correlation')
>>> plt.xlabel('Lag (frames)')
"""
if max_size is None:
max_size = y.shape[axis]
max_size = int(min(max_size, y.shape[axis]))
# Compute the power spectrum along the chosen axis
# Pad out the signal to support full-length auto-correlation.
powspec = np.abs(fft.fft(y, n=2 * y.shape[axis] + 1, axis=axis))**2
# Convert back to time domain
autocorr = fft.ifft(powspec, axis=axis, overwrite_x=True)
# Slice down to max_size
subslice = [slice(None)] * autocorr.ndim
subslice[axis] = slice(max_size)
autocorr = autocorr[subslice]
if not np.iscomplexobj(y):
autocorr = autocorr.real
return autocorr
[docs]@cache(level=20)
def zero_crossings(y, threshold=1e-10, ref_magnitude=None, pad=True,
zero_pos=True, axis=-1):
'''Find the zero-crossings of a signal `y`: indices `i` such that
`sign(y[i]) != sign(y[j])`.
If `y` is multi-dimensional, then zero-crossings are computed along
the specified `axis`.
Parameters
----------
y : np.ndarray
The input array
threshold : float > 0 or None
If specified, values where `-threshold <= y <= threshold` are
clipped to 0.
ref_magnitude : float > 0 or callable
If numeric, the threshold is scaled relative to `ref_magnitude`.
If callable, the threshold is scaled relative to
`ref_magnitude(np.abs(y))`.
pad : boolean
If `True`, then `y[0]` is considered a valid zero-crossing.
zero_pos : boolean
If `True` then the value 0 is interpreted as having positive sign.
If `False`, then 0, -1, and +1 all have distinct signs.
axis : int
Axis along which to compute zero-crossings.
Returns
-------
zero_crossings : np.ndarray [shape=y.shape, dtype=boolean]
Indicator array of zero-crossings in `y` along the selected axis.
Notes
-----
This function caches at level 20.
Examples
--------
>>> # Generate a time-series
>>> y = np.sin(np.linspace(0, 4 * 2 * np.pi, 20))
>>> y
array([ 0.000e+00, 9.694e-01, 4.759e-01, -7.357e-01,
-8.372e-01, 3.247e-01, 9.966e-01, 1.646e-01,
-9.158e-01, -6.142e-01, 6.142e-01, 9.158e-01,
-1.646e-01, -9.966e-01, -3.247e-01, 8.372e-01,
7.357e-01, -4.759e-01, -9.694e-01, -9.797e-16])
>>> # Compute zero-crossings
>>> z = librosa.zero_crossings(y)
>>> z
array([ True, False, False, True, False, True, False, False,
True, False, True, False, True, False, False, True,
False, True, False, True], dtype=bool)
>>> # Stack y against the zero-crossing indicator
>>> np.vstack([y, z]).T
array([[ 0.000e+00, 1.000e+00],
[ 9.694e-01, 0.000e+00],
[ 4.759e-01, 0.000e+00],
[ -7.357e-01, 1.000e+00],
[ -8.372e-01, 0.000e+00],
[ 3.247e-01, 1.000e+00],
[ 9.966e-01, 0.000e+00],
[ 1.646e-01, 0.000e+00],
[ -9.158e-01, 1.000e+00],
[ -6.142e-01, 0.000e+00],
[ 6.142e-01, 1.000e+00],
[ 9.158e-01, 0.000e+00],
[ -1.646e-01, 1.000e+00],
[ -9.966e-01, 0.000e+00],
[ -3.247e-01, 0.000e+00],
[ 8.372e-01, 1.000e+00],
[ 7.357e-01, 0.000e+00],
[ -4.759e-01, 1.000e+00],
[ -9.694e-01, 0.000e+00],
[ -9.797e-16, 1.000e+00]])
>>> # Find the indices of zero-crossings
>>> np.nonzero(z)
(array([ 0, 3, 5, 8, 10, 12, 15, 17, 19]),)
'''
# Clip within the threshold
if threshold is None:
threshold = 0.0
if six.callable(ref_magnitude):
threshold = threshold * ref_magnitude(np.abs(y))
elif ref_magnitude is not None:
threshold = threshold * ref_magnitude
if threshold > 0:
y = y.copy()
y[np.abs(y) <= threshold] = 0
# Extract the sign bit
if zero_pos:
y_sign = np.signbit(y)
else:
y_sign = np.sign(y)
# Find the change-points by slicing
slice_pre = [slice(None)] * y.ndim
slice_pre[axis] = slice(1, None)
slice_post = [slice(None)] * y.ndim
slice_post[axis] = slice(-1)
# Since we've offset the input by one, pad back onto the front
padding = [(0, 0)] * y.ndim
padding[axis] = (1, 0)
return np.pad((y_sign[slice_post] != y_sign[slice_pre]),
padding,
mode='constant',
constant_values=pad)
[docs]def clicks(times=None, frames=None, sr=22050, hop_length=512,
click_freq=1000.0, click_duration=0.1, click=None, length=None):
"""Returns a signal with the signal `click` placed at each specified time
Parameters
----------
times : np.ndarray or None
times to place clicks, in seconds
frames : np.ndarray or None
frame indices to place clicks
sr : number > 0
desired sampling rate of the output signal
hop_length : int > 0
if positions are specified by `frames`, the number of samples between frames.
click_freq : float > 0
frequency (in Hz) of the default click signal. Default is 1KHz.
click_duration : float > 0
duration (in seconds) of the default click signal. Default is 100ms.
click : np.ndarray or None
optional click signal sample to use instead of the default blip.
length : int > 0
desired number of samples in the output signal
Returns
-------
click_signal : np.ndarray
Synthesized click signal
Raises
------
ParameterError
- If neither `times` nor `frames` are provided.
- If any of `click_freq`, `click_duration`, or `length` are out of range.
Examples
--------
>>> # Sonify detected beat events
>>> y, sr = librosa.load(librosa.util.example_audio_file())
>>> tempo, beats = librosa.beat.beat_track(y=y, sr=sr)
>>> y_beats = librosa.clicks(frames=beats, sr=sr)
>>> # Or generate a signal of the same length as y
>>> y_beats = librosa.clicks(frames=beats, sr=sr, length=len(y))
>>> # Or use timing instead of frame indices
>>> times = librosa.frames_to_time(beats, sr=sr)
>>> y_beat_times = librosa.clicks(times=times, sr=sr)
>>> # Or with a click frequency of 880Hz and a 500ms sample
>>> y_beat_times880 = librosa.clicks(times=times, sr=sr,
... click_freq=880, click_duration=0.5)
Display click waveform next to the spectrogram
>>> import matplotlib.pyplot as plt
>>> plt.figure()
>>> S = librosa.feature.melspectrogram(y=y, sr=sr)
>>> ax = plt.subplot(2,1,2)
>>> librosa.display.specshow(librosa.power_to_db(S, ref=np.max),
... x_axis='time', y_axis='mel')
>>> plt.subplot(2,1,1, sharex=ax)
>>> librosa.display.waveplot(y_beat_times, sr=sr, label='Beat clicks')
>>> plt.legend()
>>> plt.xlim(15, 30)
>>> plt.tight_layout()
"""
# Compute sample positions from time or frames
if times is None:
if frames is None:
raise ParameterError('either "times" or "frames" must be provided')
positions = frames_to_samples(frames, hop_length=hop_length)
else:
# Convert times to positions
positions = time_to_samples(times, sr=sr)
if click is not None:
# Check that we have a well-formed audio buffer
util.valid_audio(click, mono=True)
else:
# Create default click signal
if click_duration <= 0:
raise ParameterError('click_duration must be strictly positive')
if click_freq <= 0:
raise ParameterError('click_freq must be strictly positive')
angular_freq = 2 * np.pi * click_freq / float(sr)
click = np.logspace(0, -10,
num=int(np.round(sr * click_duration)),
base=2.0)
click *= np.sin(angular_freq * np.arange(len(click)))
# Set default length
if length is None:
length = positions.max() + click.shape[0]
else:
if length < 1:
raise ParameterError('length must be a positive integer')
# Filter out any positions past the length boundary
positions = positions[positions < length]
# Pre-allocate click signal
click_signal = np.zeros(length, dtype=np.float32)
# Place clicks
for start in positions:
# Compute the end-point of this click
end = start + click.shape[0]
if end >= length:
click_signal[start:] += click[:length - start]
else:
# Normally, just add a click here
click_signal[start:end] += click
return click_signal