import numpy as np
from scipy.ndimage.filters import convolve
from scipy.ndimage import maximum_filter, gaussian_filter
from .. import MaskSeparationBase, SeparationException
from ..benchmark import HighLowPassFilter
from ... import AudioSignal
from ... import vamp_imported
import numpy as np
import scipy.signal
if vamp_imported:
import vamp
# Function for generating the vocal cord (glottal pulse) impulse response
def rosenmodel(t1, t2, fs):
    """
    Build the vocal cord impulse response used to shape synthesized vowels.

    This model for generating singing vowel sounds from sine tones comes from:
    https://simonl02.users.greyc.fr/files/Documents/Teaching/SignalProcessingLabs/lab3.pdf
    (referred to below as THE DOCUMENT). The argument names follow the
    signatures laid out in THE DOCUMENT; this function is Equation 2 there.
    Melodia uses it to generate the melody signal from which a mask is made.

    Original author: Fatemeh Pishdadian

    Args:
        t1: Length of the rising segment; ``floor(t1 * fs)`` is N1.
        t2: Length of the falling segment; ``floor(t2 * fs)`` is N2.
        fs: Sample rate used to convert ``t1`` and ``t2`` to sample counts.

    Returns:
        np.ndarray: The rising segment (N1 + 1 samples) followed by the
        falling segment (N2 + 1 samples), concatenated into one vector.
    """
    rise_len = np.floor(t1 * fs)
    fall_len = np.floor(t2 * fs)

    # Sample indices for each segment; the falling one starts at N1.
    rise_idx = np.arange(rise_len + 1)
    fall_idx = np.arange(rise_len, rise_len + fall_len + 1)

    # Raised half-cosine going from 0 up to 1 ...
    rising = 0.5 * (1 - np.cos((np.pi * rise_idx) / rise_len))
    # ... then a quarter-cosine going from 1 back down to 0.
    falling = np.cos(np.pi * (fall_idx - rise_len) / (2 * fall_len))

    return np.concatenate((rising, falling))
# function for computing the denominator coeffs of the vocal cavity filter transfer function
def oral_cavity_filt(pole_amps, pole_freqs, fs):
    """
    Compute the denominator coefficients of the vocal cavity filter H(z).

    This model for generating singing vowel sounds from sine tones comes from:
    https://simonl02.users.greyc.fr/files/Documents/Teaching/SignalProcessingLabs/lab3.pdf
    (referred to below as THE DOCUMENT). The argument names follow the
    signatures laid out in THE DOCUMENT; this solves "Q. Write a function to
    synthesize filter H(z)" there. Melodia uses it to generate the melody
    signal from which a mask is made.

    Original author: Fatemeh Pishdadian

    Args:
        pole_amps: Magnitude of each pole (one entry per conjugate pair).
        pole_freqs: Frequency of each pole, in the same units as ``fs``.
        fs: Sample rate.

    Returns:
        The polynomial coefficients obtained by multiplying together one
        second-order factor per pole pair. The values are complex-typed
        because they are built from complex poles. With no poles the
        integer ``1`` is returned.
    """
    pair_count = len(pole_amps)
    poles = pole_amps * np.exp(1j * 2 * np.pi * pole_freqs / fs)
    conjugates = np.conj(poles)

    denom_coeffs = 1
    # One (1 - p z^-1)(1 - p* z^-1) factor per pole pair, multiplied into the
    # running polynomial via convolution of coefficient vectors.
    for pole, pole_conj in zip(poles[:pair_count], conjugates[:pair_count]):
        quadratic = np.convolve(np.array([1, -pole]), np.array([1, -pole_conj]))
        denom_coeffs = np.convolve(denom_coeffs, quadratic)

    return denom_coeffs
def _apply_vowel_filter(impulse_train, fs, t1=0.0075, t2=.013,
                        pole_amps=None, pole_freqs=None):
    """
    Filter a signal so that it sounds roughly like a sung vowel.

    This model for generating singing vowel sounds from sine tones comes from:
    https://simonl02.users.greyc.fr/files/Documents/Teaching/SignalProcessingLabs/lab3.pdf
    (referred to below as THE DOCUMENT). The argument names follow the
    signatures laid out in THE DOCUMENT. Melodia uses this to generate the
    melody signal from which a mask is made.

    Original author: Fatemeh Pishdadian

    Args:
        impulse_train (np.ndarray): Numpy array with data to be filtered.
        fs (int): Sample rate of audio.
        t1 (float, optional): N1 in Equation 2 in THE DOCUMENT. Defaults to 0.0075.
        t2 (float, optional): N2 in Equation 2 in THE DOCUMENT. Defaults to .013.
        pole_amps (np.ndarray, optional): Pole amplitudes, see Figures 2-4 in
            THE DOCUMENT. Defaults to None, which maps to E vowel.
        pole_freqs (np.ndarray, optional): Pole frequencies, see Figures 2-4 in
            THE DOCUMENT. Defaults to None, which maps to E vowel.

    Returns:
        np.ndarray: Real-valued filtered signal, cut to the length of
        ``impulse_train``, that should sound sort of like the desired vowel.
    """
    if pole_amps is None:
        pole_amps = np.array([0.99, 0.98, 0.9, 0.9])
    if pole_freqs is None:
        pole_freqs = np.array([800, 1200, 2800, 3600])

    # Stage 1: shape the input with the vocal cord impulse response.
    glottal = np.convolve(impulse_train, rosenmodel(t1, t2, fs))

    # Stage 2: all-pole filter whose denominator models the oral cavity.
    cavity_denominator = oral_cavity_filt(pole_amps, pole_freqs, fs)
    cavity = scipy.signal.lfilter(np.array([1]), cavity_denominator, glottal)

    # Stage 3: first-difference filter (the "lip" stage of the original code).
    # The denominator is complex-typed, so keep only the real part.
    radiated = np.real(
        scipy.signal.lfilter(np.array([1, -1]), np.array([1]), cavity))

    # The convolution in stage 1 lengthened the signal; trim it back.
    return radiated[:impulse_train.shape[0]]
class Melodia(MaskSeparationBase):
    """
    Implements melody extraction using Melodia [1].

    This needs Melodia installed as a vamp plugin, as well as having vampy for
    Python installed. Install Melodia via: https://www.upf.edu/web/mtg/melodia.
    Note that Melodia can be used only for NON-COMMERCIAL use.

    References:
        [1] J. Salamon and E. Gómez, "Melody Extraction from Polyphonic Music Signals using
            Pitch Contour Characteristics", IEEE Transactions on Audio, Speech and
            Language Processing, 20(6):1759-1770, Aug. 2012.

    Args:
        input_audio_signal (AudioSignal object): The AudioSignal object that has the
            audio data that Melodia will be run on.
        high_pass_cutoff (optional, float): value (in Hz) for the high pass cutoff
            filter.
        minimum_frequency (optional, float): minimum frequency in Hertz (default 55.0)
        maximum_frequency (optional, float): maximum frequency in Hertz (default 1760.0)
        voicing_tolerance (optional, float): Greater values will result in more pitch contours
            included in the final melody. Smaller values will result in less pitch
            contours included in the final melody (default 0.2).
        minimum_peak_salience (optional, float): a hack to avoid silence turning into junk
            contours when analyzing monophonic recordings (e.g. solo voice with
            no accompaniment). Generally you want to leave this untouched (default 0.0).
        compression (optional, float): Exponent applied to the magnitude STFT of the
            synthesized melody before it is normalized into a mask (default 0.5).
        num_overtones (optional, int): Number of overtones to use when creating
            melody mask.
        apply_vowel_filter (optional, bool): Whether or not to apply a vowel filter
            on the resynthesized melody signal when masking.
        smooth_length (optional, int): Number of frames to smooth discontinuities in the
            mask.
        add_lower_octave (optional, bool): Use octave below fundamental frequency as well
            to take care of octave errors in pitch tracking, since we only care about
            the mask. Defaults to False.
        mask_type (optional, str): Type of mask to use.
        mask_threshold (optional, float): Threshold for mask to convert to binary.
    """

    def __init__(self, input_audio_signal, high_pass_cutoff=100, minimum_frequency=55.0,
                 maximum_frequency=1760.0, voicing_tolerance=0.2, minimum_peak_salience=0.0,
                 compression=0.5, num_overtones=40, apply_vowel_filter=False, smooth_length=5,
                 add_lower_octave=False, mask_type='soft', mask_threshold=0.5):
        # lazy load vamp to check if it exists
        from ... import vamp_imported

        melodia_installed = False
        if vamp_imported:
            melodia_installed = 'mtg-melodia:melodia' in vamp.list_plugins()

        # melodia_installed can only be True when vamp itself was imported, so
        # this single check covers both "no vamp" and "no Melodia plugin".
        if not melodia_installed:
            self._raise_vamp_melodia_error()

        super().__init__(
            input_audio_signal=input_audio_signal,
            mask_type=mask_type,
            mask_threshold=mask_threshold
        )

        self.high_pass_cutoff = high_pass_cutoff
        self.minimum_frequency = float(minimum_frequency)
        self.maximum_frequency = float(maximum_frequency)
        self.voicing_tolerance = float(voicing_tolerance)
        self.minimum_peak_salience = float(minimum_peak_salience)
        self.compression = compression
        self.apply_vowel_filter = apply_vowel_filter
        self.add_lower_octave = add_lower_octave

        # Filled in lazily by extract_melody / create_melody_signal.
        self.melody = None
        self.melody_signal = None
        self.timestamps = None

        self.num_overtones = num_overtones
        self.smooth_length = smooth_length

    def _raise_vamp_melodia_error(self):
        """Raise a SeparationException pointing at the install instructions."""
        raise SeparationException(
            '\n**~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~**'
            '\n* Are Vamp and Melodia installed correctly? *'
            '\n* Check https://bit.ly/2DXbrAk for installation instructions! *'
            '\n**~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~**')

    def extract_melody(self):
        """
        Extracts melody from the audio using the melodia vamp plugin. Uses arguments kept
        in self:

        - `self.minimum_frequency` (default: 55 Hz)
        - `self.maximum_frequency` (default: 1760 Hz)
        - `self.voicing_tolerance` (default: 0.2)
        - `self.minimum_peak_salience` (default: 0.0)

        This function sets two class members used in other parts:

        - `self.melody`: (numpy array) contains the melody in Hz for every timestep
          (0 indicates no voice).
        - `self.timestamps`: (numpy array) contains the timestamps for each melody note
        """
        params = {
            'minfqr': self.minimum_frequency,
            'maxfqr': self.maximum_frequency,
            'voicing': self.voicing_tolerance,
            'minpeaksalience': self.minimum_peak_salience
        }

        data = vamp.collect(self.audio_signal.audio_data, self.sample_rate,
                            "mtg-melodia:melodia", parameters=params)

        _, melody = data['vector']

        hop = 128. / 44100.  # hard coded hop in Melodia vamp plugin, converting it to frames.
        timestamps = 8 * hop + np.arange(len(melody)) * hop

        # Negative values are zeroed so that 0 marks a frame without melody.
        melody[melody < 0] = 0

        self.melody = melody
        self.timestamps = timestamps

    def create_melody_signal(self, num_overtones):
        """
        Adapted from Melosynth by Justin Salamon: https://github.com/justinsalamon/melosynth.

        To mask the mixture, we need to identify time-frequency bins that belong to the
        melody. Melodia outputs only the fundamental frequency of the melodic line.
        To construct the mask we take the fundamental frequency and add all the
        overtones of it (up to num_overtones) to the mask. The melody is faded in and
        out at onsets and offsets to make the separation sound more natural
        (hard-coded by transition_length).

        The synthesized signal is truncated or zero-padded to exactly the length of
        the mixture so that its STFT lines up with the mixture's STFT. An empty
        melody yields an all-zero signal.

        Args:
            num_overtones (int): Number of overtones to expand out to build the mask.

        Returns:
            AudioSignal: The synthesized melody, repeated on every channel of the
            mixture. It is also stored in `self.melody_signal`.
        """
        # Make the melody start at (or near) time 0 with a silent frame.
        # The length guard lets an empty melody fall through to silence.
        if len(self.timestamps) > 0 and self.timestamps[0] > 0:
            if len(self.timestamps) > 1:
                estimated_hop = np.median(np.diff(self.timestamps))
            else:
                # A single frame has no spacing to measure (the median of an
                # empty diff is NaN), so anchor the silent frame at time 0.
                estimated_hop = self.timestamps[0]
            previous_time = max(self.timestamps[0] - estimated_hop, 0)
            self.timestamps = np.insert(self.timestamps, 0, previous_time)
            self.melody = np.insert(self.melody, 0, 0)

        sample_rate = self.audio_signal.sample_rate
        melody_signal = []
        transition_length = .001  # duration for fade in/out and frequency interpretation
        phase = np.zeros(num_overtones)
        previous_frequency = 0
        previous_time = 0

        overtone_weights = np.ones(num_overtones)

        for time, frequency in zip(self.timestamps, self.melody):
            if self.add_lower_octave:
                # taking care of octave errors since we only care about masking
                frequency = frequency / 2

            num_samples = int(np.round((time - previous_time) * sample_rate))

            if num_samples > 0:
                num_transition_samples = float(
                    min(np.round(transition_length * sample_rate), num_samples))
                frequency_series = np.ones(num_samples) * previous_frequency

                if previous_frequency > 0 and frequency > 0:
                    # Glide from the previous pitch to the new one.
                    frequency_series += np.minimum(
                        np.arange(num_samples) / num_transition_samples, 1) * \
                        (frequency - previous_frequency)
                elif frequency > 0:
                    frequency_series = np.ones(num_samples) * frequency

                samples = np.zeros(num_samples)

                for overtone in range(num_overtones):
                    overtone_num = overtone + 1
                    phasors = 2 * np.pi * overtone_num * frequency_series / float(sample_rate)
                    phases = phase[overtone] + np.cumsum(phasors)
                    samples += overtone_weights[overtone] * np.sign(np.sin(phases))
                    # Carry the phase over so consecutive segments join up.
                    phase[overtone] = phases[-1]

                if previous_frequency == 0 and frequency > 0:
                    # Onset: fade in.
                    samples *= np.minimum(np.arange(num_samples) / num_transition_samples, 1)
                elif previous_frequency > 0 and frequency == 0:
                    # Offset: fade out.
                    samples *= np.maximum(1 - np.arange(num_samples) / num_transition_samples, 0)
                elif previous_frequency == 0 and frequency == 0:
                    samples *= 0

                melody_signal.extend(samples)

                previous_frequency = frequency
                previous_time = time

        melody_signal = np.asarray(melody_signal, dtype=float)

        # Filtering and peak-normalizing are undefined on an empty signal, so
        # they are skipped when nothing was synthesized.
        if melody_signal.size > 0:
            if self.apply_vowel_filter:
                melody_signal = _apply_vowel_filter(melody_signal, sample_rate)
            melody_signal /= float(max(np.max(melody_signal), 1e-7))

        # Fit to the mixture length: truncate if too long, zero-pad if too
        # short. Without the padding the melody STFT can have fewer frames
        # than the mixture STFT and the mask assignment fails.
        target_length = self.audio_signal.signal_length
        fitted = np.zeros(target_length)
        num_kept = min(melody_signal.shape[0], target_length)
        fitted[:num_kept] = melody_signal[:num_kept]

        # Same melody on every channel of the mixture.
        melody_signal = np.asarray(
            [fitted for _ in range(self.audio_signal.num_channels)])

        melody_signal = AudioSignal(
            audio_data_array=melody_signal,
            sample_rate=sample_rate,
            stft_params=self.audio_signal.stft_params
        )

        self.melody_signal = melody_signal
        return melody_signal

    def create_harmonic_mask(self, melody_signal):
        """
        Creates a harmonic mask from the melody signal. The mask is smoothed to reduce
        the effects of discontinuities in the melody synthesizer.

        Args:
            melody_signal (AudioSignal): Synthesized melody, as produced by
                `create_melody_signal`.

        Returns:
            np.ndarray: Array with the shape of `self.stft` holding the smoothed,
            compressed and normalized magnitude of the melody STFT.
        """
        stft = np.abs(melody_signal.stft())

        # Need to threshold the melody stft since the synthesized
        # F0 sequence overtones are at different weights.
        stft = stft ** self.compression
        stft /= np.maximum(np.max(stft, axis=1, keepdims=True), 1e-7)

        mask = np.empty(self.stft.shape)

        # Smoothing the mask row-wise using a low-pass filter to
        # get rid of discontinuities in the mask.
        kernel = np.full((1, self.smooth_length), 1 / self.smooth_length)
        for ch in range(self.audio_signal.num_channels):
            mask[..., ch] = convolve(stft[..., ch], kernel)
        return mask

    def run(self):
        """
        Build the background and foreground (melody) masks for the mixture.

        The melody is extracted and synthesized only if `self.melody_signal` has
        not been set yet, so a previously computed melody is reused.

        Returns:
            list: `[background_mask, foreground_mask]`, each an instance of
            `self.mask_type`. Also stored in `self.result_masks`.
        """
        high_low = HighLowPassFilter(self.audio_signal, self.high_pass_cutoff)
        high_pass_masks = high_low.run()

        # separate the mixture foreground melody by masking
        if self.melody_signal is None:
            self.extract_melody()
            self.create_melody_signal(self.num_overtones)

        foreground_mask = self.create_harmonic_mask(self.melody_signal)
        foreground_mask = self.MASKS['soft'](foreground_mask)
        background_mask = foreground_mask.invert_mask()

        _masks = np.stack(
            [background_mask.mask, foreground_mask.mask], axis=-1)

        self.result_masks = []

        for i in range(_masks.shape[-1]):
            mask_data = _masks[..., i]
            if self.mask_type == self.MASKS['binary']:
                # Binary case: a bin goes to whichever source has the larger value.
                mask_data = _masks[..., i] == np.max(_masks, axis=-1)

            # Combine with the masks from HighLowPassFilter: the background
            # (i == 0) takes the element-wise maximum, the foreground (i == 1)
            # the element-wise minimum.
            # NOTE(review): presumably high_pass_masks is [low-pass, high-pass],
            # so content below the cutoff always lands in the background -
            # confirm against HighLowPassFilter.
            if i == 0:
                mask_data = np.maximum(mask_data, high_pass_masks[i].mask)
            elif i == 1:
                mask_data = np.minimum(mask_data, high_pass_masks[i].mask)

            mask = self.mask_type(mask_data)
            self.result_masks.append(mask)

        return self.result_masks