Source code for nussl.separation.primitive.melodia

import numpy as np
import scipy.signal
from scipy.ndimage import convolve

from .. import MaskSeparationBase, SeparationException
from ..benchmark import HighLowPassFilter
from ... import AudioSignal
from ... import vamp_imported

if vamp_imported:
    import vamp

# function for generating the vocal cord (glottal) impulse response
def rosenmodel(t1, t2, fs):
    """
    This model for generating singing vowel sounds from sine tones comes
    from:

    https://simonl02.users.greyc.fr/files/Documents/Teaching/SignalProcessingLabs/lab3.pdf

    The above will be referred to throughout these docstrings as THE DOCUMENT.

    Original author: Fatemeh Pishdadian

    The arguments to the functions throughout this text follow the
    signatures laid out in THE DOCUMENT.

    This is used in Melodia to generate the melody signal to produce a 
    mask.

    Equation 2 in THE DOCUMENT.
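
    Example -- a quick sanity check of the impulse response. The t1/t2 values
    match the defaults used by `_apply_vowel_filter`; the sample rate is an
    illustrative assumption::

        >>> fs = 16000
        >>> pulse = rosenmodel(0.0075, 0.013, fs)  # ~20.5 ms glottal pulse
        >>> pulse.ndim
        1
        >>> float(pulse.max())  # peaks at the N1/N2 boundary of Equation 2
        1.0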
    """

    N1 = int(np.floor(t1 * fs))
    N2 = int(np.floor(t2 * fs))

    samp_vec1 = np.arange(N1 + 1)
    samp_vec2 = np.arange(N1, N1 + N2 + 1)

    ir_func1 = 0.5 * (1 - np.cos((np.pi * samp_vec1) / N1))
    ir_func2 = np.cos(np.pi * (samp_vec2 - N1) / (2 * N2))

    vchord_filt = np.concatenate((ir_func1, ir_func2))

    return vchord_filt


# function for computing the denominator coefficients of the oral cavity filter transfer function
def oral_cavity_filt(pole_amps, pole_freqs, fs):
    """
    This model for generating singing vowel sounds from sine tones comes
    from:

    https://simonl02.users.greyc.fr/files/Documents/Teaching/SignalProcessingLabs/lab3.pdf

    The above will be referred to throughout these docstrings as THE DOCUMENT.

    Original author: Fatemeh Pishdadian

    The arguments to the functions throughout this text follow the
    signatures laid out in THE DOCUMENT.

    This is used in Melodia to generate the melody signal to produce a 
    mask.

    Solves "Q. Write a function to synthesize filter H(z)" in 
    THE DOCUMENT
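
    Example -- using the default "E"-vowel poles from `_apply_vowel_filter`
    (the 16 kHz sample rate is an illustrative assumption)::

        >>> pole_amps = np.array([0.99, 0.98, 0.9, 0.9])
        >>> pole_freqs = np.array([800, 1200, 2800, 3600])
        >>> coeffs = oral_cavity_filt(pole_amps, pole_freqs, 16000)
        >>> len(coeffs)  # four conjugate pole pairs -> order-8 denominator
        9
        >>> bool(np.allclose(coeffs.imag, 0))  # conjugate pairs keep the filter real
        True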
    """
    num_pole_pair = len(pole_amps)
    poles = pole_amps * np.exp(1j * 2 * np.pi * pole_freqs / fs)
    poles_conj = np.conj(poles)

    denom_coeffs = 1
    for i in range(num_pole_pair):
        pole_temp = poles[i]
        pole_conj_temp = poles_conj[i]
        pole_pair_coeffs = np.convolve(
            np.array([1, -pole_temp]), np.array([1, -pole_conj_temp]))

        denom_coeffs = np.convolve(denom_coeffs, pole_pair_coeffs)
    return denom_coeffs

def _apply_vowel_filter(impulse_train, fs, t1=0.0075, t2=0.013,
                        pole_amps=None, pole_freqs=None):
    """
    This model for generating singing vowel sounds from sine tones comes
    from:

    https://simonl02.users.greyc.fr/files/Documents/Teaching/SignalProcessingLabs/lab3.pdf

    The above will be referred to throughout these docstrings as THE DOCUMENT.

    Original author: Fatemeh Pishdadian

    The arguments to the functions throughout this text follow the
    signatures laid out in THE DOCUMENT.

    This is used in Melodia to generate the melody signal to produce a 
    mask.
    
    Args:
        impulse_train (np.ndarray): Numpy array with data to be filtered
        fs (int): Sample rate of audio.
        t1 (float, optional): Duration (in seconds) that determines N1 in Equation 2
          of THE DOCUMENT. Defaults to 0.0075.
        t2 (float, optional): Duration (in seconds) that determines N2 in Equation 2
          of THE DOCUMENT. Defaults to 0.013.
        pole_amps (np.ndarray, optional): Pole amplitudes, see Figures 2-4 in THE DOCUMENT.
          Defaults to None, which maps to the "E" vowel.
        pole_freqs (np.ndarray, optional): Pole frequencies, see Figures 2-4 in THE DOCUMENT.
          Defaults to None, which maps to the "E" vowel.
    
    Returns:
        np.ndarray: Filtered impulse train that should sound sort of like the desired 
          vowel.
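
    Example -- an illustrative 100 Hz impulse train at a 16 kHz sample rate
    (values chosen for demonstration, not taken from THE DOCUMENT)::

        >>> fs = 16000
        >>> impulse_train = np.zeros(fs)       # one second of audio
        >>> impulse_train[::fs // 100] = 1.0   # one impulse every 10 ms
        >>> vowel = _apply_vowel_filter(impulse_train, fs)
        >>> vowel.shape == impulse_train.shape
        True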
    """
    if pole_amps is None:
        pole_amps = np.array([0.99, 0.98, 0.9, 0.9])
    if pole_freqs is None:
        pole_freqs = np.array([800, 1200, 2800, 3600])
        
    vchord_filt = rosenmodel(t1, t2, fs)
    vchord_out = np.convolve(impulse_train, vchord_filt)
        
    denom_coeffs = oral_cavity_filt(pole_amps, pole_freqs, fs)
    oral_out = scipy.signal.lfilter(
        np.array([1]), denom_coeffs, vchord_out)
    lip_out = np.real(scipy.signal.lfilter(
        np.array([1, -1]), np.array([1]), oral_out))
    
    lip_out = lip_out[:impulse_train.shape[0]]
    
    return np.real(lip_out)


class Melodia(MaskSeparationBase):
    """
    Implements melody extraction using Melodia [1].

    This needs Melodia installed as a vamp plugin, as well as having vampy for
    Python installed. Install Melodia via: https://www.upf.edu/web/mtg/melodia.
    Note that Melodia can be used only for NON-COMMERCIAL use.

    References:

        [1] J. Salamon and E. Gómez, "Melody Extraction from Polyphonic Music Signals
            using Pitch Contour Characteristics", IEEE Transactions on Audio, Speech
            and Language Processing, 20(6):1759-1770, Aug. 2012.

    Args:
        input_audio_signal (AudioSignal object): The AudioSignal object that has the
          audio data that Melodia will be run on.
        high_pass_cutoff (optional, float): value (in Hz) for the high pass cutoff
          filter.
        minimum_frequency (optional, float): minimum frequency in Hertz (default 55.0).
        maximum_frequency (optional, float): maximum frequency in Hertz (default 1760.0).
        voicing_tolerance (optional, float): Greater values will result in more pitch
          contours included in the final melody. Smaller values will result in fewer
          pitch contours included in the final melody (default 0.2).
        minimum_peak_salience (optional, float): a hack to avoid silence turning into
          junk contours when analyzing monophonic recordings (e.g. solo voice with no
          accompaniment). Generally you want to leave this untouched (default 0.0).
        compression (optional, float): Exponent applied to the melody-signal STFT when
          building the mask (default 0.5).
        num_overtones (optional, int): Number of overtones to use when creating the
          melody mask.
        apply_vowel_filter (optional, bool): Whether or not to apply a vowel filter
          on the resynthesized melody signal when masking.
        smooth_length (optional, int): Number of frames over which to smooth
          discontinuities in the mask.
        add_lower_octave (optional, bool): Use the octave below the fundamental
          frequency as well, to take care of octave errors in pitch tracking, since
          we only care about the mask. Defaults to False.
        mask_type (optional, str): Type of mask to use.
        mask_threshold (optional, float): Threshold for mask to convert to binary.
    """
    def __init__(self, input_audio_signal, high_pass_cutoff=100, minimum_frequency=55.0,
                 maximum_frequency=1760.0, voicing_tolerance=0.2, minimum_peak_salience=0.0,
                 compression=0.5, num_overtones=40, apply_vowel_filter=False,
                 smooth_length=5, add_lower_octave=False, mask_type='soft',
                 mask_threshold=0.5):
        # lazy load vamp to check if it exists
        from ... import vamp_imported

        melodia_installed = False
        if vamp_imported:
            melodia_installed = 'mtg-melodia:melodia' in vamp.list_plugins()

        if not vamp_imported or not melodia_installed:
            self._raise_vamp_melodia_error()

        super().__init__(
            input_audio_signal=input_audio_signal,
            mask_type=mask_type,
            mask_threshold=mask_threshold
        )

        self.high_pass_cutoff = high_pass_cutoff
        self.minimum_frequency = float(minimum_frequency)
        self.maximum_frequency = float(maximum_frequency)
        self.voicing_tolerance = float(voicing_tolerance)
        self.minimum_peak_salience = float(minimum_peak_salience)
        self.compression = compression
        self.apply_vowel_filter = apply_vowel_filter
        self.add_lower_octave = add_lower_octave

        self.melody = None
        self.melody_signal = None
        self.timestamps = None

        self.num_overtones = num_overtones
        self.smooth_length = smooth_length

    def _raise_vamp_melodia_error(self):
        raise SeparationException(
            '\n**~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~**'
            '\n* Are Vamp and Melodia installed correctly?                    *'
            '\n* Check https://bit.ly/2DXbrAk for installation instructions!  *'
            '\n**~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~**')

    def extract_melody(self):
        """
        Extracts the melody from the audio using the Melodia vamp plugin.

        Uses arguments kept in self:

        - `self.minimum_frequency` (default: 55 Hz)
        - `self.maximum_frequency` (default: 1760 Hz)
        - `self.voicing_tolerance` (default: 0.2)
        - `self.minimum_peak_salience` (default: 0.0)

        This function sets two class members used in other parts:

        - `self.melody`: (numpy array) contains the melody in Hz for every timestep
          (0 indicates no voice).
        - `self.timestamps`: (numpy array) contains the timestamps for each melody note.
        """
        params = {
            'minfqr': self.minimum_frequency,
            'maxfqr': self.maximum_frequency,
            'voicing': self.voicing_tolerance,
            'minpeaksalience': self.minimum_peak_salience
        }

        data = vamp.collect(self.audio_signal.audio_data, self.sample_rate,
                            "mtg-melodia:melodia", parameters=params)

        _, melody = data['vector']
        hop = 128. / 44100.  # hard-coded hop in the Melodia vamp plugin, in seconds
        timestamps = 8 * hop + np.arange(len(melody)) * hop
        melody[melody < 0] = 0
        self.melody = melody
        self.timestamps = timestamps

    def create_melody_signal(self, num_overtones):
        """
        Adapted from Melosynth by Justin Salamon:
        https://github.com/justinsalamon/melosynth.

        To mask the mixture, we need to identify time-frequency bins that belong
        to the melody. Melodia outputs only the fundamental frequency of the
        melodic line. To construct the mask, we take the fundamental frequency and
        add all of its overtones (up to num_overtones) to the mask. The melody is
        faded in and out at onsets and offsets to make the separation sound more
        natural (hard-coded by transition_length).

        Args:
            num_overtones (int): Number of overtones to expand out to build the mask.
        """
        if self.timestamps[0] > 0:
            estimated_hop = np.median(np.diff(self.timestamps))
            previous_time = max(self.timestamps[0] - estimated_hop, 0)
            self.timestamps = np.insert(self.timestamps, 0, previous_time)
            self.melody = np.insert(self.melody, 0, 0)

        sample_rate = self.audio_signal.sample_rate
        melody_signal = []
        transition_length = .001  # duration for fade in/out and frequency interpolation
        phase = np.zeros(num_overtones)
        previous_frequency = 0
        previous_time = 0

        overtone_weights = np.ones(num_overtones)

        for time, frequency in zip(self.timestamps, self.melody):
            if self.add_lower_octave:
                # taking care of octave errors, since we only care about masking
                frequency = frequency / 2

            num_samples = int(np.round((time - previous_time) * sample_rate))
            if num_samples > 0:
                num_transition_samples = float(
                    min(np.round(transition_length * sample_rate), num_samples))
                frequency_series = np.ones(num_samples) * previous_frequency

                if previous_frequency > 0 and frequency > 0:
                    frequency_series += np.minimum(
                        np.arange(num_samples) / num_transition_samples, 1) * \
                        (frequency - previous_frequency)
                elif frequency > 0:
                    frequency_series = np.ones(num_samples) * frequency

                samples = np.zeros(num_samples)

                for overtone in range(num_overtones):
                    overtone_num = overtone + 1
                    phasors = 2 * np.pi * overtone_num * frequency_series / float(sample_rate)
                    phases = phase[overtone] + np.cumsum(phasors)
                    samples += overtone_weights[overtone] * np.sign(np.sin(phases))
                    phase[overtone] = phases[-1]

                if previous_frequency == 0 and frequency > 0:
                    samples *= np.minimum(np.arange(num_samples) / num_transition_samples, 1)
                elif previous_frequency > 0 and frequency == 0:
                    samples *= np.maximum(1 - np.arange(num_samples) / num_transition_samples, 0)
                elif previous_frequency == 0 and frequency == 0:
                    samples *= 0

                melody_signal.extend(samples)

            previous_frequency = frequency
            previous_time = time

        melody_signal = np.asarray(melody_signal)

        if self.apply_vowel_filter:
            melody_signal = _apply_vowel_filter(melody_signal, sample_rate)

        melody_signal /= float(max(np.max(melody_signal), 1e-7))
        melody_signal = [melody_signal for _ in range(self.audio_signal.num_channels)]
        melody_signal = np.asarray(melody_signal)
        melody_signal = melody_signal[:, 0:self.audio_signal.signal_length]
        melody_signal = AudioSignal(
            audio_data_array=melody_signal,
            sample_rate=sample_rate,
            stft_params=self.audio_signal.stft_params
        )
        self.melody_signal = melody_signal
        return melody_signal

    def create_harmonic_mask(self, melody_signal):
        """
        Creates a harmonic mask from the melody signal. The mask is smoothed to
        reduce the effects of discontinuities in the melody synthesizer.
        """
        stft = np.abs(melody_signal.stft())

        # Need to threshold the melody STFT since the synthesized
        # F0 sequence overtones are at different weights.
        stft = stft ** self.compression
        stft /= np.maximum(np.max(stft, axis=1, keepdims=True), 1e-7)

        mask = np.empty(self.stft.shape)

        # Smoothing the mask row-wise using a low-pass filter to
        # get rid of discontinuities in the mask.
        kernel = np.full((1, self.smooth_length), 1 / self.smooth_length)
        for ch in range(self.audio_signal.num_channels):
            mask[..., ch] = convolve(stft[..., ch], kernel)
        return mask

    def run(self):
        high_low = HighLowPassFilter(self.audio_signal, self.high_pass_cutoff)
        high_pass_masks = high_low.run()

        # separate the mixture foreground melody by masking
        if self.melody_signal is None:
            self.extract_melody()
            self.create_melody_signal(self.num_overtones)

        foreground_mask = self.create_harmonic_mask(self.melody_signal)
        foreground_mask = self.MASKS['soft'](foreground_mask)

        background_mask = foreground_mask.invert_mask()

        _masks = np.stack(
            [background_mask.mask, foreground_mask.mask], axis=-1)

        self.result_masks = []

        for i in range(_masks.shape[-1]):
            mask_data = _masks[..., i]
            if self.mask_type == self.MASKS['binary']:
                mask_data = _masks[..., i] == np.max(_masks, axis=-1)
            if i == 0:
                mask_data = np.maximum(mask_data, high_pass_masks[i].mask)
            elif i == 1:
                mask_data = np.minimum(mask_data, high_pass_masks[i].mask)
            mask = self.mask_type(mask_data)
            self.result_masks.append(mask)

        return self.result_masks
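
# A minimal usage sketch (illustrative, not part of this module): it assumes
# Vamp, vampy, and the Melodia plugin are installed, and that a file named
# 'mixture.wav' exists on disk. Melodia is then driven like any other nussl
# mask-based separator:
#
#     from nussl import AudioSignal
#     from nussl.separation.primitive import Melodia
#
#     mixture = AudioSignal('mixture.wav')
#     melodia = Melodia(mixture, mask_type='binary')
#     masks = melodia.run()  # [background mask, foreground/melody mask]
#     background, melody = melodia.make_audio_signals()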