# Source code for nussl.separation.primitive.repet

import numpy as np
import scipy.fftpack as scifft

from .. import MaskSeparationBase, SeparationException
from ..benchmark import HighLowPassFilter
from ...core import constants


class Repet(MaskSeparationBase):
    """Implements the original REpeating Pattern Extraction Technique algorithm 
    using the beat spectrum.

    REPET is a simple method for separating a repeating background from a 
    non-repeating foreground in an audio mixture. It assumes a single repeating 
    period over the whole signal duration, and finds that period based on finding 
    a peak in the beat spectrum. The period can also be provided exactly, or you
    can give ``Repet`` a guess of the min and max period. Once it has a period, 
    it "overlays" spectrogram sections of length ``period`` to create a median 
    model (the background).
    
    References:

    [1] Rafii, Zafar, and Bryan Pardo.
        "Repeating pattern extraction technique (REPET): A simple method for 
        music/voice separation." IEEE transactions on audio, speech, 
        and language processing 21.1 (2012): 73-84.

    Args:
        input_audio_signal (AudioSignal): Signal to separate.

        min_period (float, optional): minimum time to look for repeating period in 
          terms of seconds.
        
        max_period (float, optional): maximum time to look for repeating period in 
          terms of seconds.
        
        period (float, optional): exact time that the repeating period is 
          (in seconds).
        
        high_pass_cutoff (float, optional): value (in Hz) for the high pass 
          cutoff filter.
                        
        mask_type (str, optional): Mask type. Defaults to 'soft'.

        mask_threshold (float, optional): Masking threshold. Defaults to 0.5.

    """

    def __init__(self, input_audio_signal, min_period=None, max_period=None,
                 period=None, high_pass_cutoff=100.0, mask_type='soft',
                 mask_threshold=0.5):
        """
        Sets up the separator with either an exact repeating period or a range
        in which to search for one.

        Raises:
            SeparationException: if ``period`` is given together with
              ``min_period`` or ``max_period``.
        """
        super().__init__(
            input_audio_signal=input_audio_signal,
            mask_type=mask_type,
            mask_threshold=mask_threshold)

        # An exact period and a search range are mutually exclusive.
        if (min_period or max_period) and period:
            raise SeparationException(
                'Cannot set both period and (min_period or max_period)!')

        # Always define the search bounds so the attributes exist even when an
        # exact period was supplied (they simply stay None in that case).
        self.min_period = None
        self.max_period = None
        self._is_period_converted_to_hops = False
        self.period = period

        if period is None:
            # Search range is kept in seconds here; it is converted to STFT
            # hops later, when the repeating period is actually computed.
            self.min_period = min_period if min_period else .8
            requested_max = max_period if max_period else 8
            # Never search for a period longer than a third of the signal.
            self.max_period = min(
                requested_max, self.audio_signal.signal_duration / 3)
        else:
            # The exact period is converted from seconds to STFT hops now.
            self.period = self._update_period(period)
            self._is_period_converted_to_hops = True

        self.high_pass_cutoff = high_pass_cutoff

        # Populated by run() / get_beat_spectrum().
        self.magnitude_spectrogram = None
        self.repeating_period = None
        self.beat_spectrum = None

    def run(self):
        """
        Builds the background (repeating) and foreground (non-repeating) masks
        for the audio signal.

        The per-channel repeating masks are combined with the masks returned
        by ``HighLowPassFilter`` and wrapped with ``self.mask_type``.

        Returns:
            list: ``[background_mask, foreground_mask]``; the same list is
            stored in ``self.result_masks``.
        """
        # NOTE(review): presumably index 0 is the low-pass mask and index 1 the
        # high-pass mask -- confirm against HighLowPassFilter.
        high_pass_masks = HighLowPassFilter(
            self.audio_signal, self.high_pass_cutoff).run()

        self.magnitude_spectrogram = np.abs(self.stft)
        self.repeating_period = self._calculate_repeating_period()

        # One repeating (background) mask per channel; the foreground is its
        # complement.
        repeating = [
            self._compute_repeating_mask(self.magnitude_spectrogram[..., ch])
            for ch in range(self.audio_signal.num_channels)
        ]
        background_masks = np.stack(repeating, axis=-1)
        foreground_masks = np.stack([1 - m for m in repeating], axis=-1)

        # Last axis indexes the source: 0 = background, 1 = foreground.
        stacked = np.stack([background_masks, foreground_masks], axis=-1)

        # Background takes the element-wise max with its filter mask,
        # foreground takes the element-wise min with its filter mask.
        combine_with_filter = {0: np.maximum, 1: np.minimum}

        self.result_masks = []
        for source_idx in range(stacked.shape[-1]):
            data = stacked[..., source_idx]
            if self.mask_type == self.MASKS['binary']:
                # A bin belongs to whichever source has the largest soft mask.
                data = stacked[..., source_idx] == np.max(stacked, axis=-1)

            combine = combine_with_filter.get(source_idx)
            if combine is not None:
                data = combine(data, high_pass_masks[source_idx].mask)

            self.result_masks.append(self.mask_type(data))

        return self.result_masks

    def get_beat_spectrum(self):
        """
        Calculates and returns the beat spectrum for the audio signal associated 
        with this object. The result is also stored in ``self.beat_spectrum``.

        If ``run()`` has not been called yet, the magnitude spectrogram is 
        computed here from ``self.stft`` (and stored in 
        ``self.magnitude_spectrogram``), so this method can be used on its own.

        Returns:
            beat_spectrum (np.array): beat spectrum for the audio file

        Example:

        .. code-block:: python
            :linenos:
            
            # Set up audio signal
            signal = nussl.AudioSignal('path_to_file.wav')

            # Set up a Repet object
            repet = nussl.Repet(signal)

            # I don't have to run repet to get a beat spectrum for signal
            beat_spec = repet.get_beat_spectrum()
            
        """
        # run() normally fills this in; compute it lazily the same way so the
        # method does not fail with np.square(None) when called before run().
        # NOTE(review): assumes self.stft is already available at this point,
        # as it is when run() reads it -- confirm against the base class.
        if self.magnitude_spectrogram is None:
            self.magnitude_spectrogram = np.abs(self.stft)

        # TODO: Make this multi-channel. The np.mean() reduces the n channels to 1.
        power_spectrogram = np.mean(
            np.square(self.magnitude_spectrogram),
            axis=constants.STFT_CHAN_INDEX)

        # compute_beat_spectrum autocorrelates along axis 0, hence the transpose.
        self.beat_spectrum = self.compute_beat_spectrum(power_spectrogram.T)
        return self.beat_spectrum

    def _calculate_repeating_period(self):
        # user provided a period, so no calculations to do
        if self.period is not None:
            return self.period

        # get beat spectrum
        self.beat_spectrum = self.get_beat_spectrum()

        # update the min and max so they're in units of time bin indices
        if not self._is_period_converted_to_hops:
            self.min_period = self._update_period(self.min_period)
            self.max_period = self._update_period(self.max_period)
            self._is_period_converted_to_hops = True

        self.repeating_period = self.find_repeating_period_simple(self.beat_spectrum,
                                                                  self.min_period, self.max_period)

        return self.repeating_period

    @staticmethod
    def compute_beat_spectrum(power_spectrogram):
        """ Computes the beat spectrum averages (over freq's) the autocorrelation matrix of a one-sided spectrogram.

        The autocorrelation matrix is computed by taking the autocorrelation of each row of the spectrogram and
        dismissing the symmetric half.

        Args:
            power_spectrogram (:obj:`np.array`): 2D matrix containing the one-sided power spectrogram of an audio signal
            
        Returns:
            (:obj:`np.array`): array containing the beat spectrum based on the power spectrogram
            
        See Also:
            J Foote's original derivation of the Beat Spectrum: 
            Foote, Jonathan, and Shingo Uchihashi. "The beat spectrum: A new approach to rhythm analysis." 
            Multimedia and Expo, 2001. ICME 2001. IEEE International Conference on. IEEE, 2001.
            (`See PDF here <http://rotorbrain.com/foote/papers/icme2001.pdf>`_)
            
        """
        freq_bins, time_bins = power_spectrogram.shape

        # row-wise autocorrelation according to the Wiener-Khinchin theorem
        power_spectrogram = np.vstack([power_spectrogram, np.zeros_like(power_spectrogram)])

        nearest_power_of_two = 2 ** np.ceil(np.log(power_spectrogram.shape[0]) / np.log(2))
        pad_amount = int(nearest_power_of_two - power_spectrogram.shape[0])
        power_spectrogram = np.pad(power_spectrogram, ((0, pad_amount), (0, 0)), 'constant')
        fft_power_spec = scifft.fft(power_spectrogram, axis=0)
        abs_fft = np.abs(fft_power_spec) ** 2
        autocorrelation_rows = np.real(
            scifft.ifft(abs_fft, axis=0)[:freq_bins, :])  # ifft over columns

        # normalization factor
        norm_factor = np.tile(np.arange(freq_bins, 0, -1), (time_bins, 1)).T
        autocorrelation_rows = autocorrelation_rows / norm_factor

        # compute the beat spectrum
        beat_spectrum = np.mean(autocorrelation_rows, axis=1)
        # average over frequencies

        return beat_spectrum

    @staticmethod
    def find_repeating_period_simple(beat_spectrum, min_period, max_period):
        """
        Computes the repeating period of the sound signal using the beat spectrum.
        This algorithm just looks for the max value in the interval 
        ``[min_period, max_period]``, inclusive. It discards the first value, and 
        returns the period in units of stft time bins.

        Args:
            beat_spectrum (:obj:`np.array`): input beat spectrum array
            min_period (int): minimum possible period value
            max_period (int): maximum possible period value
            
        Returns:
             period (int): The period of the sound signal in stft time bins
             
        See Also:
            :func:`find_repeating_period_complex`
            
        """
        min_period, max_period = int(min_period), int(max_period)
        # discard the first element of beat_spectrum (lag 0)
        beat_spectrum = beat_spectrum[1:]
        beat_spectrum = beat_spectrum[min_period - 1: max_period]

        if len(beat_spectrum) == 0:
            raise SeparationException('min_period is larger than the beat spectrum!')

        period = np.argmax(beat_spectrum) + min_period

        return period

    def _compute_repeating_mask(self, magnitude_spectrogram_channel):
        """
        Computes the soft mask for the repeating part using the magnitude 
        spectrogram and the repeating period (``self.repeating_period``, in 
        time bins).

        The spectrogram is cut into consecutive segments of one period each, 
        the element-wise median over the segments forms the repeating model, 
        and the mask is the ratio of that model (capped at the input) to the 
        input.

        Args:
            magnitude_spectrogram_channel (:obj:`np.array`): 2D matrix containing the 
              magnitude spectrogram of a signal
            
        Returns:
            (:obj:`np.array`): 2D matrix (Lf by Lt) containing the soft mask for the 
              repeating part, elements of M take on values in ``[0, 1]``

        """
        period = self.repeating_period
        freq_bins, time_bins = magnitude_spectrogram_channel.shape
        n_repetitions = int(np.ceil(float(time_bins) / period))

        # Pad to make an integer number of repetitions. Pad with 'nan's to not
        # affect the median. The pad is the plain difference; a modulo here
        # gives the wrong amount once the period is at least twice the number
        # of time bins, which makes the reshape below fail.
        pad_amount = period * n_repetitions - time_bins
        padded = np.hstack([
            magnitude_spectrogram_channel,
            np.full((freq_bins, pad_amount), np.nan),
        ])

        # (freq, time) -> (repetition, frame within period, freq)
        segments = np.reshape(padded.T, (n_repetitions, period, freq_bins))

        # median of every time-frequency bin across the repetitions,
        # ignoring the nan padding
        median_period = np.nanmedian(segments, axis=0)

        # repeat the median period over the whole duration, go back to
        # (freq, time) and drop the padded frames
        repeating_model = np.tile(median_period, (n_repetitions, 1)).T
        repeating_model = repeating_model[:, :time_bins]

        # the repeating part can never exceed the mixture; EPSILON keeps the
        # ratio defined where the input is zero
        min_median_mask = np.minimum(repeating_model, magnitude_spectrogram_channel)
        mask = (min_median_mask + constants.EPSILON) / (
            magnitude_spectrogram_channel + constants.EPSILON)

        return mask

    def _update_period(self, period):
        period = float(period)
        secs_per_bin = self.stft_params.hop_length / self.audio_signal.sample_rate
        bins_in_period = period / secs_per_bin
        period_in_frames = int(np.ceil(bins_in_period))
        return period_in_frames