Source code for nussl.separation.primitive.repet_sim

import numpy as np

from .. import MaskSeparationBase
from ..benchmark import HighLowPassFilter
from ...core import utils
from ...core import constants


class RepetSim(MaskSeparationBase):
    """
    Implements the REpeating Pattern Extraction Technique algorithm using the
    Similarity Matrix (REPET-SIM).

    REPET is a simple method for separating the repeating background from the
    non-repeating foreground in a piece of audio mixture. REPET-SIM is a
    generalization of REPET, which looks for similarities instead of
    periodicities.

    References:
        [1] Zafar Rafii and Bryan Pardo. "Music/Voice Separation using the
        Similarity Matrix," 13th International Society on Music Information
        Retrieval, Porto, Portugal, October 8-12, 2012.

    Args:
        input_audio_signal (AudioSignal): Audio signal to be separated.
        similarity_threshold (int, optional): Threshold for considering two
            frames to be similar. Defaults to 0.
        min_distance_between_frames (float, optional): Number of seconds two
            frames must be apart to be considered neighbors. Defaults to 1.
            Note that the attribute of the same name is converted in place
            from seconds to STFT hops the first time the similarity indices
            are computed.
        max_repeating_frames (int, optional): Max number of frames to consider
            as neighbors. Defaults to 100.
        high_pass_cutoff (float, optional): Cutoff for high pass filters. Bins
            below this cutoff will be given to the background.
            Defaults to 100.
        mask_type (str, optional): Mask type to use. Defaults to 'soft'.
        mask_threshold (float, optional): Threshold for mask converting to
            binary. Defaults to 0.5.
    """

    def __init__(self, input_audio_signal, similarity_threshold=0,
                 min_distance_between_frames=1, max_repeating_frames=100,
                 high_pass_cutoff=100, mask_type='soft', mask_threshold=0.5):
        super().__init__(
            input_audio_signal=input_audio_signal,
            mask_type=mask_type,
            mask_threshold=mask_threshold)

        self.high_pass_cutoff = high_pass_cutoff
        self.similarity_threshold = similarity_threshold
        self.min_distance_between_frames = min_distance_between_frames
        self.max_repeating_frames = max_repeating_frames

        # Guards the one-time seconds -> hops conversion performed in
        # _get_similarity_indices so repeated runs do not rescale twice.
        self._min_distance_converted_to_hops = False

        # Populated by run().
        self.similarity_matrix = None
        self.similarity_indices = None
        self.magnitude_spectrogram = None

    def run(self):
        """
        Estimates the background and foreground masks for the audio signal.

        Returns:
            list: ``[background_mask, foreground_mask]``, each built by calling
            ``self.mask_type`` on the mask data. The same list is stored in
            ``self.result_masks``.
        """
        high_low = HighLowPassFilter(self.audio_signal, self.high_pass_cutoff)
        high_pass_masks = high_low.run()

        self.magnitude_spectrogram = np.abs(self.stft)
        self.similarity_indices = self._get_similarity_indices()

        # One background mask per channel, stacked on a trailing channel axis.
        background_masks = np.stack(
            [
                self._compute_mask(self.magnitude_spectrogram[..., ch])
                for ch in range(self.audio_signal.num_channels)
            ],
            axis=-1)
        foreground_masks = 1 - background_masks

        # Trailing axis indexes the source: 0 = background, 1 = foreground.
        _masks = np.stack([background_masks, foreground_masks], axis=-1)

        is_binary = self.mask_type == self.MASKS['binary']
        # Loop-invariant: for binary masks each bin goes to whichever source
        # holds the largest soft-mask value.
        strongest = np.max(_masks, axis=-1) if is_binary else None

        # The background is widened by the filter mask at index 0 and the
        # foreground is restricted by the filter mask at index 1.
        # NOTE(review): presumably index 0 is the low-pass mask and index 1
        # the high-pass mask returned by HighLowPassFilter.run() - confirm.
        combiners = (np.maximum, np.minimum)

        self.result_masks = []
        for i, combine in enumerate(combiners):
            mask_data = _masks[..., i]
            if is_binary:
                mask_data = mask_data == strongest
            mask_data = combine(mask_data, high_pass_masks[i].mask)
            self.result_masks.append(self.mask_type(mask_data))

        return self.result_masks

    def _get_similarity_indices(self):
        """
        Computes the similarity matrix and the similar-frame indices derived
        from it.

        On the first call, ``self.min_distance_between_frames`` is rescaled in
        place by ``sample_rate / hop_length`` (seconds -> hops).

        Returns:
            list of lists: similarity indices for all time frames.
        """
        self.similarity_matrix = self.get_similarity_matrix()

        if not self._min_distance_converted_to_hops:
            self.min_distance_between_frames *= (
                self.audio_signal.sample_rate / self.stft_params.hop_length
            )
            self._min_distance_converted_to_hops = True

        return self._find_similarity_indices()

    @staticmethod
    def compute_similarity_matrix(matrix):
        """
        Computes the cosine similarity between every pair of rows of the
        input matrix.

        Rows with zero norm get a similarity of 0 with every row (including
        themselves) instead of producing ``inf``/``nan``.

        Parameters:
            matrix (np.array): 2D matrix whose rows are the vectors to be
                compared (``get_similarity_matrix`` passes the transposed
                channel-averaged magnitude spectrogram).

        Returns:
            S (np.array): 2D similarity matrix of shape
            ``(matrix.shape[0], matrix.shape[0])``.
        """
        # Zero-norm rows divide by zero below; that case is handled
        # explicitly, so silence the warning.
        with np.errstate(divide='ignore'):
            # Code below is adopted from
            # http://stackoverflow.com/a/20687984/5768001

            # base similarity matrix (all pairwise dot products)
            similarity = np.dot(matrix, matrix.T)

            # the diagonal holds each row's squared magnitude; invert it
            inv_square_mag = 1 / np.diag(similarity)

            # a zero-magnitude row yields inf; set its inverse to zero
            inv_square_mag[np.isinf(inv_square_mag)] = 0

            # inverse of the magnitude
            inv_mag = np.sqrt(inv_square_mag)

            # cosine similarity: scale the columns, then (via the transpose)
            # the rows, by the inverse magnitudes
            cosine = similarity * inv_mag
            cosine = cosine.T * inv_mag

        return cosine

    def _find_similarity_indices(self):
        """
        Finds the similarity indices for all time frames from the similarity
        matrix.

        Returns:
            similarity_indices (list of lists): similarity indices for all
            time frames.
        """
        similarity_indices = []
        for i in range(self.audio_signal.stft_length):
            cur_indices = utils.find_peak_indices(
                self.similarity_matrix[i, :],
                self.max_repeating_frames,
                min_dist=self.min_distance_between_frames,
                threshold=self.similarity_threshold)

            # The first peak is expected to be the frame itself, so it is
            # thrown out.
            # NOTE(review): this slice keeps up to max_repeating_frames + 1
            # entries, while find_peak_indices is asked for
            # max_repeating_frames peaks - confirm the intended neighbor count
            # against utils.find_peak_indices.
            cur_indices = cur_indices[1:self.max_repeating_frames + 2]
            similarity_indices.append(cur_indices)

        return similarity_indices

    def _compute_mask(self, magnitude_spectrogram_channel):
        """
        Builds the background mask for a single channel.

        Each time frame of the repeating model is the element-wise median of
        the frames listed as similar to it; the model is clipped to the
        mixture magnitude and divided by it.

        Args:
            magnitude_spectrogram_channel (np.array): 2D magnitude spectrogram
                of one channel, indexed here as ``[:, time_frame]``.

        Returns:
            np.array: background mask with the same shape as the input.
        """
        mask = np.zeros_like(magnitude_spectrogram_channel)

        for i in range(self.audio_signal.stft_length):
            cur_similarities = self.similarity_indices[i]

            if not cur_similarities:
                # If there are no similarities, then just add ones to the
                # mask here.
                mask[:, i] = np.ones(mask.shape[constants.STFT_VERT_INDEX])
            else:
                similar_times = np.array(
                    [magnitude_spectrogram_channel[:, j]
                     for j in cur_similarities])
                mask[:, i] = np.median(similar_times, axis=0)

        # The repeating model can never exceed the mixture itself.
        mask = np.minimum(mask, magnitude_spectrogram_channel)
        # EPSILON in numerator and denominator avoids 0 / 0 on silent bins.
        mask = (
            (mask + constants.EPSILON)
            / (magnitude_spectrogram_channel + constants.EPSILON)
        )
        return mask

    def get_similarity_matrix(self):
        """
        Calculates and returns the similarity matrix for the audio file
        associated with this object.

        If ``run()`` has not stored a magnitude spectrogram yet, one is derived
        from ``self.stft`` for this call only (no attribute is modified), so
        the method can be used without running the separation first.

        Returns:
            similarity_matrix (np.array): similarity matrix for the audio file.

        EXAMPLE::

            # Set up audio signal
            signal = nussl.AudioSignal('path_to_file.wav')

            # Set up a RepetSim object
            repet_sim = nussl.RepetSim(signal)

            # I don't have to run repet to get a similarity matrix for signal
            sim_mat = repet_sim.get_similarity_matrix()

        """
        magnitude_spectrogram = self.magnitude_spectrogram
        if magnitude_spectrogram is None:
            # Same source run() uses for the magnitude spectrogram.
            # NOTE(review): assumes self.stft is already populated outside
            # this class (run() reads it without assigning it) - confirm.
            magnitude_spectrogram = np.abs(self.stft)

        # Average over the channel axis, then compare time frames (rows
        # after the transpose) with each other.
        mean_magnitude_spectrogram = np.mean(magnitude_spectrogram, axis=2)
        return self.compute_similarity_matrix(mean_magnitude_spectrogram.T)