# Source code for nussl.separation.composite.ensemble_clustering

import numpy as np

from .. import ClusteringSeparationBase, SeparationException

class EnsembleClustering(ClusteringSeparationBase):
    """
    Run multiple separation algorithms on a single mixture and concatenate their
    masks to input into a clustering algorithm.

    This algorithm allows you to combine the outputs of multiple separation
    algorithms, fusing them into a single output via clustering. It was first
    developed in [1]. When used with primitive separation algorithms, it becomes
    the PrimitiveClustering algorithm described in [1].

    References:

        [1] Seetharaman, Prem. Bootstrapping the Learning Process for Computer
            Audition. Diss. Northwestern University, 2019.

    Args:
        input_audio_signal (AudioSignal): Signal to separate.

        num_sources (int): Number of sources to separate from signal.

        separators (list): List of instantiated separation algorithms that will
          be run on the input audio signal.

        weights (list or tuple, optional): Weight to give to each algorithm in
          the resultant feature vector. For example, `[3, 1]`, will repeat the
          features from the first algorithm 3 times and the second algorithm
          1 time. Must have one entry per separator. Defaults to None - every
          algorithm gets a weight of 1.

        returns (list or tuple, optional): Which outputs of each algorithm to
          keep in the resultant feature vector. One entry per separator, each
          entry being a list of indices into that separator's outputs (e.g.
          `[[1], [1], [1], [0]]`). Defaults to None, which keeps every output.

        num_cascades (int, optional): The output of each algorithm can be
          cascaded into one another. The outputs of the first layer of
          algorithms will be refed to each separation algorithm to create more
          features. Defaults to 1.

        extracted_feature (str, optional): Which feature to extract from each
          algorithm. Must be one of `['estimates', 'masks']`. `estimates` will
          reconstruct a soft mask using the output of the algorithm (useful if
          the algorithm is not a masking based separation algorithm). `masks`
          will use the data in the `result_masks` attribute of the separation
          algorithm. Defaults to 'masks'.

        clustering_type (str): One of 'KMeans', 'GaussianMixture', and
          'MiniBatchKMeans'. The clustering approach to use on the features.
          Defaults to 'KMeans'.

        fit_clusterer (bool, optional): Whether or not to call fit on the
          clusterer. If False, then the clusterer should already be fit for
          this to work. Defaults to True.

        percentile (int, optional): Percentile of time-frequency points to
          consider by loudness. Audio spectrograms are very high dimensional,
          and louder points tend to matter more than quieter points. By setting
          the percentile high, one can more efficiently cluster an auditory
          scene by considering only points above that threshold. Defaults to 90
          (which means the top 10 percentile of time-frequency points will be
          used for clustering).

        beta (float, optional): When using KMeans, we use soft KMeans, which
          has an additional parameter `beta`. `beta` controls how soft the
          assignments are. As beta increases, the assignments become more
          binary (either 0 or 1). Defaults to 5.0, a value discovered through
          cross-validation.

        mask_type (str, optional): Masking approach to use. Passed up to
          MaskSeparationBase.

        mask_threshold (float, optional): Threshold for masking. Passed up to
          MaskSeparationBase.

        **kwargs (dict, optional): Additional keyword arguments that are passed
          to the clustering object (one of KMeans, GaussianMixture, or
          MiniBatchKMeans).

    Raises:
        SeparationException: If `weights` or `returns` is given with a length
          different from `separators`, or if `extracted_feature` is not one of
          `['masks', 'estimates']`.

    Example:
        .. code-block:: python

            from nussl.separation import (
                primitive,
                factorization,
                composite,
                SeparationException
            )

            separators = [
                primitive.FT2D(mix),
                factorization.RPCA(mix),
                primitive.Melodia(mix, voicing_tolerance=0.2),
                primitive.HPSS(mix),
            ]

            weights = [3, 3, 1, 1]
            returns = [[1], [1], [1], [0]]

            ensemble = composite.EnsembleClustering(
                mix, 2, separators, weights=weights, returns=returns)
            estimates = ensemble()
    """

    def __init__(self, input_audio_signal, num_sources, separators, weights=None,
                 returns=None, num_cascades=1, extracted_feature='masks',
                 clustering_type='KMeans', fit_clusterer=True, percentile=90,
                 beta=5.0, mask_type='soft', mask_threshold=0.5, **kwargs):
        super().__init__(
            input_audio_signal,
            num_sources,
            clustering_type=clustering_type,
            percentile=percentile,
            fit_clusterer=fit_clusterer,
            beta=beta,
            mask_type=mask_type,
            mask_threshold=mask_threshold,
            **kwargs
        )

        self.separators = separators
        self.num_cascades = num_cascades

        # Tuples are accepted alongside lists: previously a tuple was silently
        # discarded and every separator fell back to a weight of 1.
        if isinstance(weights, (list, tuple)):
            if len(weights) != len(separators):
                raise SeparationException(
                    "len(weights) must be the same as len(separators)!")
            self.weights = weights
        else:
            self.weights = [1] * len(self.separators)

        # Same list-or-tuple handling as `weights`; anything else means
        # "keep every output of every separator".
        if isinstance(returns, (list, tuple)):
            if len(returns) != len(separators):
                raise SeparationException(
                    "len(returns) must be the same as len(separators)!")
            self.returns = returns
        else:
            self.returns = None

        if extracted_feature not in ['masks', 'estimates']:
            raise SeparationException(
                "extracted_feature must be one of ['masks', 'estimates']. "
                f"Got {extracted_feature}.")
        self.extracted_feature = extracted_feature

    def run_separators_on_mixture(self, mixture):
        """
        Run every separator on `mixture` and gather the weighted outputs.

        Each separator is called as `separator(audio_signal=mixture)`. If the
        separator has a `result_masks` attribute, its masks are collected as
        well. When `self.returns` is set, only the outputs at the indices
        listed for that separator are kept. The kept outputs are then repeated
        `weight` times in the returned lists.

        Args:
            mixture: Signal handed to each separator as `audio_signal`.

        Returns:
            tuple: `(estimates, masks)`, two flat lists accumulated over all
            separators. `masks` only has entries for separators that expose
            `result_masks`.
        """
        estimates = []
        masks = []

        for i, separator in enumerate(self.separators):
            weight = self.weights[i]
            _estimates = separator(audio_signal=mixture)

            # Not every separator is mask-based; those without `result_masks`
            # contribute no masks.
            _masks = []
            if hasattr(separator, 'result_masks'):
                _masks = separator.result_masks

            if self.returns is not None:
                returns = self.returns[i]
                _estimates = [_estimates[j] for j in returns]
                if _masks:
                    _masks = [_masks[j] for j in returns]

            # Weighting is implemented by repeating the outputs, which repeats
            # the corresponding features in the final feature vector.
            for _ in range(weight):
                estimates.extend(_estimates)
                masks.extend(_masks)

        return estimates, masks

    def extract_features(self):
        """
        Build the feature array by running the separators for
        `self.num_cascades` rounds.

        The first round runs the separators on `self.audio_signal`. In every
        later round, the separators are run on each estimate produced by the
        previous round. Each run contributes one feature array (from masks or
        from estimates, depending on `self.extracted_feature`), and all of
        them are concatenated along the last axis.

        Returns:
            np.ndarray: Features from every run, concatenated on the last axis.
        """
        features = []
        current_signal = [self.audio_signal]

        for _ in range(self.num_cascades):
            new_signals = []
            for _signal in current_signal:
                estimates, masks = self.run_separators_on_mixture(_signal)
                new_signals.extend(estimates)

                if self.extracted_feature == 'masks':
                    _features = self._extract_features_from_masks(masks)
                elif self.extracted_feature == 'estimates':
                    _features = self._extract_features_from_estimates(estimates)
                features.append(_features)

            # The estimates of this round become the inputs of the next one.
            current_signal = new_signals

        features = np.concatenate(features, axis=-1)
        return features

    def _extract_features_from_estimates(self, estimates):
        """
        Turn estimates into ratio features against the mixture magnitude.

        For each estimate, the magnitude of `estimate.stft()` is divided by the
        element-wise maximum of itself and the magnitude of `self.stft` (plus
        a small constant), so each value is at most 1.

        Args:
            estimates (list): Objects exposing an `stft()` method.

        Returns:
            np.ndarray: One ratio array per estimate, stacked on a new last
            axis.
        """
        features = []
        mix_stft = np.abs(self.stft)

        for e in estimates:
            _stft = np.abs(e.stft())
            # The denominator is never smaller than the numerator, and the
            # 1e-7 offset keeps it non-zero where both magnitudes are 0.
            data = _stft / np.maximum(_stft, mix_stft + 1e-7)
            features.append(data)

        features = np.stack(features, axis=-1)
        return features

    @staticmethod
    def _extract_features_from_masks(masks):
        """
        Stack the `mask` attribute of every mask object on a new last axis.

        Args:
            masks (list): Objects exposing a `mask` array attribute.

        Returns:
            np.ndarray: The mask arrays stacked along a new last axis.
        """
        features = [m.mask for m in masks]
        features = np.stack(features, axis=-1)
        return features