"""
There are ways to measure the quality of a separated source without
requiring ground truth. These functions operate on the output of
clustering-based separation algorithms and work by analyzing
the clusterability of the feature space used to generate the
separated sources.
"""
from sklearn.metrics import silhouette_samples
import numpy as np
from .cluster import KMeans, GaussianMixture
from scipy.special import logsumexp
from .train import loss
import torch
def softmax(x, axis=None):
    """
    Numerically stable softmax, computed in log space.

    Subtracting ``logsumexp(x)`` before exponentiating avoids overflow for
    large entries of ``x``.

    Args:
        x (np.ndarray): Input array of scores.
        axis (int, optional): Axis along which to normalize. Defaults to None
            (normalize over the flattened array).

    Returns:
        np.ndarray: Array with the same shape as ``x``; entries along ``axis``
        are nonnegative and sum to 1.
    """
    log_normalizer = logsumexp(x, axis=axis, keepdims=True)
    return np.exp(x - log_normalizer)
def jensen_shannon_divergence(gmm_p, gmm_q, n_samples=10**5):
    """
    Compute Jensen-Shannon (JS) divergence between two Gaussian Mixture Models via
    sampling. JS divergence is also known as symmetric Kullback-Leibler divergence.
    JS divergence has no closed form in general for GMMs, thus we use sampling to
    compute it.

    Args:
        gmm_p (GaussianMixture): A GaussianMixture class fit to some data.
        gmm_q (GaussianMixture): Another GaussianMixture class fit to some data.
        n_samples (int): Number of samples to use to estimate JS divergence.
            Defaults to ``10**5``.

    Returns:
        float: JS divergence between gmm_p and gmm_q.
    """
    # Monte-Carlo estimate of KL(p || (p+q)/2) using samples from p.
    X = gmm_p.sample(n_samples)[0]
    log_p_X = gmm_p.score_samples(X)
    log_q_X = gmm_q.score_samples(X)
    # log((p+q)/2) = logaddexp(log p, log q) - log 2; the log 2 is folded in below.
    log_mix_X = np.logaddexp(log_p_X, log_q_X)

    # Monte-Carlo estimate of KL(q || (p+q)/2) using samples from q.
    Y = gmm_q.sample(n_samples)[0]
    log_p_Y = gmm_p.score_samples(Y)
    log_q_Y = gmm_q.score_samples(Y)
    log_mix_Y = np.logaddexp(log_p_Y, log_q_Y)

    # JS = (KL(p || m) + KL(q || m)) / 2 with m = (p+q)/2.
    return (log_p_X.mean() - (log_mix_X.mean() - np.log(2))
            + log_q_Y.mean() - (log_mix_Y.mean() - np.log(2))) / 2
def _get_loud_bins_mask(threshold, audio_signal=None, representation=None):
if representation is None:
representation = np.abs(audio_signal.stft())
threshold = np.percentile(representation, threshold)
mask = representation > threshold
return mask, representation
def jensen_shannon_confidence(audio_signal, features, num_sources, threshold=95,
                              n_samples=10**5, **kwargs):
    """
    Calculates the clusterability of a space by comparing a K-cluster GMM
    with a 1-cluster GMM on the same features. This function fits two
    GMMs to all of the points that are above the specified threshold (defaults
    to 95: 95th percentile of all the data). This saves on computation time and
    also allows one to have the confidence measure only focus on the louder
    more perceptually important points.

    References:
        Seetharaman, Prem, Gordon Wichern, Jonathan Le Roux, and Bryan Pardo.
        "Bootstrapping Single-Channel Source Separation via Unsupervised Spatial
        Clustering on Stereo Mixtures". 44th International Conference on Acoustics,
        Speech, and Signal Processing, Brighton, UK, May, 2019

        Seetharaman, Prem. Bootstrapping the Learning Process for Computer Audition.
        Diss. Northwestern University, 2019.

    Args:
        audio_signal (AudioSignal): AudioSignal object which will be used to compute
          the mask over which to compute the confidence measure. This can be None, if
          and only if ``representation`` is passed as a keyword argument to this
          function.
        features (np.ndarray): Numpy array containing the features to be clustered.
          Should have the same dimensions as the representation.
        num_sources (int): Number of sources to cluster the features into.
        threshold (int, optional): Threshold by loudness. Points below the threshold are
          excluded from being used in the confidence measure. Defaults to 95.
        n_samples (int, optional): Number of samples used to estimate the JS
          divergence between the two GMMs. Defaults to ``10**5``.
        kwargs: Keyword arguments to `_get_loud_bins_mask`. Namely, representation can
          go here as a keyword argument.

    Returns:
        float: Confidence given by Jensen-Shannon divergence.
    """
    mask, _ = _get_loud_bins_mask(threshold, audio_signal, **kwargs)
    embedding_size = features.shape[-1]
    # Keep only the loud bins; flatten to (n_points, embedding_size).
    features = features[mask].reshape(-1, embedding_size)

    one_component_gmm = GaussianMixture(1)
    n_component_gmm = GaussianMixture(num_sources)

    one_component_gmm.fit(features)
    n_component_gmm.fit(features)

    # A large divergence between the 1-component and K-component fits means the
    # space is well-separated into clusters.
    confidence = jensen_shannon_divergence(
        one_component_gmm, n_component_gmm, n_samples=n_samples)
    return confidence
def posterior_confidence(audio_signal, features, num_sources, threshold=95,
                         **kwargs):
    """
    Calculates the clusterability of an embedding space by looking at the
    strength of the assignments of each point to a specific cluster. The
    more points that are "in between" clusters (e.g. no strong assignment),
    the lower the clusterability.

    References:
        Seetharaman, Prem, Gordon Wichern, Jonathan Le Roux, and Bryan Pardo.
        "Bootstrapping Single-Channel Source Separation via Unsupervised Spatial
        Clustering on Stereo Mixtures". 44th International Conference on Acoustics,
        Speech, and Signal Processing, Brighton, UK, May, 2019

        Seetharaman, Prem. Bootstrapping the Learning Process for Computer Audition.
        Diss. Northwestern University, 2019.

    Args:
        audio_signal (AudioSignal): AudioSignal object which will be used to compute
          the mask over which to compute the confidence measure. This can be None, if
          and only if ``representation`` is passed as a keyword argument to this
          function.
        features (np.ndarray): Numpy array containing the features to be clustered.
          Should have the same dimensions as the representation.
        num_sources (int): Number of sources to cluster the features into.
        threshold (int, optional): Threshold by loudness. Points below the threshold are
          excluded from being used in the confidence measure. Defaults to 95.
        kwargs: Keyword arguments to `_get_loud_bins_mask`. Namely, representation can
          go here as a keyword argument.

    Returns:
        float: Confidence given by posteriors.
    """
    mask, _ = _get_loud_bins_mask(threshold, audio_signal, **kwargs)
    embedding_size = features.shape[-1]
    features = features[mask].reshape(-1, embedding_size)

    kmeans = KMeans(num_sources)
    # fit_transform returns the distance from each point to each centroid;
    # softmax of the negated distances yields soft cluster posteriors.
    distances = kmeans.fit_transform(features)
    confidence = softmax(-distances, axis=-1)
    # Rescale max posterior from [1/K, 1] to [0, 1]: 0 = fully ambiguous,
    # 1 = every point firmly assigned to one cluster.
    confidence = (
        (num_sources * np.max(confidence, axis=-1) - 1) /
        (num_sources - 1)
    )
    return confidence.mean()
def silhouette_confidence(audio_signal, features, num_sources, threshold=95,
                          max_points=1000, **kwargs):
    """
    Uses the silhouette score to compute the clusterability of the feature space.

    The Silhouette Coefficient is calculated using the
    mean intra-cluster distance (a) and the mean nearest-cluster distance (b)
    for each sample. The Silhouette Coefficient for a sample is (b - a) / max(a, b).
    To clarify, b is the distance between a sample and the nearest cluster
    that the sample is not a part of. Note that Silhouette Coefficient is
    only defined if number of labels is 2 <= n_labels <= n_samples - 1.

    References:
        Seetharaman, Prem. Bootstrapping the Learning Process for Computer Audition.
        Diss. Northwestern University, 2019.

        Peter J. Rousseeuw (1987). "Silhouettes: a Graphical Aid to the
        Interpretation and Validation of Cluster Analysis". Computational and
        Applied Mathematics 20: 53-65.

    Args:
        audio_signal (AudioSignal): AudioSignal object which will be used to compute
          the mask over which to compute the confidence measure. This can be None, if
          and only if ``representation`` is passed as a keyword argument to this
          function.
        features (np.ndarray): Numpy array containing the features to be clustered.
          Should have the same dimensions as the representation.
        num_sources (int): Number of sources to cluster the features into.
        threshold (int, optional): Threshold by loudness. Points below the threshold are
          excluded from being used in the confidence measure. Defaults to 95.
        max_points (int, optional): Maximum number of points to compute the Silhouette
          score for. Silhouette score is a costly operation. Defaults to 1000.
        kwargs: Keyword arguments to `_get_loud_bins_mask`. Namely, representation can
          go here as a keyword argument.

    Returns:
        float: Confidence given by Silhouette score.
    """
    mask, _ = _get_loud_bins_mask(threshold, audio_signal, **kwargs)
    embedding_size = features.shape[-1]
    features = features[mask].reshape(-1, embedding_size)

    # Silhouette is O(n^2) in the number of points; subsample without
    # replacement to keep the computation tractable.
    if features.shape[0] > max_points:
        idx = np.random.choice(
            np.arange(features.shape[0]), max_points,
            replace=False)
        features = features[idx]

    kmeans = KMeans(num_sources)
    labels = kmeans.fit_predict(features)
    confidence = silhouette_samples(features, labels)
    return confidence.mean()
def loudness_confidence(audio_signal, features, num_sources, threshold=95,
                        **kwargs):
    """
    Computes the clusterability of the feature space by comparing the relative
    size of each cluster: the larger the smallest cluster, the more balanced
    (and confident) the clustering.

    References:
        Seetharaman, Prem, Gordon Wichern, Jonathan Le Roux, and Bryan Pardo.
        "Bootstrapping Single-Channel Source Separation via Unsupervised Spatial
        Clustering on Stereo Mixtures". 44th International Conference on Acoustics,
        Speech, and Signal Processing, Brighton, UK, May, 2019

        Seetharaman, Prem. Bootstrapping the Learning Process for Computer Audition.
        Diss. Northwestern University, 2019.

    Args:
        audio_signal (AudioSignal): AudioSignal object which will be used to compute
          the mask over which to compute the confidence measure. This can be None, if
          and only if ``representation`` is passed as a keyword argument to this
          function.
        features (np.ndarray): Numpy array containing the features to be clustered.
          Should have the same dimensions as the representation.
        num_sources (int): Number of sources to cluster the features into.
        threshold (int, optional): Threshold by loudness. Points below the threshold are
          excluded from being used in the confidence measure. Defaults to 95.
        kwargs: Keyword arguments to `_get_loud_bins_mask`. Namely, representation can
          go here as a keyword argument.

    Returns:
        float: Confidence given by size of smallest cluster.
    """
    mask, _ = _get_loud_bins_mask(threshold, audio_signal, **kwargs)
    embedding_size = features.shape[-1]
    features = features[mask].reshape(-1, embedding_size)

    kmeans = KMeans(num_sources)
    labels = kmeans.fit_predict(features)
    # Fraction of points assigned to each cluster; bincount is a single
    # vectorized pass over the labels.
    source_shares = np.bincount(
        labels, minlength=num_sources).astype(float)
    source_shares /= source_shares.sum()
    confidence = source_shares.min()
    return confidence
def whitened_kmeans_confidence(audio_signal, features, num_sources, threshold=95,
                               **kwargs):
    """
    Computes the clusterability in two steps:

    1. Cluster the feature space using KMeans into assignments.
    2. Compute the Whitened K-Means loss between the features and the assignments.

    Args:
        audio_signal (AudioSignal): AudioSignal object which will be used to compute
          the mask over which to compute the confidence measure. This can be None, if
          and only if ``representation`` is passed as a keyword argument to this
          function.
        features (np.ndarray): Numpy array containing the features to be clustered.
          Should have the same dimensions as the representation.
        num_sources (int): Number of sources to cluster the features into.
        threshold (int, optional): Threshold by loudness. Points below the threshold are
          excluded from being used in the confidence measure. Defaults to 95.
        kwargs: Keyword arguments to `_get_loud_bins_mask`. Namely, representation can
          go here as a keyword argument.

    Returns:
        float: Confidence given by whitened k-means loss.
    """
    mask, representation = _get_loud_bins_mask(threshold, audio_signal, **kwargs)
    embedding_size = features.shape[-1]
    features = features[mask].reshape(-1, embedding_size)
    weights = representation[mask].reshape(-1)

    kmeans = KMeans(num_sources)
    distances = kmeans.fit_transform(features)
    # One-hot assignment to the NEAREST centroid (minimum distance). The
    # previous comparison against the max assigned each point to its
    # farthest centroid, inverting the confidence measure.
    assignments = (distances == distances.min(axis=-1, keepdims=True))

    loss_func = loss.WhitenedKMeansLoss()
    # Loss functions expect a batch dimension and float tensors.
    features = torch.from_numpy(features).unsqueeze(0).float()
    assignments = torch.from_numpy(assignments).unsqueeze(0).float()
    weights = torch.from_numpy(weights).unsqueeze(0).float()

    loss_val = loss_func(features, assignments, weights).item()
    # WhitenedKMeansLoss is bounded above by D + C; map it to [0, 1].
    upper_bound = embedding_size + num_sources
    confidence = 1 - (loss_val / upper_bound)
    return confidence
def dpcl_classic_confidence(audio_signal, features, num_sources, threshold=95,
                            **kwargs):
    """
    Computes the clusterability in two steps:

    1. Cluster the feature space using KMeans into assignments.
    2. Compute the classic deep clustering loss between the features and the
       assignments.

    Args:
        audio_signal (AudioSignal): AudioSignal object which will be used to compute
          the mask over which to compute the confidence measure. This can be None, if
          and only if ``representation`` is passed as a keyword argument to this
          function.
        features (np.ndarray): Numpy array containing the features to be clustered.
          Should have the same dimensions as the representation.
        num_sources (int): Number of sources to cluster the features into.
        threshold (int, optional): Threshold by loudness. Points below the threshold are
          excluded from being used in the confidence measure. Defaults to 95.
        kwargs: Keyword arguments to `_get_loud_bins_mask`. Namely, representation can
          go here as a keyword argument.

    Returns:
        float: Confidence given by deep clustering loss.
    """
    mask, representation = _get_loud_bins_mask(threshold, audio_signal, **kwargs)
    embedding_size = features.shape[-1]
    features = features[mask].reshape(-1, embedding_size)
    weights = representation[mask].reshape(-1)

    kmeans = KMeans(num_sources)
    distances = kmeans.fit_transform(features)
    # One-hot assignment to the NEAREST centroid (minimum distance). The
    # previous comparison against the max assigned each point to its
    # farthest centroid, inverting the confidence measure.
    assignments = (distances == distances.min(axis=-1, keepdims=True))

    loss_func = loss.DeepClusteringLoss()
    # Loss functions expect a batch dimension and float tensors.
    features = torch.from_numpy(features).unsqueeze(0).float()
    assignments = torch.from_numpy(assignments).unsqueeze(0).float()
    weights = torch.from_numpy(weights).unsqueeze(0).float()

    loss_val = loss_func(features, assignments, weights).item()
    # DeepClusteringLoss is normalized to [0, 1]; invert so higher is better.
    confidence = 1 - loss_val
    return confidence