"""
There are ways to measure the quality of a separated source without
requiring ground truth. These functions operate on the output of
clustering-based separation algorithms and work by analyzing
the clusterability of the feature space used to generate the
separated sources.
"""
from sklearn.metrics import silhouette_samples
import numpy as np
from .cluster import KMeans, GaussianMixture
from scipy.special import logsumexp
from .train import loss
import torch
def softmax(x, axis=None):
    """
    Numerically stable softmax, computed in log space.

    Subtracting ``logsumexp(x)`` before exponentiating avoids overflow for
    large entries of ``x``.

    Args:
        x (np.ndarray): Input array of scores.
        axis (int, optional): Axis along which to normalize. Defaults to None
            (normalize over the flattened array).

    Returns:
        np.ndarray: Array with the same shape as ``x``; entries along ``axis``
        are nonnegative and sum to 1.
    """
    log_normalizer = logsumexp(x, axis=axis, keepdims=True)
    return np.exp(x - log_normalizer)
def jensen_shannon_divergence(gmm_p, gmm_q, n_samples=10**5):
    """
    Compute Jensen-Shannon (JS) divergence between two Gaussian Mixture Models via
    sampling. JS divergence is also known as symmetric Kullback-Leibler divergence.
    JS divergence has no closed form in general for GMMs, thus we use sampling to
    compute it.

    Args:
        gmm_p (GaussianMixture): A GaussianMixture class fit to some data.
        gmm_q (GaussianMixture): Another GaussianMixture class fit to some data.
        n_samples (int): Number of samples to use to estimate JS divergence.
            Defaults to ``10**5``.

    Returns:
        float: JS divergence between gmm_p and gmm_q.
    """
    # Monte-Carlo estimate of KL(p || (p+q)/2) using samples from p.
    X = gmm_p.sample(n_samples)[0]
    log_p_X = gmm_p.score_samples(X)
    log_q_X = gmm_q.score_samples(X)
    # log((p+q)/2) = logaddexp(log p, log q) - log 2; the log 2 is folded in below.
    log_mix_X = np.logaddexp(log_p_X, log_q_X)

    # Monte-Carlo estimate of KL(q || (p+q)/2) using samples from q.
    Y = gmm_q.sample(n_samples)[0]
    log_p_Y = gmm_p.score_samples(Y)
    log_q_Y = gmm_q.score_samples(Y)
    log_mix_Y = np.logaddexp(log_p_Y, log_q_Y)

    # JS = (KL(p || m) + KL(q || m)) / 2 with m = (p+q)/2.
    return (log_p_X.mean() - (log_mix_X.mean() - np.log(2))
            + log_q_Y.mean() - (log_mix_Y.mean() - np.log(2))) / 2
def _get_loud_bins_mask(threshold, audio_signal=None, representation=None):
if representation is None:
representation = np.abs(audio_signal.stft())
threshold = np.percentile(representation, threshold)
mask = representation > threshold
return mask, representation
def jensen_shannon_confidence(audio_signal, features, num_sources, threshold=95,
                              n_samples=10**5, **kwargs):
    """
    Calculates the clusterability of a space by comparing a K-cluster GMM
    with a 1-cluster GMM on the same features. This function fits two
    GMMs to all of the points that are above the specified threshold (defaults
    to 95: 95th percentile of all the data). This saves on computation time and
    also allows one to have the confidence measure only focus on the louder
    more perceptually important points.

    References:
        Seetharaman, Prem, Gordon Wichern, Jonathan Le Roux, and Bryan Pardo.
        "Bootstrapping Single-Channel Source Separation via Unsupervised Spatial
        Clustering on Stereo Mixtures". 44th International Conference on Acoustics,
        Speech, and Signal Processing, Brighton, UK, May, 2019

        Seetharaman, Prem. Bootstrapping the Learning Process for Computer Audition.
        Diss. Northwestern University, 2019.

    Args:
        audio_signal (AudioSignal): AudioSignal object which will be used to compute
          the mask over which to compute the confidence measure. This can be None, if
          and only if ``representation`` is passed as a keyword argument to this
          function.
        features (np.ndarray): Numpy array containing the features to be clustered.
          Should have the same dimensions as the representation.
        num_sources (int): Number of sources to cluster the features into.
        threshold (int, optional): Threshold by loudness. Points below the threshold are
          excluded from being used in the confidence measure. Defaults to 95.
        n_samples (int, optional): Number of samples used to estimate the JS
          divergence between the two GMMs. Defaults to ``10**5``.
        kwargs: Keyword arguments to `_get_loud_bins_mask`. Namely, representation can
          go here as a keyword argument.

    Returns:
        float: Confidence given by Jensen-Shannon divergence.
    """
    mask, _ = _get_loud_bins_mask(threshold, audio_signal, **kwargs)
    embedding_size = features.shape[-1]
    # Keep only the loud bins; flatten to (n_points, embedding_size).
    features = features[mask].reshape(-1, embedding_size)

    one_component_gmm = GaussianMixture(1)
    n_component_gmm = GaussianMixture(num_sources)

    one_component_gmm.fit(features)
    n_component_gmm.fit(features)

    # A large divergence between the 1-component and K-component fits means the
    # space is well-separated into clusters.
    confidence = jensen_shannon_divergence(
        one_component_gmm, n_component_gmm, n_samples=n_samples)
    return confidence
def posterior_confidence(audio_signal, features, num_sources, threshold=95,
                         **kwargs):
    """
    Calculates the clusterability of an embedding space by looking at the
    strength of the assignments of each point to a specific cluster. The
    more points that are "in between" clusters (e.g. no strong assignment),
    the lower the clusterability.

    References:
        Seetharaman, Prem, Gordon Wichern, Jonathan Le Roux, and Bryan Pardo.
        "Bootstrapping Single-Channel Source Separation via Unsupervised Spatial
        Clustering on Stereo Mixtures". 44th International Conference on Acoustics,
        Speech, and Signal Processing, Brighton, UK, May, 2019

        Seetharaman, Prem. Bootstrapping the Learning Process for Computer Audition.
        Diss. Northwestern University, 2019.

    Args:
        audio_signal (AudioSignal): AudioSignal object which will be used to compute
          the mask over which to compute the confidence measure. This can be None, if
          and only if ``representation`` is passed as a keyword argument to this
          function.
        features (np.ndarray): Numpy array containing the features to be clustered.
          Should have the same dimensions as the representation.
        num_sources (int): Number of sources to cluster the features into.
        threshold (int, optional): Threshold by loudness. Points below the threshold are
          excluded from being used in the confidence measure. Defaults to 95.
        kwargs: Keyword arguments to `_get_loud_bins_mask`. Namely, representation can
          go here as a keyword argument.

    Returns:
        float: Confidence given by posteriors.
    """
    mask, _ = _get_loud_bins_mask(threshold, audio_signal, **kwargs)
    embedding_size = features.shape[-1]
    features = features[mask].reshape(-1, embedding_size)

    kmeans = KMeans(num_sources)
    # fit_transform returns the distance from each point to each centroid;
    # softmax of the negated distances yields soft cluster posteriors.
    distances = kmeans.fit_transform(features)
    confidence = softmax(-distances, axis=-1)
    # Rescale max posterior from [1/K, 1] to [0, 1]: 0 = fully ambiguous,
    # 1 = every point firmly assigned to one cluster.
    confidence = (
        (num_sources * np.max(confidence, axis=-1) - 1) /
        (num_sources - 1)
    )
    return confidence.mean()
def silhouette_confidence(audio_signal, features, num_sources, threshold=95,
                          max_points=1000, **kwargs):
    """
    Uses the silhouette score to compute the clusterability of the feature space.

    The Silhouette Coefficient is calculated using the
    mean intra-cluster distance (a) and the mean nearest-cluster distance (b)
    for each sample. The Silhouette Coefficient for a sample is (b - a) / max(a, b).
    To clarify, b is the distance between a sample and the nearest cluster
    that the sample is not a part of. Note that Silhouette Coefficient is
    only defined if number of labels is 2 <= n_labels <= n_samples - 1.

    References:
        Seetharaman, Prem. Bootstrapping the Learning Process for Computer Audition.
        Diss. Northwestern University, 2019.

        Peter J. Rousseeuw (1987). "Silhouettes: a Graphical Aid to the
        Interpretation and Validation of Cluster Analysis". Computational and
        Applied Mathematics 20: 53-65.

    Args:
        audio_signal (AudioSignal): AudioSignal object which will be used to compute
          the mask over which to compute the confidence measure. This can be None, if
          and only if ``representation`` is passed as a keyword argument to this
          function.
        features (np.ndarray): Numpy array containing the features to be clustered.
          Should have the same dimensions as the representation.
        num_sources (int): Number of sources to cluster the features into.
        threshold (int, optional): Threshold by loudness. Points below the threshold are
          excluded from being used in the confidence measure. Defaults to 95.
        max_points (int, optional): Maximum number of points to compute the Silhouette
          score for. Silhouette score is a costly operation. Defaults to 1000.
        kwargs: Keyword arguments to `_get_loud_bins_mask`. Namely, representation can
          go here as a keyword argument.

    Returns:
        float: Confidence given by Silhouette score.
    """
    mask, _ = _get_loud_bins_mask(threshold, audio_signal, **kwargs)
    embedding_size = features.shape[-1]
    features = features[mask].reshape(-1, embedding_size)

    # Silhouette is O(n^2) in the number of points; subsample without
    # replacement to keep the computation tractable.
    if features.shape[0] > max_points:
        idx = np.random.choice(
            np.arange(features.shape[0]), max_points,
            replace=False)
        features = features[idx]

    kmeans = KMeans(num_sources)
    labels = kmeans.fit_predict(features)
    confidence = silhouette_samples(features, labels)
    return confidence.mean()
def loudness_confidence(audio_signal, features, num_sources, threshold=95,
                        **kwargs):
    """
    Computes the clusterability of the feature space by comparing the relative
    size of each cluster: the larger the smallest cluster, the more balanced
    (and confident) the clustering.

    References:
        Seetharaman, Prem, Gordon Wichern, Jonathan Le Roux, and Bryan Pardo.
        "Bootstrapping Single-Channel Source Separation via Unsupervised Spatial
        Clustering on Stereo Mixtures". 44th International Conference on Acoustics,
        Speech, and Signal Processing, Brighton, UK, May, 2019

        Seetharaman, Prem. Bootstrapping the Learning Process for Computer Audition.
        Diss. Northwestern University, 2019.

    Args:
        audio_signal (AudioSignal): AudioSignal object which will be used to compute
          the mask over which to compute the confidence measure. This can be None, if
          and only if ``representation`` is passed as a keyword argument to this
          function.
        features (np.ndarray): Numpy array containing the features to be clustered.
          Should have the same dimensions as the representation.
        num_sources (int): Number of sources to cluster the features into.
        threshold (int, optional): Threshold by loudness. Points below the threshold are
          excluded from being used in the confidence measure. Defaults to 95.
        kwargs: Keyword arguments to `_get_loud_bins_mask`. Namely, representation can
          go here as a keyword argument.

    Returns:
        float: Confidence given by size of smallest cluster.
    """
    mask, _ = _get_loud_bins_mask(threshold, audio_signal, **kwargs)
    embedding_size = features.shape[-1]
    features = features[mask].reshape(-1, embedding_size)

    kmeans = KMeans(num_sources)
    labels = kmeans.fit_predict(features)
    # Fraction of points assigned to each cluster; bincount is a single
    # vectorized pass over the labels.
    source_shares = np.bincount(
        labels, minlength=num_sources).astype(float)
    source_shares /= source_shares.sum()
    confidence = source_shares.min()
    return confidence
def whitened_kmeans_confidence(audio_signal, features, num_sources, threshold=95,
                               **kwargs):
    """
    Computes the clusterability in two steps:

    1. Cluster the feature space using KMeans into assignments.
    2. Compute the Whitened K-Means loss between the features and the assignments.

    Args:
        audio_signal (AudioSignal): AudioSignal object which will be used to compute
          the mask over which to compute the confidence measure. This can be None, if
          and only if ``representation`` is passed as a keyword argument to this
          function.
        features (np.ndarray): Numpy array containing the features to be clustered.
          Should have the same dimensions as the representation.
        num_sources (int): Number of sources to cluster the features into.
        threshold (int, optional): Threshold by loudness. Points below the threshold are
          excluded from being used in the confidence measure. Defaults to 95.
        kwargs: Keyword arguments to `_get_loud_bins_mask`. Namely, representation can
          go here as a keyword argument.

    Returns:
        float: Confidence given by whitened k-means loss.
    """
    mask, representation = _get_loud_bins_mask(threshold, audio_signal, **kwargs)
    embedding_size = features.shape[-1]
    features = features[mask].reshape(-1, embedding_size)
    weights = representation[mask].reshape(-1)

    kmeans = KMeans(num_sources)
    distances = kmeans.fit_transform(features)
    # One-hot assignment to the NEAREST centroid (minimum distance). The
    # previous comparison against the max assigned each point to its
    # farthest centroid, inverting the confidence measure.
    assignments = (distances == distances.min(axis=-1, keepdims=True))

    loss_func = loss.WhitenedKMeansLoss()
    # Loss functions expect a batch dimension and float tensors.
    features = torch.from_numpy(features).unsqueeze(0).float()
    assignments = torch.from_numpy(assignments).unsqueeze(0).float()
    weights = torch.from_numpy(weights).unsqueeze(0).float()

    loss_val = loss_func(features, assignments, weights).item()
    # WhitenedKMeansLoss is bounded above by D + C; map it to [0, 1].
    upper_bound = embedding_size + num_sources
    confidence = 1 - (loss_val / upper_bound)
    return confidence
def dpcl_classic_confidence(audio_signal, features, num_sources, threshold=95,
                            **kwargs):
    """
    Computes the clusterability in two steps:

    1. Cluster the feature space using KMeans into assignments.
    2. Compute the classic deep clustering loss between the features and the
       assignments.

    Args:
        audio_signal (AudioSignal): AudioSignal object which will be used to compute
          the mask over which to compute the confidence measure. This can be None, if
          and only if ``representation`` is passed as a keyword argument to this
          function.
        features (np.ndarray): Numpy array containing the features to be clustered.
          Should have the same dimensions as the representation.
        num_sources (int): Number of sources to cluster the features into.
        threshold (int, optional): Threshold by loudness. Points below the threshold are
          excluded from being used in the confidence measure. Defaults to 95.
        kwargs: Keyword arguments to `_get_loud_bins_mask`. Namely, representation can
          go here as a keyword argument.

    Returns:
        float: Confidence given by deep clustering loss.
    """
    mask, representation = _get_loud_bins_mask(threshold, audio_signal, **kwargs)
    embedding_size = features.shape[-1]
    features = features[mask].reshape(-1, embedding_size)
    weights = representation[mask].reshape(-1)

    kmeans = KMeans(num_sources)
    distances = kmeans.fit_transform(features)
    # One-hot assignment to the NEAREST centroid (minimum distance). The
    # previous comparison against the max assigned each point to its
    # farthest centroid, inverting the confidence measure.
    assignments = (distances == distances.min(axis=-1, keepdims=True))

    loss_func = loss.DeepClusteringLoss()
    # Loss functions expect a batch dimension and float tensors.
    features = torch.from_numpy(features).unsqueeze(0).float()
    assignments = torch.from_numpy(assignments).unsqueeze(0).float()
    weights = torch.from_numpy(weights).unsqueeze(0).float()

    loss_val = loss_func(features, assignments, weights).item()
    # DeepClusteringLoss is normalized to [0, 1]; invert so higher is better.
    confidence = 1 - loss_val
    return confidence