import numpy as np
import museval
from .evaluation_base import EvaluationBase
def _scale_bss_eval(references, estimate, idx, compute_sir_sar=True):
"""
    Helper for ``scale_bss_eval``; computes the metrics for a single signal
    so that ``scale_bss_eval`` can call it for both the estimate and the
    mixture without recursing.
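
    A quick sketch of the scaling step used below, on hypothetical toy
    mono data (``s`` is a reference and ``e`` is an estimate at half its
    scale)::

        >>> import numpy as np
        >>> s = np.array([1., 2., 3.])
        >>> e = 0.5 * s
        >>> float(s @ e / (s ** 2).sum())  # alpha, the optimal scaling
        0.5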
"""
source = references[..., idx]
source_energy = (source ** 2).sum()
    # project the estimate onto the chosen reference to find the optimal
    # scaling factor alpha for the scale-invariant metrics
    alpha = (
        source @ estimate / source_energy
    )
    # SNR: compare the estimate against the unscaled reference
    e_true = source
    e_res = estimate - e_true

    signal = (e_true ** 2).sum()
    noise = (e_res ** 2).sum()
    snr = 10 * np.log10(signal / noise)

    # SI-SDR: compare the estimate against the optimally scaled reference
    e_true = source * alpha
    e_res = estimate - e_true

    signal = (e_true ** 2).sum()
    noise = (e_res ** 2).sum()
    si_sdr = 10 * np.log10(signal / noise)
    srr = -10 * np.log10((1 - (1/alpha)) ** 2)
    sd_sdr = snr + 10 * np.log10(alpha ** 2)

    # SI-SIR and SI-SAR are optional, since the least-squares projection
    # below can be computationally expensive
    si_sir = np.nan
    si_sar = np.nan
if compute_sir_sar:
        # decompose the residual into interference (the component lying in
        # the span of the references, found via least squares) and
        # artifacts (whatever remains)
        references_projection = references.T @ references
        references_onto_residual = references.T @ e_res

        b = np.linalg.solve(references_projection, references_onto_residual)

        e_interf = references @ b
        e_artif = e_res - e_interf

        si_sir = 10 * np.log10(signal / (e_interf ** 2).sum())
        si_sar = 10 * np.log10(signal / (e_artif ** 2).sum())
return si_sdr, si_sir, si_sar, sd_sdr, snr, srr
def scale_bss_eval(references, estimate, mixture, idx,
                   compute_sir_sar=True):
"""
    Computes metrics for references[idx] relative to the
    chosen estimate. This only works for mono audio. Each
    channel should be evaluated independently when calling this
    function. Lovingly borrowed from Gordon Wichern and
    Jonathan Le Roux at Mitsubishi Electric Research Labs.
This returns 9 numbers (in this order):
- SI-SDR: Scale-invariant source-to-distortion ratio. Higher is better.
- SI-SIR: Scale-invariant source-to-interference ratio. Higher is better.
- SI-SAR: Scale-invariant source-to-artifact ratio. Higher is better.
- SD-SDR: Scale-dependent source-to-distortion ratio. Higher is better.
- SNR: Signal-to-noise ratio. Higher is better.
- SRR: The source-to-rescaled-source ratio. This corresponds to
a term that punishes the estimate if its scale is off relative
to the reference. This is an unnumbered equation in [1], but
is the term on page 2, second column, second to last line:
||s - alpha*s||**2. s here is factored out. Higher is better.
- SI-SDRi: Improvement in SI-SDR over using the mixture as the estimate.
- SD-SDRi: Improvement in SD-SDR over using the mixture as the estimate.
- SNRi: Improvement in SNR over using the mixture as the estimate.
References:
        [1] Le Roux, J., Wisdom, S., Erdogan, H., & Hershey, J. R.
            (2019, May). SDR - half-baked or well done? In ICASSP 2019 - 2019
            IEEE International Conference on Acoustics, Speech and Signal
            Processing (ICASSP) (pp. 626-630). IEEE.
Args:
references (np.ndarray): object containing the
references data. Of shape (n_samples, n_sources).
estimate (np.ndarray): object containing the
estimate data. Of shape (n_samples, 1).
        mixture (np.ndarray): object containing the
            mixture data. Of shape (n_samples, 1).
idx (int): Which reference to compute metrics against.
compute_sir_sar (bool, optional): Whether or not to compute SIR/SAR
metrics, which can be computationally expensive and may not be
            relevant for your evaluation. Defaults to True.
Returns:
tuple: SI-SDR, SI-SIR, SI-SAR, SD-SDR, SNR, SRR, SI-SDRi, SD-SDRi, SNRi
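
    Example:
        A minimal usage sketch on hypothetical random mono data (two
        references, one estimate of the first reference, and their
        mixture; all names here are made up for illustration)::

            >>> import numpy as np
            >>> rng = np.random.RandomState(0)
            >>> references = rng.randn(44100, 2)  # (n_samples, n_sources)
            >>> estimate = references[:, 0] + 0.1 * rng.randn(44100)
            >>> mixture = references.sum(axis=-1)
            >>> metrics = scale_bss_eval(references, estimate, mixture, idx=0)
            >>> len(metrics)
            9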
"""
si_sdr, si_sir, si_sar, sd_sdr, snr, srr = _scale_bss_eval(
references, estimate, idx, compute_sir_sar=compute_sir_sar)
    # metrics for using the mixture itself as the estimate; the "i"
    # (improvement) metrics below are measured relative to these
    mix_metrics = _scale_bss_eval(
        references, mixture, idx, compute_sir_sar=False)
si_sdri = si_sdr - mix_metrics[0]
sd_sdri = sd_sdr - mix_metrics[3]
snri = snr - mix_metrics[4]
return si_sdr, si_sir, si_sar, sd_sdr, snr, srr, si_sdri, sd_sdri, snri
class BSSEvaluationBase(EvaluationBase):
"""
    Base class for all evaluation classes that are based on BSSEval metrics. This
    contains some useful verification functions and preprocessing functions that
    are used in many separation-based evaluations. Specific evaluation metrics are
    thin wrappers around this base class, basically only implementing the
    ``self.evaluate_helper`` function.

    Both ``true_sources_list`` and ``estimated_sources_list`` get validated
    using the private method :func:`_verify_input_list`. If your evaluation
    needs to verify that input is set correctly (recommended), override that
    method to add checking.
Args:
true_sources_list (list): List of objects that contain one ground truth source per object.
In some instances (such as the :class:`BSSEval` objects) this list is filled with
:class:`AudioSignals` but in other cases it is populated with
:class:`MaskBase` -derived objects (i.e., either a :class:`BinaryMask` or
:class:`SoftMask` object).
estimated_sources_list (list): List of objects that contain source estimations from a source
separation algorithm. List should be populated with the same type of objects and in the
            same order as ``true_sources_list``.
        source_labels (list): List of strings that are labels for each source, used
            as keys for the scores. Default value is ``None``, in which case each
            signal's ``file_name`` attribute is used. If that is also ``None``, then
            the source labels are ``Source 0``, ``Source 1``, etc.
compute_permutation (bool): Whether or not to evaluate in a permutation-invariant
fashion, where the estimates are permuted to match the true sources. Only the
best permutation according to ``best_permutation_key`` is returned to the
scores dict. Defaults to False.
best_permutation_key (str): Which metric to use to decide which permutation of
the sources was best.
**kwargs (dict): Any additional arguments are passed on to evaluate_helper.
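
    Example:
        A minimal sketch of how a concrete subclass is used; the
        ``AudioSignal`` variables here are hypothetical, and
        ``BSSEvalScale`` is defined below::

            >>> evaluator = BSSEvalScale(
            ...     true_sources_list=[true_vocals, true_drums],
            ...     estimated_sources_list=[est_vocals, est_drums],
            ...     source_labels=['vocals', 'drums'])
            >>> scores = evaluator.evaluate()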
"""
def __init__(self, true_sources_list, estimated_sources_list, source_labels=None,
compute_permutation=False, best_permutation_key="SDR", **kwargs):
super().__init__(true_sources_list, estimated_sources_list, source_labels=source_labels,
compute_permutation=compute_permutation,
best_permutation_key=best_permutation_key,
**kwargs)
    def preprocess(self):
"""
Implements preprocess by stacking the audio_data inside each AudioSignal
object in both self.true_sources_list and self.estimated_sources_list.
Returns:
tuple: Tuple containing reference and estimate arrays.
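
        Example:
            A shape sketch, assuming two hypothetical stereo sources of
            44100 samples (each ``audio_data`` is (n_channels, n_samples))::

                >>> stacked = np.stack([np.zeros((2, 44100))] * 2, axis=-1)
                >>> stacked.shape
                (2, 44100, 2)
                >>> stacked.transpose(1, 0, 2).shape
                (44100, 2, 2)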
"""
references = np.stack(
[x.audio_data for x in self.true_sources_list],
axis=-1
)
estimates = np.stack(
[x.audio_data for x in self.estimated_sources_list],
axis=-1
)
        # go from (n_channels, n_samples, n_sources) to
        # (n_samples, n_channels, n_sources)
        return references.transpose(1, 0, 2), estimates.transpose(1, 0, 2)
class BSSEvalV4(BSSEvaluationBase):
    def evaluate_helper(self, references, estimates, **kwargs):
"""
Implements evaluation using museval.metrics.bss_eval
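
        Example:
            A minimal sketch of the underlying museval call on random data;
            shapes follow museval's (nsrc, nsampl, nchan) convention::

                >>> import numpy as np
                >>> import museval
                >>> refs = np.random.rand(2, 44100, 2)
                >>> ests = np.random.rand(2, 44100, 2)
                >>> sdr, isr, sir, sar, perm = museval.metrics.bss_eval(
                ...     refs, ests, compute_permutation=False)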
"""
# museval expects shape=(nsrc, nsampl, nchan)
# we have (nsampl, nchan, nsrc)
# so let's massage the data so it matches before feeding it in
references = np.transpose(references, (2, 0, 1))
estimates = np.transpose(estimates, (2, 0, 1))
sdr, isr, sir, sar, _ = museval.metrics.bss_eval(
references, estimates, compute_permutation=False, **kwargs)
scores = []
for j in range(references.shape[0]):
score = {
'SDR': sdr[j].tolist(),
'ISR': isr[j].tolist(),
'SIR': sir[j].tolist(),
'SAR': sar[j].tolist(),
}
scores.append(score)
return scores
class BSSEvalScale(BSSEvaluationBase):
    def preprocess(self):
"""
        Scale-invariant metrics expect zero-mean references, estimates, and
        mixture, so everything is mean-centered along the samples axis here.
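
        Example:
            A quick sketch of the centering on a hypothetical
            (n_samples, n_channels, n_sources) array::

                >>> x = np.arange(12.).reshape(3, 2, 2)
                >>> centered = x - x.mean(axis=0)
                >>> bool(np.allclose(centered.mean(axis=0), 0))
                True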
"""
references, estimates = super().preprocess()
        # the mixture (sum of the references) is needed for the improvement
        # metrics; center it along the samples axis like everything else
        mixture = references.sum(axis=-1)
        mixture -= mixture.mean(axis=0)
        self.mixture = mixture
references -= references.mean(axis=0)
estimates -= estimates.mean(axis=0)
return references, estimates
    def evaluate_helper(self, references, estimates, compute_sir_sar=True):
"""
Implements evaluation using new BSSEval metrics [1]. This computes every
metric described in [1], including:
- SI-SDR: Scale-invariant source-to-distortion ratio. Higher is better.
- SI-SIR: Scale-invariant source-to-interference ratio. Higher is better.
- SI-SAR: Scale-invariant source-to-artifact ratio. Higher is better.
- SD-SDR: Scale-dependent source-to-distortion ratio. Higher is better.
- SNR: Signal-to-noise ratio. Higher is better.
- SRR: The source-to-rescaled-source ratio. This corresponds to
a term that punishes the estimate if its scale is off relative
to the reference. This is an unnumbered equation in [1], but
is the term on page 2, second column, second to last line:
||s - alpha*s||**2. s is factored out. Higher is better.
- SI-SDRi: Improvement in SI-SDR over using the mixture as the estimate. Higher
is better.
- SD-SDRi: Improvement in SD-SDR over using the mixture as the estimate. Higher
is better.
- SNRi: Improvement in SNR over using the mixture as the estimate. Higher is
better.
Note:
If `compute_sir_sar = False`, then you'll get `np.nan` for SI-SIR and
SI-SAR!
References:
            [1] Le Roux, J., Wisdom, S., Erdogan, H., & Hershey, J. R.
                (2019, May). SDR - half-baked or well done? In ICASSP 2019 - 2019
                IEEE International Conference on Acoustics, Speech and Signal
                Processing (ICASSP) (pp. 626-630). IEEE.
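
        Example:
            A minimal sketch with hypothetical AudioSignal objects; passing
            ``compute_sir_sar=False`` through the constructor (it is
            forwarded to this method) skips the expensive SIR/SAR step::

                >>> evaluator = BSSEvalScale(
                ...     [true_src0, true_src1], [est_src0, est_src1],
                ...     compute_sir_sar=False)
                >>> scores = evaluator.evaluate()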
"""
        metric_names = ['SI-SDR', 'SI-SIR', 'SI-SAR', 'SD-SDR', 'SNR', 'SRR',
                        'SI-SDRi', 'SD-SDRi', 'SNRi']
        scores = []
        for j in range(references.shape[-1]):
            # evaluate source j against each channel independently;
            # scale_bss_eval returns one value per metric for each channel
            per_channel = [
                scale_bss_eval(
                    references[..., ch, :], estimates[..., ch, j],
                    self.mixture[..., ch], j, compute_sir_sar=compute_sir_sar
                )
                for ch in range(references.shape[-2])
            ]
            # transpose the per-channel tuples into per-metric lists
            scores.append({
                name: [channel_values[m] for channel_values in per_channel]
                for m, name in enumerate(metric_names)
            })
        return scores