import numpy as np
import museval
from .evaluation_base import EvaluationBase
def _scale_bss_eval(references, estimate, idx, compute_sir_sar=True):
"""
    Helper for ``scale_bss_eval``; computes the metrics for a single signal
    so that ``scale_bss_eval`` can call it for both the estimate and the
    mixture without recursing.
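
    A quick sketch of the scaling step used below, on hypothetical toy
    mono data (``s`` is a reference and ``e`` is an estimate at half its
    scale)::

        >>> import numpy as np
        >>> s = np.array([1., 2., 3.])
        >>> e = 0.5 * s
        >>> float(s @ e / (s ** 2).sum())  # alpha, the optimal scaling
        0.5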
"""
source = references[..., idx]
source_energy = (source ** 2).sum()
    # project the estimate onto the chosen reference to find the optimal
    # scaling factor alpha for the scale-invariant metrics
    alpha = (
        source @ estimate / source_energy
    )
    # SNR: compare the estimate against the unscaled reference
    e_true = source
    e_res = estimate - e_true

    signal = (e_true ** 2).sum()
    noise = (e_res ** 2).sum()
    snr = 10 * np.log10(signal / noise)

    # SI-SDR: compare the estimate against the optimally scaled reference
    e_true = source * alpha
    e_res = estimate - e_true

    signal = (e_true ** 2).sum()
    noise = (e_res ** 2).sum()
    si_sdr = 10 * np.log10(signal / noise)
    srr = -10 * np.log10((1 - (1/alpha)) ** 2)
    sd_sdr = snr + 10 * np.log10(alpha ** 2)

    # SI-SIR and SI-SAR are optional, since the least-squares projection
    # below can be computationally expensive
    si_sir = np.nan
    si_sar = np.nan
if compute_sir_sar:
        # decompose the residual into interference (the component lying in
        # the span of the references, found via least squares) and
        # artifacts (whatever remains)
        references_projection = references.T @ references
        references_onto_residual = references.T @ e_res

        b = np.linalg.solve(references_projection, references_onto_residual)

        e_interf = references @ b
        e_artif = e_res - e_interf

        si_sir = 10 * np.log10(signal / (e_interf ** 2).sum())
        si_sar = 10 * np.log10(signal / (e_artif ** 2).sum())
return si_sdr, si_sir, si_sar, sd_sdr, snr, srr
def scale_bss_eval(references, estimate, mixture, idx,
                   compute_sir_sar=True):
"""
    Computes metrics for references[idx] relative to the
    chosen estimate. This only works for mono audio. Each
    channel should be evaluated independently when calling this
    function. Lovingly borrowed from Gordon Wichern and
    Jonathan Le Roux at Mitsubishi Electric Research Labs.
This returns 9 numbers (in this order):
- SI-SDR: Scale-invariant source-to-distortion ratio. Higher is better.
- SI-SIR: Scale-invariant source-to-interference ratio. Higher is better.
- SI-SAR: Scale-invariant source-to-artifact ratio. Higher is better.
- SD-SDR: Scale-dependent source-to-distortion ratio. Higher is better.
- SNR: Signal-to-noise ratio. Higher is better.
- SRR: The source-to-rescaled-source ratio. This corresponds to
a term that punishes the estimate if its scale is off relative
to the reference. This is an unnumbered equation in [1], but
is the term on page 2, second column, second to last line:
||s - alpha*s||**2. s here is factored out. Higher is better.
- SI-SDRi: Improvement in SI-SDR over using the mixture as the estimate.
- SD-SDRi: Improvement in SD-SDR over using the mixture as the estimate.
- SNRi: Improvement in SNR over using the mixture as the estimate.
References:
        [1] Le Roux, J., Wisdom, S., Erdogan, H., & Hershey, J. R.
            (2019, May). SDR - half-baked or well done? In ICASSP 2019 - 2019
            IEEE International Conference on Acoustics, Speech and Signal
            Processing (ICASSP) (pp. 626-630). IEEE.
Args:
references (np.ndarray): object containing the
references data. Of shape (n_samples, n_sources).
estimate (np.ndarray): object containing the
estimate data. Of shape (n_samples, 1).
        mixture (np.ndarray): object containing the
            mixture data. Of shape (n_samples, 1).
idx (int): Which reference to compute metrics against.
compute_sir_sar (bool, optional): Whether or not to compute SIR/SAR
metrics, which can be computationally expensive and may not be
            relevant for your evaluation. Defaults to True.
Returns:
tuple: SI-SDR, SI-SIR, SI-SAR, SD-SDR, SNR, SRR, SI-SDRi, SD-SDRi, SNRi
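
    Example:
        A minimal usage sketch on hypothetical random mono data (two
        references, one estimate of the first reference, and their
        mixture; all names here are made up for illustration)::

            >>> import numpy as np
            >>> rng = np.random.RandomState(0)
            >>> references = rng.randn(44100, 2)  # (n_samples, n_sources)
            >>> estimate = references[:, 0] + 0.1 * rng.randn(44100)
            >>> mixture = references.sum(axis=-1)
            >>> metrics = scale_bss_eval(references, estimate, mixture, idx=0)
            >>> len(metrics)
            9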
"""
si_sdr, si_sir, si_sar, sd_sdr, snr, srr = _scale_bss_eval(
references, estimate, idx, compute_sir_sar=compute_sir_sar)
    # metrics for using the mixture itself as the estimate; the "i"
    # (improvement) metrics below are measured relative to these
    mix_metrics = _scale_bss_eval(
        references, mixture, idx, compute_sir_sar=False)
si_sdri = si_sdr - mix_metrics[0]
sd_sdri = sd_sdr - mix_metrics[3]
snri = snr - mix_metrics[4]
return si_sdr, si_sir, si_sar, sd_sdr, snr, srr, si_sdri, sd_sdri, snri
class BSSEvaluationBase(EvaluationBase):
"""
    Base class for all evaluation classes that are based on BSSEval metrics. This
    contains some useful verification functions and preprocessing functions that
    are used in many separation-based evaluations. Specific evaluation metrics are
    thin wrappers around this base class, basically only implementing the
    ``self.evaluate_helper`` function.

    Both ``true_sources_list`` and ``estimated_sources_list`` get validated
    using the private method :func:`_verify_input_list`. If your evaluation
    needs to verify that input is set correctly (recommended), override that
    method to add checking.
Args:
true_sources_list (list): List of objects that contain one ground truth source per object.
In some instances (such as the :class:`BSSEval` objects) this list is filled with
:class:`AudioSignals` but in other cases it is populated with
:class:`MaskBase` -derived objects (i.e., either a :class:`BinaryMask` or
:class:`SoftMask` object).
estimated_sources_list (list): List of objects that contain source estimations from a source
separation algorithm. List should be populated with the same type of objects and in the
            same order as ``true_sources_list``.
        source_labels (list): List of strings that are labels for each source, used
            as keys for the scores. Default value is ``None``, in which case each
            signal's ``file_name`` attribute is used. If that is also ``None``, then
            the source labels are ``Source 0``, ``Source 1``, etc.
compute_permutation (bool): Whether or not to evaluate in a permutation-invariant
fashion, where the estimates are permuted to match the true sources. Only the
best permutation according to ``best_permutation_key`` is returned to the
scores dict. Defaults to False.
best_permutation_key (str): Which metric to use to decide which permutation of
the sources was best.
**kwargs (dict): Any additional arguments are passed on to evaluate_helper.
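
    Example:
        A minimal sketch of how a concrete subclass is used; the
        ``AudioSignal`` variables here are hypothetical, and
        ``BSSEvalScale`` is defined below::

            >>> evaluator = BSSEvalScale(
            ...     true_sources_list=[true_vocals, true_drums],
            ...     estimated_sources_list=[est_vocals, est_drums],
            ...     source_labels=['vocals', 'drums'])
            >>> scores = evaluator.evaluate()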
"""
def __init__(self, true_sources_list, estimated_sources_list, source_labels=None,
compute_permutation=False, best_permutation_key="SDR", **kwargs):
super().__init__(true_sources_list, estimated_sources_list, source_labels=source_labels,
compute_permutation=compute_permutation,
best_permutation_key=best_permutation_key,
**kwargs)
    def preprocess(self):
"""
Implements preprocess by stacking the audio_data inside each AudioSignal
object in both self.true_sources_list and self.estimated_sources_list.
Returns:
tuple: Tuple containing reference and estimate arrays.
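
        Example:
            A shape sketch, assuming two hypothetical stereo sources of
            44100 samples (each ``audio_data`` is (n_channels, n_samples))::

                >>> stacked = np.stack([np.zeros((2, 44100))] * 2, axis=-1)
                >>> stacked.shape
                (2, 44100, 2)
                >>> stacked.transpose(1, 0, 2).shape
                (44100, 2, 2)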
"""
references = np.stack(
[x.audio_data for x in self.true_sources_list],
axis=-1
)
estimates = np.stack(
[x.audio_data for x in self.estimated_sources_list],
axis=-1
)
        # go from (n_channels, n_samples, n_sources) to
        # (n_samples, n_channels, n_sources)
        return references.transpose(1, 0, 2), estimates.transpose(1, 0, 2)
class BSSEvalV4(BSSEvaluationBase):
    def evaluate_helper(self, references, estimates, **kwargs):
"""
Implements evaluation using museval.metrics.bss_eval
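
        Example:
            A minimal sketch of the underlying museval call on random data;
            shapes follow museval's (nsrc, nsampl, nchan) convention::

                >>> import numpy as np
                >>> import museval
                >>> refs = np.random.rand(2, 44100, 2)
                >>> ests = np.random.rand(2, 44100, 2)
                >>> sdr, isr, sir, sar, perm = museval.metrics.bss_eval(
                ...     refs, ests, compute_permutation=False)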
"""
# museval expects shape=(nsrc, nsampl, nchan)
# we have (nsampl, nchan, nsrc)
# so let's massage the data so it matches before feeding it in
references = np.transpose(references, (2, 0, 1))
estimates = np.transpose(estimates, (2, 0, 1))
sdr, isr, sir, sar, _ = museval.metrics.bss_eval(
references, estimates, compute_permutation=False, **kwargs)
scores = []
for j in range(references.shape[0]):
score = {
'SDR': sdr[j].tolist(),
'ISR': isr[j].tolist(),
'SIR': sir[j].tolist(),
'SAR': sar[j].tolist(),
}
scores.append(score)
return scores
class BSSEvalScale(BSSEvaluationBase):
    def preprocess(self):
"""
        Scale-invariant metrics expect zero-mean references, estimates, and
        mixture, so everything is mean-centered along the samples axis here.
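
        Example:
            A quick sketch of the centering on a hypothetical
            (n_samples, n_channels, n_sources) array::

                >>> x = np.arange(12.).reshape(3, 2, 2)
                >>> centered = x - x.mean(axis=0)
                >>> bool(np.allclose(centered.mean(axis=0), 0))
                True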
"""
references, estimates = super().preprocess()
        # the mixture (sum of the references) is needed for the improvement
        # metrics; center it along the samples axis like everything else
        mixture = references.sum(axis=-1)
        mixture -= mixture.mean(axis=0)
        self.mixture = mixture
references -= references.mean(axis=0)
estimates -= estimates.mean(axis=0)
return references, estimates
    def evaluate_helper(self, references, estimates, compute_sir_sar=True):
"""
Implements evaluation using new BSSEval metrics [1]. This computes every
metric described in [1], including:
- SI-SDR: Scale-invariant source-to-distortion ratio. Higher is better.
- SI-SIR: Scale-invariant source-to-interference ratio. Higher is better.
- SI-SAR: Scale-invariant source-to-artifact ratio. Higher is better.
- SD-SDR: Scale-dependent source-to-distortion ratio. Higher is better.
- SNR: Signal-to-noise ratio. Higher is better.
- SRR: The source-to-rescaled-source ratio. This corresponds to
a term that punishes the estimate if its scale is off relative
to the reference. This is an unnumbered equation in [1], but
is the term on page 2, second column, second to last line:
||s - alpha*s||**2. s is factored out. Higher is better.
- SI-SDRi: Improvement in SI-SDR over using the mixture as the estimate. Higher
is better.
- SD-SDRi: Improvement in SD-SDR over using the mixture as the estimate. Higher
is better.
- SNRi: Improvement in SNR over using the mixture as the estimate. Higher is
better.
Note:
If `compute_sir_sar = False`, then you'll get `np.nan` for SI-SIR and
SI-SAR!
References:
            [1] Le Roux, J., Wisdom, S., Erdogan, H., & Hershey, J. R.
                (2019, May). SDR - half-baked or well done? In ICASSP 2019 - 2019
                IEEE International Conference on Acoustics, Speech and Signal
                Processing (ICASSP) (pp. 626-630). IEEE.
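
        Example:
            A minimal sketch with hypothetical AudioSignal objects; passing
            ``compute_sir_sar=False`` through the constructor (it is
            forwarded to this method) skips the expensive SIR/SAR step::

                >>> evaluator = BSSEvalScale(
                ...     [true_src0, true_src1], [est_src0, est_src1],
                ...     compute_sir_sar=False)
                >>> scores = evaluator.evaluate()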
"""
        metric_names = ['SI-SDR', 'SI-SIR', 'SI-SAR', 'SD-SDR', 'SNR', 'SRR',
                        'SI-SDRi', 'SD-SDRi', 'SNRi']
        scores = []
        for j in range(references.shape[-1]):
            # evaluate source j against each channel independently;
            # scale_bss_eval returns one value per metric for each channel
            per_channel = [
                scale_bss_eval(
                    references[..., ch, :], estimates[..., ch, j],
                    self.mixture[..., ch], j, compute_sir_sar=compute_sir_sar
                )
                for ch in range(references.shape[-2])
            ]
            # transpose the per-channel tuples into per-metric lists
            scores.append({
                name: [channel_values[m] for channel_values in per_channel]
                for m, name in enumerate(metric_names)
            })
        return scores