Source code for nussl.datasets.hooks

"""
While *nussl* does not come with any data sets, it does have the capability to interface with
many common source separation data sets used within the MIR and speech separation communities.
These data set "hooks" subclass BaseDataset and by default return AudioSignal objects in
labeled dictionaries for ease of use. Transforms can be applied to these datasets for use
in machine learning pipelines.
"""
import os

import musdb
import jams

from ..core import constants, utils
from .base_dataset import BaseDataset, DataSetException


class MUSDB18(BaseDataset):
    """
    Hook for MUSDB18. Uses the ``musdb.DB`` object to access the
    dataset. If ``download=True``, then the 7s snippets of each track
    are downloaded to ``self.folder``. If no folder is given, the
    ``musdb18`` folder inside ``constants.DEFAULT_DOWNLOAD_DIRECTORY``
    is used.

    Getting an item from this dataset with no transforms returns the
    following dictionary:

    .. code-block:: none

        {
            'mix': [AudioSignal object containing mix audio],
            'sources': {
                'bass': [AudioSignal object containing bass],
                'drums': [AudioSignal object containing drums],
                'other': [AudioSignal object containing other],
                'vocals': [AudioSignal object containing vocals],
            },
            'metadata': {
                'labels': ['bass', 'drums', 'other', 'vocals']
            }
        }

    Args:
        folder (str, optional): Location that should be processed to produce the
            list of files. Defaults to None.
        is_wav (bool, optional): Expect subfolder with wav files for each source
            instead of stems. Defaults to False.
        download (bool, optional): Download sample version of MUSDB18 which
            includes 7s excerpts. Defaults to False.
        subsets (list, optional): Select a musdb subset, train or test.
            Defaults to ['train', 'test'] (all tracks).
        split (str, optional): When subset train is loaded, split selects the
            train/validation split. ``split='train'`` loads the training split,
            ``split='valid'`` loads the validation split. ``split=None`` applies
            no splitting. Defaults to None.
        **kwargs: Any additional arguments that are passed up to BaseDataset
            (see ``nussl.datasets.BaseDataset``).
    """
    DATASET_HASHES = {
        "musdb": "56777516ad56fe6a8590badf877e6be013ff932c010e0fbdb0aba03ef878d4cd",
    }

    def __init__(self, folder=None, is_wav=False, download=False,
                 subsets=None, split=None, **kwargs):
        subsets = ['train', 'test'] if subsets is None else subsets
        if folder is None:
            folder = os.path.join(
                constants.DEFAULT_DOWNLOAD_DIRECTORY, 'musdb18'
            )
        # NOTE(review): self.musdb is read by get_items, which is presumably
        # invoked from BaseDataset.__init__ -- keep this assignment before
        # the super().__init__ call.
        self.musdb = musdb.DB(root=folder, is_wav=is_wav, download=download,
                              subsets=subsets, split=split)
        super().__init__(folder, **kwargs)

    def get_items(self, folder):
        """
        Return one integer index per track in the ``musdb.DB`` object.

        Args:
            folder (str): Unused here; the tracks come from ``self.musdb``.

        Returns:
            list: ``[0, 1, ..., len(self.musdb) - 1]``.
        """
        return list(range(len(self.musdb)))

    def process_item(self, item):
        """
        Convert the musdb track at index ``item`` into AudioSignal objects.

        Args:
            item (int): Index into ``self.musdb``.

        Returns:
            dict: Dictionary with the keys 'mix', 'sources' and 'metadata',
            as described in the class docstring.
        """
        track = self.musdb[item]
        mix, sources = utils.musdb_track_to_audio_signals(track)
        self._setup_audio_signal(mix)
        # Only iterating, so there is no need to copy the values into a list.
        for source in sources.values():
            self._setup_audio_signal(source)

        output = {
            'mix': mix,
            'sources': sources,
            'metadata': {
                'labels': ['bass', 'drums', 'other', 'vocals']
            }
        }
        return output


class MixSourceFolder(BaseDataset):
    """
    This dataset expects your data to be formatted in the following way:

    .. code-block:: none

        data/
            mix/
                [file0].wav
                [file1].wav
                [file2].wav
                ...
            [label0]/
                [file0].wav
                [file1].wav
                [file2].wav
                ...
            [label1]/
                [file0].wav
                [file1].wav
                [file2].wav
                ...
            [label2]/
                [file0].wav
                [file1].wav
                [file2].wav
                ...
            ...

    Note that the filenames match between the mix folder and each source folder.
    The source folder names can be whatever you want. Given a file in the
    ``self.mix_folder`` folder, this dataset will look up the corresponding files
    with the same name in the source folders. These are the source audio files.
    The sum of the sources should equal the mixture. Each source will be labeled
    according to the folder name it comes from. A source folder that does not
    contain the file is left out of ``'sources'`` for that item (it is still
    listed in ``'labels'``).

    Getting an item from this dataset with no transforms returns the
    following dictionary:

    .. code-block:: none

        {
            'mix': [AudioSignal object containing mix audio],
            'sources': {
                '[label0]': [AudioSignal object containing label0 audio],
                '[label1]': [AudioSignal object containing label1 audio],
                '[label2]': [AudioSignal object containing label2 audio],
                '[label3]': [AudioSignal object containing label3 audio],
                ...
            },
            'metadata': {
                'labels': ['label0', 'label1', 'label2', 'label3']
            }
        }


    Args:
        folder (str): Location that should be processed to produce the
            list of files.
        mix_folder (str, optional): Folder to look in for mixtures. Defaults to 'mix'.
        source_folders (list, optional): List of folders to look in for sources.
            Path is defined relative to folder. If None, all folders other than
            mix_folder are treated as the source folders. Defaults to None.
        sample_rate (int, optional): Sample rate forwarded to BaseDataset.
            Defaults to None.
        ext (list, optional): Audio extensions to look for in mix_folder.
            Defaults to ['.wav', '.flac', '.mp3'].
        **kwargs: Any additional arguments that are passed up to BaseDataset
            (see ``nussl.datasets.BaseDataset``).
    """
    def __init__(self, folder, mix_folder='mix', source_folders=None, sample_rate=None,
                 ext=None, **kwargs):
        self.mix_folder = mix_folder
        self.source_folders = source_folders
        self.ext = ['.wav', '.flac', '.mp3'] if ext is None else ext
        # sample_rate is a named parameter here, so it never reaches **kwargs;
        # forward it explicitly, otherwise the caller's value is dropped.
        super().__init__(folder, sample_rate=sample_rate, **kwargs)

    def get_items(self, folder):
        """
        List the mixture files found in ``folder/self.mix_folder``.

        If no source folders were given, every sub-directory of ``folder``
        other than the mix folder is stored (sorted) in ``self.source_folders``.

        Args:
            folder (str): Root folder of the dataset.

        Returns:
            list: Sorted file names in the mix folder whose extension is in
            ``self.ext``.
        """
        if self.source_folders is None:
            self.source_folders = sorted([
                f for f in os.listdir(folder)
                if os.path.isdir(os.path.join(folder, f))
                and f != self.mix_folder
            ])

        mix_folder = os.path.join(folder, self.mix_folder)
        items = sorted([
            x for x in os.listdir(mix_folder)
            if os.path.splitext(x)[1] in self.ext
        ])
        return items

    def process_item(self, item):
        """
        Load the mixture ``item`` and the same-named file from each source folder.

        Args:
            item (str): File name inside the mix folder.

        Returns:
            dict: Dictionary with the keys 'mix', 'sources' and 'metadata',
            as described in the class docstring.
        """
        mix_path = os.path.join(self.folder, self.mix_folder, item)
        mix = self._load_audio_file(mix_path)
        sources = {}
        for k in self.source_folders:
            source_path = os.path.join(self.folder, k, item)
            # Source folders without a matching file are skipped.
            if os.path.exists(source_path):
                sources[k] = self._load_audio_file(source_path)
        output = {
            'mix': mix,
            'sources': sources,
            'metadata': {
                # Copy so that editing one item's labels cannot alter the
                # list shared by the whole dataset.
                'labels': list(self.source_folders)
            }
        }
        return output


class Scaper(BaseDataset):
    """
    Source separation datasets can be generated using Scaper, a library for
    automatic soundscape generation. Datasets that are generated with Scaper
    can be fed into this class easily. Scaper generates a large list of JAMS
    files which specify the parameters of the soundscape. If the soundscape is
    generated with `save_isolated_events=True`, then the audio corresponding
    to each event in the soundscape will be saved as well.

    Below is an example of using Scaper to generate a small dataset of 10
    mixtures with 2 sources each. The generated dataset can then be immediately
    loaded into an instance of ``nussl.datasets.Scaper`` for integration into
    a training or evaluation pipeline.

    The sources are output in a dictionary that looks like this:

    .. code-block:: none

        data['sources'] = {
            '{label}::{count}': AudioSignal,
            '{label}::{count}': AudioSignal,
            ...
        }

    For example:

    .. code-block:: none

        data['sources'] = {
            'siren::0': AudioSignal,
            'siren::1': AudioSignal,
            'car_horn::0': AudioSignal,
            ...
        }

    Getting an item from this dataset with no transforms returns the
    following dictionary:

    .. code-block:: none

        {
            'mix': [AudioSignal object containing mix audio],
            'sources': {
                '[label0::count]': [AudioSignal object containing label0 audio],
                '[label1::count]': [AudioSignal object containing label1 audio],
                '[label2::count]': [AudioSignal object containing label2 audio],
                '[label3::count]': [AudioSignal object containing label3 audio],
                ...
            },
            'metadata': {
                'scaper': [the content of the jams file used to generate the soundscape],
                'labels': ['label0', 'label1', 'label2', 'label3']
            }
        }


    Example of generating a Scaper dataset and then loading it with nussl:

    >>> n_sources = 2
    >>> n_mixtures = 10
    >>> duration = 3
    >>> ref_db = -40
    >>> fg_path = '/path/to/foreground/'
    >>> output_dir = '/output/path'
    >>> for i in range(n_mixtures):
    ...     sc = scaper.Scaper(
    ...         duration, fg_path, fg_path, random_state=i)
    ...     sc.ref_db = ref_db
    ...     sc.sr = 16000
    ...     for j in range(n_sources):
    ...         sc.add_event(
    ...             label=('choose', []),
    ...             source_file=('choose', []),
    ...             source_time=('const', 0),
    ...             event_time=('const', 0),
    ...             event_duration=('const', duration),
    ...             snr=('const', 0),
    ...             pitch_shift=None,
    ...             time_stretch=None
    ...         )
    ...     audio_path = os.path.join(output_dir, f'{i}.wav')
    ...     jams_path = os.path.join(output_dir, f'{i}.jams')
    ...     sc.generate(audio_path, jams_path, save_isolated_events=True)
    >>> dataset = nussl.datasets.Scaper(output_dir)
    >>> dataset[0] # contains mix, sources, and metadata corresponding to 0.jams.

    Raises:
        DataSetException: if Scaper dataset wasn't saved with isolated event audio.
    """
    def get_items(self, folder):
        """
        List the JAMS files in ``folder``.

        Args:
            folder (str): Folder containing the Scaper output.

        Returns:
            list: Sorted file names in ``folder`` ending in ``.jams``.
        """
        items = sorted([
            x for x in os.listdir(folder)
            if os.path.splitext(x)[1] in ['.jams']
        ])
        return items

    def _get_info_from_item(self, item):
        """
        Load the JAMS file ``item`` and pull out the audio paths it records.

        Args:
            item (str): JAMS file name, relative to ``self.folder``.

        Returns:
            tuple: ``(jam, ann, mix_path, source_paths)`` where ``ann`` is the
            first annotation in the 'scaper' namespace and the two paths are
            read from that annotation's ``sandbox.scaper`` entry.
        """
        jam = jams.load(os.path.join(self.folder, item))
        ann = jam.annotations.search(namespace='scaper')[0]
        mix_path = ann.sandbox.scaper['soundscape_audio_path']
        source_paths = ann.sandbox.scaper['isolated_events_audio_path']
        return jam, ann, mix_path, source_paths

    def process_item(self, item):
        """
        Load the mixture and the isolated event audio described by ``item``.

        Each source is stored under the key ``'{label}::{count}'``, where
        ``count`` is the number of earlier events in this soundscape that
        carry exactly the same label.

        Args:
            item (str): JAMS file name, relative to ``self.folder``.

        Returns:
            dict: Dictionary with the keys 'mix', 'sources' and 'metadata',
            as described in the class docstring.

        Raises:
            DataSetException: if the JAMS file lists no isolated event audio.
        """
        jam, ann, mix_path, source_paths = self._get_info_from_item(item)
        if not source_paths:
            raise DataSetException(
                "No paths to isolated events found! Did you generate "
                "the soundscape with save_isolated_events=True?")

        mix = self._load_audio_file(mix_path)
        sources = {}
        # Count exact labels. A substring test against the existing keys
        # would let 'car' be counted against 'car_horn::0'.
        label_counts = {}

        for event_spec, event_audio_path in zip(ann, source_paths):
            label = event_spec.value['label']
            label_count = label_counts.get(label, 0)
            label_counts[label] = label_count + 1
            sources[f"{label}::{label_count}"] = self._load_audio_file(
                event_audio_path)

        output = {
            'mix': mix,
            'sources': sources,
            'metadata': {
                'scaper': jam,
                'labels': ann.sandbox.scaper['fg_labels'],
            }
        }
        return output

class FUSS(Scaper):
    """
    Hook for the Free Universal Sound Separation (FUSS) Dataset, a database of
    arbitrary sound mixtures and source-level references, for use in
    experiments on arbitrary sound separation.

    This is the official sound separation data for the DCASE2020 Challenge Task 4:
    Sound Event Detection and Separation in Domestic Environments.

    The dataset is read like any other Scaper dataset, except that the mix and
    source paths stored in each JAMS file are rewritten so that they point
    inside the selected split folder.

    References:

    [1]  Scott Wisdom, Hakan Erdogan, Daniel P. W. Ellis, Romain Serizel,
    Nicolas Turpault, Eduardo Fonseca, Justin Salamon, Prem Seetharaman,
    John R. Hershey, "What's All the FUSS About Free Universal Sound Separation
    Data?", 2020, in preparation.

    [2] Eduardo Fonseca, Jordi Pons, Xavier Favory, Frederic Font Corbera,
    Dmitry Bogdanov, Andrés Ferraro, Sergio Oramas, Alastair Porter, and
    Xavier Serra. "Freesound Datasets: A Platform for the Creation of Open Audio
    Datasets." International Society for Music Information Retrieval Conference
    (ISMIR), pp. 486–493. Suzhou, China, 2017.

    Args:
        root (str): Folder where the FUSS data is. Either points to ssdata or
          ssdata_reverb.
        split (str): Either the ``train``, ``validation``, or ``eval`` split.
        kwargs: Additional keyword arguments to BaseDataset.

    Raises:
        DataSetException: if ``split`` is not one of the accepted splits.
    """
    def __init__(self, root, split='train', **kwargs):
        accepted_splits = ('train', 'validation', 'eval')
        if split not in accepted_splits:
            raise DataSetException(
                f"split '{split}' not one of the accepted splits: "
                f"'train', 'validation', 'eval'.")

        split_folder = os.path.join(root, split)
        super().__init__(
            split_folder, sample_rate=16000, strict_sample_rate=True, **kwargs)

    def _get_info_from_item(self, item):
        """
        Load the JAMS file ``item`` and re-root its audio paths.

        Everything in a recorded path up to and including the last occurrence
        of the item's base name is replaced by ``self.folder`` joined with
        that base name. The list of source paths held by the annotation is
        updated in place.

        Args:
            item (str): JAMS file name, relative to ``self.folder``.

        Returns:
            tuple: ``(jam, ann, mix_path, source_paths)`` with the rewritten
            paths.
        """
        stem = os.path.splitext(item)[0]

        def relocate(recorded_path):
            # Keep only what follows the last occurrence of the base name.
            tail = recorded_path.split(stem)[-1]
            return os.path.join(self.folder, stem + tail)

        jam = jams.load(os.path.join(self.folder, item))
        ann = jam.annotations.search(namespace='scaper')[0]
        scaper_info = ann.sandbox.scaper

        mix_path = relocate(scaper_info['soundscape_audio_path'])
        source_paths = scaper_info['isolated_events_audio_path']
        # Slice assignment keeps the same list object, so the rewritten paths
        # are also what the annotation itself now holds.
        source_paths[:] = [relocate(path) for path in source_paths]

        return jam, ann, mix_path, source_paths


class WHAM(MixSourceFolder):
    """
    Hook for the WHAM dataset. Essentially subclasses MixSourceFolder but with presets
    that are helpful for WHAM, which has the following directory structure:

    .. code-block:: none

        [wav8k, wav16k]/
          [min, max]/
            [tr, cv, tt]/
                mix_both/
                mix_clean/
                mix_single/
                noise/
                s1/
                s2/
        wham_noise/
          tr/
          cv/
          tt/
          metadata/

    Args:
        root (str): Folder containing the ``wav8k`` and/or ``wav16k`` folders.
        mix_folder (str, optional): Which mixtures to load; one of the keys of
            ``MIX_TO_SOURCE_MAP`` ('mix_clean', 'mix_both', 'mix_single'). It
            also decides which source folders are used. Defaults to 'mix_clean'.
        mode (str, optional): Either 'min' or 'max'. Defaults to 'min'.
        split (str, optional): One of 'tr' (train), 'cv' (validation) or
            'tt' (test). Defaults to 'tr'.
        sample_rate (int, optional): Either 8000 or 16000; selects the
            ``wav8k`` or ``wav16k`` folder and is passed on to MixSourceFolder.
            Defaults to 8000.
        **kwargs: Any additional arguments that are passed up to
            MixSourceFolder.

    Raises:
        DataSetException: if ``mix_folder``, ``sample_rate``, ``mode`` or
            ``split`` is not one of the accepted values.
    """
    MIX_TO_SOURCE_MAP = {
        'mix_clean': ['s1', 's2'],
        'mix_both': ['s1', 's2', 'noise'],
        'mix_single': ['s1'],
    }

    DATASET_HASHES = {
        "wav8k": "acd49e0dae066e16040c983d71cc5a8adb903abff6e5cbb92b3785a1997b7547",
        "wav16k": "5691d6a35382f2408a99594f21d820b58371b5ea061841db37d548c0b8d6ec7f"
    }

    def __init__(self, root, mix_folder='mix_clean', mode='min', split='tr',
                 sample_rate=8000, **kwargs):
        if mix_folder not in self.MIX_TO_SOURCE_MAP:
            raise DataSetException(
                f"{mix_folder} must be in {list(self.MIX_TO_SOURCE_MAP)}")
        if sample_rate not in [8000, 16000]:
            raise DataSetException(
                f"{sample_rate} not available for WHAM (only 8000 and 16000 Hz allowed)")
        if mode not in ['min', 'max']:
            raise DataSetException(
                f"{mode} not available, only 'min' or 'max' allowed.")
        if split not in ['tr', 'cv', 'tt']:
            raise DataSetException(
                f"{split} not available, must be one of 'tr' (train), "
                f"'cv' (validation), and 'tt' (test)")

        wav_folder = 'wav8k' if sample_rate == 8000 else 'wav16k'
        folder = os.path.join(root, wav_folder, mode, split)
        # Copy the list so an instance can never mutate the class-level map.
        source_folders = list(self.MIX_TO_SOURCE_MAP[mix_folder])

        super().__init__(folder, mix_folder=mix_folder, source_folders=source_folders,
                         sample_rate=sample_rate, strict_sample_rate=True, **kwargs)