"""
Functions that make it easy to build commonly used source separation architectures.
Currently contains mask inference, deep clustering, and chimera networks that are
based on recurrent neural networks. These functions are a good place to start when
creating your own network topologies. Since arguments to one layer can depend on
others (e.g., a bidirectional RNN doubles the hidden size seen by the next layer),
it's good to work out those dependencies in a builder function like the ones below.
"""
def build_recurrent_mask_inference(num_features, hidden_size, num_layers, bidirectional,
dropout, num_sources, mask_activation, num_audio_channels=1,
rnn_type='lstm', normalization_class='BatchNorm',
mix_key='mix_magnitude'):
"""
Builds a config for a mask inference network that can be passed to
SeparationModel. This mask inference network uses a recurrent neural network (RNN)
to process the input representation.
Args:
        num_features (int): Number of features in the input spectrogram (usually the
            STFT window length // 2 + 1).
        hidden_size (int): Hidden size of the RNN.
        num_layers (int): Number of layers in the RNN.
        bidirectional (bool): Whether the RNN is bidirectional.
dropout (float): Amount of dropout to be used between layers of RNN.
num_sources (int): Number of sources to create masks for.
mask_activation (list of str): Activation of the mask ('sigmoid', 'softmax', etc.).
See ``nussl.ml.networks.modules.Embedding``.
num_audio_channels (int): Number of audio channels in input (e.g. mono or stereo).
Defaults to 1.
rnn_type (str, optional): RNN type, either 'lstm' or 'gru'. Defaults to 'lstm'.
normalization_class (str, optional): Type of normalization to apply, either
'InstanceNorm' or 'BatchNorm'. Defaults to 'BatchNorm'.
mix_key (str, optional): The key to look for in the input dictionary that contains
the mixture spectrogram. Defaults to 'mix_magnitude'.
Returns:
dict: A recurrent mask inference network configuration that can be passed to
SeparationModel.
"""
# define the building blocks
modules = {
mix_key: {},
'log_spectrogram': {
'class': 'AmplitudeToDB'
},
'normalization': {
'class': normalization_class,
},
'recurrent_stack': {
'class': 'RecurrentStack',
'args': {
'num_features': num_features,
'hidden_size': hidden_size,
'num_layers': num_layers,
'bidirectional': bidirectional,
'dropout': dropout,
'rnn_type': rnn_type
}
},
'mask': {
'class': 'Embedding',
'args': {
'num_features': num_features,
'hidden_size': hidden_size * 2 if bidirectional else hidden_size,
'embedding_size': num_sources,
'activation': mask_activation,
'num_audio_channels': num_audio_channels
}
},
'estimates': {
'class': 'Mask',
},
}
# define the topology
connections = [
['log_spectrogram', [mix_key, ]],
['normalization', ['log_spectrogram', ]],
['recurrent_stack', ['normalization', ]],
['mask', ['recurrent_stack', ]],
['estimates', ['mask', mix_key]]
]
# define the outputs
output = ['estimates', 'mask']
# put it together
config = {
'modules': modules,
'connections': connections,
'output': output
}
return config
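
# Example usage (a minimal sketch; values are illustrative, e.g. a 512-sample
# STFT window gives num_features = 512 // 2 + 1 = 257; assumes nussl is imported):
#
#     config = build_recurrent_mask_inference(
#         num_features=257, hidden_size=300, num_layers=2, bidirectional=True,
#         dropout=0.3, num_sources=2, mask_activation=['sigmoid'])
#     model = nussl.ml.SeparationModel(config)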
def build_recurrent_dpcl(num_features, hidden_size, num_layers, bidirectional,
dropout, embedding_size, embedding_activation, num_audio_channels=1,
rnn_type='lstm',
normalization_class='BatchNorm', mix_key='mix_magnitude'):
"""
Builds a config for a deep clustering network that can be passed to
SeparationModel. This deep clustering network uses a recurrent neural network (RNN)
to process the input representation.
Args:
        num_features (int): Number of features in the input spectrogram (usually the
            STFT window length // 2 + 1).
        hidden_size (int): Hidden size of the RNN.
        num_layers (int): Number of layers in the RNN.
        bidirectional (bool): Whether the RNN is bidirectional.
dropout (float): Amount of dropout to be used between layers of RNN.
embedding_size (int): Embedding dimensionality of the deep clustering network.
embedding_activation (list of str): Activation of the embedding ('sigmoid', 'softmax', etc.).
See ``nussl.ml.networks.modules.Embedding``.
num_audio_channels (int): Number of audio channels in input (e.g. mono or stereo).
Defaults to 1.
rnn_type (str, optional): RNN type, either 'lstm' or 'gru'. Defaults to 'lstm'.
normalization_class (str, optional): Type of normalization to apply, either
'InstanceNorm' or 'BatchNorm'. Defaults to 'BatchNorm'.
mix_key (str, optional): The key to look for in the input dictionary that contains
the mixture spectrogram. Defaults to 'mix_magnitude'.
Returns:
dict: A recurrent deep clustering network configuration that can be passed to
SeparationModel.
"""
# define the building blocks
modules = {
mix_key: {},
'log_spectrogram': {
'class': 'AmplitudeToDB'
},
'normalization': {
'class': normalization_class,
},
'recurrent_stack': {
'class': 'RecurrentStack',
'args': {
'num_features': num_features,
'hidden_size': hidden_size,
'num_layers': num_layers,
'bidirectional': bidirectional,
'dropout': dropout,
'rnn_type': rnn_type
}
},
'embedding': {
'class': 'Embedding',
'args': {
'num_features': num_features,
'hidden_size': hidden_size * 2 if bidirectional else hidden_size,
'embedding_size': embedding_size,
'activation': embedding_activation,
'num_audio_channels': num_audio_channels
}
},
}
# define the topology
connections = [
        ['log_spectrogram', [mix_key, ]],
['normalization', ['log_spectrogram', ]],
['recurrent_stack', ['normalization', ]],
['embedding', ['recurrent_stack', ]],
]
# define the outputs
output = ['embedding']
# put it together
config = {
'modules': modules,
'connections': connections,
'output': output
}
return config
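
# Example usage (a minimal sketch; values are illustrative, and the resulting
# 'embedding' output is what a deep clustering loss would be applied to):
#
#     config = build_recurrent_dpcl(
#         num_features=257, hidden_size=300, num_layers=2, bidirectional=True,
#         dropout=0.3, embedding_size=20, embedding_activation=['sigmoid'])
#     model = nussl.ml.SeparationModel(config)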
def build_recurrent_chimera(num_features, hidden_size, num_layers, bidirectional,
dropout, embedding_size, embedding_activation, num_sources,
mask_activation,
num_audio_channels=1, rnn_type='lstm', normalization_class='BatchNorm',
mix_key='mix_magnitude'):
"""
Builds a config for a Chimera network that can be passed to SeparationModel.
Chimera networks are so-called because they have two "heads" which can be trained
via different loss functions. In traditional Chimera, one head is trained using a
deep clustering loss while the other is trained with a mask inference loss.
This Chimera network uses a recurrent neural network (RNN) to process the input
representation.
Args:
        num_features (int): Number of features in the input spectrogram (usually the
            STFT window length // 2 + 1).
        hidden_size (int): Hidden size of the RNN.
        num_layers (int): Number of layers in the RNN.
        bidirectional (bool): Whether the RNN is bidirectional.
dropout (float): Amount of dropout to be used between layers of RNN.
embedding_size (int): Embedding dimensionality of the deep clustering network.
embedding_activation (list of str): Activation of the embedding ('sigmoid', 'softmax', etc.).
See ``nussl.ml.networks.modules.Embedding``.
num_sources (int): Number of sources to create masks for.
mask_activation (list of str): Activation of the mask ('sigmoid', 'softmax', etc.).
See ``nussl.ml.networks.modules.Embedding``.
num_audio_channels (int): Number of audio channels in input (e.g. mono or stereo).
Defaults to 1.
rnn_type (str, optional): RNN type, either 'lstm' or 'gru'. Defaults to 'lstm'.
normalization_class (str, optional): Type of normalization to apply, either
'InstanceNorm' or 'BatchNorm'. Defaults to 'BatchNorm'.
mix_key (str, optional): The key to look for in the input dictionary that contains
the mixture spectrogram. Defaults to 'mix_magnitude'.
Returns:
dict: A recurrent Chimera network configuration that can be passed to
SeparationModel.
"""
# define the building blocks
modules = {
mix_key: {},
'log_spectrogram': {
'class': 'AmplitudeToDB'
},
'normalization': {
'class': normalization_class,
},
'recurrent_stack': {
'class': 'RecurrentStack',
'args': {
'num_features': num_features,
'hidden_size': hidden_size,
'num_layers': num_layers,
'bidirectional': bidirectional,
'dropout': dropout,
'rnn_type': rnn_type
}
},
'embedding': {
'class': 'Embedding',
'args': {
'num_features': num_features,
'hidden_size': hidden_size * 2 if bidirectional else hidden_size,
'embedding_size': embedding_size,
'activation': embedding_activation,
'num_audio_channels': num_audio_channels
}
},
'mask': {
'class': 'Embedding',
'args': {
'num_features': num_features,
'hidden_size': hidden_size * 2 if bidirectional else hidden_size,
'embedding_size': num_sources,
'activation': mask_activation,
'num_audio_channels': num_audio_channels
}
},
'estimates': {
'class': 'Mask',
},
}
# define the topology
connections = [
        ['log_spectrogram', [mix_key, ]],
['normalization', ['log_spectrogram', ]],
['recurrent_stack', ['normalization', ]],
['embedding', ['recurrent_stack', ]],
['mask', ['recurrent_stack', ]],
        ['estimates', ['mask', mix_key]]
]
# define the outputs
output = ['embedding', 'estimates', 'mask']
# put it together
config = {
'modules': modules,
'connections': connections,
'output': output
}
return config
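
# Example usage (a minimal sketch; values are illustrative). The config exposes
# both heads: 'embedding' for a deep clustering loss and 'estimates'/'mask' for
# a mask inference loss:
#
#     config = build_recurrent_chimera(
#         num_features=257, hidden_size=300, num_layers=2, bidirectional=True,
#         dropout=0.3, embedding_size=20, embedding_activation=['sigmoid'],
#         num_sources=2, mask_activation=['sigmoid'])
#     model = nussl.ml.SeparationModel(config)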
def build_open_unmix_like(num_features, hidden_size, num_layers,
bidirectional, dropout, num_sources,
num_audio_channels=1, add_embedding=False,
embedding_size=20, embedding_activation='sigmoid',
rnn_type='lstm', mix_key='mix_magnitude'):
"""
    Builds a config for an Open-Unmix-LIKE (UMX) architecture for music source
    separation. The architecture is not exactly the same as Open-Unmix, but is
    very similar for the most part. It also has the option of attaching an
    embedding space, making it a UMX + Chimera network that you can regularize
    with a deep clustering loss.
Args:
        num_features (int): Number of features in the input spectrogram (usually the
            STFT window length // 2 + 1).
        hidden_size (int): Hidden size of the RNN. Will be hidden_size // 2 if
            bidirectional is True.
        num_layers (int): Number of layers in the RNN.
        bidirectional (bool): Whether the RNN is bidirectional.
dropout (float): Amount of dropout to be used between layers of RNN.
num_sources (int): Number of sources to create masks for.
num_audio_channels (int): Number of audio channels in input (e.g. mono or stereo).
Defaults to 1.
        add_embedding (bool): Whether or not to add an embedding head, making this a
            Chimera network. If True, ``embedding_size`` and ``embedding_activation``
            will be used to define it. Defaults to False.
        embedding_size (int): Embedding dimensionality of the deep clustering head.
            Defaults to 20.
        embedding_activation (list of str): Activation of the embedding ('sigmoid',
            'softmax', etc.). See ``nussl.ml.networks.modules.Embedding``. Defaults
            to 'sigmoid'.
rnn_type (str, optional): RNN type, either 'lstm' or 'gru'. Defaults to 'lstm'.
mix_key (str, optional): The key to look for in the input dictionary that contains
the mixture spectrogram. Defaults to 'mix_magnitude'.
Returns:
dict: An OpenUnmix-like configuration that can be passed to
SeparationModel.
"""
# define the building blocks
modules = {
mix_key: {},
'projection': {
'class': 'Embedding',
'args': {
'num_features': hidden_size,
'hidden_size': num_features * num_audio_channels,
'embedding_size': 1,
'activation': 'none',
'num_audio_channels': 1,
'bias': False,
'reshape': False,
'dim_to_embed': [2, 3]
}
},
'recurrent_stack': {
'class': 'RecurrentStack',
'args': {
'num_features': hidden_size,
'hidden_size': (
hidden_size // 2
if bidirectional
else hidden_size
),
'num_layers': num_layers,
'bidirectional': bidirectional,
'dropout': dropout,
'rnn_type': rnn_type
}
},
'skip_connection': {
'class': 'Concatenate',
'args': {
'dim': -1
}
},
'input_scalar': {
'class': 'BatchNorm'
},
'output_scalar': {
'class': 'BatchNorm'
},
'bn1': {
'class': 'BatchNorm',
'args': {
'num_features': hidden_size
}
},
'bn2': {
'class': 'BatchNorm',
'args': {
'num_features': hidden_size
}
},
'bn3': {
'class': 'BatchNorm',
'args': {
'num_features': num_features
}
},
'tanh_before_lstm': {
'class': 'Tanh'
},
'mask': {
'class': 'ReLU'
},
'dense_after_lstm': {
'class': 'Linear',
'args': {
'in_features': 2 * hidden_size,
'out_features': hidden_size,
'bias': False
}
},
'inverse_projection': {
'class': 'Embedding',
'args': {
'num_features': num_features,
'hidden_size': hidden_size,
'embedding_size': num_sources,
'activation': 'none',
'num_audio_channels': num_audio_channels,
'bias': False,
'reshape': True,
'dim_to_embed': -1
}
},
'estimates': {
'class': 'Mask',
}
}
# define the topology
connections = [
['input_scalar', [mix_key]],
['projection', ['input_scalar']],
['bn1', ['projection']],
['tanh_before_lstm', ['bn1']],
['recurrent_stack', ['tanh_before_lstm']],
['skip_connection', ['recurrent_stack', 'tanh_before_lstm']],
['dense_after_lstm', ['skip_connection']],
['bn2', ['dense_after_lstm']],
['inverse_projection', ['bn2']],
['bn3', ['inverse_projection']],
['output_scalar', ['bn3']],
['mask', ['output_scalar']],
['estimates', ['mask', mix_key]]
]
# define the outputs
output = ['estimates', 'mask']
if add_embedding:
modules['embedding'] = {
'class': 'Embedding',
'args': {
'num_features': num_features,
'hidden_size': hidden_size,
'embedding_size': embedding_size,
'activation': embedding_activation,
'num_audio_channels': num_audio_channels,
'bias': True,
'reshape': True,
'dim_to_embed': -1
}
}
connections.append(['embedding', ['bn2']])
output.append('embedding')
# put it together
config = {
'modules': modules,
'connections': connections,
'output': output
}
return config
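
# Example usage (a minimal sketch; values are illustrative, with four sources
# matching the common vocals/drums/bass/other music separation setup):
#
#     config = build_open_unmix_like(
#         num_features=257, hidden_size=512, num_layers=3, bidirectional=True,
#         dropout=0.4, num_sources=4, add_embedding=True)
#     model = nussl.ml.SeparationModel(config)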
def build_recurrent_end_to_end(num_filters, filter_length, hop_length, window_type,
hidden_size, num_layers, bidirectional, dropout,
num_sources, mask_activation, num_audio_channels=1,
mask_complex=False, trainable=False, rnn_type='lstm',
mix_key='mix_audio', normalization_class='BatchNorm'):
"""
    Builds a config for a BLSTM-based network that operates directly on the
    time-series. It uses an STFT within the network and can either apply the
    mixture phase to the estimates or learn a mask on the phase as well as the
    magnitude.
Args:
num_filters (int): Number of learnable filters in the front end network.
filter_length (int): Length of the filters.
hop_length (int): Hop length between frames.
        window_type (str): Type of windowing function to apply to each frame.
        hidden_size (int): Hidden size of the RNN.
        num_layers (int): Number of layers in the RNN.
        bidirectional (bool): Whether the RNN is bidirectional.
dropout (float): Amount of dropout to be used between layers of RNN.
num_sources (int): Number of sources to create masks for.
mask_activation (list of str): Activation of the mask ('sigmoid', 'softmax', etc.).
See ``nussl.ml.networks.modules.Embedding``.
num_audio_channels (int): Number of audio channels in input (e.g. mono or stereo).
Defaults to 1.
        mask_complex (bool, optional): Whether to also place a mask on the complex part,
            or whether to just use the mixture phase. Defaults to False.
        trainable (bool, optional): Whether to learn the filters, which start from a
            Fourier basis. Defaults to False.
rnn_type (str, optional): RNN type, either 'lstm' or 'gru'. Defaults to 'lstm'.
normalization_class (str, optional): Type of normalization to apply, either
'InstanceNorm' or 'BatchNorm'. Defaults to 'BatchNorm'.
        mix_key (str, optional): The key to look for in the input dictionary that contains
            the mixture audio. Defaults to 'mix_audio'.
Returns:
dict: A recurrent end-to-end network configuration that can be passed to
SeparationModel.
"""
cutoff = num_filters // 2 + 1
num_features = 2 * cutoff if mask_complex else cutoff
# define the building blocks
modules = {
mix_key: {},
'audio': {
'class': 'STFT',
'args': {
'num_filters': num_filters,
'filter_length': filter_length,
'hop_length': hop_length,
'window_type': window_type,
'requires_grad': trainable
}
},
'log_spectrogram': {
'class': 'AmplitudeToDB'
},
'normalization': {
'class': normalization_class
},
'split': {
'class': 'Split',
'args': {
'split_sizes': (cutoff, cutoff),
'dim': 2
}
},
'concatenate': {
'class': 'Concatenate',
'args': {
'dim': 2
}
},
'expand': {
'class': 'Expand',
},
'recurrent_stack': {
'class': 'RecurrentStack',
'args': {
'num_features': num_features,
'hidden_size': hidden_size,
'num_layers': num_layers,
'bidirectional': bidirectional,
'dropout': dropout,
'rnn_type': rnn_type
}
},
'mask': {
'class': 'Embedding',
'args': {
'num_features': num_features,
'hidden_size': hidden_size * 2 if bidirectional else hidden_size,
'embedding_size': num_sources,
'activation': mask_activation,
'num_audio_channels': num_audio_channels
}
},
'estimates': {
'class': 'Mask',
},
}
# define the topology
if not mask_complex:
connections = [
['audio', [mix_key, {'direction': 'transform'}]],
['split', ['audio',]],
['log_spectrogram', ['split:0', ]],
['normalization', ['log_spectrogram']],
['recurrent_stack', ['normalization', ]],
['mask', ['recurrent_stack', ]],
['estimates', ['mask', 'split:0']],
['expand', ['mask', 'split:1']],
['concatenate', ['estimates', 'expand']],
['audio', ['concatenate', {'direction': 'inverse'}]]
]
else:
connections = [
['audio', [mix_key, {'direction': 'transform'}]],
['normalization', ['audio']],
['recurrent_stack', ['normalization', ]],
['mask', ['recurrent_stack', ]],
['estimates', ['mask', 'audio']],
['audio', ['estimates', {'direction': 'inverse'}]]
]
# define the outputs
output = ['audio', 'mask']
# put it together
config = {
'modules': modules,
'connections': connections,
'output': output
}
return config
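
# Example usage (a minimal sketch; values are illustrative: a 512-point STFT
# front end with 75% overlap, fed by the time-domain 'mix_audio' key):
#
#     config = build_recurrent_end_to_end(
#         num_filters=512, filter_length=512, hop_length=128,
#         window_type='sqrt_hann', hidden_size=300, num_layers=2,
#         bidirectional=True, dropout=0.3, num_sources=2,
#         mask_activation=['sigmoid'])
#     model = nussl.ml.SeparationModel(config)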
def build_dual_path_recurrent_end_to_end(
num_filters, filter_length, hop_length,
chunk_size, hop_size, hidden_size, num_layers,
bidirectional, bottleneck_size,
num_sources, mask_activation, num_audio_channels=1,
window_type='sqrt_hann', skip_connection=False,
rnn_type='lstm', mix_key='mix_audio'):
"""
Builds a config for a dual path recurrent network that operates on the
time-series. Uses a learned filterbank within the network.
Args:
        num_filters (int): Number of learnable filters in the front end network.
        filter_length (int): Length of the filters.
        hop_length (int): Hop length between frames.
        chunk_size (int): Size of each chunk that the dual path blocks process.
        hop_size (int): Hop size between chunks.
        hidden_size (int): Hidden size of the RNN.
        num_layers (int): Number of dual path blocks in the network.
        bidirectional (bool): Whether the RNN is bidirectional.
        bottleneck_size (int): Size of the bottleneck that the dual path blocks
            operate on.
        num_sources (int): Number of sources to create masks for.
        mask_activation (list of str): Activation of the mask ('sigmoid', 'softmax', etc.).
            See ``nussl.ml.networks.modules.Embedding``.
        num_audio_channels (int): Number of audio channels in input (e.g. mono or stereo).
            Defaults to 1.
        window_type (str, optional): Type of windowing function to apply to each frame.
            Defaults to 'sqrt_hann'.
        skip_connection (bool, optional): Whether to use a skip connection in the
            dual path blocks. Defaults to False.
        rnn_type (str, optional): RNN type, either 'lstm' or 'gru'. Defaults to 'lstm'.
        mix_key (str, optional): The key to look for in the input dictionary that contains
            the mixture audio. Defaults to 'mix_audio'.
Returns:
        dict: A dual-path recurrent (TasNet-style) network configuration that can be
            passed to SeparationModel.
"""
# define the building blocks
recurrent_hidden_size = hidden_size // 2 if bidirectional else hidden_size
recurrent_stack = {
'class': 'RecurrentStack',
'args': {
'num_features': bottleneck_size,
'hidden_size': recurrent_hidden_size,
'num_layers': 1,
'bidirectional': bidirectional,
'dropout': 0.0,
'rnn_type': rnn_type,
'batch_first': True
}
}
modules = {
mix_key: {},
'audio': {
'class': 'LearnedFilterBank',
'args': {
'num_filters': num_filters,
'filter_length': filter_length,
'hop_length': hop_length,
'window_type': window_type,
'requires_grad': True
}
},
'mixture_weights': {
'class': 'ReLU'
},
'dual_path': {
'class': 'DualPath',
'args': {
'num_layers': num_layers,
'chunk_size': chunk_size,
'hop_size': hop_size,
'skip_connection': skip_connection,
'in_features': num_filters,
'bottleneck_size': bottleneck_size,
# rest are args to DualPathBlock
'hidden_size': hidden_size,
'intra_processor': recurrent_stack,
'inter_processor': recurrent_stack,
}
},
'mask': {
'class': 'Embedding',
'args': {
'num_features': num_filters,
'hidden_size': num_filters,
'embedding_size': num_sources,
'activation': mask_activation,
'num_audio_channels': num_audio_channels,
'dim_to_embed': [2, 3],
}
},
'estimates': {
'class': 'Mask',
},
}
# define the topology
connections = [
['audio', [mix_key, {'direction': 'transform'}]],
['mixture_weights', ['audio']],
['dual_path', ['mixture_weights', ]],
['mask', ['dual_path', ]],
['estimates', ['mask', 'mixture_weights']],
['audio', ['estimates', {'direction': 'inverse'}]]
]
# define the outputs
output = ['audio', 'mask']
# put it together
config = {
'modules': modules,
'connections': connections,
'output': output
}
return config
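
# Example usage (a minimal sketch; values are illustrative: short learned
# filters and small hops are typical for TasNet-style time-domain models):
#
#     config = build_dual_path_recurrent_end_to_end(
#         num_filters=64, filter_length=16, hop_length=8, chunk_size=100,
#         hop_size=50, hidden_size=128, num_layers=6, bidirectional=True,
#         bottleneck_size=64, num_sources=2, mask_activation=['sigmoid'])
#     model = nussl.ml.SeparationModel(config)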