import torch
import torch.nn as nn


class basic_cnn(torch.nn.Module):
    """
    Basic CNN as in Johannes Zeitler's report, for HCQT input with 75 (-1) context frames
    and variable amount of pitch bins (num octaves * 12 * 3 bins per semitone)
    The number of input channels, channels in the hidden layers, and output
    dimensions (e.g. for pitch output) can be parameterized.
    Layer normalization is only performed over frequency and channel dimensions,
    not over time (in order to work with variable length input).

    Args (Defaults: BasicCNN by Johannes Zeitler but with 6 input channels):
        n_chan_input:   Number of input channels (harmonics in HCQT)
        n_chan_layers:  Number of channels in the hidden layers (list)
        n_bins_in:      Number of input bins (12 * number of octaves)
        n_bins_out:     Number of output bins (12 for pitch class, 72 for pitch)
        a_lrelu:        alpha parameter (slope) of LeakyReLU activation function
        p_dropout:      Dropout probability
    """
    def __init__(self, n_chan_input=6, n_chan_layers=[20,20,10,1], n_bins_in=216, n_bins_out=12, a_lrelu=0.3, p_dropout=0.2):
        super(basic_cnn, self).__init__()

        n_in = n_chan_input
        n_ch = n_chan_layers
        # Final conv kernel width: maps the n_bins_in//3 semitone bins down to n_bins_out output bins
        last_kernel_size = n_bins_in//3 + 1 - n_bins_out

        # Layer normalization over frequency and channels (harmonics of HCQT)
        self.layernorm = nn.LayerNorm(normalized_shape=[n_in, n_bins_in])
        # Prefiltering
        self.conv1 = nn.Sequential(
            nn.Conv2d(in_channels=n_in, out_channels=n_ch[0], kernel_size=(15,15), padding=(7,7), stride=(1,1)),
            nn.LeakyReLU(negative_slope=a_lrelu),
            nn.MaxPool2d(kernel_size=(2,1), stride=(2,1), padding=(0,0)),
            nn.Dropout(p=p_dropout)
        )
        # Binning to MIDI pitches
        self.conv2 = nn.Sequential(
            nn.Conv2d(in_channels=n_ch[0], out_channels=n_ch[1], kernel_size=(3,3), padding=(0,0), stride=(3,3)),
            nn.LeakyReLU(negative_slope=a_lrelu),
            nn.MaxPool2d(kernel_size=(2,1), stride=(2,1), padding=(0,0)),
            nn.Dropout(p=p_dropout)
        )
        # Time reduction
        self.conv3 = nn.Sequential(
            nn.Conv2d(in_channels=n_ch[1], out_channels=n_ch[2], kernel_size=(6,1), padding=(0,0), stride=(1,1)),
            nn.LeakyReLU(negative_slope=a_lrelu),
            nn.Dropout(p=p_dropout)
        )
        # Chroma reduction
        self.conv4 = nn.Sequential(
            nn.Conv2d(in_channels=n_ch[2], out_channels=n_ch[3], kernel_size=(1,1), padding=(0,0), stride=(1,1)),
            nn.LeakyReLU(negative_slope=a_lrelu),
            nn.Dropout(p=p_dropout),
            nn.Conv2d(in_channels=n_ch[3], out_channels=1, kernel_size=(1,last_kernel_size), padding=(0,0), stride=(1,1)),
            nn.Sigmoid()
        )

    def forward(self, x):
        # Transpose so that LayerNorm is applied over (channel, frequency) but not over time
        x_norm = self.layernorm(x.transpose(1, 2)).transpose(1, 2)
        conv1_lrelu = self.conv1(x_norm)
        conv2_lrelu = self.conv2(conv1_lrelu)
        conv3_lrelu = self.conv3(conv2_lrelu)
        y_pred = self.conv4(conv3_lrelu)
        return y_pred
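
# Shape sketch for basic_cnn under the defaults above (illustrative assumption:
# input laid out as (batch, harmonics, time, freq) with batch size B):
#   (B, 6, 75, 216) --conv1--> (B, 20, 37, 216) --conv2--> (B, 20, 6, 72)
#                   --conv3--> (B, 10, 1, 72)   --conv4--> (B, 1, 1, 12)
# i.e. one sigmoid pitch-class vector per 75-frame input window.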


class basic_cnn_pool(torch.nn.Module):
    """
    Basic CNN as in Johannes Zeitler's report, for HCQT input with 75 (-1) context frames
    and variable amount of pitch bins (num octaves * 12 * 3 bins per semitone)
    The number of input channels, channels in the hidden layers, and output
    dimensions (e.g. for pitch output) can be parameterized.
    This "pool" variant replaces strided convolutions by longer max pooling in
    order to achieve better shift invariance in time
    Layer normalization is only performed over frequency and channel dimensions,
    not over time (in order to work with variable length input).

    Args (Defaults: BasicCNN by Johannes Zeitler but with 6 input channels):
        n_chan_input:   Number of input channels (harmonics in HCQT)
        n_chan_layers:  Number of channels in the hidden layers (list)
        n_bins_in:      Number of input bins (12 * number of octaves)
        n_bins_out:     Number of output bins (12 for pitch class, 72 for pitch)
        a_lrelu:        alpha parameter (slope) of LeakyReLU activation function
        p_dropout:      Dropout probability
    """
    def __init__(self, n_chan_input=6, n_chan_layers=[20,20,10,1], n_bins_in=216, n_bins_out=12, a_lrelu=0.3, p_dropout=0.2):
        super(basic_cnn_pool, self).__init__()

        n_in = n_chan_input
        n_ch = n_chan_layers
        last_kernel_size = n_bins_in//3 + 1 - n_bins_out

        # Layer normalization over frequency and channels (harmonics of HCQT)
        self.layernorm = nn.LayerNorm(normalized_shape=[n_in, n_bins_in])
        # Prefiltering
        self.conv1 = nn.Sequential(
            nn.Conv2d(in_channels=n_in, out_channels=n_ch[0], kernel_size=(15,15), padding=(7,7), stride=(1,1)),
            nn.LeakyReLU(negative_slope=a_lrelu),
            nn.MaxPool2d(kernel_size=(8,1), stride=(8,1), padding=(0,0)),
            nn.Dropout(p=p_dropout)
        )
        # Binning to MIDI pitches
        self.conv2 = nn.Sequential(
            nn.Conv2d(in_channels=n_ch[0], out_channels=n_ch[1], kernel_size=(3,3), padding=(1,1), stride=(1,1)),
            nn.LeakyReLU(negative_slope=a_lrelu),
            nn.MaxPool2d(kernel_size=(3,3), stride=(3,3), padding=(0,0)),
            nn.Dropout(p=p_dropout)
        )
        # Time reduction
        self.conv3 = nn.Sequential(
            nn.Conv2d(in_channels=n_ch[1], out_channels=n_ch[2], kernel_size=(3,1), padding=(0,0), stride=(1,1)),
            nn.LeakyReLU(negative_slope=a_lrelu),
            nn.Dropout(p=p_dropout)
        )
        # Chroma reduction
        self.conv4 = nn.Sequential(
            nn.Conv2d(in_channels=n_ch[2], out_channels=n_ch[3], kernel_size=(1,1), padding=(0,0), stride=(1,1)),
            nn.LeakyReLU(negative_slope=a_lrelu),
            nn.Dropout(p=p_dropout),
            nn.Conv2d(in_channels=n_ch[3], out_channels=1, kernel_size=(1,last_kernel_size), padding=(0,0), stride=(1,1)),
            nn.Sigmoid()
        )

    def forward(self, x):
        x_norm = self.layernorm(x.transpose(1, 2)).transpose(1, 2)
        conv1_lrelu = self.conv1(x_norm)
        conv2_lrelu = self.conv2(conv1_lrelu)
        conv3_lrelu = self.conv3(conv2_lrelu)
        y_pred = self.conv4(conv3_lrelu)
        return y_pred
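
# Shape sketch for basic_cnn_pool under the defaults above (same illustrative
# (batch, harmonics, time, freq) input layout as for basic_cnn):
#   (B, 6, 75, 216) --conv1--> (B, 20, 9, 216) --conv2--> (B, 20, 3, 72)
#                   --conv3--> (B, 10, 1, 72)  --conv4--> (B, 1, 1, 12)
# Same output shape as basic_cnn; the time downsampling just happens in the
# (longer) pooling stages instead of in strided convolutions.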


class basic_cnn_segm_sigmoid(torch.nn.Module):
    """
    Basic CNN similar to the one in Johannes Zeitler's report,
    but for longer HCQT input (always stride 1 in time)
    Still with 75 (-1) context frames, i.e. 37 frames padded to each side
    The number of input channels, channels in the hidden layers, and output
    dimensions (e.g. for pitch output) can be parameterized.
    Layer normalization is only performed over frequency and channel dimensions,
    not over time (in order to work with variable length input).
    Outputs one channel with sigmoid activation.

    Args (Defaults: BasicCNN by Johannes Zeitler but with 6 input channels):
        n_chan_input:   Number of input channels (harmonics in HCQT)
        n_chan_layers:  Number of channels in the hidden layers (list)
        n_bins_in:      Number of input bins (12 * number of octaves)
        n_bins_out:     Number of output bins (12 for pitch class, 72 for pitch)
        a_lrelu:        alpha parameter (slope) of LeakyReLU activation function
        p_dropout:      Dropout probability
    """
    def __init__(self, n_chan_input=6, n_chan_layers=[20,20,10,1], n_bins_in=216, n_bins_out=12, a_lrelu=0.3, p_dropout=0.2):
        super(basic_cnn_segm_sigmoid, self).__init__()

        n_in = n_chan_input
        n_ch = n_chan_layers
        last_kernel_size = n_bins_in//3 + 1 - n_bins_out

        # Layer normalization over frequency and channels (harmonics of HCQT)
        self.layernorm = nn.LayerNorm(normalized_shape=[n_in, n_bins_in])
        # Prefiltering
        self.conv1 = nn.Sequential(
            nn.Conv2d(in_channels=n_in, out_channels=n_ch[0], kernel_size=(15,15), padding=(7,7), stride=(1,1)),
            nn.LeakyReLU(negative_slope=a_lrelu),
            nn.MaxPool2d(kernel_size=(3,1), stride=(1,1), padding=(1,0)),
            nn.Dropout(p=p_dropout)
        )
        # Binning to MIDI pitches
        self.conv2 = nn.Sequential(
            nn.Conv2d(in_channels=n_ch[0], out_channels=n_ch[1], kernel_size=(3,3), padding=(1,0), stride=(1,3)),
            nn.LeakyReLU(negative_slope=a_lrelu),
            nn.MaxPool2d(kernel_size=(13,1), stride=(1,1), padding=(6,0)),
            nn.Dropout(p=p_dropout)
        )
        # Time reduction: the (75,1) kernel collapses the 75-frame context window
        self.conv3 = nn.Sequential(
            nn.Conv2d(in_channels=n_ch[1], out_channels=n_ch[2], kernel_size=(75,1), padding=(0,0), stride=(1,1)),
            nn.LeakyReLU(negative_slope=a_lrelu),
            nn.Dropout(p=p_dropout)
        )
        # Chroma reduction
        self.conv4 = nn.Sequential(
            nn.Conv2d(in_channels=n_ch[2], out_channels=n_ch[3], kernel_size=(1,1), padding=(0,0), stride=(1,1)),
            nn.LeakyReLU(negative_slope=a_lrelu),
            nn.Dropout(p=p_dropout),
            nn.Conv2d(in_channels=n_ch[3], out_channels=1, kernel_size=(1,last_kernel_size), padding=(0,0), stride=(1,1)),
            nn.Sigmoid()
        )

    def forward(self, x):
        x_norm = self.layernorm(x.transpose(1, 2)).transpose(1, 2)
        conv1_lrelu = self.conv1(x_norm)
        conv2_lrelu = self.conv2(conv1_lrelu)
        conv3_lrelu = self.conv3(conv2_lrelu)
        y_pred = self.conv4(conv3_lrelu)
        return y_pred
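
# Shape sketch for basic_cnn_segm_sigmoid under the defaults above
# (illustrative assumption: a segment of T target frames, padded by 37 frames
# on each side):
#   (B, 6, T+74, 216) --conv1--> (B, 20, T+74, 216) --conv2--> (B, 20, T+74, 72)
#                     --conv3--> (B, 10, T, 72)     --conv4--> (B, 1, T, 12)
# i.e. one sigmoid pitch-class vector per frame; the (75,1) kernel of conv3
# consumes the 74 padding frames.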


class basic_cnn_segm_logsoftmax(torch.nn.Module):
    """
    Basic CNN similar to the one in Johannes Zeitler's report,
    but for longer HCQT input (always stride 1 in time)
    Still with 75 (-1) context frames, i.e. 37 frames padded to each side
    The number of input channels, channels in the hidden layers, and output
    dimensions (e.g. for pitch output) can be parameterized.
    Layer normalization is only performed over frequency and channel dimensions,
    not over time (in order to work with variable length input).
    Outputs an arbitrary number of dimensions (e.g. 2 for active and non-active
    pitch) with LogSoftmax activation (corresponding to log probabilities)

    Args (Defaults: BasicCNN by Johannes Zeitler but with 6 input channels):
        n_chan_input:   Number of input channels (harmonics in HCQT)
        n_chan_layers:  Number of channels in the hidden layers (list)
        n_ch_out:       Number of output channels (with softmax activation across channel dim.)
        n_bins_in:      Number of input bins (12 * number of octaves)
        n_bins_out:     Number of output bins (12 for pitch class, 72 for pitch)
        a_lrelu:        alpha parameter (slope) of LeakyReLU activation function
        p_dropout:      Dropout probability
    """
    def __init__(self, n_chan_input=6, n_chan_layers=[20,20,10,1], n_ch_out=2, n_bins_in=216, n_bins_out=12, a_lrelu=0.3, p_dropout=0.2):
        super(basic_cnn_segm_logsoftmax, self).__init__()

        n_in = n_chan_input
        n_ch = n_chan_layers
        n_out = n_ch_out
        last_kernel_size = n_bins_in//3 + 1 - n_bins_out

        # Layer normalization over frequency and channels (harmonics of HCQT)
        self.layernorm = nn.LayerNorm(normalized_shape=[n_in, n_bins_in])
        # Prefiltering
        self.conv1 = nn.Sequential(
            nn.Conv2d(in_channels=n_in, out_channels=n_ch[0], kernel_size=(15,15), padding=(7,7), stride=(1,1)),
            nn.LeakyReLU(negative_slope=a_lrelu),
            nn.MaxPool2d(kernel_size=(3,1), stride=(1,1), padding=(1,0)),
            nn.Dropout(p=p_dropout)
        )
        # Binning to MIDI pitches
        self.conv2 = nn.Sequential(
            nn.Conv2d(in_channels=n_ch[0], out_channels=n_ch[1], kernel_size=(3,3), padding=(1,0), stride=(1,3)),
            nn.LeakyReLU(negative_slope=a_lrelu),
            nn.MaxPool2d(kernel_size=(13,1), stride=(1,1), padding=(6,0)),
            nn.Dropout(p=p_dropout)
        )
        # Time reduction
        self.conv3 = nn.Sequential(
            nn.Conv2d(in_channels=n_ch[1], out_channels=n_ch[2], kernel_size=(75,1), padding=(0,0), stride=(1,1)),
            nn.LeakyReLU(negative_slope=a_lrelu),
            nn.Dropout(p=p_dropout)
        )
        # Chroma reduction
        self.conv4 = nn.Sequential(
            nn.Conv2d(in_channels=n_ch[2], out_channels=n_ch[3], kernel_size=(1,1), padding=(0,0), stride=(1,1)),
            nn.LeakyReLU(negative_slope=a_lrelu),
            nn.Dropout(p=p_dropout),
            nn.Conv2d(in_channels=n_ch[3], out_channels=n_out, kernel_size=(1,last_kernel_size), padding=(0,0), stride=(1,1)),
            nn.LogSoftmax(dim=1)
        )

    def forward(self, x):
        x_norm = self.layernorm(x.transpose(1, 2)).transpose(1, 2)
        conv1_lrelu = self.conv1(x_norm)
        conv2_lrelu = self.conv2(conv1_lrelu)
        conv3_lrelu = self.conv3(conv2_lrelu)
        y_pred = self.conv4(conv3_lrelu)

        return y_pred
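
# Shape sketch for basic_cnn_segm_logsoftmax with the default n_ch_out=2
# (illustrative assumption: T target frames, padded by 37 frames per side):
#   (B, 6, T+74, 216) --> (B, 2, T, 12)
# exp(y_pred) sums to 1 over dim 1, so the two channels can be read as log
# probabilities of the two classes (e.g. pitch active vs. inactive) for each
# frame and pitch class.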


class basic_cnn_segm_blank_logsoftmax(torch.nn.Module):
    """
    Basic CNN similar to the one in Johannes Zeitler's report,
    but for longer HCQT input (always stride 1 in time)
    Still with 75 (-1) context frames, i.e. 37 frames padded to each side
    The number of input channels, channels in the hidden layers, and output
    dimensions (e.g. for pitch output) can be parameterized.
    Layer normalization is only performed over frequency and channel dimensions,
    not over time (in order to work with variable length input).
    Outputs an arbitrary number of dimensions (e.g. 2 for active and non-active
    pitch) with LogSoftmax activation (corresponding to log probabilities).
    Adds an extra output dimension (in "pitch" direction, not a new channel), e.g.
    for predicting probability of an overall blank symbol (MCTC)

    Args (Defaults: BasicCNN by Johannes Zeitler but with 6 input channels):
        n_chan_input:   Number of input channels (harmonics in HCQT)
        n_chan_layers:  Number of channels in the hidden layers (list)
        n_ch_out:       Number of output channels (with softmax activation across channel dim.)
        n_bins_in:      Number of input bins (12 * number of octaves)
        n_bins_out:     Number of output bins (12 for pitch class, 72 for pitch)
        a_lrelu:        alpha parameter (slope) of LeakyReLU activation function
        p_dropout:      Dropout probability
    """
    def __init__(self, n_chan_input=6, n_chan_layers=[20,20,10,1], n_ch_out=2, n_bins_in=216, n_bins_out=12, a_lrelu=0.3, p_dropout=0.2):
        super(basic_cnn_segm_blank_logsoftmax, self).__init__()

        n_in = n_chan_input
        n_ch = n_chan_layers
        n_out = n_ch_out
        last_kernel_size = n_bins_in//3 + 1 - n_bins_out

        # Layer normalization over frequency and channels (harmonics of HCQT)
        self.layernorm = nn.LayerNorm(normalized_shape=[n_in, n_bins_in])
        # Prefiltering
        self.conv1 = nn.Sequential(
            nn.Conv2d(in_channels=n_in, out_channels=n_ch[0], kernel_size=(15,15), padding=(7,7), stride=(1,1)),
            nn.LeakyReLU(negative_slope=a_lrelu),
            nn.MaxPool2d(kernel_size=(3,1), stride=(1,1), padding=(1,0)),
            nn.Dropout(p=p_dropout)
        )
        # Binning to MIDI pitches
        self.conv2 = nn.Sequential(
            nn.Conv2d(in_channels=n_ch[0], out_channels=n_ch[1], kernel_size=(3,3), padding=(1,0), stride=(1,3)),
            nn.LeakyReLU(negative_slope=a_lrelu),
            nn.MaxPool2d(kernel_size=(13,1), stride=(1,1), padding=(6,0)),
            nn.Dropout(p=p_dropout)
        )
        # Time reduction
        self.conv3 = nn.Sequential(
            nn.Conv2d(in_channels=n_ch[1], out_channels=n_ch[2], kernel_size=(75,1), padding=(0,0), stride=(1,1)),
            nn.LeakyReLU(negative_slope=a_lrelu),
            nn.Dropout(p=p_dropout)
        )
        # Chroma reduction
        self.conv4 = nn.Sequential(
            nn.Conv2d(in_channels=n_ch[2], out_channels=n_ch[3], kernel_size=(1,1), padding=(0,0), stride=(1,1)),
            nn.LeakyReLU(negative_slope=a_lrelu),
            nn.Dropout(p=p_dropout)
        )
        self.conv5a = nn.Conv2d(in_channels=n_ch[3], out_channels=n_out, kernel_size=(1,last_kernel_size), padding=(0,0), stride=(1,1))
        # n_bins_in//3 spans all semitone bins, collapsing them into a single "blank" bin (72 for the default n_bins_in=216)
        self.conv5b = nn.Conv2d(in_channels=n_ch[3], out_channels=n_out, kernel_size=(1,n_bins_in//3), padding=(0,0), stride=(1,1))
        self.logsoftmax7 = nn.LogSoftmax(dim=1)

    def forward(self, x):
        x_norm = self.layernorm(x.transpose(1, 2)).transpose(1, 2)
        conv1_lrelu = self.conv1(x_norm)
        conv2_lrelu = self.conv2(conv1_lrelu)
        conv3_lrelu = self.conv3(conv2_lrelu)
        conv4_lrelu = self.conv4(conv3_lrelu)
        # Prepend the blank bin (conv5b) to the pitch bins (conv5a) along the bin axis
        stacked = torch.cat((self.conv5b(conv4_lrelu), self.conv5a(conv4_lrelu)), dim=3)
        y_pred = self.logsoftmax7(stacked)

        return y_pred
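
# Shape sketch for basic_cnn_segm_blank_logsoftmax with the default n_ch_out=2
# (illustrative assumption: T target frames, padded by 37 frames per side):
#   conv5b: (B, 2, T, 1)   all bins collapsed into one "blank" bin
#   conv5a: (B, 2, T, 12)  pitch-class bins
#   cat along dim 3 --> (B, 2, T, 13), with the blank bin at index 0,
#   followed by LogSoftmax over the channel dimension (dim 1).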


class deep_cnn_segm_sigmoid(torch.nn.Module):
    """
    Basic CNN similar to the one in Johannes Zeitler's report,
    but for longer HCQT input (always stride 1 in time)
    Still with 75 (-1) context frames, i.e. 37 frames padded to each side
    The number of input channels, channels in the hidden layers, and output
    dimensions (e.g. for pitch output) can be parameterized.
    Layer normalization is only performed over frequency and channel dimensions,
    not over time (in order to work with variable length input).
    Outputs one channel with sigmoid activation.

    Args (Defaults: BasicCNN by Johannes Zeitler but with 6 input channels):
        n_chan_input:     Number of input channels (harmonics in HCQT)
        n_chan_layers:    Number of channels in the hidden layers (list)
        n_prefilt_layers: Number of repetitions of the prefiltering layer
        residual:         If True, use residual connections for prefiltering (default: False)
        n_bins_in:        Number of input bins (12 * number of octaves)
        n_bins_out:       Number of output bins (12 for pitch class, 72 for pitch)
        a_lrelu:          alpha parameter (slope) of LeakyReLU activation function
        p_dropout:        Dropout probability
    """
    def __init__(self, n_chan_input=6, n_chan_layers=[20,20,10,1], n_prefilt_layers=1, residual=False, n_bins_in=216, n_bins_out=12, a_lrelu=0.3, p_dropout=0.2):
        super(deep_cnn_segm_sigmoid, self).__init__()

        n_in = n_chan_input
        n_ch = n_chan_layers
        last_kernel_size = n_bins_in//3 + 1 - n_bins_out

        # Layer normalization over frequency and channels (harmonics of HCQT)
        self.layernorm = nn.LayerNorm(normalized_shape=[n_in, n_bins_in])
        # Prefiltering
        self.conv1 = nn.Sequential(
            nn.Conv2d(in_channels=n_in, out_channels=n_ch[0], kernel_size=(15,15), padding=(7,7), stride=(1,1)),
            nn.LeakyReLU(negative_slope=a_lrelu),
            nn.MaxPool2d(kernel_size=(3,1), stride=(1,1), padding=(1,0)),
            nn.Dropout(p=p_dropout)
        )
        self.n_prefilt_layers = n_prefilt_layers
        self.prefilt_list = nn.ModuleList()
        for _ in range(n_prefilt_layers-1):
            self.prefilt_list.append(nn.Sequential(
                nn.Conv2d(in_channels=n_ch[0], out_channels=n_ch[0], kernel_size=(15,15), padding=(7,7), stride=(1,1)),
                nn.LeakyReLU(negative_slope=a_lrelu),
                nn.MaxPool2d(kernel_size=(3,1), stride=(1,1), padding=(1,0)),
                nn.Dropout(p=p_dropout)
            ))
        self.residual = residual
        # Binning to MIDI pitches
        self.conv2 = nn.Sequential(
            nn.Conv2d(in_channels=n_ch[0], out_channels=n_ch[1], kernel_size=(3,3), padding=(1,0), stride=(1,3)),
            nn.LeakyReLU(negative_slope=a_lrelu),
            nn.MaxPool2d(kernel_size=(13,1), stride=(1,1), padding=(6,0)),
            nn.Dropout(p=p_dropout)
        )
        # Time reduction
        self.conv3 = nn.Sequential(
            nn.Conv2d(in_channels=n_ch[1], out_channels=n_ch[2], kernel_size=(75,1), padding=(0,0), stride=(1,1)),
            nn.LeakyReLU(negative_slope=a_lrelu),
            nn.Dropout(p=p_dropout)
        )
        # Chroma reduction
        self.conv4 = nn.Sequential(
            nn.Conv2d(in_channels=n_ch[2], out_channels=n_ch[3], kernel_size=(1,1), padding=(0,0), stride=(1,1)),
            nn.LeakyReLU(negative_slope=a_lrelu),
            nn.Dropout(p=p_dropout),
            nn.Conv2d(in_channels=n_ch[3], out_channels=1, kernel_size=(1,last_kernel_size), padding=(0,0), stride=(1,1)),
            nn.Sigmoid()
        )

    def forward(self, x):
        x_norm = self.layernorm(x.transpose(1, 2)).transpose(1, 2)
        x = self.conv1(x_norm)
        for prefilt_layer in self.prefilt_list:
            if self.residual:
                x = prefilt_layer(x) + x
            else:
                x = prefilt_layer(x)
        conv2_lrelu = self.conv2(x)
        conv3_lrelu = self.conv3(conv2_lrelu)
        y_pred = self.conv4(conv3_lrelu)
        return y_pred
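

# Minimal smoke test for the models above. A sketch under the following
# assumptions (none of which are fixed by this module): batch size 2, a
# 20-frame target segment, input laid out as (batch, harmonics, time, freq),
# and 37 padding frames per side for the "segm" variants, as described in the
# docstrings.
if __name__ == "__main__":
    batch, n_frames = 2, 20
    x_win = torch.randn(batch, 6, 75, 216)             # fixed 75-frame window
    x_seg = torch.randn(batch, 6, n_frames + 74, 216)  # T frames plus context

    with torch.no_grad():
        assert basic_cnn()(x_win).shape == (batch, 1, 1, 12)
        assert basic_cnn_pool()(x_win).shape == (batch, 1, 1, 12)
        assert basic_cnn_segm_sigmoid()(x_seg).shape == (batch, 1, n_frames, 12)
        assert basic_cnn_segm_logsoftmax()(x_seg).shape == (batch, 2, n_frames, 12)
        assert basic_cnn_segm_blank_logsoftmax()(x_seg).shape == (batch, 2, n_frames, 13)
        assert deep_cnn_segm_sigmoid(n_prefilt_layers=3, residual=True)(x_seg).shape == (batch, 1, n_frames, 12)
    print("All shape checks passed.")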