multipitch-architectures / libfmp / c8 / c8s2_salience.py
c8s2_salience.py
Raw
"""
Module: libfmp.c8.c8s2_salience
Author: Sebastian Rosenzweig, Meinard Müller
License: The MIT license, https://opensource.org/licenses/MIT

This file is part of the FMP Notebooks (https://www.audiolabs-erlangen.de/FMP)
"""

import numpy as np
import librosa
from scipy import ndimage
from numba import jit

import libfmp.b


@jit(nopython=True)
def principal_argument(v):
    """Principal argument function

    Notebook: C8/C8S2_InstantFreqEstimation.ipynb, see also /C6/C6S1_NoveltyPhase.ipynb

    Args:
        v: value

    Returns:
        w: Principle value of v
    """
    w = np.mod(v + 0.5, 1) - 0.5
    return w


@jit(nopython=True)
def compute_if(X, Fs, N, H):
    """Instantenous frequency (IF) estamation

    Notebook: C8/C8S2_InstantFreqEstimation.ipynb, see also /C6/C6S1_NoveltyPhase.ipynb

    Args:
        X: STFT
        Fs: Sampling rate
        N: Window size in samples
        H: Hop size in samples

    Returns:
        F_coef_IF: Matrix of IF values
    """
    phi_1 = np.angle(X[:, 0:-1]) / (2 * np.pi)
    phi_2 = np.angle(X[:, 1:]) / (2 * np.pi)

    K = X.shape[0]
    index_k = np.arange(0, K).reshape(-1, 1)
    # Bin offset (FMP, Eq. (8.45))
    kappa = (N / H) * principal_argument(phi_2 - phi_1 - index_k * H / N)
    # Instantaneous frequencies (FMP, Eq. (8.44))
    F_coef_IF = (index_k + kappa) * Fs / N

    # Extend F_coef_IF by copying first column to match dimensions of X
    F_coef_IF = np.hstack((np.copy(F_coef_IF[:, 0]).reshape(-1, 1), F_coef_IF))

    return F_coef_IF


@jit(nopython=True)
def f_coef(k, Fs, N):
    """STFT center frequency

    Notebook: C8/C8S2_SalienceRepresentation.ipynb

    Args:
        k: Coefficient number
        Fs: Sampling rate in Hz
        N: Window length in samples

    Returns:
        STFT center frequency
    """
    return k * Fs / N


@jit(nopython=True)
def frequency_to_bin_index(F, R=10, F_ref=55.0):
    """Binning function with variable frequency resolution
    Note: Indexing starts with 0  (opposed to [FMP, Eq. (8.49)])

    Notebook: C8/C8S2_SalienceRepresentation.ipynb

    Args:
        F: Frequency in Hz
        R: Frequency resolution in cents
        F_ref: Reference frequency in Hz

    Returns:
        bin_index: index vor bin (starting with index 0)
    """
    bin_index = np.floor((1200 / R) * np.log2(F / F_ref) + 0.5).astype(np.int64)
    return bin_index


@jit(nopython=True)
def p_bin(b, freq, R=10, F_ref=55):
    """Computes binning mask [FMP, Eq. (8.50)]

    Notebook: C8/C8S2_SalienceRepresentation.ipynb

    Args:
        b: Bin index
        freq: Center frequency
        R: Frequency resolution in cents
        F_ref: Reference frequency in Hz

    Returns:
        mask: Binning mask
    """
    mask = frequency_to_bin_index(freq, R, F_ref) == b
    mask = mask.reshape(-1, 1)
    return mask


@jit(nopython=True)
def compute_y_lf_bin(Y, Fs, N, R=10, F_min=55, F_max=1760):
    """Log-frequency Spectrogram with variable frequency resolution using binning

    Notebook: C8/C8S2_SalienceRepresentation.ipynb

    Args:
        Y: Magnitude spectrogram
        Fs: Sampling rate in Hz
        N: Window length in samples
        R: Frequency resolution in cents
        F_min: Lower frequency bound (reference frequency)
        F_max: Upper frequency bound (is included)

    Returns:
        Y_LF_bin: Binned log-frequency spectrogram
        F_coef_hertz: Frequency axis in Hz
        F_coef_cents: Frequency axis in cents
    """
    # [FMP, Eq. (8.51)]
    B = frequency_to_bin_index(np.array([F_max]), R, F_min)[0] + 1
    F_coef_hertz = 2 ** (np.arange(0, B) * R / 1200) * F_min
    F_coef_cents = np.arange(0, B*R, R)
    Y_LF_bin = np.zeros((B, Y.shape[1]))

    K = Y.shape[0]
    freq = f_coef(np.arange(0, K), Fs, N)
    freq_lim_idx = np.where(np.logical_and(freq >= F_min, freq <= F_max))[0]
    freq_lim = freq[freq_lim_idx]
    Y_lim = Y[freq_lim_idx, :]

    for b in range(B):
        coef_mask = p_bin(b, freq_lim, R, F_min)
        Y_LF_bin[b, :] = (Y_lim*coef_mask).sum(axis=0)
    return Y_LF_bin, F_coef_hertz, F_coef_cents


@jit(nopython=True)
def p_bin_if(b, F_coef_IF, R=10, F_ref=55.0):
    """Computes binning mask for instantaneous frequency binning [FMP, Eq. (8.52)]

    Notebook: C8/C8S2_SalienceRepresentation.ipynb

    Args:
        b: Bin index
        F_coef_IF: Instantaneous frequencies
        R: Frequency resolution in cents
        F_ref: Reference frequency in Hz

    Returns:
        mask: Binning mask
    """
    mask = frequency_to_bin_index(F_coef_IF, R, F_ref) == b
    return mask


@jit(nopython=True)
def compute_y_lf_if_bin(X, Fs, N, H, R=10, F_min=55.0, F_max=1760.0, gamma=0):
    """Binned Log-frequency Spectrogram with variable frequency resolution based on instantaneous frequency

    Notebook: C8/C8S2_SalienceRepresentation.ipynb

    Args:
        X: Complex spectrogram
        Fs: Sampling rate in Hz
        N: Window length in samples
        H: Hopsize in samples
        R: Frequency resolution in cents
        F_min: Lower frequency bound (reference frequency)
        F_max: Upper frequency bound
        gamma: Logarithmic compression factor

    Returns:
        Y_LF_IF_bin: Binned log-frequency spectrogram using instantaneous frequency
        F_coef: Frequency axis in Hz
        F_coef_cents: Frequency axis in cents
    """
    # Compute instantaneous frequencies
    F_coef_IF = libfmp.c8.compute_if(X, Fs, N, H)
    freq_lim_mask = np.logical_and(F_coef_IF >= F_min, F_coef_IF < F_max)
    F_coef_IF = F_coef_IF * freq_lim_mask

    # Initialize ouput array and compute frequency axis
    B = frequency_to_bin_index(np.array([F_max]), R, F_min)[0] + 1
    F_coef_hertz = 2 ** (np.arange(0, B) * R / 1200) * F_min
    F_coef_cents = np.arange(0, B*R, R)
    Y_LF_IF_bin = np.zeros((B, X.shape[1]))

    # Magnitude binning
    if gamma == 0:
        Y = np.abs(X) ** 2
    else:
        Y = np.log(1 + np.float32(gamma)*np.abs(X))
    for b in range(B):
        coef_mask = p_bin_if(b, F_coef_IF, R, F_min)

        Y_LF_IF_bin[b, :] = (Y * coef_mask).sum(axis=0)
    return Y_LF_IF_bin, F_coef_hertz, F_coef_cents


@jit(nopython=True)
def harmonic_summation(Y, num_harm=10, alpha=1):
    """Harmonic summation for spectrogram [FMP, Eq. (8.54)]

    Notebook: C8/C8S2_SalienceRepresentation.ipynb

    Args:
        Y: Magnitude spectrogram
        num_harm: Number of harmonics
        alpha: Weighting parameter

    Returns:
        Y_HS: Spectrogram after harmonic summation
    """
    Y_HS = np.zeros(Y.shape)
    Y_zero_pad = np.vstack((Y, np.zeros((Y.shape[0]*num_harm, Y.shape[1]))))
    K = Y.shape[0]
    for k in range(K):
        harm_idx = np.arange(1, num_harm+1)*(k)
        weights = alpha ** (np.arange(1, num_harm+1) - 1).reshape(-1, 1)
        Y_HS[k, :] = (Y_zero_pad[harm_idx, :] * weights).sum(axis=0)
    return Y_HS


@jit(nopython=True)
def harmonic_summation_lf(Y_LF_bin, R, num_harm=10, alpha=1):
    """Harmonic summation for log-frequency spectrogram [FMP, Eq. (8.55)]

    Notebook: C8/C8S2_SalienceRepresentation.ipynb

    Args:
        Y_LF_bin: Log-frequency spectrogram
        R: Frequency resolution in cents
        num_harm: Number of harmonics
        alpha: Weighting parameter

    Returns:
        Y_LF_bin_HS: Log-frequency spectrogram after harmonic summation
    """
    Y_LF_bin_HS = np.zeros(Y_LF_bin.shape)
    pad_len = int(np.floor(np.log2(num_harm) * 1200 / R))
    Y_LF_bin_zero_pad = np.vstack((Y_LF_bin, np.zeros((pad_len, Y_LF_bin.shape[1]))))
    B = Y_LF_bin.shape[0]
    for b in range(B):
        harmonics = np.arange(1, num_harm+1)
        harm_idx = b + np.floor(np.log2(harmonics) * 1200 / R).astype(np.int64)
        weights = alpha ** (np.arange(1, num_harm+1) - 1).reshape(-1, 1)
        Y_LF_bin_HS[b, :] = (Y_LF_bin_zero_pad[harm_idx, :] * weights).sum(axis=0)
    return Y_LF_bin_HS


def compute_salience_rep(x, Fs, N, H, R, F_min=55, F_max=1760, num_harm=10, freq_smooth_len=11, alpha=1, gamma=0):
    """Salience representation [FMP, Eq. (8.56)]

    Notebook: C8/C8S2_SalienceRepresentation.ipynb

    Args:
        x: Audio signal
        Fs: Sampling frequency
        N: Window length in samples
        H: Hopsize in samples
        R: Frequency resolution in cents
        F_min: Lower frequency bound (reference frequency)
        F_max: Upper frequency bound
        num_harm: Number of harmonics
        freq_smooth_len: Filter length for vertical smoothing
        gamma: Logarithmic compression factor

    Returns:
        Z: Salience representation
        F_coef_hertz: Frequency axis in Hz
        F_coef_cents: Frequency axis in cents
    """
    X = librosa.stft(x, n_fft=N, hop_length=H, win_length=N, pad_mode='constant')
    Y_LF_IF_bin, F_coef_hertz, F_coef_cents = compute_y_lf_if_bin(X, Fs, N, H, R, F_min, F_max, gamma=gamma)
    # smoothing
    Y_LF_IF_bin = ndimage.filters.convolve1d(Y_LF_IF_bin, np.hanning(freq_smooth_len), axis=0, mode='constant')
    Z = harmonic_summation_lf(Y_LF_IF_bin, R=R, num_harm=num_harm, alpha=alpha)
    return Z, F_coef_hertz, F_coef_cents