Source code for vak.transforms.functional

import numpy as np
import torch

# Names exported by ``from vak.transforms.functional import *``.
# Every public transform defined in this module is listed, in alphabetical order.
__all__ = [
    "add_channel",
    "pad_to_window",
    "standardize_spect",
    "to_floattensor",
    "to_longtensor",
    "view_as_window_batch",
]



[docs]
def standardize_spect(spect, mean_freqs, std_freqs, non_zero_std):
    """Standardize a spectrogram, one frequency bin (row) at a time.

    Subtracts ``mean_freqs`` from every time bin of ``spect``,
    then divides the rows selected by ``non_zero_std``
    by the matching entries of ``std_freqs``.
    Rows that are not selected are only mean-centered,
    so a standard deviation of zero cannot produce NaN or inf values.

    Parameters
    ----------
    spect : numpy.ndarray
        Spectrogram with shape (frequencies, time bins).
    mean_freqs : numpy.ndarray
        Vector with the mean value of each frequency bin,
        computed across the set of spectrograms used for fitting.
    std_freqs : numpy.ndarray
        Vector with the standard deviation of each frequency bin,
        computed across the set of spectrograms used for fitting.
    non_zero_std : numpy.ndarray
        Boolean vector, True where ``std_freqs`` is non-zero.
        Selects the rows that get divided by their standard deviation.

    Returns
    -------
    transformed : numpy.ndarray
        New array with the same shape as ``spect``
        and (approximately) zero mean and unit standard deviation
        in each frequency bin. The exact mean and standard deviation
        still vary from one batch to the next.
    """
    # column vectors, so each value broadcasts across the time bins of its row
    mean_column = mean_freqs[:, np.newaxis]
    centered = spect - mean_column

    # only scale rows whose standard deviation is non-zero
    rows_to_scale = centered[non_zero_std, :]
    std_column = std_freqs[non_zero_std, np.newaxis]
    centered[non_zero_std, :] = rows_to_scale / std_column
    return centered




[docs]
def pad_to_window(arr, window_size, padval=0.0, return_padding_mask=True):
    """Pad a 1d or 2d array so that it can be reshaped
    into consecutive windows of specified size.

    The last dimension of ``arr`` is extended to the next multiple
    of ``window_size``; nothing is added when it already is a multiple.

    Parameters
    ----------
    arr : numpy.ndarray
        with 1 or 2 dimensions, e.g. a vector of labeled timebins
        or a spectrogram.
    window_size : int
        width of window in number of elements. Must be at least 1.
    padval : float
        value to pad with. Added to end of array, the
        "right side" if 2-dimensional.
    return_padding_mask : bool
        if True, return a boolean vector to use for cropping
        back down to size before padding. padding_mask has size
        equal to width of padded array, i.e. original size
        plus padding at the end, and has values of True where
        columns in padded are from the original array,
        and values of False where columns were added for padding.

    Returns
    -------
    padded : numpy.ndarray
        Padded with ``padval``. This is always a newly allocated array.
        It is built as ``np.ones(...) * padval``, so its dtype follows
        ``padval`` (float64 for the default) rather than ``arr``.
    padding_mask : numpy.ndarray
        Boolean vector with size equal to width of padded,
        i.e. original size plus padding at the end.
        Has values of ``True`` where columns in ``padded``
        are from the original array, and values of ``False``
        where columns were added for padding.
        Only returned if ``return_padding_mask`` is ``True``.

    Raises
    ------
    ValueError
        If ``arr`` does not have 1 or 2 dimensions,
        or if ``window_size`` is less than 1.
    """
    if arr.ndim not in (1, 2):
        raise ValueError(
            f"input array must be 1d or 2d but number of dimensions was: {arr.ndim}"
        )
    # fail early with a clear message, instead of a ZeroDivisionError
    # or a broadcasting error further down
    if window_size < 1:
        raise ValueError(
            f"`window_size` must be a positive integer, but was: {window_size}"
        )

    # the padded axis is the last one, for both 1d and 2d input
    width = arr.shape[-1]
    # round width up to the next multiple of window_size
    target_width = int(np.ceil(width / window_size) * window_size)

    padded = np.ones(arr.shape[:-1] + (target_width,)) * padval
    padded[..., :width] = arr

    if not return_padding_mask:
        return padded

    padding_mask = np.zeros((target_width,), dtype=bool)
    padding_mask[:width] = True
    return padded, padding_mask




[docs]
def view_as_window_batch(arr, window_width):
    """Return a view of a 1d or 2d array as a batch of non-overlapping windows.

    Windows are taken along the last dimension of ``arr``.
    No data is copied: the result is built with
    ``numpy.lib.stride_tricks.as_strided``, so it shares memory with ``arr``.

    Parameters
    ----------
    arr : numpy.ndarray
        with 1 or 2 dimensions, e.g. a vector of labeled timebins
        or a 2-d array representing a spectrogram.
        If the array has 2 dimensions, the returned array will
        have dimensions (batch, height of array, window width)
    window_width : int
        width of window in number of elements.
        Python integers and NumPy integer scalars are accepted.

    Returns
    -------
    batch_windows : numpy.ndarray
        with shape (batch size, window_width) if array is 1d,
        or with shape (batch size, height, window_width) if array is 2d.
        Batch size will be arr.shape[-1] // window_width.
        Window width must divide arr.shape[-1] evenly.
        To pad the array so it can be divided into windows of the specified
        width, use the `pad_to_window` transform

    Raises
    ------
    ValueError
        If ``window_width`` is not a positive integer,
        if ``arr`` does not have 1 or 2 dimensions,
        or if ``window_width`` does not divide ``arr.shape[-1]`` evenly.

    Notes
    -----
    adapted from skimage.util.view_as_blocks
    https://github.com/scikit-image/scikit-image/blob/f1b7cf60fb80822849129cb76269b75b8ef18db1/skimage/util/shape.py#L9
    """
    if not isinstance(window_width, (int, np.integer)) or window_width < 1:
        raise ValueError(
            f"`window_width` must be a positive integer, but was: {window_width}"
        )

    if arr.ndim not in (1, 2):
        raise ValueError(
            f"input array must be 1d or 2d but number of dimensions was: {arr.ndim}"
        )

    window_width = int(window_width)
    width = arr.shape[-1]
    if width % window_width != 0:
        raise ValueError(
            "'window_width' does not divide evenly into with 'arr' shape. "
            "Use 'pad_to_window' transform to pad array so it can be windowed."
        )

    n_windows = width // window_width
    element_stride = arr.strides[-1]
    # moving to the next window skips ``window_width`` elements of the last axis
    window_stride = element_stride * window_width

    # The batch axis is placed first directly, instead of building a block view
    # and squeezing axis 0: that squeeze is only valid for 2d input, where the
    # leading axis has size 1. For 1d input axis 0 is the batch axis itself.
    if arr.ndim == 1:
        batch_shape = (n_windows, window_width)
        batch_strides = (window_stride, element_stride)
    else:
        height = arr.shape[0]
        row_stride = arr.strides[0]
        batch_shape = (n_windows, height, window_width)
        batch_strides = (window_stride, row_stride, element_stride)

    return np.lib.stride_tricks.as_strided(
        arr, shape=batch_shape, strides=batch_strides
    )




[docs]
def to_floattensor(arr):
    """Convert a NumPy array to a 32-bit floating point torch tensor.

    Parameters
    ----------
    arr : numpy.ndarray
        Array to convert.

    Returns
    -------
    float_tensor : torch.Tensor
        Tensor holding the values of ``arr``, with dtype ``torch.float32``.
    """
    as_tensor = torch.from_numpy(arr)
    return as_tensor.to(torch.float32)




[docs]
def to_longtensor(arr):
    """Convert a NumPy array to a torch.LongTensor.

    Parameters
    ----------
    arr : numpy.ndarray
        Array to convert.

    Returns
    -------
    long_tensor : torch.Tensor
        Tensor holding the values of ``arr``,
        with dtype ``torch.int64`` (64-bit signed integer).
    """
    return torch.from_numpy(arr).long()




[docs]
def add_channel(input, channel_dim=0):
    """Insert a "channel" dimension of size 1 into a tensor.

    This makes it easy to treat a spectrogram as an image
    with a single channel, analogous to grayscale,
    so that the tensor can be fed to e.g. convolutional layers.

    Parameters
    ----------
    input : torch.Tensor
        Tensor that the channel dimension is added to.
    channel_dim : int
        Index at which the "channel" dimension is inserted. Default is 0.

    Returns
    -------
    with_channel : torch.Tensor
        ``input`` with one extra dimension of size 1 at ``channel_dim``.
    """
    with_channel = torch.unsqueeze(input, channel_dim)
    return with_channel