Source code for vak.datasets.frame_classification.window_dataset

"""A dataset class used for neural network models with the
frame classification task, where the source data consists of audio signals
or spectrograms of varying lengths.

Unlike :class:`vak.datasets.frame_classification.FramesDataset`,
this class does not return entire samples
from the source dataset.
Instead, each paired sample :math:`(x_i, y_i)`
returned by this dataset class consists of
a window :math:`x_i` of fixed length
:math:`w` from the underlying data ``X`` of total length :math:`T`.
Each :math:`y_i` is a vector of the same size :math:`w`, containing
an integer class label for each *frame* in the window :math:`x_i`.
The entire dataset consists of some number of windows
:math:`I` determined by a ``stride`` parameter :math:`s`,
:math:`I = \\lfloor (T - w) / s \\rfloor + 1`.
"""

from __future__ import annotations

import pathlib
from typing import Callable

import numpy as np
import numpy.typing as npt
import pandas as pd

from . import constants, helper
from .metadata import Metadata



[docs]
def get_window_inds(n_frames: int, window_size: int, stride: int = 1):
    """Get indices of windows for a :class:`WindowDataset`,
    given the number of frames in the dataset,
    the window size, and the stride.

    This function is used by :class:`WindowDataset`
    to compute the indices of windows in the dataset.
    The length of the vector of indices it returns
    is the number of windows in the dataset,
    i.e., the number of samples.

    Each index is the position of the first frame of a window.
    Only windows that fit entirely inside the ``n_frames`` frames
    are included, so the last possible start index is
    ``n_frames - window_size``. When ``n_frames >= window_size``
    the result has ``(n_frames - window_size) // stride + 1`` elements;
    when ``n_frames < window_size`` it is empty.

    Parameters
    ----------
    n_frames : int
        Total number of frames in the dataset.
    window_size : int
        Number of frames in each window. Must be at least 1.
    stride : int
        Distance, in frames, between the starts of
        consecutive windows. Must be at least 1. Default is 1.

    Returns
    -------
    window_inds : numpy.ndarray
        Vector of indices for windows.
        During training, batches of windows are made
        by grabbing indices randomly from this vector,
        then getting windows of the specified size
        from the arrays representing the input data
        and targets for the neural network.

    Raises
    ------
    ValueError
        If ``window_size`` or ``stride`` is less than 1.
    """
    if window_size < 1:
        raise ValueError(
            f"``window_size`` must be a positive integer, but was: {window_size}"
        )
    # a stride of zero makes ``numpy.arange`` fail with an unrelated error,
    # and a negative stride would silently produce an empty dataset
    if stride < 1:
        raise ValueError(
            f"``stride`` must be a positive integer, but was: {stride}"
        )
    # ``stop`` is exclusive, hence the ``+ 1`` so that the window
    # starting at ``n_frames - window_size`` is included
    return np.arange(0, n_frames - window_size + 1, stride)




[docs]
class WindowDataset:
    """Dataset used for training neural network models
    on the frame classification task,
    where the source data consists of audio signals
    or spectrograms of varying lengths.

    Unlike
    :class:`vak.datasets.frame_classification.FramesDataset`,
    this class does not return entire samples
    from the source dataset.
    Instead, each paired sample :math:`(x_i, y_i)`
    returned by this dataset class consists of
    a window :math:`x_i` of fixed length
    :math:`w` from the underlying data ``X`` of total length :math:`T`.
    Each :math:`y_i` is a vector of the same size :math:`w`, containing
    an integer class label for each *frame* in the window :math:`x_i`.
    The entire dataset consists of some number of windows
    :math:`I`. By default this is determined by
    a ``stride`` parameter :math:`s`,
    :math:`I = \\lfloor (T - w) / s \\rfloor + 1`;
    if ``window_inds`` is given explicitly,
    :math:`I` is the length of that vector.

    The underlying data consists of single arrays
    for both the input to the network ``X``
    and the targets for the network output ``Y``.
    These single arrays ``X`` and ``Y`` are
    created by concatenating samples from the source
    data, e.g., audio files or spectrogram arrays.
    (This is true for
    :class:`vak.datasets.frame_classification.FramesDataset`
    as well.)
    The dimensions of :math:`X`  will be (channels, ..., frames),
    i.e., audio will have dimensions (channels, samples)
    and spectrograms will have dimensions
    (channels, frequency bins, time bins).
    The signal :math:`X` may be either audio or spectrogram,
    meaning that a frame will be either a single sample
    in an audio signal or a single time bin in a spectrogram.
    The last dimension of ``X`` will always be the
    number of total frames in the dataset,
    either audio samples or spectrogram time bins,
    and ``Y`` will be the same size, containing
    an integer class label for each frame.

    Attributes
    ----------
    dataset_path : pathlib.Path
        Path to directory that represents a
        frame classification dataset,
        as created by
        :func:`vak.prep.prep_frame_classification_dataset`.
    split : str
        The name of a split from the dataset,
        one of {'train', 'val', 'test'}.
    subset : str, optional
        Name of subset to use.
        If specified, this takes precedence over split.
        Subsets are typically taken from the training data
        for use when generating a learning curve.
    dataset_df : pandas.DataFrame
        A frame classification dataset,
        represented as a :class:`pandas.DataFrame`.
        This will be only the rows that correspond
        to either ``subset`` or ``split`` from the
        ``dataset_df`` that was passed in when
        instantiating the class.
    input_type : str
        The type of input to the neural network model.
        One of {'audio', 'spect'}.
    frames_paths : numpy.ndarray
        Paths to files containing frames,
        either spectrograms or audio signals
        that are input to the model.
        Paths are relative to ``dataset_path``.
    frame_labels_paths : numpy.ndarray
        Paths to npy files containing vectors
        with a label for each frame.
        The targets for the outputs of the model.
        Paths are relative to ``dataset_path``.
    sample_ids : numpy.ndarray
        Indexing vector representing which sample
        from the dataset every frame belongs to.
    inds_in_sample : numpy.ndarray
        Indexing vector representing which index
        within each sample from the dataset
        that every frame belongs to.
    window_size : int
        Size of windows to return;
        number of frames.
    frame_dur: float
        Duration of a frame, i.e., a single sample in audio
        or a single timebin in a spectrogram.
    stride : int
        The size of the stride used to determine which windows
        are included in the dataset. The default is 1.
        Used to compute ``window_inds``,
        with the function
        :func:`vak.datasets.frame_classification.window_dataset.get_window_inds`.
    window_inds : numpy.ndarray
        A vector of valid window indices for the dataset.
        If passed in when instantiating the class,
        this takes precedence over ``stride``.
    item_transform : callable
        The transform applied to each item :math:`(x, y)`
        that is returned by :meth:`WindowDataset.__getitem__`.
    """

    def __init__(
        self,
        dataset_path: str | pathlib.Path,
        dataset_df: pd.DataFrame,
        input_type: str,
        split: str,
        sample_ids: npt.NDArray,
        inds_in_sample: npt.NDArray,
        window_size: int,
        frame_dur: float,
        item_transform: Callable,
        stride: int = 1,
        subset: str | None = None,
        window_inds: npt.NDArray | None = None,
    ):
        """Initialize a new instance of a WindowDataset.

        Parameters
        ----------
        dataset_path : pathlib.Path
            Path to directory that represents a
            frame classification dataset,
            as created by
            :func:`vak.prep.prep_frame_classification_dataset`.
        dataset_df : pandas.DataFrame
            A frame classification dataset,
            represented as a :class:`pandas.DataFrame`.
        input_type : str
            The type of input to the neural network model.
            One of {'audio', 'spect'}.
        split : str
            The name of a split from the dataset,
            one of {'train', 'val', 'test'}.
        sample_ids : numpy.ndarray
            Indexing vector representing which sample
            from the dataset every frame belongs to.
        inds_in_sample : numpy.ndarray
            Indexing vector representing which index
            within each sample from the dataset
            that every frame belongs to.
        window_size : int
            Size of windows to return;
            number of frames.
        frame_dur: float
            Duration of a frame, i.e., a single sample in audio
            or a single timebin in a spectrogram.
        item_transform : callable
            The transform applied to each item :math:`(x, y)`
            that is returned by :meth:`WindowDataset.__getitem__`.
            It is called as ``item_transform(frames, frame_labels)``.
        stride : int
            The size of the stride used to determine which windows
            are included in the dataset. The default is 1.
            Used to compute ``window_inds``,
            with the function
            :func:`vak.datasets.frame_classification.window_dataset.get_window_inds`.
        subset : str, optional
            Name of subset to use.
            If specified, this takes precedence over split.
            Subsets are typically taken from the training data
            for use when generating a learning curve.
        window_inds : numpy.ndarray, optional
            A vector of valid window indices for the dataset.
            If specified, this takes precedence over ``stride``.

        Raises
        ------
        ValueError
            If ``input_type`` is not one of the valid input types.
        """
        from ... import (
            prep,
        )  # avoid circular import, use for constants.INPUT_TYPES

        if input_type not in prep.constants.INPUT_TYPES:
            raise ValueError(
                f"``input_type`` must be one of: {prep.constants.INPUT_TYPES}\n"
                f"Value for ``input_type`` was: {input_type}"
            )

        self.dataset_path = pathlib.Path(dataset_path)
        self.split = split
        self.subset = subset
        # subset takes precedence over split, if specified
        if subset:
            dataset_df = dataset_df[dataset_df.subset == subset].copy()
        else:
            dataset_df = dataset_df[dataset_df.split == split].copy()
        self.dataset_df = dataset_df
        self.input_type = input_type
        self.frames_paths = self.dataset_df[
            constants.FRAMES_PATH_COL_NAME
        ].values
        self.frame_labels_paths = self.dataset_df[
            constants.FRAME_LABELS_NPY_PATH_COL_NAME
        ].values
        self.sample_ids = sample_ids
        self.inds_in_sample = inds_in_sample
        self.window_size = window_size
        self.frame_dur = float(frame_dur)
        self.stride = stride
        if window_inds is None:
            # ``sample_ids`` has one element per frame,
            # so its length is the total number of frames
            window_inds = get_window_inds(
                sample_ids.shape[-1], window_size, stride
            )
        self.window_inds = window_inds
        self.item_transform = item_transform

    @property
    def duration(self) -> float:
        """Total duration of the dataset:
        the number of frames multiplied by ``frame_dur``."""
        return self.sample_ids.shape[-1] * self.frame_dur

    @property
    def shape(self):
        """Shape of the frames in a single item from the dataset.

        Determined by getting the first item and inspecting it,
        so this assumes ``item_transform`` returns a mapping
        with the key ``"frames"``.
        """
        # used by vak functions that need to determine size of window,
        # e.g. when initializing a neural network model
        first_item = self[0]
        return first_item["frames"].shape

    def _load_frames(self, frames_path):
        """Helper function that loads "frames",
        the input to the frame classification model.
        Loads audio or spectrogram, depending on
        :attr:`self.input_type`,
        by delegating to ``helper.load_frames``.
        """
        return helper.load_frames(frames_path, self.input_type)

    def _load_sample(self, sample_id):
        """Load the frames and the frame labels for a single sample.

        Parameters
        ----------
        sample_id : int
            Index into ``frames_paths`` and ``frame_labels_paths``.

        Returns
        -------
        frames : numpy.ndarray
        frame_labels : numpy.ndarray
        """
        frames = self._load_frames(
            self.dataset_path / self.frames_paths[sample_id]
        )
        frame_labels = np.load(
            self.dataset_path / self.frame_labels_paths[sample_id]
        )
        return frames, frame_labels

    def __getitem__(self, idx):
        """Get a single window from the dataset.

        Parameters
        ----------
        idx : int
            Index into ``window_inds``.

        Returns
        -------
        item
            Whatever ``item_transform`` returns when called
            with the window of frames and the window of frame labels.

        Raises
        ------
        ValueError
            If no sample ids are found for the window,
            e.g., because the window index is past the last frame.
        """
        window_idx = self.window_inds[idx]
        window_sample_ids = self.sample_ids[
            window_idx : window_idx + self.window_size  # noqa: E203
        ]
        # ``numpy.unique`` returns the unique ids already sorted,
        # so samples are loaded in the order they appear in the dataset
        uniq_sample_ids = np.unique(window_sample_ids)
        if len(uniq_sample_ids) == 0:
            raise ValueError(
                f"Unexpected number of ``uniq_sample_ids``: {uniq_sample_ids}"
            )

        loaded = [
            self._load_sample(sample_id) for sample_id in uniq_sample_ids
        ]
        if len(loaded) == 1:
            # avoid the copy that concatenating a single array would make
            frames, frame_labels = loaded[0]
        else:
            # The window spans more than one sample.
            # Frames are always the last dimension, so we concatenate
            # along the last axis; this handles 1-d audio vectors
            # as well as arrays with channel and/or frequency dimensions.
            frames = np.concatenate(
                [sample_frames for sample_frames, _ in loaded], axis=-1
            )
            frame_labels = np.concatenate(
                [sample_labels for _, sample_labels in loaded]
            )

        # index of the window's first frame within the first loaded sample
        start = self.inds_in_sample[window_idx]
        stop = start + self.window_size
        frames = frames[..., start:stop]
        frame_labels = frame_labels[start:stop]
        return self.item_transform(frames, frame_labels)

    def __len__(self) -> int:
        """Number of windows in the dataset, i.e., the number of samples."""
        return len(self.window_inds)

    @classmethod
    def from_dataset_path(
        cls,
        dataset_path: str | pathlib.Path,
        window_size: int,
        item_transform: Callable,
        stride: int = 1,
        split: str = "train",
        subset: str | None = None,
    ):
        """Make a :class:`WindowDataset` instance,
        given the path to a frame classification dataset.

        Parameters
        ----------
        dataset_path : pathlib.Path
            Path to directory that represents a
            frame classification dataset,
            as created by
            :func:`vak.prep.prep_frame_classification_dataset`.
        window_size : int
            Size of windows to return;
            number of frames.
        item_transform : callable
            The transform applied to each item :math:`(x, y)`
            that is returned by :meth:`WindowDataset.__getitem__`.
        stride : int
            The size of the stride used to determine which windows
            are included in the dataset. The default is 1.
            Used to compute ``window_inds``,
            with the function
            :func:`vak.datasets.frame_classification.window_dataset.get_window_inds`.
        split : str
            The name of a split from the dataset,
            one of {'train', 'val', 'test'}.
        subset : str, optional
            Name of subset to use.
            If specified, this takes precedence over split.
            Subsets are typically taken from the training data
            for use when generating a learning curve.

        Returns
        -------
        dataset : vak.datasets.frame_classification.WindowDataset
        """
        dataset_path = pathlib.Path(dataset_path)
        metadata = Metadata.from_dataset_path(dataset_path)
        frame_dur = metadata.frame_dur
        input_type = metadata.input_type

        dataset_csv_path = dataset_path / metadata.dataset_csv_filename
        dataset_df = pd.read_csv(dataset_csv_path)

        # the indexing vectors live in the directory for the split;
        # a subset has its own vectors, with subset-specific filenames
        split_path = dataset_path / split
        if subset:
            sample_ids_filename = helper.sample_ids_array_filename_for_subset(
                subset
            )
            inds_in_sample_filename = (
                helper.inds_in_sample_array_filename_for_subset(subset)
            )
        else:
            sample_ids_filename = constants.SAMPLE_IDS_ARRAY_FILENAME
            inds_in_sample_filename = constants.INDS_IN_SAMPLE_ARRAY_FILENAME
        sample_ids = np.load(split_path / sample_ids_filename)
        inds_in_sample = np.load(split_path / inds_in_sample_filename)

        # NOTE(review): the same window indices file is used whether or not
        # ``subset`` is specified -- confirm that indices saved for
        # the whole split are also valid for a subset's indexing vectors.
        window_inds_path = split_path / constants.WINDOW_INDS_ARRAY_FILENAME
        if window_inds_path.exists():
            window_inds = np.load(window_inds_path)
        else:
            # ``__init__`` will compute window indices from ``stride``
            window_inds = None

        return cls(
            dataset_path=dataset_path,
            dataset_df=dataset_df,
            input_type=input_type,
            split=split,
            sample_ids=sample_ids,
            inds_in_sample=inds_in_sample,
            window_size=window_size,
            frame_dur=frame_dur,
            item_transform=item_transform,
            stride=stride,
            subset=subset,
            window_inds=window_inds,
        )