Source code for vak.datasets.frame_classification.window_dataset

"""A dataset class used for neural network models with the
frame classification task, where the source data consists of audio signals
or spectrograms of varying lengths.

Unlike :class:`vak.datasets.frame_classification.FramesDataset`,
this class does not return entire samples
from the source dataset.
Instead, each paired sample :math:`(x_i, y_i)`
returned by this dataset class consists of
a window :math:`x_i` of fixed length
:math:`w` from the underlying data ``X`` of total length :math:`T`.
Each :math:`y_i` is a vector of the same size :math:`w`, containing
an integer class label for each *frame* in the window :math:`x_i`.
The entire dataset consists of some number of windows
:math:`I` determined by a ``stride`` parameter :math:`s`,
:math:`I = 1 + (T - w) / s`, with the division rounded down.
"""

from __future__ import annotations

import pathlib
from typing import Callable

import numpy as np
import numpy.typing as npt
import pandas as pd

from . import constants, helper
from .metadata import Metadata


def get_window_inds(n_frames: int, window_size: int, stride: int = 1):
    """Compute the starting index of every window in a :class:`WindowDataset`.

    :class:`WindowDataset` calls this function to build its vector of
    window indices. The length of the returned vector is the number
    of windows in the dataset, i.e., the number of samples.

    Parameters
    ----------
    n_frames : int
        Total number of frames in the dataset.
    window_size : int
        Number of frames in each window.
    stride : int
        Step between the starting indices of consecutive windows.
        Default is 1.

    Returns
    -------
    window_inds : numpy.ndarray
        Vector of starting indices for windows.
        During training, batches of windows are made
        by grabbing indices randomly from this vector,
        then getting windows of the specified size
        from the arrays representing the input data
        and targets for the neural network.
        The vector is empty when ``window_size`` is greater
        than ``n_frames``.
    """
    # A window starting at index ``i`` covers frames ``i`` through
    # ``i + window_size - 1``, so the last start that still fits is
    # ``n_frames - window_size``; ``np.arange`` excludes its stop value.
    first_invalid_start = n_frames - window_size + 1
    return np.arange(0, first_invalid_start, stride)
class WindowDataset:
    """Dataset used for training neural network models
    on the frame classification task,
    where the source data consists of audio signals
    or spectrograms of varying lengths.

    Unlike :class:`vak.datasets.frame_classification.FramesDataset`,
    this class does not return entire samples
    from the source dataset.
    Instead, each paired sample :math:`(x_i, y_i)`
    returned by this dataset class consists of
    a window :math:`x_i` of fixed length
    :math:`w` from the underlying data ``X`` of total length :math:`T`.
    Each :math:`y_i` is a vector of the same size :math:`w`, containing
    an integer class label for each *frame* in the window :math:`x_i`.
    By default, the entire dataset consists of some number of windows
    :math:`I` determined by a ``stride`` parameter :math:`s`,
    :math:`I = 1 + (T - w) / s`, with the division rounded down.
    If ``window_inds`` is given explicitly, the number of windows
    is the length of that vector instead.

    The underlying data consists of single arrays
    for both the input to the network ``X``
    and the targets for the network output ``Y``.
    These single arrays ``X`` and ``Y`` are
    created by concatenating samples from the source
    data, e.g., audio files or spectrogram arrays.
    (This is true for
    :class:`vak.datasets.frame_classification.FramesDataset`
    as well.)
    The dimensions of :math:`X` will be (channels, ..., frames),
    i.e., audio will have dimensions (channels, samples)
    and spectrograms will have dimensions
    (channels, frequency bins, time bins).
    The signal :math:`X` may be either audio or spectrogram,
    meaning that a frame will be either a single sample
    in an audio signal or a single time bin in a spectrogram.
    The last dimension of ``X`` will always be the
    number of total frames in the dataset,
    either audio samples or spectrogram time bins,
    and ``Y`` will be the same size, containing
    an integer class label for each frame.

    Attributes
    ----------
    dataset_path : pathlib.Path
        Path to directory that represents a
        frame classification dataset,
        as created by
        :func:`vak.prep.prep_frame_classification_dataset`.
    split : str
        The name of a split from the dataset,
        one of {'train', 'val', 'test'}.
    subset : str, optional
        Name of subset to use.
        If specified, this takes precedence over split.
        Subsets are typically taken from the training data
        for use when generating a learning curve.
    dataset_df : pandas.DataFrame
        A frame classification dataset,
        represented as a :class:`pandas.DataFrame`.
        This will be only the rows that correspond
        to either ``subset`` or ``split`` from the
        ``dataset_df`` that was passed in when
        instantiating the class.
    input_type : str
        The type of input to the neural network model.
        One of {'audio', 'spect'}.
    frames_paths : numpy.ndarray
        Paths to files containing frames,
        either spectrograms or audio signals
        that are input to the model.
        Paths are joined onto ``dataset_path`` before loading.
    frame_labels_paths : numpy.ndarray
        Paths to npy files containing vectors
        with a label for each frame.
        The targets for the outputs of the model.
        Paths are joined onto ``dataset_path`` before loading.
    sample_ids : numpy.ndarray
        Indexing vector representing which sample
        from the dataset every frame belongs to.
    inds_in_sample : numpy.ndarray
        Indexing vector representing which index
        within each sample from the dataset
        that every frame belongs to.
    window_size : int
        Size of windows to return;
        number of frames.
    frame_dur : float
        Duration of a frame, i.e., a single sample in audio
        or a single timebin in a spectrogram.
    stride : int
        The size of the stride used to determine which windows
        are included in the dataset. The default is 1.
        Used to compute ``window_inds``,
        with the function
        :func:`vak.datasets.frame_classification.window_dataset.get_window_inds`.
    window_inds : numpy.ndarray
        A vector of valid window indices for the dataset.
        Either the vector passed in when instantiating the class,
        or the one computed from ``stride``.
    item_transform : callable
        The transform applied to each item :math:`(x, y)`
        that is returned by :meth:`WindowDataset.__getitem__`.
    """

    def __init__(
        self,
        dataset_path: str | pathlib.Path,
        dataset_df: pd.DataFrame,
        input_type: str,
        split: str,
        sample_ids: npt.NDArray,
        inds_in_sample: npt.NDArray,
        window_size: int,
        frame_dur: float,
        item_transform: Callable,
        stride: int = 1,
        subset: str | None = None,
        window_inds: npt.NDArray | None = None,
    ):
        """Initialize a new instance of a WindowDataset.

        Parameters
        ----------
        dataset_path : pathlib.Path
            Path to directory that represents a
            frame classification dataset,
            as created by
            :func:`vak.prep.prep_frame_classification_dataset`.
        dataset_df : pandas.DataFrame
            A frame classification dataset,
            represented as a :class:`pandas.DataFrame`.
        input_type : str
            The type of input to the neural network model.
            One of {'audio', 'spect'}.
        split : str
            The name of a split from the dataset,
            one of {'train', 'val', 'test'}.
        sample_ids : numpy.ndarray
            Indexing vector representing which sample
            from the dataset every frame belongs to.
        inds_in_sample : numpy.ndarray
            Indexing vector representing which index
            within each sample from the dataset
            that every frame belongs to.
        window_size : int
            Size of windows to return;
            number of frames.
        frame_dur : float
            Duration of a frame, i.e., a single sample in audio
            or a single timebin in a spectrogram.
        item_transform : callable
            The transform applied to each item :math:`(x, y)`
            that is returned by :meth:`WindowDataset.__getitem__`.
        stride : int
            The size of the stride used to determine which windows
            are included in the dataset. The default is 1.
            Used to compute ``window_inds``,
            with the function
            :func:`vak.datasets.frame_classification.window_dataset.get_window_inds`.
        subset : str, optional
            Name of subset to use.
            If specified, this takes precedence over split.
            Subsets are typically taken from the training data
            for use when generating a learning curve.
        window_inds : numpy.ndarray, optional
            A vector of valid window indices for the dataset.
            If specified, this takes precedence over ``stride``.

        Raises
        ------
        ValueError
            If ``input_type`` is not one of the valid input types.
        """
        from ... import (
            prep,
        )  # avoid circular import, use for constants.INPUT_TYPES

        if input_type not in prep.constants.INPUT_TYPES:
            raise ValueError(
                f"``input_type`` must be one of: {prep.constants.INPUT_TYPES}\n"
                f"Value for ``input_type`` was: {input_type}"
            )

        self.dataset_path = pathlib.Path(dataset_path)
        self.split = split
        self.subset = subset
        # subset takes precedence over split, if specified
        if subset:
            dataset_df = dataset_df[dataset_df.subset == subset].copy()
        else:
            dataset_df = dataset_df[dataset_df.split == split].copy()
        self.dataset_df = dataset_df
        self.input_type = input_type
        self.frames_paths = self.dataset_df[
            constants.FRAMES_PATH_COL_NAME
        ].values
        self.frame_labels_paths = self.dataset_df[
            constants.FRAME_LABELS_NPY_PATH_COL_NAME
        ].values
        self.sample_ids = sample_ids
        self.inds_in_sample = inds_in_sample
        self.window_size = window_size
        self.frame_dur = float(frame_dur)
        self.stride = stride
        # explicitly supplied window indices take precedence over stride
        if window_inds is None:
            window_inds = get_window_inds(
                sample_ids.shape[-1], window_size, stride
            )
        self.window_inds = window_inds
        self.item_transform = item_transform

    @property
    def duration(self):
        """Total duration of the dataset: the number of frames
        multiplied by ``frame_dur``, in the same units as ``frame_dur``.
        """
        return self.sample_ids.shape[-1] * self.frame_dur

    @property
    def shape(self):
        """Shape of the frames in the first item of the dataset.

        Used by vak functions that need to determine the size of a window,
        e.g. when initializing a neural network model.
        Requires that ``item_transform`` returns something that can be
        indexed with the key ``"frames"``.
        """
        return self[0]["frames"].shape

    def _load_frames(self, frames_path):
        """Helper function that loads "frames",
        the input to the frame classification model.

        Loads audio or spectrogram, depending on
        :attr:`self.input_type`, by delegating to
        ``helper.load_frames``.
        """
        return helper.load_frames(frames_path, self.input_type)

    def _load_sample(self, sample_id):
        """Load the frames and the frame labels for one source sample."""
        frames = self._load_frames(
            self.dataset_path / self.frames_paths[sample_id]
        )
        frame_labels = np.load(
            self.dataset_path / self.frame_labels_paths[sample_id]
        )
        return frames, frame_labels

    def __getitem__(self, idx):
        """Return the item for the window at position ``idx``.

        Loads every source sample that the window overlaps,
        cuts the window out of the frames and the frame labels,
        and returns the result of calling ``item_transform`` on them.

        Raises
        ------
        ValueError
            If no frames fall inside the window, so that
            no sample IDs are found for it.
        """
        window_idx = self.window_inds[idx]
        sample_ids = self.sample_ids[
            window_idx : window_idx + self.window_size  # noqa: E203
        ]
        # ``np.unique`` returns the IDs already sorted
        uniq_sample_ids = np.unique(sample_ids)
        if len(uniq_sample_ids) == 0:
            raise ValueError(
                f"Unexpected number of ``uniq_sample_ids``: {uniq_sample_ids}"
            )

        if len(uniq_sample_ids) == 1:
            # common case: the window lies within a single sample,
            # so there is nothing to concatenate
            frames, frame_labels = self._load_sample(uniq_sample_ids[0])
        else:
            # the window crosses a boundary between samples
            loaded = [
                self._load_sample(sample_id) for sample_id in uniq_sample_ids
            ]
            # Frames are always the last axis (the window is sliced out
            # with ``...`` below), so concatenating on the last axis
            # handles 1-d audio and multi-dimensional arrays alike.
            frames = np.concatenate(
                [sample_frames for sample_frames, _ in loaded], axis=-1
            )
            frame_labels = np.concatenate(
                [sample_labels for _, sample_labels in loaded]
            )

        # index of the window's first frame within the first loaded sample
        inds_in_sample = self.inds_in_sample[window_idx]
        frames = frames[
            ...,
            inds_in_sample : inds_in_sample + self.window_size,  # noqa: E203
        ]
        frame_labels = frame_labels[
            inds_in_sample : inds_in_sample + self.window_size  # noqa: E203
        ]
        item = self.item_transform(frames, frame_labels)
        return item

    def __len__(self):
        """Number of windows in the dataset, i.e., the number of samples."""
        return len(self.window_inds)

    @classmethod
    def from_dataset_path(
        cls,
        dataset_path: str | pathlib.Path,
        window_size: int,
        item_transform: Callable,
        stride: int = 1,
        split: str = "train",
        subset: str | None = None,
    ):
        """Make a :class:`WindowDataset` instance,
        given the path to a frame classification dataset.

        Parameters
        ----------
        dataset_path : pathlib.Path
            Path to directory that represents a
            frame classification dataset,
            as created by
            :func:`vak.prep.prep_frame_classification_dataset`.
        window_size : int
            Size of windows to return;
            number of frames.
        item_transform : callable
            The transform applied to each item :math:`(x, y)`
            that is returned by :meth:`WindowDataset.__getitem__`.
        stride : int
            The size of the stride used to determine which windows
            are included in the dataset. The default is 1.
            Used to compute ``window_inds``,
            with the function
            :func:`vak.datasets.frame_classification.window_dataset.get_window_inds`.
            Ignored if a saved window indices array is found
            in the split directory.
        split : str
            The name of a split from the dataset,
            one of {'train', 'val', 'test'}.
        subset : str, optional
            Name of subset to use.
            If specified, this takes precedence over split.
            Subsets are typically taken from the training data
            for use when generating a learning curve.

        Returns
        -------
        dataset : vak.datasets.frame_classification.WindowDataset
        """
        dataset_path = pathlib.Path(dataset_path)
        metadata = Metadata.from_dataset_path(dataset_path)
        frame_dur = metadata.frame_dur
        input_type = metadata.input_type

        dataset_csv_path = dataset_path / metadata.dataset_csv_filename
        dataset_df = pd.read_csv(dataset_csv_path)

        split_path = dataset_path / split
        # a subset has its own indexing vectors, saved in the split directory
        if subset:
            sample_ids_filename = helper.sample_ids_array_filename_for_subset(
                subset
            )
            inds_in_sample_filename = (
                helper.inds_in_sample_array_filename_for_subset(subset)
            )
        else:
            sample_ids_filename = constants.SAMPLE_IDS_ARRAY_FILENAME
            inds_in_sample_filename = constants.INDS_IN_SAMPLE_ARRAY_FILENAME
        sample_ids = np.load(split_path / sample_ids_filename)
        inds_in_sample = np.load(split_path / inds_in_sample_filename)

        # saved window indices are optional; when absent,
        # ``__init__`` computes them from ``stride``
        window_inds_path = split_path / constants.WINDOW_INDS_ARRAY_FILENAME
        if window_inds_path.exists():
            window_inds = np.load(window_inds_path)
        else:
            window_inds = None

        return cls(
            dataset_path,
            dataset_df,
            input_type,
            split,
            sample_ids,
            inds_in_sample,
            window_size,
            frame_dur,
            item_transform,
            stride,
            subset,
            window_inds,
        )