Source code for vak.datapipes.frame_classification.infer_datapipe

"""A datapipe class used for neural network models with the
frame classification task, where the source data consists of audio signals
or spectrograms of varying lengths."""

from __future__ import annotations

import pathlib
from typing import TYPE_CHECKING

import numpy as np
import numpy.typing as npt
import pandas as pd

from ...transforms.defaults.frame_classification import InferItemTransform
from . import constants, helper
from .metadata import Metadata

if TYPE_CHECKING:
    from ...transforms import FramesStandardizer


class InferDatapipe:
    """A datapipe class used for neural network models with the
    frame classification task, where the source data consists of audio signals
    or spectrograms of varying lengths.

    Attributes
    ----------
    dataset_path : pathlib.Path
        Path to directory that represents a
        frame classification dataset,
        as created by
        :func:`vak.prep.prep_frame_classification_dataset`.
    split : str
        The name of a split from the dataset,
        e.g. one of {'train', 'val', 'test', 'predict'}.
    subset : str, optional
        Name of subset to use.
        If specified, this takes precedence over split.
        Subsets are typically taken from the training data
        for use when generating a learning curve.
    dataset_df : pandas.DataFrame
        A frame classification dataset,
        represented as a :class:`pandas.DataFrame`.
        This will be only the rows that correspond
        to either ``subset`` or ``split`` from the
        ``dataset_df`` that was passed in when
        instantiating the class.
    input_type : str
        The type of input to the neural network model.
        One of {'audio', 'spect'}.
    frames_paths : numpy.ndarray
        Paths to files containing frames,
        either spectrograms or audio signals
        that are input to the model.
        Taken from the column of ``dataset_df`` named by
        ``constants.FRAMES_PATH_COL_NAME``.
    frame_labels_paths : numpy.ndarray or None
        Paths to npy files containing vectors
        with a label for each frame.
        The targets for the outputs of the model.
        ``None`` when ``split`` is ``"predict"``.
    sample_ids : numpy.ndarray
        Indexing vector representing which sample
        from the dataset every frame belongs to.
    inds_in_sample : numpy.ndarray
        Indexing vector representing which index
        within each sample from the dataset
        that every frame belongs to.
    frame_dur : float
        Duration of a frame, i.e., a single sample in audio
        or a single timebin in a spectrogram.
    item_transform : vak.transforms.defaults.frame_classification.InferItemTransform
        Transform applied to each item in :meth:`__getitem__`.
        Built in ``__init__`` from ``window_size``,
        ``frames_standardizer``, ``frames_padval``,
        ``frame_labels_padval`` and ``return_padding_mask``.
    """

    def __init__(
        self,
        dataset_path: str | pathlib.Path,
        dataset_df: pd.DataFrame,
        input_type: str,
        split: str,
        sample_ids: npt.NDArray,
        inds_in_sample: npt.NDArray,
        frame_dur: float,
        window_size: int,
        frames_standardizer: FramesStandardizer | None = None,
        frames_padval: float = 0.0,
        frame_labels_padval: int = -1,
        return_padding_mask: bool = False,
        subset: str | None = None,
    ):
        """Initialize a new instance of an :class:`InferDatapipe`.

        Parameters
        ----------
        dataset_path : pathlib.Path
            Path to directory that represents a
            frame classification dataset,
            as created by
            :func:`vak.prep.prep_frame_classification_dataset`.
        dataset_df : pandas.DataFrame
            A frame classification dataset,
            represented as a :class:`pandas.DataFrame`.
        input_type : str
            The type of input to the neural network model.
            One of {'audio', 'spect'}.
        split : str
            The name of a split from the dataset,
            e.g. one of {'train', 'val', 'test', 'predict'}.
            When ``split`` is ``"predict"``, no frame labels are loaded.
        sample_ids : numpy.ndarray
            Indexing vector representing which sample
            from the dataset every frame belongs to.
        inds_in_sample : numpy.ndarray
            Indexing vector representing which index
            within each sample from the dataset
            that every frame belongs to.
        frame_dur : float
            Duration of a frame, i.e., a single sample in audio
            or a single timebin in a spectrogram.
        window_size : int
            Size of windows to return;
            number of frames.
        frames_standardizer : vak.transforms.FramesStandardizer, optional
            Transform applied to frames, the input to the neural network model.
            Optional, default is None.
            If supplied, will be used with the transform applied to inputs and targets,
            :class:`vak.transforms.defaults.frame_classification.InferItemTransform`.
        frames_padval : float
            Value to pad frames with. Added to end of array, the "right side".
            Argument to PadToWindow transform. Default is 0.0.
        frame_labels_padval : int
            Value to pad frame labels vector with. Added to the end of the array.
            Argument to PadToWindow transform. Default is -1.
            Used with ``ignore_index`` argument of :mod:`torch.nn.CrossEntropyLoss`.
        return_padding_mask : bool
            if True, the dictionary returned by ItemTransform classes will include
            a boolean vector to use for cropping back down to size before padding.
            padding_mask has size equal to width of padded array, i.e. original size
            plus padding at the end, and has values of 1 where
            columns in padded are from the original array,
            and values of 0 where columns were added for padding.
        subset : str, optional
            Name of subset to use.
            If specified, this takes precedence over split.
            Subsets are typically taken from the training data
            for use when generating a learning curve.

        Raises
        ------
        ValueError
            If ``input_type`` is not one of ``vak.prep.constants.INPUT_TYPES``.
        """
        from ... import (
            prep,
        )  # avoid circular import, use for constants.INPUT_TYPES

        if input_type not in prep.constants.INPUT_TYPES:
            raise ValueError(
                f"``input_type`` must be one of: {prep.constants.INPUT_TYPES}\n"
                f"Value for ``input_type`` was: {input_type}"
            )

        self.dataset_path = pathlib.Path(dataset_path)
        self.split = split
        self.subset = subset
        # subset takes precedence over split, if specified
        if subset:
            dataset_df = dataset_df[dataset_df["subset"] == subset].copy()
        else:
            dataset_df = dataset_df[dataset_df["split"] == split].copy()
        self.dataset_df = dataset_df
        self.input_type = input_type
        self.frames_paths = self.dataset_df[
            constants.FRAMES_PATH_COL_NAME
        ].values
        # the "predict" split gets no frame labels paths
        if split != "predict":
            self.frame_labels_paths = self.dataset_df[
                constants.MULTI_FRAME_LABELS_PATH_COL_NAME
            ].values
        else:
            self.frame_labels_paths = None
        self.sample_ids = sample_ids
        self.inds_in_sample = inds_in_sample
        self.frame_dur = float(frame_dur)
        self.item_transform = InferItemTransform(
            window_size,
            frames_standardizer,
            frames_padval,
            frame_labels_padval,
            return_padding_mask,
        )

    @property
    def duration(self):
        """Length of the last dimension of :attr:`sample_ids`
        multiplied by :attr:`frame_dur`."""
        return self.sample_ids.shape[-1] * self.frame_dur

    @property
    def shape(self):
        """Shape of the ``"frames"`` entry of the first item,
        as returned by :meth:`__getitem__`."""
        first_item = self[0]
        return first_item["frames"].shape

    def _load_frames(self, frames_path):
        """Helper function that loads "frames",
        the input to the frame classification model.

        Delegates to ``helper.load_frames``,
        passing :attr:`input_type` so that either
        audio or a spectrogram is loaded.

        Note that :meth:`__getitem__` does not call this helper;
        it loads the frames itself so that it can also
        read the frame times stored with a spectrogram.
        """
        return helper.load_frames(frames_path, self.input_type)

    def __getitem__(self, idx):
        """Load the item at index ``idx`` and apply :attr:`item_transform`.

        Parameters
        ----------
        idx : int
            Index into :attr:`frames_paths`
            (and :attr:`frame_labels_paths`, when it is not None).

        Returns
        -------
        item : dict
            Result of applying :attr:`item_transform` to a dict with keys
            ``"frames"``, ``"frames_path"`` and, when frame labels
            are available, ``"frame_labels"``.
            When :attr:`input_type` is ``"spect"`` the key
            ``"frame_times"`` is added after the transform,
            holding the time bins loaded from the spectrogram file.
            For ``"audio"`` no frame times are loaded,
            so the key is omitted.

        Raises
        ------
        ValueError
            If :attr:`input_type` is neither ``"audio"`` nor ``"spect"``.
        """
        # imported locally, like ``prep`` in ``__init__``
        from ... import common

        frames_path = self.dataset_path / self.frames_paths[idx]
        # only spectrogram files carry frame times; stays None for audio
        frame_times = None
        if self.input_type == "audio":
            load_audio = common.constants.AUDIO_FORMAT_FUNC_MAP[
                constants.FRAME_CLASSIFICATION_DATASET_AUDIO_FORMAT
            ]
            # second return value is discarded, only the signal is used
            frames, _ = load_audio(frames_path)
        elif self.input_type == "spect":
            spect_dict = common.files.spect.load(frames_path)
            frames = spect_dict[common.constants.SPECT_KEY]
            frame_times = spect_dict[common.constants.TIMEBINS_KEY]
        else:
            raise ValueError(
                "``input_type`` must be 'audio' or 'spect', "
                f"but was: {self.input_type}"
            )

        item = {"frames": frames, "frames_path": frames_path}
        if self.frame_labels_paths is not None:
            item["frame_labels"] = np.load(
                self.dataset_path / self.frame_labels_paths[idx]
            )

        if self.item_transform:
            item = self.item_transform(**item)

        # added after the transform, so the transform never receives it.
        # Previously this assignment ran unconditionally and raised
        # UnboundLocalError for audio input.
        if frame_times is not None:
            item["frame_times"] = frame_times
        return item

    def __len__(self):
        """Number of samples in the dataset,
        i.e., the number of unique values in :attr:`sample_ids`."""
        return len(np.unique(self.sample_ids))

    @classmethod
    def from_dataset_path(
        cls,
        dataset_path: str | pathlib.Path,
        window_size: int,
        frames_standardizer: FramesStandardizer | None = None,
        frames_padval: float = 0.0,
        frame_labels_padval: int = -1,
        return_padding_mask: bool = False,
        split: str = "val",
        subset: str | None = None,
    ):
        """Make a :class:`InferDatapipe` instance,
        given the path to a frame classification dataset.

        Parameters
        ----------
        dataset_path : pathlib.Path
            Path to directory that represents a
            frame classification dataset,
            as created by
            :func:`vak.prep.prep_frame_classification_dataset`.
        window_size : int
            Size of windows to return;
            number of frames.
        frames_standardizer : vak.transforms.FramesStandardizer, optional
            Transform applied to frames, the input to the neural network model.
            Optional, default is None.
            If supplied, will be used with the transform applied to inputs and targets,
            :class:`vak.transforms.defaults.frame_classification.InferItemTransform`.
        frames_padval : float
            Value to pad frames with. Added to end of array, the "right side".
            Argument to PadToWindow transform. Default is 0.0.
        frame_labels_padval : int
            Value to pad frame labels vector with. Added to the end of the array.
            Argument to PadToWindow transform. Default is -1.
            Used with ``ignore_index`` argument of :mod:`torch.nn.CrossEntropyLoss`.
        return_padding_mask : bool
            if True, the dictionary returned by ItemTransform classes will include
            a boolean vector to use for cropping back down to size before padding.
            padding_mask has size equal to width of padded array, i.e. original size
            plus padding at the end, and has values of 1 where
            columns in padded are from the original array,
            and values of 0 where columns were added for padding.
        split : str
            The name of a split from the dataset,
            e.g. one of {'train', 'val', 'test', 'predict'}.
            Default is "val".
        subset : str, optional
            Name of subset to use.
            If specified, this takes precedence over split.
            Subsets are typically taken from the training data
            for use when generating a learning curve.

        Returns
        -------
        infer_datapipe : InferDatapipe
        """
        dataset_path = pathlib.Path(dataset_path)
        metadata = Metadata.from_dataset_path(dataset_path)
        frame_dur = metadata.frame_dur
        input_type = metadata.input_type

        dataset_csv_path = dataset_path / metadata.dataset_csv_filename
        dataset_df = pd.read_csv(dataset_csv_path)

        # the indexing vectors are loaded from the split's directory;
        # a subset has its own pair of array files there
        split_path = dataset_path / split
        if subset:
            sample_ids_filename = helper.sample_ids_array_filename_for_subset(
                subset
            )
            inds_in_sample_filename = (
                helper.inds_in_sample_array_filename_for_subset(subset)
            )
        else:
            sample_ids_filename = constants.SAMPLE_IDS_ARRAY_FILENAME
            inds_in_sample_filename = constants.INDS_IN_SAMPLE_ARRAY_FILENAME
        sample_ids = np.load(split_path / sample_ids_filename)
        inds_in_sample = np.load(split_path / inds_in_sample_filename)

        return cls(
            dataset_path,
            dataset_df,
            input_type,
            split,
            sample_ids,
            inds_in_sample,
            frame_dur,
            window_size,
            frames_standardizer,
            frames_padval,
            frame_labels_padval,
            return_padding_mask,
            subset,
        )