"""A datapipe class used for neural network models with the
frame classification task, where the source data consists of audio signals
or spectrograms of varying lengths."""
from __future__ import annotations
import pathlib
from typing import TYPE_CHECKING
import numpy as np
import numpy.typing as npt
import pandas as pd
from ...transforms.defaults.frame_classification import InferItemTransform
from . import constants, helper
from .metadata import Metadata
if TYPE_CHECKING:
from ...transforms import FramesStandardizer
# Datapipe used at inference time by frame classification models.
class InferDatapipe:
    """A datapipe class used for
    neural network models
    with the frame classification task,
    where the source data consists of audio signals
    or spectrograms of varying lengths.

    Indexing an instance loads one file of frames
    (and, unless the split is ``'predict'``, the matching
    frame labels) and passes them through an
    :class:`vak.transforms.defaults.frame_classification.InferItemTransform`.

    Attributes
    ----------
    dataset_path : pathlib.Path
        Path to directory that represents a
        frame classification dataset,
        as created by
        :func:`vak.prep.prep_frame_classification_dataset`.
    split : str
        The name of a split from the dataset,
        e.g. one of {'train', 'val', 'test', 'predict'}.
    subset : str, optional
        Name of subset to use.
        If specified, this takes precedence over split.
        Subsets are typically taken from the training data
        for use when generating a learning curve.
    dataset_df : pandas.DataFrame
        A frame classification dataset,
        represented as a :class:`pandas.DataFrame`.
        This will be only the rows that correspond
        to either ``subset`` or ``split`` from the
        ``dataset_df`` that was passed in when
        instantiating the class.
    frames_paths : numpy.ndarray
        Paths to files containing frames,
        either spectrograms or audio signals
        that are input to the model.
        Paths are relative to ``dataset_path``.
    frame_labels_paths : numpy.ndarray or None
        Paths to npy files containing vectors
        with a label for each frame.
        The targets for the outputs of the model.
        ``None`` when ``split`` is ``'predict'``.
    input_type : str
        The type of input to the neural network model.
        One of {'audio', 'spect'}.
    sample_ids : numpy.ndarray
        Indexing vector representing which sample
        from the dataset every frame belongs to.
    inds_in_sample : numpy.ndarray
        Indexing vector representing which index
        within each sample from the dataset
        that every frame belongs to.
    frame_dur: float
        Duration of a frame, i.e., a single sample in audio
        or a single timebin in a spectrogram.
    window_size : int
        Size of windows to return;
        number of frames.
    frames_standardizer : vak.transforms.FramesStandardizer, optional
        Transform applied to frames, the input to the neural network model.
        Optional, default is None.
        If supplied, will be used with the transform applied to inputs and targets,
        :class:`vak.transforms.defaults.frame_classification.InferItemTransform`.
    item_transform : InferItemTransform
        Transform applied to every item before it is returned
        by :meth:`__getitem__`.
    """

    def __init__(
        self,
        dataset_path: str | pathlib.Path,
        dataset_df: pd.DataFrame,
        input_type: str,
        split: str,
        sample_ids: npt.NDArray,
        inds_in_sample: npt.NDArray,
        frame_dur: float,
        window_size: int,
        frames_standardizer: FramesStandardizer | None = None,
        frames_padval: float = 0.0,
        frame_labels_padval: int = -1,
        return_padding_mask: bool = False,
        subset: str | None = None,
    ):
        """Initialize a new instance of an :class:`InferDatapipe`.

        Parameters
        ----------
        dataset_path : pathlib.Path
            Path to directory that represents a
            frame classification dataset,
            as created by
            :func:`vak.prep.prep_frame_classification_dataset`.
        dataset_df : pandas.DataFrame
            A frame classification dataset,
            represented as a :class:`pandas.DataFrame`.
        input_type : str
            The type of input to the neural network model.
            One of {'audio', 'spect'}.
        split : str
            The name of a split from the dataset,
            e.g. one of {'train', 'val', 'test', 'predict'}.
            When ``split`` is ``'predict'``, no frame labels
            are loaded.
        sample_ids : numpy.ndarray
            Indexing vector representing which sample
            from the dataset every frame belongs to.
        inds_in_sample : numpy.ndarray
            Indexing vector representing which index
            within each sample from the dataset
            that every frame belongs to.
        frame_dur: float
            Duration of a frame, i.e., a single sample in audio
            or a single timebin in a spectrogram.
        window_size : int
            Size of windows to return;
            number of frames.
        frames_standardizer : vak.transforms.FramesStandardizer, optional
            Transform applied to frames, the input to the neural network model.
            Optional, default is None.
            If supplied, will be used with the transform applied to inputs and targets,
            :class:`vak.transforms.defaults.frame_classification.InferItemTransform`.
        frames_padval : float
            Value to pad frames with. Added to end of array, the "right side".
            Argument to PadToWindow transform. Default is 0.0.
        frame_labels_padval : int
            Value to pad frame labels vector with. Added to the end of the array.
            Argument to PadToWindow transform. Default is -1.
            Used with ``ignore_index`` argument of :mod:`torch.nn.CrossEntropyLoss`.
        return_padding_mask : bool
            if True, the dictionary returned by ItemTransform classes will include
            a boolean vector to use for cropping back down to size before padding.
            padding_mask has size equal to width of padded array, i.e. original size
            plus padding at the end, and has values of 1 where
            columns in padded are from the original array,
            and values of 0 where columns were added for padding.
        subset : str, optional
            Name of subset to use.
            If specified, this takes precedence over split.
            Subsets are typically taken from the training data
            for use when generating a learning curve.

        Raises
        ------
        ValueError
            If ``input_type`` is not one of ``vak.prep.constants.INPUT_TYPES``.
        """
        from ... import (
            prep,
        )  # avoid circular import, use for constants.INPUT_TYPES

        if input_type not in prep.constants.INPUT_TYPES:
            raise ValueError(
                f"``input_type`` must be one of: {prep.constants.INPUT_TYPES}\n"
                f"Value for ``input_type`` was: {input_type}"
            )

        self.dataset_path = pathlib.Path(dataset_path)
        self.split = split
        self.subset = subset

        # subset takes precedence over split, if specified
        if subset:
            dataset_df = dataset_df[dataset_df.subset == subset].copy()
        else:
            dataset_df = dataset_df[dataset_df.split == split].copy()
        self.dataset_df = dataset_df
        self.input_type = input_type

        self.frames_paths = self.dataset_df[
            constants.FRAMES_PATH_COL_NAME
        ].values
        # a "predict" split has no annotations, so there are no labels to load
        if split != "predict":
            self.frame_labels_paths = self.dataset_df[
                constants.MULTI_FRAME_LABELS_PATH_COL_NAME
            ].values
        else:
            self.frame_labels_paths = None

        self.sample_ids = sample_ids
        self.inds_in_sample = inds_in_sample
        self.frame_dur = float(frame_dur)

        # kept as attributes so that they match the documented class attributes
        self.window_size = window_size
        self.frames_standardizer = frames_standardizer

        self.item_transform = InferItemTransform(
            window_size,
            frames_standardizer,
            frames_padval,
            frame_labels_padval,
            return_padding_mask,
        )

    @property
    def duration(self) -> float:
        """Total duration of the data: the number of frames
        (length of the last dimension of ``sample_ids``)
        multiplied by ``frame_dur``."""
        return self.sample_ids.shape[-1] * self.frame_dur

    @property
    def shape(self):
        """Shape of the ``"frames"`` entry of the first item.

        Note that this loads and transforms item 0 on every access.
        """
        first_item = self[0]
        return first_item["frames"].shape

    def _load_frames(self, frames_path):
        """Helper function that loads "frames",
        the input to the frame classification model.

        Loads audio or spectrogram, depending on
        :attr:`self.input_type`.
        This function assumes that audio is in wav format
        and spectrograms are in npz files.

        :meth:`__getitem__` does not call this helper;
        it loads files itself so that it can also return
        the vector of frame times.
        """
        return helper.load_frames(frames_path, self.input_type)

    def __getitem__(self, idx):
        """Load the frames at index ``idx``, plus frame labels
        when available, and apply ``item_transform``.

        Parameters
        ----------
        idx : int
            Index into ``frames_paths`` (and ``frame_labels_paths``).

        Returns
        -------
        item : dict
            Output of ``item_transform`` called with ``frames``,
            ``frames_path`` and, unless ``frame_labels_paths`` is None,
            ``frame_labels``. When ``input_type`` is ``'spect'``
            the key ``'frame_times'`` is added, holding the time bins
            vector loaded from the spectrogram file.
            No ``'frame_times'`` key is added for ``'audio'`` input,
            because no time vector is loaded from audio files.

        Raises
        ------
        ValueError
            If ``input_type`` is neither ``'audio'`` nor ``'spect'``.
        """
        # imported inside the method, as in the original implementation
        # (presumably to avoid a circular import -- TODO confirm)
        from vak import common

        frames_path = self.dataset_path / self.frames_paths[idx]

        # only spectrogram files carry a vector of frame times;
        # without this default the audio branch left the name unbound
        # and the method raised UnboundLocalError further down
        frame_times = None
        if self.input_type == "audio":
            # second return value of the loader is discarded
            frames, _ = common.constants.AUDIO_FORMAT_FUNC_MAP[
                constants.FRAME_CLASSIFICATION_DATASET_AUDIO_FORMAT
            ](frames_path)
        elif self.input_type == "spect":
            spect_dict = common.files.spect.load(frames_path)
            frames = spect_dict[common.constants.SPECT_KEY]
            frame_times = spect_dict[common.constants.TIMEBINS_KEY]
        else:
            raise ValueError(
                "``input_type`` must be 'audio' or 'spect', "
                f"but was: {self.input_type}"
            )

        item = {"frames": frames, "frames_path": frames_path}
        if self.frame_labels_paths is not None:
            item["frame_labels"] = np.load(
                self.dataset_path / self.frame_labels_paths[idx]
            )

        if self.item_transform:
            item = self.item_transform(**item)

        # added after the transform, so frame times are returned as loaded
        if frame_times is not None:
            item["frame_times"] = frame_times
        return item

    def __len__(self) -> int:
        """Number of samples: the count of unique values in ``sample_ids``."""
        return len(np.unique(self.sample_ids))

    @classmethod
    def from_dataset_path(
        cls,
        dataset_path: str | pathlib.Path,
        window_size: int,
        frames_standardizer: FramesStandardizer | None = None,
        frames_padval: float = 0.0,
        frame_labels_padval: int = -1,
        return_padding_mask: bool = False,
        split: str = "val",
        subset: str | None = None,
    ):
        """Make a :class:`InferDatapipe` instance,
        given the path to a frame classification dataset.

        Reads the dataset metadata and csv file found in ``dataset_path``,
        and loads the ``sample_ids`` and ``inds_in_sample`` arrays
        from the sub-directory named after ``split``.

        Parameters
        ----------
        dataset_path : pathlib.Path
            Path to directory that represents a
            frame classification dataset,
            as created by
            :func:`vak.prep.prep_frame_classification_dataset`.
        window_size : int
            Size of windows to return;
            number of frames.
        frames_standardizer : vak.transforms.FramesStandardizer, optional
            Transform applied to frames, the input to the neural network model.
            Optional, default is None.
            If supplied, will be used with the transform applied to inputs and targets,
            :class:`vak.transforms.defaults.frame_classification.InferItemTransform`.
        frames_padval : float
            Value to pad frames with. Added to end of array, the "right side".
            Argument to PadToWindow transform. Default is 0.0.
        frame_labels_padval : int
            Value to pad frame labels vector with. Added to the end of the array.
            Argument to PadToWindow transform. Default is -1.
            Used with ``ignore_index`` argument of :mod:`torch.nn.CrossEntropyLoss`.
        return_padding_mask : bool
            if True, the dictionary returned by ItemTransform classes will include
            a boolean vector to use for cropping back down to size before padding.
            padding_mask has size equal to width of padded array, i.e. original size
            plus padding at the end, and has values of 1 where
            columns in padded are from the original array,
            and values of 0 where columns were added for padding.
        split : str
            The name of a split from the dataset,
            e.g. one of {'train', 'val', 'test', 'predict'}.
            Default is "val".
        subset : str, optional
            Name of subset to use.
            If specified, this takes precedence over split.
            Subsets are typically taken from the training data
            for use when generating a learning curve.

        Returns
        -------
        infer_datapipe : InferDatapipe
        """
        dataset_path = pathlib.Path(dataset_path)
        metadata = Metadata.from_dataset_path(dataset_path)

        dataset_csv_path = dataset_path / metadata.dataset_csv_filename
        dataset_df = pd.read_csv(dataset_csv_path)

        # the indexing arrays live in the directory named after the split;
        # a subset has its own pair of array files inside that directory
        split_path = dataset_path / split
        if subset:
            sample_ids_filename = helper.sample_ids_array_filename_for_subset(
                subset
            )
            inds_in_sample_filename = (
                helper.inds_in_sample_array_filename_for_subset(subset)
            )
        else:
            sample_ids_filename = constants.SAMPLE_IDS_ARRAY_FILENAME
            inds_in_sample_filename = constants.INDS_IN_SAMPLE_ARRAY_FILENAME

        sample_ids = np.load(split_path / sample_ids_filename)
        inds_in_sample = np.load(split_path / inds_in_sample_filename)

        return cls(
            dataset_path=dataset_path,
            dataset_df=dataset_df,
            input_type=metadata.input_type,
            split=split,
            sample_ids=sample_ids,
            inds_in_sample=inds_in_sample,
            frame_dur=metadata.frame_dur,
            window_size=window_size,
            frames_standardizer=frames_standardizer,
            frames_padval=frames_padval,
            frame_labels_padval=frame_labels_padval,
            return_padding_mask=return_padding_mask,
            subset=subset,
        )