Source code for vak.transforms.defaults.frame_classification

"""Default transforms for frame classification models.

These are "item" transforms because they apply transforms to input parameters
and then return them in an "item" (dictionary)
that is in turn returned by the __getitem__ method of a vak.FramesDataset.
Having the transform return a dictionary makes it possible to avoid
coupling the FramesDataset __getitem__ implementation to the transforms
needed for specific neural network models, e.g., whether the returned
output includes a mask to crop off padding that was added.
"""

from __future__ import annotations

from typing import Callable

import torchvision.transforms

from .. import transforms as vak_transforms



[docs]
class TrainItemTransform:
    """Default item transform used when training frame classification models.

    Calling an instance returns a dictionary (an "item") holding the
    transformed frames and frame labels. Frames are passed through the
    optional ``spect_standardizer``, then ``vak.transforms.ToFloatTensor``
    and ``vak.transforms.AddChannel``. Frame labels are passed through
    ``vak.transforms.ToLongTensor``.

    Parameters
    ----------
    spect_standardizer : vak.transforms.StandardizeSpect, optional
        Applied to the frames before the other frame transforms.
        Default is None, in which case no standardization is applied.

    Raises
    ------
    TypeError
        If ``spect_standardizer`` is not None and is not an instance of
        ``vak.transforms.StandardizeSpect``.
    """

    def __init__(
        self,
        spect_standardizer=None,
    ):
        if spect_standardizer is not None and not isinstance(
            spect_standardizer, vak_transforms.StandardizeSpect
        ):
            raise TypeError(
                f"invalid type for spect_standardizer: {type(spect_standardizer)}. "
                "Should be an instance of vak.transforms.StandardizeSpect"
            )

        # standardization, when requested, always runs first
        steps = [] if spect_standardizer is None else [spect_standardizer]
        steps.append(vak_transforms.ToFloatTensor())
        steps.append(vak_transforms.AddChannel())

        self.frames_transform = torchvision.transforms.Compose(steps)
        self.frame_labels_transform = vak_transforms.ToLongTensor()

    def __call__(self, frames, frame_labels, spect_path=None):
        """Transform one training sample and pack it into an item.

        Parameters
        ----------
        frames
            Input passed to ``self.frames_transform``.
        frame_labels
            Input passed to ``self.frame_labels_transform``.
        spect_path : str or pathlib.Path, optional
            Path associated with ``frames``. Default is None,
            in which case the item has no ``"spect_path"`` key.

        Returns
        -------
        item : dict
            With keys ``"frames"`` and ``"frame_labels"``,
            plus ``"spect_path"`` (as a str) when a path was given.
        """
        item = {
            "frames": self.frames_transform(frames),
            "frame_labels": self.frame_labels_transform(frame_labels),
        }

        if spect_path is not None:
            # store a str, not a pathlib.Path, the same way the eval and
            # predict item transforms in this module store frames_path
            item["spect_path"] = str(spect_path)

        return item




[docs]
class EvalItemTransform:
    """Default item transform used when evaluating frame classification models.

    The returned item includes the frames reshaped into a batch of windows,
    with padding added to make the reshaping possible, and the frame labels
    passed through ``vak.transforms.ToLongTensor``. This class does not pad
    or window the frame labels itself.
    If return_padding_mask is True, the item includes 'padding_mask' that
    can be used to crop off any predictions made on the padding.

    Parameters
    ----------
    window_size : int
        Width of window in number of elements.
        Argument to the PadToWindow and ViewAsWindowBatch transforms.
    spect_standardizer : vak.transforms.StandardizeSpect, optional
        Applied to the frames before padding.
        Default is None, in which case no standardization is applied.
    padval : float
        Value to pad with. Argument to the PadToWindow transform.
        Default is 0.0.
    return_padding_mask : bool
        Argument to the PadToWindow transform. If True, the returned item
        includes a 'padding_mask'. Default is True.
    channel_dim : int
        Argument to the AddChannel transform. Default is 1,
        because after windowing the first dimension is the batch of windows.

    Raises
    ------
    TypeError
        If ``spect_standardizer`` is not None and is not an instance of
        ``vak.transforms.StandardizeSpect``.
    """

    def __init__(
        self,
        window_size,
        spect_standardizer=None,
        padval=0.0,
        return_padding_mask=True,
        channel_dim=1,
    ):
        if spect_standardizer is not None and not isinstance(
            spect_standardizer, vak_transforms.StandardizeSpect
        ):
            raise TypeError(
                f"invalid type for spect_standardizer: {type(spect_standardizer)}. "
                "Should be an instance of vak.transforms.StandardizeSpect"
            )
        self.spect_standardizer = spect_standardizer

        # kept separate from the Compose below because it may return
        # a (frames, padding_mask) pair instead of just the frames
        self.pad_to_window = vak_transforms.PadToWindow(
            window_size, padval, return_padding_mask=return_padding_mask
        )

        self.source_transform_after_pad = torchvision.transforms.Compose(
            [
                vak_transforms.ViewAsWindowBatch(window_size),
                vak_transforms.ToFloatTensor(),
                # below, add channel at first dimension because windows become batch
                vak_transforms.AddChannel(channel_dim=channel_dim),
            ]
        )

        self.annot_transform = vak_transforms.ToLongTensor()

    def __call__(self, frames, frame_labels, frames_path=None):
        """Transform one evaluation sample and pack it into an item.

        Parameters
        ----------
        frames
            Input that is standardized (if a standardizer was given),
            padded, and then passed to ``self.source_transform_after_pad``.
        frame_labels
            Input passed to ``self.annot_transform``.
        frames_path : str or pathlib.Path, optional
            Path associated with ``frames``. Default is None,
            in which case the item has no ``"frames_path"`` key.

        Returns
        -------
        item : dict
            With keys ``"frames"`` and ``"frame_labels"``,
            plus ``"padding_mask"`` when the padding transform returns one,
            and ``"frames_path"`` (as a str) when a path was given.
        """
        # explicit None check: the standardizer is an arbitrary object,
        # so its truthiness should not decide whether it is applied
        if self.spect_standardizer is not None:
            frames = self.spect_standardizer(frames)

        padding_mask = None
        if self.pad_to_window.return_padding_mask:
            frames, padding_mask = self.pad_to_window(frames)
        else:
            frames = self.pad_to_window(frames)
        frames = self.source_transform_after_pad(frames)

        item = {
            "frames": frames,
            "frame_labels": self.annot_transform(frame_labels),
        }

        if padding_mask is not None:
            item["padding_mask"] = padding_mask

        if frames_path is not None:
            # make sure frames_path is a str, not a pathlib.Path
            item["frames_path"] = str(frames_path)

        return item




[docs]
class PredictItemTransform:
    """Default item transform used when using trained frame classification
    models to make predictions.

    The returned item includes the frames reshaped into a batch of windows,
    with padding added to make the reshaping possible.
    If return_padding_mask is True, the item includes 'padding_mask' that
    can be used to crop off any predictions made on the padding.

    Parameters
    ----------
    window_size : int
        Width of window in number of elements.
        Argument to the PadToWindow and ViewAsWindowBatch transforms.
    spect_standardizer : vak.transforms.StandardizeSpect, optional
        Applied to the frames before padding.
        Default is None, in which case no standardization is applied.
    padval : float
        Value to pad with. Argument to the PadToWindow transform.
        Default is 0.0.
    return_padding_mask : bool
        Argument to the PadToWindow transform. If True, the returned item
        includes a 'padding_mask'. Default is True.
    channel_dim : int
        Argument to the AddChannel transform. Default is 1,
        because after windowing the first dimension is the batch of windows.

    Raises
    ------
    TypeError
        If ``spect_standardizer`` is not None and is not an instance of
        ``vak.transforms.StandardizeSpect``.
    """

    def __init__(
        self,
        window_size,
        spect_standardizer=None,
        padval=0.0,
        return_padding_mask=True,
        channel_dim=1,
    ):
        if spect_standardizer is not None and not isinstance(
            spect_standardizer, vak_transforms.StandardizeSpect
        ):
            raise TypeError(
                f"invalid type for spect_standardizer: {type(spect_standardizer)}. "
                "Should be an instance of vak.transforms.StandardizeSpect"
            )
        self.spect_standardizer = spect_standardizer

        # kept separate from the Compose below because it may return
        # a (frames, padding_mask) pair instead of just the frames
        self.pad_to_window = vak_transforms.PadToWindow(
            window_size, padval, return_padding_mask=return_padding_mask
        )

        self.source_transform_after_pad = torchvision.transforms.Compose(
            [
                vak_transforms.ViewAsWindowBatch(window_size),
                vak_transforms.ToFloatTensor(),
                # below, add channel at first dimension because windows become batch
                vak_transforms.AddChannel(channel_dim=channel_dim),
            ]
        )

    def __call__(self, frames, frames_path=None):
        """Transform one sample for prediction and pack it into an item.

        Parameters
        ----------
        frames
            Input that is standardized (if a standardizer was given),
            padded, and then passed to ``self.source_transform_after_pad``.
        frames_path : str or pathlib.Path, optional
            Path associated with ``frames``. Default is None,
            in which case the item has no ``"frames_path"`` key.

        Returns
        -------
        item : dict
            With key ``"frames"``,
            plus ``"padding_mask"`` when the padding transform returns one,
            and ``"frames_path"`` (as a str) when a path was given.
        """
        # explicit None check: the standardizer is an arbitrary object,
        # so its truthiness should not decide whether it is applied
        if self.spect_standardizer is not None:
            frames = self.spect_standardizer(frames)

        padding_mask = None
        if self.pad_to_window.return_padding_mask:
            frames, padding_mask = self.pad_to_window(frames)
        else:
            frames = self.pad_to_window(frames)

        item = {
            "frames": self.source_transform_after_pad(frames),
        }

        if padding_mask is not None:
            item["padding_mask"] = padding_mask

        if frames_path is not None:
            # make sure frames_path is a str, not a pathlib.Path
            item["frames_path"] = str(frames_path)

        return item




[docs]
def get_default_frame_classification_transform(
    mode: str, transform_kwargs: dict | None = None
) -> Callable:
    """Get default transform for frame classification model.

    Parameters
    ----------
    mode : str
        One of {'train', 'eval', 'predict'}.
        Determines which item transform class is instantiated.
    transform_kwargs : dict, optional
        Keyword arguments for transform class.
        Default is None.
        If supplied, should be a :class:`dict`,
        that can include the following key-value pairs:
            spect_standardizer : vak.transforms.StandardizeSpect
                instance that has already been fit to dataset, using fit_df method.
                Default is None, in which case no standardization transform is applied.
            window_size : int
                width of window in number of elements. Argument to PadToWindow transform.
                Required when ``mode`` is 'eval' or 'predict'; not used for 'train'.
            padval : float
                value to pad with. Added to end of array, the "right side" if 2-dimensional.
                Argument to PadToWindow transform. Default is 0.
            return_padding_mask : bool
                if True, the dictionary returned by ItemTransform classes will include
                a boolean vector to use for cropping back down to size before padding.
                padding_mask has size equal to width of padded array, i.e. original size
                plus padding at the end, and has values of 1 where
                columns in padded are from the original array,
                and values of 0 where columns were added for padding.
                Default is True.

    Returns
    -------
    transform: TrainItemTransform, EvalItemTransform, or PredictItemTransform

    Raises
    ------
    TypeError
        If ``spect_standardizer`` is supplied and is not an instance of
        ``vak.transforms.StandardizeSpect``.
    ValueError
        If ``mode`` is not one of 'train', 'eval', or 'predict'.
    KeyError
        If ``mode`` is 'eval' or 'predict' and ``transform_kwargs``
        has no 'window_size' key.
    """
    if transform_kwargs is None:
        transform_kwargs = {}

    # regardless of mode, transform always starts with StandardizeSpect, if used
    spect_standardizer = transform_kwargs.get("spect_standardizer", None)
    if spect_standardizer is not None and not isinstance(
        spect_standardizer, vak_transforms.StandardizeSpect
    ):
        raise TypeError(
            f"invalid type for spect_standardizer: {type(spect_standardizer)}. "
            "Should be an instance of vak.transforms.StandardizeSpect"
        )

    if mode == "train":
        return TrainItemTransform(spect_standardizer)

    if mode not in ("eval", "predict"):
        raise ValueError(f"invalid mode: {mode}")

    # eval and predict both window the frames, so both need a window size;
    # fail with context instead of a bare KeyError('window_size')
    if "window_size" not in transform_kwargs:
        raise KeyError(
            f"transform_kwargs must include 'window_size' when mode is '{mode}'"
        )

    # the two windowed transforms take the same constructor arguments
    transform_class = (
        EvalItemTransform if mode == "eval" else PredictItemTransform
    )
    return transform_class(
        spect_standardizer=spect_standardizer,
        window_size=transform_kwargs["window_size"],
        padval=transform_kwargs.get("padval", 0.0),
        return_padding_mask=transform_kwargs.get("return_padding_mask", True),
    )