Source code for vak.prep.split.split

"""Functions for creating splits of datasets used with neural network models,
such as the standard train-val-test splits used with supervised learning methods."""

from __future__ import annotations

import logging
import pathlib

import numpy as np
import pandas as pd

from ...common.labels import from_df as labels_from_df
from .algorithms import brute_force
from .algorithms.validate import validate_split_durations

logger = logging.getLogger(__name__)



[docs]
def train_test_dur_split_inds(
    durs,
    labels,
    labelset,
    train_dur,
    test_dur,
    val_dur=None,
    algo="brute_force",
):
    """Return indices to split a dataset into training, validation, and test sets of specified durations.

    Given the durations of a set of vocalizations, and labels from the annotations for those vocalizations,
    this function returns indices for splitting up the set into training, validation,
    and test sets.

    Using those indices is meant to produce datasets that each contain instances
    of all labels in the set of labels; that guarantee is the responsibility of
    the splitting algorithm that this function delegates to.

    Parameters
    ----------
    durs : sequence
        Of float. Durations of audio files.
        Must support ``len()``, and have the same length as ``labels``.
    labels : sequence
        Of numpy arrays of str or int. Labels for segments (syllables, phonemes, etc.) in audio files.
        Must support ``len()``, and have the same length as ``durs``.
    labelset : set, list
        set of unique labels for segments in files. Passed to the splitting algorithm,
        which uses it to verify that each returned array of indices will produce
        a set that contains instances of all labels found in original set.
    train_dur : float
        Target duration for training set, in seconds.
    test_dur : float
        Target duration for test set, in seconds.
    val_dur : float
        Target duration for validation set, in seconds. Default is None.
        If None, no validation set is requested.
    algo : str
        algorithm to use. Currently only 'brute_force' is implemented,
        and it is the default. Any other value raises NotImplementedError.
        For more information on the algorithm, see the docstring
        of the ``brute_force`` function imported by this module.

    Returns
    -------
    train_inds, val_inds, test_inds
        Indices to use with some array-like object to produce sets of specified durations,
        exactly as returned by the splitting algorithm.
        Note the order: validation indices come *before* test indices.
        Callers in this module treat any of the three being None
        as "no such split".

    Raises
    ------
    ValueError
        If ``durs`` and ``labels`` differ in length, or if the total target
        duration of the splits is greater than the total duration of the dataset.
    NotImplementedError
        If ``algo`` is anything other than 'brute_force'.
    """
    if len(durs) != len(labels):
        raise ValueError(
            f"length of durs, {len(durs)} does not equal length of labels, {len(labels)}"
        )

    total_dur = sum(durs)
    # the validation helper returns the durations in (train, val, test) order,
    # which is not the order of this function's own parameters
    train_dur, val_dur, test_dur = validate_split_durations(
        train_dur, val_dur, test_dur, total_dur
    )

    # NOTE(review): this sum assumes every non-None duration returned by
    # ``validate_split_durations`` is a real duration in seconds; if that helper
    # can hand back sentinel values, the total checked and logged below would
    # be off -- confirm against the helper.
    total_target_dur = sum(
        dur for dur in (train_dur, test_dur, val_dur) if dur is not None
    )

    if total_target_dur > total_dur:
        raise ValueError(
            f"Total duration of dataset, {total_dur} seconds, is less than total target duration of "
            f"training, test, and (if specified) validation sets: {total_target_dur}"
        )

    logger.info(
        f"Total target duration of splits: {total_target_dur} seconds. "
        f"Will be drawn from dataset with total duration: {total_dur:.3f}."
    )

    if algo == "brute_force":
        train_inds, val_inds, test_inds = brute_force(
            durs, labels, labelset, train_dur, val_dur, test_dur
        )
    else:
        raise NotImplementedError(f"algorithm {algo} not implemented")

    return train_inds, val_inds, test_inds




[docs]
def frame_classification_dataframe(
    dataset_df: pd.DataFrame,
    dataset_path: str | pathlib.Path,
    labelset: set,
    train_dur: float | None = None,
    test_dur: float | None = None,
    val_dur: float | None = None,
):
    """Create datasets splits from a dataframe
    representing a frame classification dataset.

    Splits dataset into training, test, and (optionally) validation subsets,
    specified by their duration.

    Additionally, adds a 'split' column to the dataframe,
    that assigns each row to 'train', 'val', 'test', or 'None'.

    Parameters
    ----------
    dataset_df : pandas.Dataframe
        A pandas DataFrame representing the samples in a dataset generated by ``vak prep``.
    dataset_path : str
        Path to dataset, a directory generated by running ``vak prep``.
    labelset : set, list
        The set of label classes for vocalizations in dataset.
    train_dur : float
        Total duration of training set, in seconds. Default is None
    test_dur : float
        Total duration of test set, in seconds. Default is None.
    val_dur : float
        Total duration of validation set, in seconds. Default is None.

    Returns
    -------
    dataset_df : pandas.Dataframe
        A copy of the input dataset with a 'split' column added,
        that assigns each vocalization (row) to a subset,
        i.e., train, validation, or test.
        If the vocalization was not added to one of the subsets,
        its value for 'split' will be 'None'.

    Notes
    -----
    Uses the function :func:`vak.dataset.split.train_test_dur_split_inds`
    to find indices for each subset.
    """
    dataset_path = pathlib.Path(dataset_path)
    if not dataset_path.exists() or not dataset_path.is_dir():
        raise NotADirectoryError(
            f"`dataset_path` not found or not recognized as a directory: {dataset_path}"
        )

    dataset_df = (
        dataset_df.copy()
    )  # don't want this function to have unexpected side effects, so return a copy
    labels = labels_from_df(dataset_df, dataset_path)

    durs = dataset_df["duration"].values
    train_inds, val_inds, test_inds = train_test_dur_split_inds(
        durs=durs,
        labels=labels,
        labelset=labelset,
        train_dur=train_dur,
        test_dur=test_dur,
        val_dur=val_dur,
    )

    # start off with all elements set to 'None'
    # so we don't have to change any that are not assigned to one of the subsets to 'None' after
    split_col = np.asarray(
        ["None" for _ in range(len(dataset_df))], dtype="object"
    )
    split_zip = zip(
        ["train", "val", "test"], [train_inds, val_inds, test_inds]
    )
    for split_name, split_inds in split_zip:
        if split_inds is not None:
            split_col[split_inds] = split_name

    # add split column to dataframe
    dataset_df["split"] = split_col

    return dataset_df




[docs]
def unit_dataframe(
    dataset_df: pd.DataFrame,
    dataset_path: str | pathlib.Path,
    labelset: set,
    train_dur: float | None = None,
    test_dur: float | None = None,
    val_dur: float | None = None,
):
    """Create datasets splits from a dataframe
    representing a unit dataset.

    Splits dataset into training, test, and (optionally) validation subsets,
    specified by their duration.

    Additionally adds a 'split' column to the dataframe,
    that assigns each row to 'train', 'val', 'test', or 'None'.

    Parameters
    ----------
    dataset_df : pandas.Dataframe
        A pandas DataFrame representing the samples in a dataset,
        generated by ``vak prep``.
    dataset_path : str
        Path to dataset, a directory generated by running ``vak prep``.
    labelset : set, list
        The set of label classes for vocalizations in dataset.
    train_dur : float
        Total duration of training set, in seconds. Default is None
    test_dur : float
        Total duration of test set, in seconds. Default is None.
    val_dur : float
        Total duration of validation set, in seconds. Default is None.

    Returns
    -------
    dataset_df : pandas.Dataframe
        A copy of the input dataset with a 'split' column added,
        that assigns each vocalization (row) to a subset,
        i.e., train, validation, or test.
        If the vocalization was not added to one of the subsets,
        its value for 'split' will be 'None'.

    Notes
    -----
    Uses the function :func:`vak.dataset.split.train_test_dur_split_inds`
    to find indices for each subset.
    """
    dataset_path = pathlib.Path(dataset_path)
    if not dataset_path.exists() or not dataset_path.is_dir():
        raise NotADirectoryError(
            f"`dataset_path` not found or not recognized as a directory: {dataset_path}"
        )

    dataset_df = (
        dataset_df.copy()
    )  # don't want this function to have unexpected side effects, so return a copy
    labels = [np.array([label]) for label in dataset_df.label.values]

    durs = dataset_df["duration"].values
    train_inds, val_inds, test_inds = train_test_dur_split_inds(
        durs=durs,
        labels=labels,
        labelset=labelset,
        train_dur=train_dur,
        test_dur=test_dur,
        val_dur=val_dur,
    )

    # start off with all elements set to 'None'
    # so we don't have to change any that are not assigned to one of the subsets to 'None' after
    split_col = np.asarray(
        ["None" for _ in range(len(dataset_df))], dtype="object"
    )
    split_zip = zip(
        ["train", "val", "test"], [train_inds, val_inds, test_inds]
    )
    for split_name, split_inds in split_zip:
        if split_inds is not None:
            split_col[split_inds] = split_name

    # add split column to dataframe
    dataset_df["split"] = split_col

    return dataset_df