Source code for vak.prep.split.split

"""Functions for creating splits of datasets used with neural network models,
such as the standard train-val-test splits used with supervised learning methods."""

from __future__ import annotations

import logging
import pathlib

import numpy as np
import pandas as pd

from ...common.labels import from_df as labels_from_df
from .algorithms import brute_force
from .algorithms.validate import validate_split_durations

logger = logging.getLogger(__name__)


def train_test_dur_split_inds(
    durs,
    labels,
    labelset,
    train_dur,
    test_dur,
    val_dur=None,
    algo="brute_force",
):
    """Return indices to split a dataset into training, validation,
    and test sets of specified durations.

    Given the durations of a set of vocalizations, and labels from the
    annotations for those vocalizations, this function returns indices
    for splitting up the set into training, validation, and test sets.
    Using those indices will produce datasets that each contain
    instances of all labels in the set of labels.

    Parameters
    ----------
    durs : iterable
        Of float. Durations of audio files.
        Must support ``len`` and have the same length as ``labels``.
    labels : iterable
        Of numpy arrays of str or int. Labels for segments
        (syllables, phonemes, etc.) in audio files.
    labelset : set, list
        Set of unique labels for segments in files. Used to verify that
        each returned array of indices will produce a set that contains
        instances of all labels found in original set.
    train_dur : float
        Target duration for training set, in seconds.
    test_dur : float
        Target duration for test set, in seconds.
    val_dur : float
        Target duration for validation set, in seconds.
        Default is None.
    algo : str
        Algorithm to use. Currently only ``'brute_force'`` is implemented;
        any other value raises ``NotImplementedError``.
        Default is ``'brute_force'``. For more information on the
        algorithm, see the docstring of ``brute_force``
        in the ``algorithms`` sub-package next to this module.

    Returns
    -------
    train_inds, val_inds, test_inds
        Indices to use with some array-like object to produce sets of
        the specified durations, **in that order** (validation before
        test). These are whatever the splitting algorithm returns;
        callers in this module treat a ``None`` entry as
        "no rows assigned to this split".

    Raises
    ------
    ValueError
        If ``durs`` and ``labels`` differ in length, or if the total
        target duration exceeds the total duration of the dataset.
    NotImplementedError
        If ``algo`` is not ``'brute_force'``.
    """
    if len(durs) != len(labels):
        raise ValueError(
            f"length of durs, {len(durs)} does not equal length of labels, {len(labels)}"
        )

    total_dur = sum(durs)
    # note the argument order expected by the validator: train, val, test
    train_dur, val_dur, test_dur = validate_split_durations(
        train_dur, val_dur, test_dur, total_dur
    )

    # splits that are still None after validation contribute nothing to the target
    total_target_dur = sum(
        dur for dur in (train_dur, test_dur, val_dur) if dur is not None
    )
    if total_target_dur > total_dur:
        raise ValueError(
            f"Total duration of dataset, {total_dur} seconds, is less than total target duration of "
            f"training, test, and (if specified) validation sets: {total_target_dur}"
        )

    logger.info(
        f"Total target duration of splits: {total_target_dur} seconds. "
        f"Will be drawn from dataset with total duration: {total_dur:.3f}."
    )

    if algo == "brute_force":
        train_inds, val_inds, test_inds = brute_force(
            durs, labels, labelset, train_dur, val_dur, test_dur
        )
    else:
        raise NotImplementedError(f"algorithm {algo} not implemented")

    return train_inds, val_inds, test_inds
def frame_classification_dataframe(
    dataset_df: pd.DataFrame,
    dataset_path: str | pathlib.Path,
    labelset: set,
    train_dur: float | None = None,
    test_dur: float | None = None,
    val_dur: float | None = None,
) -> pd.DataFrame:
    """Create datasets splits from a dataframe
    representing a frame classification dataset.

    Splits dataset into training, test, and (optionally) validation
    subsets, specified by their duration.

    Additionally, adds a 'split' column to the dataframe,
    that assigns each row to 'train', 'val', 'test', or 'None'.

    Parameters
    ----------
    dataset_df : pandas.DataFrame
        A pandas DataFrame representing the samples in a dataset
        generated by ``vak prep``. Must have a 'duration' column.
    dataset_path : str, pathlib.Path
        Path to dataset, a directory generated by running ``vak prep``.
    labelset : set, list
        The set of label classes for vocalizations in dataset.
    train_dur : float
        Total duration of training set, in seconds. Default is None.
    test_dur : float
        Total duration of test set, in seconds. Default is None.
    val_dur : float
        Total duration of validation set, in seconds. Default is None.

    Returns
    -------
    dataset_df : pandas.DataFrame
        A copy of the input dataset with a 'split' column added,
        that assigns each vocalization (row) to a subset,
        i.e., 'train', 'val', or 'test'.
        If the vocalization was not added to one of the subsets,
        its value for 'split' will be the string 'None'.

    Raises
    ------
    NotADirectoryError
        If ``dataset_path`` does not exist or is not a directory.

    Notes
    -----
    Uses the function :func:`train_test_dur_split_inds`,
    defined in this module, to find indices for each subset.
    """
    dataset_path = pathlib.Path(dataset_path)
    # ``is_dir`` is already False for a path that does not exist, so one check
    # covers both "not found" and "found, but not a directory"
    if not dataset_path.is_dir():
        raise NotADirectoryError(
            f"`dataset_path` not found or not recognized as a directory: {dataset_path}"
        )

    # don't want this function to have unexpected side effects, so work on a copy
    dataset_df = dataset_df.copy()

    labels = labels_from_df(dataset_df, dataset_path)
    durs = dataset_df["duration"].values
    train_inds, val_inds, test_inds = train_test_dur_split_inds(
        durs=durs,
        labels=labels,
        labelset=labelset,
        train_dur=train_dur,
        test_dur=test_dur,
        val_dur=val_dur,
    )

    # start off with all elements set to the string 'None', so rows that are
    # not assigned to any subset need no further handling afterwards
    split_col = np.full(len(dataset_df), "None", dtype=object)
    for split_name, split_inds in (
        ("train", train_inds),
        ("val", val_inds),
        ("test", test_inds),
    ):
        # a split whose indices are None was not requested / not produced
        if split_inds is not None:
            split_col[split_inds] = split_name

    # add split column to dataframe
    dataset_df["split"] = split_col
    return dataset_df
def unit_dataframe(
    dataset_df: pd.DataFrame,
    dataset_path: str | pathlib.Path,
    labelset: set,
    train_dur: float | None = None,
    test_dur: float | None = None,
    val_dur: float | None = None,
) -> pd.DataFrame:
    """Create datasets splits from a dataframe
    representing a unit dataset.

    Splits dataset into training, test, and (optionally) validation
    subsets, specified by their duration.

    Additionally adds a 'split' column to the dataframe,
    that assigns each row to 'train', 'val', 'test', or 'None'.

    Parameters
    ----------
    dataset_df : pandas.DataFrame
        A pandas DataFrame representing the samples in a dataset,
        generated by ``vak prep``. Must have 'label' and 'duration'
        columns; each row's single label is used on its own when
        computing the split.
    dataset_path : str, pathlib.Path
        Path to dataset, a directory generated by running ``vak prep``.
        Only checked for existence here.
    labelset : set, list
        The set of label classes for vocalizations in dataset.
    train_dur : float
        Total duration of training set, in seconds. Default is None.
    test_dur : float
        Total duration of test set, in seconds. Default is None.
    val_dur : float
        Total duration of validation set, in seconds. Default is None.

    Returns
    -------
    dataset_df : pandas.DataFrame
        A copy of the input dataset with a 'split' column added,
        that assigns each vocalization (row) to a subset,
        i.e., 'train', 'val', or 'test'.
        If the vocalization was not added to one of the subsets,
        its value for 'split' will be the string 'None'.

    Raises
    ------
    NotADirectoryError
        If ``dataset_path`` does not exist or is not a directory.

    Notes
    -----
    Uses the function :func:`train_test_dur_split_inds`,
    defined in this module, to find indices for each subset.
    """
    dataset_path = pathlib.Path(dataset_path)
    # ``is_dir`` is already False for a path that does not exist, so one check
    # covers both "not found" and "found, but not a directory"
    if not dataset_path.is_dir():
        raise NotADirectoryError(
            f"`dataset_path` not found or not recognized as a directory: {dataset_path}"
        )

    # don't want this function to have unexpected side effects, so work on a copy
    dataset_df = dataset_df.copy()

    # the splitting function expects one array of labels per sample;
    # here every sample (row) carries exactly one label, so wrap each in a 1-element array
    labels = [np.array([label]) for label in dataset_df["label"].values]
    durs = dataset_df["duration"].values
    train_inds, val_inds, test_inds = train_test_dur_split_inds(
        durs=durs,
        labels=labels,
        labelset=labelset,
        train_dur=train_dur,
        test_dur=test_dur,
        val_dur=val_dur,
    )

    # start off with all elements set to the string 'None', so rows that are
    # not assigned to any subset need no further handling afterwards
    split_col = np.full(len(dataset_df), "None", dtype=object)
    for split_name, split_inds in (
        ("train", train_inds),
        ("val", val_inds),
        ("test", test_inds),
    ):
        # a split whose indices are None was not requested / not produced
        if split_inds is not None:
            split_col[split_inds] = split_name

    # add split column to dataframe
    dataset_df["split"] = split_col
    return dataset_df