"""Functionality to prepare splits of frame classification datasets
to generate a learning curve."""
from __future__ import annotations
import logging
import pathlib
from typing import Sequence
import attrs
import dask.bag as db
import numpy as np
import pandas as pd
from dask.diagnostics import ProgressBar
from ... import common, datasets
from .. import split
logger = logging.getLogger(__name__)
@attrs.define(frozen=True)
class Sample:
    """Dataclass representing one sample
    in a frame classification dataset.

    Used to add paths for arrays from the sample
    to a ``dataset_df``, and to build
    the ``sample_ids`` and ``inds_in_sample`` vectors
    for the entire dataset."""

    source_id: int = attrs.field()
    sample_id_vec: np.ndarray
    inds_in_sample_vec: np.ndarray
def make_index_vectors_for_each_subset(
    subsets_df: pd.DataFrame,
    dataset_path: str | pathlib.Path,
    input_type: str,
) -> None:
r"""Make npy files containing indexing vectors
for each subset of the training data
used to generate a learning curve
with a frame classification dataset.
This function is basically the same as
:func:`vak.prep.frame_classification.make_splits.make_splits`,
*except* that it only makes the indexing vectors
for each subset of the training data.
These indexing vectors are needed for each subset
to properly grab windows from the npy files during training.
There is no need to remake the npy files themselves though.
All the indexing vectors for each split are saved
in the "train" directory split inside ``dataset_path``.
The indexing vectors are used by
:class:`vak.datasets.frame_classification.WindowDataset`
and :class:`vak.datasets.frame_classification.FramesDataset`.
These vectors make it possible to work with files,
to avoid loading the entire dataset into memory,
and to avoid working with memory-mapped arrays.
    The first is the ``sample_ids`` vector,
    which represents the "ID" of any sample :math:`(x, y)` in the split.
    We use these IDs to load the array files corresponding to the samples.
    For a split with :math:`m` samples, this will be an array of length :math:`T`,
    the total number of frames across all samples,
    with elements :math:`i \in (0, 1, ..., m - 1)`
    indicating which frames belong to which sample :math:`x_i`:
    :math:`(0, 0, 0, ..., 1, 1, ..., m - 1, m - 1)`.

    The second vector is the ``inds_in_sample`` vector.
    This vector is the same length as ``sample_ids``, but its values represent
    the indices of frames within each sample :math:`x_i`.
    For a dataset with :math:`T` total frames across all samples,
    where :math:`t_i` is the number of frames in sample :math:`x_i`,
    this vector will look like
    :math:`(0, 1, ..., t_0 - 1, 0, 1, ..., t_1 - 1, ..., 0, 1, ..., t_{m-1} - 1)`.
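    For example, here is a minimal sketch (not code from :mod:`vak` itself)
    of how these two vectors could be built for a toy split
    with :math:`m = 3` samples of 4, 2, and 3 frames:

    .. code-block:: python

        import numpy as np

        frames_per_sample = [4, 2, 3]  # hypothetical frame counts per sample
        sample_ids = np.concatenate(
            [np.full(n, i, dtype=np.int32) for i, n in enumerate(frames_per_sample)]
        )
        inds_in_sample = np.concatenate(
            [np.arange(n) for n in frames_per_sample]
        )
        # sample_ids     -> [0 0 0 0 1 1 2 2 2]
        # inds_in_sample -> [0 1 2 3 0 1 0 1 2]
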
    Parameters
    ----------
    subsets_df : pandas.DataFrame
        A :class:`pandas.DataFrame` representing the training data subsets.
        This DataFrame is created by
        :func:`vak.prep.frame_classification.learncurve.make_subsets_from_dataset_df`,
        and then passed into this function.
        It is created from a :class:`pandas.DataFrame`
        returned by :func:`vak.prep.frame_classification.get_or_make_source_files`
        with a ``'split'`` column added.
    dataset_path : str, pathlib.Path
        Path to directory that represents the dataset.
    input_type : str
        The type of input to the neural network model.
        One of {'audio', 'spect'}.

    Returns
    -------
    None
    """
    # convert in case ``dataset_path`` was passed in as a string,
    # since we use it with the / operator below
    dataset_path = pathlib.Path(dataset_path)
    subsets = sorted(subsets_df.subset.dropna().unique())
    for subset in subsets:
        logger.info(f"Making indexing vectors for subset: {subset}")
        subset_df = subsets_df[subsets_df.subset == subset].copy()
        frames_paths = subset_df[
            datasets.frame_classification.constants.FRAMES_PATH_COL_NAME
        ].values
        def _return_index_arrays(
            source_id_path_tup,
        ):
            """Function we use with dask to parallelize.
            Defined in-line so the enclosing variables
            ``dataset_path`` and ``input_type`` are in scope.
            """
            source_id, frames_path = source_id_path_tup
            frames_path = dataset_path / pathlib.Path(frames_path)
            frames = datasets.frame_classification.helper.load_frames(
                frames_path, input_type
            )
            n_frames = frames.shape[-1]
            # map every frame in this sample to the sample's ID,
            # and to the frame's index within the sample
            sample_id_vec = np.full((n_frames,), source_id, dtype=np.int32)
            inds_in_sample_vec = np.arange(n_frames)
            return Sample(
                source_id,
                sample_id_vec,
                inds_in_sample_vec,
            )
        # ---- make indexing vectors for this subset, parallelized with dask,
        # using the nested function just defined
        source_id_frames_path_tups = [
            (source_id, frames_path)
            for source_id, frames_path in enumerate(frames_paths)
        ]
        source_id_frames_path_bag = db.from_sequence(
            source_id_frames_path_tups
        )
        with ProgressBar():
            # calling ``list`` on the mapped bag triggers the dask computation
            samples = list(source_id_frames_path_bag.map(_return_index_arrays))
        samples = sorted(samples, key=lambda sample: sample.source_id)
        # ---- save indexing vectors for this subset in the "train" directory
        sample_id_vec = np.concatenate(
            [sample.sample_id_vec for sample in samples]
        )
        np.save(
            dataset_path
            / "train"
            / datasets.frame_classification.helper.sample_ids_array_filename_for_subset(
                subset
            ),
            sample_id_vec,
        )
        inds_in_sample_vec = np.concatenate(
            [sample.inds_in_sample_vec for sample in samples]
        )
        np.save(
            dataset_path
            / "train"
            / datasets.frame_classification.helper.inds_in_sample_array_filename_for_subset(
                subset
            ),
            inds_in_sample_vec,
        )
def make_subsets_from_dataset_df(
    dataset_df: pd.DataFrame,
    input_type: str,
    train_set_durs: Sequence[float],
    num_replicates: int,
    dataset_path: pathlib.Path,
    labelmap: dict,
) -> pd.DataFrame:
"""Make subsets of the training data split for a learning curve.
Makes subsets given a dataframe representing the entire dataset,
with one subset for each combination of (training set duration,
replicate number). Each subset is randomly drawn
from the total training split.
Uses :func:`vak.prep.split.frame_classification_dataframe` to make
subsets of the training data from ``dataset_df``.
A new column will be added to the dataframe, `'subset'`,
and additional rows for each subset.
The dataframe is returned with these subsets added.
(The `'split'` for these rows will still be `'train'`.)
Additionally, a separate set of indexing vectors
will be made for each subset, using
:func:`vak.prep.frame_classification.learncurve.make_index_vectors_for_each_subset`.
    Below is an example of the directory structure
    after the indexing vectors for each subset are saved:

    .. code-block:: console

        032312-vak-frame-classification-dataset-generated-231005_121809
        ├── 032312_prep_231005_121809.csv
        ├── labelmap.json
        ├── metadata.json
        ├── prep_231005_121809.log
        ├── TweetyNet_learncurve_audio_cbin_annot_notmat.toml
        └── train
            ├── gy6or6_baseline_230312_0808.138.cbin.spect.frame_labels.npy
            ├── gy6or6_baseline_230312_0808.138.cbin.spect.frames.npy
            ├── gy6or6_baseline_230312_0809.141.cbin.spect.frame_labels.npy
            ├── gy6or6_baseline_230312_0809.141.cbin.spect.frames.npy
            ├── gy6or6_baseline_230312_0813.163.cbin.spect.frame_labels.npy
            ├── gy6or6_baseline_230312_0813.163.cbin.spect.frames.npy
            ├── gy6or6_baseline_230312_0816.179.cbin.spect.frame_labels.npy
            ├── gy6or6_baseline_230312_0816.179.cbin.spect.frames.npy
            ├── gy6or6_baseline_230312_0820.196.cbin.spect.frame_labels.npy
            ├── gy6or6_baseline_230312_0820.196.cbin.spect.frames.npy
            ├── inds_in_sample.npy
            ├── inds_in_sample-train-dur-4.0-replicate-1.npy
            ├── inds_in_sample-train-dur-4.0-replicate-2.npy
            ├── inds_in_sample-train-dur-6.0-replicate-1.npy
            ├── inds_in_sample-train-dur-6.0-replicate-2.npy
            ├── sample_ids.npy
            ├── sample_ids-train-dur-4.0-replicate-1.npy
            ├── sample_ids-train-dur-4.0-replicate-2.npy
            ├── sample_ids-train-dur-6.0-replicate-1.npy
            ├── sample_ids-train-dur-6.0-replicate-2.npy
            └── ...
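    The per-subset files shown above are named using subset names
    generated by the same helper this function calls,
    :func:`vak.common.learncurve.get_train_dur_replicate_subset_name`.
    A minimal sketch, assuming the durations and replicates from the example above:

    .. code-block:: python

        from vak.common.learncurve import get_train_dur_replicate_subset_name

        for train_dur in (4.0, 6.0):
            for replicate_num in (1, 2):
                print(get_train_dur_replicate_subset_name(train_dur, replicate_num))
        # expected to print names like 'train-dur-4.0-replicate-1',
        # matching the suffixes of the npy files above
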
    Parameters
    ----------
    dataset_df : pandas.DataFrame
        Dataframe representing a dataset for frame classification models.
        It is returned by
        :func:`vak.prep.frame_classification.get_or_make_source_files`,
        and has a ``'split'`` column added.
    input_type : str
        The type of input to the neural network model.
        One of {'audio', 'spect'}.
    train_set_durs : sequence of float
        Durations in seconds of subsets taken from the training data
        to create a learning curve, e.g., ``[5., 10., 15., 20.]``.
    num_replicates : int
        Number of times to replicate training for each training set duration,
        to better estimate metrics for a training set of that size.
        Each replicate uses a different randomly drawn subset of the training
        data (but of the same duration).
    dataset_path : str, pathlib.Path
        Directory where splits will be saved.
    labelmap : dict
        Dict that maps labels from the dataset annotations
        to the integer classes predicted by the network.
    Returns
    -------
    dataset_df_out : pandas.DataFrame
        A :class:`pandas.DataFrame` that has the original splits
        from ``dataset_df``, as well as the additional subsets
        of the training data, along with additional columns,
        ``'subset'``, ``'train_dur'``, and ``'replicate_num'``,
        that are used by :mod:`vak`.
        Other functions like :func:`vak.learncurve.learncurve`
        select a specific subset of the training data
        by getting the subset name with the function
        :func:`vak.common.learncurve.get_train_dur_replicate_subset_name`,
        and then filtering ``dataset_df_out`` with that name
        using the ``'subset'`` column.
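    For example, a sketch of selecting one subset from the returned dataframe
    (assuming a 4.0-second training set duration and replicate 1 exist):

    .. code-block:: python

        from vak.common.learncurve import get_train_dur_replicate_subset_name

        subset_name = get_train_dur_replicate_subset_name(4.0, 1)
        subset_df = dataset_df_out[dataset_df_out["subset"] == subset_name]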
"""
    dataset_path = pathlib.Path(dataset_path)

    # get just the train split, to pass to split.frame_classification_dataframe,
    # so we don't end up with other splits in the training subsets
    train_split_df = dataset_df[dataset_df["split"] == "train"].copy()
    labelset = {k for k in labelmap if k != "unlabeled"}

    # we will concat the subsets after the loop,
    # then concat them with the original dataset df
    subsets_df = []
    for train_dur in train_set_durs:
        logger.info(
            f"Subsetting training set for training set of duration: {train_dur}",
        )
        for replicate_num in range(1, num_replicates + 1):
            train_dur_replicate_subset_name = (
                common.learncurve.get_train_dur_replicate_subset_name(
                    train_dur, replicate_num
                )
            )

            train_dur_replicate_df = split.frame_classification_dataframe(
                # copy to avoid mutating original train_split_df
                train_split_df.copy(),
                dataset_path,
                train_dur=train_dur,
                labelset=labelset,
            )
            # remove rows where ``split`` was set to 'None'
            # by ``frame_classification_dataframe``
            train_dur_replicate_df = train_dur_replicate_df[
                train_dur_replicate_df.split == "train"
            ]
            # label the rows in this subset with its name, training set duration,
            # and replicate number, so they can be selected from the dataframe later
            train_dur_replicate_df["subset"] = train_dur_replicate_subset_name
            train_dur_replicate_df["train_dur"] = train_dur
            train_dur_replicate_df["replicate_num"] = replicate_num
            subsets_df.append(train_dur_replicate_df)
    subsets_df = pd.concat(subsets_df)

    make_index_vectors_for_each_subset(
        subsets_df,
        dataset_path,
        input_type,
    )

    # keep the same validation, test, and total train sets
    # by concatenating them with the train subsets
    dataset_df["subset"] = None  # add column, but leave it empty for the original splits
    dataset_df = pd.concat((subsets_df, dataset_df))

    # Reset the entire index across all splits, instead of repeating indices;
    # we set ``drop=True`` so the old index is discarded,
    # instead of being added as a new 'index' or 'level_0' column.
    # We need to do this after the ``pd.concat`` above,
    # since concatenating repeats indices from the original dataframes.
    dataset_df = dataset_df.reset_index(drop=True)
    return dataset_df