"""Functionality to prepare splits of frame classification datasets
to generate a learning curve."""
from __future__ import annotations
import logging
import pathlib
from typing import Sequence
import attrs
import dask.bag as db
import numpy as np
import pandas as pd
from dask.diagnostics import ProgressBar
from ... import common, datasets
from .. import split
logger = logging.getLogger(__name__)
@attrs.define(frozen=True)
class Sample:
    """Dataclass representing one sample
    in a frame classification dataset.

    Used to add paths for arrays from the sample
    to a ``dataset_df``, and to build
    the ``sample_ids`` and ``inds_in_sample`` vectors
    for the entire dataset."""

    source_id: int = attrs.field()
    sample_id_vec: np.ndarray
    inds_in_sample_vec: np.ndarray
def make_index_vectors_for_each_subset(
    subsets_df: pd.DataFrame,
    dataset_path: str | pathlib.Path,
    input_type: str,
) -> None:
r"""Make npy files containing indexing vectors
for each subset of the training data
used to generate a learning curve
with a frame classification dataset.
This function is basically the same as
:func:`vak.prep.frame_classification.make_splits.make_splits`,
*except* that it only makes the indexing vectors
for each subset of the training data.
These indexing vectors are needed for each subset
to properly grab windows from the npy files during training.
There is no need to remake the npy files themselves though.
All the indexing vectors for each split are saved
in the "train" directory split inside ``dataset_path``.
The indexing vectors are used by
:class:`vak.datasets.frame_classification.WindowDataset`
and :class:`vak.datasets.frame_classification.FramesDataset`.
These vectors make it possible to work with files,
to avoid loading the entire dataset into memory,
and to avoid working with memory-mapped arrays.
    The first is the ``sample_ids`` vector,
    which represents the "ID" of any sample :math:`(x, y)` in the split.
    We use these IDs to load the array files corresponding to the samples.
    For a split with :math:`m` samples, this will be an array of length :math:`T`,
    the total number of frames across all samples,
    with elements :math:`i \in (0, 1, ..., m - 1)`
    indicating which frames belong to which sample :math:`x_i`:
    :math:`(0, 0, 0, ..., 1, 1, ..., m - 1, m - 1)`.

    The second vector is the ``inds_in_sample`` vector.
    This vector is the same length as ``sample_ids``, but its values represent
    the indices of frames within each sample :math:`x_i`.
    For a dataset with :math:`T` total frames across all samples,
    where :math:`t_i` is the number of frames in sample :math:`x_i`,
    this vector will look like
    :math:`(0, 1, ..., t_0 - 1, 0, 1, ..., t_1 - 1, ..., 0, 1, ..., t_{m-1} - 1)`.
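    For example, here is a minimal sketch (not code from :mod:`vak` itself)
    of how these two vectors could be built for a toy split
    with :math:`m = 3` samples of 4, 2, and 3 frames:

    .. code-block:: python

        import numpy as np

        frames_per_sample = [4, 2, 3]  # hypothetical frame counts per sample
        sample_ids = np.concatenate(
            [np.full(n, i, dtype=np.int32) for i, n in enumerate(frames_per_sample)]
        )
        inds_in_sample = np.concatenate(
            [np.arange(n) for n in frames_per_sample]
        )
        # sample_ids     -> [0 0 0 0 1 1 2 2 2]
        # inds_in_sample -> [0 1 2 3 0 1 0 1 2]
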
    Parameters
    ----------
    subsets_df : pandas.DataFrame
        A :class:`pandas.DataFrame` representing the training data subsets.
        This DataFrame is created by
        :func:`vak.prep.frame_classification.learncurve.make_subsets_from_dataset_df`,
        and then passed into this function.
        It is created from a :class:`pandas.DataFrame`
        returned by :func:`vak.prep.frame_classification.get_or_make_source_files`
        with a ``'split'`` column added.
    dataset_path : str, pathlib.Path
        Path to directory that represents the dataset.
    input_type : str
        The type of input to the neural network model.
        One of {'audio', 'spect'}.

    Returns
    -------
    None
    """
    # convert in case ``dataset_path`` was passed in as a string,
    # since we use it with the / operator below
    dataset_path = pathlib.Path(dataset_path)
    subsets = sorted(subsets_df.subset.dropna().unique())
    for subset in subsets:
        logger.info(f"Making indexing vectors for subset: {subset}")
        subset_df = subsets_df[subsets_df.subset == subset].copy()
        frames_paths = subset_df[
            datasets.frame_classification.constants.FRAMES_PATH_COL_NAME
        ].values
        def _return_index_arrays(
            source_id_path_tup,
        ):
            """Function we use with dask to parallelize.
            Defined in-line so the enclosing variables
            ``dataset_path`` and ``input_type`` are in scope.
            """
            source_id, frames_path = source_id_path_tup
            frames_path = dataset_path / pathlib.Path(frames_path)
            frames = datasets.frame_classification.helper.load_frames(
                frames_path, input_type
            )
            n_frames = frames.shape[-1]
            # map every frame in this sample to the sample's ID,
            # and to the frame's index within the sample
            sample_id_vec = np.full((n_frames,), source_id, dtype=np.int32)
            inds_in_sample_vec = np.arange(n_frames)
            return Sample(
                source_id,
                sample_id_vec,
                inds_in_sample_vec,
            )
        # ---- make indexing vectors for this subset, parallelized with dask,
        # using the nested function just defined
        source_id_frames_path_tups = [
            (source_id, frames_path)
            for source_id, frames_path in enumerate(frames_paths)
        ]
        source_id_frames_path_bag = db.from_sequence(
            source_id_frames_path_tups
        )
        with ProgressBar():
            # calling ``list`` on the mapped bag triggers the dask computation
            samples = list(source_id_frames_path_bag.map(_return_index_arrays))
        samples = sorted(samples, key=lambda sample: sample.source_id)
        # ---- save indexing vectors for this subset in the "train" directory
        sample_id_vec = np.concatenate(
            [sample.sample_id_vec for sample in samples]
        )
        np.save(
            dataset_path
            / "train"
            / datasets.frame_classification.helper.sample_ids_array_filename_for_subset(
                subset
            ),
            sample_id_vec,
        )
        inds_in_sample_vec = np.concatenate(
            [sample.inds_in_sample_vec for sample in samples]
        )
        np.save(
            dataset_path
            / "train"
            / datasets.frame_classification.helper.inds_in_sample_array_filename_for_subset(
                subset
            ),
            inds_in_sample_vec,
        )
def make_subsets_from_dataset_df(
    dataset_df: pd.DataFrame,
    input_type: str,
    train_set_durs: Sequence[float],
    num_replicates: int,
    dataset_path: pathlib.Path,
    labelmap: dict,
) -> pd.DataFrame:
"""Make subsets of the training data split for a learning curve.
Makes subsets given a dataframe representing the entire dataset,
with one subset for each combination of (training set duration,
replicate number). Each subset is randomly drawn
from the total training split.
Uses :func:`vak.prep.split.frame_classification_dataframe` to make
subsets of the training data from ``dataset_df``.
A new column will be added to the dataframe, `'subset'`,
and additional rows for each subset.
The dataframe is returned with these subsets added.
(The `'split'` for these rows will still be `'train'`.)
Additionally, a separate set of indexing vectors
will be made for each subset, using
:func:`vak.prep.frame_classification.learncurve.make_index_vectors_for_each_subset`.
    Below is an example of the directory structure
    after the indexing vectors for each subset are saved:

    .. code-block:: console

        032312-vak-frame-classification-dataset-generated-231005_121809
        ├── 032312_prep_231005_121809.csv
        ├── labelmap.json
        ├── metadata.json
        ├── prep_231005_121809.log
        ├── TweetyNet_learncurve_audio_cbin_annot_notmat.toml
        └── train
            ├── gy6or6_baseline_230312_0808.138.cbin.spect.frame_labels.npy
            ├── gy6or6_baseline_230312_0808.138.cbin.spect.frames.npy
            ├── gy6or6_baseline_230312_0809.141.cbin.spect.frame_labels.npy
            ├── gy6or6_baseline_230312_0809.141.cbin.spect.frames.npy
            ├── gy6or6_baseline_230312_0813.163.cbin.spect.frame_labels.npy
            ├── gy6or6_baseline_230312_0813.163.cbin.spect.frames.npy
            ├── gy6or6_baseline_230312_0816.179.cbin.spect.frame_labels.npy
            ├── gy6or6_baseline_230312_0816.179.cbin.spect.frames.npy
            ├── gy6or6_baseline_230312_0820.196.cbin.spect.frame_labels.npy
            ├── gy6or6_baseline_230312_0820.196.cbin.spect.frames.npy
            ├── inds_in_sample.npy
            ├── inds_in_sample-train-dur-4.0-replicate-1.npy
            ├── inds_in_sample-train-dur-4.0-replicate-2.npy
            ├── inds_in_sample-train-dur-6.0-replicate-1.npy
            ├── inds_in_sample-train-dur-6.0-replicate-2.npy
            ├── sample_ids.npy
            ├── sample_ids-train-dur-4.0-replicate-1.npy
            ├── sample_ids-train-dur-4.0-replicate-2.npy
            ├── sample_ids-train-dur-6.0-replicate-1.npy
            ├── sample_ids-train-dur-6.0-replicate-2.npy
            └── ...
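    The per-subset files shown above are named using subset names
    generated by the same helper this function calls,
    :func:`vak.common.learncurve.get_train_dur_replicate_subset_name`.
    A minimal sketch, assuming the durations and replicates from the example above:

    .. code-block:: python

        from vak.common.learncurve import get_train_dur_replicate_subset_name

        for train_dur in (4.0, 6.0):
            for replicate_num in (1, 2):
                print(get_train_dur_replicate_subset_name(train_dur, replicate_num))
        # expected to print names like 'train-dur-4.0-replicate-1',
        # matching the suffixes of the npy files above
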
    Parameters
    ----------
    dataset_df : pandas.DataFrame
        Dataframe representing a dataset for frame classification models.
        It is returned by
        :func:`vak.prep.frame_classification.get_or_make_source_files`,
        and has a ``'split'`` column added.
    input_type : str
        The type of input to the neural network model.
        One of {'audio', 'spect'}.
    train_set_durs : sequence of float
        Durations in seconds of subsets taken from the training data
        to create a learning curve, e.g., ``[5., 10., 15., 20.]``.
    num_replicates : int
        Number of times to replicate training for each training set duration,
        to better estimate metrics for a training set of that size.
        Each replicate uses a different randomly drawn subset of the training
        data (but of the same duration).
    dataset_path : str, pathlib.Path
        Directory where splits will be saved.
    labelmap : dict
        Dict that maps labels from the dataset annotations
        to the integer classes predicted by the network.
    Returns
    -------
    dataset_df_out : pandas.DataFrame
        A :class:`pandas.DataFrame` that has the original splits
        from ``dataset_df``, as well as the additional subsets
        of the training data, along with additional columns,
        ``'subset'``, ``'train_dur'``, and ``'replicate_num'``,
        that are used by :mod:`vak`.
        Other functions like :func:`vak.learncurve.learncurve`
        select a specific subset of the training data
        by getting the subset name with the function
        :func:`vak.common.learncurve.get_train_dur_replicate_subset_name`,
        and then filtering ``dataset_df_out`` with that name
        using the ``'subset'`` column.
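    For example, a sketch of selecting one subset from the returned dataframe
    (assuming a 4.0-second training set duration and replicate 1 exist):

    .. code-block:: python

        from vak.common.learncurve import get_train_dur_replicate_subset_name

        subset_name = get_train_dur_replicate_subset_name(4.0, 1)
        subset_df = dataset_df_out[dataset_df_out["subset"] == subset_name]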
"""
    dataset_path = pathlib.Path(dataset_path)

    # get just the train split, to pass to split.frame_classification_dataframe,
    # so we don't end up with other splits in the training subsets
    train_split_df = dataset_df[dataset_df["split"] == "train"].copy()
    labelset = {k for k in labelmap if k != "unlabeled"}

    # we will concat the subsets after the loop,
    # then concat them with the original dataset df
    subsets_df = []
    for train_dur in train_set_durs:
        logger.info(
            f"Subsetting training set for training set of duration: {train_dur}",
        )
        for replicate_num in range(1, num_replicates + 1):
            train_dur_replicate_subset_name = (
                common.learncurve.get_train_dur_replicate_subset_name(
                    train_dur, replicate_num
                )
            )

            train_dur_replicate_df = split.frame_classification_dataframe(
                # copy to avoid mutating original train_split_df
                train_split_df.copy(),
                dataset_path,
                train_dur=train_dur,
                labelset=labelset,
            )
            # remove rows where ``split`` was set to 'None'
            # by ``frame_classification_dataframe``
            train_dur_replicate_df = train_dur_replicate_df[
                train_dur_replicate_df.split == "train"
            ]
            # label the rows in this subset with its name, training set duration,
            # and replicate number, so they can be selected from the dataframe later
            train_dur_replicate_df["subset"] = train_dur_replicate_subset_name
            train_dur_replicate_df["train_dur"] = train_dur
            train_dur_replicate_df["replicate_num"] = replicate_num
            subsets_df.append(train_dur_replicate_df)
    subsets_df = pd.concat(subsets_df)

    make_index_vectors_for_each_subset(
        subsets_df,
        dataset_path,
        input_type,
    )

    # keep the same validation, test, and total train sets
    # by concatenating them with the train subsets
    dataset_df["subset"] = None  # add column, but leave it empty for the original splits
    dataset_df = pd.concat((subsets_df, dataset_df))

    # Reset the entire index across all splits, instead of repeating indices;
    # we set ``drop=True`` so the old index is discarded,
    # instead of being added as a new 'index' or 'level_0' column.
    # We need to do this after the ``pd.concat`` above,
    # since concatenating repeats indices from the original dataframes.
    dataset_df = dataset_df.reset_index(drop=True)
    return dataset_df