Source code for vak.prep.frame_classification.frame_classification

"""Function that prepares datasets for neural network models
that perform the frame classification task."""

from __future__ import annotations

import json
import logging
import pathlib
import warnings

import crowsetta.formats.seq
import pandas as pd

from ... import datasets
from ...common import labels
from ...common.converters import expanded_user_path, labelset_to_set
from ...common.logging import config_logging_for_cli, log_version
from ...common.timenow import get_timenow_as_str
from .. import dataset_df_helper, sequence_dataset
from . import validators
from .assign_samples_to_splits import assign_samples_to_splits
from .learncurve import make_subsets_from_dataset_df
from .make_splits import make_splits
from .source_files import get_or_make_source_files

logger = logging.getLogger(__name__)


def prep_frame_classification_dataset(
    data_dir: str | pathlib.Path,
    input_type: str,
    purpose: str,
    output_dir: str | pathlib.Path | None = None,
    audio_format: str | None = None,
    spect_format: str | None = None,
    spect_params: dict | None = None,
    annot_format: str | None = None,
    annot_file: str | pathlib.Path | None = None,
    labelset: set | None = None,
    audio_dask_bag_kwargs: dict | None = None,
    train_dur: float | None = None,
    val_dur: float | None = None,
    test_dur: float | None = None,
    train_set_durs: list[float] | None = None,
    num_replicates: int | None = None,
    spect_key: str = "s",
    timebins_key: str = "t",
    freqbins_key: str = "f",
):
    """Prepare datasets for neural network models
    that perform the frame classification task.

    For general information on dataset preparation,
    see the docstring for :func:`vak.prep.prep`.

    Parameters
    ----------
    data_dir : str, Path
        Path to directory with files from which to make dataset.
    input_type : str
        The type of input to the neural network model.
        One of {'audio', 'spect'}.
    purpose : str
        Purpose of the dataset.
        One of {'train', 'eval', 'predict', 'learncurve'}.
        These correspond to commands of the vak command-line interface.
    output_dir : str
        Path to location where data sets should be saved.
        Default is ``None``, in which case it defaults to ``data_dir``.
    audio_format : str
        Format of audio files. One of {'wav', 'cbin'}.
        Default is ``None``, but either ``audio_format`` or ``spect_format``
        must be specified.
    spect_format : str
        Format of files containing spectrograms as 2-d matrices.
        One of {'mat', 'npz'}.
        Default is ``None``, but either ``audio_format`` or ``spect_format``
        must be specified.
    spect_params : dict, vak.config.SpectParams
        Parameters for creating spectrograms. Default is ``None``.
    annot_format : str
        Format of annotations. Any format that can be used with the
        :mod:`crowsetta` library is valid. Default is ``None``.
    annot_file : str
        Path to a single annotation file. Default is ``None``.
        Used when a single file annotates multiple audio
        or spectrogram files.
    labelset : str, list, set
        Set of unique labels for vocalizations. Strings or integers.
        Default is ``None``. If not ``None``, then files will be skipped
        where the associated annotation contains labels not found in
        ``labelset``. ``labelset`` is converted to a Python ``set``
        using :func:`vak.converters.labelset_to_set`.
        See help for that function for details on how to specify ``labelset``.
    audio_dask_bag_kwargs : dict
        Keyword arguments used when calling :func:`dask.bag.from_sequence`
        inside :func:`vak.io.audio`, where it is used to parallelize
        the conversion of audio files into spectrograms.
        Option should be specified in config.toml file as an inline table,
        e.g., ``audio_dask_bag_kwargs = { npartitions = 20 }``.
        Allows for finer-grained control when needed to process files
        of different sizes.
    train_dur : float
        Total duration of training set, in seconds.
        When creating a learning curve,
        training subsets of shorter duration will be drawn from this set.
        Default is ``None``.
    val_dur : float
        Total duration of validation set, in seconds.
        Default is ``None``.
    test_dur : float
        Total duration of test set, in seconds.
        Default is ``None``.
    train_set_durs : list
        Durations, in seconds, of subsets taken from the training data
        to create a learning curve, e.g., [5, 10, 15, 20].
    num_replicates : int
        Number of times to replicate training for each training set duration,
        to better estimate metrics for a training set of that size.
        Each replicate uses a different randomly drawn subset of the
        training data (but of the same duration).
    spect_key : str
        Key for accessing spectrogram in files. Default is 's'.
    timebins_key : str
        Key for accessing vector of time bins in files. Default is 't'.
    freqbins_key : str
        Key for accessing vector of frequency bins in files. Default is 'f'.

    Returns
    -------
    dataset_df : pandas.DataFrame
        A DataFrame that represents the prepared dataset.
    dataset_path : pathlib.Path
        Path to the directory that contains the prepared dataset,
        including the csv file saved from ``dataset_df``.
    """
    from .. import constants  # avoid circular import

    # pre-conditions ----------------------------------------------------------------------------------------------
    if purpose not in constants.VALID_PURPOSES:
        raise ValueError(
            f"purpose must be one of: {constants.VALID_PURPOSES}\n"
            f"Value for purpose was: {purpose}"
        )

    if input_type not in constants.INPUT_TYPES:
        raise ValueError(
            f"``input_type`` must be one of: {constants.INPUT_TYPES}\n"
            f"Value for ``input_type`` was: {input_type}"
        )

    if input_type == "audio" and spect_format is not None:
        raise ValueError(
            f"Input type was 'audio' but a ``spect_format`` was specified: '{spect_format}'. "
            f"Please specify ``audio_format`` only."
        )

    if input_type == "audio" and audio_format is None:
        raise ValueError(
            "Input type was 'audio' but no ``audio_format`` was specified."
        )

    if audio_format is None and spect_format is None:
        raise ValueError(
            "Must specify either ``audio_format`` or ``spect_format``"
        )

    if audio_format and spect_format:
        raise ValueError(
            "Cannot specify both ``audio_format`` and ``spect_format``, "
            "unclear whether to create spectrograms from audio files or "
            "use already-generated spectrograms from array files"
        )

    if labelset is not None:
        labelset = labelset_to_set(labelset)

    data_dir = expanded_user_path(data_dir)
    if not data_dir.is_dir():
        raise NotADirectoryError(
            f"Path specified for ``data_dir`` not found: {data_dir}"
        )

    if output_dir:
        output_dir = expanded_user_path(output_dir)
    else:
        output_dir = data_dir

    if not output_dir.is_dir():
        raise NotADirectoryError(
            f"Path specified for ``output_dir`` not found: {output_dir}"
        )

    if annot_file is not None:
        annot_file = expanded_user_path(annot_file)
        if not annot_file.exists():
            raise FileNotFoundError(
                f"Path specified for ``annot_file`` not found: {annot_file}"
            )

    if purpose == "predict":
        if labelset is not None:
            warnings.warn(
                "The ``purpose`` argument was set to 'predict', but a ``labelset`` was provided. "
                "This would cause an error because the ``prep_spectrogram_dataset`` section will attempt to "
                "check whether the files in the ``data_dir`` have labels in "
                "``labelset``, even though those files don't have annotation.\n"
                "Setting ``labelset`` to None."
            )
            labelset = None
    else:
        # if purpose is not predict
        if labelset is None:
            raise ValueError(
                f"The ``purpose`` argument was set to '{purpose}', but no ``labelset`` was provided. "
                "This will cause an error when trying to split the dataset, "
                "e.g. into training and test splits, "
                "or a silent error, e.g. when calculating metrics with an evaluation set. "
                "Please specify a ``labelset`` when calling ``vak.prep.frame_classification.prep`` "
                f"with ``purpose='{purpose}'``."
            )

    logger.info(f"Purpose for frame classification dataset: {purpose}")

    # ---- set up directory that will contain dataset, and csv file name -------------------------------------------
    data_dir_name = data_dir.name
    timenow = get_timenow_as_str()
    dataset_path = (
        output_dir
        / f"{data_dir_name}-vak-frame-classification-dataset-generated-{timenow}"
    )
    dataset_path.mkdir()

    if annot_file and annot_format == "birdsong-recognition-dataset":
        # we do this normalization / canonicalization after we make dataset_path
        # so that we can put the new annot_file inside of dataset_path, instead of
        # making new files elsewhere on a user's system
        logger.info(
            "The ``annot_format`` argument was set to 'birdsong-recognition-dataset'; "
            "this format requires the audio files, whose sampling rate is needed "
            "to convert onset and offset times of birdsong syllables to seconds. "
            "Converting this format to 'generic-seq' now with the times in seconds, "
            "so that the dataset prepared by vak will not require the audio files."
        )
        birdsongrec = crowsetta.formats.seq.BirdsongRec.from_file(annot_file)
        annots = birdsongrec.to_annot()
        # note we point `annot_file` at a new file we're about to make
        annot_file = (
            dataset_path / f"{annot_file.stem}.converted-to-generic-seq.csv"
        )
        # and we remake Annotations here so that annot_path points to this new file,
        # not the birdsong-rec Annotation.xml
        annots = [
            crowsetta.Annotation(
                seq=annot.seq,
                annot_path=annot_file,
                notated_path=annot.notated_path,
            )
            for annot in annots
        ]
        generic_seq = crowsetta.formats.seq.GenericSeq(annots=annots)
        generic_seq.to_file(annot_file)
        # and we now change `annot_format` as well.
        # Both these will get passed to io.prep_spectrogram_dataset
        annot_format = "generic-seq"

    # NOTE we set up logging here (instead of cli) so the prep log is included in the dataset
    config_logging_for_cli(
        log_dst=dataset_path, log_stem="prep", level="INFO", force=True
    )
    log_version(logger)

    dataset_csv_path = dataset_df_helper.get_dataset_csv_path(
        dataset_path, data_dir_name, timenow
    )
    logger.info(f"Will prepare dataset as directory: {dataset_path}")

    # ---- get or make source files: either audio or spectrograms, possibly paired with annotation files -----------
    source_files_df: pd.DataFrame = get_or_make_source_files(
        data_dir,
        input_type,
        audio_format,
        spect_format,
        spect_params,
        dataset_path,
        annot_format,
        annot_file,
        labelset,
        audio_dask_bag_kwargs,
    )

    # save before (possibly) splitting, just in case duration args are not valid
    # (we can't know until we make dataset)
    source_files_df.to_csv(dataset_csv_path)

    # ---- assign samples to splits; adds a 'split' column to dataset_df, calling `vak.prep.split` if needed -------
    # once we assign a split, we consider this the ``dataset_df``
    dataset_df: pd.DataFrame = assign_samples_to_splits(
        purpose,
        source_files_df,
        dataset_path,
        train_dur,
        val_dur,
        test_dur,
        labelset,
    )

    # ---- create and save labelmap ---------------------------------------------------------------------------------
    # we do this before creating array files since we need to load the labelmap
    # to make frame label vectors
    if purpose != "predict":
        # TODO: add option to generate predict using existing dataset, so we can get labelmap from it
        map_unlabeled_segments = sequence_dataset.has_unlabeled_segments(
            dataset_df
        )
        labelmap = labels.to_map(
            labelset, map_unlabeled=map_unlabeled_segments
        )
        logger.info(
            f"Number of classes in labelmap: {len(labelmap)}",
        )
        # save labelmap in case we need it later
        with (dataset_path / "labelmap.json").open("w") as fp:
            json.dump(labelmap, fp)
    else:
        labelmap = None

    # ---- actually move/copy/create files into directories representing splits ------------------------------------
    # now we're *remaking* the dataset_df (actually adding additional rows with the splits)
    dataset_df: pd.DataFrame = make_splits(
        dataset_df,
        dataset_path,
        input_type,
        purpose,
        labelmap,
        audio_format,
        spect_key,
        timebins_key,
        freqbins_key,
    )

    # ---- if purpose is learncurve, additionally prep training data subsets for the learning curve ----------------
    if purpose == "learncurve":
        dataset_df: pd.DataFrame = make_subsets_from_dataset_df(
            dataset_df,
            input_type,
            train_set_durs,
            num_replicates,
            dataset_path,
            labelmap,
        )

    # ---- save csv file that captures provenance of source data ---------------------------------------------------
    logger.info(f"Saving dataset csv file: {dataset_csv_path}")
    dataset_df.to_csv(
        dataset_csv_path, index=False
    )  # index is False to avoid having "Unnamed: 0" column when loading

    # ---- save metadata --------------------------------------------------------------------------------------------
    frame_dur = validators.validate_and_get_frame_dur(dataset_df, input_type)

    if input_type == "spect" and spect_format != "npz":
        # then change to npz since we canonicalize data so it's always npz arrays.
        # We need this to be correct for other functions, e.g. predict when it loads spectrogram files
        spect_format = "npz"

    metadata = datasets.frame_classification.Metadata(
        dataset_csv_filename=str(dataset_csv_path.name),
        frame_dur=frame_dur,
        input_type=input_type,
        audio_format=audio_format,
        spect_format=spect_format,
    )
    metadata.to_json(dataset_path)

    return dataset_df, dataset_path
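
A minimal usage sketch follows. The import assumes the function is re-exported by the ``vak.prep.frame_classification`` subpackage (otherwise import it from the module shown above), and every argument value here — paths, formats, spectrogram parameter keys, labelset, and durations — is a hypothetical placeholder, not something prescribed by the source code:

    from vak.prep.frame_classification import prep_frame_classification_dataset

    # Hypothetical paths and parameter values: the directories must already exist
    # and contain audio plus annotation files in the stated formats.
    dataset_df, dataset_path = prep_frame_classification_dataset(
        data_dir="~/data/bird1/songs",
        input_type="spect",            # network input is spectrograms, generated here from the audio
        purpose="train",
        output_dir="~/data/bird1/prepared",
        audio_format="wav",
        spect_params={"fft_size": 512, "step_size": 64},  # illustrative keys only
        annot_format="generic-seq",
        labelset="abcde",              # converted to a set of single-character labels
        train_dur=50.0,
        val_dur=15.0,
        test_dur=30.0,
    )
    print(dataset_path)  # directory holding the prepared dataset and its csv

Because ``purpose="train"``, a ``labelset`` and split durations are required; for ``purpose="predict"`` they would be omitted, as the pre-condition checks above enforce.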