Source code for vak.config.prep

"""parses [PREP] section of config"""
import inspect

import attr
import dask.bag
from attr import converters, validators
from attr.validators import instance_of

from .. import prep
from ..common.converters import expanded_user_path, labelset_to_set
from .validators import is_annot_format, is_audio_format, is_spect_format


def duration_from_toml_value(value):
    """Converter for dataset split durations.

    If value is -1, that value is returned as-is;
    -1 specifies "use the remainder of the dataset".
    Other values are converted to float when possible.
    """
    if value == -1:
        return value
    else:
        return float(value)
def is_valid_duration(instance, attribute, value):
    """Validator for dataset split durations."""
    if type(value) not in {int, float}:
        raise TypeError(
            f"invalid type for {attribute} of {instance}: {type(value)}. "
            "Type should be float or int."
        )

    if value == -1:
        # -1 specifies "use the remainder of the dataset",
        # so it is valid, but other negative values are not
        return

    if not value >= 0:
        raise ValueError(
            f"value specified for {attribute} of {instance} must be "
            f"greater than or equal to zero, was {value}"
        )
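A quick sketch of how these two helpers behave together, with made-up values; in real use ``attrs`` passes a config instance and an ``attr.Attribute`` as the first two arguments, but any placeholders work for illustration::

    assert duration_from_toml_value(-1) == -1     # passed through: "use the remainder of the dataset"
    assert duration_from_toml_value(50) == 50.0   # other values are coerced to float
    is_valid_duration(None, "train_dur", 50.0)    # non-negative durations pass silently
    try:
        is_valid_duration(None, "train_dur", -5)  # negative values other than -1 are rejected
    except ValueError as err:
        print(err)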
def are_valid_dask_bag_kwargs(instance, attribute, value):
    """Validator for ``audio_dask_bag_kwargs``."""
    if not isinstance(value, dict):
        raise TypeError(
            f"Option ``audio_dask_bag_kwargs`` should be a dict but was a {type(value)}.\n"
            "So that it parses as a dict, please specify this option "
            "as an inline table in the .toml file, e.g.\n"
            "`audio_dask_bag_kwargs = { npartitions = 20 }`"
        )

    kwargs = list(value.keys())
    valid_bag_kwargs = list(
        inspect.signature(dask.bag.from_sequence).parameters.keys()
    )
    if not all([kwarg in valid_bag_kwargs for kwarg in kwargs]):
        invalid_kwargs = [
            kwarg for kwarg in kwargs if kwarg not in valid_bag_kwargs
        ]
        raise ValueError(
            f"Invalid keyword arguments specified in ``audio_dask_bag_kwargs``: {invalid_kwargs}"
        )
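A hedged sketch of what this validator accepts and rejects. ``npartitions`` is a parameter of ``dask.bag.from_sequence``; ``n_workers`` is a deliberately invalid name used only for illustration::

    # valid: every key is a parameter of dask.bag.from_sequence
    are_valid_dask_bag_kwargs(None, "audio_dask_bag_kwargs", {"npartitions": 20})

    # invalid key -> raises ValueError naming the offending kwarg
    are_valid_dask_bag_kwargs(None, "audio_dask_bag_kwargs", {"n_workers": 4})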
@attr.s
class PrepConfig:
    """Class that represents the [PREP] section of a config.toml file.

    Attributes
    ----------
    data_dir : str
        Path to directory with files from which to make dataset.
    output_dir : str
        Path to location where data sets should be saved.
        Default is None, in which case data sets are saved
        in the current working directory.
    dataset_type : str
        String name of the type of dataset, e.g., 'frame_classification'.
        Dataset types are defined by machine learning tasks, e.g.,
        a 'frame_classification' dataset would be used with a
        :class:`vak.models.FrameClassificationModel` model.
        Valid dataset types are defined as :const:`vak.prep.constants.DATASET_TYPES`.
    input_type : str
        The type of input the model takes.
        Must be one of the values defined in :const:`vak.prep.constants.INPUT_TYPES`.
    audio_format : str
        Format of audio files. One of {'wav', 'cbin'}.
    spect_format : str
        Format of files containing spectrograms as 2-d matrices.
        One of {'mat', 'npy'}.
    annot_format : str
        Format of annotations. Any format that can be used
        with the crowsetta library is valid.
    annot_file : str
        Path to a single annotation file. Default is None.
        Used when a single file contains annotations for multiple audio files.
    labelset : set
        Set of str or int, the set of labels that correspond to annotated segments
        that a network should learn to segment and classify. Note that if there
        are segments that are not annotated, e.g. silent gaps between songbird
        syllables, then ``vak`` will assign a dummy label to those segments
        -- you don't have to give them a label here.
        The value for ``labelset`` is converted to a Python ``set`` using
        ``vak.config.converters.labelset_from_toml_value``.
        See the help for that function for details on how to specify a labelset.
    audio_dask_bag_kwargs : dict
        Keyword arguments used when calling ``dask.bag.from_sequence``
        inside ``vak.io.audio``, where it is used to parallelize
        the conversion of audio files into spectrograms.
        Option should be specified in config.toml file as an inline table,
        e.g., ``audio_dask_bag_kwargs = { npartitions = 20 }``.
        Allows for finer-grained control when needed to process files
        of different sizes.
    train_dur : float
        Total duration of training set, in seconds.
        When creating a learning curve, training subsets of shorter duration
        (specified by the 'train_set_durs' option in the LEARNCURVE section
        of a config.toml file) will be drawn from this set.
    val_dur : float
        Total duration of validation set, in seconds.
    test_dur : float
        Total duration of test set, in seconds.
    train_set_durs : list, optional
        Durations of datasets to use for a learning curve.
        Float values, durations in seconds of subsets taken from training data
        to create a learning curve, e.g. [5., 10., 15., 20.].
        Default is None. Required if the config file has a learncurve section.
    num_replicates : int, optional
        Number of replicates to train for each training set duration
        in a learning curve. Each replicate uses a different randomly drawn
        subset of the training data (but of the same duration).
        Default is None. Required if the config file has a learncurve section.
    """

    data_dir = attr.ib(converter=expanded_user_path)
    output_dir = attr.ib(converter=expanded_user_path)

    dataset_type = attr.ib(validator=instance_of(str))

    @dataset_type.validator
    def is_valid_dataset_type(self, attribute, value):
        if value not in prep.constants.DATASET_TYPES:
            raise ValueError(
                f"Invalid dataset type: {value}. "
                f"Valid dataset types are: {prep.constants.DATASET_TYPES}"
            )

    input_type = attr.ib(validator=instance_of(str))

    @input_type.validator
    def is_valid_input_type(self, attribute, value):
        if value not in prep.constants.INPUT_TYPES:
            raise ValueError(
                f"Invalid input type: {value}. "
                f"Must be one of: {prep.constants.INPUT_TYPES}"
            )

    audio_format = attr.ib(
        validator=validators.optional(is_audio_format), default=None
    )
    spect_format = attr.ib(
        validator=validators.optional(is_spect_format), default=None
    )
    annot_file = attr.ib(
        converter=converters.optional(expanded_user_path),
        default=None,
    )
    annot_format = attr.ib(
        validator=validators.optional(is_annot_format), default=None
    )
    labelset = attr.ib(
        converter=converters.optional(labelset_to_set),
        validator=validators.optional(instance_of(set)),
        default=None,
    )
    audio_dask_bag_kwargs = attr.ib(
        validator=validators.optional(are_valid_dask_bag_kwargs), default=None
    )
    train_dur = attr.ib(
        converter=converters.optional(duration_from_toml_value),
        validator=validators.optional(is_valid_duration),
        default=None,
    )
    val_dur = attr.ib(
        converter=converters.optional(duration_from_toml_value),
        validator=validators.optional(is_valid_duration),
        default=None,
    )
    test_dur = attr.ib(
        converter=converters.optional(duration_from_toml_value),
        validator=validators.optional(is_valid_duration),
        default=None,
    )
    train_set_durs = attr.ib(
        validator=validators.optional(instance_of(list)), default=None
    )
    num_replicates = attr.ib(
        validator=validators.optional(instance_of(int)), default=None
    )

    def __attrs_post_init__(self):
        if self.audio_format is not None and self.spect_format is not None:
            raise ValueError("cannot specify audio_format and spect_format")
        if self.audio_format is None and self.spect_format is None:
            raise ValueError(
                "must specify either audio_format or spect_format"
            )