Source code for vak.prep.spectrogram_dataset.prep

from __future__ import annotations

import logging
import pathlib

import attrs
import crowsetta
import pandas as pd

from ...common import annotation, constants
from ...common.converters import expanded_user_path, labelset_to_set
from ...config.spect_params import SpectParamsConfig
from . import audio_helper, spect_helper

# Module-level logger, named after this module via ``__name__``.
logger = logging.getLogger(__name__)


def prep_spectrogram_dataset(
    data_dir: str | pathlib.Path,
    annot_format: str | None = None,
    labelset: str | list | set | None = None,
    annot_file: str | pathlib.Path | None = None,
    audio_format: str | None = None,
    spect_format: str | None = None,
    spect_params: dict | SpectParamsConfig | None = None,
    spect_output_dir: str | pathlib.Path | None = None,
    audio_dask_bag_kwargs: dict | None = None,
) -> pd.DataFrame:
    """Make a dataset of spectrograms, optionally paired with annotations.

    Prepares dataset of vocalizations from a directory of audio or
    spectrogram files, and (optionally) annotation for those files.
    The dataset is returned as a pandas DataFrame.

    Datasets are used to train neural networks, predicting annotations
    for the dataset itself using a trained neural network, etc.

    If dataset is created from audio files, then array files containing
    spectrograms will be generated from the audio files and saved in
    ``spect_output_dir`` with the extension ``.spect.npz``.
    The ``spect_output_dir`` defaults to ``data_dir`` if it is not specified.

    Exactly one of ``audio_format`` and ``spect_format`` must be given.

    Parameters
    ----------
    data_dir : str, pathlib.Path
        Path to directory with audio or spectrogram files
        from which to make dataset.
    annot_format : str
        Format of annotations. Any format that can be used with the
        crowsetta library is valid. Default is None, in which case
        no annotations are loaded.
    labelset : str, list, set
        Of str or int, set of unique labels for vocalizations.
        Default is None. If not None, then files will be skipped where
        the associated annotation contains labels not found in
        ``labelset``. ``labelset`` is converted to a Python ``set`` using
        ``vak.common.converters.labelset_to_set``.
        See help for that function for details on how to specify labelset.
    annot_file : str, pathlib.Path
        Path to a single annotation file. Default is None, in which case
        annotation files are looked up in ``data_dir``.
        Used when a single file contains annotations
        for multiple audio files. Only used if ``annot_format`` is given.
    audio_format : str
        Format of audio files. One of {'wav', 'cbin'}.
    spect_format : str
        Format of array files containing spectrograms as 2-d matrices.
        One of {'mat', 'npz'}.
    spect_params : dict, vak.config.spect_params.SpectParamsConfig
        Parameters for creating spectrograms.
        Default is None (implying that spectrograms are already made).
        If not None, it must provide the keys / attributes
        ``freqbins_key``, ``timebins_key``, ``spect_key``
        and ``audio_path_key``.
    spect_output_dir : str, pathlib.Path
        Path to location where spectrogram files should be saved.
        Default is None, in which case it defaults to ``data_dir``.
    audio_dask_bag_kwargs : dict
        Keyword arguments passed as ``dask_bag_kwargs`` to
        :func:`vak.prep.spectrogram_dataset.audio_helper.make_spectrogram_files_from_audio_files`,
        where they are used when calling ``dask.bag.from_sequence``
        to parallelize the conversion of audio files into spectrograms.
        Option should be specified in config.toml file as an inline table,
        e.g., ``audio_dask_bag_kwargs = { npartitions = 20 }``.
        Allows for finer-grained control
        when needed to process files of different sizes.

    Returns
    -------
    source_files_df : pandas.DataFrame
        A set of source files that will be used to prepare a
        data set for use with neural network models,
        represented as a :class:`pandas.DataFrame`.
        Will contain paths to spectrogram files,
        possibly paired with annotation files,
        as well as the original audio files if the
        spectrograms were generated from audio by
        :func:`vak.prep.spectrogram_dataset.audio_helper.make_spectrogram_files_from_audio_files`.
        The columns of the dataframe are specified by
        :const:`vak.prep.spectrogram_dataset.spect_helper.DF_COLUMNS`.

    Raises
    ------
    ValueError
        If neither or both of ``audio_format`` and ``spect_format``
        are specified.
    NotADirectoryError
        If ``data_dir`` or ``spect_output_dir`` is not an existing directory.
    KeyError
        If ``spect_params`` is given as a dict that lacks one of the
        required ``*_key`` entries.
    """
    # ---- pre-conditions ----------------------------------------------------
    if labelset is not None:
        labelset = labelset_to_set(labelset)

    if audio_format is None and spect_format is None:
        raise ValueError("Must specify either audio_format or spect_format")

    if audio_format and spect_format:
        raise ValueError(
            "Cannot specify both audio_format and spect_format, "
            "unclear whether to create spectrograms from audio files or "
            "use already-generated spectrograms from array files"
        )

    data_dir = expanded_user_path(data_dir)
    if not data_dir.is_dir():
        raise NotADirectoryError(f"data_dir not found: {data_dir}")

    if spect_output_dir:
        spect_output_dir = expanded_user_path(spect_output_dir)
        if not spect_output_dir.is_dir():
            raise NotADirectoryError(
                f"spect_output_dir not found: {spect_output_dir}"
            )
    else:
        spect_output_dir = data_dir

    # ---- load annotations, if an annotation format was specified ----------
    if annot_format is not None:
        if annot_file is None:
            # one annotation file per source file, found in ``data_dir``
            annot_files = annotation.files_from_dir(
                annot_dir=data_dir, annot_format=annot_format
            )
            scribe = crowsetta.Transcriber(format=annot_format)
            annot_list = [
                scribe.from_file(path).to_annot() for path in annot_files
            ]
        else:
            # a single file holding the annotations for all source files
            scribe = crowsetta.Transcriber(format=annot_format)
            annot_list = scribe.from_file(annot_file).to_annot()
        if isinstance(annot_list, crowsetta.Annotation):
            # if e.g. only one annotated audio file in directory,
            # wrap in a list to make iterable
            # fixes https://github.com/NickleDave/vak/issues/467
            annot_list = [annot_list]
    else:  # if annot_format not specified
        annot_list = None

    # ---- if making dataset from audio files,
    # ---- need to make into array files first! ------------------------------
    if audio_format:
        logger.info(
            f"making array files containing spectrograms from audio files in: {data_dir}",
        )
        audio_files = audio_helper.files_from_dir(data_dir, audio_format)
        spect_files = audio_helper.make_spectrogram_files_from_audio_files(
            audio_format=audio_format,
            spect_params=spect_params,
            output_dir=spect_output_dir,
            audio_files=audio_files,
            annot_list=annot_list,
            annot_format=annot_format,
            labelset=labelset,
            dask_bag_kwargs=audio_dask_bag_kwargs,
        )
        # spectrograms generated from audio are always handled as "npz"
        spect_format = "npz"
        spect_ext = constants.SPECT_NPZ_EXTENSION
    else:  # if audio format is None
        spect_files = None
        # make sure we use the vak extension for spectrogram files
        spect_ext = constants.SPECT_FORMAT_EXT_MAP[spect_format]

    make_dataframe_kwargs = {
        "spect_format": spect_format,
        "labelset": labelset,
        "annot_list": annot_list,
        "annot_format": annot_format,
        "spect_ext": spect_ext,
    }

    # NOTE(review): this is a truthiness test, so an *empty* ``spect_files``
    # (if the audio helper can return one -- TODO confirm) also takes the
    # ``else`` branch and the dataset is built from whatever spectrogram
    # files are found in ``data_dir``.
    if spect_files:
        # because we just made them, and put them in spect_output_dir
        make_dataframe_kwargs["spect_files"] = spect_files
        logger.info(
            f"creating dataset from spectrogram files in: {spect_output_dir}",
        )
    else:
        make_dataframe_kwargs["spect_dir"] = data_dir
        logger.info(
            f"creating dataset from spectrogram files in: {data_dir}",
        )

    if spect_params:
        # get relevant keys for accessing arrays from array files
        if isinstance(spect_params, SpectParamsConfig):
            spect_params = attrs.asdict(spect_params)
        for key in (
            "freqbins_key",
            "timebins_key",
            "spect_key",
            "audio_path_key",
        ):
            make_dataframe_kwargs[key] = spect_params[key]

    return spect_helper.make_dataframe_of_spect_files(**make_dataframe_kwargs)