Source code for vak.prep.spectrogram_dataset.spect_helper

"""Function that converts a set of array files (.npz, .mat) containing spectrograms
into a pandas DataFrame that represents a dataset used by ``vak``.

The columns of the dataframe are specified by
 :const:`vak.prep.spectrogram_dataset.spect_helper.DF_COLUMNS`.
"""

from __future__ import annotations

import logging
import pathlib

import dask.bag as db
import numpy as np
import pandas as pd
from dask.diagnostics import ProgressBar

from ...common import constants, files
from ...common.annotation import map_annotated_to_annot
from ...common.converters import labelset_to_set

logger = logging.getLogger(__name__)


# Constant: names of the columns in the DataFrame returned by
# ``make_dataframe_of_spect_files``. The order here must match the order of
# the fields in each record tuple that function builds, because the records
# are passed to ``pd.DataFrame.from_records`` with ``columns=DF_COLUMNS``.
DF_COLUMNS = [
    # absolute path to the source audio file for the spectrogram
    "audio_path",
    # absolute path to the array file containing the spectrogram
    "spect_path",
    # absolute path to the annotation file, or NaN when there is no annotation
    "annot_path",
    # name of the annotation format, or a "no annotation" placeholder constant
    "annot_format",
    # number of time bins in the spectrogram multiplied by ``timebin_dur``
    "duration",
    # duration of a single time bin, taken from the first spectrogram file
    "timebin_dur",
]


def make_dataframe_of_spect_files(
    spect_format: str,
    spect_dir: str | pathlib.Path | None = None,
    spect_files: list | None = None,
    spect_ext: str | None = None,
    annot_list: list | None = None,
    annot_format: str | None = None,
    labelset: set | None = None,
    n_decimals_trunc: int = 5,
    freqbins_key: str = "f",
    timebins_key: str = "t",
    spect_key: str = "s",
    audio_path_key: str = "audio_path",
) -> pd.DataFrame:
    """Get a set of spectrogram files from a directory,
    optionally paired with an annotation file or files,
    and return a Pandas DataFrame that represents all the files.

    Spectrogram files are arrays in npz files created by numpy
    or in mat files created by Matlab.

    Parameters
    ----------
    spect_format : str
        Format of files containing spectrograms. One of {'mat', 'npz'}
    spect_dir : str
        Path to directory of files containing spectrograms as arrays.
        Default is None.
    spect_files : list
        List of paths to array files. Default is None.
    spect_ext : str
        Extension of the spectrogram files. Only used when ``annot_list``
        is given, in which case it is passed as ``annotated_ext`` to
        :func:`vak.common.annotation.map_annotated_to_annot`.
        Default is None.
    annot_list : list
        List of annotations for array files. Default is None
    annot_format : str
        Name of annotation format. Added as a column to the DataFrame if specified.
        Used by other functions that open annotation files via their paths from the DataFrame.
        Should be a format that the crowsetta library recognizes.
        Default is None.
    labelset : str, list, set
        Set of unique labels for vocalizations, of str or int. Default is None.
        If not None, then files will be skipped where the associated annotation
        contains labels not found in ``labelset``. Files that have no
        annotation cannot be checked and are therefore never skipped.
        ``labelset`` is converted to a Python ``set`` using
        :func:`vak.common.converters.labelset_to_set`.
        See help for that function for details on how to specify labelset.
    n_decimals_trunc : int
        number of decimal places to keep when truncating the time bin duration
        calculated from the vector of time bins.
        Default is 5.
    freqbins_key : str
        Key for accessing vector of frequency bins in files. Default is 'f'.
    timebins_key : str
        Key for accessing vector of time bins in files. Default is 't'.
    spect_key : str
        Key for accessing spectrogram in files. Default is 's'.
    audio_path_key : str
        Key for accessing path to source audio file for spectrogram in files.
        Default is 'audio_path'.

    Returns
    -------
    source_files_df : pandas.DataFrame
        A set of source files that will be used to prepare a
        data set for use with neural network models,
        represented as a :class:`pandas.DataFrame`.
        Will contain paths to spectrogram files,
        possibly paired with annotation files,
        as well as the original audio files if the
        spectrograms were generated from audio by
        :func:`vak.prep.audio_helper.make_spectrogram_files_from_audio_files`.
        The columns of the dataframe are specified by
        :const:`vak.prep.spectrogram_dataset.spect_helper.DF_COLUMNS`.

    Raises
    ------
    ValueError
        If ``spect_format`` is not a valid format;
        if neither or both of ``spect_dir`` and ``spect_files`` are given;
        if only one of ``annot_list`` and ``annot_format`` is given;
        if no spectrogram files are found or supplied;
        or if filtering by ``labelset`` removes every file.

    Notes
    -----
    Each file should contain a spectrogram as a matrix and two vectors associated with it, a
    vector of frequency bins and time bins, where the values in those vectors are the values
    at the bin centers. (As far as vak is concerned, "vector" and "matrix" are synonymous with
    "array".)

    Since both mat files and npz files load into a dictionary-like structure,
    the arrays will be accessed with keys. By convention, these keys are 's', 'f', and 't'.
    If you use different keys you can let this function know by changing
    the appropriate arguments: spect_key, freqbins_key, timebins_key
    """
    # ---- pre-conditions ----------------------------------------------------
    if spect_format not in constants.VALID_SPECT_FORMATS:
        raise ValueError(
            f"spect_format must be one of '{constants.VALID_SPECT_FORMATS}'; "
            f"format '{spect_format}' not recognized."
        )

    if all([arg is None for arg in (spect_dir, spect_files)]):
        raise ValueError("must specify one of: spect_dir, spect_files")

    if spect_dir and spect_files:
        raise ValueError(
            "received values for spect_dir and spect_files, unclear which to use"
        )

    if annot_list and annot_format is None:
        raise ValueError(
            "an annot_list was provided, but no annot_format was specified"
        )

    if annot_format is not None and annot_list is None:
        raise ValueError(
            "an annot_format was specified but no annot_list or spect_annot_map was provided"
        )

    if labelset is not None:
        labelset = labelset_to_set(labelset)

    # ---- get a list of spectrogram files + associated annotation files ------
    if spect_dir:  # then get spect_files from that dir
        # note we already validated format above
        spect_files = sorted(pathlib.Path(spect_dir).glob(f"*{spect_format}"))

    # Fail early with a clear message instead of hitting an unbound
    # ``spect_annot_map`` further down when there is nothing to process.
    if not spect_files:
        if spect_dir:
            raise ValueError(
                f"did not find any files with format '{spect_format}' "
                f"in spect_dir: {spect_dir}"
            )
        raise ValueError("spect_files was empty, no spectrogram files to use")

    if annot_list:
        spect_annot_map = map_annotated_to_annot(
            spect_files, annot_list, annot_format, annotated_ext=spect_ext
        )
    else:
        # no annotation, so map spectrogram files to None
        spect_annot_map = {spect_path: None for spect_path in spect_files}

    # use labelset if supplied, to filter
    if labelset:
        # then assume user wants to filter out files
        # where annotation has labels not in labelset
        allowed_labels = set(labelset)  # built once, not once per file
        # `list` so we can pop from dict without RuntimeError
        for spect_path, annot in list(spect_annot_map.items()):
            if annot is None:
                # no annotation for this file, so there are no labels to check
                continue
            extra_labels = set(annot.seq.labels) - allowed_labels
            if extra_labels:
                # because there's some label in labels that's not in labelset
                spect_annot_map.pop(spect_path)
                logger.info(
                    f"Found labels, {extra_labels}, in {pathlib.Path(spect_path).name}, "
                    "that are not in labelset. Skipping file.",
                )

        if not spect_annot_map:
            raise ValueError(
                "no spectrogram files remain after removing files whose "
                "annotations contain labels not in labelset"
            )

    # ---- validate set of spectrogram files ---------------------------------
    # regardless of whether we just made it or user supplied it
    spect_paths = list(spect_annot_map.keys())
    files.spect.is_valid_set_of_spect_files(
        spect_paths,
        spect_format,
        freqbins_key,
        timebins_key,
        spect_key,
        n_decimals_trunc,
    )

    # now that we have validated that duration of time bins is consistent across files, we can just open one file
    # to get that time bin duration. This way validation function has no side effects, like returning time bin, and
    # this is still relatively fast compared to looping through all files again
    timebin_dur = files.spect.timebin_dur(
        spect_paths[0], spect_format, timebins_key, n_decimals_trunc
    )

    # ---- actually make the dataframe ---------------------------------------
    # this is defined here so all other arguments to 'to_dataframe' are in scope
    def _to_record(spect_annot_tuple):
        """helper function that enables parallelized creation of "records",
        i.e. rows for dataframe.
        Accepts a two-element tuple containing (1) the path to a spectrogram file
        and (2) the annotation for that file, or None if there is no annotation.
        Returns a tuple whose fields are in the same order as ``DF_COLUMNS``."""
        spect_path, annot = spect_annot_tuple
        spect_path = pathlib.Path(spect_path)

        spect_dict = files.spect.load(spect_path, spect_format)

        # duration = number of time bins (last axis of spectrogram) * bin duration
        spect_dur = spect_dict[spect_key].shape[-1] * timebin_dur
        if audio_path_key in spect_dict:
            audio_path = spect_dict[audio_path_key]
            if isinstance(audio_path, np.ndarray):
                # (because everything stored in .npz has to be in an ndarray)
                audio_path = audio_path.tolist()
        else:
            # try to figure out audio filename programmatically
            # if we can't, then we'll get back a None
            # (or an error)
            # NOTE(review): ``abspath`` below calls ``np.isnan`` on anything
            # that is not a str/Path, which raises TypeError for ``None`` --
            # confirm ``find_audio_fname`` never actually returns None.
            audio_path = files.spect.find_audio_fname(spect_path)

        if annot is not None:
            annot_path = annot.annot_path
        else:
            annot_path = np.nan

        def abspath(a_path):
            # paths become absolute strings; a NaN placeholder passes through
            if isinstance(a_path, (str, pathlib.Path)):
                return str(pathlib.Path(a_path).absolute())
            elif np.isnan(a_path):
                return a_path

        record = tuple(
            [
                abspath(audio_path),
                abspath(spect_path),
                abspath(annot_path),
                (
                    annot_format
                    if annot_format
                    else constants.NO_ANNOTATION_FORMAT
                ),
                spect_dur,
                timebin_dur,
            ]
        )
        return record

    spect_path_annot_tuples = db.from_sequence(spect_annot_map.items())
    logger.info(
        "creating pandas.DataFrame representing dataset from spectrogram files",
    )
    with ProgressBar():
        records = list(spect_path_annot_tuples.map(_to_record))

    return pd.DataFrame.from_records(data=records, columns=DF_COLUMNS)