"""Prepare a dataset of audio files, optionally paired with annotations."""

from __future__ import annotations

import logging
import pathlib

import crowsetta
import dask.bag as db
import numpy as np
import pandas as pd
from dask.diagnostics import ProgressBar

from ..common import annotation, constants
from ..common.converters import expanded_user_path, labelset_to_set
from ..common.typing import PathLike
from .spectrogram_dataset.audio_helper import files_from_dir

logger = logging.getLogger(__name__)


# Names (and order) of the columns in the dataframe returned by
# ``prep_audio_dataset`` below.
DF_COLUMNS = ["audio_path", "annot_path", "annot_format", "samplerate", "sample_dur", "duration"]


def prep_audio_dataset(
    data_dir: PathLike,
    audio_format: str,
    annot_format: str | None = None,
    annot_file: str | pathlib.Path | None = None,
    labelset: set | None = None,
) -> pd.DataFrame:
    """Get a set of audio files from a directory,
    optionally paired with an annotation file or files,
    and return a pandas DataFrame that represents the set of files.

    Finds all files with ``audio_format`` in ``data_dir``,
    then finds any annotations with ``annot_format`` if specified,
    and additionally filters the audio and annotation files
    by ``labelset`` if specified.
    Then creates the dataframe with columns specified by
    ``vak.prep.audio_dataset.DF_COLUMNS``:
    ``"audio_path"``, ``"annot_path"``, ``"annot_format"``,
    ``"samplerate"``, ``"sample_dur"``, and ``"duration"``.

    Parameters
    ----------
    data_dir : str, pathlib.Path
        Path to directory containing audio files that should be used in dataset.
    audio_format : str
        Format of audio files.
        One of :const:`vak.common.constants.VALID_AUDIO_FORMATS`.
    annot_format : str
        Name of annotation format. Added as a column to the DataFrame if specified.
        Used by other functions that open annotation files
        via their paths from the DataFrame.
        Should be a format that the :mod:`crowsetta` library recognizes.
        Default is None.
    annot_file : str
        Path to a single annotation file. Default is None.
        Used when a single file contains annotations for multiple audio files.
    labelset : str, list, set
        Iterable of str or int, set of unique labels for annotations.
        Default is None. If not None, then files will be skipped
        where the associated annotation contains labels *not* found in ``labelset``.
        ``labelset`` is converted to a Python ``set`` using
        :func:`vak.common.converters.labelset_to_set`.
        See docstring of that function for details on how to specify ``labelset``.

    Returns
    -------
    source_files_df : pandas.DataFrame
        A set of source files that will be used to prepare a
        data set for use with neural network models,
        represented as a :class:`pandas.DataFrame`.
        Will contain paths to audio files,
        possibly paired with annotation files.
        The columns of the dataframe are specified by
        :const:`vak.prep.audio_dataset.DF_COLUMNS`.

    Raises
    ------
    ValueError
        If ``audio_format`` is not a recognized audio format.
    NotADirectoryError
        If ``data_dir`` does not exist or is not a directory.
    """
    # ---- pre-conditions ----------------------------------------------------
    if audio_format not in constants.VALID_AUDIO_FORMATS:
        raise ValueError(
            f"audio format must be one of '{constants.VALID_AUDIO_FORMATS}'; "
            f"format '{audio_format}' not recognized."
        )

    if labelset is not None:
        # normalize user input (str / list / set) into a Python set of labels
        labelset = labelset_to_set(labelset)

    data_dir = expanded_user_path(data_dir)
    if not data_dir.exists() or not data_dir.is_dir():
        raise NotADirectoryError(
            f"`data_dir` not found, or not recognized as a directory:\n{data_dir}"
        )

    audio_files = files_from_dir(data_dir, audio_format)

    # ---- load annotations, if an annotation format was specified -----------
    if annot_format is not None:
        if annot_file is None:
            # one annotation file per audio file: find them all in data_dir
            annot_files = annotation.files_from_dir(
                annot_dir=data_dir, annot_format=annot_format
            )
            scribe = crowsetta.Transcriber(format=annot_format)
            annot_list = [
                scribe.from_file(annot_file).to_annot()
                for annot_file in annot_files
            ]
        else:
            # a single file annotating multiple audio files
            scribe = crowsetta.Transcriber(format=annot_format)
            annot_list = scribe.from_file(annot_file).to_annot()
        if isinstance(annot_list, crowsetta.Annotation):
            # if e.g. only one annotated audio file in directory,
            # wrap in a list to make iterable.
            # fixes https://github.com/NickleDave/vak/issues/467
            annot_list = [annot_list]
    else:  # if annot_format not specified
        annot_list = None

    if annot_list:
        audio_annot_map = annotation.map_annotated_to_annot(
            audio_files, annot_list, annot_format
        )
    else:
        # no annotation, so map audio files to None
        audio_annot_map = dict(
            (audio_path, None) for audio_path in audio_files
        )

    # use mapping (if generated/supplied) with labelset, if supplied, to filter
    if labelset:  # then remove annotations with labels not in labelset
        # loop in a verbose way (i.e. not a comprehension)
        # so we can give user warning when we skip files.
        # iterate over a copy of the items since we pop from the map in the loop
        for audio_file, annot in list(audio_annot_map.items()):
            if annot is None:
                # no annotation for this file, so nothing to filter by
                # (guards against AttributeError on ``annot.seq``)
                continue
            annot_labelset = set(annot.seq.labels)
            # ``labelset`` is already a set here (see labelset_to_set above)
            if not annot_labelset.issubset(labelset):
                # because there's some label in the annotation
                # that's not in labelset
                audio_annot_map.pop(audio_file)
                extra_labels = annot_labelset - labelset
                logger.info(
                    f"Found labels, {extra_labels}, in {pathlib.Path(audio_file).name}, "
                    "that are not in labelset. Skipping file.",
                )

    # ---- actually make the dataframe ---------------------------------------
    # defined here so all other arguments to `prep_audio_dataset` are in scope
    def _to_record(audio_annot_tuple):
        """Helper function that enables parallelized creation of "records",
        i.e. rows for the dataframe.

        Accepts a two-element tuple containing
        (1) the path of an audio file and (2) the annotation for that file
        (or None if there is no annotation).
        """
        audio_path, annot = audio_annot_tuple
        dat, samplerate = constants.AUDIO_FORMAT_FUNC_MAP[audio_format](
            audio_path
        )
        sample_dur = 1.0 / samplerate
        # last axis of ``dat`` is assumed to be samples/time — TODO confirm
        audio_dur = dat.shape[-1] * sample_dur

        if annot is not None:
            annot_path = annot.annot_path
        else:
            # NaN marks "no annotation" in the dataframe column
            annot_path = np.nan

        def abspath(a_path):
            """Return absolute path as str, passing NaN through unchanged."""
            if isinstance(a_path, (str, pathlib.Path)):
                return str(pathlib.Path(a_path).absolute())
            elif np.isnan(a_path):
                return a_path

        record = tuple(
            [
                abspath(audio_path),
                abspath(annot_path),
                (
                    annot_format
                    if annot_format
                    else constants.NO_ANNOTATION_FORMAT
                ),
                samplerate,
                sample_dur,
                audio_dur,
            ]
        )
        return record

    audio_path_annot_tuples = db.from_sequence(audio_annot_map.items())
    logger.info(
        "creating pandas.DataFrame representing dataset from audio files",
    )
    with ProgressBar():
        records = list(audio_path_annot_tuples.map(_to_record))

    return pd.DataFrame.from_records(data=records, columns=DF_COLUMNS)