Source code for vak.prep.frame_classification.source_files

import logging
import pathlib

import pandas as pd

from ...common.converters import expanded_user_path, labelset_to_set
from .. import constants
from ..audio_dataset import prep_audio_dataset
from ..spectrogram_dataset.prep import prep_spectrogram_dataset

logger = logging.getLogger(__name__)


def get_or_make_source_files(
    data_dir: str | pathlib.Path,
    input_type: str,
    audio_format: str | None = None,
    spect_format: str | None = None,
    spect_params: dict | None = None,
    spect_output_dir: str | pathlib.Path | None = None,
    annot_format: str | None = None,
    annot_file: str | pathlib.Path | None = None,
    labelset: set | None = None,
    audio_dask_bag_kwargs: dict | None = None,
) -> pd.DataFrame:
    """Get source files for a dataset, or make them.

    Validates the arguments, then delegates to
    :func:`vak.prep.audio_dataset.prep_audio_dataset`
    (when ``input_type`` is ``'audio'``) or to
    :func:`vak.prep.spectrogram_dataset.prep_spectrogram_dataset`
    (when ``input_type`` is ``'spect'``), and returns the dataframe
    that the helper produces.

    The intended use, carried out by those helpers, is:

    * ``input_type='audio'``: use the files in ``data_dir``
      that have the extension for ``audio_format``.
    * ``input_type='spect'`` with ``spect_format``: use the
      already-generated spectrogram files in ``data_dir``.
    * ``input_type='spect'`` with ``audio_format``: find audio files
      in ``data_dir`` and generate spectrograms from them
      using ``spect_params``.

    If an ``annot_format`` is given, the source files are additionally
    paired with annotations. When all annotations live in a single file,
    that file can be given as ``annot_file``.

    Parameters
    ----------
    data_dir : str, Path
        Path to directory with files from which to make dataset.
        Must be an existing directory.
    input_type : str
        The type of input to the neural network model.
        Must be a member of ``vak.prep.constants.INPUT_TYPES``;
        this function handles ``'audio'`` and ``'spect'``.
    audio_format : str
        Format of audio files, e.g. ``'wav'`` or ``'cbin'``.
        Default is ``None``, but exactly one of
        ``audio_format`` or ``spect_format`` must be specified.
    spect_format : str
        Format of files containing spectrograms as 2-d matrices,
        e.g. ``'mat'`` or ``'npz'``.
        Default is ``None``, but exactly one of
        ``audio_format`` or ``spect_format`` must be specified.
        Must be ``None`` when ``input_type`` is ``'audio'``.
    spect_params : dict, vak.config.SpectParams
        Parameters for creating spectrograms. Default is ``None``.
        Only forwarded when ``input_type`` is ``'spect'``.
    spect_output_dir : str, Path
        Path to location where spectrogram files should be saved.
        Default is ``None``. Forwarded unchanged to
        ``prep_spectrogram_dataset``, which decides what
        ``None`` means.
    annot_format : str
        Format of annotations. Any format that can be used with the
        :mod:`crowsetta` library is valid. Default is ``None``.
    annot_file : str, Path
        Path to a single annotation file. Default is ``None``.
        Used when a single file annotates multiple
        audio or spectrogram files. Must exist if specified.
    labelset : str, list, set
        Set of unique labels for vocalizations. Default is ``None``.
        If not ``None``, it is converted to a Python ``set`` with
        :func:`vak.common.converters.labelset_to_set` before being
        forwarded; files whose annotations contain labels
        not in ``labelset`` are meant to be skipped by the helpers.
    audio_dask_bag_kwargs : dict
        Keyword arguments for :func:`dask.bag.from_sequence`,
        used to parallelize the conversion of audio files into
        spectrograms. Can be specified in a config.toml file
        as an inline table, e.g.,
        ``audio_dask_bag_kwargs = { npartitions = 20 }``.
        Only forwarded when ``input_type`` is ``'spect'``.

    Returns
    -------
    source_files_df : pandas.DataFrame
        Source files that will become the dataset.
        Each row corresponds to one sample in the dataset,
        either an audio file or spectrogram file,
        possibly paired with annotations.

    Raises
    ------
    ValueError
        If ``input_type`` is not in ``constants.INPUT_TYPES``;
        if ``input_type`` is ``'audio'`` and either ``spect_format``
        is given or ``audio_format`` is missing;
        if neither or both of ``audio_format`` and ``spect_format``
        are given; or if the helper returns an empty dataframe.
    NotADirectoryError
        If ``data_dir`` is not an existing directory.
    FileNotFoundError
        If ``annot_file`` is specified but does not exist.

    Notes
    -----
    ``annot_file`` is validated for any ``input_type``,
    but it is only passed on to ``prep_spectrogram_dataset``;
    the call to ``prep_audio_dataset`` does not receive it.
    """
    # ---- validate the combination of input type and formats ----
    if input_type not in constants.INPUT_TYPES:
        raise ValueError(
            f"``input_type`` must be one of: {constants.INPUT_TYPES}\n"
            f"Value for ``input_type`` was: {input_type}"
        )

    # Audio input must name an audio format, and nothing but an audio format.
    if input_type == "audio":
        if spect_format is not None:
            raise ValueError(
                f"Input type was 'audio' but a ``spect_format`` was specified: '{spect_format}'. "
                f"Please specify ``audio_format`` only."
            )
        if audio_format is None:
            raise ValueError(
                "Input type was 'audio' but no ``audio_format`` was specified. "
            )

    no_format_given = audio_format is None and spect_format is None
    if no_format_given:
        raise ValueError(
            "Must specify either ``audio_format`` or ``spect_format``"
        )

    if audio_format and spect_format:
        raise ValueError(
            "Cannot specify both ``audio_format`` and ``spect_format``, "
            "unclear whether to create spectrograms from audio files or "
            "use already-generated spectrograms from array files"
        )

    # ---- normalize arguments ----
    labelset = None if labelset is None else labelset_to_set(labelset)

    data_dir = expanded_user_path(data_dir)
    if not data_dir.is_dir():
        raise NotADirectoryError(
            f"Path specified for ``data_dir`` not found: {data_dir}"
        )

    if annot_file is not None:
        annot_file = expanded_user_path(annot_file)
        if not annot_file.exists():
            raise FileNotFoundError(
                f"Path specified for ``annot_file`` not found: {annot_file}"
            )

    # ---- delegate to the helper that matches the input type ----
    # NOTE(review): if ``constants.INPUT_TYPES`` ever gains a value other than
    # 'audio' / 'spect', neither branch runs and the final ``return`` raises
    # ``UnboundLocalError``.
    if input_type == "audio":
        # arguments are positional; ``annot_file`` is not forwarded on this path
        source_files_df = prep_audio_dataset(
            audio_format,
            data_dir,
            annot_format,
            labelset,
        )
        if source_files_df.empty:
            raise ValueError(
                "Calling `vak.prep.audio_dataset.prep_audio_dataset` "
                "with arguments passed to `vak.prep.prep_frame_classification_dataset` "
                "returned an empty dataframe.\n"
                "Please double-check arguments to `prep_frame_classification_dataset` function."
            )
    elif input_type == "spect":
        source_files_df = prep_spectrogram_dataset(
            data_dir,
            annot_format,
            labelset,
            annot_file,
            audio_format,
            spect_format,
            spect_params,
            spect_output_dir,
            audio_dask_bag_kwargs,
        )
        if source_files_df.empty:
            raise ValueError(
                "Calling `vak.prep.spectrogram_dataset.prep_spectrogram_dataset` "
                "with arguments passed to `vak.prep.prep_frame_classification_dataset` "
                "returned an empty dataframe.\n"
                "Please double-check arguments to `prep_frame_classification_dataset` function."
            )

    return source_files_df