Source code for vak.common.annotation

from __future__ import annotations

import copy
import os
import pathlib
from collections import Counter
from typing import Optional, Union

import crowsetta
import numpy as np
import pandas as pd

from . import constants, files
from .typing import PathLike


def format_from_df(dataset_df: pd.DataFrame) -> str | None:
    """Get the format of annotations from a dataset,
    given a dataframe representing that dataset.

    Returns string name of annotation format.
    If no annotation format is specified, returns None.
    Raises an error if there are multiple formats.

    Parameters
    ----------
    dataset_df : pandas.DataFrame
        Representing a dataset of vocalizations,
        with column 'annot_format'.

    Returns
    -------
    annot_format : str or None
        Format of annotations for vocalizations.
        None when the 'annot_format' column is empty, or when its
        single unique value is missing (None / NaN) or equal to
        ``constants.NO_ANNOTATION_FORMAT``.

    Raises
    ------
    ValueError
        If the 'annot_format' column has more than one unique value.
    """
    unique_formats = dataset_df["annot_format"].unique()

    if len(unique_formats) > 1:
        raise ValueError(
            f"unable to load labels for dataset, found multiple annotation formats: {unique_formats}"
        )

    if len(unique_formats) == 0:
        # An empty dataframe does not specify any format; return None
        # instead of leaking an empty array to the caller.
        return None

    annot_format = unique_formats[0]
    # ``pd.isna`` covers both None and NaN, so a missing value is never
    # returned as if it were the name of a format.
    if pd.isna(annot_format) or annot_format == constants.NO_ANNOTATION_FORMAT:
        return None
    return annot_format


def from_df(
    dataset_df: pd.DataFrame, annot_root: str | pathlib.Path | None = None
) -> list[crowsetta.Annotation] | None:
    """Load the annotations for every row of a dataset dataframe.

    Returns None when the dataframe does not specify an annotation
    format (in the 'annot_format' column).

    Parameters
    ----------
    dataset_df : DataFrame
        Dataframe representing a dataset of vocalizations,
        with columns 'annot_format' and 'annot_path'.
        The 'audio_path' column is also read when a single
        annotation file is shared by all rows.
    annot_root : str or pathlib.Path, optional
        Path to root of directory where annotation files are located.
        If specified, paths from the 'annot_path' column are joined
        onto ``annot_root``. Default is None, in which case
        the values in 'annot_path' are used directly.

    Returns
    -------
    annots : list or None
        One :class:`crowsetta.Annotation` per row of the dataframe,
        in row order, or None if there is no annotation format.

    Raises
    ------
    NotADirectoryError
        If ``annot_root`` is given but is not an existing directory.
    ValueError
        If the annotation paths are neither a single shared file
        nor one unique file per row, or if the single shared file
        does not yield enough annotations for the rows.

    Notes
    -----
    Two layouts are handled. When the 'annot_path' column holds exactly
    one unique path, that file is loaded once and its annotations are
    paired with the rows by calling :func:`map_annotated_to_annot`
    on the 'audio_path' column. When there is a unique path per row,
    each file is simply loaded with :mod:`crowsetta`; the pairing is
    taken to be the one already recorded in the dataframe.
    """
    if annot_root:
        annot_root = pathlib.Path(annot_root)
        if not (annot_root.exists() and annot_root.is_dir()):
            raise NotADirectoryError(
                f"`annot_root` not found or not recognized as a directory: {annot_root}"
            )

    annot_format = format_from_df(dataset_df)
    if annot_format is None:
        return None

    scribe = crowsetta.Transcriber(format=annot_format)
    unique_annot_paths = dataset_df["annot_path"].unique()
    n_rows = len(dataset_df)

    if len(unique_annot_paths) == 1:
        # A single annotation file for the whole dataframe. Either
        # (1) many rows share one file, or
        # (2) there is only one row, hence only one file
        #     (which may still annotate several source files).
        annot_path = unique_annot_paths.item()
        if annot_root:
            annot_path = annot_root / annot_path
        loaded = scribe.from_file(annot_path).to_annot()

        if isinstance(loaded, crowsetta.Annotation) and n_rows == 1:
            # case 2: wrap in a list so the mapping function can iterate
            candidates = [loaded]
        elif isinstance(loaded, list) and len(loaded) >= n_rows:
            # case 1: at least as many annotations as rows
            candidates = loaded
        else:
            raise ValueError(
                "unable to load labels from dataframe; found a single annotation file associated with all "
                "rows in dataframe, but loading it did not return a list of annotations for each row.\n"
                f"Single annotation file: {annot_path}\n"
                f"Loading it returned a {type(loaded)}."
            )

        audio_paths = dataset_df["audio_path"].values
        audio_annot_map = map_annotated_to_annot(
            audio_paths, candidates, annot_format
        )
        # order the annotations like the rows of the dataframe
        return [audio_annot_map[audio_path] for audio_path in audio_paths]

    if len(unique_annot_paths) == n_rows:
        # One annotation file per row: load each of them in row order.
        annot_paths = dataset_df["annot_path"].values
        if annot_root:
            annot_paths = [annot_root / annot_path for annot_path in annot_paths]
        return [
            scribe.from_file(annot_path).to_annot() for annot_path in annot_paths
        ]

    raise ValueError(
        "unable to load labels from dataframe; did not find an annotation file for each row or "
        "a single annotation file associated with all rows."
    )


def files_from_dir(annot_dir, annot_format):
    """Collect the annotation files of one format found under a directory.

    The file extension to search for is taken from the
    :mod:`crowsetta` class registered for ``annot_format``;
    the search itself is delegated to ``files.from_dir``.

    Parameters
    ----------
    annot_dir : str or pathlib.Path
        Directory to search.
    annot_format : str
        Name of annotation format.
        Must be one of ``constants.VALID_ANNOT_FORMATS``.

    Returns
    -------
    annot_files
        Whatever ``files.from_dir(annot_dir, ext)`` returns
        for the chosen extension.

    Raises
    ------
    ValueError
        If ``annot_format`` is not a valid format, or if no extension
        could be chosen for it.
    """
    if annot_format not in constants.VALID_ANNOT_FORMATS:
        raise ValueError(
            f"specified annotation format, {annot_format} not valid.\n"
            f"Valid formats are: {constants.VALID_ANNOT_FORMATS}"
        )

    format_class = crowsetta.formats.by_name(annot_format)
    format_ext = format_class.ext

    # NOTE that by convention the `ext` attribute
    # of all Crowsetta annotation format classes
    # begins with a period
    ext = None
    if isinstance(format_ext, str):
        ext = format_ext
    elif isinstance(format_ext, tuple):
        # A format can have several valid extensions,
        # e.g., simple-seq has ``('.csv', '.txt')``;
        # pick one that actually has files directly inside ``annot_dir``.
        annot_dir_path = pathlib.Path(annot_dir)
        for candidate_ext in format_ext:
            matches = list(annot_dir_path.glob(f"*{candidate_ext}"))
            if matches:
                # no ``break``: the last extension with files is the one used
                ext = candidate_ext

    if ext is None:
        raise ValueError(
            f"Unable to determine which extension to use for format: {annot_format}. "
            f"Used extensions from class `{format_class}`, {format_class.ext}, "
            f"but no files were found with that/those extensions in annot_dir:\n{annot_dir}"
        )

    return files.from_dir(annot_dir, ext)


class AudioFilenameNotFoundError(Exception):
    """Error raised when the name of an audio file
    cannot be found within another filename.

    Raised by :func:`audio_filename_from_path` when it has removed
    every extension from a filename without finding an audio extension.
    """


def audio_filename_from_path(path: PathLike, audio_ext: str | None = None) -> str:
    """Find the name of an audio file within a filename
    by removing extensions until finding an audio extension,
    then return the name of that audio file
    without the extension (i.e., the "stem").

    Removes extensions from a filename one at a time,
    by calling `os.path.splitext`,
    until the extension is an audio file format handled by vak.
    Then returns the stem, that is, the part that precedes the extension.
    Used to match audio, spectrogram, and annotation files by their stems.

    Stops after finding an audio extension
    so that it does not remove "extensions"
    that are actually other parts of a filename,
    e.g. a time or date separated by periods.

    Examples
    --------
    >>> audio_filename_from_path('gy6or6_baseline_230312_0808.138.cbin.not.mat')
    'gy6or6_baseline_230312_0808.138'
    >>> audio_filename_from_path('Bird0/spectrograms/0.wav.npz')
    '0'
    >>> audio_filename_from_path('Bird0/Wave/0.wav')
    '0'

    Parameters
    ----------
    path : str, Path
        Path to a file that contains an audio filename in its name.
    audio_ext : str, optional
        Extension corresponding to format of audio file.
        Must be one of ``vak.constants.VALID_AUDIO_FORMATS``;
        a leading period is ignored and the comparison is case-insensitive.
        Default is None, in which case the function
        removes extensions until it finds any valid audio format extension.

    Returns
    -------
    stem : str
        Part of filename that precedes audio extension.

    Raises
    ------
    ValueError
        If ``audio_ext`` is not a valid audio format.
    AudioFilenameNotFoundError
        If no audio extension is found in the filename.
    """
    if audio_ext:
        # Extensions taken from the filename are lower-cased below,
        # so normalize the requested extension the same way;
        # otherwise e.g. "WAV" would be rejected or could never match.
        audio_ext = audio_ext.lower()
        if audio_ext.startswith("."):
            audio_ext = audio_ext[1:]
        if audio_ext not in constants.VALID_AUDIO_FORMATS:
            raise ValueError(
                f"Not a valid extension for audio formats: {audio_ext}\n"
                f"Valid formats are: {constants.VALID_AUDIO_FORMATS}"
            )
        extensions_to_look_for = [audio_ext]
    else:
        extensions_to_look_for = constants.VALID_AUDIO_FORMATS

    stem = pathlib.Path(path).name
    while True:
        new_stem, ext = os.path.splitext(stem)
        if ext.replace(".", "").lower() in extensions_to_look_for:
            return new_stem
        if new_stem == stem:
            # ``splitext`` removed nothing: there are no extensions left
            raise AudioFilenameNotFoundError(
                f"Unable to find a valid audio filename in path:\n{path}.\n"
                f"Valid audio file extensions are:\n{constants.VALID_AUDIO_FORMATS}"
            )
        stem = new_stem


class MapUsingNotatedPathError(Exception):
    """Error raised when :func:`vak.annotation._map_using_notated_path`
    cannot map the filename of an annotation file to the name of an annotated file.

    Derives from :class:`Exception` rather than :class:`BaseException`,
    so that generic ``except Exception`` handlers catch it like any
    other application error.
    """


def _map_using_notated_path(
    annotated_files: list[PathLike],
    annot_list: list[crowsetta.Annotation],
    audio_ext: Optional[str] = None,
) -> dict:
    """Map a :class:`list` of annotated files to a :class:`list`
    of :class:`crowsetta.Annotation` instances,
    using the ``notated_path`` attribute of the :class:`~crowsetta.Annotation`.

    This function assumes that the annotation format
    includes the names of the files that it annotates.
    This is necessarily true for any format that puts annotations
    for multiple annotated files into a single annotation file.

    One of two helper functions used by :func:`~vak.annotation.map_annotated_to_annot`.

    Parameters
    ----------
    annotated_files : list
        List of paths to the annotated files.
    annot_list : list
        List of ``crowsetta.Annotation`` instances.
    audio_ext : str
        Extension corresponding to audio format.
        Valid extensions are listed in ``vak.constants.VALID_AUDIO_FORMATS``.
        Default is None, in which case the function looks for any valid format.
        The same value is used for the annotations and for the annotated
        files, so that both sides produce comparable audio filenames.

    Returns
    -------
    annotated_annot_map : dict
        Where each key is path to annotated file (as a string), and
        its value is the corresponding ``crowsetta.Annotation``.

    Raises
    ------
    MapUsingNotatedPathError
        If no audio filename can be found in the ``notated_path``
        of an annotation, or if an annotated file has no matching annotation.
    ValueError
        If several annotations produce the same audio filename.
    """
    # Get the audio filename ("stem") for every annotation exactly once,
    # so the keys we check for duplicates are the very keys we look up below.
    stems = []
    for annot in annot_list:
        try:
            stem = audio_filename_from_path(annot.notated_path, audio_ext)
        except AudioFilenameNotFoundError as e:
            # Do this as a loop with a super verbose error
            # instead of e.g. a single-line list comprehension
            # so we can help users troubleshoot,
            # see https://github.com/vocalpy/vak/issues/525
            raise MapUsingNotatedPathError(
                "Unable to find an audio filename in the ``notated_path`` attribute of a ``crowsetta.Annotation``.\n"
                f"The ``notated_path`` attribute was:\n{annot.notated_path}\n"
                f"The annotation was loaded from this path:\n{annot.annot_path}\n"
                f"The full annotation is:\n{annot}"
            ) from e
        stems.append(stem)

    # Duplicate keys would silently overwrite each other in the dict below.
    duplicates = [item for item, count in Counter(stems).items() if count > 1]
    if duplicates:
        raise ValueError(
            f"found multiple annotations with the same audio filename(s): {duplicates}"
        )

    # ----> dict with audio filenames as keys, so we can look up annotations
    # by getting the same filename from the annotated files themselves.
    audio_filename_annot_map = dict(zip(stems, annot_list))

    # We return dict[str: annot] since we will always have paths as strings
    # in DataFrame columns and we want to use those strings to index into
    # this dictionary. Every annotated file either gets an annotation
    # or raises here, so no separate bookkeeping of unmapped files is needed.
    annotated_annot_map = {}
    for annotated_file in annotated_files:
        # stem annotated file so we can find audio OR spect files
        # that match with stems from each annot.notated_path;
        # e.g. find '~/path/to/llb3/llb3_0003_2018_04_23_14_18_54.wav.mat' that
        # should match with ``Annotation(notated_path='llb3_0003_2018_04_23_14_18_54.wav')``
        audio_filename_from_annotated_file = audio_filename_from_path(
            annotated_file, audio_ext
        )
        try:
            annot = audio_filename_annot_map[audio_filename_from_annotated_file]
        except KeyError as e:
            raise MapUsingNotatedPathError(
                "Could not map an annotation to an annotated file path "
                "using `vak.annotation.audio_filename_from_path` to get "
                "an audio filename from the annotated file path.\n"
                f"The annotated file path:\n{annotated_file}\n"
                "The audio filename found using `vak.annotation.audio_filename_from_path` "
                f"was:\n{audio_filename_from_annotated_file}"
            ) from e
        annotated_annot_map[str(annotated_file)] = annot

    return annotated_annot_map


class MapUsingExtensionError(Exception):
    """Error raised when :func:`vak.annotation._map_using_ext`
    cannot map the filename of an annotation file to the name of an annotated file.

    Derives from :class:`Exception` rather than :class:`BaseException`,
    so that generic ``except Exception`` handlers catch it like any
    other application error.
    """


def _map_using_ext(
    annotated_files: list[PathLike],
    annot_list: list[crowsetta.Annotation],
    annot_format: str,
    method: str,
    annotated_ext: str | None = None,
) -> dict:
    """Map a list of annotated files to a :class:`list` of
    :class:`crowsetta.Annotation` instances,
    by either removing the extension of the annotation format,
    or replacing it with the extension of the annotated file format.

    This function assumes a one-to-one mapping between
    annotation files and the files they annotate,
    and that the name of the annotated file is the name of the
    annotation file with its format-specific extension removed,
    e.g., a file in a csv-based format named 'bird1.wav.csv'
    annotates a file named `bird1.wav`.

    One of two helper functions used by :func:`~.vak.annotation.map_annotated_to_annot`.

    Parameters
    ----------
    annotated_files : list
        List of paths to the annotated files.
    annot_list : list
        List of ``crowsetta.Annotation`` instances.
    annot_format : str
        String name of annotation format.
        Valid names are listed in ``vak.constants.VALID_ANNOT_FORMATS``.
    method : str
        The "method" used to determine the annotated file name
        from the annotation file name. One of {'remove', 'replace'}.
        Corresponds to either removing the extension for the
        annotation file format, or replacing its extension with the
        extension of the annotated format.
    annotated_ext : str, optional
        Extension of the annotated files, including the leading period,
        appended when ``method`` is 'replace'. Default is None, in which
        case the single suffix shared by ``annotated_files`` is used.

    Returns
    -------
    annotated_annot_map : dict
        Where each key is path to annotated file (as a string), and
        its value is the corresponding ``crowsetta.Annotation``.

    Raises
    ------
    ValueError
        If ``method`` is not valid, or if ``method`` is 'replace',
        ``annotated_ext`` is None, and the annotated files have
        more than one suffix.
    MapUsingExtensionError
        If the annotated file name cannot be determined for an annotation,
        or if an annotated file has no matching annotation.
    """
    if method not in {"remove", "replace"}:
        raise ValueError(
            f"`method` must be one of: {{'remove', 'replace'}}, but was: '{method}'"
        )

    annotated_files = [
        pathlib.Path(annotated_file) for annotated_file in annotated_files
    ]

    if method == "replace" and annotated_ext is None:
        annotated_ext_set = {
            annotated_file.suffix for annotated_file in annotated_files
        }
        if len(annotated_ext_set) > 1:
            raise ValueError(
                "Found more than one extension in annotated files, "
                "unclear which extension to use when mapping to annotations "
                f"with 'replace' method. Extensions found: {annotated_ext_set}"
            )
        # The set is empty when there are no annotated files; nothing is
        # looked up in that case, so any placeholder extension will do.
        annotated_ext = annotated_ext_set.pop() if annotated_ext_set else ""

    annot_class = crowsetta.formats.by_name(annot_format)

    # ---- make the dict that maps name of annotated files to crowsetta.Annotations
    # We do this using names instead of using the full paths so that this function
    # can be directory agnostic, i.e., we ignore the parent path and just use the
    # filename to do the matching. For this function we assume 1:1 mapping between
    # annotated and annotation files.
    annotated_filename_annot_map = {}
    for annot in annot_list:
        annot_name = annot.annot_path.name
        annotated_name = None
        # NOTE that by convention the `ext` attribute
        # of all Crowsetta annotation format classes
        # begins with a period.
        # Only the *trailing* extension is removed: ``str.replace`` would also
        # delete the same text wherever else it occurs inside the filename.
        if isinstance(annot_class.ext, str):
            if annot_name.endswith(annot_class.ext):
                annotated_name = annot_name[: len(annot_name) - len(annot_class.ext)]
            else:
                # name does not carry the extension; use it unchanged
                annotated_name = annot_name
        elif isinstance(annot_class.ext, tuple):
            # handle the case where an annotation format can have multiple extensions,
            # e.g., ``Format.ext == ('.csv', '.txt')``
            for ext in annot_class.ext:
                if annot_name.endswith(ext):
                    annotated_name = annot_name[: len(annot_name) - len(ext)]
                    break

        if annotated_name is None:
            raise MapUsingExtensionError(
                "Could not determine annotated file from annotation path, "
                f"using extension '{annot_class.ext}' from class '{annot_class.__name__}' "
                f"associated with format '{annot_format}'. "
                f"Annotation path was:\n{annot.annot_path}"
            )

        # NOTE we don't have to do anything else for method=='remove'
        # since we just removed the extension
        if method == "replace":
            annotated_name = annotated_name + annotated_ext
        annotated_filename_annot_map[annotated_name] = annot

    # We return dict[str: annot] since we will always have paths as strings
    # in DataFrame columns and we want to use those strings to index into
    # this dictionary. Every annotated file either gets an annotation
    # or raises here, so no separate bookkeeping of unmapped files is needed.
    annotated_annot_map = {}
    for annotated_file in annotated_files:
        try:
            annot = annotated_filename_annot_map[annotated_file.name]
        except KeyError as e:
            raise MapUsingExtensionError(
                f"Did not find an annotation that produced annotated file: {annotated_file}"
            ) from e
        annotated_annot_map[str(annotated_file)] = annot

    return annotated_annot_map


def map_annotated_to_annot(
    annotated_files: Union[list, np.ndarray],
    annot_list: list[crowsetta.Annotation],
    annot_format: str,
    annotated_ext: str | None = None,
) -> dict:
    """Map annotated files, i.e. audio or spectrogram files,
    to their corresponding annotations.

    This function implements the three different ways that vak
    can map annotated files to their annotations.

    The first is when a single annotation file contains multiple annotations,
    and so the format by necessity must include the file annotated by each annotation.
    The second assumes that the annotated file can be determined programmatically
    by removing the extension from the annotation file,
    e.g. 'bird1.wav.csv' -> 'bird1.wav'.
    The third assumes that the annotated file can be determined
    by replacing the extension of the annotation file
    with the extension of the annotated file,
    e.g. 'bird1.csv' -> 'bird1.wav'.

    Mapping is done with two helper functions,
    :func:`~vak.annotation._map_using_notated_path` and
    :func:`~vak.annotation._map_using_ext`.
    The function :func:`~vak.annotation._map_using_notated_path`
    is used for annotation formats that include the name of the annotated file.
    The names of these formats (in :mod:`crowsetta`) are:
    {'birdsong-recognition-dataset', 'generic-seq', 'yarden'}.
    The other function is used for all other formats,
    and it assumes a one-to-one mapping from annotation file to annotated file.
    It is first called with method 'remove', and only if that fails
    is it called again with method 'replace'.

    Parameters
    ----------
    annotated_files : list or numpy.ndarray
        Of paths to audio or spectrogram files.
        An array is converted to a list.
    annot_list : list
        Of Annotations corresponding to files in annotated_files.
    annot_format : str
        Name of the annotation format; determines which helper is used.
    annotated_ext : str, optional
        Extension of annotated files. Only passed on when the 'replace'
        method is tried. Default is None, in which case the extension
        is determined from ``annotated_files`` themselves.

    Returns
    -------
    annotated_annot_map : dict
        Where each key is a path to an annotated file,
        and the value for each key is a :class:`crowsetta.Annotation`.

    Raises
    ------
    ValueError
        If neither the 'remove' nor the 'replace' method
        can map the annotated files to annotations.

    Notes
    -----
    For more detail, please see the page on file naming conventions
    in the reference section of the documentation:
    https://vak.readthedocs.io/en/latest/reference/filenames.html
    """
    if isinstance(
        annotated_files, np.ndarray
    ):  # e.g., vak DataFrame['spect_path'].values
        annotated_files = annotated_files.tolist()

    if annot_format in (
        "birdsong-recognition-dataset",
        "yarden",
        "generic-seq",
    ):
        annotated_annot_map = _map_using_notated_path(
            annotated_files, annot_list
        )
    else:
        try:
            annotated_annot_map = _map_using_ext(
                annotated_files, annot_list, annot_format, method="remove"
            )
        except MapUsingExtensionError:
            # The retry stays nested inside this handler on purpose, so that
            # the first failure remains attached to the traceback as context.
            try:
                annotated_annot_map = _map_using_ext(
                    annotated_files,
                    annot_list,
                    annot_format,
                    method="replace",
                    annotated_ext=annotated_ext,
                )
            except MapUsingExtensionError as e:
                raise ValueError(
                    "Could not map annotated files to annotations.\n"
                    "Please see this section in the `vak` documentation:\n"
                    "https://vak.readthedocs.io/en/latest/howto/howto_prep_annotate.html"
                    "#how-does-vak-know-which-annotations-go-with-which-annotated-files"
                ) from e

    return annotated_annot_map


def has_unlabeled(annot: crowsetta.Annotation, duration: float) -> bool:
    """Returns ``True`` if an annotated sequence has unlabeled segments.

    Tests whether an instance of ``crowsetta.Annotation.seq`` has
    intervals between the annotated segments with a non-zero duration,
    or any unannotated periods before or after the annotated segments.

    Parameters
    ----------
    annot : crowsetta.Annotation
        A :class:`crowsetta.Annotation` with a ``seq`` attribute
        (that is a :class:`crowsetta.Sequence`).
    duration : float
        Total duration of the vocalization that is annotated by ``annot``.
        Needed to determine whether the duration is greater than
        the time of the last offset in the annotated segments.

    Returns
    -------
    has_unlabeled : bool
        If True, there are unlabeled periods
        in the vocalization annotated by ``annot``.
        Always a builtin :class:`bool`, never a NumPy boolean.

    Raises
    ------
    ValueError
        If ``duration`` is less than or equal to zero.
    """
    if duration <= 0:
        raise ValueError(
            "Duration less than or equal to zero passed to ``has_unlabeled``.\n"
            f"Value for ``duration``: {duration}.\nValue for ``annot``: {annot}"
        )

    seq = annot.seq
    if len(seq.segments) < 1:
        # Handle edge case where there are no annotated segments in annotation file
        # See https://github.com/vocalpy/vak/issues/378
        return True

    onsets_s, offsets_s = seq.onsets_s, seq.offsets_s
    # gap between the offset of one segment and the onset of the next
    has_unlabeled_intervals = np.any((onsets_s[1:] - offsets_s[:-1]) > 0.0)
    has_unlabeled_before_first_onset = onsets_s[0] > 0.0
    has_unlabeled_after_last_offset = duration - offsets_s[-1] > 0.0

    # The comparisons above yield NumPy booleans; convert so that
    # the declared return type holds.
    return bool(
        has_unlabeled_intervals
        or has_unlabeled_before_first_onset
        or has_unlabeled_after_last_offset
    )