from __future__ import annotations
import copy
import os
import pathlib
from collections import Counter
from typing import Optional, Union
import crowsetta
import numpy as np
import pandas as pd
from . import constants, files
from .typing import PathLike
[docs]
def from_df(
dataset_df: pd.DataFrame, annot_root: str | pathlib.Path | None = None
) -> list[crowsetta.Annotation] | None:
"""Get list of annotations from a dataframe
representing a dataset.
If no annotation format is specified for the dataframe
(in the 'annot_format' column), returns None.
Parameters
----------
dataset_df : DataFrame
Dataframe representing a dataset of vocalizations,
with columns 'annot_format' and 'annot_path'.
annot_root : str or pathlib.Path, optional
Path to root of directory where annotation files are located.
If specified, then paths in the DataFrame from the 'annot_path' column
are constructed relative to ``annot_root``.
Default is None, in which case 'annot_paths' are used directly,
as if they were absolute paths.
Returns
-------
annots : list
of annotations for each row in the dataframe,
represented as crowsetta.Annotation instances.
Notes
-----
This function encapsulates logic for handling different types of
annotations; it determines whether each row has a separate annotation file,
or if instead there is a single annotation file associated with all rows.
If the latter, then the function opens that file and makes sure that
each row from the dataframe can be paired with an annotation
(using :func:`vak.annotation.map_annotated_to_annot`).
If instead there is a unique annotation file per row in the dataframe,
the format of the annotation files is determined with
:func:`vak.annotation.format_from_df` and then each file is opened
with :module:`crowsetta` -- in other words, we assume the mapping
was already done when preparing the dataset, and that each row contains
an annotation file paired with the file it annotates.
"""
if annot_root:
annot_root = pathlib.Path(annot_root)
if not annot_root.exists() or not annot_root.is_dir():
raise NotADirectoryError(
f"`annot_root` not found or not recognized as a directory: {annot_root}"
)
annot_format = format_from_df(dataset_df)
if annot_format is None:
return None
scribe = crowsetta.Transcriber(format=annot_format)
if len(dataset_df["annot_path"].unique()) == 1:
# --> there is a single annotation file associated with all rows
# this can be true in two different cases:
# (1) many rows, all have the same file
# (2) only one row, so there's only one annotation file (which may contain annotation for multiple source files)
annot_path = dataset_df["annot_path"].unique().item()
if annot_root:
annot_path = annot_root / annot_path
annots = scribe.from_file(annot_path).to_annot()
# as long as we have at least as many annotations as there are rows in the dataframe
if (
isinstance(annots, list) and len(annots) >= len(dataset_df)
) or ( # case 1
isinstance(annots, crowsetta.Annotation) and len(dataset_df) == 1
): # case 2
if isinstance(annots, crowsetta.Annotation):
annots = [
annots
] # wrap in list for map_annotated_to_annot to iterate over it
# then we can try and map those annotations to the rows
audio_annot_map = map_annotated_to_annot(
dataset_df["audio_path"].values, annots, annot_format
)
# sort by row of dataframe
annots = [
audio_annot_map[audio_path]
for audio_path in dataset_df["audio_path"].values
]
else:
raise ValueError(
"unable to load labels from dataframe; found a single annotation file associated with all "
"rows in dataframe, but loading it did not return a list of annotations for each row.\n"
f"Single annotation file: {annot_path}\n"
f"Loading it returned a {type(annots)}."
)
elif len(dataset_df["annot_path"].unique()) == len(dataset_df):
# --> there is a unique annotation file (path) for each row, iterate over them to get labels from each
annot_paths = dataset_df["annot_path"].values
if annot_root:
annot_paths = [
annot_root / annot_path for annot_path in annot_paths
]
annots = [
scribe.from_file(annot_path).to_annot()
for annot_path in annot_paths
]
else:
raise ValueError(
"unable to load labels from dataframe; did not find an annotation file for each row or "
"a single annotation file associated with all rows."
)
return annots
[docs]
def files_from_dir(annot_dir, annot_format):
"""Get all annotation files of a given format
from a directory or its sub-directories,
using the file extension associated with that annotation format.
"""
if annot_format not in constants.VALID_ANNOT_FORMATS:
raise ValueError(
f"specified annotation format, {annot_format} not valid.\n"
f"Valid formats are: {constants.VALID_ANNOT_FORMATS}"
)
format_class = crowsetta.formats.by_name(annot_format)
# handle the case where an annotation format can have more than one valid extension,
# e.g., simple-seq has ``('.csv', '.txt')`` as extensions
ext = None
if isinstance(format_class.ext, str):
# NOTE that by convention the `ext` attribute
# of all Crowsetta annotation format classes
# begins with a period
ext = format_class.ext
elif isinstance(format_class.ext, tuple):
# then we actually have to determine whether there's any files for either format
for ext_to_test in format_class.ext:
if (
len(sorted(pathlib.Path(annot_dir).glob(f"*{ext_to_test}")))
> 0
):
ext = ext_to_test
if ext is None:
raise ValueError(
f"Unable to determine which extension to use for format: {annot_format}. "
f"Used extensions from class `{format_class}`, {format_class.ext}, "
f"but no files were found with that/those extensions in annot_dir:\n{annot_dir}"
)
annot_files = files.from_dir(annot_dir, ext)
return annot_files
[docs]
class AudioFilenameNotFoundError(Exception):
"""Error raised when a name of an audio filename
cannot be found within another filename.
Raised by ``audio_filename_from_path``
and ``_map_using_audio_stem_from_path``.
"""
[docs]
def audio_filename_from_path(path: PathLike, audio_ext: str = None) -> str:
"""Find the name of an audio file within a filename
by removing extensions until finding an audio extension,
then return the name of that audio file
without the extension (i.e., the "stem").
Removes extensions from a filename recursively,
by calling `os.path.splitext`,
until the extension is an audio file format handled by vak.
Then return the stem, that is,
the part that precedes the extension.
Used to match audio, spectrogram,
and annotation files by their stems.
Stops after finding audio extensions
so that it does not remove "extensions"
that are actually other parts of a filename,
e.g. a time or data separated by periods.
Examples
--------
>>> audio_filename_from_path('gy6or6_baseline_230312_0808.138.cbin.not.mat')
'gy6or6_baseline_230312_0808.138'
>>> audio_filename_from_path('Bird0/spectrograms/0.wav.npz')
'0'
>>> audio_filename_from_path('Bird0/Wave/0.wav')
'0'
Parameters
----------
path : str, Path
Path to a file that contains an audio filename in its name.
audio_ext : str
Extension corresponding to format of audio file.
Must be one of ``vak.constants.VALID_AUDIO_FORMATS``.
Default is None, in which case the function looks
removes extensions until it finds any valid audio
format extension.
Returns
-------
stem : str
Part of filename that precedes audio extension.
"""
if audio_ext:
if audio_ext.startswith("."):
audio_ext = audio_ext[1:]
if audio_ext not in constants.VALID_AUDIO_FORMATS:
raise ValueError(
f"Not a valid extension for audio formats: {audio_ext}\n"
f"Valid formats are: {constants.VALID_AUDIO_FORMATS}"
)
extensions_to_look_for = [audio_ext]
else:
extensions_to_look_for = constants.VALID_AUDIO_FORMATS
name = pathlib.Path(path).name
stem, ext = os.path.splitext(name)
ext = ext.replace(".", "").lower()
while ext not in extensions_to_look_for:
new_stem, ext = os.path.splitext(stem)
ext = ext.replace(".", "").lower()
if new_stem == stem:
raise AudioFilenameNotFoundError(
f"Unable to find a valid audio filename in path:\n{path}.\n"
f"Valid audio file extensions are:\n{constants.VALID_AUDIO_FORMATS}"
)
else:
stem = new_stem
return stem
[docs]
class MapUsingNotatedPathError(BaseException):
"""Error raised when :func:`vak.annotation._map_using_notated_path`
cannot map the filename of an annotation file to the name
of an annotated file"""
pass
def _map_using_notated_path(
annotated_files: list[PathLike],
annot_list: list[crowsetta.Annotation],
audio_ext: Optional[str] = None,
) -> dict:
"""Map a :class:`list` of annotated files to a :class:`list`
of :class:`crowsetta.Annotation` instances,
using the ``notated_path`` attribute of the
:class:`~crowsetta.Annotation`.
This function assumes that the annotation format
includes the names of the files that it annotates.
This is necessarily true for any format that puts
annotations for multiple annotated files into a single
annotation file.
One of two helper functions used by
:func:`~vak.annotation.map_annotated_to_annot`.
Parameters
----------
annotated_files : list
List of paths to the annotated files.
annot_list : list
List of ``crowsetta.Annotation`` instances.
audio_ext : str
Extension corresponding to audio format.
Valid extension are listed in
``vak.constants.VALID_AUDIO_FORMATS``.
Default is None, in which case the function
looks for any valid format.
Returns
-------
annotated_annot_map : dict
Where each key is path to annotated file, and
its value is the corresponding ``crowsetta.Annotation``.
"""
# First check that we don't have duplicate keys that would cause this to fail silently
keys = []
for annot in annot_list:
try:
stem = audio_filename_from_path(annot.notated_path, audio_ext)
except AudioFilenameNotFoundError as e:
# Do this as a loop with a super verbose error
# instead of e.g. a single-line list comprehension
# so we can help users troubleshoot,
# see https://github.com/vocalpy/vak/issues/525
raise MapUsingNotatedPathError(
"Unable to find an audio filename in the ``notated_path`` attribute of a ``crowsetta.Annotation``."
f"The ``notated_path`` attribute was:\n{annot.notated_path}\n"
f"The annotation was loaded from this path:\n{annot.annot_path}\n"
f"The full annotation is:\n{annot}"
) from e
keys.append(stem)
keys_set = set(keys)
if len(keys_set) < len(keys):
duplicates = [
item for item, count in Counter(keys).items() if count > 1
]
raise ValueError(
f"found multiple annotations with the same audio filename(s): {duplicates}"
)
del keys, keys_set
# ----> make a dict with audio filenames as keys,
# so we can look up annotations
# by getting the same filename from the annotated files themselves,
# and using those as keys.
audio_filename_annot_map = {
# NOTE HERE WE GET FILENAMES FROM EACH annot.notated_path,
# BELOW we get filenames from each annotated_file
audio_filename_from_path(annot.notated_path): annot
for annot in annot_list
}
# Make a copy of ``annotated_files`` from which
# we remove files after mapping them to annotation,
# to validate that function worked,
# by making sure there are no items left in this copy after the loop.
# If there is 1:1 mapping then there should be no items left.
annotated_annot_map = {}
annotated_files_copy = copy.deepcopy(annotated_files)
for annotated_file in annotated_files:
# stem annotated file so we can find audio OR spect files
# that match with stems from each annot.notated_path;
# e.g. find '~/path/to/llb3/llb3_0003_2018_04_23_14_18_54.wav.mat' that
# should match with ``Annotation(notated_path='llb3_0003_2018_04_23_14_18_54.wav')``
audio_filename_from_annotated_file = audio_filename_from_path(
annotated_file
)
try:
annot = audio_filename_annot_map[
audio_filename_from_annotated_file
]
except KeyError as e:
raise MapUsingNotatedPathError(
"Could not map an annotation to an annotated file path "
"using `vak.annotation.audio_filename_from_path` to get "
"an audio filename from the annotated file path."
f"The annotated file path:\n{annotated_file} "
"The audio filename found using `vak.annotation.audio_filename_from_path` "
f"was:\n{audio_filename_from_annotated_file}"
) from e
annotated_annot_map[annotated_file] = annot
annotated_files_copy.remove(annotated_file)
if len(annotated_files_copy) > 0:
raise MapUsingNotatedPathError(
"Could not map the following source files to annotations: "
f"{annotated_files_copy}"
)
# we return dict[str: annot] since we will always have paths as strings in DataFrame columns
# and we want to use those strings to index into this dictionary
return {str(path): annot for path, annot in annotated_annot_map.items()}
[docs]
class MapUsingExtensionError(BaseException):
"""Error raised when :func:`vak.annotation._map_using_ext`
cannot map the filename of an annotation file to the name
of an annotated file"""
pass
def _map_using_ext(
annotated_files: list[PathLike],
annot_list: list[crowsetta.Annotation],
annot_format: str,
method: str,
annotated_ext: str | None = None,
) -> dict:
"""Map a list of annotated files to a :class:`list` of
:class:`crowsetta.Annotation` instances,
by either removing the extension of the annotation format,
or replacing it with the extension of the annotated file format.
This function assumes a one-to-one mapping between
annotation files and the files they annotate.
and that the name of the annotated file is
the name of the annotation file with its
format-specific extension removed,
e.g., a file in a csv-based format named 'bird1.wav.csv'
annotates a file named `bird1.wav`.
One of two helper functions used by
:func:`~.vak.annotation.map_annotated_to_annot`.
Parameters
----------
annotated_files : list
List of paths to the annotated files.
annot_list : list
List of ``crowsetta.Annotation`` instances.
annot_format : str
String name of annotation format
Valid names are listed in
``vak.constants.VALID_ANNOT_FORMATS``.
method: str
The "method" used to determine the annotated
file name from the annotation file name.
One of {'remove', 'replace'}.
Corresponds to either removing the extension
for the annotation file format, or replacing
its extension with the extension of the annotated
format.
Returns
-------
annotated_annot_map : dict
Where each key is path to annotated file, and
its value is the corresponding ``crowsetta.Annotation``.
"""
if method not in {"remove", "replace"}:
raise ValueError(
f"`method` must be one of: {{'remove', 'replace'}}, but was: '{method}'"
)
annotated_files = [
pathlib.Path(annotated_file) for annotated_file in annotated_files
]
if method == "replace":
if annotated_ext is None:
annotated_ext_set = set(
[annotated_file.suffix for annotated_file in annotated_files]
)
if len(annotated_ext_set) > 1:
raise ValueError(
"Found more than one extension in annotated files, "
"unclear which extension to use when mapping to annotations "
f"with 'replace' method. Extensions found: {annotated_ext_set}"
)
annotated_ext = annotated_ext_set.pop()
annot_class = crowsetta.formats.by_name(annot_format)
# ---- make the dict that maps name of annotated files to crowsetta.Annotations
# We do this using names instead of using the full paths so that this function
# can be directory agnostic, i.e., we ignore the parent path and just use the filename
# to do the matching. Currently `vak` assumes at a higher level that annotation files
# and annotated files exist in the same `data_dir` but I am trying to write this
# function in a slightly more general way. Not obvious to me if there's a way this could backfire.
# For this function we assume 1:1 mapping between annotated and annotation files,
# so they probably need to be unique filenames anyway regardless of what dir they are in?
annotated_filename_annot_map = {}
for annot in annot_list:
annotated_name = None
if isinstance(annot_class.ext, str):
# NOTE that by convention the `ext` attribute
# of all Crowsetta annotation format classes
# begins with a period
annotated_name = annot.annot_path.name.replace(annot_class.ext, "")
elif isinstance(annot_class.ext, tuple):
# handle the case where an annotation format can have multiple extensions,
# e.g., ``Format.ext == ('.csv', '.txt')``
for ext in annot_class.ext:
if annot.annot_path.name.endswith(ext):
annotated_name = annot.annot_path.name.replace(ext, "")
break
if annotated_name is None:
raise MapUsingExtensionError(
"Could not determine annotated file from annotation path, "
f"using extension '{annot_class.ext}' from class '{annot_class.__name__}' "
f"associated with format '{annot_format}'. "
f"Annotation path was:\n{annot.annot_path}"
)
# NOTE we don't have to do anything else for method=='remove'
# since we just removed the extension
if method == "replace":
annotated_name = annotated_name + annotated_ext
annotated_filename_annot_map[annotated_name] = annot
annotated_annot_map = {} # this is what we will return
# Make a copy of ``annotated_files`` from which
# we remove files after mapping them to annotation,
# to validate that function worked,
# by making sure there are no items left in this copy after the loop.
# If there is 1:1 mapping then there should be no items left.
annotated_files_copy = copy.deepcopy(annotated_files)
for annotated_file in annotated_files:
try:
annot = annotated_filename_annot_map[annotated_file.name]
annotated_files_copy.remove(annotated_file)
except KeyError as e:
raise MapUsingExtensionError(
f"Did not find an annotation that produced annotated file: {annotated_file}"
) from e
annotated_annot_map[annotated_file] = annot
if len(annotated_files_copy) > 0:
raise MapUsingExtensionError(
"Could not map the following source files to annotations: "
f"{annotated_files_copy}"
)
# we return dict[str: annot] since we will always have paths as strings in DataFrame columns
# and we want to use those strings to index into this dictionary
return {str(path): annot for path, annot in annotated_annot_map.items()}
[docs]
def map_annotated_to_annot(
annotated_files: Union[list, np.array],
annot_list: list[crowsetta.Annotation],
annot_format: str,
annotated_ext: str | None = None,
) -> dict:
"""Map annotated files,
i.e. audio or spectrogram files,
to their corresponding annotations.
This function implements the three different ways that
vak can map annotated files to their annotations.
The first is when a single annotation file contains
multiple annotations, and so the format by necessity
must include the file annotated by each annotation.
The second assumes that the annotated file can be determined
programmatically by removing the extension from the annotation file,
e.g. 'bird1.wav.csv' -> 'bird1.wav'.
The third assumes that the annotated file can be determined
by replacing the extension of the annotation file
with the extension of the annotated file,
e.g. 'bird1.csv' -> 'bird1.wav'.
Returns a :class:`dict` where each key
is a path to an annotated file,
and the value for each key
is a :class:`crowsetta.Annotation`.
Mapping is done with two helper functions,
:func:`~vak.annotation._map_using_notated_path` and
:func:`~vak.annotation._map_using_ext`.
The function :func:`~vak.annotation._map_using_notated_path`
is used for annotation formats that include
the name of the annotated file.
The names of these formats (in :module:`crowsetta`) are:
{'birdsong-recognition-dataset', 'generic-seq', 'yarden'}.
The other function is are used for all other formats,
and it assumes a one-to-one mapping from annotation file
to annotated file.
It assumes that the name of the annotated file
can be found by removing the extension of the annotation
format, e.g., 'bird1.wav.csv` -> 'bird1.wav'.
The second, that is used if the first fails,
assumes the name of the annotated file
can be found by replacing the extension of the annotation
format with the extension of the annotated files.
Parameters
----------
annotated_files : list
Of paths to audio or spectrogram files.
annot_list : list
Of Annotations corresponding to files in annotated_files
annotated_ext : str
Extension of annotated files.
Default is None, in which case this function will
look for extensions of any valid audio format
(listed as ``vak.constants.VALID_AUDIO_FORMAT``).
Specifying the format provides a slight speed up.
Notes
-----
For more detail, please see
the page on file naming conventions in the
reference section of the documentation:
https://vak.readthedocs.io/en/latest/reference/filenames.html
"""
if isinstance(
annotated_files, np.ndarray
): # e.g., vak DataFrame['spect_path'].values
annotated_files = annotated_files.tolist()
if annot_format in (
"birdsong-recognition-dataset",
"yarden",
"generic-seq",
):
annotated_annot_map = _map_using_notated_path(
annotated_files, annot_list
)
else:
try:
annotated_annot_map = _map_using_ext(
annotated_files, annot_list, annot_format, method="remove"
)
except MapUsingExtensionError:
try:
annotated_annot_map = _map_using_ext(
annotated_files,
annot_list,
annot_format,
method="replace",
annotated_ext=annotated_ext,
)
except MapUsingExtensionError as e:
raise ValueError(
"Could not map annotated files to annotations.\n"
"Please see this section in the `vak` documentation:\n"
"https://vak.readthedocs.io/en/latest/howto/howto_prep_annotate.html"
"#how-does-vak-know-which-annotations-go-with-which-annotated-files"
) from e
return annotated_annot_map
[docs]
def has_unlabeled(annot: crowsetta.Annotation, duration: float) -> bool:
"""Returns ``True`` if an annotated sequence has unlabeled segments.
Tests whether an instance of ``crowsetta.Annotation.seq`` has
intervals between the annotated segments with a non-zero duration,
or any unannotated periods before or after the annotated segments.
Parameters
----------
annot : crowsetta.Annotation
A :class:`crowsetta.Annotation` with a ``seq`` attribute
(that is a :class:`crowsetta.Sequence`).
duration : float
Total duration of the vocalization
that is annotated by ``annot``.
Needed to determine whether the duration
is greater than the time
of the last offset in the annotated segments.
Returns
-------
has_unlabeled : bool
If True, there are unlabeled periods
in the vocalization annotated by ``annot``.
"""
if duration <= 0:
raise ValueError(
f"Duration less than or equal to zero passed to ``has_unlabeled``.\n"
f"Value for ``duration``: {duration}.\nValue for ``annot``: {annot}"
)
if duration > 0 and len(annot.seq.segments) < 1:
# Handle edge case where there are no annotated segments in annotation file
# See https://github.com/vocalpy/vak/issues/378
return True
has_unlabeled_intervals = np.any(
(annot.seq.onsets_s[1:] - annot.seq.offsets_s[:-1]) > 0.0
)
has_unlabeled_before_first_onset = annot.seq.onsets_s[0] > 0.0
has_unlabeled_after_last_offset = duration - annot.seq.offsets_s[-1] > 0.0
return (
has_unlabeled_intervals
or has_unlabeled_before_first_onset
or has_unlabeled_after_last_offset
)