def to_map(labelset: set, map_unlabeled: bool = True) -> dict:
    """Build a 'labelmap': a ``dict`` that maps each label to an integer.

    Labels are sorted and assigned consecutive integers starting at 0.
    This 'labelmap' is used when mapping labels from annotations
    of a vocalization into a label for every time bin
    in a spectrogram of that vocalization.

    If ``map_unlabeled`` is True, the key 'unlabeled' is placed
    in front of the sorted labels, so that it maps to 0
    and every label from ``labelset`` is shifted up by one.
    For a ``labelset`` with n labels, the mapping then has
    n + 1 classes with integers 0 through n; otherwise it has
    n classes with integers 0 through n - 1.

    Parameters
    ----------
    labelset : set
        Set of labels used to annotate a dataset.
    map_unlabeled : bool
        If True, include key 'unlabeled' in mapping.
        Any time bins in a spectrogram that do not have
        a label associated with them, e.g. a silent gap
        between vocalizations, will be assigned the integer
        that the 'unlabeled' key maps to.

    Returns
    -------
    labelmap : dict
        Maps labels to integers.

    Raises
    ------
    TypeError
        If ``labelset`` is not an instance of ``set``.
    """
    if not isinstance(labelset, set):
        raise TypeError(
            f"type of labelset must be set, got type {type(labelset)}"
        )

    # sorting makes the label -> integer assignment reproducible,
    # since iteration order of a set is arbitrary
    ordered_labels = sorted(labelset)
    if map_unlabeled is True:
        # 'unlabeled' always goes first so that it is mapped to 0
        ordered_labels.insert(0, "unlabeled")

    return {label: ind for ind, label in enumerate(ordered_labels)}
[docs]defto_set(labels_list:list[np.ndarray|list])->set:"""Given a list of labels from annotations, return the set of (unique) labels. Parameters ---------- labels_list : list Of labels from annotations, either a list of numpy.ndarrays or a list of lists. Returns ------- labelset : set Unique set of labels found in ``labels_list``. Examples -------- >>> labels_list = [voc.annot.labels for voc in vds.voc_list] >>> labelset = to_set(labels_list) >>> print(labelset) {'a', 'b', 'c', 'd', 'e'} """all_labels=[lblforlabelsinlabels_listforlblinlabels]labelset=set(all_labels)returnlabelset
[docs]deffrom_df(dataset_df:pd.DataFrame,dataset_path:str|pathlib.Path)->list[np.ndarray]:"""Returns labels for each vocalization in a dataset. Takes Pandas DataFrame representing the dataset, loads annotation for each row in the DataFrame, and then returns labels from each annotation. Parameters ---------- dataset_df : pandas.DataFrame created by vak.io.dataframe.from_files Returns ------- labels : list of array-like, labels for each vocalization in the dataset. """dataset_path=pathlib.Path(dataset_path)ifnotdataset_path.exists()ornotdataset_path.is_dir():raiseNotADirectoryError(f"`dataset_path` not found or not recognized as a directory: {dataset_path}")annots=annotation.from_df(dataset_df,dataset_path)return[annot.seq.labelsforannotinannots]
ALPHANUMERIC = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"
# Pool of single-character stand-in labels.
# Alphanumeric characters come first since they are more human readable,
# followed by a large range of characters not typically used as labels.
# The mapping can be arbitrary as long as it's consistent.
DUMMY_SINGLE_CHAR_LABELS = tuple(ALPHANUMERIC) + tuple(
    map(chr, range(162, 2000))
)


# added to fix https://github.com/NickleDave/vak/issues/373
def multi_char_labels_to_single_char(
    labelmap: dict, skip: tuple[str, ...] = ("unlabeled",)
) -> dict:
    """Return a copy of a ``labelmap`` where any labels that are
    strings with multiple characters are converted to single characters.

    This makes it possible to correctly compute metrics
    like Levenshtein edit distance.

    Labels that are strings with multiple characters are replaced by
    a single-character label from the constant
    ``vak.labels.DUMMY_SINGLE_CHAR_LABELS``, after leaving out
    any character that is already a key in ``labelmap``.
    The replacement is grabbed with the index that
    the multi-character label has among the sorted keys of ``labelmap``.

    Parameters
    ----------
    labelmap : dict
        That maps human-readable string labels to integers.
        As returned by ``vak.labels.to_map``.
    skip : tuple
        Of strings, labels to leave as multiple characters.
        Default is ('unlabeled',).

    Returns
    -------
    labelmap : dict
        New ``dict`` where any keys with multiple characters in string
        are converted to dummy single characters.
        The ``labelmap`` passed in is never modified or returned itself.

    Raises
    ------
    ValueError
        If there are not enough unused single-character labels
        to replace every multiple-character label.
    """
    # Sort to be extra sure we get the same order every time.
    # Same order forces the mapping to single characters
    # to be deterministic across function calls.
    current_str_labels = sorted(labelmap)
    if all(len(lbl) == 1 for lbl in current_str_labels):
        # no need to do re-mapping; still return a copy, as documented,
        # so callers can not mutate the original through the result
        return dict(labelmap)

    # We only use single character labels that are not already in labelmap,
    # to avoid over-writing a single-character label from the original labelmap
    # with the same single-character from DUMMY_SINGLE_CHAR_LABELS,
    # which would map it to a new integer and cause us to lose
    # the original integer from the mapping
    single_char_labels_not_in_labelmap = [
        lbl for lbl in DUMMY_SINGLE_CHAR_LABELS if lbl not in labelmap
    ]
    n_available = len(single_char_labels_not_in_labelmap)

    # Positions, among the sorted labels, of the labels that get replaced.
    # Labels in ``skip`` are not counted since they are never replaced.
    inds_to_remap = [
        ind
        for ind, lbl in enumerate(current_str_labels)
        if len(lbl) > 1 and lbl not in skip
    ]
    # The replacement is indexed by position among *all* sorted labels,
    # so it is the largest position that has to fit, not just the count;
    # checking only the count would let an IndexError escape below.
    if inds_to_remap and inds_to_remap[-1] >= n_available:
        raise ValueError(
            f"Need to remap {len(inds_to_remap)} multiple-character labels "
            f"but there are only {n_available} available "
            f"(replacement at index {inds_to_remap[-1]} is required)."
        )

    new_labelmap = {}
    for dummy_label_ind, label_str in enumerate(current_str_labels):
        label_int = labelmap[label_str]
        if len(label_str) > 1 and label_str not in skip:
            # replace with dummy label
            new_label_str = single_char_labels_not_in_labelmap[dummy_label_ind]
            new_labelmap[new_label_str] = label_int
        else:
            # single characters and skipped labels are kept as they are
            new_labelmap[label_str] = label_int
    return new_labelmap