Source code for vak.prep.parametric_umap.dataset_arrays

"""Helper functions for `vak.prep.dimensionality_reduction` module
that handle array files.
"""

from __future__ import annotations

import logging
import pathlib
import shutil

import pandas as pd

logger = logging.getLogger(__name__)


def move_files_into_split_subdirs(
    dataset_df: pd.DataFrame, dataset_path: pathlib.Path, purpose: str
) -> None:
    """Move npy files in dataset into sub-directories,
    one for each split in the dataset.

    This is run *after* calling
    :func:`vak.prep.unit_dataset.prep_unit_dataset`
    to generate ``dataset_df``.

    Parameters
    ----------
    dataset_df : pandas.DataFrame
        A ``pandas.DataFrame`` returned by
        :func:`vak.prep.unit_dataset.prep_unit_dataset`
        with a ``'split'`` column added, as a result of calling
        :func:`vak.prep.split.unit_dataframe`, or because it was added
        "manually" by calling :func:`vak.core.prep.prep_helper.add_split_col`
        (as is done for 'predict' when the entire ``DataFrame``
        belongs to this "split").
    dataset_path : pathlib.Path
        Path to directory that represents dataset.
    purpose : str
        A string indicating what the dataset will be used for.
        One of {'train', 'eval', 'predict', 'learncurve'}.
        Determined by :func:`vak.core.prep.prep`
        using the TOML configuration file.

    Returns
    -------
    None
        The ``DataFrame`` is modified in place
        as the files are moved, so nothing is returned.
    """
    # to clean up after moving -- may be empty if we copy all spects (e.g., user-generated)
    moved_spect_paths = []

    # ---- copy/move files into split sub-directories inside dataset directory
    # Next line: note we drop any NA rows in the split column, since they don't belong to a split anyway
    split_names = sorted(dataset_df.split.dropna().unique())

    for split_name in split_names:
        if split_name == "None":
            # these are files that didn't get assigned to a split
            continue
        split_subdir = dataset_path / split_name
        split_subdir.mkdir()

        split_df = dataset_df[dataset_df.split == split_name].copy()
        split_spect_paths = [
            # this just converts from string to pathlib.Path
            pathlib.Path(spect_path)
            for spect_path in split_df["spect_path"].values
        ]
        is_in_dataset_dir = [
            # if dataset_path is one of the parents of spect_path, we can move; otherwise, we copy
            dataset_path.resolve() in list(spect_path.parents)
            for spect_path in split_spect_paths
        ]
        if all(is_in_dataset_dir):
            move_spects = True
        elif all([not is_in_dir for is_in_dir in is_in_dataset_dir]):
            move_spects = False
        else:
            raise ValueError(
                "Expected to find either all spectrograms were in dataset directory, "
                "or all were in some other directory, but found a mixture. "
                f"Spectrogram paths for split being moved within dataset directory:\n{split_spect_paths}"
            )

        new_spect_paths = []  # to fix DataFrame
        for spect_path in split_spect_paths:
            spect_path = pathlib.Path(spect_path)
            if move_spects:  # because it's within dataset_path already
                new_spect_path = spect_path.rename(
                    split_subdir / spect_path.name
                )
                moved_spect_paths.append(spect_path)
            else:  # copy instead of moving
                new_spect_path = shutil.copy(src=spect_path, dst=split_subdir)

            new_spect_paths.append(
                # rewrite paths relative to dataset directory's root, so dataset is portable
                pathlib.Path(new_spect_path).relative_to(dataset_path)
            )

        # cast to str before rewriting so that the dtype doesn't silently change for some rows
        new_spect_paths = [
            str(new_spect_path) for new_spect_path in new_spect_paths
        ]
        dataset_df.loc[split_df.index, "spect_path"] = new_spect_paths

    # ---- clean up after moving/copying ----------------------------------
    # remove any directories that we just emptied
    if moved_spect_paths:
        unique_parents = set(
            moved_spect.parent for moved_spect in moved_spect_paths
        )
        for parent in unique_parents:
            if len(list(parent.iterdir())) < 1:
                shutil.rmtree(parent)
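
For reference, a minimal usage sketch follows. It is not part of the vak source: the toy directory, file names, and DataFrame contents are hypothetical, and it assumes the module path shown in this page's title. It creates two .npy files inside a dataset directory, assigns each to a split, and calls the function, so the files are moved (not copied, since they already live under ``dataset_path``) and the ``'spect_path'`` column is rewritten relative to the dataset root.

# Minimal usage sketch -- not part of the vak source. Paths, file names,
# and DataFrame contents below are hypothetical, for illustration only.
import pathlib

import numpy as np
import pandas as pd

from vak.prep.parametric_umap.dataset_arrays import move_files_into_split_subdirs

dataset_path = pathlib.Path("./toy-dataset").resolve()
dataset_path.mkdir()
names = ["unit-0.npy", "unit-1.npy"]
for name in names:
    # save a dummy spectrogram array for each "unit"
    np.save(dataset_path / name, np.zeros((32, 32)))

dataset_df = pd.DataFrame(
    {
        # absolute paths under dataset_path, so the files are *moved*, not copied
        "spect_path": [str(dataset_path / name) for name in names],
        "split": ["train", "val"],
    }
)

move_files_into_split_subdirs(dataset_df, dataset_path, purpose="train")
# Files now live in toy-dataset/train/ and toy-dataset/val/, and the
# 'spect_path' column holds paths relative to dataset_path:
print(dataset_df["spect_path"].tolist())  # e.g. ['train/unit-0.npy', 'val/unit-1.npy']

Note the all-or-nothing check in the function: within one split, every spectrogram must either already be inside ``dataset_path`` (all moved) or entirely outside it (all copied); a mixture raises ``ValueError``.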