Source code for vak.prep.parametric_umap.dataset_arrays

"""Helper functions for `vak.prep.dimensionality_reduction` module
that handle array files.
"""

from __future__ import annotations

import logging
import pathlib
import shutil

import pandas as pd

logger = logging.getLogger(__name__)


def move_files_into_split_subdirs(
    dataset_df: pd.DataFrame, dataset_path: pathlib.Path, purpose: str
) -> None:
    """Move npy files in dataset into sub-directories,
    one for each split in the dataset.

    This is run *after* calling
    :func:`vak.prep.unit_dataset.prep_unit_dataset`
    to generate ``dataset_df``.

    Parameters
    ----------
    dataset_df : pandas.DataFrame
        A ``pandas.DataFrame`` returned by
        :func:`vak.prep.unit_dataset.prep_unit_dataset`
        with a ``'split'`` column added, as a result of calling
        :func:`vak.prep.split.unit_dataframe`, or because it was added
        "manually" by calling :func:`vak.core.prep.prep_helper.add_split_col`
        (as is done for 'predict' when the entire ``DataFrame``
        belongs to this "split").
    dataset_path : pathlib.Path
        Path to directory that represents dataset.
    purpose : str
        A string indicating what the dataset will be used for.
        One of {'train', 'eval', 'predict', 'learncurve'}.
        Determined by :func:`vak.core.prep.prep`
        using the TOML configuration file.

    Returns
    -------
    None
        The ``DataFrame`` is modified in place
        as the files are moved, so nothing is returned.
    """
    # to clean up after moving -- may be empty if we copy all spects (e.g., user-generated)
    moved_spect_paths = []

    # ---- copy/move files into split sub-directories inside dataset directory
    # Next line: note we drop any NA rows in the split column, since they don't belong to a split anyway
    split_names = sorted(dataset_df.split.dropna().unique())

    for split_name in split_names:
        if split_name == "None":
            # these are files that didn't get assigned to a split
            continue
        split_subdir = dataset_path / split_name
        split_subdir.mkdir()

        split_df = dataset_df[dataset_df.split == split_name].copy()
        split_spect_paths = [
            # this just converts from string to pathlib.Path
            pathlib.Path(spect_path)
            for spect_path in split_df["spect_path"].values
        ]
        is_in_dataset_dir = [
            # if dataset_path is one of the parents of spect_path, we can move; otherwise, we copy
            dataset_path.resolve() in list(spect_path.parents)
            for spect_path in split_spect_paths
        ]
        if all(is_in_dataset_dir):
            move_spects = True
        elif all([not is_in_dir for is_in_dir in is_in_dataset_dir]):
            move_spects = False
        else:
            raise ValueError(
                "Expected to find either all spectrograms were in dataset directory, "
                "or all were in some other directory, but found a mixture. "
                f"Spectrogram paths for split being moved within dataset directory:\n{split_spect_paths}"
            )

        new_spect_paths = []  # to fix DataFrame
        for spect_path in split_spect_paths:
            spect_path = pathlib.Path(spect_path)
            if move_spects:  # because it's within dataset_path already
                new_spect_path = spect_path.rename(
                    split_subdir / spect_path.name
                )
                moved_spect_paths.append(spect_path)
            else:  # copy instead of moving
                new_spect_path = shutil.copy(src=spect_path, dst=split_subdir)

            new_spect_paths.append(
                # rewrite paths relative to dataset directory's root, so dataset is portable
                pathlib.Path(new_spect_path).relative_to(dataset_path)
            )

        # cast to str before rewriting so that the dtype doesn't silently change for some rows
        new_spect_paths = [
            str(new_spect_path) for new_spect_path in new_spect_paths
        ]
        dataset_df.loc[split_df.index, "spect_path"] = new_spect_paths

    # ---- clean up after moving/copying ----------------------------------
    # remove any directories that we just emptied
    if moved_spect_paths:
        unique_parents = set(
            moved_spect.parent for moved_spect in moved_spect_paths
        )
        for parent in unique_parents:
            if len(list(parent.iterdir())) < 1:
                shutil.rmtree(parent)
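
For reference, a minimal usage sketch follows. It is not part of the vak source: the toy directory, file names, and DataFrame contents are hypothetical, and it assumes the module path shown in this page's title. It creates two .npy files inside a dataset directory, assigns each to a split, and calls the function, so the files are moved (not copied, since they already live under ``dataset_path``) and the ``'spect_path'`` column is rewritten relative to the dataset root.

# Minimal usage sketch -- not part of the vak source. Paths, file names,
# and DataFrame contents below are hypothetical, for illustration only.
import pathlib

import numpy as np
import pandas as pd

from vak.prep.parametric_umap.dataset_arrays import move_files_into_split_subdirs

dataset_path = pathlib.Path("./toy-dataset").resolve()
dataset_path.mkdir()
names = ["unit-0.npy", "unit-1.npy"]
for name in names:
    # save a dummy spectrogram array for each "unit"
    np.save(dataset_path / name, np.zeros((32, 32)))

dataset_df = pd.DataFrame(
    {
        # absolute paths under dataset_path, so the files are *moved*, not copied
        "spect_path": [str(dataset_path / name) for name in names],
        "split": ["train", "val"],
    }
)

move_files_into_split_subdirs(dataset_df, dataset_path, purpose="train")
# Files now live in toy-dataset/train/ and toy-dataset/val/, and the
# 'spect_path' column holds paths relative to dataset_path:
print(dataset_df["spect_path"].tolist())  # e.g. ['train/unit-0.npy', 'val/unit-1.npy']

Note the all-or-nothing check in the function: within one split, every spectrogram must either already be inside ``dataset_path`` (all moved) or entirely outside it (all copied); a mixture raises ``ValueError``.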