Source code for vak.prep.parametric_umap.dataset_arrays
"""Helper functions for `vak.prep.dimensionality_reduction` modulethat handle array files."""from__future__importannotationsimportloggingimportpathlibimportshutilimportpandasaspdlogger=logging.getLogger(__name__)
def move_files_into_split_subdirs(
    dataset_df: pd.DataFrame, dataset_path: pathlib.Path, purpose: str
) -> None:
    """Move npy files in dataset into sub-directories, one for each split in the dataset.

    This is run *after* calling :func:`vak.prep.unit_dataset.prep_unit_dataset`
    to generate ``dataset_df``.

    For every split named in the ``'split'`` column, a sub-directory with
    that name is created inside ``dataset_path``. If all files of the split
    are already somewhere inside ``dataset_path`` they are moved into the
    sub-directory; if none of them are, they are copied instead, so that
    the originals are left untouched. The ``'spect_path'`` column is then
    rewritten, as strings, relative to ``dataset_path``.
    Rows whose split is missing (NaN) or the string ``"None"`` are skipped.
    Finally, any directory that became empty because its files were moved
    is removed.

    Parameters
    ----------
    dataset_df : pandas.DataFrame
        A ``pandas.DataFrame`` returned by
        :func:`vak.prep.unit_dataset.prep_unit_dataset`
        with a ``'split'`` column added, as a result of calling
        :func:`vak.prep.split.unit_dataframe` or because it was added "manually"
        by calling :func:`vak.core.prep.prep_helper.add_split_col` (as is done
        for 'predict' when the entire ``DataFrame`` belongs to this "split").
        Must also have a ``'spect_path'`` column.
    dataset_path : pathlib.Path
        Path to directory that represents dataset.
    purpose: str
        A string indicating what the dataset will be used for.
        One of {'train', 'eval', 'predict', 'learncurve'}.
        Determined by :func:`vak.core.prep.prep`
        using the TOML configuration file.
        Not used by this function; kept so the signature stays stable.

    Returns
    -------
    None
        The ``DataFrame`` is modified in place
        as the files are moved, so nothing is returned.

    Raises
    ------
    ValueError
        If, within one split, some files are inside ``dataset_path``
        and others are not.
    FileExistsError
        If the sub-directory for a split already exists.
    """
    # to clean up after moving -- may be empty if we copy all spects (e.g., user generated)
    moved_spect_paths = []

    # Resolve once, outside the loops. Both sides of the "is it inside the dataset?"
    # comparison must be resolved; otherwise relative paths or symlinked components
    # make files inside the dataset look like they are outside, and they get
    # copied (leaving duplicates behind) instead of moved.
    dataset_root = dataset_path.resolve()

    # ---- copy/move files into split sub-directories inside dataset directory
    # Next line, note we drop any na rows in the split column, since they don't belong to a split anyway
    split_names = sorted(dataset_df["split"].dropna().unique())

    for split_name in split_names:
        if split_name == "None":
            # these are files that didn't get assigned to a split
            continue
        split_subdir = dataset_path / split_name
        split_subdir.mkdir()

        split_df = dataset_df[dataset_df["split"] == split_name].copy()
        # this just converts from string to pathlib.Path
        split_spect_paths = [
            pathlib.Path(spect_path)
            for spect_path in split_df["spect_path"].values
        ]
        # if dataset_path is one of the parents of spect_path, we can move; otherwise, we copy
        is_in_dataset_dir = [
            dataset_root in spect_path.resolve().parents
            for spect_path in split_spect_paths
        ]
        if all(is_in_dataset_dir):
            move_spects = True
        elif not any(is_in_dataset_dir):
            move_spects = False
        else:
            raise ValueError(
                "Expected to find either all spectrograms were in dataset directory, "
                "or all were in some other directory, but found a mixture. "
                f"Spectrogram paths for split being moved within dataset directory:\n{split_spect_paths}"
            )

        new_spect_paths = []  # to fix DataFrame
        for spect_path in split_spect_paths:
            # Compute the destination ourselves rather than relying on the
            # return value of ``rename`` / ``shutil.copy``.
            new_spect_path = split_subdir / spect_path.name
            if move_spects:  # because it's within dataset_path already
                spect_path.rename(new_spect_path)
                moved_spect_paths.append(spect_path)
            else:  # copy instead of moving
                shutil.copy(spect_path, new_spect_path)
            # rewrite paths relative to dataset directory's root, so dataset is portable;
            # cast to str so that type doesn't silently change for some rows
            new_spect_paths.append(
                str(new_spect_path.relative_to(dataset_path))
            )

        dataset_df.loc[split_df.index, "spect_path"] = new_spect_paths

    # ---- clean up after moving/copying ---------------------------------------
    # remove any directories that we just emptied
    unique_parents = {moved_spect.parent for moved_spect in moved_spect_paths}
    for parent in unique_parents:
        if not any(parent.iterdir()):
            # ``rmdir`` only ever removes an empty directory, so nothing can
            # be deleted by accident if a file appears in the meantime.
            parent.rmdir()