Source code for vak.prep.dataset_df_helper

"""Helper functions for working with datasets represented as a pandas.DataFrame"""

from __future__ import annotations

import pathlib

import numpy as np
import pandas as pd


def get_dataset_csv_filename(data_dir_name: str, timenow: str) -> str:
    """Build the filename of the csv file that represents a dataset.

    Called by ``get_dataset_csv_path`` in this module, which joins
    the filename onto the dataset directory.

    Parameters
    ----------
    data_dir_name : str
        Name of directory specified as parameter ``data_dir``
        when calling :func:`vak.core.prep.prep`.
        Used as the "prefix" of the csv filename.
    timenow : str
        Timestamp. Used as the "suffix" of the csv filename.

    Returns
    -------
    dataset_csv_filename : str
        The prefix and the timestamp joined by ``"_prep_"``,
        with a ``".csv"`` extension appended.
    """
    stem = f"{data_dir_name}_prep_{timenow}"
    return f"{stem}.csv"
def get_dataset_csv_path(
    dataset_path: str | pathlib.Path, data_dir_name: str, timenow: str
) -> pathlib.Path:
    """Return the path that should be used to save a pandas DataFrame
    representing a dataset to a csv file.

    Parameters
    ----------
    dataset_path : str, pathlib.Path
        Path to directory that represents dataset.
        A string is converted to a :class:`pathlib.Path`
        before the filename is appended.
    data_dir_name : str
        Name of directory specified as parameter ``data_dir``
        when calling :func:`vak.core.prep.prep`.
        This becomes the "prefix" of the csv filename.
    timenow : str
        Timestamp.
        This becomes the "suffix" of the csv filename.

    Returns
    -------
    dataset_csv_path : pathlib.Path
        Path that is used when saving ``dataset_df`` as a csv file
        in the root of the dataset directory, ``dataset_path``.
    """
    # The docstring promises that a plain string is accepted, but ``str / str``
    # raises TypeError; coerce anything that is not already a path object.
    # Path instances are left untouched so existing callers see no change.
    if not isinstance(dataset_path, pathlib.PurePath):
        dataset_path = pathlib.Path(dataset_path)
    dataset_csv_filename = get_dataset_csv_filename(data_dir_name, timenow)
    return dataset_path / dataset_csv_filename
def add_split_col(df: pd.DataFrame, split: str) -> pd.DataFrame:
    """Assign every row of a dataset DataFrame to one split.

    Adds (or overwrites) a column named ``'split'`` in which
    every row holds the same value, e.g. 'train' or 'predict'.
    Used by :func:`vak.prep` to assign an entire dataset
    to the same split.

    Parameters
    ----------
    df : pandas.DataFrame
        A dataframe that represents a dataset.
        It is modified in place.
    split : str
        A string that will be assigned to every row
        in the added "split" column.
        One of {'train', 'val', 'test', 'predict'}.

    Returns
    -------
    df : pandas.DataFrame
        The same DataFrame object that was passed in,
        now with the ``'split'`` column set.

    Raises
    ------
    ValueError
        If ``split`` is not one of the four valid split names.
    """
    valid_splits = {"train", "val", "test", "predict"}
    if split in valid_splits:
        # one entry per row, stored as Python objects rather than a
        # fixed-width numpy string dtype
        df["split"] = np.asarray([split] * len(df), dtype="object")
        return df

    raise ValueError(
        f"value for split should be one of {{'train', 'val', 'test', 'predict'}}, but was '{split}'"
    )