"""Helper functions for working with datasets represented as a pandas.DataFrame"""from__future__importannotationsimportpathlibimportnumpyasnpimportpandasaspd
def get_dataset_csv_filename(data_dir_name: str, timenow: str) -> str:
    """Build the filename of the csv file that represents a dataset.

    This function is called by
    :func:`vak.prep.frame_classification.dataset_df.get_dataset_csv_path`.

    Parameters
    ----------
    data_dir_name : str
        Name of the directory given as the ``data_dir`` parameter
        when calling :func:`vak.core.prep.prep`.
        Used as the "prefix" of the csv filename.
    timenow : str
        Timestamp. Used as the "suffix" of the csv filename.

    Returns
    -------
    dataset_csv_filename : str
        String of the form f"{data_dir_name}_prep_{timenow}.csv"
    """
    dataset_csv_filename = f"{data_dir_name}_prep_{timenow}.csv"
    return dataset_csv_filename
def get_dataset_csv_path(
    dataset_path: str | pathlib.Path, data_dir_name: str, timenow: str
) -> pathlib.Path:
    """Return the path that should be used to save a pandas DataFrame
    representing a dataset to a csv file.

    Parameters
    ----------
    dataset_path : str, pathlib.Path
        Path to directory that represents dataset.
    data_dir_name : str
        Name of directory specified as parameter ``data_dir``
        when calling :func:`vak.core.prep.prep`.
        This becomes the "prefix" of the csv filename.
    timenow : str
        Timestamp.
        This becomes the "suffix" of the csv filename.

    Returns
    -------
    dataset_csv_path : pathlib.Path
        Path that is used when saving ``dataset_df`` as a csv file
        in the root of the dataset directory, ``dataset_path``.
    """
    dataset_csv_filename = get_dataset_csv_filename(data_dir_name, timenow)
    # The docstring allows a plain string for ``dataset_path``; coerce it so
    # the "/" join below works for both str and pathlib.Path arguments.
    dataset_csv_path = pathlib.Path(dataset_path) / dataset_csv_filename
    return dataset_csv_path
def add_split_col(df: pd.DataFrame, split: str) -> pd.DataFrame:
    """Add a 'split' column to a pandas DataFrame.

    Used by :func:`vak.prep` to assign an entire dataset
    to the same split, e.g. 'train' or 'predict'.
    Every row of the 'split' column gets the value specified.

    Parameters
    ----------
    df : pandas.DataFrame
        A dataframe that represents a dataset.
        The column is added to this dataframe in place.
    split : str
        A string that will be assigned to every row
        in the added "split" column.
        One of {'train', 'val', 'test', 'predict'}.

    Returns
    -------
    df : pandas.DataFrame
        The same dataframe object that was passed in,
        now with the 'split' column set.

    Raises
    ------
    ValueError
        If ``split`` is not one of the allowed values.
    """
    valid_splits = {"train", "val", "test", "predict"}
    if split in valid_splits:
        # one object-dtype entry per row, all holding the same split name
        df["split"] = np.full(len(df), split, dtype="object")
        return df

    raise ValueError(
        f"value for split should be one of {{'train', 'val', 'test', 'predict'}}, but was '{split}'"
    )