"""Input/output functions."""
try:
    import pandas as pd
except ImportError as e:
    # Chain the original error with ``from e`` instead of passing it as a
    # second positional argument: with two arguments the exception message
    # is rendered as a tuple rather than as a readable sentence.
    raise ImportError(
        "pandas is required to use the i/o utils (mlcolvar.io)"
    ) from e
import numpy as np
import torch
import os
from typing import Union, List
from mlcolvar.io._utils import _download_temp_file
from mlcolvar.data import DictDataset
__all__ = ["load_dataframe", "plumed_to_pandas", "create_dataset_from_files"]
def is_plumed_file(filename):
    """
    Check if given file is in PLUMED format.

    A file is considered to be in PLUMED format when the first two
    space-separated tokens of its header line are ``#! FIELDS``.

    Parameters
    ----------
    filename : string
        File to be checked

    Returns
    -------
    bool
        whether the file is a PLUMED output file
    """
    # nrows=0 parses the header line only, no data row is loaded
    headers = pd.read_csv(filename, sep=" ", skipinitialspace=True, nrows=0)
    # the comparison already yields a bool, no need for a conditional expression
    return " ".join(headers.columns[:2]) == "#! FIELDS"
def plumed_to_pandas(filename="./COLVAR"):
    """
    Read a PLUMED output file into a pandas dataframe.

    The column names are taken from the header line of the file, after
    dropping its first two tokens (``#! FIELDS``). The header line itself is
    skipped when reading the data, and everything following a ``#`` is
    treated as a comment.

    Parameters
    ----------
    filename : string, optional
        PLUMED output file, by default "./COLVAR"

    Returns
    -------
    df : DataFrame
        Collective variables dataframe
    """
    # Parse the header line only (no data rows) to recover the field names
    header_frame = pd.read_csv(filename, sep=" ", skipinitialspace=True, nrows=0)

    # Drop the two leading tokens ('#!' and 'FIELDS')
    field_names = header_frame.columns[2:]

    # Options for the actual data read: skip the header line and use the
    # field names recovered above as column names
    read_options = {
        "sep": " ",
        "skipinitialspace": True,
        "header": None,
        "skiprows": range(1),
        "names": field_names,
        "comment": "#",
    }
    return pd.read_csv(filename, **read_options)
[docs]
def load_dataframe(file_names: Union[str, list],
                   folder: str = None,
                   start: int = 0,
                   stop: int = None,
                   stride: int = 1,
                   load_args: List[dict] = None,
                   delete_download: bool = True,
                   **kwargs,
                   ):
    """Load dataframe(s) from file(s). It can be used also to open files from internet (if the string contains http).

    In case of PLUMED colvar files automatically handles the column names, otherwise it is just a wrapper for pd.read_csv function.
    A 'walker' column holding the index of the file in `file_names` is added to each dataframe before the rows are selected.

    Parameters
    ----------
    file_names : str or list[str]
        filenames to be loaded
    folder : str, optional
        Common path for the files to be imported, by default None. If set, filenames become 'folder/file_name'.
    start: int, optional
        read from this row, default 0
    stop: int, optional
        read until this row, default None
    stride: int, optional
        read every this number, default 1
    load_args: list[dict], optional
        List of dictionaries with the loading arguments for each file (keys: start,stop,stride and pandas.read_csv options), by default None.
        Missing start/stop/stride keys fall back to the defaults. Any other key is passed to pd.read_csv for that file
        (taking precedence over kwargs) and is not used for PLUMED files. The dictionaries are not modified.
    delete_download: bool, optional
        whether to delete the downloaded file after it has been loaded, default True.
    kwargs:
        keyword arguments passed to pd.read_csv function

    Returns
    -------
    pandas.DataFrame
        Dataframe with the (concatenated) data and a fresh 0-based index

    Raises
    ------
    TypeError
        if file_names is not a string or a list, or if load_args is not a list with one entry per file
    ValueError
        if both load_args and non-default start/stop/stride are given
    """
    default_load_args = {"start": start, "stride": stride, "stop": stop}

    # accept a single file name as well as a list of them
    if isinstance(file_names, str):
        file_names = [file_names]
    elif not isinstance(file_names, list):
        raise TypeError(
            f"only strings or list of strings are supported, not {type(file_names)}."
        )

    # set file paths
    if folder is not None:
        file_names = [os.path.join(folder, fname) for fname in file_names]

    # check if per file args are given, otherwise use the global ones for each file
    if load_args is None:
        load_args = [dict(default_load_args) for _ in file_names]
    else:
        if start != 0 or stride != 1 or stop is not None:
            raise ValueError(
                "Both global and per-file loading parameters have been specified. Either use load_args for per-file parameters or start, stop, stride keywords for global behavior."
            )
        if (not isinstance(load_args, list)) or (len(file_names) != len(load_args)):
            raise TypeError(
                "load_args should be a list of dictionaries of arguments of same length as file_names. If you want to use the same args for all file pass them directly as **kwargs."
            )
        # fill the missing keys with the defaults, building new dictionaries
        # so that the ones owned by the caller are left untouched
        load_args = [{**default_load_args, **arg} for arg in load_args]

    df_list = []
    for i, filename in enumerate(file_names):
        # split the row selection from the per-file pd.read_csv options
        file_args = dict(load_args[i])
        rows = slice(
            file_args.pop("start"), file_args.pop("stop"), file_args.pop("stride")
        )
        read_csv_args = {**kwargs, **file_args}

        # check if filename is an url
        # NOTE(review): any name containing the substring "http" is treated as
        # an url, as stated in the docstring
        download = "http" in filename
        url = filename
        temp = None
        if download:
            temp, filename = _download_temp_file(file_url=url,
                                                 delete_download=delete_download,
                                                 return_name=True)

        try:
            # PLUMED files get their column names from the header,
            # anything else goes through pd.read_csv
            if is_plumed_file(filename):
                df_tmp = plumed_to_pandas(filename)
            else:
                df_tmp = pd.read_csv(filename, **read_csv_args)
        finally:
            # delete temporary data if necessary, also when the loading fails
            if download and delete_download:
                temp.close()

        if download and not delete_download:
            print(f"downloaded file ({url}) saved as ({filename}).")

        # tag the rows with the file index before selecting them
        df_tmp["walker"] = i
        df_list.append(df_tmp.iloc[rows, :])

    # concatenate and rebuild the index
    return pd.concat(df_list, ignore_index=True)
[docs]
def create_dataset_from_files(
    file_names: Union[list, str],
    folder: str = None,
    create_labels: bool = None,
    load_args: List[dict] = None,
    filter_args: dict = None,
    modifier_function=None,
    return_dataframe: bool = False,
    verbose: bool = True,
    **kwargs,
):
    """
    Initialize a dataset from (a list of) files. Suitable for supervised/unsupervised tasks.

    Each file is loaded with load_dataframe, the resulting dataframes are concatenated and the
    descriptors columns are stored under the 'data' key of a DictDataset (together with the
    'labels' key if create_labels is True).

    Parameters
    ----------
    file_names : list or str
        Names of files from which import the data
    folder : str, optional
        Common path for the files to be imported, by default None. If set, filenames become 'folder/file_name'.
    create_labels: bool, optional
        Assign a label (the index of the file) to each file, default False if a single file is given, otherwise True
    load_args: list[dict], optional
        List of dictionaries with the arguments passed to load_dataframe function for each file (keys: start,stop,stride and pandas.read_csv options), by default None
    filter_args: dict, optional
        Dictionary of arguments which are passed to df.filter() to select descriptors (keys: items, like, regex), by default None
        Note that columns whose name contains 'labels', 'time', 'bias' or 'walker' are always discarded.
    modifier_function : function, optional
        Function to be applied to the input data (via DataFrame.apply), by default None.
    return_dataframe : bool, optional
        Return also the imported Pandas dataframe for convenience, by default False
    verbose : bool, optional
        Print info on the datasets, by default True
    kwargs : optional
        args passed to mlcolvar.io.load_dataframe for every file

    Returns
    -------
    DictDataset
        Dataset of the given data (with labels if create_labels is True)
    optional, pandas.Dataframe
        Pandas dataframe with all the loaded columns, returned only if return_dataframe is True

    Raises
    ------
    TypeError
        if load_args is not a list with one entry per file

    See also
    --------
    mlcolvar.io.load_dataframe
        Function that is used to load the files
    """
    if isinstance(file_names, str):
        file_names = [file_names]

    # set file paths
    if folder is not None:
        file_names = [os.path.join(folder, fname) for fname in file_names]

    # check if per file args are given, otherwise set to {}
    if load_args is None:
        load_args = [{} for _ in file_names]
    elif (not isinstance(load_args, list)) or (len(file_names) != len(load_args)):
        raise TypeError(
            "load_args should be a list of dictionaries of arguments of same length as file_names. If you want to use the same args for all file pass them directly as **kwargs."
        )

    # check if create_labels if given, otherwise set it to True if more than one file is given
    if create_labels is None:
        create_labels = len(file_names) != 1

    # load data, collecting the per-file dataframes so that they are
    # concatenated only once (instead of growing a dataframe at each iteration)
    frames = []
    for i, (fname, file_args) in enumerate(zip(file_names, load_args)):
        df_tmp = load_dataframe(fname, **file_args, **kwargs)

        # add label in the dataframe
        if create_labels:
            df_tmp["labels"] = i
        if verbose:
            print(f"Class {i} dataframe shape: ", np.shape(df_tmp))

        frames.append(df_tmp)

    # pd.concat refuses an empty list, keep an empty dataframe in that case
    df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

    # filter inputs: user selection first, then always drop the bookkeeping columns
    df_data = df.filter(**filter_args) if filter_args is not None else df.copy()
    df_data = df_data.filter(regex="^(?!.*labels)^(?!.*time)^(?!.*bias)^(?!.*walker)")

    if verbose:
        print(f"\n - Loaded dataframe {df.shape}:", list(df.columns))
        print(f" - Descriptors {df_data.shape}:", list(df_data.columns))

    # apply transformation
    if modifier_function is not None:
        df_data = df_data.apply(modifier_function)

    # create DictDataset
    dictionary = {"data": torch.Tensor(df_data.values)}
    if create_labels:
        dictionary["labels"] = torch.Tensor(df["labels"].values)
    dataset = DictDataset(dictionary, feature_names=df_data.columns.values, data_type='descriptors')

    if return_dataframe:
        return dataset, df
    return dataset
# =================================================================================================
# ============================================= TESTS =============================================
# =================================================================================================
def test_datasetFromFile():
    """Smoke test of create_dataset_from_files with and without labels, regex filter and modifier."""
    from mlcolvar.tests import data_dir

    def square(x):
        return x**2

    with data_dir() as data_folder:
        # arguments shared by all the calls below;
        # start/stop/stride are kwargs forwarded to load_dataframe
        shared = dict(
            folder=str(data_folder),
            load_args=None,
            start=0,
            stop=5,
            stride=1,
        )
        two_states = ["state_A.dat", "state_B.dat"]
        name_filter = {"regex": "n|o"}

        # Test with unlabeled dataset
        torch_dataset, pd_dataframe = create_dataset_from_files(
            file_names=["state_A.dat", "state_B.dat", "state_C.dat"],
            create_labels=False,
            filter_args=None,
            return_dataframe=True,
            **shared,
        )

        # Test no regex on two states
        create_dataset_from_files(
            file_names=two_states,
            create_labels=True,
            filter_args=None,
            return_dataframe=True,
            **shared,
        )

        # Test with filter regex on two states
        dataset = create_dataset_from_files(
            file_names=two_states,
            create_labels=True,
            filter_args=name_filter,
            return_dataframe=False,
            **shared,
        )

        # Test with filter regex on two states with modifier
        create_dataset_from_files(
            file_names=two_states,
            create_labels=True,
            filter_args=name_filter,
            modifier_function=square,
            return_dataframe=True,
            **shared,
        )
def test_load_dataframe():
    """Test load_dataframe with global and per-file loading parameters and its error handling."""
    from mlcolvar.tests import data_dir

    with data_dir() as data_folder:
        data_folder = str(data_folder)
        three_states = ["state_A.dat", "state_B.dat", "state_C.dat"]

        # Test naive single file
        pd_dataframe = load_dataframe(file_names="state_A.dat",
                                      folder=data_folder,
                                      start=0,
                                      stop=5,
                                      stride=1,
                                      )
        assert len(pd_dataframe) == 5

        # Test with global loading parameters
        pd_dataframe = load_dataframe(file_names=three_states,
                                      folder=data_folder,
                                      start=0,
                                      stop=5,
                                      stride=1,
                                      )
        assert len(pd_dataframe) == 15

        # Test with per-file loading parameters
        load_args = [{"start": 0, "stop": 5, "stride": 1},
                     {"start": 0, "stop": 5, "stride": 1},
                     {"start": 0, "stop": 5, "stride": 1}]
        pd_dataframe = load_dataframe(file_names=three_states,
                                      folder=data_folder,
                                      load_args=load_args,
                                      )
        assert len(pd_dataframe) == 15

        # Test with per-file loading parameters with default fallback:
        # 3 + 3 rows with stride 2, 6 rows with the default stride 1
        load_args = [{"start": 0, "stop": 6, "stride": 2},
                     {"start": 0, "stop": 6, "stride": 2},
                     {"start": 0, "stop": 6}]  # this should fall back to default
        pd_dataframe = load_dataframe(file_names=three_states,
                                      folder=data_folder,
                                      load_args=load_args,
                                      )
        assert len(pd_dataframe) == 12

        # test wrong length error (two load_args for three files)
        load_args = [{"start": 0, "stop": 6, "stride": 2},
                     {"start": 0, "stop": 6}]
        try:
            load_dataframe(file_names=three_states,
                           folder=data_folder,
                           load_args=load_args,
                           )
        except TypeError as e:
            print("[TEST LOG] Checked this error: ", e)
        else:
            # without this branch the check would pass even if nothing is raised
            raise AssertionError("TypeError not raised for load_args of wrong length")

        # test load_args and global key conflict error; load_args is valid
        # on its own, so that only the conflict can trigger the error
        load_args = [{"start": 0, "stop": 6, "stride": 2},
                     {"start": 0, "stop": 6, "stride": 2},
                     {"start": 0, "stop": 6}]
        try:
            load_dataframe(file_names=three_states,
                           folder=data_folder,
                           load_args=load_args,
                           start=10,
                           )
        except ValueError as e:
            print("[TEST LOG] Checked this error: ", e)
        else:
            raise AssertionError("ValueError not raised for conflicting global and per-file parameters")