import numpy as np
import torch
import matplotlib
import matplotlib.pyplot as plt
import mlcolvar.utils.plot
try:
import sklearn
except ImportError:
print('The lasso module requires scikit-learn as additional dependency.')
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegressionCV, LassoCV
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import balanced_accuracy_score
__all__ = [ "lasso_classification", "plot_lasso_classification", "lasso_regression", "plot_lasso_regression" ]
class SparsityScoring:
"""Scorer function used as metric in lasso_classification.
The score balances the accuracy of the predictions and the number of features n:
.. math:: \text{score} = - (1-\text{accuracy})*100 - \left|n-\text{min\_features}\right|
this implies that a feature is retained only if increases the accuracy by 1%.
"""
def __init__(self, min_features=0 ):
"""Initialize scorer with (optional) number of features
Parameters
----------
min_features : int, optional
minimum number of features, by default 0
"""
self.min_features = min_features
def __call__(self, estimator, X, y, **kwargs):
y_pred = estimator.predict(X)
# Accuracy
#acc = accuracy_score(y, y_pred)
acc = balanced_accuracy_score(y, y_pred)
# Number of features
coefficents = estimator.coef_
num = np.count_nonzero(coefficents)
# criterion
loss = (1-acc)*100 + np.abs(num-self.min_features)
return -1*loss
def accuracy_from_score(self, score, num_features):
loss = -1*score
err = loss - np.abs(num_features - self.min_features)
acc = 1 - err/100
return acc
[docs]
def lasso_classification(dataset,
min_features = 0,
Cs = 40,
scale_inputs = True,
feature_names = None,
print_info = True,
plot = True
):
"""Perform sparse classification via LASSO on a given DictDataset (requires keys: "data" and "labels").
The (inverse) regularization strength C is automatically chosen based on cross-validation on a set of values (Cs),
see sklearn.linear_model.LogisticRegressionCV. The scoring function used is `SparsityScoring`, balancing the accuracy and the number of features.
In the two-classes case a single classifier is built, otherwise a one-vs-rest classifier is constructed, composed by N different estimators are trained to classify each state from the others.
Parameters
----------
dataset : DictDataset
dataset with 'data' and 'labels'
min_features : int, optional
minimum number of features, by default 0
Cs : int or array-like, optional
Each of the values in Cs describes the inverse of regularization strength. If Cs is as an int, then a grid of Cs values are chosen in a logarithmic scale between 1e-4 and 1e4. Like in support vector machines, smaller values specify stronger regularization., by default 40
scale_inputs : bool, optional
whether to standardize inputs based on mean and std.dev., by default True
feature_names : list, optional
names of the input features, if not given they are taken from `dataset.feature_names`, by default None
print_info : bool, optional
whether to print results, by default True
plot : bool, optional
whether to plot results, by default True
See also
--------
mlcolvar.explain.lasso.SparsityScoring
Scoring function used in LASSO classification
Returns
-------
classifier:
optimized estimator
feats:
dictionary with the names of the non-zero features, per label
coeffs:
dictionary with the coefficients of the non-zero features, per label
"""
# Convert dataset to numpy
with torch.no_grad():
raw_descriptors = dataset['data'].numpy()
labels = dataset['labels'].numpy().astype(int)
if feature_names is None:
if dataset.feature_names is None:
raise ValueError('Feature names not found (either in the dataset or as argument to the function).')
feature_names = dataset.feature_names
# Scaling inputs
if scale_inputs:
scaler = StandardScaler(with_mean=True, with_std=True)
descriptors = scaler.fit_transform(raw_descriptors)
else:
descriptors = raw_descriptors
# Define cross-validation for LASSO, using
# a custom scoring function based on accuracy and number of features
scorer = SparsityScoring(min_features=min_features)
_classifier = LogisticRegressionCV(Cs=Cs,
solver='liblinear',
multi_class='ovr',
fit_intercept=False,
penalty='l1',
max_iter = 500,
scoring=scorer)
# Fit classifier
feature_selector = SelectFromModel(_classifier)
feature_selector.fit(descriptors, labels)
classifier = feature_selector.estimator_
# Get selected features and coefficients
feats = {}
coeffs = {}
for i,key in enumerate(classifier.coefs_paths_.keys()):
index = np.abs(classifier.coef_).argsort()[i][::-1]
sorted_feature_names = feature_names[index]
sorted_coeffs = classifier.coef_[i,index]
idx = np.argwhere ( np.abs(sorted_coeffs)>1e-5 )[:,0]
selected_feature_names = sorted_feature_names[idx]
selected_coeffs = sorted_coeffs[idx]
feats[key] = selected_feature_names
coeffs[key] = selected_coeffs
# display summary
if print_info:
#score = classifier.score(descriptors,labels)
C_idx = np.argwhere(np.abs(classifier.Cs_ - classifier.C_[i]) < 1e-8)[0,0]
score = classifier.scores_[key].mean(axis=0)[C_idx]
accuracy = classifier.scoring.accuracy_from_score(score, len(selected_coeffs))
print(f'======= LASSO results ({key}) ========')
print(f'- Regularization : {classifier.C_[i]:.8f}')
print(f'- Score : {score:.2f}')
print(f'- Accuracy : {accuracy*100:.2f}%')
print(f'- # features : {len(selected_coeffs)}\n')
print(f'Features: ')
for j,(f,c) in enumerate(zip(selected_feature_names, selected_coeffs)):
print(f'({j+1}) {f:13s}: {c:.6f}')
print('==================================\n')
# plot results
if plot:
plot_lasso_classification(classifier, feats, coeffs)
return classifier, feats, coeffs
[docs]
def plot_lasso_classification(classifier, feats = None, coeffs = None, draw_labels='auto', axs = None):
"""Plot results of the LASSO classification."""
# check that the tested regularization values are more than 1
if len(classifier.Cs_) == 1:
print('Plotting is not available, as the regressor has been optimized with a single regularization value.')
return
# get number of classifiers (1 if n_states = 2, otherwise equal to n_states)
n_models = len(classifier.C_)
# define figure
if axs is None:
init_axs = True
_, axs = plt.subplots(3, n_models, figsize=(6*n_models, 9), sharex=True)
plt.suptitle('LASSO CLASSIFICATION')
else:
init_axs = False
for i,key in enumerate(classifier.scores_.keys()):
# (1) COEFFICIENTS PATH
ax = axs[0] if n_models == 1 else axs[0][i]
c = 'black'
ax.plot(classifier.Cs_, np.mean(classifier.coefs_paths_[key], axis=0),color=c,alpha=0.6)
ax.set_xscale('log')
ax.set_title(f'Coefficients path ({key})')
ax.set_xmargin(0)
ax.set_ylabel('Coefficients',color=c)
values = np.asarray( [np.mean(c, axis=0) for c in classifier.coefs_paths_.values() ])
ax.set_ylim(values.min(),values.max())
if draw_labels == 'auto':
draw_labels = False
if feats is not None and coeffs is not None:
draw_labels = True if len(feats[key])<= 3 else False
if draw_labels:
if feats is None or coeffs is None:
raise ValueError('To draw the names of the features one need to pass both the selected features (`feats`) an coefficients (`coeffs`).')
for name, coef in zip(feats[key],coeffs[key]):
ax.text(classifier.C_[i], coef, name,
fontsize='small', horizontalalignment='center', bbox=dict(facecolor='white', alpha=0.6))
# (2) ACCURACY AND SCORE
ax = axs[1] if n_models == 1 else axs[1][i]
c = 'fessa6'
score_path = classifier.scores_[key].mean(axis=0)
ax.plot(classifier.Cs_, score_path, color=c,alpha=0.9)
ax.set_title('Score')
if i == 0:
ax.set_ylabel('Score',color=c)
ax.set_ylim(-50,5)
ax2 = ax.twinx()
c = 'fessa5'
num_features_path = np.mean(np.count_nonzero(classifier.coefs_paths_[key], axis = -1), axis= 0)
ax2.plot(classifier.Cs_, classifier.scoring.accuracy_from_score(score_path, num_features_path), color=c, linestyle='-.',alpha=0.9)
if i == n_models-1:
ax2.set_ylabel('Accuracy',color=c)
ax2.set_ylim(0.5,1.05)
# (3) NUMBER OF FEATURES
ax = axs[2] if n_models == 1 else axs[2][i]
c = 'fessa0'
ax.plot(classifier.Cs_, num_features_path, color=c)
ax.yaxis.set_major_locator(matplotlib.ticker.MaxNLocator(integer=True))
ax.set_title('Number of features')
ax.set_xlabel('Regularization strength'+r'$^{-1}$')
ax.set_ylabel('# features',color=c)
ax.set_ylim(0,None)
# selected regularization value
axs_i = axs if n_models == 1 else axs[:,i]
for ax in axs_i:
ax.axvline(classifier.C_[i],color='gray',linestyle='dotted')
ax.set_xmargin(0)
if init_axs:
matplotlib.pyplot.tight_layout()
[docs]
def lasso_regression(dataset,
alphas = None,
scale_inputs = True,
print_info = True,
plot = True,
):
"""Perform sparse regression via LASSO on a given DictDataset (requires keys: "data" and "target").
The regularization strength alpha is automatically chosen based on cross-validation on a set of values (alphas),
see sklearn.linear_model.LassoCV. The scoring function used is the MSE loss + alpha * L1 regularization.
Parameters
----------
dataset : DictDataset
dataset with 'data' and 'target'
alphas : int or array-like, optional
List of alphas where to compute the models. If None alphas are set automatically.
scale_inputs : bool, optional
whether to standardize inputs based on mean and std.dev., by default True
feature_names : list, optional
names of the input features, if not given they are taken from `dataset.feature_names`, by default None
print_info : bool, optional
whether to print results, by default True
plot : bool, optional
whether to plot results, by default True
Returns
-------
regressor:
optimized estimator
feats:
names of the non-zero features
coeffs:
coefficients of the non-zero features
"""
# Convert dataset to numpy
with torch.no_grad():
raw_descriptors = dataset['data'].numpy()
target = dataset['target'].numpy()
feature_names = dataset.feature_names
# Check target shape
if (target.ndim > 1) and (target.shape[1] != 1):
raise ValueError(f'Lasso Regressor only accepts scalar targets, while an array of shape {target.shape} is given.')
# Scaling inputs
if scale_inputs:
scaler = StandardScaler(with_mean=True, with_std=True)
descriptors = scaler.fit_transform(raw_descriptors)
else:
descriptors = raw_descriptors
# Define Cross-validation & fit
_regressor = LassoCV(alphas=alphas)
feature_selector = SelectFromModel(_regressor)
feature_selector.fit(descriptors, np.squeeze(target))
regressor = feature_selector.estimator_
# Save coefficients path
_, coefs_paths, dual_gaps = regressor.path(descriptors, target, alphas=regressor.alphas_)
regressor.coefs_paths_ = coefs_paths
# Get selected features and coefficients
index = np.abs(regressor.coef_).argsort()[::-1]
sorted_feature_names = feature_names[index]
sorted_coeffs = regressor.coef_[index]
idx = np.argwhere ( np.abs(sorted_coeffs)>1e-5 )[:,0]
selected_feature_names = sorted_feature_names[idx]
selected_coeffs = sorted_coeffs[idx]
# display summary
if print_info:
score = regressor.score(descriptors,target)
print(f'========= LASSO results ==========')
print(f'- Regularization : {regressor.alpha_:.8f}')
print(f'- Score : {score:.8f}')
print(f'- # features : {len(selected_coeffs)}')
print('\n======= Relevant features =======')
for i,(f,c) in enumerate(zip(selected_feature_names, selected_coeffs)):
print(f'({i+1}) {f:13s}: {c:.6f}')
print('=================================')
# plot results
if plot:
plot_lasso_regression(regressor, selected_feature_names, selected_coeffs)
return regressor, selected_feature_names, selected_coeffs
[docs]
def plot_lasso_regression(regressor, feats = None, coeffs = None, draw_labels='auto', axs = None):
"""Plot the results of the LASSO regression."""
# check that the tested regularization values are more than 1
if len(regressor.alphas_) == 1:
print('Plotting is not available, as the regressor has been optimized with a single regularization value.')
return
# define figure
if axs is None:
fig, axs = plt.subplots(3, 1, figsize=(6, 9), sharex=True)
plt.suptitle('LASSO REGRESSION')
init_axs = True
else:
init_axs = False
# (1) COEFFICIENTS PATH
ax = axs[0]
c = 'black'
alphas = regressor.alphas_
coefs_paths = regressor.coefs_paths_.T
axs[0].plot(alphas, coefs_paths,color=c,alpha=0.6)
ax.set_xscale('log')
ax.set_title('Coefficients path')
ax.set_xmargin(0)
ax.set_ylabel('Coefficients',color=c)
if draw_labels == 'auto':
draw_labels = False
if feats is not None and coeffs is not None:
draw_labels = True if len(feats)<= 3 else False
if draw_labels:
if feats is None or coeffs is None:
raise ValueError('To draw the labels one need to pass both the selected features (`feats`) an coefficients (`coeffs`).')
for name, coef in zip(feats,coeffs):
ax.text(regressor.alpha_, coef, name,
fontsize='small', horizontalalignment='center', bbox=dict(facecolor='white', alpha=0.6))
# (2) ACCURACY AND SCORE
ax = axs[1]
c = 'fessa6'
mse_path = regressor.mse_path_.mean(axis=1)
ax.plot(alphas, mse_path, color=c,alpha=0.9)
ax.set_title('Score')
ax.set_ylabel('MSE',color=c)
# (3) NUMBER OF FEATURES
ax = axs[2]
c = 'fessa0'
num_features_path = np.count_nonzero(coefs_paths, axis = 1)
ax.plot(alphas, num_features_path, color=c)
ax.yaxis.set_major_locator(matplotlib.ticker.MaxNLocator(integer=True))
ax.set_title('Number of features')
ax.set_xlabel('Regularization strength')
ax.set_ylabel('# features',color=c)
# selected regularization value
for ax in axs:
ax.axvline(regressor.alpha_,color='gray',linestyle='dotted')
if init_axs:
matplotlib.pyplot.tight_layout()
def test_lasso_classification():
from mlcolvar.data import DictDataset
for n_states in [2,3]:
# create dataset
samples = 50
X = torch.randn((samples*2, 2))
# create labels
y = torch.zeros(samples*2)
for i in range(1, n_states):
y[samples * i :] += 1
dataset = DictDataset({"data": X, "labels": y})
dataset.feature_names = ['x1','x2']
for Cs in [ 40, [0.1], np.logspace(-4,0,10) ]:
for min_features in [0,1]:
_ = lasso_classification(dataset,
min_features = min_features,
Cs = Cs,
scale_inputs = True,
feature_names = None,
print_info = False,
plot = False )
def test_lasso_regression():
from mlcolvar.data import DictDataset
# create dataset
samples = 50
X = torch.randn((samples, 2))
y = torch.randn(samples)
dataset = DictDataset({"data": X, "target": y})
dataset.feature_names = ['x1','x2']
for alphas in [ None, [0.1], np.logspace(-4,0,10) ]:
_ = lasso_regression(dataset,
alphas = alphas,
scale_inputs = True,
print_info = False,
plot = False)